mirror of
https://github.com/JonasunderscoreJones/turbo-octo-potato.git
synced 2025-10-28 19:49:19 +01:00
moved to separate directory
This commit is contained in:
parent 908630443b
commit 3e4dbc012d
11 changed files with 301 additions and 8 deletions
@@ -1,139 +0,0 @@
'''Fetch the monthly comeback/debut/release pages on kprofiles.com'''
import os
import sys
import datetime
import requests
import dotenv

import top_lib

dotenv.load_dotenv()

WORKING_DIR = os.getenv('WORKING_DIR')
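# WORKING_DIR is read from the environment (e.g. a .env file loaded by python-dotenv);
# top_lib is assumed to be a local helper module in this repo that provides the
# Progressbar class used further down.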


def get_links():
    '''Get the links to the monthly comeback/debut/release pages on kprofiles.com

    Returns: a list of links to the monthly comeback/debut/release pages on kprofiles.com'''
    links = []

    # Starting month and year
    start_date = datetime.date(2020, 3, 1)

    # End month and year (first day of the month after the current one)
    end_date = datetime.date.today().replace(day=1)
    end_date = end_date.replace(month=end_date.month + 1) if end_date.month != 12 \
        else end_date.replace(year=end_date.year + 1, month=1)
    current_date = start_date
    while current_date <= end_date:
        # Construct the URL based on the current month and year
        links.append(f"https://kprofiles.com/{current_date.strftime('%B').lower()}" +
                     f"-{current_date.year}-comebacks-debuts-releases/")

        # Move to the next month
        if current_date.month == 12:
            current_date = current_date.replace(year=current_date.year + 1, month=1)
        else:
            current_date = current_date.replace(month=current_date.month + 1)

    return links
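# For example, the first link this loop generates (March 2020) is:
#   https://kprofiles.com/march-2020-comebacks-debuts-releases/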


def check_link_extensions(link, comeback_compilation):
    '''Check if the link is valid

    link: the link to check
    comeback_compilation: the text of the kprofiles comeback compilation page

    Returns: the link if it is valid, None if it is not'''
    if link in comeback_compilation:
        return link
    link_base = link  # str has no .copy(); strings are immutable, so a plain reference suffices
    if link.replace("-debuts-releases", "") in comeback_compilation:
        link = link.replace("-debuts-releases", "")
    elif link.replace("-comebacks-debuts-releases", "") in comeback_compilation:
        link = link.replace("-comebacks-debuts-releases", "")
    elif link.replace("-comebacks-debuts-releases", "-kpop") in comeback_compilation:
        link = link.replace("-comebacks-debuts-releases", "-kpop")
    elif link[:-1] + "-2/" in comeback_compilation:
        link = link[:-1] + "-2/"  # WHY IS OCTOBER 2020 THE ONLY MONTH WITH A -2
    elif link.replace("-comebacks-debuts-releases",
                      "-kpop-comebacks-debuts-releases") in comeback_compilation:
        link = link.replace("-comebacks-debuts-releases", "-kpop-comebacks-debuts-releases")
    elif link.replace("-comebacks-debuts-releases", "-kpop-comebacks") in comeback_compilation:
        link = link.replace("-comebacks-debuts-releases", "-kpop-comebacks")
    if link != link_base:
        return link
    print("Link not found: " + link)
    return None


def filter_valid_links(links):
    '''Filter out invalid links

    links: the list of links to filter

    Returns: a list of valid links'''
    valid_links = []
    compilation_link = "https://kprofiles.com/comebacks/page/"
    comeback_compilation = ""
    for i in range(1, 100):
        request = requests.get(compilation_link + str(i))
        if request.status_code == 200:
            comeback_compilation += request.text
        else:
            break

    for link in links:
        is_valid = check_link_extensions(link, comeback_compilation)
        if is_valid:
            valid_links.append(is_valid)

    return valid_links


def fetch_site(link):
    '''Fetch the site from the given link

    link: the link to fetch

    Returns: the text of the site'''
    # check if file already exists
    if os.path.isfile(WORKING_DIR + "/kprofiles/" + link.split("/")[-2] + ".html") \
            and not FORCE_REFRESH:
        # read from file
        with open(WORKING_DIR + "/kprofiles/" + link.split("/")[-2] + ".html", "r") as file:
            return file.read()
    request = requests.get(link)
    if request.status_code == 200:
        # save to file
        with open(WORKING_DIR + "/kprofiles/" + link.split("/")[-2] + ".html", "w") as file:
            file.write(request.text)
    return request.text
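# Pages are cached under WORKING_DIR/kprofiles/ using the last URL path segment,
# e.g. a link ending in .../march-2020-comebacks-debuts-releases/ is stored as
# WORKING_DIR/kprofiles/march-2020-comebacks-debuts-releases.html and reused on
# later runs unless -f (FORCE_REFRESH) is passed.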


def fetch_handler(links):
    '''Fetch the sites from the given links

    links: the links to fetch

    Returns: a list of the text of the sites'''
    data = []
    progress_bar = top_lib.Progressbar(total=len(links))
    progress_bar.print(0)
    try:
        os.makedirs(WORKING_DIR + "/kprofiles/", exist_ok=True)
    except OSError:
        print(f"Creation of the directory {WORKING_DIR}/kprofiles/ failed.")
        sys.exit(1)
    for link in links:
        data.append(fetch_site(link))
        progress_bar.print(links.index(link) + 1)
    return data


if __name__ == '__main__':
    # launch args
    FORCE_REFRESH = "-f" in sys.argv

    print("Fetching kprofiles.com... (This may take a while, kprofiles is slow...)")
    links = get_links()
    valid_links = filter_valid_links(links)
    data = fetch_handler(valid_links)
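# Assumed invocation (the script's filename is not shown in this diff):
#   WORKING_DIR=/path/to/data python fetch_kprofiles.py       # use cached pages if present
#   WORKING_DIR=/path/to/data python fetch_kprofiles.py -f    # force re-download of every page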