mirror of
				https://github.com/JonasunderscoreJones/turbo-octo-potato.git
				synced 2025-10-25 19:19:19 +02:00 
			
		
		
		
	
		
			
				
	
	
		
			139 lines
		
	
	
	
		
			4.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			139 lines
		
	
	
	
		
			4.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| '''Fetch the monthly comeback/debut/release pages on kprofiles.com'''
 | |
| import os
 | |
| import sys
 | |
| import datetime
 | |
| import requests
 | |
| import dotenv
 | |
| 
 | |
| import top_lib
 | |
| 
 | |
# Read a .env file (if any) into the process environment before the lookup below.
dotenv.load_dotenv()

# Base directory under which the "kprofiles/" page cache is stored.
# os.getenv returns None when WORKING_DIR is not set.
WORKING_DIR = os.getenv('WORKING_DIR')
 | |
| 
 | |
| 
 | |
# English month names used in kprofiles URLs. Kept as a fixed table because
# strftime('%B') follows the process locale and the site only uses English.
_MONTH_NAMES = (
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december",
)


def _next_month(day):
    '''Return the first day of the month following the month of ``day``.'''
    if day.month == 12:
        return day.replace(year=day.year + 1, month=1, day=1)
    return day.replace(month=day.month + 1, day=1)


def get_links(start_date=None, end_date=None):
    '''Get the links to the monthly comeback/debut/release pages on kprofiles.com

    start_date: a datetime.date inside the first month to include
                (default: March 2020)
    end_date: a datetime.date inside the last month to include
              (default: the month after the current one)

    Only the year and month of both dates are used.

    Returns: a list of links to the monthly comeback/debut/release pages on
    kprofiles.com, one per month, in chronological order'''
    if start_date is None:
        start_date = datetime.date(2020, 3, 1)
    if end_date is None:
        # The page for the upcoming month is requested as well.
        end_date = _next_month(datetime.date.today())

    # Normalise to the first of the month so the comparison is month-based.
    current_date = start_date.replace(day=1)
    last_month = end_date.replace(day=1)

    links = []
    while current_date <= last_month:
        month_name = _MONTH_NAMES[current_date.month - 1]
        links.append(f"https://kprofiles.com/{month_name}"
                     f"-{current_date.year}-comebacks-debuts-releases/")
        current_date = _next_month(current_date)

    return links
 | |
| 
 | |
def check_link_extensions(link, comeback_compilation):
    '''Check if the link, or a known variant of it, is listed on kprofiles

    kprofiles does not name its monthly pages consistently, so when the
    default URL is not listed a fixed series of alternative slugs is tried.

    link: the link to check
    comeback_compilation: the text of the kprofiles comeback compilation page

    Returns: the link (or the first variant of it) found in
    comeback_compilation, None if neither the link nor any variant is found'''
    if link in comeback_compilation:
        return link

    default_slug = "-comebacks-debuts-releases"
    # Order matters: the first variant present in the compilation wins.
    candidates = (
        link.replace("-debuts-releases", ""),
        link.replace(default_slug, ""),
        link.replace(default_slug, "-kpop"),
        link[:-1] + "-2/",  # October 2020 is the only month with a -2
        link.replace(default_slug, "-kpop-comebacks-debuts-releases"),
        link.replace(default_slug, "-kpop-comebacks"),
    )
    for candidate in candidates:
        if candidate in comeback_compilation:
            return candidate

    print("Link not found: " + link)
    return None
 | |
| 
 | |
| 
 | |
def filter_valid_links(links):
    '''Keep only the links that are listed on the kprofiles comeback pages

    Downloads the paginated comeback compilation (pages 1 to 99, stopping at
    the first page that does not answer with status 200) and checks every
    link against the combined text.

    links: the list of links to filter

    Returns: a list of valid links'''
    pages = []
    page_number = 1
    while page_number < 100:
        response = requests.get("https://kprofiles.com/comebacks/page/" + str(page_number))
        if response.status_code != 200:
            break
        pages.append(response.text)
        page_number += 1
    comeback_compilation = "".join(pages)

    # Evaluated lazily, so links are checked one by one in their given order.
    checked = (check_link_extensions(link, comeback_compilation) for link in links)
    return [found for found in checked if found]
 | |
| 
 | |
def fetch_site(link):
    '''Fetch the site from the given link

    The page is cached as WORKING_DIR/kprofiles/<last path segment>.html.
    A cached copy is returned unless the module-level FORCE_REFRESH flag is
    set; otherwise the page is downloaded and the cache file is (re)written.

    link: the link to fetch (expected to end with a "/")

    Returns: the text of the site, None if the download did not answer
    with status 200'''
    cache_path = WORKING_DIR + "/kprofiles/" + link.split("/")[-2] + ".html"

    # FORCE_REFRESH is only assigned when the file runs as a script; treat it
    # as False when the module was imported instead.
    force_refresh = globals().get("FORCE_REFRESH", False)

    # check if file already exists
    if os.path.isfile(cache_path) and not force_refresh:
        # read from file; explicit UTF-8 so the cache does not depend on the
        # platform's default encoding
        with open(cache_path, "r", encoding="utf-8") as file:
            return file.read()

    request = requests.get(link)
    if request.status_code == 200:
        # save to file
        with open(cache_path, "w", encoding="utf-8") as file:
            file.write(request.text)
        return request.text
    return None
 | |
| 
 | |
def fetch_handler(links):
    '''Fetch the sites from the given links

    Creates the WORKING_DIR/kprofiles/ cache directory if needed (exiting
    with status 1 when that fails), then fetches every link in order while
    updating a progress bar.

    links: the links to fetch

    Returns: a list of the text of the sites, in the same order as links'''
    data = []
    progress_bar = top_lib.Progressbar(total=len(links))
    progress_bar.print(0)
    try:
        os.makedirs(WORKING_DIR + "/kprofiles/", exist_ok=True)
    except OSError:
        print(f"Creation of the directory {WORKING_DIR}/kprofiles/ failed.")
        sys.exit(1)
    # enumerate keeps the count right even when a link appears twice and
    # avoids searching the list again on every iteration.
    for done, link in enumerate(links, start=1):
        data.append(fetch_site(link))
        progress_bar.print(done)
    return data
 | |
| 
 | |
| 
 | |
# Default used when this module is imported: fetch_site reads FORCE_REFRESH,
# so the name has to exist even if the script section below never runs.
FORCE_REFRESH = False

if __name__ == '__main__':
    # launch args: "-f" forces every page to be downloaded again instead of
    # being read from the local cache
    FORCE_REFRESH = "-f" in sys.argv

    print("Fetching kprofiles.com... (This may take a while, kprofiles is slow...)")
    # Build the candidate URLs, keep those kprofiles actually lists, then
    # download (or load from cache) each remaining page.
    links = get_links()
    valid_links = filter_valid_links(links)
    data = fetch_handler(valid_links)
 |