mirror of
				https://github.com/JonasunderscoreJones/turbo-octo-potato.git
				synced 2025-10-25 11:09:18 +02:00 
			
		
		
		
	added some stuff
This commit is contained in:
		
							parent
							
								
									7b3577206f
								
							
						
					
					
						commit
						7e693d0cda
					
				
					 3 changed files with 306 additions and 0 deletions
				
			
		
							
								
								
									
										111
									
								
								kprofilesfetch.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										111
									
								
								kprofilesfetch.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,111 @@ | ||||||
|  | import datetime, requests, dotenv, os, sys | ||||||
|  | import top_lib | ||||||
|  | 
 | ||||||
|  | dotenv.load_dotenv() | ||||||
|  | 
 | ||||||
|  | WORKING_DIR = os.getenv('WORKING_DIR') | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
def getLinks():
    """Build candidate kprofiles.com monthly comeback-page URLs.

    Returns one URL per month, from March 2020 (the first month kprofiles
    published these pages) through the current month, of the form
    https://kprofiles.com/<month>-<year>-comebacks-debuts-releases/.
    """
    links = []

    # First month with a comeback page.
    current_date = datetime.date(2020, 3, 1)
    # Last month to include: the first of the current month.  (The original
    # bumped this a further month ahead and still used <=, emitting a URL for
    # a month that does not exist yet.)
    end_date = datetime.date.today().replace(day=1)

    while current_date <= end_date:
        # Construct the URL based on the current month and year.
        links.append(f"https://kprofiles.com/{current_date.strftime('%B').lower()}-{current_date.year}-comebacks-debuts-releases/")

        # Move to the first of the next month.
        if current_date.month == 12:
            current_date = current_date.replace(year=current_date.year + 1, month=1)
        else:
            current_date = current_date.replace(month=current_date.month + 1)

    return links
 | ||||||
def checkLinkExtensions(link, comeback_compilation):
    """Return the first known slug variant of *link* present in the listing.

    kprofiles month pages use several inconsistent URL slugs; the candidates
    are tried in priority order against the downloaded /comebacks/ listing
    text.  Returns None (after logging) when no variant matches.
    """
    candidates = (
        link,
        link.replace("-debuts-releases", ""),
        link.replace("-comebacks-debuts-releases", ""),
        link.replace("-comebacks-debuts-releases", "-kpop"),
        link[:-1] + "-2/",  # WHY IS OCTOBER 2020 THE ONLY MONTH WITH A -2
        link.replace("-comebacks-debuts-releases", "-kpop-comebacks-debuts-releases"),
        link.replace("-comebacks-debuts-releases", "-kpop-comebacks"),
    )
    for candidate in candidates:
        if candidate in comeback_compilation:
            return candidate
    print("Link not found: " + link)
 | ||||||
def filterValidLinks(links):
    """Filter *links* down to URLs that actually exist on kprofiles.com.

    Rather than probing every candidate URL one request at a time (slow, and
    slug variants would be missed), download the paginated /comebacks/
    listing once and keep, for each link, whichever slug variant appears in
    that listing text.
    """
    compilation_link = "https://kprofiles.com/comebacks/page/"
    comeback_compilation = ""
    # Concatenate listing pages until the first non-200 response.  Pages are
    # 1-based; 99 is a safety cap so a misbehaving server can't loop forever.
    for page in range(1, 100):
        request = requests.get(compilation_link + str(page))
        if request.status_code != 200:
            break
        comeback_compilation += request.text

    valid_links = []
    for link in links:
        # checkLinkExtensions returns the resolved variant URL, or None.
        resolved = checkLinkExtensions(link, comeback_compilation)
        if resolved:
            valid_links.append(resolved)

    return valid_links
 | ||||||
def fetchSite(link):
    """Return the HTML for *link*, using an on-disk cache.

    Pages are cached as WORKING_DIR/kprofiles/<slug>.html; a cached copy is
    used unless FORCE_REFRESH is set.  Returns None when the download fails
    (non-200 status).
    """
    # Cache file is named after the final path segment of the URL.
    cache_path = os.path.join(WORKING_DIR, "kprofiles", link.split("/")[-2] + ".html")
    # FORCE_REFRESH is only assigned in the __main__ guard; default to False
    # so importing this module and calling fetchSite() doesn't NameError.
    force_refresh = globals().get("FORCE_REFRESH", False)
    if os.path.isfile(cache_path) and not force_refresh:
        # Serve from cache.
        with open(cache_path, "r") as file:
            return file.read()
    request = requests.get(link)
    if request.status_code == 200:
        # Save to cache, then return the freshly fetched page.
        with open(cache_path, "w") as file:
            file.write(request.text)
        return request.text
 | ||||||
def fetchHandler(links):
    """Fetch (or load from cache) every page in *links* with a progress bar.

    Returns a list of page-HTML strings in the same order as *links*
    (entries are None for failed downloads, mirroring fetchSite).
    Raises OSError if the cache directory cannot be created.
    """
    data = []
    bar = top_lib.Progressbar(total=len(links))
    bar.print(0)
    try:
        os.makedirs(WORKING_DIR + "/kprofiles/", exist_ok=True)
    except OSError as err:
        # The original constructed this OSError but never raised it, silently
        # continuing without a cache directory; surface the failure instead.
        # (Also fixes the % / + precedence bug in the original message.)
        raise OSError("Creation of the directory %s failed" % (WORKING_DIR + "/kprofiles/")) from err
    # enumerate instead of links.index(link): O(1) per step and correct even
    # when the same link appears twice.
    for position, link in enumerate(links, start=1):
        data.append(fetchSite(link))
        bar.print(position)
    return data
 | ||||||
def stripText():
    """Placeholder for the HTML-stripping step; does nothing yet."""
    return None
 | ||||||
if __name__ == '__main__':
    # Launch args: -f forces re-downloading pages even when a cached copy
    # exists on disk.  ("-f" in sys.argv is already a bool; the original's
    # `True if ... else False` was redundant.)
    FORCE_REFRESH = "-f" in sys.argv

    print("Fetching kprofiles.com... (This may take a while, kprofiles is slow...)")
    links = getLinks()
    valid_links = filterValidLinks(links)
    data = fetchHandler(valid_links)
							
								
								
									
										49
									
								
								sound-font.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										49
									
								
								sound-font.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,49 @@ | ||||||
# Pitch-shift a single MP3 sample across the 88 piano keys and pack the
# results into a SoundFont.  Flat script: runs top-to-bottom on import.
from pydub import AudioSegment
import fluidsynth
import os

# Load the MP3 file
input_file = "/home/jonas_jones/Downloads/apple-crunch.mp3"
audio = AudioSegment.from_mp3(input_file)

# Define the piano pitch range (from C1 to C7)
piano_keys = 88  # 88 keys on a piano
pitch_range = list(range(1, piano_keys + 1))

# Create a temporary directory to store individual pitch-shifted audio files
temp_dir = "temp_audio"
os.makedirs(temp_dir, exist_ok=True)

# Export and pitch-shift the audio in different pitches
for pitch in pitch_range:
    # Calculate the ratio for pitch shift (12 semitones = 1 octave)
    # NOTE(review): this shifts UP by `pitch` semitones from the original
    # sample (ratio up to 2**(88/12) ~ 160x for the top key).  An offset
    # around a reference key, e.g. 2 ** ((pitch - 49) / 12.0), was probably
    # intended — confirm before relying on the output.
    semitone_ratio = 2 ** (pitch / 12.0)
    # Shift the pitch by re-tagging the raw data with a scaled frame rate;
    # note this also speeds the clip up (no time-stretch compensation).
    shifted_audio = audio._spawn(audio.raw_data, overrides={
        "frame_rate": int(audio.frame_rate * semitone_ratio)
    })
    # Export the shifted audio
    output_file = os.path.join(temp_dir, f"output_pitch_{pitch}.wav")
    shifted_audio.export(output_file, format="wav")

print("Audio exported in different pitches.")

# Create an empty SoundFont
# NOTE(review): the common `pyfluidsynth` binding does not appear to expose
# a SoundFont class with add_sample/write_to_file — confirm which fluidsynth
# package this targets; as written this line likely raises AttributeError.
soundfont = fluidsynth.SoundFont()

# Load the pitch-shifted audio files into the SoundFont
for pitch in pitch_range:
    audio_file = os.path.join(temp_dir, f"output_pitch_{pitch}.wav")
    soundfont.add_sample(audio_file, preset=0, note=pitch)

# Save the SoundFont to a file
soundfont_file = "output_soundfont.sf2"
soundfont.write_to_file(soundfont_file)

print(f"SoundFont '{soundfont_file}' created.")

# Clean up: Delete temporary audio files and directory
for pitch in pitch_range:
    audio_file = os.path.join(temp_dir, f"output_pitch_{pitch}.wav")
    os.remove(audio_file)
os.rmdir(temp_dir)
							
								
								
									
										146
									
								
								stripText.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										146
									
								
								stripText.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,146 @@ | ||||||
|  | import dotenv, os, re, datetime | ||||||
|  | import html as html_lib | ||||||
|  | 
 | ||||||
dotenv.load_dotenv()

# Load the environment variables
# NOTE(review): WORKING_DIR is None when the variable is unset, which makes
# the os.path.join below raise TypeError — confirm .env is always present.
WORKING_DIR = os.getenv('WORKING_DIR')

# Read file .working/kprofiles/march-2020-comebacks-debuts-releases.html
# (a page previously cached by kprofilesfetch.py) into one HTML string.
with open(os.path.join(WORKING_DIR, "kprofiles", "march-2020-comebacks-debuts-releases.html"), "r") as f:
    html = f.read()
 | ||||||
def stripText(html, date: datetime.date = None):
    """Strip markup from a kprofiles comeback page (or page fragment).

    Two modes, selected by *date*:
      * date is None — return the de-scripted, unescaped text; if *html* is
        a full document (starts with <!DOCTYPE html>) it is returned whole,
        otherwise one cleaned line per bullet ("•") entry.
      * date given — return a list of (date_str, artist, title, release_type)
        tuples parsed from "Artist | Title [Type]" bullet entries.
    """
    # remove the script and style sections
    script_pattern = re.compile('<script.*?</script>', re.DOTALL)
    style_pattern = re.compile('<style.*?</style>', re.DOTALL)
    text = re.sub(script_pattern, "", html)
    text = re.sub(style_pattern, "", text)
    text = html_lib.unescape(text)
    # Full documents are returned as-is after cleanup; bullet parsing only
    # applies to per-day fragments extracted by do_dates().
    if html.startswith("<!DOCTYPE html>"):
        return text
    # kprofiles separates entries with the "•" bullet character.
    lines = text.split("•")

    # result is a list of tuples in date mode, a plain string otherwise.
    if date:
        result = []
    else:
        result = ""

    for line in lines:
        # NOTE(review): debugging prints left in — very noisy on real pages.
        print(line)
        print(lines)
        line = line.replace("<strong>", "").replace("</strong>", "").replace("<span>", "").replace("</span>", "").replace("<br/>", "")
        # Normalize the many bracketed release-type tags to a small canonical
        # set, truncating each entry right after its tag.  Order matters:
        # only the first matching tag is applied.
        if "[Comeback]" in line:
            line = line.split("[Comeback]")[0] + "[Comeback]"
        elif "[Debut]" in line:
            line = line.split("[Debut]")[0] + "[Debut]"
        elif "[Release]" in line:
            line = line.split("[Release]")[0] + "[Release]"
        elif "[Solo Debut]" in line:
            line = line.split("[Solo Debut]")[0] + "[Debut]"
        elif "[Solo Release]" in line:
            line = line.split("[Solo Release]")[0] + "[Release]"
        elif "[Pre-Debut Release]" in line:
            line = line.split("[Pre-Debut Release]")[0] + "[Pre-Debut Release]"
        elif "[Pre-Single Release]" in line:
            line = line.split("[Pre-Single Release]")[0] + "[Pre-Debut Release]"
        elif "[Japanese Comeback]" in line:
            line = line.split("[Japanese Comeback]")[0] + "[Japanese Comeback]"
        elif "[Japanese Debut]" in line:
            line = line.split("[Japanese Debut]")[0] + "[Japanese Debut]"
        elif "[Project Release]" in line:
            line = line.split("[Project Release]")[0] + "[Release]"
        elif "[Pre-Release Single]" in line:
            line = line.split("[Pre-Release Single]")[0] + "[Pre-Release]"
        elif "[Comeback Single]" in line:
            line = line.split("[Comeback Single]")[0] + "[Comeback]"
        elif "[Collab Release]" in line:
            line = line.split("[Collab Release]")[0] + "[Release]"
        elif "[Comeback Full Album]" in line:
            line = line.split("[Comeback Full Album]")[0] + "[Comeback]"
        elif "[Special Release]" in line:
            line = line.split("[Special Release]")[0] + "[Release]"
        elif "[Collab]" in line:
            line = line.split("[Collab]")[0] + "[Release]"
        elif "[Mixtape]" in line:
            line = line.split("[Mixtape]")[0] + "[Mixtape]"
        elif "[Japan Release]" in line:
            line = line.split("[Japan Release]")[0] + "[Japanese Release]"
        elif "[Single Release]" in line:
            line = line.split("[Single Release]")[0] + "[Release]"
        # Drop newlines and curly quotes before parsing the entry.
        line = line.replace("\n", "").replace("‘", "").replace("’", "")
        if date and not line == "" and not line == " ":
            print(line)
            # Split "Artist | Title [Type]" into its parts.
            # NOTE(review): raises IndexError when an entry lacks "|" or
            # brackets — confirm upstream pages always match this shape.
            artist_title = line.split("[")[0].strip()
            artist = artist_title.split("|")[0].strip()
            title = artist_title.split("|")[1].strip().replace("‘", "").replace("’", "")
            release_type = line.split("[")[1].split("]")[0].strip()
            line = (str(date), artist, title, release_type)
            result.append(line)
            # Scrub stray whitespace items.  NOTE(review): in date mode the
            # else-branch below extends the LIST character-by-character
            # (list += str iterates the string), which is what this cleanup
            # compensates for — confirm intended.
            for i in ["\n", " ", ""]:
                try:
                    result.remove(i)
                except ValueError:
                    pass
        else:
            result += line + "\n"
    return result
 | ||||||
def formatDate(date: datetime.date):
    """Return *date* rendered like kprofiles headings, e.g. 'March 1st'."""
    day = date.day
    # Teens (11th-13th, 111th, ...) always take 'th'.
    if 10 <= day % 100 <= 20:
        suffix = 'th'
    elif day % 10 == 1:
        suffix = 'st'
    elif day % 10 == 2:
        suffix = 'nd'
    elif day % 10 == 3:
        suffix = 'rd'
    else:
        suffix = 'th'
    return "{} {}{}".format(date.strftime("%B"), day, suffix)
 | ||||||
def extract_between_strings(main_string, string1, string2):
    """Return the substring of *main_string* between *string1* and *string2*.

    *string2* is searched for only AFTER the end of *string1*, so a stray
    earlier occurrence of *string2* can no longer produce an empty or
    garbled slice (the original searched both markers from position 0).
    Returns None when either marker is missing.
    """
    start_index = main_string.find(string1)
    if start_index == -1:
        return None
    content_start = start_index + len(string1)
    # Look for the closing marker strictly after the opening one.
    end_index = main_string.find(string2, content_start)
    if end_index == -1:
        return None
    return main_string[content_start:end_index]
 | ||||||
def increaseDateDay(date: datetime.date):
    """Return the calendar day after *date*."""
    one_day = datetime.timedelta(days=1)
    return date + one_day
 | ||||||
|  | 
 | ||||||
def do_dates(html, date: datetime.date):
    """Split month-page text into per-day sections and parse each one.

    Walks day by day from *date*, cutting *html* between consecutive
    formatted day headings (e.g. between "March 1st" and "March 2nd") and
    feeding each slice to stripText() for tuple parsing.  Returns the
    concatenated list of parsed entry tuples.
    """
    result = []
    this_date = formatDate(date)
    this_string = None      # pending start heading when a day's section came up empty
    this_this_date = None   # date recorded when that pending window was opened
    # NOTE(review): 46 iterations covers the lead-in plus roughly a month —
    # confirm this always reaches the end of the target month.
    for i in range(0, 46):
        this_date = formatDate(date)
        date = increaseDateDay(date)
        date_str = formatDate(date)
        extract = extract_between_strings(html, this_date, date_str)
        if this_string:
            # A previous day heading was missing; widen the window so it
            # starts at that earlier pending heading instead.
            extract = extract_between_strings(html, this_string, date_str)
        if not extract:
                # Nothing between these headings: remember where the window
                # should start and try again with the next day's heading.
                this_string = this_date
                this_this_date = date
                continue
        this_string = None
        #print("---------------------------------------------------")
        #print(this_date)
        # NOTE(review): `date` was already advanced above, so sections are
        # tagged with the FOLLOWING day — confirm that is intended.
        this_this_date = date if this_this_date == None else this_this_date
        result += stripText(extract, this_this_date)
        this_this_date = None
    return result
 | ||||||
# Parse the loaded month page into dated entry tuples.  The start date is a
# couple of weeks before March 2020 so lead-in headings are also scanned.
result = do_dates(stripText(html), datetime.date(2020, 2, 15))
for i in result:
    print(i)
print(len(result))

# Save the string-mode stripped output under WORKING_DIR/kprofiles/.
# NOTE(review): the input read above is march-2020 but this writes a
# may-2020 filename — confirm which month is intended.
with open(os.path.join(WORKING_DIR, "kprofiles", "may-2020-comebacks-debuts-releases.txt"), "w") as f:
    f.write(stripText(html))
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue