From 7e693d0cda7a799812665b6470c149d745497542 Mon Sep 17 00:00:00 2001
From: Jonas_Jones <91549607+J-onasJones@users.noreply.github.com>
Date: Wed, 1 Nov 2023 21:44:37 +0100
Subject: [PATCH] Add kprofiles fetcher, sound-font generator and stripText parser scripts

---
 kprofilesfetch.py | 111 +++++++++++++++++++++++++++++++++++
 sound-font.py     |  49 ++++++++++++++++
 stripText.py      | 146 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 306 insertions(+)
 create mode 100644 kprofilesfetch.py
 create mode 100644 sound-font.py
 create mode 100644 stripText.py

diff --git a/kprofilesfetch.py b/kprofilesfetch.py
new file mode 100644
index 0000000..e1f2dd7
--- /dev/null
+++ b/kprofilesfetch.py
@@ -0,0 +1,111 @@
+import datetime, requests, dotenv, os, sys
+import top_lib
+
+# Read a local .env file into the process environment before any lookup below.
+dotenv.load_dotenv()
+WORKING_DIR = os.environ.get('WORKING_DIR')  # None when the variable is not set
+
+
+def getLinks():
+    """Build the monthly kprofiles comeback-page URLs that should be tried.
+
+    Covers every month from March 2020 up to and including the month after
+    the current one.
+
+    Returns:
+        list of URL strings, oldest month first.
+    """
+    today = datetime.date.today()
+    # Months are numbered year * 12 + zero-based month, so stepping across a
+    # year boundary needs no special case.
+    first_index = 2020 * 12 + 2                   # March 2020
+    last_index = today.year * 12 + today.month    # the month after the current one
+    urls = []
+    for index in range(first_index, last_index + 1):
+        year, month0 = divmod(index, 12)
+        month_name = datetime.date(year, month0 + 1, 1).strftime('%B').lower()
+        urls.append(f"https://kprofiles.com/{month_name}-{year}-comebacks-debuts-releases/")
+
+    return urls
+
+def checkLinkExtensions(link, comeback_compilation):
+    """Return the spelling of ``link`` that occurs in ``comeback_compilation``.
+
+    kprofiles names its monthly pages inconsistently, so known variants are
+    tried in a fixed order and the first one found as a substring wins.
+    Prints a notice and returns None when no variant occurs.
+    """
+    base = "-comebacks-debuts-releases"
+    variants = [link, link.replace("-debuts-releases", "")]
+    variants += [link.replace(base, new) for new in ("", "-kpop")]
+    variants.append(link[:-1] + "-2/")  # only October 2020 is known to use "-2"
+    variants += [link.replace(base, new) for new in ("-kpop" + base, "-kpop-comebacks")]
+    for variant in variants:
+        if variant in comeback_compilation:
+            return variant
+    print("Link not found: " + link)
+    
+
+def filterValidLinks(links):
+    """Keep only the links that the kprofiles comeback index refers to.
+
+    Downloads the paginated index at https://kprofiles.com/comebacks/page/N
+    (N = 1..99, stopping at the first non-200 answer) and concatenates the
+    page bodies. Each candidate link is then matched against that text via
+    checkLinkExtensions(), which may substitute a differently spelled URL.
+
+    Args:
+        links: candidate URLs, e.g. as produced by getLinks().
+
+    Returns:
+        list of the URLs that were found, in the order of ``links``.
+    """
+    index_url = "https://kprofiles.com/comebacks/page/"
+    pages = []
+    for page_number in range(1, 100):
+        response = requests.get(index_url + str(page_number))
+        if response.status_code != 200:
+            break
+        pages.append(response.text)
+    index_text = "".join(pages)
+    matches = (checkLinkExtensions(link, index_text) for link in links)
+    return [match for match in matches if match]
+
+def fetchSite(link):
+    """Return the HTML of ``link`` (None unless HTTP 200), cached as UTF-8 on disk."""
+    path = os.path.join(WORKING_DIR, "kprofiles", link.split("/")[-2] + ".html")
+    # FORCE_REFRESH is only defined when run as a script; default to the cache.
+    if os.path.isfile(path) and not globals().get("FORCE_REFRESH", False):
+        with open(path, "r", encoding="utf-8") as file:
+            return file.read()
+    request = requests.get(link)
+    if request.status_code == 200:
+        with open(path, "w", encoding="utf-8") as file:  # explicit: pages hold non-ASCII text
+            file.write(request.text)
+        return request.text
+
+def fetchHandler(links):
+    """Return fetchSite(link) for every link, in order, while driving a progress bar."""
+    bar = top_lib.Progressbar(total=len(links))
+    bar.print(0)
+    # The old handler built an OSError without raising it, hiding the failure
+    # until the first cache write; makedirs' own error is now left to propagate.
+    os.makedirs(WORKING_DIR + "/kprofiles/", exist_ok=True)
+    data = []
+    for done, link in enumerate(links, start=1):  # index() mis-counted duplicate links
+        data.append(fetchSite(link))
+        bar.print(done)
+    return data
+
+def stripText():
+    """Placeholder with no behaviour yet; always returns None."""
+
+
+if __name__ == '__main__':
+    # Command-line switch: "-f" bypasses the on-disk page cache.
+    FORCE_REFRESH = "-f" in sys.argv
+
+    print("Fetching kprofiles.com... (This may take a while, kprofiles is slow...)")
+    links = getLinks()                      # every monthly URL to try
+    valid_links = filterValidLinks(links)   # the ones found in the site's index
+    data = fetchHandler(valid_links)        # their HTML, in the same order
diff --git a/sound-font.py b/sound-font.py
new file mode 100644
index 0000000..0c63992
--- /dev/null
+++ b/sound-font.py
@@ -0,0 +1,49 @@
+from pydub import AudioSegment
+import fluidsynth
+import os
+
+# Load the source sample. NOTE(review): the path is hard-coded to one user's home directory.
+input_file = "/home/jonas_jones/Downloads/apple-crunch.mp3"
+audio = AudioSegment.from_mp3(input_file)
+
+# One pitch value per piano key, 1..88. NOTE(review): an earlier label said "C1 to C7", which is fewer than 88 keys - confirm the intended range.
+piano_keys = 88  # 88 keys on a piano
+pitch_range = list(range(1, piano_keys + 1))
+
+# Scratch directory (relative to the current working directory) for the per-pitch WAV files
+temp_dir = "temp_audio"
+os.makedirs(temp_dir, exist_ok=True)
+
+# Write one WAV per pitch value
+for pitch in pitch_range:
+    # pitch is used as a semitone count above the source (12 semitones = 1 octave), so the ratio runs from 2**(1/12) to 2**(88/12)
+    semitone_ratio = 2 ** (pitch / 12.0)
+    # Same raw samples, re-labelled with a scaled frame rate; presumably this raises pitch and shortens the clip - verify against pydub
+    shifted_audio = audio._spawn(audio.raw_data, overrides={
+        "frame_rate": int(audio.frame_rate * semitone_ratio)
+    })
+    # File name carries the pitch value so the later loops can rebuild the same path
+    output_file = os.path.join(temp_dir, f"output_pitch_{pitch}.wav")
+    shifted_audio.export(output_file, format="wav")
+
+print("Audio exported in different pitches.")
+
+# NOTE(review): confirm the installed fluidsynth binding really offers SoundFont(), add_sample() and write_to_file()
+soundfont = fluidsynth.SoundFont()
+
+# Register every exported file under preset 0, using the pitch value as the note number
+for pitch in pitch_range:
+    audio_file = os.path.join(temp_dir, f"output_pitch_{pitch}.wav")
+    soundfont.add_sample(audio_file, preset=0, note=pitch)
+
+# Write the result into the current working directory
+soundfont_file = "output_soundfont.sf2"
+soundfont.write_to_file(soundfont_file)
+
+print(f"SoundFont '{soundfont_file}' created.")
+
+# Clean up: remove each WAV, then the directory (os.rmdir only succeeds once it is empty)
+for pitch in pitch_range:
+    audio_file = os.path.join(temp_dir, f"output_pitch_{pitch}.wav")
+    os.remove(audio_file)
+os.rmdir(temp_dir)
diff --git a/stripText.py b/stripText.py
new file mode 100644
index 0000000..feb2335
--- /dev/null
+++ b/stripText.py
@@ -0,0 +1,146 @@
+import dotenv, os, re, datetime
+import html as html_lib
+
+# Make the settings from a local .env file visible through os.environ.
+dotenv.load_dotenv()
+# Root directory that holds the kprofiles/ cache of downloaded pages.
+WORKING_DIR = os.environ.get('WORKING_DIR')
+
+# Input for this run: the cached March 2020 page, read in full.
+with open(os.path.join(WORKING_DIR, "kprofiles", "march-2020-comebacks-debuts-releases.html")) as f:
+    html = f.read()
+
+# kprofiles release tags mapped to the normalised tag that is kept. Order
+# matters: the entries are tested top to bottom and the first tag found wins.
+_RELEASE_TAGS = (
+    ("[Comeback]", "[Comeback]"),
+    ("[Debut]", "[Debut]"),
+    ("[Release]", "[Release]"),
+    ("[Solo Debut]", "[Debut]"),
+    ("[Solo Release]", "[Release]"),
+    ("[Pre-Debut Release]", "[Pre-Debut Release]"),
+    ("[Pre-Single Release]", "[Pre-Debut Release]"),
+    ("[Japanese Comeback]", "[Japanese Comeback]"),
+    ("[Japanese Debut]", "[Japanese Debut]"),
+    ("[Project Release]", "[Release]"),
+    ("[Pre-Release Single]", "[Pre-Release]"),
+    ("[Comeback Single]", "[Comeback]"),
+    ("[Collab Release]", "[Release]"),
+    ("[Comeback Full Album]", "[Comeback]"),
+    ("[Special Release]", "[Release]"),
+    ("[Collab]", "[Release]"),
+    ("[Mixtape]", "[Mixtape]"),
+    ("[Japan Release]", "[Japanese Release]"),
+    ("[Single Release]", "[Release]"),
+)
+
+
+def _clean_entry(line):
+    """Drop inline markup from one bullet entry and cut it off after its release tag."""
+    for markup in ("<strong>", "</strong>", "<span>", "</span>", "<br/>"):
+        line = line.replace(markup, "")
+    for tag, normalised in _RELEASE_TAGS:
+        if tag in line:
+            line = line.split(tag)[0] + normalised
+            break
+    return line.replace("\n", "").replace("&#8216;", "").replace("&#8217;", "")
+
+
+def stripText(html, date:datetime.date=None):
+    """Strip a kprofiles page, or one day's slice of it, down to its release entries.
+
+    <script>/<style> sections are removed and HTML entities unescaped. Input
+    starting with "<!DOCTYPE html>" is a whole page and is returned right there;
+    anything else is split on "•" and each piece is cleaned and tag-truncated.
+
+    Args:
+        html: a full page, or a fragment of an already stripped page.
+        date: release date to attach to every entry; selects the list form.
+
+    Returns:
+        The cleaned page text for a full page. Otherwise, without ``date``, a
+        str with one cleaned entry per line; with ``date``, a list of
+        ``(str(date), artist, title, release_type)`` tuples.
+
+    Raises:
+        IndexError: with ``date``, if a non-blank entry lacks "|" or "[".
+    """
+    text = re.sub('<script.*?</script>', "", html, flags=re.DOTALL)
+    text = re.sub('<style.*?</style>', "", text, flags=re.DOTALL)
+    text = html_lib.unescape(text)
+    if html.startswith("<!DOCTYPE html>"):
+        return text
+    entries = [_clean_entry(piece) for piece in text.split("•")]
+    if not date:
+        return "".join(entry + "\n" for entry in entries)
+    result = []
+    for entry in entries:
+        # Blank pieces (text around the bullets) used to be added to the list
+        # character by character and only partly removed again; skip them.
+        if not entry.strip():
+            continue
+        artist_title = entry.split("[")[0].strip()
+        artist = artist_title.split("|")[0].strip()
+        title = artist_title.split("|")[1].strip().replace("‘", "").replace("’", "")
+        release_type = entry.split("[")[1].split("]")[0].strip()
+        result.append((str(date), artist, title, release_type))
+    return result
+
+def formatDate(date:datetime.date):
+    """Return ``date`` as full month name, day and English ordinal, e.g. "March 1st"."""
+    day = date.day
+    month = date.strftime("%B")
+    # 10..20 always take "th" (11th, 12th, 13th); otherwise the last digit decides.
+    special = {1: 'st', 2: 'nd', 3: 'rd'}
+    ordinal = 'th' if 10 <= day % 100 <= 20 else special.get(day % 10, 'th')
+    return "{} {}{}".format(month, day, ordinal)
+
+def extract_between_strings(main_string, string1, string2):
+    """Return the text between ``string1`` and the next ``string2`` after it.
+
+    ``string2`` is searched only behind ``string1``, so a stray earlier match
+    no longer yields "". Returns None when either marker cannot be found.
+    """
+    start_index = main_string.find(string1)
+    if start_index == -1:
+        return None
+    content_start = start_index + len(string1)
+    end_index = main_string.find(string2, content_start)
+    return None if end_index == -1 else main_string[content_start:end_index]
+
+def increaseDateDay(date:datetime.date):
+    return datetime.timedelta(days=1) + date  # the calendar day after ``date``
+
+# Scan 46 consecutive days starting at ``date``; collect what stripText() returns for the text between day headings.
+def do_dates(html, date:datetime.date):
+    result = []
+    this_date = formatDate(date)
+    this_string = None  # start marker carried over from an iteration whose extract was empty
+    this_this_date = None  # date to attach to the next successful extract, set when a day is skipped
+    for i in range(0, 46):
+        this_date = formatDate(date)  # heading text of the current day, e.g. "March 1st"
+        date = increaseDateDay(date)
+        date_str = formatDate(date)  # heading text of the following day, used as the end marker
+        extract = extract_between_strings(html, this_date, date_str)
+        if this_string:
+            extract = extract_between_strings(html, this_string, date_str)
+        if not extract:
+                this_string = this_date  # NOTE(review): replaces any earlier carried marker - verify for gaps of 2+ days
+                this_this_date = date
+                continue
+        this_string = None
+        # NOTE(review): ``date`` was already advanced above, so text found after the
+        # heading for day N is stored with day N+1 - confirm this offset is intended.
+        this_this_date = date if this_this_date == None else this_this_date
+        result += stripText(extract, this_this_date)
+        this_this_date = None
+    return result
+
+page_text = stripText(html)  # cleaned once, reused for parsing and for the saved copy
+result = do_dates(page_text, datetime.date(2020, 2, 15))
+for i in result:
+    print(i)
+print(len(result))
+# Saved next to its input; the old "may-2020" name did not match the March 2020 data.
+with open(os.path.join(WORKING_DIR, "kprofiles", "march-2020-comebacks-debuts-releases.txt"), "w") as f:
+    f.write(page_text)