# turbo-octo-potato/rpopfetch.py
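"""Fetch the archived monthly release tables from the r/kpop wiki and build a
single JSON file (rkpop_data.json) from them, optionally uploading the result
to a CDN via rclone."""
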
import json
import os
import re
import sys
import time

import dotenv
import markdown
import praw


def fetch_main_reddit_wiki_page(subreddit_name, page_name):
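    """Return the wiki paths of the monthly pages linked from the archive page.

    Each path is parsed out of a "###" heading of the archive page and looks
    roughly like "upcoming-releases/2017/april" (illustrative example).
    Returns None if the wiki page could not be fetched.
    """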
    try:
        # fetch the main wiki page with all the links to the monthly pages
        subreddit = reddit.subreddit(subreddit_name)
        wiki_page = subreddit.wiki[page_name]
        content = wiki_page.content_md  # Markdown content
        wiki_links = []
        # parse the wiki page for the links to the monthly pages
        for line in content.splitlines():
            if line.startswith("###"):
                wiki_links.append(
                    line.split("(")[1]
                    .replace(")", "")
                    .replace(f"https://www.reddit.com/r/{subreddit_name}/wiki/", "")
                )
        return wiki_links
    # if there's an error fetching the wiki page, return None
    except praw.exceptions.PRAWException as e:
        print(f"Error fetching Reddit wiki page: {e}")
        return None

def convert_monthly_content_to_json(content, year, month):
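    """Convert one monthly wiki table (markdown) into a list of entry dicts.

    Each table row is expected to provide the columns
    | day | time | artist | title | type | reddit link | song links |
    A hypothetical row might look like (illustrative only):
    | 12th | 18:00 | Artist | *[Title](https://...)* | Comeback | [Thread](/r/kpop/...) | [MV](https://...) / [Audio](https://...) |
    For 2017-03 and 2017-04 the time column is missing and is inserted here.
    """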
    json_data = []
    time_not_provided = ["2017-04", "2017-03"]
    day = "0th"
    for line in content.splitlines():
        # break the loop before OSTs are parsed
        if "#OST" in line:
            break
        try:
            # split the line into parts from the table columns
            parts = line.split("|")
            # remove the first element, which is always an empty string
            parts.pop(0)
            if f"{year}-{month}" in time_not_provided:
                # for these months, the time of release is not provided,
                # so add a new element in between the 1st and 2nd element
                parts.insert(1, "")
            # if the list is not 7 elements long, append an empty string
            # in this case, no song links were provided
            if len(parts) != 7:
                parts.append("")
            # autocomplete the day for the entries below the first of the day
            # as the day is only provided for the first entry of each day
            if parts[0] == "":
                parts[0] = day
            # if the day is provided, save it for the next entry
            day = parts[0]
            # if the time is not provided, replace it with "--:--"
            if parts[1] == "" or parts[1] == "?":
                parts[1] = "--:--"
            # if a field is surrounded by asterisks, remove them
            # this is used to mark the text as bold in the wiki
            for i in range(2, 7):
                if parts[i].startswith("*") and parts[i].endswith("*"):
                    parts[i] = parts[i][1:-1]
            # clear the title of any leading spaces
            parts[3] = parts[3].lstrip(" ")
            # check if parts[3] is a link
            # if it is, remove the link syntax and keep the text
            if parts[3].startswith("[") and parts[3].endswith(")"):
                parts[3] = parts[3][1:parts[3].find("](")]
            if parts[3].startswith("*[") and parts[3].endswith(")*"):
                parts[3] = parts[3][2:parts[3].find("](")]
            # parse the release type: collect every keyword found in the type
            # column, in the same order as before
            # (keyword in the wiki text -> tag in the output)
            release_type_map = [
                ("anniversary", "anniversary"),
                ("debut", "debut"),
                ("comeback", "comeback"),
                ("pre-release", "pre-release"),
                ("collab", "collab"),
                ("album", "album"),
                ("single", "single"),
                ("full-length album", "full-length"),
                ("mini album", "mini"),
                ("repackage", "repackage"),
                ("remix", "remix"),
                ("solo", "solo"),
                ("japanese", "japanese"),
                ("chinese", "chinese"),
                ("english", "english"),
                ("digital", "digital"),
                ("remake", "remake"),
                ("mixtape", "mixtape"),
                ("unit debut", "unit debut"),
                ("live", "live"),
                ("cover", "cover"),
                ("holiday", "holiday"),
            ]
            type_column = parts[4].lower()
            parts[4] = [tag for keyword, tag in release_type_map if keyword in type_column]
            # get the link from the markdown syntax
            parts[5] = markdown.markdown(parts[5])
            link_pattern = re.compile(r'<a\s+(?:[^>]*?\s+)?href="([^"]*)"', re.IGNORECASE)
            # if no link is found, search() returns None and parts[5] stays None
            parts[5] = link_pattern.search(parts[5])
            if parts[5]:
                parts[5] = parts[5].group(1)
                if parts[5].startswith("/"):
                    # if the link is a relative link, add the domain
                    parts[5] = "https://www.reddit.com" + parts[5]
            # if the song links are provided, parse them by splitting the
            # string by " / " and then parsing the markdown syntax of each
            # part to get the actual link
            if parts[6] != "":
                parts[6] = parts[6].split(" / ")
                links = []
                for link in parts[6]:
                    link = markdown.markdown(link)
                    link = link_pattern.search(link)
                    if link:
                        links.append(link.group(1))
                # replace the string with the list of links
                parts[6] = links
                # remove the last element if it's empty
                # sometimes the markdown is messy
                if parts[-1] == "":
                    parts.pop(-1)
            else:
                # if the song links are not provided, use an empty list
                parts[6] = []
            # add the reddit link to the list of links
            # (skip it if it is empty or if no link was found above;
            # renamed from "reddit" to avoid shadowing the praw instance)
            reddit_link = parts.pop(5)
            if reddit_link:
                parts[5].append(reddit_link)
            # remove the "th", "st", "nd", "rd" from the day
            parts[0] = parts[0].replace('th', '').replace('st', '').replace('nd', '').replace('rd', '')
            # make the links an empty list if it's null
            if parts[5] is None:
                parts[5] = []
            # create a json entry from the parsed data
            json_entry = {
                "date": f"{year}-{month}-{parts[0]}",
                "time": parts[1],
                "artist": parts[2],
                "title": parts[3],
                "types": parts[4],
                "links": parts[5]
            }
            json_data.append(json_entry)
        except Exception as e:
            # lines that don't start with a pipe are not part of the table
            # and are skipped silently
            if not line.startswith("|"):
                continue
            # other issues are logged but ignored
            print("[IGNORED] Error parsing line: '" + line + "'")
            print(e)
    print(f" ==> Found {len(json_data)} entries in {year}-{month}.")
    return json_data
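
# A single entry produced by convert_monthly_content_to_json looks like this
# (hypothetical values for illustration):
# {
#     "date": "2017-04-12",
#     "time": "18:00",
#     "artist": "Some Artist",
#     "title": "Some Title",
#     "types": ["comeback", "album"],
#     "links": ["https://...", "https://www.reddit.com/r/kpop/..."]
# }
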
def fetch_monthly_page(wiki_link, subreddit_name):
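    """Fetch one monthly wiki page and return its parsed entries.

    wiki_link is expected to have the form <page>/<year>/<month name>, e.g.
    "upcoming-releases/2017/april" (illustrative example). Returns None if
    the page could not be fetched.
    """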
    try:
        subreddit = reddit.subreddit(subreddit_name)
        wiki_page = subreddit.wiki[wiki_link].content_md
        # cut off everything before the table's header separator row ...
        wiki_page = wiki_page[wiki_page.find("|--|--|"):]
        # ... and then skip past the separator row itself
        wiki_page = wiki_page[wiki_page.find("\n") + 1:]
        year = wiki_link.split('/')[1]
        month = wiki_link.split('/')[2]
        # map the english month name to its two-digit number
        months = {
            "january": "01", "february": "02", "march": "03",
            "april": "04", "may": "05", "june": "06",
            "july": "07", "august": "08", "september": "09",
            "october": "10", "november": "11", "december": "12",
        }
        month = months.get(month, month)
        return convert_monthly_content_to_json(wiki_page, year, month)
    except praw.exceptions.PRAWException as e:
        print(f"Error fetching Reddit wiki page: {e}")
        return None

UPLOAD_TO_CDN = "--cdn" in sys.argv

# reddit infos
subreddit_name = "kpop"
wiki_page_name = "upcoming-releases/archive"

# reddit instance
dotenv.load_dotenv()
reddit = praw.Reddit(
    client_id=os.getenv('REDDIT_CLIENT_ID'),
    client_secret=os.getenv('REDDIT_CLIENT_SECRET'),
    user_agent=os.getenv('REDDIT_USER_AGENT')
)
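
# The credentials above are read from the environment (loaded from a .env
# file by dotenv). A .env file might look like this (placeholder values):
#
#   REDDIT_CLIENT_ID=your_client_id
#   REDDIT_CLIENT_SECRET=your_client_secret
#   REDDIT_USER_AGENT=script:rpopfetch (by u/your_username)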

# fetch subreddit
try:
    subreddit = reddit.subreddit(subreddit_name)
except praw.exceptions.PRAWException as e:
    print(f"Error fetching subreddit: {e}")

# fetch wiki page
content = fetch_main_reddit_wiki_page(subreddit_name, wiki_page_name)
if content:
    json_data = []
    # walk the monthly pages from oldest to newest
    for i, wiki_link in enumerate(content[::-1]):
        # right-align the progress percentage to three characters
        progress = int(i / len(content) * 100)
        print(f"[{progress:3d}%] Fetching monthly page: " + wiki_link)
        try:
            # fetch the monthly page and parse it
            json_data += fetch_monthly_page(wiki_link, subreddit_name)
        except Exception as e:
            # write json_data to file before exiting
            with open(f"{subreddit_name}_upcoming_releases-CANCELED.json", "w") as f:
                f.write(json.dumps(json_data, indent=4))
            print("Error fetching monthly page: " + wiki_link)
            print(e)
            sys.exit(1)
        print(f"[{progress:3d}%] Parsed monthly page: " + wiki_link)
        # sleep for 2 seconds to avoid getting rate limited
        # reddit api is awful
        time.sleep(2)
    # add a first element to the list that holds the date of the last update
    json_data.insert(0, {"last_update": time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()) + " UTC"})
    # save json_data to file
    with open("rkpop_data.json", "w") as f:
        f.write(json.dumps(json_data, indent=4))
    print("Fetched", len(json_data) - 1, "entries.")
    cdn_upload_cmd = "rclone copy rkpop_data.json cdn:cdn/api/kcomebacks/"
    if UPLOAD_TO_CDN:
        os.system(cdn_upload_cmd)
    elif input("Upload to cdn? [Y/n] ") in ["Y", "y", ""]:
        os.system(cdn_upload_cmd)
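
# Usage (illustrative):
#   python rpopfetch.py          # fetch everything, then ask before uploading
#   python rpopfetch.py --cdn    # fetch everything and upload via rclone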