From 4916f37100b7c3c0c57fee1726da3826fe3c5829 Mon Sep 17 00:00:00 2001 From: Hans5958 Date: Thu, 12 May 2022 16:28:00 +0700 Subject: [PATCH] Little documentation, catch exceptions, convert discord links --- tools/formatter.py | 89 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 74 insertions(+), 15 deletions(-) diff --git a/tools/formatter.py b/tools/formatter.py index 8c2833e1..7270058e 100644 --- a/tools/formatter.py +++ b/tools/formatter.py @@ -3,6 +3,7 @@ import re import json import math +import traceback from calculate_center import polylabel @@ -40,13 +41,14 @@ CL_REGEX = r'\[(.+?)\]\((.+?)\)' CWTS_REGEX = { - "url": r'^(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{2,20})(?:\/)$', + "url": r'^(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{2,20})(?:\/)?$', "subreddit": r'^\/*[rR]\/([A-Za-z0-9][A-Za-z0-9_]{2,20})\/?$' } CSTW_REGEX = { "website": r'^https?://[^\s/$.?#].[^\s]*$', "user": r'^\/*u\/([A-Za-z0-9][A-Za-z0-9_]{2,20})$' } +CWTD_REGEX = r'^(?:(?:https?:\/\/)?(?:www\.)?(?:(?:discord)?\.?gg|discord(?:app)?\.com\/invite)\/)?([^\s/]+?)$' # r/... to /r/... SUBREDDIT_TEMPLATE = r"/r/\1" @@ -57,6 +59,7 @@ def format_subreddit(entry: dict): Fix formatting of the value on "subreddit". """ + # OLD FORMAT if "subreddit" in entry and entry["subreddit"]: subredditLink = entry["subreddit"] @@ -71,6 +74,7 @@ def format_subreddit(entry: dict): entry["subreddit"] = subredditLink + # NEW FORMAT if "links" in entry and "subreddit" in entry["links"]: for i in range(len(entry["links"]["subreddit"])): @@ -89,6 +93,7 @@ def collapse_links(entry: dict): Collapses Markdown links. """ + # OLD FORMAT if "website" in entry and entry['website']: website = entry["website"] @@ -100,6 +105,7 @@ def collapse_links(entry: dict): entry["website"] = website + # NEW FORMAT elif "links" in entry and "website" in entry["links"]: for i in range(len(entry["links"]["website"])): @@ -113,6 +119,7 @@ def collapse_links(entry: dict): entry["links"]["website"][i] = website + # OLD FORMAT if "subreddit" in entry and entry['subreddit']: subreddit = entry["subreddit"] @@ -124,6 +131,7 @@ def collapse_links(entry: dict): entry["subreddit"] = subreddit + # NEW FORMAT elif "links" in entry and "subreddit" in entry["links"]: for i in range(len(entry["links"]["subreddit"])): @@ -178,6 +186,7 @@ def remove_duplicate_points(entry: dict): if not "path" in entry: return entry + # OLD FORMAT if isinstance(entry['path'], list): path: list = entry['path'] previous: list = path[0] @@ -186,6 +195,8 @@ def remove_duplicate_points(entry: dict): if current == previous: path.pop(i) previous = current + + # NEW FORMAT else: for key in entry['path']: path: list = entry['path'][key] @@ -216,10 +227,13 @@ def fix_no_protocol_urls(entry: dict): Fixes URLs with no protocol by adding "https://" protocol. """ + # NEW FORMAT if "links" in entry and "website" in entry['links']: for i in range(len(entry["links"]["website"])): if entry["links"]["website"][i] and not entry["links"]["website"][i].startswith("http"): entry["links"]["website"][i] = "https://" + entry["website"] + + # OLD FORMAT elif "website" in entry and entry['website']: if not entry["website"].startswith("http"): entry["website"] = "https://" + entry["website"] @@ -231,27 +245,29 @@ def convert_website_to_subreddit(entry: dict): Converts the subreddit link on "website" to "subreddit" if possible. """ + # NEW FORMAT if "links" in entry and "website" in entry["links"]: for i in range(len(entry["links"]["website"])): if re.match(CWTS_REGEX["url"], entry["links"]["website"][i]): new_subreddit = re.sub(CWTS_REGEX["url"], r"\1", entry["links"]["website"][i]) + if not "subreddit" in entry["links"]: + entry["links"]["subreddit"] = [] if new_subreddit in entry["links"]["subreddit"]: entry["links"]["website"][i] = "" - elif not "subreddit" in entry["links"] or len(entry["subreddit"]) == 0: - if not "subreddit" in entry["links"]: - entry["links"]["subreddit"] = [] + elif not "subreddit" in entry["links"] or len(entry["links"]["subreddit"]) == 0: entry["links"]["subreddit"].append(new_subreddit) entry["links"]["website"][i] = "" elif re.match(CWTS_REGEX["subreddit"], entry["links"]["website"][i]): new_subreddit = re.sub(CWTS_REGEX["subreddit"], r"\1", entry["links"]["website"][i]) + if not "subreddit" in entry["links"]: + entry["links"]["subreddit"] = [] if new_subreddit in entry["links"]["subreddit"]: entry["links"]["website"][i] = "" - elif not "subreddit" in entry["links"] or len(entry["subreddit"]) == 0: - if not "subreddit" in entry["links"]: - entry["links"]["subreddit"] = [] + elif not "subreddit" in entry["links"] or len(entry["links"]["subreddit"]) == 0: entry["links"]["subreddit"].append(new_subreddit) entry["links"]["website"][i] = "" + # OLD FORMAT elif "website" in entry and entry['website']: if re.match(CWTS_REGEX["url"], entry["website"]): new_subreddit = re.sub(CWTS_REGEX["url"], SUBREDDIT_TEMPLATE, entry["website"]) @@ -270,11 +286,32 @@ def convert_website_to_subreddit(entry: dict): return entry +def convert_website_to_discord(entry: dict): + """ + Converts the Discord link on "website" to "discord" if possible. + """ + + # NEW FORMAT + if "links" in entry and "website" in entry["links"]: + for i in range(len(entry["links"]["website"])): + if re.match(CWTD_REGEX, entry["links"]["website"][i]): + new_discord = re.match(CWTD_REGEX, entry["links"]["website"][i])[1] + if not "discord" in entry["links"]: + entry["links"]["discord"] = [] + if new_discord in entry["links"]["discord"]: + entry["links"]["website"][i] = "" + elif not "discord" in entry["links"] or len(entry["links"]["discord"]) == 0: + entry["links"]["discord"].append(new_discord) + entry["links"]["website"][i] = "" + + return entry + def convert_subreddit_to_website(entry: dict): """ Converts the links on "subreddit" to a "website" if needed. This also supports Reddit users (/u/reddit). """ + # NEW FORMAT if "links" in entry and "subreddit" in entry["links"]: for i in range(len(entry["links"]["subreddit"])): if re.match(CSTW_REGEX["website"], entry["links"]["subreddit"][i]): @@ -293,6 +330,7 @@ def convert_subreddit_to_website(entry: dict): entry["website"].append("https://www.reddit.com/user/" + username) entry["links"]["subreddit"][i] = "" + # OLD FORMAT elif "subreddit" in entry and entry['subreddit']: if re.match(CSTW_REGEX["website"], entry["subreddit"]): if (entry["website"].lower() == entry["subreddit"].lower()): @@ -323,10 +361,13 @@ def update_center(entry: dict): if 'path' not in entry: return entry + # OLD FORMAT if isinstance(entry['path'], list): path = entry['path'] if len(path) > 1: entry['center'] = calculate_center(path) + + # NEW FORMAT else: for key in entry['path']: path = entry['path'][key] @@ -341,10 +382,17 @@ def remove_empty_and_similar(entry: dict): """ if "links" in entry: - - for key in entry["links"]: + + keys = list(entry["links"]) + for key in keys: small = list(map(lambda x: x.lower(), entry["links"][key])) entry["links"][key] = [x for x in entry["links"][key] if x and x.lower() in small] + if len(entry["links"][key]) == 0: del entry["links"][key] + + if "contributors" in entry: + + if len(entry["contributors"]) == 0: + del entry["contributors"] return entry @@ -367,6 +415,7 @@ def validate(entry: dict): entry['id'] = '[MISSING_ID]' if "path" in entry: + # OLD FORMAT if isinstance(entry['path'], list): if len(entry["path"]) == 0: print(f"Entry {entry['id']} has no points!") @@ -374,6 +423,8 @@ def validate(entry: dict): elif len(entry["path"]) < 3: print(f"Entry {entry['id']} only has {len(entry['path'])} point(s)!") return_status = 3 + + # NEW FORMAT else: for key in entry['path']: path = entry['path'][key] @@ -426,6 +477,8 @@ def print_(*args, **kwargs): entry = collapse_links(entry) print_("Converting website links to subreddit (if possible)...") entry = convert_website_to_subreddit(entry) + print_("Converting website links to Discord...") + entry = convert_website_to_discord(entry) print_("Converting subreddit links to website (if needed)...") entry = convert_subreddit_to_website(entry) print_("Fixing links without protocol...") @@ -434,8 +487,10 @@ def print_(*args, **kwargs): entry = remove_extras(entry) print_("Removing duplicate points...") entry = remove_duplicate_points(entry) + # This is the part where it goes slow. Comment when you needed it fast. print_("Updating center...") entry = update_center(entry) + # End of slow part. print_("Remove empty items...") entry = remove_empty_and_similar(entry) print_("Validating...") @@ -453,12 +508,16 @@ def go(path): entries = json.loads(f1.read()) for i in range(len(entries)): - entry_formatted, validation_status = format_all(entries[i], True) - if validation_status > 2: - print(f"Entry {entry_formatted['id']} will be removed! {json.dumps(entry_formatted)}") - entries[i] = None - else: - entries[i] = entry_formatted + try: + entry_formatted, validation_status = format_all(entries[i], True) + if validation_status > 2: + print(f"Entry {entry_formatted['id']} will be removed! {json.dumps(entry_formatted)}") + entries[i] = None + else: + entries[i] = entry_formatted + except Exception: + print(f"Exception occured when formatting ID {entries[i]['id']}") + print(traceback.format_exc()) if not (i % 200): print(f"{i} checked.")