From 5a660759bf0b5da7aca33b8429fd54c115ac366f Mon Sep 17 00:00:00 2001 From: Hans5958 Date: Thu, 7 Apr 2022 12:01:09 +0700 Subject: [PATCH] Improve and merge scripts, use JSON instead of regex --- tools/formatter.py | 135 ++++++++++++++++++++++++++++++++++++++ tools/less-md-links.py | 25 +++---- tools/misc-formats.py | 34 ++++------ tools/redditcrawl.py | 73 +++++++++------------ tools/subreddit-format.py | 69 +++++-------------- 5 files changed, 205 insertions(+), 131 deletions(-) create mode 100644 tools/formatter.py diff --git a/tools/formatter.py b/tools/formatter.py new file mode 100644 index 00000000..79eb2bff --- /dev/null +++ b/tools/formatter.py @@ -0,0 +1,135 @@ +#!/usr/bin/python + +import re +import json + +""" +Examples: +1. - /r/place + - r/place +2. /rplace +3. - https://www.reddit.com/r/place + - www.reddit.com/r/place + - reddit.com/r/place +4. - [https://www.reddit.com/r/place](https://www.reddit.com/r/place) + - [www.reddit.com/r/place](www.reddit.com/r/place) + - [reddit.com/r/place](reddit.com/r/place) +UNUSED AND FAULTY +5. - https://place.reddit.com + - place.reddit.com +6. - [https://place.reddit.com](https://place.reddit.com) + - [place.reddit.com](https://place.reddit.com) +""" +format_subreddit_regex = { + # r/... to /r/... + "template": r"/r/\1", + "commatization": r',* +', + "pattern1": r'\/?[rR]\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?', + "pattern2": r'^\/?[rR](?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?', + "pattern3": r'(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*', + "pattern4": r'\[[A-Za-z0-9][A-Za-z0-9_]{1,20}\]\((?:(?:https:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*\)', + # "pattern5": r'(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*', + # "pattern6": r'\[(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\]\((?:https:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\)"', +} + +collapse_links_regex = re.compile(r'\[(.+?)\]\((.+?)\)') + +def format_subreddit(entry: dict): + if not "subreddit" in entry: + return entry + + subredditLink = entry["subreddit"] + subredditLink = re.sub(format_subreddit_regex["commatization"], ', ', subredditLink) + subredditLink = re.sub(format_subreddit_regex["pattern4"], format_subreddit_regex["template"], subredditLink) + subredditLink = re.sub(format_subreddit_regex["pattern3"], format_subreddit_regex["template"], subredditLink) + subredditLink = re.sub(format_subreddit_regex["pattern1"], format_subreddit_regex["template"], subredditLink) + subredditLink = re.sub(format_subreddit_regex["pattern2"], format_subreddit_regex["template"], subredditLink) + + if not subredditLink: + return entry + + entry["subreddit"] = subredditLink + return entry + +def collapse_links(entry: dict): + if not "website" in entry: + return entry + website = entry["website"]; + if collapse_links_regex.search(website): + match = collapse_links_regex.search(website) + if match.group(1) == match.group(2): + website = match.group(2) + + entry["website"] = website + return entry + +def remove_extras(entry: dict): + for key in entry: + if not entry[key] or not isinstance(entry[key], str): + continue + # Leading and trailing spaces + entry[key] = re.sub(r'^(\s+)', r'', entry[key]) + entry[key] = re.sub(r'(\s+)$', r'', entry[key]) + # Double spaces and commas + entry[key] = re.sub(r' {2,}', r' ', entry[key]) + entry[key] = re.sub(r'\n{2,}', r' ', entry[key]) + entry[key] = re.sub(r',{2,}', r',', entry[key]) + # Psuedo-empty strings + if entry[key] in ["n/a", "N/A", "-", "null", "none", "None"]: + entry[key] = "" + + return entry + +def fix_r_caps(entry: dict): + if not "description" in entry: + return entry + + entry["description"] = re.sub(r'R\/', 'r/', entry["description"]) + + return entry + +def per_line_entries(entries: list): + out = "[\n" + for entry in entries: + out += json.dumps(entry) + ",\n" + out = out[:-2] + "\n]" + return out + +def format_all(entry: dict, silent=False): + def print_(*args, **kwargs): + if not silent: + print(*args, **kwargs) + print_("Removing extras...") + entry = remove_extras(entry) + print_("Fixing r/ capitalization...") + entry = fix_r_caps(entry) + print_("Collapsing Markdown links...") + entry = collapse_links(entry) + print_("Fix formatting of subreddit...") + entry = format_subreddit(entry) + print_("Completed!") + return entry + +if __name__ == '__main__': + + def go(path): + + print(f"Formatting {path}...") + + with open(path, "r+", encoding='UTF-8') as f1: + entries = json.loads(f1.read()) + + for i in range(len(entries)): + entries[i] = format_all(entries[i], True) + if not (i % 500): + print(f"{i} checked.") + + print(f"{len(entries)} checked.") + + with open(path, "w", encoding='UTF-8') as f2: + f2.write(per_line_entries(entries)) + + print("Writing completed. All done.") + + go("../web/atlas.json") + go("../web/atlas-before-ids-migration.json") \ No newline at end of file diff --git a/tools/less-md-links.py b/tools/less-md-links.py index d9178c83..b4814a5a 100644 --- a/tools/less-md-links.py +++ b/tools/less-md-links.py @@ -1,25 +1,26 @@ #!/usr/bin/python -import re -pattern = re.compile(r'\[(.+?)\]\((.+?)\)') +import json +from formatter import collapse_links, per_line_entries def go(path): - print(f"Fixing {path}...") + print(f"Formatting {path}...") with open(path, "r+", encoding='UTF-8') as f1: - contents = f1.read() + entries = json.loads(f1.read()) - for i in range(2): - for match in pattern.finditer(contents): - if match.group(1) == match.group(2): - contents = contents.replace(match.group(0), match.group(2), 1) - print(f"Stage {i+1} completed.") + for i in range(len(entries)): + entries[i] = collapse_links(entries[i]) + if not (i % 500): + print(f"{i} checked.") + + print(f"{len(entries)} checked.") with open(path, "w", encoding='UTF-8') as f2: - f2.write(contents) + f2.write(per_line_entries(entries)) + print("Writing completed. All done.") - go("../web/atlas.json") -go("../web/atlas-before-ids-migration.json") \ No newline at end of file +go("../web/atlas-before-ids-migration.json") \ No newline at end of file diff --git a/tools/misc-formats.py b/tools/misc-formats.py index fc654d1f..14bae161 100644 --- a/tools/misc-formats.py +++ b/tools/misc-formats.py @@ -1,36 +1,26 @@ #!/usr/bin/python -import re +import json +from formatter import remove_extras, fix_r_caps, per_line_entries def go(path): - print(f"Fixing {path}...") + print(f"Formatting {path}...") with open(path, "r+", encoding='UTF-8') as f1: - contents = f1.read() + entries = json.loads(f1.read()) - contents = re.sub(r'": "(\s+)', r'": "', contents) - contents = re.sub(r'(\s+)"(, |,|\})', r'"\2', contents) - print("Leading and trailing spaces removed.") + for i in range(len(entries)): + entries[i] = remove_extras(entries[i]) + entries[i] = fix_r_caps(entries[i]) + if not (i % 500): + print(f"{i} checked.") - contents = re.sub(r' {2,}', r' ', contents) - print("Double spaces removed.") - - contents = re.sub(r',{2,}', r',', contents) - print("Double commas removed.") - - contents = re.sub(r'"n/a"', '""', contents) - contents = re.sub(r'"N/A"', '""', contents) - contents = re.sub(r'"-"', '""', contents) - contents = re.sub(r'"none"', '""', contents) - contents = re.sub(r'"null"', '""', contents) - print("Psuedo-empty strings converted into empty strings.") - - contents = re.sub(r'R\/', 'r/', contents) - print("Capitalization of r/ has been fixed.") + print(f"{len(entries)} checked.") with open(path, "w", encoding='UTF-8') as f2: - f2.write(contents) + f2.write(per_line_entries(entries)) + print("Writing completed. All done.") go("../web/atlas.json") diff --git a/tools/redditcrawl.py b/tools/redditcrawl.py index 7ab6e398..6d5878e5 100755 --- a/tools/redditcrawl.py +++ b/tools/redditcrawl.py @@ -4,6 +4,7 @@ import time import re import os +from formatter import format_all outfile = open('temp_atlas.json', 'w', encoding='utf-8') failfile = open('manual_atlas.json', 'w', encoding='utf-8') @@ -11,14 +12,12 @@ credentials = open('credentials', 'r') client_id = credentials.readline().strip(' \t\n\r') client_secret = credentials.readline().strip(' \t\n\r') -user = credentials.readline().strip(' \t\n\r') -pw = credentials.readline().strip(' \t\n\r') -reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent='atlas_bot',username=user,password=pw) -has_write_access = not reddit.read_only -if not has_write_access: - print("Warning: No write access. Post flairs will not be updated") - sleep(5) +reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent='atlas_bot') + +failcount = 0 +successcount = 0 +totalcount = 0 jsonfile = open("../web/atlas.json", "r", encoding='utf-8') existing = json.load(jsonfile) @@ -28,17 +27,8 @@ for item in existing: existing_ids.append(item['id']) -def set_flair(submission, flair): - if has_write_access and submission.link_flair_text != flair: - flair_choices = submission.flair.choices() - flair = next(x for x in flair_choices if x["flair_text_editable"] and flair == x["flair_text"]) - submission.flair.select(flair["flair_template_id"]) - total_all_flairs = 0 duplicate_count = 0 -failcount = 0 -successcount = 0 -totalcount = 0 outfile.write("[\n") for submission in reddit.subreddit('placeAtlas2').new(limit=2000): """ @@ -52,8 +42,7 @@ def set_flair(submission, flair): 7. Append to file called "credentials" 8. Copy Secret 9. Append on newline to "credentials" file - 10. If you want flair write access append 2 newlines with username and password (Must be a mod, don't do this if you don't know what you're doing) - 11. Run Script + 10. Run Script Running Script 1. Input the next ID to use @@ -63,43 +52,41 @@ def set_flair(submission, flair): """ total_all_flairs += 1 + if (submission.id in existing_ids): - set_flair(submission, "Processed Entry") print("Found first duplicate!") duplicate_count += 1 - if (duplicate_count > 0): + if (duplicate_count > 10): break else: continue + if(submission.link_flair_text == "New Entry"): + text = submission.selftext - #Old backslash filter: - #text = text.replace("\\", "") - #New one: One \\ escapes a backslash in python's parser - # Two escape it again in the regex parser, so \\\\ is \ - # Then anything but " or n is replaced with the first capture group (anything but " or n) - # Test in repl: re.sub("\\\\([^\"n])", "\\1", "\\t < removed slash, t stays and > stays \\n \\\"") - text = re.sub("\\\\([^\"n])", "\\1", text) + submission_json = "" + try: - text = text.replace("\"id\": 0,", "\"id\": 0,\n\t\t\"submitted_by\": \""+submission.author.name+"\",") - except AttributeError: - text = text.replace("\"id\": 0,", "\"id\": 0,\n\t\t\"submitted_by\": \""+"unknown"+"\",") - - - lines = text.split("\n") - - for i, line in enumerate(lines): - if("\"id\": 0" in line): - lines[i] = line.replace("\"id\": 0", "\"id\": "+"\""+str(submission.id)+"\"") - text = "\n".join(lines) - try: - outfile.write(json.dumps(json.loads(text))+" ,\n") - successcount += 1 - set_flair(submission, "Processed Entry") + submission_json = json.loads(text) except json.JSONDecodeError: failfile.write(text+",\n") failcount += 1 - set_flair(submission, "Rejected Entry") + + if (submission_json): + + submission_json_dummy = {"id": submission.id, "submitted_by": ""} + try: + submission_json_dummy["submitted_by"] = submission.author.name + except AttributeError: + submission_json_dummy["submitted_by"] + for key in submission_json: + if not key in submission_json_dummy: + submission_json_dummy[key] = submission_json[key]; + submission_json = format_all(submission_json_dummy) + + outfile.write(json.dumps(json.loads(text))+" ,\n") + successcount += 1 + print("written "+submission.id+" submitted "+str(round(time.time()-submission.created_utc))+" seconds ago") totalcount += 1 diff --git a/tools/subreddit-format.py b/tools/subreddit-format.py index 1af9052d..bd4d0e75 100644 --- a/tools/subreddit-format.py +++ b/tools/subreddit-format.py @@ -1,65 +1,26 @@ #!/usr/bin/python -import re - -patternParent = re.compile(r'"subreddit": ?"(?!")(.+?)"') -patternCommatization = re.compile(r',* +') -pattern1 = re.compile(r'\/?[rR]\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?') -pattern2 = re.compile(r'^\/?[rR](?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?') -pattern3 = re.compile(r'(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*') -pattern4 = re.compile(r'\[[A-Za-z0-9][A-Za-z0-9_]{1,20}\]\((?:(?:https:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*\)') -# pattern5 = re.compile(r'(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*') -# pattern6 = re.compile(r'\[(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\]\((?:https:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\)"') -""" -Examples: -1. - /r/place - - r/place -2. /rplace -3. - https://www.reddit.com/r/place - - www.reddit.com/r/place - - reddit.com/r/place -4. - [https://www.reddit.com/r/place](https://www.reddit.com/r/place) - - [www.reddit.com/r/place](www.reddit.com/r/place) - - [reddit.com/r/place](reddit.com/r/place) -UNUSED AND FAULTY -5. - https://place.reddit.com - - place.reddit.com -6. - [https://place.reddit.com](https://place.reddit.com) - - [place.reddit.com](https://place.reddit.com) -""" - -def replaceStage1(contents: str): - contents = re.sub(patternCommatization, ', ', contents) - - # r/... to /r/.. (change if not needed) - template = r"/r/\1" - contents = re.sub(pattern4, template, contents) - contents = re.sub(pattern3, template, contents) - contents = re.sub(pattern1, template, contents) - contents = re.sub(pattern2, template, contents) - return contents +import json +from formatter import format_subreddit, per_line_entries def go(path): - print(f"Fixing {path}...") + print(f"Formatting {path}...") - with open(path, "r+", encoding='UTF-8') as f1: - contents = f1.read() + with open(path, "r+", encoding='UTF-8') as f1: + entries = json.loads(f1.read()) - # Convert to r/... format first. - for matchParent in patternParent.finditer(contents): - subredditLink = matchParent.group(1) - subredditLink = replaceStage1(subredditLink) - if not subredditLink: - continue - if path == "../web/atlas-before-ids-migration.json": - contents = contents.replace(matchParent.group(0), '"subreddit":"' + subredditLink + '"', 1) - else: - contents = contents.replace(matchParent.group(0), '"subreddit": "' + subredditLink + '"', 1) + for i in range(len(entries)): + entries[i] = format_subreddit(entries[i]) + if not (i % 500): + print(f"{i} checked.") - with open(path, "w", encoding='UTF-8') as f2: - f2.write(contents) - print("Writing completed. All done.") + print(f"{len(entries)} checked.") + + with open(path, "w", encoding='UTF-8') as f2: + f2.write(per_line_entries(entries)) + + print("Writing completed. All done.") go("../web/atlas.json") go("../web/atlas-before-ids-migration.json") \ No newline at end of file