Improve and merge scripts, use JSON instead of regex

This commit is contained in:
Hans5958 2022-04-07 12:01:09 +07:00
parent 433764b03c
commit 5a660759bf
5 changed files with 205 additions and 131 deletions

135
tools/formatter.py Normal file
View file

@ -0,0 +1,135 @@
#!/usr/bin/python
import re
import json
"""
Examples:
1. - /r/place
- r/place
2. /rplace
3. - https://www.reddit.com/r/place
- www.reddit.com/r/place
- reddit.com/r/place
4. - [https://www.reddit.com/r/place](https://www.reddit.com/r/place)
- [www.reddit.com/r/place](www.reddit.com/r/place)
- [reddit.com/r/place](reddit.com/r/place)
UNUSED AND FAULTY
5. - https://place.reddit.com
- place.reddit.com
6. - [https://place.reddit.com](https://place.reddit.com)
- [place.reddit.com](https://place.reddit.com)
"""
# Regexes used to normalize the many ways a subreddit can be written
# (see the module docstring examples) down to the canonical "/r/name" form.
format_subreddit_regex = {
    # r/... to /r/...
    "template": r"/r/\1",
    # Collapses runs of commas/spaces into a single ", " separator.
    "commatization": r',* +',
    # Bare "r/name" or "/r/name", optionally with a trailing slash.
    "pattern1": r'\/?[rR]\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
    # "rname" with the slash missing after the leading r.
    "pattern2": r'^\/?[rR](?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
    # Full reddit.com URLs (optional scheme, optional subdomain, optional trailing path).
    "pattern3": r'(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*',
    # Markdown links "[text](url)" whose target is a reddit.com URL.
    "pattern4": r'\[[A-Za-z0-9][A-Za-z0-9_]{1,20}\]\((?:(?:https:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*\)',
    # Patterns 5/6 are UNUSED AND FAULTY (see docstring) and kept only for reference:
    # "pattern5": r'(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*',
    # "pattern6": r'\[(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\]\((?:https:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\)"',
}

# Matches a Markdown link "[text](target)"; used by collapse_links to drop
# redundant wrappers where the text equals the target.
collapse_links_regex = re.compile(r'\[(.+?)\]\((.+?)\)')


def format_subreddit(entry: dict):
    """Normalize entry["subreddit"] to the canonical "/r/name" form.

    The entry is mutated in place and also returned. Entries without a
    "subreddit" key, or whose link normalizes to an empty string, are
    returned unchanged.
    """
    if "subreddit" not in entry:
        return entry
    subreddit_link = entry["subreddit"]
    subreddit_link = re.sub(format_subreddit_regex["commatization"], ', ', subreddit_link)
    # Apply the most specific patterns first so a Markdown-wrapped URL is
    # fully rewritten before the looser bare-name patterns run.
    subreddit_link = re.sub(format_subreddit_regex["pattern4"], format_subreddit_regex["template"], subreddit_link)
    subreddit_link = re.sub(format_subreddit_regex["pattern3"], format_subreddit_regex["template"], subreddit_link)
    subreddit_link = re.sub(format_subreddit_regex["pattern1"], format_subreddit_regex["template"], subreddit_link)
    subreddit_link = re.sub(format_subreddit_regex["pattern2"], format_subreddit_regex["template"], subreddit_link)
    if not subreddit_link:
        return entry
    entry["subreddit"] = subreddit_link
    return entry
def collapse_links(entry: dict):
    """Collapse a Markdown link whose text equals its target.

    Turns entry["website"] values like "[https://x](https://x)" into the
    bare "https://x". The entry is mutated in place and also returned;
    entries without a "website" key are returned unchanged.
    """
    if "website" not in entry:
        return entry
    website = entry["website"]
    # Run the regex once and reuse the match (the original searched twice).
    match = collapse_links_regex.search(website)
    if match and match.group(1) == match.group(2):
        website = match.group(2)
    entry["website"] = website
    return entry
def remove_extras(entry: dict):
    """Clean every string value of the entry in place and return it.

    Strips leading/trailing whitespace, collapses repeated spaces, blank
    lines and commas, and converts pseudo-empty placeholder strings such as
    "n/a" or "-" into the empty string. Non-string and falsy values are
    left untouched.
    """
    for key in entry:
        if not entry[key] or not isinstance(entry[key], str):
            continue
        # Leading and trailing whitespace (str.strip() is equivalent to the
        # previous anchored ^\s+ / \s+$ substitutions).
        entry[key] = entry[key].strip()
        # Double spaces and commas
        entry[key] = re.sub(r' {2,}', r' ', entry[key])
        entry[key] = re.sub(r'\n{2,}', r' ', entry[key])
        entry[key] = re.sub(r',{2,}', r',', entry[key])
        # Pseudo-empty strings
        if entry[key] in ["n/a", "N/A", "-", "null", "none", "None"]:
            entry[key] = ""
    return entry
def fix_r_caps(entry: dict):
    """Lowercase every "R/" to "r/" in entry["description"] (in place).

    Entries without a "description" key are returned unchanged.
    """
    if "description" not in entry:
        return entry
    # str.replace is equivalent to the previous re.sub(r'R\/', ...) and
    # avoids regex machinery for a fixed-string substitution.
    entry["description"] = entry["description"].replace("R/", "r/")
    return entry
def per_line_entries(entries: list):
    """Serialize a list of entries as JSON with one entry per line.

    Produces the same "[\\n{...},\\n{...}\\n]" shape as before, but via a
    single join instead of quadratic string concatenation, and an empty
    list now yields a valid empty JSON array (the previous slicing
    implementation produced the malformed "\\n]" for an empty input).
    """
    return "[\n" + ",\n".join(json.dumps(entry) for entry in entries) + "\n]"
def format_all(entry: dict, silent=False):
    """Run every formatting pass over a single entry and return it.

    Applies, in order: extras removal, r/ capitalization fix, Markdown
    link collapsing and subreddit normalization. Progress messages are
    printed unless silent is True.
    """
    pipeline = (
        ("Removing extras...", remove_extras),
        ("Fixing r/ capitalization...", fix_r_caps),
        ("Collapsing Markdown links...", collapse_links),
        ("Fix formatting of subreddit...", format_subreddit),
    )
    for message, step in pipeline:
        if not silent:
            print(message)
        entry = step(entry)
    if not silent:
        print("Completed!")
    return entry
if __name__ == '__main__':

    def go(path):
        """Format every entry of the JSON file at *path* and rewrite it."""
        print(f"Formatting {path}...")
        # Open read-only: the previous "r+" write access was never used,
        # since the file is reopened with "w" below.
        with open(path, "r", encoding='UTF-8') as infile:
            entries = json.load(infile)
        for i, entry in enumerate(entries):
            entries[i] = format_all(entry, True)
            # Progress marker every 500 entries (and for entry 0).
            if not (i % 500):
                print(f"{i} checked.")
        print(f"{len(entries)} checked.")
        with open(path, "w", encoding='UTF-8') as outfile:
            outfile.write(per_line_entries(entries))
        print("Writing completed. All done.")

    go("../web/atlas.json")
    go("../web/atlas-before-ids-migration.json")

View file

@ -1,25 +1,26 @@
#!/usr/bin/python
import re
pattern = re.compile(r'\[(.+?)\]\((.+?)\)')
import json
from formatter import collapse_links, per_line_entries
def go(path):
print(f"Fixing {path}...")
print(f"Formatting {path}...")
with open(path, "r+", encoding='UTF-8') as f1:
contents = f1.read()
entries = json.loads(f1.read())
for i in range(2):
for match in pattern.finditer(contents):
if match.group(1) == match.group(2):
contents = contents.replace(match.group(0), match.group(2), 1)
print(f"Stage {i+1} completed.")
for i in range(len(entries)):
entries[i] = collapse_links(entries[i])
if not (i % 500):
print(f"{i} checked.")
print(f"{len(entries)} checked.")
with open(path, "w", encoding='UTF-8') as f2:
f2.write(contents)
f2.write(per_line_entries(entries))
print("Writing completed. All done.")
go("../web/atlas.json")
go("../web/atlas-before-ids-migration.json")
go("../web/atlas-before-ids-migration.json")

View file

@ -1,36 +1,26 @@
#!/usr/bin/python
import re
import json
from formatter import remove_extras, fix_r_caps, per_line_entries
def go(path):
print(f"Fixing {path}...")
print(f"Formatting {path}...")
with open(path, "r+", encoding='UTF-8') as f1:
contents = f1.read()
entries = json.loads(f1.read())
contents = re.sub(r'": "(\s+)', r'": "', contents)
contents = re.sub(r'(\s+)"(, |,|\})', r'"\2', contents)
print("Leading and trailing spaces removed.")
for i in range(len(entries)):
entries[i] = remove_extras(entries[i])
entries[i] = fix_r_caps(entries[i])
if not (i % 500):
print(f"{i} checked.")
contents = re.sub(r' {2,}', r' ', contents)
print("Double spaces removed.")
contents = re.sub(r',{2,}', r',', contents)
print("Double commas removed.")
contents = re.sub(r'"n/a"', '""', contents)
contents = re.sub(r'"N/A"', '""', contents)
contents = re.sub(r'"-"', '""', contents)
contents = re.sub(r'"none"', '""', contents)
contents = re.sub(r'"null"', '""', contents)
print("Psuedo-empty strings converted into empty strings.")
contents = re.sub(r'R\/', 'r/', contents)
print("Capitalization of r/ has been fixed.")
print(f"{len(entries)} checked.")
with open(path, "w", encoding='UTF-8') as f2:
f2.write(contents)
f2.write(per_line_entries(entries))
print("Writing completed. All done.")
go("../web/atlas.json")

View file

@ -4,6 +4,7 @@
import time
import re
import os
from formatter import format_all
outfile = open('temp_atlas.json', 'w', encoding='utf-8')
failfile = open('manual_atlas.json', 'w', encoding='utf-8')
@ -11,14 +12,12 @@
credentials = open('credentials', 'r')
client_id = credentials.readline().strip(' \t\n\r')
client_secret = credentials.readline().strip(' \t\n\r')
user = credentials.readline().strip(' \t\n\r')
pw = credentials.readline().strip(' \t\n\r')
reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent='atlas_bot',username=user,password=pw)
has_write_access = not reddit.read_only
if not has_write_access:
print("Warning: No write access. Post flairs will not be updated")
sleep(5)
reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent='atlas_bot')
failcount = 0
successcount = 0
totalcount = 0
jsonfile = open("../web/atlas.json", "r", encoding='utf-8')
existing = json.load(jsonfile)
@ -28,17 +27,8 @@
for item in existing:
existing_ids.append(item['id'])
def set_flair(submission, flair):
if has_write_access and submission.link_flair_text != flair:
flair_choices = submission.flair.choices()
flair = next(x for x in flair_choices if x["flair_text_editable"] and flair == x["flair_text"])
submission.flair.select(flair["flair_template_id"])
total_all_flairs = 0
duplicate_count = 0
failcount = 0
successcount = 0
totalcount = 0
outfile.write("[\n")
for submission in reddit.subreddit('placeAtlas2').new(limit=2000):
"""
@ -52,8 +42,7 @@ def set_flair(submission, flair):
7. Append to file called "credentials"
8. Copy Secret
9. Append on newline to "credentials" file
10. If you want flair write access append 2 newlines with username and password (Must be a mod, don't do this if you don't know what you're doing)
11. Run Script
10. Run Script
Running Script
1. Input the next ID to use
@ -63,43 +52,41 @@ def set_flair(submission, flair):
"""
total_all_flairs += 1
if (submission.id in existing_ids):
set_flair(submission, "Processed Entry")
print("Found first duplicate!")
duplicate_count += 1
if (duplicate_count > 0):
if (duplicate_count > 10):
break
else:
continue
if(submission.link_flair_text == "New Entry"):
text = submission.selftext
#Old backslash filter:
#text = text.replace("\\", "")
#New one: One \\ escapes a backslash in python's parser
# Two escape it again in the regex parser, so \\\\ is \
# Then anything but " or n is replaced with the first capture group (anything but " or n)
# Test in repl: re.sub("\\\\([^\"n])", "\\1", "\\t < removed slash, t stays and > stays \\n \\\"")
text = re.sub("\\\\([^\"n])", "\\1", text)
submission_json = ""
try:
text = text.replace("\"id\": 0,", "\"id\": 0,\n\t\t\"submitted_by\": \""+submission.author.name+"\",")
except AttributeError:
text = text.replace("\"id\": 0,", "\"id\": 0,\n\t\t\"submitted_by\": \""+"unknown"+"\",")
lines = text.split("\n")
for i, line in enumerate(lines):
if("\"id\": 0" in line):
lines[i] = line.replace("\"id\": 0", "\"id\": "+"\""+str(submission.id)+"\"")
text = "\n".join(lines)
try:
outfile.write(json.dumps(json.loads(text))+" ,\n")
successcount += 1
set_flair(submission, "Processed Entry")
submission_json = json.loads(text)
except json.JSONDecodeError:
failfile.write(text+",\n")
failcount += 1
set_flair(submission, "Rejected Entry")
if (submission_json):
submission_json_dummy = {"id": submission.id, "submitted_by": ""}
try:
submission_json_dummy["submitted_by"] = submission.author.name
except AttributeError:
submission_json_dummy["submitted_by"]
for key in submission_json:
if not key in submission_json_dummy:
submission_json_dummy[key] = submission_json[key];
submission_json = format_all(submission_json_dummy)
outfile.write(json.dumps(json.loads(text))+" ,\n")
successcount += 1
print("written "+submission.id+" submitted "+str(round(time.time()-submission.created_utc))+" seconds ago")
totalcount += 1

View file

@ -1,65 +1,26 @@
#!/usr/bin/python
import re
patternParent = re.compile(r'"subreddit": ?"(?!")(.+?)"')
patternCommatization = re.compile(r',* +')
pattern1 = re.compile(r'\/?[rR]\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?')
pattern2 = re.compile(r'^\/?[rR](?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?')
pattern3 = re.compile(r'(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*')
pattern4 = re.compile(r'\[[A-Za-z0-9][A-Za-z0-9_]{1,20}\]\((?:(?:https:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*\)')
# pattern5 = re.compile(r'(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*')
# pattern6 = re.compile(r'\[(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\]\((?:https:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\)"')
"""
Examples:
1. - /r/place
- r/place
2. /rplace
3. - https://www.reddit.com/r/place
- www.reddit.com/r/place
- reddit.com/r/place
4. - [https://www.reddit.com/r/place](https://www.reddit.com/r/place)
- [www.reddit.com/r/place](www.reddit.com/r/place)
- [reddit.com/r/place](reddit.com/r/place)
UNUSED AND FAULTY
5. - https://place.reddit.com
- place.reddit.com
6. - [https://place.reddit.com](https://place.reddit.com)
- [place.reddit.com](https://place.reddit.com)
"""
def replaceStage1(contents: str):
contents = re.sub(patternCommatization, ', ', contents)
# r/... to /r/.. (change if not needed)
template = r"/r/\1"
contents = re.sub(pattern4, template, contents)
contents = re.sub(pattern3, template, contents)
contents = re.sub(pattern1, template, contents)
contents = re.sub(pattern2, template, contents)
return contents
import json
from formatter import format_subreddit, per_line_entries
def go(path):
print(f"Fixing {path}...")
print(f"Formatting {path}...")
with open(path, "r+", encoding='UTF-8') as f1:
contents = f1.read()
with open(path, "r+", encoding='UTF-8') as f1:
entries = json.loads(f1.read())
# Convert to r/... format first.
for matchParent in patternParent.finditer(contents):
subredditLink = matchParent.group(1)
subredditLink = replaceStage1(subredditLink)
if not subredditLink:
continue
if path == "../web/atlas-before-ids-migration.json":
contents = contents.replace(matchParent.group(0), '"subreddit":"' + subredditLink + '"', 1)
else:
contents = contents.replace(matchParent.group(0), '"subreddit": "' + subredditLink + '"', 1)
for i in range(len(entries)):
entries[i] = format_subreddit(entries[i])
if not (i % 500):
print(f"{i} checked.")
with open(path, "w", encoding='UTF-8') as f2:
f2.write(contents)
print("Writing completed. All done.")
print(f"{len(entries)} checked.")
with open(path, "w", encoding='UTF-8') as f2:
f2.write(per_line_entries(entries))
print("Writing completed. All done.")
go("../web/atlas.json")
go("../web/atlas-before-ids-migration.json")