Merge pull request #1074 from placeAtlas/cleanup

commit 23db7f330d
Nicolas Abram 2022-04-08 11:01:42 -03:00, committed by GitHub
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 975 additions and 2166 deletions

3
.gitignore vendored

@@ -12,4 +12,5 @@ allCharacters.txt
 combined.js
 *.DS_Store
 .vscode/
 _img/place/
+web/atlas-before-ids-migration.json

213
tools/formatter.py Normal file

@@ -0,0 +1,213 @@
#!/usr/bin/python
import re
import json
"""
Examples:
1. - /r/place
- r/place
2. /rplace
3. - https://www.reddit.com/r/place
- www.reddit.com/r/place
- reddit.com/r/place
UNUSED AND FAULTY
4. - https://place.reddit.com
- place.reddit.com
5. - [https://place.reddit.com](https://place.reddit.com)
- [place.reddit.com](https://place.reddit.com)
"""
FS_REGEX = {
"commatization": r'( *(,+ +|,+ |,+)| +)(and|&|;)( *(,+ +|,+ |,+)| +)|, *$| +',
"pattern1": r'\/*[rR]\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
"pattern2": r'^\/*[rR](?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
"pattern3": r'(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*',
"pattern1user": r'\/*(?:u|user)\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
"pattern2user": r'^\/*(?:u|user)(?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
"pattern3user": r'(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/(?:u|user)\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*',
# "pattern4": r'(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*',
# "pattern5": r'\[(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\]\((?:https:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\)"',
}
VALIDATE_REGEX = {
"subreddit": r'^ *\/?r\/([A-Za-z0-9][A-Za-z0-9_]{1,20}) *(, *\/?r\/([A-Za-z0-9][A-Za-z0-9_]{1,20}) *)*$|^$',
"website": r'^https?://[^\s/$.?#].[^\s]*$|^$'
}
CL_REGEX = r'\[(.+?)\]\((.+?)\)'
CWTS_REGEX = r'^(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/)$'
CSTW_REGEX = {
"website": r'^https?://[^\s/$.?#].[^\s]*$',
"user": r'^\/*u\/([A-Za-z0-9][A-Za-z0-9_]{1,20})$'
}
# r/... to /r/...
SUBREDDIT_TEMPLATE = r"/r/\1"
USER_TEMPLATE = r"/u/\1"
def format_subreddit(entry: dict):
    if "subreddit" not in entry or not entry['subreddit']:
        return entry
    subredditLink = entry["subreddit"]
    subredditLink = re.sub(FS_REGEX["commatization"], ', ', subredditLink)
    subredditLink = re.sub(FS_REGEX["pattern3"], SUBREDDIT_TEMPLATE, subredditLink)
    subredditLink = re.sub(FS_REGEX["pattern1"], SUBREDDIT_TEMPLATE, subredditLink)
    subredditLink = re.sub(FS_REGEX["pattern2"], SUBREDDIT_TEMPLATE, subredditLink)
    subredditLink = re.sub(FS_REGEX["pattern3user"], USER_TEMPLATE, subredditLink)
    subredditLink = re.sub(FS_REGEX["pattern1user"], USER_TEMPLATE, subredditLink)
    subredditLink = re.sub(FS_REGEX["pattern2user"], USER_TEMPLATE, subredditLink)
    if not subredditLink:
        return entry
    entry["subreddit"] = subredditLink
    return entry
def collapse_links(entry: dict):
    if "website" not in entry or not entry['website']:
        return entry
    website = entry["website"]
    match = re.search(CL_REGEX, website)
    if match and match.group(1) == match.group(2):
        website = match.group(2)
    entry["website"] = website
    return entry
def remove_extras(entry: dict):
    if "subreddit" in entry and entry["subreddit"]:
        # if not entry["subreddit"].startswith('/r/'):
        #     entry["subreddit"] = re.sub(r'^(.*)(?=\/r\/)', r'', entry["subreddit"])
        entry["subreddit"] = re.sub(r'[.,]+$', r'', entry["subreddit"])
    for key in entry:
        if not entry[key] or not isinstance(entry[key], str):
            continue
        # Leading and trailing spaces
        entry[key] = entry[key].strip()
        # Doubled-up characters
        entry[key] = re.sub(r' {2,}(?!\n)', r' ', entry[key])
        entry[key] = re.sub(r' {3,}\n', r' ', entry[key])
        entry[key] = re.sub(r'\n{3,}', r'\n\n', entry[key])
        entry[key] = re.sub(r'r\/{2,}', r'r\/', entry[key])
        entry[key] = re.sub(r',{2,}', r',', entry[key])
        # Pseudo-empty strings
        if entry[key] in ["n/a", "N/A", "na", "NA", "-", "null", "none", "None"]:
            entry[key] = ""
    return entry
def fix_r_caps(entry: dict):
    if "description" not in entry or not entry['description']:
        return entry
    # Raw strings so \1 is a backreference, not the control character \x01
    entry["description"] = re.sub(r'([^\w]|^)\/R\/', r'\1/r/', entry["description"])
    entry["description"] = re.sub(r'([^\w]|^)R\/', r'\1r/', entry["description"])
    return entry
def fix_no_protocol_urls(entry: dict):
    if "website" not in entry or not entry['website']:
        return entry
    if not entry["website"].startswith("http"):
        entry["website"] = "https://" + entry["website"]
    return entry
def convert_website_to_subreddit(entry: dict):
    if "website" not in entry or not entry['website']:
        return entry
    if re.match(CWTS_REGEX, entry["website"]):
        new_subreddit = re.sub(CWTS_REGEX, SUBREDDIT_TEMPLATE, entry["website"])
        if "subreddit" in entry and new_subreddit.lower() == entry["subreddit"].lower():
            entry["website"] = ""
        elif "subreddit" not in entry or entry['subreddit'] == "":
            entry["subreddit"] = new_subreddit
            entry["website"] = ""
    return entry
def convert_subreddit_to_website(entry: dict):
    if "subreddit" not in entry or not entry['subreddit']:
        return entry
    if re.match(CSTW_REGEX["website"], entry["subreddit"]):
        if "website" in entry and entry["website"].lower() == entry["subreddit"].lower():
            entry["subreddit"] = ""
        elif "website" not in entry or entry['website'] == "":
            entry["website"] = entry["subreddit"]
            entry["subreddit"] = ""
    elif re.match(CSTW_REGEX["user"], entry["subreddit"]):
        if "website" not in entry or entry['website'] == "":
            username = re.match(CSTW_REGEX["user"], entry["subreddit"]).group(1)
            entry["website"] = "https://www.reddit.com/user/" + username
            entry["subreddit"] = ""
    return entry
def validate(entry: dict):
    if "id" not in entry or (not entry['id'] and entry['id'] != 0):
        print(f"Wait, no id here! How did this happen? {entry}")
        return
    for key in entry:
        if key in VALIDATE_REGEX and not re.match(VALIDATE_REGEX[key], entry[key]):
            print(f"{key} of entry {entry['id']} is still invalid! {entry[key]}")
def per_line_entries(entries: list):
    out = "[\n"
    for entry in entries:
        out += json.dumps(entry) + ",\n"
    out = out[:-2] + "\n]"
    return out
def format_all(entry: dict, silent=False):
    def print_(*args, **kwargs):
        if not silent:
            print(*args, **kwargs)
    print_("Fixing r/ capitalization...")
    entry = fix_r_caps(entry)
    print_("Fixing links without protocol...")
    entry = fix_no_protocol_urls(entry)
    print_("Fixing subreddit formatting...")
    entry = format_subreddit(entry)
    print_("Collapsing Markdown links...")
    entry = collapse_links(entry)
    print_("Converting website links to subreddit (if possible)...")
    entry = convert_website_to_subreddit(entry)
    print_("Converting subreddit links to website (if needed)...")
    entry = convert_subreddit_to_website(entry)
    print_("Removing extras...")
    entry = remove_extras(entry)
    print_("Validating...")
    validate(entry)
    print_("Completed!")
    return entry
if __name__ == '__main__':
    def go(path):
        print(f"Formatting {path}...")
        with open(path, "r+", encoding='UTF-8') as f1:
            entries = json.loads(f1.read())
        for i in range(len(entries)):
            entries[i] = format_all(entries[i], True)
            if not (i % 500):
                print(f"{i} checked.")
        print(f"{len(entries)} checked.")
        with open(path, "w", encoding='UTF-8') as f2:
            f2.write(per_line_entries(entries))
        print("Writing completed. All done.")
    go("../web/atlas.json")
    go("../web/atlas-before-ids-migration.json")


@@ -1,25 +0,0 @@
#!/usr/bin/python
import re
pattern = re.compile(r'\[(.+?)\]\((.+?)\)')
def go(path):
    print(f"Fixing {path}...")
    with open(path, "r+", encoding='UTF-8') as f1:
        contents = f1.read()
    for i in range(2):
        for match in pattern.finditer(contents):
            if match.group(1) == match.group(2):
                contents = contents.replace(match.group(0), match.group(2), 1)
        print(f"Stage {i+1} completed.")
    with open(path, "w", encoding='UTF-8') as f2:
        f2.write(contents)
    print("Writing completed. All done.")
go("../web/atlas.json")
go("../web/atlas-before-ids-migration.json")


@@ -1,37 +0,0 @@
#!/usr/bin/python
import re
def go(path):
    print(f"Fixing {path}...")
    with open(path, "r+", encoding='UTF-8') as f1:
        contents = f1.read()
    contents = re.sub(r'": "(\s+)', r'": "', contents)
    contents = re.sub(r'(\s+)"(, |,|\})', r'"\2', contents)
    print("Leading and trailing spaces removed.")
    contents = re.sub(r' {2,}', r' ', contents)
    print("Double spaces removed.")
    contents = re.sub(r',{2,}', r',', contents)
    print("Double commas removed.")
    contents = re.sub(r'"n/a"', '""', contents)
    contents = re.sub(r'"N/A"', '""', contents)
    contents = re.sub(r'"-"', '""', contents)
    contents = re.sub(r'"none"', '""', contents)
    contents = re.sub(r'"null"', '""', contents)
    print("Pseudo-empty strings converted into empty strings.")
    contents = re.sub(r'R\/', 'r/', contents)
    print("Capitalization of r/ has been fixed.")
    with open(path, "w", encoding='UTF-8') as f2:
        f2.write(contents)
    print("Writing completed. All done.")
go("../web/atlas.json")
go("../web/atlas-before-ids-migration.json")


@@ -1,24 +1,33 @@
 import praw
 import json
 import time
 import re
 import os
+import traceback
+from formatter import format_all
 
 outfile = open('temp_atlas.json', 'w', encoding='utf-8')
 failfile = open('manual_atlas.json', 'w', encoding='utf-8')
 
-credentials = open('credentials', 'r')
-client_id = credentials.readline().strip(' \t\n\r')
-client_secret = credentials.readline().strip(' \t\n\r')
-user = credentials.readline().strip(' \t\n\r')
-pw = credentials.readline().strip(' \t\n\r')
+with open('credentials', 'r') as file:
+    credentials = file.readlines()
+    client_id = credentials[0].strip()
+    client_secret = credentials[1].strip()
+    username = credentials[2].strip()
+    password = credentials[3].strip()
 
-reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent='atlas_bot',username=user,password=pw)
+reddit = praw.Reddit(
+    client_id=client_id,
+    client_secret=client_secret,
+    username=username,
+    password=password,
+    user_agent='atlas_bot'
+)
 
 has_write_access = not reddit.read_only
 if not has_write_access:
-    print("Warning: No write access. Post flairs will not be updated")
-    sleep(5)
+    print("Warning: No write access. Post flairs will not be updated.")
+    time.sleep(5)
 
 jsonfile = open("../web/atlas.json", "r", encoding='utf-8')
 existing = json.load(jsonfile)

@@ -39,6 +48,7 @@ def set_flair(submission, flair):
 failcount = 0
 successcount = 0
 totalcount = 0
+
 outfile.write("[\n")
 for submission in reddit.subreddit('placeAtlas2').new(limit=2000):
     """

@@ -48,12 +58,14 @@ def set_flair(submission, flair):
     3. Give it a name and description
     4. Select "script"
     5. Redirect to http://localhost:8080
-    6. Copy ID (under Personal Use Script)
-    7. Append to file called "credentials"
-    8. Copy Secret
-    9. Append on newline to "credentials" file
-    10. If you want flair write access append 2 newlines with username and password (Must be a mod, don't do this if you don't know what you're doing)
-    11. Run Script
+    6. Create file "credentials" with the format below.
+       [ID] <- Under "personal use script"
+       [Secret]
+       [Username] <- Must be a mod, don't do this if you
+       [Password] <- don't know what you are doing.
+    7. Run Script
 
     Running Script
     1. Input the next ID to use

@@ -63,6 +75,7 @@ def set_flair(submission, flair):
     """
 
     total_all_flairs += 1
+
     if (submission.id in existing_ids):
         set_flair(submission, "Processed Entry")
         print("Found first duplicate!")

@@ -71,40 +84,59 @@ def set_flair(submission, flair):
             break
         else:
             continue
-    if(submission.link_flair_text == "New Entry"):
-        text = submission.selftext
-        # Old backslash filter:
-        # text = text.replace("\\", "")
-        # New one: one \\ escapes a backslash in Python's parser,
-        # two escape it again in the regex parser, so \\\\ is \.
-        # Then anything but " or n is replaced with the first capture group (anything but " or n).
-        # Test in repl: re.sub("\\\\([^\"n])", "\\1", "\\t < removed slash, t stays and > stays \\n \\\"")
-        text = re.sub("\\\\([^\"n])", "\\1", text)
-        try:
-            text = text.replace("\"id\": 0,", "\"id\": 0,\n\t\t\"submitted_by\": \""+submission.author.name+"\",")
-        except AttributeError:
-            text = text.replace("\"id\": 0,", "\"id\": 0,\n\t\t\"submitted_by\": \""+"unknown"+"\",")
-        lines = text.split("\n")
-        for i, line in enumerate(lines):
-            if ("\"id\": 0" in line):
-                lines[i] = line.replace("\"id\": 0", "\"id\": "+"\""+str(submission.id)+"\"")
-        text = "\n".join(lines)
-        try:
-            outfile.write(json.dumps(json.loads(text))+" ,\n")
-            successcount += 1
-            set_flair(submission, "Processed Entry")
-        except json.JSONDecodeError:
-            failfile.write(text+",\n")
+    if (submission.link_flair_text == "New Entry"):
+        try:
+            text = submission.selftext
+            rawtext = text
+            text = text.replace('\u200c', '')
+            text = re.compile(r".*(\{.+\}).*", re.DOTALL).search(text).group(1)
+            # Test if it needs to escape the escape character. Usually happens on fancy mode.
+            try: json.loads(text)
+            except json.JSONDecodeError: text = re.sub(r"\\(.)", r"\1", text)
+            submission_json = json.loads(text)
+            if submission_json:
+                # Assert that the path is not empty
+                assert len(submission_json["path"]) > 0
+                submission_json_dummy = {"id": submission.id, "submitted_by": ""}
+                try:
+                    submission_json_dummy["submitted_by"] = submission.author.name
+                except AttributeError:
+                    submission_json_dummy["submitted_by"] = "unknown"
+                for key in submission_json:
+                    if key not in submission_json_dummy:
+                        submission_json_dummy[key] = submission_json[key]
+                submission_json = format_all(submission_json_dummy, True)
+                outfile.write(json.dumps(submission_json) + ",\n")
+                successcount += 1
+                set_flair(submission, "Processed Entry")
+        except Exception:
+            failfile.write(
+                "\n\n" + "="*40 + "\n\n" +
+                submission.id + "\n\n" +
+                traceback.format_exc() + "\n\n" +
+                "==== RAW ====" + "\n\n" +
+                rawtext + "\n\n" +
+                "==== CLEAN ====" + "\n\n" +
+                text + "\n\n"
+            )
             failcount += 1
             set_flair(submission, "Rejected Entry")
-        print("written "+submission.id+" submitted "+str(round(time.time()-submission.created_utc))+" seconds ago")
+        print("Wrote "+submission.id+", submitted "+str(round(time.time()-submission.created_utc))+" seconds ago")
         totalcount += 1
 
-# Remove ,\n
-outfile.seek(outfile.tell()-4, os.SEEK_SET)
+# Remove last trailing comma
+outfile.seek(outfile.tell()-3, os.SEEK_SET)
 outfile.truncate()
 outfile.write("\n]")
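The reworked parsing path is easier to follow on a made-up selftext (the string below is invented; \u200c is the zero-width non-joiner Reddit's fancy-pants editor can inject):

# Illustrative only: extracting and repairing the JSON blob from a post body
import json
import re

selftext = 'Here is my entry:\n{\\"id\\": 0, \\"name\\": \\"Example\\"}'
text = selftext.replace('\u200c', '')
# Grab the outermost {...} block, dropping any prose around it
text = re.compile(r".*(\{.+\}).*", re.DOTALL).search(text).group(1)
try:
    json.loads(text)
except json.JSONDecodeError:
    # Escaped quotes from the fancy-pants editor: strip one backslash layer
    text = re.sub(r"\\(.)", r"\1", text)
submission_json = json.loads(text)   # {'id': 0, 'name': 'Example'}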


@@ -1,65 +0,0 @@
#!/usr/bin/python
import re
patternParent = re.compile(r'"subreddit": ?"(?!")(.+?)"')
patternCommatization = re.compile(r',* +')
pattern1 = re.compile(r'\/?[rR]\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?')
pattern2 = re.compile(r'^\/?[rR](?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?')
pattern3 = re.compile(r'(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*')
pattern4 = re.compile(r'\[[A-Za-z0-9][A-Za-z0-9_]{1,20}\]\((?:(?:https:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*\)')
# pattern5 = re.compile(r'(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*')
# pattern6 = re.compile(r'\[(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\]\((?:https:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\)"')
"""
Examples:
1. - /r/place
- r/place
2. /rplace
3. - https://www.reddit.com/r/place
- www.reddit.com/r/place
- reddit.com/r/place
4. - [https://www.reddit.com/r/place](https://www.reddit.com/r/place)
- [www.reddit.com/r/place](www.reddit.com/r/place)
- [reddit.com/r/place](reddit.com/r/place)
UNUSED AND FAULTY
5. - https://place.reddit.com
- place.reddit.com
6. - [https://place.reddit.com](https://place.reddit.com)
- [place.reddit.com](https://place.reddit.com)
"""
def replaceStage1(contents: str):
    contents = re.sub(patternCommatization, ', ', contents)
    # r/... to /r/... (change if not needed)
    template = r"/r/\1"
    contents = re.sub(pattern4, template, contents)
    contents = re.sub(pattern3, template, contents)
    contents = re.sub(pattern1, template, contents)
    contents = re.sub(pattern2, template, contents)
    return contents
def go(path):
    print(f"Fixing {path}...")
    with open(path, "r+", encoding='UTF-8') as f1:
        contents = f1.read()
    # Convert to /r/... format first.
    for matchParent in patternParent.finditer(contents):
        subredditLink = matchParent.group(1)
        subredditLink = replaceStage1(subredditLink)
        if not subredditLink:
            continue
        if path == "../web/atlas-before-ids-migration.json":
            contents = contents.replace(matchParent.group(0), '"subreddit":"' + subredditLink + '"', 1)
        else:
            contents = contents.replace(matchParent.group(0), '"subreddit": "' + subredditLink + '"', 1)
    with open(path, "w", encoding='UTF-8') as f2:
        f2.write(contents)
    print("Writing completed. All done.")
go("../web/atlas.json")
go("../web/atlas-before-ids-migration.json")

File diff suppressed because it is too large

File diff suppressed because it is too large