Merge pull request #1074 from placeAtlas/cleanup

Nicolas Abram 2022-04-08 11:01:42 -03:00 committed by GitHub
commit 23db7f330d
8 changed files with 975 additions and 2166 deletions

1
.gitignore vendored

@@ -13,3 +13,4 @@ combined.js
*.DS_Store
.vscode/
_img/place/
web/atlas-before-ids-migration.json

213
tools/formatter.py Normal file

@@ -0,0 +1,213 @@
#!/usr/bin/python

import re
import json

"""
Examples:
1. - /r/place
   - r/place
2. /rplace
3. - https://www.reddit.com/r/place
   - www.reddit.com/r/place
   - reddit.com/r/place
UNUSED AND FAULTY
4. - https://place.reddit.com
   - place.reddit.com
5. - [https://place.reddit.com](https://place.reddit.com)
   - [place.reddit.com](https://place.reddit.com)
"""
FS_REGEX = {
    "commatization": r'( *(,+ +|,+ |,+)| +)(and|&|;)( *(,+ +|,+ |,+)| +)|, *$| +',
    "pattern1": r'\/*[rR]\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
    "pattern2": r'^\/*[rR](?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
    "pattern3": r'(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*',
    "pattern1user": r'\/*(?:u|user)\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
    "pattern2user": r'^\/*(?:u|user)(?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
    "pattern3user": r'(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/(?:u|user)\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*',
    # "pattern4": r'(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*',
    # "pattern5": r'\[(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\]\((?:https:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\)"',
}

VALIDATE_REGEX = {
    "subreddit": r'^ *\/?r\/([A-Za-z0-9][A-Za-z0-9_]{1,20}) *(, *\/?r\/([A-Za-z0-9][A-Za-z0-9_]{1,20}) *)*$|^$',
    "website": r'^https?://[^\s/$.?#].[^\s]*$|^$'
}

CL_REGEX = r'\[(.+?)\]\((.+?)\)'
CWTS_REGEX = r'^(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/)$'
CSTW_REGEX = {
    "website": r'^https?://[^\s/$.?#].[^\s]*$',
    "user": r'^\/*u\/([A-Za-z0-9][A-Za-z0-9_]{1,20})$'
}

# r/... to /r/...
SUBREDDIT_TEMPLATE = r"/r/\1"
USER_TEMPLATE = r"/u/\1"

def format_subreddit(entry: dict):
    if "subreddit" not in entry or not entry['subreddit']:
        return entry
    subredditLink = entry["subreddit"]
    subredditLink = re.sub(FS_REGEX["commatization"], ', ', subredditLink)
    subredditLink = re.sub(FS_REGEX["pattern3"], SUBREDDIT_TEMPLATE, subredditLink)
    subredditLink = re.sub(FS_REGEX["pattern1"], SUBREDDIT_TEMPLATE, subredditLink)
    subredditLink = re.sub(FS_REGEX["pattern2"], SUBREDDIT_TEMPLATE, subredditLink)
    subredditLink = re.sub(FS_REGEX["pattern3user"], USER_TEMPLATE, subredditLink)
    subredditLink = re.sub(FS_REGEX["pattern1user"], USER_TEMPLATE, subredditLink)
    subredditLink = re.sub(FS_REGEX["pattern2user"], USER_TEMPLATE, subredditLink)
    if not subredditLink:
        return entry
    entry["subreddit"] = subredditLink
    return entry

def collapse_links(entry: dict):
    if "website" not in entry or not entry['website']:
        return entry
    website = entry["website"]
    match = re.search(CL_REGEX, website)
    if match and match.group(1) == match.group(2):
        website = match.group(2)
    entry["website"] = website
    return entry

def remove_extras(entry: dict):
    if "subreddit" in entry and entry["subreddit"]:
        # if not entry["subreddit"].startswith('/r/'):
        #     entry["subreddit"] = re.sub(r'^(.*)(?=\/r\/)', r'', entry["subreddit"])
        entry["subreddit"] = re.sub(r'[.,]+$', r'', entry["subreddit"])
    for key in entry:
        if not entry[key] or not isinstance(entry[key], str):
            continue
        # Leading and trailing spaces
        entry[key] = entry[key].strip()
        # Double characters
        entry[key] = re.sub(r' {2,}(?!\n)', r' ', entry[key])
        entry[key] = re.sub(r' {3,}\n', r' ', entry[key])
        entry[key] = re.sub(r'\n{3,}', r'\n\n', entry[key])
        entry[key] = re.sub(r'r\/{2,}', r'r\/', entry[key])
        entry[key] = re.sub(r',{2,}', r',', entry[key])
        # Pseudo-empty strings
        if entry[key] in ["n/a", "N/A", "na", "NA", "-", "null", "none", "None"]:
            entry[key] = ""
    return entry

def fix_r_caps(entry: dict):
    if "description" not in entry or not entry['description']:
        return entry
    # Raw strings here: a non-raw '\1' would insert the literal character \x01.
    entry["description"] = re.sub(r'([^\w]|^)\/R\/', r'\1/r/', entry["description"])
    entry["description"] = re.sub(r'([^\w]|^)R\/', r'\1r/', entry["description"])
    return entry

def fix_no_protocol_urls(entry: dict):
    if "website" not in entry or not entry['website']:
        return entry
    if not entry["website"].startswith("http"):
        entry["website"] = "https://" + entry["website"]
    return entry

def convert_website_to_subreddit(entry: dict):
    if "website" not in entry or not entry['website']:
        return entry
    if re.match(CWTS_REGEX, entry["website"]):
        new_subreddit = re.sub(CWTS_REGEX, SUBREDDIT_TEMPLATE, entry["website"])
        # .get avoids a KeyError when the entry has no subreddit field at all.
        if new_subreddit.lower() == entry.get("subreddit", "").lower():
            entry["website"] = ""
        elif "subreddit" not in entry or entry['subreddit'] == "":
            entry["subreddit"] = new_subreddit
            entry["website"] = ""
    return entry

def convert_subreddit_to_website(entry: dict):
    if "subreddit" not in entry or not entry['subreddit']:
        return entry
    if re.match(CSTW_REGEX["website"], entry["subreddit"]):
        # .get avoids a KeyError when the entry has no website field at all.
        if entry.get("website", "").lower() == entry["subreddit"].lower():
            entry["subreddit"] = ""
        elif "website" not in entry or entry['website'] == "":
            entry["website"] = entry["subreddit"]
            entry["subreddit"] = ""
    elif re.match(CSTW_REGEX["user"], entry["subreddit"]):
        if "website" not in entry or entry['website'] == "":
            username = re.match(CSTW_REGEX["user"], entry["subreddit"]).group(1)
            entry["website"] = "https://www.reddit.com/user/" + username
            entry["subreddit"] = ""
    return entry

def validate(entry: dict):
    if "id" not in entry or (not entry['id'] and entry['id'] != 0):
        print(f"Wait, no id here! How did this happen? {entry}")
        return
    for key in entry:
        if key in VALIDATE_REGEX and not re.match(VALIDATE_REGEX[key], entry[key]):
            print(f"{key} of entry {entry['id']} is still invalid! {entry[key]}")

def per_line_entries(entries: list):
    out = "[\n"
    for entry in entries:
        out += json.dumps(entry) + ",\n"
    out = out[:-2] + "\n]"
    return out

def format_all(entry: dict, silent=False):
    def print_(*args, **kwargs):
        if not silent:
            print(*args, **kwargs)
    print_("Fixing r/ capitalization...")
    entry = fix_r_caps(entry)
    print_("Fixing links without protocol...")
    entry = fix_no_protocol_urls(entry)
    print_("Fixing formatting of subreddit...")
    entry = format_subreddit(entry)
    print_("Collapsing Markdown links...")
    entry = collapse_links(entry)
    print_("Converting website links to subreddit (if possible)...")
    entry = convert_website_to_subreddit(entry)
    print_("Converting subreddit links to website (if needed)...")
    entry = convert_subreddit_to_website(entry)
    print_("Removing extras...")
    entry = remove_extras(entry)
    print_("Validating...")
    validate(entry)
    print_("Completed!")
    return entry

if __name__ == '__main__':
    def go(path):
        print(f"Formatting {path}...")
        with open(path, "r+", encoding='UTF-8') as f1:
            entries = json.loads(f1.read())
        for i in range(len(entries)):
            entries[i] = format_all(entries[i], True)
            if not (i % 500):
                print(f"{i} checked.")
        print(f"{len(entries)} checked.")
        with open(path, "w", encoding='UTF-8') as f2:
            f2.write(per_line_entries(entries))
        print("Writing completed. All done.")

    go("../web/atlas.json")
    go("../web/atlas-before-ids-migration.json")


@@ -1,25 +0,0 @@
#!/usr/bin/python

import re

pattern = re.compile(r'\[(.+?)\]\((.+?)\)')

def go(path):
    print(f"Fixing {path}...")
    with open(path, "r+", encoding='UTF-8') as f1:
        contents = f1.read()
        for i in range(2):
            for match in pattern.finditer(contents):
                if match.group(1) == match.group(2):
                    contents = contents.replace(match.group(0), match.group(2), 1)
            print(f"Stage {i+1} completed.")
    with open(path, "w", encoding='UTF-8') as f2:
        f2.write(contents)
    print("Writing completed. All done.")

go("../web/atlas.json")
go("../web/atlas-before-ids-migration.json")


@@ -1,37 +0,0 @@
#!/usr/bin/python

import re

def go(path):
    print(f"Fixing {path}...")
    with open(path, "r+", encoding='UTF-8') as f1:
        contents = f1.read()
        contents = re.sub(r'": "(\s+)', r'": "', contents)
        contents = re.sub(r'(\s+)"(, |,|\})', r'"\2', contents)
        print("Leading and trailing spaces removed.")
        contents = re.sub(r' {2,}', r' ', contents)
        print("Double spaces removed.")
        contents = re.sub(r',{2,}', r',', contents)
        print("Double commas removed.")
        contents = re.sub(r'"n/a"', '""', contents)
        contents = re.sub(r'"N/A"', '""', contents)
        contents = re.sub(r'"-"', '""', contents)
        contents = re.sub(r'"none"', '""', contents)
        contents = re.sub(r'"null"', '""', contents)
        print("Pseudo-empty strings converted into empty strings.")
        contents = re.sub(r'R\/', 'r/', contents)
        print("Capitalization of r/ has been fixed.")
    with open(path, "w", encoding='UTF-8') as f2:
        f2.write(contents)
    print("Writing completed. All done.")

go("../web/atlas.json")
go("../web/atlas-before-ids-migration.json")


@@ -1,24 +1,33 @@
import praw
import json
import time
import re
import os
import traceback
from formatter import format_all

outfile = open('temp_atlas.json', 'w', encoding='utf-8')
failfile = open('manual_atlas.json', 'w', encoding='utf-8')

-credentials = open('credentials', 'r')
-client_id = credentials.readline().strip(' \t\n\r')
-client_secret = credentials.readline().strip(' \t\n\r')
-user = credentials.readline().strip(' \t\n\r')
-pw = credentials.readline().strip(' \t\n\r')
-reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent='atlas_bot',username=user,password=pw)
+with open('credentials', 'r') as file:
+    credentials = file.readlines()
+    client_id = credentials[0].strip()
+    client_secret = credentials[1].strip()
+    username = credentials[2].strip()
+    password = credentials[3].strip()
+
+reddit = praw.Reddit(
+    client_id=client_id,
+    client_secret=client_secret,
+    username=username,
+    password=password,
+    user_agent='atlas_bot'
+)

has_write_access = not reddit.read_only
if not has_write_access:
-    print("Warning: No write access. Post flairs will not be updated")
-    sleep(5)
+    print("Warning: No write access. Post flairs will not be updated.")
+    time.sleep(5)

jsonfile = open("../web/atlas.json", "r", encoding='utf-8')
existing = json.load(jsonfile)
@@ -39,6 +48,7 @@ def set_flair(submission, flair):
failcount = 0
successcount = 0
totalcount = 0
outfile.write("[\n")
for submission in reddit.subreddit('placeAtlas2').new(limit=2000):
"""
@ -48,12 +58,14 @@ def set_flair(submission, flair):
3. Give it a name and description
4. Select "script"
5. Redirect to http://localhost:8080
6. Copy ID (under Personal Use Script)
7. Append to file called "credentials"
8. Copy Secret
9. Append on newline to "credentials" file
10. If you want flair write access append 2 newlines with username and password (Must be a mod, don't do this if you don't know what you're doing)
11. Run Script
6. Create file "credentials" with the format below.
[ID] <- Under "personal use script"
[Secret]
[Username] <- Must be a mod, don't do this if you │
[Password] <- don't know what you are doing. │
7. Run Script
Running Script
1. Input the next ID to use
@@ -63,6 +75,7 @@ def set_flair(submission, flair):

    """

    total_all_flairs += 1
    if (submission.id in existing_ids):
        set_flair(submission, "Processed Entry")
        print("Found first duplicate!")
@@ -71,40 +84,59 @@ def set_flair(submission, flair):

            break
        else:
            continue

-   if(submission.link_flair_text == "New Entry"):
+   if (submission.link_flair_text == "New Entry"):
        try:
            text = submission.selftext
            # Old backslash filter:
            # text = text.replace("\\", "")
            # New one: one \\ escapes a backslash in Python's parser,
            # and two escape it again in the regex parser, so \\\\ is \.
            # Then anything but " or n is replaced with the first capture group (anything but " or n).
            # Test in a REPL: re.sub("\\\\([^\"n])", "\\1", "\\t < removed slash, t stays and > stays \\n \\\"")
            text = re.sub("\\\\([^\"n])", "\\1", text)
            rawtext = text
            text = text.replace('\u200c', '')
            text = re.compile(r".*(\{.+\}).*", re.DOTALL).search(text).group(1)
            # Test if it needs to unescape the escape character. Usually happens on fancy mode.
            try: json.loads(text)
            except json.JSONDecodeError: text = re.sub(r"\\(.)", r"\1", text)
            submission_json = json.loads(text)

            if submission_json:
                # Assert that the path is not empty.
                assert len(submission_json["path"]) > 0

                submission_json_dummy = {"id": submission.id, "submitted_by": ""}
                try:
-                   text = text.replace("\"id\": 0,", "\"id\": 0,\n\t\t\"submitted_by\": \""+submission.author.name+"\",")
+                   submission_json_dummy["submitted_by"] = submission.author.name
                except AttributeError:
-                   text = text.replace("\"id\": 0,", "\"id\": 0,\n\t\t\"submitted_by\": \""+"unknown"+"\",")
+                   submission_json_dummy["submitted_by"] = "unknown"
                for key in submission_json:
                    if key not in submission_json_dummy:
                        submission_json_dummy[key] = submission_json[key]
                submission_json = format_all(submission_json_dummy, True)

-               lines = text.split("\n")
-               for i, line in enumerate(lines):
-                   if("\"id\": 0" in line):
-                       lines[i] = line.replace("\"id\": 0", "\"id\": "+"\""+str(submission.id)+"\"")
-               text = "\n".join(lines)

                try:
-                   outfile.write(json.dumps(json.loads(text))+" ,\n")
+                   outfile.write(json.dumps(submission_json) + ",\n")
                    successcount += 1
                    set_flair(submission, "Processed Entry")
-               except json.JSONDecodeError:
-                   failfile.write(text+",\n")
+               except Exception as e:
+                   failfile.write(
+                       "\n\n" + "="*40 + "\n\n" +
+                       submission.id + "\n\n" +
+                       traceback.format_exc() + "\n\n" +
+                       "==== RAW ====" + "\n\n" +
+                       rawtext + "\n\n" +
+                       "==== CLEAN ====" + "\n\n" +
+                       text + "\n\n"
+                   )
                    failcount += 1
                    set_flair(submission, "Rejected Entry")

-       print("written "+submission.id+" submitted "+str(round(time.time()-submission.created_utc))+" seconds ago")
+       print("Wrote "+submission.id+", submitted "+str(round(time.time()-submission.created_utc))+" seconds ago")
        totalcount += 1

-# Remove ,\n
-outfile.seek(outfile.tell()-4, os.SEEK_SET)
+# Remove the last trailing comma.
+outfile.seek(outfile.tell()-3, os.SEEK_SET)
outfile.truncate()
outfile.write("\n]")


@@ -1,65 +0,0 @@
#!/usr/bin/python

import re

patternParent = re.compile(r'"subreddit": ?"(?!")(.+?)"')
patternCommatization = re.compile(r',* +')
pattern1 = re.compile(r'\/?[rR]\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?')
pattern2 = re.compile(r'^\/?[rR](?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?')
pattern3 = re.compile(r'(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*')
pattern4 = re.compile(r'\[[A-Za-z0-9][A-Za-z0-9_]{1,20}\]\((?:(?:https:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*\)')
# pattern5 = re.compile(r'(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*')
# pattern6 = re.compile(r'\[(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\]\((?:https:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\)"')

"""
Examples:
1. - /r/place
   - r/place
2. /rplace
3. - https://www.reddit.com/r/place
   - www.reddit.com/r/place
   - reddit.com/r/place
4. - [https://www.reddit.com/r/place](https://www.reddit.com/r/place)
   - [www.reddit.com/r/place](www.reddit.com/r/place)
   - [reddit.com/r/place](reddit.com/r/place)
UNUSED AND FAULTY
5. - https://place.reddit.com
   - place.reddit.com
6. - [https://place.reddit.com](https://place.reddit.com)
   - [place.reddit.com](https://place.reddit.com)
"""

def replaceStage1(contents: str):
    contents = re.sub(patternCommatization, ', ', contents)
    # r/... to /r/... (change if not needed)
    template = r"/r/\1"
    contents = re.sub(pattern4, template, contents)
    contents = re.sub(pattern3, template, contents)
    contents = re.sub(pattern1, template, contents)
    contents = re.sub(pattern2, template, contents)
    return contents

def go(path):
    print(f"Fixing {path}...")
    with open(path, "r+", encoding='UTF-8') as f1:
        contents = f1.read()
        # Convert to /r/... format first.
        for matchParent in patternParent.finditer(contents):
            subredditLink = matchParent.group(1)
            subredditLink = replaceStage1(subredditLink)
            if not subredditLink:
                continue
            if path == "../web/atlas-before-ids-migration.json":
                contents = contents.replace(matchParent.group(0), '"subreddit":"' + subredditLink + '"', 1)
            else:
                contents = contents.replace(matchParent.group(0), '"subreddit": "' + subredditLink + '"', 1)
    with open(path, "w", encoding='UTF-8') as f2:
        f2.write(contents)
    print("Writing completed. All done.")

go("../web/atlas.json")
go("../web/atlas-before-ids-migration.json")

File diff suppressed because it is too large

File diff suppressed because it is too large