mirror of
https://github.com/placeAtlas/atlas.git
synced 2024-09-27 12:39:18 +02:00
Improve and merge scripts, use JSON instead of regex
This commit is contained in:
parent
433764b03c
commit
5a660759bf
5 changed files with 205 additions and 131 deletions
135
tools/formatter.py
Normal file
135
tools/formatter.py
Normal file
|
@ -0,0 +1,135 @@
|
|||
#!/usr/bin/python
|
||||
|
||||
import re
|
||||
import json
|
||||
|
||||
"""
|
||||
Examples:
|
||||
1. - /r/place
|
||||
- r/place
|
||||
2. /rplace
|
||||
3. - https://www.reddit.com/r/place
|
||||
- www.reddit.com/r/place
|
||||
- reddit.com/r/place
|
||||
4. - [https://www.reddit.com/r/place](https://www.reddit.com/r/place)
|
||||
- [www.reddit.com/r/place](www.reddit.com/r/place)
|
||||
- [reddit.com/r/place](reddit.com/r/place)
|
||||
UNUSED AND FAULTY
|
||||
5. - https://place.reddit.com
|
||||
- place.reddit.com
|
||||
6. - [https://place.reddit.com](https://place.reddit.com)
|
||||
- [place.reddit.com](https://place.reddit.com)
|
||||
"""
|
||||
# Regexes used to normalise the many ways people write a subreddit link
# (see the module docstring for examples) into the canonical "/r/name" form.
format_subreddit_regex = {
    # r/... to /r/...
    "template": r"/r/\1",
    # Runs of spaces, optionally preceded by commas -> ", "
    "commatization": r',* +',
    # Bare "r/name" or "/r/name"
    "pattern1": r'\/?[rR]\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
    # "rname" with the slash missing, e.g. "rplace"
    "pattern2": r'^\/?[rR](?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
    # Full reddit.com URLs, with optional protocol and subdomain
    "pattern3": r'(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*',
    # Markdown links whose target is a reddit.com subreddit URL
    "pattern4": r'\[[A-Za-z0-9][A-Za-z0-9_]{1,20}\]\((?:(?:https:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*\)',
    # pattern5/pattern6 (subdomain forms) are unused and faulty; kept for reference.
    # "pattern5": r'(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*',
    # "pattern6": r'\[(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\]\((?:https:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\)"',
}

# Matches a Markdown link: [text](target).
collapse_links_regex = re.compile(r'\[(.+?)\]\((.+?)\)')


def format_subreddit(entry: dict):
    """Normalise entry["subreddit"] to the canonical "/r/name" form.

    Mutates *entry* in place and returns it. Entries with no "subreddit"
    key — or a non-string value such as JSON null — are returned
    unchanged; previously a non-string value crashed re.sub with a
    TypeError.
    """
    if not isinstance(entry.get("subreddit"), str):
        return entry

    subreddit_link = entry["subreddit"]
    subreddit_link = re.sub(format_subreddit_regex["commatization"], ', ', subreddit_link)
    # Most specific patterns first: markdown links, then full URLs, then
    # the bare r/... and r... forms.
    subreddit_link = re.sub(format_subreddit_regex["pattern4"], format_subreddit_regex["template"], subreddit_link)
    subreddit_link = re.sub(format_subreddit_regex["pattern3"], format_subreddit_regex["template"], subreddit_link)
    subreddit_link = re.sub(format_subreddit_regex["pattern1"], format_subreddit_regex["template"], subreddit_link)
    subreddit_link = re.sub(format_subreddit_regex["pattern2"], format_subreddit_regex["template"], subreddit_link)

    if not subreddit_link:
        # Don't clobber the original value with an empty result.
        return entry

    entry["subreddit"] = subreddit_link
    return entry
|
||||
|
||||
def collapse_links(entry: dict):
    """Collapse a redundant Markdown link in entry["website"].

    A link of the form [X](X), where the link text equals the target, is
    replaced by the bare target. Only the first Markdown link in the
    value is considered (matching the original behaviour). Mutates
    *entry* in place and returns it.
    """
    if "website" not in entry:
        return entry

    website = entry["website"]
    # Run the regex once; the original searched twice (once for the
    # presence test, once again for the capture) and had a stray semicolon.
    match = collapse_links_regex.search(website)
    if match and match.group(1) == match.group(2):
        website = match.group(2)

    entry["website"] = website
    return entry
|
||||
|
||||
def remove_extras(entry: dict):
    """Tidy every string value of *entry* in place and return it.

    Trims surrounding whitespace, collapses repeated spaces/newlines/
    commas, and converts pseudo-empty placeholders ("n/a", "-", ...) into
    real empty strings. Non-string and falsy values are left untouched.
    """
    for key in entry:
        value = entry[key]
        if not value or not isinstance(value, str):
            continue

        # Leading and trailing whitespace.
        value = value.strip()

        # Repeated spaces, newlines and commas.
        value = re.sub(r' {2,}', ' ', value)
        value = re.sub(r'\n{2,}', ' ', value)
        value = re.sub(r',{2,}', ',', value)

        # Pseudo-empty placeholders become genuinely empty.
        if value in ("n/a", "N/A", "-", "null", "none", "None"):
            value = ""

        entry[key] = value

    return entry
|
||||
|
||||
def fix_r_caps(entry: dict):
    """Lower-case every "R/" to "r/" in entry["description"], if present.

    Mutates *entry* in place and returns it.
    """
    if "description" in entry:
        # Literal replacement — equivalent to re.sub(r'R\/', 'r/', ...).
        entry["description"] = entry["description"].replace("R/", "r/")
    return entry
|
||||
|
||||
def per_line_entries(entries: list):
    """Serialise *entries* as a JSON array with one entry per line.

    Returns a string like "[\n{...},\n{...}\n]". Fixes the empty-list
    case: the original unconditionally sliced two characters off the
    accumulated string, so an empty input produced "\n]" — invalid JSON.
    """
    if not entries:
        return "[\n]"
    return "[\n" + ",\n".join(json.dumps(entry) for entry in entries) + "\n]"
|
||||
|
||||
def format_all(entry: dict, silent=False):
    """Run every formatting pass over *entry* and return it.

    With silent=True, progress messages are suppressed.
    """
    def print_(*args, **kwargs):
        # Progress output is optional so batch runs can stay quiet.
        if not silent:
            print(*args, **kwargs)

    passes = (
        ("Removing extras...", remove_extras),
        ("Fixing r/ capitalization...", fix_r_caps),
        ("Collapsing Markdown links...", collapse_links),
        ("Fix formatting of subreddit...", format_subreddit),
    )
    for message, formatter_pass in passes:
        print_(message)
        entry = formatter_pass(entry)

    print_("Completed!")
    return entry
|
||||
|
||||
if __name__ == '__main__':

    def go(path):
        """Format every entry in the JSON file at *path* and rewrite it."""
        print(f"Formatting {path}...")

        with open(path, "r+", encoding='UTF-8') as infile:
            entries = json.loads(infile.read())

        for index, entry in enumerate(entries):
            entries[index] = format_all(entry, True)
            # Progress marker every 500 entries (also fires at index 0).
            if not (index % 500):
                print(f"{index} checked.")

        print(f"{len(entries)} checked.")

        with open(path, "w", encoding='UTF-8') as outfile:
            outfile.write(per_line_entries(entries))

        print("Writing completed. All done.")

    go("../web/atlas.json")
    go("../web/atlas-before-ids-migration.json")
|
|
@ -1,25 +1,26 @@
|
|||
#!/usr/bin/python
|
||||
|
||||
import re
|
||||
pattern = re.compile(r'\[(.+?)\]\((.+?)\)')
|
||||
import json
|
||||
from formatter import collapse_links, per_line_entries
|
||||
|
||||
def go(path):
    """Collapse redundant Markdown links in every entry of *path*."""
    print(f"Formatting {path}...")

    with open(path, "r+", encoding='UTF-8') as infile:
        entries = json.loads(infile.read())

    for index, entry in enumerate(entries):
        entries[index] = collapse_links(entry)
        # Progress marker every 500 entries (also fires at index 0).
        if not (index % 500):
            print(f"{index} checked.")

    print(f"{len(entries)} checked.")

    with open(path, "w", encoding='UTF-8') as outfile:
        outfile.write(per_line_entries(entries))

    print("Writing completed. All done.")


go("../web/atlas.json")
go("../web/atlas-before-ids-migration.json")
|
|
@ -1,36 +1,26 @@
|
|||
#!/usr/bin/python
|
||||
|
||||
import re
|
||||
import json
|
||||
from formatter import remove_extras, fix_r_caps, per_line_entries
|
||||
|
||||
def go(path):
    """Clean up every entry of the JSON file at *path* and rewrite it."""
    print(f"Formatting {path}...")

    with open(path, "r+", encoding='UTF-8') as infile:
        entries = json.loads(infile.read())

    for index, entry in enumerate(entries):
        entries[index] = remove_extras(entry)
        entries[index] = fix_r_caps(entries[index])
        # Progress marker every 500 entries (also fires at index 0).
        if not (index % 500):
            print(f"{index} checked.")

    print(f"{len(entries)} checked.")

    with open(path, "w", encoding='UTF-8') as outfile:
        outfile.write(per_line_entries(entries))

    print("Writing completed. All done.")


go("../web/atlas.json")
|
||||
|
|
|
@ -4,6 +4,7 @@
|
|||
import time
|
||||
import re
|
||||
import os
|
||||
from formatter import format_all
|
||||
|
||||
outfile = open('temp_atlas.json', 'w', encoding='utf-8')
|
||||
failfile = open('manual_atlas.json', 'w', encoding='utf-8')
|
||||
|
@ -11,14 +12,12 @@
|
|||
credentials = open('credentials', 'r')
|
||||
client_id = credentials.readline().strip(' \t\n\r')
|
||||
client_secret = credentials.readline().strip(' \t\n\r')
|
||||
user = credentials.readline().strip(' \t\n\r')
|
||||
pw = credentials.readline().strip(' \t\n\r')
|
||||
|
||||
reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent='atlas_bot',username=user,password=pw)
|
||||
has_write_access = not reddit.read_only
|
||||
if not has_write_access:
|
||||
print("Warning: No write access. Post flairs will not be updated")
|
||||
sleep(5)
|
||||
reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent='atlas_bot')
|
||||
|
||||
failcount = 0
|
||||
successcount = 0
|
||||
totalcount = 0
|
||||
|
||||
jsonfile = open("../web/atlas.json", "r", encoding='utf-8')
|
||||
existing = json.load(jsonfile)
|
||||
|
@ -28,17 +27,8 @@
|
|||
for item in existing:
|
||||
existing_ids.append(item['id'])
|
||||
|
||||
def set_flair(submission, flair):
    """Set *submission*'s flair to the template named *flair*.

    No-op without write access (non-mod credentials) or when the flair
    is already set. Raises StopIteration if no editable template with
    that text exists, like the original.
    """
    if not has_write_access or submission.link_flair_text == flair:
        return
    choices = submission.flair.choices()
    chosen = next(
        choice for choice in choices
        if choice["flair_text_editable"] and flair == choice["flair_text"]
    )
    submission.flair.select(chosen["flair_template_id"])
|
||||
|
||||
total_all_flairs = 0
|
||||
duplicate_count = 0
|
||||
failcount = 0
|
||||
successcount = 0
|
||||
totalcount = 0
|
||||
outfile.write("[\n")
|
||||
for submission in reddit.subreddit('placeAtlas2').new(limit=2000):
|
||||
"""
|
||||
|
@ -52,8 +42,7 @@ def set_flair(submission, flair):
|
|||
7. Append to file called "credentials"
|
||||
8. Copy Secret
|
||||
9. Append on newline to "credentials" file
|
||||
10. If you want flair write access append 2 newlines with username and password (Must be a mod, don't do this if you don't know what you're doing)
|
||||
11. Run Script
|
||||
10. Run Script
|
||||
|
||||
Running Script
|
||||
1. Input the next ID to use
|
||||
|
@ -63,43 +52,41 @@ def set_flair(submission, flair):
|
|||
|
||||
"""
|
||||
total_all_flairs += 1
|
||||
|
||||
if (submission.id in existing_ids):
|
||||
set_flair(submission, "Processed Entry")
|
||||
print("Found first duplicate!")
|
||||
duplicate_count += 1
|
||||
if (duplicate_count > 0):
|
||||
if (duplicate_count > 10):
|
||||
break
|
||||
else:
|
||||
continue
|
||||
|
||||
if(submission.link_flair_text == "New Entry"):
|
||||
|
||||
text = submission.selftext
|
||||
#Old backslash filter:
|
||||
#text = text.replace("\\", "")
|
||||
#New one: One \\ escapes a backslash in python's parser
|
||||
# Two escape it again in the regex parser, so \\\\ is \
|
||||
# Then anything but " or n is replaced with the first capture group (anything but " or n)
|
||||
# Test in repl: re.sub("\\\\([^\"n])", "\\1", "\\t < removed slash, t stays and > stays \\n \\\"")
|
||||
text = re.sub("\\\\([^\"n])", "\\1", text)
|
||||
submission_json = ""
|
||||
|
||||
try:
|
||||
text = text.replace("\"id\": 0,", "\"id\": 0,\n\t\t\"submitted_by\": \""+submission.author.name+"\",")
|
||||
except AttributeError:
|
||||
text = text.replace("\"id\": 0,", "\"id\": 0,\n\t\t\"submitted_by\": \""+"unknown"+"\",")
|
||||
|
||||
|
||||
lines = text.split("\n")
|
||||
|
||||
for i, line in enumerate(lines):
|
||||
if("\"id\": 0" in line):
|
||||
lines[i] = line.replace("\"id\": 0", "\"id\": "+"\""+str(submission.id)+"\"")
|
||||
text = "\n".join(lines)
|
||||
try:
|
||||
outfile.write(json.dumps(json.loads(text))+" ,\n")
|
||||
successcount += 1
|
||||
set_flair(submission, "Processed Entry")
|
||||
submission_json = json.loads(text)
|
||||
except json.JSONDecodeError:
|
||||
failfile.write(text+",\n")
|
||||
failcount += 1
|
||||
set_flair(submission, "Rejected Entry")
|
||||
|
||||
if (submission_json):
|
||||
|
||||
submission_json_dummy = {"id": submission.id, "submitted_by": ""}
|
||||
try:
|
||||
submission_json_dummy["submitted_by"] = submission.author.name
|
||||
except AttributeError:
|
||||
submission_json_dummy["submitted_by"]
|
||||
for key in submission_json:
|
||||
if not key in submission_json_dummy:
|
||||
submission_json_dummy[key] = submission_json[key];
|
||||
submission_json = format_all(submission_json_dummy)
|
||||
|
||||
outfile.write(json.dumps(json.loads(text))+" ,\n")
|
||||
successcount += 1
|
||||
|
||||
print("written "+submission.id+" submitted "+str(round(time.time()-submission.created_utc))+" seconds ago")
|
||||
totalcount += 1
|
||||
|
||||
|
|
|
@ -1,65 +1,26 @@
|
|||
#!/usr/bin/python
|
||||
|
||||
import re
|
||||
|
||||
# Matches a "subreddit": "<value>" pair in the raw JSON text.
patternParent = re.compile(r'"subreddit": ?"(?!")(.+?)"')
# Runs of spaces, optionally preceded by commas.
patternCommatization = re.compile(r',* +')
# Bare "r/name" or "/r/name".
pattern1 = re.compile(r'\/?[rR]\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?')
# "rname" with the slash missing, e.g. "rplace".
pattern2 = re.compile(r'^\/?[rR](?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?')
# Full reddit.com URLs, with optional protocol and subdomain.
pattern3 = re.compile(r'(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*')
# Markdown links whose target is a reddit.com subreddit URL.
pattern4 = re.compile(r'\[[A-Za-z0-9][A-Za-z0-9_]{1,20}\]\((?:(?:https:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*\)')
# pattern5/pattern6 (subdomain forms, e.g. place.reddit.com) are unused and faulty:
# pattern5 = re.compile(r'(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*')
# pattern6 = re.compile(r'\[(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\]\((?:https:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\)"')


def replaceStage1(contents: str):
    """Normalise every subreddit reference in *contents* to "/r/name"."""
    contents = re.sub(patternCommatization, ', ', contents)
    # Most specific patterns first: markdown links, then full URLs, then
    # the bare r/... and r... forms (change the template if /r/ is not wanted).
    for pattern in (pattern4, pattern3, pattern1, pattern2):
        contents = re.sub(pattern, r"/r/\1", contents)
    return contents
|
||||
import json
|
||||
from formatter import format_subreddit, per_line_entries
|
||||
|
||||
def go(path):
    """Normalise subreddit links in every entry of *path* and rewrite it."""
    print(f"Formatting {path}...")

    with open(path, "r+", encoding='UTF-8') as infile:
        entries = json.loads(infile.read())

    for index, entry in enumerate(entries):
        entries[index] = format_subreddit(entry)
        # Progress marker every 500 entries (also fires at index 0).
        if not (index % 500):
            print(f"{index} checked.")

    print(f"{len(entries)} checked.")

    with open(path, "w", encoding='UTF-8') as outfile:
        outfile.write(per_line_entries(entries))

    print("Writing completed. All done.")


go("../web/atlas.json")
go("../web/atlas-before-ids-migration.json")
|
Loading…
Reference in a new issue