Mirror of https://github.com/placeAtlas/atlas.git, synced 2024-11-17 23:44:06 +01:00

Commit 93139d6748: Merge remote-tracking branch 'up/master' into reddit

8 changed files with 975 additions and 2165 deletions
.gitignore (vendored, 3 lines changed)

@@ -12,4 +12,5 @@ allCharacters.txt
 combined.js
 *.DS_Store
 .vscode/
 _img/place/
+web/atlas-before-ids-migration.json
tools/formatter.py (new file, 213 lines)

@@ -0,0 +1,213 @@
#!/usr/bin/python

import re
import json

"""
Examples:
1. - /r/place
   - r/place
2. /rplace
3. - https://www.reddit.com/r/place
   - www.reddit.com/r/place
   - reddit.com/r/place
UNUSED AND FAULTY
4. - https://place.reddit.com
   - place.reddit.com
5. - [https://place.reddit.com](https://place.reddit.com)
   - [place.reddit.com](https://place.reddit.com)
"""

FS_REGEX = {
    "commatization": r'( *(,+ +|,+ |,+)| +)(and|&|;)( *(,+ +|,+ |,+)| +)|, *$| +',
    "pattern1": r'\/*[rR]\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
    "pattern2": r'^\/*[rR](?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
    "pattern3": r'(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*',
    "pattern1user": r'\/*(?:u|user)\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
    "pattern2user": r'^\/*(?:u|user)(?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
    "pattern3user": r'(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/(?:u|user)\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*',
    # "pattern4": r'(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*',
    # "pattern5": r'\[(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\]\((?:https:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\)"',
}

VALIDATE_REGEX = {
    "subreddit": r'^ *\/?r\/([A-Za-z0-9][A-Za-z0-9_]{1,20}) *(, *\/?r\/([A-Za-z0-9][A-Za-z0-9_]{1,20}) *)*$|^$',
    "website": r'^https?://[^\s/$.?#].[^\s]*$|^$'
}

CL_REGEX = r'\[(.+?)\]\((.+?)\)'
CWTS_REGEX = r'^(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/)$'
CSTW_REGEX = {
    "website": r'^https?://[^\s/$.?#].[^\s]*$',
    "user": r'^\/*u\/([A-Za-z0-9][A-Za-z0-9_]{1,20})$'
}

# r/... to /r/...
SUBREDDIT_TEMPLATE = r"/r/\1"
USER_TEMPLATE = r"/u/\1"


def format_subreddit(entry: dict):
    if "subreddit" not in entry or not entry['subreddit']:
        return entry

    subredditLink = entry["subreddit"]
    subredditLink = re.sub(FS_REGEX["commatization"], ', ', subredditLink)
    subredditLink = re.sub(FS_REGEX["pattern3"], SUBREDDIT_TEMPLATE, subredditLink)
    subredditLink = re.sub(FS_REGEX["pattern1"], SUBREDDIT_TEMPLATE, subredditLink)
    subredditLink = re.sub(FS_REGEX["pattern2"], SUBREDDIT_TEMPLATE, subredditLink)
    subredditLink = re.sub(FS_REGEX["pattern3user"], USER_TEMPLATE, subredditLink)
    subredditLink = re.sub(FS_REGEX["pattern1user"], USER_TEMPLATE, subredditLink)
    subredditLink = re.sub(FS_REGEX["pattern2user"], USER_TEMPLATE, subredditLink)

    if not subredditLink:
        return entry

    entry["subreddit"] = subredditLink
    return entry


def collapse_links(entry: dict):
    if "website" not in entry or not entry['website']:
        return entry

    website = entry["website"]
    match = re.search(CL_REGEX, website)
    if match and match.group(1) == match.group(2):
        website = match.group(2)

    entry["website"] = website
    return entry


def remove_extras(entry: dict):
    if "subreddit" in entry and entry["subreddit"]:
        # if not entry["subreddit"].startswith('/r/'):
        #     entry["subreddit"] = re.sub(r'^(.*)(?=\/r\/)', r'', entry["subreddit"])
        entry["subreddit"] = re.sub(r'[.,]+$', r'', entry["subreddit"])

    for key in entry:
        if not entry[key] or not isinstance(entry[key], str):
            continue
        # Leading and trailing spaces
        entry[key] = entry[key].strip()
        # Doubled characters
        entry[key] = re.sub(r' {2,}(?!\n)', r' ', entry[key])
        entry[key] = re.sub(r' {3,}\n', r' ', entry[key])
        entry[key] = re.sub(r'\n{3,}', r'\n\n', entry[key])
        entry[key] = re.sub(r'r\/{2,}', 'r/', entry[key])
        entry[key] = re.sub(r',{2,}', r',', entry[key])
        # Pseudo-empty strings
        if entry[key] in ["n/a", "N/A", "na", "NA", "-", "null", "none", "None"]:
            entry[key] = ""

    return entry


def fix_r_caps(entry: dict):
    if "description" not in entry or not entry['description']:
        return entry

    # Raw replacement strings so \1 is a backreference, not the control character \x01.
    entry["description"] = re.sub(r'([^\w]|^)\/R\/', r'\1/r/', entry["description"])
    entry["description"] = re.sub(r'([^\w]|^)R\/', r'\1r/', entry["description"])

    return entry


def fix_no_protocol_urls(entry: dict):
    if "website" not in entry or not entry['website']:
        return entry

    if not entry["website"].startswith("http"):
        entry["website"] = "https://" + entry["website"]

    return entry


def convert_website_to_subreddit(entry: dict):
    if "website" not in entry or not entry['website']:
        return entry

    if re.match(CWTS_REGEX, entry["website"]):
        new_subreddit = re.sub(CWTS_REGEX, SUBREDDIT_TEMPLATE, entry["website"])
        # Check for a missing subreddit key before comparing against it.
        if "subreddit" not in entry or entry['subreddit'] == "":
            entry["subreddit"] = new_subreddit
            entry["website"] = ""
        elif new_subreddit.lower() == entry["subreddit"].lower():
            entry["website"] = ""

    return entry


def convert_subreddit_to_website(entry: dict):
    if "subreddit" not in entry or not entry['subreddit']:
        return entry

    if re.match(CSTW_REGEX["website"], entry["subreddit"]):
        # Check for a missing website key before comparing against it.
        if "website" not in entry or entry['website'] == "":
            entry["website"] = entry["subreddit"]
            entry["subreddit"] = ""
        elif entry["website"].lower() == entry["subreddit"].lower():
            entry["subreddit"] = ""
    elif re.match(CSTW_REGEX["user"], entry["subreddit"]):
        if "website" not in entry or entry['website'] == "":
            username = re.match(CSTW_REGEX["user"], entry["subreddit"]).group(1)
            entry["website"] = "https://www.reddit.com/user/" + username
            entry["subreddit"] = ""

    return entry


def validate(entry: dict):
    if "id" not in entry or (not entry['id'] and entry['id'] != 0):
        print(f"Wait, no id here! How did this happen? {entry}")
        return
    for key in entry:
        if key in VALIDATE_REGEX and not re.match(VALIDATE_REGEX[key], entry[key]):
            print(f"{key} of entry {entry['id']} is still invalid! {entry[key]}")


def per_line_entries(entries: list):
    out = "[\n"
    for entry in entries:
        out += json.dumps(entry) + ",\n"
    out = out[:-2] + "\n]"
    return out


def format_all(entry: dict, silent=False):
    def print_(*args, **kwargs):
        if not silent:
            print(*args, **kwargs)
    print_("Fixing r/ capitalization...")
    entry = fix_r_caps(entry)
    print_("Fixing links without protocol...")
    entry = fix_no_protocol_urls(entry)
    print_("Fixing formatting of subreddit...")
    entry = format_subreddit(entry)
    print_("Collapsing Markdown links...")
    entry = collapse_links(entry)
    print_("Converting website links to subreddit (if possible)...")
    entry = convert_website_to_subreddit(entry)
    print_("Converting subreddit links to website (if needed)...")
    entry = convert_subreddit_to_website(entry)
    print_("Removing extras...")
    entry = remove_extras(entry)
    print_("Validating...")
    validate(entry)
    print_("Completed!")
    return entry


if __name__ == '__main__':

    def go(path):

        print(f"Formatting {path}...")

        with open(path, "r+", encoding='UTF-8') as f1:
            entries = json.loads(f1.read())

        for i in range(len(entries)):
            entries[i] = format_all(entries[i], True)
            if not (i % 500):
                print(f"{i} checked.")

        print(f"{len(entries)} checked.")

        with open(path, "w", encoding='UTF-8') as f2:
            f2.write(per_line_entries(entries))

        print("Writing completed. All done.")

    go("../web/atlas.json")
    go("../web/atlas-before-ids-migration.json")
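For reference, a minimal sketch of running this formatter on one made-up entry; the field values below are hypothetical, but the expected output follows from the regexes above:

# Hypothetical demo of formatter.py (not part of the commit).
from formatter import format_all

entry = {
    "id": "abc123",  # made-up ID
    "subreddit": "https://www.reddit.com/r/place and r/placeAtlas2",
    "website": "example.com",
    "description": "Artwork by R/place contributors.",
}

cleaned = format_all(entry, silent=True)
# Expected, per FS_REGEX and fix_no_protocol_urls:
#   subreddit   -> "/r/place, /r/placeAtlas2"
#   website     -> "https://example.com"
#   description -> "Artwork by r/place contributors."
print(cleaned)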
(deleted file)

@@ -1,25 +0,0 @@
#!/usr/bin/python

import re

pattern = re.compile(r'\[(.+?)\]\((.+?)\)')


def go(path):

    print(f"Fixing {path}...")

    with open(path, "r+", encoding='UTF-8') as f1:
        contents = f1.read()

    for i in range(2):
        for match in pattern.finditer(contents):
            if match.group(1) == match.group(2):
                contents = contents.replace(match.group(0), match.group(2), 1)
        print(f"Stage {i+1} completed.")

    with open(path, "w", encoding='UTF-8') as f2:
        f2.write(contents)
    print("Writing completed. All done.")


go("../web/atlas.json")
go("../web/atlas-before-ids-migration.json")
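The core rewrite this deleted script performed (now handled per-entry by collapse_links in formatter.py), shown on a hypothetical value; the two passes presumably caught matches exposed by earlier replacements:

# Hypothetical demo (not part of the commit).
import re

pattern = re.compile(r'\[(.+?)\]\((.+?)\)')

contents = '"website": "[https://place.example](https://place.example)"'
for match in pattern.finditer(contents):
    if match.group(1) == match.group(2):
        contents = contents.replace(match.group(0), match.group(2), 1)
print(contents)  # "website": "https://place.example"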
(deleted file)

@@ -1,37 +0,0 @@
#!/usr/bin/python

import re


def go(path):

    print(f"Fixing {path}...")

    with open(path, "r+", encoding='UTF-8') as f1:
        contents = f1.read()

    contents = re.sub(r'": "(\s+)', r'": "', contents)
    contents = re.sub(r'(\s+)"(, |,|\})', r'"\2', contents)
    print("Leading and trailing spaces removed.")

    contents = re.sub(r' {2,}', r' ', contents)
    print("Double spaces removed.")

    contents = re.sub(r',{2,}', r',', contents)
    print("Double commas removed.")

    contents = re.sub(r'"n/a"', '""', contents)
    contents = re.sub(r'"N/A"', '""', contents)
    contents = re.sub(r'"-"', '""', contents)
    contents = re.sub(r'"none"', '""', contents)
    contents = re.sub(r'"null"', '""', contents)
    print("Pseudo-empty strings converted into empty strings.")

    contents = re.sub(r'R\/', 'r/', contents)
    print("Capitalization of r/ has been fixed.")

    with open(path, "w", encoding='UTF-8') as f2:
        f2.write(contents)
    print("Writing completed. All done.")


go("../web/atlas.json")
go("../web/atlas-before-ids-migration.json")
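The first two substitutions trimmed whitespace inside JSON string values without parsing the file as JSON; a quick hypothetical check:

# Hypothetical demo (not part of the commit).
import re

raw = '{"name": "  My Art ", "subreddit": " /r/place  "}'
raw = re.sub(r'": "(\s+)', r'": "', raw)       # strip spaces after a value's opening quote
raw = re.sub(r'(\s+)"(, |,|\})', r'"\2', raw)  # strip spaces before a value's closing quote
print(raw)  # {"name": "My Art", "subreddit": "/r/place"}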
(modified file)

@@ -1,24 +1,33 @@
 import praw
 import json
 import time
 import re
 import os
+import traceback
+from formatter import format_all

 outfile = open('temp_atlas.json', 'w', encoding='utf-8')
 failfile = open('manual_atlas.json', 'w', encoding='utf-8')

-credentials = open('credentials', 'r')
-client_id = credentials.readline().strip(' \t\n\r')
-client_secret = credentials.readline().strip(' \t\n\r')
-user = credentials.readline().strip(' \t\n\r')
-pw = credentials.readline().strip(' \t\n\r')
+with open('credentials', 'r') as file:
+    credentials = file.readlines()
+    client_id = credentials[0].strip()
+    client_secret = credentials[1].strip()
+    username = credentials[2].strip()
+    password = credentials[3].strip()

-reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent='atlas_bot', username=user, password=pw)
+reddit = praw.Reddit(
+    client_id=client_id,
+    client_secret=client_secret,
+    username=username,
+    password=password,
+    user_agent='atlas_bot'
+)
+
 has_write_access = not reddit.read_only
 if not has_write_access:
-    print("Warning: No write access. Post flairs will not be updated")
-    sleep(5)
+    print("Warning: No write access. Post flairs will not be updated.")
+    time.sleep(5)

 jsonfile = open("../web/atlas.json", "r", encoding='utf-8')
 existing = json.load(jsonfile)

@@ -39,6 +48,7 @@ def set_flair(submission, flair):
 failcount = 0
 successcount = 0
 totalcount = 0
+
 outfile.write("[\n")
 for submission in reddit.subreddit('placeAtlas2').new(limit=2000):
     """

@@ -48,12 +58,14 @@ def set_flair(submission, flair):
     3. Give it a name and description
     4. Select "script"
     5. Redirect to http://localhost:8080
-    6. Copy ID (under Personal Use Script)
-    7. Append to file called "credentials"
-    8. Copy Secret
-    9. Append on newline to "credentials" file
-    10. If you want flair write access, append 2 newlines with username and password (must be a mod; don't do this if you don't know what you're doing)
-    11. Run Script
+    6. Create a file called "credentials" with the format below.
+       ┌─────────────────────────────────────────────────────┐
+       │ [ID]       <- under "personal use script"           │
+       │ [Secret]                                            │
+       │ [Username] <- must be a mod; don't do this if you   │
+       │ [Password] <-   don't know what you are doing       │
+       └─────────────────────────────────────────────────────┘
+    7. Run Script

     Running Script
     1. Input the next ID to use

@@ -63,6 +75,7 @@ def set_flair(submission, flair):

     """
     total_all_flairs += 1
+
     if (submission.id in existing_ids):
         set_flair(submission, "Processed Entry")
         print("Found first duplicate!")

@@ -71,40 +84,59 @@ def set_flair(submission, flair):
         break
     else:
         continue
-    if(submission.link_flair_text == "New Entry"):
-        text = submission.selftext
-        # Old backslash filter:
-        # text = text.replace("\\", "")
-        # New one: one \\ escapes a backslash in Python's parser,
-        # and two escape it again in the regex parser, so \\\\ is \.
-        # Then anything but " or n is replaced with the first capture group (anything but " or n).
-        # Test in a REPL: re.sub("\\\\([^\"n])", "\\1", "\\t < removed slash, t stays and > stays \\n \\\"")
-        text = re.sub("\\\\([^\"n])", "\\1", text)
-        try:
-            text = text.replace("\"id\": 0,", "\"id\": 0,\n\t\t\"submitted_by\": \""+submission.author.name+"\",")
-        except AttributeError:
-            text = text.replace("\"id\": 0,", "\"id\": 0,\n\t\t\"submitted_by\": \""+"unknown"+"\",")
-        lines = text.split("\n")
-        for i, line in enumerate(lines):
-            if("\"id\": 0" in line):
-                lines[i] = line.replace("\"id\": 0", "\"id\": "+"\""+str(submission.id)+"\"")
-        text = "\n".join(lines)
-        try:
-            outfile.write(json.dumps(json.loads(text))+" ,\n")
-            successcount += 1
-            set_flair(submission, "Processed Entry")
-        except json.JSONDecodeError:
-            failfile.write(text+",\n")
-            failcount += 1
-            set_flair(submission, "Rejected Entry")
-        print("written "+submission.id+" submitted "+str(round(time.time()-submission.created_utc))+" seconds ago")
-        totalcount += 1
+    if (submission.link_flair_text == "New Entry"):
+        try:
+            text = submission.selftext
+            rawtext = text
+
+            text = text.replace('\u200c', '')
+            text = re.compile(r".*(\{.+\}).*", re.DOTALL).search(text).group(1)
+            # Test whether the escape characters themselves need unescaping.
+            # This usually happens with posts made in fancy-pants mode.
+            try: json.loads(text)
+            except json.JSONDecodeError: text = re.sub(r"\\(.)", r"\1", text)
+
+            submission_json = json.loads(text)
+
+            if submission_json:
+                # Assert that the path is not empty.
+                assert len(submission_json["path"]) > 0
+
+                submission_json_dummy = {"id": submission.id, "submitted_by": ""}
+                try:
+                    submission_json_dummy["submitted_by"] = submission.author.name
+                except AttributeError:
+                    submission_json_dummy["submitted_by"] = "unknown"
+                for key in submission_json:
+                    if not key in submission_json_dummy:
+                        submission_json_dummy[key] = submission_json[key]
+                submission_json = format_all(submission_json_dummy, True)
+
+                outfile.write(json.dumps(submission_json) + ",\n")
+                successcount += 1
+                set_flair(submission, "Processed Entry")
+
+        except Exception:
+            failfile.write(
+                "\n\n" + "="*40 + "\n\n" +
+                submission.id + "\n\n" +
+                traceback.format_exc() + "\n\n" +
+                "==== RAW ====" + "\n\n" +
+                rawtext + "\n\n" +
+                "==== CLEAN ====" + "\n\n" +
+                text + "\n\n"
+            )
+            failcount += 1
+            set_flair(submission, "Rejected Entry")
+
+        print("Wrote "+submission.id+", submitted "+str(round(time.time()-submission.created_utc))+" seconds ago")
+        totalcount += 1

-# Remove ,\n
-outfile.seek(outfile.tell()-4, os.SEEK_SET)
+# Remove the last trailing comma
+outfile.seek(outfile.tell()-3, os.SEEK_SET)
 outfile.truncate()

 outfile.write("\n]")
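As a rough illustration of the new extraction path, here is a hypothetical submission body run through the same steps (zero-width-character removal, brace extraction, conditional unescaping):

# Hypothetical demo (not part of the commit).
import json
import re

# A made-up selftext, escaped the way fancy-pants mode tends to post it.
selftext = 'New entry below\n\n{\\"id\\": 0, \\"name\\": \\"Example\\", \\"path\\": [[0, 0], [1, 1]]}\nthanks!'

text = selftext.replace('\u200c', '')
text = re.compile(r".*(\{.+\}).*", re.DOTALL).search(text).group(1)
try:
    json.loads(text)
except json.JSONDecodeError:
    text = re.sub(r"\\(.)", r"\1", text)  # strip the Markdown escaping

entry = json.loads(text)
print(entry["name"], entry["path"])  # Example [[0, 0], [1, 1]]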
(deleted file)

@@ -1,65 +0,0 @@
#!/usr/bin/python

import re

patternParent = re.compile(r'"subreddit": ?"(?!")(.+?)"')
patternCommatization = re.compile(r',* +')
pattern1 = re.compile(r'\/?[rR]\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?')
pattern2 = re.compile(r'^\/?[rR](?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?')
pattern3 = re.compile(r'(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*')
pattern4 = re.compile(r'\[[A-Za-z0-9][A-Za-z0-9_]{1,20}\]\((?:(?:https:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*\)')
# pattern5 = re.compile(r'(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*')
# pattern6 = re.compile(r'\[(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\]\((?:https:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\)"')

"""
Examples:
1. - /r/place
   - r/place
2. /rplace
3. - https://www.reddit.com/r/place
   - www.reddit.com/r/place
   - reddit.com/r/place
4. - [https://www.reddit.com/r/place](https://www.reddit.com/r/place)
   - [www.reddit.com/r/place](www.reddit.com/r/place)
   - [reddit.com/r/place](reddit.com/r/place)
UNUSED AND FAULTY
5. - https://place.reddit.com
   - place.reddit.com
6. - [https://place.reddit.com](https://place.reddit.com)
   - [place.reddit.com](https://place.reddit.com)
"""


def replaceStage1(contents: str):
    contents = re.sub(patternCommatization, ', ', contents)

    # r/... to /r/... (change if not needed)
    template = r"/r/\1"
    contents = re.sub(pattern4, template, contents)
    contents = re.sub(pattern3, template, contents)
    contents = re.sub(pattern1, template, contents)
    contents = re.sub(pattern2, template, contents)
    return contents


def go(path):

    print(f"Fixing {path}...")

    with open(path, "r+", encoding='UTF-8') as f1:
        contents = f1.read()

    # Convert to the r/... format first.
    for matchParent in patternParent.finditer(contents):
        subredditLink = matchParent.group(1)
        subredditLink = replaceStage1(subredditLink)
        if not subredditLink:
            continue
        if path == "../web/atlas-before-ids-migration.json":
            contents = contents.replace(matchParent.group(0), '"subreddit":"' + subredditLink + '"', 1)
        else:
            contents = contents.replace(matchParent.group(0), '"subreddit": "' + subredditLink + '"', 1)

    with open(path, "w", encoding='UTF-8') as f2:
        f2.write(contents)
    print("Writing completed. All done.")


go("../web/atlas.json")
go("../web/atlas-before-ids-migration.json")
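The Markdown-link form (example 4 above) was specific to this deleted script's pattern4; a quick hypothetical check:

# Hypothetical demo (not part of the commit).
import re

pattern4 = re.compile(r'\[[A-Za-z0-9][A-Za-z0-9_]{1,20}\]\((?:(?:https:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*\)')

value = '[place](https://www.reddit.com/r/place)'
print(re.sub(pattern4, r'/r/\1', value))  # /r/place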
(file diff suppressed because it is too large)

web/atlas.json (1434 lines changed)
(file diff suppressed because it is too large)