Merge pull request #1074 from placeAtlas/cleanup

2024-12-26 05:04:20 +01:00 · 2022-04-08 11:01:42 -03:00 · 2022-04-08 11:01:42 -03:00 · 23db7f330d
commit 23db7f330d
parent eb68c65b53 47a049a409
8 changed files with 975 additions and 2166 deletions
--- a/.gitignore
+++ b/.gitignore
@ -12,4 +12,5 @@ allCharacters.txt
 combined.js
 *.DS_Store
 .vscode/
-_img/place/
+_img/place/
+web/atlas-before-ids-migration.json
--- a/tools/formatter.py
+++ b/tools/formatter.py
@ -0,0 +1,213 @@
+#!/usr/bin/python
+
+import re
+import json
+
+"""
+Examples:
+1. - /r/place
+   - r/place
+2. /rplace
+3. - https://www.reddit.com/r/place
+   - www.reddit.com/r/place
+   - reddit.com/r/place
+UNUSED AND FAULTY
+4. - https://place.reddit.com
+   - place.reddit.com
+5. - [https://place.reddit.com](https://place.reddit.com)
+   - [place.reddit.com](https://place.reddit.com)
+"""
+# Patterns applied by format_subreddit() to the "subreddit" field.
+# The numbered patterns correspond to the numbered examples listed above.
+FS_REGEX = {
+	# Separators: "and", "&" or ";" surrounded by spaces/commas, a trailing
+	# comma, or a run of spaces. Each match is rewritten to ", ".
+	"commatization": r'( *(,+ +|,+ |,+)| +)(and|&|;)( *(,+ +|,+ |,+)| +)|, *$| +',
+	# "/r/name" or "r/name" (any number of leading slashes, "r" or "R").
+	"pattern1": r'\/*[rR]\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
+	# "/rname" at the very start of the string (no slash after the "r").
+	"pattern2": r'^\/*[rR](?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
+	# reddit.com/r/name URLs, with optional protocol and www/old/new/np
+	# subdomain; any path after the name is consumed as well.
+	"pattern3": r'(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*',
+	# Same three shapes as above, but for "u/" and "user/" references.
+	"pattern1user": r'\/*(?:u|user)\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
+	"pattern2user": r'^\/*(?:u|user)(?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
+	"pattern3user": r'(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/(?:u|user)\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*',
+	# Unused and faulty (examples 4 and 5 above); kept for reference only.
+	# "pattern4": r'(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*',
+	# "pattern5": r'\[(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\]\((?:https:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\)"',
+}
+
+# Patterns checked by validate(); keys are entry field names.
+# Both patterns also accept the empty string.
+VALIDATE_REGEX = {
+	# One or more comma-separated "/r/name" (or "r/name") references.
+	"subreddit": r'^ *\/?r\/([A-Za-z0-9][A-Za-z0-9_]{1,20}) *(, *\/?r\/([A-Za-z0-9][A-Za-z0-9_]{1,20}) *)*$|^$',
+	# An http:// or https:// URL without whitespace.
+	"website": r'^https?://[^\s/$.?#].[^\s]*$|^$'
+}
+
+# Markdown link "[text](target)"; group 1 is the text, group 2 the target.
+# Used by collapse_links().
+CL_REGEX = r'\[(.+?)\]\((.+?)\)'
+# A website value that is nothing but a reddit.com/r/name/ URL (the trailing
+# slash is required by the pattern). Used by convert_website_to_subreddit().
+CWTS_REGEX = r'^(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/)$'
+# Shapes of "subreddit" values that convert_subreddit_to_website() moves
+# into the "website" field.
+CSTW_REGEX = {
+	# The value is an http(s) URL.
+	"website": r'^https?://[^\s/$.?#].[^\s]*$',
+	# The value is a single "u/name" reference (leading slashes optional).
+	"user": r'^\/*u\/([A-Za-z0-9][A-Za-z0-9_]{1,20})$'
+}
+
+# Replacement templates for re.sub(): r/... to /r/... and u/... to /u/...
+SUBREDDIT_TEMPLATE = r"/r/\1"
+USER_TEMPLATE = r"/u/\1"
+
+def format_subreddit(entry: dict):
+	"""Normalise the "subreddit" field of an entry.
+
+	Separators are rewritten to ", " first; subreddit references are then
+	rewritten with SUBREDDIT_TEMPLATE and user references with
+	USER_TEMPLATE. The entry is modified in place and returned. It is
+	returned untouched when the field is missing or empty, or when the
+	rewritten value turns out empty.
+	"""
+	if not entry.get("subreddit"):
+		return entry
+
+	# Order matters: separators first, full URLs before the short forms.
+	rewrites = (
+		("commatization", ', '),
+		("pattern3", SUBREDDIT_TEMPLATE),
+		("pattern1", SUBREDDIT_TEMPLATE),
+		("pattern2", SUBREDDIT_TEMPLATE),
+		("pattern3user", USER_TEMPLATE),
+		("pattern1user", USER_TEMPLATE),
+		("pattern2user", USER_TEMPLATE),
+	)
+
+	formatted = entry["subreddit"]
+	for pattern_name, replacement in rewrites:
+		formatted = re.sub(FS_REGEX[pattern_name], replacement, formatted)
+
+	if formatted:
+		entry["subreddit"] = formatted
+	return entry
+
+def collapse_links(entry: dict):
+	"""Collapse a redundant Markdown link in the "website" field.
+
+	When the first "[text](target)" link found has identical text and
+	target, the whole field is replaced by the target. The entry is
+	modified in place and returned; entries without a website are returned
+	untouched.
+	"""
+	current = entry.get("website")
+	if not current:
+		return entry
+
+	found = re.search(CL_REGEX, current)
+	if found is not None and found.group(1) == found.group(2):
+		# Replaces the entire value, not just the matched link.
+		current = found.group(2)
+
+	entry["website"] = current
+	return entry
+
+def remove_extras(entry: dict):
+	"""Strip redundant whitespace, punctuation and placeholder values.
+
+	Trailing dots/commas are removed from the "subreddit" field. Every
+	non-empty string field is then stripped, has repeated spaces, newlines,
+	slashes after "r" and commas collapsed, and is blanked when it only
+	holds a placeholder such as "n/a" or "none". Non-string and empty
+	values are left alone. The entry is modified in place and returned.
+	"""
+	if "subreddit" in entry and entry["subreddit"]:
+		# if not entry["subreddit"].startswith('/r/'):
+		# 	entry["subreddit"] = re.sub(r'^(.*)(?=\/r\/)', r'', entry["subreddit"])
+		entry["subreddit"] = re.sub(r'[.,]+$', r'', entry["subreddit"])
+
+	for key in entry:
+		if not entry[key] or not isinstance(entry[key], str):
+			continue
+		# Leading and trailing spaces
+		entry[key] = entry[key].strip()
+		# Double characters
+		entry[key] = re.sub(r' {2,}(?!\n)', r' ', entry[key])
+		entry[key] = re.sub(r' {3,}\n', r'  ', entry[key])
+		entry[key] = re.sub(r'\n{3,}', r'\n\n', entry[key])
+		# The replacement is a plain string: a template of r'r\/' would
+		# emit the backslash literally ("r//x" -> "r\/x").
+		entry[key] = re.sub(r'r\/{2,}', 'r/', entry[key])
+		entry[key] = re.sub(r',{2,}', r',', entry[key])
+		# Pseudo-empty strings
+		if entry[key] in ["n/a", "N/A", "na", "NA", "-", "null", "none", "None"]:
+			entry[key] = ""
+
+	return entry
+
+def fix_r_caps(entry: dict):
+	"""Lower-case the "R" of "/R/" and "R/" prefixes in the description.
+
+	Only occurrences at the start of the text or preceded by a non-word
+	character are rewritten, so words that merely end in "R/" are left
+	alone. The entry is modified in place and returned; entries without a
+	description are returned untouched.
+	"""
+	if not "description" in entry or not entry['description']:
+		return entry
+
+	# The replacements must be raw strings: in a normal string literal "\1"
+	# is the control character U+0001, not a backreference to group 1.
+	entry["description"] = re.sub(r'([^\w]|^)\/R\/', r'\1/r/', entry["description"])
+	entry["description"] = re.sub(r'([^\w]|^)R\/', r'\1r/', entry["description"])
+
+	return entry
+
+def fix_no_protocol_urls(entry: dict):
+	"""Prefix the "website" field with "https://" when it lacks a protocol.
+
+	A value is considered to have a protocol when it starts with "http".
+	The entry is modified in place and returned; entries without a website
+	are returned untouched.
+	"""
+	url = entry.get("website")
+	if url and not url.startswith("http"):
+		entry["website"] = "https://" + url
+	return entry
+
+def convert_website_to_subreddit(entry: dict):
+	"""Move a website that is just a subreddit URL into the "subreddit" field.
+
+	When the website matches CWTS_REGEX it is rewritten to "/r/name". If
+	that equals the existing subreddit (case-insensitively) the website is
+	cleared; if the entry has no subreddit, the subreddit is set and the
+	website cleared. Otherwise nothing changes. The entry is modified in
+	place and returned.
+	"""
+	if not "website" in entry or not entry['website']:
+		return entry
+
+	if re.match(CWTS_REGEX, entry["website"]):
+		new_subreddit = re.sub(CWTS_REGEX, SUBREDDIT_TEMPLATE, entry["website"])
+		# Read the field defensively: indexing entry["subreddit"] directly
+		# raised KeyError for entries that have no "subreddit" key at all.
+		current_subreddit = entry.get("subreddit") or ""
+		if new_subreddit.lower() == current_subreddit.lower():
+			entry["website"] = ""
+		elif not current_subreddit:
+			entry["subreddit"] = new_subreddit
+			entry["website"] = ""
+
+	return entry
+
+def convert_subreddit_to_website(entry: dict):
+	"""Move a "subreddit" value that is really a link into the "website" field.
+
+	A subreddit value that is an http(s) URL is cleared when it duplicates
+	the website (case-insensitively), or moved to the website when the
+	entry has none. A subreddit value that is a single "u/name" reference
+	becomes a reddit user URL in the website field when the entry has no
+	website. The entry is modified in place and returned.
+	"""
+	if not "subreddit" in entry or not entry['subreddit']:
+		return entry
+
+	subreddit = entry["subreddit"]
+	# Read the field defensively: indexing entry["website"] directly raised
+	# KeyError for entries that have no "website" key at all.
+	website = entry.get("website") or ""
+
+	if re.match(CSTW_REGEX["website"], subreddit):
+		if website.lower() == subreddit.lower():
+			entry["subreddit"] = ""
+		elif not website:
+			entry["website"] = subreddit
+			entry["subreddit"] = ""
+	else:
+		user_match = re.match(CSTW_REGEX["user"], subreddit)
+		if user_match and not website:
+			entry["website"] = "https://www.reddit.com/user/" + user_match.group(1)
+			entry["subreddit"] = ""
+
+	return entry
+	
+def validate(entry: dict):
+	"""Report fields of an entry that still fail validation.
+
+	Prints a message and stops when the entry has no usable id (0 counts
+	as usable). Otherwise prints one message per field that has a pattern
+	in VALIDATE_REGEX and does not match it. Returns nothing.
+	"""
+	entry_id = entry.get("id")
+	if not (entry_id or entry_id == 0):
+		print(f"Wait, no id here! How did this happened? {entry}")
+		return
+
+	for key in entry:
+		pattern = VALIDATE_REGEX.get(key)
+		if pattern is None:
+			continue
+		if re.match(pattern, entry[key]) is None:
+			print(f"{key} of entry {entry['id']} is still invalid! {entry[key]}")
+
+def per_line_entries(entries: list):
+	"""Serialise entries as a JSON array with one entry per line.
+
+	Returns "[\\n" + the JSON of each entry joined by ",\\n" + "\\n]".
+	An empty list yields "[\\n\\n]", which is still valid JSON; slicing
+	off a trailing comma that was never written used to produce "\\n]".
+	"""
+	# str.join avoids both the trailing-comma slice and the repeated
+	# string concatenation of the previous loop.
+	return "[\n" + ",\n".join(json.dumps(entry) for entry in entries) + "\n]"
+
+def format_all(entry: dict, silent=False):
+	"""Run every formatting step on an entry, then validate it.
+
+	Progress messages are printed before each step unless *silent* is
+	true. Returns the formatted entry.
+	"""
+	def log(*args, **kwargs):
+		if silent:
+			return
+		print(*args, **kwargs)
+
+	# Steps run in this order; each takes the entry and returns it.
+	pipeline = (
+		("Fixing r/ capitalization...", fix_r_caps),
+		("Fixing links without protocol...", fix_no_protocol_urls),
+		("Fix formatting of subreddit...", format_subreddit),
+		("Collapsing Markdown links...", collapse_links),
+		("Converting website links to subreddit (if possible)...", convert_website_to_subreddit),
+		("Converting subreddit links to website (if needed)...", convert_subreddit_to_website),
+		("Removing extras...", remove_extras),
+	)
+	for message, step in pipeline:
+		log(message)
+		entry = step(entry)
+
+	log("Validating...")
+	validate(entry)
+	log("Completed!")
+	return entry
+
+if __name__ == '__main__':
+
+	def go(path):
+		"""Format every entry of the JSON file at *path* and rewrite the file."""
+		print(f"Formatting {path}...")
+
+		with open(path, "r+", encoding='UTF-8') as source:
+			entries = json.load(source)
+
+		for i, raw_entry in enumerate(entries):
+			entries[i] = format_all(raw_entry, True)
+			# Progress report every 500 entries, starting with the first.
+			if i % 500 == 0:
+				print(f"{i} checked.")
+
+		print(f"{len(entries)} checked.")
+
+		with open(path, "w", encoding='UTF-8') as target:
+			target.write(per_line_entries(entries))
+
+		print("Writing completed. All done.")
+
+	for atlas_path in ("../web/atlas.json", "../web/atlas-before-ids-migration.json"):
+		go(atlas_path)
--- a/tools/less-md-links.py
+++ b/tools/less-md-links.py
@ -1,25 +0,0 @@
-#!/usr/bin/python
-
-import re
-pattern = re.compile(r'\[(.+?)\]\((.+?)\)')
-
-def go(path):
-
-  print(f"Fixing {path}...")
-
-  with open(path, "r+", encoding='UTF-8') as f1:
-    contents = f1.read()
-
-  for i in range(2):
-    for match in pattern.finditer(contents):
-      if match.group(1) == match.group(2):
-        contents = contents.replace(match.group(0), match.group(2), 1)
-    print(f"Stage {i+1} completed.")
-
-  with open(path, "w", encoding='UTF-8') as f2:
-    f2.write(contents)
-  print("Writing completed. All done.")
-
-
-go("../web/atlas.json")
-go("../web/atlas-before-ids-migration.json") 
--- a/tools/misc-formats.py
+++ b/tools/misc-formats.py
@ -1,37 +0,0 @@
-#!/usr/bin/python
-
-import re
-
-def go(path):
-
-  print(f"Fixing {path}...")
-
-  with open(path, "r+", encoding='UTF-8') as f1:
-    contents = f1.read()
-
-  contents = re.sub(r'": "(\s+)', r'": "', contents)
-  contents = re.sub(r'(\s+)"(, |,|\})', r'"\2', contents)
-  print("Leading and trailing spaces removed.")
-
-  contents = re.sub(r' {2,}', r' ', contents)
-  print("Double spaces removed.")
-
-  contents = re.sub(r',{2,}', r',', contents)
-  print("Double commas removed.")
-
-  contents = re.sub(r'"n/a"', '""', contents)
-  contents = re.sub(r'"N/A"', '""', contents)
-  contents = re.sub(r'"-"', '""', contents)
-  contents = re.sub(r'"none"', '""', contents)
-  contents = re.sub(r'"null"', '""', contents)
-  print("Psuedo-empty strings converted into empty strings.")
-
-  contents = re.sub(r'R\/', 'r/', contents)
-  print("Capitalization of r/ has been fixed.")
-
-  with open(path, "w", encoding='UTF-8') as f2:
-    f2.write(contents)
-  print("Writing completed. All done.")
-
-go("../web/atlas.json")
-go("../web/atlas-before-ids-migration.json")
--- a/tools/redditcrawl.py
+++ b/tools/redditcrawl.py
@ -1,24 +1,33 @@
-
 import praw
 import json
 import time
 import re
 import os
+import traceback
+from formatter import format_all

 outfile = open('temp_atlas.json', 'w', encoding='utf-8')
 failfile = open('manual_atlas.json', 'w', encoding='utf-8')

-credentials = open('credentials', 'r')
-client_id = credentials.readline().strip(' \t\n\r')
-client_secret = credentials.readline().strip(' \t\n\r')
-user = credentials.readline().strip(' \t\n\r')
-pw = credentials.readline().strip(' \t\n\r')
+with open('credentials', 'r') as file:
+	credentials = file.readlines()
+	client_id = credentials[0].strip()
+	client_secret = credentials[1].strip()
+	username = credentials[2].strip()
+	password = credentials[3].strip()
+
+reddit = praw.Reddit(
+	client_id=client_id, 
+	client_secret=client_secret,
+	username=username,
+	password=password,
+	user_agent='atlas_bot'
+)

-reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent='atlas_bot',username=user,password=pw)
 has_write_access = not reddit.read_only
 if not has_write_access:
-	print("Warning: No write access. Post flairs will not be updated")
-	sleep(5)
+	print("Warning: No write access. Post flairs will not be updated.")
+	time.sleep(5)

 jsonfile = open("../web/atlas.json", "r", encoding='utf-8')
 existing = json.load(jsonfile)
@ -39,6 +48,7 @@ def set_flair(submission, flair):
 failcount = 0
 successcount = 0
 totalcount = 0
+
 outfile.write("[\n")
 for submission in reddit.subreddit('placeAtlas2').new(limit=2000):
 	"""
@ -48,12 +58,14 @@ def set_flair(submission, flair):
 	3. Give it a name and description
 	4. Select "script"
 	5. Redirect to http://localhost:8080
-	6. Copy ID (under Personal Use Script)
-	7. Append to file called "credentials"
-	8. Copy Secret 
-	9. Append on newline to "credentials" file
-	10. If you want flair write access append 2 newlines with username and password (Must be a mod, don't do this if you don't know what you're doing)
-	11. Run Script
+	6. Create file "credentials" with the format below.
+	┌─────────────────────────────────────────────────────┐
+	│ [ID]        <-  Under "personal use script"         │
+	│ [Secret]                                            │
+	│ [Username]  <-  Must be a mod, don't do this if you │
+	│ [Password]  <-  don't know what you are doing.      │
+	└─────────────────────────────────────────────────────┘
+	7. Run Script

 	Running Script
 	1. Input the next ID to use
@ -63,6 +75,7 @@ def set_flair(submission, flair):

 	"""
 	total_all_flairs += 1
+
 	if (submission.id in existing_ids):
 		set_flair(submission, "Processed Entry")
 		print("Found first duplicate!")
@ -71,40 +84,59 @@ def set_flair(submission, flair):
 			break
 		else:
 			continue
-	if(submission.link_flair_text == "New Entry"):
-		text = submission.selftext
-		#Old backslash filter:
-		#text = text.replace("\\", "")
-		#New one: One \\ escapes a backslash in python's parser
-		# Two escape it again in the regex parser, so \\\\ is \
-		# Then anything but " or n is replaced with the first capture group (anything but " or n)
-		# Test in repl: re.sub("\\\\([^\"n])", "\\1", "\\t < removed slash, t stays and > stays \\n \\\"")
-		text = re.sub("\\\\([^\"n])", "\\1", text)
+
+	if (submission.link_flair_text == "New Entry"):
+
 		try:
-			text = text.replace("\"id\": 0,", "\"id\": 0,\n\t\t\"submitted_by\": \""+submission.author.name+"\",")
-		except AttributeError:
-			text = text.replace("\"id\": 0,", "\"id\": 0,\n\t\t\"submitted_by\": \""+"unknown"+"\",")

+			text = submission.selftext
+			rawtext = text

-		lines = text.split("\n")
+			text = text.replace('\u200c', '')
+			text = re.compile(r".*(\{.+\}).*", re.DOTALL).search(text).group(1)
+			# Test if it needs to escape the escape character. Usually happens on fancy mode.
+			try: json.loads(text)
+			except json.JSONDecodeError: text = re.sub(r"\\(.)", r"\1", text)

-		for i, line in enumerate(lines):
-			if("\"id\": 0" in line):
-				lines[i] = line.replace("\"id\": 0", "\"id\": "+"\""+str(submission.id)+"\"")
-		text = "\n".join(lines)
-		try:
-			outfile.write(json.dumps(json.loads(text))+"  ,\n")
-			successcount += 1
-			set_flair(submission, "Processed Entry")
-		except json.JSONDecodeError:
-			failfile.write(text+",\n")
+			submission_json = json.loads(text)
+
+			if submission_json:
+
+				# Assert that the path is not empty
+				assert len(submission_json["path"]) > 0
+
+				submission_json_dummy = {"id": submission.id, "submitted_by": ""}
+				try:
+					submission_json_dummy["submitted_by"] = submission.author.name
+				except AttributeError:
+					submission_json_dummy["submitted_by"] = "unknown"
+				for key in submission_json:
+					if not key in submission_json_dummy:
+						submission_json_dummy[key] = submission_json[key];
+				submission_json = format_all(submission_json_dummy, True)
+
+				outfile.write(json.dumps(submission_json) + ",\n")
+				successcount += 1
+				set_flair(submission, "Processed Entry")
+
+		except Exception as e:
+			failfile.write(
+				"\n\n" + "="*40 + "\n\n" +
+				submission.id + "\n\n" +
+				traceback.format_exc() + "\n\n" +
+				"==== RAW ====" + "\n\n" +
+				rawtext + "\n\n"
+				"==== CLEAN ====" + "\n\n" +
+				text + "\n\n"
+			)
 			failcount += 1
 			set_flair(submission, "Rejected Entry")
-		print("written "+submission.id+" submitted "+str(round(time.time()-submission.created_utc))+" seconds ago")
+
+		print("Wrote "+submission.id+", submitted "+str(round(time.time()-submission.created_utc))+" seconds ago")
 		totalcount += 1

-# Remove ,\n
-outfile.seek(outfile.tell()-4, os.SEEK_SET)
+# Remove last trailing comma
+outfile.seek(outfile.tell()-3, os.SEEK_SET)
 outfile.truncate()

 outfile.write("\n]")
--- a/tools/subreddit-format.py
+++ b/tools/subreddit-format.py
@ -1,65 +0,0 @@
-#!/usr/bin/python
-
-import re
-
-patternParent = re.compile(r'"subreddit": ?"(?!")(.+?)"')
-patternCommatization = re.compile(r',* +')
-pattern1 = re.compile(r'\/?[rR]\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?')
-pattern2 = re.compile(r'^\/?[rR](?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?')
-pattern3 = re.compile(r'(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*')
-pattern4 = re.compile(r'\[[A-Za-z0-9][A-Za-z0-9_]{1,20}\]\((?:(?:https:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*\)')
-# pattern5 = re.compile(r'(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*')
-# pattern6 = re.compile(r'\[(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\]\((?:https:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\)"')
-"""
-Examples:
-1. - /r/place
-   - r/place
-2. /rplace
-3. - https://www.reddit.com/r/place
-   - www.reddit.com/r/place
-   - reddit.com/r/place
-4. - [https://www.reddit.com/r/place](https://www.reddit.com/r/place)
-   - [www.reddit.com/r/place](www.reddit.com/r/place)
-   - [reddit.com/r/place](reddit.com/r/place)
-UNUSED AND FAULTY
-5. - https://place.reddit.com
-   - place.reddit.com
-6. - [https://place.reddit.com](https://place.reddit.com)
-   - [place.reddit.com](https://place.reddit.com)
-"""
-
-def replaceStage1(contents: str):
-	contents = re.sub(patternCommatization, ', ', contents)
-
-	# r/... to /r/.. (change if not needed)
-	template = r"/r/\1"
-	contents = re.sub(pattern4, template, contents)
-	contents = re.sub(pattern3, template, contents)
-	contents = re.sub(pattern1, template, contents)
-	contents = re.sub(pattern2, template, contents)
-	return contents
-
-def go(path):
-
-	print(f"Fixing {path}...")
-
-	with open(path, "r+", encoding='UTF-8') as f1:
-		contents = f1.read()
-
-	# Convert to r/... format first.
-	for matchParent in patternParent.finditer(contents):
-		subredditLink = matchParent.group(1)
-		subredditLink = replaceStage1(subredditLink)
-		if not subredditLink:
-			continue
-		if path == "../web/atlas-before-ids-migration.json":
-			contents = contents.replace(matchParent.group(0), '"subreddit":"' + subredditLink + '"', 1)
-		else:
-			contents = contents.replace(matchParent.group(0), '"subreddit": "' + subredditLink + '"', 1)
-
-	with open(path, "w", encoding='UTF-8') as f2:
-		f2.write(contents)
-	print("Writing completed. All done.")
-
-go("../web/atlas.json")
-go("../web/atlas-before-ids-migration.json")
--- a/web/atlas-before-ids-migration.json
+++ b/web/atlas-before-ids-migration.json
--- a/web/atlas.json
+++ b/web/atlas.json