Merge pull request #1074 from placeAtlas/cleanup

commit 23db7f330d
Nicolas Abram 2022-04-08 11:01:42 -03:00, committed by GitHub
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 975 additions and 2166 deletions

3
.gitignore vendored

@@ -12,4 +12,5 @@ allCharacters.txt
 combined.js
 *.DS_Store
 .vscode/
 _img/place/
+web/atlas-before-ids-migration.json

213
tools/formatter.py Normal file

@@ -0,0 +1,213 @@
#!/usr/bin/python
import re
import json
"""
Examples:
1. - /r/place
- r/place
2. /rplace
3. - https://www.reddit.com/r/place
- www.reddit.com/r/place
- reddit.com/r/place
UNUSED AND FAULTY
4. - https://place.reddit.com
- place.reddit.com
5. - [https://place.reddit.com](https://place.reddit.com)
- [place.reddit.com](https://place.reddit.com)
"""
FS_REGEX = {
"commatization": r'( *(,+ +|,+ |,+)| +)(and|&|;)( *(,+ +|,+ |,+)| +)|, *$| +',
"pattern1": r'\/*[rR]\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
"pattern2": r'^\/*[rR](?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
"pattern3": r'(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*',
"pattern1user": r'\/*(?:u|user)\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
"pattern2user": r'^\/*(?:u|user)(?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
"pattern3user": r'(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/(?:u|user)\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*',
# "pattern4": r'(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*',
# "pattern5": r'\[(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\]\((?:https:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\)"',
}
VALIDATE_REGEX = {
"subreddit": r'^ *\/?r\/([A-Za-z0-9][A-Za-z0-9_]{1,20}) *(, *\/?r\/([A-Za-z0-9][A-Za-z0-9_]{1,20}) *)*$|^$',
"website": r'^https?://[^\s/$.?#].[^\s]*$|^$'
}
CL_REGEX = r'\[(.+?)\]\((.+?)\)'
CWTS_REGEX = r'^(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/)$'
CSTW_REGEX = {
"website": r'^https?://[^\s/$.?#].[^\s]*$',
"user": r'^\/*u\/([A-Za-z0-9][A-Za-z0-9_]{1,20})$'
}
# r/... to /r/...
SUBREDDIT_TEMPLATE = r"/r/\1"
USER_TEMPLATE = r"/u/\1"
def format_subreddit(entry: dict):
    if "subreddit" not in entry or not entry['subreddit']:
        return entry
    subredditLink = entry["subreddit"]
    subredditLink = re.sub(FS_REGEX["commatization"], ', ', subredditLink)
    subredditLink = re.sub(FS_REGEX["pattern3"], SUBREDDIT_TEMPLATE, subredditLink)
    subredditLink = re.sub(FS_REGEX["pattern1"], SUBREDDIT_TEMPLATE, subredditLink)
    subredditLink = re.sub(FS_REGEX["pattern2"], SUBREDDIT_TEMPLATE, subredditLink)
    subredditLink = re.sub(FS_REGEX["pattern3user"], USER_TEMPLATE, subredditLink)
    subredditLink = re.sub(FS_REGEX["pattern1user"], USER_TEMPLATE, subredditLink)
    subredditLink = re.sub(FS_REGEX["pattern2user"], USER_TEMPLATE, subredditLink)
    if not subredditLink:
        return entry
    entry["subreddit"] = subredditLink
    return entry
def collapse_links(entry: dict):
    if "website" not in entry or not entry['website']:
        return entry
    website = entry["website"]
    match = re.search(CL_REGEX, website)
    if match and match.group(1) == match.group(2):
        website = match.group(2)
    entry["website"] = website
    return entry
def remove_extras(entry: dict):
    if "subreddit" in entry and entry["subreddit"]:
        # if not entry["subreddit"].startswith('/r/'):
        #     entry["subreddit"] = re.sub(r'^(.*)(?=\/r\/)', r'', entry["subreddit"])
        entry["subreddit"] = re.sub(r'[.,]+$', r'', entry["subreddit"])
    for key in entry:
        if not entry[key] or not isinstance(entry[key], str):
            continue
        # Leading and trailing spaces
        entry[key] = entry[key].strip()
        # Doubled-up characters
        entry[key] = re.sub(r' {2,}(?!\n)', r' ', entry[key])
        entry[key] = re.sub(r' {3,}\n', r' ', entry[key])
        entry[key] = re.sub(r'\n{3,}', r'\n\n', entry[key])
        entry[key] = re.sub(r'r\/{2,}', r'r\/', entry[key])
        entry[key] = re.sub(r',{2,}', r',', entry[key])
        # Pseudo-empty strings
        if entry[key] in ["n/a", "N/A", "na", "NA", "-", "null", "none", "None"]:
            entry[key] = ""
    return entry
def fix_r_caps(entry: dict):
    if "description" not in entry or not entry['description']:
        return entry
    # Raw strings so \1 is a backreference, not the control character \x01
    entry["description"] = re.sub(r'([^\w]|^)\/R\/', r'\1/r/', entry["description"])
    entry["description"] = re.sub(r'([^\w]|^)R\/', r'\1r/', entry["description"])
    return entry
def fix_no_protocol_urls(entry: dict):
    if "website" not in entry or not entry['website']:
        return entry
    if not entry["website"].startswith("http"):
        entry["website"] = "https://" + entry["website"]
    return entry
def convert_website_to_subreddit(entry: dict):
    if "website" not in entry or not entry['website']:
        return entry
    if re.match(CWTS_REGEX, entry["website"]):
        new_subreddit = re.sub(CWTS_REGEX, SUBREDDIT_TEMPLATE, entry["website"])
        if "subreddit" in entry and new_subreddit.lower() == entry["subreddit"].lower():
            entry["website"] = ""
        elif "subreddit" not in entry or entry['subreddit'] == "":
            entry["subreddit"] = new_subreddit
            entry["website"] = ""
    return entry
def convert_subreddit_to_website(entry: dict):
    if "subreddit" not in entry or not entry['subreddit']:
        return entry
    if re.match(CSTW_REGEX["website"], entry["subreddit"]):
        if "website" in entry and entry["website"].lower() == entry["subreddit"].lower():
            entry["subreddit"] = ""
        elif "website" not in entry or entry['website'] == "":
            entry["website"] = entry["subreddit"]
            entry["subreddit"] = ""
    elif re.match(CSTW_REGEX["user"], entry["subreddit"]):
        if "website" not in entry or entry['website'] == "":
            username = re.match(CSTW_REGEX["user"], entry["subreddit"]).group(1)
            entry["website"] = "https://www.reddit.com/user/" + username
            entry["subreddit"] = ""
    return entry
def validate(entry: dict):
    if "id" not in entry or (not entry['id'] and entry['id'] != 0):
        print(f"Wait, no id here! How did this happen? {entry}")
        return
    for key in entry:
        if key in VALIDATE_REGEX and not re.match(VALIDATE_REGEX[key], entry[key]):
            print(f"{key} of entry {entry['id']} is still invalid! {entry[key]}")
def per_line_entries(entries: list):
    out = "[\n"
    for entry in entries:
        out += json.dumps(entry) + ",\n"
    out = out[:-2] + "\n]"
    return out
def format_all(entry: dict, silent=False):
    def print_(*args, **kwargs):
        if not silent:
            print(*args, **kwargs)
    print_("Fixing r/ capitalization...")
    entry = fix_r_caps(entry)
    print_("Fixing links without protocol...")
    entry = fix_no_protocol_urls(entry)
    print_("Fixing subreddit formatting...")
    entry = format_subreddit(entry)
    print_("Collapsing Markdown links...")
    entry = collapse_links(entry)
    print_("Converting website links to subreddit (if possible)...")
    entry = convert_website_to_subreddit(entry)
    print_("Converting subreddit links to website (if needed)...")
    entry = convert_subreddit_to_website(entry)
    print_("Removing extras...")
    entry = remove_extras(entry)
    print_("Validating...")
    validate(entry)
    print_("Completed!")
    return entry
if __name__ == '__main__':
    def go(path):
        print(f"Formatting {path}...")
        with open(path, "r+", encoding='UTF-8') as f1:
            entries = json.loads(f1.read())
        for i in range(len(entries)):
            entries[i] = format_all(entries[i], True)
            if not (i % 500):
                print(f"{i} checked.")
        print(f"{len(entries)} checked.")
        with open(path, "w", encoding='UTF-8') as f2:
            f2.write(per_line_entries(entries))
        print("Writing completed. All done.")
    go("../web/atlas.json")
    go("../web/atlas-before-ids-migration.json")


@@ -1,25 +0,0 @@
#!/usr/bin/python
import re
pattern = re.compile(r'\[(.+?)\]\((.+?)\)')
def go(path):
    print(f"Fixing {path}...")
    with open(path, "r+", encoding='UTF-8') as f1:
        contents = f1.read()
    for i in range(2):
        for match in pattern.finditer(contents):
            if match.group(1) == match.group(2):
                contents = contents.replace(match.group(0), match.group(2), 1)
        print(f"Stage {i+1} completed.")
    with open(path, "w", encoding='UTF-8') as f2:
        f2.write(contents)
    print("Writing completed. All done.")
go("../web/atlas.json")
go("../web/atlas-before-ids-migration.json")


@@ -1,37 +0,0 @@
#!/usr/bin/python
import re
def go(path):
    print(f"Fixing {path}...")
    with open(path, "r+", encoding='UTF-8') as f1:
        contents = f1.read()
    contents = re.sub(r'": "(\s+)', r'": "', contents)
    contents = re.sub(r'(\s+)"(, |,|\})', r'"\2', contents)
    print("Leading and trailing spaces removed.")
    contents = re.sub(r' {2,}', r' ', contents)
    print("Double spaces removed.")
    contents = re.sub(r',{2,}', r',', contents)
    print("Double commas removed.")
    contents = re.sub(r'"n/a"', '""', contents)
    contents = re.sub(r'"N/A"', '""', contents)
    contents = re.sub(r'"-"', '""', contents)
    contents = re.sub(r'"none"', '""', contents)
    contents = re.sub(r'"null"', '""', contents)
    print("Pseudo-empty strings converted into empty strings.")
    contents = re.sub(r'R\/', 'r/', contents)
    print("Capitalization of r/ has been fixed.")
    with open(path, "w", encoding='UTF-8') as f2:
        f2.write(contents)
    print("Writing completed. All done.")
go("../web/atlas.json")
go("../web/atlas-before-ids-migration.json")


@@ -1,24 +1,33 @@
 import praw
 import json
 import time
 import re
 import os
+import traceback
+from formatter import format_all
 
 outfile = open('temp_atlas.json', 'w', encoding='utf-8')
 failfile = open('manual_atlas.json', 'w', encoding='utf-8')
 
-credentials = open('credentials', 'r')
-client_id = credentials.readline().strip(' \t\n\r')
-client_secret = credentials.readline().strip(' \t\n\r')
-user = credentials.readline().strip(' \t\n\r')
-pw = credentials.readline().strip(' \t\n\r')
+with open('credentials', 'r') as file:
+    credentials = file.readlines()
+    client_id = credentials[0].strip()
+    client_secret = credentials[1].strip()
+    username = credentials[2].strip()
+    password = credentials[3].strip()
 
-reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent='atlas_bot',username=user,password=pw)
+reddit = praw.Reddit(
+    client_id=client_id,
+    client_secret=client_secret,
+    username=username,
+    password=password,
+    user_agent='atlas_bot'
+)
 
 has_write_access = not reddit.read_only
 if not has_write_access:
-    print("Warning: No write access. Post flairs will not be updated")
-    sleep(5)
+    print("Warning: No write access. Post flairs will not be updated.")
+    time.sleep(5)
 
 jsonfile = open("../web/atlas.json", "r", encoding='utf-8')
 existing = json.load(jsonfile)

@@ -39,6 +48,7 @@ def set_flair(submission, flair):
 failcount = 0
 successcount = 0
 totalcount = 0
+
 outfile.write("[\n")
 for submission in reddit.subreddit('placeAtlas2').new(limit=2000):
     """

@@ -48,12 +58,14 @@ def set_flair(submission, flair):
     3. Give it a name and description
     4. Select "script"
     5. Redirect to http://localhost:8080
-    6. Copy ID (under Personal Use Script)
-    7. Append to file called "credentials"
-    8. Copy Secret
-    9. Append on newline to "credentials" file
-    10. If you want flair write access append 2 newlines with username and password (Must be a mod, don't do this if you don't know what you're doing)
-    11. Run Script
+    6. Create file "credentials" with the format below.
+       [ID] <- Under "personal use script"
+       [Secret]
+       [Username] <- Must be a mod, don't do this if you
+       [Password] <- don't know what you are doing.
+    7. Run Script
 
     Running Script
     1. Input the next ID to use

@@ -63,6 +75,7 @@ def set_flair(submission, flair):
     """
 
     total_all_flairs += 1
+
     if (submission.id in existing_ids):
         set_flair(submission, "Processed Entry")
         print("Found first duplicate!")

@@ -71,40 +84,59 @@ def set_flair(submission, flair):
             break
         else:
             continue
-    if(submission.link_flair_text == "New Entry"):
-        text = submission.selftext
-        # Old backslash filter:
-        # text = text.replace("\\", "")
-        # New one: one \\ escapes a backslash in Python's parser,
-        # two escape it again in the regex parser, so \\\\ is \.
-        # Then anything but " or n is replaced with the first capture group (anything but " or n).
-        # Test in repl: re.sub("\\\\([^\"n])", "\\1", "\\t < removed slash, t stays and > stays \\n \\\"")
-        text = re.sub("\\\\([^\"n])", "\\1", text)
-        try:
-            text = text.replace("\"id\": 0,", "\"id\": 0,\n\t\t\"submitted_by\": \""+submission.author.name+"\",")
-        except AttributeError:
-            text = text.replace("\"id\": 0,", "\"id\": 0,\n\t\t\"submitted_by\": \""+"unknown"+"\",")
-        lines = text.split("\n")
-        for i, line in enumerate(lines):
-            if ("\"id\": 0" in line):
-                lines[i] = line.replace("\"id\": 0", "\"id\": "+"\""+str(submission.id)+"\"")
-        text = "\n".join(lines)
-        try:
-            outfile.write(json.dumps(json.loads(text))+" ,\n")
-            successcount += 1
-            set_flair(submission, "Processed Entry")
-        except json.JSONDecodeError:
-            failfile.write(text+",\n")
+    if (submission.link_flair_text == "New Entry"):
+        try:
+            text = submission.selftext
+            rawtext = text
+            text = text.replace('\u200c', '')
+            text = re.compile(r".*(\{.+\}).*", re.DOTALL).search(text).group(1)
+            # Test if it needs to escape the escape character. Usually happens on fancy mode.
+            try: json.loads(text)
+            except json.JSONDecodeError: text = re.sub(r"\\(.)", r"\1", text)
+            submission_json = json.loads(text)
+            if submission_json:
+                # Assert that the path is not empty
+                assert len(submission_json["path"]) > 0
+                submission_json_dummy = {"id": submission.id, "submitted_by": ""}
+                try:
+                    submission_json_dummy["submitted_by"] = submission.author.name
+                except AttributeError:
+                    submission_json_dummy["submitted_by"] = "unknown"
+                for key in submission_json:
+                    if key not in submission_json_dummy:
+                        submission_json_dummy[key] = submission_json[key]
+                submission_json = format_all(submission_json_dummy, True)
+                outfile.write(json.dumps(submission_json) + ",\n")
+                successcount += 1
+                set_flair(submission, "Processed Entry")
+        except Exception:
+            failfile.write(
+                "\n\n" + "="*40 + "\n\n" +
+                submission.id + "\n\n" +
+                traceback.format_exc() + "\n\n" +
+                "==== RAW ====" + "\n\n" +
+                rawtext + "\n\n" +
+                "==== CLEAN ====" + "\n\n" +
+                text + "\n\n"
+            )
             failcount += 1
             set_flair(submission, "Rejected Entry")
-        print("written "+submission.id+" submitted "+str(round(time.time()-submission.created_utc))+" seconds ago")
+        print("Wrote "+submission.id+", submitted "+str(round(time.time()-submission.created_utc))+" seconds ago")
         totalcount += 1
 
-# Remove ,\n
-outfile.seek(outfile.tell()-4, os.SEEK_SET)
+# Remove last trailing comma
+outfile.seek(outfile.tell()-3, os.SEEK_SET)
 outfile.truncate()
 outfile.write("\n]")
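The reworked parsing path is easier to follow on a made-up selftext (the string below is invented; \u200c is the zero-width non-joiner Reddit's fancy-pants editor can inject):

# Illustrative only: extracting and repairing the JSON blob from a post body
import json
import re

selftext = 'Here is my entry:\n{\\"id\\": 0, \\"name\\": \\"Example\\"}'
text = selftext.replace('\u200c', '')
# Grab the outermost {...} block, dropping any prose around it
text = re.compile(r".*(\{.+\}).*", re.DOTALL).search(text).group(1)
try:
    json.loads(text)
except json.JSONDecodeError:
    # Escaped quotes from the fancy-pants editor: strip one backslash layer
    text = re.sub(r"\\(.)", r"\1", text)
submission_json = json.loads(text)   # {'id': 0, 'name': 'Example'}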


@@ -1,65 +0,0 @@
#!/usr/bin/python
import re
patternParent = re.compile(r'"subreddit": ?"(?!")(.+?)"')
patternCommatization = re.compile(r',* +')
pattern1 = re.compile(r'\/?[rR]\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?')
pattern2 = re.compile(r'^\/?[rR](?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?')
pattern3 = re.compile(r'(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*')
pattern4 = re.compile(r'\[[A-Za-z0-9][A-Za-z0-9_]{1,20}\]\((?:(?:https:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*\)')
# pattern5 = re.compile(r'(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*')
# pattern6 = re.compile(r'\[(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\]\((?:https:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\)"')
"""
Examples:
1. - /r/place
- r/place
2. /rplace
3. - https://www.reddit.com/r/place
- www.reddit.com/r/place
- reddit.com/r/place
4. - [https://www.reddit.com/r/place](https://www.reddit.com/r/place)
- [www.reddit.com/r/place](www.reddit.com/r/place)
- [reddit.com/r/place](reddit.com/r/place)
UNUSED AND FAULTY
5. - https://place.reddit.com
- place.reddit.com
6. - [https://place.reddit.com](https://place.reddit.com)
- [place.reddit.com](https://place.reddit.com)
"""
def replaceStage1(contents: str):
    contents = re.sub(patternCommatization, ', ', contents)
    # r/... to /r/... (change if not needed)
    template = r"/r/\1"
    contents = re.sub(pattern4, template, contents)
    contents = re.sub(pattern3, template, contents)
    contents = re.sub(pattern1, template, contents)
    contents = re.sub(pattern2, template, contents)
    return contents
def go(path):
    print(f"Fixing {path}...")
    with open(path, "r+", encoding='UTF-8') as f1:
        contents = f1.read()
    # Convert to /r/... format first.
    for matchParent in patternParent.finditer(contents):
        subredditLink = matchParent.group(1)
        subredditLink = replaceStage1(subredditLink)
        if not subredditLink:
            continue
        if path == "../web/atlas-before-ids-migration.json":
            contents = contents.replace(matchParent.group(0), '"subreddit":"' + subredditLink + '"', 1)
        else:
            contents = contents.replace(matchParent.group(0), '"subreddit": "' + subredditLink + '"', 1)
    with open(path, "w", encoding='UTF-8') as f2:
        f2.write(contents)
    print("Writing completed. All done.")
go("../web/atlas.json")
go("../web/atlas-before-ids-migration.json")

File diff suppressed because it is too large

File diff suppressed because it is too large