Improve and merge scripts, use JSON instead of regex

This commit is contained in:
Hans5958 2022-04-07 12:01:09 +07:00
parent 433764b03c
commit 5a660759bf
5 changed files with 205 additions and 131 deletions

135
tools/formatter.py Normal file
View file

@ -0,0 +1,135 @@
#!/usr/bin/python
import re
import json
"""
Examples:
1. - /r/place
- r/place
2. /rplace
3. - https://www.reddit.com/r/place
- www.reddit.com/r/place
- reddit.com/r/place
4. - [https://www.reddit.com/r/place](https://www.reddit.com/r/place)
- [www.reddit.com/r/place](www.reddit.com/r/place)
- [reddit.com/r/place](reddit.com/r/place)
UNUSED AND FAULTY
5. - https://place.reddit.com
- place.reddit.com
6. - [https://place.reddit.com](https://place.reddit.com)
- [place.reddit.com](https://place.reddit.com)
"""
# Regexes used to normalize the many ways a subreddit can be written
# (see the module docstring examples) down to the canonical "/r/name" form.
format_subreddit_regex = {
    # r/... to /r/...
    "template": r"/r/\1",
    # Collapses runs of commas/spaces into a single ", " separator.
    "commatization": r',* +',
    # Bare "r/name" or "/r/name", optionally with a trailing slash.
    "pattern1": r'\/?[rR]\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
    # "rname" with the slash missing after the leading r.
    "pattern2": r'^\/?[rR](?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
    # Full reddit.com URLs (optional scheme, optional subdomain, optional trailing path).
    "pattern3": r'(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*',
    # Markdown links "[text](url)" whose target is a reddit.com URL.
    "pattern4": r'\[[A-Za-z0-9][A-Za-z0-9_]{1,20}\]\((?:(?:https:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*\)',
    # Patterns 5/6 are UNUSED AND FAULTY (see docstring) and kept only for reference:
    # "pattern5": r'(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*',
    # "pattern6": r'\[(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\]\((?:https:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\)"',
}

# Matches a Markdown link "[text](target)"; used by collapse_links to drop
# redundant wrappers where the text equals the target.
collapse_links_regex = re.compile(r'\[(.+?)\]\((.+?)\)')


def format_subreddit(entry: dict):
    """Normalize entry["subreddit"] to the canonical "/r/name" form.

    The entry is mutated in place and also returned. Entries without a
    "subreddit" key, or whose link normalizes to an empty string, are
    returned unchanged.
    """
    if "subreddit" not in entry:
        return entry
    subreddit_link = entry["subreddit"]
    subreddit_link = re.sub(format_subreddit_regex["commatization"], ', ', subreddit_link)
    # Apply the most specific patterns first so a Markdown-wrapped URL is
    # fully rewritten before the looser bare-name patterns run.
    subreddit_link = re.sub(format_subreddit_regex["pattern4"], format_subreddit_regex["template"], subreddit_link)
    subreddit_link = re.sub(format_subreddit_regex["pattern3"], format_subreddit_regex["template"], subreddit_link)
    subreddit_link = re.sub(format_subreddit_regex["pattern1"], format_subreddit_regex["template"], subreddit_link)
    subreddit_link = re.sub(format_subreddit_regex["pattern2"], format_subreddit_regex["template"], subreddit_link)
    if not subreddit_link:
        return entry
    entry["subreddit"] = subreddit_link
    return entry
def collapse_links(entry: dict):
    """Collapse a Markdown link whose text equals its target.

    Turns entry["website"] values like "[https://x](https://x)" into the
    bare "https://x". The entry is mutated in place and also returned;
    entries without a "website" key are returned unchanged.
    """
    if "website" not in entry:
        return entry
    website = entry["website"]
    # Run the regex once and reuse the match (the original searched twice).
    match = collapse_links_regex.search(website)
    if match and match.group(1) == match.group(2):
        website = match.group(2)
    entry["website"] = website
    return entry
def remove_extras(entry: dict):
    """Clean every string value of the entry in place and return it.

    Strips leading/trailing whitespace, collapses repeated spaces, blank
    lines and commas, and converts pseudo-empty placeholder strings such as
    "n/a" or "-" into the empty string. Non-string and falsy values are
    left untouched.
    """
    for key in entry:
        if not entry[key] or not isinstance(entry[key], str):
            continue
        # Leading and trailing whitespace (str.strip() is equivalent to the
        # previous anchored ^\s+ / \s+$ substitutions).
        entry[key] = entry[key].strip()
        # Double spaces and commas
        entry[key] = re.sub(r' {2,}', r' ', entry[key])
        entry[key] = re.sub(r'\n{2,}', r' ', entry[key])
        entry[key] = re.sub(r',{2,}', r',', entry[key])
        # Pseudo-empty strings
        if entry[key] in ["n/a", "N/A", "-", "null", "none", "None"]:
            entry[key] = ""
    return entry
def fix_r_caps(entry: dict):
    """Lowercase every "R/" to "r/" in entry["description"] (in place).

    Entries without a "description" key are returned unchanged.
    """
    if "description" not in entry:
        return entry
    # str.replace is equivalent to the previous re.sub(r'R\/', ...) and
    # avoids regex machinery for a fixed-string substitution.
    entry["description"] = entry["description"].replace("R/", "r/")
    return entry
def per_line_entries(entries: list):
    """Serialize a list of entries as JSON with one entry per line.

    Produces the same "[\\n{...},\\n{...}\\n]" shape as before, but via a
    single join instead of quadratic string concatenation, and an empty
    list now yields a valid empty JSON array (the previous slicing
    implementation produced the malformed "\\n]" for an empty input).
    """
    return "[\n" + ",\n".join(json.dumps(entry) for entry in entries) + "\n]"
def format_all(entry: dict, silent=False):
    """Run every formatting pass over a single entry and return it.

    Applies, in order: extras removal, r/ capitalization fix, Markdown
    link collapsing and subreddit normalization. Progress messages are
    printed unless silent is True.
    """
    pipeline = (
        ("Removing extras...", remove_extras),
        ("Fixing r/ capitalization...", fix_r_caps),
        ("Collapsing Markdown links...", collapse_links),
        ("Fix formatting of subreddit...", format_subreddit),
    )
    for message, step in pipeline:
        if not silent:
            print(message)
        entry = step(entry)
    if not silent:
        print("Completed!")
    return entry
if __name__ == '__main__':

    def go(path):
        """Format every entry of the JSON file at *path* and rewrite it."""
        print(f"Formatting {path}...")
        # Open read-only: the previous "r+" write access was never used,
        # since the file is reopened with "w" below.
        with open(path, "r", encoding='UTF-8') as infile:
            entries = json.load(infile)
        for i, entry in enumerate(entries):
            entries[i] = format_all(entry, True)
            # Progress marker every 500 entries (and for entry 0).
            if not (i % 500):
                print(f"{i} checked.")
        print(f"{len(entries)} checked.")
        with open(path, "w", encoding='UTF-8') as outfile:
            outfile.write(per_line_entries(entries))
        print("Writing completed. All done.")

    go("../web/atlas.json")
    go("../web/atlas-before-ids-migration.json")

View file

@ -1,25 +1,26 @@
#!/usr/bin/python
import re
pattern = re.compile(r'\[(.+?)\]\((.+?)\)')
import json
from formatter import collapse_links, per_line_entries
def go(path):
print(f"Fixing {path}...")
print(f"Formatting {path}...")
with open(path, "r+", encoding='UTF-8') as f1:
contents = f1.read()
entries = json.loads(f1.read())
for i in range(2):
for match in pattern.finditer(contents):
if match.group(1) == match.group(2):
contents = contents.replace(match.group(0), match.group(2), 1)
print(f"Stage {i+1} completed.")
for i in range(len(entries)):
entries[i] = collapse_links(entries[i])
if not (i % 500):
print(f"{i} checked.")
print(f"{len(entries)} checked.")
with open(path, "w", encoding='UTF-8') as f2:
f2.write(contents)
f2.write(per_line_entries(entries))
print("Writing completed. All done.")
go("../web/atlas.json")
go("../web/atlas-before-ids-migration.json")
go("../web/atlas-before-ids-migration.json")

View file

@ -1,36 +1,26 @@
#!/usr/bin/python
import re
import json
from formatter import remove_extras, fix_r_caps, per_line_entries
def go(path):
print(f"Fixing {path}...")
print(f"Formatting {path}...")
with open(path, "r+", encoding='UTF-8') as f1:
contents = f1.read()
entries = json.loads(f1.read())
contents = re.sub(r'": "(\s+)', r'": "', contents)
contents = re.sub(r'(\s+)"(, |,|\})', r'"\2', contents)
print("Leading and trailing spaces removed.")
for i in range(len(entries)):
entries[i] = remove_extras(entries[i])
entries[i] = fix_r_caps(entries[i])
if not (i % 500):
print(f"{i} checked.")
contents = re.sub(r' {2,}', r' ', contents)
print("Double spaces removed.")
contents = re.sub(r',{2,}', r',', contents)
print("Double commas removed.")
contents = re.sub(r'"n/a"', '""', contents)
contents = re.sub(r'"N/A"', '""', contents)
contents = re.sub(r'"-"', '""', contents)
contents = re.sub(r'"none"', '""', contents)
contents = re.sub(r'"null"', '""', contents)
print("Psuedo-empty strings converted into empty strings.")
contents = re.sub(r'R\/', 'r/', contents)
print("Capitalization of r/ has been fixed.")
print(f"{len(entries)} checked.")
with open(path, "w", encoding='UTF-8') as f2:
f2.write(contents)
f2.write(per_line_entries(entries))
print("Writing completed. All done.")
go("../web/atlas.json")

View file

@ -4,6 +4,7 @@
import time
import re
import os
from formatter import format_all
outfile = open('temp_atlas.json', 'w', encoding='utf-8')
failfile = open('manual_atlas.json', 'w', encoding='utf-8')
@ -11,14 +12,12 @@
credentials = open('credentials', 'r')
client_id = credentials.readline().strip(' \t\n\r')
client_secret = credentials.readline().strip(' \t\n\r')
user = credentials.readline().strip(' \t\n\r')
pw = credentials.readline().strip(' \t\n\r')
reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent='atlas_bot',username=user,password=pw)
has_write_access = not reddit.read_only
if not has_write_access:
print("Warning: No write access. Post flairs will not be updated")
sleep(5)
reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent='atlas_bot')
failcount = 0
successcount = 0
totalcount = 0
jsonfile = open("../web/atlas.json", "r", encoding='utf-8')
existing = json.load(jsonfile)
@ -28,17 +27,8 @@
for item in existing:
existing_ids.append(item['id'])
def set_flair(submission, flair):
if has_write_access and submission.link_flair_text != flair:
flair_choices = submission.flair.choices()
flair = next(x for x in flair_choices if x["flair_text_editable"] and flair == x["flair_text"])
submission.flair.select(flair["flair_template_id"])
total_all_flairs = 0
duplicate_count = 0
failcount = 0
successcount = 0
totalcount = 0
outfile.write("[\n")
for submission in reddit.subreddit('placeAtlas2').new(limit=2000):
"""
@ -52,8 +42,7 @@ def set_flair(submission, flair):
7. Append to file called "credentials"
8. Copy Secret
9. Append on newline to "credentials" file
10. If you want flair write access append 2 newlines with username and password (Must be a mod, don't do this if you don't know what you're doing)
11. Run Script
10. Run Script
Running Script
1. Input the next ID to use
@ -63,43 +52,41 @@ def set_flair(submission, flair):
"""
total_all_flairs += 1
if (submission.id in existing_ids):
set_flair(submission, "Processed Entry")
print("Found first duplicate!")
duplicate_count += 1
if (duplicate_count > 0):
if (duplicate_count > 10):
break
else:
continue
if(submission.link_flair_text == "New Entry"):
text = submission.selftext
#Old backslash filter:
#text = text.replace("\\", "")
#New one: One \\ escapes a backslash in python's parser
# Two escape it again in the regex parser, so \\\\ is \
# Then anything but " or n is replaced with the first capture group (anything but " or n)
# Test in repl: re.sub("\\\\([^\"n])", "\\1", "\\t < removed slash, t stays and > stays \\n \\\"")
text = re.sub("\\\\([^\"n])", "\\1", text)
submission_json = ""
try:
text = text.replace("\"id\": 0,", "\"id\": 0,\n\t\t\"submitted_by\": \""+submission.author.name+"\",")
except AttributeError:
text = text.replace("\"id\": 0,", "\"id\": 0,\n\t\t\"submitted_by\": \""+"unknown"+"\",")
lines = text.split("\n")
for i, line in enumerate(lines):
if("\"id\": 0" in line):
lines[i] = line.replace("\"id\": 0", "\"id\": "+"\""+str(submission.id)+"\"")
text = "\n".join(lines)
try:
outfile.write(json.dumps(json.loads(text))+" ,\n")
successcount += 1
set_flair(submission, "Processed Entry")
submission_json = json.loads(text)
except json.JSONDecodeError:
failfile.write(text+",\n")
failcount += 1
set_flair(submission, "Rejected Entry")
if (submission_json):
submission_json_dummy = {"id": submission.id, "submitted_by": ""}
try:
submission_json_dummy["submitted_by"] = submission.author.name
except AttributeError:
submission_json_dummy["submitted_by"]
for key in submission_json:
if not key in submission_json_dummy:
submission_json_dummy[key] = submission_json[key];
submission_json = format_all(submission_json_dummy)
outfile.write(json.dumps(json.loads(text))+" ,\n")
successcount += 1
print("written "+submission.id+" submitted "+str(round(time.time()-submission.created_utc))+" seconds ago")
totalcount += 1

View file

@ -1,65 +1,26 @@
#!/usr/bin/python
import re
patternParent = re.compile(r'"subreddit": ?"(?!")(.+?)"')
patternCommatization = re.compile(r',* +')
pattern1 = re.compile(r'\/?[rR]\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?')
pattern2 = re.compile(r'^\/?[rR](?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?')
pattern3 = re.compile(r'(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*')
pattern4 = re.compile(r'\[[A-Za-z0-9][A-Za-z0-9_]{1,20}\]\((?:(?:https:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*\)')
# pattern5 = re.compile(r'(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*')
# pattern6 = re.compile(r'\[(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\]\((?:https:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\)"')
"""
Examples:
1. - /r/place
- r/place
2. /rplace
3. - https://www.reddit.com/r/place
- www.reddit.com/r/place
- reddit.com/r/place
4. - [https://www.reddit.com/r/place](https://www.reddit.com/r/place)
- [www.reddit.com/r/place](www.reddit.com/r/place)
- [reddit.com/r/place](reddit.com/r/place)
UNUSED AND FAULTY
5. - https://place.reddit.com
- place.reddit.com
6. - [https://place.reddit.com](https://place.reddit.com)
- [place.reddit.com](https://place.reddit.com)
"""
def replaceStage1(contents: str):
contents = re.sub(patternCommatization, ', ', contents)
# r/... to /r/.. (change if not needed)
template = r"/r/\1"
contents = re.sub(pattern4, template, contents)
contents = re.sub(pattern3, template, contents)
contents = re.sub(pattern1, template, contents)
contents = re.sub(pattern2, template, contents)
return contents
import json
from formatter import format_subreddit, per_line_entries
def go(path):
print(f"Fixing {path}...")
print(f"Formatting {path}...")
with open(path, "r+", encoding='UTF-8') as f1:
contents = f1.read()
with open(path, "r+", encoding='UTF-8') as f1:
entries = json.loads(f1.read())
# Convert to r/... format first.
for matchParent in patternParent.finditer(contents):
subredditLink = matchParent.group(1)
subredditLink = replaceStage1(subredditLink)
if not subredditLink:
continue
if path == "../web/atlas-before-ids-migration.json":
contents = contents.replace(matchParent.group(0), '"subreddit":"' + subredditLink + '"', 1)
else:
contents = contents.replace(matchParent.group(0), '"subreddit": "' + subredditLink + '"', 1)
for i in range(len(entries)):
entries[i] = format_subreddit(entries[i])
if not (i % 500):
print(f"{i} checked.")
with open(path, "w", encoding='UTF-8') as f2:
f2.write(contents)
print("Writing completed. All done.")
print(f"{len(entries)} checked.")
with open(path, "w", encoding='UTF-8') as f2:
f2.write(per_line_entries(entries))
print("Writing completed. All done.")
go("../web/atlas.json")
go("../web/atlas-before-ids-migration.json")