Merge branch 'Codixer:cleanup' into cleanup

Ellie M 2022-04-06 02:10:09 -07:00 committed by GitHub
commit d9f1a1d62f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 3309 additions and 3244 deletions

@@ -5,19 +5,21 @@
 def go(path):
 	print(f"Fixing {path}...")
 	with open(path, "r+", encoding='UTF-8') as f1:
 		contents = f1.read()
-		for match in pattern.finditer(contents):
-			if match.group(1) == match.group(2):
-				contents = contents.replace(match.group(0), match.group(2), 1)
-		for match in pattern.finditer(contents):
-			if match.group(1) == match.group(2):
-				contents = contents.replace(match.group(0), match.group(2), 1)
+		for i in range(2):
+			for match in pattern.finditer(contents):
+				if match.group(1) == match.group(2):
+					contents = contents.replace(match.group(0), match.group(2), 1)
+			print(f"Stage {i+1} completed.")
 	with open(path, "w", encoding='UTF-8') as f2:
 		f2.write(contents)
 		print("Writing completed. All done.")
 go("../web/atlas.json")
 go("../web/atlas-before-ids-migration.json")

tools/misc-formats.py (new file, 37 lines)

@@ -0,0 +1,37 @@
#!/usr/bin/python

import re

def go(path):
	print(f"Fixing {path}...")
	with open(path, "r+", encoding='UTF-8') as f1:
		contents = f1.read()

		contents = re.sub(r'": "(\s+)', r'": "', contents)
		contents = re.sub(r'(\s+)"(, |,|\})', r'"\2', contents)
		print("Leading and trailing spaces removed.")

		contents = re.sub(r' {2,}', r' ', contents)
		print("Double spaces removed.")

		contents = re.sub(r',{2,}', r',', contents)
		print("Double commas removed.")

		contents = re.sub(r'"n/a"', '""', contents)
		contents = re.sub(r'"N/A"', '""', contents)
		contents = re.sub(r'"-"', '""', contents)
		contents = re.sub(r'"none"', '""', contents)
		contents = re.sub(r'"null"', '""', contents)
		print("Pseudo-empty strings converted into empty strings.")

		contents = re.sub(r'R\/', 'r/', contents)
		print("Capitalization of r/ has been fixed.")

	with open(path, "w", encoding='UTF-8') as f2:
		f2.write(contents)
		print("Writing completed. All done.")

go("../web/atlas.json")
go("../web/atlas-before-ids-migration.json")

@@ -59,7 +59,7 @@
 	# Two escape it again in the regex parser, so \\\\ is \
 	# Then anything but " or n is replaced with the first capture group (anything but " or n)
 	# Test in repl: re.sub("\\\\([^\"n])", "\\1", "\\t < removed slash, t stays and > stays \\n \\\"")
-	re.sub("\\\\([^\"n])", "\\1", text)
+	text = re.sub("\\\\([^\"n])", "\\1", text)
 	try:
 		text = text.replace("\"id\": 0,", "\"id\": 0,\n\t\t\"submitted_by\": \""+submission.author.name+"\",")
 	except AttributeError:
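The one-line change above is the whole fix: re.sub never modifies the string it is given, it returns a new one, so the old call's result was silently discarded. A minimal check, with a made-up sample string:

import re

text = "kept \\\" escape, stray \\t slash"   # literal backslashes before " and t
re.sub("\\\\([^\"n])", "\\1", text)          # old form: the return value is thrown away, text is unchanged
text = re.sub("\\\\([^\"n])", "\\1", text)   # fixed form: the backslash before t is removed, the \" escape survives
print(text)  # kept \" escape, stray t slash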

@@ -1,38 +1,65 @@
 #!/usr/bin/python
 import re
-pattern1 = re.compile(r'"subreddit": "\/r\/(.+?)/?"')
-pattern2 = re.compile(r'"subreddit": "r\/(.+?)/?"')
-pattern3 = re.compile(r'"subreddit": "\/?r(?!\/)(.+?)/?"')
-pattern4 = re.compile(r'"subreddit": "(?:(?:https:\/\/)?www.)?reddit.com\/r\/(.+?)(/[^"]*)*"')
-pattern5 = re.compile(r'"subreddit": "\[(?:(?:https:\/\/)?www.)?reddit.com\/r\/(.+?)(/[^"]*)*\]\((?:(?:https:\/\/)?www.)?reddit.com\/r\/(.+?)(/[^"]*)*\)"')
+patternParent = re.compile(r'"subreddit": ?"(?!")(.+?)"')
+patternCommatization = re.compile(r',* +')
+pattern1 = re.compile(r'\/?[rR]\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?')
+pattern2 = re.compile(r'^\/?[rR](?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?')
+pattern3 = re.compile(r'(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*')
+pattern4 = re.compile(r'\[[A-Za-z0-9][A-Za-z0-9_]{1,20}\]\((?:(?:https:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*\)')
+# pattern5 = re.compile(r'(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*')
+# pattern6 = re.compile(r'\[(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\]\((?:https:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\)"')
+"""
+Examples:
+1. - /r/place
+   - r/place
+2. /rplace
+3. - https://www.reddit.com/r/place
+   - www.reddit.com/r/place
+   - reddit.com/r/place
+4. - [https://www.reddit.com/r/place](https://www.reddit.com/r/place)
+   - [www.reddit.com/r/place](www.reddit.com/r/place)
+   - [reddit.com/r/place](reddit.com/r/place)
+UNUSED AND FAULTY
+5. - https://place.reddit.com
+   - place.reddit.com
+6. - [https://place.reddit.com](https://place.reddit.com)
+   - [place.reddit.com](https://place.reddit.com)
+"""
+def replaceStage1(contents: str):
+	contents = re.sub(patternCommatization, ', ', contents)
+	# r/... to /r/.. (change if not needed)
+	template = r"/r/\1"
+	contents = re.sub(pattern4, template, contents)
+	contents = re.sub(pattern3, template, contents)
+	contents = re.sub(pattern1, template, contents)
+	contents = re.sub(pattern2, template, contents)
+	return contents
 def go(path):
 	print(f"Fixing {path}...")
 	with open(path, "r+", encoding='UTF-8') as f1:
 		contents = f1.read()
-		for match in pattern5.finditer(contents):
-			contents = contents.replace(match.group(0), '"subreddit": "r/' + match.group(2) + '"', 1)
-		for match in pattern4.finditer(contents):
-			contents = contents.replace(match.group(0), '"subreddit": "r/' + match.group(1) + '"', 1)
-		for match in pattern1.finditer(contents):
-			contents = contents.replace(match.group(0), '"subreddit": "r/' + match.group(1) + '"', 1)
-		for match in pattern2.finditer(contents):
-			contents = contents.replace(match.group(0), '"subreddit": "r/' + match.group(1) + '"', 1)
-		for match in pattern3.finditer(contents):
-			contents = contents.replace(match.group(0), '"subreddit": "r/' + match.group(1) + '"', 1)
-		# # r/... to /r/.. (comment if not needed)
-		for match in pattern2.finditer(contents):
-			contents = contents.replace(match.group(0), '"subreddit": "/r/' + match.group(1) + '"', 1)
+		# Convert to r/... format first.
+		for matchParent in patternParent.finditer(contents):
+			subredditLink = matchParent.group(1)
+			subredditLink = replaceStage1(subredditLink)
+			if not subredditLink:
+				continue
+			if path == "../web/atlas-before-ids-migration.json":
+				contents = contents.replace(matchParent.group(0), '"subreddit":"' + subredditLink + '"', 1)
+			else:
+				contents = contents.replace(matchParent.group(0), '"subreddit": "' + subredditLink + '"', 1)
 	with open(path, "w", encoding='UTF-8') as f2:
 		f2.write(contents)
 		print("Writing completed. All done.")
 go("../web/atlas.json")
 go("../web/atlas-before-ids-migration.json")

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long