atlas/tools/subreddit-format.py

#!/usr/bin/python

import re

# Matches a `"subreddit":` key (optionally followed by one space) with a
# non-empty quoted value. Group 1 is the value, taken lazily up to the next `"`.
patternParent = re.compile(r'"subreddit": ?"(?!")(.+?)"')
# Matches any number of commas (including none) followed by one or more spaces.
patternCommatization = re.compile(r',* +')
# In patterns 1-4, group 1 is the subreddit name: one letter/digit followed by
# 1-20 letters, digits or underscores (2-21 characters in total).
# pattern1: optional leading slash, then `r/` or `R/`, then the name. A trailing
# slash is consumed only when it is the last character of the string.
pattern1 = re.compile(r'\/?[rR]\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?')
# pattern2: anchored at the start of the string; optional slash, then `r`/`R`
# NOT followed by a slash, immediately followed by the name (e.g. `/rplace`).
pattern2 = re.compile(r'^\/?[rR](?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?')
# pattern3: `reddit.com/r/<name>` with an optional http(s) scheme and an optional
# www/old/new/np subdomain; any following path segments (containing neither `"`
# nor a space) are consumed as part of the match.
pattern3 = re.compile(r'(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*')
# pattern4: Markdown link `[text](target)` where the text is a bare name and the
# target is a reddit.com/r/<name> URL (scheme optional, `https` only). Group 1 is
# the name taken from the target.
# NOTE(review): because the link text must be a bare name, the URL-as-text forms
# listed under example 4 below are not matched by this pattern - confirm intent.
pattern4 = re.compile(r'\[[A-Za-z0-9][A-Za-z0-9_]{1,20}\]\((?:(?:https:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*\)')
# pattern5 = re.compile(r'(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*')
# pattern6 = re.compile(r'\[(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\]\((?:https:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\)"')
"""
Examples:
1. - /r/place
   - r/place
2. /rplace
3. - https://www.reddit.com/r/place
   - www.reddit.com/r/place
   - reddit.com/r/place
4. - [https://www.reddit.com/r/place](https://www.reddit.com/r/place)
   - [www.reddit.com/r/place](www.reddit.com/r/place)
   - [reddit.com/r/place](reddit.com/r/place)
UNUSED AND FAULTY
5. - https://place.reddit.com
   - place.reddit.com
6. - [https://place.reddit.com](https://place.reddit.com)
   - [place.reddit.com](https://place.reddit.com)
"""

def replaceStage1(contents: str):
	"""Return *contents* with subreddit references rewritten as ``/r/<name>``.

	First every run of spaces, together with any commas directly before it,
	is replaced by ``", "``. Then pattern4, pattern3, pattern1 and pattern2
	are applied in that order, each match being replaced by ``/r/`` plus the
	captured subreddit name.
	"""
	normalized = patternCommatization.sub(', ', contents)

	# r/... to /r/.. (change if not needed)
	# Markdown links and full URLs are rewritten before the bare r/name forms.
	for linkPattern in (pattern4, pattern3, pattern1, pattern2):
		normalized = linkPattern.sub(r"/r/\1", normalized)
	return normalized

def go(path):
	"""Rewrite every "subreddit" value in the file at *path*, in place.

	The file is read as UTF-8 text, each match of ``patternParent`` has its
	value passed through ``replaceStage1``, and the result is written back
	to the same path. Progress messages are printed to stdout.

	Args:
		path: Path of the file to fix. When it equals
			"../web/atlas-before-ids-migration.json" the rewritten key is
			emitted as ``"subreddit":"..."`` (no space after the colon);
			for any other path it is emitted as ``"subreddit": "..."``.

	Returns:
		None.
	"""

	print(f"Fixing {path}...")

	with open(path, "r", encoding='UTF-8') as f1:
		contents = f1.read()

	if path == "../web/atlas-before-ids-migration.json":
		keyPrefix = '"subreddit":"'
	else:
		keyPrefix = '"subreddit": "'

	def rewrite(matchParent):
		# Convert to r/... format first.
		subredditLink = replaceStage1(matchParent.group(1))
		if not subredditLink:
			# Nothing usable came back: leave this entry exactly as it was.
			return matchParent.group(0)
		return keyPrefix + subredditLink + '"'

	# One position-based pass over the document. A callable replacement is
	# used so that backslashes in the new value are inserted literally, and so
	# each match is rewritten where it was found instead of re-searching the
	# whole text for its first textual occurrence.
	contents = patternParent.sub(rewrite, contents)

	with open(path, "w", encoding='UTF-8') as f2:
		f2.write(contents)
	print("Writing completed. All done.")

# Fix both atlas data files, in this order.
for atlasPath in ("../web/atlas.json", "../web/atlas-before-ids-migration.json"):
	go(atlasPath)