atlas/tools/formatter.py

#!/usr/bin/python

import re
import json

"""
Examples:
1. - /r/place
   - r/place
2. /rplace
3. - https://www.reddit.com/r/place
   - www.reddit.com/r/place
   - reddit.com/r/place
UNUSED AND FAULTY
4. - https://place.reddit.com
   - place.reddit.com
5. - [https://place.reddit.com](https://place.reddit.com)
   - [place.reddit.com](https://place.reddit.com)
"""
# Patterns applied by format_subreddit() to rewrite subreddit and user
# references. In every pattern the captured name is one alphanumeric
# character followed by 1-20 alphanumerics or underscores.
FS_REGEX = {
	# List separator: optional commas, optionally spaces plus "and" or "&",
	# then at least one space. Not referenced by any function in this file.
	"commatization": r',*(?: +(and|&))? +',
	# "r/name" or "R/name" with any number of leading slashes and an
	# optional trailing slash at the end of the string (example 1).
	"pattern1": r'\/*[rR]\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
	# Start-anchored "rname" with the slash after the "r" missing, e.g.
	# "/rplace" (example 2).
	"pattern2": r'^\/*[rR](?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
	# reddit.com/r/name URL with optional protocol, optional
	# www/old/new/np subdomain and any trailing path (example 3).
	"pattern3": r'(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*',
	# Same three shapes as above, for "u/name" and "user/name" references.
	"pattern1user": r'\/*(?:u|user)\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
	"pattern2user": r'^\/*(?:u|user)(?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
	"pattern3user": r'(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/(?:u|user)\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*',
	# Disabled patterns for examples 4 and 5 (marked "UNUSED AND FAULTY"
	# in the examples string above).
	# "pattern4": r'(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*',
	# "pattern5": r'\[(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\]\((?:https:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\)"',
}

# Patterns used by validate(), keyed by entry field name. Each pattern also
# accepts the empty string (the trailing "|^$" alternative).
VALIDATE_REGEX = {
	# One "/r/name", optionally followed by more, separated by a comma and
	# any number of spaces.
	"subreddit": r'^\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(, *\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20}))*$|^$',
	# An http:// or https:// URL containing no whitespace.
	"website": r'^https?://[^\s/$.?#].[^\s]*$|^$'
}

# Markdown link "[text](target)"; group 1 is the text, group 2 the target.
# Used by collapse_links().
CL_REGEX = r'\[(.+?)\]\((.+?)\)'
# A whole-string reddit.com/r/name/ URL (the trailing slash is required);
# group 1 is the subreddit name. Used by convert_website_to_subreddit().
CWTS_REGEX = r'^(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/)$'
# A whole-string http:// or https:// URL without whitespace. Used by
# convert_subreddit_to_website().
CSTW_REGEX = r'^https?://[^\s/$.?#].[^\s]*$'

# Replacement templates for re.sub(): r/... to /r/...
# "\1" is the name captured by the FS_REGEX / CWTS_REGEX patterns.
SUBREDDIT_TEMPLATE = r"/r/\1"
# NOTE(review): identical to SUBREDDIT_TEMPLATE, so user references are
# rewritten as "/r/<name>" as well -- confirm whether "/u/" was intended.
USER_TEMPLATE = r"/r/\1"

def format_subreddit(entry: dict):
	"""Rewrite the "subreddit" field of *entry* using the FS_REGEX patterns.

	The subreddit patterns are applied first (full URL, then "r/name", then
	"rname"), followed by the user patterns in the same order. The entry is
	modified in place and returned. Entries with a missing or empty
	"subreddit" are returned untouched, and the field is left alone when the
	rewritten value ends up empty.
	"""
	normalized = entry.get("subreddit")
	if not normalized:
		return entry

	# Order matters: the URL form has to be consumed before the shorter
	# patterns get a chance to match inside it.
	rewrites = (
		("pattern3", SUBREDDIT_TEMPLATE),
		("pattern1", SUBREDDIT_TEMPLATE),
		("pattern2", SUBREDDIT_TEMPLATE),
		("pattern3user", USER_TEMPLATE),
		("pattern1user", USER_TEMPLATE),
		("pattern2user", USER_TEMPLATE),
	)
	for pattern_name, template in rewrites:
		normalized = re.sub(FS_REGEX[pattern_name], template, normalized)

	if normalized:
		entry["subreddit"] = normalized
	return entry

def collapse_links(entry: dict):
	"""Collapse a Markdown link in "website" whose text equals its target.

	When the first "[text](target)" found in the website value has identical
	text and target, the whole field is replaced by the target. Any other
	value is written back unchanged. The entry is modified in place and
	returned; a missing or empty "website" is left untouched.
	"""
	website = entry.get("website")
	if not website:
		return entry

	found = re.search(CL_REGEX, website)
	if found is not None and found.group(1) == found.group(2):
		website = found.group(2)

	entry["website"] = website
	return entry

def remove_extras(entry: dict):
	"""Clean up stray punctuation, redundant whitespace and placeholder values.

	Trailing dots and commas are removed from a non-empty "subreddit" string.
	Then every non-empty string value of *entry* is stripped of surrounding
	whitespace, has repeated spaces, newlines, commas and the slashes after
	"r" collapsed, and is replaced by "" when it is only a placeholder such
	as "n/a" or "None". Non-string and empty values are skipped.

	The entry is modified in place and returned.
	"""
	subreddit = entry.get("subreddit")
	if subreddit and isinstance(subreddit, str):
		entry["subreddit"] = re.sub(r'[.,]+$', '', subreddit)

	pseudo_empty = ("n/a", "N/A", "na", "NA", "-", "null", "none", "None")

	# Only values of existing keys are reassigned, which is safe while
	# iterating over the dict.
	for key, value in entry.items():
		if not value or not isinstance(value, str):
			continue
		# Leading and trailing whitespace
		value = value.strip()
		# Runs of spaces collapse to one; a run directly before a newline
		# keeps two spaces (Markdown hard line break).
		value = re.sub(r' {2,}(?!\n)', ' ', value)
		# Safeguard for the same case: keep the newline in the replacement
		# so a hard break can never be merged into the next line.
		value = re.sub(r' {3,}\n', r'  \n', value)
		value = re.sub(r'\n{3,}', r'\n\n', value)
		# The replacement is a plain "r/": an escaped slash in a replacement
		# template is not an escape and would be emitted with its backslash.
		value = re.sub(r'r\/{2,}', 'r/', value)
		value = re.sub(r',{2,}', ',', value)
		# Pseudo-empty strings
		if value in pseudo_empty:
			value = ""
		entry[key] = value

	return entry

def fix_r_caps(entry: dict):
	"""Lowercase the "R" of "/R/" and "R/" prefixes in the description.

	A prefix is only rewritten at the start of the text or directly after a
	non-word character, so an "R/" inside a word is left alone. The entry is
	modified in place and returned; a missing or empty "description" is left
	untouched.
	"""
	description = entry.get("description")
	if not description:
		return entry

	# The replacements must be raw strings: in a normal string literal "\1"
	# is the control character U+0001, not a backreference to the character
	# captured before the prefix.
	description = re.sub(r'([^\w]|^)\/R\/', r'\1/r/', description)
	description = re.sub(r'([^\w]|^)R\/', r'\1r/', description)

	entry["description"] = description
	return entry

def fix_no_protocol_urls(entry: dict):
	"""Prefix the "website" field with "https://" when it has no protocol.

	The entry is modified in place and returned; a missing or empty
	"website" is left untouched.
	"""
	website = entry.get("website")
	if not website:
		return entry

	# Check for the full scheme prefix: a bare "http" test would also accept
	# host names that merely start with those letters (e.g. "httpbin.org").
	if not website.startswith(("http://", "https://")):
		entry["website"] = "https://" + website

	return entry

def convert_website_to_subreddit(entry: dict):
	"""Move a subreddit URL from "website" into "subreddit" where possible.

	When the website matches CWTS_REGEX it is turned into the "/r/name" form:
	- if that equals the current subreddit (case-insensitively), the
	  redundant website is cleared;
	- if the subreddit field exists and is empty, it receives the converted
	  value and the website is cleared.
	In all other cases, including a missing "subreddit" key, nothing changes.

	The entry is modified in place and returned.
	"""
	website = entry.get("website")
	if not website or not re.match(CWTS_REGEX, website):
		return entry

	new_subreddit = re.sub(CWTS_REGEX, SUBREDDIT_TEMPLATE, website)
	# .get() so that an entry without a "subreddit" key is skipped instead of
	# raising KeyError.
	subreddit = entry.get("subreddit")
	if isinstance(subreddit, str) and new_subreddit.lower() == subreddit.lower():
		entry["website"] = ""
	elif subreddit == "":
		entry["subreddit"] = new_subreddit
		entry["website"] = ""

	return entry

def convert_subreddit_to_website(entry: dict):
	"""Move a plain URL from "subreddit" into "website" where possible.

	When the subreddit value matches CSTW_REGEX (an http/https URL):
	- if it equals the current website (case-insensitively), the redundant
	  subreddit is cleared;
	- if the website field exists and is empty, it receives the URL and the
	  subreddit is cleared.
	In all other cases, including a missing "website" key, nothing changes.

	The entry is modified in place and returned.
	"""
	subreddit = entry.get("subreddit")
	if not subreddit or not re.match(CSTW_REGEX, subreddit):
		return entry

	# .get() so that an entry without a "website" key is skipped instead of
	# raising KeyError.
	website = entry.get("website")
	if isinstance(website, str) and website.lower() == subreddit.lower():
		entry["subreddit"] = ""
	elif website == "":
		entry["website"] = subreddit
		entry["subreddit"] = ""

	return entry
	
def validate(entry: dict):
	"""Print a warning for each field of *entry* that fails its pattern.

	An entry without an "id" (0 counts as a valid id) is reported and not
	checked any further. Otherwise every field that has a pattern in
	VALIDATE_REGEX is matched against it. Nothing is returned.
	"""
	has_id = "id" in entry and (entry['id'] or entry['id'] == 0)
	if not has_id:
		print(f"Wait, no id here! How did this happened? {entry}")
		return

	for key, value in entry.items():
		if key not in VALIDATE_REGEX:
			continue
		if re.match(VALIDATE_REGEX[key], value) is None:
			print(f"{key} of entry {entry['id']} is still invalid! {entry[key]}")

def per_line_entries(entries: list):
	"""Serialize *entries* as a JSON array with one entry per line.

	Each entry is dumped with json.dumps() on its own line, separated by
	",\n" and wrapped in "[\n" ... "\n]". An empty list yields "[\n\n]",
	which is still a valid (empty) JSON array.
	"""
	# join() places separators only between entries, so there is no trailing
	# comma to slice off -- slicing would eat the opening bracket when the
	# list is empty.
	body = ",\n".join(json.dumps(entry) for entry in entries)
	return "[\n" + body + "\n]"

def format_all(entry: dict, silent=False):
	"""Run every formatting step on *entry*, validate it and return it.

	A progress message is printed before each step unless *silent* is true.
	validate() prints its own warnings regardless of *silent*.
	"""
	def print_(*args, **kwargs):
		if silent:
			return
		print(*args, **kwargs)

	# Steps run in this exact order; each one receives the previous result.
	steps = (
		("Fixing r/ capitalization...", fix_r_caps),
		("Fixing links without protocol...", fix_no_protocol_urls),
		("Fix formatting of subreddit...", format_subreddit),
		("Collapsing Markdown links...", collapse_links),
		("Converting website links to subreddit (if possible)...", convert_website_to_subreddit),
		("Converting subreddit links to website (if needed)...", convert_subreddit_to_website),
		("Removing extras...", remove_extras),
	)
	for message, step in steps:
		print_(message)
		entry = step(entry)

	print_("Validating...")
	validate(entry)
	print_("Completed!")
	return entry

if __name__ == '__main__':

	def go(path):
		"""Format every entry of the JSON file at *path* and rewrite it in place."""
		print(f"Formatting {path}...")

		with open(path, "r+", encoding='UTF-8') as source:
			entries = json.load(source)

		for index, entry in enumerate(entries):
			entries[index] = format_all(entry, True)
			# Progress report every 500 entries (including the first one).
			if index % 500 == 0:
				print(f"{index} checked.")

		print(f"{len(entries)} checked.")

		with open(path, "w", encoding='UTF-8') as target:
			target.write(per_line_entries(entries))

		print("Writing completed. All done.")

	for atlas_path in ("../web/atlas.json", "../web/atlas-before-ids-migration.json"):
		go(atlas_path)
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`#!/usr/bin/python`

			`import re`
			`import json`

			`"""`
			`Examples:`
			`1. - /r/place`
			`- r/place`
			`2. /rplace`
			`3. - https://www.reddit.com/r/place`
			`- www.reddit.com/r/place`
			`- reddit.com/r/place`
			`UNUSED AND FAULTY`
Optimize and remove redundant code, make some tamer 2022-04-07 14:43:57 +02:00			`4. - https://place.reddit.com`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`- place.reddit.com`
Optimize and remove redundant code, make some tamer 2022-04-07 14:43:57 +02:00			`5. - [https://place.reddit.com](https://place.reddit.com)`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`- [place.reddit.com](https://place.reddit.com)`
			`"""`
Tiny refactor, remove redundant subreddit link on website Resolves #707 2022-04-07 11:02:43 +02:00			`FS_REGEX = {`
Support some other symbols 2022-04-07 18:25:25 +02:00			`"commatization": r',*(?: +(and\|&))? +',`
Support accidental extra slashes 2022-04-07 17:47:35 +02:00			`"pattern1": r'\/*[rR]\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',`
			`"pattern2": r'^\/*[rR](?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`"pattern3": r'(?:(?:https?:\/\/)?(?:(?:www\|old\|new\|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ])',`
Support accidental extra slashes 2022-04-07 17:47:35 +02:00			`"pattern1user": r'\/*(?:u\|user)\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',`
			`"pattern2user": r'^\/*(?:u\|user)(?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',`
Add support(?) for users 2022-04-07 16:39:25 +02:00			`"pattern3user": r'(?:(?:https?:\/\/)?(?:(?:www\|old\|new\|np)\.)?)?reddit\.com\/(?:u\|user)\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ])',`
Optimize and remove redundant code, make some tamer 2022-04-07 14:43:57 +02:00			`# "pattern4": r'(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"])',`
			`# "pattern5": r'\[(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"])\]\((?:https:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"])\)"',`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`}`

Add validation 2022-04-07 17:03:30 +02:00			`VALIDATE_REGEX = {`
			`"subreddit": r'^\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(, \/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20}))$\|^$',`
Better URL regex by stephenhay from https://mathiasbynens.be/demo/url-regex 2022-04-07 17:52:47 +02:00			`"website": r'^https?://[^\s/$.?#].[^\s]*$\|^$'`
Add validation 2022-04-07 17:03:30 +02:00			`}`

Tiny refactor, remove redundant subreddit link on website Resolves #707 2022-04-07 11:02:43 +02:00			`CL_REGEX = r'\[(.+?)\]\((.+?)\)'`
			`CWTS_REGEX = r'^(?:(?:https?:\/\/)?(?:(?:www\|old\|new\|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/)$'`
Better URL regex by stephenhay from https://mathiasbynens.be/demo/url-regex 2022-04-07 17:52:47 +02:00			`CSTW_REGEX = r'^https?://[^\s/$.?#].[^\s]*$'`
Tiny refactor, remove redundant subreddit link on website Resolves #707 2022-04-07 11:02:43 +02:00
			`# r/... to /r/...`
			`SUBREDDIT_TEMPLATE = r"/r/\1"`
Add support(?) for users 2022-04-07 16:39:25 +02:00			`USER_TEMPLATE = r"/r/\1"`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00
			`def format_subreddit(entry: dict):`
Happy little accidents 2, convert and on subreddit 2022-04-07 09:36:29 +02:00			`if not "subreddit" in entry or not entry['subreddit']:`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`return entry`

			`subredditLink = entry["subreddit"]`
Tiny refactor, remove redundant subreddit link on website Resolves #707 2022-04-07 11:02:43 +02:00			`subredditLink = re.sub(FS_REGEX["pattern3"], SUBREDDIT_TEMPLATE, subredditLink)`
			`subredditLink = re.sub(FS_REGEX["pattern1"], SUBREDDIT_TEMPLATE, subredditLink)`
			`subredditLink = re.sub(FS_REGEX["pattern2"], SUBREDDIT_TEMPLATE, subredditLink)`
Add support(?) for users 2022-04-07 16:39:25 +02:00			`subredditLink = re.sub(FS_REGEX["pattern3user"], USER_TEMPLATE, subredditLink)`
			`subredditLink = re.sub(FS_REGEX["pattern1user"], USER_TEMPLATE, subredditLink)`
			`subredditLink = re.sub(FS_REGEX["pattern2user"], USER_TEMPLATE, subredditLink)`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00
			`if not subredditLink:`
			`return entry`

			`entry["subreddit"] = subredditLink`
			`return entry`

			`def collapse_links(entry: dict):`
Happy little accidents 2, convert and on subreddit 2022-04-07 09:36:29 +02:00			`if not "website" in entry or not entry['website']:`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`return entry`
Happy little accidents 2, convert and on subreddit 2022-04-07 09:36:29 +02:00
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`website = entry["website"];`
Tiny refactor, remove redundant subreddit link on website Resolves #707 2022-04-07 11:02:43 +02:00			`if re.search(CL_REGEX, website):`
			`match = re.search(CL_REGEX, website)`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`if match.group(1) == match.group(2):`
			`website = match.group(2)`

			`entry["website"] = website`
			`return entry`

			`def remove_extras(entry: dict):`
Support some other symbols 2022-04-07 18:25:25 +02:00			`if "subreddit" in entry and entry["subreddit"]:`
			`# if not entry["subreddit"].startswith('/r/'):`
			`# entry["subreddit"] = re.sub(r'^(.*)(?=\/r\/)', r'', entry["subreddit"])`
			`entry["subreddit"] = re.sub(r'[.,]+$', r'', entry["subreddit"])`

Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`for key in entry:`
			`if not entry[key] or not isinstance(entry[key], str):`
			`continue`
			`# Leading and trailing spaces`
			`entry[key] = re.sub(r'^(\s+)', r'', entry[key])`
			`entry[key] = re.sub(r'(\s+)$', r'', entry[key])`
Optimize and remove redundant code, make some tamer 2022-04-07 14:43:57 +02:00			`# Double characters`
I mean, do this instead. This assumes that it uses Markdown 2022-04-07 16:50:28 +02:00			`entry[key] = re.sub(r' {2,}(?!\n)', r' ', entry[key])`
			`entry[key] = re.sub(r' {3,}\n', r' ', entry[key])`
			`entry[key] = re.sub(r'\n{3,}', r'\n\n', entry[key])`
Happy little accidents 3 2022-04-07 16:33:10 +02:00			`entry[key] = re.sub(r'r\/{2,}', r'r\/', entry[key])`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`entry[key] = re.sub(r',{2,}', r',', entry[key])`
			`# Psuedo-empty strings`
Add more psuedo-empty strings 2022-04-07 17:38:26 +02:00			`if entry[key] in ["n/a", "N/A", "na", "NA", "-", "null", "none", "None"]:`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`entry[key] = ""`

			`return entry`

			`def fix_r_caps(entry: dict):`
Happy little accidents 2, convert and on subreddit 2022-04-07 09:36:29 +02:00			`if not "description" in entry or not entry['description']:`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`return entry`

Optimize and remove redundant code, make some tamer 2022-04-07 14:43:57 +02:00			`entry["description"] = re.sub(r'([^\w]\|^)\/R\/', '\1/r/', entry["description"])`
			`entry["description"] = re.sub(r'([^\w]\|^)R\/', '\1r/', entry["description"])`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00
			`return entry`

Adapt ARP's url fix Co-authored-by: Cheng Hann Gan <chenghanngan.us@gmail.com> 2022-04-07 09:10:08 +02:00			`def fix_no_protocol_urls(entry: dict):`
Happy little accidents 2, convert and on subreddit 2022-04-07 09:36:29 +02:00			`if not "website" in entry or not entry['website']:`
Adapt ARP's url fix Co-authored-by: Cheng Hann Gan <chenghanngan.us@gmail.com> 2022-04-07 09:10:08 +02:00			`return entry`

Happy little accidents 2, convert and on subreddit 2022-04-07 09:36:29 +02:00			`if not entry["website"].startswith("http"):`
Adapt ARP's url fix Co-authored-by: Cheng Hann Gan <chenghanngan.us@gmail.com> 2022-04-07 09:10:08 +02:00			`entry["website"] = "https://" + entry["website"]`

			`return entry`

Tiny refactor, remove redundant subreddit link on website Resolves #707 2022-04-07 11:02:43 +02:00			`def convert_website_to_subreddit(entry: dict):`
Improve CSTW and whoops 2022-04-07 17:18:07 +02:00			`if not "website" in entry or not entry['website']:`
Tiny refactor, remove redundant subreddit link on website Resolves #707 2022-04-07 11:02:43 +02:00			`return entry`

			`if re.match(CWTS_REGEX, entry["website"]):`
			`new_subreddit = re.sub(CWTS_REGEX, SUBREDDIT_TEMPLATE, entry["website"])`
			`if (new_subreddit.lower() == entry["subreddit"].lower()):`
			`entry["website"] = ""`
Improve CSTW and whoops 2022-04-07 17:18:07 +02:00			`elif "subreddit" in entry and entry['subreddit'] == "":`
Tiny refactor, remove redundant subreddit link on website Resolves #707 2022-04-07 11:02:43 +02:00			`entry["subreddit"] = new_subreddit`
			`entry["website"] = ""`

			`return entry`

Add CSTW, clarity fixes 2022-04-07 16:31:56 +02:00			`def convert_subreddit_to_website(entry: dict):`
Improve CSTW and whoops 2022-04-07 17:18:07 +02:00			`if not "subreddit" in entry or not entry['subreddit']:`
Add CSTW, clarity fixes 2022-04-07 16:31:56 +02:00			`return entry`

			`if re.match(CSTW_REGEX, entry["subreddit"]):`
Improve CSTW and whoops 2022-04-07 17:18:07 +02:00			`if (entry["website"].lower() == entry["subreddit"].lower()):`
			`entry["subreddit"] = ""`
			`elif "website" in entry and entry['website'] == "":`
			`entry["website"] = entry["subreddit"]`
			`entry["subreddit"] = ""`
Add CSTW, clarity fixes 2022-04-07 16:31:56 +02:00
			`return entry`

Add validation 2022-04-07 17:03:30 +02:00			`def validate(entry: dict):`
			`if (not "id" in entry or (not entry['id'] and not entry['id'] == 0)):`
			`print(f"Wait, no id here! How did this happened? {entry}")`
			`return`
			`for key in entry:`
			`if key in VALIDATE_REGEX and not re.match(VALIDATE_REGEX[key], entry[key]):`
			`print(f"{key} of entry {entry['id']} is still invalid! {entry[key]}")`
Add CSTW, clarity fixes 2022-04-07 16:31:56 +02:00
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`def per_line_entries(entries: list):`
			`out = "[\n"`
			`for entry in entries:`
			`out += json.dumps(entry) + ",\n"`
			`out = out[:-2] + "\n]"`
			`return out`

			`def format_all(entry: dict, silent=False):`
			`def print_(args, *kwargs):`
			`if not silent:`
			`print(args, *kwargs)`
			`print_("Fixing r/ capitalization...")`
			`entry = fix_r_caps(entry)`
Adapt ARP's url fix Co-authored-by: Cheng Hann Gan <chenghanngan.us@gmail.com> 2022-04-07 09:10:08 +02:00			`print_("Fixing links without protocol...")`
			`entry = fix_no_protocol_urls(entry)`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`print_("Fix formatting of subreddit...")`
			`entry = format_subreddit(entry)`
Optimize and remove redundant code, make some tamer 2022-04-07 14:43:57 +02:00			`print_("Collapsing Markdown links...")`
			`entry = collapse_links(entry)`
Move things to make it more effective 2022-04-07 11:43:12 +02:00			`print_("Converting website links to subreddit (if possible)...")`
			`entry = convert_website_to_subreddit(entry)`
Add CSTW, clarity fixes 2022-04-07 16:31:56 +02:00			`print_("Converting subreddit links to website (if needed)...")`
			`entry = convert_subreddit_to_website(entry)`
Optimize and remove redundant code, make some tamer 2022-04-07 14:43:57 +02:00			`print_("Removing extras...")`
			`entry = remove_extras(entry)`
Add validation 2022-04-07 17:03:30 +02:00			`print_("Validating...")`
			`validate(entry)`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`print_("Completed!")`
			`return entry`

			`if __name__ == '__main__':`

			`def go(path):`

			`print(f"Formatting {path}...")`

			`with open(path, "r+", encoding='UTF-8') as f1:`
			`entries = json.loads(f1.read())`

			`for i in range(len(entries)):`
			`entries[i] = format_all(entries[i], True)`
			`if not (i % 500):`
			`print(f"{i} checked.")`

			`print(f"{len(entries)} checked.")`

			`with open(path, "w", encoding='UTF-8') as f2:`
			`f2.write(per_line_entries(entries))`

			`print("Writing completed. All done.")`

			`go("../web/atlas.json")`
			`go("../web/atlas-before-ids-migration.json")`