atlas/tools/formatter.py

#!/usr/bin/python

import re
import json

"""
Examples:
1. - /r/place
   - r/place
2. /rplace
3. - https://www.reddit.com/r/place
   - www.reddit.com/r/place
   - reddit.com/r/place
UNUSED AND FAULTY
4. - https://place.reddit.com
   - place.reddit.com
5. - [https://place.reddit.com](https://place.reddit.com)
   - [place.reddit.com](https://place.reddit.com)
"""
# Patterns applied by format_subreddit() to the "subreddit" value. The numbered
# keys correspond to the numbered examples in the string above; in every
# patternN / patternNuser entry, group 1 is the captured name (one alphanumeric
# character followed by 1-20 alphanumerics or underscores).
FS_REGEX = {
	# Separators between names: "and", "&" or ";" flanked by commas and/or spaces,
	# a trailing comma at the end of the string, or any run of spaces.
	"commatization": r'( *(,+ +|,+ |,+)| +)(and|&|;)( *(,+ +|,+ |,+)| +)|, *$| +',
	# "r/name" or "R/name" with any number of leading slashes; a slash that ends
	# the string is consumed as well.
	"pattern1": r'\/*[rR]\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
	# "r" or "R" directly followed by the name (no slash), only at the start of
	# the string.
	"pattern2": r'^\/*[rR](?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
	# reddit.com/r/name with optional scheme, optional www/old/new/np subdomain
	# and any trailing path segments free of spaces and double quotes.
	"pattern3": r'(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*',
	# Same three shapes as above, for "u/" and "user/" instead of "r/".
	"pattern1user": r'\/*(?:u|user)\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
	"pattern2user": r'^\/*(?:u|user)(?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',
	"pattern3user": r'(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/(?:u|user)\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ]*)*',
	# "pattern4": r'(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*',
	# "pattern5": r'\[(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\]\((?:https:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"]*)*\)"',
}

# Patterns used by validate(), keyed by entry field name. A field whose value
# does not match its pattern is reported as a warning.
VALIDATE_REGEX = {
	# Empty, or a comma-separated list of "/r/name" or "r/name" items with
	# optional spaces around each item.
	"subreddit": r'^ *\/?r\/([A-Za-z0-9][A-Za-z0-9_]{1,20}) *(, *\/?r\/([A-Za-z0-9][A-Za-z0-9_]{1,20}) *)*$|^$',
	# Empty, or an http(s) URL that contains no whitespace.
	"website": r'^https?://[^\s/$.?#].[^\s]*$|^$'
}

# Markdown link "[label](target)" used by collapse_links(): group 1 is the
# label, group 2 is the target (both matched non-greedily).
CL_REGEX = r'\[(.+?)\]\((.+?)\)'
# Patterns used by convert_website_to_subreddit() to recognise a "website"
# value that consists of nothing but a subreddit reference. Both patterns are
# anchored to the whole value and capture the subreddit name as group 1.
CWTS_REGEX = {
	# reddit.com/r/name with optional scheme and www/old/new/np subdomain.
	# The trailing slash is optional so that ".../r/place" is recognised as
	# well as ".../r/place/"; any deeper path is deliberately not matched.
	"url": r'^(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})\/?$',
	# Bare "r/name" or "/r/name" (any number of leading slashes, optional
	# trailing slash).
	"subreddit": r'^\/*[rR]\/([A-Za-z0-9][A-Za-z0-9_]{1,20})\/?$'
}
# Patterns used by convert_subreddit_to_website() to recognise a "subreddit"
# value that is really a website or a Reddit user.
CSTW_REGEX = {
	# An http(s) URL that contains no whitespace.
	"website": r'^https?://[^\s/$.?#].[^\s]*$',
	# "u/name" with any number of leading slashes; group 1 is the user name.
	"user": r'^\/*u\/([A-Za-z0-9][A-Za-z0-9_]{1,20})$'
}

# Replacement templates for re.sub(): group 1 of the matched pattern is the
# name, so "r/..." becomes "/r/..." and "u/..." or "user/..." becomes "/u/...".
SUBREDDIT_TEMPLATE = r"/r/\1"
USER_TEMPLATE = r"/u/\1"

def format_subreddit(entry: dict):
	"""
	Normalize the value stored under "subreddit".

	Separators are first rewritten to ", " and then every subreddit shape is
	rewritten to "/r/name" and every user shape to "/u/name". An entry without
	a non-empty "subreddit" is returned unchanged, and the field is left alone
	if the rewritten value ends up empty.

	Returns the same entry object.
	"""
	if not entry.get("subreddit"):
		return entry

	# (FS_REGEX key, replacement) pairs, applied in this fixed sequence.
	rewrites = (
		("commatization", ', '),
		("pattern3", SUBREDDIT_TEMPLATE),
		("pattern1", SUBREDDIT_TEMPLATE),
		("pattern2", SUBREDDIT_TEMPLATE),
		("pattern3user", USER_TEMPLATE),
		("pattern1user", USER_TEMPLATE),
		("pattern2user", USER_TEMPLATE),
	)

	formatted = entry["subreddit"]
	for pattern_key, replacement in rewrites:
		formatted = re.sub(FS_REGEX[pattern_key], replacement, formatted)

	if formatted:
		entry["subreddit"] = formatted
	return entry

def collapse_links(entry: dict):
	"""
	Collapse Markdown links whose label is identical to their target.

	For the "website" and "subreddit" fields: when the value contains a
	Markdown link "[label](target)" and label == target, the whole value is
	replaced by the target. Missing or empty fields are skipped.

	Returns the same entry object.
	"""
	for field in ("website", "subreddit"):
		if not entry.get(field):
			continue

		link = re.search(CL_REGEX, entry[field])
		if link and link.group(1) == link.group(2):
			entry[field] = link.group(2)

	return entry

def remove_extras(entry: dict):
	"""
	Remove unnecessary extra characters and convert select characters.

	The "subreddit" value first loses any trailing dots, commas and
	whitespace. Then every non-empty string value of the entry is cleaned:
	surrounding whitespace is stripped, runs of spaces are collapsed (two
	spaces directly before a newline are kept, as in a Markdown hard line
	break), three or more newlines become two, "r//" becomes "r/", repeated
	commas become one, smart quotes become plain quotes, and pseudo-empty
	placeholders such as "n/a" or "none" become the empty string.

	Non-string and empty values are left untouched. Returns the same entry
	object.
	"""
	subreddit = entry.get("subreddit")
	if subreddit and isinstance(subreddit, str):
		# Whitespace is removed together with the punctuation; otherwise a value
		# such as "/r/place, " would keep its comma once the spaces are stripped.
		entry["subreddit"] = re.sub(r'[.,\s]+$', '', subreddit)

	pseudo_empty = ("n/a", "N/A", "na", "NA", "-", "null", "none", "None")

	for key in entry:
		value = entry[key]
		if not value or not isinstance(value, str):
			continue
		# Leading and trailing spaces
		value = value.strip()
		# Double characters
		value = re.sub(r' {2,}(?!\n)', ' ', value)
		# Keep the newline so the line break itself is never swallowed.
		value = re.sub(r' {3,}\n', '  \n', value)
		value = re.sub(r'\n{3,}', r'\n\n', value)
		# A plain "r/" replacement: a backslash in the template would be written
		# into the result verbatim.
		value = re.sub(r'r/{2,}', 'r/', value)
		value = re.sub(r',{2,}', ',', value)
		# Smart quotation marks
		value = re.sub(r'[\u201c\u201d]', '"', value)
		value = re.sub(r'[\u2018\u2019]', "'", value)
		# Pseudo-empty strings
		if value in pseudo_empty:
			value = ""
		entry[key] = value

	return entry

def fix_r_caps(entry: dict):
	"""
	Fixes capitalization of /r/ in "description". (/R/place -> /r/place)

	"/R/" and "R/" are lowercased when they appear at the start of the text
	or right after a non-word character, so words that merely end in "R" are
	not touched. An entry without a non-empty "description" is returned
	unchanged.

	Returns the same entry object.
	"""
	if not entry.get("description"):
		return entry

	description = entry["description"]
	# The replacement templates must be raw strings: in a normal string literal
	# '\1' is the control character U+0001, not a reference to group 1.
	description = re.sub(r'([^\w]|^)\/R\/', r'\1/r/', description)
	description = re.sub(r'([^\w]|^)R\/', r'\1r/', description)
	entry["description"] = description

	return entry

def fix_no_protocol_urls(entry: dict):
	"""
	Fixes URLs with no protocol by adding "https://" protocol.

	Only the "website" field is inspected. A value that already starts with
	"http:" or "https:" (in any letter case) is left as it is; everything
	else gets "https://" prepended. An entry without a non-empty "website"
	is returned unchanged.

	Returns the same entry object.
	"""
	if not entry.get("website"):
		return entry

	website = entry["website"]
	# Look for the scheme including its colon: a bare "http" prefix would also
	# match host names such as "http.cat" and leave them without a protocol.
	if not re.match(r'https?:', website, re.IGNORECASE):
		entry["website"] = "https://" + website

	return entry

def convert_website_to_subreddit(entry: dict):
	"""
	Converts the subreddit link on "website" to "subreddit" if possible.

	When "website" is nothing but a subreddit reference (a reddit.com URL or
	a bare "r/name", as described by CWTS_REGEX):
	- if it names the same subreddit as "subreddit" (case-insensitive), the
	  redundant "website" is cleared;
	- if "subreddit" is missing or empty, the reference is moved there as
	  "/r/name" and "website" is cleared;
	- otherwise both fields are kept.

	A missing "subreddit" key is treated like an empty one. Returns the same
	entry object.
	"""
	if not entry.get("website"):
		return entry

	website = entry["website"]
	# Read the current value defensively: the key may be absent altogether.
	current_subreddit = entry.get("subreddit") or ""

	for pattern in (CWTS_REGEX["url"], CWTS_REGEX["subreddit"]):
		match = re.match(pattern, website)
		if match is None:
			continue

		new_subreddit = match.expand(SUBREDDIT_TEMPLATE)
		if new_subreddit.lower() == current_subreddit.lower():
			entry["website"] = ""
		elif not current_subreddit:
			entry["subreddit"] = new_subreddit
			entry["website"] = ""
		# Only the first matching pattern is applied.
		break

	return entry

def convert_subreddit_to_website(entry: dict):
	"""
	Converts the links on "subreddit" to a "website" if needed. This also supports Reddit users (/u/reddit).

	When "subreddit" holds a plain website URL:
	- if it equals "website" (case-insensitive), "subreddit" is cleared;
	- if "website" is missing or empty, the URL is moved there and
	  "subreddit" is cleared;
	- otherwise both fields are kept.

	When "subreddit" holds a user reference ("/u/name") and "website" is
	missing or empty, "website" becomes the user's profile URL and
	"subreddit" is cleared.

	A missing "website" key is treated like an empty one. Returns the same
	entry object.
	"""
	if not entry.get("subreddit"):
		return entry

	subreddit = entry["subreddit"]
	# Read the current value defensively: the key may be absent altogether.
	website = entry.get("website") or ""

	if re.match(CSTW_REGEX["website"], subreddit):
		if website.lower() == subreddit.lower():
			entry["subreddit"] = ""
		elif not website:
			entry["website"] = subreddit
			entry["subreddit"] = ""
		return entry

	user_match = re.match(CSTW_REGEX["user"], subreddit)
	if user_match and not website:
		entry["website"] = "https://www.reddit.com/user/" + user_match.group(1)
		entry["subreddit"] = ""

	return entry

def calculate_center(path: list):
	"""
	Compute the center of a polygon.

	path is a list of points, each indexable as point[0] (x) and point[1]
	(y). Every point is paired with its predecessor (the first point pairs
	with the last one) and the cross products of those pairs are accumulated
	into an area term and weighted x/y sums. The result is the floored
	quotient of each sum by three times the area term, plus 0.5.

	When the area term is zero (for example all points on one line), the
	center of the bounding box is returned instead, floored and offset by 0.5
	in the same way.

	adapted from /web/_js/draw.js:calucalteCenter()
	"""
	area_sum = 0
	x_sum = 0
	y_sum = 0

	# Rotate the list by one so that zip() yields (point, previous point).
	previous_points = path[-1:] + path[:-1]
	for current, previous in zip(path, previous_points):
		cross = current[0] * previous[1] - previous[0] * current[1]
		area_sum += cross
		x_sum += (current[0] + previous[0]) * cross
		y_sum += (current[1] + previous[1]) * cross

	area_sum *= 3

	if area_sum == 0:
		# get the center of a straight line
		xs = [point[0] for point in path]
		ys = [point[1] for point in path]
		return [(max(xs) + min(xs)) // 2 + 0.5, (max(ys) + min(ys)) // 2 + 0.5]

	return [x_sum // area_sum + 0.5, y_sum // area_sum + 0.5]

def update_center(entry: dict):
	"""
	Bring the "center" of an entry in line with its "path".

	Entries without a "path", or whose path has fewer than two points, are
	returned unchanged. Otherwise the center is recalculated from the path
	and stored when "center" is missing or differs from the calculated value.

	Returns the same entry object.
	"""
	if 'path' not in entry:
		return entry

	points = entry['path']
	if len(points) <= 1:
		return entry

	fresh_center = calculate_center(points)
	is_outdated = 'center' not in entry or entry['center'] != fresh_center
	if is_outdated:
		entry['center'] = fresh_center
	return entry

def validate(entry: dict):
	"""
	Check an entry and report problems on standard output.

	An entry with a missing or empty id (0 counts as a valid id) gets the
	placeholder id '[MISSING_ID]'. The path must be a list with at least
	three points. Fields listed in VALIDATE_REGEX must match their pattern.

	Returns the highest status code reached:
	0: All valid, no problems
	1: Informational logs that may be ignored
	2: Warnings that may affect user experience when interacting with the entry
	3: Errors that make the entry inaccessible or broken.
	"""
	status = 0

	has_id = "id" in entry and (entry['id'] or entry['id'] == 0)
	if not has_id:
		print(f"Wait, no id here! How did this happened? {entry}")
		status = 3
		entry['id'] = '[MISSING_ID]'

	points = entry.get("path")
	if not isinstance(points, list) or len(points) == 0:
		print(f"Entry {entry['id']} has no points!")
		status = 3
	elif len(points) < 3:
		print(f"Entry {entry['id']} only has {len(entry['path'])} point(s)!")
		status = 3

	for key in entry:
		pattern = VALIDATE_REGEX.get(key)
		if pattern is None:
			continue
		if re.match(pattern, entry[key]):
			continue
		# A failed pattern is a warning; it never lowers an earlier error.
		status = max(status, 2)
		print(f"{key} of entry {entry['id']} is still invalid! {entry[key]}")

	return status

def per_line_entries(entries: list):
	"""
	Returns a string of all the entries, with every entry in one line.

	The result is a JSON array: "[" on the first line, one JSON-encoded entry
	per line separated by commas, and "]" on the last line. Falsy items
	(such as None placeholders for removed entries) are skipped. Non-ASCII
	characters are written as they are, not escaped.

	When there is nothing to write the result is an empty array ("[\\n]"),
	which is still valid JSON.
	"""
	lines = [json.dumps(entry, ensure_ascii=False) for entry in entries if entry]
	if not lines:
		return "[\n]"
	return "[\n" + ",\n".join(lines) + "\n]"

def format_all(entry: dict, silent=False):
	"""
	Run every formatter on an entry, then validate it.

	Progress messages are printed before each step unless silent is true.
	Returns a tuple of the formatted entry and the validation status code.

	Status code key:
	0: All valid, no problems
	1: Informational logs that may be ignored
	2: Warnings that may affect user experience when interacting with the entry
	3: Errors that make the entry inaccessible or broken.
	"""
	def announce(*args, **kwargs):
		if silent:
			return
		print(*args, **kwargs)

	# (progress message, formatter) pairs, run in this sequence.
	steps = (
		("Fixing r/ capitalization...", fix_r_caps),
		("Fix formatting of subreddit...", format_subreddit),
		("Collapsing Markdown links...", collapse_links),
		("Converting website links to subreddit (if possible)...", convert_website_to_subreddit),
		("Converting subreddit links to website (if needed)...", convert_subreddit_to_website),
		("Fixing links without protocol...", fix_no_protocol_urls),
		("Removing extras...", remove_extras),
		("Updating center", update_center),
	)
	for message, formatter in steps:
		announce(message)
		entry = formatter(entry)

	announce("Validating...")
	status_code = validate(entry)
	announce("Completed!")
	return entry, status_code

if __name__ == '__main__':

	def go(path):
		"""
		Format the JSON file at path in place.

		Every entry is passed through format_all() silently. Entries whose
		validation status is above 2 are reported and replaced by None;
		the rest are replaced by their formatted version. The list is then
		written back to the same file with LF line endings.
		"""
		print(f"Formatting {path}...")

		with open(path, "r+", encoding='UTF-8') as source_file:
			entries = json.load(source_file)

		for index, entry in enumerate(entries):
			formatted, status = format_all(entry, True)
			if status > 2:
				print(f"Entry {formatted['id']} will be removed! {json.dumps(formatted)}")
				entries[index] = None
			else:
				entries[index] = formatted
			# Progress report every 500 entries, starting with the first one.
			if index % 500 == 0:
				print(f"{index} checked.")

		print(f"{len(entries)} checked.")

		with open(path, "w", encoding='utf-8', newline='\n') as target_file:
			target_file.write(per_line_entries(entries))

		print("Writing completed. All done.")

	go("../web/atlas.json")
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`#!/usr/bin/python`

			`import re`
			`import json`

			`"""`
			`Examples:`
			`1. - /r/place`
			`- r/place`
			`2. /rplace`
			`3. - https://www.reddit.com/r/place`
			`- www.reddit.com/r/place`
			`- reddit.com/r/place`
			`UNUSED AND FAULTY`
Optimize and remove redundant code, make some tamer 2022-04-07 14:43:57 +02:00			`4. - https://place.reddit.com`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`- place.reddit.com`
Optimize and remove redundant code, make some tamer 2022-04-07 14:43:57 +02:00			`5. - [https://place.reddit.com](https://place.reddit.com)`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`- [place.reddit.com](https://place.reddit.com)`
			`"""`
Tiny refactor, remove redundant subreddit link on website Resolves #707 2022-04-07 11:02:43 +02:00			`FS_REGEX = {`
Make validator more represents the JS script, commatization on spaces Fun fact: The JS script also includes those that have no r/, but I made the validator warns it so future contributors can confirm and change it into a proper format, or delete it. 2022-04-08 06:48:13 +02:00			`"commatization": r'( (,+ +\|,+ \|,+)\| +)(and\|&\|;)( (,+ +\|,+ \|,+)\| +)\|, *$\| +',`
Support accidental extra slashes 2022-04-07 17:47:35 +02:00			`"pattern1": r'\/*[rR]\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',`
			`"pattern2": r'^\/*[rR](?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`"pattern3": r'(?:(?:https?:\/\/)?(?:(?:www\|old\|new\|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ])',`
Support accidental extra slashes 2022-04-07 17:47:35 +02:00			`"pattern1user": r'\/*(?:u\|user)\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',`
			`"pattern2user": r'^\/*(?:u\|user)(?!\/)([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/$)?',`
Add support(?) for users 2022-04-07 16:39:25 +02:00			`"pattern3user": r'(?:(?:https?:\/\/)?(?:(?:www\|old\|new\|np)\.)?)?reddit\.com\/(?:u\|user)\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/[^" ])',`
Optimize and remove redundant code, make some tamer 2022-04-07 14:43:57 +02:00			`# "pattern4": r'(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"])',`
			`# "pattern5": r'\[(?:https?:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"])\]\((?:https:\/\/)?(?!^www\.)(.+)\.reddit\.com(?:\/[^"])\)"',`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`}`

Add validation 2022-04-07 17:03:30 +02:00			`VALIDATE_REGEX = {`
Make validator more represents the JS script, commatization on spaces Fun fact: The JS script also includes those that have no r/, but I made the validator warns it so future contributors can confirm and change it into a proper format, or delete it. 2022-04-08 06:48:13 +02:00			`"subreddit": r'^ \/?r\/([A-Za-z0-9][A-Za-z0-9_]{1,20}) (, \/?r\/([A-Za-z0-9][A-Za-z0-9_]{1,20}) )*$\|^$',`
Better URL regex by stephenhay from https://mathiasbynens.be/demo/url-regex 2022-04-07 17:52:47 +02:00			`"website": r'^https?://[^\s/$.?#].[^\s]*$\|^$'`
Add validation 2022-04-07 17:03:30 +02:00			`}`

Tiny refactor, remove redundant subreddit link on website Resolves #707 2022-04-07 11:02:43 +02:00			`CL_REGEX = r'\[(.+?)\]\((.+?)\)'`
Move subreddit-formatted websites to subreddit 2022-04-08 10:53:57 +02:00			`CWTS_REGEX = {`
			`"url": r'^(?:(?:https?:\/\/)?(?:(?:www\|old\|new\|np)\.)?)?reddit\.com\/r\/([A-Za-z0-9][A-Za-z0-9_]{1,20})(?:\/)$',`
			`"subreddit": r'^\/*[rR]\/([A-Za-z0-9][A-Za-z0-9_]{1,20})\/?$'`
			`}`
Move user links to website, small logic fix 2022-04-08 07:06:34 +02:00			`CSTW_REGEX = {`
			`"website": r'^https?://[^\s/$.?#].[^\s]*$',`
			`"user": r'^\/*u\/([A-Za-z0-9][A-Za-z0-9_]{1,20})$'`
			`}`
Tiny refactor, remove redundant subreddit link on website Resolves #707 2022-04-07 11:02:43 +02:00
			`# r/... to /r/...`
			`SUBREDDIT_TEMPLATE = r"/r/\1"`
Forgot to change that 2022-04-08 07:02:02 +02:00			`USER_TEMPLATE = r"/u/\1"`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00
			`def format_subreddit(entry: dict):`
Add docs on formatter, move path length checker to formatter 2022-04-08 13:04:49 +02:00			`"""`
			`Fix formatting of the value on "subreddit".`
			`"""`
Happy little accidents 2, convert and on subreddit 2022-04-07 09:36:29 +02:00			`if not "subreddit" in entry or not entry['subreddit']:`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`return entry`

			`subredditLink = entry["subreddit"]`
Somehow commatization gone 2022-04-07 18:31:13 +02:00			`subredditLink = re.sub(FS_REGEX["commatization"], ', ', subredditLink)`
Tiny refactor, remove redundant subreddit link on website Resolves #707 2022-04-07 11:02:43 +02:00			`subredditLink = re.sub(FS_REGEX["pattern3"], SUBREDDIT_TEMPLATE, subredditLink)`
			`subredditLink = re.sub(FS_REGEX["pattern1"], SUBREDDIT_TEMPLATE, subredditLink)`
			`subredditLink = re.sub(FS_REGEX["pattern2"], SUBREDDIT_TEMPLATE, subredditLink)`
Add support(?) for users 2022-04-07 16:39:25 +02:00			`subredditLink = re.sub(FS_REGEX["pattern3user"], USER_TEMPLATE, subredditLink)`
			`subredditLink = re.sub(FS_REGEX["pattern1user"], USER_TEMPLATE, subredditLink)`
			`subredditLink = re.sub(FS_REGEX["pattern2user"], USER_TEMPLATE, subredditLink)`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00
			`if not subredditLink:`
			`return entry`

			`entry["subreddit"] = subredditLink`
			`return entry`

			`def collapse_links(entry: dict):`
Forgot to also parse subreddits 2022-04-08 18:05:48 +02:00			`if "website" in entry and entry['website']:`
			`website = entry["website"];`
			`if re.search(CL_REGEX, website):`
			`match = re.search(CL_REGEX, website)`
			`if match.group(1) == match.group(2):`
			`website = match.group(2)`

			`entry["website"] = website`

			`if "subreddit" in entry and entry['subreddit']:`
			`subreddit = entry["subreddit"];`
			`if re.search(CL_REGEX, subreddit):`
			`match = re.search(CL_REGEX, subreddit)`
			`if match.group(1) == match.group(2):`
			`subreddit = match.group(2)`

			`entry["subreddit"] = subreddit`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00
			`return entry`

			`def remove_extras(entry: dict):`
Add docs on formatter, move path length checker to formatter 2022-04-08 13:04:49 +02:00			`"""`
			`Removing unnecessary extra characters and converts select characters.`
			`"""`
Support some other symbols 2022-04-07 18:25:25 +02:00			`if "subreddit" in entry and entry["subreddit"]:`
			`# if not entry["subreddit"].startswith('/r/'):`
			`# entry["subreddit"] = re.sub(r'^(.*)(?=\/r\/)', r'', entry["subreddit"])`
			`entry["subreddit"] = re.sub(r'[.,]+$', r'', entry["subreddit"])`

Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`for key in entry:`
			`if not entry[key] or not isinstance(entry[key], str):`
			`continue`
			`# Leading and trailing spaces`
Forgot to just use strip, also remove trailing comma on subs 2022-04-08 06:36:57 +02:00			`entry[key] = entry[key].strip()`
Optimize and remove redundant code, make some tamer 2022-04-07 14:43:57 +02:00			`# Double characters`
I mean, do this instead. This assumes that it uses Markdown 2022-04-07 16:50:28 +02:00			`entry[key] = re.sub(r' {2,}(?!\n)', r' ', entry[key])`
			`entry[key] = re.sub(r' {3,}\n', r' ', entry[key])`
			`entry[key] = re.sub(r'\n{3,}', r'\n\n', entry[key])`
Happy little accidents 3 2022-04-07 16:33:10 +02:00			`entry[key] = re.sub(r'r\/{2,}', r'r\/', entry[key])`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`entry[key] = re.sub(r',{2,}', r',', entry[key])`
Smart quotes, why? Get lost. 2022-04-08 11:11:54 +02:00			`# Smart quotation marks`
			`entry[key] = re.sub(r'[\u201c\u201d]', '"', entry[key])`
			`entry[key] = re.sub(r'[\u2018\u2019]', "'", entry[key])`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`# Psuedo-empty strings`
Add more psuedo-empty strings 2022-04-07 17:38:26 +02:00			`if entry[key] in ["n/a", "N/A", "na", "NA", "-", "null", "none", "None"]:`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`entry[key] = ""`

			`return entry`

			`def fix_r_caps(entry: dict):`
Add docs on formatter, move path length checker to formatter 2022-04-08 13:04:49 +02:00			`"""`
			`Fixes capitalization of /r/. (/R/place -> /r/place)`
			`"""`
Happy little accidents 2, convert and on subreddit 2022-04-07 09:36:29 +02:00			`if not "description" in entry or not entry['description']:`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`return entry`

Optimize and remove redundant code, make some tamer 2022-04-07 14:43:57 +02:00			`entry["description"] = re.sub(r'([^\w]\|^)\/R\/', '\1/r/', entry["description"])`
			`entry["description"] = re.sub(r'([^\w]\|^)R\/', '\1r/', entry["description"])`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00
			`return entry`

Adapt ARP's url fix Co-authored-by: Cheng Hann Gan <chenghanngan.us@gmail.com> 2022-04-07 09:10:08 +02:00			`def fix_no_protocol_urls(entry: dict):`
Add docs on formatter, move path length checker to formatter 2022-04-08 13:04:49 +02:00			`"""`
			`Fixes URLs with no protocol by adding "https://" protocol.`
			`"""`
Happy little accidents 2, convert and on subreddit 2022-04-07 09:36:29 +02:00			`if not "website" in entry or not entry['website']:`
Adapt ARP's url fix Co-authored-by: Cheng Hann Gan <chenghanngan.us@gmail.com> 2022-04-07 09:10:08 +02:00			`return entry`

Happy little accidents 2, convert and on subreddit 2022-04-07 09:36:29 +02:00			`if not entry["website"].startswith("http"):`
Adapt ARP's url fix Co-authored-by: Cheng Hann Gan <chenghanngan.us@gmail.com> 2022-04-07 09:10:08 +02:00			`entry["website"] = "https://" + entry["website"]`

			`return entry`

Tiny refactor, remove redundant subreddit link on website Resolves #707 2022-04-07 11:02:43 +02:00			`def convert_website_to_subreddit(entry: dict):`
Add docs on formatter, move path length checker to formatter 2022-04-08 13:04:49 +02:00			`"""`
			`Converts the subreddit link on "website" to "subreddit" if possible.`
			`"""`
Improve CSTW and whoops 2022-04-07 17:18:07 +02:00			`if not "website" in entry or not entry['website']:`
Tiny refactor, remove redundant subreddit link on website Resolves #707 2022-04-07 11:02:43 +02:00			`return entry`

Move subreddit-formatted websites to subreddit 2022-04-08 10:53:57 +02:00			`if re.match(CWTS_REGEX["url"], entry["website"]):`
			`new_subreddit = re.sub(CWTS_REGEX["url"], SUBREDDIT_TEMPLATE, entry["website"])`
			`if (new_subreddit.lower() == entry["subreddit"].lower()):`
			`entry["website"] = ""`
			`elif not "subreddit" in entry or entry['subreddit'] == "":`
			`entry["subreddit"] = new_subreddit`
			`entry["website"] = ""`
			`elif re.match(CWTS_REGEX["subreddit"], entry["website"]):`
			`new_subreddit = re.sub(CWTS_REGEX["subreddit"], SUBREDDIT_TEMPLATE, entry["website"])`
Tiny refactor, remove redundant subreddit link on website Resolves #707 2022-04-07 11:02:43 +02:00			`if (new_subreddit.lower() == entry["subreddit"].lower()):`
			`entry["website"] = ""`
Move user links to website, small logic fix 2022-04-08 07:06:34 +02:00			`elif not "subreddit" in entry or entry['subreddit'] == "":`
Tiny refactor, remove redundant subreddit link on website Resolves #707 2022-04-07 11:02:43 +02:00			`entry["subreddit"] = new_subreddit`
			`entry["website"] = ""`

			`return entry`

Add CSTW, clarity fixes 2022-04-07 16:31:56 +02:00			`def convert_subreddit_to_website(entry: dict):`
Add docs on formatter, move path length checker to formatter 2022-04-08 13:04:49 +02:00			`"""`
			`Converts the links on "subreddit" to a "website" if needed. This also supports Reddit users (/u/reddit).`
			`"""`
Improve CSTW and whoops 2022-04-07 17:18:07 +02:00			`if not "subreddit" in entry or not entry['subreddit']:`
Add CSTW, clarity fixes 2022-04-07 16:31:56 +02:00			`return entry`

Move user links to website, small logic fix 2022-04-08 07:06:34 +02:00			`if re.match(CSTW_REGEX["website"], entry["subreddit"]):`
Improve CSTW and whoops 2022-04-07 17:18:07 +02:00			`if (entry["website"].lower() == entry["subreddit"].lower()):`
			`entry["subreddit"] = ""`
Move user links to website, small logic fix 2022-04-08 07:06:34 +02:00			`elif not "website" in entry or entry['website'] == "":`
Improve CSTW and whoops 2022-04-07 17:18:07 +02:00			`entry["website"] = entry["subreddit"]`
			`entry["subreddit"] = ""`
Move user links to website, small logic fix 2022-04-08 07:06:34 +02:00			`elif re.match(CSTW_REGEX["user"], entry["subreddit"]):`
			`if not "website" in entry or entry['website'] == "":`
			`username = re.match(CSTW_REGEX["user"], entry["subreddit"]).group(1)`
			`entry["website"] = "https://www.reddit.com/user/" + username`
			`entry["subreddit"] = ""`
Add CSTW, clarity fixes 2022-04-07 16:31:56 +02:00
			`return entry`
Added a cleanup step to re calulate the center This change calculates the center of each entry and compares it to the current center. If they are different, the center get's updated. 2022-04-09 21:08:36 +02:00
			`def calculate_center(path: list):`
			`"""`
			`Caluclates the center of a polygon`

			`adapted from /web/_js/draw.js:calucalteCenter()`
			`"""`
			`area = 0`
			`x = 0`
			`y = 0`

			`for i in range(len(path)):`
			`point1 = path[i]`
			`point2 = path[i-1 if i != 0 else len(path)-1]`
			`f = point1[0] * point2[1] - point2[0] * point1[1]`
			`area += f`
			`x += (point1[0] + point2[0]) * f`
			`y += (point1[1] + point2[1]) * f`

			`area *= 3`

			`if area != 0:`
			`return [x // area + 0.5, y // area + 0.5]`
			`else:`
			`# get the center of a straight line`
			`max_x = max(i[0] for i in path)`
			`min_x = min(i[0] for i in path)`
			`max_y = max(i[1] for i in path)`
			`min_y = min(i[1] for i in path)`
			`return [(max_x + min_x) // 2 + 0.5, (max_y + min_y) // 2 + 0.5]`

			`def update_center(entry: dict):`
			`"""`
			`checks if the center of a entry is up to date, and updates it if it's either missing or outdated`
			`"""`
			`if 'path' not in entry:`
			`return entry`
			`path = entry['path']`
			`if len(path) > 1:`
			`calculated_center = calculate_center(path)`
			`if 'center' not in entry or entry['center'] != calculated_center:`
			`entry['center'] = calculated_center`
			`return entry`
Add docs on formatter, move path length checker to formatter 2022-04-08 13:04:49 +02:00
Add validation 2022-04-07 17:03:30 +02:00			`def validate(entry: dict):`
Add docs on formatter, move path length checker to formatter 2022-04-08 13:04:49 +02:00			`"""`
			`Validates the entry. Catch errors and tell warnings related to the entry.`

			`Status code key:`
			`0: All valid, no problems`
			`1: Informational logs that may be ignored`
			`2: Warnings that may effect user experience when interacting with the entry`
			`3: Errors that make the entry inaccessible or broken.`
			`"""`
			`return_status = 0`
Add validation 2022-04-07 17:03:30 +02:00			`if (not "id" in entry or (not entry['id'] and not entry['id'] == 0)):`
			`print(f"Wait, no id here! How did this happened? {entry}")`
Add docs on formatter, move path length checker to formatter 2022-04-08 13:04:49 +02:00			`return_status = 3`
			`entry['id'] = '[MISSING_ID]'`
			`if not ("path" in entry and isinstance(entry["path"], list) and len(entry["path"]) > 0):`
			`print(f"Entry {entry['id']} has no points!")`
			`return_status = 3`
			`elif len(entry["path"]) < 3:`
			`print(f"Entry {entry['id']} only has {len(entry['path'])} point(s)!")`
Remove entries with less than 3 points 2022-04-08 17:36:55 +02:00			`return_status = 3`
Add validation 2022-04-07 17:03:30 +02:00			`for key in entry:`
			`if key in VALIDATE_REGEX and not re.match(VALIDATE_REGEX[key], entry[key]):`
Add docs on formatter, move path length checker to formatter 2022-04-08 13:04:49 +02:00			`if return_status < 2: return_status = 2`
Add validation 2022-04-07 17:03:30 +02:00			`print(f"{key} of entry {entry['id']} is still invalid! {entry[key]}")`
Add docs on formatter, move path length checker to formatter 2022-04-08 13:04:49 +02:00			`return return_status`
Add CSTW, clarity fixes 2022-04-07 16:31:56 +02:00
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`def per_line_entries(entries: list):`
Add docs on formatter, move path length checker to formatter 2022-04-08 13:04:49 +02:00			`"""`
			`Returns a string of all the entries, with every entry in one line.`
			`"""`
More beautiful way of fixing this 2022-04-09 23:30:11 +02:00			`out = "[\n"`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`for entry in entries:`
Add docs on formatter, move path length checker to formatter 2022-04-08 13:04:49 +02:00			`if entry:`
More beautiful way of fixing this 2022-04-09 23:30:11 +02:00			`out += json.dumps(entry, ensure_ascii=False) + ",\n"`
			`out = out[:-2] + "\n]"`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`return out`

			`def format_all(entry: dict, silent=False):`
Add docs on formatter, move path length checker to formatter 2022-04-08 13:04:49 +02:00			`"""`
			`Format using all the available formatters.`
			`Outputs a tuple containing the entry and the validation status code.`

			`Status code key:`
			`0: All valid, no problems`
			`1: Informational logs that may be ignored`
			`2: Warnings that may effect user experience when interacting with the entry`
			`3: Errors that make the entry inaccessible or broken.`
			`"""`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`def print_(args, *kwargs):`
			`if not silent:`
			`print(args, *kwargs)`
			`print_("Fixing r/ capitalization...")`
			`entry = fix_r_caps(entry)`
			`print_("Fix formatting of subreddit...")`
			`entry = format_subreddit(entry)`
Optimize and remove redundant code, make some tamer 2022-04-07 14:43:57 +02:00			`print_("Collapsing Markdown links...")`
			`entry = collapse_links(entry)`
Move things to make it more effective 2022-04-07 11:43:12 +02:00			`print_("Converting website links to subreddit (if possible)...")`
			`entry = convert_website_to_subreddit(entry)`
Add CSTW, clarity fixes 2022-04-07 16:31:56 +02:00			`print_("Converting subreddit links to website (if needed)...")`
			`entry = convert_subreddit_to_website(entry)`
Move subreddit-formatted websites to subreddit 2022-04-08 10:53:57 +02:00			`print_("Fixing links without protocol...")`
			`entry = fix_no_protocol_urls(entry)`
Optimize and remove redundant code, make some tamer 2022-04-07 14:43:57 +02:00			`print_("Removing extras...")`
			`entry = remove_extras(entry)`
Added a cleanup step to re calulate the center This change calculates the center of each entry and compares it to the current center. If they are different, the center get's updated. 2022-04-09 21:08:36 +02:00			`print_("Updating center")`
			`entry = update_center(entry)`
Add validation 2022-04-07 17:03:30 +02:00			`print_("Validating...")`
Add docs on formatter, move path length checker to formatter 2022-04-08 13:04:49 +02:00			`status_code = validate(entry)`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`print_("Completed!")`
Add docs on formatter, move path length checker to formatter 2022-04-08 13:04:49 +02:00			`return ( entry, status_code )`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00
			`if __name__ == '__main__':`

			`def go(path):`

			`print(f"Formatting {path}...")`

			`with open(path, "r+", encoding='UTF-8') as f1:`
			`entries = json.loads(f1.read())`

			`for i in range(len(entries)):`
Add docs on formatter, move path length checker to formatter 2022-04-08 13:04:49 +02:00			`entry_formatted, validation_status = format_all(entries[i], True)`
			`if validation_status > 2:`
			`print(f"Entry {entry_formatted['id']} will be removed! {json.dumps(entry_formatted)}")`
			`entries[i] = None`
			`else:`
			`entries[i] = entry_formatted`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00			`if not (i % 500):`
			`print(f"{i} checked.")`

			`print(f"{len(entries)} checked.")`

Using LF instead of RCLF 2022-04-10 16:10:40 +02:00			`with open(path, "w", encoding='utf-8', newline='\n') as f2:`
More beautiful way of fixing this 2022-04-09 23:30:11 +02:00			`f2.write(per_line_entries(entries))`
Improve and merge scripts, use JSON instead of regex 2022-04-07 07:01:09 +02:00
			`print("Writing completed. All done.")`

Added a cleanup step to re calulate the center This change calculates the center of each entry and compares it to the current center. If they are different, the center get's updated. 2022-04-09 21:08:36 +02:00			`go("../web/atlas.json")`