Formatter support for new format

2024-09-27 20:48:56 +02:00 · 2022-04-15 18:55:51 +07:00 · 2022-04-15 18:55:51 +07:00 · bb2704c2d2
commit bb2704c2d2
parent 4648401a38
1 changed files with 213 additions and 68 deletions
--- a/tools/formatter.py
+++ b/tools/formatter.py
@@ -52,27 +52,45 @@ def format_subreddit(entry: dict):
 	"""
 	Fix formatting of the value on "subreddit".
 	"""
-	if not "subreddit" in entry or not entry['subreddit']:
-		return entry

-	subredditLink = entry["subreddit"]
-	subredditLink = re.sub(FS_REGEX["commatization"], ', ', subredditLink)
-	subredditLink = re.sub(FS_REGEX["pattern3"], SUBREDDIT_TEMPLATE, subredditLink)
-	subredditLink = re.sub(FS_REGEX["pattern1"], SUBREDDIT_TEMPLATE, subredditLink)
-	subredditLink = re.sub(FS_REGEX["pattern2"], SUBREDDIT_TEMPLATE, subredditLink)
-	subredditLink = re.sub(FS_REGEX["pattern3user"], USER_TEMPLATE, subredditLink)
-	subredditLink = re.sub(FS_REGEX["pattern1user"], USER_TEMPLATE, subredditLink)
-	subredditLink = re.sub(FS_REGEX["pattern2user"], USER_TEMPLATE, subredditLink)
+	if "subreddit" in entry and entry["subreddit"]:
+
+		subredditLink = entry["subreddit"]
+
+		subredditLink = re.sub(FS_REGEX["commatization"], ', ', subredditLink)
+		subredditLink = re.sub(FS_REGEX["pattern3"], SUBREDDIT_TEMPLATE, subredditLink)
+		subredditLink = re.sub(FS_REGEX["pattern1"], SUBREDDIT_TEMPLATE, subredditLink)
+		subredditLink = re.sub(FS_REGEX["pattern2"], SUBREDDIT_TEMPLATE, subredditLink)
+		subredditLink = re.sub(FS_REGEX["pattern3user"], USER_TEMPLATE, subredditLink)
+		subredditLink = re.sub(FS_REGEX["pattern1user"], USER_TEMPLATE, subredditLink)
+		subredditLink = re.sub(FS_REGEX["pattern2user"], USER_TEMPLATE, subredditLink)
+
+		entry["subreddit"] = subredditLink
+
+	if "links" in entry and "subreddit" in entry["links"]:
+
+		for i in range(len(entry["links"]["subreddit"])):
+
+			subredditLink = entry["links"]["subreddit"][i]
+
+			subredditLink = re.sub(FS_REGEX["commatization"], ', ', subredditLink)
+			subredditLink = re.sub(FS_REGEX["pattern3"], r"\1", subredditLink)
+			subredditLink = re.sub(FS_REGEX["pattern1"], r"\1", subredditLink)
+			subredditLink = re.sub(FS_REGEX["pattern2"], r"\1", subredditLink)
+
+			entry["links"]["subreddit"][i] = subredditLink

-	if not subredditLink:
-		return entry
-	
-	entry["subreddit"] = subredditLink
 	return entry

 def collapse_links(entry: dict):
+	"""
+	Collapses Markdown links.
+	"""
+
 	if "website" in entry and entry['website']:
-		website = entry["website"];
+
+		website = entry["website"]
+
 		if re.search(CL_REGEX, website):
 			match = re.search(CL_REGEX, website)
 			if match.group(1) == match.group(2):
@@ -80,8 +98,23 @@ def collapse_links(entry: dict):

 		entry["website"] = website

+	elif "links" in entry and "website" in entry["links"]:
+
+		for i in range(len(entry["links"]["website"])):
+
+			website = entry["links"]["website"][i]
+
+			if re.search(CL_REGEX, website):
+				match = re.search(CL_REGEX, website)
+				if match.group(1) == match.group(2):
+					website = match.group(2)
+
+			entry["links"]["website"][i] = website
+
 	if "subreddit" in entry and entry['subreddit']:
-		subreddit = entry["subreddit"];
+
+		subreddit = entry["subreddit"]
+
 		if re.search(CL_REGEX, subreddit):
 			match = re.search(CL_REGEX, subreddit)
 			if match.group(1) == match.group(2):
@@ -89,12 +122,27 @@ def collapse_links(entry: dict):

 		entry["subreddit"] = subreddit

+	elif "links" in entry and "subreddit" in entry["links"]:
+
+		for i in range(len(entry["links"]["subreddit"])):
+
+			subreddit = entry["links"]["subreddit"][i]
+
+			if re.search(CL_REGEX, subreddit):
+				match = re.search(CL_REGEX, subreddit)
+				if match.group(1) == match.group(2):
+					subreddit = match.group(2)
+
+			entry["links"]["subreddit"][i] = subreddit
+	
+
 	return entry

 def remove_extras(entry: dict):
 	"""
 	Removing unnecessary extra characters and converts select characters.
 	"""
+
 	if "subreddit" in entry and entry["subreddit"]:
 		# if not entry["subreddit"].startswith('/r/'):
 		# 	entry["subreddit"] = re.sub(r'^(.*)(?=\/r\/)', r'', entry["subreddit"])
@@ -124,13 +172,27 @@ def remove_duplicate_points(entry: dict):
 	"""
 	Removes points from paths that occur twice after each other
 	"""
-	path: list = entry['path']
-	previous: list = path[0]
-	for i in range(len(path)-1, -1, -1):
-		current: list = path[i]
-		if current == previous:
-			path.pop(i)
-		previous = current
+
+	if not "path" in entry:
+		return entry
+
+	if isinstance(entry['path'], list):
+		path: list = entry['path']
+		previous: list = path[0]
+		for i in range(len(path)-1, -1, -1):
+			current: list = path[i]
+			if current == previous:
+				path.pop(i)
+			previous = current
+	else:
+		for key in entry['path']:
+			path: list = entry['path'][key]
+			previous: list = path[0]
+			for i in range(len(path)-1, -1, -1):
+				current: list = path[i]
+				if current == previous:
+					path.pop(i)
+				previous = current

 	return entry

@@ -138,6 +200,7 @@ def fix_r_caps(entry: dict):
 	"""
 	Fixes capitalization of /r/. (/R/place -> /r/place)
 	"""
+
 	if not "description" in entry or not entry['description']:
 		return entry
 	
@@ -150,11 +213,14 @@ def fix_no_protocol_urls(entry: dict):
 	"""
 	Fixes URLs with no protocol by adding "https://" protocol.
 	"""
-	if not "website" in entry or not entry['website']:
-		return entry
-	
-	if not entry["website"].startswith("http"):
-		entry["website"] = "https://" + entry["website"]
+
+	if "links" in entry and "website" in entry['links']:
+		for i in range(len(entry["links"]["website"])):
+			if entry["links"]["website"][i] and not entry["links"]["website"][i].startswith("http"):
+				entry["links"]["website"][i] = "https://" + entry["website"]
+	elif "website" in entry and not entry['website']:
+		if not entry["website"].startswith("http"):
+			entry["website"] = "https://" + entry["website"]

 	return entry

@@ -162,23 +228,43 @@ def convert_website_to_subreddit(entry: dict):
 	"""
 	Converts the subreddit link on "website" to "subreddit" if possible.
 	"""
-	if not "website" in entry or not entry['website']:
-		return entry

-	if re.match(CWTS_REGEX["url"], entry["website"]):
-		new_subreddit = re.sub(CWTS_REGEX["url"], SUBREDDIT_TEMPLATE, entry["website"])
-		if (new_subreddit.lower() == entry["subreddit"].lower()):
-			entry["website"] = ""
-		elif not "subreddit" in entry or entry['subreddit'] == "":
-			entry["subreddit"] = new_subreddit
-			entry["website"] = ""
-	elif re.match(CWTS_REGEX["subreddit"], entry["website"]):
-		new_subreddit = re.sub(CWTS_REGEX["subreddit"], SUBREDDIT_TEMPLATE, entry["website"])
-		if (new_subreddit.lower() == entry["subreddit"].lower()):
-			entry["website"] = ""
-		elif not "subreddit" in entry or entry['subreddit'] == "":
-			entry["subreddit"] = new_subreddit
-			entry["website"] = ""
+	if "links" in entry and "website" in entry["links"]:
+		for i in range(len(entry["links"]["website"])):
+			if re.match(CWTS_REGEX["url"], entry["links"]["website"][i]):
+				new_subreddit = re.sub(CWTS_REGEX["url"], r"\1", entry["links"]["website"][i])
+				if new_subreddit in entry["links"]["subreddit"]:
+					entry["links"]["website"][i] = ""
+				elif not "subreddit" in entry["links"] or len(entry["subreddit"]) == 0:
+					if not "subreddit" in entry["links"]:
+						entry["links"]["subreddit"] = []
+					entry["links"]["subreddit"].append(new_subreddit)
+					entry["links"]["website"][i] = ""
+			elif re.match(CWTS_REGEX["subreddit"], entry["links"]["website"][i]):
+				new_subreddit = re.sub(CWTS_REGEX["subreddit"], r"\1", entry["links"]["website"][i])
+				if new_subreddit in entry["links"]["subreddit"]:
+					entry["links"]["website"][i] = ""
+				elif not "subreddit" in entry["links"] or len(entry["subreddit"]) == 0:
+					if not "subreddit" in entry["links"]:
+						entry["links"]["subreddit"] = []
+					entry["links"]["subreddit"].append(new_subreddit)
+					entry["links"]["website"][i] = ""
+
+	elif "website" in entry and entry['website']:
+		if re.match(CWTS_REGEX["url"], entry["website"]):
+			new_subreddit = re.sub(CWTS_REGEX["url"], SUBREDDIT_TEMPLATE, entry["website"])
+			if (new_subreddit.lower() == entry["subreddit"].lower()):
+				entry["website"] = ""
+			elif not "subreddit" in entry or entry['subreddit'] == "":
+				entry["subreddit"] = new_subreddit
+				entry["website"] = ""
+		elif re.match(CWTS_REGEX["subreddit"], entry["website"]):
+			new_subreddit = re.sub(CWTS_REGEX["subreddit"], SUBREDDIT_TEMPLATE, entry["website"])
+			if (new_subreddit.lower() == entry["subreddit"].lower()):
+				entry["website"] = ""
+			elif not "subreddit" in entry or entry['subreddit'] == "":
+				entry["subreddit"] = new_subreddit
+				entry["website"] = ""

 	return entry

@@ -186,20 +272,37 @@ def convert_subreddit_to_website(entry: dict):
 	"""
 	Converts the links on "subreddit" to a "website" if needed. This also supports Reddit users (/u/reddit). 
 	"""
-	if not "subreddit" in entry or not entry['subreddit']:
-		return entry

-	if re.match(CSTW_REGEX["website"], entry["subreddit"]):
-		if (entry["website"].lower() == entry["subreddit"].lower()):
-			entry["subreddit"] = ""
-		elif not "website" in entry or entry['website'] == "":
-			entry["website"] = entry["subreddit"]
-			entry["subreddit"] = ""
-	elif re.match(CSTW_REGEX["user"], entry["subreddit"]):
-		if not "website" in entry or entry['website'] == "":
-			username = re.match(CSTW_REGEX["user"], entry["subreddit"]).group(1)
-			entry["website"] = "https://www.reddit.com/user/" + username
-			entry["subreddit"] = ""
+	if "links" in entry and "subreddit" in entry["links"]:
+		for i in range(len(entry["links"]["subreddit"])):
+			if re.match(CSTW_REGEX["website"], entry["links"]["subreddit"][i]):
+				if "website" in entry["links"] and entry["links"]["subreddit"][i] in entry["links"]["website"]:
+					entry["links"]["subreddit"][i] = ""
+				elif not "website" in entry["links"] or len(entry["website"]) == 0:
+					if not "website" in entry["links"]:
+						entry["links"]["website"] = []
+					entry["website"].append(entry["links"]["subreddit"][i])
+					entry["links"]["subreddit"][i] = ""
+			elif re.match(CSTW_REGEX["user"], entry["links"]["subreddit"][i]):
+				if not "website" in entry["links"] or len(entry["website"]) == 0:
+					username = re.match(CSTW_REGEX["user"], entry["links"]["subreddit"][i]).group(1)
+					if not "website" in entry["links"]:
+						entry["links"]["website"] = []
+					entry["website"].append("https://www.reddit.com/user/" + username)
+					entry["links"]["subreddit"][i] = ""
+
+	elif "subreddit" in entry and entry['subreddit']:
+		if re.match(CSTW_REGEX["website"], entry["subreddit"]):
+			if (entry["website"].lower() == entry["subreddit"].lower()):
+				entry["subreddit"] = ""
+			elif not "website" in entry or entry['website'] == "":
+				entry["website"] = entry["subreddit"]
+				entry["subreddit"] = ""
+		elif re.match(CSTW_REGEX["user"], entry["subreddit"]):
+			if not "website" in entry or entry['website'] == "":
+				username = re.match(CSTW_REGEX["user"], entry["subreddit"]).group(1)
+				entry["website"] = "https://www.reddit.com/user/" + username
+				entry["subreddit"] = ""

 	return entry

@@ -235,17 +338,40 @@ def calculate_center(path: list):

 def update_center(entry: dict):
 	"""
-	checks if the center of a entry is up to date, and updates it if it's either missing or outdated
+	checks if the center of a entry is up to date, and updates it if it's either missing or outdated.
 	"""
+	
 	if 'path' not in entry:
 		return entry
-	path = entry['path']
-	if len(path) > 1:
-		calculated_center = calculate_center(path)
-		if 'center' not in entry or entry['center'] != calculated_center:
-			entry['center'] = calculated_center
+
+	if isinstance(entry['path'], list):
+		path = entry['path']
+		if len(path) > 1:
+			calculated_center = calculate_center(path)
+			if 'center' not in entry or entry['center'] != calculated_center:
+				entry['center'] = calculated_center
+	else:
+		for key in entry['path']:
+			path = entry['path'][key]
+			if len(path) > 1:
+				calculated_center = calculate_center(path)
+				if 'center' not in entry or key not in entry['center'] or entry['center'][key] != calculated_center:
+					entry['center'][key] = calculated_center
+	
 	return entry

+def remove_empty_and_similar(entry: dict):
+	"""
+	Removes empty items on lists, usually from the past formattings.
+	"""
+
+	for key in entry["links"]:
+		small = list(map(lambda x: x.lower(), entry["links"][key]))
+		entry["links"][key] = [x for x in entry["links"][key] if x and x.lower() in small]
+
+	return entry
+
+
 def validate(entry: dict):
 	"""
 	Validates the entry. Catch errors and tell warnings related to the entry.
@@ -256,17 +382,34 @@ def validate(entry: dict):
 	2: Warnings that may effect user experience when interacting with the entry
 	3: Errors that make the entry inaccessible or broken.
 	"""
+	
 	return_status = 0
 	if (not "id" in entry or (not entry['id'] and not entry['id'] == 0)):
 		print(f"Wait, no id here! How did this happened? {entry}")
 		return_status = 3
 		entry['id'] = '[MISSING_ID]'
-	if not ("path" in entry and isinstance(entry["path"], list) and len(entry["path"]) > 0):
-		print(f"Entry {entry['id']} has no points!")
-		return_status = 3
-	elif len(entry["path"]) < 3:
-		print(f"Entry {entry['id']} only has {len(entry['path'])} point(s)!")
+
+	if "path" in entry:
+		if isinstance(entry['path'], list):
+			if len(entry["path"]) > 0:
+				print(f"Entry {entry['id']} has no points!")
+				return_status = 3
+			elif len(entry["path"]) < 3:
+				print(f"Entry {entry['id']} only has {len(entry['path'])} point(s)!")
+				return_status = 3
+		else:
+			for key in entry['path']:
+				path = entry['path'][key]
+				if len(path) > 0:
+					print(f"Period {key} of entry {entry['id']} has no points!")
+					return_status = 3
+				elif len(path) < 3:
+					print(f"Period {key} of entry {entry['id']} only has {len(entry['path'])} point(s)!")
+					return_status = 3
+	else:
+		print(f"Entry {entry['id']} has no path at all!")
 		return_status = 3
+
 	for key in entry:
 		if key in VALIDATE_REGEX and not re.match(VALIDATE_REGEX[key], entry[key]):
 			if return_status < 2: return_status = 2
@@ -316,6 +459,8 @@ def print_(*args, **kwargs):
 	entry = remove_duplicate_points(entry)
 	print_("Updating center...")
 	entry = update_center(entry)
+	print_("Remove empty items...")
+	entry = remove_empty_and_similar(entry)
 	print_("Validating...")
 	status_code = validate(entry)
 	print_("Completed!")