atlas/tools/migrate_atlas_format.py
Hans5958 74b8073a7b Do stricter (and proper) subreddit regex 2
I forgot that I made it so it excludes underscore, here's a proper fix
2022-04-19 16:42:20 +07:00

105 lines
2.6 KiB
Python

import os
import json
import re
'''
Migrator script from old atlas format to remastered atlas format.
- center and path: single -> time-specific
- website and subreddit: single strings -> links object
- submitted_by -> contributors
'''
#
# Migrates the old atlas format (single center/path) to the remastered atlas format (time-boxed centers/paths)
def per_line_entries(entries: list):
    """Serialize *entries* as a JSON array with one entry per line.

    Every entry is dumped with ``json.dumps(entry, ensure_ascii=False)`` on a
    line of its own; lines are separated by a comma, and the array brackets
    sit on the first and last line.

    Args:
        entries: Iterable of JSON-serializable objects.

    Returns:
        The JSON text as a string. An empty input yields an empty array
        (an opening bracket line followed by a closing bracket line), which
        is still valid JSON.
    """
    lines = [json.dumps(entry, ensure_ascii=False) for entry in entries]
    if not lines:
        # Without this guard there would be no entry line between the
        # brackets; emit a well-formed empty array explicitly.
        return '[\n]'
    return '[\n' + ',\n'.join(lines) + '\n]'
# Atlas data file, addressed relative to the current working directory.
file_path = os.path.join('..', 'web', 'atlas.json')

# Upper bound shared by every time range below.
END_IMAGE = 166

# (first, last) pairs that the migration loop formats into
# "<first>-<last>, T:0-2" keys. The loop picks EXPANSION_2_RANGE when
# center[1] > 1000, otherwise EXPANSION_1_RANGE when center[0] > 1000,
# otherwise INIT_CANVAS_RANGE.
INIT_CANVAS_RANGE, EXPANSION_1_RANGE, EXPANSION_2_RANGE = (
    (first, END_IMAGE) for first in (1, 56, 109)
)

# Separator pattern used to split a multi-subreddit string. The adjacent raw
# strings are concatenated by the parser into one pattern.
COMMATIZATION = re.compile(
    r'(?: *(?:,+ +|,+ |,+)| +)'  # commas and/or spaces before a joiner word
    r'(?:and|&|;)'               # the joiner itself
    r'(?: *(?:,+ +|,+ |,+)| +)'  # commas and/or spaces after the joiner
    r'|, *$'                     # or: a comma (plus spaces) ending the string
    r'| +'                       # or: a plain run of spaces
)

# Matches a subreddit reference and captures the bare subreddit name in
# group 1.
FS_REGEX = re.compile(
    # Optional reddit.com URL prefix (scheme and www/old/new/np host are
    # optional too), followed by an optional leading slash.
    r'(?:(?:(?:(?:https?:\/\/)?(?:(?:www|old|new|np)\.)?)?reddit\.com)?\/)?'
    r'[rR]\/'                              # the "r/" or "R/" marker
    r'([A-Za-z0-9][A-Za-z0-9_]{2,20})'     # name: 3-21 chars, alnum first
    r'(?:\/[^" ]*)*'                       # any trailing path segments
)
# Migration script body: load every atlas entry, rewrite each one into the
# remastered layout, then write the whole list back to the same file.

# The file is only read here (it is reopened for writing further down), so
# read-only mode is enough.
with open(file_path, 'r', encoding='UTF-8') as file:
    entries = json.load(file)

for position, entry in enumerate(entries):
    # Skeleton of a migrated entry. It goes first in the final merge, so it
    # fixes the order of these keys in the output.
    new_entry = {
        "id": "",
        "name": "",
        "description": "",
        "links": {},
        "center": {},
        "path": {},
        "contributors": []
    }

    center = entry['center']
    path = entry['path']

    # Only list-valued centers are converted; any other value stays on the
    # entry and is carried over unchanged by the final merge below.
    if isinstance(center, list):
        # Use the center to figure out which canvas expansion the entry is in.
        if center[1] > 1000:
            time_range = EXPANSION_2_RANGE
        elif center[0] > 1000:
            time_range = EXPANSION_1_RANGE
        else:
            time_range = INIT_CANVAS_RANGE

        time_key = '%d-%d, T:0-2' % time_range

        new_entry['center'] = {time_key: center}
        new_entry['path'] = {time_key: path}

        # Remove the old values so the final merge does not overwrite the
        # time-keyed versions.
        del entry['center']
        del entry['path']

    if "website" in entry:
        if isinstance(entry["website"], str) and entry["website"]:
            new_entry['links']['website'] = [entry['website']]
        del entry['website']

    if "subreddit" in entry:
        if isinstance(entry["subreddit"], str) and entry["subreddit"]:
            # COMMATIZATION splits "a, b" on the space only, which leaves the
            # comma attached to the first fragment, and a separator at either
            # end of the string produces an empty fragment. Commas can never
            # be part of a subreddit name, so strip them and drop empties.
            cleaned = (
                FS_REGEX.sub(r"\1", fragment).strip(',')
                for fragment in COMMATIZATION.split(entry['subreddit'])
            )
            subreddits = [name for name in cleaned if name]
            if subreddits:
                new_entry['links']['subreddit'] = subreddits
        del entry['subreddit']

    if "submitted_by" in entry:
        new_entry['contributors'].append(entry['submitted_by'])
        del entry['submitted_by']

    # Remaining original keys win over the skeleton defaults.
    entries[position] = {
        **new_entry,
        **entry
    }

    # Progress report every 1000 processed entries.
    checked = position + 1
    if not (checked % 1000):
        print(f"{checked} checked.")

print(f"{len(entries)} checked.")

print("Writing...")

# newline='\n' keeps LF line endings regardless of platform.
with open(file_path, 'w', encoding='utf-8', newline='\n') as f2:
    f2.write(per_line_entries(entries))

print("All done!")