Fix reddit crawl newline and quote handling

This commit is contained in:
unknown 2022-04-05 19:00:31 -03:00
parent 2848f93f47
commit 35672fdb47

View file

@ -2,6 +2,7 @@
import praw import praw
import json import json
import time import time
import re
outfile = open('temp_atlas.json', 'w', encoding='utf-8') outfile = open('temp_atlas.json', 'w', encoding='utf-8')
failfile = open('manual_atlas.json', 'w', encoding='utf-8') failfile = open('manual_atlas.json', 'w', encoding='utf-8')
@ -51,7 +52,13 @@
break break
if(submission.link_flair_text == "New Entry"): if(submission.link_flair_text == "New Entry"):
text = submission.selftext text = submission.selftext
text = text.replace("\\", "") #Old backslash filter:
# Backslash filter: remove every backslash from the submission text except
# those that directly precede a double quote or the letter n, so the \" and
# \n sequences are kept. (The previous filter, text.replace("\\", ""),
# removed every backslash, including the ones in \" and \n.)
# Pattern: one literal backslash followed by a single character that is
# neither '"' nor 'n'; the replacement \1 keeps only that captured character.
# Raw strings are used so each backslash is escaped once (for the regex
# parser) instead of twice.
# Test in a REPL: re.sub(r'\\([^"n])', r'\1', r'\t \n \"') gives r't \n \"'.
# re.sub returns a new string (str is immutable), so the result has to be
# assigned back to text for the filter to have any effect.
text = re.sub(r'\\([^"n])', r'\1', text)
try: try:
text = text.replace("\"id\": 0,", "\"id\": 0,\n\t\t\"submitted_by\": \""+submission.author.name+"\",") text = text.replace("\"id\": 0,", "\"id\": 0,\n\t\t\"submitted_by\": \""+submission.author.name+"\",")
except AttributeError: except AttributeError: