mirror of
https://github.com/placeAtlas/atlas.git
synced 2024-12-27 08:34:01 +01:00
Fix reddit crawl newline and quote handling
This commit is contained in:
parent
2848f93f47
commit
35672fdb47
1 changed files with 8 additions and 1 deletions
|
@ -2,6 +2,7 @@
|
||||||
import praw
|
import praw
|
||||||
import json
|
import json
|
||||||
import time
|
import time
|
||||||
|
import re
|
||||||
|
|
||||||
outfile = open('temp_atlas.json', 'w', encoding='utf-8')
|
outfile = open('temp_atlas.json', 'w', encoding='utf-8')
|
||||||
failfile = open('manual_atlas.json', 'w', encoding='utf-8')
|
failfile = open('manual_atlas.json', 'w', encoding='utf-8')
|
||||||
|
@ -51,7 +52,13 @@
|
||||||
break
|
break
|
||||||
if(submission.link_flair_text == "New Entry"):
|
if(submission.link_flair_text == "New Entry"):
|
||||||
text = submission.selftext
|
text = submission.selftext
|
||||||
text = text.replace("\\", "")
|
#Old backslash filter:
|
||||||
|
#text = text.replace("\\", "")
|
||||||
|
#New one: One \\ escapes a backslash in python's parser
|
||||||
|
# Two escape it again in the regex parser, so \\\\ is \
|
||||||
|
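# Then a backslash followed by any character other than " or n is replaced by that character alone (the first capture group), i.e. the backslash is dropped while \" and \n are left untouched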
|
||||||
|
# Test in repl: re.sub("\\\\([^\"n])", "\\1", "\\t < removed slash, t stays and > stays \\n \\\"")
|
||||||
|
# re.sub returns a new string (str is immutable), so the result must be
# assigned back to text; otherwise the backslash filter has no effect.
text = re.sub("\\\\([^\"n])", "\\1", text)
|
||||||
try:
|
try:
|
||||||
text = text.replace("\"id\": 0,", "\"id\": 0,\n\t\t\"submitted_by\": \""+submission.author.name+"\",")
|
text = text.replace("\"id\": 0,", "\"id\": 0,\n\t\t\"submitted_by\": \""+submission.author.name+"\",")
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
|
|
Loading…
Reference in a new issue