mirror of
https://github.com/placeAtlas/atlas.git
synced 2024-12-26 17:54:06 +01:00
Fix reddit crawl newline and quote handling
This commit is contained in:
parent
2848f93f47
commit
35672fdb47
1 changed files with 8 additions and 1 deletions
|
@ -2,6 +2,7 @@
|
|||
import praw
|
||||
import json
|
||||
import time
|
||||
import re
|
||||
|
||||
outfile = open('temp_atlas.json', 'w', encoding='utf-8')
|
||||
failfile = open('manual_atlas.json', 'w', encoding='utf-8')
|
||||
|
@ -51,7 +52,13 @@
|
|||
break
|
||||
if(submission.link_flair_text == "New Entry"):
|
||||
text = submission.selftext
|
||||
text = text.replace("\\", "")
|
||||
#Old backslash filter:
|
||||
#text = text.replace("\\", "")
|
||||
#New one: One \\ escapes a backslash in python's parser
|
||||
# Two escape it again in the regex parser, so \\\\ is \
|
||||
# Then a backslash followed by anything but " or n is replaced with the first capture group (that character alone), so the backslash is dropped while \" and \n are left intact
|
||||
# Test in repl: re.sub("\\\\([^\"n])", "\\1", "\\t < removed slash, t stays and > stays \\n \\\"")
|
||||
# re.sub returns a new string (str is immutable); assign it back so the
# backslash filter actually takes effect on the submission text.
text = re.sub("\\\\([^\"n])", "\\1", text)
|
||||
try:
|
||||
text = text.replace("\"id\": 0,", "\"id\": 0,\n\t\t\"submitted_by\": \""+submission.author.name+"\",")
|
||||
except AttributeError:
|
||||
|
|
Loading…
Reference in a new issue