From 35672fdb4756c9d55740d256789ab01ba8cc7230 Mon Sep 17 00:00:00 2001
From: unknown
Date: Tue, 5 Apr 2022 19:00:31 -0300
Subject: [PATCH] Fix reddit crawl newline and quote handling

Replace the blanket backslash removal with a regex that drops a
backslash only when it is NOT followed by `"` or `n`, so the `\"` and
`\n` escape sequences in the submission text are kept.

The result of re.sub() is assigned back to `text`: Python strings are
immutable, so calling re.sub() without using its return value would
leave the text unfiltered.
---
 tools/redditcrawl.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tools/redditcrawl.py b/tools/redditcrawl.py
index ab7ca429..6be23e56 100755
--- a/tools/redditcrawl.py
+++ b/tools/redditcrawl.py
@@ -2,6 +2,7 @@
 import praw
 import json
 import time
+import re
 
 outfile = open('temp_atlas.json', 'w', encoding='utf-8')
 failfile = open('manual_atlas.json', 'w', encoding='utf-8')
@@ -51,7 +52,13 @@
 		break
 	if(submission.link_flair_text == "New Entry"):
 		text = submission.selftext
-		text = text.replace("\\", "")
+		#Old backslash filter:
+		#text = text.replace("\\", "")
+		#New one: One \\ escapes a backslash in python's parser
+		# Two escape it again in the regex parser, so \\\\ is \
+		# Then anything but " or n is replaced with the first capture group (anything but " or n)
+		# Test in repl: re.sub("\\\\([^\"n])", "\\1", "\\t < removed slash, t stays and > stays \\n \\\"")
+		text = re.sub("\\\\([^\"n])", "\\1", text)
 		try:
 			text = text.replace("\"id\": 0,", "\"id\": 0,\n\t\t\"submitted_by\": \""+submission.author.name+"\",")
 		except AttributeError: