atlas/tools/redditcrawl.py

85 lines
2.6 KiB
Python
Raw Normal View History

2017-04-04 20:12:45 +02:00
import praw
2022-04-04 18:49:57 +02:00
import json
import time
import re
2017-04-04 20:12:45 +02:00
2022-04-05 13:15:30 +02:00
# Output files, both truncated on start-up:
#   temp_atlas.json   - entries that parsed as valid JSON (re-serialised)
#   manual_atlas.json - raw text of entries that failed to parse
outfile = open("temp_atlas.json", mode="w", encoding="utf-8")
failfile = open("manual_atlas.json", mode="w", encoding="utf-8")
2017-04-04 20:12:45 +02:00
# The "credentials" file holds the Reddit app's client ID on its first line
# and the client secret on its second line. Read both inside a `with` so the
# file handle is closed as soon as the two values are in memory.
with open('credentials', 'r') as credentials:
    client_id = credentials.readline().strip(' \t\n\r')
    client_secret = credentials.readline().strip(' \t\n\r')

# Client used by the rest of the script to list submissions.
reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent='atlas_bot')
2022-04-04 18:49:57 +02:00
# Running tallies reported at the end of the run: entries that failed to
# parse, entries written successfully, and entries processed overall.
failcount = successcount = totalcount = 0
2022-04-04 18:49:57 +02:00
2022-04-05 23:19:49 +02:00
# Load the atlas that is already published and collect the IDs of its
# entries. The file is closed as soon as it has been parsed, and the IDs are
# kept in a set because they are only ever used for membership tests.
with open("../web/atlas.json", "r", encoding='utf-8') as jsonfile:
    existing = json.load(jsonfile)
existing_ids = {item['id'] for item in existing}
2022-04-05 21:57:08 +02:00
# Auth setup
#   1. Head to https://www.reddit.com/prefs/apps
#   2. Click "create another app"
#   3. Give it a name and description
#   4. Select "script"
#   5. Redirect to http://localhost:8080
#   6. Copy ID (under Personal Use Script)
#   7. Append to file called "credentials"
#   8. Copy Secret
#   9. Append on newline to "credentials" file
#   10. Run Script
# Running Script
#   1. Input the next ID to use
#   2. Manually resolve errors in manual_atlas.json
#   3. Copy temp_atlas.json entries into web/_js/atlas.js
#   4. Pull Request
# NOTE(review): this script never prompts for an ID and reads
# ../web/atlas.json, so "Running Script" steps 1 and 3 look out of date -
# confirm against the current workflow.

# Matches a backslash followed by any character other than `"` or `n` and
# captures that character. Substituting the capture drops the backslash, so
# \" and \n escapes survive while every other backslash escape is unwrapped.
stray_backslash = re.compile(r'\\([^"n])')

for submission in reddit.subreddit('placeAtlas2').new(limit=2000):
    # Stop at the first submission that is already in the atlas; presumably
    # everything after it in the "new" listing was imported by an earlier run.
    if submission.id in existing_ids:
        print("Found first duplicate!")
        break

    # Only posts flaired as new entries carry an entry to import.
    if submission.link_flair_text != "New Entry":
        continue

    text = stray_backslash.sub(r"\1", submission.selftext)

    # Looking up .name raises AttributeError when the submission has no usable
    # author object; only that lookup is guarded, so an AttributeError from
    # anywhere else is not silently swallowed.
    try:
        author_name = submission.author.name
    except AttributeError:
        author_name = "unknown"

    # Record the submitter right after the placeholder id, then swap the
    # placeholder `"id": 0` for the submission's own id (as a JSON string).
    # The placeholder contains no newline, so one replace over the whole text
    # gives the same result as replacing line by line.
    text = text.replace('"id": 0,', f'"id": 0,\n\t\t"submitted_by": "{author_name}",')
    text = text.replace('"id": 0', f'"id": "{submission.id}"')

    try:
        entry = json.loads(text)
    except json.JSONDecodeError:
        # Not valid JSON: keep the raw text so it can be repaired by hand.
        failfile.write(text + ",\n")
        failcount += 1
    else:
        # Valid JSON: write it back out re-serialised on a single line.
        outfile.write(json.dumps(entry) + ",\n")
        successcount += 1

    age_seconds = round(time.time() - submission.created_utc)
    print(f"written {submission.id} submitted {age_seconds} seconds ago")
    totalcount += 1
2022-04-04 18:49:57 +02:00
2022-04-06 03:33:57 +02:00
# All entries have been written; close both output files so their contents
# are flushed to disk before the user is told to go and inspect them.
outfile.close()
failfile.close()

# Final tally. Failed entries are written to manual_atlas.json, so the
# message points there.
print(f"\n\nSuccess: {successcount}/{totalcount}\nFail: {failcount}/{totalcount}\nPlease check manual_atlas.json for failed entries to manually resolve.")