atlas/tools/redditcrawl.py

113 lines
3.7 KiB
Python
Raw Normal View History

2017-04-04 20:12:45 +02:00
import praw
2022-04-04 18:49:57 +02:00
import json
import time
import re
import os
2017-04-04 20:12:45 +02:00
2022-04-05 13:15:30 +02:00
# Output files: entries whose JSON parses are written to temp_atlas.json,
# entries that fail to parse are written to manual_atlas.json. Both stay open
# for the remainder of the script.
outfile = open('temp_atlas.json', 'w', encoding='utf-8')
failfile = open('manual_atlas.json', 'w', encoding='utf-8')

# The "credentials" file holds one value per line, in this order: client id,
# client secret, username, password. readline() returns '' once the file is
# exhausted, so a file with only the first two lines yields an empty
# username and password.
with open('credentials', 'r') as credentials:
    client_id = credentials.readline().strip(' \t\n\r')
    client_secret = credentials.readline().strip(' \t\n\r')
    user = credentials.readline().strip(' \t\n\r')
    pw = credentials.readline().strip(' \t\n\r')

reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent='atlas_bot',
    username=user,
    password=pw,
)

# Flairs can only be changed when the client is not in read-only mode.
has_write_access = not reddit.read_only
if not has_write_access:
    print("Warning: No write access. Post flairs will not be updated")
    # Presumably a pause so the warning is noticed before the crawl output
    # starts scrolling. Only the `time` module is imported, so the call has to
    # be qualified (a bare `sleep(5)` raises NameError).
    time.sleep(5)
2022-04-04 18:49:57 +02:00
2022-04-05 23:19:49 +02:00
# Load the atlas that is already published and collect the ids of its entries.
# The context manager closes the file as soon as it has been parsed.
with open("../web/atlas.json", "r", encoding='utf-8') as jsonfile:
    existing = json.load(jsonfile)

# A set, because the ids are only used for membership tests.
existing_ids = {item['id'] for item in existing}
2022-04-07 22:00:30 +02:00
def set_flair(submission, flair):
    """Set the link flair of ``submission`` to the template named ``flair``.

    Does nothing when the module-level ``has_write_access`` flag is false or
    when the submission already carries that flair text.

    The template is the first entry of ``submission.flair.choices()`` whose
    ``"flair_text_editable"`` value is truthy and whose ``"flair_text"`` equals
    ``flair``. If there is no such entry a warning is printed and the flair is
    left unchanged, so a missing template does not abort the whole run with a
    ``StopIteration``.

    Args:
        submission: Object exposing ``link_flair_text`` and a ``flair``
            attribute with ``choices()`` and ``select(template_id)``.
        flair: Flair text to apply, e.g. ``"Processed Entry"``.
    """
    if not has_write_access or submission.link_flair_text == flair:
        return

    template = next(
        (
            choice
            for choice in submission.flair.choices()
            if choice["flair_text_editable"] and choice["flair_text"] == flair
        ),
        None,
    )
    if template is None:
        print(f"Warning: no flair template matching '{flair}' was found; flair not updated")
        return

    submission.flair.select(template["flair_template_id"])
# Auth setup
#  1. Head to https://www.reddit.com/prefs/apps
#  2. Click "create another app"
#  3. Give it a name and description
#  4. Select "script"
#  5. Redirect to http://localhost:8080
#  6. Copy ID (under Personal Use Script)
#  7. Append to file called "credentials"
#  8. Copy Secret
#  9. Append on newline to "credentials" file
# 10. If you want flair write access append 2 newlines with username and
#     password (Must be a mod, don't do this if you don't know what you're
#     doing)
# 11. Run Script
#
# Running Script
#  1. Input the next ID to use
#  2. Manually resolve errors in manual_atlas.json
#  3. Copy temp_atlas.json entries into web/_js/atlas.js
#  4. Pull Request
#
# NOTE(review): "Input the next ID to use" looks stale - nothing in this script
# prompts for input; entry ids are taken from the submission id. Confirm and
# update the steps above.

total_all_flairs = 0  # every submission looked at
duplicate_count = 0   # submissions whose id is already in the atlas
failcount = 0         # "New Entry" posts whose body was not valid JSON
successcount = 0      # "New Entry" posts written to temp_atlas.json
totalcount = 0        # all "New Entry" posts handled

# A backslash followed by any character other than '"' or 'n'. Substituting
# the captured character drops the backslash, so \" and \n survive while every
# other backslash escape in the post body is removed.
backslash_escape = re.compile(r'\\([^"n])')

outfile.write("[\n")

for submission in reddit.subreddit('placeAtlas2').new(limit=2000):
    total_all_flairs += 1

    # Stop at the first submission that is already part of the atlas.
    if submission.id in existing_ids:
        set_flair(submission, "Processed Entry")
        print("Found first duplicate!")
        duplicate_count += 1
        break

    if submission.link_flair_text != "New Entry":
        continue

    text = backslash_escape.sub(r"\1", submission.selftext)

    # Accessing .name fails with AttributeError when the submission has no
    # author object; such entries are credited to "unknown".
    try:
        author_name = submission.author.name
    except AttributeError:
        author_name = "unknown"

    # Add a "submitted_by" field right after the placeholder id ...
    text = text.replace('"id": 0,', '"id": 0,\n\t\t"submitted_by": "' + author_name + '",')
    # ... then swap the placeholder id for the submission id. The search string
    # contains no newline, so one replace over the whole text gives the same
    # result as replacing line by line.
    text = text.replace('"id": 0', '"id": "' + str(submission.id) + '"')

    try:
        entry = json.loads(text)
    except json.JSONDecodeError:
        # Not valid JSON: keep the raw text for manual fixing.
        failfile.write(text + ",\n")
        failcount += 1
        set_flair(submission, "Rejected Entry")
    else:
        # Re-serialise so every entry occupies exactly one line of output.
        outfile.write(json.dumps(entry) + " ,\n")
        successcount += 1
        set_flair(submission, "Processed Entry")

    print(f"written {submission.id} submitted {round(time.time() - submission.created_utc)} seconds ago")
    totalcount += 1
2022-04-04 18:49:57 +02:00
# Close the JSON array. Every entry was written followed by " ,\n", so the
# separator after the last entry must be removed before the closing bracket.
# The file is re-read instead of seeking back a fixed number of bytes, because:
#   * the separator is 3 bytes with "\n" line endings and 4 with "\r\n", so a
#     fixed offset of 4 can cut into the last entry;
#   * when no entry was written the file holds only "[\n" and the offset would
#     be negative;
#   * text-mode tell() returns an opaque position that is not meant for
#     arithmetic.
outfile.close()
failfile.close()

with open('temp_atlas.json', 'r', encoding='utf-8') as written:
    body = written.read().rstrip()
if body.endswith(","):
    body = body[:-1].rstrip()
with open('temp_atlas.json', 'w', encoding='utf-8') as finished:
    finished.write(body + "\n]")

print(f"\n\nTotal all flairs:{total_all_flairs}\nSuccess: {successcount}/{totalcount}\nFail: {failcount}/{totalcount}\nPlease check manual_atlas.json for failed entries to manually resolve.")