onionbot.py
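
"""
Onion headline Markov bot.

Collects headlines from The Onion, both from the RSS feed (check_rss) and by
scraping the front page (scrape_site), and stores them in headlines.csv.
make_headline builds a Markov chain model from those headlines with markovify
and generates a new fake headline, which update_status posts to Twitter via tweepy.
"""
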
import tweepy
from pprint import pprint
from time import sleep
# from credentials import *
import markovify
import datetime
import feedparser
import ssl
import csv
from os import environ
import requests
from bs4 import BeautifulSoup
import random
CONSUMER_KEY = environ['CONSUMER_KEY']
CONSUMER_SECRET = environ['CONSUMER_SECRET']
ACCESS_TOKEN = environ['ACCESS_TOKEN']
ACCESS_TOKEN_SECRET = environ['ACCESS_TOKEN_SECRET']
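
# The four Twitter API credentials are read from environment variables, so they
# need to be set before the script runs, e.g.:
#   export CONSUMER_KEY=... CONSUMER_SECRET=... ACCESS_TOKEN=... ACCESS_TOKEN_SECRET=...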

def check_rss():
    ssl._create_default_https_context = ssl._create_unverified_context
    d = feedparser.parse('https://www.theonion.com/rss')
    headlines_list = []
    with open('headlines.csv', 'r+') as headlines:
        # check all the headlines in the rss feed
        for entry in d['entries']:
            # get the headline
            headline = entry['title']
            reader = csv.reader(headlines, delimiter=',')
            # put the existing headlines into the list (only reads rows on the first pass)
            for row in reader:
                headlines_list.append(row[0])
            # add new headlines to csv file
            if headline in headlines_list:
                pass
            else:
                writer = csv.writer(headlines, delimiter=',')
                writer.writerow([headline])
                headlines_list.append(headline)  # have to put it in headlines_list as well otherwise you get doubles
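
# The RSS feed typically only exposes the most recent posts, so check_rss on its own
# would miss older headlines; scrape_site below also walks the front page (plus one
# "more stories" page) to pick those up.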

def scrape_site():
    base_url = 'https://www.theonion.com/'
    extra = ''
    headlines_list = []
    with open('headlines.csv', 'r+') as headlines:
        # put existing headlines in list
        reader = csv.reader(headlines, delimiter=',')
        for row in reader:
            headlines_list.append(row[0])
        # start looping the pages
        for counter in range(0, 2):
            response = requests.get(base_url + extra)
            html = response.content
            page_content = BeautifulSoup(html, "html.parser")
            # find all headlines
            headlines_on_page = page_content.select('.content-meta__headline h6, .content-meta__headline h3, .content-meta__headline__wrapper h5, article.js_post_item h1')
            # for each headline, get the text and add it if it doesn't exist yet
            for item in headlines_on_page:
                headline = item.text
                if headline in headlines_list:
                    pass
                else:
                    writer = csv.writer(headlines, delimiter=',')
                    writer.writerow([headline])
                    headlines_list.append(headline)  # track it so the same headline isn't written twice in one run
            # find the load more button to build the next page link; it may be missing, so check before reading its href
            load_more_button = page_content.find(attrs={'data-ga': '[["Front page click","More stories click"]]'})
            if load_more_button is not None:
                extra = load_more_button.get('href')
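
# The CSS selectors and the data-ga attribute above are tied to the markup The Onion
# used when this was written; if the site layout changes, headlines_on_page comes back
# empty and the "load more" lookup returns None, so they are the first thing to check
# when the bot stops finding new headlines.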

def make_headline():
    # Get raw text as string.
    with open("headlines.csv") as headlines:
        text = headlines.read()
    # Build the model.
    more_or_less_random = random.randrange(2, 4)
    text_model = markovify.NewlineText(text, state_size=more_or_less_random)
    # make a new headline no more than 280 characters long
    return text_model.make_short_sentence(280)
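
# random.randrange(2, 4) picks a Markov state size of 2 or 3: 2 gives looser, more
# scrambled headlines, 3 stays closer to the originals. make_short_sentence() returns
# None if markovify cannot build a sentence within 280 characters (for example while
# headlines.csv is still small), in which case the update_status call below has
# nothing to post.
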
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
api = tweepy.API(auth)
now = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')

def update_status():
    tweet = make_headline()
    try:
        api.update_status(tweet)
        # append to the run log so earlier entries are kept
        with open("log.txt", "a") as log_write:
            log_write.write(now + ' - ' + tweet + '\n')
    except tweepy.TweepError as e:
        with open("log.txt", "a") as log_write:
            log_write.write(now + ' - ' + e.reason + '\n')

try:
    check_rss()
    scrape_site()
except Exception:
    # if collecting headlines fails, note it in the log and still try to tweet
    with open("log.txt", "a") as log_write:
        log_write.write(now + ' - headline update failed\n')
finally:
    update_status()
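
# Each invocation does one full cycle (refresh headlines.csv, then post a single
# generated headline), so the script is presumably meant to be run on a schedule,
# e.g. from cron or a platform scheduler, rather than as a long-running process.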