forked from rasbt/datacollect
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtwitter_timeline.py
executable file
·140 lines (111 loc) · 4.97 KB
/
twitter_timeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/env python
# Tested in Python 3
# Sebastian Raschka, 2014
# An interactive command line app for
# downloading your personal twitter timeline.
#
# For help, execute
# ./twitter_timeline.py --help
import twitter
from datetime import datetime
import time
import re
import sys
import pandas as pd
import pyprind as pp
import oauth_info as auth # our local file with the OAuth infos
class TimelineMiner(object):
    """Download a Twitter user timeline into a pandas DataFrame.

    Typical use: construct with the OAuth credentials, call
    ``authenticate()``, then ``get_timeline()`` and finally ``make_csv()``.

    Attributes:
        df: DataFrame with the columns ``timestamp`` and ``tweet``; a
            ``keywords`` column is added by a keyword-filtered download.
        auth: The API client created by ``authenticate()`` (``None`` before).
    """

    # Number of tweets requested per API call. A page shorter than this is
    # treated as the last page of the timeline.
    page_size = 200
    # Pause in seconds between two consecutive page requests.
    request_delay = 1

    def __init__(self, access_token, access_secret, consumer_key,
                 consumer_secret, user_name):
        """Store the OAuth credentials and create the empty result table.

        Args:
            access_token: OAuth access token.
            access_secret: OAuth access token secret.
            consumer_key: OAuth consumer key.
            consumer_secret: OAuth consumer secret.
            user_name: Identifies the user whose timeline is requested.
                NOTE(review): it is sent as the API's ``user_id`` parameter;
                confirm whether ``screen_name`` was intended.
        """
        self.access_token = access_token
        self.access_secret = access_secret
        self.consumer_key = consumer_key
        self.consumer_secret = consumer_secret
        self.user_name = user_name
        self.auth = None
        self.df = pd.DataFrame(columns=['timestamp', 'tweet'], dtype='str')

    def authenticate(self):
        """Create the API client and store it in ``self.auth``.

        Returns:
            bool: True if ``self.auth`` is a ``twitter.api.Twitter`` instance.
            NOTE(review): this only checks the object's type; it presumably
            does not verify the credentials with the server -- confirm.
        """
        self.auth = twitter.Twitter(auth=twitter.OAuth(
            self.access_token, self.access_secret,
            self.consumer_key, self.consumer_secret))
        return isinstance(self.auth, twitter.api.Twitter)

    def get_timeline(self, max=0, keywords=None):
        """Fetch the timeline page by page and store the tweets in ``self.df``.

        Every scanned tweet gets a running number (starting at 0) which is
        used as its row label.  When ``keywords`` are given, only tweets
        matching at least one keyword are stored, so row labels may have gaps,
        and the matching keywords are stored ';'-joined in the ``keywords``
        column.  Double quotes in the tweet text are replaced by single
        quotes.

        Args:
            max: Stop after this many tweets have been scanned (not stored);
                0 means no limit.
            keywords: Optional iterable of keywords (see ``__check_keyword``
                for the matching rules).

        Raises:
            RuntimeError: If ``authenticate()`` has not been called yet.
        """
        if self.auth is None:
            raise RuntimeError('Not authenticated: call authenticate() first.')

        limit = max  # local alias, the parameter name shadows the builtin
        keywords = list(keywords) if keywords else []
        if keywords:
            self.df['keywords'] = ''

        # The newest tweet provides the starting point for paging backwards.
        newest = self.auth.statuses.user_timeline(
            user_id=self.user_name, count=1)
        max_id = newest[0]['id'] if newest else None  # None: empty timeline

        counter = 0
        while max_id is not None:
            timeline = self.auth.statuses.user_timeline(
                user_id=self.user_name, count=self.page_size, max_id=max_id)

            for pos, status in enumerate(timeline):
                text = status['text'].replace('"', '\'')
                date = self.__get_date(timeline, pos)
                matched = [k for k in keywords
                           if self.__check_keyword(text, k)]

                if matched or not keywords:
                    self.df.loc[counter, 'tweet'] = text
                    self.df.loc[counter, 'timestamp'] = date
                    if keywords:
                        self.df.loc[counter, 'keywords'] = ';'.join(matched)

                counter += 1
                sys.stdout.write('\rTweets downloaded: %s' % counter)
                sys.stdout.flush()
                if limit and counter >= limit:
                    break

            limit_reached = limit and counter >= limit
            if limit_reached or len(timeline) < self.page_size:
                break

            # ``max_id`` is inclusive, so continue strictly below the oldest
            # tweet of this page; otherwise it would be stored a second time.
            max_id = int(timeline[-1]['id']) - 1
            time.sleep(self.request_delay)
        print()

    def make_csv(self, path):
        """Write ``self.df`` to ``path`` as a UTF-8 encoded CSV file."""
        self.df.to_csv(path, encoding='utf8')

    def __get_date(self, timeline, tweet):
        """Return the creation time of ``timeline[tweet]`` as a string.

        Args:
            timeline: Sequence of tweet dicts with a ``created_at`` entry.
            tweet: Index of the tweet within ``timeline``.

        Returns:
            str: The timestamp formatted as ``YYYY-MM-DD HH:MM:SS``.
        """
        timest = datetime.strptime(timeline[tweet]['created_at'],
                                   "%a %b %d %H:%M:%S +0000 %Y")
        # Year-month-day order; the previous "%Y-%d-%m" swapped day and month.
        return timest.strftime("%Y-%m-%d %H:%M:%S")

    def __check_keyword(self, s, key):
        """Return True if ``key`` occurs in ``s`` (case-insensitive).

        ``key`` is used as a regular expression.  If it is not a valid
        pattern (e.g. ``':)'``) it is matched as literal text instead of
        raising ``re.error``.
        """
        try:
            return bool(re.search(key, s, re.IGNORECASE))
        except re.error:
            return bool(re.search(re.escape(key), s, re.IGNORECASE))
def parse_args(argv=None):
    """Parse and validate the command line arguments.

    Args:
        argv: List of argument strings; ``None`` lets argparse read
            ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with ``out`` (str), ``max`` (int, 0 = no limit)
        and ``keywords`` (list of str, empty when no filter was given).

    Exits with status 2 and a usage message (via ``parser.error``) when the
    output filename is missing or ``--max`` is not a non-negative integer.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='A command line tool to download your personal twitter timeline.',
        formatter_class=argparse.RawTextHelpFormatter,
        epilog='\nExample:\n'
               './twitter_timeline.py -o my_timeline.csv -k Python,Github')
    parser.add_argument('-o', '--out',
                        help='Filename for creating the output CSV file.')
    parser.add_argument('-m', '--max', type=int, default=0,
                        help='Maximum number (integer) of timeline tweets query (searches all by default)')
    parser.add_argument('-k', '--keywords',
                        help='A comma separated list of keywords for filtering (optional).')
    parser.add_argument('-v', '--version', action='version', version='v. 1.0.1')
    args = parser.parse_args(argv)

    # Checked after parsing (instead of required=True) to keep the original
    # message text; parser.error() exits with a non-zero status.
    if not args.out:
        parser.error('Please provide a filename for creating the output CSV file.')
    if args.max < 0:
        parser.error('--max must be a non-negative integer.')

    # Tolerate "Python, Github," style input: strip blanks, drop empty items.
    raw_keywords = args.keywords.split(',') if args.keywords else []
    args.keywords = [k.strip() for k in raw_keywords if k.strip()]
    return args


def main(argv=None):
    """Download the timeline and write it to the requested CSV file.

    The OAuth credentials and the user name are read from the local
    ``oauth_info`` module (imported as ``auth``).
    """
    args = parse_args(argv)
    tm = TimelineMiner(auth.ACCESS_TOKEN,
                       auth.ACCESS_TOKEN_SECRET,
                       auth.CONSUMER_KEY,
                       auth.CONSUMER_SECRET,
                       auth.USER_NAME)
    print('Authentification successful: %s' % tm.authenticate())
    tm.get_timeline(max=args.max, keywords=args.keywords)
    tm.make_csv(args.out)


if __name__ == "__main__":
    main()