-
Notifications
You must be signed in to change notification settings - Fork 1
/
browser_cloud.py
58 lines (50 loc) · 1.84 KB
/
browser_cloud.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# -*- coding: utf-8 -*-
import re, csv, json
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
# A "word" pattern: a run of at least two word characters (letters, digits
# or underscores).
re_word = re.compile(r'\w{2,}')
# Deliberately crude: any whitespace-delimited chunk containing '/', '.' or
# '=' is treated as a possible URL, path, file name or assignment.
re_uri = re.compile(r'\S*[\/\.=]\S*')
# English stop words as a set for fast membership tests. Note that this
# rebinds the imported `stopwords` corpus reader name to the set.
stopwords = set(stopwords.words('english'))

# Names of the browsers being tracked; they key every mapping below.
_browser_names = ('ie', 'firefox', 'chrome', 'opera', 'safari')
# Per browser: every kept word from records that mention the browser.
texts = {name: [] for name in _browser_names}
# Per browser: number of records that mention the browser.
counts = {name: 0 for name in _browser_names}
# Per browser: (word, count) pairs, filled in once all records are read.
freqdists = {}
# Case-insensitive, whole-word patterns used to detect a browser mention.
browsers = {
    'ie': re.compile(r'\b(?:ie\s*\d*|internet\s*explorer)\b', re.IGNORECASE),
    'firefox': re.compile(r'\bfirefox\b', re.IGNORECASE),
    'chrome': re.compile(r'\bchrome\b', re.IGNORECASE),
    'opera': re.compile(r'\bopera\b', re.IGNORECASE),
    'safari': re.compile(r'\bsafari\b', re.IGNORECASE),
}
def check_word(word):
    """Return True if *word* should be kept for the frequency counts.

    A word is kept when all of the following hold:

    * it is longer than one character,
    * it starts with a match of the module-level ``re_word`` pattern,
    * it is not in the module-level ``stopwords`` set,
    * it contains at most one hyphen, and
    * it contains no underscore.
    """
    if len(word) <= 1 or not re_word.match(word):
        return False
    # Test the argument itself. The previous code looked up ``w``, which is
    # not a name defined in this function.
    if word in stopwords:
        return False
    # Two or more hyphens, or any underscore, disqualify the word.
    return word.count('-') < 2 and '_' not in word
# Read browser_mentions.csv and, for every browser a record mentions, collect
# that record's kept words; then write the most frequent words per browser to
# data.js as a JavaScript variable.
# Binary mode is what the Python 2 csv module expects for its input file.
with open('browser_mentions.csv', 'rb') as fcsv:
    reader = csv.reader(fcsv)
    # Consume the header row; the default keeps an empty file from raising
    # StopIteration.
    headers = next(reader, None)
    for record in reader:
        if not record:
            # csv.reader yields an empty list for blank lines: nothing to scan.
            continue
        text = record[0]
        # Remove URL/path/assignment-like chunks from the lowercased text
        # before tokenizing it.
        tokens = word_tokenize(re_uri.sub('', text.lower()))
        words = [w for w in tokens if check_word(w)]
        for name, pattern in browsers.items():
            # Browser detection runs on the original text of the record.
            if pattern.search(text):
                # NOTE: words matching the browser's own name are not
                # filtered out of its word list.
                texts[name].extend(words)
                counts[name] += 1

# Single-argument form prints the same under the Python 2 print statement.
print(counts)

for name in texts:
    # Keep words seen at least twice, most frequent first, capped at 500.
    frequent = [item for item in FreqDist(texts[name]).items() if item[1] >= 2]
    # Sort explicitly so the cut-off below does not depend on the order in
    # which FreqDist.items() returns its pairs. The sort is stable, so an
    # already frequency-ordered sequence keeps its exact order.
    frequent.sort(key=lambda item: item[1], reverse=True)
    freqdists[name] = frequent[:500]

with open('data.js', 'w') as fjs:
    fjs.write('var browsers = %s;' % json.dumps(freqdists))