-
Notifications
You must be signed in to change notification settings - Fork 0
/
material_gbk.py
50 lines (44 loc) · 1.46 KB
/
material_gbk.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# -*- coding:utf-8 -*-
import io
import re
import json
import pickle
# Start from the counts stored in freq_map.pickle; getMaterial() adds to them.
# NOTE(review): pickle.load can execute arbitrary code while loading; this
# assumes the file is trusted, locally produced data - confirm.
freq_map = {}
with open("freq_map.pickle", "rb") as saved_counts:
    freq_map = pickle.load(saved_counts)
# Punctuation and symbols that end a piece. They are kept as a plain character
# list and passed through re.escape(), so '-', '[' and ']' cannot accidentally
# form a range or close the character class.
_SEPARATOR_CHARS = (
    u"“”,。:;、?【】[]《》.()@%—— ·/!!□■+()……‘’-\":',<>?~#"
    u"①②③④⑤⑥⑦⑧⑨;°℃ |●&∶~_= "
    # NOTE(review): this trailing run only repeats ASCII characters already
    # listed above. It may have been full-width digits/punctuation before the
    # file was transcoded - confirm against the original source.
    u"0123456789-.,/[]|"
)

# ASCII digits and letters are separators as well. Compiled once at import
# time instead of on every call.
_SEPARATOR_RE = re.compile(u"[0-9A-Za-z" + re.escape(_SEPARATOR_CHARS) + u"]")


def getMaterial(line, counts=None):
    """Add the character and character-pair counts of *line* to a table.

    The text is split at every ASCII letter, ASCII digit and listed
    punctuation mark. Within each remaining piece every character is counted
    once, and every pair of adjacent characters is counted once. Pairs never
    span a separator.

    Args:
        line: Text (unicode) string to analyse.
        counts: Dict mapping a one- or two-character string to its count.
            It is updated in place. Defaults to the module-level
            ``freq_map``.

    Returns:
        None.
    """
    if counts is None:
        counts = freq_map
    for piece in _SEPARATOR_RE.split(line):
        piece_len = len(piece)
        for i, char in enumerate(piece):
            counts[char] = counts.get(char, 0) + 1
            # Only count a pair when a second character exists; slicing at
            # the last index would yield the single character again and
            # inflate its count.
            if i + 1 < piece_len:
                pair = piece[i:i + 2]
                counts[pair] = counts.get(pair, 0) + 1
# Feed every monthly news dump through getMaterial(), then save the updated
# counts. Each input file is GBK-encoded and holds one JSON object per line;
# the "html" and "title" fields of each object are counted.
# NOTE(review): range(1, 12) covers months 01-11 only; confirm that 2016-12
# is meant to be left out.
for num in range(1, 12):
    path = "sina_news_gbk/2016-%02d.txt" % num
    print("processing:")
    print(path)
    # Open the file for the current month (not a fixed one), so each month is
    # read exactly once.
    with io.open(path, "r", encoding="gbk") as news_file:
        for raw_record in news_file:
            record = json.loads(raw_record)
            # io.open has already decoded the GBK bytes to text, so the field
            # values are passed on unchanged.
            for field in ("html", "title"):
                try:
                    getMaterial(record[field])
                except Exception as exc:
                    # Best effort: report the bad record and keep going, but
                    # do not trap KeyboardInterrupt/SystemExit.
                    print("error in %s of %s: %r" % (field, path, exc))

with open("freq_map_after_gbk.pickle", "wb") as out_file:
    pickle.dump(freq_map, out_file)