data_prepare.py
import re
import jieba
import random
from tensorflow.contrib import learn
class Data_Prepare(object):

    def readfile(self, filename):
        """Read a tab-separated file of `sentence_a<TAB>sentence_b<TAB>tag` lines,
        pre-process both sentences, shuffle the pairs and one-hot encode the tags."""
        texta = []
        textb = []
        tag = []
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip().split("\t")
                texta.append(self.pre_processing(line[0]))
                textb.append(self.pre_processing(line[1]))
                tag.append(line[2])

        # Shuffle the three lists with one shared permutation
        index = list(range(len(texta)))
        random.shuffle(index)
        texta_new = [texta[x] for x in index]
        textb_new = [textb[x] for x in index]
        tag_new = [tag[x] for x in index]

        # Count each tag and turn it into a one-hot vector
        # (tags are expected to be integer class ids starting at 0).
        tag_types = list(set(tag_new))
        dicts = {}
        tags_vec = []
        for x in tag_new:
            if x not in dicts:
                dicts[x] = 1
            else:
                dicts[x] += 1
            temp = [0] * len(tag_types)
            temp[int(x)] = 1
            tags_vec.append(temp)
        print(dicts)

        return texta_new, textb_new, tags_vec

    def pre_processing(self, text):
        # Remove any content inside parentheses (half- and full-width)
        text = re.sub(r'\([^)]*\)|（[^）]*）', '', text)
        # Keep only Chinese characters
        text = ''.join([x for x in text if '\u4e00' <= x <= '\u9fa5'])
        # Segment the text into words with jieba
        words = ' '.join(jieba.cut(text)).split(" ")
        # Fall back to character level: split the words back into single characters
        words = [x for x in ''.join(words)]
        return ' '.join(words)
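
    # Illustrative example (assumed input, not from the original code):
    #   pre_processing('Python（编程）真好用!')
    # drops the parenthesised part and all non-Chinese characters, then splits the
    # remainder into space-separated characters, giving '真 好 用'.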

    def build_vocab(self, sentences, path):
        lens = [len(sentence.split(" ")) for sentence in sentences]
        max_length = max(lens)
        vocab_processor = learn.preprocessing.VocabularyProcessor(max_length)
        vocab_processor.fit(sentences)
        vocab_processor.save(path)


if __name__ == '__main__':
    data_pre = Data_Prepare()
    data_pre.readfile('data/train.txt')
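
    # A minimal follow-on sketch (assumed usage, not part of the original script;
    # the output path 'data/vocab' is hypothetical, and the file is re-read here
    # only for illustration): persist a character-level vocabulary over both
    # sentence columns so a training script can later restore it with
    # learn.preprocessing.VocabularyProcessor.restore('data/vocab') and map the
    # pre-processed sentences to fixed-length id arrays via its transform() method.
    texta, textb, tags = data_pre.readfile('data/train.txt')
    data_pre.build_vocab(texta + textb, 'data/vocab')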