-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcreate_pca_model.py
116 lines (100 loc) · 4.13 KB
/
create_pca_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# File : create_pca_model.py
# Description : Create a PCA model of symbolic text representations.
# Author : Nils Schaetti <nils.schaetti@unine.ch>
# Date : 01.02.2017 17:59:05
# Place : Nyon, Suisse
#
# This file is part of the Reservoir Computing NLP Project.
# The Reservoir Computing NLP Project is a set of free software:
# you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# The Reservoir Computing NLP Project is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with the Reservoir Computing NLP Project. If not, see <http://www.gnu.org/licenses/>.
#
import argparse
from core.tools.RCNLPLogging import RCNLPLogging
import core.clustering.functions as cf
import numpy as np
import os
from scipy.stats import ttest_1samp
from core.converters.RCNLPPosConverter import RCNLPPosConverter
from core.converters.RCNLPTagConverter import RCNLPTagConverter
from core.converters.RCNLPFuncWordConverter import RCNLPFuncWordConverter
from core.converters.RCNLPWordVectorConverter import RCNLPWordVectorConverter
from core.converters.LetterConverter import LetterConverter
import io
from sklearn.decomposition import PCA
import pickle
####################################################
# Functions
####################################################


def select_converter(name):
    """Return the text converter instance matching a ``--converter`` value.

    :param name: Converter key given on the command line ("pos", "tag",
        "fw" or "letter"). Any other value, including ``None``, selects the
        word-vector converter.
    :return: A newly constructed converter object.
    """
    if name == "pos":
        return RCNLPPosConverter()
    elif name == "tag":
        return RCNLPTagConverter()
    elif name == "fw":
        return RCNLPFuncWordConverter()
    elif name == "letter":
        return LetterConverter()
    # end if
    return RCNLPWordVectorConverter()
# end select_converter


def collect_representations(converter, directories, startup):
    """Convert every text file of every directory and stack the results.

    :param converter: Callable taking the text of one file and returning an
        object that can be sliced with ``[startup:]`` and stacked row-wise
        by ``np.vstack``.
    :param directories: Iterable of directory paths; every entry found in
        each directory is opened and read as a text file.
    :param startup: Number of leading entries dropped from each converted
        document.
    :return: The converted documents stacked with ``np.vstack``.
    :raises ValueError: If no file was found in any of the directories.
    """
    # Every document is kept in a list and stacked once at the end, so that
    # no document is overwritten and the stacking cost stays linear.
    doc_arrays = []
    for directory in directories:
        print("Transforming texts from author %s to symbols" % directory)
        # Sorted so that the row order does not depend on the file system
        for text_file in sorted(os.listdir(directory)):
            # The context manager closes each file as soon as it is read
            with io.open(os.path.join(directory, text_file), 'r') as text_handle:
                content = text_handle.read()
            # end with
            # Convert the text and remove the start-up entries
            doc_arrays.append(converter(content)[startup:])
        # end for
    # end for

    if not doc_arrays:
        raise ValueError("No text file found, cannot build the representations.")
    # end if
    return np.vstack(doc_arrays)
# end collect_representations


####################################################
# Main function
####################################################
if __name__ == "__main__":

    # Argument parser
    parser = argparse.ArgumentParser(description="RCNLP - Create PCA model of symbolic representations.")

    # Argument
    parser.add_argument("--texts", type=str, help="Text directory.")
    parser.add_argument("--startup", type=int, help="Number of start-up states to remove.", default=20)
    parser.add_argument("--components", type=int, help="Number of principal component to reduce inputs.", required=True)
    # NOTE(review): "letter" is also accepted below but is not listed in this help text.
    parser.add_argument("--converter", type=str, help="The text converter to use (fw, pos, tag, wv).")
    parser.add_argument("--lang", type=str, help="Language model", default='en')
    parser.add_argument("--samples", type=int, help="Number of authors to take", default=20)
    parser.add_argument("--output", type=str, help="Output model file", default='pca_output.p')
    args = parser.parse_args()

    # The text directory has no default: fail with a usage message instead
    # of crashing later while joining paths.
    if args.texts is None:
        parser.error("argument --texts is required")
    # end if

    # >> 1. Convert the text to symbolic or continuous representations
    converter = select_converter(args.converter)

    # Choose authors. np.random.randint excludes its upper bound, so ids are
    # drawn from 1..49, with replacement (an author can be picked twice).
    # NOTE(review): confirm whether author 50 should be reachable.
    author_directories = []
    for _ in range(args.samples):
        authors_id = np.random.randint(1, 50)
        author_directories.append(os.path.join(args.texts, str(authors_id)))
    # end for

    # Get texts
    symb_rep = collect_representations(converter, author_directories, args.startup)

    # PCA
    pca = PCA(n_components=args.components)
    pca.fit(symb_rep)

    # Explained variance
    print("Explained variance : ")
    print(pca.explained_variance_)

    # Explained variance ratio
    print("Explained variance ratio : ")
    print(pca.explained_variance_ratio_)

    # Mean
    print("Mean : ")
    print(pca.mean_)

    # Noise variance
    print("Noise variance : ")
    print(pca.noise_variance_)

    # Save. Pickle data is binary, so the file is opened in 'wb' mode, and
    # the context manager flushes and closes it.
    with open(args.output, 'wb') as output_file:
        pickle.dump(pca, output_file)
    # end with
# end if