#!/usr/bin/env python
from collections import namedtuple, defaultdict
from operator import itemgetter
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from os import listdir
from os.path import isfile, join, basename
import re
import argparse
from numpy.linalg import norm
from numpy import dot
# --------------------------------------------------------------------------------- argument parsing
def parse_arguments(args_=None):
    def acronym(v):
        try:
            return re.match(r"^[A-Z]{3}\d{4}\w?$", v).group(0)
        except AttributeError:
            raise argparse.ArgumentTypeError('"{}" is not a valid course code'.format(v))
    parser = argparse.ArgumentParser(description='TD2 script: text similarity between courses')
    parser.add_argument('acronym', metavar='ACRONYM', type=acronym, nargs='?',
                        help='Acronym of the course to check')
    parser.add_argument('-d', type=str, dest='path', help='Path to the directory of course files')
    parser.add_argument('-n', type=int, dest='length', help='Number of results to display')
    parser_verbose_handling = parser.add_mutually_exclusive_group(required=False)
    parser_verbose_handling.add_argument('-v', '--verbose', dest='verbose', action='store_true',
                                         help='Print detailed information')
    parser_verbose_handling.add_argument('-q', '--quiet', dest='verbose', action='store_false',
                                         help='Print minimal information')
    parser.set_defaults(acronym='INF8007', path='02/PolyHEC', length=10, verbose=True)
    return parser.parse_args(args_)
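# Example invocations (a sketch based on the flags defined above; LOG2420 is a
# hypothetical course code, while INF8007 and 02/PolyHEC are the script's defaults):
#
#     python td2.py                               # all defaults, verbose output
#     python td2.py LOG2420 -d 02/PolyHEC -n 5 -q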
# -------------------------------------------------------------------------------------------- utils
def parse_course(path, do_raise=False):
""" Parse a course file to retrieve it’s title and description, may raise error if file is
corrupted or not found """
with open(path) as stream:
title = stream.readline()[12:].strip()
description = stream.readline()[18:].strip()
leftover = stream.read()
if leftover:
if do_raise:
raise Exception('File not totally parsed', path, leftover)
description += leftover.strip()
return title, description
class Keydefaultdict(defaultdict):
    """ defaultdict variant whose factory is called with the missing key. """
    def __missing__(self, key):
        if self.default_factory is None:
            raise KeyError(key)
        ret = self[key] = self.default_factory(key)
        return ret
# -------------------------------------------------------------------------------------- text parser
class Parser:
    # matches any run of whitespace and basic punctuation between two words
    WORD_SEPARATOR = r'(?:(?: )?[\s.,:;?!()\\/\'\"])+'

    def __init__(self, language='french', default_remove_stopwords=False, default_stem=False):
        self.stopwords = set(stopwords.words(language))
        self.stemmer = SnowballStemmer(language=language)
        self.default_remove_stopwords = default_remove_stopwords
        self.default_stem = default_stem

    def tokenize(self, string, remove_stop_words=None, stem=None):
        remove_stop_words = self.default_remove_stopwords if remove_stop_words is None \
            else remove_stop_words
        stem = self.default_stem if stem is None else stem
        # break the string up into tokens
        words = re.split(self.WORD_SEPARATOR, string.strip().lower())
        # filter out empty strings
        words = filter(bool, words)
        if remove_stop_words:
            words = filter(lambda w: w not in self.stopwords, words)
        if stem:
            words = map(self.stemmer.stem, words)
        # map and filter (rather than list comprehensions) keep the pipeline lazy,
        # so the tokens are iterated only once, by the final list() call
        return list(words)
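# Usage sketch (illustrative; the exact tokens depend on NLTK's French stopword
# list and SnowballStemmer):
#
#     p = Parser(language='french', default_remove_stopwords=True, default_stem=True)
#     p.tokenize("Les réseaux de neurones!")
#     # -> stopwords 'les' and 'de' are dropped, the rest is lowercased and stemmed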
# ------------------------------------------------------------------------------------ search engine
class SearchEngine:
    def __init__(self, files, language='french'):
        self.files = {}
        self.vectors = {}
        self.parser = Parser(language=language, default_remove_stopwords=True, default_stem=True)
        # retrieve each file's contents and tokenize them
        ParsedFile = namedtuple('ParsedFile', 'title content original_content uniq_words')
        for file in files:
            title, original_content = parse_course(file)
            content = self.parser.tokenize(original_content) + self.parser.tokenize(title)
            # key each entry by its acronym: the file name without its '.txt' extension
            self.files[basename(file)[:-4]] = ParsedFile(title, content, original_content,
                                                         set(content))
        # set of all unique words, kept small by the stemming and stopword removal above
        word_list = set(word for file in self.files.values() for word in file.uniq_words)
        number_of_documents = len(files)
        # compute the idf of each word in the corpus; this can take a while
        self.words_index = {word: (index, number_of_documents / self.__count_docs(word))
                            for (index, word) in enumerate(word_list)}
        for acronym, file in self.files.items():
            vector = [0] * len(self.words_index)
            for word in file.content:
                # adding the word's idf once per occurrence accumulates tf * idf
                vector[self.words_index[word][0]] += self.words_index[word][1]
            self.vectors[acronym] = vector
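        # Worked check of the accumulation above: a word occurring 3 times in a
        # document with idf 2.5 has 2.5 added three times, leaving 3 * 2.5 = 7.5
        # in its slot, i.e. tf * idf with a raw-count tf.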
        self.norms = Keydefaultdict(lambda acronym: norm(self.vectors[acronym]))
        self.cosines = Keydefaultdict(lambda acr_a: Keydefaultdict(lambda acr_b: self.__cosine(
            self.vectors[acr_a],
            self.vectors[acr_b],
            self.norms[acr_a],
            self.norms[acr_b]
        )))
    @staticmethod
    def __cosine(a, b, norm_a, norm_b):
        # cosine similarity: cos(a, b) = (a · b) / (‖a‖ * ‖b‖)
        return float(dot(a, b) / (norm_a * norm_b))
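    # Hand-checked illustration of the formula above: dot([1, 2], [2, 0]) = 2,
    # ‖[1, 2]‖ = √5 ≈ 2.236 and ‖[2, 0]‖ = 2, so the cosine is 2 / (2 * √5) ≈ 0.447.
    # With non-negative tf-idf weights the score always lands in [0, 1].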
    def search(self, acronym, sort=True, reverse_sort=True):
        # ordering each pair as (min, max) makes the cosine cache symmetric:
        # cosines[a][b] and cosines[b][a] share a single cached entry
        rv = [(acr, self.cosines[min(acr, acronym)][max(acr, acronym)])
              for acr in self.vectors if acronym != acr]
        return sorted(rv, key=itemgetter(1), reverse=reverse_sort) if sort else rv
    def __count_docs(self, word):
        """ Count how many documents contain a given word; used for the TF-IDF. """
        # todo: move this into the init loop
        return sum(1 for file in self.files.values() if word in file.uniq_words)
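# End-to-end usage sketch (hypothetical file names; 02/PolyHEC is the script's
# default corpus directory and LOG2420 stands in for any other course file):
#
#     files = ['02/PolyHEC/INF8007.txt', '02/PolyHEC/LOG2420.txt']
#     engine = SearchEngine(files)
#     engine.search('INF8007')   # -> [('LOG2420', <cosine score>)]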
# --------------------------------------------------------------------------------- main application
def main(path, acronym, n=10, be_verbose=True):
    title, description = parse_course(join(path, acronym + '.txt'))
    if be_verbose:
        print("Searching for courses similar to {0} ({1}):".format(acronym, title))
    files = [join(path, f) for f in listdir(path) if isfile(join(path, f))]
    search_result = SearchEngine(files).search(acronym, sort=True)
    for acr, score in search_result[:n]:
        if be_verbose:
            title, description = parse_course(join(path, acr + '.txt'))
            print(" - {acronym}: {title} (score={score})".format(acronym=acr, title=title,
                                                                 score=score))
            print(description + "\n")
        else:
            print('{}: {}'.format(acr, score))
if __name__ == '__main__':
    args = parse_arguments()
    main(path=args.path, acronym=args.acronym, n=args.length, be_verbose=args.verbose)