Keyword_Extractor.py
import re
import sys
import os
import math
import operator

import nltk
import pandas as pd


class Extractor:
    def __init__(self):
        # skill lists are read from two plain-text files in the working directory
        self.softskills = self.load_skills('softskills.txt')
        self.hardskills = self.load_skills('hardskills.txt')
        # the last two command-line arguments are the job-description file and the CV file
        self.jb_distribution = self.build_ngram_distribution(sys.argv[-2])
        self.cv_distribution = self.build_ngram_distribution(sys.argv[-1])
        self.table = []
        self.outFile = "Extracted_keywords.csv"

    def load_skills(self, filename):
        skills = []
        with open(filename, 'r') as f:
            for line in f:
                # remove punctuation and upper case
                skills.append(self.clean_phrase(line))
        return list(set(skills))  # remove duplicates

    def build_ngram_distribution(self, filename):
        n_s = [1, 2, 3]  # mono-, bi-, and tri-grams
        dist = {}
        for n in n_s:
            dist.update(self.parse_file(filename, n))
        return dist
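    # Note: merging the per-n results with dict.update() is safe here because a
    # phrase's word count fixes its n, so 1-, 2-, and 3-gram keys never collide.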

    def parse_file(self, filename, n):
        results = {}
        with open(filename, 'r') as f:
            for line in f:
                words = self.clean_phrase(line).split(" ")
                for tup in self.ngrams(words, n):
                    phrase = " ".join(tup)
                    results[phrase] = results.get(phrase, 0) + 1
        return results

    def clean_phrase(self, line):
        # strip newlines and tabs, lower-case, then drop punctuation
        return re.sub(r'[^\w\s]', '', line.replace('\n', '').replace('\t', '').lower())
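    # For example, clean_phrase("Machine Learning!\n") returns "machine learning".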

    def ngrams(self, input_list, n):
        # slide an n-word window over the word list by zipping shifted copies
        return list(zip(*[input_list[i:] for i in range(n)]))
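    # For example, ngrams(["machine", "learning", "engineer"], 2) returns
    # [("machine", "learning"), ("learning", "engineer")].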

    def measure1(self, v1, v2):
        # signed gap between job-description count and CV count
        return v1 - v2

    def measure2(self, v1, v2):
        # same gap, floored at zero (ignores skills the CV over-represents)
        return max(v1 - v2, 0)

    def measure3(self, v1, v2):
        # cosine similarity of v1 and v2: (v1 . v2) / (||v1|| * ||v2||)
        sumxx, sumxy, sumyy = 0, 0, 0
        for x, y in zip(v1, v2):
            sumxx += x * x
            sumyy += y * y
            sumxy += x * y
        if sumxx == 0 or sumyy == 0:
            return 0.0  # guard against division by zero for all-zero vectors
        return sumxy / math.sqrt(sumxx * sumyy)

    def sendToFile(self):
        # start from a clean slate if a previous output file exists
        try:
            os.remove(self.outFile)
        except OSError:
            pass
        df = pd.DataFrame(self.table, columns=['type', 'skill', 'job', 'cv', 'm1', 'm2'])
        df_sorted = df.sort_values(by=['job', 'cv'], ascending=[False, False])
        df_sorted.to_csv(self.outFile, columns=['type', 'skill', 'job', 'cv'], index=False)
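    # The m1/m2 columns are kept in self.table for printMeasures() below but
    # are not written to the CSV.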

    def printMeasures(self):
        # m1/m2 totals over all rows of the table
        v1 = [row[4] for row in self.table]
        v2 = [row[5] for row in self.table]
        print("Measure 1:", sum(v1))
        print("Measure 2:", sum(v2))
        # cosine similarity between the job-description and CV count vectors
        v1 = [row[2] for row in self.table]
        v2 = [row[3] for row in self.table]
        print("Measure 3 (cosine sim):", self.measure3(v1, v2))
        for skill_type in ['hard', 'soft', 'general']:
            v1 = [row[2] for row in self.table if row[0] == skill_type]
            v2 = [row[3] for row in self.table if row[0] == skill_type]
            print("Cosine similarity for " + skill_type + " skills: " + str(self.measure3(v1, v2)))

    def makeTable(self):
        # keep only verbs, nouns, adverbs, and adjectives (Penn Treebank tags, plus CD and MD)
        parts_of_speech = ['CD', 'JJ', 'JJR', 'JJS', 'MD', 'NN', 'NNS', 'NNP', 'NNPS',
                           'RB', 'RBR', 'RBS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
        graylist = ["you", "will"]
        tmp_table = []
        # look for the known skills in the job description, then in the CV
        for skill_type, skills in (('hard', self.hardskills), ('soft', self.softskills)):
            for skill in skills:
                if skill in self.jb_distribution:
                    count_jb = self.jb_distribution[skill]
                    count_cv = self.cv_distribution.get(skill, 0)
                    m1 = self.measure1(count_jb, count_cv)
                    m2 = self.measure2(count_jb, count_cv)
                    tmp_table.append([skill_type, skill, count_jb, count_cv, m1, m2])
        # And now for the general language of the job description:
        # sort the distribution by the phrases most used in the job description
        general_language = sorted(self.jb_distribution.items(), key=operator.itemgetter(1), reverse=True)
        for skill, count_jb in general_language:
            if skill in self.hardskills or skill in self.softskills or skill in graylist:
                continue
            tokens = nltk.word_tokenize(skill)
            parts = nltk.pos_tag(tokens)
            # keep the phrase only if every token carries a whitelisted tag
            if all(tag in parts_of_speech for _, tag in parts):
                count_cv = self.cv_distribution.get(skill, 0)
                m1 = self.measure1(count_jb, count_cv)
                m2 = self.measure2(count_jb, count_cv)
                tmp_table.append(['general', skill, count_jb, count_cv, m1, m2])
        self.table = tmp_table


def main():
    # minimal argument check so the script fails with a clear message
    if len(sys.argv) < 3:
        print("Usage: python Keyword_Extractor.py <job_description.txt> <cv.txt>")
        sys.exit(1)
    K = Extractor()
    K.makeTable()
    K.sendToFile()
    K.printMeasures()


if __name__ == "__main__":
    main()
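
# A minimal run sketch (the input filenames here are hypothetical; the script
# reads softskills.txt and hardskills.txt from the working directory and takes
# the job-description and CV text files as its last two arguments):
#
#   python Keyword_Extractor.py job_description.txt cv.txt
#
# nltk.word_tokenize and nltk.pos_tag require the 'punkt' tokenizer and the
# 'averaged_perceptron_tagger' model, which can be fetched once with:
#
#   python -c "import nltk; nltk.download('punkt'); nltk.download('averaged_perceptron_tagger')"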