-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathStanford+NLTK.py
120 lines (105 loc) · 3.13 KB
/
Stanford+NLTK.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
from nltk.parse.stanford import StanfordParser
from nltk.parse.stanford import StanfordDependencyParser
import pprint
import re
import string
from nltk.tree import *
# Uses the Stanford parser and NLTK to count noun, verb and prepositional phrases per sentence and to measure their average length in tokens.
# Directory holding the unpacked Stanford parser distribution.  The trailing
# slash matters: the jar paths below are built by plain string concatenation.
parser_folder = "stanford-parser-full-2017-06-09/"

# Parser code jar and the matching models jar, both inside parser_folder.
parser_jar = parser_folder + "stanford-parser.jar"
models_jar = parser_folder + "stanford-parser-3.8.0-models.jar"

# Constituency parser; its raw_parse() output is what countTrees consumes.
const_parser = StanfordParser(
    path_to_jar=parser_jar,
    path_to_models_jar=models_jar,
)

# Dependency parser built from the same pair of jars.
dep_parser = StanfordDependencyParser(
    path_to_jar=parser_jar,
    path_to_models_jar=models_jar,
)
def countTrees(tree):
    """Count noun, verb and prepositional phrases in parsed sentences.

    Walks every subtree of every parse tree in *tree* and tallies the
    subtrees labelled ``NP``, ``VP`` and ``PP``, together with the number of
    leaves each of them spans.

    Args:
        tree: Iterable of parse trees (for example the iterator returned by
            the parser's ``raw_parse``).  Each tree must provide
            ``subtrees()``; each subtree must provide ``label()`` and
            ``leaves()``.

    Returns:
        A six-element list ``[nps, vps, pps, np_avg, vp_avg, pp_avg]``: the
        number of NP, VP and PP subtrees found, followed by the mean number
        of leaves per NP, VP and PP.  A mean is ``0.0`` when no phrase of
        that type was found.
    """
    phrase_labels = ('NP', 'VP', 'PP')

    # Leaf count of every phrase found, grouped by label.  The phrase count
    # is the length of each list, so counting starts at zero; starting the
    # counters at one (as an implicit guard against dividing by zero)
    # inflated every count by one and skewed every average.
    lengths = {label: [] for label in phrase_labels}

    for item in tree:
        for subtree in item.subtrees():
            label = subtree.label()
            if label in lengths:
                lengths[label].append(len(subtree.leaves()))

    counts = [len(lengths[label]) for label in phrase_labels]
    # Explicit empty check replaces the old "start at 1" trick for avoiding
    # ZeroDivisionError.
    averages = [
        sum(lengths[label]) / len(lengths[label]) if lengths[label] else 0.0
        for label in phrase_labels
    ]
    return counts + averages
def _mean(values):
    """Return the arithmetic mean of *values*, or 0.0 when it is empty."""
    return sum(values) / len(values) if values else 0.0


def _phrase_stats(sentences):
    """Parse each sentence and gather the countTrees statistics column-wise.

    Returns six lists, in the order countTrees reports them (NP, VP, PP
    counts, then NP, VP, PP average lengths), each holding one value per
    sentence.
    """
    columns = [[] for _ in range(6)]
    for sentence in sentences:
        stats = countTrees(const_parser.raw_parse(sentence))
        for column, value in zip(columns, stats):
            column.append(value)
    return columns


def _print_report(level, np_ct, vp_ct, pp_ct, np_len, vp_len, pp_len):
    """Print the per-sentence averages of the six statistics for one level."""
    print("Printing NP, VP, PP counts for {} sentences: ".format(level))
    print(_mean(np_ct))
    print(_mean(vp_ct))
    print(_mean(pp_ct))
    print("Printing NPLen, VPLen, PPLen for {} sentences: ".format(level))
    print(_mean(np_len))
    print(_mean(vp_len))
    print(_mean(pp_len))


# Read the corpus: drop the '*******' separator markers and the newline from
# every line, and keep whatever is left when it is not empty.  The context
# manager guarantees the file is closed once reading is done.
mainlist = []
with open('/Users/Ivana/Desktop/RA Sowmya/INTADV.txt', 'r') as text:
    for line in text:
        line = line.replace('*******', '').replace('\n', '')
        if line != '':
            mainlist.append(line)

# The kept lines alternate between the two levels: 1st, 3rd, 5th, ... go to
# ELE and 2nd, 4th, 6th, ... go to INT.
# NOTE(review): the input file is named INTADV.txt while the variables and
# printed labels say Elementary/Intermediate -- confirm the labels match the
# data being read.
ELE = mainlist[0::2]
INT = mainlist[1::2]
ADV = []  # never populated in this script
print(len(ELE))
print(len(INT))

# Statistics for the first group of sentences.
NPct, VPct, PPct, NPLen, VPLen, PPLen = _phrase_stats(ELE)
_print_report("Elementary", NPct, VPct, PPct, NPLen, VPLen, PPLen)

# Statistics for the second group of sentences.
NPctI, VPctI, PPctI, NPLenI, VPLenI, PPLenI = _phrase_stats(INT)
_print_report("Intermediate", NPctI, VPctI, PPctI, NPLenI, VPLenI, PPLenI)