-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsyntax_pcfg_shinjini.py
96 lines (75 loc) · 2.79 KB
/
syntax_pcfg_shinjini.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# -*- coding: utf-8 -*-
"""
A Probabilistic Context Free Grammer (PCFG) Parser implemented a weighted graph search
Code adapted from Zhang Long's NLP models
"""
import codecs
from collections import defaultdict
import math
import os
f_grammar = os.path.join("test", "08-grammar.txt")
non_terminal = []
preterm = defaultdict(list)
grammar_file = codecs.open(f_grammar, 'r', 'utf-8')
index = 0
for rule in grammar_file:
words = rule.split('\t')
lhs = words[0]
rhs = words[1]
prob = float(words[2])
rhs_symbols = rhs.split(' ')
if len(rhs_symbols) == 1:
preterm[rhs].append([lhs, math.log(prob)])
else:
non_terminal.insert(index, [lhs, rhs_symbols[0], rhs_symbols[1], math.log(prob)])
# add pre-terminals
f_text = os.path.join("test", "08-input.txt")
text_file = codecs.open(f_text, 'r', 'utf-8')
# init best score with lowest level
best_score = defaultdict(lambda: float('-inf'))
best_edge = {}
for line in text_file:
words = line.split(' ')
for i in range(len(words)):
word = words[i].strip()
for item in (preterm[word]):
lhs = item[0]
log_prob = item[1]
ibs = lhs + ' ' + str(i) + ' ' + str(i + 1)
best_score[ibs] = log_prob
text_file.close()
# cyk, calculate the rest levels
text_file = codecs.open(f_text, 'r', 'utf-8')
my_lp = float('-inf')
for j in range(2, len(words) + 1):
for i in range(j - 2, -1, -1):
for k in range(i + 1, j):
# rules in grammar table
for nrul in range(len(non_terminal)):
sym = non_terminal[nrul][0]
lsym = non_terminal[nrul][1]
rsym = non_terminal[nrul][2]
logprob = non_terminal[nrul][3]
ilsym = lsym + ' ' + str(i) + ' ' + str(k)
irsym = rsym + ' ' + str(k) + ' ' + str(j)
if best_score[ilsym] > float('-inf') and best_score[irsym] > float('-inf'):
my_lp = best_score[ilsym] + best_score[irsym] + logprob
isymi = sym + ' ' + str(i) + ' ' + str(j)
if my_lp > best_score[isymi]:
best_score[isymi] = my_lp
best_edge[isymi] = [ilsym, irsym]
def my_print(sym, best_edge, words):
if sym in best_edge:
symp = sym.split(' ')[0]
return "(" + symp + " " \
+ my_print(best_edge[sym][0], best_edge, words) + " " + my_print(best_edge[sym][1], best_edge, words) \
+ ")"
else:
i = sym.split(' ')[1]
symp = sym.split(' ')[0]
return "(" + sym + " " + words[int(i)] + ")"
print(my_print('S 0 7', best_edge, words))
def main():
pass
if __name__ == '__main__':
main()