-
Notifications
You must be signed in to change notification settings - Fork 1
/
lib_transcribe.rb
65 lines (59 loc) · 1.54 KB
/
lib_transcribe.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
require 'unicode_utils/downcase'
# Fills +ipa_hash+ from tab-separated dictionary text.
#
# Every line of +dict+ is split on TAB into a word and its transcription.
# The word is lower-cased with UnicodeUtils.downcase and used as the key.
# A word containing an ASCII capital letter never replaces a key that
# already holds a truthy value; any other word overwrites the key.
# Lines that yield no word at all (blank lines) are skipped.
#
# @param dict [String] dictionary text, one "word<TAB>transcription" per line
# @param ipa_hash [Hash] lookup table, modified in place
# @return [Object] the result of dict.each_line (the caller in this file ignores it)
def read_dict(dict, ipa_hash)
  dict.each_line do |line|
    word, ipa = line.chomp.split("\t")
    # A blank line splits to an empty array, so there is no word to store.
    next unless word

    norm_word = UnicodeUtils.downcase(word)
    # Keep the entry that is already there when this variant is capitalised.
    next if word =~ /[A-Z]/ && ipa_hash[norm_word]

    ipa_hash[norm_word] = ipa
  end
end
# Fills +wordlist_hash+ from tab-separated word-list text.
#
# Every line of +wordlist+ is split on TAB into a word and its transcription.
# The word is lower-cased with UnicodeUtils.downcase and used as the key.
# A word containing an ASCII capital letter never replaces a key that
# already holds a truthy value; any other word overwrites the key.
# Lines that yield no word at all (blank lines) are skipped.
#
# @param wordlist [String] word-list text, one "word<TAB>transcription" per line
# @param wordlist_hash [Hash] lookup table, modified in place
# @return [Object] the result of wordlist.each_line (the caller in this file ignores it)
def read_wordlist(wordlist, wordlist_hash)
  wordlist.each_line do |line|
    word, ipa = line.chomp.split("\t")
    # A blank line splits to an empty array, so there is no word to store.
    next unless word

    norm_word = UnicodeUtils.downcase(word)
    # The duplicate check must look at the table being filled; no other
    # hash is in scope inside this method.
    next if word =~ /[A-Z]/ && wordlist_hash[norm_word]

    wordlist_hash[norm_word] = ipa
  end
end
# Transcribes the text stored in +file+ word by word.
#
# The dictionary in +dict_file+ is loaded through read_dict. When @wordlist
# is set, that file is loaded through read_wordlist and its entries take
# precedence over the dictionary.
#
# Line handling:
# * a line that does not start with "# " and that either contains "#" or "*"
#   or is empty is copied to the output unchanged;
# * every other line is lower-cased, stripped of the punctuation listed in
#   the character class below, split on whitespace and transcribed.
#
# Word handling:
# * a word-list hit is emitted with every "/" removed;
# * otherwise a dictionary hit is emitted with every "/" removed, keeping
#   only the part before the first ", " when the entry contains one;
# * otherwise the word itself is emitted with an "@" prefix.
#
# @param file [String] path of the text to transcribe
# @param dict_file [String] path of the tab-separated dictionary
# @return [String] the transcription; each transcribed line ends with "\n"
def transcribe_ipa(file, dict_file)
  source_text = File.read(file)
  dictionary_text = File.read(dict_file)
  dictionary = {}
  overrides = {}
  read_dict(dictionary_text, dictionary)
  read_wordlist(File.read(@wordlist), overrides) if @wordlist

  # Turns one normalised word into its output token (without the separator).
  render = lambda do |word|
    from_dict = dictionary[word]
    from_wordlist = overrides[word]
    if from_wordlist
      from_wordlist.gsub(/\//, "")
    elsif from_dict
      first_variant = from_dict.match(", ") ? from_dict.split(", ")[0] : from_dict
      first_variant.gsub(/\//, "")
    else
      "@" + word
    end
  end

  source_text.each_line.each_with_object("") do |line, result|
    verbatim = line !~ /^# / && (line =~ /[#\*]/ || line =~ /^$/)
    if verbatim
      result << line
      next
    end

    stripped = UnicodeUtils.downcase(line).gsub(/[,\.\?\!:;"’“”„«»\(\)?!,。、·…;:¿¡–—،؟\َُِِّْ]/, "")
    transcribed = stripped.split(" ").each_with_object("") do |word, acc|
      acc << render.call(word) + " "
    end
    # A leading "#" token has no dictionary entry and comes out as "@#";
    # turn it back into "#" and drop the trailing separator.
    result << transcribed.gsub(/@#/, "#").gsub(/\s+$/, "") + "\n"
  end
end