Skip to content
This repository has been archived by the owner on Aug 23, 2023. It is now read-only.

Added AI-Hub kspon (Korean speech dataset) related scripts #13

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion s5/data/local/lm/buildLM/_scripts_/genPronunciation_cmu.py
Original file line number Diff line number Diff line change
Expand Up @@ -1697,7 +1697,7 @@ def fUpper(match):
return match.group(0).upper()

def main():
filename='buildLM/_scripts_/cmudict-0.7b.txt'
filename=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'cmudict-0.7b.txt')
if not os.path.exists(filename):
print('No dictionary file: %s' % filename)
sys.exit()
Expand Down
74 changes: 74 additions & 0 deletions s5/local/generateExtraLexicon.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#!/bin/bash

# Copyright 2019 hwiorn <hwiorn@gmail.com>
# Apache 2.0

# Generate an extra pronunciation lexicon from kspon AM transcripts.
#
# Usage: generateExtraLexicon.sh <data-dir> <lm-dir>
#   <data-dir>  Kaldi data dir containing a 'text' file (utt-id + transcript).
#   <lm-dir>    LM dir containing zeroth_lexicon and buildLM/_scripts_/.
#
# Output: <data-dir>/extra_lexicon — pronunciations for words seen in the
# transcripts but not already present in <lm-dir>/zeroth_lexicon.

if [ "$#" -ne 2 ]; then
  echo "Usage: $0 <data-dir> <lm-dir>"
  echo " ex: $0 data/kspon data/local/lm"
  exit 1
fi
data_dir=$1
lm_dir=$2

text=$data_dir/text
script_dir=$lm_dir/buildLM/_scripts_

if [ ! -f "$text" ]; then
  echo "$0: No such file: $text"
  exit 1
fi

if [ ! -f "$script_dir/cmudict-0.7b.txt" ]; then
  echo "$0: No such cmudict dictionary file."
  exit 1
fi

# Drop the utterance-id column; keep only the transcript words.
cut -d' ' -f2- "$text" > "$text.tmp"

echo "$0: Segmenting word class"
# Insert a space wherever the character class changes (hangul <-> non-hangul,
# latin <-> non-latin, digit <-> non-digit) so each script run becomes its
# own token.
perl -Mutf8 -CS -pe 's/(?<=[가-힣])([^ 가-힣])/ \1/g; s/(?<=[^ 가-힣])([가-힣])/ \1/g;
    s/(?<=[a-zA-Z])([^ a-zA-Z])/ \1/g; s/(?<=[^ a-zA-Z])([a-zA-Z])/ \1/g;
    s/(?<=[0-9])([^0-9])/ \1/g; s/(?<=[^0-9])([0-9])/ \1/g' \
  < "$text.tmp" > "$text.tmp2"

echo "$0: Generating unique word from AM text"
# Split the corpus into ~11MB chunks; each chunk is tokenized (one word per
# line) and de-duplicated in parallel, then the chunk outputs are merged.
# The outer 'sort -u' is NOT redundant: the inner one runs per chunk inside
# parallel, so a global pass is still needed.
parallel -a "$text.tmp2" --pipepart --block=11M tr -s '[:space:]' '\\n' \| env LC_ALL=C sort -u \
  | env LC_ALL=C sort -u > "$data_dir/uniqWords.txt"

# Partition the vocabulary into hangul and non-hangul words.
grep -P '[가-힣]+_?' "$data_dir/uniqWords.txt" | env LC_ALL=C sort -u > "$data_dir/uniqWords.hangul"
grep -v -P '[가-힣]+_?' "$data_dir/uniqWords.txt" | env LC_ALL=C sort -u > "$data_dir/uniqWords.nonhangul"
# All-uppercase entries are treated as spelled-out alphabet words; the rest
# (numbers, mixed case, symbols) go to .etc.
grep -E "^[A-Z]+_? " "$data_dir/uniqWords.nonhangul" > "$data_dir/uniqWords.nonhangul.alphabet"
grep -v -E "^[A-Z]+_? " "$data_dir/uniqWords.nonhangul" | awk '{print $1}' > "$data_dir/uniqWords.nonhangul.etc"
env LC_ALL=C sort -u "$data_dir/uniqWords.nonhangul.alphabet" "$data_dir/uniqWords.nonhangul.etc" \
  > "$data_dir/uniqWords.nonhangul.sorted"

echo "$0: Generating pronunciation for non-hangul morphemes"
env LC_ALL=en_US.UTF-8 "$script_dir/genPronunciation_cmu.py" "$data_dir/uniqWords.nonhangul.sorted" > "$data_dir/tmp"
env LC_ALL=en_US.UTF-8 "$script_dir/genPronunciation.py" "$data_dir/tmp" > "$data_dir/tmp2"

# Entries that gained a TAB-separated pronunciation field succeeded; single
# field lines are words with no pronunciation found.
awk -F'\t' '{if(NF>1){print $0}}' "$data_dir/tmp2" > "$data_dir/uniqWords.nonhangul.sorted.pron"
awk -F'\t' '{if(NF<2){print $0}}' "$data_dir/tmp2" > "$data_dir/noPronList"
noPronCount=$(wc -l < "$data_dir/noPronList")
if [ "$noPronCount" -ne 0 ]; then
  # Deliberately non-fatal: report the uncovered morphemes and continue.
  echo "$0: There exist morphemes without pronunciation, plz check noPronList: $noPronCount"
  head "$data_dir/noPronList"
  echo "... (omitted) ..."
  #rm -f $data_dir/noPronList
  #exit 1
fi

echo "$0: Generating pronunciation"
cat "$data_dir/uniqWords.nonhangul.sorted.pron" "$data_dir/uniqWords.hangul" > "$data_dir/finalList"
# NOTE(review): genPhoneSeq.py appears to write ./dic.pronun into the current
# directory (it is consumed and moved below) — confirm against the script.
env LC_ALL=en_US.UTF-8 "$script_dir/genPhoneSeq.py" "$data_dir/finalList"

echo "$0: Extracting uniq lexicon"
# Strip whitespace-only lines from the generated lexicon.
env LC_ALL=en_US.UTF-8 "$script_dir/genLexicon.py" dic.pronun | perl -pe 's/^\s+$//g' > "$data_dir/extra_lexicon.tmp"
# Keep only entries that are not already in the base zeroth lexicon.
utils/filter_scp.pl --exclude "$lm_dir/zeroth_lexicon" "$data_dir/extra_lexicon.tmp" > "$data_dir/extra_lexicon"
mv -f dic.pronun "$data_dir/dic.pronun"

# Drop empty outputs so downstream '-s' existence checks behave as expected.
[ ! -s "$data_dir/noPronList" ] && rm -f "$data_dir/noPronList"
[ ! -s "$data_dir/extra_lexicon" ] && rm -f "$data_dir/extra_lexicon"

# Clean up all intermediates (globs intentionally unquoted so they expand).
rm -f "$text".tmp* "$data_dir"/tmp* "$data_dir"/*.tmp* "$data_dir"/uniqWords.* "$data_dir"/{dic.pronun,finalList,pronoun.dict}
echo "$0: done"

exit 0
131 changes: 131 additions & 0 deletions s5/local/kspon_data_prep.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
#!/bin/bash
# Copyright 2019 hwiorn <hwiorn@gmail.com>
# Apache 2.0.

# Data preparation for the AI-Hub KsponSpeech corpus: unzips the five
# dataset parts, indexes every .pcm/.txt pair, normalizes the cp949
# transcripts, and emits a Kaldi data dir (wav.scp, text, utt2spk, spk2utt,
# utt2dur, pronoun.dict).
#
# Usage: kspon_data_prep.sh [--nj N] [--max-gen N] <src-dir> <dst-dir>

nj=8
max_gen=10

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

if [ "$#" -ne 2 ]; then
echo "Usage: $0 <src-dir> <dst-dir>"
echo "e.g.: $0 /mnt/data/aihub/KsponSpeechAll data/aihub_kspon"
exit 1
fi

# Strip any trailing slash so path concatenation below is clean.
src=${1%/}
dst=${2%/}

echo "$0: Data preparation for AI Hub Korean speech datasets(2000 speakers, ~1000hrs)"
mkdir -p $dst || exit 1

[ ! -d $src ] && echo "$0: no such directory $src" && exit 1
# The corpus ships as five numbered parts: KsponSpeech_01 .. KsponSpeech_05.
part_dirs=( $src/KsponSpeech_0{1,2,3,4,5} )

echo $0: Unzipping datasets ...
# Unzip each part in a background subshell; a .done marker makes this step
# idempotent across reruns.
for part in ${part_dirs[@]}; do
if [ ! -f $part/.done ]; then
if [ ! -f $part.zip ]; then
echo $0: $part.zip is not exists
exit 1
fi

(
set -e
unzip -oqq $part.zip -d $src/
touch $part/.done
) &
fi
done
# Barrier: wait for all background unzip jobs before indexing files.
wait $(jobs -p)

# Emit one TAB-separated index line per .pcm file:
#   utt-id (path with '/' -> '_'), pcm path, raw transcript (cp949 -> utf-8),
#   then any "(a)/(b)" pronunciation-variant pairs found in the transcript.
gen_item() {
path=${1%.pcm}
txt=$path.txt
echo -en "$path\t" | perl -pe 's:^./::g;s:/:_:g'
echo -en "$1\t"
perl -MEncode -ne '$_=encode("utf-8", decode("cp949", $_)); chomp; @dict=();
s:\r::g; # remove LF
for(m:(\(.+?\)/\(.+?\)):) { push @dict, $1; }
$d=join "\t",@dict; # gather up filler words.
print "$_\t$d\n";' $txt
}
# Exported so GNU parallel's subshells can invoke it.
export -f gen_item

echo -n "$0: Generating files.info "
iter=0
old_iter=0
files_info=()
# Build one files.info index per part, at most max_gen parts in flight.
for part_dir in ${part_dirs[@]}; do
files_info+=($part_dir/files.info)
if [ ! -s "$part_dir/files.info" ]; then
( find $part_dir -name '*.pcm'| parallel gen_item {} > $part_dir/files.info ) &
iter=$[iter+1]
fi

# Every max_gen launched jobs, pause and drain the job pool.
if [ $old_iter -ne $iter -a $[iter%max_gen] -eq 0 ]; then
echo -n .
wait $(jobs -p)
fi
old_iter=$iter
done
echo
wait $(jobs -p)

echo -n "$0: Generating kspon dataset from files.info "
# Truncate/create all four output accumulators in one compound redirection.
:>$dst/wav.scp.tmp>$dst/text.tmp>$dst/utt2spk.tmp>$dst/pronoun.dict.tmp
for info in ${files_info[@]}; do
echo -n .
# No speaker labels in kspon: each utterance is its own speaker.
awk -F'\t' '{print $1 " " $1}' $info >>$dst/utt2spk.tmp || exit 1

# Normalize transcripts: strip kspon annotation marks (u/, o/, l/, b/, n/,
# +, *, fillers, dots, hyphens) and pick the second of each "(a)/(b)"
# representation pair. Rule order matters; see the reference below.
# Ref. http://ai-hub.promptech.co.kr/notice_product/569
perl -F'\t' -ane '#next if ($F[2] =~ m:u/:); # Skip the inaudiable(or unclear) utterance of sentence
$F[2] =~ s:(\d+)\s*/\s*(\d+):\2 분에 \1:g; #Chagne division mark
$F[2] =~ s:\.+:.:g; # remove multi-dots
$F[2] =~ s:([a-zA-Z])\.([a-zA-Z])\.:\1\2:g; # e.g., D.C.
$F[2] =~ s:u/::g; # Unclear utterance of sentence mark
$F[2] =~ s:o/::g; # Noise mark of utterance
$F[2] =~ s:[lbn]/::g; # Breath, laugh, BG noise mark
$F[2] =~ s:([가-힣]+?)/:\1:g; # Replace a interjection(filler words)
$F[2] =~ s:\+::g; # Utterance repetation mark
$F[2] =~ s:\Q*\E::g; # Unclear words utterance mark
$F[2] =~ s:[\?\#\!,]::g; # Some other symbols
$F[2] =~ s:([^\d])\.([^\d]):\1\2:g; # Remove dot with non-numbers
$F[2] =~ s:([\d])\.([^\d ]):\1\2:g; # Remove dot with non-numbers
$F[2] =~ s:([^\d ])\.([\d]):\1\2:g; # Remove dot with non-numbers
#$F[2] =~ s:\((.+?)\)/\((.+?)\):\1:g; #representation (it needs too much exception)
$F[2] =~ s:\((.+?)\)/\((.+?)\):\2:g; #representation
$F[2] =~ s:([\w가-힣])-([\w가-힣]):\1 \2:g; #remove hyphen mark used to join words
$F[2] =~ s:/::g; # remove some slash
$F[2] =~ s:^[\s\.]+::g; # trim left
$F[2] =~ s:[\s\.]+$::g; # trim right
$F[2] =~ s: +: :g; # remove multi-spaces
print "$F[0] $F[2]\n";' $info >>$dst/text.tmp || exit 1
# Raw 16kHz 16-bit little-endian PCM: decode to wav on the fly via a sox pipe.
awk -F'\t' '{print $1 " sox -t raw -r 16k -b 16 -e signed-integer -L \"" $2 "\" -t wav - | "}' $info >>$dst/wav.scp.tmp || exit 1
# Columns 4+ hold the "(spoken)/(written)" pairs; keep them as a dictionary.
cut -f4- $info | perl -lane 'next if(/^$/); print "$1\t$2" if(/\(\s*(.+?)\s*\)\/\(\s*(.+?)\s*\)/)' >>$dst/pronoun.dict.tmp || exit 1
done
echo

# Kaldi requires byte-wise (LC_ALL=C) sorted, duplicate-free tables.
echo $0: Sorting kspon dataset
env LC_ALL=C sort -u $dst/pronoun.dict.tmp > $dst/pronoun.dict
env LC_ALL=C sort -u $dst/text.tmp > $dst/text.tmp2
env LC_ALL=C sort -u $dst/utt2spk.tmp > $dst/utt2spk.tmp2
env LC_ALL=C sort -u $dst/wav.scp.tmp > $dst/wav.scp.tmp2

# Keep only utterances that survived normalization into 'text'.
echo $0: Filtering inaudiable data
mv $dst/text.tmp2 $dst/text
utils/filter_scp.pl $dst/text $dst/utt2spk.tmp2 > $dst/utt2spk || exit 1
utils/filter_scp.pl $dst/text $dst/wav.scp.tmp2 > $dst/wav.scp || exit 1
rm -f $dst/*.tmp $dst/*.tmp2

echo $0: Generating spk2utt from utt2spk
utils/utt2spk_to_spk2utt.pl <$dst/utt2spk >$dst/spk2utt || exit 1
echo $0: Generating utt2dur
utils/data/get_utt2dur.sh --cmd "$train_cmd" --nj $nj $dst 1>&2 || exit 1
echo $0: Checking data
utils/validate_data_dir.sh --no-feats $dst || exit 1;

echo $0: done
exit 0
4 changes: 4 additions & 0 deletions s5/local/prepare_dict.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ lexicon_raw_nosil=$dst_dir/lexicon_raw_nosil.txt

# Build lexicon_raw_nosil from the base zeroth lexicon, merging in any extra
# lexicon entries if present. All expansions quoted (paths may contain
# spaces/globs).
if [[ ! -s "$lexicon_raw_nosil" ]]; then
  cp "$lm_dir/zeroth_lexicon" "$lexicon_raw_nosil" || exit 1
  # NOTE(review): extra_lexicon is presumably produced by
  # local/generateExtraLexicon.sh — confirm. Merge keeps the list
  # byte-sorted and duplicate-free, as Kaldi tooling expects.
  if [ -s "$lm_dir/extra_lexicon" ]; then
    env LC_ALL=C sort -u "$lexicon_raw_nosil" "$lm_dir/extra_lexicon" > "$lexicon_raw_nosil.tmp"
    mv -f "$lexicon_raw_nosil.tmp" "$lexicon_raw_nosil"
  fi
fi

silence_phones=$dst_dir/silence_phones.txt
Expand Down
Loading