Skip to content
This repository has been archived by the owner on Aug 23, 2023. It is now read-only.

Added AI-Hub kspon (Korean speech dataset) related scripts #13

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion s5/data/local/lm/buildLM/_scripts_/genPronunciation_cmu.py
Original file line number Diff line number Diff line change
Expand Up @@ -1697,7 +1697,7 @@ def fUpper(match):
return match.group(0).upper()

def main():
filename='buildLM/_scripts_/cmudict-0.7b.txt'
filename=os.path.join(os.path.dirname(os.path.abspath(__file__)), 'cmudict-0.7b.txt')
if not os.path.exists(filename):
print('No dictionary file: %s' % filename)
sys.exit()
Expand Down
74 changes: 74 additions & 0 deletions s5/local/generateExtraLexicon.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#!/bin/bash

# Copyright 2019 hwiorn <hwiorn@gmail.com>
# Apache 2.0

# Generate an extra pronunciation lexicon from kspon AM transcripts.
#
# Usage: generateExtraLexicon.sh <data-dir> <lm-dir>
#   <data-dir>  Kaldi data dir containing a 'text' file (utt-id + transcript).
#   <lm-dir>    LM dir containing zeroth_lexicon and buildLM/_scripts_/.
#
# Output: <data-dir>/extra_lexicon — pronunciations for words seen in the
# transcripts but not already present in <lm-dir>/zeroth_lexicon.

if [ "$#" -ne 2 ]; then
  echo "Usage: $0 <data-dir> <lm-dir>"
  echo " ex: $0 data/kspon data/local/lm"
  exit 1
fi
data_dir=$1
lm_dir=$2

text=$data_dir/text
script_dir=$lm_dir/buildLM/_scripts_

if [ ! -f "$text" ]; then
  echo "$0: No such file: $text"
  exit 1
fi

if [ ! -f "$script_dir/cmudict-0.7b.txt" ]; then
  echo "$0: No such cmudict dictionary file."
  exit 1
fi

# Drop the utterance-id column; keep only the transcript words.
cut -d' ' -f2- "$text" > "$text.tmp"

echo "$0: Segmenting word class"
# Insert a space wherever the character class changes (hangul <-> non-hangul,
# latin <-> non-latin, digit <-> non-digit) so each script run becomes its
# own token.
perl -Mutf8 -CS -pe 's/(?<=[가-힣])([^ 가-힣])/ \1/g; s/(?<=[^ 가-힣])([가-힣])/ \1/g;
    s/(?<=[a-zA-Z])([^ a-zA-Z])/ \1/g; s/(?<=[^ a-zA-Z])([a-zA-Z])/ \1/g;
    s/(?<=[0-9])([^0-9])/ \1/g; s/(?<=[^0-9])([0-9])/ \1/g' \
  < "$text.tmp" > "$text.tmp2"

echo "$0: Generating unique word from AM text"
# Split the corpus into ~11MB chunks; each chunk is tokenized (one word per
# line) and de-duplicated in parallel, then the chunk outputs are merged.
# The outer 'sort -u' is NOT redundant: the inner one runs per chunk inside
# parallel, so a global pass is still needed.
parallel -a "$text.tmp2" --pipepart --block=11M tr -s '[:space:]' '\\n' \| env LC_ALL=C sort -u \
  | env LC_ALL=C sort -u > "$data_dir/uniqWords.txt"

# Partition the vocabulary into hangul and non-hangul words.
grep -P '[가-힣]+_?' "$data_dir/uniqWords.txt" | env LC_ALL=C sort -u > "$data_dir/uniqWords.hangul"
grep -v -P '[가-힣]+_?' "$data_dir/uniqWords.txt" | env LC_ALL=C sort -u > "$data_dir/uniqWords.nonhangul"
# All-uppercase entries are treated as spelled-out alphabet words; the rest
# (numbers, mixed case, symbols) go to .etc.
grep -E "^[A-Z]+_? " "$data_dir/uniqWords.nonhangul" > "$data_dir/uniqWords.nonhangul.alphabet"
grep -v -E "^[A-Z]+_? " "$data_dir/uniqWords.nonhangul" | awk '{print $1}' > "$data_dir/uniqWords.nonhangul.etc"
env LC_ALL=C sort -u "$data_dir/uniqWords.nonhangul.alphabet" "$data_dir/uniqWords.nonhangul.etc" \
  > "$data_dir/uniqWords.nonhangul.sorted"

echo "$0: Generating pronunciation for non-hangul morphemes"
env LC_ALL=en_US.UTF-8 "$script_dir/genPronunciation_cmu.py" "$data_dir/uniqWords.nonhangul.sorted" > "$data_dir/tmp"
env LC_ALL=en_US.UTF-8 "$script_dir/genPronunciation.py" "$data_dir/tmp" > "$data_dir/tmp2"

# Entries that gained a TAB-separated pronunciation field succeeded; single
# field lines are words with no pronunciation found.
awk -F'\t' '{if(NF>1){print $0}}' "$data_dir/tmp2" > "$data_dir/uniqWords.nonhangul.sorted.pron"
awk -F'\t' '{if(NF<2){print $0}}' "$data_dir/tmp2" > "$data_dir/noPronList"
noPronCount=$(wc -l < "$data_dir/noPronList")
if [ "$noPronCount" -ne 0 ]; then
  # Deliberately non-fatal: report the uncovered morphemes and continue.
  echo "$0: There exist morphemes without pronunciation, plz check noPronList: $noPronCount"
  head "$data_dir/noPronList"
  echo "... (omitted) ..."
  #rm -f $data_dir/noPronList
  #exit 1
fi

echo "$0: Generating pronunciation"
cat "$data_dir/uniqWords.nonhangul.sorted.pron" "$data_dir/uniqWords.hangul" > "$data_dir/finalList"
# NOTE(review): genPhoneSeq.py appears to write ./dic.pronun into the current
# directory (it is consumed and moved below) — confirm against the script.
env LC_ALL=en_US.UTF-8 "$script_dir/genPhoneSeq.py" "$data_dir/finalList"

echo "$0: Extracting uniq lexicon"
# Strip whitespace-only lines from the generated lexicon.
env LC_ALL=en_US.UTF-8 "$script_dir/genLexicon.py" dic.pronun | perl -pe 's/^\s+$//g' > "$data_dir/extra_lexicon.tmp"
# Keep only entries that are not already in the base zeroth lexicon.
utils/filter_scp.pl --exclude "$lm_dir/zeroth_lexicon" "$data_dir/extra_lexicon.tmp" > "$data_dir/extra_lexicon"
mv -f dic.pronun "$data_dir/dic.pronun"

# Drop empty outputs so downstream '-s' existence checks behave as expected.
[ ! -s "$data_dir/noPronList" ] && rm -f "$data_dir/noPronList"
[ ! -s "$data_dir/extra_lexicon" ] && rm -f "$data_dir/extra_lexicon"

# Clean up all intermediates (globs intentionally unquoted so they expand).
rm -f "$text".tmp* "$data_dir"/tmp* "$data_dir"/*.tmp* "$data_dir"/uniqWords.* "$data_dir"/{dic.pronun,finalList,pronoun.dict}
echo "$0: done"

exit 0
131 changes: 131 additions & 0 deletions s5/local/kspon_data_prep.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
#!/bin/bash
# Copyright 2019 hwiorn <hwiorn@gmail.com>
# Apache 2.0.

# Data preparation for the AI-Hub KsponSpeech corpus: unzips the five
# dataset parts, indexes every .pcm/.txt pair, normalizes the cp949
# transcripts, and emits a Kaldi data dir (wav.scp, text, utt2spk, spk2utt,
# utt2dur, pronoun.dict).
#
# Usage: kspon_data_prep.sh [--nj N] [--max-gen N] <src-dir> <dst-dir>

nj=8
max_gen=10

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

if [ "$#" -ne 2 ]; then
echo "Usage: $0 <src-dir> <dst-dir>"
echo "e.g.: $0 /mnt/data/aihub/KsponSpeechAll data/aihub_kspon"
exit 1
fi

# Strip any trailing slash so path concatenation below is clean.
src=${1%/}
dst=${2%/}

echo "$0: Data preparation for AI Hub Korean speech datasets(2000 speakers, ~1000hrs)"
mkdir -p $dst || exit 1

[ ! -d $src ] && echo "$0: no such directory $src" && exit 1
# The corpus ships as five numbered parts: KsponSpeech_01 .. KsponSpeech_05.
part_dirs=( $src/KsponSpeech_0{1,2,3,4,5} )

echo $0: Unzipping datasets ...
# Unzip each part in a background subshell; a .done marker makes this step
# idempotent across reruns.
for part in ${part_dirs[@]}; do
if [ ! -f $part/.done ]; then
if [ ! -f $part.zip ]; then
echo $0: $part.zip is not exists
exit 1
fi

(
set -e
unzip -oqq $part.zip -d $src/
touch $part/.done
) &
fi
done
# Barrier: wait for all background unzip jobs before indexing files.
wait $(jobs -p)

# Emit one TAB-separated index line per .pcm file:
#   utt-id (path with '/' -> '_'), pcm path, raw transcript (cp949 -> utf-8),
#   then any "(a)/(b)" pronunciation-variant pairs found in the transcript.
gen_item() {
path=${1%.pcm}
txt=$path.txt
echo -en "$path\t" | perl -pe 's:^./::g;s:/:_:g'
echo -en "$1\t"
perl -MEncode -ne '$_=encode("utf-8", decode("cp949", $_)); chomp; @dict=();
s:\r::g; # remove LF
for(m:(\(.+?\)/\(.+?\)):) { push @dict, $1; }
$d=join "\t",@dict; # gather up filler words.
print "$_\t$d\n";' $txt
}
# Exported so GNU parallel's subshells can invoke it.
export -f gen_item

echo -n "$0: Generating files.info "
iter=0
old_iter=0
files_info=()
# Build one files.info index per part, at most max_gen parts in flight.
for part_dir in ${part_dirs[@]}; do
files_info+=($part_dir/files.info)
if [ ! -s "$part_dir/files.info" ]; then
( find $part_dir -name '*.pcm'| parallel gen_item {} > $part_dir/files.info ) &
iter=$[iter+1]
fi

# Every max_gen launched jobs, pause and drain the job pool.
if [ $old_iter -ne $iter -a $[iter%max_gen] -eq 0 ]; then
echo -n .
wait $(jobs -p)
fi
old_iter=$iter
done
echo
wait $(jobs -p)

echo -n "$0: Generating kspon dataset from files.info "
# Truncate/create all four output accumulators in one compound redirection.
:>$dst/wav.scp.tmp>$dst/text.tmp>$dst/utt2spk.tmp>$dst/pronoun.dict.tmp
for info in ${files_info[@]}; do
echo -n .
# No speaker labels in kspon: each utterance is its own speaker.
awk -F'\t' '{print $1 " " $1}' $info >>$dst/utt2spk.tmp || exit 1

# Normalize transcripts: strip kspon annotation marks (u/, o/, l/, b/, n/,
# +, *, fillers, dots, hyphens) and pick the second of each "(a)/(b)"
# representation pair. Rule order matters; see the reference below.
# Ref. http://ai-hub.promptech.co.kr/notice_product/569
perl -F'\t' -ane '#next if ($F[2] =~ m:u/:); # Skip the inaudiable(or unclear) utterance of sentence
$F[2] =~ s:(\d+)\s*/\s*(\d+):\2 분에 \1:g; #Chagne division mark
$F[2] =~ s:\.+:.:g; # remove multi-dots
$F[2] =~ s:([a-zA-Z])\.([a-zA-Z])\.:\1\2:g; # e.g., D.C.
$F[2] =~ s:u/::g; # Unclear utterance of sentence mark
$F[2] =~ s:o/::g; # Noise mark of utterance
$F[2] =~ s:[lbn]/::g; # Breath, laugh, BG noise mark
$F[2] =~ s:([가-힣]+?)/:\1:g; # Replace a interjection(filler words)
$F[2] =~ s:\+::g; # Utterance repetation mark
$F[2] =~ s:\Q*\E::g; # Unclear words utterance mark
$F[2] =~ s:[\?\#\!,]::g; # Some other symbols
$F[2] =~ s:([^\d])\.([^\d]):\1\2:g; # Remove dot with non-numbers
$F[2] =~ s:([\d])\.([^\d ]):\1\2:g; # Remove dot with non-numbers
$F[2] =~ s:([^\d ])\.([\d]):\1\2:g; # Remove dot with non-numbers
#$F[2] =~ s:\((.+?)\)/\((.+?)\):\1:g; #representation (it needs too much exception)
$F[2] =~ s:\((.+?)\)/\((.+?)\):\2:g; #representation
$F[2] =~ s:([\w가-힣])-([\w가-힣]):\1 \2:g; #remove hyphen mark used to join words
$F[2] =~ s:/::g; # remove some slash
$F[2] =~ s:^[\s\.]+::g; # trim left
$F[2] =~ s:[\s\.]+$::g; # trim right
$F[2] =~ s: +: :g; # remove multi-spaces
print "$F[0] $F[2]\n";' $info >>$dst/text.tmp || exit 1
# Raw 16kHz 16-bit little-endian PCM: decode to wav on the fly via a sox pipe.
awk -F'\t' '{print $1 " sox -t raw -r 16k -b 16 -e signed-integer -L \"" $2 "\" -t wav - | "}' $info >>$dst/wav.scp.tmp || exit 1
# Columns 4+ hold the "(spoken)/(written)" pairs; keep them as a dictionary.
cut -f4- $info | perl -lane 'next if(/^$/); print "$1\t$2" if(/\(\s*(.+?)\s*\)\/\(\s*(.+?)\s*\)/)' >>$dst/pronoun.dict.tmp || exit 1
done
echo

# Kaldi requires byte-wise (LC_ALL=C) sorted, duplicate-free tables.
echo $0: Sorting kspon dataset
env LC_ALL=C sort -u $dst/pronoun.dict.tmp > $dst/pronoun.dict
env LC_ALL=C sort -u $dst/text.tmp > $dst/text.tmp2
env LC_ALL=C sort -u $dst/utt2spk.tmp > $dst/utt2spk.tmp2
env LC_ALL=C sort -u $dst/wav.scp.tmp > $dst/wav.scp.tmp2

# Keep only utterances that survived normalization into 'text'.
echo $0: Filtering inaudiable data
mv $dst/text.tmp2 $dst/text
utils/filter_scp.pl $dst/text $dst/utt2spk.tmp2 > $dst/utt2spk || exit 1
utils/filter_scp.pl $dst/text $dst/wav.scp.tmp2 > $dst/wav.scp || exit 1
rm -f $dst/*.tmp $dst/*.tmp2

echo $0: Generating spk2utt from utt2spk
utils/utt2spk_to_spk2utt.pl <$dst/utt2spk >$dst/spk2utt || exit 1
echo $0: Generating utt2dur
utils/data/get_utt2dur.sh --cmd "$train_cmd" --nj $nj $dst 1>&2 || exit 1
echo $0: Checking data
utils/validate_data_dir.sh --no-feats $dst || exit 1;

echo $0: done
exit 0
4 changes: 4 additions & 0 deletions s5/local/prepare_dict.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ lexicon_raw_nosil=$dst_dir/lexicon_raw_nosil.txt

# Build lexicon_raw_nosil from the base zeroth lexicon, merging in any extra
# lexicon entries if present. All expansions quoted (paths may contain
# spaces/globs).
if [[ ! -s "$lexicon_raw_nosil" ]]; then
  cp "$lm_dir/zeroth_lexicon" "$lexicon_raw_nosil" || exit 1
  # NOTE(review): extra_lexicon is presumably produced by
  # local/generateExtraLexicon.sh — confirm. Merge keeps the list
  # byte-sorted and duplicate-free, as Kaldi tooling expects.
  if [ -s "$lm_dir/extra_lexicon" ]; then
    env LC_ALL=C sort -u "$lexicon_raw_nosil" "$lm_dir/extra_lexicon" > "$lexicon_raw_nosil.tmp"
    mv -f "$lexicon_raw_nosil.tmp" "$lexicon_raw_nosil"
  fi
fi

silence_phones=$dst_dir/silence_phones.txt
Expand Down
Loading