Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add C# and JavaScript (wasm) API for MatchaTTS models #1682

Merged
merged 11 commits into from
Jan 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 21 additions & 11 deletions .github/scripts/test-dot-net.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,27 @@

cd dotnet-examples/

cd ./offline-speaker-diarization
cd ./offline-tts
./run-matcha-zh.sh
ls -lh *.wav
./run-matcha-en.sh
ls -lh *.wav
./run-aishell3.sh
ls -lh *.wav
./run-piper.sh
ls -lh *.wav
./run-hf-fanchen.sh
ls -lh *.wav
ls -lh

pushd ../..

mkdir tts

cp dotnet-examples/offline-tts/*.wav ./tts
popd

cd ../offline-speaker-diarization
./run.sh
rm -rfv *.onnx
rm -fv *.wav
Expand Down Expand Up @@ -76,14 +96,4 @@ cd ../spoken-language-identification
./run.sh
rm -rf sherpa-onnx-*

cd ../offline-tts
./run-aishell3.sh
./run-piper.sh
./run-hf-fanchen.sh
ls -lh

cd ../..

mkdir tts

cp dotnet-examples/offline-tts/*.wav ./tts
54 changes: 42 additions & 12 deletions .github/scripts/test-nodejs-npm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,48 @@ git status
ls -lh
ls -lh node_modules

# offline tts
#
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
tar xvf matcha-icefall-zh-baker.tar.bz2
rm matcha-icefall-zh-baker.tar.bz2

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx

node ./test-offline-tts-matcha-zh.js

rm -rf matcha-icefall-zh-baker
rm hifigan_v2.onnx

echo "---"

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
rm matcha-icefall-en_US-ljspeech.tar.bz2

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx

node ./test-offline-tts-matcha-en.js

rm -rf matcha-icefall-en_US-ljspeech
rm hifigan_v2.onnx

echo "---"

curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
tar xf vits-piper-en_US-amy-low.tar.bz2
node ./test-offline-tts-vits-en.js
rm -rf vits-piper-en_US-amy-low*

echo "---"

curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
tar xvf vits-icefall-zh-aishell3.tar.bz2
node ./test-offline-tts-vits-zh.js
rm -rf vits-icefall-zh-aishell3*

ls -lh *.wav

echo '-----speaker diarization----------'
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
Expand Down Expand Up @@ -147,15 +189,3 @@ tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
rm sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2
node ./test-online-zipformer2-ctc-hlg.js
rm -rf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18

# offline tts

curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
tar xf vits-piper-en_US-amy-low.tar.bz2
node ./test-offline-tts-en.js
rm -rf vits-piper-en_US-amy-low*

curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
tar xvf vits-icefall-zh-aishell3.tar.bz2
node ./test-offline-tts-zh.js
rm -rf vits-icefall-zh-aishell3*
44 changes: 44 additions & 0 deletions .github/workflows/test-dot-net.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,50 @@ jobs:
python-version: ["3.8"]

steps:
- name: Check space
shell: bash
run: |
df -h

- name: Free space
shell: bash
run: |
df -h
rm -rf /opt/hostedtoolcache
df -h

- name: Free more space
shell: bash
run: |
# https://github.com/orgs/community/discussions/25678
cd /opt
find . -maxdepth 1 -mindepth 1 '!' -path ./containerd '!' -path ./actionarchivecache '!' -path ./runner '!' -path ./runner-cache -exec rm -rf '{}' ';'

sudo rm -rf /usr/share/dotnet
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"

- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: false

# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: false
haskell: true
large-packages: true
docker-images: false
swap-storage: true

- name: Check space
shell: bash
run: |
df -h

- uses: actions/checkout@v4
with:
fetch-depth: 0
Expand Down
109 changes: 82 additions & 27 deletions dotnet-examples/offline-tts-play/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,48 +21,56 @@ class OfflineTtsPlayDemo
{
class Options
{

[Option("tts-rule-fsts", Required = false, Default = "", HelpText = "path to rule.fst")]
public string? RuleFsts { get; set; }
public string RuleFsts { get; set; } = string.Empty;

[Option("tts-rule-fars", Required = false, Default = "", HelpText = "path to rule.far")]
public string RuleFars { get; set; } = string.Empty;

[Option("vits-dict-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for jieba.")]
public string? DictDir { get; set; }
[Option("dict-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for jieba.")]
public string DictDir { get; set; } = string.Empty;

[Option("vits-data-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for espeak-ng.")]
public string? DataDir { get; set; }
[Option("data-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for espeak-ng.")]
public string DataDir { get; set; } = string.Empty;

[Option("vits-length-scale", Required = false, Default = 1, HelpText = "speech speed. Larger->Slower; Smaller->faster")]
public float LengthScale { get; set; }
[Option("length-scale", Required = false, Default = 1, HelpText = "speech speed. Larger->Slower; Smaller->faster")]
public float LengthScale { get; set; } = 1;

[Option("vits-noise-scale", Required = false, Default = 0.667f, HelpText = "noise_scale for VITS models")]
public float NoiseScale { get; set; }
[Option("noise-scale", Required = false, Default = 0.667f, HelpText = "noise_scale for VITS or Matcha models")]
public float NoiseScale { get; set; } = 0.667F;

[Option("vits-noise-scale-w", Required = false, Default = 0.8f, HelpText = "noise_scale_w for VITS models")]
public float NoiseScaleW { get; set; }
[Option("vits-noise-scale-w", Required = false, Default = 0.8F, HelpText = "noise_scale_w for VITS models")]
public float NoiseScaleW { get; set; } = 0.8F;

[Option("vits-lexicon", Required = false, Default = "", HelpText = "Path to lexicon.txt")]
public string? Lexicon { get; set; }
[Option("lexicon", Required = false, Default = "", HelpText = "Path to lexicon.txt")]
public string Lexicon { get; set; } = string.Empty;

[Option("vits-tokens", Required = false, Default = "", HelpText = "Path to tokens.txt")]
public string? Tokens { get; set; }
[Option("tokens", Required = true, Default = "", HelpText = "Path to tokens.txt")]
public string Tokens { get; set; } = string.Empty;

[Option("tts-max-num-sentences", Required = false, Default = 1, HelpText = "Maximum number of sentences that we process at a time.")]
public int MaxNumSentences { get; set; }
public int MaxNumSentences { get; set; } = 1;

[Option(Required = false, Default = 0, HelpText = "1 to show debug messages.")]
public int Debug { get; set; }
public int Debug { get; set; } = 0;

[Option("vits-model", Required = false, HelpText = "Path to VITS model")]
public string Model { get; set; } = string.Empty;

[Option("vits-model", Required = true, HelpText = "Path to VITS model")]
public string? Model { get; set; }
[Option("matcha-acoustic-model", Required = false, HelpText = "Path to the acoustic model of Matcha")]
public string AcousticModel { get; set; } = "";

[Option("matcha-vocoder", Required = false, HelpText = "Path to the vocoder model of Matcha")]
public string Vocoder { get; set; } = "";

[Option("sid", Required = false, Default = 0, HelpText = "Speaker ID")]
public int SpeakerId { get; set; }
public int SpeakerId { get; set; } = 0;

[Option("text", Required = true, HelpText = "Text to synthesize")]
public string? Text { get; set; }
public string Text { get; set; } = string.Empty;

[Option("output-filename", Required = true, Default = "./generated.wav", HelpText = "Path to save the generated audio")]
public string? OutputFilename { get; set; }
public string OutputFilename { get; set; } = "./generated.wav";
}

static void Main(string[] args)
Expand All @@ -78,15 +86,51 @@ static void Main(string[] args)
private static void DisplayHelp<T>(ParserResult<T> result, IEnumerable<Error> errs)
{
string usage = @"
# matcha-icefall-zh-baker

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
tar xvf matcha-icefall-zh-baker.tar.bz2
rm matcha-icefall-zh-baker.tar.bz2

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx

dotnet run \
--matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \
--matcha-vocoder=./hifigan_v2.onnx \
--lexicon=./matcha-icefall-zh-baker/lexicon.txt \
--tokens=./matcha-icefall-zh-baker/tokens.txt \
--dict-dir=./matcha-icefall-zh-baker/dict \
--tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \
--debug=1 \
--output-filename=./matcha-zh.wav \
--text='某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。'

# matcha-icefall-en_US-ljspeech

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
rm matcha-icefall-en_US-ljspeech.tar.bz2

wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx

dotnet run \
--matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
--matcha-vocoder=./hifigan_v2.onnx \
--tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \
--data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \
--debug=1 \
--output-filename=./matcha-en.wav \
--text='Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.'

# vits-aishell3

wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
tar xf vits-zh-aishell3.tar.bz2

dotnet run \
--vits-model=./vits-zh-aishell3/vits-aishell3.onnx \
--vits-tokens=./vits-zh-aishell3/tokens.txt \
--vits-lexicon=./vits-zh-aishell3/lexicon.txt \
--tokens=./vits-zh-aishell3/tokens.txt \
--lexicon=./vits-zh-aishell3/lexicon.txt \
--tts-rule-fsts=./vits-zh-aishell3/rule.fst \
--sid=66 \
--debug=1 \
Expand All @@ -100,8 +144,8 @@ tar xf vits-piper-en_US-amy-low.tar.bz2

dotnet run \
--vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \
--vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \
--vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
--tokens=./vits-piper-en_US-amy-low/tokens.txt \
--data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \
--debug=1 \
--output-filename=./amy.wav \
--text='This is a text to speech application in dotnet with Next Generation Kaldi'
Expand All @@ -124,6 +168,7 @@ to download more models.
private static void Run(Options options)
{
var config = new OfflineTtsConfig();

config.Model.Vits.Model = options.Model;
config.Model.Vits.Lexicon = options.Lexicon;
config.Model.Vits.Tokens = options.Tokens;
Expand All @@ -132,6 +177,16 @@ private static void Run(Options options)
config.Model.Vits.NoiseScale = options.NoiseScale;
config.Model.Vits.NoiseScaleW = options.NoiseScaleW;
config.Model.Vits.LengthScale = options.LengthScale;

config.Model.Matcha.AcousticModel = options.AcousticModel;
config.Model.Matcha.Vocoder = options.Vocoder;
config.Model.Matcha.Lexicon = options.Lexicon;
config.Model.Matcha.Tokens = options.Tokens;
config.Model.Matcha.DataDir = options.DataDir;
config.Model.Matcha.DictDir = options.DictDir;
config.Model.Matcha.NoiseScale = options.NoiseScale;
config.Model.Matcha.LengthScale = options.LengthScale;

config.Model.NumThreads = 1;
config.Model.Debug = options.Debug;
config.Model.Provider = "cpu";
Expand Down
4 changes: 2 additions & 2 deletions dotnet-examples/offline-tts-play/run-hf-fanchen.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ fi

dotnet run \
--vits-model=./vits-zh-hf-fanchen-C/vits-zh-hf-fanchen-C.onnx \
--vits-tokens=./vits-zh-hf-fanchen-C/tokens.txt \
--vits-lexicon=./vits-zh-hf-fanchen-C/lexicon.txt \
--tokens=./vits-zh-hf-fanchen-C/tokens.txt \
--lexicon=./vits-zh-hf-fanchen-C/lexicon.txt \
--tts-rule-fsts=./vits-zh-hf-fanchen-C/phone.fst,./vits-zh-hf-fanchen-C/date.fst,./vits-zh-hf-fanchen-C/number.fst \
--dict-dir=./vits-zh-hf-fanchen-C/dict \
--sid=100 \
Expand Down
26 changes: 26 additions & 0 deletions dotnet-examples/offline-tts-play/run-matcha-en.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/usr/bin/env bash
#
# Synthesize one English sentence with the matcha-icefall-en_US-ljspeech
# acoustic model and the hifigan_v2 vocoder, and save it to ./matcha-en.wav.
# The model archive and the vocoder are downloaded only if they are missing.
#
# More Matcha models are listed at
# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
set -ex

model_dir=./matcha-icefall-en_US-ljspeech
model_archive=matcha-icefall-en_US-ljspeech.tar.bz2
vocoder=./hifigan_v2.onnx

# Download and unpack the acoustic model unless it is already on disk.
if [ ! -f "${model_dir}/model-steps-3.onnx" ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
  tar xf "${model_archive}"
  rm "${model_archive}"
fi

# The vocoder ships as a single .onnx file; fetch it once.
if [ ! -f "${vocoder}" ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
fi

dotnet run \
  --matcha-acoustic-model="${model_dir}/model-steps-3.onnx" \
  --matcha-vocoder="${vocoder}" \
  --tokens="${model_dir}/tokens.txt" \
  --data-dir="${model_dir}/espeak-ng-data" \
  --debug=1 \
  --output-filename=./matcha-en.wav \
  --text='Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.'
27 changes: 27 additions & 0 deletions dotnet-examples/offline-tts-play/run-matcha-zh.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/usr/bin/env bash
#
# Synthesize one Chinese sentence with the matcha-icefall-zh-baker acoustic
# model and the hifigan_v2 vocoder, and save it to ./matcha-zh.wav.
# The model archive and the vocoder are downloaded only if they are missing.
#
# More Matcha models are listed at
# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
set -ex

model_dir=./matcha-icefall-zh-baker
model_archive=matcha-icefall-zh-baker.tar.bz2
vocoder=./hifigan_v2.onnx

# Download and unpack the acoustic model unless it is already on disk.
if [ ! -f "${model_dir}/model-steps-3.onnx" ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
  tar xvf "${model_archive}"
  rm "${model_archive}"
fi

# The vocoder ships as a single .onnx file; fetch it once.
if [ ! -f "${vocoder}" ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
fi

# The rule FSTs are passed as one comma-separated list (phone, date, number).
dotnet run \
  --matcha-acoustic-model="${model_dir}/model-steps-3.onnx" \
  --matcha-vocoder="${vocoder}" \
  --lexicon="${model_dir}/lexicon.txt" \
  --tokens="${model_dir}/tokens.txt" \
  --dict-dir="${model_dir}/dict" \
  --tts-rule-fsts="${model_dir}/phone.fst,${model_dir}/date.fst,${model_dir}/number.fst" \
  --debug=1 \
  --output-filename=./matcha-zh.wav \
  --text="某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
Loading
Loading