From d212ac0f49b5e85e57a1605d8ada8b7aca57fc22 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Tue, 28 Nov 2023 21:21:37 +0800 Subject: [PATCH 01/19] Use piper-phonemize to convert text to token IDs --- python-api-examples/offline-tts.py | 13 +++- sherpa-onnx/csrc/CMakeLists.txt | 1 + sherpa-onnx/csrc/lexicon.cc | 2 +- sherpa-onnx/csrc/lexicon.h | 5 +- sherpa-onnx/csrc/offline-tts-vits-impl.h | 36 ++++++++--- .../csrc/offline-tts-vits-model-config.cc | 60 +++++++++++++----- .../csrc/offline-tts-vits-model-config.h | 6 ++ sherpa-onnx/csrc/offline-tts-vits-model.cc | 8 +++ sherpa-onnx/csrc/offline-tts-vits-model.h | 3 +- sherpa-onnx/csrc/piper-phonemize-lexicon.cc | 63 +++++++++++++++++++ sherpa-onnx/csrc/piper-phonemize-lexicon.h | 26 ++++++++ sherpa-onnx/csrc/piper-phonemize-test.cc | 10 +-- .../csrc/offline-tts-vits-model-config.cc | 8 ++- 13 files changed, 204 insertions(+), 37 deletions(-) create mode 100644 sherpa-onnx/csrc/piper-phonemize-lexicon.cc create mode 100644 sherpa-onnx/csrc/piper-phonemize-lexicon.h diff --git a/python-api-examples/offline-tts.py b/python-api-examples/offline-tts.py index c05cde1eb0..246eedf444 100755 --- a/python-api-examples/offline-tts.py +++ b/python-api-examples/offline-tts.py @@ -63,15 +63,25 @@ def get_args(): parser.add_argument( "--vits-lexicon", type=str, + default="", help="Path to lexicon.txt", ) parser.add_argument( "--vits-tokens", type=str, + default="", help="Path to tokens.txt", ) + parser.add_argument( + "--vits-data-dir", + type=str, + default="", + help="""Path to the dict director of espeak-ng. If it is specified, + --vits-lexicon and --vits-tokens are ignored""", + ) + parser.add_argument( "--tts-rule-fsts", type=str, @@ -142,13 +152,14 @@ def main(): vits=sherpa_onnx.OfflineTtsVitsModelConfig( model=args.vits_model, lexicon=args.vits_lexicon, + data_dir=args.vits_data_dir, tokens=args.vits_tokens, ), provider=args.provider, debug=args.debug, num_threads=args.num_threads, ), - rule_fsts=args.tts_rule_fsts + rule_fsts=args.tts_rule_fsts, ) tts = sherpa_onnx.OfflineTts(tts_config) diff --git a/sherpa-onnx/csrc/CMakeLists.txt b/sherpa-onnx/csrc/CMakeLists.txt index 02ec56c0b0..304a18674b 100644 --- a/sherpa-onnx/csrc/CMakeLists.txt +++ b/sherpa-onnx/csrc/CMakeLists.txt @@ -74,6 +74,7 @@ set(sources packed-sequence.cc pad-sequence.cc parse-options.cc + piper-phonemize-lexicon.cc provider.cc resample.cc session.cc diff --git a/sherpa-onnx/csrc/lexicon.cc b/sherpa-onnx/csrc/lexicon.cc index e53320b534..ea0ed06599 100644 --- a/sherpa-onnx/csrc/lexicon.cc +++ b/sherpa-onnx/csrc/lexicon.cc @@ -130,7 +130,7 @@ Lexicon::Lexicon(AAssetManager *mgr, const std::string &lexicon, #endif std::vector Lexicon::ConvertTextToTokenIds( - const std::string &text) const { + const std::string &text, const std::string & /*voice*/ /*= ""*/) const { switch (language_) { case Language::kEnglish: return ConvertTextToTokenIdsEnglish(text); diff --git a/sherpa-onnx/csrc/lexicon.h b/sherpa-onnx/csrc/lexicon.h index c79ac11a37..38ddc98cf4 100644 --- a/sherpa-onnx/csrc/lexicon.h +++ b/sherpa-onnx/csrc/lexicon.h @@ -23,6 +23,8 @@ namespace sherpa_onnx { // TODO(fangjun): Refactor it to an abstract class class Lexicon { public: + virtual ~Lexicon() = default; + Lexicon() = default; // for subclasses Lexicon(const std::string &lexicon, const std::string &tokens, const std::string &punctuations, const std::string &language, bool debug = false, bool is_piper = false); @@ -34,7 +36,8 @@ class Lexicon { bool is_piper = false); #endif - std::vector ConvertTextToTokenIds(const std::string &text) const; + virtual std::vector ConvertTextToTokenIds( + const std::string &text, const std::string &voice = "") const; private: std::vector ConvertTextToTokenIdsGerman( diff --git a/sherpa-onnx/csrc/offline-tts-vits-impl.h b/sherpa-onnx/csrc/offline-tts-vits-impl.h index da5435af71..2ace0f57d9 100644 --- a/sherpa-onnx/csrc/offline-tts-vits-impl.h +++ b/sherpa-onnx/csrc/offline-tts-vits-impl.h @@ -21,6 +21,7 @@ #include "sherpa-onnx/csrc/offline-tts-impl.h" #include "sherpa-onnx/csrc/offline-tts-vits-model.h" #include "sherpa-onnx/csrc/onnx-utils.h" +#include "sherpa-onnx/csrc/piper-phonemize-lexicon.h" #include "sherpa-onnx/csrc/text-utils.h" namespace sherpa_onnx { @@ -29,10 +30,9 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { public: explicit OfflineTtsVitsImpl(const OfflineTtsConfig &config) : config_(config), - model_(std::make_unique(config.model)), - lexicon_(config.model.vits.lexicon, config.model.vits.tokens, - model_->Punctuations(), model_->Language(), config.model.debug, - model_->IsPiper()) { + model_(std::make_unique(config.model)) { + InitLexicon(); + if (!config.rule_fsts.empty()) { std::vector files; SplitStringToVector(config.rule_fsts, ",", false, &files); @@ -50,9 +50,10 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { OfflineTtsVitsImpl(AAssetManager *mgr, const OfflineTtsConfig &config) : config_(config), model_(std::make_unique(mgr, config.model)), - lexicon_(mgr, config.model.vits.lexicon, config.model.vits.tokens, - model_->Punctuations(), model_->Language(), config.model.debug, - model_->IsPiper()) { + lexicon_(std::make_unique( + mgr, config.model.vits.lexicon, config.model.vits.tokens, + model_->Punctuations(), model_->Language(), config.model.debug, + model_->IsPiper())) { if (!config.rule_fsts.empty()) { std::vector files; SplitStringToVector(config.rule_fsts, ",", false, &files); @@ -101,13 +102,14 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { } } - std::vector x = lexicon_.ConvertTextToTokenIds(text); + std::vector x = + lexicon_->ConvertTextToTokenIds(text, model_->Voice()); if (x.empty()) { SHERPA_ONNX_LOGE("Failed to convert %s to token IDs", text.c_str()); return {}; } - if (model_->AddBlank()) { + if (model_->AddBlank() && config_.model.vits.data_dir.empty()) { std::vector buffer(x.size() * 2 + 1); int32_t i = 1; for (auto k : x) { @@ -143,11 +145,25 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { return ans; } + private: + void InitLexicon() { + if (model_->IsPiper() && model_->Language() == "English" && + !config_.model.vits.data_dir.empty()) { + lexicon_ = + std::make_unique(config_.model.vits.data_dir); + } else { + lexicon_ = std::make_unique( + config_.model.vits.lexicon, config_.model.vits.tokens, + model_->Punctuations(), model_->Language(), config_.model.debug, + model_->IsPiper()); + } + } + private: OfflineTtsConfig config_; std::unique_ptr model_; std::vector> tn_list_; - Lexicon lexicon_; + std::unique_ptr lexicon_; }; } // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/offline-tts-vits-model-config.cc b/sherpa-onnx/csrc/offline-tts-vits-model-config.cc index 71c19e570f..feb0b1e369 100644 --- a/sherpa-onnx/csrc/offline-tts-vits-model-config.cc +++ b/sherpa-onnx/csrc/offline-tts-vits-model-config.cc @@ -13,6 +13,9 @@ void OfflineTtsVitsModelConfig::Register(ParseOptions *po) { po->Register("vits-model", &model, "Path to VITS model"); po->Register("vits-lexicon", &lexicon, "Path to lexicon.txt for VITS models"); po->Register("vits-tokens", &tokens, "Path to tokens.txt for VITS models"); + po->Register("vits-data-dir", &data_dir, + "Path to the directory containing dict for espeak-ng. If it is " + "given, --vits-lexicon and --vits-tokens are ignored."); po->Register("vits-noise-scale", &noise_scale, "noise_scale for VITS models"); po->Register("vits-noise-scale-w", &noise_scale_w, "noise_scale_w for VITS models"); @@ -31,24 +34,50 @@ bool OfflineTtsVitsModelConfig::Validate() const { return false; } - if (lexicon.empty()) { - SHERPA_ONNX_LOGE("Please provide --vits-lexicon"); - return false; - } + if (data_dir.empty()) { + if (lexicon.empty()) { + SHERPA_ONNX_LOGE("Please provide --vits-lexicon"); + return false; + } - if (!FileExists(lexicon)) { - SHERPA_ONNX_LOGE("--vits-lexicon: %s does not exist", lexicon.c_str()); - return false; - } + if (!FileExists(lexicon)) { + SHERPA_ONNX_LOGE("--vits-lexicon: %s does not exist", lexicon.c_str()); + return false; + } - if (tokens.empty()) { - SHERPA_ONNX_LOGE("Please provide --vits-tokens"); - return false; - } + if (tokens.empty()) { + SHERPA_ONNX_LOGE("Please provide --vits-tokens"); + return false; + } - if (!FileExists(tokens)) { - SHERPA_ONNX_LOGE("--vits-tokens: %s does not exist", tokens.c_str()); - return false; + if (!FileExists(tokens)) { + SHERPA_ONNX_LOGE("--vits-tokens: %s does not exist", tokens.c_str()); + return false; + } + } else { + if (!FileExists(data_dir + "/phontab")) { + SHERPA_ONNX_LOGE("%s/phontab does not exist. Skipping test", + data_dir.c_str()); + return false; + } + + if (!FileExists(data_dir + "/phonindex")) { + SHERPA_ONNX_LOGE("%s/phonindex does not exist. Skipping test", + data_dir.c_str()); + return false; + } + + if (!FileExists(data_dir + "/phondata")) { + SHERPA_ONNX_LOGE("%s/phondata does not exist. Skipping test", + data_dir.c_str()); + return false; + } + + if (!FileExists(data_dir + "/intonations")) { + SHERPA_ONNX_LOGE("%s/intonations does not exist. Skipping test", + data_dir.c_str()); + return false; + } } return true; @@ -61,6 +90,7 @@ std::string OfflineTtsVitsModelConfig::ToString() const { os << "model=\"" << model << "\", "; os << "lexicon=\"" << lexicon << "\", "; os << "tokens=\"" << tokens << "\", "; + os << "data_dir=\"" << data_dir << "\", "; os << "noise_scale=" << noise_scale << ", "; os << "noise_scale_w=" << noise_scale_w << ", "; os << "length_scale=" << length_scale << ")"; diff --git a/sherpa-onnx/csrc/offline-tts-vits-model-config.h b/sherpa-onnx/csrc/offline-tts-vits-model-config.h index 62bc566bef..99ee86b063 100644 --- a/sherpa-onnx/csrc/offline-tts-vits-model-config.h +++ b/sherpa-onnx/csrc/offline-tts-vits-model-config.h @@ -16,6 +16,10 @@ struct OfflineTtsVitsModelConfig { std::string lexicon; std::string tokens; + // If data_dir is given, lexicon and tokens are ignored + // data_dir is for piper-phonemize, which uses espeak-ng + std::string data_dir; + float noise_scale = 0.667; float noise_scale_w = 0.8; float length_scale = 1; @@ -28,11 +32,13 @@ struct OfflineTtsVitsModelConfig { OfflineTtsVitsModelConfig(const std::string &model, const std::string &lexicon, const std::string &tokens, + const std::string &data_dir, float noise_scale = 0.667, float noise_scale_w = 0.8, float length_scale = 1) : model(model), lexicon(lexicon), tokens(tokens), + data_dir(data_dir), noise_scale(noise_scale), noise_scale_w(noise_scale_w), length_scale(length_scale) {} diff --git a/sherpa-onnx/csrc/offline-tts-vits-model.cc b/sherpa-onnx/csrc/offline-tts-vits-model.cc index dafe5052ad..2e8cfe766e 100644 --- a/sherpa-onnx/csrc/offline-tts-vits-model.cc +++ b/sherpa-onnx/csrc/offline-tts-vits-model.cc @@ -51,6 +51,7 @@ class OfflineTtsVitsModel::Impl { std::string Punctuations() const { return punctuations_; } std::string Language() const { return language_; } + std::string Voice() const { return voice_; } bool IsPiper() const { return is_piper_; } int32_t NumSpeakers() const { return num_speakers_; } @@ -78,6 +79,11 @@ class OfflineTtsVitsModel::Impl { SHERPA_ONNX_READ_META_DATA(num_speakers_, "n_speakers"); SHERPA_ONNX_READ_META_DATA_STR(punctuations_, "punctuation"); SHERPA_ONNX_READ_META_DATA_STR(language_, "language"); + // SHERPA_ONNX_READ_META_DATA_STR(voice_, "voice"); + if (language_ == "English") { + // FIXME(fangjun): Read voice from the metadata + voice_ = "en-us"; + } std::string comment; SHERPA_ONNX_READ_META_DATA_STR(comment, "comment"); @@ -215,6 +221,7 @@ class OfflineTtsVitsModel::Impl { int32_t num_speakers_; std::string punctuations_; std::string language_; + std::string voice_; bool is_piper_ = false; }; @@ -244,6 +251,7 @@ std::string OfflineTtsVitsModel::Punctuations() const { } std::string OfflineTtsVitsModel::Language() const { return impl_->Language(); } +std::string OfflineTtsVitsModel::Voice() const { return impl_->Voice(); } bool OfflineTtsVitsModel::IsPiper() const { return impl_->IsPiper(); } diff --git a/sherpa-onnx/csrc/offline-tts-vits-model.h b/sherpa-onnx/csrc/offline-tts-vits-model.h index 1cf9ad2ea5..7708144c63 100644 --- a/sherpa-onnx/csrc/offline-tts-vits-model.h +++ b/sherpa-onnx/csrc/offline-tts-vits-model.h @@ -46,7 +46,8 @@ class OfflineTtsVitsModel { bool AddBlank() const; std::string Punctuations() const; - std::string Language() const; + std::string Language() const; // e.g., Chinese, English, German, etc. + std::string Voice() const; // e.g., en-us, for espeak-ng bool IsPiper() const; int32_t NumSpeakers() const; diff --git a/sherpa-onnx/csrc/piper-phonemize-lexicon.cc b/sherpa-onnx/csrc/piper-phonemize-lexicon.cc new file mode 100644 index 0000000000..a0335c18a0 --- /dev/null +++ b/sherpa-onnx/csrc/piper-phonemize-lexicon.cc @@ -0,0 +1,63 @@ +// sherpa-onnx/csrc/piper-phonemize-lexicon.cc +// +// Copyright (c) 2022-2023 Xiaomi Corporation + +#include "sherpa-onnx/csrc/piper-phonemize-lexicon.h" + +#include +#include // NOLINT + +#include "espeak-ng/speak_lib.h" +#include "phoneme_ids.hpp" +#include "phonemize.hpp" +#include "sherpa-onnx/csrc/macros.h" + +namespace sherpa_onnx { + +void InitEspeak(const std::string &data_dir) { + static std::once_flag init_flag; + std::call_once(init_flag, [data_dir]() { + int32_t result = + espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0, data_dir.c_str(), 0); + if (result != 22050) { + SHERPA_ONNX_LOGE( + "Failed to initialize espeak-ng with data dir: %s. Return code is: " + "%d", + data_dir.c_str(), result); + exit(-1); + } + }); +} + +PiperPhonemizeLexicon::PiperPhonemizeLexicon(const std::string &data_dir) + : data_dir_(data_dir) { + InitEspeak(data_dir_); +} + +std::vector PiperPhonemizeLexicon::ConvertTextToTokenIds( + const std::string &text, const std::string &voice /*= ""*/) const { + piper::eSpeakPhonemeConfig config; + + // ./bin/espeak-ng-bin --path ./install/share/espeak-ng-data/ --voices + // to list available voices + config.voice = voice; // e.g., voice is en-us + + std::vector> phonemes; + piper::phonemize_eSpeak(text, config, phonemes); + + std::vector phoneme_ids; + std::map missing_phonemes; + + std::vector ans; + piper::PhonemeIdConfig id_config; + for (const auto &p : phonemes) { + phoneme_ids.clear(); + missing_phonemes.clear(); + phonemes_to_ids(p, id_config, phoneme_ids, missing_phonemes); + ans.insert(ans.end(), phoneme_ids.begin(), phoneme_ids.end()); + } + + return ans; +} + +} // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/piper-phonemize-lexicon.h b/sherpa-onnx/csrc/piper-phonemize-lexicon.h new file mode 100644 index 0000000000..5f29addf32 --- /dev/null +++ b/sherpa-onnx/csrc/piper-phonemize-lexicon.h @@ -0,0 +1,26 @@ +// sherpa-onnx/csrc/piper-phonemize-lexicon.h +// +// Copyright (c) 2022-2023 Xiaomi Corporation + +#ifndef SHERPA_ONNX_CSRC_PIPER_PHONEMIZE_LEXICON_H_ +#define SHERPA_ONNX_CSRC_PIPER_PHONEMIZE_LEXICON_H_ + +#include "sherpa-onnx/csrc/lexicon.h" + +namespace sherpa_onnx { + +class PiperPhonemizeLexicon : public Lexicon { + public: + explicit PiperPhonemizeLexicon(const std::string &data_dir); + + std::vector ConvertTextToTokenIds( + const std::string &text, const std::string &voice = "") const override; + + private: + std::string voice_; + std::string data_dir_; +}; + +} // namespace sherpa_onnx + +#endif // SHERPA_ONNX_CSRC_PIPER_PHONEMIZE_LEXICON_H_ diff --git a/sherpa-onnx/csrc/piper-phonemize-test.cc b/sherpa-onnx/csrc/piper-phonemize-test.cc index b1d0790a6a..47e2c57b66 100644 --- a/sherpa-onnx/csrc/piper-phonemize-test.cc +++ b/sherpa-onnx/csrc/piper-phonemize-test.cc @@ -48,7 +48,7 @@ TEST(PiperPhonemize, Case1) { piper::eSpeakPhonemeConfig config; - // ./bin/espeak-ng --path ./install/share/espeak-ng-data/ --voices + // ./bin/espeak-ng-bin --path ./install/share/espeak-ng-data/ --voices // to list available voices config.voice = "en-us"; @@ -61,15 +61,15 @@ TEST(PiperPhonemize, Case1) { } std::cout << "\n"; - std::vector phonemeIds; - std::map missingPhonemes; + std::vector phoneme_ids; + std::map missing_phonemes; { piper::PhonemeIdConfig config; - phonemes_to_ids(phonemes[0], config, phonemeIds, missingPhonemes); + phonemes_to_ids(phonemes[0], config, phoneme_ids, missing_phonemes); } - for (int32_t p : phonemeIds) { + for (int32_t p : phoneme_ids) { std::cout << p << " "; } std::cout << "\n"; diff --git a/sherpa-onnx/python/csrc/offline-tts-vits-model-config.cc b/sherpa-onnx/python/csrc/offline-tts-vits-model-config.cc index 60521ef9a0..d488a957c4 100644 --- a/sherpa-onnx/python/csrc/offline-tts-vits-model-config.cc +++ b/sherpa-onnx/python/csrc/offline-tts-vits-model-config.cc @@ -16,13 +16,15 @@ void PybindOfflineTtsVitsModelConfig(py::module *m) { py::class_(*m, "OfflineTtsVitsModelConfig") .def(py::init<>()) .def(py::init(), + const std::string &, const std::string, float, float, + float>(), py::arg("model"), py::arg("lexicon"), py::arg("tokens"), - py::arg("noise_scale") = 0.667, py::arg("noise_scale_w") = 0.8, - py::arg("length_scale") = 1.0) + py::arg("data_dir") = "", py::arg("noise_scale") = 0.667, + py::arg("noise_scale_w") = 0.8, py::arg("length_scale") = 1.0) .def_readwrite("model", &PyClass::model) .def_readwrite("lexicon", &PyClass::lexicon) .def_readwrite("tokens", &PyClass::tokens) + .def_readwrite("data_dir", &PyClass::data_dir) .def_readwrite("noise_scale", &PyClass::noise_scale) .def_readwrite("noise_scale_w", &PyClass::noise_scale_w) .def_readwrite("length_scale", &PyClass::length_scale) From ad5dadac6ab73bf12bf15d2712854038f9fcaab1 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Wed, 29 Nov 2023 17:23:29 +0800 Subject: [PATCH 02/19] use tokens.txt to convert phonemes to IDs --- python-api-examples/offline-tts.py | 3 + sherpa-onnx/csrc/macros.h | 31 ++++++ sherpa-onnx/csrc/offline-tts-vits-impl.h | 7 +- .../csrc/offline-tts-vits-model-config.cc | 21 ++-- .../csrc/offline-tts-vits-model-config.h | 2 +- sherpa-onnx/csrc/offline-tts-vits-model.cc | 11 +-- sherpa-onnx/csrc/piper-phonemize-lexicon.cc | 95 +++++++++++++++++-- sherpa-onnx/csrc/piper-phonemize-lexicon.h | 8 +- .../csrc/offline-tts-vits-model-config.cc | 3 +- sherpa-onnx/python/csrc/offline-tts.cc | 1 + 10 files changed, 149 insertions(+), 33 deletions(-) diff --git a/python-api-examples/offline-tts.py b/python-api-examples/offline-tts.py index 246eedf444..265b09af20 100755 --- a/python-api-examples/offline-tts.py +++ b/python-api-examples/offline-tts.py @@ -161,6 +161,9 @@ def main(): ), rule_fsts=args.tts_rule_fsts, ) + if not tts_config.validate(): + raise ValueError("Please check your config") + tts = sherpa_onnx.OfflineTts(tts_config) start = time.time() diff --git a/sherpa-onnx/csrc/macros.h b/sherpa-onnx/csrc/macros.h index fae92dc600..39a2146cc2 100644 --- a/sherpa-onnx/csrc/macros.h +++ b/sherpa-onnx/csrc/macros.h @@ -43,6 +43,21 @@ } \ } while (0) +#define SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(dst, src_key, default_value) \ + do { \ + auto value = \ + meta_data.LookupCustomMetadataMapAllocated(src_key, allocator); \ + if (!value) { \ + dst = default_value; \ + } else { \ + dst = atoi(value.get()); \ + if (dst < 0) { \ + SHERPA_ONNX_LOGE("Invalid value %d for %s", dst, src_key); \ + exit(-1); \ + } \ + } \ + } while (0) + // read a vector of integers #define SHERPA_ONNX_READ_META_DATA_VEC(dst, src_key) \ do { \ @@ -112,4 +127,20 @@ } \ } while (0) +#define SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(dst, src_key, \ + default_value) \ + do { \ + auto value = \ + meta_data.LookupCustomMetadataMapAllocated(src_key, allocator); \ + if (!value) { \ + dst = default_value; \ + } else { \ + dst = value.get(); \ + if (dst.empty()) { \ + SHERPA_ONNX_LOGE("Invalid value for %s\n", src_key); \ + exit(-1); \ + } \ + } \ + } while (0) + #endif // SHERPA_ONNX_CSRC_MACROS_H_ diff --git a/sherpa-onnx/csrc/offline-tts-vits-impl.h b/sherpa-onnx/csrc/offline-tts-vits-impl.h index 2ace0f57d9..20d9c37ac6 100644 --- a/sherpa-onnx/csrc/offline-tts-vits-impl.h +++ b/sherpa-onnx/csrc/offline-tts-vits-impl.h @@ -147,10 +147,9 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { private: void InitLexicon() { - if (model_->IsPiper() && model_->Language() == "English" && - !config_.model.vits.data_dir.empty()) { - lexicon_ = - std::make_unique(config_.model.vits.data_dir); + if (model_->IsPiper() && !config_.model.vits.data_dir.empty()) { + lexicon_ = std::make_unique( + config_.model.vits.tokens, config_.model.vits.data_dir); } else { lexicon_ = std::make_unique( config_.model.vits.lexicon, config_.model.vits.tokens, diff --git a/sherpa-onnx/csrc/offline-tts-vits-model-config.cc b/sherpa-onnx/csrc/offline-tts-vits-model-config.cc index feb0b1e369..b9fce0f6ba 100644 --- a/sherpa-onnx/csrc/offline-tts-vits-model-config.cc +++ b/sherpa-onnx/csrc/offline-tts-vits-model-config.cc @@ -15,7 +15,7 @@ void OfflineTtsVitsModelConfig::Register(ParseOptions *po) { po->Register("vits-tokens", &tokens, "Path to tokens.txt for VITS models"); po->Register("vits-data-dir", &data_dir, "Path to the directory containing dict for espeak-ng. If it is " - "given, --vits-lexicon and --vits-tokens are ignored."); + "given, --vits-lexicon is ignored."); po->Register("vits-noise-scale", &noise_scale, "noise_scale for VITS models"); po->Register("vits-noise-scale-w", &noise_scale_w, "noise_scale_w for VITS models"); @@ -34,6 +34,16 @@ bool OfflineTtsVitsModelConfig::Validate() const { return false; } + if (tokens.empty()) { + SHERPA_ONNX_LOGE("Please provide --vits-tokens"); + return false; + } + + if (!FileExists(tokens)) { + SHERPA_ONNX_LOGE("--vits-tokens: %s does not exist", tokens.c_str()); + return false; + } + if (data_dir.empty()) { if (lexicon.empty()) { SHERPA_ONNX_LOGE("Please provide --vits-lexicon"); @@ -45,15 +55,6 @@ bool OfflineTtsVitsModelConfig::Validate() const { return false; } - if (tokens.empty()) { - SHERPA_ONNX_LOGE("Please provide --vits-tokens"); - return false; - } - - if (!FileExists(tokens)) { - SHERPA_ONNX_LOGE("--vits-tokens: %s does not exist", tokens.c_str()); - return false; - } } else { if (!FileExists(data_dir + "/phontab")) { SHERPA_ONNX_LOGE("%s/phontab does not exist. Skipping test", diff --git a/sherpa-onnx/csrc/offline-tts-vits-model-config.h b/sherpa-onnx/csrc/offline-tts-vits-model-config.h index 99ee86b063..cde8b39209 100644 --- a/sherpa-onnx/csrc/offline-tts-vits-model-config.h +++ b/sherpa-onnx/csrc/offline-tts-vits-model-config.h @@ -16,7 +16,7 @@ struct OfflineTtsVitsModelConfig { std::string lexicon; std::string tokens; - // If data_dir is given, lexicon and tokens are ignored + // If data_dir is given, lexicon is ignored // data_dir is for piper-phonemize, which uses espeak-ng std::string data_dir; diff --git a/sherpa-onnx/csrc/offline-tts-vits-model.cc b/sherpa-onnx/csrc/offline-tts-vits-model.cc index 2e8cfe766e..31e3a7c31d 100644 --- a/sherpa-onnx/csrc/offline-tts-vits-model.cc +++ b/sherpa-onnx/csrc/offline-tts-vits-model.cc @@ -75,15 +75,12 @@ class OfflineTtsVitsModel::Impl { Ort::AllocatorWithDefaultOptions allocator; // used in the macro below SHERPA_ONNX_READ_META_DATA(sample_rate_, "sample_rate"); - SHERPA_ONNX_READ_META_DATA(add_blank_, "add_blank"); + SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(add_blank_, "add_blank", 0); SHERPA_ONNX_READ_META_DATA(num_speakers_, "n_speakers"); - SHERPA_ONNX_READ_META_DATA_STR(punctuations_, "punctuation"); + SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(punctuations_, "punctuation", + ""); SHERPA_ONNX_READ_META_DATA_STR(language_, "language"); - // SHERPA_ONNX_READ_META_DATA_STR(voice_, "voice"); - if (language_ == "English") { - // FIXME(fangjun): Read voice from the metadata - voice_ = "en-us"; - } + SHERPA_ONNX_READ_META_DATA_STR_WITH_DEFAULT(voice_, "voice", ""); std::string comment; SHERPA_ONNX_READ_META_DATA_STR(comment, "comment"); diff --git a/sherpa-onnx/csrc/piper-phonemize-lexicon.cc b/sherpa-onnx/csrc/piper-phonemize-lexicon.cc index a0335c18a0..91d3eee801 100644 --- a/sherpa-onnx/csrc/piper-phonemize-lexicon.cc +++ b/sherpa-onnx/csrc/piper-phonemize-lexicon.cc @@ -4,8 +4,11 @@ #include "sherpa-onnx/csrc/piper-phonemize-lexicon.h" +#include +#include #include #include // NOLINT +#include #include "espeak-ng/speak_lib.h" #include "phoneme_ids.hpp" @@ -14,6 +17,80 @@ namespace sherpa_onnx { +static std::unordered_map ReadTokens(std::istream &is) { + std::wstring_convert, char32_t> conv; + std::unordered_map token2id; + + std::string line; + + std::string sym; + std::u32string s; + int32_t id; + while (std::getline(is, line)) { + std::istringstream iss(line); + iss >> sym; + if (iss.eof()) { + id = atoi(sym.c_str()); + sym = " "; + } else { + iss >> id; + } + + // eat the trailing \r\n on windows + iss >> std::ws; + if (!iss.eof()) { + SHERPA_ONNX_LOGE("Error when reading tokens: %s", line.c_str()); + exit(-1); + } + + s = conv.from_bytes(sym); + if (s.size() != 1) { + SHERPA_ONNX_LOGE("Error when reading tokens at Line %s. size: %d", + line.c_str(), static_cast(s.size())); + exit(-1); + } + char32_t c = s[0]; + + if (token2id.count(c)) { + SHERPA_ONNX_LOGE("Duplicated token %s. Line %s. Existing ID: %d", + sym.c_str(), line.c_str(), token2id.at(c)); + exit(-1); + } + + token2id.insert({c, id}); + } + + return token2id; +} + +// see the function "phonemes_to_ids" from +// https://github.com/rhasspy/piper/blob/master/notebooks/piper_inference_(ONNX).ipynb +static std::vector PhonemesToIds( + const std::unordered_map &token2id, + const std::vector &phonemes) { + // see + // https://github.com/rhasspy/piper-phonemize/blob/master/src/phoneme_ids.hpp#L17 + int32_t pad = token2id.at(U'_'); + int32_t bos = token2id.at(U'^'); + int32_t eos = token2id.at(U'$'); + + std::vector ans; + ans.reserve(phonemes.size()); + + ans.push_back(bos); + for (auto p : phonemes) { + if (token2id.count(p)) { + ans.push_back(token2id.at(p)); + ans.push_back(pad); + } else { + SHERPA_ONNX_LOGE("Skip unkown phonemes. Unicode codepoint: \\U+%04x.", p); + } + } + ans.push_back(eos); + + return ans; +} + void InitEspeak(const std::string &data_dir) { static std::once_flag init_flag; std::call_once(init_flag, [data_dir]() { @@ -29,8 +106,14 @@ void InitEspeak(const std::string &data_dir) { }); } -PiperPhonemizeLexicon::PiperPhonemizeLexicon(const std::string &data_dir) +PiperPhonemizeLexicon::PiperPhonemizeLexicon(const std::string &tokens, + const std::string &data_dir) : data_dir_(data_dir) { + { + std::ifstream is(tokens); + token2id_ = ReadTokens(is); + } + InitEspeak(data_dir_); } @@ -45,15 +128,11 @@ std::vector PiperPhonemizeLexicon::ConvertTextToTokenIds( std::vector> phonemes; piper::phonemize_eSpeak(text, config, phonemes); - std::vector phoneme_ids; - std::map missing_phonemes; - std::vector ans; - piper::PhonemeIdConfig id_config; + + std::vector phoneme_ids; for (const auto &p : phonemes) { - phoneme_ids.clear(); - missing_phonemes.clear(); - phonemes_to_ids(p, id_config, phoneme_ids, missing_phonemes); + phoneme_ids = PhonemesToIds(token2id_, p); ans.insert(ans.end(), phoneme_ids.begin(), phoneme_ids.end()); } diff --git a/sherpa-onnx/csrc/piper-phonemize-lexicon.h b/sherpa-onnx/csrc/piper-phonemize-lexicon.h index 5f29addf32..627fc9397b 100644 --- a/sherpa-onnx/csrc/piper-phonemize-lexicon.h +++ b/sherpa-onnx/csrc/piper-phonemize-lexicon.h @@ -5,20 +5,24 @@ #ifndef SHERPA_ONNX_CSRC_PIPER_PHONEMIZE_LEXICON_H_ #define SHERPA_ONNX_CSRC_PIPER_PHONEMIZE_LEXICON_H_ +#include + #include "sherpa-onnx/csrc/lexicon.h" namespace sherpa_onnx { class PiperPhonemizeLexicon : public Lexicon { public: - explicit PiperPhonemizeLexicon(const std::string &data_dir); + explicit PiperPhonemizeLexicon(const std::string &tokens, + const std::string &data_dir); std::vector ConvertTextToTokenIds( const std::string &text, const std::string &voice = "") const override; private: - std::string voice_; std::string data_dir_; + // map unicode codepoint to an integer ID + std::unordered_map token2id_; }; } // namespace sherpa_onnx diff --git a/sherpa-onnx/python/csrc/offline-tts-vits-model-config.cc b/sherpa-onnx/python/csrc/offline-tts-vits-model-config.cc index d488a957c4..6e016715dc 100644 --- a/sherpa-onnx/python/csrc/offline-tts-vits-model-config.cc +++ b/sherpa-onnx/python/csrc/offline-tts-vits-model-config.cc @@ -28,7 +28,8 @@ void PybindOfflineTtsVitsModelConfig(py::module *m) { .def_readwrite("noise_scale", &PyClass::noise_scale) .def_readwrite("noise_scale_w", &PyClass::noise_scale_w) .def_readwrite("length_scale", &PyClass::length_scale) - .def("__str__", &PyClass::ToString); + .def("__str__", &PyClass::ToString) + .def("validate", &PyClass::Validate); } } // namespace sherpa_onnx diff --git a/sherpa-onnx/python/csrc/offline-tts.cc b/sherpa-onnx/python/csrc/offline-tts.cc index 538ceceed3..39669a0e4c 100644 --- a/sherpa-onnx/python/csrc/offline-tts.cc +++ b/sherpa-onnx/python/csrc/offline-tts.cc @@ -34,6 +34,7 @@ static void PybindOfflineTtsConfig(py::module *m) { py::arg("model"), py::arg("rule_fsts") = "") .def_readwrite("model", &PyClass::model) .def_readwrite("rule_fsts", &PyClass::rule_fsts) + .def("validate", &PyClass::Validate) .def("__str__", &PyClass::ToString); } From 0f87339b26a27eb36f638403a071848df7a25146 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Wed, 29 Nov 2023 19:03:13 +0800 Subject: [PATCH 03/19] fix building wheels --- cmake/cmake_extension.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmake/cmake_extension.py b/cmake/cmake_extension.py index c67abc502b..be65543835 100644 --- a/cmake/cmake_extension.py +++ b/cmake/cmake_extension.py @@ -73,6 +73,10 @@ def build_extension(self, ext: setuptools.extension.Extension): extra_cmake_args = f" -DCMAKE_INSTALL_PREFIX={install_dir} " extra_cmake_args += " -DBUILD_SHARED_LIBS=ON " + extra_cmake_args += " -DBUILD_PIPER_PHONMIZE_EXE=OFF " + extra_cmake_args += " -DBUILD_PIPER_PHONMIZE_TESTS=OFF " + extra_cmake_args += " -DBUILD_ESPEAK_NG_EXE=OFF " + extra_cmake_args += " -DBUILD_ESPEAK_NG_TESTS=OFF " extra_cmake_args += " -DSHERPA_ONNX_ENABLE_CHECK=OFF " extra_cmake_args += " -DSHERPA_ONNX_ENABLE_PYTHON=ON " @@ -161,5 +165,6 @@ def build_extension(self, ext: setuptools.extension.Extension): shutil.copy(f"{src_file}", f"{out_bin_dir}/") shutil.rmtree(f"{install_dir}/bin") + shutil.rmtree(f"{install_dir}/share") if is_windows(): shutil.rmtree(f"{install_dir}/lib") From 812fce8f0d801629cd8a0fd8d63f23b844e2ae59 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 30 Nov 2023 10:37:01 +0800 Subject: [PATCH 04/19] Fix pip install on Windows --- cmake/cmake_extension.py | 3 +++ setup.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/cmake/cmake_extension.py b/cmake/cmake_extension.py index be65543835..ba8d2e7a20 100644 --- a/cmake/cmake_extension.py +++ b/cmake/cmake_extension.py @@ -150,6 +150,9 @@ def build_extension(self, ext: setuptools.extension.Extension): binaries += ["sherpa-onnx-core.dll"] binaries += ["sherpa-onnx-portaudio.dll"] binaries += ["onnxruntime.dll"] + binaries += ["piper_phonemize.dll"] + binaries += ["espeak-ng.dll"] + binaries += ["ucd.dll"] binaries += ["kaldi-decoder-core.dll"] binaries += ["sherpa-onnx-fst.lib"] binaries += ["sherpa-onnx-kaldifst-core.lib"] diff --git a/setup.py b/setup.py index 7b21311be6..8807129329 100644 --- a/setup.py +++ b/setup.py @@ -65,6 +65,9 @@ def get_binaries_to_install(): binaries += ["sherpa-onnx-core.dll"] binaries += ["sherpa-onnx-portaudio.dll"] binaries += ["onnxruntime.dll"] + binaries += ["piper_phonemize.dll"] + binaries += ["espeak-ng.dll"] + binaries += ["ucd.dll"] binaries += ["kaldi-decoder-core.dll"] binaries += ["sherpa-onnx-fst.lib"] binaries += ["sherpa-onnx-kaldifst-core.lib"] From 397c32fd596b6e9d9eb4c19a6842fa7804a6befe Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 30 Nov 2023 10:42:34 +0800 Subject: [PATCH 05/19] test building wheels --- .github/workflows/test-build-wheel.yaml | 80 +++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 .github/workflows/test-build-wheel.yaml diff --git a/.github/workflows/test-build-wheel.yaml b/.github/workflows/test-build-wheel.yaml new file mode 100644 index 0000000000..ac5c38dccb --- /dev/null +++ b/.github/workflows/test-build-wheel.yaml @@ -0,0 +1,80 @@ +name: test-build-wheel + +on: + push: + branches: + - english-piper-phonemize-2 + + pull_request: + + workflow_dispatch: + +concurrency: + group: test-build-wheel-${{ github.ref }} + cancel-in-progress: true + +jobs: + test-build-wheel: + name: ${{ matrix.os }} ${{ matrix.python_version }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + python-version: ["3.8", "3.9", "3.10", "3.11"] + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2 + with: + key: ${{ matrix.os }}-${{ matrix.python_version }} + + - name: Install python dependencies + shell: bash + run: | + python3 -m pip install --upgrade pip + python3 -m pip install wheel twine setuptools + + - name: Build + shell: bash + run: | + export CMAKE_CXX_COMPILER_LAUNCHER=ccache + export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH" + cmake --version + + export SHERPA_ONNX_MAKE_ARGS="VERBOSE=1 -j" + + python3 setup.py bdist_wheel + ls -lh dist + + - name: Display wheel + shell: bash + run: | + ls -lh dist + + - name: Install wheel + shell: bash + run: | + pip install --verbose ./dist/*.whl + + - name: Test + shell: bash + run: | + # For windows + export PATH=/c/hostedtoolcache/windows/Python/3.7.9/x64/bin:$PATH + export PATH=/c/hostedtoolcache/windows/Python/3.8.10/x64/bin:$PATH + export PATH=/c/hostedtoolcache/windows/Python/3.9.13/x64/bin:$PATH + export PATH=/c/hostedtoolcache/windows/Python/3.10.11/x64/bin:$PATH + export PATH=/c/hostedtoolcache/windows/Python/3.11.6/x64/bin:$PATH + + which sherpa-onnx + sherpa-onnx --help From 826a6d15f1e4cfe54889eb392d122bfdd87dde94 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 30 Nov 2023 14:48:13 +0800 Subject: [PATCH 06/19] refactor lexicon --- python-api-examples/offline-tts.py | 12 ++ sherpa-onnx/csrc/lexicon.cc | 10 +- sherpa-onnx/csrc/lexicon.h | 22 ++-- sherpa-onnx/csrc/offline-tts-frontend.h | 35 ++++++ sherpa-onnx/csrc/offline-tts-vits-impl.h | 122 ++++++++++++++++---- sherpa-onnx/csrc/offline-tts.cc | 9 +- sherpa-onnx/csrc/offline-tts.h | 11 +- sherpa-onnx/csrc/piper-phonemize-lexicon.cc | 9 +- sherpa-onnx/csrc/piper-phonemize-lexicon.h | 8 +- sherpa-onnx/python/csrc/offline-tts.cc | 7 +- 10 files changed, 195 insertions(+), 50 deletions(-) create mode 100644 sherpa-onnx/csrc/offline-tts-frontend.h diff --git a/python-api-examples/offline-tts.py b/python-api-examples/offline-tts.py index 265b09af20..c806ce4cba 100755 --- a/python-api-examples/offline-tts.py +++ b/python-api-examples/offline-tts.py @@ -89,6 +89,17 @@ def get_args(): help="Path to rule.fst", ) + parser.add_argument( + "--max-num-sentences", + type=int, + default=2, + help="""Max number of sentences in a batch to avoid OOM if the input + text is very long. Set it to -1 to process all the sentences in a + single batch. A smaller value does not mean it is slower compared + to a larger one on CPU. + """, + ) + parser.add_argument( "--output-filename", type=str, @@ -160,6 +171,7 @@ def main(): num_threads=args.num_threads, ), rule_fsts=args.tts_rule_fsts, + max_num_sentences=args.max_num_sentences, ) if not tts_config.validate(): raise ValueError("Please check your config") diff --git a/sherpa-onnx/csrc/lexicon.cc b/sherpa-onnx/csrc/lexicon.cc index ea0ed06599..0b57e3f46c 100644 --- a/sherpa-onnx/csrc/lexicon.cc +++ b/sherpa-onnx/csrc/lexicon.cc @@ -129,7 +129,7 @@ Lexicon::Lexicon(AAssetManager *mgr, const std::string &lexicon, } #endif -std::vector Lexicon::ConvertTextToTokenIds( +std::vector> Lexicon::ConvertTextToTokenIds( const std::string &text, const std::string & /*voice*/ /*= ""*/) const { switch (language_) { case Language::kEnglish: @@ -150,7 +150,7 @@ std::vector Lexicon::ConvertTextToTokenIds( return {}; } -std::vector Lexicon::ConvertTextToTokenIdsChinese( +std::vector> Lexicon::ConvertTextToTokenIdsChinese( const std::string &text) const { std::vector words; if (pattern_) { @@ -245,10 +245,10 @@ std::vector Lexicon::ConvertTextToTokenIdsChinese( ans.push_back(eos); } - return ans; + return {ans}; } -std::vector Lexicon::ConvertTextToTokenIdsEnglish( +std::vector> Lexicon::ConvertTextToTokenIdsEnglish( const std::string &_text) const { std::string text(_text); ToLowerCase(&text); @@ -301,7 +301,7 @@ std::vector Lexicon::ConvertTextToTokenIdsEnglish( ans.push_back(token2id_.at("$")); // eos } - return ans; + return {ans}; } void Lexicon::InitTokens(std::istream &is) { token2id_ = ReadTokens(is); } diff --git a/sherpa-onnx/csrc/lexicon.h b/sherpa-onnx/csrc/lexicon.h index 38ddc98cf4..d219aaaec1 100644 --- a/sherpa-onnx/csrc/lexicon.h +++ b/sherpa-onnx/csrc/lexicon.h @@ -13,6 +13,8 @@ #include #include +#include "sherpa-onnx/csrc/offline-tts-frontend.h" + #if __ANDROID_API__ >= 9 #include "android/asset_manager.h" #include "android/asset_manager_jni.h" @@ -20,11 +22,11 @@ namespace sherpa_onnx { -// TODO(fangjun): Refactor it to an abstract class -class Lexicon { +class Lexicon : public OfflineTtsFrontend { public: - virtual ~Lexicon() = default; Lexicon() = default; // for subclasses + // + // Note: for models from piper, we won't use this class. Lexicon(const std::string &lexicon, const std::string &tokens, const std::string &punctuations, const std::string &language, bool debug = false, bool is_piper = false); @@ -36,29 +38,29 @@ class Lexicon { bool is_piper = false); #endif - virtual std::vector ConvertTextToTokenIds( - const std::string &text, const std::string &voice = "") const; + std::vector> ConvertTextToTokenIds( + const std::string &text, const std::string &voice = "") const override; private: - std::vector ConvertTextToTokenIdsGerman( + std::vector> ConvertTextToTokenIdsGerman( const std::string &text) const { return ConvertTextToTokenIdsEnglish(text); } - std::vector ConvertTextToTokenIdsSpanish( + std::vector> ConvertTextToTokenIdsSpanish( const std::string &text) const { return ConvertTextToTokenIdsEnglish(text); } - std::vector ConvertTextToTokenIdsFrench( + std::vector> ConvertTextToTokenIdsFrench( const std::string &text) const { return ConvertTextToTokenIdsEnglish(text); } - std::vector ConvertTextToTokenIdsEnglish( + std::vector> ConvertTextToTokenIdsEnglish( const std::string &text) const; - std::vector ConvertTextToTokenIdsChinese( + std::vector> ConvertTextToTokenIdsChinese( const std::string &text) const; void InitLanguage(const std::string &lang); diff --git a/sherpa-onnx/csrc/offline-tts-frontend.h b/sherpa-onnx/csrc/offline-tts-frontend.h new file mode 100644 index 0000000000..9f116f1256 --- /dev/null +++ b/sherpa-onnx/csrc/offline-tts-frontend.h @@ -0,0 +1,35 @@ +// sherpa-onnx/csrc/offline-tts-frontend.h +// +// Copyright (c) 2023 Xiaomi Corporation + +#ifndef SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_ +#define SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_ +#include +#include +#include + +namespace sherpa_onnx { + +class OfflineTtsFrontend { + public: + virtual ~OfflineTtsFrontend() = default; + + /** Convert a string to token IDs. + * + * @param text The input text. + * Example 1: "This is the first sample sentence; this is the + * second one." Example 2: "这是第一句。这是第二句。" + * @param voice Optional. It is for espeak-ng. + * + * @return Return a vector-of-vector of token IDs. Each subvector contains + * a sentence that can be processed independently. + * If a frontend does not support splitting the text into sentences, + * the resulting vector contains only one subvector. + */ + virtual std::vector> ConvertTextToTokenIds( + const std::string &text, const std::string &voice = "") const = 0; +}; + +} // namespace sherpa_onnx + +#endif // SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_ diff --git a/sherpa-onnx/csrc/offline-tts-vits-impl.h b/sherpa-onnx/csrc/offline-tts-vits-impl.h index 20d9c37ac6..c6e9cd8ffb 100644 --- a/sherpa-onnx/csrc/offline-tts-vits-impl.h +++ b/sherpa-onnx/csrc/offline-tts-vits-impl.h @@ -18,6 +18,7 @@ #include "kaldifst/csrc/text-normalizer.h" #include "sherpa-onnx/csrc/lexicon.h" #include "sherpa-onnx/csrc/macros.h" +#include "sherpa-onnx/csrc/offline-tts-frontend.h" #include "sherpa-onnx/csrc/offline-tts-impl.h" #include "sherpa-onnx/csrc/offline-tts-vits-model.h" #include "sherpa-onnx/csrc/onnx-utils.h" @@ -50,7 +51,7 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { OfflineTtsVitsImpl(AAssetManager *mgr, const OfflineTtsConfig &config) : config_(config), model_(std::make_unique(mgr, config.model)), - lexicon_(std::make_unique( + frontend_(std::make_unique( mgr, config.model.vits.lexicon, config.model.vits.tokens, model_->Punctuations(), model_->Language(), config.model.debug, model_->IsPiper())) { @@ -102,21 +103,107 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { } } - std::vector x = - lexicon_->ConvertTextToTokenIds(text, model_->Voice()); - if (x.empty()) { + std::vector> x = + frontend_->ConvertTextToTokenIds(text, model_->Voice()); + + if (x.empty() || (x.size() == 1 && x[0].empty())) { SHERPA_ONNX_LOGE("Failed to convert %s to token IDs", text.c_str()); return {}; } if (model_->AddBlank() && config_.model.vits.data_dir.empty()) { - std::vector buffer(x.size() * 2 + 1); - int32_t i = 1; - for (auto k : x) { - buffer[i] = k; - i += 2; + for (auto &k : x) { + k = AddBlank(k); + } + } + + int32_t x_size = static_cast(x.size()); + + if (config_.max_num_sentences <= 0 || x_size <= config_.max_num_sentences) { + return Process(x, sid, speed); + } + + // the input text is too long, we process sentences within it in batches + // to avoid OOM. Batch size is config_.max_num_sentences + std::vector> batch; + int32_t batch_size = config_.max_num_sentences; + batch.reserve(config_.max_num_sentences); + int32_t num_batches = x_size / batch_size; + + if (config_.model.debug) { + SHERPA_ONNX_LOGE( + "Text is too long. Split it into %d batches. batch size: %d. Number " + "of sentences: %d", + num_batches, batch_size, x_size); + } + + GeneratedAudio ans; + + int32_t k = 0; + + for (int32_t b = 0; b != num_batches; ++b) { + batch.clear(); + for (int32_t i = 0; i != batch_size; ++i, ++k) { + batch.push_back(std::move(x[k])); } - x = std::move(buffer); + + auto audio = Process(batch, sid, speed); + ans.sample_rate = audio.sample_rate; + ans.samples.insert(ans.samples.end(), audio.samples.begin(), + audio.samples.end()); + } + + batch.clear(); + while (k < x.size()) { + batch.push_back(std::move(x[k])); + ++k; + } + + if (!batch.empty()) { + auto audio = Process(batch, sid, speed); + ans.sample_rate = audio.sample_rate; + ans.samples.insert(ans.samples.end(), audio.samples.begin(), + audio.samples.end()); + } + + return ans; + } + + private: + void InitLexicon() { + if (model_->IsPiper() && !config_.model.vits.data_dir.empty()) { + frontend_ = std::make_unique( + config_.model.vits.tokens, config_.model.vits.data_dir); + } else { + frontend_ = std::make_unique( + config_.model.vits.lexicon, config_.model.vits.tokens, + model_->Punctuations(), model_->Language(), config_.model.debug, + model_->IsPiper()); + } + } + + std::vector AddBlank(const std::vector &x) const { + // we assume the blank ID is 0 + std::vector buffer(x.size() * 2 + 1); + int32_t i = 1; + for (auto k : x) { + buffer[i] = k; + i += 2; + } + return buffer; + } + + GeneratedAudio Process(const std::vector> &tokens, + int32_t sid, float speed) const { + int32_t num_tokens = 0; + for (const auto &k : tokens) { + num_tokens += k.size(); + } + + std::vector x; + x.reserve(num_tokens); + for (const auto &k : tokens) { + x.insert(x.end(), k.begin(), k.end()); } auto memory_info = @@ -145,24 +232,11 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { return ans; } - private: - void InitLexicon() { - if (model_->IsPiper() && !config_.model.vits.data_dir.empty()) { - lexicon_ = std::make_unique( - config_.model.vits.tokens, config_.model.vits.data_dir); - } else { - lexicon_ = std::make_unique( - config_.model.vits.lexicon, config_.model.vits.tokens, - model_->Punctuations(), model_->Language(), config_.model.debug, - model_->IsPiper()); - } - } - private: OfflineTtsConfig config_; std::unique_ptr model_; std::vector> tn_list_; - std::unique_ptr lexicon_; + std::unique_ptr frontend_; }; } // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/offline-tts.cc b/sherpa-onnx/csrc/offline-tts.cc index b8536e26ec..aaaf22fbbc 100644 --- a/sherpa-onnx/csrc/offline-tts.cc +++ b/sherpa-onnx/csrc/offline-tts.cc @@ -21,6 +21,12 @@ void OfflineTtsConfig::Register(ParseOptions *po) { "Multiple filenames are separated by a comma and they are " "applied from left to right. An example value: " "rule1.fst,rule2,fst,rule3.fst"); + + po->Register( + "tts-max-num-sentences", &max_num_sentences, + "Maximum number of sentences that we process at a time. " + "This is to avoid OOM for very long input text. " + "If you set it to -1, then we process all sentences in a single batch."); } bool OfflineTtsConfig::Validate() const { @@ -43,7 +49,8 @@ std::string OfflineTtsConfig::ToString() const { os << "OfflineTtsConfig("; os << "model=" << model.ToString() << ", "; - os << "rule_fsts=\"" << rule_fsts << "\")"; + os << "rule_fsts=\"" << rule_fsts << "\", "; + os << "max_num_sentences=" << max_num_sentences << ")"; return os.str(); } diff --git a/sherpa-onnx/csrc/offline-tts.h b/sherpa-onnx/csrc/offline-tts.h index f581ea04ed..20a86913f7 100644 --- a/sherpa-onnx/csrc/offline-tts.h +++ b/sherpa-onnx/csrc/offline-tts.h @@ -28,10 +28,17 @@ struct OfflineTtsConfig { // If there are multiple rules, they are applied from left to right. std::string rule_fsts; + // Maximum number of sentences that we process at a time. + // This is to avoid OOM for very long input text. + // If you set it to -1, then we process all sentences in a single batch. + int32_t max_num_sentences = 2; + OfflineTtsConfig() = default; OfflineTtsConfig(const OfflineTtsModelConfig &model, - const std::string &rule_fsts) - : model(model), rule_fsts(rule_fsts) {} + const std::string &rule_fsts, int32_t max_num_sentences) + : model(model), + rule_fsts(rule_fsts), + max_num_sentences(max_num_sentences) {} void Register(ParseOptions *po); bool Validate() const; diff --git a/sherpa-onnx/csrc/piper-phonemize-lexicon.cc b/sherpa-onnx/csrc/piper-phonemize-lexicon.cc index 91d3eee801..cd8f6027b5 100644 --- a/sherpa-onnx/csrc/piper-phonemize-lexicon.cc +++ b/sherpa-onnx/csrc/piper-phonemize-lexicon.cc @@ -9,6 +9,9 @@ #include #include // NOLINT #include +#include +#include +#include #include "espeak-ng/speak_lib.h" #include "phoneme_ids.hpp" @@ -117,7 +120,7 @@ PiperPhonemizeLexicon::PiperPhonemizeLexicon(const std::string &tokens, InitEspeak(data_dir_); } -std::vector PiperPhonemizeLexicon::ConvertTextToTokenIds( +std::vector> PiperPhonemizeLexicon::ConvertTextToTokenIds( const std::string &text, const std::string &voice /*= ""*/) const { piper::eSpeakPhonemeConfig config; @@ -128,12 +131,12 @@ std::vector PiperPhonemizeLexicon::ConvertTextToTokenIds( std::vector> phonemes; piper::phonemize_eSpeak(text, config, phonemes); - std::vector ans; + std::vector> ans; std::vector phoneme_ids; for (const auto &p : phonemes) { phoneme_ids = PhonemesToIds(token2id_, p); - ans.insert(ans.end(), phoneme_ids.begin(), phoneme_ids.end()); + ans.push_back(std::move(phoneme_ids)); } return ans; diff --git a/sherpa-onnx/csrc/piper-phonemize-lexicon.h b/sherpa-onnx/csrc/piper-phonemize-lexicon.h index 627fc9397b..9e2dd5584b 100644 --- a/sherpa-onnx/csrc/piper-phonemize-lexicon.h +++ b/sherpa-onnx/csrc/piper-phonemize-lexicon.h @@ -5,18 +5,20 @@ #ifndef SHERPA_ONNX_CSRC_PIPER_PHONEMIZE_LEXICON_H_ #define SHERPA_ONNX_CSRC_PIPER_PHONEMIZE_LEXICON_H_ +#include #include +#include -#include "sherpa-onnx/csrc/lexicon.h" +#include "sherpa-onnx/csrc/offline-tts-frontend.h" namespace sherpa_onnx { -class PiperPhonemizeLexicon : public Lexicon { +class PiperPhonemizeLexicon : public OfflineTtsFrontend { public: explicit PiperPhonemizeLexicon(const std::string &tokens, const std::string &data_dir); - std::vector ConvertTextToTokenIds( + std::vector> ConvertTextToTokenIds( const std::string &text, const std::string &voice = "") const override; private: diff --git a/sherpa-onnx/python/csrc/offline-tts.cc b/sherpa-onnx/python/csrc/offline-tts.cc index 39669a0e4c..e191c081b2 100644 --- a/sherpa-onnx/python/csrc/offline-tts.cc +++ b/sherpa-onnx/python/csrc/offline-tts.cc @@ -30,10 +30,13 @@ static void PybindOfflineTtsConfig(py::module *m) { using PyClass = OfflineTtsConfig; py::class_(*m, "OfflineTtsConfig") .def(py::init<>()) - .def(py::init(), - py::arg("model"), py::arg("rule_fsts") = "") + .def(py::init(), + py::arg("model"), py::arg("rule_fsts") = "", + py::arg("max_num_sentences") = 2) .def_readwrite("model", &PyClass::model) .def_readwrite("rule_fsts", &PyClass::rule_fsts) + .def_readwrite("max_num_sentences", &PyClass::max_num_sentences) .def("validate", &PyClass::Validate) .def("__str__", &PyClass::ToString); } From 0a19b9c44f34e0f9f231dd5930d108ac695626bc Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 30 Nov 2023 15:40:08 +0800 Subject: [PATCH 07/19] fix ci for c api --- .github/scripts/test-offline-tts.sh | 18 +++++++++++ c-api-examples/offline-tts-c-api.c | 50 +++++++++++++++++++++++++++-- sherpa-onnx/c-api/c-api.cc | 3 ++ sherpa-onnx/c-api/c-api.h | 2 ++ 4 files changed, 71 insertions(+), 2 deletions(-) diff --git a/.github/scripts/test-offline-tts.sh b/.github/scripts/test-offline-tts.sh index 15be2d9210..dca90b1d3f 100755 --- a/.github/scripts/test-offline-tts.sh +++ b/.github/scripts/test-offline-tts.sh @@ -16,6 +16,24 @@ which $EXE # test waves are saved in ./tts mkdir ./tts +log "------------------------------------------------------------" +log "vits-piper-en_US-amy-low" +log "------------------------------------------------------------" +curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 +tar xf vits-piper-en_US-amy-low.tar.bz2 +rm vits-piper-en_US-amy-low.tar.bz2 + +$EXE \ + --vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \ + --vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \ + --vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \ + --debug=1 \ + --output-filename=./tts/amy.wav \ + "“Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.” The sun shone bleakly in the sky, its meager light struggling to penetrate the thick foliage of the forest. Birds sang their songs up in the crowns of the trees, fluttering from one branch to the other. A blanket of total tranquility lied over the forest. The peace was only broken by the steady gallop of the horses of the soldiers who were traveling to their upcoming knighting the morrow at Camelot, and rowdy conversation. “Finally we will get what we deserve,” “It’s been about time,” Perceval agreed. “We’ve been risking our arses for the past two years. It’s the least they could give us.” Merlin remained ostensibly silent, refusing to join the verbal parade of self-aggrandizing his fellow soldiers have engaged in. He found it difficult to happy about anything, when even if they had won the war, he had lost everything else in the process." + +file ./tts/amy.wav +rm -rf vits-piper-en_US-amy-low + log "------------------------------------------------------------" log "vits-ljs test" log "------------------------------------------------------------" diff --git a/c-api-examples/offline-tts-c-api.c b/c-api-examples/offline-tts-c-api.c index c4e9be62bd..7fbdb004ca 100644 --- a/c-api-examples/offline-tts-c-api.c +++ b/c-api-examples/offline-tts-c-api.c @@ -65,6 +65,29 @@ static struct cag_option options[] = { .identifier = 'a', .description = "Filename to save the generated audio. Default to ./generated.wav"}, + + {.access_name = "tts-rule-fsts", + .value_name = "/path/to/rule.fst", + .identifier = 'b', + .description = "It not empty, it contains a list of rule FST filenames." + "Multiple filenames are separated by a comma and they are " + "applied from left to right. An example value: " + "rule1.fst,rule2,fst,rule3.fst"}, + + {.access_name = "max-num-sentences", + .value_name = "2", + .identifier = 'c', + .description = "Maximum number of sentences that we process at a time. " + "This is to avoid OOM for very long input text. " + "If you set it to -1, then we process all sentences in a " + "single batch."}, + + {.access_name = "vits-data-dir", + .value_name = "/path/to/espeak-ng-data", + .identifier = 'd', + .description = + "Path to espeak-ng-data. If it is given, --vits-lexicon is ignored"}, + }; static void ShowUsage() { @@ -163,15 +186,38 @@ int32_t main(int32_t argc, char *argv[]) { free((void *)filename); filename = strdup(value); break; + case 'b': + config.rule_fsts = value; + break; + case 'c': + config.max_num_sentences = atoi(value); + break; + case 'd': + config.model.vits.data_dir = value; + break; + case '?': + fprintf(stderr, "Unknown option\n"); + // fall through case 'h': // fall through default: ShowUsage(); } } + fprintf(stderr, "here\n"); + + if (!config.model.vits.model) { + fprintf(stderr, "Please provide --vits-model\n"); + ShowUsage(); + } + + if (!config.model.vits.tokens) { + fprintf(stderr, "Please provide --vits-tokens\n"); + ShowUsage(); + } - if (!config.model.vits.model || !config.model.vits.lexicon || - !config.model.vits.tokens) { + if (!config.model.vits.data_dir && !config.model.vits.lexicon) { + fprintf(stderr, "Please provide --vits-data-dir or --vits-lexicon\n"); ShowUsage(); } diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index a88063defb..9c796cd43f 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -547,6 +547,8 @@ SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( tts_config.model.vits.lexicon = SHERPA_ONNX_OR(config->model.vits.lexicon, ""); tts_config.model.vits.tokens = SHERPA_ONNX_OR(config->model.vits.tokens, ""); + tts_config.model.vits.data_dir = + SHERPA_ONNX_OR(config->model.vits.data_dir, ""); tts_config.model.vits.noise_scale = SHERPA_ONNX_OR(config->model.vits.noise_scale, 0.667); tts_config.model.vits.noise_scale_w = @@ -558,6 +560,7 @@ SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTts( tts_config.model.debug = config->model.debug; tts_config.model.provider = SHERPA_ONNX_OR(config->model.provider, "cpu"); tts_config.rule_fsts = SHERPA_ONNX_OR(config->rule_fsts, ""); + tts_config.max_num_sentences = SHERPA_ONNX_OR(config->max_num_sentences, 2); if (tts_config.model.debug) { fprintf(stderr, "%s\n", tts_config.ToString().c_str()); diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h index 9fd48afded..66d7669e1a 100644 --- a/sherpa-onnx/c-api/c-api.h +++ b/sherpa-onnx/c-api/c-api.h @@ -607,6 +607,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsVitsModelConfig { const char *model; const char *lexicon; const char *tokens; + const char *data_dir; float noise_scale; float noise_scale_w; @@ -623,6 +624,7 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsModelConfig { SHERPA_ONNX_API typedef struct SherpaOnnxOfflineTtsConfig { SherpaOnnxOfflineTtsModelConfig model; const char *rule_fsts; + int32_t max_num_sentences; } SherpaOnnxOfflineTtsConfig; SHERPA_ONNX_API typedef struct SherpaOnnxGeneratedAudio { From 3233c5deea21dc831993f476e8e358b9a6a083be Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 30 Nov 2023 15:44:24 +0800 Subject: [PATCH 08/19] Fix CI error about wstring_convert not found --- sherpa-onnx/csrc/piper-phonemize-lexicon.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/sherpa-onnx/csrc/piper-phonemize-lexicon.cc b/sherpa-onnx/csrc/piper-phonemize-lexicon.cc index cd8f6027b5..03c6063dd0 100644 --- a/sherpa-onnx/csrc/piper-phonemize-lexicon.cc +++ b/sherpa-onnx/csrc/piper-phonemize-lexicon.cc @@ -6,6 +6,7 @@ #include #include +#include #include #include // NOLINT #include From 11835550d00abba82e5159663ab182f6f5335fa5 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 30 Nov 2023 16:04:04 +0800 Subject: [PATCH 09/19] Fix nodejs test --- .gitignore | 3 +++ nodejs-examples/.gitignore | 1 + nodejs-examples/README.md | 7 +++---- nodejs-examples/test-offline-tts-en.js | 13 +++++++------ scripts/nodejs/index.js | 2 ++ 5 files changed, 16 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index 0b429133de..d63a3eee8a 100644 --- a/.gitignore +++ b/.gitignore @@ -77,3 +77,6 @@ xcuserdata/ vits-vctk vits-zh-aishell3 jslint.mjs +vits-piper-en_US-amy-low +vits-piper-*-*-* +log diff --git a/nodejs-examples/.gitignore b/nodejs-examples/.gitignore index d5f19d89b3..1c2d5f338f 100644 --- a/nodejs-examples/.gitignore +++ b/nodejs-examples/.gitignore @@ -1,2 +1,3 @@ node_modules +lib package-lock.json diff --git a/nodejs-examples/README.md b/nodejs-examples/README.md index 1ee665e31b..647609a230 100644 --- a/nodejs-examples/README.md +++ b/nodejs-examples/README.md @@ -42,15 +42,14 @@ In the following, we demonstrate how to run text-to-speech. ## ./test-offline-tts-en.js [./test-offline-tts-en.js](./test-offline-tts-en.js) shows how to use -a VITS pretrained model -[VCTK](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#vctk-english-multi-speaker-109-speakers) +[vits-piper-en_US-amy-low.tar.bz2](https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2) for text-to-speech. You can use the following command to run it: ```bash -wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-vctk.tar.bz2 -tar xvf vits-vctk.tar.bz2 +wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 +tar xvf vits-piper-en_US-amy-low.tar.bz2 node ./test-offline-tts-en.js ``` diff --git a/nodejs-examples/test-offline-tts-en.js b/nodejs-examples/test-offline-tts-en.js index e44e1a55ca..8f0e0c02a6 100644 --- a/nodejs-examples/test-offline-tts-en.js +++ b/nodejs-examples/test-offline-tts-en.js @@ -4,9 +4,9 @@ const sherpa_onnx = require('sherpa-onnx'); function createOfflineTts() { const vits = new sherpa_onnx.OfflineTtsVitsModelConfig(); - vits.model = './vits-vctk/vits-vctk.onnx'; - vits.lexicon = './vits-vctk/lexicon.txt'; - vits.tokens = './vits-vctk/tokens.txt'; + vits.model = 'vits-piper-en_US-amy-low/en_US-amy-low.onnx' + vits.tokens = './vits-piper-en_US-amy-low/tokens.txt'; + vits.dataDir = './vits-piper-en_US-amy-low/espeak-ng-data' const modelConfig = new sherpa_onnx.OfflineTtsModelConfig(); modelConfig.vits = vits; @@ -18,10 +18,11 @@ function createOfflineTts() { } const tts = createOfflineTts(); -const speakerId = 99; +const speakerId = 0; const speed = 1.0; -const audio = - tts.generate('Good morning. How are you doing?', speakerId, speed); +const audio = tts.generate( + '“Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.”', + speakerId, speed); audio.save('./test-en.wav'); console.log('Saved to test-en.wav successfully.'); tts.free(); diff --git a/scripts/nodejs/index.js b/scripts/nodejs/index.js index ac77ae4a1a..b61f29550b 100644 --- a/scripts/nodejs/index.js +++ b/scripts/nodejs/index.js @@ -186,6 +186,7 @@ const SherpaOnnxOfflineTtsVitsModelConfig = StructType({ "model" : cstring, "lexicon" : cstring, "tokens" : cstring, + "dataDir" : cstring, "noiseScale" : float, "noiseScaleW" : float, "lengthScale" : float, @@ -201,6 +202,7 @@ const SherpaOnnxOfflineTtsModelConfig = StructType({ const SherpaOnnxOfflineTtsConfig = StructType({ "model" : SherpaOnnxOfflineTtsModelConfig, "ruleFsts" : cstring, + "maxNumSentences" : int32_t, }); const SherpaOnnxGeneratedAudio = StructType({ From c6ea9348ff4c99d5584ff648509597e6fd71b9ea Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 30 Nov 2023 16:12:26 +0800 Subject: [PATCH 10/19] fix nodejs CI --- .github/workflows/test-nodejs.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/test-nodejs.yaml b/.github/workflows/test-nodejs.yaml index c65ebecc82..8dd9440660 100644 --- a/.github/workflows/test-nodejs.yaml +++ b/.github/workflows/test-nodejs.yaml @@ -70,6 +70,10 @@ jobs: mkdir -p scripts/nodejs/lib/win-x64 dst=scripts/nodejs/lib/win-x64 fi + ls -lh build/install/lib/ + + rm -rf build/install/lib/pkgconfig + cp -v build/install/lib/* $dst/ - name: replace files From 81e916a6c04c8541f47c25c3ad7045e8928ba0ea Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 30 Nov 2023 16:15:06 +0800 Subject: [PATCH 11/19] remove extra files for pip install --- .github/scripts/test-nodejs-npm.sh | 12 +++--------- cmake/cmake_extension.py | 2 ++ 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/.github/scripts/test-nodejs-npm.sh b/.github/scripts/test-nodejs-npm.sh index 0dc799283f..7f4af4a6c5 100755 --- a/.github/scripts/test-nodejs-npm.sh +++ b/.github/scripts/test-nodejs-npm.sh @@ -52,14 +52,8 @@ node ./test-online-transducer.js rm -rf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 # offline tts -curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-vctk.tar.bz2 -tar xvf vits-vctk.tar.bz2 -rm vits-vctk.tar.bz2 -node ./test-offline-tts-en.js -rm -rf vits-vctk -curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2 -tar xvf vits-zh-aishell3.tar.bz2 -rm vits-zh-aishell3.tar.bz2 +curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 +tar xf vits-piper-en_US-amy-low.tar.bz2 node ./test-offline-tts-zh.js -rm -rf vits-zh-aishell3 +rm vits-piper-en_US-amy-low.tar.bz2 diff --git a/cmake/cmake_extension.py b/cmake/cmake_extension.py index ba8d2e7a20..008dbd67d4 100644 --- a/cmake/cmake_extension.py +++ b/cmake/cmake_extension.py @@ -169,5 +169,7 @@ def build_extension(self, ext: setuptools.extension.Extension): shutil.rmtree(f"{install_dir}/bin") shutil.rmtree(f"{install_dir}/share") + shutil.rmtree(f"{install_dir}/lib/pkgconfig") + if is_windows(): shutil.rmtree(f"{install_dir}/lib") From 3e18fecc72caa4220ff846ce92549838d111594d Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 30 Nov 2023 16:43:47 +0800 Subject: [PATCH 12/19] Fix swift api --- build-swift-macos.sh | 5 ++++- swift-api-examples/SherpaOnnx.swift | 8 ++++++-- swift-api-examples/run-tts.sh | 15 +++++---------- swift-api-examples/tts.swift | 14 ++++++++------ 4 files changed, 23 insertions(+), 19 deletions(-) diff --git a/build-swift-macos.sh b/build-swift-macos.sh index a1b06205c0..e5cdbc0866 100755 --- a/build-swift-macos.sh +++ b/build-swift-macos.sh @@ -29,4 +29,7 @@ libtool -static -o ./install/lib/libsherpa-onnx.a \ ./install/lib/libkaldi-native-fbank-core.a \ ./install/lib/libsherpa-onnx-fst.a \ ./install/lib/libsherpa-onnx-kaldifst-core.a \ - ./install/lib/libkaldi-decoder-core.a + ./install/lib/libkaldi-decoder-core.a \ + ./install/lib/libucd.a \ + ./install/lib/libpiper_phonemize.a \ + ./install/lib/libespeak-ng.a diff --git a/swift-api-examples/SherpaOnnx.swift b/swift-api-examples/SherpaOnnx.swift index 2b310b0390..cf7e69aade 100644 --- a/swift-api-examples/SherpaOnnx.swift +++ b/swift-api-examples/SherpaOnnx.swift @@ -578,6 +578,7 @@ func sherpaOnnxOfflineTtsVitsModelConfig( model: String, lexicon: String, tokens: String, + dataDir: String = "", noiseScale: Float = 0.667, noiseScaleW: Float = 0.8, lengthScale: Float = 1.0 @@ -586,6 +587,7 @@ func sherpaOnnxOfflineTtsVitsModelConfig( model: toCPointer(model), lexicon: toCPointer(lexicon), tokens: toCPointer(tokens), + data_dir: toCPointer(dataDir), noise_scale: noiseScale, noise_scale_w: noiseScaleW, length_scale: lengthScale) @@ -607,11 +609,13 @@ func sherpaOnnxOfflineTtsModelConfig( func sherpaOnnxOfflineTtsConfig( model: SherpaOnnxOfflineTtsModelConfig, - ruleFsts: String = "" + ruleFsts: String = "", + maxNumSenetences: Int = 2 ) -> SherpaOnnxOfflineTtsConfig { return SherpaOnnxOfflineTtsConfig( model: model, - rule_fsts: toCPointer(ruleFsts) + rule_fsts: toCPointer(ruleFsts), + max_num_sentences: Int32(maxNumSenetences) ) } diff --git a/swift-api-examples/run-tts.sh b/swift-api-examples/run-tts.sh index 2bfcc3e7fd..5604a43a84 100755 --- a/swift-api-examples/run-tts.sh +++ b/swift-api-examples/run-tts.sh @@ -7,17 +7,12 @@ if [ ! -d ../build-swift-macos ]; then exit 1 fi -if [ ! -d ./vits-vctk ]; then - echo "Please download the pre-trained model for testing." - echo "You can refer to" - echo "" - echo "https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#vctk-english-multi-speaker-109-speakers" - echo "" - echo "for help" +if [ ! -d ./vits-piper-en_US-amy-low ]; then + echo "Download a pre-trained model for testing." - wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-vctk.tar.bz2 - tar xvf vits-vctk.tar.bz2 - rm vits-vctk.tar.bz2 + wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 + tar xf vits-piper-en_US-amy-low.tar.bz2 + rm vits-piper-en_US-amy-low.tar.bz2 fi if [ ! -e ./tts ]; then diff --git a/swift-api-examples/tts.swift b/swift-api-examples/tts.swift index 28819646ab..61c68bce3a 100644 --- a/swift-api-examples/tts.swift +++ b/swift-api-examples/tts.swift @@ -1,18 +1,20 @@ func run() { - let model = "./vits-vctk/vits-vctk.onnx" - let lexicon = "./vits-vctk/lexicon.txt" - let tokens = "./vits-vctk/tokens.txt" + let model = "./vits-piper-en_US-amy-low/en_US-amy-low.onnx" + let tokens = "./vits-piper-en_US-amy-low/tokens.txt" + let dataDir = "./vits-piper-en_US-amy-low/espeak-ng-data" let vits = sherpaOnnxOfflineTtsVitsModelConfig( model: model, - lexicon: lexicon, - tokens: tokens + lexicon: "", + tokens: tokens, + dataDir: dataDir ) let modelConfig = sherpaOnnxOfflineTtsModelConfig(vits: vits) var ttsConfig = sherpaOnnxOfflineTtsConfig(model: modelConfig) let tts = SherpaOnnxOfflineTtsWrapper(config: &ttsConfig) - let text = "How are you doing? Fantastic!" + let text = + "“Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.”" let sid = 99 let speed: Float = 1.0 From 2b530da8ea47a06a1e25d256c49a421c80c51c85 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 30 Nov 2023 16:57:51 +0800 Subject: [PATCH 13/19] Fix pkg-config --- .github/scripts/test-nodejs-npm.sh | 7 ++++++- cmake/sherpa-onnx.pc.in | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/scripts/test-nodejs-npm.sh b/.github/scripts/test-nodejs-npm.sh index 7f4af4a6c5..8da89ffa17 100755 --- a/.github/scripts/test-nodejs-npm.sh +++ b/.github/scripts/test-nodejs-npm.sh @@ -55,5 +55,10 @@ rm -rf sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 tar xf vits-piper-en_US-amy-low.tar.bz2 -node ./test-offline-tts-zh.js +node ./test-offline-tts-en.js rm vits-piper-en_US-amy-low.tar.bz2 + +curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2 +tar xvf vits-zh-aishell3.tar.bz2 +node ./test-offline-tts-zh.js +rm vits-zh-aishell3.tar.bz2 diff --git a/cmake/sherpa-onnx.pc.in b/cmake/sherpa-onnx.pc.in index 1d38622e2a..b9c5af9cad 100644 --- a/cmake/sherpa-onnx.pc.in +++ b/cmake/sherpa-onnx.pc.in @@ -13,4 +13,4 @@ Cflags: -I"${includedir}" # Note: -lcargs is required only for the following file # https://github.com/k2-fsa/sherpa-onnx/blob/master/c-api-examples/decode-file-c-api.c # We add it here so that users don't need to specify -lcargs when compiling decode-file-c-api.c -Libs: -L"${libdir}" -lsherpa-onnx-c-api -lsherpa-onnx-core -lonnxruntime -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fst -lkaldi-native-fbank-core -lcargs -Wl,-rpath,${libdir} @SHERPA_ONNX_PKG_CONFIG_EXTRA_LIBS@ +Libs: -L"${libdir}" -lsherpa-onnx-c-api -lsherpa-onnx-core -lonnxruntime -lkaldi-decoder-core -lsherpa-onnx-kaldifst-core -lsherpa-onnx-fst -lkaldi-native-fbank-core -lpiper_phonemize -lespeak-ng -lucd -lcargs -Wl,-rpath,${libdir} @SHERPA_ONNX_PKG_CONFIG_EXTRA_LIBS@ From fb799f61146735ab79b03ce5c327a1dac11518a7 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 30 Nov 2023 17:57:35 +0800 Subject: [PATCH 14/19] Fix iOS --- build-ios.sh | 13 ++++++-- .../SherpaOnnx.xcodeproj/project.pbxproj | 2 +- .../SherpaOnnxTts/ContentView.swift | 2 +- .../SherpaOnnxTts/ViewModel.swift | 30 ++++++++++++++++++- 4 files changed, 41 insertions(+), 6 deletions(-) diff --git a/build-ios.sh b/build-ios.sh index 5f82c40b5a..bed00c471c 100755 --- a/build-ios.sh +++ b/build-ios.sh @@ -140,7 +140,8 @@ echo "Generate xcframework" mkdir -p "build/simulator/lib" for f in libkaldi-native-fbank-core.a libsherpa-onnx-c-api.a libsherpa-onnx-core.a \ - libsherpa-onnx-fst.a libsherpa-onnx-kaldifst-core.a libkaldi-decoder-core.a; do + libsherpa-onnx-fst.a libsherpa-onnx-kaldifst-core.a libkaldi-decoder-core.a \ + libucd.a libpiper_phonemize.a libespeak-ng.a; do lipo -create build/simulator_arm64/lib/${f} \ build/simulator_x86_64/lib/${f} \ -output build/simulator/lib/${f} @@ -154,7 +155,10 @@ libtool -static -o build/simulator/sherpa-onnx.a \ build/simulator/lib/libsherpa-onnx-core.a \ build/simulator/lib/libsherpa-onnx-fst.a \ build/simulator/lib/libsherpa-onnx-kaldifst-core.a \ - build/simulator/lib/libkaldi-decoder-core.a + build/simulator/lib/libkaldi-decoder-core.a \ + build/simulator/lib/libucd.a \ + build/simulator/lib/libpiper_phonemize.a \ + build/simulator/lib/libespeak-ng.a \ libtool -static -o build/os64/sherpa-onnx.a \ build/os64/lib/libkaldi-native-fbank-core.a \ @@ -162,7 +166,10 @@ libtool -static -o build/os64/sherpa-onnx.a \ build/os64/lib/libsherpa-onnx-core.a \ build/os64/lib/libsherpa-onnx-fst.a \ build/os64/lib/libsherpa-onnx-kaldifst-core.a \ - build/os64/lib/libkaldi-decoder-core.a + build/os64/lib/libkaldi-decoder-core.a \ + build/os64/lib/libucd.a \ + build/os64/lib/libpiper_phonemize.a \ + build/os64/lib/libespeak-ng.a \ rm -rf sherpa-onnx.xcframework diff --git a/ios-swift/SherpaOnnx/SherpaOnnx.xcodeproj/project.pbxproj b/ios-swift/SherpaOnnx/SherpaOnnx.xcodeproj/project.pbxproj index f74b87f45d..0d3be225f9 100644 --- a/ios-swift/SherpaOnnx/SherpaOnnx.xcodeproj/project.pbxproj +++ b/ios-swift/SherpaOnnx/SherpaOnnx.xcodeproj/project.pbxproj @@ -40,7 +40,7 @@ /* End PBXContainerItemProxy section */ /* Begin PBXFileReference section */ - C93989AF2A89FE33009AB859 /* onnxruntime.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = onnxruntime.xcframework; path = "../../build-ios/ios-onnxruntime/1.16.0/onnxruntime.xcframework"; sourceTree = ""; }; + C93989AF2A89FE33009AB859 /* onnxruntime.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = onnxruntime.xcframework; path = "../../build-ios/ios-onnxruntime/1.16.3/onnxruntime.xcframework"; sourceTree = ""; }; C93989B12A89FF78009AB859 /* decoder.int8.onnx */ = {isa = PBXFileReference; lastKnownFileType = file; name = decoder.int8.onnx; path = "../../../icefall-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en/decoder.int8.onnx"; sourceTree = ""; }; C93989B22A89FF78009AB859 /* encoder.int8.onnx */ = {isa = PBXFileReference; lastKnownFileType = file; name = encoder.int8.onnx; path = "../../../icefall-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en/encoder.int8.onnx"; sourceTree = ""; }; C93989B32A89FF78009AB859 /* tokens.txt */ = {isa = PBXFileReference; lastKnownFileType = text; name = tokens.txt; path = "../../../icefall-models/sherpa-onnx-streaming-paraformer-bilingual-zh-en/tokens.txt"; sourceTree = ""; }; diff --git a/ios-swiftui/SherpaOnnxTts/SherpaOnnxTts/ContentView.swift b/ios-swiftui/SherpaOnnxTts/SherpaOnnxTts/ContentView.swift index debc8cb5e6..fca8db7ba5 100644 --- a/ios-swiftui/SherpaOnnxTts/SherpaOnnxTts/ContentView.swift +++ b/ios-swiftui/SherpaOnnxTts/SherpaOnnxTts/ContentView.swift @@ -65,7 +65,7 @@ struct ContentView: View { self.filename = tempDirectoryURL.appendingPathComponent("test.wav") } - let ret = audio.save(filename: filename.path) + let _ = audio.save(filename: filename.path) self.audioPlayer = try! AVAudioPlayer(contentsOf: filename) self.audioPlayer.play() diff --git a/ios-swiftui/SherpaOnnxTts/SherpaOnnxTts/ViewModel.swift b/ios-swiftui/SherpaOnnxTts/SherpaOnnxTts/ViewModel.swift index a42ecc78ba..f29de9e801 100644 --- a/ios-swiftui/SherpaOnnxTts/SherpaOnnxTts/ViewModel.swift +++ b/ios-swiftui/SherpaOnnxTts/SherpaOnnxTts/ViewModel.swift @@ -7,6 +7,12 @@ import Foundation + +// used to get the path to espeak-ng-data +func resourceURL(to path: String) -> String { + return URL(string: path, relativeTo: Bundle.main.resourceURL)!.path +} + func getResource(_ forResource: String, _ ofType: String) -> String { let path = Bundle.main.path(forResource: forResource, ofType: ofType) precondition( @@ -59,8 +65,30 @@ func getTtsForAishell3() -> SherpaOnnxOfflineTtsWrapper { return SherpaOnnxOfflineTtsWrapper(config: &config) } +// https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models +func getTtsFor_en_US_amy_low() -> SherpaOnnxOfflineTtsWrapper { + // please see https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 + + // vits-vctk.onnx + let model = getResource("en_US-amy-low", "onnx") + + // tokens.txt + let tokens = getResource("tokens", "txt") + + // in this case, we don't need lexicon.txt + let dataDir = resourceURL(to: "espeak-ng-data") + + let vits = sherpaOnnxOfflineTtsVitsModelConfig(model: model, lexicon: "", tokens: tokens, dataDir: dataDir) + let modelConfig = sherpaOnnxOfflineTtsModelConfig(vits: vits) + var config = sherpaOnnxOfflineTtsConfig(model: modelConfig) + + return SherpaOnnxOfflineTtsWrapper(config: &config) +} + func createOfflineTts() -> SherpaOnnxOfflineTtsWrapper { - return getTtsForVCTK() + return getTtsFor_en_US_amy_low() + + // return getTtsForVCTK() // return getTtsForAishell3() From f43d4f3e48edef45b6949780f208ee61a995a74d Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 30 Nov 2023 18:10:45 +0800 Subject: [PATCH 15/19] Fix kotlin --- .github/workflows/test-build-wheel.yaml | 2 +- .../app/src/main/java/com/k2fsa/sherpa/onnx/Tts.kt | 4 +++- kotlin-api-examples/Main.kt | 12 +++++++----- kotlin-api-examples/run.sh | 7 ++++--- sherpa-onnx/jni/jni.cc | 9 +++++++++ 5 files changed, 24 insertions(+), 10 deletions(-) diff --git a/.github/workflows/test-build-wheel.yaml b/.github/workflows/test-build-wheel.yaml index ac5c38dccb..ddbc409977 100644 --- a/.github/workflows/test-build-wheel.yaml +++ b/.github/workflows/test-build-wheel.yaml @@ -3,7 +3,7 @@ name: test-build-wheel on: push: branches: - - english-piper-phonemize-2 + - master pull_request: diff --git a/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/Tts.kt b/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/Tts.kt index cf6b1e2544..8dd75746bd 100644 --- a/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/Tts.kt +++ b/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/Tts.kt @@ -5,8 +5,9 @@ import android.content.res.AssetManager data class OfflineTtsVitsModelConfig( var model: String, - var lexicon: String, + var lexicon: String = "", var tokens: String, + var dataDir: String = "", var noiseScale: Float = 0.667f, var noiseScaleW: Float = 0.8f, var lengthScale: Float = 1.0f, @@ -22,6 +23,7 @@ data class OfflineTtsModelConfig( data class OfflineTtsConfig( var model: OfflineTtsModelConfig, var ruleFsts: String = "", + var maxNumSentences: Int = 2, ) class GeneratedAudio( diff --git a/kotlin-api-examples/Main.kt b/kotlin-api-examples/Main.kt index 69c41dfd10..4d7b5ff682 100644 --- a/kotlin-api-examples/Main.kt +++ b/kotlin-api-examples/Main.kt @@ -8,20 +8,22 @@ fun main() { } fun testTts() { + // see https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models + // https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 var config = OfflineTtsConfig( model=OfflineTtsModelConfig( vits=OfflineTtsVitsModelConfig( - model="./vits-zh-aishell3/vits-aishell3.onnx", - lexicon="./vits-zh-aishell3/lexicon.txt", - tokens="./vits-zh-aishell3/tokens.txt", + model="./vits-piper-en_US-amy-low/en_US-amy-low.onnx", + tokens="./vits-piper-en_US-amy-low/tokens.txt", + dataDir="./vits-piper-en_US-amy-low/espeak-ng-data", ), numThreads=1, debug=true, ) ) val tts = OfflineTts(config=config) - val audio = tts.generate(text="林美丽最美丽!", sid=99, speed=1.2f) - audio.save(filename="99.wav") + val audio = tts.generate(text="“Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.”") + audio.save(filename="test-en.wav") } fun testAsr() { diff --git a/kotlin-api-examples/run.sh b/kotlin-api-examples/run.sh index 21e36430a4..499221e7bf 100755 --- a/kotlin-api-examples/run.sh +++ b/kotlin-api-examples/run.sh @@ -34,9 +34,10 @@ if [ ! -f ./sherpa-onnx-streaming-zipformer-en-2023-02-21/tokens.txt ]; then git clone https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-en-2023-02-21 fi -if [ ! -f ./vits-zh-aishell3/tokens.txt ]; then - git lfs install - git clone https://huggingface.co/csukuangfj/vits-zh-aishell3 +if [ ! -f ./vits-piper-en_US-amy-low/en_US-amy-low.onnx ]; then + wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 + tar xf vits-piper-en_US-amy-low.tar.bz2 + rm vits-piper-en_US-amy-low.tar.bz2 fi kotlinc-jvm -include-runtime -d main.jar Main.kt WaveReader.kt SherpaOnnx.kt faked-asset-manager.kt Tts.kt diff --git a/sherpa-onnx/jni/jni.cc b/sherpa-onnx/jni/jni.cc index 0a039f6490..3e524803aa 100644 --- a/sherpa-onnx/jni/jni.cc +++ b/sherpa-onnx/jni/jni.cc @@ -545,6 +545,12 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) { ans.model.vits.tokens = p; env->ReleaseStringUTFChars(s, p); + fid = env->GetFieldID(vits_cls, "dataDir", "Ljava/lang/String;"); + s = (jstring)env->GetObjectField(vits, fid); + p = env->GetStringUTFChars(s, nullptr); + ans.model.vits.data_dir = p; + env->ReleaseStringUTFChars(s, p); + fid = env->GetFieldID(vits_cls, "noiseScale", "F"); ans.model.vits.noise_scale = env->GetFloatField(vits, fid); @@ -573,6 +579,9 @@ static OfflineTtsConfig GetOfflineTtsConfig(JNIEnv *env, jobject config) { ans.rule_fsts = p; env->ReleaseStringUTFChars(s, p); + fid = env->GetFieldID(cls, "maxNumSentences", "I"); + ans.max_num_sentences = env->GetIntField(config, fid); + return ans; } From 48189623a3f9ccca8b53e8c3cdc6fff602975e7f Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 30 Nov 2023 20:42:41 +0800 Subject: [PATCH 16/19] Fix Android --- .../app/src/main/AndroidManifest.xml | 2 + .../com/k2fsa/sherpa/onnx/MainActivity.kt | 98 ++++++++++++++++--- .../main/java/com/k2fsa/sherpa/onnx/Tts.kt | 15 ++- build-android-arm64-v8a.sh | 1 + build-android-armv7-eabi.sh | 1 + build-android-x86-64.sh | 1 + build-android-x86.sh | 1 + sherpa-onnx/csrc/lexicon.h | 4 +- sherpa-onnx/csrc/offline-tts-vits-impl.h | 24 +++-- sherpa-onnx/csrc/piper-phonemize-lexicon.cc | 24 +++++ sherpa-onnx/csrc/piper-phonemize-lexicon.h | 13 ++- sherpa-onnx/jni/jni.cc | 5 + 12 files changed, 162 insertions(+), 27 deletions(-) diff --git a/android/SherpaOnnxTts/app/src/main/AndroidManifest.xml b/android/SherpaOnnxTts/app/src/main/AndroidManifest.xml index 584bbec5e8..628423a266 100644 --- a/android/SherpaOnnxTts/app/src/main/AndroidManifest.xml +++ b/android/SherpaOnnxTts/app/src/main/AndroidManifest.xml @@ -2,6 +2,8 @@ + + ? + try { + assets = application.assets.list(path) + if (assets!!.isEmpty()) { + copyFile(path) + } else { + val fullPath = "${application.getExternalFilesDir(null)}/$path" + val dir = File(fullPath) + dir.mkdirs() + for (asset in assets.iterator()) { + val p: String = if (path == "") "" else path + "/" + copyAssets(p + asset) + } + } + } catch (ex: IOException) { + Log.e(TAG, "Failed to copy $path. ${ex.toString()}") + } + } + + private fun copyFile(filename: String) { + try { + val istream = application.assets.open(filename) + val newFilename = application.getExternalFilesDir(null).toString() + "/" + filename + val ostream = FileOutputStream(newFilename) + Log.i(TAG, "Copying $filename to $newFilename") + val buffer = ByteArray(1024) + var read = 0 + while (read != -1) { + ostream.write(buffer, 0, read) + read = istream.read(buffer) + } + istream.close() + ostream.flush() + ostream.close() + } catch (ex: Exception) { + Log.e(TAG, "Failed to copy $filename, ${ex.toString()}") + } } } diff --git a/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/Tts.kt b/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/Tts.kt index 8dd75746bd..132e8b06aa 100644 --- a/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/Tts.kt +++ b/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/Tts.kt @@ -119,18 +119,25 @@ class OfflineTts( // please refer to // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/index.html // to download models -fun getOfflineTtsConfig(modelDir: String, modelName: String, ruleFsts: String): OfflineTtsConfig? { +fun getOfflineTtsConfig( + modelDir: String, + modelName: String, + lexicon: String, + dataDir: String, + ruleFsts: String +): OfflineTtsConfig? { return OfflineTtsConfig( model = OfflineTtsModelConfig( vits = OfflineTtsVitsModelConfig( model = "$modelDir/$modelName", - lexicon = "$modelDir/lexicon.txt", - tokens = "$modelDir/tokens.txt" + lexicon = "$modelDir/$lexicon", + tokens = "$modelDir/tokens.txt", + dataDir = "$dataDir" ), numThreads = 2, debug = true, provider = "cpu", ), - ruleFsts=ruleFsts, + ruleFsts = ruleFsts, ) } diff --git a/build-android-arm64-v8a.sh b/build-android-arm64-v8a.sh index 8297561f7b..a424cbcaf7 100755 --- a/build-android-arm64-v8a.sh +++ b/build-android-arm64-v8a.sh @@ -92,3 +92,4 @@ cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" make -j4 make install/strip cp -fv android-onnxruntime-libs/jni/arm64-v8a/libonnxruntime.so install/lib +rm -rf install/lib/pkgconfig diff --git a/build-android-armv7-eabi.sh b/build-android-armv7-eabi.sh index 9157f4cdbd..fc93a6279e 100755 --- a/build-android-armv7-eabi.sh +++ b/build-android-armv7-eabi.sh @@ -92,3 +92,4 @@ cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" make -j4 make install/strip cp -fv android-onnxruntime-libs/jni/armeabi-v7a/libonnxruntime.so install/lib +rm -rf install/lib/pkgconfig diff --git a/build-android-x86-64.sh b/build-android-x86-64.sh index 02a7b9590f..728f38d28c 100755 --- a/build-android-x86-64.sh +++ b/build-android-x86-64.sh @@ -94,3 +94,4 @@ cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" make -j4 make install/strip cp -fv android-onnxruntime-libs/jni/x86_64/libonnxruntime.so install/lib +rm -rf install/lib/pkgconfig diff --git a/build-android-x86.sh b/build-android-x86.sh index 1b6194f37d..99091a17c3 100755 --- a/build-android-x86.sh +++ b/build-android-x86.sh @@ -94,3 +94,4 @@ cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" make -j4 make install/strip cp -fv android-onnxruntime-libs/jni/x86/libonnxruntime.so install/lib +rm -rf install/lib/pkgconfig diff --git a/sherpa-onnx/csrc/lexicon.h b/sherpa-onnx/csrc/lexicon.h index d219aaaec1..197b0afe6c 100644 --- a/sherpa-onnx/csrc/lexicon.h +++ b/sherpa-onnx/csrc/lexicon.h @@ -13,13 +13,13 @@ #include #include -#include "sherpa-onnx/csrc/offline-tts-frontend.h" - #if __ANDROID_API__ >= 9 #include "android/asset_manager.h" #include "android/asset_manager_jni.h" #endif +#include "sherpa-onnx/csrc/offline-tts-frontend.h" + namespace sherpa_onnx { class Lexicon : public OfflineTtsFrontend { diff --git a/sherpa-onnx/csrc/offline-tts-vits-impl.h b/sherpa-onnx/csrc/offline-tts-vits-impl.h index c6e9cd8ffb..49208bc96a 100644 --- a/sherpa-onnx/csrc/offline-tts-vits-impl.h +++ b/sherpa-onnx/csrc/offline-tts-vits-impl.h @@ -32,7 +32,7 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { explicit OfflineTtsVitsImpl(const OfflineTtsConfig &config) : config_(config), model_(std::make_unique(config.model)) { - InitLexicon(); + InitFrontend(); if (!config.rule_fsts.empty()) { std::vector files; @@ -50,11 +50,9 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { #if __ANDROID_API__ >= 9 OfflineTtsVitsImpl(AAssetManager *mgr, const OfflineTtsConfig &config) : config_(config), - model_(std::make_unique(mgr, config.model)), - frontend_(std::make_unique( - mgr, config.model.vits.lexicon, config.model.vits.tokens, - model_->Punctuations(), model_->Language(), config.model.debug, - model_->IsPiper())) { + model_(std::make_unique(mgr, config.model)) { + InitFrontend(mgr); + if (!config.rule_fsts.empty()) { std::vector files; SplitStringToVector(config.rule_fsts, ",", false, &files); @@ -170,7 +168,19 @@ class OfflineTtsVitsImpl : public OfflineTtsImpl { } private: - void InitLexicon() { + void InitFrontend(AAssetManager *mgr) { + if (model_->IsPiper() && !config_.model.vits.data_dir.empty()) { + frontend_ = std::make_unique( + mgr, config_.model.vits.tokens, config_.model.vits.data_dir); + } else { + frontend_ = std::make_unique( + mgr, config_.model.vits.lexicon, config_.model.vits.tokens, + model_->Punctuations(), model_->Language(), config_.model.debug, + model_->IsPiper()); + } + } + + void InitFrontend() { if (model_->IsPiper() && !config_.model.vits.data_dir.empty()) { frontend_ = std::make_unique( config_.model.vits.tokens, config_.model.vits.data_dir); diff --git a/sherpa-onnx/csrc/piper-phonemize-lexicon.cc b/sherpa-onnx/csrc/piper-phonemize-lexicon.cc index 03c6063dd0..4ad6374c0b 100644 --- a/sherpa-onnx/csrc/piper-phonemize-lexicon.cc +++ b/sherpa-onnx/csrc/piper-phonemize-lexicon.cc @@ -14,10 +14,18 @@ #include #include +#if __ANDROID_API__ >= 9 +#include + +#include "android/asset_manager.h" +#include "android/asset_manager_jni.h" +#endif + #include "espeak-ng/speak_lib.h" #include "phoneme_ids.hpp" #include "phonemize.hpp" #include "sherpa-onnx/csrc/macros.h" +#include "sherpa-onnx/csrc/onnx-utils.h" namespace sherpa_onnx { @@ -121,6 +129,22 @@ PiperPhonemizeLexicon::PiperPhonemizeLexicon(const std::string &tokens, InitEspeak(data_dir_); } +#if __ANDROID_API__ >= 9 +PiperPhonemizeLexicon::PiperPhonemizeLexicon(AAssetManager *mgr, + const std::string &tokens, + const std::string &data_dir) { + { + auto buf = ReadFile(mgr, tokens); + std::istrstream is(buf.data(), buf.size()); + token2id_ = ReadTokens(is); + } + + // We should copy the directory of espeak-ng-data from the asset to + // some internal or external storage and then pass the directory to data_dir. + InitEspeak(data_dir_); +} +#endif + std::vector> PiperPhonemizeLexicon::ConvertTextToTokenIds( const std::string &text, const std::string &voice /*= ""*/) const { piper::eSpeakPhonemeConfig config; diff --git a/sherpa-onnx/csrc/piper-phonemize-lexicon.h b/sherpa-onnx/csrc/piper-phonemize-lexicon.h index 9e2dd5584b..d2cdad2a83 100644 --- a/sherpa-onnx/csrc/piper-phonemize-lexicon.h +++ b/sherpa-onnx/csrc/piper-phonemize-lexicon.h @@ -9,14 +9,23 @@ #include #include +#if __ANDROID_API__ >= 9 +#include "android/asset_manager.h" +#include "android/asset_manager_jni.h" +#endif + #include "sherpa-onnx/csrc/offline-tts-frontend.h" namespace sherpa_onnx { class PiperPhonemizeLexicon : public OfflineTtsFrontend { public: - explicit PiperPhonemizeLexicon(const std::string &tokens, - const std::string &data_dir); + PiperPhonemizeLexicon(const std::string &tokens, const std::string &data_dir); + +#if __ANDROID_API__ >= 9 + PiperPhonemizeLexicon(AAssetManager *mgr, const std::string &tokens, + const std::string &data_dir); +#endif std::vector> ConvertTextToTokenIds( const std::string &text, const std::string &voice = "") const override; diff --git a/sherpa-onnx/jni/jni.cc b/sherpa-onnx/jni/jni.cc index 3e524803aa..a3c7952cff 100644 --- a/sherpa-onnx/jni/jni.cc +++ b/sherpa-onnx/jni/jni.cc @@ -598,6 +598,11 @@ JNIEXPORT jlong JNICALL Java_com_k2fsa_sherpa_onnx_OfflineTts_new( #endif auto config = sherpa_onnx::GetOfflineTtsConfig(env, _config); SHERPA_ONNX_LOGE("config:\n%s", config.ToString().c_str()); + + if (!config.Validate()) { + SHERPA_ONNX_LOGE("Erros found in config!"); + } + auto tts = new sherpa_onnx::SherpaOnnxOfflineTts( #if __ANDROID_API__ >= 9 mgr, From d9dd0d7843fbf81a6be166fb57abbde2d948c44e Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 30 Nov 2023 20:58:44 +0800 Subject: [PATCH 17/19] fix building apk --- .../com/k2fsa/sherpa/onnx/MainActivity.kt | 2 +- scripts/apk/build-apk-tts.sh.in | 18 +++-- scripts/apk/generate-tts-apk-script.py | 72 +++++++++++-------- 3 files changed, 55 insertions(+), 37 deletions(-) diff --git a/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt b/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt index 97fa64b78e..50caa49fce 100644 --- a/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt +++ b/android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx/MainActivity.kt @@ -186,7 +186,7 @@ class MainActivity : AppCompatActivity() { val istream = application.assets.open(filename) val newFilename = application.getExternalFilesDir(null).toString() + "/" + filename val ostream = FileOutputStream(newFilename) - Log.i(TAG, "Copying $filename to $newFilename") + // Log.i(TAG, "Copying $filename to $newFilename") val buffer = ByteArray(1024) var read = 0 while (read != -1) { diff --git a/scripts/apk/build-apk-tts.sh.in b/scripts/apk/build-apk-tts.sh.in index e6c873575e..acdf35a542 100644 --- a/scripts/apk/build-apk-tts.sh.in +++ b/scripts/apk/build-apk-tts.sh.in @@ -37,13 +37,9 @@ model_dir={{ tts_model.model_dir }} model_name={{ tts_model.model_name }} lang={{ tts_model.lang }} -mkdir $model_dir -cd $model_dir -wget -qq https://huggingface.co/csukuangfj/$model_dir/resolve/main/$model_name -wget -qq https://huggingface.co/csukuangfj/$model_dir/resolve/main/lexicon.txt -wget -qq https://huggingface.co/csukuangfj/$model_dir/resolve/main/tokens.txt -wget -qq https://huggingface.co/csukuangfj/$model_dir/resolve/main/MODEL_CARD 2>/dev/null || true -wget -qq https://huggingface.co/csukuangfj/$model_dir/resolve/main/rule.fst 2>/dev/null || true +wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/$model_dir.tar.bz2 +tar xf $model_dir.tar.bz2 +rm $model_dir.tar.bz2 popd # Now we are at the project root directory @@ -52,11 +48,19 @@ git checkout . pushd android/SherpaOnnxTts/app/src/main/java/com/k2fsa/sherpa/onnx sed -i.bak s/"modelDir = null"/"modelDir = \"$model_dir\""/ ./MainActivity.kt sed -i.bak s/"modelName = null"/"modelName = \"$model_name\""/ ./MainActivity.kt + {% if tts_model.rule_fsts %} rule_fsts={{ tts_model.rule_fsts }} sed -i.bak s%"ruleFsts = null"%"ruleFsts = \"$rule_fsts\""% ./MainActivity.kt {% endif %} +{% if tts_model.data_dir %} + data_dir={{ tts_model.data_dir }} + sed -i.bak s%"dataDir = null"%"dataDir = \"$dataDir\""% ./MainActivity.kt +{% else %} + sed -i.bak s/"lexicon = null"/"lexicon = \"lexicon.txt\""/ ./MainActivity.kt +{% endif %} + git diff popd diff --git a/scripts/apk/generate-tts-apk-script.py b/scripts/apk/generate-tts-apk-script.py index a9f94ae61d..06f44b37b2 100755 --- a/scripts/apk/generate-tts-apk-script.py +++ b/scripts/apk/generate-tts-apk-script.py @@ -27,9 +27,47 @@ def get_args(): @dataclass class TtsModel: model_dir: str - model_name: str - lang: str # en, zh, fr, de, etc. + model_name: str = "" + lang: str = "" # en, zh, fr, de, etc. rule_fsts: Optional[List[str]] = None + data_dir: Optional[str] = None + + +def get_piper_english_models() -> List[TtsModel]: + models = [ + TtsModel(model_dir="vits-piper-en_US-amy-low"), + TtsModel(model_dir="vits-piper-en_US-amy-medium"), + TtsModel(model_dir="vits-piper-en_US-arctic-medium"), + TtsModel(model_dir="vits-piper-en_US-danny-low"), + TtsModel(model_dir="vits-piper-en_US-hfc_male-medium"), + TtsModel(model_dir="vits-piper-en_US-joe-medium"), + TtsModel(model_dir="vits-piper-en_US-kathleen-low"), + TtsModel(model_dir="vits-piper-en_US-kusal-medium"), + TtsModel(model_dir="vits-piper-en_US-l2arctic-medium"), + TtsModel(model_dir="vits-piper-en_US-lessac-low"), + TtsModel(model_dir="vits-piper-en_US-lessac-medium"), + TtsModel(model_dir="vits-piper-en_US-lessac-high"), + TtsModel(model_dir="vits-piper-en_US-libritts-high"), + TtsModel(model_dir="vits-piper-en_US-libritts_r-medium"), + TtsModel(model_dir="vits-piper-en_US-ryan-low"), + TtsModel(model_dir="vits-piper-en_US-ryan-medium"), + TtsModel(model_dir="vits-piper-en_US-ryan-high"), + # English (GB) + TtsModel(model_dir="vits-piper-en_GB-alan-low"), + TtsModel(model_dir="vits-piper-en_GB-alan-medium"), + TtsModel(model_dir="vits-piper-en_GB-alba-medium"), + TtsModel(model_dir="vits-piper-en_GB-jenny_dioco-medium"), + TtsModel(model_dir="vits-piper-en_GB-northern_english_male-medium"), + TtsModel(model_dir="vits-piper-en_GB-semaine-medium"), + TtsModel(model_dir="vits-piper-en_GB-southern_english_female-low"), + TtsModel(model_dir="vits-piper-en_GB-vctk-medium"), + ] + for m in models: + m.data_dir = m.model_dir + "/" + "espeak-data-ng" + m.model_name = m.model_dir[len("vits-piper-") :] + ".onnx" + m.lang = "en" + + return models def get_all_models() -> List[TtsModel]: @@ -98,32 +136,7 @@ def get_all_models() -> List[TtsModel]: # English (US) TtsModel(model_dir="vits-vctk", model_name="vits-vctk.onnx", lang="en"), TtsModel(model_dir="vits-ljs", model_name="vits-ljs.onnx", lang="en"), - TtsModel(model_dir="vits-piper-en_US-amy-low", model_name="en_US-amy-low.onnx", lang="en",), - TtsModel(model_dir="vits-piper-en_US-amy-medium", model_name="en_US-amy-medium.onnx", lang="en",), - TtsModel(model_dir="vits-piper-en_US-arctic-medium", model_name="en_US-arctic-medium.onnx", lang="en",), - TtsModel(model_dir="vits-piper-en_US-danny-low", model_name="en_US-danny-low.onnx", lang="en",), - TtsModel(model_dir="vits-piper-en_US-hfc_male-medium", model_name="en_US-hfc_male-medium.onnx", lang="en",), - TtsModel(model_dir="vits-piper-en_US-joe-medium", model_name="en_US-joe-medium.onnx", lang="en",), - TtsModel(model_dir="vits-piper-en_US-kathleen-low", model_name="en_US-kathleen-low.onnx", lang="en",), - TtsModel(model_dir="vits-piper-en_US-kusal-medium", model_name="en_US-kusal-medium.onnx", lang="en",), - TtsModel(model_dir="vits-piper-en_US-l2arctic-medium", model_name="en_US-l2arctic-medium.onnx", lang="en",), - TtsModel(model_dir="vits-piper-en_US-lessac-low", model_name="en_US-lessac-low.onnx", lang="en",), - TtsModel(model_dir="vits-piper-en_US-lessac-medium", model_name="en_US-lessac-medium.onnx", lang="en",), - TtsModel(model_dir="vits-piper-en_US-lessac-high", model_name="en_US-lessac-high.onnx", lang="en",), - TtsModel(model_dir="vits-piper-en_US-libritts-high", model_name="en_US-libritts-high.onnx", lang="en",), - TtsModel(model_dir="vits-piper-en_US-libritts_r-medium", model_name="en_US-libritts_r-medium.onnx", lang="en",), - TtsModel(model_dir="vits-piper-en_US-ryan-low", model_name="en_US-ryan-low.onnx", lang="en",), - TtsModel(model_dir="vits-piper-en_US-ryan-medium", model_name="en_US-ryan-medium.onnx", lang="en",), - TtsModel(model_dir="vits-piper-en_US-ryan-high", model_name="en_US-ryan-high.onnx", lang="en",), - # English (GB) - TtsModel(model_dir="vits-piper-en_GB-alan-low", model_name="en_GB-alan-low.onnx",lang="en",), - TtsModel(model_dir="vits-piper-en_GB-alan-medium", model_name="en_GB-alan-medium.onnx",lang="en",), - TtsModel(model_dir="vits-piper-en_GB-alba-medium", model_name="en_GB-alba-medium.onnx",lang="en",), - TtsModel(model_dir="vits-piper-en_GB-jenny_dioco-medium", model_name="en_GB-jenny_dioco-medium.onnx",lang="en",), - TtsModel(model_dir="vits-piper-en_GB-northern_english_male-medium", model_name="en_GB-northern_english_male-medium.onnx",lang="en",), - TtsModel(model_dir="vits-piper-en_GB-semaine-medium", model_name="en_GB-semaine-medium.onnx",lang="en",), - TtsModel(model_dir="vits-piper-en_GB-southern_english_female-low", model_name="en_GB-southern_english_female-low.onnx",lang="en",), - TtsModel(model_dir="vits-piper-en_GB-vctk-medium", model_name="en_GB-vctk-medium.onnx",lang="en",), + # German (DE) TtsModel(model_dir="vits-piper-de_DE-eva_k-x_low", model_name="de_DE-eva_k-x_low.onnx",lang="de",), TtsModel(model_dir="vits-piper-de_DE-karlsson-low", model_name="de_DE-karlsson-low.onnx",lang="de",), @@ -162,7 +175,8 @@ def main(): s = f.read() template = environment.from_string(s) d = dict() - all_model_list = get_all_models() + # all_model_list = get_all_models() + all_model_list = get_piper_english_models() num_models = len(all_model_list) num_per_runner = num_models // total From e3bf4313f3decc52fcadc24f8788a618d8e6af7e Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 30 Nov 2023 21:06:17 +0800 Subject: [PATCH 18/19] fix android warnings --- cmake/espeak-ng-for-piper.cmake | 2 +- scripts/apk/build-apk-tts.sh.in | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/espeak-ng-for-piper.cmake b/cmake/espeak-ng-for-piper.cmake index e1dfb2fc0e..38f3fa558d 100644 --- a/cmake/espeak-ng-for-piper.cmake +++ b/cmake/espeak-ng-for-piper.cmake @@ -86,7 +86,7 @@ function(download_espeak_ng_for_piper) -Wno-unused-result -Wno-format-overflow -Wno-format-truncation - -Wno-maybe-uninitialized + -Wno-uninitialized -Wno-format ) diff --git a/scripts/apk/build-apk-tts.sh.in b/scripts/apk/build-apk-tts.sh.in index acdf35a542..779bb79782 100644 --- a/scripts/apk/build-apk-tts.sh.in +++ b/scripts/apk/build-apk-tts.sh.in @@ -56,7 +56,7 @@ sed -i.bak s/"modelName = null"/"modelName = \"$model_name\""/ ./MainActivity.kt {% if tts_model.data_dir %} data_dir={{ tts_model.data_dir }} - sed -i.bak s%"dataDir = null"%"dataDir = \"$dataDir\""% ./MainActivity.kt + sed -i.bak s%"dataDir = null"%"dataDir = \"$data_dir\""% ./MainActivity.kt {% else %} sed -i.bak s/"lexicon = null"/"lexicon = \"lexicon.txt\""/ ./MainActivity.kt {% endif %} From a8c68fbdbe720473af88553e1c4cd5976317127e Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 30 Nov 2023 21:39:36 +0800 Subject: [PATCH 19/19] add more data --- .github/workflows/apk-tts.yaml | 4 +- scripts/apk/generate-tts-apk-script.py | 128 +++++++++++++++++-------- 2 files changed, 91 insertions(+), 41 deletions(-) diff --git a/.github/workflows/apk-tts.yaml b/.github/workflows/apk-tts.yaml index ffd7ab1d54..6672cc5002 100644 --- a/.github/workflows/apk-tts.yaml +++ b/.github/workflows/apk-tts.yaml @@ -26,8 +26,8 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest] - total: ["12"] - index: ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"] + total: ["30"] + index: ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29"] steps: - uses: actions/checkout@v4 diff --git a/scripts/apk/generate-tts-apk-script.py b/scripts/apk/generate-tts-apk-script.py index 06f44b37b2..958f554757 100755 --- a/scripts/apk/generate-tts-apk-script.py +++ b/scripts/apk/generate-tts-apk-script.py @@ -33,8 +33,35 @@ class TtsModel: data_dir: Optional[str] = None -def get_piper_english_models() -> List[TtsModel]: +def get_piper_models() -> List[TtsModel]: models = [ + TtsModel(model_dir="vits-piper-ar_JO-kareem-low"), + TtsModel(model_dir="vits-piper-ar_JO-kareem-medium"), + TtsModel(model_dir="vits-piper-ca_ES-upc_ona-medium"), + TtsModel(model_dir="vits-piper-ca_ES-upc_ona-x_low"), + TtsModel(model_dir="vits-piper-ca_ES-upc_pau-x_low"), + TtsModel(model_dir="vits-piper-ca_ES-upc_pau-x_low"), + TtsModel(model_dir="vits-piper-cs_CZ-jirka-medium"), + TtsModel(model_dir="vits-piper-da_DK-talesyntese-medium"), + TtsModel(model_dir="vits-piper-de_DE-eva_k-x_low"), + TtsModel(model_dir="vits-piper-de_DE-karlsson-low"), + TtsModel(model_dir="vits-piper-de_DE-kerstin-low"), + TtsModel(model_dir="vits-piper-de_DE-pavoque-low"), + TtsModel(model_dir="vits-piper-de_DE-ramona-low"), + TtsModel(model_dir="vits-piper-de_DE-thorsten-high"), + TtsModel(model_dir="vits-piper-de_DE-thorsten-low"), + TtsModel(model_dir="vits-piper-de_DE-thorsten-medium"), + TtsModel(model_dir="vits-piper-de_DE-thorsten_emotional-medium"), + TtsModel(model_dir="vits-piper-el_GR-rapunzelina-low"), + TtsModel(model_dir="vits-piper-en_GB-alan-low"), + TtsModel(model_dir="vits-piper-en_GB-alan-medium"), + TtsModel(model_dir="vits-piper-en_GB-alba-medium"), + TtsModel(model_dir="vits-piper-en_GB-jenny_dioco-medium"), + TtsModel(model_dir="vits-piper-en_GB-northern_english_male-medium"), + TtsModel(model_dir="vits-piper-en_GB-semaine-medium"), + TtsModel(model_dir="vits-piper-en_GB-southern_english_female-low"), + TtsModel(model_dir="vits-piper-en_GB-sweetbbak-amy"), + TtsModel(model_dir="vits-piper-en_GB-vctk-medium"), TtsModel(model_dir="vits-piper-en_US-amy-low"), TtsModel(model_dir="vits-piper-en_US-amy-medium"), TtsModel(model_dir="vits-piper-en_US-arctic-medium"), @@ -44,26 +71,74 @@ def get_piper_english_models() -> List[TtsModel]: TtsModel(model_dir="vits-piper-en_US-kathleen-low"), TtsModel(model_dir="vits-piper-en_US-kusal-medium"), TtsModel(model_dir="vits-piper-en_US-l2arctic-medium"), + TtsModel(model_dir="vits-piper-en_US-lessac-high"), TtsModel(model_dir="vits-piper-en_US-lessac-low"), TtsModel(model_dir="vits-piper-en_US-lessac-medium"), - TtsModel(model_dir="vits-piper-en_US-lessac-high"), TtsModel(model_dir="vits-piper-en_US-libritts-high"), TtsModel(model_dir="vits-piper-en_US-libritts_r-medium"), + TtsModel(model_dir="vits-piper-en_US-ryan-high"), TtsModel(model_dir="vits-piper-en_US-ryan-low"), TtsModel(model_dir="vits-piper-en_US-ryan-medium"), - TtsModel(model_dir="vits-piper-en_US-ryan-high"), - # English (GB) - TtsModel(model_dir="vits-piper-en_GB-alan-low"), - TtsModel(model_dir="vits-piper-en_GB-alan-medium"), - TtsModel(model_dir="vits-piper-en_GB-alba-medium"), - TtsModel(model_dir="vits-piper-en_GB-jenny_dioco-medium"), - TtsModel(model_dir="vits-piper-en_GB-northern_english_male-medium"), - TtsModel(model_dir="vits-piper-en_GB-semaine-medium"), - TtsModel(model_dir="vits-piper-en_GB-southern_english_female-low"), - TtsModel(model_dir="vits-piper-en_GB-vctk-medium"), + TtsModel(model_dir="vits-piper-es_ES-carlfm-x_low"), + TtsModel(model_dir="vits-piper-es_ES-davefx-medium"), + TtsModel(model_dir="vits-piper-es_ES-mls_10246-low"), + TtsModel(model_dir="vits-piper-es_ES-mls_9972-low"), + TtsModel(model_dir="vits-piper-es_ES-sharvard-medium"), + TtsModel(model_dir="vits-piper-es_MX-ald-medium"), + TtsModel(model_dir="vits-piper-fi_FI-harri-low"), + TtsModel(model_dir="vits-piper-fi_FI-harri-medium"), + TtsModel(model_dir="vits-piper-fr_FR-siwis-low"), + TtsModel(model_dir="vits-piper-fr_FR-siwis-medium"), + TtsModel(model_dir="vits-piper-fr_FR-upmc-medium"), + TtsModel(model_dir="vits-piper-hu_HU-anna-medium"), + TtsModel(model_dir="vits-piper-hu_HU-berta-medium"), + TtsModel(model_dir="vits-piper-hu_HU-imre-medium"), + TtsModel(model_dir="vits-piper-is_IS-bui-medium"), + TtsModel(model_dir="vits-piper-is_IS-salka-medium"), + TtsModel(model_dir="vits-piper-is_IS-steinn-medium"), + TtsModel(model_dir="vits-piper-is_IS-ugla-medium"), + TtsModel(model_dir="vits-piper-it_IT-riccardo-x_low"), + TtsModel(model_dir="vits-piper-ka_GE-natia-medium"), + TtsModel(model_dir="vits-piper-kk_KZ-iseke-x_low"), + TtsModel(model_dir="vits-piper-kk_KZ-issai-high"), + TtsModel(model_dir="vits-piper-kk_KZ-raya-x_low"), + TtsModel(model_dir="vits-piper-lb_LU-marylux-medium"), + TtsModel(model_dir="vits-piper-ne_NP-google-medium"), + TtsModel(model_dir="vits-piper-ne_NP-google-x_low"), + TtsModel(model_dir="vits-piper-nl_BE-nathalie-medium"), + TtsModel(model_dir="vits-piper-nl_BE-nathalie-x_low"), + TtsModel(model_dir="vits-piper-nl_BE-rdh-medium"), + TtsModel(model_dir="vits-piper-nl_BE-rdh-x_low"), + TtsModel(model_dir="vits-piper-nl_NL-mls_5809-low"), + TtsModel(model_dir="vits-piper-nl_NL-mls_7432-low"), + TtsModel(model_dir="vits-piper-no_NO-talesyntese-medium"), + TtsModel(model_dir="vits-piper-pl_PL-darkman-medium"), + TtsModel(model_dir="vits-piper-pl_PL-gosia-medium"), + TtsModel(model_dir="vits-piper-pl_PL-mc_speech-medium"), + TtsModel(model_dir="vits-piper-pl_PL-mls_6892-low"), + TtsModel(model_dir="vits-piper-pt_BR-edresson-low"), + TtsModel(model_dir="vits-piper-pt_BR-faber-medium"), + TtsModel(model_dir="vits-piper-pt_PT-tugao-medium"), + TtsModel(model_dir="vits-piper-ro_RO-mihai-medium"), + TtsModel(model_dir="vits-piper-ru_RU-denis-medium"), + TtsModel(model_dir="vits-piper-ru_RU-dmitri-medium"), + TtsModel(model_dir="vits-piper-ru_RU-irina-medium"), + TtsModel(model_dir="vits-piper-ru_RU-ruslan-medium"), + TtsModel(model_dir="vits-piper-sk_SK-lili-medium"), + TtsModel(model_dir="vits-piper-sr_RS-serbski_institut-medium"), + TtsModel(model_dir="vits-piper-sv_SE-nst-medium"), + TtsModel(model_dir="vits-piper-sw_CD-lanfrica-medium"), + TtsModel(model_dir="vits-piper-tr_TR-dfki-medium"), + TtsModel(model_dir="vits-piper-tr_TR-fahrettin-medium"), + TtsModel(model_dir="vits-piper-uk_UA-lada-x_low"), + TtsModel(model_dir="vits-piper-uk_UA-ukrainian_tts-medium"), + TtsModel(model_dir="vits-piper-vi_VN-25hours_single-low"), + TtsModel(model_dir="vits-piper-vi_VN-vais1000-medium"), + TtsModel(model_dir="vits-piper-vi_VN-vivos-x_low"), + TtsModel(model_dir="vits-piper-zh_CN-huayan-medium"), ] for m in models: - m.data_dir = m.model_dir + "/" + "espeak-data-ng" + m.data_dir = m.model_dir + "/" + "espeak-ng-data" m.model_name = m.model_dir[len("vits-piper-") :] + ".onnx" m.lang = "en" @@ -136,31 +211,6 @@ def get_all_models() -> List[TtsModel]: # English (US) TtsModel(model_dir="vits-vctk", model_name="vits-vctk.onnx", lang="en"), TtsModel(model_dir="vits-ljs", model_name="vits-ljs.onnx", lang="en"), - - # German (DE) - TtsModel(model_dir="vits-piper-de_DE-eva_k-x_low", model_name="de_DE-eva_k-x_low.onnx",lang="de",), - TtsModel(model_dir="vits-piper-de_DE-karlsson-low", model_name="de_DE-karlsson-low.onnx",lang="de",), - TtsModel(model_dir="vits-piper-de_DE-kerstin-low", model_name="de_DE-kerstin-low.onnx",lang="de",), - TtsModel(model_dir="vits-piper-de_DE-pavoque-low", model_name="de_DE-pavoque-low.onnx",lang="de",), - TtsModel(model_dir="vits-piper-de_DE-ramona-low", model_name="de_DE-ramona-low.onnx",lang="de",), - TtsModel(model_dir="vits-piper-de_DE-thorsten-low", model_name="de_DE-thorsten-low.onnx",lang="de",), - TtsModel(model_dir="vits-piper-de_DE-thorsten-medium", model_name="de_DE-thorsten-medium.onnx",lang="de",), - TtsModel(model_dir="vits-piper-de_DE-thorsten-high", model_name="de_DE-thorsten-high.onnx",lang="de",), - TtsModel(model_dir="vits-piper-de_DE-thorsten_emotional-medium", model_name="de_DE-thorsten_emotional-medium.onnx",lang="de",), - # French (FR) - TtsModel(model_dir="vits-piper-fr_FR-upmc-medium", model_name="fr_FR-upmc-medium.onnx",lang="fr",), - TtsModel(model_dir="vits-piper-fr_FR-siwis-low", model_name="fr_FR-siwis-low.onnx",lang="fr",), - TtsModel(model_dir="vits-piper-fr_FR-siwis-medium", model_name="fr_FR-siwis-medium.onnx",lang="fr",), - - # Spanish (ES) - TtsModel(model_dir="vits-piper-es_ES-carlfm-x_low", model_name="es_ES-carlfm-x_low.onnx",lang="es",), - TtsModel(model_dir="vits-piper-es_ES-davefx-medium", model_name="es_ES-davefx-medium.onnx",lang="es",), - TtsModel(model_dir="vits-piper-es_ES-mls_10246-low", model_name="es_ES-mls_10246-low.onnx",lang="es",), - TtsModel(model_dir="vits-piper-es_ES-mls_9972-low", model_name="es_ES-mls_9972-low.onnx",lang="es",), - TtsModel(model_dir="vits-piper-es_ES-sharvard-medium", model_name="es_ES-sharvard-medium.onnx",lang="es",), - - # Spanish (MX) - TtsModel(model_dir="vits-piper-es_MX-ald-medium", model_name="es_MX-ald-medium.onnx",lang="es",), # fmt: on ] @@ -176,7 +226,7 @@ def main(): template = environment.from_string(s) d = dict() # all_model_list = get_all_models() - all_model_list = get_piper_english_models() + all_model_list = get_piper_models() num_models = len(all_model_list) num_per_runner = num_models // total