Initial commit.

bshall · Nov 13, 2020 · 4e0f66f · 4e0f66f
1 parent 51f4bca
commit 4e0f66f
Show file tree

Hide file tree

Showing 22 changed files with 134,259 additions and 46 deletions.
diff --git a/.gitignore b/.gitignore
@@ -115,9 +115,10 @@ datasets/
 *.npy
 
 # Submission
-submission/
 *.wav
-submission.zip
 
 # Hydra outputs
-outputs/
+outputs/
+
+# tensorboard outputs
+tensorboard/
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2019 
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,3 @@
+recursive-include tacotron *.yaml
+recursive-include tacotron *.txt
+include LICENSE
diff --git a/README.md b/README.md
@@ -1 +1,101 @@
-# Tacotron
+# Tacotron (with Dynamic Convolution Attention)
+
+A PyTorch implementation of [Location-Relative Attention Mechanisms For Robust Long-Form Speech Synthesis](https://arxiv.org/abs/1910.10288). Audio samples can be found [here](https://bshall.github.io/tacotron/).
+
+<div align="center">
+    <img width="655" height="390" alt="Tacotron (with Dynamic Convolution Attention)" 
+      src="https://raw.githubusercontent.com/bshall/Tacotron/main/tacotron.png"><br>
+    <sup><strong>Fig 1:</strong> Tacotron (with Dynamic Convolution Attention).</sup>
+</div>
+
+<div align="center">
+    <img width="897" height="154" alt="Example Mel-spectrogram and attention plot" 
+      src="https://raw.githubusercontent.com/bshall/Tacotron/main/example.png"><br>
+    <sup><strong>Fig 2:</strong> Example Mel-spectrogram and attention plot.</sup>
+</div>
+
+## Quick Start
+
+Ensure you have Python 3.6 or greater and PyTorch 1.7 or greater installed. Then install this package with:
+```
+pip install tacotron
+```
+
+## Example Usage
+
+```python
+import torch
+import soundfile as sf
+from univoc import Vocoder
+from tacotron import load_cmudict, text_to_id, Tacotron
+
+# download pretrained weights for the vocoder (and optionally move to GPU)
+vocoder = Vocoder.from_pretrained(
+    "https://github.com/bshall/UniversalVocoding/releases/download/v0.2/univoc-ljspeech-7mtpaq.pt"
+).cuda()
+
+# download pretrained weights for tacotron (and optionally move to GPU)
+tacotron = Tacotron.from_pretrained(
+    "https://github.com/bshall/Tacotron/releases/download/v0.1/tacotron-ljspeech-yspjx3.pt"
+).cuda()
+
+# load cmudict and add pronunciation of PyTorch
+cmudict = load_cmudict()
+cmudict["PYTORCH"] = "P AY1 T AO2 R CH"
+
+text = "A PyTorch implementation of Location-Relative Attention Mechanisms For Robust Long-Form Speech Synthesis."
+
+# convert text to phone ids
+text = torch.LongTensor(text_to_id(text, cmudict)).unsqueeze(0).cuda()
+
+# synthesize audio
+with torch.no_grad():
+    mel, _ = tacotron.generate(text)
+    wav, sr = vocoder.generate(mel.transpose(1, 2))
+
+# save output
+sf.write("location_relative_attention.wav", wav, sr)
+```
+
+## Train from Scratch
+
+1. Clone the repo:
+```
+git clone https://github.com/bshall/Tacotron
+cd ./Tacotron
+```
+2. Install requirements:
+```
+pip install -r requirements.txt
+```
+3. Download and extract the [LJ-Speech dataset](https://keithito.com/LJ-Speech-Dataset/):
+```
+wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
+tar -xvjf LJSpeech-1.1.tar.bz2
+```
+4. Download the train split [here](https://github.com/bshall/Tacotron/releases/tag/v0.1) and extract it in the root directory of the repo.  
+5. Extract Mel spectrograms and preprocess audio:
+```
+python preprocess.py in_dir=path/to/LJSpeech-1.1 out_dir=datasets/LJSpeech-1.1
+```
+6. Train the model:
+```
+python train.py checkpoint_dir=ljspeech dataset_dir=datasets/LJSpeech-1.1 text_path=path/to/LJSpeech-1.1/metadata.csv
+```
+
+## Pretrained Models
+
+Pretrained weights for the LJSpeech model are available [here](https://github.com/bshall/Tacotron/releases/tag/v0.1).
+
+## Notable Differences from the Paper
+
+1. Trained using a batch size of 64 on a single GPU (using automatic mixed precision).
+2. Used a gradient clipping threshold of 0.05 as it seems to stabilize the alignment with the smaller batch size.
+3. Used a different learning rate schedule (again to deal with smaller batch size).
+4. Used 80-bin (instead of 128-bin) log-Mel spectrograms.
+
+## Acknowledgements
+
+- https://github.com/keithito/tacotron
+- https://github.com/PetrochukM/PyTorch-NLP
+- https://github.com/fatchord/WaveRNN
diff --git a/example.png b/example.png
diff --git a/preprocess.py b/preprocess.py
@@ -17,7 +17,7 @@ def melspectrogram(
     hop_length=200,
     win_length=800,
     n_fft=2048,
-    n_mels=80,
+    n_mels=128,
     fmin=50,
     preemph=0.97,
     top_db=80,
@@ -51,7 +51,8 @@ def process_wav(wav_path, out_path, cfg):
     wav, _ = librosa.load(wav_path.with_suffix(".wav"), sr=cfg.sr)
     loudness = meter.integrated_loudness(wav)
     wav = pyln.normalize.loudness(wav, loudness, -24)
-    if (peak := np.abs(wav).max()) >= 1:
+    peak = np.abs(wav).max()
+    if peak >= 1:
         wav = wav / peak * 0.999
 
     logmel = melspectrogram(
@@ -80,9 +81,8 @@ def process_wav(wav_path, out_path, cfg):
 
 @hydra.main(config_path="config", config_name="preprocess")
 def preprocess_dataset(cfg):
-    print(OmegaConf.to_yaml(cfg))
     in_dir = Path(utils.to_absolute_path(cfg.in_dir))
-    out_dir = Path(utils.to_absolute_path("datasets"))
+    out_dir = Path(utils.to_absolute_path(cfg.out_dir))
     out_dir.mkdir(parents=True, exist_ok=True)
 
     executor = ProcessPoolExecutor(max_workers=cpu_count())

diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,3 @@
+[build-system]
+requires = ["setuptools", "wheel"]
+build-backend = "setuptools.build_meta"
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,7 @@
+librosa>=0.8.0
+numpy>=1.18.0
+tqdm>=4.41
+hydra-core>=1.0.3
+pyloudnorm>=0.1.0
+tensorboard>=2.3.0
+importlib-resources
diff --git a/sentences.txt b/sentences.txt
diff --git a/setup.cfg b/setup.cfg
@@ -0,0 +1,41 @@
+[metadata]
+name = tacotron
+version = 0.1.0
+author = Benjamin van Niekerk
+author_email = benjamin.l.van.niekerk@gmail.com
+url = https://github.com/bshall/Tacotron
+description = A PyTorch implementation of Location-Relative Attention Mechanisms For Robust Long-Form Speech Synthesis.
+long_description = file:README.md
+long_description_content_type = text/markdown
+project_urls = 
+    Source = https://github.com/bshall/Tacotron
+    Samples = https://bshall.github.io/tacotron/
+keywords = 
+    Speech Synthesis
+    Tacotron
+    Text-to-Speech
+    PyTorch
+classifiers =
+    Natural Language :: English
+    Intended Audience :: Science/Research
+    License :: OSI Approved :: MIT License
+    Operating System :: POSIX :: Linux
+    Programming Language :: Python
+    Programming Language :: Python :: 3.6
+    Programming Language :: Python :: 3.7
+    Programming Language :: Python :: 3.8
+    Programming Language :: Python :: 3.9
+    Topic :: Scientific/Engineering
+    Topic :: Scientific/Engineering :: Artificial Intelligence
+
+[options]
+packages = tacotron
+include_package_data = True
+python_requires = >=3.6
+install_requires =
+    librosa>=0.8.0
+    numpy>=1.18.0
+    tqdm>=4.41
+    requests
+    importlib-resources
+    omegaconf>=2.0.3
diff --git a/tacotron.png b/tacotron.png
diff --git a/tacotron/__init__.py b/tacotron/__init__.py
@@ -0,0 +1,3 @@
+from .model import Tacotron
+from .text import load_cmudict, text_to_id
+from .dataset import TTSDataset, BucketBatchSampler, pad_collate
diff --git a/tacotron/config/__init__.py b/tacotron/config/__init__.py
diff --git a/tacotron/config/config.yaml b/tacotron/config/config.yaml
@@ -0,0 +1,72 @@
+# @package _group_
+preprocess:
+  sr: 16000
+  hop_length: 200
+  win_length: 800
+  n_fft: 2048
+  n_mels: 80
+  fmin: 50
+  preemph: 0.97
+  top_db: 80
+  ref_db: 20
+  mulaw:
+    bits: 10
+
+train:
+  batch_size: 64
+  bucket_size_multiplier: 5
+  n_steps: 250000
+  clip_grad_norm: 0.05
+  optimizer:
+    lr: 1e-3
+  scheduler:
+    milestones:
+      - 20000
+      - 40000
+      - 100000
+      - 150000
+      - 200000
+    gamma: 0.5
+  checkpoint_interval: 5000
+  n_workers: 8
+
+
+model:
+  encoder:
+    n_symbols: 91
+    embedding_dim: 256
+    prenet:
+      input_size: ${model.encoder.embedding_dim}
+      hidden_size: 256
+      output_size: 128
+      dropout: 0.5
+    cbhg:
+      input_channels: ${model.encoder.prenet.output_size}
+      K: 16
+      channels: 128
+      projection_channels: 128
+      n_highways: 4
+      highway_size: 128
+      rnn_size: 128
+  decoder:
+    prenet:
+      input_size: ${preprocess.n_mels}
+      hidden_size: 256
+      output_size: 128
+      dropout: 0.5
+    attention:
+      attn_rnn_size: ${model.decoder.attn_rnn_size}
+      hidden_size: 128
+      static_channels: 8
+      static_kernel_size: 21
+      dynamic_channels: 8
+      dynamic_kernel_size: 21
+      prior_length: 11
+      alpha: 0.1
+      beta: 0.9
+    input_size: ${model.encoder.cbhg.channels}
+    n_mels: ${preprocess.n_mels}
+    attn_rnn_size: 256
+    decoder_rnn_size: 256
+    reduction_factor: 2
+    zoneout_prob: 0.1
diff --git a/tacotron/config/preprocess.yaml b/tacotron/config/preprocess.yaml
@@ -0,0 +1,5 @@
+defaults:
+  - config
+
+in_dir: ???
+out_dir: ???
diff --git a/tacotron/config/train.yaml b/tacotron/config/train.yaml
@@ -0,0 +1,7 @@
+defaults:
+  - config
+
+resume: false
+checkpoint_dir: ???
+text_path: ???
+dataset_dir: ???
diff --git a/dataset.py → tacotron/dataset.py b/dataset.py → tacotron/dataset.py
@@ -8,7 +8,7 @@
 from pathlib import Path
 import json
 
-from text import load_cmudict, parse_text, symbols_to_id, symbol_to_id
+from tacotron import load_cmudict, text_to_id
 
 
 class SortedSampler(samplers.Sampler):
@@ -74,7 +74,7 @@ def __len__(self):
 
 
 class TTSDataset(Dataset):
-    def __init__(self, root, text_path, cmudict_path):
+    def __init__(self, root, text_path):
         self.root = Path(root)
 
         with open(self.root / "train.json") as file:
@@ -90,13 +90,15 @@ def __init__(self, root, text_path, cmudict_path):
 
         self.index_longest_mel = np.argmax(self.lengths)
 
-        self.cmudict = load_cmudict(cmudict_path)
+        self.cmudict = load_cmudict()
 
+        keys = {path.stem for path in self.metadata}
         with open(text_path) as file:
             self.text = {}
             for line in file:
                 key, _, transcript = line.strip().split("|")
-                self.text[key] = parse_text(transcript, self.cmudict)
+                if key in keys:
+                    self.text[key] = transcript
 
     def sort_key(self, index):
         return self.lengths[index]
@@ -109,8 +111,7 @@ def __getitem__(self, index):
 
         mel = np.load(path.with_suffix(".mel.npy"))
 
-        symbols = self.text[path.stem]
-        symbols = symbols_to_id(symbols)
+        text = text_to_id(self.text[path.stem], self.cmudict)
 
         return (
             torch.Tensor(mel).transpose_(0, 1),

diff --git a/tacotron/dictionary/__init__.py b/tacotron/dictionary/__init__.py