muffled audio output for custom dataset

I have followed your work with Flowtron, and it produces really nice audio. I am using the same dataset (1.5 hours of a single speaker) here, but I am having trouble getting output that is not muffled. The alignment is decent:

![Screenshot 2022-07-21 182200](https://user-images.githubusercontent.com/42710108/180191653-4d264aca-f8fe-4a0e-b607-502ff9064a76.png)


I am using the following parameters. Am I missing something, or is it just because I am training from scratch (I have already completed the decoder training step)? Any help would be appreciated, thanks!

{
    "train_config": {
        "output_directory": "/debug",
        "epochs": 10000000,
        "optim_algo": "RAdam",
        "learning_rate": 1e-4,
        "weight_decay": 1e-6,
        "sigma": 1.0,
         "iters_per_checkpoint": 185,
         "batch_size": 4,
        "seed": null,
        "checkpoint_path": "",
        "ignore_layers": [],
        "ignore_layers_warmstart": [],
        "finetune_layers": [],
        "include_layers": [],
         "vocoder_config_path": "/content/drive/MyDrive/hifigan_22khz_config.json",
         "vocoder_checkpoint_path": "/content/drive/MyDrive/hifigan_libritts100360_generator0p5.pt",
        "log_attribute_samples": true,
        "log_decoder_samples": true,


        "warmstart_checkpoint_path": "path/to/pretrained/decoder",
        "use_amp": false,
        "grad_clip_val": 1.0,
        "loss_weights": {
            "blank_logprob": -1, 
            "ctc_loss_weight": 0.1,
            "binarization_loss_weight": 1.0,
            "dur_loss_weight": 1.0,
            "f0_loss_weight": 1.0,
            "energy_loss_weight": 1.0,
            "vpred_loss_weight": 1.0
        },
        "binarization_start_iter": 0,
        "kl_loss_start_iter": 0,
        "unfreeze_modules": "durf0energyvpred"
    },
    "data_config": {
        "training_files": {
            "LJS": {
                "basedir": "",
                "audiodir": "",
                "filelist":  "/content/drive/MyDrive/bastila_wav/filelists/bastila_train_rad.txt",
                "lmdbpath": ""
            }
        },
        "validation_files": {
            "LJS": {
                "basedir": "",
                "audiodir": "",
                "filelist":  "/content/drive/MyDrive/bastila_wav/filelists/bastila_val_rad.txt",
                "lmdbpath": ""
            }
        },
        "dur_min": 0.1,
        "dur_max": 10.2,
        "sampling_rate": 22050,
        "filter_length": 1024,
        "hop_length": 256,
        "win_length": 1024,
        "n_mel_channels": 80,
        "mel_fmin": 0.0,
        "mel_fmax": 8000.0,
        "f0_min": 80.0,
        "f0_max": 640.0,
        "max_wav_value": 32768.0,
        "use_f0": true,
        "use_log_f0": 0,
        "use_energy_avg": true,
        "use_scaled_energy": true,
        "symbol_set": "radtts",
        "cleaner_names": ["radtts_cleaners"],
        "heteronyms_path": "tts_text_processing/heteronyms",
        "phoneme_dict_path": "tts_text_processing/cmudict-0.7b",
        "p_phoneme": 1.0,
        "handle_phoneme": "word",
        "handle_phoneme_ambiguous": "ignore",
        "include_speakers": null,
        "n_frames": -1,
        "betabinom_cache_path": "data_cache/",
        "lmdb_cache_path": "", 
        "use_attn_prior_masking": true,
        "prepend_space_to_text": true,
        "append_space_to_text": true,
        "add_bos_eos_to_text": false,
        "betabinom_scaling_factor": 1.0,
        "distance_tx_unvoiced": false,
        "mel_noise_scale": 0.0
    },
    "dist_config": {
        "dist_backend": "nccl",
        "dist_url": "tcp://localhost:54321"
    },
    "model_config": {
        "n_speakers": 1,
        "n_speaker_dim": 16,
        "n_text": 185,
        "n_text_dim": 512,
        "n_flows": 8,
        "n_conv_layers_per_step": 4,
        "n_mel_channels": 80,
        "n_hidden": 1024,
        "mel_encoder_n_hidden": 512,
        "dummy_speaker_embedding": false,
        "n_early_size": 2,
        "n_early_every": 2,
        "n_group_size": 2,
        "affine_model": "wavenet",
        "include_modules": "decatndpmvpredapm",
        "scaling_fn": "tanh",
        "matrix_decomposition": "LUS",
        "learn_alignments": true,
        "use_speaker_emb_for_alignment": false,
        "attn_straight_through_estimator": true,
        "use_context_lstm": true,
        "context_lstm_norm": "spectral",
        "context_lstm_w_f0_and_energy": true,
        "text_encoder_lstm_norm": "spectral",
        "n_f0_dims": 1,
        "n_energy_avg_dims": 1,
        "use_first_order_features": false,
        "unvoiced_bias_activation": "relu",
        "decoder_use_partial_padding": true,
        "decoder_use_unvoiced_bias": true,
        "ap_pred_log_f0": true,
        "ap_use_unvoiced_bias": true,
        "ap_use_voiced_embeddings": true,
        "dur_model_config": {
            "name": "dap",
            "hparams": {
                "n_speaker_dim": 16,
                "bottleneck_hparams": {
                    "in_dim": 512,
                    "reduction_factor": 16,
                    "norm": "weightnorm",
                    "non_linearity": "relu"
                },
                "take_log_of_input": true,
                "arch_hparams": {
                    "out_dim": 1,
                    "n_layers": 2,
                    "n_channels": 256,
                    "kernel_size": 3,
                    "p_dropout": 0.25
                }
            }
        },
        "f0_model_config": {
            "name": "agap",
            "hparams": {
                "n_in_dim": 1,
                "n_group_size": 1,
                "take_log_of_input": false,
                "n_speaker_dim": 22,
                "n_flows": 2,
                "n_hidden": 128,
                "n_lstm_layers": 1,
                "scaling_fn": "tanh",
                "bottleneck_hparams": {
                    "in_dim": 512,
                    "reduction_factor": 16,
                    "norm": "weightnorm",
                    "non_linearity": "relu"
                },
                "spline_flow_params": {
                    "n_in_channels": 1,
                    "n_context_dim": 128,
                    "n_layers": 4,
                    "n_bins": 24,
                    "use_quadratic": true
                }
            }
        },
        "energy_model_config": {
            "name": "agap",
            "hparams": {
                "n_in_dim": 1,
                "n_group_size": 1,
                "take_log_of_input": false,
                "n_speaker_dim": 22,
                "n_flows": 4,
                "n_hidden": 128,
                "n_lstm_layers": 1,
                "scaling_fn": "tanh",
                "bottleneck_hparams": {
                    "in_dim": 512,
                    "reduction_factor": 16,
                    "norm": "weightnorm",
                    "non_linearity": "relu"
                },
                "spline_flow_params": {
                    "n_in_channels": 1,
                    "n_context_dim": 128,
                    "n_layers": 4,
                    "n_bins": 24,
                    "use_quadratic": true
                }
            }
        },
        "v_model_config": {
            "name": "dap",
            "hparams": {
                "n_speaker_dim": 16,
                "take_log_of_input": false,
                "bottleneck_hparams": {
                    "in_dim": 512,
                    "reduction_factor": 16,
                    "norm": "weightnorm",
                    "non_linearity": "relu"
                },
                "arch_hparams": {
                    "out_dim": 1,
                    "n_layers": 2,
                    "n_channels": 256,
                    "kernel_size": 3,
                    "p_dropout": 0.5,
                    "lstm_type": "",
                    "use_linear": 1
                }
            }
        }
    }
}


Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

muffled audio output for custom dataset #7

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

muffled audio output for custom dataset #7

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions