Skip to content

muffled audio output for custom dataset #7

@reubzdubz

Description

@reubzdubz

I have followed your work with Flowtron, and it produces really nice audio. Here I am using the same dataset (1.5 hours of a single speaker), but I am having trouble getting output that is not muffled. The alignment is decent:

Screenshot 2022-07-21 182200

I am using the following parameters. Am I missing something, or is it just because I am training from scratch (I have already done the decoder step)? Any help would be appreciated, thanks!

{
"train_config": {
    "output_directory": "/debug",
    "epochs": 10000000,
    "optim_algo": "RAdam",
    "learning_rate": 1e-4,
    "weight_decay": 1e-6,
    "sigma": 1.0,
    "iters_per_checkpoint": 185,
    "batch_size": 4,
    "seed": null,
    "checkpoint_path": "",
    "ignore_layers": [],
    "ignore_layers_warmstart": [],
    "finetune_layers": [],
    "include_layers": [],
    "vocoder_config_path": "/content/drive/MyDrive/hifigan_22khz_config.json",
    "vocoder_checkpoint_path": "/content/drive/MyDrive/hifigan_libritts100360_generator0p5.pt",
    "log_attribute_samples": true,
    "log_decoder_samples": true,
    "warmstart_checkpoint_path": "path/to/pretrained/decoder",
    "use_amp": false,
    "grad_clip_val": 1.0,
    "loss_weights": {
        "blank_logprob": -1, 
        "ctc_loss_weight": 0.1,
        "binarization_loss_weight": 1.0,
        "dur_loss_weight": 1.0,
        "f0_loss_weight": 1.0,
        "energy_loss_weight": 1.0,
        "vpred_loss_weight": 1.0
    },
    "binarization_start_iter": 0,
    "kl_loss_start_iter": 0,
    "unfreeze_modules": "durf0energyvpred"
},
"data_config": {
    "training_files": {
        "LJS": {
            "basedir": "",
            "audiodir": "",
            "filelist":  "/content/drive/MyDrive/bastila_wav/filelists/bastila_train_rad.txt",
            "lmdbpath": ""
        }
    },
    "validation_files": {
        "LJS": {
            "basedir": "",
            "audiodir": "",
            "filelist":  "/content/drive/MyDrive/bastila_wav/filelists/bastila_val_rad.txt",
            "lmdbpath": ""
        }
    },
    "dur_min": 0.1,
    "dur_max": 10.2,
    "sampling_rate": 22050,
    "filter_length": 1024,
    "hop_length": 256,
    "win_length": 1024,
    "n_mel_channels": 80,
    "mel_fmin": 0.0,
    "mel_fmax": 8000.0,
    "f0_min": 80.0,
    "f0_max": 640.0,
    "max_wav_value": 32768.0,
    "use_f0": true,
    "use_log_f0": 0,
    "use_energy_avg": true,
    "use_scaled_energy": true,
    "symbol_set": "radtts",
    "cleaner_names": ["radtts_cleaners"],
    "heteronyms_path": "tts_text_processing/heteronyms",
    "phoneme_dict_path": "tts_text_processing/cmudict-0.7b",
    "p_phoneme": 1.0,
    "handle_phoneme": "word",
    "handle_phoneme_ambiguous": "ignore",
    "include_speakers": null,
    "n_frames": -1,
    "betabinom_cache_path": "data_cache/",
    "lmdb_cache_path": "", 
    "use_attn_prior_masking": true,
    "prepend_space_to_text": true,
    "append_space_to_text": true,
    "add_bos_eos_to_text": false,
    "betabinom_scaling_factor": 1.0,
    "distance_tx_unvoiced": false,
    "mel_noise_scale": 0.0
},
"dist_config": {
    "dist_backend": "nccl",
    "dist_url": "tcp://localhost:54321"
},
"model_config": {
    "n_speakers": 1,
    "n_speaker_dim": 16,
    "n_text": 185,
    "n_text_dim": 512,
    "n_flows": 8,
    "n_conv_layers_per_step": 4,
    "n_mel_channels": 80,
    "n_hidden": 1024,
    "mel_encoder_n_hidden": 512,
    "dummy_speaker_embedding": false,
    "n_early_size": 2,
    "n_early_every": 2,
    "n_group_size": 2,
    "affine_model": "wavenet",
    "include_modules": "decatndpmvpredapm",
    "scaling_fn": "tanh",
    "matrix_decomposition": "LUS",
    "learn_alignments": true,
    "use_speaker_emb_for_alignment": false,
    "attn_straight_through_estimator": true,
    "use_context_lstm": true,
    "context_lstm_norm": "spectral",
    "context_lstm_w_f0_and_energy": true,
    "text_encoder_lstm_norm": "spectral",
    "n_f0_dims": 1,
    "n_energy_avg_dims": 1,
    "use_first_order_features": false,
    "unvoiced_bias_activation": "relu",
    "decoder_use_partial_padding": true,
    "decoder_use_unvoiced_bias": true,
    "ap_pred_log_f0": true,
    "ap_use_unvoiced_bias": true,
    "ap_use_voiced_embeddings": true,
    "dur_model_config": {
        "name": "dap",
        "hparams": {
            "n_speaker_dim": 16,
            "bottleneck_hparams": {
                "in_dim": 512,
                "reduction_factor": 16,
                "norm": "weightnorm",
                "non_linearity": "relu"
            },
            "take_log_of_input": true,
            "arch_hparams": {
                "out_dim": 1,
                "n_layers": 2,
                "n_channels": 256,
                "kernel_size": 3,
                "p_dropout": 0.25
            }
        }
    },
    "f0_model_config": {
        "name": "agap",
        "hparams": {
            "n_in_dim": 1,
            "n_group_size": 1,
            "take_log_of_input": false,
            "n_speaker_dim": 22,
            "n_flows": 2,
            "n_hidden": 128,
            "n_lstm_layers": 1,
            "scaling_fn": "tanh",
            "bottleneck_hparams": {
                "in_dim": 512,
                "reduction_factor": 16,
                "norm": "weightnorm",
                "non_linearity": "relu"
            },
            "spline_flow_params": {
                "n_in_channels": 1,
                "n_context_dim": 128,
                "n_layers": 4,
                "n_bins": 24,
                "use_quadratic": true
            }
        }
    },
    "energy_model_config": {
        "name": "agap",
        "hparams": {
            "n_in_dim": 1,
            "n_group_size": 1,
            "take_log_of_input": false,
            "n_speaker_dim": 22,
            "n_flows": 4,
            "n_hidden": 128,
            "n_lstm_layers": 1,
            "scaling_fn": "tanh",
            "bottleneck_hparams": {
                "in_dim": 512,
                "reduction_factor": 16,
                "norm": "weightnorm",
                "non_linearity": "relu"
            },
            "spline_flow_params": {
                "n_in_channels": 1,
                "n_context_dim": 128,
                "n_layers": 4,
                "n_bins": 24,
                "use_quadratic": true
            }
        }
    },
    "v_model_config": {
        "name": "dap",
        "hparams": {
            "n_speaker_dim": 16,
            "take_log_of_input": false,
            "bottleneck_hparams": {
                "in_dim": 512,
                "reduction_factor": 16,
                "norm": "weightnorm",
                "non_linearity": "relu"
            },
            "arch_hparams": {
                "out_dim": 1,
                "n_layers": 2,
                "n_channels": 256,
                "kernel_size": 3,
                "p_dropout": 0.5,
                "lstm_type": "",
                "use_linear": 1
            }
        }
    }
}

}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions