PaddleSpeech Saved Model Config
- Users run into problems when using released models, e.g. #1367.

A pretrained model release has to ship the model files, the training config file, and the metadata files.
The metadata files include cmvn, vocab, spm model, phone_id.map, tone_id.map, spk_id.map, etc.
This currently causes two problems:
1) Packaging involves many files, so files get left out; it is labor-intensive and inefficient.
2) The backend inference engines (CLI, Server, SpeechX) have to depend on several files, even though most of the training config is not needed at inference time.
Proposal: when a model is saved, also save a Saved Model Config file.
This file merges the metadata files with the training config, drops the training-only parameters, and keeps only the parameters needed for inference.
Inference-related parameters include the model config, input-data metadata, model input/output shapes, the decoder config, etc.
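With a single merged file, the inference side only has to parse one YAML. Below is a minimal sketch, assuming a Saved Model Config shaped like the ASR example further down; the path `conf/asr_infer.yaml` and the loading code are illustrative, not the actual PaddleSpeech API.

```python
# Minimal sketch: consume a merged Saved Model Config at inference time.
# Path and key names mirror the ASR example below; this is not the actual
# PaddleSpeech loading code.
import yaml

with open("conf/asr_infer.yaml", "rt", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

# Model topology and I/O shapes.
input_dim = cfg["input_dim"]        # e.g. 80 (fbank dim)
output_dim = cfg["output_dim"]      # e.g. 4233 (vocab size)
encoder_conf = cfg["encoder_conf"]  # transformer encoder hyper-parameters

# Metadata that used to ship as separate files (vocab, cmvn, ...).
vocab = cfg["vocab"]
id2token = dict(enumerate(vocab))

# Decoding options for the inference engine.
decode_opts = cfg["decode"]
print(decode_opts["decoding_method"], decode_opts["beam_size"])
```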
For speech recognition (ASR), the commonly used source files are cmvn, vocab, and spm_model. An example merged config:
```yaml
###########################################################
#                       Data SETTING                      #
###########################################################
input_dim: 80
output_dim: 4233
preprocess_config:
  process:
  - dither: 0.1
    fs: 16000
    n_mels: 80
    n_shift: 160
    type: fbank_kaldi
    win_length: 400
  - cmvn_path:
      frame_num: 54068199
      mean_stat:
      - 533749178.7549203
      - 537379151.941282
      - 604649423.9932463
      var_stat:
      - 5413314850.920174
      - 5559847287.933601
      - 6150990253.613782
    type: cmvn_json
  - inplace: true
    max_time_warp: 5
    mode: PIL
    type: time_warp
  - F: 30
    inplace: true
    n_mask: 2
    replace_with_zero: false
    type: freq_mask
  - T: 40
    inplace: true
    n_mask: 2
    replace_with_zero: false
    type: time_mask
unit_type: char
vocab:
- <blank>
- <unk>
- "\u4E00"
- <eos>
###########################################################
#                      Model SETTING                      #
###########################################################
num_encs: 1
decoder: transformer
decoder_conf:
  attention_heads: 4
  dropout_rate: 0.1
  linear_units: 2048
  num_blocks: 6
  positional_dropout_rate: 0.1
  self_attention_dropout_rate: 0.0
  src_attention_dropout_rate: 0.0
encoder: transformer
encoder_conf:
  attention_dropout_rate: 0.0
  attention_heads: 4
  dropout_rate: 0.1
  input_layer: conv2d
  linear_units: 2048
  normalize_before: true
  num_blocks: 12
  output_size: 256
  positional_dropout_rate: 0.1
###########################################################
#                     Decoder SETTING                     #
###########################################################
decode:
  beam_size: 10
  ctc_weight: 0.5
  decode_batch_size: 128
  decoding_chunk_size: -1
  decoding_method: attention
  error_rate_type: cer
  num_decoding_left_chunks: -1
  simulate_streaming: false
```
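The cmvn block embedded above carries the raw accumulators (frame_num, mean_stat, var_stat); the per-dimension mean and inverse standard deviation used at inference can be recovered as in the sketch below, assuming mean_stat = sum(x) and var_stat = sum(x²) in the usual Kaldi-style convention. Only the first three dimensions from the example are shown, and the 1e-20 floor is an assumed numerical guard.

```python
# Sketch: derive per-dimension mean / inverse-std from the embedded cmvn
# accumulators, assuming mean_stat = sum(x) and var_stat = sum(x^2).
import numpy as np

frame_num = 54068199
mean_stat = np.array([533749178.7549203, 537379151.941282, 604649423.9932463])
var_stat = np.array([5413314850.920174, 5559847287.933601, 6150990253.613782])

mean = mean_stat / frame_num                  # E[x]
var = var_stat / frame_num - mean ** 2        # E[x^2] - E[x]^2
istd = 1.0 / np.sqrt(np.maximum(var, 1e-20))  # assumed floor for stability

frame = np.array([10.0, 11.0, 9.5])           # one illustrative fbank frame
normalized = (frame - mean) * istd
print(mean, istd, normalized)
```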
For speech synthesis (TTS), the commonly used source files are cmvn, phone_id.map, tone_id.map, and spk_id.map. An example merged config (FastSpeech2):
```yaml
###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
fs: 24000          # sr
n_fft: 2048        # FFT size (samples).
n_shift: 300       # Hop size (samples). 12.5ms
win_length: 1200   # Window length (samples). 50ms
                   # If set to null, it will be the same as fft_size.
window: "hann"     # Window function.

# Only used for feats_type != raw
fmin: 80           # Minimum frequency of Mel basis.
fmax: 7600         # Maximum frequency of Mel basis.
n_mels: 80         # The number of mel basis.

# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80          # Minimum f0 for pitch extraction.
f0max: 400         # Maximum f0 for pitch extraction.

input_dim: 512     # Input dim
output_dim: 80     # Output dim
###########################################################
#                      MODEL SETTING                      #
###########################################################
model:
  adim: 384                              # attention dimension
  aheads: 2                              # number of attention heads
  elayers: 4                             # number of encoder layers
  eunits: 1536                           # number of encoder ff units
  dlayers: 4                             # number of decoder layers
  dunits: 1536                           # number of decoder ff units
  positionwise_layer_type: conv1d        # type of position-wise layer
  positionwise_conv_kernel_size: 3       # kernel size of position-wise conv layer
  duration_predictor_layers: 2           # number of layers of duration predictor
  duration_predictor_chans: 256          # number of channels of duration predictor
  duration_predictor_kernel_size: 3      # filter size of duration predictor
  postnet_layers: 5                      # number of layers of postnet
  postnet_filts: 5                       # filter size of conv layers in postnet
  postnet_chans: 256                     # number of channels of conv layers in postnet
  use_scaled_pos_enc: True               # whether to use scaled positional encoding
  encoder_normalize_before: True         # whether to perform layer normalization before the input
  decoder_normalize_before: True         # whether to perform layer normalization before the input
  reduction_factor: 1                    # reduction factor
  init_type: xavier_uniform              # initialization type
  init_enc_alpha: 1.0                    # initial value of alpha of encoder scaled position encoding
  init_dec_alpha: 1.0                    # initial value of alpha of decoder scaled position encoding
  transformer_enc_dropout_rate: 0.2             # dropout rate for transformer encoder layer
  transformer_enc_positional_dropout_rate: 0.2  # dropout rate for transformer encoder positional encoding
  transformer_enc_attn_dropout_rate: 0.2        # dropout rate for transformer encoder attention layer
  transformer_dec_dropout_rate: 0.2             # dropout rate for transformer decoder layer
  transformer_dec_positional_dropout_rate: 0.2  # dropout rate for transformer decoder positional encoding
  transformer_dec_attn_dropout_rate: 0.2        # dropout rate for transformer decoder attention layer
  pitch_predictor_layers: 5              # number of conv layers in pitch predictor
  pitch_predictor_chans: 256             # number of channels of conv layers in pitch predictor
  pitch_predictor_kernel_size: 5         # kernel size of conv layers in pitch predictor
  pitch_predictor_dropout: 0.5           # dropout rate in pitch predictor
  pitch_embed_kernel_size: 1             # kernel size of conv embedding layer for pitch
  pitch_embed_dropout: 0.0               # dropout rate after conv embedding layer for pitch
  stop_gradient_from_pitch_predictor: True   # whether to stop the gradient from pitch predictor to encoder
  energy_predictor_layers: 2             # number of conv layers in energy predictor
  energy_predictor_chans: 256            # number of channels of conv layers in energy predictor
  energy_predictor_kernel_size: 3        # kernel size of conv layers in energy predictor
  energy_predictor_dropout: 0.5          # dropout rate in energy predictor
  energy_embed_kernel_size: 1            # kernel size of conv embedding layer for energy
  energy_embed_dropout: 0.0              # dropout rate after conv embedding layer for energy
  stop_gradient_from_energy_predictor: False  # whether to stop the gradient from energy predictor to encoder
  spk_embed_dim: 256                     # speaker embedding dimension
  spk_embed_integration_type: concat     # speaker embedding integration type
###########################################################
#                         Meta Data                       #
###########################################################
cmvn:
- cmvn_stats:
    istd:
    - 0.35553239898659167
    - 0.49447592669215484
    - 0.6062828060163584
    mean:
    - -15.016273397914093
    - -28.53514450850192
    - -29.57712718174046
pitch_cmvn:
- cmvn_stats:
    istd:
    - 0.6062828060163584
    mean:
    - -15.016273397914093
energy_cmvn:
- cmvn_stats:
    istd:
    - 0.6062828060163584
    mean:
    - -15.016273397914093
phone_map:
- <sp>
- <unk>
- b
- zh
...
- ang
- ie
spk_map:
- spk1
- spk2
...
- spkn
```
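Because phone_map and spk_map are stored as ordered lists, the separate phone_id.map / spk_id.map files are no longer needed at inference time. A sketch, assuming the list order preserves the original ids (so an id is just the list index); the path `conf/tts_infer.yaml` and the sample phone sequence are illustrative:

```python
# Sketch: rebuild symbol -> id lookups from the lists in the merged TTS config.
import yaml

with open("conf/tts_infer.yaml", "rt", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

phone2id = {p: i for i, p in enumerate(cfg["phone_map"])}
spk2id = {s: i for i, s in enumerate(cfg["spk_map"])}

phones = ["b", "ie", "<sp>", "zh", "ang"]
phone_ids = [phone2id.get(p, phone2id["<unk>"]) for p in phones]
spk_id = spk2id["spk1"]
print(phone_ids, spk_id)
```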
- Currently, for ASR the Saved Model Config file is generated offline; see generate_infer_yaml.py.
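The rough shape of that offline step, as a conceptual sketch only (this is not the actual generate_infer_yaml.py; all file names, the inlining details, and the list of dropped keys are assumptions for illustration):

```python
# Conceptual sketch: merge the training config with the metadata files and
# drop training-only sections to produce one inference config.
# Not the real generate_infer_yaml.py; paths and keys are illustrative.
import json
import yaml

TRAIN_ONLY_KEYS = ["train_manifest", "dev_manifest", "n_epoch",
                   "optim", "optim_conf", "scheduler", "scheduler_conf",
                   "log_interval", "checkpoint"]

with open("conf/transformer.yaml", "rt", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

# Inline the metadata that used to ship as separate files.
with open("data/mean_std.json", "rt", encoding="utf-8") as f:
    cmvn_stats = json.load(f)   # {"frame_num": ..., "mean_stat": [...], "var_stat": [...]}
with open("data/vocab.txt", "rt", encoding="utf-8") as f:
    vocab = [line.rstrip("\n") for line in f if line.strip()]

cfg["vocab"] = vocab

# Replace the cmvn file reference inside the preprocess pipeline by its content,
# assuming the preprocess pipeline is already inlined as a dict.
pre = cfg.get("preprocess_config")
if isinstance(pre, dict):
    for proc in pre.get("process", []):
        if proc.get("type") == "cmvn_json" and isinstance(proc.get("cmvn_path"), str):
            proc["cmvn_path"] = cmvn_stats

# Strip parameters that only matter for training.
for key in TRAIN_ONLY_KEYS:
    cfg.pop(key, None)

with open("conf/asr_infer.yaml", "wt", encoding="utf-8") as f:
    yaml.safe_dump(cfg, f, allow_unicode=True, sort_keys=False)
```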