dc9ed88f78
* initial_commit Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com> * init diarizer Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com> * vad+speaker Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com> * vad update Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com> * speaker done Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com> * initial working version Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com> * compare outputs Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com> * added uem support Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com> * pyannote improvements Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com> * updated config and script name Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com> * style fix Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com> * update Jenkins file Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com> * jenkins fix Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com> * jenkins fix Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com> * update file path in jenkins Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com> * update file path in jenkins Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com> * update file path in jenkins Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com> * jenkins quote fix Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com> * update offline speaker diarization notebook Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com> * intial working asr_with_diarization Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com> * almost done, revist scoring part Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com> * fixed eval in offline diarization with asr Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com> * update write2manifest to consider only up to max audio duration Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com> * asr with diarization notebook Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com> * 
Fixed ASR_with_diarization tutorial.ipynb and diarization_utils and edited config yaml file Signed-off-by: Taejin Park <tango4j@gmail.com> * Fixed VAD parameters in Speaker_Diarization_Inference.ipynb Signed-off-by: Taejin Park <tango4j@gmail.com> * Added Jenkins test, doc strings and updated README Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com> * update jenkins test Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com> * Doc info in offline_diarization_with_asr Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com> * Review comments Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com> * update outdir paths Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com> Co-authored-by: Taejin Park <tango4j@gmail.com>
48 lines
2.5 KiB
YAML
48 lines
2.5 KiB
YAML
name: &name "ClusterDiarizer"
|
|
|
|
num_workers: 4
|
|
sample_rate: 16000
|
|
batch_size: 64
|
|
|
|
diarizer:
|
|
manifest_filepath: ???
|
|
out_dir: ???
|
|
oracle_vad: False # if True, uses RTTM files provided in manifest file to get speech activity (VAD) timestamps
|
|
collar: 0.25 # collar value for scoring
|
|
ignore_overlap: True # consider or ignore overlap segments while scoring
|
|
|
|
vad:
|
|
model_path: null # .nemo local model path, pretrained model name, or null
|
|
external_vad_manifest: null # This option is provided to use external vad and provide its speech activity labels for speaker embeddings extraction. Only one of model_path or external_vad_manifest should be set
|
|
|
|
parameters: # Tuned parameters for CH109 (with 11 moved multi-speech sessions as dev set)
|
|
window_length_in_sec: 0.15 # window length in sec for VAD context input
|
|
shift_length_in_sec: 0.01 # shift length in sec for generating frame-level VAD predictions
|
|
smoothing: "median" # False or type of smoothing method (eg: median)
|
|
overlap: 0.875 # Overlap ratio for overlapped mean/median smoothing filter
|
|
onset: 0.4 # onset threshold for detecting the beginning of a speech
|
|
offset: 0.7 # offset threshold for detecting the end of a speech.
|
|
pad_onset: 0.05 # adding durations before each speech segment
|
|
pad_offset: -0.1 # adding durations after each speech segment
|
|
min_duration_on: 0.2 # threshold for short speech segment deletion
|
|
min_duration_off: 0.2 # threshold for small non_speech deletion
|
|
filter_speech_first: True
|
|
|
|
speaker_embeddings:
|
|
model_path: ??? #.nemo local model path or pretrained model name (ecapa_tdnn or speakerverification_speakernet)
|
|
parameters:
|
|
window_length_in_sec: 1.5 # window length in sec for speaker embedding extraction
|
|
shift_length_in_sec: 0.75 # shift length in sec for speaker embedding extraction
|
|
save_embeddings: False # save embeddings as pickle file for each audio input
|
|
|
|
clustering:
|
|
parameters:
|
|
oracle_num_speakers: False # if True, uses num of speakers value provided in manifest file
|
|
max_num_speakers: 20 # max number of speakers for each recording. Ignored when oracle_num_speakers is True.
|
|
enhanced_count_thres: 80
|
|
max_rp_threshold: 0.25
|
|
sparse_search_volume: 30
|
|
|
|
# json manifest line example
|
|
# {"audio_filepath": "/path/to/audio_file", "offset": 0, "duration": null, "label": "infer", "text": "-", "num_speakers": null, "rttm_filepath": "/path/to/rttm/file", "uem_filepath": "/path/to/uem/filepath"}
|