sabuhigr/sabuhi-model-v2 🖼️❓📝🔢✓ → ❓
About
Whisper model that can be used for adding domain-specific words
Example Output
Output
{"text":" Hi, my name is Sabuki. I like to play games like Dungeons and Dragons game, which includes NPC names like Zerthimon, Vlaakith and Mordenkainen, as well as place names like Agni'hotri, Tu'narath and Niam'd'regal.","segments":[{"end":4.71,"text":" Hi, my name is Sabuki.","start":0.828,"words":[{"end":1.068,"word":"Hi,","score":0.824,"start":0.828,"speaker":"SPEAKER_00"},{"end":1.949,"word":"my","score":0.964,"start":1.829,"speaker":"SPEAKER_00"},{"end":2.149,"word":"name","score":0.814,"start":1.989,"speaker":"SPEAKER_00"},{"end":2.249,"word":"is","score":0.798,"start":2.189,"speaker":"SPEAKER_00"},{"end":2.769,"word":"Sabuki.","score":0.815,"start":2.309,"speaker":"SPEAKER_00"}],"speaker":"SPEAKER_00"},{"end":23.297,"text":"I like to play games like Dungeons and Dragons game, which includes NPC names like Zerthimon, Vlaakith and Mordenkainen, as well as place names like Agni'hotri, Tu'narath and Niam'd'regal.","start":4.71,"words":[{"end":4.79,"word":"I","score":0.988,"start":4.71,"speaker":"SPEAKER_00"},{"end":5.09,"word":"like","score":0.893,"start":4.85,"speaker":"SPEAKER_00"},{"end":5.27,"word":"to","score":0.843,"start":5.15,"speaker":"SPEAKER_00"},{"end":5.59,"word":"play","score":0.896,"start":5.29,"speaker":"SPEAKER_00"},{"end":6.15,"word":"games","score":0.805,"start":5.79,"speaker":"SPEAKER_00"},{"end":7.631,"word":"like","score":0.962,"start":7.411,"speaker":"SPEAKER_00"},{"end":8.851,"word":"Dungeons","score":0.838,"start":8.291,"speaker":"SPEAKER_00"},{"end":9.011,"word":"and","score":0.891,"start":8.931,"speaker":"SPEAKER_00"},{"end":9.472,"word":"Dragons","score":0.91,"start":9.051,"speaker":"SPEAKER_00"},{"end":9.792,"word":"game,","score":0.817,"start":9.512,"speaker":"SPEAKER_00"},{"end":11.012,"word":"which","score":0.97,"start":10.852,"speaker":"SPEAKER_00"},{"end":11.893,"word":"includes","score":0.909,"start":11.072,"speaker":"SPEAKER_00"},{"end":12.493,"word":"NPC","score":0.739,"start":12.153,"speaker":"SPEAKER_00"},{"end":12.753,"wo
rd":"names","score":0.827,"start":12.513,"speaker":"SPEAKER_00"},{"end":13.073,"word":"like","score":0.98,"start":12.833,"speaker":"SPEAKER_00"},{"end":14.033,"word":"Zerthimon,","score":0.87,"start":13.393,"speaker":"SPEAKER_00"},{"end":15.034,"word":"Vlaakith","score":0.698,"start":14.534,"speaker":"SPEAKER_00"},{"end":15.574,"word":"and","score":0.978,"start":15.454,"speaker":"SPEAKER_00"},{"end":16.454,"word":"Mordenkainen,","score":0.801,"start":15.674,"speaker":"SPEAKER_00"},{"end":16.774,"word":"as","score":0.929,"start":16.714,"speaker":"SPEAKER_00"},{"end":16.955,"word":"well","score":0.866,"start":16.814,"speaker":"SPEAKER_00"},{"end":17.055,"word":"as","score":0.765,"start":16.995,"speaker":"SPEAKER_00"},{"end":18.255,"word":"place","score":0.872,"start":17.975,"speaker":"SPEAKER_00"},{"end":18.575,"word":"names","score":0.799,"start":18.335,"speaker":"SPEAKER_00"},{"end":18.915,"word":"like","score":0.842,"start":18.655,"speaker":"SPEAKER_00"},{"end":20.616,"word":"Agni'hotri,","score":0.893,"start":19.776,"speaker":"SPEAKER_00"},{"end":21.816,"word":"Tu'narath","score":0.692,"start":21.216,"speaker":"SPEAKER_00"},{"end":22.397,"word":"and","score":0.807,"start":22.237,"speaker":"SPEAKER_00"},{"end":23.297,"word":"Niam'd'regal.","score":0.73,"start":22.617,"speaker":"SPEAKER_00"}],"speaker":"SPEAKER_00"}],"translation":null,"transcription":"[{'start': 0.828, 'end': 4.71, 'text': ' Hi, my name is Sabuki.', 'words': [{'word': 'Hi,', 'start': 0.828, 'end': 1.068, 'score': 0.824, 'speaker': 'SPEAKER_00'}, {'word': 'my', 'start': 1.829, 'end': 1.949, 'score': 0.964, 'speaker': 'SPEAKER_00'}, {'word': 'name', 'start': 1.989, 'end': 2.149, 'score': 0.814, 'speaker': 'SPEAKER_00'}, {'word': 'is', 'start': 2.189, 'end': 2.249, 'score': 0.798, 'speaker': 'SPEAKER_00'}, {'word': 'Sabuki.', 'start': 2.309, 'end': 2.769, 'score': 0.815, 'speaker': 'SPEAKER_00'}], 'speaker': 'SPEAKER_00'}, {'start': 4.71, 'end': 23.297, 'text': "I like to play games like Dungeons and 
Dragons game, which includes NPC names like Zerthimon, Vlaakith and Mordenkainen, as well as place names like Agni'hotri, Tu'narath and Niam'd'regal.", 'words': [{'word': 'I', 'start': 4.71, 'end': 4.79, 'score': 0.988, 'speaker': 'SPEAKER_00'}, {'word': 'like', 'start': 4.85, 'end': 5.09, 'score': 0.893, 'speaker': 'SPEAKER_00'}, {'word': 'to', 'start': 5.15, 'end': 5.27, 'score': 0.843, 'speaker': 'SPEAKER_00'}, {'word': 'play', 'start': 5.29, 'end': 5.59, 'score': 0.896, 'speaker': 'SPEAKER_00'}, {'word': 'games', 'start': 5.79, 'end': 6.15, 'score': 0.805, 'speaker': 'SPEAKER_00'}, {'word': 'like', 'start': 7.411, 'end': 7.631, 'score': 0.962, 'speaker': 'SPEAKER_00'}, {'word': 'Dungeons', 'start': 8.291, 'end': 8.851, 'score': 0.838, 'speaker': 'SPEAKER_00'}, {'word': 'and', 'start': 8.931, 'end': 9.011, 'score': 0.891, 'speaker': 'SPEAKER_00'}, {'word': 'Dragons', 'start': 9.051, 'end': 9.472, 'score': 0.91, 'speaker': 'SPEAKER_00'}, {'word': 'game,', 'start': 9.512, 'end': 9.792, 'score': 0.817, 'speaker': 'SPEAKER_00'}, {'word': 'which', 'start': 10.852, 'end': 11.012, 'score': 0.97, 'speaker': 'SPEAKER_00'}, {'word': 'includes', 'start': 11.072, 'end': 11.893, 'score': 0.909, 'speaker': 'SPEAKER_00'}, {'word': 'NPC', 'start': 12.153, 'end': 12.493, 'score': 0.739, 'speaker': 'SPEAKER_00'}, {'word': 'names', 'start': 12.513, 'end': 12.753, 'score': 0.827, 'speaker': 'SPEAKER_00'}, {'word': 'like', 'start': 12.833, 'end': 13.073, 'score': 0.98, 'speaker': 'SPEAKER_00'}, {'word': 'Zerthimon,', 'start': 13.393, 'end': 14.033, 'score': 0.87, 'speaker': 'SPEAKER_00'}, {'word': 'Vlaakith', 'start': 14.534, 'end': 15.034, 'score': 0.698, 'speaker': 'SPEAKER_00'}, {'word': 'and', 'start': 15.454, 'end': 15.574, 'score': 0.978, 'speaker': 'SPEAKER_00'}, {'word': 'Mordenkainen,', 'start': 15.674, 'end': 16.454, 'score': 0.801, 'speaker': 'SPEAKER_00'}, {'word': 'as', 'start': 16.714, 'end': 16.774, 'score': 0.929, 'speaker': 'SPEAKER_00'}, {'word': 'well', 'start': 
16.814, 'end': 16.955, 'score': 0.866, 'speaker': 'SPEAKER_00'}, {'word': 'as', 'start': 16.995, 'end': 17.055, 'score': 0.765, 'speaker': 'SPEAKER_00'}, {'word': 'place', 'start': 17.975, 'end': 18.255, 'score': 0.872, 'speaker': 'SPEAKER_00'}, {'word': 'names', 'start': 18.335, 'end': 18.575, 'score': 0.799, 'speaker': 'SPEAKER_00'}, {'word': 'like', 'start': 18.655, 'end': 18.915, 'score': 0.842, 'speaker': 'SPEAKER_00'}, {'word': "Agni'hotri,", 'start': 19.776, 'end': 20.616, 'score': 0.893, 'speaker': 'SPEAKER_00'}, {'word': "Tu'narath", 'start': 21.216, 'end': 21.816, 'score': 0.692, 'speaker': 'SPEAKER_00'}, {'word': 'and', 'start': 22.237, 'end': 22.397, 'score': 0.807, 'speaker': 'SPEAKER_00'}, {'word': "Niam'd'regal.", 'start': 22.617, 'end': 23.297, 'score': 0.73, 'speaker': 'SPEAKER_00'}], 'speaker': 'SPEAKER_00'}]","detected_language":"english","diarization_status":true}
Performance Metrics
41.32s
Prediction Time
389.93s
Total Time
All Input Parameters
{
"audio": "https://replicate.delivery/pbxt/JZJaU6Kirgn3PbXeJQI8Z73csEeltew3ZHrWpRTpMkYdWneV/initial_prompt_test_record.wav",
"model": "large-v2",
"hf_token": "<YOUR_HF_TOKEN>",
"language": "en",
"translate": false,
"temperature": 0,
"max_speakers": 1,
"min_speakers": 1,
"transcription": "plain text",
"suppress_tokens": "-1",
"logprob_threshold": -1,
"no_speech_threshold": 0.6,
"domain_specific_words": "The following conversation talks about: Dungeons and Dragons, Zerthimon, Vlaakith, Mordenkainen, Agni'hotri, Tu'narath, Niam'd'regal.",
"condition_on_previous_text": true,
"compression_ratio_threshold": 2.4,
"temperature_increment_on_fallback": 0.2
}
Input Parameters
- audio (required)
- Audio file
- model
- Choose a Whisper model. Currently only large-v2 is available, for best performance.
- hf_token (required)
- Your Hugging Face token for speaker diarization
- language (required)
- language spoken in the audio, specify None to perform language detection
- patience
- optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search
- translate
- Translate the text to English when set to True
- temperature
- temperature to use for sampling
- max_speakers
- Select 2 if the recording is stereo, 1 if it is mono. Default is 1 (mono recordings).
- min_speakers
- Select 2 if the recording is stereo, 1 if it is mono. Default is 1 (mono recordings).
- transcription
- Choose the format for the transcription
- suppress_tokens
- comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations
- logprob_threshold
- if the average log probability is lower than this value, treat the decoding as failed
- no_speech_threshold
- if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence
- domain_specific_words
- Enter your domain-specific words here as a comma-separated list, e.g. 'Udghreethn, Soiyhnlaaqh, antibiotic'.
- condition_on_previous_text
- if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop
- compression_ratio_threshold
- if the gzip compression ratio is higher than this value, treat the decoding as failed
- temperature_increment_on_fallback
- temperature to increase when falling back when the decoding fails to meet either of the thresholds below
Output Schema
Example Execution Logs
Transcribe with large-v2 model
Max speakers is: 1
Min speakers is: 1
0%| | 0.00/16.9M [00:00<?, ?iB/s]
0%| | 40.0k/16.9M [00:00<01:22, 215kiB/s]
1%|▎ | 160k/16.9M [00:00<00:27, 649kiB/s]
3%|█▏ | 520k/16.9M [00:00<00:12, 1.34MiB/s]
8%|███ | 1.40M/16.9M [00:00<00:04, 3.64MiB/s]
20%|███████▍ | 3.38M/16.9M [00:00<00:01, 8.64MiB/s]
31%|███████████▌ | 5.27M/16.9M [00:00<00:01, 11.9MiB/s]
51%|██████████████████▉ | 8.63M/16.9M [00:00<00:00, 17.2MiB/s]
71%|██████████████████████████ | 11.9M/16.9M [00:01<00:00, 22.0MiB/s]
89%|████████████████████████████████▉ | 15.0M/16.9M [00:01<00:00, 22.1MiB/s]
100%|█████████████████████████████████████| 16.9M/16.9M [00:01<00:00, 15.1MiB/s]
Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.0.4. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint --file ../root/.cache/torch/whisperx-vad-segmentation.bin`
Model was trained with pyannote.audio 0.0.1, yours is 2.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.0.1+cu118. Bad things might happen unless you revert torch to 1.x.
Domain specific words is: The following conversation talks about: Dungeons and Dragons, Zerthimon, Vlaakith, Mordenkainen, Agni'hotri, Tu'narath, Niam'd'regal.
Downloading (…)olve/2.1/config.yaml: 0%| | 0.00/500 [00:00<?, ?B/s]
Downloading (…)olve/2.1/config.yaml: 100%|██████████| 500/500 [00:00<00:00, 2.25MB/s]
Downloading pytorch_model.bin: 0%| | 0.00/17.7M [00:00<?, ?B/s]
Downloading pytorch_model.bin: 59%|█████▉ | 10.5M/17.7M [00:00<00:00, 44.9MB/s]
Downloading pytorch_model.bin: 100%|██████████| 17.7M/17.7M [00:00<00:00, 57.4MB/s]
Downloading (…)/2022.07/config.yaml: 0%| | 0.00/318 [00:00<?, ?B/s]
Downloading (…)/2022.07/config.yaml: 100%|██████████| 318/318 [00:00<00:00, 1.22MB/s]
Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.0.4. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint --file ../root/.cache/torch/pyannote/models--pyannote--segmentation/snapshots/c4c8ceafcbb3a7a280c2d357aee9fbc9b0be7f9b/pytorch_model.bin`
Model was trained with pyannote.audio 0.0.1, yours is 2.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.0.1+cu118. Bad things might happen unless you revert torch to 1.x.
Downloading (…)ain/hyperparams.yaml: 0%| | 0.00/1.92k [00:00<?, ?B/s]
Downloading (…)ain/hyperparams.yaml: 100%|██████████| 1.92k/1.92k [00:00<00:00, 5.91MB/s]
Downloading embedding_model.ckpt: 0%| | 0.00/83.3M [00:00<?, ?B/s]
Downloading embedding_model.ckpt: 25%|██▌ | 21.0M/83.3M [00:00<00:00, 110MB/s]
Downloading embedding_model.ckpt: 50%|█████ | 41.9M/83.3M [00:00<00:00, 111MB/s]
Downloading embedding_model.ckpt: 76%|███████▌ | 62.9M/83.3M [00:00<00:00, 111MB/s]
Downloading embedding_model.ckpt: 100%|██████████| 83.3M/83.3M [00:00<00:00, 112MB/s]
Downloading embedding_model.ckpt: 100%|██████████| 83.3M/83.3M [00:00<00:00, 110MB/s]
Downloading (…)an_var_norm_emb.ckpt: 0%| | 0.00/1.92k [00:00<?, ?B/s]
Downloading (…)an_var_norm_emb.ckpt: 100%|██████████| 1.92k/1.92k [00:00<00:00, 5.68MB/s]
Downloading classifier.ckpt: 0%| | 0.00/5.53M [00:00<?, ?B/s]
Downloading classifier.ckpt: 100%|██████████| 5.53M/5.53M [00:00<00:00, 117MB/s]
Downloading (…)in/label_encoder.txt: 0%| | 0.00/129k [00:00<?, ?B/s]
Downloading (…)in/label_encoder.txt: 100%|██████████| 129k/129k [00:00<00:00, 5.99MB/s]
0 1 ... intersection union
0 [ 00:00:00.497 --> 00:00:02.826] A ... -19.790437 22.799188
1 [ 00:00:03.906 --> 00:00:23.397] B ... 0.680000 19.490625
[2 rows x 7 columns]
[{'start': 0.828, 'end': 4.71, 'text': ' Hi, my name is Sabuki.', 'words': [{'word': 'Hi,', 'start': 0.828, 'end': 1.068, 'score': 0.824, 'speaker': 'SPEAKER_00'}, {'word': 'my', 'start': 1.829, 'end': 1.949, 'score': 0.964, 'speaker': 'SPEAKER_00'}, {'word': 'name', 'start': 1.989, 'end': 2.149, 'score': 0.814, 'speaker': 'SPEAKER_00'}, {'word': 'is', 'start': 2.189, 'end': 2.249, 'score': 0.798, 'speaker': 'SPEAKER_00'}, {'word': 'Sabuki.', 'start': 2.309, 'end': 2.769, 'score': 0.815, 'speaker': 'SPEAKER_00'}], 'speaker': 'SPEAKER_00'}, {'start': 4.71, 'end': 23.297, 'text': "I like to play games like Dungeons and Dragons game, which includes NPC names like Zerthimon, Vlaakith and Mordenkainen, as well as place names like Agni'hotri, Tu'narath and Niam'd'regal.", 'words': [{'word': 'I', 'start': 4.71, 'end': 4.79, 'score': 0.988, 'speaker': 'SPEAKER_00'}, {'word': 'like', 'start': 4.85, 'end': 5.09, 'score': 0.893, 'speaker': 'SPEAKER_00'}, {'word': 'to', 'start': 5.15, 'end': 5.27, 'score': 0.843, 'speaker': 'SPEAKER_00'}, {'word': 'play', 'start': 5.29, 'end': 5.59, 'score': 0.896, 'speaker': 'SPEAKER_00'}, {'word': 'games', 'start': 5.79, 'end': 6.15, 'score': 0.805, 'speaker': 'SPEAKER_00'}, {'word': 'like', 'start': 7.411, 'end': 7.631, 'score': 0.962, 'speaker': 'SPEAKER_00'}, {'word': 'Dungeons', 'start': 8.291, 'end': 8.851, 'score': 0.838, 'speaker': 'SPEAKER_00'}, {'word': 'and', 'start': 8.931, 'end': 9.011, 'score': 0.891, 'speaker': 'SPEAKER_00'}, {'word': 'Dragons', 'start': 9.051, 'end': 9.472, 'score': 0.91, 'speaker': 'SPEAKER_00'}, {'word': 'game,', 'start': 9.512, 'end': 9.792, 'score': 0.817, 'speaker': 'SPEAKER_00'}, {'word': 'which', 'start': 10.852, 'end': 11.012, 'score': 0.97, 'speaker': 'SPEAKER_00'}, {'word': 'includes', 'start': 11.072, 'end': 11.893, 'score': 0.909, 'speaker': 'SPEAKER_00'}, {'word': 'NPC', 'start': 12.153, 'end': 12.493, 'score': 0.739, 'speaker': 'SPEAKER_00'}, {'word': 'names', 'start': 12.513, 'end': 12.753, 
'score': 0.827, 'speaker': 'SPEAKER_00'}, {'word': 'like', 'start': 12.833, 'end': 13.073, 'score': 0.98, 'speaker': 'SPEAKER_00'}, {'word': 'Zerthimon,', 'start': 13.393, 'end': 14.033, 'score': 0.87, 'speaker': 'SPEAKER_00'}, {'word': 'Vlaakith', 'start': 14.534, 'end': 15.034, 'score': 0.698, 'speaker': 'SPEAKER_00'}, {'word': 'and', 'start': 15.454, 'end': 15.574, 'score': 0.978, 'speaker': 'SPEAKER_00'}, {'word': 'Mordenkainen,', 'start': 15.674, 'end': 16.454, 'score': 0.801, 'speaker': 'SPEAKER_00'}, {'word': 'as', 'start': 16.714, 'end': 16.774, 'score': 0.929, 'speaker': 'SPEAKER_00'}, {'word': 'well', 'start': 16.814, 'end': 16.955, 'score': 0.866, 'speaker': 'SPEAKER_00'}, {'word': 'as', 'start': 16.995, 'end': 17.055, 'score': 0.765, 'speaker': 'SPEAKER_00'}, {'word': 'place', 'start': 17.975, 'end': 18.255, 'score': 0.872, 'speaker': 'SPEAKER_00'}, {'word': 'names', 'start': 18.335, 'end': 18.575, 'score': 0.799, 'speaker': 'SPEAKER_00'}, {'word': 'like', 'start': 18.655, 'end': 18.915, 'score': 0.842, 'speaker': 'SPEAKER_00'}, {'word': "Agni'hotri,", 'start': 19.776, 'end': 20.616, 'score': 0.893, 'speaker': 'SPEAKER_00'}, {'word': "Tu'narath", 'start': 21.216, 'end': 21.816, 'score': 0.692, 'speaker': 'SPEAKER_00'}, {'word': 'and', 'start': 22.237, 'end': 22.397, 'score': 0.807, 'speaker': 'SPEAKER_00'}, {'word': "Niam'd'regal.", 'start': 22.617, 'end': 23.297, 'score': 0.73, 'speaker': 'SPEAKER_00'}], 'speaker': 'SPEAKER_00'}]
Version Details
- Version ID
- f030b3c37c8341d86a383c9b93bfd7684b2962ce26814191e664c88c4d931d35
- Version Created
- January 8, 2025