sabuhigr/sabuhi-model-v2 🖼️❓📝🔢✓ → ❓

▶️ 33.0K runs 📅 Jul 2023 ⚙️ Cog 0.13.6

speaker-diarization speech-to-text speech-translation

Performance

41.3s Typical run time

~390s Cold start (first call)

33.0K Total runs

About

Whisper model that can be used for adding domain-specific words

Example Output

Output

{"text":" Hi, my name is Sabuki. I like to play games like Dungeons and Dragons game, which includes NPC names like Zerthimon, Vlaakith and Mordenkainen, as well as place names like Agni'hotri, Tu'narath and Niam'd'regal.","segments":[{"end":4.71,"text":" Hi, my name is Sabuki.","start":0.828,"words":[{"end":1.068,"word":"Hi,","score":0.824,"start":0.828,"speaker":"SPEAKER_00"},{"end":1.949,"word":"my","score":0.964,"start":1.829,"speaker":"SPEAKER_00"},{"end":2.149,"word":"name","score":0.814,"start":1.989,"speaker":"SPEAKER_00"},{"end":2.249,"word":"is","score":0.798,"start":2.189,"speaker":"SPEAKER_00"},{"end":2.769,"word":"Sabuki.","score":0.815,"start":2.309,"speaker":"SPEAKER_00"}],"speaker":"SPEAKER_00"},{"end":23.297,"text":"I like to play games like Dungeons and Dragons game, which includes NPC names like Zerthimon, Vlaakith and Mordenkainen, as well as place names like Agni'hotri, Tu'narath and Niam'd'regal.","start":4.71,"words":[{"end":4.79,"word":"I","score":0.988,"start":4.71,"speaker":"SPEAKER_00"},{"end":5.09,"word":"like","score":0.893,"start":4.85,"speaker":"SPEAKER_00"},{"end":5.27,"word":"to","score":0.843,"start":5.15,"speaker":"SPEAKER_00"},{"end":5.59,"word":"play","score":0.896,"start":5.29,"speaker":"SPEAKER_00"},{"end":6.15,"word":"games","score":0.805,"start":5.79,"speaker":"SPEAKER_00"},{"end":7.631,"word":"like","score":0.962,"start":7.411,"speaker":"SPEAKER_00"},{"end":8.851,"word":"Dungeons","score":0.838,"start":8.291,"speaker":"SPEAKER_00"},{"end":9.011,"word":"and","score":0.891,"start":8.931,"speaker":"SPEAKER_00"},{"end":9.472,"word":"Dragons","score":0.91,"start":9.051,"speaker":"SPEAKER_00"},{"end":9.792,"word":"game,","score":0.817,"start":9.512,"speaker":"SPEAKER_00"},{"end":11.012,"word":"which","score":0.97,"start":10.852,"speaker":"SPEAKER_00"},{"end":11.893,"word":"includes","score":0.909,"start":11.072,"speaker":"SPEAKER_00"},{"end":12.493,"word":"NPC","score":0.739,"start":12.153,"speaker":"SPEAKER_00"},{"end":12.753,"wo
rd":"names","score":0.827,"start":12.513,"speaker":"SPEAKER_00"},{"end":13.073,"word":"like","score":0.98,"start":12.833,"speaker":"SPEAKER_00"},{"end":14.033,"word":"Zerthimon,","score":0.87,"start":13.393,"speaker":"SPEAKER_00"},{"end":15.034,"word":"Vlaakith","score":0.698,"start":14.534,"speaker":"SPEAKER_00"},{"end":15.574,"word":"and","score":0.978,"start":15.454,"speaker":"SPEAKER_00"},{"end":16.454,"word":"Mordenkainen,","score":0.801,"start":15.674,"speaker":"SPEAKER_00"},{"end":16.774,"word":"as","score":0.929,"start":16.714,"speaker":"SPEAKER_00"},{"end":16.955,"word":"well","score":0.866,"start":16.814,"speaker":"SPEAKER_00"},{"end":17.055,"word":"as","score":0.765,"start":16.995,"speaker":"SPEAKER_00"},{"end":18.255,"word":"place","score":0.872,"start":17.975,"speaker":"SPEAKER_00"},{"end":18.575,"word":"names","score":0.799,"start":18.335,"speaker":"SPEAKER_00"},{"end":18.915,"word":"like","score":0.842,"start":18.655,"speaker":"SPEAKER_00"},{"end":20.616,"word":"Agni'hotri,","score":0.893,"start":19.776,"speaker":"SPEAKER_00"},{"end":21.816,"word":"Tu'narath","score":0.692,"start":21.216,"speaker":"SPEAKER_00"},{"end":22.397,"word":"and","score":0.807,"start":22.237,"speaker":"SPEAKER_00"},{"end":23.297,"word":"Niam'd'regal.","score":0.73,"start":22.617,"speaker":"SPEAKER_00"}],"speaker":"SPEAKER_00"}],"translation":null,"transcription":"[{'start': 0.828, 'end': 4.71, 'text': ' Hi, my name is Sabuki.', 'words': [{'word': 'Hi,', 'start': 0.828, 'end': 1.068, 'score': 0.824, 'speaker': 'SPEAKER_00'}, {'word': 'my', 'start': 1.829, 'end': 1.949, 'score': 0.964, 'speaker': 'SPEAKER_00'}, {'word': 'name', 'start': 1.989, 'end': 2.149, 'score': 0.814, 'speaker': 'SPEAKER_00'}, {'word': 'is', 'start': 2.189, 'end': 2.249, 'score': 0.798, 'speaker': 'SPEAKER_00'}, {'word': 'Sabuki.', 'start': 2.309, 'end': 2.769, 'score': 0.815, 'speaker': 'SPEAKER_00'}], 'speaker': 'SPEAKER_00'}, {'start': 4.71, 'end': 23.297, 'text': "I like to play games like Dungeons and 
Dragons game, which includes NPC names like Zerthimon, Vlaakith and Mordenkainen, as well as place names like Agni'hotri, Tu'narath and Niam'd'regal.", 'words': [{'word': 'I', 'start': 4.71, 'end': 4.79, 'score': 0.988, 'speaker': 'SPEAKER_00'}, {'word': 'like', 'start': 4.85, 'end': 5.09, 'score': 0.893, 'speaker': 'SPEAKER_00'}, {'word': 'to', 'start': 5.15, 'end': 5.27, 'score': 0.843, 'speaker': 'SPEAKER_00'}, {'word': 'play', 'start': 5.29, 'end': 5.59, 'score': 0.896, 'speaker': 'SPEAKER_00'}, {'word': 'games', 'start': 5.79, 'end': 6.15, 'score': 0.805, 'speaker': 'SPEAKER_00'}, {'word': 'like', 'start': 7.411, 'end': 7.631, 'score': 0.962, 'speaker': 'SPEAKER_00'}, {'word': 'Dungeons', 'start': 8.291, 'end': 8.851, 'score': 0.838, 'speaker': 'SPEAKER_00'}, {'word': 'and', 'start': 8.931, 'end': 9.011, 'score': 0.891, 'speaker': 'SPEAKER_00'}, {'word': 'Dragons', 'start': 9.051, 'end': 9.472, 'score': 0.91, 'speaker': 'SPEAKER_00'}, {'word': 'game,', 'start': 9.512, 'end': 9.792, 'score': 0.817, 'speaker': 'SPEAKER_00'}, {'word': 'which', 'start': 10.852, 'end': 11.012, 'score': 0.97, 'speaker': 'SPEAKER_00'}, {'word': 'includes', 'start': 11.072, 'end': 11.893, 'score': 0.909, 'speaker': 'SPEAKER_00'}, {'word': 'NPC', 'start': 12.153, 'end': 12.493, 'score': 0.739, 'speaker': 'SPEAKER_00'}, {'word': 'names', 'start': 12.513, 'end': 12.753, 'score': 0.827, 'speaker': 'SPEAKER_00'}, {'word': 'like', 'start': 12.833, 'end': 13.073, 'score': 0.98, 'speaker': 'SPEAKER_00'}, {'word': 'Zerthimon,', 'start': 13.393, 'end': 14.033, 'score': 0.87, 'speaker': 'SPEAKER_00'}, {'word': 'Vlaakith', 'start': 14.534, 'end': 15.034, 'score': 0.698, 'speaker': 'SPEAKER_00'}, {'word': 'and', 'start': 15.454, 'end': 15.574, 'score': 0.978, 'speaker': 'SPEAKER_00'}, {'word': 'Mordenkainen,', 'start': 15.674, 'end': 16.454, 'score': 0.801, 'speaker': 'SPEAKER_00'}, {'word': 'as', 'start': 16.714, 'end': 16.774, 'score': 0.929, 'speaker': 'SPEAKER_00'}, {'word': 'well', 'start': 
16.814, 'end': 16.955, 'score': 0.866, 'speaker': 'SPEAKER_00'}, {'word': 'as', 'start': 16.995, 'end': 17.055, 'score': 0.765, 'speaker': 'SPEAKER_00'}, {'word': 'place', 'start': 17.975, 'end': 18.255, 'score': 0.872, 'speaker': 'SPEAKER_00'}, {'word': 'names', 'start': 18.335, 'end': 18.575, 'score': 0.799, 'speaker': 'SPEAKER_00'}, {'word': 'like', 'start': 18.655, 'end': 18.915, 'score': 0.842, 'speaker': 'SPEAKER_00'}, {'word': "Agni'hotri,", 'start': 19.776, 'end': 20.616, 'score': 0.893, 'speaker': 'SPEAKER_00'}, {'word': "Tu'narath", 'start': 21.216, 'end': 21.816, 'score': 0.692, 'speaker': 'SPEAKER_00'}, {'word': 'and', 'start': 22.237, 'end': 22.397, 'score': 0.807, 'speaker': 'SPEAKER_00'}, {'word': "Niam'd'regal.", 'start': 22.617, 'end': 23.297, 'score': 0.73, 'speaker': 'SPEAKER_00'}], 'speaker': 'SPEAKER_00'}]","detected_language":"english","diarization_status":true}

Performance Metrics

41.32s Prediction Time

389.93s Total Time

All Input Parameters

{
  "audio": "https://replicate.delivery/pbxt/JZJaU6Kirgn3PbXeJQI8Z73csEeltew3ZHrWpRTpMkYdWneV/initial_prompt_test_record.wav",
  "model": "large-v2",
  "hf_token": "<YOUR_HF_TOKEN>",
  "language": "en",
  "translate": false,
  "temperature": 0,
  "max_speakers": 1,
  "min_speakers": 1,
  "transcription": "plain text",
  "suppress_tokens": "-1",
  "logprob_threshold": -1,
  "no_speech_threshold": 0.6,
  "domain_specific_words": "The following conversation talks about: Dungeons and Dragons, Zerthimon, Vlaakith, Mordenkainen, Agni'hotri, Tu'narath, Niam'd'regal.",
  "condition_on_previous_text": true,
  "compression_ratio_threshold": 2.4,
  "temperature_increment_on_fallback": 0.2
}

Input Parameters

audio (required) Type: string: Audio file
model Default: large-v2: Choose a Whisper model. [For now only large-v2 is available, for best performance]
hf_token (required) Type: string: Your Hugging Face token for speaker diarization
language (required): language spoken in the audio, specify None to perform language detection
patience Type: number: optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search
translate Type: boolean Default: false: Translate the text to English when set to True
temperature Type: number Default: 0: temperature to use for sampling
max_speakers Default: 1: Select 2 if the recording is stereo, 1 if it is mono. Default is 1 for mono recordings
min_speakers Default: 1: Select 2 if the recording is stereo, 1 if it is mono. Default is 1 for mono recordings
transcription Default: plain text: Choose the format for the transcription
suppress_tokens Type: string Default: -1: comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuation
logprob_threshold Type: number Default: -1: if the average log probability is lower than this value, treat the decoding as failed
no_speech_threshold Type: number Default: 0.6: if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence
domain_specific_words Type: string: Enter your domain-specific words here as a comma-separated list, e.g. 'Udghreethn, Soiyhnlaaqh, antibiotic, etc.'
condition_on_previous_text Type: boolean Default: true: if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop
compression_ratio_threshold Type: number Default: 2.4: if the gzip compression ratio is higher than this value, treat the decoding as failed
temperature_increment_on_fallback Type: number Default: 0.2: temperature to increase when falling back when the decoding fails to meet either of the thresholds (`compression_ratio_threshold` or `logprob_threshold`)

Output Schema

Example Execution Logs

Transcribe with large-v2 model
Max speakers is: 1
Min speakers is: 1
  0%|                                              | 0.00/16.9M [00:00<?, ?iB/s]
  0%|                                      | 40.0k/16.9M [00:00<01:22, 215kiB/s]
  1%|▎                                      | 160k/16.9M [00:00<00:27, 649kiB/s]
  3%|█▏                                    | 520k/16.9M [00:00<00:12, 1.34MiB/s]
  8%|███                                  | 1.40M/16.9M [00:00<00:04, 3.64MiB/s]
 20%|███████▍                             | 3.38M/16.9M [00:00<00:01, 8.64MiB/s]
 31%|███████████▌                         | 5.27M/16.9M [00:00<00:01, 11.9MiB/s]
 51%|██████████████████▉                  | 8.63M/16.9M [00:00<00:00, 17.2MiB/s]
 71%|██████████████████████████           | 11.9M/16.9M [00:01<00:00, 22.0MiB/s]
 89%|████████████████████████████████▉    | 15.0M/16.9M [00:01<00:00, 22.1MiB/s]
100%|█████████████████████████████████████| 16.9M/16.9M [00:01<00:00, 15.1MiB/s]
Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.0.4. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint --file ../root/.cache/torch/whisperx-vad-segmentation.bin`
Model was trained with pyannote.audio 0.0.1, yours is 2.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.0.1+cu118. Bad things might happen unless you revert torch to 1.x.
Domain specific words is: The following conversation talks about: Dungeons and Dragons, Zerthimon, Vlaakith, Mordenkainen, Agni'hotri, Tu'narath, Niam'd'regal.
Downloading (…)olve/2.1/config.yaml:   0%|          | 0.00/500 [00:00<?, ?B/s]
Downloading (…)olve/2.1/config.yaml: 100%|██████████| 500/500 [00:00<00:00, 2.25MB/s]
Downloading pytorch_model.bin:   0%|          | 0.00/17.7M [00:00<?, ?B/s]
Downloading pytorch_model.bin:  59%|█████▉    | 10.5M/17.7M [00:00<00:00, 44.9MB/s]
Downloading pytorch_model.bin: 100%|██████████| 17.7M/17.7M [00:00<00:00, 57.4MB/s]
Downloading (…)/2022.07/config.yaml:   0%|          | 0.00/318 [00:00<?, ?B/s]
Downloading (…)/2022.07/config.yaml: 100%|██████████| 318/318 [00:00<00:00, 1.22MB/s]
Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.0.4. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint --file ../root/.cache/torch/pyannote/models--pyannote--segmentation/snapshots/c4c8ceafcbb3a7a280c2d357aee9fbc9b0be7f9b/pytorch_model.bin`
Model was trained with pyannote.audio 0.0.1, yours is 2.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.0.1+cu118. Bad things might happen unless you revert torch to 1.x.
Downloading (…)ain/hyperparams.yaml:   0%|          | 0.00/1.92k [00:00<?, ?B/s]
Downloading (…)ain/hyperparams.yaml: 100%|██████████| 1.92k/1.92k [00:00<00:00, 5.91MB/s]
Downloading embedding_model.ckpt:   0%|          | 0.00/83.3M [00:00<?, ?B/s]
Downloading embedding_model.ckpt:  25%|██▌       | 21.0M/83.3M [00:00<00:00, 110MB/s]
Downloading embedding_model.ckpt:  50%|█████     | 41.9M/83.3M [00:00<00:00, 111MB/s]
Downloading embedding_model.ckpt:  76%|███████▌  | 62.9M/83.3M [00:00<00:00, 111MB/s]
Downloading embedding_model.ckpt: 100%|██████████| 83.3M/83.3M [00:00<00:00, 112MB/s]
Downloading embedding_model.ckpt: 100%|██████████| 83.3M/83.3M [00:00<00:00, 110MB/s]
Downloading (…)an_var_norm_emb.ckpt:   0%|          | 0.00/1.92k [00:00<?, ?B/s]
Downloading (…)an_var_norm_emb.ckpt: 100%|██████████| 1.92k/1.92k [00:00<00:00, 5.68MB/s]
Downloading classifier.ckpt:   0%|          | 0.00/5.53M [00:00<?, ?B/s]
Downloading classifier.ckpt: 100%|██████████| 5.53M/5.53M [00:00<00:00, 117MB/s]
Downloading (…)in/label_encoder.txt:   0%|          | 0.00/129k [00:00<?, ?B/s]
Downloading (…)in/label_encoder.txt: 100%|██████████| 129k/129k [00:00<00:00, 5.99MB/s]
0  1  ... intersection      union
0  [ 00:00:00.497 -->  00:00:02.826]  A  ...   -19.790437  22.799188
1  [ 00:00:03.906 -->  00:00:23.397]  B  ...     0.680000  19.490625
[2 rows x 7 columns]
[{'start': 0.828, 'end': 4.71, 'text': ' Hi, my name is Sabuki.', 'words': [{'word': 'Hi,', 'start': 0.828, 'end': 1.068, 'score': 0.824, 'speaker': 'SPEAKER_00'}, {'word': 'my', 'start': 1.829, 'end': 1.949, 'score': 0.964, 'speaker': 'SPEAKER_00'}, {'word': 'name', 'start': 1.989, 'end': 2.149, 'score': 0.814, 'speaker': 'SPEAKER_00'}, {'word': 'is', 'start': 2.189, 'end': 2.249, 'score': 0.798, 'speaker': 'SPEAKER_00'}, {'word': 'Sabuki.', 'start': 2.309, 'end': 2.769, 'score': 0.815, 'speaker': 'SPEAKER_00'}], 'speaker': 'SPEAKER_00'}, {'start': 4.71, 'end': 23.297, 'text': "I like to play games like Dungeons and Dragons game, which includes NPC names like Zerthimon, Vlaakith and Mordenkainen, as well as place names like Agni'hotri, Tu'narath and Niam'd'regal.", 'words': [{'word': 'I', 'start': 4.71, 'end': 4.79, 'score': 0.988, 'speaker': 'SPEAKER_00'}, {'word': 'like', 'start': 4.85, 'end': 5.09, 'score': 0.893, 'speaker': 'SPEAKER_00'}, {'word': 'to', 'start': 5.15, 'end': 5.27, 'score': 0.843, 'speaker': 'SPEAKER_00'}, {'word': 'play', 'start': 5.29, 'end': 5.59, 'score': 0.896, 'speaker': 'SPEAKER_00'}, {'word': 'games', 'start': 5.79, 'end': 6.15, 'score': 0.805, 'speaker': 'SPEAKER_00'}, {'word': 'like', 'start': 7.411, 'end': 7.631, 'score': 0.962, 'speaker': 'SPEAKER_00'}, {'word': 'Dungeons', 'start': 8.291, 'end': 8.851, 'score': 0.838, 'speaker': 'SPEAKER_00'}, {'word': 'and', 'start': 8.931, 'end': 9.011, 'score': 0.891, 'speaker': 'SPEAKER_00'}, {'word': 'Dragons', 'start': 9.051, 'end': 9.472, 'score': 0.91, 'speaker': 'SPEAKER_00'}, {'word': 'game,', 'start': 9.512, 'end': 9.792, 'score': 0.817, 'speaker': 'SPEAKER_00'}, {'word': 'which', 'start': 10.852, 'end': 11.012, 'score': 0.97, 'speaker': 'SPEAKER_00'}, {'word': 'includes', 'start': 11.072, 'end': 11.893, 'score': 0.909, 'speaker': 'SPEAKER_00'}, {'word': 'NPC', 'start': 12.153, 'end': 12.493, 'score': 0.739, 'speaker': 'SPEAKER_00'}, {'word': 'names', 'start': 12.513, 'end': 12.753, 
'score': 0.827, 'speaker': 'SPEAKER_00'}, {'word': 'like', 'start': 12.833, 'end': 13.073, 'score': 0.98, 'speaker': 'SPEAKER_00'}, {'word': 'Zerthimon,', 'start': 13.393, 'end': 14.033, 'score': 0.87, 'speaker': 'SPEAKER_00'}, {'word': 'Vlaakith', 'start': 14.534, 'end': 15.034, 'score': 0.698, 'speaker': 'SPEAKER_00'}, {'word': 'and', 'start': 15.454, 'end': 15.574, 'score': 0.978, 'speaker': 'SPEAKER_00'}, {'word': 'Mordenkainen,', 'start': 15.674, 'end': 16.454, 'score': 0.801, 'speaker': 'SPEAKER_00'}, {'word': 'as', 'start': 16.714, 'end': 16.774, 'score': 0.929, 'speaker': 'SPEAKER_00'}, {'word': 'well', 'start': 16.814, 'end': 16.955, 'score': 0.866, 'speaker': 'SPEAKER_00'}, {'word': 'as', 'start': 16.995, 'end': 17.055, 'score': 0.765, 'speaker': 'SPEAKER_00'}, {'word': 'place', 'start': 17.975, 'end': 18.255, 'score': 0.872, 'speaker': 'SPEAKER_00'}, {'word': 'names', 'start': 18.335, 'end': 18.575, 'score': 0.799, 'speaker': 'SPEAKER_00'}, {'word': 'like', 'start': 18.655, 'end': 18.915, 'score': 0.842, 'speaker': 'SPEAKER_00'}, {'word': "Agni'hotri,", 'start': 19.776, 'end': 20.616, 'score': 0.893, 'speaker': 'SPEAKER_00'}, {'word': "Tu'narath", 'start': 21.216, 'end': 21.816, 'score': 0.692, 'speaker': 'SPEAKER_00'}, {'word': 'and', 'start': 22.237, 'end': 22.397, 'score': 0.807, 'speaker': 'SPEAKER_00'}, {'word': "Niam'd'regal.", 'start': 22.617, 'end': 23.297, 'score': 0.73, 'speaker': 'SPEAKER_00'}], 'speaker': 'SPEAKER_00'}]

Version Details

Version ID: f030b3c37c8341d86a383c9b93bfd7684b2962ce26814191e664c88c4d931d35
Version Created: January 8, 2025

Run on Replicate →