villesau/whisper-timestamped ✓❓🔢📝🖼️ → ❓

▶️ 5.9K runs 📅 Oct 2024 ⚙️ Cog 0.9.24 🔗 GitHub ⚖️ License
speech-to-text word-level-timestamps

About

Transcribes audio using Whisper Large V3 with precise word-level timestamps and confidence scores.

Example Output

Output

{"text":" That is exactly what happened.","language":"en","segments":[{"id":0,"end":2.19,"seek":0,"text":" That is exactly what happened.","start":0.57,"words":[{"end":0.79,"text":"That","start":0.57,"confidence":0.83},{"end":1.01,"text":"is","start":0.79,"confidence":0.984},{"end":1.53,"text":"exactly","start":1.01,"confidence":0.994},{"end":1.79,"text":"what","start":1.53,"confidence":0.998},{"end":2.19,"text":"happened.","start":1.79,"confidence":0.995}],"tokens":[50365,663,307,2293,437,2011,13,50474],"confidence":0.958,"avg_logprob":-0.34747041596306694,"temperature":0,"no_speech_prob":0.04611295834183693,"compression_ratio":0.8571428571428571}],"language_probs":{"af":0.000002371588834648719,"am":6.9843961725268855e-9,"ar":0.00005482712367665954,"as":2.9062825035452988e-8,"az":2.945290873412887e-7,"ba":9.690693802966166e-10,"be":6.728683388246282e-7,"bg":0.0000029399832328635966,"bn":0.0000016882902400539024,"bo":2.985835010349547e-7,"br":0.000011138805348309688,"bs":0.000002210563707194524,"ca":0.000003383899866093998,"cs":0.000012233585039211903,"cy":0.0006274632178246975,"da":0.000011764923328883015,"de":0.00038960075471550226,"el":0.000039800219383323565,"en":0.9933839440345764,"es":0.0010508032282814384,"et":4.752749305225734e-7,"eu":5.459707494992472e-7,"fa":0.000005535690434044227,"fi":0.00005612731183646247,"fo":7.26124596894806e-7,"fr":0.0002457181108184159,"gl":0.0000023995444280444644,"gu":5.464226049411991e-8,"ha":2.30658558919572e-9,"he":0.00001144758516602451,"hi":0.000022501006242237054,"hr":0.000006943310836504679,"ht":0.0000010081241725856671,"hu":0.00001402587349730311,"hy":3.7086812199049746e-7,"id":0.00002703558129724115,"is":0.000002271842504342203,"it":0.00009622459765523672,"ja":0.00022026030637789518,"jw":0.000040744060243014246,"ka":3.7445172296202145e-8,"kk":2.761446182830696e-7,"km":0.000018221870050183497,"kn":1.0984273757230767e-7,"ko":0.00015618793258909136,"la":0.00029871726292185485,"lb":2.6137041420071228e-9,"ln":1.92563085477104
32e-8,"lo":1.5950840293044166e-7,"lt":0.0000011158916777276318,"lv":0.000001728326878946973,"mg":1.0437287700781894e-9,"mi":0.000044748594518750906,"mk":6.507966787694386e-8,"ml":0.0000017418822153558722,"mn":3.9248061511898413e-7,"mr":2.4850325530678674e-7,"ms":0.000025596773411962204,"mt":8.413714880362022e-8,"my":3.1413753731612815e-7,"ne":3.6440593476072536e-7,"nl":0.00017698411829769611,"nn":0.0006081582396291196,"no":0.000014756519703951199,"oc":3.350514248268155e-7,"pa":5.089006549496844e-7,"pl":0.00009927909559337422,"ps":2.4132847897817555e-7,"pt":0.0005494251381605864,"ro":0.00001627029814699199,"ru":0.0010841591283679008,"sa":0.000003081075647060061,"sd":2.893970645345689e-7,"si":0.000011996966350125149,"sk":0.0000026046805032819975,"sl":0.0000049043501348933205,"sn":0.00001487225654273061,"so":1.1598242366517297e-9,"sq":9.222627284088958e-8,"sr":4.499812291669514e-7,"su":4.686813159793246e-9,"sv":0.00007493978773709387,"sw":0.0000033444757718825713,"ta":0.00000844091937324265,"te":0.0000026768848329083994,"tg":2.1332433597365252e-9,"th":0.000010546009434619918,"tk":7.971348026103442e-10,"tl":0.00009399555710842833,"tr":0.000055257140047615394,"tt":3.349514221540062e-9,"uk":0.000009677562957222108,"ur":0.000013073552509013098,"uz":2.3152679773374985e-10,"vi":0.000021979773009661585,"yi":0.0000010943083452730207,"yo":0.0000023623429115104955,"zh":0.00008693167910678312,"haw":0.0001264848542632535,"yue":5.26081407770107e-7},"speech_activity":[{"end":2.8735,"start":0.046}]}

Performance Metrics

2.70s Prediction Time
103.27s Total Time
All Input Parameters
{
  "vad": true,
  "task": "transcribe",
  "verbose": false,
  "language": "auto",
  "audio_file": "https://github.com/CheyneyComputerScience/CREMA-D/raw/refs/heads/master/AudioMP3/1012_TIE_NEU_XX.mp3?download=",
  "temperature": 0,
  "suppress_tokens": "-1",
  "logprob_threshold": -1,
  "detect_disfluencies": false,
  "no_speech_threshold": 0.6,
  "compute_word_confidence": true,
  "condition_on_previous_text": true,
  "compression_ratio_threshold": 2.4
}
Input Parameters
vad Type: boolean, Default: false
Use Voice Activity Detection
task Default: transcribe
Task to perform
best_of Type: integer
Number of candidates when sampling with non-zero temperature
verbose Type: boolean, Default: false
Whether to display the text being decoded
language Type: string, Default: auto
Language code (e.g., 'en') or 'auto' for auto-detect
patience Type: number
Optional patience value to use in beam decoding
beam_size Type: integer
Number of beams in beam search, only applicable when temperature is zero
audio_file (required) Type: string
Audio file to transcribe
temperature Type: number, Default: 0
Temperature for sampling
initial_prompt Type: string
Optional text to provide as a prompt for the first window
length_penalty Type: number
Optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144
suppress_tokens Type: string, Default: -1
Comma-separated list of token ids to suppress during sampling
logprob_threshold Type: number, Default: -1
Threshold for average log probability
detect_disfluencies Type: boolean, Default: false
Detect speech disfluencies
no_speech_threshold Type: number, Default: 0.6
Threshold for no speech probability
compute_word_confidence Type: boolean, Default: true
Compute word confidence scores
condition_on_previous_text Type: boolean, Default: true
Whether to condition on previous text
compression_ratio_threshold Type: number, Default: 2.4
Threshold for compression ratio
Output Schema

Output

Type: object

Example Execution Logs
Detected language: English
  0%|          | 0/282 [00:00<?, ?frames/s]
100%|██████████| 282/282 [00:00<00:00, 649.02frames/s]
100%|██████████| 282/282 [00:00<00:00, 648.72frames/s]
Version Details
Version ID
c5b122b7e513b1b5a6ef849891c538869b77cc932cbd0f8203e11d3b357553b8
Version Created
October 4, 2024
Run on Replicate →