diff --git a/src/whisper.cpp b/src/whisper.cpp
index 2f356da0f06..56139a10158 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -7841,10 +7841,26 @@ int whisper_full_parallel(
     for (int i = 0; i < n_processors - 1; ++i) {
         auto& results_i = states[i]->result_all;
 
+        // offset of this worker's chunk, in the same units as result.t0/t1;
+        // the cast forces 64-bit math so 100 * samples cannot overflow a narrower type
+        const int64_t chunk_offset = 100 * ((int64_t) (i + 1) * n_samples_per_processor) / WHISPER_SAMPLE_RATE + offset_t;
+
         for (auto& result : results_i) {
             // correct the segment timestamp taking into account the offset
-            result.t0 += 100 * ((i + 1) * n_samples_per_processor) / WHISPER_SAMPLE_RATE + offset_t;
-            result.t1 += 100 * ((i + 1) * n_samples_per_processor) / WHISPER_SAMPLE_RATE + offset_t;
+            result.t0 += chunk_offset;
+            result.t1 += chunk_offset;
+
+            // also correct per-token timestamps inside this segment.
+            // Without this, segments report correct absolute times in the
+            // final audio but token-level timestamps reset to 0 at each
+            // worker-chunk boundary when --processors > 1.
+            // NOTE(review): every token is shifted unconditionally - confirm that
+            // no "unset" sentinel value in t0/t1 needs to be preserved here.
+            // ref: https://github.com/ggml-org/whisper.cpp/issues/3726
+            for (auto& token : result.tokens) {
+                token.t0 += chunk_offset;
+                token.t1 += chunk_offset;
+            }
 
             // make sure that segments are not overlapping
             if (!ctx->state->result_all.empty()) {
@@ -8039,11 +8055,26 @@ whisper_token whisper_full_get_token_id(struct whisper_context * ctx, int i_segm
 }
 
 struct whisper_token_data whisper_full_get_token_data_from_state(struct whisper_state * state, int i_segment, int i_token) {
-    return state->result_all[i_segment].tokens[i_token];
+    whisper_token_data token = state->result_all[i_segment].tokens[i_token];
+
+    // Map VAD-processed token timestamps back to the original audio timeline.
+    // Without this, tokens report timestamps in the VAD-stripped timeline
+    // (starting at 0 after whisper_full_with_state sees only the speech-
+    // filtered samples), while segment timestamps are already mapped back by
+    // whisper_full_get_segment_t0/t1_from_state. The two diverge when VAD
+    // strips a non-speech prefix (e.g. music before speech).
+    // ref: https://github.com/ggml-org/whisper.cpp/issues/3754
+    if (state->has_vad_segments && !state->vad_mapping_table.empty()) {
+        token.t0 = map_processed_to_original_time(token.t0, state->vad_mapping_table);
+        token.t1 = map_processed_to_original_time(token.t1, state->vad_mapping_table);
+    }
+
+    return token;
 }
 
 struct whisper_token_data whisper_full_get_token_data(struct whisper_context * ctx, int i_segment, int i_token) {
-    return ctx->state->result_all[i_segment].tokens[i_token];
+    // delegate so both accessors apply the same VAD timestamp mapping
+    return whisper_full_get_token_data_from_state(ctx->state, i_segment, i_token);
 }
 
 float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token) {