Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 30 additions & 4 deletions src/whisper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7841,10 +7841,22 @@ int whisper_full_parallel(
for (int i = 0; i < n_processors - 1; ++i) {
auto& results_i = states[i]->result_all;

const int64_t chunk_offset = 100 * ((i + 1) * n_samples_per_processor) / WHISPER_SAMPLE_RATE + offset_t;

for (auto& result : results_i) {
// correct the segment timestamp taking into account the offset
result.t0 += 100 * ((i + 1) * n_samples_per_processor) / WHISPER_SAMPLE_RATE + offset_t;
result.t1 += 100 * ((i + 1) * n_samples_per_processor) / WHISPER_SAMPLE_RATE + offset_t;
result.t0 += chunk_offset;
result.t1 += chunk_offset;

// also correct per-token timestamps inside this segment.
// Without this, segments report correct absolute times in the
// final audio but token-level timestamps reset to 0 at each
// worker-chunk boundary when --processors > 1.
// ref: https://github.com/ggml-org/whisper.cpp/issues/3726
for (auto& token : result.tokens) {
token.t0 += chunk_offset;
token.t1 += chunk_offset;
}

// make sure that segments are not overlapping
if (!ctx->state->result_all.empty()) {
Expand Down Expand Up @@ -8039,11 +8051,25 @@ whisper_token whisper_full_get_token_id(struct whisper_context * ctx, int i_segm
}

// Return a copy of the data for token `i_token` of segment `i_segment` stored
// in `state`.
//
// Indices are not range-checked here; the caller must pass indices that are
// valid for state->result_all and for that segment's token list.
//
// The token is returned by value, so the timestamp adjustment below never
// modifies the data stored in `state`.
struct whisper_token_data whisper_full_get_token_data_from_state(struct whisper_state * state, int i_segment, int i_token) {
    // Work on a copy: returning the stored token directly would skip the
    // VAD timestamp mapping below.
    whisper_token_data token = state->result_all[i_segment].tokens[i_token];

    // Map VAD-processed token timestamps back to the original audio timeline.
    // Without this, tokens report timestamps in the VAD-stripped timeline
    // (starting at 0 after whisper_full_with_state sees only the speech-
    // filtered samples), while segment timestamps are already mapped back by
    // whisper_full_get_segment_t0/t1_from_state. The two diverge when VAD
    // strips a non-speech prefix (e.g. music before speech).
    // ref: https://github.com/ggml-org/whisper.cpp/issues/3754
    if (state->has_vad_segments && !state->vad_mapping_table.empty()) {
        token.t0 = map_processed_to_original_time(token.t0, state->vad_mapping_table);
        token.t1 = map_processed_to_original_time(token.t1, state->vad_mapping_table);
    }

    return token;
}

// Return a copy of the data for token `i_token` of segment `i_segment` from
// the context's own state (ctx->state).
//
// Delegates to whisper_full_get_token_data_from_state so that the context and
// state accessors share one implementation and cannot drift apart.
struct whisper_token_data whisper_full_get_token_data(struct whisper_context * ctx, int i_segment, int i_token) {
    return whisper_full_get_token_data_from_state(ctx->state, i_segment, i_token);
}

float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token) {
Expand Down