Skip to content

Commit 29f7ccb

Browse files
committed
Decoupling work continues, now using streaming-first flow
1 parent 92faa32 commit 29f7ccb

25 files changed

Lines changed: 1023 additions & 237 deletions

CMakeLists.txt

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,12 @@ set(MUTTERKEY_CORE_SOURCES
4545
src/transcription/transcriptiontypes.h
4646
src/transcription/transcriptionengine.cpp
4747
src/transcription/transcriptionengine.h
48+
src/transcription/audiochunker.cpp
49+
src/transcription/audiochunker.h
50+
src/transcription/transcriptassembler.cpp
51+
src/transcription/transcriptassembler.h
52+
src/transcription/transcriptioncompat.cpp
53+
src/transcription/transcriptioncompat.h
4854
src/transcription/transcriptionworker.cpp
4955
src/transcription/transcriptionworker.h
5056
src/transcription/whispercpptranscriber.cpp
@@ -87,7 +93,7 @@ target_include_directories(mutterkey-tray PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/sr
8793
target_link_libraries(mutterkey_core PUBLIC Qt6::Core Qt6::Gui Qt6::Multimedia KF6::GlobalAccel KF6::GuiAddons)
8894
target_link_libraries(mutterkey_control PUBLIC Qt6::Core Qt6::Network mutterkey_core)
8995
target_link_libraries(mutterkey_app PUBLIC Qt6::Core Qt6::Gui mutterkey_control)
90-
target_link_libraries(mutterkey PRIVATE mutterkey_app whisper)
96+
target_link_libraries(mutterkey PRIVATE mutterkey_app)
9197
target_link_libraries(mutterkey-tray PRIVATE Qt6::Core Qt6::Gui Qt6::Widgets mutterkey_control)
9298
set_target_properties(mutterkey PROPERTIES
9399
BUILD_RPATH "$ORIGIN/../lib"
@@ -206,7 +212,7 @@ add_subdirectory(third_party/whisper.cpp EXCLUDE_FROM_ALL)
206212
# upstream public headers as part of its own package layout.
207213
set_target_properties(whisper ggml PROPERTIES PUBLIC_HEADER "")
208214

209-
target_link_libraries(mutterkey_core PUBLIC whisper)
215+
target_link_libraries(mutterkey_core PRIVATE whisper)
210216

211217
install(TARGETS mutterkey RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
212218
install(TARGETS mutterkey-tray RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})

README.md

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,16 @@ Current behavior:
1818
- copies the resulting text to the clipboard
1919
- expects you to paste the text yourself with `Ctrl+V`
2020

21+
Current runtime shape:
22+
23+
- `TranscriptionEngine` is the immutable runtime/provider boundary
24+
- `TranscriptionSession` is the mutable per-session decode boundary
25+
- internal audio flow is streaming-first through normalized chunks and transcript events
26+
- `BackendCapabilities` reports static backend support, while `RuntimeDiagnostics`
27+
reports runtime/device/model inspection data
28+
- the current daemon and `once` user flows still collapse the streaming path back
29+
into a final clipboard-friendly transcript
30+
2131
Current direction:
2232

2333
- KDE-first
@@ -362,9 +372,12 @@ Repository layout:
362372
- `src/audio/audiorecorder.*`: microphone capture
363373
- `src/audio/recording.h`: shared recorded-audio payload passed between subsystems
364374
- `src/audio/recordingnormalizer.*`: conversion to Whisper-ready mono `float32` at `16 kHz`
365-
- `src/transcription/whispercpptranscriber.*`: embedded Whisper integration
375+
- `src/transcription/audiochunker.*`: fixed-size normalized streaming chunk generation
376+
- `src/transcription/transcriptassembler.*`: final transcript assembly from streaming events
377+
- `src/transcription/transcriptioncompat.*`: compatibility wrapper from one-shot recordings to the streaming runtime path
378+
- `src/transcription/whispercpptranscriber.*`: embedded Whisper integration behind the app-owned runtime seam
366379
- `src/transcription/transcriptionworker.*`: worker object on a dedicated `QThread`
367-
- `src/transcription/transcriptiontypes.h`: normalized-audio and transcription result value types
380+
- `src/transcription/transcriptiontypes.h`: runtime diagnostics, normalized-audio, chunk, event, and error value types
368381
- `src/clipboardwriter.*`: clipboard writes with KDE-first fallback behavior
369382
- `src/config.*`: JSON config loading and defaults
370383
- `src/app/*`: shared CLI/runtime command helpers used by the main entrypoint
@@ -462,7 +475,7 @@ Notes:
462475
libraries without inheriting upstream header-install warnings
463476
- the `valgrind` target runs the repo-owned Memcheck lane used for release readiness
464477
- tests are small headless `Qt Test` cases
465-
- `config` and `recordingnormalizer` currently have the main unit-test coverage because they contain the most deterministic logic without KDE session or device dependencies
478+
- streaming runtime helpers and worker orchestration now also have deterministic headless coverage through fake backends
466479
- GitHub Actions CI runs the hygiene job on Ubuntu 24.04 and the configure/build/test job in a Debian Trixie container because the needed KF6 dev packages are not available on the stock Ubuntu 24.04 runner image
467480
- successful `main` branch CI runs publish `build/docs/doxygen/html` to GitHub Pages with the official Pages actions
468481
- GitHub Actions release checks run a separate Valgrind Memcheck lane on manual dispatch and `v*` tags so normal PR CI stays faster

docs/mainpage.md

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,20 +21,27 @@ Current runtime shape:
2121

2222
- `TranscriptionEngine` is the immutable runtime/provider boundary
2323
- `TranscriptionSession` is the mutable per-session decode boundary
24-
- `BackendCapabilities` reports engine-owned runtime metadata used for
25-
diagnostics and orchestration
24+
- internal audio flow is streaming-first through normalized chunks and
25+
transcript events
26+
- `BackendCapabilities` reports static backend support used for orchestration
27+
- `RuntimeDiagnostics` reports runtime/device/model inspection data separately
28+
from static capabilities
2629
- `RuntimeError` and `RuntimeErrorCode` provide typed runtime failures
2730
- `TranscriptionWorker` hosts transcription on a dedicated `QThread` and
2831
creates live sessions lazily on that worker thread
32+
- the shipped daemon and `once` flows still use a compatibility wrapper that
33+
assembles a final transcript from the streaming runtime path
2934
- config parsing under `src/config.*` stays product-shaped and permissive, while
3035
backend-specific support checks live in the runtime layer
3136

3237
Core API surface covered here:
3338

3439
- `HotkeyManager` registers the global push-to-talk shortcut through KDE.
3540
- `AudioRecorder` captures microphone audio while the shortcut is held.
36-
- `RecordingNormalizer` converts captured audio to Whisper-ready mono `float32`
41+
- `RecordingNormalizer` converts captured audio to runtime-ready mono `float32`
3742
samples at `16 kHz`.
43+
- `AudioChunker` splits normalized audio into deterministic stream chunks.
44+
- `TranscriptAssembler` builds final transcript text from streaming events.
3845
- `TranscriptionEngine` and `TranscriptionSession` define the app-owned runtime
3946
seam.
4047
- `WhisperCppTranscriber` performs in-process transcription through vendored

src/app/applicationcommands.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include "clipboardwriter.h"
77
#include "control/daemoncontrolserver.h"
88
#include "service.h"
9+
#include "transcription/transcriptioncompat.h"
910
#include "transcription/transcriptionengine.h"
1011
#include "transcription/transcriptiontypes.h"
1112

@@ -82,7 +83,9 @@ int runOnce(QGuiApplication &app, const AppConfig &config, double seconds)
8283
}
8384
}
8485

85-
QTimer::singleShot(0, &app, [&app, &recorder, transcriber = transcriber.get(), &clipboardWriter, seconds]() {
86+
QTimer::singleShot(0,
87+
&app,
88+
[&app, &recorder, transcriber = transcriber.get(), &clipboardWriter, seconds, normalizer = RecordingNormalizer()]() {
8689
QString errorMessage;
8790
if (!recorder.start(&errorMessage)) {
8891
qCCritical(appLog) << "Failed to start one-shot recording:" << errorMessage;
@@ -91,15 +94,15 @@ int runOnce(QGuiApplication &app, const AppConfig &config, double seconds)
9194
}
9295

9396
qCInfo(appLog) << "Recording for" << seconds << "seconds";
94-
QTimer::singleShot(static_cast<int>(seconds * 1000), &app, [&app, &recorder, transcriber, &clipboardWriter]() {
97+
QTimer::singleShot(static_cast<int>(seconds * 1000), &app, [&app, &recorder, transcriber, &clipboardWriter, normalizer]() {
9598
const Recording recording = recorder.stop();
9699
if (!recording.isValid()) {
97100
qCCritical(appLog) << "Recorder returned no audio";
98101
QGuiApplication::exit(1);
99102
return;
100103
}
101104

102-
const TranscriptionResult result = transcriber->transcribe(recording);
105+
const TranscriptionResult result = transcribeRecordingViaStreaming(*transcriber, recording, normalizer);
103106
if (!result.success) {
104107
qCCritical(appLog) << "One-shot transcription failed:" << result.error.message;
105108
QGuiApplication::exit(1);

src/audio/recordingnormalizer.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ std::vector<float> resampleLinear(const std::vector<float> &samples, int inputSa
9595

9696
} // namespace
9797

98-
bool RecordingNormalizer::normalizeForWhisper(const Recording &recording,
98+
bool RecordingNormalizer::normalizeForRuntime(const Recording &recording,
9999
NormalizedAudio *normalizedAudio,
100100
QString *errorMessage) const
101101
{

src/audio/recordingnormalizer.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,11 @@ class RecordingNormalizer final
1717
{
1818
public:
1919
/**
20-
* @brief Converts a captured recording into Whisper input audio.
20+
* @brief Converts a captured recording into runtime input audio.
2121
* @param recording Source recording and its original device format.
2222
* @param normalizedAudio Output location for normalized samples.
2323
* @param errorMessage Optional output for conversion failures.
2424
* @return `true` when normalization succeeded.
2525
*/
26-
bool normalizeForWhisper(const Recording &recording, NormalizedAudio *normalizedAudio, QString *errorMessage = nullptr) const;
26+
bool normalizeForRuntime(const Recording &recording, NormalizedAudio *normalizedAudio, QString *errorMessage = nullptr) const;
2727
};

src/service.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -80,11 +80,12 @@ QJsonObject MutterkeyService::diagnostics() const
8080
object.insert(QStringLiteral("transcriptions_completed"), m_transcriptionsCompleted);
8181
object.insert(QStringLiteral("transcriber_backend"),
8282
m_transcriptionWorker != nullptr ? m_transcriptionWorker->backendName() : QStringLiteral("unconfigured"));
83-
object.insert(QStringLiteral("transcriber_model"),
84-
m_transcriptionWorker != nullptr ? m_transcriptionWorker->loadedModelDescription() : QString());
83+
const RuntimeDiagnostics runtimeDiagnostics =
84+
m_transcriptionWorker != nullptr ? m_transcriptionWorker->runtimeDiagnostics() : m_transcriptionEngine->diagnostics();
85+
object.insert(QStringLiteral("transcriber_model"), runtimeDiagnostics.loadedModelDescription);
86+
object.insert(QStringLiteral("transcriber_runtime"), runtimeDiagnostics.runtimeDescription);
8587
const BackendCapabilities capabilities =
8688
m_transcriptionWorker != nullptr ? m_transcriptionWorker->capabilities() : m_transcriptionEngine->capabilities();
87-
object.insert(QStringLiteral("transcriber_runtime"), capabilities.runtimeDescription);
8889
object.insert(QStringLiteral("transcriber_supports_translation"), capabilities.supportsTranslation);
8990
object.insert(QStringLiteral("transcriber_supports_auto_language"), capabilities.supportsAutoLanguage);
9091
return object;
@@ -168,7 +169,7 @@ void MutterkeyService::transcribeInBackground(Recording recording)
168169
// another owner of the PCM payload alive on the service thread.
169170
QMetaObject::invokeMethod(m_transcriptionWorker,
170171
[worker = m_transcriptionWorker, recording = std::move(recording)]() mutable {
171-
worker->transcribe(recording);
172+
worker->transcribeRecordingCompat(recording);
172173
},
173174
Qt::QueuedConnection);
174175
}

src/transcription/audiochunker.cpp

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
#include "transcription/audiochunker.h"
2+
3+
#include <algorithm>
4+
#include <cstddef>
5+
6+
namespace {
7+
8+
constexpr int kChunkDurationMs = 200;
9+
10+
} // namespace
11+
12+
bool AudioChunker::chunkAudio(const NormalizedAudio &audio, std::vector<AudioChunk> *chunks, QString *errorMessage) const
13+
{
14+
if (chunks == nullptr) {
15+
if (errorMessage != nullptr) {
16+
*errorMessage = QStringLiteral("Internal error: missing audio chunk output");
17+
}
18+
return false;
19+
}
20+
21+
chunks->clear();
22+
23+
if (!audio.isValid()) {
24+
if (errorMessage != nullptr) {
25+
*errorMessage = QStringLiteral("Normalized audio is empty");
26+
}
27+
return false;
28+
}
29+
30+
if (audio.sampleRate <= 0 || audio.channels != 1) {
31+
if (errorMessage != nullptr) {
32+
*errorMessage = QStringLiteral("Normalized audio format is invalid");
33+
}
34+
return false;
35+
}
36+
37+
const int chunkFrames = std::max(1, (audio.sampleRate * kChunkDurationMs) / 1000);
38+
chunks->reserve((static_cast<int>(audio.samples.size()) + chunkFrames - 1) / chunkFrames);
39+
40+
std::int64_t streamOffsetFrames = 0;
41+
for (std::size_t startIndex = 0; startIndex < audio.samples.size(); startIndex += static_cast<std::size_t>(chunkFrames)) {
42+
const std::size_t endIndex =
43+
std::min(startIndex + static_cast<std::size_t>(chunkFrames), audio.samples.size());
44+
45+
AudioChunk chunk;
46+
chunk.sampleRate = audio.sampleRate;
47+
chunk.channels = audio.channels;
48+
chunk.streamOffsetFrames = streamOffsetFrames;
49+
chunk.samples.assign(audio.samples.begin() + static_cast<std::ptrdiff_t>(startIndex),
50+
audio.samples.begin() + static_cast<std::ptrdiff_t>(endIndex));
51+
chunks->push_back(std::move(chunk));
52+
streamOffsetFrames += static_cast<std::int64_t>(endIndex - startIndex);
53+
}
54+
55+
return !chunks->empty();
56+
}

src/transcription/audiochunker.h

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#pragma once
2+
3+
#include "transcription/transcriptiontypes.h"
4+
5+
#include <QString>
6+
#include <vector>
7+
8+
/**
9+
* @file
10+
* @brief Helpers for splitting normalized audio into deterministic stream chunks.
11+
*/
12+
13+
/**
14+
* @brief Splits normalized utterance audio into fixed-size streaming chunks.
15+
*/
16+
class AudioChunker final
17+
{
18+
public:
19+
/**
20+
* @brief Converts normalized audio into ordered stream chunks.
21+
* @param audio Normalized utterance audio.
22+
* @param chunks Output destination for generated chunks.
23+
* @param errorMessage Optional output for validation failures.
24+
* @return `true` when chunking succeeded.
25+
*/
26+
bool chunkAudio(const NormalizedAudio &audio, std::vector<AudioChunk> *chunks, QString *errorMessage = nullptr) const;
27+
};
src/transcription/transcriptassembler.cpp

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#include "transcription/transcriptassembler.h"
2+
3+
void TranscriptAssembler::reset()
4+
{
5+
m_finalTranscript.clear();
6+
m_latestPartial.clear();
7+
}
8+
9+
void TranscriptAssembler::applyUpdate(const TranscriptUpdate &update)
10+
{
11+
for (const TranscriptEvent &event : update.events) {
12+
const QString trimmedText = event.text.trimmed();
13+
if (trimmedText.isEmpty()) {
14+
continue;
15+
}
16+
17+
if (event.kind == TranscriptEventKind::Final) {
18+
if (!m_finalTranscript.isEmpty()) {
19+
m_finalTranscript += QLatin1Char(' ');
20+
}
21+
m_finalTranscript += trimmedText;
22+
m_latestPartial.clear();
23+
continue;
24+
}
25+
26+
m_latestPartial = trimmedText;
27+
}
28+
}
29+
30+
// Returns the text assembled from Final events so far, with leading and
// trailing whitespace removed.
QString TranscriptAssembler::finalTranscript() const
{
    QString assembled = m_finalTranscript.trimmed();
    return assembled;
}

0 commit comments

Comments
 (0)