Skip to content

Commit 984892e

Browse files
rui-renruiren_microsoftCopilotCopilot
authored
Add Nemotron-ASR streaming inference to C++ SDK (#655)
## Add Nemotron-ASR streaming inference to C++ SDK ### Description Adds real-time audio streaming support to the Foundry Local C++ SDK, enabling live microphone-to-text transcription via ONNX Runtime GenAI's StreamingProcessor API (Nemotron ASR). This is the C++ port of C# PR #485 with full feature parity. The existing AudioClient only supports file-based transcription. This PR introduces LiveAudioTranscriptionSession that accepts continuous PCM audio chunks (e.g., from a microphone) and returns partial/final transcription results as a synchronous generator. --------- Co-authored-by: ruiren_microsoft <ruiren@microsoft.com> Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent c641118 commit 984892e

14 files changed

Lines changed: 1082 additions & 1 deletion

sdk/cpp/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ add_library(CppSdk STATIC
5454
src/catalog.cpp
5555
src/openai_chat_client.cpp
5656
src/openai_audio_client.cpp
57+
src/openai_live_audio_types.cpp
58+
src/openai_live_audio_client.cpp
5759
src/foundry_local_manager.cpp
5860
)
5961

@@ -91,6 +93,7 @@ if (BUILD_TESTING)
9193
test/model_variant_test.cpp
9294
test/catalog_test.cpp
9395
test/client_test.cpp
96+
test/live_audio_test.cpp
9497
)
9598

9699
target_include_directories(CppSdkTests

sdk/cpp/include/foundry_local.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,5 @@
1616
#include "openai/openai_tool_types.h"
1717
#include "openai/openai_chat_client.h"
1818
#include "openai/openai_audio_client.h"
19+
#include "openai/openai_live_audio_types.h"
20+
#include "openai/openai_live_audio_client.h"

sdk/cpp/include/openai/openai_audio_client.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include <string_view>
88
#include <functional>
99
#include <filesystem>
10+
#include <memory>
1011

1112
#include <gsl/pointers>
1213

@@ -22,6 +23,8 @@ namespace foundry_local {
2223
std::string text;
2324
};
2425

26+
class LiveAudioTranscriptionSession;
27+
2528
class OpenAIAudioClient final {
2629
public:
2730
explicit OpenAIAudioClient(const IModel& model);
@@ -34,6 +37,9 @@ namespace foundry_local {
3437
using StreamCallback = std::function<void(const AudioCreateTranscriptionResponse& chunk)>;
3538
void TranscribeAudioStreaming(const std::filesystem::path& audioFilePath, const StreamCallback& onChunk) const;
3639

40+
/// Create a new live audio transcription session for streaming PCM audio.
41+
std::unique_ptr<LiveAudioTranscriptionSession> CreateLiveTranscriptionSession() const;
42+
3743
private:
3844
OpenAIAudioClient(gsl::not_null<foundry_local::Internal::IFoundryLocalCore*> core, std::string_view modelId,
3945
gsl::not_null<ILogger*> logger);
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
// Copyright (c) Microsoft Corporation. All rights reserved.
2+
// Licensed under the MIT License.
3+
4+
#pragma once
5+
6+
#include <string>
7+
#include <memory>
8+
#include <thread>
9+
#include <mutex>
10+
#include <atomic>
11+
#include <chrono>
12+
#include <cstdint>
13+
#include <vector>
14+
15+
#include <gsl/pointers>
16+
17+
#include "openai_live_audio_types.h"
18+
19+
namespace foundry_local::Internal {
20+
struct IFoundryLocalCore;
21+
template <typename T> class ThreadSafeQueue;
22+
} // namespace foundry_local::Internal
23+
24+
namespace foundry_local {
25+
class ILogger;
26+
27+
/// Session object for live (streaming) audio transcription.
///
/// Instances are produced by OpenAIAudioClient::CreateLiveTranscriptionSession().
/// The public surface is: configure via Settings(), call Start(), feed PCM data
/// with Append(), poll results with TryGetNext(), and finish with Stop().
/// The type is neither copyable nor movable.
class LiveAudioTranscriptionSession final {
28+
public:
29+
/// Construct a session bound to a core instance, a model id and a logger.
/// `core` and `logger` are non-owning, never-null pointers (gsl::not_null).
LiveAudioTranscriptionSession(gsl::not_null<Internal::IFoundryLocalCore*> core,
30+
std::string modelId,
31+
gsl::not_null<ILogger*> logger);
32+
~LiveAudioTranscriptionSession() noexcept;
33+
34+
// Non-copyable, non-movable
35+
LiveAudioTranscriptionSession(const LiveAudioTranscriptionSession&) = delete;
36+
LiveAudioTranscriptionSession& operator=(const LiveAudioTranscriptionSession&) = delete;
37+
LiveAudioTranscriptionSession(LiveAudioTranscriptionSession&&) = delete;
38+
LiveAudioTranscriptionSession& operator=(LiveAudioTranscriptionSession&&) = delete;
39+
40+
/// Mutable settings reference; only effective before Start().
41+
LiveAudioTranscriptionOptions& Settings() { return settings_; }
42+
/// Read-only settings reference.
43+
const LiveAudioTranscriptionOptions& Settings() const { return settings_; }
44+
/// Settings that were active when Start() was called.
/// Returns a reference to the separate activeSettings_ member, not to settings_.
45+
const LiveAudioTranscriptionOptions& ActiveSettings() const { return activeSettings_; }
46+
47+
/// Begin the streaming session. Must be called before Append().
48+
void Start();
49+
50+
/// Enqueue PCM audio data. Blocks if the push queue is full.
/// @param pcmData Pointer to the first element of the audio buffer.
/// @param length  Number of uint8_t elements readable at pcmData.
// NOTE(review): whether the buffer is copied before Append() returns is not visible
// in this header — confirm against the .cpp before reusing the caller's buffer.
51+
void Append(const uint8_t* pcmData, size_t length);
52+
53+
/// Try to get the next transcription result within the given timeout.
/// @param result  Out-parameter for the retrieved response.
/// @param timeout Maximum wait; defaults to 5 seconds.
/// @return A TranscriptionStatus value describing the outcome of the attempt.
54+
TranscriptionStatus TryGetNext(LiveAudioTranscriptionResponse& result,
55+
std::chrono::milliseconds timeout = std::chrono::seconds(5));
56+
57+
/// Signal the end of audio input and stop the session.
58+
void Stop();
59+
60+
/// Returns the error message if the session is in an error state.
61+
std::string GetErrorMessage() const;
62+
63+
/// Returns true if the session has been started.
64+
bool IsStarted() const;
65+
66+
/// Returns true if the session has been stopped.
67+
bool IsStopped() const;
68+
69+
private:
70+
/// Lifecycle states of the session; state_ is initialised to Created.
enum class SessionState {
71+
Created,
72+
Starting,
73+
Started,
74+
Stopped
75+
};
76+
77+
// Presumably the entry point executed on pushThread_ — confirm in the .cpp.
void PushWorkerLoop();
78+
// Takes the caller's unique_lock by reference; presumably expects mutex_ to be
// held on entry — confirm in the .cpp.
void StopInternal(std::unique_lock<std::mutex>& lock);
79+
80+
gsl::not_null<Internal::IFoundryLocalCore*> core_;
81+
std::string modelId_;
82+
gsl::not_null<ILogger*> logger_;
83+
84+
// settings_ is what Settings() exposes; activeSettings_ is what ActiveSettings() exposes.
LiveAudioTranscriptionOptions settings_;
85+
LiveAudioTranscriptionOptions activeSettings_;
86+
87+
// Declared mutable so it can be locked from const member functions.
// NOTE(review): which members it guards is not visible in this header.
mutable std::mutex mutex_;
88+
SessionState state_ = SessionState::Created;
89+
std::string sessionHandle_;
90+
91+
// One queued audio chunk is a byte vector.
using AudioChunk = std::vector<uint8_t>;
92+
// Queues are held through unique_ptr because ThreadSafeQueue is only forward-declared here.
std::unique_ptr<Internal::ThreadSafeQueue<AudioChunk>> pushQueue_;
93+
std::unique_ptr<Internal::ThreadSafeQueue<LiveAudioTranscriptionResponse>> resultQueue_;
94+
95+
std::thread pushThread_;
96+
std::string errorMessage_;
97+
// NOTE(review): finalResult_ / hasFinalResult_ presumably cache a final response
// produced at stop time — confirm against the .cpp.
LiveAudioTranscriptionResponse finalResult_;
98+
bool hasFinalResult_ = false;
99+
};
100+
101+
} // namespace foundry_local
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
// Copyright (c) Microsoft Corporation. All rights reserved.
2+
// Licensed under the MIT License.
3+
4+
#pragma once
5+
6+
#include <string>
7+
#include <vector>
8+
#include <optional>
9+
10+
namespace foundry_local {
11+
12+
/// One element of LiveAudioTranscriptionResponse::content.
// NOTE(review): the distinction between `text` and `transcript` is not visible in
// this header — confirm against LiveAudioTranscriptionResponse::FromJson.
struct ContentPart {
13+
std::string text;
14+
std::string transcript;
15+
};
16+
17+
/// A single transcription result delivered by a live transcription session
/// (the out-parameter type of LiveAudioTranscriptionSession::TryGetNext).
struct LiveAudioTranscriptionResponse {
18+
std::string text;
19+
// Defaults to false; presumably distinguishes partial from final results — confirm.
bool is_final = false;
20+
// Optional timing fields; units are not stated here (presumably seconds — TODO confirm).
std::optional<double> start_time;
21+
std::optional<double> end_time;
22+
std::vector<ContentPart> content;
23+
24+
/// Build a response from a JSON string.
// NOTE(review): behaviour on malformed JSON is not visible in this header.
static LiveAudioTranscriptionResponse FromJson(const std::string& json);
25+
};
26+
27+
/// Configuration for a live transcription session; exposed through
/// LiveAudioTranscriptionSession::Settings() and ActiveSettings().
struct LiveAudioTranscriptionOptions {
28+
// Default 16000 (presumably Hz — confirm).
int sample_rate = 16000;
29+
// Default 1 channel.
int channels = 1;
30+
// Default 16 bits per sample.
int bits_per_sample = 16;
31+
// Unset by default.
std::optional<std::string> language;
32+
// Default 100; presumably the maximum number of queued audio chunks — confirm.
int push_queue_capacity = 100;
33+
};
34+
35+
/// Structured form of an error string: a code, a message and a transient flag.
struct CoreErrorResponse {
36+
std::string code;
37+
std::string message;
38+
// Defaults to false.
bool is_transient = false;
39+
40+
/// Attempt to parse `error_string` into a CoreErrorResponse.
/// Returns an optional; presumably empty when the string is not in the
/// expected structured format — confirm against the implementation.
static std::optional<CoreErrorResponse> TryParse(const std::string& error_string);
41+
};
42+
43+
/// Outcome reported by LiveAudioTranscriptionSession::TryGetNext.
// NOTE(review): per-enumerator meanings below are inferred from the names and the
// TryGetNext signature — confirm against the implementation.
enum class TranscriptionStatus {
44+
Result,   // presumably: a response was written to the out-parameter
45+
Timeout,  // presumably: no response arrived within the timeout
46+
Closed,   // presumably: the session has ended and no more results will come
47+
Error     // presumably: the session failed; see GetErrorMessage()
48+
};
49+
50+
} // namespace foundry_local

sdk/cpp/src/core.h

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
// Copyright (c) Microsoft Corporation. All rights reserved.
22
// Licensed under the MIT License.
33
//
4-
// Core DLL interop loads Microsoft.AI.Foundry.Local.Core.dll at runtime.
4+
// Core DLL interop loads Microsoft.AI.Foundry.Local.Core.dll at runtime.
55
// Internal header, not part of the public API.
66

77
#pragma once
@@ -46,6 +46,7 @@ namespace foundry_local {
4646
module_.reset();
4747
execCmd_ = nullptr;
4848
execCbCmd_ = nullptr;
49+
execBinaryCmd_ = nullptr;
4950
freeResCmd_ = nullptr;
5051
}
5152

@@ -91,10 +92,55 @@ namespace foundry_local {
9192
return result;
9293
}
9394

95+
/// Invoke the core's `execute_command_with_binary` export with a command name,
/// an optional string argument and an optional binary payload.
///
/// @param command          Command name; passed as a null pointer when empty.
/// @param logger           Logger handed to Exception on failure.
/// @param dataArgument     Optional string argument; ignored when null or empty.
/// @param binaryData       Optional binary payload; ignored when null or when
///                         binaryDataLength is 0.
/// @param binaryDataLength Payload size; must not exceed INT32_MAX.
/// @return CoreResponse with `error` filled when the native response carries an
///         error, otherwise with `data` filled when the response carries data.
/// @throws Exception when the core is not loaded or the binary payload is too large.
// NOTE(review): command.size() and dataArgument->size() are narrowed to int32_t
// without a range check, unlike binaryDataLength.
CoreResponse callWithBinary(std::string_view command, ILogger& logger,
96+
const std::string* dataArgument,
97+
const uint8_t* binaryData, size_t binaryDataLength) const override {
98+
// All three of module, binary entry point and free function must be available.
if (!module_ || !execBinaryCmd_ || !freeResCmd_) {
99+
throw Exception("Core is not loaded. Cannot call command: " + std::string(command), logger);
100+
}
101+
102+
// Value-initialised so every pointer/length not set below stays null/zero.
StreamingRequestBuffer request{};
103+
request.Command = command.empty() ? nullptr : command.data();
104+
request.CommandLength = static_cast<int32_t>(command.size());
105+
106+
if (dataArgument && !dataArgument->empty()) {
107+
request.Data = dataArgument->data();
108+
request.DataLength = static_cast<int32_t>(dataArgument->size());
109+
}
110+
111+
if (binaryData && binaryDataLength > 0) {
112+
// The native struct stores the length as int32_t, so reject anything larger.
if (binaryDataLength > static_cast<size_t>(INT32_MAX)) {
113+
throw Exception("Binary data length exceeds maximum supported size (INT32_MAX).", logger);
114+
}
115+
request.BinaryData = binaryData;
116+
request.BinaryDataLength = static_cast<int32_t>(binaryDataLength);
117+
}
118+
119+
ResponseBuffer response{};
120+
// The guard hands the stack-allocated response to freeResCmd_ on every exit path;
// the function pointer is captured by value.
auto safeDeleter = [fn = freeResCmd_](ResponseBuffer* buf) {
121+
if (fn)
122+
fn(buf);
123+
};
124+
std::unique_ptr<ResponseBuffer, decltype(safeDeleter)> responseGuard(&response, safeDeleter);
125+
126+
execBinaryCmd_(&request, &response);
127+
128+
// Copy out of the native buffers before the guard frees them; an error takes
// precedence over data.
CoreResponse result;
129+
if (response.Error && response.ErrorLength > 0) {
130+
result.error.assign(static_cast<const char*>(response.Error), response.ErrorLength);
131+
return result;
132+
}
133+
if (response.Data && response.DataLength > 0) {
134+
result.data.assign(static_cast<const char*>(response.Data), response.DataLength);
135+
}
136+
return result;
137+
}
138+
94139
private:
95140
wil::unique_hmodule module_;
96141
execute_command_fn execCmd_{};
97142
execute_command_with_callback_fn execCbCmd_{};
143+
execute_command_with_binary_fn execBinaryCmd_{};
98144
free_response_fn freeResCmd_{};
99145

100146
void LoadFromPath(const std::filesystem::path& path) {
@@ -105,6 +151,8 @@ namespace foundry_local {
105151
execCmd_ = reinterpret_cast<execute_command_fn>(RequireProc(m.get(), "execute_command"));
106152
execCbCmd_ = reinterpret_cast<execute_command_with_callback_fn>(
107153
RequireProc(m.get(), "execute_command_with_callback"));
154+
execBinaryCmd_ = reinterpret_cast<execute_command_with_binary_fn>(
155+
RequireProc(m.get(), "execute_command_with_binary"));
108156
freeResCmd_ = reinterpret_cast<free_response_fn>(RequireProc(m.get(), "free_response"));
109157

110158
module_ = std::move(m);

sdk/cpp/src/flcore_native.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,14 +26,25 @@ extern "C"
2626
// Callback signature: void(*)(void* data, int length, void* userData)
2727
using UserCallbackFn = void(__cdecl*)(void*, int32_t, void*);
2828

29+
// Request structure taken by execute_command_with_binary_fn: three
// (pointer, int32_t length) pairs for the command, the data argument and the
// binary payload. Asserted to be standard-layout further down in this header.
// NOTE(review): field order and widths presumably have to match the native
// core's definition — confirm against the core's header.
struct StreamingRequestBuffer {
30+
const void* Command;
31+
int32_t CommandLength;
32+
const void* Data;
33+
int32_t DataLength;
34+
const void* BinaryData;
35+
int32_t BinaryDataLength;
36+
};
37+
2938
// Exported function pointer types
3039
using execute_command_fn = void(__cdecl*)(RequestBuffer*, ResponseBuffer*);
3140
using execute_command_with_callback_fn = void(__cdecl*)(RequestBuffer*, ResponseBuffer*, void* /*callback*/,
3241
void* /*userData*/);
42+
using execute_command_with_binary_fn = void(__cdecl*)(StreamingRequestBuffer*, ResponseBuffer*);
3343
using free_response_fn = void(__cdecl*)(ResponseBuffer*);
3444

3545
static_assert(std::is_standard_layout<RequestBuffer>::value, "RequestBuffer must be standard layout");
3646
static_assert(std::is_standard_layout<ResponseBuffer>::value, "ResponseBuffer must be standard layout");
47+
static_assert(std::is_standard_layout<StreamingRequestBuffer>::value, "StreamingRequestBuffer must be standard layout");
3748

3849
#pragma pack(pop)
3950
}

sdk/cpp/src/foundry_local_internal_core.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,11 @@ namespace foundry_local {
3131
virtual CoreResponse call(std::string_view command, ILogger& logger,
3232
const std::string* dataArgument = nullptr, NativeCallbackFn callback = nullptr,
3333
void* data = nullptr) const = 0;
34+
35+
/// Execute `command` with an optional string argument and an optional binary payload.
/// Unlike call(), none of the parameters have default values and there is no
/// callback parameter.
/// @param dataArgument     Optional string argument (pointer may be null).
/// @param binaryData       Pointer to the binary payload (may be null).
/// @param binaryDataLength Size of the payload at binaryData.
virtual CoreResponse callWithBinary(std::string_view command, ILogger& logger,
36+
const std::string* dataArgument,
37+
const uint8_t* binaryData, size_t binaryDataLength) const = 0;
38+
3439
virtual void unload() = 0;
3540
};
3641

sdk/cpp/src/openai_audio_client.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
#include "core_helpers.h"
1717
#include "logger.h"
1818

19+
#include "openai/openai_live_audio_client.h"
20+
1921
namespace foundry_local {
2022

2123
OpenAIAudioClient::OpenAIAudioClient(gsl::not_null<Internal::IFoundryLocalCore*> core, std::string_view modelId,
@@ -67,4 +69,8 @@ namespace foundry_local {
6769
}
6870
}
6971

72+
/// Create a new LiveAudioTranscriptionSession from this client's core_, modelId_
/// and logger_. Each call constructs a fresh session; ownership passes to the
/// caller through the returned unique_ptr.
std::unique_ptr<LiveAudioTranscriptionSession> OpenAIAudioClient::CreateLiveTranscriptionSession() const {
73+
return std::make_unique<LiveAudioTranscriptionSession>(core_, modelId_, logger_);
74+
}
75+
7076
} // namespace foundry_local

0 commit comments

Comments
 (0)