Skip to content

Commit 3a11f3c

Browse files
ruiren_microsoftCopilot
andcommitted
Add C++ SDK for live audio transcription streaming
Port C# LiveAudioTranscriptionSession (PR #485) to idiomatic C++17. New files: - sdk/cpp/include/foundry_local/ Public headers - sdk/cpp/src/ Implementation - sdk/cpp/tests/ Unit tests (57) and E2E tests (3) - sdk/cpp/CMakeLists.txt Build system with FetchContent deps API surface: - LiveAudioTranscriptionSession: start/append/try_get_next/stop - AudioClient: factory for transcription sessions - CoreInterop: dynamic FFI loading for native core - ThreadSafeQueue: bounded producer-consumer queue Design highlights: - Full API parity with C# implementation - Thread-safe append from any thread (audio callbacks) - Bounded backpressure on push queue - Settings frozen at start time - RAII cleanup (noexcept destructor) - Error propagation from push loop to output stream - ConversationItem-shaped response (content[0].text/transcript) All 60 tests passing (57 unit + 3 E2E with real Nemotron model). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent d1cf434 commit 3a11f3c

21 files changed

Lines changed: 2537 additions & 2 deletions

sdk/cpp/CMakeLists.txt

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
cmake_minimum_required(VERSION 3.14)
2+
project(foundry_local_cpp_sdk VERSION 1.0.0 LANGUAGES CXX)
3+
4+
set(CMAKE_CXX_STANDARD 17)
5+
set(CMAKE_CXX_STANDARD_REQUIRED ON)
6+
set(CMAKE_CXX_EXTENSIONS OFF)
7+
8+
# Options
9+
option(FOUNDRY_LOCAL_BUILD_TESTS "Build tests" ON)
10+
option(FOUNDRY_LOCAL_BUILD_E2E "Build E2E test (requires native core DLL)" OFF)
11+
12+
# --------------------------------------------------------------------------
13+
# Dependencies via FetchContent
14+
# --------------------------------------------------------------------------
15+
include(FetchContent)
16+
17+
FetchContent_Declare(
18+
nlohmann_json
19+
GIT_REPOSITORY https://github.com/nlohmann/json.git
20+
GIT_TAG v3.11.3
21+
)
22+
FetchContent_MakeAvailable(nlohmann_json)
23+
24+
# --------------------------------------------------------------------------
25+
# Main SDK library
26+
# --------------------------------------------------------------------------
27+
add_library(foundry_local_sdk
28+
src/core_interop_types.cpp
29+
src/core_interop.cpp
30+
src/live_audio_transcription_types.cpp
31+
src/live_audio_transcription_session.cpp
32+
src/audio_client.cpp
33+
)
34+
35+
target_include_directories(foundry_local_sdk
36+
PUBLIC
37+
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
38+
$<INSTALL_INTERFACE:include>
39+
)
40+
41+
target_link_libraries(foundry_local_sdk
42+
PUBLIC
43+
nlohmann_json::nlohmann_json
44+
)
45+
46+
# Platform-specific link dependencies
if(WIN32)
    # No extra libs needed — LoadLibraryW/GetProcAddress are in kernel32
else()
    # Ask CMake for the dynamic-loader and threading libraries instead of
    # hard-coding "dl pthread": CMAKE_DL_LIBS is empty on platforms where
    # dlopen/dlsym live in libc (linking a bare "dl" fails there), and
    # Threads::Threads resolves to the correct flag/library for the toolchain.
    set(THREADS_PREFER_PTHREAD_FLAG ON)
    find_package(Threads REQUIRED)
    target_link_libraries(foundry_local_sdk
        PRIVATE
            ${CMAKE_DL_LIBS}
            Threads::Threads
    )
endif()
52+
53+
# Compiler warnings
# Warnings are promoted to errors by default (same behaviour as before). The
# switch exists so that packagers and users of newer compilers, which may emit
# diagnostics this code has not been checked against, can still build with
# -DFOUNDRY_LOCAL_WARNINGS_AS_ERRORS=OFF.
option(FOUNDRY_LOCAL_WARNINGS_AS_ERRORS "Treat compiler warnings as errors" ON)

if(MSVC)
    target_compile_options(foundry_local_sdk PRIVATE /W4)
    if(FOUNDRY_LOCAL_WARNINGS_AS_ERRORS)
        target_compile_options(foundry_local_sdk PRIVATE /WX)
    endif()
else()
    target_compile_options(foundry_local_sdk PRIVATE -Wall -Wextra -Wpedantic)
    if(FOUNDRY_LOCAL_WARNINGS_AS_ERRORS)
        target_compile_options(foundry_local_sdk PRIVATE -Werror)
    endif()
endif()
59+
60+
# --------------------------------------------------------------------------
61+
# Tests
62+
# --------------------------------------------------------------------------
63+
if(FOUNDRY_LOCAL_BUILD_TESTS)
64+
enable_testing()
65+
66+
FetchContent_Declare(
67+
googletest
68+
GIT_REPOSITORY https://github.com/google/googletest.git
69+
GIT_TAG v1.14.0
70+
)
71+
# Prevent overriding parent project's compiler/linker settings on Windows
72+
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
73+
FetchContent_MakeAvailable(googletest)
74+
75+
# Unit tests
76+
add_executable(foundry_local_tests
77+
tests/test_live_audio_transcription_types.cpp
78+
tests/test_thread_safe_queue.cpp
79+
tests/test_live_audio_transcription_session.cpp
80+
)
81+
82+
target_link_libraries(foundry_local_tests
83+
PRIVATE
84+
foundry_local_sdk
85+
GTest::gtest_main
86+
)
87+
88+
include(GoogleTest)
89+
gtest_discover_tests(foundry_local_tests)
90+
91+
# E2E test (only built when native core is available)
92+
if(FOUNDRY_LOCAL_BUILD_E2E)
93+
add_executable(foundry_local_e2e_test
94+
tests/test_e2e_live_audio.cpp
95+
)
96+
97+
target_link_libraries(foundry_local_e2e_test
98+
PRIVATE
99+
foundry_local_sdk
100+
GTest::gtest_main
101+
)
102+
103+
gtest_discover_tests(foundry_local_e2e_test)
104+
endif()
105+
endif()

sdk/cpp/README.md

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# Foundry Local C++ SDK — Live Audio Transcription
2+
3+
C++ SDK for real-time audio streaming transcription using Foundry Local's native core.
4+
5+
## API Overview
6+
7+
```cpp
8+
#include "foundry_local/audio_client.h"
9+
#include "foundry_local/core_interop.h"
10+
11+
// 1. Load native core
12+
auto core = std::make_shared<foundry_local::detail::CoreInterop>("path/to/Microsoft.AI.Foundry.Local.Core.dll");
13+
core->initialize({{"AppName", "my-app"}});
14+
15+
// 2. Create audio client and session
16+
auto client = foundry_local::AudioClient("nemotron", core);
17+
auto session = client.create_live_transcription_session();
18+
19+
// 3. Configure audio format (before start)
20+
session->settings().sample_rate = 16000;
21+
session->settings().channels = 1;
22+
session->settings().bits_per_sample = 16;
23+
session->settings().language = "en";
24+
25+
// 4. Start session
26+
session->start();
27+
28+
// 5. Push PCM audio chunks (thread-safe, from any thread)
29+
session->append(pcm_data, pcm_length); // blocking if queue full
30+
session->try_append(pcm_data, pcm_length); // non-blocking
31+
session->try_append_for(pcm_data, pcm_length, 100ms); // timed; the `100ms` literal needs `using namespace std::chrono_literals;`
32+
33+
// 6. Read transcription results
34+
foundry_local::LiveAudioTranscriptionResponse result;
35+
auto status = session->try_get_next(result, std::chrono::seconds(5));
36+
if (status == foundry_local::TranscriptionStatus::result) {
37+
std::cout << result.text << std::endl;
38+
// Also available: result.content[0].text, result.content[0].transcript
39+
}
40+
41+
// 7. Stop session (drains remaining audio, delivers final results)
42+
session->stop();
43+
// Or let RAII handle it — destructor calls stop()
44+
```
45+
46+
## Building
47+
48+
```bash
49+
mkdir build && cd build
50+
cmake .. -DFOUNDRY_LOCAL_BUILD_TESTS=ON
51+
cmake --build . --config Release
52+
ctest -C Release
53+
```
54+
55+
### E2E Tests (requires native core DLL + model)
56+
57+
```bash
58+
cmake .. -DFOUNDRY_LOCAL_BUILD_TESTS=ON -DFOUNDRY_LOCAL_BUILD_E2E=ON
59+
cmake --build . --config Release
60+
FOUNDRY_CORE_LIB_PATH=/path/to/libs ./Release/foundry_local_e2e_test
61+
```
62+
63+
## Requirements
64+
65+
- C++17 compiler (MSVC 2019+, GCC 9+, Clang 10+)
66+
- CMake 3.14+
67+
- Dependencies fetched automatically:
68+
- [nlohmann/json](https://github.com/nlohmann/json) v3.11.3
69+
- [GoogleTest](https://github.com/google/googletest) v1.14.0
70+
71+
## Design Highlights
72+
73+
- **Thread-safe push**: `append()` can be called from any thread (e.g., audio device callbacks)
74+
- **Bounded backpressure**: Internal push queue prevents unbounded memory growth
75+
- **Settings freeze**: Audio format settings are snapshot-copied at `start()` and immutable during the session
76+
- **RAII cleanup**: Destructor calls `stop()` in a best-effort, noexcept manner
77+
- **Error propagation**: Push-loop errors are surfaced through the output stream
78+
- **Full C# API parity**: Matches the C# LiveAudioTranscriptionSession from PR #485

sdk/cpp/codex.md

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
# Codex Review Report — C++ Live Audio Transcription SDK
2+
3+
## Summary
4+
5+
C++ port of the C# LiveAudioTranscriptionSession (PR #485) with full API parity,
6+
60 passing tests (57 unit + 3 E2E), and verified real-time streaming through the
7+
native core with the Nemotron ASR model.
8+
9+
## Issues Found & Resolved
10+
11+
### 1. Response Type Parity (Fixed)
12+
- **Issue**: Initial implementation used flat `text` field instead of C#'s
13+
`ConversationItem`-shaped `Content[0].Text` / `Content[0].Transcript`.
14+
- **Fix**: Added `ContentPart` struct with `text`/`transcript` aliasing.
15+
Both flat `result.text` and structured `result.content[0].text` are available.
16+
17+
### 2. Error State Preservation (Fixed)
18+
- **Issue**: `stop()` calling `output_queue_->close()` could overwrite a
19+
push-loop error, silently converting a fatal ASR failure into a normal close.
20+
- **Fix**: `ThreadSafeQueue::close()` now preserves existing error state.
21+
Test `PushErrorThenStop_PreservesError` validates this.
22+
23+
### 3. Null Pointer Safety (Fixed)
24+
- **Issue**: `append(nullptr, n)` caused undefined behavior via
25+
`std::vector<uint8_t>(nullptr, nullptr + n)`.
26+
- **Fix**: Added null guard — throws on `nullptr` with `length > 0`,
27+
no-op for `length == 0`. Tests added.
28+
29+
### 4. Queue Capacity Validation (Fixed)
30+
- **Issue**: `push_queue_capacity = 0` silently became unbounded;
31+
negative values silently became huge via `static_cast<size_t>`.
32+
- **Fix**: `start()` validates `push_queue_capacity > 0` and throws.
33+
Tests added for 0 and -1.
34+
35+
## Performance Considerations
36+
37+
- **Push loop latency**: Single `std::thread` per session, no thread pool.
38+
Matches C#'s `Task.Run` approach. Adequate for real-time audio (typically
39+
10-100ms chunk intervals).
40+
- **Memory copies**: Each `append()` copies the PCM buffer. Required for
41+
safety (callers often reuse buffers in audio callbacks). Zero-copy would
42+
require `std::span` with caller-guaranteed lifetime — not safe for the
43+
general case.
44+
- **Queue contention**: `std::mutex` + `std::condition_variable` per queue.
45+
Lock-free queues could reduce latency but add complexity without
46+
measurable benefit at typical audio rates.
47+
48+
## Concurrency Analysis
49+
50+
- **Thread safety**: Verified via `ConcurrentAppendAndStop` test —
51+
simultaneous push and stop from different threads completes without
52+
deadlock or data corruption.
53+
- **State machine**: `not_started → started → stopping → stopped/failed`
54+
transitions protected by `std::mutex`. No unguarded state access.
55+
- **Destructor**: `noexcept`, best-effort `stop()`, swallows exceptions.
56+
Safe during stack unwinding.
57+
58+
## Memory Safety
59+
60+
- **RAII**: `ScopedResponse` wraps native `ResponseBuffer` and calls
61+
`free_response` on destruction.
62+
- **Smart pointers**: `std::unique_ptr` for queues, `std::shared_ptr`
63+
for `CoreInterop` (shared across sessions).
64+
- **Buffer lifetime**: Audio data copied in `append()`, original buffer
65+
safe to reuse immediately.
66+
- **No raw `new`/`delete`**: All allocations via RAII wrappers.
67+
68+
## Test Coverage Summary
69+
70+
| Category | Tests | Status |
71+
|----------|-------|--------|
72+
| JSON deserialization | 8 | ✅ |
73+
| Options defaults | 1 | ✅ |
74+
| CoreErrorResponse | 3 | ✅ |
75+
| ThreadSafeQueue | 15 | ✅ |
76+
| Session state guards | 7 | ✅ |
77+
| Session lifecycle (mock) | 12 | ✅ |
78+
| CoreInteropRequest | 2 | ✅ |
79+
| TranscriptionStatus | 1 | ✅ |
80+
| Audit fixes (null, capacity, error) | 5 | ✅ |
81+
| Concurrency | 2 | ✅ |
82+
| Destructor | 1 | ✅ |
83+
| E2E (real core + model) | 3 | ✅ |
84+
| **Total** | **60** | **✅ All passing** |
85+
86+
## Remaining Risks
87+
88+
1. **Platform coverage**: Only tested on Windows (MSVC). Linux/macOS paths
89+
exist in code but are untested. The `dlopen`/`dlsym` paths and library
90+
extension logic should be validated on CI.
91+
92+
2. **free_response availability**: The code falls back gracefully if
93+
`free_response` is not exported by the native core, but relies on the
94+
native core allocating response buffers with `Marshal.AllocHGlobal`
95+
(Windows: `LocalFree`-compatible). If the native core changes its
96+
allocation strategy, memory could leak.
97+
98+
3. **Model loading**: The E2E test loads the Nemotron model directly via
99+
`execute_command("load_model")`. A production SDK should integrate with
100+
the full `FoundryLocalManager` lifecycle (catalog, download, cache, load).
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
// Copyright (c) Microsoft. All rights reserved.
2+
// Licensed under the MIT License.
3+
4+
#pragma once
5+
6+
#include "core_interop.h"
7+
#include "live_audio_transcription_session.h"
8+
#include "live_audio_transcription_types.h"
9+
10+
#include <memory>
11+
#include <string>
12+
13+
namespace foundry_local {
14+
15+
/// Audio client that provides audio transcription capabilities.
16+
/// Mirrors the C# OpenAIAudioClient API surface.
17+
class AudioClient {
18+
public:
19+
/// @param model_id The model identifier for audio operations.
20+
/// @param core_interop Shared pointer to the core interop layer.
21+
AudioClient(const std::string& model_id,
22+
std::shared_ptr<detail::CoreInterop> core_interop);
23+
24+
/// Create a real-time streaming transcription session.
25+
/// Audio data is pushed in as PCM chunks and transcription results are
26+
/// returned incrementally via the session's pull API.
27+
/// @return A unique pointer to the transcription session.
28+
std::unique_ptr<LiveAudioTranscriptionSession> create_live_transcription_session();
29+
30+
private:
31+
std::string model_id_;
32+
std::shared_ptr<detail::CoreInterop> core_interop_;
33+
};
34+
35+
} // namespace foundry_local

0 commit comments

Comments
 (0)