From 8bddd9f02a56db382ac852a02a560443c989312d Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Sat, 30 May 2026 10:00:49 +1000 Subject: [PATCH 1/2] Initial proposed set of types for review/refinement --- .../generative/audio/SPEECH_TYPES.md | 129 ++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 sdk_v2/cpp/src/inferencing/generative/audio/SPEECH_TYPES.md diff --git a/sdk_v2/cpp/src/inferencing/generative/audio/SPEECH_TYPES.md b/sdk_v2/cpp/src/inferencing/generative/audio/SPEECH_TYPES.md new file mode 100644 index 000000000..c9f44befd --- /dev/null +++ b/sdk_v2/cpp/src/inferencing/generative/audio/SPEECH_TYPES.md @@ -0,0 +1,129 @@ +# Speech Types — Design + +Native SDK types returned by `AudioSession` for speech-to-text (file and live), +translation, and ASR scenarios. References: OpenAI `verbose_json` / Realtime +transcription events, Azure Speech SDK recognition results. + +## Design rules + +- **One set of types covers transcription, translation, and ASR.** Task + selection (transcribe vs translate, target language) is a Request parameter, + not a type variant. `text` is the recognized-or-translated string either way. +- **One shared segment type** for both streaming events and final-result entries, + discriminated by `kind`. +- **No event wrapper, no `event_id`, no segment `id`.** Ordering is a property of + the callback channel; segment identity is implicit in stream order (zero-or-more + `kPartial` for the current segment, then one `kFinal` closes it). A web service + above the SDK can add envelope/sequence metadata. +- **`text` on `kPartial` is the cumulative current hypothesis for the segment**, + not a delta-since-last-event (Azure-style). A delta is recoverable by diffing + against the previous hypothesis. +- **`utterance_start` is a boolean on the segment.** Knowable at emission time + (VAD says "speech started" → producer tags the first `kPartial` of the new + segment). There is no `utterance_end` field: end-of-utterance can't be known + when the `kFinal` is emitted without delaying it by the silence threshold. + Instead, end is implicit — the next `utterance_start` marks it (consumer + infers end at the previous `kFinal.end_time`), a future `kSilence` event + marks it explicitly, or the final `SpeechResult` marks it for file + transcription. +- **Time as `int64_t` milliseconds.** Must survive the C ABI. Typedef'd so the + unit is legible and changeable in one place. +- **Two C ABI item types** — one for streaming segments, one for the final + aggregate. Both additive to existing items. + +## Types + +```cpp +namespace fl { + +using DurationMs = std::int64_t; // milliseconds; C ABI-safe + +enum class SpeechSegmentKind : int { + kNone = 0, // entry in a final aggregate result + kPartial = 1, // streaming: hypothesis for the current segment; may change + kFinal = 2, // streaming: segment is stable, or an entry in the final result +}; + +struct SpeechWord { + std::string text; + std::optional start_time; + std::optional end_time; + std::optional confidence; // 0..1 + std::optional speaker_id; +}; + +struct SpeechSegment { + SpeechSegmentKind kind = SpeechSegmentKind::kNone; + + std::string text; // for kPartial: cumulative current hypothesis + std::optional start_time; + std::optional end_time; + + // Utterance start signal — tagged on the first kPartial of a new utterance. + // Knowable at emission time. End-of-utterance is implicit (see design rules). + bool utterance_start = false; + + std::vector words; // word-timestamp opt-in + + // Future / opt-in. Included here for visibility in review. + // We should only add fields that we expect to use as the C API types need to be ABI stable, + // so we can't remove anything added. + std::optional confidence; // 0..1 aggregate + std::optional language; // per-segment, for code-switching + std::optional speaker_id; + std::optional channel; + // we could maybe use something more generic if we want to report these things instead of having per-value fields + // e.g. shared float[] of fixed size and an enum saying which value is in which slot. + std::optional avg_logprob; // Whisper-family diagnostic + std::optional no_speech_prob; // Whisper-family diagnostic + std::optional compression_ratio; // Whisper-family diagnostic +}; + +struct SpeechResult { + std::string text; // concatenated final transcript + std::optional language; // detected source language + std::optional duration; // total audio duration + std::vector segments; // entries are kFinal or kNone +}; + +} // namespace fl +``` + +## C ABI item types + +```c +FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT = 31, // pushed via streaming callback +FOUNDRY_LOCAL_ITEM_SPEECH_RESULT = 32, // final aggregate in response.items +``` + +`TextItem` remains the trivial fallback for `response_format: "text"`. + +## V1 scope + +Populated in the initial implementation: + +- `SpeechSegmentKind`: `kNone`, `kPartial`, `kFinal` +- `SpeechSegment`: `kind`, `text`, `start_time`, `end_time`, + `utterance_start` (defaulted; populated when computable) +- `SpeechResult`: `text`, `language`, `duration`, `segments` + +Defined in the header but unpopulated until a producer exists: + +- `SpeechWord` and `SpeechSegment::words` (word-timestamp opt-in) +- `confidence` (segment and word) +- `avg_logprob`, `no_speech_prob`, `compression_ratio` (Whisper diagnostics) +- `language` / `speaker_id` / `channel` on segment +- `speaker_id` on word + +## Growth headroom (not built) + +- **Diarization**: `speaker_id` already present on word and segment. +- **Multi-channel audio**: `channel` already present on segment. +- **N-best alternatives**: future `std::vector alternatives` + on `SpeechSegment`. +- **OpenAI `verbose_json` compatibility**: handled by a + `ToOpenAIVerboseJson(const SpeechResult&)` adapter in + `contracts/audio_transcriptions.*`, not by changing native types. + +Multi-target translation in a single pass is intentionally out of scope — +that's a server-side concern, not a local-inferencing one. From c0dd6c245c8f65d2d555e2596de0fc54a73fed21 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Tue, 2 Jun 2026 17:57:20 +1000 Subject: [PATCH 2/2] Add speech result types and wire up for initial feedback --- sdk_v2/cpp/CMakeLists.txt | 1 + .../generative/audio => docs}/SPEECH_TYPES.md | 30 ++-- sdk_v2/cpp/examples/realtime_audio/main.cc | 22 ++- .../include/foundry_local/foundry_local_c.h | 93 +++++++++- .../include/foundry_local/foundry_local_cpp.h | 54 ++++++ .../foundry_local/foundry_local_cpp.inline.h | 56 ++++++ sdk_v2/cpp/src/c_api.cc | 41 +++++ .../generative/audio/audio_session.cc | 149 +++++++++++++-- .../generative/audio/audio_session.h | 24 ++- sdk_v2/cpp/src/items/speech_result_item.h | 70 ++++++++ sdk_v2/cpp/src/items/speech_segment_item.cc | 34 ++++ sdk_v2/cpp/src/items/speech_segment_item.h | 73 ++++++++ sdk_v2/cpp/test/internal_api/c_api_test.cc | 28 +++ sdk_v2/cpp/test/internal_api/item_test.cc | 169 ++++++++++++++++++ .../test/sdk_api/audio_transcriptions_test.cc | 73 ++++++++ sdk_v2/cpp/test/sdk_api/model_fixture.h | 3 + .../cpp/test/sdk_api/streaming_audio_test.cc | 13 +- 17 files changed, 878 insertions(+), 55 deletions(-) rename sdk_v2/cpp/{src/inferencing/generative/audio => docs}/SPEECH_TYPES.md (81%) create mode 100644 sdk_v2/cpp/src/items/speech_result_item.h create mode 100644 sdk_v2/cpp/src/items/speech_segment_item.cc create mode 100644 sdk_v2/cpp/src/items/speech_segment_item.h diff --git a/sdk_v2/cpp/CMakeLists.txt b/sdk_v2/cpp/CMakeLists.txt index c203deec6..3b4c82629 100644 --- a/sdk_v2/cpp/CMakeLists.txt +++ b/sdk_v2/cpp/CMakeLists.txt @@ -130,6 +130,7 @@ set(FOUNDRY_LOCAL_SOURCES src/items/item.cc src/items/image_item.cc src/items/message_item.cc + src/items/speech_segment_item.cc src/catalog/base_model_catalog.cc src/catalog/azure_model_catalog.cc src/catalog/azure_catalog_models.cc diff --git a/sdk_v2/cpp/src/inferencing/generative/audio/SPEECH_TYPES.md b/sdk_v2/cpp/docs/SPEECH_TYPES.md similarity index 81% rename from sdk_v2/cpp/src/inferencing/generative/audio/SPEECH_TYPES.md rename to sdk_v2/cpp/docs/SPEECH_TYPES.md index c9f44befd..7d01eaba6 100644 --- a/sdk_v2/cpp/src/inferencing/generative/audio/SPEECH_TYPES.md +++ b/sdk_v2/cpp/docs/SPEECH_TYPES.md @@ -6,6 +6,10 @@ transcription events, Azure Speech SDK recognition results. ## Design rules +- **Output-only types.** These types are produced by `AudioSession` and flow out + through the streaming callback and the final `Response`. Callers never + construct them as inputs. The C ABI therefore exposes only Get accessors — + no Set functions, no `Item_Create` for these types. - **One set of types covers transcription, translation, and ASR.** Task selection (transcribe vs translate, target language) is a Request parameter, not a type variant. `text` is the recognized-or-translated string either way. @@ -65,18 +69,7 @@ struct SpeechSegment { std::vector words; // word-timestamp opt-in - // Future / opt-in. Included here for visibility in review. - // We should only add fields that we expect to use as the C API types need to be ABI stable, - // so we can't remove anything added. - std::optional confidence; // 0..1 aggregate std::optional language; // per-segment, for code-switching - std::optional speaker_id; - std::optional channel; - // we could maybe use something more generic if we want to report these things instead of having per-value fields - // e.g. shared float[] of fixed size and an enum saying which value is in which slot. - std::optional avg_logprob; // Whisper-family diagnostic - std::optional no_speech_prob; // Whisper-family diagnostic - std::optional compression_ratio; // Whisper-family diagnostic }; struct SpeechResult { @@ -96,6 +89,10 @@ FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT = 31, // pushed via streaming callback FOUNDRY_LOCAL_ITEM_SPEECH_RESULT = 32, // final aggregate in response.items ``` +These types are output-only — the ABI exposes `GetSpeechSegment` / +`GetSpeechResult` accessors, but no setters and no `Item_Create` support. +Attempting to create one returns `FOUNDRY_LOCAL_ERROR_INVALID_USAGE`. + `TextItem` remains the trivial fallback for `response_format: "text"`. ## V1 scope @@ -110,17 +107,18 @@ Populated in the initial implementation: Defined in the header but unpopulated until a producer exists: - `SpeechWord` and `SpeechSegment::words` (word-timestamp opt-in) -- `confidence` (segment and word) -- `avg_logprob`, `no_speech_prob`, `compression_ratio` (Whisper diagnostics) -- `language` / `speaker_id` / `channel` on segment +- `confidence` on word +- `language` on segment - `speaker_id` on word ## Growth headroom (not built) -- **Diarization**: `speaker_id` already present on word and segment. -- **Multi-channel audio**: `channel` already present on segment. +- **Diarization**: `speaker_id` already present on word. - **N-best alternatives**: future `std::vector alternatives` on `SpeechSegment`. +- **Per-segment diagnostics** (Whisper `avg_logprob`, `no_speech_prob`, + `compression_ratio`; multi-channel `channel`; etc.): pushed as a separate + diagnostic item type rather than overloading `SpeechSegment`. - **OpenAI `verbose_json` compatibility**: handled by a `ToOpenAIVerboseJson(const SpeechResult&)` adapter in `contracts/audio_transcriptions.*`, not by changing native types. diff --git a/sdk_v2/cpp/examples/realtime_audio/main.cc b/sdk_v2/cpp/examples/realtime_audio/main.cc index 1a1453bfa..94a52a09b 100644 --- a/sdk_v2/cpp/examples/realtime_audio/main.cc +++ b/sdk_v2/cpp/examples/realtime_audio/main.cc @@ -46,7 +46,12 @@ void RealtimeAudioChat(IModel& model, const std::string& audio_path) { flItem* raw_item = nullptr; if (item_api->ItemQueue_TryPop(event.item_queue, &raw_item)) { Item item(*raw_item); - if (item.GetType() == FOUNDRY_LOCAL_ITEM_TEXT) { + if (item.GetType() == FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT) { + auto seg = item.GetSpeechSegment(); + std::cout.write(seg.text.data(), seg.text.size()); + std::cout.flush(); + } else if (item.GetType() == FOUNDRY_LOCAL_ITEM_TEXT) { + // `response_format=text` session option produces a simple TextItem stream. std::cout << item.GetText().text << std::flush; } else { std::cerr << "Unexpected item type" << std::endl; @@ -133,9 +138,18 @@ void RealtimeAudioChat(IModel& model, const std::string& audio_path) { << ", completion: " << usage.completion_tokens << ", total: " << usage.total_tokens << "\n"; - // 8. The full response items are also available. - for (const auto& item : response.GetItems()) { - if (item.GetType() == FOUNDRY_LOCAL_ITEM_TEXT) { + // 8. The full response is a single item — SpeechResultItem by default, or TextItem + // if the session was configured with `response_format=text`. + const auto& items = response.GetItems(); + if (!items.empty()) { + const auto& item = items.front(); + if (item.GetType() == FOUNDRY_LOCAL_ITEM_SPEECH_RESULT) { + auto result = item.GetSpeechResult(); + std::cout << "Full response: "; + std::cout.write(result.text.data(), result.text.size()); + std::cout << "\n"; + std::cout << "Segments: " << result.segments.size() << "\n"; + } else if (item.GetType() == FOUNDRY_LOCAL_ITEM_TEXT) { std::cout << "Full response: " << item.GetText().text << "\n"; } } diff --git a/sdk_v2/cpp/include/foundry_local/foundry_local_c.h b/sdk_v2/cpp/include/foundry_local/foundry_local_c.h index a32c686fc..15cc20f17 100644 --- a/sdk_v2/cpp/include/foundry_local/foundry_local_c.h +++ b/sdk_v2/cpp/include/foundry_local/foundry_local_c.h @@ -304,12 +304,15 @@ typedef enum flItemType { FOUNDRY_LOCAL_ITEM_BYTES = 1, // Raw bytes with an item type tag. FOUNDRY_LOCAL_ITEM_TENSOR = 10, FOUNDRY_LOCAL_ITEM_TEXT = 20, - FOUNDRY_LOCAL_ITEM_MESSAGE = 21, // role + content string. - FOUNDRY_LOCAL_ITEM_IMAGE = 25, // Image input/output. Could be bytes or URI (file, memory address, url, etc.) - FOUNDRY_LOCAL_ITEM_AUDIO = 30, // Audio input/output. Could be bytes or URI. - FOUNDRY_LOCAL_ITEM_TOOL_CALL = 100, // request to call tool: call id, tool name, arguments - FOUNDRY_LOCAL_ITEM_TOOL_RESULT = 101, // response from tool: call id, result - FOUNDRY_LOCAL_ITEM_QUEUE = 200, // An item containing an flItemQueue of sub-items. Turtles all the way down. + FOUNDRY_LOCAL_ITEM_MESSAGE = 21, // role + content string. + FOUNDRY_LOCAL_ITEM_IMAGE = 25, // Image input/output. Could be bytes or URI (file, memory address, url, etc.) + FOUNDRY_LOCAL_ITEM_AUDIO = 30, // Audio input/output. Could be bytes or URI. + FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT = 31, // Output-only. Recognized/translated speech segment. + // Pushed via streaming callback during AudioSession. + FOUNDRY_LOCAL_ITEM_SPEECH_RESULT = 32, // Output-only. Final aggregate from AudioSession. + FOUNDRY_LOCAL_ITEM_TOOL_CALL = 100, // request to call tool: call id, tool name, arguments + FOUNDRY_LOCAL_ITEM_TOOL_RESULT = 101, // response from tool: call id, result + FOUNDRY_LOCAL_ITEM_QUEUE = 200, // An item containing an flItemQueue of sub-items. Turtles all the way down. } flItemType; typedef enum flTextItemType { @@ -492,6 +495,70 @@ typedef struct flToolResultData { /* V2 fields go here. */ } flToolResultData; +/* ----------------------------------------------------------------------- + * Speech recognition output types. + * + * SPEECH_SEGMENT and SPEECH_RESULT items are produced by AudioSession and + * delivered via the streaming callback / final Response. Callers never + * construct them — the ABI exposes only Get accessors. + * + * Streaming model: zero-or-more kPartial segments for the current utterance, + * then exactly one kFinal closes it. Segment identity is implicit in stream + * order; there is no segment id. + * + * kPartial text is the cumulative current hypothesis for the segment, not a + * delta-since-last-event. Consumers replace by stream position. + * ----------------------------------------------------------------------- */ + +/// Sentinel for absent flSpeechWord / flSpeechSegmentData / flSpeechResultData +/// time fields. Required because the C ABI cannot carry std::optional. +#define FOUNDRY_LOCAL_DURATION_UNSET INT64_MIN + +typedef enum flSpeechSegmentKind { + FOUNDRY_LOCAL_SPEECH_SEGMENT_NONE = 0, ///< Entry in a final aggregate result. + FOUNDRY_LOCAL_SPEECH_SEGMENT_PARTIAL = 1, ///< Streaming: hypothesis for the current segment; may change. + FOUNDRY_LOCAL_SPEECH_SEGMENT_FINAL = 2, ///< Streaming: segment is stable, or entry in the final result. +} flSpeechSegmentKind; + +/// Versioned struct for a single word within a speech segment. +/// All optional fields use sentinels (FOUNDRY_LOCAL_DURATION_UNSET / NULL) when absent. +typedef struct flSpeechWord { + uint32_t version; ///< Set to FOUNDRY_LOCAL_API_VERSION. + const char* text; ///< UTF-8 word text. Always populated. + int64_t start_time_ms; ///< Milliseconds from audio start. FOUNDRY_LOCAL_DURATION_UNSET if absent. + int64_t end_time_ms; ///< Milliseconds from audio start. FOUNDRY_LOCAL_DURATION_UNSET if absent. + bool has_confidence; ///< True iff `confidence` is populated. + float confidence; ///< 0..1 model posterior. Valid iff has_confidence is true. + const char* speaker_id; ///< Diarization label. NULL if absent. + /* V2 fields go here. */ +} flSpeechWord; + +/// Versioned struct for SPEECH_SEGMENT item content (output-only). +typedef struct flSpeechSegmentData { + uint32_t version; ///< Set to FOUNDRY_LOCAL_API_VERSION. + flSpeechSegmentKind kind; ///< NONE / PARTIAL / FINAL. + const char* text; ///< UTF-8. For PARTIAL: cumulative current hypothesis. May be NULL/"". + int64_t start_time_ms; ///< Milliseconds from audio start. FOUNDRY_LOCAL_DURATION_UNSET if absent. + int64_t end_time_ms; ///< Milliseconds from audio start. FOUNDRY_LOCAL_DURATION_UNSET if absent. + bool utterance_start; ///< True on the first PARTIAL of a new utterance. End is implicit. + const flSpeechWord* words; ///< Borrowed array. Length = words_count. + size_t words_count; + const char* language; ///< Per-segment language for code-switching. NULL if absent. + /* V2 fields go here. */ +} flSpeechSegmentData; + +/// Versioned struct for SPEECH_RESULT item content (output-only). +/// `segments` entries are SPEECH_SEGMENT items with kind = FINAL or NONE. +typedef struct flSpeechResultData { + uint32_t version; ///< Set to FOUNDRY_LOCAL_API_VERSION. + const char* text; ///< UTF-8 concatenated final transcript. May be NULL/"". + const char* language; ///< Detected source language. NULL if absent. + int64_t duration_ms; ///< Total audio duration. FOUNDRY_LOCAL_DURATION_UNSET if absent. + const flItem* const* segments; ///< Borrowed array of SPEECH_SEGMENT items. Length = segments_count. + size_t segments_count; + /* V2 fields go here. */ +} flSpeechResultData; + /// Versioned struct that we pass to a callback during Session::ProcessRequest. /// Guarantees ordering and synchronization via the flItemQueue. typedef struct flStreamingCallbackData { @@ -707,6 +774,20 @@ struct flItemApi { /// Borrowed pointers in the returned struct are owned by the item and valid until the item is released. FL_API_STATUS(GetToolResult, _In_ const flItem* item, _Out_ flToolResultData* out_tool_result); + /// Get content of a SPEECH_SEGMENT item into a versioned struct. + /// Output-only type — there is no SetSpeechSegment. Item_Create with + /// FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT returns FOUNDRY_LOCAL_ERROR_INVALID_USAGE. + /// Borrowed pointers in the returned struct (text, words array, language) are owned by the item and + /// valid until the item is released. + FL_API_STATUS(GetSpeechSegment, _In_ const flItem* item, _Out_ flSpeechSegmentData* out_segment); + + /// Get content of a SPEECH_RESULT item into a versioned struct. + /// Output-only type — there is no SetSpeechResult. Item_Create with + /// FOUNDRY_LOCAL_ITEM_SPEECH_RESULT returns FOUNDRY_LOCAL_ERROR_INVALID_USAGE. + /// Borrowed pointers in the returned struct (text, language, segments array) are owned by the item and + /// valid until the item is released. Each entry of `segments` is a SPEECH_SEGMENT item. + FL_API_STATUS(GetSpeechResult, _In_ const flItem* item, _Out_ flSpeechResultData* out_result); + /// Get metadata from the item (read-only). /// Returned flKeyValuePairs is owned by the item and valid until the item is released — do not release it. FL_API_STATUS(GetMetadata, _In_ const flItem* item, _Outptr_ const flKeyValuePairs** out_metadata); diff --git a/sdk_v2/cpp/include/foundry_local/foundry_local_cpp.h b/sdk_v2/cpp/include/foundry_local/foundry_local_cpp.h index d8ab4e0a4..c0119637f 100644 --- a/sdk_v2/cpp/include/foundry_local/foundry_local_cpp.h +++ b/sdk_v2/cpp/include/foundry_local/foundry_local_cpp.h @@ -458,6 +458,40 @@ struct ToolResultContent { std::string_view result; }; +/// One word in a SPEECH_SEGMENT. Optional fields use std::optional / empty string_view. +struct SpeechWord { + std::string_view text; + std::optional start_time_ms; + std::optional end_time_ms; + std::optional confidence; + std::optional speaker_id; +}; + +/// Content returned from a SPEECH_SEGMENT item (output-only). +/// +/// See SPEECH_TYPES.md for the streaming model. PARTIAL `text` is the +/// cumulative current hypothesis for the segment, not a delta. As an entry of +/// a SpeechResultContent, `kind` is FINAL (or NONE for a single non-segmented +/// transcript). +struct SpeechSegmentContent { + flSpeechSegmentKind kind; + std::string_view text; + std::optional start_time_ms; + std::optional end_time_ms; + bool utterance_start; + std::vector words; + std::optional language; +}; + +/// Content returned from a SPEECH_RESULT item (output-only). +/// `segments` exposes non-owning views over segment items owned by the result. +struct SpeechResultContent { + std::string_view text; + std::optional language; + std::optional duration_ms; + std::vector segments; +}; + // =========================================================================== // Item // =========================================================================== @@ -485,6 +519,8 @@ class Item { MessageContent GetMessage() const; ToolCallContent GetToolCall() const; ToolResultContent GetToolResult() const; + SpeechSegmentContent GetSpeechSegment() const; + SpeechResultContent GetSpeechResult() const; const flItem* native_handle() const noexcept { return handle_.get(); } flItem* native_handle_mutable() { return handle_.get_mutable(); } @@ -973,6 +1009,24 @@ class ChatSession : public Session { void UndoTurns(size_t count); }; +/// Session for automatic-speech-recognition (transcription) models. +/// +/// Output format is controlled by the session option `response_format`: +/// - Unset, or any value other than "text" (default): each request produces a +/// SpeechResultItem and the streaming callback receives SpeechSegmentItems +/// (one per decoded token). +/// - "text": plain-text output — each request produces a TextItem and the +/// streaming callback receives TextItems. +/// +/// Set this once on the session via `SetOptions` before issuing requests: +/// +/// AudioSession session(model); +/// RequestOptions opts; +/// opts.additional_options.Set("response_format", "text"); +/// session.SetOptions(opts); +/// +/// Output format is a session-level decision and is intentionally NOT honoured +/// when set on a per-request `RequestOptions`. class AudioSession : public Session { public: explicit AudioSession(IModel& model); diff --git a/sdk_v2/cpp/include/foundry_local/foundry_local_cpp.inline.h b/sdk_v2/cpp/include/foundry_local/foundry_local_cpp.inline.h index 80402185e..6b714202f 100644 --- a/sdk_v2/cpp/include/foundry_local/foundry_local_cpp.inline.h +++ b/sdk_v2/cpp/include/foundry_local/foundry_local_cpp.inline.h @@ -710,6 +710,62 @@ inline ToolResultContent Item::GetToolResult() const { tr.result ? std::string_view{tr.result} : std::string_view{}}; } +namespace detail { + +inline std::optional SpeechDuration(int64_t v) { + return v == FOUNDRY_LOCAL_DURATION_UNSET ? std::optional{} : std::optional{v}; +} + +inline std::optional SpeechOptStr(const char* s) { + return s ? std::optional{s} : std::optional{}; +} + +} // namespace detail + +inline SpeechSegmentContent Item::GetSpeechSegment() const { + flSpeechSegmentData s{}; + s.version = FOUNDRY_LOCAL_API_VERSION; + Check(detail::item_api()->GetSpeechSegment(handle_.get(), &s)); + + SpeechSegmentContent out; + out.kind = s.kind; + out.text = s.text ? std::string_view{s.text} : std::string_view{}; + out.start_time_ms = detail::SpeechDuration(s.start_time_ms); + out.end_time_ms = detail::SpeechDuration(s.end_time_ms); + out.utterance_start = s.utterance_start; + out.language = detail::SpeechOptStr(s.language); + out.words.reserve(s.words_count); + for (size_t i = 0; i < s.words_count; ++i) { + const flSpeechWord& w = s.words[i]; + SpeechWord sw; + sw.text = w.text ? std::string_view{w.text} : std::string_view{}; + sw.start_time_ms = detail::SpeechDuration(w.start_time_ms); + sw.end_time_ms = detail::SpeechDuration(w.end_time_ms); + sw.confidence = w.has_confidence ? std::optional{w.confidence} : std::optional{}; + sw.speaker_id = detail::SpeechOptStr(w.speaker_id); + out.words.push_back(std::move(sw)); + } + return out; +} + +inline SpeechResultContent Item::GetSpeechResult() const { + flSpeechResultData r{}; + r.version = FOUNDRY_LOCAL_API_VERSION; + Check(detail::item_api()->GetSpeechResult(handle_.get(), &r)); + + SpeechResultContent out; + out.text = r.text ? std::string_view{r.text} : std::string_view{}; + out.language = detail::SpeechOptStr(r.language); + out.duration_ms = detail::SpeechDuration(r.duration_ms); + out.segments.reserve(r.segments_count); + for (size_t i = 0; i < r.segments_count; ++i) { + if (r.segments[i]) { + out.segments.emplace_back(*r.segments[i]); + } + } + return out; +} + inline flItem* detail::CreateItem(flItemType type) { flItem* item = nullptr; Check(detail::item_api()->Create(type, &item)); diff --git a/sdk_v2/cpp/src/c_api.cc b/sdk_v2/cpp/src/c_api.cc index 26fb647a9..076a480c9 100644 --- a/sdk_v2/cpp/src/c_api.cc +++ b/sdk_v2/cpp/src/c_api.cc @@ -13,6 +13,8 @@ #include "items/bytes_item.h" #include "items/image_item.h" #include "items/message_item.h" +#include "items/speech_result_item.h" +#include "items/speech_segment_item.h" #include "items/tensor_item.h" #include "items/text_item.h" #include "items/tool_call_item.h" @@ -988,6 +990,11 @@ FL_API_STATUS_IMPL(Item_CreateImpl, flItemType type, flItem** out_item) { return MakeStatus(FOUNDRY_LOCAL_ERROR_INVALID_ARGUMENT, "null out_item"); } + if (type == FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT || type == FOUNDRY_LOCAL_ITEM_SPEECH_RESULT) { + return MakeStatus(FOUNDRY_LOCAL_ERROR_INVALID_USAGE, + "SPEECH_SEGMENT / SPEECH_RESULT are output-only and cannot be created by callers"); + } + auto item = fl::Item::Create(type); if (!item) { return MakeStatus(FOUNDRY_LOCAL_ERROR_INVALID_ARGUMENT, "unknown item type"); @@ -1304,6 +1311,38 @@ FL_API_STATUS_IMPL(Item_GetToolResultImpl, const flItem* item, flToolResultData* API_IMPL_END } +// --- Speech (output-only) --- + +FL_API_STATUS_IMPL(Item_GetSpeechSegmentImpl, const flItem* item, flSpeechSegmentData* out_segment) { + API_IMPL_BEGIN + if (!item || !out_segment) { + return MakeStatus(FOUNDRY_LOCAL_ERROR_INVALID_ARGUMENT, "null argument"); + } + + if (AsImpl(item)->type != FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT) { + return MakeStatus(FOUNDRY_LOCAL_ERROR_INVALID_USAGE, "item is not a SPEECH_SEGMENT item"); + } + + AsItemType(item)->GetApiData(*out_segment); + return nullptr; + API_IMPL_END +} + +FL_API_STATUS_IMPL(Item_GetSpeechResultImpl, const flItem* item, flSpeechResultData* out_result) { + API_IMPL_BEGIN + if (!item || !out_result) { + return MakeStatus(FOUNDRY_LOCAL_ERROR_INVALID_ARGUMENT, "null argument"); + } + + if (AsImpl(item)->type != FOUNDRY_LOCAL_ITEM_SPEECH_RESULT) { + return MakeStatus(FOUNDRY_LOCAL_ERROR_INVALID_USAGE, "item is not a SPEECH_RESULT item"); + } + + AsItemType(item)->GetApiData(*out_result); + return nullptr; + API_IMPL_END +} + // --- Bytes --- FL_API_STATUS_IMPL(Item_SetBytesImpl, flItem* item, const flBytesData* bytes) { @@ -1460,6 +1499,8 @@ static const flItemApi g_item_api = { Item_GetAudioImpl, Item_GetToolCallImpl, Item_GetToolResultImpl, + Item_GetSpeechSegmentImpl, + Item_GetSpeechResultImpl, Item_GetMetadataImpl, Item_GetMutableMetadataImpl, Item_GetQueueImpl, diff --git a/sdk_v2/cpp/src/inferencing/generative/audio/audio_session.cc b/sdk_v2/cpp/src/inferencing/generative/audio/audio_session.cc index bea71f1a5..a50b07744 100644 --- a/sdk_v2/cpp/src/inferencing/generative/audio/audio_session.cc +++ b/sdk_v2/cpp/src/inferencing/generative/audio/audio_session.cc @@ -11,6 +11,8 @@ #include "items/audio_item.h" #include "items/bytes_item.h" #include "items/item_queue.h" +#include "items/speech_result_item.h" +#include "items/speech_segment_item.h" #include "items/text_item.h" #include "model.h" #include "utils.h" @@ -22,6 +24,54 @@ namespace fl { +namespace { + +// Build a single SpeechSegmentItem with kind NONE wrapping the given text. +// +// TODO: emit kPartial / kFinal once we integrate a model that exposes segmentation +// hypotheses (e.g. a streaming ASR that revises in-flight transcripts before finalising). +// Today's audio models (Whisper, Nemotron streaming) only surface decoded tokens, so the +// stream has no notion of "hypothesis being revised" vs "utterance finalised" — NONE is +// the honest label. +std::unique_ptr MakeNoneSegment(std::string text) { + auto seg = std::make_unique(FOUNDRY_LOCAL_SPEECH_SEGMENT_NONE, std::move(text)); + seg->Finalize(); + return seg; +} + +// Assemble the final SpeechResultItem from the cumulative text and the per-token segments +// accumulated during generation. `language` and `duration_ms` are intentionally left unset: +// the request-side language is just a hint, and GenAI does not report a detected source +// language or audio duration. +std::unique_ptr BuildSpeechResult( + std::string text, std::vector> segments) { + auto result = std::make_unique(std::move(text)); + result->segments = std::move(segments); + result->Finalize(); + return result; +} + +// Initial capacity for the per-token accumulators. Picked empirically: a few seconds of speech +// (~10s on Whisper, ~5s on Nemotron streaming) produces under 256 tokens, so most short-form +// transcriptions avoid any reallocation. Longer transcriptions still grow geometrically. +constexpr size_t kInitialTokenCapacity = 256; + +// Concatenate the per-token strings into a single buffer with one allocation. +std::string JoinTokens(const std::vector& token_texts) { + size_t total = 0; + for (const auto& t : token_texts) { + total += t.size(); + } + std::string out; + out.reserve(total); + for (const auto& t : token_texts) { + out.append(t); + } + return out; +} + +} // namespace + AudioSession::AudioSession(const fl::Model& catalog_model, GenAIModelInstance& model, ILogger& logger, ITelemetry& telemetry) : Session(catalog_model, logger, telemetry), logger_(logger), model_(model) { @@ -51,6 +101,11 @@ SessionType AudioSession::Type() const { void AudioSession::SetSessionOptionsImpl(const KeyValuePairs& options) { session_options_ = SearchOptions::FromParameters(options); + + // Recompute the cached output-format flag. Default = SpeechResultItem; any value other than + // "text" also maps to SpeechResultItem. response_format is intentionally session-level only. + auto it = options.find("response_format"); + text_output_ = (it != options.end() && it->second == "text"); } void AudioSession::ProcessRequestImpl(const Request& request, Response& response) { @@ -139,19 +194,33 @@ void AudioSession::ProcessRequestImpl(const Request& request, Response& response // Token-by-token generation with optional streaming. // Check request.canceled each iteration — a streaming callback returning // non-zero sets this flag asynchronously via CallbackHandler. - std::string text; + std::vector token_texts; + token_texts.reserve(kInitialTokenCapacity); auto streaming_callback = CreateCallbackHandler(request); + const bool want_speech = !text_output_; + std::vector> segments; + if (want_speech) { + segments.reserve(kInitialTokenCapacity); + } while (!generator->IsDone() && !request.canceled) { generator->GenerateNextToken(); std::string token = generator->Decode(); if (!token.empty()) { - text += token; + if (want_speech) { + segments.push_back(MakeNoneSegment(token)); + } if (streaming_callback) { - streaming_callback->PushItem(std::make_unique(token)); + if (want_speech) { + streaming_callback->PushItem(MakeNoneSegment(token)); + } else { + streaming_callback->PushItem(std::make_unique(token)); + } } + + token_texts.push_back(std::move(token)); } if (request.canceled) { @@ -162,8 +231,14 @@ void AudioSession::ProcessRequestImpl(const Request& request, Response& response int total_tokens = generator->TokenCount(); int completion_tokens = total_tokens - prompt_tokens; - // Add the full transcription as a text item - response.items.push_back(std::make_unique(std::move(text))); + std::string text = JoinTokens(token_texts); + + // Add the transcription. Default = SpeechResultItem; legacy = TextItem. + if (want_speech) { + response.items.push_back(BuildSpeechResult(std::move(text), std::move(segments))); + } else { + response.items.push_back(std::make_unique(std::move(text))); + } // Set finish reason if (request.canceled) { @@ -226,14 +301,24 @@ void AudioSession::ProcessStreamingAudio(const AudioItem& format_item, ItemQueue auto tokenizer_stream = OgaTokenizerStream::Create(Model().GetOgaTokenizer()); auto streaming_callback = CreateCallbackHandler(request); - std::string full_text; + std::vector token_texts; + token_texts.reserve(kInitialTokenCapacity); + const bool want_speech = !text_output_; + std::vector> segments; + if (want_speech) { + segments.reserve(kInitialTokenCapacity); + } + std::vector>* segments_ptr = want_speech ? &segments : nullptr; + // Streaming ASR has no text prompt (input is audio), so prompt_tokens stays 0. + // We track every decoded token (whether it produced visible text or not) as completion_tokens. + int completion_tokens = 0; // 3. If the AudioItem itself has initial data, process it first if (format_item.data && format_item.data_size > 0) { auto float_samples = ConvertS16LEToFloat( static_cast(format_item.data), format_item.data_size); ProcessChunk(*processor, *generator, *tokenizer_stream, - float_samples, full_text, streaming_callback, request); + float_samples, token_texts, segments_ptr, streaming_callback, request, completion_tokens); } // 4. Read from queue until finished or cancelled @@ -258,7 +343,7 @@ void AudioSession::ProcessStreamingAudio(const AudioItem& format_item, ItemQueue static_cast(bytes.data), bytes.data_size); ProcessChunk(*processor, *generator, *tokenizer_stream, - float_samples, full_text, streaming_callback, request); + float_samples, token_texts, segments_ptr, streaming_callback, request, completion_tokens); } // 5. Flush remaining buffered audio @@ -267,12 +352,20 @@ void AudioSession::ProcessStreamingAudio(const AudioItem& format_item, ItemQueue if (flush_tensors) { generator->SetInputs(*flush_tensors); - DecodeTokens(*generator, *tokenizer_stream, full_text, streaming_callback, request); + DecodeTokens(*generator, *tokenizer_stream, token_texts, segments_ptr, streaming_callback, request, + completion_tokens); } } - // 6. Produce response - response.items.push_back(std::make_unique(std::move(full_text))); + // 6. Produce response. Default = SpeechResultItem carrying all per-token segments; + // legacy `response_format=text` keeps the bare TextItem output. + std::string full_text = JoinTokens(token_texts); + const size_t full_text_size = full_text.size(); + if (want_speech) { + response.items.push_back(BuildSpeechResult(std::move(full_text), std::move(segments))); + } else { + response.items.push_back(std::make_unique(std::move(full_text))); + } if (request.canceled) { response.finish_reason = FOUNDRY_LOCAL_FINISH_NONE; @@ -280,28 +373,36 @@ void AudioSession::ProcessStreamingAudio(const AudioItem& format_item, ItemQueue response.finish_reason = FOUNDRY_LOCAL_FINISH_STOP; } + response.usage.prompt_tokens = 0; + response.usage.completion_tokens = completion_tokens; + response.usage.total_tokens = completion_tokens; + logger_.Log(LogLevel::Debug, fmt::format("Streaming audio transcription complete, text length: {}", - response.items.empty() ? 0 : full_text.size())); + response.items.empty() ? 0 : full_text_size)); } void AudioSession::ProcessChunk(OgaStreamingProcessor& processor, OgaGenerator& generator, OgaTokenizerStream& tokenizer_stream, const std::vector& samples, - std::string& full_text, + std::vector& token_texts, + std::vector>* segments, const std::unique_ptr& callback, - const Request& request) { + const Request& request, + int& completion_tokens) { auto tensors = processor.Process(samples.data(), samples.size()); if (tensors) { generator.SetInputs(*tensors); - DecodeTokens(generator, tokenizer_stream, full_text, callback, request); + DecodeTokens(generator, tokenizer_stream, token_texts, segments, callback, request, completion_tokens); } } void AudioSession::DecodeTokens(OgaGenerator& generator, OgaTokenizerStream& tokenizer_stream, - std::string& full_text, + std::vector& token_texts, + std::vector>* segments, const std::unique_ptr& callback, - const Request& request) { + const Request& request, + int& completion_tokens) { while (!generator.IsDone() && !generator.IsSessionTerminated() && !request.canceled) { generator.GenerateNextToken(); auto next_tokens = generator.GetNextTokens(); @@ -310,15 +411,25 @@ void AudioSession::DecodeTokens(OgaGenerator& generator, OgaTokenizerStream& tok continue; } + ++completion_tokens; + int32_t token_id = next_tokens[0]; const char* token_text = tokenizer_stream.Decode(token_id); if (token_text && token_text[0] != '\0') { - full_text += token_text; + if (segments) { + segments->push_back(MakeNoneSegment(token_text)); + } if (callback) { - callback->PushItem(std::make_unique(std::string(token_text))); + if (segments) { + callback->PushItem(MakeNoneSegment(token_text)); + } else { + callback->PushItem(std::make_unique(std::string(token_text))); + } } + + token_texts.emplace_back(token_text); } } } diff --git a/sdk_v2/cpp/src/inferencing/generative/audio/audio_session.h b/sdk_v2/cpp/src/inferencing/generative/audio/audio_session.h index 24e0fc89c..46417f4ce 100644 --- a/sdk_v2/cpp/src/inferencing/generative/audio/audio_session.h +++ b/sdk_v2/cpp/src/inferencing/generative/audio/audio_session.h @@ -21,6 +21,7 @@ namespace fl { class GenAIModelInstance; struct AudioItem; struct ItemQueue; +struct SpeechSegmentItem; /// Audio transcription session. /// Stateless — each request processes one audio file independently (no history). @@ -59,16 +60,25 @@ class AudioSession : public Session { /// Feed float32 PCM samples to the StreamingProcessor. If a full encoder chunk is ready, /// set the tensors on the generator and decode tokens. /// IMPORTANT: DecodeTokens must drain to IsDone() before the next SetInputs() call. + /// `segments` (when non-null) accumulates a SpeechSegmentItem per decoded token; the same + /// per-token segments are also what gets pushed to the streaming callback. When null, the + /// callback receives plain TextItems (legacy `response_format=text` mode). void ProcessChunk(OgaStreamingProcessor& processor, OgaGenerator& generator, OgaTokenizerStream& tokenizer_stream, const std::vector& samples, - std::string& full_text, const std::unique_ptr& callback, - const Request& request); + std::vector& token_texts, + std::vector>* segments, + const std::unique_ptr& callback, + const Request& request, + int& completion_tokens); /// Decode all available tokens from the generator. This MUST run to completion /// (IsDone() == true) before the next SetInputs() call. void DecodeTokens(OgaGenerator& generator, OgaTokenizerStream& tokenizer_stream, - std::string& full_text, const std::unique_ptr& callback, - const Request& request); + std::vector& token_texts, + std::vector>* segments, + const std::unique_ptr& callback, + const Request& request, + int& completion_tokens); GenAIModelInstance& Model() { return model_; } const GenAIModelInstance& Model() const { return model_; } @@ -79,6 +89,12 @@ class AudioSession : public Session { // moved-from instance so the refcount transfers cleanly across moves. bool owns_session_ = true; SearchOptions session_options_; + + // Cached flag derived from session_options_["response_format"]: true when the session is + // configured for plain-text output (TextItem only). Updated in SetSessionOptionsImpl + // so each request just reads a bool instead of hitting the KeyValuePairs map. Output format + // is a session-level decision and per-request `response_format` is intentionally ignored. + bool text_output_ = false; }; } // namespace fl diff --git a/sdk_v2/cpp/src/items/speech_result_item.h b/sdk_v2/cpp/src/items/speech_result_item.h new file mode 100644 index 000000000..ac9f67d66 --- /dev/null +++ b/sdk_v2/cpp/src/items/speech_result_item.h @@ -0,0 +1,70 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +#pragma once + +#include "items/item.h" +#include "items/speech_segment_item.h" + +#include +#include +#include +#include +#include +#include + +namespace fl { + +/// Final aggregate result for a completed audio request. +/// Output-only. `segments` entries are SpeechSegmentItems with kind = FINAL +/// (or NONE for a single non-segmented transcript). +struct SpeechResultItem : Item { + std::string text; + std::string language; // empty when absent + std::optional duration_ms; + std::vector> segments; + + SpeechResultItem() : Item(FOUNDRY_LOCAL_ITEM_SPEECH_RESULT) {} + + explicit SpeechResultItem(std::string text_in) + : Item(FOUNDRY_LOCAL_ITEM_SPEECH_RESULT), text(std::move(text_in)) {} + + SpeechResultItem(const SpeechResultItem&) = delete; + SpeechResultItem& operator=(const SpeechResultItem&) = delete; + SpeechResultItem(SpeechResultItem&&) = default; + SpeechResultItem& operator=(SpeechResultItem&&) = default; + + /// Snapshot the current field values (and each segment's) into the cached + /// C ABI representation. Must be called once after the item is fully + /// populated and before any `GetApiData` call. Calls `Finalize()` on every + /// child segment. Fields must not be mutated after Finalize(). + void Finalize() { + cached_segment_ptrs_.clear(); + cached_segment_ptrs_.reserve(segments.size()); + for (const auto& s : segments) { + if (s) { + s->Finalize(); + cached_segment_ptrs_.push_back(s->AsApiType()); + } else { + cached_segment_ptrs_.push_back(nullptr); + } + } + + cached_ = {}; + cached_.version = FOUNDRY_LOCAL_API_VERSION; + cached_.text = text.c_str(); + cached_.language = language.empty() ? nullptr : language.c_str(); + cached_.duration_ms = duration_ms.value_or(FOUNDRY_LOCAL_DURATION_UNSET); + cached_.segments = cached_segment_ptrs_.empty() ? nullptr : cached_segment_ptrs_.data(); + cached_.segments_count = cached_segment_ptrs_.size(); + } + + /// Copy the cached C ABI snapshot into `out`. Requires a prior Finalize(). + /// Borrowed pointers in `out` remain valid for the lifetime of this item. + void GetApiData(flSpeechResultData& out) const { out = cached_; } + + private: + flSpeechResultData cached_{}; + std::vector cached_segment_ptrs_; +}; + +} // namespace fl diff --git a/sdk_v2/cpp/src/items/speech_segment_item.cc b/sdk_v2/cpp/src/items/speech_segment_item.cc new file mode 100644 index 000000000..31eec3212 --- /dev/null +++ b/sdk_v2/cpp/src/items/speech_segment_item.cc @@ -0,0 +1,34 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +#include "items/speech_segment_item.h" + +namespace fl { + +void SpeechSegmentItem::Finalize() { + cached_words_.clear(); + cached_words_.reserve(words.size()); + for (const auto& w : words) { + flSpeechWord aw{}; + aw.version = FOUNDRY_LOCAL_API_VERSION; + aw.text = w.text.c_str(); + aw.start_time_ms = w.start_time_ms.value_or(FOUNDRY_LOCAL_DURATION_UNSET); + aw.end_time_ms = w.end_time_ms.value_or(FOUNDRY_LOCAL_DURATION_UNSET); + aw.has_confidence = w.confidence.has_value(); + aw.confidence = w.confidence.value_or(0.0f); + aw.speaker_id = w.speaker_id.empty() ? nullptr : w.speaker_id.c_str(); + cached_words_.push_back(aw); + } + + cached_ = {}; + cached_.version = FOUNDRY_LOCAL_API_VERSION; + cached_.kind = kind; + cached_.text = text.c_str(); + cached_.start_time_ms = start_time_ms.value_or(FOUNDRY_LOCAL_DURATION_UNSET); + cached_.end_time_ms = end_time_ms.value_or(FOUNDRY_LOCAL_DURATION_UNSET); + cached_.utterance_start = utterance_start; + cached_.language = language.empty() ? nullptr : language.c_str(); + cached_.words = cached_words_.empty() ? nullptr : cached_words_.data(); + cached_.words_count = cached_words_.size(); +} + +} // namespace fl diff --git a/sdk_v2/cpp/src/items/speech_segment_item.h b/sdk_v2/cpp/src/items/speech_segment_item.h new file mode 100644 index 000000000..5d65d9ebc --- /dev/null +++ b/sdk_v2/cpp/src/items/speech_segment_item.h @@ -0,0 +1,73 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +#pragma once + +#include "items/item.h" + +#include +#include +#include +#include +#include + +namespace fl { + +/// One word within a speech segment. Output-only — produced by the SDK, +/// never constructed by callers via the C ABI. +struct SpeechWord { + std::string text; + std::optional start_time_ms; + std::optional end_time_ms; + std::optional confidence; + std::string speaker_id; // empty when absent +}; + +/// A recognized / translated speech segment. +/// +/// Streaming model (see SPEECH_TYPES.md): zero-or-more PARTIAL segments for +/// the current segment, then exactly one FINAL closes it. Segment identity +/// is implicit in stream order; there is no segment id. `utterance_start` +/// tags the first segment of a new utterance. +/// +/// PARTIAL `text` is the cumulative current hypothesis for the segment, not +/// a delta. +/// +/// As an entry of a SpeechResultItem, `kind` is FINAL (or NONE for a single +/// non-segmented transcript). +struct SpeechSegmentItem : Item { + flSpeechSegmentKind kind = FOUNDRY_LOCAL_SPEECH_SEGMENT_NONE; + std::string text; + std::optional start_time_ms; + std::optional end_time_ms; + bool utterance_start = false; + std::vector words; + std::string language; // empty when absent + + SpeechSegmentItem() : Item(FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT) {} + + SpeechSegmentItem(flSpeechSegmentKind kind_in, std::string text_in) + : Item(FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT), + kind(kind_in), + text(std::move(text_in)) {} + + // Move-only; copying would invalidate cached C ABI pointers. + SpeechSegmentItem(const SpeechSegmentItem&) = delete; + SpeechSegmentItem& operator=(const SpeechSegmentItem&) = delete; + SpeechSegmentItem(SpeechSegmentItem&&) = default; + SpeechSegmentItem& operator=(SpeechSegmentItem&&) = default; + + /// Snapshot the current field values into the cached C ABI representation. + /// Must be called once after the item is fully populated and before any + /// `GetApiData` call. Fields must not be mutated after Finalize(). + void Finalize(); + + /// Copy the cached C ABI snapshot into `out`. Requires a prior Finalize(). + /// Borrowed pointers in `out` remain valid for the lifetime of this item. + void GetApiData(flSpeechSegmentData& out) const { out = cached_; } + + private: + flSpeechSegmentData cached_{}; + std::vector cached_words_; +}; + +} // namespace fl diff --git a/sdk_v2/cpp/test/internal_api/c_api_test.cc b/sdk_v2/cpp/test/internal_api/c_api_test.cc index a8f072410..576615c03 100644 --- a/sdk_v2/cpp/test/internal_api/c_api_test.cc +++ b/sdk_v2/cpp/test/internal_api/c_api_test.cc @@ -596,6 +596,34 @@ TEST(CApiTest, ItemReleaseNullIsNoOp) { api->GetItemApi()->Item_Release(nullptr); } +// ======================================================================== +// Speech items (output-only) — Create is rejected; Get works on internally-built items +// ======================================================================== + +TEST(CApiTest, ItemCreateSpeechSegmentRejected) { + const flApi* api = GetApi(); + const flItemApi* item_api = api->GetItemApi(); + + flItem* item = nullptr; + flStatus* status = item_api->Create(FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT, &item); + ASSERT_NE(status, nullptr); + EXPECT_EQ(api->Status_GetErrorCode(status), FOUNDRY_LOCAL_ERROR_INVALID_USAGE); + EXPECT_EQ(item, nullptr); + api->Status_Release(status); +} + +TEST(CApiTest, ItemCreateSpeechResultRejected) { + const flApi* api = GetApi(); + const flItemApi* item_api = api->GetItemApi(); + + flItem* item = nullptr; + flStatus* status = item_api->Create(FOUNDRY_LOCAL_ITEM_SPEECH_RESULT, &item); + ASSERT_NE(status, nullptr); + EXPECT_EQ(api->Status_GetErrorCode(status), FOUNDRY_LOCAL_ERROR_INVALID_USAGE); + EXPECT_EQ(item, nullptr); + api->Status_Release(status); +} + // ======================================================================== // Inference API — Request / Response // ======================================================================== diff --git a/sdk_v2/cpp/test/internal_api/item_test.cc b/sdk_v2/cpp/test/internal_api/item_test.cc index ca16167f4..75bc6aded 100644 --- a/sdk_v2/cpp/test/internal_api/item_test.cc +++ b/sdk_v2/cpp/test/internal_api/item_test.cc @@ -9,6 +9,8 @@ #include "items/image_item.h" #include "items/item_queue.h" #include "items/message_item.h" +#include "items/speech_result_item.h" +#include "items/speech_segment_item.h" #include "items/tensor_item.h" #include "items/text_item.h" #include "items/tool_call_item.h" @@ -16,6 +18,7 @@ #include "inferencing/session/session.h" #include "exception.h" +#include #include #include @@ -142,6 +145,18 @@ TEST(ItemCreateTest, UnknownTypeReturnsNullptr) { EXPECT_EQ(item, nullptr); } +TEST(ItemCreateTest, SpeechSegmentNotCreatable) { + // SPEECH_SEGMENT is output-only; the factory does not produce one. + auto item = Item::Create(FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT); + EXPECT_EQ(item, nullptr); +} + +TEST(ItemCreateTest, SpeechResultNotCreatable) { + // SPEECH_RESULT is output-only; the factory does not produce one. + auto item = Item::Create(FOUNDRY_LOCAL_ITEM_SPEECH_RESULT); + EXPECT_EQ(item, nullptr); +} + TEST(ItemCreateTest, InvalidEnumValueReturnsNullptr) { auto item = Item::Create(static_cast(9999)); EXPECT_EQ(item, nullptr); @@ -188,6 +203,160 @@ TEST(ToolResultItemTest, ConstructWithValues) { EXPECT_EQ(item.result, "72 degrees"); } +// ======================================================================== +// Speech items (output-only) +// ======================================================================== + +TEST(SpeechSegmentItemTest, DefaultsAreEmpty) { + SpeechSegmentItem item; + EXPECT_TRUE(item.type == FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT); + EXPECT_EQ(item.kind, FOUNDRY_LOCAL_SPEECH_SEGMENT_NONE); + EXPECT_TRUE(item.text.empty()); + EXPECT_FALSE(item.start_time_ms.has_value()); + EXPECT_FALSE(item.end_time_ms.has_value()); + EXPECT_FALSE(item.utterance_start); + EXPECT_TRUE(item.words.empty()); + EXPECT_TRUE(item.language.empty()); +} + +TEST(SpeechSegmentItemTest, ConstructWithKindAndText) { + SpeechSegmentItem item(FOUNDRY_LOCAL_SPEECH_SEGMENT_PARTIAL, "hello"); + EXPECT_EQ(item.kind, FOUNDRY_LOCAL_SPEECH_SEGMENT_PARTIAL); + EXPECT_EQ(item.text, "hello"); +} + +TEST(SpeechSegmentItemTest, GetApiDataMapsOptionalsToSentinel) { + SpeechSegmentItem item(FOUNDRY_LOCAL_SPEECH_SEGMENT_FINAL, "the cat sat"); + item.utterance_start = true; + item.start_time_ms = 100; + item.end_time_ms = 1500; + item.language = "en"; + item.words.push_back({"the", 100, 200, 0.95f, ""}); + item.words.push_back({"cat", std::nullopt, std::nullopt, std::nullopt, "spk_1"}); + item.Finalize(); + + flSpeechSegmentData out{}; + out.version = FOUNDRY_LOCAL_API_VERSION; + item.GetApiData(out); + + EXPECT_EQ(out.kind, FOUNDRY_LOCAL_SPEECH_SEGMENT_FINAL); + EXPECT_STREQ(out.text, "the cat sat"); + EXPECT_EQ(out.start_time_ms, 100); + EXPECT_EQ(out.end_time_ms, 1500); + EXPECT_TRUE(out.utterance_start); + EXPECT_STREQ(out.language, "en"); + ASSERT_EQ(out.words_count, 2u); + + EXPECT_STREQ(out.words[0].text, "the"); + EXPECT_EQ(out.words[0].start_time_ms, 100); + EXPECT_EQ(out.words[0].end_time_ms, 200); + EXPECT_TRUE(out.words[0].has_confidence); + EXPECT_FLOAT_EQ(out.words[0].confidence, 0.95f); + EXPECT_EQ(out.words[0].speaker_id, nullptr); + + EXPECT_STREQ(out.words[1].text, "cat"); + EXPECT_EQ(out.words[1].start_time_ms, FOUNDRY_LOCAL_DURATION_UNSET); + EXPECT_EQ(out.words[1].end_time_ms, FOUNDRY_LOCAL_DURATION_UNSET); + EXPECT_FALSE(out.words[1].has_confidence); + EXPECT_STREQ(out.words[1].speaker_id, "spk_1"); +} + +TEST(SpeechSegmentItemTest, GetApiDataEmptyOptionalsBecomeSentinel) { + SpeechSegmentItem item; + item.Finalize(); + flSpeechSegmentData out{}; + out.version = FOUNDRY_LOCAL_API_VERSION; + item.GetApiData(out); + + EXPECT_EQ(out.start_time_ms, FOUNDRY_LOCAL_DURATION_UNSET); + EXPECT_EQ(out.end_time_ms, FOUNDRY_LOCAL_DURATION_UNSET); + EXPECT_EQ(out.language, nullptr); + EXPECT_EQ(out.words, nullptr); + EXPECT_EQ(out.words_count, 0u); +} + +TEST(SpeechResultItemTest, DefaultsAreEmpty) { + SpeechResultItem item; + EXPECT_TRUE(item.type == FOUNDRY_LOCAL_ITEM_SPEECH_RESULT); + EXPECT_TRUE(item.text.empty()); + EXPECT_TRUE(item.language.empty()); + EXPECT_FALSE(item.duration_ms.has_value()); + EXPECT_TRUE(item.segments.empty()); +} + +TEST(SpeechResultItemTest, GetApiDataExposesSegmentsAsItemPointers) { + SpeechResultItem result("the cat sat"); + result.language = "en"; + result.duration_ms = 1500; + result.segments.push_back(std::make_unique(FOUNDRY_LOCAL_SPEECH_SEGMENT_FINAL, "the cat")); + result.segments.push_back(std::make_unique(FOUNDRY_LOCAL_SPEECH_SEGMENT_FINAL, "sat")); + result.Finalize(); + + flSpeechResultData out{}; + out.version = FOUNDRY_LOCAL_API_VERSION; + result.GetApiData(out); + + EXPECT_STREQ(out.text, "the cat sat"); + EXPECT_STREQ(out.language, "en"); + EXPECT_EQ(out.duration_ms, 1500); + ASSERT_EQ(out.segments_count, 2u); + + // Each segment pointer should resolve back to a SPEECH_SEGMENT item. + for (size_t i = 0; i < out.segments_count; ++i) { + ASSERT_NE(out.segments[i], nullptr); + EXPECT_TRUE(reinterpret_cast(out.segments[i])->type == FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT); + } +} + +// Wrapper translation: sentinel int64 ↔ std::optional, NULL ↔ std::optional, +// and that segments are exposed as a vector of SPEECH_SEGMENT items. +TEST(SpeechWrapperTest, ReadsThroughPublicCppApi) { + SpeechSegmentItem seg(FOUNDRY_LOCAL_SPEECH_SEGMENT_FINAL, "the cat sat"); + seg.start_time_ms = 100; + seg.utterance_start = true; + // end_time_ms intentionally left unset. + seg.words.push_back({"the", 100, 200, 0.95f, ""}); + seg.words.push_back({"cat", std::nullopt, std::nullopt, std::nullopt, ""}); + seg.Finalize(); + + const fl::SpeechSegmentItem& cseg = seg; + foundry_local::Item view(*cseg.AsApiType()); + auto content = view.GetSpeechSegment(); + EXPECT_EQ(content.kind, FOUNDRY_LOCAL_SPEECH_SEGMENT_FINAL); + EXPECT_EQ(content.text, "the cat sat"); + ASSERT_TRUE(content.start_time_ms.has_value()); + EXPECT_EQ(*content.start_time_ms, 100); + EXPECT_FALSE(content.end_time_ms.has_value()); + EXPECT_TRUE(content.utterance_start); + EXPECT_FALSE(content.language.has_value()); + ASSERT_EQ(content.words.size(), 2u); + + EXPECT_EQ(content.words[0].text, "the"); + ASSERT_TRUE(content.words[0].confidence.has_value()); + EXPECT_FLOAT_EQ(*content.words[0].confidence, 0.95f); + EXPECT_FALSE(content.words[0].speaker_id.has_value()); + + EXPECT_FALSE(content.words[1].start_time_ms.has_value()); + EXPECT_FALSE(content.words[1].confidence.has_value()); +} + +TEST(SpeechWrapperTest, ResultExposesSegmentsAsItemViews) { + SpeechResultItem result("hi"); + result.duration_ms = 1500; + result.segments.push_back(std::make_unique(FOUNDRY_LOCAL_SPEECH_SEGMENT_FINAL, "hi")); + result.Finalize(); + + const fl::SpeechResultItem& cresult = result; + foundry_local::Item view(*cresult.AsApiType()); + auto content = view.GetSpeechResult(); + EXPECT_EQ(content.text, "hi"); + ASSERT_TRUE(content.duration_ms.has_value()); + EXPECT_EQ(*content.duration_ms, 1500); + ASSERT_EQ(content.segments.size(), 1u); + EXPECT_EQ(content.segments[0].GetType(), FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT); + EXPECT_EQ(content.segments[0].GetSpeechSegment().text, "hi"); +} + TEST(JsonItemTest, OpenAIJsonTextItem) { TextItem item(R"({"model":"gpt-4","input":"hello"})", FOUNDRY_LOCAL_TEXT_ITEM_TYPE_OPENAI_JSON); EXPECT_TRUE(item.type == FOUNDRY_LOCAL_ITEM_TEXT); diff --git a/sdk_v2/cpp/test/sdk_api/audio_transcriptions_test.cc b/sdk_v2/cpp/test/sdk_api/audio_transcriptions_test.cc index 1c6f61fc2..8a6496410 100644 --- a/sdk_v2/cpp/test/sdk_api/audio_transcriptions_test.cc +++ b/sdk_v2/cpp/test/sdk_api/audio_transcriptions_test.cc @@ -135,6 +135,79 @@ TEST_F(AudioSessionFixture, TranscribeWithSessionLevelOptions) { ExpectTranscriptionContent(text); } +TEST_F(AudioSessionFixture, TranscribeProducesSpeechResultItem) { + using namespace foundry_local; + + Request request; + request.AddItem(Item::AudioFromUri(audio_file_path())); + + AudioSession session(audio_model()); + Response response = session.ProcessRequest(request); + + EXPECT_EQ(response.GetFinishReason(), FOUNDRY_LOCAL_FINISH_STOP); + + const Item* speech_item = nullptr; + for (const auto& item : response.GetItems()) { + if (item.GetType() == FOUNDRY_LOCAL_ITEM_SPEECH_RESULT) { + speech_item = &item; + break; + } + } + ASSERT_NE(speech_item, nullptr) << "Expected a SPEECH_RESULT item by default"; + + auto result = speech_item->GetSpeechResult(); + std::string result_text(result.text); + EXPECT_FALSE(result_text.empty()); + ExpectTranscriptionContent(result_text); + // One segment per decoded token, kind NONE. Concatenated text matches result.text. + ASSERT_FALSE(result.segments.empty()); + std::string concatenated; + for (const auto& seg_item : result.segments) { + auto seg = seg_item.GetSpeechSegment(); + EXPECT_EQ(seg.kind, FOUNDRY_LOCAL_SPEECH_SEGMENT_NONE); + EXPECT_FALSE(seg.start_time_ms.has_value()); + EXPECT_FALSE(seg.end_time_ms.has_value()); + EXPECT_TRUE(seg.words.empty()); + concatenated.append(seg.text.data(), seg.text.size()); + } + EXPECT_EQ(concatenated, result_text); + // Detected source language is not reported by GenAI today — reserved for future translation. + EXPECT_FALSE(result.language.has_value()); + EXPECT_FALSE(result.duration_ms.has_value()); +} + +TEST_F(AudioSessionFixture, TranscribeWithResponseFormatTextProducesTextItem) { + using namespace foundry_local; + + Request request; + request.AddItem(Item::AudioFromUri(audio_file_path())); + + // response_format is a session-level option; setting it on the request must NOT take effect. + AudioSession session(audio_model()); + RequestOptions session_opts; + session_opts.additional_options.Set("response_format", "text"); + session.SetOptions(session_opts); + + Response response = session.ProcessRequest(request); + + EXPECT_EQ(response.GetFinishReason(), FOUNDRY_LOCAL_FINISH_STOP); + + bool saw_text = false; + bool saw_speech = false; + for (const auto& item : response.GetItems()) { + if (item.GetType() == FOUNDRY_LOCAL_ITEM_TEXT) { + saw_text = true; + } else if (item.GetType() == FOUNDRY_LOCAL_ITEM_SPEECH_RESULT) { + saw_speech = true; + } + } + EXPECT_TRUE(saw_text) << "response_format=text should produce a TEXT item"; + EXPECT_FALSE(saw_speech) << "response_format=text should NOT produce a SPEECH_RESULT item"; + + std::string text = CollectResponseText(response); + ExpectTranscriptionContent(text); +} + // ---- Error paths — exercise ProcessRequestImpl validation branches. ---- TEST_F(AudioSessionFixture, RejectsEmptyRequest) { diff --git a/sdk_v2/cpp/test/sdk_api/model_fixture.h b/sdk_v2/cpp/test/sdk_api/model_fixture.h index f26744da9..312abce51 100644 --- a/sdk_v2/cpp/test/sdk_api/model_fixture.h +++ b/sdk_v2/cpp/test/sdk_api/model_fixture.h @@ -63,6 +63,9 @@ inline std::string CollectResponseText(const foundry_local::Response& response) text += item.GetText().text; } else if (item.GetType() == FOUNDRY_LOCAL_ITEM_MESSAGE) { text += CollectMessageText(item.GetMessage()); + } else if (item.GetType() == FOUNDRY_LOCAL_ITEM_SPEECH_RESULT) { + auto sr = item.GetSpeechResult(); + text.append(sr.text.data(), sr.text.size()); } } diff --git a/sdk_v2/cpp/test/sdk_api/streaming_audio_test.cc b/sdk_v2/cpp/test/sdk_api/streaming_audio_test.cc index 290604838..be6981391 100644 --- a/sdk_v2/cpp/test/sdk_api/streaming_audio_test.cc +++ b/sdk_v2/cpp/test/sdk_api/streaming_audio_test.cc @@ -260,12 +260,13 @@ TEST_F(StreamingAudioFixture, StreamingCallbackReceivesTokens) { // Wrap in Item for RAII release and checked accessors. Item item(*raw_item); - if (item.GetType() == FOUNDRY_LOCAL_ITEM_TEXT) { - auto text = item.GetText().text; - if (!text.empty()) { - std::lock_guard lock(text_mutex); - streamed_text += text; - } + // Default audio output is SpeechSegmentItem per token (we never set response_format=text + // on this session, so a TextItem would be unexpected). + EXPECT_EQ(item.GetType(), FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT); + auto seg = item.GetSpeechSegment(); + if (!seg.text.empty()) { + std::lock_guard lock(text_mutex); + streamed_text.append(seg.text.data(), seg.text.size()); } callback_count++;