From 8bddd9f02a56db382ac852a02a560443c989312d Mon Sep 17 00:00:00 2001
From: Scott McKay <Scott.McKay@microsoft.com>
Date: Sat, 30 May 2026 10:00:49 +1000
Subject: [PATCH 1/2] Initial proposed set of types for review/refinement

---
 .../generative/audio/SPEECH_TYPES.md          | 129 ++++++++++++++++++
 1 file changed, 129 insertions(+)
 create mode 100644 sdk_v2/cpp/src/inferencing/generative/audio/SPEECH_TYPES.md
diff --git a/sdk_v2/cpp/src/inferencing/generative/audio/SPEECH_TYPES.md b/sdk_v2/cpp/src/inferencing/generative/audio/SPEECH_TYPES.md
new file mode 100644
index 000000000..c9f44befd
--- /dev/null
+++ b/sdk_v2/cpp/src/inferencing/generative/audio/SPEECH_TYPES.md
@@ -0,0 +1,129 @@
+# Speech Types — Design
+
+Native SDK types returned by `AudioSession` for speech-to-text (file and live),
+translation, and ASR scenarios. References: OpenAI `verbose_json` / Realtime
+transcription events, Azure Speech SDK recognition results.
+
+## Design rules
+
+- **One set of types covers transcription, translation, and ASR.** Task
+  selection (transcribe vs translate, target language) is a Request parameter,
+  not a type variant. `text` is the recognized-or-translated string either way.
+- **One shared segment type** for both streaming events and final-result entries,
+  discriminated by `kind`.
+- **No event wrapper, no `event_id`, no segment `id`.** Ordering is a property of
+  the callback channel; segment identity is implicit in stream order (zero-or-more
+  `kPartial` for the current segment, then one `kFinal` closes it). A web service
+  above the SDK can add envelope/sequence metadata.
+- **`text` on `kPartial` is the cumulative current hypothesis for the segment**
+  (Azure-style), not a delta-since-last-event. A delta is recoverable by diffing
+  against the previous hypothesis.
+- **`utterance_start` is a boolean on the segment.** Knowable at emission time
+  (VAD says "speech started" → producer tags the first `kPartial` of the new
+  segment). There is no `utterance_end` field: end-of-utterance can't be known
+  when the `kFinal` is emitted without delaying it by the silence threshold.
+  Instead, end is implicit — the next `utterance_start` marks it (consumer
+  infers end at the previous `kFinal.end_time`), a future `kSilence` event
+  marks it explicitly, or the final `SpeechResult` marks it for file
+  transcription.
+- **Time as `int64_t` milliseconds.** Must survive the C ABI. Typedef'd so the
+  unit is legible and changeable in one place.
+- **Two C ABI item types** — one for streaming segments, one for the final
+  aggregate. Both additive to existing items.
+
+## Types
+
+```cpp
+namespace fl {
+
+using DurationMs = std::int64_t;  // milliseconds; C ABI-safe
+
+enum class SpeechSegmentKind : int {
+  kNone     = 0,   // entry in a final aggregate result
+  kPartial  = 1,   // streaming: hypothesis for the current segment; may change
+  kFinal    = 2,   // streaming: segment is stable, or an entry in the final result
+};
+
+struct SpeechWord {
+  std::string text;
+  std::optional<DurationMs> start_time;
+  std::optional<DurationMs> end_time;
+  std::optional<float> confidence;        // 0..1
+  std::optional<std::string> speaker_id;
+};
+
+struct SpeechSegment {
+  SpeechSegmentKind kind = SpeechSegmentKind::kNone;
+
+  std::string text;                       // for kPartial: cumulative current hypothesis
+  std::optional<DurationMs> start_time;
+  std::optional<DurationMs> end_time;
+
+  // Utterance start signal — tagged on the first kPartial of a new utterance.
+  // Knowable at emission time. End-of-utterance is implicit (see design rules).
+  bool utterance_start = false;
+
+  std::vector<SpeechWord> words;          // word-timestamp opt-in
+
+  // Future / opt-in. Included here for visibility in review. 
+  // We should only add fields that we expect to use as the C API types need to be ABI stable,
+  // so we can't remove anything added.
+  std::optional<float> confidence;        // 0..1 aggregate
+  std::optional<std::string> language;    // per-segment, for code-switching
+  std::optional<std::string> speaker_id;
+  std::optional<std::int32_t> channel;
+  // we could maybe use something more generic if we want to report these things instead of having per-value fields
+  // e.g. shared float[] of fixed size and an enum saying which value is in which slot.
+  std::optional<float> avg_logprob;       // Whisper-family diagnostic
+  std::optional<float> no_speech_prob;    // Whisper-family diagnostic
+  std::optional<float> compression_ratio; // Whisper-family diagnostic
+};
+
+struct SpeechResult {
+  std::string text;                       // concatenated final transcript
+  std::optional<std::string> language;    // detected source language
+  std::optional<DurationMs> duration;     // total audio duration
+  std::vector<SpeechSegment> segments;    // entries are kFinal or kNone
+};
+
+}  // namespace fl
+```
+
+## C ABI item types
+
+```c
+FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT = 31,  // pushed via streaming callback
+FOUNDRY_LOCAL_ITEM_SPEECH_RESULT  = 32,  // final aggregate in response.items
+```
+
+`TextItem` remains the trivial fallback for `response_format: "text"`.
+
+## V1 scope
+
+Populated in the initial implementation:
+
+- `SpeechSegmentKind`: `kNone`, `kPartial`, `kFinal`
+- `SpeechSegment`: `kind`, `text`, `start_time`, `end_time`,
+  `utterance_start` (defaulted; populated when computable)
+- `SpeechResult`: `text`, `language`, `duration`, `segments`
+
+Defined in the header but unpopulated until a producer exists:
+
+- `SpeechWord` and `SpeechSegment::words` (word-timestamp opt-in)
+- `confidence` (segment and word)
+- `avg_logprob`, `no_speech_prob`, `compression_ratio` (Whisper diagnostics)
+- `language` / `speaker_id` / `channel` on segment
+- `speaker_id` on word
+
+## Growth headroom (not built)
+
+- **Diarization**: `speaker_id` already present on word and segment.
+- **Multi-channel audio**: `channel` already present on segment.
+- **N-best alternatives**: future `std::vector<SpeechAlternative> alternatives`
+  on `SpeechSegment`.
+- **OpenAI `verbose_json` compatibility**: handled by a
+  `ToOpenAIVerboseJson(const SpeechResult&)` adapter in
+  `contracts/audio_transcriptions.*`, not by changing native types.
+
+Multi-target translation in a single pass is intentionally out of scope —
+that's a server-side concern, not a local-inferencing one.

From c0dd6c245c8f65d2d555e2596de0fc54a73fed21 Mon Sep 17 00:00:00 2001
From: Scott McKay <Scott.McKay@microsoft.com>
Date: Tue, 2 Jun 2026 17:57:20 +1000
Subject: [PATCH 2/2] Add speech result types and wire up for initial feedback

---
 sdk_v2/cpp/CMakeLists.txt                     |   1 +
 .../generative/audio => docs}/SPEECH_TYPES.md |  30 ++--
 sdk_v2/cpp/examples/realtime_audio/main.cc    |  22 ++-
 .../include/foundry_local/foundry_local_c.h   |  93 +++++++++-
 .../include/foundry_local/foundry_local_cpp.h |  54 ++++++
 .../foundry_local/foundry_local_cpp.inline.h  |  56 ++++++
 sdk_v2/cpp/src/c_api.cc                       |  41 +++++
 .../generative/audio/audio_session.cc         | 149 +++++++++++++--
 .../generative/audio/audio_session.h          |  24 ++-
 sdk_v2/cpp/src/items/speech_result_item.h     |  70 ++++++++
 sdk_v2/cpp/src/items/speech_segment_item.cc   |  34 ++++
 sdk_v2/cpp/src/items/speech_segment_item.h    |  73 ++++++++
 sdk_v2/cpp/test/internal_api/c_api_test.cc    |  28 +++
 sdk_v2/cpp/test/internal_api/item_test.cc     | 169 ++++++++++++++++++
 .../test/sdk_api/audio_transcriptions_test.cc |  73 ++++++++
 sdk_v2/cpp/test/sdk_api/model_fixture.h       |   3 +
 .../cpp/test/sdk_api/streaming_audio_test.cc  |  13 +-
 17 files changed, 878 insertions(+), 55 deletions(-)
 rename sdk_v2/cpp/{src/inferencing/generative/audio => docs}/SPEECH_TYPES.md (81%)
 create mode 100644 sdk_v2/cpp/src/items/speech_result_item.h
 create mode 100644 sdk_v2/cpp/src/items/speech_segment_item.cc
 create mode 100644 sdk_v2/cpp/src/items/speech_segment_item.h

diff --git a/sdk_v2/cpp/CMakeLists.txt b/sdk_v2/cpp/CMakeLists.txt
index c203deec6..3b4c82629 100644
--- a/sdk_v2/cpp/CMakeLists.txt
+++ b/sdk_v2/cpp/CMakeLists.txt
@@ -130,6 +130,7 @@ set(FOUNDRY_LOCAL_SOURCES
     src/items/item.cc
     src/items/image_item.cc
     src/items/message_item.cc
+    src/items/speech_segment_item.cc
     src/catalog/base_model_catalog.cc
     src/catalog/azure_model_catalog.cc
     src/catalog/azure_catalog_models.cc
diff --git a/sdk_v2/cpp/src/inferencing/generative/audio/SPEECH_TYPES.md b/sdk_v2/cpp/docs/SPEECH_TYPES.md
similarity index 81%
rename from sdk_v2/cpp/src/inferencing/generative/audio/SPEECH_TYPES.md
rename to sdk_v2/cpp/docs/SPEECH_TYPES.md
index c9f44befd..7d01eaba6 100644
--- a/sdk_v2/cpp/src/inferencing/generative/audio/SPEECH_TYPES.md
+++ b/sdk_v2/cpp/docs/SPEECH_TYPES.md
@@ -6,6 +6,10 @@ transcription events, Azure Speech SDK recognition results.
 
 ## Design rules
 
+- **Output-only types.** These types are produced by `AudioSession` and flow out
+  through the streaming callback and the final `Response`. Callers never
+  construct them as inputs. The C ABI therefore exposes only Get accessors —
+  no Set functions, no `Item_Create` for these types.
 - **One set of types covers transcription, translation, and ASR.** Task
   selection (transcribe vs translate, target language) is a Request parameter,
   not a type variant. `text` is the recognized-or-translated string either way.
@@ -65,18 +69,7 @@ struct SpeechSegment {
 
   std::vector<SpeechWord> words;          // word-timestamp opt-in
 
-  // Future / opt-in. Included here for visibility in review. 
-  // We should only add fields that we expect to use as the C API types need to be ABI stable,
-  // so we can't remove anything added.
-  std::optional<float> confidence;        // 0..1 aggregate
   std::optional<std::string> language;    // per-segment, for code-switching
-  std::optional<std::string> speaker_id;
-  std::optional<std::int32_t> channel;
-  // we could maybe use something more generic if we want to report these things instead of having per-value fields
-  // e.g. shared float[] of fixed size and an enum saying which value is in which slot.
-  std::optional<float> avg_logprob;       // Whisper-family diagnostic
-  std::optional<float> no_speech_prob;    // Whisper-family diagnostic
-  std::optional<float> compression_ratio; // Whisper-family diagnostic
 };
 
 struct SpeechResult {
@@ -96,6 +89,10 @@ FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT = 31,  // pushed via streaming callback
 FOUNDRY_LOCAL_ITEM_SPEECH_RESULT  = 32,  // final aggregate in response.items
 ```
 
+These types are output-only — the ABI exposes `GetSpeechSegment` /
+`GetSpeechResult` accessors, but no setters and no `Item_Create` support.
+Attempting to create one returns `FOUNDRY_LOCAL_ERROR_INVALID_USAGE`.
+
 `TextItem` remains the trivial fallback for `response_format: "text"`.
 
 ## V1 scope
@@ -110,17 +107,18 @@ Populated in the initial implementation:
 Defined in the header but unpopulated until a producer exists:
 
 - `SpeechWord` and `SpeechSegment::words` (word-timestamp opt-in)
-- `confidence` (segment and word)
-- `avg_logprob`, `no_speech_prob`, `compression_ratio` (Whisper diagnostics)
-- `language` / `speaker_id` / `channel` on segment
+- `confidence` on word
+- `language` on segment
 - `speaker_id` on word
 
 ## Growth headroom (not built)
 
-- **Diarization**: `speaker_id` already present on word and segment.
-- **Multi-channel audio**: `channel` already present on segment.
+- **Diarization**: `speaker_id` already present on word.
 - **N-best alternatives**: future `std::vector<SpeechAlternative> alternatives`
   on `SpeechSegment`.
+- **Per-segment diagnostics** (Whisper `avg_logprob`, `no_speech_prob`,
+  `compression_ratio`; multi-channel `channel`; etc.): pushed as a separate
+  diagnostic item type rather than overloading `SpeechSegment`.
 - **OpenAI `verbose_json` compatibility**: handled by a
   `ToOpenAIVerboseJson(const SpeechResult&)` adapter in
   `contracts/audio_transcriptions.*`, not by changing native types.
diff --git a/sdk_v2/cpp/examples/realtime_audio/main.cc b/sdk_v2/cpp/examples/realtime_audio/main.cc
index 1a1453bfa..94a52a09b 100644
--- a/sdk_v2/cpp/examples/realtime_audio/main.cc
+++ b/sdk_v2/cpp/examples/realtime_audio/main.cc
@@ -46,7 +46,12 @@ void RealtimeAudioChat(IModel& model, const std::string& audio_path) {
     flItem* raw_item = nullptr;
     if (item_api->ItemQueue_TryPop(event.item_queue, &raw_item)) {
       Item item(*raw_item);
-      if (item.GetType() == FOUNDRY_LOCAL_ITEM_TEXT) {
+      if (item.GetType() == FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT) {
+        auto seg = item.GetSpeechSegment();
+        std::cout.write(seg.text.data(), seg.text.size());
+        std::cout.flush();
+      } else if (item.GetType() == FOUNDRY_LOCAL_ITEM_TEXT) {
+        // `response_format=text` session option produces a simple TextItem stream.
         std::cout << item.GetText().text << std::flush;
       } else {
         std::cerr << "Unexpected item type" << std::endl;
@@ -133,9 +138,18 @@ void RealtimeAudioChat(IModel& model, const std::string& audio_path) {
             << ", completion: " << usage.completion_tokens
             << ", total: " << usage.total_tokens << "\n";
 
-  // 8. The full response items are also available.
-  for (const auto& item : response.GetItems()) {
-    if (item.GetType() == FOUNDRY_LOCAL_ITEM_TEXT) {
+  // 8. The full response is a single item — SpeechResultItem by default, or TextItem
+  // if the session was configured with `response_format=text`.
+  const auto& items = response.GetItems();
+  if (!items.empty()) {
+    const auto& item = items.front();
+    if (item.GetType() == FOUNDRY_LOCAL_ITEM_SPEECH_RESULT) {
+      auto result = item.GetSpeechResult();
+      std::cout << "Full response: ";
+      std::cout.write(result.text.data(), result.text.size());
+      std::cout << "\n";
+      std::cout << "Segments: " << result.segments.size() << "\n";
+    } else if (item.GetType() == FOUNDRY_LOCAL_ITEM_TEXT) {
       std::cout << "Full response: " << item.GetText().text << "\n";
     }
   }
diff --git a/sdk_v2/cpp/include/foundry_local/foundry_local_c.h b/sdk_v2/cpp/include/foundry_local/foundry_local_c.h
index a32c686fc..15cc20f17 100644
--- a/sdk_v2/cpp/include/foundry_local/foundry_local_c.h
+++ b/sdk_v2/cpp/include/foundry_local/foundry_local_c.h
@@ -304,12 +304,15 @@ typedef enum flItemType {
   FOUNDRY_LOCAL_ITEM_BYTES = 1,  // Raw bytes with an item type tag.
   FOUNDRY_LOCAL_ITEM_TENSOR = 10,
   FOUNDRY_LOCAL_ITEM_TEXT = 20,
-  FOUNDRY_LOCAL_ITEM_MESSAGE = 21,       // role + content string.
-  FOUNDRY_LOCAL_ITEM_IMAGE = 25,         // Image input/output. Could be bytes or URI (file, memory address, url, etc.)
-  FOUNDRY_LOCAL_ITEM_AUDIO = 30,         // Audio input/output. Could be bytes or URI.
-  FOUNDRY_LOCAL_ITEM_TOOL_CALL = 100,    // request to call tool: call id, tool name, arguments
-  FOUNDRY_LOCAL_ITEM_TOOL_RESULT = 101,  // response from tool: call id, result
-  FOUNDRY_LOCAL_ITEM_QUEUE = 200,        // An item containing an flItemQueue of sub-items. Turtles all the way down.
+  FOUNDRY_LOCAL_ITEM_MESSAGE = 21,         // role + content string.
+  FOUNDRY_LOCAL_ITEM_IMAGE = 25,           // Image input/output. Could be bytes or URI (file, memory address, url, etc.)
+  FOUNDRY_LOCAL_ITEM_AUDIO = 30,           // Audio input/output. Could be bytes or URI.
+  FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT = 31,  // Output-only. Recognized/translated speech segment.
+                                           // Pushed via streaming callback during AudioSession.
+  FOUNDRY_LOCAL_ITEM_SPEECH_RESULT = 32,   // Output-only. Final aggregate from AudioSession.
+  FOUNDRY_LOCAL_ITEM_TOOL_CALL = 100,      // request to call tool: call id, tool name, arguments
+  FOUNDRY_LOCAL_ITEM_TOOL_RESULT = 101,    // response from tool: call id, result
+  FOUNDRY_LOCAL_ITEM_QUEUE = 200,          // An item containing an flItemQueue of sub-items. Turtles all the way down.
 } flItemType;
 
 typedef enum flTextItemType {
@@ -492,6 +495,70 @@ typedef struct flToolResultData {
   /* V2 fields go here. */
 } flToolResultData;
 
+/* -----------------------------------------------------------------------
+ * Speech recognition output types.
+ *
+ * SPEECH_SEGMENT and SPEECH_RESULT items are produced by AudioSession and
+ * delivered via the streaming callback / final Response. Callers never
+ * construct them — the ABI exposes only Get accessors.
+ *
+ * Streaming model: zero-or-more PARTIAL items for the current segment, then
+ * exactly one FINAL closes it. Segment identity is implicit in stream order;
+ * there is no segment id. Producers with no revisable hypotheses emit NONE.
+ *
+ * PARTIAL text is the cumulative current hypothesis for the segment, not a
+ * delta-since-last-event. Consumers replace by stream position.
+ * ----------------------------------------------------------------------- */
+
+/// Sentinel for absent flSpeechWord / flSpeechSegmentData / flSpeechResultData
+/// time fields. Required because the C ABI cannot carry std::optional.
+#define FOUNDRY_LOCAL_DURATION_UNSET INT64_MIN
+
+typedef enum flSpeechSegmentKind {
+  FOUNDRY_LOCAL_SPEECH_SEGMENT_NONE = 0,     ///< Unclassified: final-result entry, or a streamed per-token segment.
+  FOUNDRY_LOCAL_SPEECH_SEGMENT_PARTIAL = 1,  ///< Streaming: hypothesis for the current segment; may change.
+  FOUNDRY_LOCAL_SPEECH_SEGMENT_FINAL = 2,    ///< Streaming: segment is stable, or entry in the final result.
+} flSpeechSegmentKind;
+
+/// Versioned struct for a single word within a speech segment.
+/// When absent: time fields hold FOUNDRY_LOCAL_DURATION_UNSET, speaker_id is NULL, and has_confidence is false.
+typedef struct flSpeechWord {
+  uint32_t version;        ///< Set to FOUNDRY_LOCAL_API_VERSION.
+  const char* text;        ///< UTF-8 word text. Always populated.
+  int64_t start_time_ms;   ///< Milliseconds from audio start. FOUNDRY_LOCAL_DURATION_UNSET if absent.
+  int64_t end_time_ms;     ///< Milliseconds from audio start. FOUNDRY_LOCAL_DURATION_UNSET if absent.
+  bool has_confidence;     ///< True iff `confidence` is populated.
+  float confidence;        ///< 0..1 model posterior. Valid iff has_confidence is true.
+  const char* speaker_id;  ///< Diarization label. NULL if absent.
+  /* V2 fields go here. */
+} flSpeechWord;
+
+/// Versioned struct for SPEECH_SEGMENT item content (output-only).
+typedef struct flSpeechSegmentData {
+  uint32_t version;           ///< Set to FOUNDRY_LOCAL_API_VERSION.
+  flSpeechSegmentKind kind;   ///< NONE / PARTIAL / FINAL.
+  const char* text;           ///< UTF-8. For PARTIAL: cumulative current hypothesis. May be NULL/"".
+  int64_t start_time_ms;      ///< Milliseconds from audio start. FOUNDRY_LOCAL_DURATION_UNSET if absent.
+  int64_t end_time_ms;        ///< Milliseconds from audio start. FOUNDRY_LOCAL_DURATION_UNSET if absent.
+  bool utterance_start;       ///< True on the first PARTIAL of a new utterance. End is implicit.
+  const flSpeechWord* words;  ///< Borrowed array. Length = words_count.
+  size_t words_count;         ///< Number of entries in `words`.
+  const char* language;  ///< Per-segment language for code-switching. NULL if absent.
+  /* V2 fields go here. */
+} flSpeechSegmentData;
+
+/// Versioned struct for SPEECH_RESULT item content (output-only).
+/// `segments` entries are SPEECH_SEGMENT items with kind = FINAL or NONE.
+typedef struct flSpeechResultData {
+  uint32_t version;               ///< Set to FOUNDRY_LOCAL_API_VERSION.
+  const char* text;               ///< UTF-8 concatenated final transcript. May be NULL/"".
+  const char* language;           ///< Detected source language. NULL if absent.
+  int64_t duration_ms;            ///< Total audio duration. FOUNDRY_LOCAL_DURATION_UNSET if absent.
+  const flItem* const* segments;  ///< Borrowed array of SPEECH_SEGMENT items. Length = segments_count.
+  size_t segments_count;          ///< Number of entries in `segments`.
+  /* V2 fields go here. */
+} flSpeechResultData;
+
 /// Versioned struct that we pass to a callback during Session::ProcessRequest.
 /// Guarantees ordering and synchronization via the flItemQueue.
 typedef struct flStreamingCallbackData {
@@ -707,6 +774,20 @@ struct flItemApi {
   /// Borrowed pointers in the returned struct are owned by the item and valid until the item is released.
   FL_API_STATUS(GetToolResult, _In_ const flItem* item, _Out_ flToolResultData* out_tool_result);
 
+  /// Get content of a SPEECH_SEGMENT item into a versioned struct.
+  /// Output-only type — there is no SetSpeechSegment. Item_Create with
+  /// FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT returns FOUNDRY_LOCAL_ERROR_INVALID_USAGE.
+  /// Borrowed pointers in the returned struct (text, words array, language) are owned by the item and
+  /// valid until the item is released.
+  FL_API_STATUS(GetSpeechSegment, _In_ const flItem* item, _Out_ flSpeechSegmentData* out_segment);
+
+  /// Get content of a SPEECH_RESULT item into a versioned struct.
+  /// Output-only type — there is no SetSpeechResult. Item_Create with
+  /// FOUNDRY_LOCAL_ITEM_SPEECH_RESULT returns FOUNDRY_LOCAL_ERROR_INVALID_USAGE.
+  /// Borrowed pointers in the returned struct (text, language, segments array) are owned by the item and
+  /// valid until the item is released. Each entry of `segments` is a SPEECH_SEGMENT item.
+  FL_API_STATUS(GetSpeechResult, _In_ const flItem* item, _Out_ flSpeechResultData* out_result);
+
   /// Get metadata from the item (read-only).
   /// Returned flKeyValuePairs is owned by the item and valid until the item is released — do not release it.
   FL_API_STATUS(GetMetadata, _In_ const flItem* item, _Outptr_ const flKeyValuePairs** out_metadata);
diff --git a/sdk_v2/cpp/include/foundry_local/foundry_local_cpp.h b/sdk_v2/cpp/include/foundry_local/foundry_local_cpp.h
index d8ab4e0a4..c0119637f 100644
--- a/sdk_v2/cpp/include/foundry_local/foundry_local_cpp.h
+++ b/sdk_v2/cpp/include/foundry_local/foundry_local_cpp.h
@@ -458,6 +458,40 @@ struct ToolResultContent {
   std::string_view result;
 };
 
+/// One word in a SPEECH_SEGMENT. Optional fields use std::optional / empty string_view.
+struct SpeechWord {
+  std::string_view text;                       ///< Word text (a view, not an owned copy); empty if the C field is NULL.
+  std::optional<int64_t> start_time_ms;        ///< Milliseconds from audio start; nullopt if absent.
+  std::optional<int64_t> end_time_ms;          ///< Milliseconds from audio start; nullopt if absent.
+  std::optional<float> confidence;             ///< 0..1 model posterior; nullopt if absent.
+  std::optional<std::string_view> speaker_id;  ///< Diarization label; nullopt if absent.
+};
+
+/// Content returned from a SPEECH_SEGMENT item (output-only).
+///
+/// String views borrow from the item and are valid only until it is released.
+/// See SPEECH_TYPES.md for the streaming model. PARTIAL `text` is the cumulative
+/// current hypothesis for the segment, not a delta. As an entry of a
+/// SpeechResultContent, `kind` is FINAL (or NONE for a non-segmented transcript).
+struct SpeechSegmentContent {
+  flSpeechSegmentKind kind;
+  std::string_view text;
+  std::optional<int64_t> start_time_ms;
+  std::optional<int64_t> end_time_ms;
+  bool utterance_start;
+  std::vector<SpeechWord> words;
+  std::optional<std::string_view> language;
+};
+
+/// Content returned from a SPEECH_RESULT item (output-only).
+/// `segments` exposes non-owning views over segment items owned by the result.
+struct SpeechResultContent {
+  std::string_view text;                     ///< Concatenated final transcript; empty if the C field is NULL.
+  std::optional<std::string_view> language;  ///< Detected source language; nullopt if absent.
+  std::optional<int64_t> duration_ms;        ///< Total audio duration in ms; nullopt if absent.
+  std::vector<Item> segments;                ///< One Item per non-null entry of the C `segments` array.
+};
+
 // ===========================================================================
 // Item
 // ===========================================================================
@@ -485,6 +519,8 @@ class Item {
   MessageContent GetMessage() const;
   ToolCallContent GetToolCall() const;
   ToolResultContent GetToolResult() const;
+  SpeechSegmentContent GetSpeechSegment() const;
+  SpeechResultContent GetSpeechResult() const;
 
   const flItem* native_handle() const noexcept { return handle_.get(); }
   flItem* native_handle_mutable() { return handle_.get_mutable(); }
@@ -973,6 +1009,24 @@ class ChatSession : public Session {
   void UndoTurns(size_t count);
 };
 
+/// Session for automatic-speech-recognition (transcription) models.
+///
+/// Output format is controlled by the session option `response_format`:
+///   - Unset, or any value other than "text" (default): each request produces a
+///     SpeechResultItem and the streaming callback receives SpeechSegmentItems
+///     (one per decoded token).
+///   - "text": plain-text output — each request produces a TextItem and the
+///     streaming callback receives TextItems.
+///
+/// Set this once on the session via `SetOptions` before issuing requests:
+///
+///     AudioSession session(model);
+///     RequestOptions opts;
+///     opts.additional_options.Set("response_format", "text");
+///     session.SetOptions(opts);
+///
+/// Output format is a session-level decision and is intentionally NOT honoured
+/// when set on a per-request `RequestOptions`.
 class AudioSession : public Session {
  public:
   explicit AudioSession(IModel& model);
diff --git a/sdk_v2/cpp/include/foundry_local/foundry_local_cpp.inline.h b/sdk_v2/cpp/include/foundry_local/foundry_local_cpp.inline.h
index 80402185e..6b714202f 100644
--- a/sdk_v2/cpp/include/foundry_local/foundry_local_cpp.inline.h
+++ b/sdk_v2/cpp/include/foundry_local/foundry_local_cpp.inline.h
@@ -710,6 +710,62 @@ inline ToolResultContent Item::GetToolResult() const {
           tr.result ? std::string_view{tr.result} : std::string_view{}};
 }
 
+namespace detail {
+
+inline std::optional<int64_t> SpeechDuration(int64_t v) {  // Maps the C ABI UNSET sentinel to nullopt.
+  return v == FOUNDRY_LOCAL_DURATION_UNSET ? std::optional<int64_t>{} : std::optional<int64_t>{v};
+}
+
+inline std::optional<std::string_view> SpeechOptStr(const char* s) {  // NULL -> nullopt; "" stays an engaged view.
+  return s ? std::optional<std::string_view>{s} : std::optional<std::string_view>{};
+}
+
+}  // namespace detail
+
+inline SpeechSegmentContent Item::GetSpeechSegment() const {
+  flSpeechSegmentData s{};  // Value-initialised: every field starts as 0/NULL before the C call.
+  s.version = FOUNDRY_LOCAL_API_VERSION;
+  Check(detail::item_api()->GetSpeechSegment(handle_.get(), &s));
+
+  SpeechSegmentContent out;
+  out.kind = s.kind;
+  out.text = s.text ? std::string_view{s.text} : std::string_view{};  // NULL text becomes an empty view.
+  out.start_time_ms = detail::SpeechDuration(s.start_time_ms);  // UNSET sentinel -> nullopt.
+  out.end_time_ms = detail::SpeechDuration(s.end_time_ms);
+  out.utterance_start = s.utterance_start;
+  out.language = detail::SpeechOptStr(s.language);
+  out.words.reserve(s.words_count);  // One allocation for the whole word list.
+  for (size_t i = 0; i < s.words_count; ++i) {
+    const flSpeechWord& w = s.words[i];
+    SpeechWord sw;
+    sw.text = w.text ? std::string_view{w.text} : std::string_view{};
+    sw.start_time_ms = detail::SpeechDuration(w.start_time_ms);
+    sw.end_time_ms = detail::SpeechDuration(w.end_time_ms);
+    sw.confidence = w.has_confidence ? std::optional<float>{w.confidence} : std::optional<float>{};  // Flag-gated.
+    sw.speaker_id = detail::SpeechOptStr(w.speaker_id);
+    out.words.push_back(std::move(sw));
+  }
+  return out;  // Views in `out` point at the C struct's borrowed strings, not at copies.
+}
+
+inline SpeechResultContent Item::GetSpeechResult() const {
+  flSpeechResultData r{};  // Value-initialised: every field starts as 0/NULL before the C call.
+  r.version = FOUNDRY_LOCAL_API_VERSION;
+  Check(detail::item_api()->GetSpeechResult(handle_.get(), &r));
+
+  SpeechResultContent out;
+  out.text = r.text ? std::string_view{r.text} : std::string_view{};  // NULL text becomes an empty view.
+  out.language = detail::SpeechOptStr(r.language);
+  out.duration_ms = detail::SpeechDuration(r.duration_ms);  // UNSET sentinel -> nullopt.
+  out.segments.reserve(r.segments_count);
+  for (size_t i = 0; i < r.segments_count; ++i) {
+    if (r.segments[i]) {  // Null entries are skipped, so out.segments.size() may be < segments_count.
+      out.segments.emplace_back(*r.segments[i]);
+    }
+  }
+  return out;
+}
+
 inline flItem* detail::CreateItem(flItemType type) {
   flItem* item = nullptr;
   Check(detail::item_api()->Create(type, &item));
diff --git a/sdk_v2/cpp/src/c_api.cc b/sdk_v2/cpp/src/c_api.cc
index 26fb647a9..076a480c9 100644
--- a/sdk_v2/cpp/src/c_api.cc
+++ b/sdk_v2/cpp/src/c_api.cc
@@ -13,6 +13,8 @@
 #include "items/bytes_item.h"
 #include "items/image_item.h"
 #include "items/message_item.h"
+#include "items/speech_result_item.h"
+#include "items/speech_segment_item.h"
 #include "items/tensor_item.h"
 #include "items/text_item.h"
 #include "items/tool_call_item.h"
@@ -988,6 +990,11 @@ FL_API_STATUS_IMPL(Item_CreateImpl, flItemType type, flItem** out_item) {
     return MakeStatus(FOUNDRY_LOCAL_ERROR_INVALID_ARGUMENT, "null out_item");
   }
 
+  if (type == FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT || type == FOUNDRY_LOCAL_ITEM_SPEECH_RESULT) {
+    return MakeStatus(FOUNDRY_LOCAL_ERROR_INVALID_USAGE,
+                      "SPEECH_SEGMENT / SPEECH_RESULT are output-only and cannot be created by callers");
+  }
+
   auto item = fl::Item::Create(type);
   if (!item) {
     return MakeStatus(FOUNDRY_LOCAL_ERROR_INVALID_ARGUMENT, "unknown item type");
@@ -1304,6 +1311,38 @@ FL_API_STATUS_IMPL(Item_GetToolResultImpl, const flItem* item, flToolResultData*
   API_IMPL_END
 }
 
+// --- Speech (output-only) ---
+
+FL_API_STATUS_IMPL(Item_GetSpeechSegmentImpl, const flItem* item, flSpeechSegmentData* out_segment) {
+  API_IMPL_BEGIN
+  if (!item || !out_segment) {  // Both pointers are dereferenced below, so reject NULL up front.
+    return MakeStatus(FOUNDRY_LOCAL_ERROR_INVALID_ARGUMENT, "null argument");
+  }
+
+  if (AsImpl(item)->type != FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT) {  // Checked before AsItemType<> is applied.
+    return MakeStatus(FOUNDRY_LOCAL_ERROR_INVALID_USAGE, "item is not a SPEECH_SEGMENT item");
+  }
+
+  AsItemType<fl::SpeechSegmentItem>(item)->GetApiData(*out_segment);  // Writes into the caller-provided struct.
+  return nullptr;  // NOTE(review): a null status presumably signals success — confirm against FL_API_STATUS.
+  API_IMPL_END
+}
+
+FL_API_STATUS_IMPL(Item_GetSpeechResultImpl, const flItem* item, flSpeechResultData* out_result) {
+  API_IMPL_BEGIN
+  if (!item || !out_result) {  // Both pointers are dereferenced below, so reject NULL up front.
+    return MakeStatus(FOUNDRY_LOCAL_ERROR_INVALID_ARGUMENT, "null argument");
+  }
+
+  if (AsImpl(item)->type != FOUNDRY_LOCAL_ITEM_SPEECH_RESULT) {  // Checked before AsItemType<> is applied.
+    return MakeStatus(FOUNDRY_LOCAL_ERROR_INVALID_USAGE, "item is not a SPEECH_RESULT item");
+  }
+
+  AsItemType<fl::SpeechResultItem>(item)->GetApiData(*out_result);  // Writes into the caller-provided struct.
+  return nullptr;  // NOTE(review): a null status presumably signals success — confirm against FL_API_STATUS.
+  API_IMPL_END
+}
+
 // --- Bytes ---
 
 FL_API_STATUS_IMPL(Item_SetBytesImpl, flItem* item, const flBytesData* bytes) {
@@ -1460,6 +1499,8 @@ static const flItemApi g_item_api = {
     Item_GetAudioImpl,
     Item_GetToolCallImpl,
     Item_GetToolResultImpl,
+    Item_GetSpeechSegmentImpl,
+    Item_GetSpeechResultImpl,
     Item_GetMetadataImpl,
     Item_GetMutableMetadataImpl,
     Item_GetQueueImpl,
diff --git a/sdk_v2/cpp/src/inferencing/generative/audio/audio_session.cc b/sdk_v2/cpp/src/inferencing/generative/audio/audio_session.cc
index bea71f1a5..a50b07744 100644
--- a/sdk_v2/cpp/src/inferencing/generative/audio/audio_session.cc
+++ b/sdk_v2/cpp/src/inferencing/generative/audio/audio_session.cc
@@ -11,6 +11,8 @@
 #include "items/audio_item.h"
 #include "items/bytes_item.h"
 #include "items/item_queue.h"
+#include "items/speech_result_item.h"
+#include "items/speech_segment_item.h"
 #include "items/text_item.h"
 #include "model.h"
 #include "utils.h"
@@ -22,6 +24,54 @@
 
 namespace fl {
 
+namespace {
+
+// Build a single SpeechSegmentItem with kind NONE wrapping the given text.
+//
+// TODO: emit kPartial / kFinal once we integrate a model that exposes segmentation
+// hypotheses (e.g. a streaming ASR that revises in-flight transcripts before finalising).
+// Today's audio models (Whisper, Nemotron streaming) only surface decoded tokens, so the
+// stream has no notion of "hypothesis being revised" vs "utterance finalised" — NONE is
+// the honest label.
+std::unique_ptr<SpeechSegmentItem> MakeNoneSegment(std::string text) {  // `text` is a sink parameter (moved below).
+  auto seg = std::make_unique<SpeechSegmentItem>(FOUNDRY_LOCAL_SPEECH_SEGMENT_NONE, std::move(text));
+  seg->Finalize();  // NOTE(review): Finalize() is defined in speech_segment_item.* (not shown) — confirm its contract.
+  return seg;
+}
+
+// Assemble the final SpeechResultItem from the cumulative text and the per-token segments
+// accumulated during generation. `language` and `duration_ms` are intentionally left unset:
+// the request-side language is just a hint, and GenAI does not report a detected source
+// language or audio duration.
+std::unique_ptr<SpeechResultItem> BuildSpeechResult(
+    std::string text, std::vector<std::unique_ptr<SpeechSegmentItem>> segments) {
+  auto result = std::make_unique<SpeechResultItem>(std::move(text));
+  result->segments = std::move(segments);  // The result takes ownership of the per-token segment items.
+  result->Finalize();  // Called only after `segments` is assigned; see speech_result_item.h for its contract.
+  return result;
+}
+
+// Initial capacity for the per-token accumulators. Picked empirically: a few seconds of speech
+// (~10s on Whisper, ~5s on Nemotron streaming) produces under 256 tokens, so most short-form
+// transcriptions avoid any reallocation. Longer transcriptions still grow geometrically.
+constexpr size_t kInitialTokenCapacity = 256;
+
+// Concatenate the per-token strings into a single buffer with one allocation.
+std::string JoinTokens(const std::vector<std::string>& token_texts) {
+  size_t total = 0;  // First pass: sum the sizes so the reserve below is exact.
+  for (const auto& t : token_texts) {
+    total += t.size();
+  }
+  std::string out;
+  out.reserve(total);  // Sized to the exact final length, so the appends never reallocate.
+  for (const auto& t : token_texts) {
+    out.append(t);  // Second pass: copy tokens in order, with no separator.
+  }
+  return out;  // Empty input yields an empty string.
+}
+
+}  // namespace
+
 AudioSession::AudioSession(const fl::Model& catalog_model, GenAIModelInstance& model,
                            ILogger& logger, ITelemetry& telemetry)
     : Session(catalog_model, logger, telemetry), logger_(logger), model_(model) {
@@ -51,6 +101,11 @@ SessionType AudioSession::Type() const {
 
 void AudioSession::SetSessionOptionsImpl(const KeyValuePairs& options) {
   session_options_ = SearchOptions::FromParameters(options);
+
+  // Recompute the cached output-format flag on every call: only an exact "text" value selects
+  // TextItem output; a missing key or any other value means SpeechResultItem (the default).
+  auto it = options.find("response_format");
+  text_output_ = (it != options.end() && it->second == "text");  // session-level only, by design
 }
 
 void AudioSession::ProcessRequestImpl(const Request& request, Response& response) {
@@ -139,19 +194,33 @@ void AudioSession::ProcessRequestImpl(const Request& request, Response& response
   // Token-by-token generation with optional streaming.
   // Check request.canceled each iteration — a streaming callback returning
   // non-zero sets this flag asynchronously via CallbackHandler.
-  std::string text;
+  std::vector<std::string> token_texts;
+  token_texts.reserve(kInitialTokenCapacity);
   auto streaming_callback = CreateCallbackHandler(request);
+  const bool want_speech = !text_output_;
+  std::vector<std::unique_ptr<SpeechSegmentItem>> segments;
+  if (want_speech) {
+    segments.reserve(kInitialTokenCapacity);
+  }
 
   while (!generator->IsDone() && !request.canceled) {
     generator->GenerateNextToken();
     std::string token = generator->Decode();
 
     if (!token.empty()) {
-      text += token;
+      if (want_speech) {
+        segments.push_back(MakeNoneSegment(token));
+      }
 
       if (streaming_callback) {
-        streaming_callback->PushItem(std::make_unique<TextItem>(token));
+        if (want_speech) {
+          streaming_callback->PushItem(MakeNoneSegment(token));
+        } else {
+          streaming_callback->PushItem(std::make_unique<TextItem>(token));
+        }
       }
+
+      token_texts.push_back(std::move(token));
     }
 
     if (request.canceled) {
@@ -162,8 +231,14 @@ void AudioSession::ProcessRequestImpl(const Request& request, Response& response
   int total_tokens = generator->TokenCount();
   int completion_tokens = total_tokens - prompt_tokens;
 
-  // Add the full transcription as a text item
-  response.items.push_back(std::make_unique<TextItem>(std::move(text)));
+  std::string text = JoinTokens(token_texts);
+
+  // Add the transcription. Default = SpeechResultItem; legacy = TextItem.
+  if (want_speech) {
+    response.items.push_back(BuildSpeechResult(std::move(text), std::move(segments)));
+  } else {
+    response.items.push_back(std::make_unique<TextItem>(std::move(text)));
+  }
 
   // Set finish reason
   if (request.canceled) {
@@ -226,14 +301,24 @@ void AudioSession::ProcessStreamingAudio(const AudioItem& format_item, ItemQueue
   auto tokenizer_stream = OgaTokenizerStream::Create(Model().GetOgaTokenizer());
 
   auto streaming_callback = CreateCallbackHandler(request);
-  std::string full_text;
+  std::vector<std::string> token_texts;
+  token_texts.reserve(kInitialTokenCapacity);
+  const bool want_speech = !text_output_;
+  std::vector<std::unique_ptr<SpeechSegmentItem>> segments;
+  if (want_speech) {
+    segments.reserve(kInitialTokenCapacity);
+  }
+  std::vector<std::unique_ptr<SpeechSegmentItem>>* segments_ptr = want_speech ? &segments : nullptr;
+  // Streaming ASR has no text prompt (input is audio), so prompt_tokens stays 0.
+  // We track every decoded token (whether it produced visible text or not) as completion_tokens.
+  int completion_tokens = 0;
 
   // 3. If the AudioItem itself has initial data, process it first
   if (format_item.data && format_item.data_size > 0) {
     auto float_samples = ConvertS16LEToFloat(
         static_cast<const uint8_t*>(format_item.data), format_item.data_size);
     ProcessChunk(*processor, *generator, *tokenizer_stream,
-                 float_samples, full_text, streaming_callback, request);
+                 float_samples, token_texts, segments_ptr, streaming_callback, request, completion_tokens);
   }
 
   // 4. Read from queue until finished or cancelled
@@ -258,7 +343,7 @@ void AudioSession::ProcessStreamingAudio(const AudioItem& format_item, ItemQueue
         static_cast<const uint8_t*>(bytes.data), bytes.data_size);
 
     ProcessChunk(*processor, *generator, *tokenizer_stream,
-                 float_samples, full_text, streaming_callback, request);
+                 float_samples, token_texts, segments_ptr, streaming_callback, request, completion_tokens);
   }
 
   // 5. Flush remaining buffered audio
@@ -267,12 +352,20 @@ void AudioSession::ProcessStreamingAudio(const AudioItem& format_item, ItemQueue
 
     if (flush_tensors) {
       generator->SetInputs(*flush_tensors);
-      DecodeTokens(*generator, *tokenizer_stream, full_text, streaming_callback, request);
+      DecodeTokens(*generator, *tokenizer_stream, token_texts, segments_ptr, streaming_callback, request,
+                   completion_tokens);
     }
   }
 
-  // 6. Produce response
-  response.items.push_back(std::make_unique<TextItem>(std::move(full_text)));
+  // 6. Produce response. Default = SpeechResultItem carrying all per-token segments;
+  // legacy `response_format=text` keeps the bare TextItem output.
+  std::string full_text = JoinTokens(token_texts);
+  const size_t full_text_size = full_text.size();
+  if (want_speech) {
+    response.items.push_back(BuildSpeechResult(std::move(full_text), std::move(segments)));
+  } else {
+    response.items.push_back(std::make_unique<TextItem>(std::move(full_text)));
+  }
 
   if (request.canceled) {
     response.finish_reason = FOUNDRY_LOCAL_FINISH_NONE;
@@ -280,28 +373,36 @@ void AudioSession::ProcessStreamingAudio(const AudioItem& format_item, ItemQueue
     response.finish_reason = FOUNDRY_LOCAL_FINISH_STOP;
   }
 
+  response.usage.prompt_tokens = 0;
+  response.usage.completion_tokens = completion_tokens;
+  response.usage.total_tokens = completion_tokens;
+
   logger_.Log(LogLevel::Debug, fmt::format("Streaming audio transcription complete, text length: {}",
-                                           response.items.empty() ? 0 : full_text.size()));
+                                           response.items.empty() ? 0 : full_text_size));
 }
 
 void AudioSession::ProcessChunk(OgaStreamingProcessor& processor, OgaGenerator& generator,
                                 OgaTokenizerStream& tokenizer_stream,
                                 const std::vector<float>& samples,
-                                std::string& full_text,
+                                std::vector<std::string>& token_texts,
+                                std::vector<std::unique_ptr<SpeechSegmentItem>>* segments,
                                 const std::unique_ptr<CallbackHandler>& callback,
-                                const Request& request) {
+                                const Request& request,
+                                int& completion_tokens) {
   auto tensors = processor.Process(samples.data(), samples.size());
 
   if (tensors) {
     generator.SetInputs(*tensors);
-    DecodeTokens(generator, tokenizer_stream, full_text, callback, request);
+    DecodeTokens(generator, tokenizer_stream, token_texts, segments, callback, request, completion_tokens);
   }
 }
 
 void AudioSession::DecodeTokens(OgaGenerator& generator, OgaTokenizerStream& tokenizer_stream,
-                                std::string& full_text,
+                                std::vector<std::string>& token_texts,
+                                std::vector<std::unique_ptr<SpeechSegmentItem>>* segments,
                                 const std::unique_ptr<CallbackHandler>& callback,
-                                const Request& request) {
+                                const Request& request,
+                                int& completion_tokens) {
   while (!generator.IsDone() && !generator.IsSessionTerminated() && !request.canceled) {
     generator.GenerateNextToken();
     auto next_tokens = generator.GetNextTokens();
@@ -310,15 +411,25 @@ void AudioSession::DecodeTokens(OgaGenerator& generator, OgaTokenizerStream& tok
       continue;
     }
 
+    ++completion_tokens;
+
     int32_t token_id = next_tokens[0];
     const char* token_text = tokenizer_stream.Decode(token_id);
 
     if (token_text && token_text[0] != '\0') {
-      full_text += token_text;
+      if (segments) {
+        segments->push_back(MakeNoneSegment(token_text));
+      }
 
       if (callback) {
-        callback->PushItem(std::make_unique<TextItem>(std::string(token_text)));
+        if (segments) {
+          callback->PushItem(MakeNoneSegment(token_text));
+        } else {
+          callback->PushItem(std::make_unique<TextItem>(std::string(token_text)));
+        }
       }
+
+      token_texts.emplace_back(token_text);
     }
   }
 }
diff --git a/sdk_v2/cpp/src/inferencing/generative/audio/audio_session.h b/sdk_v2/cpp/src/inferencing/generative/audio/audio_session.h
index 24e0fc89c..46417f4ce 100644
--- a/sdk_v2/cpp/src/inferencing/generative/audio/audio_session.h
+++ b/sdk_v2/cpp/src/inferencing/generative/audio/audio_session.h
@@ -21,6 +21,7 @@ namespace fl {
 class GenAIModelInstance;
 struct AudioItem;
 struct ItemQueue;
+struct SpeechSegmentItem;
 
 /// Audio transcription session.
 /// Stateless — each request processes one audio file independently (no history).
@@ -59,16 +60,25 @@ class AudioSession : public Session {
   /// Feed float32 PCM samples to the StreamingProcessor. If a full encoder chunk is ready,
   /// set the tensors on the generator and decode tokens.
   /// IMPORTANT: DecodeTokens must drain to IsDone() before the next SetInputs() call.
+  /// `segments` (when non-null) receives one SpeechSegmentItem per token that decodes to non-empty
+  /// text, and the streaming callback (if any) gets its own segment with the same text. When null,
+  /// the callback receives plain TextItems (legacy `response_format=text` mode).
   void ProcessChunk(OgaStreamingProcessor& processor, OgaGenerator& generator,
                     OgaTokenizerStream& tokenizer_stream, const std::vector<float>& samples,
-                    std::string& full_text, const std::unique_ptr<CallbackHandler>& callback,
-                    const Request& request);
+                    std::vector<std::string>& token_texts,
+                    std::vector<std::unique_ptr<SpeechSegmentItem>>* segments,
+                    const std::unique_ptr<CallbackHandler>& callback,
+                    const Request& request,
+                    int& completion_tokens);
 
   /// Decode all available tokens from the generator. This MUST run to completion
   /// (IsDone() == true) before the next SetInputs() call.
   void DecodeTokens(OgaGenerator& generator, OgaTokenizerStream& tokenizer_stream,
-                    std::string& full_text, const std::unique_ptr<CallbackHandler>& callback,
-                    const Request& request);
+                    std::vector<std::string>& token_texts,
+                    std::vector<std::unique_ptr<SpeechSegmentItem>>* segments,
+                    const std::unique_ptr<CallbackHandler>& callback,
+                    const Request& request,
+                    int& completion_tokens);
 
   GenAIModelInstance& Model() { return model_; }
   const GenAIModelInstance& Model() const { return model_; }
@@ -79,6 +89,12 @@ class AudioSession : public Session {
   // moved-from instance so the refcount transfers cleanly across moves.
   bool owns_session_ = true;
   SearchOptions session_options_;
+
+  // Cached flag derived from session_options_["response_format"]: true when the session is
+  // configured for plain-text output (TextItem only). Updated in SetSessionOptionsImpl
+  // so each request just reads a bool instead of hitting the KeyValuePairs map. Output format
+  // is a session-level decision and per-request `response_format` is intentionally ignored.
+  bool text_output_ = false;
 };
 
 }  // namespace fl
diff --git a/sdk_v2/cpp/src/items/speech_result_item.h b/sdk_v2/cpp/src/items/speech_result_item.h
new file mode 100644
index 000000000..ac9f67d66
--- /dev/null
+++ b/sdk_v2/cpp/src/items/speech_result_item.h
@@ -0,0 +1,70 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+#pragma once
+
+#include "items/item.h"
+#include "items/speech_segment_item.h"
+
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace fl {
+
+/// Final aggregate result for a completed audio request.
+/// Output-only. `segments` entries are SpeechSegmentItems with kind = FINAL, or NONE when the
+/// producer has no partial/final distinction (AudioSession emits one NONE segment per token).
+struct SpeechResultItem : Item {
+  std::string text;
+  std::string language;  // empty when absent
+  std::optional<std::int64_t> duration_ms;  // unset -> FOUNDRY_LOCAL_DURATION_UNSET in the C ABI
+  std::vector<std::unique_ptr<SpeechSegmentItem>> segments;
+
+  SpeechResultItem() : Item(FOUNDRY_LOCAL_ITEM_SPEECH_RESULT) {}
+
+  explicit SpeechResultItem(std::string text_in)
+      : Item(FOUNDRY_LOCAL_ITEM_SPEECH_RESULT), text(std::move(text_in)) {}
+
+  SpeechResultItem(const SpeechResultItem&) = delete;
+  SpeechResultItem& operator=(const SpeechResultItem&) = delete;
+  SpeechResultItem(SpeechResultItem&&) = default;             // NOTE(review): a move copies cached_ verbatim; its
+  SpeechResultItem& operator=(SpeechResultItem&&) = default;  // pointers may be stale, so re-run Finalize() after.
+
+  /// Snapshot the current field values (and each segment's) into the cached
+  /// C ABI representation. Must be called after the item is fully populated
+  /// and before any `GetApiData` call. Calls `Finalize()` on every non-null child
+  /// segment (null entries stay null). Fields must not be mutated after Finalize().
+  void Finalize() {
+    cached_segment_ptrs_.clear();  // rebuilt on every call, so a repeated Finalize() does not duplicate entries
+    cached_segment_ptrs_.reserve(segments.size());
+    for (const auto& s : segments) {
+      if (s) {
+        s->Finalize();
+        cached_segment_ptrs_.push_back(s->AsApiType());
+      } else {
+        cached_segment_ptrs_.push_back(nullptr);  // keeps indices aligned with `segments`
+      }
+    }
+
+    cached_ = {};
+    cached_.version = FOUNDRY_LOCAL_API_VERSION;
+    cached_.text = text.c_str();  // borrowed from this item's `text`, not copied
+    cached_.language = language.empty() ? nullptr : language.c_str();  // empty -> NULL
+    cached_.duration_ms = duration_ms.value_or(FOUNDRY_LOCAL_DURATION_UNSET);
+    cached_.segments = cached_segment_ptrs_.empty() ? nullptr : cached_segment_ptrs_.data();
+    cached_.segments_count = cached_segment_ptrs_.size();
+  }
+
+  /// Copy the cached C ABI snapshot into `out`. Requires a prior Finalize().
+  /// Borrowed pointers in `out` remain valid for the lifetime of this item.
+  void GetApiData(flSpeechResultData& out) const { out = cached_; }
+
+ private:
+  flSpeechResultData cached_{};  // snapshot written by Finalize()
+  std::vector<const flItem*> cached_segment_ptrs_;  // backing array for cached_.segments
+};
+
+}  // namespace fl
diff --git a/sdk_v2/cpp/src/items/speech_segment_item.cc b/sdk_v2/cpp/src/items/speech_segment_item.cc
new file mode 100644
index 000000000..31eec3212
--- /dev/null
+++ b/sdk_v2/cpp/src/items/speech_segment_item.cc
@@ -0,0 +1,34 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+#include "items/speech_segment_item.h"
+
+namespace fl {
+
+void SpeechSegmentItem::Finalize() {
+  cached_words_.clear();  // rebuilt on every call, so a repeated Finalize() does not duplicate entries
+  cached_words_.reserve(words.size());
+  for (const auto& w : words) {
+    flSpeechWord aw{};
+    aw.version = FOUNDRY_LOCAL_API_VERSION;
+    aw.text = w.text.c_str();  // borrowed from `words`, not copied
+    aw.start_time_ms = w.start_time_ms.value_or(FOUNDRY_LOCAL_DURATION_UNSET);
+    aw.end_time_ms = w.end_time_ms.value_or(FOUNDRY_LOCAL_DURATION_UNSET);
+    aw.has_confidence = w.confidence.has_value();
+    aw.confidence = w.confidence.value_or(0.0f);  // placeholder 0 when unset; check has_confidence
+    aw.speaker_id = w.speaker_id.empty() ? nullptr : w.speaker_id.c_str();  // empty -> NULL
+    cached_words_.push_back(aw);
+  }
+
+  cached_ = {};
+  cached_.version = FOUNDRY_LOCAL_API_VERSION;
+  cached_.kind = kind;
+  cached_.text = text.c_str();  // borrowed from this item's `text`, not copied
+  cached_.start_time_ms = start_time_ms.value_or(FOUNDRY_LOCAL_DURATION_UNSET);
+  cached_.end_time_ms = end_time_ms.value_or(FOUNDRY_LOCAL_DURATION_UNSET);
+  cached_.utterance_start = utterance_start;
+  cached_.language = language.empty() ? nullptr : language.c_str();  // empty -> NULL
+  cached_.words = cached_words_.empty() ? nullptr : cached_words_.data();  // NULL when there are no words
+  cached_.words_count = cached_words_.size();
+}
+
+}  // namespace fl
diff --git a/sdk_v2/cpp/src/items/speech_segment_item.h b/sdk_v2/cpp/src/items/speech_segment_item.h
new file mode 100644
index 000000000..5d65d9ebc
--- /dev/null
+++ b/sdk_v2/cpp/src/items/speech_segment_item.h
@@ -0,0 +1,73 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+#pragma once
+
+#include "items/item.h"
+
+#include <cstdint>
+#include <optional>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace fl {
+
+/// One word within a speech segment. Output-only — produced by the SDK,
+/// never constructed by callers via the C ABI.
+struct SpeechWord {
+  std::string text;
+  std::optional<std::int64_t> start_time_ms;  // unset -> FOUNDRY_LOCAL_DURATION_UNSET in the C ABI
+  std::optional<std::int64_t> end_time_ms;    // unset -> FOUNDRY_LOCAL_DURATION_UNSET in the C ABI
+  std::optional<float> confidence;            // unset -> has_confidence = false in the C ABI
+  std::string speaker_id;  // empty when absent (exposed as NULL in the C ABI)
+};
+
+/// A recognized / translated speech segment.
+///
+/// Streaming model (see SPEECH_TYPES.md): zero-or-more PARTIAL segments for
+/// the current segment, then exactly one FINAL closes it. Segment identity
+/// is implicit in stream order; there is no segment id. `utterance_start`
+/// tags the first segment of a new utterance.
+///
+/// PARTIAL `text` is the cumulative current hypothesis for the segment, not
+/// a delta.
+///
+/// As an entry of a SpeechResultItem, `kind` is FINAL, or NONE when the producer
+/// has no partial/final distinction (AudioSession emits one NONE segment per token).
+struct SpeechSegmentItem : Item {
+  flSpeechSegmentKind kind = FOUNDRY_LOCAL_SPEECH_SEGMENT_NONE;
+  std::string text;
+  std::optional<std::int64_t> start_time_ms;  // unset -> FOUNDRY_LOCAL_DURATION_UNSET in the C ABI
+  std::optional<std::int64_t> end_time_ms;    // unset -> FOUNDRY_LOCAL_DURATION_UNSET in the C ABI
+  bool utterance_start = false;
+  std::vector<SpeechWord> words;
+  std::string language;  // empty when absent
+
+  SpeechSegmentItem() : Item(FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT) {}
+
+  SpeechSegmentItem(flSpeechSegmentKind kind_in, std::string text_in)
+      : Item(FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT),
+        kind(kind_in),
+        text(std::move(text_in)) {}
+
+  // Move-only; copying would invalidate cached C ABI pointers.
+  SpeechSegmentItem(const SpeechSegmentItem&) = delete;
+  SpeechSegmentItem& operator=(const SpeechSegmentItem&) = delete;
+  SpeechSegmentItem(SpeechSegmentItem&&) = default;             // NOTE(review): a move copies cached_ verbatim; its
+  SpeechSegmentItem& operator=(SpeechSegmentItem&&) = default;  // pointers may be stale, so re-run Finalize() after.
+
+  /// Snapshot the current field values into the cached C ABI representation.
+  /// Must be called after the item is fully populated and before any `GetApiData`
+  /// call; calling it again rebuilds the snapshot. Do not mutate fields after Finalize().
+  void Finalize();
+
+  /// Copy the cached C ABI snapshot into `out`. Requires a prior Finalize().
+  /// Borrowed pointers in `out` remain valid for the lifetime of this item.
+  void GetApiData(flSpeechSegmentData& out) const { out = cached_; }
+
+ private:
+  flSpeechSegmentData cached_{};  // snapshot written by Finalize()
+  std::vector<flSpeechWord> cached_words_;  // backing array for cached_.words
+};
+
+}  // namespace fl
diff --git a/sdk_v2/cpp/test/internal_api/c_api_test.cc b/sdk_v2/cpp/test/internal_api/c_api_test.cc
index a8f072410..576615c03 100644
--- a/sdk_v2/cpp/test/internal_api/c_api_test.cc
+++ b/sdk_v2/cpp/test/internal_api/c_api_test.cc
@@ -596,6 +596,34 @@ TEST(CApiTest, ItemReleaseNullIsNoOp) {
   api->GetItemApi()->Item_Release(nullptr);
 }
 
+// ========================================================================
+// Speech items (output-only) — Create is rejected; Get works on internally-built items
+// ========================================================================
+
+TEST(CApiTest, ItemCreateSpeechSegmentRejected) {
+  const flApi* api = GetApi();
+  const flItemApi* item_api = api->GetItemApi();
+
+  flItem* item = nullptr;
+  flStatus* status = item_api->Create(FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT, &item);
+  ASSERT_NE(status, nullptr);
+  EXPECT_EQ(api->Status_GetErrorCode(status), FOUNDRY_LOCAL_ERROR_INVALID_USAGE);
+  EXPECT_EQ(item, nullptr);
+  api->Status_Release(status);
+}
+
+TEST(CApiTest, ItemCreateSpeechResultRejected) {
+  const flApi* api = GetApi();
+  const flItemApi* item_api = api->GetItemApi();
+
+  flItem* item = nullptr;
+  flStatus* status = item_api->Create(FOUNDRY_LOCAL_ITEM_SPEECH_RESULT, &item);
+  ASSERT_NE(status, nullptr);
+  EXPECT_EQ(api->Status_GetErrorCode(status), FOUNDRY_LOCAL_ERROR_INVALID_USAGE);
+  EXPECT_EQ(item, nullptr);
+  api->Status_Release(status);
+}
+
 // ========================================================================
 // Inference API — Request / Response
 // ========================================================================
diff --git a/sdk_v2/cpp/test/internal_api/item_test.cc b/sdk_v2/cpp/test/internal_api/item_test.cc
index ca16167f4..75bc6aded 100644
--- a/sdk_v2/cpp/test/internal_api/item_test.cc
+++ b/sdk_v2/cpp/test/internal_api/item_test.cc
@@ -9,6 +9,8 @@
 #include "items/image_item.h"
 #include "items/item_queue.h"
 #include "items/message_item.h"
+#include "items/speech_result_item.h"
+#include "items/speech_segment_item.h"
 #include "items/tensor_item.h"
 #include "items/text_item.h"
 #include "items/tool_call_item.h"
@@ -16,6 +18,7 @@
 #include "inferencing/session/session.h"
 #include "exception.h"
 
+#include <foundry_local/foundry_local_cpp.h>
 #include <gtest/gtest.h>
 
 #include <atomic>
@@ -142,6 +145,18 @@ TEST(ItemCreateTest, UnknownTypeReturnsNullptr) {
   EXPECT_EQ(item, nullptr);
 }
 
+TEST(ItemCreateTest, SpeechSegmentNotCreatable) {
+  // SPEECH_SEGMENT is output-only; the factory does not produce one.
+  auto item = Item::Create(FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT);
+  EXPECT_EQ(item, nullptr);
+}
+
+TEST(ItemCreateTest, SpeechResultNotCreatable) {
+  // SPEECH_RESULT is output-only; the factory does not produce one.
+  auto item = Item::Create(FOUNDRY_LOCAL_ITEM_SPEECH_RESULT);
+  EXPECT_EQ(item, nullptr);
+}
+
 TEST(ItemCreateTest, InvalidEnumValueReturnsNullptr) {
   auto item = Item::Create(static_cast<flItemType>(9999));
   EXPECT_EQ(item, nullptr);
@@ -188,6 +203,160 @@ TEST(ToolResultItemTest, ConstructWithValues) {
   EXPECT_EQ(item.result, "72 degrees");
 }
 
+// ========================================================================
+// Speech items (output-only)
+// ========================================================================
+
+TEST(SpeechSegmentItemTest, DefaultsAreEmpty) {
+  SpeechSegmentItem item;
+  EXPECT_TRUE(item.type == FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT);
+  EXPECT_EQ(item.kind, FOUNDRY_LOCAL_SPEECH_SEGMENT_NONE);
+  EXPECT_TRUE(item.text.empty());
+  EXPECT_FALSE(item.start_time_ms.has_value());
+  EXPECT_FALSE(item.end_time_ms.has_value());
+  EXPECT_FALSE(item.utterance_start);
+  EXPECT_TRUE(item.words.empty());
+  EXPECT_TRUE(item.language.empty());
+}
+
+TEST(SpeechSegmentItemTest, ConstructWithKindAndText) {
+  SpeechSegmentItem item(FOUNDRY_LOCAL_SPEECH_SEGMENT_PARTIAL, "hello");
+  EXPECT_EQ(item.kind, FOUNDRY_LOCAL_SPEECH_SEGMENT_PARTIAL);
+  EXPECT_EQ(item.text, "hello");
+}
+
+TEST(SpeechSegmentItemTest, GetApiDataMapsOptionalsToSentinel) {
+  SpeechSegmentItem item(FOUNDRY_LOCAL_SPEECH_SEGMENT_FINAL, "the cat sat");
+  item.utterance_start = true;
+  item.start_time_ms = 100;
+  item.end_time_ms = 1500;
+  item.language = "en";
+  item.words.push_back({"the", 100, 200, 0.95f, ""});
+  item.words.push_back({"cat", std::nullopt, std::nullopt, std::nullopt, "spk_1"});
+  item.Finalize();
+
+  flSpeechSegmentData out{};
+  out.version = FOUNDRY_LOCAL_API_VERSION;
+  item.GetApiData(out);
+
+  EXPECT_EQ(out.kind, FOUNDRY_LOCAL_SPEECH_SEGMENT_FINAL);
+  EXPECT_STREQ(out.text, "the cat sat");
+  EXPECT_EQ(out.start_time_ms, 100);
+  EXPECT_EQ(out.end_time_ms, 1500);
+  EXPECT_TRUE(out.utterance_start);
+  EXPECT_STREQ(out.language, "en");
+  ASSERT_EQ(out.words_count, 2u);
+
+  EXPECT_STREQ(out.words[0].text, "the");
+  EXPECT_EQ(out.words[0].start_time_ms, 100);
+  EXPECT_EQ(out.words[0].end_time_ms, 200);
+  EXPECT_TRUE(out.words[0].has_confidence);
+  EXPECT_FLOAT_EQ(out.words[0].confidence, 0.95f);
+  EXPECT_EQ(out.words[0].speaker_id, nullptr);
+
+  EXPECT_STREQ(out.words[1].text, "cat");
+  EXPECT_EQ(out.words[1].start_time_ms, FOUNDRY_LOCAL_DURATION_UNSET);
+  EXPECT_EQ(out.words[1].end_time_ms, FOUNDRY_LOCAL_DURATION_UNSET);
+  EXPECT_FALSE(out.words[1].has_confidence);
+  EXPECT_STREQ(out.words[1].speaker_id, "spk_1");
+}
+
+TEST(SpeechSegmentItemTest, GetApiDataEmptyOptionalsBecomeSentinel) {
+  SpeechSegmentItem item;
+  item.Finalize();
+  flSpeechSegmentData out{};
+  out.version = FOUNDRY_LOCAL_API_VERSION;
+  item.GetApiData(out);
+
+  EXPECT_EQ(out.start_time_ms, FOUNDRY_LOCAL_DURATION_UNSET);
+  EXPECT_EQ(out.end_time_ms, FOUNDRY_LOCAL_DURATION_UNSET);
+  EXPECT_EQ(out.language, nullptr);
+  EXPECT_EQ(out.words, nullptr);
+  EXPECT_EQ(out.words_count, 0u);
+}
+
+TEST(SpeechResultItemTest, DefaultsAreEmpty) {
+  SpeechResultItem item;
+  EXPECT_TRUE(item.type == FOUNDRY_LOCAL_ITEM_SPEECH_RESULT);
+  EXPECT_TRUE(item.text.empty());
+  EXPECT_TRUE(item.language.empty());
+  EXPECT_FALSE(item.duration_ms.has_value());
+  EXPECT_TRUE(item.segments.empty());
+}
+
+TEST(SpeechResultItemTest, GetApiDataExposesSegmentsAsItemPointers) {
+  SpeechResultItem result("the cat sat");
+  result.language = "en";
+  result.duration_ms = 1500;
+  result.segments.push_back(std::make_unique<SpeechSegmentItem>(FOUNDRY_LOCAL_SPEECH_SEGMENT_FINAL, "the cat"));
+  result.segments.push_back(std::make_unique<SpeechSegmentItem>(FOUNDRY_LOCAL_SPEECH_SEGMENT_FINAL, "sat"));
+  result.Finalize();
+
+  flSpeechResultData out{};
+  out.version = FOUNDRY_LOCAL_API_VERSION;
+  result.GetApiData(out);
+
+  EXPECT_STREQ(out.text, "the cat sat");
+  EXPECT_STREQ(out.language, "en");
+  EXPECT_EQ(out.duration_ms, 1500);
+  ASSERT_EQ(out.segments_count, 2u);
+
+  // Each segment pointer should resolve back to a SPEECH_SEGMENT item.
+  for (size_t i = 0; i < out.segments_count; ++i) {
+    ASSERT_NE(out.segments[i], nullptr);
+    EXPECT_TRUE(reinterpret_cast<const Item*>(out.segments[i])->type == FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT);
+  }
+}
+
+// Wrapper translation: sentinel int64 ↔ std::optional<int64> and NULL ↔ std::optional<string_view>
+// (this test); segments exposed as a vector<Item> of SPEECH_SEGMENT items (the test after it).
+TEST(SpeechWrapperTest, ReadsThroughPublicCppApi) {
+  SpeechSegmentItem seg(FOUNDRY_LOCAL_SPEECH_SEGMENT_FINAL, "the cat sat");
+  seg.start_time_ms = 100;
+  seg.utterance_start = true;
+  // end_time_ms intentionally left unset.
+  seg.words.push_back({"the", 100, 200, 0.95f, ""});
+  seg.words.push_back({"cat", std::nullopt, std::nullopt, std::nullopt, ""});
+  seg.Finalize();
+
+  const fl::SpeechSegmentItem& cseg = seg;
+  foundry_local::Item view(*cseg.AsApiType());
+  auto content = view.GetSpeechSegment();
+  EXPECT_EQ(content.kind, FOUNDRY_LOCAL_SPEECH_SEGMENT_FINAL);
+  EXPECT_EQ(content.text, "the cat sat");
+  ASSERT_TRUE(content.start_time_ms.has_value());
+  EXPECT_EQ(*content.start_time_ms, 100);
+  EXPECT_FALSE(content.end_time_ms.has_value());
+  EXPECT_TRUE(content.utterance_start);
+  EXPECT_FALSE(content.language.has_value());
+  ASSERT_EQ(content.words.size(), 2u);
+
+  EXPECT_EQ(content.words[0].text, "the");
+  ASSERT_TRUE(content.words[0].confidence.has_value());
+  EXPECT_FLOAT_EQ(*content.words[0].confidence, 0.95f);
+  EXPECT_FALSE(content.words[0].speaker_id.has_value());
+
+  EXPECT_FALSE(content.words[1].start_time_ms.has_value());
+  EXPECT_FALSE(content.words[1].confidence.has_value());
+}
+
+TEST(SpeechWrapperTest, ResultExposesSegmentsAsItemViews) {
+  SpeechResultItem result("hi");
+  result.duration_ms = 1500;
+  result.segments.push_back(std::make_unique<SpeechSegmentItem>(FOUNDRY_LOCAL_SPEECH_SEGMENT_FINAL, "hi"));
+  result.Finalize();
+
+  const fl::SpeechResultItem& cresult = result;
+  foundry_local::Item view(*cresult.AsApiType());
+  auto content = view.GetSpeechResult();
+  EXPECT_EQ(content.text, "hi");
+  ASSERT_TRUE(content.duration_ms.has_value());
+  EXPECT_EQ(*content.duration_ms, 1500);
+  ASSERT_EQ(content.segments.size(), 1u);
+  EXPECT_EQ(content.segments[0].GetType(), FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT);
+  EXPECT_EQ(content.segments[0].GetSpeechSegment().text, "hi");
+}
+
 TEST(JsonItemTest, OpenAIJsonTextItem) {
   TextItem item(R"({"model":"gpt-4","input":"hello"})", FOUNDRY_LOCAL_TEXT_ITEM_TYPE_OPENAI_JSON);
   EXPECT_TRUE(item.type == FOUNDRY_LOCAL_ITEM_TEXT);
diff --git a/sdk_v2/cpp/test/sdk_api/audio_transcriptions_test.cc b/sdk_v2/cpp/test/sdk_api/audio_transcriptions_test.cc
index 1c6f61fc2..8a6496410 100644
--- a/sdk_v2/cpp/test/sdk_api/audio_transcriptions_test.cc
+++ b/sdk_v2/cpp/test/sdk_api/audio_transcriptions_test.cc
@@ -135,6 +135,79 @@ TEST_F(AudioSessionFixture, TranscribeWithSessionLevelOptions) {
   ExpectTranscriptionContent(text);
 }
 
+TEST_F(AudioSessionFixture, TranscribeProducesSpeechResultItem) {
+  using namespace foundry_local;
+
+  Request request;
+  request.AddItem(Item::AudioFromUri(audio_file_path()));
+
+  AudioSession session(audio_model());
+  Response response = session.ProcessRequest(request);
+
+  EXPECT_EQ(response.GetFinishReason(), FOUNDRY_LOCAL_FINISH_STOP);
+
+  const Item* speech_item = nullptr;
+  for (const auto& item : response.GetItems()) {
+    if (item.GetType() == FOUNDRY_LOCAL_ITEM_SPEECH_RESULT) {
+      speech_item = &item;
+      break;
+    }
+  }
+  ASSERT_NE(speech_item, nullptr) << "Expected a SPEECH_RESULT item by default";
+
+  auto result = speech_item->GetSpeechResult();
+  std::string result_text(result.text);
+  EXPECT_FALSE(result_text.empty());
+  ExpectTranscriptionContent(result_text);
+  // One segment per decoded token, kind NONE. Concatenated text matches result.text.
+  ASSERT_FALSE(result.segments.empty());
+  std::string concatenated;
+  for (const auto& seg_item : result.segments) {
+    auto seg = seg_item.GetSpeechSegment();
+    EXPECT_EQ(seg.kind, FOUNDRY_LOCAL_SPEECH_SEGMENT_NONE);
+    EXPECT_FALSE(seg.start_time_ms.has_value());
+    EXPECT_FALSE(seg.end_time_ms.has_value());
+    EXPECT_TRUE(seg.words.empty());
+    concatenated.append(seg.text.data(), seg.text.size());
+  }
+  EXPECT_EQ(concatenated, result_text);
+  // Detected source language is not reported by GenAI today — reserved for future translation.
+  EXPECT_FALSE(result.language.has_value());
+  EXPECT_FALSE(result.duration_ms.has_value());
+}
+
+TEST_F(AudioSessionFixture, TranscribeWithResponseFormatTextProducesTextItem) {
+  using namespace foundry_local;
+
+  Request request;
+  request.AddItem(Item::AudioFromUri(audio_file_path()));
+
+  // response_format is a session-level option, so it is applied via SetOptions(), not on the request.
+  AudioSession session(audio_model());
+  RequestOptions session_opts;
+  session_opts.additional_options.Set("response_format", "text");
+  session.SetOptions(session_opts);
+
+  Response response = session.ProcessRequest(request);
+
+  EXPECT_EQ(response.GetFinishReason(), FOUNDRY_LOCAL_FINISH_STOP);
+
+  bool saw_text = false;
+  bool saw_speech = false;
+  for (const auto& item : response.GetItems()) {
+    if (item.GetType() == FOUNDRY_LOCAL_ITEM_TEXT) {
+      saw_text = true;
+    } else if (item.GetType() == FOUNDRY_LOCAL_ITEM_SPEECH_RESULT) {
+      saw_speech = true;
+    }
+  }
+  EXPECT_TRUE(saw_text) << "response_format=text should produce a TEXT item";
+  EXPECT_FALSE(saw_speech) << "response_format=text should NOT produce a SPEECH_RESULT item";
+
+  std::string text = CollectResponseText(response);
+  ExpectTranscriptionContent(text);
+}
+
 // ---- Error paths — exercise ProcessRequestImpl validation branches. ----
 
 TEST_F(AudioSessionFixture, RejectsEmptyRequest) {
diff --git a/sdk_v2/cpp/test/sdk_api/model_fixture.h b/sdk_v2/cpp/test/sdk_api/model_fixture.h
index f26744da9..312abce51 100644
--- a/sdk_v2/cpp/test/sdk_api/model_fixture.h
+++ b/sdk_v2/cpp/test/sdk_api/model_fixture.h
@@ -63,6 +63,9 @@ inline std::string CollectResponseText(const foundry_local::Response& response)
       text += item.GetText().text;
     } else if (item.GetType() == FOUNDRY_LOCAL_ITEM_MESSAGE) {
       text += CollectMessageText(item.GetMessage());
+    } else if (item.GetType() == FOUNDRY_LOCAL_ITEM_SPEECH_RESULT) {
+      auto sr = item.GetSpeechResult();
+      text.append(sr.text.data(), sr.text.size());
     }
   }
 
diff --git a/sdk_v2/cpp/test/sdk_api/streaming_audio_test.cc b/sdk_v2/cpp/test/sdk_api/streaming_audio_test.cc
index 290604838..be6981391 100644
--- a/sdk_v2/cpp/test/sdk_api/streaming_audio_test.cc
+++ b/sdk_v2/cpp/test/sdk_api/streaming_audio_test.cc
@@ -260,12 +260,13 @@ TEST_F(StreamingAudioFixture, StreamingCallbackReceivesTokens) {
     // Wrap in Item for RAII release and checked accessors.
     Item item(*raw_item);
 
-    if (item.GetType() == FOUNDRY_LOCAL_ITEM_TEXT) {
-      auto text = item.GetText().text;
-      if (!text.empty()) {
-        std::lock_guard<std::mutex> lock(text_mutex);
-        streamed_text += text;
-      }
+    // Default audio output is SpeechSegmentItem per token (we never set response_format=text
+    // on this session, so a TextItem would be unexpected).
+    EXPECT_EQ(item.GetType(), FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT);
+    auto seg = item.GetSpeechSegment();
+    if (!seg.text.empty()) {
+      std::lock_guard<std::mutex> lock(text_mutex);
+      streamed_text.append(seg.text.data(), seg.text.size());
     }
 
     callback_count++;