microsoft · skottmckay · May 30, 2026 · Jun 2, 2026 · Jun 2, 2026
diff --git a/sdk_v2/cpp/CMakeLists.txt b/sdk_v2/cpp/CMakeLists.txt
@@ -130,6 +130,7 @@ set(FOUNDRY_LOCAL_SOURCES
     src/items/item.cc
     src/items/image_item.cc
     src/items/message_item.cc
+    src/items/speech_segment_item.cc
     src/catalog/base_model_catalog.cc
     src/catalog/azure_model_catalog.cc
     src/catalog/azure_catalog_models.cc

diff --git a/sdk_v2/cpp/docs/SPEECH_TYPES.md b/sdk_v2/cpp/docs/SPEECH_TYPES.md
@@ -0,0 +1,127 @@
+# Speech Types — Design
+
+Native SDK types returned by `AudioSession` for speech-to-text (file and live),
+translation, and ASR scenarios. References: OpenAI `verbose_json` / Realtime
+transcription events, Azure Speech SDK recognition results.
+
+## Design rules
+
+- **Output-only types.** These types are produced by `AudioSession` and flow out
+  through the streaming callback and the final `Response`. Callers never
+  construct them as inputs. The C ABI therefore exposes only Get accessors —
+  no Set functions, no `Item_Create` for these types.
+- **One set of types covers transcription, translation, and ASR.** Task
+  selection (transcribe vs translate, target language) is a Request parameter,
+  not a type variant. `text` is the recognized-or-translated string either way.
+- **One shared segment type** for both streaming events and final-result entries,
+  discriminated by `kind`.
+- **No event wrapper, no `event_id`, no segment `id`.** Ordering is a property of
+  the callback channel; segment identity is implicit in stream order (zero-or-more
+  `kPartial` for the current segment, then one `kFinal` closes it). A web service
+  above the SDK can add envelope/sequence metadata.
+- **`text` on `kPartial` is the cumulative current hypothesis for the segment**
+  (Azure-style), not a delta-since-last-event. A delta is recoverable by diffing
+  against the previous hypothesis.
+- **`utterance_start` is a boolean on the segment.** Knowable at emission time
+  (VAD says "speech started" → producer tags the first `kPartial` of the new
+  segment). There is no `utterance_end` field: end-of-utterance can't be confirmed
+  at `kFinal` emission time without delaying the `kFinal` by the silence threshold.
+  Instead, end is implicit — the next `utterance_start` marks it (consumer
+  infers end at the previous `kFinal.end_time`), a future `kSilence` event
+  marks it explicitly, or the final `SpeechResult` marks it for file
+  transcription.
+- **Time as `int64_t` milliseconds.** Must survive the C ABI. Typedef'd so the
+  unit is legible and changeable in one place.
+- **Two C ABI item types** — one for streaming segments, one for the final
+  aggregate. Both additive to existing items.
+
+## Types
+
+```cpp
+namespace fl {
+
+using DurationMs = std::int64_t;  // milliseconds; C ABI-safe
+
+enum class SpeechSegmentKind : int {
+  kNone     = 0,   // entry in a final aggregate result
+  kPartial  = 1,   // streaming: hypothesis for the current segment; may change
+  kFinal    = 2,   // streaming: segment is stable, or an entry in the final result
+};
+
+struct SpeechWord {
+  std::string text;
+  std::optional<DurationMs> start_time;
+  std::optional<DurationMs> end_time;
+  std::optional<float> confidence;        // 0..1
+  std::optional<std::string> speaker_id;
+};
+
+struct SpeechSegment {
+  SpeechSegmentKind kind = SpeechSegmentKind::kNone;
+
+  std::string text;                       // for kPartial: cumulative current hypothesis
+  std::optional<DurationMs> start_time;
+  std::optional<DurationMs> end_time;
+
+  // Utterance start signal — tagged on the first kPartial of a new utterance.
+  // Knowable at emission time. End-of-utterance is implicit (see design rules).
+  bool utterance_start = false;
+
+  std::vector<SpeechWord> words;          // word-timestamp opt-in
+
+  std::optional<std::string> language;    // per-segment, for code-switching
+};
+
+struct SpeechResult {
+  std::string text;                       // concatenated final transcript
+  std::optional<std::string> language;    // detected source language
+  std::optional<DurationMs> duration;     // total audio duration
+  std::vector<SpeechSegment> segments;    // entries are kFinal or kNone
+};
+
+}  // namespace fl
+```
+
+## C ABI item types
+
+```c
+FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT = 31,  // pushed via streaming callback
+FOUNDRY_LOCAL_ITEM_SPEECH_RESULT  = 32,  // final aggregate in response.items
+```
+
+These types are output-only — the ABI exposes `GetSpeechSegment` /
+`GetSpeechResult` accessors, but no setters and no `Item_Create` support.
+Attempting to create one returns `FOUNDRY_LOCAL_ERROR_INVALID_USAGE`.
+
+`TextItem` remains the trivial fallback for `response_format: "text"`.
+
+## V1 scope
+
+Populated in the initial implementation:
+
+- `SpeechSegmentKind`: `kNone`, `kPartial`, `kFinal`
+- `SpeechSegment`: `kind`, `text`, `start_time`, `end_time`,
+  `utterance_start` (defaulted; populated when computable)
+- `SpeechResult`: `text`, `language`, `duration`, `segments`
+
+Defined in the header but unpopulated until a producer exists:
+
+- `SpeechWord` and `SpeechSegment::words` (word-timestamp opt-in)
+- `confidence` on word
+- `language` on segment
+- `speaker_id` on word
+
+## Growth headroom (not built)
+
+- **Diarization**: `speaker_id` already present on word.
+- **N-best alternatives**: future `std::vector<SpeechAlternative> alternatives`
+  on `SpeechSegment`.
+- **Per-segment diagnostics** (Whisper `avg_logprob`, `no_speech_prob`,
+  `compression_ratio`; multi-channel `channel`; etc.): pushed as a separate
+  diagnostic item type rather than overloading `SpeechSegment`.
+- **OpenAI `verbose_json` compatibility**: handled by a
+  `ToOpenAIVerboseJson(const SpeechResult&)` adapter in
+  `contracts/audio_transcriptions.*`, not by changing native types.
+
+Multi-target translation in a single pass is intentionally out of scope —
+that's a server-side concern, not a local-inferencing one.
diff --git a/sdk_v2/cpp/examples/realtime_audio/main.cc b/sdk_v2/cpp/examples/realtime_audio/main.cc
@@ -46,7 +46,12 @@ void RealtimeAudioChat(IModel& model, const std::string& audio_path) {
     flItem* raw_item = nullptr;
     if (item_api->ItemQueue_TryPop(event.item_queue, &raw_item)) {
       Item item(*raw_item);
-      if (item.GetType() == FOUNDRY_LOCAL_ITEM_TEXT) {
+      if (item.GetType() == FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT) {
+        auto seg = item.GetSpeechSegment();
+        std::cout.write(seg.text.data(), seg.text.size());
+        std::cout.flush();
+      } else if (item.GetType() == FOUNDRY_LOCAL_ITEM_TEXT) {
+        // `response_format=text` session option produces a simple TextItem stream.
         std::cout << item.GetText().text << std::flush;
       } else {
         std::cerr << "Unexpected item type" << std::endl;
@@ -133,9 +138,18 @@ void RealtimeAudioChat(IModel& model, const std::string& audio_path) {
             << ", completion: " << usage.completion_tokens
             << ", total: " << usage.total_tokens << "\n";
 
-  // 8. The full response items are also available.
-  for (const auto& item : response.GetItems()) {
-    if (item.GetType() == FOUNDRY_LOCAL_ITEM_TEXT) {
+  // 8. The full response is a single item — SpeechResultItem by default, or TextItem
+  // if the session was configured with `response_format=text`.
+  const auto& items = response.GetItems();
+  if (!items.empty()) {
+    const auto& item = items.front();
+    if (item.GetType() == FOUNDRY_LOCAL_ITEM_SPEECH_RESULT) {
+      auto result = item.GetSpeechResult();
+      std::cout << "Full response: ";
+      std::cout.write(result.text.data(), result.text.size());
+      std::cout << "\n";
+      std::cout << "Segments: " << result.segments.size() << "\n";
+    } else if (item.GetType() == FOUNDRY_LOCAL_ITEM_TEXT) {
       std::cout << "Full response: " << item.GetText().text << "\n";
     }
   }

diff --git a/sdk_v2/cpp/include/foundry_local/foundry_local_c.h b/sdk_v2/cpp/include/foundry_local/foundry_local_c.h
@@ -304,12 +304,15 @@ typedef enum flItemType {
   FOUNDRY_LOCAL_ITEM_BYTES = 1,  // Raw bytes with an item type tag.
   FOUNDRY_LOCAL_ITEM_TENSOR = 10,
   FOUNDRY_LOCAL_ITEM_TEXT = 20,
-  FOUNDRY_LOCAL_ITEM_MESSAGE = 21,       // role + content string.
-  FOUNDRY_LOCAL_ITEM_IMAGE = 25,         // Image input/output. Could be bytes or URI (file, memory address, url, etc.)
-  FOUNDRY_LOCAL_ITEM_AUDIO = 30,         // Audio input/output. Could be bytes or URI.
-  FOUNDRY_LOCAL_ITEM_TOOL_CALL = 100,    // request to call tool: call id, tool name, arguments
-  FOUNDRY_LOCAL_ITEM_TOOL_RESULT = 101,  // response from tool: call id, result
-  FOUNDRY_LOCAL_ITEM_QUEUE = 200,        // An item containing an flItemQueue of sub-items. Turtles all the way down.
+  FOUNDRY_LOCAL_ITEM_MESSAGE = 21,         // role + content string.
+  FOUNDRY_LOCAL_ITEM_IMAGE = 25,           // Image input/output. Could be bytes or URI (file, memory address, url, etc.)
+  FOUNDRY_LOCAL_ITEM_AUDIO = 30,           // Audio input/output. Could be bytes or URI.
+  FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT = 31,  // Output-only. Recognized/translated speech segment.
+                                           // Pushed via streaming callback during AudioSession.
+  FOUNDRY_LOCAL_ITEM_SPEECH_RESULT = 32,   // Output-only. Final aggregate from AudioSession.
+  FOUNDRY_LOCAL_ITEM_TOOL_CALL = 100,      // request to call tool: call id, tool name, arguments
+  FOUNDRY_LOCAL_ITEM_TOOL_RESULT = 101,    // response from tool: call id, result
+  FOUNDRY_LOCAL_ITEM_QUEUE = 200,          // An item containing an flItemQueue of sub-items. Turtles all the way down.
 } flItemType;
 
 typedef enum flTextItemType {
@@ -492,6 +495,70 @@ typedef struct flToolResultData {
   /* V2 fields go here. */
 } flToolResultData;
 
+/* -----------------------------------------------------------------------
+ * Speech recognition output types.
+ *
+ * SPEECH_SEGMENT and SPEECH_RESULT items are produced by AudioSession and
+ * delivered via the streaming callback / final Response. Callers never
+ * construct them — the ABI exposes only Get accessors.
+ *
+ * Streaming model: zero-or-more kPartial segments for the current utterance,
+ * then exactly one kFinal closes it. Segment identity is implicit in stream
+ * order; there is no segment id.
+ *
+ * kPartial text is the cumulative current hypothesis for the segment, not a
+ * delta-since-last-event. Consumers replace by stream position.
+ * ----------------------------------------------------------------------- */
+
+/// Sentinel for absent flSpeechWord / flSpeechSegmentData / flSpeechResultData
+/// time fields. Required because the C ABI cannot carry std::optional.
+#define FOUNDRY_LOCAL_DURATION_UNSET INT64_MIN
+
+typedef enum flSpeechSegmentKind {
+  FOUNDRY_LOCAL_SPEECH_SEGMENT_NONE = 0,     ///< Entry in a final aggregate result.
+  FOUNDRY_LOCAL_SPEECH_SEGMENT_PARTIAL = 1,  ///< Streaming: hypothesis for the current segment; may change.
+  FOUNDRY_LOCAL_SPEECH_SEGMENT_FINAL = 2,    ///< Streaming: segment is stable, or entry in the final result.
+} flSpeechSegmentKind;
+
+/// Versioned struct for a single word within a speech segment.
+/// Absent fields: times = FOUNDRY_LOCAL_DURATION_UNSET, strings = NULL, confidence gated by has_confidence.
+typedef struct flSpeechWord {
+  uint32_t version;        ///< Set to FOUNDRY_LOCAL_API_VERSION.
+  const char* text;        ///< UTF-8 word text. Always populated.
+  int64_t start_time_ms;   ///< Milliseconds from audio start. FOUNDRY_LOCAL_DURATION_UNSET if absent.
+  int64_t end_time_ms;     ///< Milliseconds from audio start. FOUNDRY_LOCAL_DURATION_UNSET if absent.
+  bool has_confidence;     ///< True iff `confidence` is populated.
+  float confidence;        ///< 0..1 model posterior. Valid iff has_confidence is true.
+  const char* speaker_id;  ///< Diarization label. NULL if absent.
+  /* V2 fields go here. */
+} flSpeechWord;
+
+/// Versioned struct for SPEECH_SEGMENT item content (output-only).
+typedef struct flSpeechSegmentData {
+  uint32_t version;           ///< Set to FOUNDRY_LOCAL_API_VERSION.
+  flSpeechSegmentKind kind;   ///< NONE / PARTIAL / FINAL.
+  const char* text;           ///< UTF-8. For PARTIAL: cumulative current hypothesis. May be NULL/"".
+  int64_t start_time_ms;      ///< Milliseconds from audio start. FOUNDRY_LOCAL_DURATION_UNSET if absent.
+  int64_t end_time_ms;        ///< Milliseconds from audio start. FOUNDRY_LOCAL_DURATION_UNSET if absent.
+  bool utterance_start;       ///< True on the first PARTIAL of a new utterance. End is implicit.
+  const flSpeechWord* words;  ///< Borrowed array. Length = words_count.
+  size_t words_count;         ///< Number of entries in `words`.
+  const char* language;       ///< Per-segment language for code-switching. NULL if absent.
+  /* V2 fields go here. */
+} flSpeechSegmentData;
+
+/// Versioned struct for SPEECH_RESULT item content (output-only).
+/// `segments` entries are SPEECH_SEGMENT items with kind = FINAL or NONE.
+typedef struct flSpeechResultData {
+  uint32_t version;               ///< Set to FOUNDRY_LOCAL_API_VERSION.
+  const char* text;               ///< UTF-8 concatenated final transcript. May be NULL/"".
+  const char* language;           ///< Detected source language. NULL if absent.
+  int64_t duration_ms;            ///< Total audio duration. FOUNDRY_LOCAL_DURATION_UNSET if absent.
+  const flItem* const* segments;  ///< Borrowed array of SPEECH_SEGMENT items. Length = segments_count.
+  size_t segments_count;          ///< Number of entries in `segments`.
+  /* V2 fields go here. */
+} flSpeechResultData;
+
 /// Versioned struct that we pass to a callback during Session::ProcessRequest.
 /// Guarantees ordering and synchronization via the flItemQueue.
 typedef struct flStreamingCallbackData {
@@ -707,6 +774,20 @@ struct flItemApi {
   /// Borrowed pointers in the returned struct are owned by the item and valid until the item is released.
   FL_API_STATUS(GetToolResult, _In_ const flItem* item, _Out_ flToolResultData* out_tool_result);
 
+  /// Get content of a SPEECH_SEGMENT item into a versioned struct.
+  /// Output-only type — there is no SetSpeechSegment. Item_Create with
+  /// FOUNDRY_LOCAL_ITEM_SPEECH_SEGMENT returns FOUNDRY_LOCAL_ERROR_INVALID_USAGE.
+  /// Borrowed pointers in the returned struct (text, words array, language) are owned by the item and
+  /// valid until the item is released.
+  FL_API_STATUS(GetSpeechSegment, _In_ const flItem* item, _Out_ flSpeechSegmentData* out_segment);
+
+  /// Get content of a SPEECH_RESULT item into a versioned struct.
+  /// Output-only type — there is no SetSpeechResult. Item_Create with
+  /// FOUNDRY_LOCAL_ITEM_SPEECH_RESULT returns FOUNDRY_LOCAL_ERROR_INVALID_USAGE.
+  /// Borrowed pointers in the returned struct (text, language, segments array) are owned by the item and
+  /// valid until the item is released. Each entry of `segments` is a SPEECH_SEGMENT item.
+  FL_API_STATUS(GetSpeechResult, _In_ const flItem* item, _Out_ flSpeechResultData* out_result);
+
   /// Get metadata from the item (read-only).
   /// Returned flKeyValuePairs is owned by the item and valid until the item is released — do not release it.
   FL_API_STATUS(GetMetadata, _In_ const flItem* item, _Outptr_ const flKeyValuePairs** out_metadata);

diff --git a/sdk_v2/cpp/include/foundry_local/foundry_local_cpp.h b/sdk_v2/cpp/include/foundry_local/foundry_local_cpp.h
@@ -458,6 +458,40 @@ struct ToolResultContent {
   std::string_view result;
 };
 
+/// One word in a SPEECH_SEGMENT. Optional fields use std::optional / empty string_view.
+struct SpeechWord {
+  std::string_view text;                       ///< Word text (see flSpeechWord::text).
+  std::optional<int64_t> start_time_ms;        ///< See flSpeechWord::start_time_ms.
+  std::optional<int64_t> end_time_ms;          ///< See flSpeechWord::end_time_ms.
+  std::optional<float> confidence;             ///< See flSpeechWord::confidence.
+  std::optional<std::string_view> speaker_id;  ///< See flSpeechWord::speaker_id.
+};
+
+/// Content returned from a SPEECH_SEGMENT item (output-only).
+///
+/// See SPEECH_TYPES.md for the streaming model. PARTIAL `text` is the
+/// cumulative current hypothesis for the segment, not a delta. As an entry of
+/// a SpeechResultContent, `kind` is FINAL (or NONE for a single non-segmented
+/// transcript).
+struct SpeechSegmentContent {
+  flSpeechSegmentKind kind = FOUNDRY_LOCAL_SPEECH_SEGMENT_NONE;  ///< Defaulted so it is never indeterminate.
+  std::string_view text;
+  std::optional<int64_t> start_time_ms;
+  std::optional<int64_t> end_time_ms;
+  bool utterance_start = false;  ///< Defaulted to false so a default-constructed value is well-defined.
+  std::vector<SpeechWord> words;
+  std::optional<std::string_view> language;
+};
+
+/// Content returned from a SPEECH_RESULT item (output-only).
+/// `segments` exposes non-owning views over segment items owned by the result.
+struct SpeechResultContent {
+  std::string_view text;                     ///< Concatenated final transcript.
+  std::optional<std::string_view> language;  ///< Detected source language, when available.
+  std::optional<int64_t> duration_ms;        ///< Total audio duration, when available.
+  std::vector<Item> segments;                ///< Each entry is a SPEECH_SEGMENT item.
+};
+
 // ===========================================================================
 // Item
 // ===========================================================================
@@ -485,6 +519,8 @@ class Item {
   MessageContent GetMessage() const;
   ToolCallContent GetToolCall() const;
   ToolResultContent GetToolResult() const;
+  SpeechSegmentContent GetSpeechSegment() const;
+  SpeechResultContent GetSpeechResult() const;
 
   const flItem* native_handle() const noexcept { return handle_.get(); }
   flItem* native_handle_mutable() { return handle_.get_mutable(); }
@@ -973,6 +1009,24 @@ class ChatSession : public Session {
   void UndoTurns(size_t count);
 };
 
+/// Session for automatic-speech-recognition (transcription) models.
+///
+/// Output format is controlled by the session option `response_format`:
+///   - Unset (the default), or any value other than "text": each request produces a
+///     SpeechResultItem and the streaming callback receives SpeechSegmentItems
+///     (one per decoded token).
+///   - "text": plain-text output — each request produces a TextItem and the
+///     streaming callback receives TextItems.
+///
+/// Set this once on the session via `SetOptions` before issuing requests:
+///
+///     AudioSession session(model);
+///     RequestOptions opts;
+///     opts.additional_options.Set("response_format", "text");
+///     session.SetOptions(opts);
+///
+/// Output format is a session-level decision and is intentionally NOT honoured
+/// when set on a per-request `RequestOptions`.
 class AudioSession : public Session {
  public:
   explicit AudioSession(IModel& model);