Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 38 additions & 11 deletions agent_cli/agents/transcribe.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ class TranscriptResult(TypedDict, total=False):
transcript: str | None
saved_recording_path: Path | None
llm_enabled: bool
error: str | None


SYSTEM_PROMPT = """
Expand Down Expand Up @@ -343,6 +344,14 @@ def _option_default(value: Any) -> Any:
return getattr(value, "default", value)


def _transcript_result_error(result: object) -> str | None:
"""Return a structured transcript error when one is present."""
if not isinstance(result, dict):
return None
error = result.get("error")
return error if isinstance(error, str) and error else None


async def _async_main( # noqa: PLR0912, PLR0915, C901
*,
extra_instructions: str | None,
Expand Down Expand Up @@ -465,17 +474,27 @@ def _set_saved_recording_path(path: Path) -> None:
and provider_cfg.asr_provider == "wyoming"
else None
)
transcript = await live_transcriber(
logger=LOGGER,
stop_event=stop_event,
quiet=general_cfg.quiet,
live=live,
save_recording=save_recording,
extra_instructions=extra_instructions,
recording_path_callback=_set_saved_recording_path,
audio_level_callback=audio_level_callback,
live_preview_config=live_preview_config,
)
try:
transcript = await live_transcriber(
logger=LOGGER,
stop_event=stop_event,
quiet=general_cfg.quiet,
live=live,
save_recording=save_recording,
extra_instructions=extra_instructions,
recording_path_callback=_set_saved_recording_path,
audio_level_callback=audio_level_callback,
live_preview_config=live_preview_config,
)
except asr.SilentAudioCaptureError as exc:
message = str(exc)
LOGGER.warning(message)
return TranscriptResult(
raw_transcript=None,
transcript=None,
llm_enabled=False,
error=message,
)

elapsed = time.monotonic() - start_time

Expand Down Expand Up @@ -898,6 +917,10 @@ def transcribe( # noqa: PLR0912, PLR0911, PLR0915, C901
raise typer.Exit(1) from None
if json_output:
print(json.dumps(result))
if error := _transcript_result_error(result):
if not json_output:
print(error)
raise typer.Exit(1)
return

# Normal recording mode
Expand Down Expand Up @@ -994,3 +1017,7 @@ def transcribe( # noqa: PLR0912, PLR0911, PLR0915, C901
raise typer.Exit(1) from None
if json_output:
print(json.dumps(result))
if error := _transcript_result_error(result):
if not json_output:
print(error)
raise typer.Exit(1)
66 changes: 62 additions & 4 deletions agent_cli/services/asr.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@
import asyncio
import io
import json
import struct
import wave
from contextlib import suppress
from dataclasses import dataclass
from datetime import UTC, datetime
from functools import partial
from pathlib import Path
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, NoReturn

from agent_cli import constants
from agent_cli.core.audio import (
Expand Down Expand Up @@ -71,6 +72,51 @@ def min_audio_bytes(self) -> int:
)


@dataclass
class AudioCaptureStats:
"""Basic signal stats for captured little-endian int16 PCM audio."""

byte_count: int = 0
peak_sample: int = 0

@property
def is_all_silence(self) -> bool:
"""Return true when capture produced bytes but every sample was zero."""
return self.byte_count > 0 and self.peak_sample == 0

def observe(self, chunk: bytes) -> None:
"""Update stats from one PCM chunk."""
self.byte_count += len(chunk)
self.peak_sample = max(self.peak_sample, _peak_int16_sample(chunk))


class SilentAudioCaptureError(RuntimeError):
"""Raised when microphone capture returns only digital silence."""

DEFAULT_MESSAGE = (
"Microphone capture returned only digital silence. "
"On macOS, allow Microphone permission for Agent CLI, "
"then restart Agent CLI."
)

def __init__(self, message: str | None = None) -> None:
"""Create an error with the default silent-capture guidance."""
super().__init__(message or self.DEFAULT_MESSAGE)


def _raise_silent_audio_capture_error() -> NoReturn:
raise SilentAudioCaptureError
Comment on lines +107 to +108

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 _raise_silent_audio_capture_error raises the class object rather than an instance (raise SilentAudioCaptureError vs raise SilentAudioCaptureError()). Python 3 instantiates the class with no arguments when it sees raise ExceptionClass, so the DEFAULT_MESSAGE is used correctly and there is no runtime bug. However, raising a class directly is unconventional, surprises some static analysers, and adds an unnecessary call frame to every traceback. Since the function has no other callers and provides no testability benefit, it can be collapsed to a direct raise at the call site.

Suggested change
def _raise_silent_audio_capture_error() -> NoReturn:
raise SilentAudioCaptureError
def _raise_silent_audio_capture_error() -> NoReturn:
raise SilentAudioCaptureError()

Note: If this suggestion doesn't match your team's coding style, reply to this and let me know. I'll remember it for next time!



def _peak_int16_sample(chunk: bytes) -> int:
sample_count = len(chunk) // constants.AUDIO_FORMAT_WIDTH
if sample_count <= 0:
return 0
pcm = chunk[: sample_count * constants.AUDIO_FORMAT_WIDTH]
samples = struct.unpack(f"<{sample_count}h", pcm)
return max((abs(sample) for sample in samples), default=0)


def _write_live_preview_event(
log_file: Path,
*,
Expand Down Expand Up @@ -421,8 +467,8 @@ async def _send_audio(
recording_path_callback: Callable[[Path], None] | None = None,
audio_level_callback: Callable[[bytes], None] | None = None,
live_preview_callback: Callable[[bytes], Awaitable[None]] | None = None,
) -> None:
"""Read from mic and send to Wyoming server."""
) -> AudioCaptureStats:
"""Read from mic, send to Wyoming server, and return capture stats."""
from wyoming.asr import Transcribe # noqa: PLC0415
from wyoming.audio import AudioChunk, AudioStart, AudioStop # noqa: PLC0415

Expand All @@ -434,9 +480,11 @@ async def _send_audio(
# Buffer to save audio if requested
audio_buffer = io.BytesIO() if save_recording else None
pending_audio_level_tasks: set[asyncio.Task[None]] = set()
capture_stats = AudioCaptureStats()

async def send_chunk(chunk: bytes) -> None:
"""Send audio chunk to ASR server and optionally buffer it."""
capture_stats.observe(chunk)
if audio_buffer is not None:
audio_buffer.write(chunk)
if audio_level_callback:
Expand Down Expand Up @@ -475,6 +523,7 @@ async def send_chunk(chunk: bytes) -> None:
recording_path_callback(saved_path)
finally:
await _finish_audio_level_callbacks(pending_audio_level_tasks)
return capture_stats


async def record_audio_to_buffer(queue: asyncio.Queue, logger: logging.Logger) -> bytes:
Expand Down Expand Up @@ -692,7 +741,7 @@ async def _transcribe_live_audio_wyoming(
if live_preview is not None:
live_preview.reset_log()
live_preview_task = asyncio.create_task(live_preview.run())
_, recv_task = await manage_send_receive_tasks(
send_task, recv_task = await manage_send_receive_tasks(
_send_audio(
client,
stream,
Expand All @@ -715,8 +764,17 @@ async def _transcribe_live_audio_wyoming(
return_when=asyncio.ALL_COMPLETED,
)
result = recv_task.result()
capture_stats = send_task.result() if send_task is not None else None

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 send_task is never None here. manage_send_receive_tasks is typed as returning tuple[asyncio.Task, asyncio.Task] and always calls asyncio.create_task(send_task_coro) unconditionally, so the guard if send_task is not None is dead code. It could mislead a future reader into thinking there is a code path where send_task can be absent.

Suggested change
capture_stats = send_task.result() if send_task is not None else None
capture_stats = send_task.result()

if (
capture_stats is not None
and capture_stats.is_all_silence
and not result.strip()
):
_raise_silent_audio_capture_error()
final_transcript = result
return result
except SilentAudioCaptureError:
raise
except (ConnectionRefusedError, Exception):
logger.warning("Failed to connect to Wyoming ASR server")
return None
Expand Down
26 changes: 26 additions & 0 deletions macos/AgentCLI/Sources/AgentCLI/AgentCommandRunner.swift
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ final class AgentCommandRunner: ObservableObject {
private var recordingIndicator = RecordingIndicatorController()
private let pasteController: TranscriptPasteController
private let bootstrap: AgentBootstrap
private let microphonePermissionController: any MicrophonePermissionControlling
private var activityTracker = MenuActivityTracker()
private var pendingStopRecordingCommands: Set<String> = []
private var holdTranscriptionState: HoldTranscriptionState = .idle
Expand Down Expand Up @@ -69,11 +70,13 @@ final class AgentCommandRunner: ObservableObject {

init(
pasteController: TranscriptPasteController = TranscriptPasteController(),
microphonePermissionController: any MicrophonePermissionControlling = MicrophonePermissionController.shared,
bootstrap: @escaping AgentBootstrap = { requirement, force, progress in
AgentRuntime.shared.ensureReady(for: requirement, force: force, progress: progress)
}
) {
self.pasteController = pasteController
self.microphonePermissionController = microphonePermissionController
self.bootstrap = bootstrap
hasLastError = FileManager.default.fileExists(atPath: AgentRuntime.shared.lastErrorURL.path)
}
Expand Down Expand Up @@ -148,6 +151,7 @@ final class AgentCommandRunner: ObservableObject {
statusMessage = "Transcription is already recording"
return false
}
guard ensureMicrophonePermissionForRecording() else { return false }

holdTranscriptionState = .recording
holdToTranscribePasteTarget = FocusedTextTarget.capture()
Expand Down Expand Up @@ -196,6 +200,9 @@ final class AgentCommandRunner: ObservableObject {
markStopRequested(for: command)
beginTranscribingActivity()
}
if shouldStartRecording {
guard ensureMicrophonePermissionForRecording() else { return }
}

activeCommandCount += 1
beginCommandActivity(for: command)
Expand Down Expand Up @@ -383,6 +390,25 @@ final class AgentCommandRunner: ObservableObject {
activityTracker.beginTranscribing()
}

private func ensureMicrophonePermissionForRecording() -> Bool {
let presentation = MicrophonePermissionPresentation(
status: microphonePermissionController.currentStatus()
)
guard presentation.canRecord else {
microphonePermissionController.requestAccessIfNeeded { [weak self] granted in
guard granted else { return }
Task { @MainActor in
self?.statusMessage = "Microphone permission enabled. Try recording again."
}
}
let message = presentation.statusMessage ?? "Microphone permission is required for recording."
statusMessage = message
notify(title: "Microphone Permission Required", body: message)
return false
}
return true
}

private func clearTranscribingActivityIfFinished() {
if pendingStopRecordingCommands.isEmpty && !holdTranscriptionState.isFinishing {
activityTracker.finishTranscribing()
Expand Down
58 changes: 58 additions & 0 deletions macos/AgentCLI/Sources/AgentCLI/MicrophonePermission.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import AVFoundation

enum MicrophonePermissionStatus {
case authorized
case denied
case notDetermined
}

struct MicrophonePermissionPresentation {
let status: MicrophonePermissionStatus

var canRecord: Bool {
status == .authorized
}

var statusMessage: String? {
switch status {
case .authorized:
return nil
case .denied:
return "Allow Microphone permission for Agent CLI in System Settings."
case .notDetermined:
return "Approve Microphone permission for Agent CLI, then try recording again."
}
}
}

protocol MicrophonePermissionControlling {
func currentStatus() -> MicrophonePermissionStatus
func requestAccessIfNeeded(completion: @escaping (Bool) -> Void)
}

final class MicrophonePermissionController: MicrophonePermissionControlling {
static let shared = MicrophonePermissionController()

private init() {}

func currentStatus() -> MicrophonePermissionStatus {
switch AVCaptureDevice.authorizationStatus(for: .audio) {
case .authorized:
return .authorized
case .notDetermined:
return .notDetermined
case .denied, .restricted:
return .denied
@unknown default:
return .denied
}
}

func requestAccessIfNeeded(completion: @escaping (Bool) -> Void) {
guard currentStatus() == .notDetermined else {
completion(currentStatus() == .authorized)
return
}
AVCaptureDevice.requestAccess(for: .audio, completionHandler: completion)
}
Comment on lines +51 to +57

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 currentStatus() is called twice on the same execution path: once for the guard and once inside the else branch for the completion value. Between the two calls the OS authorization status could change (e.g., user grants access in System Settings while the app is on the guard check), causing the completion to be called with a stale value. Caching the result in a local constant eliminates the TOCTOU window.

Suggested change
func requestAccessIfNeeded(completion: @escaping (Bool) -> Void) {
guard currentStatus() == .notDetermined else {
completion(currentStatus() == .authorized)
return
}
AVCaptureDevice.requestAccess(for: .audio, completionHandler: completion)
}
func requestAccessIfNeeded(completion: @escaping (Bool) -> Void) {
let status = currentStatus()
guard status == .notDetermined else {
completion(status == .authorized)
return
}
AVCaptureDevice.requestAccess(for: .audio, completionHandler: completion)
}

}
23 changes: 23 additions & 0 deletions macos/AgentCLI/Tests/AgentCLITests/MicrophonePermissionTests.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#if canImport(XCTest)
import XCTest
@testable import AgentCLI

final class MicrophonePermissionTests: XCTestCase {
func testAuthorizedPermissionAllowsRecording() {
let presentation = MicrophonePermissionPresentation(status: .authorized)

XCTAssertTrue(presentation.canRecord)
XCTAssertEqual(presentation.statusMessage, nil)
}

func testDeniedPermissionExplainsSettingsFix() {
let presentation = MicrophonePermissionPresentation(status: .denied)

XCTAssertFalse(presentation.canRecord)
XCTAssertEqual(
presentation.statusMessage,
"Allow Microphone permission for Agent CLI in System Settings."
)
}
}
Comment on lines +6 to +22

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Missing notDetermined presentation test

The notDetermined state has a distinct canRecord = false and a different statusMessage ("Approve Microphone permission…") compared to the denied branch. Adding a test case would guard against the two non-authorized messages being accidentally swapped or the canRecord property being set to true for notDetermined in a future refactor.

Note: If this suggestion doesn't match your team's coding style, reply to this and let me know. I'll remember it for next time!

#endif
Loading
Loading