diff --git a/app/pipeline/contracts/__init__.py b/app/pipeline/contracts/__init__.py index bda9bd0..591afa5 100644 --- a/app/pipeline/contracts/__init__.py +++ b/app/pipeline/contracts/__init__.py @@ -2,13 +2,16 @@ from .asr import ASRProvider, ASRRequest, ASRResult from .artifacts import ( + ARTIFACT_MANIFEST_VERSION, AsyncUploadReader, AudioArtifactIndex, + ArtifactManifestEntry, PersistedTranscriptionArtifacts, SavedUploadArtifact, TranscriptionArtifactStore, TranscriptionArtifactWriteRequest, UploadPersistenceRequest, + build_artifact_manifest, ) from .context import PipelineContext from .diarization import ( @@ -48,6 +51,7 @@ "ASRProvider", "ASRRequest", "ASRResult", + "ARTIFACT_MANIFEST_VERSION", "AsyncUploadReader", "AudioArtifactIndex", "AudioEnhancementProvider", @@ -55,6 +59,7 @@ "AudioEnhancementResult", "AudioNormalizationRequest", "AudioNormalizationResult", + "ArtifactManifestEntry", "DiarizationProvider", "DiarizationRequest", "DiarizationResult", @@ -76,4 +81,5 @@ "VoiceprintMatchProvider", "VoiceprintMatchRequest", "VoiceprintMatchResult", + "build_artifact_manifest", ] diff --git a/app/pipeline/contracts/artifacts.py b/app/pipeline/contracts/artifacts.py index 322c202..369c6d3 100644 --- a/app/pipeline/contracts/artifacts.py +++ b/app/pipeline/contracts/artifacts.py @@ -6,6 +6,8 @@ from pathlib import Path from typing import Any, Protocol, runtime_checkable +ARTIFACT_MANIFEST_VERSION = "artifact_manifest.v1" + class AsyncUploadReader(Protocol): """Minimal async file interface used by UploadFile and test doubles.""" @@ -64,6 +66,49 @@ class PersistedTranscriptionArtifacts: embedding_paths: dict[str, Path] +@dataclass(frozen=True, slots=True) +class ArtifactManifestEntry: + """Public-safe artifact descriptor embedded in completed results. + + This intentionally describes artifact names and roles without exposing + host-local paths. Clients may ignore the whole manifest. 
+ """ + + name: str + filename: str + role: str + media_type: str + required_for_result: bool = False + speaker_label: str | None = None + + def as_dict(self) -> dict[str, Any]: + payload: dict[str, Any] = { + "name": self.name, + "filename": self.filename, + "role": self.role, + "media_type": self.media_type, + "required_for_result": self.required_for_result, + } + if self.speaker_label is not None: + payload["speaker_label"] = self.speaker_label + return payload + + +def build_artifact_manifest( + stable: list[ArtifactManifestEntry], + optional: list[ArtifactManifestEntry] | None = None, + experimental: list[ArtifactManifestEntry] | None = None, +) -> dict[str, Any]: + """Build the optional artifact manifest for a completed transcription.""" + + return { + "manifest_version": ARTIFACT_MANIFEST_VERSION, + "stable": [entry.as_dict() for entry in stable], + "optional": [entry.as_dict() for entry in optional or []], + "experimental": [entry.as_dict() for entry in experimental or []], + } + + @runtime_checkable class TranscriptionArtifactStore(Protocol): """Stable slot for persisting completed transcription artifacts.""" @@ -74,11 +119,14 @@ def persist_transcription( __all__ = [ + "ARTIFACT_MANIFEST_VERSION", "AsyncUploadReader", "AudioArtifactIndex", + "ArtifactManifestEntry", "PersistedTranscriptionArtifacts", "SavedUploadArtifact", "TranscriptionArtifactStore", "TranscriptionArtifactWriteRequest", "UploadPersistenceRequest", + "build_artifact_manifest", ] diff --git a/app/providers/artifacts/default.py b/app/providers/artifacts/default.py index 1fd860b..106fee4 100644 --- a/app/providers/artifacts/default.py +++ b/app/providers/artifacts/default.py @@ -6,8 +6,14 @@ from pathlib import Path from config import DENOISE_MODEL, DENOISE_SNR_THRESHOLD +from infra.audio.paths import safe_speaker_label from infra.transcription_artifacts import persist_transcription_artifacts -from pipeline.contracts import PipelineContext, PipelineResult +from pipeline.contracts import ( + 
@staticmethod
def _build_artifact_manifest(speaker_labels: list[str]) -> dict:
    """Build the artifact manifest embedded in a completed transcription.

    The manifest's stable category always starts with the primary
    ``result.json`` entry, followed by one speaker-embedding entry per label
    in ``speaker_labels``, in the order given.

    Args:
        speaker_labels: Speaker labels to emit one embedding entry for each.

    Returns:
        The dict produced by ``build_artifact_manifest`` for those entries.
    """
    primary_result = ArtifactManifestEntry(
        name="result",
        filename="result.json",
        role="primary_result",
        media_type="application/json",
        required_for_result=True,
    )
    # The filename goes through safe_speaker_label (presumably a
    # filesystem-safe form of the label — confirm against infra.audio.paths),
    # while speaker_label keeps the raw label.
    embedding_entries = [
        ArtifactManifestEntry(
            name="speaker_embedding",
            filename=f"emb_{safe_speaker_label(label)}.npy",
            role="speaker_embedding",
            media_type="application/octet-stream",
            speaker_label=label,
        )
        for label in speaker_labels
    ]
    return build_artifact_manifest(stable=[primary_result, *embedding_entries])
"artifact_manifest.v1", + "stable": [ + { + "name": "result", + "filename": "result.json", + "role": "primary_result", + "media_type": "application/json", + "required_for_result": true + }, + { + "name": "speaker_embedding", + "filename": "emb_SPEAKER_00.npy", + "role": "speaker_embedding", + "media_type": "application/octet-stream", + "required_for_result": false, + "speaker_label": "SPEAKER_00" + } + ], + "optional": [], + "experimental": [] + }, "alignment": { "status": "succeeded", "language": "en", @@ -198,7 +220,8 @@ enrollment or rename call. **Result contract anchors**: completed results report `status="completed"` in the persisted transcription object. `segments[].speaker_label` is always the raw diarization cluster label. `segments[].words` and top-level `alignment` are -optional metadata; clients must tolerate either field being absent. +optional metadata; top-level `artifacts` is optional as well. Clients must +tolerate these fields being absent. `speaker_id` / `speaker_name`: matching uses an **adaptive threshold**, not a fixed `0.75` cutoff. Actual logic: @@ -259,6 +282,14 @@ no need to cross-reference the original request. See [`configuration.en.md`](./configuration.en.md) for each setting's source and default. +**`artifacts`** is an optional manifest describing stable, optional, and +experimental artifacts that live alongside this result. Current stable entries +include the primary `result.json` and one `emb_.npy` speaker +embedding per cluster. The manifest exposes only filenames, roles, categories, +media types, and `speaker_label`; it does not expose local paths, hosts, tokens, +real job runtime paths, or debug data. Default clients do not need this field, +and older results without `artifacts` remain compatible. + Completed `GET /api/jobs/{id}` results and `GET /api/transcriptions/{id}` share the same payload shape. 
That means `speaker_map` and `unique_speakers` are available in the completed job result as well: @@ -289,6 +320,7 @@ aggregation fields for UI / downstream consumers: | --- | --- | --- | | `speaker_map` | object | `speaker_label → {matched_id, matched_name, similarity, embedding_key}` mapping; reflects the **diarization model's voiceprint match result** and does not change when segments are manually corrected | | `unique_speakers` | array[string] | Deduplicated list of speaker names, recalculated from the persisted `segments[].speaker_name` values to reflect the latest manual corrections | +| `artifacts` | object | Optional artifact manifest for stable / optional / experimental artifacts; clients must tolerate it being absent | ### `GET /api/export/{tr_id}` diff --git a/doc/api.zh.md b/doc/api.zh.md index 4e5eb2c..635544e 100644 --- a/doc/api.zh.md +++ b/doc/api.zh.md @@ -175,6 +175,28 @@ curl -X POST http://localhost:8780/api/transcribe \ "max_speakers": 0, "no_repeat_ngram_size": 0 }, + "artifacts": { + "manifest_version": "artifact_manifest.v1", + "stable": [ + { + "name": "result", + "filename": "result.json", + "role": "primary_result", + "media_type": "application/json", + "required_for_result": true + }, + { + "name": "speaker_embedding", + "filename": "emb_SPEAKER_00.npy", + "role": "speaker_embedding", + "media_type": "application/octet-stream", + "required_for_result": false, + "speaker_label": "SPEAKER_00" + } + ], + "optional": [], + "experimental": [] + }, "alignment": { "status": "succeeded", "language": "zh", @@ -191,7 +213,8 @@ curl -X POST http://localhost:8780/api/transcribe \ **结果契约锚点**:完成态持久化转写对象会带 `status="completed"`。 `segments[].speaker_label` 永远是原始 diarization cluster 标签。 -`segments[].words` 和顶层 `alignment` 都是可选元数据,客户端必须能接受字段缺失。 +`segments[].words`、顶层 `alignment` 和顶层 `artifacts` 都是可选元数据, +客户端必须能接受字段缺失。 `speaker_id` 和 `speaker_name`:匹配采用**自适应阈值**,不是固定 0.75。实际逻辑: @@ -231,6 +254,12 @@ alignment 模型会记录为 
`jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-c 都可独立解读,无需再查原始请求。各配置项来源和默认值见 [`configuration.zh.md`](./configuration.zh.md)。 +**`artifacts`** 是可选 manifest,用于描述与该结果同目录的稳定、可选和实验性 +artifact。当前稳定项包括主结果 `result.json` 和每个说话人 cluster 的 +`emb_<speaker_label>.npy`。manifest 只暴露文件名、角色、类别、媒体类型和 +`speaker_label`,不暴露本地路径、主机、token、真实 job 运行路径或调试信息。 +默认客户端不需要依赖该字段;老结果没有 `artifacts` 时仍应按兼容结果处理。 + `GET /api/jobs/{id}` 的完成态结果与 `GET /api/transcriptions/{id}` 使用同一份 持久化结果结构,因此完成态里同样会带上 `speaker_map` 和 `unique_speakers`: @@ -260,6 +289,7 @@ alignment 模型会记录为 `jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-c | --- | --- | --- | | `speaker_map` | object | `speaker_label → {matched_id, matched_name, similarity, embedding_key}` 的映射,反映 **diarization 模型的声纹匹配结果**,不随人工单段纠错变化;便于前端一次性渲染人名下拉 / 统计 | | `unique_speakers` | array[string] | 去重后的说话人名列表,从持久化结果里的 `segments[].speaker_name` 重算,反映最新的人工纠错结果 | +| `artifacts` | object | 可选 artifact manifest;用于发现结果相关的稳定 / 可选 / 实验 artifact,缺失时必须兼容 | 与 `GET /api/jobs/{id}` 不同,本端点始终从磁盘读取持久化结果,**进程重启后仍可访问**, 也能反映最新的人工纠错;`/api/jobs/{id}` 优先读内存,内存未命中时才回落到磁盘(见上方注意事项)。 diff --git a/doc/changelog.en.md b/doc/changelog.en.md index 5bf3d11..0398c05 100644 --- a/doc/changelog.en.md +++ b/doc/changelog.en.md @@ -4,6 +4,15 @@ ## Unreleased +### Features + +- Added an optional `artifacts` manifest to completed results. The manifest + lists artifact category, role, filename, media type, and `speaker_label` + where relevant; it does not expose local paths, job runtime paths, hosts, + tokens, or debug data. Default clients can continue to rely only on the + `result.json` primary view, and must treat unknown or missing `artifacts` + fields as compatible. 
+ ## 0.7.6 — Health, alignment, and embedding runtime fixes (2026-05-07) ### Security diff --git a/doc/changelog.zh.md b/doc/changelog.zh.md index c542919..08fc043 100644 --- a/doc/changelog.zh.md +++ b/doc/changelog.zh.md @@ -4,6 +4,13 @@ ## Unreleased +### 功能 + +- 新增完成态结果的可选 `artifacts` manifest。该字段只列出当前结果相关 artifact + 的类别、角色、文件名、媒体类型和 `speaker_label`,不暴露本地路径、job 运行路径、 + host、token 或调试信息。默认客户端仍只需读取 `result.json` 主视图;未知或缺失 + `artifacts` 字段必须被视为兼容。 + ## 0.7.6 — 健康检查、alignment 与 embedding 运行时修复 (2026-05-07) ### 安全 diff --git a/doc/configuration.en.md b/doc/configuration.en.md index d0185f5..1dcdc14 100644 --- a/doc/configuration.en.md +++ b/doc/configuration.en.md @@ -193,12 +193,17 @@ Stable anchors in completed transcription results: - Top-level `alignment`: optional forced-alignment metadata, sanitized. - Top-level `params`: effective per-job processing settings, including request overrides and service defaults used for this result. +- Top-level `artifacts`: optional artifact manifest listing stable / optional / + experimental artifact filenames, roles, categories, media types, and + `speaker_label` values; it never exposes local paths, hosts, tokens, or debug + data. - `speaker_map`: diarization cluster to voiceprint match map; manual segment corrections do not rewrite it. - `unique_speakers`: deduplicated current segment display names. New fields are added under the optional-field principle. Clients should ignore -unknown fields and tolerate missing `words`, `alignment`, and `warning`. +unknown fields and tolerate missing `words`, `alignment`, `artifacts`, and +`warning`. 
## v0.7.6 Validation Wording diff --git a/doc/configuration.zh.md b/doc/configuration.zh.md index b2cfd12..5eb626b 100644 --- a/doc/configuration.zh.md +++ b/doc/configuration.zh.md @@ -178,11 +178,14 @@ cohort 生命周期: - `segments[].words`:可选词级 alignment。 - 顶层 `alignment`:可选 forced-alignment 元数据,字段内容会脱敏。 - 顶层 `params`:记录本次任务实际使用的请求级与服务级处理参数,便于离线解释结果。 +- 顶层 `artifacts`:可选 artifact manifest,只列出稳定 / 可选 / 实验 artifact 的 + 文件名、角色、类别、媒体类型和 `speaker_label`;不暴露本地路径、主机、token 或 + 调试信息。 - `speaker_map`:diarization cluster 到声纹匹配结果的映射;人工改单段说话人不会回写它。 - `unique_speakers`:按当前 segment 展示名去重后的列表。 新增字段按可选字段原则扩展;客户端应忽略不认识的字段,并容忍 `words` / -`alignment` / `warning` 缺失。 +`alignment` / `artifacts` / `warning` 缺失。 ## v0.7.6 验证口径 diff --git a/tests/unit/test_pipeline_runner.py b/tests/unit/test_pipeline_runner.py index c2c439f..4831f72 100644 --- a/tests/unit/test_pipeline_runner.py +++ b/tests/unit/test_pipeline_runner.py @@ -491,6 +491,28 @@ def match(self, request): "language": "zh", "reason": "language_disabled", } + assert result["transcription"]["artifacts"] == { + "manifest_version": "artifact_manifest.v1", + "stable": [ + { + "name": "result", + "filename": "result.json", + "role": "primary_result", + "media_type": "application/json", + "required_for_result": True, + }, + { + "name": "speaker_embedding", + "filename": "emb_SPEAKER_00.npy", + "role": "speaker_embedding", + "media_type": "application/octet-stream", + "required_for_result": False, + "speaker_label": "SPEAKER_00", + }, + ], + "optional": [], + "experimental": [], + } assert ( result["transcription"]["speaker_map"]["SPEAKER_00"]["matched_id"] == "spk_demo" ) @@ -503,6 +525,11 @@ def match(self, request): assert result_path.exists() persisted_result = json.loads(result_path.read_text(encoding="utf-8")) assert persisted_result["asr_hallucination_guard"]["removed_segment_count"] == 2 + assert persisted_result["artifacts"] == result["transcription"]["artifacts"] + assert str(tmp_path) not in json.dumps( + 
persisted_result["artifacts"], + ensure_ascii=False, + ) assert emb_path.exists() assert not audio_path.with_suffix(".wav").exists() assert not audio_path.with_suffix(".denoised.wav").exists() @@ -579,6 +606,18 @@ def test_artifact_result_contract_keeps_status_speaker_label_and_optional_alignm assert result["segments"][0]["speaker_label"] == "SPEAKER_00" assert result["segments"][0]["speaker_id"] is None assert result["speaker_map"] == {} + assert result["artifacts"]["manifest_version"] == "artifact_manifest.v1" + assert result["artifacts"]["stable"] == [ + { + "name": "result", + "filename": "result.json", + "role": "primary_result", + "media_type": "application/json", + "required_for_result": True, + } + ] + assert result["artifacts"]["optional"] == [] + assert result["artifacts"]["experimental"] == [] assert "alignment" not in result