From d97105986a3f2cb8211f286ec155f1cff7683d67 Mon Sep 17 00:00:00 2001 From: guotingxuan5599 <1321352073@qq.com> Date: Mon, 23 Mar 2026 16:48:53 +0800 Subject: [PATCH 1/6] feat: add video analysis operators and docs --- runtime/ops/mapper/README_video_ops.md | 168 ++++++++ runtime/ops/mapper/__init__.py | 17 +- .../ops/mapper/_video_common/README_models.md | 83 ++++ .../_video_common/README_qwen_service.md | 92 ++++ runtime/ops/mapper/_video_common/__init__.py | 1 + runtime/ops/mapper/_video_common/ffmpeg.py | 117 +++++ runtime/ops/mapper/_video_common/io_video.py | 13 + runtime/ops/mapper/_video_common/log.py | 23 + .../ops/mapper/_video_common/model_paths.py | 28 ++ runtime/ops/mapper/_video_common/paths.py | 18 + .../mapper/_video_common/qwen_http_client.py | 42 ++ runtime/ops/mapper/_video_common/schema.py | 9 + .../mapper/video_audio_extract/__init__.py | 6 + .../mapper/video_audio_extract/metadata.yml | 16 + .../ops/mapper/video_audio_extract/process.py | 81 ++++ .../mapper/video_classify_qwenvl/__init__.py | 6 + .../mapper/video_classify_qwenvl/metadata.yml | 16 + .../mapper/video_classify_qwenvl/process.py | 114 +++++ .../mapper/video_deborder_crop/__init__.py | 6 + .../mapper/video_deborder_crop/metadata.yml | 16 + .../ops/mapper/video_deborder_crop/process.py | 212 +++++++++ .../mapper/video_event_tag_qwenvl/__init__.py | 6 + .../video_event_tag_qwenvl/metadata.yml | 16 + .../mapper/video_event_tag_qwenvl/process.py | 135 ++++++ .../mapper/video_format_convert/__init__.py | 6 + .../mapper/video_format_convert/metadata.yml | 16 + .../mapper/video_format_convert/process.py | 97 +++++ .../mapper/video_keyframe_extract/__init__.py | 6 + .../video_keyframe_extract/metadata.yml | 16 + .../mapper/video_keyframe_extract/process.py | 229 ++++++++++ .../ops/mapper/video_mot_track/__init__.py | 6 + .../video_mot_track/configs/bytetrack.yaml | 7 + .../ops/mapper/video_mot_track/metadata.yml | 16 + runtime/ops/mapper/video_mot_track/process.py | 124 ++++++ .../mapper/video_sensitive_crop/__init__.py | 6 + .../mapper/video_sensitive_crop/metadata.yml | 16 + .../mapper/video_sensitive_crop/process.py | 150 +++++++ .../mapper/video_sensitive_detect/__init__.py | 6 + .../video_sensitive_detect/metadata.yml | 16 + .../mapper/video_sensitive_detect/process.py | 155 +++++++ .../ops/mapper/video_speech_asr/__init__.py | 6 + .../ops/mapper/video_speech_asr/metadata.yml | 16 + .../ops/mapper/video_speech_asr/process.py | 213 +++++++++ .../ops/mapper/video_subject_crop/__init__.py | 6 + .../mapper/video_subject_crop/metadata.yml | 16 + .../ops/mapper/video_subject_crop/process.py | 175 ++++++++ .../ops/mapper/video_subtitle_ocr/__init__.py | 6 + .../mapper/video_subtitle_ocr/metadata.yml | 16 + .../ops/mapper/video_subtitle_ocr/process.py | 406 ++++++++++++++++++ .../mapper/video_summary_qwenvl/__init__.py | 6 + .../mapper/video_summary_qwenvl/metadata.yml | 16 + .../mapper/video_summary_qwenvl/process.py | 145 +++++++ runtime/ops/mapper/video_text_ocr/__init__.py | 6 + .../ops/mapper/video_text_ocr/metadata.yml | 16 + runtime/ops/mapper/video_text_ocr/process.py | 288 +++++++++++++ 55 files changed, 3449 insertions(+), 1 deletion(-) create mode 100644 runtime/ops/mapper/README_video_ops.md create mode 100644 runtime/ops/mapper/_video_common/README_models.md create mode 100644 runtime/ops/mapper/_video_common/README_qwen_service.md create mode 100644 runtime/ops/mapper/_video_common/__init__.py create mode 100644 runtime/ops/mapper/_video_common/ffmpeg.py create mode 100644 
runtime/ops/mapper/_video_common/io_video.py create mode 100644 runtime/ops/mapper/_video_common/log.py create mode 100644 runtime/ops/mapper/_video_common/model_paths.py create mode 100644 runtime/ops/mapper/_video_common/paths.py create mode 100644 runtime/ops/mapper/_video_common/qwen_http_client.py create mode 100644 runtime/ops/mapper/_video_common/schema.py create mode 100644 runtime/ops/mapper/video_audio_extract/__init__.py create mode 100644 runtime/ops/mapper/video_audio_extract/metadata.yml create mode 100644 runtime/ops/mapper/video_audio_extract/process.py create mode 100644 runtime/ops/mapper/video_classify_qwenvl/__init__.py create mode 100644 runtime/ops/mapper/video_classify_qwenvl/metadata.yml create mode 100644 runtime/ops/mapper/video_classify_qwenvl/process.py create mode 100644 runtime/ops/mapper/video_deborder_crop/__init__.py create mode 100644 runtime/ops/mapper/video_deborder_crop/metadata.yml create mode 100644 runtime/ops/mapper/video_deborder_crop/process.py create mode 100644 runtime/ops/mapper/video_event_tag_qwenvl/__init__.py create mode 100644 runtime/ops/mapper/video_event_tag_qwenvl/metadata.yml create mode 100644 runtime/ops/mapper/video_event_tag_qwenvl/process.py create mode 100644 runtime/ops/mapper/video_format_convert/__init__.py create mode 100644 runtime/ops/mapper/video_format_convert/metadata.yml create mode 100644 runtime/ops/mapper/video_format_convert/process.py create mode 100644 runtime/ops/mapper/video_keyframe_extract/__init__.py create mode 100644 runtime/ops/mapper/video_keyframe_extract/metadata.yml create mode 100644 runtime/ops/mapper/video_keyframe_extract/process.py create mode 100644 runtime/ops/mapper/video_mot_track/__init__.py create mode 100644 runtime/ops/mapper/video_mot_track/configs/bytetrack.yaml create mode 100644 runtime/ops/mapper/video_mot_track/metadata.yml create mode 100644 runtime/ops/mapper/video_mot_track/process.py create mode 100644 runtime/ops/mapper/video_sensitive_crop/__init__.py create mode 100644 runtime/ops/mapper/video_sensitive_crop/metadata.yml create mode 100644 runtime/ops/mapper/video_sensitive_crop/process.py create mode 100644 runtime/ops/mapper/video_sensitive_detect/__init__.py create mode 100644 runtime/ops/mapper/video_sensitive_detect/metadata.yml create mode 100644 runtime/ops/mapper/video_sensitive_detect/process.py create mode 100644 runtime/ops/mapper/video_speech_asr/__init__.py create mode 100644 runtime/ops/mapper/video_speech_asr/metadata.yml create mode 100644 runtime/ops/mapper/video_speech_asr/process.py create mode 100644 runtime/ops/mapper/video_subject_crop/__init__.py create mode 100644 runtime/ops/mapper/video_subject_crop/metadata.yml create mode 100644 runtime/ops/mapper/video_subject_crop/process.py create mode 100644 runtime/ops/mapper/video_subtitle_ocr/__init__.py create mode 100644 runtime/ops/mapper/video_subtitle_ocr/metadata.yml create mode 100644 runtime/ops/mapper/video_subtitle_ocr/process.py create mode 100644 runtime/ops/mapper/video_summary_qwenvl/__init__.py create mode 100644 runtime/ops/mapper/video_summary_qwenvl/metadata.yml create mode 100644 runtime/ops/mapper/video_summary_qwenvl/process.py create mode 100644 runtime/ops/mapper/video_text_ocr/__init__.py create mode 100644 runtime/ops/mapper/video_text_ocr/metadata.yml create mode 100644 runtime/ops/mapper/video_text_ocr/process.py diff --git a/runtime/ops/mapper/README_video_ops.md b/runtime/ops/mapper/README_video_ops.md new file mode 100644 index 000000000..e6eaa6ac9 --- /dev/null +++ 
b/runtime/ops/mapper/README_video_ops.md @@ -0,0 +1,168 @@ +# DataMate 视频算子说明 + +## 1. 模块概述 + +本模块为 DataMate 提供视频数据清洗与视频 AI 辅助标注相关算子,覆盖视频预处理、敏感内容检测与裁剪、多目标跟踪、主体跟踪裁剪、关键帧提取、OCR、ASR、视频分类、视频摘要、事件标注等能力。 + +所有视频算子均按照 DataMate 算子规范组织在 `runtime/ops/mapper/` 目录下,每个算子目录包含以下标准文件: + +- `__init__.py` +- `metadata.yml` +- `process.py` + +视频算子共用的基础能力统一放置于: + +- `runtime/ops/mapper/_video_common/` + +--- + +## 2. 已实现算子 + +### 2.1 视频清洗与预处理 +- `video_format_convert`:视频格式转换 +- `video_deborder_crop`:黑边去除与有效区域裁剪 +- `video_sensitive_detect`:敏感内容检测 +- `video_sensitive_crop`:敏感片段裁剪 + +### 2.2 跟踪与结构化提取 +- `video_mot_track`:多目标跟踪 +- `video_subject_crop`:主体跟踪裁剪 +- `video_keyframe_extract`:关键帧提取 +- `video_audio_extract`:音频提取 + +### 2.3 OCR / ASR +- `video_subtitle_ocr`:字幕提取 +- `video_text_ocr`:显著文字 OCR 提取 +- `video_speech_asr`:语音提取 / 语音识别 + +### 2.4 基于 QwenVL 的视频语义理解 +- `video_classify_qwenvl`:视频分类 +- `video_summary_qwenvl`:视频摘要提取 +- `video_event_tag_qwenvl`:事件标注 + +--- + +## 3. 目录结构 + +```text +runtime/ops/mapper/ +├── __init__.py +├── _video_common/ +│ ├── __init__.py +│ ├── ffmpeg.py +│ ├── io_video.py +│ ├── log.py +│ ├── model_paths.py +│ ├── paths.py +│ ├── qwen_http_client.py +│ └── schema.py +├── video_audio_extract/ +├── video_classify_qwenvl/ +├── video_deborder_crop/ +├── video_event_tag_qwenvl/ +├── video_format_convert/ +├── video_keyframe_extract/ +├── video_mot_track/ +├── video_sensitive_crop/ +├── video_sensitive_detect/ +├── video_speech_asr/ +├── video_subject_crop/ +├── video_subtitle_ocr/ +├── video_summary_qwenvl/ +└── video_text_ocr/ +``` + +--- + +## 4. 模型管理方式 + +代码与模型权重分离管理: + +- GitHub 仓库中仅保存算子代码、配置与文档; +- 模型权重统一存放于模型库,不直接提交到代码仓库。 + +对于本地模型类算子,运行时按以下优先级解析模型根目录: + +1. `params["model_root"]` +2. 环境变量 `DATAMATE_MODEL_ROOT` +3. 默认兜底目录(如 `/mnt/models`) + +对于 QwenVL 相关算子,不在每个算子进程中重复加载模型,而是通过独立 HTTP 服务调用模型能力,以减少重复初始化开销、提升整体执行效率。 + +--- + +## 5. 推理方式划分 + +### 5.1 本地模型类算子 +以下算子直接从统一模型根目录读取模型: + +- `video_mot_track` +- `video_subject_crop` +- `video_subtitle_ocr` +- `video_text_ocr` +- `video_speech_asr` + +### 5.2 QwenVL 服务类算子 +以下算子通过独立 HTTP 服务进行推理: + +- `video_sensitive_detect` +- `video_classify_qwenvl` +- `video_summary_qwenvl` +- `video_event_tag_qwenvl` + +--- + +## 6. 运行环境说明 + +当前视频模块涉及两类运行环境: + +### 6.1 DataMate 视频算子运行环境 +主要用于视频算子本体执行,涉及能力包括: + +- OpenCV +- FFmpeg 相关依赖 +- YOLO / Ultralytics +- PaddleOCR +- ONNX Runtime +- Faster-Whisper / ASR 相关依赖 + +### 6.2 QwenVL 服务运行环境 +主要用于独立的 QwenVL HTTP 服务,包括: + +- Flask +- Transformers +- Qwen-VL 相关依赖 +- Torch 及设备运行时相关依赖 + +--- + +## 7. 典型输出结果 + +不同算子的输出可能包括: + +- 裁剪后视频 +- 转码后视频 +- `tracks.json` +- `summary.json` +- `events.json` +- `subtitles.srt` +- OCR 结果文件 +- 提取音频文件 +- 调试视频 / 可视化结果 + +--- + +## 8. QwenVL 服务依赖说明 + +以下算子依赖独立的 QwenVL HTTP 服务: + +- `video_sensitive_detect` +- `video_classify_qwenvl` +- `video_summary_qwenvl` +- `video_event_tag_qwenvl` + +该服务会在启动时预加载模型,并在内存中常驻,从而避免多个算子重复加载同一大模型。 + +--- + + diff --git a/runtime/ops/mapper/__init__.py b/runtime/ops/mapper/__init__.py index 4b9701991..ed0a0fcb2 100644 --- a/runtime/ops/mapper/__init__.py +++ b/runtime/ops/mapper/__init__.py @@ -47,6 +47,21 @@ def _import_operators(): from . import remove_duplicate_sentences from . import knowledge_relation_slice from . import pii_ner_detection - + # ===== Video operators (PR1-PR5) ===== + from . import _video_common + from . import video_format_convert + from . import video_sensitive_detect + from . import video_sensitive_crop + from . import video_mot_track + from . import video_subject_crop + from . import video_classify_qwenvl + from . 
import video_summary_qwenvl + from . import video_event_tag_qwenvl + from . import video_keyframe_extract + from . import video_deborder_crop + from . import video_audio_extract + from . import video_speech_asr + from . import video_subtitle_ocr + from . import video_text_ocr _import_operators() diff --git a/runtime/ops/mapper/_video_common/README_models.md b/runtime/ops/mapper/_video_common/README_models.md new file mode 100644 index 000000000..b5517fd91 --- /dev/null +++ b/runtime/ops/mapper/_video_common/README_models.md @@ -0,0 +1,83 @@ +# 视频算子模型说明 + +## 1. 基本原则 + +视频算子采用“代码与模型分离”的管理方式: + +- 代码、元数据与说明文档保存在 GitHub 仓库中; +- 模型权重统一存放于模型库或模型存储目录; +- 模型文件不直接提交到代码仓库。 + +--- + +## 2. 本地模型的解析方式 + +对于本地模型类算子,模型根目录按以下优先级解析: + +1. `params["model_root"]` +2. 环境变量 `DATAMATE_MODEL_ROOT` +3. 默认兜底目录(如 `/mnt/models`) + +算子在确定模型根目录后,再基于相对路径查找具体模型文件或模型目录。 + +--- + +## 3. QwenVL 模型管理方式 + +QwenVL 相关算子不在每个算子进程中直接加载模型,而是通过独立的 HTTP 服务调用模型能力。 + +其设计目的包括: + +- 避免多个算子重复加载同一大模型; +- 降低重复初始化的时间开销; +- 减少整体内存占用; +- 提高分类、摘要、事件标注、敏感检测等算子的复用效率。 + +--- + +## 4. 建议的模型组织方式 + +建议在模型库中按统一相对路径组织模型,例如: + +- `yolo/yolov8n.pt` +- `ocr/det` +- `ocr/rec` +- `ocr/cls` +- `asr/...` +- `qwen/Qwen2.5-VL-7B-Instruct` + +具体目录名称可根据模型库中的实际组织方式进行调整,但建议在代码与文档中保持一致。 + +--- + +## 5. 算子与模型对应关系 + +### 5.1 本地模型类算子 +- `video_mot_track`:依赖目标跟踪模型(如 YOLO) +- `video_subject_crop`:依赖目标跟踪结果或目标跟踪模型 +- `video_subtitle_ocr`:依赖 OCR 检测、识别、方向分类模型 +- `video_text_ocr`:依赖 OCR 检测、识别、方向分类模型 +- `video_speech_asr`:依赖 ASR 模型 + +### 5.2 QwenVL 服务类算子 +- `video_sensitive_detect`:依赖 QwenVL HTTP 服务 +- `video_classify_qwenvl`:依赖 QwenVL HTTP 服务 +- `video_summary_qwenvl`:依赖 QwenVL HTTP 服务 +- `video_event_tag_qwenvl`:依赖 QwenVL HTTP 服务 + +--- + +## 6. 部署说明 + +在部署与运行前,需确保: + +- 所需模型已正确放置于模型库或模型目录; +- `model_root` 或 `DATAMATE_MODEL_ROOT` 配置正确; +- QwenVL 相关算子运行前,独立 HTTP 服务已正常启动; +- 模型相对路径与代码中的约定保持一致。 + +--- + +## 7. 说明 + +当前项目中的模型权重未进行参数修改,默认优先复用模型库中已有模型,不重复提交模型权重文件。 diff --git a/runtime/ops/mapper/_video_common/README_qwen_service.md b/runtime/ops/mapper/_video_common/README_qwen_service.md new file mode 100644 index 000000000..d9331c8c7 --- /dev/null +++ b/runtime/ops/mapper/_video_common/README_qwen_service.md @@ -0,0 +1,92 @@ +# QwenVL 服务说明 + +## 1. 设计目的 + +以下视频算子依赖 QwenVL 进行语义理解能力推理: + +- 视频敏感内容检测 +- 视频分类 +- 视频摘要 +- 视频事件标注 + +为避免每个算子重复加载 QwenVL 模型,本项目采用独立 HTTP 服务方式提供统一推理能力。 + +--- + +## 2. 为什么采用独立 HTTP 服务 + +QwenVL 属于大模型,如果在每个算子中单独加载,会带来以下问题: + +- 模型重复初始化,耗时较高; +- 多个算子重复占用显存 / 内存; +- 分类、摘要、事件标注等连续执行时整体效率较低。 + +因此,项目采用独立 HTTP 服务进行统一推理,服务启动后模型常驻内存,由多个算子共享调用。 + +--- + +## 3. 服务工作方式 + +QwenVL 服务的基本工作流程如下: + +1. 启动独立推理服务; +2. 服务在启动时加载 QwenVL 模型; +3. 模型在服务进程内常驻; +4. 算子通过 `service_url` 发起 HTTP 请求; +5. 服务返回推理结果给对应算子。 + +--- + +## 4. 服务依赖的算子 + +当前依赖 QwenVL HTTP 服务的算子包括: + +- `video_sensitive_detect` +- `video_classify_qwenvl` +- `video_summary_qwenvl` +- `video_event_tag_qwenvl` + +--- + +## 5. 算子侧配置方式 + +QwenVL 相关算子侧主要依赖: + +- `service_url` + +算子本身不直接维护 QwenVL 模型路径,而是通过 `service_url` 调用已经启动的服务。 + +--- + +## 6. 服务侧配置要点 + +服务侧通常需要配置以下内容: + +- QwenVL 模型目录 +- 服务监听地址与端口 +- 推理任务类型 +- 运行设备环境 + +服务支持的任务类型可包括: + +- `sensitive` +- `classify25` +- `summary` +- `event_tag` + +--- + +## 7. 部署建议 + +为保证视频算子正常运行,建议: + +- 先启动 QwenVL HTTP 服务,再运行依赖该服务的算子; +- 确保服务地址可访问; +- 确保服务模型目录配置正确; +- 在文档中单独维护服务启动方式与部署说明。 + +--- + +## 8. 
说明 + +当前仓库中的视频算子与 QwenVL 服务逻辑解耦,视频算子只负责组织输入、调用服务并处理结果;模型加载与推理执行由独立服务统一负责。 diff --git a/runtime/ops/mapper/_video_common/__init__.py b/runtime/ops/mapper/_video_common/__init__.py new file mode 100644 index 000000000..7c68785e9 --- /dev/null +++ b/runtime/ops/mapper/_video_common/__init__.py @@ -0,0 +1 @@ +# -*- coding: utf-8 -*- \ No newline at end of file diff --git a/runtime/ops/mapper/_video_common/ffmpeg.py b/runtime/ops/mapper/_video_common/ffmpeg.py new file mode 100644 index 000000000..c0340c04e --- /dev/null +++ b/runtime/ops/mapper/_video_common/ffmpeg.py @@ -0,0 +1,117 @@ +# -*- coding: utf-8 -*- +import os +import subprocess + +def run_cmd(cmd, logger=None): + if logger: + logger.info("FFmpeg cmd: " + " ".join(cmd)) + p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if p.returncode != 0: + msg = f"FFmpeg failed (code={p.returncode}).\nSTDOUT:\n{p.stdout}\nSTDERR:\n{p.stderr}" + raise RuntimeError(msg) + return p.stdout, p.stderr + +def convert_to_mp4_h264( + in_path: str, + out_path: str, + crf: int = 23, + preset: str = "veryfast", + audio: bool = True, + fps: int = None, + scale: str = None, # e.g. "1280:720" or None + logger=None, +): + """ + 最通用的“交付格式”:mp4(H.264) + yuv420p + - crf 越小质量越高,体积越大(18~28常用) + - preset 越慢压缩越好但越耗时(veryfast/fast/medium) + """ + os.makedirs(os.path.dirname(out_path), exist_ok=True) + + cmd = ["ffmpeg", "-y", "-i", in_path] + + # 视频参数 + cmd += ["-c:v", "libx264", "-pix_fmt", "yuv420p", "-preset", preset, "-crf", str(crf)] + + # 可选 fps / scale + if fps is not None: + cmd += ["-r", str(int(fps))] + if scale is not None: + cmd += ["-vf", f"scale={scale}"] + + # 音频 + if audio: + cmd += ["-c:a", "aac", "-b:a", "128k"] + else: + cmd += ["-an"] + + cmd += [out_path] + return run_cmd(cmd, logger=logger) + +def transcode_any( + in_path: str, + out_path: str, + vcodec: str = "libx264", + acodec: str = "aac", + pix_fmt: str = "yuv420p", + crf: int = 23, + preset: str = "veryfast", + vbitrate: str = None, # e.g. "2M" + abitrate: str = "128k", + fps: int = None, + scale: str = None, # e.g. "1280:720" + extra_args: list = None, + logger=None, +): + """ + 通用转码:支持任意容器/编码器组合 + - vcodec/acodec 支持 'copy'(封装重打包或直接流拷贝) + - out_path 后缀决定容器格式:.mp4/.mkv/.mov/.avi/.wmv... 
+ """ + os.makedirs(os.path.dirname(out_path), exist_ok=True) + cmd = ["ffmpeg", "-y", "-i", in_path] + + # video + cmd += ["-c:v", vcodec] + if vcodec != "copy": + cmd += ["-pix_fmt", pix_fmt] + if crf is not None: + cmd += ["-crf", str(crf)] + if preset: + cmd += ["-preset", preset] + if vbitrate: + cmd += ["-b:v", str(vbitrate)] + + # fps/scale + if fps is not None: + cmd += ["-r", str(int(fps))] + if scale is not None: + cmd += ["-vf", f"scale={scale}"] + + # audio + cmd += ["-c:a", acodec] + if acodec != "copy": + if abitrate: + cmd += ["-b:a", str(abitrate)] + + if extra_args: + cmd += list(extra_args) + + cmd += [out_path] + return run_cmd(cmd, logger=logger) + + + +def cut_segment(in_path: str, out_path: str, start: float, end: float, logger=None): + os.makedirs(os.path.dirname(out_path), exist_ok=True) + cmd = ["ffmpeg", "-y", "-ss", str(start), "-to", str(end), "-i", in_path, "-c", "copy", out_path] + return run_cmd(cmd, logger=logger) + +def concat_segments(segment_paths, out_path: str, logger=None): + os.makedirs(os.path.dirname(out_path), exist_ok=True) + list_file = out_path + ".txt" + with open(list_file, "w", encoding="utf-8") as f: + for p in segment_paths: + f.write(f"file '{os.path.abspath(p)}'\n") + cmd = ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", list_file, "-c", "copy", out_path] + return run_cmd(cmd, logger=logger) \ No newline at end of file diff --git a/runtime/ops/mapper/_video_common/io_video.py b/runtime/ops/mapper/_video_common/io_video.py new file mode 100644 index 000000000..787a9b6c7 --- /dev/null +++ b/runtime/ops/mapper/_video_common/io_video.py @@ -0,0 +1,13 @@ +# -*- coding: utf-8 -*- +import cv2 + +def get_video_info(video_path: str): + cap = cv2.VideoCapture(video_path) + if not cap.isOpened(): + raise RuntimeError(f"Cannot open video: {video_path}") + fps = cap.get(cv2.CAP_PROP_FPS) or 25.0 + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + cap.release() + return fps, width, height, frames \ No newline at end of file diff --git a/runtime/ops/mapper/_video_common/log.py b/runtime/ops/mapper/_video_common/log.py new file mode 100644 index 000000000..a47e9d324 --- /dev/null +++ b/runtime/ops/mapper/_video_common/log.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- +import logging +import os + +def get_logger(name: str, log_dir: str = None): + logger = logging.getLogger(name) + if logger.handlers: + return logger + + logger.setLevel(logging.INFO) + fmt = logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s") + + sh = logging.StreamHandler() + sh.setFormatter(fmt) + logger.addHandler(sh) + + if log_dir: + os.makedirs(log_dir, exist_ok=True) + fh = logging.FileHandler(os.path.join(log_dir, "run.log"), encoding="utf-8") + fh.setFormatter(fmt) + logger.addHandler(fh) + + return logger \ No newline at end of file diff --git a/runtime/ops/mapper/_video_common/model_paths.py b/runtime/ops/mapper/_video_common/model_paths.py new file mode 100644 index 000000000..1a9bffe25 --- /dev/null +++ b/runtime/ops/mapper/_video_common/model_paths.py @@ -0,0 +1,28 @@ +import os + +def get_model_root(params=None) -> str: + """ + 模型根目录优先级: + 1) params['model_root'] + 2) 环境变量 DATAMATE_MODEL_ROOT + 3) 默认 /mnt/models + """ + params = params or {} + return params.get("model_root") or os.environ.get("DATAMATE_MODEL_ROOT") or "/mnt/models" + + +def resolve_model_path(params, param_key: str, default_rel: str) -> str: + """ + 解析模型路径: + - 如果 params[param_key] 是绝对路径:直接用 + - 
如果是相对路径:拼到 model_root + - 如果没传:用 model_root + default_rel + """ + params = params or {} + root = get_model_root(params) + + v = params.get(param_key) + if v: + return v if os.path.isabs(v) else os.path.join(root, v) + + return os.path.join(root, default_rel) \ No newline at end of file diff --git a/runtime/ops/mapper/_video_common/paths.py b/runtime/ops/mapper/_video_common/paths.py new file mode 100644 index 000000000..89591302d --- /dev/null +++ b/runtime/ops/mapper/_video_common/paths.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- +import os +import time +import uuid + +def ensure_dir(p: str): + os.makedirs(p, exist_ok=True) + return p + +def make_run_dir(export_path: str, op_name: str): + """ + 统一输出目录:{export_path}/{op_name}/{timestamp_uuid}/ + """ + ts = time.strftime("%Y%m%d_%H%M%S") + run_id = f"{ts}_{uuid.uuid4().hex[:8]}" + out_dir = os.path.join(export_path, op_name, run_id) + ensure_dir(out_dir) + return out_dir \ No newline at end of file diff --git a/runtime/ops/mapper/_video_common/qwen_http_client.py b/runtime/ops/mapper/_video_common/qwen_http_client.py new file mode 100644 index 000000000..b6640f699 --- /dev/null +++ b/runtime/ops/mapper/_video_common/qwen_http_client.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- +import os +import json +import cv2 +import requests + +def qwenvl_infer_by_image_path( + image_path: str, + task: str, + service_url: str = "http://127.0.0.1:18080", + max_new_tokens: int = 64, + language: str = "zh", + style: str = "normal", + timeout: int = 180, +): + """ + 对齐服务端 qwen_vl_server.py 的接口: + POST {service_url}/infer + JSON: {image_path, task, max_new_tokens, language, style} + + 返回:服务端 jsonify 的 dict + """ + sess = requests.Session() + sess.trust_env = False # 避免系统代理拦截 localhost 请求 + + payload = { + "image_path": image_path, + "task": task, + "max_new_tokens": int(max_new_tokens), + "language": language, + "style": style, + } + r = sess.post(service_url.rstrip("/") + "/infer", json=payload, timeout=timeout) + r.raise_for_status() + return r.json() + +def save_frame_to_jpg(frame_bgr, out_path: str): + os.makedirs(os.path.dirname(out_path), exist_ok=True) + ok = cv2.imwrite(out_path, frame_bgr) + if not ok: + raise RuntimeError(f"failed to write jpg: {out_path}") + return out_path \ No newline at end of file diff --git a/runtime/ops/mapper/_video_common/schema.py b/runtime/ops/mapper/_video_common/schema.py new file mode 100644 index 000000000..d566359e8 --- /dev/null +++ b/runtime/ops/mapper/_video_common/schema.py @@ -0,0 +1,9 @@ +# -*- coding: utf-8 -*- +def init_tracks_schema(video_path, fps, width, height): + return { + "video": video_path, + "fps": float(fps), + "width": int(width), + "height": int(height), + "frames": [] # {"frame_idx": i, "objects":[{"track_id":..,"bbox":[..],...}]} + } \ No newline at end of file diff --git a/runtime/ops/mapper/video_audio_extract/__init__.py b/runtime/ops/mapper/video_audio_extract/__init__.py new file mode 100644 index 000000000..674e260e7 --- /dev/null +++ b/runtime/ops/mapper/video_audio_extract/__init__.py @@ -0,0 +1,6 @@ +from datamate.core.base_op import OPERATORS + +OPERATORS.register_module( + module_name="VideoAudioExtract", + module_path="ops.mapper.video_audio_extract.process", +) diff --git a/runtime/ops/mapper/video_audio_extract/metadata.yml b/runtime/ops/mapper/video_audio_extract/metadata.yml new file mode 100644 index 000000000..6486b513a --- /dev/null +++ b/runtime/ops/mapper/video_audio_extract/metadata.yml @@ -0,0 +1,16 @@ +name: '视频抽取音频' +name_en: 'Video Audio Extract' +description: 
'从视频中抽取音频,默认输出 wav(16k/mono);也可输出 aac,并生成音频信息 audio_info.json。' +description_en: 'Extract audio from video, default wav (16k/mono); can output aac; also generates audio_info.json.' +language: 'python' +vendor: 'huawei' +raw_id: 'VideoAudioExtract' +version: '1.0.0' +types: + - 'cleaning' +modal: 'video' +effect: + before: '' + after: '' +inputs: 'video' +outputs: 'audio' \ No newline at end of file diff --git a/runtime/ops/mapper/video_audio_extract/process.py b/runtime/ops/mapper/video_audio_extract/process.py new file mode 100644 index 000000000..08ff6b482 --- /dev/null +++ b/runtime/ops/mapper/video_audio_extract/process.py @@ -0,0 +1,81 @@ +# -*- coding: utf-8 -*- +import os +import json +import shutil +import subprocess + +from .._video_common.paths import make_run_dir, ensure_dir +from .._video_common.log import get_logger + + +class VideoAudioExtract: + """从视频提取音频(wav 16k mono) + + params: + - ffmpeg_path: str, optional + - sample_rate: int, default 16000 + - channels: int, default 1 + - out_format: wav|aac, default wav + + outputs: + - artifacts/audio.wav (or audio.aac) + - artifacts/audio_info.json + """ + + @staticmethod + def execute(sample, params): + video_path = sample["filePath"] + export_path = sample.get("export_path", "./outputs") + + op_name = "video_audio_extract" + out_dir = make_run_dir(export_path, op_name) + log_dir = ensure_dir(os.path.join(out_dir, "logs")) + art_dir = ensure_dir(os.path.join(out_dir, "artifacts")) + + logger = get_logger(op_name, log_dir) + logger.info(f"video={video_path}") + logger.info(f"out_dir={out_dir}") + + ffmpeg_path = params.get("ffmpeg_path") or shutil.which("ffmpeg") + if not ffmpeg_path: + raise RuntimeError("ffmpeg not found. Please install ffmpeg or pass params.ffmpeg_path") + + sr = int(params.get("sample_rate", 16000)) + ch = int(params.get("channels", 1)) + out_format = (params.get("out_format", "wav") or "wav").lower() + + if out_format == "aac": + audio_path = os.path.join(art_dir, "audio.aac") + cmd = [ + ffmpeg_path, "-hide_banner", "-y", + "-i", video_path, + "-vn", + "-ac", str(ch), + "-ar", str(sr), + "-c:a", "aac", + audio_path + ] + else: + audio_path = os.path.join(art_dir, "audio.wav") + cmd = [ + ffmpeg_path, "-hide_banner", "-y", + "-i", video_path, + "-vn", + "-ac", str(ch), + "-ar", str(sr), + "-c:a", "pcm_s16le", + audio_path + ] + + logger.info("FFmpeg cmd: " + " ".join(cmd)) + p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if p.returncode != 0: + raise RuntimeError(f"FFmpeg failed (code={p.returncode}).\nSTDERR:\n{p.stderr}") + + info = {"audio_path": audio_path, "sample_rate": sr, "channels": ch, "format": out_format} + info_path = os.path.join(art_dir, "audio_info.json") + with open(info_path, "w", encoding="utf-8") as f: + json.dump(info, f, ensure_ascii=False, indent=2) + + logger.info(f"Done. 
audio={audio_path}") + return {"out_dir": out_dir, "audio_path": audio_path, "audio_info": info_path} \ No newline at end of file diff --git a/runtime/ops/mapper/video_classify_qwenvl/__init__.py b/runtime/ops/mapper/video_classify_qwenvl/__init__.py new file mode 100644 index 000000000..1a47cd17c --- /dev/null +++ b/runtime/ops/mapper/video_classify_qwenvl/__init__.py @@ -0,0 +1,6 @@ +from datamate.core.base_op import OPERATORS + +OPERATORS.register_module( + module_name="VideoClassifyQwenVL", + module_path="ops.mapper.video_classify_qwenvl.process", +) diff --git a/runtime/ops/mapper/video_classify_qwenvl/metadata.yml b/runtime/ops/mapper/video_classify_qwenvl/metadata.yml new file mode 100644 index 000000000..1f27ca1a9 --- /dev/null +++ b/runtime/ops/mapper/video_classify_qwenvl/metadata.yml @@ -0,0 +1,16 @@ +name: '视频分类(QwenVL)' +name_en: 'Video Classify (QwenVL)' +description: '抽帧调用 QwenVL classify25,多帧投票输出分类结果 classification.json。' +description_en: 'Sample frames and call QwenVL classify25; vote to output classification.json.' +language: 'python' +vendor: 'huawei' +raw_id: 'VideoClassifyQwenVL' +version: '1.0.0' +types: + - 'annotation' +modal: 'video' +effect: + before: '' + after: '' +inputs: 'video' +outputs: 'text' \ No newline at end of file diff --git a/runtime/ops/mapper/video_classify_qwenvl/process.py b/runtime/ops/mapper/video_classify_qwenvl/process.py new file mode 100644 index 000000000..48a0a6082 --- /dev/null +++ b/runtime/ops/mapper/video_classify_qwenvl/process.py @@ -0,0 +1,114 @@ +# -*- coding: utf-8 -*- +import os +import json +import collections +import cv2 + +from .._video_common.paths import make_run_dir, ensure_dir +from .._video_common.log import get_logger +from .._video_common.io_video import get_video_info +from .._video_common.qwen_http_client import qwenvl_infer_by_image_path, save_frame_to_jpg + + +def _sample_frame_indices(total_frames: int, fps: float, sample_fps: float, max_frames: int): + if total_frames <= 0: + return [] + fps = float(fps) if fps else 25.0 + step = max(1, int(round(fps / max(float(sample_fps), 1e-6)))) + idxs = list(range(0, total_frames, step)) + if max_frames and len(idxs) > int(max_frames): + n = int(max_frames) + idxs = [idxs[int(i * (len(idxs) - 1) / max(1, n - 1))] for i in range(n)] + return idxs + + +class VideoClassifyQwenVL: + """ + 抽帧 + QwenVL HTTP 分类(对齐服务端 task=classify25): + 返回: {class_id, class_name, raw} + + params: + - service_url: 默认 http://127.0.0.1:18080 + - timeout_sec: 默认 180 + - sample_fps: 默认 1.0 + - max_frames: 默认 12 + - return_topk: 默认 3 + - max_new_tokens: 默认 16 + outputs: + - artifacts/classification.json + """ + + def execute(self, sample, params=None): + params = params or {} + video_path = sample["filePath"] + export_path = sample.get("export_path", "./outputs") + + out_dir = make_run_dir(export_path, "video_classify_qwenvl") + log_dir = ensure_dir(os.path.join(out_dir, "logs")) + art_dir = ensure_dir(os.path.join(out_dir, "artifacts")) + frames_dir = ensure_dir(os.path.join(art_dir, "frames")) + logger = get_logger("VideoClassifyQwenVL", log_dir) + + service_url = params.get("service_url", "http://127.0.0.1:18080") + timeout_sec = int(params.get("timeout_sec", 180)) + sample_fps = float(params.get("sample_fps", 1.0)) + max_frames = int(params.get("max_frames", 12)) + return_topk = int(params.get("return_topk", 3)) + max_new_tokens = int(params.get("max_new_tokens", 16)) + + fps, W, H, total_frames = get_video_info(video_path) + idxs = _sample_frame_indices(total_frames, fps, sample_fps, max_frames) + 
+ cap = cv2.VideoCapture(video_path) + if not cap.isOpened(): + raise RuntimeError(f"Cannot open video: {video_path}") + + votes = collections.Counter() + evidence = [] + + for idx in idxs: + cap.set(cv2.CAP_PROP_POS_FRAMES, idx) + ok, frame = cap.read() + if not ok: + continue + + frame_jpg = os.path.join(frames_dir, f"{idx:06d}.jpg") + save_frame_to_jpg(frame, frame_jpg) + + try: + res = qwenvl_infer_by_image_path( + image_path=frame_jpg, + task="classify25", + service_url=service_url, + max_new_tokens=max_new_tokens, + timeout=timeout_sec, + ) + except Exception as e: + logger.error(f"classify infer failed frame={idx}: {repr(e)}") + continue + + class_name = (res.get("class_name") or "其他").strip() + class_id = int(res.get("class_id", 25)) + votes[class_name] += 1 + evidence.append({"frame_idx": idx, "image_path": frame_jpg, "class_id": class_id, "class_name": class_name}) + + cap.release() + + topk = [{"label": k, "vote": int(v)} for k, v in votes.most_common(return_topk)] + top1 = topk[0]["label"] if topk else "其他" + + result = { + "top1": top1, + "topk": topk, + "service_url": service_url, + "sample_fps": sample_fps, + "max_frames": max_frames, + "evidence": evidence, + } + + json_path = os.path.join(art_dir, "classification.json") + with open(json_path, "w", encoding="utf-8") as f: + json.dump(result, f, ensure_ascii=False, indent=2) + + logger.info(f"Done. classification_json={json_path}, top1={top1}") + return {"out_dir": out_dir, "classification_json": json_path, "top1": top1} \ No newline at end of file diff --git a/runtime/ops/mapper/video_deborder_crop/__init__.py b/runtime/ops/mapper/video_deborder_crop/__init__.py new file mode 100644 index 000000000..c8f8e4d07 --- /dev/null +++ b/runtime/ops/mapper/video_deborder_crop/__init__.py @@ -0,0 +1,6 @@ +from datamate.core.base_op import OPERATORS + +OPERATORS.register_module( + module_name="VideoDeborderCrop", + module_path="ops.mapper.video_deborder_crop.process", +) diff --git a/runtime/ops/mapper/video_deborder_crop/metadata.yml b/runtime/ops/mapper/video_deborder_crop/metadata.yml new file mode 100644 index 000000000..71ac6ea2f --- /dev/null +++ b/runtime/ops/mapper/video_deborder_crop/metadata.yml @@ -0,0 +1,16 @@ +name: '视频去黑边裁剪' +name_en: 'Video Deborder Crop' +description: '使用 ffmpeg cropdetect 自动检测黑边并裁剪输出 deborder.mp4;也支持 force_crop 指定裁剪框。' +description_en: 'Detect black borders via ffmpeg cropdetect and crop to output deborder.mp4; supports force_crop.' 
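上文 `video_classify_qwenvl` 的调用方式可以用下面的最小示例说明(假设性示例:文件路径与 `service_url` 均为占位值,且需要 QwenVL HTTP 服务已启动):

```python
# 最小调用草图(假设服务已在本机 18080 端口启动,demo.mp4 为占位输入)
from ops.mapper.video_classify_qwenvl.process import VideoClassifyQwenVL

op = VideoClassifyQwenVL()
res = op.execute(
    {"filePath": "demo.mp4", "export_path": "./outputs"},
    {"service_url": "http://127.0.0.1:18080", "sample_fps": 1.0, "max_frames": 12},
)
print(res["top1"], res["classification_json"])  # 多帧投票后的 top1 与结果文件路径
```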
+language: 'python' +vendor: 'huawei' +raw_id: 'VideoDeborderCrop' +version: '1.0.0' +types: + - 'cleaning' +modal: 'video' +effect: + before: '' + after: '' +inputs: 'video' +outputs: 'video' \ No newline at end of file diff --git a/runtime/ops/mapper/video_deborder_crop/process.py b/runtime/ops/mapper/video_deborder_crop/process.py new file mode 100644 index 000000000..02e009243 --- /dev/null +++ b/runtime/ops/mapper/video_deborder_crop/process.py @@ -0,0 +1,212 @@ +# -*- coding: utf-8 -*- +import os +import re +import json +import shutil +import subprocess +from dataclasses import dataclass +from typing import List, Optional, Tuple + +from .._video_common.paths import make_run_dir, ensure_dir +from .._video_common.log import get_logger + + +@dataclass +class CropBox: + w: int + h: int + x: int + y: int + + def to_str(self) -> str: + return f"{self.w}:{self.h}:{self.x}:{self.y}" + + +def _even(x: int) -> int: + return x - (x % 2) + + +def _parse_cropdetect(stderr: str) -> List[CropBox]: + # ffmpeg cropdetect logs like: "crop=iw:ih:x:y" or "crop=1920:800:0:140" + boxes = [] + for line in stderr.splitlines(): + m = re.search(r"crop=(\d+):(\d+):(\d+):(\d+)", line) + if m: + w, h, x, y = map(int, m.groups()) + boxes.append(CropBox(w=w, h=h, x=x, y=y)) + return boxes + + +def _pick_box(boxes: List[CropBox], mode: str = "safe_keep_more") -> Optional[CropBox]: + """ + mode: + - safe_keep_more: 尽量少裁(更保守,避免误裁内容)=> 取 w/h 最大 + x/y 最小 + - aggressive_remove_more: 尽量多裁黑边 => 取 w/h 最小 + x/y 最大 + - median: 取中位数 + """ + if not boxes: + return None + + ws = sorted(b.w for b in boxes) + hs = sorted(b.h for b in boxes) + xs = sorted(b.x for b in boxes) + ys = sorted(b.y for b in boxes) + + if mode == "aggressive_remove_more": + w, h, x, y = min(ws), min(hs), max(xs), max(ys) + elif mode == "median": + mid = len(ws) // 2 + w, h, x, y = ws[mid], hs[mid], xs[mid], ys[mid] + else: + # 默认:尽量少裁,避免裁掉内容 + w, h, x, y = max(ws), max(hs), min(xs), min(ys) + + # crop 参数通常要求偶数(编码器/像素格式更兼容) + return CropBox(w=_even(w), h=_even(h), x=_even(x), y=_even(y)) + + +def detect_crop_box( + ffmpeg_path: str, + video_path: str, + sample_points: List[Tuple[float, float]], + cropdetect: str, + logger, +) -> Optional[CropBox]: + """在多个时间点探测 crop,汇总后给出一个 crop box。""" + all_boxes: List[CropBox] = [] + for (ss, dur) in sample_points: + cmd = [ + ffmpeg_path, "-hide_banner", "-y", + "-ss", f"{ss}", + "-i", video_path, + "-t", f"{dur}", + "-vf", cropdetect, + "-f", "null", "-" + ] + logger.info("cropdetect cmd: " + " ".join(cmd)) + p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + # cropdetect 输出在 stderr;即使 returncode!=0 也可能有输出,所以不直接失败 + boxes = _parse_cropdetect(p.stderr) + if boxes: + # 取该段最后一个(通常更稳定) + all_boxes.append(boxes[-1]) + + # 汇总选择一个 box(默认保守:少裁) + return _pick_box(all_boxes, mode="safe_keep_more") + + +def crop_video( + ffmpeg_path: str, + video_path: str, + out_path: str, + crop: CropBox, + logger, + crf: int = 23, + preset: str = "veryfast", + audio_copy: bool = True, +): + # 裁剪会改变尺寸,必须重新编码视频;音频可以 copy + cmd = [ + ffmpeg_path, "-hide_banner", "-y", + "-i", video_path, + "-vf", f"crop={crop.to_str()}", + "-c:v", "libx264", + "-preset", preset, + "-crf", str(crf), + "-pix_fmt", "yuv420p", + ] + if audio_copy: + cmd += ["-c:a", "copy"] + else: + cmd += ["-c:a", "aac", "-b:a", "128k"] + + cmd += [out_path] + + logger.info("crop cmd: " + " ".join(cmd)) + p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if p.returncode != 0: + raise RuntimeError(f"ffmpeg crop 
failed (code={p.returncode}).\nSTDERR:\n{p.stderr}") + + +class VideoDeborderCrop: + """去黑边(自动 cropdetect + crop) + + params: + - ffmpeg_path: str, optional + - cropdetect: str, default "cropdetect=24:16:0" + - sample_points: list, optional + 默认会采样 [(0,2),(5,2)];如果视频很短也没关系 + - force_crop: str, optional # 直接指定 "w:h:x:y" + - crf: int, default 23 + - preset: str, default "veryfast" + - audio_copy: bool, default True + + outputs: + - artifacts/deborder.mp4 + - artifacts/crop_params.json + """ + + @staticmethod + def execute(sample, params): + video_path = sample["filePath"] + export_path = sample.get("export_path", "./outputs") + + op_name = "video_deborder_crop" + out_dir = make_run_dir(export_path, op_name) + log_dir = ensure_dir(os.path.join(out_dir, "logs")) + art_dir = ensure_dir(os.path.join(out_dir, "artifacts")) + logger = get_logger(op_name, log_dir) + + logger.info(f"video={video_path}") + logger.info(f"out_dir={out_dir}") + + ffmpeg_path = params.get("ffmpeg_path") or shutil.which("ffmpeg") + if not ffmpeg_path: + raise RuntimeError("ffmpeg not found. Please install ffmpeg or pass params.ffmpeg_path") + + cropdetect = params.get("cropdetect", "cropdetect=24:16:0") + force_crop = params.get("force_crop", None) + crf = int(params.get("crf", 23)) + preset = params.get("preset", "veryfast") + audio_copy = bool(params.get("audio_copy", True)) + + # 默认采样点:开头 2s + 5s 处 2s + sample_points = params.get("sample_points", None) + if not sample_points: + sample_points = [(0.0, 2.0), (5.0, 2.0)] + + crop_box: Optional[CropBox] = None + if force_crop: + m = re.match(r"(\d+):(\d+):(\d+):(\d+)", str(force_crop)) + if not m: + raise ValueError('force_crop must be "w:h:x:y"') + w, h, x, y = map(int, m.groups()) + crop_box = CropBox(w=_even(w), h=_even(h), x=_even(x), y=_even(y)) + else: + crop_box = detect_crop_box(ffmpeg_path, video_path, sample_points, cropdetect, logger) + + if not crop_box: + # 探测不到就原样输出(不裁剪) + logger.warning("cropdetect found nothing, keep original video.") + crop_box = CropBox(w=0, h=0, x=0, y=0) + + out_mp4 = os.path.join(art_dir, "deborder.mp4") + crop_json = os.path.join(art_dir, "crop_params.json") + + if crop_box.w == 0 or crop_box.h == 0: + # 直接复制(不裁) + cmd = [ffmpeg_path, "-hide_banner", "-y", "-i", video_path, "-c", "copy", out_mp4] + logger.info("copy cmd: " + " ".join(cmd)) + p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if p.returncode != 0: + raise RuntimeError(f"ffmpeg copy failed (code={p.returncode}).\nSTDERR:\n{p.stderr}") + info = {"mode": "copy", "crop": None, "out_mp4": out_mp4} + else: + crop_video(ffmpeg_path, video_path, out_mp4, crop_box, logger, crf=crf, preset=preset, audio_copy=audio_copy) + info = {"mode": "crop", "crop": crop_box.__dict__, "out_mp4": out_mp4} + + with open(crop_json, "w", encoding="utf-8") as f: + json.dump(info, f, ensure_ascii=False, indent=2) + + logger.info(f"Done. 
deborder_mp4={out_mp4}") + return {"out_dir": out_dir, "deborder_mp4": out_mp4, "crop_params_json": crop_json} \ No newline at end of file diff --git a/runtime/ops/mapper/video_event_tag_qwenvl/__init__.py b/runtime/ops/mapper/video_event_tag_qwenvl/__init__.py new file mode 100644 index 000000000..5ac7d0d8e --- /dev/null +++ b/runtime/ops/mapper/video_event_tag_qwenvl/__init__.py @@ -0,0 +1,6 @@ +from datamate.core.base_op import OPERATORS + +OPERATORS.register_module( + module_name="VideoEventTagQwenVL", + module_path="ops.mapper.video_event_tag_qwenvl.process", +) diff --git a/runtime/ops/mapper/video_event_tag_qwenvl/metadata.yml b/runtime/ops/mapper/video_event_tag_qwenvl/metadata.yml new file mode 100644 index 000000000..56d797f7e --- /dev/null +++ b/runtime/ops/mapper/video_event_tag_qwenvl/metadata.yml @@ -0,0 +1,16 @@ +name: '事件标注(QwenVL)' +name_en: 'Video Event Tag (QwenVL)' +description: '自适应分段取每段中点帧调用 QwenVL event_tag,输出 events.json。' +description_en: 'Adaptive segmentation; call QwenVL event_tag on mid-frame of each segment; outputs events.json.' +language: 'python' +vendor: 'huawei' +raw_id: 'VideoEventTagQwenVL' +version: '1.0.0' +types: + - 'annotation' +modal: 'video' +effect: + before: '' + after: '' +inputs: 'video' +outputs: 'text' \ No newline at end of file diff --git a/runtime/ops/mapper/video_event_tag_qwenvl/process.py b/runtime/ops/mapper/video_event_tag_qwenvl/process.py new file mode 100644 index 000000000..43974f276 --- /dev/null +++ b/runtime/ops/mapper/video_event_tag_qwenvl/process.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- +import os +import json +import cv2 + +from .._video_common.paths import make_run_dir, ensure_dir +from .._video_common.log import get_logger +from .._video_common.io_video import get_video_info +from .._video_common.qwen_http_client import qwenvl_infer_by_image_path, save_frame_to_jpg + + +def _make_segments(duration_sec: float, params: dict): + adaptive = bool(params.get("adaptive_segment", True)) + max_segments = int(params.get("max_segments", 60)) + max_new_tokens = int(params.get("max_new_tokens", 32)) + + if duration_sec <= 0: + return [(0.0, 0.0)] + + if not adaptive: + seg_len = float(params.get("segment_seconds", 5.0)) + else: + target = int(params.get("target_segments", 12)) + min_seg = float(params.get("min_segment_seconds", 2.0)) + max_seg = float(params.get("max_segment_seconds", 60.0)) + seg_len = duration_sec / max(1, target) + seg_len = max(min_seg, min(max_seg, seg_len)) + + segs = [] + s = 0.0 + while s < duration_sec and len(segs) < max_segments: + e = min(duration_sec, s + seg_len) + segs.append((s, e)) + s = e + return segs + + +class VideoEventTagQwenVL: + """ + 分段取中点帧 → QwenVL HTTP 事件标注(对齐服务端 task=event_tag): + 返回: {event} + + params: + - service_url: 默认 http://127.0.0.1:18080 + - timeout_sec: 默认 180 + - adaptive_segment: 默认 True + - target_segments: 默认 12 + - min_segment_seconds: 默认 2.0 + - max_segment_seconds: 默认 60.0 + - segment_seconds: 默认 5.0(当 adaptive_segment=false 时) + - max_segments: 默认 60 + - max_new_tokens: 默认 32 + outputs: + - artifacts/events.json + - artifacts/frames/*.jpg + """ + + def execute(self, sample, params=None): + params = params or {} + video_path = sample["filePath"] + export_path = sample.get("export_path", "./outputs") + + out_dir = make_run_dir(export_path, "video_event_tag_qwenvl") + log_dir = ensure_dir(os.path.join(out_dir, "logs")) + art_dir = ensure_dir(os.path.join(out_dir, "artifacts")) + frames_dir = ensure_dir(os.path.join(art_dir, "frames")) + logger = 
get_logger("VideoEventTagQwenVL", log_dir) + + service_url = params.get("service_url", "http://127.0.0.1:18080") + timeout_sec = int(params.get("timeout_sec", 180)) + max_new_tokens = int(params.get("max_new_tokens", 32)) + + fps, W, H, total_frames = get_video_info(video_path) + duration_sec = (float(total_frames) / float(fps)) if fps else 0.0 + segs = _make_segments(duration_sec, params) + + cap = cv2.VideoCapture(video_path) + if not cap.isOpened(): + raise RuntimeError(f"Cannot open video: {video_path}") + + events = [] + for i, (s, e) in enumerate(segs): + mid = (s + e) / 2.0 + mid_frame = int(round(mid * float(fps))) if fps else 0 + cap.set(cv2.CAP_PROP_POS_FRAMES, mid_frame) + ok, frame = cap.read() + if not ok: + continue + + frame_jpg = os.path.join(frames_dir, f"seg_{i:04d}_mid_{mid_frame:06d}.jpg") + save_frame_to_jpg(frame, frame_jpg) + + try: + res = qwenvl_infer_by_image_path( + image_path=frame_jpg, + task="event_tag", + service_url=service_url, + max_new_tokens=max_new_tokens, + timeout=timeout_sec, + ) + event = (res.get("event") or "").strip() + except Exception as ex: + logger.error(f"event_tag infer failed seg={i} mid={mid:.2f}: {repr(ex)}") + event = "" + + events.append( + { + "seg_id": i, + "start": float(s), + "end": float(e), + "mid": float(mid), + "mid_frame": int(mid_frame), + "image_path": frame_jpg, + "event": event, + } + ) + + cap.release() + + out_json = os.path.join(art_dir, "events.json") + with open(out_json, "w", encoding="utf-8") as f: + json.dump( + { + "video": video_path, + "service_url": service_url, + "duration_sec": duration_sec, + "segments": events, + }, + f, + ensure_ascii=False, + indent=2, + ) + + logger.info(f"Done. events_json={out_json}, segments={len(events)}") + return {"out_dir": out_dir, "events_json": out_json, "segments_count": len(events)} \ No newline at end of file diff --git a/runtime/ops/mapper/video_format_convert/__init__.py b/runtime/ops/mapper/video_format_convert/__init__.py new file mode 100644 index 000000000..62f6d4509 --- /dev/null +++ b/runtime/ops/mapper/video_format_convert/__init__.py @@ -0,0 +1,6 @@ +from datamate.core.base_op import OPERATORS + +OPERATORS.register_module( + module_name="VideoFormatConvert", + module_path="ops.mapper.video_format_convert.process", +) diff --git a/runtime/ops/mapper/video_format_convert/metadata.yml b/runtime/ops/mapper/video_format_convert/metadata.yml new file mode 100644 index 000000000..ed1e3f2ad --- /dev/null +++ b/runtime/ops/mapper/video_format_convert/metadata.yml @@ -0,0 +1,16 @@ +name: '视频格式转换' +name_en: 'Video Format Convert' +description: '仅做容器格式转换(stream copy,不重编码);输出 converted.xxx 与 convert_result.json。' +description_en: 'Container remux via ffmpeg stream copy (no re-encode); outputs converted.xxx and convert_result.json.' 
+language: 'python' +vendor: 'huawei' +raw_id: 'VideoFormatConvert' +version: '1.0.0' +types: + - 'cleaning' +modal: 'video' +effect: + before: '' + after: '' +inputs: 'video' +outputs: 'video' \ No newline at end of file diff --git a/runtime/ops/mapper/video_format_convert/process.py b/runtime/ops/mapper/video_format_convert/process.py new file mode 100644 index 000000000..3a4ce5522 --- /dev/null +++ b/runtime/ops/mapper/video_format_convert/process.py @@ -0,0 +1,97 @@ +# -*- coding: utf-8 -*- +import os +import json + +from .._video_common.paths import make_run_dir +from .._video_common.log import get_logger +from .._video_common.ffmpeg import run_cmd + + +class VideoFormatConvert: + """ + 仅做“容器格式转换”(不重编码): + - 通过 ffmpeg stream copy 实现:-c:v copy -c:a copy + - 输出文件后缀决定目标容器格式:mp4/mkv/mov/avi/wmv... + + 输入: + sample["filePath"] + sample["export_path"] + + params: + - container: 目标容器后缀(默认 "mp4") + - out_name: 输出文件名(默认 "converted.{container}") + - copy_video: 是否 copy 视频流(默认 True) + - copy_audio: 是否 copy 音频流(默认 True) + - extra_args: 额外 ffmpeg 参数列表(可选) + + 输出: + out_dir/converted.xxx + out_dir/convert_result.json + out_dir/run.log + """ + + def execute(self, sample: dict, params: dict = None): + params = params or {} + in_path = sample["filePath"] + export_path = sample["export_path"] + + out_dir = make_run_dir(export_path, "video_format_convert") + logger = get_logger("VideoFormatConvert", log_dir=out_dir) + + # 目标容器 + container = str(params.get("container", "mp4")).lstrip(".").lower() + out_name = params.get("out_name", f"converted.{container}") + if not out_name.lower().endswith(f".{container}"): + # 防止用户给了不匹配的后缀 + out_name = f"{out_name}.{container}" + out_video = os.path.join(out_dir, out_name) + + copy_video = bool(params.get("copy_video", True)) + copy_audio = bool(params.get("copy_audio", True)) + extra_args = params.get("extra_args", None) # list[str] or None + + logger.info(f"Start container convert (stream copy). in={in_path}, out={out_video}, container={container}") + + cmd = ["ffmpeg", "-y", "-i", in_path] + + # 视频流 + cmd += ["-c:v", "copy" if copy_video else "libx264"] + # 音频流 + cmd += ["-c:a", "copy" if copy_audio else "aac"] + + # 如果用户传了额外参数(例如 -map 0、-movflags +faststart 等) + if extra_args: + if not isinstance(extra_args, list): + raise ValueError("params['extra_args'] must be a list, e.g. ['-movflags', '+faststart']") + cmd += extra_args + + cmd += [out_video] + + try: + run_cmd(cmd, logger=logger) + except Exception as e: + # 给更明确的提示:某些容器不支持某些编码,copy 会失败 + logger.error("Container convert failed. This is usually due to codec/container incompatibility when using stream copy.") + logger.error("You can either choose a different container, or enable re-encode (copy_video/copy_audio=False).") + raise + + result = { + "out_dir": out_dir, + "input": in_path, + "output_video": out_video, + "mode": "stream_copy", + "params": { + "container": container, + "out_name": out_name, + "copy_video": copy_video, + "copy_audio": copy_audio, + "extra_args": extra_args, + }, + } + + json_path = os.path.join(out_dir, "convert_result.json") + with open(json_path, "w", encoding="utf-8") as f: + json.dump(result, f, ensure_ascii=False, indent=2) + + logger.info(f"Done. 
output={out_video}") + return result \ No newline at end of file diff --git a/runtime/ops/mapper/video_keyframe_extract/__init__.py b/runtime/ops/mapper/video_keyframe_extract/__init__.py new file mode 100644 index 000000000..be2c3c378 --- /dev/null +++ b/runtime/ops/mapper/video_keyframe_extract/__init__.py @@ -0,0 +1,6 @@ +from datamate.core.base_op import OPERATORS + +OPERATORS.register_module( + module_name="VideoKeyframeExtract", + module_path="ops.mapper.video_keyframe_extract.process", +) diff --git a/runtime/ops/mapper/video_keyframe_extract/metadata.yml b/runtime/ops/mapper/video_keyframe_extract/metadata.yml new file mode 100644 index 000000000..2779b28e7 --- /dev/null +++ b/runtime/ops/mapper/video_keyframe_extract/metadata.yml @@ -0,0 +1,16 @@ +name: '关键帧提取' +name_en: 'Video Keyframe Extract' +description: '基于 ffmpeg scene detect 提取关键帧,并可补封面帧(t=0),输出 keyframes.json 与关键帧图片。' +description_en: 'Extract keyframes via ffmpeg scene detect and optional cover frame (t=0); outputs keyframes.json and images.' +language: 'python' +vendor: 'huawei' +raw_id: 'VideoKeyframeExtract' +version: '1.0.0' +types: + - 'annotation' +modal: 'video' +effect: + before: '' + after: '' +inputs: 'video' +outputs: 'image' \ No newline at end of file diff --git a/runtime/ops/mapper/video_keyframe_extract/process.py b/runtime/ops/mapper/video_keyframe_extract/process.py new file mode 100644 index 000000000..b2b795518 --- /dev/null +++ b/runtime/ops/mapper/video_keyframe_extract/process.py @@ -0,0 +1,229 @@ +import os +import json +import shutil +import subprocess +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple + + +def _run(cmd: List[str]) -> Tuple[int, str]: + p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + return p.returncode, (p.stderr or "") + (p.stdout or "") + + +def _ensure_dir(p: str): + os.makedirs(p, exist_ok=True) + + +def _list_jpgs(d: str) -> List[str]: + if not os.path.isdir(d): + return [] + xs = [os.path.join(d, x) for x in os.listdir(d) if x.lower().endswith(".jpg")] + xs.sort() + return xs + + +def _probe_duration(ffprobe_path: str, video_path: str) -> float: + # 尽量不用任何第三方库,直接 ffprobe + cmd = [ + ffprobe_path, "-v", "error", + "-show_entries", "format=duration", + "-of", "default=noprint_wrappers=1:nokey=1", + video_path + ] + rc, out = _run(cmd) + if rc != 0: + return 0.0 + try: + return float(out.strip().splitlines()[-1]) + except Exception: + return 0.0 + + +@dataclass +class KeyframeParams: + ffmpeg_path: str = "" + ffprobe_path: str = "" + scene_threshold: float = 0.3 + threshold_candidates: Optional[List[float]] = None + max_keyframes: int = 30 + min_interval_sec: float = 1.0 + always_include_first: bool = True + quality: int = 2 # -q:v + out_json_name: str = "keyframes.json" + + +class VideoKeyframeExtractLocal: + """ + 本地运行版:不依赖 datamate。 + 输出: + /artifacts/keyframes/cover.jpg (可选) + /artifacts/keyframes/%06d.jpg (scene 帧) + /artifacts/keyframes.json + """ + + def run(self, video_path: str, out_dir: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + p = KeyframeParams(**(params or {})) + + ffmpeg = p.ffmpeg_path or shutil.which("ffmpeg") + ffprobe = p.ffprobe_path or shutil.which("ffprobe") + if not ffmpeg: + raise RuntimeError("ffmpeg not found. Install ffmpeg or set ffmpeg_path.") + if not ffprobe: + raise RuntimeError("ffprobe not found. 
Install ffprobe or set ffprobe_path.") + + artifacts = os.path.join(out_dir, "artifacts") + key_dir = os.path.join(artifacts, "keyframes") + _ensure_dir(key_dir) + + duration = _probe_duration(ffprobe, video_path) + + outputs: List[Dict[str, Any]] = [] + + # 1) cover + cover_path = os.path.join(key_dir, "cover.jpg") + if p.always_include_first: + cmd = [ + ffmpeg, "-hide_banner", "-y", + "-ss", "0", + "-i", video_path, + "-frames:v", "1", + "-q:v", str(p.quality), + "-vf", "format=yuvj420p", + cover_path + ] + rc, log = _run(cmd) + if rc == 0 and os.path.exists(cover_path): + outputs.append({"kind": "cover", "time_sec": 0.0, "path": cover_path}) + else: + # cover 失败不致命 + pass + + # 2) scene keyframes + thr_candidates = p.threshold_candidates or [p.scene_threshold, 0.2, 0.15, 0.1, 0.06] + scene_files: List[str] = [] + used_thr: Optional[float] = None + + for thr in thr_candidates: + # 清掉旧的 scene 输出(保留 cover) + for f in _list_jpgs(key_dir): + if os.path.basename(f) != "cover.jpg": + try: + os.remove(f) + except Exception: + pass + + vf = f"select='gt(scene,{thr})',format=yuvj420p" + out_tpl = os.path.join(key_dir, "%06d.jpg") + + # 兼容新旧 ffmpeg + cmd = [ + ffmpeg, "-hide_banner", "-y", + "-i", video_path, + "-vf", vf, + "-q:v", str(p.quality), + "-frames:v", str(p.max_keyframes * 3), + "-fps_mode", "vfr", + out_tpl + ] + rc, log = _run(cmd) + if rc != 0 and "Unrecognized option 'fps_mode'" in log: + cmd = [ + ffmpeg, "-hide_banner", "-y", + "-i", video_path, + "-vf", vf, + "-q:v", str(p.quality), + "-frames:v", str(p.max_keyframes * 3), + "-vsync", "vfr", + out_tpl + ] + rc, log = _run(cmd) + + files = [f for f in _list_jpgs(key_dir) if os.path.basename(f) != "cover.jpg"] + if files: + scene_files = files + used_thr = thr + break + + # 3) fallback:scene=0 时取中间帧 + if not scene_files: + t = duration / 2.0 if duration > 0 else 0.0 + mid_path = os.path.join(key_dir, "000001.jpg") + cmd = [ + ffmpeg, "-hide_banner", "-y", + "-ss", f"{t}", + "-i", video_path, + "-frames:v", "1", + "-q:v", str(p.quality), + "-vf", "format=yuvj420p", + mid_path + ] + rc, log = _run(cmd) + if rc != 0 or (not os.path.exists(mid_path)): + raise RuntimeError(f"KeyframeExtractLocal failed: scene=0 and fallback midframe failed. 
log={log[-800:]}") + scene_files = [mid_path] + used_thr = None + + # 4) 时间间隔过滤 + 截断 max_keyframes + # 这里用“均匀估计”时间戳(不解析 showinfo),足够用于过滤过密 + if duration > 0 and len(scene_files) > 1: + kept: List[Tuple[float, str]] = [] + last_t = -1e9 + for i, f in enumerate(scene_files): + t = (i / max(1, (len(scene_files) - 1))) * duration + if t - last_t >= p.min_interval_sec: + kept.append((t, f)) + last_t = t + if len(kept) >= p.max_keyframes: + break + for t, f in kept: + outputs.append({"kind": "scene", "time_sec": float(t), "path": f}) + else: + for f in scene_files[:p.max_keyframes]: + outputs.append({"kind": "scene", "time_sec": None, "path": f}) + + out_json = os.path.join(artifacts, p.out_json_name) + payload = { + "input": video_path, + "out_dir": out_dir, + "scene_threshold": p.scene_threshold, + "used_scene_threshold": used_thr, + "max_keyframes": p.max_keyframes, + "min_interval_sec": p.min_interval_sec, + "always_include_first": p.always_include_first, + "keyframes": outputs, + } + with open(out_json, "w", encoding="utf-8") as f: + json.dump(payload, f, ensure_ascii=False, indent=2) + + return { + "out_dir": out_dir, + "keyframes_json": out_json, + "keyframes_dir": key_dir, + } + + +if __name__ == "__main__": + import argparse + + ap = argparse.ArgumentParser() + ap.add_argument("--video", required=True) + ap.add_argument("--out_dir", required=True) + ap.add_argument("--scene_threshold", type=float, default=0.15) + ap.add_argument("--max_keyframes", type=int, default=30) + ap.add_argument("--min_interval_sec", type=float, default=1.0) + ap.add_argument("--always_include_first", action="store_true") + args = ap.parse_args() + + runner = VideoKeyframeExtractLocal() + res = runner.run( + video_path=args.video, + out_dir=args.out_dir, + params={ + "scene_threshold": args.scene_threshold, + "max_keyframes": args.max_keyframes, + "min_interval_sec": args.min_interval_sec, + "always_include_first": bool(args.always_include_first), + }, + ) + print(json.dumps(res, ensure_ascii=False, indent=2)) \ No newline at end of file diff --git a/runtime/ops/mapper/video_mot_track/__init__.py b/runtime/ops/mapper/video_mot_track/__init__.py new file mode 100644 index 000000000..9d9954c8e --- /dev/null +++ b/runtime/ops/mapper/video_mot_track/__init__.py @@ -0,0 +1,6 @@ +from datamate.core.base_op import OPERATORS + +OPERATORS.register_module( + module_name="VideoMotTrack", + module_path="ops.mapper.video_mot_track.process", +) diff --git a/runtime/ops/mapper/video_mot_track/configs/bytetrack.yaml b/runtime/ops/mapper/video_mot_track/configs/bytetrack.yaml new file mode 100644 index 000000000..c8cdead2e --- /dev/null +++ b/runtime/ops/mapper/video_mot_track/configs/bytetrack.yaml @@ -0,0 +1,7 @@ +tracker_type: bytetrack +track_high_thresh: 0.25 +track_low_thresh: 0.1 +new_track_thresh: 0.25 +track_buffer: 30 +match_thresh: 0.8 +fuse_score: True \ No newline at end of file diff --git a/runtime/ops/mapper/video_mot_track/metadata.yml b/runtime/ops/mapper/video_mot_track/metadata.yml new file mode 100644 index 000000000..24c0ff529 --- /dev/null +++ b/runtime/ops/mapper/video_mot_track/metadata.yml @@ -0,0 +1,16 @@ +name: '多目标跟踪' +name_en: 'Video MOT Track' +description: '基于检测+跟踪生成轨迹文件 tracks.json,并输出 debug.mp4 用于可视化验收。' +description_en: 'Run detection+tracking to generate tracks.json and debug.mp4 for visualization.' 
+language: 'python' +vendor: 'huawei' +raw_id: 'VideoMotTrack' +version: '1.0.0' +types: + - 'annotation' +modal: 'video' +effect: + before: '' + after: '' +inputs: 'video' +outputs: 'text' \ No newline at end of file diff --git a/runtime/ops/mapper/video_mot_track/process.py b/runtime/ops/mapper/video_mot_track/process.py new file mode 100644 index 000000000..b3a1010c3 --- /dev/null +++ b/runtime/ops/mapper/video_mot_track/process.py @@ -0,0 +1,124 @@ +# -*- coding: utf-8 -*- +import os +import json +import cv2 +import shutil + +from ultralytics import YOLO + +from .._video_common.paths import make_run_dir, ensure_dir +from .._video_common.log import get_logger +from .._video_common.io_video import get_video_info +from .._video_common.schema import init_tracks_schema +from .._video_common.model_paths import resolve_model_path + + +class VideoMotTrack: + """多目标跟踪(YOLO + ByteTrack) + + 权重策略(模型仓): + DATAMATE_MODEL_ROOT=/mnt/models + 默认权重:/mnt/models/yolo/yolov8n.pt + params: + - model_root: 可选,覆盖 DATAMATE_MODEL_ROOT + - yolo_model: 可选,权重路径(相对/绝对均可) + - conf: default 0.3 + - iou: default 0.5 + - classes: "0,2,3" or None + - tracker_cfg: bytetrack yaml 路径(默认算子 configs/bytetrack.yaml) + - save_debug: default True + outputs: + - tracks.json + - debug.mp4 (optional) + """ + + def execute(self, sample: dict, params: dict = None): + params = params or {} + video_path = sample["filePath"] + export_path = sample.get("export_path", "./outputs") + + out_dir = make_run_dir(export_path, "video_mot_track") + log_dir = ensure_dir(os.path.join(out_dir, "logs")) + art_dir = ensure_dir(os.path.join(out_dir, "artifacts")) + logger = get_logger("VideoMotTrack", log_dir) + + # YOLO config dir(避免写到不可写目录) + os.environ.setdefault("YOLO_CONFIG_DIR", os.path.join(out_dir, "yolo_cfg")) + os.makedirs(os.environ["YOLO_CONFIG_DIR"], exist_ok=True) + + # ✅ 模型仓默认权重 + yolo_model = resolve_model_path(params, "yolo_model", "yolo/yolov8n.pt") + + conf = float(params.get("conf", 0.3)) + iou = float(params.get("iou", 0.5)) + classes = params.get("classes", None) # "0,2,3" or None + tracker_cfg = params.get("tracker_cfg", os.path.join(os.path.dirname(__file__), "configs/bytetrack.yaml")) + save_debug = bool(params.get("save_debug", True)) + + cls_list = None + if classes: + cls_list = [int(x.strip()) for x in str(classes).split(",") if x.strip() != ""] + + fps, W, H, _ = get_video_info(video_path) + tracks = init_tracks_schema(video_path, fps, W, H) + + debug_path = os.path.join(art_dir, "debug.mp4") + debug_writer = None + if save_debug: + fourcc = cv2.VideoWriter_fourcc(*"mp4v") + debug_writer = cv2.VideoWriter(debug_path, fourcc, fps, (W, H)) + + logger.info(f"Start tracking. video={video_path}, model={yolo_model}, conf={conf}, iou={iou}, classes={classes}") + if not os.path.exists(yolo_model): + raise RuntimeError(f"YOLO weight not found: {yolo_model}. 
Please download to model repo path.") + + model = YOLO(yolo_model) + results_iter = model.track( + source=video_path, + conf=conf, + iou=iou, + classes=cls_list, + tracker=tracker_cfg, + stream=True, + verbose=False, + ) + + frame_idx = 0 + for r in results_iter: + frame = r.orig_img + objs = [] + if r.boxes is not None and r.boxes.id is not None: + ids = r.boxes.id.cpu().numpy().tolist() + xyxy = r.boxes.xyxy.cpu().numpy().tolist() + confs = r.boxes.conf.cpu().numpy().tolist() + clss = r.boxes.cls.cpu().numpy().tolist() + for tid, bb, sc, cc in zip(ids, xyxy, confs, clss): + x1, y1, x2, y2 = bb + objs.append({ + "track_id": int(tid), + "bbox": [float(x1), float(y1), float(x2), float(y2)], + "score": float(sc), + "cls_id": int(cc), + }) + if debug_writer is not None: + cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0,255,0), 2) + cv2.putText(frame, f"id={int(tid)}", (int(x1), int(y1)-5), + cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0,255,0), 2) + tracks["frames"].append({"frame_idx": frame_idx, "objects": objs}) + + if debug_writer is not None: + debug_writer.write(frame) + frame_idx += 1 + + if debug_writer is not None: + debug_writer.release() + + tracks_path = os.path.join(art_dir, "tracks.json") + with open(tracks_path, "w", encoding="utf-8") as f: + json.dump(tracks, f, ensure_ascii=False, indent=2) + + logger.info(f"Done. tracks_json={tracks_path}") + out = {"out_dir": out_dir, "tracks_json": tracks_path} + if save_debug: + out["debug_mp4"] = debug_path + return out \ No newline at end of file diff --git a/runtime/ops/mapper/video_sensitive_crop/__init__.py b/runtime/ops/mapper/video_sensitive_crop/__init__.py new file mode 100644 index 000000000..ff8912df0 --- /dev/null +++ b/runtime/ops/mapper/video_sensitive_crop/__init__.py @@ -0,0 +1,6 @@ +from datamate.core.base_op import OPERATORS + +OPERATORS.register_module( + module_name="VideoSensitiveCrop", + module_path="ops.mapper.video_sensitive_crop.process", +) diff --git a/runtime/ops/mapper/video_sensitive_crop/metadata.yml b/runtime/ops/mapper/video_sensitive_crop/metadata.yml new file mode 100644 index 000000000..403983306 --- /dev/null +++ b/runtime/ops/mapper/video_sensitive_crop/metadata.yml @@ -0,0 +1,16 @@ +name: '视频敏感裁剪' +name_en: 'Video Sensitive Crop' +description: '根据敏感片段 JSON 裁剪/清洗输出 cleaned.mp4,并生成 crop_result.json。' +description_en: 'Crop/clean video based on sensitive segments JSON; outputs cleaned.mp4 and crop_result.json.' 
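
The `tracks.json` written above is plain JSON and easy to consume downstream; a small reading sketch (the path is a placeholder, the keys match the objects appended in the tracking loop):

```python
import json

# Iterate per-frame tracking results from tracks.json.
with open("tracks.json", "r", encoding="utf-8") as f:
    tracks = json.load(f)
for fr in tracks["frames"]:
    for obj in fr["objects"]:
        x1, y1, x2, y2 = obj["bbox"]
        print(fr["frame_idx"], obj["track_id"], obj["cls_id"],
              round(obj["score"], 2), int(x2 - x1), int(y2 - y1))
```
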
+language: 'python' +vendor: 'huawei' +raw_id: 'VideoSensitiveCrop' +version: '1.0.0' +types: + - 'cleaning' +modal: 'video' +effect: + before: '' + after: '' +inputs: 'video' +outputs: 'video' \ No newline at end of file diff --git a/runtime/ops/mapper/video_sensitive_crop/process.py b/runtime/ops/mapper/video_sensitive_crop/process.py new file mode 100644 index 000000000..154c1f375 --- /dev/null +++ b/runtime/ops/mapper/video_sensitive_crop/process.py @@ -0,0 +1,150 @@ +# -*- coding: utf-8 -*- +import os +import json + +from .._video_common.paths import make_run_dir +from .._video_common.log import get_logger +from .._video_common.io_video import get_video_info +from .._video_common.ffmpeg import cut_segment, concat_segments +from ..video_sensitive_detect.process import VideoSensitiveDetect + + +def complement_intervals(segments, duration): + if not segments: + return [[0.0, duration]] + + segs = sorted([(float(x["start"]), float(x["end"])) for x in segments], key=lambda x: x[0]) + + # merge + merged = [] + cs, ce = segs[0] + for s, e in segs[1:]: + if s <= ce: + ce = max(ce, e) + else: + merged.append([cs, ce]) + cs, ce = s, e + merged.append([cs, ce]) + + keep = [] + prev = 0.0 + for s, e in merged: + s = max(0.0, min(duration, s)) + e = max(0.0, min(duration, e)) + if s > prev: + keep.append([prev, s]) + prev = max(prev, e) + if prev < duration: + keep.append([prev, duration]) + + return [[s, e] for s, e in keep if e - s >= 0.05] + + +class VideoSensitiveCrop: + """ + 敏感裁剪:默认 remove(剔除敏感段) + params: + - segments_json: 必填(video_sensitive_detect 输出) + - keep_mode: "remove" 或 "keep"(默认 remove) + - out_name: 默认 cleaned.mp4 + """ + def execute(self, sample: dict, params: dict = None): + params = params or {} + video_path = sample["filePath"] + export_path = sample["export_path"] + + out_dir = make_run_dir(export_path, "video_sensitive_crop") + logger = get_logger("VideoSensitiveCrop", log_dir=out_dir) + + + segments_json = params.get("segments_json", "") + # 如果没传 segments_json,就自动先跑 VideoSensitiveDetect 生成 + if (not segments_json) or (not os.path.exists(segments_json)): + # detect_params 优先从 params["detect_params"] 读取;否则从当前 params 里抽取 detect 所需字段 + detect_params = params.get("detect_params", None) + if detect_params is None: + detect_keys = ["qwen_module", "qwen_func", "sample_fps", "threshold", "merge_gap", "prompt"] + detect_params = {k: params[k] for k in detect_keys if k in params} + + # VideoSensitiveDetect 里 qwen_module 是必填的;没给就明确报错(避免你后面裁剪时不知道为什么没生成) + if "qwen_module" not in detect_params: + raise RuntimeError( + "VideoSensitiveCrop: segments_json not provided, and detect_params missing required 'qwen_module'. " + "Please pass params['qwen_module'] (and optional qwen_func/sample_fps/threshold/merge_gap)." 
+ ) + + logger.info("segments_json not provided; run VideoSensitiveDetect first to generate sensitive_segments.json") + det_out = VideoSensitiveDetect().execute(sample, detect_params) + + # 兼容不同返回 key:尽量从 det_out 中找出 json 路径 + for key in [ + "segments_json", + "sensitive_segments_json", + "sensitive_segments_path", + "json_path", + "output_json", + ]: + if key in det_out and det_out[key] and os.path.exists(det_out[key]): + segments_json = det_out[key] + break + + # 如果 detect 没把路径通过 return 带出来,就回退到 out_dir 默认文件名(你的 detect 默认写 sensitive_segments.json) + if (not segments_json) or (not os.path.exists(segments_json)): + fallback = os.path.join(out_dir, "sensitive_segments.json") + if os.path.exists(fallback): + segments_json = fallback + + if (not segments_json) or (not os.path.exists(segments_json)): + raise RuntimeError("VideoSensitiveCrop: failed to obtain sensitive segments json from detect step.") + + + + + keep_mode = params.get("keep_mode", "remove") + out_name = params.get("out_name", "cleaned.mp4") + out_video = os.path.join(out_dir, out_name) + + det = json.load(open(segments_json, "r", encoding="utf-8")) + segments = det.get("segments", []) + + fps, W, H, nframes = get_video_info(video_path) + duration = nframes / float(fps) if fps > 0 else 0.0 + + if keep_mode == "remove": + keep_intervals = complement_intervals(segments, duration) + elif keep_mode == "keep": + keep_intervals = [[float(x["start"]), float(x["end"])] for x in segments] + else: + raise ValueError("keep_mode must be 'remove' or 'keep'") + + logger.info(f"Start crop. mode={keep_mode}, keep_intervals={len(keep_intervals)}, duration={duration:.2f}s") + + if not keep_intervals: + logger.info("No intervals to keep. Copy original as output.") + cut_segment(video_path, out_video, 0.0, duration, logger=logger) + else: + seg_dir = os.path.join(out_dir, "segments") + os.makedirs(seg_dir, exist_ok=True) + + seg_files = [] + for i, (s, e) in enumerate(keep_intervals): + seg_path = os.path.join(seg_dir, f"seg_{i:04d}.mp4") + cut_segment(video_path, seg_path, s, e, logger=logger) + seg_files.append(seg_path) + + concat_segments(seg_files, out_video, logger=logger) + + result = { + "out_dir": out_dir, + "input": video_path, + "segments_json": segments_json, + "keep_mode": keep_mode, + "output_video": out_video, + "kept_intervals": keep_intervals, + } + json_path = os.path.join(out_dir, "crop_result.json") + with open(json_path, "w", encoding="utf-8") as f: + json.dump(result, f, ensure_ascii=False, indent=2) + + logger.info(f"Done. 
output={out_video}") + return result \ No newline at end of file diff --git a/runtime/ops/mapper/video_sensitive_detect/__init__.py b/runtime/ops/mapper/video_sensitive_detect/__init__.py new file mode 100644 index 000000000..c03c3c425 --- /dev/null +++ b/runtime/ops/mapper/video_sensitive_detect/__init__.py @@ -0,0 +1,6 @@ +from datamate.core.base_op import OPERATORS + +OPERATORS.register_module( + module_name="VideoSensitiveDetect", + module_path="ops.mapper.video_sensitive_detect.process", +) diff --git a/runtime/ops/mapper/video_sensitive_detect/metadata.yml b/runtime/ops/mapper/video_sensitive_detect/metadata.yml new file mode 100644 index 000000000..7f51a044f --- /dev/null +++ b/runtime/ops/mapper/video_sensitive_detect/metadata.yml @@ -0,0 +1,16 @@ +name: '视频敏感检测' +name_en: 'Video Sensitive Detect' +description: '抽帧+Qwen判定生成敏感片段 sensitive_segments.json(需要提供 qwen_module/qwen_func)。' +description_en: 'Sample frames and call Qwen inference to generate sensitive_segments.json (requires qwen_module/qwen_func).' +language: 'python' +vendor: 'huawei' +raw_id: 'VideoSensitiveDetect' +version: '1.0.0' +types: + - 'annotation' +modal: 'video' +effect: + before: '' + after: '' +inputs: 'video' +outputs: 'text' \ No newline at end of file diff --git a/runtime/ops/mapper/video_sensitive_detect/process.py b/runtime/ops/mapper/video_sensitive_detect/process.py new file mode 100644 index 000000000..a57070657 --- /dev/null +++ b/runtime/ops/mapper/video_sensitive_detect/process.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- +import os +import json +import cv2 + +from .._video_common.paths import make_run_dir, ensure_dir +from .._video_common.log import get_logger +from .._video_common.io_video import get_video_info +from .._video_common.qwen_http_client import qwenvl_infer_by_image_path, save_frame_to_jpg + + +def _merge_times_to_segments(times, gap=1.5, pad=0.5): + if not times: + return [] + times = sorted(times) + segs = [] + s = times[0] + prev = times[0] + for t in times[1:]: + if t - prev <= gap: + prev = t + else: + segs.append([max(0.0, s - pad), prev + pad]) + s = t + prev = t + segs.append([max(0.0, s - pad), prev + pad]) + return segs + + +class VideoSensitiveDetect: + """ + 抽帧 + QwenVL HTTP 敏感检测(对齐 qwen_vl_server.py): + + 服务端: + POST {service_url}/infer + JSON: {image_path, task="sensitive", max_new_tokens, language, style} + 返回: {is_sensitive,label,score,reason} + + params: + - service_url: 默认 http://127.0.0.1:18080 + - timeout_sec: 默认 180 + - sample_fps: 默认 1.0 + - threshold: 默认 0.5 + - merge_gap: 默认 1.5 + - pad_sec: 默认 0.5 + - max_new_tokens: 默认 8 + outputs: + - out_dir/sensitive_segments.json + """ + + def execute(self, sample: dict, params: dict = None): + params = params or {} + video_path = sample["filePath"] + export_path = sample.get("export_path", "./outputs") + + out_dir = make_run_dir(export_path, "video_sensitive_detect") + log_dir = ensure_dir(os.path.join(out_dir, "logs")) + art_dir = ensure_dir(os.path.join(out_dir, "artifacts")) + frames_dir = ensure_dir(os.path.join(art_dir, "frames")) + logger = get_logger("VideoSensitiveDetect", log_dir) + + service_url = params.get("service_url", "http://127.0.0.1:18080") + timeout_sec = int(params.get("timeout_sec", 180)) + sample_fps = float(params.get("sample_fps", 1.0)) + threshold = float(params.get("threshold", 0.5)) + merge_gap = float(params.get("merge_gap", 1.5)) + pad_sec = float(params.get("pad_sec", 0.5)) + max_new_tokens = int(params.get("max_new_tokens", 8)) + + fps, W, H, total_frames = get_video_info(video_path) + step 
= max(1, int(round(float(fps) / max(sample_fps, 1e-6)))) + + logger.info( + f"Start sensitive detect. video={video_path}, fps={fps}, step={step}, " + f"url={service_url}, thr={threshold}, gap={merge_gap}" + ) + + cap = cv2.VideoCapture(video_path) + if not cap.isOpened(): + raise RuntimeError(f"Cannot open video: {video_path}") + + hits = [] + sensitive_times = [] + + frame_id = 0 + while True: + ok, frame = cap.read() + if not ok: + break + + if frame_id % step != 0: + frame_id += 1 + continue + + t = frame_id / float(fps) if fps else 0.0 + frame_jpg = os.path.join(frames_dir, f"{frame_id:06d}.jpg") + save_frame_to_jpg(frame, frame_jpg) + + try: + res = qwenvl_infer_by_image_path( + image_path=frame_jpg, + task="sensitive", + service_url=service_url, + max_new_tokens=max_new_tokens, + timeout=timeout_sec, + ) + except Exception as e: + logger.error(f"infer failed at t={t:.2f}s frame={frame_id}: {repr(e)}") + frame_id += 1 + continue + + is_sensitive = bool(res.get("is_sensitive", False)) + score = float(res.get("score", 0.0)) + label = str(res.get("label", "none")) + reason = str(res.get("reason", "")) + + hits.append( + { + "time": t, + "frame_idx": frame_id, + "image_path": frame_jpg, + "is_sensitive": is_sensitive, + "label": label, + "score": score, + "reason": reason, + } + ) + + if is_sensitive and score >= threshold: + sensitive_times.append(t) + + frame_id += 1 + + cap.release() + + segs = _merge_times_to_segments(sensitive_times, gap=merge_gap, pad=pad_sec) + + result = { + "out_dir": out_dir, + "video": video_path, + "service_url": service_url, + "sample_fps": sample_fps, + "threshold": threshold, + "merge_gap": merge_gap, + "pad_sec": pad_sec, + "hits": hits, + "segments": [{"start": float(s), "end": float(e)} for s, e in segs], + } + + json_path = os.path.join(out_dir, "sensitive_segments.json") + with open(json_path, "w", encoding="utf-8") as f: + json.dump(result, f, ensure_ascii=False, indent=2) + + logger.info(f"Done. segments_json={json_path}, segments={len(segs)}, hits={len(hits)}") + return {"out_dir": out_dir, "segments_json": json_path, "segments_count": len(segs)} \ No newline at end of file diff --git a/runtime/ops/mapper/video_speech_asr/__init__.py b/runtime/ops/mapper/video_speech_asr/__init__.py new file mode 100644 index 000000000..cc00c40da --- /dev/null +++ b/runtime/ops/mapper/video_speech_asr/__init__.py @@ -0,0 +1,6 @@ +from datamate.core.base_op import OPERATORS + +OPERATORS.register_module( + module_name="VideoSpeechASR", + module_path="ops.mapper.video_speech_asr.process", +) diff --git a/runtime/ops/mapper/video_speech_asr/metadata.yml b/runtime/ops/mapper/video_speech_asr/metadata.yml new file mode 100644 index 000000000..0847fa477 --- /dev/null +++ b/runtime/ops/mapper/video_speech_asr/metadata.yml @@ -0,0 +1,16 @@ +name: '语音识别ASR' +name_en: 'Video Speech ASR' +description: '从视频抽取音频并进行语音识别,输出 asr.json(可含时间戳);支持指定语言/模型规模等参数。' +description_en: 'Extract audio and run ASR, outputs asr.json (with timestamps); supports language/model options.' 
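
A quick worked example of the hit-merging in `_merge_times_to_segments` above: hits closer than `gap` seconds collapse into one segment, and each segment is padded by `pad` on both sides (assumes the `ops` package is importable as registered in `__init__.py`):

```python
from ops.mapper.video_sensitive_detect.process import _merge_times_to_segments

# 3.0, 3.8 and 4.6 s are within gap=1.5 of their neighbors, so they merge;
# the isolated hit at 9.0 s becomes its own padded segment.
print(_merge_times_to_segments([3.0, 3.8, 4.6, 9.0], gap=1.5, pad=0.5))
# -> [[2.5, 5.1], [8.5, 9.5]]
```
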
+language: 'python' +vendor: 'huawei' +raw_id: 'VideoSpeechASR' +version: '1.0.0' +types: + - 'annotation' +modal: 'video' +effect: + before: '' + after: '' +inputs: 'video' +outputs: 'text' \ No newline at end of file diff --git a/runtime/ops/mapper/video_speech_asr/process.py b/runtime/ops/mapper/video_speech_asr/process.py new file mode 100644 index 000000000..711698995 --- /dev/null +++ b/runtime/ops/mapper/video_speech_asr/process.py @@ -0,0 +1,213 @@ +# -*- coding: utf-8 -*- +import os +import json +import shutil +import subprocess +import re + +from .._video_common.paths import make_run_dir, ensure_dir +from .._video_common.log import get_logger + + +def _write_srt(segments, srt_path): + def _fmt(t): + h = int(t // 3600) + m = int((t % 3600) // 60) + s = int(t % 60) + ms = int(round((t - int(t)) * 1000)) + return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}" + + with open(srt_path, "w", encoding="utf-8") as f: + for i, seg in enumerate(segments, 1): + f.write(str(i) + "\n") + f.write(f"{_fmt(seg['start'])} --> {_fmt(seg['end'])}\n") + f.write((seg.get("text") or "").strip() + "\n\n") + + +def _contains_cjk(s: str) -> bool: + return bool(re.search(r"[\u4e00-\u9fff]", s or "")) + + +def _to_simplified(text: str) -> str: + try: + from opencc import OpenCC + return OpenCC("t2s").convert(text) + except Exception: + return text + + +class VideoSpeechASR: + """语音转文字(优先 faster-whisper;失败自动回退 openai-whisper) + + params: + - ffmpeg_path: str, optional + - model: tiny|base|small|medium|large-v3, default small + - language: auto|zh|en, default zh + - beam_size: int, default 5 + - vad_filter: bool, default True + - compute_type: int8|int8_float16|float16|float32, default int8 + - sample_rate: int, default 16000 + - channels: int, default 1 + - max_audio_sec: float, optional + - zh_script: simplified|traditional|keep, default simplified + + # 离线/本地模型(faster-whisper) + - fw_model_path: str, optional # 本地模型路径(目录) + - fw_download_root: str, optional + - local_files_only: bool, default False + + outputs: + - artifacts/audio.wav + - artifacts/asr.json / asr.txt / asr.srt + - artifacts/asr_backend.json(记录用了哪个后端/异常信息) + """ + + @staticmethod + def execute(sample, params): + video_path = sample["filePath"] + export_path = sample.get("export_path", "./outputs") + + op_name = "video_speech_asr" + out_dir = make_run_dir(export_path, op_name) + log_dir = ensure_dir(os.path.join(out_dir, "logs")) + art_dir = ensure_dir(os.path.join(out_dir, "artifacts")) + + logger = get_logger(op_name, log_dir) + logger.info(f"video={video_path}") + logger.info(f"out_dir={out_dir}") + + ffmpeg_path = params.get("ffmpeg_path") or shutil.which("ffmpeg") + if not ffmpeg_path: + raise RuntimeError("ffmpeg not found. 
Please install ffmpeg or pass params.ffmpeg_path") + + sr = int(params.get("sample_rate", 16000)) + ch = int(params.get("channels", 1)) + max_audio_sec = params.get("max_audio_sec", None) + max_audio_sec = float(max_audio_sec) if max_audio_sec is not None else None + + audio_path = os.path.join(art_dir, "audio.wav") + cmd = [ + ffmpeg_path, "-hide_banner", "-y", + "-i", video_path, + "-vn", + "-ac", str(ch), + "-ar", str(sr), + "-c:a", "pcm_s16le", + ] + if max_audio_sec is not None and max_audio_sec > 0: + cmd += ["-t", f"{max_audio_sec}"] + cmd += [audio_path] + + logger.info("FFmpeg cmd: " + " ".join(cmd)) + p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if p.returncode != 0: + raise RuntimeError(f"FFmpeg failed (code={p.returncode}).\nSTDERR:\n{p.stderr}") + + model_name = (params.get("model", "small") or "small") + language = (params.get("language", "zh") or "zh").lower() + beam_size = int(params.get("beam_size", 5)) + vad_filter = bool(params.get("vad_filter", True)) + compute_type = (params.get("compute_type", "int8") or "int8") + zh_script = (params.get("zh_script", "simplified") or "simplified").lower() + + fw_model_path = params.get("fw_model_path", None) + fw_download_root = params.get("fw_download_root", None) + local_files_only = bool(params.get("local_files_only", False)) + + segments = [] + full_text = "" + backend_info = {"backend": None, "error": None} + + # ===== try faster-whisper ===== + try: + from faster_whisper import WhisperModel + backend_info["backend"] = "faster-whisper" + + # 离线策略:local_files_only 时,把 HF 的联网行为尽量关掉 + if local_files_only: + os.environ.setdefault("HF_HUB_OFFLINE", "1") + os.environ.setdefault("TRANSFORMERS_OFFLINE", "1") + + model_id = fw_model_path or model_name + logger.info(f"[ASR] faster-whisper load model={model_id}, compute_type={compute_type}, offline={local_files_only}") + + fw = WhisperModel( + model_id, + device="cpu", + compute_type=compute_type, + download_root=fw_download_root, + ) + + logger.info("[ASR] faster-whisper transcribe start...") + seg_iter, info = fw.transcribe( + audio_path, + language=None if language == "auto" else language, + beam_size=beam_size, + vad_filter=vad_filter, + ) + for s in seg_iter: + segments.append({"start": float(s.start), "end": float(s.end), "text": (s.text or "").strip()}) + full_text = " ".join([s["text"] for s in segments]).strip() + logger.info("[ASR] faster-whisper transcribe done.") + + except Exception as e: + # ===== fallback openai-whisper ===== + backend_info["backend"] = "openai-whisper" + backend_info["error"] = f"faster-whisper failed: {repr(e)}" + logger.warning("[ASR] faster-whisper failed, fallback openai-whisper. reason=" + repr(e)) + + try: + import whisper + except Exception as e2: + raise RuntimeError("ASR backend failed. 
Please install: pip install faster-whisper openai-whisper") from e2 + + logger.info(f"[ASR] openai-whisper load model={model_name} (slow on CPU)") + wmodel = whisper.load_model(model_name) + + wargs = {"fp16": False, "verbose": False} + if language != "auto": + wargs["language"] = language + + logger.info("[ASR] openai-whisper transcribe start...") + result = wmodel.transcribe(audio_path, **wargs) + logger.info("[ASR] openai-whisper transcribe done.") + + for seg in result.get("segments", []): + segments.append({ + "start": float(seg.get("start", 0.0)), + "end": float(seg.get("end", 0.0)), + "text": (seg.get("text") or "").strip() + }) + full_text = (result.get("text") or "").strip() + + # 简体化 + if zh_script == "simplified": + if _contains_cjk(full_text): + full_text = _to_simplified(full_text) + for s in segments: + if _contains_cjk(s["text"]): + s["text"] = _to_simplified(s["text"]) + + json_path = os.path.join(art_dir, "asr.json") + txt_path = os.path.join(art_dir, "asr.txt") + srt_path = os.path.join(art_dir, "asr.srt") + backend_path = os.path.join(art_dir, "asr_backend.json") + + with open(json_path, "w", encoding="utf-8") as f: + json.dump({"text": full_text, "segments": segments}, f, ensure_ascii=False, indent=2) + with open(txt_path, "w", encoding="utf-8") as f: + f.write(full_text + "\n") + _write_srt(segments, srt_path) + + with open(backend_path, "w", encoding="utf-8") as f: + json.dump(backend_info, f, ensure_ascii=False, indent=2) + + logger.info(f"Done. segments={len(segments)} asr_json={json_path}") + return { + "out_dir": out_dir, + "audio_wav": audio_path, + "asr_json": json_path, + "asr_txt": txt_path, + "asr_srt": srt_path, + "asr_backend": backend_path, + } \ No newline at end of file diff --git a/runtime/ops/mapper/video_subject_crop/__init__.py b/runtime/ops/mapper/video_subject_crop/__init__.py new file mode 100644 index 000000000..b4bc44bda --- /dev/null +++ b/runtime/ops/mapper/video_subject_crop/__init__.py @@ -0,0 +1,6 @@ +from datamate.core.base_op import OPERATORS + +OPERATORS.register_module( + module_name="VideoSubjectCrop", + module_path="ops.mapper.video_subject_crop.process", +) diff --git a/runtime/ops/mapper/video_subject_crop/metadata.yml b/runtime/ops/mapper/video_subject_crop/metadata.yml new file mode 100644 index 000000000..45107f9c5 --- /dev/null +++ b/runtime/ops/mapper/video_subject_crop/metadata.yml @@ -0,0 +1,16 @@ +name: '主体跟踪裁剪' +name_en: 'Video Subject Crop' +description: '根据 tracks.json 选择 Top1 主体轨迹并裁剪输出 subject.mp4,用于单主体验收链路。' +description_en: 'Select the top subject track from tracks.json and crop to output subject.mp4.' 
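
For reference, the timestamp math `_write_srt` uses above maps seconds to the SRT `HH:MM:SS,mmm` form:

```python
# 3725.5 s -> "01:02:05,500", matching _write_srt's _fmt helper.
t = 3725.5
h, m, s = int(t // 3600), int((t % 3600) // 60), int(t % 60)
ms = int(round((t - int(t)) * 1000))
print(f"{h:02d}:{m:02d}:{s:02d},{ms:03d}")
```
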
+language: 'python' +vendor: 'huawei' +raw_id: 'VideoSubjectCrop' +version: '1.0.0' +types: + - 'cleaning' +modal: 'video' +effect: + before: '' + after: '' +inputs: 'video' +outputs: 'video' \ No newline at end of file diff --git a/runtime/ops/mapper/video_subject_crop/process.py b/runtime/ops/mapper/video_subject_crop/process.py new file mode 100644 index 000000000..fb870c155 --- /dev/null +++ b/runtime/ops/mapper/video_subject_crop/process.py @@ -0,0 +1,175 @@ +# -*- coding: utf-8 -*- +import os +import json +import cv2 + +from .._video_common.paths import make_run_dir +from .._video_common.log import get_logger +from ..video_mot_track.process import VideoMotTrack + +def _bbox_area(b): + x1, y1, x2, y2 = b + return max(0.0, x2 - x1) * max(0.0, y2 - y1) + +def _select_top1_track(tracks: dict, min_frames: int = 10): + stats = {} # tid -> {"count":int, "area_sum":float} + for fr in tracks.get("frames", []): + for obj in fr.get("objects", []): + tid = int(obj["track_id"]) + area = _bbox_area(obj["bbox"]) + if tid not in stats: + stats[tid] = {"count": 0, "area_sum": 0.0} + stats[tid]["count"] += 1 + stats[tid]["area_sum"] += area + + items = [] + for tid, s in stats.items(): + if s["count"] < min_frames: + continue + avg_area = s["area_sum"] / max(1, s["count"]) + items.append((tid, s["count"], avg_area)) + + if not items: + return None + + items.sort(key=lambda x: (x[1], x[2]), reverse=True) + return int(items[0][0]) + +def _clamp(val, lo, hi): + return max(lo, min(hi, val)) + +def _ema(prev_bbox, bbox, alpha=0.8): + if prev_bbox is None: + return bbox + return [ + alpha*prev_bbox[0] + (1-alpha)*bbox[0], + alpha*prev_bbox[1] + (1-alpha)*bbox[1], + alpha*prev_bbox[2] + (1-alpha)*bbox[2], + alpha*prev_bbox[3] + (1-alpha)*bbox[3], + ] + +def _expand_bbox(bbox, margin, W, H): + x1, y1, x2, y2 = bbox + w = x2 - x1 + h = y2 - y1 + x1 = x1 - w * margin + y1 = y1 - h * margin + x2 = x2 + w * margin + y2 = y2 + h * margin + x1 = _clamp(int(x1), 0, W-1) + y1 = _clamp(int(y1), 0, H-1) + x2 = _clamp(int(x2), 0, W-1) + y2 = _clamp(int(y2), 0, H-1) + if x2 <= x1: x2 = min(W-1, x1+1) + if y2 <= y1: y2 = min(H-1, y1+1) + return [x1, y1, x2, y2] + +class VideoSubjectCrop: + """ + 主体追踪裁剪(Top1): + 输入: + - sample["filePath"] + - sample["export_path"] + - params["tracks_json"] (可选:不提供就自动找同一次 run 的 tracks.json) + 输出: + - subjects/subject.mp4 + - subjects/subject_track_id.txt + """ + def execute(self, sample: dict, params: dict = None): + params = params or {} + video_path = sample["filePath"] + export_path = sample["export_path"] + + out_dir = make_run_dir(export_path, "video_subject_crop") + logger = get_logger("VideoSubjectCrop", log_dir=out_dir) + + tracks_json = params.get("tracks_json", None) + if (not tracks_json) or (not os.path.exists(tracks_json)): + # 自动跑 MOT 生成 tracks.json + mot_params = params.get("mot_params", {}) # 可选:把 mot 的参数也透传进来 + logger.info("tracks_json not provided; run VideoMotTrack first to generate tracks.json") + mot_out = VideoMotTrack().execute(sample, mot_params) + tracks_json = mot_out["tracks_json"] + + crop_size = int(params.get("crop_size", 512)) + margin = float(params.get("margin", 0.15)) + smooth_alpha = float(params.get("smooth_alpha", 0.8)) + min_frames = int(params.get("min_frames", 10)) + fill_missing = bool(params.get("fill_missing", False)) + + with open(tracks_json, "r", encoding="utf-8") as f: + tracks = json.load(f) + + fps = float(tracks["fps"]) + W = int(tracks["width"]) + H = int(tracks["height"]) + + subject_id = _select_top1_track(tracks, min_frames=min_frames) + 
if subject_id is None: + raise RuntimeError(f"No valid subject track found (min_frames={min_frames}).") + + subjects_dir = os.path.join(out_dir, "subjects") + os.makedirs(subjects_dir, exist_ok=True) + + with open(os.path.join(subjects_dir, "subject_track_id.txt"), "w", encoding="utf-8") as f: + f.write(str(subject_id)) + + out_video = os.path.join(subjects_dir, "subject.mp4") + + cap = cv2.VideoCapture(video_path) + if not cap.isOpened(): + raise RuntimeError(f"Cannot open video: {video_path}") + + fourcc = cv2.VideoWriter_fourcc(*"mp4v") + writer = cv2.VideoWriter(out_video, fourcc, fps, (crop_size, crop_size)) + + last_bbox = None + frame_id = 0 + + logger.info(f"Start subject crop. subject_id={subject_id}, tracks={tracks_json}") + + while True: + ret, frame = cap.read() + if not ret: + break + + bbox = None + if frame_id < len(tracks.get("frames", [])): + objs = tracks["frames"][frame_id].get("objects", []) + for obj in objs: + if int(obj["track_id"]) == int(subject_id): + bbox = obj["bbox"] + break + + if bbox is None: + if fill_missing and last_bbox is not None: + bbox_s = last_bbox + else: + frame_id += 1 + continue + else: + bbox_s = _ema(last_bbox, bbox, alpha=smooth_alpha) + last_bbox = bbox_s + + bbox_e = _expand_bbox(bbox_s, margin=margin, W=W, H=H) + x1, y1, x2, y2 = bbox_e + crop = frame[y1:y2, x1:x2] + if crop.size == 0: + frame_id += 1 + continue + + crop = cv2.resize(crop, (crop_size, crop_size), interpolation=cv2.INTER_LINEAR) + writer.write(crop) + + frame_id += 1 + + cap.release() + writer.release() + + logger.info(f"Done. subject_video={out_video}") + + return { + "out_dir": out_dir, + "subject_track_id": subject_id, + "subject_video": out_video, + } \ No newline at end of file diff --git a/runtime/ops/mapper/video_subtitle_ocr/__init__.py b/runtime/ops/mapper/video_subtitle_ocr/__init__.py new file mode 100644 index 000000000..5460c2f83 --- /dev/null +++ b/runtime/ops/mapper/video_subtitle_ocr/__init__.py @@ -0,0 +1,6 @@ +from datamate.core.base_op import OPERATORS + +OPERATORS.register_module( + module_name="VideoSubtitleOCR", + module_path="ops.mapper.video_subtitle_ocr.process", +) diff --git a/runtime/ops/mapper/video_subtitle_ocr/metadata.yml b/runtime/ops/mapper/video_subtitle_ocr/metadata.yml new file mode 100644 index 000000000..98948bdda --- /dev/null +++ b/runtime/ops/mapper/video_subtitle_ocr/metadata.yml @@ -0,0 +1,16 @@ +name: '视频字幕OCR提取' +name_en: 'Video Subtitle OCR' +description: '对视频底部字幕区域进行OCR识别,输出 subtitles.json 与 subtitles.srt;可选自动去黑边、抽帧、跳过相似帧、字幕去重合并、英文空格修复。' +description_en: 'OCR for bottom subtitles, outputs subtitles.json and subtitles.srt; optional deborder, sampling, frame skipping, merge, English spacing fix.' 
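
Two details of the crop logic above are worth spelling out: `_select_top1_track` ranks tracks by `(frame count, mean bbox area)`, so the most persistent and largest subject wins, and `_ema` smooths boxes coordinate-wise as `alpha * prev + (1 - alpha) * current`. A tiny check of the smoothing, with illustrative values:

```python
# One EMA step with the default alpha=0.8: the box moves only 20% of the
# way toward the new detection, which damps jitter in the cropped output.
prev = [100.0, 100.0, 200.0, 200.0]
cur = [110.0, 100.0, 210.0, 200.0]
alpha = 0.8
print([alpha * p + (1 - alpha) * c for p, c in zip(prev, cur)])
# -> [102.0, 100.0, 202.0, 200.0]
```
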
+language: 'python' +vendor: 'huawei' +raw_id: 'VideoSubtitleOCR' +version: '1.0.0' +types: + - 'annotation' +modal: 'video' +effect: + before: '' + after: '' +inputs: 'video' +outputs: 'text' \ No newline at end of file diff --git a/runtime/ops/mapper/video_subtitle_ocr/process.py b/runtime/ops/mapper/video_subtitle_ocr/process.py new file mode 100644 index 000000000..2c58753fb --- /dev/null +++ b/runtime/ops/mapper/video_subtitle_ocr/process.py @@ -0,0 +1,406 @@ +# -*- coding: utf-8 -*- +import os +import json +import re +import shutil +import subprocess +import cv2 +import numpy as np +import inspect + +from .._video_common.paths import make_run_dir, ensure_dir +from .._video_common.log import get_logger +from .._video_common.io_video import get_video_info +from paddleocr import PaddleOCR +from .._video_common.model_paths import resolve_model_path + +def build_paddle_ocr(params, ocr_lang: str, use_angle_cls: bool): + """ + 默认模型目录: + /mnt/models/ocr/det + /mnt/models/ocr/rec + /mnt/models/ocr/cls + 也支持 params['ocr_model_dir'] 指定(相对/绝对)。 + """ + ocr_root = resolve_model_path(params, "ocr_model_dir", "ocr") + det_dir = os.path.join(ocr_root, "det") + rec_dir = os.path.join(ocr_root, "rec") + cls_dir = os.path.join(ocr_root, "cls") + + # 目录不存在就直接报错,让用户去模型仓下载到固定位置 + for p in [det_dir, rec_dir] + ([cls_dir] if use_angle_cls else []): + if not os.path.exists(p): + raise RuntimeError(f"PaddleOCR model dir not found: {p}. Please download OCR models into model repo path.") + + sig = inspect.signature(PaddleOCR.__init__) + kw = {"lang": ocr_lang} + if "use_angle_cls" in sig.parameters: + kw["use_angle_cls"] = use_angle_cls + # PaddleOCR 3.4.0 支持这些 + if "det_model_dir" in sig.parameters: + kw["det_model_dir"] = det_dir + if "rec_model_dir" in sig.parameters: + kw["rec_model_dir"] = rec_dir + if "cls_model_dir" in sig.parameters and use_angle_cls: + kw["cls_model_dir"] = cls_dir + + return PaddleOCR(**kw) +def _write_srt(segments, srt_path): + def _fmt(t): + h = int(t // 3600) + m = int((t % 3600) // 60) + s = int(t % 60) + ms = int(round((t - int(t)) * 1000)) + return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}" + + with open(srt_path, "w", encoding="utf-8") as f: + for i, seg in enumerate(segments, 1): + f.write(str(i) + "\n") + f.write(f"{_fmt(seg['start'])} --> {_fmt(seg['end'])}\n") + f.write((seg.get("text") or "").strip() + "\n\n") + + +def _clean_text(t: str) -> str: + if not t: + return "" + t = t.strip() + t = re.sub(r"\s+", " ", t) + return t + + +def _english_ratio(text: str) -> float: + if not text: + return 0.0 + letters = sum(c.isalpha() for c in text) + return letters / max(1, len(text)) + + +def _fix_english_spacing(text: str) -> str: + """英文字幕空格修复(轻量规则,避免影响中文)""" + if not text: + return text + if _english_ratio(text) < 0.40: + return text + + t = text + + # 小写后接大写:ThisIs -> This Is + t = re.sub(r"([a-z])([A-Z])", r"\1 \2", t) + + # 字母数字边界:A1 / 1A + t = re.sub(r"([A-Za-z])(\d)", r"\1 \2", t) + t = re.sub(r"(\d)([A-Za-z])", r"\1 \2", t) + + # 标点前去空格,标点后若紧跟字母则补空格(保守) + t = re.sub(r"\s+([,.;:?!])", r"\1", t) + t = re.sub(r"([,.;:?!])([A-Za-z])", r"\1 \2", t) + + # 多空格压缩 + t = re.sub(r"\s+", " ", t).strip() + return t + + +def _norm_sub_key(text: str) -> str: + """用于合并的规范化 key:空格归一、末尾标点归一、英文小写化""" + if not text: + return "" + t = text.strip() + t = re.sub(r"\s+", " ", t) + # 去掉末尾重复标点(中英文都考虑) + t = re.sub(r"[.。!?!?]+$", "", t).strip() + + # 英文占比高则统一小写,便于合并 + if _english_ratio(t) > 0.40: + t = t.lower() + + return t + + +def _roi_changed(cur_roi, last_roi, diff_thr=4.0): + """diff_thr 
调低一点更敏感,避免跳过字幕变化""" + if last_roi is None: + return True + a = cv2.cvtColor(cur_roi, cv2.COLOR_BGR2GRAY) + b = cv2.cvtColor(last_roi, cv2.COLOR_BGR2GRAY) + if a.shape != b.shape: + b = cv2.resize(b, (a.shape[1], a.shape[0]), interpolation=cv2.INTER_AREA) + diff = np.mean(np.abs(a.astype(np.float32) - b.astype(np.float32))) + return diff >= diff_thr + + +def _even(x: int) -> int: + return x - (x % 2) + + +def _parse_cropdetect(stderr: str): + m_last = None + for line in stderr.splitlines(): + m = re.search(r"crop=(\d+):(\d+):(\d+):(\d+)", line) + if m: + m_last = m + if not m_last: + return None + w, h, x, y = map(int, m_last.groups()) + return (_even(w), _even(h), _even(x), _even(y)) + + +def _deborder_ffmpeg(ffmpeg_path: str, in_video: str, out_video: str, logger): + cmd1 = [ + ffmpeg_path, "-hide_banner", "-y", + "-ss", "0", "-i", in_video, "-t", "2", + "-vf", "cropdetect=24:16:0", + "-f", "null", "-" + ] + logger.info("cropdetect cmd: " + " ".join(cmd1)) + p1 = subprocess.run(cmd1, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + crop = _parse_cropdetect(p1.stderr) + if not crop: + logger.warning("cropdetect found nothing, keep original (copy).") + cmdc = [ffmpeg_path, "-hide_banner", "-y", "-i", in_video, "-c", "copy", out_video] + p = subprocess.run(cmdc, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if p.returncode != 0: + raise RuntimeError(f"ffmpeg copy failed.\n{p.stderr}") + return None + + w, h, x, y = crop + cmd2 = [ + ffmpeg_path, "-hide_banner", "-y", + "-i", in_video, + "-vf", f"crop={w}:{h}:{x}:{y}", + "-c:v", "libx264", "-preset", "veryfast", "-crf", "23", "-pix_fmt", "yuv420p", + "-c:a", "copy", + out_video + ] + logger.info("crop cmd: " + " ".join(cmd2)) + p2 = subprocess.run(cmd2, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if p2.returncode != 0: + raise RuntimeError(f"ffmpeg crop failed.\n{p2.stderr}") + return {"w": w, "h": h, "x": x, "y": y} + + +def _extract_texts_from_any(res): + """ + 兼容 PaddleOCR 多种返回: + - 传统:res = [ [ [box,(text,score)], ... 
] ] + - 新 pipeline/dict:res 可能是 dict/对象,里头有 'rec_texts'/'rec_scores' 或 'texts'/'scores' + 返回: list[(text,score)] + """ + out = [] + + # dict 风格 + if isinstance(res, dict): + keys_text = ["rec_texts", "texts", "text"] + keys_score = ["rec_scores", "scores", "score"] + texts = None + scores = None + for kt in keys_text: + if kt in res: + texts = res[kt] + break + for ks in keys_score: + if ks in res: + scores = res[ks] + break + + if texts is not None: + if isinstance(texts, str): + out.append((texts, float(scores) if scores is not None else 0.0)) + return out + if isinstance(texts, (list, tuple)): + if scores is None: + for t in texts: + out.append((str(t), 0.0)) + else: + if isinstance(scores, (list, tuple)) and len(scores) == len(texts): + for t, s in zip(texts, scores): + out.append((str(t), float(s))) + else: + for t in texts: + out.append((str(t), float(scores) if scores is not None else 0.0)) + return out + + if "result" in res: + return _extract_texts_from_any(res["result"]) + + # list 风格(传统) + if isinstance(res, list): + if len(res) == 0: + return out + + if isinstance(res[0], dict): + for item in res: + out.extend(_extract_texts_from_any(item)) + return out + + lines = res[0] if isinstance(res[0], list) else res + for line in lines: + try: + if isinstance(line, (list, tuple)) and len(line) >= 2: + info = line[1] + if isinstance(info, (list, tuple)) and len(info) >= 2: + out.append((str(info[0]), float(info[1]))) + elif isinstance(info, str): + out.append((info, 0.0)) + except Exception: + continue + return out + + # 兜底 + try: + s = str(res) + if s: + out.append((s, 0.0)) + except Exception: + pass + return out + + +class VideoSubtitleOCR: + """字幕 OCR(自动去黑边 + 固定下30% + 英文空格修复 + 去重合并) + + params: + - preprocess_deborder: bool, default True + - sample_fps: float, default 1.0 + - max_frames: int, default 240 + - subtitle_ratio: float, default 0.30 + - ocr_lang: ch|en, default ch + - min_score: float, default 0.0 + - roi_diff_thr: float, default 4.0 + - gap_merge_sec: float, default 1.2 # ✅ 更容易合并跨帧字幕 + - fix_english_space: bool, default True # ✅ 英文空格修复开关 + + outputs: + - artifacts/subtitles.json + - artifacts/subtitles.srt + - artifacts/frames/subtitle_*.jpg + - artifacts/deborder.mp4 (if preprocess_deborder=True) + """ + + @staticmethod + def execute(sample, params): + os.environ.setdefault("PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK", "True") + + in_video = sample["filePath"] + export_path = sample.get("export_path", "./outputs") + op_name = "video_subtitle_ocr" + out_dir = make_run_dir(export_path, op_name) + log_dir = ensure_dir(os.path.join(out_dir, "logs")) + art_dir = ensure_dir(os.path.join(out_dir, "artifacts")) + frames_dir = ensure_dir(os.path.join(art_dir, "frames")) + logger = get_logger(op_name, log_dir) + + ffmpeg_path = params.get("ffmpeg_path") or shutil.which("ffmpeg") + if not ffmpeg_path: + raise RuntimeError("ffmpeg not found") + + # ✅ 默认自动去黑边 + if params.get("preprocess_deborder", True): + deborder_mp4 = os.path.join(art_dir, "deborder.mp4") + crop = _deborder_ffmpeg(ffmpeg_path, in_video, deborder_mp4, logger) + with open(os.path.join(art_dir, "deborder_crop.json"), "w", encoding="utf-8") as f: + json.dump({"crop": crop, "deborder_mp4": deborder_mp4}, f, ensure_ascii=False, indent=2) + src_video = deborder_mp4 + else: + src_video = in_video + + logger.info(f"video={src_video}") + logger.info(f"out_dir={out_dir}") + + from paddleocr import PaddleOCR + ocr_lang = params.get("ocr_lang", "ch") + ocr = build_paddle_ocr(params, ocr_lang=ocr_lang, use_angle_cls=False) + + fps, 
w, h, total = get_video_info(src_video) + sample_fps = float(params.get("sample_fps", 1.0)) + max_frames = int(params.get("max_frames", 240)) + subtitle_ratio = float(params.get("subtitle_ratio", 0.30)) + min_score = float(params.get("min_score", 0.0)) + roi_diff_thr = float(params.get("roi_diff_thr", 4.0)) + gap_merge = float(params.get("gap_merge_sec", 1.2)) + fix_en_space = bool(params.get("fix_english_space", True)) + + step = max(1, int(round(fps / max(sample_fps, 0.0001)))) + idxs = list(range(0, total, step)) + if max_frames and len(idxs) > max_frames: + n = max_frames + idxs = [idxs[int(i * (len(idxs) - 1) / max(1, n - 1))] for i in range(n)] + + cap = cv2.VideoCapture(src_video) + if not cap.isOpened(): + raise RuntimeError(f"Cannot open video: {src_video}") + + raw_hits = [] + last_roi = None + + for k, fi in enumerate(idxs): + cap.set(cv2.CAP_PROP_POS_FRAMES, int(fi)) + ok, frame = cap.read() + if not ok or frame is None: + continue + + t = float(fi / fps) if fps else 0.0 + y0 = int(h * (1.0 - subtitle_ratio)) + roi = frame[y0:h, 0:w] + + if not _roi_changed(roi, last_roi, diff_thr=roi_diff_thr): + continue + last_roi = roi + + jpg_path = os.path.join(frames_dir, f"subtitle_{int(fi):06d}.jpg") + cv2.imwrite(jpg_path, roi) + + res = ocr.ocr(roi) + pairs = _extract_texts_from_any(res) + texts = [txt for (txt, sc) in pairs if txt and float(sc) >= min_score] + + text = _clean_text(" ".join(texts)) + if fix_en_space: + text = _fix_english_spacing(text) + + if text: + raw_hits.append({"t": t, "text": text, "key": _norm_sub_key(text), "frame_id": int(fi), "jpg": jpg_path}) + + if (k + 1) % 20 == 0 or k == len(idxs) - 1: + logger.info(f"[{k+1}/{len(idxs)}] frame={fi} hit={1 if text else 0} len={len(text)}") + + cap.release() + + # ✅ 合并相邻相同字幕(按规范化 key 合并) + segments = [] + for hit in raw_hits: + if not segments: + segments.append({ + "start": hit["t"], + "end": hit["t"], + "text": hit["text"], + "key": hit["key"], + "evidence": [{"t": hit["t"], "frame_id": hit["frame_id"], "jpg": hit["jpg"]}], + }) + continue + + last = segments[-1] + if hit["key"] == last["key"] and (hit["t"] - last["end"] <= gap_merge): + last["end"] = hit["t"] + last["evidence"].append({"t": hit["t"], "frame_id": hit["frame_id"], "jpg": hit["jpg"]}) + else: + segments.append({ + "start": hit["t"], + "end": hit["t"], + "text": hit["text"], + "key": hit["key"], + "evidence": [{"t": hit["t"], "frame_id": hit["frame_id"], "jpg": hit["jpg"]}], + }) + + # end 往后延一点,srt 更自然 + for seg in segments: + seg["end"] = float(seg["end"] + max(0.4, 1.0 / max(sample_fps, 0.1))) + + # 输出时不需要 key(但保留也无所谓;你想更干净就删掉) + json_path = os.path.join(art_dir, "subtitles.json") + srt_path = os.path.join(art_dir, "subtitles.srt") + with open(json_path, "w", encoding="utf-8") as f: + json.dump({"segments": segments}, f, ensure_ascii=False, indent=2) + _write_srt(segments, srt_path) + + logger.info(f"Done. 
subtitles={len(segments)} srt={srt_path}") + return {"out_dir": out_dir, "subtitles_json": json_path, "subtitles_srt": srt_path, "count": len(segments)} \ No newline at end of file diff --git a/runtime/ops/mapper/video_summary_qwenvl/__init__.py b/runtime/ops/mapper/video_summary_qwenvl/__init__.py new file mode 100644 index 000000000..6e9386f9a --- /dev/null +++ b/runtime/ops/mapper/video_summary_qwenvl/__init__.py @@ -0,0 +1,6 @@ +from datamate.core.base_op import OPERATORS + +OPERATORS.register_module( + module_name="VideoSummaryQwenVL", + module_path="ops.mapper.video_summary_qwenvl.process", +) diff --git a/runtime/ops/mapper/video_summary_qwenvl/metadata.yml b/runtime/ops/mapper/video_summary_qwenvl/metadata.yml new file mode 100644 index 000000000..5f34dd412 --- /dev/null +++ b/runtime/ops/mapper/video_summary_qwenvl/metadata.yml @@ -0,0 +1,16 @@ +name: '视频摘要(QwenVL)' +name_en: 'Video Summary (QwenVL)' +description: '抽多帧拼 montage,只调用一次 QwenVL summary,输出 summary.json(含 montage.jpg 与证据帧)。' +description_en: 'Build montage from sampled frames, call QwenVL summary once; outputs summary.json with montage and evidence.' +language: 'python' +vendor: 'huawei' +raw_id: 'VideoSummaryQwenVL' +version: '1.0.0' +types: + - 'annotation' +modal: 'video' +effect: + before: '' + after: '' +inputs: 'video' +outputs: 'text' \ No newline at end of file diff --git a/runtime/ops/mapper/video_summary_qwenvl/process.py b/runtime/ops/mapper/video_summary_qwenvl/process.py new file mode 100644 index 000000000..19efc107d --- /dev/null +++ b/runtime/ops/mapper/video_summary_qwenvl/process.py @@ -0,0 +1,145 @@ +# -*- coding: utf-8 -*- +import os +import json +import math +import cv2 + +from .._video_common.paths import make_run_dir, ensure_dir +from .._video_common.log import get_logger +from .._video_common.io_video import get_video_info +from .._video_common.qwen_http_client import qwenvl_infer_by_image_path, save_frame_to_jpg + + +def _sample_frame_indices(total_frames: int, fps: float, sample_fps: float, max_frames: int): + if total_frames <= 0: + return [] + fps = float(fps) if fps else 25.0 + step = max(1, int(round(fps / max(float(sample_fps), 1e-6)))) + idxs = list(range(0, total_frames, step)) + if max_frames and len(idxs) > int(max_frames): + n = int(max_frames) + idxs = [idxs[int(i * (len(idxs) - 1) / max(1, n - 1))] for i in range(n)] + return idxs + + +def _make_montage(frames, cell_w=384, cell_h=216, max_cols=4): + n = len(frames) + cols = min(max_cols, n) + rows = int(math.ceil(n / cols)) + canvas = 255 * (cv2.cvtColor(cv2.UMat(cell_h * rows, cell_w * cols, cv2.CV_8UC3), cv2.COLOR_BGR2RGB).get()) + canvas[:] = 255 + for i, img in enumerate(frames): + r = i // cols + c = i % cols + x0, y0 = c * cell_w, r * cell_h + resized = cv2.resize(img, (cell_w, cell_h)) + canvas[y0 : y0 + cell_h, x0 : x0 + cell_w] = resized + return canvas + + +class VideoSummaryQwenVL: + """ + 抽多帧拼 montage → QwenVL HTTP 生成摘要(对齐服务端 task=summary): + 返回: {summary} + + params: + - service_url: 默认 http://127.0.0.1:18080 + - timeout_sec: 默认 180 + - sample_fps: 默认 1.0 + - max_frames: 默认 12 + - language: 默认 zh + - style: 默认 normal + - max_new_tokens: 默认 160 + - montage_cell_w: 默认 384 + - montage_cell_h: 默认 216 + - montage_max_cols: 默认 4 + outputs: + - artifacts/montage.jpg + - artifacts/summary.json + - artifacts/frames/*.jpg + """ + + def execute(self, sample, params=None): + params = params or {} + video_path = sample["filePath"] + export_path = sample.get("export_path", "./outputs") + + out_dir = make_run_dir(export_path, 
"video_summary_qwenvl") + log_dir = ensure_dir(os.path.join(out_dir, "logs")) + art_dir = ensure_dir(os.path.join(out_dir, "artifacts")) + frames_dir = ensure_dir(os.path.join(art_dir, "frames")) + logger = get_logger("VideoSummaryQwenVL", log_dir) + + service_url = params.get("service_url", "http://127.0.0.1:18080") + timeout_sec = int(params.get("timeout_sec", 180)) + + sample_fps = float(params.get("sample_fps", 1.0)) + max_frames = int(params.get("max_frames", 12)) + language = params.get("language", "zh") + style = params.get("style", "normal") + max_new_tokens = int(params.get("max_new_tokens", 160)) + + cell_w = int(params.get("montage_cell_w", 384)) + cell_h = int(params.get("montage_cell_h", 216)) + max_cols = int(params.get("montage_max_cols", 4)) + + fps, W, H, total_frames = get_video_info(video_path) + idxs = _sample_frame_indices(total_frames, fps, sample_fps, max_frames) + + cap = cv2.VideoCapture(video_path) + if not cap.isOpened(): + raise RuntimeError(f"Cannot open video: {video_path}") + + frames = [] + evidence = [] + + for idx in idxs: + cap.set(cv2.CAP_PROP_POS_FRAMES, idx) + ok, frame = cap.read() + if not ok: + continue + frame_jpg = os.path.join(frames_dir, f"{idx:06d}.jpg") + save_frame_to_jpg(frame, frame_jpg) + frames.append(frame) + evidence.append({"frame_idx": idx, "image_path": frame_jpg}) + + cap.release() + + montage_path = os.path.join(art_dir, "montage.jpg") + summary = "" + + if frames: + montage = _make_montage(frames, cell_w=cell_w, cell_h=cell_h, max_cols=max_cols) + cv2.imwrite(montage_path, montage) + + res = qwenvl_infer_by_image_path( + image_path=montage_path, + task="summary", + service_url=service_url, + max_new_tokens=max_new_tokens, + language=language, + style=style, + timeout=timeout_sec, + ) + summary = (res.get("summary") or "").strip() + + out_json = os.path.join(art_dir, "summary.json") + with open(out_json, "w", encoding="utf-8") as f: + json.dump( + { + "summary": summary, + "service_url": service_url, + "sample_fps": sample_fps, + "max_frames": max_frames, + "language": language, + "style": style, + "evidence": evidence, + "montage": montage_path, + }, + f, + ensure_ascii=False, + indent=2, + ) + + logger.info(f"Done. summary_json={out_json}") + return {"out_dir": out_dir, "summary_json": out_json, "montage_jpg": montage_path} \ No newline at end of file diff --git a/runtime/ops/mapper/video_text_ocr/__init__.py b/runtime/ops/mapper/video_text_ocr/__init__.py new file mode 100644 index 000000000..74283ffb3 --- /dev/null +++ b/runtime/ops/mapper/video_text_ocr/__init__.py @@ -0,0 +1,6 @@ +from datamate.core.base_op import OPERATORS + +OPERATORS.register_module( + module_name="VideoTextOCR", + module_path="ops.mapper.video_text_ocr.process", +) diff --git a/runtime/ops/mapper/video_text_ocr/metadata.yml b/runtime/ops/mapper/video_text_ocr/metadata.yml new file mode 100644 index 000000000..5f911d601 --- /dev/null +++ b/runtime/ops/mapper/video_text_ocr/metadata.yml @@ -0,0 +1,16 @@ +name: '视频显著文字OCR提取' +name_en: 'Video Text OCR' +description: '对视频上方/主要区域显著文字进行OCR识别,输出 text_ocr.json;可选自动去黑边、抽帧、跳过相似帧。' +description_en: 'OCR for salient texts on main/top region, outputs text_ocr.json; optional deborder, sampling, frame skipping.' 
+language: 'python' +vendor: 'huawei' +raw_id: 'VideoTextOCR' +version: '1.0.0' +types: + - 'annotation' +modal: 'video' +effect: + before: '' + after: '' +inputs: 'video' +outputs: 'text' \ No newline at end of file diff --git a/runtime/ops/mapper/video_text_ocr/process.py b/runtime/ops/mapper/video_text_ocr/process.py new file mode 100644 index 000000000..082b21436 --- /dev/null +++ b/runtime/ops/mapper/video_text_ocr/process.py @@ -0,0 +1,288 @@ +# -*- coding: utf-8 -*- +import os +import json +import re +import shutil +import subprocess +import cv2 +import numpy as np +import inspect +from collections import Counter + +from .._video_common.paths import make_run_dir, ensure_dir +from .._video_common.log import get_logger +from .._video_common.io_video import get_video_info +from paddleocr import PaddleOCR +from .._video_common.model_paths import resolve_model_path + +def build_paddle_ocr(params, ocr_lang: str, use_angle_cls: bool): + """ + 默认模型目录: + /mnt/models/ocr/det + /mnt/models/ocr/rec + /mnt/models/ocr/cls + 也支持 params['ocr_model_dir'] 指定(相对/绝对)。 + """ + ocr_root = resolve_model_path(params, "ocr_model_dir", "ocr") + det_dir = os.path.join(ocr_root, "det") + rec_dir = os.path.join(ocr_root, "rec") + cls_dir = os.path.join(ocr_root, "cls") + + # 目录不存在就直接报错,让用户去模型仓下载到固定位置 + for p in [det_dir, rec_dir] + ([cls_dir] if use_angle_cls else []): + if not os.path.exists(p): + raise RuntimeError(f"PaddleOCR model dir not found: {p}. Please download OCR models into model repo path.") + + sig = inspect.signature(PaddleOCR.__init__) + kw = {"lang": ocr_lang} + if "use_angle_cls" in sig.parameters: + kw["use_angle_cls"] = use_angle_cls + # PaddleOCR 3.4.0 支持这些 + if "det_model_dir" in sig.parameters: + kw["det_model_dir"] = det_dir + if "rec_model_dir" in sig.parameters: + kw["rec_model_dir"] = rec_dir + if "cls_model_dir" in sig.parameters and use_angle_cls: + kw["cls_model_dir"] = cls_dir + + return PaddleOCR(**kw) + +def _clean_text(t: str) -> str: + if not t: + return "" + t = t.strip() + t = re.sub(r"\s+", " ", t) + return t + + +def _roi_changed(cur_roi, last_roi, diff_thr=6.0): + if last_roi is None: + return True + a = cv2.cvtColor(cur_roi, cv2.COLOR_BGR2GRAY) + b = cv2.cvtColor(last_roi, cv2.COLOR_BGR2GRAY) + if a.shape != b.shape: + b = cv2.resize(b, (a.shape[1], a.shape[0]), interpolation=cv2.INTER_AREA) + diff = np.mean(np.abs(a.astype(np.float32) - b.astype(np.float32))) + return diff >= diff_thr + + +def _even(x: int) -> int: + return x - (x % 2) + + +def _parse_cropdetect(stderr: str): + m_last = None + for line in stderr.splitlines(): + m = re.search(r"crop=(\d+):(\d+):(\d+):(\d+)", line) + if m: + m_last = m + if not m_last: + return None + w, h, x, y = map(int, m_last.groups()) + return (_even(w), _even(h), _even(x), _even(y)) + + +def _deborder_ffmpeg(ffmpeg_path: str, in_video: str, out_video: str, logger): + cmd1 = [ + ffmpeg_path, "-hide_banner", "-y", + "-ss", "0", "-i", in_video, "-t", "2", + "-vf", "cropdetect=24:16:0", + "-f", "null", "-" + ] + logger.info("cropdetect cmd: " + " ".join(cmd1)) + p1 = subprocess.run(cmd1, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + crop = _parse_cropdetect(p1.stderr) + if not crop: + logger.warning("cropdetect found nothing, keep original (copy).") + cmdc = [ffmpeg_path, "-hide_banner", "-y", "-i", in_video, "-c", "copy", out_video] + p = subprocess.run(cmdc, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if p.returncode != 0: + raise RuntimeError(f"ffmpeg copy failed.\n{p.stderr}") + return None + + w, h, 
x, y = crop + cmd2 = [ + ffmpeg_path, "-hide_banner", "-y", + "-i", in_video, + "-vf", f"crop={w}:{h}:{x}:{y}", + "-c:v", "libx264", "-preset", "veryfast", "-crf", "23", "-pix_fmt", "yuv420p", + "-c:a", "copy", + out_video + ] + logger.info("crop cmd: " + " ".join(cmd2)) + p2 = subprocess.run(cmd2, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if p2.returncode != 0: + raise RuntimeError(f"ffmpeg crop failed.\n{p2.stderr}") + return {"w": w, "h": h, "x": x, "y": y} + + +def _extract_texts_from_any(res): + out = [] + if isinstance(res, dict): + for kt in ["rec_texts", "texts", "text"]: + if kt in res: + texts = res[kt] + scores = res.get("rec_scores", res.get("scores", res.get("score", None))) + if isinstance(texts, str): + out.append((texts, float(scores) if scores is not None else 0.0)) + return out + if isinstance(texts, (list, tuple)): + if isinstance(scores, (list, tuple)) and len(scores) == len(texts): + for t, s in zip(texts, scores): + out.append((str(t), float(s))) + else: + for t in texts: + out.append((str(t), float(scores) if scores is not None else 0.0)) + return out + if "result" in res: + return _extract_texts_from_any(res["result"]) + + if isinstance(res, list): + if len(res) == 0: + return out + if isinstance(res[0], dict): + for item in res: + out.extend(_extract_texts_from_any(item)) + return out + lines = res[0] if isinstance(res[0], list) else res + for line in lines: + try: + if isinstance(line, (list, tuple)) and len(line) >= 2: + info = line[1] + if isinstance(info, (list, tuple)) and len(info) >= 2: + out.append((str(info[0]), float(info[1]))) + elif isinstance(info, str): + out.append((info, 0.0)) + except Exception: + continue + return out + + try: + s = str(res) + if s: + out.append((s, 0.0)) + except Exception: + pass + return out + + +def _is_garbage_text(t: str) -> bool: + if not t: + return True + s = t.replace(" ", "") + if len(s) < 2: + return True + letters = sum(c.isalpha() for c in s) + if letters / len(s) > 0.9: + uniq = len(set(s.lower())) + if uniq <= 5: + return True + cnt = Counter(s.lower()) + most = cnt.most_common(1)[0][1] + if most / len(s) > 0.65: + return True + return False + + +class VideoTextOCR: + """显著文字 OCR(自动去黑边 + 上70%) + + params: + - preprocess_deborder: bool, default True + - sample_fps: float, default 0.5 + - max_frames: int, default 120 + - top_ratio: float, default 0.70 + - ocr_lang: ch|en, default ch + - min_score: float, default 0.0 + - roi_diff_thr: float, default 6.0 + """ + + @staticmethod + def execute(sample, params): + os.environ.setdefault("PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK", "True") + + in_video = sample["filePath"] + export_path = sample.get("export_path", "./outputs") + op_name = "video_text_ocr" + out_dir = make_run_dir(export_path, op_name) + log_dir = ensure_dir(os.path.join(out_dir, "logs")) + art_dir = ensure_dir(os.path.join(out_dir, "artifacts")) + frames_dir = ensure_dir(os.path.join(art_dir, "frames")) + logger = get_logger(op_name, log_dir) + + ffmpeg_path = params.get("ffmpeg_path") or shutil.which("ffmpeg") + if not ffmpeg_path: + raise RuntimeError("ffmpeg not found") + + if params.get("preprocess_deborder", True): + deborder_mp4 = os.path.join(art_dir, "deborder.mp4") + crop = _deborder_ffmpeg(ffmpeg_path, in_video, deborder_mp4, logger) + with open(os.path.join(art_dir, "deborder_crop.json"), "w", encoding="utf-8") as f: + json.dump({"crop": crop, "deborder_mp4": deborder_mp4}, f, ensure_ascii=False, indent=2) + src_video = deborder_mp4 + else: + src_video = in_video + + 
logger.info(f"video={src_video}") + logger.info(f"out_dir={out_dir}") + + from paddleocr import PaddleOCR + ocr_lang = params.get("ocr_lang", "ch") + ocr = build_paddle_ocr(params, ocr_lang=ocr_lang, use_angle_cls=False) + + fps, w, h, total = get_video_info(src_video) + sample_fps = float(params.get("sample_fps", 0.5)) + max_frames = int(params.get("max_frames", 120)) + top_ratio = float(params.get("top_ratio", 0.70)) + min_score = float(params.get("min_score", 0.0)) + roi_diff_thr = float(params.get("roi_diff_thr", 6.0)) + + step = max(1, int(round(fps / max(sample_fps, 0.0001)))) + idxs = list(range(0, total, step)) + if max_frames and len(idxs) > max_frames: + n = max_frames + idxs = [idxs[int(i * (len(idxs) - 1) / max(1, n - 1))] for i in range(n)] + + cap = cv2.VideoCapture(src_video) + if not cap.isOpened(): + raise RuntimeError(f"Cannot open video: {src_video}") + + hits = [] + last_roi = None + + for k, fi in enumerate(idxs): + cap.set(cv2.CAP_PROP_POS_FRAMES, int(fi)) + ok, frame = cap.read() + if not ok or frame is None: + continue + + t = float(fi / fps) if fps else 0.0 + y1 = int(h * top_ratio) + roi = frame[0:y1, 0:w] + + if not _roi_changed(roi, last_roi, diff_thr=roi_diff_thr): + continue + last_roi = roi + + jpg_path = os.path.join(frames_dir, f"text_{int(fi):06d}.jpg") + cv2.imwrite(jpg_path, roi) + + res = ocr.ocr(roi) + pairs = _extract_texts_from_any(res) + texts = [txt for (txt, sc) in pairs if txt and float(sc) >= min_score] + text = _clean_text(" ".join(texts)) + + if text and (not _is_garbage_text(text)): + hits.append({"t": t, "frame_id": int(fi), "text": text, "jpg": jpg_path}) + + if (k + 1) % 20 == 0 or k == len(idxs) - 1: + logger.info(f"[{k+1}/{len(idxs)}] frame={fi} hit={1 if text else 0} len={len(text)}") + + cap.release() + + json_path = os.path.join(art_dir, "text_ocr.json") + with open(json_path, "w", encoding="utf-8") as f: + json.dump({"hits": hits}, f, ensure_ascii=False, indent=2) + + logger.info(f"Done. hits={len(hits)}") + return {"out_dir": out_dir, "text_ocr_json": json_path, "count": len(hits)} \ No newline at end of file From 7a8956e123ef57ebafb801b06a244cbfd50965a5 Mon Sep 17 00:00:00 2001 From: guotingxuan5599 <1321352073@qq.com> Date: Tue, 24 Mar 2026 15:31:56 +0800 Subject: [PATCH 2/6] chore: add qwen vl service image files and remove extra mapper docs --- runtime/ops/mapper/README_video_ops.md | 168 ------------- .../ops/mapper/_video_common/README_models.md | 83 ------- .../_video_common/README_qwen_service.md | 92 ------- scripts/images/qwen-vl-service/Dockerfile | 57 +++++ .../images/qwen-vl-service/qwen_vl_server.py | 231 ++++++++++++++++++ scripts/images/qwen-vl-service/start.sh | 8 + 6 files changed, 296 insertions(+), 343 deletions(-) delete mode 100644 runtime/ops/mapper/README_video_ops.md delete mode 100644 runtime/ops/mapper/_video_common/README_models.md delete mode 100644 runtime/ops/mapper/_video_common/README_qwen_service.md create mode 100644 scripts/images/qwen-vl-service/Dockerfile create mode 100644 scripts/images/qwen-vl-service/qwen_vl_server.py create mode 100644 scripts/images/qwen-vl-service/start.sh diff --git a/runtime/ops/mapper/README_video_ops.md b/runtime/ops/mapper/README_video_ops.md deleted file mode 100644 index e6eaa6ac9..000000000 --- a/runtime/ops/mapper/README_video_ops.md +++ /dev/null @@ -1,168 +0,0 @@ -# DataMate 视频算子说明 - -## 1. 
模块概述 - -本模块为 DataMate 提供视频数据清洗与视频 AI 辅助标注相关算子,覆盖视频预处理、敏感内容检测与裁剪、多目标跟踪、主体跟踪裁剪、关键帧提取、OCR、ASR、视频分类、视频摘要、事件标注等能力。 - -所有视频算子均按照 DataMate 算子规范组织在 `runtime/ops/mapper/` 目录下,每个算子目录包含以下标准文件: - -- `__init__.py` -- `metadata.yml` -- `process.py` - -视频算子共用的基础能力统一放置于: - -- `runtime/ops/mapper/_video_common/` - ---- - -## 2. 已实现算子 - -### 2.1 视频清洗与预处理 -- `video_format_convert`:视频格式转换 -- `video_deborder_crop`:黑边去除与有效区域裁剪 -- `video_sensitive_detect`:敏感内容检测 -- `video_sensitive_crop`:敏感片段裁剪 - -### 2.2 跟踪与结构化提取 -- `video_mot_track`:多目标跟踪 -- `video_subject_crop`:主体跟踪裁剪 -- `video_keyframe_extract`:关键帧提取 -- `video_audio_extract`:音频提取 - -### 2.3 OCR / ASR -- `video_subtitle_ocr`:字幕提取 -- `video_text_ocr`:显著文字 OCR 提取 -- `video_speech_asr`:语音提取 / 语音识别 - -### 2.4 基于 QwenVL 的视频语义理解 -- `video_classify_qwenvl`:视频分类 -- `video_summary_qwenvl`:视频摘要提取 -- `video_event_tag_qwenvl`:事件标注 - ---- - -## 3. 目录结构 - -```text -runtime/ops/mapper/ -├── __init__.py -├── _video_common/ -│ ├── __init__.py -│ ├── ffmpeg.py -│ ├── io_video.py -│ ├── log.py -│ ├── model_paths.py -│ ├── paths.py -│ ├── qwen_http_client.py -│ └── schema.py -├── video_audio_extract/ -├── video_classify_qwenvl/ -├── video_deborder_crop/ -├── video_event_tag_qwenvl/ -├── video_format_convert/ -├── video_keyframe_extract/ -├── video_mot_track/ -├── video_sensitive_crop/ -├── video_sensitive_detect/ -├── video_speech_asr/ -├── video_subject_crop/ -├── video_subtitle_ocr/ -├── video_summary_qwenvl/ -└── video_text_ocr/ -``` - ---- - -## 4. 模型管理方式 - -代码与模型权重分离管理: - -- GitHub 仓库中仅保存算子代码、配置与文档; -- 模型权重统一存放于模型库,不直接提交到代码仓库。 - -对于本地模型类算子,运行时按以下优先级解析模型根目录: - -1. `params["model_root"]` -2. 环境变量 `DATAMATE_MODEL_ROOT` -3. 默认兜底目录(如 `/mnt/models`) - -对于 QwenVL 相关算子,不在每个算子进程中重复加载模型,而是通过独立 HTTP 服务调用模型能力,以减少重复初始化开销、提升整体执行效率。 - ---- - -## 5. 推理方式划分 - -### 5.1 本地模型类算子 -以下算子直接从统一模型根目录读取模型: - -- `video_mot_track` -- `video_subject_crop` -- `video_subtitle_ocr` -- `video_text_ocr` -- `video_speech_asr` - -### 5.2 QwenVL 服务类算子 -以下算子通过独立 HTTP 服务进行推理: - -- `video_sensitive_detect` -- `video_classify_qwenvl` -- `video_summary_qwenvl` -- `video_event_tag_qwenvl` - ---- - -## 6. 运行环境说明 - -当前视频模块涉及两类运行环境: - -### 6.1 DataMate 视频算子运行环境 -主要用于视频算子本体执行,涉及能力包括: - -- OpenCV -- FFmpeg 相关依赖 -- YOLO / Ultralytics -- PaddleOCR -- ONNX Runtime -- Faster-Whisper / ASR 相关依赖 - -### 6.2 QwenVL 服务运行环境 -主要用于独立的 QwenVL HTTP 服务,包括: - -- Flask -- Transformers -- Qwen-VL 相关依赖 -- Torch 及设备运行时相关依赖 - ---- - -## 7. 典型输出结果 - -不同算子的输出可能包括: - -- 裁剪后视频 -- 转码后视频 -- `tracks.json` -- `summary.json` -- `events.json` -- `subtitles.srt` -- OCR 结果文件 -- 提取音频文件 -- 调试视频 / 可视化结果 - ---- - -## 8. QwenVL 服务依赖说明 - -以下算子依赖独立的 QwenVL HTTP 服务: - -- `video_sensitive_detect` -- `video_classify_qwenvl` -- `video_summary_qwenvl` -- `video_event_tag_qwenvl` - -该服务会在启动时预加载模型,并在内存中常驻,从而避免多个算子重复加载同一大模型。 - ---- - - diff --git a/runtime/ops/mapper/_video_common/README_models.md b/runtime/ops/mapper/_video_common/README_models.md deleted file mode 100644 index b5517fd91..000000000 --- a/runtime/ops/mapper/_video_common/README_models.md +++ /dev/null @@ -1,83 +0,0 @@ -# 视频算子模型说明 - -## 1. 基本原则 - -视频算子采用“代码与模型分离”的管理方式: - -- 代码、元数据与说明文档保存在 GitHub 仓库中; -- 模型权重统一存放于模型库或模型存储目录; -- 模型文件不直接提交到代码仓库。 - ---- - -## 2. 本地模型的解析方式 - -对于本地模型类算子,模型根目录按以下优先级解析: - -1. `params["model_root"]` -2. 环境变量 `DATAMATE_MODEL_ROOT` -3. 默认兜底目录(如 `/mnt/models`) - -算子在确定模型根目录后,再基于相对路径查找具体模型文件或模型目录。 - ---- - -## 3. 
QwenVL 模型管理方式 - -QwenVL 相关算子不在每个算子进程中直接加载模型,而是通过独立的 HTTP 服务调用模型能力。 - -其设计目的包括: - -- 避免多个算子重复加载同一大模型; -- 降低重复初始化的时间开销; -- 减少整体内存占用; -- 提高分类、摘要、事件标注、敏感检测等算子的复用效率。 - ---- - -## 4. 建议的模型组织方式 - -建议在模型库中按统一相对路径组织模型,例如: - -- `yolo/yolov8n.pt` -- `ocr/det` -- `ocr/rec` -- `ocr/cls` -- `asr/...` -- `qwen/Qwen2.5-VL-7B-Instruct` - -具体目录名称可根据模型库中的实际组织方式进行调整,但建议在代码与文档中保持一致。 - ---- - -## 5. 算子与模型对应关系 - -### 5.1 本地模型类算子 -- `video_mot_track`:依赖目标跟踪模型(如 YOLO) -- `video_subject_crop`:依赖目标跟踪结果或目标跟踪模型 -- `video_subtitle_ocr`:依赖 OCR 检测、识别、方向分类模型 -- `video_text_ocr`:依赖 OCR 检测、识别、方向分类模型 -- `video_speech_asr`:依赖 ASR 模型 - -### 5.2 QwenVL 服务类算子 -- `video_sensitive_detect`:依赖 QwenVL HTTP 服务 -- `video_classify_qwenvl`:依赖 QwenVL HTTP 服务 -- `video_summary_qwenvl`:依赖 QwenVL HTTP 服务 -- `video_event_tag_qwenvl`:依赖 QwenVL HTTP 服务 - ---- - -## 6. 部署说明 - -在部署与运行前,需确保: - -- 所需模型已正确放置于模型库或模型目录; -- `model_root` 或 `DATAMATE_MODEL_ROOT` 配置正确; -- QwenVL 相关算子运行前,独立 HTTP 服务已正常启动; -- 模型相对路径与代码中的约定保持一致。 - ---- - -## 7. 说明 - -当前项目中的模型权重未进行参数修改,默认优先复用模型库中已有模型,不重复提交模型权重文件。 diff --git a/runtime/ops/mapper/_video_common/README_qwen_service.md b/runtime/ops/mapper/_video_common/README_qwen_service.md deleted file mode 100644 index d9331c8c7..000000000 --- a/runtime/ops/mapper/_video_common/README_qwen_service.md +++ /dev/null @@ -1,92 +0,0 @@ -# QwenVL 服务说明 - -## 1. 设计目的 - -以下视频算子依赖 QwenVL 进行语义理解能力推理: - -- 视频敏感内容检测 -- 视频分类 -- 视频摘要 -- 视频事件标注 - -为避免每个算子重复加载 QwenVL 模型,本项目采用独立 HTTP 服务方式提供统一推理能力。 - ---- - -## 2. 为什么采用独立 HTTP 服务 - -QwenVL 属于大模型,如果在每个算子中单独加载,会带来以下问题: - -- 模型重复初始化,耗时较高; -- 多个算子重复占用显存 / 内存; -- 分类、摘要、事件标注等连续执行时整体效率较低。 - -因此,项目采用独立 HTTP 服务进行统一推理,服务启动后模型常驻内存,由多个算子共享调用。 - ---- - -## 3. 服务工作方式 - -QwenVL 服务的基本工作流程如下: - -1. 启动独立推理服务; -2. 服务在启动时加载 QwenVL 模型; -3. 模型在服务进程内常驻; -4. 算子通过 `service_url` 发起 HTTP 请求; -5. 服务返回推理结果给对应算子。 - ---- - -## 4. 服务依赖的算子 - -当前依赖 QwenVL HTTP 服务的算子包括: - -- `video_sensitive_detect` -- `video_classify_qwenvl` -- `video_summary_qwenvl` -- `video_event_tag_qwenvl` - ---- - -## 5. 算子侧配置方式 - -QwenVL 相关算子侧主要依赖: - -- `service_url` - -算子本身不直接维护 QwenVL 模型路径,而是通过 `service_url` 调用已经启动的服务。 - ---- - -## 6. 服务侧配置要点 - -服务侧通常需要配置以下内容: - -- QwenVL 模型目录 -- 服务监听地址与端口 -- 推理任务类型 -- 运行设备环境 - -服务支持的任务类型可包括: - -- `sensitive` -- `classify25` -- `summary` -- `event_tag` - ---- - -## 7. 部署建议 - -为保证视频算子正常运行,建议: - -- 先启动 QwenVL HTTP 服务,再运行依赖该服务的算子; -- 确保服务地址可访问; -- 确保服务模型目录配置正确; -- 在文档中单独维护服务启动方式与部署说明。 - ---- - -## 8. 说明 - -当前仓库中的视频算子与 QwenVL 服务逻辑解耦,视频算子只负责组织输入、调用服务并处理结果;模型加载与推理执行由独立服务统一负责。 diff --git a/scripts/images/qwen-vl-service/Dockerfile b/scripts/images/qwen-vl-service/Dockerfile new file mode 100644 index 000000000..81513258e --- /dev/null +++ b/scripts/images/qwen-vl-service/Dockerfile @@ -0,0 +1,57 @@ +# QwenVL service image for DataMate video operators +# +# Build example: +# docker build -f scripts/images/qwen-vl-service/Dockerfile -t datamate-qwen-vl-service . +# +# Notes: +# 1. This image is used for the standalone QwenVL HTTP service. +# 2. Replace the base image according to the actual runtime environment. +# 3. If the service runs on Ascend / NPU, the base image must provide the +# corresponding device runtime. 
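+#
+# Illustrative run command (a sketch, not a verified deployment recipe: the
+# image tag matches the build example above, the port and model path match
+# the ENV defaults below, and Ascend hosts additionally need the NPU device
+# mappings):
+#   docker run --rm -p 18080:18080 \
+#     -v /mnt/models:/mnt/models \
+#     -e QWEN_MODEL_DIR=/mnt/models/qwen/Qwen2.5-VL-7B-Instruct \
+#     datamate-qwen-vl-service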
+ +ARG BASE_IMAGE=python:3.10-slim +FROM ${BASE_IMAGE} + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=1 \ + APP_HOME=/workspace/datamate \ + QWEN_SERVER_PORT=18080 \ + QWEN_MODEL_DIR=/mnt/models/qwen/Qwen2.5-VL-7B-Instruct + +WORKDIR ${APP_HOME} + +RUN apt-get update && apt-get install -y --no-install-recommends \ + git \ + curl \ + ca-certificates \ + libglib2.0-0 \ + libsm6 \ + libxext6 \ + libxrender1 \ + && rm -rf /var/lib/apt/lists/* + +RUN python -m pip install --upgrade pip setuptools wheel + +# Copy service script +COPY scripts/images/qwen-vl-service/qwen_vl_server.py ${APP_HOME}/qwen_vl_server.py +COPY scripts/images/qwen-vl-service/start.sh /usr/local/bin/start.sh +RUN chmod +x /usr/local/bin/start.sh + +# Install dependencies inside image. +# If the actual deployment environment uses a dedicated runtime image +# that already contains the required packages, this section can be adjusted. +RUN pip install \ + Flask \ + requests \ + Pillow \ + transformers \ + qwen-vl-utils \ + safetensors \ + tokenizers \ + accelerate \ + torch + +EXPOSE 18080 + +CMD ["/usr/local/bin/start.sh"] diff --git a/scripts/images/qwen-vl-service/qwen_vl_server.py b/scripts/images/qwen-vl-service/qwen_vl_server.py new file mode 100644 index 000000000..2920be30b --- /dev/null +++ b/scripts/images/qwen-vl-service/qwen_vl_server.py @@ -0,0 +1,231 @@ +# -*- coding: utf-8 -*- +import json +import os +import re +from flask import Flask, request, jsonify + +import torch +import torch_npu # noqa: F401 +from PIL import Image +from transformers import AutoTokenizer +from transformers import Qwen2VLImageProcessor, Qwen2_5_VLForConditionalGeneration + +# 允许通过环境变量覆盖 +DEFAULT_MODEL_DIR = "/mnt/nvme0n1/home/gtx/Video_Analysis_System/models/qwen/Qwen/Qwen2.5-VL-7B-Instruct" +MODEL_DIR = os.environ.get("QWEN_MODEL_DIR", DEFAULT_MODEL_DIR) +PREPROCESSOR_CFG = os.path.join(MODEL_DIR, "preprocessor_config.json") + +app = Flask(__name__) + +# ===== Load once ===== +cfg = json.load(open(PREPROCESSOR_CFG, "r", encoding="utf-8")) +MERGE_SIZE = int(cfg.get("merge_size", 1)) + +tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, trust_remote_code=True) +image_processor = Qwen2VLImageProcessor.from_pretrained(MODEL_DIR) +model = Qwen2_5_VLForConditionalGeneration.from_pretrained( + MODEL_DIR, + torch_dtype=torch.float16 +).to("npu").eval() + +# ---------- Sensitive labels ---------- +SENSITIVE_VALID = ["porn", "violence", "blood", "explosion", "politics", "none"] +SENSITIVE_SET = set(SENSITIVE_VALID) + +# ---------- Classify 25 ---------- +CLASS25 = [ + "日常生活", "影视剧集", "音乐舞蹈", "幽默搞笑", "游戏电竞", + "动漫二次元", "新闻时事", "教育教学", "科技数码", "财经商业", + "纪录片", "体育竞技", "美食烹饪", "时尚美妆", "汽车交通", + "萌宠动物", "健康健身", "自然风光", "三农", "监控安防", + "广告营销", "才艺展示", "军事国防", "情感心理", "其他" +] + +# ================= Prompt builders ================= +def label_only_prompt() -> str: + # 越短越好,避免模型复述规则 + return "只输出一个词:porn|violence|blood|explosion|politics|none。不要解释。" + + +def classify25_prompt() -> str: + items = "\n".join([f"{i+1}. {c}" for i, c in enumerate(CLASS25)]) + return ( + "你是视频分类器。根据图片判断视频类别。\n" + "只输出一个数字编号(1-25),不要解释、不要输出其它内容。\n" + f"类别列表:\n{items}\n" + "输出示例:8" + ) + + +def summary_prompt(language: str = "zh", style: str = "normal") -> str: + if (language or "zh").lower().startswith("en"): + if style == "short": + return "Summarize the video in one sentence based on the image. No extra text." + if style == "detail": + return "Summarize the video in 3-5 sentences based on the image, including objects, actions, and scene. 
No extra text." + return "Summarize the video based on the image. Be concise. No extra text." + # zh + if style == "short": + return "用一句话概括这段视频内容。不要解释。" + if style == "detail": + return "用3-5句概括视频内容,包含关键对象、动作、场景。不要解释。" + return "概括视频内容,包含关键对象、动作、场景。不要解释。" + + +def event_tag_prompt() -> str: + return "根据图片判断正在发生的事件,用短语输出事件名称(不超过10个字)。不要解释。" + + +# ================= Common helpers ================= +def build_prompt_with_image_tokens(user_text: str, num_image_tokens: int) -> str: + # 用 chat_template 插入视觉 token,然后展开 <|image_pad|> + messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": user_text}]}] + prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + prompt = prompt.replace("<|image_pad|>", "<|image_pad|>" * num_image_tokens) + return prompt + + +def extract_assistant_answer(raw_text: str) -> str: + """提取模型真正的回答,避免 prompt 回显导致误判: + - 截取最后一个 'assistant' 之后的内容 + - 取最后一行 + - 清理多余字符 + """ + if not raw_text: + return "" + + t = raw_text + idx = t.rfind("assistant") + if idx != -1: + t = t[idx + len("assistant"):] + + t = t.strip().splitlines()[-1].strip() + return t + + +def extract_assistant_answer_sensitive(raw_text: str) -> str: + """敏感检测专用:只保留英文/分隔符,降低回显污染风险。""" + t = extract_assistant_answer(raw_text) + t = re.sub(r"[^a-zA-Z|]+", " ", t).strip().lower() + return t + + +def normalize_sensitive_label(raw_text: str) -> str: + """严格匹配:只接受模型最终回答等于某个标签,否则返回 none。""" + ans = extract_assistant_answer_sensitive(raw_text) + + if "|" in ans: + parts = [p.strip() for p in ans.split("|") if p.strip()] + if parts and parts[-1] in SENSITIVE_SET: + return parts[-1] + return "none" + + if ans in SENSITIVE_SET: + return ans + + return "none" + + +def normalize_class25(raw_text: str) -> dict: + ans = extract_assistant_answer(raw_text).strip() + nums = re.findall(r"\d+", ans) + if not nums: + return {"id": 25, "label": "其他", "raw": ans} + idx = int(nums[-1]) + if idx < 1 or idx > 25: + idx = 25 + return {"id": idx, "label": CLASS25[idx - 1], "raw": ans} + + +def infer_raw_text(image_path: str, user_text: str, max_new_tokens: int = 64) -> str: + """返回模型原始输出文本(可能包含回显)。""" + image = Image.open(image_path).convert("RGB") + + img_inputs = image_processor(images=image, return_tensors="pt") + grid = img_inputs["image_grid_thw"][0] # [t,h,w] + num_patches = int(grid.prod().item()) + num_image_tokens = num_patches // (MERGE_SIZE * MERGE_SIZE) + + prompt = build_prompt_with_image_tokens(user_text, num_image_tokens) + text_inputs = tokenizer(prompt, return_tensors="pt") + + inputs = {**text_inputs, **img_inputs} + inputs = {k: v.to("npu") for k, v in inputs.items()} + + with torch.no_grad(): + out = model.generate( + **inputs, + max_new_tokens=int(max_new_tokens), + do_sample=False, + temperature=0.0 + ) + + return tokenizer.batch_decode(out, skip_special_tokens=True)[0] + + +@app.route("/health", methods=["GET"]) +def health(): + return jsonify({"ok": True, "model_dir": MODEL_DIR}) + + +@app.route("/infer", methods=["POST"]) +def infer_api(): + data = request.get_json(force=True) + image_path = data["image_path"] + task = data.get("task", "sensitive") + + # 通用可控项 + max_new_tokens = int(data.get("max_new_tokens", 64)) + language = data.get("language", "zh") + style = data.get("style", "normal") + + try: + if task == "sensitive": + raw = infer_raw_text(image_path, label_only_prompt(), max_new_tokens=8) + label = normalize_sensitive_label(raw) + answer = extract_assistant_answer_sensitive(raw) + + is_sensitive = (label != "none") + score = 0.90 if 
is_sensitive else 0.05 + + return jsonify({ + "task": task, + "is_sensitive": is_sensitive, + "label": label, + "score": float(score), + "reason": answer if answer else label + }) + + if task == "classify25": + raw = infer_raw_text(image_path, classify25_prompt(), max_new_tokens=16) + cls = normalize_class25(raw) + return jsonify({ + "task": task, + "class_id": int(cls["id"]), + "class_name": cls["label"], + "raw": cls.get("raw", "") + }) + + if task == "summary": + raw = infer_raw_text(image_path, summary_prompt(language=language, style=style), max_new_tokens=max_new_tokens) + return jsonify({ + "task": task, + "summary": extract_assistant_answer(raw).strip() + }) + + if task == "event_tag": + raw = infer_raw_text(image_path, event_tag_prompt(), max_new_tokens=max_new_tokens) + return jsonify({ + "task": task, + "event": extract_assistant_answer(raw).strip() + }) + + return jsonify({"task": task, "error": "unknown_task"}), 200 + + except Exception as e: + # 即使服务端异常也返回 JSON,避免客户端解析失败 + return jsonify({"task": task, "error": "server_error", "reason": str(e)[:200]}), 200 + + +if __name__ == "__main__": + app.run(host="127.0.0.1", port=18080, debug=False) \ No newline at end of file diff --git a/scripts/images/qwen-vl-service/start.sh b/scripts/images/qwen-vl-service/start.sh new file mode 100644 index 000000000..ccdb3071b --- /dev/null +++ b/scripts/images/qwen-vl-service/start.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash +set -euo pipefail + +PORT="${QWEN_SERVER_PORT:-18080}" + +# qwen_vl_server.py binds 127.0.0.1 in __main__ by default. +# Start the Flask app explicitly so the container listens on 0.0.0.0. +exec python -c "import qwen_vl_server as s; s.app.run(host='0.0.0.0', port=int('${PORT}'), debug=False)" From 7baa709936a0a613635a75a55d83d7e53a430efb Mon Sep 17 00:00:00 2001 From: guotingxuan5599 <1321352073@qq.com> Date: Fri, 27 Mar 2026 13:44:10 +0800 Subject: [PATCH 3/6] chore: update qwen vl service files with configurable model path --- scripts/images/qwen-vl-service/Dockerfile | 42 +++++++++---------- .../images/qwen-vl-service/qwen_vl_server.py | 36 ++++------------ scripts/images/qwen-vl-service/start.sh | 7 ++-- 3 files changed, 32 insertions(+), 53 deletions(-) diff --git a/scripts/images/qwen-vl-service/Dockerfile b/scripts/images/qwen-vl-service/Dockerfile index 81513258e..5ee874447 100644 --- a/scripts/images/qwen-vl-service/Dockerfile +++ b/scripts/images/qwen-vl-service/Dockerfile @@ -1,13 +1,12 @@ # QwenVL service image for DataMate video operators # -# Build example: -# docker build -f scripts/images/qwen-vl-service/Dockerfile -t datamate-qwen-vl-service . +# This image definition targets the standalone QwenVL HTTP service. # # Notes: -# 1. This image is used for the standalone QwenVL HTTP service. -# 2. Replace the base image according to the actual runtime environment. -# 3. If the service runs on Ascend / NPU, the base image must provide the -# corresponding device runtime. +# 1. The actual deployment environment uses Ascend / NPU with torch_npu and CANN. +# 2. Replace BASE_IMAGE with the actual Ascend-compatible runtime image in your environment. +# 3. Model weights are mounted from host storage and are not copied into the image. +# 4. The model path is configured by QWEN_MODEL_DIR and should be adjusted at deployment time. 
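+# 5. Smoke test once the service is running (illustrative only; the JSON
+#    fields mirror the /infer handler in qwen_vl_server.py, and the image
+#    path is a placeholder that must be readable by the service process):
+#      curl http://127.0.0.1:18080/health
+#      curl -X POST http://127.0.0.1:18080/infer \
+#        -H 'Content-Type: application/json' \
+#        -d '{"image_path": "/tmp/frame.jpg", "task": "classify25"}'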
ARG BASE_IMAGE=python:3.10-slim FROM ${BASE_IMAGE} @@ -16,8 +15,9 @@ ENV PYTHONDONTWRITEBYTECODE=1 \ PYTHONUNBUFFERED=1 \ PIP_NO_CACHE_DIR=1 \ APP_HOME=/workspace/datamate \ + QWEN_SERVER_HOST=0.0.0.0 \ QWEN_SERVER_PORT=18080 \ - QWEN_MODEL_DIR=/mnt/models/qwen/Qwen2.5-VL-7B-Instruct + QWEN_MODEL_DIR=/mnt/models/qwen/Qwen/Qwen2.5-VL-7B-Instruct WORKDIR ${APP_HOME} @@ -33,25 +33,23 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ RUN python -m pip install --upgrade pip setuptools wheel -# Copy service script +# Python packages aligned to the observed service environment. +# If the actual deployment base image already includes these packages, +# this section can be adjusted accordingly. +RUN pip install \ + Flask==3.1.3 \ + requests==2.32.5 \ + Pillow==12.1.0 \ + transformers==5.2.0 \ + safetensors==0.7.0 \ + tokenizers==0.22.2 \ + accelerate==1.12.0 \ + torch==2.9.0 + COPY scripts/images/qwen-vl-service/qwen_vl_server.py ${APP_HOME}/qwen_vl_server.py COPY scripts/images/qwen-vl-service/start.sh /usr/local/bin/start.sh RUN chmod +x /usr/local/bin/start.sh -# Install dependencies inside image. -# If the actual deployment environment uses a dedicated runtime image -# that already contains the required packages, this section can be adjusted. -RUN pip install \ - Flask \ - requests \ - Pillow \ - transformers \ - qwen-vl-utils \ - safetensors \ - tokenizers \ - accelerate \ - torch - EXPOSE 18080 CMD ["/usr/local/bin/start.sh"] diff --git a/scripts/images/qwen-vl-service/qwen_vl_server.py b/scripts/images/qwen-vl-service/qwen_vl_server.py index 2920be30b..f2b2b2417 100644 --- a/scripts/images/qwen-vl-service/qwen_vl_server.py +++ b/scripts/images/qwen-vl-service/qwen_vl_server.py @@ -2,6 +2,7 @@ import json import os import re + from flask import Flask, request, jsonify import torch @@ -10,11 +11,13 @@ from transformers import AutoTokenizer from transformers import Qwen2VLImageProcessor, Qwen2_5_VLForConditionalGeneration -# 允许通过环境变量覆盖 -DEFAULT_MODEL_DIR = "/mnt/nvme0n1/home/gtx/Video_Analysis_System/models/qwen/Qwen/Qwen2.5-VL-7B-Instruct" +DEFAULT_MODEL_DIR = "/mnt/models/qwen/Qwen/Qwen2.5-VL-7B-Instruct" MODEL_DIR = os.environ.get("QWEN_MODEL_DIR", DEFAULT_MODEL_DIR) PREPROCESSOR_CFG = os.path.join(MODEL_DIR, "preprocessor_config.json") +SERVER_HOST = os.environ.get("QWEN_SERVER_HOST", "127.0.0.1") +SERVER_PORT = int(os.environ.get("QWEN_SERVER_PORT", "18080")) + app = Flask(__name__) # ===== Load once ===== @@ -28,11 +31,9 @@ torch_dtype=torch.float16 ).to("npu").eval() -# ---------- Sensitive labels ---------- SENSITIVE_VALID = ["porn", "violence", "blood", "explosion", "politics", "none"] SENSITIVE_SET = set(SENSITIVE_VALID) -# ---------- Classify 25 ---------- CLASS25 = [ "日常生活", "影视剧集", "音乐舞蹈", "幽默搞笑", "游戏电竞", "动漫二次元", "新闻时事", "教育教学", "科技数码", "财经商业", @@ -41,9 +42,8 @@ "广告营销", "才艺展示", "军事国防", "情感心理", "其他" ] -# ================= Prompt builders ================= + def label_only_prompt() -> str: - # 越短越好,避免模型复述规则 return "只输出一个词:porn|violence|blood|explosion|politics|none。不要解释。" @@ -64,7 +64,6 @@ def summary_prompt(language: str = "zh", style: str = "normal") -> str: if style == "detail": return "Summarize the video in 3-5 sentences based on the image, including objects, actions, and scene. No extra text." return "Summarize the video based on the image. Be concise. No extra text." 
- # zh if style == "short": return "用一句话概括这段视频内容。不要解释。" if style == "detail": @@ -76,9 +75,7 @@ def event_tag_prompt() -> str: return "根据图片判断正在发生的事件,用短语输出事件名称(不超过10个字)。不要解释。" -# ================= Common helpers ================= def build_prompt_with_image_tokens(user_text: str, num_image_tokens: int) -> str: - # 用 chat_template 插入视觉 token,然后展开 <|image_pad|> messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": user_text}]}] prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) prompt = prompt.replace("<|image_pad|>", "<|image_pad|>" * num_image_tokens) @@ -86,43 +83,31 @@ def build_prompt_with_image_tokens(user_text: str, num_image_tokens: int) -> str def extract_assistant_answer(raw_text: str) -> str: - """提取模型真正的回答,避免 prompt 回显导致误判: - - 截取最后一个 'assistant' 之后的内容 - - 取最后一行 - - 清理多余字符 - """ if not raw_text: return "" - t = raw_text idx = t.rfind("assistant") if idx != -1: t = t[idx + len("assistant"):] - t = t.strip().splitlines()[-1].strip() return t def extract_assistant_answer_sensitive(raw_text: str) -> str: - """敏感检测专用:只保留英文/分隔符,降低回显污染风险。""" t = extract_assistant_answer(raw_text) t = re.sub(r"[^a-zA-Z|]+", " ", t).strip().lower() return t def normalize_sensitive_label(raw_text: str) -> str: - """严格匹配:只接受模型最终回答等于某个标签,否则返回 none。""" ans = extract_assistant_answer_sensitive(raw_text) - if "|" in ans: parts = [p.strip() for p in ans.split("|") if p.strip()] if parts and parts[-1] in SENSITIVE_SET: return parts[-1] return "none" - if ans in SENSITIVE_SET: return ans - return "none" @@ -138,11 +123,10 @@ def normalize_class25(raw_text: str) -> dict: def infer_raw_text(image_path: str, user_text: str, max_new_tokens: int = 64) -> str: - """返回模型原始输出文本(可能包含回显)。""" image = Image.open(image_path).convert("RGB") img_inputs = image_processor(images=image, return_tensors="pt") - grid = img_inputs["image_grid_thw"][0] # [t,h,w] + grid = img_inputs["image_grid_thw"][0] num_patches = int(grid.prod().item()) num_image_tokens = num_patches // (MERGE_SIZE * MERGE_SIZE) @@ -165,7 +149,7 @@ def infer_raw_text(image_path: str, user_text: str, max_new_tokens: int = 64) -> @app.route("/health", methods=["GET"]) def health(): - return jsonify({"ok": True, "model_dir": MODEL_DIR}) + return jsonify({"ok": True, "model_dir": MODEL_DIR, "host": SERVER_HOST, "port": SERVER_PORT}) @app.route("/infer", methods=["POST"]) @@ -174,7 +158,6 @@ def infer_api(): image_path = data["image_path"] task = data.get("task", "sensitive") - # 通用可控项 max_new_tokens = int(data.get("max_new_tokens", 64)) language = data.get("language", "zh") style = data.get("style", "normal") @@ -223,9 +206,8 @@ def infer_api(): return jsonify({"task": task, "error": "unknown_task"}), 200 except Exception as e: - # 即使服务端异常也返回 JSON,避免客户端解析失败 return jsonify({"task": task, "error": "server_error", "reason": str(e)[:200]}), 200 if __name__ == "__main__": - app.run(host="127.0.0.1", port=18080, debug=False) \ No newline at end of file + app.run(host=SERVER_HOST, port=SERVER_PORT, debug=False) diff --git a/scripts/images/qwen-vl-service/start.sh b/scripts/images/qwen-vl-service/start.sh index ccdb3071b..29da01430 100644 --- a/scripts/images/qwen-vl-service/start.sh +++ b/scripts/images/qwen-vl-service/start.sh @@ -1,8 +1,7 @@ #!/usr/bin/env bash set -euo pipefail -PORT="${QWEN_SERVER_PORT:-18080}" +export QWEN_SERVER_HOST="${QWEN_SERVER_HOST:-0.0.0.0}" +export QWEN_SERVER_PORT="${QWEN_SERVER_PORT:-18080}" -# qwen_vl_server.py binds 127.0.0.1 in __main__ by default. 
-# Start the Flask app explicitly so the container listens on 0.0.0.0. -exec python -c "import qwen_vl_server as s; s.app.run(host='0.0.0.0', port=int('${PORT}'), debug=False)" +exec python /workspace/datamate/qwen_vl_server.py From 0d9eff433a3c263f5fad06d73c7528a63e4e965e Mon Sep 17 00:00:00 2001 From: guotingxuan5599 <1321352073@qq.com> Date: Wed, 8 Apr 2026 16:44:26 +0800 Subject: [PATCH 4/6] feat: update video operators and qwen vl service files --- .../mapper/video_classify_qwenvl/metadata.yml | 4 +- .../mapper/video_classify_qwenvl/process.py | 136 ++++++----- .../ops/mapper/video_speech_asr/__init__.py | 6 - .../ops/mapper/video_speech_asr/metadata.yml | 16 -- .../ops/mapper/video_speech_asr/process.py | 213 ----------------- .../ops/mapper/video_subtitle_ocr/process.py | 222 +++++++++++------- scripts/images/qwen-vl-service/Dockerfile | 4 - .../images/qwen-vl-service/qwen_vl_server.py | 49 ++-- 8 files changed, 241 insertions(+), 409 deletions(-) delete mode 100644 runtime/ops/mapper/video_speech_asr/__init__.py delete mode 100644 runtime/ops/mapper/video_speech_asr/metadata.yml delete mode 100644 runtime/ops/mapper/video_speech_asr/process.py diff --git a/runtime/ops/mapper/video_classify_qwenvl/metadata.yml b/runtime/ops/mapper/video_classify_qwenvl/metadata.yml index 1f27ca1a9..67c47b0c3 100644 --- a/runtime/ops/mapper/video_classify_qwenvl/metadata.yml +++ b/runtime/ops/mapper/video_classify_qwenvl/metadata.yml @@ -1,7 +1,7 @@ name: '视频分类(QwenVL)' name_en: 'Video Classify (QwenVL)' -description: '抽帧调用 QwenVL classify25,多帧投票输出分类结果 classification.json。' -description_en: 'Sample frames and call QwenVL classify25; vote to output classification.json.' +description: '抽帧调用 QwenVL classify22,多帧投票输出分类结果 classification.json。' +description_en: 'Sample frames and call QwenVL classify22; vote to output classification.json.' 
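+# Note: "classify22" refers to the reduced 22-class label set; the HTTP task
+# id sent to the QwenVL service is still "classify25" (see process.py).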
language: 'python' vendor: 'huawei' raw_id: 'VideoClassifyQwenVL' diff --git a/runtime/ops/mapper/video_classify_qwenvl/process.py b/runtime/ops/mapper/video_classify_qwenvl/process.py index 48a0a6082..fa063cf73 100644 --- a/runtime/ops/mapper/video_classify_qwenvl/process.py +++ b/runtime/ops/mapper/video_classify_qwenvl/process.py @@ -3,18 +3,34 @@ import json import collections import cv2 +import importlib from .._video_common.paths import make_run_dir, ensure_dir from .._video_common.log import get_logger from .._video_common.io_video import get_video_info -from .._video_common.qwen_http_client import qwenvl_infer_by_image_path, save_frame_to_jpg + +_qwen = importlib.import_module("tools.qwen_sensitive") +qwenvl_infer = _qwen.qwenvl_infer + + +CLASS_NAMES = [ + "影视剧情类", "新闻资讯类", "教育知识类", "美食饮品类", "自然风光类", + "时尚美妆类", "亲子育儿类", "宠物日常类", "游戏电竞类", "音乐舞蹈类", + "动漫二次元类", "数码产品类", "汽车交通类", "财经商业类", "文化艺术类", + "乐器演奏类", "国防军事类", "体育竞技类", "野生动物类", "农业类", + "航空航天类", "其他类" +] + +DEFAULT_CLASS_ID = len(CLASS_NAMES) +DEFAULT_CLASS_NAME = CLASS_NAMES[-1] def _sample_frame_indices(total_frames: int, fps: float, sample_fps: float, max_frames: int): if total_frames <= 0: return [] + sample_fps = float(sample_fps) fps = float(fps) if fps else 25.0 - step = max(1, int(round(fps / max(float(sample_fps), 1e-6)))) + step = max(1, int(round(fps / max(sample_fps, 0.0001)))) idxs = list(range(0, total_frames, step)) if max_frames and len(idxs) > int(max_frames): n = int(max_frames) @@ -23,92 +39,94 @@ def _sample_frame_indices(total_frames: int, fps: float, sample_fps: float, max_ class VideoClassifyQwenVL: - """ - 抽帧 + QwenVL HTTP 分类(对齐服务端 task=classify25): - 返回: {class_id, class_name, raw} + """视频分类 + + 思路:抽帧 -> 调 QwenVL 服务 task=classify25 -> 多帧投票输出 top1 params: - - service_url: 默认 http://127.0.0.1:18080 - - timeout_sec: 默认 180 - - sample_fps: 默认 1.0 - - max_frames: 默认 12 - - return_topk: 默认 3 - - max_new_tokens: 默认 16 - outputs: - - artifacts/classification.json + - sample_fps: float, default 1.0 + - max_frames: int, default 12 + - return_topk: int, default 3 """ - def execute(self, sample, params=None): - params = params or {} + @staticmethod + def execute(sample, params): video_path = sample["filePath"] export_path = sample.get("export_path", "./outputs") - out_dir = make_run_dir(export_path, "video_classify_qwenvl") + op_name = "video_classify_qwenvl" + out_dir = make_run_dir(export_path, op_name) log_dir = ensure_dir(os.path.join(out_dir, "logs")) art_dir = ensure_dir(os.path.join(out_dir, "artifacts")) frames_dir = ensure_dir(os.path.join(art_dir, "frames")) - logger = get_logger("VideoClassifyQwenVL", log_dir) - service_url = params.get("service_url", "http://127.0.0.1:18080") - timeout_sec = int(params.get("timeout_sec", 180)) + logger = get_logger(op_name, log_dir) + logger.info(f"video={video_path}") + logger.info(f"out_dir={out_dir}") + + fps, w, h, total = get_video_info(video_path) sample_fps = float(params.get("sample_fps", 1.0)) max_frames = int(params.get("max_frames", 12)) return_topk = int(params.get("return_topk", 3)) - max_new_tokens = int(params.get("max_new_tokens", 16)) - fps, W, H, total_frames = get_video_info(video_path) - idxs = _sample_frame_indices(total_frames, fps, sample_fps, max_frames) + idxs = _sample_frame_indices(total, fps, sample_fps, max_frames) + logger.info(f"fps={fps:.3f}, frames={total}, sample_fps={sample_fps}, idxs={len(idxs)}") cap = cv2.VideoCapture(video_path) if not cap.isOpened(): raise RuntimeError(f"Cannot open video: {video_path}") - votes = 
collections.Counter() - evidence = [] - - for idx in idxs: - cap.set(cv2.CAP_PROP_POS_FRAMES, idx) + votes = [] + evidences = [] + for k, fi in enumerate(idxs): + cap.set(cv2.CAP_PROP_POS_FRAMES, int(fi)) ok, frame = cap.read() - if not ok: + if not ok or frame is None: continue - frame_jpg = os.path.join(frames_dir, f"{idx:06d}.jpg") - save_frame_to_jpg(frame, frame_jpg) - - try: - res = qwenvl_infer_by_image_path( - image_path=frame_jpg, - task="classify25", - service_url=service_url, - max_new_tokens=max_new_tokens, - timeout=timeout_sec, - ) - except Exception as e: - logger.error(f"classify infer failed frame={idx}: {repr(e)}") - continue + jpg_path = os.path.join(frames_dir, f"frame_{int(fi):06d}.jpg") + cv2.imwrite(jpg_path, frame) - class_name = (res.get("class_name") or "其他").strip() - class_id = int(res.get("class_id", 25)) - votes[class_name] += 1 - evidence.append({"frame_idx": idx, "image_path": frame_jpg, "class_id": class_id, "class_name": class_name}) + resp = qwenvl_infer(frame, task="classify25", timeout=180) + cid = int(resp.get("class_id", DEFAULT_CLASS_ID) or DEFAULT_CLASS_ID) + cname = resp.get("class_name", DEFAULT_CLASS_NAME) or DEFAULT_CLASS_NAME - cap.release() + if cname not in CLASS_NAMES: + cid = DEFAULT_CLASS_ID + cname = DEFAULT_CLASS_NAME - topk = [{"label": k, "vote": int(v)} for k, v in votes.most_common(return_topk)] - top1 = topk[0]["label"] if topk else "其他" + votes.append(cname) + evidences.append({"frame_id": int(fi), "jpg": jpg_path, "class_id": cid, "class_name": cname}) + + logger.info(f"[{k+1}/{len(idxs)}] frame={fi} -> {cid}:{cname}") + + cap.release() - result = { - "top1": top1, - "topk": topk, - "service_url": service_url, - "sample_fps": sample_fps, - "max_frames": max_frames, - "evidence": evidence, - } + if not votes: + result = { + "top1": {"class_id": DEFAULT_CLASS_ID, "class_name": DEFAULT_CLASS_NAME, "score": 0.0}, + "topk": [], + "evidence": [] + } + else: + c = collections.Counter(votes) + top = c.most_common(max(1, return_topk)) + top1_name, top1_cnt = top[0] + top1_id = (CLASS_NAMES.index(top1_name) + 1) if top1_name in CLASS_NAMES else DEFAULT_CLASS_ID + result = { + "top1": {"class_id": int(top1_id), "class_name": top1_name, "score": float(top1_cnt / len(votes))}, + "topk": [{ + "class_id": (CLASS_NAMES.index(name) + 1) if name in CLASS_NAMES else DEFAULT_CLASS_ID, + "class_name": name, + "score": float(cnt / len(votes)) + } for name, cnt in top], + "evidence": evidences, + "meta": {"fps": float(fps), "width": int(w), "height": int(h), "total_frames": int(total)} + } json_path = os.path.join(art_dir, "classification.json") with open(json_path, "w", encoding="utf-8") as f: json.dump(result, f, ensure_ascii=False, indent=2) - logger.info(f"Done. classification_json={json_path}, top1={top1}") - return {"out_dir": out_dir, "classification_json": json_path, "top1": top1} \ No newline at end of file + logger.info(f"Done. 
classification_json={json_path}") + return {"out_dir": out_dir, "classification_json": json_path, "top1": result["top1"]} diff --git a/runtime/ops/mapper/video_speech_asr/__init__.py b/runtime/ops/mapper/video_speech_asr/__init__.py deleted file mode 100644 index cc00c40da..000000000 --- a/runtime/ops/mapper/video_speech_asr/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from datamate.core.base_op import OPERATORS - -OPERATORS.register_module( - module_name="VideoSpeechASR", - module_path="ops.mapper.video_speech_asr.process", -) diff --git a/runtime/ops/mapper/video_speech_asr/metadata.yml b/runtime/ops/mapper/video_speech_asr/metadata.yml deleted file mode 100644 index 0847fa477..000000000 --- a/runtime/ops/mapper/video_speech_asr/metadata.yml +++ /dev/null @@ -1,16 +0,0 @@ -name: '语音识别ASR' -name_en: 'Video Speech ASR' -description: '从视频抽取音频并进行语音识别,输出 asr.json(可含时间戳);支持指定语言/模型规模等参数。' -description_en: 'Extract audio and run ASR, outputs asr.json (with timestamps); supports language/model options.' -language: 'python' -vendor: 'huawei' -raw_id: 'VideoSpeechASR' -version: '1.0.0' -types: - - 'annotation' -modal: 'video' -effect: - before: '' - after: '' -inputs: 'video' -outputs: 'text' \ No newline at end of file diff --git a/runtime/ops/mapper/video_speech_asr/process.py b/runtime/ops/mapper/video_speech_asr/process.py deleted file mode 100644 index 711698995..000000000 --- a/runtime/ops/mapper/video_speech_asr/process.py +++ /dev/null @@ -1,213 +0,0 @@ -# -*- coding: utf-8 -*- -import os -import json -import shutil -import subprocess -import re - -from .._video_common.paths import make_run_dir, ensure_dir -from .._video_common.log import get_logger - - -def _write_srt(segments, srt_path): - def _fmt(t): - h = int(t // 3600) - m = int((t % 3600) // 60) - s = int(t % 60) - ms = int(round((t - int(t)) * 1000)) - return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}" - - with open(srt_path, "w", encoding="utf-8") as f: - for i, seg in enumerate(segments, 1): - f.write(str(i) + "\n") - f.write(f"{_fmt(seg['start'])} --> {_fmt(seg['end'])}\n") - f.write((seg.get("text") or "").strip() + "\n\n") - - -def _contains_cjk(s: str) -> bool: - return bool(re.search(r"[\u4e00-\u9fff]", s or "")) - - -def _to_simplified(text: str) -> str: - try: - from opencc import OpenCC - return OpenCC("t2s").convert(text) - except Exception: - return text - - -class VideoSpeechASR: - """语音转文字(优先 faster-whisper;失败自动回退 openai-whisper) - - params: - - ffmpeg_path: str, optional - - model: tiny|base|small|medium|large-v3, default small - - language: auto|zh|en, default zh - - beam_size: int, default 5 - - vad_filter: bool, default True - - compute_type: int8|int8_float16|float16|float32, default int8 - - sample_rate: int, default 16000 - - channels: int, default 1 - - max_audio_sec: float, optional - - zh_script: simplified|traditional|keep, default simplified - - # 离线/本地模型(faster-whisper) - - fw_model_path: str, optional # 本地模型路径(目录) - - fw_download_root: str, optional - - local_files_only: bool, default False - - outputs: - - artifacts/audio.wav - - artifacts/asr.json / asr.txt / asr.srt - - artifacts/asr_backend.json(记录用了哪个后端/异常信息) - """ - - @staticmethod - def execute(sample, params): - video_path = sample["filePath"] - export_path = sample.get("export_path", "./outputs") - - op_name = "video_speech_asr" - out_dir = make_run_dir(export_path, op_name) - log_dir = ensure_dir(os.path.join(out_dir, "logs")) - art_dir = ensure_dir(os.path.join(out_dir, "artifacts")) - - logger = get_logger(op_name, log_dir) - 
logger.info(f"video={video_path}") - logger.info(f"out_dir={out_dir}") - - ffmpeg_path = params.get("ffmpeg_path") or shutil.which("ffmpeg") - if not ffmpeg_path: - raise RuntimeError("ffmpeg not found. Please install ffmpeg or pass params.ffmpeg_path") - - sr = int(params.get("sample_rate", 16000)) - ch = int(params.get("channels", 1)) - max_audio_sec = params.get("max_audio_sec", None) - max_audio_sec = float(max_audio_sec) if max_audio_sec is not None else None - - audio_path = os.path.join(art_dir, "audio.wav") - cmd = [ - ffmpeg_path, "-hide_banner", "-y", - "-i", video_path, - "-vn", - "-ac", str(ch), - "-ar", str(sr), - "-c:a", "pcm_s16le", - ] - if max_audio_sec is not None and max_audio_sec > 0: - cmd += ["-t", f"{max_audio_sec}"] - cmd += [audio_path] - - logger.info("FFmpeg cmd: " + " ".join(cmd)) - p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) - if p.returncode != 0: - raise RuntimeError(f"FFmpeg failed (code={p.returncode}).\nSTDERR:\n{p.stderr}") - - model_name = (params.get("model", "small") or "small") - language = (params.get("language", "zh") or "zh").lower() - beam_size = int(params.get("beam_size", 5)) - vad_filter = bool(params.get("vad_filter", True)) - compute_type = (params.get("compute_type", "int8") or "int8") - zh_script = (params.get("zh_script", "simplified") or "simplified").lower() - - fw_model_path = params.get("fw_model_path", None) - fw_download_root = params.get("fw_download_root", None) - local_files_only = bool(params.get("local_files_only", False)) - - segments = [] - full_text = "" - backend_info = {"backend": None, "error": None} - - # ===== try faster-whisper ===== - try: - from faster_whisper import WhisperModel - backend_info["backend"] = "faster-whisper" - - # 离线策略:local_files_only 时,把 HF 的联网行为尽量关掉 - if local_files_only: - os.environ.setdefault("HF_HUB_OFFLINE", "1") - os.environ.setdefault("TRANSFORMERS_OFFLINE", "1") - - model_id = fw_model_path or model_name - logger.info(f"[ASR] faster-whisper load model={model_id}, compute_type={compute_type}, offline={local_files_only}") - - fw = WhisperModel( - model_id, - device="cpu", - compute_type=compute_type, - download_root=fw_download_root, - ) - - logger.info("[ASR] faster-whisper transcribe start...") - seg_iter, info = fw.transcribe( - audio_path, - language=None if language == "auto" else language, - beam_size=beam_size, - vad_filter=vad_filter, - ) - for s in seg_iter: - segments.append({"start": float(s.start), "end": float(s.end), "text": (s.text or "").strip()}) - full_text = " ".join([s["text"] for s in segments]).strip() - logger.info("[ASR] faster-whisper transcribe done.") - - except Exception as e: - # ===== fallback openai-whisper ===== - backend_info["backend"] = "openai-whisper" - backend_info["error"] = f"faster-whisper failed: {repr(e)}" - logger.warning("[ASR] faster-whisper failed, fallback openai-whisper. reason=" + repr(e)) - - try: - import whisper - except Exception as e2: - raise RuntimeError("ASR backend failed. 
Please install: pip install faster-whisper openai-whisper") from e2 - - logger.info(f"[ASR] openai-whisper load model={model_name} (slow on CPU)") - wmodel = whisper.load_model(model_name) - - wargs = {"fp16": False, "verbose": False} - if language != "auto": - wargs["language"] = language - - logger.info("[ASR] openai-whisper transcribe start...") - result = wmodel.transcribe(audio_path, **wargs) - logger.info("[ASR] openai-whisper transcribe done.") - - for seg in result.get("segments", []): - segments.append({ - "start": float(seg.get("start", 0.0)), - "end": float(seg.get("end", 0.0)), - "text": (seg.get("text") or "").strip() - }) - full_text = (result.get("text") or "").strip() - - # 简体化 - if zh_script == "simplified": - if _contains_cjk(full_text): - full_text = _to_simplified(full_text) - for s in segments: - if _contains_cjk(s["text"]): - s["text"] = _to_simplified(s["text"]) - - json_path = os.path.join(art_dir, "asr.json") - txt_path = os.path.join(art_dir, "asr.txt") - srt_path = os.path.join(art_dir, "asr.srt") - backend_path = os.path.join(art_dir, "asr_backend.json") - - with open(json_path, "w", encoding="utf-8") as f: - json.dump({"text": full_text, "segments": segments}, f, ensure_ascii=False, indent=2) - with open(txt_path, "w", encoding="utf-8") as f: - f.write(full_text + "\n") - _write_srt(segments, srt_path) - - with open(backend_path, "w", encoding="utf-8") as f: - json.dump(backend_info, f, ensure_ascii=False, indent=2) - - logger.info(f"Done. segments={len(segments)} asr_json={json_path}") - return { - "out_dir": out_dir, - "audio_wav": audio_path, - "asr_json": json_path, - "asr_txt": txt_path, - "asr_srt": srt_path, - "asr_backend": backend_path, - } \ No newline at end of file diff --git a/runtime/ops/mapper/video_subtitle_ocr/process.py b/runtime/ops/mapper/video_subtitle_ocr/process.py index 2c58753fb..b7b5b325d 100644 --- a/runtime/ops/mapper/video_subtitle_ocr/process.py +++ b/runtime/ops/mapper/video_subtitle_ocr/process.py @@ -4,47 +4,15 @@ import re import shutil import subprocess +from difflib import SequenceMatcher import cv2 import numpy as np -import inspect from .._video_common.paths import make_run_dir, ensure_dir from .._video_common.log import get_logger from .._video_common.io_video import get_video_info -from paddleocr import PaddleOCR -from .._video_common.model_paths import resolve_model_path -def build_paddle_ocr(params, ocr_lang: str, use_angle_cls: bool): - """ - 默认模型目录: - /mnt/models/ocr/det - /mnt/models/ocr/rec - /mnt/models/ocr/cls - 也支持 params['ocr_model_dir'] 指定(相对/绝对)。 - """ - ocr_root = resolve_model_path(params, "ocr_model_dir", "ocr") - det_dir = os.path.join(ocr_root, "det") - rec_dir = os.path.join(ocr_root, "rec") - cls_dir = os.path.join(ocr_root, "cls") - - # 目录不存在就直接报错,让用户去模型仓下载到固定位置 - for p in [det_dir, rec_dir] + ([cls_dir] if use_angle_cls else []): - if not os.path.exists(p): - raise RuntimeError(f"PaddleOCR model dir not found: {p}. 
Please download OCR models into model repo path.") - - sig = inspect.signature(PaddleOCR.__init__) - kw = {"lang": ocr_lang} - if "use_angle_cls" in sig.parameters: - kw["use_angle_cls"] = use_angle_cls - # PaddleOCR 3.4.0 支持这些 - if "det_model_dir" in sig.parameters: - kw["det_model_dir"] = det_dir - if "rec_model_dir" in sig.parameters: - kw["rec_model_dir"] = rec_dir - if "cls_model_dir" in sig.parameters and use_angle_cls: - kw["cls_model_dir"] = cls_dir - - return PaddleOCR(**kw) + def _write_srt(segments, srt_path): def _fmt(t): h = int(t // 3600) @@ -83,41 +51,138 @@ def _fix_english_spacing(text: str) -> str: return text t = text - - # 小写后接大写:ThisIs -> This Is t = re.sub(r"([a-z])([A-Z])", r"\1 \2", t) - - # 字母数字边界:A1 / 1A t = re.sub(r"([A-Za-z])(\d)", r"\1 \2", t) t = re.sub(r"(\d)([A-Za-z])", r"\1 \2", t) - - # 标点前去空格,标点后若紧跟字母则补空格(保守) t = re.sub(r"\s+([,.;:?!])", r"\1", t) t = re.sub(r"([,.;:?!])([A-Za-z])", r"\1 \2", t) - - # 多空格压缩 t = re.sub(r"\s+", " ", t).strip() return t def _norm_sub_key(text: str) -> str: - """用于合并的规范化 key:空格归一、末尾标点归一、英文小写化""" + """原始 key:保守,仅用于展示/回溯""" if not text: return "" t = text.strip() t = re.sub(r"\s+", " ", t) - # 去掉末尾重复标点(中英文都考虑) t = re.sub(r"[.。!?!?]+$", "", t).strip() - - # 英文占比高则统一小写,便于合并 if _english_ratio(t) > 0.40: t = t.lower() + return t + + +def _merge_norm_text(text: str) -> str: + """用于合并判断的更强规范化。 + 英文:小写、去大部分标点、压缩空格,额外生成 nospace 判断。 + 中文:保留汉字和数字,去尾部标点的影响。 + """ + if not text: + return "" + t = text.strip().lower() + t = t.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"') + # 去掉中英文尾标点影响 + t = re.sub(r"[.。,,!!??::;;~~_\-]+$", "", t).strip() + # 中间的特殊符号尽量弱化 + t = re.sub(r"[^0-9a-z\u4e00-\u9fff\s]", " ", t) + t = re.sub(r"\s+", " ", t).strip() + return t + +def _merge_nospace_text(text: str) -> str: + t = _merge_norm_text(text) + t = re.sub(r"\s+", "", t) return t +def _text_sim(a: str, b: str) -> float: + a1 = _merge_nospace_text(a) + b1 = _merge_nospace_text(b) + if not a1 or not b1: + return 0.0 + if a1 == b1: + return 1.0 + return SequenceMatcher(None, a1, b1).ratio() + + +def _choose_better_text(a: str, b: str) -> str: + """相似字幕合并时,保留更像完整句子的版本。""" + if not a: + return b + if not b: + return a + a_score = len(_merge_nospace_text(a)) + b_score = len(_merge_nospace_text(b)) + # 有明显结束标点的稍微加分 + if re.search(r"[.。!?!?]$", a.strip()): + a_score += 2 + if re.search(r"[.。!?!?]$", b.strip()): + b_score += 2 + return b if b_score > a_score else a + + +def _should_merge_hit(last_seg, hit, gap_merge: float, sim_thr: float = 0.90) -> bool: + dt = hit["t"] - last_seg["end"] + if dt > gap_merge: + return False + + # 1) 原始 key 一样 + if hit["key"] == last_seg["key"]: + return True + + # 2) 更强的规范化后完全一致(尤其适合英文空格/标点变化) + if _merge_nospace_text(hit["text"]) == _merge_nospace_text(last_seg["text"]): + return True + + # 3) 一方是另一方的子串,且时间很近 + a = _merge_nospace_text(last_seg["text"]) + b = _merge_nospace_text(hit["text"]) + if a and b and (a in b or b in a) and dt <= max(2.0, gap_merge): + return True + + # 4) 文本相似度高,则认为是同一句的 OCR 波动 + sim = _text_sim(last_seg["text"], hit["text"]) + if sim >= sim_thr: + return True + + return False + + +def _post_merge_segments(segments, gap_merge: float): + """第二轮后处理:处理 A, A', A'' 这类连续波动;必要时删除极短重复段。""" + if not segments: + return segments + + merged = [segments[0]] + for seg in segments[1:]: + last = merged[-1] + if _should_merge_hit(last, {"t": seg["start"], "text": seg["text"], "key": seg.get("key", "")}, gap_merge=max(gap_merge, 2.0), sim_thr=0.86): + last["end"] = max(last["end"], seg["end"]) + 
last["text"] = _choose_better_text(last["text"], seg["text"]) + last["key"] = _norm_sub_key(last["text"]) + last.setdefault("evidence", []).extend(seg.get("evidence", [])) + else: + merged.append(seg) + + # 删除明显的短抖动重复:前后两段极近且文本高度相似,保留更优那一段 + cleaned = [] + for seg in merged: + if not cleaned: + cleaned.append(seg) + continue + last = cleaned[-1] + sim = _text_sim(last["text"], seg["text"]) + if sim >= 0.92 and (seg["start"] - last["end"] <= max(2.0, gap_merge)): + last["end"] = max(last["end"], seg["end"]) + last["text"] = _choose_better_text(last["text"], seg["text"]) + last["key"] = _norm_sub_key(last["text"]) + last.setdefault("evidence", []).extend(seg.get("evidence", [])) + else: + cleaned.append(seg) + return cleaned + + def _roi_changed(cur_roi, last_roi, diff_thr=4.0): - """diff_thr 调低一点更敏感,避免跳过字幕变化""" if last_roi is None: return True a = cv2.cvtColor(cur_roi, cv2.COLOR_BGR2GRAY) @@ -179,15 +244,7 @@ def _deborder_ffmpeg(ffmpeg_path: str, in_video: str, out_video: str, logger): def _extract_texts_from_any(res): - """ - 兼容 PaddleOCR 多种返回: - - 传统:res = [ [ [box,(text,score)], ... ] ] - - 新 pipeline/dict:res 可能是 dict/对象,里头有 'rec_texts'/'rec_scores' 或 'texts'/'scores' - 返回: list[(text,score)] - """ out = [] - - # dict 风格 if isinstance(res, dict): keys_text = ["rec_texts", "texts", "text"] keys_score = ["rec_scores", "scores", "score"] @@ -201,7 +258,6 @@ def _extract_texts_from_any(res): if ks in res: scores = res[ks] break - if texts is not None: if isinstance(texts, str): out.append((texts, float(scores) if scores is not None else 0.0)) @@ -218,20 +274,16 @@ def _extract_texts_from_any(res): for t in texts: out.append((str(t), float(scores) if scores is not None else 0.0)) return out - if "result" in res: return _extract_texts_from_any(res["result"]) - # list 风格(传统) if isinstance(res, list): if len(res) == 0: return out - if isinstance(res[0], dict): for item in res: out.extend(_extract_texts_from_any(item)) return out - lines = res[0] if isinstance(res[0], list) else res for line in lines: try: @@ -245,7 +297,6 @@ def _extract_texts_from_any(res): continue return out - # 兜底 try: s = str(res) if s: @@ -256,25 +307,7 @@ def _extract_texts_from_any(res): class VideoSubtitleOCR: - """字幕 OCR(自动去黑边 + 固定下30% + 英文空格修复 + 去重合并) - - params: - - preprocess_deborder: bool, default True - - sample_fps: float, default 1.0 - - max_frames: int, default 240 - - subtitle_ratio: float, default 0.30 - - ocr_lang: ch|en, default ch - - min_score: float, default 0.0 - - roi_diff_thr: float, default 4.0 - - gap_merge_sec: float, default 1.2 # ✅ 更容易合并跨帧字幕 - - fix_english_space: bool, default True # ✅ 英文空格修复开关 - - outputs: - - artifacts/subtitles.json - - artifacts/subtitles.srt - - artifacts/frames/subtitle_*.jpg - - artifacts/deborder.mp4 (if preprocess_deborder=True) - """ + """字幕 OCR(增强版相邻重复合并)""" @staticmethod def execute(sample, params): @@ -293,7 +326,6 @@ def execute(sample, params): if not ffmpeg_path: raise RuntimeError("ffmpeg not found") - # ✅ 默认自动去黑边 if params.get("preprocess_deborder", True): deborder_mp4 = os.path.join(art_dir, "deborder.mp4") crop = _deborder_ffmpeg(ffmpeg_path, in_video, deborder_mp4, logger) @@ -308,7 +340,7 @@ def execute(sample, params): from paddleocr import PaddleOCR ocr_lang = params.get("ocr_lang", "ch") - ocr = build_paddle_ocr(params, ocr_lang=ocr_lang, use_angle_cls=False) + ocr = PaddleOCR(use_angle_cls=True, lang=ocr_lang) fps, w, h, total = get_video_info(src_video) sample_fps = float(params.get("sample_fps", 1.0)) @@ -316,7 +348,7 @@ def execute(sample, 
params): subtitle_ratio = float(params.get("subtitle_ratio", 0.30)) min_score = float(params.get("min_score", 0.0)) roi_diff_thr = float(params.get("roi_diff_thr", 4.0)) - gap_merge = float(params.get("gap_merge_sec", 1.2)) + gap_merge = float(params.get("gap_merge_sec", 2.0)) fix_en_space = bool(params.get("fix_english_space", True)) step = max(1, int(round(fps / max(sample_fps, 0.0001)))) @@ -358,14 +390,19 @@ def execute(sample, params): text = _fix_english_spacing(text) if text: - raw_hits.append({"t": t, "text": text, "key": _norm_sub_key(text), "frame_id": int(fi), "jpg": jpg_path}) + raw_hits.append({ + "t": t, + "text": text, + "key": _norm_sub_key(text), + "frame_id": int(fi), + "jpg": jpg_path + }) if (k + 1) % 20 == 0 or k == len(idxs) - 1: logger.info(f"[{k+1}/{len(idxs)}] frame={fi} hit={1 if text else 0} len={len(text)}") cap.release() - # ✅ 合并相邻相同字幕(按规范化 key 合并) segments = [] for hit in raw_hits: if not segments: @@ -379,8 +416,10 @@ def execute(sample, params): continue last = segments[-1] - if hit["key"] == last["key"] and (hit["t"] - last["end"] <= gap_merge): + if _should_merge_hit(last, hit, gap_merge=gap_merge, sim_thr=0.90): last["end"] = hit["t"] + last["text"] = _choose_better_text(last["text"], hit["text"]) + last["key"] = _norm_sub_key(last["text"]) last["evidence"].append({"t": hit["t"], "frame_id": hit["frame_id"], "jpg": hit["jpg"]}) else: segments.append({ @@ -391,11 +430,11 @@ def execute(sample, params): "evidence": [{"t": hit["t"], "frame_id": hit["frame_id"], "jpg": hit["jpg"]}], }) - # end 往后延一点,srt 更自然 + segments = _post_merge_segments(segments, gap_merge=gap_merge) + for seg in segments: seg["end"] = float(seg["end"] + max(0.4, 1.0 / max(sample_fps, 0.1))) - # 输出时不需要 key(但保留也无所谓;你想更干净就删掉) json_path = os.path.join(art_dir, "subtitles.json") srt_path = os.path.join(art_dir, "subtitles.srt") with open(json_path, "w", encoding="utf-8") as f: @@ -403,4 +442,9 @@ def execute(sample, params): _write_srt(segments, srt_path) logger.info(f"Done. subtitles={len(segments)} srt={srt_path}") - return {"out_dir": out_dir, "subtitles_json": json_path, "subtitles_srt": srt_path, "count": len(segments)} \ No newline at end of file + return { + "out_dir": out_dir, + "subtitles_json": json_path, + "subtitles_srt": srt_path, + "count": len(segments) + } diff --git a/scripts/images/qwen-vl-service/Dockerfile b/scripts/images/qwen-vl-service/Dockerfile index 5ee874447..99ceaf495 100644 --- a/scripts/images/qwen-vl-service/Dockerfile +++ b/scripts/images/qwen-vl-service/Dockerfile @@ -1,7 +1,5 @@ # QwenVL service image for DataMate video operators # -# This image definition targets the standalone QwenVL HTTP service. -# # Notes: # 1. The actual deployment environment uses Ascend / NPU with torch_npu and CANN. # 2. Replace BASE_IMAGE with the actual Ascend-compatible runtime image in your environment. @@ -34,8 +32,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ RUN python -m pip install --upgrade pip setuptools wheel # Python packages aligned to the observed service environment. -# If the actual deployment base image already includes these packages, -# this section can be adjusted accordingly. 
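+# torch_npu is intentionally not installed here: the import in
+# qwen_vl_server.py is assumed to be provided by the Ascend base image
+# (see the BASE_IMAGE note above), so a plain-CPU build of this image
+# will fail at service start-up.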
RUN pip install \ Flask==3.1.3 \ requests==2.32.5 \ diff --git a/scripts/images/qwen-vl-service/qwen_vl_server.py b/scripts/images/qwen-vl-service/qwen_vl_server.py index f2b2b2417..08150c819 100644 --- a/scripts/images/qwen-vl-service/qwen_vl_server.py +++ b/scripts/images/qwen-vl-service/qwen_vl_server.py @@ -20,7 +20,6 @@ app = Flask(__name__) -# ===== Load once ===== cfg = json.load(open(PREPROCESSOR_CFG, "r", encoding="utf-8")) MERGE_SIZE = int(cfg.get("merge_size", 1)) @@ -34,13 +33,15 @@ SENSITIVE_VALID = ["porn", "violence", "blood", "explosion", "politics", "none"] SENSITIVE_SET = set(SENSITIVE_VALID) -CLASS25 = [ - "日常生活", "影视剧集", "音乐舞蹈", "幽默搞笑", "游戏电竞", - "动漫二次元", "新闻时事", "教育教学", "科技数码", "财经商业", - "纪录片", "体育竞技", "美食烹饪", "时尚美妆", "汽车交通", - "萌宠动物", "健康健身", "自然风光", "三农", "监控安防", - "广告营销", "才艺展示", "军事国防", "情感心理", "其他" +CLASS_NAMES = [ + "影视剧情类", "新闻资讯类", "教育知识类", "美食饮品类", "自然风光类", + "时尚美妆类", "亲子育儿类", "宠物日常类", "游戏电竞类", "音乐舞蹈类", + "动漫二次元类", "数码产品类", "汽车交通类", "财经商业类", "文化艺术类", + "乐器演奏类", "国防军事类", "体育竞技类", "野生动物类", "农业类", + "航空航天类", "其他类" ] +DEFAULT_CLASS_ID = len(CLASS_NAMES) +DEFAULT_CLASS_NAME = CLASS_NAMES[-1] def label_only_prompt() -> str: @@ -48,12 +49,13 @@ def label_only_prompt() -> str: def classify25_prompt() -> str: - items = "\n".join([f"{i+1}. {c}" for i, c in enumerate(CLASS25)]) + items = "\n".join([f"{i+1}. {c}" for i, c in enumerate(CLASS_NAMES)]) return ( "你是视频分类器。根据图片判断视频类别。\n" - "只输出一个数字编号(1-25),不要解释、不要输出其它内容。\n" + f"只输出一个数字编号(1-{len(CLASS_NAMES)}),不要解释、不要输出其它内容。\n" f"类别列表:\n{items}\n" - "输出示例:8" + "如果无法明确归入前面的类别,请输出最后一个编号。\n" + f"输出示例:{len(CLASS_NAMES)}" ) @@ -115,16 +117,15 @@ def normalize_class25(raw_text: str) -> dict: ans = extract_assistant_answer(raw_text).strip() nums = re.findall(r"\d+", ans) if not nums: - return {"id": 25, "label": "其他", "raw": ans} + return {"id": DEFAULT_CLASS_ID, "label": DEFAULT_CLASS_NAME, "raw": ans} idx = int(nums[-1]) - if idx < 1 or idx > 25: - idx = 25 - return {"id": idx, "label": CLASS25[idx - 1], "raw": ans} + if idx < 1 or idx > len(CLASS_NAMES): + idx = DEFAULT_CLASS_ID + return {"id": idx, "label": CLASS_NAMES[idx - 1], "raw": ans} def infer_raw_text(image_path: str, user_text: str, max_new_tokens: int = 64) -> str: image = Image.open(image_path).convert("RGB") - img_inputs = image_processor(images=image, return_tensors="pt") grid = img_inputs["image_grid_thw"][0] num_patches = int(grid.prod().item()) @@ -149,7 +150,14 @@ def infer_raw_text(image_path: str, user_text: str, max_new_tokens: int = 64) -> @app.route("/health", methods=["GET"]) def health(): - return jsonify({"ok": True, "model_dir": MODEL_DIR, "host": SERVER_HOST, "port": SERVER_PORT}) + return jsonify({ + "ok": True, + "model_dir": MODEL_DIR, + "host": SERVER_HOST, + "port": SERVER_PORT, + "num_classes": len(CLASS_NAMES), + "classes": CLASS_NAMES + }) @app.route("/infer", methods=["POST"]) @@ -157,7 +165,6 @@ def infer_api(): data = request.get_json(force=True) image_path = data["image_path"] task = data.get("task", "sensitive") - max_new_tokens = int(data.get("max_new_tokens", 64)) language = data.get("language", "zh") style = data.get("style", "normal") @@ -167,10 +174,8 @@ def infer_api(): raw = infer_raw_text(image_path, label_only_prompt(), max_new_tokens=8) label = normalize_sensitive_label(raw) answer = extract_assistant_answer_sensitive(raw) - is_sensitive = (label != "none") score = 0.90 if is_sensitive else 0.05 - return jsonify({ "task": task, "is_sensitive": is_sensitive, @@ -190,7 +195,11 @@ def infer_api(): }) if task == "summary": - raw = 
diff --git a/scripts/images/qwen-vl-service/qwen_vl_server.py b/scripts/images/qwen-vl-service/qwen_vl_server.py
index f2b2b2417..08150c819 100644
--- a/scripts/images/qwen-vl-service/qwen_vl_server.py
+++ b/scripts/images/qwen-vl-service/qwen_vl_server.py
@@ -20,7 +20,6 @@

 app = Flask(__name__)

-# ===== Load once =====
 cfg = json.load(open(PREPROCESSOR_CFG, "r", encoding="utf-8"))
 MERGE_SIZE = int(cfg.get("merge_size", 1))

@@ -34,13 +33,15 @@
 SENSITIVE_VALID = ["porn", "violence", "blood", "explosion", "politics", "none"]
 SENSITIVE_SET = set(SENSITIVE_VALID)

-CLASS25 = [
-    "日常生活", "影视剧集", "音乐舞蹈", "幽默搞笑", "游戏电竞",
-    "动漫二次元", "新闻时事", "教育教学", "科技数码", "财经商业",
-    "纪录片", "体育竞技", "美食烹饪", "时尚美妆", "汽车交通",
-    "萌宠动物", "健康健身", "自然风光", "三农", "监控安防",
-    "广告营销", "才艺展示", "军事国防", "情感心理", "其他"
+CLASS_NAMES = [
+    "影视剧情类", "新闻资讯类", "教育知识类", "美食饮品类", "自然风光类",
+    "时尚美妆类", "亲子育儿类", "宠物日常类", "游戏电竞类", "音乐舞蹈类",
+    "动漫二次元类", "数码产品类", "汽车交通类", "财经商业类", "文化艺术类",
+    "乐器演奏类", "国防军事类", "体育竞技类", "野生动物类", "农业类",
+    "航空航天类", "其他类"
 ]
+DEFAULT_CLASS_ID = len(CLASS_NAMES)
+DEFAULT_CLASS_NAME = CLASS_NAMES[-1]


 def label_only_prompt() -> str:
@@ -48,12 +49,13 @@ def label_only_prompt() -> str:


 def classify25_prompt() -> str:
-    items = "\n".join([f"{i+1}. {c}" for i, c in enumerate(CLASS25)])
+    items = "\n".join([f"{i+1}. {c}" for i, c in enumerate(CLASS_NAMES)])
     return (
         "你是视频分类器。根据图片判断视频类别。\n"
-        "只输出一个数字编号(1-25),不要解释、不要输出其它内容。\n"
+        f"只输出一个数字编号(1-{len(CLASS_NAMES)}),不要解释、不要输出其它内容。\n"
         f"类别列表:\n{items}\n"
-        "输出示例:8"
+        "如果无法明确归入前面的类别,请输出最后一个编号。\n"
+        f"输出示例:{len(CLASS_NAMES)}"
     )


@@ -115,16 +117,15 @@ def normalize_class25(raw_text: str) -> dict:
     ans = extract_assistant_answer(raw_text).strip()
     nums = re.findall(r"\d+", ans)
     if not nums:
-        return {"id": 25, "label": "其他", "raw": ans}
+        return {"id": DEFAULT_CLASS_ID, "label": DEFAULT_CLASS_NAME, "raw": ans}
     idx = int(nums[-1])
-    if idx < 1 or idx > 25:
-        idx = 25
-    return {"id": idx, "label": CLASS25[idx - 1], "raw": ans}
+    if idx < 1 or idx > len(CLASS_NAMES):
+        idx = DEFAULT_CLASS_ID
+    return {"id": idx, "label": CLASS_NAMES[idx - 1], "raw": ans}


 def infer_raw_text(image_path: str, user_text: str, max_new_tokens: int = 64) -> str:
     image = Image.open(image_path).convert("RGB")
-
     img_inputs = image_processor(images=image, return_tensors="pt")
     grid = img_inputs["image_grid_thw"][0]
     num_patches = int(grid.prod().item())
@@ -149,7 +150,14 @@ def infer_raw_text(image_path: str, user_text: str, max_new_tokens: int = 64) ->

 @app.route("/health", methods=["GET"])
 def health():
-    return jsonify({"ok": True, "model_dir": MODEL_DIR, "host": SERVER_HOST, "port": SERVER_PORT})
+    return jsonify({
+        "ok": True,
+        "model_dir": MODEL_DIR,
+        "host": SERVER_HOST,
+        "port": SERVER_PORT,
+        "num_classes": len(CLASS_NAMES),
+        "classes": CLASS_NAMES
+    })


 @app.route("/infer", methods=["POST"])
@@ -157,7 +165,6 @@ def infer_api():
     data = request.get_json(force=True)
     image_path = data["image_path"]
     task = data.get("task", "sensitive")
-
     max_new_tokens = int(data.get("max_new_tokens", 64))
     language = data.get("language", "zh")
     style = data.get("style", "normal")
@@ -167,10 +174,8 @@ def infer_api():
         raw = infer_raw_text(image_path, label_only_prompt(), max_new_tokens=8)
         label = normalize_sensitive_label(raw)
         answer = extract_assistant_answer_sensitive(raw)
-
         is_sensitive = (label != "none")
         score = 0.90 if is_sensitive else 0.05
-
         return jsonify({
             "task": task,
             "is_sensitive": is_sensitive,
@@ -190,7 +195,11 @@ def infer_api():
         })

     if task == "summary":
-        raw = infer_raw_text(image_path, summary_prompt(language=language, style=style), max_new_tokens=max_new_tokens)
+        raw = infer_raw_text(
+            image_path,
+            summary_prompt(language=language, style=style),
+            max_new_tokens=max_new_tokens
+        )
         return jsonify({
             "task": task,
             "summary": extract_assistant_answer(raw).strip()
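The /infer handler dispatches on the task field; the sensitive and summary branches are visible in the hunks above, while classification goes through classify25_prompt and normalize_class25. A hedged client sketch (the task name "classify" and the port are assumptions; image_path, task, max_new_tokens, language and style are the only fields the handler is shown reading):

    import requests

    payload = {
        "image_path": "/data/frames/000001.jpg",  # must be a path readable by the service
        "task": "classify",                       # assumed name of the classification branch
        "max_new_tokens": 8,
    }
    resp = requests.post("http://127.0.0.1:8000/infer", json=payload, timeout=60)
    # normalize_class25 produces {"id": 1..len(CLASS_NAMES), "label": ..., "raw": ...};
    # how the endpoint wraps that dict is not shown in this hunk.
    print(resp.json())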
"-frames:v", "1", + "-q:v", str(p.quality), + "-vf", "format=yuvj420p", + tail_path + ] + rc, _ = _run(cmd) + if rc == 0 and os.path.exists(tail_path): + outputs.append({"kind": "tail", "time_sec": float(tail_time), "path": tail_path}) else: - # cover 失败不致命 - pass + tail_time = None - # 2) scene keyframes + # 3) scene keyframes + 真实时间戳 thr_candidates = p.threshold_candidates or [p.scene_threshold, 0.2, 0.15, 0.1, 0.06] - scene_files: List[str] = [] + scene_pairs: List[Tuple[Optional[float], str]] = [] used_thr: Optional[float] = None for thr in thr_candidates: - # 清掉旧的 scene 输出(保留 cover) + # 清掉旧的 scene 输出(保留 cover / tail) for f in _list_jpgs(key_dir): - if os.path.basename(f) != "cover.jpg": + base = os.path.basename(f) + if base not in ("cover.jpg", "tail.jpg"): try: os.remove(f) except Exception: pass - vf = f"select='gt(scene,{thr})',format=yuvj420p" + vf = f"select='gt(scene,{thr})',showinfo,format=yuvj420p" out_tpl = os.path.join(key_dir, "%06d.jpg") - # 兼容新旧 ffmpeg cmd = [ ffmpeg, "-hide_banner", "-y", "-i", video_path, "-vf", vf, "-q:v", str(p.quality), - "-frames:v", str(p.max_keyframes * 3), + "-frames:v", str(max(p.max_keyframes * 5, 50)), "-fps_mode", "vfr", out_tpl ] rc, log = _run(cmd) + if rc != 0 and "Unrecognized option 'fps_mode'" in log: cmd = [ ffmpeg, "-hide_banner", "-y", "-i", video_path, "-vf", vf, "-q:v", str(p.quality), - "-frames:v", str(p.max_keyframes * 3), + "-frames:v", str(max(p.max_keyframes * 5, 50)), "-vsync", "vfr", out_tpl ] rc, log = _run(cmd) - files = [f for f in _list_jpgs(key_dir) if os.path.basename(f) != "cover.jpg"] + files = [] + for f in _list_jpgs(key_dir): + base = os.path.basename(f) + if base not in ("cover.jpg", "tail.jpg"): + files.append(f) + + pts_times = _extract_pts_times_from_showinfo(log) + pairs = _pair_files_with_times(files, pts_times) + if files: - scene_files = files + scene_pairs = pairs used_thr = thr break - # 3) fallback:scene=0 时取中间帧 - if not scene_files: + # 4) fallback:scene=0 时取中间帧 + if not scene_pairs: t = duration / 2.0 if duration > 0 else 0.0 mid_path = os.path.join(key_dir, "000001.jpg") cmd = [ @@ -160,37 +218,61 @@ def run(self, video_path: str, out_dir: str, params: Optional[Dict[str, Any]] = ] rc, log = _run(cmd) if rc != 0 or (not os.path.exists(mid_path)): - raise RuntimeError(f"KeyframeExtractLocal failed: scene=0 and fallback midframe failed. log={log[-800:]}") - scene_files = [mid_path] + raise RuntimeError( + f"KeyframeExtractLocal failed: scene=0 and fallback midframe failed. 
log={log[-800:]}" + ) + scene_pairs = [(float(t), mid_path)] used_thr = None - # 4) 时间间隔过滤 + 截断 max_keyframes - # 这里用“均匀估计”时间戳(不解析 showinfo),足够用于过滤过密 - if duration > 0 and len(scene_files) > 1: - kept: List[Tuple[float, str]] = [] - last_t = -1e9 - for i, f in enumerate(scene_files): - t = (i / max(1, (len(scene_files) - 1))) * duration + # 5) 按真实时间过滤 + 截断 max_keyframes + kept: List[Tuple[Optional[float], str]] = [] + last_t = -1e9 + + for t, f in scene_pairs: + if t is None: + kept.append((None, f)) + else: if t - last_t >= p.min_interval_sec: - kept.append((t, f)) + kept.append((float(t), f)) last_t = t - if len(kept) >= p.max_keyframes: - break - for t, f in kept: - outputs.append({"kind": "scene", "time_sec": float(t), "path": f}) - else: - for f in scene_files[:p.max_keyframes]: - outputs.append({"kind": "scene", "time_sec": None, "path": f}) + + if len(kept) >= p.max_keyframes: + break + + if not kept and scene_pairs: + kept = [scene_pairs[0]] + + # 6) 先写 outputs,再删除未保留候选帧,保证目录和 json 一致 + kept_paths = set() + for t, f in kept: + kept_paths.add(os.path.abspath(f)) + outputs.append({ + "kind": "scene", + "time_sec": t, + "path": f + }) + + for f in _list_jpgs(key_dir): + base = os.path.basename(f) + if base in ("cover.jpg", "tail.jpg"): + continue + if os.path.abspath(f) not in kept_paths: + try: + os.remove(f) + except Exception: + pass out_json = os.path.join(artifacts, p.out_json_name) payload = { "input": video_path, "out_dir": out_dir, + "duration_sec": duration, "scene_threshold": p.scene_threshold, "used_scene_threshold": used_thr, "max_keyframes": p.max_keyframes, "min_interval_sec": p.min_interval_sec, "always_include_first": p.always_include_first, + "always_include_last": p.always_include_last, "keyframes": outputs, } with open(out_json, "w", encoding="utf-8") as f: @@ -213,6 +295,7 @@ def run(self, video_path: str, out_dir: str, params: Optional[Dict[str, Any]] = ap.add_argument("--max_keyframes", type=int, default=30) ap.add_argument("--min_interval_sec", type=float, default=1.0) ap.add_argument("--always_include_first", action="store_true") + ap.add_argument("--always_include_last", action="store_true") args = ap.parse_args() runner = VideoKeyframeExtractLocal() @@ -224,6 +307,7 @@ def run(self, video_path: str, out_dir: str, params: Optional[Dict[str, Any]] = "max_keyframes": args.max_keyframes, "min_interval_sec": args.min_interval_sec, "always_include_first": bool(args.always_include_first), + "always_include_last": bool(args.always_include_last), }, ) print(json.dumps(res, ensure_ascii=False, indent=2)) \ No newline at end of file From d488d953a20ebb18192c116107f5ea4197246e7d Mon Sep 17 00:00:00 2001 From: guotingxuan5599 <1321352073@qq.com> Date: Fri, 17 Apr 2026 09:39:29 +0800 Subject: [PATCH 6/6] chore: refine qwen vl service Dockerfile --- scripts/images/qwen-vl-service/Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/images/qwen-vl-service/Dockerfile b/scripts/images/qwen-vl-service/Dockerfile index 99ceaf495..c28c54987 100644 --- a/scripts/images/qwen-vl-service/Dockerfile +++ b/scripts/images/qwen-vl-service/Dockerfile @@ -1,10 +1,13 @@ # QwenVL service image for DataMate video operators # +# This Dockerfile provides the image build entry for the standalone QwenVL service. +# # Notes: # 1. The actual deployment environment uses Ascend / NPU with torch_npu and CANN. # 2. Replace BASE_IMAGE with the actual Ascend-compatible runtime image in your environment. # 3. Model weights are mounted from host storage and are not copied into the image. 
 # 4. The model path is configured by QWEN_MODEL_DIR and should be adjusted at deployment time.
+# 5. The default BASE_IMAGE is only a generic placeholder for repository delivery.

 ARG BASE_IMAGE=python:3.10-slim
 FROM ${BASE_IMAGE}
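As point 4 notes, the model location is driven by QWEN_MODEL_DIR. A sketch of how a deployment might validate the variable at service startup; this is illustrative only, the server reads its own configuration and the fallback path is hypothetical:

    import os

    # Fail fast if the mounted weights are missing (fallback path is hypothetical).
    MODEL_DIR = os.environ.get("QWEN_MODEL_DIR", "/models/Qwen2.5-VL")
    if not os.path.isdir(MODEL_DIR):
        raise RuntimeError(f"QWEN_MODEL_DIR does not point to a directory: {MODEL_DIR}")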