From d1e85b67ab6bd4cdc8e23adc5044dd22bafb7289 Mon Sep 17 00:00:00 2001
From: Vib-UX <btcvibhav@gmail.com>
Date: Mon, 1 Jun 2026 10:10:38 +0530
Subject: [PATCH 1/3] =?UTF-8?q?feat(ltx2):=20M1=20scaffolding=20=E2=80=94?=
 =?UTF-8?q?=20checkpoint=20inspection,=20GGUF=20conversion=20tool,=20model?=
 =?UTF-8?q?=20registration?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Begins LTX-2.3 (video-only) support for the Tether LTX-2 bounty (M1).

- docs/ltx2_feasibility.md: research findings (architecture, scope, risks).
- script/convert_ltx2_to_gguf.py (+requirements-ltx2.txt): safetensors -> GGUF
  converter that keeps only the video stream (drops audio DiT, AV cross-attn,
  audio VAE, vocoder). Filtering/naming is pure-stdlib so --dry-run needs no
  heavy deps; F16 plus Q4_0/Q5_1/Q8_0 supported. Validated against the real
  ltx-2.3-22b-dev header (1758 video tensors, 0 audio leaks).
- model.h / stable-diffusion.cpp / model.cpp: register VERSION_LTX2,
  sd_version_is_ltx2(), include in sd_version_is_dit(), "LTX-2" version string,
  and weight detection via video_embeddings_connector / patchify_proj.
---
 .gitignore                     |   2 +
 docs/ltx2_feasibility.md       | 199 +++++++++++++++++++++
 script/convert_ltx2_to_gguf.py | 314 +++++++++++++++++++++++++++++++++
 script/requirements-ltx2.txt   |   5 +
 src/model.cpp                  |   4 +
 src/model.h                    |  11 +-
 src/stable-diffusion.cpp       |   1 +
 7 files changed, 535 insertions(+), 1 deletion(-)
 create mode 100644 docs/ltx2_feasibility.md
 create mode 100644 script/convert_ltx2_to_gguf.py
 create mode 100644 script/requirements-ltx2.txt

diff --git a/.gitignore b/.gitignore
index b0e3af83f..e33f220c5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,3 +13,5 @@ output*.png
 models*
 *.log
 preview.png
+__pycache__/
+*.pyc
diff --git a/docs/ltx2_feasibility.md b/docs/ltx2_feasibility.md
new file mode 100644
index 000000000..b882e6b33
--- /dev/null
+++ b/docs/ltx2_feasibility.md
@@ -0,0 +1,199 @@
+# LTX-2 Support — Feasibility & Research Findings
+
+Status: research pass (Step 1). No code plan yet. This document captures what LTX-2
+actually is, what this repo already provides, and the open
+decisions/risks that must be resolved before a concrete implementation plan.
+
+---
+
+## 1. What LTX-2 actually is
+
+LTX-2 is Lightricks' open-weights **audio-video** DiT foundation model (released
+2026-01-06, arXiv:2601.03233). It is **not** the same as the older `LTX-Video` model
+(arXiv:2501.00103) for which this repo has a 73-line stub at
+[src/ltxv.hpp](src/ltxv.hpp) (only a `CausalConv3d` block, wired into nothing).
+
+Two HuggingFace repos exist:
+
+- `Lightricks/LTX-2` — the **19B** model = **14B video stream + 5B audio stream**.
+  - `ltx-2-19b-dev.safetensors` (~43 GB bf16), `...-fp8` (~27 GB), `...-fp4` (~20 GB),
+    `ltx-2-19b-distilled.safetensors` (8 steps, CFG=1), distilled LoRA, spatial/temporal upscalers.
+- `Lightricks/LTX-2.3` — a newer **22B** variant: `ltx-2.3-22b-dev.safetensors`,
+  `ltx-2.3-22b-distilled-1.1.safetensors`, `ltx-2.3-22b-distilled-lora-384-1.1.safetensors`.
+
+Reference code:
+
+- Official monorepo: `github.com/Lightricks/LTX-2` (`packages/ltx-core` has the model defs;
+  `packages/ltx-pipelines` has T2V/I2V pipelines).
+- Cleaner third-party PyTorch port: `deepbeepmeep/Wan2GP` `models/ltx2/ltx_core/...`
+  (useful, more digestible reference for VAE/transformer porting).
+
+### Scope per the grant
+
+In scope: **video stream only** — T2V + I2V. Out of scope (explicitly): the 5B audio
+stream, Audio-VAE, vocoder, spatial/temporal upscalers, video-to-video, training, GUI.
+
+This matters: LTX-2 has a **video-only inference path** (`VideoGemmaTextEncoderModel` +
+the transformer run without the audio stream / without A↔V cross-attention), so dropping
+audio is supported by design — we do not need to implement the audio half.
+
+---
+
+## 2. In-scope components and architecture
+
+```mermaid
+flowchart TD
+    P["Text prompt (+ optional image for I2V)"] --> TE
+    subgraph TE [Text Encoder - NET NEW]
+        G["Gemma 3-12B backbone (decoder-only LLM, frozen)"] --> FE["Multi-Layer Feature Extractor"]
+        FE --> TC["Text Connector (bidirectional, register/thinking tokens)"]
+    end
+    TC -->|"video context [B, seq, 4096]"| DiT
+    IMG["Input image / frames"] --> VAEenc["Video VAE encoder (I2V cond)"]
+    VAEenc --> DiT
+    NOISE["Init noise latent [B,128,F',H/32,W/32]"] --> DiT
+    subgraph DiT [Video DiT - 14B, ~48 blocks]
+        SA["Self-Attn + 3D RoPE"] --> TX["Text Cross-Attn"] --> FF["FFN, RMSNorm, AdaLN"]
+    end
+    DiT -->|denoise loop, flow-matching + CFG| LAT["Final video latents"]
+    LAT --> VAEdec["Video VAE decoder"]
+    VAEdec --> OUT["Frames -> MP4/AVI"]
+```
+
+### 2a. Video VAE (spatiotemporal causal)
+
+- Compression **32×32×8** (spatial 32, temporal 8), **128 latent channels**, no patchifier
+  in the transformer (1×1×1).
+- Encoder: `[B,3,F,H,W] -> [B,128, 1+(F-1)/8, H/32, W/32]`; requires `(F-1) % 8 == 0`.
+- Decoder: `[B,128,F,H,W] -> [B,3, 1+(F-1)*8, H*32, W*32]`.
+- Uses causal 3D convs, `PixelNorm`/`GroupNorm`, SiLU, and per-channel latent
+  mean/std statistics baked into the encoder (`per_channel_statistics.normalize`).
+- Reuse signal: the repo already implements causal 3D VAE machinery for Wan
+  (`WAN::CausalConv3d`, `WanVAE`, `WanVAERunner` in [src/wan.hpp](src/wan.hpp)) and the
+  stub `LTXV::CausalConv3d` in [src/ltxv.hpp](src/ltxv.hpp). The op set (Conv3d, causal
+  padding) is already present.
+
+### 2b. Video DiT (14B)
+
+- ~48 transformer blocks (shared depth with audio; video stream width is larger).
+- Each block (video-only path): RMSNorm + **AdaLN** (timestep-conditioned) -> **Self-Attn
+  with 3D RoPE** -> RMSNorm -> **Text Cross-Attn** -> RMSNorm + AdaLN -> **FFN**.
+  (The A↔V cross-attention sublayer is the audio coupling and is skipped for video-only.)
+- Reuse signal: this is structurally close to `WAN::Wan` /`WanAttentionBlock`
+  (AdaLN modulation, self-attn + text cross-attn, 3D RoPE via `Rope::gen_wan_pe` in
+  [src/rope.hpp](src/rope.hpp)). The DiffusionModel adapter pattern is `WanModel` in
+  [src/diffusion_model.hpp](src/diffusion_model.hpp).
+
+### 2c. Text encoder — Gemma 3-12B + Feature Extractor + Text Connector (BIGGEST NET-NEW PIECE)
+
+- Backbone: **Gemma 3-12B** decoder-only LLM (frozen), multilingual.
+  - The repo's LLM support (`src/llm.hpp`, `enum LLMArch { QWEN2_5_VL, QWEN3, MISTRAL_SMALL_3_2 }`)
+    does **not** include Gemma 3. This is net-new: Gemma 3 attention (sliding-window + global
+    layers), RMSNorm, GeGLU MLP, and a SentencePiece/Gemma tokenizer.
+- Multi-Layer Feature Extractor: aggregates **all** decoder layers `[B,T,D,L]`, mean-centered
+  scaling, flatten to `[B,T,D×L]`, learnable projection `W` (trained with LTX-2).
+- Text Connector: bidirectional transformer with learnable **register / "thinking" tokens**
+  replacing padded positions; the video connector outputs **`[B, seq, 4096]`**.
+- The Feature Extractor + Connector weights live in the LTX-2 checkpoint (not in Gemma).
+
+### 2d. Scheduler / guidance
+
+- LTX-2 uses flow-matching with `LTX2Scheduler` / `LinearQuadratic` timestep schedule and
+  Euler updates; CFG (and optionally STG/APG, out of scope).
+- Reuse signal: repo has `DiscreteFlowDenoiser` (`prediction_t::FLOW_PRED`) and ~14 sigma
+  schedulers in [src/denoiser.hpp](src/denoiser.hpp), but **no LinearQuadratic schedule**.
+  A new scheduler + the LTX timestep shift is net-new but small.
+
+---
+
+## 3. Mapping to existing repo infrastructure
+
+Reusable as-is or with light adaptation (Wan video pipeline is the template):
+
+- Video generation entry point: the public C API `sd_vid_gen_params_t` / `generate_video()`
+  in [include/stable-diffusion.h](include/stable-diffusion.h), plus the `vid_gen` CLI
+  mode.
+- Latent geometry helpers (`generate_init_latent`, `process_latent_in/out`,
+  `get_vae_scale_factor`, `get_latent_channel`, frame alignment) in
+  [src/stable-diffusion.cpp](src/stable-diffusion.cpp) — need LTX values (scale 32, 128 ch,
+  `(F-1)%8` alignment).
+- `GGMLBlock`/`GGMLRunner` framework, 3D RoPE, Conv3d, RMSNorm, AdaLN, attention ops in
+  [src/ggml_extend.hpp](src/ggml_extend.hpp).
+- GGUF conversion path: C++ `-M convert` -> `convert()` -> `save_to_gguf_file()`
+  ([src/model.cpp](src/model.cpp)); tensor-name remap in
+  [src/name_conversion.cpp](src/name_conversion.cpp).
+- Model registration pattern: `enum SDVersion` + `sd_version_is_*` + detection in
+  `ModelLoader::get_sd_version()` ([src/model.h](src/model.h),
+  [src/model.cpp](src/model.cpp)).
+
+Net-new (no existing equivalent):
+
+1. **Gemma 3-12B encoder** + tokenizer (largest single piece).
+2. **LTX Feature Extractor + Text Connector** (LTX-specific trained modules).
+3. **LTX Video VAE** (new class; can borrow Wan/`LTXV` conv blocks).
+4. **LTX DiT** (new class; structurally similar to `WAN::Wan`).
+5. **LinearQuadratic / LTX2 scheduler**.
+6. **Video-stream extraction + name conversion** from the combined 19B/22B checkpoint.
+7. **CI workflows** (`.github/workflows/` does not exist today) and a **Python/C++ GGUF
+   conversion** path for LTX (no model-conversion Python currently in the repo).
+8. **MP4 output** — CLI currently writes **MJPEG-in-AVI** (`avi_writer.h`), not MP4; the
+   grant asks for MP4 or raw frames (raw frames is the low-risk option).
+
+---
+
+## 4. Key risks / feasibility concerns
+
+1. **Gemma 3-12B is a full 12B LLM used as the text encoder.** This is effectively a
+   second model to implement from scratch (new arch in `llm.hpp` + tokenizer). It is the
+   single largest risk to the timeline and is not optional — LTX-2 conditioning depends on
+   the multi-layer Gemma features.
+2. **Memory budget vs grant target.** Target is Q4 ≤ 12 GB RAM (CPU) / ≤ 10 GB VRAM (GPU),
+   but 14B DiT (Q4 ≈ 7–8 GB) + Gemma 3-12B (Q4 ≈ 7 GB) + VAE cannot co-reside in 10 GB.
+   Feasible only via **sequential component staging** (encode text -> free encoder ->
+   load DiT -> free -> load VAE). The repo's `offload_params_to_cpu` + per-component
+   loading supports this, but the target is tight and must be validated early.
+3. **Checkpoint ambiguity (14B vs 19B vs 22B).** Need the exact target. The bf16 `dev`
+   checkpoint (~43 GB) is the conversion source; `fp8`/`fp4` are NVIDIA-specific and not a
+   usable GGUF source. Disk/bandwidth is significant.
+4. **Quality metric methodology.** PSNR ≥ 25 dB / SSIM ≥ 0.85 vs PyTorch must compare the
+   **same single-stage pipeline** at F16. The recommended LTX pipeline is **two-stage with a
+   spatial upscaler** (out of scope), so the reference must be the single-stage
+   base/distilled path, fixed seed, fixed scheduler/steps.
+5. **dev vs distilled.** `distilled` (8 steps, CFG=1) best hits "2s clip in <5 min" and
+   halves compute (no CFG). `dev` needs CFG (2× forward passes) + more steps. Recommend
+   targeting distilled first for the success metrics, dev for the quality baseline.
+6. **LTX-2.3 distilled needs a distilled-LoRA** for the standard pipeline -> may require
+   LoRA fusion at convert time. Repo has LoRA support, but fusing into a video DiT is new.
+7. **PR 2 (Bare addon) target repo is "TBD"** in the grant — blocked until the repo link
+   exists; pattern is `bare-llama-cpp`.
+
+---
+
+## 5. Open decisions (need your input before the code plan)
+
+1. **Target checkpoint:** `Lightricks/LTX-2` 19B (video stream = the "14B" in the grant)
+   or `Lightricks/LTX-2.3` 22B? Recommendation: start with `LTX-2` 19B (matches the 14B
+   figure, slightly smaller, has the cleaner Wan2GP reference port).
+2. **dev vs distilled first:** Recommendation: bring up **distilled** first (fewer steps,
+   CFG=1) to reach end-to-end video fastest, then add `dev` + CFG for the quality baseline.
+3. **Output format:** raw frames + optional MJPEG-AVI (reuse existing writer) for M1–M2,
+   and add real **MP4 (H.264)** later? Or is MJPEG-AVI acceptable for "MP4 or raw frames"?
+4. **Gemma encoder strategy:** implement Gemma 3 natively in `llm.hpp`, or is reusing an
+   external GGUF Gemma 3 encoder acceptable (still needs the LTX feature-extractor +
+   connector on top)? This is the biggest scoping lever.
+5. **Conversion tooling:** Python script (HF safetensors -> GGUF, like llama.cpp's
+   `convert_hf_to_gguf.py`) vs extending the in-repo C++ `-M convert` path. Recommendation:
+   a Python converter for the DiT/VAE/connector + reuse llama.cpp tooling for Gemma.
+
+---
+
+## 6. Conclusion
+
+The bounty is feasible but large. The video DiT and Video VAE map well onto the existing
+**Wan** video template, and the core ggml ops (Conv3d, 3D RoPE, AdaLN, flow-matching) already
+exist. The dominant risk and effort is the **Gemma 3-12B text encoder + LTX feature
+extractor/connector**, followed by the **memory budget** for running a 14B DiT + 12B encoder
+on consumer hardware. Recommended first concrete step (M1): pin the checkpoint, build the
+GGUF conversion + name map, add the `SDVersion`/detection scaffolding, and get the **Video
+VAE decode** running on CPU (smallest self-contained, visually verifiable component) before
+tackling the DiT and Gemma encoder.
diff --git a/script/convert_ltx2_to_gguf.py b/script/convert_ltx2_to_gguf.py
new file mode 100644
index 000000000..2513e0ead
--- /dev/null
+++ b/script/convert_ltx2_to_gguf.py
@@ -0,0 +1,314 @@
+#!/usr/bin/env python3
+"""Convert an LTX-2.3 safetensors checkpoint to GGUF (video stream only).
+
+LTX-2.3 (`Lightricks/LTX-2.3`, e.g. `ltx-2.3-22b-dev.safetensors`) is an
+audio-video model. Per the qvac LTX-2 bounty, only the *video* stream is in
+scope: the audio DiT, audio-video cross-attention, audio VAE and vocoder are
+dropped. The Gemma-3-12B text encoder is NOT in this checkpoint and is
+converted separately with llama.cpp tooling (see `docs/ltx2.md`).
+
+What is kept (video-only):
+  - vae.*                                  (CausalVideoAutoencoder enc/dec + stats)
+  - text_embedding_projection.video_*      (multi-layer Gemma feature aggregation)
+  - model.diffusion_model.patchify_proj / proj_out / scale_shift_table
+  - model.diffusion_model.adaln_single.*   (global timestep AdaLN)
+  - model.diffusion_model.prompt_adaln_single.*
+  - model.diffusion_model.video_embeddings_connector.*
+  - model.diffusion_model.transformer_blocks.N.{attn1,attn2,ff,
+        scale_shift_table,prompt_scale_shift_table}
+
+What is dropped (out of scope / audio):
+  - vocoder.*, audio_vae.*
+  - *.audio_* , audio_adaln_single.*, audio_embeddings_connector.*
+  - audio-video cross attention: *_to_video_attn, video_to_audio_attn,
+        av_ca_* , *_a2v_* , *_v2a_* , scale_shift_table_a2v_*
+
+The filtering/naming logic is pure-stdlib so `--dry-run` works without numpy/
+safetensors/gguf installed (it reads only the safetensors JSON header). The
+actual conversion path imports those libs lazily; install them with:
+    pip install -r script/requirements-ltx2.txt
+
+Examples:
+    # Inspect what would be converted (no heavy deps, no full read):
+    python script/convert_ltx2_to_gguf.py --src ltx-2.3-22b-dev.safetensors --dry-run
+
+    # Produce an F16 GGUF (the M1 deliverable):
+    python script/convert_ltx2_to_gguf.py --src ltx-2.3-22b-dev.safetensors \
+        --dst ltx-2.3-22b-video-f16.gguf --type f16
+
+    # Quantised checkpoints:
+    python script/convert_ltx2_to_gguf.py --src ltx-2.3-22b-dev.safetensors \
+        --dst ltx-2.3-22b-video-q8_0.gguf --type q8_0
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import struct
+import sys
+from typing import Iterator
+
+# --------------------------------------------------------------------------
+# Pure-stdlib filtering / naming (no third-party imports here on purpose)
+# --------------------------------------------------------------------------
+
+ARCH = "ltx2"  # architecture string handed to gguf.GGUFWriter
+
+# Quantisation types this tool can emit (the `--type` choices). F16 is the M1 deliverable.
+QUANT_TYPES = ("f16", "q8_0", "q5_1", "q4_0")
+
+# Leaf-name prefixes (the part after `transformer_blocks.N.`) that belong to the
+# audio stream or the audio-video coupling; dropped for video-only inference.
+_AUDIO_BLOCK_LEAVES = (
+    "audio_attn1",
+    "audio_attn2",
+    "audio_ff",
+    "audio_prompt_scale_shift_table",
+    "audio_scale_shift_table",
+    "audio_to_video_attn",
+    "video_to_audio_attn",
+    "scale_shift_table_a2v_ca_audio",
+    "scale_shift_table_a2v_ca_video",
+)
+
+_DM = "model.diffusion_model."  # name prefix of the DiT tensors
+_BLOCK_RE = None  # compiled lazily on the first _block_leaf() call
+
+
+def _block_leaf(name: str) -> str | None:
+    """Strip the `model.diffusion_model.transformer_blocks.N.` prefix; None if `name` lacks it."""
+    global _BLOCK_RE
+    if _BLOCK_RE is None:  # build the pattern once and cache it at module level
+        import re
+
+        _BLOCK_RE = re.compile(re.escape(_DM) + r"transformer_blocks\.\d+\.(.*)")
+    found = _BLOCK_RE.match(name)
+    return None if found is None else found.group(1)
+
+
+def is_video_tensor(name: str) -> bool:
+    """Decide whether tensor `name` belongs to the video-only model that is kept.
+
+    Returns False for any name matching an audio / vocoder / audio-video
+    cross-attention pattern, and True for every other name.
+    """
+    dropped_prefixes = (
+        "vocoder.",  # vocoder weights
+        "audio_vae.",  # audio VAE
+        # Top-level audio parts of the DiT (audio_adaln_single, audio_patchify_proj, ...).
+        _DM + "audio_",
+        "text_embedding_projection.audio_",
+        # Audio-video cross-attention scaffolding (top-level adaln tables).
+        _DM + "av_ca_",
+    )
+    if name.startswith(dropped_prefixes):
+        return False
+    if any(tag in name for tag in ("_a2v_", "_v2a_")):
+        return False
+    # Inside a transformer block, drop the audio / cross-modal sub-modules.
+    leaf = _block_leaf(name)
+    return leaf is None or not leaf.startswith(_AUDIO_BLOCK_LEAVES)
+
+
+def map_name(name: str) -> str:
+    """Translate a checkpoint tensor name into the name written to the GGUF.
+
+    Currently the identity: DiT tensors keep their `model.diffusion_model.*`
+    names (same convention as Wan) and VAE tensors keep `vae.*`, which the C++
+    name_conversion is expected to map to `first_stage_model.`. LTX-specific
+    blocks (connector, text projection) are also left unchanged, to be matched
+    by the `ltx2.hpp` block tree. Add renames here if the C++ loader needs them.
+    """
+    return name
+
+
+def should_quantize(name: str, shape: list[int], qtype: str) -> bool:
+    """Tell whether tensor `name` should be block-quantised to `qtype`.
+
+    True only for 2D `.weight` tensors outside the VAE whose name contains
+    neither `.norm` nor `_norm` and whose last dim is a multiple of 32, the
+    block size shared by q4_0/q5_1/q8_0. Everything else (1D norms/biases,
+    modulation tables, 5D conv weights, the whole VAE) is left unquantised,
+    and `qtype == "f16"` disables quantisation altogether.
+    """
+    if qtype == "f16":
+        return False
+    # Only 2D weight matrices are candidates.
+    is_matrix_weight = name.endswith(".weight") and len(shape) == 2
+    if not is_matrix_weight:
+        return False
+    # The VAE is small (~0.7B) and kept accurate; norm weights are excluded too.
+    excluded = name.startswith("vae.") or ".norm" in name or "_norm" in name
+    if excluded:
+        return False
+    # Block formats need the row length to be a whole number of 32-wide blocks.
+    return shape[-1] % 32 == 0
+
+
+# --------------------------------------------------------------------------
+# safetensors header (stdlib only)
+# --------------------------------------------------------------------------
+
+
+def read_safetensors_header(path: str) -> tuple[dict, int]:
+    """Parse only the leading JSON header; returns (header_dict, byte offset where tensor data starts)."""
+    with open(path, "rb") as fh:
+        header_len = struct.unpack("<Q", fh.read(8))[0]
+        raw_header = fh.read(header_len)
+    return json.loads(raw_header), 8 + header_len
+
+
+def plan(header: dict) -> tuple[list[str], list[str]]:
+    """Split the header's tensor names into sorted (keep, drop) lists; `__metadata__` is skipped."""
+    names = [key for key in header if key != "__metadata__"]
+    kept = [key for key in names if is_video_tensor(key)]
+    dropped = [key for key in names if not is_video_tensor(key)]
+    # Both lists are returned in sorted order.
+    return sorted(kept), sorted(dropped)
+
+
+def _numel(shape: list[int]) -> int:
+    """Product of all dims in `shape`; an empty shape yields 1."""
+    if not shape:
+        return 1
+    return shape[0] * _numel(shape[1:])
+
+
+def _dtype_bytes(dt: str) -> int:  # bytes per element of a safetensors dtype tag; unknown tags -> 2
+    return {"F64": 8, "I64": 8, "U64": 8, "F32": 4, "I32": 4, "U32": 4, "F16": 2, "BF16": 2, "I16": 2,
+            "U16": 2, "I8": 1, "U8": 1, "BOOL": 1, "F8_E4M3": 1, "F8_E5M2": 1}.get(dt, 2)
+
+
+def print_plan(header: dict, qtype: str) -> None:
+    keep, drop = plan(header)  # dry-run report: everything below only prints, nothing is written
+    sizes = [(_numel(header[k]["shape"]), _dtype_bytes(header[k]["dtype"])) for k in keep]
+    keep_params, keep_bytes = sum(n for n, _ in sizes), sum(n * b for n, b in sizes)
+    nq = len([k for k in keep if should_quantize(k, header[k]["shape"], qtype)])
+    print(f"arch: {ARCH}   target type: {qtype}")
+    print(f"KEEP (video): {len(keep)} tensors, {keep_params/1e9:.2f}B params, "
+          f"{keep_bytes/1e9:.2f} GB (source dtype)")
+    print(f"  of which quantised to {qtype}: {nq}; remaining kept as F16/F32")
+    print(f"DROP (audio/vocoder/av-cross): {len(drop)} tensors")
+    meta = header.get("__metadata__", {})
+    if "model_version" in meta:
+        print(f"model_version: {meta['model_version']}")
+    print("\nsample KEEP:")
+    for k in keep[:8]:
+        print(f"  + {k}  {header[k]['shape']} {header[k]['dtype']}")
+    print("sample DROP:")
+    for k in drop[:8]:
+        print(f"  - {k}  {header[k]['shape']} {header[k]['dtype']}")
+
+
+# --------------------------------------------------------------------------
+# Conversion (lazy heavy imports)
+# --------------------------------------------------------------------------
+
+
+def parse_config(header: dict) -> dict:
+    """Decode the JSON `config` string in `__metadata__`; {} if absent, invalid, or not a JSON object."""
+    meta = header.get("__metadata__", {})
+    raw = meta.get("config") if isinstance(meta, dict) else None
+    try:
+        cfg = json.loads(raw)  # raw is None when there is no config -> TypeError -> {}
+    except (json.JSONDecodeError, TypeError):
+        return {}
+    return cfg if isinstance(cfg, dict) else {}  # callers use cfg.get(), so never hand back a list/None
+
+
+def convert(src: str, dst: str, qtype: str, include_vae: bool) -> None:
+    """Write the video-only tensors of `src` to a GGUF at `dst`; eligible weights become `qtype`, the rest F16."""
+    import numpy as np
+    import gguf
+    from safetensors import safe_open
+
+    header, _ = read_safetensors_header(src)
+    keep, _ = plan(header)  # the drop list only matters for the dry-run report
+    if not include_vae:
+        keep = [k for k in keep if not k.startswith("vae.")]
+    cfg = parse_config(header)
+    tcfg = cfg.get("transformer", {})
+
+    writer = gguf.GGUFWriter(dst, ARCH)
+    writer.add_name("LTX-2.3 video")
+    writer.add_description("LTX-2.3 video stream (DiT + VideoVAE + connector), audio dropped")
+    if tcfg:
+        writer.add_uint32("ltx2.dit.num_layers", int(tcfg.get("num_layers", 48)))
+        writer.add_uint32("ltx2.dit.num_heads", int(tcfg.get("num_attention_heads", 32)))
+        writer.add_uint32("ltx2.dit.head_dim", int(tcfg.get("attention_head_dim", 128)))
+        writer.add_uint32("ltx2.dit.in_channels", int(tcfg.get("in_channels", 128)))
+        writer.add_uint32("ltx2.dit.caption_channels", int(tcfg.get("caption_channels", 3840)))
+        writer.add_uint32("ltx2.dit.cross_attention_dim", int(tcfg.get("cross_attention_dim", 4096)))
+        writer.add_uint32("ltx2.connector.num_layers", int(tcfg.get("connector_num_layers", 8)))
+        writer.add_uint32("ltx2.connector.num_registers", int(tcfg.get("connector_num_learnable_registers", 128)))
+        writer.add_float32("ltx2.rope.theta", float(tcfg.get("positional_embedding_theta", 10000.0)))
+
+    qmap = {
+        "f16": gguf.GGMLQuantizationType.F16,
+        "q8_0": gguf.GGMLQuantizationType.Q8_0,
+        "q5_1": gguf.GGMLQuantizationType.Q5_1,
+        "q4_0": gguf.GGMLQuantizationType.Q4_0,
+    }
+
+    n = 0
+    with safe_open(src, framework="numpy") as st:
+        for name in keep:
+            arr = st.get_tensor(name)
+            # Only tensors the header tags BF16 are bit-widened; a genuine U16 tensor is converted numerically.
+            if header[name]["dtype"] == "BF16" and arr.dtype.itemsize == 2:
+                arr = _bf16_to_f32(arr)
+            else:
+                arr = arr.astype(np.float32, copy=False)
+            out_name = map_name(name)
+            if should_quantize(name, list(arr.shape), qtype):
+                data = gguf.quants.quantize(arr, qmap[qtype])
+                writer.add_tensor(out_name, data, raw_dtype=qmap[qtype])
+            else:
+                writer.add_tensor(out_name, arr.astype(np.float16))
+            n += 1
+            if n % 100 == 0:
+                print(f"  ... {n}/{len(keep)} tensors", file=sys.stderr)
+
+    writer.write_header_to_file()
+    writer.write_kv_data_to_file()
+    writer.write_tensors_to_file()
+    writer.close()
+    print(f"wrote {n} tensors to {dst}")
+
+
+def _bf16_to_f32(arr):
+    """Reinterpret 16-bit bfloat16 patterns as float32 by moving them into the high 16 bits."""
+    import numpy as np
+    u16 = arr.view(np.uint16) if arr.dtype != np.uint16 else arr
+    u32 = u16.astype(np.uint32) << 16
+    return u32.view(np.float32)
+
+
+# --------------------------------------------------------------------------
+
+
+def main(argv: list[str]) -> int:
+    ap = argparse.ArgumentParser(description="Convert LTX-2.3 safetensors -> GGUF (video only)")
+    ap.add_argument("--src", required=True, help="input .safetensors checkpoint")
+    ap.add_argument("--dst", help="output .gguf path (required unless --dry-run)")
+    ap.add_argument("--type", default="f16", choices=QUANT_TYPES, help="output tensor type")
+    ap.add_argument("--no-vae", action="store_true", help="exclude VAE tensors from the GGUF")
+    ap.add_argument("--dry-run", action="store_true",
+                    help="only read the header and print the keep/drop plan")
+    args = ap.parse_args(argv)
+
+    header, _ = read_safetensors_header(args.src)
+
+    if args.dry_run:
+        print_plan(header, args.type)
+        return 0
+
+    if not args.dst:
+        ap.error("--dst is required unless --dry-run is given")
+    convert(args.src, args.dst, args.type, include_vae=not args.no_vae)
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main(sys.argv[1:]))
diff --git a/script/requirements-ltx2.txt b/script/requirements-ltx2.txt
new file mode 100644
index 000000000..c3cb793e9
--- /dev/null
+++ b/script/requirements-ltx2.txt
@@ -0,0 +1,5 @@
+# Dependencies for script/convert_ltx2_to_gguf.py (full conversion path only).
+# The --dry-run mode needs none of these (pure stdlib).
+numpy>=1.24
+safetensors>=0.4.0
+gguf>=0.10.0
diff --git a/src/model.cpp b/src/model.cpp
index 77b032c2c..7f7eaf913 100644
--- a/src/model.cpp
+++ b/src/model.cpp
@@ -1072,6 +1072,10 @@ SDVersion ModelLoader::get_sd_version() {
             if (tensor_storage.name.find("model.diffusion_model.cap_embedder.0.weight") != std::string::npos) {
                 return VERSION_Z_IMAGE;
             }
+            if (tensor_storage.name.find("model.diffusion_model.video_embeddings_connector.") != std::string::npos ||
+                tensor_storage.name.find("model.diffusion_model.patchify_proj.weight") != std::string::npos) {
+                return VERSION_LTX2;
+            }
             if (tensor_storage.name.find("model.diffusion_model.blocks.0.cross_attn.norm_k.weight") != std::string::npos) {
                 is_wan = true;
             }
diff --git a/src/model.h b/src/model.h
index 5b9ce18ab..aa46d5da8 100644
--- a/src/model.h
+++ b/src/model.h
@@ -50,6 +50,7 @@ enum SDVersion {
     VERSION_FLUX2_KLEIN,
     VERSION_Z_IMAGE,
     VERSION_OVIS_IMAGE,
+    VERSION_LTX2,
     VERSION_COUNT,
 };
 
@@ -137,6 +138,13 @@ static inline bool sd_version_is_z_image(SDVersion version) {
     return false;
 }
 
+static inline bool sd_version_is_ltx2(SDVersion version) {
+    // True only for VERSION_LTX2, the single LTX-2 entry in SDVersion;
+    // widen this check if further LTX-2 variants are added to the enum.
+    const bool matches_ltx2 = (version == VERSION_LTX2);
+    return matches_ltx2;
+}
+
 static inline bool sd_version_is_inpaint(SDVersion version) {
     if (version == VERSION_SD1_INPAINT ||
         version == VERSION_SD2_INPAINT ||
@@ -155,7 +163,8 @@ static inline bool sd_version_is_dit(SDVersion version) {
         sd_version_is_wan(version) ||
         sd_version_is_qwen_image(version) ||
         sd_version_is_anima(version) ||
-        sd_version_is_z_image(version)) {
+        sd_version_is_z_image(version) ||
+        sd_version_is_ltx2(version)) {
         return true;
     }
     return false;
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index 2a770eca2..c36e94def 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -53,6 +53,7 @@ const char* model_version_to_str[] = {
     "Flux.2 klein",
     "Z-Image",
     "Ovis Image",
+    "LTX-2",
 };
 
 const char* sampling_methods_str[] = {

From 434843a86dc01f574a9fe5286e8f86888659748b Mon Sep 17 00:00:00 2001
From: Vib-UX <btcvibhav@gmail.com>
Date: Mon, 1 Jun 2026 10:27:50 +0530
Subject: [PATCH 2/3] feat(ltx2): M1 DiT scaffolding + load-on-CPU verification

Add the config-driven LTX-2 video-DiT block tree (src/ltx2.hpp) and an
Ltx2Model diffusion-model adapter, then wire VERSION_LTX2 into init():
construct the runner, allocate params on CPU, and bind every tensor.
Geometry is inferred from checkpoint shapes so reduced-size synthetic
checkpoints load through the same path as the real weights.

- src/ltx2.hpp: DiT (patchify/proj_out/adaln/connector + 48 blocks),
  gated attention, FFN, modulation tables; Ltx2Runner with shape inference.
- diffusion_model.hpp: Ltx2Model adapter (M1 is load-only).
- stable-diffusion.cpp: LTX-2 branch, null-conditioner guards (Gemma is M2),
  FakeVAE placeholder, FLOW_PRED denoiser, graceful generate_video stop.
- script/make_synthetic_ltx2_gguf.py: tiny synthetic DiT GGUF generator.
- script/ci_ltx2_load_smoke.sh: load-on-CPU smoke test (no large download).
- script/convert_ltx2_to_gguf.py: add --self-test filter validation.
- .github/workflows/ltx2.yml: Linux x86-64 build + load smoke.
- docs/ltx2.md + README links.

Verified locally: synthetic GGUF detected as LTX-2, geometry inferred
(num_layers/dim/heads/connector), all tensors bound on CPU, clean exit.
---
 .github/workflows/ltx2.yml         |  68 ++++++++
 README.md                          |   2 +
 docs/ltx2.md                       |  88 ++++++++++
 script/ci_ltx2_load_smoke.sh       |  57 +++++++
 script/convert_ltx2_to_gguf.py     |  51 +++++-
 script/make_synthetic_ltx2_gguf.py |  99 ++++++++++++
 src/diffusion_model.hpp            |  62 +++++++
 src/ltx2.hpp                       | 250 +++++++++++++++++++++++++++++
 src/stable-diffusion.cpp           |  33 +++-
 9 files changed, 704 insertions(+), 6 deletions(-)
 create mode 100644 .github/workflows/ltx2.yml
 create mode 100644 docs/ltx2.md
 create mode 100755 script/ci_ltx2_load_smoke.sh
 create mode 100644 script/make_synthetic_ltx2_gguf.py
 create mode 100644 src/ltx2.hpp

diff --git a/.github/workflows/ltx2.yml b/.github/workflows/ltx2.yml
new file mode 100644
index 000000000..1e0e5d5ac
--- /dev/null
+++ b/.github/workflows/ltx2.yml
@@ -0,0 +1,68 @@
+name: LTX-2 CI
+
+# M1 deliverable: buildable project + "model loads on CPU" verified on
+# Linux x86-64 via a tiny synthetic GGUF (no large weight download).
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - master
+      - ci
+    paths:
+      - ".github/workflows/ltx2.yml"
+      - "src/ltx2.hpp"
+      - "src/diffusion_model.hpp"
+      - "src/model.*"
+      - "src/stable-diffusion.cpp"
+      - "script/convert_ltx2_to_gguf.py"
+      - "script/make_synthetic_ltx2_gguf.py"
+      - "script/ci_ltx2_load_smoke.sh"
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths:
+      - ".github/workflows/ltx2.yml"
+      - "src/ltx2.hpp"
+      - "src/diffusion_model.hpp"
+      - "src/model.*"
+      - "src/stable-diffusion.cpp"
+      - "script/convert_ltx2_to_gguf.py"
+      - "script/make_synthetic_ltx2_gguf.py"
+      - "script/ci_ltx2_load_smoke.sh"
+
+concurrency:
+  group: ltx2-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  linux-x64-load-smoke:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential cmake
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install conversion deps
+        run: pip install numpy gguf
+
+      - name: Validate conversion tooling (filter self-test)
+        run: python script/convert_ltx2_to_gguf.py --self-test
+
+      - name: Build sd-cli
+        run: |
+          cmake -B build -DCMAKE_BUILD_TYPE=Release
+          cmake --build build -j"$(nproc)" --target sd-cli
+
+      - name: LTX-2 load-on-CPU smoke test
+        run: bash script/ci_ltx2_load_smoke.sh
diff --git a/README.md b/README.md
index b5bb49751..134c05750 100644
--- a/README.md
+++ b/README.md
@@ -59,6 +59,7 @@ API and command-line option may change frequently.***
     - [Qwen Image Edit series](./docs/qwen_image_edit.md)
   - Video Models
     - [Wan2.1/Wan2.2](./docs/wan.md)
+    - [LTX-2 (T2V/I2V)](./docs/ltx2.md) (work in progress)
   - [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support.
   - Control Net support with SD 1.5
   - LoRA support, same as [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora)
@@ -138,6 +139,7 @@ If you want to improve performance or reduce VRAM/RAM usage, please refer to [pe
 - [🔥Qwen Image](./docs/qwen_image.md)
 - [🔥Qwen Image Edit series](./docs/qwen_image_edit.md)
 - [🔥Wan2.1/Wan2.2](./docs/wan.md)
+- [LTX-2 (T2V/I2V)](./docs/ltx2.md) (work in progress)
 - [🔥Z-Image](./docs/z_image.md)
 - [Ovis-Image](./docs/ovis_image.md)
 - [Anima](./docs/anima.md)
diff --git a/docs/ltx2.md b/docs/ltx2.md
new file mode 100644
index 000000000..152ec98fb
--- /dev/null
+++ b/docs/ltx2.md
@@ -0,0 +1,88 @@
+# LTX-2 (LTX-2.3) video generation
+
+Support for [Lightricks LTX-2](https://huggingface.co/Lightricks) text-to-video
+(T2V) and image-to-video (I2V) generation. Only the **video** stream is in
+scope; the audio stream (audio DiT, audio VAE, vocoder) is intentionally not
+converted or loaded.
+
+> Status: **work in progress.** Milestone M1 (model conversion + scaffolding +
+> "model loads on CPU") is implemented. End-to-end T2V/I2V inference, the
+> Gemma-3 text encoder and the CausalVideoAutoencoder land in later milestones.
+
+## What works today (M1)
+
+- Safetensors -> GGUF conversion tooling for the video-only DiT (+ VAE), at
+  `f16`, `q8_0`, `q5_1`, `q4_0`.
+- LTX-2 architecture auto-detection in the model loader.
+- The 14B video DiT (`AVTransformer3DModel`, video half) loads and binds all of
+  its parameters on CPU, with geometry inferred from the checkpoint shapes.
+- A CI smoke test that verifies the load path on Linux x86-64 without any large
+  download.
+
+## Model conversion
+
+The conversion script reads an LTX-2.3 `.safetensors` checkpoint, drops every
+audio/vocoder tensor, and writes a video-only GGUF.
+
+```bash
+pip install -r script/requirements-ltx2.txt
+
+# Inspect the keep/drop plan without writing anything (stdlib only):
+python script/convert_ltx2_to_gguf.py --src ltx-2.3-22b-dev.safetensors --dry-run
+
+# Convert to F16 (M1 deliverable):
+python script/convert_ltx2_to_gguf.py \
+    --src ltx-2.3-22b-dev.safetensors \
+    --dst ltx-2.3-22b-video-f16.gguf --type f16
+
+# Quantized variants:
+python script/convert_ltx2_to_gguf.py --src ... --dst ...-q8_0.gguf --type q8_0
+python script/convert_ltx2_to_gguf.py --src ... --dst ...-q4_0.gguf --type q4_0
+```
+
+Quantization notes:
+
+- `f16` keeps every tensor in half precision (highest quality, largest file).
+- `q8_0` / `q5_1` / `q4_0` quantize only the large 2D DiT linear weights; norms,
+  biases, modulation tables and the VAE stay in higher precision.
+
+## Building
+
+Follow the standard [build guide](./build.md). The CLI binary is `sd-cli`:
+
+```bash
+cmake -B build -DCMAKE_BUILD_TYPE=Release
+cmake --build build -j --target sd-cli
+```
+
+## Verifying "loads on CPU" without the full weights
+
+M1 ships a tiny synthetic checkpoint generator so the load path can be exercised
+in seconds. It emits the exact DiT tensor names at drastically reduced
+dimensions; the C++ side infers the geometry from the shapes, so this is the
+same code path used for the real weights.
+
+```bash
+# requires numpy + gguf (see script/requirements-ltx2.txt)
+bash script/ci_ltx2_load_smoke.sh            # uses build/bin/sd-cli unless a path is passed
+```
+
+Expected: the log reports `Version: LTX-2`, the inferred DiT geometry, and
+`loading tensors completed`; `sd-cli` then exits early with a non-zero status
+(generation is not available yet in M1), which the script tolerates.
+
+## CI
+
+`.github/workflows/ltx2.yml` runs on Linux x86-64:
+
+1. validates the conversion filter (`--self-test`),
+2. builds `sd-cli`,
+3. runs the synthetic load-on-CPU smoke test.
+
+## Scope
+
+In scope: video DiT, Video-VAE encoder+decoder, Gemma-3 text encoder, a noise
+scheduler + CFG, T2V and I2V, GGUF conversion, CLI, C API.
+
+Out of scope: the audio stream (audio DiT, Audio-VAE, vocoder), training /
+fine-tuning, the spatial upscaler, and video-to-video.
diff --git a/script/ci_ltx2_load_smoke.sh b/script/ci_ltx2_load_smoke.sh
new file mode 100755
index 000000000..0ddcc304a
--- /dev/null
+++ b/script/ci_ltx2_load_smoke.sh
@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+# M1 smoke test: build a tiny synthetic LTX-2 video-DiT GGUF and verify it is
+# detected and fully bound on CPU by sd-cli. This exercises the exact load path
+# used for the real 46 GB checkpoint without downloading any weights.
+#
+# Usage: [PYTHON=python3] script/ci_ltx2_load_smoke.sh [path-to-sd-cli]
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+SD_CLI="${1:-$ROOT/build/bin/sd-cli}"
+GGUF="$(mktemp -t ltx2_tiny.XXXXXX.gguf)"
+LOG="$(mktemp -t ltx2_smoke.XXXXXX.log)"
+PY="${PYTHON:-python3}"
+
+cleanup() { rm -f "$GGUF" "$LOG"; }  # removes both temp files
+trap cleanup EXIT  # runs on every exit path, including failures under `set -e`
+
+if [ ! -x "$SD_CLI" ]; then
+    echo "FAIL: sd-cli not found at $SD_CLI" >&2
+    exit 1
+fi
+
+echo "==> generating synthetic LTX-2 DiT GGUF"
+"$PY" "$ROOT/script/make_synthetic_ltx2_gguf.py" --out "$GGUF"
+
+echo "==> loading via sd-cli (generation is expected to stop: M1 is load-only)"
+# sd-cli returns non-zero because M1 has no text encoder yet; we assert on the
+# load markers in the log instead of the exit code (hence the `|| true` below).
+"$SD_CLI" -M vid_gen --diffusion-model "$GGUF" -p "smoke" \
+    --steps 1 --video-frames 1 -W 32 -H 32 -o /tmp/ltx2_smoke_out >"$LOG" 2>&1 || true
+
+cat "$LOG"  # replay the captured stdout+stderr so the CI log shows it
+
+echo "==> checking load markers"
+fail=0
+for marker in \
+    "Version: LTX-2" \
+    "LTX-2 DiT: num_layers=" \
+    "loading tensors completed" \
+    "total params memory size"; do
+    if ! grep -qF "$marker" "$LOG"; then  # -F: markers are fixed strings, not regexes
+        echo "FAIL: missing expected log marker: '$marker'" >&2
+        fail=1
+    fi
+done
+
+if grep -qiE "load tensors from model loader failed|get_sd_version failed" "$LOG"; then
+    echo "FAIL: tensor load reported failure" >&2
+    fail=1
+fi
+
+if [ "$fail" -ne 0 ]; then
+    echo "LTX-2 load smoke test FAILED" >&2
+    exit 1
+fi
+
+echo "LTX-2 load smoke test PASSED"
diff --git a/script/convert_ltx2_to_gguf.py b/script/convert_ltx2_to_gguf.py
index 2513e0ead..322565d14 100644
--- a/script/convert_ltx2_to_gguf.py
+++ b/script/convert_ltx2_to_gguf.py
@@ -288,16 +288,65 @@ def _bf16_to_f32(arr):
 # --------------------------------------------------------------------------
 
 
+def self_test() -> int:
+    """Run the keep/drop filter over built-in names; return 0 if all match, else 1."""
+    expected_kept = [
+        _DM + "patchify_proj.weight",
+        _DM + "proj_out.weight",
+        _DM + "scale_shift_table",
+        _DM + "adaln_single.linear.weight",
+        _DM + "video_embeddings_connector.learnable_registers",
+        _DM + "video_embeddings_connector.transformer_1d_blocks.0.attn1.to_q.weight",
+        _DM + "transformer_blocks.0.attn1.to_q.weight",
+        _DM + "transformer_blocks.0.attn2.to_k.weight",
+        _DM + "transformer_blocks.0.ff.net.0.proj.weight",
+        _DM + "transformer_blocks.0.scale_shift_table",
+        _DM + "transformer_blocks.0.prompt_scale_shift_table",
+        "vae.decoder.conv_in.weight",
+    ]
+    expected_dropped = [
+        "vocoder.conv_pre.weight",
+        "audio_vae.encoder.conv_in.weight",
+        _DM + "audio_patchify_proj.weight",
+        _DM + "audio_scale_shift_table",
+        _DM + "audio_embeddings_connector.transformer_1d_blocks.0.attn1.to_q.weight",
+        "text_embedding_projection.audio_aggregate_embed.weight",
+        _DM + "av_ca_scale_shift_table",
+        _DM + "transformer_blocks.0.audio_attn1.to_q.weight",
+        _DM + "transformer_blocks.0.audio_to_video_attn.to_q.weight",
+        _DM + "transformer_blocks.0.scale_shift_table_a2v_ca_audio",
+    ]
+    # Gather every misclassified name first (KEEP list, then DROP list), so the
+    # report order is the same as checking and printing one name at a time.
+    problems = [f"SELF-TEST FAIL: should KEEP but dropped: {name}"
+                for name in expected_kept if not is_video_tensor(name)]
+    problems += [f"SELF-TEST FAIL: should DROP but kept: {name}"
+                 for name in expected_dropped if is_video_tensor(name)]
+    for message in problems:
+        print(message)
+    passed = not problems
+    print("self-test PASSED" if passed else "self-test FAILED")
+    return 0 if passed else 1
+
+
 def main(argv: list[str]) -> int:
     ap = argparse.ArgumentParser(description="Convert LTX-2.3 safetensors -> GGUF (video only)")
-    ap.add_argument("--src", required=True, help="input .safetensors checkpoint")
+    ap.add_argument("--src", help="input .safetensors checkpoint")
     ap.add_argument("--dst", help="output .gguf path (required unless --dry-run)")
     ap.add_argument("--type", default="f16", choices=QUANT_TYPES, help="output tensor type")
     ap.add_argument("--no-vae", action="store_true", help="exclude VAE tensors from the GGUF")
     ap.add_argument("--dry-run", action="store_true",
                     help="only read the header and print the keep/drop plan")
+    ap.add_argument("--self-test", action="store_true",
+                    help="validate the keep/drop filter on built-in names (no checkpoint needed)")
     args = ap.parse_args(argv)
 
+    if args.self_test:
+        return self_test()
+
+    if not args.src:
+        ap.error("--src is required unless --self-test is given")
+
     header, _ = read_safetensors_header(args.src)
 
     if args.dry_run:
diff --git a/script/make_synthetic_ltx2_gguf.py b/script/make_synthetic_ltx2_gguf.py
new file mode 100644
index 000000000..530ce5d27
--- /dev/null
+++ b/script/make_synthetic_ltx2_gguf.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+"""Generate a tiny synthetic LTX-2 video-DiT GGUF for the load-on-CPU smoke test.
+
+This emits the exact tensor names of the LTX-2 video DiT block tree
+(src/ltx2.hpp) at drastically reduced dimensions, so CI can verify that the
+model loads and binds every tensor on CPU without the 46 GB real checkpoint.
+The C++ side infers geometry from these shapes, so the same code path that
+loads this file loads the real weights.
+
+    python script/make_synthetic_ltx2_gguf.py --out /tmp/ltx2_tiny.gguf
+"""
+
+from __future__ import annotations
+
+import argparse
+
+import numpy as np
+import gguf
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--out", required=True)
+    ap.add_argument("--layers", type=int, default=2)
+    ap.add_argument("--connector-layers", type=int, default=2)
+    ap.add_argument("--dim", type=int, default=64)
+    ap.add_argument("--heads", type=int, default=4)
+    ap.add_argument("--in-channels", type=int, default=8)
+    ap.add_argument("--ffn", type=int, default=128)
+    ap.add_argument("--registers", type=int, default=16)
+    ap.add_argument("--freq", type=int, default=16)
+    args = ap.parse_args()
+
+    P = "model.diffusion_model."  # tensor-name prefix shared by every emitted tensor
+    dim, inner, heads = args.dim, args.ffn, args.heads
+    inc, freq, regs = args.in_channels, args.freq, args.registers
+
+    w = gguf.GGUFWriter(args.out, "ltx2")
+    tensors: dict[str, tuple[int, ...]] = {}  # name -> numpy shape, written in insertion order
+
+    def lin(name, out_f, in_f):
+        # HF layout [out, in]; ggml reads ne0=in, ne1=out.
+        tensors[name + ".weight"] = (out_f, in_f)
+        tensors[name + ".bias"] = (out_f,)
+
+    def attn(prefix, q_dim, kv_dim):  # q/k/v/out projections, q/k norm weights, per-head gate
+        lin(prefix + ".to_q", q_dim, q_dim)
+        lin(prefix + ".to_k", q_dim, kv_dim)
+        lin(prefix + ".to_v", q_dim, kv_dim)
+        lin(prefix + ".to_out.0", q_dim, q_dim)
+        tensors[prefix + ".q_norm.weight"] = (q_dim,)
+        tensors[prefix + ".k_norm.weight"] = (q_dim,)
+        lin(prefix + ".to_gate_logits", heads, q_dim)
+
+    def ff(prefix):  # two linears: dim -> inner (net.0.proj) and inner -> dim (net.2)
+        lin(prefix + ".net.0.proj", inner, dim)
+        lin(prefix + ".net.2", dim, inner)
+
+    # top-level: in/out projections, global modulation table, two AdaLN-single heads
+    lin(P + "patchify_proj", dim, inc)
+    lin(P + "proj_out", inc, dim)
+    tensors[P + "scale_shift_table"] = (2, dim)
+    lin(P + "adaln_single.emb.timestep_embedder.linear_1", dim, freq)
+    lin(P + "adaln_single.emb.timestep_embedder.linear_2", dim, dim)
+    lin(P + "adaln_single.linear", 9 * dim, dim)
+    lin(P + "prompt_adaln_single.emb.timestep_embedder.linear_1", dim, freq)
+    lin(P + "prompt_adaln_single.emb.timestep_embedder.linear_2", dim, dim)
+    lin(P + "prompt_adaln_single.linear", 2 * dim, dim)
+
+    # connector: learnable registers plus --connector-layers self-attn + FFN blocks
+    C = P + "video_embeddings_connector."
+    tensors[C + "learnable_registers"] = (regs, dim)
+    for i in range(args.connector_layers):
+        b = C + f"transformer_1d_blocks.{i}"
+        attn(b + ".attn1", dim, dim)
+        ff(b + ".ff")
+
+    # DiT blocks: attn1, attn2, FFN and two modulation tables per layer
+    for i in range(args.layers):
+        b = P + f"transformer_blocks.{i}"
+        attn(b + ".attn1", dim, dim)
+        attn(b + ".attn2", dim, dim)
+        ff(b + ".ff")
+        tensors[b + ".scale_shift_table"] = (9, dim)
+        tensors[b + ".prompt_scale_shift_table"] = (2, dim)
+
+    for name, shape in tensors.items():
+        w.add_tensor(name, np.zeros(shape, dtype=np.float32))  # all-zero float32 payload
+
+    w.write_header_to_file()
+    w.write_kv_data_to_file()
+    w.write_tensors_to_file()
+    w.close()
+    print(f"wrote {len(tensors)} tensors to {args.out}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/src/diffusion_model.hpp b/src/diffusion_model.hpp
index 329bb9d9a..b1c6a6184 100644
--- a/src/diffusion_model.hpp
+++ b/src/diffusion_model.hpp
@@ -3,6 +3,7 @@
 
 #include "anima.hpp"
 #include "flux.hpp"
+#include "ltx2.hpp"
 #include "mmdit.hpp"
 #include "qwen_image.hpp"
 #include "unet.hpp"
@@ -379,6 +380,67 @@ struct WanModel : public DiffusionModel {
     }
 };
 
+struct Ltx2Model : public DiffusionModel {
+    std::string prefix;  // tensor-name prefix handed to the runner and to get_param_tensors
+    LTX2::Ltx2Runner ltx2;  // every override below except get_adm_in_channels/compute forwards here
+
+    Ltx2Model(ggml_backend_t backend,
+              bool offload_params_to_cpu,
+              const String2TensorStorage& tensor_storage_map = {},
+              const std::string prefix                       = "model.diffusion_model")
+        : prefix(prefix), ltx2(backend, offload_params_to_cpu, tensor_storage_map, prefix) {
+    }
+
+    std::string get_desc() override {
+        return ltx2.get_desc();
+    }
+
+    void alloc_params_buffer() override {
+        ltx2.alloc_params_buffer();
+    }
+
+    void free_params_buffer() override {
+        ltx2.free_params_buffer();
+    }
+
+    void free_compute_buffer() override {
+        ltx2.free_compute_buffer();
+    }
+
+    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
+        ltx2.get_param_tensors(tensors, prefix);  // uses the prefix stored at construction
+    }
+
+    size_t get_params_buffer_size() override {
+        return ltx2.get_params_buffer_size();
+    }
+
+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        ltx2.set_weight_adapter(adapter);
+    }
+
+    int64_t get_adm_in_channels() override {
+        return 0;
+    }
+
+    void set_flash_attention_enabled(bool enabled) override {
+        ltx2.set_flash_attention_enabled(enabled);
+    }
+
+    void set_circular_axes(bool circular_x, bool circular_y) override {
+        ltx2.set_circular_axes(circular_x, circular_y);
+    }
+
+    // M1: load-only, so this always logs an error and returns false; the forward is implemented in M2.
+    bool compute(int n_threads,
+                 DiffusionParams diffusion_params,
+                 struct ggml_tensor** output     = nullptr,
+                 struct ggml_context* output_ctx = nullptr) override {
+        LOG_ERROR("LTX-2 diffusion forward is not implemented yet (M1 is load-only)");
+        return false;  // all four arguments are ignored
+    }
+};
+
 struct QwenImageModel : public DiffusionModel {
     std::string prefix;
     Qwen::QwenImageRunner qwen_image;
diff --git a/src/ltx2.hpp b/src/ltx2.hpp
new file mode 100644
index 000000000..efcee6021
--- /dev/null
+++ b/src/ltx2.hpp
@@ -0,0 +1,250 @@
+#ifndef __LTX2_HPP__
+#define __LTX2_HPP__
+
+#include "common_block.hpp"
+
+// LTX-2.3 video-stream DiT (AVTransformer3DModel, video half only).
+//
+// This is the M1 scaffolding: the GGMLBlock tree declares exactly the video
+// tensors produced by script/convert_ltx2_to_gguf.py so the model can be
+// loaded and its params allocated on CPU. Forward passes (denoising) are added
+// in M2; the blocks here intentionally declare parameters only.
+//
+// Confirmed architecture (Lightricks/LTX-2.3, model_version 2.3.0):
+//   num_layers=48, num_heads=32, head_dim=128 (dim 4096), in_channels=128,
+//   caption_channels=3840, cross_attention_dim=4096, qk_norm=rms_norm,
+//   gated attention (to_gate_logits), FFN 4096->16384 (gelu-approx),
+//   8-layer embeddings connector with 128 learnable registers.
+namespace LTX2 {
+
+    struct Ltx2Params {
+        int num_layers           = 48;  // number of transformer_blocks.<i>
+        int num_heads            = 32;  // output width of each to_gate_logits
+        int head_dim             = 128;
+        int dim                  = 4096;  // num_heads * head_dim
+        int in_channels          = 128;  // patchify_proj input / proj_out output width
+        int cross_attention_dim  = 4096;  // input width of attn2.to_k / attn2.to_v
+        int ffn_dim              = 16384;  // FeedForward inner width
+        int connector_num_layers = 8;  // number of transformer_1d_blocks.<i> in the connector
+        int connector_registers  = 128;  // rows of the connector's learnable_registers
+        int timestep_freq_dim    = 256;  // input width of timestep_embedder.linear_1
+        float eps                = 1e-6f;  // passed to the q_norm / k_norm RMSNorm blocks
+    };
+
+    // q/k/v + gated output projection with rms qk-norm; matches the LTX-2
+    // `attn1`/`attn2` layout (to_q, to_k, to_v, to_out.0, q_norm, k_norm,
+    // to_gate_logits). Only sub-blocks are declared; no forward pass is defined.
+    class GatedAttention : public GGMLBlock {
+    public:
+        GatedAttention(int dim, int ctx_dim, int num_heads, float eps) {
+            blocks["to_q"]           = std::shared_ptr<GGMLBlock>(new Linear(dim, dim));
+            blocks["to_k"]           = std::shared_ptr<GGMLBlock>(new Linear(ctx_dim, dim));  // ctx_dim -> dim
+            blocks["to_v"]           = std::shared_ptr<GGMLBlock>(new Linear(ctx_dim, dim));  // ctx_dim -> dim
+            blocks["to_out.0"]       = std::shared_ptr<GGMLBlock>(new Linear(dim, dim));
+            blocks["q_norm"]         = std::shared_ptr<GGMLBlock>(new RMSNorm(dim, eps));
+            blocks["k_norm"]         = std::shared_ptr<GGMLBlock>(new RMSNorm(dim, eps));
+            blocks["to_gate_logits"] = std::shared_ptr<GGMLBlock>(new Linear(dim, num_heads));  // dim -> num_heads
+        }
+    };
+
+    // gelu-approximate FFN: net.0.proj (dim->inner), net.2 (inner->dim). Parameters only.
+    class FeedForward : public GGMLBlock {
+    public:
+        FeedForward(int dim, int inner) {
+            blocks["net.0.proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim, inner));
+            blocks["net.2"]      = std::shared_ptr<GGMLBlock>(new Linear(inner, dim));
+        }
+    };
+
+    // adaln_single / prompt_adaln_single: a timestep embedder MLP (freq_dim->dim,
+    // dim->dim) plus a final dim->out_dim projection producing the modulation table.
+    class AdaLnSingle : public GGMLBlock {
+    public:
+        AdaLnSingle(int freq_dim, int dim, int out_dim) {
+            blocks["emb.timestep_embedder.linear_1"] = std::shared_ptr<GGMLBlock>(new Linear(freq_dim, dim));
+            blocks["emb.timestep_embedder.linear_2"] = std::shared_ptr<GGMLBlock>(new Linear(dim, dim));
+            blocks["linear"]                         = std::shared_ptr<GGMLBlock>(new Linear(dim, out_dim));
+        }
+    };
+
+    // One video DiT block: self-attn (attn1), text cross-attn (attn2), FFN, and
+    // two raw F32 modulation tables declared as direct params (not sub-blocks).
+    class Ltx2TransformerBlock : public GGMLBlock {
+    protected:
+        int dim;  // ne0 of both modulation tables
+
+        void init_params(struct ggml_context* ctx,
+                         const String2TensorStorage& tensor_storage_map = {},
+                         const std::string prefix                       = "") override {
+            params["scale_shift_table"]        = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, dim, 9);  // ne0=dim, ne1=9
+            params["prompt_scale_shift_table"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, dim, 2);  // ne0=dim, ne1=2
+        }
+
+    public:
+        Ltx2TransformerBlock(const Ltx2Params& p)
+            : dim(p.dim) {
+            blocks["attn1"] = std::shared_ptr<GGMLBlock>(new GatedAttention(p.dim, p.dim, p.num_heads, p.eps));
+            blocks["attn2"] = std::shared_ptr<GGMLBlock>(new GatedAttention(p.dim, p.cross_attention_dim, p.num_heads, p.eps));  // k/v from cross_attention_dim
+            blocks["ff"]    = std::shared_ptr<GGMLBlock>(new FeedForward(p.dim, p.ffn_dim));
+        }
+    };
+
+    // One connector block: self-attn + FFN (no modulation tables); reuses the DiT's dim/heads/ffn_dim.
+    class Ltx2ConnectorBlock : public GGMLBlock {
+    public:
+        Ltx2ConnectorBlock(const Ltx2Params& p) {
+            blocks["attn1"] = std::shared_ptr<GGMLBlock>(new GatedAttention(p.dim, p.dim, p.num_heads, p.eps));
+            blocks["ff"]    = std::shared_ptr<GGMLBlock>(new FeedForward(p.dim, p.ffn_dim));
+        }
+    };
+
+    // video_embeddings_connector: learnable registers + N 1d transformer blocks (N = connector_num_layers).
+    class Ltx2Connector : public GGMLBlock {
+    protected:
+        int dim;            // ne0 of learnable_registers
+        int num_registers;  // ne1 of learnable_registers
+
+        void init_params(struct ggml_context* ctx,
+                         const String2TensorStorage& tensor_storage_map = {},
+                         const std::string prefix                       = "") override {
+            params["learnable_registers"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, dim, num_registers);
+        }
+
+    public:
+        Ltx2Connector(const Ltx2Params& p)
+            : dim(p.dim), num_registers(p.connector_registers) {
+            for (int i = 0; i < p.connector_num_layers; i++) {
+                blocks["transformer_1d_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new Ltx2ConnectorBlock(p));
+            }
+        }
+    };
+
+    // Top-level video DiT: in/out projections, two AdaLN-single heads, the connector and num_layers blocks.
+    class Ltx2 : public GGMLBlock {
+    protected:
+        int dim;  // ne0 of the global scale_shift_table
+
+        void init_params(struct ggml_context* ctx,
+                         const String2TensorStorage& tensor_storage_map = {},
+                         const std::string prefix                       = "") override {
+            params["scale_shift_table"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, dim, 2);  // ne0=dim, ne1=2
+        }
+
+    public:
+        Ltx2(const Ltx2Params& p)
+            : dim(p.dim) {
+            blocks["patchify_proj"]              = std::shared_ptr<GGMLBlock>(new Linear(p.in_channels, p.dim));
+            blocks["proj_out"]                   = std::shared_ptr<GGMLBlock>(new Linear(p.dim, p.in_channels));
+            blocks["adaln_single"]               = std::shared_ptr<GGMLBlock>(new AdaLnSingle(p.timestep_freq_dim, p.dim, 9 * p.dim));  // 9 * dim matches the per-block (dim x 9) table
+            blocks["prompt_adaln_single"]        = std::shared_ptr<GGMLBlock>(new AdaLnSingle(p.timestep_freq_dim, p.dim, 2 * p.dim));  // 2 * dim matches the per-block (dim x 2) prompt table
+            blocks["video_embeddings_connector"] = std::shared_ptr<GGMLBlock>(new Ltx2Connector(p));
+            for (int i = 0; i < p.num_layers; i++) {
+                blocks["transformer_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new Ltx2TransformerBlock(p));
+            }
+        }
+    };
+
+    struct Ltx2Runner : public GGMLRunner {
+        std::string desc = "ltx2_dit";
+        Ltx2Params params;
+        Ltx2 dit;
+
+        Ltx2Runner(ggml_backend_t backend,
+                   bool offload_params_to_cpu,
+                   const String2TensorStorage& tensor_storage_map = {},
+                   const std::string prefix                       = "")
+            : GGMLRunner(backend, offload_params_to_cpu),
+              params(infer_params(tensor_storage_map, prefix)),  // `params` is declared before `dit`, so it is initialised first
+              dit(params) {
+            LOG_INFO("LTX-2 DiT: num_layers=%d dim=%d heads=%d in_ch=%d connector_layers=%d",
+                     params.num_layers, params.dim, params.num_heads, params.in_channels, params.connector_num_layers);
+            dit.init(params_ctx, tensor_storage_map, prefix);  // the CI smoke test greps for the log line above
+        }
+
+        // Infer the model geometry from tensor names/shapes under `prefix` so reduced
+        // synthetic checkpoints load like the full LTX-2.3 weights. Any field whose
+        // probe tensor is absent keeps its Ltx2Params default; head_dim is derived.
+        static Ltx2Params infer_params(const String2TensorStorage& tsm, const std::string& prefix) {
+            Ltx2Params p;
+            std::string base = prefix.empty() ? "" : prefix + ".";
+
+            int max_block     = -1;  // highest transformer_blocks.<i> index seen
+            int max_connector = -1;  // highest connector transformer_1d_blocks.<i> index seen
+            for (auto& pair : tsm) {
+                const std::string& n = pair.first;
+                if (n.compare(0, base.size(), base) != 0) {  // name does not start with `base`
+                    continue;
+                }
+                std::string rel = n.substr(base.size());
+                max_block       = std::max(max_block, parse_index(rel, "transformer_blocks."));
+                max_connector   = std::max(max_connector, parse_index(rel, "video_embeddings_connector.transformer_1d_blocks."));
+            }
+            if (max_block >= 0) {
+                p.num_layers = max_block + 1;
+            }
+            if (max_connector >= 0) {
+                p.connector_num_layers = max_connector + 1;
+            }
+
+            // dim / in_channels from patchify_proj.weight [ne0=in, ne1=out]
+            const TensorStorage* patch = find(tsm, base + "patchify_proj.weight");
+            if (patch != nullptr && patch->n_dims >= 2) {
+                p.in_channels = (int)patch->ne[0];
+                p.dim         = (int)patch->ne[1];
+            }
+            // ffn_dim from block 0's ff.net.0.proj.weight [ne1=inner]
+            const TensorStorage* ff = find(tsm, base + "transformer_blocks.0.ff.net.0.proj.weight");
+            if (ff != nullptr && ff->n_dims >= 2) {
+                p.ffn_dim = (int)ff->ne[1];
+            }
+            // cross_attention_dim from attn2.to_k.weight [ne0=ctx_dim]
+            const TensorStorage* xk = find(tsm, base + "transformer_blocks.0.attn2.to_k.weight");
+            if (xk != nullptr && xk->n_dims >= 1) {
+                p.cross_attention_dim = (int)xk->ne[0];
+            }
+            // num_heads from to_gate_logits.weight [ne1=num_heads]
+            const TensorStorage* gate = find(tsm, base + "transformer_blocks.0.attn1.to_gate_logits.weight");
+            if (gate != nullptr && gate->n_dims >= 2) {
+                p.num_heads = (int)gate->ne[1];
+            }
+            if (p.num_heads > 0) {  // guards the division below
+                p.head_dim = p.dim / p.num_heads;
+            }
+            // registers from connector learnable_registers [ne1=num_registers]
+            const TensorStorage* reg = find(tsm, base + "video_embeddings_connector.learnable_registers");
+            if (reg != nullptr && reg->n_dims >= 2) {
+                p.connector_registers = (int)reg->ne[1];
+            }
+            // timestep_freq_dim from adaln_single's timestep_embedder.linear_1.weight [ne0=in]
+            const TensorStorage* te = find(tsm, base + "adaln_single.emb.timestep_embedder.linear_1.weight");
+            if (te != nullptr && te->n_dims >= 1) {
+                p.timestep_freq_dim = (int)te->ne[0];
+            }
+            return p;
+        }
+
+        static int parse_index(const std::string& rel, const std::string& tag) {
+            // Returns the index after `tag` only if `tag` starts `rel` and a digit follows; else -1.
+            if (rel.compare(0, tag.size(), tag) != 0 || rel[tag.size()] < '0' || rel[tag.size()] > '9') {
+                return -1;
+            }
+            return atoi(rel.c_str() + tag.size());
+        }
+
+        static const TensorStorage* find(const String2TensorStorage& tsm, const std::string& name) {
+            const auto entry = tsm.find(name);
+            return entry != tsm.end() ? &entry->second : nullptr;  // nullptr when `name` is not in the map
+        }
+
+        std::string get_desc() override {
+            return desc;  // "ltx2_dit" unless the member has been reassigned
+        }
+
+        void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
+            dit.get_param_tensors(tensors, prefix);  // forwards to the block tree; presumably fills `tensors` with prefix-qualified names (confirm in GGMLBlock)
+        }
+    };
+
+}  // namespace LTX2
+
+#endif  // __LTX2_HPP__
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index c36e94def..bdaaff1b6 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -524,6 +524,13 @@ class StableDiffusionGGML {
                     clip_vision->alloc_params_buffer();
                     clip_vision->get_param_tensors(tensors);
                 }
+            } else if (sd_version_is_ltx2(version)) {
+                // M1: load the video DiT only. The Gemma-3-12B text encoder and
+                // the CausalVideoAutoencoder are wired in M2.
+                diffusion_model = std::make_shared<Ltx2Model>(backend,
+                                                              offload_params_to_cpu,
+                                                              tensor_storage_map,
+                                                              "model.diffusion_model");
             } else if (sd_version_is_qwen_image(version)) {
                 bool enable_vision = false;
                 if (!vae_decode_only) {
@@ -588,8 +595,10 @@ class StableDiffusionGGML {
                 }
             }
 
-            cond_stage_model->alloc_params_buffer();
-            cond_stage_model->get_param_tensors(tensors);
+            if (cond_stage_model) {
+                cond_stage_model->alloc_params_buffer();
+                cond_stage_model->get_param_tensors(tensors);
+            }
 
             diffusion_model->alloc_params_buffer();
             diffusion_model->get_param_tensors(tensors);
@@ -623,6 +632,11 @@ class StableDiffusionGGML {
                 } else if (version == VERSION_CHROMA_RADIANCE) {
                     first_stage_model = std::make_shared<FakeVAE>(vae_backend,
                                                                   offload_params_to_cpu);
+                } else if (sd_version_is_ltx2(version)) {
+                    // M1: placeholder so a DiT-only checkpoint loads on CPU.
+                    // The real CausalVideoAutoencoder is added in M2.
+                    first_stage_model = std::make_shared<FakeVAE>(vae_backend,
+                                                                  offload_params_to_cpu);
                 } else {
                     first_stage_model = std::make_shared<AutoEncoderKL>(vae_backend,
                                                                         offload_params_to_cpu,
@@ -736,7 +750,9 @@ class StableDiffusionGGML {
 
             if (sd_ctx_params->flash_attn) {
                 LOG_INFO("Using flash attention");
-                cond_stage_model->set_flash_attention_enabled(true);
+                if (cond_stage_model) {
+                    cond_stage_model->set_flash_attention_enabled(true);
+                }
                 if (clip_vision) {
                     clip_vision->set_flash_attention_enabled(true);
                 }
@@ -816,7 +832,7 @@ class StableDiffusionGGML {
         LOG_DEBUG("finished loaded file");
 
         {
-            size_t clip_params_mem_size = cond_stage_model->get_params_buffer_size();
+            size_t clip_params_mem_size = cond_stage_model ? cond_stage_model->get_params_buffer_size() : 0;
             size_t unet_params_mem_size = diffusion_model->get_params_buffer_size();
             if (high_noise_diffusion_model) {
                 unet_params_mem_size += high_noise_diffusion_model->get_params_buffer_size();
@@ -915,7 +931,8 @@ class StableDiffusionGGML {
                            sd_version_is_wan(version) ||
                            sd_version_is_qwen_image(version) ||
                            sd_version_is_anima(version) ||
-                           sd_version_is_z_image(version)) {
+                           sd_version_is_z_image(version) ||
+                           sd_version_is_ltx2(version)) {
                     pred_type = FLOW_PRED;
                     if (sd_version_is_wan(version)) {
                         default_flow_shift = 5.f;
@@ -3856,6 +3873,12 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
     if (sd_ctx == nullptr || sd_vid_gen_params == nullptr) {
         return nullptr;
     }
+    if (sd_ctx->sd->cond_stage_model == nullptr) {
+        // M1 LTX-2: the DiT loads, but the Gemma text encoder and video VAE
+        // needed for generation arrive in M2.
+        LOG_ERROR("video generation is not available: no text encoder loaded (LTX-2 inference lands in M2)");
+        return nullptr;
+    }
     sd_ctx->sd->vae_tiling_params = sd_vid_gen_params->vae_tiling_params;
 
     std::string prompt          = SAFE_STR(sd_vid_gen_params->prompt);

From fa62eeb8a88076de56282e0c80aa4aa2503884c9 Mon Sep 17 00:00:00 2001
From: Vib-UX <btcvibhav@gmail.com>
Date: Wed, 3 Jun 2026 06:30:02 +0530
Subject: [PATCH 3/3] =?UTF-8?q?feat(ltx2):=20M2=20core=20CPU=20inference?=
 =?UTF-8?q?=20=E2=80=94=20end-to-end=20T2V=20+=20I2V?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implement the full LTX-2.3 video generation pipeline on CPU so the CLI
produces a video from a text prompt (T2V) and from a prompt + image (I2V),
targeting the distilled checkpoint (8 steps, CFG=1).

- Latent/scheduler plumbing (stable-diffusion.cpp, denoiser.hpp,
  stable-diffusion.h): LTX geometry (32x spatial, 128 latent channels,
  (F-1)%8 frame alignment, x8 temporal decode), identity DiT-side latent
  transform, and a new LinearQuadraticScheduler wired into scheduler_t and
  the --scheduler linear_quadratic CLI flag.
- DiT forward (ltx2.hpp, diffusion_model.hpp): patchify_proj, 3D RoPE,
  AdaLN-single modulation, GatedAttention (RMS qk-norm + per-head gating),
  text cross-attention, gelu FFN, video-embeddings connector with learnable
  registers, and proj_out; Ltx2Model::compute wired.
- Gemma-3 text encoder (gemma3.hpp, conditioner.hpp): native ggml encoder
  (GeGLU MLP, (1+w) RMSNorm, q/k norm, sliding-window vs global attention,
  scaled embeddings) plus the LTX multi-layer feature extractor/projection
  in Ltx2Conditioner, loaded via the standard --llm path.
- Video-VAE (vae.hpp): Ltx2VAERunner shape-correct CausalVideoAutoencoder
  placeholder (decode + encode), replacing FakeVAE for LTX2.
- Pipeline (stable-diffusion.cpp): T2V + I2V branches in generate_video()
  with init-frame conditioning via the denoise mask; CLI AVI/raw output.
- CI/docs: synthetic GGUF maker emits DiT + Gemma + projection tensors;
  ci_ltx2_load_smoke.sh upgraded from load-only to a full end-to-end
  generate check; ltx2.yml runs the generate smoke on Linux x86-64 (AVX)
  and macOS ARM64 (NEON); docs/ltx2.md documents T2V/I2V usage, memory
  staging, and a validation-status section.

Numerical parity (PSNR/SSIM) and the learned Video-VAE weights/ops remain
M3 work; the VAE is currently a geometric placeholder.
---
 .github/workflows/ltx2.yml         |  41 +++-
 docs/ltx2.md                       | 101 ++++++--
 include/stable-diffusion.h         |   1 +
 script/ci_ltx2_load_smoke.sh       |  49 ++--
 script/make_synthetic_ltx2_gguf.py |  86 +++++--
 src/conditioner.hpp                | 139 +++++++++++
 src/denoiser.hpp                   |  54 +++++
 src/diffusion_model.hpp            |   9 +-
 src/gemma3.hpp                     | 363 +++++++++++++++++++++++++++++
 src/ltx2.hpp                       | 317 +++++++++++++++++++++++--
 src/stable-diffusion.cpp           |  63 ++++-
 src/vae.hpp                        |  96 ++++++++
 12 files changed, 1233 insertions(+), 86 deletions(-)
 create mode 100644 src/gemma3.hpp

diff --git a/.github/workflows/ltx2.yml b/.github/workflows/ltx2.yml
index 1e0e5d5ac..fb3ee5488 100644
--- a/.github/workflows/ltx2.yml
+++ b/.github/workflows/ltx2.yml
@@ -1,7 +1,10 @@
 name: LTX-2 CI
 
-# M1 deliverable: buildable project + "model loads on CPU" verified on
-# Linux x86-64 via a tiny synthetic GGUF (no large weight download).
+# M2 deliverable: end-to-end T2V/I2V CPU inference verified on both
+# Linux x86-64 (AVX) and macOS ARM64 (NEON) via a tiny synthetic GGUF stack
+# (video DiT + Gemma-3 encoder + text projection), with no large weight
+# download. The synthetic generate exercises the exact pipeline used for the
+# real ~46 GB checkpoint.
 
 on:
   workflow_dispatch:
@@ -12,9 +15,14 @@ on:
     paths:
       - ".github/workflows/ltx2.yml"
       - "src/ltx2.hpp"
+      - "src/gemma3.hpp"
+      - "src/conditioner.hpp"
+      - "src/denoiser.hpp"
+      - "src/vae.hpp"
       - "src/diffusion_model.hpp"
       - "src/model.*"
       - "src/stable-diffusion.cpp"
+      - "include/stable-diffusion.h"
       - "script/convert_ltx2_to_gguf.py"
       - "script/make_synthetic_ltx2_gguf.py"
       - "script/ci_ltx2_load_smoke.sh"
@@ -23,9 +31,14 @@ on:
     paths:
       - ".github/workflows/ltx2.yml"
       - "src/ltx2.hpp"
+      - "src/gemma3.hpp"
+      - "src/conditioner.hpp"
+      - "src/denoiser.hpp"
+      - "src/vae.hpp"
       - "src/diffusion_model.hpp"
       - "src/model.*"
       - "src/stable-diffusion.cpp"
+      - "include/stable-diffusion.h"
       - "script/convert_ltx2_to_gguf.py"
       - "script/make_synthetic_ltx2_gguf.py"
       - "script/ci_ltx2_load_smoke.sh"
@@ -35,19 +48,33 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  linux-x64-load-smoke:
-    runs-on: ubuntu-latest
+  generate-smoke:
+    name: ${{ matrix.name }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - name: linux-x64-avx-generate-smoke
+            runs-on: ubuntu-latest
+          - name: macos-arm64-neon-generate-smoke
+            runs-on: macos-14
+    runs-on: ${{ matrix.runs-on }}
     steps:
       - name: Clone
         uses: actions/checkout@v4
         with:
           submodules: recursive
 
-      - name: Dependencies
+      - name: Dependencies (Linux)
+        if: runner.os == 'Linux'
         run: |
           sudo apt-get update
           sudo apt-get install -y build-essential cmake
 
+      - name: Dependencies (macOS)
+        if: runner.os == 'macOS'
+        run: brew install cmake || true
+
       - name: Set up Python
         uses: actions/setup-python@v5
         with:
@@ -62,7 +89,7 @@ jobs:
       - name: Build sd-cli
         run: |
           cmake -B build -DCMAKE_BUILD_TYPE=Release
-          cmake --build build -j"$(nproc)" --target sd-cli
+          cmake --build build -j3 --target sd-cli
 
-      - name: LTX-2 load-on-CPU smoke test
+      - name: LTX-2 end-to-end generate smoke test
         run: bash script/ci_ltx2_load_smoke.sh
diff --git a/docs/ltx2.md b/docs/ltx2.md
index 152ec98fb..9e45fd131 100644
--- a/docs/ltx2.md
+++ b/docs/ltx2.md
@@ -5,19 +5,79 @@ Support for [Lightricks LTX-2](https://huggingface.co/Lightricks) text-to-video
 scope; the audio stream (audio DiT, audio VAE, vocoder) is intentionally not
 converted or loaded.
 
-> Status: **work in progress.** Milestone M1 (model conversion + scaffolding +
-> "model loads on CPU") is implemented. End-to-end T2V/I2V inference, the
-> Gemma-3 text encoder and the CausalVideoAutoencoder land in later milestones.
+> Status: **work in progress.** Milestone M1 (conversion + scaffolding + "loads
+> on CPU") and Milestone M2 (core CPU inference) are implemented: the full
+> T2V/I2V pipeline — Gemma-3 text encoder, video DiT denoising loop, the
+> LinearQuadratic scheduler and the Video-VAE — runs end-to-end on CPU and
+> writes a video file. See **Validation status** below for which pieces are
+> numerically validated vs. structurally implemented pending reference parity.
 
-## What works today (M1)
+## What works today (M1 + M2)
 
 - Safetensors -> GGUF conversion tooling for the video-only DiT (+ VAE), at
   `f16`, `q8_0`, `q5_1`, `q4_0`.
 - LTX-2 architecture auto-detection in the model loader.
-- The 14B video DiT (`AVTransformer3DModel`, video half) loads and binds all of
-  its parameters on CPU, with geometry inferred from the checkpoint shapes.
-- A CI smoke test that verifies the load path on Linux x86-64 without any large
-  download.
+- The 14B video DiT (`AVTransformer3DModel`, video half) loads on CPU and runs a
+  full forward pass: `patchify_proj`, 3D RoPE, AdaLN-single modulation, gated
+  self/cross attention with RMS qk-norm, the gelu FFN, the video-embeddings
+  connector and `proj_out`. Geometry is inferred from the checkpoint shapes.
+- A native **Gemma-3** text encoder (`src/gemma3.hpp`): GeGLU MLP, q/k RMSNorm,
+  `(1 + weight)` RMSNorm, scaled token embeddings, and sliding-window vs global
+  attention layers. The multi-layer feature extractor + projection feed the DiT
+  cross-attention.
+- A **LinearQuadratic** flow-matching scheduler (`--scheduler linear_quadratic`).
+- The LTX **CausalVideoAutoencoder** geometry (32x spatial / 8x temporal, 128
+  latent channels) and the T2V + I2V `generate_video` wiring, producing MJPEG
+  AVI / raw-frame output from the CLI.
+- A CI smoke test that runs a full synthetic end-to-end generate on both
+  Linux x86-64 (AVX) and macOS ARM64 (NEON) without any large download.
+
+## Usage
+
+LTX-2 needs the converted LTX checkpoint (DiT + VAE + text projection) plus a
+Gemma-3 GGUF for the text encoder:
+
+```bash
+# Text-to-video
+./build/bin/sd-cli -M vid_gen \
+    --diffusion-model ltx-2.3-video-q8_0.gguf \
+    --llm gemma-3-12b.gguf \
+    -p "a corgi running on the beach at sunset" \
+    --scheduler linear_quadratic --steps 8 \
+    -W 512 -H 768 --video-frames 49 -o out
+
+# Image-to-video (animate / bootstrap from a reference frame)
+./build/bin/sd-cli -M vid_gen \
+    --diffusion-model ltx-2.3-video-q8_0.gguf --llm gemma-3-12b.gguf \
+    -i first_frame.png -p "the camera slowly pans right" \
+    --scheduler linear_quadratic --steps 8 \
+    -W 512 -H 768 --video-frames 49 -o out
+```
+
+LTX-2 targets the **distilled** checkpoint first (8 steps, CFG=1), so no
+negative-prompt pass is needed. Frame counts are aligned to `(F-1) % 8 == 0` and
+spatial dimensions to multiples of 32. Memory is staged per component (encode
+text, then run the DiT, then decode) — use `--diffusion-fa` / quantized weights
+to fit consumer RAM.
+
+## Validation status (read before trusting output)
+
+M2's goal is end-to-end CPU inference that runs and produces video of the right
+shape; numerical parity (PSNR/SSIM) is an M3 acceptance criterion. The following
+pieces are implemented structurally and **must be validated against the
+Diffusers LTX-2 reference** before claiming quality parity:
+
+- the exact assignment of the 9 per-block + 2 prompt AdaLN modulation channels,
+  the 3D RoPE axis split and `theta`, and the non-affine norm placement
+  (`src/ltx2.hpp`);
+- the Gemma-3 RoPE theta / sliding-window pattern / query scaling, and a real
+  Gemma SentencePiece tokenizer (the current tokenizer is a byte-level
+  placeholder; `src/gemma3.hpp`, `Ltx2Conditioner`);
+- the LTX multi-layer feature-extractor aggregation and the
+  `text_embedding_projection.video_*` tensor layout (`Ltx2TextProjection`);
+- the **Video-VAE** is currently a shape-correct geometric placeholder
+  (`Ltx2VAERunner` in `src/vae.hpp`); the learned causal-conv encoder/decoder,
+  PixelNorm and per-channel statistics replace it next.
 
 ## Model conversion
 
@@ -55,29 +115,32 @@ cmake -B build -DCMAKE_BUILD_TYPE=Release
 cmake --build build -j --target sd-cli
 ```
 
-## Verifying "loads on CPU" without the full weights
+## Verifying the full pipeline without the real weights
 
-M1 ships a tiny synthetic checkpoint generator so the load path can be exercised
-in seconds. It emits the exact DiT tensor names at drastically reduced
-dimensions; the C++ side infers the geometry from the shapes, so this is the
-same code path used for the real weights.
+A tiny synthetic checkpoint generator exercises the entire pipeline in seconds.
+It emits the exact DiT + Gemma-3 + projection tensor names at drastically
+reduced dimensions (and a separate Gemma file), so the same code path that loads
+the real weights runs a full generate.
 
 ```bash
 # requires numpy + gguf (see script/requirements-ltx2.txt)
-bash script/ci_ltx2_load_smoke.sh            # uses build/bin/sd-cli
+python script/make_synthetic_ltx2_gguf.py --out /tmp/ltx2_tiny.gguf
+# writes /tmp/ltx2_tiny.gguf (DiT + projection) and /tmp/ltx2_tiny.gguf.gemma.gguf
+
+bash script/ci_ltx2_load_smoke.sh            # needs build/bin/sd-cli; runs the synthetic end-to-end generate
 ```
 
-Expected: the log reports `Version: LTX-2`, the inferred DiT geometry, and
-`loading tensors completed`, then exits early (generation is not available yet
-in M1).
+Expected: the log reports `Version: LTX-2`, the inferred Gemma-3 and DiT
+geometry, `get_sigmas with LinearQuadratic scheduler`, `sampling completed`,
+`decode_first_stage completed`, and writes an `.avi` video.
 
 ## CI
 
-`.github/workflows/ltx2.yml` runs on Linux x86-64:
+`.github/workflows/ltx2.yml` runs on Linux x86-64 (AVX) and macOS ARM64 (NEON):
 
 1. validates the conversion filter (`--self-test`),
 2. builds `sd-cli`,
-3. runs the synthetic load-on-CPU smoke test.
+3. runs the synthetic end-to-end generate smoke test.
 
 ## Scope
 
diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h
index 51b2b3291..ffe52e5ea 100644
--- a/include/stable-diffusion.h
+++ b/include/stable-diffusion.h
@@ -65,6 +65,7 @@ enum scheduler_t {
     KL_OPTIMAL_SCHEDULER,
     LCM_SCHEDULER,
     BONG_TANGENT_SCHEDULER,
+    LINEAR_QUADRATIC_SCHEDULER,
     SCHEDULER_COUNT
 };
 
diff --git a/script/ci_ltx2_load_smoke.sh b/script/ci_ltx2_load_smoke.sh
index 0ddcc304a..f1ba80bea 100755
--- a/script/ci_ltx2_load_smoke.sh
+++ b/script/ci_ltx2_load_smoke.sh
@@ -1,7 +1,10 @@
 #!/usr/bin/env bash
-# M1 smoke test: build a tiny synthetic LTX-2 video-DiT GGUF and verify it is
-# detected and fully bound on CPU by sd-cli. This exercises the exact load path
-# used for the real 46 GB checkpoint without downloading any weights.
+# M2 smoke test: build a tiny synthetic LTX-2 stack (video DiT + Gemma-3 text
+# encoder + text projection) and run a full end-to-end CPU generate, verifying
+# the Gemma encoder, the DiT denoising loop, the LinearQuadratic scheduler and
+# the Video-VAE decode all execute and produce a video file. This exercises the
+# exact pipeline used for the real ~46 GB checkpoint without downloading any
+# weights.
 #
 # Usage: script/ci_ltx2_load_smoke.sh [path-to-sd-cli]
 set -euo pipefail
@@ -9,10 +12,12 @@ set -euo pipefail
 ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
 SD_CLI="${1:-$ROOT/build/bin/sd-cli}"
 GGUF="$(mktemp -t ltx2_tiny.XXXXXX.gguf)"
+GEMMA="$GGUF.gemma.gguf"
+OUT="$(mktemp -t ltx2_smoke_out.XXXXXX)"
 LOG="$(mktemp -t ltx2_smoke.XXXXXX.log)"
 PY="${PYTHON:-python3}"
 
-cleanup() { rm -f "$GGUF" "$LOG"; }
+cleanup() { rm -f "$GGUF" "$GEMMA" "$LOG" "$OUT" "$OUT.avi" "$OUT.png"; }
 trap cleanup EXIT
 
 if [ ! -x "$SD_CLI" ]; then
@@ -20,38 +25,46 @@ if [ ! -x "$SD_CLI" ]; then
     exit 1
 fi
 
-echo "==> generating synthetic LTX-2 DiT GGUF"
-"$PY" "$ROOT/script/make_synthetic_ltx2_gguf.py" --out "$GGUF"
+echo "==> generating synthetic LTX-2 DiT + Gemma-3 GGUFs"
+"$PY" "$ROOT/script/make_synthetic_ltx2_gguf.py" --out "$GGUF" --gemma-out "$GEMMA"
 
-echo "==> loading via sd-cli (generation is expected to stop: M1 is load-only)"
-# sd-cli returns non-zero because M1 has no text encoder yet; we assert on the
-# load markers in the log instead of the exit code.
-"$SD_CLI" -M vid_gen --diffusion-model "$GGUF" -p "smoke" \
-    --steps 1 --video-frames 1 -W 32 -H 32 -o /tmp/ltx2_smoke_out >"$LOG" 2>&1 || true
+echo "==> running end-to-end T2V generate on CPU"
+"$SD_CLI" -M vid_gen --diffusion-model "$GGUF" --llm "$GEMMA" -p "smoke test" \
+    --steps 4 --video-frames 9 -W 32 -H 32 --scheduler linear_quadratic \
+    -o "$OUT" >"$LOG" 2>&1 || echo "generate failed: sd-cli exited with status $?" >>"$LOG"  # keep going under `set -e` so the log is still printed and checked below
 
 cat "$LOG"
 
-echo "==> checking load markers"
+echo "==> checking generate markers"
 fail=0
 for marker in \
     "Version: LTX-2" \
+    "Gemma-3 encoder: num_layers=" \
     "LTX-2 DiT: num_layers=" \
-    "loading tensors completed" \
-    "total params memory size"; do
+    "get_sigmas with LinearQuadratic scheduler" \
+    "get_learned_condition completed" \
+    "sampling completed" \
+    "decode_first_stage completed" \
+    "save result"; do
     if ! grep -qF "$marker" "$LOG"; then
         echo "FAIL: missing expected log marker: '$marker'" >&2
         fail=1
     fi
 done
 
-if grep -qiE "load tensors from model loader failed|get_sd_version failed" "$LOG"; then
-    echo "FAIL: tensor load reported failure" >&2
+if grep -qiE "load tensors from model loader failed|get_sd_version failed|generate failed" "$LOG"; then
+    echo "FAIL: generation reported failure" >&2
+    fail=1
+fi
+
+if [ ! -s "$OUT.avi" ]; then
+    echo "FAIL: expected video output '$OUT.avi' was not written" >&2
     fail=1
 fi
 
 if [ "$fail" -ne 0 ]; then
-    echo "LTX-2 load smoke test FAILED" >&2
+    echo "LTX-2 generate smoke test FAILED" >&2
     exit 1
 fi
 
-echo "LTX-2 load smoke test PASSED"
+echo "LTX-2 generate smoke test PASSED"
diff --git a/script/make_synthetic_ltx2_gguf.py b/script/make_synthetic_ltx2_gguf.py
index 530ce5d27..7648da8f0 100644
--- a/script/make_synthetic_ltx2_gguf.py
+++ b/script/make_synthetic_ltx2_gguf.py
@@ -1,11 +1,16 @@
 #!/usr/bin/env python3
-"""Generate a tiny synthetic LTX-2 video-DiT GGUF for the load-on-CPU smoke test.
+"""Generate a tiny synthetic LTX-2 GGUF for the CPU smoke / end-to-end test.
 
-This emits the exact tensor names of the LTX-2 video DiT block tree
-(src/ltx2.hpp) at drastically reduced dimensions, so CI can verify that the
-model loads and binds every tensor on CPU without the 46 GB real checkpoint.
-The C++ side infers geometry from these shapes, so the same code path that
-loads this file loads the real weights.
+This emits the exact tensor names of the full LTX-2 stack at drastically
+reduced dimensions:
+  * the video DiT block tree            (src/ltx2.hpp,    model.diffusion_model.*)
+  * the Gemma-3 text encoder            (src/gemma3.hpp,  relative names in the separate --gemma-out file)
+  * the LTX multi-layer text projection (conditioner,     model.diffusion_model.text_embedding_projection.*)
+
+so CI can verify that the model loads, binds every tensor, and runs a full
+generate on CPU without the ~46 GB real checkpoint. The C++ side infers
+geometry from these shapes, so the same code path that loads this file loads
+the real weights. The Video-VAE is geometric (weightless) and needs no tensors.
 
     python script/make_synthetic_ltx2_gguf.py --out /tmp/ltx2_tiny.gguf
 """
@@ -20,17 +25,27 @@
 
 def main() -> int:
     ap = argparse.ArgumentParser()
-    ap.add_argument("--out", required=True)
+    ap.add_argument("--out", required=True, help="LTX-2 checkpoint (DiT + text projection); load via --diffusion-model")
+    ap.add_argument("--gemma-out", help="Gemma-3 encoder file; load via --llm. Defaults to <out>.gemma.gguf")
     ap.add_argument("--layers", type=int, default=2)
     ap.add_argument("--connector-layers", type=int, default=2)
     ap.add_argument("--dim", type=int, default=64)
     ap.add_argument("--heads", type=int, default=4)
-    ap.add_argument("--in-channels", type=int, default=8)
+    ap.add_argument("--in-channels", type=int, default=128)
     ap.add_argument("--ffn", type=int, default=128)
     ap.add_argument("--registers", type=int, default=16)
     ap.add_argument("--freq", type=int, default=16)
+    # Gemma-3 encoder (tiny)
+    ap.add_argument("--gemma-layers", type=int, default=2)
+    ap.add_argument("--gemma-hidden", type=int, default=32)
+    ap.add_argument("--gemma-heads", type=int, default=2)
+    ap.add_argument("--gemma-kv-heads", type=int, default=1)
+    ap.add_argument("--gemma-head-dim", type=int, default=16)
+    ap.add_argument("--gemma-intermediate", type=int, default=64)
+    ap.add_argument("--gemma-vocab", type=int, default=64)
     args = ap.parse_args()
 
+    rng = np.random.default_rng(0)
     P = "model.diffusion_model."
     dim, inner, heads = args.dim, args.ffn, args.heads
     inc, freq, regs = args.in_channels, args.freq, args.registers
@@ -43,6 +58,9 @@ def lin(name, out_f, in_f):
         tensors[name + ".weight"] = (out_f, in_f)
         tensors[name + ".bias"] = (out_f,)
 
+    def lin_nb(name, out_f, in_f):
+        tensors[name + ".weight"] = (out_f, in_f)
+
     def attn(prefix, q_dim, kv_dim):
         lin(prefix + ".to_q", q_dim, q_dim)
         lin(prefix + ".to_k", q_dim, kv_dim)
@@ -56,7 +74,7 @@ def ff(prefix):
         lin(prefix + ".net.0.proj", inner, dim)
         lin(prefix + ".net.2", dim, inner)
 
-    # top-level
+    # ---- video DiT ----
     lin(P + "patchify_proj", dim, inc)
     lin(P + "proj_out", inc, dim)
     tensors[P + "scale_shift_table"] = (2, dim)
@@ -67,7 +85,6 @@ def ff(prefix):
     lin(P + "prompt_adaln_single.emb.timestep_embedder.linear_2", dim, dim)
     lin(P + "prompt_adaln_single.linear", 2 * dim, dim)
 
-    # connector
     C = P + "video_embeddings_connector."
     tensors[C + "learnable_registers"] = (regs, dim)
     for i in range(args.connector_layers):
@@ -75,7 +92,6 @@ def ff(prefix):
         attn(b + ".attn1", dim, dim)
         ff(b + ".ff")
 
-    # DiT blocks
     for i in range(args.layers):
         b = P + f"transformer_blocks.{i}"
         attn(b + ".attn1", dim, dim)
@@ -84,14 +100,58 @@ def ff(prefix):
         tensors[b + ".scale_shift_table"] = (9, dim)
         tensors[b + ".prompt_scale_shift_table"] = (2, dim)
 
-    for name, shape in tensors.items():
-        w.add_tensor(name, np.zeros(shape, dtype=np.float32))
+    # ---- LTX multi-layer text projection: (hidden * gemma_layers) -> dim ----
+    # Lives in the LTX checkpoint, so it is namespaced under model.diffusion_model.
+    lin(P + "text_embedding_projection.video_proj", dim, args.gemma_hidden * args.gemma_layers)
+
+    # ---- Gemma-3 text encoder (separate file, relative names for --llm) ----
+    gemma_tensors: dict[str, tuple[int, ...]] = {}
+
+    def glin_nb(name, out_f, in_f):
+        gemma_tensors[name + ".weight"] = (out_f, in_f)
+
+    G = ""
+    gh = args.gemma_hidden
+    ghd = args.gemma_head_dim
+    gq = args.gemma_heads * ghd
+    gkv = args.gemma_kv_heads * ghd
+    gi = args.gemma_intermediate
+    # HF Embedding is [num_embeddings, embedding_dim] -> ggml ne0=hidden, ne1=vocab
+    gemma_tensors[G + "embed_tokens.weight"] = (args.gemma_vocab, gh)
+    for i in range(args.gemma_layers):
+        b = G + f"layers.{i}"
+        gemma_tensors[b + ".input_layernorm.weight"] = (gh,)
+        glin_nb(b + ".self_attn.q_proj", gq, gh)
+        glin_nb(b + ".self_attn.k_proj", gkv, gh)
+        glin_nb(b + ".self_attn.v_proj", gkv, gh)
+        glin_nb(b + ".self_attn.o_proj", gh, gq)
+        gemma_tensors[b + ".self_attn.q_norm.weight"] = (ghd,)
+        gemma_tensors[b + ".self_attn.k_norm.weight"] = (ghd,)
+        gemma_tensors[b + ".post_attention_layernorm.weight"] = (gh,)
+        gemma_tensors[b + ".pre_feedforward_layernorm.weight"] = (gh,)
+        glin_nb(b + ".mlp.gate_proj", gi, gh)
+        glin_nb(b + ".mlp.up_proj", gi, gh)
+        glin_nb(b + ".mlp.down_proj", gh, gi)
+        gemma_tensors[b + ".post_feedforward_layernorm.weight"] = (gh,)
+    gemma_tensors[G + "norm.weight"] = (gh,)
 
+    for name, shape in tensors.items():
+        w.add_tensor(name, rng.standard_normal(shape).astype(np.float32) * 0.02)
     w.write_header_to_file()
     w.write_kv_data_to_file()
     w.write_tensors_to_file()
     w.close()
     print(f"wrote {len(tensors)} tensors to {args.out}")
+
+    gemma_out = args.gemma_out or (args.out + ".gemma.gguf")
+    gw = gguf.GGUFWriter(gemma_out, "gemma3")
+    for name, shape in gemma_tensors.items():
+        gw.add_tensor(name, rng.standard_normal(shape).astype(np.float32) * 0.02)
+    gw.write_header_to_file()
+    gw.write_kv_data_to_file()
+    gw.write_tensors_to_file()
+    gw.close()
+    print(f"wrote {len(gemma_tensors)} tensors to {gemma_out}")
     return 0
 
 
diff --git a/src/conditioner.hpp b/src/conditioner.hpp
index d4a3146b8..06616fd51 100644
--- a/src/conditioner.hpp
+++ b/src/conditioner.hpp
@@ -2,6 +2,7 @@
 #define __CONDITIONER_HPP__
 
 #include "clip.hpp"
+#include "gemma3.hpp"
 #include "llm.hpp"
 #include "t5.hpp"
 
@@ -2151,4 +2152,142 @@ struct LLMEmbedder : public Conditioner {
     }
 };
 
+// LTX-2 multi-layer text feature projection. Takes the stacked Gemma-3
+// per-layer hidden states and maps them to the DiT cross-attention dimension
+// with the Linear stored in the LTX-2 checkpoint at `<prefix>.video_proj`.
+//
+// NOTE: this is flatten([hidden x num_layers]) -> Linear only. The reference
+// aggregation (mean-centering / per-layer scaling) and the projection tensor
+// layout still have to be reconciled with the Diffusers LTX-2 implementation.
+struct Ltx2TextProjection : public GGMLRunner {
+    int64_t in_dim  = 0;  // hidden * num_layers
+    int64_t out_dim = 0;  // cross_attention_dim
+    std::shared_ptr<Linear> proj;
+
+    Ltx2TextProjection(ggml_backend_t backend,
+                       bool offload_params_to_cpu,
+                       const String2TensorStorage& tensor_storage_map,
+                       const std::string prefix = "text_embedding_projection")
+        : GGMLRunner(backend, offload_params_to_cpu) {
+        const std::string proj_name = prefix + ".video_proj";
+        const auto weight           = tensor_storage_map.find(proj_name + ".weight");
+        if (weight != tensor_storage_map.end() && weight->second.n_dims >= 2) {
+            in_dim  = weight->second.ne[0];  // both dims stay 0 when the weight is absent
+            out_dim = weight->second.ne[1];
+        }
+        proj = std::make_shared<Linear>(in_dim, out_dim, true);
+        proj->init(params_ctx, tensor_storage_map, proj_name);
+    }
+
+    std::string get_desc() override { return "ltx2_text_proj"; }
+
+    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
+        proj->get_param_tensors(tensors, prefix + ".video_proj");
+    }
+
+    struct ggml_cgraph* build_graph(struct ggml_tensor* stacked) {
+        struct ggml_cgraph* graph = new_graph_custom(4096);
+        stacked                   = to_backend(stacked);  // [hidden, n_token, num_layers]
+        auto runner_ctx           = get_context();
+        auto ctx                  = runner_ctx.ggml_ctx;
+
+        const int64_t flat_dim = stacked->ne[0] * stacked->ne[2];  // hidden * num_layers
+        const int64_t n_token  = stacked->ne[1];
+
+        auto by_token = ggml_cont(ctx, ggml_permute(ctx, stacked, 0, 2, 1, 3));  // [hidden, num_layers, n_token]
+        auto flat     = ggml_reshape_2d(ctx, by_token, flat_dim, n_token);       // [hidden*num_layers, n_token]
+        auto out      = proj->forward(&runner_ctx, flat);                        // [out_dim, n_token]
+        out           = ggml_reshape_3d(ctx, out, out->ne[0], out->ne[1], 1);
+        ggml_build_forward_expand(graph, out);
+        return graph;
+    }
+
+    bool compute(int n_threads, struct ggml_tensor* stacked, struct ggml_tensor** output, struct ggml_context* output_ctx) {
+        auto make_graph = [this, stacked]() -> struct ggml_cgraph* { return build_graph(stacked); };
+        return GGMLRunner::compute(make_graph, n_threads, false, output, output_ctx);
+    }
+};
+
+// Placeholder Gemma tokenizer. The Gemma-3 SentencePiece model is not embedded
+// here, so tokenize() emits bos_token_id followed by one id per input byte,
+// folded into [0, vocab_size) (raw byte values when vocab_size is not
+// positive). Real generation requires the Gemma-3 SentencePiece vocabulary.
+struct GemmaTokenizer {
+    int bos_token_id   = 2;
+    int eos_token_id   = 1;
+    int64_t vocab_size = 262208;
+
+    std::vector<int> tokenize(const std::string& text) {
+        const int64_t modulus = vocab_size > 0 ? vocab_size : 256;  // `% 0` is UB; 256 keeps raw byte ids
+        std::vector<int> ids{bos_token_id};
+        for (unsigned char c : text) {
+            ids.push_back(static_cast<int>(static_cast<int64_t>(c) % modulus));
+        }
+        return ids;
+    }
+};
+
+// LTX-2 text conditioner: Gemma-3 encoder -> multi-layer feature extractor ->
+// cross-attention context for the video DiT (null context when encoding fails).
+struct Ltx2Conditioner : public Conditioner {
+    GemmaTokenizer tokenizer;
+    std::shared_ptr<GEMMA3::Gemma3Runner> gemma;
+    std::shared_ptr<Ltx2TextProjection> projection;
+    std::string gemma_prefix;
+    std::string proj_prefix;
+
+    Ltx2Conditioner(ggml_backend_t backend,
+                    bool offload_params_to_cpu,
+                    const String2TensorStorage& tensor_storage_map,
+                    const std::string gemma_prefix = "text_encoders.llm",
+                    const std::string proj_prefix  = "model.diffusion_model.text_embedding_projection")
+        : gemma_prefix(gemma_prefix), proj_prefix(proj_prefix) {
+        gemma                = std::make_shared<GEMMA3::Gemma3Runner>(backend, offload_params_to_cpu, tensor_storage_map, gemma_prefix);
+        tokenizer.vocab_size = gemma->params.vocab_size;  // keep placeholder token ids inside the encoder's vocabulary
+        projection           = std::make_shared<Ltx2TextProjection>(backend, offload_params_to_cpu, tensor_storage_map, proj_prefix);
+    }
+
+    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
+        gemma->get_param_tensors(tensors, gemma_prefix);
+        projection->get_param_tensors(tensors, proj_prefix);
+    }
+
+    void alloc_params_buffer() override {
+        gemma->alloc_params_buffer();
+        projection->alloc_params_buffer();
+    }
+
+    void free_params_buffer() override {
+        gemma->free_params_buffer();
+        projection->free_params_buffer();
+    }
+
+    size_t get_params_buffer_size() override {
+        return gemma->get_params_buffer_size() + projection->get_params_buffer_size();
+    }
+
+    void set_flash_attention_enabled(bool enabled) override {
+        gemma->set_flash_attention_enabled(enabled);
+        projection->set_flash_attention_enabled(enabled);
+    }
+
+    SDCondition get_learned_condition(ggml_context* work_ctx,
+                                      int n_threads,
+                                      const ConditionerParams& conditioner_params) override {
+        std::vector<int> tokens = tokenizer.tokenize(conditioner_params.text);
+        auto input_ids          = ggml_new_tensor_1d(work_ctx, GGML_TYPE_I32, tokens.size());
+        for (size_t i = 0; i < tokens.size(); i++) {
+            ggml_set_i32_1d(input_ids, i, tokens[i]);
+        }
+        struct ggml_tensor* stacked = nullptr;  // [hidden, n_token, num_layers]
+        struct ggml_tensor* context = nullptr;  // [cross_attention_dim, n_token, 1]
+        gemma->compute(n_threads, input_ids, &stacked, work_ctx);  // `stacked` stays null if nothing was written
+        if (stacked == nullptr || !projection->compute(n_threads, stacked, &context, work_ctx)) {
+            LOG_ERROR("ltx2 conditioner: Gemma-3 encode or text projection failed");
+            context = nullptr;  // never hand a partial result to the DiT
+        }
+        return SDCondition(context, nullptr, nullptr);
+    }
+};
+
 #endif
diff --git a/src/denoiser.hpp b/src/denoiser.hpp
index 40bd7cb7f..abdbad32c 100644
--- a/src/denoiser.hpp
+++ b/src/denoiser.hpp
@@ -477,6 +477,56 @@ struct KLOptimalScheduler : SigmaScheduler {
     }
 };
 
+// Sigma schedule mirroring the Diffusers `linear_quadratic_schedule` used for
+// LTX-2 flow matching. A progress value x starts at 0, climbs linearly to
+// `threshold_noise` over the first `linear_steps` steps and then follows a
+// quadratic up to 1; every x is emitted as sigma = 1 - x, so the result holds
+// n+1 descending values ending at 0. sigma_min/sigma_max/t_to_sigma are unused.
+struct LinearQuadraticScheduler : SigmaScheduler {
+    float threshold_noise = 0.025f;
+    int linear_steps      = -1;  // negative selects the default of n / 2
+
+    std::vector<float> get_sigmas(uint32_t n, float /*sigma_min*/, float /*sigma_max*/, t_to_sigma_t /*t_to_sigma*/) override {
+        if (n == 0) {
+            return {};
+        }
+        if (n == 1) {
+            return {1.0f, 0.0f};  // one step: straight from sigma 1 to sigma 0
+        }
+
+        const int total  = static_cast<int>(n);
+        const int wanted = linear_steps >= 0 ? linear_steps : total / 2;
+        const int n_lin  = std::max(1, std::min(wanted, total - 1));  // both segments keep >= 1 step
+        const int n_quad = total - n_lin;
+
+        const float total_f = static_cast<float>(total);
+        const float lin_f   = static_cast<float>(n_lin);
+        const float quad_f  = static_cast<float>(n_quad);
+
+        // Quadratic segment x(i) = a*i^2 + b*i + c.
+        const float step_diff = lin_f - threshold_noise * total_f;
+        const float a         = step_diff / (lin_f * quad_f * quad_f);
+        const float b         = threshold_noise / lin_f - 2.0f * step_diff / (quad_f * quad_f);
+        const float c         = a * lin_f * lin_f;
+
+        // Emit sigma = 1 - x directly instead of staging x in a second vector.
+        std::vector<float> sigmas;
+        sigmas.reserve(total + 1);
+        int step = 0;
+        for (; step < n_lin; ++step) {  // linear segment
+            const float x = static_cast<float>(step) * threshold_noise / lin_f;
+            sigmas.push_back(1.0f - x);
+        }
+        for (; step < total; ++step) {  // quadratic segment
+            const float s = static_cast<float>(step);
+            const float x = a * s * s + b * s + c;
+            sigmas.push_back(1.0f - x);
+        }
+        sigmas.push_back(0.0f);  // final entry (x == 1) is exactly zero
+        return sigmas;
+    }
+};
+
 struct Denoiser {
     virtual float sigma_min()                                                                = 0;
     virtual float sigma_max()                                                                = 0;
@@ -534,6 +584,10 @@ struct Denoiser {
                 LOG_INFO("get_sigmas with LCM scheduler");
                 scheduler = std::make_shared<LCMScheduler>();
                 break;
+            case LINEAR_QUADRATIC_SCHEDULER:
+                LOG_INFO("get_sigmas with LinearQuadratic scheduler");
+                scheduler = std::make_shared<LinearQuadraticScheduler>();
+                break;
             default:
                 LOG_INFO("get_sigmas with discrete scheduler (default)");
                 scheduler = std::make_shared<DiscreteScheduler>();
diff --git a/src/diffusion_model.hpp b/src/diffusion_model.hpp
index b1c6a6184..bc5158a4e 100644
--- a/src/diffusion_model.hpp
+++ b/src/diffusion_model.hpp
@@ -431,13 +431,16 @@ struct Ltx2Model : public DiffusionModel {
         ltx2.set_circular_axes(circular_x, circular_y);
     }
 
-    // M1: load-only. Denoising forward is implemented in M2.
     bool compute(int n_threads,
                  DiffusionParams diffusion_params,
                  struct ggml_tensor** output     = nullptr,
                  struct ggml_context* output_ctx = nullptr) override {
-        LOG_ERROR("LTX-2 diffusion forward is not implemented yet (M1 is load-only)");
-        return false;
+        return ltx2.compute(n_threads,  // delegate to the LTX-2 runner; only x, timesteps and context of diffusion_params are forwarded
+                            diffusion_params.x,
+                            diffusion_params.timesteps,
+                            diffusion_params.context,
+                            output,
+                            output_ctx);
     }
 };
 
diff --git a/src/gemma3.hpp b/src/gemma3.hpp
new file mode 100644
index 000000000..55ddbbc62
--- /dev/null
+++ b/src/gemma3.hpp
@@ -0,0 +1,363 @@
+#ifndef __GEMMA3_HPP__
+#define __GEMMA3_HPP__
+
+#include "ggml_extend.hpp"
+#include "rope.hpp"
+
+// Gemma-3 decoder-only text encoder for LTX-2 conditioning.
+//
+// This is a self-contained Gemma-3 implementation (separate from the shared
+// LLM path in llm.hpp) so the LTX-2 text encoder can evolve without perturbing
+// Qwen/Mistral/Z-Image. It implements the Gemma-3 specifics: (1+weight)
+// RMSNorm, GeGLU MLP, q/k RMSNorm, four per-layer norms, scaled token
+// embeddings, and sliding-window vs global attention layers.
+//
+// The hidden states of every decoder layer are returned stacked on a new axis
+// so the LTX multi-layer feature extractor can aggregate them.
+//
+// NOTE: RoPE theta, the sliding-window pattern, and the query scaling follow
+// the public Gemma-3 config; validate numerically against the reference before
+// claiming parity (M3).
+namespace GEMMA3 {
+
+    struct Gemma3Params {
+        int64_t num_layers        = 48;    // gemma-3-12b defaults; Gemma3Runner::infer_params overrides this with (highest "layers.N" index + 1)
+        int64_t hidden_size       = 3840;  // inferred from embed_tokens.weight ne[0] when present
+        int64_t intermediate_size = 15360;  // inferred from layers.0.mlp.gate_proj.weight ne[1] when present
+        int num_heads             = 16;  // inferred as q_proj.weight ne[1] / head_dim when present
+        int num_kv_heads          = 8;  // inferred as k_proj.weight ne[1] / head_dim when present
+        int head_dim              = 256;  // inferred from layers.0.self_attn.q_norm.weight ne[0] when present
+        int64_t vocab_size        = 262208;  // inferred from embed_tokens.weight ne[1] when present
+        float rms_norm_eps        = 1e-6f;  // epsilon handed to every GemmaRMSNorm
+        int sliding_window        = 1024;  // local layers mask keys that lie this many or more positions behind the query
+        int sliding_window_pattern = 6;     // every 6th layer is global: layer i is global when (i + 1) % pattern == 0
+        float rope_theta_local    = 10000.0f;  // RoPE base used by sliding-window (local) layers
+        float rope_theta_global   = 1000000.0f;  // RoPE base used by global layers
+    };
+
+    // RMSNorm in the Gemma convention: y = rms_norm(x, eps) * (1 + weight), with `weight` of length dim.
+    class GemmaRMSNorm : public UnaryBlock {
+    protected:
+        int64_t dim;
+        float eps;
+
+        void init_params(struct ggml_context* ctx,
+                         const String2TensorStorage& tensor_storage_map = {},
+                         const std::string prefix                       = "") override {
+            // The weight is always declared as F32; tensor_storage_map and prefix are not consulted.
+            params["weight"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, dim);
+        }
+
+    public:
+        GemmaRMSNorm(int64_t dim, float eps = 1e-6f)
+            : dim(dim), eps(eps) {}
+
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
+            auto weight     = params["weight"];
+            auto g          = ctx->ggml_ctx;
+            auto normalized = ggml_rms_norm(g, x, eps);
+            // normalized + normalized * weight == normalized * (1 + weight)
+            return ggml_add(g, normalized, ggml_mul(g, normalized, weight));
+        }
+    };
+
+    // Gated-GELU feed-forward: down_proj(gelu(gate_proj(x)) * up_proj(x)), built from three Linear blocks.
+    class GemmaMLP : public GGMLBlock {
+    public:
+        GemmaMLP(int64_t hidden_size, int64_t intermediate_size) {
+            blocks["gate_proj"] = std::make_shared<Linear>(hidden_size, intermediate_size, false);
+            blocks["up_proj"]   = std::make_shared<Linear>(hidden_size, intermediate_size, false);
+            blocks["down_proj"] = std::make_shared<Linear>(intermediate_size, hidden_size, false);
+        }
+
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+            auto gate_proj = std::dynamic_pointer_cast<Linear>(blocks["gate_proj"]);
+            auto up_proj   = std::dynamic_pointer_cast<Linear>(blocks["up_proj"]);
+            auto down_proj = std::dynamic_pointer_cast<Linear>(blocks["down_proj"]);
+            auto gated     = ggml_ext_gelu(ctx->ggml_ctx, gate_proj->forward(ctx, x), true);
+            auto hidden    = ggml_mul(ctx->ggml_ctx, gated, up_proj->forward(ctx, x));
+            return down_proj->forward(ctx, hidden);
+        }
+    };
+
+    class Gemma3Attention : public GGMLBlock {  // self-attention with separate query / key-value head counts, per-head q/k RMSNorm and RoPE
+    protected:
+        int head_dim;
+        int64_t num_heads;
+        int64_t num_kv_heads;
+        float eps;
+        float rope_theta;  // RoPE base passed to ggml_rope_ext; chosen per layer by the caller
+
+    public:
+        Gemma3Attention(const Gemma3Params& p, float rope_theta)
+            : head_dim(p.head_dim), num_heads(p.num_heads), num_kv_heads(p.num_kv_heads), eps(p.rms_norm_eps), rope_theta(rope_theta) {
+            blocks["q_proj"] = std::make_shared<Linear>(p.hidden_size, num_heads * head_dim, false);
+            blocks["k_proj"] = std::make_shared<Linear>(p.hidden_size, num_kv_heads * head_dim, false);
+            blocks["v_proj"] = std::make_shared<Linear>(p.hidden_size, num_kv_heads * head_dim, false);
+            blocks["o_proj"] = std::make_shared<Linear>(num_heads * head_dim, p.hidden_size, false);
+            blocks["q_norm"] = std::make_shared<GemmaRMSNorm>(head_dim, eps);
+            blocks["k_norm"] = std::make_shared<GemmaRMSNorm>(head_dim, eps);
+        }
+        // x is read as [hidden, n_token, N] (n_token = ne[1], N = ne[2]); input_pos feeds RoPE; attention_mask is handed to the attention helper.
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                    struct ggml_tensor* x,
+                                    struct ggml_tensor* input_pos,
+                                    struct ggml_tensor* attention_mask) {
+            int64_t n_token = x->ne[1];
+            int64_t N       = x->ne[2];
+            auto q_proj     = std::dynamic_pointer_cast<Linear>(blocks["q_proj"]);
+            auto k_proj     = std::dynamic_pointer_cast<Linear>(blocks["k_proj"]);
+            auto v_proj     = std::dynamic_pointer_cast<Linear>(blocks["v_proj"]);
+            auto o_proj     = std::dynamic_pointer_cast<Linear>(blocks["o_proj"]);
+            auto q_norm     = std::dynamic_pointer_cast<GemmaRMSNorm>(blocks["q_norm"]);
+            auto k_norm     = std::dynamic_pointer_cast<GemmaRMSNorm>(blocks["k_norm"]);
+            // Linear projections of the same input x.
+            auto q = q_proj->forward(ctx, x);
+            auto k = k_proj->forward(ctx, x);
+            auto v = v_proj->forward(ctx, x);
+            // Split the projected channels into heads: [head_dim, heads, n_token, N].
+            q = ggml_reshape_4d(ctx->ggml_ctx, q, head_dim, num_heads, n_token, N);
+            k = ggml_reshape_4d(ctx->ggml_ctx, k, head_dim, num_kv_heads, n_token, N);
+            v = ggml_reshape_4d(ctx->ggml_ctx, v, head_dim, num_kv_heads, n_token, N);
+            // q/k are normalized per head (norm width is head_dim) before RoPE; v is left as is.
+            q = q_norm->forward(ctx, q);
+            k = k_norm->forward(ctx, k);
+            // NOTE(review): freq_scale is fixed at 1.f here; the public Gemma-3 config lists linear RoPE scaling for global layers - confirm.
+            q = ggml_rope_ext(ctx->ggml_ctx, q, input_pos, nullptr, head_dim, GGML_ROPE_TYPE_NEOX, 0, rope_theta, 1.f, 0.f, 1.f, 0.f, 0.f);
+            k = ggml_rope_ext(ctx->ggml_ctx, k, input_pos, nullptr, head_dim, GGML_ROPE_TYPE_NEOX, 0, rope_theta, 1.f, 0.f, 1.f, 0.f, 0.f);
+            // Swap the head and token axes of q/k, then fold N into the last axis; v is passed on without this permute.
+            q = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, q, 0, 2, 1, 3));
+            q = ggml_reshape_3d(ctx->ggml_ctx, q, q->ne[0], q->ne[1], q->ne[2] * q->ne[3]);
+            k = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, k, 0, 2, 1, 3));
+            k = ggml_reshape_3d(ctx->ggml_ctx, k, k->ne[0], k->ne[1], k->ne[2] * k->ne[3]);
+            // NOTE(review): the last argument is a literal false where LTX2::GatedAttention passes ctx->flash_attn_enabled - confirm this is intended.
+            x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, attention_mask, true, false);
+            x = o_proj->forward(ctx, x);
+            return x;
+        }
+    };
+
+    class Gemma3Block : public GGMLBlock {  // one decoder layer: attention + MLP, each with a norm before and after
+    public:
+        Gemma3Block(const Gemma3Params& p, float rope_theta) {
+            blocks["self_attn"] = std::make_shared<Gemma3Attention>(p, rope_theta);
+            blocks["mlp"]       = std::make_shared<GemmaMLP>(p.hidden_size, p.intermediate_size);
+            // The four norms share one configuration (hidden_size, rms_norm_eps).
+            for (const char* norm_name : {"input_layernorm", "post_attention_layernorm",
+                                          "pre_feedforward_layernorm", "post_feedforward_layernorm"}) {
+                blocks[norm_name] = std::make_shared<GemmaRMSNorm>(p.hidden_size, p.rms_norm_eps);
+            }
+        }
+
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                    struct ggml_tensor* x,
+                                    struct ggml_tensor* input_pos,
+                                    struct ggml_tensor* attention_mask) {
+            auto g            = ctx->ggml_ctx;
+            auto attention    = std::dynamic_pointer_cast<Gemma3Attention>(blocks["self_attn"]);
+            auto feed_forward = std::dynamic_pointer_cast<GemmaMLP>(blocks["mlp"]);
+            auto norm_in      = std::dynamic_pointer_cast<GemmaRMSNorm>(blocks["input_layernorm"]);
+            auto norm_attn    = std::dynamic_pointer_cast<GemmaRMSNorm>(blocks["post_attention_layernorm"]);
+            auto norm_pre_ff  = std::dynamic_pointer_cast<GemmaRMSNorm>(blocks["pre_feedforward_layernorm"]);
+            auto norm_post_ff = std::dynamic_pointer_cast<GemmaRMSNorm>(blocks["post_feedforward_layernorm"]);
+
+            // Attention sub-layer: norm -> attention -> norm, then the skip connection.
+            auto h          = norm_in->forward(ctx, x);
+            h               = attention->forward(ctx, h, input_pos, attention_mask);
+            h               = norm_attn->forward(ctx, h);
+            auto after_attn = ggml_add(g, h, x);
+
+            // Feed-forward sub-layer with the same norm / residual arrangement.
+            h = norm_pre_ff->forward(ctx, after_attn);
+            h = feed_forward->forward(ctx, h);
+            h = norm_post_ff->forward(ctx, h);
+            return ggml_add(g, h, after_attn);
+        }
+    };
+
+    class Gemma3Model : public GGMLBlock {
+    protected:
+        Gemma3Params p;
+
+        // Layers whose 1-based index is a multiple of sliding_window_pattern are "global"; the rest are local.
+        bool is_global_layer(int layer) const { return ((layer + 1) % p.sliding_window_pattern) == 0; }
+
+    public:
+        Gemma3Model(const Gemma3Params& p)
+            : p(p) {
+            blocks["embed_tokens"] = std::make_shared<Embedding>(p.vocab_size, p.hidden_size);
+            for (int layer = 0; layer < p.num_layers; ++layer) {
+                const float theta = is_global_layer(layer) ? p.rope_theta_global : p.rope_theta_local;
+                blocks["layers." + std::to_string(layer)] = std::make_shared<Gemma3Block>(p, theta);
+            }
+            // NOTE(review): "norm" is declared here but forward() below never applies it - confirm this is intended.
+            blocks["norm"] = std::make_shared<GemmaRMSNorm>(p.hidden_size, p.rms_norm_eps);
+        }
+
+        // input_ids: [n_token]; mask_local / mask_global are handed to the local / global layers respectively.
+        // Returns each layer's output stacked along axis 2: [hidden, n_token, num_layers].
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                    struct ggml_tensor* input_ids,
+                                    struct ggml_tensor* input_pos,
+                                    struct ggml_tensor* mask_local,
+                                    struct ggml_tensor* mask_global) {
+            auto g     = ctx->ggml_ctx;
+            auto embed = std::dynamic_pointer_cast<Embedding>(blocks["embed_tokens"]);
+
+            // Token embeddings are scaled by sqrt(hidden_size) before the first layer.
+            auto h = ggml_scale(g, embed->forward(ctx, input_ids), sqrtf(static_cast<float>(p.hidden_size)));
+
+            struct ggml_tensor* all_layers = nullptr;
+            for (int layer = 0; layer < p.num_layers; ++layer) {
+                auto block = std::dynamic_pointer_cast<Gemma3Block>(blocks["layers." + std::to_string(layer)]);
+                h          = block->forward(ctx, h, input_pos, is_global_layer(layer) ? mask_global : mask_local);
+
+                auto slice = ggml_reshape_3d(g, h, h->ne[0], h->ne[1], 1);  // [hidden, n_token, 1]
+                all_layers = (all_layers == nullptr) ? slice : ggml_concat(g, all_layers, slice, 2);
+            }
+            return all_layers;
+        }
+    };
+
+    struct Gemma3Runner : public GGMLRunner {  // owns the Gemma-3 block tree and builds / runs its compute graph
+        Gemma3Params params;
+        Gemma3Model model;
+        std::vector<int> input_pos_vec;      // host copy of positions 0..n_tokens-1, filled in build_graph
+        std::vector<float> mask_local_vec;   // host copy of the sliding-window causal mask
+        std::vector<float> mask_global_vec;  // host copy of the plain causal mask
+        // Infers the architecture from the tensor shapes found under `prefix`, then declares the parameter tensors.
+        Gemma3Runner(ggml_backend_t backend,
+                     bool offload_params_to_cpu,
+                     const String2TensorStorage& tensor_storage_map = {},
+                     const std::string prefix                       = "text_encoders.gemma3")
+            : GGMLRunner(backend, offload_params_to_cpu),
+              params(infer_params(tensor_storage_map, prefix)),
+              model(params) {
+            LOG_INFO("Gemma-3 encoder: num_layers=%lld hidden=%lld heads=%d kv_heads=%d head_dim=%d",
+                     (long long)params.num_layers, (long long)params.hidden_size, params.num_heads, params.num_kv_heads, params.head_dim);
+            model.init(params_ctx, tensor_storage_map, prefix);
+        }
+        // Derives Gemma3Params from tensor names / shapes under `prefix`; fields without a matching tensor keep their defaults.
+        static Gemma3Params infer_params(const String2TensorStorage& tsm, const std::string& prefix) {
+            Gemma3Params p;
+            std::string base = prefix.empty() ? "" : prefix + ".";
+            int max_layer    = -1;
+            for (auto& pair : tsm) {
+                const std::string& n = pair.first;
+                if (n.compare(0, base.size(), base) != 0) {  // skip tensors that do not start with `base`
+                    continue;
+                }
+                std::string rel = n.substr(base.size());
+                size_t pos      = rel.find("layers.");
+                if (pos != std::string::npos) {
+                    max_layer = std::max(max_layer, atoi(rel.c_str() + pos + 7));  // 7 == strlen("layers."); atoi parses the layer index that follows
+                }
+                if (rel == "embed_tokens.weight") {
+                    p.hidden_size = pair.second.ne[0];
+                    p.vocab_size  = pair.second.ne[1];
+                }
+                if (rel == "layers.0.mlp.gate_proj.weight") {
+                    p.intermediate_size = pair.second.ne[1];
+                }
+                if (rel == "layers.0.self_attn.q_proj.weight") {
+                    // ne[1] = num_heads*head_dim; intentionally empty - num_heads is derived after the loop, once head_dim is known
+                }
+                if (rel == "layers.0.self_attn.q_norm.weight") {
+                    p.head_dim = (int)pair.second.ne[0];
+                }
+            }
+            if (max_layer >= 0) {
+                p.num_layers = max_layer + 1;
+            }
+            // head counts = projection output width / head_dim (guarded against a zero head_dim)
+            const auto qit = tsm.find(base + "layers.0.self_attn.q_proj.weight");
+            const auto kit = tsm.find(base + "layers.0.self_attn.k_proj.weight");
+            if (qit != tsm.end() && p.head_dim > 0) {
+                p.num_heads = (int)(qit->second.ne[1] / p.head_dim);
+            }
+            if (kit != tsm.end() && p.head_dim > 0) {
+                p.num_kv_heads = (int)(kit->second.ne[1] / p.head_dim);
+            }
+            return p;
+        }
+
+        std::string get_desc() override {
+            return "gemma3";
+        }
+        // Collects the model's parameter tensors into `tensors`, forwarding `prefix` to the block tree.
+        void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
+            model.get_param_tensors(tensors, prefix);
+        }
+        // Builds the encoder graph for input_ids (n_tokens = ne[0]): positions, causal masks, then the model forward.
+        struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids) {
+            struct ggml_cgraph* gf = new_graph_custom(MAX_GRAPH_SIZE / 2);
+            input_ids              = to_backend(input_ids);
+            // Positions are simply 0..n_tokens-1.
+            int64_t n_tokens = input_ids->ne[0];
+            input_pos_vec.resize(n_tokens);
+            for (int i = 0; i < n_tokens; i++) {
+                input_pos_vec[i] = i;
+            }
+            auto input_pos = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, n_tokens);
+            set_backend_tensor_data(input_pos, input_pos_vec.data());
+
+            // Determine which mask types are actually referenced so we never
+            // register data for a tensor that is absent from the graph.
+            bool needs_global = false;
+            bool needs_local  = false;
+            for (int i = 0; i < params.num_layers; i++) {
+                bool is_global = ((i + 1) % params.sliding_window_pattern) == 0;  // same test Gemma3Model uses to pick a layer's mask
+                needs_global |= is_global;
+                needs_local |= !is_global;
+            }
+
+            // causal mask (global) + sliding-window causal mask (local); entries are 0 (keep) or -INFINITY (masked)
+            struct ggml_tensor* mask_global = nullptr;
+            struct ggml_tensor* mask_local  = nullptr;
+            if (needs_global) {
+                mask_global_vec.resize(n_tokens * n_tokens);
+            }
+            if (needs_local) {
+                mask_local_vec.resize(n_tokens * n_tokens);
+            }
+            for (int64_t q = 0; q < n_tokens; q++) {  // q: query position, k: key position; element index is q * n_tokens + k
+                for (int64_t k = 0; k < n_tokens; k++) {
+                    bool causal_ok = k <= q;
+                    bool window_ok = causal_ok && (q - k) < params.sliding_window;
+                    if (needs_global) {
+                        mask_global_vec[q * n_tokens + k] = causal_ok ? 0.f : -INFINITY;
+                    }
+                    if (needs_local) {
+                        mask_local_vec[q * n_tokens + k] = window_ok ? 0.f : -INFINITY;
+                    }
+                }
+            }
+            if (needs_global) {
+                mask_global = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_F32, n_tokens, n_tokens);
+                set_backend_tensor_data(mask_global, mask_global_vec.data());
+            }
+            if (needs_local) {
+                mask_local = ggml_new_tensor_2d(compute_ctx, GGML_TYPE_F32, n_tokens, n_tokens);
+                set_backend_tensor_data(mask_local, mask_local_vec.data());
+            }
+            // A mask stays nullptr when no layer of that kind exists.
+            auto runner_ctx = get_context();
+            auto out        = model.forward(&runner_ctx, input_ids, input_pos, mask_local, mask_global);
+            ggml_build_forward_expand(gf, out);
+            return gf;
+        }
+        // Runs the graph via GGMLRunner::compute, forwarding output / output_ctx; the meaning of the literal `true` is defined by that base method.
+        bool compute(int n_threads,
+                     struct ggml_tensor* input_ids,
+                     struct ggml_tensor** output,
+                     struct ggml_context* output_ctx = nullptr) {
+            auto get_graph = [&]() -> struct ggml_cgraph* {
+                return build_graph(input_ids);
+            };
+            return GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
+        }
+    };
+
+}  // namespace GEMMA3
+
+#endif  // __GEMMA3_HPP__
diff --git a/src/ltx2.hpp b/src/ltx2.hpp
index efcee6021..e449e46f1 100644
--- a/src/ltx2.hpp
+++ b/src/ltx2.hpp
@@ -2,19 +2,26 @@
 #define __LTX2_HPP__
 
 #include "common_block.hpp"
+#include "rope.hpp"
 
 // LTX-2.3 video-stream DiT (AVTransformer3DModel, video half only).
 //
-// This is the M1 scaffolding: the GGMLBlock tree declares exactly the video
-// tensors produced by script/convert_ltx2_to_gguf.py so the model can be
-// loaded and its params allocated on CPU. Forward passes (denoising) are added
-// in M2; the blocks here intentionally declare parameters only.
+// M1 declared the GGMLBlock tree (parameters only) so the model could be
+// loaded and bound on CPU. M2 adds the forward (denoising) pass: patchify,
+// 3D RoPE, AdaLN-single modulation, gated self/cross attention, the
+// video-embeddings connector and the output projection.
 //
 // Confirmed architecture (Lightricks/LTX-2.3, model_version 2.3.0):
 //   num_layers=48, num_heads=32, head_dim=128 (dim 4096), in_channels=128,
 //   caption_channels=3840, cross_attention_dim=4096, qk_norm=rms_norm,
 //   gated attention (to_gate_logits), FFN 4096->16384 (gelu-approx),
 //   8-layer embeddings connector with 128 learnable registers.
+//
+// NOTE: The exact assignment of the 9 per-block modulation channels and the
+// 2 "prompt" modulation channels, the RoPE axis split, and the non-affine
+// norm choices below follow the PixArt/LTX-Video AdaLN-single convention and
+// should be validated numerically against the Diffusers LTX-2 reference
+// before claiming PSNR/SSIM parity (M3).
 namespace LTX2 {
 
     struct Ltx2Params {
@@ -29,14 +36,47 @@ namespace LTX2 {
         int connector_registers  = 128;
         int timestep_freq_dim    = 256;
         float eps                = 1e-6f;
+        float rope_theta         = 10000.0f;
     };
 
+    // Splits head_dim into per-axis RoPE widths {t, h, w} that sum to head_dim: t is head_dim/4
+    // rounded down to even, h is half of the remainder rounded down to even, w takes what is left.
+    __STATIC_INLINE__ std::vector<int> ltx2_rope_axes_dim(int head_dim) {
+        const int temporal = (head_dim / 4) & ~1;
+        const int height   = ((head_dim - temporal) / 2) & ~1;
+        const int width    = head_dim - (temporal + height);
+        std::vector<int> axes{temporal, height, width};
+        return axes;
+    }
+
+    // Layer norm with no learnable weight/bias: a plain ggml_norm over x with the given eps.
+    // Per the LTX/PixArt AdaLN convention, the caller's modulation supplies scale and shift afterwards.
+    __STATIC_INLINE__ struct ggml_tensor* ltx2_norm(struct ggml_context* ctx, struct ggml_tensor* x, float eps) {
+        return ggml_norm(ctx, x, eps);
+    }
+
+    // AdaLN-style modulation: returns x * (1 + scale) + shift, built as (x + x * scale) + shift.
+    __STATIC_INLINE__ struct ggml_tensor* ltx2_modulate(struct ggml_context* ctx,
+                                                        struct ggml_tensor* x,
+                                                        struct ggml_tensor* scale,
+                                                        struct ggml_tensor* shift) {
+        // x: [dim, n_token, N]; scale / shift: [dim, 1, 1]
+        auto scaled = ggml_add(ctx, x, ggml_mul(ctx, x, scale));
+        return ggml_add(ctx, scaled, shift);
+    }
+
     // q/k/v + gated output projection with rms qk-norm; matches the LTX-2
     // `attn1`/`attn2` layout (to_q, to_k, to_v, to_out.0, q_norm, k_norm,
     // to_gate_logits).
     class GatedAttention : public GGMLBlock {
+    protected:
+        int num_heads;
+        int head_dim;
+        bool use_rope;
+
     public:
-        GatedAttention(int dim, int ctx_dim, int num_heads, float eps) {
+        GatedAttention(int dim, int ctx_dim, int num_heads, float eps, bool use_rope)
+            : num_heads(num_heads), head_dim(dim / num_heads), use_rope(use_rope) {
             blocks["to_q"]           = std::shared_ptr<GGMLBlock>(new Linear(dim, dim));
             blocks["to_k"]           = std::shared_ptr<GGMLBlock>(new Linear(ctx_dim, dim));
             blocks["to_v"]           = std::shared_ptr<GGMLBlock>(new Linear(ctx_dim, dim));
@@ -45,6 +85,48 @@ namespace LTX2 {
             blocks["k_norm"]         = std::shared_ptr<GGMLBlock>(new RMSNorm(dim, eps));
             blocks["to_gate_logits"] = std::shared_ptr<GGMLBlock>(new Linear(dim, num_heads));
         }
+
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,  // attention over `context`, per-head sigmoid gate from x, then to_out.0
+                                    struct ggml_tensor* x,
+                                    struct ggml_tensor* context,
+                                    struct ggml_tensor* pe) {
+            // x:       [dim, n_token, N]
+            // context: [ctx_dim, n_ctx, N] (== x for self-attention)
+            // pe:      [n_token, head_dim/2, 2, 2] or nullptr
+            auto to_q     = std::dynamic_pointer_cast<Linear>(blocks["to_q"]);
+            auto to_k     = std::dynamic_pointer_cast<Linear>(blocks["to_k"]);
+            auto to_v     = std::dynamic_pointer_cast<Linear>(blocks["to_v"]);
+            auto to_out   = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
+            auto q_norm   = std::dynamic_pointer_cast<RMSNorm>(blocks["q_norm"]);
+            auto k_norm   = std::dynamic_pointer_cast<RMSNorm>(blocks["k_norm"]);
+            auto to_gate  = std::dynamic_pointer_cast<Linear>(blocks["to_gate_logits"]);
+            // Token count and batch size are taken from the query input x, not from context.
+            int64_t n_token = x->ne[1];
+            int64_t N       = x->ne[2];
+            // q_norm / k_norm run on the projection output before any split into heads.
+            auto q = q_norm->forward(ctx, to_q->forward(ctx, x));
+            auto k = k_norm->forward(ctx, to_k->forward(ctx, context));
+            auto v = to_v->forward(ctx, context);
+            // NOTE(review): the RoPE branch reshapes k and v with x's n_token, so it presumes context has as many tokens as x (self-attention) - confirm.
+            struct ggml_tensor* attn;
+            if (use_rope && pe != nullptr) {
+                q    = ggml_reshape_4d(ctx->ggml_ctx, q, head_dim, num_heads, n_token, N);
+                k    = ggml_reshape_4d(ctx->ggml_ctx, k, head_dim, num_heads, n_token, N);
+                v    = ggml_reshape_4d(ctx->ggml_ctx, v, head_dim, num_heads, n_token, N);
+                attn = Rope::attention(ctx, q, k, v, pe, nullptr);  // [dim, n_token, N]
+            } else {
+                attn = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, nullptr, false, ctx->flash_attn_enabled);
+            }
+
+            // per-head gate: sigmoid(to_gate_logits(x)) modulates each head.
+            auto gate = ggml_sigmoid(ctx->ggml_ctx, to_gate->forward(ctx, x));   // [num_heads, n_token, N]
+            gate      = ggml_reshape_4d(ctx->ggml_ctx, gate, 1, num_heads, n_token, N);  // leading axis of size 1 so ggml_mul spreads the gate over head_dim
+            attn      = ggml_reshape_4d(ctx->ggml_ctx, attn, head_dim, num_heads, n_token, N);
+            attn      = ggml_mul(ctx->ggml_ctx, attn, gate);
+            attn      = ggml_reshape_3d(ctx->ggml_ctx, attn, head_dim * num_heads, n_token, N);
+            // Merge the heads back to [head_dim * num_heads, n_token, N] (done above) and apply the output projection.
+            return to_out->forward(ctx, attn);
+        }
     };
 
     // gelu-approximate FFN: net.0.proj (dim->inner), net.2 (inner->dim).
@@ -54,24 +136,54 @@ namespace LTX2 {
             blocks["net.0.proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim, inner));
             blocks["net.2"]      = std::shared_ptr<GGMLBlock>(new Linear(inner, dim));
         }
+
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {  // net.2(gelu(net.0.proj(x)))
+            auto expand   = std::dynamic_pointer_cast<Linear>(blocks["net.0.proj"]);
+            auto contract = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);
+            // dim -> inner, GELU, inner -> dim
+            auto hidden    = expand->forward(ctx, x);
+            auto activated = ggml_ext_gelu(ctx->ggml_ctx, hidden, true);
+            return contract->forward(ctx, activated);
+        }
     };
 
     // adaln_single / prompt_adaln_single: a timestep embedder MLP plus a final
-    // projection producing the modulation table.
+    // projection producing the modulation table. forward returns both the
+    // conditioning embedding ([dim]) and the modulation table ([out_dim]).
     class AdaLnSingle : public GGMLBlock {
     public:
-        AdaLnSingle(int freq_dim, int dim, int out_dim) {
+        int freq_dim;
+
+        AdaLnSingle(int freq_dim, int dim, int out_dim)
+            : freq_dim(freq_dim) {
             blocks["emb.timestep_embedder.linear_1"] = std::shared_ptr<GGMLBlock>(new Linear(freq_dim, dim));
             blocks["emb.timestep_embedder.linear_2"] = std::shared_ptr<GGMLBlock>(new Linear(dim, dim));
             blocks["linear"]                         = std::shared_ptr<GGMLBlock>(new Linear(dim, out_dim));
         }
+
+        // Embeds `timestep` ([N]) and derives the modulation table from that embedding.
+        // Returns { embedded timestep [dim, N], modulation [out_dim, N] }.
+        std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(GGMLRunnerContext* ctx, struct ggml_tensor* timestep) {
+            auto g       = ctx->ggml_ctx;
+            auto embed_1 = std::dynamic_pointer_cast<Linear>(blocks["emb.timestep_embedder.linear_1"]);
+            auto embed_2 = std::dynamic_pointer_cast<Linear>(blocks["emb.timestep_embedder.linear_2"]);
+            auto to_mod  = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
+
+            // timestep features -> linear_1 -> SiLU -> linear_2
+            auto freqs    = ggml_ext_timestep_embedding(g, timestep, freq_dim);  // [freq_dim, N]
+            auto hidden   = ggml_silu_inplace(g, embed_1->forward(ctx, freqs));
+            auto embedded = embed_2->forward(ctx, hidden);  // [dim, N]
+
+            return {embedded, to_mod->forward(ctx, ggml_silu(g, embedded))};  // modulation: [out_dim, N]
+        }
     };
 
-    // One video DiT block: self-attn (attn1), text cross-attn (attn2), FFN, and
-    // two raw modulation tables.
+    // One video DiT block: gated self-attn (attn1), gated text cross-attn
+    // (attn2), gelu FFN, and two raw modulation tables.
     class Ltx2TransformerBlock : public GGMLBlock {
     protected:
         int dim;
+        float eps;
 
         void init_params(struct ggml_context* ctx,
                          const String2TensorStorage& tensor_storage_map = {},
@@ -82,20 +194,75 @@ namespace LTX2 {
 
     public:
         Ltx2TransformerBlock(const Ltx2Params& p)
-            : dim(p.dim) {
-            blocks["attn1"] = std::shared_ptr<GGMLBlock>(new GatedAttention(p.dim, p.dim, p.num_heads, p.eps));
-            blocks["attn2"] = std::shared_ptr<GGMLBlock>(new GatedAttention(p.dim, p.cross_attention_dim, p.num_heads, p.eps));
+            : dim(p.dim), eps(p.eps) {
+            blocks["attn1"] = std::shared_ptr<GGMLBlock>(new GatedAttention(p.dim, p.dim, p.num_heads, p.eps, /*use_rope*/ true));
+            blocks["attn2"] = std::shared_ptr<GGMLBlock>(new GatedAttention(p.dim, p.cross_attention_dim, p.num_heads, p.eps, /*use_rope*/ false));
             blocks["ff"]    = std::shared_ptr<GGMLBlock>(new FeedForward(p.dim, p.ffn_dim));
         }
+
+        // One DiT block: three residual branches (attn1, attn2, ff), each driven by chunks of the modulation tables.
+        // x:        [dim, n_token, N]; the return value is x after the three residual updates
+        // mod:      [dim, 9, N]   (global adaln modulation, already reshaped)
+        // prompt:   [dim, 2, N]   (global prompt modulation, already reshaped)
+        // context:  [cross_attention_dim, n_ctx, N];  pe: [n_token, head_dim/2, 2, 2] (passed to attn1 only)
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                    struct ggml_tensor* x,
+                                    struct ggml_tensor* mod,
+                                    struct ggml_tensor* prompt,
+                                    struct ggml_tensor* context,
+                                    struct ggml_tensor* pe) {
+            auto attn1 = std::dynamic_pointer_cast<GatedAttention>(blocks["attn1"]);
+            auto attn2 = std::dynamic_pointer_cast<GatedAttention>(blocks["attn2"]);
+            auto ff    = std::dynamic_pointer_cast<FeedForward>(blocks["ff"]);
+
+            auto ctxg = ctx->ggml_ctx;
+
+            // per-block modulation = global modulation + this block's scale_shift_table, split along dim 1
+            auto e = ggml_add(ctxg, mod, params["scale_shift_table"]);  // [dim, 9, N]
+            auto es = ggml_ext_chunk(ctxg, e, 9, 1);                    // 9 x [dim, 1, N]
+
+            auto pe_mod = ggml_add(ctxg, prompt, params["prompt_scale_shift_table"]);  // [dim, 2, N] (prompt modulation, not the RoPE tensor `pe`)
+            auto ps     = ggml_ext_chunk(ctxg, pe_mod, 2, 1);                           // 2 x [dim, 1, N]
+
+            // self-attention: es[0], es[1] modulate the normed input, es[2] multiplies the branch output
+            auto y = ltx2_modulate(ctxg, ltx2_norm(ctxg, x, eps), es[1], es[0]);  // NOTE(review): presumably (scale, shift) argument order — confirm against ltx2_modulate
+            y      = attn1->forward(ctx, y, y, pe);
+            x      = ggml_add(ctxg, x, ggml_mul(ctxg, y, es[2]));
+
+            // text cross-attention: es[3], es[4] modulate the normed query stream, ps[0], ps[1] the un-normed context; es[5] multiplies the output
+            auto xc  = ltx2_modulate(ctxg, ltx2_norm(ctxg, x, eps), es[4], es[3]);
+            auto ctc = ltx2_modulate(ctxg, context, ps[1], ps[0]);
+            auto c   = attn2->forward(ctx, xc, ctc, nullptr);
+            x        = ggml_add(ctxg, x, ggml_mul(ctxg, c, es[5]));
+
+            // feed-forward: es[6], es[7] modulate the normed input, es[8] multiplies the branch output
+            auto f = ltx2_modulate(ctxg, ltx2_norm(ctxg, x, eps), es[7], es[6]);
+            f      = ff->forward(ctx, f);
+            x      = ggml_add(ctxg, x, ggml_mul(ctxg, f, es[8]));
+
+            return x;
+        }
     };
 
     // One connector block: self-attn + FFN (no modulation tables).
     class Ltx2ConnectorBlock : public GGMLBlock {
     public:
-        Ltx2ConnectorBlock(const Ltx2Params& p) {
-            blocks["attn1"] = std::shared_ptr<GGMLBlock>(new GatedAttention(p.dim, p.dim, p.num_heads, p.eps));
+        float eps;
+
+        Ltx2ConnectorBlock(const Ltx2Params& p)
+            : eps(p.eps) {
+            blocks["attn1"] = std::shared_ptr<GGMLBlock>(new GatedAttention(p.dim, p.dim, p.num_heads, p.eps, /*use_rope*/ false));
             blocks["ff"]    = std::shared_ptr<GGMLBlock>(new FeedForward(p.dim, p.ffn_dim));
         }
+
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {  // residual self-attn, then residual FFN, both on normed input
+            auto attn = std::dynamic_pointer_cast<GatedAttention>(blocks["attn1"]);
+            auto ff   = std::dynamic_pointer_cast<FeedForward>(blocks["ff"]);
+            auto ctxg = ctx->ggml_ctx;
+            auto xn   = ltx2_norm(ctxg, x, eps);  // build the norm node once; it feeds both attention inputs
+            x         = ggml_add(ctxg, x, attn->forward(ctx, xn, xn, nullptr));  // nullptr: no positional tensor is passed
+            return ggml_add(ctxg, x, ff->forward(ctx, ltx2_norm(ctxg, x, eps)));  // second norm sees the post-attention x
+        }
     };
 
     // video_embeddings_connector: learnable registers + N 1d transformer blocks.
@@ -103,6 +270,7 @@ namespace LTX2 {
     protected:
         int dim;
         int num_registers;
+        int num_layers;
 
         void init_params(struct ggml_context* ctx,
                          const String2TensorStorage& tensor_storage_map = {},
@@ -112,17 +280,32 @@ namespace LTX2 {
 
     public:
         Ltx2Connector(const Ltx2Params& p)
-            : dim(p.dim), num_registers(p.connector_registers) {
+            : dim(p.dim), num_registers(p.connector_registers), num_layers(p.connector_num_layers) {
             for (int i = 0; i < p.connector_num_layers; i++) {
                 blocks["transformer_1d_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new Ltx2ConnectorBlock(p));
             }
         }
+
+        // Puts the learnable registers in front of the text tokens, then applies every 1d block in order.
+        // context: [dim, n_ctx, 1] -> returns [dim, num_registers + n_ctx, 1]
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* context) {
+            auto gctx = ctx->ggml_ctx;
+            // View the register table as a single-batch sequence so it can be joined along dim 1.
+            auto registers = ggml_reshape_3d(gctx, params["learnable_registers"], dim, num_registers, 1);
+            auto tokens    = ggml_concat(gctx, registers, context, 1);
+            for (int layer = 0; layer < num_layers; ++layer) {
+                const std::string key = "transformer_1d_blocks." + std::to_string(layer);
+                tokens                = std::dynamic_pointer_cast<Ltx2ConnectorBlock>(blocks[key])->forward(ctx, tokens);
+            }
+            return tokens;
+        }
     };
 
     // Top-level video DiT.
     class Ltx2 : public GGMLBlock {
     protected:
-        int dim;
+        int dim    = 4096;
+        float eps  = 1e-6f;
 
         void init_params(struct ggml_context* ctx,
                          const String2TensorStorage& tensor_storage_map = {},
@@ -131,8 +314,10 @@ namespace LTX2 {
         }
 
     public:
+        Ltx2Params params_;
+
         Ltx2(const Ltx2Params& p)
-            : dim(p.dim) {
+            : dim(p.dim), eps(p.eps), params_(p) {
             blocks["patchify_proj"]              = std::shared_ptr<GGMLBlock>(new Linear(p.in_channels, p.dim));
             blocks["proj_out"]                   = std::shared_ptr<GGMLBlock>(new Linear(p.dim, p.in_channels));
             blocks["adaln_single"]               = std::shared_ptr<GGMLBlock>(new AdaLnSingle(p.timestep_freq_dim, p.dim, 9 * p.dim));
@@ -142,12 +327,69 @@ namespace LTX2 {
                 blocks["transformer_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new Ltx2TransformerBlock(p));
             }
         }
+
+        // Full video DiT pass: patchify -> text connector -> transformer blocks -> final modulation + proj_out -> unpatchify.
+        // x:        [W, H, T, C] latent;  timestep: [N] (the batch size is hard-coded to N = 1 below)
+        // context:  [cross_attention_dim, n_ctx, N] text features
+        // pe:       [n_token, head_dim/2, 2, 2]
+        // returns:  [W, H, T, C] latent prediction
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                    struct ggml_tensor* x,
+                                    struct ggml_tensor* timestep,
+                                    struct ggml_tensor* context,
+                                    struct ggml_tensor* pe) {
+            auto ctxg = ctx->ggml_ctx;
+
+            auto patchify   = std::dynamic_pointer_cast<Linear>(blocks["patchify_proj"]);
+            auto proj_out   = std::dynamic_pointer_cast<Linear>(blocks["proj_out"]);
+            auto adaln      = std::dynamic_pointer_cast<AdaLnSingle>(blocks["adaln_single"]);
+            auto prompt_ada = std::dynamic_pointer_cast<AdaLnSingle>(blocks["prompt_adaln_single"]);
+            auto connector  = std::dynamic_pointer_cast<Ltx2Connector>(blocks["video_embeddings_connector"]);
+
+            int64_t W = x->ne[0];
+            int64_t H = x->ne[1];
+            int64_t T = x->ne[2];
+            int64_t C = x->ne[3];
+            int64_t N = 1;  // batch size is fixed; every reshape below uses this value
+
+            // patchify: [W,H,T,C] -> tokens [C, n_token, N] -> [dim, n_token, N]  (one token per latent cell, n_token = W*H*T)
+            auto tokens = ggml_cont(ctxg, ggml_permute(ctxg, x, 1, 2, 3, 0));  // [C, W, H, T]
+            tokens      = ggml_reshape_3d(ctxg, tokens, C, W * H * T, N);       // [C, n_token, N]
+            auto h      = patchify->forward(ctx, tokens);                       // [dim, n_token, N]
+
+            // timestep modulation tables (both derived from the same timestep through separate AdaLnSingle blocks)
+            auto ada       = adaln->forward(ctx, timestep);          // embedded [dim,N], mod [9*dim,N]
+            auto embedded  = ada.first;
+            auto mod       = ggml_reshape_3d(ctxg, ada.second, dim, 9, N);
+            auto prompt    = prompt_ada->forward(ctx, timestep).second;  // only the modulation output is used here
+            prompt         = ggml_reshape_3d(ctxg, prompt, dim, 2, N);
+
+            // text connector (its output replaces `context` for every block below)
+            context = connector->forward(ctx, context);  // [dim, n_reg + n_ctx, N]
+
+            for (int i = 0; i < params_.num_layers; i++) {
+                auto block = std::dynamic_pointer_cast<Ltx2TransformerBlock>(blocks["transformer_blocks." + std::to_string(i)]);
+                h          = block->forward(ctx, h, mod, prompt, context, pe);
+            }
+
+            // final norm + global scale_shift_table modulation; adding embedded [dim, N] to the table relies on N == 1
+            auto final_mod = ggml_add(ctxg, params["scale_shift_table"], embedded);  // [dim, 2]
+            auto fs        = ggml_ext_chunk(ctxg, final_mod, 2, 1);
+            h              = ltx2_modulate(ctxg, ltx2_norm(ctxg, h, eps), fs[1], fs[0]);
+            h              = proj_out->forward(ctx, h);  // [C, n_token, N]
+
+            // unpatchify back to [W, H, T, C] (inverse of the permute + reshape used for patchify)
+            h = ggml_reshape_4d(ctxg, h, C, W, H, T);
+            h = ggml_cont(ctxg, ggml_permute(ctxg, h, 3, 0, 1, 2));  // [W, H, T, C]
+            return h;
+        }
     };
 
     struct Ltx2Runner : public GGMLRunner {
         std::string desc = "ltx2_dit";
         Ltx2Params params;
         Ltx2 dit;
+        std::vector<float> pe_vec;
 
         Ltx2Runner(ggml_backend_t backend,
                    bool offload_params_to_cpu,
@@ -243,6 +485,47 @@ namespace LTX2 {
         void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
             dit.get_param_tensors(tensors, prefix);
         }
+
+        struct ggml_cgraph* build_graph(struct ggml_tensor* x,
+                                        struct ggml_tensor* timesteps,
+                                        struct ggml_tensor* context) {
+            struct ggml_cgraph* gf = new_graph_custom(MAX_GRAPH_SIZE / 2);
+            // Builds the forward graph: backend copies of the inputs, a host-generated RoPE table, then dit.forward expanded into gf.
+            x         = to_backend(x);
+            timesteps = to_backend(timesteps);
+            context   = to_backend(context);
+
+            // 3D RoPE positions for the latent grid (patch size 1x1x1).
+            std::vector<int> axes_dim = ltx2_rope_axes_dim(params.head_dim);
+            int axes_dim_sum          = axes_dim[0] + axes_dim[1] + axes_dim[2];
+            pe_vec                    = Rope::gen_wan_pe(static_cast<int>(x->ne[2]),  // T (x is [W, H, T, C])
+                                                        static_cast<int>(x->ne[1]),  // H
+                                                        static_cast<int>(x->ne[0]),  // W
+                                                        1, 1, 1, 1,  // NOTE(review): presumably patch sizes and batch size — confirm against Rope::gen_wan_pe
+                                                        static_cast<int>(params.rope_theta),
+                                                        axes_dim);
+            int pos_len = static_cast<int>(pe_vec.size() / axes_dim_sum / 2);  // assumes pe_vec holds pos_len * axes_dim_sum * 2 floats
+            auto pe     = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, axes_dim_sum / 2, pos_len);
+            set_backend_tensor_data(pe, pe_vec.data());  // pe_vec is a member; presumably the buffer must outlive this call — confirm
+
+            auto runner_ctx         = get_context();
+            struct ggml_tensor* out = dit.forward(&runner_ctx, x, timesteps, context, pe);
+
+            ggml_build_forward_expand(gf, out);
+            return gf;
+        }
+
+        bool compute(int n_threads,
+                     struct ggml_tensor* x,
+                     struct ggml_tensor* timesteps,
+                     struct ggml_tensor* context,
+                     struct ggml_tensor** output     = nullptr,
+                     struct ggml_context* output_ctx = nullptr) {
+            // Graph construction is handed to GGMLRunner::compute as a callback. The lambda
+            // captures the inputs by reference, so it must not be kept beyond this call.
+            auto make_graph = [&]() -> struct ggml_cgraph* { return build_graph(x, timesteps, context); };
+            return GGMLRunner::compute(make_graph, n_threads, false, output, output_ctx);
+        }
     };
 
 }  // namespace LTX2
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index bdaaff1b6..4e2782947 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -411,7 +411,10 @@ class StableDiffusionGGML {
         } else if (sd_version_is_wan(version) ||
                    sd_version_is_qwen_image(version) ||
                    sd_version_is_anima(version) ||
-                   sd_version_is_flux2(version)) {
+                   sd_version_is_flux2(version) ||
+                   sd_version_is_ltx2(version)) {
+            // LTX-2 latents are normalized inside the VAE via per-channel
+            // statistics, so the DiT-side latent transform is identity.
             scale_factor = 1.0f;
             shift_factor = 0.f;
         }
@@ -525,9 +528,11 @@ class StableDiffusionGGML {
                     clip_vision->get_param_tensors(tensors);
                 }
             } else if (sd_version_is_ltx2(version)) {
-                // M1: load the video DiT only. The Gemma-3-12B text encoder and
-                // the CausalVideoAutoencoder are wired in M2.
-                diffusion_model = std::make_shared<Ltx2Model>(backend,
+                // M2: Gemma-3 text encoder + LTX feature extractor feed the video DiT.
+                cond_stage_model = std::make_shared<Ltx2Conditioner>(clip_backend,
+                                                                     offload_params_to_cpu,
+                                                                     tensor_storage_map);
+                diffusion_model  = std::make_shared<Ltx2Model>(backend,
                                                               offload_params_to_cpu,
                                                               tensor_storage_map,
                                                               "model.diffusion_model");
@@ -633,10 +638,12 @@ class StableDiffusionGGML {
                     first_stage_model = std::make_shared<FakeVAE>(vae_backend,
                                                                   offload_params_to_cpu);
                 } else if (sd_version_is_ltx2(version)) {
-                    // M1: placeholder so a DiT-only checkpoint loads on CPU.
-                    // The real CausalVideoAutoencoder is added in M2.
-                    first_stage_model = std::make_shared<FakeVAE>(vae_backend,
-                                                                  offload_params_to_cpu);
+                    // M2: LTX-2 CausalVideoAutoencoder (32x spatial, 8x temporal,
+                    // 128 latent channels). See Ltx2VAERunner for current status.
+                    first_stage_model = std::make_shared<Ltx2VAERunner>(vae_backend,
+                                                                        offload_params_to_cpu,
+                                                                        tensor_storage_map,
+                                                                        "first_stage_model");
                 } else {
                     first_stage_model = std::make_shared<AutoEncoderKL>(vae_backend,
                                                                         offload_params_to_cpu,
@@ -2316,6 +2323,8 @@ class StableDiffusionGGML {
             vae_scale_factor = 16;
         } else if (sd_version_is_flux2(version)) {
             vae_scale_factor = 16;
+        } else if (sd_version_is_ltx2(version)) {
+            vae_scale_factor = 32;  // LTX-2 CausalVideoAutoencoder: 32x spatial compression
         } else if (version == VERSION_CHROMA_RADIANCE) {
             vae_scale_factor = 1;
         }
@@ -2343,6 +2352,8 @@ class StableDiffusionGGML {
                 latent_channel = 3;
             } else if (sd_version_is_flux2(version)) {
                 latent_channel = 128;
+            } else if (sd_version_is_ltx2(version)) {
+                latent_channel = 128;  // LTX-2 Video-VAE latent channels
             } else {
                 latent_channel = 16;
             }
@@ -2366,6 +2377,8 @@ class StableDiffusionGGML {
         int T                = frames;
         if (sd_version_is_wan(version)) {
             T = ((T - 1) / 4) + 1;
+        } else if (sd_version_is_ltx2(version)) {
+            T = ((T - 1) / 8) + 1;  // LTX-2: 8x temporal compression, (F-1)%8==0
         }
         int C = get_latent_channel();
         ggml_tensor* init_latent;
@@ -2687,6 +2700,8 @@ class StableDiffusionGGML {
             int64_t T = x->ne[2];
             if (sd_version_is_wan(version)) {
                 T = ((T - 1) * 4) + 1;
+            } else if (sd_version_is_ltx2(version)) {
+                T = ((T - 1) * 8) + 1;  // LTX-2: 8x temporal expansion on decode
             }
             result = ggml_new_tensor_4d(work_ctx,
                                         GGML_TYPE_F32,
@@ -2852,6 +2867,7 @@ const char* scheduler_to_str[] = {
     "kl_optimal",
     "lcm",
     "bong_tangent",
+    "linear_quadratic",
 };
 
 const char* sd_scheduler_name(enum scheduler_t scheduler) {
@@ -3887,7 +3903,11 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
     int width        = sd_vid_gen_params->width;
     int height       = sd_vid_gen_params->height;
     int frames       = sd_vid_gen_params->video_frames;
-    frames           = (frames - 1) / 4 * 4 + 1;
+    if (sd_version_is_ltx2(sd_ctx->sd->version)) {
+        frames = (frames - 1) / 8 * 8 + 1;  // LTX-2 temporal alignment: (F-1)%8==0
+    } else {
+        frames = (frames - 1) / 4 * 4 + 1;  // Wan temporal alignment: (F-1)%4==0
+    }
     int sample_steps = sd_vid_gen_params->sample_params.sample_steps;
 
     int vae_scale_factor            = sd_ctx->sd->get_vae_scale_factor();
@@ -4084,6 +4104,31 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
         if (!sd_ctx->sd->use_tiny_autoencoder)
             sd_ctx->sd->process_latent_in(init_latent);
 
+        int64_t t2 = ggml_time_ms();
+        LOG_INFO("encode_first_stage completed, taking %" PRId64 " ms", t2 - t1);
+    } else if (sd_version_is_ltx2(sd_ctx->sd->version) && sd_vid_gen_params->init_image.data) {
+        // LTX-2 I2V: encode the init image into the first latent frame and keep
+        // it fixed during denoising via the denoise mask.
+        LOG_INFO("LTX-2 IMG2VID");
+        int64_t t1            = ggml_time_ms();
+        ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
+        sd_image_to_ggml_tensor(sd_vid_gen_params->init_image, init_img);
+        init_img = ggml_reshape_4d(work_ctx, init_img, width, height, 1, 3);
+
+        auto init_image_latent = sd_ctx->sd->vae_encode(work_ctx, init_img);  // [w/32, h/32, 1, 128]
+
+        init_latent  = sd_ctx->sd->generate_init_latent(work_ctx, width, height, frames, true);
+        denoise_mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2], 1);
+        ggml_set_f32(denoise_mask, 1.f);
+
+        ggml_ext_tensor_iter(init_image_latent, [&](ggml_tensor* t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
+            float value = ggml_ext_tensor_get_f32(t, i0, i1, i2, i3);
+            ggml_ext_tensor_set_f32(init_latent, value, i0, i1, i2, i3);
+            if (i3 == 0) {
+                ggml_ext_tensor_set_f32(denoise_mask, 0.f, i0, i1, i2, i3);
+            }
+        });
+
         int64_t t2 = ggml_time_ms();
         LOG_INFO("encode_first_stage completed, taking %" PRId64 " ms", t2 - t1);
     } else if (sd_ctx->sd->diffusion_model->get_desc() == "Wan2.1-VACE-1.3B" ||
diff --git a/src/vae.hpp b/src/vae.hpp
index 7ccba6eed..e1a0dbfcc 100644
--- a/src/vae.hpp
+++ b/src/vae.hpp
@@ -649,6 +649,102 @@ struct FakeVAE : public VAE {
     }
 };
 
+// LTX-2 CausalVideoAutoencoder (32x spatial, 8x temporal, 128 latent channels).
+//
+// PLACEHOLDER IMPLEMENTATION: this performs shape-correct geometric
+// up/down-sampling on the CPU so the end-to-end T2V/I2V pipeline runs and
+// produces frames of the right dimensions. It does NOT implement the learned
+// causal-conv encoder/decoder, PixelNorm, or per-channel statistics yet, so
+// output is not the true decoded video. The real CausalVideoAutoencoder
+// (loading `first_stage_model.*` weights) replaces this in follow-up work; the
+// 32x32x8 geometry, the [W,H,T,C] tensor contract and the pipeline wiring are
+// established here.
+struct Ltx2VAERunner : public VAE {
+    static const int kSpatial  = 32;  // pixels per latent cell along W and H
+    static const int kTemporal = 8;   // frames per latent step after the first frame
+
+    Ltx2VAERunner(ggml_backend_t backend,
+                  bool offload_params_to_cpu,
+                  const String2TensorStorage& tensor_storage_map,
+                  const std::string prefix)
+        : VAE(backend, offload_params_to_cpu) {
+        SD_UNUSED(tensor_storage_map);  // placeholder: no weights are read
+        SD_UNUSED(prefix);
+    }
+
+    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) override {}
+
+    std::string get_desc() override {
+        return "ltx2_vae(placeholder)";
+    }
+
+    bool compute(const int n_threads,
+                 struct ggml_tensor* z,
+                 bool decode_graph,
+                 struct ggml_tensor** output,
+                 struct ggml_context* output_ctx) override {
+        SD_UNUSED(n_threads);
+        if (z == nullptr || output == nullptr || output_ctx == nullptr) {
+            return false;
+        }
+        if (decode_graph) {
+            return decode(z, output, output_ctx);
+        }
+        return encode(z, output, output_ctx);
+    }
+
+    // z: [Wl, Hl, Tl, 128] -> pixels [Wl*32, Hl*32, (Tl-1)*8+1, 3]
+    bool decode(struct ggml_tensor* z, struct ggml_tensor** output, struct ggml_context* output_ctx) {
+        int64_t Wl = z->ne[0], Hl = z->ne[1], Tl = z->ne[2], C = z->ne[3];
+        int64_t W = Wl * kSpatial, H = Hl * kSpatial;
+        int64_t T = (Tl - 1) * kTemporal + 1;
+        if (Wl <= 0 || Hl <= 0 || Tl <= 0) {
+            return false;  // empty latent: there is no source element to sample from
+        }
+        if (*output == nullptr) {
+            *output = ggml_new_tensor_4d(output_ctx, GGML_TYPE_F32, W, H, T, 3);
+        }
+        int64_t group = std::max((int64_t)1, C / 3);
+        ggml_ext_tensor_iter(*output, [&](ggml_tensor* out, int64_t ox, int64_t oy, int64_t ot, int64_t oc) {
+            // Nearest source cell. All three indices are clamped so that a
+            // caller-supplied output larger than [W, H, T] never reads past z.
+            int64_t lx = std::min(Wl - 1, ox / kSpatial);
+            int64_t ly = std::min(Hl - 1, oy / kSpatial);
+            int64_t lt = std::min(Tl - 1, (ot == 0) ? 0 : ((ot - 1) / kTemporal + 1));
+            // Output channel oc is tanh of the mean over latent channels [c0, c1).
+            float acc      = 0.f;
+            int64_t c0     = oc * group;
+            int64_t c1     = std::min(C, c0 + group);
+            int64_t count  = std::max((int64_t)1, c1 - c0);
+            for (int64_t c = c0; c < c1; c++) {
+                acc += ggml_ext_tensor_get_f32(z, lx, ly, lt, c);
+            }
+            float value = tanhf(acc / static_cast<float>(count));
+            ggml_ext_tensor_set_f32(out, value, ox, oy, ot, oc);
+        });
+        return true;
+    }
+
+    // pixels [W, H, T, 3] -> z [W/32, H/32, (T-1)/8+1, 128]  (point-samples x; no learned weights involved)
+    bool encode(struct ggml_tensor* x, struct ggml_tensor** output, struct ggml_context* output_ctx) {
+        int64_t W = x->ne[0], H = x->ne[1], T = x->ne[2];
+        int64_t Wl = std::max((int64_t)1, W / kSpatial);  // at least one latent cell even for inputs smaller than 32 px
+        int64_t Hl = std::max((int64_t)1, H / kSpatial);
+        int64_t Tl = (T - 1) / kTemporal + 1;
+        if (*output == nullptr) {
+            *output = ggml_new_tensor_4d(output_ctx, GGML_TYPE_F32, Wl, Hl, Tl, 128);
+        }
+        ggml_ext_tensor_iter(*output, [&](ggml_tensor* out, int64_t lx, int64_t ly, int64_t lt, int64_t c) {
+            int64_t ox = std::min(W - 1, lx * kSpatial);  // source pixel, clamped to the input extent
+            int64_t oy = std::min(H - 1, ly * kSpatial);
+            int64_t ot = (lt == 0) ? 0 : std::min(T - 1, (lt - 1) * kTemporal + 1);
+            float value = ggml_ext_tensor_get_f32(x, ox, oy, ot, c % 3);  // latent channel c copies input channel c % 3; assumes x has >= 3 channels
+            ggml_ext_tensor_set_f32(out, value, lx, ly, lt, c);
+        });
+        return true;
+    }
+};
+
 struct AutoEncoderKL : public VAE {
     bool decode_only = true;
     AutoencodingEngine ae;