From 02edce511dece5b8b403c4da23f7e63568839585 Mon Sep 17 00:00:00 2001
From: "Shiyi Zheng (from Dev Box)" <shzhen@microsoft.com>
Date: Tue, 23 Jun 2026 10:58:52 +0800
Subject: [PATCH] examples: add nlpconnect/vit-gpt2-image-captioning
 image-to-text recipe (composite)

Ships a composite encoder-decoder recipe pair for nlpconnect/vit-gpt2-image-captioning
at task=image-to-text. Per the composite-PR contract, encoder + decoder ship as
ONE PR because they must be deployed together to form a runnable pipeline.

Files:
- image-to-text_encoder_config.json - ViT encoder, 224x224 RGB -> last_hidden_state
                                       (exported under the name encoder_hidden_states)
- image-to-text_decoder_config.json - GPT2 decoder with KV-cache, cross-attention
                                       to encoder_hidden_states

Goal-ladder verdict (CPU, per-half):
- Encoder: L0 PASS (366 ops/11 unique, 86M params, 143KB+343MB ext)
           L1-CPU PASS (69.36 ms/iter)
           L2 PASS (cosine=1.0, max_abs=2e-6)
           L3 CLI-BLOCKED ('No dataset provided and no default for task image-to-text')
- Decoder: L0 PASS (803 ops/22 unique, 153M params, 287KB+730MB ext)
           L1-CPU PASS (40.39 ms/iter)
           L2 DEFERRED-HARNESS (DynamicCache<->past_KV bridge; marian-005 precedent)
           L3 CLI-BLOCKED (same root cause)

Note: the L1-CPU timings above differ from those recorded in the recipes' "_note"
fields (encoder 62.38 ms/iter, decoder 38.58 ms/iter); presumably these are from
separate runs - confirm which figures are authoritative.

DML/QNN/OpenVINO HOST-BLOCKED. Encoder output last_hidden_state matches decoder
encoder_hidden_states input via composite alias-injection in
src/winml/modelkit/models/winml/feature_extraction.py.

Optimum-coverage: VENDOR-COVERED on image-to-text via winml WinMLEncoderDecoderModel
override (HTP-friendly KV-cache shape); pure-data recipe pair, no per-architecture
code change in this PR.

Producer notes from running the recipe are kept in
research/adding-model-support/model_knowledge/vision_encoder_decoder.json on the
skills-poc working branch (not yet landed on main; pending a separate
skill-research PR).
---
 examples/recipes/README.md                    |   1 +
 .../image-to-text_decoder_config.json         | 483 ++++++++++++++++++
 .../image-to-text_encoder_config.json         |  49 ++
 3 files changed, 533 insertions(+)
 create mode 100644 examples/recipes/nlpconnect_vit-gpt2-image-captioning/image-to-text_decoder_config.json
 create mode 100644 examples/recipes/nlpconnect_vit-gpt2-image-captioning/image-to-text_encoder_config.json

diff --git a/examples/recipes/README.md b/examples/recipes/README.md
index caaa2c15f..c4fab04c3 100644
--- a/examples/recipes/README.md
+++ b/examples/recipes/README.md
@@ -25,6 +25,7 @@ Each *(model, task)* includes:
 | google/vit-base-patch16-224-in21k | image-feature-extraction |
 | laion/CLIP-ViT-B-32-laion2B-s34B-b79K | feature-extraction |
 | microsoft/rad-dino | image-feature-extraction |
+| nlpconnect/vit-gpt2-image-captioning | image-to-text |
 | openai/clip-vit-base-patch16 | feature-extraction |
 | sentence-transformers/all-MiniLM-L6-v2 | feature-extraction |
 | sentence-transformers/all-MiniLM-L6-v2 | sentence-similarity |
diff --git a/examples/recipes/nlpconnect_vit-gpt2-image-captioning/image-to-text_decoder_config.json b/examples/recipes/nlpconnect_vit-gpt2-image-captioning/image-to-text_decoder_config.json
new file mode 100644
index 000000000..e1a00c96e
--- /dev/null
+++ b/examples/recipes/nlpconnect_vit-gpt2-image-captioning/image-to-text_decoder_config.json
@@ -0,0 +1,483 @@
+{
+  "_note": "VED decoder companion to image-to-text_encoder_config.json. 12-layer GPT-2 with 24 past_KV inputs + cache_position + cross-attention via encoder_hidden_states[1,197,768]. Verified end-to-end 2026-06-22: build 82.0s, L0 PASS (803 nodes, 28 inputs), L1-CPU PASS (38.58ms/iter, 25.92 sps).",
+  "export": {
+    "opset_version": 17,
+    "batch_size": 1,
+    "export_params": true,
+    "do_constant_folding": true,
+    "verbose": false,
+    "dynamo": false,
+    "enable_hierarchy_tags": true,
+    "clean_onnx": false,
+    "hierarchy_tag_format": "full",
+    "input_tensors": [
+      {
+        "name": "decoder_input_ids",
+        "dtype": "int32",
+        "shape": [
+          1,
+          1
+        ],
+        "value_range": [
+          0,
+          50257
+        ]
+      },
+      {
+        "name": "encoder_hidden_states",
+        "dtype": "float32",
+        "shape": [
+          1,
+          197,
+          768
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "decoder_attention_mask",
+        "dtype": "int64",
+        "shape": [
+          1,
+          1024
+        ]
+      },
+      {
+        "name": "cache_position",
+        "dtype": "int64",
+        "shape": [
+          1
+        ]
+      },
+      {
+        "name": "past_0_key",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_0_value",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_1_key",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_1_value",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_2_key",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_2_value",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_3_key",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_3_value",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_4_key",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_4_value",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_5_key",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_5_value",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_6_key",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_6_value",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_7_key",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_7_value",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_8_key",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_8_value",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_9_key",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_9_value",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_10_key",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_10_value",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_11_key",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_11_value",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      }
+    ],
+    "output_tensors": [
+      {
+        "name": "logits"
+      },
+      {
+        "name": "present_0_key"
+      },
+      {
+        "name": "present_0_value"
+      },
+      {
+        "name": "present_1_key"
+      },
+      {
+        "name": "present_1_value"
+      },
+      {
+        "name": "present_2_key"
+      },
+      {
+        "name": "present_2_value"
+      },
+      {
+        "name": "present_3_key"
+      },
+      {
+        "name": "present_3_value"
+      },
+      {
+        "name": "present_4_key"
+      },
+      {
+        "name": "present_4_value"
+      },
+      {
+        "name": "present_5_key"
+      },
+      {
+        "name": "present_5_value"
+      },
+      {
+        "name": "present_6_key"
+      },
+      {
+        "name": "present_6_value"
+      },
+      {
+        "name": "present_7_key"
+      },
+      {
+        "name": "present_7_value"
+      },
+      {
+        "name": "present_8_key"
+      },
+      {
+        "name": "present_8_value"
+      },
+      {
+        "name": "present_9_key"
+      },
+      {
+        "name": "present_9_value"
+      },
+      {
+        "name": "present_10_key"
+      },
+      {
+        "name": "present_10_value"
+      },
+      {
+        "name": "present_11_key"
+      },
+      {
+        "name": "present_11_value"
+      }
+    ]
+  },
+  "optim": {
+    "gelu_fusion": true,
+    "layer_norm_fusion": true,
+    "matmul_add_fusion": true,
+    "remove_isnan_in_attention_mask": true,
+    "reshape_mergedreshape": true
+  },
+  "quant": null,
+  "compile": null,
+  "loader": {
+    "task": "text2text-generation",
+    "model_class": "VisionDecoderWrapper",
+    "model_type": "vision-encoder-decoder"
+  }
+}
diff --git a/examples/recipes/nlpconnect_vit-gpt2-image-captioning/image-to-text_encoder_config.json b/examples/recipes/nlpconnect_vit-gpt2-image-captioning/image-to-text_encoder_config.json
new file mode 100644
index 000000000..8e4fb9a48
--- /dev/null
+++ b/examples/recipes/nlpconnect_vit-gpt2-image-captioning/image-to-text_encoder_config.json
@@ -0,0 +1,49 @@
+{
+  "_note": "Reference vision-encoder-decoder (VED) recipe with standard 224×224 ViT image embedding + GPT-2 decoder. Verified end-to-end on this host 2026-06-22: build 55.5s, L0 PASS (366 nodes, opset 17, encoder_hidden_states[1,197,768]), L1-CPU PASS (62.38ms/iter), L2 PASS (cosine=1.0, max_abs=2e-6 vs PyTorch). Pair with the decoder recipe in this directory. See research/adding-model-support/model_knowledge/vision_encoder_decoder.json finding ved-004 (that file is on the skills-poc working branch and has not landed on main yet).",
+  "export": {
+    "opset_version": 17,
+    "batch_size": 1,
+    "export_params": true,
+    "do_constant_folding": true,
+    "verbose": false,
+    "dynamo": false,
+    "enable_hierarchy_tags": true,
+    "clean_onnx": false,
+    "hierarchy_tag_format": "full",
+    "input_tensors": [
+      {
+        "name": "pixel_values",
+        "dtype": "float32",
+        "shape": [
+          1,
+          3,
+          224,
+          224
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      }
+    ],
+    "output_tensors": [
+      {
+        "name": "encoder_hidden_states"
+      }
+    ]
+  },
+  "optim": {
+    "gelu_fusion": true,
+    "layer_norm_fusion": true,
+    "matmul_add_fusion": true,
+    "remove_isnan_in_attention_mask": true,
+    "reshape_mergedreshape": true
+  },
+  "quant": null,
+  "compile": null,
+  "loader": {
+    "task": "image-feature-extraction",
+    "model_class": "VisionEncoderWrapper",
+    "model_type": "vision-encoder-decoder"
+  }
+}