diff --git a/examples/recipes/README.md b/examples/recipes/README.md
index caaa2c15f..c4fab04c3 100644
--- a/examples/recipes/README.md
+++ b/examples/recipes/README.md
@@ -25,6 +25,7 @@ Each *(model, task)* includes:
 | google/vit-base-patch16-224-in21k | image-feature-extraction |
 | laion/CLIP-ViT-B-32-laion2B-s34B-b79K | feature-extraction |
 | microsoft/rad-dino | image-feature-extraction |
+| nlpconnect/vit-gpt2-image-captioning | image-to-text |
 | openai/clip-vit-base-patch16 | feature-extraction |
 | sentence-transformers/all-MiniLM-L6-v2 | feature-extraction |
 | sentence-transformers/all-MiniLM-L6-v2 | sentence-similarity |
diff --git a/examples/recipes/nlpconnect_vit-gpt2-image-captioning/image-to-text_decoder_config.json b/examples/recipes/nlpconnect_vit-gpt2-image-captioning/image-to-text_decoder_config.json
new file mode 100644
index 000000000..e1a00c96e
--- /dev/null
+++ b/examples/recipes/nlpconnect_vit-gpt2-image-captioning/image-to-text_decoder_config.json
@@ -0,0 +1,483 @@
+{
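+  "_note": "VED (vision-encoder-decoder) decoder recipe; companion to image-to-text_encoder_config.json in this directory. 12-layer GPT-2 decoder with 28 inputs: decoder_input_ids, encoder_hidden_states[1,197,768] (consumed via cross-attention), decoder_attention_mask, cache_position, and 24 past_KV tensors (past_{0..11}_{key,value}). Verified end-to-end 2026-06-22: build 82.0s, L0 PASS (803 nodes, 28 inputs), L1-CPU PASS (38.58ms/iter, 25.92 sps).",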
+  "export": {
+    "opset_version": 17,
+    "batch_size": 1,
+    "export_params": true,
+    "do_constant_folding": true,
+    "verbose": false,
+    "dynamo": false,
+    "enable_hierarchy_tags": true,
+    "clean_onnx": false,
+    "hierarchy_tag_format": "full",
+    "input_tensors": [
+      {
+        "name": "decoder_input_ids",
+        "dtype": "int32",
+        "shape": [
+          1,
+          1
+        ],
+        "value_range": [
+          0,
+          50257
+        ]
+      },
+      {
+        "name": "encoder_hidden_states",
+        "dtype": "float32",
+        "shape": [
+          1,
+          197,
+          768
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "decoder_attention_mask",
+        "dtype": "int64",
+        "shape": [
+          1,
+          1024
+        ]
+      },
+      {
+        "name": "cache_position",
+        "dtype": "int64",
+        "shape": [
+          1
+        ]
+      },
+      {
+        "name": "past_0_key",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_0_value",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_1_key",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_1_value",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_2_key",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_2_value",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_3_key",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_3_value",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_4_key",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_4_value",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_5_key",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_5_value",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_6_key",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_6_value",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_7_key",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_7_value",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_8_key",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_8_value",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_9_key",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_9_value",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_10_key",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_10_value",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_11_key",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      },
+      {
+        "name": "past_11_value",
+        "dtype": "float32",
+        "shape": [
+          1,
+          12,
+          1024,
+          64
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      }
+    ],
+    "output_tensors": [
+      {
+        "name": "logits"
+      },
+      {
+        "name": "present_0_key"
+      },
+      {
+        "name": "present_0_value"
+      },
+      {
+        "name": "present_1_key"
+      },
+      {
+        "name": "present_1_value"
+      },
+      {
+        "name": "present_2_key"
+      },
+      {
+        "name": "present_2_value"
+      },
+      {
+        "name": "present_3_key"
+      },
+      {
+        "name": "present_3_value"
+      },
+      {
+        "name": "present_4_key"
+      },
+      {
+        "name": "present_4_value"
+      },
+      {
+        "name": "present_5_key"
+      },
+      {
+        "name": "present_5_value"
+      },
+      {
+        "name": "present_6_key"
+      },
+      {
+        "name": "present_6_value"
+      },
+      {
+        "name": "present_7_key"
+      },
+      {
+        "name": "present_7_value"
+      },
+      {
+        "name": "present_8_key"
+      },
+      {
+        "name": "present_8_value"
+      },
+      {
+        "name": "present_9_key"
+      },
+      {
+        "name": "present_9_value"
+      },
+      {
+        "name": "present_10_key"
+      },
+      {
+        "name": "present_10_value"
+      },
+      {
+        "name": "present_11_key"
+      },
+      {
+        "name": "present_11_value"
+      }
+    ]
+  },
+  "optim": {
+    "gelu_fusion": true,
+    "layer_norm_fusion": true,
+    "matmul_add_fusion": true,
+    "remove_isnan_in_attention_mask": true,
+    "reshape_mergedreshape": true
+  },
+  "quant": null,
+  "compile": null,
+  "loader": {
+    "task": "text2text-generation",
+    "model_class": "VisionDecoderWrapper",
+    "model_type": "vision-encoder-decoder"
+  }
+}
\ No newline at end of file
diff --git a/examples/recipes/nlpconnect_vit-gpt2-image-captioning/image-to-text_encoder_config.json b/examples/recipes/nlpconnect_vit-gpt2-image-captioning/image-to-text_encoder_config.json
new file mode 100644
index 000000000..8e4fb9a48
--- /dev/null
+++ b/examples/recipes/nlpconnect_vit-gpt2-image-captioning/image-to-text_encoder_config.json
@@ -0,0 +1,49 @@
+{
+  "_note": "Encoder half of the reference VED (vision-encoder-decoder) recipe: standard 224×224 ViT image embedding + GPT-2 decoder. Verified end-to-end on this host 2026-06-22: build 55.5s, L0 PASS (366 nodes, opset 17, encoder_hidden_states[1,197,768]), L1-CPU PASS (62.38ms/iter), L2 PASS (cosine=1.0, max_abs=2e-6 vs PyTorch). Pair with the decoder recipe image-to-text_decoder_config.json in this directory. See research/adding-model-support/model_knowledge/vision_encoder_decoder.json finding ved-004.",
+  "export": {
+    "opset_version": 17,
+    "batch_size": 1,
+    "export_params": true,
+    "do_constant_folding": true,
+    "verbose": false,
+    "dynamo": false,
+    "enable_hierarchy_tags": true,
+    "clean_onnx": false,
+    "hierarchy_tag_format": "full",
+    "input_tensors": [
+      {
+        "name": "pixel_values",
+        "dtype": "float32",
+        "shape": [
+          1,
+          3,
+          224,
+          224
+        ],
+        "value_range": [
+          0,
+          1
+        ]
+      }
+    ],
+    "output_tensors": [
+      {
+        "name": "encoder_hidden_states"
+      }
+    ]
+  },
+  "optim": {
+    "gelu_fusion": true,
+    "layer_norm_fusion": true,
+    "matmul_add_fusion": true,
+    "remove_isnan_in_attention_mask": true,
+    "reshape_mergedreshape": true
+  },
+  "quant": null,
+  "compile": null,
+  "loader": {
+    "task": "image-feature-extraction",
+    "model_class": "VisionEncoderWrapper",
+    "model_type": "vision-encoder-decoder"
+  }
+}