From 02edce511dece5b8b403c4da23f7e63568839585 Mon Sep 17 00:00:00 2001 From: "Shiyi Zheng (from Dev Box)" Date: Tue, 23 Jun 2026 10:58:52 +0800 Subject: [PATCH] examples: add nlpconnect/vit-gpt2-image-captioning image-to-text recipe (composite) Ships a composite encoder-decoder recipe pair for nlpconnect/vit-gpt2-image-captioning at task=image-to-text. Per the composite-PR contract, encoder + decoder ship as ONE PR because they must be deployed together to form a runnable pipeline. Files: - image-to-text_encoder_config.json - ViT encoder, 224x224 RGB -> last_hidden_state - image-to-text_decoder_config.json - GPT2 decoder with KV-cache, cross-attention to encoder_hidden_states Goal-ladder verdict (CPU, per-half): - Encoder: L0 PASS (366 ops/11 unique, 86M params, 143KB+343MB ext) L1-CPU PASS (69.36 ms/iter) L2 PASS (cosine=1.0, max_abs=2e-6) L3 CLI-BLOCKED ('No dataset provided and no default for task image-to-text') - Decoder: L0 PASS (803 ops/22 unique, 153M params, 287KB+730MB ext) L1-CPU PASS (40.39 ms/iter) L2 DEFERRED-HARNESS (DynamicCache<->past_KV bridge; marian-005 precedent) L3 CLI-BLOCKED (same root cause) DML/QNN/OpenVINO HOST-BLOCKED. Encoder output last_hidden_state matches decoder encoder_hidden_states input via composite alias-injection in src/winml/modelkit/models/winml/feature_extraction.py. Optimum-coverage: VENDOR-COVERED on image-to-text via winml WinMLEncoderDecoderModel override (HTP-friendly KV-cache shape); pure-data recipe pair, no per-architecture code change in this PR. Producer notes from running the recipe live in research/adding-model-support/model_knowledge/vision_encoder_decoder.json on the skills-poc working branch (not landed to main yet; pending separate skill-research PR). 
--- examples/recipes/README.md | 1 + .../image-to-text_decoder_config.json | 483 ++++++++++++++++++ .../image-to-text_encoder_config.json | 49 ++ 3 files changed, 533 insertions(+) create mode 100644 examples/recipes/nlpconnect_vit-gpt2-image-captioning/image-to-text_decoder_config.json create mode 100644 examples/recipes/nlpconnect_vit-gpt2-image-captioning/image-to-text_encoder_config.json diff --git a/examples/recipes/README.md b/examples/recipes/README.md index caaa2c15f..c4fab04c3 100644 --- a/examples/recipes/README.md +++ b/examples/recipes/README.md @@ -25,6 +25,7 @@ Each *(model, task)* includes: | google/vit-base-patch16-224-in21k | image-feature-extraction | | laion/CLIP-ViT-B-32-laion2B-s34B-b79K | feature-extraction | | microsoft/rad-dino | image-feature-extraction | +| nlpconnect/vit-gpt2-image-captioning | image-to-text | | openai/clip-vit-base-patch16 | feature-extraction | | sentence-transformers/all-MiniLM-L6-v2 | feature-extraction | | sentence-transformers/all-MiniLM-L6-v2 | sentence-similarity | diff --git a/examples/recipes/nlpconnect_vit-gpt2-image-captioning/image-to-text_decoder_config.json b/examples/recipes/nlpconnect_vit-gpt2-image-captioning/image-to-text_decoder_config.json new file mode 100644 index 000000000..e1a00c96e --- /dev/null +++ b/examples/recipes/nlpconnect_vit-gpt2-image-captioning/image-to-text_decoder_config.json @@ -0,0 +1,483 @@ +{ + "_note": "VED decoder companion to image-to-text_encoder_config.json. 12-layer GPT-2 with 24 past_KV inputs + cache_position + cross-attention via encoder_hidden_states[1,197,768]. 
Verified end-to-end 2026-06-22: build 82.0s, L0 PASS (803 nodes, 28 inputs), L1-CPU PASS (38.58ms/iter, 25.92 sps).", + "export": { + "opset_version": 17, + "batch_size": 1, + "export_params": true, + "do_constant_folding": true, + "verbose": false, + "dynamo": false, + "enable_hierarchy_tags": true, + "clean_onnx": false, + "hierarchy_tag_format": "full", + "input_tensors": [ + { + "name": "decoder_input_ids", + "dtype": "int32", + "shape": [ + 1, + 1 + ], + "value_range": [ + 0, + 50257 + ] + }, + { + "name": "encoder_hidden_states", + "dtype": "float32", + "shape": [ + 1, + 197, + 768 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "decoder_attention_mask", + "dtype": "int64", + "shape": [ + 1, + 1024 + ] + }, + { + "name": "cache_position", + "dtype": "int64", + "shape": [ + 1 + ] + }, + { + "name": "past_0_key", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_0_value", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_1_key", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_1_value", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_2_key", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_2_value", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_3_key", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_3_value", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_4_key", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": 
"past_4_value", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_5_key", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_5_value", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_6_key", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_6_value", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_7_key", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_7_value", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_8_key", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_8_value", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_9_key", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_9_value", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_10_key", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_10_value", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_11_key", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_11_value", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + } + ], + "output_tensors": [ + { + "name": "logits" + }, + { 
+ "name": "present_0_key" + }, + { + "name": "present_0_value" + }, + { + "name": "present_1_key" + }, + { + "name": "present_1_value" + }, + { + "name": "present_2_key" + }, + { + "name": "present_2_value" + }, + { + "name": "present_3_key" + }, + { + "name": "present_3_value" + }, + { + "name": "present_4_key" + }, + { + "name": "present_4_value" + }, + { + "name": "present_5_key" + }, + { + "name": "present_5_value" + }, + { + "name": "present_6_key" + }, + { + "name": "present_6_value" + }, + { + "name": "present_7_key" + }, + { + "name": "present_7_value" + }, + { + "name": "present_8_key" + }, + { + "name": "present_8_value" + }, + { + "name": "present_9_key" + }, + { + "name": "present_9_value" + }, + { + "name": "present_10_key" + }, + { + "name": "present_10_value" + }, + { + "name": "present_11_key" + }, + { + "name": "present_11_value" + } + ] + }, + "optim": { + "gelu_fusion": true, + "layer_norm_fusion": true, + "matmul_add_fusion": true, + "remove_isnan_in_attention_mask": true, + "reshape_mergedreshape": true + }, + "quant": null, + "compile": null, + "loader": { + "task": "text2text-generation", + "model_class": "VisionDecoderWrapper", + "model_type": "vision-encoder-decoder" + } +} \ No newline at end of file diff --git a/examples/recipes/nlpconnect_vit-gpt2-image-captioning/image-to-text_encoder_config.json b/examples/recipes/nlpconnect_vit-gpt2-image-captioning/image-to-text_encoder_config.json new file mode 100644 index 000000000..8e4fb9a48 --- /dev/null +++ b/examples/recipes/nlpconnect_vit-gpt2-image-captioning/image-to-text_encoder_config.json @@ -0,0 +1,49 @@ +{ + "_note": "Reference VED recipe with standard 224×224 ViT image embedding + GPT-2 decoder. Verified end-to-end on this host 2026-06-22: build 55.5s, L0 PASS (366 nodes, opset 17, encoder_hidden_states[1,197,768]), L1-CPU PASS (62.38ms/iter), L2 PASS (cosine=1.0, max_abs=2e-6 vs PyTorch). Pair with the decoder recipe in this directory. 
See research/adding-model-support/model_knowledge/vision_encoder_decoder.json finding ved-004.", + "export": { + "opset_version": 17, + "batch_size": 1, + "export_params": true, + "do_constant_folding": true, + "verbose": false, + "dynamo": false, + "enable_hierarchy_tags": true, + "clean_onnx": false, + "hierarchy_tag_format": "full", + "input_tensors": [ + { + "name": "pixel_values", + "dtype": "float32", + "shape": [ + 1, + 3, + 224, + 224 + ], + "value_range": [ + 0, + 1 + ] + } + ], + "output_tensors": [ + { + "name": "encoder_hidden_states" + } + ] + }, + "optim": { + "gelu_fusion": true, + "layer_norm_fusion": true, + "matmul_add_fusion": true, + "remove_isnan_in_attention_mask": true, + "reshape_mergedreshape": true + }, + "quant": null, + "compile": null, + "loader": { + "task": "image-feature-extraction", + "model_class": "VisionEncoderWrapper", + "model_type": "vision-encoder-decoder" + } +}