diff --git a/examples/recipes/README.md b/examples/recipes/README.md index caaa2c15f..c4fab04c3 100644 --- a/examples/recipes/README.md +++ b/examples/recipes/README.md @@ -25,6 +25,7 @@ Each *(model, task)* includes: | google/vit-base-patch16-224-in21k | image-feature-extraction | | laion/CLIP-ViT-B-32-laion2B-s34B-b79K | feature-extraction | | microsoft/rad-dino | image-feature-extraction | +| nlpconnect/vit-gpt2-image-captioning | image-to-text | | openai/clip-vit-base-patch16 | feature-extraction | | sentence-transformers/all-MiniLM-L6-v2 | feature-extraction | | sentence-transformers/all-MiniLM-L6-v2 | sentence-similarity | diff --git a/examples/recipes/nlpconnect_vit-gpt2-image-captioning/image-to-text_decoder_config.json b/examples/recipes/nlpconnect_vit-gpt2-image-captioning/image-to-text_decoder_config.json new file mode 100644 index 000000000..e1a00c96e --- /dev/null +++ b/examples/recipes/nlpconnect_vit-gpt2-image-captioning/image-to-text_decoder_config.json @@ -0,0 +1,483 @@ +{ + "_note": "Vision-encoder-decoder (VED) decoder recipe; companion to image-to-text_encoder_config.json. 12-layer GPT-2 with 24 past key/value inputs (past_0..past_11, one key and one value each) + cache_position + cross-attention via encoder_hidden_states[1,197,768]. 
Verified end-to-end 2026-06-22: build 82.0s, L0 PASS (803 nodes, 28 inputs), L1-CPU PASS (38.58ms/iter, 25.92 sps).", + "export": { + "opset_version": 17, + "batch_size": 1, + "export_params": true, + "do_constant_folding": true, + "verbose": false, + "dynamo": false, + "enable_hierarchy_tags": true, + "clean_onnx": false, + "hierarchy_tag_format": "full", + "input_tensors": [ + { + "name": "decoder_input_ids", + "dtype": "int32", + "shape": [ + 1, + 1 + ], + "value_range": [ + 0, + 50257 + ] + }, + { + "name": "encoder_hidden_states", + "dtype": "float32", + "shape": [ + 1, + 197, + 768 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "decoder_attention_mask", + "dtype": "int64", + "shape": [ + 1, + 1024 + ] + }, + { + "name": "cache_position", + "dtype": "int64", + "shape": [ + 1 + ] + }, + { + "name": "past_0_key", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_0_value", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_1_key", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_1_value", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_2_key", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_2_value", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_3_key", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_3_value", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_4_key", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": 
"past_4_value", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_5_key", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_5_value", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_6_key", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_6_value", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_7_key", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_7_value", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_8_key", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_8_value", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_9_key", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_9_value", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_10_key", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_10_value", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_11_key", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + }, + { + "name": "past_11_value", + "dtype": "float32", + "shape": [ + 1, + 12, + 1024, + 64 + ], + "value_range": [ + 0, + 1 + ] + } + ], + "output_tensors": [ + { + "name": "logits" + }, + { 
+ "name": "present_0_key" + }, + { + "name": "present_0_value" + }, + { + "name": "present_1_key" + }, + { + "name": "present_1_value" + }, + { + "name": "present_2_key" + }, + { + "name": "present_2_value" + }, + { + "name": "present_3_key" + }, + { + "name": "present_3_value" + }, + { + "name": "present_4_key" + }, + { + "name": "present_4_value" + }, + { + "name": "present_5_key" + }, + { + "name": "present_5_value" + }, + { + "name": "present_6_key" + }, + { + "name": "present_6_value" + }, + { + "name": "present_7_key" + }, + { + "name": "present_7_value" + }, + { + "name": "present_8_key" + }, + { + "name": "present_8_value" + }, + { + "name": "present_9_key" + }, + { + "name": "present_9_value" + }, + { + "name": "present_10_key" + }, + { + "name": "present_10_value" + }, + { + "name": "present_11_key" + }, + { + "name": "present_11_value" + } + ] + }, + "optim": { + "gelu_fusion": true, + "layer_norm_fusion": true, + "matmul_add_fusion": true, + "remove_isnan_in_attention_mask": true, + "reshape_mergedreshape": true + }, + "quant": null, + "compile": null, + "loader": { + "task": "text2text-generation", + "model_class": "VisionDecoderWrapper", + "model_type": "vision-encoder-decoder" + } +} \ No newline at end of file diff --git a/examples/recipes/nlpconnect_vit-gpt2-image-captioning/image-to-text_encoder_config.json b/examples/recipes/nlpconnect_vit-gpt2-image-captioning/image-to-text_encoder_config.json new file mode 100644 index 000000000..8e4fb9a48 --- /dev/null +++ b/examples/recipes/nlpconnect_vit-gpt2-image-captioning/image-to-text_encoder_config.json @@ -0,0 +1,49 @@ +{ + "_note": "Reference VED recipe with standard 224×224 ViT image embedding + GPT-2 decoder. Verified end-to-end on this host 2026-06-22: build 55.5s, L0 PASS (366 nodes, opset 17, encoder_hidden_states[1,197,768]), L1-CPU PASS (62.38ms/iter), L2 PASS (cosine=1.0, max_abs=2e-6 vs PyTorch). Pair with the decoder recipe in this directory. 
See research/adding-model-support/model_knowledge/vision_encoder_decoder.json finding ved-004.", + "export": { + "opset_version": 17, + "batch_size": 1, + "export_params": true, + "do_constant_folding": true, + "verbose": false, + "dynamo": false, + "enable_hierarchy_tags": true, + "clean_onnx": false, + "hierarchy_tag_format": "full", + "input_tensors": [ + { + "name": "pixel_values", + "dtype": "float32", + "shape": [ + 1, + 3, + 224, + 224 + ], + "value_range": [ + 0, + 1 + ] + } + ], + "output_tensors": [ + { + "name": "encoder_hidden_states" + } + ] + }, + "optim": { + "gelu_fusion": true, + "layer_norm_fusion": true, + "matmul_add_fusion": true, + "remove_isnan_in_attention_mask": true, + "reshape_mergedreshape": true + }, + "quant": null, + "compile": null, + "loader": { + "task": "image-feature-extraction", + "model_class": "VisionEncoderWrapper", + "model_type": "vision-encoder-decoder" + } +}