From 34b2bee0137b463a50e6442110b92b0b3bda2fff Mon Sep 17 00:00:00 2001 From: Mostafa Faheem Date: Mon, 8 Jun 2026 23:21:59 +0300 Subject: [PATCH 1/3] OpenVINO backend: Enhance envvar handling --- ggml/src/ggml-openvino/ggml-decoder.cpp | 14 +- .../src/ggml-openvino/ggml-openvino-extra.cpp | 37 +++++- ggml/src/ggml-openvino/ggml-openvino-extra.h | 4 + ggml/src/ggml-openvino/ggml-openvino.cpp | 120 ++++++++++-------- .../openvino/op/flash_attn_ext.cpp | 5 +- ggml/src/ggml-openvino/utils.cpp | 4 +- ggml/src/ggml-openvino/utils.h | 3 +- 7 files changed, 123 insertions(+), 64 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 3f6cfedfe897..0353475ae32e 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -5,6 +5,7 @@ #include "ggml-openvino.h" #include "ggml-quants.h" #include "ggml.h" +#include "utils.h" #include #include @@ -51,13 +52,12 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, m_model_weights(model_weights), m_model_params(model_params), m_compute_params(compute_params) { - if (auto * env = getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); env && atoi(env) > 0) { -#ifdef _WIN32 - _putenv_s("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS", ""); -#else - unsetenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); -#endif - print_tensor_address_map(cgraph); + static bool printed_address_map = false; + if (!printed_address_map) { + if (ggml_openvino_env_flag("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) { + printed_address_map = true; + print_tensor_address_map(cgraph); + } } validate_cgraph(); diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp index 4140136aca25..d05085606e7a 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp @@ -22,7 +22,31 @@ void ggml_openvino_device_config::init() { if (initialized) { return; } - device_name = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "CPU"; + + static constexpr const char* env_var_names[] = { + "GGML_OPENVINO_DEVICE", + "GGML_OPENVINO_CACHE_DIR", + "GGML_OPENVINO_PREFILL_CHUNK_SIZE", + "GGML_OPENVINO_STATEFUL_EXECUTION", + "GGML_OPENVINO_PROFILING", + "GGML_OPENVINO_DUMP_CGRAPH", + "GGML_OPENVINO_DUMP_IR", + "GGML_OPENVINO_DEBUG_INPUT", + "GGML_OPENVINO_DEBUG_OUTPUT", + "GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS", + "GGML_OPENVINO_ENABLE_CACHE", + "GGML_OPENVINO_DISABLE_KV_SLICE", + "GGML_OPENVINO_MANUAL_GQA_ATTN" + }; + + for (const char* const & env_var : env_var_names) { + auto * env = getenv(env_var); + if (env) { + environment_variables[env_var] = env; + } + } + + device_name = ggml_openvino_getenv("GGML_OPENVINO_DEVICE") ? ggml_openvino_getenv("GGML_OPENVINO_DEVICE") : "CPU"; auto available_devices = ov_singleton_core().get_available_devices(); if (std::find(available_devices.begin(), available_devices.end(), device_name) == available_devices.end()) { GGML_LOG_WARN("GGML OpenVINO Backend: device %s is not available, fallback to CPU\n", device_name.c_str()); @@ -30,7 +54,7 @@ void ggml_openvino_device_config::init() { } is_npu = (device_name == "NPU"); - auto * cache_dir = getenv("GGML_OPENVINO_CACHE_DIR"); + const char * cache_dir = ggml_openvino_getenv("GGML_OPENVINO_CACHE_DIR"); if (device_name == "NPU") { compile_config = { {"NPU_COMPILER_DYNAMIC_QUANTIZATION", "YES" }, @@ -119,6 +143,15 @@ const std::string & ggml_openvino_get_device_name() { return ggml_openvino_get_device_config().device_name; } +// Get the value of a specific environment variable +const char* ggml_openvino_getenv(const char* var){ + auto it = ggml_openvino_get_device_config().environment_variables.find(var); + if (it == ggml_openvino_get_device_config().environment_variables.end()) { + return nullptr; + } + return it->second.c_str(); +} + // Check if running on NPU bool ggml_openvino_is_npu() { return ggml_openvino_get_device_config().is_npu; diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.h b/ggml/src/ggml-openvino/ggml-openvino-extra.h index 57bfa4d907fd..789d2a61758c 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.h +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.h @@ -64,6 +64,7 @@ struct ggml_openvino_device_config { bool initialized = false; std::optional remote_context; ov::AnyMap compile_config; + std::unordered_map environment_variables; cl_command_queue cl_queue = nullptr; void init(); @@ -79,6 +80,9 @@ void ggml_openvino_init_device_config(); // Get the device name const std::string & ggml_openvino_get_device_name(); +// Get the value of a specific environment variable +const char* ggml_openvino_getenv(const char* var); + // Check if running on NPU bool ggml_openvino_is_npu(); diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index cd0c1738d833..6eb0c9255e72 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -1,10 +1,28 @@ #include "ggml-openvino.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" +#include "ggml-impl.h" +#include "ggml-openvino-extra.h" #include "ggml-openvino/utils.h" -#include "ggml-openvino/openvino/op_table.h" #include "ggml-quants.h" - +#include "ggml.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include #include +#include +#include +#include +#include +#include #if defined(_WIN32) # define WIN32_LEAN_AND_MEAN @@ -129,7 +147,7 @@ static void * ggml_backend_openvino_buffer_get_base(ggml_backend_buffer_t buffer static bool is_stateful_enabled() { static const auto * stateful = getenv("GGML_OPENVINO_STATEFUL_EXECUTION"); - return stateful != nullptr && atoi(stateful) > 0; + return stateful && *stateful != '\0' && strcmp(stateful, "0") != 0; } static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { @@ -892,8 +910,7 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { break; } case GGML_OP_ADD: - case GGML_OP_MUL: - case GGML_OP_SUB: { + case GGML_OP_MUL: { if (op->src[1]->op == GGML_OP_PERMUTE) { return true; } @@ -1030,9 +1047,19 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { op->src[0]->src[0]->src[0]->op == GGML_OP_PERMUTE) { return true; } + if (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16) { + // Has accuracy issue, try enabling this and see `test-backend-ops -o "MUL_MAT"` + // GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with two F16 tensors\n"); + return true; + } if (op->src[0]->ne[3] != op->src[1]->ne[3] && op->src[0]->ne[3] != 1 && op->src[1]->ne[3] != 1) { return true; } + if (ggml_is_quantized(op->src[0]->type) && op->src[0]->ne[1] == 1) { + // MUL_MAT(type_a=q4_0,type_b=f32,m=1,n=2048,k=8192,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1) + // triggers a bug in ov matmul_shape_inference.hpp + return true; + } if (op->src[0]->op == GGML_OP_VIEW && op->src[1]->op == GGML_OP_VIEW) { return true; } @@ -1121,14 +1148,6 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { // Keep this op on CPU until the OpenVINO implementation is fixed. return true; } - case GGML_OP_VIEW: { - // Skip TOPK_MOE fused tests until it is fully supported - // the argsort_top_k VIEW wrapping ARGSORT is named "selected_experts" in test_topk_moe - if (strcmp(op->name, "selected_experts") == 0) { - return true; - } - break; - } default: break; } @@ -1138,47 +1157,48 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { GGML_ASSERT(dev->reg != nullptr); - static std::unordered_set supported_types{GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_I64, + static std::set supported_types{GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_I64, GGML_TYPE_I32, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q8_0, GGML_TYPE_Q6_K}; - // derive supported op sets from the op_table map, keys in - // the map use the full macro name (e.g. "GGML_OP_ADD"), while - // the ggml_*_op_name() helpers return only the trailing part (e.g. "ADD"). - // each set is built once and cached. - static const auto build_supported_sets = [] { - const auto & table = ov::frontend::ggml::get_supported_ops(); - std::unordered_set ops; - std::unordered_set unary_ops; - std::unordered_set glu_ops; - - // GGML_OP_NONE has no translator but is always safe to add to the supported set. - ops.insert(GGML_OP_NONE); - - for (int i = 0; i < GGML_OP_COUNT; ++i) { - const std::string key = std::string("GGML_OP_") + ggml_op_name(static_cast(i)); - if (table.count(key)) { - ops.insert(static_cast(i)); - } - } - for (int i = 0; i < GGML_UNARY_OP_COUNT; ++i) { - const std::string key = std::string("GGML_UNARY_OP_") + ggml_unary_op_name(static_cast(i)); - if (table.count(key)) { - unary_ops.insert(static_cast(i)); - } - } - for (int i = 0; i < GGML_GLU_OP_COUNT; ++i) { - const std::string key = std::string("GGML_GLU_OP_") + ggml_glu_op_name(static_cast(i)); - if (table.count(key)) { - glu_ops.insert(static_cast(i)); - } - } - return std::make_tuple(ops, unary_ops, glu_ops); + static const std::set supported_ops{GGML_OP_NONE, + GGML_OP_ADD, + GGML_OP_CONCAT, + GGML_OP_DIV, + GGML_OP_MUL, + GGML_OP_MUL_MAT, + GGML_OP_MUL_MAT_ID, + GGML_OP_VIEW, + GGML_OP_CONT, + GGML_OP_RESHAPE, + GGML_OP_PERMUTE, + GGML_OP_TRANSPOSE, + GGML_OP_GET_ROWS, + GGML_OP_ROPE, + GGML_OP_RMS_NORM, + GGML_OP_SCALE, + GGML_OP_NORM, + GGML_OP_SOFT_MAX, + GGML_OP_SET_ROWS, + GGML_OP_FLASH_ATTN_EXT, + GGML_OP_CPY, + GGML_OP_L2_NORM, + GGML_OP_SUM_ROWS, + GGML_OP_CLAMP, + GGML_OP_PAD, + GGML_OP_SSM_CONV, + GGML_OP_GATED_DELTA_NET, + GGML_OP_IM2COL}; + static const std::set supported_unary_ops{ + GGML_UNARY_OP_GELU, + GGML_UNARY_OP_SILU, + GGML_UNARY_OP_SOFTPLUS, + GGML_UNARY_OP_TANH, + }; + static const std::set supported_glu_ops{ + GGML_GLU_OP_SWIGLU, + GGML_GLU_OP_GEGLU, }; - static const auto supported_sets = build_supported_sets(); - static const auto & supported_ops = std::get<0>(supported_sets); - static const auto & supported_unary_ops = std::get<1>(supported_sets); - static const auto & supported_glu_ops = std::get<2>(supported_sets); switch (op->op) { case GGML_OP_UNARY: { diff --git a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp index 08d23d23f642..11e57e904dcb 100644 --- a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +++ b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp @@ -1,6 +1,7 @@ #include "../node_context.h" #include "../op_table.h" #include "../utils.h" +#include "ggml-openvino/ggml-openvino-extra.h" #include #include @@ -68,11 +69,11 @@ OutputVector translate_flash_attn_ext(const NodeContext & context) { // Set GGML_OPENVINO_MANUAL_GQA_ATTN to a positive value (e.g. 1) to force-enable, // or to 0 to force-disable. Unset falls back to the device-based default. static const bool manual_gqa_enabled = []() { - const char * env = getenv("GGML_OPENVINO_MANUAL_GQA_ATTN"); + const char * env = ggml_openvino_getenv("GGML_OPENVINO_MANUAL_GQA_ATTN"); if (env != nullptr) { return atoi(env) > 0; } - const char * dev = getenv("GGML_OPENVINO_DEVICE"); + const char * dev = ggml_openvino_getenv("GGML_OPENVINO_DEVICE"); return dev != nullptr && std::string(dev) == "GPU"; }(); const bool use_manual_gqa_attention = diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index b31b89052c4d..0556b89a8683 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -44,8 +44,8 @@ // =100); otherwise returns 0 (unset, empty, =0, negative, or non-numeric). // Boolean toggles use this as a flag: `if (ggml_openvino_env_flag(name))` is // true iff the value is positive, so =0 is a no-op for all toggles. -static int ggml_openvino_env_flag(const char * name) { - const char * v = getenv(name); +int ggml_openvino_env_flag(const char * name) { + const char * v = ggml_openvino_getenv(name); return v ? std::max(0, std::atoi(v)) : 0; } diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 2ed8f0c40223..10253d991cf8 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -1,4 +1,3 @@ -#include "ggml-backend-impl.h" #include "ggml-decoder.h" #include "ggml-impl.h" @@ -93,6 +92,8 @@ void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor void print_output_tensor_info(const std::string & name, const ov::Tensor & tensor, const void * output_dst); +int ggml_openvino_env_flag(const char * name); + template std::vector pad_input(const T * data, size_t rows, From b3f21ea8b0fccde57591aff261dacb653eed6db7 Mon Sep 17 00:00:00 2001 From: Mostafa Faheem Date: Tue, 9 Jun 2026 12:40:22 +0300 Subject: [PATCH 2/3] more cleanup --- ggml/src/ggml-openvino/ggml-openvino.cpp | 97 ++++++++++++------------ 1 file changed, 48 insertions(+), 49 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 6eb0c9255e72..1960e9621de7 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -5,6 +5,7 @@ #include "ggml-impl.h" #include "ggml-openvino-extra.h" #include "ggml-openvino/utils.h" +#include "ggml-openvino/openvino/op_table.h" #include "ggml-quants.h" #include "ggml.h" @@ -910,7 +911,8 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { break; } case GGML_OP_ADD: - case GGML_OP_MUL: { + case GGML_OP_MUL: + case GGML_OP_SUB: { if (op->src[1]->op == GGML_OP_PERMUTE) { return true; } @@ -1047,19 +1049,9 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { op->src[0]->src[0]->src[0]->op == GGML_OP_PERMUTE) { return true; } - if (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16) { - // Has accuracy issue, try enabling this and see `test-backend-ops -o "MUL_MAT"` - // GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with two F16 tensors\n"); - return true; - } if (op->src[0]->ne[3] != op->src[1]->ne[3] && op->src[0]->ne[3] != 1 && op->src[1]->ne[3] != 1) { return true; } - if (ggml_is_quantized(op->src[0]->type) && op->src[0]->ne[1] == 1) { - // MUL_MAT(type_a=q4_0,type_b=f32,m=1,n=2048,k=8192,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1) - // triggers a bug in ov matmul_shape_inference.hpp - return true; - } if (op->src[0]->op == GGML_OP_VIEW && op->src[1]->op == GGML_OP_VIEW) { return true; } @@ -1148,6 +1140,14 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { // Keep this op on CPU until the OpenVINO implementation is fixed. return true; } + case GGML_OP_VIEW: { + // Skip TOPK_MOE fused tests until it is fully supported + // the argsort_top_k VIEW wrapping ARGSORT is named "selected_experts" in test_topk_moe + if (strcmp(op->name, "selected_experts") == 0) { + return true; + } + break; + } default: break; } @@ -1157,48 +1157,47 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { GGML_ASSERT(dev->reg != nullptr); - static std::set supported_types{GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_I64, + static std::unordered_set supported_types{GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_I64, GGML_TYPE_I32, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q8_0, GGML_TYPE_Q6_K}; - static const std::set supported_ops{GGML_OP_NONE, - GGML_OP_ADD, - GGML_OP_CONCAT, - GGML_OP_DIV, - GGML_OP_MUL, - GGML_OP_MUL_MAT, - GGML_OP_MUL_MAT_ID, - GGML_OP_VIEW, - GGML_OP_CONT, - GGML_OP_RESHAPE, - GGML_OP_PERMUTE, - GGML_OP_TRANSPOSE, - GGML_OP_GET_ROWS, - GGML_OP_ROPE, - GGML_OP_RMS_NORM, - GGML_OP_SCALE, - GGML_OP_NORM, - GGML_OP_SOFT_MAX, - GGML_OP_SET_ROWS, - GGML_OP_FLASH_ATTN_EXT, - GGML_OP_CPY, - GGML_OP_L2_NORM, - GGML_OP_SUM_ROWS, - GGML_OP_CLAMP, - GGML_OP_PAD, - GGML_OP_SSM_CONV, - GGML_OP_GATED_DELTA_NET, - GGML_OP_IM2COL}; - static const std::set supported_unary_ops{ - GGML_UNARY_OP_GELU, - GGML_UNARY_OP_SILU, - GGML_UNARY_OP_SOFTPLUS, - GGML_UNARY_OP_TANH, - }; - static const std::set supported_glu_ops{ - GGML_GLU_OP_SWIGLU, - GGML_GLU_OP_GEGLU, + // derive supported op sets from the op_table map, keys in + // the map use the full macro name (e.g. "GGML_OP_ADD"), while + // the ggml_*_op_name() helpers return only the trailing part (e.g. "ADD"). + // each set is built once and cached. + static const auto build_supported_sets = [] { + const auto & table = ov::frontend::ggml::get_supported_ops(); + std::unordered_set ops; + std::unordered_set unary_ops; + std::unordered_set glu_ops; + + // GGML_OP_NONE has no translator but is always safe to add to the supported set. + ops.insert(GGML_OP_NONE); + + for (int i = 0; i < GGML_OP_COUNT; ++i) { + const std::string key = std::string("GGML_OP_") + ggml_op_name(static_cast(i)); + if (table.count(key)) { + ops.insert(static_cast(i)); + } + } + for (int i = 0; i < GGML_UNARY_OP_COUNT; ++i) { + const std::string key = std::string("GGML_UNARY_OP_") + ggml_unary_op_name(static_cast(i)); + if (table.count(key)) { + unary_ops.insert(static_cast(i)); + } + } + for (int i = 0; i < GGML_GLU_OP_COUNT; ++i) { + const std::string key = std::string("GGML_GLU_OP_") + ggml_glu_op_name(static_cast(i)); + if (table.count(key)) { + glu_ops.insert(static_cast(i)); + } + } + return std::make_tuple(ops, unary_ops, glu_ops); }; + static const auto supported_sets = build_supported_sets(); + static const auto & supported_ops = std::get<0>(supported_sets); + static const auto & supported_unary_ops = std::get<1>(supported_sets); + static const auto & supported_glu_ops = std::get<2>(supported_sets); switch (op->op) { case GGML_OP_UNARY: { From e68a1030383f5cc0f787b076472da8de2b2a22bb Mon Sep 17 00:00:00 2001 From: Mostafa Faheem Date: Tue, 9 Jun 2026 14:14:36 +0300 Subject: [PATCH 3/3] move ggml_openvino_env_flag to appropriate place --- ggml/src/ggml-openvino/utils.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 10253d991cf8..f9c9633abd9b 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -79,6 +79,8 @@ struct ov_runtime_context { } }; +int ggml_openvino_env_flag(const char * name); + enum ggml_status ov_graph_compute(struct ggml_cgraph * cgraph, ggml_backend_t backend); enum ggml_status ov_graph_compute_dynamic(struct ggml_cgraph * cgraph, std::shared_ptr r_ctx); @@ -92,8 +94,6 @@ void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor void print_output_tensor_info(const std::string & name, const ov::Tensor & tensor, const void * output_dst); -int ggml_openvino_env_flag(const char * name); - template std::vector pad_input(const T * data, size_t rows,