Fix oob bias access for MatMulIntegerToFloat and DynamicQuantizeMatMul (#28499)

adrianlizarraga · web-flow · commit 158bdef0183f · 2026-05-21T15:52:51.000-07:00
### Description

Fixes a heap out-of-bounds read vulnerability in `DynamicQuantizeMatMul` and `MatMulIntegerToFloat` where a bias tensor with an incorrect number of elements could cause memory reads beyond the allocated buffer.

## Changes

- **`dynamic_quantize_matmul.cc`**: Added element count validation for the bias tensor in both the `ComputeCommon` path and the deferred bias addition path (KleidiAI).
- **`matmul_integer_base.h`**: Added element count validation in the KleidiAI pre-pack path, causing fallback to `ComputeCommon` (which then rejects the invalid bias with a clear error).
- **Tests**: Added regression tests covering runtime bias mismatch, initializer bias mismatch (KleidiAI fallback), and the generic (non-KleidiAI) path for both operators.

## Why we validate element count, not shape (rank)

The validation checks `bias_tensor->Shape().Size() == N` (total element count) rather than enforcing that the bias is strictly 1D. This is intentional for several reasons:

1. **Backward compatibility with existing models.** It's possible that some models may have bias tensors with shape `(1, N)` instead of `(N)`. Enforcing rank == 1 would break these models at runtime. This exact issue occurred with the GroupQueryAttention operator, which required relaxing its shape validation in PR #28259.
2. **Consistent with ONNX standard practice.** Most official ONNX operator schemas (Conv, ConvTranspose, DeformConv, Gemm, LayerNormalization) do *not* validate bias shape in their schema's `TypeAndShapeInferenceFunction`; they only document "1D" in the input description text. `BatchNormalization` is the only exception.
3. **The kernel only needs N contiguous floats.** The compute implementation accesses bias via raw data pointer (`bias->Data<float>()`) and reads exactly `N` elements. It never indexes into specific dimensions or assumes a particular rank. Biases of shape `(N)`, `(1, N)`, or `(1, 1, N)` all work identically.
4. **Schema constraints cannot be relaxed without a version bump.** If we added a strict rank check to the schema now and later discovered models using `(1, N)`, fixing it would probably require a new opset version (though we've never actually bumped the version for contrib ops ...).

## Motivation and Context

Without this fix, passing a bias tensor with fewer elements than `B`'s last dimension causes the kernel to read past the end of the bias buffer, potentially exposing sensitive memory contents or causing a crash.
diff --git a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc
@@ -80,6 +80,12 @@ Status MatMulIntegerToFloatBase::ComputeCommon(OpKernelContext* ctx,
   if (y->Shape().Size() == 0)
     return Status::OK();
 
+  if (bias_tensor != nullptr) {
+    ORT_RETURN_IF_NOT(bias_tensor->Shape().Size() == static_cast<int64_t>(helper.N()),
+                      "bias tensor's element count must equal B's last dimension (",
+                      helper.N(), "), but got ", bias_tensor->Shape().Size());
+  }
+
   auto* y_data = y->MutableData<float>();
   const auto* bias_data = bias_tensor != nullptr ? bias_tensor->Data<float>() : nullptr;
 
@@ -306,8 +312,12 @@ Status DynamicQuantizeMatMul::Compute(OpKernelContext* ctx) const {
     // This evaluates to true if bias data was not provided as constant data for prepacking stage
     if (!dynamic_quant_mlas_bias_data_was_packed_) {
       if (ctx->Input<Tensor>(IN_BIAS) != nullptr) {
-        const auto biases = std::vector<float>(&ctx->Input<Tensor>(IN_BIAS)->Data<float>()[0],
-                                               &ctx->Input<Tensor>(IN_BIAS)->Data<float>()[gemm_shape.N]);
+        const Tensor* bias_t = ctx->Input<Tensor>(IN_BIAS);
+        ORT_RETURN_IF_NOT(bias_t->Shape().Size() == static_cast<int64_t>(gemm_shape.N),
+                          "bias tensor's element count must equal B's last dimension (",
+                          gemm_shape.N, "), but got ", bias_t->Shape().Size());
+        const auto biases = std::vector<float>(&bias_t->Data<float>()[0],
+                                               &bias_t->Data<float>()[gemm_shape.N]);
 
         // deferred adding of bias
         for (size_t gemm_idx = 0; gemm_idx < num_gemms; gemm_idx++) {
diff --git a/onnxruntime/core/providers/cpu/quantization/matmul_integer_base.h b/onnxruntime/core/providers/cpu/quantization/matmul_integer_base.h
@@ -208,6 +208,9 @@ class MatMulIntegerBase : public OpKernel {
     }
 
     if (ctx.bias != nullptr) {
+      if (ctx.bias->Shape().Size() != static_cast<int64_t>(ctx.N)) {
+        return false;
+      }
       dynamic_quant_mlas_bias_data_was_packed_ = true;
     }
 
diff --git a/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc b/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc
@@ -421,6 +421,47 @@ TEST(DynamicQuantizeMatMul, KleidiRejectsUnsupportedBShape) {
   test.Run();
 }
 
+// 6. Bias passed as a runtime (non-initializer) tensor with 1 element instead of N -> must be rejected at compute time.
+TEST(DynamicQuantizeMatMul, KleidiBiasRuntimeShapeMismatch) {
+  if (!HasArmSME()) GTEST_SKIP();  // skipped unless HasArmSME() reports true
+  KleidiDynMatMulData data;
+  // Bias has only 1 element but N=3 — this must be rejected.
+  const std::vector<float> bad_bias = {1.0f};
+
+  OpTester test("DynamicQuantizeMatMul", 1, kMSDomain);
+  test.AddInput<float>("A", {data.M, data.K}, data.a);
+  test.AddInput<int8_t>("B", {data.K, data.N}, data.b, true /*initializer*/);
+  test.AddInput<float>("b_scale", {data.N}, data.b_scale, true /*initializer*/);
+  test.AddInput<int8_t>("b_zero_point", {data.N}, data.b_zp, true /*initializer*/);
+  test.AddInput<float>("bias", {1}, bad_bias, false /*runtime*/);
+  test.AddOutput<float>("Y", {data.M, data.N}, std::vector<float>(data.M * data.N, 0.0f));  // placeholder values; the run is expected to fail
+  test.ConfigEp(DefaultCpuExecutionProvider())
+      .Config(OpTester::ExpectResult::kExpectFailure,
+              "bias tensor's element count must equal B's last dimension")
+      .RunWithConfig();
+}
+
+// 7. Mismatched bias (constant initializer) -> KleidiAI pre-pack declines the bias (returns false)
+// -> kernel falls back to ComputeCommon, which rejects it with the element-count error.
+TEST(DynamicQuantizeMatMul, KleidiBiasInitializerShapeMismatch) {
+  if (!HasArmSME()) GTEST_SKIP();  // skipped unless HasArmSME() reports true
+  KleidiDynMatMulData data;
+  // Bias has only 1 element but N=3 — this must be rejected.
+  const std::vector<float> bad_bias = {1.0f};
+
+  OpTester test("DynamicQuantizeMatMul", 1, kMSDomain);
+  test.AddInput<float>("A", {data.M, data.K}, data.a);
+  test.AddInput<int8_t>("B", {data.K, data.N}, data.b, true /*initializer*/);
+  test.AddInput<float>("b_scale", {data.N}, data.b_scale, true /*initializer*/);
+  test.AddInput<int8_t>("b_zero_point", {data.N}, data.b_zp, true /*initializer*/);
+  test.AddInput<float>("bias", {1}, bad_bias, true /*initializer*/);
+  test.AddOutput<float>("Y", {data.M, data.N}, std::vector<float>(data.M * data.N, 0.0f));  // placeholder values; the run is expected to fail
+  test.ConfigEp(DefaultCpuExecutionProvider())
+      .Config(OpTester::ExpectResult::kExpectFailure,
+              "bias tensor's element count must equal B's last dimension")
+      .RunWithConfig();
+}
+
 #endif  // USE_KLEIDIAI
 
 TEST(DynamicQuantizeMatMul, B_PerColumn_ND) {
@@ -486,5 +527,35 @@ TEST(DynamicQuantizeMatMul, B_PerColumn_ND) {
   test_case({15, 14, 13}, {15, 13, 27}, {15, 1, 27});
 }
 
+// A bias whose element count differs from B's last dimension (N) must be rejected with an error.
+// Reproduces the heap OOB read that occurred when bias is shorter than N. This test has no HasArmSME() skip.
+TEST(DynamicQuantizeMatMul, BiasShapeMismatch) {
+  constexpr int64_t M = 2;
+  constexpr int64_t K = 4;
+  constexpr int64_t N = 8;
+
+  std::vector<float> A_data(M * K, 1.0f);
+  std::vector<uint8_t> B_data(K * N, 128);
+  std::vector<float> B_scale = {0.5f};
+  std::vector<uint8_t> B_zero_point = {128};
+
+  // Bias has only 1 element but N=8, so the kernel must reject it instead of reading N floats.
+  std::vector<float> bad_bias = {1.0f};
+
+  OpTester test("DynamicQuantizeMatMul", 1, onnxruntime::kMSDomain);
+  test.AddInput<float>("A", {M, K}, A_data);
+  test.AddInput<uint8_t>("B", {K, N}, B_data);
+  test.AddInput<float>("b_scale", {1}, B_scale);
+  test.AddInput<uint8_t>("b_zero_point", {1}, B_zero_point);
+  test.AddInput<float>("bias", {1}, bad_bias);
+
+  test.AddOutput<float>("Y", {M, N}, std::vector<float>(M * N, 0.0f));  // placeholder values; the run is expected to fail
+
+  test.ConfigEp(DefaultCpuExecutionProvider())
+      .Config(OpTester::ExpectResult::kExpectFailure,
+              "bias tensor's element count must equal B's last dimension")
+      .RunWithConfig();
+}
+
 }  // namespace test
 }  // namespace onnxruntime
diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
@@ -489,5 +489,72 @@ TEST(MatMulIntegerToFloat, MatMulInteger_With_ZeroPoint) {
   test_case({15, 14, 13}, {15, 13, 27}, {15, 1, 27});
 }
 
+// A bias whose element count differs from B's last dimension (N) must be rejected with an error.
+// Reproduces the heap OOB read that occurred when bias is shorter than N.
+TEST(MatMulIntegerToFloat, BiasShapeMismatch) {
+  constexpr int64_t M = 2;
+  constexpr int64_t K = 4;
+  constexpr int64_t N = 8;
+
+  std::vector<uint8_t> A_data(M * K, 128);
+  std::vector<uint8_t> B_data(K * N, 128);
+  std::vector<float> A_scale = {0.5f};
+  std::vector<float> B_scale = {0.5f};
+  std::vector<uint8_t> A_zero_point = {128};
+  std::vector<uint8_t> B_zero_point = {128};
+
+  // Bias has only 1 element but N=8, so the kernel must reject it instead of reading N floats.
+  std::vector<float> bad_bias = {1.0f};
+
+  OpTester test("MatMulIntegerToFloat", 1, onnxruntime::kMSDomain);
+  test.AddInput<uint8_t>("A", {M, K}, A_data);
+  test.AddInput<uint8_t>("B", {K, N}, B_data);
+  test.AddInput<float>("a_scale", {1}, A_scale);
+  test.AddInput<float>("b_scale", {1}, B_scale);
+  test.AddInput<uint8_t>("a_zero_point", {1}, A_zero_point);
+  test.AddInput<uint8_t>("b_zero_point", {1}, B_zero_point);
+  test.AddInput<float>("bias", {1}, bad_bias);
+
+  test.AddOutput<float>("Y", {M, N}, std::vector<float>(M * N, 0.0f));  // placeholder values; the run is expected to fail
+
+  test.ConfigEp(DefaultCpuExecutionProvider())
+      .Config(OpTester::ExpectResult::kExpectFailure,
+              "bias tensor's element count must equal B's last dimension")
+      .RunWithConfig();
+}
+
+// A bias with more elements than B's last dimension (N + 1 vs N) must also be rejected: the check requires an exact match.
+TEST(MatMulIntegerToFloat, BiasShapeMismatch_LargerBias) {
+  constexpr int64_t M = 2;
+  constexpr int64_t K = 4;
+  constexpr int64_t N = 8;
+
+  std::vector<uint8_t> A_data(M * K, 128);
+  std::vector<uint8_t> B_data(K * N, 128);
+  std::vector<float> A_scale = {0.5f};
+  std::vector<float> B_scale = {0.5f};
+  std::vector<uint8_t> A_zero_point = {128};
+  std::vector<uint8_t> B_zero_point = {128};
+
+  // Bias has N + 1 elements (one more than N), which must be rejected.
+  std::vector<float> bad_bias(static_cast<size_t>(N + 1), 1.0f);
+
+  OpTester test("MatMulIntegerToFloat", 1, onnxruntime::kMSDomain);
+  test.AddInput<uint8_t>("A", {M, K}, A_data);
+  test.AddInput<uint8_t>("B", {K, N}, B_data);
+  test.AddInput<float>("a_scale", {1}, A_scale);
+  test.AddInput<float>("b_scale", {1}, B_scale);
+  test.AddInput<uint8_t>("a_zero_point", {1}, A_zero_point);
+  test.AddInput<uint8_t>("b_zero_point", {1}, B_zero_point);
+  test.AddInput<float>("bias", {N + 1}, bad_bias);
+
+  test.AddOutput<float>("Y", {M, N}, std::vector<float>(M * N, 0.0f));  // placeholder values; the run is expected to fail
+
+  test.ConfigEp(DefaultCpuExecutionProvider())
+      .Config(OpTester::ExpectResult::kExpectFailure,
+              "bias tensor's element count must equal B's last dimension")
+      .RunWithConfig();
+}
+
 }  // namespace test
 }  // namespace onnxruntime

Original file line number	Diff line number	Diff line change
`@@ -208,6 +208,9 @@ class MatMulIntegerBase : public OpKernel {`
`208`	`208`	`}`
`209`	`209`
`210`	`210`	`if (ctx.bias != nullptr) {`
	`211`	`+ if (ctx.bias->Shape().Size() != static_cast<int64_t>(ctx.N)) {`
	`212`	`+ return false;`
	`213`	`+ }`
`211`	`214`	`dynamic_quant_mlas_bias_data_was_packed_ = true;`
`212`	`215`	`}`
`213`	`216`