1.26.0 - cherry-pick for RC2 (#28347)

sanaa-hamel-microsoft · Kevin-Taha · Kevin Taha · web-flow · commit 8c546c37b43c · 2026-05-04T16:40:27.000-04:00
This cherry-picks the following commit for the release:

| Commit ID | PR Number | Commit Title |
|-----------|-----------|--------------|
| 9d1492a | #28164 | Add option to memory map .ORT model loads |

Co-authored-by: Kevin Taha <tahakevin@gmail.com>
Co-authored-by: Kevin Taha <kevintaha@microsoft.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Co-authored-by: Dmitri Smirnov <dmitrism@microsoft.com>
Co-authored-by: Dmitri Smirnov <yuslepukhin@users.noreply.github.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -211,13 +211,23 @@ static const char* const kOrtSessionOptionsConfigUseORTModelBytesDirectly = "ses
 /// <summary>
 /// Key for using the ORT format model flatbuffer bytes directly for initializers.
 /// This avoids copying the bytes and reduces peak memory usage during model loading and initialization.
-/// Requires `session.use_ort_model_bytes_directly` to be true.
+/// Requires `session.use_ort_model_bytes_directly` or `session.use_memory_mapped_ort_model` to be true.
 /// If set, the flatbuffer bytes provided when creating the InferenceSession MUST remain valid for the entire
 /// duration of the InferenceSession.
 /// </summary>
 static const char* const kOrtSessionOptionsConfigUseORTModelBytesForInitializers =
     "session.use_ort_model_bytes_for_initializers";
 
+/// <summary>
+/// Key for using memory-mapped I/O to load ORT format model files.
+/// When set to "1" and the session is created from a file path, ORT will use memory-mapped I/O
+/// to load the .ort model file instead of reading it into a heap-allocated buffer.
+/// The key is ignored (with a warning) when the session is created from a caller-provided buffer.
+/// Usage with session.use_ort_model_bytes_for_initializers will ensure Tensors point directly to the mapped bytes;
+/// in that case the mapping must remain valid and the model weights will be immutable.
+/// The model load will fail if the file is empty or the mapping fails. There is no automatic fallback to a
+/// regular read; fallbacks should be caller-handled.
+/// </summary>
+static const char* const kOrtSessionOptionsConfigUseMemoryMappedOrtModel = "session.use_memory_mapped_ort_model";
+
 // This should only be specified when exporting an ORT format model for use on a different platform.
 // If the ORT format model will be used on ARM platforms set to "1". For other platforms set to "0"
 // Available since version 1.11.
diff --git a/onnxruntime/core/platform/posix/env.cc b/onnxruntime/core/platform/posix/env.cc
@@ -54,6 +54,7 @@ limitations under the License.
 #include <gsl/gsl>
 #include "core/common/logging/logging.h"
 #include "core/common/narrow.h"
+#include "core/common/safeint.h"
 #include "core/platform/scoped_resource.h"
 #include "core/platform/EigenNonBlockingThreadPool.h"
 
@@ -430,9 +431,21 @@ class PosixEnv : public Env {
       return Status::OK();
     }
 
+    // Validate that the file is large enough for the requested mapping.
+    struct stat file_stat;
+    if (fstat(file_descriptor.Get(), &file_stat) != 0) {
+      return ReportSystemError("fstat", file_path);
+    }
+    const size_t requested_end = SafeInt<size_t>(offset) + length;
+    ORT_RETURN_IF(static_cast<size_t>(file_stat.st_size) < requested_end,
+                  "File \"", file_path,
+                  "\" is too small for the requested mapping (file size: ",
+                  file_stat.st_size, " bytes, requested offset + length: ",
+                  requested_end, " bytes).");
+
     static const size_t page_size = narrow<size_t>(sysconf(_SC_PAGESIZE));
     const FileOffsetType offset_to_page = offset % static_cast<FileOffsetType>(page_size);
-    const size_t mapped_length = length + static_cast<size_t>(offset_to_page);
+    const size_t mapped_length = SafeInt<size_t>(length) + static_cast<size_t>(offset_to_page);
     const FileOffsetType mapped_offset = offset - offset_to_page;
     void* const mapped_base =
         mmap(nullptr, mapped_length, PROT_READ | PROT_WRITE, MAP_PRIVATE, file_descriptor.Get(), mapped_offset);
diff --git a/onnxruntime/core/platform/windows/env.cc b/onnxruntime/core/platform/windows/env.cc
@@ -424,6 +424,22 @@ Status WindowsEnv::MapFileIntoMemory(_In_z_ const ORTCHAR_T* file_path,
                            " - ", std::system_category().message(error_code));
   }
 
+  // Validate that the file is large enough for the requested mapping.
+  LARGE_INTEGER actual_size;
+  if (!GetFileSizeEx(file_handle.get(), &actual_size)) {
+    const auto error_code = GetLastError();
+    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
+                           "GetFileSizeEx ", ToUTF8String(Basename(file_path)),
+                           " fail, errcode = ", error_code,
+                           " - ", std::system_category().message(error_code));
+  }
+  const size_t requested_end = SafeInt<size_t>(offset) + length;
+  ORT_RETURN_IF(static_cast<ULONGLONG>(actual_size.QuadPart) < requested_end,
+                "File ", ToUTF8String(Basename(file_path)),
+                " is too small for the requested mapping (file size: ",
+                actual_size.QuadPart, " bytes, requested offset + length: ",
+                requested_end, " bytes).");
+
   wil::unique_hfile file_mapping_handle{
       CreateFileMappingW(file_handle.get(),
                          nullptr,
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
@@ -1747,10 +1747,36 @@ static Status LoadOrtModelBytes(const PathString& model_uri,
   return Status::OK();
 }
 
+// Memory-maps the whole ORT format model file at `model_uri`.
+//
+// On success `bytes` is a non-owning view over the mapped file contents and `mapped_memory` holds the
+// mapping, so `bytes` is only usable for as long as `mapped_memory` keeps the mapping alive.
+// Returns a failure status if the file length cannot be read, the file is empty, or the mapping fails.
+static Status LoadOrtModelBytesMapped(const PathString& model_uri,
+                                      gsl::span<const uint8_t>& bytes,
+                                      Env::MappedMemoryPtr& mapped_memory) {
+  size_t num_bytes = 0;
+  ORT_RETURN_IF_ERROR(Env::Default().GetFileLength(model_uri.c_str(), num_bytes));
+  // Reject empty files up front with a specific message instead of attempting a zero-length mapping.
+  ORT_RETURN_IF(num_bytes == 0, "Cannot memory-map an empty file: ", ToUTF8String(model_uri));
+
+  // Map the full file starting at offset 0. MapFileIntoMemory re-checks that the file holds
+  // offset + length bytes, so a file that shrank after GetFileLength is reported as an error.
+  ORT_RETURN_IF_ERROR(Env::Default().MapFileIntoMemory(model_uri.c_str(), 0, num_bytes, mapped_memory));
+
+  // Expose the mapping as a byte span; ownership stays with mapped_memory.
+  bytes = gsl::span<const uint8_t>(reinterpret_cast<const uint8_t*>(mapped_memory.get()), num_bytes);
+
+  return Status::OK();
+}
+
 Status InferenceSession::LoadOrtModel(const PathString& model_uri) {
   return LoadOrtModelWithLoader(
       [&]() {
         model_location_ = model_uri;
+
+        const auto& config_options = GetSessionOptions().config_options;
+        const bool use_mmap =
+            config_options.GetConfigOrDefault(kOrtSessionOptionsConfigUseMemoryMappedOrtModel, "0") == "1";
+
+        if (use_mmap) {
+          ORT_RETURN_IF_ERROR(
+              LoadOrtModelBytesMapped(model_location_, ort_format_model_bytes_, ort_format_model_mapped_memory_));
+          LOGS(*session_logger_, INFO) << "ORT model loaded via memory-mapped I/O.";
+          return Status::OK();
+        }
+
         ORT_RETURN_IF_ERROR(
             LoadOrtModelBytes(model_location_, ort_format_model_bytes_, ort_format_model_bytes_data_holder_));
         return Status::OK();
@@ -1760,6 +1786,11 @@ Status InferenceSession::LoadOrtModel(const PathString& model_uri) {
 Status InferenceSession::LoadOrtModel(const void* model_data, int model_data_len) {
   return LoadOrtModelWithLoader([&]() {
     const auto& config_options = GetSessionOptions().config_options;
+
+    if (config_options.GetConfigOrDefault(kOrtSessionOptionsConfigUseMemoryMappedOrtModel, "0") == "1") {
+      LOGS(*session_logger_, WARNING) << "session.use_memory_mapped_ort_model is ignored when loading from a buffer.";
+    }
+
     const auto use_ort_model_bytes_directly =
         config_options.GetConfigOrDefault(kOrtSessionOptionsConfigUseORTModelBytesDirectly, "0") == "1";
 
@@ -1858,8 +1889,8 @@ Status InferenceSession::LoadOrtModelWithLoader(std::function<Status()> load_ort
   ORT_RETURN_IF(nullptr == fbs_model, "Missing Model. Invalid ORT format model.");
 
   // if we're using the bytes directly because kOrtSessionOptionsConfigUseORTModelBytesDirectly was set and the user
-  // provided an existing buffer of bytes when creating the InferenceSession, ort_format_model_bytes_data_holder_
-  // will be empty.
+  // provided an existing buffer of bytes when creating the InferenceSession, or because we memory-mapped the file,
+  // ort_format_model_bytes_data_holder_ will be empty.
   // if that is the case we also allow creating initializers that directly use those bytes.
   const auto& config_options = session_options_.config_options;
   using_ort_model_bytes_for_initializers_ =
@@ -2681,6 +2712,7 @@ common::Status InferenceSession::Initialize() {
     if (!using_ort_model_bytes_for_initializers_) {
       ort_format_model_bytes_ = gsl::span<const uint8_t>();
       std::vector<uint8_t>().swap(ort_format_model_bytes_data_holder_);
+      ort_format_model_mapped_memory_.reset();
     }
 
     // once the model is saved, we may remove unnecessary attributes for inference
diff --git a/onnxruntime/core/session/inference_session.h b/onnxruntime/core/session/inference_session.h
@@ -15,6 +15,7 @@
 #include "core/common/path_string.h"
 #include "core/common/profiler.h"
 #include "core/common/status.h"
+#include "core/platform/env.h"
 #include "core/framework/execution_providers.h"
 #include "core/framework/framework_common.h"
 #include "core/framework/iexecutor.h"
@@ -1028,6 +1029,8 @@ class InferenceSession {
   //   We store them currently in the ort_format_model_bytes_data_holder_ to make the Load + Initialize
   //   behave the same way as for an ONNX model, as we need some of the bytes for the Load (create the Model)
   //   and some for the Initialize (create SessionState).
+  //   If "session.use_memory_mapped_ort_model" is set, we memory-map the file instead and store the
+  //   mapping in ort_format_model_mapped_memory_.
   // Short term we free them after Initialize.
   // Longer term we may want to directly refer to offsets in this buffer for initializers so we don't need to copy
   // those into new OrtValue instances, at which point we won't free them until the InferenceSession goes away.
@@ -1036,9 +1039,13 @@ class InferenceSession {
   // This holds the actual model data
   // In case if the session is started with an input byte array contains model data, and the caller
   // specifies that ORT should use the model bytes directly by setting the session config option
-  // "session.use_ort_model_bytes_directly" to "1", this will be empty
+  // "session.use_ort_model_bytes_directly" to "1", this will be empty.
+  // Also empty when using memory-mapped loading, as the data is held by ort_format_model_mapped_memory_.
   std::vector<uint8_t> ort_format_model_bytes_data_holder_;
 
+  // Holds the memory-mapped file data when session.use_memory_mapped_ort_model is set.
+  Env::MappedMemoryPtr ort_format_model_mapped_memory_;
+
   bool using_ort_model_bytes_for_initializers_{false};
 
   // Container to store pre-packed weights to share between sessions.
diff --git a/onnxruntime/test/framework/ort_model_only_test.cc b/onnxruntime/test/framework/ort_model_only_test.cc
@@ -37,6 +37,7 @@ struct OrtModelTestInfo {
   bool run_use_buffer{false};
   bool disable_copy_ort_buffer{false};
   bool use_buffer_for_initializers{false};
+  bool use_memory_mapped_load{false};
   TransformerLevel optimization_level = TransformerLevel::Level3;
 };
 
@@ -49,10 +50,15 @@ static void RunOrtModel(const OrtModelTestInfo& test_info) {
 
   if (test_info.disable_copy_ort_buffer) {
     ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseORTModelBytesDirectly, "1"));
+  }
 
-    if (test_info.use_buffer_for_initializers) {
-      ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseORTModelBytesForInitializers, "1"));
-    }
+  if (test_info.use_memory_mapped_load) {
+    ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseMemoryMappedOrtModel, "1"));
+  }
+
+  if (test_info.use_buffer_for_initializers &&
+      (test_info.disable_copy_ort_buffer || (test_info.use_memory_mapped_load && !test_info.run_use_buffer))) {
+    ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseORTModelBytesForInitializers, "1"));
   }
 
   so.graph_optimization_level = test_info.optimization_level;
@@ -557,6 +563,31 @@ TEST(OrtModelOnlyTests, LoadOrtFormatModelFromBufferNoCopyInitializersUseBuffer)
   RunOrtModel(test_info);
 }
 
+// Load the model from a file using memory-mapped I/O.
+// Setting use_memory_mapped_load makes RunOrtModel add the kOrtSessionOptionsConfigUseMemoryMappedOrtModel="1"
+// config entry; every other setting comes from GetTestInfoForLoadOrtFormatModel().
+TEST(OrtModelOnlyTests, LoadOrtFormatModelMemoryMapped) {
+  OrtModelTestInfo test_info = GetTestInfoForLoadOrtFormatModel();
+  test_info.use_memory_mapped_load = true;
+  RunOrtModel(test_info);
+}
+
+// Load the model from a file using memory-mapped I/O, with initializers referencing the mapped bytes.
+// RunOrtModel adds the kOrtSessionOptionsConfigUseORTModelBytesForInitializers entry only when
+// disable_copy_ort_buffer is set, or when memory mapping is requested and run_use_buffer is false.
+// NOTE(review): this presumably relies on the base test info loading from a file path (run_use_buffer == false)
+// -- confirm against GetTestInfoForLoadOrtFormatModel().
+TEST(OrtModelOnlyTests, LoadOrtFormatModelMemoryMappedWithInitializersFromMap) {
+  OrtModelTestInfo test_info = GetTestInfoForLoadOrtFormatModel();
+  test_info.use_memory_mapped_load = true;
+  test_info.use_buffer_for_initializers = true;
+  RunOrtModel(test_info);
+}
+
+// Verify that mmap loading fails gracefully on a non-existent file.
+// Only checks that Load() returns a non-OK status; the error message itself is not inspected.
+TEST(OrtModelOnlyTests, LoadOrtFormatModelMemoryMappedFailsOnMissingFile) {
+  SessionOptions so;
+  so.session_logid = "MemoryMappedMissingFile";
+  // Request the memory-mapped load path so the failure comes from that code path.
+  ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseMemoryMappedOrtModel, "1"));
+  InferenceSessionWrapper session_object{so, GetEnvironment()};
+  auto status = session_object.Load(ORT_TSTR("nonexistent_model.ort"));
+  ASSERT_FALSE(status.IsOK());
+}
+
 // regression test for 2 issues covered by PR #17000 (internally reported issue).
 // 1) allocation planner broke in minimal build when subgraph had no nodes.
 // 2) usage of a sequence data type caused an exception due to IsSparseTensor() throwing
diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc
@@ -187,6 +187,9 @@ ABSL_FLAG(int, spin_backoff_max, 1,
           "legacy single-SpinPause behavior. Values >= 2 enable exp-backoff (typical: 4 or 8) to reduce "
           "CPU/power density during the spin window. Values above 64 are clamped to 64.");
 ABSL_FLAG(bool, n, DefaultPerformanceTestConfig().run_config.exit_after_session_creation, "Allows user to measure session creation time to measure impact of enabling any initialization optimizations.");
+ABSL_FLAG(uint32_t, hold_ms_after_session_creation, DefaultPerformanceTestConfig().run_config.hold_ms_after_session_creation,
+          "When used with -n, keeps the process alive for the specified number of milliseconds after session creation.\n"
+          "Prints 'SESSION_READY' to stdout before sleeping. Useful for multi-process memory measurements.");
 ABSL_FLAG(bool, l, DefaultPerformanceTestConfig().model_info.load_via_path, "Provides file as binary in memory by using fopen before session creation.");
 ABSL_FLAG(bool, g, DefaultPerformanceTestConfig().run_config.enable_cuda_io_binding, "[TensorRT RTX | TensorRT | CUDA] Enables tensor input and output bindings on CUDA before session run.");
 ABSL_FLAG(bool, X, DefaultPerformanceTestConfig().run_config.use_extensions, "Registers custom ops from onnxruntime-extensions.");
@@ -529,6 +532,13 @@ bool CommandLineParser::ParseArguments(PerformanceTestConfig& test_config, int a
   // -n
   test_config.run_config.exit_after_session_creation = absl::GetFlag(FLAGS_n);
 
+  // --hold_ms_after_session_creation
+  test_config.run_config.hold_ms_after_session_creation = absl::GetFlag(FLAGS_hold_ms_after_session_creation);
+  if (test_config.run_config.hold_ms_after_session_creation > 0 &&
+      !test_config.run_config.exit_after_session_creation) {
+    fprintf(stderr, "WARNING: --hold_ms_after_session_creation has no effect without -n.\n");
+  }
+
   // -l
   test_config.model_info.load_via_path = absl::GetFlag(FLAGS_l);
 
diff --git a/onnxruntime/test/perftest/main.cc b/onnxruntime/test/perftest/main.cc
@@ -3,7 +3,10 @@
 
 // onnxruntime dependencies
 #include <core/session/onnxruntime_c_api.h>
+#include <chrono>
+#include <iostream>
 #include <random>
+#include <thread>
 #include "command_args_parser.h"
 #include "performance_runner.h"
 #include "utils.h"
@@ -127,6 +130,11 @@ int RunPerfTest(Ort::Env& env, const perftest::PerformanceTestConfig& test_confi
   // Exit if user enabled -n option so that user can measure session creation time
   if (test_config.run_config.exit_after_session_creation) {
     perf_runner.LogSessionCreationTime();
+    if (test_config.run_config.hold_ms_after_session_creation > 0) {
+      std::cout << "SESSION_READY" << std::endl;
+      std::this_thread::sleep_for(
+          std::chrono::milliseconds(test_config.run_config.hold_ms_after_session_creation));
+    }
     return 0;
   }
 
diff --git a/onnxruntime/test/perftest/test_configuration.h b/onnxruntime/test/perftest/test_configuration.h
@@ -76,6 +76,7 @@ struct RunConfig {
   int spin_backoff_max = 1;  // 1 means no backoff (default)
   bool spin_backoff_max_set = false;
   bool exit_after_session_creation = false;
+  uint32_t hold_ms_after_session_creation{0};
   std::basic_string<ORTCHAR_T> register_custom_op_path;
   bool enable_cuda_io_binding{false};
   bool use_extensions = false;
diff --git a/onnxruntime/test/platform/file_io_test.cc b/onnxruntime/test/platform/file_io_test.cc
@@ -151,6 +151,11 @@ TEST(FileIoTest, MapFileIntoMemory) {
 
     // invalid - negative offset
     ASSERT_FALSE(Env::Default().MapFileIntoMemory(tmp.path.c_str(), -1, 0, mapped_memory).IsOK());
+
+    // invalid - requested length exceeds file size
+    auto status = Env::Default().MapFileIntoMemory(tmp.path.c_str(), 0, expected_data.size() + 1, mapped_memory);
+    ASSERT_FALSE(status.IsOK());
+    ASSERT_NE(status.ErrorMessage().find("too small for the requested mapping"), std::string::npos);
   }
 }
 #else
@@ -184,6 +189,11 @@ TEST(FileIoTest, MapFileIntoMemory) {
 
     // invalid - negative offset
     ASSERT_STATUS_NOT_OK(Env::Default().MapFileIntoMemory(tmp.path.c_str(), -1, 0, mapped_memory));
+
+    // invalid - requested length exceeds file size
+    auto status = Env::Default().MapFileIntoMemory(tmp.path.c_str(), 0, expected_data.size() + 1, mapped_memory);
+    ASSERT_FALSE(status.IsOK());
+    ASSERT_NE(status.ErrorMessage().find("too small for the requested mapping"), std::string::npos);
   }
 }
 #endif
diff --git a/tools/python/benchmark_mmap_ort.py b/tools/python/benchmark_mmap_ort.py

Original file line number	Diff line number	Diff line change
`@@ -151,6 +151,11 @@ TEST(FileIoTest, MapFileIntoMemory) {`
`151`	`151`
`152`	`152`	`// invalid - negative offset`
`153`	`153`	`ASSERT_FALSE(Env::Default().MapFileIntoMemory(tmp.path.c_str(), -1, 0, mapped_memory).IsOK());`
	`154`	`+`
	`155`	`+ // invalid - requested length exceeds file size`
	`156`	`+ auto status = Env::Default().MapFileIntoMemory(tmp.path.c_str(), 0, expected_data.size() + 1, mapped_memory);`
	`157`	`+ ASSERT_FALSE(status.IsOK());`
	`158`	`+ ASSERT_NE(status.ErrorMessage().find("too small for the requested mapping"), std::string::npos);`
`154`	`159`	`}`
`155`	`160`	`}`
`156`	`161`	`#else`
`@@ -184,6 +189,11 @@ TEST(FileIoTest, MapFileIntoMemory) {`
`184`	`189`
`185`	`190`	`// invalid - negative offset`
`186`	`191`	`ASSERT_STATUS_NOT_OK(Env::Default().MapFileIntoMemory(tmp.path.c_str(), -1, 0, mapped_memory));`
	`192`	`+`
	`193`	`+ // invalid - requested length exceeds file size`
	`194`	`+ auto status = Env::Default().MapFileIntoMemory(tmp.path.c_str(), 0, expected_data.size() + 1, mapped_memory);`
	`195`	`+ ASSERT_FALSE(status.IsOK());`
	`196`	`+ ASSERT_NE(status.ErrorMessage().find("too small for the requested mapping"), std::string::npos);`
`187`	`197`	`}`
`188`	`198`	`}`
`189`	`199`	`#endif`