Skip to content

Commit 8c546c3

Browse files
sanaa-hamel-microsoft, Kevin-Taha, Kevin Taha, Copilot, yuslepukhin
authored
1.26.0 - cherry-pick for RC2 (#28347)
This cherry-picks the following commits for the release:

| Commit ID | PR Number | Commit Title |
|-----------|-----------|-------------|
| 9d1492a | #28164 | Add option to memory map .ORT model loads |

Co-authored-by: Kevin Taha <tahakevin@gmail.com>
Co-authored-by: Kevin Taha <kevintaha@microsoft.com>
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Co-authored-by: Dmitri Smirnov <dmitrism@microsoft.com>
Co-authored-by: Dmitri Smirnov <yuslepukhin@users.noreply.github.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 55c5c82 commit 8c546c3

11 files changed

Lines changed: 531 additions & 8 deletions

File tree

include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,13 +211,23 @@ static const char* const kOrtSessionOptionsConfigUseORTModelBytesDirectly = "ses
211211
/// <summary>
212212
/// Key for using the ORT format model flatbuffer bytes directly for initializers.
213213
/// This avoids copying the bytes and reduces peak memory usage during model loading and initialization.
214-
/// Requires `session.use_ort_model_bytes_directly` to be true.
214+
/// Requires `session.use_ort_model_bytes_directly` or `session.use_memory_mapped_ort_model` to be true.
215215
/// If set, the flatbuffer bytes provided when creating the InferenceSession MUST remain valid for the entire
216216
/// duration of the InferenceSession.
217217
/// </summary>
218218
static const char* const kOrtSessionOptionsConfigUseORTModelBytesForInitializers =
219219
"session.use_ort_model_bytes_for_initializers";
220220

221+
/// <summary>
222+
/// Key for using memory-mapped I/O to load ORT format model files.
223+
/// When set to "1" and the session is created from a file path, ORT will use memory-mapped I/O
224+
/// to load the .ort model file instead of reading it into a heap-allocated buffer.
225+
/// When used together with session.use_ort_model_bytes_for_initializers, Tensors point directly to the mapped bytes;
226+
/// in that case the mapping must remain valid and the model weights are immutable.
227+
/// If the mapping fails, the model load fails; ORT does not fall back to a regular read, so callers must handle any fallback.
228+
/// </summary>
229+
static const char* const kOrtSessionOptionsConfigUseMemoryMappedOrtModel = "session.use_memory_mapped_ort_model";
230+
221231
// This should only be specified when exporting an ORT format model for use on a different platform.
222232
// If the ORT format model will be used on ARM platforms set to "1". For other platforms set to "0"
223233
// Available since version 1.11.

onnxruntime/core/platform/posix/env.cc

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ limitations under the License.
5454
#include <gsl/gsl>
5555
#include "core/common/logging/logging.h"
5656
#include "core/common/narrow.h"
57+
#include "core/common/safeint.h"
5758
#include "core/platform/scoped_resource.h"
5859
#include "core/platform/EigenNonBlockingThreadPool.h"
5960

@@ -430,9 +431,21 @@ class PosixEnv : public Env {
430431
return Status::OK();
431432
}
432433

434+
// Validate that the file is large enough for the requested mapping.
435+
struct stat file_stat;
436+
if (fstat(file_descriptor.Get(), &file_stat) != 0) {
437+
return ReportSystemError("fstat", file_path);
438+
}
439+
const size_t requested_end = SafeInt<size_t>(offset) + length;
440+
ORT_RETURN_IF(static_cast<size_t>(file_stat.st_size) < requested_end,
441+
"File \"", file_path,
442+
"\" is too small for the requested mapping (file size: ",
443+
file_stat.st_size, " bytes, requested offset + length: ",
444+
requested_end, " bytes).");
445+
433446
static const size_t page_size = narrow<size_t>(sysconf(_SC_PAGESIZE));
434447
const FileOffsetType offset_to_page = offset % static_cast<FileOffsetType>(page_size);
435-
const size_t mapped_length = length + static_cast<size_t>(offset_to_page);
448+
const size_t mapped_length = SafeInt<size_t>(length) + static_cast<size_t>(offset_to_page);
436449
const FileOffsetType mapped_offset = offset - offset_to_page;
437450
void* const mapped_base =
438451
mmap(nullptr, mapped_length, PROT_READ | PROT_WRITE, MAP_PRIVATE, file_descriptor.Get(), mapped_offset);

onnxruntime/core/platform/windows/env.cc

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,22 @@ Status WindowsEnv::MapFileIntoMemory(_In_z_ const ORTCHAR_T* file_path,
424424
" - ", std::system_category().message(error_code));
425425
}
426426

427+
// Validate that the file is large enough for the requested mapping.
428+
LARGE_INTEGER actual_size;
429+
if (!GetFileSizeEx(file_handle.get(), &actual_size)) {
430+
const auto error_code = GetLastError();
431+
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
432+
"GetFileSizeEx ", ToUTF8String(Basename(file_path)),
433+
" fail, errcode = ", error_code,
434+
" - ", std::system_category().message(error_code));
435+
}
436+
const size_t requested_end = SafeInt<size_t>(offset) + length;
437+
ORT_RETURN_IF(static_cast<ULONGLONG>(actual_size.QuadPart) < requested_end,
438+
"File ", ToUTF8String(Basename(file_path)),
439+
" is too small for the requested mapping (file size: ",
440+
actual_size.QuadPart, " bytes, requested offset + length: ",
441+
requested_end, " bytes).");
442+
427443
wil::unique_hfile file_mapping_handle{
428444
CreateFileMappingW(file_handle.get(),
429445
nullptr,

onnxruntime/core/session/inference_session.cc

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1747,10 +1747,36 @@ static Status LoadOrtModelBytes(const PathString& model_uri,
17471747
return Status::OK();
17481748
}
17491749

1750+
static Status LoadOrtModelBytesMapped(const PathString& model_uri,
1751+
gsl::span<const uint8_t>& bytes,
1752+
Env::MappedMemoryPtr& mapped_memory) {
1753+
size_t num_bytes = 0;
1754+
ORT_RETURN_IF_ERROR(Env::Default().GetFileLength(model_uri.c_str(), num_bytes));
1755+
ORT_RETURN_IF(num_bytes == 0, "Cannot memory-map an empty file: ", ToUTF8String(model_uri));
1756+
1757+
ORT_RETURN_IF_ERROR(Env::Default().MapFileIntoMemory(model_uri.c_str(), 0, num_bytes, mapped_memory));
1758+
1759+
bytes = gsl::span<const uint8_t>(reinterpret_cast<const uint8_t*>(mapped_memory.get()), num_bytes);
1760+
1761+
return Status::OK();
1762+
}
1763+
17501764
Status InferenceSession::LoadOrtModel(const PathString& model_uri) {
17511765
return LoadOrtModelWithLoader(
17521766
[&]() {
17531767
model_location_ = model_uri;
1768+
1769+
const auto& config_options = GetSessionOptions().config_options;
1770+
const bool use_mmap =
1771+
config_options.GetConfigOrDefault(kOrtSessionOptionsConfigUseMemoryMappedOrtModel, "0") == "1";
1772+
1773+
if (use_mmap) {
1774+
ORT_RETURN_IF_ERROR(
1775+
LoadOrtModelBytesMapped(model_location_, ort_format_model_bytes_, ort_format_model_mapped_memory_));
1776+
LOGS(*session_logger_, INFO) << "ORT model loaded via memory-mapped I/O.";
1777+
return Status::OK();
1778+
}
1779+
17541780
ORT_RETURN_IF_ERROR(
17551781
LoadOrtModelBytes(model_location_, ort_format_model_bytes_, ort_format_model_bytes_data_holder_));
17561782
return Status::OK();
@@ -1760,6 +1786,11 @@ Status InferenceSession::LoadOrtModel(const PathString& model_uri) {
17601786
Status InferenceSession::LoadOrtModel(const void* model_data, int model_data_len) {
17611787
return LoadOrtModelWithLoader([&]() {
17621788
const auto& config_options = GetSessionOptions().config_options;
1789+
1790+
if (config_options.GetConfigOrDefault(kOrtSessionOptionsConfigUseMemoryMappedOrtModel, "0") == "1") {
1791+
LOGS(*session_logger_, WARNING) << "session.use_memory_mapped_ort_model is ignored when loading from a buffer.";
1792+
}
1793+
17631794
const auto use_ort_model_bytes_directly =
17641795
config_options.GetConfigOrDefault(kOrtSessionOptionsConfigUseORTModelBytesDirectly, "0") == "1";
17651796

@@ -1858,8 +1889,8 @@ Status InferenceSession::LoadOrtModelWithLoader(std::function<Status()> load_ort
18581889
ORT_RETURN_IF(nullptr == fbs_model, "Missing Model. Invalid ORT format model.");
18591890

18601891
// if we're using the bytes directly because kOrtSessionOptionsConfigUseORTModelBytesDirectly was set and the user
1861-
// provided an existing buffer of bytes when creating the InferenceSession, ort_format_model_bytes_data_holder_
1862-
// will be empty.
1892+
// provided an existing buffer of bytes when creating the InferenceSession, or because we memory-mapped the file,
1893+
// ort_format_model_bytes_data_holder_ will be empty.
18631894
// if that is the case we also allow creating initializers that directly use those bytes.
18641895
const auto& config_options = session_options_.config_options;
18651896
using_ort_model_bytes_for_initializers_ =
@@ -2681,6 +2712,7 @@ common::Status InferenceSession::Initialize() {
26812712
if (!using_ort_model_bytes_for_initializers_) {
26822713
ort_format_model_bytes_ = gsl::span<const uint8_t>();
26832714
std::vector<uint8_t>().swap(ort_format_model_bytes_data_holder_);
2715+
ort_format_model_mapped_memory_.reset();
26842716
}
26852717

26862718
// once the model is saved, we may remove unnecessary attributes for inference

onnxruntime/core/session/inference_session.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include "core/common/path_string.h"
1616
#include "core/common/profiler.h"
1717
#include "core/common/status.h"
18+
#include "core/platform/env.h"
1819
#include "core/framework/execution_providers.h"
1920
#include "core/framework/framework_common.h"
2021
#include "core/framework/iexecutor.h"
@@ -1028,6 +1029,8 @@ class InferenceSession {
10281029
// We store them currently in the ort_format_model_bytes_data_holder_ to make the Load + Initialize
10291030
// behave the same way as for an ONNX model, as we need some of the bytes for the Load (create the Model)
10301031
// and some for the Initialize (create SessionState).
1032+
// If "session.use_memory_mapped_ort_model" is set, we memory-map the file instead and store the
1033+
// mapping in ort_format_model_mapped_memory_.
10311034
// Short term we free them after Initialize.
10321035
// Longer term we may want to directly refer to offsets in this buffer for initializers so we don't need to copy
10331036
// those into new OrtValue instances, at which point we won't free them until the InferenceSession goes away.
@@ -1036,9 +1039,13 @@ class InferenceSession {
10361039
// This holds the actual model data
10371040
// In case if the session is started with an input byte array contains model data, and the caller
10381041
// specifies that ORT should use the model bytes directly by setting the session config option
1039-
// "session.use_ort_model_bytes_directly" to "1", this will be empty
1042+
// "session.use_ort_model_bytes_directly" to "1", this will be empty.
1043+
// Also empty when using memory-mapped loading, as the data is held by ort_format_model_mapped_memory_.
10401044
std::vector<uint8_t> ort_format_model_bytes_data_holder_;
10411045

1046+
// Holds the memory-mapped file data when session.use_memory_mapped_ort_model is set.
1047+
Env::MappedMemoryPtr ort_format_model_mapped_memory_;
1048+
10421049
bool using_ort_model_bytes_for_initializers_{false};
10431050

10441051
// Container to store pre-packed weights to share between sessions.

onnxruntime/test/framework/ort_model_only_test.cc

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ struct OrtModelTestInfo {
3737
bool run_use_buffer{false};
3838
bool disable_copy_ort_buffer{false};
3939
bool use_buffer_for_initializers{false};
40+
bool use_memory_mapped_load{false};
4041
TransformerLevel optimization_level = TransformerLevel::Level3;
4142
};
4243

@@ -49,10 +50,15 @@ static void RunOrtModel(const OrtModelTestInfo& test_info) {
4950

5051
if (test_info.disable_copy_ort_buffer) {
5152
ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseORTModelBytesDirectly, "1"));
53+
}
5254

53-
if (test_info.use_buffer_for_initializers) {
54-
ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseORTModelBytesForInitializers, "1"));
55-
}
55+
if (test_info.use_memory_mapped_load) {
56+
ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseMemoryMappedOrtModel, "1"));
57+
}
58+
59+
if (test_info.use_buffer_for_initializers &&
60+
(test_info.disable_copy_ort_buffer || (test_info.use_memory_mapped_load && !test_info.run_use_buffer))) {
61+
ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseORTModelBytesForInitializers, "1"));
5662
}
5763

5864
so.graph_optimization_level = test_info.optimization_level;
@@ -557,6 +563,31 @@ TEST(OrtModelOnlyTests, LoadOrtFormatModelFromBufferNoCopyInitializersUseBuffer)
557563
RunOrtModel(test_info);
558564
}
559565

566+
// Load the model from a file using memory-mapped I/O
567+
TEST(OrtModelOnlyTests, LoadOrtFormatModelMemoryMapped) {
568+
OrtModelTestInfo test_info = GetTestInfoForLoadOrtFormatModel();
569+
test_info.use_memory_mapped_load = true;
570+
RunOrtModel(test_info);
571+
}
572+
573+
// Load the model from a file using memory-mapped I/O, with initializers referencing the mapped bytes
574+
TEST(OrtModelOnlyTests, LoadOrtFormatModelMemoryMappedWithInitializersFromMap) {
575+
OrtModelTestInfo test_info = GetTestInfoForLoadOrtFormatModel();
576+
test_info.use_memory_mapped_load = true;
577+
test_info.use_buffer_for_initializers = true;
578+
RunOrtModel(test_info);
579+
}
580+
581+
// Verify that mmap loading fails gracefully on a non-existent file
582+
TEST(OrtModelOnlyTests, LoadOrtFormatModelMemoryMappedFailsOnMissingFile) {
583+
SessionOptions so;
584+
so.session_logid = "MemoryMappedMissingFile";
585+
ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseMemoryMappedOrtModel, "1"));
586+
InferenceSessionWrapper session_object{so, GetEnvironment()};
587+
auto status = session_object.Load(ORT_TSTR("nonexistent_model.ort"));
588+
ASSERT_FALSE(status.IsOK());
589+
}
590+
560591
// regression test for 2 issues covered by PR #17000 (internally reported issue).
561592
// 1) allocation planner broke in minimal build when subgraph had no nodes.
562593
// 2) usage of a sequence data type caused an exception due to IsSparseTensor() throwing

onnxruntime/test/perftest/command_args_parser.cc

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,9 @@ ABSL_FLAG(int, spin_backoff_max, 1,
187187
"legacy single-SpinPause behavior. Values >= 2 enable exp-backoff (typical: 4 or 8) to reduce "
188188
"CPU/power density during the spin window. Values above 64 are clamped to 64.");
189189
ABSL_FLAG(bool, n, DefaultPerformanceTestConfig().run_config.exit_after_session_creation, "Allows user to measure session creation time to measure impact of enabling any initialization optimizations.");
190+
ABSL_FLAG(uint32_t, hold_ms_after_session_creation, DefaultPerformanceTestConfig().run_config.hold_ms_after_session_creation,
191+
"When used with -n, keeps the process alive for the specified number of milliseconds after session creation.\n"
192+
"Prints 'SESSION_READY' to stdout before sleeping. Useful for multi-process memory measurements.");
190193
ABSL_FLAG(bool, l, DefaultPerformanceTestConfig().model_info.load_via_path, "Provides file as binary in memory by using fopen before session creation.");
191194
ABSL_FLAG(bool, g, DefaultPerformanceTestConfig().run_config.enable_cuda_io_binding, "[TensorRT RTX | TensorRT | CUDA] Enables tensor input and output bindings on CUDA before session run.");
192195
ABSL_FLAG(bool, X, DefaultPerformanceTestConfig().run_config.use_extensions, "Registers custom ops from onnxruntime-extensions.");
@@ -529,6 +532,13 @@ bool CommandLineParser::ParseArguments(PerformanceTestConfig& test_config, int a
529532
// -n
530533
test_config.run_config.exit_after_session_creation = absl::GetFlag(FLAGS_n);
531534

535+
// --hold_ms_after_session_creation
536+
test_config.run_config.hold_ms_after_session_creation = absl::GetFlag(FLAGS_hold_ms_after_session_creation);
537+
if (test_config.run_config.hold_ms_after_session_creation > 0 &&
538+
!test_config.run_config.exit_after_session_creation) {
539+
fprintf(stderr, "WARNING: --hold_ms_after_session_creation has no effect without -n.\n");
540+
}
541+
532542
// -l
533543
test_config.model_info.load_via_path = absl::GetFlag(FLAGS_l);
534544

onnxruntime/test/perftest/main.cc

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,10 @@
33

44
// onnxruntime dependencies
55
#include <core/session/onnxruntime_c_api.h>
6+
#include <chrono>
7+
#include <iostream>
68
#include <random>
9+
#include <thread>
710
#include "command_args_parser.h"
811
#include "performance_runner.h"
912
#include "utils.h"
@@ -127,6 +130,11 @@ int RunPerfTest(Ort::Env& env, const perftest::PerformanceTestConfig& test_confi
127130
// Exit if user enabled -n option so that user can measure session creation time
128131
if (test_config.run_config.exit_after_session_creation) {
129132
perf_runner.LogSessionCreationTime();
133+
if (test_config.run_config.hold_ms_after_session_creation > 0) {
134+
std::cout << "SESSION_READY" << std::endl;
135+
std::this_thread::sleep_for(
136+
std::chrono::milliseconds(test_config.run_config.hold_ms_after_session_creation));
137+
}
130138
return 0;
131139
}
132140

onnxruntime/test/perftest/test_configuration.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ struct RunConfig {
7676
int spin_backoff_max = 1; // 1 means no backoff (default)
7777
bool spin_backoff_max_set = false;
7878
bool exit_after_session_creation = false;
79+
uint32_t hold_ms_after_session_creation{0};
7980
std::basic_string<ORTCHAR_T> register_custom_op_path;
8081
bool enable_cuda_io_binding{false};
8182
bool use_extensions = false;

onnxruntime/test/platform/file_io_test.cc

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,11 @@ TEST(FileIoTest, MapFileIntoMemory) {
151151

152152
// invalid - negative offset
153153
ASSERT_FALSE(Env::Default().MapFileIntoMemory(tmp.path.c_str(), -1, 0, mapped_memory).IsOK());
154+
155+
// invalid - requested length exceeds file size
156+
auto status = Env::Default().MapFileIntoMemory(tmp.path.c_str(), 0, expected_data.size() + 1, mapped_memory);
157+
ASSERT_FALSE(status.IsOK());
158+
ASSERT_NE(status.ErrorMessage().find("too small for the requested mapping"), std::string::npos);
154159
}
155160
}
156161
#else
@@ -184,6 +189,11 @@ TEST(FileIoTest, MapFileIntoMemory) {
184189

185190
// invalid - negative offset
186191
ASSERT_STATUS_NOT_OK(Env::Default().MapFileIntoMemory(tmp.path.c_str(), -1, 0, mapped_memory));
192+
193+
// invalid - requested length exceeds file size
194+
auto status = Env::Default().MapFileIntoMemory(tmp.path.c_str(), 0, expected_data.size() + 1, mapped_memory);
195+
ASSERT_FALSE(status.IsOK());
196+
ASSERT_NE(status.ErrorMessage().find("too small for the requested mapping"), std::string::npos);
187197
}
188198
}
189199
#endif

0 commit comments

Comments
 (0)