Implement pre-packed blobs serialization on disk and their memory map…

…ping on load (#23069) ### Description  Pre-packing is a feature, that allows kernels to re-arrange weights data to gain performance at interference time Currently, pre-packed blobs are shared when a cross-session weight sharing is enabled and only for those weights that are marked as shared by the user. Otherwise, data resides on the heap, the kernels own the data which may be duplicated. This change enables pre-packed data to be stored on disk alongside with the external initializers. The pre-packed blobs are memory mapped and are loaded into either the X-session shared container or a new container that shares pre-packed blobs within the session. With the new approach, pre-packed blobs are always owned by the shared container using the existing pre-pack mechanism for sharing. When X-session sharing is enabled, then the external container owns the data. A separate container owned by a root `SessionState` owns and shares the data when X-session sharing is not enabled. To facilitate this new approach, we introduce a new container that works in two modes. When an optimized model is being saved, and pre-packed weights saving is enabled, the new container will record pre-packed blobs and serialize them to disk using existing `ToGraphProtoWithExternalInitializers` function. To externalize the pre-packed weights, we introduce a new session option `kOrtSessionOptionsSavePrePackedConstantInitializers.` Note, that pre-packing should be enabled (default) for this to work. `ToGraphProtoWithExternalInitializers`function is modified to recurse into subgraphs to make sure we properly account for local initializer names. In the second mode, the container would simply hold the pre-packed weights memory-mapped from disk and share them with the kernels. ### Motivation and Context  Reduce memory usage by pre-packed initializers and externalize them.
microsoft · Dec 20, 2024 · 00b262d · 00b262d
1 parent 29bccad
commit 00b262d
Show file tree

Hide file tree

Showing 28 changed files with 1,308 additions and 266 deletions.
diff --git a/include/onnxruntime/core/framework/op_kernel.h b/include/onnxruntime/core/framework/op_kernel.h
@@ -7,6 +7,7 @@
 
 // It is safe to include the below header even if SHARED_PROVIDER macro is enabled
 // as it doesn't include any pb headers.
+#include "core/framework/buffer_deleter.h"
 #include "core/framework/prepacked_weights_container.h"
 
 #ifndef SHARED_PROVIDER

diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h
@@ -3,14 +3,15 @@
 
 #pragma once
 
+#include <filesystem>
 #include <functional>
 #include <limits>
 #include <memory>
+#include <optional>
 #include <string>
 #include <type_traits>
 #include <unordered_map>
 #include <unordered_set>
-#include <filesystem>
 
 #include "core/common/flatbuffers.h"
 
@@ -19,13 +20,14 @@
 #include "core/common/common.h"
 #include "core/common/path_string.h"
 #include "core/common/const_pointer_container.h"
+#include "core/common/inlined_containers_fwd.h"
 #if !defined(ORT_MINIMAL_BUILD)
 #include "core/common/inlined_containers.h"
 #endif
-#include "core/common/inlined_containers_fwd.h"
 #include "core/common/span_utils.h"
 #include "core/common/status.h"
 #include "core/common/logging/logging.h"
+#include "core/framework/prepacked_weights_container.h"
 #include "core/graph/onnx_protobuf.h"
 #include "core/graph/basic_types.h"
 #include "core/graph/constants.h"
@@ -41,6 +43,7 @@ namespace onnxruntime {
 class Graph;
 struct IndexedSubGraph;
 class Model;
+struct ModelSavingOptions;
 class OpSignature;
 
 #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
@@ -1153,29 +1156,6 @@ class Graph {  // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
   const ONNX_NAMESPACE::GraphProto& ToGraphProto();
   ONNX_NAMESPACE::GraphProto ToGraphProto() const;
 
-  // Options to align external initializer offset.
-  // For models running on CPU, ORT will try to use mmap to load external initializers.
-  // To use mmap, external initializer need to be offset aligned.
-  // ORT saves external initializers into signle data file, each initializer is accessed with
-  // offset(start position of initializer) and length(byte length of initializer) of the data file.
-  // To use mmap, each offset need to be aligned which means offset need to divisible by
-  // allocation granularity(64KB for windows and 4K for other OSes).
-  // With align_offset to true, ORT will align offset for large initializer when
-  // save ONNX model with external data file.
-  struct OffsetAlignmentInfo {
-    // Offset will always be page aligned and allocation granularity aligned for mmap support.
-    // This is done by padding previous tensor data with zeros keeping same length.
-    bool align_offset = false;
-    // Alignment threshold for size of data.
-    // Having a low threshold will waste file space for small initializers.
-    // Only when tensor's data size is > the page_align_threshold it will be force aligned.
-    // Default to 1MB.
-    int64_t align_threshold = 1048576;
-    // The allocation Granularity for mmap() support.
-    // Typically 64KB for Windows & 4KB for other OSes. Default to 64KB.
-    int64_t allocation_granularity = 65536;
-  };
-
   /** Gets the GraphProto representation of this Graph
   @param external_file_path File path of the binary file to use for initializers.
   @param model_file_path path of the model file.
@@ -1186,15 +1166,7 @@ class Graph {  // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
   */
   ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
                                                                   const std::filesystem::path& model_file_path,
-                                                                  size_t initializer_size_threshold,
-                                                                  const OffsetAlignmentInfo& align_info) const;
-
-  ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path,
-                                                                  const std::filesystem::path& model_file_path,
-                                                                  size_t initializer_size_threshold) const {
-    OffsetAlignmentInfo default_options;
-    return ToGraphProtoWithExternalInitializers(external_file_path, model_file_path, initializer_size_threshold, default_options);
-  }
+                                                                  const ModelSavingOptions& model_saving_options) const;
 
   /** Gets the ISchemaRegistry instances being used with this Graph. */
   IOnnxRuntimeOpSchemaCollectionPtr GetSchemaRegistry() const;
@@ -1400,6 +1372,18 @@ class Graph {  // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
 
 #endif  // !defined(ORT_MINIMAL_BUILD)
 
+  // This function constructs PrepackedSharedContainer in the root graph only
+  // and initializes a reference to it in all (sub)graphs
+  void ConstructPrepackedSharedContainerAndSetMode(bool saving_mode_on);
+
+  const PrepackedWeightsForGraph& GetPrepacked() const noexcept {
+    return *prepacked_weights_for_graph_;
+  }
+
+  PrepackedWeightsForGraph& GetPrepacked() noexcept {
+    return *prepacked_weights_for_graph_;
+  }
+
   /** Returns the Node containing the GraphProto for this Graph instance if IsSubgraph is true */
   const Node* ParentNode() const { return parent_node_; }
 
@@ -1519,6 +1503,31 @@ class Graph {  // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
   Status AddConstantProtoAsInitializer(const ONNX_NAMESPACE::NodeProto& constant_node_proto,
                                        std::optional<std::string_view> new_name);
 
+  /// <summary>
+  /// This function traverses the graph bottom up and externalizes
+  /// constant initializers along with their pre-packed blobs from different
+  /// kernels. Writes constant initializers to the external file with any pre-packed
+  /// blobs (if enabled and produced for this initializer) and then modifies TensorProto
+  /// entry with external data references.
+  /// </summary>
+  /// <param name="model_path">model file path from Model</param>
+  /// <param name="external_file_path">a binary file path for relative to the model file path
+  /// where the initializers data is written</param>
+  /// <param name="model_external_file_path">model file folder path with external file path appended</param>
+  /// <param name="model_saving_options">model saving options including alignment and pre-packs</param>
+  /// <param name="output_graph_proto">The graph proto to be modified</param>
+  /// <param name="external_stream">external file stream</param>
+  /// <param name="external_offset">current external file offset updated with each write</param>
+  /// <returns>Status instance</returns>
+  Status AddExternalInitializersToGraphProtoImpl(
+      const std::filesystem::path& model_path,
+      const std::filesystem::path& external_file_path,
+      const std::filesystem::path& model_external_file_path,
+      const ModelSavingOptions& model_saving_options,
+      ONNX_NAMESPACE::GraphProto& output_graph_proto,
+      std::ostream& external_stream,
+      int64_t& external_offset) const;
+
 #endif
 
   Version IrVersion() const noexcept {
@@ -1703,6 +1712,21 @@ class Graph {  // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
                      std::hash<std::string>, std::equal_to<std::string>>
       sparse_tensor_names_;
 
+  // Prepacked blobs container that stored pre-packed initializers
+  // data that is:
+  // - mem-mapped from disk
+  // - shared within the session
+  // - shared across sessions by transferring the ownership of loaded data entries to
+  // SessionState::PrepackedWeightsContainer* if one is present.
+  // This container is optional because it is present only in the root graph.
+  std::optional<PrepackedKeyToBlobMap> prepacked_key_to_blobs_;
+
+  // This container contains a reference to the root prepacked_key_to_blobs_
+  // and also (in the save mode) records association between the initializer
+  // names and their pre-packed blobs (via keys).
+  // This is optional due to delayed construction.
+  std::optional<PrepackedWeightsForGraph> prepacked_weights_for_graph_;
+
 #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
   // Runtime optimization storage.
   // Note: runtime_optimizations_ == *runtime_optimizations_ptr_ and must be initialized

diff --git a/include/onnxruntime/core/graph/model_saving_options.h b/include/onnxruntime/core/graph/model_saving_options.h
@@ -0,0 +1,44 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+namespace onnxruntime {
+
+class PrepackedWeightsForGraph;
+
+// These options affect how the model initializers are written to the external file.
+// This includes options to align external initializer offset.
+// For models running on CPU, ORT will try to use mmap to load external
+// initializers. To use mmap, external initializer need to be offset aligned.
+// ORT saves external initializers into single data file, each initializer is
+// accessed with offset(start position of initializer) and length(byte length of
+// initializer) of the data file. To use mmap, each offset need to be aligned
+// which means offset need to divisible by allocation granularity(64KB for
+// windows and 4K for other OSes). With align_offset to true, ORT will align
+// offset for large initializer when save ONNX model with external data file.
+struct ModelSavingOptions {
+  explicit ModelSavingOptions(size_t size_threshold)
+      : initializer_size_threshold(size_threshold) {}
+
+  // Mimimal initializer size in bytes to be externalized on disk
+  size_t initializer_size_threshold;
+  // Offset will always be page aligned and allocation granularity aligned for
+  // mmap support. This is done by padding previous tensor data with zeros
+  // keeping same length.
+  bool align_offset = false;
+  // Alignment threshold for size of data.
+  // Having a low threshold will waste file space for small initializers.
+  // Only when tensor's data size is > the page_align_threshold it will be force
+  // aligned. Default to 1MB.
+  int64_t align_threshold = 1048576;
+  // The allocation Granularity for mmap() support.
+  // Typically 64KB for Windows & 4KB for other OSes. Default to 64KB.
+#ifdef _WIN32
+  int64_t allocation_granularity = 65536;
+#else
+  int64_t allocation_granularity = 4096;
+#endif
+};
+
+}  // namespace onnxruntime
diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -250,6 +250,17 @@ static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersFil
 static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes =
     "session.optimized_model_external_initializers_min_size_in_bytes";
 
+// Use this config when saving pre-packed constant initializers to an external data file.
+// This allows you to memory map pre-packed initializers on model load and leave it to
+// to the OS the amount of memory consumed by the pre-packed initializers. Otherwise,
+// pre-packed data resides on the heap.
+//
+// - "0": Default is not save pre-packed initializers to a data file.
+// - "1": Save pre-packed constant initializers to an external data file.
+// Sample usage: sess_options.add_session_config_entry(kOrtSessionOptionsSavePrePackedConstantInitializers,  "1")
+static const char* const kOrtSessionOptionsSavePrePackedConstantInitializers =
+    "session.save_external_prepacked_constant_initializers";
+
 // Enable EP context feature to dump the partitioned graph which includes the EP context into Onnx file.
 // The dumped Onnx model with EP context can be used for future inference to avoid the EP graph partitioning/compile overhead.
 // "0": disable. (default)

diff --git a/onnxruntime/core/framework/prepacked_weights.h b/onnxruntime/core/framework/prepacked_weights.h
@@ -6,7 +6,8 @@
 #include <vector>
 
 #include "core/common/basic_types.h"
-#include "core/framework/buffer_deleter.h"
+#include "core/common/inlined_containers_fwd.h"
+#include "core/framework/allocator.h"
 #include "core/framework/tensor_shape.h"
 
 namespace onnxruntime {
@@ -16,11 +17,14 @@ struct PrePackedWeights final {
   // Hence we hold them in container. It is upto the developer implementing each PrePack()
   // method to define what gets stored in which position of the container.
 
-  std::vector<IAllocatorUniquePtr<void>> buffers_;  // cache pre-packed buffers associated with the kernel
-  std::vector<size_t> buffer_sizes_;                // cache sizes of pre-packed buffers (in bytes)
+  InlinedVector<IAllocatorUniquePtr<void>> buffers_;  // cache pre-packed buffers associated with the kernel
+  InlinedVector<size_t> buffer_sizes_;                // cache sizes of pre-packed buffers (in bytes)
 
   // Produces a hash of the buffers stored in the given instance of this class
   HashValue GetHash() const;
+
+  // The function creates a copy with non-owning BufferUniquePtrs.
+  PrePackedWeights CreateReferringCopy() const;
 };
 
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/framework/prepacked_weights_container.cc b/onnxruntime/core/framework/prepacked_weights_container.cc
@@ -3,9 +3,21 @@
 
 #include "core/framework/prepacked_weights_container.h"
 #include "core/framework/allocator_utils.h"
+#include "core/graph/graph.h"
 
 namespace onnxruntime {
 
+PrePackedWeights PrePackedWeights::CreateReferringCopy() const {
+  PrePackedWeights copy;
+  for (const auto& prepacked_buffer : buffers_) {
+    // No deleter is needed as the buffer is not owned by the unique_ptr
+    copy.buffers_.emplace_back(prepacked_buffer.get(), [](void*) {});
+  }
+
+  copy.buffer_sizes_ = buffer_sizes_;
+  return copy;
+}
+
 AllocatorPtr PrepackedWeightsContainer::GetOrCreateAllocator(const std::string& device_name) {
   auto iter = allocators_.find(device_name);
 
@@ -49,4 +61,50 @@ size_t PrepackedWeightsContainer::GetNumberOfElements() const {
   return prepacked_weights_map_.size();
 }
 
+void PrepackedWeightsForGraph::InsertPrepackedWeights(const std::string& key, PrePackedWeights&& packed_weight) {
+  // We may have duplicate entries mapped from disk if the same weight is pre-packed from subgraphs and
+  // up the tree by the same kernel with the same result. The map prevents this from happening.
+  key_to_blobs_.emplace(key, std::move(packed_weight));
+}
+
+void PrepackedWeightsForGraph::WritePackedMaybeForSave(const std::string& weight_name, const std::string& key,
+                                                       PrePackedWeights&& packed_weight) {
+  key_to_blobs_.insert_or_assign(key, std::move(packed_weight));
+
+  if (save_mode_on_) {
+    weight_prepacks_for_saving_[weight_name].insert(key);
+  }
+}
+
+const PrePackedWeights* PrepackedWeightsForGraph::GetPrepackedWeights(const std::string& key) const {
+  auto it = key_to_blobs_.find(key);
+  if (it == key_to_blobs_.end()) {
+    return nullptr;
+  }
+  return &it->second;
+}
+
+std::optional<PrePackedWeights> PrepackedWeightsForGraph::ReplaceWithReferenceIfSaving(
+    const std::string& weight_name,
+    const std::string& key,
+    const PrePackedWeights& refer_to_if_absent) {
+  auto it = key_to_blobs_.find(key);
+  if (it == key_to_blobs_.end()) {
+    if (save_mode_on_) {
+      key_to_blobs_.emplace(key, refer_to_if_absent.CreateReferringCopy());
+      weight_prepacks_for_saving_[weight_name].insert(key);
+    }
+    return std::nullopt;
+  }
+
+  PrePackedWeights result = std::move(it->second);
+  if (save_mode_on_) {
+    it->second = result.CreateReferringCopy();
+    weight_prepacks_for_saving_[weight_name].insert(key);
+  } else {
+    key_to_blobs_.erase(it);
+  }
+  return result;
+}
+
 }  // namespace onnxruntime