diff --git a/include/onnxruntime/core/framework/op_kernel.h b/include/onnxruntime/core/framework/op_kernel.h index 07625c38d8474..375f0a4dc8dd2 100644 --- a/include/onnxruntime/core/framework/op_kernel.h +++ b/include/onnxruntime/core/framework/op_kernel.h @@ -7,6 +7,7 @@ // It is safe to include the below header even if SHARED_PROVIDER macro is enabled // as it doesn't include any pb headers. +#include "core/framework/buffer_deleter.h" #include "core/framework/prepacked_weights_container.h" #ifndef SHARED_PROVIDER diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h index eb9581e8018d1..7798394b045dc 100644 --- a/include/onnxruntime/core/graph/graph.h +++ b/include/onnxruntime/core/graph/graph.h @@ -3,14 +3,15 @@ #pragma once +#include #include #include #include +#include #include #include #include #include -#include #include "core/common/flatbuffers.h" @@ -19,13 +20,14 @@ #include "core/common/common.h" #include "core/common/path_string.h" #include "core/common/const_pointer_container.h" +#include "core/common/inlined_containers_fwd.h" #if !defined(ORT_MINIMAL_BUILD) #include "core/common/inlined_containers.h" #endif -#include "core/common/inlined_containers_fwd.h" #include "core/common/span_utils.h" #include "core/common/status.h" #include "core/common/logging/logging.h" +#include "core/framework/prepacked_weights_container.h" #include "core/graph/onnx_protobuf.h" #include "core/graph/basic_types.h" #include "core/graph/constants.h" @@ -41,6 +43,7 @@ namespace onnxruntime { class Graph; struct IndexedSubGraph; class Model; +struct ModelSavingOptions; class OpSignature; #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) @@ -1153,29 +1156,6 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi const ONNX_NAMESPACE::GraphProto& ToGraphProto(); ONNX_NAMESPACE::GraphProto ToGraphProto() const; - // Options to align external initializer offset. 
- // For models running on CPU, ORT will try to use mmap to load external initializers. - // To use mmap, external initializer need to be offset aligned. - // ORT saves external initializers into signle data file, each initializer is accessed with - // offset(start position of initializer) and length(byte length of initializer) of the data file. - // To use mmap, each offset need to be aligned which means offset need to divisible by - // allocation granularity(64KB for windows and 4K for other OSes). - // With align_offset to true, ORT will align offset for large initializer when - // save ONNX model with external data file. - struct OffsetAlignmentInfo { - // Offset will always be page aligned and allocation granularity aligned for mmap support. - // This is done by padding previous tensor data with zeros keeping same length. - bool align_offset = false; - // Alignment threshold for size of data. - // Having a low threshold will waste file space for small initializers. - // Only when tensor's data size is > the page_align_threshold it will be force aligned. - // Default to 1MB. - int64_t align_threshold = 1048576; - // The allocation Granularity for mmap() support. - // Typically 64KB for Windows & 4KB for other OSes. Default to 64KB. - int64_t allocation_granularity = 65536; - }; - /** Gets the GraphProto representation of this Graph @param external_file_path File path of the binary file to use for initializers. @param model_file_path path of the model file. 
@@ -1186,15 +1166,7 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi */ ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path, const std::filesystem::path& model_file_path, - size_t initializer_size_threshold, - const OffsetAlignmentInfo& align_info) const; - - ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path, - const std::filesystem::path& model_file_path, - size_t initializer_size_threshold) const { - OffsetAlignmentInfo default_options; - return ToGraphProtoWithExternalInitializers(external_file_path, model_file_path, initializer_size_threshold, default_options); - } + const ModelSavingOptions& model_saving_options) const; /** Gets the ISchemaRegistry instances being used with this Graph. */ IOnnxRuntimeOpSchemaCollectionPtr GetSchemaRegistry() const; @@ -1400,6 +1372,18 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi #endif // !defined(ORT_MINIMAL_BUILD) + // This function constructs PrepackedSharedContainer in the root graph only + // and initializes a reference to it in all (sub)graphs + void ConstructPrepackedSharedContainerAndSetMode(bool saving_mode_on); + + const PrepackedWeightsForGraph& GetPrepacked() const noexcept { + return *prepacked_weights_for_graph_; + } + + PrepackedWeightsForGraph& GetPrepacked() noexcept { + return *prepacked_weights_for_graph_; + } + /** Returns the Node containing the GraphProto for this Graph instance if IsSubgraph is true */ const Node* ParentNode() const { return parent_node_; } @@ -1519,6 +1503,31 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi Status AddConstantProtoAsInitializer(const ONNX_NAMESPACE::NodeProto& constant_node_proto, std::optional new_name); + /// + /// This function traverses the graph bottom up and externalizes + /// constant initializers along with their pre-packed blobs from 
different + /// kernels. Writes constant initializers to the external file with any pre-packed + /// blobs (if enabled and produced for this initializer) and then modifies TensorProto + /// entry with external data references. + /// + /// model file path from Model + /// a binary file path for relative to the model file path + /// where the initializers data is written + /// model file folder path with external file path appended + /// model saving options including alignment and pre-packs + /// The graph proto to be modified + /// external file stream + /// current external file offset updated with each write + /// Status instance + Status AddExternalInitializersToGraphProtoImpl( + const std::filesystem::path& model_path, + const std::filesystem::path& external_file_path, + const std::filesystem::path& model_external_file_path, + const ModelSavingOptions& model_saving_options, + ONNX_NAMESPACE::GraphProto& output_graph_proto, + std::ostream& external_stream, + int64_t& external_offset) const; + #endif Version IrVersion() const noexcept { @@ -1703,6 +1712,21 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi std::hash, std::equal_to> sparse_tensor_names_; + // Prepacked blobs container that stored pre-packed initializers + // data that is: + // - mem-mapped from disk + // - shared within the session + // - shared across sessions by transferring the ownership of loaded data entries to + // SessionState::PrepackedWeightsContainer* if one is present. + // This container is optional because it is present only in the root graph. + std::optional prepacked_key_to_blobs_; + + // This container contains a reference to the root prepacked_key_to_blobs_ + // and also (in the save mode) records association between the initializer + // names and their pre-packed blobs (via keys). + // This is optional due to delayed construction. 
+ std::optional prepacked_weights_for_graph_; + #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) // Runtime optimization storage. // Note: runtime_optimizations_ == *runtime_optimizations_ptr_ and must be initialized diff --git a/include/onnxruntime/core/graph/model_saving_options.h b/include/onnxruntime/core/graph/model_saving_options.h new file mode 100644 index 0000000000000..924799f15b247 --- /dev/null +++ b/include/onnxruntime/core/graph/model_saving_options.h @@ -0,0 +1,44 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +namespace onnxruntime { + +class PrepackedWeightsForGraph; + +// These options affect how the model initializers are written to the external file. +// This includes options to align external initializer offset. +// For models running on CPU, ORT will try to use mmap to load external +// initializers. To use mmap, external initializer need to be offset aligned. +// ORT saves external initializers into single data file, each initializer is +// accessed with offset(start position of initializer) and length(byte length of +// initializer) of the data file. To use mmap, each offset need to be aligned +// which means offset need to divisible by allocation granularity(64KB for +// windows and 4K for other OSes). With align_offset to true, ORT will align +// offset for large initializer when save ONNX model with external data file. +struct ModelSavingOptions { + explicit ModelSavingOptions(size_t size_threshold) + : initializer_size_threshold(size_threshold) {} + + // Minimal initializer size in bytes to be externalized on disk + size_t initializer_size_threshold; + // Offset will always be page aligned and allocation granularity aligned for + // mmap support. This is done by padding previous tensor data with zeros + // keeping same length. + bool align_offset = false; + // Alignment threshold for size of data. 
+ // Having a low threshold will waste file space for small initializers. + // Only when tensor's data size is > the page_align_threshold it will be force + // aligned. Default to 1MB. + int64_t align_threshold = 1048576; + // The allocation Granularity for mmap() support. + // Typically 64KB for Windows & 4KB for other OSes. Default to 64KB. +#ifdef _WIN32 + int64_t allocation_granularity = 65536; +#else + int64_t allocation_granularity = 4096; +#endif +}; + +} // namespace onnxruntime diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h index 8f1bc98ce7b49..64a4dd19c12b0 100644 --- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h +++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h @@ -250,6 +250,17 @@ static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersFil static const char* const kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes = "session.optimized_model_external_initializers_min_size_in_bytes"; +// Use this config when saving pre-packed constant initializers to an external data file. +// This allows you to memory map pre-packed initializers on model load and leave it +// to the OS the amount of memory consumed by the pre-packed initializers. Otherwise, +// pre-packed data resides on the heap. +// +// - "0": Default is to not save pre-packed initializers to a data file. +// - "1": Save pre-packed constant initializers to an external data file. +// Sample usage: sess_options.add_session_config_entry(kOrtSessionOptionsSavePrePackedConstantInitializers, "1") +static const char* const kOrtSessionOptionsSavePrePackedConstantInitializers = + "session.save_external_prepacked_constant_initializers"; + + // Enable EP context feature to dump the partitioned graph which includes the EP context into Onnx file. 
// The dumped Onnx model with EP context can be used for future inference to avoid the EP graph partitioning/compile overhead. // "0": disable. (default) diff --git a/onnxruntime/core/framework/prepacked_weights.h b/onnxruntime/core/framework/prepacked_weights.h index fbf99b81937ee..9695be1e0554c 100644 --- a/onnxruntime/core/framework/prepacked_weights.h +++ b/onnxruntime/core/framework/prepacked_weights.h @@ -6,7 +6,8 @@ #include #include "core/common/basic_types.h" -#include "core/framework/buffer_deleter.h" +#include "core/common/inlined_containers_fwd.h" +#include "core/framework/allocator.h" #include "core/framework/tensor_shape.h" namespace onnxruntime { @@ -16,11 +17,14 @@ struct PrePackedWeights final { // Hence we hold them in container. It is upto the developer implementing each PrePack() // method to define what gets stored in which position of the container. - std::vector> buffers_; // cache pre-packed buffers associated with the kernel - std::vector buffer_sizes_; // cache sizes of pre-packed buffers (in bytes) + InlinedVector> buffers_; // cache pre-packed buffers associated with the kernel + InlinedVector buffer_sizes_; // cache sizes of pre-packed buffers (in bytes) // Produces a hash of the buffers stored in the given instance of this class HashValue GetHash() const; + + // The function creates a copy with non-owning BufferUniquePtrs. 
+ PrePackedWeights CreateReferringCopy() const; }; } // namespace onnxruntime diff --git a/onnxruntime/core/framework/prepacked_weights_container.cc b/onnxruntime/core/framework/prepacked_weights_container.cc index b6d44dd248bdd..7c832a0ac2691 100644 --- a/onnxruntime/core/framework/prepacked_weights_container.cc +++ b/onnxruntime/core/framework/prepacked_weights_container.cc @@ -3,9 +3,21 @@ #include "core/framework/prepacked_weights_container.h" #include "core/framework/allocator_utils.h" +#include "core/graph/graph.h" namespace onnxruntime { +PrePackedWeights PrePackedWeights::CreateReferringCopy() const { + PrePackedWeights copy; + for (const auto& prepacked_buffer : buffers_) { + // No deleter is needed as the buffer is not owned by the unique_ptr + copy.buffers_.emplace_back(prepacked_buffer.get(), [](void*) {}); + } + + copy.buffer_sizes_ = buffer_sizes_; + return copy; +} + AllocatorPtr PrepackedWeightsContainer::GetOrCreateAllocator(const std::string& device_name) { auto iter = allocators_.find(device_name); @@ -49,4 +61,50 @@ size_t PrepackedWeightsContainer::GetNumberOfElements() const { return prepacked_weights_map_.size(); } +void PrepackedWeightsForGraph::InsertPrepackedWeights(const std::string& key, PrePackedWeights&& packed_weight) { + // We may have duplicate entries mapped from disk if the same weight is pre-packed from subgraphs and + // up the tree by the same kernel with the same result. The map prevents this from happening. 
+ key_to_blobs_.emplace(key, std::move(packed_weight)); +} + +void PrepackedWeightsForGraph::WritePackedMaybeForSave(const std::string& weight_name, const std::string& key, + PrePackedWeights&& packed_weight) { + key_to_blobs_.insert_or_assign(key, std::move(packed_weight)); + + if (save_mode_on_) { + weight_prepacks_for_saving_[weight_name].insert(key); + } +} + +const PrePackedWeights* PrepackedWeightsForGraph::GetPrepackedWeights(const std::string& key) const { + auto it = key_to_blobs_.find(key); + if (it == key_to_blobs_.end()) { + return nullptr; + } + return &it->second; +} + +std::optional PrepackedWeightsForGraph::ReplaceWithReferenceIfSaving( + const std::string& weight_name, + const std::string& key, + const PrePackedWeights& refer_to_if_absent) { + auto it = key_to_blobs_.find(key); + if (it == key_to_blobs_.end()) { + if (save_mode_on_) { + key_to_blobs_.emplace(key, refer_to_if_absent.CreateReferringCopy()); + weight_prepacks_for_saving_[weight_name].insert(key); + } + return std::nullopt; + } + + PrePackedWeights result = std::move(it->second); + if (save_mode_on_) { + it->second = result.CreateReferringCopy(); + weight_prepacks_for_saving_[weight_name].insert(key); + } else { + key_to_blobs_.erase(it); + } + return result; +} + } // namespace onnxruntime diff --git a/onnxruntime/core/framework/prepacked_weights_container.h b/onnxruntime/core/framework/prepacked_weights_container.h index 37fc01c05f2ae..f48c790eb4126 100644 --- a/onnxruntime/core/framework/prepacked_weights_container.h +++ b/onnxruntime/core/framework/prepacked_weights_container.h @@ -3,19 +3,26 @@ #pragma once -#include -#include -#include -#include - -#include "core/framework/buffer_deleter.h" - +#include "core/common/common.h" #include "core/framework/allocator.h" -#include #include "prepacked_weights.h" +#include +#include +#include +#include +#include +#include +#include + namespace onnxruntime { +#ifndef SHARED_PROVIDER +class Graph; +#else +struct Graph; +#endif + class 
PrepackedWeightsContainer final { public: PrepackedWeightsContainer() { @@ -66,4 +73,98 @@ class PrepackedWeightsContainer final { std::unordered_map prepacked_weights_map_; }; +// Maps a pre-packed weight blob key to PrepackedWeights instance +using PrepackedKeyToBlobMap = std::unordered_map; + +/// +/// This class has a dual purpose. +/// If saving is OFF (IsSaveModeOn() false), it is used to contain the weights memory mapped from disk. +/// Those weights are then moved to the shared container if weight sharing is enabled. +/// If cross-session weight sharing is not enabled, the weights are stored in this container, +/// and shared with the interested kernels. +/// +/// When saving to disk is ON (IsSaveModeOn() true) +/// It records the pre-packed weights blobs and associates them with the weight name. +/// When saving the model with external initializers, the weights are written to disk along +/// with the pre-packed blobs. +/// +/// +class PrepackedWeightsForGraph { + public: + PrepackedWeightsForGraph(PrepackedKeyToBlobMap& key_blobs, bool save_mode_on_) + : key_to_blobs_(key_blobs), save_mode_on_(save_mode_on_) { + } + + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(PrepackedWeightsForGraph); + + // WeightToPrePacksMap maps weight name to a set of pre-packed + // keys contained in the KeyToBlobMap + using KeysPerWeight = std::unordered_set; // blob keys + using WeightToPrePacksMap = std::unordered_map; + + void InsertPrepackedWeights(const std::string& key, PrePackedWeights&& packed_weight); + + // Overwrites the existing weights and associates key with weight_name + void WritePackedMaybeForSave(const std::string& weight_name, const std::string& key, + PrePackedWeights&& packed_weight); + + const PrePackedWeights* GetPrepackedWeights(const std::string& key) const; + + // The function would add or replace existing entry with references to it. + // If the entry is present, it would replace it with references to the existing entry. 
+ // If the entry is not present, it would add reference to refer_if_absent + // If the entry is present it would return the existing entry otherwise std::nullopt + // Reference in this context means a non-owning smart pointer. Essentially, this function + // replaces the existing entry with the same entry, but transfers the ownership outside + // the container. + std::optional ReplaceWithReferenceIfSaving(const std::string& weight_name, + const std::string& key, + const PrePackedWeights& refer_to_if_absent); + + bool IsSaveModeOn() const noexcept { + return save_mode_on_; + } + + void SetSaveMode(bool value) noexcept { + save_mode_on_ = value; + } + + const KeysPerWeight* GetKeysForWeightForSaving(const std::string& weight_name) const { + auto hit = weight_prepacks_for_saving_.find(weight_name); + if (hit != weight_prepacks_for_saving_.end()) { + return &hit->second; + } + return nullptr; + } + + size_t GetNumberOfWeightsForWriting() const noexcept { + return weight_prepacks_for_saving_.size(); + } + + size_t GetNumberOfKeyedBlobsForWriting() const noexcept { + size_t result = 0; + for (const auto& [_, keys] : weight_prepacks_for_saving_) { + result += keys.size(); + } + return result; + } + + const WeightToPrePacksMap& GetWeightToPrepack() const noexcept { + return weight_prepacks_for_saving_; + } + + PrepackedKeyToBlobMap& GetKeyToBlob() noexcept { + return key_to_blobs_; + } + + const PrepackedKeyToBlobMap& GetKeyToBlob() const noexcept { + return key_to_blobs_; + } + + private: + PrepackedKeyToBlobMap& key_to_blobs_; + bool save_mode_on_; + WeightToPrePacksMap weight_prepacks_for_saving_; +}; + } // namespace onnxruntime diff --git a/onnxruntime/core/framework/session_state.cc b/onnxruntime/core/framework/session_state.cc index 0ac2271ba09f1..d7059bf848e83 100644 --- a/onnxruntime/core/framework/session_state.cc +++ b/onnxruntime/core/framework/session_state.cc @@ -13,6 +13,7 @@ #include "core/framework/node_index_info.h" #include "core/framework/op_kernel.h" 
#include "core/framework/ort_value_pattern_planner.h" +#include "core/framework/prepacked_weights_container.h" #include "core/framework/session_state_utils.h" #include "core/framework/utils.h" #include "core/providers/cpu/controlflow/utils.h" @@ -122,7 +123,9 @@ void SessionState::UpdateAllocatorsWithEnvAllocators(const std::vector& SessionState::GetConstantInitializedTen return constant_initialized_tensors_; } +const PrepackedWeightsForGraph& onnxruntime::SessionState::GetPrepackedIniitializersForGraph() const { + return graph_.GetPrepacked(); +} + #if !defined(DISABLE_SPARSE_TENSORS) bool SessionState::IsSparseInitializer(int ort_value_index) const { return sparse_initialized_tensors_.count(ort_value_index) > 0; @@ -396,8 +403,9 @@ static std::string GenerateKeyForPrepackedWeightsMap(const std::string& op_type, return ss_1.str(); } -Status SessionState::PrepackConstantInitializedTensors(InlinedHashMap& constant_initializers_use_count, - const std::unordered_map& initializers_to_share_map) { +Status SessionState::PrepackConstantInitializedTensors( + InlinedHashMap& constant_initializers_use_count, + const std::unordered_map& initializers_to_share_map) { auto prepacked_constant_weights = [this, &constant_initializers_use_count, &initializers_to_share_map]( bool should_cache_prepacked_weights_for_shared_initializers) -> Status { for (auto& node : GetGraphViewer().Nodes()) { @@ -407,6 +415,7 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMapExists()) { const std::string& input_name = input_def->Name(); SessionState* st = this; + auto* prepacked_for_graph = &graph_.GetPrepacked(); // subgraph can use the value from outer scope, // so it needs to check if current node uses constant initialized tensor from current and outer graphs do { @@ -423,7 +432,8 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMapGetOrCreateAllocator(CPU); ORT_ENFORCE(allocator_for_caching.get() != nullptr); @@ -431,16 +441,19 @@ Status 
SessionState::PrepackConstantInitializedTensors(InlinedHashMapPrePack(const_initialized_tensor, input_idx, allocator_for_caching, is_packed, &weights_to_be_filled_in)); if (is_packed) { - // BUG CHECK: Ensure that the kernel has filled in the pre-packed weight to be cached if the weight was pre-packed - ORT_ENFORCE(weights_to_be_filled_in.buffers_.size() > 0, "The kernel corresponding to the node ", node.Name(), + // BUG CHECK: Ensure that the kernel has filled in the pre-packed weight + // to be cached if the weight was pre-packed + ORT_ENFORCE(weights_to_be_filled_in.buffers_.size() > 0, + "The kernel corresponding to the node ", node.Name(), " doesn't have an implementation that can cache computed pre-packed weights"); const auto& op_type = node.OpType(); @@ -452,40 +465,117 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMapHasWeight(prepacked_weights_container_key); + bool container_contains_packed_weight = prepacked_weights_container_->HasWeight( + prepacked_weights_container_key); if (container_contains_packed_weight) { - LOGS(logger_, INFO) << "Using cached version of pre-packed weight for constant initializer: " << input_name - << " used in the node: " << node.Name() << " which is of op type: " << node.OpType(); + LOGS(logger_, INFO) << "Using cached version of pre-packed weight for constant initializer: " + << input_name + << " used in the node: " << node.Name() << " which is of op type: " + << node.OpType(); + const auto& prepacked_shared = prepacked_weights_container_->GetWeight( + prepacked_weights_container_key); ORT_RETURN_IF_ERROR(KernelUseSharedPrePackedBuffers(*kernel, input_idx, - prepacked_weights_container_->GetWeight(prepacked_weights_container_key), + prepacked_shared, node.Name())); ++used_shared_pre_packed_weights_counter_; - } else { // container doesn't contain the pre-packed weight - so write into it for sharing across kernel instances - if (!prepacked_weights_container_->WriteWeight(prepacked_weights_container_key, 
std::move(weights_to_be_filled_in))) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unable to write the provided PrePackedWeights instance into the container"); + // Write references to what is stored in the shared container + // and release memory mapped entries this container may have loaded from disk + std::ignore = prepacked_for_graph->ReplaceWithReferenceIfSaving(input_name, + prepacked_weights_container_key, + prepacked_shared); + + } else { + // container doesn't contain the pre-packed weight - so write into it for sharing across + // kernel instances + + // Check if we loaded it from disk, then put it into the shared container so + // everybody can share the same memory mapped entry + // the shared container takes ownership of the memory mapped entries + + // The next line replaces the existing entry with references to it + // and returns the container that holds the memory mapped entries + // so we can transfer it to shared container. + // if there is not an entry, we replace it with references to weights_to_be_filled_in + // in saving mode and return std::nullopt + auto prepacked_from_disk = prepacked_for_graph->ReplaceWithReferenceIfSaving( + input_name, + prepacked_weights_container_key, + weights_to_be_filled_in); + + if (prepacked_from_disk.has_value()) { + weights_to_be_filled_in = std::move(*prepacked_from_disk); } + if (!prepacked_weights_container_->WriteWeight(prepacked_weights_container_key, + std::move(weights_to_be_filled_in))) { + return ORT_MAKE_STATUS( + ONNXRUNTIME, FAIL, + "Unable to write the provided PrePackedWeights instance into the container"); + } + + const auto& shared_prepacked = prepacked_weights_container_->GetWeight( + prepacked_weights_container_key); ORT_RETURN_IF_ERROR(KernelUseSharedPrePackedBuffers(*kernel, input_idx, - prepacked_weights_container_->GetWeight(prepacked_weights_container_key), + shared_prepacked, node.Name())); } } - } else { // caching of pre-packed weights' turned OFF + } else { + // cross session caching 
of pre-packed weights' turned OFF + // we use serialization container to share weights loaded from disk + // within this session. Or if the weight is not present on disk, + // we store the newly minted pre-packed data. + AllocatorPtr session_cpu_alloc = GetAllocator(kernel->Info().GetDevice(OrtMemType::OrtMemTypeDefault)); - ORT_RETURN_IF_ERROR(kernel->PrePack(const_initialized_tensor, input_idx, - session_cpu_alloc, // use allocator tied to this session + PrePackedWeights weights_to_be_filled_in; + // The reason we invoke PrePack() before looking into the container for any pre-packed weight + // cached by another instance of the same op_type (for the same constant initializer) is because + // to truly know if we can use a cached pre-packed weight, we would have to compare the cached + // pre-packed weight with the pre-packed weight generated by this instance of the same op_type because + // other static properties of the node like node attributes could play a role in the pre-packed + // weights' contents. + ORT_RETURN_IF_ERROR(kernel->PrePack(const_initialized_tensor, input_idx, session_cpu_alloc, is_packed, - nullptr // no caching required - )); + &weights_to_be_filled_in)); + + // Some kernels (matmul_nbits and non-CPU related kernels) do not share their pre-packed results + // even though they set is_packed = true so we leave it up to them. + // We can change their behavior if we wish to do so in a separate PR + // XXX: Interestingly enough, matmul_nbits does accept shared pre-packs, but does not + // produce them. 
+ if (is_packed && !weights_to_be_filled_in.buffers_.empty()) { + const auto& op_type = node.OpType(); + const std::string prepacked_weights_container_key = GenerateKeyForPrepackedWeightsMap( + op_type, + weights_to_be_filled_in); + + // See if we can use pre-packed data from disk + const auto* weights_to_use = prepacked_for_graph->GetPrepackedWeights( + prepacked_weights_container_key); + + if (weights_to_use == nullptr) { + // In this case pre-packed container owns the data + prepacked_for_graph->WritePackedMaybeForSave(input_name, prepacked_weights_container_key, + std::move(weights_to_be_filled_in)); + weights_to_use = prepacked_for_graph->GetPrepackedWeights(prepacked_weights_container_key); + assert(weights_to_use != nullptr); + } + + ORT_RETURN_IF_ERROR(KernelUseSharedPrePackedBuffers(*kernel, input_idx, + *weights_to_use, + node.Name())); + } } + if (is_packed) { ++number_of_prepacks_counter_; @@ -504,6 +594,7 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMapParent(); + prepacked_for_graph = &st->graph_.GetPrepacked(); } while (st); } input_idx++; @@ -525,7 +616,8 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMap& tensor_inputs) { +static int64_t +CalculateMemoryPatternsKey(const gsl::span& tensor_inputs) { int64_t key = 0; for (const auto& input : tensor_inputs) { for (auto dim : input.Get().Shape().GetDims()) key ^= dim; @@ -1068,9 +1160,12 @@ Status SessionState::CreateSubgraphSessionState() { // Calculate the use count of a constant initialized tensor, including the use in subgraph. // Note: This function doesn't handle the case below: -// The main graph has a constant initializer called X, and the subgraph also has a constant initializer called X, which overrides the X from main graph. -// For case like this, the current implementation will calculate the use count as 2, but they could contain completely different values so each should have a use count of 1. -// This is a very rare case. 
If it happens and X is prepacked, the consequence is that X won't be released and memory usage of X won't be saved. This will be fine. +// The main graph has a constant initializer called X, and the subgraph also has a constant initializer called X, +// which overrides the X from main graph. +// For case like this, the current implementation will calculate the use count as 2, but they could contain completely +// different values so each should have a use count of 1. +// This is a very rare case. If it happens and X is prepacked, the consequence is that X won't be released and memory +// usage of X won't be saved. This will be fine. static void ComputeConstantInitializerUseCount(const Graph& graph, InlinedHashMap& constant_initializers_use_count) { for (const auto& node : graph.Nodes()) { for (const auto* arg : node.InputDefs()) { @@ -1189,7 +1284,30 @@ Status SessionState::FinalizeSessionState(const std::basic_string constant_initializers_use_count; ComputeConstantInitializerUseCount(graph_, constant_initializers_use_count); return FinalizeSessionStateImpl(graph_location, kernel_registry_manager, nullptr, sess_options_, - remove_initializers, constant_initializers_use_count); + remove_initializers, + GetSaveModeForPrepacks(!remove_initializers, saving_ort_format), + constant_initializers_use_count); +} + +bool SessionState::GetSaveModeForPrepacks(bool saving_model, bool saving_ort_format) { + bool save_prepacked_constant_initializers = + sess_options_.config_options.GetConfigOrDefault(kOrtSessionOptionsSavePrePackedConstantInitializers, + "0") == "1"; + + if (save_prepacked_constant_initializers && !saving_model) { + save_prepacked_constant_initializers = false; + LOGS(logger_, WARNING) + << "SavePrePackedConstantInitializers is set to true but the model is not being saved. 
Ignoring the flag."; + } + + if (save_prepacked_constant_initializers && saving_ort_format) { + save_prepacked_constant_initializers = false; + LOGS(logger_, WARNING) + << "Serializing optimized model in ORT format with external pre-packed constant initializers is not supported." + << " Ignoring the flag."; + } + + return save_prepacked_constant_initializers; } static Status Index(const OrtValueNameIdxMap& ort_value_name_idx_map, @@ -1322,11 +1440,12 @@ Status SessionState::FinalizeSessionStateImpl(const std::basic_string& constant_initializers_use_count, const InlinedHashMap& outer_scope_node_arg_to_location_map, bool graph_info_already_created) { if (!graph_info_already_created) { - CreateGraphInfo(); + CreateGraphInfo(save_prepacked_initializers); } #if defined(ORT_EXTENDED_MINIMAL_BUILD) @@ -1475,21 +1594,20 @@ Status SessionState::FinalizeSessionStateImpl(const std::basic_string Status { - ORT_RETURN_IF_ERROR(AddInitializedTensor(idx, value, &d, constant, sparse)); - if (remove_initializers) { - graph_.RemoveInitializedTensor(name); - } - return Status::OK(); - }, - logger_, data_transfer_mgr_, external_data_loader_mgr_, *p_seq_exec_plan_, session_options, - memory_profile_func, name_to_buffered_tensor_)); + ORT_RETURN_IF_ERROR(session_state_utils::SaveInitializedTensors( + Env::Default(), graph_location, *graph_viewer_, + GetAllocator(OrtDevice()), + ort_value_name_idx_map_, initializer_allocation_order, *tensor_allocator, + [this, remove_initializers](const std::string& name, int idx, const OrtValue& value, const OrtCallback& d, + bool constant, bool sparse) -> Status { + ORT_RETURN_IF_ERROR(AddInitializedTensor(idx, value, &d, constant, sparse)); + if (remove_initializers) { + graph_.RemoveInitializedTensor(name); + } + return Status::OK(); + }, + logger_, data_transfer_mgr_, external_data_loader_mgr_, *p_seq_exec_plan_, session_options, + memory_profile_func, name_to_buffered_tensor_, graph_.GetPrepacked())); #if !defined(ORT_MINIMAL_BUILD) && 
defined(ORT_MEMORY_PROFILE) // Record Weight allocation info on device @@ -1537,15 +1655,17 @@ Status SessionState::FinalizeSessionStateImpl(const std::basic_string subgraph_outer_scope_node_arg_to_location_map; ORT_RETURN_IF_ERROR(OuterScopeNodeArgLocationAccumulator(*p_seq_exec_plan_, GetOrtValueNameIdxMap(), node, subgraph_session_state.GetGraphViewer(), subgraph_outer_scope_node_arg_to_location_map)); + ORT_RETURN_IF_ERROR(subgraph_session_state.FinalizeSessionStateImpl( graph_location, kernel_registry_manager, &node, subgraph_session_options, remove_initializers, + save_prepacked_initializers, constant_initializers_use_count, subgraph_outer_scope_node_arg_to_location_map, true)); // setup all the info for handling the feeds and fetches used in subgraph execution diff --git a/onnxruntime/core/framework/session_state.h b/onnxruntime/core/framework/session_state.h index e1674ba4b690b..82f520f4a4252 100644 --- a/onnxruntime/core/framework/session_state.h +++ b/onnxruntime/core/framework/session_state.h @@ -164,6 +164,8 @@ class SessionState { */ const std::unordered_map& GetConstantInitializedTensors() const; + const PrepackedWeightsForGraph& GetPrepackedIniitializersForGraph() const; + #if !defined(DISABLE_SPARSE_TENSORS) bool IsSparseInitializer(int ort_value_index) const; #endif @@ -364,11 +366,20 @@ class SessionState { const SessionOptions& GetSessionOptions() const { return sess_options_; } + /// + /// Deduce the flag whether we need to enable or disable + /// saving for pre-packed weights serialization. + /// + /// + /// + /// true or false + bool GetSaveModeForPrepacks(bool saving_model, bool saving_ort_format); + private: ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(SessionState); // Populate OrtValueNameIdxMap and create the graph viewer. 
- void CreateGraphInfo(); + void CreateGraphInfo(bool save_prepacked_on); // create kernels using info in kernel_create_info_map_ Status CreateKernels(const KernelRegistryManager& custom_registry_manager); @@ -399,6 +410,7 @@ class SessionState { _In_opt_ const Node* parent_node, const SessionOptions& session_options, bool remove_initializers, + bool save_prepacked_initializers, InlinedHashMap& constant_initializers_use_count, const InlinedHashMap& outer_scope_node_arg_to_location_map = {}, bool graph_info_already_created = false); diff --git a/onnxruntime/core/framework/session_state_utils.cc b/onnxruntime/core/framework/session_state_utils.cc index 2c74805c57dce..83a353615bc35 100644 --- a/onnxruntime/core/framework/session_state_utils.cc +++ b/onnxruntime/core/framework/session_state_utils.cc @@ -68,18 +68,19 @@ struct ExtDataValueDeleter { // buffered_tensor is not null, buffered_tensor holds the real buffer pointed // by tensor_proto. buffered_tensor must be the owner of the buffer and deleter // should release the buffer when tensor_proto is released. 
-static inline common::Status ExtDataTensorProtoToTensor(const Env& env, - const std::basic_string& proto_path, - const ONNX_NAMESPACE::TensorProto& tensor_proto, - Tensor& tensor, OrtCallback& ext_data_deleter, - Tensor* buffered_tensor = nullptr) { +static common::Status ExtDataTensorProtoToTensor(const Env& env, + const std::basic_string& proto_path, + const ONNX_NAMESPACE::TensorProto& tensor_proto, + Tensor& tensor, OrtCallback& ext_data_deleter, + PrepackedWeightsForGraph& prepacked_for_graph, + Tensor* buffered_tensor = nullptr) { ORT_ENFORCE(utils::HasExternalData(tensor_proto)); void* ext_data_buf = nullptr; SafeInt ext_data_len = 0; ORT_RETURN_IF_ERROR(utils::GetExtDataFromTensorProto(env, proto_path.c_str(), tensor_proto, ext_data_buf, ext_data_len, ext_data_deleter, - buffered_tensor)); + buffered_tensor, &prepacked_for_graph)); // NB: creating a do-nothing allocator per tensor is wasteful; can perhaps be // avoided if the Tensor class implements the do-nothing behavior when given a @@ -100,6 +101,7 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st const AllocatorPtr& alloc, const AllocatorPtr& default_cpu_alloc, OrtValue& ort_value, const DataTransferManager& data_transfer_mgr, const ExternalDataLoaderManager& external_data_loader_mgr, + PrepackedWeightsForGraph& prepacked_for_graph, bool use_device_allocator_for_initializers = false, Tensor* buffered_tensor = nullptr) { if (bool(alloc) == (m != nullptr)) { @@ -127,8 +129,7 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st ORT_RETURN_IF_ERROR(utils::LoadExtDataToTensorFromTensorProto(env, proto_path, tensor_proto, *external_data_loader, *p_tensor)); - auto ml_tensor = DataTypeImpl::GetType(); - ort_value.Init(p_tensor.release(), ml_tensor, ml_tensor->GetDeleteFunc()); + Tensor::InitOrtValue(std::move(*p_tensor), ort_value); return common::Status::OK(); } else if (device_type == OrtDevice::CPU) { // for external initializer on CPU we 
will use mmap for large initializers so don't need to allocate memory in advance @@ -139,7 +140,8 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st // TensorProtoToTensor it would copy the data, causing unnecessary overhead OrtCallback ext_data_deleter; ORT_RETURN_IF_ERROR(ExtDataTensorProtoToTensor(env, proto_path, tensor_proto, *p_tensor, - ext_data_deleter, buffered_tensor)); + ext_data_deleter, prepacked_for_graph, + buffered_tensor)); ExtDataValueDeleter deleter{ext_data_deleter, p_tensor.get()}; MLDataType ml_tensor_type = DataTypeImpl::GetType(); @@ -163,8 +165,9 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st OrtCallback ext_data_deleter; std::optional scoped_ort_callback_invoker; ORT_RETURN_IF_ERROR(ExtDataTensorProtoToTensor(env, proto_path, tensor_proto, *p_deserialize_tensor, - ext_data_deleter, buffered_tensor)); - scoped_ort_callback_invoker = ScopedOrtCallbackInvoker(ext_data_deleter); + ext_data_deleter, prepacked_for_graph, + buffered_tensor)); + scoped_ort_callback_invoker.emplace(ext_data_deleter); // TODO!! Need a temp buffer allocator for non-escape buffers that maybe too big for stack allocation. return CopyTensorFromCPUToDevice(data_transfer_mgr, p_deserialize_tensor, p_tensor, ort_value); @@ -272,13 +275,14 @@ common::Status SaveInitializedTensors( const ExecutionPlanBase& exec_plan, const SessionOptions& session_options, const MemoryProfileFunction& memory_profile_func, - std::unordered_map>& buffered_tensors) { + std::unordered_map>& buffered_tensors, + PrepackedWeightsForGraph& prepacked_for_graph) { LOGS(logger, INFO) << "Saving initialized tensors."; ORT_ENFORCE(ort_value_name_idx_map.MaxIdx() > -1, "OrtValue indexes should have been populated."); // Determine if an intializer was supplied by the user for the purpose of sharing and if it requires a cross-device // copy. 
In case a cross-device copy is required, sharing cannot be accomplished since we allocate our own buffer - // for the destn device which cannot be shared between sessions. + // for the destination device which cannot be shared between sessions. auto use_user_supplied_initializer = [&session_options, &exec_plan, &logger, &ort_value_name_idx_map](const std::string& name) -> bool { bool retval = false; @@ -401,6 +405,7 @@ common::Status SaveInitializedTensors( Status st = DeserializeTensorProto(env, graph_loc, tensor_proto, (m.has_value()) ? &*m : nullptr, alloc, default_cpu_alloc, ort_value, data_transfer_mgr, external_data_loader_mgr, + prepacked_for_graph, use_device_allocator_for_initializers, p_tensor); if (!st.IsOK()) { std::ostringstream oss; diff --git a/onnxruntime/core/framework/session_state_utils.h b/onnxruntime/core/framework/session_state_utils.h index af27f5caba0f4..17400c45e5f32 100644 --- a/onnxruntime/core/framework/session_state_utils.h +++ b/onnxruntime/core/framework/session_state_utils.h @@ -9,6 +9,7 @@ #include "core/common/const_pointer_container.h" #include "core/framework/allocator.h" +#include "core/framework/prepacked_weights_container.h" #include "core/framework/tensor.h" #include "core/framework/tensor_allocator.h" #include "core/framework/session_options.h" @@ -50,7 +51,8 @@ common::Status SaveInitializedTensors( const ExecutionPlanBase& exec_plan, const SessionOptions& session_options, const MemoryProfileFunction& memory_profile_func, - std::unordered_map>& buffered_tensors); + std::unordered_map>& buffered_tensors, + PrepackedWeightsForGraph& prepacked_for_graph); common::Status AllocateTensor( const onnxruntime::MemBuffer* m, diff --git a/onnxruntime/core/framework/tensor_external_data_info.cc b/onnxruntime/core/framework/tensor_external_data_info.cc index 93146e66d9f24..ec8b25e9f4afe 100644 --- a/onnxruntime/core/framework/tensor_external_data_info.cc +++ b/onnxruntime/core/framework/tensor_external_data_info.cc @@ -3,8 +3,13 @@ 
#include "tensor_external_data_info.h" #include "core/common/common.h" +#include "core/common/narrow.h" +#include "core/common/safeint.h" +#include "core/common/string_utils.h" #include "core/platform/path_lib.h" +#include + #ifdef _WIN32 #include #endif @@ -14,8 +19,24 @@ using ::ONNX_NAMESPACE::StringStringEntryProto; namespace onnxruntime { Status ExternalDataInfo::Create(const RepeatedPtrField& input, std::unique_ptr& out) { + auto str_to_int = [](const std::string& s, OFFSET_TYPE& result) -> Status { + char* end; +#ifdef _WIN32 + result = _strtoi64(s.c_str(), &end, 10); +#else + result = OrtStrToPtrDiff(s.c_str(), &end); +#endif + if (end != s.c_str() + s.length()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "parsing ", s, " failed"); + } + return Status::OK(); + }; + out = std::make_unique(); + PrepackedInfos prepacked_infos; + const int input_size = input.size(); + for (int i = 0; i != input_size; ++i) { StringStringEntryProto stringmap = input[i]; if (!stringmap.has_key()) @@ -25,28 +46,112 @@ Status ExternalDataInfo::Create(const RepeatedPtrField& if (stringmap.key() == "location" && !stringmap.value().empty()) { out->rel_path_ = ToWideString(stringmap.value()); } else if (stringmap.key() == "offset" && !stringmap.value().empty()) { - char* end; -#ifdef _WIN32 - out->offset_ = _strtoi64(stringmap.value().c_str(), &end, 10); -#else - out->offset_ = OrtStrToPtrDiff(stringmap.value().c_str(), &end); -#endif - if (end != stringmap.value().c_str() + stringmap.value().length()) - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "parsing ", stringmap.value(), " failed"); + ORT_RETURN_IF_ERROR(str_to_int(stringmap.value(), out->offset_)); } else if (stringmap.key() == "length" && !stringmap.value().empty()) { char* end; - out->length_ = static_cast(OrtStrToPtrDiff(stringmap.value().c_str(), &end)); + out->length_ = narrow(OrtStrToPtrDiff(stringmap.value().c_str(), &end)); if (end != stringmap.value().c_str() + stringmap.value().length()) return 
ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "parsing ", stringmap.value(), " failed"); } else if (stringmap.key() == "checksum" && !stringmap.value().empty()) { out->checksum_ = stringmap.value(); + } else if (stringmap.key().find("prepacked", 0) == 0) { + // Starts with 'prepacked', each has its own key. + // Each prepacked entry may have multiple blobs with the same key + // we output them with the same key + // format = key|offset;length;checksum[|offset;length;checksum] + // We are ignoring invalid entries (should not be any), and rely + // on in memory pre-packs regenerated in this case. + // users can over-write this file with the correct pre-packed info. + const std::string& prepacked = stringmap.value(); + if (!prepacked.empty()) { + auto split_fields = utils::SplitString(prepacked, "|", false); + if (split_fields.size() > 1) { + const std::string key{split_fields[0]}; + auto& blob_infos = prepacked_infos[key]; + for (size_t f = 1; f < split_fields.size(); ++f) { + const auto& blob = split_fields[f]; + auto blob_fields = utils::SplitString(blob, ";", false); + if (blob_fields.size() == 3) { + OFFSET_TYPE offset, len; + ORT_RETURN_IF_ERROR(str_to_int(std::string(blob_fields[0]), offset)); + ORT_RETURN_IF_ERROR(str_to_int(std::string(blob_fields[1]), len)); + blob_infos.push_back(std::make_tuple(offset, narrow(len), std::string(blob_fields[2]))); + } + } + if (blob_infos.empty()) { + prepacked_infos.erase(key); + } + } + } } else { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "model format error!"); } } + if (out->rel_path_.empty()) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "model format error! 
Missing 'location'"); } + + if (!prepacked_infos.empty()) { + out->prepacked_infos_ = std::move(prepacked_infos); + } + return Status::OK(); } +void ExternalDataInfo::SetExternalLocationToProto(const std::filesystem::path& external_file_path, + int64_t external_offset, size_t tensor_bytes_size, + ::ONNX_NAMESPACE::TensorProto& proto) { + proto.set_data_location(ONNX_NAMESPACE::TensorProto_DataLocation::TensorProto_DataLocation_EXTERNAL); + + auto* location = proto.add_external_data(); + location->set_key("location"); + location->set_value(ToUTF8String(external_file_path.native())); + + auto* offset = proto.add_external_data(); + offset->set_key("offset"); + offset->set_value(std::to_string(external_offset)); + + auto* length = proto.add_external_data(); + length->set_key("length"); + length->set_value(std::to_string(tensor_bytes_size)); +} + +std::ostream& ExternalDataInfo::WritePrepackedToFileAndAddToProto( + const PrepackedWeightsForGraph& prepacked_for_graph, + const InlinedHashSet& blob_keys, bool align, + int64_t align_threshold, int64_t allocation_granularity, + std::ostream& os, int64_t& external_offset, ::ONNX_NAMESPACE::TensorProto& proto) { + size_t key_count = 0; + for (const auto& key : blob_keys) { + size_t prepack_count = 0; + const auto* prepacked_weights = prepacked_for_graph.GetPrepackedWeights(key); + ORT_ENFORCE(prepacked_weights != nullptr, "Prepacked weights not found for key ", key); + std::stringstream prepacked_entry; + prepacked_entry << key << "|"; + for (size_t i = 0, size = prepacked_weights->buffers_.size(); i < size; ++i) { + const auto size_in_bytes = prepacked_weights->buffer_sizes_[i]; + if (align && static_cast(size_in_bytes) > align_threshold) { + // return early on error + if (!AlignAndPad(os, allocation_granularity, external_offset)) { + return os; + } + } + if (prepack_count++ > 0) { + prepacked_entry << "|"; + } + // Checksum is currently not validated + prepacked_entry << external_offset << ";" << size_in_bytes << ";0"; + if 
(!os.write(reinterpret_cast(prepacked_weights->buffers_[i].get()), size_in_bytes)) { + return os; + } + external_offset = SafeInt(external_offset) + size_in_bytes; + } + auto* prepacked = proto.add_external_data(); + std::string prepacked_key("prepacked_"); + prepacked_key.append(std::to_string(key_count++)); + prepacked->set_key(std::move(prepacked_key)); + prepacked->set_value(prepacked_entry.str()); + } + return os; +} } // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/framework/tensor_external_data_info.h b/onnxruntime/core/framework/tensor_external_data_info.h index afc8fda6c3037..1b185b8c5da7d 100644 --- a/onnxruntime/core/framework/tensor_external_data_info.h +++ b/onnxruntime/core/framework/tensor_external_data_info.h @@ -2,12 +2,21 @@ // Licensed under the MIT License. #pragma once +#include +#include +#include #include +#include + +#include +#include "core/common/path_string.h" +#include "core/common/safeint.h" #include "core/common/status.h" +#include "core/framework/prepacked_weights_container.h" #include "core/graph/onnx_protobuf.h" -#include "core/session/onnxruntime_c_api.h" namespace onnxruntime { + class ExternalDataInfo { public: #ifdef _WIN32 @@ -16,7 +25,7 @@ class ExternalDataInfo { using OFFSET_TYPE = off_t; #endif - const std::basic_string& GetRelPath() const { return rel_path_; } + const PathString& GetRelPath() const { return rel_path_; } OFFSET_TYPE GetOffset() const { return offset_; } size_t GetLength() const { return length_; } @@ -29,12 +38,58 @@ class ExternalDataInfo { const ::google::protobuf::RepeatedPtrField<::ONNX_NAMESPACE::StringStringEntryProto>& input, std::unique_ptr& out); + static void SetExternalLocationToProto(const std::filesystem::path& external_file_path, + int64_t offset, + size_t tensor_bytes_size, + ::ONNX_NAMESPACE::TensorProto& proto); + + // Pads the output with zeros according to the specified allocation_granularity + // It updates external_offset for alignment. 
+ // need to do padding before write actual tensor data as we do offset alignment at the begin of + // large tensors (offset need to be page aligned and allocation granularity aligned) like below: + // \242\2557\256\023.\031&0000000000000000\332)k+\253\246\342\246(&\006!\347\232\374\236\325\026\032+\36XXXX + // |<---smaller tensor---->|<---padding--->|<------------------large tensor----------------------------->| + static std::ostream& AlignAndPad(std::ostream& stream, int64_t allocation_granularity, int64_t& external_offset) { + // Align to the larger of the page size or the allocation granularity + int64_t alignment_factor = std::max(static_cast(4096), allocation_granularity); + // Align to the next page or alloc granularity boundary + SafeInt safe_external_offset = external_offset; + int64_t new_external_offset = ((safe_external_offset + alignment_factor - 1) / alignment_factor) * + alignment_factor; + + // padding tensor with zeros for alignment + for (int64_t index = external_offset; index != new_external_offset; ++index) { + stream << '\0'; + } + external_offset = new_external_offset; + return stream; + } + + static std::ostream& WritePrepackedToFileAndAddToProto( + const PrepackedWeightsForGraph& prepacked_for_graph, + const InlinedHashSet& blob_keys, + bool align, int64_t align_threshold, int64_t allocation_granularity, + std::ostream& os, + int64_t& external_offset, + ::ONNX_NAMESPACE::TensorProto& proto); + + using PrepackedInfo = std::tuple; + using PrepackedInfos = std::unordered_map>; + + bool HasPrepackedInfo() const noexcept { return !prepacked_infos_.empty(); } + + PrepackedInfos&& TakePrepackedInfos() { return std::move(prepacked_infos_); } + private: - std::basic_string rel_path_; + PathString rel_path_; OFFSET_TYPE offset_ = 0; // 0 means the whole file size_t length_ = 0; std::string checksum_; + + // Pre-packed blobs found associated with this TensorProto if present + // format key, offset, length, checksum + PrepackedInfos prepacked_infos_; }; 
} // namespace onnxruntime diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc index 2af9f95ad059e..097ce436f4419 100644 --- a/onnxruntime/core/framework/tensorprotoutils.cc +++ b/onnxruntime/core/framework/tensorprotoutils.cc @@ -234,7 +234,8 @@ Status GetExternalDataInfo(const ONNX_NAMESPACE::TensorProto& tensor_proto, const std::filesystem::path& tensor_proto_dir, std::basic_string& external_file_path, onnxruntime::FileOffsetType& file_offset, - SafeInt& tensor_byte_size) { + SafeInt& tensor_byte_size, + ExternalDataInfo::PrepackedInfos* prepacked_infos) { ORT_RETURN_IF_NOT(onnxruntime::utils::HasExternalData(tensor_proto), "Tensor does not have external data to read from."); @@ -258,6 +259,10 @@ Status GetExternalDataInfo(const ONNX_NAMESPACE::TensorProto& tensor_proto, file_offset = external_data_info->GetOffset(); + if (prepacked_infos != nullptr && external_data_info->HasPrepackedInfo()) { + *prepacked_infos = external_data_info->TakePrepackedInfos(); + } + return Status::OK(); } @@ -988,7 +993,8 @@ static Status GetFileContent(const Env& env, const std::filesystem::path& file_p Status GetExtDataFromTensorProto(const Env& env, const std::filesystem::path& model_path, const ONNX_NAMESPACE::TensorProto& tensor_proto, void*& ext_data_buf, SafeInt& ext_data_len, OrtCallback& ext_data_deleter, - Tensor* buffered_tensor) { + Tensor* buffered_tensor, + PrepackedWeightsForGraph* prepacked_info) { ORT_ENFORCE(utils::HasExternalData(tensor_proto)); std::basic_string tensor_proto_dir; if (!model_path.empty()) { @@ -997,8 +1003,13 @@ Status GetExtDataFromTensorProto(const Env& env, const std::filesystem::path& mo std::basic_string external_data_file_path; FileOffsetType file_offset; SafeInt raw_data_safe_len = 0; + std::optional prepacked_infos; + if (prepacked_info != nullptr) { + prepacked_infos.emplace(); + } ORT_RETURN_IF_ERROR( - GetExternalDataInfo(tensor_proto, tensor_proto_dir, external_data_file_path, 
file_offset, raw_data_safe_len)); + GetExternalDataInfo(tensor_proto, tensor_proto_dir, external_data_file_path, file_offset, + raw_data_safe_len, (prepacked_info != nullptr) ? &*prepacked_infos : nullptr)); if (external_data_file_path == onnxruntime::utils::kTensorProtoMemoryAddressTag) { // the value in location is the memory address of the data @@ -1042,6 +1053,33 @@ Status GetExtDataFromTensorProto(const Env& env, const std::filesystem::path& mo ORT_RETURN_IF_ERROR(GetFileContent(env, external_data_file_path.c_str(), file_offset, raw_data_safe_len, ext_data_buf, ext_data_deleter)); ext_data_len = raw_data_safe_len; + + if (prepacked_info != nullptr && !prepacked_infos->empty()) { + for (const auto& [key, blobs] : *prepacked_infos) { + PrePackedWeights prepacked_weights; + prepacked_weights.buffers_.reserve(blobs.size()); + prepacked_weights.buffer_sizes_.reserve(blobs.size()); + for (const auto& blob : blobs) { + const auto blob_offset = std::get<0>(blob); + const auto blob_length = std::get<1>(blob); + SafeInt end_of_blob{blob_offset}; + end_of_blob += blob_length; + ORT_RETURN_IF(blob_offset < 0 || static_cast(end_of_blob) > file_length, + "Pre-packed blob: ", key, " offset: ", blob_offset, " file_length: ", file_length, + " is out of bounds and can not read in full"); + void* data_ptr; + OrtCallback data_deleter; + ORT_RETURN_IF_ERROR(GetFileContent(env, external_data_file_path.c_str(), blob_offset, blob_length, + data_ptr, data_deleter)); + IAllocatorUniquePtr data_ptr_unique{data_ptr, OrtCallbackInvoker(data_deleter)}; + prepacked_weights.buffers_.push_back(std::move(data_ptr_unique)); + prepacked_weights.buffer_sizes_.push_back(blob_length); + } + if (!blobs.empty()) { + prepacked_info->InsertPrepackedWeights(key, std::move(prepacked_weights)); + } + } + } #endif } diff --git a/onnxruntime/core/framework/tensorprotoutils.h b/onnxruntime/core/framework/tensorprotoutils.h index 262f7adaca1cb..7b9a47842388c 100644 --- 
a/onnxruntime/core/framework/tensorprotoutils.h +++ b/onnxruntime/core/framework/tensorprotoutils.h @@ -3,20 +3,21 @@ #pragma once -#include -#include -#include #include +#include +#include +#include #ifndef SHARED_PROVIDER #include "core/common/common.h" #include "core/common/status.h" #include "core/common/safeint.h" -#include "core/framework/endian_utils.h" #include "core/framework/allocator.h" +#include "core/framework/endian_utils.h" #include "core/framework/external_data_loader.h" -#include "core/framework/ort_value.h" #include "core/framework/mem_buffer.h" +#include "core/framework/ort_value.h" +#include "core/framework/prepacked_weights_container.h" #include "core/framework/tensor_external_data_info.h" #include "core/graph/onnx_protobuf.h" #include "core/platform/env.h" @@ -36,7 +37,8 @@ Status GetExternalDataInfo(const ONNX_NAMESPACE::TensorProto& tensor_proto, const std::filesystem::path& tensor_proto_dir, std::basic_string& external_file_path, onnxruntime::FileOffsetType& file_offset, - SafeInt& tensor_byte_size); + SafeInt& tensor_byte_size, + ExternalDataInfo::PrepackedInfos* prepacked_infos = nullptr); /** * This function is used to convert the endianess of Tensor data. * Mostly, will be used in big endian system to support the model file @@ -172,7 +174,8 @@ common::Status GetExtDataFromTensorProto(const Env& env, const std::filesystem:: const ONNX_NAMESPACE::TensorProto& tensor_proto, void*& ext_data_buf, SafeInt& ext_data_len, OrtCallback& ext_data_deleter, - Tensor* buffered_tensor = nullptr); + Tensor* buffered_tensor = nullptr, + PrepackedWeightsForGraph* prepacked_for_graph = nullptr); // Given a tensor proto with external data obtain a tensor using the specified custom external data loader. 
common::Status LoadExtDataToTensorFromTensorProto(const Env& env, const std::filesystem::path& model_path, diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index e8a5855b36496..0b6610db5e007 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -18,6 +18,7 @@ #include "core/flatbuffers/flatbuffers_utils.h" #include "core/flatbuffers/schema/ort.fbs.h" #include "core/framework/tensor_shape.h" +#include "core/framework/tensor_external_data_info.h" #include "core/framework/tensorprotoutils.h" #include "core/framework/utils.h" #include "core/graph/graph_flatbuffers_utils.h" @@ -25,6 +26,7 @@ #include "core/graph/indexed_sub_graph.h" #include "core/graph/model.h" #include "core/graph/model_load_utils.h" +#include "core/graph/model_saving_options.h" #include "core/graph/node_attr_utils.h" #include "core/graph/op.h" #include "core/graph/runtime_optimization_record_container.h" @@ -1543,6 +1545,17 @@ Status Graph::VerifyNoDuplicateName() { #endif // !defined(ORT_MINIMAL_BUILD) +void Graph::ConstructPrepackedSharedContainerAndSetMode(bool saving_mode_on) { + if (parent_graph_ == nullptr) { + prepacked_key_to_blobs_.emplace(); + prepacked_weights_for_graph_.emplace(*prepacked_key_to_blobs_, saving_mode_on); + } else { + // Subgraph + prepacked_weights_for_graph_.emplace(parent_graph_->prepacked_weights_for_graph_->GetKeyToBlob(), + saving_mode_on); + } +} + #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) void Graph::AddEdge(NodeIndex src_node_index, NodeIndex dst_node_index, int src_arg_slot, int dst_arg_slot) { if (nodes_.size() <= src_node_index || src_arg_slot < 0 || nodes_.size() <= dst_node_index || dst_arg_slot < 0 || @@ -4084,82 +4097,103 @@ ONNX_NAMESPACE::GraphProto Graph::ToGraphProto() const { return result; } -ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_path, - const std::filesystem::path& model_file_path, - 
size_t initializer_size_threshold, - const OffsetAlignmentInfo& align_info) const { - GraphProto result; - ToGraphProtoInternal(result); - ORT_ENFORCE(external_file_path.is_relative()); - // If model_file_path is just a file name without a path separator, for example: "model.onnx". Its parent path could - be empty. Else, save external data file in same directory as the model. - const std::filesystem::path modified_external_file_path = model_file_path.parent_path() / external_file_path; +Status Graph::AddExternalInitializersToGraphProtoImpl( + const std::filesystem::path& model_path, + const std::filesystem::path& external_file_path, + const std::filesystem::path& model_external_file_path, + const ModelSavingOptions& model_saving_options, + ONNX_NAMESPACE::GraphProto& output_graph_proto, + std::ostream& external_stream, + int64_t& external_offset) const { + // Process initializers in a subgraph, check their size and + // write to an external file. This function also saves pre-packed + // blobs for the initializer being saved to disk, if the initializer has any pre-packs. + // This function is invoked by ToGraphProtoWithExternalInitializers() and processes subgraphs + // bottom up. 
+ for (const auto& node : Nodes()) { + if (node.ContainsSubgraph()) { + // Let's find this node in the output_graph_proto + auto hit = std::find_if(output_graph_proto.mutable_node()->begin(), + output_graph_proto.mutable_node()->end(), + [&node](const ONNX_NAMESPACE::NodeProto& proto) { + return proto.name() == node.Name(); + }); + ORT_RETURN_IF_NOT(hit != output_graph_proto.mutable_node()->end(), "Node ", node.Name(), + " not found in output_graph_proto"); + auto& result_node = *hit; + for (const auto& e : node.GetAttributeNameToSubgraphMap()) { + const auto& name = e.first; + const auto& subgraph = e.second; + // Let's find this subgraph in the result_node + auto sub_hit = std::find_if(result_node.mutable_attribute()->begin(), + result_node.mutable_attribute()->end(), + [&name](const ONNX_NAMESPACE::AttributeProto& proto) { + return proto.name() == name; + }); + ORT_RETURN_IF_NOT(sub_hit != result_node.mutable_attribute()->end() && utils::HasGraph(*sub_hit), + "Subgraph ", name, " is referred to in GetAttributeNameToSubgraphMap, but not found in node ", + node.Name(), " while attempting to recurse into it."); + auto& result_subgraph = *sub_hit->mutable_g(); + ORT_RETURN_IF_ERROR(subgraph->AddExternalInitializersToGraphProtoImpl( + model_path, external_file_path, + model_external_file_path, model_saving_options, + result_subgraph, + external_stream, external_offset)); + } + } + } - std::ofstream external_stream(modified_external_file_path, std::ofstream::out | std::ofstream::binary); - ORT_ENFORCE(external_stream.is_open()); - int64_t external_offset = 0; + // Used only when pre-packed weights are serialized + InlinedHashSet processed_weights; + // prepacked_weights_for_graph_ is present only when SessionState is finalized. 
+ const bool process_prepacks = prepacked_weights_for_graph_.has_value() && + prepacked_weights_for_graph_->GetNumberOfWeightsForWriting() > 0; + if (process_prepacks) { + processed_weights.reserve(graph_proto_->initializer_size()); + } // Add the initializers to the result graph. - const auto& model_path = ModelPath(); -#if !defined(DISABLE_SPARSE_TENSORS) - const auto sparse_end = sparse_tensor_names_.end(); -#endif - for (const auto& initializer : graph_proto_->initializer()) { #if !defined(DISABLE_SPARSE_TENSORS) - if (sparse_end != sparse_tensor_names_.find(initializer.name())) { + if (IsSparseInitializer(initializer.name())) { // Sparse tensors are added to the ONNX file. - auto& sparse_initializer = *result.add_sparse_initializer(); + auto& sparse_initializer = *output_graph_proto.add_sparse_initializer(); auto status = utils::DenseTensorToSparseTensorProto(initializer, model_path, sparse_initializer); - ORT_ENFORCE(status.IsOK(), "Failed to convert dense initializer to sparse"); + ORT_RETURN_IF_NOT(status.IsOK(), "Failed to convert dense initializer to sparse"); } else { #endif // Dense tensors larger than the threshold are added to the external file. 
- TensorProto* output_proto = result.add_initializer(); + TensorProto* output_proto = output_graph_proto.add_initializer(); std::vector raw_data; - ORT_THROW_IF_ERROR(utils::UnpackInitializerData(initializer, model_path, raw_data)); + ORT_RETURN_IF_ERROR(utils::UnpackInitializerData(initializer, model_path, raw_data)); size_t tensor_bytes_size = raw_data.size(); - if (tensor_bytes_size < initializer_size_threshold) { + if (tensor_bytes_size < model_saving_options.initializer_size_threshold) { *output_proto = initializer; + if (process_prepacks) { + // These pre-packs will reside in memory + processed_weights.insert(initializer.name()); + } continue; } // update external_offset for alignment // need to do padding before write actual tensor data as we do offset alignment at the begin of - // large tensors (offset need to be page aligned and alloction granularity aligned) like below: + // large tensors (offset need to be page aligned and allocation granularity aligned) like below: // \242\2557\256\023.\031&0000000000000000\332)k+\253\246\342\246(&\006!\347\232\374\236\325\026\032+\36XXXX - // |<---small tensor---->|<---padding--->|<------------------large tensor----------------------------->| - if (align_info.align_offset && static_cast(tensor_bytes_size) > align_info.align_threshold) { - // Align to the larger of the page size or the allocation granularity - int64_t alignment_factor = std::max(static_cast(4096), align_info.allocation_granularity); - // Align to the next page or alloc granularity boundary - int64_t new_external_offset = static_cast( - std::floor((external_offset + alignment_factor - 1) / alignment_factor)) * - alignment_factor; - - // padding tensor with zeros for alignment - for (int64_t index = external_offset; index != new_external_offset; ++index) { - external_stream << '0'; - } - - external_offset = new_external_offset; + // |<---smaller tensor---->|<---padding--->|<------------------large tensor----------------------------->| + if 
(model_saving_options.align_offset && static_cast(tensor_bytes_size) > + model_saving_options.align_threshold) { + ORT_RETURN_IF_NOT(ExternalDataInfo::AlignAndPad(external_stream, model_saving_options.allocation_granularity, + external_offset), + "Failed writing external data to: ", model_external_file_path); } - for (size_t index = 0; index != tensor_bytes_size; ++index) { - external_stream << raw_data[index]; - } + ORT_RETURN_IF_NOT(external_stream.write(reinterpret_cast(raw_data.data()), tensor_bytes_size), + "Failed to write external initializers to file: ", model_external_file_path); - output_proto->set_data_location(ONNX_NAMESPACE::TensorProto_DataLocation::TensorProto_DataLocation_EXTERNAL); - ONNX_NAMESPACE::StringStringEntryProto* location = output_proto->add_external_data(); - location->set_key("location"); - location->set_value(ToUTF8String(external_file_path.native())); - ONNX_NAMESPACE::StringStringEntryProto* offset = output_proto->add_external_data(); - offset->set_key("offset"); - offset->set_value(std::to_string(external_offset)); - ONNX_NAMESPACE::StringStringEntryProto* length = output_proto->add_external_data(); - length->set_key("length"); - length->set_value(std::to_string(tensor_bytes_size)); + ExternalDataInfo::SetExternalLocationToProto(external_file_path, external_offset, + tensor_bytes_size, *output_proto); output_proto->set_name(initializer.name()); output_proto->set_data_type(initializer.data_type()); @@ -4168,12 +4202,74 @@ ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitializers(const std } output_proto->set_doc_string(initializer.doc_string()); - external_offset += tensor_bytes_size; + external_offset = SafeInt(external_offset) + tensor_bytes_size; + + if (process_prepacks) { + // check if this weight was referred to in subgraphs + InlinedHashSet blob_keys_to_external_data; + + // See if this weight has any pre-packs referred to in this graph.
+ const auto* blobs_keys_for_weight = prepacked_weights_for_graph_->GetKeysForWeightForSaving(initializer.name()); + if (blobs_keys_for_weight != nullptr && !blobs_keys_for_weight->empty()) { + // Add all the blob_keys to the set of keys to process + blob_keys_to_external_data.insert(blobs_keys_for_weight->begin(), blobs_keys_for_weight->end()); + } + + if (!blob_keys_to_external_data.empty()) { + auto& os = ExternalDataInfo::WritePrepackedToFileAndAddToProto( + *prepacked_weights_for_graph_, blob_keys_to_external_data, + model_saving_options.align_offset, model_saving_options.align_threshold, + model_saving_options.allocation_granularity, + external_stream, external_offset, *output_proto); + ORT_RETURN_IF_NOT(os.good(), "Failed to write pre-packed blobs to external file"); + } + + processed_weights.insert(initializer.name()); + } + #if !defined(DISABLE_SPARSE_TENSORS) } #endif } + // Check if there are any pre-packed weights this graph refers to, but they have + // not been processed. + if (process_prepacks) { + const auto& sorted_by_weights = prepacked_weights_for_graph_->GetWeightToPrepack(); + for (const auto& [weight_name, blob_keys] : sorted_by_weights) { + ORT_ENFORCE(processed_weights.find(weight_name) != processed_weights.end()); + } + } + + return Status::OK(); +} + +ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitializers( + const std::filesystem::path& external_file_path, + const std::filesystem::path& model_file_path, + const ModelSavingOptions& model_saving_options) const { + GraphProto result; + ToGraphProtoInternal(result); + ORT_ENFORCE(external_file_path.is_relative()); + // If model_file_path is just a file name without a path separator, for example: "model.onnx". Its parent path could + // be empty. Else, save external data file in same directory as the model. 
+ const std::filesystem::path modified_external_file_path = model_file_path.parent_path() / external_file_path; + const auto& model_path = ModelPath(); + + // Create the external file. + std::ofstream external_stream(modified_external_file_path, std::ofstream::out | std::ofstream::binary); + ORT_ENFORCE(external_stream.is_open(), "Failed to open for writing:", modified_external_file_path); + int64_t external_offset = 0; + + ORT_THROW_IF_ERROR(AddExternalInitializersToGraphProtoImpl(model_path, external_file_path, + modified_external_file_path, model_saving_options, + result, + external_stream, external_offset)); + + if (!external_stream.flush()) { + ORT_THROW("Failed to flush file with external initializers: ", modified_external_file_path); + } + return result; } diff --git a/onnxruntime/core/graph/model.cc b/onnxruntime/core/graph/model.cc index 1bae63b510563..be0531e6473fb 100644 --- a/onnxruntime/core/graph/model.cc +++ b/onnxruntime/core/graph/model.cc @@ -383,14 +383,12 @@ ModelProto Model::ToProto() const { ModelProto Model::ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_name, const std::filesystem::path& file_path, - size_t initializer_size_threshold, - const Graph::OffsetAlignmentInfo& align_info) const { + const ModelSavingOptions& model_saving_options) const { ModelProto result(model_proto_); const auto& graph = *graph_; *(result.mutable_graph()) = graph.ToGraphProtoWithExternalInitializers(external_file_name, file_path, - initializer_size_threshold, - align_info); + model_saving_options); return result; } @@ -607,16 +605,13 @@ template static Status SaveModelWithExternalInitializers(Model& model, const T& file_path, const std::filesystem::path& external_file_name, - size_t initializer_size_threshold, - const Graph::OffsetAlignmentInfo& align_info) { + const ModelSavingOptions& save_options) { int fd = 0; Status status = Env::Default().FileOpenWr(file_path, fd); ORT_RETURN_IF_ERROR(status); ORT_TRY { - status = 
Model::SaveWithExternalInitializers(model, fd, file_path, external_file_name, - initializer_size_threshold, - align_info); + status = Model::SaveWithExternalInitializers(model, fd, file_path, external_file_name, save_options); } ORT_CATCH(const std::exception& ex) { ORT_HANDLE_EXCEPTION([&]() { @@ -646,10 +641,8 @@ Status Model::Load(const PathString& file_path, std::shared_ptr& p_model, Status Model::SaveWithExternalInitializers(Model& model, const std::filesystem::path& file_path, const std::filesystem::path& external_file_name, - size_t initializer_size_threshold, - const Graph::OffsetAlignmentInfo& align_info) { - return SaveModelWithExternalInitializers(model, file_path, external_file_name, initializer_size_threshold, - align_info); + const ModelSavingOptions& save_options) { + return SaveModelWithExternalInitializers(model, file_path, external_file_name, save_options); } Status Model::LoadFromBytes(int count, const void* p_bytes, /*out*/ ONNX_NAMESPACE::ModelProto& model_proto) { @@ -765,8 +758,7 @@ Status Model::SaveWithExternalInitializers(Model& model, int fd, const std::filesystem::path& file_path, const std::filesystem::path& external_file_name, - size_t initializer_size_threshold, - const Graph::OffsetAlignmentInfo& align_info) { + const ModelSavingOptions& model_saving_options) { if (fd < 0) { return Status(ONNXRUNTIME, INVALID_ARGUMENT, " is less than 0."); } @@ -774,8 +766,7 @@ Status Model::SaveWithExternalInitializers(Model& model, ORT_RETURN_IF_ERROR(model.MainGraph().Resolve()); auto model_proto = model.ToGraphProtoWithExternalInitializers(external_file_name, file_path, - initializer_size_threshold, - align_info); + model_saving_options); google::protobuf::io::FileOutputStream output(fd); const bool result = model_proto.SerializeToZeroCopyStream(&output) && output.Flush(); if (result) { diff --git a/onnxruntime/core/graph/model.h b/onnxruntime/core/graph/model.h index 9bcec6f78ca08..2d2086aef41fd 100644 --- a/onnxruntime/core/graph/model.h +++ 
b/onnxruntime/core/graph/model.h @@ -20,6 +20,8 @@ namespace onnxruntime { +class PrepackedShareableWeightsContainer; + namespace fbs { struct Model; } // namespace fbs @@ -190,15 +192,7 @@ class Model { // initializer offset could be page aligned and allocation granularity aligned for mmap support. ONNX_NAMESPACE::ModelProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_name, const std::filesystem::path& file_path, - size_t initializer_size_threshold, - const Graph::OffsetAlignmentInfo& align_info) const; - - ONNX_NAMESPACE::ModelProto ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_name, - const std::filesystem::path& file_path, - size_t initializer_size_threshold) const { - Graph::OffsetAlignmentInfo default_align_info; - return ToGraphProtoWithExternalInitializers(external_file_name, file_path, initializer_size_threshold, default_align_info); - } + const ModelSavingOptions& model_saving_options) const; static common::Status Save(Model& model, const PathString& file_path); @@ -209,32 +203,13 @@ class Model { static common::Status SaveWithExternalInitializers(Model& model, const std::filesystem::path& file_path, const std::filesystem::path& external_file_path, - size_t initializer_size_threshold, - const Graph::OffsetAlignmentInfo& align_info); - - static common::Status SaveWithExternalInitializers(Model& model, - const std::filesystem::path& file_path, - const std::filesystem::path& external_file_path, - size_t initializer_size_threshold) { - Graph::OffsetAlignmentInfo default_align_info; - return SaveWithExternalInitializers(model, file_path, external_file_path, initializer_size_threshold, default_align_info); - } - - static common::Status SaveWithExternalInitializers(Model& model, - int fd, - const std::filesystem::path& file_path, - const std::filesystem::path& external_file_path, - size_t initializer_size_threshold, - const Graph::OffsetAlignmentInfo& align_info); + const ModelSavingOptions& 
save_options); static common::Status SaveWithExternalInitializers(Model& model, int fd, const std::filesystem::path& file_path, const std::filesystem::path& external_file_path, - size_t initializer_size_threshold) { - Graph::OffsetAlignmentInfo default_align_info; - return SaveWithExternalInitializers(model, fd, file_path, external_file_path, initializer_size_threshold, default_align_info); - } + const ModelSavingOptions& save_options); static common::Status Load(std::istream& model_istream, ONNX_NAMESPACE::ModelProto* p_model_proto); diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index d182d0b9173bd..8bd4067e59492 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -42,6 +42,8 @@ using ProviderType = const std::string&; class RandomGenerator; class IOnnxRuntimeOpSchemaCollection; +struct ModelSavingOptions; + #ifdef ENABLE_TRAINING_TORCH_INTEROP namespace contrib { class PythonOpBase; @@ -901,7 +903,11 @@ struct ProviderHost { virtual void Model__operator_delete(Model* p) = 0; virtual Graph& Model__MainGraph(Model* p) = 0; virtual std::unique_ptr Model__ToProto(Model* p) = 0; - virtual std::unique_ptr Model__ToGraphProtoWithExternalInitializers(Model* p, const std::filesystem::path& external_file_name, const std::filesystem::path& file_path, size_t initializer_size_threshold) = 0; + virtual std::unique_ptr Model__ToGraphProtoWithExternalInitializers( + Model* p, + const std::filesystem::path& external_file_name, + const std::filesystem::path& file_path, + const ModelSavingOptions&) = 0; virtual const ModelMetaData& Model__MetaData(const Model* p) const noexcept = 0; virtual Status Model__Load(const PathString& file_path, /*out*/ ONNX_NAMESPACE::ModelProto& model_proto) = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h 
b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index 54249f0864cd7..d8516d5858a2f 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -934,6 +934,8 @@ struct NodeUnit final { Node::EdgeConstIterator OutputEdgesEnd() const { return g_host->NodeUnit__OutputEdgesEnd(this); } }; +struct ModelSavingOptions; + struct Model final { static std::unique_ptr Create(ONNX_NAMESPACE::ModelProto&& model_proto, const PathString& model_path, const IOnnxRuntimeOpSchemaRegistryList* local_registries, const logging::Logger& logger) { @@ -945,7 +947,12 @@ struct Model final { Graph& MainGraph() { return g_host->Model__MainGraph(this); } std::unique_ptr ToProto() { return g_host->Model__ToProto(this); } - std::unique_ptr ToGraphProtoWithExternalInitializers(const std::filesystem::path& external_file_name, const std::filesystem::path& file_path, size_t initializer_size_threshold) { return g_host->Model__ToGraphProtoWithExternalInitializers(this, external_file_name, file_path, initializer_size_threshold); } + std::unique_ptr ToGraphProtoWithExternalInitializers( + const std::filesystem::path& external_file_name, + const std::filesystem::path& file_path, const ModelSavingOptions& model_saving_options) { + return g_host->Model__ToGraphProtoWithExternalInitializers(this, external_file_name, file_path, + model_saving_options); + } const ModelMetaData& MetaData() const noexcept { return g_host->Model__MetaData(this); } Model() = delete; diff --git a/onnxruntime/core/providers/vitisai/imp/graph.cc b/onnxruntime/core/providers/vitisai/imp/graph.cc index 191d26f3ab269..e7b39546fda6a 100644 --- a/onnxruntime/core/providers/vitisai/imp/graph.cc +++ b/onnxruntime/core/providers/vitisai/imp/graph.cc @@ -9,6 +9,7 @@ #include #include +#include "core/graph/model_saving_options.h" #include "core/providers/shared_library/provider_api.h" #include "./vai_assert.h" @@ -111,7 +112,9 @@ 
void graph_save(const Graph& graph, const std::string& filename, const std::stri if (initializer_size_threshold == std::numeric_limits::max()) { model_proto = model->ToProto(); } else { - model_proto = model->ToGraphProtoWithExternalInitializers(ToPathString(filename_dat), ToPathString(filename), initializer_size_threshold); + ModelSavingOptions model_saving_options{initializer_size_threshold}; + model_proto = model->ToGraphProtoWithExternalInitializers(ToPathString(filename_dat), ToPathString(filename), + model_saving_options); } auto& metadata = model->MetaData(); if (!metadata.empty()) { diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index a60ee500a9898..223eed248800e 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -38,6 +38,7 @@ #include "core/framework/utils.h" #include "core/graph/graph_viewer.h" #include "core/graph/model.h" +#include "core/graph/model_saving_options.h" #include "core/optimizer/graph_transformer_utils.h" #include "core/optimizer/graph_transformer.h" #include "core/optimizer/layout_transformation/layout_transformation.h" @@ -2099,13 +2100,12 @@ common::Status InferenceSession::Initialize() { const size_t optimized_model_external_initializers_min_size_in_bytes = ParseStringWithClassicLocale(session_options_.config_options.GetConfigOrDefault( kOrtSessionOptionsOptimizedModelExternalInitializersMinSizeInBytes, "1024")); - Graph::OffsetAlignmentInfo align_info; - align_info.align_offset = true; + ModelSavingOptions model_saving_options{optimized_model_external_initializers_min_size_in_bytes}; + model_saving_options.align_offset = true; ORT_RETURN_IF_ERROR_SESSIONID_(Model::SaveWithExternalInitializers(*model_, session_options_.optimized_model_filepath, optimized_model_external_initializers_file_name, - optimized_model_external_initializers_min_size_in_bytes, - align_info)); + model_saving_options)); } } } diff --git 
a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 1444c1976d447..a40fabd6a607c 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -1072,7 +1072,14 @@ struct ProviderHostImpl : ProviderHost { void Model__operator_delete(Model* p) override { delete p; } Graph& Model__MainGraph(Model* p) override { return p->MainGraph(); } std::unique_ptr Model__ToProto(Model* p) override { return std::make_unique(p->ToProto()); } - std::unique_ptr Model__ToGraphProtoWithExternalInitializers(Model* p, const std::filesystem::path& external_file_name, const std::filesystem::path& file_path, size_t initializer_size_threshold) override { return std::make_unique(p->ToGraphProtoWithExternalInitializers(external_file_name, file_path, initializer_size_threshold)); }; + std::unique_ptr Model__ToGraphProtoWithExternalInitializers(Model* p, + const std::filesystem::path& external_file_name, + const std::filesystem::path& file_path, + const ModelSavingOptions& model_saving_options) override { + return std::make_unique(p->ToGraphProtoWithExternalInitializers(external_file_name, + file_path, + model_saving_options)); + }; const ModelMetaData& Model__MetaData(const Model* p) const noexcept override { return p->MetaData(); }; Status Model__Load(const PathString& file_path, /*out*/ ONNX_NAMESPACE::ModelProto& model_proto) override { return Model::Load(file_path, model_proto); } diff --git a/onnxruntime/test/framework/save_model_with_external_initializers.cc b/onnxruntime/test/framework/save_model_with_external_initializers.cc index d0bc088175755..98874874d50e9 100644 --- a/onnxruntime/test/framework/save_model_with_external_initializers.cc +++ b/onnxruntime/test/framework/save_model_with_external_initializers.cc @@ -6,6 +6,7 @@ #include "core/common/path_string.h" #include "core/framework/data_types.h" #include "core/graph/model.h" +#include "core/graph/model_saving_options.h" 
#include "core/framework/tensorprotoutils.h" #include "test/test_environment.h" #include "test_utils.h" @@ -23,15 +24,14 @@ Status LoadSaveAndCompareModel(const std::filesystem::path& input_onnx, const std::filesystem::path& input_external_init_file, const std::filesystem::path& output_onnx, const std::filesystem::path& output_external_init_file, - size_t initializer_size_threshold, - const Graph::OffsetAlignmentInfo& align_info) { + const ModelSavingOptions& model_saving_options) { auto logger = DefaultLoggingManager().CreateLogger("LoadSaveAndCompareModel"); std::shared_ptr model; ORT_RETURN_IF_ERROR(Model::Load(input_onnx, model, nullptr, *logger)); std::filesystem::remove(output_onnx); std::filesystem::remove(output_external_init_file); - ORT_RETURN_IF_ERROR(Model::SaveWithExternalInitializers(*model, output_onnx, output_external_init_file, initializer_size_threshold, - align_info)); + ORT_RETURN_IF_ERROR(Model::SaveWithExternalInitializers(*model, output_onnx, output_external_init_file, + model_saving_options)); std::shared_ptr model_from_external; ORT_RETURN_IF_ERROR(Model::Load(output_onnx.native(), model_from_external, nullptr, *logger)); @@ -67,7 +67,7 @@ Status LoadSaveAndCompareModel(const std::filesystem::path& input_onnx, ORT_RETURN_IF_ERROR(utils::UnpackInitializerData(*from_external_tensor_proto, model_path, from_external_tensor_proto_data)); size_t from_external_tensor_proto_size = from_external_tensor_proto_data.size(); - if (from_external_tensor_proto_size < initializer_size_threshold) { + if (from_external_tensor_proto_size < model_saving_options.initializer_size_threshold) { // 'Small' tensors should be embedded in the onnx file. 
ORT_RETURN_IF_NOT(from_external_tensor_proto->data_location() == ONNX_NAMESPACE::TensorProto_DataLocation::TensorProto_DataLocation_DEFAULT, "location mismatch"); } else { @@ -78,13 +78,14 @@ Status LoadSaveAndCompareModel(const std::filesystem::path& input_onnx, ORT_RETURN_IF_NOT(tensor_proto_size == from_external_tensor_proto_size, "size mismatch"); ORT_RETURN_IF_NOT(memcmp(tensor_proto_data.data(), from_external_tensor_proto_data.data(), tensor_proto_size) == 0, "data mismatch"); - if (align_info.align_offset) { + if (model_saving_options.align_offset) { for (const StringStringEntryProto& entry : from_external_tensor_proto->external_data()) { if (entry.has_key() && entry.has_value() && entry.key() == "offset") { size_t tensor_offset; std::stringstream stream(entry.value()); stream >> tensor_offset; - ORT_RETURN_IF_NOT(tensor_offset % align_info.allocation_granularity == 0, "tensor offset not align"); + ORT_RETURN_IF_NOT(tensor_offset % model_saving_options.allocation_granularity == 0, + "tensor offset not align"); } } } @@ -97,22 +98,35 @@ Status LoadSaveAndCompareModel(const std::filesystem::path& input_onnx, // Original model does not have external initializers TEST(SaveWithExternalInitializers, Mnist) { - Graph::OffsetAlignmentInfo align_info; - ASSERT_STATUS_OK(LoadSaveAndCompareModel(ORT_TSTR("testdata/mnist.onnx"), ORT_TSTR(""), ORT_TSTR("testdata/mnist_with_external_initializers.onnx"), ORT_TSTR("mnist_external_initializers.bin"), 100, align_info)); + ModelSavingOptions model_saving_options{100}; + ASSERT_STATUS_OK(LoadSaveAndCompareModel( + ORT_TSTR("testdata/mnist.onnx"), + ORT_TSTR(""), ORT_TSTR("testdata/mnist_with_external_initializers.onnx"), + ORT_TSTR("mnist_external_initializers.bin"), + model_saving_options)); } // Original model has external initializers TEST(SaveWithExternalInitializers, ModelWithOriginalExternalData) { - Graph::OffsetAlignmentInfo align_info; - 
ASSERT_STATUS_OK(LoadSaveAndCompareModel(ORT_TSTR("testdata/model_with_orig_ext_data.onnx"), ORT_TSTR("model_with_orig_ext_data.onnx.data"), ORT_TSTR("testdata/model_with_new_external_initializers.onnx"), ORT_TSTR("model_with_new_external_initializers.bin"), 0, align_info)); + ModelSavingOptions model_saving_options{0}; + ASSERT_STATUS_OK(LoadSaveAndCompareModel( + ORT_TSTR("testdata/model_with_orig_ext_data.onnx"), + ORT_TSTR("model_with_orig_ext_data.onnx.data"), + ORT_TSTR("testdata/model_with_new_external_initializers.onnx"), + ORT_TSTR("model_with_new_external_initializers.bin"), + model_saving_options)); } // Original model has external initializers, align offset TEST(SaveWithExternalInitializers, ModelWithOriginalExternalDataAlignOffset) { - Graph::OffsetAlignmentInfo align_info; - align_info.align_offset = true; - align_info.align_threshold = 0; - ASSERT_STATUS_OK(LoadSaveAndCompareModel(ORT_TSTR("testdata/model_with_orig_ext_data.onnx"), ORT_TSTR("model_with_orig_ext_data.onnx.data"), ORT_TSTR("testdata/model_with_new_external_initializers.onnx"), ORT_TSTR("model_with_new_external_initializers.bin"), 0, align_info)); + ModelSavingOptions model_saving_options{0}; + model_saving_options.align_offset = true; + model_saving_options.align_threshold = 0; + ASSERT_STATUS_OK(LoadSaveAndCompareModel( + ORT_TSTR("testdata/model_with_orig_ext_data.onnx"), + ORT_TSTR("model_with_orig_ext_data.onnx.data"), + ORT_TSTR("testdata/model_with_new_external_initializers.onnx"), + ORT_TSTR("model_with_new_external_initializers.bin"), model_saving_options)); } } // namespace test diff --git a/onnxruntime/test/framework/session_state_test.cc b/onnxruntime/test/framework/session_state_test.cc index 3e694020f796b..e7f8b1aaa49d8 100644 --- a/onnxruntime/test/framework/session_state_test.cc +++ b/onnxruntime/test/framework/session_state_test.cc @@ -15,6 +15,7 @@ #include "core/graph/graph_utils.h" #include "core/graph/graph_viewer.h" #include "core/graph/model.h" +#include 
"core/graph/model_saving_options.h" #include "core/graph/op.h" #include "core/providers/cpu/cpu_execution_provider.h" #include "core/session/onnxruntime_session_options_config_keys.h" @@ -22,13 +23,101 @@ #include "gtest/gtest.h" #include "test/test_environment.h" #include "test/util/include/default_providers.h" +#include "test/util/include/file_util.h" #include "core/optimizer/layout_transformation/layout_transformation.h" using namespace ONNX_NAMESPACE; -using namespace std; namespace onnxruntime { - namespace test { + +#ifndef ENABLE_TRAINING_CORE +#ifndef __wasm__ +static void TestSavedPrepacks(const Model& model) { + auto inspect = [](const Graph& graph) { + const auto& prepacked_for_graph = graph.GetPrepacked(); + const auto& key_to_blob = prepacked_for_graph.GetKeyToBlob(); + ASSERT_EQ(1U, key_to_blob.size()); + const size_t expected_prepacks_for_writing = (graph.ParentGraph() == nullptr) ? 1U : 0U; + ASSERT_EQ(expected_prepacks_for_writing, prepacked_for_graph.GetNumberOfWeightsForWriting()); + + const size_t expected_blobs_for_writing = (graph.ParentGraph() == nullptr) ? 
1U : 0U; + ASSERT_EQ(expected_blobs_for_writing, prepacked_for_graph.GetNumberOfKeyedBlobsForWriting()); + + if (graph.ParentGraph() == nullptr) { + const auto* blob_keys = prepacked_for_graph.GetKeysForWeightForSaving("if_shared"); + ASSERT_TRUE(blob_keys != nullptr); + ASSERT_EQ(blob_keys->size(), 1U); + const auto* prepacked_weights = prepacked_for_graph.GetPrepackedWeights(*blob_keys->cbegin()); + ASSERT_TRUE(prepacked_weights != nullptr); + ASSERT_EQ(prepacked_weights->buffer_sizes_.size(), 1U); + ASSERT_EQ(prepacked_weights->buffer_sizes_[0], sizeof(float) * 2); + } + }; + + const auto& main_graph = model.MainGraph(); + inspect(main_graph); + + const auto& nodes = main_graph.Nodes(); + auto if_node_hit = std::find_if(nodes.begin(), nodes.end(), + [](const Node& node) { return node.Name() == "if"; }); + ASSERT_FALSE(if_node_hit == nodes.end()); + const Node& if_node = *if_node_hit; + for (const auto& [_, subgraph] : if_node.GetAttributeNameToSubgraphMap()) { + inspect(*subgraph); + } +} + +static void TestLoadedSharedUserSupplied(const Model& model) { + auto inspect = [](const Graph& graph) { + const auto& prepacked_for_graph = graph.GetPrepacked(); + constexpr size_t expected_prepacks_for_writing = 0U; + ASSERT_EQ(expected_prepacks_for_writing, prepacked_for_graph.GetNumberOfWeightsForWriting()); + + // We have not loaded anything since this initializer is user supplied + const auto& key_to_blob = prepacked_for_graph.GetKeyToBlob(); + ASSERT_EQ(0U, key_to_blob.size()); + }; + + const auto& main_graph = model.MainGraph(); + inspect(main_graph); + + const auto& nodes = main_graph.Nodes(); + auto if_node_hit = std::find_if(nodes.begin(), nodes.end(), + [](const Node& node) { return node.Name() == "if"; }); + ASSERT_FALSE(if_node_hit == nodes.end()); + const Node& if_node = *if_node_hit; + for (const auto& [_, subgraph] : if_node.GetAttributeNameToSubgraphMap()) { + inspect(*subgraph); + } +} + +static void TestLoadedSharedNoUserSupplied(const Model& model) { + 
auto inspect = [](const Graph& graph) { + const auto& prepacked_for_graph = graph.GetPrepacked(); + constexpr size_t expected_prepacks_for_writing = 0U; + ASSERT_EQ(expected_prepacks_for_writing, prepacked_for_graph.GetNumberOfWeightsForWriting()); + + // A pre-packed blob was loaded from disk since this initializer is not user supplied + const auto& key_to_blob = prepacked_for_graph.GetKeyToBlob(); + ASSERT_EQ(1U, key_to_blob.size()); + }; + + const auto& main_graph = model.MainGraph(); + inspect(main_graph); + + const auto& nodes = main_graph.Nodes(); + auto if_node_hit = std::find_if(nodes.begin(), nodes.end(), + [](const Node& node) { return node.Name() == "if"; }); + ASSERT_FALSE(if_node_hit == nodes.end()); + const Node& if_node = *if_node_hit; + for (const auto& [_, subgraph] : if_node.GetAttributeNameToSubgraphMap()) { + inspect(*subgraph); + } +} + +#endif // __wasm__ +#endif // ENABLE_TRAINING_CORE + class TestOpKernel : public OpKernel { public: TestOpKernel(const OpKernelInfo& p) : OpKernel(p) { @@ -378,7 +467,7 @@ class PrePackingTestOpKernel : public OpKernel { ORT_UNUSED_PARAMETER(tensor); ORT_UNUSED_PARAMETER(input_idx); - size_t weight_packed_len = 8; + constexpr const size_t weight_packed_len = sizeof(float) * 2; weight_packed_ = IAllocator::MakeUniquePtr(alloc, weight_packed_len, true); float* data_weights_packed = reinterpret_cast(weight_packed_.get()); data_weights_packed[0] = 1.2345f; @@ -647,7 +736,8 @@ class SessionStateTestSharedInitalizersWithPrePacking : public ::testing::Test { } }; -// Pre-packing enabled + no shared initializers = no pre-packed weights caching +// Pre-packing enabled + no shared initializers, however, we put all the pre-packs +// in a session_state container for ownership.
TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test1) { SessionOptions sess_options; sess_options.enable_mem_pattern = true; @@ -679,10 +769,11 @@ TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test1) { const auto* kernel = reinterpret_cast(session_state_1.GetKernel(0)); - // Assert that a pre-pack call was made and that no mechanism to store weight from shared container was invoked + // Assert that a pre-pack call was made. However, the sharing call is still made from a serialized container. ASSERT_EQ(session_state_1.GetNumberOfPrepacksCounter(), static_cast(1)); ASSERT_EQ(kernel->prepack_calls_count, 1); - ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 0); + // In this case the sharing comes from the serialized container + ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 1); // Second session/model Model model_2("graph_main", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), @@ -706,10 +797,11 @@ TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test1) { kernel = reinterpret_cast(session_state_2.GetKernel(0)); - // Assert that a pre-pack call was made and that no mechanism to store weight from shared container was invoked + // Assert that a pre-pack call was made. The weights are still shared from the serialized container + // either because they are loaded from disk or because the container takes ownership of them.
ASSERT_EQ(session_state_2.GetNumberOfPrepacksCounter(), static_cast(1)); ASSERT_EQ(kernel->prepack_calls_count, 1); - ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 0); + ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 1); } // Pre-packing enabled + shared initializers + no pre-packed weights container = no pre-packed weights caching @@ -754,10 +846,10 @@ TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test2) { const auto* kernel = reinterpret_cast(session_state_1.GetKernel(0)); - // Assert that a pre-pack call was made and that no mechanism to store weight from shared container was invoked + // Assert that a pre-pack call was made, but sharing still takes place from the serialized container ASSERT_EQ(session_state_1.GetNumberOfPrepacksCounter(), static_cast(1)); ASSERT_EQ(kernel->prepack_calls_count, 1); - ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 0); + ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 1); // Second session/model Model model_2("graph_main", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), @@ -781,10 +873,10 @@ TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test2) { kernel = reinterpret_cast(session_state_2.GetKernel(0)); - // Assert that a pre-pack call was made and that no mechanism to store weight from shared container was invoked + // Assert that a pre-pack call was made, but sharing still takes place from the serialized container ASSERT_EQ(session_state_2.GetNumberOfPrepacksCounter(), static_cast(1)); ASSERT_EQ(kernel->prepack_calls_count, 1); - ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 0); + ASSERT_EQ(kernel->store_pre_packed_weight_calls_count, 1); } // Pre-packing enabled + shared initializers + pre-packed weights container = pre-packed weights caching enabled @@ -999,6 +1091,196 @@ TEST_F(SessionStateTestSharedInitalizersWithPrePacking, test4) { ASSERT_EQ(if_node_branches_shared_prepack_counter_2, static_cast(2)); } +#ifndef __wasm__ +// sharing is on 
+TEST_F(SessionStateTestSharedInitalizersWithPrePacking, TestPrepackedSerialization) { + const std::filesystem::path model_with_external_initializers = + "testdata/test_prepacked_serialization_optimized_model.onnx"; + + const std::filesystem::path external_initializers_file = + "test_prepacked_serialization_optimized_model.bin"; + + { + SessionOptions sess_options; + sess_options.enable_mem_pattern = true; + sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; + sess_options.use_deterministic_compute = false; + sess_options.enable_mem_reuse = true; + sess_options.optimized_model_filepath = model_with_external_initializers; + + // Enable pre-packing + sess_options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] = "0"; + // Enable saving model with pre-packed weights + sess_options.config_options.configurations[kOrtSessionOptionsSavePrePackedConstantInitializers] = "1"; + + // Enable shared initializer + OrtMemoryInfo mem_info(CPU, OrtDeviceAllocator); + std::vector float_data(1, 1); + auto value = std::make_unique(); + Tensor::InitOrtValue(DataTypeImpl::GetType(), TensorShape(std::vector{1}), + reinterpret_cast(float_data.data()), mem_info, *value); + + ASSERT_STATUS_OK(sess_options.AddInitializer("if_shared", value.get())); + + // Enable pre-packed weights container for shared initializers + PrepackedWeightsContainer prepacked_weights_container; + Model model_1("graph_main", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), + domain_to_version, std::vector(), + DefaultLoggingManager().DefaultLogger()); + + CreateGraphWithSubgraph(model_1.MainGraph()); + PlaceAllNodesToCPUEP(model_1.MainGraph()); + SessionState session_state_1(model_1.MainGraph(), + execution_providers, + tp.get(), + nullptr, /*inter_op_thread_pool*/ + dtm, + edlm, + DefaultLoggingManager().DefaultLogger(), + profiler, + sess_options, + &prepacked_weights_container); + + constexpr const bool saving_model_true = true; + + 
ASSERT_STATUS_OK(session_state_1.FinalizeSessionState(std::basic_string(), + kernel_registry_manager, + !saving_model_true)); + + TestSavedPrepacks(model_1); + + ModelSavingOptions model_saving_options{4}; + model_saving_options.align_offset = true; + + ASSERT_STATUS_OK(Model::SaveWithExternalInitializers(model_1, model_with_external_initializers, + external_initializers_file, + model_saving_options)); + } + ScopedFileDeleter test_model_deleter(model_with_external_initializers); + ScopedFileDeleter binary_file_deleter(external_initializers_file); + + // Now let's load the model along with the initializers + { + SessionOptions sess_options; + sess_options.enable_mem_pattern = true; + sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; + sess_options.use_deterministic_compute = false; + sess_options.enable_mem_reuse = true; + + // Enable pre-packing + sess_options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] = "0"; + + // We are expecting this weight to be loaded from disk along + // with its pre-packed version + // Enable shared initializer + OrtMemoryInfo mem_info(CPU, OrtDeviceAllocator); + std::vector float_data(1, 1); + auto value = std::make_unique(); + Tensor::InitOrtValue(DataTypeImpl::GetType(), TensorShape(std::vector{1}), + reinterpret_cast(float_data.data()), mem_info, *value); + + ASSERT_STATUS_OK(sess_options.AddInitializer("if_shared", value.get())); + + // Enable pre-packed weights container for shared initializers + PrepackedWeightsContainer prepacked_weights_container; + + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(model_with_external_initializers, model, nullptr, + DefaultLoggingManager().DefaultLogger())); + + PlaceAllNodesToCPUEP(model->MainGraph()); + SessionState session_state(model->MainGraph(), + execution_providers, + tp.get(), + nullptr, /*inter_op_thread_pool*/ + dtm, + edlm, + DefaultLoggingManager().DefaultLogger(), + profiler, + sess_options, + &prepacked_weights_container); + + 
ASSERT_STATUS_OK(session_state.FinalizeSessionState(std::basic_string(), + kernel_registry_manager, + false)); + + TestLoadedSharedUserSupplied(*model); + } + + // Load again, this time sharing is enabled, but no shared initializer in the map + { + SessionOptions sess_options; + sess_options.enable_mem_pattern = true; + sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; + sess_options.use_deterministic_compute = false; + sess_options.enable_mem_reuse = true; + + // Enable pre-packing + sess_options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] = "0"; + + // Enable pre-packed weights container for shared initializers + PrepackedWeightsContainer prepacked_weights_container; + + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(model_with_external_initializers, model, nullptr, + DefaultLoggingManager().DefaultLogger())); + + PlaceAllNodesToCPUEP(model->MainGraph()); + SessionState session_state(model->MainGraph(), + execution_providers, + tp.get(), + nullptr, /*inter_op_thread_pool*/ + dtm, + edlm, + DefaultLoggingManager().DefaultLogger(), + profiler, + sess_options, + &prepacked_weights_container); + + ASSERT_STATUS_OK(session_state.FinalizeSessionState(model_with_external_initializers, + kernel_registry_manager, + false)); + + TestLoadedSharedNoUserSupplied(*model); + } + // Load again, sharing is disabled + { + SessionOptions sess_options; + sess_options.enable_mem_pattern = true; + sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL; + sess_options.use_deterministic_compute = false; + sess_options.enable_mem_reuse = true; + + // Enable pre-packing + sess_options.config_options.configurations[kOrtSessionOptionsConfigDisablePrepacking] = "0"; + + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(model_with_external_initializers, model, nullptr, + DefaultLoggingManager().DefaultLogger())); + + PlaceAllNodesToCPUEP(model->MainGraph()); + SessionState session_state(model->MainGraph(), + execution_providers, + 
tp.get(), + nullptr, /*inter_op_thread_pool*/ + dtm, + edlm, + DefaultLoggingManager().DefaultLogger(), + profiler, + sess_options, + nullptr); + + ASSERT_STATUS_OK(session_state.FinalizeSessionState(model_with_external_initializers, + kernel_registry_manager, + false)); + + const auto& prepacked_for_main_graph = model->MainGraph().GetPrepacked(); + ASSERT_FALSE(prepacked_for_main_graph.IsSaveModeOn()); + ASSERT_EQ(1U, prepacked_for_main_graph.GetKeyToBlob().size()); + } +} +#endif // __wasm__ + INSTANTIATE_TEST_SUITE_P(SessionStateTests, SessionStatePrepackingTest, testing::Values(PrepackingTestParam{false, false}, diff --git a/onnxruntime/test/framework/tensorutils_test.cc b/onnxruntime/test/framework/tensorutils_test.cc index 6821f582ce2de..229f4f95b8394 100644 --- a/onnxruntime/test/framework/tensorutils_test.cc +++ b/onnxruntime/test/framework/tensorutils_test.cc @@ -1,6 +1,9 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include "core/common/inlined_containers.h" +#include "core/framework/prepacked_weights.h" +#include "core/framework/prepacked_weights_container.h" #include "core/framework/tensorprotoutils.h" #include "core/graph/onnx_protobuf.h" #include "test/util/include/asserts.h" @@ -19,6 +22,76 @@ using namespace ONNX_NAMESPACE; namespace onnxruntime { namespace test { +// Test ExternalData functionality +TEST(TensorProtoUtilsTest, SetExternalDataInformation) { + ONNX_NAMESPACE::TensorProto tensor_proto; + const std::filesystem::path kExternalDataPath("test.bin"); + constexpr const int64_t init_offset = 100; + constexpr const size_t init_length = 200; + + ExternalDataInfo::SetExternalLocationToProto(kExternalDataPath, init_offset, init_length, tensor_proto); + + ASSERT_EQ(tensor_proto.data_location(), ONNX_NAMESPACE::TensorProto_DataLocation::TensorProto_DataLocation_EXTERNAL); + ASSERT_EQ(tensor_proto.external_data_size(), 3); + ASSERT_EQ(tensor_proto.external_data(0).key(), "location"); + 
ASSERT_EQ(tensor_proto.external_data(0).value(), ToUTF8String(kExternalDataPath.native())); + ASSERT_EQ(tensor_proto.external_data(1).key(), "offset"); + ASSERT_EQ(tensor_proto.external_data(1).value(), std::to_string(init_offset)); + ASSERT_EQ(tensor_proto.external_data(2).key(), "length"); + ASSERT_EQ(tensor_proto.external_data(2).value(), std::to_string(init_length)); + + PrepackedKeyToBlobMap key_to_blob; + constexpr bool save_mode_on = true; + PrepackedWeightsForGraph prepacked_for_graph(key_to_blob, save_mode_on); + PrePackedWeights prepacked_weights; + const std::string init_name = "test_initializer"; + const std::string blob_key = "test_key"; + + std::array kData = {1.2345f, 2.4690f}; + const size_t buffer_size = kData.size() * sizeof(float); + + prepacked_weights.buffers_.push_back(BufferUniquePtr(kData.data(), BufferDeleter(nullptr))); + prepacked_weights.buffer_sizes_.push_back(buffer_size); + // Write a second entry like this + prepacked_weights.buffers_.push_back(BufferUniquePtr(kData.data(), BufferDeleter(nullptr))); + prepacked_weights.buffer_sizes_.push_back(buffer_size); + + prepacked_for_graph.WritePackedMaybeForSave(init_name, blob_key, std::move(prepacked_weights)); + + constexpr const int64_t starting_offset = 300; + int64_t external_offset = starting_offset; + std::stringstream ss; + const auto* blobs_for_weight = prepacked_for_graph.GetKeysForWeightForSaving(init_name); + ASSERT_TRUE(blobs_for_weight != nullptr); + InlinedHashSet blob_keys{blobs_for_weight->begin(), blobs_for_weight->end()}; + ASSERT_TRUE(ExternalDataInfo::WritePrepackedToFileAndAddToProto(prepacked_for_graph, + blob_keys, + true, 1024 * 1024, 0, + ss, external_offset, + tensor_proto)); + + auto external_data_info = std::make_unique(); + ASSERT_STATUS_OK(ExternalDataInfo::Create(tensor_proto.external_data(), external_data_info)); + + // This should have prepacked_data entry with two blobs for a single key. 
+ ASSERT_TRUE(external_data_info->HasPrepackedInfo()); + auto prepacked_infos = external_data_info->TakePrepackedInfos(); + ASSERT_EQ(prepacked_infos.size(), 1U); + ASSERT_TRUE(prepacked_infos.count(blob_key) > 0); + + int64_t final_offset = starting_offset; + for (const auto& blob_info : prepacked_infos[blob_key]) { + int64_t offset = std::get<0>(blob_info); + ASSERT_EQ(offset, final_offset); + size_t length = std::get<1>(blob_info); + std::string checksum = std::get<2>(blob_info); // currently "0" + final_offset = offset + length; + ASSERT_EQ(length, buffer_size); + ASSERT_EQ(checksum, "0"); + } + ASSERT_EQ(final_offset, external_offset); +} + // T must be float for double, and it must match with the 'type' argument template void TestUnpackFloatTensor(TensorProto_DataType type, const std::filesystem::path& model_path) { diff --git a/orttraining/orttraining/core/session/training_session.cc b/orttraining/orttraining/core/session/training_session.cc index f1545e96481fa..b03f1b1eadb3b 100644 --- a/orttraining/orttraining/core/session/training_session.cc +++ b/orttraining/orttraining/core/session/training_session.cc @@ -5,6 +5,7 @@ #include "core/framework/data_transfer_utils.h" #include "core/graph/model.h" +#include "core/graph/model_saving_options.h" #include "core/session/IOBinding.h" #include "core/optimizer/rule_based_graph_transformer.h" #include "core/providers/cpu/controlflow/utils.h" @@ -1003,7 +1004,8 @@ Status TrainingSession::SaveWithExternalInitializers(const PathString& model_uri std::remove(ToUTF8String(model_uri).c_str()); std::remove(external_file_name.c_str()); - return Model::SaveWithExternalInitializers(*model_, model_uri, external_file_name, initializer_size_threshold); + ModelSavingOptions model_saving_options{initializer_size_threshold}; + return Model::SaveWithExternalInitializers(*model_, model_uri, external_file_name, model_saving_options); } Status TrainingSession::Save(const PathString& model_uri, TrainingSession::SaveOption opt) { diff 
--git a/orttraining/orttraining/training_api/module.cc b/orttraining/orttraining/training_api/module.cc index 939e1de334e52..60708b05626c5 100644 --- a/orttraining/orttraining/training_api/module.cc +++ b/orttraining/orttraining/training_api/module.cc @@ -11,6 +11,7 @@ #include "core/session/inference_session.h" #include "core/session/environment.h" #include "core/session/onnxruntime_session_options_config_keys.h" +#include "core/graph/model_saving_options.h" #include "core/graph/graph_utils.h" #include "orttraining/training_api/checkpoint.h" @@ -689,8 +690,10 @@ Status Module::ExportModelForInferencing(const std::string& inference_model_path std::string external_data_name = ORT_TSTR_CONVERT_TO_PRINTABLE_STRING(ExternalCheckpointDataPath(ToPathString(inference_model_path))); PathString inference_model_pathstring = ToPathString(inference_model_path); + ModelSavingOptions model_saving_options{64}; ORT_THROW_IF_ERROR( - Model::SaveWithExternalInitializers(*inference_model, inference_model_pathstring, external_data_name, 64)); + Model::SaveWithExternalInitializers(*inference_model, inference_model_pathstring, external_data_name, + model_saving_options)); } else { ORT_THROW_IF_ERROR(Model::Save(*inference_model, ToPathString(inference_model_path))); }