[QNN EP] Add provider option to offload graph I/O quantization/dequantization to the CPU EP (#22436)

### Description
Adds QNN provider option `offload_graph_io_quantization` to offload
graph input quantization and graph output dequantization to the CPU EP.
The option is disabled by default to maintain current behavior.


### Motivation and Context
Offloading the handling of I/O quantization to the CPU EP significantly
improves inference latency for many models.
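
For reference, a minimal sketch of enabling the new option through the C++ API; the backend library name and model path below are illustrative placeholders, not part of this change.

#include <string>
#include <unordered_map>

#include "onnxruntime_cxx_api.h"

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "qnn_io_offload_example");
  Ort::SessionOptions so;

  // QNN EP provider options. "offload_graph_io_quantization" is the option added by
  // this commit; "1" asks QNN EP to leave graph input Q and graph output DQ nodes
  // to another EP (typically the CPU EP).
  std::unordered_map<std::string, std::string> qnn_options{
      {"backend_path", "QnnHtp.dll"},  // placeholder backend library
      {"offload_graph_io_quantization", "1"}};
  so.AppendExecutionProvider("QNN", qnn_options);

  Ort::Session session(env, ORT_TSTR("model.qdq.onnx"), so);  // placeholder model
  return 0;
}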
adrianlizarraga authored and apsonawane committed Oct 21, 2024
1 parent 84aee0c commit 9be2fd1
Showing 14 changed files with 172 additions and 24 deletions.
18 changes: 11 additions & 7 deletions include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -3651,13 +3651,17 @@ struct OrtApi {
* - "73"
* - "75"
* "device_id": The ID of the device to use when setting 'htp_arch'. Defaults to "0" (for single device).
"enable_htp_fp16_precision": Used for float32 model for HTP backend.
Enable the float32 model to be inferenced with fp16 precision. Otherwise, it will be fp32 precision.
- "0": With fp32 precision.
- "1": Default. With fp16 precision.
"enable_htp_weight_sharing": Enable QNN weight sharing feature while compiling multiple graphs into one QNN context.
- "0": Default. Disabled.
- "1": Enabled.
* "enable_htp_fp16_precision": Used for float32 model for HTP backend.
* Enable the float32 model to be inferenced with fp16 precision. Otherwise, it will be fp32 precision.
* - "0": With fp32 precision.
* - "1": Default. With fp16 precision.
* "enable_htp_weight_sharing": Enable QNN weight sharing feature while compiling multiple graphs into one QNN context.
* - "0": Default. Disabled.
* - "1": Enabled.
* "offload_graph_io_quantization": Offload graph input quantization and graph output dequantization to another
* execution provider (typically CPU EP).
* - "0": Default. Disabled. QNN EP will handle quantization and dequantization of graph I/O.
* - "1": Enabled.
*
* SNPE supported keys:
* "runtime": SNPE runtime engine, options: "CPU", "CPU_FLOAT32", "GPU", "GPU_FLOAT32_16_HYBRID", "GPU_FLOAT16",
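Below is a minimal sketch of the same configuration through the C API entry point whose documentation is updated above (SessionOptionsAppendExecutionProvider); the backend library name is a placeholder and status handling is abbreviated.

#include "onnxruntime_c_api.h"

// Appends the QNN EP with graph I/O quantization offloaded to another EP.
// Assumes `session_options` was created via CreateSessionOptions; a real caller
// would inspect the returned OrtStatus instead of discarding it.
void AppendQnnWithIOOffload(const OrtApi* ort, OrtSessionOptions* session_options) {
  const char* keys[] = {"backend_path", "offload_graph_io_quantization"};
  const char* values[] = {"QnnHtp.dll", "1"};  // placeholder backend library
  OrtStatus* status = ort->SessionOptionsAppendExecutionProvider(
      session_options, "QNN", keys, values, 2);
  if (status != nullptr) {
    ort->ReleaseStatus(status);
  }
}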
onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc
@@ -164,13 +164,23 @@ Status SimpleOpBuilder::ExplicitOpCheck(QnnModelWrapper& qnn_model_wrapper,
int64_t quant_axis = 0;
ORT_RETURN_IF_ERROR(qnn_model_wrapper.IsPerChannelQuantized(node_unit.Inputs()[0], is_per_chan_quant, quant_axis));
ORT_RETURN_IF(is_per_chan_quant, "QNN EP does not support a standalone DQ op with per-channel quantization");

if (qnn_model_wrapper.GetModelSettings().offload_graph_io_quantization) {
ORT_RETURN_IF(qnn_model_wrapper.IsGraphOutput(node_unit.Outputs()[0].node_arg.Name()),
"QNN EP is configured to not take DQ nodes that generate a graph output.");
}
}

if (op_type == "QuantizeLinear") {
bool is_per_chan_quant = false;
int64_t quant_axis = 0;
ORT_RETURN_IF_ERROR(qnn_model_wrapper.IsPerChannelQuantized(node_unit.Outputs()[0], is_per_chan_quant, quant_axis));
ORT_RETURN_IF(is_per_chan_quant, "QNN EP does not support a standalone Q op with per-channel quantization");

if (qnn_model_wrapper.GetModelSettings().offload_graph_io_quantization) {
ORT_RETURN_IF(qnn_model_wrapper.IsGraphInput(node_unit.Inputs()[0].node_arg.Name()),
"QNN EP is configured to not take Q nodes that consume a graph input.");
}
}

return Status::OK();
4 changes: 3 additions & 1 deletion onnxruntime/core/providers/qnn/builder/qnn_model.cc
@@ -95,6 +95,7 @@ const NodeUnit& QnnModel::GetNodeUnit(const Node* node,

Status QnnModel::ComposeGraph(const GraphViewer& graph_viewer,
const onnxruntime::Node& fused_node,
const qnn::ModelSettings& model_settings,
const logging::Logger& logger,
const QnnGraph_Config_t** graph_configs) {
LOGS(logger, VERBOSE) << "ComposeGraph Graph name: " << graph_viewer.Name();
@@ -115,7 +116,8 @@ Status QnnModel::ComposeGraph(const GraphViewer& graph_viewer,
model_input_index_map_,
model_output_index_map_,
initializer_inputs_,
qnn_backend_manager_->GetQnnBackendType());
qnn_backend_manager_->GetQnnBackendType(),
model_settings);
bool rt = true;
rt = qnn_model_wrapper.CreateQnnGraph(qnn_backend_manager_->GetQnnContext(), graph_name, graph_configs);
if (!rt) {
1 change: 1 addition & 0 deletions onnxruntime/core/providers/qnn/builder/qnn_model.h
@@ -35,6 +35,7 @@ class QnnModel {

Status ComposeGraph(const GraphViewer& graph_viewer,
const onnxruntime::Node& fused_node,
const qnn::ModelSettings& model_settings,
const logging::Logger& logger,
const QnnGraph_Config_t** graph_configs = nullptr);

13 changes: 11 additions & 2 deletions onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h
@@ -29,6 +29,10 @@ struct TensorInfo {
const ONNX_NAMESPACE::TensorProto* initializer_tensor;
};

struct ModelSettings {
bool offload_graph_io_quantization = false;
};

class QnnModelWrapper {
public:
QnnModelWrapper(const GraphViewer& graph_viewer,
@@ -38,20 +42,24 @@ class QnnModelWrapper {
const std::unordered_map<std::string, size_t>& input_index_map,
const std::unordered_map<std::string, size_t>& output_index_map,
const std::unordered_set<std::string>& initializer_lookup,
QnnBackendType qnn_backend_type)
QnnBackendType qnn_backend_type,
const ModelSettings& model_settings)
: graph_viewer_(graph_viewer),
logger_(logger),
qnn_interface_(qnn_interface),
backend_handle_(backend_handle),
input_index_map_(input_index_map),
output_index_map_(output_index_map),
initializer_lookup_(initializer_lookup),
qnn_backend_type_(qnn_backend_type) {
qnn_backend_type_(qnn_backend_type),
model_settings_(model_settings) {
}
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(QnnModelWrapper);

~QnnModelWrapper() = default;

const ModelSettings& GetModelSettings() const { return model_settings_; }

bool CreateQnnGraph(const Qnn_ContextHandle_t& context,
const std::string& graph_name,
const QnnGraph_Config_t** graph_configs = nullptr);
@@ -279,6 +287,7 @@ class QnnModelWrapper {
const std::unordered_map<std::string, size_t>& output_index_map_;
const std::unordered_set<std::string>& initializer_lookup_;
QnnBackendType qnn_backend_type_ = QnnBackendType::CPU;
ModelSettings model_settings_ = {};
}; // QnnModelWrapper

} // namespace qnn
32 changes: 30 additions & 2 deletions onnxruntime/core/providers/qnn/qnn_execution_provider.cc
@@ -161,6 +161,23 @@ static void ParseHtpArchitecture(const std::string& htp_arch_string, QnnHtpDevic
}
}

static bool ParseBoolOption(const std::string& key, bool default_value,
const std::unordered_map<std::string, std::string>& options) {
bool result = default_value;
auto it = options.find(key);
if (it != options.end()) {
if ("1" == it->second) {
result = true;
} else if ("0" == it->second) {
result = false;
} else {
LOGS_DEFAULT(VERBOSE) << "Invalid value for " << key << " (" << it->second << "). Only 0 or 1 allowed.";
}
LOGS_DEFAULT(VERBOSE) << "Using " << key << ": " << result;
}
return result;
}

qnn::ProfilingLevel QNNExecutionProvider::GetProfilingLevelFromETWLevel(unsigned char level) {
if (level == 5) {
LOGS_DEFAULT(INFO) << "Overriding profiling to basic based on ETW level: " << static_cast<int>(level);
@@ -403,6 +420,15 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
LOGS_DEFAULT(VERBOSE) << "User specified enable_htp_weight_sharing: " << enable_htp_weight_sharing_;
}

model_settings_.offload_graph_io_quantization = ParseBoolOption("offload_graph_io_quantization", false,
provider_options_map);

if (disable_cpu_ep_fallback_ && model_settings_.offload_graph_io_quantization) {
LOGS_DEFAULT(WARNING) << "Fallback to CPU EP is disabled, but user configured QNN EP to offload graph I/O "
<< "quantization/dequantization to another EP. Session creation will fail if the CPU EP "
<< "handles the graph I/O quantization/dequantization.";
}

qnn_backend_manager_ = std::make_unique<qnn::QnnBackendManager>(
std::move(backend_path),
profiling_level_etw,
@@ -499,7 +525,8 @@ QNNExecutionProvider::GetSupportedNodes(const GraphViewer& graph_viewer,
model_input_index_map,
model_output_index_map,
initializer_input_lookup,
qnn_backend_manager_->GetQnnBackendType());
qnn_backend_manager_->GetQnnBackendType(),
model_settings_);

std::vector<std::unique_ptr<qnn::IQnnNodeGroup>> qnn_node_groups;
qnn_node_groups.reserve(node_unit_size);
@@ -845,7 +872,8 @@ Status QNNExecutionProvider::CompileFromOrtGraph(const std::vector<FusedNodeAndG
QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT);
InitQnnGraphConfigs(graph_configs_builder);

ORT_RETURN_IF_ERROR(qnn_model->ComposeGraph(graph_viewer, fused_node, logger, graph_configs_builder.GetQnnConfigs()));
ORT_RETURN_IF_ERROR(qnn_model->ComposeGraph(graph_viewer, fused_node, model_settings_, logger,
graph_configs_builder.GetQnnConfigs()));
ORT_RETURN_IF_ERROR(qnn_model->FinalizeGraphs(logger));
ORT_RETURN_IF_ERROR(qnn_model->SetupQnnInputOutput(logger));

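To put the warning above in context, here is a minimal sketch of the conflicting configuration it guards against, assuming the standard "session.disable_cpu_ep_fallback" session config key; session creation only fails later if the CPU EP actually ends up handling the offloaded Q/DQ nodes.

#include <string>
#include <unordered_map>

#include "onnxruntime_cxx_api.h"

void ConfigureConflictingOptions(Ort::SessionOptions& so) {
  // Disallow falling back to the CPU EP ...
  so.AddConfigEntry("session.disable_cpu_ep_fallback", "1");
  // ... while asking QNN EP to hand graph I/O quantization to another EP (normally CPU).
  std::unordered_map<std::string, std::string> qnn_options{
      {"backend_path", "QnnHtp.dll"},  // placeholder backend library
      {"offload_graph_io_quantization", "1"}};
  so.AppendExecutionProvider("QNN", qnn_options);
}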
1 change: 1 addition & 0 deletions onnxruntime/core/providers/qnn/qnn_execution_provider.h
@@ -153,6 +153,7 @@ class QNNExecutionProvider : public IExecutionProvider {
#ifdef _WIN32
onnxruntime::logging::EtwRegistrationManager::EtwInternalCallback callback_ETWSink_provider_;
#endif
qnn::ModelSettings model_settings_ = {};

class PerThreadContext final {
public:
8 changes: 5 additions & 3 deletions onnxruntime/test/onnx/main.cc
@@ -77,6 +77,8 @@ void usage() {
"\t [QNN only] [device_id]: The ID of the device to use when setting 'htp_arch'. Defaults to '0' (for single device). \n"
"\t [QNN only] [enable_htp_fp16_precision]: Enable the HTP_FP16 precision so that the float32 model will be inferenced with fp16 precision. \n"
"\t Otherwise, it will be fp32 precision. Works for float32 model for HTP backend. Defaults to '1' (with FP16 precision.). \n"
"\t [QNN only] [offload_graph_io_quantization]: Offload graph input quantization and graph output dequantization to another EP (typically CPU EP). \n"
"\t Defaults to '0' (QNN EP handles the graph I/O quantization and dequantization). \n"
"\t [Usage]: -e <provider_name> -i '<key1>|<value1> <key2>|<value2>' \n\n"
"\t [Example] [For QNN EP] -e qnn -i \"profiling_level|detailed backend_path|/folderpath/libQnnCpu.so\" \n\n"
"\t [SNPE only] [runtime]: SNPE runtime, options: 'CPU', 'GPU', 'GPU_FLOAT16', 'DSP', 'AIP_FIXED_TF'. \n"
@@ -587,20 +589,20 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
std::string str = str_stream.str();
ORT_THROW("Wrong value for htp_arch. select from: " + str);
}
} else if (key == "enable_htp_fp16_precision") {
} else if (key == "enable_htp_fp16_precision" || key == "offload_graph_io_quantization") {
std::unordered_set<std::string> supported_options = {"0", "1"};
if (supported_options.find(value) == supported_options.end()) {
std::ostringstream str_stream;
std::copy(supported_options.begin(), supported_options.end(),
std::ostream_iterator<std::string>(str_stream, ","));
std::string str = str_stream.str();
ORT_THROW("Wrong value for enable_htp_fp16_precision. select from: " + str);
ORT_THROW("Wrong value for ", key, ". select from: ", str);
}
} else {
ORT_THROW(R"(Wrong key type entered. Choose from options: ['backend_path',
'profiling_level', 'profiling_file_path', 'rpc_control_latency', 'vtcm_mb', 'htp_performance_mode',
'qnn_saver_path', 'htp_graph_finalization_optimization_mode', 'qnn_context_priority',
'soc_model', 'htp_arch', 'device_id', 'enable_htp_fp16_precision'])");
'soc_model', 'htp_arch', 'device_id', 'enable_htp_fp16_precision', 'offload_graph_io_quantization'])");
}

qnn_options[key] = value;
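With the help-text and option-validation updates above, the new key can also be exercised from the command-line test tools through the existing -e/-i syntax, for example: -e qnn -i "backend_path|QnnHtp.dll offload_graph_io_quantization|1" (the backend library name here is illustrative). The same syntax applies to onnxruntime_perf_test, updated below.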
2 changes: 2 additions & 0 deletions onnxruntime/test/perftest/command_args_parser.cc
@@ -98,6 +98,8 @@ namespace perftest {
"\t [QNN only] [device_id]: The ID of the device to use when setting 'htp_arch'. Defaults to '0' (for single device). \n"
"\t [QNN only] [enable_htp_fp16_precision]: Enable the HTP_FP16 precision so that the float32 model will be inferenced with fp16 precision. \n"
"\t Otherwise, it will be fp32 precision. Works for float32 model for HTP backend. Defaults to '1' (with FP16 precision.). \n"
"\t [QNN only] [offload_graph_io_quantization]: Offload graph input quantization and graph output dequantization to another EP (typically CPU EP). \n"
"\t Defaults to '0' (QNN EP handles the graph I/O quantization and dequantization). \n"
"\t [Example] [For QNN EP] -e qnn -i \"backend_path|/folderpath/libQnnCpu.so\" \n"
"\n"
"\t [TensorRT only] [trt_max_partition_iterations]: Maximum iterations for TensorRT parser to get capability.\n"
6 changes: 3 additions & 3 deletions onnxruntime/test/perftest/ort_test_session.cc
@@ -302,20 +302,20 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
std::string str = str_stream.str();
ORT_THROW("Wrong value for htp_arch. select from: " + str);
}
} else if (key == "enable_htp_fp16_precision") {
} else if (key == "enable_htp_fp16_precision" || key == "offload_graph_io_quantization") {
std::unordered_set<std::string> supported_options = {"0", "1"};
if (supported_options.find(value) == supported_options.end()) {
std::ostringstream str_stream;
std::copy(supported_options.begin(), supported_options.end(),
std::ostream_iterator<std::string>(str_stream, ","));
std::string str = str_stream.str();
ORT_THROW("Wrong value for " + key + ". select from: " + str);
ORT_THROW("Wrong value for ", key, ". select from: ", str);
}
} else {
ORT_THROW(R"(Wrong key type entered. Choose from options: ['backend_path',
'profiling_level', 'profiling_file_path', 'rpc_control_latency', 'vtcm_mb', 'htp_performance_mode',
'qnn_saver_path', 'htp_graph_finalization_optimization_mode', 'qnn_context_priority', 'soc_model',
'htp_arch', 'device_id', 'enable_htp_fp16_precision'])");
'htp_arch', 'device_id', 'enable_htp_fp16_precision', 'offload_graph_io_quantization'])");
}

qnn_options[key] = value;
75 changes: 75 additions & 0 deletions onnxruntime/test/providers/qnn/qnn_basic_test.cc
@@ -1023,6 +1023,81 @@ TEST_F(QnnHTPBackendTests, EPRejectsDynamicShapesF32) {
&ep_graph_checker);
}

// Test option for offloading quantization of graph inputs and dequantization of graph outputs to the CPU EP.
TEST_F(QnnHTPBackendTests, EPOffloadsGraphIOQuantDequant) {
// Returns a function that checks that the Q/DQ ops at the graph IO boundary are offloaded to CPU
// if the corresponding provider option is enabled.
auto graph_checker_builder = [](bool offload_graph_io_quantization) -> std::function<void(const Graph&)> {
return [offload_graph_io_quantization](const Graph& graph) {
size_t num_q = 0;
size_t num_dq = 0;
size_t num_qnn_fused_node = 0;

for (const Node& node : graph.Nodes()) {
const std::string& ep_name = node.GetExecutionProviderType();
const std::string& op_type = node.OpType();

if (offload_graph_io_quantization && op_type == "QuantizeLinear") {
const bool consumes_graph_input = graph.IsInputsIncludingInitializers(node.InputDefs()[0]);
EXPECT_EQ(ep_name, kCpuExecutionProvider);
EXPECT_TRUE(consumes_graph_input);
num_q += 1;
} else if (offload_graph_io_quantization && op_type == "DequantizeLinear") {
const bool produces_graph_output = graph.IsOutput(node.OutputDefs()[0]);
EXPECT_EQ(ep_name, kCpuExecutionProvider);
EXPECT_TRUE(produces_graph_output);
num_dq += 1;
} else {
EXPECT_EQ(ep_name, kQnnExecutionProvider);
num_qnn_fused_node += 1;
}
}

EXPECT_EQ(num_q, static_cast<size_t>(offload_graph_io_quantization));
EXPECT_EQ(num_dq, static_cast<size_t>(offload_graph_io_quantization));
EXPECT_EQ(num_qnn_fused_node, 1);
};
};

ProviderOptions provider_options;
#if defined(_WIN32)
provider_options["backend_path"] = "QnnHtp.dll";
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
const std::vector<std::string> op_types = {
"Sigmoid",
"Transpose",
"Softmax",
"Sqrt",
"Elu",
};

// Test various QDQ ops with offloading of I/O quantization enabled and disabled.
for (auto op_type : op_types) {
for (int offload_io_quant = 0; offload_io_quant <= 1; offload_io_quant++) {
provider_options["offload_graph_io_quantization"] = offload_io_quant ? "1" : "0";
auto graph_checker = graph_checker_builder(offload_io_quant);
auto expected_ep_assignment = offload_io_quant ? ExpectedEPNodeAssignment::Some : ExpectedEPNodeAssignment::All;

float min_val = (op_type == "Sqrt") ? 0.0f : -10.0f;
TestInputDef<float> input_def({1, 2, 2, 2}, false, GetFloatDataInRange(min_val, 10.0f, 8));
auto f32_model_build_fn = BuildOpTestCase<float>(op_type, {input_def}, {}, {});
auto qdq_model_build_fn = BuildQDQOpTestCase<uint8_t>(op_type, {input_def}, {}, {});
TestQDQModelAccuracy<uint8_t>(f32_model_build_fn,
qdq_model_build_fn,
provider_options,
/*opset*/ 21,
expected_ep_assignment,
/*abs_err*/ QDQTolerance(),
logging::Severity::kERROR,
/*qnn_ctx_model_path*/ "",
/*session_option_pairs*/ {},
&graph_checker);
}
}
}

#endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
#endif // !defined(ORT_MINIMAL_BUILD)

7 changes: 6 additions & 1 deletion onnxruntime/test/providers/qnn/qnn_test_utils.cc
@@ -134,7 +134,8 @@ void InferenceModel(const std::string& model_data, const char* log_id,
ExpectedEPNodeAssignment expected_ep_assignment, const NameMLValMap& feeds,
std::vector<OrtValue>& output_vals,
bool is_qnn_ep,
const std::unordered_map<std::string, std::string>& session_option_pairs) {
const std::unordered_map<std::string, std::string>& session_option_pairs,
std::function<void(const Graph&)>* graph_checker) {
SessionOptions so;
so.session_logid = log_id;
for (auto key_value : session_option_pairs) {
Expand Down Expand Up @@ -166,6 +167,10 @@ void InferenceModel(const std::string& model_data, const char* log_id,
ASSERT_GT(ep_nodes, 0) << "No nodes were assigned to " << provider_type;
}

if (graph_checker) {
(*graph_checker)(graph);
}

const auto& outputs = graph.GetOutputs();
std::vector<std::string> output_names;
