From 2f078e47b89e13a51e6fc3f1e8821fbecd90d473 Mon Sep 17 00:00:00 2001 From: Avijit <30507445+avijit-nervana@users.noreply.github.com> Date: Tue, 19 Feb 2019 17:17:26 -0800 Subject: [PATCH] More diagnostic support (#435) Added a Timer class to enable logging of time taken by various stages in NGraphEncapsulate::Compute(). Also enabled printing detailed version information --- README.md | 2 +- python/ngraph_bridge/__init__.in.py | 130 +++++++++++++++------------- src/ngraph_encapsulate_op.cc | 27 +++++- src/ngraph_timer.h | 39 +++++++++ src/version.cc | 4 + src/version.h | 3 + 6 files changed, 143 insertions(+), 62 deletions(-) create mode 100644 src/ngraph_timer.h diff --git a/README.md b/README.md index 97519cc9..d466e106 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,7 @@ The installation prerequisites are the same as described in the TensorFlow git clone https://github.com/NervanaSystems/ngraph-tf.git cd ngraph-tf - git checkout v0.10.0 + git checkout v0.11.0-rc0 2. Next run the following Python script to build TensorFlow, nGraph and the bridge. Please use Python 3.5: diff --git a/python/ngraph_bridge/__init__.in.py b/python/ngraph_bridge/__init__.in.py index 0438dcad..84a3b809 100644 --- a/python/ngraph_bridge/__init__.in.py +++ b/python/ngraph_bridge/__init__.in.py @@ -16,7 +16,7 @@ from __future__ import absolute_import from __future__ import division -from __future__ import print_function +from __future__ import print_function import importlib import os @@ -24,7 +24,7 @@ import time import getpass from platform import system - + import numpy as np import tensorflow as tf from tensorflow.python import pywrap_tensorflow as py_tf @@ -35,11 +35,12 @@ import ctypes - -__all__ = ['enable', 'disable', 'is_enabled', 'backends_len', 'list_backends', - 'set_backend', 'is_supported_backend', 'get_currently_set_backend_name' ,'start_logging_placement', 'stop_logging_placement', - 'is_logging_placement', '__version__'] - +__all__ = [ + 'enable', 'disable', 'is_enabled', 'backends_len', 'list_backends', + 'set_backend', 'is_supported_backend', 'get_currently_set_backend_name', + 'start_logging_placement', 'stop_logging_placement', + 'is_logging_placement', '__version__' +] ext = 'dylib' if system() == 'Darwin' else 'so' @@ -50,34 +51,33 @@ # converting version representations to strings if not already try: - TF_VERSION = str(TF_VERSION, 'ascii') + TF_VERSION = str(TF_VERSION, 'ascii') except TypeError: # will happen for python 2 or if already string - pass + pass try: - TF_VERSION_NEEDED = str(TF_VERSION_NEEDED, 'ascii') + TF_VERSION_NEEDED = str(TF_VERSION_NEEDED, 'ascii') except TypeError: - pass + pass try: - if TF_GIT_VERSION.startswith("b'"): # TF version can be a bytes __repr__() - TF_GIT_VERSION = eval(TF_GIT_VERSION) - TF_GIT_VERSION = str(TF_GIT_VERSION, 'ascii') + if TF_GIT_VERSION.startswith("b'"): # TF version can be a bytes __repr__() + TF_GIT_VERSION = eval(TF_GIT_VERSION) + TF_GIT_VERSION = str(TF_GIT_VERSION, 'ascii') except TypeError: - pass - + pass + try: - if TF_GIT_VERSION_BUILT_WITH.startswith("b'"): - TF_GIT_VERSION_BUILT_WITH = eval(TF_GIT_VERSION_BUILT_WITH) - TF_GIT_VERSION_BUILT_WITH = str(TF_GIT_VERSION_BUILT_WITH, 'ascii') + if TF_GIT_VERSION_BUILT_WITH.startswith("b'"): + TF_GIT_VERSION_BUILT_WITH = eval(TF_GIT_VERSION_BUILT_WITH) + TF_GIT_VERSION_BUILT_WITH = str(TF_GIT_VERSION_BUILT_WITH, 'ascii') except TypeError: - pass - -print("TensorFlow version installed: {0} ({1})".format(TF_VERSION, - TF_GIT_VERSION)) -print("nGraph bridge built with: {0} 
({1})".format(TF_VERSION_NEEDED, - TF_GIT_VERSION_BUILT_WITH)) + pass +# print("TensorFlow version installed: {0} ({1})".format(TF_VERSION, +# TF_GIT_VERSION)) +# print("nGraph bridge built with: {0} ({1})".format(TF_VERSION_NEEDED, +# TF_GIT_VERSION_BUILT_WITH)) # We need to revisit this later. We can automate that using cmake configure # command. @@ -89,16 +89,19 @@ (TF_INSTALLED_VER[1] == TF_NEEDED_VER[1]) and \ ((TF_INSTALLED_VER[2].split('-'))[0] == (TF_NEEDED_VER[2].split('-'))[0]): libpath = os.path.dirname(__file__) - ngraph_bridge_lib = ctypes.cdll.LoadLibrary(os.path.join( - libpath, 'libngraph_bridge.' + ext)) + ngraph_bridge_lib = ctypes.cdll.LoadLibrary( + os.path.join(libpath, 'libngraph_bridge.' + ext)) else: - raise ValueError("Error: Installed TensorFlow version {0}\nnGraph bridge built with: {1}".format( - TF_VERSION, TF_VERSION_NEEDED )) + raise ValueError( + "Error: Installed TensorFlow version {0}\nnGraph bridge built with: {1}" + .format(TF_VERSION, TF_VERSION_NEEDED)) def requested(): - return ops.get_default_graph()._attr_scope( - {"_ngraph_requested": attr_value_pb2.AttrValue(b=True)}) + return ops.get_default_graph()._attr_scope({ + "_ngraph_requested": + attr_value_pb2.AttrValue(b=True) + }) ngraph_bridge_lib.ngraph_is_enabled.restype = ctypes.c_bool @@ -110,7 +113,7 @@ def requested(): ngraph_bridge_lib.ngraph_get_currently_set_backend_name.restype = ctypes.c_bool ngraph_bridge_lib.ngraph_is_logging_placement.restype = ctypes.c_bool ngraph_bridge_lib.ngraph_tf_version.restype = ctypes.c_char_p - +ngraph_bridge_lib.ngraph_lib_version.restype = ctypes.c_char_p try: importlib.import_module('plaidml.settings') @@ -122,57 +125,66 @@ def requested(): def enable(): - ngraph_bridge_lib.ngraph_enable() + ngraph_bridge_lib.ngraph_enable() def disable(): - ngraph_bridge_lib.ngraph_disable() + ngraph_bridge_lib.ngraph_disable() def is_enabled(): - return ngraph_bridge_lib.ngraph_is_enabled() + return ngraph_bridge_lib.ngraph_is_enabled() def backends_len(): - return ngraph_bridge_lib.ngraph_backends_len() + return ngraph_bridge_lib.ngraph_backends_len() def list_backends(): - len_backends = backends_len() - result = (ctypes.c_char_p * len_backends)() - if not ngraph_bridge_lib.ngraph_list_backends(result, len_backends): - raise Exception("Expected " + str(len_backends) + " backends, but got some other number of backends") - list_result = list(result) - # convert bytes to string required for py3 (encode/decode bytes) - backend_list = [] - for backend in list_result: - backend_list.append(backend.decode("utf-8")) - return backend_list + len_backends = backends_len() + result = (ctypes.c_char_p * len_backends)() + if not ngraph_bridge_lib.ngraph_list_backends(result, len_backends): + raise Exception("Expected " + str(len_backends) + + " backends, but got some other number of backends") + list_result = list(result) + # convert bytes to string required for py3 (encode/decode bytes) + backend_list = [] + for backend in list_result: + backend_list.append(backend.decode("utf-8")) + return backend_list def set_backend(backend): - if not ngraph_bridge_lib.ngraph_set_backend(backend.encode("utf-8")): - raise Exception("Backend " + backend + " unavailable.") + if not ngraph_bridge_lib.ngraph_set_backend(backend.encode("utf-8")): + raise Exception("Backend " + backend + " unavailable.") + def is_supported_backend(backend): - return ngraph_bridge_lib.ngraph_is_supported_backend(backend.encode("utf-8")) + return ngraph_bridge_lib.ngraph_is_supported_backend( + backend.encode("utf-8")) + 
 def get_currently_set_backend_name():
-  result = (ctypes.c_char_p * 1)()
-  if not ngraph_bridge_lib.ngraph_get_currently_set_backend_name(result):
-    raise Exception("Cannot get currently set backend")
-  list_result = list(result)
-  return list_result[0].decode("utf-8")
+    result = (ctypes.c_char_p * 1)()
+    if not ngraph_bridge_lib.ngraph_get_currently_set_backend_name(result):
+        raise Exception("Cannot get currently set backend")
+    list_result = list(result)
+    return list_result[0].decode("utf-8")
+
 
 def start_logging_placement():
-  ngraph_bridge_lib.ngraph_start_logging_placement()
+    ngraph_bridge_lib.ngraph_start_logging_placement()
 
 
 def stop_logging_placement():
-  ngraph_bridge_lib.ngraph_stop_logging_placement()
+    ngraph_bridge_lib.ngraph_stop_logging_placement()
 
 
 def is_logging_placement():
-  return ngraph_bridge_lib.ngraph_is_logging_placement()
-
-__version__ = ngraph_bridge_lib.ngraph_tf_version()
+    return ngraph_bridge_lib.ngraph_is_logging_placement()
+
+
+__version__ = \
+    "nGraph bridge version: " + str(ngraph_bridge_lib.ngraph_tf_version()) + "\n" + \
+    "nGraph version used: " + str(ngraph_bridge_lib.ngraph_lib_version()) + "\n" + \
+    "TensorFlow version used: " + TF_GIT_VERSION_BUILT_WITH
+
diff --git a/src/ngraph_encapsulate_op.cc b/src/ngraph_encapsulate_op.cc
index c50d2609..d70f62aa 100644
--- a/src/ngraph_encapsulate_op.cc
+++ b/src/ngraph_encapsulate_op.cc
@@ -32,6 +32,7 @@
 #include "ngraph_freshness_tracker.h"
 #include "ngraph_log.h"
 #include "ngraph_mark_for_clustering.h"
+#include "ngraph_timer.h"
 #include "ngraph_utils.h"
 
 #include "ngraph/runtime/backend.hpp"
@@ -230,6 +231,7 @@ class NGraphEncapsulateOp : public OpKernel {
   // OpKernel::Compute
   //---------------------------------------------------------------------------
   void Compute(OpKernelContext* ctx) override {
+    Timer compute_time;
     std::lock_guard<std::mutex> lock(m_compute_lock);
     NGRAPH_VLOG(4) << "NGraphEncapsulateOp::Compute starting for cluster "
                    << m_ngraph_cluster;
@@ -238,6 +240,7 @@ class NGraphEncapsulateOp : public OpKernel {
     ng::runtime::Backend* op_backend =
         BackendManager::GetBackend(m_op_backend_name);
 
+    Timer function_lookup_or_create;
     // Get the inputs
     std::vector<TensorShape> input_shapes;
     std::stringstream signature_ss;
@@ -370,9 +373,13 @@ class NGraphEncapsulateOp : public OpKernel {
       ng_function = it->second;
     }
 
+    int time_func_create_or_lookup = function_lookup_or_create.ElapsedInMS();
+
     NGRAPH_VLOG(4) << "NGraphEncapsulateOp::Compute got graph for cluster "
                    << m_ngraph_cluster;
 
+    Timer create_or_lookup_tensors;
+
     if (m_freshness_tracker == nullptr) {
       auto creator = [](NGraphFreshnessTracker** tracker) {
         *tracker = new NGraphFreshnessTracker();
@@ -542,10 +549,11 @@ class NGraphEncapsulateOp : public OpKernel {
         << "NGraphEncapsulateOp::Compute allocated result tensors for cluster "
         << m_ngraph_cluster;
 
+    int time_create_or_lookup_tensors = create_or_lookup_tensors.ElapsedInMS();
+
     // Execute the nGraph function.
+    Timer execute_function;
     {
-      // mutex_lock l(s_ng_backend_mutex);
-      // std::lock_guard lock(backend_mutex_ptr);
       BackendManager::LockBackend(m_op_backend_name);
       NGRAPH_VLOG(4)
           << "NGraphEncapsulateOp::Compute call starting for cluster "
@@ -574,6 +582,7 @@ class NGraphEncapsulateOp : public OpKernel {
       }
       BackendManager::UnlockBackend(m_op_backend_name);
     }
+    int time_execute_function = execute_function.ElapsedInMS();
 
     long vm, rss;
     MemoryProfile(vm, rss);
@@ -590,6 +599,8 @@ class NGraphEncapsulateOp : public OpKernel {
         << m_ngraph_cluster;
 
     // Copy value to host if backend is not CPU
+    Timer copy_output_tensors_to_host;
+
     try {
       if (m_op_backend_name != "CPU") {
         for (size_t i = 0; i < output_caches.size(); ++i) {
@@ -618,10 +629,22 @@ class NGraphEncapsulateOp : public OpKernel {
       void* src_ptr = (void*)DMAHelper::base(&ctx->input(i));
       m_freshness_tracker->MarkFresh(src_ptr, ng_function);
     }
+    int time_copy_output_tensors_to_host =
+        copy_output_tensors_to_host.ElapsedInMS();
 
     NGRAPH_VLOG(4)
         << "NGraphEncapsulateOp::Compute done marking fresh for cluster "
         << m_ngraph_cluster;
+
+    NGRAPH_VLOG(1) << "NGRAPH_TF_TIMING_PROFILE: OP_ID: " << my_instance_id
+                   << " Step_ID: " << ctx->step_id()
+                   << " Cluster: " << ctx->op_kernel().name()
+                   << " Time-Compute: " << compute_time.ElapsedInMS()
+                   << " Function-Create-or-Lookup: " << time_func_create_or_lookup
+                   << " Create-and-copy-tensors: " << time_create_or_lookup_tensors
+                   << " Execute: " << time_execute_function
+                   << " Copy-outputs-to-host: " << time_copy_output_tensors_to_host;
   }  // end compute
 
  private:
diff --git a/src/ngraph_timer.h b/src/ngraph_timer.h
new file mode 100644
index 00000000..399033bd
--- /dev/null
+++ b/src/ngraph_timer.h
@@ -0,0 +1,39 @@
+/*******************************************************************************
+ * Copyright 2019 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *******************************************************************************/
+#ifndef NGRAPH_TF_BRIDGE_TIMER_H_
+#define NGRAPH_TF_BRIDGE_TIMER_H_
+
+#include <chrono>
+
+namespace tensorflow {
+namespace ngraph_bridge {
+
+class Timer {
+ public:
+  Timer() : m_start(std::chrono::high_resolution_clock::now()) {}
+  int ElapsedInMS() {
+    return std::chrono::duration_cast<std::chrono::milliseconds>(
+               std::chrono::high_resolution_clock::now() - m_start)
+        .count();
+  }
+
+ private:
+  const std::chrono::time_point<std::chrono::high_resolution_clock> m_start;
+};
+}  // namespace ngraph_bridge
+}  // namespace tensorflow
+
+#endif  // NGRAPH_TF_BRIDGE_TIMER_H_
diff --git a/src/version.cc b/src/version.cc
index 09a6696c..905f2868 100644
--- a/src/version.cc
+++ b/src/version.cc
@@ -17,6 +17,7 @@
 #include "version.h"
 #include
 #include
+#include "ngraph/ngraph.hpp"
 
 // nGraph-TensorFlow bridge uses semantic versioning: see http://semver.org/
 
@@ -38,8 +39,11 @@ NG_TF_MINOR_VERSION) "."
VERSION_STR(NG_TF_PATCH_VERSION) \ NG_TF_VERSION_SUFFIX) +extern "C" const char* get_ngraph_version_string(); + namespace tensorflow { namespace ngraph_bridge { const char* ngraph_tf_version() { return (NG_TF_VERSION_STRING); } +const char* ngraph_lib_version() { return get_ngraph_version_string(); } } // namespace ngraph_bridge } // namespace tensorflow diff --git a/src/version.h b/src/version.h index d90e508f..bcadc4c6 100644 --- a/src/version.h +++ b/src/version.h @@ -21,6 +21,9 @@ namespace ngraph_bridge { extern "C" { // Returns the ngraph-tensorflow library version const char* ngraph_tf_version(); + +// Returns the nGraph version this bridge was compiled with +const char* ngraph_lib_version(); } } // namespace ngraph_bridge } // namespace tensorflow
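
For illustration only, not part of the patch: a minimal sketch of how the Timer class added in src/ngraph_timer.h is meant to be used, mirroring the pattern introduced in NGraphEncapsulateOp::Compute() above — construct a Timer when a stage begins and read ElapsedInMS() when it ends. The do_work() helper and the main() driver are hypothetical stand-ins, not code from the bridge.

```cpp
#include <chrono>
#include <iostream>
#include <thread>

#include "ngraph_timer.h"  // tensorflow::ngraph_bridge::Timer from this patch

// Hypothetical stand-in for one stage of Compute(), e.g. function lookup or
// tensor allocation.
static void do_work() {
  std::this_thread::sleep_for(std::chrono::milliseconds(5));
}

int main() {
  tensorflow::ngraph_bridge::Timer stage_timer;  // starts timing on construction
  do_work();
  // ElapsedInMS() returns whole milliseconds elapsed since construction.
  std::cout << "Stage took " << stage_timer.ElapsedInMS() << " ms" << std::endl;
  return 0;
}
```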