This repository has been archived by the owner on Jan 3, 2023. It is now read-only.

Commit

More diagnostic support (#435)
Added a Timer class to enable logging of the time taken by the various stages of NGraphEncapsulateOp::Compute(). Also enabled printing of detailed version information.
avijit-nervana authored Feb 20, 2019
1 parent 260b208 commit 2f078e4
Showing 6 changed files with 143 additions and 62 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -84,7 +84,7 @@ The installation prerequisites are the same as described in the TensorFlow

git clone https://github.com/NervanaSystems/ngraph-tf.git
cd ngraph-tf
git checkout v0.10.0
git checkout v0.11.0-rc0


2. Next run the following Python script to build TensorFlow, nGraph and the bridge. Please use Python 3.5:
130 changes: 71 additions & 59 deletions python/ngraph_bridge/__init__.in.py
@@ -16,15 +16,15 @@

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import print_function

import importlib
import os
import sys
import time
import getpass
from platform import system

import numpy as np
import tensorflow as tf
from tensorflow.python import pywrap_tensorflow as py_tf
@@ -35,11 +35,12 @@

import ctypes


__all__ = ['enable', 'disable', 'is_enabled', 'backends_len', 'list_backends',
'set_backend', 'is_supported_backend', 'get_currently_set_backend_name' ,'start_logging_placement', 'stop_logging_placement',
'is_logging_placement', '__version__']

__all__ = [
'enable', 'disable', 'is_enabled', 'backends_len', 'list_backends',
'set_backend', 'is_supported_backend', 'get_currently_set_backend_name',
'start_logging_placement', 'stop_logging_placement',
'is_logging_placement', '__version__'
]

ext = 'dylib' if system() == 'Darwin' else 'so'

@@ -50,34 +51,33 @@

# converting version representations to strings if not already
try:
TF_VERSION = str(TF_VERSION, 'ascii')
TF_VERSION = str(TF_VERSION, 'ascii')
except TypeError: # will happen for python 2 or if already string
pass
pass

try:
TF_VERSION_NEEDED = str(TF_VERSION_NEEDED, 'ascii')
TF_VERSION_NEEDED = str(TF_VERSION_NEEDED, 'ascii')
except TypeError:
pass
pass

try:
if TF_GIT_VERSION.startswith("b'"): # TF version can be a bytes __repr__()
TF_GIT_VERSION = eval(TF_GIT_VERSION)
TF_GIT_VERSION = str(TF_GIT_VERSION, 'ascii')
if TF_GIT_VERSION.startswith("b'"): # TF version can be a bytes __repr__()
TF_GIT_VERSION = eval(TF_GIT_VERSION)
TF_GIT_VERSION = str(TF_GIT_VERSION, 'ascii')
except TypeError:
pass
pass

try:
if TF_GIT_VERSION_BUILT_WITH.startswith("b'"):
TF_GIT_VERSION_BUILT_WITH = eval(TF_GIT_VERSION_BUILT_WITH)
TF_GIT_VERSION_BUILT_WITH = str(TF_GIT_VERSION_BUILT_WITH, 'ascii')
if TF_GIT_VERSION_BUILT_WITH.startswith("b'"):
TF_GIT_VERSION_BUILT_WITH = eval(TF_GIT_VERSION_BUILT_WITH)
TF_GIT_VERSION_BUILT_WITH = str(TF_GIT_VERSION_BUILT_WITH, 'ascii')
except TypeError:
pass

print("TensorFlow version installed: {0} ({1})".format(TF_VERSION,
TF_GIT_VERSION))
print("nGraph bridge built with: {0} ({1})".format(TF_VERSION_NEEDED,
TF_GIT_VERSION_BUILT_WITH))
pass

# print("TensorFlow version installed: {0} ({1})".format(TF_VERSION,
# TF_GIT_VERSION))
# print("nGraph bridge built with: {0} ({1})".format(TF_VERSION_NEEDED,
# TF_GIT_VERSION_BUILT_WITH))

# We need to revisit this later. We can automate that using cmake configure
# command.
@@ -89,16 +89,19 @@
(TF_INSTALLED_VER[1] == TF_NEEDED_VER[1]) and \
((TF_INSTALLED_VER[2].split('-'))[0] == (TF_NEEDED_VER[2].split('-'))[0]):
libpath = os.path.dirname(__file__)
ngraph_bridge_lib = ctypes.cdll.LoadLibrary(os.path.join(
libpath, 'libngraph_bridge.' + ext))
ngraph_bridge_lib = ctypes.cdll.LoadLibrary(
os.path.join(libpath, 'libngraph_bridge.' + ext))
else:
raise ValueError("Error: Installed TensorFlow version {0}\nnGraph bridge built with: {1}".format(
TF_VERSION, TF_VERSION_NEEDED ))
raise ValueError(
"Error: Installed TensorFlow version {0}\nnGraph bridge built with: {1}"
.format(TF_VERSION, TF_VERSION_NEEDED))


def requested():
return ops.get_default_graph()._attr_scope(
{"_ngraph_requested": attr_value_pb2.AttrValue(b=True)})
return ops.get_default_graph()._attr_scope({
"_ngraph_requested":
attr_value_pb2.AttrValue(b=True)
})


ngraph_bridge_lib.ngraph_is_enabled.restype = ctypes.c_bool
@@ -110,7 +113,7 @@ def requested():
ngraph_bridge_lib.ngraph_get_currently_set_backend_name.restype = ctypes.c_bool
ngraph_bridge_lib.ngraph_is_logging_placement.restype = ctypes.c_bool
ngraph_bridge_lib.ngraph_tf_version.restype = ctypes.c_char_p

ngraph_bridge_lib.ngraph_lib_version.restype = ctypes.c_char_p

try:
importlib.import_module('plaidml.settings')
@@ -122,57 +125,66 @@ def requested():


def enable():
ngraph_bridge_lib.ngraph_enable()
ngraph_bridge_lib.ngraph_enable()


def disable():
ngraph_bridge_lib.ngraph_disable()
ngraph_bridge_lib.ngraph_disable()


def is_enabled():
return ngraph_bridge_lib.ngraph_is_enabled()
return ngraph_bridge_lib.ngraph_is_enabled()


def backends_len():
return ngraph_bridge_lib.ngraph_backends_len()
return ngraph_bridge_lib.ngraph_backends_len()


def list_backends():
len_backends = backends_len()
result = (ctypes.c_char_p * len_backends)()
if not ngraph_bridge_lib.ngraph_list_backends(result, len_backends):
raise Exception("Expected " + str(len_backends) + " backends, but got some other number of backends")
list_result = list(result)
# convert bytes to string required for py3 (encode/decode bytes)
backend_list = []
for backend in list_result:
backend_list.append(backend.decode("utf-8"))
return backend_list
len_backends = backends_len()
result = (ctypes.c_char_p * len_backends)()
if not ngraph_bridge_lib.ngraph_list_backends(result, len_backends):
raise Exception("Expected " + str(len_backends) +
" backends, but got some other number of backends")
list_result = list(result)
# convert bytes to string required for py3 (encode/decode bytes)
backend_list = []
for backend in list_result:
backend_list.append(backend.decode("utf-8"))
return backend_list


def set_backend(backend):
if not ngraph_bridge_lib.ngraph_set_backend(backend.encode("utf-8")):
raise Exception("Backend " + backend + " unavailable.")
if not ngraph_bridge_lib.ngraph_set_backend(backend.encode("utf-8")):
raise Exception("Backend " + backend + " unavailable.")


def is_supported_backend(backend):
return ngraph_bridge_lib.ngraph_is_supported_backend(backend.encode("utf-8"))
return ngraph_bridge_lib.ngraph_is_supported_backend(
backend.encode("utf-8"))


def get_currently_set_backend_name():
result = (ctypes.c_char_p * 1)()
if not ngraph_bridge_lib.ngraph_get_currently_set_backend_name(result):
raise Exception("Cannot get currently set backend")
list_result = list(result)
return list_result[0].decode("utf-8")
result = (ctypes.c_char_p * 1)()
if not ngraph_bridge_lib.ngraph_get_currently_set_backend_name(result):
raise Exception("Cannot get currently set backend")
list_result = list(result)
return list_result[0].decode("utf-8")


def start_logging_placement():
ngraph_bridge_lib.ngraph_start_logging_placement()
ngraph_bridge_lib.ngraph_start_logging_placement()


def stop_logging_placement():
ngraph_bridge_lib.ngraph_stop_logging_placement()
ngraph_bridge_lib.ngraph_stop_logging_placement()


def is_logging_placement():
return ngraph_bridge_lib.ngraph_is_logging_placement()

__version__ = ngraph_bridge_lib.ngraph_tf_version()
return ngraph_bridge_lib.ngraph_is_logging_placement()

__version__ = \
"nGraph bridge version: " + str(ngraph_bridge_lib.ngraph_tf_version()) + "\n" + \
"nGraph version used: " + str(ngraph_bridge_lib.ngraph_lib_version()) + "\n" + \
"TensorFlow verion used: " + TF_GIT_VERSION_BUILT_WITH

27 changes: 25 additions & 2 deletions src/ngraph_encapsulate_op.cc
@@ -32,6 +32,7 @@
#include "ngraph_freshness_tracker.h"
#include "ngraph_log.h"
#include "ngraph_mark_for_clustering.h"
#include "ngraph_timer.h"
#include "ngraph_utils.h"

#include "ngraph/runtime/backend.hpp"
@@ -230,6 +231,7 @@ class NGraphEncapsulateOp : public OpKernel {
// OpKernel::Compute
//---------------------------------------------------------------------------
void Compute(OpKernelContext* ctx) override {
Timer compute_time;
std::lock_guard<std::mutex> lock(m_compute_lock);
NGRAPH_VLOG(4) << "NGraphEncapsulateOp::Compute starting for cluster "
<< m_ngraph_cluster;
@@ -238,6 +240,7 @@ class NGraphEncapsulateOp : public OpKernel {
ng::runtime::Backend* op_backend =
BackendManager::GetBackend(m_op_backend_name);

Timer function_lookup_or_create;
// Get the inputs
std::vector<TensorShape> input_shapes;
std::stringstream signature_ss;
@@ -370,9 +373,13 @@
ng_function = it->second;
}

int time_func_create_or_lookup = function_lookup_or_create.ElapsedInMS();

NGRAPH_VLOG(4) << "NGraphEncapsulateOp::Compute got graph for cluster "
<< m_ngraph_cluster;

Timer create_or_lookup_tensors;

if (m_freshness_tracker == nullptr) {
auto creator = [](NGraphFreshnessTracker** tracker) {
*tracker = new NGraphFreshnessTracker();
@@ -542,10 +549,11 @@
<< "NGraphEncapsulateOp::Compute allocated result tensors for cluster "
<< m_ngraph_cluster;

int time_create_or_lookup_tensors = create_or_lookup_tensors.ElapsedInMS();

// Execute the nGraph function.
Timer execute_function;
{
// mutex_lock l(s_ng_backend_mutex);
// std::lock_guard<std::mutex> lock(backend_mutex_ptr);
BackendManager::LockBackend(m_op_backend_name);
NGRAPH_VLOG(4)
<< "NGraphEncapsulateOp::Compute call starting for cluster "
@@ -574,6 +582,7 @@
}
BackendManager::UnlockBackend(m_op_backend_name);
}
int time_execute_function = execute_function.ElapsedInMS();

long vm, rss;
MemoryProfile(vm, rss);
@@ -590,6 +599,8 @@
<< m_ngraph_cluster;

// Copy value to host if backend is not CPU
Timer copy_output_tensors_to_host;

try {
if (m_op_backend_name != "CPU") {
for (size_t i = 0; i < output_caches.size(); ++i) {
@@ -618,10 +629,22 @@
void* src_ptr = (void*)DMAHelper::base(&ctx->input(i));
m_freshness_tracker->MarkFresh(src_ptr, ng_function);
}
int time_copy_output_tensors_to_host =
copy_output_tensors_to_host.ElapsedInMS();

NGRAPH_VLOG(4)
<< "NGraphEncapsulateOp::Compute done marking fresh for cluster "
<< m_ngraph_cluster;
NGRAPH_VLOG(1) << "NGRAPH_TF_TIMING_PROFILE: OP_ID: " << my_instance_id
<< " Step_ID: " << ctx->step_id()
<< " Cluster: " << ctx->op_kernel().name()
<< " Time-Compute: " << compute_time.ElapsedInMS()
<< " Function-Create-or-Lookup: "
<< time_func_create_or_lookup << " Create-and-copy-tensors: "
<< time_create_or_lookup_tensors
<< " Execute: " << time_execute_function
<< " Copy-outputs-to-host: "
<< time_copy_output_tensors_to_host;
} // end compute

private:
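The instrumentation added above follows a single pattern: construct a Timer immediately before a stage, capture ElapsedInMS() immediately after it, and report all of the measurements in one NGRAPH_VLOG(1) line at the end of Compute(). A condensed sketch of that pattern, with the real kernel body reduced to placeholder comments (the names match the diff, but this is an illustration rather than the full Compute() implementation):

// Sketch only: the actual stages live in NGraphEncapsulateOp::Compute()
// in src/ngraph_encapsulate_op.cc; Timer comes from ngraph_timer.h and
// NGRAPH_VLOG from ngraph_log.h.
void ComputeSketch() {
  Timer compute_time;  // times the whole kernel invocation

  Timer function_lookup_or_create;
  // ... look up a cached ngraph::Function or compile a new one ...
  int time_func_create_or_lookup = function_lookup_or_create.ElapsedInMS();

  Timer create_or_lookup_tensors;
  // ... allocate or reuse the input/output tensors ...
  int time_create_or_lookup_tensors = create_or_lookup_tensors.ElapsedInMS();

  Timer execute_function;
  // ... call the backend to execute the compiled function ...
  int time_execute_function = execute_function.ElapsedInMS();

  Timer copy_output_tensors_to_host;
  // ... copy results back to host for non-CPU backends ...
  int time_copy_output_tensors_to_host =
      copy_output_tensors_to_host.ElapsedInMS();

  NGRAPH_VLOG(1) << "NGRAPH_TF_TIMING_PROFILE:"
                 << " Time-Compute: " << compute_time.ElapsedInMS()
                 << " Function-Create-or-Lookup: " << time_func_create_or_lookup
                 << " Create-and-copy-tensors: " << time_create_or_lookup_tensors
                 << " Execute: " << time_execute_function
                 << " Copy-outputs-to-host: " << time_copy_output_tensors_to_host;
}

Because every stage reports through the same NGRAPH_TF_TIMING_PROFILE tag, a single search for that string in the verbose log is enough to compare per-step stage costs.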
39 changes: 39 additions & 0 deletions src/ngraph_timer.h
@@ -0,0 +1,39 @@
/*******************************************************************************
* Copyright 2019 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef NGRAPH_TF_BRIDGE_TIMER_H_
#define NGRAPH_TF_BRIDGE_TIMER_H_

#include <chrono>

namespace tensorflow {
namespace ngraph_bridge {

class Timer {
public:
Timer() : m_start(std::chrono::high_resolution_clock::now()) {}
int ElapsedInMS() {
return std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::high_resolution_clock::now() - m_start)
.count();
}

private:
const std::chrono::time_point<std::chrono::high_resolution_clock> m_start;
};
} // namespace ngraph_bridge
} // namespace tensorflow

#endif // NGRAPH_TF_BRIDGE_TIMER_H_
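The new Timer is header-only and depends on nothing beyond <chrono>, so it can time any block of code. A minimal standalone usage sketch, assuming ngraph_timer.h is on the include path (the sleep stands in for real work):

#include <chrono>
#include <iostream>
#include <thread>

#include "ngraph_timer.h"

int main() {
  // The clock starts in the constructor.
  tensorflow::ngraph_bridge::Timer stage_timer;

  // Stand-in for real work, e.g. compiling or executing an nGraph function.
  std::this_thread::sleep_for(std::chrono::milliseconds(25));

  // ElapsedInMS() returns wall-clock milliseconds since construction.
  std::cout << "stage took " << stage_timer.ElapsedInMS() << " ms\n";
  return 0;
}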
4 changes: 4 additions & 0 deletions src/version.cc
@@ -17,6 +17,7 @@
#include "version.h"
#include <iostream>
#include <string>
#include "ngraph/ngraph.hpp"

// nGraph-TensorFlow bridge uses semantic versioning: see http://semver.org/

@@ -38,8 +39,11 @@
NG_TF_MINOR_VERSION) "." VERSION_STR(NG_TF_PATCH_VERSION) \
NG_TF_VERSION_SUFFIX)

extern "C" const char* get_ngraph_version_string();

namespace tensorflow {
namespace ngraph_bridge {
const char* ngraph_tf_version() { return (NG_TF_VERSION_STRING); }
const char* ngraph_lib_version() { return get_ngraph_version_string(); }
} // namespace ngraph_bridge
} // namespace tensorflow
3 changes: 3 additions & 0 deletions src/version.h
@@ -21,6 +21,9 @@ namespace ngraph_bridge {
extern "C" {
// Returns the ngraph-tensorflow library version
const char* ngraph_tf_version();

// Returns the nGraph version this bridge was compiled with
const char* ngraph_lib_version();
}
} // namespace ngraph_bridge
} // namespace tensorflow
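Both functions are exported with C linkage but declared inside the tensorflow::ngraph_bridge namespace, so they can be called directly from C++ or looked up through a foreign-function interface, which is how the Python wrapper above consumes them via ctypes. A small sketch of calling them from C++, assuming version.h is on the include path and the bridge library is linked:

#include <iostream>

#include "version.h"

int main() {
  // Semantic version of the nGraph-TensorFlow bridge itself.
  std::cout << "nGraph bridge version: "
            << tensorflow::ngraph_bridge::ngraph_tf_version() << std::endl;

  // Version of the nGraph library the bridge was compiled against.
  std::cout << "nGraph version used: "
            << tensorflow::ngraph_bridge::ngraph_lib_version() << std::endl;
  return 0;
}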

