feat: add more logging in chatCompletion
CameronNg committed Jun 19, 2024
1 parent 9a2f4a9 commit a3ed58c
Showing 2 changed files with 5 additions and 5 deletions.
4 changes: 2 additions & 2 deletions cpp/Makefile
@@ -48,9 +48,9 @@ else
 	cp /usr/local/tensorrt/targets/x86_64-linux-gnu/lib/libnvinfer.so.9 cortex.tensorrt-llm && \
 	cp /usr/lib/x86_64-linux-gnu/libcudnn.so.8 cortex.tensorrt-llm && \
 	cp /usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8 cortex.tensorrt-llm && \
-	cp /home/runner/actions-runner/_work/cortex.tensorrt-llm/cortex.tensorrt-llm/cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so cortex.tensorrt-llm && \
+	cp /home/runner/actions-runner/_work/cortex.tensorrt-llm/cortex.tensorrt-llm/cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so.9 cortex.tensorrt-llm && \
 	cp /home/runner/actions-runner/_work/cortex.tensorrt-llm/cortex.tensorrt-llm/cpp/build/tensorrt_llm/libtensorrt_llm.so cortex.tensorrt-llm && \
-	cp /opt/hpcx/ompi/lib/libmpi.so cortex.tensorrt-llm && \
+	cp /opt/hpcx/ompi/lib/libmpi.so.40 cortex.tensorrt-llm && \
 	cp /usr/lib/x86_64-linux-gnu/libnccl.so cortex.tensorrt-llm
 endif
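Note on the versioned filenames in the hunk above: at run time the dynamic loader resolves a library by the SONAME recorded in the dependent binary (e.g. libmpi.so.40), not by the unversioned development symlink, so the versioned file is the one that must be bundled. A minimal C++ sketch to verify the bundled library actually loads, assuming libmpi.so.40 is on the loader's search path (compile with -ldl):

    #include <dlfcn.h>   // dlopen, dlerror, dlclose
    #include <cstdio>

    int main() {
        // Ask the loader for the versioned SONAME the diff now copies.
        void* handle = dlopen("libmpi.so.40", RTLD_NOW);
        if (handle == nullptr) {
            std::fprintf(stderr, "load failed: %s\n", dlerror());
            return 1;
        }
        std::puts("libmpi.so.40 resolved and loaded");
        dlclose(handle);
        return 0;
    }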

6 changes: 3 additions & 3 deletions
@@ -106,6 +106,7 @@ void InferenceThread(
     // Define the callback to stream each generated token
     generation_output.onTokenGenerated = [&infer_state, input_len, outputLen, self, &generation_output](
             GenerationOutput::TensorPtr const& output_ids, SizeType step, bool finished) {
+        LOG_INFO << "Inference thread started";
         // Assuming the shape of output_ids tensor is (1, 1, 160), where 160 is the number of tokens
         int output_length = output_ids->getShape().d[2]; // Get the length of output IDs based on the tensor shape
         // Copy output IDs from GPU to host for printing
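For context, onTokenGenerated fires once per decoding step, so a callback like the one above runs for every generated token. A hedged sketch of how such a callback can pull the newest token ID out of a (1, 1, seq_len) tensor; copy_to_host and decode_token are hypothetical stand-ins for the engine's helpers, which are outside this hunk:

    // Sketch only: copy_to_host and decode_token are illustrative names,
    // not the engine's actual helpers.
    auto on_token = [&](GenerationOutput::TensorPtr const& output_ids,
                        SizeType step, bool finished) {
        // Shape is (1, 1, seq_len); d[2] is how many tokens exist so far.
        int output_length = output_ids->getShape().d[2];
        std::vector<int32_t> host_ids(output_length);
        copy_to_host(output_ids, host_ids.data(), output_length); // GPU -> host
        std::string piece = decode_token(host_ids.back());        // newest token
        if (finished) {
            // push an end-of-stream marker to the consumer queue
        }
    };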
@@ -240,6 +241,7 @@ void TensorrtllmEngine::HandleChatCompletion(std::shared_ptr<Json::Value> json_b
     inference_thread.detach(); // Detach the thread to allow it to run independently
 
     q_->runTaskInQueue([cb = std::move(callback), infer_state]() {
+        LOG_INFO << "Preparing to run inference task queue...";
         while (true) { // Continuously check if the queue is not empty
             std::unique_lock<std::mutex> lock(infer_state->queue_mutex); // Lock the queue for exclusive access
             if (!infer_state->texts_to_stream.empty()) {
@@ -280,9 +282,7 @@
                 status["is_stream"] = true;
                 status["status_code"] = k200OK;
                 cb(std::move(status), std::move(resp_data));
-                continue;;
-            }
-            else {
+            } else {
                 // If the queue is empty, release the lock and wait before trying again
                 lock.unlock();
             }
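The hunk above is the consumer half of a producer/consumer stream: the detached inference thread pushes decoded text into texts_to_stream, and this queued task pops entries and forwards them through the callback. A self-contained sketch of the same pattern, substituting a condition_variable for the unlock-and-retry spin; the [DONE] sentinel is an assumption for illustration, not taken from this diff:

    #include <condition_variable>
    #include <mutex>
    #include <queue>
    #include <string>

    struct InferState {
        std::mutex queue_mutex;
        std::condition_variable queue_cv;       // producer calls notify_one() on each push
        std::queue<std::string> texts_to_stream;
    };

    void ConsumeStream(InferState& state) {
        while (true) {
            std::unique_lock<std::mutex> lock(state.queue_mutex);
            // Sleep until the producer pushes, instead of spinning on empty().
            state.queue_cv.wait(lock, [&] { return !state.texts_to_stream.empty(); });
            std::string text = std::move(state.texts_to_stream.front());
            state.texts_to_stream.pop();
            lock.unlock();
            if (text == "[DONE]") break;        // assumed end-of-stream sentinel
            // ... wrap `text` as a streamed chunk and invoke the callback ...
        }
    }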
