Update TensorRT-LLM (#1554)
kaiyux authored May 7, 2024
Parent: 06c0e9b · Commit: 89ba1b1
Showing 270 changed files with 10,605 additions and 3,111 deletions.
2 changes: 2 additions & 0 deletions .gitattributes
@@ -1,2 +1,4 @@
 *.a filter=lfs diff=lfs merge=lfs -text
 *.lib filter=lfs diff=lfs merge=lfs -text
+*.so filter=lfs diff=lfs merge=lfs -text
+*.dll filter=lfs diff=lfs merge=lfs -text
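With `*.so` and `*.dll` now routed through Git LFS, the new rules can be spot-checked with `git check-attr`. A minimal sketch, assuming a hypothetical shared-library path in the working tree:

```
# Show every attribute that applies to a (hypothetical) shared library path.
git check-attr --all cpp/build/libexample.so
# If the new rules match, the output reports filter: lfs, diff: lfs, merge: lfs, and text: unset.
```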
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/bug_report.yml
@@ -17,7 +17,7 @@ body:
 - Libraries
 - TensorRT-LLM branch or tag (e.g., main, v0.7.1)
 - TensorRT-LLM commit (if known)
-- Versions of TensorRT, AMMO, CUDA, cuBLAS, etc. used
+- Versions of TensorRT, Modelopt, CUDA, cuBLAS, etc. used
 - Container used (if running TensorRT-LLM in a container)
 - NVIDIA driver version
 - OS (Ubuntu 22.04, CentOS 7, Windows 10)
2 changes: 1 addition & 1 deletion .gitignore
@@ -6,7 +6,6 @@ __pycache__/
 *.nsys-rep
 .VSCodeCounter
 build*/
-*.so
 *.egg-info/
 .coverage
 *.csv
@@ -34,6 +33,7 @@ tensorrt_llm/bindings.pyi
 tensorrt_llm/bindings/*.pyi
 *docs/cpp_docs*
 *docs/source/_cpp_gen*
+*.swp
 
 # Testing
 .coverage.*
6 changes: 3 additions & 3 deletions README.md
@@ -6,9 +6,9 @@ TensorRT-LLM
 
 [![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/TensorRT-LLM/)
 [![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/)
-[![cuda](https://img.shields.io/badge/cuda-12.3-green)](https://developer.nvidia.com/cuda-downloads)
-[![trt](https://img.shields.io/badge/TRT-9.3-green)](https://developer.nvidia.com/tensorrt)
-[![version](https://img.shields.io/badge/release-0.9.0-green)](./setup.py)
+[![cuda](https://img.shields.io/badge/cuda-12.4.0-green)](https://developer.nvidia.com/cuda-downloads)
+[![trt](https://img.shields.io/badge/TRT-10.0.1-green)](https://developer.nvidia.com/tensorrt)
+[![version](https://img.shields.io/badge/release-0.10.0.dev-green)](./setup.py)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)
 
 [Architecture](./docs/source/architecture/overview.md)   |   [Results](./docs/source/performance/perf-overview.md)   |   [Examples](./examples/)   |   [Documentation](./docs/source/)
7 changes: 3 additions & 4 deletions benchmarks/cpp/README.md
@@ -170,8 +170,6 @@ Given a `static_emulated_batch_size` of `n` the server will wait for `n` request
 ```
 python prepare_dataset.py \
 --output tokens-fixed-lengths.json \
---request-rate -1 \
---time-delay-dist constant \
 --tokenizer <path/to/tokenizer> \
 token-norm-dist \
 --num-requests 128 \
@@ -184,6 +182,7 @@ Take GPT-350M as an example for single GPU with static batching
 ./benchmarks/gptManagerBenchmark \
 --engine_dir ../../examples/gpt/trt_engine/gpt2/fp16/1-gpu/ \
 --type IFB \
+--request-rate -1 \
 --static_emulated_batch_size 32 \
 --static_emulated_timeout 100 \
 --dataset ../../benchmarks/cpp/tokens-fixed-lengths.json
@@ -212,6 +211,7 @@ PP=1
 MAX_LEN=1024
 MAX_BATCH=32
 MAX_LORA_RANK=32
+NUM_LORA_MODS=7
 SOURCE_LORA=chinese-llama-2-lora-13b
 CPP_LORA=chinese-llama-2-lora-13b-cpp
@@ -241,10 +241,9 @@ NUM_LORAS=(8 16 24 32 64 128 256)
 NUM_REQUESTS=1024
 # Convert LoRA to cpp format
-python examples/gpt/nemo_lora_convert.py \
+python examples/hf_lora_convert.py \
 -i $SOURCE_LORA \
 --storage-type $DTYPE \
---write-cpp-runtime-tensors \
 -o $CPP_LORA
 # Prepare datasets
52 changes: 48 additions & 4 deletions benchmarks/cpp/gptManagerBenchmark.cpp
@@ -151,6 +151,7 @@ struct BenchmarkParams
 bool enableExpDelays{false};
 std::optional<float> requestRate{std::nullopt};
 int randomSeed = 430;
+std::optional<int> maxAttentionWindow{std::nullopt};
 
 // lora / peft params
 std::optional<std::string> loraDir{std::nullopt};
@@ -746,8 +747,8 @@ class ExecutorServer
 
 texec::SchedulerConfig schedulerConfig(batch_scheduler::batchManagerToExecSchedPolicy(schedulerPolicy));
 texec::KvCacheConfig kvCacheConfig(benchmarkParams.enableBlockReuse, benchmarkParams.maxTokensInPagedKvCache,
-std::nullopt, std::nullopt, benchmarkParams.freeGpuMemoryFraction, benchmarkParams.kvHostCacheSize,
-benchmarkParams.kvOnboardBlocks);
+benchmarkParams.maxAttentionWindow, std::nullopt, benchmarkParams.freeGpuMemoryFraction,
+benchmarkParams.kvHostCacheSize, benchmarkParams.kvOnboardBlocks);
 texec::PeftCacheConfig peftCacheConfig(0, benchmarkParams.loraDeviceNumModLayers, 8, 64, 4, 4, 4, 24, 8,
 std::nullopt, benchmarkParams.loraHostCacheSize);
 texec::ExecutorConfig executorConfig(
@@ -909,6 +910,16 @@ class GptServer
 mWorkItemsQueue.clear();
 }
 
+std::string getLayerProfileInfo()
+{
+return mBatchManager->getLayerProfileInfo();
+}
+
+void setLayerProfiler()
+{
+return mBatchManager->setLayerProfiler();
+}
+
 void enqueue(std::shared_ptr<InferenceRequest> const& request)
 {
 TLLM_CHECK(request != nullptr);
@@ -1267,7 +1278,7 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType
 std::chrono::milliseconds waitSleep, bool returnContextLogits, bool returnGenerationLogits,
 std::optional<SizeType> const staticEmulatedBatchSize, std::optional<std::chrono::milliseconds> const batchTimeout,
 bool logIterationData, bool excludeInputInOutput, std::string const& responsesJsonFile,
-std::optional<SizeType> const maxPromptLen)
+std::optional<SizeType> const maxPromptLen, bool dumpProfile)
 {
 TrtGptModelOptionalParams optionalParams;
 
@@ -1279,6 +1290,10 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType
 {
 optionalParams.kvCacheConfig.freeGpuMemoryFraction = benchmarkParams.freeGpuMemoryFraction;
 }
+if (benchmarkParams.maxAttentionWindow)
+{
+optionalParams.kvCacheConfig.maxAttentionWindow = benchmarkParams.maxAttentionWindow;
+}
 optionalParams.kvCacheConfig.enableBlockReuse = benchmarkParams.enableBlockReuse;
 optionalParams.enableChunkedContext = benchmarkParams.enableChunkedContext;
 optionalParams.enableTrtOverlap = benchmarkParams.enableTrtOverlap;
@@ -1391,6 +1406,23 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType
 recorder->report();
 recorder->writeOpMetricsToCsv();
 recorder->dumpResponseSeqs();
+if (dumpProfile)
+{
+// Do per-layer profiling after normal benchmarking to avoid introducing perf overhead.
+gptServer->resetBatchDeadline();
+gptServer->setLayerProfiler();
+for (std::size_t i = 0; i < numSamples; ++i)
+{
+auto request = makeRequest(i + 1, samples[i], benchmarkParams.streaming, beamWidthTensor, eosIdTensor,
+padIdTensor, bufferManager, returnContextLogitsFlagTensor, returnGenerationLogitsFlagTensor);
+gptServer->enqueue(request);
+}
+gptServer->waitForEmpty();
+if (worldConfig.getRank() == 0)
+{
+printf("[BENCHMARK] Per layer performance profile\n%s\n", gptServer->getLayerProfileInfo().c_str());
+}
+}
 // Send terminateReqId to terminate servers on all ranks
 // Server on rank 0 will broadcast the terminate signal to other servers on multi-GPU cases
 gptServer->enqueue(std::make_shared<InferenceRequest>(terminateReqId));
@@ -1554,6 +1586,7 @@ int main(int argc, char* argv[])
 "eos_id", "Specify the end-of-sequence token id.", cxxopts::value<TokenIdType>()->default_value("-1"));
 options.add_options()("pad_id", "Specify the padding token id.", cxxopts::value<TokenIdType>());
 options.add_options()("max_tokens_in_paged_kvcache", "Max tokens in paged K-V Cache.", cxxopts::value<int>());
+options.add_options()("max_attention_window", "Max KV cache length per sequence", cxxopts::value<int>());
 options.add_options()(
 "random_seed", "integer random seed for exponential time delays.", cxxopts::value<int>()->default_value("420"));
 options.add_options()(
@@ -1614,6 +1647,8 @@ int main(int argc, char* argv[])
 options.add_options()(
 "max_prompt_len", "Truncate all prompts from dataset to the length specified.", cxxopts::value<SizeType>());
 
+options.add_options()("dump_profile", "Print profile information per layer.", cxxopts::value<bool>());
+
 auto result = options.parse(argc, argv);
 
 if (result.count("help"))
@@ -1674,6 +1709,12 @@ int main(int argc, char* argv[])
 benchmarkParams.maxTokensInPagedKvCache = result["max_tokens_in_paged_kvcache"].as<int>();
 }
 
+// Argument: Max KV cache length
+if (result.count("max_attention_window"))
+{
+benchmarkParams.maxAttentionWindow = result["max_attention_window"].as<int>();
+}
+
 if (result.count("random_seed"))
 {
 benchmarkParams.randomSeed = result["random_seed"].as<int>();
@@ -1811,6 +1852,9 @@ int main(int argc, char* argv[])
 return 1;
 }
 
+// Argument: dump profile
+bool dumpProfile = result["dump_profile"].as<bool>();
+
 initTrtLlmPlugins(logger.get());
 
 if (api == "gptManager")
@@ -1821,7 +1865,7 @@ int main(int argc, char* argv[])
 maxNumSamples, beamWidth, result["warm_up"].as<int>(), eosId, padId, benchmarkParams, schedulerPolicy,
 waitSleep, returnContextLogits, returnGenerationLogits, staticEmulatedBatchSize, batchTimeout,
 logIterationData, result["exclude_input_in_output_seq"].as<bool>(),
-result["responses_json_file"].as<std::string>(), maxPromptLen);
+result["responses_json_file"].as<std::string>(), maxPromptLen, dumpProfile);
 }
 catch (std::exception const& e)
 {
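To exercise the two options introduced above, a minimal invocation sketch follows; the engine and dataset paths are reused from the README section earlier in this commit, the window size is illustrative, and passing `--dump_profile` as a bare switch assumes cxxopts' usual implicit `true` for boolean options:

```
# Cap the per-sequence KV cache and print the per-layer profile after the benchmark run.
./benchmarks/gptManagerBenchmark \
    --engine_dir ../../examples/gpt/trt_engine/gpt2/fp16/1-gpu/ \
    --type IFB \
    --dataset ../../benchmarks/cpp/tokens-fixed-lengths.json \
    --max_attention_window 1024 \
    --dump_profile
```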
46 changes: 44 additions & 2 deletions benchmarks/cpp/gptSessionBenchmark.cpp
@@ -68,7 +68,7 @@ size_t monitorMemory(std::atomic_bool& done)
 void benchmarkGptSession(std::filesystem::path const& dataPath, std::vector<int> const& batchSizes, int beamWidth,
 std::vector<std::vector<int>> const& inOutLen, std::shared_ptr<nvinfer1::ILogger> const& logger, int warmUp,
 int numRuns, int duration, GptSession::Config& sessionConfig, bool cudaGraphMode, bool printAllLogits,
-bool disableForceMaxTokens, bool dumpLayerInfo)
+bool disableForceMaxTokens, bool dumpLayerInfo, bool dumpProfile)
 {
 std::filesystem::path jsonFileName = dataPath / "config.json";
 auto const json = GptJsonConfig::parse(jsonFileName);
@@ -298,6 +298,46 @@ void benchmarkGptSession(std::filesystem::path const& dataPath, std::vector<int>
 << std::endl;
 }
 }
+// Do per-layer profiling after normal benchmarking to avoid introducing perf overhead.
+if (dumpProfile)
+{
+session.setLayerProfiler();
+iterIdx = 0;
+
+while (iterIdx < numRuns)
+{
+auto const start = std::chrono::steady_clock::now();
+SizeType numSteps = 0;
+generationOutput.onTokenGenerated
+= [&numSteps, maxNewTokens](GenerationOutput::TensorPtr const& outputIds, SizeType step,
+bool finished) { ++numSteps; };
+session.generate(generationOutput, generationInput, samplingConfig, generationProfiler);
+bufferManager.getStream().synchronize();
+auto const end = std::chrono::steady_clock::now();
+
+iterIdx += 1;
+float latency = std::chrono::duration<float, std::milli>(end - start).count();
+curDuration += latency;
+latencies.emplace_back(latency);
+generationTimes.emplace_back(generationProfiler->getElapsedTimeMs());
+
+bool durationLimitReached{curDuration / 1000 >= duration};
+if (worldConfig.getSize() > 1)
+{
+bool result{false};
+comm.allreduce(&durationLimitReached, &result, 1, tmpi::MpiType::kBOOL, tmpi::MpiOp::LOR);
+durationLimitReached = result;
+}
+if (durationLimitReached)
+{
+break;
+}
+}
+if (worldConfig.getRank() == 0)
+{
+printf("%s\n", session.getLayerProfileInfo().c_str());
+}
+}
 }
 catch (std::runtime_error& e)
 {
@@ -377,6 +417,7 @@ int main(int argc, char* argv[])
 options.add_options()("print_all_logits", "Print all context and generation logits.");
 options.add_options()("disable_force_max_tokens", "Disable force the engine generating new max_tokens.");
 options.add_options()("dump_layer_info", "Print layer information of the engine to console.");
+options.add_options()("dump_profile", "Print profile information per layer.");
 
 auto result = options.parse(argc, argv);
 
@@ -487,14 +528,15 @@ int main(int argc, char* argv[])
 auto printAllLogits = result.count("print_all_logits") > 0;
 auto disableForceMaxTokens = result.count("disable_force_max_tokens") > 0;
 auto dumpLayerInfo = result.count("dump_layer_info") > 0;
+auto dumpProfile = result.count("dump_profile") > 0;
 
 initTrtLlmPlugins(logger.get());
 
 try
 {
 benchmarkGptSession(result["engine_dir"].as<std::string>(), batchSizes, beamWidth, inOutLen, logger,
 result["warm_up"].as<int>(), result["num_runs"].as<int>(), result["duration"].as<int>(), sessionConfig,
-enableCudaGraph, printAllLogits, disableForceMaxTokens, dumpLayerInfo);
+enableCudaGraph, printAllLogits, disableForceMaxTokens, dumpLayerInfo, dumpProfile);
 }
 catch (std::exception const& e)
 {
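The session benchmark gains the same switch, so a per-layer profile run could look like the sketch below; option spellings other than `--dump_profile` are assumed from gptSessionBenchmark's existing CLI, and the engine path, batch size, and sequence lengths are placeholders:

```
# Timed runs first, then a second profiled pass whose per-layer table is printed on rank 0.
./benchmarks/gptSessionBenchmark \
    --engine_dir ../../examples/gpt/trt_engine/gpt2/fp16/1-gpu/ \
    --batch_size 8 \
    --input_output_len "128,20" \
    --warm_up 2 --num_runs 10 --duration 60 \
    --dump_profile
```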
8 changes: 8 additions & 0 deletions benchmarks/python/README.md
@@ -48,3 +48,11 @@ mpirun -n 8 python benchmark.py \
 --batch_size "1;8;64" \
 --input_output_len "60,20;128,20"
 ```
+
+Note: Building multi-GPU engines in parallel could be a heavy workload for the CPU system. Tuning `mpirun --map-by <XXX>` option on your system may achieve significant boost in build time, for example:
+```
+mpirun --map-by socket -n 8 python build.py \
+--model gpt_175b \
+--mode ootb \
+--quantization fp8
+```
2 changes: 2 additions & 0 deletions benchmarks/python/allowed_configs.py
@@ -67,6 +67,8 @@ class BuildConfig:
 layer_types: List[str] = field(default_factory=list)
 rnn_hidden_size: int = 0
 logits_soft_cap: float = 0.0
+opt_batch_size: Optional[int] = None
+opt_num_tokens: Optional[int] = None
 
 
 @dataclass
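These two optional fields pair with the `--opt_batch_size` and `--opt_num_tokens` flags added to `benchmarks/python/benchmark.py` further down in this commit. A hedged usage sketch, with the model name and all sizes purely illustrative (the `-m` and `--mode` spellings follow the existing benchmark CLI and are assumed here):

```
# OOTB engines: override the optimization profile's batch size.
python benchmarks/python/benchmark.py -m gpt_350m --mode ootb \
    --batch_size 8 --input_output_len "128,20" --opt_batch_size 8

# Plugin-based engines: override the optimization profile's token count instead.
python benchmarks/python/benchmark.py -m gpt_350m --mode plugin \
    --batch_size 8 --input_output_len "128,20" --opt_num_tokens 256
```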
55 changes: 52 additions & 3 deletions benchmarks/python/benchmark.py
@@ -268,6 +268,25 @@ def parse_arguments():
 help=
 "Print layer information of the engine to console (default = disabled)")
 
+parser.add_argument(
+'--opt_batch_size',
+type=int,
+default=None,
+help=
+"If opt_batch_size option is specified, it will override the opt batch size."
+"This flag only takes effect when `--mode=ootb` is added. For other modes, please use --opt_num_tokens to replace it."
+)
+
+parser.add_argument(
+'--opt_num_tokens',
+type=int,
+default=None,
+help="It equals to max_batch_size*max_beam_width by default, set this "
+"value as close as possible to the actual number of tokens on your workload. "
+"Note that this argument might be removed in the future."
+"This flag only takes effect when `--mode` is not `ootb`. For ootb mode, please use --opt_batch_size to replace it."
+)
+
 return parser.parse_args()
 
 
@@ -334,9 +353,6 @@ def main(args):
 if args.build_only:
 return
 
-if args.dump_profile and benchmark_profiler is not None:
-benchmark_profiler.set_recording_perf_profile(True)
-
 start = torch.cuda.Event(enable_timing=True)
 end = torch.cuda.Event(enable_timing=True)
 benchmarker.print_report_header(args.csv,
@@ -432,6 +448,39 @@ def main(args):
 csv=args.csv,
 benchmark_profiler=benchmark_profiler)
 
+# Rerun for dumping profile per layer.
+if args.dump_profile and benchmark_profiler is not None:
+benchmark_profiler.set_recording_perf_profile(True)
+logger.info(f'Dump profile information per layer')
+iter_idx = 0
+try:
+# Warm up
+for _ in range(args.warm_up):
+benchmarker.run(inputs, config)
+if benchmark_profiler is not None:
+benchmark_profiler.clean()
+benchmark_profiler.start()
+cur_duration = 0
+start_time = time()
+while iter_idx < args.num_runs or cur_duration < args.duration:
+start.record()
+benchmarker.run(inputs,
+config,
+benchmark_profiler=benchmark_profiler)
+end.record()
+torch.cuda.synchronize()
+latencies.append(start.elapsed_time(end))
+iter_idx += 1
+cur_duration = round(time() - start_time, 3)
+benchmarker.report_profiler(
+benchmark_profiler=benchmark_profiler)
+except Exception as e:
+logger.error("Found exception during benchmarking",
+e.with_traceback())
+if not disable_mem_monitor:
+memory_monitor.kill()
+raise e
+
 
 if __name__ == '__main__':
 mp.set_start_method('spawn')
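To tie the rerun path above to a command line, a hedged sketch; `--dump_profile` is assumed to be the pre-existing flag behind `args.dump_profile`, and the model name, run counts, and durations are placeholders:

```
# Normal timed runs first, then a second pass that records and reports the per-layer profile.
python benchmarks/python/benchmark.py -m gpt_350m --mode plugin \
    --batch_size 8 --input_output_len "128,20" \
    --warm_up 2 --num_runs 10 --duration 60 \
    --dump_profile
```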
