diff --git a/src/relax/backend/contrib/tensorrt/codegen.cc b/src/relax/backend/contrib/tensorrt/codegen.cc index 7fa6d48bdc24..07ba1c81e653 100644 --- a/src/relax/backend/contrib/tensorrt/codegen.cc +++ b/src/relax/backend/contrib/tensorrt/codegen.cc @@ -61,7 +61,8 @@ struct TensorRTCompilerConfigNode : public ffi::Object { "TensorRT version as (major, minor, patch).", refl::DefaultValue(ffi::Array({6, 0, 1}))) .def_ro("use_implicit_batch", &TensorRTCompilerConfigNode::use_implicit_batch, - "Use implicit batch", refl::DefaultValue(true)) + "Use implicit batch (removed in TensorRT 10; networks are always explicit-batch)", + refl::DefaultValue(false)) .def_ro("max_workspace_size", &TensorRTCompilerConfigNode::max_workspace_size, "Max workspace size", refl::DefaultValue(size_t(1) << 30)) .def_ro("remove_no_mac_subgraphs", &TensorRTCompilerConfigNode::remove_no_mac_subgraphs, diff --git a/src/runtime/extra/contrib/tensorrt/tensorrt_builder.cc b/src/runtime/extra/contrib/tensorrt/tensorrt_builder.cc index 4caa8e383e15..f0c2a26b2e66 100644 --- a/src/runtime/extra/contrib/tensorrt/tensorrt_builder.cc +++ b/src/runtime/extra/contrib/tensorrt/tensorrt_builder.cc @@ -40,36 +40,24 @@ namespace contrib { TensorRTBuilder::TensorRTBuilder(TensorRTLogger* logger, const std::vector& data_entry, - size_t max_workspace_size, bool use_implicit_batch, bool use_fp16, - int batch_size, nvinfer1::IInt8Calibrator* calibrator) - : data_entry_(data_entry), + size_t max_workspace_size, bool use_fp16, + nvinfer1::IInt8Calibrator* calibrator) + : trt_logger_(logger), + data_entry_(data_entry), max_workspace_size_(max_workspace_size), - use_implicit_batch_(use_implicit_batch), use_fp16_(use_fp16), use_int8_(false), - batch_size_(batch_size), calibrator_(calibrator) { // Create TRT builder and network. - builder_ = nvinfer1::createInferBuilder(*logger); + builder_ = nvinfer1::createInferBuilder(*trt_logger_); -#if TRT_VERSION_GE(6, 0, 1) - // Use INetworkV2. - auto flags = - 1U << static_cast(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); - if (use_implicit_batch_) { - flags = 0U; - builder_->setMaxBatchSize(batch_size_); - } + // TensorRT 10 removed implicit-batch mode and the kEXPLICIT_BATCH creation flag; every network is + // explicit-batch, so the batch dimension is simply dimension 0 of each binding and is varied + // through optimization profiles rather than IBuilder::setMaxBatchSize. if (calibrator_ != nullptr) { use_int8_ = true; } - network_ = builder_->createNetworkV2(flags); -#else - builder_->setMaxBatchSize(batch_size_); - builder_->setMaxWorkspaceSize(max_workspace_size_); - builder_->setFp16Mode(use_fp16_); - network_ = builder_->createNetwork(); -#endif + network_ = builder_->createNetworkV2(0U); } nvinfer1::DataType DLDataType2NVDataType(DLDataType data_type) { @@ -87,10 +75,7 @@ void TensorRTBuilder::AddInput(int nid, uint32_t entry_id, const JSONGraphNode& for (size_t i = 0; i < shapes.size(); ++i) { const std::string name = node_name + "_" + std::to_string(i); auto shape = shapes[i]; - // Remove batch dim when not in explicit batch mode. - if (use_implicit_batch_ && shape.size() > 1) { - shape.erase(shape.begin()); - } + // TensorRT 10 is always explicit-batch: keep the full shape including the batch dimension. nvinfer1::Dims dims = VectorToTrtDims(shape); auto input_tensor = network_->addInput(name.c_str(), DLDataType2NVDataType(dtypes[i]), dims); node_output_map_[nid].push_back(TensorRTOpInput(input_tensor)); @@ -168,11 +153,10 @@ void TensorRTBuilder::AddLayer(int nid, const JSONGraphNode& node) { } TensorRTEngineAndContext TensorRTBuilder::BuildEngine() { - // Process graph to create INetworkDefinition. -// Build engine. -#if TRT_VERSION_GE(6, 0, 1) + // Build engine. config_ = builder_->createBuilderConfig(); - config_->setMaxWorkspaceSize(max_workspace_size_); + // TensorRT 10 replaced IBuilderConfig::setMaxWorkspaceSize with a tunable memory pool. + config_->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_size_); if (use_fp16_) { config_->setFlag(nvinfer1::BuilderFlag::kFP16); } @@ -184,40 +168,48 @@ TensorRTEngineAndContext TensorRTBuilder::BuildEngine() { LOG(INFO) << "config finishes setting up calibrator as INT8 mode ... "; } - // Add profiles. - if (!use_implicit_batch_) { - auto profile = builder_->createOptimizationProfile(); - for (int i = 0; i < network_->getNbInputs(); ++i) { - auto name = network_->getInput(i)->getName(); - const uint32_t entry_id = entry_id_map_[name]; - std::vector shape(data_entry_[entry_id]->shape, - data_entry_[entry_id]->shape + data_entry_[entry_id]->ndim); - auto dims = VectorToTrtDims(shape); + // Every network is explicit-batch in TRT10, so always add an optimization profile that pins each + // input to its concrete shape (with a minimum batch of 1 for dynamic batch dimensions). + auto profile = builder_->createOptimizationProfile(); + for (int i = 0; i < network_->getNbInputs(); ++i) { + auto name = network_->getInput(i)->getName(); + const uint32_t entry_id = entry_id_map_[name]; + std::vector shape(data_entry_[entry_id]->shape, + data_entry_[entry_id]->shape + data_entry_[entry_id]->ndim); + auto dims = VectorToTrtDims(shape); - profile->setDimensions(name, nvinfer1::OptProfileSelector::kOPT, dims); - profile->setDimensions(name, nvinfer1::OptProfileSelector::kMAX, dims); - // Set minimum batch size to 1 when dynamic batching is used. - if (network_->getInput(i)->getDimensions().nbDims >= 1 && - network_->getInput(i)->getDimensions().d[0] == -1) { - dims.d[0] = 1; - } - profile->setDimensions(name, nvinfer1::OptProfileSelector::kMIN, dims); + profile->setDimensions(name, nvinfer1::OptProfileSelector::kOPT, dims); + profile->setDimensions(name, nvinfer1::OptProfileSelector::kMAX, dims); + // The network inputs are built with static shapes, so the profile must match them exactly; only + // lower kMIN for a genuinely dynamic (-1) leading dimension. + if (network_->getInput(i)->getDimensions().nbDims >= 1 && + network_->getInput(i)->getDimensions().d[0] == -1) { + dims.d[0] = 1; } - config_->addOptimizationProfile(profile); + profile->setDimensions(name, nvinfer1::OptProfileSelector::kMIN, dims); } - nvinfer1::ICudaEngine* engine = builder_->buildEngineWithConfig(*network_, *config_); -#else - nvinfer1::ICudaEngine* engine = builder_->buildCudaEngine(*network_); -#endif - TVM_FFI_ICHECK_EQ(engine->getNbBindings(), - network_input_names_.size() + network_output_names_.size()); + config_->addOptimizationProfile(profile); + + // TensorRT 10 removed buildEngineWithConfig; build a serialized engine and deserialize it through + // an IRuntime that is kept alive alongside the engine (TensorRTEngineAndContext::runtime). + nvinfer1::IHostMemory* plan = builder_->buildSerializedNetwork(*network_, *config_); + TVM_FFI_ICHECK(plan) << "Failed to build TensorRT serialized network."; + nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(*trt_logger_); + nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(plan->data(), plan->size()); + delete plan; + if (engine == nullptr) { + delete runtime; + TVM_FFI_THROW(InternalError) << "Failed to deserialize the TensorRT engine."; + } + TVM_FFI_ICHECK_EQ( + engine->getNbIOTensors(), + static_cast(network_input_names_.size() + network_output_names_.size())); nvinfer1::IExecutionContext* context = engine->createExecutionContext(); CleanUp(); - TVM_FFI_ICHECK(engine); TVM_FFI_ICHECK(context); - return {engine, context, network_input_names_, network_output_names_}; + return {runtime, engine, context, network_input_names_, network_output_names_}; } nvinfer1::Weights TensorRTBuilder::GetDLTensorAsWeights(const DLTensor* dptr, @@ -236,10 +228,9 @@ nvinfer1::Weights TensorRTBuilder::GetDLTensorAsWeights(const DLTensor* dptr, } weight.count = count; weight.values = new float[count]; - TVM_FFI_ICHECK_EQ(TVMTensorCopyToBytes(const_cast(dptr), - const_cast(weight.values), weight_bytes), - 0) - << TVMGetLastError(); + // Tensor::CopyToBytes throws on failure (the old C API TVMTensorCopyToBytes/TVMGetLastError + // were removed during the tvm-ffi refactor). + Tensor::CopyToBytes(dptr, const_cast(weight.values), weight_bytes); trt_weights_.push_back(weight); return weight; } @@ -247,35 +238,25 @@ nvinfer1::Weights TensorRTBuilder::GetDLTensorAsWeights(const DLTensor* dptr, nvinfer1::ITensor* TensorRTBuilder::GetInputAsTensor(const TensorRTOpInput& input) { if (input.type == kTensor) return input.tensor; auto shape = input.weight_shape; - // Remove batch dim when not in explicit batch mode. - // Example: - // x = dims (1, 32, 224, 224) which becomes TRT Dims (32, 224, 224) - // y = dims (1, 32) - // z = add(x, y) - // y needs to have TRT dims (32,), otherwise broadcasting will result in z having - // TRT Dims(1, 32, 224, 224) when it should be (32, 224, 224). - if (use_implicit_batch_ && shape.size() > 1 && shape[0] == 1) { - shape.erase(shape.begin()); - } + // TensorRT 10 is always explicit-batch, so the constant keeps its full shape. return network_->addConstant(VectorToTrtDims(shape), input.weight)->getOutput(0); } void TensorRTBuilder::CleanUp() { + // TensorRT 10 removed obj->destroy(); objects are released with the delete operator. VLOG(1) << "Destroying TensorRT network"; TVM_FFI_ICHECK(network_); - network_->destroy(); + delete network_; network_ = nullptr; -#if TRT_VERSION_GE(6, 0, 1) VLOG(1) << "Destroying TensorRT config"; TVM_FFI_ICHECK(config_); - config_->destroy(); + delete config_; config_ = nullptr; -#endif VLOG(1) << "Destroying TensorRT builder"; TVM_FFI_ICHECK(builder_); - builder_->destroy(); + delete builder_; builder_ = nullptr; VLOG(1) << "Destroying TensorRT weights"; diff --git a/src/runtime/extra/contrib/tensorrt/tensorrt_builder.h b/src/runtime/extra/contrib/tensorrt/tensorrt_builder.h index 96905598737c..108f56b9f32f 100644 --- a/src/runtime/extra/contrib/tensorrt/tensorrt_builder.h +++ b/src/runtime/extra/contrib/tensorrt/tensorrt_builder.h @@ -48,6 +48,9 @@ using JSONGraphNodeEntry = tvm::runtime::json::JSONGraphNodeEntry; * perform inference. */ struct TensorRTEngineAndContext { + // TensorRT 10 builds a serialized engine which is then deserialized through an IRuntime. The + // runtime must outlive the engine it produced, so it is owned alongside the engine/context. + nvinfer1::IRuntime* runtime = nullptr; nvinfer1::ICudaEngine* engine = nullptr; nvinfer1::IExecutionContext* context = nullptr; std::vector inputs; @@ -67,12 +70,10 @@ class TensorRTBuilder { * \brief Create TensorRT builder. * \param logger TensorRT logger to use for errors and warnings. * \param max_workspace_size Workspace size parameter for TensorRT engine build phase. - * \param use_implicit_batch Whether to use implicit batch mode (default) * \param use_fp16 Whether to automatically convert a model to fp16 - * \param batch_size If use_implicit_batch, */ TensorRTBuilder(TensorRTLogger* logger, const std::vector& data_entry, - size_t max_workspace_size, bool use_implicit_batch, bool use_fp16, int batch_size, + size_t max_workspace_size, bool use_fp16, nvinfer1::IInt8Calibrator* calibrator = nullptr); /*! @@ -124,13 +125,14 @@ class TensorRTBuilder { /*! \brief Maps a node to its outputs. */ std::unordered_map> node_output_map_; + /*! \brief TensorRT logger, used to create the builder and the deserialization runtime. */ + TensorRTLogger* trt_logger_ = nullptr; + /*! \brief TensorRT builder. */ nvinfer1::IBuilder* builder_ = nullptr; -#if TRT_VERSION_GE(6, 0, 1) /*! \brief TensorRT builder config. */ nvinfer1::IBuilderConfig* config_ = nullptr; -#endif /*! \brief TensorRT network definition. */ nvinfer1::INetworkDefinition* network_ = nullptr; @@ -147,18 +149,12 @@ class TensorRTBuilder { /*! \brief Max workspace size in bytes for TRT. */ size_t max_workspace_size_; - /*! \brief Whether to use implicit batch mode. */ - bool use_implicit_batch_; - /*! \brief Whether to automatically convert model to 16-bit floating point precision. */ bool use_fp16_; /*! \brief whether to automatically convert model to int8 precision */ bool use_int8_; - /*! \brief Batch size to optimize for. */ - int batch_size_; - /*! \brief Input names. */ std::vector network_input_names_; diff --git a/src/runtime/extra/contrib/tensorrt/tensorrt_calibrator.h b/src/runtime/extra/contrib/tensorrt/tensorrt_calibrator.h index 408d50cc7e08..aa10d8f0d9df 100755 --- a/src/runtime/extra/contrib/tensorrt/tensorrt_calibrator.h +++ b/src/runtime/extra/contrib/tensorrt/tensorrt_calibrator.h @@ -123,7 +123,10 @@ class TensorRTCalibrator : public nvinfer1::IInt8EntropyCalibrator2 { const int num_inputs = data_sizes_[0].size(); buffers_.assign(num_inputs, nullptr); for (int i = 0; i < num_inputs; ++i) { - TVM_FFI_CHECK_CUDA_ERROR(cudaMalloc(&buffers_[i], data_sizes_[0][i] * sizeof(float))); + // data_sizes_ holds the per-sample element count; getBatch() copies a full batch + // (batch_size_ * per-sample) into each buffer, so the device buffer must be sized to match. + TVM_FFI_CHECK_CUDA_ERROR( + cudaMalloc(&buffers_[i], batch_size_ * data_sizes_[0][i] * sizeof(float))); } } }; diff --git a/src/runtime/extra/contrib/tensorrt/tensorrt_ops.cc b/src/runtime/extra/contrib/tensorrt/tensorrt_ops.cc index f8463cb50e65..d3e68778fde9 100644 --- a/src/runtime/extra/contrib/tensorrt/tensorrt_ops.cc +++ b/src/runtime/extra/contrib/tensorrt/tensorrt_ops.cc @@ -252,11 +252,16 @@ class Conv1DOpConverter : public TensorRTOpConverter { auto dilation = params->node.GetAttr>("dilation"); auto padding = params->node.GetAttr>("padding"); int groups = static_cast(params->node.GetAttr("groups")); + // Relax conv attrs carry no "channels" field (unlike Relay); the number of output channels is + // the first dimension of the OIHW/OIW kernel. int channels = weight_shape[0]; - channels = static_cast(params->node.GetAttr("channels")); auto shuffle_layer = params->network->addShuffle(*input_tensor); - std::vector new_shape = {input_dims[0], input_dims[1], 1}; + // Emulate a 1D convolution with a 2D convolution by appending a trailing unit spatial + // dimension (NCW -> NCW1). In explicit-batch mode (TensorRT 10) input_dims already includes the + // batch dimension, so derive the reshape from the full input rank instead of hard-coding it. + std::vector new_shape(input_dims); + new_shape.push_back(1); shuffle_layer->setReshapeDimensions(VectorToTrtDims(new_shape)); input_tensor = shuffle_layer->getOutput(0); @@ -265,21 +270,22 @@ class Conv1DOpConverter : public TensorRTOpConverter { nvinfer1::Weights bias{weight_type, nullptr, 0}; - auto conv_layer = params->network->addConvolution(*input_tensor, channels, kernel_size, - params->inputs.at(1).weight, bias); + auto conv_layer = params->network->addConvolutionNd(*input_tensor, channels, kernel_size, + params->inputs.at(1).weight, bias); TVM_FFI_ICHECK(conv_layer != nullptr); - conv_layer->setPadding(nvinfer1::DimsHW(static_cast(padding[0]), 0)); + conv_layer->setPaddingNd(nvinfer1::DimsHW(static_cast(padding[0]), 0)); TVM_FFI_ICHECK_EQ(strides.size(), 1); const auto trt_strides = nvinfer1::DimsHW(static_cast(strides[0]), 1); - conv_layer->setStride(trt_strides); + conv_layer->setStrideNd(trt_strides); TVM_FFI_ICHECK_EQ(dilation.size(), 1); const auto trt_dilation = nvinfer1::DimsHW(static_cast(dilation[0]), 1); - conv_layer->setDilation(trt_dilation); + conv_layer->setDilationNd(trt_dilation); conv_layer->setNbGroups(groups); input_tensor = conv_layer->getOutput(0); - auto conv_output_dims = TrtDimsToVector(input_tensor->getDimensions()); - std::vector back_shape = {0, 0}; + // Drop the trailing unit dimension (NOW1 -> NOW); 0 copies the corresponding input dimension, + // so the number of leading dims to keep matches the original input rank. + std::vector back_shape(input_dims.size(), 0); auto shuffle_back_layer = params->network->addShuffle(*input_tensor); shuffle_back_layer->setReshapeDimensions(VectorToTrtDims(back_shape)); params->outputs.push_back(shuffle_back_layer->getOutput(0)); @@ -304,47 +310,36 @@ class Conv2DOpConverter : public TensorRTOpConverter { auto dilation = params->node.GetAttr>("dilation"); auto padding = params->node.GetAttr>("padding"); int groups = static_cast(params->node.GetAttr("groups")); + // Relax conv attrs carry no "channels" field (unlike Relay); the number of output channels is + // the first dimension of the OIHW/OIW kernel. int channels = weight_shape[0]; - channels = static_cast(params->node.GetAttr("channels")); // TRT conv2d op doesn't support asymmetric padding before 5.1, so we // workaround by adding a padding layer before the pooling op. nvinfer1::DimsHW prepadding, postpadding; bool use_asymmetric_padding; GetPadding(padding, &use_asymmetric_padding, &prepadding, &postpadding); -#if !TRT_VERSION_GE(5, 1, 5) - if (use_asymmetric_padding) { - auto pad_layer = params->network->addPadding(*input_tensor, prepadding, postpadding); - TVM_FFI_ICHECK(pad_layer != nullptr); - input_tensor = pad_layer->getOutput(0); - // No need for conv op to do any padding. - use_asymmetric_padding = false; - prepadding = nvinfer1::DimsHW(0, 0); - } -#endif const auto kernel_size = nvinfer1::DimsHW(weight_shape[2], weight_shape[3]); const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type; nvinfer1::Weights bias{weight_type, nullptr, 0}; - auto conv_layer = params->network->addConvolution(*input_tensor, channels, kernel_size, - params->inputs.at(1).weight, bias); + auto conv_layer = params->network->addConvolutionNd(*input_tensor, channels, kernel_size, + params->inputs.at(1).weight, bias); TVM_FFI_ICHECK(conv_layer != nullptr); conv_layer->setName(params->LayerName().c_str()); if (use_asymmetric_padding) { -#if TRT_VERSION_GE(5, 1, 5) conv_layer->setPrePadding(prepadding); conv_layer->setPostPadding(postpadding); -#endif } else { - conv_layer->setPadding(prepadding); + conv_layer->setPaddingNd(prepadding); } TVM_FFI_ICHECK_EQ(strides.size(), 2); const auto trt_strides = nvinfer1::DimsHW(static_cast(strides[0]), static_cast(strides[1])); - conv_layer->setStride(trt_strides); + conv_layer->setStrideNd(trt_strides); TVM_FFI_ICHECK_EQ(dilation.size(), 2); const auto trt_dilation = nvinfer1::DimsHW(static_cast(dilation[0]), static_cast(dilation[1])); - conv_layer->setDilation(trt_dilation); + conv_layer->setDilationNd(trt_dilation); conv_layer->setNbGroups(groups); params->outputs.push_back(conv_layer->getOutput(0)); } @@ -374,7 +369,8 @@ class Conv3DOpConverter : public TensorRTOpConverter { bool use_asymmetric_padding; GetPadding3D(padding, &use_asymmetric_padding, &prepadding, &postpadding); - const int num_outputs = static_cast(params->node.GetAttr("channels")); + // Relax conv3d has no "channels" attr; output channels = weight_shape[0] (OIDHW kernel). + const int num_outputs = static_cast(weight_shape[0]); const auto kernel_size = nvinfer1::Dims3(weight_shape[2], weight_shape[3], weight_shape[4]); const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type; nvinfer1::Weights bias{weight_type, nullptr, 0}; @@ -410,31 +406,27 @@ class DenseOpConverter : public TensorRTOpConverter { void Convert(TensorRTOpConverterParams* params) const { auto input_tensor = params->inputs.at(0).tensor; - auto input_dims = TrtDimsToVector(input_tensor->getDimensions()); - TVM_FFI_ICHECK(input_dims.size() > 0 && input_dims.size() <= 3); - const size_t required_rank = TRT_HAS_IMPLICIT_BATCH(params) ? 3 : 4; - const bool need_reshape_on_input = input_dims.size() != required_rank; - if (need_reshape_on_input) { - // Add dims of size 1 until rank is required_rank. - std::vector new_shape(input_dims); - while (new_shape.size() < required_rank) new_shape.insert(new_shape.end(), 1); - input_tensor = Reshape(params, input_tensor, new_shape); - } - // Weights are in KC format. + // Weights are in KC (out_units x in_features) format. TVM_FFI_ICHECK_EQ(params->inputs.at(1).weight_shape.size(), 2); - const int num_units = params->inputs.at(1).weight_shape[0]; - const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type; - nvinfer1::Weights bias{weight_type, nullptr, 0}; - nvinfer1::IFullyConnectedLayer* fc_layer = params->network->addFullyConnected( - *input_tensor, num_units, params->inputs.at(1).weight, bias); - TVM_FFI_ICHECK(fc_layer != nullptr); - auto output_tensor = fc_layer->getOutput(0); - if (need_reshape_on_input) { - // Remove added dims. - input_dims[input_dims.size() - 1] = num_units; - output_tensor = Reshape(params, output_tensor, input_dims); - } - params->outputs.push_back(output_tensor); + // addMatrixMultiply requires the input to have at least 2 dimensions (rows x K); the old + // FullyConnected path padded the rank, so guard explicitly now that it is gone. + TVM_FFI_ICHECK_GE(input_tensor->getDimensions().nbDims, 2) + << "TensorRT dense expects an input of rank >= 2 (got " + << input_tensor->getDimensions().nbDims << ")"; + // TensorRT 10 removed IFullyConnectedLayer/addFullyConnected. Implement dense as a matrix + // multiply: out[.., O] = in[.., K] * weightįµ€, with weight a constant of shape [O, K]. + // IMatrixMultiplyLayer contracts the last dim of `input` (K) with the last dim of the + // transposed weight (also K) and broadcasts the remaining leading dimensions, which matches + // nn.dense semantics for any input rank >= 2 without the rank-padding reshape FC required. + auto* weight_tensor = params->network + ->addConstant(VectorToTrtDims(params->inputs.at(1).weight_shape), + params->inputs.at(1).weight) + ->getOutput(0); + auto* matmul_layer = params->network->addMatrixMultiply( + *input_tensor, nvinfer1::MatrixOperation::kNONE, *weight_tensor, + nvinfer1::MatrixOperation::kTRANSPOSE); + TVM_FFI_ICHECK(matmul_layer != nullptr); + params->outputs.push_back(matmul_layer->getOutput(0)); } }; @@ -666,33 +658,18 @@ class PoolingOpConverter : public TensorRTOpConverter { GetPadding(padding, &use_asymmetric_padding, &prepadding, &postpadding); bool ceil_mode = static_cast(params->node.GetAttr("ceil_mode")); -// TRT pooling op doesn't support asymmetric padding before 5.1, so we -// workaround by adding a padding layer before the pooling op. -#if !TRT_VERSION_GE(5, 1, 5) - if (use_asymmetric_padding) { - auto pad_layer = params->network->addPadding(*input, prepadding, postpadding); - TVM_FFI_ICHECK(pad_layer != nullptr); - input = pad_layer->getOutput(0); - // No need for pooling op to do any padding. - use_asymmetric_padding = false; - prepadding = nvinfer1::DimsHW(0, 0); - } -#endif - nvinfer1::DimsHW window_size = nvinfer1::DimsHW(static_cast(pool_size[0]), static_cast(pool_size[1])); - auto pool_layer = params->network->addPooling(*input, it->second, window_size); + auto pool_layer = params->network->addPoolingNd(*input, it->second, window_size); TVM_FFI_ICHECK(pool_layer != nullptr); nvinfer1::DimsHW trt_strides = nvinfer1::DimsHW(static_cast(strides[0]), static_cast(strides[1])); - pool_layer->setStride(trt_strides); + pool_layer->setStrideNd(trt_strides); if (use_asymmetric_padding) { -#if TRT_VERSION_GE(5, 1, 5) pool_layer->setPrePadding(prepadding); pool_layer->setPostPadding(postpadding); -#endif } else { - pool_layer->setPadding(prepadding); + pool_layer->setPaddingNd(prepadding); } if (op_name == "nn.avg_pool2d") { bool count_include_pad = static_cast(params->node.GetAttr("count_include_pad")); @@ -783,7 +760,7 @@ class GlobalPoolingOpConverter : public TensorRTOpConverter { const int h = TRT_HAS_IMPLICIT_BATCH(params) ? input_dims[1] : input_dims[2]; const int w = TRT_HAS_IMPLICIT_BATCH(params) ? input_dims[2] : input_dims[3]; auto pool_layer = - params->network->addPooling(*input_tensor, it->second, nvinfer1::DimsHW(h, w)); + params->network->addPoolingNd(*input_tensor, it->second, nvinfer1::DimsHW(h, w)); TVM_FFI_ICHECK(pool_layer != nullptr); params->outputs.push_back(pool_layer->getOutput(0)); } @@ -993,7 +970,7 @@ class Conv2DTransposeOpConverter : public TensorRTOpConverter { TVM_FFI_ICHECK_EQ(params->node.GetAttr("data_layout"), "NCHW"); TVM_FFI_ICHECK(params->node.GetAttr("out_layout") == "" || params->node.GetAttr("out_layout") == "NCHW"); - TVM_FFI_ICHECK_EQ(params->node.GetAttr("kernel_layout"), "OIHW"); + TVM_FFI_ICHECK_EQ(params->node.GetAttr("kernel_layout"), "IOHW"); auto dilation = params->node.GetAttr>("dilation"); TVM_FFI_ICHECK(static_cast(dilation[0]) == 1 && static_cast(dilation[1]) == 1); auto strides = params->node.GetAttr>("strides"); @@ -1006,35 +983,26 @@ class Conv2DTransposeOpConverter : public TensorRTOpConverter { nvinfer1::DimsHW prepadding, postpadding; bool use_asymmetric_padding; GetPadding(padding, &use_asymmetric_padding, &prepadding, &postpadding); -#if !TRT_VERSION_GE(5, 1, 5) - if (use_asymmetric_padding) { - auto pad_layer = params->network->addPadding(*input_tensor, prepadding, postpadding); - TVM_FFI_ICHECK(pad_layer != nullptr); - input_tensor = pad_layer->getOutput(0); - // No need for conv op to do any padding. - use_asymmetric_padding = false; - prepadding = nvinfer1::DimsHW(0, 0); - } -#endif - const int num_outputs = static_cast(params->node.GetAttr("channels")); + // Relax conv2d_transpose uses an IOHW kernel ([in, out, h, w]) by default, which is also the + // layout TensorRT's deconvolution expects, so the weight is passed through unchanged and the + // output channel count is the second kernel dimension. + const int num_outputs = static_cast(weight_shape[1]); const auto kernel_size = nvinfer1::DimsHW(weight_shape[2], weight_shape[3]); const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type; nvinfer1::Weights bias{weight_type, nullptr, 0}; - auto deconv_layer = params->network->addDeconvolution(*input_tensor, num_outputs, kernel_size, - params->inputs.at(1).weight, bias); + auto deconv_layer = params->network->addDeconvolutionNd(*input_tensor, num_outputs, kernel_size, + params->inputs.at(1).weight, bias); TVM_FFI_ICHECK(deconv_layer != nullptr); if (use_asymmetric_padding) { -#if TRT_VERSION_GE(5, 1, 5) deconv_layer->setPrePadding(prepadding); deconv_layer->setPostPadding(postpadding); -#endif } else { - deconv_layer->setPadding(prepadding); + deconv_layer->setPaddingNd(prepadding); } const auto trt_strides = nvinfer1::DimsHW(static_cast(strides[0]), static_cast(strides[1])); - deconv_layer->setStride(trt_strides); + deconv_layer->setStrideNd(trt_strides); deconv_layer->setNbGroups(groups); nvinfer1::ITensor* output = deconv_layer->getOutput(0); // Output padding. @@ -1044,7 +1012,7 @@ class Conv2DTransposeOpConverter : public TensorRTOpConverter { postpadding.w() != 0) { // Output padding for Conv2D transpose is always asymmetric and applied to post only. prepadding = nvinfer1::DimsHW(0, 0); - auto pad_layer = params->network->addPadding(*output, prepadding, postpadding); + auto pad_layer = params->network->addPaddingNd(*output, prepadding, postpadding); output = pad_layer->getOutput(0); } } @@ -1065,7 +1033,7 @@ class Conv3DTransposeOpConverter : public TensorRTOpConverter { TVM_FFI_ICHECK_EQ(params->node.GetAttr("data_layout"), "NCDHW"); TVM_FFI_ICHECK(params->node.GetAttr("out_layout") == "" || params->node.GetAttr("out_layout") == "NCDHW"); - TVM_FFI_ICHECK_EQ(params->node.GetAttr("kernel_layout"), "OIDHW"); + TVM_FFI_ICHECK_EQ(params->node.GetAttr("kernel_layout"), "IODHW"); auto dilation = params->node.GetAttr>("dilation"); TVM_FFI_ICHECK_EQ(dilation.size(), 3); TVM_FFI_ICHECK(static_cast(dilation[0]) == 1 && static_cast(dilation[1]) == 1 && @@ -1078,7 +1046,10 @@ class Conv3DTransposeOpConverter : public TensorRTOpConverter { bool use_asymmetric_padding; GetPadding3D(padding, &use_asymmetric_padding, &prepadding, &postpadding); - const int num_outputs = static_cast(params->node.GetAttr("channels")); + // Relax conv3d_transpose uses an IODHW kernel ([in, out, d, h, w]) by default, matching the + // layout TensorRT's deconvolution expects, so the weight passes through unchanged and the + // output channel count is the second kernel dimension. + const int num_outputs = static_cast(weight_shape[1]); const auto kernel_size = nvinfer1::Dims3(weight_shape[2], weight_shape[3], weight_shape[4]); const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type; nvinfer1::Weights bias{weight_type, nullptr, 0}; @@ -1186,7 +1157,7 @@ class PadOpConverter : public TensorRTOpConverter { nvinfer1::DimsHW(static_cast(padding_arr[0]), static_cast(padding_arr[1])); nvinfer1::DimsHW postpadding = nvinfer1::DimsHW(static_cast(padding_arr[2]), static_cast(padding_arr[3])); - auto pad_layer = params->network->addPadding(*input, prepadding, postpadding); + auto pad_layer = params->network->addPaddingNd(*input, prepadding, postpadding); params->outputs.push_back(pad_layer->getOutput(0)); } }; @@ -1282,9 +1253,9 @@ class AdaptivePoolingOpConverter : public TensorRTOpConverter { const auto stride = nvinfer1::DimsHW(h / output_size.h(), w / output_size.w()); const auto window_size = nvinfer1::DimsHW(h - (output_size.h() - 1) * stride.h(), w - (output_size.w() - 1) * stride.w()); - auto pool_layer = params->network->addPooling(*input_tensor, it->second, window_size); + auto pool_layer = params->network->addPoolingNd(*input_tensor, it->second, window_size); TVM_FFI_ICHECK(pool_layer != nullptr); - pool_layer->setStride(stride); + pool_layer->setStrideNd(stride); params->outputs.push_back(pool_layer->getOutput(0)); } }; diff --git a/src/runtime/extra/contrib/tensorrt/tensorrt_ops.h b/src/runtime/extra/contrib/tensorrt/tensorrt_ops.h index 26ea40075458..5e4c30ed7f30 100644 --- a/src/runtime/extra/contrib/tensorrt/tensorrt_ops.h +++ b/src/runtime/extra/contrib/tensorrt/tensorrt_ops.h @@ -35,11 +35,10 @@ #include "NvInfer.h" #include "tensorrt_utils.h" -#if TRT_VERSION_GE(6, 0, 1) -#define TRT_HAS_IMPLICIT_BATCH(params) (params->network->hasImplicitBatchDimension()) -#else -#define TRT_HAS_IMPLICIT_BATCH(params) (true) -#endif +// TensorRT 10 removed implicit-batch mode; every network is explicit-batch. Keep the macro so the +// converters' batch-aware branches read clearly, but it is unconditionally false (and no longer +// calls the deprecated INetworkDefinition::hasImplicitBatchDimension()). +#define TRT_HAS_IMPLICIT_BATCH(params) (false) namespace tvm { namespace runtime { diff --git a/src/runtime/extra/contrib/tensorrt/tensorrt_runtime.cc b/src/runtime/extra/contrib/tensorrt/tensorrt_runtime.cc index 40ca760d96f2..932c52b394dc 100644 --- a/src/runtime/extra/contrib/tensorrt/tensorrt_runtime.cc +++ b/src/runtime/extra/contrib/tensorrt/tensorrt_runtime.cc @@ -40,6 +40,9 @@ #include "../json/json_runtime.h" #ifdef TVM_GRAPH_EXECUTOR_TENSORRT +#include +#include + #include "NvInfer.h" #include "tensorrt_builder.h" #include "tensorrt_calibrator.h" @@ -125,6 +128,10 @@ class TensorRTRuntime : public JSONRuntimeBase { for (size_t i = 0; i < nodes_.size(); ++i) { if (nodes_[i].HasAttr("use_implicit_batch") && nodes_[i].HasAttr("max_workspace_size")) { use_implicit_batch_ = static_cast(nodes_[i].GetAttr("use_implicit_batch")); + if (use_implicit_batch_) { + LOG(WARNING) << "use_implicit_batch=True is ignored: TensorRT 10 removed implicit-batch " + "mode, so the engine is always built and run in explicit-batch mode."; + } // Allow max_workspace_size to be overridden at runtime. size_t runtime_max_workspace_size = support::GetEnv("TVM_TENSORRT_MAX_WORKSPACE_SIZE", size_t(0)); @@ -145,17 +152,20 @@ class TensorRTRuntime : public JSONRuntimeBase { /*! \brief Destroy engines and contexts. */ void DestroyEngines() { for (auto& it : trt_engine_cache_) { + // TensorRT 10 removed obj->destroy(); release with delete. The deserialization runtime must + // outlive the engine it produced, so delete the context, then the engine, then the runtime. VLOG(1) << "Destroying TensorRT context for function '" << it.first.first << "' (batch size " << it.first.second << ")"; - it.second.context->destroy(); + delete it.second.context; VLOG(1) << "Destroying TensorRT engine for function '" << it.first.first << "' (batch size " << it.first.second << ")"; - it.second.engine->destroy(); + delete it.second.engine; + delete it.second.runtime; } trt_engine_cache_.clear(); } - ~TensorRTRuntime() override { + ~TensorRTRuntime() { VLOG(1) << "Destroying TensorRT runtime"; DestroyEngines(); VLOG(1) << "Destroyed TensorRT runtime"; @@ -166,11 +176,13 @@ class TensorRTRuntime : public JSONRuntimeBase { auto& engine_and_context = GetOrBuildEngine(); int batch_size = GetBatchSize(); if (batch_size == 0) return; - auto engine = engine_and_context.engine; auto context = engine_and_context.context; - const int num_bindings = engine->getNbBindings(); - std::vector bindings(num_bindings, nullptr); - std::vector binding_sizes(num_bindings, 0); + + // TensorRT 10 uses named-tensor I/O (setInputShape/setTensorAddress/enqueueV3, no binding + // indices). Track input device pointers and per-sample element counts for the INT8 calibrator. + std::vector input_bindings; + std::vector input_binding_sizes; + // Setup input bindings. for (size_t i = 0; i < input_nodes_.size(); ++i) { auto nid = input_nodes_[i]; @@ -178,28 +190,28 @@ class TensorRTRuntime : public JSONRuntimeBase { for (size_t j = 0; j < nodes_[nid].GetOpShape().size(); ++j) { uint32_t eid = EntryID(nid, j); const std::string name = nodes_[nid].GetOpName() + "_" + std::to_string(j); - int binding_index = engine->getBindingIndex(name.c_str()); - TVM_FFI_ICHECK_NE(binding_index, -1); -#if TRT_VERSION_GE(6, 0, 1) - if (!use_implicit_batch_) { - std::vector shape(data_entry_[eid]->shape, - data_entry_[eid]->shape + data_entry_[eid]->ndim); - auto dims = VectorToTrtDims(shape); - TVM_FFI_ICHECK(context->setBindingDimensions(binding_index, dims)); - } -#endif + std::vector shape(data_entry_[eid]->shape, + data_entry_[eid]->shape + data_entry_[eid]->ndim); + auto dims = VectorToTrtDims(shape); + TVM_FFI_ICHECK(context->setInputShape(name.c_str(), dims)); + + void* device_ptr = nullptr; if (data_entry_[eid]->device.device_type == kDLCUDA) { - bindings[binding_index] = data_entry_[eid]->data; + device_ptr = data_entry_[eid]->data; } else { - auto device_buffer = GetOrAllocateDeviceBuffer(eid, binding_index); + auto device_buffer = GetOrAllocateDeviceBuffer(name, eid); device_buffer.CopyFrom(data_entry_[eid]); - bindings[binding_index] = device_buffer->data; + device_ptr = device_buffer->data; } + TVM_FFI_ICHECK(context->setTensorAddress(name.c_str(), device_ptr)); - auto dims = engine->getBindingDimensions(binding_index); + // Per-sample element count (exclude the batch dimension d[0]); the INT8 calibrator + // multiplies by the batch size itself when copying calibration data, so including the + // batch dim here would over-read the device buffer by a factor of batch_size. int num_elements = 1; - for (int i = 0; i < dims.nbDims; ++i) num_elements *= dims.d[i]; - binding_sizes[binding_index] = num_elements; + for (int k = 1; k < dims.nbDims; ++k) num_elements *= dims.d[k]; + input_bindings.push_back(device_ptr); + input_binding_sizes.push_back(static_cast(num_elements)); } } } @@ -209,7 +221,7 @@ class TensorRTRuntime : public JSONRuntimeBase { if (calibrator_ != nullptr) { LOG(INFO) << "Starting adding last " << num_calibration_batches_remaining_ << "-th batch data to the calibrator"; - calibrator_->AddBatchData(bindings, binding_sizes); + calibrator_->AddBatchData(input_bindings, input_binding_sizes); num_calibration_batches_remaining_--; } return; @@ -219,34 +231,31 @@ class TensorRTRuntime : public JSONRuntimeBase { for (size_t i = 0; i < outputs_.size(); ++i) { uint32_t eid = EntryID(outputs_[i]); const std::string& name = engine_and_context.outputs[i]; - int binding_index = engine->getBindingIndex(name.c_str()); - TVM_FFI_ICHECK_NE(binding_index, -1); + void* device_ptr = nullptr; if (data_entry_[eid]->device.device_type == kDLCUDA) { - bindings[binding_index] = data_entry_[eid]->data; + device_ptr = data_entry_[eid]->data; } else { - auto device_buffer = GetOrAllocateDeviceBuffer(eid, binding_index); - bindings[binding_index] = device_buffer->data; + auto device_buffer = GetOrAllocateDeviceBuffer(name, eid); + device_ptr = device_buffer->data; } + TVM_FFI_ICHECK(context->setTensorAddress(name.c_str(), device_ptr)); } -#if TRT_VERSION_GE(6, 0, 1) - if (use_implicit_batch_) { - TVM_FFI_ICHECK(context->execute(batch_size, bindings.data())) << "Running TensorRT failed."; - } else { - TVM_FFI_ICHECK(context->executeV2(bindings.data())) << "Running TensorRT failed."; - } -#else - TVM_FFI_ICHECK(context->execute(batch_size, bindings.data())) << "Running TensorRT failed."; -#endif + // Run on TVM's current CUDA stream so the engine is ordered after the inputs produced upstream + // (and to avoid TensorRT's default-stream synchronization warning). enqueueV3 is async-only in + // TRT10, so synchronize afterwards to preserve Run()'s blocking semantics. + const DLDevice& dev = data_entry_[input_var_eid_[0]]->device; + const int device_id = dev.device_type == kDLCUDA ? dev.device_id : 0; + cudaStream_t stream = static_cast(TVMFFIEnvGetStream(kDLCUDA, device_id)); + TVM_FFI_ICHECK(context->enqueueV3(stream)) << "Running TensorRT failed."; + TVM_FFI_CHECK_CUDA_ERROR(cudaStreamSynchronize(stream)); // Copy outputs from GPU buffers if needed. for (size_t i = 0; i < outputs_.size(); ++i) { uint32_t eid = EntryID(outputs_[i]); const std::string& name = engine_and_context.outputs[i]; - int binding_index = engine->getBindingIndex(name.c_str()); - TVM_FFI_ICHECK_NE(binding_index, -1); if (data_entry_[eid]->device.device_type != kDLCUDA) { - auto device_buffer = GetOrAllocateDeviceBuffer(eid, binding_index); + auto device_buffer = GetOrAllocateDeviceBuffer(name, eid); device_buffer.CopyTo(const_cast(data_entry_[eid])); } } @@ -269,8 +278,11 @@ class TensorRTRuntime : public JSONRuntimeBase { } return false; } - // Check for engine with compatible max_batch_size. - if (batch_size <= max_batch_size_) { + // Single-engine mode: TensorRT 10 engines are explicit-batch and their optimization profile + // pins the built batch size, so a cached engine can only serve that exact batch. Require an + // exact match (otherwise a smaller batch would be rejected by setInputShape) and rebuild on any + // change. This replaces the implicit-batch "any batch <= max" reuse that TRT10 removed. + if (batch_size == max_batch_size_) { *compatible_engine_batch_size = max_batch_size_; return true; } @@ -325,8 +337,8 @@ class TensorRTRuntime : public JSONRuntimeBase { void BuildEngineFromJson(int batch_size) { const bool use_fp16 = support::GetEnv("TVM_TENSORRT_USE_FP16", false) || use_fp16_; - TensorRTBuilder builder(&logger_, data_entry_, max_workspace_size_, use_implicit_batch_, - use_fp16, batch_size, calibrator_.get()); + TensorRTBuilder builder(&logger_, data_entry_, max_workspace_size_, use_fp16, + calibrator_.get()); for (size_t i = 0; i < input_nodes_.size(); ++i) { auto nid = input_nodes_[i]; const auto& node = nodes_[nid]; @@ -372,11 +384,20 @@ class TensorRTRuntime : public JSONRuntimeBase { infile.close(); std::string serialized_engine; LoadBinaryFromFile(path, &serialized_engine); - // Deserialize engine + // Deserialize engine. TensorRT 10 dropped the trailing IPluginFactory* argument and the runtime + // must outlive the engine, so it is owned by the cached TensorRTEngineAndContext. nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger_); TensorRTEngineAndContext engine_and_context; + engine_and_context.runtime = runtime; engine_and_context.engine = - runtime->deserializeCudaEngine(&serialized_engine[0], serialized_engine.size(), nullptr); + runtime->deserializeCudaEngine(&serialized_engine[0], serialized_engine.size()); + if (engine_and_context.engine == nullptr) { + // A stale or incompatible (e.g. different TensorRT version) .plan file. Drop it and rebuild. + delete runtime; + LOG(WARNING) << "Failed to deserialize cached TensorRT engine from " << path + << "; it will be rebuilt."; + return false; + } engine_and_context.context = engine_and_context.engine->createExecutionContext(); // Load metadata namespace json = ::tvm::ffi::json; @@ -424,7 +445,7 @@ class TensorRTRuntime : public JSONRuntimeBase { trt_engine_cache_[std::make_pair(symbol_name_, batch_size)].engine->serialize(); SaveBinaryToFile(path, std::string(static_cast(serialized_engine->data()), serialized_engine->size())); - serialized_engine->destroy(); + delete serialized_engine; // Serialize metadata namespace json = ::tvm::ffi::json; json::Object meta_obj; @@ -454,26 +475,27 @@ class TensorRTRuntime : public JSONRuntimeBase { return symbol_name_ + (support::GetEnv("TVM_TENSORRT_USE_FP16", false) ? "_fp16" : "_fp32"); } - /*! \brief Retreive a GPU buffer for input or output or allocate if needed. */ - Tensor GetOrAllocateDeviceBuffer(int entry_id, int binding_index) { + /*! \brief Retreive a GPU buffer for input or output or allocate if needed. Keyed by TensorRT IO + * tensor name (TRT10 has no binding indices). */ + Tensor GetOrAllocateDeviceBuffer(const std::string& name, int entry_id) { std::vector shape(data_entry_[entry_id]->shape, data_entry_[entry_id]->shape + data_entry_[entry_id]->ndim); - if (device_buffers_.count(binding_index)) { + if (device_buffers_.count(name)) { // Buffer is already initialized. - if (shape[0] > device_buffers_[binding_index]->shape[0]) { + if (shape[0] > device_buffers_[name]->shape[0]) { // Buffer is too small. Need to allocate bigger buffer. - device_buffers_[binding_index] = + device_buffers_[name] = runtime::Tensor::Empty(shape, data_entry_[entry_id]->dtype, {kDLCUDA, 0}); - } else if (shape[0] < device_buffers_[binding_index]->shape[0]) { + } else if (shape[0] < device_buffers_[name]->shape[0]) { // Buffer is too large. Create view. - return device_buffers_[binding_index].CreateView(shape, data_entry_[entry_id]->dtype); + return device_buffers_[name].CreateView(shape, data_entry_[entry_id]->dtype); } } else { // Buffer not initialized yet. - device_buffers_[binding_index] = + device_buffers_[name] = runtime::Tensor::Empty(shape, data_entry_[entry_id]->dtype, {kDLCUDA, 0}); } - return device_buffers_.at(binding_index); + return device_buffers_.at(name); } void CreateInt8Calibrator(const TensorRTEngineAndContext& engine_and_context) { @@ -498,7 +520,7 @@ class TensorRTRuntime : public JSONRuntimeBase { * is not "cuda". Since TensorRT execution can only read data from GPU, we need to copy data from * the runtime device to these buffers first. These will be allocated for the highest batch size * used by all engines. */ - std::unordered_map device_buffers_; + std::unordered_map device_buffers_; /*! \brief TensorRT logger. */ TensorRTLogger logger_; diff --git a/src/runtime/extra/contrib/tensorrt/tensorrt_utils.h b/src/runtime/extra/contrib/tensorrt/tensorrt_utils.h index ab9b169f26d6..e0c06f018be4 100644 --- a/src/runtime/extra/contrib/tensorrt/tensorrt_utils.h +++ b/src/runtime/extra/contrib/tensorrt/tensorrt_utils.h @@ -30,6 +30,15 @@ #include "NvInfer.h" +// This integration targets the TensorRT 10 API. TensorRT 10 removed a large set of APIs the +// pre-TRT10 code relied on (implicit batch, binding indices, addConvolution/addPooling/addPadding, +// IFullyConnectedLayer, IBuilder::setMaxBatchSize, IBuilderConfig::setMaxWorkspaceSize, +// IExecutionContext::execute, obj->destroy(), ...). Emit a clear error instead of a flood of +// "has no member" diagnostics on older releases. +#if !defined(NV_TENSORRT_MAJOR) || NV_TENSORRT_MAJOR < 10 +#error "TVM's TensorRT runtime requires TensorRT 10.0 or newer (or set USE_TENSORRT_RUNTIME=OFF)." +#endif + // There is a conflict between cpplint and clang-format-10. // clang-format off #define TRT_VERSION_GE(major, minor, patch) \ @@ -42,18 +51,18 @@ namespace runtime { namespace contrib { /*! - * \brief Helper function to convert an vector to TRT Dims. - * \param vec Vector. + * \brief Helper function to convert a vector-like container to TRT Dims. + * \param vec A container supporting size() and operator[] (e.g. std::vector or ffi::Array). * \return TRT Dims. */ -template -inline nvinfer1::Dims VectorToTrtDims(const std::vector& vec) { +template +inline nvinfer1::Dims VectorToTrtDims(const Container& vec) { nvinfer1::Dims dims; // Dims(nbDims=0, d[0]=1) is used to represent a scalar in TRT. dims.d[0] = 1; - dims.nbDims = vec.size(); + dims.nbDims = static_cast(vec.size()); for (size_t i = 0; i < vec.size(); ++i) { - dims.d[i] = vec[i]; + dims.d[i] = static_cast(vec[i]); } return dims; } diff --git a/tests/python/relax/test_codegen_tensorrt.py b/tests/python/relax/test_codegen_tensorrt.py index b8bae635b39d..68ad54cd4ecd 100644 --- a/tests/python/relax/test_codegen_tensorrt.py +++ b/tests/python/relax/test_codegen_tensorrt.py @@ -112,5 +112,207 @@ def get_ref(): tvm.testing.assert_allclose(out, ref, rtol=1e-3, atol=1e-3) +def _offload_and_compare(mod, params_np, patterns, data_np, rtol=1e-2, atol=1e-2): + """Offload a single-op module to TensorRT and compare against the LLVM reference. + + Each module here contains a single instance of the op under test, which both exercises the + individual converter and avoids the structurally-identical-composite deduplication that would + otherwise collapse repeated ops. + """ + ref = build_and_run(mod, [data_np, *params_np.values()], "llvm", legalize=True) + offloaded = tvm.transform.Sequential( + [ + relax.transform.BindParams("main", params_np), + relax.transform.FuseOpsByPattern(patterns), + relax.transform.MergeCompositeFunctions(), + relax.transform.RunCodegen(), + ] + )(mod) + out = build_and_run(offloaded, [data_np], "cuda") + tvm.testing.assert_allclose(out, ref, rtol=rtol, atol=atol) + + +def test_tensorrt_conv1d(): + # Regression test: explicit-batch (batch > 1) 1D convolution. The pre-TRT10 converter assumed an + # implicit batch dimension and dropped the spatial dimension under explicit batch. + @tvm.script.ir_module + class Conv1d: + @R.function + def main(data: R.Tensor((2, 8, 16), "float32"), weight: R.Tensor((4, 8, 3), "float32")): + with R.dataflow(): + out = relax.op.nn.conv1d(data, weight, padding=1) + R.output(out) + return out + + data = np.random.randn(2, 8, 16).astype("float32") + weight = np.random.randn(4, 8, 3).astype("float32") + patterns = [("tensorrt.nn.conv1d", is_op("relax.nn.conv1d")(wildcard(), wildcard()))] + _offload_and_compare(Conv1d, {"weight": weight}, patterns, data) + + +def test_tensorrt_max_pool2d(): + @tvm.script.ir_module + class MaxPool: + @R.function + def main(data: R.Tensor((2, 8, 16, 16), "float32")): + with R.dataflow(): + out = relax.op.nn.max_pool2d(data, pool_size=(2, 2), strides=(2, 2)) + R.output(out) + return out + + data = np.random.randn(2, 8, 16, 16).astype("float32") + patterns = [("tensorrt.nn.max_pool2d", is_op("relax.nn.max_pool2d")(wildcard()))] + _offload_and_compare(MaxPool, {}, patterns, data) + + +def test_tensorrt_avg_pool2d(): + @tvm.script.ir_module + class AvgPool: + @R.function + def main(data: R.Tensor((2, 8, 16, 16), "float32")): + with R.dataflow(): + out = relax.op.nn.avg_pool2d(data, pool_size=(2, 2), strides=(2, 2)) + R.output(out) + return out + + data = np.random.randn(2, 8, 16, 16).astype("float32") + patterns = [("tensorrt.nn.avg_pool2d", is_op("relax.nn.avg_pool2d")(wildcard()))] + _offload_and_compare(AvgPool, {}, patterns, data) + + +def test_tensorrt_softmax(): + @tvm.script.ir_module + class Softmax: + @R.function + def main(data: R.Tensor((2, 8, 16, 16), "float32")): + with R.dataflow(): + out = relax.op.nn.softmax(data, axis=1) + R.output(out) + return out + + data = np.random.randn(2, 8, 16, 16).astype("float32") + patterns = [("tensorrt.nn.softmax", is_op("relax.nn.softmax")(wildcard()))] + _offload_and_compare(Softmax, {}, patterns, data) + + +def test_tensorrt_sigmoid(): + @tvm.script.ir_module + class Sigmoid: + @R.function + def main(data: R.Tensor((2, 8, 16, 16), "float32")): + with R.dataflow(): + out = relax.op.sigmoid(data) + R.output(out) + return out + + data = np.random.randn(2, 8, 16, 16).astype("float32") + patterns = [("tensorrt.sigmoid", is_op("relax.sigmoid")(wildcard()))] + _offload_and_compare(Sigmoid, {}, patterns, data) + + +def test_tensorrt_tanh(): + @tvm.script.ir_module + class Tanh: + @R.function + def main(data: R.Tensor((2, 8, 16, 16), "float32")): + with R.dataflow(): + out = relax.op.tanh(data) + R.output(out) + return out + + data = np.random.randn(2, 8, 16, 16).astype("float32") + patterns = [("tensorrt.tanh", is_op("relax.tanh")(wildcard()))] + _offload_and_compare(Tanh, {}, patterns, data) + + +def test_tensorrt_conv2d_transpose(): + # Default IOHW kernel layout ([in, out, h, w]); output channels are weight_shape[1]. + @tvm.script.ir_module + class ConvTranspose: + @R.function + def main( + data: R.Tensor((2, 8, 16, 16), "float32"), weight: R.Tensor((8, 4, 3, 3), "float32") + ): + with R.dataflow(): + out = relax.op.nn.conv2d_transpose(data, weight, padding=1) + R.output(out) + return out + + data = np.random.randn(2, 8, 16, 16).astype("float32") + weight = np.random.randn(8, 4, 3, 3).astype("float32") + patterns = [ + ("tensorrt.nn.conv2d_transpose", is_op("relax.nn.conv2d_transpose")(wildcard(), wildcard())) + ] + _offload_and_compare(ConvTranspose, {"weight": weight}, patterns, data) + + +def test_tensorrt_conv3d_transpose(): + # Default IODHW kernel layout ([in, out, d, h, w]); output channels are weight_shape[1]. + @tvm.script.ir_module + class ConvTranspose3d: + @R.function + def main( + data: R.Tensor((2, 4, 8, 8, 8), "float32"), weight: R.Tensor((4, 2, 3, 3, 3), "float32") + ): + with R.dataflow(): + out = relax.op.nn.conv3d_transpose(data, weight, padding=1) + R.output(out) + return out + + data = np.random.randn(2, 4, 8, 8, 8).astype("float32") + weight = np.random.randn(4, 2, 3, 3, 3).astype("float32") + patterns = [ + ("tensorrt.nn.conv3d_transpose", is_op("relax.nn.conv3d_transpose")(wildcard(), wildcard())) + ] + _offload_and_compare(ConvTranspose3d, {"weight": weight}, patterns, data) + + +def test_tensorrt_int8_calibration(monkeypatch): + # INT8 calibration path: the first N runs feed calibration batches, then the INT8 engine is + # built and run. Validates that the calibrator copies a full batch (batch_size * per-sample + # elements) without over-reading the input or over-writing the device buffers, which previously + # crashed for batch > 1. + @tvm.script.ir_module + class Conv2dInt8: + @R.function + def main( + data: R.Tensor((2, 8, 16, 16), "float32"), weight: R.Tensor((4, 8, 3, 3), "float32") + ): + with R.dataflow(): + out = relax.op.nn.conv2d(data, weight, padding=1) + R.output(out) + return out + + data = np.random.randn(2, 8, 16, 16).astype("float32") + weight = np.random.randn(4, 8, 3, 3).astype("float32") + ref = build_and_run(Conv2dInt8, [data, weight], "llvm", legalize=True) + + patterns = [("tensorrt.nn.conv2d", is_op("relax.nn.conv2d")(wildcard(), wildcard()))] + offloaded = tvm.transform.Sequential( + [ + relax.transform.BindParams("main", {"weight": weight}), + relax.transform.FuseOpsByPattern(patterns), + relax.transform.MergeCompositeFunctions(), + relax.transform.RunCodegen(), + ] + )(Conv2dInt8) + + num_calibration_batches = 2 + monkeypatch.setenv("TVM_TENSORRT_USE_INT8", "1") + monkeypatch.setenv("TENSORRT_NUM_CALI_INT8", str(num_calibration_batches)) + + dev = tvm.device("cuda", 0) + vm = relax.VirtualMachine(tvm.compile(offloaded, "cuda"), dev) + data_trt = tvm.runtime.tensor(data, dev) + out = None + for _ in range(num_calibration_batches + 1): + out = vm["main"](data_trt).numpy() + + assert np.isfinite(out).all() + # INT8 is lossy, so use a generous tolerance; the key assertion is that calibration completed + # without a CUDA error. + tvm.testing.assert_allclose(out, ref, rtol=0.2, atol=0.1 * float(np.abs(ref).max())) + + if __name__ == "__main__": - test_tensorrt_offload() + tvm.testing.main()