diff --git a/src/infer_response.cc b/src/infer_response.cc
index 5a898a7e..b02cdfd5 100644
--- a/src/infer_response.cc
+++ b/src/infer_response.cc
@@ -1,4 +1,4 @@
-// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -39,8 +39,10 @@ namespace triton { namespace backend { namespace python {
 
 InferResponse::InferResponse(
     const std::vector<std::shared_ptr<PbTensor>>& output_tensors,
-    std::shared_ptr<PbError> error, const bool is_last_response, void* id)
-    : error_(error), is_last_response_(is_last_response), id_(id)
+    std::shared_ptr<PbError> error, std::string parameters,
+    const bool is_last_response, void* id)
+    : error_(error), is_last_response_(is_last_response), id_(id),
+      parameters_(std::move(parameters))
 {
   for (auto& output : output_tensors) {
     if (!output) {
@@ -58,6 +60,12 @@ InferResponse::OutputTensors()
   return output_tensors_;
 }
 
+std::string&
+InferResponse::Parameters()
+{
+  return parameters_;
+}
+
 bool
 InferResponse::HasError()
 {
@@ -106,6 +114,9 @@ InferResponse::SaveToSharedMemory(
       j++;
     }
     response_shm_ptr->id = id_;
+
+    parameters_shm_ = PbString::Create(shm_pool, parameters_);
+    response_shm_ptr->parameters = parameters_shm_->ShmHandle();
   }
 }
 
@@ -143,6 +154,8 @@ InferResponse::LoadFromSharedMemory(
 
   std::shared_ptr<PbError> pb_error;
   std::vector<std::shared_ptr<PbTensor>> output_tensors;
+  std::shared_ptr<PbString> parameters_shm;
+  std::string parameters;
 
   // If the error field is set, do not load output tensors from shared memory.
   if (response_shm_ptr->has_error && response_shm_ptr->is_error_set) {
@@ -154,26 +167,34 @@ InferResponse::LoadFromSharedMemory(
     bi::managed_external_buffer::handle_t* tensor_handle_shm =
         reinterpret_cast<bi::managed_external_buffer::handle_t*>(
            response_shm.data_.get() + sizeof(ResponseShm));
+    {
 #ifdef TRITON_PB_STUB
-    // Need to acquire the GIL to avoid hangs.
-    py::gil_scoped_acquire acquire;
+      // Need to acquire the GIL to avoid hangs.
+      py::gil_scoped_acquire acquire;
 #endif
-    for (size_t idx = 0; idx < requested_output_count; ++idx) {
-      std::shared_ptr<PbTensor> pb_tensor = PbTensor::LoadFromSharedMemory(
-          shm_pool, tensor_handle_shm[idx], open_cuda_handle);
-      output_tensors.emplace_back(std::move(pb_tensor));
+      for (size_t idx = 0; idx < requested_output_count; ++idx) {
+        std::shared_ptr<PbTensor> pb_tensor = PbTensor::LoadFromSharedMemory(
+            shm_pool, tensor_handle_shm[idx], open_cuda_handle);
+        output_tensors.emplace_back(std::move(pb_tensor));
+      }
     }
+
+    parameters_shm = std::move(
+        PbString::LoadFromSharedMemory(shm_pool, response_shm_ptr->parameters));
+    parameters = parameters_shm->String();
   }
 
   return std::unique_ptr<InferResponse>(new InferResponse(
       response_shm, output_tensors, pb_error,
-      response_shm_ptr->is_last_response, response_shm_ptr->id));
+      response_shm_ptr->is_last_response, response_shm_ptr->id, parameters_shm,
+      parameters));
 }
 
 InferResponse::InferResponse(
     AllocatedSharedMemory<char>& response_shm,
     std::vector<std::shared_ptr<PbTensor>>& output_tensors,
-    std::shared_ptr<PbError>& pb_error, const bool is_last_response, void* id)
+    std::shared_ptr<PbError>& pb_error, const bool is_last_response, void* id,
+    std::shared_ptr<PbString>& parameters_shm, std::string& parameters)
 {
   response_shm_ = std::move(response_shm);
   output_tensors_ = std::move(output_tensors);
@@ -181,6 +202,8 @@ InferResponse::InferResponse(
   shm_handle_ = response_shm_.handle_;
   id_ = id;
   is_last_response_ = is_last_response;
+  parameters_shm_ = std::move(parameters_shm);
+  parameters_ = std::move(parameters);
 }
 
 std::shared_ptr<PbError>&
@@ -387,6 +410,38 @@ InferResponse::Send(
     cuda_copy |= cuda_used;
   }
 
+  if (!parameters_.empty()) {
+    triton::common::TritonJson::Value param;
+    THROW_IF_TRITON_ERROR(
+        param.Parse(parameters_.c_str(), parameters_.length()));
+    std::vector<std::string> param_keys;
+    THROW_IF_TRITON_ERROR(param.Members(&param_keys));
+    for (const auto& key : param_keys) {
+      triton::common::TritonJson::Value value;
+      if (!param.Find(key.c_str(), &value)) {
+        throw PythonBackendException("Unexpected missing key on parameters");
+      }
+      if (value.IsString()) {
+        std::string string_value;
+        THROW_IF_TRITON_ERROR(value.AsString(&string_value));
+        THROW_IF_TRITON_ERROR(TRITONBACKEND_ResponseSetStringParameter(
+            response, key.c_str(), string_value.c_str()));
+      } else if (value.IsInt()) {
+        int64_t int_value = 0;
+        THROW_IF_TRITON_ERROR(value.AsInt(&int_value));
+        THROW_IF_TRITON_ERROR(TRITONBACKEND_ResponseSetIntParameter(
+            response, key.c_str(), int_value));
+      } else if (value.IsBool()) {
+        bool bool_value = false;
+        THROW_IF_TRITON_ERROR(value.AsBool(&bool_value));
+        THROW_IF_TRITON_ERROR(TRITONBACKEND_ResponseSetBoolParameter(
+            response, key.c_str(), bool_value));
+      } else {
+        throw PythonBackendException("Unsupported value type on parameters");
+      }
+    }
+  }
+
 #ifdef TRITON_ENABLE_GPU
   if (cuda_copy) {
     cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(cuda_stream));
diff --git a/src/infer_response.h b/src/infer_response.h
index bdf31bb4..1af69b20 100644
--- a/src/infer_response.h
+++ b/src/infer_response.h
@@ -1,4 +1,4 @@
-// Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -38,6 +38,7 @@ namespace triton { namespace backend { namespace python {
 
 struct ResponseShm {
   uint32_t outputs_size;
+  bi::managed_external_buffer::handle_t parameters;
   bi::managed_external_buffer::handle_t error;
   bool has_error;
   // Indicates whether this error has a message or not.
@@ -72,9 +73,10 @@ class InferResponse {
  public:
   InferResponse(
       const std::vector<std::shared_ptr<PbTensor>>& output_tensors,
-      std::shared_ptr<PbError> error = nullptr,
+      std::shared_ptr<PbError> error = nullptr, std::string parameters = "",
       const bool is_last_response = true, void* id = nullptr);
   std::vector<std::shared_ptr<PbTensor>>& OutputTensors();
+  std::string& Parameters();
   void SaveToSharedMemory(
       std::unique_ptr<SharedMemoryManager>& shm_pool, bool copy_gpu = true);
   static std::unique_ptr<InferResponse> LoadFromSharedMemory(
@@ -116,8 +118,8 @@ class InferResponse {
   InferResponse(
       AllocatedSharedMemory<char>& response_shm,
       std::vector<std::shared_ptr<PbTensor>>& output_tensors,
-      std::shared_ptr<PbError>& pb_error, const bool is_last_response,
-      void* id);
+      std::shared_ptr<PbError>& pb_error, const bool is_last_response, void* id,
+      std::shared_ptr<PbString>& parameters_shm, std::string& parameters);
 
   std::vector<std::shared_ptr<PbTensor>> output_tensors_;
   std::shared_ptr<PbError> error_;
@@ -128,6 +130,9 @@ class InferResponse {
   bool is_last_response_;
   // Representing the request id that the response was created from.
   void* id_;
+
+  std::shared_ptr<PbString> parameters_shm_;
+  std::string parameters_;
 };
 
 }}}  // namespace triton::backend::python
diff --git a/src/pb_stub.cc b/src/pb_stub.cc
index 51df5aa2..cc8719a3 100644
--- a/src/pb_stub.cc
+++ b/src/pb_stub.cc
@@ -1,4 +1,4 @@
-// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -1874,11 +1874,44 @@ PYBIND11_EMBEDDED_MODULE(c_python_backend_utils, module)
               std::shared_ptr<PbError>>(),
           py::arg("output_tensors") = py::list(),
           py::arg("error") = static_cast<std::shared_ptr<PbError>>(nullptr))
+      .def(
+          py::init([](const std::vector<std::shared_ptr<PbTensor>>&
+                          output_tensors,
+                      std::shared_ptr<PbError> error,
+                      const py::object& parameters_) {
+            py::dict parameters =
+                PyDefaultArgumentToMutableType<py::dict>(parameters_);
+            for (const auto& pair : parameters) {
+              if (!py::isinstance<py::str>(pair.first)) {
+                throw PythonBackendException(
+                    "Expect parameters keys to have type str, found type " +
+                    std::string(py::str(pair.first.get_type())));
+              }
+              if (!py::isinstance<py::bool_>(pair.second) &&
+                  !py::isinstance<py::int_>(pair.second) &&
+                  !py::isinstance<py::str>(pair.second)) {
+                throw PythonBackendException(
+                    "Expect parameters values to have type bool/int/str, found "
+                    "type " +
+                    std::string(py::str(pair.second.get_type())));
+              }
+            }
+            py::module_ py_json = py::module_::import("json");
+            std::string parameters_str =
+                py::str(py_json.attr("dumps")(parameters));
+
+            return std::make_shared<InferResponse>(
+                output_tensors, error, parameters_str /* parameters */);
+          }),
+          py::arg("output_tensors") = py::list(),
+          py::arg("error") = static_cast<std::shared_ptr<PbError>>(nullptr),
+          py::arg("parameters") = py::str())
       .def(
           "output_tensors", &InferResponse::OutputTensors,
           py::return_value_policy::reference)
       .def("has_error", &InferResponse::HasError)
-      .def("error", &InferResponse::Error);
+      .def("error", &InferResponse::Error)
+      .def("parameters", &InferResponse::Parameters);
 
   py::class_<ResponseSender, std::shared_ptr<ResponseSender>>(
       module, "InferenceResponseSender")
diff --git a/src/request_executor.cc b/src/request_executor.cc
index 78fa6ea2..c197948d 100644
--- a/src/request_executor.cc
+++ b/src/request_executor.cc
@@ -1,4 +1,4 @@
-// Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -153,20 +153,22 @@ InferResponseComplete(
       output_tensors.clear();
     }
 
+    // TODO: [DLIS-7864] Pass response parameters from BLS response.
     if (!infer_payload->IsDecoupled()) {
       infer_response = std::make_unique<InferResponse>(
-          output_tensors, pb_error, true /* is_last_response */);
+          output_tensors, pb_error, "" /* parameters */,
+          true /* is_last_response */);
     } else {
       if ((flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) == 0) {
         // Not the last response.
         infer_response = std::make_unique<InferResponse>(
-            output_tensors, pb_error, false /* is_last_response */,
-            userp /* id */);
+            output_tensors, pb_error, "" /* parameters */,
+            false /* is_last_response */, userp /* id */);
       } else {
         // The last response.
         infer_response = std::make_unique<InferResponse>(
-            output_tensors, pb_error, true /* is_last_response */,
-            userp /* id */);
+            output_tensors, pb_error, "" /* parameters */,
+            true /* is_last_response */, userp /* id */);
       }
     }
 
@@ -178,11 +180,13 @@ InferResponseComplete(
       (flags & TRITONSERVER_RESPONSE_COMPLETE_FINAL) != 0) {
     // An empty response may be the last response for decoupled models.
     infer_response = std::make_unique<InferResponse>(
-        output_tensors, pb_error, true /* is_last_response */, userp /* id */);
+        output_tensors, pb_error, "" /* parameters */,
+        true /* is_last_response */, userp /* id */);
   } else {
     pb_error = std::make_shared<PbError>("Unexpected empty response.");
     infer_response = std::make_unique<InferResponse>(
-        output_tensors, pb_error, true /* is_last_response */, userp /* id */);
+        output_tensors, pb_error, "" /* parameters */,
+        true /* is_last_response */, userp /* id */);
   }
 
   infer_payload->SetValue(std::move(infer_response));
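
Reviewer note: below is a minimal, hypothetical sketch (not part of this diff) of how a model's model.py could use the new `parameters` argument once this change lands. The model and tensor names are made up; the behavior shown (keys must be str, values bool/int/str, the dict serialized to a JSON string that `parameters()` returns and that `InferResponse::Send` forwards via `TRITONBACKEND_ResponseSet*Parameter`) follows from the pb_stub.cc binding and infer_response.cc changes above. Per the DLIS-7864 TODO in request_executor.cc, parameters on BLS responses are not propagated yet and would read back as an empty string.

# sketch_model.py -- illustrative only; "OUTPUT0" and the parameter keys are hypothetical.
import json

import numpy as np
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def execute(self, requests):
        responses = []
        for _ in requests:
            out = pb_utils.Tensor("OUTPUT0", np.array([1, 2, 3], dtype=np.int32))
            # New in this PR: per-response parameters. Keys must be str and
            # values bool/int/str; other types raise a PythonBackendException.
            response = pb_utils.InferenceResponse(
                output_tensors=[out],
                parameters={"num_detections": 3, "postprocessed": True},
            )
            # parameters() returns the dict serialized as a JSON string.
            assert json.loads(response.parameters())["num_detections"] == 3
            responses.append(response)
        return responses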