tier4 · Owen-Liuyuxuan · Nov 11, 2024 · Nov 11, 2024 · Nov 12, 2024 · Nov 18, 2024
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -25,6 +25,7 @@ endif()
 
 find_package(CUDA)
 find_package(NVJPEG)
+find_package(autoware_type_adapters)
 find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
 find_library(CULIBOS culibos ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
 if (${NVJPEG_FOUND} AND CUDART_LIBRARY AND CULIBOS)
@@ -150,6 +151,10 @@ if (JETSON)
           rectifier
           jpeg_compressor
           )
+
+  ament_target_dependencies(accelerated_image_processor
+    autoware_type_adapters
+  )
 
   rclcpp_components_register_node(accelerated_image_processor
           PLUGIN "gpu_imgproc::GpuImgProc"

diff --git a/include/accelerator/jpeg_compressor.hpp b/include/accelerator/jpeg_compressor.hpp
@@ -2,6 +2,8 @@
 
 #include <sensor_msgs/msg/image.hpp>
 #include <sensor_msgs/msg/compressed_image.hpp>
+#include "type_adapters/image_container.hpp"
+#include "type_adapters/compressed_image_container.hpp"
 #include <string>
 
 // This needs to be included before other CUDA headers in some environments
@@ -29,7 +31,10 @@ class NvJPEGEncoder;
 namespace JpegCompressor {
 using Image = sensor_msgs::msg::Image;
 using CompressedImage = sensor_msgs::msg::CompressedImage;
-
+using ImageContainer = autoware::type_adaptation::type_adapters::ImageContainer;
+using CompressedImageContainer = autoware::type_adaptation::type_adapters::CompressedImageContainer;
+using ImageContainerUniquePtr = autoware::type_adaptation::type_adapters::ImageContainerUniquePtr;
+using CompressedImageContainerUniquePtr = autoware::type_adaptation::type_adapters::CompressedImageContainerUniquePtr;
 enum class ImageFormat {
     RGB,
     BGR
@@ -42,6 +47,7 @@ class CPUCompressor {
     ~CPUCompressor();
 
     CompressedImage::UniquePtr compress(const Image &msg, int quality = 90, int format = TJPF_RGB, int sampling = TJ_420);
+    CompressedImage::UniquePtr compress(const ImageContainer &msg, int quality = 90, int format = TJPF_RGB, int sampling = TJ_420);
 private:
     tjhandle handle_;
     unsigned char *jpegBuf_;
@@ -56,6 +62,7 @@ class JetsonCompressor {
     ~JetsonCompressor();
 
     CompressedImage::UniquePtr compress(const Image &msg, int quality = 90, ImageFormat format = ImageFormat::RGB);
+    CompressedImage::UniquePtr compress(const ImageContainer &msg, int quality = 90, ImageFormat format = ImageFormat::RGB);
     void setCudaStream(cuda::stream::handle_t &raw_cuda_stream);
 private:
     NvJPEGEncoder *encoder_;
@@ -80,6 +87,7 @@ class NVJPEGCompressor {
     ~NVJPEGCompressor();
 
     CompressedImage::UniquePtr compress(const Image &msg, int quality = 90, ImageFormat format = ImageFormat::RGB);
+    CompressedImage::UniquePtr compress(const ImageContainer &msg, int quality = 90, ImageFormat format = ImageFormat::RGB);
     void setCudaStream(const cudaStream_t &raw_cuda_stream);
 
 private:

diff --git a/include/accelerator/rectifier.hpp b/include/accelerator/rectifier.hpp
@@ -2,7 +2,7 @@
 
 #include <sensor_msgs/msg/image.hpp>
 #include <sensor_msgs/msg/camera_info.hpp>
-
+#include "type_adapters/image_container.hpp"
 #ifdef OPENCV_AVAILABLE
 #include <opencv2/core.hpp>
 #endif
@@ -16,6 +16,8 @@
 
 using CameraInfo = sensor_msgs::msg::CameraInfo;
 using Image = sensor_msgs::msg::Image;
+using ImageContainer = autoware::type_adaptation::type_adapters::ImageContainer;
+using ImageContainerUniquePtr = autoware::type_adaptation::type_adapters::ImageContainerUniquePtr;
 
 namespace Rectifier {
 
@@ -44,6 +46,7 @@ class NPPRectifier {
     cudaStream_t& GetCudaStream() {return stream_;}
 
     Image::UniquePtr rectify(const Image &msg);
+    ImageContainerUniquePtr rectify(const ImageContainer &msg);
 private:
     Npp32f *pxl_map_x_;
     Npp32f *pxl_map_y_;
@@ -66,6 +69,7 @@ class OpenCVRectifierCPU {
     ~OpenCVRectifierCPU();
 
     Image::UniquePtr rectify(const Image &msg);
+    ImageContainerUniquePtr rectify(const ImageContainer &msg);
 private:
     cv::Mat map_x_;
     cv::Mat map_y_;
@@ -83,6 +87,7 @@ class OpenCVRectifierGPU {
     ~OpenCVRectifierGPU();
 
     Image::UniquePtr rectify(const Image &msg);
+    ImageContainerUniquePtr rectify(const ImageContainer &msg);
 private:
     cv::cuda::GpuMat map_x_;
     cv::cuda::GpuMat map_y_;

diff --git a/include/gpu_imgproc/gpu_imgproc.hpp b/include/gpu_imgproc/gpu_imgproc.hpp
@@ -5,6 +5,8 @@
 
 #include "accelerator/rectifier.hpp"
 #include "accelerator/jpeg_compressor.hpp"
+#include "type_adapters/image_container.hpp"
+#include "type_adapters/compressed_image_container.hpp"
 // #include <sensor_msgs/msg/compressed_image.hpp>
 
 namespace gpu_imgproc {
@@ -15,8 +17,13 @@ class GpuImgProc : public rclcpp::Node {
     virtual ~GpuImgProc();
 
 private:
+    using ImageContainer = autoware::type_adaptation::type_adapters::ImageContainer;
+    using CompressedImageContainer = autoware::type_adaptation::type_adapters::CompressedImageContainer;
+    using ImageContainerUniquePtr = autoware::type_adaptation::type_adapters::ImageContainerUniquePtr;
+    using CompressedImageContainerUniquePtr = autoware::type_adaptation::type_adapters::CompressedImageContainerUniquePtr;
     void imageCallback(const sensor_msgs::msg::Image::SharedPtr msg);
     void cameraInfoCallback(const sensor_msgs::msg::CameraInfo::SharedPtr msg);
+    void gpuImageCallback(const std::shared_ptr<ImageContainer> msg);
     void determineQosCallback(bool do_rectify);
 
 #if NPP_AVAILABLE
@@ -40,9 +47,11 @@ class GpuImgProc : public rclcpp::Node {
 #endif
 
     rclcpp::Subscription<sensor_msgs::msg::Image>::SharedPtr img_sub_;
+    rclcpp::Subscription<ImageContainer>::SharedPtr gpu_image_sub_;
     rclcpp::Subscription<sensor_msgs::msg::CameraInfo>::SharedPtr info_sub_;
 
     rclcpp::Publisher<sensor_msgs::msg::Image>::SharedPtr rectified_pub_;
+    rclcpp::Publisher<ImageContainer>::SharedPtr gpu_rectified_pub_;
     rclcpp::Publisher<sensor_msgs::msg::CompressedImage>::SharedPtr compressed_pub_;
     rclcpp::Publisher<sensor_msgs::msg::CompressedImage>::SharedPtr rect_compressed_pub_;
 
@@ -51,6 +60,7 @@ class GpuImgProc : public rclcpp::Node {
     Rectifier::Implementation rectifier_impl_;
     Rectifier::MappingImpl mapping_impl_;
     bool rectifier_active_;
+    bool type_adaptation_active_;
     double alpha_;
     int32_t jpeg_quality_;
 };

diff --git a/package.xml b/package.xml
@@ -18,6 +18,7 @@
   <depend>cv_bridge</depend>
   <depend>libturbojpeg</depend>
   <depend>image_geometry</depend>
+  <depend>autoware_type_adapters</depend>
 
   <test_depend>ament_cmake_gtest</test_depend>
   <test_depend>ament_lint_auto</test_depend>

diff --git a/src/accelerator/jpeg_compressor.cpp b/src/accelerator/jpeg_compressor.cpp
@@ -1,7 +1,10 @@
 #include <cstdio>
 #include <cstring>
-
 #include <rclcpp/rclcpp.hpp>
+#include <chrono>
+#include <thread>
+
+#include <cuda_runtime.h>
 
 #include "accelerator/jpeg_compressor.hpp"
 
@@ -71,6 +74,12 @@ CompressedImage::UniquePtr CPUCompressor::compress(const Image &msg, int quality
 
     return compressed_msg;
 }
+CompressedImage::UniquePtr CPUCompressor::compress(const ImageContainer &msg, int quality, int format, int sampling) {
+    sensor_msgs::msg::Image image_msg;
+    msg.get_sensor_msgs_image(image_msg);
+    return compress(image_msg, quality, format, sampling);
+}
+
 #endif
 
 #ifdef JETSON_AVAILABLE
@@ -127,8 +136,22 @@ CompressedImage::UniquePtr JetsonCompressor::compress(const Image &msg, int qual
       TEST_ERROR(buffer_->allocateMemory() != 0, "NvBuffer allocation failed");
 
       encoder_->setCropRect(0, 0, width, height);
+
+      // Synchronize CUDA device to ensure memory allocation is complete
+      std::this_thread::sleep_for(std::chrono::milliseconds(300));
+      cudaError_t cudaStatus = cudaDeviceSynchronize();
+      if (cudaStatus != cudaSuccess) {
+          // Handle error
+          fprintf(stderr, "cudaDeviceSynchronize failed: %s\n", cudaGetErrorString(cudaStatus));
+          // You may want to throw an exception or handle the error appropriately
+      }
     }
 
+    // RCLCPP_ERROR(
+    //     rclcpp::get_logger("v4l2_camera"),
+    //     "dev_yuv_step_bytes_: %d", dev_yuv_step_bytes_[0]
+    // );
+
     TEST_ERROR(cudaMemcpy2DAsync(static_cast<void*>(dev_image_), dev_image_step_bytes_,
                                  static_cast<const void*>(img.data()), msg.step,
                                  msg.step * sizeof(Npp8u),
@@ -148,6 +171,7 @@ CompressedImage::UniquePtr JetsonCompressor::compress(const Image &msg, int qual
     NvBuffer::NvBufferPlane &plane_y = buffer_->planes[0];
     NvBuffer::NvBufferPlane &plane_u = buffer_->planes[1];
     NvBuffer::NvBufferPlane &plane_v = buffer_->planes[2];
+
     TEST_ERROR(cudaMemcpy2DAsync(plane_y.data, plane_y.fmt.stride,
                                  dev_yuv_[0], dev_yuv_step_bytes_[0], width, height,
                                  cudaMemcpyDeviceToHost, stream_.handle()) != cudaSuccess,
@@ -178,6 +202,107 @@ CompressedImage::UniquePtr JetsonCompressor::compress(const Image &msg, int qual
     return compressed_msg;
 }
 
+CompressedImage::UniquePtr JetsonCompressor::compress(const ImageContainer &msg, int quality, ImageFormat format)
+{
+    setCudaStream(msg.cuda_stream()->stream());
+    // Fill elements of  nppStreamContext
+    {
+        npp_stream_context_.hStream = stream_.handle();
+        cudaGetDevice(&npp_stream_context_.nCudaDeviceId);
+        cudaDeviceProp dev_prop;
+        cudaGetDeviceProperties(&dev_prop, npp_stream_context_.nCudaDeviceId);
+        npp_stream_context_.nMultiProcessorCount = dev_prop.multiProcessorCount;
+        npp_stream_context_.nMaxThreadsPerMultiProcessor = dev_prop.maxThreadsPerMultiProcessor;
+        npp_stream_context_.nMaxThreadsPerBlock = dev_prop.maxThreadsPerBlock;
+        npp_stream_context_.nSharedMemPerBlock = dev_prop.sharedMemPerBlock;
+        cudaDeviceGetAttribute(&npp_stream_context_.nCudaDevAttrComputeCapabilityMajor,
+                                cudaDevAttrComputeCapabilityMajor, npp_stream_context_.nCudaDeviceId);
+        cudaDeviceGetAttribute(&npp_stream_context_.nCudaDevAttrComputeCapabilityMinor,
+                                cudaDevAttrComputeCapabilityMinor, npp_stream_context_.nCudaDeviceId);
+        cudaStreamGetFlags(npp_stream_context_.hStream, &npp_stream_context_.nStreamFlags);
+    }
+
+    int width = msg.width();
+    int height = msg.height();
+    // const auto &img = msg.data;
+    size_t out_buf_size = width * height * 3 / 2;
+
+    CompressedImage::UniquePtr compressed_msg = std::make_unique<CompressedImage>();
+    compressed_msg->header = msg.header();
+    compressed_msg->format = "jpeg";
+
+    if (image_size < out_buf_size) {
+      // Allocate Npp8u buffers
+      dev_image_ = nppiMalloc_8u_C3(width, height, &dev_image_step_bytes_);
+      image_size = out_buf_size;
+
+      dev_yuv_[0] = nppiMalloc_8u_C1(width, height, &dev_yuv_step_bytes_[0]); // Y
+      dev_yuv_[1] = nppiMalloc_8u_C1(width/2, height/2, &dev_yuv_step_bytes_[1]); // U
+      dev_yuv_[2] = nppiMalloc_8u_C1(width/2, height/2, &dev_yuv_step_bytes_[2]); // V
+      buffer_.emplace(V4L2_PIX_FMT_YUV420M, width, height, 0);
+      TEST_ERROR(buffer_->allocateMemory() != 0, "NvBuffer allocation failed");
+
+      encoder_->setCropRect(0, 0, width, height);
+
+      // Synchronize CUDA device to ensure memory allocation is complete
+      std::this_thread::sleep_for(std::chrono::milliseconds(300));
+      cudaError_t cudaStatus = cudaDeviceSynchronize();
+      if (cudaStatus != cudaSuccess) {
+          // Handle error
+          fprintf(stderr, "cudaDeviceSynchronize failed: %s\n", cudaGetErrorString(cudaStatus));
+          // You may want to throw an exception or handle the error appropriately
+      }
+    }
+
+    TEST_ERROR(cudaMemcpy2DAsync(static_cast<void*>(dev_image_), dev_image_step_bytes_,
+                                 static_cast<const void*>(msg.cuda_mem()), msg.step(),
+                                 msg.width() * sizeof(Npp8u) * 3,
+                                 msg.height(), cudaMemcpyDeviceToDevice, stream_.handle()) != cudaSuccess,
+               "2D memory allocation failed");
+
+    NppiSize roi = {static_cast<int>(msg.width()), static_cast<int>(msg.height())};
+    if (format == ImageFormat::RGB) {
+        TEST_ERROR(nppiRGBToYUV420_8u_C3P3R_Ctx(dev_image_, dev_image_step_bytes_,
+                                                dev_yuv_.data(), dev_yuv_step_bytes_.data(), roi,
+                                                npp_stream_context_) != NPP_SUCCESS,
+                   "failed to convert rgb8 to yuv420");
+    } else {
+      // XXX: need to BGR -> RGB
+      std::cerr << "not supported" << std::endl;
+    }
+
+    NvBuffer::NvBufferPlane &plane_y = buffer_->planes[0];
+    NvBuffer::NvBufferPlane &plane_u = buffer_->planes[1];
+    NvBuffer::NvBufferPlane &plane_v = buffer_->planes[2];
+    TEST_ERROR(cudaMemcpy2DAsync(plane_y.data, plane_y.fmt.stride,
+                                 dev_yuv_[0], dev_yuv_step_bytes_[0], width, height,
+                                 cudaMemcpyDeviceToHost, stream_.handle()) != cudaSuccess,
+               "memory copy from Device to Host for Y plane failed");
+    TEST_ERROR(cudaMemcpy2DAsync(plane_u.data, plane_u.fmt.stride,
+                                 dev_yuv_[1], dev_yuv_step_bytes_[1], width/2, height/2,
+                                 cudaMemcpyDeviceToHost, stream_.handle()) != cudaSuccess,
+               "memory copy from Device to Host for U plane failed");
+    TEST_ERROR(cudaMemcpy2DAsync(plane_v.data, plane_v.fmt.stride,
+                                 dev_yuv_[2], dev_yuv_step_bytes_[2], width/2, height/2,
+                                 cudaMemcpyDeviceToHost, stream_.handle()) != cudaSuccess,
+               "memory copy from Device to Host for V plane failed");
+    stream_.synchronize();
+
+    unsigned char * out_data = new unsigned char[out_buf_size];
+
+    TEST_ERROR(
+        encoder_->encodeFromBuffer(buffer_.value(), JCS_YCbCr, &out_data,
+                                   out_buf_size, quality),
+        "NvJpeg Encoder Error");
+
+    compressed_msg->data.resize(static_cast<size_t>(out_buf_size / sizeof(uint8_t)));
+    memcpy(compressed_msg->data.data(), out_data, out_buf_size);
+
+    delete[] out_data;
+    out_data = nullptr;
+    return compressed_msg;
+}
+
 void JetsonCompressor::setCudaStream(cuda::stream::handle_t &raw_cuda_stream) {
   stream_ = cuda::stream::wrap(cuda::device::current::get().id(),
                                cuda::context::current::get().handle(),
@@ -236,6 +361,13 @@ CompressedImage::UniquePtr NVJPEGCompressor::compress(const Image &msg, int qual
     return compressed_msg;
 }
 
+CompressedImage::UniquePtr  NVJPEGCompressor::compress(const ImageContainer &msg, int quality, ImageFormat format)
+{
+    sensor_msgs::msg::Image image_msg;
+    msg.get_sensor_msgs_image(image_msg);
+    return compress(image_msg, quality, format);
+}
+
 void NVJPEGCompressor::setNVImage(const Image &msg) {
     unsigned char *p = nullptr;
     CHECK_CUDA(cudaMallocAsync((void **)&p, msg.data.size(), stream_));