
YOLOv8 model detection output printing confusing #652

@Izzatullokh24

Description


Hi, I am working on deploying a YOLOv8 model to an Nvidia Jetson Xavier.

I could not understand the logic in the nvdsparsebbox_Yolo.cpp file.

What I want is to print the bbox results to the terminal and then send them via MQTT.
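For the MQTT side, this is roughly what I have in mind; a minimal sketch using libmosquitto, where the broker address, topic name, and JSON layout are placeholders rather than anything from my actual pipeline:

#include <mosquitto.h>
#include <cstdio>

// Publish one bbox as a JSON payload (sketch only; no error handling, and in a
// real pipeline the client would be created once, not per box)
void publishBBox(float left, float top, float width, float height, float conf)
{
  mosquitto_lib_init();
  struct mosquitto* mosq = mosquitto_new("yolo-publisher", true, nullptr);
  if (mosq) {
    if (mosquitto_connect(mosq, "localhost", 1883, 60) == MOSQ_ERR_SUCCESS) {
      char payload[256];
      int len = std::snprintf(payload, sizeof(payload),
          "{\"left\":%.1f,\"top\":%.1f,\"width\":%.1f,\"height\":%.1f,\"conf\":%.3f}",
          left, top, width, height, conf);
      mosquitto_publish(mosq, nullptr, "yolo/bbox", len, payload, 0, false);
      mosquitto_disconnect(mosq);
    }
    mosquitto_destroy(mosq);
  }
  mosquitto_lib_cleanup();
}

I have added print statements to the parser to see the bbox information: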

#include "nvdsinfer_custom_impl.h"
#include "utils.h"
#include

extern "C" bool
NvDsInferParseYolo(std::vector const& outputLayersInfo, NvDsInferNetworkInfo const& networkInfo,
NvDsInferParseDetectionParams const& detectionParams, std::vector& objectList);

static NvDsInferParseObjectInfo
convertBBox(const float& bx1, const float& by1, const float& bx2, const float& by2, const uint& netW, const uint& netH)
{
  NvDsInferParseObjectInfo b;

  float x1 = bx1;
  float y1 = by1;
  float x2 = bx2;
  float y2 = by2;

  // 🔍 DEBUG: Print raw YOLO output coordinates
  std::cout << "🔍 RAW YOLO OUTPUT:" << std::endl;
  std::cout << " → Network Resolution: " << netW << "x" << netH << std::endl;
  std::cout << " → Raw coordinates: (" << bx1 << ", " << by1 << ") to (" << bx2 << ", " << by2 << ")" << std::endl;
  std::cout << " → Raw size: " << std::fixed << std::setprecision(1) << (bx2 - bx1) << "x" << (by2 - by1) << " pixels" << std::endl;

  // Clamp the corners to the network input resolution
  x1 = clamp(x1, 0, netW);
  y1 = clamp(y1, 0, netH);
  x2 = clamp(x2, 0, netW);
  y2 = clamp(y2, 0, netH);

  b.left = x1;
  b.width = clamp(x2 - x1, 0, netW);
  b.top = y1;
  b.height = clamp(y2 - y1, 0, netH);

  // 🔍 DEBUG: Print processed coordinates
  std::cout << "🔍 PROCESSED BBOX (" << netW << "x" << netH << "):" << std::endl;
  std::cout << " → Final BBox: left=" << (int)b.left << ", top=" << (int)b.top << ", width=" << (int)b.width << ", height=" << (int)b.height << std::endl;
  std::cout << " → Top-Left: (" << (int)b.left << ", " << (int)b.top << ")" << std::endl;
  std::cout << " → Bottom-Right: (" << (int)(b.left + b.width) << ", " << (int)(b.top + b.height) << ")" << std::endl;

  // 🔍 DEBUG: Show what this would look like scaled to 1920x1080
  // NOTE: this scales each axis independently, i.e. it assumes no letterboxing
  if (netW == 640 && netH == 640) {
    float scale_x = 1920.0f / 640.0f; // 3.0
    float scale_y = 1080.0f / 640.0f; // 1.6875

    std::cout << "🔍 SCALED TO 1920x1080:" << std::endl;
    std::cout << "   → Scaled BBox: left=" << (int)(b.left * scale_x) << ", top=" << (int)(b.top * scale_y)
              << ", width=" << (int)(b.width * scale_x) << ", height=" << (int)(b.height * scale_y) << std::endl;
    std::cout << "   → Scaled Top-Left: (" << (int)(b.left * scale_x) << ", " << (int)(b.top * scale_y) << ")" << std::endl;
    std::cout << "   → Scaled Bottom-Right: (" << (int)((b.left + b.width) * scale_x) << ", " << (int)((b.top + b.height) * scale_y) << ")" << std::endl;
  }
  std::cout << " ═══════════════════════════════════════════════════════════════" << std::endl;

  return b;
}

static void
addBBoxProposal(const float bx1, const float by1, const float bx2, const float by2, const uint& netW, const uint& netH,
    const int maxIndex, const float maxProb, std::vector<NvDsInferParseObjectInfo>& binfo)
{
  std::cout << "\n📦 BBOX PROPOSAL #" << (binfo.size() + 1) << ":" << std::endl;
  std::cout << " → Resolution Context: " << netW << "x" << netH << std::endl;
  std::cout << " → Class: " << maxIndex << " | Confidence: " << std::fixed << std::setprecision(3) << maxProb << std::endl;

  NvDsInferParseObjectInfo bbi = convertBBox(bx1, by1, bx2, by2, netW, netH);

  // Discard degenerate boxes
  if (bbi.width < 1 || bbi.height < 1) {
    std::cout << "❌ REJECTED: BBox too small (width=" << bbi.width << ", height=" << bbi.height << ")" << std::endl;
    std::cout << "───────────────────────────────────────────────────────────────" << std::endl;
    return;
  }

  bbi.detectionConfidence = maxProb;
  bbi.classId = maxIndex;
  binfo.push_back(bbi);

  std::cout << "✅ ADDED TO DEEPSTREAM QUEUE" << std::endl;
  std::cout << "───────────────────────────────────────────────────────────────" << std::endl;
}

static std::vector<NvDsInferParseObjectInfo>
decodeTensorYolo(const float* output, const uint& outputSize, const uint& netW, const uint& netH,
    const std::vector<float>& preclusterThreshold)
{
  std::vector<NvDsInferParseObjectInfo> binfo;

  std::cout << "\n🔍 YOLO TENSOR DECODING:" << std::endl;
  std::cout << " → Output size: " << outputSize << " detections" << std::endl;
  std::cout << " → Network resolution: " << netW << "x" << netH << std::endl;
  std::cout << " → Confidence threshold: " << preclusterThreshold[0] << std::endl;
  std::cout << "═══════════════════════════════════════════════════════════════════════════════════════" << std::endl;

  int valid_detections = 0;

  // Each detection occupies 6 floats: x1, y1, x2, y2, confidence, class id
  for (uint b = 0; b < outputSize; ++b) {
    float maxProb = output[b * 6 + 4];
    int maxIndex = (int) output[b * 6 + 5];

    if (maxProb < preclusterThreshold[maxIndex]) {
      continue;
    }

    if (maxIndex != 0) {  // Only process the person class
      continue;
    }

    valid_detections++;

    float bx1 = output[b * 6 + 0];
    float by1 = output[b * 6 + 1];
    float bx2 = output[b * 6 + 2];
    float by2 = output[b * 6 + 3];

    std::cout << "\n🎯 VALID DETECTION #" << valid_detections << " FOUND:" << std::endl;
    addBBoxProposal(bx1, by1, bx2, by2, netW, netH, maxIndex, maxProb, binfo);
  }

  std::cout << "\n📊 YOLO DECODING SUMMARY:" << std::endl;
  std::cout << " → Total raw outputs: " << outputSize << std::endl;
  std::cout << " → Valid detections: " << valid_detections << std::endl;
  std::cout << " → Final proposals: " << binfo.size() << std::endl;
  std::cout << "═══════════════════════════════════════════════════════════════════════════════════════" << std::endl;

  return binfo;
}

static bool
NvDsInferParseCustomYolo(std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
    NvDsInferNetworkInfo const& networkInfo, NvDsInferParseDetectionParams const& detectionParams,
    std::vector<NvDsInferParseObjectInfo>& objectList)
{
  if (outputLayersInfo.empty()) {
    std::cerr << "ERROR: Could not find output layer in bbox parsing" << std::endl;
    return false;
  }

  std::cout << "\n🚀 YOLO PARSING STARTED:" << std::endl;
  std::cout << "╔══════════════════════════════════════════════════════════════════════════════╗" << std::endl;
  std::cout << "║ YOLO PARSING DEBUG ║" << std::endl;
  std::cout << "╠══════════════════════════════════════════════════════════════════════════════╣" << std::endl;
  std::cout << "║ Network Input Resolution : " << std::setw(4) << networkInfo.width << "x" << std::setw(4) << networkInfo.height << " ║" << std::endl;
  std::cout << "║ Model processes in : " << std::setw(4) << networkInfo.width << "x" << std::setw(4) << networkInfo.height << " coordinate space ║" << std::endl;
  std::cout << "║ DeepStream will display : scaled to output resolution ║" << std::endl;
  std::cout << "╚══════════════════════════════════════════════════════════════════════════════╝" << std::endl;

  std::vector<NvDsInferParseObjectInfo> objects;

  const NvDsInferLayerInfo& output = outputLayersInfo[0];
  const uint outputSize = output.inferDims.d[0];

  std::vector<NvDsInferParseObjectInfo> outObjs = decodeTensorYolo((const float*) (output.buffer), outputSize,
      networkInfo.width, networkInfo.height, detectionParams.perClassPreclusterThreshold);

  objects.insert(objects.end(), outObjs.begin(), outObjs.end());

  // 🚀 FINAL DEBUG: What's actually being sent to DeepStream
  std::cout << "\n🚀 FINAL OUTPUT TO DEEPSTREAM:" << std::endl;
  std::cout << "╔══════════════════════════════════════════════════════════════════════════════╗" << std::endl;
  std::cout << "║ EXACT DATA SENT TO DEEPSTREAM OSD ║" << std::endl;
  std::cout << "╚══════════════════════════════════════════════════════════════════════════════╝" << std::endl;

  if (objects.size() == 0) {
    std::cout << "⚠️ NO OBJECTS TO SEND TO DEEPSTREAM - Nothing will be drawn" << std::endl;
  } else {
    for (size_t i = 0; i < objects.size(); i++) {
      const auto& obj = objects[i];
      std::cout << "\n🎯 DeepStream Object [" << (i+1) << "/" << objects.size() << "]:" << std::endl;
      std::cout << " → ClassID: " << obj.classId << " (Person)" << std::endl;
      std::cout << " → Confidence: " << std::fixed << std::setprecision(3) << obj.detectionConfidence << std::endl;
      std::cout << " → EXACT OSD Coordinates (" << networkInfo.width << "x" << networkInfo.height << "):" << std::endl;
      std::cout << " left=" << (int)obj.left << ", top=" << (int)obj.top << ", width=" << (int)obj.width << ", height=" << (int)obj.height << std::endl;
      std::cout << " → Top-Left: (" << (int)obj.left << ", " << (int)obj.top << ")" << std::endl;
      std::cout << " → Bottom-Right: (" << (int)(obj.left + obj.width) << ", " << (int)(obj.top + obj.height) << ")" << std::endl;

      // Show expected display size if using a 640x640 model
      // NOTE: again assumes independent per-axis scaling (no letterboxing)
      if (networkInfo.width == 640 && networkInfo.height == 640) {
        float scale_x = 1920.0f / 640.0f;
        float scale_y = 1080.0f / 640.0f;
        std::cout << "   → Expected Display Size (if 1920x1080 output):" << std::endl;
        std::cout << "     left=" << (int)(obj.left * scale_x) << ", top=" << (int)(obj.top * scale_y)
                  << ", width=" << (int)(obj.width * scale_x) << ", height=" << (int)(obj.height * scale_y) << std::endl;
      }
      std::cout << "   ═══════════════════════════════════════════════════════════════" << std::endl;
    }
  }

  std::cout << "\n📺 NEXT STEP: DeepStream OSD will draw " << objects.size() << " bounding boxes on video" << std::endl;
  std::cout << " → If display shows DIFFERENT sizes than above coordinates," << std::endl;
  std::cout << "   then tracker/DeepStream is modifying them post-processing." << std::endl;
  std::cout << " → If display shows SAME sizes as above coordinates," << std::endl;
  std::cout << "   then this is exactly what YOLO model detected." << std::endl;
  std::cout << "═══════════════════════════════════════════════════════════════════════════════════════\n" << std::endl;

  objectList = objects;

  return true;
}

extern "C" bool
NvDsInferParseYolo(std::vector const& outputLayersInfo, NvDsInferNetworkInfo const& networkInfo,
NvDsInferParseDetectionParams const& detectionParams, std::vector& objectList)
{
return NvDsInferParseCustomYolo(outputLayersInfo, networkInfo, detectionParams, objectList);
}

CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseYolo);
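For reference, this parser is selected through the nvinfer model config; a sketch of the relevant keys as I understand them (the library path is a placeholder, and I am assuming the usual DeepStream-Yolo style YOLOv8 setup):

[property]
# ... other model settings ...
parse-bbox-func-name=NvDsInferParseYolo
custom-lib-path=/path/to/libnvdsinfer_custom_impl_Yolo.so
# These two control letterbox preprocessing, which affects how network-space
# coordinates map back to the frame (values assumed, check your own config):
maintain-aspect-ratio=1
symmetric-padding=1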

and the result is:

🎯 DeepStream Object [1/4]:
→ ClassID: 0 (Person)
→ Confidence: 0.471
→ EXACT OSD Coordinates (640x640):
left=473, top=317, width=14, height=47
→ Top-Left: (473, 317)
→ Bottom-Right: (487, 364)
→ Expected Display Size (if 1920x1080 output):
left=1419, top=535, width=42, height=79

I then plotted the bbox info onto the image, but the box is drawn over only half of the object,

while deepstream-app draws the box around the whole object.

My image size is 1920x1080, so the detected object's bbox should be around left=1419, top=535, width=42, height=150.

There is no issue with the left, top, and width values, but the height is reduced by a factor of about 2.
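My current guess is that the mismatch comes from letterbox preprocessing: if the 1920x1080 frame is resized by one uniform gain, min(640/1920, 640/1080) = 1/3, and padded top and bottom to fill 640x640, then mapping a box back to the frame means subtracting the vertical padding and dividing both axes by that same gain, rather than scaling each axis independently as my debug code does. A quick check of that arithmetic (assuming maintain-aspect-ratio with symmetric padding, which I have not confirmed for my pipeline):

#include <algorithm>
#include <cstdio>

int main()
{
  const float imgW = 1920.0f, imgH = 1080.0f; // display frame
  const float netW = 640.0f, netH = 640.0f;   // network input

  // Uniform letterbox gain and the padding it implies (assumed preprocessing)
  const float gain = std::min(netW / imgW, netH / imgH); // 1/3
  const float padX = (netW - imgW * gain) / 2.0f;        // 0
  const float padY = (netH - imgH * gain) / 2.0f;        // 140

  // Network-space bbox taken from the debug output above
  const float left = 473.0f, top = 317.0f, width = 14.0f, height = 47.0f;

  std::printf("left=%d top=%d width=%d height=%d\n",
      (int)((left - padX) / gain),  // 1419 (matches)
      (int)((top - padY) / gain),   // 531  (close to the expected 535)
      (int)(width / gain),          // 42   (matches)
      (int)(height / gain));        // 141  (close to the expected ~150)
}

Those numbers line up with what I expect on the frame, so the per-axis scale_y = 1.6875 in my debug prints would explain the roughly halved height (1.6875 / 3.0 ≈ 0.56).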

Can anyone help me?
