
YOLOv8 model detection output printing confusing #652

@Izzatullokh24

Description


Hi, I am working on deploying a YOLOv8 model to an Nvidia Jetson Xavier.

I could not understand the logic in the nvdsparsebbox_Yolo.cpp file.

What I want is to print the bbox results to the terminal and then send them via MQTT.
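For the MQTT side, this is roughly what I have in mind; a minimal sketch using libmosquitto, where the broker address, topic name, and JSON layout are placeholders rather than anything from my actual pipeline:

#include <mosquitto.h>
#include <cstdio>

// Publish one bbox as a JSON payload (sketch only; no error handling, and in a
// real pipeline the client would be created once, not per box)
void publishBBox(float left, float top, float width, float height, float conf)
{
  mosquitto_lib_init();
  struct mosquitto* mosq = mosquitto_new("yolo-publisher", true, nullptr);
  if (mosq) {
    if (mosquitto_connect(mosq, "localhost", 1883, 60) == MOSQ_ERR_SUCCESS) {
      char payload[256];
      int len = std::snprintf(payload, sizeof(payload),
          "{\"left\":%.1f,\"top\":%.1f,\"width\":%.1f,\"height\":%.1f,\"conf\":%.3f}",
          left, top, width, height, conf);
      mosquitto_publish(mosq, nullptr, "yolo/bbox", len, payload, 0, false);
      mosquitto_disconnect(mosq);
    }
    mosquitto_destroy(mosq);
  }
  mosquitto_lib_cleanup();
}

I have added print statements to the parser to see the bbox information: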

#include "nvdsinfer_custom_impl.h"
#include "utils.h"
#include

extern "C" bool
NvDsInferParseYolo(std::vector const& outputLayersInfo, NvDsInferNetworkInfo const& networkInfo,
NvDsInferParseDetectionParams const& detectionParams, std::vector& objectList);

static NvDsInferParseObjectInfo
convertBBox(const float& bx1, const float& by1, const float& bx2, const float& by2, const uint& netW, const uint& netH)
{
  NvDsInferParseObjectInfo b;

  float x1 = bx1;
  float y1 = by1;
  float x2 = bx2;
  float y2 = by2;

  // 🔍 DEBUG: Print raw YOLO output coordinates
  std::cout << "🔍 RAW YOLO OUTPUT:" << std::endl;
  std::cout << " → Network Resolution: " << netW << "x" << netH << std::endl;
  std::cout << " → Raw coordinates: (" << bx1 << ", " << by1 << ") to (" << bx2 << ", " << by2 << ")" << std::endl;
  std::cout << " → Raw size: " << std::fixed << std::setprecision(1) << (bx2 - bx1) << "x" << (by2 - by1) << " pixels" << std::endl;

  // Clamp the corners to the network input resolution
  x1 = clamp(x1, 0, netW);
  y1 = clamp(y1, 0, netH);
  x2 = clamp(x2, 0, netW);
  y2 = clamp(y2, 0, netH);

  b.left = x1;
  b.width = clamp(x2 - x1, 0, netW);
  b.top = y1;
  b.height = clamp(y2 - y1, 0, netH);

  // 🔍 DEBUG: Print processed coordinates
  std::cout << "🔍 PROCESSED BBOX (" << netW << "x" << netH << "):" << std::endl;
  std::cout << " → Final BBox: left=" << (int)b.left << ", top=" << (int)b.top << ", width=" << (int)b.width << ", height=" << (int)b.height << std::endl;
  std::cout << " → Top-Left: (" << (int)b.left << ", " << (int)b.top << ")" << std::endl;
  std::cout << " → Bottom-Right: (" << (int)(b.left + b.width) << ", " << (int)(b.top + b.height) << ")" << std::endl;

  // 🔍 DEBUG: Show what this would look like scaled to 1920x1080
  // NOTE: this scales each axis independently, i.e. it assumes no letterboxing
  if (netW == 640 && netH == 640) {
    float scale_x = 1920.0f / 640.0f; // 3.0
    float scale_y = 1080.0f / 640.0f; // 1.6875

    std::cout << "🔍 SCALED TO 1920x1080:" << std::endl;
    std::cout << "   → Scaled BBox: left=" << (int)(b.left * scale_x) << ", top=" << (int)(b.top * scale_y)
              << ", width=" << (int)(b.width * scale_x) << ", height=" << (int)(b.height * scale_y) << std::endl;
    std::cout << "   → Scaled Top-Left: (" << (int)(b.left * scale_x) << ", " << (int)(b.top * scale_y) << ")" << std::endl;
    std::cout << "   → Scaled Bottom-Right: (" << (int)((b.left + b.width) * scale_x) << ", " << (int)((b.top + b.height) * scale_y) << ")" << std::endl;
  }
  std::cout << " ═══════════════════════════════════════════════════════════════" << std::endl;

  return b;
}

static void
addBBoxProposal(const float bx1, const float by1, const float bx2, const float by2, const uint& netW, const uint& netH,
    const int maxIndex, const float maxProb, std::vector<NvDsInferParseObjectInfo>& binfo)
{
  std::cout << "\n📦 BBOX PROPOSAL #" << (binfo.size() + 1) << ":" << std::endl;
  std::cout << " → Resolution Context: " << netW << "x" << netH << std::endl;
  std::cout << " → Class: " << maxIndex << " | Confidence: " << std::fixed << std::setprecision(3) << maxProb << std::endl;

  NvDsInferParseObjectInfo bbi = convertBBox(bx1, by1, bx2, by2, netW, netH);

  // Discard degenerate boxes
  if (bbi.width < 1 || bbi.height < 1) {
    std::cout << "❌ REJECTED: BBox too small (width=" << bbi.width << ", height=" << bbi.height << ")" << std::endl;
    std::cout << "───────────────────────────────────────────────────────────────" << std::endl;
    return;
  }

  bbi.detectionConfidence = maxProb;
  bbi.classId = maxIndex;
  binfo.push_back(bbi);

  std::cout << "✅ ADDED TO DEEPSTREAM QUEUE" << std::endl;
  std::cout << "───────────────────────────────────────────────────────────────" << std::endl;
}

static std::vector<NvDsInferParseObjectInfo>
decodeTensorYolo(const float* output, const uint& outputSize, const uint& netW, const uint& netH,
    const std::vector<float>& preclusterThreshold)
{
  std::vector<NvDsInferParseObjectInfo> binfo;

  std::cout << "\n🔍 YOLO TENSOR DECODING:" << std::endl;
  std::cout << " → Output size: " << outputSize << " detections" << std::endl;
  std::cout << " → Network resolution: " << netW << "x" << netH << std::endl;
  std::cout << " → Confidence threshold: " << preclusterThreshold[0] << std::endl;
  std::cout << "═══════════════════════════════════════════════════════════════════════════════════════" << std::endl;

  int valid_detections = 0;

  // Each detection occupies 6 floats: x1, y1, x2, y2, confidence, class id
  for (uint b = 0; b < outputSize; ++b) {
    float maxProb = output[b * 6 + 4];
    int maxIndex = (int) output[b * 6 + 5];

    if (maxProb < preclusterThreshold[maxIndex]) {
      continue;
    }

    if (maxIndex != 0) {  // Only process the person class
      continue;
    }

    valid_detections++;

    float bx1 = output[b * 6 + 0];
    float by1 = output[b * 6 + 1];
    float bx2 = output[b * 6 + 2];
    float by2 = output[b * 6 + 3];

    std::cout << "\n🎯 VALID DETECTION #" << valid_detections << " FOUND:" << std::endl;
    addBBoxProposal(bx1, by1, bx2, by2, netW, netH, maxIndex, maxProb, binfo);
  }

  std::cout << "\n📊 YOLO DECODING SUMMARY:" << std::endl;
  std::cout << " → Total raw outputs: " << outputSize << std::endl;
  std::cout << " → Valid detections: " << valid_detections << std::endl;
  std::cout << " → Final proposals: " << binfo.size() << std::endl;
  std::cout << "═══════════════════════════════════════════════════════════════════════════════════════" << std::endl;

  return binfo;
}

static bool
NvDsInferParseCustomYolo(std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
    NvDsInferNetworkInfo const& networkInfo, NvDsInferParseDetectionParams const& detectionParams,
    std::vector<NvDsInferParseObjectInfo>& objectList)
{
  if (outputLayersInfo.empty()) {
    std::cerr << "ERROR: Could not find output layer in bbox parsing" << std::endl;
    return false;
  }

  std::cout << "\n🚀 YOLO PARSING STARTED:" << std::endl;
  std::cout << "╔══════════════════════════════════════════════════════════════════════════════╗" << std::endl;
  std::cout << "║ YOLO PARSING DEBUG ║" << std::endl;
  std::cout << "╠══════════════════════════════════════════════════════════════════════════════╣" << std::endl;
  std::cout << "║ Network Input Resolution : " << std::setw(4) << networkInfo.width << "x" << std::setw(4) << networkInfo.height << " ║" << std::endl;
  std::cout << "║ Model processes in : " << std::setw(4) << networkInfo.width << "x" << std::setw(4) << networkInfo.height << " coordinate space ║" << std::endl;
  std::cout << "║ DeepStream will display : scaled to output resolution ║" << std::endl;
  std::cout << "╚══════════════════════════════════════════════════════════════════════════════╝" << std::endl;

  std::vector<NvDsInferParseObjectInfo> objects;

  const NvDsInferLayerInfo& output = outputLayersInfo[0];
  const uint outputSize = output.inferDims.d[0];

  std::vector<NvDsInferParseObjectInfo> outObjs = decodeTensorYolo((const float*) (output.buffer), outputSize,
      networkInfo.width, networkInfo.height, detectionParams.perClassPreclusterThreshold);

  objects.insert(objects.end(), outObjs.begin(), outObjs.end());

  // 🚀 FINAL DEBUG: What's actually being sent to DeepStream
  std::cout << "\n🚀 FINAL OUTPUT TO DEEPSTREAM:" << std::endl;
  std::cout << "╔══════════════════════════════════════════════════════════════════════════════╗" << std::endl;
  std::cout << "║ EXACT DATA SENT TO DEEPSTREAM OSD ║" << std::endl;
  std::cout << "╚══════════════════════════════════════════════════════════════════════════════╝" << std::endl;

  if (objects.size() == 0) {
    std::cout << "⚠️ NO OBJECTS TO SEND TO DEEPSTREAM - Nothing will be drawn" << std::endl;
  } else {
    for (size_t i = 0; i < objects.size(); i++) {
      const auto& obj = objects[i];
      std::cout << "\n🎯 DeepStream Object [" << (i+1) << "/" << objects.size() << "]:" << std::endl;
      std::cout << " → ClassID: " << obj.classId << " (Person)" << std::endl;
      std::cout << " → Confidence: " << std::fixed << std::setprecision(3) << obj.detectionConfidence << std::endl;
      std::cout << " → EXACT OSD Coordinates (" << networkInfo.width << "x" << networkInfo.height << "):" << std::endl;
      std::cout << " left=" << (int)obj.left << ", top=" << (int)obj.top << ", width=" << (int)obj.width << ", height=" << (int)obj.height << std::endl;
      std::cout << " → Top-Left: (" << (int)obj.left << ", " << (int)obj.top << ")" << std::endl;
      std::cout << " → Bottom-Right: (" << (int)(obj.left + obj.width) << ", " << (int)(obj.top + obj.height) << ")" << std::endl;

      // Show expected display size if using a 640x640 model
      // NOTE: again assumes independent per-axis scaling (no letterboxing)
      if (networkInfo.width == 640 && networkInfo.height == 640) {
        float scale_x = 1920.0f / 640.0f;
        float scale_y = 1080.0f / 640.0f;
        std::cout << "   → Expected Display Size (if 1920x1080 output):" << std::endl;
        std::cout << "     left=" << (int)(obj.left * scale_x) << ", top=" << (int)(obj.top * scale_y)
                  << ", width=" << (int)(obj.width * scale_x) << ", height=" << (int)(obj.height * scale_y) << std::endl;
      }
      std::cout << "   ═══════════════════════════════════════════════════════════════" << std::endl;
    }
  }

  std::cout << "\n📺 NEXT STEP: DeepStream OSD will draw " << objects.size() << " bounding boxes on video" << std::endl;
  std::cout << " → If display shows DIFFERENT sizes than above coordinates," << std::endl;
  std::cout << "   then tracker/DeepStream is modifying them post-processing." << std::endl;
  std::cout << " → If display shows SAME sizes as above coordinates," << std::endl;
  std::cout << "   then this is exactly what YOLO model detected." << std::endl;
  std::cout << "═══════════════════════════════════════════════════════════════════════════════════════\n" << std::endl;

  objectList = objects;

  return true;
}

extern "C" bool
NvDsInferParseYolo(std::vector const& outputLayersInfo, NvDsInferNetworkInfo const& networkInfo,
NvDsInferParseDetectionParams const& detectionParams, std::vector& objectList)
{
return NvDsInferParseCustomYolo(outputLayersInfo, networkInfo, detectionParams, objectList);
}

CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseYolo);
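For reference, this parser is selected through the nvinfer model config; a sketch of the relevant keys as I understand them (the library path is a placeholder, and I am assuming the usual DeepStream-Yolo style YOLOv8 setup):

[property]
# ... other model settings ...
parse-bbox-func-name=NvDsInferParseYolo
custom-lib-path=/path/to/libnvdsinfer_custom_impl_Yolo.so
# These two control letterbox preprocessing, which affects how network-space
# coordinates map back to the frame (values assumed, check your own config):
maintain-aspect-ratio=1
symmetric-padding=1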

and the result is:

🎯 DeepStream Object [1/4]:
→ ClassID: 0 (Person)
→ Confidence: 0.471
→ EXACT OSD Coordinates (640x640):
left=473, top=317, width=14, height=47
→ Top-Left: (473, 317)
→ Bottom-Right: (487, 364)
→ Expected Display Size (if 1920x1080 output):
left=1419, top=535, width=42, height=79

I then plotted the bbox info onto the image, but the box is drawn over only half of the object,

while deepstream-app draws the box around the whole object.

My image size is 1920x1080, so the detected object's bbox should be around left=1419, top=535, width=42, height=150.

There is no issue with the left, top, and width values, but the height is reduced by a factor of about 2.
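My current guess is that the mismatch comes from letterbox preprocessing: if the 1920x1080 frame is resized by one uniform gain, min(640/1920, 640/1080) = 1/3, and padded top and bottom to fill 640x640, then mapping a box back to the frame means subtracting the vertical padding and dividing both axes by that same gain, rather than scaling each axis independently as my debug code does. A quick check of that arithmetic (assuming maintain-aspect-ratio with symmetric padding, which I have not confirmed for my pipeline):

#include <algorithm>
#include <cstdio>

int main()
{
  const float imgW = 1920.0f, imgH = 1080.0f; // display frame
  const float netW = 640.0f, netH = 640.0f;   // network input

  // Uniform letterbox gain and the padding it implies (assumed preprocessing)
  const float gain = std::min(netW / imgW, netH / imgH); // 1/3
  const float padX = (netW - imgW * gain) / 2.0f;        // 0
  const float padY = (netH - imgH * gain) / 2.0f;        // 140

  // Network-space bbox taken from the debug output above
  const float left = 473.0f, top = 317.0f, width = 14.0f, height = 47.0f;

  std::printf("left=%d top=%d width=%d height=%d\n",
      (int)((left - padX) / gain),  // 1419 (matches)
      (int)((top - padY) / gain),   // 531  (close to the expected 535)
      (int)(width / gain),          // 42   (matches)
      (int)(height / gain));        // 141  (close to the expected ~150)
}

Those numbers line up with what I expect on the frame, so the per-axis scale_y = 1.6875 in my debug prints would explain the roughly halved height (1.6875 / 3.0 ≈ 0.56).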

Can anyone help me?
