Skip to content

Commit 72c0b0a

Browse files
authored
Merge pull request #168 from SharpAI/feature/onnx-hf-standardize
refactor: standardize on onnx-community HuggingFace ONNX format
2 parents 9fc4e81 + 86bdb7b commit 72c0b0a

File tree

3 files changed

+52
-32
lines changed

3 files changed

+52
-32
lines changed

skills/detection/yolo-detection-2026/scripts/env_config.py

Lines changed: 26 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -119,7 +119,12 @@ def __init__(self, session, class_names: dict):
119119
self._input_w = shape[3] if isinstance(shape[3], int) else 640
120120

121121
def __call__(self, source, conf: float = 0.25, verbose: bool = True, **kwargs):
122-
"""Run inference on an image path or PIL Image."""
122+
"""Run inference on an image path or PIL Image.
123+
124+
All models use onnx-community HuggingFace format:
125+
outputs[0] = logits [1, 300, 80] (raw, pre-sigmoid)
126+
outputs[1] = pred_boxes [1, 300, 4] (cx, cy, w, h normalized 0..1)
127+
"""
123128
import numpy as np
124129
from PIL import Image
125130

@@ -150,31 +155,36 @@ def __call__(self, source, conf: float = 0.25, verbose: bool = True, **kwargs):
150155

151156
# Run inference
152157
outputs = self.session.run(None, {self._input_name: blob})
153-
preds = outputs[0] # shape: [1, num_detections, 6]
158+
logits = outputs[0][0] # [300, 80] raw class logits
159+
pred_boxes = outputs[1][0] # [300, 4] cx, cy, w, h (normalized 0..1)
160+
161+
# Sigmoid → class probabilities
162+
probs = 1.0 / (1.0 + np.exp(-logits))
154163

155-
# Parse detections: [x1, y1, x2, y2, confidence, class_id]
164+
# Parse detections
156165
boxes = []
157-
for det in preds[0]:
158-
det_conf = float(det[4])
166+
for i in range(len(pred_boxes)):
167+
cls_id = int(np.argmax(probs[i]))
168+
det_conf = float(probs[i][cls_id])
159169
if det_conf < conf:
160170
continue
161171

162-
# Scale coordinates back to original image space
163-
x1 = (float(det[0]) - pad_x) / scale
164-
y1 = (float(det[1]) - pad_y) / scale
165-
x2 = (float(det[2]) - pad_x) / scale
166-
y2 = (float(det[3]) - pad_y) / scale
172+
# cx,cy,w,h (normalized) → x1,y1,x2,y2 (original image pixels)
173+
cx, cy, bw, bh = pred_boxes[i]
174+
px_cx = cx * self._input_w
175+
px_cy = cy * self._input_h
176+
px_w = bw * self._input_w
177+
px_h = bh * self._input_h
167178

168-
# Clip to image bounds
169-
x1 = max(0, min(x1, orig_w))
170-
y1 = max(0, min(y1, orig_h))
171-
x2 = max(0, min(x2, orig_w))
172-
y2 = max(0, min(y2, orig_h))
179+
x1 = max(0, min((px_cx - px_w / 2 - pad_x) / scale, orig_w))
180+
y1 = max(0, min((px_cy - px_h / 2 - pad_y) / scale, orig_h))
181+
x2 = max(0, min((px_cx + px_w / 2 - pad_x) / scale, orig_w))
182+
y2 = max(0, min((px_cy + px_h / 2 - pad_y) / scale, orig_h))
173183

174184
boxes.append(_BoxResult(
175185
xyxy=np.array([[x1, y1, x2, y2]]),
176186
conf=np.array([det_conf]),
177-
cls=np.array([int(det[5])]),
187+
cls=np.array([cls_id]),
178188
))
179189

180190
return [_DetResult(boxes)]
-50.3 KB
Binary file not shown.

skills/lib/env_config.py

Lines changed: 26 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -119,7 +119,12 @@ def __init__(self, session, class_names: dict):
119119
self._input_w = shape[3] if isinstance(shape[3], int) else 640
120120

121121
def __call__(self, source, conf: float = 0.25, verbose: bool = True, **kwargs):
122-
"""Run inference on an image path or PIL Image."""
122+
"""Run inference on an image path or PIL Image.
123+
124+
All models use onnx-community HuggingFace format:
125+
outputs[0] = logits [1, 300, 80] (raw, pre-sigmoid)
126+
outputs[1] = pred_boxes [1, 300, 4] (cx, cy, w, h normalized 0..1)
127+
"""
123128
import numpy as np
124129
from PIL import Image
125130

@@ -150,31 +155,36 @@ def __call__(self, source, conf: float = 0.25, verbose: bool = True, **kwargs):
150155

151156
# Run inference
152157
outputs = self.session.run(None, {self._input_name: blob})
153-
preds = outputs[0] # shape: [1, num_detections, 6]
158+
logits = outputs[0][0] # [300, 80] raw class logits
159+
pred_boxes = outputs[1][0] # [300, 4] cx, cy, w, h (normalized 0..1)
160+
161+
# Sigmoid → class probabilities
162+
probs = 1.0 / (1.0 + np.exp(-logits))
154163

155-
# Parse detections: [x1, y1, x2, y2, confidence, class_id]
164+
# Parse detections
156165
boxes = []
157-
for det in preds[0]:
158-
det_conf = float(det[4])
166+
for i in range(len(pred_boxes)):
167+
cls_id = int(np.argmax(probs[i]))
168+
det_conf = float(probs[i][cls_id])
159169
if det_conf < conf:
160170
continue
161171

162-
# Scale coordinates back to original image space
163-
x1 = (float(det[0]) - pad_x) / scale
164-
y1 = (float(det[1]) - pad_y) / scale
165-
x2 = (float(det[2]) - pad_x) / scale
166-
y2 = (float(det[3]) - pad_y) / scale
172+
# cx,cy,w,h (normalized) → x1,y1,x2,y2 (original image pixels)
173+
cx, cy, bw, bh = pred_boxes[i]
174+
px_cx = cx * self._input_w
175+
px_cy = cy * self._input_h
176+
px_w = bw * self._input_w
177+
px_h = bh * self._input_h
167178

168-
# Clip to image bounds
169-
x1 = max(0, min(x1, orig_w))
170-
y1 = max(0, min(y1, orig_h))
171-
x2 = max(0, min(x2, orig_w))
172-
y2 = max(0, min(y2, orig_h))
179+
x1 = max(0, min((px_cx - px_w / 2 - pad_x) / scale, orig_w))
180+
y1 = max(0, min((px_cy - px_h / 2 - pad_y) / scale, orig_h))
181+
x2 = max(0, min((px_cx + px_w / 2 - pad_x) / scale, orig_w))
182+
y2 = max(0, min((px_cy + px_h / 2 - pad_y) / scale, orig_h))
173183

174184
boxes.append(_BoxResult(
175185
xyxy=np.array([[x1, y1, x2, y2]]),
176186
conf=np.array([det_conf]),
177-
cls=np.array([int(det[5])]),
187+
cls=np.array([cls_id]),
178188
))
179189

180190
return [_DetResult(boxes)]

0 commit comments

Comments
 (0)