@@ -24,4 +24,40 @@ void PULP_AvgPool2d_fp32_fp32_CHW(const float32_t *__restrict__ pSrcA,
2424 uint32_t pad_top , uint32_t pad_bottom ,
2525 uint32_t pad_left , uint32_t pad_right );
2626
27+ /**
28+ * @brief Global Average Pooling forward pass (NCHW layout); results are written to @p output, nothing is returned.
29+ *
30+ * For each (n, c), computes the mean over all (h, w) spatial positions (one scalar per batch/channel pair):
31+ * output[n*C + c] = sum_{h,w}(input[(n*C+c)*H*W + h*W + w]) / (H*W)
32+ *
33+ * Parallelized over channels: each core handles a contiguous chunk of channels. NOTE(review): only the prototype is visible in this header -- confirm the work split against the kernel source.
34+ * NOTE(review): the formula divides by H*W; the result for H == 0 or W == 0 is not specified here -- TODO confirm callers never pass a zero extent.
35+ * @param input Input tensor [N, C, H, W] NCHW float32 (read-only; N*C*H*W elements are indexed by the formula above)
36+ * @param output Output tensor [N, C, 1, 1] stored as [N*C] float32 (one mean written per (n, c) pair)
37+ * @param N Batch size
38+ * @param C Number of channels
39+ * @param H Spatial height (number of rows averaged per channel)
40+ * @param W Spatial width (number of columns averaged per channel; also the row stride in the formula above)
41+ */
42+ void PULP_GlobalAveragePool_fp32 (const float32_t * input , float32_t * output ,
43+ uint32_t N , uint32_t C , uint32_t H , uint32_t W );
44+
45+ /**
46+ * @brief Global Average Pooling backward pass (NCHW layout); gradients are written to @p dX, nothing is returned.
47+ *
48+ * Distributes the upstream gradient evenly across all spatial positions (every (h, w) of a given (n, c) receives the same value):
49+ * dX[n,c,h,w] = dY[n*C + c] / (H*W)
50+ *
51+ * Parallelized over channels: each core handles a contiguous chunk of channels. NOTE(review): only the prototype is visible in this header -- confirm the work split against the kernel source.
52+ * NOTE(review): the formula divides by H*W; the result for H == 0 or W == 0 is not specified here -- TODO confirm callers never pass a zero extent.
53+ * @param dY Upstream gradient [N, C, 1, 1] stored as [N*C] float32 (read-only; one value per (n, c) pair)
54+ * @param dX Gradient w.r.t. input [N, C, H, W] NCHW float32 (written; N*C*H*W elements per the shape given here)
55+ * @param N Batch size
56+ * @param C Number of channels
57+ * @param H Spatial height (rows per channel that receive the broadcast gradient)
58+ * @param W Spatial width (columns per channel that receive the broadcast gradient)
59+ */
60+ void PULP_GlobalAveragePoolGrad_fp32 (const float32_t * dY , float32_t * dX ,
61+ uint32_t N , uint32_t C , uint32_t H , uint32_t W );
62+
2763#endif // __DEEPLOY_MATH_AVGPOOL_KERNEL_HEADER_
0 commit comments