Merge GlobalAveragePool header into AvgPool header

runwangdl · runwangdl · commit 1373d1d5bb24 · 2026-03-23T20:14:32.000Z
diff --git a/TargetLibraries/PULPOpen/inc/DeeployPULPKernels.h b/TargetLibraries/PULPOpen/inc/DeeployPULPKernels.h
@@ -15,7 +15,6 @@
 #include "kernel/BatchNorm.h"
 #include "kernel/Conv.h"
 #include "kernel/GELU.h"
-#include "kernel/GlobalAveragePool.h"
 #include "kernel/Layernorm.h"
 #include "kernel/Matmul.h"
 #include "kernel/MaxPool.h"
diff --git a/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h b/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h
@@ -27,7 +27,6 @@
 #include "kernel/BatchNorm.h"
 #include "kernel/Conv.h"
 #include "kernel/GELU.h"
-#include "kernel/GlobalAveragePool.h"
 #include "kernel/Layernorm.h"
 #include "kernel/Matmul.h"
 #include "kernel/MaxPool.h"
diff --git a/TargetLibraries/PULPOpen/inc/kernel/AvgPool.h b/TargetLibraries/PULPOpen/inc/kernel/AvgPool.h
@@ -24,4 +24,40 @@ void PULP_AvgPool2d_fp32_fp32_CHW(const float32_t *__restrict__ pSrcA,
                                   uint32_t pad_top, uint32_t pad_bottom,
                                   uint32_t pad_left, uint32_t pad_right);
 
+/**
+ * @brief Global Average Pooling forward pass (NCHW layout).
+ *
+ * For each (n, c) pair, writes the mean over all H*W spatial positions:
+ *   output[n*C + c] = sum_{h,w}(input[(n*C+c)*H*W + h*W + w]) / (H*W)
+ *
+ * Parallelized over channels: each core handles a contiguous chunk of channels.
+ *
+ * @param input   Input tensor  [N, C, H, W], NCHW float32 (read-only)
+ * @param output  Output tensor [N, C, 1, 1], stored as [N*C] float32
+ * @param N       Batch size
+ * @param C       Number of channels
+ * @param H       Spatial height; H*W is the divisor (presumably > 0 -- confirm)
+ * @param W       Spatial width; elements of one row are adjacent in memory
+ */
+void PULP_GlobalAveragePool_fp32(const float32_t *input, float32_t *output,
+                                  uint32_t N, uint32_t C, uint32_t H, uint32_t W);
+
+/**
+ * @brief Global Average Pooling backward pass (NCHW layout).
+ *
+ * Spreads each upstream gradient value evenly over its H*W spatial positions:
+ *   dX[(n*C+c)*H*W + h*W + w] = dY[n*C + c] / (H*W)
+ *
+ * Parallelized over channels: each core handles a contiguous chunk of channels.
+ *
+ * @param dY  Upstream gradient [N, C, 1, 1], stored as [N*C] float32 (read-only)
+ * @param dX  Gradient w.r.t. input [N, C, H, W], NCHW float32 (written)
+ * @param N   Batch size
+ * @param C   Number of channels
+ * @param H   Spatial height; H*W is the divisor (presumably > 0 -- confirm)
+ * @param W   Spatial width
+ */
+void PULP_GlobalAveragePoolGrad_fp32(const float32_t *dY, float32_t *dX,
+                                      uint32_t N, uint32_t C, uint32_t H, uint32_t W);
+
 #endif // __DEEPLOY_MATH_AVGPOOL_KERNEL_HEADER_
diff --git a/TargetLibraries/PULPOpen/inc/kernel/GlobalAveragePool.h b/TargetLibraries/PULPOpen/inc/kernel/GlobalAveragePool.h