pulp-platform · JanCSEM · Feb 12, 2026 · Feb 13, 2026 · Feb 15, 2026 · Feb 15, 2026
@@ -25,6 +25,7 @@ concurrency:
 
 jobs:
   select-env:
+    if: github.repository == 'pulp-platform/Deeploy'
     uses: ./.github/workflows/_select-env.yml
     with:
       docker_image_deeploy: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/pulp-platform/deeploy-gap9:devel' }}

@@ -26,6 +26,7 @@ concurrency:
 
 jobs:
   select-env:
+    if: github.repository == 'pulp-platform/Deeploy'
     uses: ./.github/workflows/_select-env.yml
     with:
       docker_image_deeploy: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/pulp-platform/deeploy-gap9:devel' }}

@@ -52,10 +52,12 @@ DeeployTest/Tests/**/*.json
 DeeployTest/Tests/**/generateTest.py
 DeeployTest/out.txt
 
+venv/
+**/.venv/
 CHANGELOG_GEN.md
 
 # Container Artifacts
 .pyusbip/
 .cache/
 
-CLAUDE.md
+CLAUDE.md
@@ -17,6 +17,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid
 - Update CLI interface Across Project, Fix Tutorial, and Remove Legacy Test [#157](https://github.com/pulp-platform/Deeploy/pull/157)
 - Fix for python error when using python 3.12.11 [#189](https://github.com/pulp-platform/Deeploy/pull/189)
 - Add support for Operators for Generic target needed in MAGIA [#193](https://github.com/pulp-platform/Deeploy/pull/193)
+- Fix GAP9 L3 Board Tests: readfs Flash Ordering and Duplicate Input Data [#196](https://github.com/pulp-platform/Deeploy/pull/196)
 
 ### Added
 - Add many missing docstrings
@@ -42,6 +43,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid
 - PULP-NN moved to TargetLibraries third-party folder
 - Aligned CLI commands across the project
 - Added @runwangdl as a code owner
+- Skip emitting duplicate `testInputVector` data for inputs placed in L3 (loaded at runtime from the readfs hex instead), reducing test binary size
 
 ### Fixed
 - Add missing `shell: bash` directive to CI cache generation steps to ensure correct shell execution
@@ -54,6 +56,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid
 - Fix tiling variable replacement corrupting static arrays by changing pointer update from value copy to address reassignment
 - Reduce RunNetwork stack usage by scoping per-layer variables with braces and moving tileIdxPtr allocation into per-layer execution blocks
 - Fix invalid escape sequence python error in DeeployTypes.py: appearing when using pytest to launch regressions
+- Fix GAP9 board tests with `--defaultMemLevel L3` reading garbage inputs: place all gapy `--flash-property` options before the positional subcommand and use `image flash run` so the readfs partition (input hex files) is flashed to the device
 
 ### Removed
 - `testDMA.py` was an old test; we now have `test_dmas.py` instead.

@@ -36,7 +36,6 @@ elseif(platform STREQUAL PULPOpen)
 elseif(platform STREQUAL GAP9)
   message(STATUS "Building for platform 'GAP9'")
   set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
-
   # Select SDK config based on simulator type
   if(SIMULATOR STREQUAL "board")
     set(ENV{KCONFIG_CONFIG} DeeployTest/Platforms/GAP9/sdk_board.config)
@@ -45,7 +44,6 @@ elseif(platform STREQUAL GAP9)
     set(ENV{KCONFIG_CONFIG} DeeployTest/Platforms/GAP9/sdk_gvsoc.config)
     message(STATUS "[GAP9] Using GVSoC SDK configuration")
   endif()
-
   include($ENV{GAP_SDK_HOME}/utils/cmake/setup.cmake)
 elseif(platform STREQUAL Generic)
   message(STATUS "Building for platform 'Generic'")

@@ -289,13 +289,28 @@ def checkValue(cls, value: Union[float, Iterable[float], np.ndarray], ctxt: Opti
                 continue
 
             # Check if exponent is representable.
-            if (cls.typeExponentOffset + exponent) > cls.typeExponentMax or (cls.typeExponentOffset + exponent) < 0:
-                return False
-
-            # Check if mantissa is representable. Implicit assumption is that cls.typeMantissa < 52 (like in FP64)
-            truncated_mantissa = 1 + math.floor((2**cls.typeMantissa) * (mantissa - 1)) / (2**cls.typeMantissa)
-            if math.fabs(truncated_mantissa - mantissa) > 0.0:
+            biased_exp = cls.typeExponentOffset + exponent
+            if biased_exp > cls.typeExponentMax:
                 return False
+            elif biased_exp >= 1:
+                # Normal number: check if mantissa is representable.
+                # Implicit assumption is that cls.typeMantissa < 52 (like in FP64)
+                truncated_mantissa = 1 + math.floor((2**cls.typeMantissa) * (mantissa - 1)) / (2**cls.typeMantissa)
+                if math.fabs(truncated_mantissa - mantissa) > 0.0:
+                    return False
+            else:
+                # Subnormal candidate (biased_exp <= 0).
+                # Minimum subnormal has biased_exp = 1 - typeMantissa (one ULP above zero).
+                if biased_exp < (1 - cls.typeMantissa):
+                    return False
+                # Value = mantissa * 2^exponent must be an integer multiple of the subnormal LSB
+                # (2^(1 - typeExponentOffset - typeMantissa)). The number of LSBs is:
+                #   mantissa * 2^(biased_exp - 1 + typeMantissa)
+                # which must be an exact integer for the value to be representable.
+                shift = biased_exp - 1 + cls.typeMantissa
+                mantissa_bits_float = mantissa * (2**shift)
+                if math.fabs(mantissa_bits_float - round(mantissa_bits_float)) > 0.0:
+                    return False
 
         return True
 

@@ -35,6 +35,10 @@ def _convert_requant_to_cmsis_fun(graph: gs.Graph, match: Match, name: str):
     if 'Emulate_CMSIS_RequantShift' in rqs.attrs:
         return graph
 
+    # Skip if inputs are not constants (e.g., when modified by perturbation nodes)
+    if not isinstance(rqs.inputs[-1], gs.Constant) or not isinstance(rqs.inputs[-2], gs.Constant):
+        return graph
+
     # WIESEP: Because CMSIS performs add-multiply-divide and we normally do multiply-add-divide
     #         we can emulate the same behavior by rounding the MUL value
     rqs.inputs[-1].values = np.round(copy.deepcopy(rqs.inputs[-1].values) /

@@ -245,8 +245,15 @@ def _NCHWtoNHWC_fun(graph: gs.Graph, match: Match, name: str, default_channels_f
         if node.op in ["Conv", "RequantizedConv"]:
             # In the case of Conv: [weights, opt. bias], RequantizedConv: [weights, mul, add, opt. shift]
             for tensor in node.inputs[1:]:
-                _transformLayoutConst(tensor, spatialDims, default_channels_first)
-
+                # Standard case: The weight is a direct constant input.
+                if isinstance(tensor, gs.Constant):
+                    _transformLayoutConst(tensor, spatialDims, default_channels_first)
+
+                # MeZO case: The weight is produced by a Perturb node.
+                elif isinstance(tensor, gs.Variable):
+                    if len(tensor.shape) > 1:
+                        permute_temp = _transformLayoutPermutation(len(tensor.shape), spatialDims, default_channels_first)
+                        graph.nodes.append(_appendTranspose(tensor, node, permute_temp))
         node.attrs["channels_first"] = default_channels_first
 
     return graph

@@ -344,7 +344,7 @@ def has_live_aliases(self, ctxt: NetworkContext) -> bool:
             next = queue.pop()
             buffNext = ctxt.lookup(next)
             assert isinstance(buffNext, VariableBuffer)
-            live |= buffNext._live
+            live |= buffNext._live or (next in ctxt.globalObjects)
             visited.add(next)
             queue |= buffNext.aliases - visited
         return live
@@ -359,7 +359,10 @@ def sizeInBytes(self) -> int:
             Size of this VariableBuffer in bytes
 
         """
-        return (math.prod(self.shape) * (self._type.referencedType.typeWidth)) // 8
+        if isinstance(self.shape, int):
+            return (self.shape * (self._type.referencedType.typeWidth)) // 8
+        else:
+            return (math.prod(self.shape) * (self._type.referencedType.typeWidth)) // 8
 
 
 class TransientBuffer(VariableBuffer):
@@ -1322,6 +1325,10 @@ def typeCheckNodeInputs(self, ctxt: NetworkContext, node: gs.Node) -> bool:
                         reference._instance = _type(inputNode.name, ctxt)
                 else:
                     retCheck &= reference._type.referencedType == _type.referencedType
+
+            # if node.name == "GradientAccumulator1_InPlaceAccumulatorV2_backward" and retCheck == False:
+            #     import IPython; IPython.embed()
+
         return retCheck
 
     def typeInferGlobalCtxt(self, ctxt: NetworkContext, node: gs.Node) -> NetworkContext:
@@ -2099,11 +2106,15 @@ def bind(self, ctxt: NetworkContext) -> Tuple[NetworkContext, bool]:
             # Update shapes and types of tensors in onnx graph based on type inference after binding
             for node in (self.node.inputs + self.node.outputs):
                 if ctxt.is_local(node.name):
+                    if not hasattr(ctxt.localObjects[node.name], '_type'):
+                        continue  # skip untyped buffers (e.g. ReduceSum axes, MaxPool mask)
                     node.shape = ctxt.localObjects[node.name].shape
                     npType = self._broadcastToNpType(ctxt.localObjects[node.name]._type)
                     if npType is not None:
                         node.dtype = npType
                 elif ctxt.is_global(node.name):
+                    if not hasattr(ctxt.globalObjects[node.name], '_type'):
+                        continue  # skip untyped global buffers
                     npType = self._broadcastToNpType(ctxt.globalObjects[node.name]._type)
                     if isinstance(ctxt.globalObjects[node.name], ConstantBuffer):
                         if isinstance(node, gs.Constant):
@@ -2854,6 +2865,12 @@ def generateInferenceInitializationCode(self) -> str:
             if isinstance(node, StructBuffer):
                 continue
 
+            # Skip local buffers that were registered but never typed (e.g. optional ONNX
+            # outputs like the MaxPool indices/mask tensor).  These are not referenced by any
+            # template and must not be emitted as C declarations.
+            if not hasattr(node, '_type'):
+                continue
+
             name = node.name
             node.name = self.ctxt._mangle(node.name)
             callStack += node.init()
@@ -2898,10 +2915,11 @@ def generateIOBufferInitializationCode(self) -> str:
 
         callStack += "static const uint32_t " + self.ctxt._mangle("num_inputs") + f" = {len(inputs)};"
         callStack += "static const uint32_t " + self.ctxt._mangle("num_outputs") + f" = {len(outputs)};"
-
+        callStack += "static const uint32_t seed = 12345;"  # fixed seed for reproducibility
+        callStack += "static const uint32_t perturbation_sign = 1;"  # fixed sign for reproducibility
         callStack += "extern void* " + self.ctxt._mangle("inputs") + f"[{len(inputs)}];"
         callStack += "extern void* " + self.ctxt._mangle("outputs") + f"[{len(outputs)}];"
-
+    
         callStack += "static const uint32_t " + self.ctxt._mangle("inputs_bytes") + f"[{len(inputs)}] = " + "{"
 
         numBytes = []
@@ -2954,6 +2972,8 @@ def generateBufferInitializationCode(self) -> str:
         callStack = ''
         for node in ctxt.globalObjects.values():
             if isinstance(node, VariableBuffer) and not isinstance(node, StructBuffer):
+                if not hasattr(node, '_type'):
+                    continue  # skip untyped buffers (e.g. ReduceSum axes constants)
                 assert issubclass(node._type, Pointer), f"Global VariableBuffer {node.name} is not a Pointer!"
                 if node._deploy:
                     name = node.name
@@ -2999,6 +3019,8 @@ def generateBufferAllocationCode(self) -> str:
 
         for node in ctxt.globalObjects.values():
             if isinstance(node, VariableBuffer) and not isinstance(node, StructBuffer):
+                if not hasattr(node, '_type'):
+                    continue  # skip untyped buffers (e.g. ReduceSum axes constants)
                 assert issubclass(node._type, Pointer), f"Global VariableBuffer {node.name} is not a Pointer!"
                 if node._deploy:
                     name = node.name
@@ -3063,6 +3085,8 @@ def generateIncludeString(self) -> str:
         for engine in self.Platform.engines:
             for include in engine.includeList:
                 includeStr += ["#include \"" + include + "\""]
+            if engine.name == "GAP9Cluster":
+                includeStr += ["#include \"kernel/RandomNoise.h\""]
         return ("\n").join(includeStr)
 
     def generateEngineInitializationCode(self) -> str:
@@ -3124,6 +3148,10 @@ def _exportGraph(self, folderPath, fileName):
             if tensor.dtype != tensor.export_dtype:
                 tensor.values = tensor.values.astype(tensor.export_dtype)
 
+        # JANSNO: Shapes of tensors should never be an int.
+        for tensor in self.graph.tensors().values():
+            if tensor.shape is not None and isinstance(tensor.shape, int):
+                tensor.shape = tensor.shape = [tensor.shape]
         model = gs.export_onnx(self.graph)
 
         # Annotate additional information in doc_string of tensors
@@ -3536,6 +3564,8 @@ def _printMemorySummary(self):
                 if isinstance(_buffer, ConstantBuffer) or (isinstance(_buffer, VariableBuffer) and _buffer._deploy):
                     # SCHEREMO: We only
                     if (hasattr(_buffer, "_memoryLevel") and _buffer._memoryLevel == level) or level == "None":
+                        if not hasattr(_buffer, '_type'):
+                            continue  # skip untyped buffers (e.g. ReduceSum axes constants)
                         staticSize += int((np.prod(_buffer.shape) * _buffer._type.referencedType.typeWidth // 8))
                     else:
                         log.warning(f"Buffer {_buffer.name} does not have a valid memory level")