diff --git a/Deeploy/Targets/Neureka/Engine.py b/Deeploy/Targets/Neureka/Engine.py index 2585b1a688..1f3210ea25 100644 --- a/Deeploy/Targets/Neureka/Engine.py +++ b/Deeploy/Targets/Neureka/Engine.py @@ -31,11 +31,14 @@ ConvLayer([NeurekaPWConv2DMapper, NeurekaDWConv2DMapper, NeurekaDenseConv2DMapper]), } -_includeList = ["pulp_nnx_neureka.h", "pulp_nnx_util.h", "neureka_siracusa_bsp.h", "neureka.h", "neureka_task.h"] +_includeList = [ + "pulp_nnx_neureka.h", "pulp_nnx_util.h", "neureka_siracusa_bsp.h", "neureka.h", "neureka_task.h", "neureka_gvsoc.h" +] _neurekaInitCode = r""" neureka_siracusa_conf_t conf = {.max_stall = 8}; neureka_nnx_init(neureka_siracusa_get_dev(), &conf); +neureka_gvsoc_log_activate(neureka_siracusa_get_dev(), NEUREKA_GVSOC_LOG_LEVEL_ALL, NEUREKA_GVSOC_LOG_FORMAT_HEXADECIMAL); """ diff --git a/Deeploy/Targets/Neureka/Parsers.py b/Deeploy/Targets/Neureka/Parsers.py index 3c564c10b2..df50f4de8d 100644 --- a/Deeploy/Targets/Neureka/Parsers.py +++ b/Deeploy/Targets/Neureka/Parsers.py @@ -50,14 +50,18 @@ def parseNodeCtxt(self, # and enforcing that the channels_first is false data_in = newCtxt.lookup(self.operatorRepresentation['data_in']) data_out = newCtxt.lookup(self.operatorRepresentation['data_out']) - weight = newCtxt.lookup(self.operatorRepresentation['weight']) + # MARCHIOA: weight depends on the type of convolution so it requires to be parsed by the child parsers + # - PW -> 3-dim + # - DW -> 4-dim + # - Dense -> 4-dim + # weight = newCtxt.lookup(self.operatorRepresentation['weight']) if not all([ channels_first == False, len(data_in.shape) == 4, - # LMACAN: weight shape should be equal to 3 because we have to do the neureka's - # special weight encoding - len(weight.shape) == 3, + # # LMACAN: weight shape should be equal to 3 because we have to do the neureka's + # # special weight encoding + # len(weight.shape) == 3, ]): return newCtxt, False @@ -83,18 +87,36 @@ def parseNode(self, node: gs.Node) -> bool: if not super().parseNode(node): 
return False - ch_im_out = node.inputs[1].shape[0] - ch_im_in = node.inputs[1].shape[1] + weights = node.inputs[1] + # weights reshaped by the weights encoder into + # (cout, cinMajor, bits, weightBandwidthBytes) + # where: + # - cout: 1 by definition (it is cin from ONNX) + # - cinMajor: number of tiles over the channels + # - bits: weight bit width (only 8 is supported) + # - weightBandwidthBytes: which is 32 in Siracusa if not all([ self.operatorRepresentation['kernel_shape'] == [3, 3], - self.operatorRepresentation['group'] == ch_im_out, - self.operatorRepresentation['group'] == ch_im_in, + len(weights.shape) == 4, + weights.shape[0] == 1, # ch_im_out ]): return False return True + def parseNodeCtxt(self, ctxt, node, channels_first = True): + + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + if not ret: + return newCtxt, False # keep the (ctxt, bool) pair that callers unpack + + weight = newCtxt.lookup(self.operatorRepresentation['weight']) + if not (len(weight.shape) == 4): + return newCtxt, False + + return newCtxt, True + class NeurekaRQSDWConv2DParser(NeurekaDWConv2DParser, RQSParserInterface): @@ -136,6 +158,18 @@ def parseNode(self, node: gs.Node) -> bool: return True + def parseNodeCtxt(self, ctxt, node, channels_first = True): + + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + if not ret: + return newCtxt, False # keep the (ctxt, bool) pair that callers unpack + + weight = newCtxt.lookup(self.operatorRepresentation['weight']) + if not (len(weight.shape) == 3): + return newCtxt, False + + return newCtxt, True + class NeurekaRQSPWConv2DParser(NeurekaPWConv2DParser, RQSParserInterface): @@ -176,6 +210,18 @@ def parseNode(self, node: gs.Node) -> bool: return True + def parseNodeCtxt(self, ctxt, node, channels_first = True): + + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + if not ret: + return newCtxt, False # keep the (ctxt, bool) pair that callers unpack + + weight = newCtxt.lookup(self.operatorRepresentation['weight']) + if not (len(weight.shape) == 4): + return newCtxt, False + + return newCtxt, True + class NeurekaRQSDenseConv2DParser(NeurekaDenseConv2DParser, RQSParserInterface): diff 
--git a/Deeploy/Targets/Neureka/Templates/ConvTemplate.py b/Deeploy/Targets/Neureka/Templates/ConvTemplate.py index 97253d6e12..04968cf905 100644 --- a/Deeploy/Targets/Neureka/Templates/ConvTemplate.py +++ b/Deeploy/Targets/Neureka/Templates/ConvTemplate.py @@ -256,12 +256,12 @@ def getCounters( operatorRepresentation: OperatorRepresentation) -> Tuple[int, int, int, int, int, int, int, int, int, int]: _ = operatorRepresentation # operatorRepresentation not accessed for now because it's just for pointwise kernels - n_channel_out_subtiles = _getNumTiles(channel_out, 28) + n_channel_out_subtiles = _getNumTiles(channel_out, 32) n_channel_in_subtiles = _getNumTiles(channel_in, 28) n_height_out_subtiles = _getNumTiles(height_out, 6) n_width_out_subtiles = _getNumTiles(width_out, 6) - channel_out_border = _getBorderTileSize(channel_out, 28) + channel_out_border = _getBorderTileSize(channel_out, 32) channel_in_border = _getBorderTileSize(channel_in, 28) height_out_border = _getBorderTileSize(height_out, 6) width_out_border = _getBorderTileSize(width_out, 6) diff --git a/Deeploy/Targets/Neureka/TileConstraints/NeurekaDenseConstraint.py b/Deeploy/Targets/Neureka/TileConstraints/NeurekaDenseConstraint.py index 814024a877..c31474bcc9 100644 --- a/Deeploy/Targets/Neureka/TileConstraints/NeurekaDenseConstraint.py +++ b/Deeploy/Targets/Neureka/TileConstraints/NeurekaDenseConstraint.py @@ -218,7 +218,9 @@ def serializeTilingSolution( replacementTypes['weight_addr_offset'] = PointerClass(uint32_t) for absoluteCube in absoluteOutputCubes: COffset, CSize = absoluteCube.absoluteOffset[-1], absoluteCube.rectangle.dims[-1] - WeightCube = HyperRectangle((COffset, 0, 0), (CSize, weightShape[-2], weightShape[-1])) + # WeightCube = HyperRectangle((COffset, 0, 0), (CSize, weightShape[-2], weightShape[-1])) + WeightCube = HyperRectangle((COffset, 0, 0, 0), + (CSize, weightShape[-3], weightShape[-2], weightShape[-1])) 
replacements['weight_addr_offset'].append(calculateFlatOffsetInBytes(WeightCube, weightBuffer)) else: inputWeightBaseOffsets, outputWeightBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, @@ -228,7 +230,8 @@ def serializeTilingSolution( for cube, load in zip(outputCubes, inputLoadSchedule): COffset, CSize = cube.offset[-1], cube.dims[-1] - load['weight'] = HyperRectangle((COffset, 0, 0), (CSize, weightShape[-2], weightShape[-1])) + load['weight'] = HyperRectangle((COffset, 0, 0, 0), + (CSize, weightShape[-3], weightShape[-2], weightShape[-1])) tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) diff --git a/Deeploy/Targets/Neureka/TileConstraints/NeurekaDepthwiseConstraint.py b/Deeploy/Targets/Neureka/TileConstraints/NeurekaDepthwiseConstraint.py index fd5d791119..660fc08c4e 100644 --- a/Deeploy/Targets/Neureka/TileConstraints/NeurekaDepthwiseConstraint.py +++ b/Deeploy/Targets/Neureka/TileConstraints/NeurekaDepthwiseConstraint.py @@ -49,11 +49,7 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw tilerModel.addConstraint(outputBatchVar == inputBatchVar) tilerModel.addConstraint(outputChannelVar == inputChannelVar) - weightBuffer = ctxt.lookup(weightBufferName) - if hasattr(weightBuffer, "_memoryLevel") and weightBuffer._memoryLevel == "WeightMemory_SRAM": - tilerModel.addConstraint(weightOutChannelVar == weightOutChannelVar.Max()) - else: - tilerModel.addConstraint(weightOutChannelVar == outputChannelVar) + tilerModel.addConstraint(weightOutChannelVar == weightOutChannelVar.Max()) tilerModel.addConstraint(inputHeightVar >= 3) tilerModel.addConstraint(inputWidthVar >= 3) @@ -214,7 +210,8 @@ def serializeTilingSolution( replacementTypes['weight_addr_offset'] = PointerClass(uint32_t) for absoluteCube in absoluteOutputCubes: COffset, CSize = absoluteCube.absoluteOffset[-1], 
absoluteCube.rectangle.dims[-1] - WeightCube = HyperRectangle((COffset, 0, 0), (CSize, weightShape[-2], weightShape[-1])) + WeightCube = HyperRectangle((COffset, 0, 0, 0), + (CSize, weightShape[-3], weightShape[-2], weightShape[-1])) replacements['weight_addr_offset'].append(calculateFlatOffsetInBytes(WeightCube, weightBuffer)) else: inputWeightBaseOffsets, outputWeightBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, @@ -223,8 +220,7 @@ def serializeTilingSolution( outputBaseOffsets.update(outputWeightBaseOffsets) for cube, load in zip(outputCubes, inputLoadSchedule): - COffset, CSize = cube.offset[-1], cube.dims[-1] - load['weight'] = HyperRectangle((COffset, 0, 0), (CSize, weightShape[-2], weightShape[-1])) + load['weight'] = HyperRectangle((0,) * len(weightShape), tuple(weightShape)) tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) diff --git a/Deeploy/Targets/Neureka/TopologyOptimizationPasses/Passes.py b/Deeploy/Targets/Neureka/TopologyOptimizationPasses/Passes.py index 84e0565b97..5f33d3ef95 100644 --- a/Deeploy/Targets/Neureka/TopologyOptimizationPasses/Passes.py +++ b/Deeploy/Targets/Neureka/TopologyOptimizationPasses/Passes.py @@ -34,9 +34,6 @@ def _weightEncode(weight: npt.NDArray[np.uint8], bits: int, depthwise: bool = Fa _NEUREKA_CIN_SUBTILE_1x1 = 32 _NEUREKA_CIN_SUBTILE_3x3 = 28 - if depthwise: - weight = weight.transpose(1, 0, 2, 3) # Swap cout and cin - cout, cin, height, width = weight.shape cinSubtile = (_NEUREKA_CIN_SUBTILE_3x3 if height == 3 else _NEUREKA_CIN_SUBTILE_1x1) diff --git a/DeeployTest/Tests/Kernels/Integer/Conv/Regular_3x3_RQ/inputs.npz b/DeeployTest/Tests/Kernels/Integer/Conv/Regular_3x3_RQ/inputs.npz new file mode 100644 index 0000000000..0d9fc0d791 Binary files /dev/null and b/DeeployTest/Tests/Kernels/Integer/Conv/Regular_3x3_RQ/inputs.npz differ diff --git 
a/DeeployTest/Tests/Kernels/Integer/Conv/Regular_3x3_RQ/network.onnx b/DeeployTest/Tests/Kernels/Integer/Conv/Regular_3x3_RQ/network.onnx new file mode 100644 index 0000000000..5c3e85a2dc Binary files /dev/null and b/DeeployTest/Tests/Kernels/Integer/Conv/Regular_3x3_RQ/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/Integer/Conv/Regular_3x3_RQ/outputs.npz b/DeeployTest/Tests/Kernels/Integer/Conv/Regular_3x3_RQ/outputs.npz new file mode 100644 index 0000000000..2585f5cf2c Binary files /dev/null and b/DeeployTest/Tests/Kernels/Integer/Conv/Regular_3x3_RQ/outputs.npz differ diff --git a/DeeployTest/testUtils/deeployRunner.py b/DeeployTest/testUtils/deeployRunner.py index 71b056e9df..08b64855df 100644 --- a/DeeployTest/testUtils/deeployRunner.py +++ b/DeeployTest/testUtils/deeployRunner.py @@ -238,6 +238,10 @@ def create_config_from_args(args: argparse.Namespace, gen_args_list.append(f"--searchStrategy={args.searchStrategy}") if hasattr(args, 'plotMemAlloc') and args.plotMemAlloc: gen_args_list.append("--plotMemAlloc") + if hasattr(args, 'enable_3x3') and args.enable_3x3: + gen_args_list.append("--enable-3x3") + if hasattr(args, 'neureka_wmem') and args.neureka_wmem: + gen_args_list.append("--neureka-wmem") if not tiling and getattr(args, 'profileUntiled', False): gen_args_list.append("--profileUntiled")