snitch_cluster/hw/dca_interface/src/dca_fork.sv at ddfa12a1ebbf97a3bb5a82e0c920150d964ecbcb · pulp-platform/snitch_cluster · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
// Copyright 2025 ETH Zurich and University of Bologna.
// Solderpad Hardware License, Version 0.51, see LICENSE for details.
// SPDX-License-Identifier: SHL-0.51

// Author: Luca Colagrande <colluca@iis.ee.ethz.ch>

`include "dca_interface/typedef.svh"

// Forks a wide Direct Compute Access (DCA) request to multiple lanes,
// operating in SIMD fashion.
//
// The wide request on the slave port is split into `NumLanes` narrow requests:
// - the operation flags (rounding mode, op, formats, ...) are replicated to
//   every lane unchanged;
// - each of the three operands is sliced so that lane `i` receives bits
//   [LaneDataWidth*i +: LaneDataWidth] of the wide operand;
// - the lane results are concatenated back in the same order to form the wide
//   result, and the lane status fields are OR-reduced into one status.
//
// Parameters:
// - LaneDataWidth: width in bits of the operands/result of a single lane.
// - NumLanes:      number of lanes the wide request is distributed to.
// - DataWidth (derived): width of the wide operands/result,
//   i.e. LaneDataWidth * NumLanes.
//
// Ports:
// - clk_i, rst_ni: only used by the request stream fork.
// - slv_req_i / slv_rsp_o: wide DCA request/response port.
// - mst_req_o / mst_rsp_i: one narrow DCA request/response port per lane.
module dca_fork #(
  parameter int unsigned LaneDataWidth = 64,
  parameter int unsigned NumLanes = 8,
  // Derived parameters
  localparam int unsigned DataWidth = LaneDataWidth * NumLanes,
  localparam type dca_req_t = `DCA_REQ_STRUCT(DataWidth),
  localparam type dca_rsp_t = `DCA_RSP_STRUCT(DataWidth),
  localparam type dca_lane_req_t = `DCA_REQ_STRUCT(LaneDataWidth),
  localparam type dca_lane_rsp_t = `DCA_RSP_STRUCT(LaneDataWidth)
) (
  input  logic clk_i,
  input  logic rst_ni,
  input  dca_req_t slv_req_i,
  output dca_rsp_t slv_rsp_o,
  output dca_lane_req_t [NumLanes-1:0] mst_req_o,
  input  dca_lane_rsp_t [NumLanes-1:0] mst_rsp_i
);

  // Per-lane handshake signals, flattened into vectors so they can be
  // connected to the stream fork (request channel, `q`) and the stream join
  // (response channel, `p`).
  logic [NumLanes-1:0] flat_q_valids;
  logic [NumLanes-1:0] flat_q_readies;
  logic [NumLanes-1:0] flat_p_valids;
  logic [NumLanes-1:0] flat_p_readies;

  // Fork the DCA request to all lanes
  // NOTE(review): presumably the fork lets lanes accept the request in
  // different cycles and only acknowledges the slave once all lanes have
  // accepted it - confirm against the cc_stream_fork implementation.
  cc_stream_fork #(
    .N_OUP(NumLanes)
  ) i_dca_fork_fpu (
    .clk_i  (clk_i),
    .rst_ni (rst_ni),
    .valid_i(slv_req_i.q_valid),
    .ready_o(slv_rsp_o.q_ready),
    .valid_o(flat_q_valids),
    .ready_i(flat_q_readies)
  );

  // Join the DCA responses from all lanes
  // NOTE(review): presumably the joined response is only valid once every
  // lane presents a valid response - confirm against cc_stream_join.
  cc_stream_join #(
    .N_INP(NumLanes)
  ) i_dca_join_fpu (
    .inp_valid_i(flat_p_valids),
    .inp_ready_o(flat_p_readies),
    .oup_valid_o(slv_rsp_o.p_valid),
    .oup_ready_i(slv_req_i.p_ready)
  );

  for (genvar i = 0; i < NumLanes; i++) begin : gen_lane
    // The same operation flags are sent to all lanes
    assign mst_req_o[i].q.rnd_mode = slv_req_i.q.rnd_mode;
    assign mst_req_o[i].q.op = slv_req_i.q.op;
    assign mst_req_o[i].q.op_mod = slv_req_i.q.op_mod;
    assign mst_req_o[i].q.src_fmt = slv_req_i.q.src_fmt;
    assign mst_req_o[i].q.dst_fmt = slv_req_i.q.dst_fmt;
    assign mst_req_o[i].q.int_fmt = slv_req_i.q.int_fmt;
    assign mst_req_o[i].q.vectorial_op = slv_req_i.q.vectorial_op;
    // Data is split across lanes, to perform SIMD operation (both operands and result)
    assign mst_req_o[i].q.operands[2][LaneDataWidth-1:0] = slv_req_i.q.operands[2][LaneDataWidth*i+:LaneDataWidth];
    assign mst_req_o[i].q.operands[1][LaneDataWidth-1:0] = slv_req_i.q.operands[1][LaneDataWidth*i+:LaneDataWidth];
    assign mst_req_o[i].q.operands[0][LaneDataWidth-1:0] = slv_req_i.q.operands[0][LaneDataWidth*i+:LaneDataWidth];
    assign slv_rsp_o.p.result[LaneDataWidth*i+:LaneDataWidth] = mst_rsp_i[i].p.result[LaneDataWidth-1:0];
    // Connect the handshake signals
    assign mst_req_o[i].q_valid = flat_q_valids[i];
    assign mst_req_o[i].p_ready = flat_p_readies[i];
    assign flat_q_readies[i] = mst_rsp_i[i].q_ready;
    assign flat_p_valids[i] = mst_rsp_i[i].p_valid;
  end

  // OR-reduce the status bits from all lanes
  // TODO(colluca): double-check that this is actually a bitwise OR
  // The loop must cover every lane, including lane NumLanes-1: stopping one
  // lane early would drop the last lane's status bits (and would yield a
  // constant-zero status when NumLanes == 1).
  always_comb begin
    slv_rsp_o.p.status = '0;
    for (int unsigned i = 0; i < NumLanes; i++) begin
      slv_rsp_o.p.status |= mst_rsp_i[i].p.status;
    end
  end

endmodule