PyTorchTNN/EAD_example.py at main · neuroagents-lab/PyTorchTNN · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import torch
from torch import nn
from pt_tnn.temporal_graph import TemporalGraph
from torch.nn.modules.transformer import MultiheadAttention
import einops


class TNNEncoder(nn.Module):
    def __init__(self, n_times):
        super(TNNEncoder, self).__init__()
        self.TG = TemporalGraph(
            model_config_file="configs/test_resnet18_bn_timevary.json",
            input_shape=[3, 224, 224],
            num_timesteps=n_times,
        )
        self.n_time = n_times

    def forward(self, x):
        return self.TG(x, n_times=self.n_time, return_all=True)


class SelfAttention(nn.Module):
    def __init__(self, embed_dim, num_heads, dropout=0.0):
        super().__init__()
        self.embed_dim = embed_dim
        self.attn = MultiheadAttention(
            embed_dim=embed_dim, num_heads=num_heads, dropout=dropout, batch_first=True
        )
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, x):
        # x shape: (batch_size, seq_len, embed_dim)
        attn_output, _ = self.attn(
            query=x, key=x, value=x
        )  # self-attention: query=key=value=x
        out = self.norm(x + attn_output)  # Add & Norm
        return out


class LazyLinearDecoder(nn.Module):
    def __init__(self, out_features):
        super(LazyLinearDecoder, self).__init__()
        self.lin = nn.LazyLinear(out_features=out_features)

    def forward(self, x):
        x = einops.rearrange(x, "b t d -> b (t d)")
        return self.lin(x)


class EncAttDec(nn.Module):
    def __init__(self, out_features, n_times=10):
        super(EncAttDec, self).__init__()

        self.encoder = TNNEncoder(n_times=n_times)
        # maps (bs, t, C, H, W) ---> (bs, t, d) or (bs, C, H, W) ---> (bs, d)
        self.attender = SelfAttention(
            embed_dim=1000, num_heads=8  # the output dimension of ResNet18 is 1000
        )
        # maps (bs, t, d) ---> (bs, t, d) or identity [e.g., (bs, d) ---> (bs, d)]
        self.decoder = LazyLinearDecoder(out_features=out_features)
        # maps (bs, t, d) ---> (bs, num_classes) or identity [e.g., (bs, d) ---> (bs, d)]

    def forward(
        self,
        x,
    ):
        # encode, (bs, t, C, H, W) ---> (bs, t, d) or (bs, C, H, W) ---> (bs, d)
        x = self.encoder(x)
        # attend, (bs, t, d) ---> (bs, t, d) or identity [e.g., (bs, d) ---> (bs, d)]
        x = self.attender(x)
        # decode/predict, (bs, t, d) ---> (bs, num_classes) or identity [e.g., (bs, d) ---> (bs, d)]
        pred = self.decoder(x)

        return pred

if __name__ == "__main__":
    bs, T = 3, 10
    random_input = torch.rand(bs, T, 3, 224, 224)
    out_features = 100


    model = EncAttDec(
        out_features=out_features,  # output dimension
        n_times=T,  # number of unrolling steps in the TNN encoder
    )
    output = model(random_input)

    print(output.shape)  # (bs, out_features)