# -*- coding: utf-8 -*-
"""
Created on Sun Dec 30 16:20:39 2018
@author: orrivlin
"""
import torch
import torch.nn.functional as F
from copy import deepcopy as dc
from lightning_gym.GCN import GCN
from lightning_gym.Logger import Logger
import numpy as np


class DiscreteActorCritic:
    def __init__(self, problem, cuda_flag=False, load_model=False, **kwargs):
        self.problem = problem  # the environment the agent interacts with
        self.cuda = cuda_flag
        self._load_model = load_model  # whether to start from previously saved weights
        self.path = 'mvc_net.pt'

        # hyperparameters
        self.in_feats = kwargs.get("ndim", 4)  # number of node features; equal to the length of x in BTWN.py
        self.n_hidden = kwargs.get("hdim", 256)
        self.gamma = kwargs.get("gamma", 1)
        # gamma is the discount factor used when per-step rewards are turned into returns:
        # with gamma < 1, rewards collected later in an episode count for less, while
        # gamma = 1 (the default here) weights every step's reward equally.
        self.learning_rate = kwargs.get("lr", 0.001)
        self.num_episodes = 1  # episodes collected per call to train()
        self._test = kwargs.get("test", False)

        # create the actor-critic model (a graph convolutional network)
        self.model = GCN(self.in_feats, self.n_hidden, self.n_hidden, n_layers=3, activation=F.rrelu)
        if self._load_model:  # optionally warm-start from saved weights
            self.load_model()
        if cuda_flag:
            self.model = self.model.cuda()
        # Adam optimizer updates the model parameters during learning
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)

        # logs information about trials/networks
        # self.log = Logger()
        # self.log.add_log('tot_return')
        # self.log.add_log('TD_error')
        # self.log.add_log('entropy')
        # self.log.add_log('gains')

    def print_actor_configuration(self):
        print("\tLoad model: {}".format(self._load_model),
              "Learning Rate: {}".format(self.learning_rate),
              sep="\n\t")

    def run_episode(self):  # one episode is analogous to one training epoch
        done = False
        state = self.problem.reset()  # get the initial state by resetting the environment
        # Illegal actions are the nodes we already have an edge to
        [illegal_actions, _] = self.problem.get_illegal_actions()

        # per-step records collected over the episode
        PI = torch.empty(0)  # probabilities of the chosen actions (policy head)
        R = torch.empty(0)   # rewards
        V = torch.empty(0)   # state-value estimates (critic head)

        while not done:  # until the edge budget is exhausted
            G = state  # the graph (or subgraph, depending on the problem)
            if self.cuda:  # move node features to the GPU if one is available
                G.ndata['features'] = G.ndata['features'].cuda()

            # forward pass: policy logits and value estimate for the current graph
            [pi, val] = self.model(G)

            # remove the dimensions of size one from the policy logits
            pi = pi.squeeze()
            # Mask illegal actions (e.g. neighbors we are already connected to) by setting
            # their logits to -inf, so softmax assigns them zero probability.
            pi[illegal_actions] = -float('Inf')
            pi = F.softmax(pi, dim=0)  # turn logits into a probability distribution

            # wrap the probabilities in a categorical distribution over actions
            dist = torch.distributions.categorical.Categorical(pi)
            if self._test:  # at evaluation time, act greedily
                action = dist.probs.argmax()
            else:  # during training, sample to keep exploring
                action = dist.sample()

            # take the action in the environment
            new_state, reward, done, _ = self.problem.step(action.item())
            [illegal_actions, _] = self.problem.get_illegal_actions()
            state = new_state

            # collect the outputs needed for learning (torch.cat appends to a tensor)
            PI = torch.cat([PI, pi[action].unsqueeze(0)], dim=0)  # probability of the chosen action
            R = torch.cat([R, reward.unsqueeze(0)], dim=0)  # the reward received
            V = torch.cat([V, val.unsqueeze(0)], dim=0)  # the critic's value estimate
            # A = torch.cat([A, action.unsqueeze(0)], dim=0)

        tot_return = R.sum().item()  # undiscounted return of the whole episode
        # self.log.add_item('tot_return', tot_return)
        # self.log.add_item('gains', np.flip(R.numpy()))

        # Convert per-step rewards into discounted rewards-to-go, working backwards:
        # each entry becomes its own reward plus gamma times the entry that follows it.
        for i in range(R.shape[0] - 1):
            R[-2 - i] = R[-2 - i] + self.gamma * R[-1 - i]
        return PI, R, V, tot_return
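
    # A worked illustration of the discounting loop above (a comment only, not executed):
    # with gamma = 1 and per-step rewards R = [1.0, 0.0, 2.0], the backward pass rewrites R
    # in place as [3.0, 2.0, 2.0], i.e. each entry becomes the reward-to-go from that step.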

    def update_model(self, PI, R, V):
        # R = (R - R.mean()) / (R.std() + 0.00001)
        self.optimizer.zero_grad()  # clear stale gradients before the backward pass
        if self.cuda:  # keep R on the same device as the network outputs
            R = R.cuda()
        # Advantage: how much better the observed return was than the critic's estimate.
        # detach() keeps the advantage from back-propagating into the value head here.
        A = R.squeeze() - V.squeeze().detach()
        L_policy = -(torch.log(PI) * A).mean()  # policy-gradient (actor) loss
        L_value = F.smooth_l1_loss(V.squeeze(), R.squeeze())  # critic regression loss
        L_entropy = -(PI * PI.log()).mean()  # entropy of the chosen-action probabilities
        L = L_policy + L_value  # - 0.1 * L_entropy
        L.backward()
        self.optimizer.step()
        self.problem.r_logger.add_log('td_error', L_value.detach().item())
        self.problem.r_logger.add_log('entropy', L_entropy.cpu().detach().item())

    def train(self):
        # Collect num_episodes episodes, then perform a single model update on the batch.
        [PI, R, V, _] = self.run_episode()  # each episode loads a new graph/subgraph
        for i in range(self.num_episodes - 1):  # num_episodes defaults to 1, so this loop is usually skipped
            [pi, r, v, _] = self.run_episode()
            # append this episode's results to the running batch
            PI = torch.cat([PI, pi], dim=0)
            R = torch.cat([R, r], dim=0)
            V = torch.cat([V, v], dim=0)
        self.update_model(PI, R, V)
        return self.problem.r_logger

    def test(self):
        # run a single episode in evaluation mode, without updating the model
        [_, _, _, _] = self.run_episode()
        # return self.log

    def save_model(self):  # persist the learned weights
        torch.save(self.model.state_dict(), self.path)

    def load_model(self):
        self.model.load_state_dict(torch.load(self.path))
        self.model.eval()
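
# A minimal usage sketch, kept as comments because the environment class is an assumption:
# `problem` stands in for whichever lightning_gym environment exposes reset(), step(action),
# get_illegal_actions(), and an `r_logger` attribute, as DiscreteActorCritic requires above.
#
#     problem = make_environment()                    # hypothetical factory for the env
#     agent = DiscreteActorCritic(problem, cuda_flag=False, lr=0.001, gamma=1)
#     agent.print_actor_configuration()
#     for episode in range(100):                      # one model update per call to train()
#         log = agent.train()
#     agent.save_model()                              # weights are written to mvc_net.pt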