Soccer-Environment-Multi-Agent-RL/ddpg.py at master · AkshayS21/Soccer-Environment-Multi-Agent-RL · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119

# coding: utf-8

# In[2]:


import model
import importlib
importlib.reload(model)
from OUNoise import OUNoise

import torch
import torch.nn as nn
import torch.nn.functional as f
import torch.optim as optim


import numpy as np


# Module-level defaults for the problem dimensions. The ddpg class below takes
# its own state_size/action_size arguments, so these two names only matter to
# code that reads them from this module.
state_size = 24
action_size = 2

# Adam learning rates for the actor and critic optimizers (same value for both).
lr_act = lr_crt = 1e-3

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")


class ddpg():
    """Actor/critic networks, optimizers and exploration noise for one agent.

    The agent owns a local and a target copy of both an actor and a critic
    (all built from ``model.Network``), one Adam optimizer per local network,
    and an ``OUNoise`` process used to perturb actions during exploration.
    """

    def __init__(self, state_size, action_size, h1, h2, h3, n_agents=2,
                 tau=1e-3, lr_actor=None, lr_critic=None):
        """Build the four networks, their optimizers and the noise process.

        Args:
            state_size: input width of the actor network.
            action_size: output width of the actor network.
            h1, h2, h3: hidden-layer sizes forwarded to ``model.Network`` for
                both the actor and the critic.
            n_agents: number of agents; the critic input width is
                ``n_agents * (state_size + action_size)`` (presumably the
                concatenated observations and actions of all agents --
                confirm against the training loop).
            tau: interpolation factor used by ``soft_update``.
            lr_actor: actor learning rate; ``None`` uses module-level ``lr_act``.
            lr_critic: critic learning rate; ``None`` uses module-level ``lr_crt``.
        """
        self.action_size = action_size
        self.tau = tau

        actor_lr = lr_act if lr_actor is None else lr_actor
        critic_lr = lr_crt if lr_critic is None else lr_critic

        actor_kwargs = dict(input_dim=state_size, h1=h1, h2=h2, h3=h3,
                            output_dim=action_size, actor=True)
        self.actor_local = model.Network(**actor_kwargs).to(device)
        self.actor_target = model.Network(**actor_kwargs).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=actor_lr)

        critic_input = n_agents * (state_size + action_size)
        critic_kwargs = dict(input_dim=critic_input, h1=h1, h2=h2, h3=h3,
                             output_dim=1)
        self.critic_local = model.Network(**critic_kwargs).to(device)
        self.critic_target = model.Network(**critic_kwargs).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=critic_lr)

        self.noise = OUNoise(action_size, scale=1.0)

        # Start every target network as an exact copy of its local network.
        self.hard_update(self.actor_target, self.actor_local)
        self.hard_update(self.critic_target, self.critic_local)

    def _infer(self, network, state):
        """Run ``network`` on ``state`` in eval mode and return a numpy array.

        The network's previous train/eval mode is restored afterwards (even
        if the forward pass raises), so querying an action never changes the
        mode the caller had selected.
        """
        was_training = network.training
        network.eval()
        try:
            with torch.no_grad():
                action = network(state).detach().cpu().numpy()
        finally:
            network.train(was_training)
        return action

    def local_act(self, state, noise=None, rand=None):
        """Return the local actor's action for a numpy ``state``, clipped to [-1, 1].

        Args:
            state: numpy array; converted to a float tensor on ``device``.
            noise: optional scale applied to a sample from ``self.noise`` that
                is added to the action; ``None`` disables it.
            rand: optional blend weight; the result becomes
                ``(1 - rand) * action + rand * u`` where ``u`` has
                ``action_size`` entries drawn uniformly from [-1, 1).
                ``None`` disables it.

        Returns:
            numpy array of actions clipped to the range [-1, 1].
        """
        state = torch.from_numpy(state).float().to(device)
        action = self._infer(self.actor_local, state)

        if noise is not None:
            # In-place add keeps the dtype of the network output.
            action += noise * self.noise.noise()
        if rand is not None:
            centered = np.random.rand(self.action_size) - 0.5
            action = (1 - rand) * action + rand * centered * 2.0

        return np.clip(action, -1, 1)

    def local_act2(self, state):
        """Return the local actor's raw (unclipped, noise-free) action.

        ``state`` is handed to the network unchanged, so it must already be
        an input the network accepts (presumably a tensor on the network's
        device -- confirm against callers).
        """
        return self._infer(self.actor_local, state)

    def target_act(self, state):
        """Return the target actor's raw (unclipped, noise-free) action.

        ``state`` is handed to the network unchanged, so it must already be
        an input the network accepts (presumably a tensor on the network's
        device -- confirm against callers).
        """
        return self._infer(self.actor_target, state)

    def hard_update(self, target, source):
        """Copy every parameter of ``source`` into ``target`` in place.

        Only ``parameters()`` are copied; module buffers are left untouched.
        """
        for target_params, source_params in zip(target.parameters(), source.parameters()):
            target_params.data.copy_(source_params.data)

    def soft_update(self, target, local, tau=None):
        """Move ``target`` parameters toward ``local`` by a factor ``tau``.

        Each target parameter becomes ``(1 - tau) * target + tau * local``.

        Args:
            target: network whose parameters are updated in place.
            local: network providing the new values.
            tau: interpolation factor; ``None`` uses ``self.tau``.

        Only ``parameters()`` are blended; module buffers are left untouched.
        """
        if tau is None:
            tau = self.tau
        for target_params, local_params in zip(target.parameters(), local.parameters()):
            blended = (1 - tau) * target_params.data + (tau * local_params.data)
            target_params.data.copy_(blended)

    def reset_action(self):
        """Reset the exploration noise process."""
        self.noise.reset()