33@Author :JohsuaWu1997
44@Date :2020/1/30
55"""
6- import numpy as np
76import torch
87
98from actor_critic import Actor , Critic
109from ou_noise import OUNoise
1110
# Device on which this module allocates its tensors. Falls back to the CPU
# when no CUDA device is available so the module stays usable without a GPU;
# the name `cuda` is kept because the rest of the file refers to it.
cuda = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Discount factor applied to the target critic's Q-value when the training
# target y = reward + GAMMA * Q' is built in train_on_batch.
GAMMA = 0.9999999993340943687843739933894
1714
@@ -35,7 +32,6 @@ def __init__(self, env, time_steps, hidden_dim):
3532 self .time_dim = time_steps
3633 self .state_dim = env .observation_space .shape [1 ]
3734 self .action_dim = env .action_space .shape [0 ]
38- print (self .state_dim ,self .action_dim ,self .time_dim )
3935 self .batch_size = 64
4036 self .memory_size = self .time_dim + self .batch_size * 10
4137 self .start_size = self .time_dim + self .batch_size * 2
@@ -51,18 +47,16 @@ def __init__(self, env, time_steps, hidden_dim):
5147 self .replay_reward = torch .zeros ((self .start_size - 1 ,), device = cuda )
5248
5349 # Initialize a random process the Ornstein-Uhlenbeck process for action exploration
54- self .exploration_noise = OUNoise (self .action_dim , sigma = 0.05 / self .action_dim )
50+ self .exploration_noise = OUNoise (self .action_dim , sigma = 0.01 / self .action_dim )
5551 self .initial ()
5652
def initial(self):
    """Reset the step counter, the last action and the replay buffers.

    Every buffer is re-allocated as zeros with ``start_size - 1`` rows on
    the module-level ``cuda`` device, discarding anything stored so far.
    """
    self.steps = 0
    # The last action lives on the same device as the buffers.
    self.action = torch.zeros(self.action_dim, device=cuda)

    warmup_rows = self.start_size - 1
    # States are stored as (row, 3, state_dim).
    self.replay_state = torch.zeros((warmup_rows, 3, self.state_dim), device=cuda)
    self.replay_next_state = torch.zeros((warmup_rows, 3, self.state_dim), device=cuda)
    # NOTE(review): the action buffer is sized with state_dim, not action_dim;
    # presumably the two are equal in this environment -- confirm.
    self.replay_action = torch.zeros((warmup_rows, self.state_dim), device=cuda)
    self.replay_reward = torch.zeros((warmup_rows,), device=cuda)
6660
6761 def train_on_batch (self ):
6862 # Sample a random minibatch of N transitions from replay buffer
@@ -84,18 +78,14 @@ def train_on_batch(self):
8478 reward_batch = torch .index_select (self .replay_reward , 0 , sample )
8579
8680 # Calculate y_batch
87- q_batch = self .critic_network .target_q (
88- self .actor_network .target_action (next_state_batch ), next_state_batch
89- )
81+ next_action_batch = self .actor_network .target_action (next_state_batch )
82+ q_batch = self .critic_network .target_q (next_action_batch , next_state_batch )
9083 y_batch = torch .add (reward_batch , q_batch , alpha = GAMMA ).view (- 1 , 1 )
9184
92- # train critic by minimizing the loss L
93- self .critic_network .train (y_batch , action_batch , state_batch )
94-
95- # train actor by target loss
85+ # train actor-critic by target loss
9686 self .actor_network .train (
97- self .critic_network .critic_loss (
98- self . actor_network . actor_action ( state_batch ) , state_batch
87+ self .critic_network .train (
88+ y_batch , action_batch , state_batch
9989 )
10090 )
10191
@@ -104,25 +94,21 @@ def train_on_batch(self):
10494 self .critic_network .update_target ()
10595
def perceive(self, state, action, reward, next_state, done):
    """Store one transition in the replay buffers and advance ``self.steps``.

    While fewer than ``start_size - 1`` transitions have been stored, the
    transition is written into the pre-allocated row ``self.steps``. After
    that each call appends one row; once ``self.steps`` has reached
    ``memory_size`` the oldest row is dropped before appending, so the
    buffers stop growing.

    Args:
        state: Observation before the action; one row of ``replay_state``.
        action: Action taken; one row of ``replay_action``.
        reward: Scalar reward, given as a Python number or as a 0-d or
            one-element tensor.
        next_state: Observation after the action; one row of
            ``replay_next_state``.
        done: Unused; kept so the call signature stays unchanged.
    """

    def _as_row(value, buffer):
        # Coerce to the buffer's dtype and device and to shape
        # (1, *row_shape), so torch.cat neither promotes the buffer dtype
        # nor fails on a plain number or a tensor on another device.
        row = torch.as_tensor(value, dtype=buffer.dtype, device=buffer.device)
        return row.reshape((1,) + tuple(buffer.shape[1:]))

    if self.steps < self.start_size - 1:
        # Warm-up phase: fill the pre-allocated rows in place.
        self.replay_state[self.steps] = state
        self.replay_next_state[self.steps] = next_state
        self.replay_action[self.steps] = action
        self.replay_reward[self.steps] = reward
    else:
        # Drop the oldest row only once memory_size steps have been taken.
        drop = 1 if self.steps >= self.memory_size else 0
        self.replay_state = torch.cat(
            (self.replay_state[drop:], _as_row(state, self.replay_state)), dim=0
        )
        self.replay_next_state = torch.cat(
            (self.replay_next_state[drop:], _as_row(next_state, self.replay_next_state)), dim=0
        )
        self.replay_action = torch.cat(
            (self.replay_action[drop:], _as_row(action, self.replay_action)), dim=0
        )
        self.replay_reward = torch.cat(
            (self.replay_reward[drop:], _as_row(reward, self.replay_reward)), dim=0
        )
    self.steps += 1
127113
128114 def act (self , next_state , portfolio ):
@@ -131,11 +117,12 @@ def act(self, next_state, portfolio):
131117 next_amount_data = min_max_scale (self .replay_next_state [:, 2 , :])[- 1 ].view (1 , - 1 )
132118 next_state_data = torch .cat ([next_state_data , next_amount_data ], dim = 1 )
133119 self .train_on_batch ()
134- allocation = self .actor_network .target_action (next_state_data ).cpu ().data .numpy ().ravel ()
120+ allocation = self .actor_network .target_action (next_state_data ).data .view (- 1 )
121+ allocation += torch .tensor (self .exploration_noise .noise ().tolist (), device = cuda )
135122 allocation [allocation < 0 ] = 0
136123 allocation /= sum (allocation )
137- allocation = np .floor (
124+ allocation = torch .floor (
138125 portfolio * allocation / next_state [1 , :] / self .unit
139126 ) * self .unit
140127 self .action = allocation
141- return np . array ( self .action )
128+ return self .action . clone ( )
0 commit comments