Skip to content

Commit 6bc7a4c

Browse files
committed
commit on master
1 parent 9d07b26 commit 6bc7a4c

4 files changed

Lines changed: 91 additions & 106 deletions

File tree

DDPG-agent.py

Lines changed: 12 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -12,9 +12,9 @@
1212

1313
cuda = torch.device('cuda')
1414

15-
raw_amount = pd.read_csv('./sh000016/i_amount.csv', header=0, index_col=0).values
16-
raw_buy = pd.read_csv('./sh000016/o_buy.csv', header=0, index_col=0).values
17-
raw_sell = pd.read_csv('./sh000016/o_sell.csv', header=0, index_col=0).values
15+
raw_amount = pd.read_csv('../sh000016/i_amount.csv', header=0, index_col=0).values
16+
raw_buy = pd.read_csv('../sh000016/o_buy.csv', header=0, index_col=0).values
17+
raw_sell = pd.read_csv('../sh000016/o_sell.csv', header=0, index_col=0).values
1818

1919
START = 10441
2020
END = 13899
@@ -30,36 +30,38 @@ def scale(data):
3030

3131
def train(Train_Env, Epoch):
3232
agent = DDPG(train_env, lb, node)
33+
save_iter = [1, 2, 5, 10, 20, 30, 50, 100, 150, 200]
3334
for t in range(Epoch):
3435
print('epoch:', t)
3536
state, done = Train_Env.reset(), False
36-
agent.initial()
3737
while not done:
3838
action = agent.act(state, Train_Env.portfolio)
3939
next_state, reward, done, _ = Train_Env.step(action)
4040
agent.perceive(state, action, reward, next_state, done)
4141
state = next_state
4242
if Train_Env.n_step % 300 == 299:
4343
print(Train_Env.n_step, ':',
44-
int(Train_Env.rewards[-1]), '\t',
44+
int(Train_Env.rewards[Train_Env.n_step]), '\t',
4545
int(sum(Train_Env.cost)), '\t',
46-
int(Train_Env.available_cash[-1]), '\t',
47-
agent.critic_network.loss.data, '\t',
48-
agent.actor_network.loss.data
46+
int(Train_Env.available_cash[Train_Env.n_step]), '\t',
47+
agent.critic_network.loss.data
4948
)
5049
total_reward = Train_Env.rewards[-1]
5150
total_cost = sum(Train_Env.cost)
5251
print('DDPG: Evaluation Average Reward:', total_reward)
5352
print('DDPG: Average Cost: ', total_cost)
53+
54+
for k in save_iter:
55+
if t == k:
56+
torch.save(agent.actor_network.target.state_dict(), 'DDPG_model' + str(t) + '.pth')
5457
return agent
5558

5659

5760
if __name__ == '__main__':
58-
lb, node, epoch = 36, 2048, 1
61+
lb, node, epoch = 12, 1024, 201
5962
buy_train = raw_buy[:START]
6063
sell_train = raw_sell[:START]
6164
amount_train = raw_amount[:START]
6265

6366
train_env = MarketEnv([buy_train, sell_train, amount_train], 0)
6467
agent = train(train_env, epoch)
65-
torch.save(agent.actor_network.target.state_dict(), 'DDPG_model.pth')

DDPG.py

Lines changed: 18 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -3,15 +3,12 @@
33
@Author :JohsuaWu1997
44
@Date :2020/1/30
55
"""
6-
import numpy as np
76
import torch
87

98
from actor_critic import Actor, Critic
109
from ou_noise import OUNoise
1110

1211
cuda = torch.device('cuda')
13-
torch.backends.cudnn.deterministic = True
14-
torch.backends.cudnn.benchmark = False
1512

1613
GAMMA = 0.9999999993340943687843739933894
1714

@@ -35,7 +32,6 @@ def __init__(self, env, time_steps, hidden_dim):
3532
self.time_dim = time_steps
3633
self.state_dim = env.observation_space.shape[1]
3734
self.action_dim = env.action_space.shape[0]
38-
print(self.state_dim,self.action_dim,self.time_dim)
3935
self.batch_size = 64
4036
self.memory_size = self.time_dim + self.batch_size * 10
4137
self.start_size = self.time_dim + self.batch_size * 2
@@ -51,18 +47,16 @@ def __init__(self, env, time_steps, hidden_dim):
5147
self.replay_reward = torch.zeros((self.start_size - 1,), device=cuda)
5248

5349
# Initialize an Ornstein-Uhlenbeck random process for action exploration
54-
self.exploration_noise = OUNoise(self.action_dim, sigma=0.05 / self.action_dim)
50+
self.exploration_noise = OUNoise(self.action_dim, sigma=0.01 / self.action_dim)
5551
self.initial()
5652

5753
def initial(self):
5854
self.steps = 0
59-
self.action = np.zeros((self.action_dim,))
55+
self.action = torch.zeros(self.action_dim, device=cuda)
6056
self.replay_state = torch.zeros((self.start_size - 1, 3, self.state_dim), device=cuda)
6157
self.replay_next_state = torch.zeros((self.start_size - 1, 3, self.state_dim), device=cuda)
6258
self.replay_action = torch.zeros((self.start_size - 1, self.state_dim), device=cuda)
6359
self.replay_reward = torch.zeros((self.start_size - 1,), device=cuda)
64-
torch.manual_seed(self.seed)
65-
np.random.seed(self.seed)
6660

6761
def train_on_batch(self):
6862
# Sample a random minibatch of N transitions from replay buffer
@@ -84,18 +78,14 @@ def train_on_batch(self):
8478
reward_batch = torch.index_select(self.replay_reward, 0, sample)
8579

8680
# Calculate y_batch
87-
q_batch = self.critic_network.target_q(
88-
self.actor_network.target_action(next_state_batch), next_state_batch
89-
)
81+
next_action_batch = self.actor_network.target_action(next_state_batch)
82+
q_batch = self.critic_network.target_q(next_action_batch, next_state_batch)
9083
y_batch = torch.add(reward_batch, q_batch, alpha=GAMMA).view(-1, 1)
9184

92-
# train critic by minimizing the loss L
93-
self.critic_network.train(y_batch, action_batch, state_batch)
94-
95-
# train actor by target loss
85+
# train actor-critic by target loss
9686
self.actor_network.train(
97-
self.critic_network.critic_loss(
98-
self.actor_network.actor_action(state_batch), state_batch
87+
self.critic_network.train(
88+
y_batch, action_batch, state_batch
9989
)
10090
)
10191

@@ -104,25 +94,21 @@ def train_on_batch(self):
10494
self.critic_network.update_target()
10595

10696
def perceive(self, state, action, reward, next_state, done):
107-
state_tensor = torch.tensor([state.tolist()], device=cuda)
108-
next_state_tensor = torch.tensor([next_state.tolist()], device=cuda)
109-
action_tensor = torch.tensor([action.tolist()], device=cuda)
110-
reward_tensor = torch.tensor([reward.tolist()], device=cuda)
11197
if self.steps < self.start_size - 1:
112-
self.replay_state[self.steps] = state_tensor
113-
self.replay_next_state[self.steps] = next_state_tensor
114-
self.replay_action[self.steps] = action_tensor
98+
self.replay_state[self.steps] = state
99+
self.replay_next_state[self.steps] = next_state
100+
self.replay_action[self.steps] = action
115101
self.replay_reward[self.steps] = reward
116102
else:
117103
if self.steps >= self.memory_size:
118104
self.replay_state = self.replay_state[1:]
119105
self.replay_next_state = self.replay_next_state[1:]
120106
self.replay_action = self.replay_action[1:]
121107
self.replay_reward = self.replay_reward[1:]
122-
self.replay_state = torch.cat((self.replay_state, state_tensor), dim=0)
123-
self.replay_next_state = torch.cat((self.replay_next_state, next_state_tensor), dim=0)
124-
self.replay_action = torch.cat((self.replay_action, action_tensor), dim=0)
125-
self.replay_reward = torch.cat((self.replay_reward, reward_tensor), dim=0)
108+
self.replay_state = torch.cat((self.replay_state, state.unsqueeze(0)), dim=0)
109+
self.replay_next_state = torch.cat((self.replay_next_state, next_state.unsqueeze(0)), dim=0)
110+
self.replay_action = torch.cat((self.replay_action, action.unsqueeze(0)), dim=0)
111+
self.replay_reward = torch.cat((self.replay_reward, reward.unsqueeze(0)), dim=0)
126112
self.steps += 1
127113

128114
def act(self, next_state, portfolio):
@@ -131,11 +117,12 @@ def act(self, next_state, portfolio):
131117
next_amount_data = min_max_scale(self.replay_next_state[:, 2, :])[-1].view(1, -1)
132118
next_state_data = torch.cat([next_state_data, next_amount_data], dim=1)
133119
self.train_on_batch()
134-
allocation = self.actor_network.target_action(next_state_data).cpu().data.numpy().ravel()
120+
allocation = self.actor_network.target_action(next_state_data).data.view(-1)
121+
allocation += torch.tensor(self.exploration_noise.noise().tolist(), device=cuda)
135122
allocation[allocation < 0] = 0
136123
allocation /= sum(allocation)
137-
allocation = np.floor(
124+
allocation = torch.floor(
138125
portfolio * allocation / next_state[1, :] / self.unit
139126
) * self.unit
140127
self.action = allocation
141-
return np.array(self.action)
128+
return self.action.clone()

actor_critic.py

Lines changed: 22 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,3 @@
1-
"""
2-
@File :actor_critic.py
3-
@Author :JohsuaWu1997
4-
@Date :2020/1/30
5-
"""
61
import torch
72

83
cuda = torch.device('cuda')
@@ -43,20 +38,24 @@ def forward(self, x):
4338
class CriticNet(torch.nn.Module):
4439
def __init__(self, input_dim, hidden_dim, output_dim):
4540
super(CriticNet, self).__init__()
46-
47-
self.nn = torch.nn.Sequential(
48-
torch.nn.Linear(input_dim + output_dim, hidden_dim),
49-
torch.nn.ReLU(),
41+
self.nn2 = torch.nn.Sequential(
42+
torch.nn.Linear(hidden_dim + output_dim, hidden_dim),
43+
torch.nn.Tanh(),
5044
torch.nn.Linear(hidden_dim, hidden_dim),
45+
torch.nn.Tanh(),
46+
torch.nn.Linear(hidden_dim, 1)
47+
)
48+
self.nn1 = torch.nn.Sequential(
49+
torch.nn.Linear(input_dim, hidden_dim),
5150
torch.nn.ReLU(),
5251
torch.nn.Linear(hidden_dim, hidden_dim),
53-
torch.nn.ReLU(),
54-
torch.nn.Linear(hidden_dim, 1)
52+
torch.nn.Softmax(dim=1),
5553
)
5654

5755
def forward(self, a, x):
58-
ax = torch.cat((a, x), 1)
59-
out = self.nn(ax)
56+
x_out = self.nn1(x)
57+
ax = torch.cat((a, x_out), 1)
58+
out = self.nn2(ax)
6059
return out
6160

6261

@@ -67,14 +66,12 @@ def __init__(self, time_dim, state_dim, action_dim, hidden_dim):
6766
self.actor_weights = [params for params in self.actor.parameters()]
6867
self.target_weights = [params for params in self.target.parameters()]
6968
self.optimizer = torch.optim.Adam(self.actor.parameters())
70-
self.loss = torch.tensor([0], device=cuda)
7169
hard_copy(self.target_weights, self.actor_weights)
7270

73-
def train(self, actor_loss):
71+
def train(self, loss_grad):
7472
for _ in range(1):
75-
self.loss = actor_loss
7673
self.optimizer.zero_grad()
77-
self.loss.backward()
74+
self.actor_weights[-1].backward(-loss_grad)
7875
self.optimizer.step()
7976

8077
def actor_action(self, state):
@@ -91,6 +88,7 @@ def update_target(self):
9188

9289
class Critic:
9390
def __init__(self, time_dim, state_dim, action_dim, hidden_dim):
91+
self.action_dim = action_dim
9492
self.critic = CriticNet(state_dim * (time_dim + 1), hidden_dim, action_dim).to(cuda)
9593
self.target = CriticNet(state_dim * (time_dim + 1), hidden_dim, action_dim).to(cuda)
9694
self.critic_weights = [params for params in self.critic.parameters()]
@@ -107,14 +105,17 @@ def train(self, y_batch, action_batch, state_batch):
107105
self.optimizer.zero_grad()
108106
self.loss.backward()
109107
self.optimizer.step()
110-
111-
def critic_loss(self, next_action_batch, next_state_batch):
112-
self.critic.zero_grad()
113-
return torch.nn.functional.softplus(-self.critic(next_action_batch, next_state_batch).mean())
108+
return torch.mean(self.critic_weights[0].grad[:, :self.action_dim], dim=0)
114109

115110
def target_q(self, next_action_batch, next_state_batch):
116111
self.target.zero_grad()
117112
return self.target(next_action_batch, next_state_batch).view(-1)
118113

119114
def update_target(self):
120115
soft_copy(self.target_weights, self.critic_weights)
116+
117+
118+
if __name__ == '__main__':
119+
critic = CriticNet(50 * (12 + 1), 37, 50).to(cuda)
120+
for params in critic.parameters():
121+
print(params.shape)

0 commit comments

Comments
 (0)