
Commit 0b71fc4

feature(nyz): add H-PPO hybrid action space algorithm (#140)
* feature(nyz): add hybrid ppo, unify action_space field and use dict type mu sigma
* polish(nyz): polish ppo config continous field, move to action_space field
* fix(nyz): fix ppo action_space field compatibility bug
* fix(nyz): fix ppg/sac/cql action_space field compatibility bug
* demo(nyz): update gym hybrid hppo config
* polish(pu): polish hppo hyper-para, use tanh and fixed sigma 0.3 in actor_action_args, use clamp [0,1] and [-1,1] for acceleration_value and rotation_value correspondingly after sample from the pi distri. in collect phase
* polish(pu): polish as review
* polish(pu): polish hppo config
* polish(pu): entropy weight=0.03 performs best empirically
* fix(nyz): fix unittest compatibility bugs
* polish(nyz): remove atari env unused print(ci skip)

Co-authored-by: puyuan1996 <[email protected]>
1 parent eb6c60c commit 0b71fc4
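
The collect-phase sampling scheme described in the commit message (tanh-squashed mean, fixed sigma 0.3 for the continuous action_args, then per-dimension clamping of acceleration_value to [0, 1] and rotation_value to [-1, 1]) can be sketched as follows. This is a minimal, self-contained illustration rather than the actual DI-engine collector code; tensor names and shapes are assumptions.

# Minimal sketch (not DI-engine code) of the H-PPO collect-phase sampling the commit
# message describes: tanh-squashed mean, fixed sigma 0.3, per-dimension clamping.
# `mu_raw`, `acceleration_value` and `rotation_value` are illustrative names.
import torch
from torch.distributions import Normal


def sample_action_args(mu_raw: torch.Tensor, sigma: float = 0.3) -> torch.Tensor:
    """mu_raw: (B, 2) actor output for [acceleration_value, rotation_value]."""
    mu = torch.tanh(mu_raw)  # squash the mean into [-1, 1]
    dist = Normal(mu, torch.full_like(mu, sigma))  # fixed sigma, as in the commit
    action_args = dist.sample()
    acceleration_value = action_args[:, 0].clamp(0, 1)  # clamp to [0, 1]
    rotation_value = action_args[:, 1].clamp(-1, 1)  # clamp to [-1, 1]
    return torch.stack([acceleration_value, rotation_value], dim=-1)


print(sample_action_args(torch.randn(4, 2)).shape)  # torch.Size([4, 2])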

File tree

65 files changed, +480 -571 lines changed


ding/model/template/mavac.py

Lines changed: 1 addition & 0 deletions
@@ -28,6 +28,7 @@ def __init__(
             actor_head_layer_num: int = 2,
             critic_head_hidden_size: int = 64,
             critic_head_layer_num: int = 1,
+            action_space: str = 'discrete',
             activation: Optional[nn.Module] = nn.ReLU(),
             norm_type: Optional[str] = None,
     ) -> None:

ding/model/template/ppg.py

Lines changed: 2 additions & 2 deletions
@@ -14,8 +14,8 @@ def __init__(
             self,
             obs_shape: Union[int, SequenceType],
             action_shape: Union[int, SequenceType],
+            action_space: str = 'discrete',
             share_encoder: bool = True,
-            continuous: bool = False,
             encoder_hidden_size_list: SequenceType = [128, 128, 64],
             actor_head_hidden_size: int = 64,
             actor_head_layer_num: int = 2,
@@ -26,7 +26,7 @@ def __init__(
     ) -> None:
         super(PPG, self).__init__()
         self.actor_critic = VAC(
-            obs_shape, action_shape, share_encoder, continuous, encoder_hidden_size_list, actor_head_hidden_size,
+            obs_shape, action_shape, action_space, share_encoder, encoder_hidden_size_list, actor_head_hidden_size,
             actor_head_layer_num, critic_head_hidden_size, critic_head_layer_num, activation, norm_type
         )
         self.aux_critic = copy.deepcopy(self.actor_critic.critic)

ding/model/template/qac.py

Lines changed: 14 additions & 14 deletions
@@ -23,7 +23,7 @@ def __init__(
             self,
             obs_shape: Union[int, SequenceType],
             action_shape: Union[int, SequenceType, EasyDict],
-            actor_head_type: str,
+            action_space: str,
             twin_critic: bool = False,
             actor_head_hidden_size: int = 64,
             actor_head_layer_num: int = 1,
@@ -39,7 +39,7 @@ def __init__(
             - obs_shape (:obj:`Union[int, SequenceType]`): Observation's space.
             - action_shape (:obj:`Union[int, SequenceType, EasyDict]`): Action's space, such as 4, (3, ), \
                 EasyDict({'action_type_shape': 3, 'action_args_shape': 4}).
-            - actor_head_type (:obj:`str`): Whether choose ``regression`` or ``reparameterization`` or ``hybrid`` .
+            - action_space (:obj:`str`): Whether choose ``regression`` or ``reparameterization`` or ``hybrid`` .
             - twin_critic (:obj:`bool`): Whether include twin critic.
             - actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to actor-nn's ``Head``.
             - actor_head_layer_num (:obj:`int`): The num of layers used in the network to compute Q value output \
@@ -56,9 +56,9 @@ def __init__(
         obs_shape: int = squeeze(obs_shape)
         action_shape = squeeze(action_shape)
         self.action_shape = action_shape
-        self.actor_head_type = actor_head_type
-        assert self.actor_head_type in ['regression', 'reparameterization', 'hybrid']
-        if self.actor_head_type == 'regression':  # DDPG, TD3
+        self.action_space = action_space
+        assert self.action_space in ['regression', 'reparameterization', 'hybrid']
+        if self.action_space == 'regression':  # DDPG, TD3
             self.actor = nn.Sequential(
                 nn.Linear(obs_shape, actor_head_hidden_size), activation,
                 RegressionHead(
@@ -70,7 +70,7 @@ def __init__(
                     norm_type=norm_type
                 )
             )
-        elif self.actor_head_type == 'reparameterization':  # SAC
+        elif self.action_space == 'reparameterization':  # SAC
             self.actor = nn.Sequential(
                 nn.Linear(obs_shape, actor_head_hidden_size), activation,
                 ReparameterizationHead(
@@ -82,7 +82,7 @@ def __init__(
                     norm_type=norm_type
                 )
             )
-        elif self.actor_head_type == 'hybrid':  # PADDPG
+        elif self.action_space == 'hybrid':  # PADDPG
             # hybrid action space: action_type(discrete) + action_args(continuous),
             # such as {'action_type_shape': torch.LongTensor([0]), 'action_args_shape': torch.FloatTensor([0.1, -0.27])}
             action_shape.action_args_shape = squeeze(action_shape.action_args_shape)
@@ -110,7 +110,7 @@ def __init__(
             )
             self.actor = nn.ModuleList([actor_action_type, actor_action_args])
         self.twin_critic = twin_critic
-        if self.actor_head_type == 'hybrid':
+        if self.action_space == 'hybrid':
             critic_input_size = obs_shape + action_shape.action_type_shape + action_shape.action_args_shape
         else:
             critic_input_size = obs_shape + action_shape
@@ -194,7 +194,7 @@ def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict:
 
         Critic Examples:
             >>> inputs = {'obs': torch.randn(4,N), 'action': torch.randn(4,1)}
-            >>> model = QAC(obs_shape=(N, ),action_shape=1,actor_head_type='regression')
+            >>> model = QAC(obs_shape=(N, ),action_shape=1,action_space='regression')
             >>> model(inputs, mode='compute_critic')['q_value'] # q value
             tensor([0.0773, 0.1639, 0.0917, 0.0370], grad_fn=<SqueezeBackward1>)
 
@@ -245,13 +245,13 @@ def compute_actor(self, inputs: torch.Tensor) -> Dict:
             >>> actor_outputs['logit'][1].shape # sigma
             >>> torch.Size([4, 64])
         """
-        if self.actor_head_type == 'regression':
+        if self.action_space == 'regression':
             x = self.actor(inputs)
             return {'action': x['pred']}
-        elif self.actor_head_type == 'reparameterization':
+        elif self.action_space == 'reparameterization':
             x = self.actor(inputs)
             return {'logit': [x['mu'], x['sigma']]}
-        elif self.actor_head_type == 'hybrid':
+        elif self.action_space == 'hybrid':
             logit = self.actor[0](inputs)
             action_args = self.actor[1](inputs)
             return {'logit': logit['logit'], 'action_args': action_args['pred']}
@@ -284,14 +284,14 @@ def compute_critic(self, inputs: Dict) -> Dict:
 
         Examples:
             >>> inputs = {'obs': torch.randn(4, N), 'action': torch.randn(4, 1)}
-            >>> model = QAC(obs_shape=(N, ),action_shape=1,actor_head_type='regression')
+            >>> model = QAC(obs_shape=(N, ),action_shape=1,action_space='regression')
             >>> model(inputs, mode='compute_critic')['q_value'] # q value
             >>> tensor([0.0773, 0.1639, 0.0917, 0.0370], grad_fn=<SqueezeBackward1>)
         """
 
         obs, action = inputs['obs'], inputs['action']
         assert len(obs.shape) == 2
-        if self.actor_head_type == 'hybrid':
+        if self.action_space == 'hybrid':
             action_type_logit = inputs['logit']
             action_type_logit = torch.softmax(action_type_logit, dim=-1)
             action_args = action['action_args']
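
A hedged usage sketch of the renamed interface, following the docstring examples in this diff: QAC now takes action_space ('regression', 'reparameterization' or 'hybrid') in place of actor_head_type, and the hybrid branch of compute_actor returns discrete logits plus continuous action_args. The import path is taken from this commit's file tree; the shapes are illustrative assumptions.

# Illustrative sketch, not part of the commit: construct a hybrid-action QAC and run
# the actor forward pass shown in the diff above. Shapes are assumptions.
import torch
from easydict import EasyDict
from ding.model.template.qac import QAC  # module path as in this commit's file tree

N = 32  # assumed observation dim
model = QAC(
    obs_shape=(N, ),
    action_shape=EasyDict({'action_type_shape': 3, 'action_args_shape': 6}),
    action_space='hybrid',
)
out = model(torch.randn(4, N), mode='compute_actor')
# hybrid compute_actor returns discrete logits and continuous args
print(out['logit'].shape, out['action_args'].shape)  # expected (4, 3) and (4, 6)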

ding/model/template/qac_dist.py

Lines changed: 10 additions & 10 deletions
@@ -20,7 +20,7 @@ def __init__(
             self,
             obs_shape: Union[int, SequenceType],
             action_shape: Union[int, SequenceType],
-            actor_head_type: str = "regression",
+            action_space: str = "regression",
             critic_head_type: str = "categorical",
             actor_head_hidden_size: int = 64,
             actor_head_layer_num: int = 1,
@@ -38,7 +38,7 @@ def __init__(
         Arguments:
             - obs_shape (:obj:`Union[int, SequenceType]`): Observation's space.
             - action_shape (:obj:`Union[int, SequenceType]`): Action's space.
-            - actor_head_type (:obj:`str`): Whether choose ``regression`` or ``reparameterization``.
+            - action_space (:obj:`str`): Whether choose ``regression`` or ``reparameterization``.
             - critic_head_type (:obj:`str`): Only ``categorical``.
             - actor_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` to pass to actor-nn's ``Head``.
             - actor_head_layer_num (:obj:`int`):
@@ -58,9 +58,9 @@ def __init__(
         super(QACDIST, self).__init__()
         obs_shape: int = squeeze(obs_shape)
         action_shape: int = squeeze(action_shape)
-        self.actor_head_type = actor_head_type
-        assert self.actor_head_type in ['regression', 'reparameterization']
-        if self.actor_head_type == 'regression':
+        self.action_space = action_space
+        assert self.action_space in ['regression', 'reparameterization']
+        if self.action_space == 'regression':
             self.actor = nn.Sequential(
                 nn.Linear(obs_shape, actor_head_hidden_size), activation,
                 RegressionHead(
@@ -72,7 +72,7 @@ def __init__(
                     norm_type=norm_type
                 )
             )
-        elif self.actor_head_type == 'reparameterization':
+        elif self.action_space == 'reparameterization':
            self.actor = nn.Sequential(
                 nn.Linear(obs_shape, actor_head_hidden_size), activation,
                 ReparameterizationHead(
@@ -156,7 +156,7 @@ def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict:
         Critic Examples:
             >>> # Categorical mode
             >>> inputs = {'obs': torch.randn(4,N), 'action': torch.randn(4,1)}
-            >>> model = QACDIST(obs_shape=(N, ),action_shape=1,actor_head_type='regression', \
+            >>> model = QACDIST(obs_shape=(N, ),action_shape=1,action_space='regression', \
             ... critic_head_type='categorical', n_atoms=51)
             >>> q_value = model(inputs, mode='compute_critic') # q value
             >>> assert q_value['q_value'].shape == torch.Size([4, 1])
@@ -204,9 +204,9 @@ def compute_actor(self, inputs: torch.Tensor) -> Dict:
             >>> torch.Size([4, 64])
         """
         x = self.actor(inputs)
-        if self.actor_head_type == 'regression':
+        if self.action_space == 'regression':
             return {'action': x['pred']}
-        elif self.actor_head_type == 'reparameterization':
+        elif self.action_space == 'reparameterization':
             return {'logit': [x['mu'], x['sigma']]}
 
     def compute_critic(self, inputs: Dict) -> Dict:
@@ -232,7 +232,7 @@ def compute_critic(self, inputs: Dict) -> Dict:
         Examples:
             >>> # Categorical mode
             >>> inputs = {'obs': torch.randn(4,N), 'action': torch.randn(4,1)}
-            >>> model = QACDIST(obs_shape=(N, ),action_shape=1,actor_head_type='regression', \
+            >>> model = QACDIST(obs_shape=(N, ),action_shape=1,action_space='regression', \
             ... critic_head_type='categorical', n_atoms=51)
             >>> q_value = model(inputs, mode='compute_critic') # q value
             >>> assert q_value['q_value'].shape == torch.Size([4, 1])

ding/model/template/tests/test_hybrid_qac.py

Lines changed: 4 additions & 4 deletions
@@ -16,7 +16,7 @@
         'action_args_shape': (6, )
     }),
     'twin': True,
-    'actor_head_type': 'hybrid'
+    'action_space': 'hybrid'
 }
 
 
@@ -27,10 +27,10 @@ def test_hybrid_qac(
         self,
         action_shape=hybrid_args['action_shape'],
         twin=hybrid_args['twin'],
-        actor_head_type=hybrid_args['actor_head_type']
+        action_space=hybrid_args['action_space']
     ):
         N = 32
-        assert actor_head_type == 'hybrid'
+        assert action_space == 'hybrid'
         inputs = {
             'obs': torch.randn(B, N),
             'action': {
@@ -42,7 +42,7 @@ def test_hybrid_qac(
         model = QAC(
             obs_shape=(N, ),
             action_shape=action_shape,
-            actor_head_type=actor_head_type,
+            action_space=action_space,
             critic_head_hidden_size=embedding_size,
             actor_head_hidden_size=embedding_size,
             twin_critic=twin,

ding/model/template/tests/test_qac.py

Lines changed: 5 additions & 5 deletions
@@ -17,16 +17,16 @@
 
 
 @pytest.mark.unittest
-@pytest.mark.parametrize('action_shape, twin, actor_head_type', args)
+@pytest.mark.parametrize('action_shape, twin, action_space', args)
 class TestQAC:
 
-    def test_fcqac(self, action_shape, twin, actor_head_type):
+    def test_fcqac(self, action_shape, twin, action_space):
         N = 32
         inputs = {'obs': torch.randn(B, N), 'action': torch.randn(B, squeeze(action_shape))}
         model = QAC(
             obs_shape=(N, ),
             action_shape=action_shape,
-            actor_head_type=actor_head_type,
+            action_space=action_space,
             critic_head_hidden_size=embedding_size,
             actor_head_hidden_size=embedding_size,
             twin_critic=twin,
@@ -41,15 +41,15 @@ def test_fcqac(self, action_shape, twin, actor_head_type):
 
         # compute_action
         print(model)
-        if actor_head_type == 'regression':
+        if action_space == 'regression':
             action = model(inputs['obs'], mode='compute_actor')['action']
             if squeeze(action_shape) == 1:
                 assert action.shape == (B, )
             else:
                 assert action.shape == (B, squeeze(action_shape))
             assert action.eq(action.clamp(-1, 1)).all()
             is_differentiable(action.sum(), model.actor)
-        elif actor_head_type == 'reparameterization':
+        elif action_space == 'reparameterization':
             (mu, sigma) = model(inputs['obs'], mode='compute_actor')['logit']
             assert mu.shape == (B, *action_shape)
             assert sigma.shape == (B, *action_shape)

ding/model/template/tests/test_qac_dist.py

Lines changed: 5 additions & 5 deletions
@@ -17,16 +17,16 @@
 
 
 @pytest.mark.unittest
-@pytest.mark.parametrize('action_shape, actor_head_type', args)
+@pytest.mark.parametrize('action_shape, action_space', args)
 class TestQACDIST:
 
-    def test_fcqac_dist(self, action_shape, actor_head_type):
+    def test_fcqac_dist(self, action_shape, action_space):
         N = 32
         inputs = {'obs': torch.randn(B, N), 'action': torch.randn(B, squeeze(action_shape))}
         model = QACDIST(
             obs_shape=(N, ),
             action_shape=action_shape,
-            actor_head_type=actor_head_type,
+            action_space=action_space,
             critic_head_hidden_size=embedding_size,
             actor_head_hidden_size=embedding_size,
         )
@@ -43,15 +43,15 @@ def test_fcqac_dist(self, action_shape, actor_head_type):
 
         # compute_action
         print(model)
-        if actor_head_type == 'regression':
+        if action_space == 'regression':
             action = model(inputs['obs'], mode='compute_actor')['action']
             if squeeze(action_shape) == 1:
                 assert action.shape == (B, )
             else:
                 assert action.shape == (B, squeeze(action_shape))
             assert action.eq(action.clamp(-1, 1)).all()
             is_differentiable(action.sum(), model.actor)
-        elif actor_head_type == 'reparameterization':
+        elif action_space == 'reparameterization':
             (mu, sigma) = model(inputs['obs'], mode='compute_actor')['logit']
             assert mu.shape == (B, *action_shape)
             assert sigma.shape == (B, *action_shape)

ding/model/template/tests/test_vac.py

Lines changed: 7 additions & 7 deletions
@@ -8,8 +8,8 @@
 
 B, C, H, W = 4, 3, 128, 128
 obs_shape = [4, (8, ), (4, 64, 64)]
-act_args = [[6, False], [(3, ), True], [[2, 3, 6], False]]
-#act_args = [[(3, ), True]]
+act_args = [[6, 'discrete'], [(3, ), 'continuous'], [[2, 3, 6], 'discrete']]
+# act_args = [[(3, ), True]]
 args = list(product(*[obs_shape, act_args, [False, True]]))
 
 
@@ -29,12 +29,12 @@ def test_vac(self, obs_shape, act_args, share_encoder):
             inputs = torch.randn(B, obs_shape)
         else:
             inputs = torch.randn(B, *obs_shape)
-        model = VAC(obs_shape, action_shape=act_args[0], continuous=act_args[1], share_encoder=share_encoder)
+        model = VAC(obs_shape, action_shape=act_args[0], action_space=act_args[1], share_encoder=share_encoder)
 
         outputs = model(inputs, mode='compute_actor_critic')
         value, logit = outputs['value'], outputs['logit']
-        if model.continuous:
-            outputs = value.sum() + logit[0].sum() + logit[1].sum()
+        if model.action_space == 'continuous':
+            outputs = value.sum() + logit['mu'].sum() + logit['sigma'].sum()
         else:
             if model.multi_head:
                 outputs = value.sum() + sum([t.sum() for t in logit])
@@ -45,8 +45,8 @@ def test_vac(self, obs_shape, act_args, share_encoder):
         for p in model.parameters():
             p.grad = None
         logit = model(inputs, mode='compute_actor')['logit']
-        if model.continuous:
-            logit = logit[0].sum() + logit[1].sum()
+        if model.action_space == 'continuous':
+            logit = logit['mu'].sum() + logit['sigma'].sum()
         self.output_check(model.actor, logit, model.action_shape)
 
         for p in model.parameters():
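
The change above reflects the new continuous-mode VAC interface: the constructor takes action_space='continuous' in place of continuous=True, and the actor logit is now a dict with 'mu' and 'sigma' keys rather than a [mu, sigma] list. A minimal sketch, assuming the ding.model.template.vac module path and a flat observation shape:

# Illustrative sketch, not part of the commit: continuous-mode VAC after this change.
import torch
from ding.model.template.vac import VAC  # module path assumed from the repo layout

model = VAC(obs_shape=(8, ), action_shape=(3, ), action_space='continuous')
logit = model(torch.randn(4, 8), mode='compute_actor')['logit']
print(logit['mu'].shape, logit['sigma'].shape)  # expected torch.Size([4, 3]) for both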
