
Commit c247063

Reapply modular examples (huggingface#42846)
* reapply
* fix
* fix
1 parent a61aba5 commit c247063

17 files changed

Lines changed: 282 additions & 681 deletions

examples/modular-transformers/configuration_duplicated_method.py

Lines changed: 3 additions & 10 deletions
@@ -8,7 +8,7 @@
 from typing import Optional

 from ...configuration_utils import PreTrainedConfig
-from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params
+from ...modeling_rope_utils import RopeParameters


 class DuplicatedMethodConfig(PreTrainedConfig):
@@ -129,7 +129,7 @@ def __init__(
         eos_token_id: Optional[int] = 2,
         pretraining_tp: Optional[int] = 1,
         tie_word_embeddings: Optional[bool] = False,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         attention_bias: Optional[bool] = False,
         attention_dropout: Optional[float] = 0.0,
         mlp_bias: Optional[bool] = False,
@@ -157,14 +157,7 @@ def __init__(
         self.attention_dropout = attention_dropout
         self.mlp_bias = mlp_bias
         self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
-        # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
-        rope_scaling = kwargs.pop("rope_scaling", None)
-        self.rope_parameters = rope_scaling or rope_parameters
-
-        # Validate the correctness of rotary position embeddings parameters
-        rope_theta = kwargs.get("rope_theta", 10000.0)
-        standardize_rope_params(self, rope_theta=rope_theta)
-        rope_config_validation(self)
+        self.rope_parameters = rope_parameters

         super().__init__(
             pad_token_id=pad_token_id,
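
The annotation fix above is purely a typing correction: `dict` subscripts as `dict[key_type, value_type]`, so the one-argument form `dict[RopeParameters]` was never a valid hint. A minimal sketch of the two shapes the corrected annotation accepts (the field names below are assumptions about `RopeParameters`, whose definition in `modeling_rope_utils` is not part of this diff):

# Hypothetical shapes only -- "rope_type"/"rope_theta" are assumed fields.
single = {"rope_type": "default", "rope_theta": 10000.0}  # one RopeParameters-style dict
per_layer_type = {                                        # dict[str, RopeParameters]
    "full_attention": {"rope_type": "default", "rope_theta": 10000.0},
    "sliding_attention": {"rope_type": "default", "rope_theta": 10000.0},
}
# Either value satisfies Optional[RopeParameters | dict[str, RopeParameters]],
# whereas dict[RopeParameters] is not a valid generic subscription for dict.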

examples/modular-transformers/configuration_my_new_model.py

Lines changed: 3 additions & 10 deletions
@@ -8,7 +8,7 @@
 from typing import Optional

 from ...configuration_utils import PreTrainedConfig
-from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params
+from ...modeling_rope_utils import RopeParameters


 class MyNewModelConfig(PreTrainedConfig):
@@ -165,7 +165,7 @@ def __init__(
         eos_token_id: Optional[int] = 2,
         pretraining_tp: Optional[int] = 1,
         tie_word_embeddings: Optional[bool] = False,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         attention_bias: Optional[bool] = False,
         attention_dropout: Optional[float] = 0.0,
         mlp_bias=True,
@@ -194,14 +194,7 @@ def __init__(
         self.attention_dropout = attention_dropout
         self.mlp_bias = mlp_bias
         self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
-        # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
-        rope_scaling = kwargs.pop("rope_scaling", None)
-        self.rope_parameters = rope_scaling or rope_parameters
-
-        # Validate the correctness of rotary position embeddings parameters
-        rope_theta = kwargs.get("rope_theta", 10000.0)
-        standardize_rope_params(self, rope_theta=rope_theta)
-        rope_config_validation(self)
+        self.rope_parameters = rope_parameters

         super().__init__(
             pad_token_id=pad_token_id,
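
The eight deleted lines were a back-compat shim: a legacy `rope_scaling` kwarg, if present, silently replaced `rope_parameters`, and standardization/validation ran eagerly inside `__init__`. A standalone re-enactment of the behaviour change (`standardize_rope_params` and `rope_config_validation` are omitted here; their bodies are not shown in this diff):

# Before this commit: the legacy kwarg won and a default theta was read back.
def legacy_init(rope_parameters=None, **kwargs):
    rope_scaling = kwargs.pop("rope_scaling", None)
    rope_parameters = rope_scaling or rope_parameters
    rope_theta = kwargs.get("rope_theta", 10000.0)  # default mirrored from the deleted line
    return rope_parameters, rope_theta

# After this commit: the argument is stored as-is, nothing else happens.
def new_init(rope_parameters=None, **kwargs):
    return rope_parameters

assert legacy_init(rope_scaling={"factor": 2.0}) == ({"factor": 2.0}, 10000.0)
assert new_init(rope_parameters={"factor": 2.0}) == {"factor": 2.0}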

examples/modular-transformers/configuration_my_new_model2.py

Lines changed: 3 additions & 10 deletions
@@ -7,7 +7,7 @@
 from typing import Optional

 from ...configuration_utils import PreTrainedConfig
-from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params
+from ...modeling_rope_utils import RopeParameters


 class MyNewModel2Config(PreTrainedConfig):
@@ -68,7 +68,7 @@ def __init__(
         eos_token_id: Optional[int] = 2,
         pretraining_tp: Optional[int] = 1,
         tie_word_embeddings: Optional[bool] = False,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         attention_bias: Optional[bool] = False,
         attention_dropout: Optional[float] = 0.0,
         mlp_bias: Optional[bool] = False,
@@ -96,14 +96,7 @@ def __init__(
         self.attention_dropout = attention_dropout
         self.mlp_bias = mlp_bias
         self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
-        # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
-        rope_scaling = kwargs.pop("rope_scaling", None)
-        self.rope_parameters = rope_scaling or rope_parameters
-
-        # Validate the correctness of rotary position embeddings parameters
-        rope_theta = kwargs.get("rope_theta", 10000.0)
-        standardize_rope_params(self, rope_theta=rope_theta)
-        rope_config_validation(self)
+        self.rope_parameters = rope_parameters

         super().__init__(
             pad_token_id=pad_token_id,
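
The same shim removal is applied here. One context line that survives unchanged in all three configs derives `head_dim` whenever it is not supplied explicitly; a quick arithmetic check of that expression (the sizes are illustrative, not defaults from these files):

hidden_size = 4096
num_attention_heads = 32
head_dim = None
# identical expression to the context line in the hunk above
head_dim = head_dim if head_dim is not None else hidden_size // num_attention_heads
assert head_dim == 128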

examples/modular-transformers/configuration_new_model.py

Lines changed: 15 additions & 19 deletions
@@ -6,7 +6,8 @@
 # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
 # Example where we only want to overwrite the defaults of an init

-from ...configuration_utils import PreTrainedConfig, layer_type_validation
+
+from ...configuration_utils import PreTrainedConfig


 class NewModelConfig(PreTrainedConfig):
@@ -59,14 +60,14 @@ class NewModelConfig(PreTrainedConfig):
             Beginning of stream token id.
         tie_word_embeddings (`bool`, *optional*, defaults to `True`):
             Whether to tie weight embeddings
-        rope_theta (`float`, *optional*, defaults to 10000.0):
-            The base period of the RoPE embeddings.
+        rope_parameters (`RopeParameters`, *optional*):
+            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
+            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
+            with longer `max_position_embeddings`.
         attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
             Whether to use a bias in the query, key, value and output projection layers during self-attention.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
-        layer_types (`list`, *optional*):
-            Attention pattern for each layer.
         use_bidirectional_attention (`bool`, *optional*):
             If True, the model will attend to all text tokens instead of using a causal mask.

@@ -116,20 +117,12 @@ def __init__(
         eos_token_id=1,
         bos_token_id=2,
         tie_word_embeddings=True,
-        rope_theta=10000.0,
+        rope_parameters=None,
         attention_bias=False,
         attention_dropout=0.0,
         use_bidirectional_attention=False,
-        layer_types=None,
         **kwargs,
     ):
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.hidden_size = hidden_size
@@ -142,15 +135,18 @@ def __init__(
         self.initializer_range = initializer_range
         self.rms_norm_eps = rms_norm_eps
         self.use_cache = use_cache
-        self.rope_theta = rope_theta
         self.attention_bias = attention_bias
         self.attention_dropout = attention_dropout
         self.use_bidirectional_attention = use_bidirectional_attention
+        self.rope_parameters = rope_parameters

-        self.layer_types = layer_types
-        if self.layer_types is None:
-            self.layer_types = ["full_attention" for _ in range(self.num_hidden_layers)]
-        layer_type_validation(self.layer_types, self.num_hidden_layers)
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )

     @property
     def num_heads(self):
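
Besides swapping `rope_theta`/`layer_types` for `rope_parameters`, this file moves the `super().__init__(...)` call from the top of `__init__` to the bottom, so every attribute is assigned before the base class runs. A minimal sketch of why that ordering can matter, with a stand-in base class (PreTrainedConfig's real `__init__` does far more; this is an illustration, not its implementation):

class _Base:
    def __init__(self, **kwargs):
        # a base __init__ that inspects attributes set by the subclass
        print("vocab_size seen by base:", getattr(self, "vocab_size", "<unset>"))

class _Sketch(_Base):
    def __init__(self, vocab_size=256000, **kwargs):
        self.vocab_size = vocab_size  # assign first...
        super().__init__(**kwargs)    # ...so the base class sees it

_Sketch()  # prints: vocab_size seen by base: 256000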

examples/modular-transformers/modeling_add_function.py

Lines changed: 3 additions & 0 deletions
@@ -10,6 +10,8 @@
 import torch
 from torch import nn

+from ...integrations import use_kernel_func_from_hub
+

 def rotate_half(x):
     """Rotates half the hidden dims of the input."""
@@ -18,6 +20,7 @@ def rotate_half(x):
     return torch.cat((-x2, x1), dim=-1)


+@use_kernel_func_from_hub("rotary_pos_emb")
 def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
     """Applies Rotary Position Embedding to the query and key tensors.
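
The new `@use_kernel_func_from_hub("rotary_pos_emb")` decorator presumably lets a Hub-provided kernel substitute for the eager `apply_rotary_pos_emb`; the helper it relies on, `rotate_half`, is untouched. Its body splits the last dimension in half and maps (x1, x2) to (-x2, x1); the slicing lines below are the standard implementation, elided by the diff context:

import torch

def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]  # first half (elided by the diff context)
    x2 = x[..., x.shape[-1] // 2 :]  # second half
    return torch.cat((-x2, x1), dim=-1)

t = torch.tensor([1.0, 2.0, 3.0, 4.0])
assert torch.equal(rotate_half(t), torch.tensor([-3.0, -4.0, 1.0, 2.0]))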
