@@ -263,23 +263,6 @@ def __init__(
         # with the layer's index.
         layer_idx = int(prefix.split(sep=".")[-1])

-        # TODO: support MTP
-        attn_window_size = getattr(config, "attn_window_size", None)
-        if attn_window_size is not None:
-            if isinstance(attn_window_size, list):
-                attn_window_size = attn_window_size[layer_idx]
-            elif isinstance(attn_window_size, int):
-                attn_window_size = attn_window_size
-            else:
-                raise ValueError(f"Invalid attn_window_size: {attn_window_size}")
-            attn_window_size = None if attn_window_size <= 0 else attn_window_size
-
-        # different rope theta for full layer and swa layer
-        swa_rope_theta = getattr(config, "swa_rope_theta", -1)
-        # default to full rope theta
-        swa_rope_theta = rope_theta if swa_rope_theta <= 0 else swa_rope_theta
-        rope_theta = swa_rope_theta if attn_window_size is not None else rope_theta
-
         self.layer_idx = layer_idx
         self.self_attn = MiniMaxM2Attention(
             hidden_size=self.hidden_size,
@@ -288,7 +271,6 @@ def __init__(
             rotary_dim=config.rotary_dim,
             rope_theta=rope_theta,
             rope_scaling=rope_scaling,
-            attn_window_size=attn_window_size,
             max_position_embeddings=max_position_embeddings,
             rms_norm_eps=config.rms_norm_eps,
             qkv_bias=getattr(config, "attention_bias", False),
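
For reference, here is a minimal standalone sketch of the per-layer window resolution that this diff removes. `resolve_attn_window_size` is a hypothetical helper name used for illustration only; it is not part of the model code, but it mirrors the deleted branch logic (list-or-int config, non-positive values disabling the window):

```python
# Hypothetical helper illustrating the removed logic: attn_window_size may be
# None, a per-layer list, or a single int; non-positive values disable the
# sliding window for that layer.
from typing import Optional, Union


def resolve_attn_window_size(
    attn_window_size: Optional[Union[int, list]],
    layer_idx: int,
) -> Optional[int]:
    if attn_window_size is None:
        return None
    if isinstance(attn_window_size, list):
        attn_window_size = attn_window_size[layer_idx]
    elif not isinstance(attn_window_size, int):
        raise ValueError(f"Invalid attn_window_size: {attn_window_size}")
    # Non-positive sizes mean full attention (no sliding window).
    return None if attn_window_size <= 0 else attn_window_size


# Layers 0-2 use a 4096-token window; layer 3 falls back to full attention.
assert resolve_attn_window_size([4096, 4096, 4096, -1], 0) == 4096
assert resolve_attn_window_size([4096, 4096, 4096, -1], 3) is None
```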