@@ -540,8 +540,7 @@ def __init__(
         hidden_size: int,
         num_heads: int,
         num_kv_heads: int,
-        rope_theta: float = 10000,
-        rope_scaling: Optional[Dict[str, Any]] = None,
+        rope_parameters: Dict[str, Any],
         max_position_embeddings: int = 8192,
         cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
@@ -567,7 +566,6 @@ def __init__(
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_kv_heads * self.head_dim
         self.scaling = self.head_dim**-0.5
-        self.rope_theta = rope_theta
         self.max_position_embeddings = max_position_embeddings
 
         self.qkv_proj = QKVParallelLinear(
@@ -601,8 +599,7 @@ def __init__(
             self.head_dim,
             rotary_dim=self.head_dim,
             max_position=max_position_embeddings,
-            base=rope_theta,
-            rope_scaling=rope_scaling,
+            rope_parameters=rope_parameters,
         )
         self.attn = Attention(
             self.num_heads,
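The hunks above fold the separate rope_theta and rope_scaling arguments of PanguProMoEAttention into a single rope_parameters dict that is forwarded to get_rope(). As a minimal sketch, assuming the dict carries the base frequency plus any scaling fields, it might look like the following; the key names are illustrative and not taken from this diff:

# Hypothetical contents of the consolidated rope_parameters dict;
# key names are an assumption for illustration only.
rope_parameters = {
    "rope_theta": 10000.0,  # base frequency, formerly the rope_theta argument
    "rope_type": "linear",  # scaling strategy, formerly part of rope_scaling
    "factor": 2.0,          # scaling factor, formerly part of rope_scaling
}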
@@ -655,17 +652,14 @@ def __init__(
     ) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
-        rope_theta = getattr(config, "rope_theta", 10000)
-        rope_scaling = getattr(config, "rope_scaling", None)
         max_position_embeddings = getattr(config, "max_position_embeddings",
                                           8192)
 
         self.self_attn = PanguProMoEAttention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
             num_kv_heads=config.num_key_value_heads,
-            rope_theta=rope_theta,
-            rope_scaling=rope_scaling,
+            rope_parameters=config.rope_parameters,
             max_position_embeddings=max_position_embeddings,
             cache_config=cache_config,
             quant_config=quant_config,
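In the decoder layer, the getattr fallbacks for rope_theta and rope_scaling are dropped, so the model config is now expected to expose rope_parameters directly. A minimal sketch of that assumption, using a stand-in config class whose name and defaulting logic are illustrative and not part of the diff:

# Illustrative stand-in config that always carries rope_parameters, so callers
# no longer need getattr(config, "rope_theta", 10000)-style fallbacks.
class DummyPanguConfig:
    def __init__(self, rope_parameters=None, max_position_embeddings=8192):
        # Default mirrors the old rope_theta=10000 / rope_scaling=None behaviour.
        self.rope_parameters = rope_parameters or {"rope_theta": 10000.0}
        self.max_position_embeddings = max_position_embeddings

config = DummyPanguConfig()
assert config.rope_parameters["rope_theta"] == 10000.0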