
    qi                     :   d dl Z d dlZd dlmZ d dlmZmZmZ ddlm	Z	m
Z
  e
j                  e      Z e	       rd dlZerddlmZ d Z	 	 	 	 dded	   d
ed   dedz  dedz  dedef   f
dZ	 	 	 	 dded	   d
ed   dedz  dedz  dedef   f
dZ	 	 	 ddd	d
ed   dedz  dedz  dedef   f
dZ	 	 	 ddd	d
ed   dedz  dedz  dedef   f
dZ	 	 	 ddd	d
ed   dedz  dedz  dedef   f
dZeeeeedZ G d ded      Z G d d      Zd dededz  fdZy)!    Nwraps)TYPE_CHECKINGOptional	TypedDict   )is_torch_availablelogging)PreTrainedConfigc                 H     ddddt               d fd	       }|S )ad  
    Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE
    (i.e. a RoPE implementation that may recompute its frequencies in the forward pass).

    Args:
        rope_forward (Callable):
            The forward pass of the RoPE implementation.

    Returns:
        The decorated forward pass.
    c                 P   t        j                  |      dz   }|4| j                  }| j                  }d}| j                  j
                  d   }n?| j                  |   }t        | | d      }| d}| j                  j
                  |   d   }||kD  r\t        | | d      s%t        |   }	 |	| j                  ||dz   |      \  }
}| j                  | d	
d
       t        | | d|
       y|j                  |      }| j                  | d	|d
       t        | | d|       y)zbLongrope uses long factor if sequence is larger than original pretraining length, short otherwise.r   N  original_max_position_embeddings_original_inv_freq__long_inv_freqseq_len
layer_typeinv_freqF
persistentlong_inv_freqoriginal_inv_freq)torchmax	rope_typer   configrope_parametersgetattrhasattrROPE_INIT_FUNCTIONSregister_buffersetattrto)selfposition_idsdevicer   r   r   r   prefixr   rope_init_fnr   r   s               R/opt/pipecat/venv/lib/python3.12/site-packages/transformers/modeling_rope_utils.pylongrope_frequency_updatez6dynamic_rope_update.<locals>.longrope_frequency_update.   sY   ))L)A-I $ 6 6F/3{{/J/JKm/n,z2I '<N.O P"|1%F/3{{/J/J:/V20, 554J<~!>?29=#/KK<q@)	$ q   F88!4mPU VDVHM2MB !2 4 4V <  F88!46GTY ZDVH$568IJ    c                    t        j                  |      dz   }|'| j                  }| j                  }| j                  }d}n=| j                  |   }t        | | d| j                        }t        | | d      }| d}||kD  rNt        |   }	 |	| j                  |||      \  }
| _        | j                  | d|
d	
       t        | | d|       || j                  k  rc|| j                  kD  rS|j                  |      }| j                  | d|d	
       t        | | d|       t        | | d| j                         yyy)a  
        dynamic RoPE layers should recompute `inv_freq` in the following situations:
        1 - growing beyond the cached sequence length (allow scaling)
        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
        r   Nr   _max_seq_len_cachedr   r   r   r   Fr   r   )r   r   r   max_seq_len_cachedr   r    r"   r   attention_scalingr#   r$   original_max_seq_lenr%   )r&   r'   r(   r   r   r   r0   r   r)   r*   r   s              r+   dynamic_frequency_updatez5dynamic_rope_update.<locals>.dynamic_frequency_updateQ   s|    ))L)A-I!%!8!8 $ 6 6Fz2I!(*=P/QSWSjSj!k '<N.O P"|1%F''.y9L/;%	0,Hd,   F88!4h5 QDZL(;<gFT...3EHaHa3a !2 4 4V <  F88!46GTY ZDVH$568IJDZL(;<d>W>WX 4b.r-   c                     || j                   n| j                   |   }|d|ini }d|v r | |fd|j                  i| n|dk(  r | |fd|j                  i|  | ||fi |S )Nr   dynamicr(   longrope)r   r(   )	r&   xr'   r   r   kwargsr3   r,   rope_forwards	         r+   wrapperz$dynamic_rope_update.<locals>.wrapperw   s    &0&8DNNdnnZ>X	/9/E,
+2	!$T<SSFS*$%dLTTVTD!\<V<<r-   Nr   )r9   r:   r3   r,   s   ` @@r+   dynamic_rope_updater<   !   s1    !KF$YL <= = Nr-   r   r   r(   ztorch.devicer   r   returnztorch.Tensorc                    | j                          || j                  |   n| j                  }|d   }|d   }|j                  dd      }t        | dd      xs | j                  | j
                  z  }t        ||z        }	d}
d|t        j                  d|	dt        j                  	      j                  |t        j                  
      |	z  z  z  }||z  }||
fS )a  
    Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev
    Args:
        config ([`~transformers."PreTrainedConfig"`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
                the first fraction of the head_dim. Defaults to 1.0.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    Nfactor
rope_thetapartial_rotary_factor      ?head_dimr      dtyper(   rF   )standardize_rope_paramsr   getr    hidden_sizenum_attention_headsintr   arangeint64r%   float)r   r(   r   r   rope_parameters_dictr?   baserA   rC   dimattention_factorr   s               r+   '_compute_linear_scaling_rope_parametersrT      s    B ""$AKAW611*=]c]s]s!(+F  -D0445LcRvz40dF4F4F&JdJd4dH
h..
/C du||AsAU[[ILLTZbgbmbmLnqttuvH
 H%%%r-   c                    | j                          || j                  |   n| j                  }|d   }|j                  dd      }t        | d| j                  | j
                  z        }t        ||z        }|d   }	d}
|| j                  }n{t        |t        j                        rKt        j                  |t        j                  | j                  |j                  |j                              }nt        || j                        }||	|z  | j                  z  |	dz
  z
  ||dz
  z  z  z  }d|t        j                   d	|dt        j"                  
      j%                  |t        j&                        |z  z  z  }||
fS )a	  
    Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla

    Args:
        config ([`~transformers."PreTrainedConfig"`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   max_position_embeddings (`int`): The default sequence length used to update the dynamic RoPE at
                inference time
            *   rope_parameters (`dict[str, float]`): The standard RoPE scaling parameters, from which `factor`
                will be accessed. The value of `factor` is used to determine the new base frequency, along with the
                current sequence length (seq_len), the maximum positional embeddings (max_position_embeddings), and the
                computed dimensionality (dim) of the rotary embeddings. If seq_len <= max_position_embeddings, this
                factor has no effect. If seq_len <= max_position_embeddings, this factor effectively stretches the
                context window using an exponent derived from `dim`.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
                the first fraction of the head_dim. Defaults to 1.0.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length, used to update the dynamic RoPE at inference time. If `None` or shorter than
            max_position_embeddings, this value will be overridden by max_position_embeddings.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    r@   rA   rB   rC   r?   rF   r(   r   rD   r   rE   rG   )rH   r   rI   r    rJ   rK   rL   max_position_embeddings
isinstancer   TensormaximumtensorrF   r(   r   rM   rN   r%   rO   )r   r(   r   r   rP   rQ   rA   rC   rR   r?   rS   r   s               r+   _compute_dynamic_ntk_parametersr\      sw   V ""$AKAW611*=]c]s]s-D0445LcRvz6+=+=A[A[+[\H
h..
/C!(+F 00	GU\\	*--LL77w}}U\UcUcd

 gv==> FW$v'E'EE&ST*U[^behibi[jkkDdu||AsAU[[ILLTZbgbmbmLnqttuvH%%%r-   c                    | j                          || j                  |   n| j                  }|d   }|j                  dd      }t        | d| j                  | j
                  z        }t        ||z        }|d   }	|j                  d      }
|j                  d      }|j                  d      }|d	   }|	| j                  |z  }	dd}|
)|r|rt         ||	|       ||	|      z        }
n ||	      }
|j                  d      xs d}|j                  d      xs d
}d fd}d }|t        j                  d|d      j                  |t        j                        |z  z  }d|z  }d|	|z  z  }| j                  j                  dd      } |||||||      \  }}d
 ||||dz        j                  |t        j                        z
  }|d
|z
  z  ||z  z   }||
fS )a	  
    Computes the inverse frequencies with NTK scaling. Please refer to the
    [original paper](https://huggingface.co/papers/2309.00071)

    Args:
        config ([`~transformers."PreTrainedConfig"`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   max_position_embeddings (`int`): The maximum length of the positional embeddings.
            *   rope_parameters (`dict[str, float | int]`): The standard RoPE scaling parameters, from which the following
                keys will be accessed:
                *   `attention_factor` (`float`, *optional*): The scaling factor to be applied to the computed cos/sin.
                    If None, the value is inferred from `factor`, `mscale`, and `mscale_all_dim` as available.
                *   `beta_fast` (`float`, *optional*, defaults to 32): Parameter to set the boundary for extrapolation
                    (only) in the linear ramp function.
                *   `beta_slow` (`float`, *optional*, defaults to 1): Parameter to set the boundary for interpolation
                    (only) in the linear ramp function.
                *   `factor` (`float`, *optional*): The scaling factor applied when interpolating the position IDs to
                    extend the possible context length. Additionally, if `attention_factor` is None, the log of this
                    value is used to compute a value for `attention_factor`, possibly in conjunciton with `mscale` and
                    `mscale_all_dim`, if provided.
                *   `mscale` (`float`, *optional*): If `attention_factor` is None and both `mscale` and
                    `mscale_all_dim` are provided, `mscale` acts scalar augmenting `log(factor)` when computing the
                    numerator for the inferred value of `attention_factor`. If not provided, `attention_factor` will be
                    calculated based on `factor` only.
                *   `mscale_all_dim` (`float`, *optional*): If `attention_factor` is None and both `mscale` and
                    `mscale_all_dim` are provided, `mscale_all_dim` acts scalar augmenting `log(factor)` when computing
                    the denominator for the inferred value of `attention_factor`. If not provided, `attention_factor`
                    will be calculated based on `factor` only.
                *   `original_max_position_embeddings` (`int`): The original max position embeddings used during pretraining.
                *   `truncate` (`bool`, *optional*): Whether to truncate the correction range.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*, defaults to 1.0): If less than 1.0, inverse frequencies
                will be returned for the first fraction of the head_dim.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    r@   rA   rB   rC   r?   rS   mscalemscale_all_dimr   r   c                 J    | dk  ryd|z  t        j                  |       z  dz   S )Nr   rB   g?)mathlog)scaler^   s     r+   
get_mscalez,_compute_yarn_parameters.<locals>.get_mscaleN  s(    A:V|dhhuo-33r-   	beta_fast    	beta_slowc                     |t        j                  || dz  t         j                  z  z        z  dt        j                  |      z  z  S )zPInverse dimension formula to find the dimension based on the number of rotationsrD   )ra   rb   pi)num_rotationsrR   rQ   rW   s       r+   find_correction_dimz5_compute_yarn_parameters.<locals>.find_correction_dim`  sB    dhh6-!:Kdgg:UVWW\]`d`h`him`n\noor-   c                      | |||      } ||||      }|r*t        j                  |      }t        j                  |      }t        |d      t	        ||dz
        fS )z.Find dimension range bounds based on rotationsr   r   )ra   floorceilr   min)	low_rothigh_rotrR   rQ   rW   truncatelowhighrk   s	           r+   find_correction_rangez7_compute_yarn_parameters.<locals>.find_correction_ranged  s^    !'36MN"8S$8OP**S/C99T?D3{CcAg...r-   c                     | |k(  r|dz  }t        j                  |t         j                        | z
  || z
  z  }t        j                  |dd      }|S )NgMbP?rE   r   r   )r   rM   float32clamp)ro   r   rR   linear_func	ramp_funcs        r+   linear_ramp_factorz4_compute_yarn_parameters.<locals>.linear_ramp_factorm  sL    #:5LC||Cu}}=Cc	RKKQ2	r-   r   rD   rG   rr   T)r   )rH   r   rI   r    rJ   rK   rL   rW   rO   r   rM   r%   )r   r(   r   r   rP   rQ   rA   rC   rR   r?   rS   r^   r_   r   rd   re   rg   ru   r{   	pos_freqsinv_freq_extrapolationinv_freq_interpolationrr   rs   rt   inv_freq_extrapolation_factorr   rk   s                              @r+   _compute_yarn_parametersr      s9   t ""$AKAW611*=]c]s]s-D0445LcRvz6+=+=A[A[+[\H
h..
/C!(+F+//0BC!%%h/F)--.>?N';<^'_$
 ~//2RR4 n$Z%?*VUcBd%de)&1 %((5;I$((5:Ip/ aa03363UX[[\I 9_ FY$67%%))*d;H%iCGgiqrIC %&(:3cQh(O(R(RZ`hmhshs(R(t$t!!&C"CD
 #@
@	A  %%%r-   c                 @   | j                          || j                  |   n| j                  }|d   }|j                  dd      }t        | d| j                  | j
                  z        }t        ||z        }|d   }	|d   }
|j                  d      }|j                  d      }|d	   }|| j                  |z  }|I|dk  rd}nAt        j                  d
t        j                  |      t        j                  |      z  z         }|r,||kD  r't        j                  |	t        j                  |      }n&t        j                  |
t        j                  |      }t        j                  d|dt        j                  |      j!                         |z  }d|||z  z  z  }||fS )a  
    Computes the inverse frequencies with LongRoPE scaling. Please refer to the
    [original implementation](https://github.com/microsoft/LongRoPE)

    Args:
        config ([`~transformers."PreTrainedConfig"`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   max_position_embeddings (`int`): The maximum length of the positional embeddings.
            *   original_max_position_embeddings (`int`, *optional*): The original max position embeddings used during
                pretraining. If not provided, defaults to `max_position_embeddings`.
            *   rope_parameters (`dict[str, float]`): The standard RoPE scaling parameters, from which the following keys
                will be accessed:
                *   `attention_factor` (`float`, *optional*): The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, inferred from
                    the value of `factor`.
                *   `factor` (`float`, *optional*): The scaling factor to apply to the RoPE embeddings. If both
                    `max_position_embeddings` and `original_max_position_embeddings` are provided, this value will be
                    overridden s the ratio between those values.
                *   `long_factor` (`float`, *optional*): The scale factor applied when computing the inverse
                    frequencies if `seq_len` is provided and greater than `original_max_position_embeddings`.
                *   `short_factor` (`float`, *optional*): The scale factor applied when computing the inverse
                    frequencies if `seq_len` is None or less-than-or-equal-to `original_max_position_embeddings`.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*, defaults to 1.0): If less than 1.0, inverse frequencies
                will be returned for the first fraction of the head_dim.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length.

    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    r@   rA   rB   rC   long_factorshort_factorr?   rS   r   r   rV   r   rD   )rH   r   rI   r    rJ   rK   rL   rW   ra   sqrtrb   r   r[   rw   rM   rN   rO   )r   r(   r   r   rP   rQ   rA   rC   rR   r   r   r?   rS   r   ext_factorsinv_freq_shaper   s                    r+   _compute_longrope_parametersr     s   d ""$AKAW611*=]c]s]s-D0445LcRvz6+=+=A[A[+[\H
h..
/C&}5K'7L!%%h/F+//0BC';<^'_$
 ~//2RR S="#yyTXXf-=Ii@j-j)jk 7==ll;emmFSll<u}}VT\\!S!5;;vNTTVY\\NkD.$889H%%%r-   c                    | j                          || j                  |   n| j                  }|d   }|j                  dd      }t        | dd      xs | j                  | j
                  z  }t        ||z        }d}	d|t        j                  d|dt        j                        j                  |t        j                  	      |z  z  z  }
|d
   }|d   }|d   }|d   }||z  }||z  }dt        j                  z  |
z  }t        j                  ||kD  |
|z  |
      }||z  |z
  ||z
  z  }d|z
  |z  |z  ||z  z   }||k   ||kD   z  }t        j                  |||      }||	fS )au
  
    Computes the inverse frequencies for llama 3.1.

    Args:
        config ([`~transformers."PreTrainedConfig"`]):
            The model configuration. This function assumes that the config will provide at least the following
            properties:

            *   rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived.
            *   hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly.
            *   num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly.
            *   rope_parameters (`dict[str, float | int]`): The standard RoPE scaling parameters, from which the following
                keys will be accessed:
                *   `factor` (`float`, *optional*): The scaling factor applied to the inverse frequencies when 1) the
                    wavelength is greater than `low_freq_wavelen` prior to smoothing, and 2) to all inverse frequencies
                    during smoothing.
                *   `high_freq_factor` (`float`): The scale factor used to compute `high_freq_wavelen` and
                    the value for the denominator of the smoothing factor prior to the `low_freq_factor` shift.
                *   `low_freq_factor` (`float`): The scale factor used to compute `low_freq_wavelen` and
                    the shift applied to the numerator and denominator of the smoothing factor.
                    frequencies if `seq_len` is None or less-than-or-equal-to `original_max_position_embeddings`.
                *   `original_max_position_embeddings` (`int`): The original max position embeddings used
                    during pretraining. If not provided, the function falls back to `max_position_embeddings`.

            Additionally, this function will make use of the following properties if they are found in the config:

            *   head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be
                derived as hidden_size // num_attention_heads.
            *   partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for
                the first fraction of the head_dim. Defaults to 1.0.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    Nr@   rA   rB   rC   r   rD   rE   rG   r?   low_freq_factorhigh_freq_factorr   r   )rH   r   rI   r    rJ   rK   rL   r   rM   rN   r%   rO   ra   ri   where)r   r(   r   r   rP   rQ   rA   rC   rR   rS   r   r?   r   r   old_context_lenlow_freq_wavelenhigh_freq_wavelenwaveleninv_freq_llamasmooth_factorsmoothed_inv_freqis_medium_freqs                         r+   _compute_llama3_parametersr     s   Z ""$AKAW611*=]c]s]s  -D0445LcRvz40dF4F4F&JdJd4dH
h..
/C du||AsAU[[ILLTZbgbmbmLnqttuvH!(+F*+<=O+,>?*+MNO&8'*::$''kH$G [[+;!;X=NPXYN$w.@EUXgEghM]*n<vEXfHff!223BR8R6SSN[[1BNSN+++r-   )linearr5   yarnr6   llama3c                       e Zd ZU dZeed<   edz  ed<   edz  ed<   edz  ed<   edz  ed<   edz  ed<   edz  ed	<   edz  ed
<   ee   dz  ed<   ee   dz  ed<   edz  ed<   edz  ed<   y)RopeParametersaY
  
    Args:
        rope_theta (`float`):
            The base period of the RoPE embeddings.
        rope_type (`str`, *optional*, defaults to "default"):
            The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
            'llama3'], with 'default' being the original RoPE implementation.
        partial_rotary_factor (`float`, *optional*):
            The percentage of the query and key head embedding on which RoPE will be applied.
        factor (`float`, *optional*):
            Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
            most scaling types, a `factor` of x will enable the model to handle sequences of length x *
            original maximum pre-trained length.
        original_max_position_embeddings (`int`, *optional*):
            Used with 'yarn', 'longrope' and 'llama3'. The original max position embeddings used during
            pretraining.
        attention_factor (`float`, *optional*):
            Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
            computation. If unspecified, it defaults to value recommended by the implementation, using the
            `factor` field to infer the suggested value.
        beta_fast (`float`, *optional*):
            Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
            ramp function. If unspecified, it defaults to 32.
        beta_slow (`float`, *optional*):
            Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
            ramp function. If unspecified, it defaults to 1.
        short_factor (`list[float]`, *optional*):
            Only used with 'longrope'. The scaling factor to be applied to short contexts (<
            `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
            size divided by the number of attention heads divided by 2
        long_factor (`list[float]`, *optional*):
            Only used with 'longrope'. The scaling factor to be applied to long contexts (<
            `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
            size divided by the number of attention heads divided by 2
        low_freq_factor (`float`, *optional*):
            Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
        high_freq_factor (`float`, *optional*):
            Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
    r@   Nr   rA   r?   r   rS   re   rg   r   r   r   r   )	__name__
__module____qualname____doc__rO   __annotations__strrL   list r-   r+   r   r   :  s    &P Tz 4<'DL&)Dj0dl"t|t|u+$$et##T\!dl"r-   r   F)totalc                      e Zd ZdZdZddedz  fdZd Zdddd	edz  fd
Zdde	d	edz  fdZ
dde	d	edz  fdZdde	d	edz  fdZdde	d	edz  fdZdde	d	edz  fdZdde	d	edz  fdZe	 	 ddededededz  d	edz  f
d       Zy)RotaryEmbeddingConfigMixinz[
    A Mixin containing the functionality to standardize and validate RoPE parameters.
    g     @Nignore_keys_at_rope_validationc                    |j                  dd       }|xs | j                  | _        | j                  | j                  ni | _        |j                  dt        | d| j                              }| j                  j	                  d|       |j                  dt        | dd             }|9| j                  j	                  d|       |
t               n
t        |      }|dhz  }| j                          | j                  |       |S )Nrope_scalingr@   rA   ignore_keys)	popr   r    default_theta
setdefaultrI   setrH   validate_rope)r&   r   r8   r   r@   rA   s         r+   convert_rope_params_to_dictz6RotaryEmbeddingConfigMixin.convert_rope_params_to_dictx  s    zz.$7+Ct/C/C7;7K7K7Wt33]_ ZZgdL$J\J\.]^
''jA &

+BGDRikoDp q ,  ++,CEZ[7?SIgEh + .LOfNg-g*$$&'EFr-   c                 d   t        | dd      }t        | dd      }t        | dd      xs i }t        | dd      }|s|st        j                  d       y|-|i k(  s(t        |j	                               j                  |      s|j                  d|j                  dd	             |j                  d|       |||d<   |d   d
v rt        | d      r!| j                  | j                  d<   || _
        y| j                  j                  d| j                         || _
        yt        |      D ]}  }||   j                  d||   j                  dd	             ||   j                  d|       ||||   d<   ||   d   d
v sU| j                  |   j                  d| j                          || _
        y)z
        Helper to standardize the config's rope params field by ensuring the params are defined for each
        later type. For old model the fn will duplicate a single rope param in each layer type (backward compatibility)
        r@   NrA   r   layer_typeszG`standardize_rope_params` was called but no RoPE parameters were found.r   typedefault)r   r   r6   r   )r    loggerwarningr   keysissubsetr   rI   r!   r   r   rW   )r&   r@   rA   r   r   r   s         r+   rH   z2RotaryEmbeddingConfigMixin.standardize_rope_params  s    T<6
 '.Et L!$(94@FBdM48  :NNde Or$9_EYEYE[A\AeAefqAr&&{O4G4GPY4Z[&&|Z@$0;P 78 {+/MM4!CD PTOtOtD(()KL"  / ((334VX\XtXtu  / "+. 	

+66{OT^D_DcDcdjluDvw
+66|ZP(4K`OJ/0GH":.{;?]]((4??:D<X<X	  /r-   r&   r   r   c                    | j                   }|yt        | dd      3t        |j                               j	                  | j
                        rnd|i}|j                         D ]`  }|j                  d|j                  dd            }t        | d| dd      }||d<   | |||	       Ht        j                  d
| d       b y)zY
        Validate the RoPE config arguments, given a `"PreTrainedConfig"` object
        Nr   full_attentionr   r   r   
_validate__rope_parametersr   zMMissing validation function in 'RotaryEmbeddingConfigMixin' for 'rope_type'='')
r   r    r   r   r   r   valuesrI   r   r   )r&   r   rP   r   r   validation_fns         r+   r   z(RotaryEmbeddingConfigMixin.validate_rope  s      $33'4-9cBVB[B[B]>^>g>g?
 $46J#K 3::< 
	O'++K9L9LVU^9_`I#DJykAQ*RTXYM+4OK((o;Gcdmcnnop
	r-   r   c                 r    ddh}t        |j                               }|d   }| j                  ||||       y )Nr   r@   r   )r   r   _check_received_keys)r&   r   r   required_keysreceived_keysr   s         r+   !_validate_default_rope_parametersz<RotaryEmbeddingConfigMixin._validate_default_rope_parameters  sA    $l3O0023#K0	!!)]MWb!cr-   c                     h d}t        |j                               }|d   }| j                  ||||       |d   }|t        |t              r|dk  rt
        j                  d|        y y )N>   r?   r   r@   r   r   r?   rB   ;`rope_parameters`'s factor field must be a float >= 1, got r   r   r   rX   rO   r   r   r&   r   r   r   r   r   r?   s          r+    _validate_linear_rope_parametersz;RotaryEmbeddingConfigMixin._validate_linear_rope_parameters  sv    =O0023#K0	!!)]MWb!c *>FE!:fslNNXY_X`ab ?Kr-   c                     ddh}t        |j                               }|d   }| j                  ||||       |d   }|t        |t              r|dk  rt
        j                  d|        y y )Nr   r?   r   rB   r   r   r   s          r+   !_validate_dynamic_rope_parametersz<RotaryEmbeddingConfigMixin._validate_dynamic_rope_parameters  s{    $h/O0023#K0	!!)]MWb!c *>FE!:fslNNXY_X`ab ?Kr-   c           	      4   h d}h d}t        |j                               }|d   }| j                  |||||       |d   }|t        |t              r|dk  rt
        j                  d|        |j                  d      }|-t        |t              r|d	k  rt
        j                  d
|        |j                  d      }	|	(t        |	t              st
        j                  d|	        |j                  d      }
|
(t        |
t              st
        j                  d|
        |	xs d|
xs dk  rt
        j                  d|	 d|
 d       | j                  d   }| j                  |z  }||k7  r&|dk7  r t
        j                  d| d| d| d       y y y )N>   r?   r   r@   r   >   r^   rr   re   rg   r_   rS   r   r   r?   rB   r   rS   r   O`rope_parameters`'s attention_factor field must be a float greater than 0, got re   z9`rope_parameters`'s beta_fast field must be a float, got rg   z9`rope_parameters`'s beta_slow field must be a float, got rf   r   zR`rope_parameters`'s beta_fast field must be greater than beta_slow, got beta_fast=z( (defaults to 32 if None) and beta_slow=z (defaults to 1 if None)r   zKThe explicitly set RoPE scaling factor (config.rope_parameters['factor'] = z) does not match the ratio implicitly set by other parameters (implicit factor = post-yarn context length / pre-yarn context length = config.max_position_embeddings / config.rope_parameters['original_max_position_embeddings'] = z). Using the explicit factor (z) in YaRN. This may cause unexpected behaviour in model usage, please correct the 'original_max_position_embeddings' fields in the model config.)r   r   r   rX   rO   r   r   rI   r   rW   warning_once)r&   r   r   r   optional_keysr   r   r?   rS   re   rg   r   implicit_factors                r+   _validate_yarn_rope_parametersz9RotaryEmbeddingConfigMixin._validate_yarn_rope_parameters  s   a
 O0023#K0	!!)]M=fq!r *>FE!:fslNNXY_X`ab*../AB'<Le1TXhklXlNNabrast $''4	 Iu)ENNVW`Vabc#''4	 Iu)ENNVW`VabcO	Q/NNdendo p::CD\^ ,0+?+?@b+c(669YYf$A)=]^d]e fq ###A& J~	~ *>$r-   c                    h d}ddh}t        |j                               }|d   }| j                  |||||       |j                  dd      }t	        | d| j
                  | j                  z        }t        ||z        }	|j                  d	      }
t        |
t              s*t        d
 |
D              rt        j                  d|
        t        |
      |	dz  k7  r't        j                  d|	dz   dt        |
              |j                  d      }t        |t              s*t        d |D              rt        j                  d|        t        |      |	dz  k7  r't        j                  d|	dz   dt        |              |j                  d      }|d   }||t        j                  d       nG||t        j                  d       n-t        |t              r|dk  rt        j                  d|        |j                  d      }|/t        |t              r|dk  rt        j                  d|        y y y )N>   r   r@   r   r   r   rS   r?   r   r   rA   rB   rC   r   c              3   H   K   | ]  }t        |t        t        f        y wr;   rX   rL   rO   .0r7   s     r+   	<genexpr>zPRotaryEmbeddingConfigMixin._validate_longrope_rope_parameters.<locals>.<genexpr>5  s     5hVWjS%L6Q5h    "zF`rope_parameters`'s short_factor field must be a list of numbers, got rD   z8`rope_parameters`'s short_factor field must have length z, got r   c              3   H   K   | ]  }t        |t        t        f        y wr;   r   r   s     r+   r   zPRotaryEmbeddingConfigMixin._validate_longrope_rope_parameters.<locals>.<genexpr>=  s     4fUVZC<5P4fr   zE`rope_parameters`'s long_factor field must be a list of numbers, got z7`rope_parameters`'s long_factor field must have length r   av  This model config has set a `rope_parameters['original_max_position_embeddings']` field, to be used together with `max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_parameters`with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, as it is compatible with most model architectures.z4Missing required keys in `rope_parameters`: 'factor'r   g        r   )r   r   r   rI   r    rJ   rK   rL   rX   r   allr   r   lenr   rO   )r&   r   r   r   r   r   r   rA   rC   rR   r   r   r?   r   rS   s                  r+   "_validate_longrope_rope_parametersz=RotaryEmbeddingConfigMixin._validate_longrope_rope_parameters)  sK   v+X6O0023#K0	!!)]M=fq!r / 3 34KS Q4T-=-=AYAY-YZ(223&**>:,-#5h[g5h2hNNcdpcqrs|q(NNJ3RS8*TZ[^_k[lZmn &))-8+t,4fZe4f1fNNbcnbopq{sax'NNI#QR(SYZ]^iZjYkl !$$X.+:;]+^( >>JE ^ @ HNNQRFE*fslNNXY_X`ab*../AB'<Le1TXhknXnNNabrast Yo'r-   c                    h d}|d   }t        |j                               }| j                  ||||       |d   }|t        |t              r|dk  rt
        j                  d|        |d   }|d   }|t        |t              st
        j                  d	|        |t        |t              st
        j                  d
|        ||k  rt
        j                  d| d|        |d   }	|	t        |	t              st
        j                  d|	        |	| j                  k\  r&t
        j                  d|	 d| j                          y y )N>   r?   r   r@   r   r   r   r   r   r?   rB   r   r   r   z?`rope_parameters`'s low_freq_factor field must be a float, got z@`rope_parameters`'s high_freq_factor field must be a float, got zf`rope_parameters`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor=z and low_freq_factor=r   zS`rope_parameters`'s original_max_position_embeddings field must be an integer, got zj`rope_parameters`'s original_max_position_embeddings field must be less than max_position_embeddings, got z and max_position_embeddings=)	r   r   r   rX   rO   r   r   rL   rW   )
r&   r   r   r   r   r   r?   r   r   r   s
             r+    _validate_llama3_rope_parametersz;RotaryEmbeddingConfigMixin._validate_llama3_rope_parameters[  sz   
 $K0	O0023!!)]MWb!c *>FE!:fslNNXY_X`ab)*;<*+=>"*_e*LNN\]l\mno#:6F+NNN]^n]opq.NNx#$$9/9JL
 ,;;]+^(+3:Ffhk;lNNe346 ,t/K/KKNN|344QRVRnRnQoq Lr-   r   r   r   r   c                 
   d|v r|dhz  }|j                  d       |xs
 t               }d|vr|j                  d       |||z  }||z
  }|rt        d|  d|       ||z
  |z
  }|rt        j	                  d|  d|        yy)z\Compare the received keys in `config.rope_parameters` against the expected and optional keysr   r   rA   Nz<Missing required keys in `rope_parameters` for 'rope_type'='z': z8Unrecognized keys in `rope_parameters` for 'rope_type'=')addr   KeyErrorr   r   )r   r   r   r   r   missing_keysunused_keyss          r+   r   z/RotaryEmbeddingConfigMixin._check_received_keys  s     ]"fX%Mk*%."-756 "[(M$}4YZcYddghtguvww#m3mCNNUV_U``cdocpqr r-   r;   )NN)r   r   r   r   r   r   r   rH   r   dictr   r   r   r   r   r   staticmethodr   r   r   r-   r+   r   r   q  s>    M#PT* 2./`. S4Z 6d dTWZ^T^ dc cSVY]S] cc cTWZ^T^ c1d 1QTW[Q[ 1f0$ 0UX[_U_ 0d' 'SVY]S] 'R 
 %)"&sss s Tz	s
 4Zs sr-   r   r   c                 |    t        j                  dt               | j                          | j	                  |       y)zq
    This is a deprecated function.
    It has been kept for backward compatibility with custom code models.
    aX  `rope_config_validation` is deprecated and has been removed. Its functionality has been moved to RotaryEmbeddingConfigMixin.validate_rope method. PreTrainedConfig inherits this class, so please call self.validate_rope() instead. Also, make sure to use the new rope_parameters syntax. You can call self.standardize_rope_params() in the meantime.r   N)warningswarnFutureWarningrH   r   )r   r   s     r+   rope_config_validationr     s:    
 MM	G
 	 ""$
[1r-   )NNNN)NNNr;   ) ra   r   	functoolsr   typingr   r   r   utilsr	   r
   
get_loggerr   r   r   configuration_utilsr   r<   rL   r   tuplerO   rT   r\   r   r   r   r"   r   r   r   r   r   r-   r+   <module>r      sm      5 5 . 
		H	% 5`H ,0'+!	3&'(3&^$3& 4Z3& d
	3&
 >5 !3&n ,0'+!	C&'(C&^$C& 4ZC& d
	C&
 >5 !C&P (,!	D&D&^$D& 4ZD& d
	D&
 >5 !D&R (,!	U&U&^$U& 4ZU& d
	U&
 >5 !U&t (,!	L,L,^$L, 4ZL, d
	L,
 >5 !L,f 6.$,( 4#Ye 4#nos osd	2#= 2CRVJ 2r-   