
    qi                     2   d Z ddlmZ ddlZddlZddlmZ ddlmc mZ	 ddl
mZ ddlmZmZmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z( ddl)m*Z*m+Z+ ddl,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7 ddl8m9Z9m:Z:m;Z;m<Z<m=Z=  e#j|                  e?      Z@dLdeAfdZB	 dMdej                  deAdeAdeAfdZDdej                  dej                  deAd eFd!eAd"ej                  fd#ZGd$dej                  fd%ej                  d&eAd'eAd(eId)eAd*ej                  d"eKej                  ej                  f   fd+ZLd,ej                  d-eAdz  d"ej                  fd.ZM G d/ d0e4      ZN G d1 d2e5      ZO G d3 d4e/      ZP G d5 d6e2      ZQ G d7 d8e6      ZR G d9 d:e3      ZSe! G d; d<e1             ZT G d= d>eT      ZU G d? d@eT      ZV G dA dBeT      ZW G dC dDeT      ZX G dE dFeT      ZY e!dGH       G dI dJeTe             ZZg dKZ[y)Nz<Blt modular model, inheriting from Mllama where appropriate.    )CallableN   )initialization)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_causal_mask)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg)maybe_autocastmerge_with_config_defaults)OutputRecordercapture_outputs   )rotate_half)LlamaRotaryEmbedding)MllamaPreTrainedModelMllamaSelfAttentionDecoderLayerMllamaTextCrossAttentionMllamaTextMLPMllamaTextRMSNormMllamaTextSelfAttentioneager_attention_forward   )	BltConfigBltGlobalTransformerConfigBltLocalDecoderConfigBltLocalEncoderConfigBltPatcherConfigprimec                     t        j                  |t         j                  | j                        }t        j                  | j
                  d   | j                        }||z  }t        j                  | |z  d      S )a  
    A polynomial rolling hash algorithm that converts sequences
    of tokens into hash values. The hash is computed as:
        hash = (token_0 * prime^0 + token_1 * prime^1 + ... + token_n * prime^n)

    The rolling hash allows the model to efficiently
    identify and encode recurring byte-level patterns in the input text.

    Args:
        token_tensor (torch.Tensor): [batch_size, seq_len, group_size] containing token IDs to hash
        prime (int): Prime number used as the base for the polynomial hash.

    Returns:
        torch.Tensor: Hash values of shape [batch_size, seq_len] where each value
                     represents the hash of the corresponding token group

    Example:
        >>> tokens = torch.tensor([[1, 2, 3], [4, 5, 6]])
        >>> hashes = rolling_polynomial_hash(tokens, prime=31)
        >>> # hash[0] = 1*31^0 + 2*31^1 + 3*31^2
        >>> # hash[1] = 4*31^0 + 5*31^1 + 6*31^2
    dtypedevicer.   dim)torchtensorint64r.   arangeshapesum)token_tensorr*   prime_tensorpowersprime_powerss        U/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/blt/modular_blt.pyrolling_polynomial_hashr>   :   sa    . <<U[[ATATUL\\,,,R09L9LMF'L99\L0b99    	token_ids
group_sizemax_hashc                 Z   t        j                         5  | j                  \  }}t        j                  ||dz
  t         j                  | j
                        }t        j                  || gd      }|j                  d|d      }t        ||      }	|	|z  }
ddd       |
S # 1 sw Y   
S xY w)z1Hash token groups and map to range [0, max_hash].r$   r,   r1   N)	r3   no_gradr7   zerosr5   r.   catunfoldr>   )r@   rA   r*   rB   
batch_sizeseq_lenpaddingpadded_tokenswindowshasheshash_valuess              r=   byte_group_hash_functionrO   W   s     
 	('oo
G++j*q.T]TdTde		7I"6A>  &&q*a8(%8x'	( 	( s   BB  B*local_encoder_tokensencoder_hash_tok_embedding$encoder_hash_byte_group_nb_functionsencoder_hash_byte_group_sizeencoder_hash_byte_group_vocabreturnc                    g d}|j                  |       }d}t        |      D ]Y  }	||	t        |      z     }
|D ]A  }t        | ||
|      }|||z  z   }| ||      j	                  |j
                        z  }|dz  }C [ |S )z=Compute token embeddings enhanced with hash-based embeddings.)ʚ;l   21A ioYl   vt l   . l   }g l   Au l   0 l   T l   AK l   | r   r$   )embed_tokensrangelenrO   tor.   )rP   local_encoderrQ   rR   rS   rT   primes
embeddingsembedding_idxfunc_nbr*   rA   hash_idsoffset_hash_idss                 r=   compute_hash_embeddingsrc   i   s    F ++,@AJM=> wV,-6 	J/0DjRWYvwH&9V)VVO4_EHHIZIZ[[JQM	 r?   F	patch_idsnum_patchessequence_lengthpatches_as_queriescross_attn_kr-   c                 z   | j                   \  }}| j                  }|rp||z  }	|}
t        j                  ||      j	                  d      j	                  d      j                  |||      }| j	                  d      j                  |||      }no|}	||z  }
| j	                  d      j                  |||      }t        j                  ||      j	                  d      j	                  d      j                  |||      }||k(  }|rdnd}|j                  ||      }||	|
f}|j                   |k7  rt        d|j                    d|       |j	                  d      }d|j                  |      z
  }|j                  |j                  t        j                        t        j                  |      j                        }|S )	aR  
    Prepare cross-attention mask for patch-based attention, following mllama's robust approach.

    This function creates masks that control which patches can attend to which other patches,
    with support for query/key role swapping and cross-attention multipliers.

    Args:
        patch_ids (torch.Tensor): Tensor of shape [batch_size, seq_len] containing patch ids.
        num_patches (int): Total number of patches.
        sequence_length (int): Length of the sequence.
        patches_as_queries (bool): If True, patches are used as queries, otherwise as keys.
        cross_attn_k (int): Cross-attention multiplier for repeating patches.
        dtype (torch.dtype): Data type for the output mask.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]:
            - cross_attention_mask: 4D tensor [batch_size, 1, q_len, kv_len]
    r0   r   r/   r$   r1   zCross attention mask shape z doesn't match expected g      ?)r7   r.   r3   r6   	unsqueezeexpandrepeat_interleave
ValueErrorr[   masked_fillboolfinfomin)rd   re   rf   rg   rh   r-   rH   rI   r.   q_lenkv_lenq_patch_idskv_patch_idscross_attention_mask
repeat_dimexpected_shapeinverted_cross_attn_masks                    r=   #_prepare_patch_cross_attention_maskrz      s   4 $//JF l*  LLV4Yq\Yr]VJW5	 	 !**1-44ZgV|+))"-44Z+VLLV4>>qAKKANUUV`bikvw 	 ',6 )bJ/AA,T^A_ !%0N!!^3)*>*D*D)EE]^l]mn
 	

 099!<  #%9%<%<U%CC3?? ##EJJ/U1C1G1G  r?   patch_lengthsmax_patch_lengthc                 6   || S | j                  d      }g }| D ]j  }g }||dkD     D ]J  }|j                         }t        ||      \  }}|j                  |g|z         |s:|j	                  |       L |j	                  |       l t        d |D              }	t        j                  ||	f| j                  | j                        }
t        |      D ]D  \  }}|s	t        j                  || j                  | j                        |
|dt        |      f<   F |
dk7  j                  d      j                         |
j                  d   k  rM|
dk7  j                  d      j!                         j                         j                         dz   }|
ddd|f   }
|
S )a  
    Splits patch lengths into smaller segments if they exceed `max_patch_length`.
    Pads the result to uniform length across the batch.

    Args:
        patch_lengths (torch.Tensor): [batch_size, num_patches] tensor of patch lengths.
        max_patch_length (int, optional): Maximum allowed length per patch.

    Returns:
        torch.Tensor: [batch_size, max_len] tensor of split and padded patch lengths.
    Nr   c              3   2   K   | ]  }t        |        y wN)rZ   ).0splitss     r=   	<genexpr>z(process_patch_lengths.<locals>.<genexpr>   s     6&#f+6s   r,   r1   r$   )sizeitemdivmodextendappendmaxr3   rE   r-   r.   	enumerater4   rZ   anyr8   r7   nonzero)r{   r|   rH   	processedseqr   lengthfull_chunks	remaindermax_lenpaddedilast_nonzeros                r=   process_patch_lengthsr      s    ##A&JI !#'l 	)F[[]F%+F4D%E"KMM+,{:;i(	) 	 ! 6I66G[[*g.m6I6IR_RfRfgFy) t	6',||F-BUBU^k^r^r'sF1mFm#$t
 	!Q##%Q7!((Q(/779==?DDFJ=L=()Mr?   c                       e Zd Zy)BltMLPN__name__
__module____qualname__ r?   r=   r   r   	      r?   r   c                       e Zd Zy)
BltRMSNormNr   r   r?   r=   r   r     r   r?   r   c                   D    e Zd Z ej                         ed               Zy)BltRotaryEmbeddingc                    | j                   d d d d f   j                         j                  |j                  d   dd      }|d d d d d f   j                         }t	        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d      5  |j                         |j                         z  j                  dd      }t        j                  |dd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j                  |j                   
      	j                  |j                   
      fS # 1 sw Y   AxY w)Nr   r/   r$   mpscpuF)device_typeenabledr   r1   )r-   )inv_freqfloatrk   r7   
isinstancer.   typestrr   	transposer3   rl   cosattention_scalingsinr[   r-   )
selfxposition_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   s
             r=   forwardzBltRotaryEmbedding.forward  s@    !MM$4-8>>@GGHZHZ[\H]_acde ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	5&,,.1F1L1L1NNYYZ[]^_E))%;C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s   =BFF
N)r   r   r   r3   rD   r   r   r   r?   r=   r   r     s$    U]]_<  <r?   r   c                   $     e Zd Zdef fdZ xZS )BltTransformerLayer	layer_idxc                     t         |           t        ||      | _        t	        |      | _        t        |j                  |j                        | _	        t        |j                  |j                        | _
        y )N)configr   eps)super__init__BltSelfAttention	self_attnr   mlpr   hidden_sizerms_norm_epsinput_layernormpost_attention_layernormr   r   r   	__class__s      r=   r   zBltTransformerLayer.__init__#  s]    )9M&>)&*<*<&BUBUV(263E3E6K^K^(_%r?   )r   r   r   intr   __classcell__r   s   @r=   r   r   "  s    `# ` `r?   r   c                   (     e Zd Zdedef fdZ xZS )r   r   r   c                 &    t         |   ||       y r   )r   r   r   s      r=   r   zBltSelfAttention.__init__-  s    +r?   )r   r   r   r%   r   r   r   r   s   @r=   r   r   ,  s    ,y ,S , ,r?   r   c            
            e Zd ZdZddedededz  f fdZ	 	 ddej                  dej                  dz  d	ej                  dz  d
e	e
   fdZ xZS )BltCrossAttentionz<Cross-attention module for Blt, following transformers styleNr   r   r   c                     t         |           d| _        t        | j                  |j
                        | _        t        | j                  |j
                        | _        y )NFr   )r   r   	is_causalr   r   r   q_normk_norm)r   r   r   r   r   s       r=   r   zBltCrossAttention.__init__4  sI     !1!1v7J7JK !1!1v7J7JKr?   hidden_statescross_attention_statesattention_maskkwargsc                 `   |j                         \  }}}| j                  |      }| j                  |      }|j                  ||| j                  | j
                        j                  dd      }| j                  |      }| j                  |      }	| j                  |      }
|	j                  |d| j                  | j
                        j                  dd      }	|
j                  |d| j                  | j
                        j                  dd      }
t        j                  | j                  j                  t              } || ||	|
|f| j                   sdn| j"                  | j$                  d|\  }}|j'                  ||d      j)                         }| j+                  |      }||z   }||fS )Nr$   r   r/           )dropoutscaling)r   r   q_projview	num_headshead_dimr   r   k_projv_projnum_key_value_headsr   get_interfacer   _attn_implementationr#   trainingr   r   reshape
contiguouso_proj)r   r   r   r   r   bszrr   _query_states
key_statesvalue_statesattention_interfaceattn_outputattn_weightss                 r=   r   zBltCrossAttention.forward:  s    &**,UA{{=1{{<0#((eT^^T]]S]]^_abc!%-C!D[[!78
{{#9:__S"d.F.FV``abdef
#((b$2J2JDMMZddefhij(?(M(MKK,,.E)
 %8	%
  $}}C$,,LL	%
 	%
!\ "))#ub9DDFkk+.!M1L((r?   r   NN)r   r   r   __doc__r%   r   r   r3   Tensorr   r   r   r   r   s   @r=   r   r   1  sw    FLy LS LsTz L 7;.2	#)||#) !&t 3#) t+	#)
 +,#)r?   r   c                       e Zd ZU eed<   dZdZdZdgZ e	e
d       e	ed      dZ ej                         d        Zy	)
BltPreTrainedModelr   Fr   r   )indexr$   )r   
attentionsc           	      $   |j                   j                  }t        |t        t        j
                  f      sd|v sd|v rYt        |dd      t        j                  |j                         t        |dd      t        j                  |j                         yt        |t        j                        rt        | j                  dd      }|7t        | j                  d      r!t        | j                  j                  dd      }||j                   }|dz  }t        j"                  |j                  d	|d
|z  d|z         |j$                  ,t        j                  |j                  |j$                            yt        |t&        t(        f      s|dv rqt        | j                  dd      }|t        |d      r|j*                  }|>dD ]9  }t        ||d      }|t        |d      s |j                  j,                  d   } n |y|dz  }dD ]v  }t        ||d      }|t        |d      s t        j"                  |j                  d	|d
|z  d|z         t        |dd      Xt        j                  |j                         x t        |dt        |dd            }	|	bt        |	d      rVt        j"                  |	j                  d	|d
|z  d|z         t        |	dd      t        j                  |	j                         yt        |t.              s|dk(  rt        | j                  dd      }|7t        | j                  d      r!t        | j                  j0                  dd      }|7t        | j                  d      r!t        | j                  j                  dd      }d}
||dz  }
t        |dt        |dd            }t        |dd      }t        |dt        |dd            }||fD ]  }|t        |d      s|
xs |j                  j,                  d   dz  }t        j"                  |j                  d	|d
|z  d|z         t        |dd      kt        j                  |j                          |t        |d      rt|j                  j,                  d   }|dz  }t        j"                  |j                  d	|d
|z  d|z         t        |dd      t        j                  |j                         yt        |t        j2                        rg|j4                  }|dz  }t        j"                  |j                  d	|d
|z  d|z         |j                  t        j                  |j                         yt        |t6              r|j8                  dk7  rt:        |j8                     n|j<                  } ||j                        \  }}t        j>                  |j@                  |       t        j>                  |jB                  |       yy)a  
        Initialize BLT weights following the original ByteLatentTransformer:

        - Most weights are drawn from a truncated normal.
        - Scale is ~ 1 / sqrt(model_dim) (or 1 / sqrt(hidden_dim) for FFN outputs).
        - Norm layers are set to weight = 1, bias = 0.
        RMSNorm	LayerNormweightNbiasr   encoder_configg      r   r   )meanstdab)r"   r   )r   r   r   r   denser/   )r   r   r   r   r  r    decoder_config	gate_projfc1up_proj	down_projfc2r$   default)"r   r   r   r   nnr   getattrinitones_r   zeros_r   	Embeddingr   hasattrr   embedding_dimtrunc_normal_padding_idxr   r   r   r7   r   r  Linearin_featuresr   	rope_typer   compute_default_rope_parameterscopy_r   original_inv_freq)r   module
class_namer   r  r2   nameproj	proj_namer   in_stdr  r  r	  
hidden_dimout_stdfan_inrope_fnbuffer_valuer   s                       r=   _init_weightsz BltPreTrainedModel._init_weightst  s    %%..
 fz2<<89Y*=TXcgqXqvx.:

6==)vvt,8FKK( fbll+!$++}dCK"wt{{<L'M%dkk&@&@-QUV"$22t#Cs(c' !!-FMM&*<*<=> f/1BCD
 W
 I
 $++}d;C{wv}=(({M D"646D'GD(,C"kk//3	
 {t)C < /	vy$7#h(?&& s(c' tVT2>DII./ VXwvw/MNF!gfh&?""MM3h#g 6640<KK, ff%)F!$++}dCK"wt{{<L'M%dkk&@&@-QUV"wt{{<L'M%dkk&@&@-QUV F&$d*WVUD5QRIfi6GWVUD5QRI #G, /#h(? BT[[%6%6q%9T%AC&& s(c' tVT2>DII./ $H)E&--33A6
$d*""$$7l'k 9fd3?KK	/ fbii(''F$,Cs(c' {{&FKK(f01 ##y0 $F$4$45;; 
 &fmm4OL!JJv5JJv//> 2r?   N)r   r   r   r%   __annotations___supports_attention_backend_supports_flash_attn_supports_flex_attn_no_split_modulesr   r   r   _can_record_outputsr3   rD   r'  r   r?   r=   r   r   `  s_    "' ./'(;1E$%5Q? U]]_Z? Z?r?   r   c                   |    e Zd ZU eed<   d eedd      iZdef fdZ	 	 	 	 	 	 	 	 	 	 dde	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  dedz  de	j                  dz  de	j                  dz  dedz  de	j                  dz  dee   fdZd Z xZS )BltLocalEncoderr   encoder_attentionsr$   r\   r   
layer_namec           	         t         |   |       d| _        || _        t	        j
                  t        |j                        D cg c]  }t        ||       c}      | _	        t        |      | _        t	        j                  |j                  |j                  |j                  z  d      | _        t	        j                   |j"                  |j                        | _        t	        j
                         | _        |j(                  r|j                  nd}t        |      D ]3  }| j&                  j+                  t-        |||j                               5 | j/                          y c c}w )NFr   r  out_featuresr   r$   r   r   r   )r   r   gradient_checkpointingr   r  
ModuleListrY   num_hidden_layersr   layersr   
rotary_embr  r   rh   patch_embedding_projectionr  
vocab_sizerX   cross_attn_layerscross_attn_all_layersr   r   	post_initr   r   r   layers_to_addr   s       r=   r   zBltLocalEncoder.__init__  s!    &+#mmEJ6KcKcEde	 3e
 -F;*,))**++f.A.AA+
'
 LL):):F<N<NO!#4:4P4P00VW}- 	I""))!9RXRdRde	
 	! fs   E'N	input_idsinputs_embedspatch_embedsr   r   past_key_valuescache_positionencoder_attention_maskre   rd   r   c           	         || j                  |      }|j                  d   }t        j                  || j                  j                  | j
                        }|Mt        j                  |j                  d   |j                        j                  d      j                  |d      }| j                  ||      }t        j                  || j                  j                  | j
                        }t        | j                        D ]  \  }} ||f||||d|}|t        | j                        dz
  k(  s| j                  j                  sF| j!                  ||	|
      }| j#                  |      }|j%                  ||j                  d   | j                  j&                  z  | j                  j(                        }| j                  j                  r|nd} | j*                  |   d|||d|\  }}||z   } |}||fS )	Nr   pr   r$   r0   r/   position_embeddingsr   rG  rH  r   r   r   r   )rX   r7   Fr   r   r   r3   r6   r.   rj   rk   r<  r   r;  rZ   r@  patch_reducer=  r   rh   r   r?  )r   rD  rE  rF  r   r   rG  rH  rI  re   rd   r   rH   r   rN  idxlayerr   cross_attention_outputr   encoder_cross_statess                        r=   r   zBltLocalEncoder.forward/  s      --i8M"((+
		-4;;3F3FQUQ^Q^_]003M<P<PQ[[\]^eefprtu  #oom\J		-4;;3F3FQUQ^Q^_#DKK0 	EJC!$7- /- M c$++&**dkk.O.O#00YW#>>|L+33 2 21 58P8P PRVR]R]RiRi  $(;;#D#DC!	,MD,B,B9,M -".+8#9- 	-)&  ,.DD-	E.  ,222r?   c                 F   |j                   d   }|j                   d   }|j                  d      j                  dd|j                   d         }t        j                  |||f|j
                  |j                        }|j                  |d|dd      }|ddd|ddf   }|S )	a  
        Reduce variable length patches to single embedding per patch
        Note: this works with variable number of patches for different sequences in the batch
        It handles variable length patches by assuming that patch_lengths will be 0 for any
        extra patches on the *right*. Since there can be a variable number of patches
        this function also return the number of patches for each sequence in the batch.
        Any embeddings on the right that are not allocated to a patch
        (i.e. if the sum(patch_lengths[i]) < seq_len for any i)
        will be sent to a dummy patch, which is trimmed before returning.
        r   r/   r,   r$   amaxF)srcr2   r   reduceinclude_selfN)r7   rj   rk   r3   rE   r-   r.   scatter_reduce)r   r   max_num_patchesrd   rH   r  reduced_embeddingss          r=   rQ  zBltLocalEncoder.patch_reducee  s     #((+
%++B/''+222r=;N;Nr;RS	"[[-8@S@S\i\p\p
 0>> ? 
 03CO3CQ0FG!!r?   
NNNNNNNNNN)r   r   r   r(   r(  r   r   r-  r   r3   
LongTensorr   r   r   r   r   r   rQ  r   r   s   @r=   r/  r/    s!   !!n-=QSbc4 2 .2-1,0.204(,266:"&)-43##d*43 ||d*43 llT)	43
 t+43 &&-43 43 ((4/43 !&t 343 4Z43 <<$&43 +,43l"r?   r/  c                   0    e Zd ZU eed<   def fdZ	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	e	dz  d
ej                  dz  dej                  dz  de
e   fdZ xZS )BltLocalDecoderr   c           	         t         |   |       d| _        || _        d| _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _
        t        |      | _        t        j                  |j                  |j                  |j                   z  d      | _        t%        |j                  |j&                        | _        t        j                         | _        |j,                  r|j                  nd}t        |      D ]3  }| j*                  j/                  t1        |||j                               5 | j3                          y c c}w )NFTr4  r5  r   r$   r7  )r   r   r8  r   cross_attn_decoderr  r9  rY   r:  r   r;  r   r<  r  hidden_size_globalr   rh   r=  r   r   normr?  r@  r   r   rA  rB  s       r=   r   zBltLocalDecoder.__init__  s$    &+#"&mmEJ6KcKcEde	 3e
 -F;*,))11++f.A.AA+
'
 v11v7J7JK	!#4:4P4P00VW}- 	I""))!9RXRdRde	
 	! fs   E%NrD  rE  rF  r   r   rG  rH  rI  r   c	           	      $   |j                   d   }
|}| j                  |      }|j                  |
|j                   d   | j                  j                  z  | j                  j
                        }|| j                  s||z   }|Mt        j                  |j                   d   |j                        j                  d      j                  |
d      }| j                  ||      }t        j                  || j                  j                  | j                        }t!        | j"                        D ]O  \  }}|dk(  s| j                  j$                  r! | j&                  |   d|||d|	\  }}||z   } ||f||||d|	}Q | j)                  |      }|S )	Nr   r$   r0   r/   rK  rO  rM  r   )r7   r=  r   r   rh   r   rc  r3   r6   r.   rj   rk   r<  rP  r   r   r   r;  r@  r?  re  )r   rD  rE  rF  r   r   rG  rH  rI  r   rH   r   rN  r   rS  rT  r   logitss                     r=   r   zBltLocalDecoder.forward  s    #((+
%66|D#++**1-0H0HH$++JaJa
 #D,C,C)L8M]003M<P<PQ[[\]^eefprtu  #oom\J		-4;;3F3FQUQ^Q^_!$++. 	HAuAv::,ED,B,B1,E -"/+7#9- 	-)& !.0F F!$7- /- M	" =)r?   NNNNNNNN)r   r   r   r'   r(  r   r3   r_  r   r   r   r   r   r   r   s   @r=   ra  ra    s    !!4 4 .2-1,0.204(,266:0##d*0 ||d*0 llT)	0
 t+0 &&-0 0 ((4/0 !&t 30 +,0r?   ra  c                        e Zd ZU eed<   d eedd      iZdef fdZ e	ddd	
      	 	 	 	 dd	e
j                  de
j                  dz  de
j                  dz  dedz  de
j                  dz  dee   fd       Z xZS )BltGlobalTransformerr   global_attentionsr$   global_transformerr1  c                    t         |   |       || _        t        j                         | _        t        |j                        D ]'  }| j
                  j                  t        ||             ) t        |      | _        t        |dd       2t        j                  |j                  |j                  d      | _        nt        j"                         | _        | j%                          y )Nr4  encoder_cross_output_sizeFr   )r   r   r   r  r9  r;  rY   r:  r   r   r   r<  r  r  rn  r   token_embedding_projectionIdentityrA  r   s      r=   r   zBltGlobalTransformer.__init__  s     mmov778 	GIKK269EF	G,F; 66=I.0ii00&2D2D5/D+ /1kkmD+r?   input_embedsz5.6.0rE  )versionnew_nameNr   r   rG  rH  r   c           	         |j                   \  }}}	| j                  |      }
t        j                  |
| j                  j                  | j
                        }
|Mt        j                  |j                   d   |j                        j                  d      j                  |d      }| j                  |
|      }t        | j                        D ]  \  }} ||
f||||d|}
 |
S )NrK  r$   r0   r   r/   rM  )r7   rp  rP  r   r   r   r3   r6   r.   rj   rk   r<  r   r;  )r   rE  r   r   rG  rH  r   rH   rI   r   r   rN  r   rS  s                 r=   r   zBltGlobalTransformer.forward  s     "/!4!4
GQ77F		-4;;3F3FQUQ^Q^_]003M<P<PQ[[\]^eefprtu  #oom\J!$++. 	HAu!$7- /- M	 r?   )NNNN)r   r   r   r&   r(  r   r   r-  r   r   r3   r   r_  r   r   r   r   r   r   s   @r=   rj  rj    s    &&^,<ARfg9 $ ^WO /304(,26|| t+ &&-	
  ((4/ +, Pr?   rj  c                   6    e Zd ZU eed<   def fdZ	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  de	dz  dej                  dz  d	edz  d
ej                  dz  dedz  dedz  dedz  dee   fdZe	 	 dd       Z xZS )
BltPatcherr   c                    t         |   |       t        | j                        | _        t        j                         | _        t        | j                  j                        D ]1  }| j                  j                  t        | j                  |             3 t        j                  | j                  j                  | j                  j                        | _        t!        | j                  j                  | j                  j"                        | _        t        j&                  | j                  j                  | j                  j                  d      | _        | j+                          y )Nr4  r   Fro  )r   r   r   r   r<  r  r9  r;  rY   r:  r   r   r  r>  r   rX   r   r   re  r  lm_headrA  r   s      r=   r   zBltPatcher.__init__  s     ,DKK@mmot{{<<= 	LIKK24;;	JK	LLL)?)?AXAXYt{{66DKK<T<TU	yyKK##KK""
 	r?   NrD  r   r   rG  rE  	use_cacherH  
patch_size	thresholdr|   r   c                 >   |d u |d uz  rt        d      || j                  |      }|r|t        | j                        }|F||j	                         nd}t        j                  |||j                  d   z   |j                        }||j                  d      }t        | j                  |||||      }|}| j                  ||      }| j                  D ]  } ||||      } | j                  | j                  |            }t
        j                  j!                  |      j#                         }|j                  d d	 \  }}|| j%                  ||||	
      }n.t        j&                  ||f|j(                  |j                        }t+        ||
      }|||fS )N:You must specify exactly one of input_ids or inputs_embedsr4  r   r$   r0   r   rE  r   rH  rG  r   )rN  r   )rg  r   )	entropiesrf   r{  r|  r,   )rm   rX   r   r   get_seq_lengthr3   r6   r7   r.   rj   r
   r<  r;  ry  re  distributionsCategoricalentropypatch_lengths_from_entropiesonesr-   r   )r   rD  r   r   rG  rE  rz  rH  r{  r|  r|   r   past_seen_tokenscausal_maskr   rN  rS  rg  prediction_entropiesrH   rf   r{   s                         r=   r   zBltPatcher.forward  s    -t";<YZZ  --i8M0*$++>O!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L(;;'))+%
 &"oom\J[[ 	vE!-EXituM	v dii67$22>>f>MUUW&3&9&9"1&=#
O! ==. /%#	 > M "JJ_-]5H5HQ^QeQeM .m=MN#]F::r?   c                    | j                   d   }t        j                  ddgt        j                  | j                        j                  d      j                  |d      }|j                   d   }| ddddf   } | |kD  }|j                   d   }t        j                  || j                        j                  d      j                  |d      }	t        j                  |	|      }
t        j                  |	|
gd      }t        j                  || gd      }||   j                  ||      }|j                  d      j                         }|ddd|f   }t        j                  |||z   fd      }t        j                  |ddddf   |dz
        }t        j                  |ddddf   dz
  |fd      }||z
  dz   }|S )z
        Computes patch lengths from token entropies.

        Depending on whether a threshold is provided, the function uses either:
        - Thresholding the entropy values (when `threshold` is set).
        r   r$   r,   Nr0   r/   r1   )r7   r3   r4   longr.   rj   repeatr6   rk   	full_likerF   r   r8   r   )r  rf   r{  r|  rH   init_tokensoffset
patch_maskrI   token_indicessentinelpadded_indicespadded_maskpatch_startsmax_valid_patchespatch_start_ids
last_token
patch_endsr{   s                      r=   r  z'BltPatcher.patch_lengths_from_entropies]  s    __Q'
 LL!Quzz):J:JKUUVWX__`jlmn 	 ""1% ae$	 *
""1% WY5E5EFPPQRSZZ[egij??=':M8#<!D iij[ 9qA &k2:::wO&NNqN1557#A'9(9'9$9:  ))[,2G$HaP ___QU%;_q=PQ
YY12 6 :JGQO
"_4q8r?   r^  r   )r   r   r   r)   r(  r   r3   r_  r   r   FloatTensorro   r   r   r   r   r   staticmethodr  r   r   s   @r=   rw  rw  	  s   / $ .2.204(,26!%26!%"&'+?;##d*?; t+?; &&-	?;
 ?; ((4/?; $;?; ((4/?; $J?; 4<?; *?; +,?;B  	3 3r?   rw  c                   v    e Zd Zdef fdZee	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  de
dz  d	ej                  dz  d
edz  dej                  dz  dee   deez  fd              Zd Zd Zdej                  dedej                  fdZ xZS )BltModelr   c                    t         |   |       d| _        || _        t	        |j
                        | _        t        |j                        | _	        t        |j                        | _        |j                  t        |j                        z  }|j                   |z  }t#        j$                  ||j
                  j&                        | _        | j                  j*                  r[t-        |j.                        | _        | j0                  j3                          | j0                  j5                         D ]	  }d|_         nd | _        | j9                          y )NF)r   r   r8  r   r/  r   r\   rj  global_configrl  ra  r  local_decoderrR   rZ   rS   rT   r  r  r   rQ   patch_in_forwardrw  patcher_configpatchereval
parametersrequires_gradrA  )r   r   num_embeddingstotal_vocab_sizeparamr   s        r=   r   zBltModel.__init__  s
    &+#,V-B-BC"6v7K7K"L,V-B-BCDDs6KnKnGoo!??.P*,,,7GI^I^IjIj*k';;''%f&;&;<DLLL002 ,&+#,  DLr?   NrD  r{   r   r   rG  rE  rz  rH  r   rU   c	                    |d u |d uz  rt        d      |rg|5t        t        | j                        t        | j                              }n0t	        |t              s t        |t        | j                              }||}
|j
                  \  }}}no|j
                  \  }}t        || j                  | j                  | j                  j                  | j                  j                  | j                  j                        }
|| j                  j                  dk(  r| j                  |t        d      | j                  || j                  j                  | j                  j                  | j                  j                   | j                  j"                  |j$                        \  }}}no||j$                  n|j$                  }||j&                  n|j&                  }t)        t+        j,                  ||dz   f||      | j                  j                         }| j/                  ||      }|F||j1                         nd}t+        j2                  |||
j
                  d   z   |
j$                  	      }||j5                  d      }t7        | j                  |
||||j8                  nd |
      }t;        ||j
                  d   |d| j                  j<                  |
j&                        } | j                  d||
||||j
                  d   |||j8                  nd d|	\  }}|j?                  ||j
                  d   d      }t+        j2                  d|j
                  d   |j$                  	      }|j5                  d      }t7        | j                  |d |d d 
      } | j@                  d|||d|	}| j/                  |d d dd f   |      }t;        ||j
                  d   |d| j                  j<                  |
j&                        } | jB                  d|||||||jD                  nd ||d|	}tG        ||      S )Nr~  r4  r  z0input_ids is required for entropy-based patching)r{  r|  r|   patching_batch_sizer.   r$   r,   r   r0   r  T)rd   re   rf   rg   rh   r-   )rD  rE  r   r   rI  re   rd   rG  r/   )rE  r   r   F)rD  rE  rF  r   r   rG  rH  rI  )last_hidden_staterG  r   )$rm   r   r   r   r   r7   rc   r\   rQ   rR   rS   rT   patching_moder  r{  patching_thresholdr|   r  r.   r-   r   r3   r  _patch_ids_from_lengthsr  r6   rj   r
   self_attention_cacherz   rh   r   rl  r  cross_attention_cacher   )r   rD  r{   r   r   rG  rE  rz  rH  r   encoder_embedsrH   rf   r   r.   r-   rd   r  r  cross_attn_mask_encencoder_hidden_statesrU  global_cache_positionglobal_position_idsglobal_causal_maskglobal_hidden_statesdecoder_patch_idscross_attn_mask_decoutputs                                r=   r   zBltModel.forward  sq    -t";<YZZ&"5 4l$++6V#  1DE #6o|[_[f[fGg"h $*N-:-@-@*J*3//'J4""//@@8899N  {{((I5$,,:R$$%WXX&*ll#{{55"kk<<%)[[%A%A(,(G(G$++ '3 '#=! .7-B))H\H\+4+@	mFYFY 5JJ
Oa,?@V\]KK00! 00P	!CRC^==?de"\\ "2^5I5I!5L"LUcUjUjN )33A6L(;;())DSD_O@@ei%
 B%++A.+#11 &&
 7Id6H6H 
7
(&%#6%++A.DSD_O@@ei
7
 
7
33  488]EXEXYZE[]_` %Q0D0J0J10MVjVqVq r3==a@/;;.0 
  7t66  
.-, 
 	 
 !88q!"u9M_A'%++A.+$11 &&
 $## 

/-&%ETE`OAAfj)#6

 

 '$+
 	
r?   c                 .    | j                   j                  S r   r\   rX   )r   s    r=   get_input_embeddingszBltModel.get_input_embeddings6  s    !!...r?   c                 &    || j                   _        y r   r  )r   values     r=   set_input_embeddingszBltModel.set_input_embeddings9  s    */'r?   rI   c                    |j                   d   }t        j                  t        j                  |d|j                  |j
                        |j                  d      d d d df   gd      }t        j                  ||j
                        }|j                  d      |j                  d      j                  d      k  j                  d      dz
  S )Nr   r$   r,   r/   r1   r0   )
r7   r3   rF   rE   r-   r.   cumsumr6   rj   r8   )r   r{   rI   rH   r  token_positionss         r=   r  z BltModel._patch_ids_from_lengths<  s    "((+
yyJ1D1D]MaMab$$$,QV4 
  ,,w}7K7KL&&q)_-F-Fq-I-S-STV-WW\\ac\dghhhr?   rh  )r   r   r   r%   r   r   r   r3   r_  r   r   r  ro   r   r   tupler   r   r  r  r   r  r   r   s   @r=   r  r    s9   y (   .2-1.204(,26!%26I
##d*I
 ||d*I
 t+	I

 &&-I
 I
 ((4/I
 $;I
 ((4/I
 +,I
 
(	(I
   I
V/0
iU\\ 
iC 
iTYT`T` 
ir?   r  zB
    The Blt Text Model with a language modeling head on top.
    )custom_introc                        e Zd ZU eed<   dZdZddiZdef fdZe	e
	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  deej                  ej                  f   dz  dedz  dej                   dz  dej                  dz  dedz  dej                  dz  deej                  z  dee   deez  fd              Z xZS )BltForCausalLMr   Fmodelz'model.local_encoder.embed_tokens.weightzlm_head.weightc                 &   t         |   |       |j                         | _        |j                  | _        t        |      | _        t        j                  |j                  j                  |j                  d      | _        | j                          y )NFro  )r   r   get_text_configtext_configr>  r  r  r  r  r  r   ry  rA  )r   r   r   s     r=   r   zBltForCausalLM.__init__T  sk     !113 ++f%
yy!6!6!B!BFDUDU\abr?   NrD  r   r   r   rv   full_text_row_masked_out_maskrG  rE  labelsrz  rH  logits_to_keepr   rU   c                     | j                   d||||||||
|d	|}|j                  }t        |t              rt	        | d      n|}| j                  |dd|ddf         j                         }d}|	 | j                  ||	| j                  fi |}t        |||j                  |j                  |j                        S )a
  
        cross_attention_states (`torch.FloatTensor`, *optional*):
            Output of the vision model, used for cross-attention. This tensor contains the processed image features that
            the language model will attend to.
        cross_attention_mask (`torch.Tensor` of shape `(batch_size, seq_length, max_num_images, max_num_tiles)`, *optional*):
            Cross-attention mask to control the interaction between text tokens and image tiles.
            This 4D tensor defines which image tiles each text token should attend to.

            For each text token (in seq_length):
            - 1 indicates the token **should attend** to the corresponding image tile
            - 0 indicates the token **should not attend** to the corresponding image tile
        full_text_row_masked_out_mask (`tuple[torch.Tensor, torch.Tensor]`, *optional*):
            A tuple containing two tensors that mask out rows in the cross-attention mechanism:
            - The first tensor has shape `(batch_size, 1, seq_length, 1)` and contains values of 0 or 1.
              A value of 0 indicates that the corresponding text token's entire row in the cross-attention
              matrix should be masked out (all image tokens ignored).
            - The second tensor has the same shape and is used internally to apply the masking during
              the forward pass of cross-attention layers.
            This mask is derived from the cross_attention_mask and is used to handle cases where a text token
            should not attend to any image token.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BltForCausalLM

        >>> model = BltForCausalLM.from_pretrained("itazap/blt-1b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("itazap/blt-1b-hf")

        >>> prompt = "If I had to write a haiku, it would be:"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=40, do_sample=True, temperature=0.6)
        >>> result = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        >>> print(result)
        If I had to write a haiku, it would be: "Snowflakes gently fall" - simple, yet peaceful.
        I love the idea of snowflakes gently falling, each one
        ```
        )	rD  r   r   rv   r  rG  rE  rz  rH  N)lossrg  rG  r   r   r   )r  r  r   r   slicery  r   loss_functionr>  r   rG  r   r   )r   rD  r   r   r   rv   r  rG  rE  r  rz  rH  r  r   outputsr   slice_indicesrg  r  s                      r=   r   zBltForCausalLM.forward]  s    ~ $** 
)%!5*G+')
 
  118B>SV8W~ot4]kmA}a,?@AGGI%4%%ffdooPPD%#33!//))
 	
r?   )NNNNNNNNNNNr   )r   r   r   r%   r(  _can_compile_fullgraphbase_model_prefix_tied_weights_keysr   r   r   r3   r_  r   r  r   r  ro   r   r   r   r   r   r   r   s   @r=   r  r  I  s    "CEUVy   .2.204:>8<RV(,26*.!%26-.X
##d*X
 t+X
 &&-	X

 !& 0 04 7X
 $..5X
 (-U\\5<<-G'H4'OX
 X
 ((4/X
   4'X
 $;X
 ((4/X
 ell*X
 +,X
 
'	'X
  X
r?   r  )r   r  rw  r  )rW   )r   rW   i0u  )\r   collections.abcr   r3   torch.distributionstorch.nnr  torch.nn.functional
functionalrP   r   r  cache_utilsr   r   r   
generationr	   masking_utilsr
   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   utils.deprecationr   utils.genericr   r   utils.output_capturingr   r   cohere2.modeling_cohere2r   llama.modeling_llamar   mllama.modeling_mllamar   r   r   r    r!   r"   r#   configuration_bltr%   r&   r'   r(   r)   
get_loggerr   loggerr   r>   r   rO   r  listrc   float32ro   r-   r  rz   r   r   r   r   r   r   r   r   r/  ra  rj  rw  r  r  __all__r   r?   r=   <module>r     s   C $      & C C ) / O K 5 & R R 0 G E 2 7    
		H	%: :< \a||),9<UX$#,,# !## +.	#
 #'# $'# \\#T  %K ||K K  K  	K 
 K  ;;K  5<<%&K \) )t )X]XdXd )X	] 		" 	<- <"`9 `,. ,
,)0 ,)^ n?. n? n?bp"( p"fK( K\3- 3lH# HVri! rij 
i
' i

i
Xr?   