
    qi              
          d Z ddlZddlZddlmZ ddlmZmZmZmZ ddlm	Z
 ddlmZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZ ddlmZ ddlmZmZ ddl m!Z!  ejD                  e#      Z$dejJ                  de&dejN                  dejJ                  fdZ(dejJ                  dejJ                  de)de*dejJ                  f
dZ+dejJ                  dejJ                  fdZ,dejJ                  dejJ                  dejJ                  fdZ- G d dej\                  j^                        Z0 G d d ejb                        Z2 G d! d"ejb                        Z3 G d# d$ejb                        Z4 G d% d&e      Z5e G d' d(e             Z6e G d) d*e6             Z7 ed+,       G d- d.e6e             Z8 ed/,       G d0 d1e6             Z9e G d2 d3e6             Z:e G d4 d5e6             Z;g d6Z<y)7zPyTorch BLOOM model.    N)nn)BCEWithLogitsLossCrossEntropyLoss	LayerNormMSELoss)
functional   )CacheDynamicCacheStaticCache)GenerationMixin)create_causal_mask)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsQuestionAnsweringModelOutput SequenceClassifierOutputWithPastTokenClassifierOutput)PreTrainedModel)auto_docstringlogging   )BloomConfigattention_mask	num_headsdtypereturnc                    | j                   \  }}dt        j                  t        j                  |            z  }t	        j
                  ddt        j                  |      dz
   z   z  | j                  t        j                        }t	        j                  dd|z   | j                  t        j                        }t	        j                  ||      }||k7  rt	        j
                  ddt        j                  d|z        dz
   z   z  | j                  t        j                        }	t        |||z
        }
t	        j                  ddd|
z  z   d| j                  t        j                        }t	        j                  |t	        j                  |	|      gd      }| j                  d      dz
  | z  dddddf   }|d	   |z  }|j                  ||z  d|      j                  |      S )
a  
    Link to paper: https://huggingface.co/papers/2108.12409 Alibi tensor is not causal as the original paper mentions, it
    relies on a translation invariance of softmax for quick implementation: with l being a tensor, and a fixed value
    `softmax(l+a) = softmax(l)`. Based on
    https://github.com/ofirpress/attention_with_linear_biases/blob/a35aaca144e0eb6b789dfcb46784c4b8e31b7983/fairseq/models/transformer.py#L742
    TODO @thomasw21 this doesn't work as nicely due to the masking strategy, and so masking varies slightly.

    Args:
    Returns tensor shaped (batch_size * num_heads, 1, max_seq_len)
        attention_mask (`torch.Tensor`):
            Token-wise attention mask, this should be of shape (batch_size, max_seq_len).
        num_heads (`int`):
            number of heads
        dtype (`torch.dtype`, *optional*, default=`torch.bfloat16`):
            dtype of the output tensor
       r	   devicer   r   r   dimN).N)shapemathfloorlog2torchtensorr!   float32arangeint32powmincatcumsumreshapeto)r   r   r   
batch_size
seq_lengthclosest_power_of_2basepowersslopes
extra_basenum_remaining_headsextra_powersarange_tensoralibis                 Z/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/bloom/modeling_bloom.pybuild_alibi_tensorr@   -   s   " ,11J
djj9)=>><<	tyy!34q899:;NDYDYafananD \\!Q!33N<Q<QY^YdYdeFYYtV$FY&\\A499Q);%;<q@AABCNLaLainiviv

 ""4iBT6TU||Aq1/B+B'BAnNcNckpkvkvwFEIIj,$GHaP %+++3a7>I1dTU:VM9-E==i/J?BB5II    xresidualprobtrainingc                 @    t        j                  | ||      }||z   }|S )a
  
    Dropout add function

    Args:
        x (`torch.tensor`):
            input tensor
        residual (`torch.tensor`):
            residual tensor
        prob (`float`):
            dropout probability
        training (`bool`):
            training mode
    )prE   )Fdropout)rB   rC   rD   rE   outs        r?   dropout_addrK   Y   s$     ))A
1C
S.CJrA   c                 \    | dz  dt        j                  d| z  dd| z  | z  z   z        z   z  S )z
    Custom bias GELU function. Adapted from Megatron-DeepSpeed code. Here we use a simple implementation (inference) to
    make the model jitable.

    Args:
        x (`torch.tensor`):
            input hidden states
          ?      ? e3E?r   Hm?r)   tanh)rB   s    r?   bloom_gelu_forwardrS   l   s8     s7cEJJzA~X\A=M9M'NOOPPrA   gc                     |d   }t        j                  d|z  dd|z  |z  z   z        }d|z  d||z  z
  dd|z  |z  z   z  z  dd|z   z  z   }|| z  S )a   
    gradient of tanh approximation of gelu gradient of actual gelu is: 0.5 * (1. + torch.erf(x * 0.70710678)) +
    0.3989423 * x * torch.exp(-0.5 * x * x)

    Args:
        g (`torch.tensor`):
            gradient output tensor
        x (`torch.tensor`):
            input tensor
    r   rO   r   rP   rM   g6vf?rQ   )rT   rB   tanh_outffs       r?   bloom_gelu_backrX   x   sz     	
!Azz*q.A1q0@,@ABH	qQH,,lQ>NQR>R1RS	TWZ^_bj^jWk	kB6MrA   c                       e Zd Zedej
                  dej
                  fd       Zedej
                  dej
                  fd       Zy)GeLUFunctioninputr   c                 :    | j                  |       t        |      S N)save_for_backwardrS   )ctxr[   s     r?   forwardzGeLUFunction.forward   s    e$!%((rA   grad_outputc                 6    | j                   }t        ||      }|S r]   )saved_tensorsrX   )r_   ra   r[   tmps       r?   backwardzGeLUFunction.backward   s    !!k51
rA   N)__name__
__module____qualname__staticmethodr)   Tensorr`   re    rA   r?   rZ   rZ      sT    )ELL )U\\ ) ) 5<< ELL  rA   rZ   c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )	BloomGeluzN
    Partly copied from Megatron-DeepSpeed code and adapted for our needs
    c                 "    t         |           y r]   )super__init__)self	__class__s    r?   rp   zBloomGelu.__init__   s    rA   rB   r   c                 ,    t         j                  |      S r]   )rZ   apply)rq   rB   s     r?   r`   zBloomGelu.forward   s    !!!$$rA   )	rf   rg   rh   __doc__rp   r)   rj   r`   __classcell__rr   s   @r?   rm   rm      s(    % %%,, %rA   rm   c                   v    e Zd Zddededz  f fdZdej                  deej                  ej                  ej                  f   fdZ	dej                  dej                  fd	Z
	 	 	 	 dd
ej                  dej                  dej                  dej                  dedz  dededej                  dz  fdZ xZS )BloomAttentionNconfig	layer_idxc                    t         |           |j                  | _        |j                  | _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | _        |j                  | _	        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      dt        j                  | j                        z  | _        d| _        || _        |-t         j#                  d| j$                  j&                   d       t)        j*                  | j                  d| j                  z  d	      | _        t)        j*                  | j                  | j                        | _        t)        j0                  |j2                        | _        y )
NzA`hidden_size` must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).rN   zInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.r	   Tbias)ro   rp   pretraining_tpslow_but_exacthidden_sizen_headr   head_dim
split_sizehidden_dropout
ValueErrorr&   sqrtinv_norm_factorbetar{   loggerwarning_oncerr   rf   r   Linearquery_key_valuedenseDropoutattention_dropout)rq   rz   r{   rr   s      r?   rp   zBloomAttention.__init__   sz   $33$33!--((DNN:**$33==4>>)T-=-==STXTdTdSe fNN#2'   #TYYt}}%==	" !8!8 9 :, ,  "yy)9)91t?O?O;OVZ[YYt//1A1AB
!#F,D,D!ErA   	fused_qkvr   c                    |j                   \  }}}|j                  ||| j                  d| j                        }|ddddf   j	                  dd      }|ddddf   j	                  dd      }|ddddf   j	                  dd      }|||fS )a  
        Split the last dimension into (num_heads, head_dim) and reshapes to (bs, heads, len, dim) shape
        without making any copies, results share same memory storage as `fused_qkv`

        Args:
            fused_qkv (`torch.tensor`): [batch_size, seq_length, num_heads * 3 * head_dim]

        Returns:
            query: [batch_size, num_heads, seq_length, head_dim]
            key: [batch_size, num_heads, seq_length, head_dim]
            value: [batch_size, num_heads, seq_length, head_dim]
        r	   .r   Nr   r   )r%   viewr   r   	transpose)rq   r   r4   r5   three_times_hidden_sizequery_layer	key_layervalue_layers           r?   _reshapezBloomAttention._reshape   s     ;D//7
J 7NN:z4>>1dmm\	Q	*44Q:c1ai(221a8	Q	*44Q:I{22rA   rB   c                    |j                   \  }}}|| j                  z  }|j                  || j                  || j                        }|j	                  dddd      }|j                  ||| j                  | j                  z        S )z
        Merge heads together over the last dimension

        Args:
            x (`torch.tensor`): [batch_size * num_heads, seq_length, head_dim]

        Returns:
            torch.tensor: [batch_size, seq_length, num_heads * head_dim]
        r   r   r   r	   )r%   r   r   r   permuter2   )rq   rB   batch_size_and_num_headsr5   _r4   s         r?   _merge_headszBloomAttention._merge_heads   sy     34''/ *a-?
 FF:t~~z4==I IIaAq! yyZ$--1OPPrA   hidden_statesrC   r>   r   
layer_past	use_cacheoutput_attentionscache_positionc	                    |j                   \  }	}
}| j                  |      }| j                  |      \  }}}|%d|i}|j                  ||| j                  |      \  }}|j                  |	| j                  z  d| j                        }|j                  |	| j                  z  d| j                        j                  dd      }|j                  |	| j                  z  d| j                        }|j                  ||| j                  | j                        }|j                  |	| j                  |
d      }|||z   }t        j                  |dt        j                         j#                  |j$                        }| j'                  |      }|j                  |	| j                  z  |
d      }t        j(                  ||      }| j+                  |      }| j,                  dkD  r| j.                  r| j0                  | j,                  z  }t        j2                  |      }t5        | j,                        D ]z  }|t        j6                  |d d d d t9        ||z        t9        |dz   |z        f   | j:                  j<                  d d t9        ||z        t9        |dz   |z        f         z   }| n| j;                  |      }t?        ||| j@                  | jB                        }||fS )Nr   r$   )batch1batch2r   alpha)r#   r   r   )"r%   r   r   updater{   r2   r   r   r   baddbmmr   r   r   rH   softmaxr)   r+   r3   r   r   bmmr   r   r   r   
zeros_likerangelinearintr   weightrK   r   rE   )rq   r   rC   r>   r   r   r   r   r   r4   q_lengthr   r   r   r   r   cache_kwargsattention_scoresattn_weightsattention_probsattention_probs_reshapedcontext_layerslicesoutput_tensoris                            r?   r`   zBloomAttention.forward   s    #0"5"5
Ha((7	.2mmI.F+Y!,n=L%/%6%6y+t~~_k%l"I{ "))*t~~*Er4==Y%%j4>>&A2t}}U__`bdfg	!))*t~~*Er4==Y !==&&	 ) 
 (,,ZSUV%'.8L ))LbNQQR]RcRcd 00A $3#7#7
T^^8SU]_a#b  		":KH ))-8 "t':':%%(;(;;F!,,];M4../  -!!QAJ#q1u>N:O(O"OPJJ%%aQZ3A?O;P)P&PQ1 ! !JJ}5M#M8T=P=PRVR_R_`o--rA   r]   NFFN)rf   rg   rh   r   r   rp   r)   rj   tupler   r   r
   bool
LongTensorr`   rv   rw   s   @r?   ry   ry      s    F{ FsTz FB3%,, 35u||UZUaUa9a3b 3(Qell Qu|| Q> $("'26B.||B. ,,B. ||	B.
 B. DLB. B.  B. ((4/B.rA   ry   c                   t     e Zd Zdef fdZdej                  dej                  dej                  fdZ xZS )BloomMLPrz   c                 6   t         |           |j                  }|j                  | _        |j                  | _        t        j                  |d|z        | _        t               | _	        t        j                  d|z  |      | _
        |j                  | _        y )N   )ro   rp   r   r   r   r   r   dense_h_to_4hrm   	gelu_impldense_4h_to_hr   )rq   rz   r   rr   s      r?   rp   zBloomMLP.__init__8  sz    (($33$33YY{AOD"YYq;D$33rA   r   rC   r   c                    | j                  | j                  |            }| j                  dkD  r| j                  rt	        j
                  |      }| j                  j                  j                  d   | j                  z  }t        | j                        D ]z  }|t        j                  |d d d d t        ||z        t        |dz   |z        f   | j                  j                  d d t        ||z        t        |dz   |z        f         z   }| n| j                  |      }t        ||| j                  | j                        }|S )Nr   r$   )r   r   r   r   r)   r   r   r   r%   r   rH   r   r   rK   r   rE   )rq   r   rC   intermediate_outputr   r   outputs          r?   r`   zBloomMLP.forwardC  s.   t'9'9-'HI"t':':"'"2"28"<''..44R84;N;NNF4../ &9AHH!!QAJ#q1u>N:O(O"OP&&--aQZ3AQWGWCX1X.XY= '# #'"4"4]"C0(D<O<OQUQ^Q^_rA   )	rf   rg   rh   r   rp   r)   rj   r`   rv   rw   s   @r?   r   r   7  s5    	4{ 	4U\\ U\\ ell rA   r   c                        e Zd Zddededz  f fdZ	 	 	 	 ddej                  dej                  dej                  dedz  d	e	d
e	dej                  dz  fdZ xZS )
BloomBlockNrz   r{   c                 R   t         |           |j                  }t        ||j                        | _        |j                  | _        t        ||      | _	        t        ||j                        | _
        t        |      | _        |j                  | _        |j                  | _        y )Neps)ro   rp   r   r   layer_norm_epsiloninput_layernormr   r   ry   self_attentionpost_attention_layernormr   mlp(apply_residual_connection_post_layernormr   )rq   rz   r{   r   rr   s       r?   rp   zBloomBlock.__init__W  s    (((&:S:ST,VY?(1+6C\C\(]%F#8>8g8g5$33rA   r   r>   r   r   r   r   r   c           
          | j                  |      }| j                  r|}	n|}	| j                  ||	||||||      \  }
}| j                  |
      }| j                  r|}	n|
}	| j	                  ||	      }||fS )N)r   r   r>   r   r   r   )r   r   r   r   r   )rq   r   r>   r   r   r   r   r   layernorm_outputrC   attention_outputr   r   s                r?   r`   zBloomBlock.forwarde  s      //> 88'H$H *.)<)<!)/) *= 	*
&,  889IJ 88'H'H *H5|##rA   r]   r   )rf   rg   rh   r   r   rp   r)   rj   r
   r   r   r`   rv   rw   s   @r?   r   r   V  s    4{ 4sTz 4& $("'26,$||,$ ||,$ 	,$
 DL,$ ,$  ,$ ((4/,$rA   r   c                   .    e Zd ZU eed<   dZdZdgZdZdZ	y)BloomPreTrainedModelrz   transformerTr   past_key_valuesN)
rf   rg   rh   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_can_compile_fullgraphrk   rA   r?   r   r     s(    %&*#%"3!rA   r   c                       e Zd Zdef fdZdej                  dedej                  dej                  fdZ	d Z
d	ej                  fd
Ze	 	 	 	 	 	 	 	 	 ddej                  dz  dedz  dej                  dz  dej                  dz  dedz  dedz  dedz  dedz  dej                  dz  deej                  df   ez  fd       Z xZS )
BloomModelrz   c           	      "   t         |   |       |j                  | _        |j                  | _        t        j                  |j                  | j                        | _	        t        | j                  |j                        | _        t        j                  t        |j                        D cg c]  }t!        ||       c}      | _        t        | j                  |j                        | _        d| _        | j)                          y c c}w )Nr   )r{   F)ro   rp   r   	embed_dimr   r   r   	Embedding
vocab_sizeword_embeddingsr   r   word_embeddings_layernorm
ModuleListr   num_hidden_layersr   hln_fgradient_checkpointing	post_init)rq   rz   r   rr   s      r?   rp   zBloomModel.__init__  s     ++  "||F,=,=t~~N)24>>vG`G`)a& vOgOgIhiA
6Q ?ij dnn&2K2KL	&+# 	  js   .Dr   r   r   r   c                     t        |||      S r]   )r@   )rq   r   r   r   s       r?   r@   zBloomModel.build_alibi_tensor  s    !.)UCCrA   c                     | j                   S r]   r   )rq   s    r?   get_input_embeddingszBloomModel.get_input_embeddings  s    ###rA   new_embeddingsc                     || _         y r]   r   rq   r   s     r?   set_input_embeddingszBloomModel.set_input_embeddings  s
    -rA   N	input_idsr   inputs_embedsr   r   output_hidden_statesreturn_dictr   .c
           
         ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|du |duz  rt        d      | j                  r%| j                  r|rt        j                  d       d}|| j                  |      }|r|t        | j                         }|j                  \  }}}||j                         nd}||z   }|	%t        j                  |||z   |j                         }	| j#                  |      }|rdnd}|rdnd}|$t        j$                  ||f|j                         }n|j'                  |j                         }| j)                  || j*                  |j,                  	      }t/        | j                   |||	|
      }t1        | j2                        D ]-  \  }}|r||fz   } ||||||||	      }|d   }|s%||d   fz   }/ | j5                  |      }|r||fz   }|st7        d ||||fD              S t9        ||||      S )  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        Nz:You must specify exactly one of input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)rz   r   r!   rk   )r   )rz   r   r   r   r   )r   r   r   r   r>   r   r   c              3   &   K   | ]	  }||  y wr]   rk   ).0vs     r?   	<genexpr>z%BloomModel.forward.<locals>.<genexpr>"  s      ghgts   )last_hidden_stater   r   
attentions)rz   r   r   r   use_return_dictr   r   rE   r   r   r   r   r%   get_seq_lengthr)   r,   r!   r   onesr3   r@   r   r   r   	enumerater   r   r   r   )rq   r   r   r   r   r   r   r   r   r   kwargsr4   r5   r   past_lengthseq_length_with_pastr   all_self_attentionsall_hidden_statesr>   causal_maskr   blockoutputss                           r?   r`   zBloomModel.forward  s   6 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]-t";<YZZ&&4==Yl I  00;M0*$++>O$1$7$7!
J:I:Uo446[\)K7!"\\+{Z7OXeXlXlmN66}E$5b4"6BD !"ZZ5I(JS`SgSghN+..}/C/CDN''mNaNa'b(;;'))+
 "$&&) 	JHAu#$58H$H!**#"3-G $AJM &9WQZM&I#!	J& 		-0 1]4D D )?<MObc   9+++*	
 	
rA   	NNNNNNNNN)rf   rg   rh   r   rp   r)   rj   r   r   r@   r   r   r   r   r
   r   r   r   r`   rv   rw   s   @r?   r   r     sM   { *D D# DV[VaVa Dfkfrfr D$.5<< .  .2(,.215!%)-,0#'26l
##d*l
 l
 t+	l

 ''$.l
 $;l
  $;l
 #Tkl
 D[l
 ((4/l
 
u||S 	!$M	Ml
 l
rA   r   z
    The Bloom Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc                       e Zd ZddiZdef fdZdej                  fdZ	 	 	 	 	 	 d fd	Z	e
	 	 	 	 	 	 	 	 	 	 	 dd	ej                  dz  d
edz  dej                  dz  dej                  dz  dej                  dz  dedz  dedz  dedz  dedz  dej                  dz  deej                  z  deej                     ez  fd       Z xZS )BloomForCausalLMzlm_head.weightz"transformer.word_embeddings.weightrz   c                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y NFr}   )
ro   rp   r   r   r   r   r   r   lm_headr   rq   rz   rr   s     r?   rp   zBloomForCausalLM.__init__7  sI     %f-yy!3!3V5F5FUS 	rA   r   c                     || _         y r]   )r  r   s     r?   set_output_embeddingsz&BloomForCausalLM.set_output_embeddings?  s	    %rA   Nc           
      8   t        |   |f||||||d|}	t        |t              rq|o|j	                         }
|j
                  \  }}|
|z
  }t        j                  |||j                  |j                        }t        j                  ||gd      }||	d<   |	S )N)r   r   r   r   r   is_first_iterationr    r$   r"   r   )ro   prepare_inputs_for_generation
isinstancer   get_max_cache_shaper%   r)   zerosr!   r   r0   )rq   r   r   r   r   r   r   r  r	  model_inputstarget_lengthr4   r5   diffnew_attn_maskrr   s                  r?   r  z.BloomForCausalLM.prepare_inputs_for_generationB  s     w<	
+)')1	
 	
 o{38R+??AM%3%9%9"J
 :-D!KK
DAVAV^l^r^rsM"YY'FBON-;L)*rA   r   r   r   r   labelsr   r   r   r   r   logits_to_keepr   c                    |	|	n| j                   j                  }	| j                  ||||||||	|
	      }|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|8| j                  ||| j                   j                  |j                  d            }|	s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                        S )a\  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        N)r   r   r   r   r   r   r   r   r   num_items_in_batch)r   r(  r   losslogitsr   r   r  )rz   r  r   r  r   slicer  loss_functionr   getr   r   r   r  )rq   r   r   r   r   r%  r   r   r   r   r   r&  r	  transformer_outputsr   slice_indicesr+  r*  r   s                      r?   r`   zBloomForCausalLM.forwardg  s+   B &1%<k$++B]B]"..+)'/!5#) / 

 ,A.8B>SV8W~ot4]kmA}a,?@A%%;;11#)::.B#C	 & D Y!4QR!88F)-)9TGf$EvE0/??-;;*55
 	
rA   )NNNNTF)NNNNNNNNNNr   )rf   rg   rh   _tied_weights_keysr   rp   r)   rj   r  r  r   r   r
   r   r   r   r   r`   rv   rw   s   @r?   r  r  .  s_    +,PQ{ &ELL &  #J  .2(,.2-1&*!%)-,0#'26-.F
##d*F
 F
 t+	F

 ||d*F
 t#F
 $;F
  $;F
 #TkF
 D[F
 ((4/F
 ell*F
 
u||	@	@F
 F
rA   r  a  
    The Bloom Model transformer with a sequence classification head on top (linear layer).

    [`BloomForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-1) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    c                       e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 ddej                  dz  dedz  dej                  dz  dej                  dz  dej                  dz  d	e
dz  d
e
dz  de
dz  de
dz  deej                     ez  fd       Z xZS )BloomForSequenceClassificationrz   c                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  |j                  d      | _        | j                          y r  )
ro   rp   
num_labelsr   r   r   r   r   scorer   r  s     r?   rp   z'BloomForSequenceClassification.__init__  sV      ++%f-YYv1163D3D5Q
 	rA   Nr   r   r   r   r%  r   r   r   r   r   c
           
      r   |	|	n| j                   j                  }	| j                  ||||||||	      }|d   }| j                  |      }||j                  d   }n|j                  d   }| j                   j
                  |dk7  rt        d      | j                   j
                  d}n||| j                   j
                  k7  j                  |j                  t        j                        }t        j                  |j                  d   |j                  t        j                        }||z  j                  d      }n.d}t        j                  | j                  j                    d       |t        j                  ||j                  	      |f   }d}|^| j                   j"                  | j$                  dk(  rd
| j                   _        nl| j$                  dkD  rL|j&                  t        j(                  k(  s|j&                  t        j*                  k(  rd| j                   _        nd| j                   _        | j                   j"                  d
k(  rIt-               }| j$                  dk(  r& ||j/                         |j/                               }nc |||      }nY| j                   j"                  dk(  rt1               } |||      }n,| j                   j"                  dk(  rt3               } |||      }|	s|f|dd z   }||f|z   S |S t5        |||j6                  |j8                  |j:                        S )6  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r   r   r   r   r   r   r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.r$   r    z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r   
regressionsingle_label_classificationmulti_label_classificationr)  )rz   r  r   r6  r%   pad_token_idr   r3   r!   r)   r-   r,   argmaxr   r   rr   rf   problem_typer5  r   longr   r   squeezer   r   r   r   r   r  )rq   r   r   r   r   r%  r   r   r   r   r	  r/  r   r+  r4   last_non_pad_tokennon_pad_masktoken_indicespooled_logitsr*  loss_fctr   s                         r?   r`   z&BloomForSequenceClassification.forward  s   > &1%<k$++B]B]"..+)'/!5# / 	
 ,A.M* "+J&,,Q/J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||Jv}}MOaab{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+-v6))-II,.v6#%(;AB(??F)-)9TGf$EvE/ /??-;;*55
 	
rA   r  )rf   rg   rh   r   rp   r   r)   r   r
   rj   r   r   r   r`   rv   rw   s   @r?   r3  r3    s    {   .2(,.2-1&*!%)-,0#'e
##d*e
 e
 t+	e

 ||d*e
 t#e
 $;e
  $;e
 #Tke
 D[e
 
u||	?	?e
 e
rA   r3  c                       e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 ddej                  dz  dedz  dej                  dz  dej                  dz  dej                  dz  d	e
dz  d
e
dz  de
dz  de
dz  deej                     ez  fd       Z xZS )BloomForTokenClassificationrz   c                    t         |   |       |j                  | _        t        |      | _        t        |d      r|j                  |j                  }n't        |d      r|j                  |j                  }nd}t        j                  |      | _
        t        j                  |j                  |j                        | _        | j                          y )Nclassifier_dropoutr   g?)ro   rp   r5  r   r   hasattrrJ  r   r   r   rI   r   r   
classifierr   )rq   rz   rJ  rr   s      r?   rp   z$BloomForTokenClassification.__init__4  s      ++%f-6/0V5N5N5Z!'!:!:V-.63H3H3T!'!6!6!$zz"45))F$6$68I8IJ 	rA   Nr   r   r   r   r%  r   r   r   r   r   c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }|d   }| j                  |      }| j	                  |      }d}|l|j                  |j                        }|j                  \  }}t               } ||j                  ||z  | j                        |j                  ||z              }|	s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )r8  Nr9  r   r   )r*  r+  r   r  )rz   r  r   rI   rL  r3   r!   r%   r   r   r5  r   r   r  )rq   r   r   r   r   r%  r   r   r   r   r	  r/  r   r+  r*  r4   r5   rF  r   s                      r?   r`   z#BloomForTokenClassification.forwardE  s+   > &1%<k$++B]B]"..+)'/!5# / 	
 ,A.]3/YYv}}-F%+\\"J
')HJ3T__Ev{{S]`jSjGkD Y!4QR!88F)-)9TGf$EvE$-;;*55	
 	
rA   r  )rf   rg   rh   r   rp   r   r)   r   r
   rj   r   r   r   r`   rv   rw   s   @r?   rH  rH  2  s    { "  .2(,.2-1&*!%)-,0#'B
##d*B
 B
 t+	B

 ||d*B
 t#B
 $;B
  $;B
 #TkB
 D[B
 
u||	4	4B
 B
rA   rH  c                        e Zd Z fdZe	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dedz  d	edz  d
edz  de	e
z  fd       Z xZS )BloomForQuestionAnsweringc                     t         |   |       t        |      | _        t	        j
                  |j                  d      | _        | j                          y )Nr   )	ro   rp   r   r   r   r   r   
qa_outputsr   r  s     r?   rp   z"BloomForQuestionAnswering.__init__  sA     %f-))F$6$6: 	rA   Nr   r   r   start_positionsend_positionsr   r   r   r   c	                 "   ||n| j                   j                  }| j                  ||||||      }
|
d   }| j                  |      }|j	                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d}||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }|s||f|
dd z   }||f|z   S |S t        ||||
j                  |
j                  	      S )
r   N)r   r   r   r   r   r   r   r$   r"   )ignore_indexr   )r*  start_logits
end_logitsr   r  )rz   r  r   rQ  splitrA  
contiguouslensizeclampr   r   r   r  )rq   r   r   r   rR  rS  r   r   r   r	  r  sequence_outputr+  rV  rW  
total_lossignored_indexrF  
start_lossend_lossr   s                        r?   r`   z!BloomForQuestionAnswering.forward  s   4 &1%<k$++B]B]"")'/!5# # 
 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
rA   )NNNNNNNN)rf   rg   rh   rp   r   r)   r   FloatTensorr   r   r   r`   rv   rw   s   @r?   rO  rO    s      .237263715)-,0#'F
##d*F
 ))D0F
 ((4/	F

 ))D0F
 ''$.F
  $;F
 #TkF
 D[F
 
-	-F
 F
rA   rO  )r  r   r   r3  rH  rO  )=ru   r&   r)   r   torch.nnr   r   r   r   r   rH   cache_utilsr
   r   r   
generationr   masking_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   utilsr   r   configuration_bloomr   
get_loggerrf   r   rj   r   r   r@   floatr   rK   rS   rX   autogradFunctionrZ   Modulerm   ry   r   r   r   r   r  r3  rH  rO  __all__rk   rA   r?   <module>rr     s9       L L $ ; ; ) / 9  . - 
		H	%)Ju|| )J )JEKK )J\a\h\h )JX5<< 5<< u PT Y^YeYe &	Q%,, 	Q5<< 	Qu||   $
5>>** 
	%		 	%Q.RYY Q.hryy >;$+ ;$| "? " " L
% L
 L
^ z
+_ z
z
z p
%9 p
p
f U
"6 U
 U
p P
 4 P
 P
frA   