
    qiz$                    F   d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	 ddl
mZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZm Z m!Z!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z(m)Z) ddl*m+Z+m,Z,  e)jZ                  e.      Z/dZ0dejb                  de2de2fdZ3	 d}dejb                  de2dejb                  dz  fdZ4	 	 d~de5e2e2f   de6de2dejn                  dz  de2dejp                  fd Z9 G d! d"e      Z: G d# d$e      Z; G d% d&e      Z< G d' d(ejz                        Z> G d) d*ejz                        Z? G d+ d,ejz                        Z@ G d- d.ej
                  jz                        ZA G d/ d0ejz                        ZB G d1 d2ejz                        ZC G d3 d4ejz                        ZD G d5 d6ejz                        ZE G d7 d8ejz                        ZF G d9 d:ejz                        ZG G d; d<ejz                        ZH G d= d>ejz                  e%      ZI G d? d@ejz                  e%      ZJ G dA dBejz                  e%      ZK G dC dDejz                        ZL G dE dFejz                        ZM G dG dHe      ZN G dI dJe      ZOe( G dK dLe&             ZP G dM dNeP      ZQ G dO dPeP      ZR G dQ dReP      ZS G dS dTeP      ZT G dU dVeP      ZU G dW dXeP      ZV G dY dZeP      ZW G d[ d\eP      ZX G d] d^ejz                        ZY G d_ d`ejz                        ZZ e(dab       G dc ddeP             Z[ e(deb       G df dgePe             Z\	 	 	 	 	 	 	 	 ddhePdej                  diej                  dz  dejn                  dz  dje6dke6dle6dmejz                  dz  dne^doe^dej                  e5ej                  ej                  f   z  fdpZ_ e(dqb       G dr dseP             Z` e(dtb       G du dveP             Za G dw dxejz                        Zb e(dyb       G dz d{e&             Zcg d|Zdy)zPyTorch SpeechT5 model.    N)nn)BCEWithLogitsLossCrossEntropyLossL1Loss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)is_deepspeed_zero3_enabled)is_fsdp_managed_module)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutputSeq2SeqSpectrogramOutput)EmbeddingAccessMixinPreTrainedModel)auto_docstringlogging   )SpeechT5ConfigSpeechT5HifiGanConfig	input_idspad_token_iddecoder_start_token_idc                     | j                  | j                        }| ddddf   j                         |ddddf<   ||dddf<   |t        d      |j	                  |dk(  |       |S )z1
    Shift input ids one token to the right.
    Nr   r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosshapeclone
ValueErrormasked_fill_)r   r    r!   shifted_input_idss       `/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/speecht5/modeling_speecht5.pyshift_tokens_rightr+   4   s}     "++IOO<(CRC0668ae4adLMM""#4#<lK    input_valuesreduction_factorattention_maskc                     |dkD  r | dd|dz
  d|f   } ||dd|dz
  d|f   }| j                  | j                        }| ddddf   j                         |ddddf<   |j                  |dk(  d       ||fS )zw
    Shift input spectrograms one timestep to the right. Also applies the reduction factor to the sequence length.
    r   Nr#         Y        )r$   r%   r&   r(   )r-   r.   r/   shifted_input_valuess       r*   shift_spectrograms_rightr4   D   s     !#A'7!';'O?O'O$OP%+A/?!/C/WGW/W,WXN'11,2D2DE".q#2#v"6"<"<">AB %%&:f&DcJ//r,   r%   	mask_probmask_length	min_masksreturnc                    | \  }dk  rt        d      kD  rt        d d d      t        j                  j                  d      j	                         fd}|-|j                         j                  d      j                         nt        |      D cg c]  } c}}t        j                  |ft        	      }	g }
 |      }|d
k(  r|	S |D ]  } ||      }t        j                  j                  t        j                  |dz
  z
        |d      }t        |      d
k(  rdz
  }n|d
   }t        j                  |t        j                  ||z
  t        j                   	      |z  g      }|
j#                  |        t        j$                  |
      }
t        j&                  |
dddddf   ||f      }
|
j)                  ||z        }
t        j                        ddddf   }t        j&                  |||f      j)                  ||z        }|
|z   }
|
j+                         dz
  kD  rdz
  |
|
dz
  kD  <   t        j,                  |	|
dd       |	S c c}w )an  
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                     t        | z  z  z         }t        |      }|z  kD  rz  }| dz
  z
  |k  rt        | dz
  z
  d      }|S )z;Given input length, compute how many spans should be maskedr   r   )intmax)input_lengthnum_masked_spanepsilonr6   r5   r7   sequence_lengths     r*   compute_num_masked_spanz6_compute_mask_indices.<locals>.compute_num_masked_span   so    i,6DwNOoy9 [(?:-<O ;?+o=!,+/"BAFOr,   Nr#   dtyper   F)replace)r'   nprandomranditemdetachsumtolistrangezerosboolchoicearangelenconcatenateonesint32appendarraybroadcast_toreshaper=   put_along_axis)r%   r5   r6   r/   r7   
batch_sizerB   _input_lengthsspec_aug_maskspec_aug_mask_idxsmax_num_masked_spanr>   r?   spec_aug_mask_idxdummy_mask_idxoffsetsr@   rA   s    `` `            @@r*   _compute_mask_indicesrd   Z   s   0 #(JQABB_$]^i]j&&7q:
 	
 iinnQ$$&G $ % 	##B'..0',Z'89!o9  HHj/:$GM1/Ba% 51,? II,,IIlkAo67RW - 
  !Q& -q0N.q1NNN(;o(MUWU]U] ^ao op
 	!!"34/52 "45 1a:&5H+(V ,33J@SVa@ab ii$T4]3Goog
4G'UV^^'+5G ,g5 /A"55GVYZGZ-!0CCD m%7B?w :s   $	I+c                   &     e Zd Zd fd	Zd Z xZS )SpeechT5NoLayerNormConvLayerc                 d   t         |           |dkD  r|j                  |dz
     nd| _        |j                  |   | _        t        j                  | j                  | j                  |j                  |   |j                  |   |j                        | _
        t        |j                     | _        y )Nr   r   kernel_sizestridebias)super__init__conv_dimin_conv_dimout_conv_dimr   Conv1dconv_kernelconv_stride	conv_biasconvr	   feat_extract_activation
activationselfconfiglayer_id	__class__s      r*   rm   z%SpeechT5NoLayerNormConvLayer.__init__   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@r,   c                 J    | j                  |      }| j                  |      }|S N)ru   rw   ry   hidden_statess     r*   forwardz$SpeechT5NoLayerNormConvLayer.forward   s$    		-06r,   r   __name__
__module____qualname__rm   r   __classcell__r|   s   @r*   rf   rf      s    Ar,   rf   c                   &     e Zd Zd fd	Zd Z xZS )SpeechT5LayerNormConvLayerc                    t         |           |dkD  r|j                  |dz
     nd| _        |j                  |   | _        t        j                  | j                  | j                  |j                  |   |j                  |   |j                        | _
        t        j                  | j                  d      | _        t        |j                     | _        y )Nr   r   rh   T)elementwise_affine)rl   rm   rn   ro   rp   r   rq   rr   rs   rt   ru   	LayerNorm
layer_normr	   rv   rw   rx   s      r*   rm   z#SpeechT5LayerNormConvLayer.__init__   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 ,,t'8'8TR !?!?@r,   c                     | j                  |      }|j                  dd      }| j                  |      }|j                  dd      }| j                  |      }|S )Nr#   )ru   	transposer   rw   r   s     r*   r   z"SpeechT5LayerNormConvLayer.forward   sV    		-0%//B76%//B76r,   r   r   r   s   @r*   r   r      s    Ar,   r   c                   &     e Zd Zd fd	Zd Z xZS )SpeechT5GroupNormConvLayerc                    t         |           |dkD  r|j                  |dz
     nd| _        |j                  |   | _        t        j                  | j                  | j                  |j                  |   |j                  |   |j                        | _
        t        |j                     | _        t        j                  | j                  | j                  d      | _        y )Nr   r   rh   T)
num_groupsnum_channelsaffine)rl   rm   rn   ro   rp   r   rq   rr   rs   rt   ru   r	   rv   rw   	GroupNormr   rx   s      r*   rm   z#SpeechT5GroupNormConvLayer.__init__  s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@,,$2C2CRVRcRclpqr,   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r~   )ru   r   rw   r   s     r*   r   z"SpeechT5GroupNormConvLayer.forward  s2    		-066r,   r   r   r   s   @r*   r   r     s    r r,   r   c            	            e Zd ZdZddedededz  f fdZddedededz  fdZeddedededz  fd	       Z e	j                         dd
e	j                  defd       Z	 dd
e	j                  dededz  fdZ xZS )%SpeechT5SinusoidalPositionalEmbeddingzDThis module produces sinusoidal positional embeddings of any length.Nnum_positionsembedding_dimpadding_idxc                     t         |           d| _        || _        || _        || _        | j                  || j                  z   ||       y N   )rl   rm   offsetr   r   r   make_weights)ry   r   r   r   r|   s       r*   rm   z.SpeechT5SinusoidalPositionalEmbedding.__init__   sH    **&-$++5}kRr,   num_embeddingsc                     | j                  |||      }t        | d      r;|j                  | j                  j                  | j                  j
                        }| j                  d|d       y )NweightsrD   deviceF
persistent)get_embeddinghasattrtor   rD   r   register_buffer)ry   r   r   r   emb_weightss        r*   r   z2SpeechT5SinusoidalPositionalEmbedding.make_weights(  s[    ((T4#%..t||/A/A$,,J]J].^KYFr,   c                    |dz  }t        j                  d      |dz
  z  }t        j                  t        j                  |t        j
                        j                         | z        }t        j                  | t        j
                        j                         j                  d      |j                  d      z  }t        j                  t        j                  |      t        j                  |      gd      j                  | d      }|dz  dk(  r-t        j                  |t        j                  | d      gd      }|	d||ddf<   |j                  t        j                               S )	z
        Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
        description in Section 3.5 of "Attention Is All You Need".
        r   i'  r   rC   r   dimr#   N)mathlogtorchexprQ   int64float	unsqueezecatsincosviewrN   r   get_default_dtype)r   r   r   half_dimembs        r*   r   z3SpeechT5SinusoidalPositionalEmbedding.get_embedding0  s    !A%hhuoA.iiXU[[AGGISDPQll>=CCEOOPQRUXUbUbcdUeeii338a@EEnVXY1!))S%++na"@AqIC""#CQvve--/00r,   r   past_key_values_lengthc                    |j                         \  }}| j                  || j                  |      j                  |j                        }| j                  dz   |z   }|| j
                  j                  d      kD  r4| j                  || j                  z   | j                  | j                         | j
                  j                  d|j                  d            j                  ||d      j                         S )Nr   r   r#   )size"create_position_ids_from_input_idsr   r   r   r   r   r   r   index_selectr   rJ   )ry   r   r   bszseq_lenposition_idsmax_poss          r*   r   z-SpeechT5SinusoidalPositionalEmbedding.forwardB  s     ~~'W>>y$JZJZ\rsvv

 ""Q&0T\\&&q))g3T5G5GIYIYZ||((L,=,=b,ABGGWVXY``bbr,   c                     |j                  |      j                         }t        j                  |d      j	                  |      |z   |z  }|j                         |z   S )a  
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
        symbols are ignored. This is modified from fairseq's `utils.make_positions`.

        Args:
            x: torch.Tensor x:
        Returns: torch.Tensor
        r   r   )ner<   r   cumsumtype_aslong)ry   r   r   r   maskincremental_indicess         r*   r   zHSpeechT5SinusoidalPositionalEmbedding.create_position_ids_from_input_idsQ  sW     ||K(,,.$||Da8@@FI__cgg"'')K77r,   r~   r   )r   r   r   __doc__r<   rm   r   staticmethodr   r   no_gradTensorr   r   r   r   s   @r*   r   r     s    NSc S# SCRVJ SG3 Gs GQTW[Q[ G 1c 1# 1CRVJ 1 1" U]]_c cs c c _`88478QTW[Q[8r,   r   c                   $     e Zd Z fdZd Z xZS )SpeechT5PositionalConvEmbeddingc                    t         |           t        j                  |j                  |j                  |j
                  |j
                  dz  |j                        | _        t        j                  j                  }t        t        j                  j                  d      r$t        j                  j                  j                  }t               r(dd l}|j                  j                  | j                  j                   d      5   || j                  dd      | _        d d d        t        | j                  d      rU| j                  j                  j                   j"                  }| j                  j                  j                   j$                  }n,| j                  j&                  }| j                  j(                  }|j                  j+                  | |       |j                  j+                  | |       n || j                  dd      | _        t-        |j
                        | _        t0        |j2                     | _        y # 1 sw Y   'xY w)	Nr   )ri   paddinggroupsweight_normr   )modifier_rankweight)namer   parametrizations)rl   rm   r   rq   hidden_sizenum_conv_pos_embeddingsnum_conv_pos_embedding_groupsru   utilsr   r   r   r   	deepspeedzeroGatheredParametersr   	original0	original1weight_gweight_vregister_external_parameterSpeechT5SamePadLayerr   r	   rv   rw   )ry   rz   r   r   r   r   r|   s         r*   rm   z(SpeechT5PositionalConvEmbedding.__init__d  s   II6622a777
	 hh**288,,m<((33??K%'224993C3CST2U I'		aH	Ityy"459955<<FF9955<<FF99--99--NN66tXFNN66tXF#DIIH!DDI+F,J,JK !?!?@I Is   IIc                     |j                  dd      }| j                  |      }| j                  |      }| j                  |      }|j                  dd      }|S Nr   r   )r   ru   r   rw   r   s     r*   r   z'SpeechT5PositionalConvEmbedding.forward  sV    %//15		-0]36%//15r,   r   r   s   @r*   r   r   c  s    ABr,   r   c                   *     e Zd ZdZd fd	Zd Z xZS ) SpeechT5ScaledPositionalEncodingu[   
    Scaled positional encoding, see §3.2 in https://huggingface.co/papers/1809.08895
    c                    t        j                  ||      }t        j                  d|      j                  d      }t        j                  t        j                  d|dt         j
                        j                         t        j                  d      |z   z        }t        j                  |j                         |z        |d d dd df<   t        j                  |j                         |z        |d d dd df<   |j                  d      }t        | 1          | j                  d|d       t        j                  |	      | _        || _        || _        t        j&                  t        j(                  d
            | _        y )Nr   r   r   rC        @peFr   p      ?)r   rN   rQ   r   r   r   r   r   r   r   r   rl   rm   r   r   Dropoutdropoutr   max_len	Parametertensoralpha)ry   r   r   r   r   positiondiv_termr|   s          r*   rm   z)SpeechT5ScaledPositionalEncoding.__init__  s'   [[#&<<7+55a899U\\!S!5;;GMMOTXT\T\]dTehkTkRllmii 08 ;<1add7ii 08 ;<1add7\\!_T2%8zzG,\\%,,s"34
r,   c                     || j                   | j                  d d d |j                  d      f   z  z   }| j                  |      }|S )Nr   )r   r   r   r   )ry   r   s     r*   r   z(SpeechT5ScaledPositionalEncoding.forward  sB    DJJMchhqkM)9!:::ll3
r,   )i  )r   r   r   r   rm   r   r   r   s   @r*   r   r     s    5r,   r   c                   &     e Zd Zd fd	Zd Z xZS )"SpeechT5RelativePositionalEncodingc                     t         |           || _        || _        t        j
                  j                  d|z  |      | _        y r   )rl   rm   r   
max_lengthr   r   	Embeddingpe_k)ry   r   r  r|   s      r*   rm   z+SpeechT5RelativePositionalEncoding.__init__  s8    $HH&&q:~s;	r,   c                    |j                   d   }t        j                  d|      j                  |j                  t        j
                        }|d d d f   |d d d f   z
  }t        j                  || j                   k  | j                   |      }t        j                  || j                  k\  | j                  dz
  |      }|| j                  z   }| j                  |      S )Nr   r   r   rD   )	r%   r   rQ   r   r   r   wherer  r  )ry   r   r   pos_seqs       r*   r   z*SpeechT5RelativePositionalEncoding.forward  s    %%a(,,q'*--]5I5IQVQ[Q[-\!T'"WT1W%55++g(884??:JGT++g8$//A:MwWDOO+yy!!r,   )i  r   r   s   @r*   r   r     s    <	"r,   r   c                   $     e Zd Z fdZd Z xZS )r   c                 P    t         |           |dz  dk(  rd| _        y d| _        y )Nr   r   r   )rl   rm   num_pad_remove)ry   r   r|   s     r*   rm   zSpeechT5SamePadLayer.__init__  s)    #:Q#>!#Car,   c                 V    | j                   dkD  r|d d d d d | j                    f   }|S Nr   )r  r   s     r*   r   zSpeechT5SamePadLayer.forward  s6    ")!Q0F43F3F2F0F*FGMr,   r   r   s   @r*   r   r     s    Kr,   r   c                   .     e Zd ZdZ fdZd Zd Z xZS )SpeechT5FeatureEncoderz.Construct the features from raw audio waveformc           	         t         |           |j                  dk(  rDt        |d      gt	        |j
                  dz
        D cg c]  }t        ||dz          c}z   }nV|j                  dk(  r.t	        |j
                        D cg c]  }t        ||       }}nt        d|j                   d      t        j                  |      | _        d| _        d	| _        y c c}w c c}w )
Ngroupr   )r{   r   layerz`config.feat_extract_norm` is z), but has to be one of ['group', 'layer']FT)rl   rm   feat_extract_normr   rM   num_feat_extract_layersrf   r   r'   r   
ModuleListconv_layersgradient_checkpointing_requires_grad)ry   rz   ir  r|   s       r*   rm   zSpeechT5FeatureEncoder.__init__  s    ##w.5fqIJNSTZTrTruvTvNwNIJ,Va!eDN K %%0HMfNlNlHmCD*6A>K  01I1I0JJst  ==5&+#"Ns   C"	C'c                 J    | j                         D ]	  }d|_         d| _        y )NF)
parametersrequires_gradr  )ry   params     r*   _freeze_parametersz)SpeechT5FeatureEncoder._freeze_parameters  s(    __& 	(E"'E	(#r,   c                     |d d d f   }| j                   r| j                  rd|_        | j                  D ]
  } ||      } |S NT)r  trainingr  r  )ry   r-   r   
conv_layers       r*   r   zSpeechT5FeatureEncoder.forward  sP    $QW- 4==*.M'** 	6J&}5M	6 r,   )r   r   r   r   rm   r  r   r   r   s   @r*   r  r    s    8#&$

r,   r  c                   $     e Zd Z fdZd Z xZS )SpeechT5FeatureProjectionc                 4   t         |           t        j                  |j                  d   |j
                        | _        t        j                  |j                  d   |j                        | _	        t        j                  |j                        | _        y )Nr#   eps)rl   rm   r   r   rn   layer_norm_epsr   Linearr   
projectionr   feat_proj_dropoutr   ry   rz   r|   s     r*   rm   z"SpeechT5FeatureProjection.__init__  sf    ,,vr':@U@UV))FOOB$79K9KLzz&":":;r,   c                 p    | j                  |      }| j                  |      }| j                  |      }||fS r~   )r   r*  r   )ry   r   norm_hidden_statess      r*   r   z!SpeechT5FeatureProjection.forward  s:    !__];(:;]3000r,   r   r   s   @r*   r$  r$    s    <1r,   r$  c                   2    e Zd Z fdZd Z	 	 ddej                  dej                  dz  dej                  dz  fdZ	de
dej                  fd	Zd
ej                  e
z  fdZ	 	 ddej                  dej                  dz  dej                  dz  fdZ xZS )SpeechT5SpeechEncoderPrenetc                    t         |           || _        t        |      | _        t        |      | _        |j                  dkD  s|j                  dkD  rEt        j                  t        j                  |j                        j                               | _        t!        |      | _        t%        |j&                  |j(                  z   dz   |j                  |j(                        | _        y )Nr2   r   )rl   rm   rz   r  feature_encoderr$  feature_projectionmask_time_probmask_feature_probr   r   r   r   r   uniform_masked_spec_embedr   pos_conv_embedr   max_speech_positionsr    pos_sinusoidal_embedr,  s     r*   rm   z$SpeechT5SpeechEncoderPrenet.__init__  s    5f=";F"C   3&&*B*BS*H%'\\%,,v?Q?Q2R2[2[2]%^D"=fE$I''&*=*==A%
!r,   c                 8    | j                   j                          y r~   )r2  r  ry   s    r*   freeze_feature_encoderz2SpeechT5SpeechEncoderPrenet.freeze_feature_encoder  s    //1r,   Nr-   r/   mask_time_indicesc                    | j                  |      }|j                  dd      }|| j                  |j                  d   |      }| j	                  |      \  }}| j                  |||      }| j                  |      }||z   }| |j                  d      j                         }n=t        j                  |j                  d d t        j                  |j                        }| j                  |      }||z   }||fS )Nr   r   )r>  r/   r   )r2  r   "_get_feature_vector_attention_maskr%   r3  _mask_hidden_statesr8  r   r   r   rN   r   r:  )	ry   r-   r/   r>  extract_featuresr   positional_conv_embeddingpadding_mask positional_sinusoidal_embeddingss	            r*   r   z#SpeechT5SpeechEncoderPrenet.forward  s     //=+55a;%!DD &&q)N
 +/*A*ABR*S''00->~ 1 
 %)$7$7$F!%(AA%),,Q/446L ;;}':':2A'>ejjYfYmYmnL+/+D+D\+R(%(HHn,,r,   feature_vector_lengthc                    |j                  d      d d df   }| j                  |      j                  t        j                        }|j
                  d   }t        j                  ||f|j                  |j                        }d|t        j                  |j
                  d   |j                        |dz
  f<   |j                  dg      j                  d      j                  dg      j                         }|S )Nr#   r   r   r   r   r   )r    _get_feat_extract_output_lengthsr   r   r   r%   rN   rD   r   rQ   fliprO   )ry   rF  r/   non_padded_lengthsoutput_lengthsr[   s         r*   r@  z>SpeechT5SpeechEncoderPrenet._get_feature_vector_attention_mask9  s     ,22r2:1b5A>>?QRUUV[V`V`a#))!,
./~7K7KTbTiTi
 uv^%9%9!%<^EZEZ[]kno]opq',,bT299"=BBB4HMMOr,   r]   c                     d }t        | j                  j                  | j                  j                        D ]  \  }} ||||      } |S )zH
        Computes the output length of the convolutional layers
        c                 >    t        j                  | |z
  |d      dz   S )Nfloor)rounding_moder   )r   div)r>   ri   rj   s      r*   _conv_out_lengthzVSpeechT5SpeechEncoderPrenet._get_feat_extract_output_lengths.<locals>._conv_out_lengthN  s"     99\K7wWZ[[[r,   )ziprz   rr   rs   )ry   r]   rR  ri   rj   s        r*   rI  z<SpeechT5SpeechEncoderPrenet._get_feat_extract_output_lengthsI  sQ    
	\
 $'t{{'>'>@W@W#X 	QK,]KPM	Q r,   r   c                    t        | j                  dd      s|S |j                         \  }}}|)| j                  j	                  |j
                        ||<   n| j                  j                  dkD  r| j                  rt        ||f| j                  j                  | j                  j                  || j                  j                        }t        j                  ||j                  t        j                        }| j                  j	                  |j
                        ||<   | j                  j                  dkD  r| j                  rt        ||f| j                  j                  | j                  j                   | j                  j"                        }t        j                  ||j                  t        j                        }|dddf   j%                  d|d      }d||<   |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        apply_spec_augmentTNr   )r5   r6   r/   r7   r  )r5   r6   r7   r#   )getattrrz   r   r7  r   rD   r4  r!  rd   mask_time_lengthmask_time_min_masksr   r   r   rO   r5  mask_feature_lengthmask_feature_min_masksexpand)ry   r   r>  r/   r[   rA   r   mask_feature_indicess           r*   rA  z/SpeechT5SpeechEncoderPrenet._mask_hidden_statesY  s    t{{$8$?   4A3E3E3G0
O[(/3/E/E/H/HI\I\/]M+,[[''!+ 5_-++44 KK88-++99! !&->}G[G[chcmcm n/3/E/E/H/HI\I\/]M+,;;((1,#8[)++77 KK;;++<<	$  $)<<0D]MaMainisis#t #74#@#G#GO]_#` 23M./r,   NN)r   r   r   rm   r=  r   r   
LongTensorFloatTensorr   r<   r@  rI  rA  r   r   s   @r*   r0  r0    s    
"2 376:	 -ll - ((4/ - !,,t3	 -F ]b]m]m  e>N>NQT>T & 7;26	,((, !,,t3, ((4/	,r,   r0  c                   f     e Zd Z fdZd Z	 ddej                  dej                  dz  fdZ xZS )SpeechT5SpeechDecoderPrenetc           	      X   t         |           || _        t        j                  t        |j                        D cg c]=  }t        j                  |dk(  r|j                  n|j                  |j                        ? c}      | _
        t        j                  |j                  |j                        | _        t        |j                  |j                  |j                        | _        t        j                  |j"                  |j                  z   |j                        | _        y c c}w r  )rl   rm   rz   r   r  rM   speech_decoder_prenet_layersr)  num_mel_binsspeech_decoder_prenet_unitslayersr   final_layerr   positional_dropoutr9  encode_positionsspeaker_embedding_dimspeaker_embeds_layerry   rz   r  r|   s      r*   rm   z$SpeechT5SpeechDecoderPrenet.__init__  s    mm vBBC
 	 		+,6F''v7Y7Y66
 99V%G%GI[I[\ @%%''!

 %'IIf.J.JVM_M_._agasas$t!s   AD'c                     t        j                  |d   |      }|j                  d      j                  |j	                  d      dd      }t        j
                  |dk(  |d      dz  d|z
  z  S )Nr   r   r   )r   	bernoullir   repeatr   r  )ry   inputs_embedsr   r   	all_maskss        r*   _consistent_dropoutz/SpeechT5SpeechDecoderPrenet._consistent_dropout  sd    }Q/15NN1%,,]-?-?-BAqI	{{9>=!<q@AEJJr,   Nr-   speaker_embeddingsc                 8   |}| j                   D ]M  }t        j                  j                   ||            }| j	                  || j
                  j                        }O | j                  |      }| j                  |      }|t        j                  j                  |      }|j                  d      j                  d|j                  d      d      }t        j                  ||gd      }t        j                  j                  | j                  |            }|S )Nr   r#   r   )rf  r   
functionalrelurr  rz   speech_decoder_prenet_dropoutrg  ri  	normalizer   r[  r   r   r   rk  )ry   r-   rs  rp  r  s        r*   r   z#SpeechT5SpeechDecoderPrenet.forward  s     %[[ 	oEMM..u]/CDM 44]DKKDmDmnM	o ((7--m<)!#!8!89K!L!3!=!=a!@!G!GML^L^_`Lace!f!II}6H&IrRMMM..t/H/H/WXMr,   r~   )	r   r   r   rm   rr  r   r   r   r   r   s   @r*   ra  ra    s8    u,K 37ll "LL4/r,   ra  c                   &     e Zd Zd fd	Zd Z xZS )SpeechT5BatchNormConvLayerc                 
   t         |           |dk(  r|j                  }n|j                  }||j                  dz
  k(  r|j                  }n|j                  }t        j                  |||j                  d|j                  dz
  dz  d      | _        t        j                  |      | _
        ||j                  dz
  k  rt        j                         | _        nd | _        t        j                  |j                        | _        y )Nr   r   r   F)ri   rj   r   rk   )rl   rm   rd  speech_decoder_postnet_unitsspeech_decoder_postnet_layersr   rq   speech_decoder_postnet_kernelru   BatchNorm1d
batch_normTanhrw   r   speech_decoder_postnet_dropoutr   )ry   rz   r{   ro   rp   r|   s        r*   rm   z#SpeechT5BatchNormConvLayer.__init__  s    q= --K ==Kv;;a??!..L!>>LII<<99A=!C
	 ..6f::Q>> ggiDO"DOzz&"G"GHr,   c                     | j                  |      }| j                  |      }| j                  | j                  |      }| j                  |      }|S r~   )ru   r  rw   r   r   s     r*   r   z"SpeechT5BatchNormConvLayer.forward  sJ    		-06??& OOM:M]3r,   r   r   r   s   @r*   rz  rz    s    I<r,   rz  c                   ^     e Zd Z fdZdej
                  fdZdej
                  fdZ xZS )SpeechT5SpeechDecoderPostnetc           	         t         |           || _        t        j                  |j
                  |j                  |j                  z        | _        t        j                  |j
                  |j                        | _	        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        y c c}w r~   )rl   rm   rz   r   r)  r   rd  r.   feat_outprob_outr  rM   r}  rz  rf  rl  s      r*   rm   z%SpeechT5SpeechDecoderPostnet.__init__  s    		&"4"4f6I6IFLcLc6cd		&"4"4f6M6MNmm<A&BfBf<ghq'2h
hs   (Cr   c                    | j                  |      j                  |j                  d      d| j                  j                        }| j                  |      }| j                  |      j                  |j                  d      d      }|||fS )Nr   r#   )r  r   r   rz   rd  postnetr  )ry   r   outputs_before_postnetoutputs_after_postnetlogitss        r*   r   z$SpeechT5SpeechDecoderPostnet.forward  s~    !%}!=!B!B=CUCUVWCXZ\^b^i^i^v^v!w $-C D}-22=3E3Ea3H"M%'<fDDr,   c                     |j                  dd      }| j                  D ]
  } ||      } ||j                  dd      z   S r   )r   rf  )ry   r   layer_outputr  s       r*   r  z$SpeechT5SpeechDecoderPostnet.postnet  sI    $..q!4[[ 	/E .L	/|55a;;;r,   )	r   r   r   rm   r   r   r   r  r   r   s   @r*   r  r    s*    	
EU\\ E<U\\ <r,   r  c                   >     e Zd Z fdZdej
                  fdZ xZS )SpeechT5TextEncoderPrenetc                    t         |           || _        t        j                  |j
                  |j                  |j                        | _        t        |j                  |j                  |j                        | _        y r~   )rl   rm   rz   r   r  
vocab_sizer   r    embed_tokensr   rh  max_text_positionsri  r,  s     r*   rm   z"SpeechT5TextEncoderPrenet.__init__  se    LL):):F<N<NPVPcPcd @%%%%!
r,   r   c                 J    | j                  |      }| j                  |      }|S r~   )r  ri  )ry   r   rp  s      r*   r   z!SpeechT5TextEncoderPrenet.forward  s(    )))4--m<r,   )r   r   r   rm   r   r   r   r   r   s   @r*   r  r    s    
 r,   r  c                   l     e Zd Z fdZ	 	 ddej
                  dej                  dz  dedz  fdZ xZ	S )SpeechT5TextDecoderPrenetc                    t         |           || _        t        j                  |j
                        | _        |j                  rt        j                  |j                        nd| _        t        j                  |j                  |j                  |j                        | _        t!        |j"                  |j                  z   dz   |j                  |j                        | _        y )Nr   r   )rl   rm   rz   r   r   rh  r   scale_embeddingr   sqrtr   embed_scaler  r  r    r  r   r  embed_positionsr,  s     r*   rm   z"SpeechT5TextDecoderPrenet.__init__  s    zz&";";<<B<R<R499V%7%78X[LL):):F<N<NPVPcPcdD%%(;(;;a? 
r,   Nr   r/   past_key_valuesc                 $   |&|j                         }|j                  d|d         }nt        d      |dn|j                         }| j	                  ||      }| j                  |      | j                  z  }||z  }| j                  |      }||fS )Nr#   z'You have to specify `decoder_input_ids`r   )r   r   r'   get_seq_lengthr  r  r  r   )ry   r   r/   r  input_shaper   	positionsrp  s           r*   r   z!SpeechT5TextDecoderPrenet.forward  s      #..*K!r;r?;IFGG&5&=?CaCaCc((4JK	)))4t7G7GG"]3n,,r,   r]  )
r   r   r   rm   r   r   r^  r
   r   r   r   s   @r*   r  r    sD    
" 37(,	-<<- ((4/- 	-r,   r  c                   J     e Zd Z fdZdej
                  fdZd Zd Z xZ	S )SpeechT5TextDecoderPostnetc                     t         |           || _        t        j                  |j
                  |j                  d      | _        y )NFrk   )rl   rm   rz   r   r)  r   r  lm_headr,  s     r*   rm   z#SpeechT5TextDecoderPostnet.__init__4  s5    yy!3!3V5F5FUSr,   r   c                 $    | j                  |      S r~   r  r   s     r*   r   z"SpeechT5TextDecoderPostnet.forward9  s    ||M**r,   c                     | j                   S r~   r  r<  s    r*   get_output_embeddingsz0SpeechT5TextDecoderPostnet.get_output_embeddings<  s     ||r,   c                     || _         y r~   r  ry   new_embeddingss     r*   set_output_embeddingsz0SpeechT5TextDecoderPostnet.set_output_embeddingsA  s	    %r,   )
r   r   r   rm   r   r   r   r  r  r   r   s   @r*   r  r  3  s#    T
+U\\ +
&r,   r  c                   V    e Zd ZdZ	 	 	 	 ddedededz  dedz  dedz  dedz  f fd	Z	 	 	 	 	 	 dd
ej                  dej                  dz  de
dz  dej                  dz  dej                  dz  dedej                  dz  deej                  ej                  dz  e
dz  f   fdZ xZS )SpeechT5Attentionz
    Multi-headed attention from 'Attention Is All You Need' paper with relative position bias (see
    https://aclanthology.org/N18-2074.pdf)
    N	embed_dim	num_headsr   
is_decoderrk   	layer_idxc                    t         |           || _        || _        || _        ||z  | _        | j
                  |z  | j                  k7  rt        d| j                   d| d      | j
                  dz  | _        || _        || _	        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      r  )rl   rm   r  r  r   head_dimr'   scalingr  r  r   r)  k_projv_projq_projout_proj)ry   r  r  r   r  rk   r  r|   s          r*   rm   zSpeechT5Attention.__init__K  s     	""!Y.MMI%$..8MdnnM]$YKr3  }}d*$"ii	94@ii	94@ii	94@		)YTBr,   r   key_value_statesr  r/   position_biasoutput_attentionscache_positionr8   c                 &   |du}|j                         \  }	}
}| j                  |      | j                  z  }d}|St        |t              rA|j
                  j                  | j                        }|r|j                  }n|j                  }n|}|r|n|}|rK|I|rGj                  | j                     j                  }|j                  | j                     j                  }n| j                  |      }| j                  |      }|j                  |	d| j                   | j"                        j%                  dd      }|j                  |	d| j                   | j"                        j%                  dd      }|T|s|nd}j'                  ||| j                  d|i      \  }}|r)t        |t              rd|j
                  | j                  <   |	| j                   z  d| j"                  f}|j                  |	|
| j                   | j"                        j%                  dd      } |j(                  | } |j(                  | } |j(                  | }|j                  d      }t+        j,                  ||j%                  dd            }|j                         |	| j                   z  |
|fk7  r/t/        d|	| j                   z  |
|f d	|j                                ||j1                         j                  |	| j                   z  d| j"                        j%                  d
d      }t+        j2                  ||j%                  dd            }|j%                  d
d      j                  |	| j                   z  |j                  d
      |j                  d            }||z  }|{|j                         |	d|
|fk7  r#t/        d|	d|
|f d	|j                                |j                  |	| j                   |
|      |z   }|j                  |	| j                   z  |
|      }t4        j6                  j9                  |d      }|r?|j                  |	| j                   |
|      }|j                  |	| j                   z  |
|      }nd}t4        j6                  j;                  || j:                  | j<                        }t+        j,                  ||      }|j                         |	| j                   z  |
| j"                  fk7  r7t/        d|	| j                   |
| j"                  f d	|j                                |j                  |	| j                   |
| j"                        }|j%                  dd      }|j)                  |	|
| j>                        }| jA                  |      }||fS )z#Input shape: Batch x Time x ChannelNFr#   r   r   r  Tz$Attention weights should be of size z	, but is r   r   z!Attention mask should be of size r   )r   r!  z `attn_output` should be of size )!r   r  r  
isinstancer   
is_updatedgetr  cross_attention_cacheself_attention_cacherf  keysvaluesr  r  r   r  r  r   updaterY   r   bmmr'   
contiguousmatmulr   ru  softmaxr   r!  r  r  )ry   r   r  r  r/   r  r  r  is_cross_attentionr   tgt_lenr\   query_statesr  curr_past_key_valuescurrent_states
key_statesvalue_states
proj_shapesrc_lenattn_weights	reshape_qrel_pos_biasattn_weights_reshaped
attn_probsattn_outputs                             r*   r   zSpeechT5Attention.forwardh  sQ    .T9',,.Wa {{=1DLL@
&/+>?,77;;DNNK
%+:+P+P(+:+O+O('6$-?)]/"=*-44T^^DIIJ/66t~~FMML^4J;;~6L#b$..$--PZZ[\^_`J',,S"dnndmmT^^_`bcdL*7It+?+F+Fdnn?OQ_>`,(
L &*_FY*ZAEO..t~~>DNN*B>
#((gt~~t}}U__`acde+|++Z8'Z''4
+|++Z8//!$yyz/C/CAq/IJ3#7'"JJ6dnn8LgW^7_6` a %%'(*  $$//166sT^^7KRQUQ^Q^_iijkmnoI <<	=3J3J2r3RSL'11!Q7<<dnn$m&8&8&;]=O=OPQ=RL L(L%""$a'(BB 7a'8R7SS\]k]p]p]r\st  (,,S$..'7SVddL',,S4>>-A7GTL}},,\r,B
 %1$5$5c4>>7T[$\!055cDNN6JGU\]L$(!]]**<4<<RVR_R_*`
ii
L9#"6!OO2CRVR_R_3`2a b$$&') 
 "&&sDNNGT]]S!++Aq1 "))#wGmmK0111r,   )r2   FTN)NNNNFN)r   r   r   r   r<   r   rO   rm   r   r   r
   tupler   r   r   s   @r*   r  r  E  s&    !$"' !%CC C 	C
 4KC TkC $;C@ 15(,.2-1"'.2u2||u2  ,,-u2 	u2
 t+u2 ||d*u2  u2 t+u2 
u||U\\D0%$,>	?u2r,   r  c                   $     e Zd Z fdZd Z xZS )SpeechT5FeedForwardc                    t         |           t        j                  |j                        | _        t        j                  |j                  |      | _        t        |j                  t              rt        |j                     | _        n|j                  | _        t        j                  ||j                        | _        t        j                  |j                        | _        y r~   )rl   rm   r   r   activation_dropoutintermediate_dropoutr)  r   intermediate_denser  
hidden_actstrr	   intermediate_act_fnoutput_densehidden_dropoutoutput_dropout)ry   rz   intermediate_sizer|   s      r*   rm   zSpeechT5FeedForward.__init__  s    $&JJv/H/H$I!"$))F,>,>@Q"Rf''-'-f.?.?'@D$'-'8'8D$II&79K9KL jj)>)>?r,   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j	                  |      }|S r~   )r  r  r  r  r  r   s     r*   r   zSpeechT5FeedForward.forward  sX    //>00?11-@))-8++M:r,   r   r   s   @r*   r  r    s    @r,   r  c            	            e Zd Zdef fdZ	 	 	 d	dej                  dej                  dz  dej                  dz  defdZ xZ	S )
SpeechT5EncoderLayerrz   c                    t         |           t        |j                  |j                  |j
                  d      | _        t        j                  |j                        | _
        t        j                  |j                  |j                        | _        t        ||j                        | _        t        j                  |j                  |j                        | _        y )NF)r  r  r   r  r&  )rl   rm   r  r   encoder_attention_headsattention_dropout	attentionr   r   r  r   r   r(  r   r  encoder_ffn_dimfeed_forwardfinal_layer_normr,  s     r*   rm   zSpeechT5EncoderLayer.__init__  s    *((44,,	
 zz&"7"78,,v'9'9v?T?TU/8N8NO "V-?-?VEZEZ [r,   Nr   r/   r  r  c                     |}| j                  ||||      \  }}| j                  |      }||z   }| j                  |      }|| j                  |      z   }| j	                  |      }|f}|r||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`):
                input to the layer of shape `(batch, seq_len, hidden_size)`
            attention_mask (`torch.FloatTensor`):
                attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very
                large negative values.
            position_bias (`torch.FloatTensor`):
                relative position embeddings of size `(seq_len, seq_len, hidden_size // encoder_attention_heads)`
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r   r/   r  r  )r  r   r   r  r  )ry   r   r/   r  r  residualr  outputss           r*   r   zSpeechT5EncoderLayer.forward  s    ( !&*nn')'/	 '5 '
#| ]3 =06%(9(9-(HH--m< "&Gr,   )NNF)
r   r   r   r   rm   r   r   rO   r   r   r   s   @r*   r  r    s]    \~ \  /3-1"'(||( t+( ||d*	(
  (r,   r  c                        e Zd Zddef fdZ	 	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  dej                  dz  dedz  d	edz  d
edz  dej                  dz  fdZ	 xZ
S )SpeechT5DecoderLayerNrz   c                    t         |           t        |j                  |j                  |j
                  d|      | _        t        j                  |j                        | _
        t        j                  |j                  |j                        | _        t        |j                  |j                  |j
                  d|      | _        t        j                  |j                  |j                        | _        t!        ||j"                        | _        t        j                  |j                  |j                        | _        y )NT)r  r  r   r  r  r&  )r   r  r  )rl   rm   r  r   decoder_attention_headsr  	self_attnr   r   r  r   r   r(  self_attn_layer_normencoder_attnencoder_attn_layer_normr  decoder_ffn_dimr  r  )ry   rz   r  r|   s      r*   rm   zSpeechT5DecoderLayer.__init__2  s    *((44,,
 zz&"7"78$&LL1C1CI^I^$_!-**,,
 (*||F4F4FFLaLa'b$/8N8NO "V-?-?VEZEZ [r,   r   r/   encoder_hidden_statesencoder_attention_maskr  r  	use_cacher  c	                 z   |}	| j                  |||||      \  }}
| j                  |      }|	|z   }| j                  |      }d}|C|}	| j                  ||||||      \  }}| j                  |      }|	|z   }| j	                  |      }|| j                  |      z   }| j                  |      }|f}|r||
|fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, hidden_size)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, hidden_size)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            past_key_values (`Cache`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r   r  r/   r  r  N)r   r  r/   r  r  r  )r  r   r  r  r  r  r  )ry   r   r/   r  r  r  r  r  r  r  self_attn_weightscross_attn_weightsr  s                r*   r   zSpeechT5DecoderLayer.forwardJ  s   2 ! ,0>>'+)/) ,: ,
(( ]3 =011-@ " ,$H040A0A+!65 /"3- 1B 1-M- !LL7M$}4M 88GM &(9(9-(HH--m< ")+=>>Gr,   r~   )NNNNFTN)r   r   r   r   rm   r   r   r
   rO   r   r   r   s   @r*   r  r  1  s    \~ \6 /3596:(,).!%.2A||A t+A  %||d2	A
 !&t 3A A  $;A $;A t+Ar,   r  c                   p    e Zd ZU eed<   dZdZdZdZ e	j                         dej                  fd       Zy)	SpeechT5PreTrainedModelrz   speecht5r-   audioTmodulec           
         | j                   j                  }t        |t              rt	        j
                  |j                  j                  ddt        j                  d|j                  j                  d   |j                  j                  z  z        z         t	        j                  |j                  j                  d       nOt        |t              rgt	        j                  |j                          |j"                  |j$                  }}t'        j(                  ||      }t'        j*                  d|      j-                  d      }t'        j.                  t'        j*                  d|dt&        j0                        j3                         t        j4                  d      |z   z        }t'        j6                  |j3                         |z        |dddddf<   t'        j8                  |j3                         |z        |dddddf<   |j-                  d      }t	        j:                  |j<                  |       nt        |t>              rt        j                  d|j@                  jB                  z        }t	        jD                  |j@                  j                  | |       t	        jD                  |j@                  j                  | |       n?t        |tF        jH                        rPt	        j
                  |j                  d	|       |j                  t	        jJ                  |j                         nt        |tF        jL                  tF        jN                  tF        jP                  f      rt	        jJ                  |j                         t	        j                  |j                         tS        |d
d      Pt	        jJ                  |jT                         t	        j                  |jV                         t	        jJ                  |jX                         nt        |tF        jZ                        rt	        j\                  |j                         |j                  t        j                  |j^                  |j                  |j                  d   z  z        }t	        jD                  |j                  | |       nJt        |tF        j`                        rst	        j
                  |j                  d	|       |jb                  tS        |j                  dd      st	        jJ                  |j                  |jb                            nt        |td              r_|jg                  |jh                  |jj                  z   |jl                  |jb                        }	t	        j:                  |jn                  |	       nNt        |tp              r>t	        jJ                  |jr                         t	        j                  |jt                         tw        |d      r t	        jD                  |jx                         yy)zInitialize the weightsr   r   r   )meanstdrC   r   N)abr2   running_mean_is_hf_initializedFr7  )=rz   initializer_ranger  r   initnormal_ru   r   r   r  ri   in_channels	constant_rk   r   ones_r   r   r   r   rN   rQ   r   r   r   r   r   r   r   copy_r   r$  r*  in_featuresr6  r   r)  zeros_r   r   r  rV  r  running_varnum_batches_trackedrq   kaiming_normal_r   r  r   r   r   r   r   r   r   SpeechT5HifiGanr  scaler   r7  )
ry   r  r  r   r   r   r   r   kr   s
             r*   _init_weightsz%SpeechT5PreTrainedModel._init_weights  s    kk++f=>LL""		!v{{'>'>q'AFKKD[D['["\]]
 NN6;;++Q/ @AJJv||$!::v~~CWc*B||Aw/99!<Hyyaau{{!K!Q!Q!SX\X`X`ahXiloXoVp!pqH))HNN$4x$?@Bq!$Q$wK))HNN$4x$?@Bq!$Q$wKaBJJvyy"% 9:		!f//;;;<AMM&++22qbA>MM&++00QB!<		*LLSc:{{&FKK(r||R^^ LMKK$JJv}}%v~t4@F//0

6--.F667		*  /{{&IIfmmv/A/AFDVDVWXDY/YZ[fkkaR15-LLSc:!!-gfmmMach6iFMM&*<*<=> EF ..$$v}}4f6J6JFL^L^K JJv~~{30KK$JJv||$6./MM&223 0r,   N)r   r   r   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointingr   r   r   Moduler"   r,   r*   r  r    sD    "$O&*#U]]_74BII 74 74r,   r  c                        e Zd ZdZdef fdZ	 	 	 	 ddej                  dej                  dz  de	dz  de	dz  d	e	dz  d
e
ez  fdZ xZS )SpeechT5Encoderzu
    Transformer encoder consisting of *config.encoder_layers* layers. Each layer is a [`SpeechT5EncoderLayer`].
    rz   c                    t         |   |       t        j                  |j                  |j
                        | _        t        j                  |j                        | _	        |j                  | _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        t#        |j                  |j$                  z  |j&                        | _        d| _        | j-                          y c c}w )Nr&  F)rl   rm   r   r   r   r(  r   r   r  r   encoder_layerdrop	layerdropr  rM   encoder_layersr  rf  r   r  encoder_max_relative_positionr  r  	post_init)ry   rz   r\   r|   s      r*   rm   zSpeechT5Encoder.__init__  s     ,,v'9'9v?T?TUzz&"7"7811mm5QWQfQfKg$ha%9&%A$hiA&"@"@@&BfBf 
 ',# 	 %is   DNr   r/   r  output_hidden_statesreturn_dictr8   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }t	        | j                   ||      }| j                  |      }| j                  |      }| j                  |      }t               xs t        |       }|rdnd}	|rdnd}
t        | j                        D ]d  \  }}|r|	|fz   }	d}| j                  r$t        j                  g       }|| j                  k  }|r|r |||||      }|d   }|rd}|s\|
d   fz   }
f |r|	|fz   }	|st!        d	 ||	|
fD              S t#        ||	|

      S )a  
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`):
                Features extracted from the speech or text input by the encoder prenet.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
                `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        N)rz   rp  r/   r)  F)r/   r  r  r   r]  r   c              3   &   K   | ]	  }||  y wr~   r)  .0vs     r*   	<genexpr>z*SpeechT5Encoder.forward.<locals>.<genexpr>9  s     mq_`_lms   last_hidden_stater   
attentions)rz   r  r2  use_return_dictr   r   r   r  r   r   	enumeraterf  r!  r   rH   r.  r  r   )ry   r   r/   r  r2  r3  kwargsr  synced_gpusall_hidden_statesall_self_attentionsidxencoder_layerskip_the_layerdropout_probabilitylayer_outputss                   r*   r   zSpeechT5Encoder.forward  s   < 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]2;;')
 6]3,,];02R6LT6R"6BD$5b4"+DKK"8 	PC#$58H$H! #N}}&+jjn#!4t~~!E![ -!#1"/&7	! !.a 0 , &9]1=M<O&O#1	P4   1]4D Dm]4EGZ$[mmm++*
 	
r,   NNNNr   r   r   r   r   rm   r   r_  r   rO   r  r   r   r   r   s   @r*   r+  r+    s    ~ ( /3)-,0#'X
((X
 t+X
  $;	X

 #TkX
 D[X
 
	 X
r,   r+  c                        e Zd ZdZdef fdZ	 	 	 	 ddej                  dej                  dz  de	dz  de	dz  d	e	dz  d
e
ez  fdZ xZS )SpeechT5EncoderWithSpeechPrenetz
    Wrapper around SpeechT5Encoder that applies SpeechT5SpeechEncoderPrenet to convert the audio waveform data to
    hidden features.
    rz   c                     t         |   |       t        |      | _        t	        |      | _        | j                          y r~   )rl   rm   r0  prenetr+  wrapped_encoderr1  r,  s     r*   rm   z(SpeechT5EncoderWithSpeechPrenet.__init__H  5     1&9.v6 	r,   Nr-   r/   r  r2  r3  r8   c                 \    | j                  ||      \  }}| j                  |||||      }|S N)r   r/   r  r2  r3  rM  rN  	ry   r-   r/   r  r2  r3  r?  r   r  s	            r*   r   z'SpeechT5EncoderWithSpeechPrenet.forwardP  sD     )-L.(Q%~&&')/!5# ' 
 r,   rH  rI  r   s   @r*   rK  rK  B  s    
~  /3)-,0#''' t+  $;	
 #Tk D[ 
	 r,   rK  c                        e Zd ZdZdef fdZd Zd Z	 	 	 	 ddej                  dej                  dz  d	edz  d
edz  dedz  deez  fdZ xZS )SpeechT5EncoderWithTextPrenetz|
    Wrapper around SpeechT5Encoder that applies SpeechT5TextEncoderPrenet to convert the input_ids to hidden features.
    rz   c                     t         |   |       t        |      | _        t	        |      | _        | j                          y r~   )rl   rm   r  rM  r+  rN  r1  r,  s     r*   rm   z&SpeechT5EncoderWithTextPrenet.__init__k  5     /7.v6 	r,   c                 6    | j                   j                         S r~   rM  get_input_embeddingsr<  s    r*   rZ  z2SpeechT5EncoderWithTextPrenet.get_input_embeddingss      {{//11r,   c                 :    | j                   j                  |       y r~   rM  set_input_embeddingsry   values     r*   r^  z2SpeechT5EncoderWithTextPrenet.set_input_embeddingsv      ((/r,   Nr-   r/   r  r2  r3  r8   c                 T    | j                  |      }| j                  |||||      }|S rQ  rR  rS  s	            r*   r   z%SpeechT5EncoderWithTextPrenet.forwardy  s=     L1&&')/!5# ' 
 r,   rH  )r   r   r   r   r   rm   rZ  r^  r   r_  r   rO   r  r   r   r   r   s   @r*   rU  rU  f  s    ~ 20 /3)-,0#''' t+  $;	
 #Tk D[ 
	 r,   rU  c                        e Zd ZdZdef fdZ	 	 	 	 ddej                  dej                  dz  de	dz  de	dz  d	e	dz  d
e
ez  fdZ xZS )SpeechT5EncoderWithoutPrenet
    This wrapper class is a helper class to correctly load pretrained checkpoints when used in combination with
    [`SpeechT5Model`].
    rz   c                 d    t         |   |       t        |      | _        | j	                          y r~   )rl   rm   r+  rN  r1  r,  s     r*   rm   z%SpeechT5EncoderWithoutPrenet.__init__  )     .v6 	r,   Nr-   r/   r  r2  r3  r8   c                 .    | j                  |||||      S rQ  )rN  )ry   r-   r/   r  r2  r3  r?  s          r*   r   z$SpeechT5EncoderWithoutPrenet.forward  s+     ##&)/!5# $ 
 	
r,   rH  rI  r   s   @r*   rd  rd    s    
~  /3)-,0#'
''
 t+
  $;	

 #Tk
 D[
 
	 
r,   rd  c                       e Zd ZdZdef fdZ	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	e	dz  d
e
dz  de
dz  de
dz  de
dz  dej                  dz  deez  fdZ xZS )SpeechT5Decoderzt
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`SpeechT5DecoderLayer`]
    rz   c           	      
   t         |   |       |j                  | _        t	        j
                  t        |j                        D cg c]  }t        ||       c}      | _	        d| _
        | j                          y c c}w )N)r  F)rl   rm   decoder_layerdropr.  r   r  rM   decoder_layersr  rf  r  r1  rl  s      r*   rm   zSpeechT5Decoder.__init__  sh     11mmX]^d^s^sXt$uST%9&A%N$uv&+# 	 %vs   B Nr   r/   r  r  r  r  r  r2  r3  r  r8   c                 l   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	| j
                  r%| j                  r|rt        j                  d       d}|r6|4t        t        | j                         t        | j                               }||j                         nd}|
2t        j                  |||j                  d   z   |j                        }
t!        | j                   |||
|      }||t#        | j                   |||	      }t%               xs t'        |       }|rd
nd}|rd
nd}|r|d
nd}t)        | j*                        D ]q  \  }}|r||fz   }d}| j                  r$t        j,                  g       }|| j.                  k  }|r|sE |||||||||
      }|d   }|s]||d   fz   }|i||d   fz   }s |r||fz   }|	st1        d |||||fD              S t3        |||||      S )aQ  
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`):
                Features extracted from the speech or text input by the decoder prenet.
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
                selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)rz   r   r   rH  )rz   rp  r/   r  r  )rz   rp  r/   r  r)  )r  r  r  r  r  r   c              3   $   K   | ]  }|| 
 y wr~   r)  r6  s     r*   r9  z*SpeechT5Decoder.forward.<locals>.<genexpr>H  s      = s   )r;  r  r   r<  cross_attentions)rz   r  r2  r  r=  r  r!  loggerwarning_oncer   r   r  r   rQ   r%   r   r   r   r   r   r>  rf  rH   r.  r  r   )ry   r   r/   r  r  r  r  r  r2  r3  r  r?  r   r@  rA  rB  all_cross_attentionsrC  decoder_layerrE  rF  rG  s                         r*   r   zSpeechT5Decoder.forward  s   t 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]&&4==##p "	01,dkk2RT`hlhshsTtuOETE`!?!?!Afg!"\\&(>ATATUVAW(W`m`t`tN ,;;'))+
 !,1G1S%>{{+5&;	&" 12R6LT6R #7BD$5b4&7<Q<]rdh"+DKK"8 	VC#$58H$H! #N}}&+jjn#!4t~~!Ek)%'= /"3#-	M *!,M &9]1=M<O&O#(4+?=QRCSBU+U(7	V:   1]4D D ':KM`bvw   9+++*1
 	
r,   
NNNNNNNNNNr   r   r   r   r   rm   r   r_  r^  r
   rO   r   r  r   r   r   r   s   @r*   rj  rj    s    	~ 	 3726:>:>(,!%)-,0#'.2V
((4/V
 ((4/V
  %0047	V

 !& 0 04 7V
 V
 $;V
  $;V
 #TkV
 D[V
 t+V
 
:	:V
r,   rj  c                   8    e Zd ZdZdef fdZ	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
e
dz  dedz  dedz  dedz  dedz  dej                  dz  deez  fdZ xZS )SpeechT5DecoderWithSpeechPrenetz
    Wrapper around SpeechT5Decoder that applies SpeechT5SpeechDecoderPrenet to convert log-mel filterbanks to hidden
    features.
    rz   c                     t         |   |       t        |      | _        t	        |      | _        | j                          y r~   )rl   rm   ra  rM  rj  wrapped_decoderr1  r,  s     r*   rm   z(SpeechT5DecoderWithSpeechPrenet.__init__]  rO  r,   Nr-   r/   r  r  rs  r  r  r  r2  r3  r  r8   c                 `    | j                  ||      }| j                  ||||||||	|
|
      }|S N)
r   r/   r  r  r  r  r  r2  r3  r  rM  rz  )ry   r-   r/   r  r  rs  r  r  r  r2  r3  r  r?  decoder_hidden_statesr  s                  r*   r   z'SpeechT5DecoderWithSpeechPrenet.forwarde  sP     !%L:L M&&/)"7#9+/!5#) ' 
 r,   )NNNNNNNNNNN)r   r   r   r   r   rm   r   r_  r^  r   r
   rO   r  r   r   r   r   s   @r*   rx  rx  W  s
   
~  2626:>:>26(,!%)-,0#'.2''$. ((4/  %0047	
 !& 0 04 7 "LL4/  $;  $; #Tk D[ t+ 
:	:r,   rx  c                   $    e Zd ZdZdef fdZd Zd Z	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dedz  dedz  dedz  dedz  dedz  dej                  dz  deez  fdZ xZS )SpeechT5DecoderWithTextPrenetz{
    Wrapper around SpeechT5Decoder that applies SpeechT5TextDecoderPrenet to convert input tokens to hidden features.
    rz   c                     t         |   |       t        |      | _        t	        |      | _        | j                          y r~   )rl   rm   r  rM  rj  rz  r1  r,  s     r*   rm   z&SpeechT5DecoderWithTextPrenet.__init__  rW  r,   c                 6    | j                   j                         S r~   rY  r<  s    r*   rZ  z2SpeechT5DecoderWithTextPrenet.get_input_embeddings  r[  r,   c                 :    | j                   j                  |       y r~   r]  r_  s     r*   r^  z2SpeechT5DecoderWithTextPrenet.set_input_embeddings  ra  r,   Nr-   r/   r  r  r  r  r  r2  r3  r  r8   c                 h    | j                  |||      \  }}| j                  |||||||||	|

      }|S r|  r}  )ry   r-   r/   r  r  r  r  r  r2  r3  r  r?  r~  r  s                 r*   r   z%SpeechT5DecoderWithTextPrenet.forward  sW     15L.Zi0j-~&&/)"7#9+/!5#) ' 
 r,   ru  )r   r   r   r   r   rm   rZ  r^  r   r_  r^  r
   rO   r   r  r   r   r   r   s   @r*   r  r    s    ~ 20
 2626:>:>(,!%)-,0#'.2''$. ((4/  %0047	
 !& 0 04 7  $;  $; #Tk D[ t+ 
:	:r,   r  c                       e Zd ZdZdef fdZ	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	e	dz  d
e
dz  de
dz  de
dz  de
dz  dej                  dz  deez  fdZ xZS )SpeechT5DecoderWithoutPrenetre  rz   c                 d    t         |   |       t        |      | _        | j	                          y r~   )rl   rm   rj  rz  r1  r,  s     r*   rm   z%SpeechT5DecoderWithoutPrenet.__init__  rg  r,   Nr-   r/   r  r  r  r  r  r2  r3  r  r8   c                 <    | j                  |||||||||	|

      }|S r|  )rz  )ry   r-   r/   r  r  r  r  r  r2  r3  r  r?  r  s                r*   r   z$SpeechT5DecoderWithoutPrenet.forward  s>     &&&)"7#9+/!5#) ' 
 r,   ru  rv  r   s   @r*   r  r    s    
~  2626:>:>(,!%)-,0#'.2''$. ((4/  %0047	
 !& 0 04 7  $;  $; #Tk D[ t+ 
:	:r,   r  c                        e Zd ZdZdef fdZdej                  dej                  dej                  dej                  fdZ
d	 Zed
        Z xZS )$SpeechT5GuidedMultiheadAttentionLossz
    Guided attention loss from the paper [Efficiently Trainable Text-to-Speech System Based on Deep Convolutional
    Networks with Guided Attention](https://huggingface.co/papers/1710.08969), adapted for multi-head attention.
    rz   c                 f    t         |           |j                  | _        |j                  | _        y r~   )rl   rm   guided_attention_loss_sigmasigmaguided_attention_loss_scaler   r,  s     r*   rm   z-SpeechT5GuidedMultiheadAttentionLoss.__init__  s(    77
77
r,   r<  input_masksoutput_masksr8   c                 F   | j                  |||j                        }|j                  d      |j                  d      z  }|j                  |j                        j                  d      }||z  }t	        j
                  |j                  |            }| j                  |z  S )aY  
        Compute the attention loss.

        Args:
            attentions (`torch.FloatTensor` of shape `(batch_size, layers * heads, output_sequence_length, input_sequence_length)`):
                Batch of multi-head attention weights
            input_masks (`torch.BoolTensor` of shape `(batch_size, input_sequence_length)`):
                Input attention mask as booleans.
            output_masks (`torch.BoolTensor` of shape `(batch_size, output_sequence_length)`):
                Target attention mask as booleans.

        Returns:
            `torch.Tensor` with the loss value
        r#   r   r   )_make_guided_attention_masksr   r   r   r   r  masked_selectr   )ry   r<  r  r  guided_attn_masksmaskslosseslosss           r*   r   z,SpeechT5GuidedMultiheadAttentionLoss.forward  s    " !==k<YcYjYjk&&r*[-B-B2-FF**+55a8"Z/zz&..u56zzD  r,   c                 r   |j                  d      }|j                  d      }t        j                  t        |      |j                  d   |j                  d   f|      }t        t        ||            D ]0  \  }\  }}	| j                  ||	| j                  |      ||d |	d |f<   2 |j                  d      S )Nr#   r   rH  )
rK   r   rN   rR   r%   r>  rS  _make_guided_attention_maskr  r   )
ry   r  r  r   r]   rL  r  rC  ilenolens
             r*   r  zASpeechT5GuidedMultiheadAttentionLoss._make_guided_attention_masks  s    #+%))"-!KK[)9<;M;Ma;PR]RcRcdeRf(gpvw!*3}n+M!N 	tC$373S3STXZ^`d`j`jlr3sc5D5%4%/0	t !**1--r,   c                 (   t        j                  t        j                  | |      t        j                  ||      d      \  }}|j                         |z  }|j                         | z  }dt        j                  ||z
  dz   d|dz  z  z        z
  S )NrH  xy)indexingr   r   )r   meshgridrQ   r   r   )r>   output_lengthr  r   grid_ygrid_xs         r*   r  z@SpeechT5GuidedMultiheadAttentionLoss._make_guided_attention_mask  s    LLf5LLv6

 -/,.UYY&6/a!78ANKLLLr,   )r   r   r   r   r   rm   r   r_  
BoolTensorr   r   r  r   r  r   r   s   @r*   r  r    sj    
8~ 8
!++!:?:J:J!Z_ZjZj!	!2	. M Mr,   r  c                        e Zd ZdZdef fdZ	 ddej                  dej                  dej                  dej                  d	ej                  d
ej                  dz  dej                  fdZ
 xZS )SpeechT5SpectrogramLossz;
    Loss computation used by SpeechT5ForTextToSpeech.
    rz   c                 (   t         |           |j                  | _        |j                  | _        |j                  | _        t               | _        t        t        j                  d            | _
        | j                  rt        |      | _        y y )Ng      @)
pos_weight)rl   rm   use_guided_attention_lossguided_attention_loss_num_headsr.   r   l1_criterionr   r   r   bce_criterionr  attn_criterionr,  s     r*   rm   z SpeechT5SpectrogramLoss.__init__#  ss    )/)I)I&/5/U/U, & 7 7"H.%,,s:KL))"Fv"ND *r,   Nr/   r  r  r  labelsrp  r8   c           	      V   |dk7  }|j                  |      }|j                  |      }|j                  |      }| j                  ||      | j                  ||      z   }|d d d d df   }	t        j                  |	 dz  t        j                  |	j                  d      d      j                  |	j                        gd      }
|
d d dd f   j                  |	      }
|j                  |	      }| j                  ||
      }||z   }| j                  rt        j                  |D cg c]  }|d d d | j                  f    c}d      }|dk(  }|d d d d df   }| j                  dkD  r#|d d | j                  dz
  d | j                  f   }| j                  |||      }||z  }|S c c}w )Nr1   r   r   r   r   )r  r  r   r   rT   r   r   r   r  r  r  r.   r  )ry   r/   r  r  r  r  rp  rD  l1_lossr  stop_labelsbce_lossr  xattnr  r  	attn_losss                     r*   r   zSpeechT5SpectrogramLoss.forward/  s    ' %%l3!7!E!El!S 5 C CL Q ##$96BTEVEVWmouEvv Q1W%ii%#uzz%**Q-/K/N/Nu||/\ ]cde!!QR%(66u=%%e, %%fk: ! ))99Tdeqa#IT%I%I#I IJeklmD(A-K'1a0L$$q(+At/D/Dq/H/aDLaLa/a,ab++D+|LIID fs   $F&r~   )r   r   r   r   r   rm   r   r^  r_  r   r   r   r   s   @r*   r  r    s    
O~ 
O& 6:)(() !& 1 1)  %00	)
 !!) !!)  ++d2) 
)r,   r  zv
    The bare SpeechT5 Encoder-Decoder Model outputting raw hidden-states without any specific pre- or post-nets.
    custom_introc                       e Zd Z	 	 ddedej
                  dz  dej
                  dz  f fdZd Zd Zd Z	e
	 	 	 	 	 	 	 	 	 	 	 	 dd	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  deeej                        dz  dedz  dedz  dej                  dz  dedz  dedz  dedz  dej                  dz  deej                     ez  fd       Z xZS )SpeechT5ModelNrz   encoderdecoderc                     t         |   |       || _        |t        |      n|| _        |t        |      n|| _        | j                          y)z
        encoder (`PreTrainedModel`, *optional*):
            The encoder model to use.
        decoder (`PreTrainedModel`, *optional*):
            The decoder model to use.
        N)rl   rm   rz   rd  r  r  r  r1  )ry   rz   r  r  r|   s       r*   rm   zSpeechT5Model.__init__a  sM     	 ?F3F;T[?F3F;T[ 	r,   c                     t        | j                  t              r| j                  j                         S t        | j                  t
              r| j                  j                         S t        r~   )r  r  rU  rZ  r  r  NotImplementedErrorr<  s    r*   rZ  z"SpeechT5Model.get_input_embeddingsu  sL    dll$AB<<4466dll$AB<<4466!!r,   c                     t        | j                  t              r| j                  j                  |       t        | j                  t
              r| j                  j                  |       y y r~   )r  r  rU  r^  r  r  r_  s     r*   r^  z"SpeechT5Model.set_input_embeddings|  sJ    dll$ABLL--e4dll$ABLL--e4 Cr,   c                     t        | j                  t              r%| j                  j                  j	                          yyz
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        N)r  r  rK  rM  r=  r<  s    r*   r=  z$SpeechT5Model.freeze_feature_encoder  s/    
 dll$CDLL668 Er,   r-   r/   decoder_input_valuesdecoder_attention_maskencoder_outputsr  r  rs  r  r2  r3  r  r8   c                    |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }|| j                  |||	|
|      }nI|rGt        |t              s7t        |d   t        |      dkD  r|d   ndt        |      dkD  r|d   nd      }|Qt        | j
                  t              r7| j
                  j                  j                  |d   j                  d   |      }n|}t        | j                  t              rd|i}ni } | j                  d
|||d   ||||	|
||d
|}|s||z   S t        |j                   |j"                  |j$                  |j&                  |j(                  |j                   |j$                  |j&                  	      S )a  
        input_values (`torch.Tensor` of shape `(batch_size, sequence_length)`):
            Depending on which encoder is being used, the `input_values` are either: float values of the input raw
            speech waveform, or indices of input sequence tokens in the vocabulary, or hidden states.
        decoder_input_values (`torch.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Depending on which decoder is being used, the `decoder_input_values` are either: float values of log-mel
            filterbank features extracted from the raw speech waveform, or indices of decoder input sequence tokens in
            the vocabulary, or hidden states.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
            Tensor containing the speaker embeddings.
        N)r-   r/   r  r2  r3  r   r   r   r:  rs  )
r-   r/   r  r  r  r  r  r2  r3  r  )r;  r  r~  decoder_attentionsrp  encoder_last_hidden_stater  encoder_attentionsr)  )rz   r  r2  r  r=  r  r  r   rR   rK  rM  r@  r%   r  rx  r   r;  r  r   r<  rp  )ry   r-   r/   r  r  r  r  r  rs  r  r2  r3  r  r?  r  decoder_argsdecoder_outputss                    r*   r   zSpeechT5Model.forward  s   F 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B] ""ll)-"3%9' + O O_!M-"1!"4474H14Loa0RV14_1E1I?1-tO %*T\\Cb*c%)\\%8%8%[%["((+^&" &4"dll$CD02DELL&$,, 
-1"1!"4#9+/!5#)
 
 "_44!-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r,   r]  NNNNNNNNNNNN)r   r   r   r   r   r(  rm   rZ  r^  r=  r   r   r   r^  r  r_  r
   rO   r   r   r   r   s   @r*   r  r  [  s    %)$(	 T! T!	("59  -12648:>BF(,!%7;)-,0#'.2a
llT)a
 ((4/a
 $llT1	a

 !& 0 04 7a
 uU%6%6784?a
 a
 $;a
 "--4a
  $;a
 #Tka
 D[a
 t+a
 
u  	!$6	6a
 a
r,   r  zB
    SpeechT5 Model with a speech encoder and a text decoder.
    c                       e Zd ZddiZdef fdZd Zd Zd Ze		 	 	 	 	 	 	 	 	 	 	 	 dd	e
j                  dz  d
e
j                  dz  de
j                  dz  de
j                  dz  deee
j                        dz  dedz  dedz  dedz  dedz  dedz  de
j                  dz  de
j                   dz  deez  fd       Z xZS )SpeechT5ForSpeechToTextz#text_decoder_postnet.lm_head.weightz+speecht5.decoder.prenet.embed_tokens.weightrz   c                     t         |   |       |j                  t        d| j                   d      t        |      }t        |      }t        |||      | _        t        |      | _
        | j                          y )NYou are trying to instantiate a    with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `SpeechT5ForSpeechToText.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.)rl   rm   r  r'   r|   rK  r  r  r	  r  text_decoder_postnetr1  )ry   rz   speech_encodertext_decoderr|   s       r*   rm   z SpeechT5ForSpeechToText.__init__  s     $00@ A/ /  9@4V<%fnlK$>v$F! 	r,   c                 T    | j                         j                  j                          yr  get_encoderrM  r=  r<  s    r*   r=  z.SpeechT5ForSpeechToText.freeze_feature_encoder      
 	!!88:r,   c                 6    | j                   j                         S r~   )r  r  r<  s    r*   r  z-SpeechT5ForSpeechToText.get_output_embeddings  s    ((>>@@r,   c                 :    | j                   j                  |       y r~   )r  r  r  s     r*   r  z-SpeechT5ForSpeechToText.set_output_embeddings  s    !!77Gr,   Nr-   r/   decoder_input_idsr  r  r  r  r  r2  r3  r  r  r8   c                 z   |
|
n| j                   j                  }
|7|5t        || j                   j                  | j                   j                        }| j                  |||||||||	d|      }| j                  |d         }d}|Ft               } ||j                  d| j                   j                        |j                  d            }|
s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                  |j                  |j                  |j                   |j"                  	      S )a  
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`SpeechT5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            SpeechT5 uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the language modeling loss. Indices should either be in `[0, ..., config.vocab_size]`
            or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is
            only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

            Label indices can be obtained using [`SpeechT5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

        Example:

        ```python
        >>> from transformers import SpeechT5Processor, SpeechT5ForSpeechToText
        >>> from datasets import load_dataset

        >>> dataset = load_dataset(
        ...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
        ... )  # doctest: +IGNORE_RESULT
        >>> dataset = dataset.sort("id")
        >>> sampling_rate = dataset.features["audio"].sampling_rate

        >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_asr")
        >>> model = SpeechT5ForSpeechToText.from_pretrained("microsoft/speecht5_asr")

        >>> # audio file is decoded on the fly
        >>> inputs = processor(audio=dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
        >>> predicted_ids = model.generate(**inputs, max_length=100)

        >>> # transcribe speech
        >>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
        >>> transcription[0]
        'mister quilter is the apostle of the middle classes and we are glad to welcome his gospel'
        ```

        ```python
        >>> inputs["labels"] = processor(text_target=dataset[0]["text"], return_tensors="pt").input_ids

        >>> # compute loss
        >>> loss = model(**inputs).loss
        >>> round(loss.item(), 2)
        19.68
        ```
        NT)r-   r/   r  r  r  r  r  r  r2  r3  r  r   r#   r   )	r  r  r  r~  r  rp  r  r  r  )rz   r=  r+   r    r!   r	  r  r   r   r  r   r  r~  r  rp  r  r  r  )ry   r-   r/   r  r  r  r  r  r  r2  r3  r  r  r?  r  r  r  loss_fctoutputs                      r*   r   zSpeechT5ForSpeechToText.forward  sT   h &1%<k$++B]B] ($6DKK44dkk6X6X%! --%)!2#9++/!5)   
 **71:6')HFKKDKK,B,BCV[[QS_UDY,F)-)9TGf$EvE#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r,   r  )r   r   r   _tied_weights_keysr   rm   r=  r  r  r   r   r_  r^  r  r
   rO   r   r   r   r   r   s   @r*   r  r    sf    @Ano~ (;AH  262659:>BF(,!%)-,0#'*..2~
''$.~
 ((4/~
 !++d2	~

 !& 0 04 7~
 uU%6%6784?~
 ~
 $;~
  $;~
 #Tk~
 D[~
   4'~
 t+~
 
	 ~
 ~
r,   r  modelrs  	thresholdminlenratiomaxlenratiovocoderoutput_cross_attentionsreturn_output_lengthsc
           
      j   |t        d      |+d|| j                  j                  k(  j                         z
  }
n|}
|j	                  d      }| j
                  j                  ||
d      }|j                  }t        | j
                  j                  t              r@| j
                  j                  j                  j                  |d   j                  d   |
      }
t        |j	                  d      |z  | j                  j                  z        }t        |j	                  d      |z  | j                  j                  z        }|j                  |d| j                  j                        }g }g }d }d}i }	 |dz  }| j
                  j                   j                  ||      }| j
                  j                   j#                  |d d dd f   d ||
|d|d      }|r0|j%                  t'        j(                  |j*                  d             |j                  j-                  d      }|j.                  }| j0                  j3                  |      }|j5                  || j                  j                  | j                  j                        }|j%                  |       |d d dd d f   j5                  |d| j                  j                        }t'        j(                  ||fd      }t'        j6                  | j0                  j9                  |            }||k  r||k  rAt'        j:                  |d      |k\  }t'        j<                  |      d   j?                         }ntA        tC        |            }|D cg c]	  }||vs| }}tC        |      dkD  rat'        jD                  |      }|jG                  dd      jI                  dd	      }| j0                  jK                  |      }|D ]
  } ||    || <    tC        |      |k\  rntA        tC        |            D cg c]  }||   	 }}|	s|dk(  r|d   n4t&        jL                  jN                  jP                  jS                  |d
      }|	 ||      }!n|}!|r`t'        j(                  |d	      }|dkD  r@ |j4                  |t        |j	                  d      |z        g|j	                         dd   }|!|f}!|!S g }"tA        |      D ]%  }|"j%                  ||   j	                  d             ' |:t&        jL                  jN                  jP                  jS                  |d
      }||"f}!nyg }#t&        jL                  jN                  jP                  jS                  |d
      } ||      }#|"D cg c]+  }t        |#j	                  d      tU        |"      z        |z  - }$}|#|$f}!|r^t'        j(                  |d	      } |j4                  |t        |j	                  d      |z        g|j	                         dd   }g |!|}!|!S c c}w c c}w c c}w )Na  `speaker_embeddings` must be specified. For example, you can use a speaker embeddings by following
                    the code snippet provided in this link:
                    https://huggingface.co/datasets/Matthijs/cmu-arctic-xvectors
                    r   r   T)r-   r/   r3  r#   )r   r/   r  r  r  r  r  r3  r   r   )batch_first)+r'   rz   r    r<   r   r	  r  r;  r  rK  rM  r@  r%   r.   r$   rd  r  rz  rV   r   r   rp  squeezer  speech_decoder_postnetr  r   sigmoidr  rK   r  rL   rM   rR   stackr   flattenr  r   r   rnnpad_sequencer=   )%r  r-   rs  r/   r  r  r  r  r  r  r  r   encoder_outr  maxlenminlenoutput_sequencespectrogramrp  r  rC  result_spectrogramr~  decoder_outlast_decoder_outputspectrumnew_spectrogramprobmeet_thresholdsmeet_indexesr  spectrograms
meet_indexr  spectrogram_lengths	waveformswaveform_lengthss%                                        r*   _generate_speechr     s    !
 	
 !"lell6O6O&O%T%T%V!V!/


A
C..((!- ) K !, = = %..((*IJ!&!7!7!>!>!a!aN  #%;"
 *//2[@5<<C`C``aF*//2[@5<<C`C``aF 099#q%,,B[B[\OKO
C
q !& 6 6 = =oOa bnn,,<</237";#9+5 = 	
 ###EIIk.J.JPQ$RS);;CCAF%55 //889LM==ell&C&CU\\E^E^_8$ #1b!8,11#q%,,:S:ST))_o$FAN}}U99BBCVWX< V|"'))Db"9Y"F${{?;A>EEG$SY/'3S!q@R7RASLS< 1$${{;7+55a;CCAqI$;;CCLQ". NJ5A*5M&z2N%&#-i j 49=O9P3QRa&q)RLR ),l1ouxx~~7I7I7V7VWcqu7V7vk*G!G"$yy)9qAQw#8#3#8#8-2215;<$?O?T?T?VWYWZ?[$   01G* N% !s 	@A&&|A';';A'>?	@? 88>>--::<UY:ZL#%89GI 88>>--::<UY:ZL-I_rsZ[INN1$5<O8P$P QTU Uss "23G"$yy)9qA4/44S)..q1C78 ;K;P;P;RSUSV;W  32!12GNW T S4  ts   4	X&>X&X+0X0zB
    SpeechT5 Model with a text encoder and a speech decoder.
    c            !           e Zd ZdZdZdef fdZedefd       Z	e
	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  deeej                        dz  dedz  dedz  dedz  dedz  dedz  dej                  dz  dej                  dz  dej                   dz  dej                   dz  deez  fd       Z ej&                         	 	 	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  dedededej,                  dz  dededej                  eej                  ej                  f   z  fd       Z ej&                         	 	 	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  dedededej,                  dz  dededej                  eej                  ej                  f   z  fd       Z xZS ) SpeechT5ForTextToSpeech)textr   rz   c                     t         |   |       |j                  t        d| j                   d      t        |      }t        |      }t        |||      | _        t        |      | _
        | j                          y )Nr  a    with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `SpeechT5ForTextToSpeech.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.)rl   rm   r  r'   r|   rU  rx  r  r	  r  r  r1  )ry   rz   text_encoderspeech_decoderr|   s       r*   rm   z SpeechT5ForTextToSpeech.__init__2	  s     $00@ A/ /  5V<8@%flNK&B6&J# 	r,   r8   c                      yr   r)  )clss    r*   can_generatez$SpeechT5ForTextToSpeech.can_generateF	  s    
 r,   Nr/   r  r  r  r  r  r  r2  r3  rs  r  r  r  c                 ^   |
|
n| j                   j                  }
|>|$t        || j                   j                  |      \  }}| j                   j                  rd}| j                  ||||||||||	d|      }| j                  |d         \  }}}d}|,t        | j                         } |||||||j                        }|
s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                  |j                  |j                  |j                  |j                  	      S )ab  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
            [`~PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        decoder_input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`):
            Float values of input mel spectrogram.

            SpeechT5 uses an all-zero spectrum as the starting token for `decoder_input_values` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_values` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
            Tensor containing the speaker embeddings.
        labels (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`, *optional*):
            Float values of target mel spectrogram. Timesteps set to `-100.0` are ignored (masked) for the loss
            computation. Spectrograms can be obtained using [`SpeechT5Processor`]. See [`SpeechT5Processor.__call__`]
            for details.
        stop_labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Binary tensor indicating the position of the stop token in the sequence.

        Example:

        ```python
        >>> from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, set_seed
        >>> import torch

        >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        >>> model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
        >>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

        >>> inputs = processor(text="Hello, my dog is cute", return_tensors="pt")
        >>> speaker_embeddings = torch.zeros((1, 512))  # or load xvectors from a file

        >>> set_seed(555)  # make deterministic

        >>> # generate speech
        >>> speech = model.generate(inputs["input_ids"], speaker_embeddings=speaker_embeddings, vocoder=vocoder)
        >>> speech.shape
        torch.Size([15872])
        ```
        NTr-   r/   r  r  r  r  r  rs  r  r2  r3  r  r   r   	r  r  r  r~  r  rp  r  r  r  )rz   r=  r4   r.   r  r	  r  r  rp  r   r  r~  r  r  r  r  )ry   r   r/   r  r  r  r  r  r  r2  r3  rs  r  r  r  r?  r  r  r  r  r  	criterionr  s                          r*   r   zSpeechT5ForTextToSpeech.forwardM	  sw   L &1%<k$++B]B]#+?WDKK88:P@<$&< {{44$(!--")!5#9++1/!5)   
 AE@[@[\cde\f@g= 5v/<I&%((D +-;F)-)9TGf$EvE'-#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r,   r  r  r  r  r  r  c
                     |W|j                  d      }|j                  d      |k7  r2|j                  d      dk(  r|j                  |d      }nt        d      t        | |||||||||	
      S )aE  
        Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a
        speech waveform using a vocoder.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary.

                Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
                [`~PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Attention mask from the tokenizer, required for batched inference to signal to the model where to
                ignore padded tokens from the input_ids.
            speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
                Tensor containing the speaker embeddings.
            threshold (`float`, *optional*, defaults to 0.5):
                The generated sequence ends when the predicted stop token probability exceeds this value.
            minlenratio (`float`, *optional*, defaults to 0.0):
                Used to calculate the minimum required length for the output sequence.
            maxlenratio (`float`, *optional*, defaults to 20.0):
                Used to calculate the maximum allowed length for the output sequence.
            vocoder (`nn.Module`, *optional*):
                The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
                spectrogram.
            output_cross_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of the decoder's cross-attention layers.
            return_output_lengths (`bool`, *optional*, defaults to `False`):
                Whether or not to return the concrete spectrogram/waveform lengths.

        Returns:
            `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
            - when `return_output_lengths` is False
                - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
                - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(num_frames,)` -- The predicted speech waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
            - when `return_output_lengths` is True
                - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
                are padded to the maximum length.
                - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `list[Int]` -- A list of
                all the concrete lengths for each spectrogram.
                - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
                - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `list[Int]` -- A list of all
                the concrete lengths for each waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
        r   r   zUThe first dimension of speaker_embeddings must be either 1 or the same as batch_size.r   ro  r'   r   )ry   r   r/   rs  r  r  r  r  r  r  r?  r[   s               r*   generatez SpeechT5ForTextToSpeech.generate	  s    J )"*J!&&q)Z7%**1-2);)B)B:q)Q&$o   #!
 	
r,   c
                     |W|j                  d      }
|j                  d      |
k7  r2|j                  d      dk(  r|j                  |
d      }nt        d      t        | |||||||||	
      S )a  
        Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a
        speech waveform using a vocoder.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary.

                Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
                [`~PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
                Tensor containing the speaker embeddings.
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
                `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            threshold (`float`, *optional*, defaults to 0.5):
                The generated sequence ends when the predicted stop token probability exceeds this value.
            minlenratio (`float`, *optional*, defaults to 0.0):
                Used to calculate the minimum required length for the output sequence.
            maxlenratio (`float`, *optional*, defaults to 20.0):
                Used to calculate the maximum allowed length for the output sequence.
            vocoder (`nn.Module`, *optional*, defaults to `None`):
                The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
                spectrogram.
            output_cross_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of the decoder's cross-attention layers.
            return_output_lengths (`bool`, *optional*, defaults to `False`):
                Whether or not to return the concrete spectrogram/waveform lengths.

        Returns:
            `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
            - when `return_output_lengths` is False
                - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
                - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(num_frames,)` -- The predicted speech waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
            - when `return_output_lengths` is True
                - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
                are padded to the maximum length.
                - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `list[Int]` -- A list of
                all the concrete lengths for each spectrogram.
                - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
                - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `list[Int]` -- A list of all
                the concrete lengths for each waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
        r   r   zUThe first dimension of speaker_embeddings must be either 1 or the same as batch size.r  )ry   r   rs  r/   r  r  r  r  r  r  r[   s              r*   generate_speechz'SpeechT5ForTextToSpeech.generate_speech&
  s    R )"*J!&&q)Z7%**1-2);)B)B:q)Q&$o   #!
 	
r,   NNNNNNNNNNNNNNNNg      ?r2   g      4@NFF)r   r   r   r&  r%  r   rm   classmethodrO   r	  r   r   r^  r_  r  r
   r   r   r   r   r   r   r(  r  r  r   r   s   @r*   r  r  )	  su    !!O~ ( T    .2269=:>BF(,!%)-,0#'7;+/+/.2z
##d*z
 ((4/z
 $//$6	z

 !& 0 04 7z
 uU%6%6784?z
 z
 $;z
  $;z
 #Tkz
 D[z
 "--4z
 !!D(z
 \\D(z
 t+z
" 
)	)#z
 z
x U]]_ 377; !$((-&+Y
##Y
 ((4/Y
 "--4	Y

 Y
 Y
 Y
 T!Y
 "&Y
  $Y
 
		U5#4#4e6G6G#GH	HY
 Y
v U]]_ 8<26 !$((-&+]
##]
 "--4]
 ((4/	]

 ]
 ]
 ]
 T!]
 "&]
  $]
 
		U5#4#4e6G6G#GH	H]
 ]
r,   r  zD
    SpeechT5 Model with a speech encoder and a speech decoder.
    c            !           e Zd Zdef fdZd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	e
e
ej                        dz  d
edz  dedz  dedz  dedz  dedz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  de
ez  fd       Z ej                          	 	 	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  dedededej&                  dz  dededej                  fd       Z xZS )SpeechT5ForSpeechToSpeechrz   c                     t         |   |       t        |      }t        |      }t	        |||      | _        t        |      | _        | j                          y r~   )	rl   rm   rK  rx  r  r	  r  r  r1  )ry   rz   r  r  r|   s       r*   rm   z"SpeechT5ForSpeechToSpeech.__init__
  sM     8@8@%fnnM&B6&J# 	r,   c                 T    | j                         j                  j                          yr  r  r<  s    r*   r=  z0SpeechT5ForSpeechToSpeech.freeze_feature_encoder
  r  r,   Nr-   r/   r  r  r  r  r  r  r2  r3  rs  r  r  r  r8   c                    |
|
n| j                   j                  }
|&|$t        || j                   j                  |      \  }}| j	                  ||||||||||	d|      }| j                  |d         \  }}}d}|
s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                  |j                  |j                  |j                  |j                  	      S )a  
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding and conversion into
            a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
        decoder_input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`):
            Float values of input mel spectrogram.

            SpeechT5 uses an all-zero spectrum as the starting token for `decoder_input_values` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_values` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
            Tensor containing the speaker embeddings.
        labels (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`, *optional*):
            Float values of target mel spectrogram. Spectrograms can be obtained using [`SpeechT5Processor`]. See
            [`SpeechT5Processor.__call__`] for details.
        stop_labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Binary tensor indicating the position of the stop token in the sequence.

        Example:

        ```python
        >>> from transformers import SpeechT5Processor, SpeechT5ForSpeechToSpeech, SpeechT5HifiGan, set_seed
        >>> from datasets import load_dataset
        >>> import torch

        >>> dataset = load_dataset(
        ...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
        ... )  # doctest: +IGNORE_RESULT
        >>> dataset = dataset.sort("id")
        >>> sampling_rate = dataset.features["audio"].sampling_rate

        >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_vc")
        >>> model = SpeechT5ForSpeechToSpeech.from_pretrained("microsoft/speecht5_vc")
        >>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

        >>> # audio file is decoded on the fly
        >>> inputs = processor(audio=dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")

        >>> speaker_embeddings = torch.zeros((1, 512))  # or load xvectors from a file

        >>> set_seed(555)  # make deterministic

        >>> # generate speech
        >>> speech = model.generate_speech(inputs["input_values"], speaker_embeddings, vocoder=vocoder)
        >>> speech.shape
        torch.Size([77824])
        ```
        NTr  r   r   r  )rz   r=  r4   r.   r	  r  r   r  r~  r  rp  r  r  r  )ry   r-   r/   r  r  r  r  r  r  r2  r3  rs  r  r  r  r?  r  r\   r  r  r  r  s                         r*   r   z!SpeechT5ForSpeechToSpeech.forward
  s$   Z &1%<k$++B]B]#+?WDKK88:P@<$&< --%)!5#9++1/!5)   
 "&!<!<WQZ!H;!^gabk1F)-)9TGf$EvE'##33")"?"?&99$55&-&G&G")"?"?&99

 
	
r,   r  r  r  r  r  r  c
                 p    |!t        j                  d|j                        }t        | |||||||||	
      S )a'  
        Converts a raw speech waveform into a sequence of mel spectrograms, which are subsequently turned back into a
        speech waveform using a vocoder.

        Args:
            input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
                Float values of input raw speech waveform.

                Values can be obtained by loading a *.flac* or *.wav* audio file into an array of type `list[float]`,
                a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`)
                or the soundfile library (`pip install soundfile`).
                To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding and
                conversion into a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
            speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
                Tensor containing the speaker embeddings.
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
                `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            threshold (`float`, *optional*, defaults to 0.5):
                The generated sequence ends when the predicted stop token probability exceeds this value.
            minlenratio (`float`, *optional*, defaults to 0.0):
                Used to calculate the minimum required length for the output sequence.
            maxlenratio (`float`, *optional*, defaults to 20.0):
                Used to calculate the maximum allowed length for the output sequence.
            vocoder (`nn.Module`, *optional*, defaults to `None`):
                The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
                spectrogram.
            output_cross_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of the decoder's cross-attention layers.
            return_output_lengths (`bool`, *optional*, defaults to `False`):
                Whether or not to return the concrete spectrogram/waveform lengths.

        Returns:
            `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
            - when `return_output_lengths` is False
                - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
                - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(num_frames,)` -- The predicted speech waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
            - when `return_output_lengths` is True
                - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
                are padded to the maximum length.
                - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `list[Int]` -- A list of
                all the concrete lengths for each spectrogram.
                - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
                - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `list[Int]` -- A list of all
                the concrete lengths for each waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
        )r   i   rH  )r   rN   r   r   )
ry   r-   rs  r/   r  r  r  r  r  r  s
             r*   r  z)SpeechT5ForSpeechToSpeech.generate_speech  sM    T %!&Xl>Q>Q!R#!
 	
r,   r  r  )r   r   r   r   rm   r=  r   r   r_  r^  r  r
   rO   r   r   r   r   r   r   r(  r  r   r   s   @r*   r  r  
  sL   
~ 
;  26269=:>BF(,!%)-,0#'7;+/+/.2u
''$.u
 ((4/u
 $//$6	u

 !& 0 04 7u
 uU%6%6784?u
 u
 $;u
  $;u
 #Tku
 D[u
 "--4u
 !!D(u
 \\D(u
 t+u
" 
)	)#u
 u
n U]]_ 8<26 !$((-&+W
''W
 "--4W
 ((4/	W

 W
 W
 W
 T!W
 "&W
  $W
 
		W
 W
r,   r  c                   :     e Zd Zd fd	ZddZd Zd Zd Z xZS )HifiGanResidualBlockc                    t         |           || _        t        j                  t        t        |            D cg c]3  }t        j                  |||d||   | j                  |||               5 c}      | _	        t        j                  t        t        |            D cg c]-  }t        j                  |||dd| j                  |d            / c}      | _
        y c c}w c c}w )Nr   )rj   dilationr   )rl   rm   leaky_relu_sloper   r  rM   rR   rq   get_paddingconvs1convs2)ry   channelsri   r  r   r  r\   r|   s          r*   rm   zHifiGanResidualBlock.__init__t  s     0mm s8}-
  		%a[ ,,[(1+F

 mm s8}-
  		 ,,[!<



s   8C$%2C)c                     ||z  |z
  dz  S r   r)  )ry   ri   r  s      r*   r!  z HifiGanResidualBlock.get_padding  s    h&1a77r,   c                 ,   t         j                  j                  }t        t         j                  j                  d      r$t         j                  j                  j                  }| j
                  D ]
  } ||        | j                  D ]
  } ||        y Nr   )r   r   r   r   r   r"  r#  ry   r   r  s      r*   apply_weight_normz&HifiGanResidualBlock.apply_weight_norm  sp    hh**288,,m<((33??K[[ 	E	[[ 	E	r,   c                     | j                   D ]!  }t        j                  j                  |       # | j                  D ]!  }t        j                  j                  |       # y r~   )r"  r   r   remove_weight_normr#  ry   r  s     r*   r+  z'HifiGanResidualBlock.remove_weight_norm  sL    [[ 	/EHH''.	/[[ 	/EHH''.	/r,   c                 ,   t        | j                  | j                        D ]p  \  }}|}t        j                  j                  || j                        } ||      }t        j                  j                  || j                        } ||      }||z   }r |S r~   )rS  r"  r#  r   ru  
leaky_relur   )ry   r   conv1conv2r  s        r*   r   zHifiGanResidualBlock.forward  s    T[[9 	5LE5$HMM44]DDYDYZM!-0MMM44]DDYDYZM!-0M)H4M	5 r,   )r   )r   r      g?)r   )	r   r   r   rm   r!  r)  r+  r   r   r   s   @r*   r  r  s  s    
>8/r,   r  z
    HiFi-GAN vocoder.
    c                        e Zd ZU eed<   dZdef fdZ fdZd Zd Z	 e
d      dej                  d	ej                  fd
       Z xZS )r  rz   r  c                    t         |   |       t        |j                        | _        t        |j
                        | _        t        j                  |j                  |j                  ddd      | _        t        j                         | _        t        t        |j
                  |j                               D ]d  \  }\  }}| j                  j#                  t        j$                  |j                  d|z  z  |j                  d|dz   z  z  ||||z
  dz               f t        j                         | _        t)        t        | j                              D ]p  }|j                  d|dz   z  z  }t        |j                  |j*                        D ]6  \  }}| j&                  j#                  t-        ||||j.                               8 r t        j                  dddd      | _        | j3                  dt5        j6                  |j                               | j3                  dt5        j8                  |j                               | j;                          y )N   r   r   )ri   rj   r   r   r  r   )rl   rm   rR   resblock_kernel_sizesnum_kernelsupsample_ratesnum_upsamplesr   rq   model_in_dimupsample_initial_channelconv_prer  	upsamplerr>  rS  upsample_kernel_sizesrV   ConvTranspose1d	resblocksrM   resblock_dilation_sizesr  r   	conv_postr   r   rN   rT   r1  )ry   rz   r  upsample_rateri   r$  r  r|   s          r*   rm   zSpeechT5HifiGan.__init__  s    v;;< !6!67		++
 /8V=R=RTZTpTp9q/r 		+A+{NN!!""331=33a!eE +((=8Q>		 s4>>*+ 	vA661Q<HH),V-I-I6KiKi)j v%X%%&:8[RZ\b\s\s&tuv	v
 8QAaQRSVU[[1D1D%EFWejj1D1D&EF 	r,   c                     t         |   |       t        |t              r?t	        j
                  |j                         t	        j                  |j                         y y r~   )	rl   r"  r  r  r  r  r  r  r   )ry   r  r|   s     r*   r"  zSpeechT5HifiGan._init_weights  s?    f%fo.KK$JJv||$ /r,   c                    t         j                  j                  }t        t         j                  j                  d      r$t         j                  j                  j                  } || j
                         | j                  D ]
  } ||        | j                  D ]  }|j                            || j                         y r'  )
r   r   r   r   r   r;  r<  r?  r)  rA  r(  s      r*   r)  z!SpeechT5HifiGan.apply_weight_norm  s    hh**288,,m<((33??KDMM"^^ 	E	^^ 	&E##%	&DNN#r,   c                 J   t         j                  j                  | j                         | j                  D ]!  }t         j                  j                  |       # | j
                  D ]  }|j                           t         j                  j                  | j                         y r~   )r   r   r+  r;  r<  r?  rA  r,  s     r*   r+  z"SpeechT5HifiGan.remove_weight_norm  sr    
##DMM2^^ 	/EHH''.	/^^ 	'E$$&	'
##DNN3r,   a  
        Converts a log-mel spectrogram into a speech waveform. Passing a batch of log-mel spectrograms returns a batch
        of speech waveforms. Passing a single, un-batched log-mel spectrogram returns a single, un-batched speech
        waveform.
        r  r8   c                    | j                   j                  r|| j                  z
  | j                  z  }|j	                         dk(  }|s|j                  d      }|j                  dd      }| j                  |      }t        | j                        D ]  }t        j                  j                  || j                   j                        } | j                  |   |      } | j                  || j                   z     |      }t        d| j                         D ]*  }| | j                  || j                   z  |z      |      z  }, || j                   z  } t        j                  j                  |      }| j#                  |      }t%        j&                  |      }|s2|j)                  d      j                  dd      j+                  d      }|S |j)                  d      }|S )a  
        spectrogram (`torch.FloatTensor`):
            Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
            config.model_in_dim)`, or un-batched and of shape `(sequence_length, config.model_in_dim)`.

        Returns:
            `torch.FloatTensor`: Tensor containing the speech waveform. If the input spectrogram is batched, will be of
            shape `(batch_size, num_frames,)`. If un-batched, will be of shape `(num_frames,)`.
        r   r   r   r   r#   )rz   normalize_beforer  r   r   r   r   r;  rM   r8  r   ru  r.  r   r<  r?  r6  rA  r   tanhr  r   )	ry   r  r?  
is_batchedr   r  	res_statejwaveforms	            r*   r   zSpeechT5HifiGan.forward  s   " ;;''&2djj@K __&!+
%//2K#--a3m4t))* 	9AMM44]DKKD`D`aM-DNN1-m<M<q4+;+;';<]KI1d../ UET^^A0@0@,@1,DEmTT	U%(8(88M	9 00?}5

=1$,,Q/99!Q?DDRHH
  %,,Q/Hr,   )r   r   r   r   r#  r%  rm   r"  r)  r+  r   r   r_  r   r   r   s   @r*   r  r    se     "!#O$4 $L%
$4 (5#4#4 (5CTCT ((r,   r  )r  r  r  r  r  r  )r   Nr  r  )er   r   numpyrF   r   r   torch.nnr   r   r    r   r  activationsr	   cache_utilsr
   r   r   
generationr   integrations.deepspeedr   integrations.fsdpr   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   r   r   r   r   configuration_speecht5r   r   
get_loggerr   rq  _HIDDEN_STATES_START_POSITIONr   r<   r+   r4   r  r   r^  ndarrayrd   rf   r   r   r(  r   r   r   r   r   r  r$  r0  ra  rz  r  r  r  r  r  r  r  r  r  r+  rK  rU  rd  rj  rx  r  r  r  r  r  r  r_  rO   r   r  r  r  r  __all__r)  r,   r*   <module>r^     s        @ @ & ! C C ) @ 7 J 9  D , I 
		H	% !" %,, c [^ " bf0,,0250KP<<Z^K^04 /3tc?tt t $$t+	t
 t ZZtp#= ,!; 8!; 2B8BII B8L*bii *Zryy 2" "(299 %RYY %R1		 1D")) DN1")) 1h% %P<299 <2		+? ""-		+? "-J&,@ &$X2		 X2v")) 065 6rZ5 Zz ?4o ?4 ?4Dn
- n
b!&= !H&$; &R
#: 
>f
- f
R,&= ,^0$; 0f'#: 'T8M299 8Mv:bii :z 
L
+ L

L
^ 
c
5 c

c
R 48.2 $$)"'L"L##L ))D0L $$t+	L
 L L L YYL "L  L u00%2C2CCDDL^ 
V
5 V

V
r
 
d
 7 d

d
N;299 ;| 
so s
slr,   