
    qi2                         d dl Z d dlZd dlZd dlmZ ddlmZmZ  ej                  e      Z	e G d d             Z
 ed       G d	 d
             Z G d d      Z G d de      Zy)    N)	dataclass   )is_torch_availableloggingc                   T    e Zd ZU dZeed<   eed<   dZedz  ed<   dZedz  ed<   d Zy)InputExamplea5  
    A single training/test example for simple sequence classification.

    Args:
        guid: Unique id for the example.
        text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
        text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
        label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
    guidtext_aNtext_blabelc                 \    t        j                  t        j                  |       d      dz   S )*Serializes this instance to a JSON string.   )indent
jsondumpsdataclassesasdictselfs    T/opt/pipecat/venv/lib/python3.12/site-packages/transformers/data/processors/utils.pyto_json_stringzInputExample.to_json_string/   s#    zz+,,T21=DD    )	__name__
__module____qualname____doc__str__annotations__r   r   r    r   r   r   r      s5     IKFC$JE3:Er   r   T)frozenc                   v    e Zd ZU dZee   ed<   dZee   dz  ed<   dZee   dz  ed<   dZ	ee
z  dz  ed<   d Zy)InputFeaturesa  
    A single set of features of data. Property names are the same names as the corresponding inputs to a model.

    Args:
        input_ids: Indices of input sequence tokens in the vocabulary.
        attention_mask: Mask to avoid performing attention on padding token indices.
            Mask values selected in `[0, 1]`: Usually `1` for tokens that are NOT MASKED, `0` for MASKED (padded)
            tokens.
        token_type_ids: (Optional) Segment token indices to indicate first and second
            portions of the inputs. Only some models use them.
        label: (Optional) Label corresponding to the input. Int for classification problems,
            float for regression problems.
    	input_idsNattention_masktoken_type_idsr   c                 X    t        j                  t        j                  |             dz   S )r   r   r   r   s    r   r   zInputFeatures.to_json_stringI   s!    zz+,,T23d::r   )r   r   r   r   listintr!   r'   r(   r   floatr   r"   r   r   r%   r%   4   sQ     Cy'+NDI$+'+NDI$+ $E3;$;r   r%   c                   F    e Zd ZdZd Zd Zd Zd Zd Zd Z	e
d
d	       Zy)DataProcessorzEBase class for data converters for sequence classification data sets.c                     t               )z
        Gets an example from a dict.

        Args:
            tensor_dict: Keys and values should match the corresponding Glue
                tensorflow_dataset examples.
        NotImplementedError)r   tensor_dicts     r   get_example_from_tensor_dictz*DataProcessor.get_example_from_tensor_dictQ   s     "##r   c                     t               )z8Gets a collection of [`InputExample`] for the train set.r0   r   data_dirs     r   get_train_examplesz DataProcessor.get_train_examples[       !##r   c                     t               )z6Gets a collection of [`InputExample`] for the dev set.r0   r5   s     r   get_dev_exampleszDataProcessor.get_dev_examples_   r8   r   c                     t               )z7Gets a collection of [`InputExample`] for the test set.r0   r5   s     r   get_test_exampleszDataProcessor.get_test_examplesc   r8   r   c                     t               )z*Gets the list of labels for this data set.r0   r   s    r   
get_labelszDataProcessor.get_labelsg   r8   r   c                     t        | j                               dkD  r+| j                         t        |j                           |_        |S )z
        Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are. This method converts
        examples to the correct format.
           )lenr>   r+   r   )r   examples     r   tfds_mapzDataProcessor.tfds_mapk   s9    
 t !A% OO-c'--.@AGMr   Nc                     t        |dd      5 }t        t        j                  |d|            cddd       S # 1 sw Y   yxY w)z!Reads a tab separated value file.rz	utf-8-sig)encoding	)	delimiter	quotecharN)openr*   csvreader)cls
input_filerI   fs       r   	_read_tsvzDataProcessor._read_tsvt   s@     *cK8 	LA

1	JK	L 	L 	Ls	   !:AN)r   r   r   r   r3   r7   r:   r<   r>   rC   classmethodrP   r"   r   r   r.   r.   N   s9    O$$$$$ L Lr   r.   c                   |    e Zd ZdZddZd Zd Ze	 dd       Zedd       Z		 	 	 	 	 	 	 ddZ
	 dd	Z	 	 	 	 	 dd
Zy)%SingleSentenceClassificationProcessorz@Generic processor for a single sentence classification data set.Nc                 L    |g n|| _         |g n|| _        || _        || _        y rQ   )labelsexamplesmodeverbose)r   rV   rW   rX   rY   s        r   __init__z.SingleSentenceClassificationProcessor.__init__~   s+    "Nb&.H	r   c                 ,    t        | j                        S rQ   )rA   rW   r   s    r   __len__z-SingleSentenceClassificationProcessor.__len__   s    4==!!r   c                     t        |t              r$t        | j                  | j                  |         S | j                  |   S )N)rV   rW   )
isinstanceslicerT   rV   rW   )r   idxs     r   __getitem__z1SingleSentenceClassificationProcessor.__getitem__   s9    c5!8VZVcVcdgVhii}}S!!r   c           
      H     | di |}|j                  ||||||dd       |S )NT)
split_namecolumn_labelcolumn_text	column_idskip_first_rowoverwrite_labelsoverwrite_examplesr"   )add_examples_from_csv)	rM   	file_namerc   rd   re   rf   rg   kwargs	processors	            r   create_from_csvz5SingleSentenceClassificationProcessor.create_from_csv   sB     M&M	''!%#)!# 	( 		
 r   c                 <     | di |}|j                  ||       |S )N)rV   r"   )add_examples)rM   texts_or_text_and_labelsrV   rl   rm   s        r   create_from_examplesz:SingleSentenceClassificationProcessor.create_from_examples   s'    M&M	7Gr   c	                 X   | j                  |      }	|r|	dd  }	g }
g }g }t        |	      D ]i  \  }}|
j                  ||          |j                  ||          ||j                  ||          E|r| d| n
t        |      }|j                  |       k | j	                  |
||||      S )Nr@   -)rh   ri   )rP   	enumerateappendr    rp   )r   rk   rc   rd   re   rf   rg   rh   ri   linestextsrV   idsiliner	   s                   r   rj   z;SingleSentenceClassificationProcessor.add_examples_from_csv   s     y)!"IE ' 	!GAtLLk*+MM$|,-$

4	?+.8*Qqc*c!f

4 	!   631AVh ! 
 	
r   c           	         |:t        |      t        |      k7  r#t        dt        |       dt        |             |:t        |      t        |      k7  r#t        dt        |       dt        |             |d gt        |      z  }|d gt        |      z  }g }t               }t        |||      D ]U  \  }}	}
t	        |t
        t        f      r|	|\  }}	n|}|j                  |	       |j                  t        |
|d |	             W |r|| _
        n| j                  j                  |       |rt        |      | _        | j                  S t        t        | j                        j                  |            | _        | j                  S )Nz(Text and labels have mismatched lengths z and z%Text and ids have mismatched lengths )r	   r
   r   r   )rA   
ValueErrorsetzipr^   tupler*   addrv   r   rW   extendrV   union)r   rq   rV   ry   rh   ri   rW   added_labelstext_or_text_and_labelr   r	   texts               r   rp   z2SingleSentenceClassificationProcessor.add_examples   s    #&>"?3v;"N:3?W;X:YY^_bci_j^kl  ?s#;<CHDSIaEbDcchilmpiqhrstt;&3788C>Vc":;;Fu367OQWY\3] 	\/"E405$-@U]4e-U#OOLd4TYZ[	\ $DMMM  * |,DK }} s4;;/55lCDDK}}r   c                 	   ||j                   }t        | j                        D ci c]  \  }}||
 }	}}g }
t        | j                        D ]h  \  }}|dz  dk(  rt        j                  d|        |j                  |j                  dt        ||j                               }|
j                  |       j t        d |
D              }g }t        t        |
| j                              D ]?  \  }\  }}|dz  dk(  r.t        j                  d| d	t        | j                                |rd
ndgt        |      z  }|t        |      z
  }|r|g|z  |z   }|rdnd
g|z  |z   }n||g|z  z   }||rdnd
g|z  z   }t        |      |k7  rt        dt        |       d|       t        |      |k7  rt        dt        |       d|       | j                  dk(  r|	|j                     }n:| j                  dk(  rt!        |j                        }nt        | j                        |dk  r| j"                  rt        j                  d       t        j                  d|j$                          t        j                  ddj'                  |D cg c]  }t)        |       c}              t        j                  ddj'                  |D cg c]  }t)        |       c}              t        j                  d|j                   d| d       |j                  t+        |||             B ||S |dk(  rt-               st/        d      ddl}ddlm} |j7                  |D cg c]  }|j8                   c}|j:                        }
|j7                  |D cg c]  }|j<                   c}|j:                        }| j                  dk(  r6|j7                  |D cg c]  }|j                   c}|j:                        }nD| j                  dk(  r5|j7                  |D cg c]  }|j                   c}|j                         } ||
|      }|S t        d      c c}}w c c}w c c}w c c}w c c}w c c}w c c}w )a  
        Convert examples in a list of `InputFeatures`

        Args:
            tokenizer: Instance of a tokenizer that will tokenize the examples
            max_length: Maximum example length
            pad_on_left: If set to `True`, the examples will be padded on the left rather than on the right (default)
            pad_token: Padding token
            mask_padding_with_zero: If set to `True`, the attention mask will be filled by `1` for actual values
                and by `0` for padded values. If set to `False`, inverts it (`1` for padded values, `0` for actual
                values)

        Returns:
            Will return a list of task-specific `InputFeatures` which can be fed to the model.

        Ni'  r   zTokenizing example T)add_special_tokens
max_lengthc              3   2   K   | ]  }t        |        y wrQ   )rA   ).0r&   s     r   	<genexpr>zESingleSentenceClassificationProcessor.get_features.<locals>.<genexpr>  s     Ii3y>Is   zWriting example /r@   zError with input length z vs classification
regression   z*** Example ***zguid: zinput_ids:  zattention_mask: zlabel: z (id = ))r&   r'   r   ptz8return_tensors set to 'pt' but PyTorch can't be imported)TensorDataset)dtypez)return_tensors should be `'pt'` or `None`)max_lenru   rV   rW   loggerinfoencoder
   minrv   maxr   rA   r}   rX   r   r,   rY   r	   joinr    r%   r   RuntimeErrortorchtorch.utils.datar   tensorr&   longr'   )r   	tokenizerr   pad_on_left	pad_tokenmask_padding_with_zeroreturn_tensorsrz   r   	label_mapall_input_idsex_indexrB   r&   batch_lengthfeaturesr'   padding_lengthxr   r   rO   all_attention_mask
all_labelsdatasets                            r   get_featuresz2SingleSentenceClassificationProcessor.get_features   s,   2 "**J.7.DE(!UUAXE	E!*4==!9 		,Hg%1$1(<=!((#'z9+<+<= ) I
   +		, I=II.7M4==8Y.Z #	l*H*y'%1$.xj#dmm:L9MNO $:aqAC	NRN *C	N:N'[>9YF	(>1A"F"W[i!i%)~)EF	!/9OAUV3WZh3h!i9~- #;C	N;K4P\~!^__>"l2 #;C<O;PPTUaTb!cddyy,,!'--0l*gmm, ++!|-.fW\\N34k#((I3NqCF3N*O)PQR.sxx8XAQ8X/Y.Z[\ggmm_GE7!DEOOMIndijkG#	lJ !Ot#%'"#]^^6!LLx)H!!++)HPUPZPZL[M!&.RAq/?/?.RZ_ZdZd!eyy,,"\\H*Eq177*EUZZ\X
l*"\\H*Eq177*EU[[\Y
#M3EzRGNHIIQ F` 4O8X *I.R*E*Es)   Q"Q(
Q-'Q2Q7 Q<%R)NNr   F) r   r@   NFrQ   )r   r   r@   NFFF)NNFF)NFr   TN)r   r   r   r   rZ   r\   ra   rR   rn   rr   rj   rp   r   r"   r   r   rT   rT   {   s    J""
 ej      
> kp#P #dJr   rT   )rK   r   r   r   utilsr   r   
get_loggerr   r   r   r%   r.   rT   r"   r   r   <module>r      s        ! 0 
		H	% E E E0 $; ; ;2*L *LZOJM OJr   