
    qiC                         d dl Z d dlZd dlZddlmZ ddlmZmZm	Z	 ddl
mZmZmZmZmZ  e       rd dlZddlmZmZ  G d d	e      Z e ed
             G d de             Zy)    N   )GenerationConfig)add_end_docstringsis_torch_availablerequires_backends   )ArgumentHandlerDatasetPipelinePipelineExceptionbuild_pipeline_init_args),MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES0MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMESc                       e Zd ZdZddZy)%TableQuestionAnsweringArgumentHandlerzB
    Handles arguments for the TableQuestionAnsweringPipeline
    Nc                    t        | d       dd l}|t        d      |t        |t              r&|j                  d      |j                  d      |g}nt        |t              rt        |      dkD  rrt        d |D              st        dd |D               |d   j                  d      |d   j                  d      |}npt        d	|d   j                          d
      t        t        |t              st        |t        j                        r|S t        dt        |       d      ||dg}|D ]C  }t        |d   |j                        r|d   t        d      |j                  |d         |d<   E |S )Npandasr   z(Keyword argument `table` cannot be None.querytablec              3   <   K   | ]  }t        |t                y wN)
isinstancedict.0ds     a/opt/pipecat/venv/lib/python3.12/site-packages/transformers/pipelines/table_question_answering.py	<genexpr>zATableQuestionAnsweringArgumentHandler.__call__.<locals>.<genexpr>-   s     >1:a.>s   z:Keyword argument `table` should be a list of dict, but is c              3   2   K   | ]  }t        |        y wr   )typer   s     r   r   zATableQuestionAnsweringArgumentHandler.__call__.<locals>.<genexpr>/   s     UmbcVZ[\V]Ums   zIf keyword argument `table` is a list of dictionaries, each dictionary should have a `table` and `query` key, but only dictionary has keys z `table` and `query` keys.zZInvalid input. Keyword argument `table` should be either of type `dict` or `list`, but is ))r   r   zTable cannot be None.)r   r   
ValueErrorr   r   getlistlenallkeysr
   typesGeneratorTyper    	DataFrame)selfr   r   kwargspdtqa_pipeline_inputstqa_pipeline_inputs          r   __call__z.TableQuestionAnsweringArgumentHandler.__call__   s    	$)=GHH]%&599W+=+IeiiX_N`Nl',g#E4(SZ!^>>>$TUmglUmTno  8<<(4qg9N9Z*/'$JJOPQ(--/IZZtv  $E7)CzRWY^YlYlGm u+a) 
 .3U#C"D"5 	X092<<H%g.6$%<==.0ll;Mg;V.W"7+	X #"    )NN)__name__
__module____qualname____doc__r0    r1   r   r   r      s    -#r1   r   T)has_tokenizerc                        e Zd ZdZdZdZdZdZdZdZ	 e
d      Z e       f fd	Zd Zd	 Z fd
ZddZddZddZd Z xZS )TableQuestionAnsweringPipelinea  
    Table Question Answering pipeline using a `ModelForTableQuestionAnswering`. This pipeline is only available in
    PyTorch.

    Unless the model you're using explicitly sets these generation parameters in its configuration files
    (`generation_config.json`), the following default values will be used:
    - max_new_tokens: 256

    Example:

    ```python
    >>> from transformers import pipeline

    >>> oracle = pipeline(model="google/tapas-base-finetuned-wtq")
    >>> table = {
    ...     "Repository": ["Transformers", "Datasets", "Tokenizers"],
    ...     "Stars": ["36542", "4512", "3934"],
    ...     "Contributors": ["651", "77", "34"],
    ...     "Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
    ... }
    >>> oracle(query="How many stars does the transformers repository have?", table=table)
    {'answer': 'AVERAGE > 36542', 'coordinates': [(0, 1)], 'cells': ['36542'], 'aggregator': 'AVERAGE'}
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This tabular question answering pipeline can currently be loaded from [`pipeline`] using the following task
    identifier: `"table-question-answering"`.

    The models that this pipeline can use are models that have been fine-tuned on a tabular question answering task.
    See the up-to-date list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=table-question-answering).
    ztable,queryTF   )max_new_tokensc                    t        |   di | || _        t        j                         }|j                  t               | j                  |       t        | j                  j                  dd       xr! t        | j                  j                  dd       | _        t        | j                  j                  d      rd| _        y d | _        y )Naggregation_labelsnum_aggregation_labelstapasr6   )super__init___args_parserr   copyupdater   check_model_typegetattrmodelconfig	aggregatehasattrr    )r+   args_parserr,   mapping	__class__s       r   rA   z'TableQuestionAnsweringPipeline.__init__}   s    "6"'BGGICDg& !2!24H$O 
T[JJ7U
  'tzz'8'8:NOG	UY	r1   c                 &     | j                   di |S )Nr6   )rG   )r+   inputss     r   batch_inferencez.TableQuestionAnsweringPipeline.batch_inference   s    tzz#F##r1   c           	         g }g }d}|d   j                   d   }|d   j                  | j                        }|d   j                  | j                        }|d   j                  | j                        }d}	t        |      D ]+  }
|'|	dddf   }t	        j
                  |j                         j                               }||
   }	t        |j                   d         D ]{  }|	dddf   j                         |   }|	dddf   j                         |   dz
  }|	dddf   j                         |   dz
  }|dk\  s]|dk\  sc|dk(  sit        |||f         ||<   } t        j                  |      j                  t        j                        j                  | j                        |	dddf<   ||
   }||
   }||
   }	| j                  |j                  d      |j                  d      |	j                  d      	      }|j                   }| j"                  r|j%                  |j&                         |j%                  |       t        j(                  j+                  |
      }|j,                  |j                  t        j.                        j                  |j,                  j                        z  }t1        j2                  t4              }t7        |j9                         j                               D ]  \  }}|	dddf   j                         |   }|	dddf   j                         |   dz
  }|	dddf   j                         |   dz
  }|dk\  s`|dk\  sf|dk(  sl|||f   j%                  |        |D ci c],  }|t	        j:                  ||         j=                         dkD  . }}. t        j>                  tA        |      d      }| j"                  s|fS |t        j>                  tA        |      d      fS c c}w )z
        Inference used for models that need to process sequences in a sequential fashion, like the SQA models which
        handle conversational query related to a table.
        N	input_idsr   attention_masktoken_type_ids   r   r   )rR   rS   rT   )logitsg      ?)!shapetodevicerangenp
zeros_likecpunumpytolistinttorch
from_numpyr    longrG   	unsqueezerV   rI   appendlogits_aggregationdistributions	Bernoulliprobsfloat32collectionsdefaultdictr$   	enumeratesqueezearraymeancattuple)r+   rO   
all_logitsall_aggregationsprev_answers
batch_sizerR   rS   rT   token_type_ids_exampleindexprev_labels_examplemodel_labelsi
segment_idcol_idrow_idinput_ids_exampleattention_mask_exampleoutputsrV   dist_per_tokenprobabilitiescoords_to_probspcolrowkeylogits_batchs                                r   sequential_inferencez3TableQuestionAnsweringPipeline.sequential_inference   s   
 
K(..q1
;'**4;;7	 0144T[[A 0144T[[A!%:& .	iE '&<QT&B#!}}-@-D-D-F-L-L-NO)7)>&|11!45 NA!71!=!D!D!Fq!IJ3AqD9@@B1EIF3AqD9@@B1EIF{v{zQ*-lFF;K.L*MQN 05/?/?/M/R/RSXS]S]/^/a/abfbmbm/n&q!t, )% 0%3E%:"%3E%:"jj+55a85??B5??B ! G
 ^^F~~ ''(B(BCf%"00::&:IN*003I3N3Nu}}3]3`3`$$++4 M *55d;O!-"7"7"9"@"@"BC :13AqD9@@B1E
,QT299;A>B,QT299;A>B!8qZ1_#S#J/66q9: YhhQTC/#*>!?!D!D!F!LLhLh].	i` yyz!2A6&*nno<SXYiSjlmIn:oo	 is   91O>c                 r     | j                   |i |}t        |   |fi |}t        |      dk(  r|d   S |S )a  
        Answers queries according to a table. The pipeline accepts several types of inputs which are detailed below:

        - `pipeline(table, query)`
        - `pipeline(table, [query])`
        - `pipeline(table=table, query=query)`
        - `pipeline(table=table, query=[query])`
        - `pipeline({"table": table, "query": query})`
        - `pipeline({"table": table, "query": [query]})`
        - `pipeline([{"table": table, "query": query}, {"table": table, "query": query}])`

        The `table` argument should be a dict or a DataFrame built from that dict, containing the whole table:

        Example:

        ```python
        data = {
            "actors": ["brad pitt", "leonardo di caprio", "george clooney"],
            "age": ["56", "45", "59"],
            "number of movies": ["87", "53", "69"],
            "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
        }
        ```

        This dictionary can be passed in as such, or can be converted to a pandas DataFrame:

        Example:

        ```python
        import pandas as pd

        table = pd.DataFrame.from_dict(data)
        ```

        Args:
            table (`pd.DataFrame` or `Dict`):
                Pandas DataFrame or dictionary that will be converted to a DataFrame containing all the table values.
                See above for an example of dictionary.
            query (`str` or `list[str]`):
                Query or list of queries that will be sent to the model alongside the table.
            sequential (`bool`, *optional*, defaults to `False`):
                Whether to do inference sequentially or as a batch. Batching is faster, but models like SQA require the
                inference to be done sequentially to extract relations within sequences, given their conversational
                nature.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
                Activates and controls padding. Accepts the following values:

                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence if provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).

            truncation (`bool`, `str` or [`TapasTruncationStrategy`], *optional*, defaults to `False`):
                Activates and controls truncation. Accepts the following values:

                - `True` or `'drop_rows_to_fit'`: Truncate to a maximum length specified with the argument `max_length`
                  or to the maximum acceptable input length for the model if that argument is not provided. This will
                  truncate row by row, removing rows from the table.
                - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
                  greater than the model maximum admissible input size).


        Return:
            A dictionary or a list of dictionaries containing results: Each result is a dictionary with the following
            keys:

            - **answer** (`str`) -- The answer of the query given the table. If there is an aggregator, the answer will
              be preceded by `AGGREGATOR >`.
            - **coordinates** (`list[tuple[int, int]]`) -- Coordinates of the cells of the answers.
            - **cells** (`list[str]`) -- List of strings made up of the answer cell values.
            - **aggregator** (`str`) -- If the model has an aggregator, this returns the aggregator.
        r   r   )rB   r@   r0   r%   )r+   argsr,   pipeline_inputsresultsrM   s        r   r0   z'TableQuestionAnsweringPipeline.__call__   sL    V ,$++T<V<'"?=f=w<11:r1   c                     i }|||d<   |||d<   i }|||d<   t        | dd       | j                  |d<   t        | dd       | j                  |d<   | j                  |d<   ||i fS )Npadding
truncation
sequentialassistant_modelassistant_tokenizer	tokenizer)rF   r   r   r   )r+   r   r   r   r,   preprocess_paramsforward_paramss          r   _sanitize_parametersz3TableQuestionAnsweringPipeline._sanitize_parameters"  s    +2i(!.8l+!+5N<(4*D1=040D0DN,-4.5A*...N;'484L4LN01 ."44r1   c                     || j                   dk(  rd}nd}|d   |d   }}|j                  rt        d      ||dk(  rt        d      | j                  ||d	||
      }||d<   |S )Nr?   drop_rows_to_fitdo_not_truncater   r   ztable is empty zquery is emptypt)return_tensorsr   r   )r    emptyr"   r   )r+   pipeline_inputr   r   r   r   rO   s          r   
preprocessz)TableQuestionAnsweringPipeline.preprocess5  s    yyG#/
.
%g.w0Gu;;-..=ERK-..uTjbijwr1   c                    |j                  d      }| j                  dk(  r(|r | j                  di |}nD | j                  di |}n1d|vr| j                  |d<    | j
                  j                  di ||}|||d}|S )Nr   r?   generation_config)model_inputsr   r   r6   )popr    r   rP   r   rG   generate)r+   r   r   generate_kwargsr   r   model_outputss          r   _forwardz'TableQuestionAnsweringPipeline._forwardE  s      )993$33ClC.$..>> #/97;7M7M 34)djj))LLLOLG)5RYZr1   c                 j   |d   }|d   }|d   }| j                   dk(  r| j                  r|d d \  }}| j                  j                  |||      }|\  }}	t	        |	      D 
ci c])  \  }
}|
| j
                  j                  j                  |   + }}
}| j
                  j                  j                  }t	        |	      D 
ci c]  \  }
}||k7  s|
||
   dz    }}
}n*|d   }| j                  j                  ||      }|d   }i }i }g }t	        |      D ]  \  }}|D cg c]  }|j                  |    }}|j                  |d      }|j                  |d      }|d	j                  |      z   ||D cg c]  }|j                  |    c}d
}|r||d<   |j                  |        t              dk(  rMt        d| j
                  j                  d      | j                  j!                  |d      D cg c]  }d|i }}t        |      dkD  r|S |d   S c c}}
w c c}}
w c c}w c c}w c c}w )Nr   r   r   r?   r   z > r   r   z, )answercoordinatescells
aggregatorzTable question answeringzEmpty answerT)skip_special_tokensr   r   )r    rI   r   convert_logits_to_predictionsrm   rG   rH   r=   no_aggregation_label_indexiatr#   joinre   r%   r   name_or_pathbatch_decode)r+   r   rO   r   r   rV   
logits_aggpredictionsanswer_coordinates_batchagg_predictionsr{   predaggregatorsno_agg_label_indexaggregators_prefixanswersrx   r   
coordinater   r   aggregator_prefixr   s                          r   postprocessz*TableQuestionAnsweringPipeline.postprocessV  sc   ~.g&	*99~~%,Ra["
"nnJJ6SY[ef<G9(/\efu\vwQXQRTXq$**"3"3"F"Ft"LLww%)ZZ%6%6%Q%Q"=F=W&29!T[_cu[uA{1~--&" & !"nnJJ6SYZ+6q>( %'"G&/0H&I '"{ALM::.MM(__UB7
$6$:$:5"$E!/$))E2BB#.FQR
eii
3R
 +5F<(v&' 6{a'(BDJJD[D[]kll8<8S8ST[qu8S8vwf&)wGwg,*w:
:? x& N S xs$   $.HH H !H&5H+
8H0)NNN)TN)F)r2   r3   r4   r5   default_input_names_pipeline_calls_generate_load_processor_load_image_processor_load_feature_extractor_load_tokenizerr   _default_generation_configr   rA   rP   r   r0   r   r   r   r   __classcell__)rM   s   @r   r9   r9   M   sr     D (#O!#O!1" $I#J Z$ApFPd5& "(;r1   r9   )rk   r(   r^   r[   
generationr   utilsr   r   r   baser	   r
   r   r   r   ra   models.auto.modeling_autor   r   r   r9   r6   r1   r   <module>r      sk       ) 
 b a 2#O 2#j ,4@Ap;X p; Bp;r1   