
    qi'                      Z    d Z ddlmZ ddlmZ ddlmZ g dZe G d de             ZdgZ	y)	z
Processor class for EVOLLA.
   )BatchFeature)ProcessorMixin)auto_docstring)aa_seqfoldseekmsac                        e Zd Zd fd	ZddZ	 ddefdZe	 	 	 	 ddee	   e	z  dz  deee	      ee	   z  dz  dedz  dedz  fd	       Z
d
 Zd Zd Zd Z xZS )EvollaProcessorNc                     |t        d      |t        d      t        | 	  ||       d| j                  _        || _        || _        y)a  
        protein_tokenizer (`EsmTokenizer`):
            An instance of [`EsmTokenizer`]. The protein tokenizer is a required input.
        protein_max_length (`int`, *optional*, defaults to 1024):
            The maximum length of the sequence to be generated.
        text_max_length (`int`, *optional*, defaults to 512):
            The maximum length of the text to be generated.
        Nz+You need to specify an `protein_tokenizer`.z"You need to specify a `tokenizer`.z<|reserved_special_token_0|>)
ValueErrorsuper__init__	tokenizer	pad_tokenprotein_max_lengthtext_max_length)selfprotein_tokenizerr   r   r   kwargs	__class__s         ^/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/evolla/processing_evolla.pyr   zEvollaProcessor.__init__   sU     $JKKABB*I6#A "4.    c           
      P   g }|D ]  }|j                  d      }|j                  d      }dj                  t        ||      D cg c]&  \  }}|j                         |j	                         z   ( c}}      }	|j                  |	        | j                  |dd|d      }
|
S c c}}w )Nr   r    ptT)return_tensors
truncation
max_lengthpadding)getjoinzipupperlowerappendr   )r   proteinsr   sa_sequencesproteinr   r   sfsa_sequence	sa_tokenss              r   process_proteinsz EvollaProcessor.process_proteins2   s     	-G[[*F{{:.H''SQYEZ"[TQ1779qwwy#8"[\K,		- **$K]gk + 
	  #\s   +B"r   c                     g }|D ]1  }| j                   j                  |dd      }|j                  |       3 | j                  |dddd|      }|S )NFT)tokenizeadd_generation_promptr   longest)add_special_tokensr   r   r   r   )r   apply_chat_templater%   )r   textsr   promptsmessagespromptprompt_inputss          r   process_textzEvollaProcessor.process_text?   sw    
  	#H^^77&* 8 F
 NN6"	# $& ' 
 r   r&   messages_listr   c                    ||t        d      ||n| j                  }||n| j                  }t        |t              r|g}t        |t
        t        f      rt        |d   t
        t        f      s|g}t        |t
        t        f      rt        d |D              st        d      t        |t
        t        f      r6t        d |D              s$t        ddj                  t               d|       t        |t
        t        f      r|D ]  }t        |t
        t        f      st        d	t        |       d
      t        d |D              st        d      t        d |D              st        d |D              sst        d|        nt        dt        |       d
      | j                  ||      }| j                  ||      }t        |d   |d   |d   |d   d      S )a  
        proteins (`Union[List[dict], dict]`):
            A list of dictionaries or a single dictionary containing the following keys:
                - `"aa_seq"` (`str`) -- The amino acid sequence of the protein.
                - `"foldseek"` (`str`) -- The foldseek string of the protein.
        messages_list (`Union[List[List[dict]], List[dict]]`):
            A list of lists of dictionaries or a list of dictionaries containing the following keys:
                - `"role"` (`str`) -- The role of the message.
                - `"content"` (`str`) -- The content of the message.
        protein_max_length (`int`, *optional*, defaults to 1024):
            The maximum length of the sequence to be generated.
        text_max_length (`int`, *optional*, defaults to 512):
            The maximum length of the text.

        Return:
            a dict with following keys:
                - `protein_input_ids` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The input IDs for the protein sequence.
                - `protein_attention_mask` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The attention mask for the protein sequence.
                - `text_input_ids` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The input IDs for the text sequence.
                - `text_attention_mask` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The attention mask for the text sequence.
        z3You need to specify `messages_list` and `proteins`.    c              3   <   K   | ]  }t        |t                y wN
isinstancedict.0ps     r   	<genexpr>z+EvollaProcessor.__call__.<locals>.<genexpr>   s     :aST:a;N:a   zUThe proteins should be a list of dictionaries, but not all elements are dictionaries.c              3   \   K   | ]$  }t        d  |j                         D               & yw)c              3   ,   K   | ]  }|t         v   y wr>   )PROTEIN_VALID_KEYS)rC   ks     r   rE   z5EvollaProcessor.__call__.<locals>.<genexpr>.<genexpr>   s     :A'':s   N)allkeysrB   s     r   rE   z+EvollaProcessor.__call__.<locals>.<genexpr>   s&      ;
?@C:::;
s   *,z2There should be a list of dictionaries with keys: z, z for each protein.But got: z;Each messages in messages_list should be a list instead of .c              3   <   K   | ]  }t        |t                y wr>   r?   rC   ms     r   rE   z+EvollaProcessor.__call__.<locals>.<genexpr>   s     A1:a.ArF   zfEach message in messages_list should be a list of dictionaries, but not all elements are dictionaries.c              3   T   K   | ]   }t        |j                               d k7   " yw)   N)lenrL   rO   s     r   rE   z+EvollaProcessor.__call__.<locals>.<genexpr>   s     <as1668})<s   &(c              3   X   K   | ]"  }t        |j                               d dhk7   $ yw)rolecontentN)setrL   rO   s     r   rE   z+EvollaProcessor.__call__.<locals>.<genexpr>   s+      D=>CMfi%88Ds   (*zlEach message in messages_list should be a list of dictionaries with two keys: 'role' and 'content'.But got: zFThe messages_list should be a list of lists of dictionaries, but it's 	input_idsattention_mask)protein_input_idsprotein_attention_maskrX   rY   )data)r   r   r   r@   rA   listtuplerK   r!   rI   	TypeErrortypeanyr-   r9   r   )	r   r&   r:   r   r   r   r6   r,   text_tokenss	            r   __call__zEvollaProcessor.__call__W   s+   > }4RSS3E3Q/W[WnWn-<-H/dNbNb h% zHmdE]3J}UVGWZ^`eYf<g*OMhu.s:aX`:a7atuuhu.s ;
DL;
 8
 D99/01 2$:'  mdE]3) !(T5M:#&abfgobpaqqr$sttAAA$ A  <8<< DBJD A %$$,:/  XY]^kYlXmmno  ))(4FG	''G%.{%;*34D*E(5"-.>"?	
 	
r   c                 :     | j                   j                  |i |S r>   )r   batch_decoder   argsr   s      r   re   zEvollaProcessor.batch_decode   s    *t~~**D;F;;r   c                 :     | j                   j                  |i |S r>   )r   decoderf   s      r   ri   zEvollaProcessor.decode   s    $t~~$$d5f55r   c                 :     | j                   j                  |i |S r>   )r   re   rf   s      r   protein_batch_decodez$EvollaProcessor.protein_batch_decode   s     2t%%22DCFCCr   c                 :     | j                   j                  |i |S r>   )r   ri   rf   s      r   protein_decodezEvollaProcessor.protein_decode   s     ,t%%,,d=f==r   )N      )rn   )ro   )NNNN)__name__
__module____qualname__r   r-   intr9   r   r]   rA   rc   re   ri   rk   rm   __classcell__)r   s   @r   r
   r
      s    /(   # 0  .2>B)-&*T
t*t#d*T
 DJ'$t*4t;T
  $J	T

 tT
 T
l<6D>r   r
   N)
__doc__feature_extraction_utilsr   processing_utilsr   utilsr   rI   r
   __all__ r   r   <module>r{      sJ    5 $ 3  [>n [> [>| 
r   