
    qi7                         d Z ddlZddlZddlZddlmZ ddlmZ ddl	m
Z
 ddlmZmZ ddlmZ d	d
lmZ  ej$                  e      Ze G d de             ZdgZy)z
Processor class for Bark
    N   )BatchFeature)ProcessorMixin)BatchEncoding)auto_docstringlogging)cached_file   )AutoTokenizerc                        e Zd ZddddZd fd	Ze	 dd       Z	 	 	 ddef fdZdd	e	dz  fd
Z
dd	edz  fdZedefd       ZddefdZe	 	 	 	 	 	 	 ddefd       Z xZS )BarkProcessor   r
   semantic_promptcoarse_promptfine_promptNc                 2    t         |   |       || _        y)a*  
        speaker_embeddings (`dict[dict[str]]`, *optional*):
            Optional nested speaker embeddings dictionary. The first level contains voice preset names (e.g
            `"en_speaker_4"`). The second level contains `"semantic_prompt"`, `"coarse_prompt"` and `"fine_prompt"`
            embeddings. The values correspond to the path of the corresponding `np.ndarray`. See
            [here](https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c) for
            a list of `voice_preset_names`.
        N)super__init__speaker_embeddings)self	tokenizerr   	__class__s      Z/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/bark/processing_bark.pyr   zBarkProcessor.__init__*   s     	#"4    c                 T   |j                  d      }|t        |||j                  dd      |j                  dd      |j                  dd      |j                  dd      |j                  dd      ||j                  d	d      ddd
      }|:t        j	                  dt
        j                  j                  ||       d       d}n,t        |      5 }t        j                  |      }ddd       nd}	d|v r||d<   t        j                  |fi |} | ||      S # 1 sw Y   4xY w)a  
        Instantiate a Bark processor associated with a pretrained model.

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                This can be either:

                - a string, the *model id* of a pretrained [`BarkProcessor`] hosted inside a model repo on
                  huggingface.co.
                - a path to a *directory* containing a processor saved using the [`~BarkProcessor.save_pretrained`]
                  method, e.g., `./my_model_directory/`.
            speaker_embeddings_dict_path (`str`, *optional*, defaults to `"speaker_embeddings_path.json"`):
                The name of the `.json` file containing the speaker_embeddings dictionary located in
                `pretrained_model_name_or_path`. If `None`, no speaker_embeddings is loaded.
            **kwargs
                Additional keyword arguments passed along to both
                [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`].
        tokenN	subfolder	cache_dirforce_downloadFproxieslocal_files_onlyrevision
r   r   r    r!   r"   r   r#    _raise_exceptions_for_gated_repo%_raise_exceptions_for_missing_entries'_raise_exceptions_for_connection_errors`z` does not exists
                    , no preloaded speaker embeddings will be used - Make sure to provide a correct path to the json
                    dictionary if wanted, otherwise set `speaker_embeddings_dict_path=None`.repo_or_path)r   r   )getr	   poploggerwarningospathjoinopenjsonloadr   from_pretrained)	cls!pretrained_processor_name_or_pathspeaker_embeddings_dict_pathkwargsr   speaker_embeddings_pathr   speaker_embeddings_jsonr   s	            r   r4   zBarkProcessor.from_pretrained7   sO   , 

7#'3&11, **[$7 **[$7%zz*:EB

9d3!',>!FJ5166;8='# '."'',,'HJfgh i] `
 &*"12 L6M)-3J)K&L L "&)!335V">2!112S^W]^	Y;MNNL Ls   DD'push_to_hubc                    | j                   .t        j                  t        j                  j	                  ||d      d       i }||d<   | j
                  D ]  }| j                  |      }i }	| j                   |   D ]m  }
t        j                  t        j                  j	                  |d   || d|
       ||
   d       t        j                  j	                  || d|
 d	      |	|
<   o |	||<    t        t        j                  j	                  ||      d
      5 }t        j                  ||       ddd       t        | 4  ||fi | y# 1 sw Y   xY w)a|  
        Saves the attributes of this processor (tokenizer...) in the specified directory so that it can be reloaded
        using the [`~BarkProcessor.from_pretrained`] method.

        Args:
            save_directory (`str` or `os.PathLike`):
                Directory where the tokenizer files and the speaker embeddings will be saved (directory will be created
                if it does not exist).
            speaker_embeddings_dict_path (`str`, *optional*, defaults to `"speaker_embeddings_path.json"`):
                The name of the `.json` file that will contains the speaker_embeddings nested path dictionary, if it
                exists, and that will be located in `pretrained_model_name_or_path/speaker_embeddings_directory`.
            speaker_embeddings_directory (`str`, *optional*, defaults to `"speaker_embeddings/"`):
                The name of the folder in which the speaker_embeddings arrays will be saved.
            push_to_hub (`bool`, *optional*, defaults to `False`):
                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
                namespace).
            kwargs:
                Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
        Nv2T)exist_okr)   _F)allow_picklez.npyw)r   r.   makedirsr/   r0   available_voice_presets_load_voice_presetnpsaver1   r2   dumpr   save_pretrained)r   save_directoryr7   speaker_embeddings_directoryr;   r8   embeddings_dict
prompt_keyvoice_presettmp_dictkeyfpr   s               r   rH   zBarkProcessor.save_pretrainedq   sa   8 "".KK^5QSWXcgh O.<ON+":: 7
#66zB22:> jCGG+N;=Y^h]iijknjo[p %S)%* %'GGLL1MR\Q]]^_b^ccgOh$iHSMj /7
+7  bggll>3OPRUV /Z\		/2./ 	FvF/ /s   EErM   c                 ~   | j                   |   }i }|j                  d      }dD ]  }||vrt        d| d| d      t        | j                   j                  dd      ||   |j	                  dd       |j	                  d	d       |j	                  d
d      |j	                  dd       |j	                  dd      ||j	                  dd       ddd      }|Mt        dt
        j                  j                  | j                   j                  dd      ||          d| d      t        j                  |      ||<    |S )Nr   r   #Voice preset unrecognized, missing z% as a key in self.speaker_embeddings[z].r)   /r   r   r    Fr!   r"   r#   r$   r(   z{` does not exists
                    , no preloaded voice preset will be used - Make sure to provide correct paths to the z 
                    embeddings.)
r   r*   
ValueErrorr	   r+   r.   r/   r0   rE   r3   )r   rM   r8   voice_preset_pathsvoice_preset_dictr   rO   r/   s           r   rD   z BarkProcessor._load_voice_preset   si   !44\B

7#F 	3C,, 9#>cdpcqqst  ''++NC@"3' **[$7 **[$7%zz*:EB

9d3!',>!FJ5166;8=D | "'',,t'>'>'B'B>SV'WYkloYpqr sjjviw x #  &(WWT]c"7	3: ! r   c           	      ^   dD ]  }||vrt        d| d      t        ||   t        j                        s't	        | dt        | j                  |          d      t        ||   j                        | j                  |   k7  st        | dt        | j                  |          d       y )Nr   rR   z
 as a key.z voice preset must be a z
D ndarray.)	rT   
isinstancerE   ndarray	TypeErrorstrpreset_shapelenshape)r   rM   rO   s      r   _validate_voice_preset_dictz)BarkProcessor._validate_voice_preset_dict   s    F 	jC,& #Fse:!VWWl3/<3%'?DDUDUVYDZ@[?\\f ghh<$**+t/@/@/EE C5(@TEVEVWZE[A\@]]g!hii	jr   returnc                     | j                   g S t        | j                   j                               }d|v r|j                  d       |S )z
        Returns a list of available voice presets.

        Returns:
            `list[str]`: A list of voice preset names.
        r)   )r   listkeysremove)r   voice_presetss     r   rC   z%BarkProcessor.available_voice_presets   sJ     ""*IT4499;<]*  0r   remove_unavailablec                 F   g }| j                   s| j                  D ]%  }	 | j                  |      }| j                  |       ' |r%t        j                  dt        |       d| d       |r|D ]  }| j                   |=  y y y # t        $ r |j	                  |       Y w xY w)NzThe following z' speaker embeddings are not available: zU If you would like to use them, please check the paths or try downloading them again.)	r   rC   rD   rT   appendr_   r,   r-   r]   )r   rf   unavailable_keysrM   rV   s        r   _verify_speaker_embeddingsz(BarkProcessor._verify_speaker_embeddings   s    "". $ < < D(,(?(?(M%
 001BCD  $S)9%:$;;bcsbt uk k
 "$4 >L//=> "! / " $++L9s   BB B c           
         |t        |t              swt        |t              r,| j                   || j                  v r| j	                  |      }n;t        |t              r|j                  d      s|dz   }t        j                  |      }|  | j                  |fi | t        ||      } | j                  |f|d||||d|}	|||	d<   |	S )a  
        voice_preset (`str`, `dict[np.ndarray]`):
            The voice preset, i.e the speaker embeddings. It can either be a valid voice_preset name, e.g
            `"en_speaker_1"`, or directly a dictionary of `np.ndarray` embeddings for each submodel of `Bark`. Or
            it can be a valid file name of a local `.npz` single voice preset containing the keys
            `"semantic_prompt"`, `"coarse_prompt"` and `"fine_prompt"`.

        Returns:
            [`BatchEncoding`]: A [`BatchEncoding`] object containing the output of the `tokenizer`.
            If a voice preset is provided, the returned object will include a `"history_prompt"` key
            containing a [`BatchFeature`], i.e the voice preset with the right tensors type.
        z.npz)datatensor_type
max_length)return_tensorspaddingrn   return_attention_maskreturn_token_type_idsadd_special_tokenshistory_prompt)rX   dictr[   r   rD   endswithrE   r3   r_   r   r   )
r   textrM   ro   rn   rs   rq   rr   r8   encoded_texts
             r   __call__zBarkProcessor.__call__   s    0 #J|T,J<-++7 D$;$;;#66|D lC09N9Nv9V#/&#8L!ww|4#,D,,\DVD'\~VL%t~~	
) !"7"71	
 	
 #-9L)*r   )N)speaker_embeddings_path.json)rz   r   F)T)NNpt   FTF)__name__
__module____qualname__r\   r   classmethodr4   boolrH   r[   rD   ru   r_   propertyrb   rC   rj   r   r   ry   __classcell__)r   s   @r   r   r   "   s     L5 Mk7O 7Ox &D%9!6G
 6Gp"!sTz "!H	jt 	j   >T >.   "#7 
7 7r   r   )__doc__r2   r.   numpyrE   feature_extraction_utilsr   processing_utilsr   tokenization_utils_baser   utilsr   r   	utils.hubr	   autor   
get_loggerr}   r,   r   __all__ r   r   <module>r      sd     	  4 . 4 , $   
		H	% TN T Tn 
r   