
    qiO6                       d dl Z d dlmZmZ d dlmZ d dlZd dlZd dl	m
c mZ d dlm
Z
 ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZ ddlmZ ddlmZmZ ddl m!Z!m"Z"m#Z# ddl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z. ddl/m0Z0m1Z1m2Z2 ddl3m4Z4m5Z5 ddl6m7Z7m8Z8 ddl9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZA ddlBmCZCmDZDmEZE ddlFmGZG ddlHmIZI ddlJmKZK ddlLmMZMmNZNmOZOmPZPmQZQ ddlRmSZSmTZT ddlUmVZV ddlWmXZX ddlYmZZZm[Z[m\Z\  e?       rd dl]Z] e@j                  e_      Z` G d deX      Za G d d eK      Zb G d! d"e      Zce< G d# d$e5             Zde e<d%&       G d' d(e2                    Ze G d) d*eS      Zf G d+ d,eT      Zg G d- d.e\      Zh G d/ d0e
j                        Zj G d1 d2e
j                        Zk G d3 d4e[      Zl G d5 d6eZ      Zm G d7 d8eI      Zn G d9 d:e
j                        Zo G d; d<eQ      Zp G d= d>eP      Zq G d? d@eN      Zr G dA dBeO      Zs G dC dDe
j                        Zt G dE dFe
j                        Zu G dG dHe
j                        Zv G dI dJe
j                        Zw G dK dLeM      Zx G dM dNe
j                        Zy G dO dPe
j                        Zz e<dQ&       G dR dSed             Z{ G dT dUede      Z| G dV dWe7dXY      Z} G dZ d[eG      Z~g d\Zy)]    N)CallableIterable)	dataclass)nn   )initialization)ACT2FN)Cache)PreTrainedConfig)%ClassifierFreeGuidanceLogitsProcessorGenerationMixinGenerationModeLogitsProcessorList)GenerateDecoderOnlyOutput)BatchFeatureget_size_dict)convert_to_rgbresizeto_channel_dimension_format)
ChannelDimension
ImageInputPILImageResamplingget_image_sizeinfer_channel_dimension_formatis_scaled_imagemake_flat_list_of_imagesto_numpy_arrayvalid_imagesvalidate_preprocess_arguments)BaseModelOutputBaseModelOutputWithPoolingModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)ImagesKwargsUnpack)
TensorTypeTransformersKwargsauto_docstringcan_return_tuplefilter_out_non_signature_kwargsis_vision_availableloggingtorch_compilable_check   )CONFIG_MAPPING
AutoConfig	AutoModel)BlipImageProcessor)Blip2VisionModel)ChameleonVQVAEConfig)ChameleonVQVAEChameleonVQVAEEncoderAttnBlock#ChameleonVQVAEEncoderConvDownsample ChameleonVQVAEEncoderResnetBlockChameleonVQVAEVectorQuantizer)IdeficsBaseModelOutputWithPastIdeficsCausalLMOutputWithPast)eager_attention_forward)SiglipVisionConfig)SiglipEncoderSiglipEncoderLayerSiglipVisionEmbeddingsc                   P     e Zd ZdZdZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fd	Z xZS )JanusVisionConfiga
  
    This is the configuration class to store the configuration of a [`JanusVisionModel`]. It is used to instantiate a
    `JanusVisionModel` according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.
    Args:
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        image_size (`int`, *optional*, defaults to 384):
            The size (resolution) of each image.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability for attention weights.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, and `"gelu_new"` are supported.
        mlp_ratio (`float`, *optional*, defaults to 4.0):
            Ratio of MLP hidden dimensionality to embedding dimensionality.
        attention_bias (`bool`, *optional*, defaults to `True`):
            Whether to add a bias to the queries, keys, and values in the attention layers.
        hidden_dropout_rate (`float`, *optional*, defaults to 0.0):
            The dropout probability for fully connected layers in the encoder.
        projection_dim (`int`, *optional*, defaults to 2048):
            Dimensionality of the MLP projection head.
        projection_dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability for the projection layer.
        use_qk_norm (`bool`, *optional*, defaults to `False`):
            Whether to normalize the query and key matrices.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated normal initializer for initializing all weight matrices.
        depth (`int`, *optional*, defaults to 2):
            Number of hidden layers in the aligner module.
        num_image_tokens (`int`, *optional*, defaults to 576):
            Number of image tokens.
    janus_vision_modelvision_configc                     t        |   d|||||||||	d	| | `|
| _        || _        || _        || _        || _        || _        || _	        || _
        || _        y )N)	hidden_sizenum_hidden_layersnum_attention_headsnum_channels
patch_size
image_sizeattention_dropoutlayer_norm_eps
hidden_act )super__init__intermediate_size	mlp_ratioattention_biashidden_dropout_rateprojection_dimprojection_dropoutuse_qk_norminitializer_rangedepthnum_image_tokens)selfrG   rH   rI   rJ   rK   rL   rM   rN   rO   rT   rU   rV   rW   rX   rY   rZ   r[   r\   kwargs	__class__s                       Y/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/janus/modular_janus.pyrR   zJanusVisionConfig.__init__   s    , 	 	
#/ 3%!!/)!	
 	
 "",#6 ,"4&!2
 0    )i         r   rc   i          ư>gelug      @Trd      rd   F{Gz?r/   i@  )__name__
__module____qualname____doc__
model_typebase_config_keyrR   __classcell__r_   s   @r`   rC   rC   R   sW    ,\ &J%O ',1 ,1ra   rC   c                   |     e Zd ZdZddddddddg d	d
dddd
ddfdededededededededee   dedef fdZ xZ	S )JanusVQVAEConfiga:
  
    This is the configuration class to store the configuration of a [`JanusVQVAEModel`]. It is used to instantiate a
    `JanusVQVAEModel` according to the specified arguments, defining the model architecture.
    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information. Instantiating a
    configuration with the defaults will yield a similar configuration to the VQModel of the
    [deepseek-community/Janus-Pro-1B](https://huggingface.co/deepseek-community/Janus-Pro-1B).

    Args:
        embed_dim (`int`, *optional*, defaults to 8):
            Dimensionality of each embedding vector.
        num_embeddings (`int`, *optional*, defaults to 16384):
            Number of codebook embeddings.
        double_latent (`bool`, *optional*, defaults to `False`):
            Whether to use double z channels.
        latent_channels (`int`, *optional*, defaults to 256):
            Number of channels for the latent space.
        num_patches (`int`, *optional*, defaults to 32):
            Num of patches the input images can be divided into.
        in_channels (`int`, *optional*, defaults to 3):
            Number of input channels.
        out_channels (`int`, *optional*, defaults to 3):
            Number of out channels.
        base_channels (`int`, *optional*, defaults to 128):
            Base channel count.
        channel_multiplier (`list[int]`, *optional*, defaults to `[1, 1, 2, 2, 4]`):
            Channel multipliers for each resolution.
        num_res_blocks (`int`, *optional*, defaults to 2):
            Number of residual blocks.
        dropout (`float`, *optional*, defaults to 0.0):
            Dropout rate.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        projection_dim (`int`, *optional*, defaults to 2048):
            Dimensionality of the MLP projection head.
        num_hidden_layers (`int`, *optional*, defaults to 2):
            Number of hidden layers in VAVAE MLP Connecter module.
        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"silu"` and `"gelu_new"` are supported.
        image_token_embed_dim (`int`, *optional*, defaults to 2048):
            Dimension of image embeddings. It should be same as the dimensionality of text embeddings.
       i @  F       r      )   rw   r/   r/      r/   rd   rh   rg   rf   	embed_dimnum_embeddingsdouble_latentlatent_channelsnum_patchesin_channelsout_channelsbase_channelschannel_multipliernum_res_blocksdropoutc                     t        |   d|||||||	|
||d
| || _        || _        || _        || _        || _        || _        | `| `	| `
y )N)
ry   rz   r{   r|   r~   r   r   r   r   rZ   rP   )rQ   rR   r}   r   rW   rH   rO   image_token_embed_dim
resolutionattn_resolutions	attn_type)r]   ry   rz   r{   r|   r}   r~   r   r   r   r   r   rZ   rW   rH   rO   r   r^   r_   s                     r`   rR   zJanusVQVAEConfig.__init__   s    ( 	 	
)'+#'1)/	
 	
 '(,!2$%:"O!Nra   )
ri   rj   rk   rl   intboollistfloatrR   ro   rp   s   @r`   rr   rr      s    *\ ##" (7"#** * 	*
 * * * * * !I* * * *ra   rr   c                   <     e Zd ZdZdZeeedZ	 	 	 	 d fd	Z	 xZ
S )JanusConfiga;  
    This is the configuration class to store the configuration of a [`JanusModel`]. It is used to instantiate an
    Janus model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Janus-1B or Janus-7B models.

    e.g. [deepseek-community/Janus-Pro-1B](https://huggingface.co/deepseek-community/Janus-Pro-1B) or
    [deepseek-community/Janus-Pro-7B](https://huggingface.co/deepseek-community/Janus-Pro-7B)

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`):
            The config object or dictionary of the text backbone.
        vision_config (`Union[AutoConfig, dict]`,  *optional*, defaults to `JanusVisionConfig`):
            The config object or dictionary of the vision backbone.
        vq_config (`Union[AutoConfig, dict]`,  *optional*, defaults to `JanusVQVAEConfig`):
            The config object or dictionary of the VQVAE backbone.
        image_token_id (`int`, *optional*, defaults to 100581):
            Token index of a placeholder image token.

    Example:

    ```python
    >>> from transformers import JanusForConditionalGeneration, JanusConfig, JanusVisionConfig, JanusVQVAEConfig, LlamaConfig

    >>> # Initializing a Janus vision config
    >>> vision_config = JanusVisionConfig()

    >>> # Initializing a Llama config
    >>> text_config = LlamaConfig()

    >>> # Initializing a VQ config
    >>> vq_config = JanusVQVAEConfig()

    >>> # Initializing a Janus Pro 1B style configuration
    >>> configuration = JanusConfig(vision_config=vision_config, text_config=text_config, vq_config=vq_config)

    >>> # Initializing a model from the Janus Pro 1B style configuration
    >>> model = JanusForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```janus)text_configrE   	vq_configc                    t        |t              r,|j                  dd      |d<   t        |d      d	i || _        nY|(t
        j                  d       t        d          | _        n/t        |t              r|| _        nt        dt        |             |%t
        j                  d       t               | _        nPt        |t              rt        d	i || _        n/t        |t              r|| _        nt        dt        |             |%t
        j                  d       t               | _        nPt        |t              rt        d	i || _        n/t        |t              r|| _        nt        dt        |             | j                  j                  | _        | j                  j                  | j                  j                   z  | j                  _        || _        t'        | P  d	i | y )
Nrm   llamaz7`text_config` is None. Initializing with default valueszTInvalid type for `text_config`. Must be either `dict` or `LlamaConfig`. Type found: zK`vision_config` is None. Initializing with default JanusVisionConfig valuesz\Invalid type for `vision_config`. Must be either `dict` or `JanusVisionConfig`. Type found: zF`vq_config` is None. Initializing with default JanusVQVAEConfig valueszWInvalid type for `vq_config`. Must be either `dict` or `JanusVQVAEConfig`. Type found: rP   )
isinstancedictgetr0   r   loggerinfor   
ValueErrortyperC   rE   rr   r   rZ   rL   rK   r}   image_token_idrQ   rR   )r]   r   rE   r   r   r^   r_   s         r`   rR   zJanusConfig.__init__B  s    k4((3g(NK%-k,.GHW;WD KKQR-g68D%56*D  $[ 124 
  KKef!2!4Dt,!2!C]!CD'89!.D  $] 346 
 KK`a-/DN	4(-:	:DN	#34&DN  $Y02 
 "&!3!3!E!E%)%7%7%B%BdFXFXFcFc%c","6"ra   )NNNi )ri   rj   rk   rl   rm   r1   rC   rr   sub_configsrR   ro   rp   s   @r`   r   r     s8    +Z J!*%K 6# 6#ra   r   c                   R     e Zd ZU eed<   dZdZdZddgZddgZ	dZ
dZdZ fd	Z xZS )
JanusPreTrainedModelconfigmodelimagetextTLlamaDecoderLayerJanusVisionEncoderLayerpast_key_valuescausal_maskc                     t         |   |       t        |t              rZt	        j
                  |j                  t        j                  |j                  j                  d         j                  d             y y )N)rw   r   )rQ   _init_weightsr   JanusVisionEmbeddingsinitcopy_position_idstorcharangeshapeexpand)r]   moduler_   s     r`   r   z"JanusPreTrainedModel._init_weights  s[    f%f34JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 5ra   )ri   rj   rk   r   __annotations__base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraphr   ro   rp   s   @r`   r   r   {  sO    (&*#,.GH#4m"DN!i ira   r   z9
    Base class for Janus VQ-VAE mode model outputs.
    )custom_introc                   b    e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   y)JanusVQVAEOutputz
    decoded_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        Reconstructed pixel values after encoding and decoding the input.
    embedding_loss (`torch.FloatTensor`):
        Embedding loss.
    Ndecoded_pixel_valuesembedding_loss)	ri   rj   rk   rl   r   r   FloatTensorr   r   rP   ra   r`   r   r     s4     6:%++d29/3NE%%,3ra   r   c                       e Zd Zy)JanusBaseModelOutputWithPastNri   rj   rk   rP   ra   r`   r   r         ra   r   c                       e Zd Zy)JanusCausalLMOutputWithPastNr   rP   ra   r`   r   r     r   ra   r   c                   J    e Zd Zddej                  dedej                  fdZy)r   pixel_valuesinterpolate_pos_encodingreturnc                 X   |j                   \  }}}}| j                  j                  j                  }| j                  |j	                  |            }|j                  d      j                  dd      }|r| j                  |||      }	n| j                  | j                        }	||	z   }|S )Ndtyper/   rw   )
r   patch_embeddingweightr   toflatten	transposer   position_embeddingr   )
r]   r   r   _heightwidthtarget_dtypepatch_embeds
embeddings
pos_embedss
             r`   forwardzJanusVisionEmbeddings.forward  s    *001fe++2288++LOO,O,OP!))!,66q!<
#66z65QJ001B1BCJ*,
ra   N)F)ri   rj   rk   r   Tensorr   r   rP   ra   r`   r   r     s'    ELL D ]b]i]i ra   r   c                   t     e Zd ZdZdef fdZ	 d	dej                  dej                  dz  dee	   fdZ
 xZS )
JanusVisionAttentionz(Attention Class for Janus Vision Encoderr   c                 F   t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _
        |j                  }|j                  }d| _        d| _        t        j                   | j                  | j                  | j                  z  |j"                        | _        t        j                   | j                  | j                  | j                  z  |j"                        | _        t        j                   | j                  | j                  | j                  z  |j"                        | _        t        j                   | j                  | j                        | _        |dkD  rt        j,                  |      nt        j.                         | _        |rt        j0                  | j                        nt        j.                         | _        |r%t        j0                  | j                        | _        y t        j.                         | _        y )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Frw   biasr   )rQ   rR   r   rG   ry   rI   	num_headshead_dimr   scalerM   rX   rY   	is_causalnum_key_value_groupsr   LinearrU   q_projk_projv_projprojection_layerDropoutIdentity	LayerNormq_normk_norm)r]   r   proj_dropoutqk_normr_   s       r`   rR   zJanusVisionAttention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
!'!9!900$$ %&!ii0NU[UjUjkii0NU[UjUjkii0NU[UjUjk "		$..$.. I>JQ>N"**\":TVT_T_Ta6=bll4>>22;;=6=bll4>>22;;=ra   Nhidden_statesattention_maskr^   c                 "   |j                         \  }}}| j                  |      }| j                  |      }| j                  |      }	|j	                  d| j
                  | j                        }| j                  |      }|j	                  d| j
                  | j                        }| j                  |      }|j	                  ||| j
                  | j                        j                  dd      }|j	                  ||| j
                  | j                        j                  dd      }|	j                  ||| j
                  | j                        j                  dd      }	t        j                  | j                  j                  t              }
 |
| |||	|f| j                   sdn| j"                  | j$                  | j&                  d|\  }}|j	                  ||| j(                        }| j+                  |      }| j-                  |      }||fS )Nr   rw   r/   rd   )r   scalingr   )sizer   r   r   reshaper   r   r   r   r   viewr#   get_interfacer   _attn_implementationr=   trainingrM   r   r   ry   r   rX   )r]   r   r   r^   
batch_sizeseq_lenr   query_states
key_statesvalue_statesattention_interfaceattn_outputattn_weightsoutputs                 r`   r   zJanusVisionAttention.forward  s    "/!3!3!5
GQ{{=1[[/
{{=1#++BN{{<0''DNNDMMJ
[[,
#++JQUQ^Q^_iijkmno''
GT^^T]][eefgijk
#((Wdnndmm\ffghjkl(?(M(MKK,,.E)
 %8
%
  $}}C$2H2HJJnn
%
 
%
!\ "))*gt~~N&&{3((0|##ra   N)ri   rj   rk   rl   rC   rR   r   r   r&   r(   r   ro   rp   s   @r`   r   r     sO    2Q0 Q@ /3)$||)$ t+)$ +,	)$ra   r   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )JanusVisionMLPr   c                    t         |           || _        t        |j                  |j
                  z        | _        t        |j                     | _	        t        j                  |j                  | j                        | _        t        j                  | j                  |j                        | _        t        j                  |j                        | _        t        j                  |j                        | _        y r  )rQ   rR   r   r   rG   rT   rS   r	   rO   activation_fnr   r   fc1fc2r   rV   dropout1dropout2r]   r   r_   s     r`   rR   zJanusVisionMLP.__init__  s    !$V%7%7&:J:J%J!K#F$5$5699V//1G1GH99T33V5G5GH

6#=#=>

6#=#=>ra   r   r   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j	                  |      }|S r  )r  r  r  r  r  r]   r   s     r`   r   zJanusVisionMLP.forward  sP    /**=9m4/m4ra   )	ri   rj   rk   rC   rR   r   r   r   ro   rp   s   @r`   r
  r
    s+    ?0 ?U\\ ell ra   r
  c                   $     e Zd Zdef fdZ xZS )r   r   c                 T   t         |   |       || _        |j                  | _        t        |      | _        t        j                  | j                  |j                        | _
        t        j                  | j                  |j                        | _        t        |      | _        y )N)eps)rQ   rR   r   rG   ry   r   	self_attnr   r   rN   layer_norm1layer_norm2r
  mlpr  s     r`   rR   z JanusVisionEncoderLayer.__init__  sv     ++-f5<<F<Q<QR<<F<Q<QR!&)ra   ri   rj   rk   rC   rR   ro   rp   s   @r`   r   r     s    *0 * *ra   r   c                   $     e Zd Zdef fdZ xZS )JanusVisionEncoderr   c                     t         |   |       t        j                  t	        |j
                        D cg c]  }t        |       c}      | _        y c c}w r  )rQ   rR   r   
ModuleListrangerH   r   layersr]   r   r   r_   s      r`   rR   zJanusVisionEncoder.__init__%  s@     mmeTZTlTlNm$n%<V%D$no$ns   Ar  rp   s   @r`   r  r  $  s    p0 p pra   r  c            
       r     e Zd ZeedZdef fdZ	 	 d
dej                  dz  de
dee   deez  fd	Z xZS )JanusVisionModelr   
attentionsr   c                 D    t         |   |       t        |      | _        y r  )rQ   rR   r  encoderr  s     r`   rR   zJanusVisionModel.__init__0  s     )&1ra   Nr   r   r^   r   c                     |t        d      | j                  ||      } | j                  dd|i|}|j                  }| j	                  |      }|d d dd d f   }| j	                  |      }t        ||      S )Nz You have to specify pixel_values)r   inputs_embedsr   )last_hidden_statepooler_outputrP   )r   r   r(  r+  post_layernormr!   )r]   r   r   r^   r   encoder_outputsr+  pooled_outputs           r`   r   zJanusVisionModel.forward4  s     ?@@Ogh+74<< ,
',
,

 ,== //0AB)!Q'2++M:)/'
 	
ra   NF)ri   rj   rk   r   r   _can_record_outputsrC   rR   r   r   r   r&   r(   tupler!   r   ro   rp   s   @r`   r$  r$  *  sh    0*
20 2 26).
''$.
 #'
 +,	

 
+	+
ra   r$  c                   *     e Zd Zdef fdZd Z xZS )JanusVisionAlignerMLPr   c           	         t         |           t        j                  |j                  |j
                        | _        t        j                  t        d|j                        D cg c],  }t        j                  |j
                  |j
                        . c}      | _
        t        |j                     | _        y c c}w Nrw   )rQ   rR   r   r   rG   rW   r  r  r   r[   hidden_layersr	   rO   r  r"  s      r`   rR   zJanusVisionAlignerMLP.__init__Q  s    99V//1F1FG]]NSTUW]WcWcNdeRYYv,,f.C.CDe
 $F$5$56 f   &1B<c                 |    | j                  |      }| j                  D ]  }| j                  |      } ||      } |S r  r  r7  r  r]   r   layers      r`   r   zJanusVisionAlignerMLP.forwardZ  G    /'' 	1E ..}=M!-0M	1 ra   )ri   rj   rk   rC   rR   r   ro   rp   s   @r`   r4  r4  P  s    70 7ra   r4  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZ	S )JanusVQVAEVectorQuantizerr   c                 N    t         |   |       |j                  gdz  | _        y )Nr/   )rQ   rR   r}   quant_state_dimsr  s     r`   rR   z"JanusVQVAEVectorQuantizer.__init__c  s&     !'!3!3 4q 8ra   image_tokensr   c                 B   |j                   d   }| j                  j                  j                   d   }| j                  |      }t        j                  |dd      }|j                  |g| j                  |      }|j                  dddd      j                         }|S )Nr   r   r/   )pdimr   rw   )	r   	embeddingr   F	normalizer   rA  permute
contiguous)r]   rB  r   emb_dimhidden_state_quants        r`   get_codebook_entryz,JanusVQVAEVectorQuantizer.get_codebook_entryg  s    !''*
~~,,2226 "^^L9[[);qbI 044j5b4CXCX5bZa5bc/771aCNNP!!ra   )
ri   rj   rk   rr   rR   r   
LongTensorr   rM  ro   rp   s   @r`   r?  r?  b  s/    9/ 9"u/?/? "EDUDU "ra   r?  c                       e Zd Zy)JanusVQVAEResnetBlockNr   rP   ra   r`   rP  rP  w  r   ra   rP  c                       e Zd Zy)JanusVQVAEAttnBlockNr   rP   ra   r`   rR  rR  {  r   ra   rR  c                       e Zd Zy)JanusVQVAEConvDownsampleNr   rP   ra   r`   rT  rT    r   ra   rT  c                   $     e Zd Z fdZd Z xZS )JanusVQVAEConvUpsamplec                 t    t         |           t        j                  j	                  ||ddd      | _        y )Nr   rw   kernel_sizestridepadding)rQ   rR   r   r   Conv2dconv)r]   r~   r_   s     r`   rR   zJanusVQVAEConvUpsample.__init__  s.    HHOOK!TU_`Oa	ra   c                 X    t        j                  |dd      }| j                  |      }|S )Ng       @nearest)scale_factormode)rG  interpolater]  r  s     r`   r   zJanusVQVAEConvUpsample.forward  s(    m#IV		-0ra   )ri   rj   rk   rR   r   ro   rp   s   @r`   rV  rV    s    bra   rV  c                   `     e Zd Zdedef fdZdej                  dej                  fdZ xZ	S )JanusVQVAEMidBlockr   channelsc                     t         |           t        |||      | _        t	        |      | _        t        |||      | _        y )Nr   r~   r   )rQ   rR   rP  block_1rR  attn_1block_2)r]   r   re  r_   s      r`   rR   zJanusVQVAEMidBlock.__init__  sF    , !

 *(3, !
ra   r   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r  )rh  ri  rj  r  s     r`   r   zJanusVQVAEMidBlock.forward  s2    ]3M2]3ra   )
ri   rj   rk   rr   r   rR   r   r   r   ro   rp   s   @r`   rd  rd    s2    
/ 
3 
U\\ ell ra   rd  c                   >     e Zd Z fdZdej
                  fdZ xZS )JanusVQVAEEncoderc           	         t         |           t        |j                        | _        |j
                  | _        |j                  }|j                  }|j                  }|j                  }|j                  }t        j                  j                  ||ddd      | _        dt        |      z   }|| _        t        j                          | _        t%        | j                        D ]   }t        j                          }	t        j                          }
|||   z  }|||   z  }t%        | j
                        D ]N  }|	j'                  t)        |||             |}|| j                  dz
  k(  s5|
j'                  t+        |             P t        j,                         }|	|_        |
|_        || j                  dz
  k7  rt3        |      |_        | j"                  j'                  |        t7        |      | _        t        j                  j;                  d|dd	      | _        t        j                  j                  ||rd
|z  n|ddd      | _        y )Nr   rw   rX  )rw   rg  ru   re   T
num_groupsrJ   r  affiner/   ) rQ   rR   lenr   num_resolutionsr   r   r~   r{   r|   r   r   r\  conv_inr2  in_channel_multiplierr  downr   appendrP  rR  ModuleblockattnrT  
downsamplerd  mid	GroupNormnorm_outconv_out)r]   r   r   r~   r{   r|   r   ru  i_levelry  rz  block_in	block_outi_blockrv  r_   s                  r`   rR   zJanusVQVAEEncoder.__init__  s   "6#<#<=$33,,((,, 00#66xx{MqYZdef $u-?'@ @%:"MMO	T112 	#GMMOE==?D$'<W'EEH%(:7(CCI !4!45 
?)%$,%. %d22Q66KK 3H =>
? 99;DDJDI$..22":8"DIIT"-	#0 &fh7**bxUYbf*g#0Ao ( 
ra   r   c                    | j                  |      g}t        | j                        D ]  }t        | j                        D ]  } | j                  |   j
                  |   |d         }t        | j                  |   j                        dkD  r" | j                  |   j                  |   |      }|j                  |        || j                  dz
  k7  s|j                  | j                  |   j                  |d                 |d   }| j                  |      }| j                  |      }|t        j                  |      z  }| j                  |      }|S )Nr   r   rw   )rt  r   rs  r   rv  ry  rr  rz  rw  r{  r|  r~  r   sigmoidr  )r]   r   r   r  r  hidden_stater+  s          r`   r   zJanusVQVAEEncoder.forward  sT   l34T112 		WG !4!45 3@tyy177@!"%  tyy)../!3#C499W#5#:#:7#CL#QL$$\23 $..22$$TYYw%7%B%B=QSCT%UV		W *"- HH%67 !MM*;<U]]+<== MM*;<  ra   )ri   rj   rk   rR   r   rN  r   ro   rp   s   @r`   rm  rm    s    1
f!E$4$4 !ra   rm  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )JanusVQVAEDecoderc           	      v   t         |           t        |j                        | _        |j
                  | _        |j                  }|j                  }|j                  }||j                  | j                  dz
     z  }t        j                  j                  ||ddd      | _        t        ||      | _        t        j                         | _        t#        t%        | j                              D ]  }t        j                         }t        j                         }||j                  |   z  }	t%        | j
                  dz         D ]N  }
|j'                  t)        |||	             |	}|| j                  dz
  k(  s5|j'                  t+        |             P t        j,                         }||_        ||_        |dk7  rt3        |      |_        | j                   j'                  |        t        j                  j7                  d|dd	      | _        t        j                  j                  ||ddd      | _        y )
Nrw   r   rX  rg  r   ru   re   Tro  )rQ   rR   rr  r   rs  r   r   r|   r   r   r   r\  rt  rd  r|  r  upreversedr   rw  rP  rR  rx  ry  rz  rV  upsampler}  r~  r  )r]   r   r   r|   r   r  r  ry  rz  r  r  r  r_   s               r`   rR   zJanusVQVAEDecoder.__init__  s   "6#<#<=$33,, 00** !6#<#<T=Q=QTU=U#VV xxaXYcde &fh7 --/d&:&: ;< 	GMMOE==?D%(A(A'(JJI !4!4q!89 
?)%$,%. %d22Q66KK 3H =>
? BBHBG!|4X>GGNN2)	. **bxUYbf*g,AVWabcra   r  r   c                 b   | j                  |      }| j                  |      }t        | j                        D ]  }t        | j                  dz         D ]l  } | j
                  |   j                  |   |      }t        | j
                  |   j                        dkD  sK | j
                  |   j                  |   |      }n || j                  dz
  k7  s| j
                  |   j                  |      } | j                  |      }|t        j                  |      z  }| j                  |      }|S )Nrw   r   )rt  r|  r   rs  r   r  ry  rr  rz  r  r~  r   r  r  )r]   r  r  r  s       r`   r   zJanusVQVAEDecoder.forward   s    ||L1 xx- T112 	GG !4!4q!89 P>twww/55g>|Ltwww',,-1#A4777#3#8#8#A,#OLP $..22#www/88F	G }}\2l33}}\2ra   )ri   rj   rk   rR   r   r   r   ro   rp   s   @r`   r  r    s)    ,d\E$5$5 %:K:K ra   r  c                        e Zd Zg dZeedZdZdef fdZ	de
j                  de
j                  fdZeede
j                  dee
j                  e
j                  f   fd	              Z xZS )

JanusVQVAE)rR  rP  r?  r%  r   r   c                 r    t         |   |       t        |      | _        d| _        | j                          y r0  )rQ   rR   r  decodergradient_checkpointing	post_initr  s     r`   rR   zJanusVQVAE.__init__A  s0     (0&+# 	ra   rB  r   c                    |j                   d   | j                  j                  d   | j                  j                  d   z  k7  rMt        d| j                  j                  d   | j                  j                  d   z   d|j                    d      | j                  j	                  |      }| j                  |      }| j                  |      }|S )aG  
        Decodes quantized token IDs into pixel values.
        Args:
            image_tokens (torch.LongTensor): Batch of token IDs.
        Returns:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                Pixel values decoded from the token IDs.
        rw   r   z4Expected `image_tokens` to have shape `(batch_size, z)`, but got shape `z`.)r   quantizerA  r   rM  post_quant_convr  )r]   rB  codebook_entryr   r   s        r`   decodezJanusVQVAE.decodeI  s     a DMM$B$B1$EHfHfghHi$iiFt}}GeGefgGhkokxkx  lJ  lJ  KL  lM  HM  GN N""."4"4!5R9  99,G,,^<||M2ra   c                     |j                   d   } | j                  |fddi|}| j                  |j                  j	                  |d            }t        ||j                        S )Nr   return_dictTr   )r   encoder  rB  r   r   r   )r]   r   r^   r   encode_outputsr   s         r`   r   zJanusVQVAE.forward\  se     "''*
$\NtNvN#{{>+F+F+K+KJXZ+[\ 4n6S6STTra   )ri   rj   rk   r   rP  rR  r1  main_input_namerr   rR   r   rN  r   r  r*   r)   r2  r   ro   rp   s   @r`   r  r  5  s     /) %O/ 5#3#3 8I8I & 	U''	U 
u  %"3"33	4		U  	Ura   r  c                   *     e Zd Zdef fdZd Z xZS )JanusVQVAEAlignerMLPr   c           	         t         |           t        j                  |j                  |j
                        | _        t        j                  t        d|j                        D cg c],  }t        j                  |j
                  |j
                        . c}      | _
        t        |j                     | _        y c c}w r6  )rQ   rR   r   r   ry   rW   r  r  r   rH   r7  r	   rO   r  r"  s      r`   rR   zJanusVQVAEAlignerMLP.__init__k  s    99V--v/D/DE]]NSTUW]WoWoNpqRYYv,,f.C.CDq
 $F$5$56 rr8  c                 |    | j                  |      }| j                  D ]  }| j                  |      } ||      } |S r  r:  r;  s      r`   r   zJanusVQVAEAlignerMLP.forwardt  r=  ra   )ri   rj   rk   rr   rR   r   ro   rp   s   @r`   r  r  j  s    7/ 7ra   r  c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ	 xZ
S )JanusVQVAEHeadzOHead used for sampling tokens in image generation, replacing the usual lm head.r   c                    t         |           t        j                  |j                  |j
                        | _        t        |j                     | _	        t        j                  |j
                  |j                        | _        y r  )rQ   rR   r   r   r   rW   proj_outr	   rO   r  rz   vision_headr  s     r`   rR   zJanusVQVAEHead.__init__  s^    		&">">@U@UV#F$5$5699V%:%:F<Q<QRra   r   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r  )r  r  r  r  s     r`   r   zJanusVQVAEHead.forward  s6    m4**=9((7ra   )ri   rj   rk   rl   rr   rR   r   r   tensorr   ro   rp   s   @r`   r  r  |  s0    YS/ SU\\ ell ra   r  zl
    The Janus model which consists of a siglip vision backbone, a Llama language model and a VQ model.
    c                       e Zd Zdef fdZd Zd Zeede	j                  dee   deez  fd              Zd	e	j                   d
e	j                  de	j                  fdZee	 	 	 	 	 	 	 	 	 dd	e	j                   dz  de	j                  dz  de	j$                  dz  de	j                   dz  dedz  de	j                   dz  d
e	j                  dz  dedz  dee	j$                  z  defd              Z xZS )
JanusModelr   c                    t         |   |       || _        t        j	                  |j
                        | _        t        | j                  j                        | _        t        j	                  |j                        | _        t        j                  | j                  j                  j                  | j                  j                  j                        | _        t#        | j                  j                        | _        t'        | j                  j                        | _        t+        j,                  |j.                        | _        d| _        | j5                          y )N)r   F)rQ   rR   r   r$  _from_configrE   vision_modelr4  alignerr  r   vqmodelr   	Embeddingrz   ry   generation_embeddingsr  generation_alignerr  generation_headr2   from_configr   language_modelr  r  r  s     r`   rR   zJanusModel.__init__  s     ,99&:N:NO,T->->-E-EF!..v/?/?@ &(\\$,,2E2E2T2TVZVbVbViViVsVs%t""6t||7J7J"K-dll.A.AB'336;M;MN&+#ra   c                 6    | j                   j                         S r  )r  get_input_embeddingsr]   s    r`   r  zJanusModel.get_input_embeddings  s    ""7799ra   c                 :    | j                   j                  |       y r  )r  set_input_embeddingsr]   values     r`   r  zJanusModel.set_input_embeddings  s    007ra   r   r^   r   c                 p     | j                   |fddi|}| j                  |j                        |_        |S )Nr  T)r  r  r+  r,  )r]   r   r^   vision_outputss       r`   get_image_featureszJanusModel.get_image_features  s=    
 +**<TTTVT'+||N4T4T'U$ra   	input_idsr*  image_featuresc                 N   |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                         }|j                  d   |j                  d   z  }|j                  d      j                  |      j                  |j                        }t        ||   j                         |j                         k(  d| d|        |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        r   devicer   r   rw   z6Image features and image tokens do not match, tokens: z, features: )r  r   r  r   r   longr  allsumr   	unsqueeze	expand_asr   r.   numel)r]   r  r*  r  special_image_maskn_image_tokensn_image_featuress          r`   get_placeholder_maskzJanusModel.get_placeholder_mask  s    !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*dkk.H.H!H+//1)//2^5I5I!5LL/99"=GGVYYZgZnZno,-3359M9M9OOD^DTT`aq`rs	
 "!ra   Nr   r   r   cache_position	use_cachelogits_to_keepc
                 $   |d u |d uz  rt        d      | | j                         |      }|| j                  |d      j                  }|j	                  d|j
                  d         }|j                  |j                  |j                        }| j                  |||      }|j                  ||      } | j                  d|||||||	d|
}t        |j                  |j                  |j                  |j                   |      S d       S )	NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either oneT)r  r   )r*  r  )r*  r   r   r   r  r  r  )r+  r   r   r&  image_hidden_statesrP   )r   r  r  r,  r   r   r   r  r   r  masked_scatterr  r   r+  r   r   r&  )r]   r  r   r   r   r   r  r*  r  r  r^   image_embedsr  image_attention_mask	lm_outputs                  r`   r   zJanusModel.forward  sS    -t";<s   7D557	BM#22<T2R``L)11"m6I6I"6MNN+..}/C/C]EXEXYN#'#<#<~ $= $  *889M~^M'D'' 	
')%+))	
 	
	 ,'99%55#11 ++0<0H
 	

 OS
 	
ra   )	NNNNNNNNr   )ri   rj   rk   r   rR   r  r  r*   r)   r   r   r&   r(   r2  r!   r  rN  r  r   r
   r   r   r   r   ro   rp   s   @r`   r  r    sz   { *:8 !--9?@R9S	+	+  "))":?:K:K"]b]n]n"0  .215.204(,2626!%-..
##d*.
 ''$..
 t+	.

 &&-.
 .
 ((4/.
 ((4/.
 $;.
 ell*.
 
&.
  .
ra   r  c                   x    e Zd ZddiZdZdZdef fdZd Zd Z	d	e
j                  d
e
j                  fdZee	 	 	 	 	 	 	 	 	 	 dde
j                  dz  de
j                   dz  de
j                  dz  de
j                  dz  dedz  de
j                  dz  de
j                   dz  de
j                  dz  dedz  dee
j                  z  dee   d
efd              Z	 	 	 	 	 	 	 d fd	Zde
j                  fdZ e
j4                         	 	 	 d d	e
j                  dz  de
j                  dz  dedz  f fd       Z xZS )!JanusForConditionalGenerationzlm_head.weightz(model.language_model.embed_tokens.weightr   Tr   c                     t         |   |       || _        t        |      | _        t        j                  |j                  j                  |j                  j                  d      | _
        | j                          y )NFr   )rQ   rR   r   r  r   r   r   r   rG   
vocab_sizelm_headr  r  s     r`   rR   z&JanusForConditionalGeneration.__init__  s\     '
yy!3!3!?!?ASASA^A^ejk 	ra   c                 J    | j                   j                  j                         S r  )r   r  r  r  s    r`   r  z2JanusForConditionalGeneration.get_input_embeddings  s    zz((==??ra   c                 N    | j                   j                  j                  |       y r  )r   r  r  r  s     r`   r  z2JanusForConditionalGeneration.set_input_embeddings  s    

!!66u=ra   inputsr   c                 r    | j                   j                  |      }| j                   j                  |      }|S r  )r   r  r  )r]   r  r  s      r`   'prepare_embeddings_for_image_generationzEJanusForConditionalGeneration.prepare_embeddings_for_image_generation  s0    zz77?zz44\Bra   Nr  r   r   r   r   r  r*  labelsr  r  r^   c                     | j                   d|||||||	|d|}|j                  }t        |
t              rt	        |
 d      n|
}| j                  |dd|ddf         }d}|4 | j                  d||| j                  j                  j                  d|}t        |||j                  |j                  |j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        )r  r   r   r   r   r*  r  r  N)logitsr  r  )lossr  r   r   r&  r  rP   )r   r+  r   r   slicer  loss_functionr   r   r  r   r   r   r&  r  )r]   r  r   r   r   r   r  r*  r  r  r  r^   outputsr   slice_indicesr  r  s                    r`   r   z%JanusForConditionalGeneration.forward  s    , $** 

%)%+')

 

  118B>SV8W~ot4]kmA}a,?@A%4%% f9P9P9[9[_eD +#33!//)) ' ; ;
 	
ra   c	           
      h    t        |   |f||||||d|	}
|s|	j                  dd      s||
d<   |
S )N)r   r*  r   r  r  is_first_iterationr  Tr   )rQ   prepare_inputs_for_generationr   )r]   r  r   r   r   r*  r  r  r  r^   model_inputsr_   s              r`   r  z;JanusForConditionalGeneration.prepare_inputs_for_generationP  sZ     w<	
+')))1	
 	
 VZZT%B+7L(ra   rB  c                 x    | j                   j                  j                  |      }|j                  dddd      }|S )a,  
        Decodes generated image tokens from language model to continuous pixel values
        with VQGAN module via upsampling.
        Args:
            image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
                The tensors corresponding to the input images.
        r   r/   r   rw   )r   r  r  rI  )r]   rB  decoded_images      r`   decode_image_tokensz1JanusForConditionalGeneration.decode_image_tokensr  s:     

**11,?%--aAq9ra   logits_processorc           	         |j                  d| j                        }t        j                  |      }|j                  dd      }|dk(  rt	        %|   d|||d d|S  |j                  di |}|j                         t        j                  t        j                  fvrt        d      |j                          | j                  |j                                ||n	t               }d|d<   |j                  t         j#                  d       d	|_        |j                  |d
<   | j%                  ||j&                  |      \  }}	}|j(                  |j*                  }}
t-        |j.                        dk7  rt        d|j.                   d      |d u}| j1                  |||j*                         |j                  r:|j                  dkD  r+|j3                  t5        |j                               d |_        | j7                  ||j.                  d   |d ||      } | j8                  d|||j:                  d|\  }}| j<                  j>                  j@                  jB                  }|j.                  \  }}|jE                  dd      }|j                  dd       }|jE                  dd      }||d<   ||d d d f   |j&                  k7  ||d d d f   |jF                  d   k7  z  }||d d d f   jI                  ||jJ                          | jM                         |      }| jO                  |||      }|jQ                  dd       @| jS                  |jT                  xs d|dz  tW        |jX                  ||z         |      |d<   t[        j\                  ||f|
|      }|j^                  }|j`                  }|jb                  }|jd                  }|jf                  }|r|rdnd }|r|rdnd }|r|rdnd }|r|rdnd }ti        |      D ]|  } | jj                  d||d|}d|v r!|d   jm                  |j*                        |d<   |d   jm                  |j*                        |d<    | j<                  jn                  di |||d}| jq                  ||      }|jr                  d d dd d f   ju                         } | j<                  jw                  |       }! |||!      }"|jx                  r>t[        jz                  |"d      }#t[        j|                  |#d      j                  d      }$nt[        j                  |"d      }$|$|d d |f<   t[        j                  |$|$g      }$|$j                  d      }$| j                  |$      } |r@|r|!fz  }|r| j                         fz  }|r|j                  z  }|r|j                  z  }|rt        |!|||j                        S |S ) Ngeneration_configgeneration_moder   )r  r   r  guidance_scalezGot incompatible mode for Image Generation, should be one of greedy or sampling. Ensure that beam search is de-activated by setting `num_beams=1`.Tr  zU`guidance_scale` is required for CFG but not provided. Setting to default value of 5.   r  r/   z;Expected input ids of shape (batch_size, seq_len), but got z3Passing `inputs embeds` is not supported currently.)r  rw   )r  input_ids_seq_lengthencoder_input_idsprefix_allowed_tokens_fnr  r  )r  r   expand_sizer   boi_token_idr   static)cache_implementationr   max_cache_lenmodel_kwargsr  rP   )r*  r  r  )output_attentionsoutput_hidden_statesr   )rE  )num_samples)	sequencesscoresr  r&  r   r   )Ipopr  copydeepcopyrQ   generateupdateget_generation_moder   SAMPLEGREEDY_SEARCHr   validate_validate_model_kwargsr   r  r   warning_prepare_model_inputsbos_token_idr   r  rr  r   _prepare_special_tokensrw  r   _get_logits_processor_expand_inputs_for_generationnum_return_sequencesr   r  r   r\   repeatgeneration_kwargsmasked_fill_pad_token_idr  _get_initial_cache_positionr   _prepare_static_cacher  max
max_lengthr   zerosr   r  output_scoresoutput_logitsreturn_dict_in_generater   r  r   r  #_update_model_kwargs_for_generationr+  cloner  	do_samplesoftmaxmultinomialsqueezeargmaxcatr  r  r   r&  r   r   r   )&r]   r  r   r  r^   r  r  r  r  model_input_namer   r  kwargs_has_attention_maskr\   r   r   input_tokensmaskr*  generated_tokensr   r  r  r   r!  
raw_scores
raw_logitsdecoder_hidden_statesdecoder_attentionsir  r  r  r  next_token_scoresprobs
next_tokenr_   s&                                        r`   r  z&JanusForConditionalGeneration.generate~  s    #JJ':D<R<RS MM*;< !**%6?f$7# -"3#	
   0(//9&9 002>;P;PR`RnRn:ooT  	""$##L$5$5$78 0@/K+QdQf %)[!++3NNrs/0,):)I)I%& 594N4N%22L5
1	#\ ")9)9vy1$MiooM^EF  %3$$>!$$%68QZcZjZj$k ++0A0P0PST0T##$IJ[JjJj$kl/3,  55/!*!3'%)- 6 
 #E$"D"D #
))>>#
 	#
	<  ::2299JJ'oo
G ''1-%))*:DA'..q!4)7%& Z[!^,0A0N0NNa(,=,O,OP^,__
 	Z[!^$11$8I8V8VW3113LA77V-t4<.2.H.H%6%K%K%Wx%>!"3">">@PSZ@Z[) /I /L*+ !;;
4D'EU[ab .??0EE)77)77"3"K"K3RD
3RD
'>CW^b$;@QRX\'( #	UA=4== +|GSL  </1=>N1O1R1RS`SgSg1h-.-9:J-K-N-N}OcOc-dL)*/djj// "3%9G  CCG\ZL"44QAX>DDFL ZZ//=F 0F C !**&7R@"..u!DLLRP
"\\*;D
%/QT" J
#;<J#--b1J HHTMG#	UJ #vi'
|11355
 "g&8&88"#%)>)>>%",*!-3 ' 7 7  $#ra   )
NNNNNNNNNr   )NNNNNNF)NNN)ri   rj   rk   _tied_weights_keysoutput_modalitiesr   r   rR   r  r  r   r   r  r*   r)   rN  r   r
   r   r   r&   r(   r   r   r  r  no_gradr   r  ro   rp   s   @r`   r  r    s   *,VW)!{ @>ell u|| 
  .215.204(,2626*.!%-.1
##d*1
 ''$.1
 t+	1

 &&-1
 1
 ((4/1
 ((4/1
   4'1
 $;1
 ell*1
 +,1
 
%1
  1
l   D
 
 U]]_ '+267;	|$t#|$ ((4/|$ .4	|$ |$ra   r  c                       e Zd ZU dZeed<   y)JanusImageProcessorKwargsz
    min_size (`int`, *optional*, defaults to 14):
        The minimum allowed size for the resized image. Ensures that neither the height nor width
        falls below this value after resizing.
    min_sizeN)ri   rj   rk   rl   r   r   rP   ra   r`   r;  r;  >  s     Mra   r;  F)totalc            "           e Zd ZdZeZdddej                  dddddddfdede	e
ef   dz  ded	ed
edeez  dedeee   z  dz  deee   z  dz  dedz  dedz  f fdZ	 	 	 ddej                   deeeeef   z  de
ez  dz  de
ez  dz  dej                   f
dZej                  ddfdej                   de	e
ef   ez  d	ede
ez  dz  de
ez  dz  dej                   fdZ e       ddddddddddddej,                  dfdededz  de	e
ef   dz  d	edz  d
edz  dedz  dedz  deee   z  dz  deee   z  dz  de
ez  dz  dedz  deeeeef   z  dz  dedz  dede
ez  dz  dej4                  j4                  f d       Z	 	 	 	 	 	 	 dded
edz  dedz  dedz  dee   dz  dee   dz  de
dz  de
dz  fdZ	 d dej                   deee   z  deee   z  de
ez  dz  dej                   f
dZ xZS )!JanusImageProcessora  
    Constructs a JANUS image processor.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
            `do_resize` parameter in the `preprocess` method.
        size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`):
            Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
            method.
        min_size (`int`, *optional*, defaults to 14):
            The minimum allowed size for the resized image. Ensures that neither the height nor width
            falls below this value after resizing.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
            overridden by the `resample` parameter in the `preprocess` method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
            `do_rescale` parameter in the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
            overridden by the `rescale_factor` parameter in the `preprocess` method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method. Can be overridden by the `do_normalize` parameter in the `preprocess` method.
        image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be
            overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
            Can be overridden by the `image_std` parameter in the `preprocess` method.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        do_pad (`bool`, *optional*, defaults to `True`):
            Whether to pad the image to square or not.
    TN   gp?	do_resizer   r<  resample
do_rescalerescale_factordo_normalize
image_mean	image_stddo_convert_rgbdo_padc                     t        |   di | || _        || _        |d| _        y t        d |D              | _        y )N)   rK  rK  c              3   8   K   | ]  }t        |d z          yw)   N)r   ).0xs     r`   	<genexpr>z/JanusImageProcessor.__init__.<locals>.<genexpr>  s     )K1#a#g,)Ks   rP   )rQ   rR   rI  r<  background_colorr2  )r]   rA  r   r<  rB  rC  rD  rE  rF  rG  rH  rI  r^   r_   s                r`   rR   zJanusImageProcessor.__init__r  sD     	"6" $3D!$))K
)K$KD!ra   r   rQ  data_formatinput_data_formatr   c                 N   t        ||      \  }}|t        j                  k(  r|j                  d   n|j                  d   }||k(  r|t	        |||      }|S |}|S t        ||      }t        |t              r|g}nt        |      |k7  rt        d| d      |t        j                  k(  r~t        j                  |||f|j                        }	t        |      D ]  \  }
}||	|
ddddf<    ||kD  r||z
  dz  }||	dd|||z   ddf<   |	S ||z
  dz  }||	dddd|||z   f<   |	S t        j                  |||f|j                        }	t        |      D ]  \  }
}||	dddd|
f<    ||kD  r||z
  dz  }||	|||z   ddddf<   |	S ||z
  dz  }||	dd|||z   ddf<   |	S )a}  
        Pads an image to a square based on the longest edge.

        Args:
            image (`np.ndarray`):
                The image to pad.
            background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
                The color to use for the padding. Can be an integer for single channel or a
                tuple of integers representing for multi-channel images. If passed as integer
                in multi-channel mode, it will default to `0` in subsequent channels.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                If unset, will use same as the input image.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the input image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.

        Returns:
            `np.ndarray`: The padded image.
        r   r   Nz(background_color must have no more than z) elements to match the number of channelsr   r/   )r   r   FIRSTr   r   r  r   r   rr  r   npr  r   	enumerate)r]   r   rQ  rR  rS  r   r   rJ   max_dimresultr3  colorstarts                r`   pad_to_squarez!JanusImageProcessor.pad_to_square  s+   < 'u.?@):>N>T>T)Tu{{1~Z_ZeZefhZiU? * ,E;@QR 
 L  
 Lfe$ &, 01!"l2:<.Hqr   0 6 66XX|Wg>ekkRF%&67 (5"'q!Qw(v~ 6)a/7<q%%&.0!34  !5Q.6;q!UUU]223  XXw>ekkRF%&67 (5"'q!Qw(v~ 6)a/7<uuv~-q!34
  !5Q.6;q%%%-/23ra   c                 r   |t        |      }t        ||      \  }}t        ||      }	t        |d      }|d   |d   k7  rt	        d|d    d|d          |d   }||	z  }
t        t        ||
z        | j                        t        t        ||
z        | j                        g}t        |f||||d|}|S )an  
        Resize an image to dynamically calculated size.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`dict[str, int]` or `int`):
                The size to resize the image to. If a dictionary, it should have the keys `"height"` and `"width"`.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
            data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `None`: will be inferred from input
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        Returns:
            `np.ndarray`: The resized image.
        Tdefault_to_squarer   r   z5Output height and width must be the same. Got height=z and width=)r   rB  rR  rS  )r   r   r  r   r   roundr<  r   )r]   r   r   rB  rR  rS  r^   r   r   max_sizedeltaoutput_size_nonpaddeds               r`   r   zJanusImageProcessor.resize  s    F $ >u E&u.?@vu%TT:>T']*GXGWWbcghocpbqr  H~x fun%t}}5eem$dmm4!

 
&#/
 
 ra   imagesreturn_tensorsc           
         ||n| j                   }||n| j                  }||n| j                  }||n| j                  }||n| j                  }||n| j
                  }|	|	n| j                  }	||n| j                  }||n| j                  }||n| j                  }||n| j                  }t        |d      }| j                  |      }t        |      }t        |      st        d      t!        |||||	|||       |r|D cg c]  }t#        |       }}|D cg c]  }t%        |       }}|r#t'        |d         rt(        j+                  d       |t-        |d         }|r"|D cg c]  }| j/                  ||||       }}|r!|D cg c]  }| j1                  |||       }}|r!|D cg c]  }| j3                  |||	       }}|r"|D cg c]  }| j5                  |||	|
       }}|D cg c]  }t7        |||       }}t9        d|i|
      }|S c c}w c c}w c c}w c c}w c c}w c c}w c c}w )a  
        Preprocess an image or batch of images.

        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`dict[str, int]`, *optional*, defaults to `self.size`):
                Controls the size of the image after `resize`. The shortest edge of the image is resized to
                `size["shortest_edge"]` whilst preserving the aspect ratio. If the longest edge of this resized image
                is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest
                edge equal to `int(size["shortest_edge"] * (1333 / 800))`.
            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image values between [0 - 1].
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to normalize the image by if `do_normalize` is set to `True`.
            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to normalize the image by if `do_normalize` is set to `True`.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            background_color (`tuple[int, int, int]`):
                The background color to use for the padding.
            do_pad (`bool`, *optional*, defaults to `self.do_pad`):
                Whether to pad the image to square or not.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                    - Unset: Return a list of `np.ndarray`.
                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        Fr^  zSInvalid image type. Must be of type PIL.Image.Image, numpy.ndarray, or torch.Tensor)rC  rD  rE  rF  rG  rA  r   rB  r   zIt looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.)r   r   rB  rS  )r   rQ  rS  )r   r   rS  r   meanstdrS  input_channel_dimr   datatensor_type)rA  rB  rC  rD  rE  rF  rG  rH  rI  rQ  r   r   fetch_imagesr   r   r   r   r   r   r   r   warning_oncer   r   r\  rescalerH  r   r   )r]   rd  rA  r   rB  rC  rD  rE  rF  rG  re  rH  rQ  rI  rR  rS  r   encoded_outputss                     r`   
preprocesszJanusImageProcessor.preprocess  s   H "+!6IDNN	'38#-#9Zt
+9+E4K^K^'3'?|TEVEV#-#9Zt
!*!6IDNN	+9+E4K^K^!-4;;/?/K+QUQfQf'tTYYTU;""6*)&1F#rss%!)%!		
 9?@nU+@F@ 6<<E.'<</&)4s
 $ >vay I $ %dXYjkF 
  $  ""%5&7 # F   $ 5RcdF 
  $ U^opF  ou
ej'{N_`
 
 '^V,DR`ae A =

s*   	H$!H)0H.H37H8H=<Ic	                    ||n| j                   }|d| j                  z  n|}||n| j                  }||n| j                  }||n| j                  }t        |      }t        |d   t        j                  j                        rt        |      dkD  r|S |d   S |t        |d         }g }	|D ]  }
t        |
      }
|r| j                  |
|||      }
|rC| j                  |
||      }
|
j                  dd      j                  t         j"                        }
|rB|r@|dk(  r;t%        |
t&        j(                  |	      }
t        j                  j+                  |
      }
|	j-                  |
        d
|	i}|dk7  r|nd}t/        ||      S )znApplies post-processing to the decoded image tokens by reversing transformations applied during preprocessing.Ng      ?r   rw   )r   rF  rG  rS  )r   rS  rM  zPIL.Image.Imagerj  r   rl  )rC  rD  rE  rF  rG  r   r   PILImagerr  r   r   unnormalizerq  clipastyperV  uint8r   r   LAST	fromarrayrw  r   )r]   rd  rC  rD  rE  rF  rG  rS  re  r   r   rm  s               r`   postprocesszJanusImageProcessor.postprocess  s    $.#9Zt
6D6Lt222R`'3'?|TEVEV#-#9Zt
!*!6IDNN	)&1fQi1 [1_6;&);$ >vay I 	'E"5)E((J)_p )  U.Tef

1c*11"((;
~AR/R3E;K;P;Pduv		++E2&!	'$ -+9=N+NTX>BBra   c                    d}t        |t              r(t        |      |k7  r t        d| dt        |             |g|z  }t        |t              r(t        |      |k7  r t        d| dt        |             |g|z  }t	        d t        ||      D              }t	        d |D              }| j                  ||||      }|S )a~  
        Unnormalizes `image` using the mean and standard deviation specified by `mean` and `std`.
        image = (image * image_std) + image_mean
        Args:
            image (`torch.Tensor` of shape `(batch_size, num_channels, image_size, image_size)` or `(num_channels, image_size, image_size)`):
                Batch of pixel values to postprocess.
            image_mean (`float` or `Iterable[float]`):
                The mean to use for unnormalization.
            image_std (`float` or `Iterable[float]`):
                The standard deviation to use for unnormalization.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        r   zmean must have z$ elements if it is an iterable, got zstd must have c              3   .   K   | ]  \  }}| |z    y wr  rP   )rN  rh  ri  s      r`   rP  z2JanusImageProcessor.unnormalize.<locals>.<genexpr>  s     WytSus{Ws   c              3   &   K   | ]	  }d |z    yw)rw   NrP   )rN  ri  s     r`   rP  z2JanusImageProcessor.unnormalize.<locals>.<genexpr>  s     ;#a#g;s   rg  )r   r   rr  r   r2  ziprH  )r]   r   rF  rG  rS  rJ   rev_image_meanrev_image_stds           r`   rw  zJanusImageProcessor.unnormalize  s    0 j(+:,. ?<.@dehisetdu!vww$4Ji*9~- >,?cdghqdrcs!tuu"l2IWC
I<VWW;;;n-Sd  
 ra   )r   NN)NNNNNNNr  ) ri   rj   rk   rl   r;  valid_kwargsr   BICUBICr   r   strr   r   r   rR   rV  ndarrayr2  r   r\  r   r+   rU  r   r'   ru  rv  rs  r}  r   rw  ro   rp   s   @r`   r?  r?  H  sH   %N -L &*'9'A'A&-!1504&*"LL 38nt#L 	L
 %L L eL L DK'$.L 4;&-L tL tL6 8959;?HzzH c3m 44H ++d2	H
 !11D8H 
H\ (:'A'A59;??zz? 38ns"? %	?
 ++d2? !11D8? 
?B %& "&&*.2"&'+$(150426&*>B"(8(>(>;?!TT $;T 38nt#	T
 %t+T 4KT T TkT DK'$.T 4;&-T j(4/T tT c3m 44t;T tT &T  !11D8!T" 
#T 'Tr #''+$()-(,(,%)1C1C 4K1C 	1C
 Tk1C K$&1C ;%1C :1C d
1Cp <@+zz+ HUO++ 8E?*	+
 !11D8+ 
+ra   r?  )	r?  r   r  r  r  r$  rr   rC   r   )r  collections.abcr   r   dataclassesr   numpyrV  r   torch.nn.functionalr   
functionalrG   r   r   activationsr	   cache_utilsr
   configuration_utilsr   
generationr   r   r   r   generation.utilsr   image_processing_utilsr   r   image_transformsr   r   r   image_utilsr   r   r   r   r   r   r   r   r   r   modeling_outputsr    r!   r"   modeling_utilsr#   r$   processing_utilsr%   r&   utilsr'   r(   r)   r*   r+   r,   r-   r.   autor0   r1   r2   blip.image_processing_blipr3   blip_2.modeling_blip_2r4   !chameleon.configuration_chameleonr5   chameleon.modeling_chameleonr6   r7   r8   r9   r:   idefics.modeling_ideficsr;   r<   llama.modeling_llamar=   siglip.configuration_siglipr>   siglip.modeling_siglipr?   r@   rA   ru  
get_loggerri   r   rC   rr   r   r   r   r   r   r   rx  r   r
  r   r  r$  r4  r?  rP  rR  rT  rV  rd  rm  r  r  r  r  r  r  r;  r?  __all__rP   ra   r`   <module>r     s    . !      & !   3 u u 9 A S S   Y X F 4	 	 	 9 8 ; 5 D  e : < ^ ^ 			H	%
^1* ^1BW+ Wtk#" k#\ i? i i$ 
	4{ 	4 	4	#A 		"? 	2 "I$299 I$XRYY (*0 *p p#
' #
LBII $" = "*	< 		8 		B 	RYY  ,J!		 J!ZA		 AH2U 2Uj299 $RYY   
n
% n

n
by$$8/ y$x	E B, BJ
ra   