
    qi|9                      d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZ d dlZd d	lmZ d d
lmZ d dl m Z  d dl!Z!d dl!m"Z"m#Z#m$Z$m%Z% d dl&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1 erd dl!m2Z2m3Z3m4Z4 ddl5m6Z6  e(       rd dl7Z7 e,       rd dl8m9Z9  e*       xr  e'       xr  e+       xr  e)       Z:e:rd dl;Z;d dl<m=Z=m>Z> d dl?m@Z@ d dlAmBZBmCZC d dlDmEZE d dlFmGZG d dlHmIZImJZJmKZK d dlLmMZM d dlNmOZOmPZPmQZQmRZR d dlNmMZS d dlTmUZU d dlVmWZWmXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZbmcZcmdZdmeZe d dlfmgZg d dlhmiZimjZjmkZk  G d  d!egd"#      Zl G d$ d%eUd"#      Zm G d& d'eGd"#      Zn ejel      Zo ejem      Zp ejen      Zqh d(Zrh d)Zsh d*Zt e1j                  ev      Zwd+d,d-d.iZx eyexj                               Z{d/Z|d0 Z}d1 Z~d2 Z G d3 d4ej                         Z	 	 	 	 	 	 d>d5Z G d6 d7      Z G d8 d9      Z G d: d;      Zd<e_        evd=k(  r e       Zyy)?    )annotationsN)	GeneratorIterable)asynccontextmanager)	lru_cache)BytesIO)Thread)TYPE_CHECKING	Annotated	TypedDict)scan_cache_dir)DecodeStream)tqdm)AutoTokenizerBitsAndBytesConfigGenerationConfigPreTrainedTokenizerBase)is_fastapi_availableis_librosa_availableis_openai_availableis_pydantic_availableis_uvicorn_availableis_vision_available   )LogitsProcessorListTextIteratorStreamer)logging)PreTrainedModelPreTrainedTokenizerFastProcessorMixin)ContinuousBatchingManager)Image)FastAPIHTTPException)CORSMiddleware)JSONResponseStreamingResponse)Transcription)TranscriptionCreateParamsBase)ChatCompletionChatCompletionMessageChatCompletionMessageParam)Choice)ChatCompletionChunkChoiceDeltaChoiceDeltaToolCallChoiceDeltaToolCallFunction)CompletionCreateParamsStreaming)ResponseResponseCompletedEventResponseContentPartAddedEventResponseContentPartDoneEventResponseCreatedEventResponseErrorResponseErrorEventResponseFailedEventResponseInProgressEventResponseOutputItemAddedEventResponseOutputItemDoneEventResponseOutputMessageResponseOutputTextResponseTextDeltaEventResponseTextDoneEvent)ResponseCreateParamsStreaming)	BaseModelTypeAdapterValidationErrorc                      e Zd ZU dZded<   y))TransformersResponseCreateParamsStreamingz
        OpenAI's ResponseCreateParamsStreaming with an additional field for the generation config (as a json string).
        strgeneration_configN__name__
__module____qualname____doc____annotations__     H/opt/pipecat/venv/lib/python3.12/site-packages/transformers/cli/serve.pyrG   rG   v       	 rQ   rG   F)totalc                      e Zd ZU dZded<   y)+TransformersCompletionCreateParamsStreamingz
        OpenAI's CompletionCreateParamsStreaming with additional fields for the generation config (as a json string) and passing the request_id
        rH   rI   NrJ   rP   rQ   rR   rV   rV   }   rS   rQ   rV   c                  4    e Zd ZU dZded<   ded<   dZded<   y	)
%TransformersTranscriptionCreateParamsz
        OpenAI's TranscriptionCreateParamsBase with an additional field for the generation config (as a json string).
        bytesfilerH   rI   FboolstreamN)rK   rL   rM   rN   rO   r\   rP   rQ   rR   rX   rX      s    	 rQ   rX   >   textuserstorepromptinclude	reasoning
background
truncationtool_choiceservice_tiertop_logprobsmax_tool_callsprevious_response_id>   nstopr^   audior_   logprobsmetadata	functions
modalities
predictionre   rf   rg   function_callstream_optionsresponse_formatpresence_penaltyreasoning_effortweb_search_optionsparallel_tool_callsmax_completion_tokens>   r`   ra   languagert   chunking_strategytimestamp_granularitiesqwenz<tool_call>z</tool_call>)startendzx-request-idc                .    dd l }|j                  |        y Nr   )torchmanual_seed)_seedr   s     rR   set_torch_seedr      s    	erQ   c                 v    dd l } | j                  j                         r| j                  j                          y y r   )r   cudais_availableempty_cache)r   s    rR   reset_torch_cacher      s*    zz 

  !rQ   c                ,    dd l }|j                  |       S r   )r   	ones_like)_input_tensorr   s     rR   torch_ones_liker      s    ??=))rQ   c                      e Zd ZdZdZdZdZy)ModalityLLMVLMSTTTTSN)rK   rL   rM   r   r   r   r   rP   rQ   rR   r   r      s    
C
C
C
CrQ   r   c                L   | j                  d      "t        di t        j                  | d         }nt	        j
                  |      } |j                  di |}|j                         D ]  \  }}|	t        |||        | j                  d      t        | d         |_
        | j                  d      t        | d         |_
        | j                  d      t        | d         |_        | j                  d      
| d   |_        | j                  d      
| d   |_        | j                  d      +t        | d         |_        t        | d         dk(  rd	|_        | j                  d
      t        | d
         |_        | j                  d      t%        | d          |S )a  
    Creates a generation config from the parameters of the request. If a generation config is passed in the request,
    it will be used as a baseline for parameterization. Otherwise, we will use the model's default generation config.
    Other parameters in the request will be applied on top of the baseline.

    Args:
        req (`dict`):
            The request which may optionally contain generation parameters.
        model_generation_config (`GenerationConfig`):
            The model's default generation config.
        kwargs (`dict`):
            Additional parameters to set in the generation config.

    Returns:
        The prepared `GenerationConfig` object.
    rI   max_output_tokens
max_tokensfrequency_penalty
logit_biasrk   temperatureg        Ftop_pseedrP   )getr   jsonloadscopydeepcopyupdateitemssetattrintmax_new_tokensfloatrepetition_penaltysequence_biasstop_stringsr   	do_sampler   r   )reqmodel_generation_configkwargsrI   non_standard_kwargskvs          rR   !create_generation_config_from_reqr      s   . ww"#/,Ttzz#>Q:R/ST MM*AB2+22<V<#))+ -1=%q!,-
 ww"#/+.s3F/G+H( ww|(+.s</@+A(
ww"#//4S9L5M/N,
ww|(*-l*;'
wwv"),V&
ww})(-c-.@(A%]#$+*/'
www#"'G"5
wwv"s6{#rQ   c                      e Zd ZdZd Zd Zy)	ToolStatez7Lightweight class to keep track of the tool call state.c                $    | j                          y N)resetselfs    rR   __init__zToolState.__init__,  s    

rQ   c                <    d| _         d| _        d| _        d| _        y)z>Reset the tool call state (assumes we're outside a tool call).Fr    N)inside_tool_callhas_tool_name_definedarg_nesting_levelbufferr   s    rR   r   zToolState.reset/  s!     %%*"!"rQ   N)rK   rL   rM   rN   r   r   rP   rQ   rR   r   r   )  s    ArQ   r   c                  >    e Zd ZdZ	 d	 	 	 	 	 d	dZd Zd Zd Zd Zy)

TimedModelz
    A class that holds a PreTrainedModel instance and its associated processor.
    Automatically deletes the instances after a specified timeout.
    Nc                    || _         t        |j                        | _        || _        || _        t        j                  | j
                  | j                        | _	        | j                  j                          y r   )modelrH   name_or_path_name_or_path	processortimeout_seconds	threadingTimertimeout_reached_timerr~   )r   r   r   r   s       rR   r   zTimedModel.__init__=  s[     
 !3!34".ood&:&:D<P<PQrQ   c                    | j                   j                          t        j                  | j                  | j
                        | _         | j                   j                          y)z2Reset the timer for the deletion of the instances.N)r   cancelr   r   r   r   r~   r   s    rR   reset_timerzTimedModel.reset_timerJ  s@    ood&:&:D<P<PQrQ   c                    t        | d      rX| j                  K| `| `d| _        d| _        t        j                          t                | j                  j                          yyy)z>Delete the wrapped model and processor and clean up resources.r   N)hasattrr   r   gccollectr   r   r   r   s    rR   delete_modelzTimedModel.delete_modelP  sX    4!djj&<
DJ!DNJJL  KK  '=!rQ   c                    | j                   dkD  r@| j                          t        j                  | j                   d| j                    d       y y )Nr   z was removed from memory after z seconds of inactivity)r   r   loggerinfor   r   s    rR   r   zTimedModel.timeout_reached_  sM    !#KK%%&&EdFZFZE[[qr $rQ   c                <    t        | d       xs | j                  du S )z)Check if the instances have been deleted.r   N)r   r   r   s    rR   
is_deletedzTimedModel.is_deletedf  s     4))?TZZ4-??rQ   r   )r   r   r   r   r   z/ProcessorMixin | PreTrainedTokenizerFast | None)	rK   rL   rM   rN   r   r   r   r   r   rP   rQ   rR   r   r   7  sE     FJ	  C	!@rQ   r   c                     e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZd Zd Z	 	 	 	 	 	 	 	 ddZddZddZddZ		 	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd	Z
edd
       Zeed d!d              Zd"dZed d#d       Zed$d       Zd%dZd&dZd'dZd&dZd(dZd)dZd*dZd+dZd,dZd-dZy).ServeNc                P    t         st        d      | _        | _        | _        | _        | _        | _        | _        | _	        |	 _
        |
 _        | _        | _        | _        | _        | _        |t#        |       t%        j&                  d      }|j)                  t$        j*                  |
j-                                   t%        j&                  d      }|j)                  t$        j*                  |
j-                                   i  _        d  _        d  _        d  _        d  _         j                   j                  rdnd _
         j                  r3 j9                   j                        }| _         j;                  |       t<        d fd       }t?        |      } j                  r2|jA                  tB        dgd	dgdg
       tD        jG                  d       ddl$m%} |jM                  d      d fd       }|jM                  d      d fd       }|jM                  d      d fd       }|jO                  d      |jQ                  d       fd              }|jQ                  d      d        }|jS                  d      dd       }tU        jV                  | j                   j                   j                        }tU        jX                  |       _-         j                   r j]                          y  jZ                  j_                          y )NzaMissing dependencies for the serving CLI. Please install with `pip install transformers[serving]`transformersz+transformers.generation.continuous_batching,  c                  K   d  j                   j                         D ]  }|j                           j                  j                  j	                  dd       y y w)NT   blocktimeout)loaded_modelsvaluesr   #running_continuous_batching_managerrk   )appr   r   s     rR   lifespanz Serve.__init__.<locals>.lifespan  s`     ++224 %""$%77C88==DRS=T Ds   A A#)r   *T)allow_originsallow_credentialsallow_methodsallow_headerszUCORS allow origin is set to `*`. This is not recommended for production environments.r   )Requestz/v1/chat/completionsc                    j                  |       j                  r&j                  || j                  j                        S j                  |      S )Nrequest) validate_chat_completion_requestcontinuous_batching#continuous_batching_chat_completionstate
request_idgenerate_chat_completion)r   bodyr   s     rR   chat_completionz'Serve.__init__.<locals>.chat_completion  sK    11$1?''??gmmF^F^__44T::rQ   z/v1/responsesc                    j                  |        | j                  dd      }|sj                  |       }t        |      S j	                  |       }t        |d      S )Nr   r\   Ttext/event-stream
media_type)validate_response_requestr   generate_response_non_streamingr&   generate_responser'   )r   r\   response_objoutputr   s       rR   	responsesz!Serve.__init__.<locals>.responses  s`    **7*;[[40F#CCGL#L11++G4F$V8KLLrQ   z/v1/audio/transcriptionsc           
       K   | j                         4 d {   }t        |d   j                          d {   |d         }t        j	                  d|d   j
                   d|d   j                   d|d   j                  dz  dd	       d d d       d {    j                  
       j                  |      }t        |d      S 7 7 7 8# 1 d {  7  sw Y   HxY ww)NrZ   r   )rZ   r   zReceived file: z; MIME type: z; size:    z.2fz KiBr   r   r   )formrX   readr   debugfilenamecontent_typesizevalidate_transcription_requestgenerate_transcriptionr'   )r   r  parsed_requestr  r   s       rR   audio_transcriptionsz,Serve.__init__.<locals>.audio_transcriptions  s      ||~ 	 	!F#F|0022w-"
 %d6l&;&;%<M$v,JcJcId e!&\..5c:$@	 	 ///G00@F$V8KLL	2	 	 	 	sU   C+CC+CC
ACC+C5C+CC+C(CC($C+z
/v1/modelsc                 <    t        d j                         d      S )Nlist)objectdata)r&   get_gen_modelsr   s   rR   get_all_modelsz&Serve.__init__.<locals>.get_all_models  s      64;N;N;P QRRrQ   z/healthc                     t        ddi      S )Nstatusok)r&   rP   rQ   rR   healthcheckz#Serve.__init__.<locals>.healthcheck"  s    4 011rQ   httpc                   K   | j                   j                  t              xs t        t	        j
                               }|| j                  _         ||        d {   }||j                   t        <   |S 7 wr   )headersr   X_REQUEST_IDrH   uuiduuid4r   r   )r   	call_nextr   responses       rR   get_or_set_request_idz-Serve.__init__.<locals>.get_or_set_request_id&  s]      ,,\:Oc$**,>OJ'1GMM$&w//H-7H\*O 0s   AA9A7A9)hostport	log_level)r   r#   )r   r   r   dictr   r'  )r   r   )0serve_dependencies_availableImportErrorr   devicedtypetrust_remote_codeattn_implementationquantizationr$  r%  model_timeoutr&  default_seedenable_corsinput_validationforce_modelnon_blockingr   r   
get_loggersetLevel
log_levelslowerr   r   last_messageslast_kv_cache
last_modelprocess_model_nameload_model_and_processorr   r#   add_middlewarer%   r   warning_oncefastapir   postoptionsr   
middlewareuvicornConfigServerserverstart_serverrun)r   r   r+  r,  r-  r.  r/  r$  r%  r0  r&  r1  r2  r3  r4  r5  transformers_logger	cb_loggermodel_id_and_revisionr   r   r   r   r  r  r  r  r#  configs   `                            rR   r   zServe.__init__n  s   t ,s 
 $7 
!2#6 (		*"(& 0&( #<( &00@$$W%7%7	8I%JK&&'TU	7--ioo.?@A 57UY0 "!%'+'7'7SD$($;$;D<L<L$M!3DO))*?@		U 
	U x( "e"&"e"e   g 	$	(	)	; 
*	; 
/	"		M 
#		M 
,	-	M 
.	M" 
\	"			S 
 
#	S 
		2 
	2 
		 
 	 $))$))t~~^nnV,KKOOrQ   c                ~      fd}t        j                  |dd       _         j                  j                          y )Nc                     t        j                          _        t        j                   j                          j                  j	                   j
                  j                                y r   )asyncionew_event_loop_loopset_event_looprun_until_completerH  server   s   rR   _runz Serve.start_server.<locals>._run7  sD     //1DJ""4::.JJ))$++*;*;*=>rQ   zuvicorn-threadF)targetnamedaemon)r   r	   _threadr~   )r   rW  s   ` rR   rI  zServe.start_server6  s2    	? !''t:JSXYrQ   c                &   | j                   st        d      | j                   j                         st        d      d| j                  _        | j                   r8| j                   j                         r| j                   j                  d       y y y )NzHThe server cannot be killed as it was not launched in a separate thread.zThe server is already killed.Tr   )r   )r[  
ValueErroris_aliverH  should_exitjoinr   s    rR   kill_serverzServe.kill_server@  so    ||ghh||$$&<=="&<<DLL113LLa( 4<rQ   c                   t         j                  d|        t        |j                               }|j                  }||z
  }|r(t         j                  d|        t        dd|       | j                  rB	 |j                  |       ||z  }	|	r(t         j                  d|	        t        dd|	       yy# t        $ rF}t         j                  d|j                                 t        d|j                               d}~ww xY w)a  
        Validates the request against the schema, and checks for unexpected keys.

        Args:
            request (`dict`):
                The request to validate.
            schema (`TypedDict`):
                The schema of the request to validate. It is a `TypedDict` definition.
            validator (`TypeAdapter`):
                The validator to use to validate the request. Built from `schema`.
            unused_fields (`set`):
                Fields accepted by `schema`, but not used in `transformers serve`.

        Raises:
            HTTPException: If the request is invalid or contains unexpected or unused fields.
        zValidating request: z Unexpected keys in the request: i  )status_codedetailzValidation error: NzUnused fields in the request: )r   r	  setkeys__mutable_keys__errorr$   r3  validate_pythonrE   errors)
r   r   schema	validatorunused_fields
input_keyspossible_keysunexpected_keyseunused_fields_in_requests
             rR   _validate_requestzServe._validate_requestK  s   . 	+G956 (
//$}4LL;O;LMNC:Z[jZk8lmm  H))'2 (2M'A$'=>V=WXY# #.LMeLf,g  ( ! # H1!((*>?#AHHJGGHs   :B< <	DADDc                F    | j                  |t        t        t               y N)r   rk  rl  rm  )rs  rG   response_validatorUNUSED_RESPONSE_FIELDSr   r   s     rR   r   zServe.validate_response_request|  s!    <(0	 	 	
rQ   c                F    | j                  |t        t        t               y ru  )rs  rV   completion_validatorUNUSED_CHAT_COMPLETION_FIELDSrx  s     rR   r   z&Serve.validate_chat_completion_request  s!    >*7	 	 	
rQ   c                F    | j                  |t        t        t               y ru  )rs  rX   transcription_validatorUNUSED_TRANSCRIPTION_FIELDSrx  s     rR   r  z$Serve.validate_transcription_request  s!    8-5	 	 	
rQ   c	                    | |||j                  |j                  |      }t        |t        t	        j                               |t        t        |||      d|      gdd      }	|	S )a  
        Builds a chunk of a streaming OpenAI Chat Completion response.

        IMPORTANT: The serialized chunk won't contain empty fields (fields with `None`). Some downstream apps,
        like Cursor, assume that when the field exists, it has data.

        Args:
            request_id (`str`):
                The request ID.
            content (`str`, *optional*):
                Content of the response from the model.
            model (`str`, *optional*):
                The model that generated the content.
            role (`str`, *optional*):
                The role of the next content, until a new role is defined.
            finish_reason (`str`, *optional*):
                The reason the generation by the model has finished.
            tool_calls (`list[ChoiceDeltaToolCall]`, *optional*):
                Data about the tool calls, when they are triggered.

        Returns:
            `str`: The built chunk, a string containing a JSON string with the payload.
        )contentrole
tool_callsr   )deltaindexfinish_reasonr   zchat.completion.chunk)idcreatedr   choicessystem_fingerprintr  )step
_tokenizerr.   r   timeChoiceChunkr/   )
r   r   r  r   r  r  r  decode_stream	tokenizerchunks
             rR   build_chat_completion_chunkz!Serve.build_chat_completion_chunk  s    D $)<AV#(()=)=wGG#		$% '!#-
 "/
  "*!
& rQ   c                    t        | t              r| j                  d      r| S d|  dS d| j                  d       dS )a/  
        Builds an event of a streaming OpenAI Response model or a ChatCompletion chunk.

        IMPORTANT: The serialized chunk won't contain empty fields (fields with `None`). Some downstream apps,
        like Cursor, assume that when the field exists, it has data.

        Args:
            chunk (`BaseModel` or `ChatCompletionChunk`):
                The response to build an event from. One of the multiple OpenAI Response output types

        Returns:
            `str`: The built chunk, a string containing a JSON string with the payload.
        zdata: z

Texclude_none)
isinstancerH   
startswithmodel_dump_json)r  s    rR   chunk_to_sse_elementzServe.chunk_to_sse_element  sM     eS!!,,X65PfUG4<PP--4-@AFFrQ   c           	        ddl m}m} g }t        j	                  d       t        t        |       j                        D ]@  }|j                  dk7  r|j                  }|j                         D ]  \  }}|j                  }t        d |D        d      }	|	s)t        j                  |	j                         j!                               }
t#        |
t$              rd|
v so|
d   }|j'                         |j'                         t)        fd|D              sd	|j*                  v r|j*                  j-                  d	      nd
}|j*                  |dk7  rd| nd
z   }|j/                  ||d|j0                  d        C |S )z2
        List LLMs and VLMs in the cache.
        r   !MODEL_FOR_CAUSAL_LM_MAPPING_NAMES*MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMESz/Scanning the cache directory for LLMs and VLMs.r   c              3  T   K   | ]   }|j                   d k(  s|j                   " yw)zconfig.jsonN)	file_name	file_path).0fs     rR   	<genexpr>z'Serve.get_gen_models.<locals>.<genexpr>  s      #_A!++Q^B^AKK#_s   ((Narchitecturesc              3  4   K   | ]  }|g v s|  y wr   rP   )r  archllmsvlmss     rR   r  z'Serve.get_gen_models.<locals>.<genexpr>  s      P4$9OtPs   /r   main@)owned_byr  r  r  )&transformers.models.auto.modeling_autor  r  r   warningr   r   repos	repo_typerefsr   filesnextr   r   openr  r  r'  r   anyrepo_idsplitappendlast_modified)	cache_dirr  r  generative_modelsrepor  refrevision_infor  config_pathrN  r  authorrepo_handler  r  s                 @@rR   r  zServe.get_gen_models  sg   	

 HI	2889 	D~~(99D&*jjl "]%++"#_#_aef"K$4$4$6$;$;$=>"640_5N & 78??AAHHJPPP8;t||8KT\\//4QSF"&,,sf}AcU)RT"UK%,,(."-&-'+'9'9	'	B ! rQ   c           	        
  j                  |d          j                  k7  } _        |r0 j                  $ j                  j                  dd       d _         j	                        \  }}t        |d      r|j                  n|t        ||j                  j                  j                  ddd	       j                  J|j                  
       _        t                j                  _         j                  j                          |j                  |d   ddd      j!                  |j"                        d   d    fd fd
 fd}
 fd} j                  j%                  |j&                  |j)                  d            }|j)                  d      rt+         ||      d      S  ||      }|j-                  d      }	t/        |	d      S )a'  
        Generates an OpenAI Chat Completion using continuous batching.

        Args:
            req (`dict`): The request to generate an OpenAI Chat Completion for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Chat Completion chunks.
        r   NTr   r   r  Ffifo)r   eos_token_idpad_token_id	use_cacher   	scheduler)rI   messagespt)return_tensorsadd_generation_promptreturn_dict	input_idsr   c              3    K   ddl m} 	 j                  | d       d}j                  j	                  |       D ]  }|dz  }|j
                  r'|j
                  d   }j                  | ||       |j                  |j                  k(  sU
j                  d uxr |
j                  k\  }t        d	      r|j                  k(  }|xr | }|rd
nd}j                  | |        y  y # t        $ rT}	t        j                  t        |	             j                  j                  |        dt        |	       d Y d }	~	y d }	~	ww xY ww)Nr   )RequestStatus	assistantr  r   r      r   )r   r  r   r  r  	eos_tokenlengthrk   r  r   data: {"error": ""})generation.continuous_batchingr  r  r   request_id_itergenerated_tokensr  FINISHEDr   r   r  	Exceptionr   rh  rH   cancel_request)r   r  r  n_tokens_generatedresulttoken_idgenerated_all_tokensfinal_token_is_eosreasonrq  rI   rM  r   r  s             rR   stream_chat_completionzIServe.continuous_batching_chat_completion.<locals>.stream_chat_completionD  s    F+7 66z[p6qq%&""FFVVWab  F&!+& ..#)#:#:2#>">>'1$,"7*7&/ ?   }}(>(>>-<<DH W 26G6V6V V - #9k:179;N;N1N.3G3bPbLb0-Av">>&*0"7 ?  
 A D  7SV$88GG
S*3q6(#667sC   EBC4 AC4 1E2C4 3E4	E=A
EEEEc                t   d }j                   j                         r<|:j                   j                  | d      }j                   j                         r|:j                  |j                        }t        | t        t        j                               dt        dt        |d      d      g	      }|S )
Nr  )r   r   chat.completionr   r  r  r  rk   r  messager  )r  r  r  r   r  )
r   
is_running
get_resultdecoder  r*   r   r  r-   r+   )_request_idr  r  chat_completion_resultrM  r   r  s       rR   buffer_chat_completionzIServe.continuous_batching_chat_completion.<locals>.buffer_chat_completiont  s    F::EEGFNAALLXcmnLo ::EEGFN  &&v'>'>?G%3DIIK((+ 5gK X&,	&"" *)rQ   c               `  K   	 t        j                         d      } | |      D ]3  }j                  |       t        j                  d       d {    5 y 7 # t        j
                  $ r7 j                  j                  |        t        j                  d|  d       Y y w xY ww)NFr   Request  was cancelled.)
r   tolistr  rQ  sleepCancelledErrorr   r  r   r  )r  r  _chunkinputsr   r  s      rR   cancellation_wrapper_streamzNServe.continuous_batching_chat_completion.<locals>.cancellation_wrapper_stream  s     H ,V]]_e D4[-P +F33F;;!--***+*)) H88GGT+oFGHs<   B.AA! AA! B.A! !AB+(B.*B++B.c                    	  |       S # t         j                  $ r7 j                  j                  |        t        j                  d|  d       Y y w xY w)Nr  r  )rQ  r  r   r  r   r  )r  r  r   s    rR   cancellation_wrapper_bufferzNServe.continuous_batching_chat_completion.<locals>.cancellation_wrapper_buffer  sT    H-k::)) H88GGT+oFGHs    AAAr\   )r   r   	streamingr   r   r  application/json)r=  r<  r   rk   r>  r   r  r   rI   r  r  init_continuous_batchingr   logit_processorr~   apply_chat_templatetor+  add_requestr   r   r'   r  r&   )r   r   r   must_discard_cacher   r   r  r  r  
json_chunkr  rI   r  rM  r  r  s   `         @@@@@@rR   r   z)Serve.continuous_batching_chat_completion  s    !% 7 7G E2dooE/ 77C88==DRS=T;?8889NOy+29k+JI''PY	=$)$;$;"//"//
 33;7<7U7U"3 8V 8D4
 H[G\D44D44::< ..
ODZ^ / 

"U\\
;(()+.	7`	*4		H	H ==IIz:K:Z:Zfifmfmnvfw J 

 778$%@%LYlmm/
;E..D.AJ
7IJJrQ   c                .   | t        |t              rt        j                  S ddlm}m} | j                  j                  }||j                         v rt        j                  }|S ||j                         v rt        j                  }|S t        d|       )Nr   r  zUnknown modality: )r  r   r   r   r  r  r  	__class__rK   r   r   r]  )r   r   r  r  model_classnamemodalitys         rR   get_model_modalityzServe.get_model_modality  s     )%<=||#	

  //22HOOQQ||H   A H H JJ||H  1/1BCDDrQ   c           	        g }| D ]  }|d   g d}|t         j                  k(  rmt        |d   t              r|d   }nMt        |d   t              r:g }|d   D ]  }|d   dk(  s|j                  |d          ! dj                  |      }|d<   n(|t         j                  k(  rt        |d   t              r|d   j                  d|d   d       n|d   D ]  }|d   dk(  r|d   j                  |        |d   dk(  s)d	|d   d
   v rt        j                  dd|d   d
         }t        j                  t        t        j                  |                  }t        j                   dd      }	|	j"                  }
|j%                  |	j"                         n|d   d
   }
|d   j                  d|
d        |j                  |        |S )Nr  r  r  r  typer]    )r
  r]   	image_urlbase64urlz^data:image/.+;base64,r   z.pngF)suffixdeleteimage)r
  r  )r   r   r  rH   r  r  r`  r   resubr"   r  r   r  	b64decodetempfileNamedTemporaryFilerY  save)r  r  processor_inputsr  parsed_messageparsed_contentr  
image_datar  rZ   r  s              rR   *get_processor_inputs_from_inbound_messagesz0Serve.get_processor_inputs_from_inbound_messages  s    '	4G&-fo"EN8<<' gi0#6%,Y%7N	 2D9%'N#*9#5 C"6?f4*11'&/BC &)XXn%=N,:y)X\\) gi0#6"9-44fgV_N`5ab#*9#5 \"6?f4*95<<WE$V_;'7;+?+FF-/VV4LbRYZeRfglRm-n
(-

76;K;KJ;W3X(Y'/'B'B&Y^'_&*ii %

499 5&-k&:5&A*95<<gVY=Z[\  ##N3O'	4P  rQ   c                4     j                    j                   |d<   |d   }|d   d   dk(  ry j                  |d          j                  k7  } _         j                        \  } j	                  |      } j                  ||      }dt        D ]/  }|j                  j                  d   j                         v s-| n |j                  |d	|j                  d
      dd	d	      }|j                  j                        }|j                  dd      d	}	dj                  j                  d   j                         v rd}	t        ||	d	      }
t        |j                         d} j#                  |      r=|s; j$                  j'                         }|d   j(                  d   |kD  r j$                  }i ||
d	|d fd}|j                  d      r(t+        t-         j.                   ||
            d      S g }d} ||
      }d}|D ]  }|j0                  d   }t3        |j4                  dd      r%|j7                  |j4                  j8                         |j:                  r|j:                  }t3        |dd      st|j<                  } t?        tA        tC        jB                               dtE        dtG        djI                  |      d      |      g|       }|jK                  d	!      }tM        |d"      S )#a  
        Generates an OpenAI Chat Completion using `generate`.

        Args:
            req (`dict`): The request to generate an OpenAI Chat Completion for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Chat Completion chunks.
        Nr   r  r   r  r  )r   r   Ttoolsr  )r  r  r  r  tokenizer   req_0gptossFskip_special_tokensskip_promptr   r  )streamerrI   return_dict_in_generatepast_key_valuesc              3  6  K   d}d }dj                   j                  d   j                         v rd}d}fd}t        |      }d}	 |j	                          t               }j                  d	
       d}d}	| D ](  }|	dz  }	dj                   j                  d   j                         v r|j                  d      }||z  }|r||v rd}QR|j                         t           d   k(  rd|_
        z|j                         t           d   k(  r(|j                          j                  |d d       |j                  r@|xj                  |z  c_        |j                  sYt        j                  d|j                        }
|
|
j!                  d      }
d|_        t#        t%        |
      dd|dz         }n|dk(  rMd|j                  vr]|xj&                  |j)                  d      z  c_        |xj&                  |j)                  d      z  c_        |j&                  dk  r&dj+                  |j-                  d      d d       dz   }t#        t%        |      dd      }j                  |d |g       |dk7  sj                  ||       + j.                  d uxr |	j.                  k\  }t1        | j2                  d      r || j2                  j4                  k(  }|xr | }|rdnd }j                  ||!       |j+                          |j+                          y # t6        $ r9}t8        j;                  t=        |             d"t=        |       d# Y d }~Nd }~ww xY w# |j+                          w xY ww)$NFr!  r   T<|channel|>final<|message|>c                 L     j                   di | }|j                  _        y NrP   generater(  r;  r   generate_outputr   r   s     rR   generate_with_cachez[Serve.generate_chat_completion.<locals>.stream_chat_completion.<locals>.generate_with_cacheH  $    "0%..":6":%4%D%D"rQ   rX  r   r   r  r  r  
<|return|>r~   r   r  )r   r  r  r   z\"name\": \"(.*?)\")rY  function
_tool_call)r5  r  r
  r  z"arguments": {{})	arguments)r5  r  r
  )r   r  r  r   )r  r   r  r  rk   r  r  r  )rN  r  r9  r	   r~   r   r  removesuffixstrip_TOOL_CALL_TOKENSr   r   r   r   r  searchgroupr0   r1   r   countr`  r  r   r   r  r  r  r   rh  rH   )r&  r  
filter_cotcot_trace_endr1  threadresults
tool_stater  r  	tool_nametoolr  r  r  rq  rI   generation_kwargsr   rM  r   r   tool_model_familys                   rR   r  z>Serve.generate_chat_completion.<locals>.stream_chat_completion>  s     J M5<<55a8>>@@!
 =E #6?PQFG|&[
 66z[p6qq%&"& [F&!+&  5<<#=#=a#@#F#F#HH!'!4!4\!Bv%G "(G3).J$$ )4!<<>->?P-QRY-ZZ:>J7$ "<<>->?P-QRW-XX&,,."&"B"B+6%).:&;	 #C #  %%66&--7- $.#C#C,.II6LjN_N_,`	#,#4$,090BICG
 @':-Hi-X*+)3'2\'A	(" $*R<$, $4:;L;L#L$, !+ < <S@Q Q < * < <S@Q Q <#-#?#?!#C-/WWV\\#5Fs5K-Ls-RF':-HSY-Z*+)3(" #'"B"B+6%),06&;	 #C #  % |">>'?T ?  s[| &44D@ O*.?.N.NN % 8--{;)/83E3E3O3O)O&+?+ZHZDZ(%9v66{RX`u6vv   7SV$*3q6(#667
 sD   ANIL? BL? .N?	N/M<7N <NN NNr\   r   r   rk   r  usager  r   r  r  )r  r  r  r   r  rJ  r  r  )'r4  r=  r<  r>  r  r  _MODELS_WITH_TOOL_SUPPORTrN  r  r9  r  r   r  r+  r   r   rI   is_continuationr;  get_seq_lengthshaper'   mapr  r  getattrr  r  r  r  rJ  r*   r   r  r-   r+   r`  
model_dumpr&   )r   r   r  r  r   r  r  supported_model_familiesr  r#  generation_streamerr;  seq_lenr  r  r  	generatorrJ  r  choicer  r  rI   rH  r   rM  r   rI  s   `                     @@@@@@rR   r   zServe.generate_chat_completion  sH    '++CL9<Z B<;. $ 7 7G E2dooE/889NOy**5I*FJJ8U]^ !(A 	$'5<<+E+Ea+H+N+N+PP$<!	 .."&'''" / 
 5<<(WW\73
 #u||11!4::<<"'2 3

 >c[`[r[rs$-?((779Gk"((,w6 $ 2 2

+!2'+,
M	 M	^ 778$D--/EFY[e/fg. 
 G"M./BJOIE" (q)6<<D9NN6<<#7#78''$*$8$8M5'40!KKE( &4DIIK((+ 5bggg>NU` a&3	 &"" ,66D6IF3EFFrQ   c                0     j                  d          j                  k7  } _         j                        \  }t        d   t              r'dv r	dd   dgng }|j                  dd   d       nt        d   t              r8dv r.d   d   d   dk7  rdd   dgd   }nYd   }d   |d   d	<   nHd   }nBt        d   t              r$dv r	dd   dgng }|j                  d          nt        d
      |j                  |ddd      d   }|j                  j                        }j                  dd      d}dj                  j                  d   j                         v rd}t!        ||d      }t#        j$                        }d} j'                        r=|s; j(                  j+                         }	|d   j,                  d   |	kD  r j(                  }|t/        |      ||d|d fd}
 |
|      S )a	  
        Generates an OpenAI Response using `generate`.

        Args:
            req (`dict`): The request to generate an OpenAI Response for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Response events.
        r   inputinstructionssystemr	  r^   r   r  r  %inputs should be a list, dict, or strTr  r  r  r  r  ri   r   r!  Fr"  r%  Nr   )r  attention_maskr&  rI   r'  r(  c              3  ~	  K   d}d }dj                   j                  d   j                         v rd}d}fd}t        |      }d}d}d}	 |j	                          t        j
                         }	t        d|t        d	 |	d
j                  d      dddiidg g j                  dd      dj                  d                  }
|dz  }j                  |
       t        d|t        d	 |	dj                  d      dddiidg g j                  dd      dj                  d                  }|dz  }j                  |       t        d||t        d dddg             }|dz  }j                  |       t        dd |||t        dd g !      "      }|dz  }j                  |       d }| D ]  }dj                   j                  d   j                         v r|j                  d#      }||z  }|r7||v rd}d }Mt!        d$d ||||g %      }|dz  }j                  |       {|s~t!        d$d ||||g %      }|dz  }j                  |        t#        d&d ||d|g '      }|dz  }j                  |       t%        d(d |||t        d|j&                  g !      "      }|dz  }|dz  }j                  |       t)        d)||t        d dd*d|j*                  gg +            }|dz  }|dz  }j                  |       t-        d,|t        d	 |	d*j                  d      dddii|j.                  gdg j                  dd      dj                  d      -            }|dz  }j                  |       |j1                          |j1                          y # t2        $ r}t4        j7                  d.t9        |              t;        d/|t9        |      0      }|dz  }j                  |       t=        d1|t        d	 	d2j                  d      dddiig dg ddj                  d      t?        d3t9        |      4      5            }|dz  }j                  |       Y d }~d }~ww xY w# |j1                          w xY ww)6NFr!  r   Tr*  c                 L     j                   di | }|j                  _        y r,  r-  r/  s     rR   r1  zMServe.generate_response.<locals>.stream_response.<locals>.generate_with_cacheD  r2  rQ   r3  zresponse.createdresp_queuedrY  formatr
  r]   r"  rx   autorn   )r  
created_atr  r   rY  r]   r  r  r  rx   re   rn   )r
  sequence_numberr"  r  zresponse.in_progressin_progresszresponse.output_item.addedmsg_r  r  )r  r
  r  r  r  )r
  re  output_indexitemzresponse.content_part.addedoutput_textr   r
  r]   r   )r
  item_idre  rh  content_indexpartr4  zresponse.output_text.delta)r
  rl  re  rh  rm  r  rm   zresponse.output_text.done)r
  rl  re  rh  rm  r]   rm   zresponse.content_part.donezresponse.output_item.done	completedr  r
  r  r  r  r   zresponse.completedr  rd  r  r   rY  r]   r  r  r  rx   re   rn   z"Exception in response generation: rh  )r
  re  r  zresponse.failedfailedserver_error)coder  )r  rd  r  r   rY  r]   r  r  r  rx   re   rn   rh  ) rN  r  r9  r	   r~   r  r7   r3   r   r  r;   r<   r>   r5   r?   r;  r@   rA   r6   r]   r=   rn  r4   ri  r`  r  r   rh  rH   r9   r:   r8   )r&  r  rA  rB  r1  rC  re  rh  rm  rd  response_createdresponse_in_progressresponse_output_item_addedresponse_content_part_addedrD  r  response_output_text_deltaresponse_output_text_doneresponse_content_part_doneresponse_output_item_doneresponse_completedrq  error_eventresponse_failedrH  r   rM  r   r   r   s                           rR   stream_responsez0Serve.generate_response.<locals>.stream_response:  s     J M5<<55a8>>@@!
 =E #6?PQFOLMY!YY[
 $8+$3%":,/#-'3%(WW^%<&(89) !,/GG4I5,Q$*!$!4$ $  1$//0@AA'>/$3%":,/#-,3%(WW^%<&(89) !,/GG4I5,Q$*!$!4($$  1$//0DEE .J5$3!-.!*.Y}[fpr	.*  1$//0JKK /L6":,/$3!-"/+RUWX/+  1$//0KLL & %XF5<<#=#=a#@#F#F#HH!'!4!4\!Bv%G "(G3).J&(G$9O%A*.zl(;0?-9.;&,)+:6 ,q0O"&";";<V"WW "9O%A*.zl(;0?-9.;&,)+:6 ,q0O"&";";<V"WWK%XP -B4":,/$3!-"# -)  1$//0IJJ .J5":,/$3!-"/+E^EcEcqst.*  1$"//0JKK -H4$3!-.!*.&*(!;!@!@ A$&	-)  1$!//0IJJ &<-$3%":,/#-*3%(WW^%<&(89 9 > >?) ,/GG4I5,Q$*!$!4&"$  1$//0BCCJ I  !AA#a&JK0 $3F
  1$//<<"5*$3%":,/#-'3%(WW^%<&(89!) ,1$*!$!4+!/$'F#,  1$//@@C!AH sE   AR=G5O E7O <R=	R%CR R(  R%%R( (R::R=)r=  r<  r>  r  rH   r  r  r'  	TypeErrorr  r  r+  r   rN  r  r9  r   r   rI   rL  r;  rM  rN  r   )r   r   r  r   r  r#  rS  rI   r;  rT  r  rH  r   rM  r   s   ``         @@@@rR   r  zServe.generate_response  sj    !% 7 7G E2dooE/889NOyc'lC(M[_bMbxC4GHIhjFMM6c'lCDGd+$w<?6*h6'/C<OP`SVW^S_`F \F+.~+>F1Ii(WGd+M[_bMbxC4GHIhjFMM#g,'CDD..$tQU / 

 5<<(WW3W=
 #u||11!4::<<"'2 3

 >c[`[r[rs$-?((779Gk"((,w6 $ 2 2 -f5+!2'+,
l	 l	\ 2J??rQ   c                   | j                  |d         }|| j                  k7  }|| _        | j                  |      \  }}t        |d   t              r'd|v r	d|d   dgng }|j                  d|d   d       nt        |d   t              r8d|v r.|d   d   d   dk7  rd|d   dg|d   }nY|d   }|d   |d   d	<   nH|d   }nBt        |d   t              r$d|v r	d|d   dgng }|j                  |d          nt        d
      |j                  |ddd      d   }|j                  |j                        }|j                  dd      }d}d|j                  j                  d   j                         v rd}t!        ||j"                        }	d}
| j%                  |      r:|s8| j&                  j)                         }|j*                  d   |kD  r| j&                  }
|j-                  |t/        |      |	d|
      }|j0                  | _        |j3                  |j4                  |      d   }t7        j6                         }t9        d| dddt;        d|g       gg       }t=        d| |d||j                  d      d d!d"ii|gd#g |j                  d$d      d%|j                  d&      '      }|j?                  d(      S ))a  
        Generates an OpenAI Response in non-streaming mode (single JSON payload).

        Args:
            req (`dict`): The request to generate an OpenAI Response for.

        Returns:
            `dict`: The OpenAI `Response` serialized as a dict.
        r   rX  rY  rZ  r	  r^   r   r  r  r[  Tr  r\  r  ri   r   r!  Fr%  Nr   )r  r]  rI   r'  r(  r#  rg  r  ro  r  rj  rk  rp  r`  rb  r
  r]   r"  rx   rc  rn   rq  r  ) r=  r<  r>  r  rH   r  r  r'  r]  r  r  r+  r   rN  r  r9  r   rI   rL  r;  rM  rN  r.  r   r(  batch_decode	sequencesr  r>   r?   r3   rQ  )r   r   rM  r  r   r   r  r   r#  rI   r;  rT  r0  	full_textrd  response_output_itemr}  s                    rR   r   z%Serve.generate_response_non_streaming*  s    !% 7 7G E2dooE/889NOyc'lC(M[_bMbxC4GHIhjFMM6c'lCDGd+$w<?6*h6'/C<OP`SVW^S_`F \F+.~+>F1Ii(WGd+M[_bMbxC4GHIhjFMM#g,'DEE..$tQU / 

 5<<(WW3W=
 #u||11!4::<<"'=c[`[r[rs$-?((779G||B') $ 2 2..*62/$() ) 
 -<< **?+D+DZm*nopq	YY[
4j\"']XZ[\ 
 &zl#!'0VV,-() #(=u EWWZ(
 ",,$,??rQ   c                (  
 t               st        d      | j                  |d         }| j                  |      \  t	        j
                  dd      }t        |j                        }j                  j                  }t        j                  |d         }t        j                  ||d      \  }} ||d	      j                  j                        

d
   j                  j                         
d
<   ||dd
fd}	 |	       S )a  
        Generates an OpenAI Transcription using the audio file.

        Args:
            req (`dict`): The request containing the audio file and model information.

        Returns:
            `Generator[str, None, None]`: A generator that yields the transcription result.
        z]Missing librosa dependency for audio transcription. Please install with `pip install librosa`r   Tr"  r%  rZ   )srmonor  )sampling_rater  input_features)r&  rI   r'  c               3     K    j                   di } j                  | j                  d      d   }t        |      }|j	                  d        y w)NTr  r   )r]   r  rP   )r.  r  r  r(   r  )generated_idstranscription_texttranscriptionaudio_inputsaudio_modelaudio_processorrH  s      rR   _generate_transcriptionz=Serve.generate_transcription.<locals>._generate_transcription  sg     0K00U<UCTUM!0!=!=m>U>Uko!=!pqr!s)/ABM"222EFGs   AA)r   r*  r=  load_audio_model_and_processorr   r  r   rI   feature_extractorr  ior   librosaloadr  r+  r,  )r   r   rM  rS  rI   model_sampling_rateaudio_bytesaudio_array_r  r  r  r  rH  s             @@@@rR   r  zServe.generate_transcription  s"    $%o  !% 7 7G E'+'J'JK`'a$_2%%4T
 >)F)F

 .??MMjjV- k6IPTUQ&{BUfjknn
 *66F)G)J)J;K\K\)]%& ,!2'+
	H '((rQ   c                >   |j                  d      xs |j                  d      }d}| j                  d}n`t        | j                        t        |      k\  rd}n<t        t        | j                              D ]  }| j                  |   ||   k7  sd} n || _        |S )aD  
        Determines whether the current request is a continuation of the last request. In other words, if it is the
        same chat session.

        Args:
            req (`dict`): The request to check.

        Returns:
            `True` if the request is a continuation of the last request, `False` otherwise.
        r  rX  TF)r   r:  lenrange)r   r   r  req_continues_last_messagesis        rR   rL  zServe.is_continuation  s     77:&:#'''*:&*# %*/'##$H5*/' 3t1123 %%a(HQK727/
 &**rQ   c                    | j                   dk(  rt        ddd      }n| j                   dk(  rt        d      }nd}|t        j                  d|        |S )	z
        Returns the quantization config for the given CLI arguments.

        Returns:
            `Optional[BitsAndBytesConfig]`: The quantization config.
        zbnb-4bitTnf4)load_in_4bitbnb_4bit_quant_typebnb_4bit_use_double_quantzbnb-8bit)load_in_8bitNz0Quantization applied with the following config: )r/  r   r   r   )r   quantization_configs     rR   get_quantization_configzServe.get_quantization_config  si     
*"4!$)*.#
 *,"4$"G"&*KKJK^J_`a""rQ   c                H    | j                   | j                   }d|v r|S | dS )aR  
        Applies the `force_model` CLI argument and canonicalizes the model name to the format "model_id@revision".
        If the model_id DOESN'T contain an @, it defaults to "model_id@main".

        Args:
            model_id (`str`): The model ID.

        Returns:
            `str`: The canonicalized model name to be used
        r  z@main)r4  )r   model_ids     rR   r=  zServe.process_model_name  s4     '''H(?O5!!rQ   c                   ddl }ddlm}m} t        j                  d|        d|v r|j                  dd      \  }}n|d}}	 |j                  ||| j                        }| j                  d
v r| j                  nt        || j                        }| j                         }	|| j                  || j                  | j                  |	d}
 |j                  |fi |
}t        t        |j                   d         } |j                  |fi |
}|j"                  j$                  du xr |j"                  j&                  dk(  }|j"                  j$                  duxr |j"                  j$                  dk  }|s|rd|j"                  _        t        j                  d|        ||fS # t        $ r@ 	 t        j                  ||| j                        }n# t        $ r t        d	      w xY wY w xY w)a  
        Generic method to load a model and a data processor from a model ID and revision, making use of the serve CLI
        arguments.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.
            model_cls (`type[PreTrainedModel]`):
                The model class to load.

        Returns:
            `tuple[PreTrainedModel, Union[ProcessorMixin, PreTrainedTokenizerFast]]`: The loaded model and
            data processor (tokenizer, audio processor, etc.).
        r   N)
AutoConfigAutoProcessorzLoading r  r  r  )revisionr-  zBFailed to load processor with `AutoProcessor` and `AutoTokenizer`.)rc  N)r  r.  r,  
device_mapr-  r     r  zLoaded model )r   r   r  r  r   r   r  from_pretrainedr-  OSErrorr   r,  rP  r  r.  r+  r  rI   r   
max_length)r   rM  r   r  r  r  r  data_processorr,  r  model_kwargsrN  architecturer   has_default_max_lengthhas_short_max_new_tokenss                   rR   _load_model_and_data_processorz$Serve._load_model_and_data_processor  s    	:h4567''!6!<!<S!!DHh!6hH	d*::!"&"8"8 ; N #jjN:

tzz@Z"::< !#'#;#;++!%!7!7#6
 ,++HEE|V-A-A!-DE,,,XFF ##22d:gu?V?V?a?aeg?g 	 ##22$>p5CZCZCiCilpCp 	! "%=59E##2m$9#:;<n$$I  	dd!.!>!>%&*&<&<"
  dbccd 	ds*   F# #	G,-"GG,G%%G,+G,c                x   || j                   vs| j                   |   j                         r=| j                  |      \  }}t        || j                  |      | j                   |<   ||fS | j                   |   j                          | j                   |   j                  }| j                   |   j                  }||fS )a\  
        Loads the text model and processor from the given model ID and revision into the ServeCommand instance.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.

        Returns:
            `tuple[PreTrainedModel, PreTrainedTokenizerFast]`: The loaded text model and processor.
        r   r   r   r   r  r   r0  r   r   r   )r   rM  r   r   s       rR   r>  zServe.load_model_and_processor@  s     !(:(::d>P>PQf>g>r>r>t#BBCXYE98B $ 2 2#9D45 i	 45AAC&&'<=CCE**+@AKKIirQ   c                x   || j                   vs| j                   |   j                         r=| j                  |      \  }}t        || j                  |      | j                   |<   ||fS | j                   |   j                          | j                   |   j                  }| j                   |   j                  }||fS )aU  
        Loads the audio model and processor from the given model ID and revision into the ServeCommand instance.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.

        Returns:
            `tuple[PreTrainedModel, ProcessorMixin]`: The loaded audio model and processor.
        r  r  )r   rM  r  r  s       rR   r  z$Serve.load_audio_model_and_processorY  s     !(:(::d>P>PQf>g>r>r>t+/+N+NOd+e(K8B $ 2 2)9D45 O++	 45AAC,,-BCIIK"001FGQQOO++rQ   )Frc  rc  FNN	localhosti@  r   r   NFFNF) r   z^Annotated[bool, typer.Option(help='Whether to use continuous batching for chat completions.')]r+  zAnnotated[str, typer.Option(help='Device to use for inference; will default to `auto` and place the model on an accelerator if available.')]r,  zAnnotated[str | None, typer.Option(help="Override the default `torch.dtype` and load the model under this dtype. If `'auto'` is passed, the dtype will be automatically derived from the model's weights.")]r-  zXAnnotated[bool, typer.Option(help='Whether to trust remote code when loading a model.')]r.  zAnnotated[str | None, typer.Option(help='Which attention implementation to use; you can run --attn_implementation=flash_attention_2, in which case you must install this manually by running `pip install flash-attn --no-build-isolation`.')]r/  zmAnnotated[str | None, typer.Option(help="Which quantization method to use. choices: 'bnb-4bit', 'bnb-8bit'")]r$  zIAnnotated[str, typer.Option(help='Interface the server will listen to.')]r%  zDAnnotated[int, typer.Option(help='Port the server will listen to.')]r0  zeAnnotated[int, typer.Option(help='Time in seconds after which a model will be removed from memory.')]r&  z]Annotated[str, typer.Option(help="Logging level as a string. Example: 'info' or 'warning'.")]r1  z]Annotated[int | None, typer.Option(help='The default seed for torch, should be an integer.')]r2  zAnnotated[bool, typer.Option(help='Whether to enable CORS. Some apps that make requests from external domains (e.g. Cursor) require CORS to be enabled.')]r3  zQAnnotated[bool, typer.Option(help='Whether to turn on strict input validation.')]r4  zAnnotated[str | None, typer.Option(help="Name of the model to be forced on all requests. This is useful for testing Apps that don't allow changing models in the request.")]r5  zbAnnotated[bool, typer.Option(hidden=True, help='Whether to run the server in a separate thread.')]returnNone)r   r'  rk  r   rl  rD   rm  re  r(  )r   NNNNNNN)r   rH   r  z
int | Noner   
str | Noner  r  r  r  r  z list[ChoiceDeltaToolCall] | Noner  zDecodeStream | Noner  zPreTrainedTokenizerFast | Noner  r.   )r  zChatCompletionChunk | BaseModelr  rH   r   )r  r  r  zlist[dict[str, any]])r   r'  r   rH   r   StreamingResponse | JSONResponse)r   r   r  r   )r  r   )r   r'  r  r  )r   r'  r  zGenerator[str, None, None])r   r'  r  r'  )r   r'  r  r[   )r  zBitsAndBytesConfig | None)r  rH   r  rH   )rM  rH   )rM  rH   r  z/tuple[PreTrainedModel, PreTrainedTokenizerFast])rM  rH   r  z&tuple[PreTrainedModel, ProcessorMixin])rK   rL   rM   r   rI  ra  rs  r   r   r  r  staticmethodr  r   r  r   r  r  r   r  r   r  rL  r  r=  r  r>  r  rP   rQ   rR   r   r   k  sq         ZeUY    ns  qF
F

F
F"
#F(
)F4
5F< X=F> S?F@
AFF
GFL
MFR
SF^ l_F`
aFl
mFr 
sFP	)// / 	/
 /b


 " $(7;-14888 8 	8
 8 "8 58 +8 28 
8t G G& -!  -!^WKr  ( +  + ZFGPq@f	X@t.)`+<#.""D%L 2,rQ   r   a  
Run a FastAPI server to serve models on-demand with an OpenAI compatible API.

Models will be loaded and unloaded automatically based on usage and a timeout.


The server will expose the following endpoints:
    - POST /v1/chat/completions: Generates chat completions.
    - POST /v1/responses: Generates responses.
    - POST /v1/audio/transcriptions: Generates transcriptions from audio.
    - GET /v1/models: Lists available models for 3rd party tools.

Requires FastAPI and Uvicorn to be installed.
__main__)r   r'  r   r   r  r   )
__future__r   rQ  r  r   enumr   r  r   r  r  r   r  r  collections.abcr   r   
contextlibr   	functoolsr   r   r	   typingr
   r   r   typerhuggingface_hubr   tokenizers.decodersr   r   r   r   r   r   r   transformers.utils.import_utilsr   r   r   r   r   r   r   r   r   utilsr   r   r   r    r  r!   r  PILr"   r)  rE  rA  r#   r$   fastapi.middleware.corsr%   fastapi.responsesr&   r'    openai.types.audio.transcriptionr(   .openai.types.audio.transcription_create_paramsr)   openai.types.chatr*   r+   r,   !openai.types.chat.chat_completionr-   'openai.types.chat.chat_completion_chunkr.   r/   r0   r1   r  *openai.types.chat.completion_create_paramsr2   openai.types.responsesr3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   -openai.types.responses.response_create_paramsrB   pydanticrC   rD   rE   rG   rV   rX   rv  rz  r}  rw  r{  r~  r6  rK   r   r=  r  rf  rK  r  r   r   r   Enumr   r   r   r   r   rN   rV  rP   rQ   rR   <module>r     sO   #     	 	  	     / *    6 6  * ,   e e     K  k 4 6k;O;QkViVk   .6A>\cc8  [    " \@@4QY^ 6U]b 0MUZ  %%NO&'RS)*OP %!.# 
		H	%
   !!2!7!7!9: !*tyy 8	8-8 	8v 1@ 1@hE, E,R0 zGE rQ   