L iddlZddlZddlZddlZddlZddlZddlZddlZddlZddl Z ddl Z ddl Z ddl Z ddl Z ddlmZmZddlmZmZmZddlmZddlmZmZddlmZddl mZddlmZmZmZdd l m!Z!dd l"m#Z#dd l$m%Z%ddl&Z&dd l'm(Z(m)Z)dd l*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0ddl1m2Z2m3Z3m4Z4m5Z5m6Z6ddl7m8Z8m9Z9ddl1m:Z:e8rddl;Z;ddl&mZ>m?Z?ddl@mAZAmBZBe,rddlCZCe0rddlDmEZEe.xre+xre/xre-ZFeFrddlGZGddlHmIZImJZJddlKmLZLddlMmNZNmOZOddlPmQZQddlRmSZSddlTmUZUddlVmWZWmXZXmYZYmZZZm[Z[ddl\m]Z]ddl^m_Z_m`Z`maZambZbmcZcmdZdmeZemfZfmgZgmhZhmiZimjZjmkZkmlZlmmZmddlnmoZodd lpmqZqmrZrmsZsGd!d"eod#$ZtGd%d&e]d#$ZuGd'd(eSd#$ZveretZwereuZxerevZyhd)Zzhd*Z{hd+Z|e9je~Zd,d-d.d/iZeejZd0ZGd1d2ej Zd3efd4Zd5ed6d7d8d7fd9ZGd:d;ZGd<d=ZeGd>d?ZGd@dAe:Ze~dBk(reZejyy)CN)ArgumentParser Namespace)AsyncGenerator GeneratorIterable)asynccontextmanager) dataclassfield)BytesIO)Thread)Optional TypedDictUnion) model_info)HF_HUB_OFFLINE) DecodeStream)!MODEL_FOR_CAUSAL_LM_MAPPING_NAMES*MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES)is_fastapi_availableis_librosa_availableis_openai_availableis_pydantic_availableis_uvicorn_availableis_vision_available) AutoConfigLogitsProcessorListPreTrainedTokenizerFastProcessorMixinTextIteratorStreamer)is_torch_availablelogging)BaseTransformersCLICommand) AutoProcessorBitsAndBytesConfigGenerationConfigPreTrainedModel)ContinuousBatchingManager RequestStatus)Image)FastAPI HTTPException)CORSMiddleware) JSONResponseStreamingResponse) Transcription)TranscriptionCreateParamsBase)ChatCompletionMessageParam)ChatCompletionChunkChoice ChoiceDeltaChoiceDeltaToolCallChoiceDeltaToolCallFunction)CompletionCreateParamsStreaming)ResponseResponseCompletedEventResponseContentPartAddedEventResponseContentPartDoneEventResponseCreatedEvent ResponseErrorResponseErrorEventResponseFailedEventResponseInProgressEventResponseOutputItemAddedEventResponseOutputItemDoneEventResponseOutputMessageResponseOutputTextResponseTextDeltaEventResponseTextDoneEvent)ResponseCreateParamsStreaming) BaseModel TypeAdapterValidationErrorceZdZUdZeed<y))TransformersResponseCreateParamsStreamingz OpenAI's ResponseCreateParamsStreaming with an additional field for the generation config (as a json string). generation_configN__name__ __module__ __qualname____doc__str__annotations__c/mnt/ssd/data/python-lab/Trading/venv/lib/python3.12/site-packages/transformers/commands/serving.pyrNrN{ rXrNF)totalceZdZUdZeed<y)+TransformersCompletionCreateParamsStreamingz OpenAI's CompletionCreateParamsStreaming with additional fields for the generation config (as a json string) and passing the request_id rONrPrWrXrYr]r]rZrXr]c4eZdZUdZeed<eed<dZeed<y)%TransformersTranscriptionCreateParamsz OpenAI's TranscriptionCreateParamsBase with an additional field for the generation config (as a json string). filerOFstreamN) rQrRrSrTbytesrVrUraboolrWrXrYr_r_s  rXr_> textuserstorepromptinclude reasoning background truncation tool_choice service_tier top_logprobsmax_tool_callsprevious_response_id>nstopreaudiorflogprobsmetadata functions modalities predictionrlrmrn function_callstream_optionsresponse_formatpresence_penaltyreasoning_effortweb_search_optionsparallel_tool_callsmax_completion_tokens>rgrhlanguager{chunking_strategytimestamp_granularitiesqwenz z )startendz x-request-idceZdZdZdZdZdZy)ModalityLLMVLMSTTTTSN)rQrRrSrrrrrWrXrYrrs C C C CrXrargsct|S)z~ Factory function used to instantiate serving server from provided command line arguments. Returns: ServeCommand ) ServeCommand)rs rYserve_command_factoryrs  rXreqmodel_generation_configr'returnc `|jd"td itj|d}nt j |}|j d i|}|jD]\}}| t||||jdt|d|_ |jdt|d|_ |jdt|d|_ |jd |d|_ |jd |d|_|jd+t|d|_t|ddk(rd |_|jd t|d |_|jd t%j&|d |S) a Creates a generation config from the parameters of the request. If a generation config is passed in the request, it will be used as a baseline for parameterization. Otherwise, we will use the model's default generation config. Other parameters in the request will be applied on top of the baseline. Args: req (`dict`): The request which may optionally contain generation parameters. model_generation_config (`GenerationConfig`): The model's default generation config. kwargs (`dict`): Additional parameters to set in the generation config. Returns: The prepared `GenerationConfig` object. rOmax_output_tokens max_tokensfrequency_penalty logit_biasrr temperaturegFtop_pseedrW)getr'jsonloadscopydeepcopyupdateitemssetattrintmax_new_tokensfloatrepetition_penalty sequence_bias stop_stringsr do_samplertorch manual_seed)rrkwargsrOnon_standard_kwargskvs rY!create_generation_config_from_reqrs. ww"#/,Ttzz#>Q:R/ST MM*AB2+22Reset the tool call state (assumes we're outside a tool call).FrN)inside_tool_callhas_tool_name_definedarg_nesting_levelbufferrs rYrzToolState.reset*s! %%*"!" rXN)rQrRrSrTrrrWrXrYrr$sArXrc LeZdZdZ d dddedeedfdZd Zd Z d Z d Z y) TimedModelz A class that holds a PreTrainedModel instance and its associated processor. Automatically deletes the instances after a specified timeout. Nmodelr(timeout_seconds processor)rrc||_t|j|_||_||_t j|j |j|_ |jjyr) rrU name_or_path _name_or_pathrr threadingTimertimeout_reached_timerr)rrrrs rYrzTimedModel.__init__8s[   !3!34".ood&:&:DDelete the wrapped model and processor and clean up resources.rN) hasattrrrgccollectrcuda is_available empty_cacherrrs rY delete_modelzTimedModel.delete_modelKsr 4 !djj&< DJ!DN JJLzz&&( &&( KK   '= !rXc|jtj|jd|jdy)Nz was removed from memory after z seconds of inactivity)rloggerinforrrs rYrzTimedModel.timeout_reached[s7  t))**I$J^J^I__uvwrXc<t|d xs|jduS)z)Check if the instances have been deleted.rN)rrrs rY is_deletedzTimedModel.is_deleted_s 4))?TZZ4-??rXr) rQrRrSrTrr rrrrrrrWrXrYrr2sPSW    E"MNO  ! x@rXrceZdZUdZedddiZeed<edddiZe ed <ed d gd d Z e e ed<eddgd d Z e e ed<edddiZ eed<ed ddiZe e ed<edddiZeed<edddiZeed<eddddgd Ze ed<edddiZeed<eddd iZe ed!<ed"dd#iZeed$<ed%dd&iZeed'<ed(dd)iZe ed*<ed dd+iZe eed,<eddd-iZeed.<eddd/iZeed0<ed dd1iZe e ed2<d3Zy )4ServeArgumentsz Arguments for the serve CLI. See the metadata arg for each argument's description -- the metadata will be printed with `transformers serve --help` Fhelpz8Whether to use continuous batching for chat completions.)defaultrucontinuous_batchingautozfDevice to use for inference; will default to `auto` andplace the model on an accelerator if available.deviceNzA`torch_dtype` is deprecated! Please use `dtype` argument instead.)rbfloat16float16float32)rchoices torch_dtypezOverride the default `torch.dtype` and load the model under this dtype. If `'auto'` is passed, the dtype will be automatically derived from the model's weights.dtypez2Whether to trust remote code when loading a model.trust_remote_codezWhich attention implementation to use; you can run --attn_implementation=flash_attention_2, in which case you must install this manually by running `pip install flash-attn --no-build-isolation`.attn_implementationzIWhether to use 8 bit precision for the base model - works only with LoRA. load_in_8bitzIWhether to use 4 bit precision for the base model - works only with LoRA. load_in_4bitnf4zQuantization type.fp4bnb_4bit_quant_typez#Whether to use nested quantization.use_bnb_nested_quant localhostz$Interface the server will listen to.hosti@zPort the server will listen to.porti,z@Time in seconds after which a model will be removed from memory. model_timeoutrz8Logging level as a string. Example: 'info' or 'warning'. log_levelz1The default seed for torch, should be an integer. default_seedztWhether to enable CORS. Some apps that make requests from external domains (e.g. Cursor) require CORS to be enabled. enable_corsz+Whether to turn on strict input validation.input_validationzName of the model to be forced on all requests. This is useful for testing Apps that don't allow changing models in the request. force_modelc|j^|j|j|_y|j|jk7r&td|jd|jdyy)z(Only used for BC `torch_dtype` argument.Nz`torch_dtype` z and `dtype` zn have different values. `torch_dtype` is deprecated and will be removed in 4.59.0, please set `dtype` instead.)rr ValueErrorrs rY __post_init__zServeArguments.__post_init__su    'zz!!-- !!TZZ/ $T%5%5$6mDJJ<PMM0 (rX)rQrRrSrTr rrcrVrrUrr rrrrrrrrrrrrrrrrrrWrXrYrrdsT!&TU! > FC"'WA "K#!PA E8C=$)] ^t*/ r *#efL$efL$ %UFZhmotgu=vww!&uHm?n!o$okV=c4deD#edf6W-XYD#Y\]M3 &*d!eIs#(([\#L(3- & K# B d "' 2 "K# rXrcJeZdZedefdZdefdZdede ddd e fd Z defd Z defd Z defd Z d1dedeedeedeedeedeeddeedeedefdZdddefdZdZej2deeeeffdZdededeedffd Zedd!defd"Zed#efd$Z dede!eddffd%Z"dede!eddffd&Z#dede!eddffd'Z$dede%fd(Z&ededed)fd*Z'd+edefd,Z(d-efd.Z)d-ede*d!effd/Z+d-ede*d!e,ffd0Z-y)2rparsercdtf}|jd|}|jty)z Register this command to argparse so it's available for the transformer-cli Args: parser: Root parser to register command-specific arguments serve)dataclass_types)funcN)r add_parser set_defaultsr)rr serve_parsers rYregister_subcommandz ServeCommand.register_subcommands3*+((/(R !!'<!=rXrc 0ts td||_|jj|_|jrt j }|jj)||j_tjd|t j}|jj|vr)td|d|jjd|d|jj|_ |jj)tj|jjt!j"d}|j%t j&|jj(j+t!j"d}|j%t j&|jj(j+i|_d|_d|_d|_d|_y) NzaMissing dependencies for the serving CLI. Please install with `pip install transformers[serving]`z-No attn_implementation passed, defaulting to z"Continuous batching only supports z as attn_implementation, got z#Try setting `--attn_implementation=` transformersz+transformers.generation.continuous_batching)serve_dependencies_available ImportErrorrruse_continuous_batchingr) default_attention_implementationrrr#supported_attention_implementationsrrrrrr" get_loggersetLevel log_levelsrlower loaded_models#running_continuous_batching_manager last_messages last_kv_cache last_model)rrdefault_attn_implsupported_attn_impltransformers_logger cb_loggers rYrzServeCommand.__init__s+s   '+yy'D'D$  ' ' 9 Z Z \ yy,,40A - KL]K^_`";"_"_"a yy,,4GG 89L8MMjyy4459:K9LAO  9900 99 ! ! -   dii44 5&00@$$W%7%7 8K8K8Q8Q8S%TU&&'TU 7--dii.A.A.G.G.IJK57X\0"!rXrequestschema validatorrK unused_fieldsc0tjd|t|j}|j}||z }|r(tj d|t dd||jjrB |j|||z} | r(tj d| t dd| yy#t$rF}tj d|jt d|jd}~wwxYw)a Validates the request against the schema, and checks for unexpected keys. Args: request (`dict`): The request to validate. schema (`TypedDict`): The schema of the request to validate. It is a `TypedDict` definition. validator (`TypeAdapter`): The validator to use to validate the request. Built from `schema`. unused_fields (`set`): Fields accepted by `schema`, but not used in `transformers serve`. Raises: HTTPException: If the request is invalid or contains unexpected or unused fields. zValidating request: z Unexpected keys in the request: i) status_codedetailzValidation error: NzUnused fields in the request: ) rdebugsetkeys__mutable_keys__errorr-rrvalidate_pythonrLerrors) rrrrr input_keys possible_keysunexpected_keyseunused_fields_in_requests rY_validate_requestzServeCommand._validate_requests.  +G956( // $}4  LL;O;LM NC:Z[jZk8lm m 99 % % H))'2 (2M'A $' =>V=WXY# #.LMeLf,g( &# H 1!((*>?#AHHJGG HsC DADDcF|j|tttyN)rrrr)r/rNresponse_validatorUNUSED_RESPONSE_FIELDSrrs rYvalidate_response_requestz&ServeCommand.validate_response_requestAs! <(0  rXcF|j|tttyr1)r/r]completion_validatorUNUSED_CHAT_COMPLETION_FIELDSr4s rY validate_chat_completion_requestz-ServeCommand.validate_chat_completion_requestIs! >*7  rXcF|j|tttyr1)r/r_transcription_validatorUNUSED_TRANSCRIPTION_FIELDSr4s rYvalidate_transcription_requestz+ServeCommand.validate_transcription_requestQs! 8-5  rXN request_idcontentrrole finish_reason tool_callsr7 decode_stream tokenizerrc | |||j|j|}t|tt j|t t |||d|gdd} d| jd d S) a Builds a chunk of a streaming OpenAI Chat Completion response. IMPORTANT: The serialized chunk won't contain empty fields (fields with `None`). Some downstream apps, like Cursor, assume that when the field exists, it has data. Args: request_id (`str`): The request ID. content (`str`, *optional*): Content of the response from the model. model (`str`, *optional*): The model that generated the content. role (`str`, *optional*): The role of the next content, until a new role is defined. finish_reason (`str`, *optional*): The reason the generation by the model has finished. tool_calls (`list[ChoiceDeltaToolCall]`, *optional*): Data about the tool calls, when they are triggered. Returns: `str`: The built chunk, a string containing a JSON string with the payload. )r?r@rBr)deltaindexrArzchat.completion.chunk)idcreatedrrsystem_fingerprintobjectdata: T exclude_none )step _tokenizerr4rtimer5r6model_dump_json) rr>r?rr@rArBrCrDchunks rYbuild_chat_completion_chunkz(ServeCommand.build_chat_completion_chunkYsD  $)<AV#(()=)=wGG# $% '!#- "/  "*! $--4-@AFFrXresponserJc.d|jddS)a Builds a event of a streaming OpenAI Response response. IMPORTANT: The serialized chunk won't contain empty fields (fields with `None`). Some downstream apps, like Cursor, assume that when the field exists, it has data. Args: response (`BaseModel`): The response to build an event from. One of the multiple OpenAI Response output types Returns: `str`: The built chunk, a string containing a JSON string with the payload. rLTrMrO)rS)rrVs rYbuild_response_eventz!ServeCommand.build_response_events"00d0CDDIIrXctdtffd }t|}jr2|jtdgddgdgt j ddd lm}|jd d |d tffd }|jdd tffd }|jdd |ffd }|jd|jdfd}|jdd}|jdd |fd} tj|j j"j j$j j&y)a Setup and run the FastAPI server for transformers serve. Models will be loaded and unloaded automatically based on usage and a timeout. The server will expose the following endpoints: - POST /v1/chat/completions: Generates chat completions. - POST /v1/responses: Generates responses. - POST /v1/audio/transcriptions: Generates transcriptions from audio. - GET /v1/models: Lists available models for 3rd party tools. Requires FastAPI and Uvicorn to be installed. appcKdjjD]}|jjjj ddyyw)NTblocktimeout)rvaluesrrrr)rZrrs rYlifespanz"ServeCommand.run..lifespans` ++224 %""$ %77C88==DRS=TDsA A#)ra*T) allow_originsallow_credentials allow_methods allow_headerszUCORS allow origin is set to `*`. This is not recommended for production environments.r)Requestz/v1/chat/completionsrbodycj|jr'j||jj}nj |}t |dSNrtext/event-stream media_type)r9r #continuous_batching_chat_completionstater>generate_chat_completionr0)rrhoutputrs rYchat_completionz)ServeCommand.run..chat_completionsW  1 1$ 1 ?++AA$ H`H`a66t<$V8KL LrXz /v1/responsescdj|j|}t|dSrj)r5generate_responser0)rrrrs rY responsesz#ServeCommand.run..responsess2  * *7 * ;++G4F$V8KL LrXz/v1/audio/transcriptionsc K|j4d{}t|djd{|d}tj d|dj d|dj d|djdz dd dddd{j j|}t|d S7778#1d{7swYHxYww) Nr`r)r`rzReceived file: z ; MIME type: z; size: z.2fz KiBrkrlrm) formr_readrr#filename content_typesizer=generate_transcriptionr0)rryparsed_requestrrrs rYaudio_transcriptionsz.ServeCommand.run..audio_transcriptionss||~  !F#F|0022w-"  %d6l&;&;%.get_all_modelss 64;N;N;P QR RrXz/healthctddiS)Nstatusok)r/rWrXrY healthcheckz%ServeCommand.run..healthchecks4 01 1rXhttpcK|jjtxstt j }||j _||d{}||jt<|S7wr)headersr X_REQUEST_IDrUuuiduuid4rpr>)r call_nextr>rVs rYget_or_set_request_idz/ServeCommand.run..get_or_set_request_ids] ,,\:Oc$**,>OJ'1GMM $&w//H-7H  \ *O0sAA9A7A9)rrrN)rr,radd_middlewarer.r warning_oncefastapirgpostdictoptionsr middlewareuvicornrunrrrr) rrarZrgrsrvrrrrs ` rYrzServeCommand.runs  U U  Ux(      "e"&"e"e     g  $ ( ) MW MD M * M / " Mt M # M , - M M . M" \ "   S  # S   2  2        Cdiinn499>>TYYM`M`arXc gd}trQ|Dcgc]E}|dtjjj|j dddGc}S|Dcgc] }t |}}|Dcgc]5}|j d|jj|jd7c}Scc}wcc}wcc}w)a. This is by no means a limit to which models may be instantiated with `transformers serve`: any chat-based model working with generate can work. This is a limited list of models to ensure we have a discoverable /v1/models endpoint for third-party integrations. ) zMenlo/Jan-nanozMenlo/Jan-nano-128kzQwen/Qwen2.5-0.5B-InstructzQwen/Qwen2.5-3B-InstructzQwen/Qwen2.5-7B-InstructzQwen/Qwen2.5-14B-Instructz meta-llama/Llama-3.1-8B-Instructz meta-llama/Llama-3.2-1B-Instructz!meta-llama/Llama-3.3-70B-InstructzHuggingFaceTB/SmolVLM-Instructz!ibm-granite/granite-vision-3.2-2bzQwen/Qwen2.5-VL-7B-Instructr/r)rHrKrIowned_by) rdatetimenow timestampsplitrrH created_atauthor)rmodelsr model_infoss rYrzServeCommand.get_gen_modelss   $  %'00446@@B % C 0 3  ;AA:e,AKA)  ((%$//99; %  BsA B5 B:8:B?rc  j|d jk7} _|r0j$jjddd_j \}}t |dr |j n| t||j j jddd jK|jd _tj_ jj|j|d d d j!|j"} fd  fd}||d|S)a' Generates an OpenAI Chat Completion using continuous batching. Args: req (`dict`): The request to generate an OpenAI Chat Completion for. Returns: `Generator[str, None, None]`: A generator that yields the OpenAI Chat Completion chunks. rNTrr]rDFfifo)r eos_token_id pad_token_id use_cacher scheduler)rO streamingmessagespt)return_tensorsadd_generation_promptc3K j|djj|D]\}|jtj k(rj|dyj||j d|^y#t$rT}tjt|jj|dt|dYd}~yd}~wwxYww) N assistantr@rrrrAr)r>r?rrCrDdata: {"error": ""}) rUrrequest_id_iterrr*FINISHEDgenerated_tokens Exceptionrr'rUcancel_request)r>rCresultr-model_id_and_revisionrrDs rYstream_chat_completionzPServeCommand.continuous_batching_chat_completion..stream_chat_completion^s 766z [p6qq"FFVVWabF}} (>(>>">>&*0"7? ">>'1$*$;$;B$?"7*7&/ ?" 7 SV$88GG S*3q6(#66 7s<C6A(B-C6.'BC6 C3A C.)C6.C33C6cK t|jd}jj||j}||D]$}|t j dd{&y7#t j$r7jj|tjd|dYywxYww)NF)r>rrzRequest z was cancelled.) rtolistr add_requestrasynciosleepCancelledErrorrrwarning)_inputsr>rCrTrOrrs rYcancellation_wrapperzNServeCommand.continuous_batching_chat_completion..cancellation_wrapperzs G ,W^^-=u E !EEQQ CTCcCcR 4J N+EK!--***+*)) G88GG S*_EF Gs<CA+A:0A81A:7C8A::ACCCCr)process_model_namerrrrload_model_and_processorrrDrrOrrinit_continuous_batchingrlogit_processorrapply_chat_templatetor) rrr>must_discard_cacherrinputsrrOrrrDs ` @@@@rYroz0ServeCommand.continuous_batching_chat_completion.sa!% 7 7G E2dooE/ 77C88==DRS=T;?8889NOy+29k+JI''PY = $)$;$;"//"//   3 3 ;7<7U7U"3t8V8D 4 H[G\D 4 4 D  4 4 : : <..s:tko.pss LL  78 G$F1Iz::rXr(c|jj}|tjvrtj }|S|t jvrtj}|Std|)NzUnknown modality: ) __class__rQrr`rrrrr)rmodel_classnamemodalitys rYget_model_modalityzServeCommand.get_model_modalitysm//22 HOOQ Q||H   A H H J J||H1/1BCD DrXrc g}|D]}|dgd}|tjk(rmt|dtr|d}nMt|dtr:g}|dD]}|ddk(s |j |d!dj |}|d<n(|tjk(rt|dtr|dj d|ddn|dD]}|ddk(r|dj | |ddk(s)d |dd vrtjd d |dd }tjttj|}tj d d} | j"} |j%| j"n|dd } |dj d| d|j ||S)Nr@r@r?r?typerd )rrd image_urlbase64urlz^.pngF)suffixdeleteimage)rr)rr isinstancerUrappendjoinrresubr+openr r b64decodetempfileNamedTemporaryFilenamesave) rrprocessor_inputsmessageparsed_messageparsed_contentr? image_datarr`rs rY*get_processor_inputs_from_inbound_messagesz7ServeCommand.get_processor_inputs_from_inbound_messagess' 4G&-fo"EN8<<'gi0#6%,Y%7N 2D9%'N#*9#5C"6?f4*11'&/BC&)XXn%=N,:y)X\\)gi0#6"9-44fgV_N`5ab#*9#5\"6?f4*95<req_0gptossFskip_special_tokens skip_promptr input_ids)streamerrOreturn_dict_in_generatepast_key_valuesc3nKd}d}d jjdjvrd}d} fd}t| }d} |j t }j d  |D]#}d jjdjvr|jd }||z }|r||vrd}LM|jtd k(rd|_ u|jtd k(r(|jj |dd|jr@|xj|z c_ |jsYtjd|j} |  | j!d} d|_ t#t%| dd|dz} n|dk(rHd|jvrX|xj&|j)dz c_|xj&|j)dzc_|j&dkr&dj+|j-ddddz}t#t%|dd} j |d| g|dk7sj ||&j |d|j+|j+y#t.$r9} t0j3t5| d t5| d!Yd} ~ Nd} ~ wwxYw#|j+wxYww)"NFrrT<|channel|>final<|message|>cLjdi|}|j_yNrWgeneraterrrgenerate_outputrrs rYgenerate_with_cachezbServeCommand.generate_chat_completion..stream_chat_completion..generate_with_cache$"0%..":6":%4%D%D"rXtargetrrrr <|return|>rrrB)r>r@rArz\"name\": \"(.*?)\"r#)rfunction _tool_call)r rGrrHz"arguments": {{}) arguments)r rGr)r>r@rBr)r?rrrrrr)config architecturesrr rrrU removesuffixstrip_TOOL_CALL_TOKENSrrrrrsearchgroupr7r8rcountrrrrr'rU)r _request_id filter_cot cot_trace_endrthreadresults tool_stater tool_nametoolr-generation_kwargsrrr>rtool_model_familys rYrzEServeCommand.generate_chat_completion..stream_chat_completionsNJ M5<<55a8>>@@! =  E#6?PQFGg  &[ 66z [p6qq&VF5<<#=#=a#@#F#F#HH!'!4!4\!Bv%G"(G3).J$$)4!<<>->?P-QRY-ZZ:>J7$"<<>->?P-QRW-XX&,,."&"B"B+6%).:&; #C#%%66&--7-$.#C#C,.II6LjN_N_,` #,#4$,090BICG @':-Hi-X*+)3'2\'A ("$*R<$,$4:;L;L#L$,!+ < < S@Q Q < * < < S@Q Q <#-#?#?!#C-/WWV\\#5Fs5K-Ls-RF':-HSY-Z*+)3(" #'"B"B+6Ttf\q#C#%|">>'?T?iVn66{RX`u6vv   7 SV$*3q6(#66 7  sCAL5 H>K ?K L5 L$/LL LL L22L5)rrrrrrr_MODELS_WITH_TOOL_SUPPORTrrrrrrrr rrOis_continuationrget_seq_lengthshape)rrrrrrrsupported_model_familiesrrgeneration_streamerrOrseq_lenrr"rrr>r#s` @@@@@rYrqz%ServeCommand.generate_chat_completions& 99 ,9900CL9c[`[r[rs    $-?((779Gk"((,w6 $ 2 2   +!2'+,  x x t&&9:FFrXc< jd jk7} _j \ }tdtr'dvr dddgng}|j dddntdt r8dvr.ddddk7rdddgd}nYd}d|dd <nHd}nBtdtr$dvr dddgng}|j dn td |j|d d }|j j}jddd }d jjdjvrd}t!||d }t# j$}d}j'r=|s;j(j+} |dj,d| kDr j(}|t/j0|||d |d fd} | |S)a  Generates an OpenAI Response using `generate`. Args: req (`dict`): The request to generate an OpenAI Response for. Returns: `Generator[str, None, None]`: A generator that yields the OpenAI Response events. rinput instructionssystemrrerr@r?z%inputs should be a list, dict, or strTr)rrrprrFrrNrr)rattention_maskrrOrrc3. Kd}d}djjdjvrd}d}fd}t|}d}d}d} |j t j } t d|td | d jd d d diidggjdddjd } |dz }j| td|td | djd d d diidggjdddjd } |dz }j| td||tddddg} |dz }j| tdd|||tdd g!"} |dz }j| d }|D]~}djjdjvr|jd#}||z }|r ||vrd}d }MNt!d$d||||d d%d&g'}|dz }j|t#d(d||d|d d%d&g)}|dz }j|t%d*d|||td|j&g!"}|dz }|dz }j|t)d+||tddd,d|j*gg-}|dz }|dz }j|t-d.|td | d,jd d d dii|j.gdgjdddjd/ }|dz }j||j1|j1y#t2$r}t4j7d0t9|t;d1|t9|2}|dz }j|t=d3|td  d4jd d d diigdgddjdt?d5t9|67 }|dz }j|Yd}~d}~wwxYw#|j1wxYww)8NFrrTrcLjdi|}|j_yrrrs rYrzTServeCommand.generate_response..stream_response..generate_with_cacherrXr zresponse.createdresp_queuedr-formatrrdrVrrru) rHrrrr-rdrKrrrrrlru)rsequence_numberrVr#zresponse.in_progress in_progresszresponse.output_item.addedmsg_rr)rHrrr@r?)rr5 output_indexitemzresponse.content_part.added output_textr)rrd annotations)ritem_idr5r8 content_indexpartr zresponse.output_text.deltagX@)tokenlogprob)rr<r5r8r=rFrtzresponse.output_text.done)rr<r5r8r=rdrtzresponse.content_part.donezresponse.output_item.done completed)rHrrr@r?r;zresponse.completed) rHrrrr-rdrrrKrrrlruz"Exception in response generation: r')rr5rzresponse.failedfailed server_error)coder) rHrrrr-rdrrrKrrrlrur') rrrr rrRr>r:rrXrBrCrEr<rFrrGrHr=rdrDr>r;r9rrrr'rUr@rAr?)rrrrrrr5r8r=rresponse_createdresponse_in_progressresponse_output_item_addedresponse_content_part_addedrrresponse_output_text_deltaresponse_output_text_doneresponse_content_part_doneresponse_output_item_doneresponse_completedr- error_eventresponse_failedr"rrrr>rs rYstream_responsez7ServeCommand.generate_response..stream_responsesJ M5<<55a8>>@@! =  E#6?PQFOLMM  !YY[ $8+$3%":,/#-'3%(WW^%<&(89) !,/GG4I5,Q$*!$!4 $ $ 1$//0@AA'>/$3%":,/#-,3%(WW^%<&(89) !,/GG4I5,Q$*!$!4 ($$ 1$//0DEE.J5$3!-.!*.Y}[fpr .* 1$//0JKK/L6":,/$3!-"/+RUWX /+ 1$//0KLL&PF5<<#=#=a#@#F#F#HH!'!4!4\!Bv%G"(G3).J&(G$$1G9"&zl 3(7%1&3$,.4"@!A2.$q(O334NOO3P8-B4":,/$3!-"# (*t<=-) 1$//0IJJ.J5":,/$3!-"/+E^EcEcqst .* 1$" //0JKK-H4$3!-.!*.&*(!;!@!@ A$&  -) 1$! //0IJJ&<-$3%":,/#-*3%(WW^%<&(89 9 > >?) ,/GG4I5,Q$*!$!4 &"$ 1$//0BCC J I! A A#a&JK0 $3F  1$// <<"5*$3%":,/#-'3%(WW^%<&(89!) ,1$*!$!4+!/$'F#, 1$//@@C! AH s>A RMN%R% Q=.CQ83R8Q==RRR)rrrrrUrrrrrrrrrrrr rrOr%rr&r'r ones_like)rrrrrrr)rOrr*rPr"rrr>s`` @@@@rYruzServeCommand.generate_responses]!% 7 7G E2dooE/889NOy c'lC (M[_bMbxC4GHIhjF MM6c'lC D G d +$w<?6*h6'/CF1Ii(W G d +M[_bMbxC4GHIhjF MM#g, 'DE E..vTbf.g5<<(WW3W= # u||11!4::< <"' 2  3  >c[`[r[rs    $-?((779Gk"((,w6 $ 2 2 #oof5+!2'+,  ` ` D2J??rXc( ts td|j|d}|j|\ t j dd}t | j} jj}tj|d}tj||d\}} ||d j j d j j  d <||dd  fd } | S) a Generates an OpenAI Transcription using the audio file. Args: req (`dict`): The request containing the audio file and model information. Returns: `Generator[str, None, None]`: A generator that yields the transcription result. z]Missing librosa dependency for audio transcription. Please install with `pip install librosa`rTrrr`)srmonor) sampling_raterinput_features)rrOrc3Kjdi}j|jdd}t|}|j dyw)NT)rr)rdrMrW)r batch_decode sequencesr1rS) generated_idstranscription_text transcription audio_inputs audio_modelaudio_processorr"s rY_generate_transcriptionzDServeCommand.generate_transcription.._generate_transcriptionsg0K00U<UCTUM!0!=!=m>U>Uko!=!pqr!s )/ABM"222EF GsAA)rr rload_audio_model_and_processorr rDrrOfeature_extractorrUior librosaloadrrr)rrrr)rOmodel_sampling_rate audio_bytes audio_array_r`r]r^r_r"s @@@@rYr~z#ServeCommand.generate_transcriptions"$%o !% 7 7G E'+'J'JK`'a$ _2  % %4T > )F)F  .??MMjjV-  k6IPTU Q&{BUfjknn    *66F)G)J)J;K\K\)] %&,!2'+   H '((rXc>|jdxs|jd}d}|jd}n`t|jt|k\rd}n p5CZCZCiCilpCp ! "%=59E # # 2 m$9#:;<n$$rXc||jvs|j|jrG|j|\}}t||jj ||j|<||fS|j|j |j|j}|j|j}||fS)a\ Loads the text model and processor from the given model ID and revision into the ServeCommand instance. Args: model_id_and_revision (`str`): The model ID and revision to load. Returns: `tuple[PreTrainedModel, PreTrainedTokenizerFast]`: The loaded text model and processor. rr rrrrrrrrr)rrrrs rYrz%ServeCommand.load_model_and_processorks !(:(: :d>P>PQf>g>r>r>t#BBCXY E98B $ 7 7#9D  4 5i   4 5 A A C&&'<=CCE**+@AKKIirXc||jvs|j|jrG|j|\}}t||jj ||j|<||fS|j|j |j|j}|j|j}||fS)aU Loads the audio model and processor from the given model ID and revision into the ServeCommand instance. Args: model_id_and_revision (`str`): The model ID and revision to load. Returns: `tuple[PreTrainedModel, ProcessorMixin]`: The loaded audio model and processor. rr)rrr^r_s rYraz+ServeCommand.load_audio_model_and_processors !(:(: :d>P>PQf>g>r>r>t+/+N+NOd+e (K8B $ 7 7)9D  4 5O++   4 5 A A C,,-BCIIK"001FGQQOO++rX)rNNNNNNN).rQrRrS staticmethodrrrrrrr$r/r5r9r=rUr rrrrrUrXr functoolscacheanyrrrorrrrrqrur~rcr%rtrrtuplerrrarWrXrYrrs >N > >+^+Z///! /  /b     d !%#"'+<@047;6G6G#6G} 6G sm 6G  } 6GT"7896G -6G346G 6GpJ[JSJ ]b~__+T#s(^ 4++ZZ;tZ;Z;Q_`cei`iQjZ;x "3   + x+ + ZGGDGGYsD$5OGGRc@Tc@iT4.Hc@J .)$.)9S$_3M.)`+4+D+<#n#BV9W##8"3"3"";%C;%z %(  "99 : 6,C,ERcesRsLt,rXr__main__)rrrrenumrrrcrrrrrRrargparserrcollections.abcrrr contextlibr dataclassesr r r r typingr rrhuggingface_hubrhuggingface_hub.constantsrtokenizers.decodersrr &transformers.models.auto.modeling_autorrtransformers.utils.import_utilsrrrrrrrrrrrr utilsr!r"r$rr%r&r'r(generation.continuous_batchingr)r*rdPILr+r rrr,r-fastapi.middleware.corsr.fastapi.responsesr/r0 openai.types.audio.transcriptionr1.openai.types.audio.transcription_create_paramsr2openai.types.chatr3'openai.types.chat.chat_completion_chunkr4r5r6r7r8*openai.types.chat.completion_create_paramsr9openai.types.responsesr:r;r<r=r>r?r@rArBrCrDrErFrGrH-openai.types.responses.response_create_paramsrIpydanticrJrKrLrNr]r_r2r7r;r3r8r<rrQrrrr%r$rEnumrrrrrrrrrrrWrXrYrsr    .??*(--&4,0(Zk 4 6k;O;QkViVk .6A>\<["\@@4QY^6U]b0MUZ%%NO&'RS)*OP %!.#   H %    !!2!7!7!9: tyy 8 8/8 8v  /@/@d nn nbG,-G,T& z NE IIKrX