from typing import TYPE_CHECKING, Optional

from .base import HfQuantizer


if TYPE_CHECKING:
    from ..modeling_utils import PreTrainedModel

from ..utils import (
    is_accelerate_available,
    is_kernels_available,
    is_torch_available,
    is_triton_available,
    logging,
)
from .quantizers_utils import get_module_from_name


if is_torch_available():
    import torch

logger = logging.get_logger(__name__)


class Mxfp4HfQuantizer(HfQuantizer):
    """
    FP4 quantization relying on the triton kernels fetched from the kernels hub.
    """

    requires_parameters_quantization = True
    requires_calibration = False

    required_packages = ["accelerate"]

    def __init__(self, quantization_config, **kwargs):
        super().__init__(quantization_config, **kwargs)
        self.quantization_config = quantization_config
        self.triton_kernels_hub = None

    def _lazy_import_kernels(self):
        """Lazy import and initialize the triton kernels only when needed."""
        if self.triton_kernels_hub is None:
            try:
                from kernels import get_kernel

                self.triton_kernels_hub = get_kernel("kernels-community/triton_kernels")
            except ImportError:
                raise ImportError("kernels package is required for MXFP4 quantization")
        return self.triton_kernels_hub

    def validate_environment(self, *args, **kwargs):
        if not is_torch_available():
            raise ImportError(
                "Using mxfp4 quantization requires torch. "
                "Please install the latest version of torch (`pip install --upgrade torch`)"
            )

        if self.quantization_config.dequantize:
            return

        if not torch.cuda.is_available() and not torch.xpu.is_available():
            if self.pre_quantized:
                logger.warning_once(
                    "Using MXFP4 quantized models requires a GPU, we will default to dequantizing the model to bf16"
                )
                self.quantization_config.dequantize = True
                return
            else:
                raise RuntimeError("Quantizing a model using MXFP4 requires a GPU")

        if not is_accelerate_available():
            raise ImportError("Using mxfp4 requires Accelerate: `pip install accelerate`")

        if torch.xpu.is_available():
            gpu_is_supported = True
            kernels_available = is_triton_available("3.5.0") and is_kernels_available()
        else:
            compute_capability = torch.cuda.get_device_capability()
            gpu_is_supported = compute_capability >= (7, 5)
            kernels_available = is_triton_available("3.4.0") and is_kernels_available()

        if self.pre_quantized:
            # on unsupported hardware or without the kernels, fall back to dequantizing to bf16
            if not gpu_is_supported:
                logger.warning_once(
                    "MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 (e.g. T4, A100, "
                    "L4, H100, or B200) or XPUs (e.g. Intel® Data Center GPU Max Series). We will default to "
                    "dequantizing the model to bf16."
                )
                self.quantization_config.dequantize = True
                return
            if not kernels_available:
                logger.warning_once(
                    "MXFP4 quantization requires Triton and kernels installed: CUDA requires Triton >= 3.4.0, "
                    "XPU requires Triton >= 3.5.0. We will default to dequantizing the model to bf16."
                )
                self.quantization_config.dequantize = True
                return
        else:
            if not gpu_is_supported:
                raise ValueError(
                    "MXFP4 quantization is only supported on GPUs with compute capability >= 7.5 (e.g. T4, A100, "
                    "L4, H100, or B200) or XPUs (e.g. Intel® Data Center GPU Max Series)"
                )
            if not kernels_available:
                raise ValueError(
                    "MXFP4 quantization requires Triton and kernels installed: CUDA requires Triton >= 3.4.0, "
                    "XPU requires Triton >= 3.5.0"
                )
            self._lazy_import_kernels()

        device_map = kwargs.get("device_map")
        if device_map is None:
            logger.warning_once(
                "You have loaded an FP4 model on CPU and have a CUDA/XPU device available, make sure to set "
                "your model on a GPU/XPU device in order to run your model. To remove this warning, pass "
                "device_map = 'cuda' or device_map = 'xpu'."
            )
        elif (
            not self.pre_quantized
            and isinstance(device_map, dict)
            and ("cpu" in device_map.values() or "disk" in device_map.values())
        ):
            raise ValueError(
                "You are attempting to load an FP4 model with a device_map that contains a CPU or disk device. "
                "This is not supported when the model is quantized on the fly. "
                "Please use a quantized checkpoint or remove the CPU or disk device from the device_map."
            )

    def update_dtype(self, dtype: Optional["torch.dtype"]) -> "torch.dtype":
        if dtype is None:
            dtype = torch.bfloat16
            logger.info(
                "Overriding dtype=%s with `dtype=torch.bfloat16` to enable model loading in fp4. "
                "Pass your own dtype to specify the dtype of the remaining non-linear layers, or pass "
                "dtype=torch.bfloat16 to remove this warning.",
                dtype,
            )
        return dtype
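    # Illustrative usage (a sketch, not executed by this module): this quantizer is
    # driven by `from_pretrained` whenever a checkpoint ships an MXFP4
    # `quantization_config` (e.g. the gpt-oss checkpoints), assuming `Mxfp4Config`
    # is the public config class exposed by `transformers`:
    #
    #     from transformers import AutoModelForCausalLM, Mxfp4Config
    #
    #     # keep the MXFP4 weights and run through the triton kernels (GPU required)
    #     model = AutoModelForCausalLM.from_pretrained("openai/gpt-oss-20b", device_map="auto")
    #
    #     # or explicitly dequantize to bf16 (works without triton/kernels)
    #     model = AutoModelForCausalLM.from_pretrained(
    #         "openai/gpt-oss-20b",
    #         quantization_config=Mxfp4Config(dequantize=True),
    #         device_map="auto",
    #     )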
    def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool:
        from ..integrations import Mxfp4GptOssExperts
        from ..models.gpt_oss.modeling_gpt_oss import GptOssExperts

        if self.quantization_config.dequantize and ("blocks" in param_name or "scales" in param_name):
            # blocks/scales tensors belong to the (unsuffixed) dense parameter they pack
            module, tensor_name = get_module_from_name(model, param_name[: -len("_blocks")])
        else:
            module, tensor_name = get_module_from_name(model, param_name)
        if isinstance(module, Mxfp4GptOssExperts) or (
            isinstance(module, GptOssExperts) and self.quantization_config.dequantize
        ):
            if tensor_name in ("down_proj_bias", "gate_up_proj_bias"):
                return False
            return True
        return False

    def create_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        target_device: "torch.device",
        **kwargs,
    ):
        from ..integrations import (
            Mxfp4GptOssExperts,
            dequantize,
            load_and_swizzle_mxfp4,
            quantize_to_mxfp4,
            swizzle_mxfp4,
        )
        from ..models.gpt_oss.modeling_gpt_oss import GptOssExperts

        if not self.pre_quantized:
            # quantize the dense bf16 tensor on the fly
            triton_kernels_hub = self._lazy_import_kernels()
            module, tensor_name = get_module_from_name(model, param_name)
            with torch.device(target_device):
                if isinstance(module, Mxfp4GptOssExperts):
                    triton_weight_tensor, weight_scale = quantize_to_mxfp4(param_value, triton_kernels_hub)
                    PrecisionConfig, FlexCtx, InFlexData = (
                        triton_kernels_hub.matmul_ogs.PrecisionConfig,
                        triton_kernels_hub.matmul_ogs.FlexCtx,
                        triton_kernels_hub.matmul_ogs.InFlexData,
                    )
                    triton_weight_tensor, weight_scale = swizzle_mxfp4(
                        triton_weight_tensor, weight_scale, triton_kernels_hub
                    )
                    proj = "gate_up_proj" if "gate_up_proj" in param_name else "down_proj"
                    setattr(module, proj, triton_weight_tensor)
                    setattr(
                        module,
                        f"{proj}_precision_config",
                        PrecisionConfig(weight_scale=weight_scale, flex_ctx=FlexCtx(rhs_data=InFlexData())),
                    )
                    delattr(module, f"{proj}_blocks")
                    delattr(module, f"{proj}_scales")
        else:
            empty_param = kwargs.get("empty_param")
            casting_dtype = kwargs.get("casting_dtype")
            to_contiguous = kwargs.get("to_contiguous")
            rank = kwargs.get("rank")
            device_mesh = kwargs.get("device_mesh")
            if ("blocks" in param_name or "scales" in param_name) and self.quantization_config.dequantize:
                module, _ = get_module_from_name(model, param_name[: -len("_blocks")])
            else:
                module, _ = get_module_from_name(model, param_name)
            shard_kwargs = {
                "empty_param": empty_param,
                "casting_dtype": casting_dtype,
                "to_contiguous": to_contiguous,
                "rank": rank,
                "device_mesh": device_mesh,
                "model": model,
            }
            if isinstance(module, Mxfp4GptOssExperts) or (
                isinstance(module, GptOssExperts) and self.quantization_config.dequantize
            ):
                if self.quantization_config.dequantize:
                    dq_param_name = param_name[: -len("_blocks")]
                    dequantize(module, param_name, param_value, target_device, dq_param_name, **shard_kwargs)
                else:
                    load_and_swizzle_mxfp4(
                        module,
                        param_name,
                        param_value,
                        target_device,
                        self._lazy_import_kernels(),
                        **shard_kwargs,
                    )

    def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs):
        if self.quantization_config.dequantize:
            self.remove_quantization_config(model)
            # free the packed tensors that were materialized during dequantization
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            elif torch.xpu.is_available():
                torch.xpu.empty_cache()

    def update_expected_keys(
        self, model: "PreTrainedModel", expected_keys: list[str], checkpoint_keys: list[str]
    ) -> list[str]:
        new_expected_keys = []
        for key in expected_keys:
            if key.endswith(".mlp.experts.gate_up_proj"):
                base = key[: -len("gate_up_proj")]
                new_expected_keys.append(base + "gate_up_proj_blocks")
                new_expected_keys.append(base + "gate_up_proj_scales")
            elif key.endswith(".mlp.experts.down_proj"):
                base = key[: -len("down_proj")]
                new_expected_keys.append(base + "down_proj_blocks")
                new_expected_keys.append(base + "down_proj_scales")
            elif not self.pre_quantized:
                # quantizing on the fly: the swapped-in modules expect packed names,
                # but the unquantized checkpoint only carries the dense tensors
                if key.endswith(".mlp.experts.down_proj_blocks"):
                    base = key[: -len("down_proj_blocks")]
                    new_expected_keys.append(base + "down_proj")
                elif key.endswith(".mlp.experts.gate_up_proj_blocks"):
                    base = key[: -len("gate_up_proj_blocks")]
                    new_expected_keys.append(base + "gate_up_proj")
                elif not key.endswith("_scales"):
                    new_expected_keys.append(key)
            else:
                new_expected_keys.append(key)
        return new_expected_keys

    def update_missing_keys(self, model: "PreTrainedModel", missing_keys: list[str], prefix: str) -> list[str]:
        from ..integrations import Mxfp4GptOssExperts

        not_missing_keys = []
        for name, module in model.named_modules():
            if isinstance(module, Mxfp4GptOssExperts):
                for missing in missing_keys:
                    if (
                        (name in missing or name in f"{prefix}.{missing}")
                        and not missing.endswith(".weight")
                        and not missing.endswith(".bias")
                    ):
                        not_missing_keys.append(missing)
        return [k for k in missing_keys if k not in not_missing_keys]

    def update_tp_plan(self, config):
        if "GptOssConfig" in config.__class__.__name__:
            if getattr(config, "base_model_tp_plan", None) is not None:
                config.base_model_tp_plan.update(
                    {
                        "layers.*.mlp.experts.gate_up_proj_blocks": "local_packed_rowwise",
                        "layers.*.mlp.experts.gate_up_proj_scales": "local_packed_rowwise",
                        "layers.*.mlp.experts.down_proj_blocks": "local_colwise",
                        "layers.*.mlp.experts.down_proj_scales": "local_colwise",
                    }
                )
        return config

    def update_ep_plan(self, config):
        if "GptOssConfig" in config.__class__.__name__:
            if getattr(config, "base_model_ep_plan", None) is not None:
                config.base_model_ep_plan.update(
                    {
                        "layers.*.mlp.experts.gate_up_proj_blocks": "local_packed_rowwise",
                        "layers.*.mlp.experts.gate_up_proj_scales": "local_packed_rowwise",
                        "layers.*.mlp.experts.down_proj_blocks": "local_colwise",
                        "layers.*.mlp.experts.down_proj_scales": "local_colwise",
                    }
                )
        return config
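    # A quick sketch of the key renames performed above (the layer index is a
    # hypothetical example):
    #
    #     quantizer.update_expected_keys(
    #         model,
    #         expected_keys=["model.layers.0.mlp.experts.gate_up_proj"],
    #         checkpoint_keys=[],
    #     )
    #     # -> ["model.layers.0.mlp.experts.gate_up_proj_blocks",
    #     #     "model.layers.0.mlp.experts.gate_up_proj_scales"]
    #
    # i.e. a dense parameter expected by the (to-be-dequantized) model maps to the
    # packed blocks/scales pair actually stored in an MXFP4 checkpoint, and
    # `get_param_name` below performs the reverse rename when reporting a parameter.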
    def get_param_name(self, param_name: str) -> str:
        if self.quantization_config.dequantize:
            if "blocks" in param_name:
                return param_name.replace("_blocks", "")
            if "scales" in param_name:
                return param_name.replace("_scales", "")
        elif not self.pre_quantized:
            if param_name.endswith("gate_up_proj"):
                return param_name.replace("gate_up_proj", "gate_up_proj_blocks")
            if param_name.endswith("down_proj"):
                return param_name.replace("down_proj", "down_proj_blocks")
        return param_name

    def get_state_dict_and_metadata(self, model, safe_serialization: bool = False):
        from ..integrations import Mxfp4GptOssExperts

        state_dict = model.state_dict()
        for name, module in model.named_modules():
            if (
                isinstance(module, Mxfp4GptOssExperts)
                and hasattr(module, "gate_up_proj")
                and hasattr(module, "down_proj")
            ):
                # un-swizzle the triton tensors back into the packed checkpoint layout
                state_dict[f"{name}.gate_up_proj_blocks"] = (
                    module.gate_up_proj.storage.layout.unswizzle_data(module.gate_up_proj.storage.data)
                    .transpose(-1, -2)
                    .reshape(module.num_experts, module.intermediate_size * 2, -1, 16)
                )
                state_dict[f"{name}.gate_up_proj_scales"] = (
                    module.gate_up_proj_precision_config.weight_scale.storage.layout.unswizzle_data(
                        module.gate_up_proj_precision_config.weight_scale.storage.data
                    ).transpose(-1, -2)
                )
                state_dict[f"{name}.down_proj_blocks"] = (
                    module.down_proj.storage.layout.unswizzle_data(module.down_proj.storage.data)
                    .transpose(-1, -2)
                    .reshape(module.num_experts, module.hidden_size, -1, 16)
                )
                state_dict[f"{name}.down_proj_scales"] = (
                    module.down_proj_precision_config.weight_scale.storage.layout.unswizzle_data(
                        module.down_proj_precision_config.weight_scale.storage.data
                    ).transpose(-1, -2)
                )
        metadata = {}
        return state_dict, metadata

    def is_serializable(self, safe_serialization=None) -> bool:
        return True

    @property
    def is_trainable(self) -> bool:
        logger.warning_once(
            "MXFP4 quantization doesn't support training, please consider dequantizing the model first by passing "
            "quantization_config=Mxfp4Config(dequantize=True) to .from_pretrained()"
        )
        return False
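# --------------------------------------------------------------------------------
# Reference sketch (illustrative only; the real load paths above go through the
# triton kernels or the `dequantize` integration, and this helper is not called by
# this module). It assumes the OCP MX convention used by MXFP4 checkpoints: each
# uint8 in `blocks` packs two FP4 (E2M1) values, low nibble first, and each uint8
# in `scales` is an E8M0 power-of-two exponent (bias 127) shared by a group of 32
# elements. Requires torch at call time.
def _mxfp4_dequantize_reference(blocks: "torch.Tensor", scales: "torch.Tensor") -> "torch.Tensor":
    """Slow pure-torch MXFP4 decode: blocks (..., 16) uint8 + scales (...,) uint8 -> (..., 32) bf16."""
    # the 16 representable E2M1 values: 1 sign bit, 2 exponent bits, 1 mantissa bit
    fp4_values = torch.tensor(
        [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0, -0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0],
        dtype=torch.float32,
        device=blocks.device,
    )
    lo = fp4_values[(blocks & 0x0F).long()]  # first element of each packed pair
    hi = fp4_values[(blocks >> 4).long()]  # second element of each packed pair
    values = torch.stack((lo, hi), dim=-1).reshape(*blocks.shape[:-1], 32)
    exponent = scales.to(torch.int32) - 127  # E8M0 scale: 2 ** (raw - 127)
    return (values * torch.exp2(exponent.to(torch.float32))[..., None]).to(torch.bfloat16)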