L iCldZddlZddlZddlZddlZddlZddlZddlZddl Z ddl Z ddl Z ddl Z ddl Z ddlZddlZddlZddlZddlmZmZddlmZddlmZddlmZmZmZmZmZddlmZddl m!Z"ddl#Z$ddl%Z&ddl'Z'ddl(m)Z*dd l+m,Z,m-Z-m.Z.dd l/m0Z0dd l'm1Z1dd l2m3Z3m4Z4m5Z5m6Z6m7Z7dd l8m9Z9ddl:m;Z;ddlZ>m?Z?ddl@mAZAmBZBddlCmDZDddlEmFZFddlGmHZHmIZIddlJmKZKddlLmMZMmNZNmOZOddlPmQZQddlRmSZSddlTmUZUmVZVmWZWddlXmYZYmZZZddl[m\Z\m]Z]ddl^m_Z_ddl`maZaddlbmcZcddldmeZemfZfmgZgmhZhmiZimjZjmkZkmlZlddlmmnZnmoZompZpmqZqmrZrmsZsmtZtmuZumvZvmwZwmxZxmyZymzZzm{Z{m|Z|m}Z}m~Z~mZmZmZdd lmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZdd!lmZmZmZdd"l!mZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZdd#lmZdd$lmZdd%lmZefgZeiZerdd&lmZeZerddlZerQddlmcmZddlmcmZddlmZdd lm9Ze0jeݫe0jek\Zer ddlm)cmZnd'ZerCddlmcm'Zdd lm9Ze0jee0jd(k\Zdd)lmmZmZmZmZnd'Zerdd*lmZertdd+lmZmZdd lm9Zdd,lmZdd-lmZmZmZmZmZmZmZe6gZe0jee0jd.kDrdd/lmZdd0lmZeegz ZeOrdd1lmZed2rdd3lmZd4Zd5Zd6ZerddlZeĐjeZ d7Z d8Z d9Z d:Z d;Zd<Zd=Zed>?Gd@dAZy)Buc The Trainer class, to easily train a 🤗 Transformers from scratch or finetune it on a new task. N)IteratorMappingpartial)Path) TYPE_CHECKINGAnyCallableOptionalUnion)#get_reporting_integration_callbacks) ModelCard create_repo upload_folder)version)nn) DataLoaderDatasetIterableDataset RandomSamplerSequentialSampler) __version__)PretrainedConfig) DataCollatorDataCollatorWithPaddingdefault_data_collator) DebugOptionDebugUnderflowOverflow)SequenceFeatureExtractor)FeatureExtractionMixin)"ALL_HYPERPARAMETER_SEARCH_BACKENDSdefault_hp_search_backend)BaseImageProcessor)deepspeed_initdeepspeed_load_checkpointis_deepspeed_available)tpu_spmd_dataloader)TrainingSummary)PreTrainedModelload_sharded_checkpoint unwrap_model)!MODEL_FOR_CAUSAL_LM_MAPPING_NAMESMODEL_MAPPING_NAMES) Adafactor get_scheduler)ProcessorMixin)"is_torch_greater_or_equal_than_2_3)PreTrainedTokenizerBase)CallbackHandlerDefaultFlowCallbackExportableStatePrinterCallbackProgressCallbackTrainerCallbackTrainerControl TrainerState)DistributedTensorGathererEvalLoopContainerIterableDatasetShard LabelSmootherLayerWiseDummyOptimizerLengthGroupedSamplerSequentialDistributedSamplerdistributed_broadcast_scalarsdistributed_concatfind_batch_sizeget_model_param_countget_module_class_from_nameget_parameter_names nested_concat nested_detachnested_numpifynested_xla_mesh_reducereissue_pt_warningsremove_dummy_checkpointset_rng_state_for_device)PREFIX_CHECKPOINT_DIRBestRunEvalLoopOutputEvalPredictionHPSearchBackend HubStrategyPredictionOutputRemoveColumnsCollator SaveStrategyTrainerMemoryTracker TrainOutputcheck_target_module_existsdefault_compute_objectivedenumpify_detensorizeenable_full_determinismfind_executable_batch_sizeget_last_checkpoint has_lengthneftune_post_forward_hooknumber_of_arguments seed_workerset_seed speed_metrics)OptimizerNames ParallelModeTrainingArguments)(ADAPTER_CONFIG_NAMEADAPTER_SAFE_WEIGHTS_NAMEADAPTER_WEIGHTS_NAME CONFIG_NAMEGENERATION_CONFIG_NAMESAFE_WEIGHTS_INDEX_NAMESAFE_WEIGHTS_NAMEWEIGHTS_INDEX_NAME WEIGHTS_NAMEXLA_FSDPV2_MIN_VERSIONPushInProgressPushToHubMixincan_return_losscheck_torch_load_is_safe find_labelsis_accelerate_availableis_apollo_torch_availableis_bitsandbytes_availableis_datasets_availableis_galore_torch_availableis_grokadamw_availableis_in_notebookis_liger_kernel_availableis_lomo_availableis_peft_availableis_sagemaker_dp_enabledis_sagemaker_mp_enabledis_schedulefree_availableis_torch_hpu_availableis_torch_mlu_availableis_torch_mps_availableis_torch_musa_availableis_torch_neuroncore_availableis_torch_npu_availableis_torch_optimi_availableis_torch_xla_availableis_torch_xpu_availableis_torchao_availablelogging strtobool)deprecate_kwarg)requires)QuantizationMethod)NotebookProgressCallbackFz1.10)smp_forward_backwardsmp_forward_only smp_gathersmp_nested_concat) PeftModel) Acceleratorskip_first_batches)AcceleratorState)AutocastKwargsDistributedDataParallelKwargsDistributedTypeload_fsdp_modelload_fsdp_optimizersave_fsdp_modelsave_fsdp_optimizer1.3.0)TorchTensorParallelPlugin)SeedableRandomSampler)DeepSpeedSchedulerWrapper0.28.0)DataLoaderConfigurationctrhtf}tjtj jdtjdk\r ddlm}g||}t||Sy)Npeftz0.7.0r)PeftMixedModelF) rrrparse importlibmetadatarr isinstance)modelclasses_to_checkrs Z/mnt/ssd/data/python-lab/Trading/venv/lib/python3.12/site-packages/transformers/trainer.py_is_peft_modelrs`%< ==++33F; < g@V V +B!1B>B %!122 c~tr2dttjtj vrddiSiS)N adapter_onlyT)rylistinspect signaturer parametersrr_get_fsdp_ckpt_kwargsrs5 ^tGddee eefde;fd2Z?d3ede;fd4Z@d5e9fd6ZAdeefd7ZBd8ZCd9ZDd:ZEdd;ee eejjjffd<ZHeI dd ed ee dee%e%ffd=ZJdd5e9d>ej@jBfd?ZKd@e;de9fdAZLeIddBe;dCee9de9fdDZMdEe dFeee%fffdGZNdEe dFeee%ffdHe9dIeeeOffdJZPdKefdLZQddMZRddNZSdOZTddPZU ddQee ee:fdEe dFeee%fd fdReeedSe%fdTZVde9fdUZWde9fdVZX ddWZYdXZZddYZ[dZZ\d[Z]dd\Z^ dd]Z_d^Z`d_Zad`ZbdaZcdbZddcZeddZfdeZgdfZh ddgeedFgeeeOffdheeeeeOfgeOfdie9dje eeefdkee dleifdmeedFgefde ejeejffdnZkddoeeeOfdpeeOdd fdqZldre ejLe%fde ejLe%ffdsZmdteee ejLe%ffdeee ejLe%fffduZndvZodteee ejLe%fffdwZpdxZqddyee:fdzZr dd e jdteee ejLe%ffd{eejLdejLfd|Zs dd e jdteee ejLe%ffd}e:d{eejLfd~Ztde:fdZude:fdZvddeede:fdZwddeefdZxd deefdZydZzd e{d*fdeefdZ|dddZ} ddee eeeeffdeeededeeeOffdZ~ dd3edeeededefdZ dd@e;d&edee:deeededef dZddZ dd e jdteee ejLe%ffde:deeedeeejLeejLeejLff dZdteee ejLe%fffdZddeefdZ ddeedeede eeed fdeedeede eeed fde eeed fd%e eeed fde eeed ffdZdZdZ ddeede:deedeedef dZ dd@e;d&edee:deeededef dZdZddZdZddZdZdedej"dee ejLe9ffdZdede9dej"deeee ejLe9fffdZd ed@e;de9fdZy )TraineruA Trainer is a simple but feature-complete training and eval loop for PyTorch, optimized for 🤗 Transformers. Args: model ([`PreTrainedModel`] or `torch.nn.Module`, *optional*): The model to train, evaluate or use for predictions. If not provided, a `model_init` must be passed. [`Trainer`] is optimized to work with the [`PreTrainedModel`] provided by the library. You can still use your own models defined as `torch.nn.Module` as long as they work the same way as the 🤗 Transformers models. args ([`TrainingArguments`], *optional*): The arguments to tweak for training. Will default to a basic instance of [`TrainingArguments`] with the `output_dir` set to a directory named *tmp_trainer* in the current directory if not provided. data_collator (`DataCollator`, *optional*): The function to use to form a batch from a list of elements of `train_dataset` or `eval_dataset`. Will default to [`default_data_collator`] if no `processing_class` is provided, an instance of [`DataCollatorWithPadding`] otherwise if the processing_class is a feature extractor or tokenizer. train_dataset (Union[`torch.utils.data.Dataset`, `torch.utils.data.IterableDataset`, `datasets.Dataset`], *optional*): The dataset to use for training. If it is a [`~datasets.Dataset`], columns not accepted by the `model.forward()` method are automatically removed. Note that if it's a `torch.utils.data.IterableDataset` with some randomization and you are training in a distributed fashion, your iterable dataset should either use a internal attribute `generator` that is a `torch.Generator` for the randomization that must be identical on all processes (and the Trainer will manually set the seed of this `generator` at each epoch) or have a `set_epoch()` method that internally sets the seed of the RNGs used. eval_dataset (Union[`torch.utils.data.Dataset`, dict[str, `torch.utils.data.Dataset`, `datasets.Dataset`]), *optional*): The dataset to use for evaluation. If it is a [`~datasets.Dataset`], columns not accepted by the `model.forward()` method are automatically removed. If it is a dictionary, it will evaluate on each dataset prepending the dictionary key to the metric name. processing_class (`PreTrainedTokenizerBase` or `BaseImageProcessor` or `FeatureExtractionMixin` or `ProcessorMixin`, *optional*): Processing class used to process the data. If provided, will be used to automatically process the inputs for the model, and it will be saved along the model to make it easier to rerun an interrupted training or reuse the fine-tuned model. This supersedes the `tokenizer` argument, which is now deprecated. model_init (`Callable[[], PreTrainedModel]`, *optional*): A function that instantiates the model to be used. If provided, each call to [`~Trainer.train`] will start from a new instance of the model as given by this function. The function may have zero argument, or a single one containing the optuna/Ray Tune/SigOpt trial object, to be able to choose different architectures according to hyper parameters (such as layer count, sizes of inner layers, dropout probabilities etc). compute_loss_func (`Callable`, *optional*): A function that accepts the raw model outputs, labels, and the number of items in the entire accumulated batch (batch_size * gradient_accumulation_steps) and returns the loss. For example, see the default [loss function](https://github.com/huggingface/transformers/blob/052e652d6d53c2b26ffde87e039b723949a53493/src/transformers/trainer.py#L3618) used by [`Trainer`]. compute_metrics (`Callable[[EvalPrediction], Dict]`, *optional*): The function that will be used to compute metrics at evaluation. Must take a [`EvalPrediction`] and return a dictionary string to metric values. *Note* When passing TrainingArgs with `batch_eval_metrics` set to `True`, your compute_metrics function must take a boolean `compute_result` argument. This will be triggered after the last eval batch to signal that the function needs to calculate and return the global summary statistics rather than accumulating the batch-level statistics callbacks (List of [`TrainerCallback`], *optional*): A list of callbacks to customize the training loop. Will add those to the list of default callbacks detailed in [here](callback). If you want to remove one of the default callbacks used, use the [`Trainer.remove_callback`] method. optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`): A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`. optimizer_cls_and_kwargs (`tuple[Type[torch.optim.Optimizer], dict[str, Any]]`, *optional*): A tuple containing the optimizer class and keyword arguments to use. Overrides `optim` and `optim_args` in `args`. Incompatible with the `optimizers` argument. Unlike `optimizers`, this argument avoids the need to place model parameters on the correct devices before initializing the Trainer. preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *optional*): A function that preprocess the logits right before caching them at each evaluation step. Must take two tensors, the logits and the labels, and return the logits once processed as desired. The modifications made by this function will be reflected in the predictions received by `compute_metrics`. Note that the labels (second parameter) will be `None` if the dataset does not have them. Important attributes: - **model** -- Always points to the core model. If using a transformers model, it will be a [`PreTrainedModel`] subclass. - **model_wrapped** -- Always points to the most external model in case one or more other modules wrap the original model. This is the model that should be used for the forward pass. For example, under `DeepSpeed`, the inner model is wrapped in `DeepSpeed` and then again in `torch.nn.DistributedDataParallel`. If the inner model hasn't been wrapped, then `self.model_wrapped` is the same as `self.model`. - **is_model_parallel** -- Whether or not a model has been switched to a model parallel mode (different from data parallelism, this means some of the model layers are split on different GPUs). - **place_model_on_device** -- Whether or not to automatically place the model on the device - it will be set to `False` if model parallel or deepspeed is used, or if the default `TrainingArguments.place_model_on_device` is overridden to return `False` . - **is_in_train** -- Whether or not a model is currently running `train` (e.g. when `evaluate` is called while in `train`) r )_get_learning_rate log_metricsmetrics_format save_metrics save_state tokenizerprocessing_classz5.0.0T)new_namerraise_if_both_namesNNNrargs data_collator train_datasetzdatasets.Dataset eval_dataset model_init.compute_loss_funccompute_metrics callbacks optimizersoptimizer_cls_and_kwargspreprocess_logits_for_metricsc !|'d}tjd|dt|}|jr.| ,dt j | j vr td|j*|jdk7r|td|jd |jtjk(s |jr|j td ||_||_|jj rt#|jj$nt'|jj$d|_d|_d |_||_|j1t3|jj4|_|j6j9|j;}t=j>||j@|%|||_!|jE}n.tGd |tIjJd tL||_!|jNjPtRvr#td|jNjPdtU|dd rtU|dd rd|_+nd |_+tU|ddtY|jZj]Dcgc] }|dvs| }}t_|dkDrd|_+nJt_|dk(r5|jj`tcj`|dk7|_+nd |_+|jVrtjd|jjdrtgrddl4m5}|jjl|jjlni}to|tpr |dOd|i|ndts|dr7to|jutpr|dOd|jui|n!tjwdn tydtU|dd xrtU|dd  }tU|ddduxr|jzj|}tU|ddduxrtU|jzd d }|rts|d!r td"|rt|s |s td#|rO|sMtd$|jzjjd%|jzjj|jd&|_Ct_|jdkDrN|jr td'|jd&s(|jtjk7r td(|j|_I|jVsH|js<|js |jr |jr|js |jrd |_I|!to|ttfr t|nt}||n||_R||_S||_T||_U|jr:tU|d)dtjk7r|j||j`|jVrd|j_Y||_Z||_|jj|}t|rats|dr|ju}nDts|d*r-ts|jdr|jj.}n td+ts|d,r|j|_`nNt j |jj }td-|j]D|_`|j|_c| |_d| |_e| \|_f|_g| |_h|j|j tGd.|#|j |j tGd/tr|j||j.j D]}|j`}n|jjD]'}t_|d0dkDs|d0dj`}nk7r td1|js |jr#|j |j tGd2tt|jjz}| |n|| z} t| |j.|j|j|j|_o|j|jjrtntd |_td|_u|jjr|j|jjr+tj|jjd3t|js+ttU|jd4dr td5|jdkDr$|jdkDrtjd6|&t|s|jdkr td7|Ito|tbjjjr|jr td8d|_d |_d |_tr_|jr td9tr|jtjjjk7r tjwd:tjjjd;|jd<tjjjtjjj|_nftstjjd=r?tjwd:tjjjd>|js |jr||jd?k(rl|j`tcj`d@k(r&|jrt stdAdB|_tjdC|jdD|js |jr\|jsPtsE|jdBk(rd|_tbj"|_n|jdEk(rd|_|jj&dk7r(t)|jj&F|_nd|_|jj&dkDrGtU|j.j,dGddHk(r"tIjJdIt.d|_t1|_t5|j7|j9|jj:|j2gzDcgc]}to|t<s|c}J|_d|_d|_|j.} t|j.rQts|j.dr|j.ju} n |j.jj.} tC| jN}!|jjD|!n|jjD|_tG| jN|_|jjI|j|j|j2|_|jJ|_d |_|j6jQ|jjSdKd |_|jTrxtVs tdLtYjZ}"t]j^t]j`tcjdtg|"|"dfdMN|jxr|jT |_ycc}wcc}w)PN tmp_trainerz1No `TrainingArguments` passed, using `output_dir=z`.) output_dircompute_resultzWhen using `batch_eval_metrics`, your `compute_metrics` function must take a `compute_result` boolean argument which will be triggered after the last batch of the eval set to signal that the summary statistics should be returned by the function.noz%You have set `args.eval_strategy` to zx but you didn't pass an `eval_dataset` to `Trainer`. Either set `args.eval_strategy` to `no` or pass an `eval_dataset`. z`args.metric_for_best_model` must be provided when using 'best' save_strategy or if `args.load_best_model_at_end` is set to `True`.Fz<`Trainer` requires either a `model` or `model_init` argumentz`Trainer` requires either a `model` or `model_init` argument, but not both. `model_init` will overwrite your model when calling the `train` method. This will become a fatal error in the next release.zThe model you have picked (a) cannot be used as is for training: it only computes hidden states and does not accept any labels. You should choose a model with a head suitable for your task like any of the `AutoModelForXxx` listed at https://huggingface.co/docs/transformers/model_doc/autois_parallelizablemodel_parallelT hf_device_map)cpudiskr rzYou have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set to `True` to avoid any unexpected behavior such as device placement mismatching.)_apply_liger_kernel_to_instancerget_base_modelzRThe model is not an instance of PreTrainedModel. No liger kernels will be applied.zYou have set `use_liger_kernel` to `True` but liger-kernel >= 0.3.0 is not available. Please install it with `pip install liger-kernel` is_quantized_hf_peft_config_loaded hf_quantizeris_qat_trainable _orig_modzYou cannot fine-tune quantized model with `torch.compile()` make sure to pass a non-compiled model when fine-tuning a quantized model with PEFTzYou cannot perform fine-tuning on purely quantized models. Please attach trainable adapters on top of the quantized model to correctly perform fine-tuning. Please see: https://huggingface.co/docs/transformers/peft for more detailsz8The model you are trying to fine-tune is quantized with z but that quantization method do not support training. Please open an issue on GitHub: https://github.com/huggingface/transformers to request the support for training support for xlazZUsing --fsdp xxx together with --deepspeed is not possible, deactivate one of those flags.z.Using fsdp only works in distributed training.quantization_method base_modelz8Cannot extract base model safely from this PEFT wrapper.accepts_loss_kwargsc3jK|]+}|jtjjk(-ywN)kindr Parameter VAR_KEYWORD).0ks r z#Trainer.__init__..s*1<='++7771s13zSPassing both `optimizers` and `optimizer_cls_and_kwargs` arguments is incompatible.zPassing a `model_init` is incompatible with providing the `optimizers` argument. You should subclass `Trainer` and override the `create_optimizer_and_scheduler` method.paramsa[The model and the optimizer parameters are not on the same device, which probably means you created an optimizer around your model **before** putting on the device and passing it to the `Trainer`. Make sure the lines `import torch_xla.core.xla_model as xm` and `model.to(xm.xla_device())` is performed before the optimizer creation in your script.zPassing `optimizers` is not allowed if PyTorch FSDP is enabled. You should subclass `Trainer` and override the `create_optimizer_and_scheduler` method.exist_ok collate_batchzRThe `data_collator` should be a simple callable (function, class with `__call__`).zHmax_steps is given, it will override any value given in num_train_epochszThe train_dataset does not implement __len__, max_steps has to be specified. The number of steps needs to be known in advance for the learning rate scheduler.zTthe `--group_by_length` option is only available for `Dataset`, not `IterableDatasetzOSageMaker Model Parallelism does not support BF16 yet. Please use FP16 instead z(FP16 provided in SM_HP_MP_PARAMETERS is z+, but FP16 provided in trainer argument is z , setting to fp16zJ, but SageMaker Model Parallelism < 1.10 does not support FP16 in trainer.autorz2Tried to use `fp16` but it is not supported on cpucpu_ampzUsing z half precision backendapex)epsilon problem_typemulti_label_classificationzsLabel smoothing is not compatible with multi-label classification. Disabling label smoothing for this training run.)is_local_process_zerois_world_process_zerostateful_callbacks xla_fsdp_v2z*FSDPv2 requires `torch_xla` 2.2 or higher.)fsdptensor) axis_namesr)loggerinforibatch_eval_metricsrrr ValueError eval_strategy save_strategyrXBESTload_best_model_at_endmetric_for_best_modelrrfull_determinismr^seedrehp_name deepspeed is_in_trainr"create_accelerator_and_postprocessrYskip_memory_metrics_memory_trackerstartget_process_log_levelr set_verbosity_setup_devicesrcall_model_init RuntimeErrorwarningswarn FutureWarning __class____name__r.getattris_model_parallelsetrvalueslendeviceruse_liger_kernelrliger_kernel.transformersrliger_kernel_configrr*hasattrrwarning ImportErrorr is_trainablerquantization_config quant_method fsdp_configis_fsdp_xla_enabledris_deepspeed_enabled parallel_moderh DISTRIBUTEDplace_model_on_devicefp16_full_evalbf16_full_evaldo_trainis_fsdp_enabledr3r rrrrrrrBITS_AND_BYTES_move_model_to_device_n_gpu model_wrapped acceleratorr,rAttributeErrorrmodel_accepts_loss_kwargsforwardanyneftune_noise_alpharr optimizer lr_schedulerrr param_groupsDEFAULT_CALLBACKSr report_tor4callback_handler add_callback disable_tqdmr7DEFAULT_PROGRESS_CALLBACK_loggers_initialized hub_model_id push_to_hub init_hf_repo should_saveosmakedirsrcallable TypeError max_stepsnum_train_epochsrautilsdatargroup_by_length_signature_columnsuse_apex use_cpu_amprbf16IS_SAGEMAKER_MP_POST_1_10r smpstatecfghalf_precision_backendr2bfloat16 amp_dtypelabel_smoothing_factorr?label_smootherconfig UserWarningr:controlr;rrrr6 current_floshp_search_backendrx label_namesrv on_init_endtrain_batch_size_train_batch_size_created_lr_schedulerstop_and_update_metricsgetis_fsdp_xla_v2_enabledIS_XLA_FSDPV2_POST_2_2xrglobal_runtime_device_countxsset_global_meshMeshrarrayrangeis_fsdp_xla_v1_enabled)#selfrrrrrrrrrrrrrr log_levelr;devicesr kernel_config_is_quantized_and_base_model&_quantization_method_supports_training%_is_model_quantized_and_qat_trainabledefault_collatorunwrapped_modelforward_paramsparam model_device param_groupoptimizer_devicedefault_callbackscbmodel_to_inspectdefault_label_names num_devicess# r__init__zTrainer.__init__sR& <&J KKKJ4 0 < `ASASA`A` /18~t0T\`0`1 el    2Ef - (GE;,Gb  (u0ENs$  *2XJ5K]K]KqKqK~K~JDDIDVDVDjDjDwDwCxz  $(#3#3E#: tyy>A (( p##E*t/A/A\E]E]/] !QRR&*%?%?"  " "(($$(;(;T]]''##).D & ++.EG_-`a $$4 5'  /<.G]M]*( 0  & &4d;?Q?`?``  & &udkk :  ! ! DII # **77> / *(89"1"@"@"B,7GOD^D^`g>A $"7"7!"; KKb c  $Z -F4>>]^K^d   %=%++*:*:*J*JK$$st t"&   # $yy !rss(99 2 22NNB399==CUCUBVWDDHII;O&&)iimm&8&8%9; !$ 2 2DI399==&1NNB399==CUCUBVWcc II(C(Cv(M{{ell51199=()]^^2;D/ KK&!"&"<"<">!22<< ~M Q[\^`oQp   !%:: $** %tzz#34#'::#<#<#> $(::#8#8#>#> )*:*D*DE26))2G2G2O.UYU^U^UjUj./?/I/IJ,,88DJJPTP\P\] "&!6!6%*" 446&*&6&6&:&:=%&P#  & &) !MNN88:K   rwwrxxk0B'CkSTEUbtu v&*&>&>&btGbGbCb#w pn s; ACAC7AC AC returncDtjd|jS)NzUTrainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.rr@rrs rrzTrainer.tokenizer2sno$$$rc<tjd||_y)NzjTrainer.tokenizer is now deprecated. You should use `Trainer.processing_class = processing_class` instead.r)rrs rrzTrainer.tokenizer7s x !1rc|jj|}t|r%|jjj }n|j }~|j |_|jt}||_ |S)z Activates the neftune as presented in this code: https://github.com/neelsjain/NEFTune and paper: https://huggingface.co/papers/2310.05914 ) rSr,rrrget_input_embeddingsrXregister_forward_hookrbneftune_hook_handle)rrr embeddings hook_handles r_activate_neftunezTrainer._activate_neftune>sy **77> / *(3399NNPJ(==?J )-)A)A & 667PQ #.  rc"t|ds td|jj|}t |r%|j j j}n|j}|jj|` ~y)z^ Deactivates the neftune method. Make sure to call `_activate_neftune` first. rzNNeftune is not activated make sure to call `trainer._activate_neftune()` firstN) r?rrSr,rrrrrremoverX)rrrrs r_deactivate_neftunezTrainer._deactivate_neftuneQs{t23mn n**77> / *(3399NNPJ(==?J   '')  *Orc:|jj|y)ag Add a callback to the current list of [`~transformers.TrainerCallback`]. Args: callback (`type` or [`~transformers.TrainerCallback]`): A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the first case, will instantiate a member of that class. N)r^r_rcallbacks rr_zTrainer.add_callbackbs **84rc8|jj|S)aK Remove a callback from the current list of [`~transformers.TrainerCallback`] and returns it. If the callback is not found, returns `None` (and no error is raised). Args: callback (`type` or [`~transformers.TrainerCallback]`): A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the first case, will pop the first member of that class found in the list of callbacks. Returns: [`~transformers.TrainerCallback`]: The callback removed, if found. )r^ pop_callbackrs rrzTrainer.pop_callbackms$$11(;;rc:|jj|y)a Remove a callback from the current list of [`~transformers.TrainerCallback`]. Args: callback (`type` or [`~transformers.TrainerCallback]`): A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the first case, will remove the first member of that class found in the list of callbacks. N)r^remove_callbackrs rrzTrainer.remove_callback}s --h7rct|ddtjdy|j|}|jj t jk(rt|dr|jyyy)NrzZThe model is already on multiple devices. Skipping the move to device specified in `args`. tie_weights) r6rr@torrHrhTPUr?r)rrr;s rrPzTrainer._move_model_to_devicesi 5/4 0 < NNl    99 " "l&6&6 675-;X     *-6-C-CDJJ   *+"+"8"8!9:://<<H"d4::+G+G+T+T&UUNP^ *-6-C-CDJJ   ** *-6-C-CDJJ   **Y  KK()9(:;JJ((1122STXT]T]^mTnSopyy122HI]I]IfIfHgh77 0M18L8L3L1MM w<1 YZ^ZcZcduZvYwx==AYY=W|jj|jvr||jjnd}nd}|j|jjdnd}t|jj|jjz|||St|S)Nrrlengthsmodel_input_name)rrarror|rrrlength_column_namerrmodel_input_namesrArgradient_accumulation_stepsr)rrrrs r_get_train_samplerzTrainer._get_train_samplers   ..M  =(A 99 $ $$&:mXEUEU+Vyy33}7Q7QQ"$))">">? >B>S>S>_%%77:ei ( **TYY-R-RR%!1  !/ /rF batch_size sampler_fn is_trainingdataloader_keycn|j}tr.t|tjr|j ||}n|j |j|}|||jj|jj|jjd}t|tjjjs~| |||d<|jj|d<|jj |d<|r=t#t$|jj|jj&|d<|j(j+t-|fi|} |<|jjr&t/|dr| |j0|<| S|| i|_| S) zACreate a [`~torch.utils.data.DataLoader`] from the given dataset.)r)r collate_fn num_workers pin_memorypersistent_workerssampler drop_lastprefetch_factor)rrankworker_init_fn_eval_dataloaders)rr|rrrrrrdataloader_num_workersdataloader_pin_memorydataloader_persistent_workersrrmrnrdataloader_drop_lastdataloader_prefetch_factorrrd process_indexrSpreparerr?r) rrrrrrrrdataloader_params dataloaders r_get_dataloaderzTrainer._get_dataloader:s** "z'8;K;K'L11'{1SG CCDDVDVdoCpM%'99;;))99"&))"I"I  '5;;#3#3#C#CD%/9'/B!),-1YY-K-K k *37993W3W / 06=TYY-M-MTXT]T]TkTk7!"23%%--j.VDU.VW   %$))*Q*Qt019C&&~6+9*)E&rc|j td|j|jd|j|jdS)a@ Returns the training [`~torch.utils.data.DataLoader`]. Will use no sampler if `train_dataset` does not implement `__len__`, a random sampler (adapted to distributed training if necessary) otherwise. Subclass and override this method if you want to inject some custom behavior. z+Trainer: training requires a train_dataset.TrainingT)rrrrr)rrr rrrs rget_train_dataloaderzTrainer.get_train_dataloaderhsU    %JK K##&&"--.. $  rcL| t|sy|jjrtr2t |t j t jStrGt |tjtj|jjSt|S|jjrtrXt!|t"j$r>|jj&|j(vr||jj&nd}nd}|j*|j*j,dnd}t/|jj0|||S|jj dkr t|Sy)N) num_replicasr)rrrrrr )raruse_legacy_prediction_looprrBr world_sizeglobal_ordinalrrudp_sizedp_rankper_device_eval_batch_sizerror|rrrrrrrrAeval_batch_size)rrrrs r_get_eval_samplerzTrainer._get_eval_sampler|s[  z,'? 99 / /%'3 r}}REVEVEX)*3 !$#yyCC )66 99 $ $$&:lHDTDT+Uyy33|7P7PP!!=!=> >B>S>S>_%%77:ei ( ))$!1   99  1 $$\2 2rc||j tdt|tr|nd}t |dr3||j vr%|j jr|j |St|tr|j|n||n |j}|j|d|j j|j|S)a  Returns the evaluation [`~torch.utils.data.DataLoader`]. Subclass and override this method if you want to inject some custom behavior. Args: eval_dataset (`str` or `torch.utils.data.Dataset`, *optional*): If a `str`, will use `self.eval_dataset[eval_dataset]` as the evaluation dataset. If a `Dataset`, will override `self.eval_dataset` and must implement `__len__`. If it is a [`~datasets.Dataset`], columns not accepted by the `model.forward()` method are automatically removed. z-Trainer: evaluation requires an eval_dataset.evalr Evaluation)rrrrr) rrrstrr?rrrr rr)rrrs rget_eval_dataloaderzTrainer.get_eval_dataloaders  D$5$5$=LM M*4L#)FF D- .$"8"88 77)).9 9,,   l +'"" ## $yy00--) $  r test_datasetch|j|d|jj|jS)a Returns the test [`~torch.utils.data.DataLoader`]. Subclass and override this method if you want to inject some custom behavior. Args: test_dataset (`torch.utils.data.Dataset`, *optional*): The test dataset to use. If it is a [`~datasets.Dataset`], columns not accepted by the `model.forward()` method are automatically removed. It must implement `__len__`. test)rrrr)r rrr)rrs rget_test_dataloaderzTrainer.get_test_dataloaders8## yy00-- $  rnum_training_stepsc|jtr;tjjj r|j j }n |j }|j||y)aZ Setup the optimizer and the learning rate scheduler. We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the Trainer's init through `optimizers`, or subclass and override this method (or `create_optimizer` and/or `create_scheduler`) in a subclass. )r#rYN)create_optimizerrtrurvrwr rYcreate_schedulerrr#rYs rcreate_optimizer_and_schedulerz&Trainer.create_optimizer_and_schedulersO  $););00II 1CyYrcFgd}t|tjg|}|S)a- Get all parameter names that weight decay will be applied to. This function filters out parameters in two ways: 1. By layer type (instances of layers specified in ALL_LAYERNORM_LAYERS) 2. By parameter name patterns (containing 'bias', or variation of 'norm') )bias layernormrmsnormz(?:^|\.)norm(?:$|\.)z _norm(?:$|\.))rHr LayerNorm)rrforbidden_name_patternsdecay_parameterss rget_decay_parameter_namesz!Trainer.get_decay_parameter_namess'#q.ur||nF]^rc tr |jn |j}|jY|j |}|j Dcgc]\}}||vs |j s|c}}|jjd|j Dcgc]\}}||vs |j s|c}}ddg}|j|j\}}n|j|j|\}}d|vr|jd}d|vr|jd}d|vr|jd}||fi||_dt|vr.|jddd k(rd dl}|jj j#} d } |j%D]} t'| t(j*s| t-| j/Dcic]!}|j1|j3#c}j5z } t6j9d | d | d z d| j;| dddit6j=d| dt6j9d| d z dtr$t?j@|j|_|jScc}}wcc}}wcc}w)a Setup the optimizer. We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the Trainer's init through `optimizers`, or subclass and override this method in a subclass. N)r weight_decayrroptimizer_dict bitsandbytes optim_bitsrzskipped : izM paramsweight zbitsandbytes: will optimize z in fp32z skipped: )!rrRrrYr0named_parameters requires_gradrr2rget_optimizer_cls_and_kwargspoprrr5optimGlobalOptimManager get_instancemodulesrr Embeddingsumrdata_ptrnumelr9rrregister_module_overridedebugruDistributedOptimizer) r opt_modelr/npoptimizer_grouped_parameters optimizer_clsoptimizer_kwargsr5managerskippedmodules rr%zTrainer.create_optimizers+B*CD&& >> !#==iH '0&@&@&B"aqL\G\abapap%)II$:$: '0&@&@&B"aqP`G`efetet%(  , (,,8262O2O/ /262S2STXT]T]_h2i/ /++/?/C/CH/M,**/?/C/CG/L, #33/?/C/CDT/U,*+G\K[\DN]!338H8L8L\[_8`de8e#&,,??LLN'//1VF!&",,73IZIZI\']A aggi(?']'d'd'f#gg hvhb58I$RS88LZ\K]^ 'CF88%TU V  i%'8AB " $ 55dnnEDN~~i J(^s* K! K.K K , K 9K !&K cVtd|jjDS)z9 Get the number of trainable parameters. c3VK|]!}|js|j#ywr)r<rF)rrLs rrz7Trainer.get_num_trainable_parameters..EsQ1779Qs)))rDrrrs rget_num_trainable_parametersz$Trainer.get_num_trainable_parametersAs"Qdjj&;&;&=QQQrc|j td|jjDcgc]}|d c}Scc}w)zR Returns the learning rate of each parameter from self.optimizer. PTrainer optimizer is None, please make sure you have setup the optimizer before.lrrYrr[)rgroups rget_learning_rateszTrainer.get_learning_ratesGs< >> !op p)-)D)DEd EEEs ?rc|j td|&|jjD] }||dvs |cS|jjDcgc]}|d c}Scc}w)a Returns optimizer group for a parameter if given, else returns all optimizer groups for params. Args: param (`str` or `torch.nn.parameter.Parameter`, *optional*): The parameter for which optimizer group needs to be returned. rWrrY)rrrZs rget_optimizer_groupzTrainer.get_optimizer_groupOsn >> !op p  44 !E(O+ L !.2^^-H-HIEhIIIs A'c4'(i'jrIjjddjdD]}|jd\}}|'|<dji(jj fj d} ddtd tttfd tttfd td tttff '(fd }jtjk(rt}(j!ddd|(fSjtj"tj$fvrMddlm}|}(j!|jtj$k(r(j!ddi|(fSjtj*k(r ddlm}|}(j!||(fSjtj2k(r ddlm} | }(j!||(fSjtj8k(r ddlm} | }(j!||(fSjtj>tj@tjBtjDtjFtjHtjJtjLtjNtjPtjRtjTtjVtjXtjZfvrk ddl.m}m/} m0} d} d}d}|}djvrd} djvrd}djvr|}ndjvr| }d jj fi}nVd!jvr| }'}nBd"jvr3tcrStejfthjjjed#tejfd$kr t1d%dd&l.m6}|}to'jqd'jto'jqd(j to'jqd)d*fto'jqd+d,to'jqd-j d.}d/'vrts'd/|d/<d0'vrts'd0|d0<d1|i}d!jvr| |d2<(j!|(j!|tcr]tejfthjjjed#tejfd4krttjwd5|(fSjtjxk(r dd6l=m>}|}(j!|(j!t'jqd7d8tt'jqd9d:tt'jqd;d:tt'jqd|(fSjtjk(rtjj}|(fSjtjk(rtjj}|(fSjtjk(rtjj`}|(fSjtjtjtjtjtjtjfvr/ts t/d@ddAlMmN}mO}mP}tj|tj|tj|tj|tj|tj|i}ts'jdBdCts'jdDdEto'jdFdG'jdHdIdJ}|j||\}(jtjk(r(j!ddd|(fSjtjtjfvrts t/dKddLlUmV}tj|tj|i}ts'jdBdC'jdMdN'jdOdPts'jdDdEto'jdFdQ'jdHdIdR}|j!||j||\}(|(fSjtjtjfvrjts t/dStdTs t/dU t1dVddWl[m\}m]}dXjvr|}n|}(j!dYi|(fSjtjk(rts t1dZdd[l`ma}|}(j!to'jqd\d]to'jqd^d_to'jqd`dato'jqdbdato'jqdcdQdd|(fSjtjtjfvritrHtejfthjjjedetejfdfkr t/dgtejfthjjjedhtejfdikr t/djtejfthjjjedetejfdkk\r ddllemf}mg}nddllhmf}mg}jtjk(r|}n+jtjk(r|}n t1dm(j!||(fSjtjtjtjfvr?ts t/dntdTs t/doddplmmn}mo} i}d}!jtjk(r#tdqs t/drddslmmp}"|"}|}d}!nMjtjk(r|}|}n+jtjk(r| }n t1dtj|du<|!rj|dv<|j!to'jqdwd_to'jqdxdydz(j!||(fSjtjk(rts t/d{dd|lumv}#'jd}d}$|$ to|$}$'jd~d}%|% t|%}%j|du<t'jdd|$|%d}&|#}(j!|(j!|&|(fSt1dj#t.$r t1dwxYw#t.$r t1dwxYw#t.$r t1dwxYw#t.$r t1d3wxYw#t.$r t1d?wxYw)z Returns the optimizer class and optimizer parameters based on the training arguments. Args: args (`transformers.training_args.TrainingArguments`): The training arguments for the training session.  r,=rX)betasepsToptimizer_nameoptimizer_mapping optim_kwargsis_layerwise_supportedrc|jjd}|r.jtjk(r|rt d|d||}j td|dtj ttfstdj td|dtj txrj jd d d k(}g}jD]s\}} tj |d \} } t| tj s | r| st"j%|d|d[| s|s`|j'|dzut)|dk(rtd|dj dj+D cgc] \} } | |vs | }} } j+D cgc] \} } | |vs | }} } |j-d|id|i|g}|rj.dk7rtd|di|D]}|d|gigfi|<|D]}|d|gi|gfi|<fd}j1D] }|j2s|j5|"t6}j-dij-d|i|fScc} } wcc} } w)a Helper function to set up low-rank optimizers like GaLore and Apollo. Args: optimizer_name (str): Name of the optimizer. optimizer_mapping (dict): Mapping of optimizer names to their classes. optim_kwargs (dict): Keyword arguments for the optimizer. is_layerwise_supported (bool): Whether layerwise optimization is supported. Returns: tuple[Any, Any]: Optimizer class and updated optimizer kwargs. layerwisez Layer-wise z" does not support DDP at this timez1You need to define `optim_target_modules` to use z optimizerszX`optim_target_modules` must be a list of strings, a regex string, or 'all-linear'. Got: z'You need to pass a model to initialize z optimizer._-z all-linearT)return_is_regexz matched but ignored. z only supports linear layers.z.weightrzNo target modules found for z ().rr z Layerwise z( does not support gradient accumulation!cl|j'|j|jyyr)gradstep zero_grad)rr4s roptimizer_hookz^Trainer.get_optimizer_cls_and_kwargs..setup_low_rank_optimizer..optimizer_hooks4zz-&u-224&u-779.rr4)lowerendswithrHrhrINotImplementedErroroptim_target_modulesrrrrrjreplace named_modulesr[rLinearrr@appendr:r;updaterrr<"register_post_accumulate_grad_hookr@)rdrerfrg is_layerwiserN all_lineartarget_params_names module_namerRtarget_module_existsis_regexrKrL target_paramsnon_target_paramsr[rrrr4rr optim_argsrOs @rsetup_low_rank_optimizerzFTrainer.get_optimizer_cls_and_kwargs..setup_low_rank_optimizerzsn$*//1::;GL 2 2l6N6N NSi)K7GGi*jkk-n=M((0 #TUcTddo!pqqd77$EnospIpIoJK} #J>JZZe!fgg444c:P--55c3?<O  #% ',':':'< D# V1K--{D2.$h"&"))4+H*m+A.AQQno+J#**;+BC D"&'1, #??OrRVRkRkQlln!opp+0+A+A+C`41aqL_G_Q`M`/4/E/E/G htq!1TgKg h  h    +,-=9L9L 33q8$z.1AAi%jkk!#.eE,9Hug;N:O,dSc,dN5)e*uE,9Hug;^Q];^:_,tcs,tN5)u: #--/QE**@@PQ!8  '')9>(JK  # #X|$< = "22 2Aa hs; K K " K/KF)scale_parameter relative_stepr)AdamWfusedz7Trainer failed to import syncfree AdamW from torch_xla.) NpuFusedAdamWz3Trainer failed to import FusedAdamW from torch_npu.) FusedAdamzFTrainer tried to instantiate apex FusedAdam but apex is not installed!)rLionRMSpropr:Npaged8bitr7adamlionrbrmspropademamixr5z0.44.0z{The AdEMAMix optimizer is not supported by your current version of `bitsandbytes`. Please install `bitsandbytes` >= 0.44.0.)AdEMAMixbeta1beta2beta3gH.?alphag@rc)rbrrct_alphat_beta3r6is_pagedzOTrainer tried to instantiate bnb optimizer but `bitsandbytes` is not installed!z0.41.1zYou are using 8-bit optimizers with a version of `bitsandbytes` < 0.41.1. It is recommended to update your version as a major bug has been fixed in 8-bit optimizers.)AnyPrecisionAdamWuse_kahan_summationFalsemomentum_dtypefloat32variance_dtypecompensation_buffer_dtypery)rrrrz4Please install https://github.com/pytorch/torchdistxzYou need to install `galore_torch` in order to use GaLore optimizers install it with `pip install git+https://github.com/jiaweizzhao/GaLore`)GaLoreAdafactor GaLoreAdamWGaLoreAdamW8bitrupdate_proj_gapscaleg? proj_typestd)rrrrzYou need to install `apollo_torch` in order to use APOLLO optimizers install it with `pip install git+https://github.com/zhuhanqing/APOLLO`) APOLLOAdamWprojrandom scale_typechannel?)rrrrrrziYou need to install `lomo_optim` in order to use LOMO optimizers install it with `pip install lomo-optim`0.30.0zGYou need to have `accelerate>=0.30.0` to be able to use LOMO optimizerszMYou need to pass a `model` in order to correctly initialize a LOMO optimizer.)AdaLomoLomoadarz5Please install grokadamw with `pip install grokadamw`) GrokAdamW alpha_initg\(\?lambg@gammag?grokking_signal_decay_rategradient_clipping)rrrrrtorchaoz0.4.0zYou need to have `torchao>=0.4.0` in order to use torch 4-bit optimizers.Install it with `pip install torchao` or follow the instructions here: https://github.com/pytorch/aorz2.4zYou need to have `torch>2.4` in order to use torch 4-bit optimizers. Install it with `pip install --upgrade torch` it is available on pipy. Otherwise, you need to install torch nightly.z0.11.0) AdamW4bit AdamW8bitzInvalid optimizerzwYou need to install `schedulefree` in order to use schedulefree optimizers. Install it with `pip install schedulefree.`zOYou need to have `accelerate>=0.30.0` to be able to use schedulefree optimizers)AdamWScheduleFreeSGDScheduleFreerzYou need to install `schedulefree>=1.4.0` in order to use RAdamScheduleFree optimizer. Install it with `pip install schedulefree.`)RAdamScheduleFreezInvalid schedulefree optimizerr2 warmup_stepsweight_lr_powerrr3)rrzwYou need to install `torch-optimi` in order to use stable_adamw optimizers. Install it with `pip install torch-optimi`.) StableAdamWmax_lr kahan_sum decouple_lr)rrrz2Trainer cannot instantiate unsupported optimizer: T)wrrwsplit learning_rate adam_beta1 adam_beta2 adam_epsilonrdictr booltupler?rg ADAFACTORr/r{ ADAMW_TORCHADAMW_TORCH_FUSED torch.optimrADAMW_TORCH_XLAtorch_xla.amp.syncfreerArADAMW_TORCH_NPU_FUSEDtorch_npu.optimrADAMW_APEX_FUSEDapex.optimizersr ADAMW_BNB ADAMW_8BIT PAGED_ADAMWPAGED_ADAMW_8BITADEMAMIX ADEMAMIX_8BITPAGED_ADEMAMIXPAGED_ADEMAMIX_8BITLION LION_8BIT PAGED_LIONPAGED_LION_8BIT RMSPROP_BNB RMSPROP_8BIT RMSPROP_32BITbitsandbytes.optimrrr{rrrrrfloatrrrr@ADAMW_ANYPRECISIONtorchdistx.optimizersrrr6rSGDADAGRADAdagradRMSPROP GALORE_ADAMWGALORE_ADAMW_8BITGALORE_ADAFACTORGALORE_ADAMW_LAYERWISEGALORE_ADAMW_8BIT_LAYERWISEGALORE_ADAFACTOR_LAYERWISEr} galore_torchrrrr> APOLLO_ADAMWAPOLLO_ADAMW_LAYERWISErz apollo_torchrLOMOADALOMOrry lomo_optimrr GROKADAMWr~ grokadamwrADAMW_TORCH_4BITADAMW_TORCH_8BITr torchao.optimrrtorchao.prototype.low_bit_optimSCHEDULE_FREE_RADAMSCHEDULE_FREE_ADAMWSCHEDULE_FREE_SGDr schedulefreerrrr2r STABLE_ADAMWroptimir))rrmappingkeyvalue adam_kwargsrrNrrrrrrr6additional_optim_kwargsr bnb_kwargsrrrrregalore_optim_kwargsrapollo_optim_kwargsrrrrrrrrequire_warmuprrrrstable_adamw_kwargsrrOs)`` @@rr=z$Trainer.get_optimizer_cls_and_kwargs_sy ????223;AA#F ($]]3/ U"' 3 (!$"4"45oot7$$ ,0 ] 3] 3#CH~] 3sCx.] 3%) ] 3 38_ ] 3] 3~ ::11 1%M  # #PU$V Wl ...k ZZN668X8XY Y )!M  # #K 0zz^=== ''$8^ ...] ZZ>99 9 \8 %  '' 4R ...M ZZ>?? ? X9 -  '' 4B ...} ZZ>:: : k5 )  '' 4r ...m ZZ  $ $  % %  & &  + +  # #  ( (  ) )  . .     $ $  % %  * *  & &  ' '  ( (  "8 tCC  $ *5'djj(#HTZZ'!"JTZZ'$)Mtzz)$(M/6$//8Z.[+$**,$+M/9+4::-02w}}!**22>B8 h/80)G <$,M"*..$//"JK!*..$//"JK!*..&"AB" "'z~~gs'C!D$Z^^E4;L;L%MN/+!J.=@IAV=W/ : J.=@IAV=W/ :*J7 DJJ.-5Jz* ''(?@ '' 3)*w}}""**>:0 h'0(rR...KZZ><< < YC 1  '' 4!''/8H]_f9g/h*1%HXZc9d*e*1%HXZc9d*e5<!:>>2Mz#Z6  z...cZZ>-- -!KKOOM`..._ZZ>11 1!KK//M\...[ZZ>11 1!KK//MX...WZZ  ' '  , ,  + +  1 1  6 6  5 5   -.!_ S R++[00///55{::O99? ! JNN6378#&z~~6G'M#Nz~~gt<='^^K? # /G -/B/ +M+zz^<<< ''ETY(Z[P...OZZ  ' '  1 1  -.!^ 1++[55{!  JNN6378"vx8(nn\9E#&z~~6G'M#Nz~~gs;<'^^K? #  & &{ 3.F -/B/ +M+Z...UZZN//1G1GH H$&!@+84!"kll} !pqq 0 " ' $  # #We$4 5n...mZZ>33 3)+ !XYY +%M  # #"' |T(J"K!*.."=>":>>'3#?@27 Gceh8i2j).z~~>QSV/W)X  ^...MZZ  + +  + +  ()W]]9;M;M;U;UV_;`-adkdqdqe."{}}Y//77@AW]]SXEYY!K}}Y//77 BCw}}U]G^^>>Pzz^<<< ) ~>>> )  !455  # #K 0V...UZZ  . .  . .  , ,  -.!B+84!"stt G&( #!Nzz^???09%F; 1 *5'!&~AAA 1 *5'~??? /  !ABB6:6G6G #N 3:>:K:K'7 # * *',Z^^66 6,.!B +^^Hd3F!v"{D9I$ O *.*;*;K '#JNN=%$HI &#  (M  # #K 0  # #$7 8...QRVR\R\Q]^_ _O  \ !Z[[ \ X !VWW X k !ijj kT t !rss t6 Y !WXX Ys>1|",|:'}5G?}*=B"~"|7:}}'*}?~rYc|jkt|jj| |jn||jj |||jj |_d|_|jS)z Setup the scheduler. The optimizer of the trainer must have been set up either before this method is called or passed as an argument. Args: num_training_steps (int): The number of training steps to do. )rYnum_warmup_stepsr#scheduler_specific_kwargsT)rZr0rlr_scheduler_typerYget_warmup_stepslr_scheduler_kwargsrr's rr&zTrainer.create_scheduler3sq    $ - ++,5,=$..9!%!;!;r: NameErrorrTrjrper_device_train_batch_size)rr rs r num_exampleszTrainer.num_examplesFsw  K ((G'#78:--5566z))* *>95 Kz?TYY%J%JJ J Ks:AA6B  B train_dlrkcd} |D]#}|dj}|||zcS||z }% |S#t$rtjdY|SwxYw)zq Helper to get number of tokens in a [`~torch.utils.data.DataLoader`] by enumerating dataloader. r input_idsz%Cannot get num_tokens from dataloader)rFKeyErrorrr@)r!rk train_tokensbatchtokenss r num_tokenszTrainer.num_tokensTsz   D! '{+113(!I--&  ' D NNB C Ds //AAtrialz optuna.Trialc ||_|j|y|jtjk(r|j |}n|jtj k(r|}|j ddn|jtjk(rI|jjDcic]#\}}|t|tr t|n|%}}}n|jtjk(r|}jD]v\}}t|j|st j#d|d6t%|j|d}|t'||}t)|j||x|jtjk(r"t j+d|j,|jtjk(r"t j+d|j|jtjk(rt j+d||j.r|jj0 t3d|j4j7d d lm}d d lm} | |jj0|j_ |jj@jC|j||jj@ |j_"tGjI|jKycc}}w) zHP search setup codeNwandbzTrying to set zY in the hyperparameter search but there is no corresponding field in `TrainingArguments`.zTrial: zSigOpt Assignments: zW&B Sweep parameters: z7For sweeps with deepspeed, `args.deepspeed` must be setr)DeepSpeedPluginHfTrainerDeepSpeedConfig) hf_ds_config)&_trialrrTOPTUNAhp_spaceRAYr>SIGOPT assignmentsitemsrrrWANDBr?rrr@r6rsetattrrrrGr&rrS free_memoryaccelerate.utilsr,#transformers.integrations.deepspeedr.hf_deepspeed_configtrainer_config_processdeepspeed_pluginr _reset_stater() rr)rrvrrold_attrr,r.s r_hp_search_setupzTrainer._hp_search_setupds  ! ! )U]   ! !_%;%; ;]]5)F  # #':': :F JJw %  # #'='= =INIZIZI`I`IbcAa:a#5Q1<cFc  # #'<'< <F ,,. +JC499c*$SE*,,tyy#t4H#&Xu- DIIsE * +  ! !_%;%; ; KK'%,,0 1  ! !_%;%; ; KK.u/@/@.AB C  ! !_%:%: : KK08 9  $ $yy""* !Z[[    ( ( * 9 T,DTYYEXEX,YDII ) II ) ) @ @ K)8diiFcFc)dDII &   + + - //1Uds2(L;rpmetricsc~|j|y|j}|j||_|jtj k(rddl}t|dr|jjsy|j|j||jrL|jj|j|j|j |j"yyy|jtj$k(rddl}t)j*5}d}|j j,r7|j/||j0j2j5|}|j|d<|j0j||dddyy#1swYyxYw)Nrstudy)checkpoint_dir objective) checkpoint)rcopycompute_objectiverGrTr1optunar?rE_is_multi_objectivereport should_pruner^ on_train_endrrvr TrialPrunedr3 ray.traintempfileTemporaryDirectoryrf_tune_save_checkpointtrain Checkpointfrom_directory)rr)rprCrKraytemp_checkpoint_dirrHs r_report_to_hp_searchzTrainer._report_to_hp_searchss  ! ! )U] ,,.//8  ! !_%;%; ; ug&u{{/N/N/P T^^T2%%'))66tyy$**dll[,&,,..(0Q&  # #':': : ,,. A2E! <<++..>Q.R!$!5!5!D!DEX!YJ'+~~ $   Z @  A A; A As -ABll>P>P>RDJJ ) )*: ; JJ # #BGGLL=O$P Q JJt~~002BGGLL^4\ ] JJt((335rww||JP^7_ ` !rct|j}|dk(r|j}n"|dk(r|j|}n td| td|S)Nrr z'model_init should have 0 or 1 argument.z"model_init should not return None.)rcrr0)rr)model_init_argcountrs rr/zTrainer.call_model_inits]1$//B ! #OO%E A %OOE*EHI I =CD D rc |s|tjd|Stt|}|j |} t j |}|j |jjdd}|r||_ td}|jj|5tj5t|t r#tj"j%||d}n4tj"j%||Dcic]}||| c}d}ddddddtj"j'|}tj5|d i||d i|ddd|}d|_|S|Scc}w#1swYoxYw#1swYsxYw#1swY3xYw#t*t,t.t0t2f$r$} tjd| dYd} ~ |Sd} ~ wwxYw) NzAfailed to use PyTorch jit mode due to current dataloader is none._original_forwardF) cache_enabled)autocast_handler)example_kwarg_inputsstrictz'failed to use PyTorch jit mode due to: rr)rr@nextiter_prepare_inputsrIr__dict__r>rVrrSautocastrno_gradrrjittracefreezerrr0rjrr IndexError) rrr training example_batch jit_modeloriginal_forwardrlres rtorch_jit_model_evalzTrainer.torch_jit_model_evals!bc  j!12M 00?M O IIe,  #,#5#5#9#9:Mt#T #(8I%#1#F %%..@P.QSXS`S`Sb!-6$)IIOOITajoO$p $)IIOO%Ub1cc#}S7I2I1c#(%4% "II,,Y7 ]]_/. .. ./"#(  u 2d // !)ZJO O!H1MNN  OstA2G5F, AF  F) F 2F,:;G5F8GF  F) %F,,F51G8G=GH"HHc jdddd}d}d}|jD]<\}}t||d}t||d} |#| &|| k7s,|d|d|d | d z }d }>|j} |jt d |j z} | | k7r|d | d | d z }d }|rt j|yy)N logging_steps eval_steps save_steps)rrrFztWarning: The following arguments do not match the ones in the `trainer_state.json` within the checkpoint directory: z r8z (from args) != z (from trainer_state.json)Tr z per_device_train_batch_size: )r6r6rrmaxn_gpur warning_once) r training_args trainer_stateattributes_map has_warning warning_strarg_attr state_attr arg_value state_value train_bs_argstrain_bs_states r#compare_trainer_and_checkpoint_argsz+Trainer.compare_trainer_and_checkpoint_argss,&&   M $2$8$8$: # Hj x>I!-TBK$)@YR]E]hZr)}oM]^l]mnHI IKK     , rcd tretjtjj r jStj |j jSjj|d|ur|SjrB|r@ddl m }|j|jj j\}_j j dkDr"t#|ddst%j&|}j j(rMt+j*}j-|||}t/t+j*|z d _|s|Sj2r dd lmdd lmdd lm}m}j@rdd l!m"d}d} t#|dd} j jHjKd| } j jHddkDr/tMjN|j jHd}nT| RtQ} | D],} tS|| }| tUd| jW|.tMjN|| }j jX}j jHdrD|jZj\r&t^jadd|jZ_.fd} j@rd}|||| x_}n|f|| d|x_}difd}|tb_2|Stgr@t$jhjk|tmtojpdg}|Sj jrttjvk(rtyr|Si}j jzj jz|d<n&t|t|r|j~ |d<nd|d<j jj j|d <j jj j|d!<td"i|j_C|S#tF$r tGdwxYw)#N)backward_passes_per_stepFkeep_torch_compileramp) opt_levelr is_loaded_in_8bit)XlaFullyShardedDataParallel)checkpoint_module)size_based_auto_wrap_policytransformer_auto_wrap_policy)SpmdFullyShardedDataParallelzJMissing XLA FSDP related module; please make sure to use torch-xla >= 2.0._no_split_modulestransformer_layer_cls_to_wrapmin_num_params)rz@Could not find the transformer layer class to wrap in the model.)transformer_layer_clsxla_fsdp_grad_ckptzX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.cHjsn}||g|i|Sr)r)mrkwargs target_clsFSDPFSDPv2rrs rauto_wrapper_callablez2Trainer._wrap_model..auto_wrapper_callable]s--1-H-HfJ%&7&:LTLVLLrcddlm}d}t|tjr|}n.t|t r|d}nt||r |j }| tdtj||dy)Nr )CausalLMOutputWithPastrzASomething went wrong, the output of the model shouldn't be `None`)rNN) modeling_outputsrrrTensorrlogitsrr mark_sharding)outputmeshr real_outputs r shard_outputz)Trainer._wrap_model..shard_outputdsiH"&K!&%,,7&, #FE2&,Qi #F,BC&,mm "*()lmm$$[$8LMr)rauto_wrap_policyr)rrcV|jdi|}|rtj|S)Nr)rpxm mark_step)rYbarrieroptimizer_argslosss rpatched_optimizer_stepz3Trainer._wrap_model..patched_optimizer_steps&%y~~77LLN rSMDATAPARALLEL_LOCAL_RANK) device_idsfind_unused_parametersT bucket_cap_mbbroadcast_buffersr)DrrrRrurDistributedModelrrrSr,rqrr initializerYfp16_opt_levelrr6r DataParallel jit_mode_evaltimer~roundjit_compilation_timerFtorch_xla.distributed.fsdprrtorch_xla.distributed.fsdp.wraprrr7torch_xla.experimental.spmd_fully_sharded_data_parallelrrArEr functoolsrr8rG Exceptionaddxla_fsdp_configr} use_cacherrroptimizer_steprparallelDistributedDataParallelrrggetenvrHrhrIrddp_find_unused_parametersr*is_gradient_checkpointingddp_bucket_cap_mbddp_broadcast_buffersr ddp_handler)rrryr r start_timerrrr%default_transformer_cls_names_to_wrap"fsdp_transformer_layer_cls_to_wraptransformer_cls_to_wrap layer_classtransformer_cls fsdp_kwargsrrrrrrs` @@@r _wrap_modelzTrainer._wrap_model sK " $$,,cii.H.HI)))'' HmHmn n    ( (5 ( IQV VL ==X $'NN5$..TXT]T]TlTlN$m !E4> 99??Q wu6I5'QOOE*E 99 " "J--eZJE(-diikJ.F(JD %L  # # pZH .. $ $( !4;ECVX\4] 1151F1F1J1J/1V2 .yy$$%56:#,#4#4/ @U@UVf@g$ 4?*-%'#EEK&@ &TO&.'(jkk/33OD E$-#4#40*A$ ))33Kyy$$%9:<<))''r.3ELL*M ** N&,!-%5*? & U&*&%5*?&" & U;@PR  !7B 4 3% &KK773ryy1L'M#N"O8E0 +YY $ $ (@(@ @,. Fyy33?37993W3W/0E?38=7V7V3V/037/0yy**6*.))*E*E'yy..:.2ii.M.M*++H+R6+RD   ( U p!"noo ps &RR/resume_from_checkpointignore_keys_for_evalrc |durd}|jj|j}d|_t |j t tfr&t|jdr|j|j |j|j|_ |js |jrJ|js>|j s2|j"&|j%|j|j&d|vr+|j)d}t+j,dt.t1|dkDr5t3dd j5t7|j9d |j;||jj<|_d}|j"{|jj@rtC|jjDntG|jjD|jI||_ d}d \|_%|_&t |tNr2|r0tQ|jR}|tUd |jRd |tWs)|jXs|jZs|j]|t_j`tbjdj5|tf}|j<|j<|_|rC|jhr&|j%|j|j&|j|_5tm|jn|j>|jp}|jrr5 tujv|||||tujxS|||||S#tujxwxYw)a Main training entry point. Args: resume_from_checkpoint (`str` or `bool`, *optional*): If a `str`, local path to a saved checkpoint as saved by a previous instance of [`Trainer`]. If a `bool` and equals `True`, load the last checkpoint in *args.output_dir* as saved by a previous instance of [`Trainer`]. If present, training will resume from the model/optimizer/scheduler states loaded here. trial (`optuna.Trial` or `dict[str, Any]`, *optional*): The trial run or the hyperparameter dictionary for hyperparameter search. ignore_keys_for_eval (`list[str]`, *optional*) A list of keys in the output of your model (if it is a dictionary) that should be ignored when gathering predictions for evaluation during the training. kwargs (`dict[str, Any]`, *optional*): Additional keyword arguments used to hide deprecated arguments FNTr} model_pathzi`model_path` is deprecated and will be removed in a future version. Use `resume_from_checkpoint` instead.rz*train() got unexpected keyword arguments: rrrz/No valid checkpoint found in output directory ())rrr)r)=r*r+rr'rrr3r1r?rrrXrrKrLrMr7rrPr;r>r1r2r3r:rjrrrrBrrr#r^r$rer/rYrZrr`rrrrGrN_load_from_checkpointr;load_from_jsonrgr^rbrJrRr__inner_training_loopauto_find_batch_sizerd hf_hub_utilsdisable_progress_barsenable_progress_bars) rrr)rrrmodel_reloadedrvinner_training_loops rrUz Trainer.trains1. "U *%) " ""$yy d++.E~-V W\c JJ]   & & (  # # /// ;DJ  D$7$7MM**'  & &tzz4;; ? 6 !%+ZZ %= " MM  v;?HSWX^XcXcXeSfIgHhhijk k e$!%!;!; ?? &7;yy7Q7Q #DIINN 3W_`d`i`i`n`nWo--e4DJ!N0: -DND- ,d 38N%8%I "%- #RSWSbSbRccd!eff ! -*,T5N5NW[WkWk**+AB // =SUg0hiE%%1).)?)?& ))**4::t{{C!%D 8  % %t'='=t?X?X     4224*+A)= 113&'=%9  113s 7N77O ct|jddx}|S|jrEt|jddx}r,|jj dij ddSy)zGGet the tensor parallel size from either the model or DeepSpeed config._tp_sizeNr<tensor_parallel autotp_sizer )r6rrGrr}r)rmodel_tpdeepspeed_configs r get_tp_sizezTrainer.get_tp_size ss  J= =H JO  $ $gdiiQfhl>m*m*:*m#**../@"EII-YZ[ [rct|j|jz}|j|jz|zS)zCalculates total batch size (micro_batch * grad_accum * dp_world_size). Note: Only considers DP and TP (dp_world_size = world_size // tp_size).)rrrr)rr dp_world_sizes rget_total_train_batch_sizez"Trainer.get_total_train_batch_size* s74+;+;+== %%(H(HH=XXrc  1|jj||_|jjr|j j |jk7rddlm}||j\|_ |j|_ |jrt|jj}|jtd|jjz|j_ |jd||j_ |j|j _t j#d|j|j%}|j&r t)|}|j+|} |j-||| \} } } } }}}d}|jj.r9|j1||rdn|}||r||j2z}n||j4z}t6j8|jj"vr9|jjdkDr t;dt=|jt?xs|j@xs |jB}|jBxr.tE|jj jFdddk(}|rd }|jHrd|_%d |_$|jrtM|| \|_'|_%|s|jQ| tS|jTjV|jXgzDcgc]}t[|t\s|c} |_|du|j _/|j|j _|j ja|||jbr&|jje|jf |ji|j}||ju}|r(|jBrtk|jd |_ |rg|rS|jm|jjndk7r*|jjq|j|_ |jQ| |r|jjstu|jJdr|jvr&|jjq|j}n|jxr+|jjq|jN|_'n|jjq|j|jN\}|_'nt|jjq|j|jN|jJ\}|_'|_%n*|jjq|jN|_'|jBr|x|_ |_ ||jur||_ |jr|j|_=|k|jr-t}|j|t|j n2t?s |jBr|j||j|j||j|t jdt jd| dt jd| dt jd|jjd|jj|jk7r#t jd|jdt jd| dt jd|j4t jd|dt jdt|ddd|j _Etj}d}d}|tjjtjj|trLtSjtjj|t|_|j|j|j |jt|j j| z}|js)|j j| z}||j4z}nd}t jdt jd|t jd|j j|jst jd|d |d!d"D]#}t|jT|tE||%||jT_S|j j||| |tjd#|j$}d#|_X|j j|_Y|jd}d}|jTj||j |jX|_,|jr|j||d%t|| D ]}|}tu|d&r|j||jdk\rd|_a| t|n|j|j4z} |jTj||j |jX|_,d'}!d }"||k(r=|;|dkDr |jst||}|dz }!d}"n|dk(r|j|t|}#| |j4z}$|$dk(r |j4}$d'}%| |j4zt|$|j4kz}&t|&D]}'|%dz }%|%|&dz k7r |j4n|$}(|j|#|(|j\})}*t|)|_it|)D]\}+},|!dz }!|!dz|j4zdk(xs|!dz| k(}-|jjj|-|jjd(vrtE|jd)d*}.|.|,vrt jd+n^|jjd,k(rd-|,vr|,d-j}/n|jWtu|jd.rA|jj+|,|.|jjk7j}/n}6|j|6z }7t[d?|| |j j|@}8|j]|j j^|8dA<|7|8dB<d |_|jbje|8|jg|8|ji|}9|jkd |9C}:|jjlr|j jV|jjndk(rp|:D]k};tjjq|;|j jVr:t jdD|;dEtsjt|;dFm|jTjw||j |jX|_,|jy|jz|j}|jt|j j|7|8Scc}w#1swY 8xYw#1swYxYw#1swY:xYw)GNr)release_memoryr Tz)Currently training with a batch size of: zCurrently --debug underflow_overflow is not supported under DP. Please use DDP (torchrun or torch.distributed.launch (deprecated)). fsdp_versionF)r#)r)gradient_checkpointing_kwargs) recursivefp8rpload_module_strictz***** Running training ***** Num examples = r`z Num Epochs = z( Instantaneous batch size per device = zA Training with DataParallel so batch size has been adjusted to: zE Total train batch size (w. parallel, distributed & accumulation) = z Gradient Accumulation steps = z Total optimization steps = z# Number of trainable parameters = )trainable_onlyzE Continuing training from checkpoint, will skip to saved global_stepz! Continuing training from epoch z' Continuing training from global step z Will skip the first z epochs then the first z batches in the first epoch.)rrYrZr3r;)skip_scheduler set_epoch)rFmain_input_namer#zTried to track the number of tokens seen, however the current model is not configured properly to know what item is the input. To fix this, add a `main_input_name` attribute to the model class you are using. non_paddingattention_maskrz\Could not determine method to count non-padding tokens, falling back to counting all tokens.)r;r)rz0Calculated loss must be on the original device: z but device in use is r)implicit_replicationitem)rzXThere seems not to be a single sample in your epoch_iterator, stopping training at step zI! This is expected if you're using an IterableDataset and set num_steps (z.) higher than the number of available samples.zYou enabled PyTorch/XLA debug metrics but you don't have a TPU configured. Check your training configuration if this is unexpected._pastzU Training completed. Do not forget to share your model on huggingface.co/models =) gMbP?rU) num_samples num_stepsr( total_flos train_loss use_mtimerDeleting older checkpoint [] due to args.save_total_limit ignore_errors)rSr9rrrrvrr:rrRrrGrrrpropagate_args_to_deepspeedrrHrrr(rset_initial_training_valuesinclude_tokens_per_secondr(rlrrUNDERFLOW_OVERFLOWrrrrFrNr6 fsdp_pluginrrZr%rYr(r;r^rrrr6is_hyper_param_search compute_stepsgradient_checkpointinggradient_checkpointing_enablerrr,_fsdp_qlora_plugin_updatesmixed_precisionr rUr?rq is_tp_enabledr&r&rr_load_optimizer_and_scheduler _load_scalerrrFepochrrgr^isfilerrbrr_load_callback_staterr_ignore_data_skipr8train_dataloaderinit_training_referencesrrr;_total_loss_scalar_globalstep_last_loggedrqon_train_begin eval_on_start _evaluaterr  past_indexrr:rkon_epoch_beginr_load_rng_staterpget_batch_samples#current_gradient_accumulation_steps enumerategradient_state_set_sync_gradientsinclude_num_input_tokens_seenr@rDrrrFint64num_input_tokens_seengatherr on_step_begindistributed_typer DEEPSPEEDrrno_syncrr training_steplogging_nan_inf_filterrisnanisinfrrfloating_point_ops max_grad_normr clip_master_gradsrrrrmclip_grad_norm_ master_params&torch.distributed._tensor.experimentalrrryget_global_grad_normon_pre_optimizer_steprpon_optimizer_steproptimizer_step_was_skippedr?ReduceLROnPlateau on_step_end_maybe_log_save_evaluateon_substep_endshould_epoch_stopshould_training_stoprr on_epoch_endTPU_METRICS_DEBUG master_printmetmetrics_reportdelattrr!best_model_checkpoint_load_best_modelrf store_flosrr'r*rlog_get_output_dir_sorted_checkpointsrfsave_total_limitsamefileshutilrmtreerO_finish_current_pushrXrrZ)tXa0b -DND-'  / /9 / M!!22<< ~M Q[\^`oQp   ,1+< (&*&<&< #   y1  & & JJ 4 4SWSuSu 4 v  !3!34 #(4::"5 "t';';&djjDADJ #&//1##33uWefjfpfpWqSq)*d.B.B**+A4CUCUV **+AB 01  23 ' Q'789 o&6q%9:; >tyy?d?def>ghi 99 0 0D4J4J J KK[\`\r\rst[uv w [\rst[uvw 6t7W7W6XYZ 3Ia=AB 9:OPUfj:klm9nop YY[ )*& " -"''.. GGLL/1C D3 &44RWW\\BXZl5mnDJ  4 4TYY K  % % ' !7!7;U!UVN((151G1GKe1f..$2R2RR.12. KK_ ` KK;N;KL M KKA$**BXBXAYZ [(( ,^,<=677SU ; FD D))4t1D E F1A. ++D)=MuU,,s4;;7"%'+zz'='=$ %)  ,,;;D$**dll[    NN5"6tN L>+;<b E/ '5 **51!#! "-$%^^d&F&FF   00??djjRVR^R^_DLDK&+A+M1A5d>S>S'9:JLj'k$9A=D"&K3q8(()?@!"23N&)I)IIIA~ << K*d.N.NNQTD<<<RM=)e q BMR_bcRcBdd>>js 484J4J>[fhlhshs4t1 1>mS*1$**>OQ\*]*&8"NN!b $yyFF-W#3v#=39:J3K3O3O3QL$($9$9$E(/0E0E~(V(,(=(=(J(J(V)/(?4CXCXCeCe(e&)ce%1%+NN)G%&4:/3J3P3P3RL/5o/F/L/L/N +0<< TYYM]M]ejepep+qL JJ<<@P@P@W@WXd@e@i@i@k@p@p@rr<",,-CD&+ d>>>!C'+'<'<'J'J4QUQ[Q[]a]i]i'j M 2Q 66 ,,==AZAZZ"))$*:*:*B*B%P(33  !]'+'9'9%I['\ ]33 6 8"[[6%++l:S#*Gq4::;Q;Q7QTXTpTp7p,q"q">>\-@-@@","RSZSaSaRbbxzFzMzMyN!O##*L"8%%t/F/Fv/N)OO%#((77KKDQ --9d>P>PST>T68TYY-1^^-M-MdN`N`-a !% 4.0XX-E-E$'$5$5dnn$E$($6$6." 5?4J4J 1#'#5#5$k8L$5%6%8!&151A1A1Q1Q(-(8(8(:(,(:(:2&J!&!8 9$($4$4$E$EIbIb$b,1,F,F,H #*9f#=090@I,6 '+'<'<'R'RSWY]YcYceieqeq'r ","8"8--c&:G$Y2 NN//12(,'<'<'N'NtUYU_U_aeamam'n )-(?(?(A #//JJ#-d.?.?AYAYAkAk#l $ 1 1 6 6 8) ..!3.+0D1H3N+N ('+'<'<'H'Htzz[_[g[g'h 55#%!!!0&*76 (,'<'<'K'KDRVR\R\^b^j^j'k ||559Z9Z13LLNsYv<<11T\\5V5V-/ Ke Lax ../0##,+-[] 59 100==dDJJPTP\P\]DL  ) )E5%9Mziv * ,, ?)+OOC$6$6$89NN_||00Eb H ??wtW5 D' " op  & &4::+K+K+W  ! ! # 7<<>1 #DJJ$:$:E B,,/DD   )jj**'    $ 5 5  *   44W= &&u-!55RY5Z 99 TZZ%E%E%QVZV_V_VpVptuVu0 B ww'' DJJ4T4TUKK"=j\Ig hiMM*DA B ,,99$ DLLY  !!#  # # /  $ $TZZ 04::11:wGGK  \]]R!&!&222s=0AaAaAa D!8Aa, H%Aa9 aAa) a,Aa6 a9Ab c|j0|-|jtjk(r |j}n|jtjk(r-ddl}|j jj}na|jtjk(r |j}n7|jtjk(rddl }|jj}|j|j|nd}tj j#|j$j&|}|S|j$j&}|S)Nrzrun-)rrTr1numberr3rQrU get_context get_trial_idr4idr7r+runr%rgr^rrr)rr)run_idrXr+run_namers rrbzTrainer._get_output_dir* s  ! ! -%2C%%)?)??''?+>+>> ..0==?''?+A+AA''?+@+@@.2ll.Ft||E*dSYRZOHggll499#7#7BGii**Grc  | |j}tjjt}tjjt }tjjt }tjjt}tjjt}tjjt}tjjt} tjjxrmtfdtjDxsDtjjtjjtd} tjjrtjD cgc]} tjjtjj| rtjjtjj| t sBtjjtjj| t r| c} ng} | r|j st#ddtd|||| ||fDs| s| st#dt$j'ddtjj|rLt)j*|} | j,}|)|t.k7r t$j1d |d t.d tjj|s"tjj|s| rt3rtjjtjjd rt5j6td d yt9|j:dr-|j:j<durt$j1dt?tAjB|dd}d |d<|jE|d}~y|j r@tG|jHjJjL|jH|fitOy|j:jPrAtjj|r"tRj@jU|d}n"t?tAjB|dd}|jE|d }~|jW|ytY|r+t9|ds t9|drt9|drtjj[rt9|dr5|j\}t_|dkDrt$j1d|d}n |j`}| rP| D]9}tjj|}|jc||||k(;|je|y|jc|dyt$j1dt dyt$j1d ytg|t3|j:jP!}t3s|jW|yycc} w)"Nc3K|]I}tjjtjj|r t|vKywr)rgr^isdirrFSDP_MODEL_NAME)r folder_namers rrz0Trainer._load_from_checkpoint..K s>77==.Dk!RS ;.sAAz.binzCheckpoint found at z* is only supported when using PyTorch FSDPc3ZK|]#}tjj|%ywr)rgr^r*)rfs rrz0Trainer._load_from_checkpoint..f s% q! s)+z!Can't find a valid checkpoint at zLoading model from rz9You are resuming training from a checkpoint trained with z- of Transformers but your current version is zJ. This is not recommended and could yield to errors or unwanted behaviors.user_content.ptFr^tagrload_optimizerr TzOEnabling FP16 and loading from smp < 1.10 checkpoint together is not supported.r map_location weights_only_smp_is_partialrnractive_adapteractive_adapters load_adapterr zFMultiple active adapters detected will only consider the first adapterr)rBjThe intermediate checkpoints of PEFT may not be saved correctly, consider using a custom callback to save i in corresponding saving folders. Check some examples here: https://github.com/huggingface/peft/issues/96GCould not load adapter model, make sure to have `peft>=0.3.0` installed)rn prefer_safe)4rrgr^rrmrlrkrrrqrprorrWlistdirr*rrNrrrrfrom_json_filetransformers_versionrr@rrurr?rr rwrloadload_state_dictrrSrvrrsave_safetensors safetensors load_file_issue_warnings_after_loadrexistsrr:rr set_adapterr+)rrr config_fileadapter_weights_fileadapter_safe_weights_file weights_fileweights_index_filesafe_weights_filesafe_weights_index_file is_fsdp_ckptradapter_subdirsr}checkpoint_versionrd load_resultrr subdir_namepeft_ids ` rrzTrainer._load_from_checkpoint> s =JJEggll#9;G !ww||,BDXY$&GGLL1GIb$c!ww||$:LI WW\\*@BTUGGLL)?ARS"$'',,/EG^"_ww}}%;<  #%::.D#E  ^ ww~~bggll+AoEVVZC[\] *ww}}34$&::.D#E 77==.Dk!RSGGNN277<<0F Ui#jkww~~bggll3I;Xq&rs     4 434J3KKuvw w  !%&+(-   @AW@XYZ Z )*@)ACD 77>>+ &%44[AF!'!>, '277>>:K+LP\&(77>>"'',,/EGX"YZ..3uej tyy&1diinn6Lm-.!&Lu[_!`J49J01"'"7"7 4"7"PK"%%$$**66$$*  ,- 99--"''..AR2S!,!2!2!>"89u&78*/*?*?/!3"NN+st)8);).)=)=&+:sK&(ggll3I;&WG!..w S^bpSp.rs)).9**+A>`d*eNNDDXCYZbb hi2-6M6O]a]f]f]w]wK+,// <-A sC [8c tjd|jjd|jjdt j j|jjt}t j j|jjt}t j j|jjt}t j j|jjt}tr |jn |j}|jrAt!|j|jjt#|j y|j$rTt'|j(jj*|j(||jjfit-}yt j j/|s^t j j/|s?t j j/|s t j j/|rd}trt j j1t j j|jjdr2t3j4|jjtddy|j6j8rAt j j1|r"t:j<j?|d }n"tAt=jB|d d }d|d <|jE|d }yt#|r&tG|ds tG|drtG|drtG|dr=|jHd} tK|jHdkDr"tjMdn |jN} t j j/|st j j/|r7 |jQ|jj| ddl.m/} | gg}ntjMdtdd}ntjMdd}n|j6j8rAt j j1|r"t:j<j?|d }n"tAt=jB|d d }|jE|d}ts|r|jayyyt j j/t j j|jjtbsUt j j/t j j|jjtdrGtg||jjt }ts|ja|yytjMd|dy#tR$rR} |jT| jVr3d|jT| jXjZd} tS| | d} ~ wwxYw)NzLoading best model from z (score: rmrTrFrrrrrrrrrrr zCDetected multiple active adapters, will only consider the first onez0When using prompt learning PEFT methods such as z, setting load_best_model_at_end=True can lead to errors, it is recommended to set this to False and to load the model manually from the checkpoint directory using PeftModel.from_pretrained(base_model, ) after training has finished.)_IncompatibleKeysrrrz#Could not locate the best model at zi, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.)4rrrvr^ best_metricrgr^rrrrprlrkrrRrrGr&rrNrrSrrrr*rurrrrrrrwrrr?rr:r@rrr0 peft_configis_prompt_learning peft_typertorch.nn.modules.modulerrrorqr+) rbest_model_pathbest_safe_model_pathbest_adapter_model_pathbest_safe_adapter_model_pathrrhas_been_loadedrdrexcmsgrs rr_zTrainer._load_best_model s .tzz/O/O.PPYZ^ZdZdZpZpYqqstu'',,tzz'G'GV!ww||DJJ,L,LN_`"$'',,tzz/O/OQe"f')ww||DJJ4T4TVo'p$&=&?""TZZ  $ $ %"" 00'5djj'A#A   ! !)  &&22   00  () K GGNN? +ww~~23ww~~56ww~~:;"O&(77>>"'',,tzz/O/OQb"cd..!ZZ==( %', yy11bggnnEY6Z%0%6%6%@%@AU^c%@%d 02%*ZZebf%g 49J01"'"7"7 4"7"PK!%( '78GEK\<]cj~d#5*;<-2-B-B1-EN"5#8#89A= &/t u-2-A-AN77>>*ABbggnnUqFr* % 2 24::3S3SUc d R*;B*CK"NN!LL`Kabj!j /4O'pq*/yy11bggnnEY6Z%0%6%6%@%@AU^c%@%d 02%*ZZebf%g #("7"7 E"JK.0_33K@6E0 WW^^BGGLL)I)IKbc dhjhohohvhv GGLL99;M Ni 2tzz77@W@YK+,// <- NN5o5FGPP g$0 *#(#4#4^#D#W#W)[+0+<+<^+L+V+V+\+\*]^8)8%(+7s*;$D$) *s&Y;; [A [[ct|jdk7r|jjPt |jt |jjk(r|jj n#t jd|jdt|jdk7r$t jd|jdyy)Nrz8There were missing keys in the checkpoint model loaded: rz;There were unexpected keys in the checkpoint model loaded: ) r: missing_keysr_keys_to_ignore_on_saver8rrr@unexpected_keys)rrs rrz"Trainer._issue_warnings_after_loadT s {'' (A -zz11=#kF^F^B_cf 22dC &&(!YZeZrZrYsstuv {** +q 0 NNMkNiNiMjjkl  1rc |j|}|j||jj|t |j t jj jrO|sM|jj}|jdsd|} |j j|||S|S#t$r2}td|dt|jd|d|d}~wwxYw)N ignore_keyseval_9The `metric_for_best_model` training argument is set to 'W', which is not found in the evaluation metrics. The available evaluation metrics are: zX. Please ensure that the `compute_metrics` function returns a dictionary that includes 'zM' or consider changing the `metric_for_best_model` via the TrainingArguments.)evaluaterZrvr_rrZrr?rRrr" startswithrpr$rr)rr)rrrCmetric_to_checkrs rr3zTrainer._evaluatea s--,@-A !!%)?)?I d'')A)A)S)S T]k"ii==O"--g6$)/):"; !!&&w'?@w OP_O`a==A',,.=Q -C99C>c ||jjrG|jj|jkDr#t rt ji} |j|jj} ||z}t| |jj|jz z d| d<|/t|tjr|jn|| d<||| d<n|j| d<|xj | z c_|jj|_|j#|j%| |d} |jj&r]|j)||} |j+| |} |j,j.t0j2k(r| |j_|jj4rS|j7|||j8j;|j,|j|j|_yy)Nrrrwr)rCr))r should_logrvr_r0rrr_nested_gathermeanrrrrrrr/r`rashould_evaluater3_determine_best_metricrrrXr rf_save_checkpointr^on_save) rrvrwrr)r)rrrlogstr_loss_scalarrCis_new_best_metrics rrTz Trainer._maybe_log_save_evaluatev s << " "tzz'='=@\@\'\%' %'D"009>>@EEGN w G 4::3I3IDLhLh3h!iklmDL$8B9ell8[INN$4aj[!((5_%(,(?(?(A_%  # #~ 5 #+/::+A+AD ( OO  HHT: & << ' 'nnU,@AG!%!>(+ :=/Jccww||J@H77>>(+ B ^ 8#(::h#7  8,X67 09: ""#7#>? ! #   1%8 900L4L4LL :: " " $ $VUZZ9M~ ^ ! # $UEII7K^ \ ! # $UEII7K^ \ ! # $UEII7K^ \ " $ $VUZZ9M~ ^ %# 8 8s /JJc 6d}|jjD|jj}|jdsd|} ||}|jj rtjntj}|jj;|jj r tdn td|j_ |||jjrn||j_ |jjtjtj fvr%|jj"|j_d }|S#t$r/}td|dt |j d|d}~wwxYw) z Determine if the model should be saved based on the evaluation metrics. Returns: bool: True if a new best metric was found, else False FNrrrzJ. Consider changing the `metric_for_best_model` via the TrainingArguments.z-infinfT)rr"rr$rrgreater_is_betterrgreaterlessrvrrrrXSTEPSEPOCHr_best_global_step)rrCr)rr metric_valueroperators rrzTrainer._determine_best_metric s[# 99 * * 6"ii==O"--g6$)/):"; &7 &*YY%@%@rzzbggHzz%%-:>)):U:Uv[`af[g & djj&<&<=)5 &99**|/A/AM>M>c tjtjjtjj d}tj jr~|jjtjk(r,tj jj|d<n+tj jj |d<trtj |d<tr~|jjtjk(r,tj jj|d<n+tj jj |d<t#r~|jjtjk(r,tj$jj|d<n+tj$jj |d<t'r~|jjtjk(r,tj(jj|d<n+tj(jj |d<t+rj|jjtjk(r"tj,j|d<n!tj,j |d<t/j0|d |jj2d kr5t j4|t.j6j9|d yt j4|t.j6j9|d |jj:d y)N)rrrrrrrrrTr r rrr)rgetstater get_stater get_rng_staterrrrHrhrIget_rng_state_allrrrrrrrrrrrgrhrrcr^rr)rr rng_statess rrzTrainer._save_rng_state, swoo'YY((*<<--/ :: " " $yy&&,*B*BB%*ZZ%6%6%H%H%J 6"%*ZZ%6%6%D%D%F 6" ! # " 0 0 2Ju  ! #yy&&,*B*BB$)II$4$4$F$F$H 5!$)II$4$4$B$B$D 5! ! #yy&&,*B*BB$)II$4$4$F$F$H 5!$)II$4$4$B$B$D 5! ! #yy&&,*B*BB$)II$4$4$F$F$H 5!$)II$4$4$B$B$D 5! " $yy&&,*B*BB%*ZZ%A%A%C 6"%*ZZ%=%=%? 6" J. 99  1 $ JJz277<< O#L M JJz277<< jI`I`Haae  ! !   &&22D4D4DdjjR\ `u`w    &&22D4D4DdnnVZV`V`bl YY " " JJt~~002BGGLL^4\ ])-(A(A) *   8K G % II ! !..2O*,((5 e 4,,779277<< Tb;cd e  0 -3P "Q 5 5X e es4AS6AT6S?T c  y|jrt|jtst j d5}t |jjtjtjjtddddtytr:t!j tjjt"dzntjj%tjjt"xstjj%tjjt&xsHtjj)xr't+fdtj,D}|j.rQt!j tjjd|j0j2dt"n|}|rtjj%tjjtrt5r|j.r{t tjtjjd |j0j6d |j0j2dt"d d }|d }nDt tjtjjt"d d }t j d5}t tjtjjtd d }dddtt9j:||j0j<t9j:|j0j<|j>j||jj|ytrdtjj%tjjdrfd}nfd}|j@jC|n|j0j2dkDr|j0j<nd }|jDrUtG|jHjJjL|jH|j>|jNfitQn]t |j>jtjtjjt"|d t j d5}t |jjtjtjjtddddtyyy#1swYxYw#1swYxYw#1swY1xYw)z3If optimizer and scheduler states exist, load them.NTr%r_*c3K|][}tjjtjj|rtj dd|v]yw)rrN)rgr^rrOPTIMIZER_NAME_BINr)rrrHs rrz8Trainer._load_optimizer_and_scheduler.. sM'77==j+)NO+005a8KGsA!A$z rank*-of-rkrr"rrrYrc|jtjtjj t dy)NTr)rrurrgr^rremodoptrHs r opt_load_hookz.opt_load_hook s-//jR`9ako0pqrc6trJ|jtjtj j tddy|jtjtj j tdy)NT)r back_compatr)rtrrurrgr^rrer;s rr>z.opt_load_hook sa8 # 3 3$'HHRWW\\*n-U_cqu$v!"!$ 3 3CHHRWW\\*Vd=eos4t urr ))rGrrZrr1r+rwrrrrgr^rrfrMrglobrer*r9rrWrrrrrrrsend_cpu_data_to_devicer;rYrRregister_post_step_hookrNrrSrvrrr)rrHr1checkpoint_file_existsoptimizer_statelr_scheduler_stater>rs ` rr'z%Trainer._load_optimizer_and_scheduler sv     $ $d//1JK,,D9_,.%%55 277<< N#KZ^_ $O4 '( IIbggll:~>E Frww||JGH 77>>"'',,z;M"NO GGMM*-+-::j+A $** IIbggll:499;O;O:PPQR`Qa/bc d'  "bggnnRWW\\*n5]&^%'..,.&+jj &$tyy/F/F.GtDIIL`L`Kaabcqbr(s&+%) 'O'6k&BO,.&+jj Z@ucg'O,,D9_,.). Z@ucg*& $O4**?DII>}M 8> ,, NN JJ&  45 1266!JJ " Z HWcrv ,,D9_,.%%55 277<< N#KZ^_ $O4Y'_ !;bfs'AXAX**AX7X'*X47Yc |jj}|ytrt j dt jd5}t j|jjjtjj|tt|ddd|jj rtst jd5}t#j|jjjtjj|tdddtyyy#t$rYywxYw#1swYxYw#1swY2xYw)Nsaving_scaler_stateTr%)rSscalerrTrrr r1r+rcrdrgr^r SCALER_NAMErMrrfr)rrrIr1s rrzTrainer._save_scalers+ %%,,F >  ! # MM/ 0((5 5((//::cd#O4 5 99 )?)A((5 h 4++22==?jZeAfg h  0*B      5 5  h hs+E%A&E44AF% E10E14E=F c t|ytjjtjj|t}|rYt rt jd5}ttjtjj|tdd}dddttj|jj|j j"j%|yt jd5}t|j j"j%tjtjj|tddddtyy#1swYxYw#1swY"xYw)z If scaler state exists, load it.NTr%rrr6)rgr^r*rrJrr1r+rwrrrMrrBrr;rSrIr)rrHrDr1 scaler_states rr(zTrainer._load_scaler&s8   !# Z0U!V !&',,D9_,.#(:: Z=E`d$L $O4**<9I9IJ  ''77 E,,D9_,.$$++;; 277<< K#HW[\ $O4% "s(AF"&A'F."F+.F7c |jjsyg}g}|jj|jgz}|j j jD]9\ }t|ts|g}t fd|Dr|Dcgc]}|jj k(s| }}t||D]\}}|jdi}|jdi} t|d i|} | jD]\} } t!| | | t|t"r| |_n|j%| |jj't| t(j+d)|j% <t-|dkDr(t(j/ddj1|d |D]}|jj3|ycc}w) zLIf callback states exist and were passed in, restore their states if enabledNc3PK|]}|jjk(ywr)r4r5)rrstored_callbacks rrz/Trainer._load_callback_state..Ls"eh8%%../Aes#&r attributeszPContinuing training from checkpoint, restoring any callbacks that were passed inrzPCheckpoint included callbacks not included in current configuration. Ignoring. (rrr)r'restore_callback_states_from_checkpointr^rrrvrr6rrrWr4r5ziprrr8r:rzrrrr:r@rr_)r not_found new_callbacksoriginal_callbacksrnr duplicates callback_datarrP new_callback attributerrOs @rr+zTrainer._load_callback_stateAsyy@@   !22<< ~M%)ZZ%B%B%H%H%J 2 !OTdD)veRdee.@!)8CUCUC^C^bqCqH 03:t/D N+Hm(,,VR8D!.!2!2<!DJ#14>#9D#9L,6,<,<,>@( 5 i?@!(N;'3 %,,\:))99$|:LM N no  1/ 20 y>A  NNbcgclclmvcwbxxyz & 9H  ! ! . .x 8 9/s G;6G;r2rJn_trials directionbackendrr%c 8| t}t|}t|}|j||_|j t d| |jn||_||_ |tn||_ |j|||fi|} d|_| S)az Launch an hyperparameter search using `optuna` or `Ray Tune` or `SigOpt`. The optimized quantity is determined by `compute_objective`, which defaults to a function returning the evaluation loss when no metric is provided, the sum of all metrics otherwise. To use this method, you need to have provided a `model_init` when initializing your [`Trainer`]: we need to reinitialize the model at each new run. This is incompatible with the `optimizers` argument, so you need to subclass [`Trainer`] and override the method [`~Trainer.create_optimizer_and_scheduler`] for custom optimizer/scheduler. Args: hp_space (`Callable[["optuna.Trial"], dict[str, float]]`, *optional*): A function that defines the hyperparameter search space. Will default to [`~trainer_utils.default_hp_space_optuna`] or [`~trainer_utils.default_hp_space_ray`] or [`~trainer_utils.default_hp_space_sigopt`] depending on your backend. compute_objective (`Callable[[dict[str, float]], float]`, *optional*): A function computing the objective to minimize or maximize from the metrics returned by the `evaluate` method. Will default to [`~trainer_utils.default_compute_objective`]. n_trials (`int`, *optional*, defaults to 100): The number of trial runs to test. direction (`str` or `list[str]`, *optional*, defaults to `"minimize"`): If it's single objective optimization, direction is `str`, can be `"minimize"` or `"maximize"`, you should pick `"minimize"` when optimizing the validation loss, `"maximize"` when optimizing one or several metrics. If it's multi objectives optimization, direction is `list[str]`, can be List of `"minimize"` and `"maximize"`, you should pick `"minimize"` when optimizing the validation loss, `"maximize"` when optimizing one or several metrics. backend (`str` or [`~training_utils.HPSearchBackend`], *optional*): The backend to use for hyperparameter search. Will default to optuna or Ray Tune or SigOpt, depending on which one is installed. If all are installed, will default to optuna. hp_name (`Callable[["optuna.Trial"], str]]`, *optional*): A function that defines the trial/run name. Will default to None. kwargs (`dict[str, Any]`, *optional*): Additional keyword arguments for each backend: - `optuna`: parameters from [optuna.study.create_study](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.create_study.html) and also the parameters `timeout`, `n_jobs` and `gc_after_trial` from [optuna.study.Study.optimize](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.Study.html#optuna.study.Study.optimize) - `ray`: parameters from [tune.run](https://docs.ray.io/en/latest/tune/api_docs/execution.html#tune-run). If `resources_per_trial` is not set in the `kwargs`, it defaults to 1 CPU core and 1 GPU (if available). If `progress_reporter` is not set in the `kwargs`, [ray.tune.CLIReporter](https://docs.ray.io/en/latest/tune/api/doc/ray.tune.CLIReporter.html) is used. - `sigopt`: the parameter `proxies` from [sigopt.Connection.set_proxies](https://docs.sigopt.com/support/faq#how-do-i-use-sigopt-with-a-proxy). Returns: [`trainer_utils.BestRun` or `list[trainer_utils.BestRun]`]: All the information about the best run or best runs for multi-objective optimization. Experiment summary can be found in `run_summary` attribute for Ray backend. NzXTo use hyperparameter search, you need to pass your model through a model_init function.) r#rTr"ensure_availablerrr0default_hp_spacer2r%r\rJr) rr2rJrZr[r\r%r backend_objbest_runs rhyperparameter_searchzTrainer.hyperparameter_searchhs@ ?/1G!'*8AC $$&!( ?? "j 9A8H 44h  >O>W!:]n";??49GG!%rrrc0|jj|jj|d<|jjdk7rL|jj|d<|1|j t d||jji|d|jji}|jjj||jj|j|j|j||_ y)a8 Log `logs` on the various objects watching training. Subclass and override this method to inject custom behavior. Args: logs (`dict[str, float]`): The values to log. start_time (`Optional[float]`): The start of training. Nr)rr>rU)r(rp) rvr)rr<r>r{rfr_ log_historyrzr^on_logr)rrrrs rraz Trainer.logs ::   ' JJ,,DM 99 2 2d :,0JJ,L,LD( )% M':$**JjJjkl=D=VTZZ%;%;<= %%f-,,33DIItzz4<.sCd11!4Cr;rr)rrrr6rhrrrrrr;rGis_floating_point is_complexr{rSrvr>r/rr)rrnrr@rs` rrhzTrainer._prepare_inputs dG $4:TZZ\RTQq$"5"5a"88RS S udm ,4:CdCC C ell + 0 01F((e.E.Ed.KuO_O_`dOe w(8(8(>(>(O(O(\(\(b(b(def477$V$ $ SsE rc|j|}t|dk(r(tddj|jd|j j dk\r|j|j|d<|S)z Prepare `inputs` before feeding them to the model, converting them to tensors if they are not already and handling potential state. rzThe batch received was empty, your model won't be able to train on it. Double-check that your training dataset contains keys expected by the model: r`rmems)rhr:rrrprr4rrrs rrqzTrainer._prepare_inputss $$V, v;! IILRVRiRiIjHkkln  99  1 $)?!ZZF6N rc|y|jdk(ry|jdvr|jd}|dkry|jdk(r|d}n|d}tj|d }tj|d kxstj|d k}t |tj r|jS|Sy ) a Check if an attention mask is causal (compatible with causal attention). Context parallelism only supports causal attention patterns. This function checks if the provided attention mask is compatible. Args: attention_mask (torch.Tensor): The attention mask to check Returns: bool: True if the mask is causal or compatible with causal attention Tr)rr r r)rrr)diagonalgư>gF)dimshapertriuallrrr)rr seq_lenmaskupper_triangular is_causals r_is_attention_mask_causalz!Trainer._is_attention_mask_causals  !    1 $    !V +%**2.G!|!!#q(%d+%a( %zz$;  "2d":;auyyIY\`I`?aI'1)U\\'J9>># YPY Yrct|jdd|jjjrxt |dr<|j j dk7r#td|j j dd|vrptjdtj|d jd |d j j|d jd d |d<d|vr\tjdd|vrCd}tj j#|dd|}|ddd dfj%|d<g}g}d |vr%|j'|d |j'd d|vr%|j'|d|j'd d|vr%|j'|d|j'd d|vr}t|dds(|d}|j)|s tdd|_|j*r<|d}|j-dk(r#|j'||j'd n d|vr*|d%|j'|d|j'd t/|jj0||t3||fSt4j6|fS)as Prepare inputs for context parallelism by setting up buffers and validation. Args: model: The model being trained inputs: Input tensors to prepare Returns: tuple: (context_manager, prepared_inputs) where context_manager is either the context parallelism wrapper or a no-op context parallelism_configNr}sdpazIContext parallelism is supported only with SDPA attention, you are using r position_idsz9Position IDs not found in the inputs, generating manuallyr#r rrr  shift_labelsz7Shift labels not found in the inputs, shifting manuallylabels)rr )rr _attn_mask_causal_checkedFzContext parallelism only supports causal attention masks. The provided attention_mask is not causal. Please ensure your data uses causal masking (lower triangular) or remove the attention_mask to use the model's default causal masking.Tr)buffersbuffer_seq_dimsno_restore_buffers)r6rSr| cp_enabledr?r}_attn_implementationrrrrarangesizer;expandr functionalpad contiguousrzrzrrrrmaybe_context_parallelr8rr)rrr _ignore_indexrrrr s r _prepare_context_parallel_inputsz(Trainer._prepare_context_parallel_inputs s D$$&:D A M  33>>uh'<<44>$cdidpdpeFeFdGGHIV+##$_`).;',,Q/{8K8R8R*& ,11!4b9~&V+##$]^v%$(M]]..vh/?}.]F-3AqrE]-E-E-GF>*G Of$vk23&&q)6!vh/0&&q)'vn56&&q)6)t%@%H&,,<%=N99.I(f 6:D211%+,<%=N%))+q0~6'..q1'F>,B,Nvn56&&q)  77 /#&w<    %%v--rctj}|j}t|tjs|j ||S)zF A helper wrapper to group together context managers. )r ExitStackautocast_smart_context_managerrr enter_context)r ctx_stack autocast_ctxs rcompute_loss_context_managerz$Trainer.compute_loss_context_managerusD((* ::< , (>(>?  # #L 1rrkc|jr$tjd||j}|St j }|S)z A helper wrapper that creates an appropriate context manager for `autocast` while feeding it the desired arguments, depending on the situation. r) device_typerkr)rrrrsrzrr)rrk ctx_managers rrz&Trainer.autocast_smart_context_managersD   ..U-_c_m_mnK%002Krrc|j||\}}|5|jt|jdr9t |jjr|jj|j |}t rkt|||jj}|jjj|jjcdddS|j5|j|||}ddd~|jj <|j"j$|jj zdk(r t'rt(j*j-nt/rt(j0j-nt3rt(j4j-nt7rt(j8j-ngt;rt(j<j-n>t?rt@jCdnt(jDj-i}|jjFtHjJtHjLfvr|jO|d<|jjPdkDrjS}|jTrc>cd**,335889I9IJ@ !@ !224 _((K](^ _ 11=JJ**TYY-N-NNRSS)+II))++-II))+,.JJ**,+-II))++-II))++-NNqJJ**,Fyy>#6#68N8N"OO*.*A*A*C'yy"yy{}}$^^D$..9+[((*++ 66:L:T,,4$"J"JJD##448Q8QQ.3F?+)  ))$9&9;;=A@ !@ ! _ _J++_@ !@ !sDCO$O$O (G>O$&O7B O$ O O$O! O$$O-return_outputsc |j |jd|vr|jd}nd}|jri}|||d<i||}|di|}|jj dk\r||jj |_|j-|tjd|j|||}n||jj|} t| r$| jjjn| j} | tj vr|j||d}n|j||}nzt#|t$rPd |vrLt'd d j)|j+d d j)|j+d t#|t$r|d n|d}|jj,rb|js |jrJ|H||jj.dkr|jj0n|jj.z}|r||fS|S)a How the loss is computed by Trainer. By default, all models return the loss in the first element. Args: model (`nn.Module`): The model to compute the loss for. inputs (`dict[str, Union[torch.Tensor, Any]]`): The input data for the model. return_outputs (`bool`, *optional*, defaults to `False`): Whether to return the model outputs along with the loss. num_items_in_batch (Optional[torch.Tensor], *optional*): The number of items in the batch. If num_items_in_batch is not passed, Returns: The loss of the model along with its output if return_outputs was set to True Subclass and override for custom behavior. If you are not using `num_items_in_batch` when computing your loss, make sure to overwrite `self.model_accepts_loss_kwargs` to `False`. Otherwise, the loss calculating might be slightly inaccurate when performing gradient accumulation. Nrrrz|Trainer: `compute_loss_func` is defined but `labels=None`. Your custom loss function will still be called with labels=None. rT)rrzJThe model did not return a loss from the inputs, only the following keys: r`z,. For reference, the inputs it received are rr r)r|rr>rUrr4rrr@rSr,rrr _get_namer-r9rrrrraverage_tokens_across_devicesr num_processes) rrrrrrroutputsrrrs rrzTrainer.compute_losssQ4    +t/E/E/QW_ciWiZZ)FF  ) )F!-/A+,))&)F/&/ 99  1 $ !5!56DJ  ! ! -~X))#5*D  "..;;EBO"/2 **00::<$..0  >EEGG**7F*N**7F;'4(V7-B `xx /00\]`]e]eflfqfqfs]t\uuvx '1$&?76?WQZD II 3 3//43I3I". diioo6JD$$22PTPYPYP_P_ _D"0g:d:rc4|jjdk(S)z Whether or not this process is the local (e.g., on one machine if training in a distributed fashion on several machines) main process. r)rlocal_process_indexrs rrzTrainer.is_local_process_zero>s yy,,11rcvtrtjdk(S|jjdk(S)z Whether or not this process is the global main process (when training in a distributed fashion on several machines, this is only going to be `True` for one process). r)rrurrrrs rrzTrainer.is_world_process_zeroEs/ # $88:? "99**a/ /rrr]ch||jj}tr|j|n)t rt j |d|jj}|jjr|j||trtt jj|djnt!|j"dd*|j"j$rW|j|nDt!|j&ddx}|d kDr|j|n|j(rd t+|j"j,j.j0vr|j"j3|j&}|jjr|j||n|j4rP |j"j3|j6}|jjr|j||n'|jjr|j||jjFr+|s(|jGd |jjH yyy#t8$rt:j=d |jjr|j|it?|jj|t@tBg|jjE|YwxYw)z Will save the model, so you can reload it using `from_pretrained()`. Will only save from the main process. NTr )rdrr|rrr FULL_STATE_DICTz| stage3_gather_16bit_weights_on_model_save=false. Saving the full checkpoint instead, use zero_to_fp32.py to recover weightsz Model save)commit_messagerevision)%rrr _save_tpurrgrhrRrdrf_savertrr^rtouchr6rSshould_save_modelrrNrrvrstate_dict_typeget_state_dictrGr&rrr@rNrrrpr/rd hub_revision)rrr]rdtp_sizes rr`zTrainer.save_modelQs]  --J ! # NN: & $ & KK T 2++668Jyy$$ :* =(RWW\\*.?@AGGI T%%';T B N11 :& Z;;g HWWX[ JJz "  ! ! C(8(8(>(>(J(J(Z(Z$[[!--< ?s)AJ!!B L10L1c ||n|jj}tjd||j}t j t jdrYtj|dtj|jtjj|ttf}t j d|j"r|j%|j'd}tjj|d|jj(d |jj*d t,}t j||d t j d |jj.rd dlm}|tjj|ddt,d\}}|j4j4}|j6j9|} t;| |r9| j=||t j|jj>ntjdt j|tjj|t,nt;||st;|j6j9||r|j6j9|j=||jj.t j@|j%t j|jj>ntjdt j@|j%} t j| tjj|t,nm|j=||jj.t j|jj>t j@|j%|jB3|jj.r|jBj=|yyy)NSaving model checkpoint to F)localTr saving_checkpoint)rr!rr"rkr#save_full_checkpointsr)%consolidate_sharded_model_checkpointsrz rank*-of-*-) ckpt_prefix ckpt_suffixr`)rd save_functionsafe_serializationETrainer.model is not a `PreTrainedModel`, only saving its state dict.)is_main_processrdrr)rrrrd)"rrrrrrris_master_ordinalrgrhrrcr^rTRAINING_ARGS_NAMErur rrdr*rrrrrfrrrRrSr,rsave_pretrainedr_maybe_convert_to_cpur) rrrsupported_classesckpt ckpt_pathrfull_state_dictrjrrds rrzTrainer._save_tpus:#-#9Ztyy?S?S  1*>?    e , KK T 2 JJtyy"'',,z;M"N O,- )*  & &))+"'":":"<D d499#:#:";4 @T@T?UUVWcVdeI GGD) 7 MM1 2yy$$\%J " Z <"-l^ <$&"  ++"&"2"2"?"?"Fo/@A#33"#2&(gg+/99+E+E 4KK ghGGORWW\\*l-STE#45$**77>@QR  --e4DD$(II$9$9!778H8H8JK"$'''+yy'A'A E cd55e6F6F6HI  BGGLL\$JK  ! ! $ 5 5 gg#'99#=#=33E4D4D4FG "   ,1F1F  ! ! 1 1* =2G ,rc||n|jj}tj|dtj d|t stfn ttf}t|j|sF||jj}t|jj|jd|rN|jj|jdj|||jjntj d|jjrGt j"j%|tj&j)|t*dd i nkt#j,|tj&j)|t.n2|jj|||jj|j0|j0j|nr|j2ft5|j2d rP|j2j6:tj d |j2j6j|t#j,|jtj&j)|t8y) NTr rFr)rdrrrpt)rrzWSaving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`)rrrgrhrrrr*rrrrdrSr,rrrr save_filer^rrprcrrrrr?rr)rrrdrs rrz Trainer._saves#-#9Ztyy?S?S  J. 1*>?6G6I_.P_ajOk$**&78!!ZZ224 $**77 W\7]_pq  --djjU-Scc:$))JdJdd cd99--%%//"BGGLL=N$O[ceiZj0JJz277<< L+QR JJ & &zdiiF`F` '   ,  ! ! 1 1* =    ***K8"",,8 KKq r    ( ( 8 8 D  499bggll:7IJKrc|jjtjk(rm|jxj t |jg|jjjjz c_d|_y|jxj |jz c_d|_y)Nrr) rrHrhrIrvrrCrr;rDrrs rr`zTrainer.store_floss 99 " "l&>&> > JJ ! !-t/@/@.A$))JZJZ[__affh  !!"D  JJ ! !T%6%6 6 ! !D rcg}t|j|dDcgc]-}tjj |s#t |/}}|D]}|r1|j tjj||f6tjd|d|}|S|jd|j t|jd|ft|} |rMt|dkDr?| dd| ddz } | dkr)tjd|j!d || S| D cgc]} | d } } |j"j$t t|j"j$| vrk| j't t|j"j$} t)| t| d z D]} | | dz| | c| | <| | dz<| Scc}wcc} w) N-*z.*z -([0-9]+)rr r rzPmtime may not be reliable on this filesystem, falling back to numerical orderingF)rrcheckpoint_prefixr)rrArgr^rrrzgetmtimerematchgroupsrsortedr:r1r2rcrvr^indexr)rrrrordering_and_checkpoint_pathxglob_checkpointsr^ regex_matchr mtime_diffrHbest_model_indexrs rrczTrainer._sorted_checkpointss&(*$,0,<,A,AEVDWWYBZ,[pq_a_f_f_l_lmn_oCFpp$ ^D,33RWW5E5Ed5KT4RS hh,=+>i'H$O *{/A/A/C/O077[=O=O=QRS=T9UW[8\]  ^$$@A 9:Q>+B/25G5J15MMJC pq//# N_0?QQ jmQQ JJ , , 8D99:;?QQ177DAaAa|jj|jjdkry|j||}t||jjkry|jj}|jj 7|jjdk(r|d|jj k7rd}t dt||z }|d|}|D]2}tjd|dtj|d 4y) Nrrr r rrrTr) rrdrcr:rvr^rrrrfrg)rrrrrdnumber_of_checkpoints_to_deletecheckpoints_to_be_deletedrHs rrzTrainer._rotate_checkpoints,s 99 % % -1K1Kq1P "55 V`5a ! "dii&@&@ @  9955 JJ , , 8 **a/"2&$***J*JJ *-a5G1HK[1[*\'$67W8W$X!3 :J KK5j\A_` a MM*D 9 :rrmetric_key_prefixc z|du}|r|n |j}t|trJi}|jD]3\}}|j |r|n|||d|}|j |5|S|j j|j|} |jr t| } tj} |jjr |jn |j} | | d|j dnd||} |jj"|jj$z} |d| j&vr| | j&|dz } |d| j&vr| | j&|dz } | j&j t)|| | j*t-j.| j*| z  |j1| j&t2j4|jj6vr't9j:t=j>|j@jC|j|jD|jF| j&|_#|j jI| j&| j&S) a Run evaluation and returns metrics. The calling script will be responsible for providing a method to compute metrics, as they are task-dependent (pass it to the init `compute_metrics` argument). You can also subclass and override this method to inject custom behavior. Args: eval_dataset (Union[`Dataset`, dict[str, `Dataset`]), *optional*): Pass a dataset if you wish to override `self.eval_dataset`. If it is a [`~datasets.Dataset`], columns not accepted by the `model.forward()` method are automatically removed. If it is a dictionary, it will evaluate on each dataset, prepending the dictionary key to the metric name. Datasets must implement the `__len__` method. If you pass a dictionary with names of datasets as keys and datasets as values, evaluate will run separate evaluations on each dataset. This can be useful to monitor how training affects other datasets or simply to get a more fine-grained evaluation. When used with `load_best_model_at_end`, make sure `metric_for_best_model` references exactly one of the datasets. If you, for example, pass in `{"data1": data1, "data2": data2}` for two datasets `data1` and `data2`, you could specify `metric_for_best_model="eval_data1_loss"` for using the loss on `data1` and `metric_for_best_model="eval_data2_loss"` for the loss on `data2`. ignore_keys (`list[str]`, *optional*): A list of keys in the output of your model (if it is a dictionary) that should be ignored when gathering predictions. metric_key_prefix (`str`, *optional*, defaults to `"eval"`): An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named "eval_bleu" if the prefix is "eval" (default) Returns: A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The dictionary also contains the epoch number which comes from the training state. Nrj)rrrrT)rprediction_loss_onlyrr_jit_compilation_time_model_preparation_timerr)%rrrr6rr{r*r+rrr(rrrprediction_loopevaluation_looprrrrCrfrmathceilrarrYrHrrZr[r\r^ on_evaluatervrr)rrrroverriderCeval_dataset_name _eval_datasetdataset_metricseval_dataloaderr eval_looprtotal_batch_sizes rrzTrainer.evaluateEsrZ t+'/|T5F5F lD )G4@4F4F4H 00!="&--2:@Q +):(;1=N>S)TU UJ 7 8FNN J &..,=+>>U)VW WJ !"..))F$6$69I$IJ      ( (DIIOO ; OOC..0 1,,88DJJPTP\P\^d^l^lm  44V^^D~~rc |jj|j|}tj}|jj r |j n |j}||d||}|jj|jjz}|d|jvr||j|dz }|d|jvr||j|dz }|jjt|||jtj|j|z |j j#|j|j$|j&|j|_|jj)|jt+|j,|j.|jS)a Run prediction and returns predictions and potential metrics. Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method will also return metrics, like in `evaluate()`. Args: test_dataset (`Dataset`): Dataset to run the predictions on. If it is an `datasets.Dataset`, columns not accepted by the `model.forward()` method are automatically removed. Has to implement the method `__len__` ignore_keys (`list[str]`, *optional*): A list of keys in the output of your model (if it is a dictionary) that should be ignored when gathering predictions. metric_key_prefix (`str`, *optional*, defaults to `"test"`): An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named "test_bleu" if the prefix is "test" (default) If your predictions or labels have different sequence length (for instance because you're doing dynamic padding in a token classification task) the predictions will be padded (on the right) to allow for concatenation into one array. The padding index is -100. Returns: *NamedTuple* A namedtuple with the following keys: - predictions (`np.ndarray`): The predictions on `test_dataset`. - label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some). - metrics (`dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained labels). Prediction)rrrrrr) predictionsrrC)r*r+r"rrrrrrrrCr{rfrrrr^ on_predictrvrrrVrr) rrrrtest_dataloaderrrrrs rpredictzTrainer.predictsH ""$22<@YY[ ,0II,P,PD((VZVjVj  ;bs  9944tyy7K7KK 5 6&.. H &..,=+>>S)TU UJ 7 8FNN J &..,=+>>U)VW WJ !"..))F$6$69I$IJ   ,,77 4::t||]c]k]kl  44V^^DF,>,>&JZJZdjdrdrssrrc |j}||n |j}|jr|jt |dd\}}|j |j d|}t|jjdk(r ||j urtj} |js;|jrJ|jjdk7r1|jjs|jj|n|jj|d}t!tj| z d |_|jr||_||j ur||_|jr|j$|_|j&so|j(r,|j+t,j.|j0 }n7|j2r+|j+t,j4|j0 }|jj6} t8j;d |d t=|r(t8j;d |j?|nt8j;dt8j;d| tA|dr%tC|jDr|jEtA|jFdr9tC|jFjDr|jFjE||jH_%tM|dd} |jNdk\rd|_(tS|jjTd} tS|jjTd} tS|jjTd}tS|jjTd}d}i}d}tW|D]j\}}tY|}| ||z }| |} |j[||||\}}}tM|j dd}d|j\vr|j_||nd}tartcjd|1|jg|ji| }| jk||[|jjm|dd}|jg|}|jjnr|dk(r|jk|||jjm|dd}|y|jjm|dd}|jp|jq||}|jg|}|jjnr|dk(r| jk||=|jg|}|jjnr|dk(r|jk||jHjs||jt|jv|_;|jjnr|jxo|m|k|jjzj|}i}d|j\vr|nd|d<d|j\vr|nd|d<|jytd(||d||}~~~~t,jj|j|dz|jzdk(s | j| j|j|j~~~~t,jjm|jj|_3|jNrtA|dr t|d| j} | j} |j}|j}t=| r t| }nLt| trtM| d ddkDr | j>}nt=|r|j?|}n|}|dk(r|dkDr|}|jxb| `|^|jjnsHd|j\vr| nd|d<d|j\vr|nd|d<|jytd(| |d|}n|i}t|}t| tr:| r8tj| jj||d!<n>t| tjr$| jj||d!<tA|d"r|j||d#<tA|d$r|j"||d%<t|jD]0}|j|d&r|j|||d&|<2t| |||'S)) Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`. Works both with or without labels. NrTr# inferenceFryr revaluation_moderrr; ***** Running  *****rz Num examples: Unknown Batch size = rrr padding_indexrr r#rr )rr pad_indexrrlossesrrrrr _lossrrmodel_preparation_timerrjrrrCrr)UrrrGr&r%rrr:rS_modelsrrNr% torch_compiler  prepare_modelrrrRr'rKrrfloat16r;rLryrrrrar r?rirrYr^rr6r4rr=eval_do_concat_batchesr9rEprediction_stepinclude_for_metricsrhrrrgather_functionrepeatrpad_across_processesrron_prediction_steprvrrr:end_of_dataloaderrSrreval_accumulation_stepsto_cpu_and_numpygather_for_metricsr] get_arraysrr>r]rr concatenaterrrrrrr>rR)rr rrrrrrjrrrr all_losses all_preds all_labels all_inputsrCeval_set_kwargsobserved_num_examplesrprobserved_batch_sizerrrr  inputs_decode is_last_step batch_kwargsrrs rrzTrainer.evaluation_loopsyy7K7W3]a]v]v  $ $)?!$1MDAq  e  S t'' (A -%4::2EJ,,((T-=-=-M-MQV-V_c_h_h_v_v  ((/%%33E43P  +0 j0H!*LD '##" DJJ&%*"((!%!3!3""u}}T[[I$$u~~dkkJYY..  &{m6:; j ! KK+D,=,=j,I+JK L KK1 2 oj\23 5& !huzz&: JJL 4>>6 *x8K8K/L NN   !0:-z9d; ??a DJ'tyy'G'GW[\ %dii&F&FVZ[ &tyy'G'GW[\ &tyy'G'GW[\ !"&j1C )LD&"1&"9 ".%)<<%%!4J&*%9%9%I]kv%9%w "FFF%djj2C[QO@HDLdLd@d##F?$;>v1X\>]!))>>v1X\>]55A!??OF--f5yy33{l7RMM&)!--f5yy33{l7RNN6*00CCD$**VZVbVbcDLyy++''38JvOa#'#3#3#B#B#T#TL#%L7=AYAY7YV_cL*7?4C[C[7[VaeL*"22&\6V\|\'33G FFF &&(--9tax4KgKg>gkl>l++-**,++-++-FFF &&(GC )L $//BB ??wtW5 D' " **, ((* **, **,  l #l+K &: ; Vdfg@hkl@l&33K*%"// ; 3 !  5 9/K  ,%&II006<@X@X6X ^bOH %6>$BZBZ6Z `dOH %**^9 ^o^G_G(0 j$ 'J35>>*3M3R3R3T3Y3Y3[G()/ 0  BJJ /3=??3D3I3I3KG()/ 0 4/ 0CGC\C\G())>? @ 41 2EIE`E`G())@A B ' IC>>%6$7q"9:8? C8H,-Qse45 I)zSZhsttrcX|ytr|d}t||}|Str t|}|S|jj #|jj j dk7s/|jj $|jjdk7r t|}|S) Gather value of `tensors` (tensor or list/tuple of nested tensors) and convert them to numpy before concatenating them to `gathered` N nested_gatherNOr ) rrLrrrdistributed_staterA local_rankrDrtensorsnames rrzTrainer._nested_gathers ?  ! #|&,Wd;G % & )G  ii))5$)):U:U:f:fjn:n II ' ' /DII4H4HB4N(1Grct|jdk(rdntfd|jD}jd}| |j}t|jdk(xr|}|j ;t |jdr#t|jjddgng|s|r;ttfd |jD}t|d k(r|d}nd}tj5trt|} |s|r{t!| t"r(| d } tfd | j%D} n | d} | d d} | j'j)j+} t-| } nd} t!| t"r#tfd | j%D} n| } t-| } nE|s|r|j/5|j1g|j2j4}|j7|d|\} }ddd j)j9} t!t"r#tfd|j%D} n|d d} nd} |j/5|di}dddt!t"r#tfd|j%D} n|} |j2j:dk\r!||j2j:d z |_ddd|r ddfSt } t| d k(r| d}  | |fS#1swY%xYw#1swYxYw#1swYLxYw)a Perform an evaluation step on `model` using `inputs`. Subclass and override to inject custom behavior. Args: model (`nn.Module`): The model to evaluate. inputs (`dict[str, Union[torch.Tensor, Any]]`): The inputs and targets of the model. The dictionary will be unpacked before being fed to the model. Most models expect the targets under the argument `labels`. Check your model's documentation for all accepted arguments. prediction_loss_only (`bool`): Whether or not to return the loss only. ignore_keys (`list[str]`, *optional*): A list of keys in the output of your model (if it is a dictionary) that should be ignored when gathering predictions. Return: tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and labels (each being optional). rFc3DK|]}j|duywrr)rrrs rrz*Trainer.prediction_step..s Av`a&**Q-W[B[Avs  return_lossNr}keys_to_ignore_at_inferencepast_key_valuesc3@K|]}j|ywrr;)rr8rs rrz*Trainer.prediction_step..s(WdD)9(Wrir rc3:K|]\}}|dgzvs|ywrNrrrr@rs rrz*Trainer.prediction_step..s&)o1qXcgmfnXnOn!)oc32K|]\}}|vs |ywrrrBs rrz*Trainer.prediction_step..s)d1qXcOc!)d T)rrc3:K|]\}}|dgzvs|ywrArrBs rrz*Trainer.prediction_step..,s&&hTQQ\`f_gQgHgq&hrCc32K|]\}}|vs |ywrrrBs rrz*Trainer.prediction_step..4s&]TQQ\H\q&]rEr)r:rrurrvrqr?rr6r}rJrrrtrrrrr6rrrrr_get_num_items_in_batchrr;rrr4r)rrrrr has_labelsr<loss_without_labelsr raw_outputsr logits_mbrrrrs ` ` rrzTrainer.prediction_steps<<"$"2"23q8UcAveieueuAv>v jj/  ..K!$"2"23q8H[%%f-  tzz8,%djj&7&79VYjXkl   ,"5(WdFVFV(W#WXF6{aF ]]_+ G&(.uf= !4!+t4"-f"5$))o 8I8I8K)o$o "-a.$/O "..0779==?D.y9FD!+t4$))d 8I8I8K)d$d $/ .y9F!4::<-1-I-I6(TXT]T]TdTd-e*(,(9(9!6$Se):) g  ;;=--/D!'40!&&hW]]_&h!h!(D::<2"'/&/2!'40!&&]W]]_&]!]!(yy++q0%,TYY-A-AA-E%F W+ GZ $% %v& v;! AYFff%%=22G+ G+ Gs@C.NAM*A1N8 M7A7N*M4 /N7N <NN cft|jdr|jj|Sy)a For models that inherit from [`PreTrainedModel`], uses that method to compute the number of floating point operations for every backward + forward pass. If using another model, either implement such a method in the model or subclass and override this method. Args: inputs (`dict[str, Union[torch.Tensor, Any]]`): The inputs and targets of the model. Returns: `int`: The number of floating-point operations. rHr)r?rrHrns rrHzTrainer.floating_point_opsDs+ 4::3 4::008 8rrc|jsy|jj8t|jjj j }n|jj}||n|jj}t|||jjd}|j|_d|_ y)zE Initializes a git repo in `self.args.hub_model_id`. NT)rprivater ) rrrcrrabsoluter8 hub_tokenrhub_private_reporepo_idpush_in_progress)rr repo_namerepo_urls rrezTrainer.init_hf_repoVs ))+  99 ! ! )TYY112;;=BBI ..I* 0C0Cytyy?Y?Ydhi$,, $rlanguagelicensetagsrfinetuned_fromtasks dataset_tags dataset_argsc H|jsytjj|jj d} d} tjj | rtj| jjd} | dk(} tj| jj} |2| 0t|tr|g}| D]}||vs|j|tj ||||||||||  }|j#}t%| d5}|j'|ddd| rI|j(j+|j,j/|jj yy#1swYUxYw)a Creates a draft of a model card using the information available to the `Trainer`. Args: language (`str`, *optional*): The language of the model (if applicable) license (`str`, *optional*): The license of the model. Will default to the license of the pretrained model used, if the original model given to the `Trainer` comes from a repo on the Hub. tags (`str` or `list[str]`, *optional*): Some tags to be included in the metadata of the model card. model_name (`str`, *optional*): The name of the model. finetuned_from (`str`, *optional*): The name of the model used to fine-tune this one (if applicable). Will default to the name of the repo of the original model given to the `Trainer` (if it comes from the Hub). tasks (`str` or `list[str]`, *optional*): One or several task identifiers, to be included in the metadata of the model card. dataset_tags (`str` or `list[str]`, *optional*): One or several dataset tags, to be included in the metadata of the model card. dataset (`str` or `list[str]`, *optional*): One or several dataset identifiers, to be included in the metadata of the model card. dataset_args (`str` or `list[str]`, *optional*): One or several dataset arguments, to be included in the metadata of the model card. Nz README.mdF library_namer) rWrXrYrrZr[r\rr]w)rrgr^rrrrrrrnrrYrrrzr) from_trainer to_model_cardopenwriterSr,rcreate_or_update_model_card)rrWrXrYrrZr[r\rr]model_card_filepathis_peft_libraryr_ existing_tagsrtraining_summary model_cardrs rcreate_model_cardzTrainer.create_model_cardhswJ))+  ggll499+?+?M 77>>- .$>>*=>CCGGWL*f4O&NN+>?DDIIMM$=dC( 6D()C$ C()+77 !)%%  &335 %s + q GGJ       ) )$** 5 Q QRVR[R[RfRf g   s 2FF!c  |jr'|jjtjk(ry|jj s'|j |j jsy|jj}ttttg}ttfD]}tj j#||}tj j%|sC|j'|t)|5}t+j,|j/}dddt1t3dj5}|j7|t9r |j7t:t<t>g|D]} tj j%tj j#|| sAtAjBtj j#|| tj j#|| |jD|jDjG|tIjJ|jtj j#|tL|jjNtPjRk(rd|jTjV} n"dtY|jTjZ} t]|j^|| |jj`ddtbdg|jjd} | g} |jjtjftjhfvr|jjtjfk(rdntk|jl} t]|j^|| | d z|jj`d|jjd }| j'||j |j jrto| |_y|j jpj7| y#1swYHxYw) N weight_mapzTraining in progress, step zTraining in progress, epoch Tr7rrS folder_pathrr run_as_futureignore_patternsrzlast-checkpointz , checkpoint)rSro path_in_reporrrpr)9rr hub_strategyrUENDhub_always_pushrTis_donerrmrnrrrprqrorgr^rr*rzrcjsonloadsreadrr8r9extendrrjrlrkrfrIrrrrcrrrXrrvr_rr)rrcrQrPr CHECKPOINTALL_CHECKPOINTSrr8rtjobs)rrrmodeling_files index_file index_pathrr shard_files modeling_filermodel_push_job push_jobsrrcheckpoint_pushs rrzTrainer._push_from_checkpoints_))+tyy/E/E/X yy((T-B-B-NW[WlWlWtWtWv YY)) %'=|M^_-/FG 3J&7DJww~~j)%%j1*%1 JJqvvx0E1"3u\':'A'A'C#DE %%k2 3    ! !#68LNg"h i+ uMww~~bggll+Y>Y%Z Z%)YY%;%;{?U?U%U![_`q[r[w[w ,))-)->ii))"//O   _ -  (D,A,A,I,I,K$29$=D !  ! ! & & - -i 8e11s $Q77R ct|dsy|jK|jjs0tj d|jj yyy)NrTz\Waiting for the current checkpoint push to be finished, this might take a couple of minutes.)r?rTrvrrwait_until_doners rrhzTrainer._finish_current_pushsUt/0   ,T5J5J5R5R5T KKv w  ! ! 1 1 36U ,rrblockingrc |jdd}|~|jjrh|jj*t |jj j }n(|jjjdd}||n|jj}|j|j||jd|jsyt|jdd\d |vrg|d <t|d tr |d g|d <|jj D]}||d vs |d j#| |j$d d|i|||jj&}|j)t+|j|jj ||| d t,d g| S)u Upload `self.model` and `self.processing_class` to the 🤗 model hub on the repo `self.args.hub_model_id`. Parameters: commit_message (`str`, *optional*, defaults to `"End of training"`): Message to commit while pushing. blocking (`bool`, *optional*, defaults to `True`): Whether the function should return only when the `git push` has finished. token (`str`, *optional*, defaults to `None`): Token with write permission to overwrite Trainer's original args. revision (`str`, *optional*): The git revision to commit from. Defaults to the head of the "main" branch. kwargs (`dict[str, Any]`, *optional*): Additional keyword arguments passed along to [`~Trainer.create_model_card`]. Returns: The URL of the repository where the model was pushed if `blocking=False`, or a `Future` object tracking the progress of the commit if `blocking=True`. rN/r )rTr\ model_tagsrYr7rrnr)r>rrfrcrrr8rrQrer`rr6rrrrrzrkrrhrrP)rrrrrrr model_tags rrdzTrainer.push_to_hubs6ZZ d3  $))"7"7yy%%-!$))"6"67<< !YY3399#>rB * 0C0C    $   E  * t,))+  4::|T 2 >V#!#v&.#."(.!1v!ZZ22 5 F6N26N)))4 5 ?*??  yy--H !!#%% ,,)&,!&;%}tA|| | }|shd}tC|dr0tE|jFtHr|jFj4}tA|| |}tA|| |}tA|| |}|jKtC|jLdr9tO|jLjJr|jLjK|jPdk\rd|_)||jT_+tY|D]w\}}|j[||||\}}}t1|jdd}d|j\vr|j_||nd}|.|ja| }| |nt'jb| |fd} || |n te| |d} || |n te| |d} |||n te||d}|jTjg||jh|jj|_5|jjlr{|jno| m| k|jjpjr}i} d|j\vr| nd| d<d|j\vr|nd| d<|jotud(| | d| |}|jjls%|jv|dz|jvzdk(s|jy|j{| d|scjy|j{| d jy|j{| d!jy|j{|d"~ ~ ~ ~t&j|jd#\} } } }z|jPrtC|d$r t|d$|jy|j{| d|scjy|j{| d jy|j{| d!jy|j{|d"|j}!|sjnd}"|sjnd}#|sjnd}$|jnb|"`|#^|jjlsHd|j\vr|!nd|d<d|j\vr|$nd|d<|jotud(|"|#d|}n|i}t|}|!$|!jj||d%<t|jD]0}%|%j|d&r|j|%||d&|%<2t|"|#|| 'S))rz+dataloader must implement a working __len__NrTrFrrr_is_accelerate_preparedz\Batch size cannot be None. Ensure the dataloader has a valid batch_size or total_batch_size.r r rr r )make_multiple_ofrrrr r#r)rrrr rrrr eval_losses eval_predseval_label_idseval_inputs_ids)NNNNrrrjrr)JrrarrrGr&r%rrr:rSrrNr rrRr'rKrrrr;rLryr6rrr rrrrr<r?rrrBrrYrir4rr^rr9rrrhrcatrIrrvrrrr:r rSr! add_arrays_gather_and_numpifyrrr]finalizer]rrrrrr>rR)&rr rrrrrrjrrr  losses_host preds_host labels_host inputs_hostrCr*reval_losses_gathererrpreds_gathererlabels_gathererinputs_gathererrprrrrr r-rr.r/ eval_losspredsr inputs_idsrs& rrzTrainer.prediction_loopQsyy*%JK K7K7W3]a]v]v  $ $)?!$1MDAq  e  S t'' (A -%4::2E,,0D0D  ((/%%33E43P  ##" DJJ&%*"((!%!3!3""u}}T[[I$$u~~dkkJz#>6 *x8K8K/L NN   ! ??a DJ0:-%j1. [LD&#'#7#7vG[it#7#u D&&%djj2C[QO@HDLdLd@d##F?$;&>z<&X Y  & &t'?'? M]'^ _  & &t'?'? M^'_ `(113 1E'')46JO,,.PT 7K_--/QU   ,!%II005;t?W?W5W ]aOH %6>$BZBZ6Z `dOH %**>+teW`+tds+tuG _G(0  3<>>3C3H3H3JG()/ 0 ' IC>>%6$7q"9:8? C8H,-Qse45 I%9gcopprc|ytrt||}t|Strt|}t|S|jj t jk(r t|}t|S)r1N) rrLrrrrHrhrIrDrKr6s rrzTrainer._gather_and_numpify sy ?  ! #,Wd;G g&& % & )Gg&&YY $ $ (@(@ @(1Gg&&rc|jsyddg}tjjtjj |j j drWttjj |j j d5}|j}dddnd}}|D]&}||vs|jdr||z }|d|z }(||k7rpttjj |j j dd5}tjd||j|ddd|j jdtjd |j j!s6|j j#d |j j%yy#1swY-xYw#1swYxYw) z8Add SageMaker Checkpointing patterns to .gitignore file.Nz*.sagemaker-uploadingz*.sagemaker-uploadedz .gitignorer r`z"Writing .gitignore file. Content: g?z'Add *.sagemaker patterns to .gitignore.)rrgr^rrrepo local_dirrcryrtrrHrdgit_addrsleep is_repo_clean git_commitgit_push)rpatternsrcurrent_contentcontentpatterns r_add_sm_patterns_to_gitignorez%Trainer._add_sm_patterns_to_gitignores))+ +-CD 77>>"'',,tyy':':LI Jbggll499#6#6 EF +!"#&&( + +!O" .Gg%##D)w&GG9~-G  . o %bggll499#6#6 EsK !q A'KL  ! ,' 3yy&&( II !J K II   )3 + +  ! !s$G#0*G0#G-0G9c > i}tdr@|jjj |jjj}d|vr8|jjdkDr t d|d|j_|jjj }tdrSgd}td$i|Dcic]}||j|c}}tdr|jj|_ |jd}tds |r?td |r+|jjstjd |_|jd d |jji}|jj /td s td|jj |d<tdr|d<n|j#|t%|j&dr|j&j(~|j&j(dkDred|_t-j.t0t-j.dkDr$t3|j&j(|d<n t dt5d$i||_|j6j8|_dt=j>|j:j@vr:tCjD|j:|jjF|_tI|j6jJd ddu|_&tI|j6jJdddu|_'tI|j6jJdddu|_|jNr|j6jJjP}dD]=}tS|||jjTjW|tI||?|jXr!|jjZr t d|jLr'tI|jdd|j]|jj^rL|jLs |jNr4|jj`r|jLrdnd} t | d|jLrN|j6jJjjbd k(r!|jjdr t d!|jj^rN|jNrAd"tg|j6jJjPjhvr t d#yyycc}w)%Nrrr zThe `AcceleratorConfig`'s `num_steps` is set but `gradient_accumulation_steps` is greater than 1 in the passed `TrainingArguments`If using the passed `AcceleratorConfig` is desired, do not set the `TrainingArguments` `gradient_accumulation_steps`.) split_batchesdispatch_batches even_batchesuse_seedable_samplerz1.1.0 non_blockingrzp`non_blocking` is only supported in accelerate v0.30.0 and above. Please upgrade accelerate to use this feature.zx`non_blocking` is enabled but `dataloader_pin_memory` is not. For the best performance, it's recommended to enable both.gradient_accumulation_kwargsr>z1.10.1zgParallelismConfig requires accelerate v1.10.1 and above. Please upgrade accelerate to use this feature.r|dataloader_configrTr)rtorch_tp_pluginz4Requires accelerate>1.3.0 to use Tensor Parallelism.use_gather_object)rr)limit_all_gathersactivation_checkpointingzThe activation_checkpointing in FSDP config and the gradient_checkpointing in training arg can't be set to True simultaneously. Please use FSDP's activation_checkpointing logic when using FSDP.r< DeepSpeedrzJ can't be used with `save_only_model` along with `load_best_model_at_end`.rpzo`auto_find_batch_size` isn't supported yet with DeepSpeed Zero-3. Please consider using Zero-2, Zero-1, or FSDPSHARDED_STATE_DICTzWsave_only_model option is not compatible with FSDP state dict type 'SHARDED_STATE_DICT'r)5ryraccelerator_configrrrto_dictrr> data_seedrArrr@rr>r|r{r?rrr&rraccelerate_versionrrrSr#rrrrrreval_use_gather_objectr6rvrGrNrr8rErrr"rrr! zero_stagerrr) rgrad_acc_kwargsrr rrrrrwrappers rr(z*Trainer.create_accelerator_and_postprocessBs "8 ,1M1M1j1j1v"ii::WWO / )yy44q8 L 9H 8T 5!YY99AAC "8 , m  7!EVWE5,0077W! 'w/.2ii.A.A!+)--n= &x0!GDII$C$CO.:  *=>  : :  99 ' ' 3*84!}*.)E)ED% & "8 ,(9D$ % KK* + 4::y )djj.@.@.LQUQ[Q[QcQcfgQg!%D }}/07==3II*CDJJL^L^*_&' !WXX'..#//BB '"3"3D4H4H"I"T"T T#,#4#4$$ 8X8X$D %,D,<,<,B,BDVX\$]ei$i!&t'7'7'='=}dS[__$T%5%5%;%;=NPTU]aa   **00<%>kFGy(rst t  % %  &&77BBaG ..B  II % %$$$D,<,<,B,B,N,N,^,^(__vw w`% &}XsV cddlm}|jjj}||j j |_|j j |_|j j|j|y)zO Sets values in the deepspeed plugin based on the Trainer args rr-N) r;r.rSrvr>r/r}rr=r)rrr. ds_plugins rrz#Trainer.propagate_args_to_deepspeedsg Q$$**;; !9):P:P:W:W!X %.%;%;%B%B "55diiAUVrc|jr`t|jrIddlm}ddlm}t|jj|r5||j|jjj_ t|jddtjk(r|jj j"j$j&rt)j*t,t)j*dkDrZ|jjjj/|jj j"j$dyyyyyy)Nr) PeftConfig)fsdp_auto_wrap_policyrz0.27.0T)r)rNrrrrpeft.utils.otherrractive_peft_configrSrvrrr6rrOrrCbnb_4bit_quant_storagerjrrrset_mixed_precision)rrrs rr$z"Trainer._fsdp_qlora_plugin_updatess    N4::$> ' >$**77DF[\`\f\fFg  &&22C $94@DVDeDeeJJ++??VVhhMM"45 h8OO  &&22FFJJ++??VVaeGPif%? rrr;cbd}t|dkDxr%d|dvxr|jxs|jdu}|r td|D}|I|j jrR|j jdk\r9|jj|j|j}n2|j jdk\r||j jz}tj|r|j|}|j jdkDrH|jdk(r5|j!dj#|j jd}t%|jddx}r||j&z}|S#tt f$rY`wxYw)a Counts the number of items in the batches to properly scale the loss. Args: batch_samples (`list`): List of batches device (`torch.device`): The device on which the number of items in the batch should be. Returns: None if the number of items in the batch doesn't need to be computed else the number of items in the batch Nrrc3`K|]&}|djdj(yw)rrN)nerD)rr&s rrz2Trainer._get_num_items_in_batch..s((ee%/*<*X&XY  ,,Z8L~~!#,0J#JS ::Q>N$ %.0F$F!#'99T-B-B#C $($5$5j$ADDYDY$Y! ^^a "{{ )2 &1DNNBL $1G G NN#%   &       r) NNNNNNNNNNrNN)rNr)NFN)F)TN)NNN)NNNNN)NNminimizeNNr)FN)NF)NNr)Nr!) NNNNNNNNN)zEnd of trainingTNN)r5 __module__ __qualname____doc__trainer_pt_utilsrrrrrrr r*rModuler rirrrrrr3r$r!r1r rSrr9rrr? OptimizerrZLambdaLRrr rrpropertyrsetterrrr_rrrPrrrrrmrnSamplerrrrrr rrrr"r(r0r%rUr[ parameterrr] staticmethodr=r&r r(rBrrZrTr/r~rrrUrrrrbrr_rr3rTr6rrrrr'rr(r+rTrQrbrarhrqrzrrrrDrrrr`rrr`rPrcrrrVrrRrrrrHrerkrrhrdrrrr(rr$r;rHrr7rrrrrr1s| \~lk[+=wdhi:>,004W[Y] ?C04FJ59jvaehl!Vc_bii56Vc()Vc - Vc  g@R&R ST Vc uWd3<.@BT%TUV Vc# )+=?UWee f VcXc?&:;<Vc$H-Vc"(N+;T+A"BCVcD12Vc(5;;#8#898EKKD\D\DeDe;ffgVc#+5ekk6K6K1LdSVX[S[n1\+]"^Vc (0%,, 9UW\WcWc9c0d'e!VcjVcp %8$;<%%11 &<" 5< 8  GR \;.@;xX[};BEI'%'4B=A48 u (sDy)9 :u^T#s(^T9:u'tCy1 u  un S Y#YimwHr(X=tzx ,hl'^R)_V$"L:Ix0id?1Bq5f1(56%9RLPKO+5;?=AR8^$4d3:6F$FGHR$Hd3:.>-?-F$GHR R d3i( R % 678 R(N#3S#89:R wW % &Rh_S%Z(_huo_QU_.5s):#;ellTWFW@X$d3ellC6G0H+H&IdSVX]^c^j^jlo^oXpSpNq )VS.d3ell\_N_H`C`>aS.j  HTN  6: \!yy\!S% c 1223\!%U\\2 \!  \!D %59 Q;yyQ;S% c 1223Q; Q; %U\\2 Q;f2t2 0t 06[Xc]6[46[pF>HSMF>P)L )LV "1FRW#" c#"J:6FJ+/!' fuWd3<.@%@ABfd3i(f f c5j  fRhn>t#>t2:492E>tad>t >tH04+/!' VuVuVu'tn Vu d3i( Vu  Vu Vup0,0 j&yyj&S% c 1223j&# j& d3i( j& x %x '=x ?UU V j&Xc5s9J3K.K)L$%(3-%(#'!%,0$((,-148/348Hh3-Hh#HhCcD() Hh SM Hh ! HhS$s)T)*HhCcD01HhsDIt+,HhCcD01HhTB9H4):#"& N   N N } N 3- N  N p04+/!' xqxqxq'tn xq d3i( xq  xq xqt' %!Nyxv W 1"T1"5<<1"T\]bchcocoqtct]uTv1"f1&1581BG,,1 tXeELL#$5677 81"> %> 3=> WZ> rr(rrrIrrAimportlib.metadatarrrwrrgrrrfrrRrr1collections.abcrrrpathlibrtypingrr r r r integrationsrhuggingface_hub.utilsrmrrrsafetensors.torchrrtorch.distributed distributedr huggingface_hubrrr packagingrrtorch.utils.datarrrrrrrconfiguration_utilsrdata.data_collatorrrr debug_utilsrr!feature_extraction_sequence_utilsr feature_extraction_utilsr!rbr"r#image_processing_utilsr$integrations.deepspeedr%r&r'integrations.tpur( modelcardr)modeling_utilsr*r+r,models.auto.modeling_autor-r. optimizationr/r0processing_utilsr1 pytorch_utilsr2tokenization_utils_baser3trainer_callbackr4r5r6r7r8r9r:r;rr<r=r>r?r@rArBrCrDrErFrGrHrIrJrKrLrMrNrO trainer_utilsrPrQrRrSrTrUrVrWrXrYrZr[r\r]r^r_r`rarbrcrdrerfrrgrhrirjrkrlrmrnrorprqrrrsrtrurvrwrxryrzr{r|r}r~rrrrrrrrrrrrrrrrrrrutils.deprecationrutils.import_utilsrutils.quantization_configrr\rautils.notebookrrtorch_xla.core.xla_modelr xla_modelrtorch_xla.debug.metricsrHrCr[torch_xla.runtimeruntimer torch_xla XLA_VERSIONrrtorch_xla.distributed.spmdspmdr!smdistributed.modelparallel.torch modelparallelrusmdistributed.modelparallel SMP_VERSIONrtrrrrrrrrrraccelerate.staterr:rrrrrrr DATA_SAMPLERSraccelerate.data_loaderrrrrrrrK get_loggerr5rrrbrerJr9rfrrrrrrs    -@@  - AAcc1\\<G<`6ee1&RR3,=   ,2KJ)))))))))))T/(9)),8 8))))"4*W]];7=7==I_;``//"33F - k :mgmmF>S Sgg %:<1#OMw}}'(=7==+AA><+,,M>8$8 7&   H %)) $&  dT dT  dT r