L i;$ZddlZddlZddlmZmZddlmZddlmZm Z ddl Z ddl m Z ddl mZddlmZdd lmZdd lmZmZd d lmZmZmZmZej6eZeej>Z e!d e DZ"eGddZ#GddeZ$GddeZ%y)N) dataclassfield)Enum)OptionalUnion)FileLock)Dataset)$MODEL_FOR_QUESTION_ANSWERING_MAPPING)PreTrainedTokenizer)check_torch_load_is_safelogging) SquadFeaturesSquadV1ProcessorSquadV2Processor"squad_convert_examples_to_featuresc#4K|]}|jywN) model_type).0confs f/mnt/ssd/data/python-lab/Trading/venv/lib/python3.12/site-packages/transformers/data/datasets/squad.py r"sEDOOEsceZdZUdZeddddj eziZee d<edddiZ ee d <ed dd iZ e e d <ed dd iZ e e d<edddiZe e d<edddiZe e d<edddiZee d<edddiZee d<edddiZee d<edddiZe e d<eddd iZe e d!<ed"dd#iZe e d$<y)%SquadDataTrainingArgumentszb Arguments pertaining to what data we are going to input our model for training and eval. Nhelpz!Model type selected in the list: z, )defaultmetadatarzFThe input data dir. Should contain the .json files for the SQuAD task.data_dirzThe maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.max_seq_lengthzVWhen splitting up a long document into chunks, how much stride to take between chunks. doc_stride@zkThe maximum number of tokens for the question. Questions longer than this will be truncated to this length.max_query_lengthzThe maximum length of an answer that can be generated. This is needed because the start and end predictions are not conditioned on one another.max_answer_lengthFz1Overwrite the cached training and evaluation setsoverwrite_cachezDIf true, the SQuAD examples contain some that do not have an answer.version_2_with_negativegzIIf null_score - best_non_null is greater than the threshold predict null.null_score_diff_threshold n_best_sizerzjlanguage id of input for language-specific xlm models (see tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)lang_idz3multiple threads for converting example to featuresthreads)__name__ __module__ __qualname____doc__rjoin MODEL_TYPESrstr__annotations__r r"intr#r%r'r(boolr)r*floatr,r-r/rrr%s(KdiiXcNd(deJ(pqHc  Q NCrsJ" / c# J s")\ ]OT%*)o p%T(-v'rs(uf&qrK C GSf6k-lmGSmr<rceZdZdZdZy)SplittraindevN)r0r1r2r?r@r;r<rr>r>hs E Cr<r>ceZdZUdZeed<eeed<eed<e ed<dejdddfded e d e e deeefde d e ed efd ZdZdeeej(ffdZy) SquadDatasetzH This will be superseded by a framework-agnostic approach soon. argsfeaturesmodeis_language_sensitiveNFpt tokenizer limit_length cache_dirdataset_formatc ||_||_|jr tn t |_t |tr t|}||_ |jrdnd}tjj||n |jd|jd|j j"d|j$d|} | dz} t'| 5tjj)| r|j*st-j,} t/t1j2| d|_|j4d |_|j4j9d d|_|j4j9d d|_t>jAd | d t-j,| z |j: |j<dt>jCd| dnI|tjDk(r+|j jG|j|_n*|j jI|j|_tK|j<||j$|jL|jN|tjPk(|jR|\|_|_t-j,} t1jT|j6|j:|j<d| t>jAd| dt-j,| z dddddy#t$r tdwxYw#1swYyxYw)Nzmode is not a valid split namev2v1cached__z.lockT) weights_onlyrDdatasetexamplesz"Loading features from cached file z [took %.3f s]zDeleting cached file z; will allow dataset and examples to be cached in future run)rSrHr"r#r% is_trainingr/return_dataset)rDrRrSz!Saving features into cached file z [took z.3fz s])+rCrFr)rr processor isinstancer6r>KeyErrorrEospathr4r value __class__r0r"rexistsr(timer torchload old_featuresrDgetrRrSloggerinfowarningr@get_dev_examplesget_train_examplesrr#r%r?r/save) selfrCrHrIrErFrJrK version_tagcached_features_file lock_pathstarts r__init__zSquadDataset.__init__ws %:"/3/K/K)+QaQc dC  AT{ "::d !ww||".IDMMdjj\9#6#6#?#?"@$BUBUAVVWXcWd e )72 i - ww~~23Dr?start_position end_position) riifeaturerurvrwr}r~rinputsrrs r __getitem__zSquadDataset.__getitem__s--"LL!2!2%**E g&<&?)) wIOO5;;)WZ^ZcZcZkZk)kmn 99 ##ll7+A+ATO!LL)=)=UZZPM MMoP]^ _ r<)r0r1r2r3rr7listrr>r9r?r rr8rr6rnrqdictr_Tensorrr;r<rrBrBms %$=!! K '+"'++&+#'"J(J'Jsm J CJ J $ JC=JJX" S%,,%6 7 r<rB)&rYr^ dataclassesrrenumrtypingrrr_filelockrtorch.utils.datar models.auto.modeling_autor tokenization_utilsr utilsr rprocessors.squadrrrr get_loggerr0rcrkeysMODEL_CONFIG_CLASSEStupler5rr>rBr;r<rrs (" $M56tt   H %E@EEGHE0DEE  ?n?n ?nDD y7yr<