`L iJdZddlZddlZddlmZddlmZddlmZddl Z ddl Z ddl mZddlmZdd lmZmZdd lmZdd lmZd ed edefdZd ed ede j2fdZdZ ddZ ddZ ddZy)z9Implementation of ARFF parsers: via LIAC-ARFF and pandas.N) OrderedDict) Generator)List)_arff)ArffSparseDataType)chunk_generatorget_chunk_n_rows)check_pandas_support) pd_fillna arff_datainclude_columnsreturncNtttf}t|Dcic]\}}|| }}}t|d|d|dD]J\}}}||vs |dj||dj||dj||L|Scc}}w)aObtains several columns from sparse ARFF representation. Additionally, the column indices are re-labelled, given the columns that are not included. (e.g., when including [1, 2, 3], the columns will be relabelled to [0, 1, 2]). Parameters ---------- arff_data : tuple A tuple of three lists of equal size; first list indicating the value, second the x coordinate and the third the y coordinate. include_columns : list A list of columns to include. Returns ------- arff_data_new : tuple Subset of arff data with only the include columns indicated by the include_columns argument. rr)list enumeratezipappend) r r arff_data_new array_idx column_idxreindexed_columnsvalrow_idxcol_idxs c/mnt/ssd/data/python-lab/Trading/venv/lib/python3.12/site-packages/sklearn/datasets/_arff_parser.py_split_sparse_columnsrs.*.(@M;D_;U"7)Z I"%Yq\9Q<1!N@Wg o % !  # #C ( !  # #G , !  # #$5g$> ? @ s B!c0t|ddz}|t|f}t|Dcic]\}}|| }}}tj|tj }t |d|d|dD]\}} } | |vs ||| || f<|Scc}}w)Nrdtyperr)maxlenrnpemptyfloat64r) r rnum_obsy_shaperrryrrrs r_sparse_data_to_arrayr*9s )A,!#GO,-G;D_;U"7)Z I  +A!$Yq\9Q<1!N9Wg o %58Ag(11 29 Hs Bcz||}t|dk\r ||}||fSt|dk(r ||d}||fSd}||fS)a Post process a dataframe to select the desired columns in `X` and `y`. Parameters ---------- frame : dataframe The dataframe to split into `X` and `y`. feature_names : list of str The list of feature names to populate `X`. target_names : list of str The list of target names to populate `y`. Returns ------- X : dataframe The dataframe containing the features. y : {series, dataframe} or None The series or dataframe containing the target. rrrN)r#)frame feature_names target_namesXr)s r_post_process_framer0Ksh, mA <A ,  a4K \ a  ,q/ " a4K  a4Kc d}||}|dk(rtjntj}|dk( } tj||| } ||z} | dD cic]\} } t | t r| | vr| | }} } |dk(rt d}t| d}t |j}t| d}|j|g|d }|jd j}t|}|Dcgc] }|| vs| }}||g}t| d|D](}|j|j||d |*t!|d k\r$|d j#|dj$|d <|j'|d }t)||}~~i}|j*D]N} || d}|j-dk(rd|| <$|j-dk(rd|| <=|j$| || <P|j#|}t/|||\}}nn| d}|D cgc]} t1|| d}!} |D cgc]} t1|| d}"} t |t2rz| t5d|d dk(rd}#n |d |dz}#t7j8t:j<j?|d|#}|j@|}|dd|!f}|dd|"f}nt |tBrtE||!}$tG|ddz}%|%t!|!f}&tHjJjM|$d |$d|$d ff|&t6jN}|jQ}tS||"}nt5dtU||D chc]} | |v}'} |'sntW|'rt7jXt[|D( cgc]`\}(} t7j\t7j^|ja| d|dd|(|(dzfj#t0dbc} }(}ntc|'r t5d |jdddk(r|jAd!}n|jddd k(rd}|dk(r||dfS||d|fScc} } wcc}wcc} wcc} wcc} wcc} }(w)"aARFF parser using the LIAC-ARFF library coded purely in Python. This parser is quite slow but consumes a generator. Currently it is needed to parse sparse datasets. For dense datasets, it is recommended to instead use the pandas-based parser, although it does not always handles the dtypes exactly the same. Parameters ---------- gzip_file : GzipFile instance The file compressed to be read. output_arrays_type : {"numpy", "sparse", "pandas"} The type of the arrays that will be returned. The possibilities ara: - `"numpy"`: both `X` and `y` will be NumPy arrays; - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array; - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a pandas Series or DataFrame. columns_info : dict The information provided by OpenML regarding the columns of the ARFF file. feature_names_to_select : list of str A list of the feature names to be selected. target_names_to_select : list of str A list of the target names to be selected. Returns ------- X : {ndarray, sparse matrix, dataframe} The data matrix. y : {ndarray, dataframe, series} The target. frame : dataframe or None A dataframe containing both `X` and `y`. `None` if `output_array_type != "pandas"`. categories : list of str or None The names of the features that are categorical. `None` if `output_array_type == "pandas"`. c3@K|D]}|jdyw)Nutf-8)decode) gzip_filelines r_io_to_generatorz+_liac_arff_parser.._io_to_generators$ 'D++g& & 'ssparsepandas) return_typeencode_nominal attributeszfetch_openml with as_frame=TruedataF)columnscopyT)deeprrr) ignore_index data_typeintegerInt64nominalcategoryindexNz6shape must be provided when arr['data'] is a Generatorr&)r!count)shaper!z-Unexpected type for data obtained from arff: Or )r@zAMix of nominal and non-nominal targets is not currently supported)rI)3rCOO DENSE_GENload isinstancerr rkeysnext DataFrame memory_usagesumr r rr#astypedtypesconcatr r?lowerr0intr ValueErrorr$fromiter itertoolschain from_iterablereshapetuplerr"spr9 coo_matrixr&tocsrr*typeallhstackrtakeasarraypopanyrK))r6output_arrays_typeopenml_columns_infofeature_names_to_selecttarget_names_to_selectrKr8streamr;r<arff_containercolumns_to_selectnamecat categoriespd columns_info columns_names first_rowfirst_df row_bytes chunksizecolcolumns_to_keepdfsr>r,rW column_dtyper/r)r col_namefeature_indices_to_selecttarget_indices_to_selectrJ arff_data_Xr'X_shapeis_classificationis) r_liac_arff_parserrksfn'i (F 2X=%))5??K-89NZZKN02HH( 5 D# c4 T->%> c J X% !"C D">,#?@ \..01 /0 << ]<O))t)488: $Y/ +8T33BS;S3TT()#N6$:IF D JJ T=u EoV   s8q=V]]3q6==1CF  #D 1"e$ MM 2D.t4[AL!!#y0 't ##%2)t $||D1t  2 V$" *,B 1#6* 4%  #H-g6 7% !% 3$  #H-g6 7$  $ i +} LQx2~a58+;;--i8D  4<<'DQ112AQ001A  5 )/ ;TUK)A,'!+G$= >?G $$Q+a.+a.!ABjj%A  A%i1IJA?Y?PQ  4J '/H "  !  " # (11G'H  $8 GG :>>(#;3G!QQY,..s.?A" #S  771:? % A WWQZ1_AX%!UD  az !!E&UL% $ N s+!R. R4R48R9R>) S!A%S c ddl}|D]2}|jdjjds2ni}|D]<} || d} | jdk(rd|| <$| jdk(s8d || <>t |D cic]\} } | |vr| || } } } dd d gd d d dd| d } i| |xsi}|j |fi|} |D cgc]} | c} |_||z}|j Dcgc] }||vs| }}||}tjdfd}|jjD cgc]\} }t||jr| }} }|D]#}||j j#|||<%t%|||\}}|dk(r|||dfS|j'|j'}}|jjD cic]6\} }t||jr| |j(j+8}} }||d|fScc} } wcc} w#t$r!}|jjd|d}~wwxYwcc}wcc}} wcc}} w)a^ARFF parser using `pandas.read_csv`. This parser uses the metadata fetched directly from OpenML and skips the metadata headers of ARFF file itself. The data is loaded as a CSV file. Parameters ---------- gzip_file : GzipFile instance The GZip compressed file with the ARFF formatted payload. output_arrays_type : {"numpy", "sparse", "pandas"} The type of the arrays that will be returned. The possibilities are: - `"numpy"`: both `X` and `y` will be NumPy arrays; - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array; - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a pandas Series or DataFrame. openml_columns_info : dict The information provided by OpenML regarding the columns of the ARFF file. feature_names_to_select : list of str A list of the feature names to be selected to build `X`. target_names_to_select : list of str A list of the target names to be selected to build `y`. read_csv_kwargs : dict, default=None Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite the default options. Returns ------- X : {ndarray, sparse matrix, dataframe} The data matrix. y : {ndarray, dataframe, series} The target. frame : dataframe or None A dataframe containing both `X` and `y`. `None` if `output_array_type != "pandas"`. categories : list of str or None The names of the features that are categorical. `None` if `output_array_type == "pandas"`. rNr4z@datarCrDrErFrGF?%"T\) header index_col na_valueskeep_default_nacomment quotecharskipinitialspace escapecharr!zwThe number of columns provided by OpenML does not match the number of columns inferred by pandas when reading the file.z^'(?P.*)'$cZtj|}||S|jdS)Ncontents)researchgroup) input_stringmatchsingle_quote_patterns rstrip_single_quotesz0_pandas_arff_parser..strip_single_quotess. . = = {{:&&r1r:)r:r5rY startswithrread_csvr?r[errors ParserErrorrcompilerWitemsrPCategoricalDtypertrename_categoriesr0to_numpyrutolist)r6rlrmrnroread_csv_kwargsrvr7rWrsrrdtypes_positionaldefault_read_csv_kwargsr,excrrr}r~rr!categorical_columnsr/r)rurs @r_pandas_arff_parserr7sp ;;w  % % ' 2 27 ; F#&*40=    9 ,#F4L    !Y .%F4L&'':; GT 6> U  " M0L_5JLO BKK 5_ 5E  +>>$> 02HH&+mmPss>O7OsPOP / "E::&=>'!<<--/ D% eR00 1  #K3Z^^556IJc K u&=?U VDAqX%!UD  zz|QZZ\1!<<--/ D% eR00 1 e%%''J az !!W0? ii## @  Q.sHH-H8 H3H88 I%I%"I*);I03H88 I"II"ct|dk(rt||||||S|dk(rt||||||Std|d)a6Load a compressed ARFF file using a given parser. Parameters ---------- gzip_file : GzipFile instance The file compressed to be read. parser : {"pandas", "liac-arff"} The parser used to parse the ARFF file. "pandas" is recommended but only supports loading dense datasets. output_type : {"numpy", "sparse", "pandas"} The type of the arrays that will be returned. The possibilities ara: - `"numpy"`: both `X` and `y` will be NumPy arrays; - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array; - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a pandas Series or DataFrame. openml_columns_info : dict The information provided by OpenML regarding the columns of the ARFF file. feature_names_to_select : list of str A list of the feature names to be selected. target_names_to_select : list of str A list of the target names to be selected. read_csv_kwargs : dict, default=None Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite the default options. Returns ------- X : {ndarray, sparse matrix, dataframe} The data matrix. y : {ndarray, dataframe, series} The target. frame : dataframe or None A dataframe containing both `X` and `y`. `None` if `output_array_type != "pandas"`. categories : list of str or None The names of the features that are categorical. `None` if `output_array_type == "pandas"`. z liac-arffr:zUnknown parser: 'z%'. Should be 'liac-arff' or 'pandas'.)rrr[)r6parser output_typermrnrorKrs rload_arff_from_gzip_filersqv    # "     8 "    # "    x'L M  r1)N)NN)__doc__r]r collectionsrcollections.abcrtypingrnumpyr$scipyrb externalsrexternals._arffrutils._chunkingr r utils._optional_dependenciesr utils.fixesr rndarrayr*r0rrrr1rrs?  #%0??# ! 48  F ! 48 ZZ $L  I"d U"~ P r1