K i ddlZddlZddlZddlZddlZ ddlZddlZddl m Z ddl Z ddl mZddlmZmZmZmZmZmZddlmZddlmZ ddlmZddlmZm Z m!Z! ddl"Z#ddl$m%Z&ejNj8ejNjPgZ)dZ*ejNjDdZ+ejNjDdZ,ejNjDd Z-ejNjDd Z.ejNj_e0e1fd ejNjDd Z2ejNjDdZ3ejNjDdZ4ejNjDdZ5ejNjDdZ6ejNjDdZ7ejNjDejNjqddgdggejrddkejrdddkejrddjue jvdkfejNjqdddZk(d?@dAZ^ejNjDejNj|dBZ_ejNjDdCZ`dUdDZaejNjDdEZbejNjDdFZcdGZddHZedIZfdJZgdKZhdLZidMZjejNjqdNdOdPZkejNjDdQZly#e$rdZYwxYw#e$rdZYlwxYw#e$rdxZ#Z&YqwxYw)VN) FileSelector FileSystemLocalFileSystem PyFileSystemSubTreeFileSystem FSSpecHandler)util)guid) _read_table_test_dataframe _write_tablectjdgdi}|dz }|j|dz }tj|t |tj |t}|j|sJtj dtj|}|j|sJy)Nadata_dir data.parquet filesystemzdata_dir/data.parquet) patablemkdirpq write_tablestr read_tablerequalsr _filesystem_uri)tempdirr directorypathresults h/mnt/ssd/data/python-lab/Trading/venv/lib/python3.12/site-packages/pyarrow/tests/parquet/test_dataset.pytest_filesystem_urir&;s HHc9% &E*$I OO ~ %DNN5#d)$]] *,F ==  ]]D,@,@,IKF ==  c0t}t||yN)r_partition_test_for_filesystem)r!locals r%test_read_partitioned_directoryr,Ns  E"5'2r'ct}|}t||tj|}|j dg}|j dgk(sJy)Nvaluescolumns)rr*rParquetDatasetread column_names)r!r+ base_pathdatasetr$s r%'test_read_partitioned_columns_selectionr6TsS  EI"5)4 *G \\8*\ -F   8* ,, ,r'c ht}|}ddg}gd}ddg}d|gd|gd|gg}tjtj|d j d tj tj tj|t d d tj tj tj|d d dtjdd}t||||tj||gd}|j} | jjd} d| djvsJd| djvsJd| djvsJgdddgg} tj||| }|j} | jjd} | ddk(| ddk7z| ddk(z} tj| ddk(| ddk(z} | j!dkDsJ| j!dkDsJ| j"d| j!| j!zk(sJdggdggfD]9} tj||| }|jj$dk(r9Jy)NrrrbcTFintegerstringbooleani4dtyperboolrr;r<r=r.))r;=r)r<!=r9)r===Truerfiltersdropr9)r;rFr)r=rHFalserIrN)r<rHs1a)r<rHz1a)rpd DataFramenparrayrepeattileobjectarange_generate_partition_directoriesrr1r2 to_pandas reset_indexr.sumshapenum_rows)r!r+r4 integer_keys string_keys boolean_keyspartition_specdfr5r result_dfrK df_filter_1 df_filter_2s r%test_filters_equivalencyreas  EIq6L!K%=L L! ; L!N 88L5<r?indexrirkr/)rirrJbyTrLrrrrOrPrQrVrRrWrr1r2rX sort_valuesrYmapintr. r!r+r4r]r`Nrar5rrbx result_lists r%%test_filters_cutoff_exclusive_integerrxs  EI"L \"N A 1HH\6$ &B $E9nbIe  G LLNE""{g{."{{-"#y'<'C'CDE1EKE 1a&  Fs# C8z5Loss of type information in creation of categoricals.)raisesreasonc 8t}|}tjdddtjdddtjdddtjdddtjdddg}d|gg}d }tjt j |t j|d d d dg}t||||tj||ddg}|j}|jjd jd} tjt jtjdddgd t j|d } | dj | k(sJy)Nirh datesrB datetime64r?)rkrrkr/)rrlz 2018-04-12)rrmz 2018-04-10rJrnTrL categories)rdatetimedaterOrPrQrVrRrWrr1r2rXrqrY Categoricalr.) r!r+r4 date_keysr`rurar5rrbexpecteds r%&test_filters_cutoff_exclusive_datetimerss  EI  dAq! dAr" dAr" dAr" dAr" I )N A 1)<8! #B $E9nbIe ( ( G LLNE""{g{."{{-~~ (--a,-\B88I\:r?rjrkr/)rirr)riz>=rrJrnTrLrrrprts r%test_filters_inclusive_integerr s  EI"L \"N A 1HH\6$ &B $E9nbIe ! ! G LLNE"++)+4+($'sIj,A,H,H#IJa3q6JKJ 1a&  Ks#Dc  t}|}ddg}gd}ddg}d|gd|gd|gg}tjtj|d j d tj tj tj|t d d tj tj tj|d d dtjdd}t||||tj||dg}|j} | jjd} d| djvsJd| djvsJd| djvsJtj||dddgfddddhfg}|j} | jjd} d| djvsJd| djvsJd| djvsJy)Nrrr8TFr;r<r=r>r?rArBrrCrrDrE)r<inabrJrLrr9r:r)r<rrr9znot inrN)rrOrPrQrRrSrTrUrVrWrr1r2rXrYr.) r!r+r4r]r^r_r`rar5rrbs r%test_filters_inclusive_setr.s   EIq6L!K%=L L! ; L!N 88L5<r?rjrkr/)rirrrJ)riz= " "a '' ' +0*4dQC)@(ACG * +,||~&&!+++,,'== ==,,s$F>F FF FF$ct}|}gd}d|gg}d}tjtj|tj |ddddg}t ||||d }tjt| 5tj||d g jdddy#1swYyxYw) NrgrirBr>r?rjrkr/z1No match for FieldRef.Name\(non_existent_column\)match)non_existent_columnrlrrJ) rrOrPrQrVrRrWrryrrr1r2)r!r+r4r]r`ruramsgs r%test_filters_invalid_columnrs  EI"L!<01N A 1HH\6$ &B $E9nbI >C z -N )#B"E GGKtvNNNs (B88CrK)rirlrrirnestedrr9 read_method)r read_pandasc tt|}t}|}gd}d|gg}t|}t j t j|t j|dt jt|D cgc]} | t| dc} d} t|||| t||} ||fi| } | jdk(sJycc} w) Nrgrir>r?r)rkrirrJr)getattrrrlenrOrPrQrVrRrrrWdictr\) r!rKrr2r+r4r]r`ruirakwargsrs r%test_filters_read_tablers 2{ #D  EI"L \"N LA 1HH\6((58Da!#a&1DE B $E9nbI UG 4F  %f %E >>Q  EsCcht}|}ddg}d|gg}d}tjtj|tj |ddddg }t ||||tj|}|j}|jdj|k(sJy) N2019_22019_3 year_weekrrUr?)rkrrkr/) rrOrPrQrVrRrWrr1r2rr) r!r+r4r^r`rurar5r$s r%$test_partition_keys_with_underscoresrs  EIX&K k"N A 1XXk:% 'B $E9nbI *G \\^F == % / / 1[ @@ @r'c|\}}|dz}tjdgdi}t|||t||}|j |sJyNz /test.parquetrrrrrr r r)s3_example_s3fsfsr#rr$s r%test_read_s3fsrsRHB / !D HHc9% &E, " -F ==  r'c|\}}|dz}tjdgdi}t|||t||}|j |sJyrr)rrr"r#rr$s r%test_read_directory_s3fsrsR#MB  &D HHc9% &E, r 2F ==  r'ct|dz }tjdgdi}t||t j |gj }|j|sJy)Nrrr)rrrr rr1r2r)r! data_pathrr$s r%test_read_single_file_listrs[Gn,-I HHc9% &E "    { + 0 0 2F ==  r'c&|\}}t||yr))r*rrr#s r%$test_read_partitioned_directory_s3fsrsHB"2t,r'c ddg}gd}d|gd|gg}d}tjtj|tj|dj d tj tj tj|td d tjj|d gd  }t||||tj||}|j}|jjdj!d} |jdj!dj#| j$ } | dj'd| d<| dj'd| d<| j$gdk(j)sJt+j,| | y)Nrrr8foobarrDr>r?rArBr)rkrrr.r/rrkrnTrLcategory)rkr.rr)rOrPrQrVrRrSrTrUrandomrandnrWrr1r2rXrqrYreindexr0astypealltmassert_frame_equal) rr4foo_keysbar_keysr`rurar5rrb expected_dfs r%r*r*s1vHH  N A 1xx-44R8wwrwwrxx?CQG))//!$  1  2B$B >2F b9G LLNE"++)+4+(>>W>-KTK*GI$5$5G6%U+22:>K$U+22:>K   !B B G G II I)[1r'c ttsttt t dt ddfd|dgy)Npathsepsep/cZ|\}}|D]v}|||fgz}jt||d|g}j||dz k(rddlm}j|t g} t |} tjj| } j| 5} t| | dddj| j|jk7sJj| j|jk(sJj|dg} j| 5} ddd<||dz|j|dg} j| 5} dddyy#1swYxYw#1swYxYw#1swYxYw)NrFrr)FileType_SUCCESS)joinr create_dir pyarrow.fsrr _filter_partitionrTable from_pandasopen_output_streamr get_file_infotypeNotFoundFile)base_dirlevel part_keysnamer.valuethis_part_keys level_dirr file_path filtered_df part_tablef file_successDEPTH _visit_levelrarr`rs r%rz5_generate_partition_directories.._visit_level2s%e, f E&4-8N H &%!&I MM) $ !/#LL)TV)<= /NC XX11+> **950 Q/0'' 2778;L;LLLL'' 2778==HHH&||Y ,CD **<8AY >B&||Y ,CD **<8A7  00  s$/ F<F9F F F F* r) isinstancerrrrr)rrr`rarrrs` ``@@@r%rWrW'sX b* % -+ ,  Eb)WR%<=G@1b!r'cDtjt|t}g}|D]`\}}|j |t |t jt j frtj|}||||k(z}b||j|dS)Nr?r)axis) rQonesrrCappendrrrrO TimestamprM)rar predicateto_droprrs r%rrUsBt,IG ' et ehmmX->->? @LL'ERX&& ' i=  gA  ..r'c6|dz }|jtjjt j dgdi}t j||dz |dz }|jtjjt j dgdi}t j||dz t j|dgg}|jdjtjgdgsJy) NzA=0BrrzA=1r8)ArHrr) rrrrrOrPrrrrr chunked_array)r!dir1table1dir2table2rs r%"test_filter_before_validate_schemares U?DJJL XX ! !",,Y/?"@ AFNN64.01 U?DJJL XX ! !",,_/E"F GFNN64.01 MM'^,<+= >E <<  # #B$4$4i[$A BB Br'crd}d}|tz }|jg}g}t|D]}t||}|dj t j |d<||dz }tjj|} t| ||j| |j||dz jdd} | |} tj|} | j| sJd d d | jd z g} | Dcgc]}| j!|j"}}t%j&|| }tjj)| Dcgc]}| j+|c}|| j,j.} |j| sJt%j&|dt|j0ddddf}|tdz }tjj|}t||ycc}wcc}w)Nr}rBseeduint32.parquetz _SUCCESS.crcTcTtj|fi|}|j||S)N)r0 use_threads)rr1r2)pathsr0rrr5s r%read_multiple_filesz5test_read_multiple_files..read_multiple_filess*##E4V4||G|EEr'rrrr/)namesmetadata)rrh)NT)r rrr rrQint64rrrr rtouch concat_tablesr num_columnsfieldrrr from_arraysrschemariloc)r!nfilessizedirpath test_datarrrar#rrr$rto_read col_namesout bad_applebad_apple_pathts r%test_read_multiple_filesr'{s F DG MMOI E 6]  T *(|**28848 A3h'$$R(UD! T  ~$$&F! 'F *H == "" "!Q**Q./G/67!a%%7I7 -- 3Cxx##w$G!V]]1%5$G*3-3]]-C-C$EH ::h  MM't, 1-221bqb59I$&22N Y'AN#8$Gs "H/3H4cd}d}|tz }|jg}g}g}t|D]}t||}t j ||z|dz|z|_d|j _||dz } tjj|} t| | |j| |j||j| tj|} ddg} | j| j!} t#j$|Dcgc]}||  c}}t'j(| || jt+| j!} | j,|j,k(sJt'j(| j/|j0|ycc}w) NrBr rrkr uint8stringsr/)r rrr rQrVrkrrrrr rrr1rrXrOconcatrrrr[rr0)r!rrrr framesrrrar#rr5r0r$rvrs r%test_dataset_read_pandasr-s F DG MMOI F E 6]  T *99QXA~6 A3h'$$R(UD! b T (G "G   1 ; ; =Fyyf5!G*56H&(+ W 6 @ @ BF <<8>> )) )&..1A1A.BHM6s, G c,|tz }|jtdd}|dz }tjj |}t ||dtj|d}|jj|sJy) Nr}rr  0.parquet2.6versionT) memory_map) r rr rrrr rr1r2r)r!rrar#rr5s r%test_dataset_memory_mapr4s~G MMO ! $B [ D HH  $Ee,D"G <<>  '' 'r'c|tz }|jtdd}|dz }tjj |}t ||dtjt5tj|ddddd D]:}tj||}|jj|r:Jy#1swYIxYw) Nr}rr r/r0r1i) buffer_size)i)r rr rrrr rryrrr1r2r)r!rrar#rr6r5s r%#test_dataset_enable_buffered_streamr8sG MMO ! $B [ D HH  $Ee, z "&   &&#, ## .||~$$U+++, &&s 6CCc|tz }|jtdd}|dz }tjj |}t ||ddD]d}tj||}|jj|sJtj||}|j|rdJy) Nr}rr r/r0r1)TF) pre_buffer) r rr rrrr rr1r2rr)r!rrar#rr:r5actuals r%test_dataset_enable_pre_bufferr< sG MMO ! $B [ D HH  $Ee,#$ ##  ,||~$$U+++w:>}}U### $r'cg}g}t|D]C}t||}||dz }|jt|||j|E|S)Nr r )rr rr )r4r file_nrowsr rrrar#s r%_make_example_multifile_datasetr?scI E 6] Za 0aS>)b$/0 T  Lr'c|Dcgc]}t|j}}t|t|jk(sJycc}wr))ras_posixrfiles)r5rr#s r%_assert_dataset_pathsrC+s@.3 4dS ! 4E 4 u:W]]+ ++ + 5s A  dir_prefix_.c|tz }|jt|dd}||dz jtj|}t ||y)Nr}rBrr>stagingr rr?rr1rCr!rDrrr5s r%test_ignore_private_directoriesrL0s^G MMO +GB78 :E *W%%,,.(G'5)r'c|tz }|jt|dd}|dz jd5}|j dddd|dz jd5}|j ddddt j |}t||y#1swYYxYw#1swY7xYw)Nr}rBrHz .DS_Storewbs gibberishz.privater rr?openwriterr1rCr!rrrr5s r%test_ignore_hidden_files_dotrSAsG MMO +GB78 :E K  % %d +q   J  $ $T *a  (G'5)B*.B6*B36B?c|tz }|jt|dd}|dz jd5}|j dddd|dz jd5}|j ddddt j |}t||y#1swYYxYw#1swY7xYw)Nr}rBrH_committed_123rNsabcd _started_321rOrRs r%#test_ignore_hidden_files_underscorerXTsG MMO +GB78 :E $ $ * *4 0A  N " ( ( .! (G'5)rTc||dz tz }|jdt|dd}tj|}t ||tj|}t ||y)NdataTparentsr}rBrHrJrKs r%/test_ignore_no_private_directories_in_base_pathr]gss :,d++df4G MM$M +GB78 :E&G'5)(G'5)r'c dgdzdgdzz}tjtjtt |tj|j gddg}t j|t|dg|dz }|jt j|t|dgt j|d g }|j|sJy) Nxxxryyyrk_partrpartition_cols_private_duplicate_private)ignore_prefixes) rrrRrrdictionary_encoderwrite_to_datasetrrrr)r!partrprivate_duplicater2s r%test_ignore_custom_prefixesrlzs 7Q;%1 $D HH s4y!" ((*w  !E s7|WIF"66s#45(/y2 ==* /D ;;u  r'c|dz }|jtj|}|j}|jdk(sJ|j dk(sJy)Nr5r)rrr1r2r\r)r! empty_dirr5r$s r%test_empty_directoryrosW)#I OO *G \\^F ??a      "" "r'c ddl}ddlm}ddlm}|j t dt dt tdtjgdztjdddjd d }|jj}d d g} tjj!||d d } |j"| || |t$j&j)t+|d} |9|j-| d5} |j.| j0| dddn|j.| j0| |j2||} t5| j0j6}|t5| j0j6k(sJ| j9}|j;}|jj}| |dt=| zdk(sJ||}| D]}||jd||<|r@|j?dj@jC}|dj||d<|jD||y#1swY*xYw)Nr aaabbbbccc eefeffgeeer} 2017-01-01 2017-01-11 datetime64[D]r?datetime64[ns])group1group2numnanrrwrxF)rsafepreserve_indexr_common_metadatarNrr)#pandaspandas.testingtestingpyarrow.parquetparquetrPlistrrQrzrVrr0tolistrrrriosr#rrrPwrite_metadatarr1rrr2rXrrrto_pandas_dtyper)r4rr index_namerOrr output_dfcols partition_by output_table metadata_pathrr5 dataset_cols input_tableinput_df input_df_colscolexpected_date_types r%&_test_write_to_dataset_with_partitionsrs]  |$|$E"Ix"} , OLSS  I    # # %Dh'L88'' &u7<(>LB i#-/GGLLY1CDM __]D 1 6Q B  l111 5 6 6 ,--}=b +57Gw~~++,L 3|22889 99 9,,.K$$&H$$++-M =c,.?)?)@A AA A~H;"3..z: #;#\\&166FFH%f-445GH &B)X.= 6 6s I<<Jc ddl}ddlm}|jt dt dt t dt jdddjd d }|jj}tjj|}| t}n$t|t st#t%|}d }t |D]}|j&||| t)t+|d d} |j-| } | D cgc] } | j.j1ds| "} } t3| |k(sJ|j4|| j7} | j9}|j;}||}t=j>||ycc} w)Nrrqrrr}rsrtrur?rv)rwrxryrrBrFT)allow_not_found recursiver ) rrrrPrrrQrVrr0rrrrrrrrrrirrrr#endswithrr1r2rXdrop_duplicatesrr)r4rrOrrrrnrselectorinfosinfo output_filesrrs r%$_test_write_to_dataset_no_partitionsrs  |$|$E"I , OLSS  I    # # %D88'' 2L$&  J /!- ";<  A 1X3L)'1 33C NE&*,H  $ $X .E%*MTdii.@.@.LDMLM |  !! !$"##j df$$&H'')H~H)X.Ns ( F= F=c,tt|yr)rrr!s r%%test_write_to_dataset_with_partitionsrs*3w<8r'c tjtjdtjtjdtjtjdtjtjdtj tjdtj dg}tt|| y) Nrw)rrxryrzrus)unitr) rrrr<rint32 timestamprr)r!rs r%0test_write_to_dataset_with_partitions_and_schemar s YY < <RXXZ8RXXZ8bll.EF HIF + G V%r'c0tt|dy)Nr)rrrs r%4test_write_to_dataset_with_partitions_and_index_namers* G /r'c,tt|yr))rrrs r%#test_write_to_dataset_no_partitionsrs(W6r'c<t|dz t|dz y)Ntest1test2)rrrs r%test_write_to_dataset_pathlibr s*7W+<=(7):;r'c|\}}tjtd5t|dz |dddtjtd5t |dz |dddy#1swY>xYw#1swYyxYw)Nz"path-like objects are only allowedrrrr)rryrrr)r!rrrEs r%&test_write_to_dataset_pathlib_nonlocalr&s EB y(L M.. g " .. y(L M., g " ... ....sA/A;/A8;Bwin32z,test fails because of unsupported characters)rzc(|\}}t||yNr)rrs r%*test_write_to_dataset_with_partitions_s3fsr5s HB* r'c(|\}}t||yr)rrs r%(test_write_to_dataset_no_partitions_s3fsrAsHB( r'ctjdgdi}tjj |}t |}t j||tt j|}|j|sJy)Nrrr) rOrPrrrrrrirrr)r!rarr#r$s r% test_write_to_dataset_filesystemrJsg sI& 'B HH  $E w.is_pickleabless$m))-*=*=c*BCCCr')r)r!rrr5s ` r%test_pickle_datasetrqs#D)1G  !! !r'cH|dz }tjgdgdgdd}tjj |}t j |t|ddgt j|j}t j||d z y) Nz ARROW-3208)r~r}g@drg333333=@)r~r}rrrrr~)rrrrrrr)onetwothreerr) root_pathrdzoutput.parquet) rOrPrrrrrirr1r2r)r!r#rars r%test_partitioned_datasetrzs \ !D 0,& B HH  $ET(-u~7   d # ( ( *ENN5$!112r'c|dz }tjtdDcgc]}tjdc}dzgdg}tjtdDcgc]}tjdc}dzgdg}t j |t|t j |t|t j|dgj}|djdj|djdjg}|djd k(sJ|djd|djd }}|j|dr|j|d sJy|j|d sJ|j|dsJycc}wcc}w) NzARROW-3325-datasetrBr}f0rb)r)read_dictionaryrrr)rrrr randsrrirr1r2chunkrh num_chunksr) r!r#rt1t2r$ ex_chunksc0c1s r%test_dataset_read_dictionaryrs ) )D E!H5qDJJrN5:;D6 JB E!H5qDJJrN5:;D6 JBc$i0c$i0    tf&&*df AQ113AQ1135I !9  1 $$ $ AY__Q !3B yy1yy1&&&yy1&&&yy1&&&%65s G(Gctjdtjgdtji}t j ||dz t j ||dz tj dg}t j|dz |}tjdgdi|}|j|sJt j||}tjdgdi|}|j|sJt j||}tjdgdi|}|jj|sJy)Nrrz data1.parquetz data2.parquet)rrr)rrrrrr) rrrRrrrrrrr1r2)r!rrr$rs r%test_read_table_schemars HHc288Irxxz:; .file_visitors\../r'zpart-{i}.parquet)rrbasename_template1zpart-0.parquet23)rrrrirrrpathlibPath)r!rr#rrexpected_pathspaths_written_setrs @r%.test_parquet_write_to_dataset_exposed_keywordsr s HHc9% &E ^ #DM0+t3%%1*;= s %% s %% s %%N C m<=  .. .r'write_dataset_kwarg))rT)rFcddlm}tjdgdi}|dz }t j |j }|\}}|t j tjjvsJ||jvsJtjj|dd5}tj||fi||i|jd\} } } | ||k(sJ dddy#1swYyxYw) zEVerify kwargs in pq.write_to_dataset are passed onto ds.write_datasetrNrrz out.parquet write_datasetT)autospec)rr5rrinspect signaturerrri parametersmockpatchrU mock_calls) r!r rrr#rkeyargmock_write_dataset_name_argsrs r%#test_write_to_dataset_kwargs_passedrs ! HHc9% &E ] "D!!""2"23I"HC g''(;(;<GG GG G )&& && &   2  >"! E46C:61<r8sJ$  HH 44 kk!!6;;#6#6 7  &33  - -C,C,L!!B ~ & B  '1 '1T = = !!B+4+4\%,%,PNN./0012""((:.2""((8S1A5""((8S166xrxxzBQF HI (EFG I4AA.      -- $2N+"\ / CC*5$5$N"N"NJ ( (,,&$$" , Sz2 *3 ***$**$Sz2*3*"*#7;266::/|59+/\99%%// 77<<  . .CLLG+IKK   8"" 3 3 '.*. ,@0 1"/0.1" "*""c( B  BNBs5Z/ Z= [ /Z:9Z:=[[ [[