gL iRlpdZddlmZddlmZddlmZmZddlZddlm Z ddl Z ddl m Z ddlmZmZmZmZmZdd lmZmZdd lmZdd lmZddlZdd lmZmZmZdd l m!Z!ddl"m#cm$cm%Z&ddl'm(Z(e r ddl)m*Z*m+Z+m,Z,edZ-edZ.ddZ/ddZ0GddZ1Gdde(ejdZ3y)a Read SAS7BDAT files Based on code written by Jared Hobbs: https://bitbucket.org/jaredhobbs/sas7bdat See also: https://github.com/BioStatMatt/sas7bdat Partial documentation of the file format: https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf Reference for binary data compression: http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm ) annotations)abc)datetime timedeltaN) TYPE_CHECKING) get_option)read_double_with_byteswapread_float_with_byteswapread_uint16_with_byteswapread_uint32_with_byteswapread_uint64_with_byteswap)Parserget_subheader_index)cast_from_unit_vectorized)EmptyDataError) DataFrame Timestampisna) get_handle) ReaderBase)CompressionOptionsFilePath ReadBufferz 1970-01-01z 1960-01-01ct|rtjS|dk(rtdddt |zS|dk(rtdddt |zSt d)Ns)secondsd)dayszunit must be 'd' or 's')rpdNaTrr ValueError) sas_datetimeunits \/mnt/ssd/data/python-lab/Trading/venv/lib/python3.12/site-packages/pandas/io/sas/sas7bdat.py_parse_datetimer'Csa Lvv  s{a#i &EEE a#i\&BBB233cZttz jd}|dk(rNt|jdd}|j d|z}t j||jdStj|d|z}t j|d |jd S) a Convert to Timestamp if possible, otherwise to datetime.datetime. SAS float64 lacks precision for more than ms resolution so the fit to datetime.datetime is ok. Parameters ---------- sas_datetimes : {Series, Sequence[float]} Dates or datetimes in SAS unit : {'d', 's'} "d" if the floats represent dates, "s" for datetimes Returns ------- Series Series of datetime64 dtype or datetime.datetime. rms)r%out_unitzM8[ms]FindexcopyzM8[D]dtypezM8[s]r0r-r.) _sas_origin _unix_originas_unitr_valuesviewr!Seriesr-nparray) sas_datetimesr%tdmillisdt64msvalss r&_convert_datetimesr?Qs$  $ - -c 2B s{*  ! !d X&+yy}':':GGxx W5:yyWM4G4GeTTr(cneZdZUded<ded<ded<ded<ded<ded < d d Zy ) _Columnintcol_id str | bytesnamelabelformatbytesctypelengthcX||_||_||_||_||_||_yN)rCrErFrGrIrJ)selfrCrErFrGrIrJs r&__init__z_Column.__init__ws.      r(N)rCrBrErDrFrDrGrDrIrHrJrBreturnNone)__name__ __module__ __qualname____annotations__rNr(r&rArAosl K    L K      r(rAc8eZdZUdZded<ded< d# d$dZd%dZd%d Zd%d Zd&d Z d&d Z d'd Z d(dZ d)dZ d*dZd+dZd&dZd,dZd&dZd&dZd-dZd-dZd-dZd-dZd-dZd-dZd-dZd-dZd.d/dZdZd'd Zd!Zd0d"Z y)1SAS7BDATReadera Read SAS files in SAS7BDAT format. Parameters ---------- path_or_buf : path name or buffer Name of SAS file or file-like object pointing to SAS file contents. index : column identifier, defaults to None Column to use as index. convert_dates : bool, defaults to True Attempt to convert dates to Pandas datetime values. Note that some rarely used SAS date formats may be unsupported. blank_missing : bool, defaults to True Convert empty strings to missing values (SAS uses blanks to indicate missing character variables). chunksize : int, defaults to None Return SAS7BDATReader object for iterations, returns chunks with given number of lines. encoding : str, 'infer', defaults to None String encoding acc. to Python standard encodings, encoding='infer' tries to detect the encoding from the file header, encoding=None will leave the data in binary format. convert_text : bool, defaults to True If False, text variables are left as raw bytes. convert_header_text : bool, defaults to True If False, header text, including column names, are left as raw bytes. rB _int_lengthz bytes | None _cached_pageNc ||_||_||_||_||_||_||_d|_d|_g|_ g|_ g|_ g|_ g|_ d|_g|_g|_g|_d|_d|_d|_t)|dd| |_|j*j,|_|j0|j2|j4|j6|j8|j:|j<|j>dg |_ |jC|jEy#tF$r|jIwxYw)Nzlatin-1r(rrbF)is_text compression)%r- convert_dates blank_missing chunksizeencoding convert_textconvert_header_textdefault_encodingr]column_names_raw column_namescolumn_formatscolumns%_current_page_data_subheader_pointersrY_column_data_lengths_column_data_offsets _column_types_current_row_in_file_index_current_row_on_page_indexrhandleshandle _path_or_buf_process_rowsize_subheader_process_columnsize_subheader_process_subheader_counts_process_columntext_subheader_process_columnname_subheader#_process_columnattributes_subheader_process_format_subheader_process_columnlist_subheader_subheader_processors_get_properties_parse_metadata Exceptionclose) rM path_or_bufr-r^r_r`rarbrcr]s r&rNzSAS7BDATReader.__init__s` **"  (#6 )-//113&( LN2 /1!/1!*,*+'*+'*+'! u+  !LL//  + +  . .  * *  . .  . .  4 4  * *  . .  & "   "  "  JJL  s $ EE c`tj|jtjS)z5Return a numpy int64 array of the column data lengthsr/)r8asarrayrjint64rMs r&column_data_lengthsz"SAS7BDATReader.column_data_lengthszz$33288DDr(c`tj|jtjS)z0Return a numpy int64 array of the column offsetsr/)r8rrkrrs r&column_data_offsetsz"SAS7BDATReader.column_data_offsetsrr(cjtj|jtjdS)zj Returns a numpy character array of the column types: s (string) or d (double) S1r/)r8rrlr0rs r& column_typeszSAS7BDATReader.column_typess# zz$,,BHHTNCCr(c8|jjyrL)ror~rs r&r~zSAS7BDATReader.closes r(c|jjd|jjd|_|jdt t j t j k7r td|jt jt j}|t jk(r9d|_ d|_ t j|_t j |_n8d|_ t j$|_t j&|_d|_ |jt j(t j*}|t j,k(rt j.}nd}|jt j0t j2}|dk(r d |_t6j8d k(|_nd |_t6j8d k(|_|jt j<t j>d}|t j@vr9t j@||_!|jDd k(r|jB|_"n d|d|_!tGddd}|jIt jJ|zt jL}|tOjP|dz|_)|jIt jT|zt jV}|tOjP|dz|_,|j[t j\|zt j^|_0|jj|j`dz }|xj|z c_t |j|j`k7r td|j[t jb|zt jd|_3y)Nri z'magic number mismatch (not a SAS file?)TF<big>littleinferzunknown (code=)rrr)r%z*The SAS7BDAT file appears to be truncated.)4rqseekreadrYlenconstmagicr# _read_bytesalign_1_offsetalign_1_lengthu64_byte_checker_valueU64rXpage_bit_offset_x64_page_bit_offsetsubheader_pointer_length_x64_subheader_pointer_lengthpage_bit_offset_x86subheader_pointer_length_x86align_2_offsetalign_2_lengthalign_1_checker_value align_2_valueendianness_offsetendianness_length byte_ordersys byteorder need_byteswapencoding_offsetencoding_lengthencoding_namesinferred_encodingrar _read_floatdate_created_offsetdate_created_lengthr! to_timedelta date_createddate_modified_offsetdate_modified_length date_modified _read_uintheader_size_offsetheader_size_length header_lengthpage_size_offsetpage_size_length _page_length)rMbufalign1epochxs r&r{zSAS7BDATReader._get_propertiess q! --2237   QU[[!1 2ekk AFG Gu33U5I5IJ %.. .DH D $)$=$=D !-2-O-OD *DH$)$=$=D !-2-O-OD * D u33U5I5IJ %-- -((FFu668O8OP '>!DO!$%!7D !DO!$(!:D u44e6K6KLQO %&& &%*%9%9#%>D "}}' $ 6 6 '5cU!% S  t  !T%7%7 7IJ J OO  " "V +U-C-C r(c|j|jxsd}|jr|jt|S)Nr)nrows)rr`emptyr~ StopIteration)rMdas r&__next__zSAS7BDATReader.__next__Cs6 YYT^^0qY 1 88 JJL  r(c|jJ|dk(r!t|j||jS|dk(r!t|j||jS|j t d)Nrrzinvalid float width)rYr rr r~r#rMoffsetwidths r&rzSAS7BDATReader._read_floatKs{  ,,, A:+!!64+=+= aZ,!!64+=+=  JJL23 3r(cl|jJ|dk(r|j|ddS|dk(r!t|j||jS|dk(r!t |j||jS|dk(r!t |j||jS|j td)Nrrrrzinvalid int width)rYrr rr r r~r#rs r&rzSAS7BDATReader._read_uintZs  ,,, A:##FA.q1 1 aZ,!!64+=+= aZ,!!64+=+= aZ,!!64+=+=  JJL01 1r(c|jJ||zt|jkDr|jtd|j|||zS)NzThe cached page is too small.)rYrr~r#rMrrJs r&rzSAS7BDATReader._read_bytesnsW  ,,, F?S!2!23 3 JJL<= =  &6/::r(cb|j|j||jdS)N )_convert_header_textrrstriprs r&_read_and_convert_header_textz,SAS7BDATReader._read_and_convert_header_textus0((   VV , 3 3H =  r(cd}|s|jj|j|_t |jdkryt |j|jk7r t d|j }|syy)NFrz2Failed to read a meta data page from the SAS file.)rqrrrYrr#_process_page_meta)rMdones r&r|zSAS7BDATReader._parse_metadatazsw $ 1 1 6 6t7H7H ID 4$$%*4$$%):):: !UVV**,D r(ct|jtjtjtjgz}|j |vr|j |j tjk(}|j tjk(}t|xs|xs|jgk7SrL) _read_page_headerrpage_meta_types page_amd_type page_mix_type_current_page_type_process_page_metadatapage_data_typeboolri)rMpt is_data_page is_mix_pages r&rz!SAS7BDATReader._process_page_metas   " "e&9&95;N;N%O O  " "b (  ' ' )..%2F2FF --1D1DD   @ @99R?  r(c|j}tj|z}|j|tjtj z|_tj|z}|j|tj|_ tj|z}|j|tj|_ yrL) rrpage_type_offsetrpage_type_lengthpage_type_mask2rblock_count_offsetblock_count_length_current_page_block_countsubheader_count_offsetsubheader_count_length_current_page_subheaders_count)rM bit_offsettxs r&rz SAS7BDATReader._read_page_headers**  # #j 0 OOB 6 6 7%:O:O O  % % 2)-U=U=U)V&  ) )J 6.2oo ,,/ +r(c8|j}t|jD]u}tj|z}||j |zz}|j ||j}||jz }|j ||j}||jz }|j |d}|dz }|j |d}|dk(s|tjk(r|j||j} t| } |j| } | t|tjdfv} |tjk(} |jr#| r!| r|jj!||fO|j#t%d| | ||xy)NrrzUnknown subheader signature )rrangerrsubheader_pointers_offsetrrrXtruncated_subheader_idrrrzcompressed_subheader_idcompressed_subheader_typer]riappendr~r#)rMrir total_offsetsubheader_offsetsubheader_lengthsubheader_compressionsubheader_typesubheader_signaturesubheader_indexsubheader_processorf1f2s r&rz%SAS7BDATReader._process_page_metadatas** t::;& HA44zAF!D$B$BQ$FFL#|T=M=MN  D,, ,L#|T=M=MN  D,, ,L$(OOL!$D ! A L!__\1=N!A%(E,H,HH"&"2"23CTEUEU"V 12EFO"&"<"<_"M "**u/L/La.PP#u'F'FF##r>>EE)+;<JJL$67J6KL$$46FGM& Hr(ct|j}|}|}|jr |dz }|dz }n |dz }|dz }|j|tj|zz||_|j|tj |zz||_|j|tj|zz||_ |j|tj|zz||_ tj|z}|j||z||_ |j|d|_|j|d|_y)Niiibizr)rXrrrrow_length_offset_multiplier row_lengthrow_count_offset_multiplier row_countcol_count_p1_multiplier col_count_p1col_count_p2_multiplier col_count_p2'row_count_on_mix_page_offset_multiplier_mix_page_row_count_lcs_lcp)rMrrJint_len lcs_offset lcp_offsetmxs r&rrz)SAS7BDATReader._process_rowsize_subheaders1""  88 # J # J # J # J// U77'A A   U66@ @  !OO U22W< O>O$P!-E +r(c|j}||z }|d|zz dz dz}t|D]"}|tj|dzzztjz}|tj|dzzztj z}|tj|dzzztj z}|j|tj} |j|tj} |j|tj} |j| } | | | | z} |jj|j| %y)Nr rr)rXrrcolumn_name_pointer_length!column_name_text_subheader_offsetcolumn_name_offset_offsetcolumn_name_length_offsetr!column_name_text_subheader_lengthcolumn_name_offset_lengthcolumn_name_length_lengthrerfrr)rMrrJrcolumn_name_pointers_countrtext_subheadercol_name_offsetcol_name_lengthidx col_offsetcol_lenname_rawcnames r&rvz,SAS7BDATReader._process_columnname_subheader"s`""'&,q7{&:R&?A%E"12 GA22a!e<=99:  22a!e<=112  22a!e<=112  // G GC!@!@Jooou7V7VWG,,S1HZ*w*>?E    $ $T%>%>u%E F7 Gr(c||j}|d|zz dz |dzz}t|D]}||ztjz||dzzz}|d|zztjz||dzzz}|d|zztj z||dzzz}|j ||} |jj| |j |tj} |jj| |j |tj} |jj| dk(rdndy)Nrr,rrds) rXrrcolumn_data_offset_offsetcolumn_data_length_offsetcolumn_type_offsetrrkrcolumn_data_length_lengthrjcolumn_type_lengthrl) rMrrJrcolumn_attributes_vectors_countrcol_data_offset col_data_len col_typesrs r&rwz2SAS7BDATReader._process_columnattributes_subheaderCsO""+1AK+?"+D'TU+*V'67 @A 5#B#BBQ'TU+EVV g+112w{#$ W$u'?'??!wQR{BSS 9A  % % , ,Q / e.M.MNA  % % , ,Q / 5+C+CDA    % %a1fd$ ?+ @r(cyrLrUrs r&ryz,SAS7BDATReader._process_columnlist_subheader]s r(c |j}|tjzd|zz}|tjzd|zz}|tjzd|zz}|tj zd|zz}|tj zd|zz}|tjzd|zz} |j|tj} t| t|jdz } |j|tj} |j|tj} |j|tj}t|t|jdz }|j|tj }|j| tj"}|j|}|j%||||z}|j| }|j%|| | | z}t|j&}t)||j*||||j,||j.|}|j0j3||j&j3|y)Nr)rXr)column_format_text_subheader_index_offsetcolumn_format_offset_offsetcolumn_format_length_offset(column_label_text_subheader_index_offsetcolumn_label_offset_offsetcolumn_label_length_offsetr)column_format_text_subheader_index_lengthminrrecolumn_format_offset_lengthcolumn_format_length_length(column_label_text_subheader_index_lengthcolumn_label_offset_lengthcolumn_label_length_lengthrrhrArfrlrjrgr)rMrrJrtext_subheader_formatcol_format_offsetcol_format_lentext_subheader_labelcol_label_offset col_label_lenr format_idx format_start format_len label_idx label_start label_len label_names column_label format_names column_formatcurrent_column_numbercols r&rxz(SAS7BDATReader._process_format_subheaderasX"" UDD Dq7{ R #U%F%FFWT%"C"CCa'kQ UCC Ca'k Q "E$D$DDq7{R!A!AAAKO OO !5#R#R C 5 56:;  u@@ __^U5V5VW OO %"P"P  3t'<'<#=#AB oo e>> OOM53S3ST ++I6 00  kI&= > ,,Z8 11  z(A B !$DLL 1 !   3 4     4 5  % %&; <   ""=1 C r(c||j |j}n| |j}t|jdk(r|j t d|dkDr#|j |jk\r tSt||j|j z }|jjd}|jjd}tj||ft|_ tj|d|zftj|_d|_t%|}|j'||j)}|j*|j-|j*}|S)NrzNo columns to parse from filer>r?r/r)r`r rrlr~rrmrrScountr8robject _string_chunkzerosuint8 _byte_chunk_current_row_in_chunk_indexrr_chunk_to_dataframer- set_index)rMrndnsprslts r&rzSAS7BDATReader.reads7 M :NNE ]NNE t!! "a ' JJL !@A A 1988DNNJ; E4>>D,K,KKL    % %d +    % %d +XXr5k@88RUO288D+,( 4L u '') :: !>>$**-D r(ctg|_|jj|j|_t |jdkryt |j|jk7rC|j dt |jdd|jdd}t||j|jtjvr|j|jtjtjtjgzvr|jSy)NrTz-failed to read complete page from file (read rz of z bytes)F)rirqrrrYrr~r#rrrrrrr_read_next_page)rMmsgs r&rzzSAS7BDATReader._read_next_pages572 --2243D3DE t  !Q & "" #t'8'8 8 JJL?t(()!,D1B1B10EWN S/ !   " "e&;&; ;  ' ' )  " "%*?*?    C +  '') )r(cT|j}|j}t||z |}i}d\}}td}t|jD]}|j |} |j |dk(r|j|ddfj|jdz} tj| tj|d|| <|jrc|j|t j"vrt%|| d|| <n1|j|t j&vrt%|| d|| <|d z }|j |d k(rtj|j(|ddf|d || <|j*rF|j,:|j/|| j0|| <|r|| j3d || <|d z }|j5t7d t9|j |t;||j |d} | S)N)rrzfuture.infer_stringr>rr/Fr1rrr?r,strzunknown column type )rhr-r.)rrrmrrrrfrlrqr6rr!r7r8float64r^rgrsas_date_formatsr?sas_datetime_formatsrnrbra_decode_stringr}astyper~r#reprr) rMnmixrxjsjb infer_stringjrEcol_arrdfs r&rsz"SAS7BDATReader._chunk_to_dataframes   , ,  + + 1q5!_B!"78 t(() WA$$Q'D!!!$,**2q5166T__s=R6SYYwbjjQVWT %%**1-1G1GG%7T C%HT ,,Q/53M3MM%7T C%HT a##A&$.YYt'9'9"a%'@QVWT $$$--*C!%!4!4T$Z^^!DDJ#%)$Z%6%6u%=T a  #7T=O=OPQ=R8S7T!UVV- W0tT%6%6bu M r(cT|j|jxs |jSrL)decoderardrMbs r&rzSAS7BDATReader._decode_strings xx >)>)>??r(c@|jr|j|S|SrL)rcrrs r&rz#SAS7BDATReader._convert_header_texts!  # #&&q) )Hr()NTTNNTTr)rzFilePath | ReadBuffer[bytes]r^rr_rr` int | Noneraz str | Nonerbrrcrr]rrOrP)rOz np.ndarray)rOrP)rOr)rrBrrB)rrBrrBrOrB)rrBrJrB)rrBrJrBrOrD)rOr)rrBrJrBrOrPrL)rrrOr)rrHrOrD)!rQrRrS__doc__rTrNrrrr~r{rrrrrr|rrrrrrsrtrurvrwryrxrrzrsrrrUr(r&rWrWs#< "" $#!$(*1=1= =  =  ==="=(= =~EEDD L 42(; -    )HV3< +QZGB@4 5!n>2!F@r(rW)r$floatr%r})r: pd.Seriesr%r}rOr)4r __future__r collectionsrrrrtypingrnumpyr8pandas._configrpandas._libs.byteswapr r r r r pandas._libs.sasrrpandas._libs.tslibs.conversionr pandas.errorsrpandasr!rrrpandas.io.commonrpandas.io.sas.sas_constantsiosas sas_constantsrpandas.io.sas.sasreaderrpandas._typingrrrr3r2r'r?rAIteratorrWrUr(r&rs# %E( (++.&  %  4U<6p Zp r(