L ilddlZddlZddlmZddlmZddlmZddlm Z m Z ddl Z ddl Z ddlmZmZmZmZmZmZmZmZmZmZmZddlmZmZmZmZmZm Z erddl!Z"ddl#Z"e"jHjJZ&erdd l'm(Z(e&jRe(jTe&jVe(jVe&jXe(jXe&jZe(jZe&j\e(j\e&j^e(j^iZ0niZ0erddl1Z1ejde3Z4e d e jjd e6d e6e jje6d fZ7Gd d eZ8GddeZ9GddeZ:e;edZ?GddeZ@dZAdZBde6fdZCdZDdZEdZFde jjdeGfdZHdTde=de6e7fd ZI dTde e6e7e7fde=de7fd!ZJ dTde e6e7e7fde=de6e7fd"ZKde jjfd#ZL dUde jjd$e e e=eMe=d%ffde8fd&ZN dUde jjd'e e e8ee e e;et|rtjSt|rtjSt |rtj St|rtjSt|rtjStdt|)NzUnrecognized image type ) r7r9r3rr?r r@r rAr rB ValueErrortypeimages r&get_image_typerHosuE}}ueE###U}} /U }= >>r%ct|xs2t|xs%t|xst|xs t |Sr1)r7r rr r r5s r&is_valid_imagerJ}s8   vs 3 vs7K v|\_O` vdqrudvvr%imagesc.|xrtd|DS)Nc32K|]}t|ywr1)rJ).0rGs r& z*is_valid_list_of_images..sDE./Dall)rKs r&is_valid_list_of_imagesrSs  DcDVDDDr%c8t|dtr|Dcgc] }|D]}| c}}St|dtjrtj|dSt|dt j rt j|dSycc}}w)Nraxis)dim)r2listnpndarray concatenater;Tensorcat) input_listsublistitems r&concatenate_listras~*Q-&$.C7C4CCC JqM2:: .~~jq11 JqM5<< 0yy++ 1DsBcrt|ttfr|D]}t|ryyt |syy)NFT)r2rXtuple valid_imagesrJ)imgsr6s r&rdrds?$u & C$  D ! r%cLt|ttfrt|dSy)NrF)r2rXrcrJr5s r& is_batchedrgs"#e}%c!f%% r%rGreturnc|jtjk(rytj|dk\xrtj|dkS)zV Checks to see whether the pixel values have already been rescaled to [0, 1]. Frr)dtyperYuint8minmaxrFs r&is_scaled_imagerns> {{bhh 66%=A  4"&&-1"44r%expected_ndimsc (t|r|St|r|gSt|rU|j|dzk(r t |}|S|j|k(r|g}|St d|dzd|d|jdt dt |d)a Ensure that the output is a list of images. If the input is a single image, it is converted to a list of length 1. If the input is a batch of images, it is converted to a list of images. Args: images (`ImageInput`): Image of images to turn into a list of images. expected_ndims (`int`, *optional*, defaults to 3): Expected number of dimensions for a single input image. If the input image has a different number of dimensions, an error is raised. rz%Invalid image shape. Expected either z or z dimensions, but got z dimensions.ztInvalid image type. Expected either PIL.Image.Image, numpy.ndarray, torch.Tensor, tf.Tensor or jax.ndarray, but got .)rgr7rJndimrXrDrE)rKros r&make_list_of_imagesrss& Fxf ;;.1, ,&\F [[N *XF  78J7K4P^O_`KK= .   $V ~Q 0 r%cHt|ttfr=td|Dr+td|Dr|Dcgc] }|D]}| c}}St|ttfr[t |rPt |ds|dj |k(r|S|dj |dzk(r|Dcgc] }|D]}| c}}St|r:t |s|j |k(r|gS|j |dzk(r t|Std|cc}}wcc}}w)a Ensure that the output is a flat list of images. If the input is a single image, it is converted to a list of length 1. If the input is a nested list of images, it is converted to a flat list of images. Args: images (`Union[list[ImageInput], ImageInput]`): The input image. expected_ndims (`int`, *optional*, defaults to 3): The expected number of dimensions for a single input image. Returns: list: A list of images or a 4d array of images. c3HK|]}t|ttfywr1r2rXrcrNimages_is r&rOz+make_flat_list_of_images..K 8dE]3K "c3<K|]}t|xs| ywr1rSrws r&rOz+make_flat_list_of_images.. Yh'1A\AYrrz*Could not make a flat list of images from r2rXrcrRrSr7rrrJrD)rKroimg_listr6s r&make_flat_list_of_imagesrs" 6D%=) KFK K YRXY Y$*?h?s???&4-(-DV-L q "fQinn&FM !9>>^a/ /(.CH(C3CCCC Cf  6;;.#@8O ;;.1, ,<  A&J KK@ Ds D1Dc t|ttfr&td|Drtd|Dr|St|ttfr\t |rQt |ds|dj |k(r|gS|dj |dzk(r|Dcgc] }t|c}St|r. ryrzc3<K|]}t|xs| ywr1r|rws r&rOz-make_nested_list_of_images.. r}r~rrz]Invalid input type. Must be a single image, a list of images, or a list of batches of images.r)rKrorGs r&make_nested_list_of_imagesrs 6D%=) KFK K YRXY Y &4-(-DV-L q "fQinn&F8O !9>>^a/ /-34EDK4 4f  6;;.#@H:  ;;.1, ,L> ! t uu5sDct|stdt|tr9t |t j j rtj|St|S)NzInvalid image type: ) rJrDrErr2r3r4rYarrayrr5s r&to_numpy_arrayrsP # /S {;<<C!Axx} C=r% num_channels.c*||nd}t|tr|fn|}|jdk(rd\}}nB|jdk(rd\}}n-|jdk(rd\}}ntd|j|j||vrD|j||vr3t j d|jd tjS|j||vrtjS|j||vrtjStd ) a[ Infers the channel dimension format of `image`. Args: image (`np.ndarray`): The image to infer the channel dimension of. num_channels (`int` or `tuple[int, ...]`, *optional*, defaults to `(1, 3)`): The number of channels of the image. Returns: The channel dimension of the image. rr)r)rrz(Unsupported number of image dimensions: z4The channel dimension is ambiguous. Got image shape z. Assuming channels are the first dimension. Use the [input_data_format](https://huggingface.co/docs/transformers/main/internal/image_processing_utils#transformers.image_transforms.rescale.input_data_format) parameter to assign the channel dimension.z(Unable to infer channel dimension format) r2intrrrDshapeloggerwarningrr"r#)rGr first_dimlast_dims r&infer_channel_dimension_formatr(s$0#;<L&0s&CL?L zzQ" 8 q" 8 q" 8CEJJ<PQQ {{9-%++h2G<2WB5;;-PJ K  %%% Y < /%%% X , .$$$ ? @@r%input_data_formatc| t|}|tjk(r|jdz S|tjk(r|jdz St d|)a Returns the channel dimension axis of the image. Args: image (`np.ndarray`): The image to get the channel dimension axis of. input_data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format of the image. If `None`, will infer the channel dimension from the image. Returns: The channel dimension axis of the image. rrUnsupported data format: )rrr"rrr#rD)rGrs r&get_channel_dimension_axisrOsd :5A,222zzA~ .33 3zzA~ 01B0CD EEr% channel_dimc| t|}|tjk(r|jd|jdfS|tjk(r|jd|jdfSt d|)a Returns the (height, width) dimensions of the image. Args: image (`np.ndarray`): The image to get the dimensions of. channel_dim (`ChannelDimension`, *optional*): Which dimension the channel dimension is in. If `None`, will infer the channel dimension from the image. Returns: A tuple of the image's height and width. r)rrr"rr#rD)rGrs r&get_image_sizergs{4U; &,,,{{2 B// (-- -{{2 B//4[MBCCr% image_size max_height max_widthcx|\}}||z }||z }t||}t||z}t||z} || fS)a Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio. Important, even if image_height < max_height and image_width < max_width, the image will be resized to at least one of the edges be equal to max_height or max_width. For example: - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50) - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400) Args: image_size (`tuple[int, int]`): The image to resize. max_height (`int`): The maximum allowed height. max_width (`int`): The maximum allowed width. )rlr) rrrheightwidth height_scale width_scale min_scale new_height new_widths r&#get_image_size_for_max_height_widthrsV,MFE&Le#KL+.IVi'(JEI%&I y  r% annotationct|trId|vrEd|vrAt|dttfr(t |ddk(st|ddtryy)Nimage_id annotationsrTFr2dictrXrclenrs r&"is_valid_annotation_coco_detectionrs`:t$ * $ Z ' z-04- @  =) *a /:j>WXY>Z\`3a r%ct|trMd|vrId|vrEd|vrAt|dttfr(t |ddk(st|ddtryy)Nr segments_info file_namerTFrrs r&!is_valid_annotation_coco_panopticrsh:t$ * $ z ) : % z/2T5M B  ?+ , 1Z ?@[\]@^`d5e r%rc&td|DS)Nc32K|]}t|ywr1)rrNanns r&rOz3valid_coco_detection_annotations..sN31#6NrPrQrs r& valid_coco_detection_annotationsrs N+N NNr%c&td|DS)Nc32K|]}t|ywr1)rrs r&rOz2valid_coco_panoptic_annotations..sM#05MrPrQrs r&valid_coco_panoptic_annotationsrs MM MMr%timeoutcttdgt|tr|j ds|j drHt j jttj||j}ntjj|r t j j|}n|j dr|jdd} t!j"|j%}t j jt|}n/t|t j j s t+d t j,j/|}|j1d }|S#t&$r}t)d|d |d }~wwxYw) a3 Loads `image` to a PIL Image. Args: image (`str` or `PIL.Image.Image`): The image to convert to the PIL Image format. timeout (`float`, *optional*): The timeout value in seconds for the URL request. Returns: `PIL.Image.Image`: A PIL Image. visionzhttp://zhttps://rz data:image/,rzIncorrect image source. Must be a valid URL starting with `http://` or `https://`, a valid path to an image file, or a base64 encoded string. Got z. Failed with NzuIncorrect format used for image. Should be an url linking to an image, a base64 string, a local path, or a PIL image.RGB)r load_imager2str startswithr3r4openrrequestsgetcontentospathisfilesplitbase64 decodebytesencode ExceptionrD TypeErrorImageOpsexif_transposeconvert)rGrb64es r&rrsqj8*-%   I &%*:*::*FIINN78<<w+O+W+W#XYE WW^^E "IINN5)E . C(+ ((8 ws|4 syy / D   LL ' ' .E MM% E L  ijoipp~@~AB s2A F F>(F99F>c <t|ttfrjt|rDt|dttfr+|Dcgc]}|Dcgc]}t ||c}c}}S|Dcgc]}t ||c}St ||Scc}wcc}}wcc}w)aLoads images, handling different levels of nesting. Args: images: A single image, a list of images, or a list of lists of images to load. timeout: Timeout for loading images. Returns: A single image, a list of images, a list of lists of images. rr)r2rXrcrr)rKr image_grouprGs r& load_imagesrs&4-( v;:fQi$?eklVa[QEZw7Ql lDJK5Jug6K K&'22 RlKs B B B*BB do_rescalerescale_factor do_normalize image_mean image_stddo_padpad_sizedo_center_crop crop_size do_resizesizeresamplePILImageResampling interpolationrc |r | td|r | td|r|| td|r | td| | td| r| | | tdyyy)a Checks validity of typically used arguments in an `ImageProcessor` `preprocess` method. Raises `ValueError` if arguments incompatibility is caught. Many incompatibilities are model-specific. `do_pad` sometimes needs `size_divisor`, sometimes `size_divisibility`, and sometimes `size`. New models and processors added should follow existing arguments when possible. Nz=`rescale_factor` must be specified if `do_rescale` is `True`.zgDepending on the model, `size_divisor` or `pad_size` or `size` must be specified if `do_pad` is `True`.zP`image_mean` and `image_std` must both be specified if `do_normalize` is `True`.z<`crop_size` must be specified if `do_center_crop` is `True`.zbOnly one of `interpolation` and `resample` should be specified, depending on image processor type.zO`size` and `resample/interpolation` must be specified if `do_resize` is `True`.)rD) rrrrrrrrrrrrrs r&validate_preprocess_argumentsrs.n,XYY (" u  +y/@kll)+WXX X%9 p  $*0D HajkkIb0Dyr%ceZdZdZdZddZdZdejde e e fdejfd Z dd Z d Zdd Zdd ZdZdZddZy)ImageFeatureExtractionMixinzD Mixin that contain utilities for preparing image features. ct|tjjtjfs$t |st dt|dyy)Nz Got type zU which is not supported, only `PIL.Image.Image`, `np.ndarray` and `torch.Tensor` are.)r2r3r4rYrZrrDrEselfrGs r&_ensure_format_supportedz4ImageFeatureExtractionMixin._ensure_format_supported>sQ%#))//2::!>?X]H^DK=)&& I_?r%Nc|j|t|r|j}t|tj r|'t|j dtj}|jdk(r$|jddvr|jddd}|r|dz}|jtj}tjj|S|S)a" Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if needed. Args: image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor`): The image to convert to the PIL Image format. rescale (`bool`, *optional*): Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will default to `True` if the image type is a floating type, `False` otherwise. rrrrr)rrr<r2rYrZflatfloatingrrr transposeastyperkr3r4 fromarray)rrGrescales r& to_pil_imagez(ImageFeatureExtractionMixin.to_pil_imageEs %%e, 5 !KKME eRZZ ($UZZ]BKK@zzQ5;;q>V#;1a0 LL*E99&&u- - r%c|j|t|tjjs|S|j dS)z Converts `PIL.Image.Image` to RGB format. Args: image (`PIL.Image.Image`): The image to convert. r)rr2r3r4rrs r& convert_rgbz'ImageFeatureExtractionMixin.convert_rgbcs8 %%e,%1L}}U##r%rGscalerhc.|j|||zS)z7 Rescale a numpy image by scale amount )r)rrGrs r&rz#ImageFeatureExtractionMixin.rescaleqs %%e,u}r%c|j|t|tjjrt j |}t |r|j}|'t|jdtjn|}|r/|j|jtjd}|r"|jdk(r|jddd}|S)a Converts `image` to a numpy array. Optionally rescales it and puts the channel dimension as the first dimension. Args: image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`): The image to convert to a NumPy array. rescale (`bool`, *optional*): Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). Will default to `True` if the image is a PIL Image or an array/tensor of integers, `False` otherwise. channel_first (`bool`, *optional*, defaults to `True`): Whether or not to permute the dimensions of the image to put the channel dimension first. rp?rrr)rr2r3r4rYrrr<rintegerrrfloat32rrr)rrGr channel_firsts r&rz*ImageFeatureExtractionMixin.to_numpy_arrayxs %%e, eSYY__ -HHUOE 5 !KKME;B?*UZZ]BJJ7PW LLbjj!99EE UZZ1_OOAq!,E r%c|j|t|tjjr|St |r|j d}|St j|d}|S)z Expands 2-dimensional `image` to 3 dimensions. Args: image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`): The image to expand. rrU)rr2r3r4r unsqueezerY expand_dimsrs r&r z'ImageFeatureExtractionMixin.expand_dimss_ %%e, eSYY__ -L 5 !OOA&E NN5q1E r%c|j|t|tjjr|j |d}nw|rut|t j r0|j|jt jd}n+t|r |j|jd}t|t j rt|t j s.t j|j|j}t|t j st j|j|j}nt|rddl}t||js?t|t j r|j |}n|j"|}t||js?t|t j r|j |}n|j"|}|j$dk(r)|j&ddvr||ddddfz |ddddfz S||z |z S)a  Normalizes `image` with `mean` and `std`. Note that this will trigger a conversion of `image` to a NumPy array if it's a PIL Image. Args: image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`): The image to normalize. mean (`list[float]` or `np.ndarray` or `torch.Tensor`): The mean (per channel) to use for normalization. std (`list[float]` or `np.ndarray` or `torch.Tensor`): The standard deviation (per channel) to use for normalization. rescale (`bool`, *optional*, defaults to `False`): Whether or not to rescale the image to be between 0 and 1. If a PIL image is provided, scaling will happen automatically. T)rrrNrr)rr2r3r4rrYrZrrrrfloatrrjr;r\ from_numpytensorrrr)rrGmeanstdrr;s r& normalizez%ImageFeatureExtractionMixin.normalizes %%e, eSYY__ -''t' eRZZ (dBJJ/xx~,,U[[9c2::.hhsm**5;;7 U # dELL1dBJJ/+5++D1D'5<<-Dc5<<0c2::.*%**3/C&%,,s+C ::?u{{1~7DD$//3q$}3EE EDLC' 'r%c||ntj}|j|t|tj j s|j |}t|tr t|}t|tst|dk(r|rt|tr||fn |d|df}n|j\}}||kr||fn||f\}} t|tr|n|d} || k(r|S| t| | z|z } } |.|| krtd|d|| |kDrt|| z| z |} } ||kr| | fn| | f}|j||S)a Resizes `image`. Enforces conversion of input to PIL.Image. Args: image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`): The image to resize. size (`int` or `tuple[int, int]`): The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be matched to this. If `size` is an int and `default_to_square` is `True`, then image will be resized to (size, size). If `size` is an int and `default_to_square` is `False`, then smaller edge of the image will be matched to this number. i.e, if height > width, then image will be rescaled to (size * height / width, size). resample (`int`, *optional*, defaults to `PILImageResampling.BILINEAR`): The filter to user for resampling. default_to_square (`bool`, *optional*, defaults to `True`): How to convert `size` when it is a single int. If set to `True`, the `size` will be converted to a square (`size`,`size`). If set to `False`, will replicate [`torchvision.transforms.Resize`](https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.Resize) with support for resizing only the smallest edge and providing an optional `max_size`. max_size (`int`, *optional*, defaults to `None`): The maximum allowed for the longer edge of the resized image: if the longer edge of the image is greater than `max_size` after being resized according to `size`, then the image is resized again so that the longer edge is equal to `max_size`. As a result, `size` might be overruled, i.e the smaller edge may be shorter than `size`. Only used if `default_to_square` is `False`. Returns: image: A resized `PIL.Image.Image`. rrz max_size = zN must be strictly greater than the requested size for the smaller edge size = )r)rBILINEARrr2r3r4rrXrcrrrrDresize) rrGrrdefault_to_squaremax_sizerrshortlongrequested_new_short new_shortnew_longs r&rz"ImageFeatureExtractionMixin.resizesy< (389K9T9T %%e,%1%%e,E dD !;D dC CIN '1$'<d|47DQRGBT % v16&ufovuo t.8s.Cda#// L&93?RUY?Y\a?a;b8 '#66()(4@@DvG (*.1(Y2F2Q.RT\8 05 8,hPYEZ||D8|44r%c|j|t|ts||f}t|st|tj rP|j dk(r|j|}|jddvr|jddn|jdd}n|jd|jdf}|d|dz dz}||dz}|d|dz dz}||dz}t|tjjr|j||||fS|jddv}|sKt|tj r|jddd}t|r|jddd}|dk\r!||dkr|dk\r||dkr |d||||fS|jddt|d|dt|d|dfz} t|tj rt j || } nt|r|j#| } | d|dz dz} | |dz} | d |dz dz} | |dz}| d| | | |f<|| z }|| z }|| z }|| z }| dtd|t%| jd|td|t%| jd |f} | S) a Crops `image` to the given size using a center crop. Note that if the image is too small to be cropped to the size given, it will be padded (so the returned result has the size asked). Args: image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor` of shape (n_channels, height, width) or (height, width, n_channels)): The image to resize. size (`int` or `tuple[int, int]`): The size to which crop the image. Returns: new_image: A center cropped `PIL.Image.Image` or `np.ndarray` or `torch.Tensor` of shape: (n_channels, height, width). rrrrN.r)rr)rr2rcrrYrZrrr rrr3r4croprpermuterm zeros_like new_zerosrl)rrGr image_shapetopbottomleftrightr new_shape new_imagetop_pad bottom_padleft_pad right_pads r& center_cropz'ImageFeatureExtractionMixin.center_crop#s %%e,$&$1a7{1~- bMKN2q8{1~- AF #wz)8I+==> w'   Qs9??2#6??QPST]TcTcdfTginPoAo o r%c|j|t|tjjr|j |}|dddddddfS)a Flips the channel order of `image` from RGB to BGR, or vice versa. Note that this will trigger a conversion of `image` to a NumPy array if it's a PIL Image. Args: image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`): The image whose color channels to flip. If `np.ndarray` or `torch.Tensor`, the channel dimension should be first. Nr)rr2r3r4rrs r&flip_channel_orderz.ImageFeatureExtractionMixin.flip_channel_ordernsI %%e, eSYY__ -''.ETrT1aZ  r%c||ntjj}|j|t |tjjs|j |}|j ||||||S)a Returns a rotated copy of `image`. This method returns a copy of `image`, rotated the given number of degrees counter clockwise around its centre. Args: image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`): The image to rotate. If `np.ndarray` or `torch.Tensor`, will be converted to `PIL.Image.Image` before rotating. Returns: image: A rotated `PIL.Image.Image`. )rexpandcenter translate fillcolor)r3r4NEARESTrr2rrotate)rrGanglerr1r2r3r4s r&r6z"ImageFeatureExtractionMixin.rotatesn (389J9J %%e,%1%%e,E|| HVFicl  r%r1)NT)F)NTN)NrNNN)rr r!__doc__rrrrYrZrr rrrr rrr-r/r6r$r%r&rr9sj< $RZZeSj0Abjj@(2(hA5FIV!" r%rannotation_formatsupported_annotation_formatsc||vrtdtd||tjurt |s td|tj urt |s tdyy)NzUnsupported annotation format: z must be one of zInvalid COCO detection annotations. Annotations must a dict (single image) or list of dicts (batch of images) with the following keys: `image_id` and `annotations`, with the latter being a list of annotations in the COCO format.zInvalid COCO panoptic annotations. Annotations must a dict (single image) or list of dicts (batch of images) with the following keys: `image_id`, `file_name` and `segments_info`, with the latter being a list of annotations in the COCO format.)rDformatr(r+rr,r)r9r:rs r&validate_annotationsr=s  <<:6(BRSoRpqrr,;;;/ <B  ,:::.{;M <;r%valid_processor_keyscaptured_kwargsct|jt|}|r+dj|}tj d|dyy)Nz, zUnused or unrecognized kwargs: rq)set differencejoinrr)r>r? unused_keysunused_key_strs r&validate_kwargsrFsJo&11#6J2KLK;/88HJKr%T)frozenceZdZUdZdZeeed<dZeeed<dZ eeed<dZ eeed<dZ eeed<dZ eeed<d Z y) SizeDictz> Hashable dictionary to store image size information. Nrr longest_edge shortest_edgerrcPt||r t||Std|d)NzKey z not found in SizeDict.)hasattrgetattrKeyError)rkeys r& __getitem__zSizeDict.__getitem__s. 4 4% %cU"9:;;r%)rr r!r8rrr__annotations__rrJrKrrrQr$r%r&rIrIsb!FHSM E8C="&L(3-&#'M8C=' $J $#Ix}#rgsA $!"    --!<  & &(9(G(G  " "$5$9$9  ' '):)C)C  & &(9(A(A  & &(9(A(A  & &(9(A(A + '+-'   H %rzz>48I3JDQSQ[Q[L\^bcq^rr | $|$ 9\9 c5c4:!5667F  ?wEDE,  52::5$5$$D 4U4;=O8O3P UY  $sE$+ "1l+,1l/01lj\ \ ~ '"'(8#(=">d 2L$s)Ld3iL $<<