L i@ddlZddlZddlmZmZddlmZddlmZm Z ddl m Z ddl m Z mZmZmZddlmZddlZddlZdd lmZmZdd lmZmZmZdd lmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)e&rddl*Z+ddl,Z+e%rdd l-m Z.e"rddl/Z/e(j`e1Z2ed e3Z4ede3Z5ee6dejnde6ejne6de6e6de6e6ejne6e6de4e6e4e6e6e4e5e6e5e6e6e5fZ8eGddeZ9dZ:dZ;dZfdZ?de6e8de6eejndffdZ@de6eejndd dffdZAde8dee9eBffdZCd>dejndeedeDeEeEffd ZFd>d!eEd"eeEfd#ZGd?d$e9fd%ZHd&ed'd(e deDejne9ffd)ZId&ed'd(e fd*ZJd&ed'd(e fd+ZKd&ed'd(e fd,ZLd&ed'd(e fd-ZMeJeIeKeLeMd.ZN d@de8d"eeEd/eeeEeOfd0e3d(ee dejnf d1ZP d>dejnd2eee3efdejnfd3ZQejd4ddfdejnd5eeEeDeEeEfeeDeEeEffd6ed7eeOeeOfd8eee3efd2eee3efdejnfd9ZSde6ddeDeBeDeEeEfdfeBeEeDeDeEeEfeEffffd:ZTd;eBeDeEeEfdfdz)VideoMetadata.__iter__..[s-1-s)rselfs r2__iter__zVideoMetadata.__iter__Zs-t --c*tt|Sr.)lenrr4s r2__len__zVideoMetadata.__len__]s6$<  r7ct||Sr.)getattr)r5items r2 __getitem__zVideoMetadata.__getitem__`stT""r7ct|||Sr.)setattr)r5keyvalues r2 __setitem__zVideoMetadata.__setitem__cstS%((r7returnc|j |j td|jDcgc]}||jz c}Scc}w)z,Timestamps of the sampled frames in seconds.zGCannot infer video `timestamps` when `fps` or `frames_indices` is None.)r&r+ ValueError)r5 frame_idxs r2 timestampszVideoMetadata.timestampsfsH 88 t22:fg g6:6I6IJ DHH$JJJsA ch|jD]\}}t||st|||!yr.)itemshasattrr@)r5 dictionaryrArBs r2updatezVideoMetadata.updatems4$**, *JCtS!c5) *r7)__name__ __module__ __qualname__int__annotations__r&r floatr'r(r)r*strr+listr6r:r>rCpropertyrHrMr7r2r$r$PsC%E8C= FHSM $Hhuo$#'M8C='*.NHT#Y'..!#)KDKKK *r7r$ct|tjjxs)t|xs t |xr|j dk(S)N) isinstancePILImagerrndim)frames r2is_valid_video_framer_ss= eSYY__ -    8/%"8MejjAor7ct|ttfs)t|xs t |xr|j dk(S|xrt d|DS)Nc32K|]}t|ywr.)r_)r0r^s r2r3z!is_valid_video..|sH-e4Hs)rZrUtuplerrr]allvideos r2is_valid_videorgysI edE] +u%?)?TUZZST_T  HSH%HHHr7ct|ttfr!|D]}t|rt |ryyt|r|j dk(ryy)NFT)rZrUrcrgr_r])videosvideo_or_frames r2 valid_videosrlsR&4-($ N">26J>6Z  F #v{{a'7 r7ct|ttfrt|dSt |s t |r|j dk(ryy)NrriTF)rZrUrcrgrrr])rjs r2is_batched_videorns@&4-(fQi((  OF$;PQAQ r7rfrDcftj|dk\xrtj|dkS)zV Checks to see whether the pixel values have already been rescaled to [0, 1]. rr)npminmaxres r2is_scaled_videorss) 66%=A  4"&&-1"44r7rjc t|dttfrt|dds|Sg}|D]J}|Dcgc]}t j |}}t j |}|j|L|Scc}w)aK Given a batch of videos, converts each video to a 4D array. If video is already in array type, it is simply returned. We assume that all inputs in the list are in the same format, based on the type of the first element. Args: videos (`VideoInput`): Video inputs to turn into a list of videos. r)rZrUrcrrparraystackappend)rjvideo_convertedrfr^s r2convert_pil_frames_to_videorys vay4- 0^F1IaL5Q O&.34U%44u%& 5sBc t|ddtr2t|dddtr|Dcgc] }|D]}| c}}St|ts t |r t |gSt|r?t|tjjrtj|}|dgSt|tstdt|dg}|D]]}t|ts t |r|j|0t|tsA|sD|jt!|_t |}|Scc}}w#ttf$rY"wxYw)a Ensure that the input is a list of videos. If the input is a single video, it is converted to a list of length 1. If the input is a batch of videos, it is converted to a list of 4D video arrays. Videos passed as list `PIL.Image` frames are converted to 4D arrays. We assume that all inputs in the list are in the same format, based on the type of the first element. Args: videos (`VideoInput`): Video inputs to turn into a list of videos. r)N.zkInvalid video input. Expected either a list of video frames or an input of 4 or 5 dimensions, but got type .)rZrUrT IndexError TypeErrorrgryrr[r\rprurFtyperwextendmake_batched_videos)rjsublist image_pathsflat_videos_listr=s r2rrsT fQilD )j1a#.N/5QGQ+KQKQ Q&#."8*F844   fciioo .XXf%Fy!""  %&\N! %  ? dC N4$8  # #D ) d #  # #$7$= > ? 33CD 5R  "   s"4EEEEE-,E-video_metadatac |l|Dcgc]a}t|ddttt|t|rt |dndt|rt |dnddc}}t |trct |dtr#|Dcgc]}|D] }t di|}}}|St |dtr|Dcgc] }t di|}}|St di|g}|Scc}wcc}}wcc}w)Nrr)r%r&r)r+r(r'rW)r9rUrangergget_video_sizerZr$dict)rjrrf metadata_listmetadatas r2make_batched_metadatars+   %(J "&uSZ'8"96DU6K./2QU5CE5J.q1PT      .$' nQ' .?M.;^kRZ )))N  q)4 0HVWHm7h7WNW (9.9: 1   XsA&C0C5 C; channel_dimc| t|d}|tjk(r|jd|jdfS|tjk(r|jd|jdfSt d|)a Returns the (height, width) dimensions of the video. Args: video (`np.ndarray`): The video to get the dimensions of. channel_dim (`ChannelDimension`, *optional*): Which dimension the channel dimension is in. If `None`, will infer the channel dimension from the video. Returns: A tuple of the video's height and width. )rrYra) num_channelszUnsupported data format: )rrFIRSTshapeLASTrF)rfrs r2rrs}4US &,,,{{2 B// (-- -{{2 B//4[MBCCr7r% num_framesc|/tjd|||z jt}|Stjd|jt}|S)a Creates a numpy array for uniform sampling of `num_frame` frames from `total_num_frames` when loading a video. Args: total_num_frames (`int`): Total number of frames that a video has. num_frames (`int`, *optional*): Number of frames to sample uniformly. If not specified, all frames are sampled. Returns: np.ndarray: np array of frame indices that will be sampled. r)rparangeastyperQ)r%rindicess r2get_uniform_frame_indicesrsY))A/1AJ1NOVVWZ[ N))A/077< Nr7rc |j}|j}|-|+t||z |z}||kDrtd|d|d|d|"t j d|||z t}|St j d|t}|S)ak A default sampling function that replicates the logic used in get_uniform_frame_indices, while optionally handling `fps` if `num_frames` is not provided. Args: metadata (`VideoMetadata`): `VideoMetadata` object containing metadata about the video, such as "total_num_frames" or "fps". num_frames (`int`, *optional*): Number of frames to sample uniformly. fps (`int` or `float`, *optional*): Desired frames per second. Takes priority over num_frames if both are provided. Returns: `np.ndarray`: Array of frame indices to sample. z When loading the video with fps=z, we computed num_frames=z which exceeds total_num_frames=z. Check fps or video metadata.r)dtype)r%r&rQrFrpr)rrr&kwargsr% video_fpsrs r2default_sample_indices_fnr's  00 Ico)I5;< ( (23%7PQ[P\]22B1CCac  ))A/1AJ1NVYZ N))A/s; Nr7 video_path)r!r"sample_indices_fnc Xttdgddl}|j|}t |j |j }|j |j}|r||z nd}tt |t|t|dt |j |jt |j |j}|dd|i|} d} g} |jr|j\} } | snk| | vrI| j\}}}|j| |j } | j#| d|d|d|f| r| dz } | |k\rn|jr|j%| |_t)j*| |fS) ax Decode a video using the OpenCV backend. Args: video_path (`str`): Path to the video file. sample_indices_fn (`Callable`): A callable function that will return indices at which the video should be sampled. If the video has to be loaded using by a different sampling technique than provided by `num_frames` or `fps` arguments, one should provide their own `sample_indices_fn`. If not provided, simple uniform sampling with fps is performed. Example: def sample_indices_fn(metadata, **kwargs): return np.linspace(0, metadata.total_num_frames - 1, num_frames, dtype=int) Returns: tuple[`np.ndarray`, `VideoMetadata`]: A tuple containing: - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]). - `VideoMetadata` object. cv2rNopencvr%r&r)r*r(r'rrrW)rread_video_opencvr VideoCapturerQgetCAP_PROP_FRAME_COUNT CAP_PROP_FPSr$rSCAP_PROP_FRAME_HEIGHTCAP_PROP_FRAME_WIDTHisOpenedreadrcvtColor COLOR_BGR2RGBrwreleaser+rprv)rrrrrfr%rr)rrindexframessuccessr^r(r'channels r2rrJs2'%1   Z (E599S%=%=>? #**+I/8)+aH-. ) x599S6678%))C4456 H <  QJE $ $  ..  MMO%H 88F X %%r7c ttdgddlm}m}|||d}|j }t |}|r||z nd}tt|t|t|d} |d d| i|} |j| j} | j| | jd| jdd | | fS) av Decode a video using the Decord backend. Args: video_path (`str`): Path to the video file. sample_indices_fn (`Callable`): A callable function that will return indices at which the video should be sampled. If the video has to be loaded using by a different sampling technique than provided by `num_frames` or `fps` arguments, one should provide their own `sample_indices_fn`. If not provided, simple uniform sampling with fps is performed. Example: def sample_indices_fn(metadata, **kwargs): return np.linspace(0, metadata.total_num_frames - 1, num_frames, dtype=int) Returns: tuple[`np.array`, `VideoMetadata`]: A tuple containing: - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]). - `VideoMetadata` object. decordr) VideoReadercpu)urictxr%r&r)r*rrr+r(r'rW)rread_video_decordrrr get_avg_fpsr9r$rQrS get_batchasnumpyrMr) rrrrrvrrr%r)rrrfs r2rrs2'(4' Q 0B I2w/8)+aH-. ) x H < | td|fd}|}t|tsdgt|z}||fSt |j dvrt s tdttdgddl m }t} t| 5|5} | j|gdddddd| j} t| } nx|j!d s|j!d r)tt#j$|j&} n-t(j*j-|r|} n t/d |j!d xs|j!d } | r|d k(r td t1s|dk(s.sample_indices_fn_funcs,Xc*RUcYbc cr7)zwww.youtube.comz youtube.comzETo load a video from YouTube url you have to install `yt_dlp` first.yt_dlpr) YoutubeDLzhttp://zhttps://zVIncorrect format used for video. Should be an url linking to an video or a local path.rzNIf you are trying to load a video from URL, you cannot use 'opencv' as backendrrrrzYou chose backend=zf for loading the video but the required library is not found in your environment Make sure to install z before loading the video.)rFrZrTr9r netlocr ImportErrorr load_videorrrrdownloadgetvalue startswithrequestsrcontentospathisfiler}rrrrrVIDEO_DECODERS)rfrr&rrrrrrbufferr1 bytes_objfile_obj video_is_url video_decoders `` r2rrjsN :16G6O q    d3 eS !6CJ&h!CC"$ef f*xj1$ V $ ik Q JJw   OO% 9%   ) $(8(8(D8<<.667  pqq##I.N%2B2B:2NL8+ijj! "w(':!g&7 "w(':(*w-/G')g.E  *$$+9,F H  #7+M#H.?J6JOE8 (??    s$H!H4HH HHinput_data_formatct|tjstdt || t |}t |tj|}|jddk(r|S|jddk(r|jddS|ddddddfdkjs|S|ddddddfd z }d|ddddddfz dz|ddddddf|ddddddfzz}|S) aq Convert video to RGB by blending the transparency layer if it's in RGBA format, otherwise simply returns it. Args: video (`np.ndarray`): The video to convert. input_data_format (`ChannelDimension`, *optional*): The channel dimension format of the input video. If unset, will use the inferred format from the input. zBVideo has to be a numpy array to convert to RGB format, but found N)input_channel_dimrrYr.go@) rZrpndarrayr}r~rrrrrrepeatany)rfr alphas r2convert_to_rgbrs eRZZ (\]abg]h\ijkk :5A '/?/E/EYj kE {{2!  {{2!||Ar"" #q!Q, # % * * ,  #q!Q, % 'E sD!Q' '3 .sD!Q1G%PSUVXY[\P\J]1] ]E Lr7rpaddingmodeconstant_values data_formatc tfd}tjdtjdtjdtj di}||}i}||vrt d||tjk(r |||d<tj|fd||i||t|SS) a Pads the `video` with the specified (height, width) `padding` and `mode`. Args: video (`np.ndarray`): The video to pad. padding (`int` or `tuple[int, int]` or `Iterable[tuple[int, int]]`): Padding to apply to the edges of the height, width axes. Can be one of three formats: - `((before_height, after_height), (before_width, after_width))` unique pad widths for each axis. - `((before, after),)` yields same before and after pad for height and width. - `(pad,)` or int is a shortcut for before = after = pad width for all axes. mode (`PaddingMode`): The padding mode to use. Can be one of: - `"constant"`: pads with a constant value. - `"reflect"`: pads with the reflection of the vector mirrored on the first and last values of the vector along each axis. - `"replicate"`: pads with the replication of the last value on the edge of the array along each axis. - `"symmetric"`: pads with the reflection of the vector mirrored along the edge of the array. constant_values (`float` or `Iterable[float]`, *optional*): The value to use for the padding if `mode` is `"constant"`. data_format (`str` or `ChannelDimension`, *optional*): The channel dimension format for the output video. Can be one of: - `"channels_first"` or `ChannelDimension.FIRST`: video in (num_frames, num_channels, height, width) format. - `"channels_last"` or `ChannelDimension.LAST`: video in (num_frames, height, width, num_channels) format. If unset, will use same as the input video. input_data_format (`str` or `ChannelDimension`, *optional*): The channel dimension format for the input video. Can be one of: - `"channels_first"` or `ChannelDimension.FIRST`: video in (num_frames, num_channels, height, width) format. - `"channels_last"` or `ChannelDimension.LAST`: video in (num_frames, height, width, num_channels) format. If unset, will use the inferred format of the input video. Returns: `np.ndarray`: The padded video. ct|ttfr ||f||ff}nt|tr#t |dk(r|d|df|d|dff}nvt|tr&t |dk(rt|dtr||f}n@t|tr"t |dk(rt|dtrnt d|t jk(rddg|ndg|d}jdk(rdg|}|S|}|S)za Convert values to be in the format expected by np.pad based on the data format. rrrzUnsupported format: )rrri) rZrQrSrcr9rFrrr])valuesr rfs r2_expand_for_data_formatz$pad.._expand_for_data_format"s$ fsEl +v&(89F  &3v;!+;ay&),vay&).DEF  &3v;!+; 6RS9VY@Zf%F  &3v;!+; 6RS9V[@\ 3F8<= =*;>N>T>T)TVV %f %[aZsdjZslrZs  "'q!f 7= r7constantreflect replicate symmetriczInvalid padding mode: rr) rrCONSTANTREFLECT REPLICATE SYMMETRICrFrppadr) rfrrrrr r padding_map pad_kwargss ` ` r2r&r&sV :5A2 jY{{ K &g.GJ ;1$899 %% %(?(P $% FF5' H D(9 HZ HER]Ri '{ Reconstructs a list of videos in the original order. rr)rr9)r0r.rs r2reorder_videosr2dsOs/01   -a0345I!5LQ5OP  s5r.)NN)NNrN)Vrrcollections.abcrr contextlibr dataclassesrrr rtypingr r r r urllib.parser numpyrprimage_transformsrr image_utilsrrrutilsrrrrrrrrrrrr PIL.Imager[ PIL.ImageOpsrrr, get_loggerrNloggerrTr!r"rUr VideoInputr$r_rgrlrnboolrsryrrrrcrQrrrrrrrrrrSrrr"r&r/r2rWr7r2rBs  -&)55!FYY     !4   H % eSvs  JJ !bjj n IcOJd  $ *G* *D I 52::5$5Z(8T% TbHbBc=d*)4bjj.%QW.W(X#Y)X*eMSWDW>X<D"**D8@D V ::V 3c3h%S/)BB CV V5(5/12 V %%5 567 V  c+;&; <= VZZVr0  0 4c3h/ 0$sE%S/SVBV$??@  . r7