# coding=utf-8 # Copyright 2022 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Image processor class for Vilt.""" from collections.abc import Iterable from typing import Any, Optional, Union import numpy as np from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict from ...image_transforms import PaddingMode, pad, resize, to_channel_dimension_format from ...image_utils import ( IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, ChannelDimension, ImageInput, PILImageResampling, get_image_size, infer_channel_dimension_format, is_scaled_image, make_flat_list_of_images, to_numpy_array, valid_images, validate_preprocess_arguments, ) from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging from ...utils.import_utils import requires if is_vision_available(): import PIL logger = logging.get_logger(__name__) def max_across_indices(values: Iterable[Any]) -> list[Any]: """ Return the maximum value across all indices of an iterable of values. """ return [max(values_i) for values_i in zip(*values)] def make_pixel_mask( image: np.ndarray, output_size: tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None ) -> np.ndarray: """ Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. Args: image (`np.ndarray`): Image to make the pixel mask for. output_size (`tuple[int, int]`): Output size of the mask. """ input_height, input_width = get_image_size(image, channel_dim=input_data_format) mask = np.zeros(output_size, dtype=np.int64) mask[:input_height, :input_width] = 1 return mask def get_max_height_width( images: list[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None ) -> list[int]: """ Get the maximum height and width across all images in a batch. """ if input_data_format is None: input_data_format = infer_channel_dimension_format(images[0]) if input_data_format == ChannelDimension.FIRST: _, max_height, max_width = max_across_indices([img.shape for img in images]) elif input_data_format == ChannelDimension.LAST: max_height, max_width, _ = max_across_indices([img.shape for img in images]) else: raise ValueError(f"Invalid channel dimension format: {input_data_format}") return (max_height, max_width) def get_resize_output_image_size( input_image: np.ndarray, shorter: int = 800, longer: int = 1333, size_divisor: int = 32, input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> tuple[int, int]: input_height, input_width = get_image_size(input_image, input_data_format) min_size, max_size = shorter, longer scale = min_size / min(input_height, input_width) if input_height < input_width: new_height = min_size new_width = scale * input_width else: new_height = scale * input_height new_width = min_size if max(new_height, new_width) > max_size: scale = max_size / max(new_height, new_width) new_height = scale * new_height new_width = scale * new_width new_height, new_width = int(new_height + 0.5), int(new_width + 0.5) new_height = new_height // size_divisor * size_divisor new_width = new_width // size_divisor * size_divisor return new_height, new_width @requires(backends=("vision",)) class ViltImageProcessor(BaseImageProcessor): r""" Constructs a ViLT image processor. Args: do_resize (`bool`, *optional*, defaults to `True`): Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the `do_resize` parameter in the `preprocess` method. size (`dict[str, int]` *optional*, defaults to `{"shortest_edge": 384}`): Resize the shorter side of the input to `size["shortest_edge"]`. The longer side will be limited to under `int((1333 / 800) * size["shortest_edge"])` while preserving the aspect ratio. Only has an effect if `do_resize` is set to `True`. Can be overridden by the `size` parameter in the `preprocess` method. size_divisor (`int`, *optional*, defaults to 32): The size by which to make sure both the height and width can be divided. Only has an effect if `do_resize` is set to `True`. Can be overridden by the `size_divisor` parameter in the `preprocess` method. resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be overridden by the `resample` parameter in the `preprocess` method. do_rescale (`bool`, *optional*, defaults to `True`): Wwhether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` parameter in the `preprocess` method. rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be overridden by the `rescale_factor` parameter in the `preprocess` method. do_normalize (`bool`, *optional*, defaults to `True`): Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` method. Can be overridden by the `do_normalize` parameter in the `preprocess` method. image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): Mean to use if normalizing the image. This is a float or list of floats the length of the number of channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be overridden by the `image_mean` parameter in the `preprocess` method. image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): Standard deviation to use if normalizing the image. This is a float or list of floats the length of the number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. Can be overridden by the `image_std` parameter in the `preprocess` method. do_pad (`bool`, *optional*, defaults to `True`): Whether to pad the image to the `(max_height, max_width)` of the images in the batch. Can be overridden by the `do_pad` parameter in the `preprocess` method. """ model_input_names = ["pixel_values"] def __init__( self, do_resize: bool = True, size: Optional[dict[str, int]] = None, size_divisor: int = 32, resample: PILImageResampling = PILImageResampling.BICUBIC, do_rescale: bool = True, rescale_factor: Union[int, float] = 1 / 255, do_normalize: bool = True, image_mean: Optional[Union[float, list[float]]] = None, image_std: Optional[Union[float, list[float]]] = None, do_pad: bool = True, **kwargs, ) -> None: if "pad_and_return_pixel_mask" in kwargs: do_pad = kwargs.pop("pad_and_return_pixel_mask") super().__init__(**kwargs) size = size if size is not None else {"shortest_edge": 384} size = get_size_dict(size, default_to_square=False) self.do_resize = do_resize self.size = size self.size_divisor = size_divisor self.resample = resample self.do_rescale = do_rescale self.rescale_factor = rescale_factor self.do_normalize = do_normalize self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD self.do_pad = do_pad @classmethod def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs): """ Overrides the `from_dict` method from the base class to make sure `pad_and_return_pixel_mask` is updated if image processor is created using from_dict and kwargs e.g. `ViltImageProcessor.from_pretrained(checkpoint, pad_and_return_pixel_mask=False)` """ image_processor_dict = image_processor_dict.copy() if "pad_and_return_pixel_mask" in kwargs: image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") return super().from_dict(image_processor_dict, **kwargs) def resize( self, image: np.ndarray, size: dict[str, int], size_divisor: int = 32, resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, **kwargs, ) -> np.ndarray: """ Resize an image. Resizes the shorter side of the image to `size["shortest_edge"]` while preserving the aspect ratio. If the longer side is larger than the max size `(int(`size["shortest_edge"]` * 1333 / 800))`, the longer side is then resized to the max size while preserving the aspect ratio. Args: image (`np.ndarray`): Image to resize. size (`dict[str, int]`): Controls the size of the output image. Should be of the form `{"shortest_edge": int}`. size_divisor (`int`, *optional*, defaults to 32): The image is resized to a size that is a multiple of this value. resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.BICUBIC`): Resampling filter to use when resiizing the image. data_format (`str` or `ChannelDimension`, *optional*): The channel dimension format of the image. If not provided, it will be the same as the input image. input_data_format (`str` or `ChannelDimension`, *optional*): The channel dimension format of the input image. If not provided, it will be inferred. """ size = get_size_dict(size, default_to_square=False) if "shortest_edge" not in size: raise ValueError(f"The `size` dictionary must contain the key `shortest_edge`. Got {size.keys()}") shorter = size["shortest_edge"] longer = int(1333 / 800 * shorter) output_size = get_resize_output_image_size( image, shorter=shorter, longer=longer, size_divisor=size_divisor, input_data_format=input_data_format ) return resize( image, size=output_size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs, ) def _pad_image( self, image: np.ndarray, output_size: tuple[int, int], constant_values: Union[float, Iterable[float]] = 0, data_format: Optional[ChannelDimension] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> np.ndarray: """ Pad an image with zeros to the given size. """ input_height, input_width = get_image_size(image, channel_dim=input_data_format) output_height, output_width = output_size pad_bottom = output_height - input_height pad_right = output_width - input_width padding = ((0, pad_bottom), (0, pad_right)) padded_image = pad( image, padding, mode=PaddingMode.CONSTANT, constant_values=constant_values, data_format=data_format, input_data_format=input_data_format, ) return padded_image def pad( self, images: list[np.ndarray], constant_values: Union[float, Iterable[float]] = 0, return_pixel_mask: bool = True, return_tensors: Optional[Union[str, TensorType]] = None, data_format: Optional[ChannelDimension] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> BatchFeature: """ Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width in the batch and optionally returns their corresponding pixel mask. Args: image (`np.ndarray`): Image to pad. constant_values (`float` or `Iterable[float]`, *optional*): The value to use for the padding if `mode` is `"constant"`. return_pixel_mask (`bool`, *optional*, defaults to `True`): Whether to return a pixel mask. return_tensors (`str` or `TensorType`, *optional*): The type of tensors to return. Can be one of: - Unset: Return a list of `np.ndarray`. - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. data_format (`str` or `ChannelDimension`, *optional*): The channel dimension format of the image. If not provided, it will be the same as the input image. input_data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format of the input image. If not provided, it will be inferred. """ pad_size = get_max_height_width(images, input_data_format=input_data_format) padded_images = [ self._pad_image( image, pad_size, constant_values=constant_values, data_format=data_format, input_data_format=input_data_format, ) for image in images ] data = {"pixel_values": padded_images} if return_pixel_mask: masks = [ make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format) for image in images ] data["pixel_mask"] = masks return BatchFeature(data=data, tensor_type=return_tensors) @filter_out_non_signature_kwargs() def preprocess( self, images: ImageInput, do_resize: Optional[bool] = None, size: Optional[dict[str, int]] = None, size_divisor: Optional[int] = None, resample: Optional[PILImageResampling] = None, do_rescale: Optional[bool] = None, rescale_factor: Optional[float] = None, do_normalize: Optional[bool] = None, image_mean: Optional[Union[float, list[float]]] = None, image_std: Optional[Union[float, list[float]]] = None, do_pad: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, data_format: ChannelDimension = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> PIL.Image.Image: """ Preprocess an image or batch of images. Args: images (`ImageInput`): Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`. do_resize (`bool`, *optional*, defaults to `self.do_resize`): Whether to resize the image. size (`dict[str, int]`, *optional*, defaults to `self.size`): Controls the size of the image after `resize`. The shortest edge of the image is resized to `size["shortest_edge"]` whilst preserving the aspect ratio. If the longest edge of this resized image is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest edge equal to `int(size["shortest_edge"] * (1333 / 800))`. size_divisor (`int`, *optional*, defaults to `self.size_divisor`): The image is resized to a size that is a multiple of this value. resample (`PILImageResampling`, *optional*, defaults to `self.resample`): Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): Whether to rescale the image values between [0 - 1]. rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): Rescale factor to rescale the image by if `do_rescale` is set to `True`. do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): Whether to normalize the image. image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`): Image mean to normalize the image by if `do_normalize` is set to `True`. image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`): Image standard deviation to normalize the image by if `do_normalize` is set to `True`. do_pad (`bool`, *optional*, defaults to `self.do_pad`): Whether to pad the image to the (max_height, max_width) in the batch. If `True`, a pixel mask is also created and returned. return_tensors (`str` or `TensorType`, *optional*): The type of tensors to return. Can be one of: - Unset: Return a list of `np.ndarray`. - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): The channel dimension format for the output image. Can be one of: - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `ChannelDimension.LAST`: image in (height, width, num_channels) format. input_data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the input image. If unset, the channel dimension format is inferred from the input image. Can be one of: - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. """ do_resize = do_resize if do_resize is not None else self.do_resize size_divisor = size_divisor if size_divisor is not None else self.size_divisor resample = resample if resample is not None else self.resample do_rescale = do_rescale if do_rescale is not None else self.do_rescale rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor do_normalize = do_normalize if do_normalize is not None else self.do_normalize image_mean = image_mean if image_mean is not None else self.image_mean image_std = image_std if image_std is not None else self.image_std do_pad = do_pad if do_pad is not None else self.do_pad size = size if size is not None else self.size size = get_size_dict(size, default_to_square=False) images = make_flat_list_of_images(images) if not valid_images(images): raise ValueError( "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "torch.Tensor, tf.Tensor or jax.ndarray." ) # Here the pad() method does not require any additional argument as it takes the maximum of (height, width). # Hence, it does not need to be passed to a validate_preprocess_arguments() method. validate_preprocess_arguments( do_rescale=do_rescale, rescale_factor=rescale_factor, do_normalize=do_normalize, image_mean=image_mean, image_std=image_std, do_resize=do_resize, size=size, resample=resample, ) # All transformations expect numpy arrays. images = [to_numpy_array(image) for image in images] if do_rescale and is_scaled_image(images[0]): logger.warning_once( "It looks like you are trying to rescale already rescaled images. If the input" " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." ) if input_data_format is None: # We assume that all images have the same channel dimension format. input_data_format = infer_channel_dimension_format(images[0]) if do_resize: images = [ self.resize( image=image, size=size, size_divisor=size_divisor, resample=resample, input_data_format=input_data_format, ) for image in images ] if do_rescale: images = [ self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) for image in images ] if do_normalize: images = [ self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) for image in images ] images = [ to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images ] if do_pad: encoded_outputs = self.pad( images, return_pixel_mask=True, return_tensors=return_tensors, input_data_format=data_format ) else: encoded_outputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors) return encoded_outputs __all__ = ["ViltImageProcessor"]