# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fast Image processor class for LLaVa."""

from typing import Optional, Union

import torch
from torchvision.transforms.v2 import functional as F

from ...image_processing_utils import BatchFeature
from ...image_processing_utils_fast import (
    BaseImageProcessorFast,
    DefaultFastImageProcessorKwargs,
    group_images_by_shape,
    reorder_images,
)
from ...image_utils import (
    OPENAI_CLIP_MEAN,
    OPENAI_CLIP_STD,
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    SizeDict,
    get_image_size,
)
from ...processing_utils import Unpack
from ...utils import (
    TensorType,
    auto_docstring,
)


class LlavaFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
    """
    do_pad (`bool`, *optional*, defaults to `self.do_pad`):
        Whether to pad the image to a square based on the longest edge.
    """

    do_pad: Optional[bool]


@auto_docstring
class LlavaImageProcessorFast(BaseImageProcessorFast):
    resample = PILImageResampling.BICUBIC
    image_mean = OPENAI_CLIP_MEAN
    image_std = OPENAI_CLIP_STD
    size = {"shortest_edge": 224}
    default_to_square = False
    crop_size = {"height": 224, "width": 224}
    do_pad = False
    do_resize = True
    do_center_crop = True
    do_rescale = True
    do_normalize = True
    do_convert_rgb = True
    valid_kwargs = LlavaFastImageProcessorKwargs

    def __init__(self, **kwargs: Unpack[LlavaFastImageProcessorKwargs]) -> None:
        super().__init__(**kwargs)

    @auto_docstring
    def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaFastImageProcessorKwargs]) -> BatchFeature:
        return super().preprocess(images, **kwargs)

    def pad_to_square(
        self,
        images: "torch.Tensor",
        background_color: Union[int, tuple[int, int, int]] = 0,
    ) -> "torch.Tensor":
        """
        Pads an image to a square based on the longest edge.

        Args:
            images (`torch.Tensor`):
                The images to pad.
            background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
                The color to use for the padding. Can be an integer for single-channel images or a tuple of
                integers for multi-channel images. If an integer is passed for a multi-channel image, it is
                used for the first channel and the remaining channels are filled with `0`.

        Returns:
            `torch.Tensor`: The padded images.
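
        Example (a minimal sketch; the shapes and fill color below are illustrative):

        ```python
        >>> import torch
        >>> from transformers import LlavaImageProcessorFast

        >>> processor = LlavaImageProcessorFast()
        >>> images = torch.zeros((2, 3, 100, 200), dtype=torch.uint8)
        >>> processor.pad_to_square(images, background_color=(127, 127, 127)).shape
        torch.Size([2, 3, 200, 200])
        ```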
""" height, width = get_image_size(images, ChannelDimension.FIRST) if height == width: return images num_channels = images.shape[1] if len(images.shape) == 4 else images.shape[0] if isinstance(background_color, int): background_color = [background_color] + [0] * (num_channels - 1) elif len(background_color) != num_channels: raise ValueError( f"background_color must have no more than {num_channels} elements to match the number of channels" ) max_dim = max(height, width) paste_x_left = (max_dim - width) // 2 paste_y_left = (max_dim - height) // 2 paste_x_right = max_dim - width - paste_x_left paste_y_right = max_dim - height - paste_y_left padded_images = F.pad( images, padding=[paste_x_left, paste_y_left, paste_x_right, paste_y_right], fill=background_color ) return padded_images def _preprocess( self, images: list["torch.Tensor"], do_resize: bool, size: SizeDict, interpolation: Optional["F.InterpolationMode"], do_pad: bool, do_center_crop: bool, crop_size: SizeDict, do_rescale: bool, rescale_factor: float, do_normalize: bool, image_mean: Optional[Union[float, list[float]]], image_std: Optional[Union[float, list[float]]], disable_grouping: Optional[bool], return_tensors: Optional[Union[str, TensorType]], **kwargs, ) -> BatchFeature: # Group images by size for batched resizing grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) resized_images_grouped = {} for shape, stacked_images in grouped_images.items(): if do_pad: stacked_images = self.pad_to_square( images=stacked_images, background_color=tuple(int(x * 255) for x in self.image_mean) ) resized_images_grouped[shape] = stacked_images padded_images = reorder_images(resized_images_grouped, grouped_images_index) # Group images by size for batched resizing # Needed in case do_pad is False, or padding returns images with different sizes grouped_images, grouped_images_index = group_images_by_shape(padded_images, disable_grouping=disable_grouping) resized_images_grouped = {} for shape, stacked_images in grouped_images.items(): if do_resize: stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation) resized_images_grouped[shape] = stacked_images resized_images = reorder_images(resized_images_grouped, grouped_images_index) # Group images by size for further processing # Needed in case do_resize is False, or resize returns images with different sizes grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping) processed_images_grouped = {} for shape, stacked_images in grouped_images.items(): if do_center_crop: stacked_images = self.center_crop(stacked_images, crop_size) # Fused rescale and normalize stacked_images = self.rescale_and_normalize( stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std ) processed_images_grouped[shape] = stacked_images processed_images = reorder_images(processed_images_grouped, grouped_images_index) processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors) __all__ = ["LlavaImageProcessorFast"]