# coding=utf-8 # Copyright 2025 The ZhipuAI Inc. team and HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Fast Image processor class for GLM-4.1V.""" from typing import Optional, Union import torch from torchvision.transforms.v2 import functional as F from ...image_processing_utils import ( BatchFeature, ) from ...image_processing_utils_fast import ( BaseImageProcessorFast, DefaultFastImageProcessorKwargs, group_images_by_shape, reorder_images, ) from ...image_utils import ( OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ImageInput, PILImageResampling, SizeDict, ) from ...processing_utils import Unpack from ...utils import ( TensorType, auto_docstring, logging, ) from .image_processing_glm4v import smart_resize logger = logging.get_logger(__name__) class Glm4vFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): """ patch_size (`int`, *optional*, defaults to 14): The spatial patch size of the vision encoder. temporal_patch_size (`int`, *optional*, defaults to 2): The temporal patch size of the vision encoder. merge_size (`int`, *optional*, defaults to 2): The merge size of the vision encoder to llm encoder. """ patch_size: Optional[int] temporal_patch_size: Optional[int] merge_size: Optional[int] @auto_docstring class Glm4vImageProcessorFast(BaseImageProcessorFast): do_resize = True resample = PILImageResampling.BICUBIC size = {"shortest_edge": 112 * 112, "longest_edge": 28 * 28 * 15000} do_rescale = True do_normalize = True image_mean = OPENAI_CLIP_MEAN image_std = OPENAI_CLIP_STD do_convert_rgb = True patch_size = 14 temporal_patch_size = 2 merge_size = 2 valid_kwargs = Glm4vFastImageProcessorKwargs model_input_names = ["pixel_values", "image_grid_thw"] def __init__(self, **kwargs: Unpack[Glm4vFastImageProcessorKwargs]): super().__init__(**kwargs) if self.size is not None and ( self.size.get("shortest_edge", None) is None or self.size.get("longest_edge", None) is None ): raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.") def _further_process_kwargs( self, size: Optional[SizeDict] = None, **kwargs, ) -> dict: """ Update kwargs that need further processing before being validated Can be overridden by subclasses to customize the processing of kwargs. """ if size is not None and ("shortest_edge" not in size or "longest_edge" not in size): raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.") return super()._further_process_kwargs(size=size, **kwargs) def _preprocess( self, images: list["torch.Tensor"], do_resize: bool, size: SizeDict, interpolation: Optional["F.InterpolationMode"], do_rescale: bool, rescale_factor: float, do_normalize: bool, image_mean: Optional[Union[float, list[float]]], image_std: Optional[Union[float, list[float]]], patch_size: int, temporal_patch_size: int, merge_size: int, disable_grouping: Optional[bool], return_tensors: Optional[Union[str, TensorType]], **kwargs, ) -> BatchFeature: """ Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`. """ grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) resized_images_grouped = {} for shape, stacked_images in grouped_images.items(): height, width = stacked_images.shape[-2:] if do_resize: resized_height, resized_width = smart_resize( num_frames=temporal_patch_size, height=height, width=width, temporal_factor=temporal_patch_size, factor=patch_size * merge_size, min_pixels=size.shortest_edge, max_pixels=size.longest_edge, ) stacked_images = self.resize( stacked_images, size=SizeDict(height=resized_height, width=resized_width), interpolation=interpolation, ) resized_images_grouped[shape] = stacked_images resized_images = reorder_images(resized_images_grouped, grouped_images_index) grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping) processed_images_grouped = {} processed_grids = {} for shape, stacked_images in grouped_images.items(): resized_height, resized_width = stacked_images.shape[-2:] patches = self.rescale_and_normalize( stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std ) if patches.ndim == 4: # (B, C, H, W) patches = patches.unsqueeze(1) # (B, T=1, C, H, W) if patches.shape[1] % temporal_patch_size != 0: repeats = patches[:, -1:].repeat( 1, temporal_patch_size - (patches.shape[1] % temporal_patch_size), 1, 1, 1 ) patches = torch.cat([patches, repeats], dim=1) batch_size, t_len, channel = patches.shape[:3] grid_t = t_len // temporal_patch_size grid_h, grid_w = resized_height // patch_size, resized_width // patch_size patches = patches.view( batch_size, grid_t, temporal_patch_size, channel, grid_h // merge_size, merge_size, patch_size, grid_w // merge_size, merge_size, patch_size, ) # (B, grid_t, gh, gw, mh, mw, C, tp, ph, pw) patches = patches.permute(0, 1, 4, 7, 5, 8, 3, 2, 6, 9) flatten_patches = patches.reshape( batch_size, grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size, ) processed_images_grouped[shape] = flatten_patches processed_grids[shape] = [[grid_t, grid_h, grid_w]] * batch_size processed_images = reorder_images(processed_images_grouped, grouped_images_index) processed_grids = reorder_images(processed_grids, grouped_images_index) pixel_values = torch.cat(processed_images, dim=0) image_grid_thw = torch.tensor(processed_grids) return BatchFeature( data={"pixel_values": pixel_values, "image_grid_thw": image_grid_thw}, tensor_type=return_tensors ) @auto_docstring def preprocess( self, images: ImageInput, **kwargs: Unpack[Glm4vFastImageProcessorKwargs], ) -> BatchFeature: return super().preprocess(images, **kwargs) __all__ = ["Glm4vImageProcessorFast"]