# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # This file was automatically generated from src/transformers/models/owlv2/modular_owlv2.py. # Do NOT edit this file manually as any edits will be overwritten by the generation of # the file from the modular. If any change should be done, please apply the change to the # modular_owlv2.py file directly. One of our CI enforces this. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 # coding=utf-8 # Copyright 2025 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import warnings from typing import TYPE_CHECKING, Optional, Union import torch from torchvision.transforms.v2 import functional as F from ...image_processing_utils_fast import BaseImageProcessorFast, BatchFeature, DefaultFastImageProcessorKwargs from ...image_transforms import center_to_corners_format, group_images_by_shape, reorder_images from ...image_utils import ( OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ChannelDimension, ImageInput, PILImageResampling, SizeDict, ) from ...processing_utils import Unpack from ...utils import TensorType, auto_docstring from .image_processing_owlv2 import _scale_boxes, box_iou if TYPE_CHECKING: from .modeling_owlv2 import Owlv2ObjectDetectionOutput class Owlv2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs): ... @auto_docstring class Owlv2ImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR image_mean = OPENAI_CLIP_MEAN image_std = OPENAI_CLIP_STD size = {"height": 960, "width": 960} default_to_square = True crop_size = None do_resize = True do_center_crop = None do_rescale = True do_normalize = True do_convert_rgb = None model_input_names = ["pixel_values"] rescale_factor = 1 / 255 do_pad = True valid_kwargs = Owlv2FastImageProcessorKwargs def post_process(self, outputs, target_sizes): """ Converts the raw output of [`Owlv2ForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format. Args: outputs ([`Owlv2ObjectDetectionOutput`]): Raw outputs of the model. target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original image size (before any data augmentation). For visualization, this should be the image size after data augment, but before padding. Returns: `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image in the batch as predicted by the model. """ # TODO: (amy) add support for other frameworks warnings.warn( "`post_process` is deprecated and will be removed in v5 of Transformers, please use" " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.", FutureWarning, ) logits, boxes = outputs.logits, outputs.pred_boxes if len(logits) != len(target_sizes): raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") if target_sizes.shape[1] != 2: raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") probs = torch.max(logits, dim=-1) scores = torch.sigmoid(probs.values) labels = probs.indices # Convert to [x0, y0, x1, y1] format boxes = center_to_corners_format(boxes) # Convert from relative [0, 1] to absolute [0, height] coordinates img_h, img_w = target_sizes.unbind(1) scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) boxes = boxes * scale_fct[:, None, :] results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] return results def post_process_object_detection( self, outputs: "Owlv2ObjectDetectionOutput", threshold: float = 0.1, target_sizes: Optional[Union[TensorType, list[tuple]]] = None, ): """ Converts the raw output of [`Owlv2ForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format. Args: outputs ([`Owlv2ObjectDetectionOutput`]): Raw outputs of the model. threshold (`float`, *optional*, defaults to 0.1): Score threshold to keep object detection predictions. target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*): Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size `(height, width)` of each image in the batch. If unset, predictions will not be resized. Returns: `list[Dict]`: A list of dictionaries, each dictionary containing the following keys: - "scores": The confidence scores for each predicted box on the image. - "labels": Indexes of the classes predicted by the model on the image. - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format. """ batch_logits, batch_boxes = outputs.logits, outputs.pred_boxes batch_size = len(batch_logits) if target_sizes is not None and len(target_sizes) != batch_size: raise ValueError("Make sure that you pass in as many target sizes as images") # batch_logits of shape (batch_size, num_queries, num_classes) batch_class_logits = torch.max(batch_logits, dim=-1) batch_scores = torch.sigmoid(batch_class_logits.values) batch_labels = batch_class_logits.indices # Convert to [x0, y0, x1, y1] format batch_boxes = center_to_corners_format(batch_boxes) # Convert from relative [0, 1] to absolute [0, height] coordinates if target_sizes is not None: batch_boxes = _scale_boxes(batch_boxes, target_sizes) results = [] for scores, labels, boxes in zip(batch_scores, batch_labels, batch_boxes): keep = scores > threshold scores = scores[keep] labels = labels[keep] boxes = boxes[keep] results.append({"scores": scores, "labels": labels, "boxes": boxes}) return results def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_threshold=0.3, target_sizes=None): """ Converts the output of [`Owlv2ForObjectDetection.image_guided_detection`] into the format expected by the COCO api. Args: outputs ([`Owlv2ImageGuidedObjectDetectionOutput`]): Raw outputs of the model. threshold (`float`, *optional*, defaults to 0.0): Minimum confidence threshold to use to filter out predicted boxes. nms_threshold (`float`, *optional*, defaults to 0.3): IoU threshold for non-maximum suppression of overlapping boxes. target_sizes (`torch.Tensor`, *optional*): Tensor of shape (batch_size, 2) where each entry is the (height, width) of the corresponding image in the batch. If set, predicted normalized bounding boxes are rescaled to the target sizes. If left to None, predictions will not be unnormalized. Returns: `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image in the batch as predicted by the model. All labels are set to None as `Owlv2ForObjectDetection.image_guided_detection` perform one-shot object detection. """ logits, target_boxes = outputs.logits, outputs.target_pred_boxes if target_sizes is not None and len(logits) != len(target_sizes): raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") if target_sizes is not None and target_sizes.shape[1] != 2: raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") probs = torch.max(logits, dim=-1) scores = torch.sigmoid(probs.values) # Convert to [x0, y0, x1, y1] format target_boxes = center_to_corners_format(target_boxes) # Apply non-maximum suppression (NMS) if nms_threshold < 1.0: for idx in range(target_boxes.shape[0]): for i in torch.argsort(-scores[idx]): if not scores[idx][i]: continue ious = box_iou(target_boxes[idx][i, :].unsqueeze(0), target_boxes[idx])[0][0] ious[i] = -1.0 # Mask self-IoU. scores[idx][ious > nms_threshold] = 0.0 # Convert from relative [0, 1] to absolute [0, height] coordinates if target_sizes is not None: target_boxes = _scale_boxes(target_boxes, target_sizes) # Compute box display alphas based on prediction scores results = [] alphas = torch.zeros_like(scores) for idx in range(target_boxes.shape[0]): # Select scores for boxes matching the current query: query_scores = scores[idx] if not query_scores.nonzero().numel(): continue # Apply threshold on scores before scaling query_scores[query_scores < threshold] = 0.0 # Scale box alpha such that the best box for each query has alpha 1.0 and the worst box has alpha 0.1. # All other boxes will either belong to a different query, or will not be shown. max_score = torch.max(query_scores) + 1e-6 query_alphas = (query_scores - (max_score * 0.1)) / (max_score * 0.9) query_alphas = torch.clip(query_alphas, 0.0, 1.0) alphas[idx] = query_alphas mask = alphas[idx] > 0 box_scores = alphas[idx][mask] boxes = target_boxes[idx][mask] results.append({"scores": box_scores, "labels": None, "boxes": boxes}) return results def __init__(self, **kwargs: Unpack[Owlv2FastImageProcessorKwargs]): super().__init__(**kwargs) @auto_docstring def preprocess(self, images: ImageInput, **kwargs: Unpack[Owlv2FastImageProcessorKwargs]): return super().preprocess(images, **kwargs) def _pad_images(self, images: "torch.Tensor", constant_value: float = 0.5) -> "torch.Tensor": """ Pad an image with zeros to the given size. """ height, width = images.shape[-2:] size = max(height, width) pad_bottom = size - height pad_right = size - width padding = (0, 0, pad_right, pad_bottom) padded_image = F.pad(images, padding, fill=constant_value) return padded_image def pad( self, images: list["torch.Tensor"], disable_grouping: Optional[bool], constant_value: float = 0.5, **kwargs, ) -> list["torch.Tensor"]: """ Unlike the Base class `self.pad` where all images are padded to the maximum image size, Owlv2 pads an image to square. """ grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) processed_images_grouped = {} for shape, stacked_images in grouped_images.items(): stacked_images = self._pad_images( stacked_images, constant_value=constant_value, ) processed_images_grouped[shape] = stacked_images processed_images = reorder_images(processed_images_grouped, grouped_images_index) return processed_images def resize( self, image: "torch.Tensor", size: SizeDict, anti_aliasing: bool = True, anti_aliasing_sigma=None, **kwargs, ) -> "torch.Tensor": """ Resize an image as per the original implementation. Args: image (`Tensor`): Image to resize. size (`dict[str, int]`): Dictionary containing the height and width to resize the image to. anti_aliasing (`bool`, *optional*, defaults to `True`): Whether to apply anti-aliasing when downsampling the image. anti_aliasing_sigma (`float`, *optional*, defaults to `None`): Standard deviation for Gaussian kernel when downsampling the image. If `None`, it will be calculated automatically. """ output_shape = (size.height, size.width) input_shape = image.shape # select height and width from input tensor factors = torch.tensor(input_shape[2:]).to(image.device) / torch.tensor(output_shape).to(image.device) if anti_aliasing: if anti_aliasing_sigma is None: anti_aliasing_sigma = ((factors - 1) / 2).clamp(min=0) else: anti_aliasing_sigma = torch.atleast_1d(anti_aliasing_sigma) * torch.ones_like(factors) if torch.any(anti_aliasing_sigma < 0): raise ValueError("Anti-aliasing standard deviation must be greater than or equal to zero") elif torch.any((anti_aliasing_sigma > 0) & (factors <= 1)): warnings.warn( "Anti-aliasing standard deviation greater than zero but not down-sampling along all axes" ) if torch.any(anti_aliasing_sigma == 0): filtered = image else: kernel_sizes = 2 * torch.ceil(3 * anti_aliasing_sigma).int() + 1 filtered = F.gaussian_blur( image, (kernel_sizes[0], kernel_sizes[1]), sigma=anti_aliasing_sigma.tolist() ) else: filtered = image out = F.resize(filtered, size=(size.height, size.width), antialias=False) return out def _preprocess( self, images: list["torch.Tensor"], do_resize: bool, size: SizeDict, interpolation: Optional["F.InterpolationMode"], do_pad: bool, do_rescale: bool, rescale_factor: float, do_normalize: bool, image_mean: Optional[Union[float, list[float]]], image_std: Optional[Union[float, list[float]]], disable_grouping: Optional[bool], return_tensors: Optional[Union[str, TensorType]], **kwargs, ) -> BatchFeature: # Group images by size for batched resizing grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping) processed_images_grouped = {} for shape, stacked_images in grouped_images.items(): # Rescale images before other operations as done in original implementation stacked_images = self.rescale_and_normalize( stacked_images, do_rescale, rescale_factor, False, image_mean, image_std ) processed_images_grouped[shape] = stacked_images processed_images = reorder_images(processed_images_grouped, grouped_images_index) if do_pad: processed_images = self.pad(processed_images, constant_value=0.5, disable_grouping=disable_grouping) grouped_images, grouped_images_index = group_images_by_shape( processed_images, disable_grouping=disable_grouping ) resized_images_grouped = {} for shape, stacked_images in grouped_images.items(): if do_resize: resized_stack = self.resize( image=stacked_images, size=size, interpolation=interpolation, input_data_format=ChannelDimension.FIRST, ) resized_images_grouped[shape] = resized_stack resized_images = reorder_images(resized_images_grouped, grouped_images_index) # Group images by size for further processing # Needed in case do_resize is False, or resize returns images with different sizes grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping) processed_images_grouped = {} for shape, stacked_images in grouped_images.items(): # Fused rescale and normalize stacked_images = self.rescale_and_normalize( stacked_images, False, rescale_factor, do_normalize, image_mean, image_std ) processed_images_grouped[shape] = stacked_images processed_images = reorder_images(processed_images_grouped, grouped_images_index) processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors) __all__ = ["Owlv2ImageProcessorFast"]