# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from src/transformers/models/florence2/modular_florence2.py.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_florence2.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# coding=utf-8
# Copyright 2025 Microsoft and the HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from typing import Any, Optional, Union

import numpy as np

from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...utils import is_torch_available, logging


if is_torch_available():
    import torch


logger = logging.get_logger(__name__)


class Florence2ProcessorKwargs(ProcessingKwargs, total=False):
    _defaults = {
        "text_kwargs": {"padding": False, "return_mm_token_type_ids": False},
        "images_kwargs": {},
    }


class Florence2Processor(ProcessorMixin):
    r"""
    Constructs a Florence2 processor which wraps a Florence2 image processor and a Florence2 tokenizer into a single
    processor.

    [`Florence2Processor`] offers all the functionalities of [`AutoImageProcessor`] and [`BartTokenizerFast`]. See the
    [`~Florence2Processor.__call__`] and [`~Florence2Processor.decode`] for more information.

    Args:
        image_processor (`AutoImageProcessor`, *optional*):
            The image processor is a required input.
        tokenizer (`Union[BartTokenizer, BartTokenizerFast]`, *optional*):
            The tokenizer is a required input.
        num_additional_image_tokens (`int`, *optional*, defaults to 0):
            Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS
            or other extra tokens appended, no need to set this arg.
        post_processor_config (`dict`, *optional*):
            Task-specific parsing rules for [`Florence2PostProcessor`], e.g. regex patterns, thresholds, or banned
            tokens.
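
    Example (an illustrative sketch, not a guaranteed recipe: the checkpoint id is an assumption and the returned
    keys depend on the checkpoint's image processor):

    ```python
    >>> from PIL import Image
    >>> from transformers import AutoProcessor

    >>> processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base")  # assumed checkpoint id
    >>> image = Image.new("RGB", (640, 480))

    >>> # A task token such as "<OD>" is expanded into its natural-language prompt before tokenization.
    >>> inputs = processor(images=image, text="<OD>", return_tensors="pt")
    >>> sorted(inputs.keys())  # doctest: +SKIP
    ```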
""" attributes = ["image_processor", "tokenizer"] image_processor_class = "AutoImageProcessor" tokenizer_class = ("BartTokenizer", "BartTokenizerFast") def __init__( self, image_processor=None, tokenizer=None, num_additional_image_tokens: int = 0, post_processor_config: Optional[dict] = None, **kwargs, ): self.tasks_answer_post_processing_type = { "": "pure_text", "": "ocr", "": "pure_text", "": "pure_text", "": "pure_text", "": "description_with_bboxes", "": "description_with_bboxes", "": "phrase_grounding", "": "polygons", "": "polygons", "": "description_with_bboxes_or_polygons", "": "pure_text", "": "pure_text", "": "pure_text", "": "bboxes", } self.task_prompts_without_inputs = { "": "What is the text in the image?", "": "What is the text in the image, with regions?", "": "What does the image describe?", "": "Describe in detail what is shown in the image.", "": "Describe with a paragraph what is shown in the image.", "": "Locate the objects with category name in the image.", "": "Locate the objects in the image, with their descriptions.", "": "Locate the region proposals in the image.", } self.task_prompts_with_input = { "": "Locate the phrases in the caption: {input}", "": "Locate {input} in the image with mask", "": "What is the polygon mask of region {input}", "": "Locate {input} in the image.", "": "What is the region {input}?", "": "What does the region {input} describe?", "": "What text is in the region {input}?", } self.num_image_tokens = image_processor.image_seq_length self.num_additional_image_tokens = num_additional_image_tokens self.post_processor_config = post_processor_config self.post_processor = Florence2PostProcessor(config=post_processor_config, tokenizer=tokenizer) self.image_token = tokenizer.image_token self.image_token_id = tokenizer.image_token_id super().__init__(image_processor, tokenizer, **kwargs) def _construct_prompts(self, text: Union[str, list[str]]) -> list[str]: """ Construct prompts by replacing task tokens with corresponding prompt strings. """ if isinstance(text, str): text = [text] prompts = [] for prompt in text: # Check for tasks without inputs for task_token, task_prompt in self.task_prompts_without_inputs.items(): if task_token in prompt: if prompt != task_token: raise ValueError(f"Task token {task_token} should be the only content in the prompt.") prompt = task_prompt break # Check for tasks with inputs for task_token, task_prompt in self.task_prompts_with_input.items(): if task_token in prompt: input_text = prompt.replace(task_token, "").strip() prompt = task_prompt.format(input=input_text) break prompts.append(prompt) return prompts def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, **kwargs: Unpack[Florence2ProcessorKwargs], ) -> BatchFeature: """ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` and `kwargs` arguments to BartTokenizerFast's [`~BartTokenizerFast.__call__`] if `text` is not `None` to encode the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring of the above two methods for more information. Args: images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): The image or batch of images to be prepared. 
Each image can be a PIL image, NumPy array or PyTorch tensor. Both channels-first and channels-last formats are supported. text (`str`, `list[str]`, `list[list[str]]`): The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). return_tensors (`str` or [`~utils.TensorType`], *optional*): If set, will return tensors of a particular framework. Acceptable values are: - `'tf'`: Return TensorFlow `tf.constant` objects. - `'pt'`: Return PyTorch `torch.Tensor` objects. - `'np'`: Return NumPy `np.ndarray` objects. - `'jax'`: Return JAX `jnp.ndarray` objects. Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not `None`). - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ if images is None and text is None: raise ValueError("You have to specify at least one of `images` or `text`.") output_kwargs = self._merge_kwargs( Florence2ProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, ) image_inputs = {} if images is not None: image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) if text is None: logger.warning_once("You are using Florence-2 without a text prefix.") text = [""] * (1 if not isinstance(images, list) else len(images)) elif isinstance(text, str): text = [text] if not isinstance(text, list) or not all(isinstance(token, str) for token in text): raise ValueError("`text` must be a string or list of strings.") if isinstance(images, list) and len(images) != len(text): raise ValueError(f"Number of images ({len(images)}) must match number of texts ({len(text)}).") prompt_strings = self._construct_prompts(text) # Add image tokens and special tokens if images are provided if image_inputs.get("pixel_values") is not None: # Replace the image token with the expanded image token sequence expanded_image_prompts = [] for sample in prompt_strings: sample = ( self.image_token * self.num_image_tokens + self.tokenizer.bos_token + sample + self.tokenizer.eos_token ) expanded_image_prompts.append(sample) prompt_strings = expanded_image_prompts # Construct and tokenize prompts output_kwargs["text_kwargs"].pop("add_special_tokens", None) return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False) text_inputs = self.tokenizer( prompt_strings, **output_kwargs["text_kwargs"], add_special_tokens=False, return_tensors=None ) self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"]) if return_mm_token_type_ids: array_ids = np.array(text_inputs["input_ids"]) mm_token_type_ids = np.zeros_like(text_inputs["input_ids"]) mm_token_type_ids[array_ids == self.image_token_id] = 1 text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist() return BatchFeature(data={**image_inputs, **text_inputs}, tensor_type=return_tensors) def batch_decode(self, *args, **kwargs): """ This method forwards all its arguments to BartTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. 
        Please refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to BartTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer
        to the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))

    def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
        """
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

        Args:
            image_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (height, width) per each image.

        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided input
            modalities, along with other useful data.
        """
        vision_data = {}
        if image_sizes is not None:
            num_image_tokens = [self.image_seq_length] * len(image_sizes)
            num_image_patches = [1] * len(image_sizes)
            vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
        return MultiModalData(**vision_data)

    def post_process_image_text_to_text(self, generated_outputs, skip_special_tokens=False, **kwargs):
        """
        Post-processes the output of `Florence2ForConditionalGeneration` to only return the text output.

        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
                containing the token ids of the generated sequences.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's
                `batch_decode` method.
            **kwargs:
                Additional arguments to be passed to the tokenizer's `batch_decode` method.

        Returns:
            `list[str]`: The decoded text output.
        """
        return self.batch_decode(generated_outputs, skip_special_tokens=skip_special_tokens, **kwargs)

    def post_process_generation(self, text=None, sequence=None, task=None, image_size=None) -> dict[str, Any]:
        """
        Post-process generation outputs based on the task.

        Args:
            text (`str`, *optional*):
                Generated text.
            sequence (`Union[list[int], torch.Tensor]`, *optional*):
                Generated token sequence.
            task (`str`, *optional*):
                The task for post-processing.
            image_size (`tuple[int, int]`, *optional*):
                Image size for dequantization.

        Returns:
            `dict[str, Any]`: Post-processed results keyed by task.
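
        Example (an illustrative sketch: the generated text, the location-token format and the `processor` variable
        are assumptions, not values produced by a real run):

        ```python
        >>> # Assume `generated_text` was decoded from the model with skip_special_tokens=False.
        >>> generated_text = "A green car<loc_100><loc_200><loc_300><loc_400>"
        >>> parsed = processor.post_process_generation(
        ...     text=generated_text, task="<OD>", image_size=(640, 480)
        ... )  # doctest: +SKIP
        >>> # For a detection-style task the result is keyed by the task token and contains
        >>> # parallel "bboxes" and "labels" lists.
        ```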
""" if task is None: raise ValueError("`task` must be provided for post-processing.") post_proc_type = self.tasks_answer_post_processing_type.get(task, "pure_text") parsed = self.post_processor( text=text, sequence=sequence, image_size=image_size, parse_tasks=[post_proc_type], )[post_proc_type] if post_proc_type == "pure_text": final_answer = parsed.replace("", "").replace("", "").strip() elif post_proc_type in ["description_with_bboxes", "bboxes"]: bboxes = [inst["bbox"] for inst in parsed] labels = [inst["cat_name"] for inst in parsed] final_answer = {"bboxes": bboxes, "labels": labels} if parsed and "score" in parsed[0]: final_answer["scores"] = [inst["score"] for inst in parsed] elif post_proc_type == "ocr": quad_boxes = [inst["quad_box"] for inst in parsed] labels = [inst["text"] for inst in parsed] final_answer = {"quad_boxes": quad_boxes, "labels": labels} elif post_proc_type == "phrase_grounding": bboxes = [] labels = [] for inst in parsed: for bbox in inst["bbox"]: bboxes.append(bbox) labels.append(inst["cat_name"]) final_answer = {"bboxes": bboxes, "labels": labels} elif post_proc_type in ["description_with_polygons", "polygons"]: polygons = [inst["polygons"] for inst in parsed] labels = [inst["cat_name"] for inst in parsed] final_answer = {"polygons": polygons, "labels": labels} elif post_proc_type == "description_with_bboxes_or_polygons": bboxes = [] bboxes_labels = [] polygons = [] polygons_labels = [] for inst in parsed: label = inst["cat_name"] if "polygons" in inst: polygons.append(inst["polygons"]) polygons_labels.append(label) else: bboxes.append(inst["bbox"]) bboxes_labels.append(label) final_answer = { "bboxes": bboxes, "bboxes_labels": bboxes_labels, "polygons": polygons, "polygons_labels": polygons_labels, } else: raise ValueError(f"Unknown post-processing type: {post_proc_type}") return {task: final_answer} class Florence2PostProcessor: """ Post-processor for Florence-2 model outputs. Parses generated text into structured results for various tasks like object detection, OCR, phrase grounding, etc. Args: tokenizer (`PreTrainedTokenizer`): The tokenizer used for decoding model outputs. """ def __init__(self, config, tokenizer): self.tokenizer = tokenizer self.parse_task_config = config or {} self.banned_grounding_tokens = set( self.parse_task_config.get("phrase_grounding", {}).get("banned_grounding_tokens", []) ) self.all_special_tokens = set(self.tokenizer.all_special_tokens) self.quantize_bins = (1000, 1000) def quantize(self, locations: "torch.Tensor", size: tuple[int, int]) -> "torch.Tensor": """ Quantize locations. Args: locations (`torch.Tensor`): Tensor of shape (N, 4) for boxes or (N, 2) for points/coordinates. size (`tuple[int, int]`): Original image size (width, height). Returns: `torch.Tensor`: Quantized locations as integers. 
""" bins_w, bins_h = self.quantize_bins size_w, size_h = size per_bin_w = size_w / bins_w per_bin_h = size_h / bins_h if locations.shape[-1] == 4: # Bounding boxes: [xmin, ymin, xmax, ymax] xmin, ymin, xmax, ymax = locations.split(1, dim=-1) q_xmin = (xmin / per_bin_w).floor().clamp(0, bins_w - 1) q_ymin = (ymin / per_bin_h).floor().clamp(0, bins_h - 1) q_xmax = (xmax / per_bin_w).floor().clamp(0, bins_w - 1) q_ymax = (ymax / per_bin_h).floor().clamp(0, bins_h - 1) return torch.cat([q_xmin, q_ymin, q_xmax, q_ymax], dim=-1).int() elif locations.shape[-1] == 2: # Points/coordinates: [x, y] x, y = locations.split(1, dim=-1) q_x = (x / per_bin_w).floor().clamp(0, bins_w - 1) q_y = (y / per_bin_h).floor().clamp(0, bins_h - 1) return torch.cat([q_x, q_y], dim=-1).int() else: raise ValueError(f"Unsupported location shape: last dim must be 2 or 4, got {locations.shape[-1]}.") def dequantize(self, locations: "torch.Tensor", size: tuple[int, int]) -> "torch.Tensor": """ Dequantize locations back to original scale. Args: locations (`torch.Tensor`): Quantized tensor of shape (N, 4) for boxes or (N, 2) for points/coordinates. size (`tuple[int, int]`): Original image size (width, height). Returns: `torch.Tensor`: Dequantized locations as floats. """ bins_w, bins_h = self.quantize_bins size_w, size_h = size per_bin_w = size_w / bins_w per_bin_h = size_h / bins_h # Add 0.5 to use the center position of the bin as the coordinate. if locations.shape[-1] == 4: # Bounding boxes xmin, ymin, xmax, ymax = locations.split(1, dim=-1) dq_xmin = (xmin + 0.5) * per_bin_w dq_ymin = (ymin + 0.5) * per_bin_h dq_xmax = (xmax + 0.5) * per_bin_w dq_ymax = (ymax + 0.5) * per_bin_h return torch.cat([dq_xmin, dq_ymin, dq_xmax, dq_ymax], dim=-1).int() elif locations.shape[-1] == 2: # Points/coordinates x, y = locations.split(1, dim=-1) dq_x = (x + 0.5) * per_bin_w dq_y = (y + 0.5) * per_bin_h return torch.cat([dq_x, dq_y], dim=-1).int() else: raise ValueError(f"Unsupported location shape: last dim must be 2 or 4, got {locations.shape[-1]}.") def decode_with_spans(self, token_ids: list[int]) -> tuple[str, list[tuple[int, int]]]: """ Decode token IDs to text and compute character spans. Args: token_ids (`list[int]`): list of token IDs to decode. Returns: `tuple[str, list[tuple[int, int]]]`: Decoded text and list of spans (start, end) for each token. """ filtered_tokens = self.tokenizer.convert_ids_to_tokens(token_ids, skip_special_tokens=False) text = "" spans = [] for token in filtered_tokens: if token in self.all_special_tokens: sub_text = token else: sub_text = self.tokenizer.convert_tokens_to_string([token]) span = (len(text), len(text) + len(sub_text)) text += sub_text spans.append(span) return text, spans def parse_ocr_from_text_and_spans( self, text: str, pattern: Optional[str], image_size: tuple[int, int], area_threshold: float = 0.0 ) -> list[dict[str, Any]]: """ Parse OCR results with quadrilateral boxes. Args: text (`str`): The generated text. pattern (`str`): Regex pattern for matching. image_size (`tuple[int, int]`): Image size (width, height). area_threshold (`float`, *optional*, defaults to 0.0): Minimum area threshold for filtering boxes. Returns: `list[dict[str, Any]]`: list of instances with 'quad_box' and 'text'. 
""" text = text.replace("", "").replace("", "").replace("", "") if pattern is None: pattern = r"(.+?)" matches = re.findall(pattern, text) instances = [] width, height = image_size for content, *quad_str in matches: quad_bins = [int(i) for i in quad_str] quad_box = self.dequantize(torch.tensor(quad_bins).reshape(-1, 2), size=image_size).flatten().tolist() if area_threshold > 0: x_coords = quad_box[0::2] y_coords = quad_box[1::2] # Apply the Shoelace formula area = 0.5 * abs( sum(x_coords[i] * y_coords[i + 1] - x_coords[i + 1] * y_coords[i] for i in range(4 - 1)) ) if area < (width * height) * area_threshold: continue instances.append({"quad_box": quad_box, "text": content.strip()}) return instances def parse_phrase_grounding_from_text_and_spans( self, text: str, image_size: tuple[int, int] ) -> list[dict[str, Any]]: """ Parse phrase grounding results. Args: text (`str`): The generated text. image_size (`tuple[int, int]`): Image size (width, height). Returns: `list[dict[str, Any]]`: list of instances with 'bbox' and 'cat_name'. """ text = text.replace("", "").replace("", "").replace("", "") phrase_pattern = r"([^<]+(?:){4,})" phrases = re.findall(phrase_pattern, text) text_pattern = r"^\s*(.*?)(?=||||||" instances = [] for phrase_text in phrases: phrase_text = phrase_text.replace("", "", 1).replace("", "", 1) if not phrase_text: continue match = re.search(text_pattern, phrase_text) if not match: continue phrase = match.group().strip() if phrase in self.banned_grounding_tokens: continue boxes_matches = list(re.finditer(box_pattern, phrase_text)) if not boxes_matches: continue bbox_bins = [[int(m.group(j)) for j in range(1, 5)] for m in boxes_matches] bboxes = self.dequantize(torch.tensor(bbox_bins), size=image_size).tolist() phrase = phrase.encode("ascii", "ignore").decode("ascii") instances.append({"bbox": bboxes, "cat_name": phrase}) return instances def _find_matched_token_indices(self, cur_span: tuple[int, int], token_spans: list[tuple[int, int]]) -> list[int]: return [i for i, span in enumerate(token_spans) if not (span[1] <= cur_span[0] or span[0] >= cur_span[1])] def parse_description_with_bboxes_from_text_and_spans( self, text: str, image_size: tuple[int, int], allow_empty_phrase: bool = False, ) -> list[dict[str, Any]]: """ Parse descriptions with bounding boxes. Args: text (`str`): The generated text. image_size (`tuple[int, int]`): Image size (width, height). allow_empty_phrase (`bool`, *optional*, defaults to `False`): Allow phrases without text. Returns: `list[dict[str, Any]]`: list of instances with 'bbox', 'cat_name', and optional 'score'. 
""" text = text.replace("", "").replace("", "").replace("", "") if allow_empty_phrase: pattern = r"(?:(?:){4,})" else: pattern = r"([^<]+(?:){4,})" phrases = re.findall(pattern, text) text_pattern = r"^\s*(.*?)(?=||||||" instances = [] for phrase_text in phrases: phrase_text = phrase_text.replace("", "", 1).replace("", "", 1) if not phrase_text and not allow_empty_phrase: continue match = re.search(text_pattern, phrase_text) if not match: continue phrase = match.group().strip() boxes_matches = list(re.finditer(box_pattern, phrase_text)) if not boxes_matches: continue bbox_bins = [[int(m.group(j)) for j in range(1, 5)] for m in boxes_matches] bboxes = self.dequantize(torch.tensor(bbox_bins), size=image_size).tolist() phrase = phrase.encode("ascii", "ignore").decode("ascii") for bbox in bboxes: instance = {"bbox": bbox, "cat_name": phrase} instances.append(instance) return instances def parse_description_with_polygons_from_text_and_spans( self, text: str, image_size: tuple[int, int], allow_empty_phrase: bool = False, polygon_sep_token: str = "", polygon_start_token: str = "", polygon_end_token: str = "", with_box_at_start: bool = False, ) -> list[dict[str, Any]]: """ Parse descriptions with polygons. Args: text (`str`): The generated text. image_size (`tuple[int, int]`): Image size (width, height). allow_empty_phrase (`bool`, *optional*, defaults to `False`): Allow phrases without text. polygon_sep_token (`str`, *optional*, defaults to ""): Token separating polygons. polygon_start_token (`str`, *optional*, defaults to ""): Start token for polygons. polygon_end_token (`str`, *optional*, defaults to ""): End token for polygons. with_box_at_start (`bool`, *optional*, defaults to `False`): Whether a bounding box is at the start of polygons. Returns: `list[dict[str, Any]]`: list of instances with 'polygons', 'cat_name', and optional 'bbox'. 
""" text = text.replace("", "").replace("", "").replace("", "") if allow_empty_phrase: pattern = rf"(?:(?:|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})" else: pattern = rf"([^<]+(?:|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})" phrases = re.findall(pattern, text) phrase_pattern = r"^\s*(.*?)(?=||||||)" poly_instance_pattern = rf"{re.escape(polygon_start_token)}(.*?){re.escape(polygon_end_token)}" box_pattern = rf"((?:)+)(?:{re.escape(polygon_sep_token)}|$)" instances = [] for phrase_text in phrases: phrase_text_strip = re.sub(r"^", "", phrase_text, count=1) if not phrase_text_strip and not allow_empty_phrase: continue match = re.search(phrase_pattern, phrase_text_strip) if not match: continue phrase = match.group().strip() if polygon_start_token in phrase_text and polygon_end_token in phrase_text: poly_instances = [m.group(1) for m in re.finditer(poly_instance_pattern, phrase_text)] else: poly_instances = [phrase_text] for poly_inst in poly_instances: poly_matches = list(re.finditer(box_pattern, poly_inst)) if len(poly_matches) == 0: continue bbox = [] polygons = [] for poly_match in poly_matches: poly_str = poly_match.group(1) poly_bins = [int(m.group(1)) for m in re.finditer(r"", poly_str)] if with_box_at_start and not bbox: if len(poly_bins) > 4: bbox = poly_bins[:4] poly_bins = poly_bins[4:] else: bbox = [0, 0, 0, 0] if len(poly_bins) % 2 == 1: poly_bins = poly_bins[:-1] poly_coords = ( self.dequantize(torch.tensor(poly_bins).reshape(-1, 2), size=image_size).flatten().tolist() ) polygons.append(poly_coords) instance = {"cat_name": phrase, "polygons": polygons} if bbox: instance["bbox"] = self.dequantize(torch.tensor([bbox]), size=image_size)[0].tolist() instances.append(instance) return instances def __call__(self, text=None, sequence=None, image_size=None, parse_tasks=None) -> dict[str, Any]: """ Process model output and parse into task-specific results. Args: text (`Optional[str]`, *optional*): Generated text. Either this or `sequence` must be provided. sequence (`Optional[Union[list[int], torch.Tensor]]`, *optional*): Token sequence. Either this or `text` must be provided. image_size (`Optional[tuple[int, int]]`, *optional*): Image size (width, height) required for dequantization. parse_tasks (`Optional[Union[str, list[str]]]`, *optional*): Specific tasks to parse. If None, parse all supported tasks. Returns: `dict[str, Any]`: Parsed results for each task, including the raw 'text'. 
""" if parse_tasks is not None: parse_tasks = [parse_tasks] if isinstance(parse_tasks, str) else parse_tasks for task in parse_tasks: if task not in self.parse_task_config.keys(): raise ValueError(f"Unsupported parse task: {task}") if (text is None and sequence is None) or (text is not None and sequence is not None): raise ValueError("Exactly one of 'text' or 'sequence' must be provided.") if sequence is not None: if isinstance(sequence, torch.Tensor): sequence = sequence.tolist() sequence = sequence[1:] if sequence[0] == self.tokenizer.bos_token_id else sequence # Skip BOS if present text, _ = self.decode_with_spans(sequence) parsed_dict = {"text": text} tasks_to_parse = parse_tasks or self.parse_task_config.keys() for task in tasks_to_parse: config = self.parse_task_config[task] pattern = config.get("PATTERN") if task == "ocr": parsed_dict["ocr"] = self.parse_ocr_from_text_and_spans( text, pattern=pattern, image_size=image_size, area_threshold=config.get("AREA_THRESHOLD", 0.0) ) elif task == "phrase_grounding": parsed_dict["phrase_grounding"] = self.parse_phrase_grounding_from_text_and_spans( text, image_size=image_size ) elif task == "pure_text": parsed_dict["pure_text"] = text elif task == "description_with_bboxes": parsed_dict["description_with_bboxes"] = self.parse_description_with_bboxes_from_text_and_spans( text, image_size=image_size ) elif task == "description_with_polygons": parsed_dict["description_with_polygons"] = self.parse_description_with_polygons_from_text_and_spans( text, image_size=image_size ) elif task == "polygons": parsed_dict["polygons"] = self.parse_description_with_polygons_from_text_and_spans( text, image_size=image_size, allow_empty_phrase=True ) elif task == "bboxes": parsed_dict["bboxes"] = self.parse_description_with_bboxes_from_text_and_spans( text, image_size=image_size, allow_empty_phrase=True ) elif task == "description_with_bboxes_or_polygons": if "" in text: instances = self.parse_description_with_polygons_from_text_and_spans(text, image_size=image_size) else: instances = self.parse_description_with_bboxes_from_text_and_spans(text, image_size=image_size) parsed_dict["description_with_bboxes_or_polygons"] = instances else: raise ValueError(f"task {task} is not supported") return parsed_dict __all__ = ["Florence2Processor"]