# coding=utf-8 # Copyright 2024 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Processor class for LLaVa-Onevision. """ import math from collections.abc import Iterable from typing import Optional, Union import numpy as np from ...feature_extraction_utils import BatchFeature from ...image_processing_utils import select_best_resolution from ...image_utils import ImageInput, get_image_size, to_numpy_array from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import logging from ...video_utils import VideoInput logger = logging.get_logger(__name__) class LlavaOnevisionProcessorKwargs(ProcessingKwargs, total=False): # see processing_utils.ProcessingKwargs documentation for usage. _defaults = { "text_kwargs": { "padding": False, "return_mm_token_type_ids": False, }, "image_kwargs": {}, "videos_kwargs": {}, } class LlavaOnevisionProcessor(ProcessorMixin): r""" Constructs a LLaVa-Onevision processor which wraps a LLaVa-Onevision video processor, LLaVa-NeXT image processor and a LLaMa tokenizer into a single processor. [`LlavaNextProcessor`] offers all the functionalities of [`LlavaOnevisionVideoProcessor`], [`LlavaOnevisionImageProcessor`] and [`LlamaTokenizerFast`]. See the [`~LlavaOnevisionVideoProcessor.__call__`], [`~LlavaNextProcessor.__call__`] and [`~LlavaNextProcessor.decode`] for more information. Args: image_processor ([`LlavaOnevisionImageProcessor`], *optional*): The image processor is a required input. tokenizer ([`LlamaTokenizerFast`], *optional*): The tokenizer is a required input. video_processor ([`LlavaOnevisionVideoProcessor`], *optional*): The video processor is a required input. num_image_tokens (`int`, *optional*): Number of image tokens for one imagethat will be returned by vision tower. vision_feature_select_strategy (`str`, *optional*): The feature selection strategy used to select the vision feature from the vision backbone. Should be same as in model's config chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string. image_token (`str`, *optional*, defaults to `""`): Special token used to denote image location. video_token (`str`, *optional*, defaults to `"