# coding=utf-8 # Copyright 2023 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Processor class for VideoLlava. """ from typing import Optional, Union import numpy as np from ...feature_extraction_utils import BatchFeature from ...image_utils import ImageInput, get_image_size, to_numpy_array from ...processing_utils import ProcessorMixin from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy from ...utils import TensorType, logging logger = logging.get_logger(__name__) class VideoLlavaProcessor(ProcessorMixin): r""" Constructs a VideoLlava processor which wraps a VideoLlava image processor and a Llava tokenizer into a single processor. [`VideoLlavaProcessor`] offers all the functionalities of [`VideoLlavaImageProcessor`] and [`LlamaTokenizerFast`]. See the [`~VideoLlavaProcessor.__call__`] and [`~VideoLlavaProcessor.decode`] for more information. Args: image_processor ([`VideoLlavaImageProcessor`], *optional*): The image processor is a required input. video_processor ([`VideoLlavaVideoProcessor`], *optional*): The video processor is a required input. tokenizer ([`LlamaTokenizerFast`], *optional*): The tokenizer is a required input. patch_size (`int`, *optional*, defaults to 14): Patch size from the vision tower. vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): The feature selection strategy used to select the vision feature from the vision backbone. Should be same as in model's config image_token (`str`, *optional*, defaults to `""`): Special token used to denote image location. video_token (`str`, *optional*, defaults to `"