# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""OmDet-Turbo model configuration"""

from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ...utils.backbone_utils import verify_backbone_config_arguments
from ..auto import CONFIG_MAPPING


logger = logging.get_logger(__name__)


class OmDetTurboConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`OmDetTurboForObjectDetection`]. It is used to
    instantiate an OmDet-Turbo model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the OmDet-Turbo
    [omlab/omdet-turbo-swin-tiny-hf](https://huggingface.co/omlab/omdet-turbo-swin-tiny-hf) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`PretrainedConfig`, *optional*):
            The configuration of the text backbone.
        backbone_config (`PretrainedConfig`, *optional*):
            The configuration of the vision backbone.
        use_timm_backbone (`bool`, *optional*, defaults to `True`):
            Whether to use the timm library for the vision backbone.
        backbone (`str`, *optional*, defaults to `"swin_tiny_patch4_window7_224"`):
            The name of the pretrained vision backbone to use. If `use_pretrained_backbone=False`, a randomly
            initialized backbone with the same architecture as `backbone` is used.
        backbone_kwargs (`dict`, *optional*):
            Additional kwargs for the vision backbone.
        use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
            Whether to use a pretrained vision backbone.
        apply_layernorm_after_vision_backbone (`bool`, *optional*, defaults to `True`):
            Whether to apply layer normalization to the output feature maps of the vision backbone.
        image_size (`int`, *optional*, defaults to 640):
            The size (resolution) of each image.
        disable_custom_kernels (`bool`, *optional*, defaults to `False`):
            Whether to disable custom kernels.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon value for layer normalization.
        batch_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon value for batch normalization.
        init_std (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        text_projection_in_dim (`int`, *optional*, defaults to 512):
            The input dimension for the text projection.
        text_projection_out_dim (`int`, *optional*, defaults to 512):
            The output dimension for the text projection.
        task_encoder_hidden_dim (`int`, *optional*, defaults to 1024):
            The feedforward dimension for the task encoder.
        class_embed_dim (`int`, *optional*, defaults to 512):
            The dimension of the class embeddings.
        class_distance_type (`str`, *optional*, defaults to `"cosine"`):
            The type of distance used to compare the predicted class embeddings to the projected class embeddings.
            Can be `"cosine"` or `"dot"`.
        num_queries (`int`, *optional*, defaults to 900):
            The number of queries.
        csp_activation (`str`, *optional*, defaults to `"silu"`):
            The activation function of the Cross Stage Partial (CSP) networks of the encoder.
        conv_norm_activation (`str`, *optional*, defaults to `"gelu"`):
            The activation function of the ConvNormLayer layers of the encoder.
        encoder_feedforward_activation (`str`, *optional*, defaults to `"relu"`):
            The activation function for the feedforward network of the encoder.
        encoder_feedforward_dropout (`float`, *optional*, defaults to 0.0):
            The dropout rate following the activation of the encoder feedforward network.
        encoder_dropout (`float`, *optional*, defaults to 0.0):
            The dropout rate of the encoder multi-head attention module.
        hidden_expansion (`int`, *optional*, defaults to 1):
            The hidden expansion of the CSP networks in the encoder.
        vision_features_channels (`tuple(int)`, *optional*, defaults to `[256, 256, 256]`):
            The projected vision features channels used as inputs for the decoder.
        encoder_hidden_dim (`int`, *optional*, defaults to 256):
            The hidden dimension of the encoder.
        encoder_in_channels (`List(int)`, *optional*, defaults to `[192, 384, 768]`):
            The input channels for the encoder.
        encoder_projection_indices (`List(int)`, *optional*, defaults to `[2]`):
            The indices of the input features projected by each layer.
        encoder_attention_heads (`int`, *optional*, defaults to 8):
            The number of attention heads for the encoder.
        encoder_dim_feedforward (`int`, *optional*, defaults to 2048):
            The feedforward dimension for the encoder.
        encoder_layers (`int`, *optional*, defaults to 1):
            The number of layers in the encoder.
        positional_encoding_temperature (`int`, *optional*, defaults to 10000):
            The positional encoding temperature in the encoder.
        num_feature_levels (`int`, *optional*, defaults to 3):
            The number of feature levels for the multi-scale deformable attention module of the decoder.
        decoder_hidden_dim (`int`, *optional*, defaults to 256):
            The hidden dimension of the decoder.
        decoder_num_heads (`int`, *optional*, defaults to 8):
            The number of heads for the decoder.
        decoder_num_layers (`int`, *optional*, defaults to 6):
            The number of layers for the decoder.
        decoder_activation (`str`, *optional*, defaults to `"relu"`):
            The activation function for the decoder.
        decoder_dim_feedforward (`int`, *optional*, defaults to 2048):
            The feedforward dimension for the decoder.
        decoder_num_points (`int`, *optional*, defaults to 4):
            The number of points sampled in the decoder multi-scale deformable attention module.
        decoder_dropout (`float`, *optional*, defaults to 0.0):
            The dropout rate for the decoder.
        eval_size (`tuple[int, int]`, *optional*):
            Height and width used to compute the effective height and width of the position embeddings after taking
            into account the stride (see RTDetr).
        learn_initial_query (`bool`, *optional*, defaults to `False`):
            Whether to learn the initial query.
        cache_size (`int`, *optional*, defaults to 100):
            The cache size for the classes and prompts caches.
        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
            Whether the model is used as an encoder-decoder model or not.
        kwargs (`dict[str, Any]`, *optional*):
            Additional parameters from the architecture. The values in kwargs will be saved as part of the
            configuration and can be used to control the model outputs.

    Examples:

    ```python
    >>> from transformers import OmDetTurboConfig, OmDetTurboForObjectDetection

    >>> # Initializing an OmDet-Turbo omlab/omdet-turbo-swin-tiny-hf style configuration
    >>> configuration = OmDetTurboConfig()

    >>> # Initializing a model (with random weights) from the omlab/omdet-turbo-swin-tiny-hf style configuration
    >>> model = OmDetTurboForObjectDetection(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
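
    A further, purely illustrative sketch of customizing the sub-configs (the argument values below are arbitrary,
    not recommended settings):

    ```python
    >>> from transformers import CLIPTextConfig, OmDetTurboConfig

    >>> # With `use_timm_backbone=False` and no `backbone_config`, a default `transformers` Swin config is built;
    >>> # `backbone=None` avoids passing both a named backbone and a backbone config.
    >>> custom_configuration = OmDetTurboConfig(
    ...     use_timm_backbone=False,
    ...     backbone=None,
    ...     text_config=CLIPTextConfig(),
    ...     class_distance_type="dot",  # must be "cosine" or "dot", otherwise a `ValueError` is raised
    ... )
    ```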
    """

    model_type = "omdet-turbo"
    attribute_map = {
        "encoder_hidden_dim": "d_model",
        "num_attention_heads": "encoder_attention_heads",
    }

    def __init__(
        self,
        text_config=None,
        backbone_config=None,
        use_timm_backbone=True,
        backbone="swin_tiny_patch4_window7_224",
        backbone_kwargs=None,
        use_pretrained_backbone=False,
        apply_layernorm_after_vision_backbone=True,
        image_size=640,
        disable_custom_kernels=False,
        layer_norm_eps=1e-5,
        batch_norm_eps=1e-5,
        init_std=0.02,
        text_projection_in_dim=512,
        text_projection_out_dim=512,
        task_encoder_hidden_dim=1024,
        class_embed_dim=512,
        class_distance_type="cosine",
        num_queries=900,
        csp_activation="silu",
        conv_norm_activation="gelu",
        encoder_feedforward_activation="relu",
        encoder_feedforward_dropout=0.0,
        encoder_dropout=0.0,
        hidden_expansion=1,
        vision_features_channels=[256, 256, 256],
        encoder_hidden_dim=256,
        encoder_in_channels=[192, 384, 768],
        encoder_projection_indices=[2],
        encoder_attention_heads=8,
        encoder_dim_feedforward=2048,
        encoder_layers=1,
        positional_encoding_temperature=10000,
        num_feature_levels=3,
        decoder_hidden_dim=256,
        decoder_num_heads=8,
        decoder_num_layers=6,
        decoder_activation="relu",
        decoder_dim_feedforward=2048,
        decoder_num_points=4,
        decoder_dropout=0.0,
        eval_size=None,
        learn_initial_query=False,
        cache_size=100,
        is_encoder_decoder=True,
        **kwargs,
    ):
        if use_timm_backbone:
            # The timm backbone is built from the `backbone` checkpoint name, so only default kwargs are set here.
            if backbone_config is None:
                backbone_kwargs = {
                    "out_indices": [1, 2, 3],
                    "img_size": image_size,
                    "always_partition": True,
                }
        elif backbone_config is None:
            logger.info("`backbone_config` is `None`. Initializing the config with the default `swin` vision config.")
            backbone_config = CONFIG_MAPPING["swin"](
                window_size=7,
                image_size=image_size,
                embed_dim=96,
                depths=[2, 2, 6, 2],
                num_heads=[3, 6, 12, 24],
                out_indices=[2, 3, 4],
            )
        elif isinstance(backbone_config, dict):
            backbone_model_type = backbone_config.get("model_type")
            config_class = CONFIG_MAPPING[backbone_model_type]
            backbone_config = config_class.from_dict(backbone_config)

        verify_backbone_config_arguments(
            use_timm_backbone=use_timm_backbone,
            use_pretrained_backbone=use_pretrained_backbone,
            backbone=backbone,
            backbone_config=backbone_config,
            backbone_kwargs=backbone_kwargs,
        )

        if text_config is None:
            logger.info(
                "`text_config` is `None`. Initializing the config with the default `clip_text_model` text config."
            )
            text_config = CONFIG_MAPPING["clip_text_model"]()
        elif isinstance(text_config, dict):
            text_model_type = text_config.get("model_type")
            text_config = CONFIG_MAPPING[text_model_type](**text_config)

        if class_distance_type not in ["cosine", "dot"]:
            raise ValueError(
                f"Invalid `class_distance_type`. It should be either `cosine` or `dot`, but got {class_distance_type}."
            )

        self.text_config = text_config
        self.backbone_config = backbone_config
        self.use_timm_backbone = use_timm_backbone
        self.backbone = backbone
        self.backbone_kwargs = backbone_kwargs
        self.use_pretrained_backbone = use_pretrained_backbone
        self.apply_layernorm_after_vision_backbone = apply_layernorm_after_vision_backbone
        self.image_size = image_size
        self.disable_custom_kernels = disable_custom_kernels
        self.layer_norm_eps = layer_norm_eps
        self.batch_norm_eps = batch_norm_eps
        self.init_std = init_std
        self.text_projection_in_dim = text_projection_in_dim
        self.text_projection_out_dim = text_projection_out_dim
        self.task_encoder_hidden_dim = task_encoder_hidden_dim
        self.class_embed_dim = class_embed_dim
        self.class_distance_type = class_distance_type
        self.num_queries = num_queries
        self.csp_activation = csp_activation
        self.conv_norm_activation = conv_norm_activation
        self.encoder_feedforward_activation = encoder_feedforward_activation
        self.encoder_feedforward_dropout = encoder_feedforward_dropout
        self.encoder_dropout = encoder_dropout
        self.hidden_expansion = hidden_expansion
        self.vision_features_channels = vision_features_channels
        self.encoder_hidden_dim = encoder_hidden_dim
        self.encoder_in_channels = encoder_in_channels
        self.encoder_projection_indices = encoder_projection_indices
        self.encoder_attention_heads = encoder_attention_heads
        self.encoder_dim_feedforward = encoder_dim_feedforward
        self.encoder_layers = encoder_layers
        self.positional_encoding_temperature = positional_encoding_temperature
        self.num_feature_levels = num_feature_levels
        self.decoder_hidden_dim = decoder_hidden_dim
        self.decoder_num_heads = decoder_num_heads
        self.decoder_num_layers = decoder_num_layers
        self.decoder_activation = decoder_activation
        self.decoder_dim_feedforward = decoder_dim_feedforward
        self.decoder_num_points = decoder_num_points
        self.decoder_dropout = decoder_dropout
        self.eval_size = eval_size
        self.learn_initial_query = learn_initial_query
        self.cache_size = cache_size
        self.is_encoder_decoder = is_encoder_decoder

        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)

    @property
    def sub_configs(self):
        sub_configs = {}
        backbone_config = getattr(self, "backbone_config", None)
        text_config = getattr(self, "text_config", None)
        if isinstance(backbone_config, PretrainedConfig):
            sub_configs["backbone_config"] = type(backbone_config)
        if isinstance(text_config, PretrainedConfig):
            sub_configs["text_config"] = type(text_config)
        return sub_configs


__all__ = ["OmDetTurboConfig"]