# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from src/transformers/models/hgnet_v2/modular_hgnet_v2.py.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_hgnet_v2.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# coding=utf-8
# Copyright 2025 Baidu Inc and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ...configuration_utils import PretrainedConfig
from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices


# TODO: Modular conversion for resnet must be fixed as
# it provides an incorrect import for configurations like resnet_resnet
class HGNetV2Config(BackboneConfigMixin, PretrainedConfig):
    """
    This is the configuration class to store the configuration of a [`HGNetV2Backbone`]. It is used to instantiate a
    HGNet-V2 model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the D-FINE-X-COCO backbone
    [ustc-community/dfine_x_coco](https://huggingface.co/ustc-community/dfine_x_coco).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        embedding_size (`int`, *optional*, defaults to 64):
            Dimensionality (hidden size) for the embedding layer.
        depths (`list[int]`, *optional*, defaults to `[3, 4, 6, 3]`):
            Depth (number of layers) for each stage.
        hidden_sizes (`list[int]`, *optional*, defaults to `[256, 512, 1024, 2048]`):
            Dimensionality (hidden size) at each stage.
        hidden_act (`str`, *optional*, defaults to `"relu"`):
            The non-linear activation function in each block. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"`
            are supported.
        out_features (`list[str]`, *optional*):
            If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
            corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
            same order as defined in the `stage_names` attribute.
        out_indices (`list[int]`, *optional*):
            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
            If unset and `out_features` is unset, will default to the last stage. Must be in the same order as defined
            in the `stage_names` attribute.
        stem_channels (`list[int]`, *optional*, defaults to `[3, 32, 48]`):
            Channel dimensions for the stem layers:
            - First number (3) is input image channels
            - Second number (32) is intermediate stem channels
            - Third number (48) is output stem channels
        stage_in_channels (`list[int]`, *optional*, defaults to `[48, 128, 512, 1024]`):
            Input channel dimensions for each stage of the backbone. This defines how many channels the input to each
            stage will have.
        stage_mid_channels (`list[int]`, *optional*, defaults to `[48, 96, 192, 384]`):
            Mid-channel dimensions for each stage of the backbone. This defines the number of channels used in the
            intermediate layers of each stage.
        stage_out_channels (`list[int]`, *optional*, defaults to `[128, 512, 1024, 2048]`):
            Output channel dimensions for each stage of the backbone. This defines how many channels the output of
            each stage will have.
        stage_num_blocks (`list[int]`, *optional*, defaults to `[1, 1, 3, 1]`):
            Number of blocks to be used in each stage of the backbone. This controls the depth of each stage by
            specifying how many convolutional blocks to stack.
        stage_downsample (`list[bool]`, *optional*, defaults to `[False, True, True, True]`):
            Indicates whether to downsample the feature maps at each stage. If `True`, the spatial dimensions of the
            feature maps will be reduced.
        stage_light_block (`list[bool]`, *optional*, defaults to `[False, False, True, True]`):
            Indicates whether to use light blocks in each stage. Light blocks are a variant of convolutional blocks
            that may have fewer parameters.
        stage_kernel_size (`list[int]`, *optional*, defaults to `[3, 3, 5, 5]`):
            Kernel sizes for the convolutional layers in each stage.
        stage_numb_of_layers (`list[int]`, *optional*, defaults to `[6, 6, 6, 6]`):
            Number of layers to be used in each block of the stage.
        use_learnable_affine_block (`bool`, *optional*, defaults to `False`):
            Whether to use Learnable Affine Blocks (LAB) in the network. LAB adds learnable scale and bias parameters
            after certain operations.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
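
    Example (a minimal usage sketch; it assumes `HGNetV2Config` and `HGNetV2Backbone` are exported from the top-level
    `transformers` namespace, as is conventional for model classes):

    ```python
    >>> from transformers import HGNetV2Config, HGNetV2Backbone

    >>> # Initializing a HGNet-V2 configuration with the default values
    >>> configuration = HGNetV2Config()

    >>> # Selecting intermediate feature maps by stage name; since index 0 is the stem,
    >>> # this is equivalent to passing `out_indices=[2, 4]`
    >>> configuration = HGNetV2Config(out_features=["stage2", "stage4"])

    >>> # Initializing a backbone (with random weights) from the configuration
    >>> model = HGNetV2Backbone(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```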
""" model_type = "hgnet_v2" def __init__( self, num_channels=3, embedding_size=64, depths=[3, 4, 6, 3], hidden_sizes=[256, 512, 1024, 2048], hidden_act="relu", out_features=None, out_indices=None, stem_channels=[3, 32, 48], stage_in_channels=[48, 128, 512, 1024], stage_mid_channels=[48, 96, 192, 384], stage_out_channels=[128, 512, 1024, 2048], stage_num_blocks=[1, 1, 3, 1], stage_downsample=[False, True, True, True], stage_light_block=[False, False, True, True], stage_kernel_size=[3, 3, 5, 5], stage_numb_of_layers=[6, 6, 6, 6], use_learnable_affine_block=False, initializer_range=0.02, **kwargs, ): super().__init__(**kwargs) self.num_channels = num_channels self.embedding_size = embedding_size self.depths = depths self.hidden_sizes = hidden_sizes self.hidden_act = hidden_act self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] self._out_features, self._out_indices = get_aligned_output_features_output_indices( out_features=out_features, out_indices=out_indices, stage_names=self.stage_names ) self.stem_channels = stem_channels self.stage_in_channels = stage_in_channels self.stage_mid_channels = stage_mid_channels self.stage_out_channels = stage_out_channels self.stage_num_blocks = stage_num_blocks self.stage_downsample = stage_downsample self.stage_light_block = stage_light_block self.stage_kernel_size = stage_kernel_size self.stage_numb_of_layers = stage_numb_of_layers self.use_learnable_affine_block = use_learnable_affine_block self.initializer_range = initializer_range if not ( len(stage_in_channels) == len(stage_mid_channels) == len(stage_out_channels) == len(stage_num_blocks) == len(stage_downsample) == len(stage_light_block) == len(stage_kernel_size) == len(stage_numb_of_layers) ): raise ValueError("All stage configuration lists must have the same length.") __all__ = ["HGNetV2Config"]