# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from src/transformers/models/hgnet_v2/modular_hgnet_v2.py.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_hgnet_v2.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# coding=utf-8
# Copyright 2025 Baidu Inc and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional

import torch
import torch.nn.functional as F
from torch import Tensor, nn

from ...activations import ACT2FN
from ...modeling_outputs import BackboneOutput, BaseModelOutputWithNoAttention, ImageClassifierOutputWithNoAttention
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring
from ...utils.backbone_utils import BackboneMixin
from .configuration_hgnet_v2 import HGNetV2Config


# General docstring


@auto_docstring
class HGNetV2PreTrainedModel(PreTrainedModel):
    config: HGNetV2Config
    base_model_prefix = "hgnetv2"
    main_input_name = "pixel_values"
    _no_split_modules = ["HGNetV2BasicLayer"]


class HGNetV2LearnableAffineBlock(nn.Module):
    def __init__(self, scale_value: float = 1.0, bias_value: float = 0.0):
        super().__init__()
        self.scale = nn.Parameter(torch.tensor([scale_value]), requires_grad=True)
        self.bias = nn.Parameter(torch.tensor([bias_value]), requires_grad=True)

    def forward(self, hidden_state: Tensor) -> Tensor:
        hidden_state = self.scale * hidden_state + self.bias
        return hidden_state


class HGNetV2ConvLayer(nn.Module):
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        groups: int = 1,
        activation: str = "relu",
        use_learnable_affine_block: bool = False,
    ):
        super().__init__()
        self.convolution = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            groups=groups,
            padding=(kernel_size - 1) // 2,
            bias=False,
        )
        self.normalization = nn.BatchNorm2d(out_channels)
        self.activation = ACT2FN[activation] if activation is not None else nn.Identity()
        if activation and use_learnable_affine_block:
            self.lab = HGNetV2LearnableAffineBlock()
        else:
            self.lab = nn.Identity()

    def forward(self, input: Tensor) -> Tensor:
        hidden_state = self.convolution(input)
        hidden_state = self.normalization(hidden_state)
        hidden_state = self.activation(hidden_state)
        hidden_state = self.lab(hidden_state)
        return hidden_state


class HGNetV2ConvLayerLight(nn.Module):
    def __init__(
        self, in_channels: int, out_channels: int, kernel_size: int, use_learnable_affine_block: bool = False
    ):
        super().__init__()
        self.conv1 = HGNetV2ConvLayer(
            in_channels,
            out_channels,
            kernel_size=1,
            activation=None,
            use_learnable_affine_block=use_learnable_affine_block,
        )
        self.conv2 = HGNetV2ConvLayer(
            out_channels,
            out_channels,
            kernel_size=kernel_size,
            groups=out_channels,
            use_learnable_affine_block=use_learnable_affine_block,
        )

    def forward(self, hidden_state: Tensor) -> Tensor:
        hidden_state = self.conv1(hidden_state)
        hidden_state = self.conv2(hidden_state)
        return hidden_state
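

# A minimal usage sketch for HGNetV2ConvLayerLight (not part of the generated model code; the
# channel counts and spatial size below are illustrative assumptions): the light block factors a
# dense k x k convolution into a 1x1 pointwise projection without activation (`conv1`) followed by
# a depthwise k x k convolution (`conv2`, hence `groups=out_channels`).
#
#     light = HGNetV2ConvLayerLight(in_channels=64, out_channels=128, kernel_size=5)
#     out = light(torch.randn(1, 64, 56, 56))  # both convs keep the 56 x 56 resolution -> (1, 128, 56, 56)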


class HGNetV2Embeddings(nn.Module):
    def __init__(self, config: HGNetV2Config):
        super().__init__()

        self.stem1 = HGNetV2ConvLayer(
            config.stem_channels[0],
            config.stem_channels[1],
            kernel_size=3,
            stride=2,
            activation=config.hidden_act,
            use_learnable_affine_block=config.use_learnable_affine_block,
        )
        self.stem2a = HGNetV2ConvLayer(
            config.stem_channels[1],
            config.stem_channels[1] // 2,
            kernel_size=2,
            stride=1,
            activation=config.hidden_act,
            use_learnable_affine_block=config.use_learnable_affine_block,
        )
        self.stem2b = HGNetV2ConvLayer(
            config.stem_channels[1] // 2,
            config.stem_channels[1],
            kernel_size=2,
            stride=1,
            activation=config.hidden_act,
            use_learnable_affine_block=config.use_learnable_affine_block,
        )
        self.stem3 = HGNetV2ConvLayer(
            config.stem_channels[1] * 2,
            config.stem_channels[1],
            kernel_size=3,
            stride=2,
            activation=config.hidden_act,
            use_learnable_affine_block=config.use_learnable_affine_block,
        )
        self.stem4 = HGNetV2ConvLayer(
            config.stem_channels[1],
            config.stem_channels[2],
            kernel_size=1,
            stride=1,
            activation=config.hidden_act,
            use_learnable_affine_block=config.use_learnable_affine_block,
        )
        self.pool = nn.MaxPool2d(kernel_size=2, stride=1, ceil_mode=True)
        self.num_channels = config.num_channels

    def forward(self, pixel_values: Tensor) -> Tensor:
        num_channels = pixel_values.shape[1]
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        embedding = self.stem1(pixel_values)
        embedding = F.pad(embedding, (0, 1, 0, 1))
        emb_stem_2a = self.stem2a(embedding)
        emb_stem_2a = F.pad(emb_stem_2a, (0, 1, 0, 1))
        emb_stem_2a = self.stem2b(emb_stem_2a)
        pooled_emb = self.pool(embedding)
        embedding = torch.cat([pooled_emb, emb_stem_2a], dim=1)
        embedding = self.stem3(embedding)
        embedding = self.stem4(embedding)
        return embedding


class HGNetV2BasicLayer(nn.Module):
    def __init__(
        self,
        in_channels: int,
        middle_channels: int,
        out_channels: int,
        layer_num: int,
        kernel_size: int = 3,
        residual: bool = False,
        light_block: bool = False,
        drop_path: float = 0.0,
        use_learnable_affine_block: bool = False,
    ):
        super().__init__()
        self.residual = residual

        self.layers = nn.ModuleList()
        for i in range(layer_num):
            temp_in_channels = in_channels if i == 0 else middle_channels
            if light_block:
                block = HGNetV2ConvLayerLight(
                    in_channels=temp_in_channels,
                    out_channels=middle_channels,
                    kernel_size=kernel_size,
                    use_learnable_affine_block=use_learnable_affine_block,
                )
            else:
                block = HGNetV2ConvLayer(
                    in_channels=temp_in_channels,
                    out_channels=middle_channels,
                    kernel_size=kernel_size,
                    use_learnable_affine_block=use_learnable_affine_block,
                    stride=1,
                )
            self.layers.append(block)

        # feature aggregation
        total_channels = in_channels + layer_num * middle_channels
        aggregation_squeeze_conv = HGNetV2ConvLayer(
            total_channels,
            out_channels // 2,
            kernel_size=1,
            stride=1,
            use_learnable_affine_block=use_learnable_affine_block,
        )
        aggregation_excitation_conv = HGNetV2ConvLayer(
            out_channels // 2,
            out_channels,
            kernel_size=1,
            stride=1,
            use_learnable_affine_block=use_learnable_affine_block,
        )
        self.aggregation = nn.Sequential(
            aggregation_squeeze_conv,
            aggregation_excitation_conv,
        )
        self.drop_path = nn.Dropout(drop_path) if drop_path else nn.Identity()

    def forward(self, hidden_state: Tensor) -> Tensor:
        identity = hidden_state
        output = [hidden_state]
        for layer in self.layers:
            hidden_state = layer(hidden_state)
            output.append(hidden_state)
        hidden_state = torch.cat(output, dim=1)
        hidden_state = self.aggregation(hidden_state)
        if self.residual:
            hidden_state = self.drop_path(hidden_state) + identity
        return hidden_state
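

# A hedged channel-count sketch for HGNetV2BasicLayer (the numbers below are illustrative
# assumptions): the forward pass concatenates the block input with every intermediate output, so
# the aggregation step sees `in_channels + layer_num * middle_channels` channels before the two
# 1x1 convs squeeze to `out_channels // 2` and expand back to `out_channels`.
#
#     layer = HGNetV2BasicLayer(in_channels=64, middle_channels=32, out_channels=256, layer_num=6)
#     out = layer(torch.randn(1, 64, 56, 56))
#     # concat: 64 + 6 * 32 = 256 channels -> squeeze to 128 -> excite to 256, so out.shape == (1, 256, 56, 56)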


class HGNetV2Stage(nn.Module):
    def __init__(self, config: HGNetV2Config, stage_index: int, drop_path: float = 0.0):
        super().__init__()
        in_channels = config.stage_in_channels[stage_index]
        mid_channels = config.stage_mid_channels[stage_index]
        out_channels = config.stage_out_channels[stage_index]
        num_blocks = config.stage_num_blocks[stage_index]
        num_layers = config.stage_numb_of_layers[stage_index]
        downsample = config.stage_downsample[stage_index]
        light_block = config.stage_light_block[stage_index]
        kernel_size = config.stage_kernel_size[stage_index]
        use_learnable_affine_block = config.use_learnable_affine_block

        if downsample:
            self.downsample = HGNetV2ConvLayer(
                in_channels, in_channels, kernel_size=3, stride=2, groups=in_channels, activation=None
            )
        else:
            self.downsample = nn.Identity()

        blocks_list = []
        for i in range(num_blocks):
            blocks_list.append(
                HGNetV2BasicLayer(
                    in_channels if i == 0 else out_channels,
                    mid_channels,
                    out_channels,
                    num_layers,
                    residual=(i != 0),
                    kernel_size=kernel_size,
                    light_block=light_block,
                    drop_path=drop_path,
                    use_learnable_affine_block=use_learnable_affine_block,
                )
            )
        self.blocks = nn.ModuleList(blocks_list)

    def forward(self, hidden_state: Tensor) -> Tensor:
        hidden_state = self.downsample(hidden_state)
        for block in self.blocks:
            hidden_state = block(hidden_state)
        return hidden_state


class HGNetV2Encoder(nn.Module):
    def __init__(self, config: HGNetV2Config):
        super().__init__()
        self.stages = nn.ModuleList([])
        for stage_index in range(len(config.stage_in_channels)):
            resnet_stage = HGNetV2Stage(config, stage_index)
            self.stages.append(resnet_stage)

    def forward(
        self, hidden_state: Tensor, output_hidden_states: bool = False, return_dict: bool = True
    ) -> BaseModelOutputWithNoAttention:
        hidden_states = () if output_hidden_states else None

        for stage in self.stages:
            if output_hidden_states:
                hidden_states = hidden_states + (hidden_state,)
            hidden_state = stage(hidden_state)

        if output_hidden_states:
            hidden_states = hidden_states + (hidden_state,)

        if not return_dict:
            return tuple(v for v in [hidden_state, hidden_states] if v is not None)

        return BaseModelOutputWithNoAttention(
            last_hidden_state=hidden_state,
            hidden_states=hidden_states,
        )


class HGNetV2Backbone(HGNetV2PreTrainedModel, BackboneMixin):
    has_attentions = False

    def __init__(self, config: HGNetV2Config):
        super().__init__(config)
        super()._init_backbone(config)
        self.depths = config.depths
        self.num_features = [config.embedding_size] + config.hidden_sizes
        self.embedder = HGNetV2Embeddings(config)
        self.encoder = HGNetV2Encoder(config)

        # initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None
    ) -> BackboneOutput:
        r"""
        Examples:

        ```python
        >>> from transformers import HGNetV2Config, HGNetV2Backbone
        >>> import torch

        >>> config = HGNetV2Config()
        >>> model = HGNetV2Backbone(config)

        >>> pixel_values = torch.randn(1, 3, 224, 224)

        >>> with torch.no_grad():
        ...     outputs = model(pixel_values)

        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 2048, 7, 7]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        embedding_output = self.embedder(pixel_values)
        outputs = self.encoder(embedding_output, output_hidden_states=True, return_dict=True)
        hidden_states = outputs.hidden_states

        feature_maps = ()
        for idx, stage in enumerate(self.stage_names):
            if stage in self.out_features:
                feature_maps += (hidden_states[idx],)

        if not return_dict:
            output = (feature_maps,)
            if output_hidden_states:
                output += (outputs.hidden_states,)
            return output

        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=None,
        )
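

# A hedged note on feature selection in HGNetV2Backbone (the config usage below is an assumption,
# not taken from this file): the forward pass always runs the encoder with
# output_hidden_states=True and then keeps only the stages whose names appear in
# `self.out_features`, which BackboneMixin._init_backbone resolves from the backbone config's
# `out_features` / `out_indices` arguments.
#
#     config = HGNetV2Config(out_indices=[1, 2, 3, 4])  # assumed way to request all four stages
#     model = HGNetV2Backbone(config)
#     feature_maps = model(torch.randn(1, 3, 224, 224)).feature_maps  # one tensor per requested stage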


@auto_docstring(
    custom_intro="""
    HGNetV2 Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    """
)
class HGNetV2ForImageClassification(HGNetV2PreTrainedModel):
    def __init__(self, config: HGNetV2Config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.embedder = HGNetV2Embeddings(config)
        self.encoder = HGNetV2Encoder(config)
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.flatten = nn.Flatten()
        self.fc = nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()

        # classification head
        self.classifier = nn.ModuleList([self.avg_pool, self.flatten])

        # initialize weights and apply final processing
        self.post_init()
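
    # Illustrative shape trace for the head defined above (a hedged sketch assuming the default
    # config, where config.hidden_sizes[-1] == 2048, and a 224 x 224 input):
    #
    #     encoder last_hidden_state: (batch, 2048, 7, 7)
    #     avg_pool:                  (batch, 2048, 1, 1)
    #     flatten:                   (batch, 2048)
    #     fc:                        (batch, config.num_labels)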

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> ImageClassifierOutputWithNoAttention:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in
            `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed
            (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> import torch
        >>> import requests
        >>> from transformers import HGNetV2ForImageClassification, AutoImageProcessor
        >>> from PIL import Image

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> model = HGNetV2ForImageClassification.from_pretrained("ustc-community/hgnet-v2")
        >>> processor = AutoImageProcessor.from_pretrained("ustc-community/hgnet-v2")

        >>> inputs = processor(images=image, return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        >>> outputs.logits.shape
        torch.Size([1, 2])
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        embedding_output = self.embedder(pixel_values)
        outputs = self.encoder(embedding_output, output_hidden_states=output_hidden_states, return_dict=return_dict)
        last_hidden_state = outputs[0]

        for layer in self.classifier:
            last_hidden_state = layer(last_hidden_state)
        logits = self.fc(last_hidden_state)

        loss = None
        if labels is not None:
            loss = self.loss_function(labels, logits, self.config)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return (loss,) + output if loss is not None else output

        return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)


__all__ = ["HGNetV2Backbone", "HGNetV2PreTrainedModel", "HGNetV2ForImageClassification"]