# coding=utf-8
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""DepthPro model configuration"""

from copy import deepcopy

from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto.configuration_auto import CONFIG_MAPPING, AutoConfig


logger = logging.get_logger(__name__)


class DepthProConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`DepthProModel`]. It is used to instantiate a
    DepthPro model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the DepthPro
    [apple/DepthPro](https://huggingface.co/apple/DepthPro) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        fusion_hidden_size (`int`, *optional*, defaults to 256):
            The number of channels before fusion.
        patch_size (`int`, *optional*, defaults to 384):
            The size (resolution) of each patch. This is also the image_size for backbone model.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        intermediate_hook_ids (`list[int]`, *optional*, defaults to `[11, 5]`):
            Indices of the intermediate hidden states from the patch encoder to use for fusion.
        intermediate_feature_dims (`list[int]`, *optional*, defaults to `[256, 256]`):
            Hidden state dimensions during upsampling for each intermediate hidden state in `intermediate_hook_ids`.
        scaled_images_ratios (`list[float]`, *optional*, defaults to `[0.25, 0.5, 1]`):
            Ratios of scaled images to be used by the patch encoder.
        scaled_images_overlap_ratios (`list[float]`, *optional*, defaults to `[0.0, 0.5, 0.25]`):
            Overlap ratios between patches for each scaled image in `scaled_images_ratios`.
        scaled_images_feature_dims (`list[int]`, *optional*, defaults to `[1024, 1024, 512]`):
            Hidden state dimensions during upsampling for each scaled image in `scaled_images_ratios`.
        merge_padding_value (`int`, *optional*, defaults to 3):
            When merging smaller patches back to the image size, overlapping sections of this size are removed.
        use_batch_norm_in_fusion_residual (`bool`, *optional*, defaults to `False`):
            Whether to use batch normalization in the pre-activate residual units of the fusion blocks.
        use_bias_in_fusion_residual (`bool`, *optional*, defaults to `True`):
            Whether to use bias in the pre-activate residual units of the fusion blocks.
        use_fov_model (`bool`, *optional*, defaults to `False`):
            Whether to use `DepthProFovModel` to generate the field of view.
        num_fov_head_layers (`int`, *optional*, defaults to 2):
            Number of convolution layers in the head of `DepthProFovModel`.
        image_model_config (`Union[dict[str, Any], PretrainedConfig]`, *optional*):
            The configuration of the image encoder model, which is loaded using the [`AutoModel`] API.
            By default, Dinov2 model is used as backbone.
        patch_model_config (`Union[dict[str, Any], PretrainedConfig]`, *optional*):
            The configuration of the patch encoder model, which is loaded using the [`AutoModel`] API.
            By default, Dinov2 model is used as backbone.
        fov_model_config (`Union[dict[str, Any], PretrainedConfig]`, *optional*):
            The configuration of the fov encoder model, which is loaded using the [`AutoModel`] API.
            By default, Dinov2 model is used as backbone.

    Example:

    ```python
    >>> from transformers import DepthProConfig, DepthProModel

    >>> # Initializing a DepthPro apple/DepthPro style configuration
    >>> configuration = DepthProConfig()

    >>> # Initializing a model (with random weights) from the apple/DepthPro style configuration
    >>> model = DepthProModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "depth_pro"
    sub_configs = {"image_model_config": AutoConfig, "patch_model_config": AutoConfig, "fov_model_config": AutoConfig}

    def __init__(
        self,
        fusion_hidden_size=256,
        patch_size=384,
        initializer_range=0.02,
        intermediate_hook_ids=[11, 5],
        intermediate_feature_dims=[256, 256],
        scaled_images_ratios=[0.25, 0.5, 1],
        scaled_images_overlap_ratios=[0.0, 0.5, 0.25],
        scaled_images_feature_dims=[1024, 1024, 512],
        merge_padding_value=3,
        use_batch_norm_in_fusion_residual=False,
        use_bias_in_fusion_residual=True,
        use_fov_model=False,
        num_fov_head_layers=2,
        image_model_config=None,
        patch_model_config=None,
        fov_model_config=None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # scaled_images_ratios is sorted
        if scaled_images_ratios != sorted(scaled_images_ratios):
            raise ValueError(
                f"Values in scaled_images_ratios={scaled_images_ratios} should be sorted from low to high"
            )

        # scaled_images_ratios, scaled_images_overlap_ratios, scaled_images_feature_dims should be consistent
        if not (len(scaled_images_ratios) == len(scaled_images_overlap_ratios) == len(scaled_images_feature_dims)):
            raise ValueError(
                f"len(scaled_images_ratios)={len(scaled_images_ratios)} and "
                f"len(scaled_images_overlap_ratios)={len(scaled_images_overlap_ratios)} and "
                f"len(scaled_images_feature_dims)={len(scaled_images_feature_dims)}, "
                f"should match in config."
            )

        # intermediate_hook_ids, intermediate_feature_dims should be consistent
        if not (len(intermediate_hook_ids) == len(intermediate_feature_dims)):
            raise ValueError(
                f"len(intermediate_hook_ids)={len(intermediate_hook_ids)} and "
                f"len(intermediate_feature_dims)={len(intermediate_feature_dims)}, "
                f"should match in config."
            )

        # fusion_hidden_size should be consistent with num_fov_head_layers
        if fusion_hidden_size // 2**num_fov_head_layers == 0:
            raise ValueError(
                f"fusion_hidden_size={fusion_hidden_size} should be consistent with num_fov_head_layers={num_fov_head_layers} "
                "i.e fusion_hidden_size // 2**num_fov_head_layers > 0"
            )

        self.fusion_hidden_size = fusion_hidden_size
        self.patch_size = patch_size
        self.initializer_range = initializer_range
        self.use_batch_norm_in_fusion_residual = use_batch_norm_in_fusion_residual
        self.use_bias_in_fusion_residual = use_bias_in_fusion_residual
        self.use_fov_model = use_fov_model
        self.num_fov_head_layers = num_fov_head_layers
        self.intermediate_hook_ids = intermediate_hook_ids
        self.intermediate_feature_dims = intermediate_feature_dims
        self.scaled_images_ratios = scaled_images_ratios
        self.scaled_images_overlap_ratios = scaled_images_overlap_ratios
        self.scaled_images_feature_dims = scaled_images_feature_dims
        self.merge_padding_value = merge_padding_value
        self.image_model_config = image_model_config
        self.patch_model_config = patch_model_config
        self.fov_model_config = fov_model_config

        for sub_config_key in self.sub_configs:
            sub_config = getattr(self, sub_config_key)

            if sub_config is None:
                sub_config = CONFIG_MAPPING["dinov2"](image_size=patch_size)
                logger.info(
                    f"`{sub_config_key}` is `None`. Initializing `{sub_config_key}` with the `Dinov2Config` "
                    f"with default values except `{sub_config_key}.image_size` is set to `config.patch_size`."
                )
            elif isinstance(sub_config, dict):
                sub_config = deepcopy(sub_config)
                if "model_type" not in sub_config:
                    raise KeyError(
                        f"The `model_type` key is missing in the `{sub_config_key}` dictionary. Please provide the model type."
                    )
                elif sub_config["model_type"] not in CONFIG_MAPPING:
                    raise ValueError(
                        f"The model type `{sub_config['model_type']}` in `{sub_config_key}` is not supported. Please provide a valid model type."
                    )
                image_size = sub_config.get("image_size")
                if image_size != patch_size:
                    logger.info(
                        f"The `image_size` in `{sub_config_key}` is set to `{image_size}`, "
                        f"but it does not match the required `patch_size` of `{patch_size}`. "
                        f"Updating `image_size` to `{patch_size}` for consistency. "
                        f"Ensure that `image_size` aligns with `patch_size` in the configuration."
                    )
                    sub_config.update({"image_size": patch_size})
                sub_config = CONFIG_MAPPING[sub_config["model_type"]](**sub_config)
            elif isinstance(sub_config, PretrainedConfig):
                image_size = getattr(sub_config, "image_size", None)
                if image_size != patch_size:
                    raise ValueError(
                        f"`config.{sub_config_key}.image_size={image_size}` should match `config.patch_size={patch_size}`."
                    )
            else:
                raise TypeError(
                    f"Invalid type for `sub_config`. Expected `PretrainedConfig`, `dict`, or `None`, but got {type(sub_config)}."
                )

            setattr(self, sub_config_key, sub_config)


__all__ = ["DepthProConfig"]