# mypy: allow-untyped-defs
"""
DTensor operator schema definitions and utilities.

This module defines the core data structures and utilities for describing and managing
distributed tensor operations in PyTorch's DTensor system. It provides the foundational
schema types used for sharding propagation, operator strategy selection, and distributed
execution planning.

Key components:
- OpSpec: Describes acceptable sharding placements for operations
- OpStrategy: Represents the possible sharding strategies for an operator
- TupleStrategy: Container for multiple strategies when ops have tuple/list of tensors input
- OpSchema: Describes operator input/output schemas with DTensorSpecs
- OutputSharding: Manages output sharding specifications and redistribution
- RuntimeSchemaInfo: Runtime execution metadata for operators
- OpInfo: Complete runtime operator execution information

These schema definitions enable the DTensor system to:
1. Propagate tensor sharding information to the operator outputs
2. Greedily select sharding strategies for distributed operations
3. Plan and execute tensor redistributions when needed
4. Cache sharding decisions for performance optimization
"""

from collections.abc import Sequence
from dataclasses import dataclass
from functools import cached_property
from typing import Any, Optional, Union
from typing_extensions import deprecated

import torch
from torch._ops import OpOverload
from torch.distributed.device_mesh import DeviceMesh
from torch.distributed.tensor._dtensor_spec import DTensorSpec
from torch.distributed.tensor.placement_types import Placement


try:
    from torch.utils._cxx_pytree import (
        register_pytree_node,
        tree_leaves,
        tree_map_only,
        TreeSpec,
    )
except ImportError:
    from torch.utils._pytree import (  # type: ignore[no-redef, assignment]
        register_pytree_node,
        tree_leaves,
        tree_map_only,
        TreeSpec,
    )


# Common type aliases
# Positional arguments of an op schema: an immutable tuple of arbitrary objects.
ArgsType = tuple[object, ...]
# Keyword arguments of an op schema: a mapping from keyword name to arbitrary object.
KwargsType = dict[str, object]

# A list of placements in which individual entries are allowed to be None.
PlacementList = list[Optional[Placement]]

# ATen op schemas could have Tensor, Tuple[Tensor] and List[Tensor], so output type should
# be the same set of possibilities: a single DTensorSpec, a sequence of (optional)
# DTensorSpecs, or None.
OutputSpecType = Optional[Union[DTensorSpec, Sequence[Optional[DTensorSpec]]]]


def _rebuild_tensor_from_dtensor_meta(arg) -> object:
    """
    Build an uninitialized tensor matching the metadata recorded on ``arg``.

    The tensor is created with ``torch.empty_strided`` from the shape, stride and
    dtype stored in ``arg.tensor_meta``. This is used to propagate tensor metadata
    and must be called under fake mode.

    Raises:
        AssertionError: if ``arg.tensor_meta`` is None.
    """
    meta = arg.tensor_meta
    assert meta is not None, "DTensorSpec does not contain tensor_meta."
    return torch.empty_strided(meta.shape, meta.stride, dtype=meta.dtype)


def _pretty_print_spec(spec: object) -> str:
    if spec is None:
        return "None"
    elif isinstance(spec, DTensorSpec):
        return "".join([str(p) for p in spec.placements])
    elif isinstance(spec, Sequence):
        return "(" + ", ".join([_pretty_print_spec(s) for s in spec]) + ")"
    else:
        raise RuntimeError(f"Unknown spec type to print: spec={spec}")


@dataclass
class OpSpec:
    """
    An OpSpec describes an acceptable sharding placements of an operation, with the
    specified DTensorSpecs for both the output and the inputs.

    note: when the op return value is a single DTensor object, output_specs is
    DTensorSpec; when the return value is a tuple of Optional[DTensor],
    output_specs is a tuple of Optional[DTensorSpec].

    note: we MUST produce an DTensorSpec for every output that is a Tensor.  None
    entries only occur for non-Tensor outputs (e.g., operators that return Optional[Tensor],
    or non-Tensor outputs.)

    invariant: the DeviceMesh on all DTensorSpec must be the same
    """

    # output_specs and input_specs are related: for this op, given these input_specs,
    # this is the way the output would look
    output_specs: Union[DTensorSpec, tuple[Optional[DTensorSpec], ...]]
    input_specs: Optional[Sequence[DTensorSpec]] = None

    """
    redistribute_cost tells how expensive it is to redistribute a given input into the
    placement specified in this OpSpec.

    outer list: one entry (list) per (tensor) input in the op's arg schema
    inner list: one entry (cost value) per possible sharding spec for that input

    Example:
    -------
    another_op() -> tensor_a   # another_op produces the output that becomes our first input
    my_op(tensor_a)

    Let's assume this OpSpec's input_specs are [Replicate()],
    but another_op() supports 2 strategies (OpSpecs) which produce outputs of
       Replicate()
       Shard(0)

    In this example, redistribute_costs would look like this
    [
        # one row representing "my_op's first input" (tensor_a)
        [
            # two entries, one for each strategies supported by another_op
            0.0,  # cost of redistributing tensor_a from 'Replicate()'
            K,    # cost of redistributing tensor_a from 'Shard(0)'
        ],
    """
    redistribute_cost: Optional[list[list[float]]] = None

    @cached_property
    def output_spec(self) -> DTensorSpec:
        """
        This function requires that the strategy have exactly one DTensorSpec as the
        output spec. If the output_specs is a tuple, we throw an exception.
        """
        if isinstance(self.output_specs, DTensorSpec):
            return self.output_specs
        else:
            raise ValueError(
                f"function output_spec expects a single DTensorSpec but got: {self.output_specs}"
            )

    @cached_property
    def mesh(self):
        if isinstance(self.output_specs, DTensorSpec):
            return self.output_specs.mesh
        elif isinstance(self.output_specs, tuple):
            out_spec = self.output_specs[0]
            assert isinstance(out_spec, DTensorSpec)
            return out_spec.mesh
        else:
            raise ValueError(
                f"function output_spec expects a single DTensorSpec or a tuple of DTensorSpec but got: {self.output_specs}"
            )

    def input_spec(self, index: int = 0) -> DTensorSpec:
        assert self.input_specs is not None, "input_specs of OpSpec is None!"
        assert len(self.input_specs) > index, (
            f"Invalid index {index} for input_specs of length "
            f"{len(self.input_specs)}: {self.input_specs}"
        )
        return self.input_specs[index]

    def __str__(self) -> str:
        if self.input_specs is not None:
            input_specs_str = f"{_pretty_print_spec(self.input_specs)} -> "
        else:
            input_specs_str = ""
        output_spec_str = _pretty_print_spec(self.output_specs)
        return f"{input_specs_str}{output_spec_str}"


class StrategyType:
    """
    Common base class for op strategies. It defines no behavior of its own and
    has two subclasses in this module: OpStrategy and TupleStrategy.
    """


class OpStrategy(StrategyType):
    """
    The list of sharding strategies associated with an op. Every entry is an OpSpec
    describing one acceptable input/output sharding.

    invariant: the DeviceMesh on all OpSpec must be the same
    """

    def __init__(self, strategies: list[OpSpec]) -> None:
        super().__init__()
        self.strategies: list[OpSpec] = strategies

    def __str__(self) -> str:
        rendered = ", ".join(map(str, self.strategies))
        return f"[{rendered}] @ mesh: {self.mesh_shape}"

    def max_num_shards(self) -> int:
        """
        Returns the largest ``num_shards`` found among the output specs of all OpSpecs
        """
        shard_counts = [spec.output_spec.num_shards for spec in self.strategies]
        return max(shard_counts)

    # The accessors below all read from the first OpSpec in ``strategies``.

    @property
    def mesh(self):
        first = self.strategies[0]
        return first.mesh

    @property
    def mesh_shape(self):
        first = self.strategies[0]
        return first.mesh.shape

    @property
    def ndim(self):
        first = self.strategies[0]
        return first.output_spec.ndim

    @property
    def shape(self):
        first = self.strategies[0]
        return first.output_spec.shape


class TupleStrategy(StrategyType):
    """
    TupleStrategy covers operators that are fundamentally compound or batched, i.e. where
    some subset of the inputs and outputs is completely unrelated to some other subset.

    The most common use-case is foreach_* ops: they accept lists of inputs but operate
    independently on each input (or on each tuple of zipped inputs).

    For example, in [out_a, out_b] = torch.foreach_add([a,  b], scalar), the sharding of
    input a only affects the sharding of out_a, independently of b and out_b.

    torch.split is an example of an operator that should NOT use TupleStrategy. Although
    it produces a List[Tensor], the sharding decision of each output is bound together
    with the decisions of all other outputs and of the common input.
    """

    def __init__(
        self,
        children: Sequence[StrategyType],
    ) -> None:
        super().__init__()
        self.children: Sequence[StrategyType] = children

    @property
    @deprecated(
        "TupleStrategy.childs is deprecated, use TupleStrategy.children instead.",  # codespell:ignore childs
        category=FutureWarning,
    )
    def childs(self) -> Sequence[StrategyType]:  # codespell:ignore childs
        """
        Deprecated alias of ``children``, kept for backward compatibility.
        """
        return self.children

    def child_mesh(self, index: int) -> DeviceMesh:
        """Return the mesh of the child at ``index``, which must be an OpStrategy."""
        child = self.children[index]
        assert isinstance(child, OpStrategy)
        return child.mesh

    def __str__(self) -> str:
        rendered_children = ", ".join(str(child) for child in self.children)
        return f"TupleStrategy({rendered_children})"


# Register TupleStrategy as a pytree node so that the pytree helpers imported by this
# module (tree_leaves / tree_map_only) can traverse into its children.
try:
    register_pytree_node(
        TupleStrategy,
        # flatten: the children are the sub-trees; no extra context is recorded (None)
        lambda node: (node.children, None),
        # unflatten: rebuild a TupleStrategy from the children, stored as a tuple
        lambda children, _: TupleStrategy(tuple(children)),
    )
except ValueError:
    # already registered TupleStrategy, skip
    pass


@dataclass
class RuntimeSchemaInfo:
    """
    RuntimeSchemaInfo stores the operator schema related information for runtime (eager)
    execution. It is mainly used in two ways:
      1. to generate the hash for args, which determines whether sharding prop needs to
         be re-run or not
      2. to determine if we need pytree
    """

    # This static_argnum records static arg "starting index" for ops that have non-tensor
    # args/kwargs which would affect sharding propagation results. All args starting from
    # this index would be hashed to our sharding cache.
    # Note that only a few ops need this information, e.g. view, transpose, var.dim, etc.
    # NOTE(review): the default of 100 presumably exceeds any op's positional arg count,
    # so that no non-tensor positional arg is hashed by default -- confirm.
    static_argnum: int = 100
    # This static_kwargkey records static kwarg names which would affect sharding prop
    # (None means no kwarg is treated as static)
    static_kwargkey: Optional[list[str]] = None
    # Each op can decide if it wants to use pytree flatten/unflatten during operator
    # eager execution. By default flatten/unflatten is skipped and is only done when the
    # op indicates it needs it; this is to accelerate eager performance.
    needs_pytree: bool = False


@dataclass
class OpSchema:
    """
    OpSchema is a data class that describes an operator's input schema. It holds
    DTensorSpecs/OpStrategies (instead of DTensor) and non-tensor args/kwargs (positional
    order preserved). It is mainly used by the DTensor's dispatching logic to perform various
    actions (i.e. sharding propagation, caching sharding decisions, redistribute, etc.)

    Hashing and equality are based on ``_comparison_key``, which is built in
    ``__post_init__`` and rebuilt by ``_inplace_rewrap_schema_suggestion``.

    NOTE: this must be used as a read only data class
    TODO: make this a frozen dataclass

    Args:
        op: the operator overload we are intercepting
        args_schema: contains args except that the DTensor args have been replaced
            with its DTensorSpec or OpStrategy
        kwargs_schema: contains kwargs except that the DTensor kwargs have been replaced
            with its DTensorSpec or OpStrategy
        schema_info: optional RuntimeSchemaInfo; selects which non-tensor args/kwargs
            take part in the comparison key and whether args are pytree-flattened
    """

    op: OpOverload
    args_schema: ArgsType
    kwargs_schema: KwargsType

    schema_info: Optional[RuntimeSchemaInfo] = None

    # key backing __hash__/__eq__; always overwritten by _recompute_comparison_key
    _comparison_key: Optional[tuple[object, ...]] = None

    def _flat_args(self) -> Sequence[object]:
        """
        Return the positional args, flattened to pytree leaves when schema_info asks
        for pytree handling; otherwise args_schema is returned as-is.
        """
        if self.schema_info is not None and self.schema_info.needs_pytree:
            return tree_leaves(self.args_schema)
        return self.args_schema

    @property
    def args_spec(self) -> tuple[DTensorSpec, ...]:
        """
        args_spec: Tuple[DTensorSpec, ...]: contains a clean list of args spec list
            with NO non-DTensor positional arguments (i.e. int/float/tuple, etc)
            mainly used by sharding propagation to propagate the output spec
        """
        return tuple(
            item for item in self._flat_args() if isinstance(item, DTensorSpec)
        )

    @property
    def args_strategy(self) -> tuple[OpStrategy, ...]:
        """
        Same filtering as args_spec, but keeps the OpStrategy entries instead.
        """
        # filter out non-relevant values from args schema to get a clean OpStrategy list
        # separate with args_spec for the ease of type annotation
        # TODO: see if we should merge this with args_spec
        return tuple(
            item for item in self._flat_args() if isinstance(item, OpStrategy)
        )

    def __repr__(self) -> str:
        args_schema = ", ".join([str(arg_schema) for arg_schema in self.args_schema])
        return (
            f"OpSchema(op={self.op},"
            f" args_schema=({args_schema}),"
            f" kwargs_schema={self.kwargs_schema})"
        )

    def __str__(self) -> str:
        args_schema: list[str] = []
        # the mesh shape printed is the one of the last spec/strategy argument seen
        mesh_shape = None
        for arg in self.args_schema:
            if isinstance(arg, DTensorSpec):
                args_schema.append(str(arg))
                mesh_shape = arg.mesh.shape
            elif isinstance(arg, OpStrategy):
                assert len(arg.strategies) == 1
                args_schema.append(_pretty_print_spec(arg.strategies[0].output_specs))
                mesh_shape = arg.mesh_shape
            elif isinstance(arg, TupleStrategy):
                first_op_strategy = arg.children[0]
                assert isinstance(first_op_strategy, OpStrategy)
                mesh_shape = first_op_strategy.mesh_shape
                args_schema.append(str(arg))
            else:
                args_schema.append(str(arg))
        return f"Op(op={self.op}, args_schema={', '.join(args_schema)} @ mesh: {mesh_shape})"

    def __post_init__(self) -> None:
        # has_symints: True when any top-level DTensorSpec arg that carries tensor_meta
        # has a torch.SymInt in its shape
        self.has_symints = any(
            isinstance(dim, torch.SymInt)
            for arg in self.args_schema
            if isinstance(arg, DTensorSpec) and arg.tensor_meta is not None
            for dim in arg.tensor_meta.shape
        )
        self._recompute_comparison_key()

    def arg_type_tensor_or_tensor_list_like(self, arg: object) -> bool:
        """
        Return True if ``arg`` is a DTensorSpec, or a list whose entries are all
        either DTensorSpec or None (an empty list also qualifies).
        """
        if isinstance(arg, DTensorSpec):
            return True

        if not isinstance(arg, list):
            return False

        return all(isinstance(e, DTensorSpec) or e is None for e in arg)

    def return_type_tuple_tensor_like(self) -> bool:
        # all dispatch ops could only return Tuple[Tensor] or have None/ints/floats
        # in the tuple, but the first element must be a Tensor, so this check is enough
        return_types = self.op._schema.returns
        return len(return_types) > 1 and isinstance(
            return_types[0].type, torch.TensorType
        )

    def return_type_list_tensor_like(self) -> bool:
        # returns True if the return type is a List
        return_types = self.op._schema.returns
        return len(return_types) == 1 and isinstance(
            return_types[0].type, torch.ListType
        )

    def return_type_tensor(self) -> bool:
        return_types = self.op._schema.returns
        # all dispatch ops only return Tensor or Tuple[Tensor] for tensor like
        # return types, so checking the first return is enough for tensor like types.
        # Ops whose schema declares no return value at all are not tensor-returning;
        # guard the index so they yield False instead of raising IndexError.
        return len(return_types) > 0 and isinstance(
            return_types[0].type, torch.TensorType
        )

    def get_mesh_from_args(self, validate: bool = True) -> DeviceMesh:
        """
        This util can be used to get a mesh from the OpSchema that contains multiple
        DTensors as arguments. When `validate` is True, it will try to validate that all the
        arguments have the same mesh to avoid unexpected cross mesh errors.

        The mesh is taken from the first positional argument (or from its first element
        when that argument is a list, tuple or TupleStrategy).

        NOTE: this util currently does not handle TupleStrategy when `validate=True`,
        this is because for TupleStrategy there could be different types of checks, i.e.:
            - for stack and cat like op, we need to check within a TupleStrategy is every
              input is on the same mesh
            - for foreach like ops we need to check "zipped" inputs are on the same mesh
              for each index.

        Raises:
            ValueError: if the first argument is of a type no mesh can be read from.
            RuntimeError: if `validate` is True and a later DTensorSpec/OpStrategy
                argument is on a different mesh.
            IndexError: if args_schema (or the list/tuple/TupleStrategy in first
                position) is empty.
        """
        first_arg = self.args_schema[0]
        if isinstance(first_arg, (DTensorSpec, OpStrategy)):
            mesh = first_arg.mesh
        elif isinstance(first_arg, (list, tuple, TupleStrategy)):
            first_elem = (
                first_arg.children[0]
                if isinstance(first_arg, TupleStrategy)
                else first_arg[0]
            )
            assert isinstance(first_elem, (DTensorSpec, OpStrategy))
            mesh = first_elem.mesh
        else:
            raise ValueError(f"Cannot find device mesh from args for op : {self.op}.")

        if validate:
            for arg in self.args_schema[1:]:
                if isinstance(arg, (DTensorSpec, OpStrategy)) and arg.mesh != mesh:
                    raise RuntimeError(
                        f"DTensor does not support cross-mesh operation on {self.op}! "
                        f"Got meshes: {mesh} {arg.mesh}. "
                        f"Please make sure all the arguments have the same DeviceMesh."
                    )

        return mesh

    def is_inplace_op(self) -> bool:
        # simple analysis of function schema to determine
        # if this is an inplace variant, it might not
        # be entirely correct, but it's good enough for now.
        return self.op._schema.name.endswith("_")

    def is_out_variant_op(self) -> bool:
        # simple analysis of function schema to determine
        # if this is an out variant, it might not
        # be entirely correct, but it's good enough for now.
        return "out" in self.op._schema.overload_name

    def is_view_op(self) -> bool:
        return self.op._schema._is_view_op()

    def _recompute_comparison_key(self):
        """
        Rebuild ``_comparison_key`` from the op, the tensor-like positional args, the
        positional args at or beyond ``static_argnum`` and the ``static_kwargkey`` kwargs.
        """
        if not self.schema_info:
            # without schema_info no non-tensor positional arg takes part in the key
            static_argnum = len(self.args_schema)
            static_kwargkey = None
        else:
            static_argnum = self.schema_info.static_argnum
            static_kwargkey = self.schema_info.static_kwargkey

        # lists are converted to tuples so that the key stays hashable
        args_to_hash = tuple(
            tuple(e) if isinstance(e, list) else e
            for i, e in enumerate(self.args_schema)
            if self.arg_type_tensor_or_tensor_list_like(e) or i >= static_argnum
        )
        if static_kwargkey is not None:
            kwargs_to_hash = tuple(
                self.kwargs_schema.get(k, None) for k in static_kwargkey
            )
            self._comparison_key = (self.op, args_to_hash, kwargs_to_hash)
        else:
            self._comparison_key = (self.op, args_to_hash)

    def __hash__(self) -> int:
        return hash(self._comparison_key)

    def __eq__(self, other: object) -> bool:
        # early return checks
        if not isinstance(other, OpSchema):
            return False

        if self.op != other.op:
            return False

        if len(self.args_schema) != len(other.args_schema):
            return False

        return self._comparison_key == other._comparison_key

    def gen_fake_args(self) -> ArgsType:
        """
        gen_fake_args: generate fake args for the operator, this is mainly used
            by sharding propagation rules to generate fake args for the operator
            to run the local tensor operator and get the output spec.
        """
        return tree_map_only(
            DTensorSpec,
            _rebuild_tensor_from_dtensor_meta,
            self.args_schema,
            is_leaf=lambda x: isinstance(x, DTensorSpec),
        )

    def gen_fake_kwargs(self) -> KwargsType:
        """
        gen_fake_kwargs: generate fake kwargs for the operator, this is mainly used
            by sharding propagation rules to generate fake kwargs for the operator
            to run the local tensor operator and get the output spec.
        """
        return tree_map_only(
            DTensorSpec,
            _rebuild_tensor_from_dtensor_meta,
            self.kwargs_schema,
            is_leaf=lambda x: isinstance(x, DTensorSpec),
        )

    def _inplace_rewrap_schema_suggestion(self, origin_schema: "OpSchema") -> None:
        """
        Rewrite this schema in place so that it has the layout of ``origin_schema``.

        Every DTensorSpec position of ``origin_schema``'s (possibly pytree-flattened)
        args is filled, in order, with this schema's own args_spec entries; all other
        args and the kwargs are taken from ``origin_schema``. The comparison key is
        recomputed afterwards.
        """
        suggestion_args_spec = self.args_spec
        new_arg_schema: list[object] = []
        idx_of_args_spec = 0
        for arg in origin_schema._flat_args():
            if isinstance(arg, DTensorSpec):
                new_arg_schema.append(suggestion_args_spec[idx_of_args_spec])
                idx_of_args_spec += 1
            else:
                new_arg_schema.append(arg)
        self.args_schema = tuple(new_arg_schema)
        self.kwargs_schema = origin_schema.kwargs_schema
        self._recompute_comparison_key()


@dataclass
class OutputSharding:
    """
    OutputSharding is the data class produced by sharding propagation. On successful
    propagation it could carry the output_spec. When needs_redistribute is True, a
    redistribute_schema is returned along with it, indicating that the input arguments
    need to be redistributed before the op is executed.

    NOTE: the redistribute_schema generated by sharding propagation should be
    exactly the same as the operator OpSchema, except the DTensorSpecs
    """

    # the sharding pattern of the output
    output_spec: OutputSpecType
    # the schema to redistribute to, when redistribution is needed
    redistribute_schema: Optional[OpSchema] = None
    # whether the inputs need to be redistributed
    needs_redistribute: bool = False
    # whether values from `redistribute_schema` should be used
    use_val_from_redistribute_schema: bool = False

    @cached_property
    def mesh(self):
        # For a tuple output the first entry is the one inspected; a lone
        # DTensorSpec is inspected directly.
        candidate = self.output_spec
        if not isinstance(candidate, DTensorSpec) and isinstance(candidate, tuple):
            candidate = candidate[0]
        if isinstance(candidate, DTensorSpec):
            return candidate.mesh
        raise ValueError(f"Unknown output spec type: {type(candidate)}")


@dataclass
class OpInfo:
    """
    All runtime op execution info is packed here: the compute mesh, the op schema,
    the local args/kwargs and, optionally, the pytree spec of the args and the output
    sharding.
    """

    # The first compute device mesh recorded from args
    # NOTE: one op could have multiple meshes from its args. We just record the first
    # mesh here to check if current rank should participate in computation or not.
    compute_mesh: DeviceMesh

    # complete runtime operator infos
    schema: OpSchema
    # NOTE(review): presumably the flattened form of the schema's args -- confirm
    flat_args_schema: list[object]
    local_args: Sequence[object]
    local_kwargs: dict[str, object]
    # optional pytree spec of the args; defaults to None
    args_tree_spec: Optional[TreeSpec] = None

    # the output sharding info
    output_sharding: Optional[OutputSharding] = None