L ic* UddlZddlmZmZmZddlZddlmcmZ ddl m Z ddl m Z ddlmZmZddlmZddlmZmZmZerdd lmZdd lmZmZn ej8Zed d Zedd ZeZerddl m!Z!ejDe#Z$dedefdZ%dedefdZ&de'de'de'de'de(f dZ)de'defdZ*de'dej8defdZ+de'defdZ,de'defdZ-de'dej8defd Z.d!ej8defd"Z/d#ej8defd$Z0d%ed&e'd'e'defd(Z1d[d%ed)e(defd*Z2 d[d+eej8d,e'd'e'd-e(deej8f d.Z3 d\d!eej8d/e'd,e'd'e'd0ee'de(f d1Z4de)ddd fd2e'd3ej8d,e'd'e'd%ed+eej8d4ee'd5e(deej8fd6Z5de)ddd d fd2e'd3ej8d,e'd'e'd%ed+eej8d4ee'd5e(d7e(deej8fd8Z6ere5ne6Z7de)dejpfd2e'd3ej8d,e'd'e'd%ed+eej8d9ejrdej8fd:Z:de)dfd2e'd3ej8d,e'd'e'd%ed+eej8f d;Z;de)dfd2e'd3ej8d,e'd'e'd%ed+eej8defd<Z<Gd=d>eZ=e=Z>e=e?d?<d@ej8dej8fdAZ@dBe dCej8d+eeej8efd3ej8dDee d@eej8dEee'deAe(eeej8efe'e'ffdFZB d]dBe dCej8d+eej8d3ej8dDee d@eej8dGeedHeedeeej8effdIZC d]dBe dCej8d+eej8d3ej8dDee d@eej8dGeedHeedeeej8effdJZD d]dBe dCej8d+eej8d3ej8dDee d@eej8dGeedHeedeeej8effdKZEeCeDeEdLZF d]dBe dCej8d+eej8d3ej8dDee d@eej8dGeedHeefdMZGdNZHdOZIdPZJdQZKdRZLdSZMdTZNdUZOdVZPeIeKeJZQeHeKeJZRd^dWej8deSfdXZTGdYdZej8ZUy)_N)CallableOptionalUnion)Cache)PretrainedConfig)is_torch_xpu_availablelogging)GeneralInterface)is_torch_flex_attn_availableis_torch_greater_or_equalis_torchdynamo_compiling)_DEFAULT_SPARSE_BLOCK_SIZE) BlockMaskcreate_block_maskz2.5T) accept_devz2.6)TransformGetItemToIndexmask_functionsreturncRtdDstdfd}|S)zKReturns a mask function that is the intersection of provided mask functionsc32K|]}t|ywNcallable.0args `/mnt/ssd/data/python-lab/Trading/venv/lib/python3.12/site-packages/transformers/masking_utils.py zand_masks..07x}7.All inputs should be callable mask_functions: c|jdtj}D])}||||||j|jz}+|SN)dtype)new_onestorchbooltodevice batch_idxhead_idxq_idxkv_idxresultmaskrs rand_maskzand_masks..and_mask3sS%**5" YDd9hvFII&--XXF Y all RuntimeError)rr3s` r and_masksr8.s1 77 7KNK[\]] Or4cRtdDstdfd}|S)zDReturns a mask function that is the union of provided mask functionsc32K|]}t|ywrrrs rrzor_masks..>r r!r"c|jdtj}D])}||||||j|jz}+|Sr$) new_zerosr(r)r*r+r,s ror_maskzor_masks..or_maskAsS5::6" YDd9hvFII&--XXF Y r4r5)rr=s` ror_masksr><s1 77 7KNK[\]] Nr4r-r.r/r0c ||kS)z: This creates a basic lower-diagonal causal mask. r%)r-r.r/r0s rcausal_mask_functionr@Js U?r4sliding_windowc Pdtdtdtdtdtf fd }|S)z This is an overlay depicting a sliding window pattern. Add it on top of a causal mask for a proper sliding window mask. r-r.r/r0rc||z kDSrr%)r-r.r/r0rAs r inner_maskz*sliding_window_overlay..inner_maskWs...r4intr))rArDs` rsliding_window_overlayrGQs3 /c/S//c/d/ r4 chunk_size left_paddingc Tdtdtdtdtdtf fd }|S)z This is an overlay depicting a chunked attention pattern. Add it on top of a causal mask for a proper chunked attention mask. r-r.r/r0rc2||z z||z zk(Srr%)r-r.r/r0rHrIs rrDz#chunked_overlay..inner_maskcs.i00Z?ELYbLcDchrCrrrr4rE)rHrIrDs`` rchunked_overlayrL]s9 scsSsscsds r4c Pdtdtdtdtdtf fd }|S)z Same as the above function, but do not correctly account for left padding tokens. Only kept for compatibility with older torch versions (< 2.6). r-r.r/r0rc|z|zk(Srr%)r-r.r/r0rHs rrDz+_legacy_chunked_overlay..inner_maskos#u ':::r4rE)rHrDs` r_legacy_chunked_overlayrOis3 ;c;S;;c;d; r4c4tt|tS)zQ This return the mask_function function to create a sliding window mask. )r8rGr@)rAs r#sliding_window_causal_mask_functionrQus +N;=Q RRr4cttstt|tStt ||tS)zT This return the mask_function function to create a chunked attention mask. )#_is_torch_greater_or_equal_than_2_6r8rOr@rL)rHrIs rchunked_causal_mask_functionrT|s0 /0<>RSS _Z>@T UUr4 padding_maskc Pdtdtdtdtdtf fd }|S)zT This return the mask_function function corresponding to a 2D padding mask. r-r.r/r0rc||fSrr%)r-r.r/r0rUs rrDz)padding_mask_function..inner_masksIv-..r4rE)rUrDs` rpadding_mask_functionrXs3 /c/S//c/d/ r4packed_sequence_maskc Pdtdtdtdtdtf fd }|S)z\ This return the mask_function function corresponding to a 2D packed sequence mask. r-r.r/r0rc"||f||fk(Srr%)r-r.r/r0rYs rrDz1packed_sequence_mask_function..inner_masks$#Iu$459MiY_N_9```r4rE)rYrDs` rpacked_sequence_mask_functionr\s9 acaSaacada r4 mask_functionq_offset kv_offsetc Xdtdtdtdtdtf fd }|S)z This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths, not start and end indices. r-r.r/r0rc&|||z|zSrr%)r-r.r/r0r_r]r^s rrDz0add_offsets_to_mask_function..inner_masksY%(2BFYDVWWr4rE)r]r^r_rDs``` radd_offsets_to_mask_functionrbs9 XcXSXXcXdX r4 bh_indicescvddg}|r|jddg|D]}tj||d}|S)a Used to vmap our mask_functions over the q_idx and kv_idx dimensions of the inputs. Optionally, vmap over the batch and head indices as well if `bh_indices=True`. Using vmap here allows us to keep the performance of vectorized ops, while having a single set of primitive functions between attention interfaces (i.e. between flex and sdpa/eager, FA2 being a bit different). Args: mask_function (`Callable`): The mask_function to vmap. bh_indices (`bool`, optional): Whether to vmap over the batch and head indices as well, or only q and kv indices. Returns: Callable: The vmapped function. )NNNr)NNrN)NrNN)rNNNr)in_dimsout_dims)extendr(vmap)r]rc dimensionsdimss r_vmap_for_bhqkvrksP"()>?J02GHIL =$K L r4attention_mask kv_length_slicec|}|w||z|jdz x}dkDr,tjjj |d|f}|r/tj ||j }||z }|dd|f}|S)z From the 2D attention mask, prepare the correct padding mask to use by potentially padding it, and slicing according to the `kv_offset` if `_slice` is `True`. Nrr+)shaper(nn functionalpadaranger+)rlrmr_rnlocal_padding_maskpadding_length mask_indicess rprepare_padding_maskrzs(!')3n6J6J26NN NNRS S!&!4!4!8!8!^I\!]  !<< :L:S:STL I %L!3A|O!D  r4 query_lengthlocal_attention_sizectjjxs0t|tjj xs t }|A|jd|kDr/tj||j}||z }|dd|f}|sO|dk(s ||k(str?|||kr8|5tr|dk(r|jry|ddd|fjryy)a Detects whether the causal mask can be ignored in case PyTorch's SDPA is used, rather relying on SDPA's `is_causal` argument. In case no token is masked in the 2D `padding_mask` argument, if `query_length == 1` or `key_value_length == query_length`, we rather rely on SDPA `is_causal` argument to use causal/non-causal masks, allowing to dispatch to the flash attention kernel (that can otherwise not be used if a custom `attn_mask` is passed). NrprqrTF) r(jit is_tracing isinstancefxProxyrrrrvr+_is_torch_xpu_availabler6)rUr{rmr_r|rrys r_ignore_causal_mask_sdpars%%'q:lEHHNN+SqWoWqJL$6$6r$:Y$F||Il6I6IJ  ! #A|O4   Q 9 #<@W ! )Y9M-M  /,!2C  "  "!]l]"23779 r4 batch_sizecache_position local_sizeallow_is_causal_skipc |jd} t|||d} |rt| | |||rytj||j } | |z } | t |t| }tj||j } tjd|j } t5t|| | || }ddd|S#1swYSxYw)u  Create a 4D boolean mask of shape `(batch_size, 1, query_length, kv_length)` where a value of True indicates that the element should take part in the attention computation, and False that it should not. This function can only be used with torch>=2.5, as the context manager is otherwise not available. Args: batch_size (`int`): The batch size of the input sequence. cache_position (`torch.Tensor`): A tensor of shape (query_length,) indicating the current indices of the input sequence elements. kv_length (`int`): The size that the key and value states will have during the attention computation. kv_offset (`int`, optional): An optional offset to indicate at which first position the key and values states will refer to. mask_function (`Callable`): The mask factory function describing the mask pattern. attention_mask (`torch.Tensor`, optional): The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length) local_size (`int`, optional): The size of the local attention, if we do not use full attention. This is used only if `allow_is_causal_skip=True` to try to skip mask creation if possible. allow_is_causal_skip (`bool`, optional): Whether to allow to return `None` for the mask under conditions where we can use the `is_causal` argument in `torch.sdpa` instead. Default to `True`. allow_torch_fix (`bool`, optional): Whether to update the mask in case a query is not attending to any tokens, to solve a bug in torch's older versions. We need an arg to skip it when using eager. By default `True`. ## Creating a simple causal mask: To create the following causal mask: 0 ■ ⬚ ⬚ ⬚ ⬚ 1 ■ ■ ⬚ ⬚ ⬚ 2 ■ ■ ■ ⬚ ⬚ 3 ■ ■ ■ ■ ⬚ 4 ■ ■ ■ ■ ■ You can do ```python >>> sdpa_mask(batch_size=1, cache_position=torch.arange(5), kv_length=5) >>> tensor([[[[ True, False, False, False, False], [ True, True, False, False, False], [ True, True, True, False, False], [ True, True, True, True, False], [ True, True, True, True, True]]]]) ``` ## Creating a sliding window mask: To create the following sliding window mask (`sliding_window=3`): 0 ■ ⬚ ⬚ ⬚ ⬚ 1 ■ ■ ⬚ ⬚ ⬚ 2 ■ ■ ■ ⬚ ⬚ 3 ⬚ ■ ■ ■ ⬚ 4 ⬚ ⬚ ■ ■ ■ You can do ```python >>> sdpa_mask(batch_size=1, cache_position=torch.arange(5), kv_length=5, mask_function=sliding_window_causal_mask_function(3)) >>> tensor([[[[ True, False, False, False, False], [ True, True, False, False, False], [ True, True, True, False, False], [False, True, True, True, False], [False, False, True, True, True]]]]) ``` ## Creating a chunked attention mask To create the following chunked attention mask (`chunk_size=3`): 0 ■ ⬚ ⬚ ⬚ ⬚ 1 ■ ■ ⬚ ⬚ ⬚ 2 ■ ■ ■ ⬚ ⬚ 3 ⬚ ⬚ ⬚ ■ ⬚ 4 ⬚ ⬚ ⬚ ■ ■ You can do ```python >>> sdpa_mask(batch_size=1, cache_position=torch.arange(5), kv_length=5, mask_function=chunked_causal_mask_function(3, torch.zeros(1, dtype=int))) >>> tensor([[[[ True, False, False, False, False], [ True, True, False, False, False], [ True, True, True, False, False], [False, False, False, True, False], [False, False, False, True, True]]]]) ``` rFrnNrqr) rrrzrr(rvr+r8rXrrk)rrrmr_r]rlrrkwargsq_lengthrU kv_arange batch_arange head_arange causal_masks rsdpa_mask_recent_torchr sP##A&H' 9UZ[L 8xQZ\egq r Y~/D/DEI I!-1F|1TU << >3H3HIL,,q)>)>?K ! "k4om4\;P^`ij k k s :CC#allow_torch_fixc  |jd} t|||} |rt| | |||rytj||j } | |z } t |ddd|| } | ddddddfj|ddd} | | | ddddddfz} ts|r| tj| ddz} | S) a NOTE: This function is only used when torch version is torch<2.5 - see `sdpa_mask_recent_torch` otherwise. Create a 4D boolean mask of shape `(batch_size, 1, query_length, kv_length)` where a value of True indicates that the element should take part in the attention computation, and False that it should not. If `allow_torch_fix=True` (the default), rows corresponding to query tokens that do not attend to any other tokens (due to padding) will be fully attended to instead, in order to avoid `nan` propagation (this does not change the final result). Args: batch_size (`int`): The batch size of the input sequence. cache_position (`torch.Tensor`): A tensor of shape (query_length,) indicating the current indices of the input sequence elements. kv_length (`int`): The size that the key and value states will have during the attention computation. kv_offset (`int`, optional): An optional offset to indicate at which first position the key and values states will refer to. mask_function (`Callable`): The mask factory function describing the mask pattern. attention_mask (`torch.Tensor`, optional): The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length) local_size (`int`, optional): The size of the local attention, if we do not use full attention. This is used only if `allow_is_causal_skip=True` to try to skip mask creation if possible. allow_is_causal_skip (`bool`, optional): Whether to allow to return `None` for the mask under conditions where we can use the `is_causal` argument in `torch.sdpa` instead. Default to `True`. allow_torch_fix (`bool`, optional): Whether to update the mask in case a query is not attending to any tokens, to solve a bug in torch's older versions. We need an arg to skip it when using eager. By default `True`. rNrqF)rcrpT)dimkeepdim) rrrzrr(rvr+rkexpand#_is_torch_greater_or_equal_than_2_5r6)rrrmr_r]rlrrrrrrUrrs rsdpa_mask_older_torchrsX##A&H' 9ML 8xQZ\egq r Y~/D/DEI I C/-EB4~_hiKdD!Q./66z2r2NK!LD$1A$BB  /?uyy+2tDD r4r&c |jdd}td||||||ddd|} tj|j} tj | tj d| j|| } | S)ax Create a 4D float mask of shape `(batch_size, 1, query_length, kv_length)` where a value of 0 indicates that the element should take part in the attention computation, and -inf (minimum value for the given `dtype`) that it should not. Args: batch_size (`int`): The batch size of the input sequence. cache_position (`torch.Tensor`): A tensor of shape (query_length,) indicating the current indices of the input sequence elements. kv_length (`int`): The size that the key and value states will have during the attention computation. kv_offset (`int`, optional): An optional offset to indicate at which first position the key and values states will refer to. mask_function (`Callable`): The mask factory function describing the mask pattern. attention_mask (`torch.Tensor`, optional): The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length) dtype (`torch.dtype`, optional): The dtype to use for the mask. By default, `torch.float32`. rNF)rrrmr_r]rlrrgr+r&r%)pop sdpa_maskr(finfominwheretensorr+) rrrmr_r]rlr&r_r2 min_dtypes r eager_maskrs@  )40A  %#%"   D E"&&I ;;tU\\#dkkOQZ [D Kr4c F||dd| df}|jrd}|S)a" Create the attention mask necessary to use FA2. Since FA2 is un-padded by definition, here we simply return `None` if the mask is fully causal, or we return the 2D mask which will then be used to extract the seq_lens. We just slice it in case of sliding window. Args: batch_size (`int`): The batch size of the input sequence. cache_position (`torch.Tensor`): A tensor of shape (query_length,) indicating the current indices of the input sequence elements. kv_length (`int`): The size that the key and value states will have during the attention computation. kv_offset (`int`, optional): An optional offset to indicate at which first position the key and values states will refer to. mask_function (`Callable`): The mask factory function describing the mask pattern. attention_mask (`torch.Tensor`, optional): The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length) N)r6)rrrmr_r]rlrs rflash_attention_maskr s58!'I:;7    !N r4c |jd|d}}||jdtzdztz} | |jdz } ts3| dkDr.tjj j |dd| f}t|||d} t|t| }t|||}t||d|||jt} | S)a Create a 4D block mask which is a compressed representation of the full 4D block causal mask. BlockMask is essential for performant computation of flex attention. See: https://pytorch.org/blog/flexattention/ Args: batch_size (`int`): The batch size of the input sequence. cache_position (`torch.Tensor`): A tensor of shape (query_length,) indicating the current indices of the input sequence elements. kv_length (`int`): The size that the key and value states will have during the attention computation. kv_offset (`int`, optional): An optional offset to indicate at which first position the key and values states will refer to. mask_function (`Callable`): The mask factory function describing the mask pattern. attention_mask (`torch.Tensor`, optional): The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length) rNr)valueruFr)mask_modBHQ_LENKV_LENr+_compile) rrflex_default_block_sizerSr(rsrtrurzr8rXrbrr+) rrrmr_r]rlrrr^pad_lenrU block_masks rflex_attention_maskr4s6(--a0.2ChH!#((+/FF!KOffN00332w{"XX0044^1STV]R^4_N+NIyY^_ !-1F|1TU 1)TM#  $$4J r4ceZdZeeeeedZy)AttentionMaskInterface)sdpaeagerflash_attention_2flash_attention_3flex_attentionN)__name__ __module__ __qualname__rrrr_global_mappingr%r4rrrms11- Or4rALL_MASK_ATTENTION_FUNCTIONS position_idscz|ddddfdz }tj||d}|dk7jd}|S)aj Find the indices of the sequence to which each new query token in the sequence belongs when using packed tensor format (i.e. several sequences packed in the same batch dimension). Args: position_ids (`torch.Tensor`) A 2D tensor of shape (batch_size, query_length) indicating the positions of each token in the sequences. Returns: A 2D tensor where each similar integer indicates that the tokens belong to the same sequence. For example, if we pack 3 sequences of 2, 3 and 1 tokens respectively along a single batch dim, this will return [[0, 0, 1, 1, 1, 2]]. Nrrp)prependr)r(diffcumsum)rfirst_dummy_value position_diffrYs rfind_packed_sequence_indicesr}sM$%QU+a/JJ|5FBOM)Q.66r: r4config input_embedspast_key_values layer_idxct|tjtfrt |j dk(rd|dddfS|j tjvry|:|jdk(r+|j|jtj}||j||\}}n|j dd}}d} |B|@|>|j d} | |j dk7r|j| d }t|} d || ||fS) ar Perform some common pre-processing of the mask arguments we get from the modeling code. Mostly determine the key-value length and offsets, and if we should early exit or not. Args: config (`PretrainedConfig`): The model config. input_embeds (`torch.Tensor`): The input embeddings of shape (batch_size, query_length, hidden_dim). This is used only to infer the batch size, query length and dtype. attention_mask (`torch.Tensor`, optional): The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length). It can also be an already prepared 4D mask, in which case it is returned as-is. cache_position (`torch.Tensor`): A tensor of shape (query_length,) indicating the current indices of the input sequence elements. past_key_values (`Cache`, optional): The past key values, if we use a cache. position_ids (`torch.Tensor`, optional) A 2D tensor of shape (batch_size, query_length) indicating the positions of each token in the sequences. layer_idx (`int`, optional): If `past_key_values` is not None, this is the layer index of the cache from which to get the key-value length and offset. Indeed, for hybrid caches, different layers may return different lengths. Returns: early_exit (`bool`): Whether we should early exit mask creation, and return the mask as-is. attention_mask (`torch.Tensor` or `BlockMask` or `None`): The attention mask to either return immediately, or to use in downstream mask creation. packed_sequence_mask (`torch.Tensor`, optional): In case we detected packed sequence format, this is a tensor where each similar integer indicates that the tokens belong to the same sequence. kv_length (`int`): The size that the key and value states will have during the attention computation. kv_offset (`int`): An offset to indicate at which first position the key and values states will refer to. TN)TNNNNrrrrpF)rr(Tensorrlenrr_attn_implementationrrndimr*r+r)get_mask_sizesrr) rrrlrrrrrmr_rYrs r_preprocess_mask_argumentsrs)\.5<<";<^EYEYAZ^_A_^T455""*F*V*VV+!n&9&9Q&>'**.2G2Guzz*Z".==niX 9 ,11!4a9  N$:?V!''* ++A. .'..z2>L;LI ."6 9 LLr4or_mask_functionand_mask_functionc &t|dr*d|jvr|jjd}nd}t|||||||\} }} } } | r|S|jd|j }} t }t|j}trd}nt|dd }|ts tdt||}d}|ts tdt||}d}| trt|t| }d}|| || | ||||| }|S)a- Create a standard causal mask based on the attention implementation used (stored in the config). If `past_key_values` has an hybrid cache structure, this function will return the mask corresponding to one of the "full_attention" layers (to align to what is needed in the `modeling_xxx.py` files). Args: config (`PretrainedConfig`): The model config. input_embeds (`torch.Tensor`): The input embeddings of shape (batch_size, query_length, hidden_dim). This is used only to infer the batch size, query length and dtype. attention_mask (`torch.Tensor`, optional): The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length). It can also be an already prepared 4D mask, in which case it is returned as-is. cache_position (`torch.Tensor`): A tensor of shape (query_length,) indicating the current indices of the input sequence elements. past_key_values (`Cache`, optional): The past key values, if we use a cache. position_ids (`torch.Tensor`, optional) A 2D tensor of shape (batch_size, query_length) indicating the positions of each token in the sequences. or_mask_function (`Callable`, optional): An optional mask function to combine with the causal mask function (by doing the union of both). This is useful to easily overlay another mask on top of the causal one, for example for image tokens handling. and_mask_function (`Callable`, optional): An optional mask function to combine with the causal mask function (by doing the intersection of both). This is useful to easily overlay another mask on top of the causal one, for example for image tokens handling. is_slidingFrTis_compileableLUsing `or_mask_function` or `and_mask_function` arguments require torch>=2.6) rrrmr_r]rlrr&r)hasattrrindexrrrr&r@rrrgetattrrS ValueErrorr>r8r\)rrrlrrrrrr early_exitrYrmr_rr&mask_factory_functionmask_interfacerrs rcreate_causal_maskrsbL -%?;U;U2U#..44U;  Mg nno|]fNJJ 4i$**1-|/A/AJ01&2M2MNN##*?@P Q$$2kl l )*?AR S$',O )*?A^_sAt u$!%+%1 K r4c Tt|dr*d|jvr|jjd}nd}t|||||||\} }} } } | r|St |dd} | t d|j d|j}}t| }t|j}t |dd }|ts t d t||}d}|ts t d t||}d}| trt|t| }d}|||| | |||| || }|S) a Create a sliding window causal mask based on the attention implementation used (stored in the config). This type of attention pattern was mostly democratized by Mistral. If `past_key_values` has an hybrid cache structure, this function will return the mask corresponding to one of the "sliding_attention" layers (to align to what is needed in the `modeling_xxx.py` files). Args: config (`PretrainedConfig`): The model config. input_embeds (`torch.Tensor`): The input embeddings of shape (batch_size, query_length, hidden_dim). This is used only to infer the batch size, query length and dtype. attention_mask (`torch.Tensor`, optional): The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length). It can also be an already prepared 4D mask, in which case it is returned as-is. cache_position (`torch.Tensor`): A tensor of shape (query_length,) indicating the current indices of the input sequence elements. past_key_values (`Cache`, optional): The past key values, if we use a cache. position_ids (`torch.Tensor`, optional) A 2D tensor of shape (batch_size, query_length) indicating the positions of each token in the sequences. or_mask_function (`Callable`, optional): An optional mask function to combine with the sliding causal mask function (by doing the union of both). This is useful to easily overlay another mask on top of the sliding causal one, for example for image tokens handling. and_mask_function (`Callable`, optional): An optional mask function to combine with the sliding causal mask function (by doing the intersection of both). This is useful to easily overlay another mask on top of the sliding causal one, for example for image tokens handling. rTrrANzJCould not find a `sliding_window` argument in the config, or it is not setrFr rrrmr_r]rlrrr&r)rrrrrrrrr&rQrrrSr>r8r\)rrrlrrrrrrrrYrmr_rArr&rrrrs r!create_sliding_window_causal_maskrGsN -$/:T:T2T#..44T:  Mg nno|]fNJJ 4iV%5t@P Q$$2kl l )*?AR S$',O )*?A^_sAt u$!%+%1! K r4c t|dr*d|jvr|jjd}nd}t|||||||\} }} } } | r|St |dd} | t d|j dk(r| | z| kDr t d|jd|j}}|9|jd tj|k(jd }n&tj||jt }t s0| | z| kDr(|dkDj#rt$j'd t)| |}t*|j }t |d d }|t s t dt-||}d}|t s t dt/||}d}| t rt/|t1| }d}|||| | |||| || }|S)a Create a chunked attention causal mask based on the attention implementation used (stored in the config). This type of attention pattern was mostly democratized by Llama4. If `past_key_values` has an hybrid cache structure, this function will return the mask corresponding to one of the "chunked_attention" layers (to align to what is needed in the `modeling_xxx.py` files). Args: config (`PretrainedConfig`): The model config. input_embeds (`torch.Tensor`): The input embeddings of shape (batch_size, query_length, hidden_dim). This is used only to infer the batch size, query length and dtype. attention_mask (`torch.Tensor`, optional): The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length). It can also be an already prepared 4D mask, in which case it is returned as-is. cache_position (`torch.Tensor`): A tensor of shape (query_length,) indicating the current indices of the input sequence elements. past_key_values (`Cache`, optional): The past key values, if we use a cache. position_ids (`torch.Tensor`, optional) A 2D tensor of shape (batch_size, query_length) indicating the positions of each token in the sequences. or_mask_function (`Callable`, optional): An optional mask function to combine with the chunked causal mask function (by doing the union of both). This is useful to easily overlay another mask on top of the chunked causal one, for example for image tokens handling. and_mask_function (`Callable`, optional): An optional mask function to combine with the chunked causal mask function (by doing the intersection of both). This is useful to easily overlay another mask on top of the chunked causal one, for example for image tokens handling. rTrattention_chunk_sizeNzQCould not find an `attention_chunk_size` argument in the config, or it is not setrzFlash attention 2 cannot handle chunked attention, and the key-value length is larger than the chunk size so the chunked pattern cannot be respected. You should use another `attn_implementation` when instantiating the modelrp)rra#Due to limitations of your current torch version, we cannot correctly account for the left-padding when computing the chunked attention pattern. This will lead to a wrong attention mask for the padded sequences. Behavior will be undefined. Please upgrade to `torch>=2.6` to solve this issue.rFrr)rrrrrrrrrr&rr( zeros_likesumzerosr+rFrSanylogger warning_oncerTrr>r8r\)rrrlrrrrrrrrYrmr_rHrr&left_padding_tokensrrrrs rcreate_chunked_causal_maskrsGN -$/:T:T2T#..44T:  Mg nno|]fNJJ 4i!7>Jlmm""&99i)>SV`>` }  %**1-|/A/AJ!-444<@P@PQ_@``eejlem#kk*^=R=RZ]^ 0  !J . 1 $ ) ) + i 9EXY1&2M2MNN '8H%PP #2kl l ()>@P Q$$2kl l )*?AR S$',O )*?A^_sAt u$!%+%1 K r4)full_attentionsliding_attentionchunked_attentionc $|j} | |||||||d} t| dr/i} t| jD]} t | di| | | <| St | dd t di| St | dd tdi| Stdi| S)a This function mimics how we create the masks in the `modeling_xxx.py` files, and is used in `generate` in order to easily create the masks in advance, when we compile the forwards with Static caches. Args: config (`PretrainedConfig`): The model config. input_embeds (`torch.Tensor`): The input embeddings of shape (batch_size, query_length, hidden_dim). This is used only to infer the batch size, query length and dtype. attention_mask (`torch.Tensor`, optional): The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length). It can also be an already prepared 4D mask, in which case it is returned as-is. cache_position (`torch.Tensor`): A tensor of shape (query_length,) indicating the current indices of the input sequence elements. past_key_values (`Cache`, optional): The past key values, if we use a cache. position_ids (`torch.Tensor`, optional) A 2D tensor of shape (batch_size, query_length) indicating the positions of each token in the sequences. or_mask_function (`Callable`, optional): An optional mask function to combine with the other mask function (by doing the union of both). This is useful to easily overlay another mask on top of the causal one, for example for image tokens handling. and_mask_function (`Callable`, optional): An optional mask function to combine with the other mask function (by doing the intersection of both). This is useful to easily overlay another mask on top of the causal one, for example for image tokens handling. )rrrlrrrrr layer_typesrANrr%) get_text_configrsetr&LAYER_PATTERN_TO_MASK_FUNCTION_MAPPINGrrrr) rrrlrrrrrreffective_config mask_kwargs causal_masks layer_patterns rcreate_masks_for_generater)sL--/#$((*$,. K/  !1!=!=> oM*PQ^*_*nbm*nL ' o !#3T : F0?;?? !#94 @ L)8K88  , ,,r4zzzu■u⬚u∙u⬕u⬔c>|dk(r d}d}d}d}d}nd}d}d}d }||||fS) Nmajongu🀞u🀙u🀆u🀛u█u░u▙u▜r%)style BLACK_SQUARE WHITE_SQUARE LOW_TRIANGLEUPPER_TRIANGLEs r get_stylerxsF         |^ CCr4original_tensorc t|\}}}}|j\}}|\} } || kr|| ksd|z|z } | dkDr%| }t| tdt | | z }n| }tdt | | z}|j dj d} t j| ||fd} n|} g} t|D]}d}t|D]}| ||fdk(r||z }| ||fdk(r||z }#|dkDr8| ||dz fdk(r||z };| ||dz fdk(r||z }N|| ||fdk(r|n|z }`|| ||fdk(r|n| ||fdk(r|n| ||dzfdk(r|n|z }| j|dj| S)Nrrr) output_size)rr ) rrrrmaxround unsqueezeFadaptive_avg_pool2drangeappendjoin)r grid_sizerrrrrhwmax_hmax_w aspect_ratiorr1irowjs rtensor_to_mask_visualrs?H?O?AAAuU\123A!**1-77:&&vAq6B4H F 1Xq Aad|q |#1"|#q5aQh'1,|+1q5)Q.~-vad|q/@|lR!!Q$<1,% &ad|q0)4:1a!e84D4I.| C .  c36 99V r4cheZdZd dZdZd dZdZdZed de jde e d dfd Z y) AttentionMaskNcT||_tjj||dS)NF) require_grad)rr(r_make_subclass)clsdatars r__new__zAttentionMask.__new__s% ||**35*IIr4cyrr%)selfr s r__init__zAttentionMask.__init__s r4c |}|j^}}}g}ttj|Dcgc] }t |c}D]k\} } | |k(r5|j d|j d|j dn.t || ||j} |j | m|j dt|jd|jddj|Scc}w) z2Returns a string representation of the block mask.z...z7To print out more, set AttentionMask.to_string(limit=N)zRYou can also index (AttentionMask[batch, head]) to choose a specific batch or head)rrztorch.Tensor(shape=z, dtype=)r) rr enumerate itertoolsproductr rrrtupler&r) r#rlimit dense_mask batch_dimsnum_rowsnum_cols total_visridxr- block_viss r to_stringzAttentionMask.to_strings *4*:*:'Xx ' (9(9j;YE!H;Y(Z[ (NCe|  '  !Z[  !uv-j.Cy`d`j`jkI   Y ' ( .uTZZ/@.A$**UVWXyy##r)Vr(typingrrrr(torch.nn.functionalrsrtr  cache_utilsrconfiguration_utilsrutilsr r utils.genericr utils.import_utilsr r r!torch.nn.attention.flex_attentionrrrrrrrSr,torch._dynamo._trace_wrapped_higher_order_opr get_loggerrrr8r>rFr)r@rGrLrOrQrTrXr\rbrkrzrrrrfloat32r&rrrrr__annotations__rr*rrrrrrGREENYELLOWRESETrr GREY_SQUARErrr YELLOW_SQUARE GREEN_SQUARErBrrr%r4rrUs,, 12+qq !gNN I&?RV&W#&?RV&W#02&T   H % x H  h 8 C3sCTX 3 8   5<< H    SSSVSV VQYV      C TW \d 88\`U\\*7:GJTX ell6+/ +5<<(+++ + #3- +  +d2-1 $!%AALLAA A  A U\\* A AAellAP2-1 $!% FFLLFF F  F U\\* F FFFellFV'J "Od 2-1//LL// /  / U\\* / ;;/ \\/l2-1 $$LL$$ $  $ U\\* $V2-1 66LL66 6  6 U\\* 66r - 8N7O4O u||   6NM NM,,NMU5<<#:;<NMLL NM e_ NM 5<<( NM}NM 4% i 7893 CDNMn,0+/,0[ [,,[U\\*[LL [ e_ [ 5<<( [x([ )[eELL)+,-[H,0+/,0^ ^,,^U\\*^LL ^ e_ ^ 5<<( ^x(^ )^eELL)+,-^N,0+/,0w w,,wU\\*wLL w e_ w 5<<( wx(w )weELL)+,-wv):3*&,0+/,0@- @-,,@-U\\*@-LL @- e_ @- 5<<( @-x(@- )@-J       D$(<.0 ug. 15<<1`c1h&ELL&r4