
    Mj|_                     h   d dl mZmZ d dlmZ d dlmZmZmZ d dl	m
Z
 d dlZ G d d          Z ed	           G d
 de                      Z ed	           G d de                      Z ed	           G d de                      Z ed	           G d de                      Z ed	           G d de                      Z ed	           G d de                      Z edd           G d de                      Z G d d          Ze
d             Z ed	           G d de                      Z edd           G d de                      Z edd           G d  d!e                      Z ed	           G d" d#e                      Zd&d$Zd% ZdS )'    )	dataclassfield)List)_unwrap_if_constexpr_unwrap_shapeconstexpr_type)constexpr_functionNc                   >    e Zd ZdZed             Zed             ZdS )DistributedLayoutz@
    Base class for distributed memory layouts in Gluon IR.
    c                      t          |           S Nr   selfs    p/home/longshao/multi-rider-rag/.venv/lib/python3.11/site-packages/triton/experimental/gluon/language/_layouts.pytypezDistributedLayout.type       d###    c                      t          d          )Nz-DistributedLayout subclasses must define rank)NotImplementedErrorr   s    r   rankzDistributedLayout.rank   s    !"QRRRr   N)__name__
__module____qualname____doc__propertyr   r    r   r   r   r   	   sY          $ $ X$ S S XS S Sr   r   T)frozenc                   0    e Zd Zd Zd Zed             ZdS )
AutoLayoutc                 *    |                                 S r   )get_auto_layoutr   builders     r   _to_irzAutoLayout._to_ir   s    &&(((r   c                     dS )NALr   r   s    r   manglezAutoLayout.mangle       tr   c                      t          d          )NzAutoLayout has no rank
ValueErrorr   s    r   r   zAutoLayout.rank    s    1222r   Nr   r   r   r%   r(   r   r   r   r   r   r    r       sM        ) ) )   3 3 X3 3 3r   r    c                   0    e Zd Zd Zd Zed             ZdS )CoalescedLayoutc                 *    |                                 S r   )get_coalesced_layoutr#   s     r   r%   zCoalescedLayout._to_ir(   s    ++---r   c                     dS )NCLr   r   s    r   r(   zCoalescedLayout.mangle+   r)   r   c                      t          d          )NzCoalescedLayout has no rankr+   r   s    r   r   zCoalescedLayout.rank.   s    6777r   Nr-   r   r   r   r/   r/   %   sM        . . .   8 8 X8 8 8r   r/   c                        e Zd ZU dZee         ed<   ee         ed<   ee         ed<   ee         ed<    ee          Z	eee                  ed<    fdZ
d	 Zd
efdZd Zed             Z xZS )BlockedLayouta  
    Represents a blocked layout, partitioning a tensor across threads, warps, and CTAs.

    Args:
        size_per_thread (List[int]): Number of elements per thread per dimension.
        threads_per_warp (List[int]): Number of threads per warp per dimension.
        warps_per_cta (List[int]): Number of warps per CTA per dimension.
        order (List[int]): The ordering of dimensions for partitioning.
        cga_layout (Optional[List[List[int]]]): Bases describing how CTAs tile each dimension.
    size_per_threadthreads_per_warpwarps_per_ctaorderdefault_factory
cga_layoutc                    t                                          dt          | j                             t                                          dt          | j                             t                                          dt          | j                             t                                          dt          | j                             t          | j                  }t                              | d| j	                   t          | j                  |k    sJ t          | j                  |k    sJ t          | j                  |k    sJ d S )Nr7   r8   r9   r:   r=   )
super__setattr__r   r7   r8   r9   r:   lenobjectr=   )r   r   	__class__s     r   __post_init__zBlockedLayout.__post_init__E   s   -/CDDX/Y/YZZZ.0DTEZ0[0[\\\O-A$BT-U-UVVVG%9$*%E%EFFF4'((4t???4())T11114%&&$....4:$&&&&&&r   c                 f    |                     | j        | j        | j        | j        | j                  S r   )get_blocked_layoutr7   r8   r9   r:   r=   r#   s     r   r%   zBlockedLayout._to_irQ   s6    )) !JO
 
 	
r   returnc                    d } || j                   } || j                  } || j                  } || j                  }| j        r$d                    d | j        D                       nd}d| d| d| d| d| dS )Nc                 Z    | dS d                     t          t          |                     S N _joinmapstrxs    r   	stringifyz'BlockedLayout.mangle.<locals>.stringify\   &    yr88CQKK(((r   rL   c              3   f   K   | ],}d                      t          t          |                    V  -dS ~NrM   .0vecs     r   	<genexpr>z'BlockedLayout.mangle.<locals>.<genexpr>e   6      QQ#chhs3}}55QQQQQQr   rK   B)r7   r8   r9   r:   r=   rN   )r   rS   r7   r8   r9   r:   r=   s          r   r(   zBlockedLayout.mangleZ   s    	) 	) 	)
 $)D$899$9T%:;;!	$"455	$*%%UYUdlSXXQQQQQQQQjl
]?]]%5]]]]]]PZ]]]]r   c           
          t          t          | j                  t          | j                  t          | j                  t          | j                  t          d | j        D                       f          S )Nc              3   4   K   | ]}t          |          V  d S r   tuplerX   s     r   r[   z)BlockedLayout.__hash__.<locals>.<genexpr>j   s(      -T-TSeCjj-T-T-T-T-T-Tr   )hashra   r7   r8   r9   r:   r=   r   s    r   __hash__zBlockedLayout.__hash__h   sn    U4/00%8M2N2NPUVZVhPiPi4:&&-T-TDO-T-T-T(T(TV W W 	Wr   c                 *    t          | j                  S r   )rA   r:   r   s    r   r   zBlockedLayout.rankl       4:r   r   r   r   r   r   int__annotations__r   listr=   rD   r%   rP   r(   rc   r   r   __classcell__rC   s   @r   r6   r6   3   s        	 	 #Y3i99"'%"="="=JT#Y===
' 
' 
' 
' 
'
 
 
^ ^ ^ ^ ^W W W   X    r   r6   c                   |     e Zd ZU dZeed<   eed<    fdZd Zde	fdZ
d Zed	             Zed
             Z xZS )SliceLayoutz
    Represents a layout corresponding to slicing a distributed tensor along one dimension.

    Args:
        dim (int): The dimension index to slice.
        parent (DistributedLayout): The parent layout before slicing.
    dimparentc                     t                                          dt          | j                             t                                          dt          | j                             d S )Nrn   ro   )r?   r@   r   rn   ro   r   rC   s    r   rD   zSliceLayout.__post_init__}   sS    E#7#A#ABBBH&:4;&G&GHHHHHr   c                 h    |                     | j        | j                            |                    S r   )get_slice_layoutrn   ro   r%   r#   s     r   r%   zSliceLayout._to_ir   s2    ''HKw''
 
 	
r   rG   c                 L    d| j          d| j                                         dS )NSLrL   )rn   ro   r(   r   s    r   r(   zSliceLayout.mangle   s+    7DH77t{11337777r   c                 8    t          | j        | j        f          S r   )rb   rn   ro   r   s    r   rc   zSliceLayout.__hash__   s    TXt{+,,,r   c                      | j         j        dz
  S N   ro   r   r   s    r   r   zSliceLayout.rank   s    {!##r   c                       j         j        }|sg S  j         j        }d j        cxk    r|k     sn J  fd|D             S )Nr   c                 T    g | ]$}|d j                  |j         dz   d          z   %S rx   )rn   )rY   basisr   s     r   
<listcomp>z*SliceLayout.cga_layout.<locals>.<listcomp>   s8    WWWEitxi 5A#77WWWr   )ro   r=   r   rn   )r   parent_cga_layoutr   s   `  r   r=   zSliceLayout.cga_layout   sg     K2  	I{DH####t######WWWWEVWWWWr   r   r   r   r   rg   rh   r   rD   r%   rP   r(   rc   r   r   r=   rj   rk   s   @r   rm   rm   q   s           
HHHI I I I I
 
 
8 8 8 8 8- - - $ $ X$ X X XX X X X Xr   rm   c                        e Zd ZU dZeee                  ed<   eee                  ed<   eee                  ed<   eee                  ed<   ee         ed<    fdZd Zd	 Z	d
 Z
ed             Z xZS )DistributedLinearLayouta  
    Represents a linear distributed layout with explicit bases at register, lane, warp, and block levels.
    See: https://arxiv.org/abs/2505.23819 for reference.

    Args:
        reg_bases (List[List[int]]): Bases for register-level distribution.
        lane_bases (List[List[int]]): Bases for lane-level distribution.
        warp_bases (List[List[int]]): Bases for warp-level distribution.
        block_bases (List[List[int]]): Bases for block-level distribution.
        shape (List[int]): The tensor global shape.
    	reg_bases
lane_bases
warp_basesblock_basesshapec                 0   t                                          dt          | j                             t                                          dt          | j                             t                                          dt          | j                             t                                          dt          | j                             t                                          dt          | j                             t          | j                  }| j        D ]}t          |          |k    sJ | j        D ]}t          |          |k    sJ | j        D ]}t          |          |k    sJ | j        D ]}t          |          |k    sJ d S )Nr   r   r   r   r   )	r?   r@   r   r   r   r   r   r   rA   r   r   r}   rC   s      r   rD   z%DistributedLinearLayout.__post_init__   s|   Kt~)F)FGGGL-*H*HIIIL-*H*HIIIM=9I+J+JKKKG]4:%>%>???4:^ 	& 	&Eu::%%%%%_ 	& 	&Eu::%%%%%_ 	& 	&Eu::%%%%%% 	& 	&Eu::%%%%%	& 	&r   c                 f    |                     | j        | j        | j        | j        | j                  S r   )get_distributed_linear_layoutr   r   r   r   r   r#   s     r   r%   zDistributedLinearLayout._to_ir   s6    44T^T_VZVegkgw59ZA A 	Ar   c                 X    d| j          d| j         d| j         d| j         d| j         dS )NDLLrL   )r   r   r   r   r   r   s    r   r(   zDistributedLinearLayout.mangle   s@    lT^lldollll$JZll]a]gllllr   c                 v   t          t          t          t          | j                            t          t          t          | j                            t          t          t          | j                            t          t          t          | j                            t          | j                  f          S r   )rb   ra   rO   r   r   r   r   r   r   s    r   rc   z DistributedLinearLayout.__hash__   s    #eT^,,--#eT_--..#eT_--..#eT-..//$*
   	r   c                 *    t          | j                  S r   )rA   r   r   s    r   r   zDistributedLinearLayout.rank   re   r   )r   r   r   r   r   rg   rh   rD   r%   r(   rc   r   r   rj   rk   s   @r   r   r      s         
 
 DIT#YT#Yd3i   9& & & & &$A A Am m m     X    r   r   c                        e Zd ZU dZeed<   eed<   eed<    fdZd Zde	fdZ
d	 Zed
             Zed             Z xZS )DotOperandLayouta
  
    Represents a layout for a dot operand.

    Args:
        operand_index (int): 0 for LHS and 1 for RHS of the dot operation.
        parent (DistributedLayout): The parent layout, representing the MMA.
        k_width (int): Number of elements per 32-bits.
    operand_indexro   k_widthc                 @   t                                          dt          | j                             t                                          dt          | j                             t                                          dt          | j                             d S )Nr   ro   r   )r?   r@   r   r   ro   r   rq   s    r   rD   zDotOperandLayout.__post_init__   sy    O-A$BT-U-UVVVH&:4;&G&GHHHI';DL'I'IJJJJJr   c                 t    |                     | j        | j                            |          | j                  S r   )get_dot_operand_layoutr   ro   r%   r   r#   s     r   r%   zDotOperandLayout._to_ir   s2    --d.@$+BTBTU\B]B]_c_klllr   rG   c                 \    d| j          d| j                                         d| j         dS )NDOrL   )r   ro   r(   r   r   s    r   r(   zDotOperandLayout.mangle   s6    PD&PP););)=)=PPPPPPr   c                 D    t          | j        | j        | j        f          S r   )rb   r   ro   r   r   s    r   rc   zDotOperandLayout.__hash__   s    T'dlCDDDr   c                     | j         j        S r   rz   r   s    r   r   zDotOperandLayout.rank   s    {r   c                 X   t          t          | j        dg                     pg }|sg S | j        j        t	          fd|D                       sJ | j        dk    rdz
  ndz
  }d|cxk    rk     sn J g }|D ]+}t          |          }d||<   |                    |           ,|S )Nr=   c              3   >   K   | ]}t          |          k    V  d S r   rA   )rY   r}   r   s     r   r[   z.DotOperandLayout.cga_layout.<locals>.<genexpr>   s.      EE%3u::%EEEEEEr   r   ry      )r   getattrro   r   allr   ri   append)r   r   k_dimderivedr}   	new_basisr   s         @r   r=   zDotOperandLayout.cga_layout   s    0lTV1W1WXX^\^  	I{EEEE3DEEEEEEEE .!33qE    D      & 	& 	&EUI IeNN9%%%%r   r   rk   s   @r   r   r      s           LLLK K K K K
m m mQ Q Q Q QE E E     X    X    r   r   )r   eqc                        e Zd ZU dZee         ed<   ee         ed<   ee         ed<    ee          Z	eee                  ed<    fdZ
d Zd	efd
Zd Zed             Z xZS )NVMMADistributedLayouta_  
    Represents a layout for NVIDIA MMA (tensor core) operations.

    Args:
        version (List[int]): Version identifier for the MMA instruction.
        warps_per_cta (List[int]): Number of warps per CTA.
        instr_shape (List[int]): Instruction shape for MMA.
        cga_layout (Optional[List[List[int]]]): Bases describing CTA tiling.
    versionr9   instr_shaper;   r=   c                    t                                          dt          | j                             t                                          dt          | j                             t                                          dt          | j                             t                              | d| j                   d S )Nr   r9   r   r=   )r?   r@   r   r   r9   r   rB   r=   rq   s    r   rD   z$NVMMADistributedLayout.__post_init__  s    I';DL'I'IJJJO-A$BT-U-UVVVM+?@P+Q+QRRR4t?????r   c                 Z    |                     | j        | j        | j        | j                  S r   )get_mma_layoutr   r9   r=   r   r#   s     r   r%   zNVMMADistributedLayout._to_ir!  s0    %%LO	
 
 	
r   rG   c           	          | j         r$d                    d | j         D                       nd}d| j         d| j         d| j         d| d	S )NrL   c              3   f   K   | ],}d                      t          t          |                    V  -dS rV   rM   rX   s     r   r[   z0NVMMADistributedLayout.mangle.<locals>.<genexpr>*  r\   r   rK   MMA__MMA)r=   rN   r   r9   r   r   r=   s     r   r(   zNVMMADistributedLayout.mangle)  sc    UYUdlSXXQQQQQQQQjl
]dl]]T%7]]$:J]]Z]]]]r   c           	          t          t          | j                  t          | j                  t          | j                  t          d | j        D                       f          S )Nc              3   4   K   | ]}t          |          V  d S r   r`   rX   s     r   r[   z2NVMMADistributedLayout.__hash__.<locals>.<genexpr>/  (      AA#5::AAAAAAr   )rb   ra   r   r9   r   r=   r   s    r   rc   zNVMMADistributedLayout.__hash__-  s_    U4<((%0B*C*CU4K[E\E\AAAAAAAC D D 	Dr   c                 *    t          | j                  S r   )rA   r9   r   s    r   r   zNVMMADistributedLayout.rank1  s    4%&&&r   rf   rk   s   @r   r   r   
  s           #Y9c"'%"="="=JT#Y===@ @ @ @ @
 
 
^ ^ ^ ^ ^D D D ' ' X' ' ' ' 'r   r   c                   (    e Zd ZdZed             ZdS )SharedLayoutz;
    Base class for shared memory layouts in Gluon IR.
    c                      t          |           S r   r   r   s    r   r   zSharedLayout.type;  r   r   N)r   r   r   r   r   r   r   r   r   r   r   6  s9          $ $ X$ $ $r   r   c                    |s| S t          |           }t          |d                   }dg|z  }|D ]H}t          |          |k    sJ t          |          D ]!}t          ||         ||                   ||<   "It          |          D ]}||xx         dz  cc<   t          |          D ]=}||         ||         z  dk    sJ d|  d|             ||xx         ||         z  cc<   >|S )Nr   ry   r   zShape z  is not divisible by CGA layout )ri   rA   rangemax)r   r=   shape_per_ctar   	cga_shaper}   irn   s           r   _get_shape_per_ctar   @  s9    KKMz!}Dd
I 7 75zzT!!!!t 	7 	7Ay|U1X66IaLL	7 4[[  !T{{ . .S!IcN2a7779u%9u9uis9u9u777cy~-r   c                        e Zd ZU dZeed<   eed<   dZeed<   dZeed<   dZ	eed<    e
e	          Zeee                  ed
<    fdZd Zeedd                        ZdefdZd Z xZS )NVMMASharedLayouta  
    Represents a layout for shared memory suitable for NVIDIA MMA operations.

    Args:
        swizzle_byte_width (int): Width in bytes for swizzling.
        element_bitwidth (int): Bitwidth of element type.
        rank (int): Rank of the tensor.
        transposed (bool): Whether the layout is transposed.
        fp4_padded (bool): Whether FP4 padding is used.
        cga_layout (Optional[List[List[int]]]): Bases describing CTA tiling.
    swizzle_byte_widthelement_bitwidthr   r   F
transposed
fp4_paddedr;   r=   c                    t                                          dt          | j                             t                                          dt          | j                             t                                          dt          | j                             t                                          dt          | j                             | j        pg }|r t          |d                   | j	        k    sJ t                                          dt          | j	                             t                                          dt          |                     | j        dv sJ | j        d	v sJ d S )
Nr   r   r   r   r   r   r=   )          @   )r   r   r      )
r?   r@   r   r   r   r   r   r=   rA   r   )r   r=   rC   s     r   rD   zNVMMASharedLayout.__post_init__h  sG   02FtG^2_2_```.0DTEZ0[0[\\\L*>t*O*OPPPL*>t*O*OPPP _*
 	3z!}%%2222F$8$C$CDDDL*>z*J*JKKK$7777&*:::::::r   c                 r    |                     | j        | j        | j        | j        | j        | j                  S r   )get_nvmma_shared_layoutr   r   r   r   r=   r   r#   s     r   r%   zNVMMASharedLayout._to_iry  s:    ..#!OOOI
 
 	
r   Nc                    |rdnd}|| nt          | |          }t          |           }|r|dd         |dd         z   }|d         |z  }||j        z  dz  }	|	dk    r|	dz  dk    rd}
n&|	dk    r|	dz  dk    rd}
n|	d	k    r|	d	z  dk    rd	}
nd}
d}|dd         D ]}||z  }t          |           dk     s|dk     rd}
t          |
|j        ||||
          S )zReturns an NVMMASharedLayout with default swizzling for a given shape.

        This picks the largest swizzle pattern compatible with the shape, which
        allows emitting the fewest TMA or MMA messages.
        r   ry   Nr   r   r   r   r   )r   r   r   r   r   r=   )r   rA   primitive_bitwidthr   )block_shapedtyper   r   r=   packing_factorr   r   contig_dim_sizecontig_dim_bytesr   flatten_outer_dimsizes                r   get_default_forz!NVMMASharedLayout.get_default_for  so    )/a'1'9?QR]_i?j?j; 	B)!""-bqb0AAM'+n<*U-EEJs""'7#'='B'B!$##(82(=(B(B!###(82(=(B(B!#!"!#2#& 	& 	&D%{a#4q#8#8!" 1"5!!!
 
 
 	
r   rG   c                     | j         r$d                    d | j         D                       nd}d| j         d| j         d| j         d| j         d| dS )NrL   c              3   f   K   | ],}d                      t          t          |                    V  -dS rV   rM   rX   s     r   r[   z+NVMMASharedLayout.mangle.<locals>.<genexpr>  r\   r   rK   NVMMA__NVMMA)r=   rN   r   r   r   r   r   s     r   r(   zNVMMASharedLayout.mangle  s    UYUdlSXXQQQQQQQQjl
 A/  A  A$2G  A  A$/  A  A\`\k  A  Anx  A  A  A  	Ar   c                     t          | j        | j        | j        | j        | j        | j        rt          d | j        D                       nd f          S )Nc              3   4   K   | ]}t          |          V  d S r   r`   rX   s     r   r[   z-NVMMASharedLayout.__hash__.<locals>.<genexpr>  r   r   )rb   r   r   r   r   r   r=   ra   r   s    r   rc   zNVMMASharedLayout.__hash__  s`    T,d.CTYPTP_aeapEI_^UAAAAAAAAZ^` a a 	ar   )FFN)r   r   r   r   rg   rh   r   r   boolr   r   ri   r=   r   rD   r%   staticmethodr	   r   rP   r(   rc   rj   rk   s   @r   r   r   T  s!        
 
 D#MMMJJ"'%"="="=JT#Y===; ; ; ; ;"
 
 
 #
 #
 #
  \#
JA A A A Aa a a a a a ar   r   c                        e Zd ZU dZeed<   eed<   eed<   ee         ed<    ee          Z	eee                  ed<    fdZ
d	 Zd
efdZd Z xZS )SwizzledSharedLayoutaq  
    Represents a generic swizzled shared memory layout.

    Args:
        vec (int): Vector width for swizzling.
        per_phase (int): Elements per swizzle phase.
        max_phase (int): Maximum number of swizzle phases.
        order (List[int]): Dimension ordering for swizzling.
        cga_layout (Optional[List[List[int]]]): Bases describing CTA tiling.
    rZ   	per_phase	max_phaser:   r;   r=   c                    t                                          dt          | j                             t                                          dt          | j                             t                                          dt          | j                             t                                          dt          | j                             t                              | d| j                   d S )NrZ   r   r   r:   r=   )	r?   r@   r   rZ   r   r   r:   rB   r=   rq   s    r   rD   z"SwizzledSharedLayout.__post_init__  s    E#7#A#ABBBK)=dn)M)MNNNK)=dn)M)MNNNG%9$*%E%EFFF4t?????r   c                 f    |                     | j        | j        | j        | j        | j                  S r   )get_swizzled_shared_layoutrZ   r   r   r:   r=   r#   s     r   r%   zSwizzledSharedLayout._to_ir  s3    11HNNJO
 
 	
r   rG   c                     d }| j         r$d                    d | j         D                       nd}d| j         d| j         d| j         d || j                   d| dS )Nc                 Z    | dS d                     t          t          |                     S rJ   rM   rQ   s    r   rS   z.SwizzledSharedLayout.mangle.<locals>.stringify  rT   r   rL   c              3   f   K   | ],}d                      t          t          |                    V  -dS rV   rM   rX   s     r   r[   z.SwizzledSharedLayout.mangle.<locals>.<genexpr>  r\   r   rK   SSS__SSS)r=   rN   rZ   r   r   r:   )r   rS   r=   s      r   r(   zSwizzledSharedLayout.mangle  s    	) 	) 	)
 VZUdlSXXQQQQQQQQjl
kdhkkkk$.kk99TZCXCXkk[ekkkkr   c           
          t          | j        | j        | j        t	          | j                  t	          d | j        D                       f          S )Nc              3   4   K   | ]}t          |          V  d S r   r`   rX   s     r   r[   z0SwizzledSharedLayout.__hash__.<locals>.<genexpr>  s-      OvOv_bPUVYPZPZOvOvOvOvOvOvr   )rb   rZ   r   r   ra   r:   r=   r   s    r   rc   zSwizzledSharedLayout.__hash__  sP    Xt~t~uTZ7H7H%OvOvfjfuOvOvOvJvJvwy y 	yr   )r   r   r   r   rg   rh   r   r   ri   r=   rD   r%   rP   r(   rc   rj   rk   s   @r   r   r     s         	 	 
HHHNNNNNN9"'%"="="=JT#Y===@ @ @ @ @
 
 
l l l l ly y y y y y yr   r   c                        e Zd ZU dZeee                  ed<   eee                  ed<   eee                  ed<   ee         ed<    fdZd Zde	fd	Z
d
 Zeed                         Zd Z xZS )PaddedSharedLayouta  
    Represents a layout for the access to shared memory. Compared to SwizzledSharedLayout,
    it combined padding and element reordering via linear transformation (e.g. row permutation)
    to avoid shared memory bank conflicts. After every interval tensor elements, the
    corresponding number of padding elements are inserted. If a position corresponds to
    multiple intervals, the padding amounts are summed.

    In the following example of a tensor,
    `eM` represents original elements in the and `pN` represents padded element.

    Before padding, the shared memory looks like:
    [e0, e1,
     e2, e3,
     e4, e5,
     e6, e7,
     ...]

    After padding with interval-padding list [[2, 1], [4, 2]] with an identity remapping,
    the shared memory will be
    [e0, e1, p0,
     e2, e3, p1, p2, p3,
     e4, e5, p4,
     e6, e7, p5, p6, p7,
     ...]

    Furthermore this encoding allows for a linear remapping from the 1-D shared
    memory offset to logical n-D tensor elements. The remapping is given in the form
    of linear bases mapping from offset to [dim0, dim1...dimN-1].
    See LinearLayout.h for more details how linear layouts are applied to remap
    elements.
    Some concrete examples using `xN` and `yN` to mean the logical n-D tensor elements
    and `pN` to mean padding:

    After padding for shape = [8] with interval-padding list [[2, 2]], offset_bases = [[2], [1]] and block_bases = []:
    [x0, x2, p0 p1, x1, x3]

    After padding for shape = [8, 4] with interval_padding_pairs = [[8, 1]], offset_bases = [[0, 1], [0, 2], /*gap, stride by 2 rows*/[2, 0], [4, 0], [1, 0]]] and block_bases = []:
    [
        x0y0, x0y1, x0y2, x0y3,
        x2y0, x2y1, x2y2, x2y3,
        p0,
        x4y0, x4y1, x4y2, x4y3,
        x6y0, x6y1, x6y2, x6y3,
        p1,
        x1y0, x1y1, x1y2, x1y3,
        x3y0, x3y1, x3y2, x3y3,
        p2,
        x5y0, x5y1, x5y2, x5y3,
        x7y0, x7y1, x7y2, x7y3,
    ]

    Args:
        interval_padding_pairs (List[int]): List of [interval, padding] pair and both interval and padding must be powers of 2.
        offset_bases (List[int]): Bases for shared memory offsets
        block_bases (List[List[int]]): Bases for block-level shared memory offsets.
        shape (List[int]): n-D logical shared memory shape
    interval_padding_pairsoffset_basesr   r   c                 t   t                                          dt          | j                             t                                          dt          | j                             t                                          dt          | j                             t                                          dt          | j                             t          | j                  }| j        D ]}t          |          |k    sJ | j        D ]}t          |          |k    sJ |                                  d S )Nr   r   r   r   )	r?   r@   r   r   r   r   r   rA   verifyr   s      r   rD   z PaddedSharedLayout.__post_init__%  s   4mDD_6`6`aaaNM$:K,L,LMMMM=9I+J+JKKKG]4:%>%>???4:& 	& 	&Eu::%%%%%% 	& 	&Eu::%%%%%r   c                 t    t          | j         \  }}|                    ||| j        | j        | j                  S r   )zipr   get_padded_shared_layoutr   r   r   )r   r$   	intervalspaddingss       r   r%   zPaddedSharedLayout._to_ir4  s;    !4#>?	8//	8TEVX\Xhjnjtuuur   rG   c           	      H    d| j          d| j         d| j         d| j         d	S )NPaddedShared_rL   _PaddedShared)r   r   r   r   r   s    r   r(   zPaddedSharedLayout.mangle8  s9    }t:}}T=N}}QUQa}}dhdn}}}}r   c                    | j         }t          |          dk    s
J d            t          d |D                       sJ t          | \  }}t	          t          |                    }t          |          t          |          k    sJ d t          fd|D                       s
J d            t          fd|D                       s
J d            t          | j                  }|dk    s
J d	            d S )
Nr   zVPaddedSharedLayout interval_padding_pairs must have at least one interval-padding pairc              3   <   K   | ]}t          |          d k    V  dS )r   Nr   )rY   pairs     r   r[   z,PaddedSharedLayout.verify.<locals>.<genexpr>>  s,      44d3t99>444444r   c                 &    | dk    o| | dz
  z  dk    S Nr   ry   r   ns    r   <lambda>z+PaddedSharedLayout.verify.<locals>.<lambda>D      !a%"<AQK1,< r   c              3   .   K   | ]} |          V  d S r   r   rY   r  is_power_of_2s     r   r[   z,PaddedSharedLayout.verify.<locals>.<genexpr>E  s-      77==##777777r   z;PaddedSharedLayout interval values must all be power of twoc              3   .   K   | ]} |          V  d S r   r   r  s     r   r[   z,PaddedSharedLayout.verify.<locals>.<genexpr>F  s-      66==##666666r   z:PaddedSharedLayout padding values must all be power of twoz*PaddedSharedLayout order must not be empty)r   rA   r   r   ri   setr   )r   pairsr   r   unique_intervalsr   r  s         @r   r   zPaddedSharedLayout.verify;  s   +5zzA~~~w~~~44e44444444!5k	8I//#$$I6666<<7777Y77777vv9vvv76666X66666tt8ttt64:axxxExxxxxr   c           	         t          |          t          |          k    sJ d t          fd|D                       sJ t          |          }g }|D ]ht          t          t	          j        |                                       D ]1|                    fdt          |          D                        2it          | |g |          S )zReturns a PaddedSharedLayout with the given interval and padding pairs and an identity mapping as the linear component for the given shape and order.
        c                 &    | dk    o| | dz
  z  dk    S r  r   r  s    r   r  z6PaddedSharedLayout.with_identity_for.<locals>.<lambda>Q  r  r   c              3   .   K   | ]} |          V  d S r   r   r  s     r   r[   z7PaddedSharedLayout.with_identity_for.<locals>.<genexpr>R  s-      33==##333333r   c                 *    g | ]}|k    rd z  ndS )ry   r   r   )rY   r   r}   rn   s     r   r~   z8PaddedSharedLayout.with_identity_for.<locals>.<listcomp>Y  s)    $X$X$Xq188Q%ZZ$X$X$Xr   )rA   r   r   rg   mathlog2r   r   )r   r   r:   r   r   r}   rn   r  s        @@@r   with_identity_forz$PaddedSharedLayout.with_identity_forK  s    
 5zzSZZ''''<<3333U333333335zz 	Z 	ZCs49U3Z#8#899:: Z Z##$X$X$X$X$XERVKK$X$X$XYYYYZ ""8,ERRRr   c           
      *   t          t          t          t          | j                            t          t          t          | j                            t          t          t          | j                            t          | j                  f          S r   )rb   ra   rO   r   r   r   r   r   s    r   rc   zPaddedSharedLayout.__hash__]  sp    U3ud&ABBCCU3uVZVgKhKhEiEi3ud&67788%
:K:KM N N 	Nr   )r   r   r   r   r   rg   rh   rD   r%   rP   r(   r   r   r	   r  rc   rj   rk   s   @r   r   r     s        8 8r !cO+++tCy/!!!d3i   9    v v v~ ~ ~ ~ ~F F F  S S  \S N N N N N N Nr   r   c                        e Zd ZU dZeee                  ed<    ee          Z	eee                  ed<   dZ
eed<    fdZd Zd	efd
Zd Z xZS )SharedLinearLayoutzGRepresents a shared memory layout defined via an explicit LinearLayout.r   r;   r   r   	alignmentc                    t                                          dt          | j                             t                                          dt          | j                             t                                          dt          | j                             t          | j                  dk    s
J d            t          | j        d                   }|dk    s
J d            | j        D ]}t          |          |k    sJ | j        D ]}t          |          |k    sJ | j        dk    r| j        | j        dz
  z  dk    s
J d            d S )Nr   r   r  r   z1SharedLinearLayout offset_bases must not be emptyry   z<SharedLinearLayout alignment must be a positive power of two)r?   r@   r   r   r   r   r  rA   r   s      r   rD   z SharedLinearLayout.__post_init__j  sV   NM$:K,L,LMMMM=9I+J+JKKKK)=dn)M)MNNN4$%%***,_***4$Q'((axxxLxxx& 	& 	&Eu::%%%%%% 	& 	&Eu::%%%%%~!!t~!9K'LQR&R&R&RJ 'S&RR&R&Rr   c                 N    |                     | j        | j        | j                  S r   )get_shared_linear_layoutr   r   r  r#   s     r   r%   zSharedLinearLayout._to_iry  s$    //0A4CSUYUcdddr   rG   c                 8    d| j          d| j         d| j         dS )NSharedLinear_rL   _SharedLinear)r   r   r  r   s    r   r(   zSharedLinearLayout.mangle|  s+    ct0cc43Cccdnccccr   c           	          t          t          t          t          | j                            t          t          t          | j                            | j        f          S r   )rb   ra   rO   r   r   r  r   s    r   rc   zSharedLinearLayout.__hash__  sN    #eT.//00#eT-..//N
   	r   )r   r   r   r   r   rg   rh   r   ri   r   r  rD   r%   rP   r(   rc   rj   rk   s   @r   r  r  b  s         QQtCy/!!!#(5#>#>#>Kd3i>>>IsK K K K Ke e ed d d d d      r   r  c                     dg|z  }| s|S d }| D ]T}t          d t          |          D             d           }||}||xx         dz  cc<   >|s|J ||xx         dz  cc<   U|S )Nry   c              3   ,   K   | ]\  }}|d k    |V  dS )r   Nr   )rY   r   vs      r   r[   z bases_per_dim.<locals>.<genexpr>  s*      ==$!Qa1ffAffff==r   r   )next	enumerate)basesr   skip_broadcastresultnon_zero_idxr}   idxs          r   bases_per_dimr)    s    S4ZF L 	& 	&==)E"2"2===tDD?L3KKK1KKKK 	&+++<   A%   Mr   c                     t          | t                    r"t          | j        t	          |                    S t          | t
          t          f          rt          | j        |          S | j        S r   )	
isinstancer   r)  r   rA   rm   r   r9   ro   )layoutr   s     r   r9   r9     sa    &122 $V.E

;;;	F[*:;	<	< $V]E222##r   )T)dataclassesr   r   typingr   triton.language.corer   r   r   triton.runtime.jitr	   r  r   r    r/   r6   rm   r   r   r   r   r   r   r   r   r  r)  r9   r   r   r   <module>r1     s;   ( ( ( ( ( ( ( (       T T T T T T T T T T 1 1 1 1 1 1 S S S S S S S S $
3 
3 
3 
3 
3" 
3 
3 
3 $
8 
8 
8 
8 
8' 
8 
8 
8 $: : : : :% : : :z $'X 'X 'X 'X 'X# 'X 'X 'XT $6 6 6 6 6/ 6 6 6r $0 0 0 0 0( 0 0 0f $4   (' (' (' (' ('. (' (' ! ('V$ $ $ $ $ $ $ $   & $[a [a [a [a [a [a [a [a| $4   .y .y .y .y .y< .y .y ! .yb $4   yN yN yN yN yN yN yN ! yNx $! ! ! ! ! ! ! !J   ,$ $ $ $ $r   