
    /j(I                   z   U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZmZmZmZmZmZm Z  d dl!m!Z! d dlm"Z" d dl	m#Z# d dl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1 d dl2m3Z3m4Z4m5Z5 d d	lm6Z6 d dl7Z7d dl8Z8d dl9m:c m;Z< d d
l=m>Z> d dl?m@Z@ d dlAmBZB d dlCmDZD d dlEmFZF d dl9mGZGmHZH ddgZId dlJmKZKmLZLmMZMmNZN e-rhd dlmOZOmPZPmQZQ d dlRmSZS d dl8mTZTmUZUmVZV d dlWmXZX d dlYmZZZ d dl[m\Z\ d dl]m^Z^ ddl_m`Z` ddlambZb ddlcmdZd ddlemfZf ddlgmhZhmiZimjZjmkZkmlZlmmZm dd lnmoZo dd!lpmqZqmrZr g d"Zs e0d#          Zteju        dtd&            Zvd d'lwmxZx d d(lymzZz d d)l{m|Z| d d*l}m~Z~ d d+lmZ d d,lmZ d d-lmZmZmZmZmZ d d.lmZmZ d d/lmZmZ dd0lmZ dd1lmZ ej        d2k    Z ej        e          Z e0d3          Zee7j        e7j        f         Ze+e1e8j        ee8jV        f                  Zerd4n ej        d5d6          Zd7d8d9e d:Zd;Zd;Zd;Zd<Z eFe8j        e8j        e8j        e8j        e8j        e8j        e8j        e8j        e8j        e8j        e8j        e8j        e8j        e8j        e8j        g          Zd=ed><   d?Zeedz
  z  d k    red@k    s
J dA            dudDZdvdHZ G dI dJe7j                  Z ej        dKL           G dM dN                      ZdwdxdVZ	 	 	 dydzdYZ	 	 	 dydzdZZeju        d{d[            Zd|d_Zd}dbZÐd~dfZĐddiZŐddmZddpZƐddtZǐddwZȐddyZɐdd|ZʐddZd fddZ̐ddZ͐dddZ	 	 dddZ	 	 	 	 	 dddZАddZѐddZҐddZӐddZԐddZ e4d          Z e0ddK          Zee'e%ef         ef         Z G d de,e(eef                   ZِddZڐddZېddZܐddZݐddZސddńZ	 ddd˄Zdd΄ZddЄZddӄZddՄZddلZddۄZdd݄ZddZddZddZ eg d          ZddZddZddZd dlZddZg Zded<   ddZddZej        dd            Zej        	 	 	 ddd            ZeZeZeZdWdddZdWdddZ ej"        d@          dd            Z G d de*          Zej         G d d                      Z G d	 d
          Z  G d de           Zej        dd            Z G d d          Z G d de          Zeju        ddd            Zej"        dd            Zej"        d{d            ZddZ	 dddZ	dd!Z
dd#Zdd$ZdWdWdKd%dd)ZddWd*dd0ZdWd1dd2ZdWd1dd3Zdd9Z ej"        d:          d{d;            Z ej"        d:          d{d<            Z ej"        d:          d{d=            ZddFZddJZ	 	 dddOZddQZe1ee7j        f         ZdRedK<   eju        	 dddT            Zeju        ddU            Zeju        dÐdV            Zeju        dĐdW            Zeju        dŐdY            ZdƐdZZdd[Z dd\Z!dƐd]Z"dƐd^Z#dǐdbZ$	 	 	 	 dȐdɐdgZ%d{dhZ& G di dj          Z'dʐdoZ(dʐdpZ)dːdrZ*d̐dsZ+d͐dtZ,d͐duZ-dΐdwZ.ej        dϐdz            Z/	 ddАd~Z0dѐdZ1dҐdZ2dӐdZ3dӐdZ4dԐdZ5dՐdZ6ej        d֐d            Z7ddZ8eju        dd            Z9eju        dאd            Z:eju        dd            Z;ddZ<ddZ=dؐdZ>dِdZ?d{dZ@d{dZAdڐdZBddZC G d dejD                  ZEdېdZFdܐdZGdܐdZH	 ddݐdZIdސdZJ	 ddߐdZKddZLddZMddZNd fddZOd fddZPddĄZQddƄZRej         G dǄ dȦ                      ZSej        ddʄ            ZTdd˄ZUdd̄ZVd{d̈́ZWdd΄ZXddτZYddՄZZddքZ[ddلZ\ddۄZ]dd݄Z^dd߄Z_ddZ`ddZaddZb	 dddZcddZdddZeddZfd{dZgddZhddddddddZid  eij                                D             Zk ejl        d          ZmddZnddZoddZpddZqeju        dd            Zrej         G d	 d
                      Zsi Ztded<   ddZu eF            Zvded<   ddZwdddZxddZy e0d          Zz e0d          Z{ G d deeze{f                   Z| e3dK          ddKdLd d            Z}dd!Z~ G d" d#ejD                  Zeju        dd$            Zd{d%Zdd'Zdd*Zdd+Zdd-Zdtd.Zdd/Zd{d0Zdd1Zd2Zdd3Zdd4Zd	d9Z	 	 dd
dAZddCZddFZd{dGZddKZddMZ ej        dKL           G dN dO                      ZedPe%f         Zeeegef         Z G dQ dR          Z e            ZddTZddVZddYZdd\Zdd]Z eFg d^          Zdd_Ze"dd`            ZddbZddmZddpZ	 dddsZdS (      )annotationsN)Callable
Collection	GeneratorIteratorMappingMutableMapping
MutableSet)datetime)	lru_cache)StringIO)AnycastConcatenateGenericLiteral
NamedTupleOptionalProtocolTYPE_CHECKING	TypeAlias	TypeGuardTypeVarUnion)dataclass_transform	ParamSpecSelf)mock)datasheet_tops)DeviceProperties)_needs_inductor_compile)dtype_abbrs)
OrderedSet)tree_flattentree_map_only!activation_quantization_aten_passinductor_autotune_lookup_table)free_symbolsfree_unbacked_symbolsIterateExprsShapeEnv)IterableSequence
ValuesView)Path)SymBoolSymFloatSymInt)ELEMENTWISE_TYPE_PROMOTION_KIND)GraphModule)Node)ScalingType   )WorkspaceArgPythonWrapperCodegen)DepGraphLowering)BufferExternKernelIRNodeLayout	OperationReinterpretViewCompiledFxGraph)BaseSchedulerNodeSchedulerBuffer)cudampsxpumtiaTreturnstrc                     d t           D             } t          |           dk    sJ t          |           dk    rdn|                                 }|S )Nc                `    g | ]+}t          t          |                                          )|,S  )getattrtorchis_available.0xs     Z/home/longshao/multi-rider-rag/.venv/lib/python3.11/site-packages/torch/_inductor/utils.py
<listcomp>z get_gpu_type.<locals>.<listcomp>l   s3    KKK'%*;*;*H*H*J*JK!KKK    r7   r   rH   )	GPU_TYPESlenpop)
avail_gpusgpu_types     rX   get_gpu_typer`   j   sT    KKYKKKJz??aZA--vv:>>3C3CHOrZ   )get_interface_for_device)detect_fake_mode)
DeviceType)	EventList)GraphTransformObserver)	ShapeProp)CeilDivCleanDivFloorDivIdentityModularIndexing)make_symbolSymT)bound_sympyValueRangesconfig)ceildivwin32_TspvTORCHINDUCTOR_XPU_KERNEL_FORMATzebinz.cubinz.hsaco.)rH   hiprJ         zOrderedSet[torch.dtype]_TMA_SUPPORTED_DTYPES@      zmust be power of 2nbytesintc                .    | t           z   dz
  t            z  S )z/Round up to the nearest multiple of ALIGN_BYTESr7   )ALIGN_BYTES)r   s    rX   _alignr      s    [ 1$44rZ   v
sympy.Exprboolc                   t          | t          j        t          j        f          r't	          t          t          | j                            S t          | t                    p"t          j	        | t                    t          k    S )z:v can be statically proven to be a multiple of ALIGN_BYTES)
isinstancesympyAddMaxallmap_is_alignedargsaligngcdr   )r   s    rX   r   r      s]    !ei+,, -3{AF++,,,aK59Q#<#<#KKrZ   c                  2    e Zd ZdZdZdZed
d            Zd	S )r   z<Symbolically round up to the nearest multiple of ALIGN_BYTESr7   Tvaluer   rM   Optional[sympy.Expr]c                    t          |t          t          j        f          rt	          t          |                    S t          |          r|S d S N)r   r   r   Integerr   r   )clsr   s     rX   evalz
align.eval   sN    ec5=122 	&#e**%%%u 	L	 	rZ   N)r   r   rM   r   )__name__
__module____qualname____doc__nargs
is_integerclassmethodr   rQ   rZ   rX   r   r      sB        FFEJ   [  rZ   r   T)frozenc                  <    e Zd ZU dZded<   ded<   ded<   ded<   d	S )
GraphPartitionMapzP
    Mapping from the partition info (e.g., input/output) to the graph info
    r   idzlist[Optional[int]]input_index_mappingoutput_index_mapping	list[str]constant_namesNr   r   r   r   __annotations__rQ   rZ   rX   r   r      sO          
 GGG -,,,---- rZ   r      d   fnCallable[[], Any]warmuprepfloatc                    |              t           j                                         t          j        t	          d          t           j        d          }t           j                            d          }t           j                            d          }|                                 t          d          D ] }|	                                  |              !|                                 t           j                                         |
                    |          dz  }t          dt	          ||z                      }t          dt	          ||z                      }	t          |          D ]} |              d t          |	          D             }d	 t          |	          D             }t           j                            t           j        j        j        g
          5 }
t           j                                         t          |	          D ]}|	                                 ||                                          t           j        j                            d          5   |              ddd           n# 1 swxY w Y   ||                                          t           j                                         t          j        d t%          ||          D                       }ddd           n# 1 swxY w Y   t          j        |                                          }t*                              d           t*                              |
                                                    dd                     t3          d |
                                D                       }|r$|t7          j        d |D                       dz  z  }t*                              d|           |S )R  
    Returns benchmark results by examining torch profiler events.
    This could be more accurate as it doesn't count CPU side overhead.
    However, this also requires manually excluding irrelevant event, e.g.
    vectorized_elementwise_kernel which is used to fill L2 cache,
    various CUDA events, etc, so could also be fragile.
        ArH   dtypedeviceTenable_timing   r7   c                N    g | ]"}t           j                            d           #S Tr   rS   rH   EventrV   _s     rX   rY   zfp8_bench.<locals>.<listcomp>  s+    QQQA5:##$#77QQQrZ   c                N    g | ]"}t           j                            d           #S r   r   r   s     rX   rY   zfp8_bench.<locals>.<listcomp>  s+    OOO!!!!55OOOrZ   
activitiesRunCudaModuleNc                >    g | ]\  }}|                     |          S rQ   )elapsed_time)rV   ses      rX   rY   zfp8_bench.<locals>.<listcomp>  s(    GGG41aQ^^AGGGrZ   
raw eventsself_device_time_totalsort_by	row_limitc                p    g | ]3}|j         t          j        k    rt          j        d |j                  1|4S )zfused_abs_max_\d)device_typerc   CUDArematchnamerV   events     rX   rY   zfp8_bench.<locals>.<listcomp>  sJ     	
 	
 	
!Z_44H0%*==I	  JIIrZ   c              3  $   K   | ]}|j         V  d S r   device_time_totalr   s     rX   	<genexpr>zfp8_bench.<locals>.<genexpr>(  s%      QQE3QQQQQQrZ        @@profiling results: %s ms)rS   rH   synchronizeemptyr   float16r   recordrangezero_r   maxprofilerprofileProfilerActivityr   nvtxtensorzipmeanitemlogdebugkey_averagestablerd   events
statistics)r   r   r   cachestart_event	end_eventr   estimate_msn_warmupn_repeatpitimesresfiltered_eventss                  rX   	fp8_benchr      s    BDDD	JKJu}VLLLE *"""66K
  t 44I1XX  
	J**9559K 1c&;.//00H1c#+,,--H 8__  
QQxQQQKOOuXOOOI			N+0
 
  
 
 
 

   x 	" 	"AKKMMMN!!###&&77                aL!!!!
   GG3{I+F+FGGG
 

 
 
 
 
 
 
 
 
 
 
 
 
 
 
" *U


 
 
"
"CIIlIIann$$-EQS$TTUUU	
 	
	
 	
 	
	 	O  
OQQQQQQQ	

 II(#...Js8   (BK=*J5K=JK=J	A(K==LLFis_vetted_benchmarkingc                F    ddl m}   |t                    | |||          S )Nr   )may_distort_benchmarking_result)$torch._inductor.runtime.benchmarkingr  _do_bench_using_profiling)r   r   r   r   r  s        rX   do_bench_using_profilingr  0  sB    " UTTTTTE**+DEE
FC/  rZ   c                   |sddl m}  |             t                      }|                                t	          |          } |              |                                 t          j        t          d          t          j        |          }|	                    d          }|	                    d          }	|
                                 t          d          D ] }
|                                  |              !|	
                                 |                                 |                    |	          dz  }t          dt          ||z                      }t          dt          ||z                      }t          |          D ]}
 |              |                                 t          j                            t#          t          j        j                  g	          5 }t          |          D ] }
|                                  |              !|                                 d
d
d
           n# 1 swxY w Y   t&                              d           t&                              |                                                    dd                     t/          fd|                                D                       }t3          |          |z  dk    rt5          d|t3          |          |          t3          |          |z  t/          fdt7          |          D                       }|                                 |                                }t&                              d           t&                              |                    d                     t;          d |D                       dz  |z  }t&                              d|           |S )r   r   )may_ban_benchmarkingr   r   Tr   r   r7   r   Nr   r   r   r   c                f    g | ]-}|j         t          t                    k    r|j        d k    +|.S )zContext Sync)r   rR   rc   r   )rV   r   device_type_uppers     rX   rY   z-_do_bench_using_profiling.<locals>.<listcomp>  sK     	
 	
 	
 GJ8I$J$JJJ
n,,  -,,rZ   zWFailed to divide all profiling events into #repeat groups. #%s events: %d, #repeats: %sc                ,    g | ]\  }}|z  d k    |S r   rQ   )rV   r   r   num_event_per_groups      rX   rY   z-_do_bench_using_profiling.<locals>.<listcomp>  s8     	
 	
 	
5&&!++ +++rZ   zprofiling time breakdown)r   c              3  $   K   | ]}|j         V  d S r   r   r   s     rX   r   z,_do_bench_using_profiling.<locals>.<genexpr>  s%      AA%e%AAAAAArZ   r   r   )r  r  r`   upperra   r   rS   r   r   r   r   r   r   r   r   r   r   rR   r   r   r   r   r   rd   r   r\   RuntimeError	enumerate_build_treesum)r   r   r   r   r  r   device_interfacer   r   r   r   r   r   r   r   r   actual_eventsr   r  r  s                     @@rX   r  r  H  s*    " MMMMMM..K#))++/<<BDDD  """KJuyMMME #((t(<<K &&T&::I1XX  
  """**9559K 1c&;.//00H1c#+,,--H 8__  
  """			EN35FGG
 
  
 
 ' 
x 	 	AKKMMMBDDDD$$&&&' ' ' ' ' ' ' ' ' ' ' ' ' ' ' IIlIIann$$-EQS$TTUUU	
 	
 	
 	
	
 	
 	
 O ?h&!+++  
 
 	
 o..9	
 	
 	
 	
%o66	
 	
 	
 M !..00MII()))IIm!!B!//000
AA=AAA
A
AF
JX
UCII(#...Js   0AIIIc                    	 ddl m}  t          j                            dd           | d uo(t          t          t          j        dd           d          S # t          $ r Y dS t          $ r}dt          |          v sJ Y d }~dS d }~ww xY w)	Nr   )	roi_alignztorchvision::nmsMetatorchvisionr  Fztorchvision::nms does not exist)torchvision.opsr  rS   _C%_dispatch_has_kernel_for_dispatch_keyhasattrrR   opsImportErrorr  rN   )r  r   s     rX   has_torchvision_roi_alignr    s    ------667I6RRR$ 
EI}d33[*
 *
 	
    uu   0CFF::::uuuuus   AA 
B	"	B	+BB	r   "Union[Optional[torch.device], str]torch.devicec                :   | t          j        d          j        S t          | t                    rt          j        |           } | j        dvrM| j        Ft          | j                  }t          j        | j        |j        	                                          S | S )Ng        )cpumeta)index)
rS   r   r   r   rN   typer$  ra   Workercurrent_devicer   r  s     rX   decode_devicer)    s    ~|C  ''&# &f%%{/))fl.B3FK@@|FK/?/F/U/U/W/WXXXXMrZ   itIterable[sympy.Expr]c                `    t          j        t          j        | t          j        j                  S r   )	functoolsreduceoperatormulr   SOner*  s    rX   sympy_productr4    s    HL"egk:::rZ   seq1Sequence[sympy.Expr]seq2c           	         t          |           t          |          k    sJ t          j        t          d t	          | |          D                                 S )Nc              3  &   K   | ]\  }}||z  V  d S r   rQ   )rV   abs      rX   r   zsympy_dot.<locals>.<genexpr>  s*      >>daAE>>>>>>rZ   )r\   r   expandr  r   )r5  r7  s     rX   	sympy_dotr=    sN    t99D		!!!!<>>c$oo>>>>>???rZ   Iterable[_T]ValuesView[_T]c                >    d | D                                              S )Nc                .    i | ]}t          |          |S rQ   )r   rU   s     rX   
<dictcomp>zunique.<locals>.<dictcomp>  s     !!!BqEE1!!!rZ   )valuesr3  s    rX   uniquerD    s"    !!b!!!((***rZ   numberUnion[int, sympy.Expr]denomc           
        t          | t          j                  st          |t          j                  r4t          t          j        |           t          j        |                    S t          | t
                    rt          |t
                    s/J |  dt          |            d| dt          |                       t          | |          S )Nz: , )r   r   Exprrg   sympifyr   r%  runtime_ceildiv)rE  rG  s     rX   rr   rr     s     &%*%% DE5:)F)F Du}V,,emE.B.BCCC fc"" z%'='=  ;;T&\\;;U;;d5kk;; = 65)))rZ   keyOptional[torch.dtype]c                   | dS t          |                               d          d         }i dddddd	d
ddddddd	dddddddddddddddddd d!d"dd#d$d%d&}|                    d' t          |                                          D                        t          | t                     r| n
d(||          S ))Nz*i8rx   r   r   i1
float8e4nvfp8e4nvfloat8e5fp8e5float8e4b15fp8e4b15float8e4b15x4
fp8e4b15x4float8_e4m3fnfloat8_e5m2float8_e8m0fnuu8float4_e2m1fn_x2r   fp16bfloat16bf16float32fp32float64fp64int8i8int16i16int32i32int64i64u16u32u64)uint8uint16uint32uint64c                    i | ]}||S rQ   rQ   )rV   r   s     rX   rB  z_type_of.<locals>.<dictcomp>  s    1111111rZ   *)rN   splitupdatelistrC  r   )rM  	dtype_strtyss      rX   _type_ofr{    so   
 {uCs##B'Ii 	G 	z	
 	 	 	w 	$ 	D 	6 	F 	6 	6  	!" 	#$ 	%& 	'( /  C4 JJ11d3::<<00111222S#&&@33,@I,@,@@rZ   lst"Iterable[Union[int, torch.SymInt]]list[sympy.Expr]c                    d | D             S )z
    Gets the shape and stride of a tensor. For non-symbolic tensors, this is
    trivial. But for symbolic tensors, we need to map from SymIntNode into
    sympy.Expr.
    c                6    g | ]}t          j        |          S rQ   )r   rK  rV   r   s     rX   rY   z-convert_shape_to_inductor.<locals>.<listcomp>  s"    ***EM!***rZ   rQ   r|  s    rX   convert_shape_to_inductorr    s     +*c****rZ   valUnion[int, torch.SymInt]c                R    t          | t          j                  r| j        j        S | S )z
    Convert SymInt to sympy.Expr, leave int as is.

    Unlike sympy.sympify() which converts int to sympy.Integer,
    this function preserves int as int and only converts SymInt to Expr.
    r   rS   r2   nodeexprr  s    rX   convert_symint_to_exprr    s'     #u|$$ x}JrZ   r   c                    ddl m} t          | t                    r| nNt          | t          j                  rt          |           n%|j        j        j        	                    | d          S )zL
    Like convert_shape_to_symint, but operates on a single expression.
    r7   VN)hint)
virtualizedr  r   r   r   r   graphsizevars	shape_envcreate_symintnode)r   r  s     rX   convert_to_symintr    sr      a	
 !U]++LCFFF!+==ad=KKrZ    Iterable[Union[int, sympy.Expr]]list[Union[int, torch.SymInt]]c                    d | D             S )zz
    Takes a list of shapes from Inductor and converts them into symints (or just
    ints if all shapes are static).
    c                ,    g | ]}t          |          S rQ   )r  r  s     rX   rY   z+convert_shape_to_symint.<locals>.<listcomp>5  s!    ...Qa  ...rZ   rQ   r  s    rX   convert_shape_to_symintr  .  s     /.#....rZ   optorch._ops.OpOverloadc                H    t          d | j        j        D                       S )z-
    Does this op overload have aliasing
    c              3  (   K   | ]}|j         d uV  d S r   )
alias_inforV   r:  s     rX   r   zis_view.<locals>.<genexpr><  s)      FFAq|4'FFFFFFrZ   )any_schema	argumentsr  s    rX   is_viewr  8  s&     FF1EFFFFFFrZ   c                    dS NFrQ   )r   s    rX   <lambda>r  A  s     rZ   user5   is_pointwise_fn'Callable[[torch._ops.OpOverload], bool]c                   | j         dk    rdS t          | j        t          j        j                  s| j        t          j        u sdS t          t          j        j        | j                  }|t          j        u st          |          r t          fd| j        D                       S t          j        j        |j        v p
 |          S )z
    Do all uses of this op have torch.Tag.pointwise or return True for optional `is_pointwise_fn`

    Uses in views ops will follow the views uses
    call_functionFc              3  8   K   | ]}t          |          V  d S r   )is_pointwise_use)rV   ur  s     rX   r   z#is_pointwise_use.<locals>.<genexpr>R  s.      KKA#A77KKKKKKrZ   )r  r   targetrS   _ops
OpOverloadr/  getitemr   r  r   usersTag	pointwisetags)r  r  r  s    ` rX   r  r  ?  s     v  u3:uz4559<xGW9W9Wu%*'44F!!!WV__!KKKKKKKKKK9&+-H1H1HHrZ   r  r   r   	list[Any]kwargsdict[str, Any]&tuple[GraphModule, list[torch.Tensor]]c           	        t           j                                        g d	fd} j        | gt	          t           j        |||f          R  }t          | j        j                  dk    r+t          | j        j        d         j
                  dk    r|f}                    |           t           j                            i           }|fS )
Nargtorch.TensorrM   r5   c                x                         |                                dt                               S )Nr  )appendplaceholderr\   )r  g
graph_argss    rX   add_tensor_argz)gen_gm_and_inputs.<locals>.add_tensor_arg]  s8    #}}43z??44555rZ   r7   r   Tensor)r  r  rM   r5   )rS   fxGraphr  r%   r  r\   r  returnsrN   r%  outputr4   )r  r   r  r  r  gmr  r  s         @@rX   gen_gm_and_inputsr  W  s     	A%'J6 6 6 6 6 6 6 1?u|^dF^LL  D 	FN"##q((&q).//8;;wHHTNNN			b!	$	$Bz>rZ   rH   Nonec                    | dk    rd S t          |           }|                                r|                                 d S d S Nr"  )ra   rT   r   r(  s     rX   r   r   o  sT    /77$$&& '$$&&&&&' 'rZ   modelCallable[..., Any]example_inputsSequence[Any]r   c                    t          |           t          j        d           t          j                    }t          |          D ]} | | }t          |           t          j                    }|J ||z
  S )Ni9  )r   rS   manual_seedtimeperf_counterr   )r  r  r   r   t0r   resultt1s           rX   timedr  w  s     	d				B5\\  'F				B7NrZ   rQ   
         ?repeatbaselinec                     t          j         fdt          |          D                       }t          j        |          z  }t	          ||z  d           |                                S )Nc                4    g | ]}t                    S rQ   )r  )rV   r   r   r  r  r   s     rX   rY   z%print_performance.<locals>.<listcomp>  s'    LLLuneV	4	4LLLrZ   z.6f)rS   r   r   medianprintr   )r  r  r   r  r  r   timingstooks   ```  `  rX   print_performancer    s{     lLLLLLLLeFmmLLL G <  5(D	TH_
"
"###99;;rZ   objmethodc                `     t          | |                      t          | |fd           dS )zKReplace obj.method() with a new method that returns a precomputed constant.c                      S r   rQ   )r  s   rX   r  z#precompute_method.<locals>.<lambda>  s     rZ   N)rR   setattr)r  r  r  s     @rX   precompute_methodr    s8    !WS&!!##FC(((((rZ   methodsr   c                0    |D ]}t          | |           dS )zFReplace methods with new methods that returns a precomputed constants.N)r  )r  r  r  s      rX   precompute_methodsr    s.     ' '#v&&&&' 'rZ   r:  r;  c                P    t          | |k              t          | |k               z
  S r   )r   r:  r;  s     rX   cmpr    s!    q1u::AE

""rZ   rW   Union[int, Sequence[int]]sizeSequence[int]c                    t          | t                    r| g|z  S t          |           dk    r" t          |           | d         g          |z  S | S )Nr7   r   )r   r   r\   r%  )rW   r  s     rX   pad_listliker    sS    !S sTz
1vv{{tAww!v%%HrZ   tuple[_T, ...]list[_T]c                V    t          |           dk    rg S dd}t          | |          S )	Nr   elemrt   rM   rN   c                    t          | t                    r| S ddlm} t          | |          sJ |                                 S )Nr7   )rF   )r   rN   	schedulerrF   get_name)r  rF   s     rX   	sort_funcztuple_sorted.<locals>.sort_func  sP    dC   	K000000$ 122222}}rZ   rM  )r  rt   rM   rN   )r\   sorted)rW   r  s     rX   tuple_sortedr    s?    
1vv{{	    !####rZ   PRV)	covariantc                  .    e Zd Zedd            ZddZdS )CachedMethodr   r   rM   r  c                    d S r   rQ   )r   s    rX   clear_cachezCachedMethod.clear_cache  s    ),rZ   r   P.argsr  P.kwargsr  c                    d S r   rQ   selfr   r  s      rX   __call__zCachedMethod.__call__  s      rZ   N)r   r   rM   r  )r   r
  r  r  rM   r  )r   r   r   staticmethodr	  r  rQ   rZ   rX   r  r    s2        ,,, \,DDDDDDrZ   r  !Callable[Concatenate[Any, P], RV]CachedMethod[P, RV]c           	         | j         }d| dd| i}t          d| d d d                                |            t          j        |           || d                   }dfd}||_        |S )N___cacher   z        def zC_cache_on_self(self):
            try:
                return self.zy
            except AttributeError:
                pass
            rv = fn(self)
            object.__setattr__(self, "z%", rv)
            return rv
        _cache_on_selfr  r   rM   r  c                L    t          |           rt          |            d S d S r   r  delattrr  rM  s    rX   r	  z"cache_on_self.<locals>.clear_cache  s5    4 	D#	 	rZ   r  r   rM   r  r   execlstripr-  wrapsr	  )r   r   ctxwrapperr	  rM  s        @rX   cache_on_selfr"    s    ;D
t


C *C				 		 !		 		 (+		 		 		 FHH   "iob!!#&=&=&=">??G      &GNrZ   c                     t          |           S )z]
    Variant of cache_on_self for properties. The only difference is the type signature.
    )r"  )r   s    rX   cache_property_on_selfr$    s     rZ   
class_name*Callable[[FN_TYPE[P, RV]], FN_TYPE[P, RV]]c                     d fd}|S )Nr   FN_TYPE[P, RV]rM   c           	         d d| j          dd| i}t          d d d d                                |            t          j        |           |d	                   }dfd}||_        |S )Nr  r   r  r   z            def inner(self: Any, *args: P.args, **kwargs: P.kwargs) -> RV:
                args_kwargs = (args, tuple(sorted(kwargs.items())))

                if not hasattr(self, "z2"):
                    object.__setattr__(self, "z%", {})

                cache = self.z

                try:
                    return cache[args_kwargs]
                except KeyError:
                    pass

                rv = fn(self, *args, **kwargs)

                cache[args_kwargs] = rv
                return rv
            innerr  r   rM   r  c                L    t          |           rt          |            d S d S r   r  r  s    rX   r	  z<cache_on_self_and_args.<locals>.wrapper.<locals>.clear_cache  s5    tS!! #c"""""# #rZ   r  r  )r   r   r*  r	  rM  r%  s       @rX   r!  z'cache_on_self_and_args.<locals>.wrapper  s     4:33333 Rj (+	 
 03  "  $ )	
 	
 	
, $	##CL11	# 	# 	# 	# 	# 	# (rZ   )r   r(  rM   r(  rQ   )r%  r!  s   ` rX   cache_on_self_and_argsr,    s)    
$ $ $ $ $ $L NrZ   node_schedule0Union[Sequence[BaseSchedulerNode], ExternKernel]OrderedSet[Node]c                    ddl m} t          | t                    r6t	          j        t          j        d | D             t                                S t          | |j	                  r| j
        S t                      S )Nr7   irc                T    g | ]%}t          |d           |j        |j        j        &S )r  )r  r  originsrV   r  s     rX   rY   z%aggregate_origins.<locals>.<listcomp>,  sK        4((	 .2Y		!  rZ   ) r2  r   rx  r-  r.  r/  or_r#   r?   r4  )r-  r2  s     rX   aggregate_originsr8  $  s     -&& L  *   LL	
 	
 		
 
M2?	3	3 $$||rZ   Sequence[BaseSchedulerNode]descriptive_names8Literal[True, 'torch', 'original_aten', 'inductor_node']c                   t          |           }|dk    r.d fd|D             }t          t          |                    }n|dk    rg }|D ]}|j        dk    rd }d}d|j        v r|j        d         d         }nd	|j        v r|j        d	         d         }d
}|sOt          |d         t                    r|                    |d         |z              |                    |d         j        |z              t          t          |                    }n|dk    rd |D             }nt          d
                    dg|z             S )Noriginal_atenc                    | j         d         }d}t          |t          j        j                  r|j        j        }n@t          |t          j        j                  r!t          |	                                          }|S )Nr=  r6  )
r#  r   rS   r  r  _overloadpacketr   HigherOrderOperatorrN   r   )originr=  rM  s      rX   get_origin_meta_strz2get_fused_kernel_name.<locals>.get_origin_meta_strA  sl    "K8MC-)>?? 0#3<M5:+IJJ 0-,,..//JrZ   c                h    g | ].}|j         d k    r!d|j        v r|j        d         # |          /S )r  r=  )r  r#  )rV   rA  rB  s     rX   rY   z)get_fused_kernel_name.<locals>.<listcomp>K  sW     
 
 
yO++6;..O,8	  '' 988rZ   rS   r  r6  source_fn_stackr   fwd_source_fn_stackbackwardr7   inductor_nodec                2    g | ]}|j         d k    |j        S r  )r  r   rV   rA  s     rX   rY   z)get_fused_kernel_name.<locals>.<listcomp>i  s-     
 
 
"VY/5Q5QFK5Q5Q5QrZ   r   fused)r8  r  r#   r  r#  r   rN   r  r   NotImplementedErrorjoin)r-  r:  all_originssourcesrA  	source_fnsuffixrB  s          @rX   get_fused_kernel_namerR  :  s    $M22KO++	 	 	
 
 
 
%
 
 
 G,,--	g	%	%! 	C 	CFyO++ 	$33 &,= >r BII*fk99 &,A B2 FI'F  ilC00 CNN9Q<&#89999NN9Q<#86#ABBBG,,--	o	-	-
 
&1
 
 
 "!88WI'(((rZ   r!  r:   tuple[str, str]c                h
   t          |           }d |D             }t          j        t                    }t          j        t                    }d|rt	          d |D                       }t          |          dk    r[|d         j        t          d          s%d t          j	                  D             }|_
        |                    fd	           |D ]D}d
|j        v r|j        d
         |j        d
         }	d}
t          |	t          j        j                  rt#          |	j                  }
n@t          |	t          j        j                  r!t#          |	                                          }
|
r ||
                             |j                   d|j        v r:|j        d         d         j        }
||
                             |j                   |j                            d          dk    r%||j                                     |j                   Fdnd}|j         d| dd                    |                                           dd                    |                                           d}|j         dg}t5          |                                          D ]G\  }}|                    |j         d| dd                    t5          |                                Hddlm |                    |j         d           t	                      }g }t          | j                  slddlm } d4fd!}d5d$d6fd'}| D ]R}t          |d(          r|j!        t          |j!        d)          r|j!        j"        |j!        j"        D ]}|j        |v r|#                    |j                   |j        $                    |j                  }|H |||j                  \  }}|                    |j         d*| d+ ||           d,| d           t          |j!        d-          ri|j!        j%        ]|j!        j%        D ]P}|j        $                    |j                  }|$ |||j                  \  }}|                    d.|z              QT|D ]5}|                    |j         d|&                    d/0                      6|                    |j         d1d2                    |                      |d3                    |          fS )7aH  
    Retrieves metadata information for a kernel.
    Args:
        node_schedule (Union[Sequence[BaseSchedulerNode], ExternKernel]):
            Either a sequence of BaseSchedulerNode objects or an ExternKernel instance.
        wrapper (PythonWrapperCodegen):
            An instance of PythonWrapperCodegen, used to define the code comment format.
    Returns:
        tuple[str, str]:
            A tuple containing two strings:
                - The first string represents the kernel's metadata.
                - The second string represent the kernel's detailed metadata.
    c                (    g | ]}|j         d k    |S rI  r  rJ  s     rX   rY   z'get_kernel_metadata.<locals>.<listcomp>  s$    WWW&):V:Vf:V:V:VrZ   Nc              3  $   K   | ]}|j         V  d S r   )r  )rV   ns     rX   r   z&get_kernel_metadata.<locals>.<genexpr>  s$      "C"Cq17"C"C"C"C"C"CrZ   r7   r   )_inductor_kernel_metadata_node_to_idx_mapc                    i | ]\  }}||	S rQ   rQ   )rV   idxrW  s      rX   rB  z'get_kernel_metadata.<locals>.<dictcomp>  s    "V"V"Vfc11c"V"V"VrZ   c                    j         |          S r   )rX  )rW  single_graphs    rX   r  z%get_kernel_metadata.<locals>.<lambda>  s    lTUVW rZ   r   r=  	from_nodepartitioner_tagis_backwardzTopologically SortedUnsorted z Source Nodes: [rI  z], Original ATen: []z" Source node to ATen node mapping:z   z => r1  z Graph fragment:r  buffer2Union[ir.TensorBox, ir.Buffer, ir.TorchBindObject]rw_namerN   rM   tuple[str, ir.Layout | None]c                   t          | j                  r,t          | j        j                  r| j        j        j        }n| j        }||}n|j        }	 |                                 }n# t          $ r d }Y nw xY w||fS r   )r   	TensorBoxdata
StorageBoxorigin_noder   
get_layoutrL  )rc  re  rk  r   layoutr2  s        rX   get_buffer_infoz,get_kernel_metadata.<locals>.get_buffer_info  s     fbl33 5
K9 9 5 #)+"2">KK"("4K&"DD&+D"#..00FF* " " "!FFF"V|#s   A, ,A;:A;shapeIterable[int]c                H    dd                     d | D                        dS )N[rI  c                ,    g | ]}t          |          S rQ   )rN   rU   s     rX   rY   z@get_kernel_metadata.<locals>.stringify_shape.<locals>.<listcomp>  s    %<%<%<c!ff%<%<%<rZ   rb  )rM  )ro  s    rX   stringify_shapez,get_kernel_metadata.<locals>.stringify_shape  s.    @499%<%<e%<%<%<==@@@@rZ   rm  ir.Layout | Nonec                    | dS  | j                    } | j                   }| j         }dt          | j                  | | | dS )Nr6  ")r  strider   r"   r   )rm  shape_annotationstride_annotationdevice_annotationrt  s       rX   stringfy_layoutz,get_kernel_metadata.<locals>.stringfy_layout  s|    >2&5ofk&B&B#D '6v}'E'E$G!'-}$6!?FL1 ?3C ?(?*;? ? ?rZ   read_writesreadsz   %z
 : Tensor z = PlaceHolder[target=writes%T)include_tensor_metadataz
   return ,
)rc  rd  re  rN   rM   rf  )ro  rp  rM   rN   )rm  ru  rM   rN   )'r8  collectionsdefaultdictrx  r#   r\   r  r  r  nodesrX  sortr#  r   rS   r  r  rN   r?  r@  r   r  getcommentrM  keysr  itemsr6  r2  r?   r  r  r}  r~  addtry_get_bufferr  format_node) r-  r!  rN  inductor_nodesfrom_node_dictoriginal_aten_dictunique_graphsnode_to_idx_mapr  r=  rM  sort_strmetadatadetailed_metadataoriginal_noder  	all_reads
all_writesr  rn  r|  rW  rrc  
input_namerm  woutput_namer   r2  r\  rt  s                                 @@@rX   get_kernel_metadatar  q  s=   $ $M22KWW;WWWN ,T22N$066
 L 
""C"CN"C"C"CCC}"")!,2L<)TUU Y"V"V	,BT8U8U"V"V"VIXFWWWW       8 8di''DIo,F,R Io6MC-)>?? 0-788M5:+IJJ 0-,,..// :"3'..ty999$)##)K(+0C3&&ty1111Y]],-->>49%,,TY777)5)A%%zH? 	C 	CX 	C 	CtyyATATAVAV7W7W 	C 	C99%7%<%<%>%>??	C 	C 	C  $OOOOP &~';';'='= > > 
 
u  PP=PPdiiu6N6NPP	
 	
 	
 	

   GO!E!E!EFFF%/\\	 "
-99 B	=&&&&&&$ $ $ $ $ $(A A A A
 
 
 
 
 
 # = =q-00 AM4I1='22 q}7J7V]0  6Y..$!af---!"!7!7!?!?!>$-<_VQV-L-L*
F)00& \ \J \ \.v66\ \NX\ \ \    AM844
=,8]1 = =!"!7!7!?!?!>$)8)H)HQ"))#*;<<<<" 	 	D$$?WWt'7'7PT'7'U'UWW    	  GO!U!Usxx
?S?S!U!UVVVTYY01111rZ   initial_queueIterable[torch.fx.Node]skip_filterOptional[Callable[[Any], bool]]OrderedSet[torch.fx.Node]c                    t          |           } t          |           }| r\|                                 }|j        D ]>}|r ||          r||vr*|                    |           |                     |           ?| \|S )zJReturns the set of nodes whose values depend on those within initial_queue)rx  r#   r]   r  r  r  )r  r  dominated_setr  users        rX   dominated_nodesr    s    
 ''M}--M
 +  ""J 	+ 	+D {{400 =((!!$'''$$T***  + rZ   Sequence[IRNode]dict[str, IRNode]c                    ddl m d
fdt          |          \  }}fd|D             }t          |           \  }}fd	|D             }t          t	          j        g ||R            S )Nr7   r1  rW  r@   rM   r   c                   t          | j                  r | j                  S t          | j                  r | j                  S t          | j                  o(t          | j        j        j        j        f           S r   )	r   rh  ri  rj  r@   ComputedBufferInputsKernelInputBufferTemplateBuffer)rW  r2  is_unrealized_nodes    rX   r  z*gather_origins.<locals>.is_unrealized_node$  s    a&& 	.%%af---a'' 	.%%af---!RY'' 

!!	1
 1
 -
 	
rZ   c                4    g | ]} |          |j         S rQ   r4  rV   r  r  s     rX   rY   z"gather_origins.<locals>.<listcomp>6  s-    WWWc?Q?QRU?V?VWckWWWrZ   c                4    g | ]} |          |j         S rQ   r  r  s     rX   rY   z"gather_origins.<locals>.<listcomp>8  s,    SSSC;M;Mc;R;RSCKSSSrZ   )rW  r@   rM   r   )r6  r2  r$   r#   	itertoolschain)	r   r  kwargs_flattenr   kwargs_originsargs_flattenargs_originsr2  r  s	          @@rX   gather_originsr    s     
 
 
 
 
 
 
" %V,,NAWWWW^WWWN"4((OL!SSSS<SSSLioE|EnEEEFFFrZ   r  c                P    d
ddfddfddfd	 |           S )z
    Normal sympy str is very slow, this is a lot faster.  The result are
    somewhat worse, as it doesn't do as much simplification.  So don't
    use this for final codegen.
    r  r   rM   r   c                    t          | t          j                  o(t          | j                  dk    o| j        d         dk    S )N   r   r   )r   r   Mulr\   r   )r  s    rX   is_neg_leadzsympy_str.<locals>.is_neg_leadC  s9    tUY''VC	NNa,?VDIaLTVDV	
rZ   rN   c                n   t          | t          j                  rt          | j                  dk    rP | j        d                   r: | j        d                    d | j        d         j        d                    S d                    t          | j                            S  |           S )Nr  r7   r   z - z + )r   r   r   r\   r   rM  r   )r  r  sympy_str_muls    rX   sympy_str_addz sympy_str.<locals>.sympy_str_addH  s    dEI&& 	' 49~~""{{49Q<'@'@"'-	!55__--	RSHYZ[H\:]:]___zz#mTY"?"?@@@ =&&&rZ   c                    t          | t          j                  rL |           rd | j        d                    S d                    t          | j                            S  |           S )N-r7   z * )r   r   r  r   rM  r   )r  r  sympy_str_atoms    rX   r  z sympy_str.<locals>.sympy_str_mulS  sx    dEI&& 	({4   B :>>$)A,77999zz#ndi"@"@AAA!>$'''rZ   c                   t          | t          j                  r| j        S t          | t          j        t          j        f          rd |            dS t          | t          t          t          t          f          r=| j
        j         dd                    t          t          | j                             dS t!          |           S )N()rI  )r   r   Symbolr   r   r  rk   rh   ri   rj   funcr   rM  r   	sympy_strr   rN   )r  r  s    rX   r  z!sympy_str.<locals>.sympy_str_atom^  s    dEL)) 	9uy%)455 	-}}T**----(HMNN 	i(RR499SDI5N5N+O+ORRRRt99rZ   )r  r   rM   r   r  r   rM   rN   rQ   )r  r  r  r  r  s    @@@@rX   r  r  <  s    
 
 
 

	' 	' 	' 	' 	' 	' 	'	( 	( 	( 	( 	( 	( 	(      =rZ   r$  ValueRanges[Any]c                    ddl m} t          j        r2t	          |j        dd           x}r|j        dk    rt          |           S t          j	                    S )Nr7   r  current_node
index_expr)
r  r  rq   compute_all_boundsrR   interpreterr  rn   ro   unknown)r$  r  fx_nodes      rX   get_bounds_index_exprr  k  sh     	!%~tDDDW% Nl**5!!!"$$$rZ   prefixc                    | d         dk    S )Nr   r  rQ   )r  s    rX   prefix_is_reductionr  y  s    !9rZ   rm   rZ  sympy.Symbolc                L    | t           j        k    sJ t          | |dd          S )9
    Used to generate an integer-nonnegative symbol.
    Tintegernonnegative)rm   SIZErl   )r  rZ  s     rX   sympy_index_symbol_with_prefixr  }  s0     TY vsDdCCCCrZ   checkc                6    | st           j        ot           j        S r   )rq   debug_index_assertsassert_indirect_indexing)r  s    rX   generate_assertr    s    /V/TV5TTrZ   r   c                L    | d         dk    sJ t          j        | dd          S )r  r   r   Tr  )r   r  r   s    rX   sympy_index_symbolr    s.     7c>>>> <d====rZ   replacementsdict[sympy.Expr, Any]c                    d	dt          j        |                               fd|                                D                       S )
z
    When the passed replacement symbol v is a string, it is converted to a symbol with name v that
    have the same replaced expression integer and nonnegative properties.
    replacedr   replacementUnion[sympy.Expr, str]rM   r  c                    t          | t          j                  sJ t          |t                    r!t          j        || j        | j                  S |S )Nr  )r   r   rJ  rN   r  r   is_nonnegative)r  r  s     rX   	to_symbolzsympy_subs.<locals>.to_symbol  s^     (EJ/////k3'' 	< +$3    rZ   c                0    i | ]\  }}| ||          S rQ   rQ   )rV   kr   r  s      rX   rB  zsympy_subs.<locals>.<dictcomp>  s)    ===1IIaOO===rZ   )r  r   r  r  rM   r  )r   rK  xreplacer  )r  r  r  s     @rX   
sympy_subsr    s^        =''====(:(:(<(<===  rZ   ,TypeGuard[Union[torch.SymInt, torch.Tensor]]c                x    t          | t          j                  p t          | t          j                  o| j        S r   )r   rS   r2   r  _has_symbolic_sizes_strides)r:  s    rX   is_symbolicr    s2    a&& 1el##E(ErZ   c                 4    t          d | D                       S )Nc              3  4   K   | ]}t          |          V  d S r   )r  r  s     rX   r   z"any_is_symbolic.<locals>.<genexpr>  s(      ,,!{1~~,,,,,,rZ   r  )r   s    rX   any_is_symbolicr    s    ,,t,,,,,,rZ   )z,aten._fused_moving_avg_obs_fq_helper.defaultz7aten._fused_moving_avg_obs_fq_helper_functional.defaultzfbgemm.dense_to_jagged.defaultz%fbgemm.jagged_to_padded_dense.defaultrun_and_save_rng_staterun_with_rng_statezaten._local_scalar_densezaten._assert_scalarr  torch.fx.GraphModuleOptional[torch.fx.Node]c                    ddl m} | j        j        D ]@}t	          |          r|c S |j                            d          x} ||          r|c S Ad S )Nr   )r)   r  )%torch.fx.experimental.symbolic_shapesr)   r  r  is_cudagraph_unsafe_fx_noder#  r  )r  r)   r  r  s       rX   %get_first_incompatible_cudagraph_noder    s|     LKKKKK  &t,, 	KKK9=='''C49N9Ns9S9S4KKK4rZ   c                    t          t          t          | j        j                                      }|j        dk    sJ |S )z$Get the output node from an FX graphr  )nextiterreversedr  r  r  )r  	last_nodes     rX   output_noder    s<    T(28>223344I<8####rZ   OrderedSet[torch.device]c                   | j                             d          }t          d |D                       }t          |           j        d         }t          |t                    r|n|f}t          d |D                       }||z  S )Nr  r  c              3     K   | ]H}t          |j                            d           t          j                  4|j        d          j        V  IdS r  N)r   r#  r  rS   r  r   r5  s     rX   r   z"get_all_devices.<locals>.<genexpr>  s`       9 9dimmE**EL999	%9 9 9 9 9 9rZ   r   c              3     K   | ]g}t          |t          j        j                  !t          |j                            d           t          j                  S|j        d          j        V  hdS r  )r   rS   r  r5   r#  r  r  r   )rV   r  s     rX   r   z"get_all_devices.<locals>.<genexpr>  sw       7 7c58=))7 sx||E**EL99	77 7 7 7 7 7rZ   )r  
find_nodesr#   r  r   r   tuple)r  placeholder_nodesinput_devicesout_argout_argsout_devicess         rX   get_all_devicesr    s    ++}+==.8 9 9%9 9 9 / /M "oo"1%G$We44Dww7*H,6 7 77 7 7 - -K ;&&rZ   c                    t          t          j                                                  D ]} |                     d          st          j        |          }|j        D ]}|                    d          rt          ||          }t          |t          j	        j
        j        j                  rV|j        D ]N}t          |t          j	        j
        j        j                  r#|j        j        j                                         Ot          j        | = dt          j        v r<t          j        d         }t'          |j        j        j                  `|j        j        `t1          j                     d S )Nz&torch._inductor.runtime.compile_tasks.triton_ztriton.runtime.driver)rx  sysmodulesr  
startswith__dict__rR   r   rS   	_inductorruntimetriton_heuristicsCachingAutotunercompile_resultsTritonCompileResultkernelrunmod__del__r%  driveractiveutilsinstancegccollect)module_namem	attr_namer&  r  r(  s         rX   unload_xpu_triton_pydsr3    sS   CK,,..// % %%%&NOO 	K$ 	< 	<I##I.. < I..EO3EV  	< #)"8 < <%"!O3EY  <
 #M-199;;;K$$ #+--k12"())2J#JLLLLLrZ   _registered_cachesc                    t          | d          rt          | j                  st          |  d          t                              |            | S )zh
    Use this decorator to register any caches that should be cache_clear'd
    with fresh_cache().
    cache_clearz# does not have a cache_clear method)r  callabler6  AttributeErrorr4  r  r  s    rX   clear_on_fresh_cacher:    sY    
 3&& Jhs.G.G JHHHIIIc"""JrZ   c                 B    t           D ]} |                                  dS )z&
    Clear all registered caches.
    N)r4  r6  r9  s    rX   clear_cachesr<  )  s0     "   rZ   r   Iterator[None]c              #  D  K   t           j                            |           }	 |t           j        | <   dV  |"t           j                            | d           dS |t           j        | <   dS # |!t           j                            | d           n|t           j        | <   w xY w)a  Thread-safe env var set/restore using atomic C-level lookups.

    We avoid mock.patch.dict(os.environ, ...) because it internally calls
    os.environ.copy(), which iterates all env var keys then fetches values in
    separate steps. That approach is not atomic and can race with background threads
    (e.g. Triton async compilation) modifying the environment, causing KeyError,
    so we use os.environ.get() for individual keys which is an atomic C-level lookup.
    N)osenvironr  r]   )rM  r   olds      rX   _set_envrB  1  s       *..

C"
3;JNN3%%%%%!BJsOOO ;JNN3%%%%!BJsO!!!!s   A+ +4Bcache_entriesOptional[dict[str, Any]]dirOptional[str]deletec              #    K   t                       ddlm}  |t          j        |                    	 t          d          5  t                              d            |t          j	        
                    d                    t          d          5  dV  t          | t                    rqt          |           dk    s
J d	            t          j	                                      r5t          j                  }|                     fd
|D                        ddd           n# 1 swxY w Y   ddd           n# 1 swxY w Y   |r`t#                      r,t$          j                                        rt+                       t-          j        t#                      fd           n*# t0          $ r t                              d            w xY wt                       dS # t                       w xY w)z
    Contextmanager that provides a clean tmp cachedir for pt2 caches.

    Optionally, pass a dict as 'cache_entries' to get a list of filenames and sizes
    generated with this cache instance.
    r   )normalize_path_separator)rE  TORCHINDUCTOR_CACHE_DIRzUsing inductor cache dir %stritonTRITON_CACHE_DIRNz!expected empty cache_entries dictc           	         i | ]D}d |v|t           j                            t           j                            |                    ES )z.lock)r?  pathgetsizerM  )rV   ftriton_cache_dirs     rX   rB  zfresh_cache.<locals>.<dictcomp>d  sP       $%#*!#3#3 !"27??27<<@PRS3T3T#U#U#3#3#3rZ   c                >    t                               d|          S )Nz*Failed to remove temporary cache dir at %s)exc_info)r   warning)r  rN  rS  inductor_cache_dirs      rX   r  zfresh_cache.<locals>.<lambda>t  s$    S[[@&% 6A 6 6 rZ   )ignore_errorsonerrorz(on error, temporary cache dir kept at %s)r<  torch._inductor.cpp_builderrI  tempfilemkdtemprB  r   r   r?  rN  rM  r   dictr\   existslistdirrw  
is_windowsrS   rJ   rT   r3  shutilrmtree	ExceptionrT  )rC  rE  rG  rI  filesrU  rQ  s        @@rX   fresh_cacherc  F  s      NNNDDDDDD11(2Bs2K2K2KLL'/1CDD 	 	II35GHHH77/::    ,.>??  mT22 
}--2224W222w~~&677  "
+; < <%,,   ).                  	 	 	 	 	 	 	 	 	 	 	 	 	 	 	$  	|| )	 6 6 8 8 )&(((M" )ll          >@RSSS 	sh   F9 AEBD4(E4D8	8E;D8	<E?F9 EF9 EA%F9 8G3 9'G  G3 3H)reverseseqrd  	list[int]c                   | j         }t          t          |                     }t          t	          ||d                    }|st          t          |                    S |S )NTrM  rd  )__getitem__r   r\   rx  r  r
  )re  rd  gettera_rsort_idxs        rX   argsortrm    s[    _F
C//C F3FD999::H (HX&&'''OrZ   r  r+   .Sequence[Union[int, torch.SymInt, sympy.Expr]]c                    d
 fd}d t          |          D             }t          |t          j        |          |          }d	 |D             }|S )Nr:  tuple[int, sympy.Expr]r;  rM   r   c                    | \  }}|\  }}d	fd} |||k               rdS  |||k              rdS ||k     rdS ||k    rdS dS )
Nr  %Union[bool, torch.SymInt, sympy.Expr]rM   r   c                `    t          | t                    r| S                     | d          S )NT)size_oblivious)r   r   evaluate_expr)r  r  s    rX   evaluatez*argsort_sym.<locals>.cmp.<locals>.evaluate  s3    $%% **4*EEErZ   r   r7   r   )r  rr  rM   r   rQ   )r:  r;  a_idxa_valb_idxb_valrv  r  s          rX   r  zargsort_sym.<locals>.cmp  s    uu	F 	F 	F 	F 	F 	F
 8EEM"" 	28EEM"" 	1
 5==15==2qrZ   c                h    g | ]/\  }}|t          |t          j                  r|j        j        n|f0S rQ   r  )rV   rZ  r   s      rX   rY   zargsort_sym.<locals>.<listcomp>  sJ       C 
Z5<88?afkka@  rZ   rh  c                    g | ]\  }}|S rQ   rQ   )rV   rZ  r   s      rX   rY   zargsort_sym.<locals>.<listcomp>  s    &&&fc1c&&&rZ   )r:  rp  r;  rp  rM   r   )r  r  r-  
cmp_to_key)r  re  rd  r  exprsr  s   `     rX   argsort_symr    s~         0 nn  E 5i2377IIIE&&&&&FMrZ   r   torch.dtypec                v    | t           j        k    rdS t          j        d|                                           S )Nr~   rQ   r   )rS   rs  r   element_sizer  s    rX   get_dtype_sizer    s7     q;r'''44666rZ   c                      e Zd ZU ded<   dS )LineContextr   contextNr   r   r   r   rQ   rZ   rX   r  r    s         LLLLLrZ   r  c                  $    e Zd ZU ded<   ded<   dS )ValueWithLineMaprN   r   zlist[tuple[int, LineContext]]line_mapNr  rQ   rZ   rX   r  r    s'         JJJ++++++rZ   r  c                      e Zd ZdZd1d2dZej        d3d
            Zd4dZd5dZ	d5dZ
d6dZd7dZd5dZd6dZd8dZd9dZd:d;dZd:d<dZd:d<d Z	 d=d>d%Zd?d(Zd5d)Zd@d,ZdAd/Zd0S )BIndentedBuffer   r   initial_indentr   rM   r  c                "    g | _         || _        d S r   )_lines_indent)r  r  s     rX   __init__zIndentedBuffer.__init__  s    GI%rZ   tabwidthr=  c              #  V   K   | j         }	 || _         d V  || _         d S # || _         w xY wr   )r  )r  r  prevs      rX   set_tabwidthzIndentedBuffer.set_tabwidth  s@      }	!$DMEEE DMMMDDM    s    	(r  c                   t                      }d}g }| j        D ]}t          |t                    r |            }|$n4t          |t                    r|                    ||j        f           W|}t          |t                    sJ |                    |           |                    d           |d|	                    d          z   z  }t          |                                |          S )Nr7   r  )r   r  r   DeferredLineBaser  r  r  rN   writecountr  getvalue)r  bufr   linemaplilines         rX   getvaluewithlinemapz"IndentedBuffer.getvaluewithlinemap  s    jj13+ 	& 	&B".// rtt<  B,, 2:///dC(((((IIdOOOIIdOOOTZZ%%%%AA888rZ   rN   c                4    |                                  j        S r   )r  r   r  s    rX   r  zIndentedBuffer.getvalue  s    ''))//rZ   c                   t                      }| j        D ]}t          |t                    r |            }|$nt          |t                    r;|}t          |t
                    sJ |                    d          r|                    |d d                    |                    |           |                    d           |                                S )N\r   r  )	r   r  r   r  r  rN   endswithr  r  )r  r  r  r  s       rX   getrawvaluezIndentedBuffer.getrawvalue  s    jj+ 	  	 B".// rtt<  B,, dC(((((}}T""  		$ss)$$$$		$		$||~~rZ   c                8    | j                                          d S r   )r  clearr  s    rX   r  zIndentedBuffer.clear  s    rZ   r   c                *    t          | j                  S r   )r   r  r  s    rX   __bool__zIndentedBuffer.__bool__  s    DK   rZ   c                &    d| j         | j        z  z  S )Nra  )r  r  r  s    rX   r  zIndentedBuffer.prefix  s    dlT]233rZ   c                0    |                      d           d S )Nr  	writeliner  s    rX   newlinezIndentedBuffer.newline  s    trZ   r  )Union[LineContext, DeferredLineBase, str]c                   t          |t                    r| j                            |           d S t          |t                    rA| j                            |                    |                                                      d S |                                r2| j                            |                                  |            d S | j                            d           d S Nr6  )r   r  r  r  r  with_prefixr  stripr  r  s     rX   r  zIndentedBuffer.writeline  s    dK(( 	#Kt$$$$$.// 	#Kt//>>?????ZZ\\ 	#K$++--77788888Kr"""""rZ   lines3Sequence[Union[LineContext, DeferredLineBase, str]]c                :    |D ]}|                      |           d S r   r  )r  r  r  s      rX   
writelineszIndentedBuffer.writelines"  s2      	! 	!DNN4    	! 	!rZ   r7   offset'contextlib.AbstractContextManager[None]c                L     t           j        d fd            } |            S )NrM   r=  c               3     K   xj          z  c_         	 d V  xj          z  c_         d S # xj          z  c_         w xY wr   r  )r  r  s   rX   r   z"IndentedBuffer.indent.<locals>.ctx)  sQ      LLF"LL'&&s   + =rM   r=  )
contextlibcontextmanager)r  r  r   s   `` rX   indentzIndentedBuffer.indent(  sB    		"	' 	' 	' 	' 	' 	' 
#	"	' suurZ   c                &    | xj         |z  c_         d S r   r  r  r  s     rX   	do_indentzIndentedBuffer.do_indent3      rZ   c                &    | xj         |z  c_         d S r   r  r  s     rX   do_unindentzIndentedBuffer.do_unindent6  r  rZ   F
other_codeUnion[IndentedBuffer, str]r  c           	        t          |t                    rt          d          }|j        D ]X}t          |t                    sA|r?t          |t          |          t          |                                          z
            }Yt          j	        |          rd}|j        D ]b}t          |t                    r| j        
                    |           2t                              | |t          |          d                     cd S t          j        |          }|r|                                }|sd S |                                }|                    d          D ]}|                     |           d S )Ninfr   r  )r   r  r   r  r  minr\   r  mathisinfr  r  r   textwrapdedentrstriprv  )r  r  r  r  r  r   s         rX   splicezIndentedBuffer.splice9  s    j.11 	"5\\F") I I!$44 I I TS5G5G)GHHFz&!! ") H HdK00 HK&&t,,,,",,T4F3FGGGG	H H "44J 1'..00
 #**,,J%%d++ " "q!!!!" "rZ   r  Callable[[Any], Any]c                b    t          | j                  }fd| j        D             |_        |S )Nr  c                &    g | ]} |          S rQ   rQ   )rV   r  r  s     rX   rY   z&IndentedBuffer.map.<locals>.<listcomp>U  s!    999Tdd4jj999rZ   )r  r  r  )r  r  r   s    ` rX   r   zIndentedBuffer.mapS  s7    DL9999999T[999

rZ   c                P    t          |            d|                                  dS )Nr  r  )r%  r  r  s    rX   __repr__zIndentedBuffer.__repr__X  s'    t**11t}}1111rZ   otherr   c                    | j         |j         k    sJ t          | j                   }|                    | j                   |                    |j                   |S )Nr  )r  r  r  r  )r  r  r   s      rX   __add__zIndentedBuffer.__add__[  sV    |u},,,,DL999t{###u|$$$
rZ   new_line)Union[DeferredLineBase, LineContext, str]c                    || j         v S r   )r  )r  r  s     rX   containszIndentedBuffer.containsc  s    4;&&rZ   Nr
  )r  r   rM   r  )r  r   rM   r=  )rM   r  rM   rN   rM   r  rM   r   )r  r  rM   r  )r  r  rM   r  r   )r  r   rM   r  )r  r   rM   r  F)r  r  r  r   rM   r  )r  r  rM   r  )r  r   rM   r  )r  r  rM   r   )r   r   r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r  rQ   rZ   rX   r  r    s       H& & & & & ! ! ! !9 9 9 9(0 0 0 0   (   ! ! ! !4 4 4 4   # # # #! ! ! !	 	 	 	 	         EJ" " " " "4   
2 2 2 2   ' ' ' ' ' 'rZ   r  c                  (     e Zd Zd fdZd	dZ xZS )
FakeIndentedBufferrM   r  c                H    t                                                       d S r   )superr  )r  	__class__s    rX   r  zFakeIndentedBuffer.__init__h  s    rZ   r   rN   r   c                j    |dk    rt                               | |          S t          d| d          )Nr  zTried to call self.z on FakeIndentedBuffer. This bufferis currently used on TritonTemplateKernel to prevent actualwrites to the body without explicitly specifying the body with`TritonTemplateKernel.set_subgraph_body(name)`)object__getattribute__r  )r  r   s     rX   r  z#FakeIndentedBuffer.__getattribute__k  sK    ;**4666=$ = = =
 
 	
rZ   r  )r   rN   rM   r   )r   r   r   r  r  __classcell__r  s   @rX   r  r  g  sQ             
 
 
 
 
 
 
 
rZ   r  c               #     K   t           j        t           j        }} 	 d V  | |ct           _        t           _        d S # | |ct           _        t           _        w xY wr   )r  stdoutstderr)initial_stdoutinitial_stderrs     rX   restore_stdout_stderrr  v  sR      %(ZNN@!/
CJJJ
CJ????s	   ; Ac                  R    e Zd ZdZddZddZdd	ZddZddZddZ	ddZ
ddZdS )r  z.A line that can be 'unwritten' at a later timer  rN   c                @    |                                 sd}|| _        d S r  )r  r  r  s     rX   r  zDeferredLineBase.__init__  s"    zz|| 	D			rZ   rM   Union[str, None]c                    t           )zJReturns either self.line or None to indicate the line has been 'unwritten'rL  r  s    rX   r  zDeferredLineBase.__call__      !!rZ   r   c                    t           )z3Returns a new deferred line with the same conditionr  r  s     rX   	_new_linezDeferredLineBase._new_line  r  rZ   r  c                >    |                      | | j                   S r   r  r  )r  r  s     rX   r  zDeferredLineBase.with_prefix  s!    ~~444555rZ   c                Z    |                      | j                                                  S r   )r  r  r  r  s    rX   r  zDeferredLineBase.lstrip  s"    ~~di..00111rZ   r$  Union[int, slice]c                B    |                      | j        |                   S r   r  )r  r$  s     rX   ri  zDeferredLineBase.__getitem__  s    ~~di.///rZ   r   c                *    t          | j                  S r   )r   r  r  s    rX   r  zDeferredLineBase.__bool__  s    DIrZ   r   c                *    t          | j                  S r   )r\   r  r  s    rX   __len__zDeferredLineBase.__len__  s    49~~rZ   N)r  rN   )rM   r  )r  rN   rM   r   )r  rN   rM   r   )rM   r   )r$  r  rM   r   r  rM   r   )r   r   r   r   r  r  r  r  r  ri  r  r  rQ   rZ   rX   r  r    s        88   
" " " "" " " "6 6 6 62 2 2 20 0 0 0        rZ   r  c                  4     e Zd ZdZd fdZdd	Zdd
Z xZS )DelayReplaceLinez6At end of codegen call `line.replace(key, value_fn())`rM  rN   value_fnCallable[[], str]r  c                f    t                                          |           || _        || _        d S r   )r  r  rM  r	  )r  rM  r	  r  r  s       rX   r  zDelayReplaceLine.__init__  s-     rZ   rM   c                f    | j                             | j        |                                           S r   )r  replacerM  r	  r  s    rX   r  zDelayReplaceLine.__call__  s$    y  4==??;;;rZ   c                8    t          | j        | j        |          S r   )r  rM  r	  r  s     rX   r  zDelayReplaceLine._new_line  s    $->>>rZ   )rM  rN   r	  r
  r  rN   r  )r  rN   rM   r  )r   r   r   r   r  r  r  r  r  s   @rX   r  r    sk        @@! ! ! ! ! !
< < < <? ? ? ? ? ? ? ?rZ   r  index_or_deviceUnion[int, torch.device]c                   t          | t          j                  r| }n!t          j        t                      |           }t	          j        |          }t          j        j        r=|j        J |j        dk     s|j        dk    rt          
                    d           dS dS |j        dk    rdnd}|j        }||k     r!t          
                    d	||d
           dS dS )N	   r  z6GPU arch does not support max_autotune_gemm mode usageFTrJ   rz   D   z,Not enough SMs to use max_autotune_gemm mode)min_sms	avail_sms)extra)r   rS   r   r`   r    createversionry   majorr   rT  r%  multi_processor_count)r  r   propr  r  s        rX   
is_big_gpur    s    /5<00 ? lnno>>"6**D } z%%%:>>TZ2--KKPQQQ5tK5((bbbG*I7:%I>> 	 	
 	
 	
 u4rZ   c                     t           j                                        r#t           j                                        j        S t           j                            d          j        S )NrH   )rS   rJ   rT   get_device_propertiesgpu_subslice_countrH   r  rQ   rZ   rX   get_max_num_smsr     sI    y Dy..00CC:++F33IIrZ   c                     t           j                                        sdS t           j                            t           j                                                  } | j        dk    S )zEReturns true if the device is a NVIDIA B200, otherwise returns false.Fr  )rS   rH   rT   r  r'  r  )device_propertiess    rX   
using_b200r#    sQ     :""$$ u
889R9R9T9TUU"b((rZ   c                     t           j                                        rt                      S t           j                                        } t                      | | ndz
  S )zFHandle experimental carveout if set otherwise return hardware SM countNr   )rS   rJ   rT   r   r  _get_sm_carveout_experimental)carveouts    rX   get_num_smsr'    sS     y !   x5577HH,@aHHrZ   num_tma_descriptorsnum_programsOptional[int]r8   c                    ddl m}m} |t                      }|                    d          }|| z  t
          z  } |||| |j                              S )zKBuilds and returns a WorkspaceArg for the device side TMA workspace buffer.r7   )r8   WorkspaceZeroModeNF)r  	zero_moder   
outer_name)codegen.commonr8   r,  r'  	from_boolTMA_DESCRIPTOR_SIZEunique_name)r(  r   r)  r8   r,  r-  r  s          rX   get_tma_workspace_argr3    s     @???????"}}!++E22I--0CCD<+<+--	   rZ   rm  rA   allowed_layout_dtypeslist[torch.dtype]c                    | j         |vr!t                              d| j         |           t          | j        j                  o| j         |v ot          | j                  S )NzDNot using template since dtype %s is not in allowed layout dtypes %s)r   r   r   is_gpur   r%  r  )rm  r4  s     rX   _use_template_for_gpur8    sl     |000		RL!	
 	
 	
 	v}!"" 	&L11	&v}%%rZ   backendc                    |                                  d t          j                                                             d          D             v S )Nc                6    g | ]}|                                 S rQ   r  rU   s     rX   rY   z)_use_autotune_backend.<locals>.<listcomp>
  -       		  rZ   r  )r  rq   max_autotune_gemm_backendsrv  r9  s    rX   _use_autotune_backendr@  	  P    ==??  !<BBDDJJ3OO    rZ   c                    |                                  d t          j                                                             d          D             v S )Nc                6    g | ]}|                                 S rQ   r<  rU   s     rX   rY   z._use_conv_autotune_backend.<locals>.<listcomp>  r=  rZ   r  )r  rq   max_autotune_conv_backendsrv  r?  s    rX   _use_conv_autotune_backendrE    rA  rZ   )enable_int32enable_float8check_max_autotunerF  rG  rH  c                  ddl m}m} t          j        t          j        t          j        g}|r.t          j        t          j        t          j        t          j        g}|r+|                    t          j	        t          j
        g           t          | j        j                  ot          | |          p| j        j        dk    o| j        |v o?t           j        pt           j        p| o$t'          d          o || j        |j                  S )Nr7   )BackendFeaturehas_backend_featurer"  TRITON)r/  rJ  rK  rS   r   r_  ra  ri  extendrY  rZ  r7  r   r%  r8  r   rq   max_autotunemax_autotune_gemmr@  TRITON_TEMPLATES)rm  rF  rG  rH  rJ  rK  layout_dtypess          rX   use_triton_templaterR    s    DCCCCCCC]ENEMBM Tu{S Ge153DEFFF v})** A)&-@@O "e+M0M
	P  VF$<VDV@V
	P "(++
	P  ~/NOOrZ   output_layout
add_guardsmatricesr@   rT  Optional[Layout]rU  c                   	 ddl m} ddlm dfd	dfd}d	fddfddfd	 |            o%t	          fd|D                       o
 ||           S )u^  
    Return True iff *all* supplied tensors satisfy the CUDA TMA constraints
    that Triton relies on today.
    * https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html

    A tensor is accepted when:
      * 1 ≤ rank ≤ 5 (cuTensorMapEncodeTiled)
      * dtype in _TMA_SUPPORTED_DTYPES (CUtensorMapDataType enum)
      * Base pointer 16-byte aligned
      * Exactly one contiguous ("inner") dim with stride 1
      * All "outer" dims have 16-byte aligned strides
      * Inner dim size × itemsize is a multiple of 16
      * For 1-byte dtypes (e.g. FP8), inner dim ≥ 32
    r   )has_triton_tma_devicer7   r  
expr_bytesrF  rM   r   c                N    j         j                            | t                    S r   )r  r  statically_known_multiple_ofTMA_ALIGNMENT)rZ  r  s    rX   _alignedzcan_use_tma.<locals>._alignedG  s    w<<ZWWWrZ   rm  rW  c                t    | dS | j         }| j        }| j        } | j                  sdS  |||          S )NTF)r  rx  r   r  )rm  sizesstridesr   r^  _is_tma_compatibles       rX   _is_tma_compatible_layoutz.can_use_tma.<locals>._is_tma_compatible_layoutJ  sR    >4- x&& 	5!!%%888rZ   r1  r@   c                6   |                                  }|                                 }|                                 }|                                 j        j        v rdS |                                 x}|j        dk    r |||          S  |||          S )NFrJ   )get_size
get_stride	get_dtyper  r  unaligned_buffers
get_devicer%  )r1  r`  ra  r   m_devicer  rb  _is_tma_compatible_xpus        rX   _is_tma_compatible_matrixz.can_use_tma.<locals>._is_tma_compatible_matrixW  s    

,,.. ::<<174445&H38N8N))%%@@@!!%%888rZ   r`  r6  ra  Sequence[_IntLike]r   r  c                j   t          |           }|j        }|dk     s|dk    rdS |t          vrdS r?j        j                            |           }j        j                            |          }nfd| D             }fd|D             }fdt          |          D             }t          |          dk    rdS |d         }t          |          D ]\  }	}
|	|k    r |
|z            s dS ||         } ||z            sdS |dk    r"j        j                            |d          sdS d	S )
Nr7   r   Fc                N    g | ]!}j         j                            |          "S rQ   r  r  symbolic_hint)rV   r   r  s     rX   rY   z;can_use_tma.<locals>._is_tma_compatible.<locals>.<listcomp>w  s,    HHHQqw'55a88HHHrZ   c                N    g | ]!}j         j                            |          "S rQ   rp  )rV   str  s     rX   rY   z;can_use_tma.<locals>._is_tma_compatible.<locals>.<listcomp>x  s,    NNN)77;;NNNrZ   c                Z    g | ]'\  }}j         j                            |d           %|(S r   r  r  statically_known_equals)rV   r   rs  r  s      rX   rY   z;can_use_tma.<locals>._is_tma_compatible.<locals>.<listcomp>{  sH     
 
 
2w77A>>

 
 
rZ   r       T)r\   itemsizer|   r  r  guard_int_seqr  statically_known_geq)r`  ra  r   rankrx  sizes_i	strides_ir*  	inner_idxr   rs  	inner_dimr  r^  rU  s               rX   rb  z'can_use_tma.<locals>._is_tma_compatiblee  s   
 5zz>!88taxx5---5 	Og&44U;;G(66w??IIHHHH%HHHGNNNNgNNNI
 
 
 
"9--
 
 

 u::??5!H	 y)) 	 	EArI~~8BM** uu I&	x	H,-- 	5 q==!1!F!FyRT!U!U=5trZ   c                ,   |d         }j         j                            |          }j         j                            |d          sdS d}| D ]D}j         j                            |          }j         j                            ||          r dS EdS )Nr   r7   Fl    T)r  r  rq  rv  statically_known_gt)	r`  ra  r   last_stridelast_stride_hint
MAX_UINT32r  	size_hintr  s	           rX   rk  z+can_use_tma.<locals>._is_tma_compatible_xpu  s     bk7+99+FFw778H!LL 	5 
 	 	D(66t<<Iw33IzJJ uu trZ   c              3  .   K   | ]} |          V  d S r   rQ   )rV   r1  rl  s     rX   r   zcan_use_tma.<locals>.<genexpr>  s/      ??))!,,??????rZ   )rZ  rF  rM   r   )rm  rW  rM   r   )r1  r@   rM   r   )r`  r6  ra  rm  r   r  rM   r   )torch.utils._tritonrY  r  r  r   )
rT  rU  rV  rY  rc  r  r^  rb  rl  rk  s
    `   @@@@@rX   can_use_tmar  2  s/   " :99999X X X X X X9 9 9 9 9 9 99 9 9 9 9 9 9 9/ / / / / / / /b     * 	 	5????h?????	5%%m44rZ   )rU  c                    t           j        j        r| nd }t          d |D                       ot	          |||dot           j        j        S )Nc              3  `   K   | ])}t          |                                          d k    V  *dS )r  N)r\   re  )rV   r1  s     rX   r   z*use_triton_tma_template.<locals>.<genexpr>  s7      55qC

"555555rZ   rS  )rq   rK  enable_template_tma_storer   r  enable_persistent_tma_matmul)rT  rU  rV  rm  s       rX   use_triton_tma_templater    sZ     %mEO]]4F55H55555 	7JOOO	7M6rZ   c                `    t          || |dsdS ddlm} ddlm}  |            o	 |            S )NrS  Fr   )%has_triton_tensor_descriptor_host_tmar7   is_datacenter_blackwell_arch)r  r  r  codegen.cuda.cuda_envr  )rT  rU  rV  r  r  s        rX   !use_triton_blackwell_tma_templater    sm     #	:    uIIIIIICCCCCC 1022U7S7S7U7UUrZ   scale_option_ar6   scale_option_bscaling_typeslist[ScalingType]c                    | |v o||v S r   rQ   )r  r  r  s      rX   use_triton_scaling_templater    s    
 ]*N~/NNrZ   )maxsizec                 h    	 t           j                            d          duS # t          $ r Y dS w xY w)zCheck if CuTeDSL is importable; cache the result for reuse.

    Call ensure_cute_available.cache_clear() after installing CuTeDSL
    in the same interpreter to retry the import.
    cutlassNF	importlibutil	find_specr  rQ   rZ   rX   ensure_cute_availabler    sD    ~''	22$>>   uu    # 
11c                 h    	 t           j                            d          duS # t          $ r Y dS w xY w)zCheck if NVIDIA Universal GEMM (cutlass_api) is importable; cache the result for reuse.

    Call ensure_nv_universal_gemm_available.cache_clear() after installing cutlass_api
    in the same interpreter to retry the import.
    cutlass_apiNFr  rQ   rZ   rX   "ensure_nv_universal_gemm_availabler    sD    ~''66dBB   uur  c                 h    	 t           j                            d          duS # t          $ r Y dS w xY w)aG  Check if nvMatmulHeuristics is importable; cache the result for reuse.

    nvMatmulHeuristics provides performance model-based kernel selection
    for NVIDIA GEMM operations.

    Call ensure_nvmatmul_heuristics_available.cache_clear() after installing
    nvMatmulHeuristics in the same interpreter to retry the import.
    nvMatmulHeuristicsNFr  rQ   rZ   rX   $ensure_nvmatmul_heuristics_availabler    sE    ~''(<==TII   uur  mat_amat_ba_is_2db_is_2doffsOptional[Any]biasscale_resultc                   t                      sdS t          d          sdS ddlm} t	          |j        j                  sdS  |            sdS t          j        g}	t          ||	          sdS t          j        st          j        sdS t          | ||          sdS t          d | |fD                       rdS |r|rdS |dS ||dS dS )	a  
    Returns True if we can use the blackwell kernel for grouped mm.
    Required conditions:
        1. CuTeDSL backend is enabled
        2. CuTeDSL is available
        3. We are on a blackwell arch
        4. The dtype is bf16
        5. Max autotune or max autotune gemm is enabled
        6. A, B, and the output are 16B aligned
        7. We are not using dynamic shapes
        8. A is 2d
        9. B is 3d
        10. Offsets are provided
        11. Bias and Scale are not provided
    FCUTEDSLr7   r  )rT  c              3  4   K   | ]}t          |          V  d S r   )
is_dynamicrU   s     rX   r   z3use_blackwell_cutedsl_grouped_mm.<locals>.<genexpr>0  s(      
1
1Q:a==
1
1
1
1
1
1rZ   NT)r  r@  r  r  r7  r   r%  rS   r_  r8  rq   rN  rO  r  r  )
r  r  rm  r  r  r  r  r  r  rQ  s
             rX    use_blackwell_cutedsl_grouped_mmr    s"   2 !"" u ++ uCCCCCC&-$%% u'')) u^$M 77 u 6#; u ue6::: u

1
15%.
1
1
111 u g u|u<3u4rZ   r1  rW  r  c                   ddl m} |j        j                            ||z  |z  d          }|dk    s|t
          j        j        k     rdS ddlm	} t          j        j        rdS t          j        t          j        t          j        g}t!          | |          o&t
          j        pt
          j        ot'          d          }|r6 |            s,t(                              d	t
          j        j                   dS |S )
Nr7   r  r   fallbackr   F)try_import_cutlassCUTLASSzFailed to import CUTLASS lib. Please check whether _inductor.config.cutlass.cutlass_dir %s is set correctly. Skipping CUTLASS backend for now.)r  r  r  r  optimization_hintrq   r  cutlass_backend_min_gemm_sizecodegen.cutlass.utilsr  rS   r  ry   r   r_  ri  r8  rN  rO  r@  r   rT  cutlass_dir)	rm  r1  rW  r  r  	gemm_sizer  rQ  r   s	            rX   use_cutlass_templater  ?  s    221q519r2JJIA~~V^%QQQu999999 } u ]ENEK@Mfm44 	- <F$<	-!),,   !!## 	KK4 *	   5JrZ   _IntLikeOptional[IRNode]r  Optional[_IntLike]c                  
 ddl m t                      sdS t                      sdS t	          d          sdS ddlm
 
j        rdS | j        j	        dk    st          j        j        rdS t          j        st          j        sdS |||g}||                    |           t#          fd	|D                       rdS ||g}	||	                    |           t#          
fd
|	D                       rdS dS )a3  
    Return True if we can use the NVIDIA Universal GEMM Template.

    Required conditions:
        1. NVGEMM backend is enabled
        2. cutlass_api is available
        3. We are on a NVIDIA GPU
        4. Max autotune or max autotune gemm is enabled
        5. Not in AOT Inductor mode (requires runtime JIT compilation)
        6. Base pointers are 16-byte aligned
        7. Shape dimensions are not unbacked symbols

    Note:
        - Shape and stride constraints are handled internally by
          cutlass_api.get_kernels() which filters incompatible kernels.
        - GroupedGemm currently only supports TN layout (column-major B).
          Any other layout will act as a noop and fall back to ATen.
        - Dynamic shapes are supported as long as they have hints
          (from example inputs).
    r   )has_free_unbacked_symbolsFNVGEMMr7   r  rH   Nc              3  .   K   | ]} |          V  d S r   rQ   )rV   dimr  s     rX   r   z1use_nv_universal_gemm_template.<locals>.<genexpr>  s/      
C
Cc$$S))
C
C
C
C
C
CrZ   c              3  X   K   | ]$}|                                 j        j        v V  %d S r   )r  r  rh  )rV   tr  s     rX   r   z1use_nv_universal_gemm_template.<locals>.<genexpr>  s5      
O
O1::<<1744
O
O
O
O
O
OrZ   T)r  r  r  r  r@  r  r  aot_compilationr   r%  rS   r  ry   rq   rN  rO  r  r  )rm  r1  rW  r  r  r  r  r  dims_to_checktensors_to_checkr  r  s             @@rX   use_nv_universal_gemm_templater  `  sU   < POOOOO "" u-// u ** u u}V##u}'8#u 6#; u
 1IM}Q

C
C
C
C]
C
C
CCC u u~%%%

O
O
O
O>N
O
O
OOO u4rZ   op_namec                    t           j        j                                        }|dk    rdS |                                 d |                    d          D             v S )z8Check if CUTLASS should be used for the given operation.ALLTc                6    g | ]}|                                 S rQ   r<  rU   s     rX   rY   z'_use_cutlass_for_op.<locals>.<listcomp>  s     IIIQqwwyyIIIrZ   r  )rq   r  cutlass_enabled_opsr  rv  )r  enabled_opss     rX   _use_cutlass_for_opr    sX    .4::<<Ket==??II+2C2CC2H2HIIIIIrZ   r   threshold_multiplec           
     P   ddl m} t          j        j        |z  }|j        j                            t          j	        t          j
        ||| z            t          j
        |||z                                o.|j        j         o!|j        j         ot          j        j        dk    S )Nr   r  )torch._inductor.virtualizedr  rq   rK  decompose_k_thresholdr  r  statically_known_truer   AndGeaot_modecpp_wrappernum_decompose_k_splits)r1  rW  r  r  r  r  s         rX   use_decompose_k_choicer    s     .-----"M?BTT 	
..I1A5661A566 	
 	
 	5   	5 ##	5 M014
rZ   c           
     \   t           j        j        }ddlm} t          t          j        j                  ow|j	        j
                            t          j        t          j        ||| z            t          j        |||z                                o|j	        j         o|j	        j         S )z
    Check if we should use the contiguous subgraph transform.
    This transform makes the second matrix contiguous before the matmul.
    r   r  )rq   rocmcontiguous_thresholdr  r  r   rS   r  ry   r  r  r  r   r  r  r  r  )r1  rW  r  r  r  s        rX   use_contiguousr    s     ";; .----- 	U] 	$G22I0145501455 
 
	$   	$ ##
rZ   c                   t           j        j        }g d}t          |t          j                  r	|j        s|S |dk    rg S t          | t          j                  r| j        r!t          |t          j                  r
|j        sdnt          || z  ||z            dt	          j        |          }fd|D             }g g g }}}|D ]j}	||	z  }
|
dk     r|
|
dz
  z  dk    r|
dk    r|	                    |	           6|
dz  dk    r|	                    |	           U|	                    |	           kt           j
        d	k    r||z   |z   S ||z   |z   }|d |         S )
N)rz   rw  r}   r{      r   r  r  c                ,    g | ]}|k    |k    |S rQ   rQ   )rV   divisormax_k_splitmin_k_splits     rX   rY   z get_k_splits.<locals>.<listcomp>  s8       k!!g&<&< 	&<&<&<rZ   r{   r7   rw  
EXHAUSTIVE)rq   rK  r  r   r   rJ  	is_numberr  divisorsr  max_autotune_gemm_search_space)r1  rW  r  k_splits_limitdefault_k_splitsr  pow_of_2_divisorsmul_of_32_divisorsrest_of_splitsdkPartbest_splitsr  r  s               @@rX   get_k_splitsr    s    ]9N .--!UZ    	1			1ej!! *!+ *1ej!!**++* !q&!q&))K~a  H      H =?B>) % %Q 3;; EAI!##$$Q''''RZ1__%%a(((( !!!$$$$,<< #55FF#&88>IK''rZ   c                J    t           j                            |           j        S r   )rS   rH   r  gcnArchNamer   s    rX   _rocm_native_device_arch_namer  	  s    :++F33??rZ   Qtuple[Optional[str], Callable[[], list[Any]], Callable[[], list[Any]], type[Any]]c                     	 dd l } ddlm}m} ddlm} t          j                            | j	                  }n'# t          $ r d
d}d
d} G d d	          }d }Y nw xY w||||fS )Nr   )gen_ops_librarygen_ops_preselected)CKGemmOperationrM   r  c                     g S r   rQ   rQ   rZ   rX   r  z*try_import_ck_lib.<locals>.gen_ops_library/	      IrZ   c                     g S r   rQ   rQ   rZ   rX   r  z.try_import_ck_lib.<locals>.gen_ops_preselected2	  r  rZ   c                      e Zd ZdS )*try_import_ck_lib.<locals>.CKGemmOperationN)r   r   r   rQ   rZ   rX   r   r  5	  s        DrZ   r   )rM   r  )ck4inductor(ck4inductor.universal_gemm.gen_instancesr  r  ck4inductor.universal_gemm.opr   r?  rN  dirname__file__r  )r  r  r  r   package_dirnames        rX   try_import_ck_libr  	  s    	
 	
 	
 	
 	
 	
 	
 	
	
 	
 	
 	
 	
 	
 '//+*>??   	 	 	 		 	 	 		 	 	 	 	 	 	 	  O-@/QQs   69 !AAc                f   t           j        st           j        sdS t          j        j        sdS | j        j        dk    rdS t          | j                  }d t           j	        j
        D             p|                    d          d         |ifd                                t           j	        j        z  D             }|sdS | j        t          j        t          j        t          j        fvrdS t%                      \  }}}}|st&                              d           dS |t           j	        _        dS )	NFrH   c                F    i | ]}|                     d           d         |S ):r   )rv  )rV   r  s     rX   rB  z#use_ck_template.<locals>.<dictcomp>I	  s(    DDDaqwws||ADDDrZ   r  r   c                     g | ]
}|         S rQ   rQ   )rV   r  requested_archss     rX   rY   z#use_ck_template.<locals>.<listcomp>L	  s.     ! ! ! 	! ! !rZ   z,Please pip install Composable Kernel packageT)rq   rN  rO  rS   r  ry   r   r%  r  r  archrv  r  ck_supported_archr   r   r_  ra  r  r   rT  ck_dir)rm  native_archrequested_supported_archsck_package_dirnamer   r  s        @rX   use_ck_templater  <	  sA    6#; u= u}V##u 0>>KDD6;3CDDD #q!;IO! ! ! ! %%''&+*GG! ! ! % u|EM5>5=IIIu"3"5"51a BCCCu+FK4rZ   c                    ddl m} t          d          o9t          |           o*|j        j                            ||z  |z  d          dk    S )Nr7   r  CKr   r  r   r  r  r@  r  r  r  r  rm  r1  rW  r  r  s        rX   use_ck_gemm_templater  a	  se     	d## 	KF##	KG..q1uqy2.FFJrZ   c                    ddl m} t          d          o9t          |           o*|j        j                            ||z  |z  d          dk    S )Nr7   r  CKTILEr   r  r   r  r  s        rX   use_ck_tile_gemm_templater   k	  se     	h'' 	KF##	KG..q1uqy2.FFJrZ   c                >    t          d          ot          |           S )Nr  )rE  r  rm  s    rX   use_ck_conv_templater#  u	  s    %d++G0G0GGrZ   c                R    t           j        pt           j        o| j        j        dk    S r  )rq   rN  rO  r   r%  r"  s    rX   _use_template_for_cpur%  y	  s'    7v7&
-

%&rZ   mat1Union[ReinterpretView, Buffer]mat2c                   ddl m} t          |j        |          sJ |j        j        }|j        j        }t          |           oe|                                t          j	        k    oCt          |          dk    o0t          |          dk    o|d         |d         k    o|d         dk    }t          | ||d          o|j                                        p|S )Nr7   )rA      r  F)require_constant_mat2)r2  rA   r   rm  r  rx  r%  rg  rS   ra  r\   use_cpp_gemm_templateis_contiguous)rm  r&  r(  rA   	mat1_sizemat1_stridemat1_each_batch_is_contiguouss          rX   use_cpp_bmm_templater1  	  s     dk6*****
  I+$Kf%% 	"NN-	"^^q 	" "	" ^y|+		"
 ^q  " !t5QQQ !!##D'DrZ   mat2_transposedr+  is_woq_int4q_group_sizec                v   ddl m} ddlm} ddlm}	 ddlm}
 t          |           rt          d          sdS t          j        j        sdS |                                t          j        t          j        fv }t          j        t          j        t          j        t          j        t          j        g} |
|||r| j        nd ||          \  }}}} }}t+          ||f          rdS t-          ||j                  r|                                } |	|                                          \  }} |d	||||                                |                                |t3                      | |

  
        }dd}| j        |v o:|d uo6 ||          o+t-          ||j                  o|                                p| S )Nr7   r1  )create_micro_gemm)*get_gemm_template_output_and_compute_dtype)mm_argsCPPF)	out_dtyper2  use_4x2_dim
micro_gemm)input_dtypeinput2_dtypeoutput_dtypenum_threadsuse_refr4  rW   r@   rM   r   c                f    |                                   |                                 d         dk    S )Nr   r7   )freeze_layoutrf  rW   s    rX   is_last_dim_stride1z2use_cpp_gemm_template.<locals>.is_last_dim_stride1	  s*    	||~~b!Q&&rZ   )rW   r@   rM   r   )r6  r2  codegen.cpp_micro_gemmr6  codegen.cpp_utilsr7  kernel.mm_commonr8  r%  r@  rq   cppweight_prepackrg  rS   rp  re  ra  r_  halfr   has_free_symbolsr   BaseViewunwrap_viewparallel_num_threadsrj  is_module_buffer)rm  r&  r(  r2  r+  r3  r4  r2  r6  r7  r8  	int8_gemmrQ  r1  rW  r  r?  r   r<  rE  s                       rX   r,  r,  	  s$    999999MMMMMM)))))) (( 0Ee0L0L u:$ u  U[%*$==I]ENEJUZXM")'"+5&,,'# # #Aq!VT4 A u$$$ "!!@@AQAQRROL!""			NN$$^^%%!(**!  J' ' ' '
 	% 	Cd"	C%%	C tR]++	C ""$$A,A(ArZ   c                 R    t           j        pt           j         pt          d          S )NATEN)rq   rN  rO  r@  rQ   rZ   rX   use_aten_gemm_kernelsrT  	  s*    7v7 '	v	&	&'rZ   c                  R    e Zd ZU  ej        d          Zded<   ddZddZdd
Z	dS )DebugDirManagerr   rN   prev_debug_namerM   r  c                B    t          t          j                  | _        d S r   )r  rV  counterr   r  s    rX   r  zDebugDirManager.__init__	  s    .//rZ   c                    t           j        j        j        | _        | j         d| j         | _        | j        t           j        j        _        d S )N_tmp_)rS   _dynamorq   debug_dir_rootrW  r   new_namer  s    rX   	__enter__zDebugDirManager.__enter__	  sA    $}3B/??dg??.2m+++rZ   r   r   c                n    t          j        | j                   | j        t          j        j        _        d S r   )r_  r`  r^  rW  rS   r\  rq   r]  )r  r   s     rX   __exit__zDebugDirManager.__exit__	  s*    dm$$$.2.B+++rZ   Nr  )r   r   rM   r  )
r   r   r   r  r  rY  r   r  r_  ra  rQ   rZ   rX   rV  rV  	  su         ioa  G0 0 0 0< < < <
C C C C C CrZ   rV  Callable[P, _T]r
  r  tuple[_T, list[str]]c                   ddl m} t                      d	fd}t          j                            |d|          5  t          j                                          | |i |}d d d            n# 1 swxY w Y   |t                    fS )
Nr7   r<   coderN   rM   r  c                2                         |            d S r   )r  re  source_codess    rX   save_output_codez*run_and_get_code.<locals>.save_output_code	  s    rZ   ri  re  rN   rM   r  )
r  r=   r#   r   patchr  rS   r\  resetrx  )r   r   r  r=   ri  r  rh  s         @rX   run_and_get_coderm  	  s    
 %$$$$$$.LLL      
		=*<>N	O	O % %T$V$$% % % % % % % % % % % % % % % 4%%%%s   'A00A47A4c                    |                     dd          }t          | g|R i |\  }}g }|D ]C}|                    t          j        d|t          j                             |rd |D             }D||fS )Nremove_quoteFz	'''.*?'''c                "    g | ]}|d d         S )r*  rQ   )rV   r&  s     rX   rY   z'run_and_get_kernels.<locals>.<listcomp>
  s     :::vad|:::rZ   )r]   rm  rM  r   findallDOTALL)r   r   r  ro  r  rh  kernelsre  s           rX   run_and_get_kernelsru  
  s     ::ne44L+B@@@@@@FLG ; ;rz,bi@@AAA 	;::':::G7?rZ   tuple[Any, list[str]]c                .     d fd}t          |          S )NrM   r   c                 h                 } |                                                                   | S r   )r  rF  )r  r   s    rX   run_with_backwardz1run_fw_bw_and_get_code.<locals>.run_with_backward
  s-    

rZ   )rM   r   )rm  )r   ry  s   ` rX   run_fw_bw_and_get_coderz  
  s2         
 -...rZ   c                b   ddl m} g dfddfd}t          j                            |d|          5  t          j                            |d          5  t
          j                                          | |i |}ddd           n# 1 swxY w Y   ddd           n# 1 swxY w Y   S )zLGet the inductor-generated code, but skip any actual compilation or running.r7   r<   re  rN   rM   r  c                2                         |            d S r   r  rg  s    rX   ri  z"get_code.<locals>.save_output_code
  s    D!!!!!rZ   r  r=   r   c                     G d d          }| j         r|                                 n|                                 \  }} |j                   |r |j                    |            S )Nc                  "    e Zd ZdZd
dZddZd	S )@get_code.<locals>.patched_compile_to_module.<locals>.DummyModulez4This is empty to replace the generated triton modulerM   r  c                    d S r   rQ   r  s    rX   r  zIget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.__init__%
  s    rZ   r   r   r  c                    d S r   rQ   r  s      rX   callzEget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.call(
  s    rZ   Nr  r   r   r  r   rM   r  )r   r   r   r   r  r  rQ   rZ   rX   DummyModuler  "
  sB        FF        rZ   r  )r  codegen_with_cpp_wrappercodegenr   )r  r  wrapper_codekernel_coderi  s       rX   patched_compile_to_modulez+get_code.<locals>.patched_compile_to_module!
  s    	 	 	 	 	 	 	 	 04/?SD))+++T\\^^ 	"k 	+,,, 	0[.///{}}rZ   compile_to_moduleri  Nrj  )r  r=   rM   r   )r  r=   r   rk  r  rS   r\  rl  )r   r   r  r=   r  r   ri  rh  s         @@rX   get_coder  
  st   $$$$$$ L" " " " " "     , 	
.0I	
 	
    	
-);=MNN	    	B                                                            s5   "B$'BB$B	B$B	B$$B(+B(c                    t          | g|R i |}dt          |          cxk    rdk    sn J dt          |                       |d         S Nr7   r  z%expected one or two code outputs got r   )r  r\   )r   r   r  rh  s       rX   get_triton_coder  C
  ss    B000000LL!!&&&&Q&&&&&CL0A0ACC '&& ?rZ   c                    t          | g|R i |\  }}dt          |          cxk    rdk    sn J dt          |                       |d         S r  )rm  r\   )r   r   r  r   rh  s        rX   run_and_get_triton_coder  M
  sy     'r;D;;;F;;OA|L!!&&&&Q&&&&&CL0A0ACC '&& ?rZ   tuple[Any, list[GraphLowering]]c                    ddl m ddlm} |j        g dfd	}t
          j                            |d
|          5   | |i |}d d d            n# 1 swxY w Y   |fS )Nr   r<   rD   r   r   r  rM   r  c                 v     | i | | d         }t          |          sJ                     |           d S )Nr  )r   r  )r   r  r  r=   graph_lowerings	real_inits      rX   	fake_initz-run_and_get_graph_lowering.<locals>.fake_initb
  sQ    	4"6"""Q%/////u%%%%%rZ   r  r  )torch._inductor.graphr=   torch._inductor.output_coderE   r  r   rk  r  )	r   r   r  rE   r  r  r=   r  r  s	         @@@rX   run_and_get_graph_loweringr  Y
  s     433333;;;;;;(IO& & & & & & & & 
		?J		B	B % %T$V$$% % % % % % % % % % % % % % % ?""s   	AAAaten_opoverride_fnc              #     K   ddl m} |j        |          }	 t          j        ||          |j        | <   dV  ||j        | <   dS # ||j        | <   w xY w)z
    Override the lowering of aten_op with override_fn.
    The first argument of override_fn is the original lowering fn.
    r   )loweringN)torch._inductorr  	loweringsr-  partial)r  r  r  orig_fns       rX   override_loweringr  n
  s{       )((((( )G.&/&7W&M&M7#&-7###g7#----s   !A Apre_fnpost_fnOptional[Callable[..., Any]]c                ~     ddl m} |j        d	 fd}t          j        j                            |d|          S )
zr
    Add hook functions to be called at the beginning and end of Scheduler.__init__.
    Used for unit tests.
    r   )	Schedulerr  r   r  rM   c                T     | |            | |          }r | |           |S r   rQ   )r  r  outr  r  r  s      rX   r!  z(add_scheduler_init_hook.<locals>.wrapper
  sE    y%   gi'' 	&GIu%%%
rZ   r  )r  r   r  r   rM   r   )torch._inductor.schedulerr  r  unittestr   rk  r  )r  r  r  r!  r  s   ``  @rX   add_scheduler_init_hookr  
  sh     433333 G        =%%iWEEErZ   msgc                    t           j        rt                              |            dS t                              |            dS )z
    Warnings that will be actionable for PyTorch developers, but not
    end users.  Allows us to easily disable them in stable releases but
    keep them on for nightly builds.
    N)rq   developer_warningsr   rT  info)r  s    rX   developer_warningr  
  s:       CrZ   c                    	 t           j                            d          } | dz   t          t           j                  k     rZt          t           j        | dz                      dk    r4t           j        | dz            d         dk    rt           j        | dz            S n# t          $ r Y nw xY wt           j        D ]0}|                    d          r|t          d          d         c S 1dS )a  
    An experimental API used only when config.benchmark_kernel is true.

    The benchmark name is only available at codegen time. So we can not
    directly call it in benchmark_all_kernels which is run after codegen.

    The function assumes the argument after --only is the benchmark name.
    It works for torchbench.py/hugginface.py/timm_models.py. But for ad-hoc
    scripts, this function may return None.

    There are 2 flavors of --only argument we need handle:
    1. --only model_name
    2. --only=model_name
    z--onlyr7   r   r  z--only=N)r  argvr$  r\   
ValueErrorr  )rZ  r  s     rX   get_benchmark_namer  
  s    	hnnX&&!Gc#(mm##CHS1W%&&**q!!$++8C!G$$    x ) )>>)$$ 	)s9~~''((((	) 4s   BB 
B)(B)r  c                4    t          d | D                       S )Nc              3  "   K   | ]
}|d k    V  dS r7   NrQ   rU   s     rX   r   zis_ones.<locals>.<genexpr>
  &      %%!qAv%%%%%%rZ   r   r  s    rX   is_onesr  
      %%u%%%%%%rZ   c                4    t          d | D                       S )Nc              3  "   K   | ]
}|d k    V  dS )r   NrQ   rU   s     rX   r   zis_zeros.<locals>.<genexpr>
  r  rZ   r  r  s    rX   is_zerosr  
  r  rZ   inputsSequence[torch.Tensor]c                4    t          d | D                       S )Nc              3     K   | ];}t          |t          j                  |j        t          j        d           k    V  <dS )r"  N)r   rS   r  r   )rV   r   s     rX   r   z is_cpu_device.<locals>.<genexpr>
  sY        dEL))u|E***     rZ   r  )r  s    rX   is_cpu_devicer  
  s0           rZ   c                    t          | t          j                  s
J d            | j        rt          j        S t          j        S )Nz8only support sympy.Expr as input to get_sympy_Expr_dtype)r   r   rJ  r   rS   rk  rc  r  s    rX   get_sympy_Expr_dtyper  
  sG    c5:&&  B & ~ {}rZ   should_profileIterator[Any]c              /     K   | r5t          j        j        |i |5 }|V  d d d            d S # 1 swxY w Y   d S d V  d S r   )rS   r   r   )r  r   r  r   s       rX   maybe_profiler  
  s       ^#T4V44 	GGG	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   -11c                 Z    t           j        j        } | dk     rt          j                    } | S Nr7   )rq   rI  threadsrS   get_num_threads)r  s    rX   rO  rO  
  s(    j G{{'))NrZ   c                 t    ddl m}   |             }|                    dt          j        j        rdnd          S )Nr7   )get_backend_options
num_stagesr  r*  )runtime.triton_helpersr  r  rS   r  ry   )r  optionss     rX   get_backend_num_stagesr  
  sD    ;;;;;;!!##G;;|%-*;%BQQCCCrZ   c                   t          | t          j        j        j        j        dk              }||S ddlm}m} t          j        	                                o!t          j        
                                dk    }| t          j        t          j        t          j        fv sJ t          j        |          j                            d          rddlm}  |            }| t          j        t          j        fv r|r || |          S t          j        j        j        j        dk    r |t          j        |          S  |t          j        |          S | t          j        t          j        fv r|r ||           S t          j        j        j        j        dk    r |t          j                  S  |t          j                  S )	z
    We don't want to throw errors in this function. First check to see if the device is in device_info.py,
    then fall back to the inaccurate triton estimation.
    tf32)is_tf32Nr   )get_max_simd_tflopsget_max_tensorcore_tflops)r~   r   
clock_rate)max_clock_rate)r   rS   backendsrH   matmulfp32_precisiontriton.testingr  r  rT   get_device_capabilityr   r_  ra  inspect	signature
parametersr  torch._utils_internalr  )r   ds_topsr  r  SM80OrLaterr  sm_clocks          rX   get_device_tflopsr  
  s    u~*1@FJ  G MMMMMMMM*))++ 
0P0P0R0R W 1K
 U]ENEMBBBBB,--8<<\JJ 6888888!>##U]EN3333,,UH===>%4>>,,U]HEEE&&u}h???U]EN3333,,U333>%4>>,,U];;;&&u}555rZ   c                 "    ddl m}   |             S )Nr   get_dram_gbps)r  r  r  s    rX   get_gpu_dram_gbpsr    s     ,,,,,,=??rZ   c                 t    ddl m}  | j        j                            d                              dd          S )Nr   r*  max_shared_mem)triton.runtimer*  r+  r,  r  r  r  s    rX   get_gpu_shared_memoryr  &  s>    %%%%%%=44Q77;;<LaPPPrZ   c                     t           j                                        rGt           j                                        j        } t           j                                        j        }nd} d}|| z  S )Nrw  i   )rS   rH   rT   r  	warp_sizemax_threads_per_block)r  r  s     rX   get_max_numwarpsr  ,  s`    z   %J4466@	 %
 @ @ B B X 	 $ I--rZ   reduction_typec                ,    |                      d          S )Nwelford)r  r  s    rX   is_welford_reductionr  8  s    $$Y///rZ   c                8    t          |           rdS | dk    rdS dS )Nr*  online_softmax_reducer  r7   )r  r  s    rX   reduction_num_outputsr  <  s-    N++ q	2	2	2qqrZ   c                 0    t          j                    dk    S )NLinux)platformsystemrQ   rZ   rX   is_linuxr  E  s    ?''rZ   c                 "    t           j        dk    S )Nrs   )r  r  rQ   rZ   rX   r^  r^  I  s    <7""rZ   itrIterable[Any]c                4    t          d | D                       S )Nc              3  Z   K   | ]&}t          |t          j                  o|j         V  'd S r   )r   r   rJ  r  rU   s     rX   r   z#has_free_symbols.<locals>.<genexpr>N  s7      JJz!UZ((<_JJJJJJrZ   r  )r  s    rX   rL  rL  M  s    JJcJJJJJJrZ   c            	     t   ddl m} | D ]}t          ||j        |j        |j        |j        |j        f          rJt          |	                                pd          s#t          |
                                pd          r dS zt          ||j                  st          dt          |                     dS )Nr7   r1  rQ   Tzunexpected type for is_dynamic F)r6  r2  r   rh  rj  rM  r  r>   rL  maybe_get_sizemaybe_get_strider@   	TypeErrorr%  )r   r2  r  s      rX   r  r  Q  s     I IbmR[":KRYW
 
 
	I   0 0 2 2 8b99 =M""$$*> >  tt Ary)) 	IGd1ggGGHHH5rZ   c                      e Zd ZdZdZdS )PlaceholderKERNEL_NAMEDESCRIPTIVE_NAMEN)r   r   r   r  r  rQ   rZ   rX   r  r  e  s          K *rZ   r  r  r4   inpc                   ddl m} t          j        dd          5 }t	          j                    }t	          j                    } t          |t          |                    j        |  t          d|j
         |           t          |j
        |           t          j                    }t          ||          5   | |j
                   d d d            n# 1 swxY w Y   t          j                    |z
  }	 ||j
                   |j
                                         |                                 t          d	|j
         |           t          |j
        |           |                                |                                k    }
t"                              d
||j        |
|	           d d d            d S # 1 swxY w Y   d S )Nr7   )stable_topological_sortr  zutf-8)modeencoding)r  	fake_modezBefore:
)filezAfter:
zZ%s, save before/after graph to %s, graph before/after are the same = %s, time elapsed = %s)pattern_matcherr  rY  NamedTemporaryFileior   rf   rb   	propagater  r  r   nowre   lint	recompiler  r   r  r   )r  r  r  r  r  rP  	before_ioafter_io
start_timetime_elapsedr  s              rX   pass_execution_and_saver$  o  sA    988888		$
 
 
 
 
KMM	;==C	R#3C#8#8999CSII$"($$1----bhY''''\^^
#B,, 	 	DNNN	 	 	 	 	 	 	 	 	 	 	 	 	 	 	|~~
2)))

###!,,,,bhX&&&&  H$5$5$7$77hF	
 	
 	
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
s7   BG <CG C	G  C	!CG  GG	input_buf"Optional[Union[Buffer, Operation]]c                l    ddl m} t          | |j                  ot          | j        |j                  S )zB
    Check if input buffer is a multi-outputs template buffer
    r7   r1  )r6  r2  r   CppTemplateBufferrm  MultiOutputLayoutr%  r2  s     rX   is_multi_outputs_templater+    sG     i!566 :".< < rZ   c                    ddl m} t          | |j                  o1t	          | j                  dk    ot          | j        d                   S )zL
    Check if input buffer is a output of multi-outputs template buffer
    r7   r1  r   )r6  r2  r   MultiOutputr\   r  r+  r*  s     rX   #is_output_of_multi_outputs_templater.    s\      	9bn-- 	;	 !!Q&	;%i&6q&9::rZ   r   Optional[Union[Node, Operation]]!Optional[torch._ops.OperatorBase]c                D   | dS ddl m} t          | |j                  o"t          | |j                   o|d u p| j        |u pt          |           |j        u ot          t          j
        j        d          o#| j        t          j
        j        j        j        k    pt          t          j
        j        d          o#| j        t          j
        j        j        j        k    pBt          t          j
        j        d          o#| j        t          j
        j        j        j        k    S )NFr7   r1  all_to_all_singleall_gather_into_tensorreduce_scatter_tensor)r6  r2  r   _CollectiveKernel_WaitKernelop_overloadr%  FallbackKernelr  rS   r  torchrecr2  defaultr3  r4  r  r  r2  s      rX   is_collectiver<    s%    |u 	4-.. 	34000	34Z14+r1 	T

b'' 	

 	*,?@@ U$	(:(L(TT
 	*,DEE E$9%<DE 	*,CDD Y$	(:(P(XX/rZ   "Optional[Union[IRNode, Operation]]c                :    ddl m} t          |           |j        u S Nr7   r1  )r6  r2  r%  r6  )r  r2  s     rX   is_waitr@    s'    ::''rZ   snoderF   	filter_fn-Optional[Callable[[BaseSchedulerNode], bool]]c                    ddl m} t          | |          rt          d | j        D                       S t          | j                  o|d u p
 ||           S )Nr   GroupedSchedulerNodec              3  4   K   | ]}t          |          V  d S r   )contains_collectiverU   s     rX   r   z&contains_collective.<locals>.<genexpr>  s+      @@a&q))@@@@@@rZ   )r  rF  r   r  snodesr<  r  )rA  rB  rF  s      rX   rH  rH    sr     ?>>>>>%-.. A@@5<@@@@@@$$P)t*;*Oyy?O?OPrZ   c                    ddl m} t          | |          rt          d | j        D                       S t          | j                  S )Nr   rE  c              3  4   K   | ]}t          |          V  d S r   )contains_waitrU   s     rX   r   z contains_wait.<locals>.<genexpr>  s*      ::=##::::::rZ   )r  rF  r   r  rI  r@  r  )rA  rF  s     rX   rL  rL    sV    >>>>>>%-.. #::U\::::::uz"""rZ   Optional[Operation]?Union[torch._ops.OpOverload, Collection[torch._ops.OpOverload]]c                    ddl m} t          |t          j        j                  r|g}t          | |j                  o| j        |v S r?  )r6  r2  r   rS   r  r  r8  r7  r;  s      rX   is_fallback_oprP    sT     "ej+,, TdB-..I43Cr3IIrZ   buf_namename_to_bufname_to_fused_nodec                L    |||          j                                                  S r   )defining_opr  )rQ  rR  rS  s      rX   buf_name_to_fused_snoderV    s#     k(3?HHJJKKrZ   c                    dS r  rQ   rA  s    rX   r  r        u rZ   collected_node_setMutableSet[BaseSchedulerNode]dict[str, SchedulerBuffer]dict[str, BaseSchedulerNode]criteria_cbCallable[[Any], bool]c                     ||           rd S |                     |            | j        D ]1}t          |j        ||          }||v rt	          |||||           2d S )Nr^  )r  unmet_dependenciesrV  r   find_recursive_deps_of_node)rA  rZ  rR  rS  r^  depdefining_op_for_deps          rX   rc  rc    s     {5 5!!!' 
 
5Hk#5
 
 "444##	
 	
 	
 	
 	

 
rZ   c                    dS r  rQ   rX  s    rX   r  r    rY  rZ   c           	         ||           rd S |                     |            |                                 D ]}|j        D ]}}|j        J |j                                        dk    r)|j                                        |vrE||j                                                 }||v rit          |||||           ~d S )NOUTPUTra  )r  get_outputsr  r  r  find_recursive_users_of_node)rA  rZ  rR  rS  r^  or  user_ops           rX   rj  rj    s    {5 5!!!    G 	 	D9(((y!!##x//y!!##+===(););)=)=>G,,,(""'    	 rZ   dynamo_gm_num_inputsaot_fw_gm_num_inputsc                F    t           j        j        j        rdnd}|| z
  |z
  S )zaComputes the number of inputs to the aot fw graph which have fixed addresses (params and buffers)r  r   )rS   
_functorchrq   functionalize_rng_ops)rm  rn  num_rng_seed_offset_inputss      rX   num_fw_fixed_argumentsrs  4  s2     $:A   "669SSSrZ   fx_gc                   d	d}d}g }| j         j        D ]2}|j        dk    r% ||          r|                    |           |dz  }3|t	          t          t          |                              k    sJ t          |          S )
z>
    Infers which inputs are static for a backwards graph
    rW   r5   rM   r   c                J    d| j         vod| j         vod| j         vod| j         vS )Ntangentsbwd_seedbwd_base_offsetbwd_rng_stater  rD  s    rX   is_saved_tensorz'count_tangents.<locals>.is_saved_tensorD  s@    af$ .!&(.!/.  qv-		
rZ   r   r  r7   )rW   r5   rM   r   )r  r  r  r  rx  r   r\   )rt  r{  	arg_countstatic_arg_idxsrW  s        rX   count_tangentsr~  ?  s    

 
 
 
 IOZ  4=  q!! 2&&y111NId5_)=)=#>#>??????rZ   c                  :    e Zd ZU ded<   d
dZedd            Zd	S )	BoxedBoolr   r   rM   c                    | j         S r   )r   r  s    rX   r  zBoxedBool.__bool__\  s
    zrZ   r  r   Union[BoxedBool, bool]c                B    t          | t                    r	d| _        | S dS r  )r   r  r   r9  s    rX   disablezBoxedBool.disable_  s%    c9%% 	CIJurZ   Nr  )r  r   rM   r  )r   r   r   r   r  r  r  rQ   rZ   rX   r  r  X  sS         KKK       \  rZ   r  kernel_listc              #      K   ddl m} |j        	 	 	 dd fd}t          j                            |d|          5  d V  d d d            d S # 1 swxY w Y   d S )Nr7   r9   Tr  r:   kernel_namerN   r  r  rF  gpur   cpp_definitionrM   r   c                N                         |            | |||||          S r   r}  )r  r  r  r  r  r  r  orig_define_kernels         rX   define_kernelz.collect_defined_kernels.<locals>.define_kernelm  s;     	;'''!!+{Hc>
 
 	
rZ   r  )NTN)r  r:   r  rN   r  rN   r  rF  r  r   r  rF  rM   r   )codegen.wrapperr:   r  r   rk  r  )r  r:   r  r  s   `  @rX   collect_defined_kernelsr  g  s      555555-; #'(,
 
 
 
 
 
 
 
 
		/-	P	P                   s   AAAc                    | dz   S )N__original__rQ   r  s    rX    get_cloned_parameter_buffer_namer  ~  s    .  rZ   c                    | t           v S r   )r[   r  s    rX   r7  r7    s    YrZ   c                 (    t           j        j        duS )z,Check if we're running on ROCm/HIP platform.N)rS   r  ry   rQ   rZ   rX   is_rocmr    s    =D((rZ   c                ,    | dk    ot          |           S )NrI   )r7  r  s    rX   device_need_guardr    s    U?-vf~~-rZ   c                6   | t           j        k    r@t           j                                        r"t           j                                        dk     S | t           j        k    r t           j                                        rdS | t           j        t           j        fv S )N)r  r   T)rS   r_  rH   rT   r  rJ   rk  r   r  s    rX   ,needs_fallback_due_to_atomic_add_limitationsr    st    5:#:#:#<#<z//11F::	%.	 	 UY%;%;%=%=	 tej111rZ   r7  
self_dtype	src_dtypesrc_device_typesrc_is_tensorc                *   | j         t          j        j        j        t          j        j        j        fv r|dS | j         t          j        j        j        k    rdnd}|d |fvp|ot          |          ot          |          p| j         t          j        j        j        k    oA|dk    o;|o9|dk    o3t          j
        j        o"t          j
        j        pt                      dk    p2||k    o|t          j        t          j        fv pt          j                    S )NFr  r  r"  r7   )overloadpacketrS   r  atenscatter_reduce_scatter_reducescatter_r7  r  rq   rI  fallback_scatter_reduce_sumdynamic_threadsrO  r   rk  $are_deterministic_algorithms_enabled)r7  r  r  r  r  r  	reduce_tys          rX   use_scatter_fallbackr    s=    	"IN*EIN,IJ	K 	K"u +uy~/FFFE 
 	tY// 	8 H''H<YGG		8 &%).*HH L%'LL  5(L 
6	L
 +J/C/E/E/J	8 i'SJ5:u{:S,S	8 577!rZ   c                   ddl m}m} ddlm} t          dt          |            d           t          |           D ]!\  }}t          d|dd           ||u rt          d	           .||u rt          d
           Bt          ||          r|	                                }t          |rdnd d           |r*|j
        J t          d|j
        j        j                    t          d           |j        j        D ]}t          |           t          d           |j        j        D ]}t          |           t!          dt#          |                     dS )z
    An API that can be used in pdb to dump a node_schedule.
    Right mainly dump the read/write dependencies but can add more as needed.
    r   )DisableReductionEnableReduction)SchedulerNodezNode schedule with z nodesra  3r  zenable reductionzdisable reductionredpwz scheduler nodeNzoriginal reduction hint zReadDep:z	WriteDep:zUnrecognized node type: )torch._inductor.codegen.simdr  r  r  r  r  r\   r  r   is_reductionr  ri  reduction_hintr}  r~  r  r  r%  )r-  r  r  r  rZ  r  is_redrd  s           rX   dump_node_scheduler    s   
 ONNNNNNN777777	
:M 2 2
:
:
:;;;}-- H H	Tl#llll?""$%%%%%%%%&&&&m,, 	H&&((Ff.UU$???@@@ Ry,,,P1NPPQQQ*'-  c



+'.  c



 F$t**FFGGG+H HrZ   r   r  c                    ddl m}  ||                                 t          | j                  z  t
          z  dk              S )Nr   )r  )r  r  storage_offsetr  r   GPU_ALIGN_BYTES)r   r  s     rX   tensor_is_alignedr    sU     LKKKKK  				 	 >&,#?#?	??RVWW  rZ   example_inputc                n    t          | j        j                  sdS t          j        pt          |           S r  )r7  r   r%  rq   assume_aligned_inputsr  )r  s    rX   should_assume_input_alignedr    s6     -&+,, u'K+<]+K+KKrZ   r  c                     t           j        j                                        } | st	          j                    S | j        r| j        j        st	          j                    S | j        j        }|                                S r   )	rS   _guardsTracingContexttry_getr  nullcontextr  r  suppress_guards)tracing_contextr  s     rX   #maybe_get_suppress_shape_guards_ctxr    sw    
 m2::<<O (%''' $ (O,E,O (%''')3I$$&&&rZ   tuple[_T, str]c                   t           j        j                            t          dd          5  t
          j                                         dd l}dd l	} |j
                    } |j        |          }ddlm} |                    |           |j        }|                    |j                    | |i |}	|                                }
|                    |           |                    |           d d d            n# 1 swxY w Y   |	|
fS )Nr   Tr   )output_code_log)r  r   rk  r  rq   rS   r\  rl  r  loggingr   StreamHandlertorch._inductor.codecacher  
addHandlerlevelsetLevelDEBUGr  removeHandler)r   r   r  r  r  log_capture_stringchr  
prev_levelr  r   s              rX   run_and_get_cpp_coder    sZ    
		#	#FGT	:	: * *			(R[]]"W"#566======""2&&&$*
  ///T$V$$''))  ,,,%%b)))* * * * * * * * * * * * * * *  19s   CD  DDSequence[InputType]Optional[ShapeEnv]c                   t          |           }||j        S | D ]}t          |t          j                  r|j        j        c S t          |t          j                  r|                                D ],}t          |t          j                  r|j        j        c c S -|                                D ],}t          |t          j                  r|j        j        c c S -d S r   )	rb   r  r   rS   r2   r  r  r  rx  )r  r  inputr  rx  s        rX   shape_env_from_inputsr     s    ((I ""  1 1eU\** 	(:'''' eU\** 	1

 / /dEL11 /9....../,,.. 1 1fel33 1!;0000001 4rZ   Callable[[list[InputType]], _T]inputs_to_checkmutated_input_idxsOrderedSet[int]c                F     t                    dk    r S d fd}|S )Nr   
new_inputslist[InputType]rM   r   c                    t          |           \  }} |           }t          |          rt          j        ||           |S r   )copy_misaligned_inputsr\   rS   _foreach_copy_)r  old_tensorsnew_tensorsr  r  r  r  s       rX   r'  z)align_inputs_from_check_idxs.<locals>.runE  sZ    #9);$
 $
 [ eJ { 	; k:::
rZ   )r  r  rM   r   )r\   )r  r  r  r'  s   ``` rX   align_inputs_from_check_idxsr  =  sN    
 ?q          JrZ   c                   d|                                  v rd}nNt          d t          |                                  |                                           D                       dz   }t	          j        | |fd                                          }t	          j        ||                                  |                                           S )Nr   c              3  ,   K   | ]\  }}|d z
  |z  V  dS r  rQ   )rV   ro  rx  s      rX   r   z)clone_preserve_strides.<locals>.<genexpr>[  s/      TTf$TTTTTTrZ   r7   r   )r  r  r   rx  rS   
as_stridedclone)rW   needed_sizerc  s      rX   clone_preserve_stridesr  U  s    AFFHH}} TT#affhh

:S:STTTTTWXX 	 a+66<<>>FFAFFHHahhjj999rZ   r  r  check_inputs_idxsreturn_pair_idxsOptional[OrderedSet[int]]-tuple[list[torch.Tensor], list[torch.Tensor]]c                d   g }g }|du}|D ]}| |         }t          |t          j                  sJ dt          |                       |                                t
          z  rHt          |          | |<   |r4||v r0|                    |           |                    | |                    ||fS )z
    Clones misaligned tensors which we inferred were aligned. Returns a tuple of [old_tensors], [new_tensors] for every
    cloned tensor which is in `return_pair_idxs`.
    Nz Expected tensors only, but got: )r   rS   r  r%  data_ptr	ALIGNMENTr  r  )r  r  r  r  r  ret_pair_definedr   _inps           rX   r  r  a  s     ')K&(K (t3 
2 
2!}$-- 	
 	
;tDzz;;	
 	
- ==??Y& 	22488JqM 2A)9$9$9""4((("":a=111##rZ   static_input_idxsc                   g }|D ]Y}| |         }t          |t          j                  r5|                                t          z  dk    r|                    |           Zt          |          t          |          k    r|S |S )z[
    We require all inputs to be aligned, so introduce a copy for any
    that aren't.
    r   )r   rS   r  r  r  r  r\   )r  r  aligned_static_input_idxsrZ  r  s        rX   remove_unaligned_input_idxsr    s     !#  2 2seU\** 	20@0@90LQR/R/R%,,S111
$%%->)?)???((rZ   r   c                   ddl m} t          j        t          j                  j        }|j        j        j        }|j        j        j	        j
        }t          j        r"|j        j                            | |           dS |j        j                            | |k              rdS |j        r%|j        j                            | dk               rdS  ||           o ||           |k    S )Nr7   r  Tg@xDF)r  r  rS   iinfori  r   r  r  r  r  has_hintrq   assume_32bit_indexing	check_leqr  r  )r   r  int_maxr  r  s        rX   expr_fits_within_32bitr    s    k%+&&*G *Iw)2H# 	""1g...t 	w--a7l;; t 	  711!d(;; 	 5 8A;;299Q<<722rZ   compiled_graphrE   c                   t           j        j                                        }||j        t          |j                  dk    sJ t          |           |j        J |j        D ]}||j                            d            dt           j        j                                        x}r|j        d	fd|j                            t          fd|D                                  d S d S d S )
Nr   Fr   r   rM   ,Union[float, int, SymInt, SymFloat, SymBool]c                ~    t          |           S r                    |           S                     |           S r   )r   deserialize_symexprevaluate_symexpr)r   fakify_first_callr  s    rX   map_exprz4set_tracing_context_output_strides.<locals>.map_expr  sE     ("1vv( @(<<Q???$55a888rZ   c              3  .   K   | ]} |          V  d S r   rQ   )rV   r   r  s     rX   r   z5set_tracing_context_output_strides.<locals>.<genexpr>  s+      55!((1++555555rZ   )r   r   rM   r  )
rS   r  r  r  output_stridesr\   r  r  r  r  )r  r  r  r~  r   r  r  r  s        @@@rX   "set_tracing_context_output_stridesr	    s>    m*2244Gw5A7)**a////).99	,888#2 	 	E}&--d3333$)!-6>>@@@3 >(+(=%9 9 9 9 9 9 9 &--5555u55555   ' AA	 	rZ   c                    t           j        t           j        S t          j                    sdS t          j                                        rdS 	 ddlm}  n# t          $ r Y dS w xY w| t          j        	                    d          k    S )NFr   REMOTE_CACHE_VERSIONz.pytorch/remote_cache:fx_graph_memcache_version)
rq   fx_graph_remote_cache	is_fbcoderS   _utils_internalis_fb_unit_testtorch._inductor.fb.remote_cacher  ModuleNotFoundErrorjustknobs_getval_intr  s    rX    should_use_remote_fx_graph_cacher    s    #/++ u,,.. uHHHHHHH   uu  5#8#M#M8$ $  s   A 
A$#A$c                .    t          j        dd|           S )Nz[^a-zA-Z0-9_]r   )r   subr  s    rX   normalize_namer    s    6"C...rZ   ztl.int1ztl.float8e4nvztl.float8e5ztl.float8e4b8ztl.float8e5b16ztl.uint8)ztl.boolztl.float8_e4m3fnztl.float8_e5m2ztl.float8_e4m3fnuzztl.float8_e5m2fnuzztl.float8_e8m0fnuztl.float4_e2m1fn_x2c                    i | ]\  }}||	S rQ   rQ   )rV   r  r   s      rX   rB  rB    s    GGG$!QAGGGrZ   z^.*[.]c                    t                               dt          |                     }t                              ||          S )z"Convert torch.dtype to triton typetl.)_triton_type_rer  rN   _triton_type_mappingr  )r   triton_type_names     rX   triton_typer    s6    &**5#e**==##$46FGGGrZ   c                    t                               | |           }|                    dd          }t          t          |          }t          |t          j                  sJ |S )Nr  r6  )_torch_triton_mappingr  r  rR   rS   r   r   )r   adjusted_type	type_namer:  s       rX   triton_type_to_torchr#    sY    )--eU;;M%%eR00Iy))Ii-----rZ   ri  c                   | j          o|                                 |                                k    o|                                 |                                k    o| j        |j        k    o| j        |j        k    ow|                                                                 |                                                                k    o)|                                 |                                k    S r   )	is_mkldnnr  rx  r   r   untyped_storager  r  ri  r   s     rX   is_same_tensorr(    s    N 	<IIKK5::<<'	<KKMMU\\^^+	< J%+%	< K5<'		<
   ""++--1F1F1H1H1Q1Q1S1SS	< !!U%9%9%;%;;rZ   c                8   | j         o|                                 |                                k    oi| j        |j        k    oY| j        |j        k    oIt          j        j                            |           t          j        j                            |          k    S r   )r%  r  r   r   rS   r  mkldnnr  r'  s     rX   is_same_mkldnn_tensorr+    s     	PIIKK5::<<'	PJ%+%	P K5<'	P I%%d++uy/?/H/H/O/OOrZ   tuple[str, ...]c                     dS )N)r  isnanlogical_notlogical_andsignbitand_leltgegteqner7  xorrQ   rQ   rZ   rX   boolean_opsr:  "  s     rZ   c                  $    e Zd ZU ded<   ded<   dS )OpDtypeRuler3   type_promotion_kindrN  override_return_dtypeNr  rQ   rZ   rX   r<  r<  6  s*         8888000000rZ   r<  zdict[str, OpDtypeRule]op_dtype_propagation_rulesr=  r3   r>  c                6    t          ||          t          | <   d S r   )r<  r?  )r   r=  r>  s      rX   #register_op_dtype_propagation_rulesrA  ?  s%    
 (32( (t$$$rZ   zOrderedSet[str]op_requires_libdevice_fp64c                :    t                               |            d S r   )rB  r  r  s    rX   #register_op_requires_libdevice_fp64rD  L  s    ""4(((((rZ   r   c                    ddl m} | s|j                                        j        } | dk    rt
          j        S | dk    rdS | dk    rt
          j        S t
          j        S )Nr   r  r"  rI   rJ   )	r  r  r  get_current_device_or_throwr%  rq   cpu_backendxpu_backendcuda_backend)r   r  s     rX   get_current_backendrJ  P  st    ------ Ag99;;@e!!			u			!!""rZ   c                    | t           j        t           j        fv r/t          j        j        rt                      dk    rt           j        S | S )z"Maybe upcast [b]float16 to float32rK  )rS   r   r_  rq   rK  codegen_upcast_to_fp32rJ  ra  r  s    rX   upcast_compute_typerM  _  sC     	%-000M0 	1!!X--}LrZ   KeyTypeValTypec                  \    e Zd ZdZddZdd	ZddZddZdddZd dZ	d!dZ
d"dZd#dZdS )$
ScopedDictz
    A dictionary-like object that allows for scoped updates. It maintains
    an original dictionary and a set of new items that can override
    the original items within the scope.  The original dictionary is
    unmodified.
    original_dictMapping[KeyType, ValType]c                "    || _         i | _        d S r   rR  	new_items)r  rR  s     rX   r  zScopedDict.__init__v  s    *13rZ   rM  rN  rM   rO  c                H    || j         v r| j         |         S | j        |         S r   rV  rR  r  s     rX   ri  zScopedDict.__getitem__z  s*    $.  >#&&!#&&rZ   r   r  c                    || j         |<   d S r   )rV  )r  rM  r   s      rX   __setitem__zScopedDict.__setitem__  s    #srZ   r  r   c                &    || j         v p|| j        v S r   rX  r  s     rX   __contains__zScopedDict.__contains__  s    dn$At/A(AArZ   Nr:  Optional[ValType]c                d    || j         v r| j         |         S | j                            ||          S r   )rV  rR  r  )r  rM  r:  s      rX   r  zScopedDict.get  s5    $.  >#&&!%%c7333rZ   r   c                ^    t          | j                  }| j        D ]}|| j        vr|dz  }|S r  )r\   rR  rV  )r  rW  r  s      rX   r  zScopedDict.__len__  s@    "## 	 	A***QrZ   Iterator[KeyType]c              #  R   K   | j         E d {V  | j        D ]}|| j         vr|V  d S r   rU  )r  r  s     rX   __iter__zScopedDict.__iter__  sT      %%%%%%%% 	 	A***	 	rZ   c                8    t          | j        p| j                  S r   )r   rR  rV  r  s    rX   r  zScopedDict.__bool__  s    D&8$.999rZ   c                    t           r   r  r  s     rX   __delitem__zScopedDict.__delitem__  s    !!rZ   )rR  rS  )rM  rN  rM   rO  )rM  rN  r   rO  rM   r  )rM  r  rM   r   r   )rM  rN  r:  r]  rM   r]  r  )rM   r`  r  )rM  rN  rM   r  )r   r   r   r   r  ri  rZ  r\  r  r  rb  r  re  rQ   rZ   rX   rQ  rQ  n  s         4 4 4 4' ' ' '
$ $ $ $B B B B4 4 4 4 4
      : : : :" " " " " "rZ   rQ  )frozen_defaultr   Optional[type[Any]]r   c              .    dfd}| |S  ||           S )Nr   rt   rM   c                2    t          j        | d          S )NT)kw_onlyr   )dataclasses	dataclass)r   r   s    rX   wrapzir_dataclass.<locals>.wrap  s    $S$vFFFFrZ   )r   rt   rM   rt   rQ   )r   r   rm  s    ` rX   ir_dataclassrn    sA    G G G G G G {499rZ   Optional[list[int]]c                 v    t           j        j                                        } | | j        r| j        j        S d S r   )rS   r  r  r  fw_metadatabw_donated_idxs)r  s    rX   get_donated_idxsrs    s7    m2::<<O"'B"*::4rZ   c                  "    e Zd ZdZdZdZdZdZdS )TritonAttrsDescriptorVersionr   r7   r  r*  r  N)r   r   r   V0_NO_TRITONV1_COMPILERV2_BACKENDSV3_BACKENDS_TUPLEV4_DICTrQ   rZ   rX   ru  ru    s,        LKK	  GGGrZ   ru  c                    t           j                            d          t          j        S dd l} dd l} t          | j        j	        d          rt          j
        S t          | j	        j	        d          rt          j        S t          j        S )NrK  r   AttrsDescriptor)r  r  r  ru  rv  triton.backends.compilertriton.compiler.compilerr  r  compilerrx  rw  rz  )rK  s    rX   #get_triton_attrs_descriptor_versionr    s    ~))1+88########v'):;; 4 ,77	)+<	=	= 4+77 ,33rZ   c                 :    t                      t          j        k    S r   )r  ru  rz  rQ   rZ   rX   triton_version_uses_attrs_dictr    s    .004P4XXXrZ   torch._ops.OperatorBasec                    |                                  }t          | t          j        j                  r| d| j         n|}||fS )Nrx   )r   r   rS   r  r  _overloadname)r  op_overload_packet_nameop_overload_names      rX   get_op_namesr    sV    #%7799 b%*/00	%"77R%5777$ 
 #$444rZ   r  torch.fx.Nodec                   ddl m} | j        }t          |t          j        j                  sdS |t          j        j        j	        j
        t          j        j        j        j
        t          j        j        j        j
        fv r\ ||| j        | j        d          }|A|\  }}|d         }|D ]1}|-|j        d         j        t          j        t          j        fv r dS 2dS )	a  
    Check if an FX node is cudagraph-unsafe based on its input arguments.

    Some ops are only cudagraph-unsafe depending on their inputs (e.g., index_put
    with boolean indices triggers .nonzero() during capture, but integer indices
    are safe).
    r   )normalize_functionFT)normalize_to_only_use_kwargsNindicesr  )torch.fx.operator_schemasr  r  r   rS   r  r  r  r  	index_putr:  
index_put__unsafe_index_putr   r  r#  r   r   rp  )r  r  r  
normalizedr   r  r  rZ  s           rX   ,_fx_node_is_input_dependent_cudagraph_unsafer    s     =<<<<<^Ffej344 u 	 (	!)	(0  
 ('GL'.t
 
 

 !"IAvY'G    ?sx'<JKA ( (  445rZ   c                   | j         }t          |          t          v rdS t          |t          j        j                  rt          j        j        j	        |j
        v rdS t          |           rdS | j                            d          x}Jt          |t          t          f          s|gn|}|D ]&}t          |t          j                  r
|j        r dS 'dS )a   
    Check if an FX node is cudagraph-unsafe.

    This includes:
    - Ops in FORBIDDEN_CUDAGRAPH_OPS (CPU sync, dynamic alloc, etc.)
    - Ops with the cudagraph_unsafe tag
    - Input-dependent unsafe ops (e.g., index_put with boolean indices)
    - Ops with sparse tensor outputs
    Tr  NF)r  rN   FORBIDDEN_CUDAGRAPH_OPSr   rS   r  r  r  r  cudagraph_unsafer  r  r#  r  rx  r  r  	is_sparse)r  r  r  valsr   s        rX   r  r    s     ^F 6{{---t 	65:011HL)V[88t 4G<< t |&&&3&sT5M::Cuu 	 	A!U\** q{ tt5rZ   rB   c                    ddl m} t          | |j        |j        f          rdS t          | |j        |j        f          sdS t          | dd          }|t          |          rdS dS )ah  
    Returns True if the node is an op that is not cudagraphable.
    This includes:
    - Ops in FORBIDDEN_CUDAGRAPH_OPS (CPU sync, dynamic alloc, etc.)
    - Ops with the cudagraph_unsafe tag
    - index_put_ with boolean indices (triggers .nonzero() during capture)
    - Control flow nodes (Conditional, WhileLoop)
    - Ops with sparse tensor outputs
    r7   r1  TFr  N)	r6  r2  r   Conditional	WhileLoopr8  r?   rR   r  )r  r2  r  s      rX   is_cudagraph_unsafe_opr  *  s      $677 tdR.@AA udIt,,G:7CCt5rZ   c                    t           j                            dd          } t          j                    rXddlm}  |            }|rFt           j                            |dd          }| r!t           j	                            || g          n|} | S )NLD_LIBRARY_PATHr6  r   )get_runtime_pathr!  lib)
r?  r@  r  rq   r  libfb.py.parutilr  rN  rM  pathsep)rN  r  runtime_pathlib_paths       rX   get_ld_library_pathr  D  s    :>>+R00D K555555'')) 	Kw||L)UCCH8<J2:??Hd#3444(DKrZ   c                @    ddl m} t          | |          o| j        d uS )Nr   )SubgraphPythonWrapperCodegen)torch._inductor.codegen.wrapperr  r   partition_signatures)r!  r  s     rX   #is_codegen_graph_partition_subgraphr  Q  s9    LLLLLL 	7899 	5(4rZ   c                     t           j        j        j        j        pt
          j        d uot           j        j        j        S r   )rS   r   rq   rK  
cudagraphs&_unstable_customized_partition_wrapperr!  graph_partitionrQ   rZ   rX   is_using_cudagraph_partitionr  Z  s8    %0 	F19E1 /
 
01rZ   c                    ddl m} |j        j                            | d          r,|j        j                            | d          rt          j        S t          j        S )Nr7   r  l        i   )	r  r  r  r  statically_known_ltrz  rS   ri  rk  )r  r  s     rX   dtype_from_sizer  a  se    w++e  
'

/
/h
?
? {{rZ   )r"  rJ   c                d    | dk    r#t           j        j                                        S d| v rdS dS )z;
    Returns True if the device supports MKL-DNN BF16.
    r"  rJ   TF)rS   r  r*  _is_mkldnn_bf16_supportedr   s    rX   is_mkldnn_bf16_supportedr  o  ;     ey99;;;	+		t5rZ   c                d    | dk    r#t           j        j                                        S d| v rdS dS )z;
    Returns True if the device supports MKL-DNN FP16.
    r"  rJ   TF)rS   r  r*  _is_mkldnn_fp16_supportedr  s    rX   is_mkldnn_fp16_supportedr  {  r  rZ   elementsSequence[Sequence[T]]headersSequence[T]c           
        d |D             }| D ]l}t          |          t          |          k    sJ t          |          D ]8\  }}t          ||         t          t          |                              ||<   9mg }|                    d                    d t          ||          D                                  t          |          t          |          dz  z   t          |          dz
  z   }|                    d|z             | D ]B}|                    d                    d t          ||          D                                  Cd                    |          S )	Nc                F    g | ]}t          t          |                    S rQ   )r\   rN   )rV   r   s     rX   rY   ztabulate_2d.<locals>.<listcomp>  s$    +++ac#a&&kk+++rZ   |c              3  ,   K   | ]\  }}d || d V  dS ra  NrQ   )rV   hr  s      rX   r   ztabulate_2d.<locals>.<genexpr>  s2      HH41a,a,,,,HHHHHHrZ   r  r7   r  c              3  ,   K   | ]\  }}d || d V  dS r  rQ   )rV   r   r  s      rX   r   ztabulate_2d.<locals>.<genexpr>  s2      HHtq!l!QllllHHHHHHrZ   r  )r\   r  r   rN   r  rM  r   r  )r  r  widthsrowr   r   r  total_widths           rX   tabulate_2dr    s]   ++7+++F 4 43xx3w<<''''cNN 	4 	4DAqF1Is3q66{{33F1II	4E	LLHH3w3G3GHHHHHIIIf++Vq1S[[1_EK	LL{"### J JSXXHHs37G7GHHHHHIIII99UrZ   dict1rS  dict2
d1_defaultValType | None
d2_defaultEGenerator[tuple[KeyType, ValType | None, ValType | None], None, None]c              #    K   t          |                                           t          |                                          z  }|D ];}|                     |          }|                    |          }|||n|||n|fV  <dS )a  
    Zip two dictionaries together, replacing missing keys with default values.

    Args:
        dict1 (dict): The first dictionary.
        dict2 (dict): The second dictionary.
        d1_default (Any): the default value for the first dictionary
        d2_default (Any): the default value for the second dictionary

    Yields:
        tuple: A tuple containing the key, the value from dict1 (or d1_default if missing),
               and the value from dict2 (or d2_default if missing).
    N)r#   r  r  )r  r  r  r  all_keysrM  value1value2s           rX   	zip_dictsr    s      ( %**,,''*UZZ\\*B*BBH  	
 	
33 (FFj(FFj
 	
 	
 	
 	
	
 	
rZ   config_patchesc                (   dd	}dd
}|                      dt          j        j                  }|                                 } |rm || dd            || dd            || dt
          j        j                     || dd            || dt          j        j	                    || dd           |                      dt          j
        j                  }|                      dt          j
        j                  }|dk    r|rt          d          | S )a6  
    Ensures the configuration is internally consistent for standalone AOTInductor.

    If `aot_inductor_mode.compile_standalone` is set to True in the provided
    `config_patches` (or falls back to the global config), this function ensures
    that the following configs are also enabled:
        - `aot_inductor.package_cpp_only`

    Args:
        config_patches (dict[str, Any]): A dictionary of user-provided config
            overrides for AOTInductor compilation.

    Returns:
        dict[str, Any]: The possibly-updated `config_patches` dictionary.
    r  r  config_namerN   config_valuer   rM   r  c                    |                      |t          t          |                    }||| |<   d S |s||k    rt          d| d| d          d S d S )NzInvalid config: =z3 when aot_inductor_mode.compile_standalone is True.)r  rR   rq   r  r  r  r  r   s       rX   patch_configz2maybe_aoti_standalone_config.<locals>.patch_config  s     "";0L0LMM=*6N;''' 	5L00r;rrrrr  	 	00rZ   c                    |                      |t          t          |                    }||k    rt                              d||           || |<   d S )NzDOverriding: %s=%s when aot_inductor_mode.compile_standalone is True.)r  rR   rq   r   rT  r  s       rX   force_patch_configz8maybe_aoti_standalone_config.<locals>.force_patch_config  s_     "";0L0LMML  KKV  
 '3{###rZ   z$aot_inductor_mode.compile_standalonezaot_inductor.package_cpp_onlyTz aot_inductor.embed_kernel_binaryz#aot_inductor.emit_multi_arch_kernelz+aot_inductor.model_name_for_generated_files
aoti_modelzaot_inductor.link_libtorchzaot_inductor.dynamic_linkageFz"aot_inductor.cross_target_platformz$aot_inductor.package_constants_in_sowindowszconfig.aot_inductor.package_constants_in_so is not supported for windows cross-compilation. Please use config.aot_inductor.package_constants_on_disk_format = binary_blob.)r  r  r  rN   r  r   rM   r  )r  rq   aot_inductor_modecompile_standalonecopyrS   r  ry   test_configsuse_libtorchaot_inductorcross_target_platformpackage_constants_in_sor  )r  r  r  r  r  r  s         rX   maybe_aoti_standalone_configr    s   "	 	 	 	
3 
3 
3 
3 (++. 3 
 $((**N R^%DdKKK^%GNNNAu}GXCX	
 	
 	
 	I<	
 	
 	
 	(,	
 	
 	

 	>+I5QQQ*..,1 
 -00.3 
 	)).E)]
 
 	

 rZ   consts_sizetuple[bool, bool]c                d   t           j        j        r$t           j        j        dk    rt	          d          t           j        j        r,t           j        j        dk    rt	          d          d}d}||fS t           j        j        dk    rd}d}||fS | dk    rdS d}t          j                     }||fS )	a  
    Decide whether we should mmap weights, and whether to store the weights with .so.

    If force_mmap_weights or package_constants_on_disk_format == "binary_blob" configs are set, respect the config.

    Returns tuple (use_external_weights, use_mmap_weights).
    binary_blobzconfig.aot_inductor.package_constants_on_disk_format = binary_blob and config.aot_inductor.force_mmap_weights cannot both be True.r  zKwhen cross_target_platform is windows, use_mmap_weights should not be true.TFi 5w)FF)rq   r  force_mmap_weights package_constants_on_disk_formatr  r  r  )r  use_mmap_weightsuse_external_weightss      rX   determine_aoti_mmap_flagsr    s     	.
@MQQJ
 
 	

 - 64	AA]    $#%555;}LL# #%555m##| !+---!111rZ   c                     ddl m}  | j        j        }|dS t	          |t
                    st          d          |dk    rdS t          j        d|          st          d          dS )	zL
    Validates if a model name is suitable for use in code generation.

    r   rp   NTz4Invalid AOTI model name: Model name must be a stringr6  z^[a-zA-Z_][a-zA-Z0-9_]*$zVInvalid AOTI model name: Model name can only contain letters, numbers, and underscores)	r  rq   r  model_name_for_generated_filesr   rN   r  r   r   )rq   
model_names     rX   is_valid_aoti_model_namer  8  s    
 '&&&&&$CJtj#&& QOPPPRt 8/<< 
d
 
 	
 4rZ   r*   unbacked_onlyOrderedSet[sympy.Symbol]c                B    |rt          |           S t          |           S r   )r)   r(   )rW   r  s     rX   get_free_symbolsr  S  s$     $Q'''ArZ   dict[str, str]c                    i t           j        dt           j                            dt           j                            t
          j                            i} t          j                    rt          j
        d          | d<   | S )zA
    Get a base environment for running Python subprocesses.
    
PYTHONPATHTORCH_CUSTOM_PYTHONPATHri  
PYTHONHOME)r?  r@  r  r  rM  r  rN  rq   r  	sysconfigget_path)envs    rX   python_subprocess_envr  Z  sv    

* 	bjnn%rzsx'@'@
 
	 C   7%.v66LJrZ   c                  (    e Zd ZU dZded<   ded<   dS )CUDAGraphWrapperMetadataz
    Metadata for Customized CUDAGraphWrapper.

    Currently assumes there is 1 dynamo graph and will extend to
    multiple graphs in the future.
    r   num_partitionspartition_indexNr   rQ   rZ   rX   r  r  u  s8            rZ   r  .c                      e Zd ZU dZded<   dS )CUDAGraphWrapperNzOptional[CUDAGraphWrapperType]r!  )r   r   r   r!  r   rQ   rZ   rX   r	  r	    s#         .2G222222rZ   r	  CUDAGraphWrapperTypec                    | t           _        d S r   )r  r!  )r!  s    rX   !set_customized_partition_wrappersr    s    5<*222rZ    tuple[list[Any], dict[str, Any]]c                \   | j         j        }| j                             g || j         j        | j         j                  }| j         j        }t          j        ||f          \  }}ddfd|D             }dddfd	fd
|D             }t          j        ||          \  }}||fS )NrM   r   c                    t          | t          j        j        j                  o$t          | t          j        j        j                   S r   )r   rS   r   r2  r@   GeneratorStaterD  s    rX   _is_tensor_irz(snode_args_kwargs.<locals>._is_tensor_ir  sA    !U_/677 

u!0A
 A
 =
 	
rZ   c                v    g | ]5} |          r&t           j        j                            |d           n|6S )F)guard_shape)rS   r   r2  ir_node_to_tensor)rV   r:  r  s     rX   rY   z%snode_args_kwargs.<locals>.<listcomp>  sZ         =	,,QE,BBB  rZ   r  c                0    t          j        | ||          S )Nr   )rS   r   )r  r   r   s      rX   _tensorz"snode_args_kwargs.<locals>._tensor  s    {4uV<<<<rZ   r   r   c                    t          | t          j                  s| S  |                                 | j        | j                  }|S r   )r   rS   r  r  r   r   )r   r  r  s     rX   to_real_tensorz)snode_args_kwargs.<locals>.to_real_tensor  s@    !U\** 	Hgaffhh22
rZ   c                &    g | ]} |          S rQ   rQ   )rV   r:  r  s     rX   rY   z%snode_args_kwargs.<locals>.<listcomp>  s#    666q""666rZ   r  )rM   r  )r   r   rM   r   )r  r  fill_non_provided_argsconstant_argsr  pytreer$   tree_unflatten)rA  r   r  	flat_argsflat_args_pytree_specr  r  r  s        @@@rX   snode_args_kwargsr     s   :D:,,*$*)*
 D ZF'-':D&>'J'J$I$
 
 
 

    	  I= = = =      7666I666I(4IJJLD&<rZ   rd  r;   c                    ddl m} | j        }|j        j        r"|                    |j        j        dz             }|                    d          S )Nr7   r  r   )primals_r  fwd_rng_staterz  rw  )r  r  r   r  removeprefixr  )rd  r  dep_names      rX   is_nonfreeable_buffersr&    s`    xH 	w| =(();<<I  rZ   template_dirr/   c                    t          ||  dz            5 }|                                cddd           S # 1 swxY w Y   dS )z,Load a template file and return its content.z	.py.jinjaN)openread)r   r'  rP  s      rX   load_templater+    s    	l////	0	0 Avvxx                 s   7;;c                    | j         }t          |t          j        j        t          j        j        f          sJ dt          |                       t          j        sdS t          t          j
        j        j        j        t          j
        j        j        j        g          }||v rdS t          t          j
        j        j        g          }t          |t          j        j                  r||v S t#          |            S )zLDecide whether fallback for a node. This is only used in inductor lite mode.z6Expected OpOverload or HigherOrderOperator, but found F)r  r   rS   r  r  r@  r%  rq   fallback_by_defaultr#   r  r  _assert_scalarr:  lift_fresh_copyhigher_order triton_kernel_wrapper_functionalr!   )r  r  "skip_fallback_due_to_dynamic_shapefallback_hopss       rX   should_fallback_by_defaultr4    s   [F&
(FG  O ONVNNO O  % u *4IN)1IN*2	
* *& 333u 			@A M &%*899 '&&&t,,,,rZ   )	z-torch.ops._c10d_functional.all_reduce.defaultz.torch.ops._c10d_functional.all_reduce_.defaultz9torch.ops._c10d_functional.all_gather_into_tensor.defaultz8torch.ops._c10d_functional.reduce_scatter_tensor.defaultz4torch.ops._c10d_functional.all_to_all_single.defaultz6torch.ops._c10d_functional_autograd.all_reduce.defaultzBtorch.ops._c10d_functional_autograd.all_gather_into_tensor.defaultzAtorch.ops._c10d_functional_autograd.reduce_scatter_tensor.defaultz=torch.ops._c10d_functional_autograd.all_to_all_single.defaultc                    | t           v S )z0Check if an operation is a collective operation.)COLLECTIVE_OPS)r  s    rX   is_collective_opr7    s    n$$rZ   c                 b    t          j                    r	 ddlm}  | S # t          $ r g cY S w xY wg S )Nr   tlx_only_cuda_options)rq   r  )torch._inductor.fb.tlx_templates.registryr:  r  r9  s    rX   r:  r:    s[     
	WWWWWW(( 	 	 	III	 	s    ,,yc                    | |z   dz
  |z  |z  S )z(Round x up to the nearest multiple of y.r7   rQ   )rW   r<  s     rX   	_round_upr>  "  s    UQY1!!rZ   mat_sizetuple[Any, Any]
scale_sizetuple[Any, ...]scale_numel	mat_dtypescale_dtypeeq_fnCallable[[Any, Any], bool]#tuple[Optional[Any], Optional[Any]]c                   ddl m}m}  ||d          r|j        |j        fS t          |          dk    rF ||d         | d                   r ||d         d          s* ||d         d          r& ||d         | d                   r|j        |j        fS  ||d         | d                   r& ||d         t          | d         d                    s> ||d         | d                   r4 ||d         t          | d         d                    r|j        |j        fS  ||d         t          | d         d                    r4 ||d         t          | d         d                    r|j	        |j        fS |t          j        k    rdnd}|t          j        k    r|t          j        k    rt          | d         d          t          t          || d         z  d          d          z  }	t          | d         d          t          t          || d         z  d          d          z  }
 |||	          s |||
          r|j        |j        fS |t          j        k    rt          j        j        st          | d         d          t          t          || d         z  d          d          z  }	t          | d         d          t          t          || d         z  d          d          z  }
 |||	          s |||
          r|j        |j        fS njt          | d         d          |z  | d         z  }	t          || d         z  d          | d         z  }
 |||	          s |||
          r|j        |j        fS d	S )
z:
    Core implementation for scale/swizzle inference.
    r   )r6   SwizzleTyper7   r  r{   rz   r  rw  NN)torch.nn.functionalr6   rJ  
TensorWise
NO_SWIZZLEr\   RowWiserr   BlockWise1x128BlockWise128x128rS   r]  rY  r>  BlockWise1x16SWIZZLE_32_4_4r[  r  ry   BlockWise1x32)r?  rA  rC  rD  rE  rF  r6   rJ  K_multiplierexpected_numel_aexpected_numel_bs              rX   _infer_scale_swizzle_implrX  '  s>    =<<<<<<< u[! >%{'=== :!E*Q-!-- 	?%%
1q2I2I 	?E*Q-##	?(-jmXa[(I(I	? &(>>> E*Q-!--	FjmWXa[#%>%>??	F E*Q-!--		F
 jmWXa[#%>%>??	F -{/EEE 5AS 9 9:: 	HuuqM78A;44@
 @
 	H /1GGG "U%;;;11L E***{e>Q/Q/Q$Xa[#66L8A;.33Q:
 :
 
 %Xa[#66L8A;.33Q:
 :
 
 5.// 	I55FV3W3W 	I,k.HHH e***}  	I(!c::Yx{2B77> >    )!c::Yx{2B77> >   u["233 Muu-8 8 M #0+2LLLM  'x{B77,FRST&|hqk'A2FFRSTu["233 Iuu-8 8 I #0+2HHH:rZ   matscalec                    t          | j        d         | j        d         ft          |j                  |                                | j        |j        d           S )a  
    Infer the scaling type and swizzle mode from matrix and scale tensor shapes/dtypes.

    This function determines how scale factors are laid out relative to the matrix:
    - TensorWise: Single scale for entire tensor
    - RowWise: One scale per row
    - BlockWise1x128/128x128: Block-scaled with float32 scales
    - BlockWise1x32: MXFP8 with float8_e8m0fnu scales (swizzled on NVIDIA)
    - BlockWise1x16: NVFP4 with float8_e4m3fn scales (swizzled)

    Args:
        mat: The matrix tensor (FP8 or FP4)
        scale: The scale factor tensor

    Returns:
        Tuple of (ScalingType, SwizzleType) or (None, None) if unrecognized
    r   r7   c                    | |k    S r   rQ   r  s     rX   r  z%infer_scale_swizzle.<locals>.<lambda>  s
    16 rZ   r?  rA  rC  rD  rE  rF  )rX  ro  r  numelr   )rY  rZ  s     rX   infer_scale_swizzler_  w  sW    ( %)A,	!-%%KKMM)K!!   rZ   r>   	transposec                   ddl m |                                 }|                                }|r|d         |d         f}|r t          j        t
          j        |d          nd}dfd	}t          t          |          d
k    r|d         |d         fn	|d         dft          |          || j
        |j
        |          S )z
    Infer the scaling type and swizzle mode for IR nodes (used during graph lowering).

    This is the IR-compatible version of infer_scale_swizzle, using symbolic
    size comparisons via V.graph.sizevars.statically_known_equals.
    r   r  r7   r:  r   r;  rM   r   c                D    j         j                            | |          S )z5Compare values using symbolic equality when possible.ru  )r:  r;  r  s     rX   symbolic_eqz+infer_scale_swizzle_ir.<locals>.symbolic_eq  s    w771===rZ   r  r]  )r:  r   r;  r   rM   r   )r  r  re  r-  r.  r/  r0  rX  r\   r  r   )rY  rZ  r`  r?  rA  rC  rc  r  s          @rX   infer_scale_swizzle_irrd    s     .-----||~~H!!J  .QK!- DNT)"8<Q???STK> > > > > > %/28}}/A/A(1+x{++QRUVGW$$)K   rZ   r  )r   r   rM   r   )r   r   rM   r   )r   r   )r   r   r   r   r   r   rM   r   )r   r   F)
r   r   r   r   r   r   r   r   rM   r   r  )r   r  rM   r   )r*  r+  rM   r   )r5  r6  r7  r6  rM   r   )r*  r>  rM   r?  )rE  rF  rG  rF  rM   rF  )rM  rN  rM   rN   )r|  r}  rM   r~  )r  r  rM   rF  )r   rF  rM   r  )r|  r  rM   r  )r  r  rM   r   )r  r5   r  r  rM   r   )r  r   r   r  r  r  rM   r  )rH   )r   rN   rM   r  )r7   rH   )
r  r  r  r  r   r   r   rN   rM   r   )rQ   r  r  r  rH   )r  r  r  r  r   r   r  r   r  r   r   rN   rM   r   )r  r   r  rN   rM   r  )r  r   r  r   rM   r  )r:  r   r;  r   rM   r   )rW   r  r  r   rM   r  )rW   r  rM   r  )r   r  rM   r  )r%  rN   rM   r&  )r-  r.  rM   r/  )r-  r9  r:  r;  rM   rN   )r-  r.  r!  r:   rM   rS  r   )r  r  r  r  rM   r  )r   r  r  r  rM   r  r  )r$  r   rM   r  )r  rN   rM   r   )r  rm   rZ  r   rM   r  )r  r   rM   r   )r   rN   rM   r  )r  r   r  r  rM   r   )r:  r   rM   r  )r   r   rM   r   )r  r  rM   r  )r  r  rM   r5   )r  r  rM   r  r  )r  r   rM   r   )rM  rN   r   rN   rM   r=  )NNT)rC  rD  rE  rF  rG  r   rM   r=  )re  r  rd  r   rM   rf  )r  r+   re  rn  rd  r   rM   rf  )r   r  rM   r   r  r
  )r  r  rM   r   r  )r(  r   r   r   r)  r*  rM   r8   )rm  rA   r4  r5  rM   r   )r9  rN   rM   r   )
rm  rA   rF  r   rG  r   rH  r   rM   r   )rV  r@   rT  rW  rU  r   rM   r   )rV  r@   rT  rA   rU  r   rM   r   )r  r6   r  r6   r  r  rM   r   )r  r   r  r   rm  rA   r  r   r  r   r  r  r  r  r  r  rM   r   )
rm  rA   r1  r   rW  r   r  r   rM   r   rK  )rm  rA   r1  r  rW  r  r  r  r  r@   r  r@   r  r  r  r  rM   r   )r  rN   rM   r   r   )
r1  r  rW  r  r  r  r  r   rM   r   )r1  r  rW  r  r  r  rM   r   )r1  r  rW  r  r  r  rM   rf  )r   rN   rM   rN   )rM   r  )rm  rA   rM   r   )rm  rA   r&  r'  r(  r@   rM   r   )FTFN)rm  rA   r&  r@   r(  r@   r2  r   r+  r   r3  r   r4  r*  rM   r   )r   rb  r   r
  r  r  rM   rc  )r   r  rM   rv  )r   rb  r   r
  r  r  rM   r   )r   rb  r   r
  r  r  rM   rN   )r   rb  r   r
  r  r  rM   r  )r  r  r  r  rM   r=  )r  r  r  r  rM   r   )r  rN   rM   r  )rM   rF  )r  r  rM   r   )r  r  rM   r   )r  r   rM   r  )r  r   r   r   r  r   rM   r  )r   r  rM   r   )r  rN   rM   r   )r  rN   rM   r   )r  r  rM   r   )
r  r  r  r4   r  r  r  rN   rM   r  )r%  r&  rM   r   )r  r/  r  r0  rM   r   )r  r=  rM   r   )rA  rF   rB  rC  rM   r   )rA  rF   rM   r   )r  rM  r  rN  rM   r   )rQ  rN   rR  r  rS  r  rM   r   )rA  rF   rZ  r[  rR  r\  rS  r]  r^  r_  rM   r  )rm  r   rn  r   rM   r   )rt  r  rM   r   )r  r   rM   r=  )r   rN   rM   rN   )r   rF  rM   r   )r   rN   rM   r   )r   r  rM   r   )r7  r  r  rF  r  r  r  r  r  rN   r  r   rM   r   )r-  r9  rM   r  )r   r  rM   r   )r  r  rM   r   )rM   r  )r   rb  r   r
  r  r  rM   r  )r  r  rM   r  )r  r  r  r  r  r  rM   r  )rW   r  rM   r  )r  r  r  r  r  r  rM   r  )r  r  r  r  rM   r  )r   r   rM   r   )r  r  r  rE   rM   r  )r   r  rM   rN   )r   rN   rM   r  )ri  r  r   r  rM   r   )rM   r,  )r   rN   r=  r3   r>  rN  rM   r  )r   rN   rM   r  )r   rF  rM   rN   )r   r  rM   r  )r   rg  r   r   rM   r   )rM   ro  )rM   ru  )r  r  rM   rS  )r  r  rM   r   )r  rB   rM   r   )r!  r:   rM   r   )r  r   rM   r  )r   rN   rM   r   )r  r  r  r  rM   rN   )
r  rS  r  rS  r  r  r  r  rM   r  )r  r  rM   r  )r  r   rM   r  )rW   r*   r  r   rM   r  )rM   r  )r!  r
  rM   r  )rA  rF   rM   r  )rd  r;   rM   r   )r   rN   r'  r/   rM   rN   )r  r  rM   r   )rM   r   )rW   r   r<  r   rM   r   )r?  r@  rA  rB  rC  r   rD  r  rE  r  rF  rG  rM   rH  )rY  r  rZ  r  rM   rH  r  )rY  r>   rZ  r>   r`  r   rM   rH  (  
__future__r   r  r  rk  enumr-  r  r  r  r  r  r  r/  r?  r  r   r_  r   r  r   rY  r  r  r  collections.abcr   r   r   r   r   r	   r
   r   r   r   typingr   r   r   r   r   r   r   r   r   r   r   r   r   typing_extensionsr   r   r   r   r   rS   torch.utils._pytreer,  _pytreer  $torch._inductor.analysis.device_infor   torch._inductor.runtime.hintsr    !torch.fx.passes.regional_inductorr!   torch.utils._dtype_abbrsr"   torch.utils._ordered_setr#   r$   r%   OPTIMUS_EXCLUDE_POST_GRADr  r(   r)   r*   r+   r,   r-   r.   pathlibr/   r0   r1   r2   torch._prims_commonr3   torch.fxr4   torch.fx.noder5   rL  r6   r/  r8   r  r:   dependenciesr;   r  r=   r2  r>   r?   r@   rA   rB   rC   output_coderE   r  rF   rG   r[   rL   r   r`   torch._dynamo.device_interfacera   torch._dynamo.utilsrb   torch.autogradrc   torch.autograd.profiler_utilrd   (torch.fx.passes.graph_transform_observerre   torch.fx.passes.shape_proprf   torch.utils._sympy.functionsrg   rh   ri   rj   rk   torch.utils._sympy.symbolrl   rm   torch.utils._sympy.value_rangesrn   ro   r6  rq   runtime.runtime_utilsrr   rL  _IS_WINDOWS	getLoggerr   r   rt   r[  rJ  	VarRangesr  r   	InputTypegetenvXPU_KERNEL_FORMATGPU_KERNEL_BIN_EXTSr  r  r]  r1  rp  re  rq  rg  rr  ri  rk  r   r_  ra  rc  rY  rZ  float8_e4m3fnuzfloat8_e5m2fnuzr|   r   r   r   r   Functionr   rl  r   r   r  r  r  r)  r4  r=  rD  r{  r  r  r  r  r  r  r  r   r  r  r  r  r  r  r  r  r  FN_TYPEr  r"  r$  r,  r8  rR  r  r  r  r  r  r  r  r  r  r  r  r  	frozensetr  r  r  r  r.  r3  r4  r:  r<  r  rB  rc  clear_on_fresh_inductor_cacheclear_inductor_cachesfresh_inductor_cacherm  r  r  r  r  r  r  r  r  r  r  r   r#  r'  r3  r8  r@  rE  rR  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r#  r%  r1  r,  rT  rV  rm  ru  rz  r  r  r  r  r  r  r  r  r  r  r  r  r  rO  r  r  r  r  r  r  r  r  r^  rL  r  Enumr  r$  r+  r.  r<  r@  rH  rL  rP  rV  rc  rj  rs  r~  r  r  r  r7  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r	  r  r  r  r  r   compiler  r  r#  r(  r+  r:  r<  r?  rA  rB  rD  rJ  rM  rN  rO  rQ  rn  rs  ru  r  r  r  r  r  r  r  r  r  r  SUPPORTED_MKLDNN_DEVICESr  r  r  r  r  r  r  r  r  r  PartitionFnTyper
  r	  r  r  r   r&  r+  r4  r6  r7  r:  r>  rX  r_  rd  rQ   rZ   rX   <module>r     sU   " " " " " " "                       				        				  				      



                                                                         C B B B B B B B B B         $ $ $ $ $ $ $ $ $ ? ? ? ? ? ? : : : : : : E E E E E E 0 0 0 0 0 0 / / / / / / ; ; ; ; ; ; ; ; ($ 
             >>>>>>>>>>>//////////CCCCCC$$$$$$""""""//////,,,,,,555555!!!!!!$$$$$$TTTTTTTTTTTTTTTT,,,,,,======== +**	GCLL
     D C C C C C 0 0 0 0 0 0 % % % % % % 2 2 2 2 2 2 K K K K K K 0 0 0 0 0 0              8 7 7 7 7 7 7 7 D D D D D D D D       = = = = = = lg%g!! WT]]UZ'(	U5<el:;<	 SEEibi(I7SS 
 " ""   	 
 2<
2 2     ( {Q'A--+2B2B2BDX2B2BB5 5 5 5
L L L L    EN    d###       $#"G G G G GX #(	    4 #(	_ _ _ _ _D       ; ; ; ;@ @ @ @
+ + + +* * * *#A #A #A #AL+ + + +	 	 	 	   "/ / / /G G G G @OI I I I I0   0' ' ' ' ' 	    ( %'     ) ) ) )' ' ' '# # # #   $ $ $ $  IcNNWTT"""
;sAv&*
+E E E E E8WQU^ E E E   :   + + + +\   ,4) 4) 4) 4)nW2 W2 W2 W2x 48    (G G G G:, , , ,^% % % %   	D 	D 	D 	DU U U U	> 	> 	> 	>   2   - - - - $)    $      ' ' ' '& 
			   < !#  " " " "	 	 	 	    " " " "( .27 7 7 7 7v !5 $ "  49      ( 	$ $ $ $ $ $N Q7 7 7 7    *    , , , , , , , ,
S' S' S' S' S' S' S' S'l
 
 
 
 
 
 
 
 @ @ @ @       @? ? ? ? ?' ? ? ?     8 J J J J ) ) ) )I I I I #'    (           #     < :>RW| | | | | |@ BG      BGV V V V V V O O O O Q	 	 	  	 Q	 	 	  	 Q    > > > >B   P " E E E E EPJ J J J CO, , , , , EF    (    . 5( 5( 5( 5(p @ @ @ @ R R R R:" " " "J      H H H H& & & &   : ""&"&= = = = =@' ' ' 'C C C C C C C C"& & & &$   / / / /( ( ( (V   	 	 	 	# # # #* . . . .$ IMF F F F F*	 	 	 	   B& & & && & & &              D D D D '6 '6 '6 '6T    Q Q Q Q	. 	. 	. 	.0 0 0 0   ( ( ( (# # # #K K K K   (* * * * *$) * * * 
  
  
  
F      " -1# # # # #L( ( ( ( @D	Q 	Q 	Q 	Q 	Q# # # #J J J JL L L L *=)<
 
 
 
 
< *=)<    :T T T T       2            ,! ! ! !   ) ) ) )
. . . .2 2 2 2$ $ $ $NH H H HB   L L L L' ' ' '    2   :   0	: 	: 	: 	: 37$ $ $ $ $<   $$3 $3 $3 $3N   :   &/ / / / '#)* $%
 
  HG*>*D*D*F*FGGG  "*Y''H H H H   	 	 	 	       & 1 1 1 1 1 1 1 1
 68  7 7 7 7    /9jll  : : : :) ) ) )# # # # #    ')


')

-" -" -" -" -" 01 -" -" -"` D)))t      *)       49    4 4 4 42Y Y Y Y5 5 5 5! ! ! !H" " " "J   4
 
 
 
   1 1 1 1    * 	 	 	 		 	 	 	   & "&!%	 
  
  
  
  
FR R R Rj&2 &2 &2 &2R   6      6 d###       $# 38$./@ 3 3 3 3 3 3 3 3 *:)9);); &= = = =       F
 
 
 
   "- "- "- "-L 
 
 
 % % % %
    " " " "
M M M M`   B " " " " " " "rZ   