
    WjnI                      U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlmZmZ d dlmZmZ d dlmZmZmZmZmZmZmZ d dlmZ d dlmZ dd	l m!Z! erd d
l"m#Z#m$Z$m%Z% d dl&m'Z' d dl(Z(d dl)Z)d dl*Z)d dl+m,c m-Z. d dl/m0Z0m1Z1 d dl2m3Z3 d dl4m5Z5m6Z6 d dl7m8Z8 d dl9m:Z:m;Z; d dl<m=Z= d dl>m?Z?m@Z@mAZA d dlBmCZC ddlDmEZEmFZFmGZGmHZHm Z mIZI ddlJmKZK ddlLmMZMmNZNmOZO ddlPmQZQmRZR ddlHmSZSmTZTmUZUmVZV ddlWmXZXmYZY ddlZm[Z[ ddl m\Z\m]Z]m^Z^m_Z_m`Z`maZa ddlbmcZc ddldmeZemfZf ddlgmhZh ddlimjZjmkZk dd llmmZm dd!l,mnZnmoZompZpmqZqmrZrmsZsmtZtmuZumvZvmwZwmxZxmyZymzZzm{Z{m|Z|m}Z}m~Z~mZ dd"lmZ  ej        e          Ze)j                            ed#          Ze)j                            ed$          Ze)j                            ed%          Ze)j                            ed&          Zed'         Zd(ed)<    ed*          Z ed+          Zej         G d, d-                      Zej         G d. d/                      Z G d0 d1          Zej         G d2 d3                      Zej         G d4 d5e                      Z G d6 d'          Zej        dd9            Zdd<Zdd>Zdd@Z ej        dAB           G dC dD                      ZddGZ G dH dI          ZddPZ G dQ dRe          Z G dS dTe          Z G dU dVe          ZddYZdd^Z G d_ d`e          Z G da dbe          Z G dc dde          Z G de dfe          Z	 dddoZddtZddvZej         G dw dx                      Z ej                    ZddzZdd}Zdd~ZddZddZddZddZ G d d[          Z G d d          ZdS )    )annotationsN)Counterdefaultdict)as_completedFuture)AnyGenericOptionalTYPE_CHECKING	TypeAliasTypeVarUnion)	ParamSpec
OrderedSet   )ComputedBuffer)CallableIteratorSequence)
ModuleType)countersdynamo_timed)use_pipelined_autotuning)LambdaFuturePyCodeCache)TritonTemplateCallerBase)get_metric_tableis_metric_table_enabled)free_symbols)free_symbol_is_typesymbol_is_typeSymT)
has_triton)commsconfigconfig_commsdependenciesirmetrics)can_codegen_without_upcasts)BackendFeatureget_scheduling_for_deviceKernel) estimate_nccl_collective_runtime/estimate_nccl_collective_runtime_nccl_estimator)Dep	MemoryDepStarDepWeakDep)GPUTooOldForTritonTritonMissing)count_flops_fx)assign_origin_nodeget_device_typeGraphPartitionSignatureMultiOutputMultiOutputLayout
NoneLayout)LoopBody)MemoryPlanningInfoForBufferMemoryPlanningInfoForNode)ReductionHint)
green_textred_text)SimplifyIndexing)&_unstable_customized_partition_wrappercache_on_selfcmpdevice_need_guardget_current_backendget_device_tflopsget_dtype_sizeget_gpu_dram_gbpsget_op_namesGraphPartitionMapIndentedBufferis_collectiveis_cudagraph_unsafe_opis_gpuis_multi_outputs_template#is_output_of_multi_outputs_templateis_waitsympy_product)Vfusionloop_orderingcompute_dependencies
cudagraphsBaseSchedulerNoder   PartitionType_T_Pc                  t    e Zd ZU dZded<   dZded<   dZded<   d Zedd
            Z	e	 ddd            Z
dS )FusionResultNzOptional[bool]should_fusezOptional[Callable[[], bool]]callable_fnOptional[LambdaFuture]futurec                @    | j         d u| j        d uz  s
J d            d S )NzLFusion result should contain either fusion decision or callable_fn, not both)rb   rc   selfs    ^/home/longshao/multi-rider-rag/.venv/lib/python3.11/site-packages/torch/_inductor/scheduler.py__post_init__zFusionResult.__post_init__n   s>     ,1A1MN 	
 	
Z	
 	
N 	
 	
    boolc                "    t          |          S )N)rb   ra   )clsrb   s     ri   fusezFusionResult.fuses   s    4444rk   Callable[[], bool]c                $    t          ||          S )Nrc   re   rn   )ro   rc   re   s      ri   from_callablezFusionResult.from_callablew   s     FCCCCrk   )rb   rl   N)rc   rq   re   rd   )__name__
__module____qualname__rb   __annotations__rc   re   rj   classmethodrp   rt    rk   ri   ra   ra   h   s         "&K&&&&04K4444%)F))))
 
 

 5 5 5 [5 OSD D D D [D D Drk   ra   c                  D    e Zd ZU ded<   ded<   ded<   dZded<   ddZdS )PendingFusionrq   rc   r\   node1node2Nrd   re   return+tuple[BaseSchedulerNode, BaseSchedulerNode]c                    | j         | j        fS ru   r~   r   rg   s    ri   get_fusion_nodeszPendingFusion.get_fusion_nodes   s    
DJ''rk   )r   r   )rv   rw   rx   ry   re   r   r{   rk   ri   r}   r}   ~   s_         ####%)F))))( ( ( ( ( (rk   r}   c                  2   e Zd ZdZedd            Zedd            Zedd            Zedd            Z	edd            Z
edd            Zed d            Zed!d            Zedd            Zedd            Zedd            Zed"d            ZdS )#MixOrderReductionz
    This class contains utility functions to decide if we should fuse reductions
    reducing across different dimensions of the same input tensor.
    noder\   r   rl   c                    |                                  o*t          d |                                 D                       S )Nc              3     K   | ]U}t          |t                    |                                +t          |j        t                    E|j        j        d uV  Vd S ru   )
isinstanceSchedulerNodeis_reductionr   r   _split_size.0subnodes     ri   	<genexpr>z7MixOrderReduction.is_split_reduction.<locals>.<genexpr>   s|       +
 +
'=11+
 $$&&	+

 7<88+
L$D0+
 +
 +
 +
 +
 +
rk   )r   all	get_nodesr   s    ri   is_split_reductionz$MixOrderReduction.is_split_reduction   sM      "" 
s +
 +
>>+++
 +
 +
 (
 (
 	
rk   tuple[sympy.Expr, sympy.Expr]c                   |                      |          rjd }d }|                                D ]H}t          |t                    r.|                                rt          |j        t                    sG|j        j        J t          j	        j
                            t          |j        j                            }|j        j        J t          j	        j
                            t          |j        j                            }||}|}t          j	        j
                            ||          sJ | d|             t          j	        j
                            ||          sJ | d|             J|J ||fS |j        d         S )N v.s. r   )r   r   r   r   r   r   r   _original_rangesrW   graphsizevarssimplifyrV   _original_reduction_rangesstatically_known_equalsgroup)ro   r   xnumelrnumelr   	curxnumel	currnumels          ri   get_numel_rnumelz"MixOrderReduction.get_numel_rnumel   s   !!$'' "	!FF>>++ 4 4w66,,.. #7<@@
 |4@@@G,55!',"?@@ 	 |>JJJG,55!',"IJJ 	 >&F&FF7+CC	  4 4 33	334 4  7+CC	  4 4 33	334 4  4 %%%F##:a= rk   r~   r   c                   |                      |          }|                      |          }t          |          dk    st          |          dk    s||k    rdS t          |          t          t          |                    k    S )N   F)r   lentuplereversed)ro   r~   r   g1g2s        ri   has_mix_reduction_ordersz*MixOrderReduction.has_mix_reduction_orders   ss     !!%((!!%((r77a<<3r77a<<2885RyyE(2,,////rk   bufstrc                .   d}|j         j        D ]&}t          |t                    r|j        |k    r|} n'|sdS |j        }|j         j        }|sDt          |t                    sJ t          |                       |j	        d         j         j        }|sJ t          |          t          |j                  z
  sdS t          j        j                            t!          |j                  t!          |                                                    rdS dS )z@
        The access to 'buf' is not a broadcast access.
        NFr   T)read_writesreadsr   r2   nameindex
var_rangesFusedSchedulerNodetypesnodesr   r    rW   r   r   r   rV   sizevalues)ro   r   r   	found_depdepr   r   s          ri   _is_full_accessz!MixOrderReduction._is_full_access   s"   
 	#) 	 	C#y)) ch#oo	 	5%0
 	?d$677HHDJJHH7Q3>Jz:&&E4F)G)GG 	4
 733).))=9J9J9L9L+M+M
 
 	 4urk   	list[str]c                    g }|                                 |                                 z  }|D ]C}|                     ||          r+|                     ||          r|                    |           D|S ru   )used_buffer_namesr   append)ro   r~   r   outcommon_readsr   s         ri   get_common_readz!MixOrderReduction.get_common_read   s~     ..0053J3J3L3LL 	  	 C""3..  33F3FsE3R3R  

3
rk   c                P    t          |                     ||                    dk    S Nr   )r   r   ro   r~   r   s      ri   has_common_readz!MixOrderReduction.has_common_read   s'     3&&ue445599rk   intc                    |                      |          }t          j        j                            |d         |d         z  d          S )Nr   r   fallback)r   rW   r   r   optimization_hint)ro   r   r   s      ri   	get_numelzMixOrderReduction.get_numel   s?    !!$''w11"Q%"Q%-!1LLLrk   c                ,    |                      |          S ru   )r   r   s      ri   get_fusion_scorez"MixOrderReduction.get_fusion_score  s    
 }}U###rk   c                   t           j        j        sdS t          j        j        rdS |                                r|                                sdS |                                j        }|dvst          |          dk    rdS |
                                r|
                                sdS |j        |                                z  s|j        |                                z  rdS |                     ||          sdS t                              ||          }t!          |          dk    rdS |                     |          r||}}n|                     |          r||}}ndS |                     |          }|\  }}	t           j        j        sd}
t          j        j                            t-          j        ||	z  |
                    sdS t          j        j                            t-          j        ||	dz                      sdS t          j        j                            t-          j        |d                    sdS t1          d |                                D                       rdS t          j        j                            |	d	          sdS t7          d
 |                                D                       }|S )zP
        Check whether we can fuse two reductions with mix loop orders.
        F)cudaxputritonr   i  P r   i   c              3     K   | ]A}|                                 |j        j        j        t          j        t          j        fvV  Bd S ru   )r   r   datareduction_hintrA   INNERDEFAULTr   s     ri   r   z-MixOrderReduction.can_fuse.<locals>.<genexpr>V  sh       
 
 ##%%
L,#%
 
 
 
 
 
rk   i @  c              3  t   K   | ]3}|                                 |j                                        d v V  4dS )>   sumprodN)r   r   get_reduction_typer   s     ri   r   z-MixOrderReduction.can_fuse.<locals>.<genexpr>i  sb       
 
 ##%%
L++--
 
 
 
 
 
rk   )r&   r   mix_order_reductionrW   r   cpp_wrapperrR   
get_devicer   rI   r   	ancestorsget_operation_namesr   r   r   r   is_contiguous_noder   #mix_order_reduction_non_strict_moder   guard_or_truesympyGeanyr   statically_known_leqr   )ro   r~   r   device_typer   contiguous_node
other_noder   nrowncol
size_thresr   s               ri   can_fusezMixOrderReduction.can_fuse  s   
 }0 	5 7 	5||~~ 	U\\^^ 	5&&((-..";//8;;5!!## 	5+=+=+?+? 	5Oe77999 	Oe77999	 5 ++E599 	5 )88FF|!!5!!%(( 	*/ZOO##E** 	*/ZOO5!!/22
d }@ 	 #J
 7#11%(4$;
2S2STT u
 7#11%(42J2JKK u
 7#11%(42F2FGG u  
 
 +4466
 
 
 
 
 		 5
 w44T9EE 	5  
 
 &//11
 
 
 
 
 
rk   c                .    |                      ||          S ru   )r   r   s      ri   are_mix_order_reductionsz*MixOrderReduction.are_mix_order_reductionst  s     ||E5)))rk   c                Z     t           fdj        j        D                       sdS dS )Nc              3  N   K   | ]}                     |j                  V   d S ru   )is_contiguous_loadr   )r   r   ro   r   s     ri   r   z7MixOrderReduction.is_contiguous_node.<locals>.<genexpr>|  sF       
 
7:C""38T22
 
 
 
 
 
rk   FT)r   r   r   )ro   r   s   ``ri   r   z$MixOrderReduction.is_contiguous_nodez  sS     
 
 
 
 
>B>N>T
 
 
 
 
 	 5trk   parent_nodec                   ddl m} |                                D ]}t          |t                    sJ |j        }|j        |j                 }fd|D             }t          |          dk    rT|D ]y}|j	        |         }	|j
        }
t          |
                                          }t          j        j                            |	||          }|d         dk    s|d         dk    s  dS zdS )Nr   )MemoryUsageTypec                4    g | ]}|j         k    |j        S r{   )buffer_name
index_name)r   er   s     ri   
<listcomp>z8MixOrderReduction.is_contiguous_load.<locals>.<listcomp>  s'    QQQAAMS<P<P1<<P<P<Prk   r   FT)torch._inductor.loop_bodyr   r   r   r   _bodymemory_usageLOADr   indexing_exprsr   listkeysrW   r   r   stride_vars)ro   r   r   r   r   	loop_bodyentriesindex_namesr   
index_exprr   var_symbolsr  s    `           ri   r   z$MixOrderReduction.is_contiguous_load  s#   ======))++ 	! 	!DdM22222
I,_-ABGQQQQQQQK;1$$ * ! !
&5jA
&1
 #:??#4#455g.::  $B1,,B10D0D 555! trk   Nr   r\   r   rl   )r   r\   r   r   r~   r\   r   r\   r   rl   )r   r   r   r\   r   rl   )r~   r\   r   r\   r   r   )r   r\   r   r   r~   r\   r   r\   r   r   )r   r   r   r\   r   rl   )rv   rw   rx   __doc__staticmethodr   rz   r   r   r   r   r   r   r   r   r   r   r   r{   rk   ri   r   r      s        
 
 
 
 \
 #! #! #! [#!J 	0 	0 	0 [	0    [B 	 	 	 [	 : : : [:
 M M M [M $ $ $ [$ e e e [eN * * * [*
    [    [  rk   r   c                      e Zd ZU ded<   ded<   ded<    ej        e          Zded	<    ej        e          Z	d
ed<   ddZ
ddZddZddZd dZd!dZd"dZd#dZd#dZd$dZdS )%SchedulerBuffer	Scheduler	schedulerz	ir.Bufferr   Optional[BaseSchedulerNode]defining_op)default_factorylist[NodeUser]usersr?   
mpi_bufferr   r   c                @    | j         }|J |                                S ru   )r  get_name)rh   ops     ri   defining_op_namez SchedulerBuffer.defining_op_name  s!    ~~~{{}}rk   r   c                4    t          | j        j                  S ru   )hashr   r   rg   s    ri   __hash__zSchedulerBuffer.__hash__  s    DIN###rk   c                   t                      }|                                 }|                    | dt          | j                  j                    |                    | d| j        j                    |                                 r9|                    | dt          |                                                       | 	                                r9|                    | dt          | 	                                                      t          | j                  dk    r |                    | d| j                    n}|                    | d           |                    d          5  | j        D ]}|                    | d           	 d d d            n# 1 swxY w Y   |                    d	           |                                S )
N: z
.layout = z.aliases = z.mutations = r   z	.users = z
.users = [,])rO   r  	writeliner   r   rv   layoutget_aliasespformatget_mutationsr   r  indentgetrawvalue)rh   resultr   users       ri   	debug_strzSchedulerBuffer.debug_str  s   !!}}D>>DOO$<>>???D>>DI,<>>??? 	PNN9I9I9K9K1L1LNNOOO 	TRR74;M;M;O;O3P3PRRSSStz??a;;tz;;<<<<000111q!! 1 1 J 1 1D$$ZZZ000011 1 1 1 1 1 1 1 1 1 1 1 1 1 1 S!!!!!###s   7#F''F+.F+c                4    | j                                         S ru   r   r  rg   s    ri   r  zSchedulerBuffer.get_name      y!!###rk   Nonec                ^   | j         J | j                                         sd S | j                                         sJ| j                                         s1t	          | j                                         t          j                  r+t          j	        j
                            | j                    d S t          t          j        d          r|                                 t          j        j        v rt          j        j        |                                          }|| j        j        v r| j        j        |         j         }n| j        j        |         j         }t          j	        j
                            || j                    d S t          j	        j
                            | j                    d S )Nargs)r   should_allocateget_inputs_that_alias_outputget_mutation_namesr   get_output_specr)   CommBufferLayoutrW   r   wrapper_codecodegen_allocationhasattrkernelr  inplace_update_buffersr  name_to_donated_buffername_to_bufcodegen_inplace_reuse)rh   input_buffer_nameinput_buffers      ri   allocatezSchedulerBuffer.allocate  sy   y$$$y((** 	F I2244	y++--	 $)3355r7JKK	
 G 33DI>>>F AHf%%	?18#BBB !" ? P DN$III#~D%    $~9:KLQG 66	    
 G 33DI>>>>>rk   rl   c                    | j         J t          | j         j        t          j                  st          | j                   rdS | j        D ]}t          |j         t                    r dS  dS NFT)r   r   r&  r)   r=   rS   r  
OutputNode)rh   uses     ri   can_freezSchedulerBuffer.can_free  s}    y$$$di&66 	:SI;
 ;
 	 5: 	 	C#(J// uutrk   c                @   i }|D ]r}t          |j                  |v rC|                    |t          |j                                     |t          |j                  <   [||t          |j                  <   st          |                                          | _        d S ru   )idr   merger  r   r  )rh   r  r,  rH  s       ri   	set_userszSchedulerBuffer.set_users  s    &( 	+ 	+C#(||v%%'*yy381E'F'Fr#(||$$'*r#(||$$&--//**


rk   Sequence[str]c                F    | j         J | j                                         S ru   )r   r6  rg   s    ri   r'  zSchedulerBuffer.get_aliases  s$    y$$$y55777rk   c                F    | j         J | j                                         S ru   )r   r7  rg   s    ri   r)  zSchedulerBuffer.get_mutations  $    y$$$y++---rk   Optional[torch.device]c                X    | j                                                                         S ru   )r   r8  r   rg   s    ri   r   zSchedulerBuffer.get_device
  s"    y((**55777rk   Nr   r   r   r   r   r2  r   rl   )r  r  r   r2  r   rN  r   rR  )rv   rw   rx   ry   dataclassesfieldr  r  r?   r  r  r   r.  r  rD  rI  rM  r'  r)  r   r{   rk   ri   r  r    sQ        OOO,,,,-K-dCCCECCCC.?k.?3/ / /J       
$ $ $ $$ $ $ $($ $ $ $? ? ? ?B
 
 
 
+ + + +8 8 8 8. . . .8 8 8 8 8 8rk   r  c                      e Zd ZU dZded<   dS )SchedulerDonatedBufferNr  r  )rv   rw   rx   r  ry   r{   rk   ri   r]  r]    s#         /3K333333rk   r]  c                     e Zd ZU ded<   ded<   ded<   ded<   ded<   d	ed
<   ded<   dZded<   ded<   ded<   dZded<   ded<   ded<   dZded<   dyd!Zdzd#Zd{d%Z	d{d&Z
d{d'Zd|d)Zd{d*Zd}d+Zd~d/Zdd1Zdd4Zdd5Zdd7Zdd:Zd}d;Zdd<Zdd=Zd}d>Zd}d?ZddBZd{dCZd{dDZeddE            ZeddF            ZeddG            Z eddH            Z!ddJZ"ddLZ#ddOZ$ddQZ%ddRZ&ddSZ'ddTZ(ddUZ)ddVZ*ddWZ+ddXZ,ddYZ-dd\Z.dd]Z/d}d^Z0	 dddcZ1eddd            Z2edde            Z3eddf            Z4ddiZ5ddkZ6eddm            Z7ddoZ8eddp            Z9ddrZ:ddtZ;e<ddx            Z=dS )r\   OrderedSet[str]r   z7tuple[torch.device, tuple[tuple[sympy.Expr, ...], ...]]r   
last_usager   	min_order	max_orderr@   mpi_nodedict[str, str]mutation_renamesNzOptional[ir.Operation]r   list[SchedulerBuffer]outputsdict[str, SchedulerBuffer]outputs_by_nameOptional[float]override_estimated_runtimedependencies.ReadWritesr   OrderedSet[Dep]unmet_dependenciesFrl   writtenr  r  r   r2  c                $    || _         d | _        d S )Nc                     g S ru   r{   )r4  kwargss     ri   <lambda>z,BaseSchedulerNode.__init__.<locals>.<lambda>+  s    B rk   )r  debug_device_str)rh   r  s     ri   __init__zBaseSchedulerNode.__init__(  s    $-&& 	rk   ir.Operationc                
    | _         t                       _        t          t                                _        d _         fd|                                D              _        d  j        D              _        i  _	        d S )NFc                >    g | ]}t          j        |           S ))r  r   r  )r  r  )r   outputrh   s     ri   r   z5BaseSchedulerNode._init_from_node.<locals>.<listcomp>5  sE     
 
 
  .   
 
 
rk   c                8    i | ]}|                                 |S r{   r  r   r   s     ri   
<dictcomp>z5BaseSchedulerNode._init_from_node.<locals>.<dictcomp>=  s"    LLLLLLrk   )
r   r   r   r   r`  ro  get_outputsrg  ri  re  rh   r   s   ` ri   _init_from_nodez!BaseSchedulerNode._init_from_node.  s    	#$
   
 
 
 
 **,,
 
 
  MLt|LLL !#rk   r   c                Z    t          |           j         d|                                 dS )Nz(name=)r   rv   r  rg   s    ri   __repr__zBaseSchedulerNode.__repr__F  s*    t**%AAT]]__AAAArk   c                   |                                  }t                      }|                    | dt          |           j         dt          t          | dd                    j         d| dt          | j        j                   d| dt          | j	                   d| d	t          | j        j
        | j	        z
             d| d
           |                                5  |                                 D ])}|                    |                                           *	 ddd           n# 1 swxY w Y   |                    d           	 |                    |                                            n,# t           $ r t"                              dd           Y nw xY w|                                                                S )#Longer form printout for trace logsr"  (r   N)

.writes = 
.unmet_dependencies = .met_dependencies = z.outputs = [
        r$  Ignoring error in debug_str()Texc_info)r  rO   splicer   rv   getattrr(  r   writesrn  r   r*  r~  r.  r%  debug_str_extra	Exceptionlogwarningr+  rstrip)rh   r   r   r   s       ri   r.  zBaseSchedulerNode.debug_strI  sG   }}

 	d	 #GD&$$?$?@@I  )011    %T%<==  	  #4#3#9D<S#STT	 
   	
 	
 	
 ZZ\\ 	, 	,'')) , ,

3==??++++,	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	c	HJJt++--.... 	H 	H 	HKK7$KGGGGG	H   '')))s$   %?D11D58D5'E: :&F#"F#c                    dS )N r{   rg   s    ri   r  z!BaseSchedulerNode.debug_str_extrab      rrk   r   c                ,    |                      |           S ru   )rt  rg   s    ri   _debug_str_for_devicez'BaseSchedulerNode._debug_str_for_devicee  s    $$T***rk   c                   t          | j        dd           }d}t          |t          j        j        j                  r/d|                    |                                gdd          z   }net          |t          j        j        j	                  rAd|                    |
                                |                                gdd          z   }|  | S )Nr   r  , F)shorten	multiline)r  r   r   torch	_inductorr)   	Pointwise
str_helperget_size	Reductionget_reduction_sizer   )rh   
maybe_datadata_strs      ri   debug_str_shortz!BaseSchedulerNode.debug_str_shorth  s    TY55
j%/"4">?? 		j33$$&&'% 4   HH 
EO$6$@AA 	j33..00*2O2O2Q2QR 4   H
 """"rk   c                ^    t                               d| | j        | j        j                   d S )Nz(%s: unmet_dependencies = %s, writes = %s)r  inforn  r   r  rg   s    ri   log_detailszBaseSchedulerNode.log_detailsw  s7    6##		
 	
 	
 	
 	
rk   self_depr2   	other_depc                    dS NFr{   )rh   r  r  s      ri   reorder_loops_by_dep_pairz+BaseSchedulerNode.reorder_loops_by_dep_pair  	     urk   renamesc                    fdd | j                                         D             D             | _        |                     | j                             | j                             d S )Nc                *    i | ]}|v ||         S r{   r{   )r   r   r  s     ri   r}  z:BaseSchedulerNode.update_mutated_names.<locals>.<dictcomp>  s0     !
 !
 !
w '$-rk   c              3  $   K   | ]}|j         V  d S ru   r   r   r   s     ri   r   z9BaseSchedulerNode.update_mutated_names.<locals>.<genexpr>  s$      QQcQQQQQQrk   )r   reads_and_writesre  set_read_writesrenamerh   r  s    `ri   update_mutated_namesz&BaseSchedulerNode.update_mutated_names  s{    !
 !
 !
 !
QQT-=-N-N-P-PQQQ!
 !
 !

 	T-44T5JKKLLLLLrk   r   r1   c                `    |                      | j                            |                     d S ru   )r  r   	with_readrh   r   s     ri   add_fake_depzBaseSchedulerNode.add_fake_dep  s-    T-77<<=====rk   c                X    t          d |                                 D                       S )Nc              3  f   K   | ],}|                                 p|                                V  -d S ru   )r'  r)  r|  s     ri   r   z=BaseSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>  sN       
 
9<COO4!2!2!4!4
 
 
 
 
 
rk   )r   r~  rg   s    ri   has_aliasing_or_mutationz*BaseSchedulerNode.has_aliasing_or_mutation  s<     
 
@D@P@P@R@R
 
 
 
 
 	
rk   rwc                ^    || _         | j         j        | _        |                                  d S ru   )r   r   rn  
prune_deps)rh   r  s     ri   r  z!BaseSchedulerNode.set_read_writes  s.    "&"2"8rk   future_used_buffersmutation_real_namec                z    |                                  }t          fd|D                       }||z
  | _        d S )Nc              3  D   K   | ]}                     ||          V  d S ru   )get)r   kr  s     ri   r   z3BaseSchedulerNode.set_last_usage.<locals>.<genexpr>  s4      !U!U1"4"8"8A">">!U!U!U!U!U!Urk   )used_or_aliased_buffer_namesr   r`  )rh   r  r  used_bufferss     ` ri   set_last_usagez BaseSchedulerNode.set_last_usage  sI     88::!!U!U!U!U!U!U!UUU&)<<rk   c                B    | j         D ]}|                                 d S ru   )rg  rD  )rh   r   s     ri   mark_runzBaseSchedulerNode.mark_run  s,    < 	 	CLLNNNN	 	rk   c                    t          d t          j        | j        j        | j        j                  D                       S )Nc              3  $   K   | ]}|j         V  d S ru   r  r  s     ri   r   z6BaseSchedulerNode.used_buffer_names.<locals>.<genexpr>  s8       
 
 H
 
 
 
 
 
rk   )r   	itertoolschainr   r   r  rg   s    ri   r   z#BaseSchedulerNode.used_buffer_names  sH     
 
 t'7'=t?O?VWW
 
 
 
 
 	
rk   c                    t                      d t          j        | j        j        | j        j                  D             }t          |          dk    r|                                }                    |           t          j
        j                            |          rH|                    fdt          j
        j        |                                         D                        t          |          dk    S )z
        Returns buffer names used by this node, including aliases.

        Note: is_fake WeakDeps are excluded since they are purely for ordering
        and should not affect buffer lifetime.
        c                T    g | ]%}t          |t                    r|j        |j        &S r{   )r   r4   is_faker   r  s     ri   r   zBBaseSchedulerNode.used_or_aliased_buffer_names.<locals>.<listcomp>  sF     
 
 
sG,,
 25
H
 
 
rk   r   c              3  $   K   | ]
}|v|V  d S ru   r{   )r   alias
used_namess     ri   r   zABaseSchedulerNode.used_or_aliased_buffer_names.<locals>.<genexpr>  s>         J..	  /... rk   )r   r  r  r   r   r  r   popaddrW   r   name_to_bufferr  extendr6  )rh   depsr   r  s      @ri   r  z.BaseSchedulerNode.used_or_aliased_buffer_names  s     '1ll

 
 t'7'=t?O?VWW
 
 

 $ii!mm((**CNN3w%))#..     !"!7"2244	     	 $ii!mm rk   c                R     t           fd j        D                        _        d S )Nc              3  B   K   | ]}|j         j        j        v|V  d S ru   )r   r  available_buffer_namesr   r   rh   s     ri   r   z/BaseSchedulerNode.prune_deps.<locals>.<genexpr>  sA       -
 -
xt~DDD DDDD-
 -
rk   r   rn  rg   s   `ri   r  zBaseSchedulerNode.prune_deps  sD    ", -
 -
 -
 -
.-
 -
 -
 #
 #
rk   c                     d fdt          fd j        j        D                       }                      j                            |                     d S )Nr   r1   r   rl   c                    t          | t                    sdS | j        j        j        vrdS j        j        | j                                                 }|t          j        j        v S r  )	r   r4   r   r  r@  r  rW   r   removed_operations)r   op_namerh   s     ri   should_prunez7BaseSchedulerNode.prune_weak_deps.<locals>.should_prune  s^    c7++ uxt~999un0:KKMMGag888rk   c              3  2   K   | ]} |          |V  d S ru   r{   r   r   r  s     ri   r   z4BaseSchedulerNode.prune_weak_deps.<locals>.<genexpr>  sF       
 
\\#5F5F

 
 
 
 
 
rk   r   r1   r   rl   )r   r   r   r  remove_reads)rh   	to_remover  s   ` @ri   prune_weak_depsz!BaseSchedulerNode.prune_weak_deps  s    	9 	9 	9 	9 	9 	9  
 
 
 
+1
 
 
 
 
	 	T-::9EEFFFFFrk   name_to_fused_nodedict[str, BaseSchedulerNode]c                <    t          | || j        j                   d S ru   )_prune_redundant_depsr  r@  )rh   r  s     ri   prune_redundant_depsz&BaseSchedulerNode.prune_redundant_deps  s"     	d$68RSSSSSrk   c                F    | j         J | j                                         S ru   )r   get_operation_namerg   s    ri   r  zBaseSchedulerNode.get_name  rQ  rk   c                *    |                                  S ru   r{  rg   s    ri   get_first_namez BaseSchedulerNode.get_first_name  s    }}rk   c                X    t          d |                                 D                       S )Nc              3  >   K   | ]}|                                 V  d S ru   r{  r   r   s     ri   r   z8BaseSchedulerNode.get_operation_names.<locals>.<genexpr>  s*      GGd$--//GGGGGGrk   )r   r   rg   s    ri   r   z%BaseSchedulerNode.get_operation_names  s)    GGdnn6F6FGGGGGGrk   c                >    t          d | j        D                       S )Nc              3  >   K   | ]}|                                 V  d S ru   r{  r   r   s     ri   r   z5BaseSchedulerNode.get_buffer_names.<locals>.<genexpr>  s*      AAS#,,..AAAAAArk   )r   rg  rg   s    ri   get_buffer_namesz"BaseSchedulerNode.get_buffer_names  s!    AADLAAAAAArk   c                X    t          d |                                 D                       S )Nc              3  b   K   | ]*}t          |t                    ot          |d           V  +dS )T)disallow_fp32_opsNr   r   r+   r   ns     ri   r   zABaseSchedulerNode.can_codegen_in_low_precision.<locals>.<genexpr>  sX       
 
  q-(( G+AFFF
 
 
 
 
 
rk   r   r   rg   s    ri   can_codegen_in_low_precisionz.BaseSchedulerNode.can_codegen_in_low_precision  s<     
 
 ^^%%
 
 
 
 
 	
rk   c                X    t          d |                                 D                       S )Nc              3  ^   K   | ](}t          |t                    ot          |          V  )d S ru   r  r  s     ri   r   z@BaseSchedulerNode.can_codegen_without_upcasts.<locals>.<genexpr>  sN       
 
 q-((K-H-K-K
 
 
 
 
 
rk   r	  rg   s    ri   r+   z-BaseSchedulerNode.can_codegen_without_upcasts  s:     
 
^^%%
 
 
 
 
 	
rk   Sequence[BaseSchedulerNode]c                    | gS ru   r{   rg   s    ri   r   zBaseSchedulerNode.get_nodes  s	    vrk   Sequence[SchedulerBuffer]c                    | j         S ru   )rg  rg   s    ri   r~  zBaseSchedulerNode.get_outputs  s
    |rk   buf_namer  c                    | j         |         S ru   )ri  )rh   r  s     ri   
get_outputzBaseSchedulerNode.get_output  s    #H--rk   rR  c                F    | j         J | j                                         S ru   )r   r   rg   s    ri   r   zBaseSchedulerNode.get_device  s$    y$$$y##%%%rk   c                H    |                                  }|d uo
|j        dk    S Ncpu)r   r   rh   devices     ri   is_cpuzBaseSchedulerNode.is_cpu  s(    ""T!:fkU&::rk   c                Z    |                                  }|d uot          |j                  S ru   )r   rR   r   r  s     ri   rR   zBaseSchedulerNode.is_gpu  s+    ""T!9fV[&9&99rk   c                    dS r  r{   rg   s    ri   r   zBaseSchedulerNode.is_reduction      urk   c                    dS r  r{   rg   s    ri   is_native_matmulz"BaseSchedulerNode.is_native_matmul  r  rk   c                    dS r  r{   rg   s    ri   is_split_scanzBaseSchedulerNode.is_split_scan  r  rk   c                    dS r  r{   rg   s    ri   is_templatezBaseSchedulerNode.is_template  r  rk   c                    dS r  r{   rg   s    ri   	is_externzBaseSchedulerNode.is_extern   r  rk   c                    dS r  r{   rg   s    ri   
is_foreachzBaseSchedulerNode.is_foreach#  r  rk   read_depdependencies.Depc                    dS r  r{   rh   r(  s     ri   can_inplacezBaseSchedulerNode.can_inplace&  r  rk   c                    dS r  r{   rg   s    ri   has_side_effectsz"BaseSchedulerNode.has_side_effects)  r  rk   c                    ddl m} t           t                    rt          j        rt          j                             	                                t          j                  rht          t          j        t          j        j        j        j                  rt%          t          j        dd          t'          t          j        d          sdS  j        t          j        j        z   j        j        z  d fd
}                                 D ]Z}|j        }|J |                                rM|                                s9|                                s%|                                t          j        j        v rp j        j         D ]}|j!         j        j"        v r j        j"        |j!                 }n$ j        j#        $                    |j!                  }|rt          j        j%        &                    |           rat          |j'        tP                    sF|j)        J fd|j)        D             }tU          |          dk    r|d         j+        r|d         j         u r|j        t          |j        ,                                tZ          j.        tZ          j/        tZ          j0        f          s|j'        r[t          |j'        j        tZ          j1        tZ          j2        f          r+tU          |j                                                  dk    sE ||j        |j                  r. ||          r"t          j        j3        4                    |                                |                                           t          t          j        t          j        j        j        j                  rlt          j        j5        6                    |                                           t          j        j5        6                    |                                           |                                t          j        j7        |                                <    nސ\dS )z~
        Decide if there should be inplace updates for the node
        and record the decision in the active kernel.
        r   )can_match_buffer_size	mutationsNr4  buf_to_be_inplacedr  r   rl   c                   | j                                       }|                                 t                      }| j        D ]}|j        }t          |t                    s|                                | j         j	        vs| j                             |          |ur\|fd|j
                                        D             z  }t          |          dk    r dS dS )Nc              3  2   K   | ]}|j         k    |V  d S ru   r  )r   or  s     ri   r   z^BaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_node.<locals>.<genexpr>_  s<        v)) )))) rk   r   FT)r  get_fused_noder  r   r  r   r   r\   r  r  r   r  r   )r2  
fused_noder  r-  	user_noder  rh   s        @ri   single_index_in_fused_nodezKBaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_nodeG  s   
 ,5DDTJJJ)2244H %/LLD*0 ! ! I	!)->??  ,,..-7JK K)3BB9MM%& &     &2CCEE   
 t99q== 55 ! 4rk   c                J    g | ]}|j                                         v| S r{   r0  )r   xinconsequential_nodess     ri   r   z;BaseSchedulerNode.decide_inplace_update.<locals>.<listcomp>  s;     & & &6??,,4III IIIrk   r   )r2  r  r   rl   )8codegen.wrapperr0  r   r   r&   inplace_buffersrW   r   has_featurer   r,   INPLACE_BUFFERSr=  r  r  codegensimd
SIMDKernelr  r<  r   r  r  completed_operationsr~  r   r5  r6  r7  r  removed_buffersr   r   r   r?  r@  r  r:  	can_reuser  NopKernelSchedulerNoder  r   r,  r8  r)   r=   r<   MutationLayoutSHOULDREMOVEFallbackKernelr;   r4  make_inplacer1  r  r>  )	rh   r0  r9  r   buf_noderead	input_bufremaining_usesr<  s	   `       @ri   decide_inplace_updatez'BaseSchedulerNode.decide_inplace_update,  s3   
 	;::::: t]++	&	 ##DOO$5$5~7UVV	
 qx)@)E)PQQ	 18[$77C &)) D
 F Ng()n12 	 	  	  	  	  	  	D ##%% C	 C	CxH''',,..88:: ..00 <<>>QW%<<<(. 8 89 EEE $ Edi PII $ : > >ty I II 1,66y$GG1 'y'<>TUU1
 %?666& & & &!*& & &N N++q00*1-9 1*1-2d::%N6 *%N::<< " " 4 " =! ! 7 &1 7 !+ ) 5 :!#!2BN C! ! 7 !$IN$O$O$Q$Q R RUV V V11).#(KK !W 76yAA !W 2293E3E3G3GXXX%Heo&=&B&M  C H.2293E3E3G3GHHHH.223<<>>BBB &..00 7G GC	 C	rk   TbufferrO   	only_oncec                n   t           j        sd S |r	| j        rd S | j        J | j                                        }g }|D ]B}|j        dk    r|                    d           |                    d           d|j         d|j         }d|j        v r|d|j        d          z   }|                    |           d|j        v r|j        d          }|	                    d	d
          d         }|                    d|
                    dd          
                    dd          
                    dd          
                    dd          z              |                    d           |                    d           Dt          |          dk    rd S |                    |           d| _        d S )Nry  r  z#pragma CMT ORIGIN:z#pragma CMT  seq_nrz seq_nr:stack_trace|r   )maxsplitr   {z{{}z}}r  \z\\z#pragma CMT END ORIGINr   T)r&   comment_originro  r   get_originsr  r   targetmetarsplitreplacer   
writelines)	rh   rP  rQ  origins	out_linesr5  op_info_strrU  stack_trace_last_lines	            ri   codegen_originating_infoz*BaseSchedulerNode.codegen_originating_info  s    $ 	F 	 	Fy$$$)''))	 	% 	%AtxR   2333:::::K16!!),Iqvh7G,I,II[)))&&!"!68(3(:(:3(:(K(KB(O%  "+33C>>WS$''WT4((Wf 	     !9:::  $$$y>>QF 	)$$$rk   c                0    |                      dd          S )NTinclude_readsinclude_writes!get_read_write_buffers_sizes_implrg   s    ri   get_read_write_buffers_sizesz.BaseSchedulerNode.get_read_write_buffers_sizes  s$    55t 6 
 
 	
rk   c                0    |                      dd          S )NTFrh  rk  rg   s    ri   get_read_buffer_sizesz'BaseSchedulerNode.get_read_buffer_sizes  s$    55u 6 
 
 	
rk   c                0    |                      dd          S )NFTrh  rk  rg   s    ri   get_write_buffer_sizesz(BaseSchedulerNode.get_write_buffer_sizes  s$    55 6 
 
 	
rk   ri  rj  c                r    t          |                     ||                                          d          S )Nrh  r   )start)r   get_read_write_buffer_accessesr   )rh   ri  rj  s      ri   rl  z3BaseSchedulerNode.get_read_write_buffers_sizes_impl  sD     //+N 0  fhh	
 
 
 	
rk   dict[str, int]c                    t           t                    ri S t           t                    rt           j        t                    ri S t           t                    rCt           j        t
          j                  r$ j        j        t          j	        j
        j        u ri S ddt           t                    rY t                                           d                   t                                           d                   z            nt          d          t!          j        t$                    }|r/ j        j        D ]"}||j                                     |           #|r/ j        j        D ]"}||j                                     |           #|r#t1          d	  j        j        D                       nt1                      }|r#t1          d
  j        j        D                       nt1                      }d fdt           t2                    r&t1           fd|D                       }||z
  }||z
  }i }||z  D ]}	t5          fd||	         D                       |	t6          j        j        v rt6          j        j        |	         }
n,|	t6          j        j        v rt6          j        j        |	         }
nzd fd |
          }|	|vr|||	<   ||	xx         |z  cc<   |S )az  
        Counting the number of bytes accessed for a kernel is
        surprisingly tricky. In particular, there is a differentiation
        between 'theoretical' memory accesses and practical memory
        accesses. For example, a layernorm kernel may actually access an
        input 3 times, but in theory, it only needs to access its input
        once (and may be optimized to do so through say, persistent
        reductions)

        Another example is that even though a buffer is passed in, we may
        not access the entire buffer. This may occur if we are accessing
        a slice of the buffer. Another tricky case is for indirect
        indexing, where the amount of bytes accessed depends on the
        values of the input.

        What this function aims to compute is the memory accesses for
        worst-case inputs, best-case optimization. What this means is
        that for each buffer we compute the amount of potential accesses in two ways and take the minimum.

        1. Numel in ranges multiplied by number of deps the buffer has
        2. The buffer size

        Returns memory accesses per buffer.
        s
sympy.Exprr   r   c                N    t           j        j                            | d          S )Nr   r   )rW   r   r   r   )rw  s    ri   try_size_hintzGBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.try_size_hint#  s     7#55a!5DDDrk   r   r       eAc              3  $   K   | ]}|j         V  d S ru   r  r  s     ri   r   zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>8  s$      BBCsxBBBBBBrk   c              3  $   K   | ]}|j         V  d S ru   r  r  s     ri   r   zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>=  $      CCCsxCCCCCCrk   r   r   r   r  rl   c                    j         j        |          j        }t          d |D                       }t	          |t          |          z
            dk    S )Nc              3  $   K   | ]}|j         V  d S ru   r   )r   r-  s     ri   r   z\BaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materialized.<locals>.<genexpr>D  s$      !>!>$)!>!>!>!>!>!>rk   r   )r  r@  r  r   r   )r   r   r  buf_usesrh   s       ri   is_materializedzIBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materializedB  sQ    N.s39E!!>!>!>!>!>>>Hx*V"4"445599rk   c              3  >   K   | ]} |j                   |V  d S ru   r   )r   r   r  rh   s     ri   r   zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>H  sJ       ) )__S$+-N-N)) ) ) ) ) )rk   c              3     K   | ]}V  d S ru   r{   )r   r   
node_numels     ri   r   zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>Q  s#      $R$RCZ$R$R$R$R$R$Rrk   <Optional[Union[ir.Buffer, ir.TensorBox, ir.TorchBindObject]]c                `   | sdS t          | t          j                  r|                                 S t          | j        t
                    rj        j        |                                          j	        }d}|D ]}t          |j
        t                    rt          |j
        t                    sJ t          |j
        j
        t                    r0|j
                                        D ]}| |j
                  z  } dS |S t          | j        t          j                  r-t!          fd|                                 D                       S  	t%          |                                                     }t)          |                                           t-          |          z  S )Nr   c              3  h   K   | ],} t           j                            |                    V  -d S ru   )rW   r   
get_buffer)r   mut_nameget_buf_bytess     ri   r   zZBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytes.<locals>.<genexpr>u  sQ        $ &ag&8&8&B&BCC     rk   )r   r)   TorchBindObjectr  r&  r<   r  r@  r  r  r   rG  r\   r;   r~  r=   r   r7  rV   r  rK   	get_dtypemin)
r   r  totr-  	sched_buf	buf_elemsbuf_accessed_elemsr  rh   rz  s
         ri   r  zGBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytesZ  s     1c2#566 ,,...
,=>>  !N6s||~~FLEC % % %%di<< %$)$)5FGGGGG%dinkBB %-1Y-B-B-D-D E E	 #}}Y^'D'D DE $%11J
BM:: 	    (+(>(>(@(@     
 !.mCLLNN.K.K L LI)#--//::S*I> >  rk   )rw  rx  r   r   )r   r   r   r  r   rl   )r   r  r   r   )r   rG  ExternKernelSchedulerNoder   r;   r)   rI  op_overloadr  _prims	rng_primsgraphsafe_run_with_rng_stater   rV   
get_rangesr   collectionsr   r  r   r   r   r   r  r   r   r   rW   r   r  graph_inputs)rh   ri  rj  buf_accessesr   r   r  rE  buf_byte_accessesr  r   	buf_bytesr  r  r  r  rz  s   `           @@@@@ri   rt  z0BaseSchedulerNode.get_read_write_buffer_accesses  s   6 d233 	Id566 	:I{<
 <
 	 It677	49b&788	 	%|%BC C I	E 	E 	E 	E dM** 	"&doo//233 1 1! 4556 JJ
 SJ".t44 	3'- 3 3SX&--c2222 	3'. 3 3SX&--c2222 JBB4+;+ABBBBBB 	 JCC4+;+BCCCCCC 		: 	: 	: 	: 	: 	:
 d.// 	,( ) ) ) ) )%) ) )  O o-FO+E,. 3	9 3	9H!$$R$R$R$R<;Q$R$R$R!R!R17111g,X6QW111g*84# # # # # # # # #J &c**I000.7!(++!(+++y8++++  rk   
int | Nonec                F   | j         d S | j                                         }|d S t          |          }|d S t          |t          j                  r|j         j        }t          j        j	        
                    |d          }t          d         dxx         |z  cc<   |S )Nr   r   inductor
flop_count)r   get_origin_noder7   r   r  SymIntexprrW   r   r   r   r   )rh   fx_nodeflopsresolved_flopss       ri   estimate_flopsz BaseSchedulerNode.estimate_flops  s    94)++--?4w''=4eU\** 	$JOE);;EA;NN\***n<***rk   floatc                F    | j         | j         S |                                 S ru   )rk  _get_estimated_runtimerg   s    ri   get_estimated_runtimez'BaseSchedulerNode.get_estimated_runtime  s&    *622**,,,rk   c                   |                                  d                                         d         }|j                                        }t	          t          |                    sdS t          | j                  r,t          | j        t          j	                  sJ 	 t          j        rt          |           }t                      }|                    |          }|t          |t                    sJ |S t!          |           }|t#          | j                  }|                    ||           |S t#          | j                  S # t&          $ r%}t(                              |           Y d}~dS d}~wt,          $ r%}t(                              |           Y d}~dS d}~ww xY wt/          | j                  rdS t1          |           }||S |j                                        }		 t5                      }
t7          |	          dz  }|
dk    rt9          d|
           |dk    rt9          d|           n# t:          $ r Y dS w xY w|                                 }|dk    s||                                 |
z  }|dz  }|S d}|                                 }|dn|}||z  |z  d	z  }||
z  }tA          ||          }|dz  }|S )
zC
        Returns estimated op runtime in milliseconds (ms)
        r   Nvaluel    J)z-gpu_memory_bandwidth cannot be <= 0, but got z"gpu_flops cannot be <= 0, but got g    .Ag      ?r{  )!r   r~  r   r8  rR   r9   rP   r   r)   IRNoder'   ,runtime_estimations_use_nccl_lib_estimations)get_estimate_runtime_cache_key_from_snodeget_estimate_runtime_cachelookupr  r0   r/   	set_value
ValueErrorr  r  	TypeErrorrU    maybe_estimate_runtime_benchmarkmaybe_get_dtyperL   rJ   AssertionErrorr  r  rm  max)rh   r   r&  	cache_keycache	cache_valmsr   retdtypegpu_memory_bandwidth	gpu_flops	flops_estnsfactorcounted_bytescompute_timetransfer_times                     ri   r  z(BaseSchedulerNode._get_estimated_runtime  s#   
 nnq!--//2))++of--.. 	1 ## "	di33333L  I$ O OI688E %Y 7 7I ,))U;;;;;((HNNBz=diHHOOIRO888I7	BBB    qqqqq   qqqqq
 TY 	
 1.t44?J((**	#4#6#6 )%0069I $q(($ZDXZZ   A~~$%U)%U%UVVV  	 	 	11	 ''))	>>Y.22447KKBcBI 99;;*2*Y6#=%(<< }--#X	sD   !AE :=E 8E 
F(E66F(F##F(.AH? ?
IIOptional[ir.TemplateBuffer]c                    d S ru   r{   rg   s    ri   get_template_nodez#BaseSchedulerNode.get_template_node      trk   ir.TemplateBufferc                6    |                                  }|J |S ru   r  )rh   templates     ri   get_template_node_or_throwz,BaseSchedulerNode.get_template_node_or_throw  s$    ))++###rk   nodeslist[BaseSchedulerNode]Jtuple[list[BaseSchedulerNode], BaseSchedulerNode, list[BaseSchedulerNode]]c                    t          d t          |           D                       }| d|         }| |         }| |dz   d         }|||fS )zQ
        For the list of nodes, get the prologue, template, and epilogue
        c              3  H   K   | ]\  }}|                                 |V  d S ru   r#  r   ir  s      ri   r   zCBaseSchedulerNode.get_prologue_template_epilogue.<locals>.<genexpr>  s3      PPDAqPaPPPPPPrk   Nr   )next	enumerate)r  template_indexprologuetemplate_nodeepilogues        ri   get_prologue_template_epiloguez0BaseSchedulerNode.get_prologue_template_epilogue   sb     PPIe,<,<PPPPP.)n-!+--.00rk   )r  r  r   r2  )r   rv  r   r2  rT  )r   r   rV  r  r2   r  r2   r   rl   r  rd  r   r2  )r   r1   r   r2  rW  )r  rl  r   r2  r  r_  r  rd  r   r2  r   r_  r  r  r   r2  r   r  )r   r  )r  r   r   r  rY  r(  r)  r   rl   T)rP  rO   rQ  rl   r   r2  rU  )ri  rl   rj  rl   r   r   )ri  rl   rj  rl   r   ru  r   r  r   r  r   r  )r   r  )r  r  r   r  )>rv   rw   rx   ry   r   rk  ro  ru  r  r  r.  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r  r  r  r  rF   r   r  r
  r+   r   r~  r  r   r  rR   r   r  r!  r#  r%  r'  r,  r.  rO  rf  rm  ro  rq  rl  rt  r  r  r  r  r  r  r  r{   rk   ri   r\   r\     s        BBBB NNNNNN''''$$$$#'D''''""""////266666((((''''G
 
 
 
# # # #0B B B B* * * *2   + + + +# # # #
 
 
 
   
M M M M> > > >
 
 
 

   
= = = =   
 
 
 
   6
 
 
 
G G G G T T T T
. . . .    H H H ]H B B B ]B 
 
 
 ]
 
 
 
 ]
      . . . .& & & &; ; ; ;: : : :                        @ @ @ @F 9=- - - - -^ 
 
 
 ]

 
 
 
 ]

 
 
 
 ]


 
 
 
L! L! L! L!\    ]$- - - - U U U ]Un      
 1 1 1 \1 1 1rk   r   $torch._inductor.codecache.LocalCachec                 H    t           j        j                                        S ru   )r  r  	codecache
LocalCacher{   rk   ri   r  r    s    ?$//111rk   snoder   c                \   t          | j        dd          }| j        j        }| j                            g || j        j        | j        j                  }| j        j        }t          j        ||f          \  }}ddt          |ft          fd|D                       z             }|S )Npython_kernel_namer  r   rl   c                l    t          | t          j                  ot          | t          j                   S ru   )r   r)   r  GeneratorStater;  s    ri   _is_tensor_irz@get_estimate_runtime_cache_key_from_snode.<locals>._is_tensor_ir  s)    !RY''P
1b>O0P0P,PPrk   c              3  t   K   | ]2} |          r!t          |                                          nd V  3d S ru   )r   r  )r   ar  s     ri   r   z<get_estimate_runtime_cache_key_from_snode.<locals>.<genexpr>#  sG      UUa}}Q'7'7Aajjll###TUUUUUUrk   rW  )
r  r   inputsfill_non_provided_argsconstant_argsrr  pytreetree_flattenr   r   )r  r  r4  rr  	flat_argsflat_args_pytree_specr  r  s          @ri   r  r    s     -A2FF:D:,,*$*)*
 D ZF'-':D&>'J'J$I$Q Q Q Q 	
UUUU9UUU
U
U	V I rk   Optional[Callable[[Any], Any]]c                >   t          | t                    sd S t          j        j        j        t          j        j        j        t          j        j        j        d}t          | j	        dd          }||vrd S t          | j	        t          j                  sd S ||         S )N)zextern_kernels.mmzextern_kernels.bmmzextern_kernels.addmmr  r  )r   r  r  opsatenmmbmmaddmmr  r   r)   ExternKernel)r  mms_fnsr  s      ri   _get_mm_like_fnr  (  s    e677 t"Y^.#in0 %	 4 G
 !-A2FF((tej"/22 t%&&rk   rj  c                    d }d }t           j        rt                     }|d S |} fd}nd S t                     }t	                      }|                    |          }|t          |t                    sJ |S ddlm	  |            \  }}ddl
m}	 |	                    |||ddd          }
|                    ||
	           |
S )
Nc                                 S ru   r{   )r  snode_args_kwargss   ri   rs  z2maybe_estimate_runtime_benchmark.<locals>.<lambda>A  s    !2!25!9!9 rk   r   )r  r   )benchmarker   
   )memory_warmup_itersbenchmark_itersmax_benchmark_durationr  )r&   !runtime_estimations_mms_benchmarkr  r  r  r  r   r  utilsr  $torch._inductor.runtime.benchmarkingr  	benchmarkr  )r  bench_fnargs_kwargs_fnmm_fnr  r  r  r4  rr  r  r  r  s   `          @ri   r  r  8  s   HN/ &&=499999t9%@@I&((EY''I)U+++++((((((!>##LD&@@@@@@			! 
 
 
B 
OOIRO(((Irk   T)slotsc                  P    e Zd ZU ded<   ded<   ded<   ded<   ddZddZddZdS )	WhyNoFuser   name1name2reasonztuple[Any, ...]r4  r~   r\   r   r   r2  c                j    |                                 | _        |                                 | _        d S ru   )r  r  r   rh   r~   r   s      ri   ru  zWhyNoFuse.__init__e  s(    ^^%%
^^%%


rk   r   c                V    || _         || _        t                              |            d S ru   )r!  r4  
fusion_logdebug)rh   r!  r4  s      ri   __call__zWhyNoFuse.__call__i  s*    	rk   c                H    d| j          d| j         d| j        | j        z  z   S )Nzcannot fuse z with r"  )r  r   r!  r4  rg   s    ri   __str__zWhyNoFuse.__str__n  s2    >dj>>
>>>K$)#
 	
rk   Nr~   r\   r   r\   r   r2  )r!  r   r4  r   r   r2  rT  )rv   rw   rx   ry   ru  r'  r)  r{   rk   ri   r  r  ^  sy         JJJJJJKKK& & & &   

 
 
 
 
 
rk   r  objr   c                    t          | t          t          f          rt          | t                    } t          j        | d          }d|v rdt          j        |d           S |S )Nkey   )r*  r      )	r   r   setsortedr   pprintr(  textwrapr*  )r+  r,  s     ri   r(  r(  t  sg    #
C()) #Sc"""^C***Fv~~6HOFG44666Mrk   c                  2    e Zd ZddZddZdd	ZddZeZdS )rG  r   r3   r   r2  c                0    t          |g          | _        d S ru   r  r  s     ri   ru  zOutputNode.__init__  s    ",cU"3"3rk   rl   c                    dS r  r{   rg   s    ri   r   zOutputNode.is_reduction  r  rk   rN  c                    dS )Nr{   r{   rg   s    ri   r6  z'OutputNode.get_inputs_that_alias_output  r  rk   r   c                    dS )NOUTPUTr{   rg   s    ri   r  zOutputNode.get_name  s    xrk   N)r   r3   r   r2  rW  rX  rT  )rv   rw   rx   ru  r   r6  r  r  r{   rk   ri   rG  rG  ~  se        4 4 4 4          HHHrk   rG  r   r  r  r@  rh  r2  c                    t          j                     j        D ]^}t          |t                    sG|j                                                 }|                                         xx         dz  cc<   _d	 fdt          fd j        D                       }|r> j        |z
   _         	                     j
                            |                     dS dS )
am  
    Prunes weakdeps intended for mutation ordering
    on an upstream fused node if after fusion there is another dependency
    on the fused upstream node, making the weakdep redundant

    In essence this enforces an ordering on fusions. As fusions occur, weakdeps will
    be incrementally removed, enabling other fusions, ensuring they are fused in order.
    r   r   r1   r   rl   c                   t          | t                    ru| j                                                 }|                                                  dk    o!j                            | |                   }|         k    }|p|S dS )Nr   F)r   r4   r   r  r  r  fusable_weak_dep)r   r  is_redundantis_self_depr@  name_to_dep_countr  r   s       ri   r  z+_prune_redundant_deps.<locals>.should_prune  s    c7## 	!#(+<<>>G,"7+4466 n55'0$   -W5=K.;.5rk   c              3  2   K   | ]} |          |V  d S ru   r{   r  s     ri   r   z(_prune_redundant_deps.<locals>.<genexpr>  sF        ,,s2C2C     rk   Nr  )r  r   rn  r   r4   r   r  r  r   r  r   r  )r   r  r@  r   r  deps_to_pruner@  r  s   ```   @@ri   r  r    s?    '2&9&;&;& K K#w'' 	K!#(+<<>>G09BBDDEEEJEEE              .    M  K"&"9M"IT-::=IIJJJJJK Krk   c                  8     e Zd Zd fdZdd	ZddZddZ xZS )r  r  r  r   rv  r   r2  c                    t                                          |           |                     |           |                     |                                           d S ru   superru  r  r  get_read_writesrh   r  r   	__class__s      ri   ru  z"ExternKernelSchedulerNode.__init__  U    ###T"""T113344444rk   r   c                \    |                                   dt          | j        dd            S )Nz.node.kernel = r  )r  r  r   rg   s    ri   r  z)ExternKernelSchedulerNode.debug_str_extra  s.    --//bb'$)EY[_2`2`bbbrk   rl   c                    dS NTr{   rg   s    ri   r%  z#ExternKernelSchedulerNode.is_extern  r  rk   c                p    | j         J t          | j         d          o| j                                         S )Nr.  )r   r<  r.  rg   s    ri   r.  z*ExternKernelSchedulerNode.has_side_effects  s6    y$$$ty"455V$):T:T:V:VVrk   r  r  r   rv  r   r2  rT  rW  )rv   rw   rx   ru  r  r%  r.  __classcell__rI  s   @ri   r  r    s        5 5 5 5 5 5
c c c c   W W W W W W W Wrk   r  c                        e Zd Zd fdZ xZS )	rG  r  r  r   rv  r   r2  c                    t                                          |           |                     |           |                     |                                           d S ru   rE  rH  s      ri   ru  zNopKernelSchedulerNode.__init__  rJ  rk   rO  )rv   rw   rx   ru  rP  rQ  s   @ri   rG  rG    s=        5 5 5 5 5 5 5 5 5 5rk   rG  c                  ^    e Zd ZU dZded<   ded<   dE fdZ	 	 dFdGdZ	 	 dFdHdZdIdZdJdZ	dKdZ
dLdZdKdZdMd#ZdKd$ZdNd(ZdOd*ZdPd,ZdQd-ZdQd.ZdQd/ZdQd0ZdRd2ZdSd5ZdTd7ZdUd8Z	 dVdWd<ZedXd=            ZedXd>            ZdYdAZedZdC            ZedQ fdD            Z  xZ!S )[r   zu
    A SchedulerNode is a node for scheduling that encapsulates either
    a ComputedBuffer or a TemplateBuffer.
    z tuple[Sequence[sympy.Expr], ...]_sizesr>   r   r  r  r   +Union[ir.ComputedBuffer, ir.TemplateBuffer]r   r2  c                    t                                          |           |                     |           |                                  d S ru   )rF  ru  r  _compute_attrsrH  s      ri   ru  zSchedulerNode.__init__  sI    
 	###T"""rk   Nextra_indexing_constraints*Optional[tuple[dict[Any, Any], list[Any]]]recompute_sizes_body_funcOptional[Callable[_P, _T]]c                   t          | j        t          j        t          j        f          sJ | j                            ||          \  | _        }|| _        | j                                        }| j	        
                    |          j        }| || j                  f| _        t          j         pt          |j                   }t          | j        t          j                  r0|                     | j                            |                     d S |                     t'          j        | j        g| j        R d|i           d S )NrY  r[  )	normalizer_  )r   r   r)   r   TemplateBuffersimplify_and_reorderrU  r   get_device_or_errorr  get_backendgroup_fnr   r&   loop_ordering_after_fusionrR   r   r  extract_read_writesr(   )rh   rY  r[  bodyr  rd  should_normalizes          ri   rX  zSchedulerNode._compute_attrs  so   
 $)b&79J%KLLLLL I::'A&? ; 
 
T 
..00>--f55>hht{334
  &@@ 
KI
 I
 E
 di!233 		  	--8H-II       0J!%  8H     rk   Optional[Callable[..., Any]]c                4    |                      ||           d S )Nr^  )rX  )rh   rY  r[  s      ri   recompute_size_and_bodyz%SchedulerNode.recompute_size_and_body  s1    
 	'A&? 	 	
 	
 	
 	
 	
rk   r_  rl   need_clear_tiling_cachec                   t          d | j        j        D                       }|                     t	          j        | j        g| j        R d|i                    |          	                    | j
                             | j                            |            |r!ddlm} |j                                         d S d S )Nc              3  R   K   | ]"}t          |t          t          f          |V  #d S ru   )r   r4   r3   r  s     ri   r   z5SchedulerNode.refresh_dependencies.<locals>.<genexpr>  sJ       0
 0
ZgwEW5X5X0
0
 0
 0
 0
 0
 0
rk   r_  r   SIMDScheduling)r   r   r   r  r(   rf  r   rU  r  r  re  pointwise_read_writesclear_cachecodegen.simdrp  candidate_tilingscache_clear)rh   r_  rl  	fake_depsrp  s        ri   refresh_dependenciesz"SchedulerNode.refresh_dependencies  s    
 &0 0
 0
+10
 0
 0
 &
 &
	 	,
![  4=  Yy!!VD)**	
 	
 	
 	"..t444" 	;444444 ,88:::::	; 	;rk   	new_orderSequence[int]c                    | j                             |          | _         | j         j        | _        |                     dd           d S )NFTr_  rl  )r   reorder_iter_loopssizesrU  rw  )rh   rx  s     ri   apply_new_loop_orderz"SchedulerNode.apply_new_loop_order.  sK    Z22
 

 j&!!E4!PPPPPrk   c                   | j                                         }t          | j         j                  |z
  }t	          t          |                    }t	          t          |||z                       }|                     ||z              t          | j        d                   dk    sJ | j        d         | j        d         d         | j        d         d         ff| _        d S )Nr   r   r   )r   get_original_num_rdimsr   	iter_varsr   ranger~  r   )rh   	num_rdims
num_pwdimspwdimsrdimss        ri   swap_pw_red_dimensionz#SchedulerNode.swap_pw_red_dimension6  s    J5577	-..:
uZ(())eJ
Y(>??@@!!%&.1114:a=!!Q&&&&Z]TZ]1%5tz!}Q7G$HH


rk   r\   c                B    | j                                         | _         | S ru   )r   extract_pw_from_reductionrg   s    ri   r  z'SchedulerNode.extract_pw_from_reduction@  s    Z99;;
rk   c                   t                               |           sd S t          | j        t          j                  sJ | j                                        5  |                                  d d d            d S # 1 swxY w Y   d S ru   )r   r   r   r   r)   r   with_original_inner_fnrX  rg   s    ri   cancel_reduction_splitz$SchedulerNode.cancel_reduction_splitD  s     33D99 	F$)R%677777Y--// 	" 	"!!!	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	" 	"s   A99A= A=	dimensionr   	new_rangec                   t          | j        t          j        t          j        f          sJ | j                            ||          | _        | j        j        | _        | j        	                                }| j
                            |          j        }| || j                  f| _        |                     dd           d S )NTr{  )r   r   r)   r   r`  r   #expand_dimension_for_pointwise_noder}  rU  rb  r  rc  rd  r   rw  )rh   r  r  r  rd  s        ri   r  z1SchedulerNode.expand_dimension_for_pointwise_nodeK  s     $)b&79J%KLLLLLZCCy
 

 j&..00>--f55>hht{334
 	!!D$!OOOOOrk   c                    | j                                         | _         | j         j        | _        |                     dd           d S )NTFr{  )r   merge_loopsr}  rU  rw  rg   s    ri   r  zSchedulerNode.merge_loops\  sD    Z++--
j& 	!!D%!PPPPPrk   r  r2   r  c                   d }| j         d         }t          |          |j        cxk    r|j        k    rn n|                    |          }|rZt          xj        dz  c_        t                              d|                                 |           | 	                    |           dS t                              d|                                            dS )Nr   r   z"Reorder loops for %s with order %sTzEDon't reordering %s because we can not decide the suitable loop orderF)
rU  r   num_varsdecide_loop_order_to_matchr*   num_loop_reorderingloop_ordering_logr&  r  r~  )rh   r  r  rx  
self_sizess        ri   r  z'SchedulerNode.reorder_loops_by_dep_pairh  s     	[^
z??h/EEEE93EEEEEE ;;IFFI 	''1,''##4dmmooy   %%i0004##W   5rk   r   c                0   |                                  }| d| j        d          | d| j        d          | d| j         g}| j                                        D ]}t          |t                    sl|j        }t          j	        
                    |          }t          |t          j                  s,|                    | dt          |j                              t          | j        t"                    rX|                    d| d           |                    t%          j        | j                                        d	                     | j        J |                    |                                            d
                    |          S )Nz.group.device = r   z.group.iteration = r   z	.sizes = z
_layout = zclass z_loop_body:r0  r  )r  r   rU  r   r  r   r4   r   rW   r   r  r)   r  r   r(  r&  r   r>   r4  r*  r.  r   r  r  join)rh   r   linesr   r  r   s         ri   r  zSchedulerNode.debug_str_extra  s   }}44TZ]4477
177++dk++

 #4466 	O 	OCc7++ O8g((22!#r'9:: OLLH!M!M
8K8K!M!MNNNdj(++ 	JLL3$333444LL)=)=)?)?HHIIIy$$$T//11222yyrk   Sequence[Sequence[sympy.Expr]]c                    | j         S ru   )rU  rg   s    ri   r  zSchedulerNode.get_ranges  
    {rk   c                   t          | j        t          j        t          j        f          sJ dt          | j                              t          | j                                                  o| j        d u p| j        j	         S Ntype(self.node)=)
r   r   r)   r   r`  r   rl   r   r   has_partial_accumulaterg   s    ri   r   zSchedulerNode.is_reduction  s    $)b&79J%KLL 	
 	
!tDI!!	
 	
L DI002233 
J$Gdj&G"G	
rk   c                    t          | j        t          j                  sJ dt	          | j                              | j                                        dk    S )Nr  dot)r   r   r)   r   r   r   rg   s    ri   r  zSchedulerNode.is_native_matmul  sO    $)R%677NN9NDOO9N9NNN7y++--66rk   c                   t          | j        t          j        t          j        f          sJ dt          | j                              t          | j        t          j                  o#t          | j        j        t          j                  S r  )r   r   r)   r   r`  r   r   	SplitScanrg   s    ri   r!  zSchedulerNode.is_split_scan  s{    $)b&79J%KLL 	
 	
!tDI!!	
 	
L $)R%677 
JINBL=
 =
 	
rk   c                @    t          | j        t          j                  S ru   r   r   r)   r`  rg   s    ri   r#  zSchedulerNode.is_template  s    $)R%6777rk   r  c                R    t          | j        t          j                  r| j        nd S ru   r  rg   s    ri   r  zSchedulerNode.get_template_node  s"    &ty"2CDDNtyy$Nrk   
index_varsSequence[sympy.Expr]c                    |                                   |                                  |                     |           d S ru   )rO  r  rA  )rh   r  s     ri   runzSchedulerNode.run  s9    ""$$$Z     rk   dict[sympy.Expr, sympy.Expr]c                R   | j         }t          t          t          |                    t          t          t          |                    k    sJ t	          t          t          j                            |          t          j                            |                              }|S ru   )	rU  r   mapr   dictzipr  r  from_iterable)rh   r  r}  r   s       ri   ranges_from_index_varsz$SchedulerNode.ranges_from_index_vars  s     3sE??##s3sJ+?+?'@'@@@@@--j99--e44 
 

 rk   c                   |                      |          }	 t          j        t          t          j                    |                    5  t          j                            |           5   | j        |  ddd           n# 1 swxY w Y   ddd           dS # 1 swxY w Y   dS # t          $ r" t          
                    d| j                    w xY w)a  
        Generate code for this node using the provided index variables.

        This method sets up the appropriate context for code generation, including
        simplifying indexing expressions based on the variable ranges, and then
        calls the node's body function with the index variables.

        Args:
            index_vars: A sequence of sequences of sympy expressions representing
                        the index variables for each dimension of the computation.
        NzError in codegen for %s)r  rW   set_ops_handlerrD   get_ops_handlerr=  set_current_noder   r  r  fatalr   )rh   r  r   s      ri   rA  zSchedulerNode.codegen  sX    00<<
	!"213D3F3F
"S"STT( ())$//( ( 
J''	( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ( (
  	 	 	II/;;;	sS   3B& 
 B*B5BB	BB		BB& BB&  B!B& &,CT	pointwiserl  c                    |r| j         nt          | j                   \  }}t          j        | j        |t
          j        j        gt          |          z  g          S )z\
        Get the memory dependencies in either the pointwise or the reduction axes.
        )hidden_args)	rU  r   r(   rf  r   r   SZeror   )rh   r  
keep_sizesignore_sizess       ri   "pointwise_or_reduction_read_writesz0SchedulerNode.pointwise_or_reduction_read_writes  s\     3<#V4;;$+AVAV 
L/J
%',#lBSBS1S0T
 
 
 	
rk   c                .    |                      d          S )zH
        Get the memory dependencies in the non-reduction axes.
        Tr  r  rg   s    ri   rq  z#SchedulerNode.pointwise_read_writes  s    
 666FFFrk   c                .    |                      d          S )zD
        Get the memory dependencies in the reduction axes.
        Fr  r  rg   s    ri   reduction_read_writesz#SchedulerNode.reduction_read_writes  s    
 666GGGrk   r(  r)  c                   |                                  rdS t          d |                                 D                       rdS t          | j        j                  dk    rt          |t          j                  rzt          t          | j        j                            }t          |t          j                  sJ dt          |                      |j        |j        k    o|j        |j        k    S dS )NFc              3  >   K   | ]}|                                 V  d S ru   )r'  r  s     ri   r   z,SchedulerNode.can_inplace.<locals>.<genexpr>  s,      ??Ss  ??????rk   r   ztype(write_dep)=)r#  r   r~  r   r   r  r   r(   r2   r  iterr   r   r   )rh   r(  	write_deps      ri   r,  zSchedulerNode.can_inplace  s     	5??D,<,<,>,>????? 	5t&''1,,l,2
 2
, T$"2"9::;;Ii)?@@WWBWT)__BWBWWW@>Y_4X).9XXurk   r_  c                   t                      }t          | j        t                    r| j                                        D ]}|j        dk    r|j        dk    rd|j        v r|j        d         dk    s)t          |j	                  dk    ra|j	        d         dk    rP|
                    d|j        v r|j        d         n&t          |j	                  dk    r|j	        d	         nd
           |S )Ncall_methodstoremode
atomic_addr  r/  r   r   r   r  )r   r   r   r>   r   r  r]  rr  r   r4  r  )rh   buffers_store_as_atomic_addr   s      ri   _get_atomic_add_buffersz%SchedulerNode._get_atomic_add_buffers  s    7A||#dj(++ 	
,,..  G},,w..4;..4;v3F,3V3V	NNa//DIaLL4P4P 033!T[00 F++.1$)nn.A.Adillr  
 +*rk   c                    | j         | j                             d          rdS t                                                      S )Ndevice_assert_asyncT)r   has_oprF  r.  rh   rI  s    ri   r.  zSchedulerNode.has_side_effects  s>     :!dj&7&78M&N&N!4ww'')))rk   )r  r  r   rV  r   r2  NN)rY  rZ  r[  r\  r   r2  )rY  rZ  r[  ri  r   r2  )r_  rl   rl  rl   r   r2  )rx  ry  r   r2  rV  r   r\   )r  r   r  r   r   r2  r  rT  )r   r  rW  r  )r  r  r   r2  )r  r  r   r  )r  r  r   r2  r  )r  rl   r   rl  )r   rl  r  r  )"rv   rw   rx   r  ry   ru  rX  rk  rw  r~  r  r  r  r  r  r  r  r  r   r  r!  r#  r  r  r  rA  r  rF   rq  r  r,  r  r.  rP  rQ  s   @ri   r   r     s         
 -,,,OOO      RV@D    F RVBF
 
 
 
 
; ; ; ;<Q Q Q QI I I I   " " " "P P P P"
Q 
Q 
Q 
Q   .       ,   
 
 
 
7 7 7 7
 
 
 
8 8 8 8O O O O! ! ! !
      0 !%	
 	
 	
 	
 	
 G G G ]G H H H ]H    + + + ]+& * * * * * ]* * * * *rk   r   group_snode/Union[FusedSchedulerNode, GroupedSchedulerNode]c                     j         }                     t          j                            d |D                                  t           fdt          j        d |D              D                        j        j        z
   _	        d S )Nc                    g | ]	}|j         
S r{   r   r   r;  s     ri   r   z3refresh_group_node_dependencies.<locals>.<listcomp>'  s    +J+J+JaAM+J+J+Jrk   c              3  R   K   | ]!}|j                                         v|V  "d S ru   r   r  )r   r   r  s     ri   r   z2refresh_group_node_dependencies.<locals>.<genexpr>+  sH       
 
x{;;==== ====
 
rk   c                    g | ]	}|j         
S r{   )rn  r  s     ri   r   z3refresh_group_node_dependencies.<locals>.<listcomp>-  s    )O)O)O1!*>)O)O)Ork   )
r   r  r(   
ReadWrites
merge_listr   unionr   r  rn  )r  r   s   ` ri   refresh_group_node_dependenciesr  "  s     F**+J+J6+J+J+JKK  
 	 
 
 
 
!')O)O)O)O)OP
 
 
 	
 	

 
!
(	) """rk   r  r  r   r  c                   t          | t          t          f          sJ || _        || _        d | _        t          j        d |D              | _        t          |            t          d | j        D                       | _        t          d | j        D                       | _        d |                                 D             | _        d S )Nc                *    g | ]}|j         	|j         S ru   r   r  s     ri   r   z#init_group_node.<locals>.<listcomp>>  s!    	A	A	A!)@!+)@)@)@rk   c              3  $   K   | ]}|j         V  d S ru   ra  r  s     ri   r   z"init_group_node.<locals>.<genexpr>C  $      HHHHHHHHrk   c              3  $   K   | ]}|j         V  d S ru   )rb  r  s     ri   r   z"init_group_node.<locals>.<genexpr>D  r  rk   c                8    i | ]}|                                 |S r{   r{  r|  s     ri   r}  z#init_group_node.<locals>.<dictcomp>E  s/     # # # ## # #rk   )r   r   GroupedSchedulerNoder   r  r   r   r  r   r  r  ra  r  rb  r~  ri  )r  r  r   s      ri   init_group_noder  4  s    
 k$68L#MNNNNNK%KK&,	A	Av	A	A	AK $K000HH[5GHHHHHKHH[5GHHHHHK# #'2'>'>'@'@# # #Krk   c                      e Zd ZU dZded<   ed;d            Zd<d	Zd=dZe	d>d            Z
d?dZd@ fdZe	dAd            ZdAdZe	dBd            ZdCdZdAdZdAdZdD fd"Ze	dBd#            Ze	dBd$            ZdEd&ZdAd'Ze	dFd(            Ze	dFd)            Ze	dFd*            Ze	dFd+            Ze	dGd-            ZdHd/Ze	dFd0            ZdId2ZdJd5Z dKd8Z!dAd9Z"e	dF fd:            Z# xZ$S )Lr   z
    This is a "fake" scheduler node that represents a group of scheduler nodes
    that are meant to be fused together. The way it does this is by maintaining
    its unmet dependencies as the union of its constituent nodes.
    r  r   r~   r\   r   r   c           	        |j         |j         u sJ t          |t          t          f          sJ |                                rt          |t
                    rxt          |j        t                    sJ t          |j	        j
                  dk    sJ t          t          t          |j	        j
                            t                    sJ t          t          |j	        j
                            j        }d |                                D             }t          |          dk    sJ |d         }t          |j	        j
                  dk    sJ t          t          |j	        j
                            }t          |t                     sJ t#          t!          ||j        |j        |j        |j                  g          |j	        _
        nt          |t          t          f          sJ t-          t/          j        |                                |                                                    } | |j         |          S )Nr   c                :    g | ]}|                                 |S r{   r  r  s     ri   r   z+FusedSchedulerNode.fuse.<locals>.<listcomp>c  s)    WWWtDDTDTDVDVWdWWWrk   r   )r  r   r   r   r#  r  r   r;   r   r   r  r  r  r3   r   r   r2   r   r   	var_namesr   r  r  r  r  )ro   r~   r   r   template_nodesr  writer  s           ri   rp   zFusedSchedulerNode.fuseS  s    %/1111%-1C!DEEEEE 	J:e5N#O#O 	J ej+66666u(/00A5555d4(9(@#A#ABBGLLLLLU.56677<DWWu/@/@WWWN~&&!++++*1-M}0788A====m7>??@@EeY/////'1ek5?EJ
 ( (E$$ em5G%HIIIIIY_U__%6%68I8IJJKKs5?E***rk   c                    | j         D ]C}t          |t                    sJ |                                sJ |                                 D| S ru   )r   r   r   r   r  rh   r   s     ri   r  z,FusedSchedulerNode.extract_pw_from_reductionu  s[    { 	0 	0Gg}55555'')))))--////rk   r2  c                p    | j         D ]-}t          |t                    sJ |                                 .d S ru   )r   r   r   r  r  s     ri   r  z(FusedSchedulerNode.swap_pw_red_dimension|  sH    { 	, 	,Gg}55555))++++	, 	,rk   r  c                    t          t          d d |                                 D                                 }t          |          dk    rd S t	          |          }|S )Nc              3     K   | ]@}|                                 s|                                *|                                V  Ad S ru   r#  r%  r  r  s     ri   r   z4FusedSchedulerNode.estimate_flops.<locals>.<genexpr>  h        '')) .2^^-=-=''))     rk   r   r  filterr   r   r   rh   fpsr  s      ri   r  z!FusedSchedulerNode.estimate_flops  o       $ 0 0   	
 	
 s88q==4#hh
rk   r  r2   r  rl   c                   |                                  rdS d}| j        D ]p}t          |t                    sJ |Ht	          |          t	          |j        d                   k    rt                              d            dS |j        d         }qd}|J t          |          |j	        cxk    r|j	        k    rn n|
                    |          }|s/t                              d|                                            dS t          xj        dz  c_        t                              d|                                 |           | j        D ].}t          |t                    sJ |                    |           /t          |            dS )	z@
        Return true if a loop reordering is performed.
        FNr   z1Can not reorder fused node due to different sizeszODont reordering fused node %s because we can not decide the suitable loop orderr   z-Reorder loops for fused node %s with order %sT)r#  r   r   r   r   rU  r  r&  r   r  r  r  r*   r  r~  r  )rh   r  r  r  r  rx  s         ri   r  z,FusedSchedulerNode.reorder_loops_by_dep_pair  s     	5
[ 	) 	)Ee]33333%%
*;*;uU\RS_?U?U*U*U!''G   uuaJJ	%%%z??h/EEEE93EEEEEE ;;IFFI 	##a   5##q(##;T]]__i	
 	
 	
 [ 	2 	2Ee]33333&&y1111'---trk   r  r  c                    t                                          |           t          | ||           g | _        t	          |d           j        | _        d S )Nc                D    t          |                                           S ru   )r   r   r  s    ri   rs  z-FusedSchedulerNode.__init__.<locals>.<lambda>  s    s1>>3C3C/D/D rk   r-  )rF  ru  r  r  r  r   )rh   r  r   rI  s      ri   ru  zFusedSchedulerNode.__init__  sS    ###i000%'
%D%DEEEK


rk   r   c                J    d                     d | j        D                       S )N_c                6    g | ]}|                                 S r{   r{  r  s     ri   r   z/FusedSchedulerNode.get_name.<locals>.<listcomp>       ;;;!;;;rk   r  r   rg   s    ri   r  zFusedSchedulerNode.get_name  %    xx;;t{;;;<<<rk   c                @    | j         d                                         S r   r   r  rg   s    ri   r  z!FusedSchedulerNode.get_first_name      {1~&&(((rk   r_  c                <    t          j        d | j        D              S )Nc                6    g | ]}|                                 S r{   r  r  s     ri   r   z7FusedSchedulerNode.get_buffer_names.<locals>.<listcomp>  $    !L!L!L1!"4"4"6"6!L!L!Lrk   r   r  r   rg   s    ri   r  z#FusedSchedulerNode.get_buffer_names  !    !L!L!L!L!LMMrk   rf  c                l    g }| j         D ])}|                    |                                           *|S ru   r   r  r~  rh   r,  r   s      ri   r~  zFusedSchedulerNode.get_outputs  >    (*K 	. 	.DMM$**,,----rk   c                .     fdt           j                  D             } j        d         j        }|'|                                                                t          j        d                    |                                          d          S )Nc                r    g | ]3\  }}                                  d | d|                                 4S )z.snodes[z] =
)r  r.  )r   r  r   rh   s      ri   r   z6FusedSchedulerNode.debug_str_extra.<locals>.<listcomp>  sU     
 
 
4 }}BBBB0@0@BB
 
 
rk   r   r  r0  )	r  r   r   r  r  r4  r*  r  r  )rh   r  r   s   `  ri   r  z"FusedSchedulerNode.debug_str_extra  s    
 
 
 
$T[11
 
 
 {1~"LL3355666tyy//6688&AAArk   c                2    d | j         D             }|  d| S )Nc                6    g | ]}|                                 S r{   )r  r  s     ri   r   z6FusedSchedulerNode.debug_str_short.<locals>.<listcomp>  s$    EEEd**,,EEErk   z
, snodes: r  )rh   
snodes_strs     ri   r  z"FusedSchedulerNode.debug_str_short  s+    EEEEE
..*...rk   r  r  rd  c                    t                                          ||           t                      }t          | j                  D ]2}|                    ||           |                    |j                   3d S ru   )rF  r  r   r   r   updater`  )rh   r  r  r   rI  s       ri   r  z!FusedSchedulerNode.set_last_usage  s    
 	24FGGG 0:||T[)) 	8 	8D 35GHHH&&t7777	8 	8rk   c                <    t          j        d | j        D              S )Nc                6    g | ]}|                                 S r{   )r   r  s     ri   r   z8FusedSchedulerNode.used_buffer_names.<locals>.<listcomp>  s$    !M!M!MA!"5"5"7"7!M!M!Mrk   r  rg   s    ri   r   z$FusedSchedulerNode.used_buffer_names  s!    !M!M!M!M!MNNrk   c                <    t          j        d | j        D              S )Nc                6    g | ]}|                                 S r{   )r  r  s     ri   r   zCFusedSchedulerNode.used_or_aliased_buffer_names.<locals>.<listcomp>  s$    DDD1a,,..DDDrk   r  rg   s    ri   r  z/FusedSchedulerNode.used_or_aliased_buffer_names  s%    DDDDD
 	
rk   r  c                    | j         S ru   r  rg   s    ri   r   zFusedSchedulerNode.get_nodes  r  rk   c                Z    t          |           j         d|                                  dS )Nz(nodes=r  r  rg   s    ri   r  zFusedSchedulerNode.__repr__  s*    t**%@@dmmoo@@@@rk   c                >    t          d | j        D                       S )Nc              3  >   K   | ]}|                                 V  d S ru   r   r  s     ri   r   z2FusedSchedulerNode.is_reduction.<locals>.<genexpr>  s,      991>>##999999rk   r   r   rg   s    ri   r   zFusedSchedulerNode.is_reduction   s!    99T[999999rk   c                >    t          d | j        D                       S )Nc              3  >   K   | ]}|                                 V  d S ru   )r  r  s     ri   r   z6FusedSchedulerNode.is_native_matmul.<locals>.<genexpr>  s.      ==A1%%''======rk   r'  rg   s    ri   r  z#FusedSchedulerNode.is_native_matmul  s!    ========rk   c                >    t          d | j        D                       S )Nc              3  >   K   | ]}|                                 V  d S ru   )r!  r  s     ri   r   z3FusedSchedulerNode.is_split_scan.<locals>.<genexpr>
  s,      ::1??$$::::::rk   r'  rg   s    ri   r!  z FusedSchedulerNode.is_split_scan  s!    ::dk::::::rk   c                >    t          d | j        D                       S )Nc              3  >   K   | ]}|                                 V  d S ru   r  r  s     ri   r   z1FusedSchedulerNode.is_template.<locals>.<genexpr>  s*      88q1==??888888rk   r'  rg   s    ri   r#  zFusedSchedulerNode.is_template  s!    88DK888888rk   r  c                n    | j         D ],}|                                r|                                c S -d S ru   )r   r#  r  r  s     ri   r  z$FusedSchedulerNode.get_template_node  sI    K 	0 	0D!! 0--/////0trk   torch.devicec                    | j         d         S r   )r   rg   s    ri   r   zFusedSchedulerNode.get_device  s    z!}rk   c                >    t          d | j        D                       S )Nc              3  >   K   | ]}|                                 V  d S ru   )r  r  s     ri   r   z>FusedSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>  s.      EEA1--//EEEEEErk   r'  rg   s    ri   r  z+FusedSchedulerNode.has_aliasing_or_mutation  s!    EEEEEEEErk   r  c                    t           ru   NotImplementedErrorr  s     ri   r  z'FusedSchedulerNode.update_mutated_names       !!rk   r   r1   c                    t           ru   r4  )rh   r   s     ri   r  zFusedSchedulerNode.add_fake_dep#  r6  rk   r(  r)  c                    t           ru   r4  r+  s     ri   r,  zFusedSchedulerNode.can_inplace&  r6  rk   c                   |                                  }d                    d | j        D                       }t                      }|                    | dt          |           j         d| d| dt          | j        j	                   d| dt          | j
                   d| d	t          | j        j        | j
        z
             d| d
           |                                5  |                                 D ])}|                    |                                           *	 ddd           n# 1 swxY w Y   |                    d           	 |                    |                                            n,# t"          $ r t$                              dd           Y nw xY w|                                                                S )r  r#  c              3  >   K   | ]}t          |          j        V  d S ru   )r   rv   r  s     ri   r   z/FusedSchedulerNode.debug_str.<locals>.<genexpr>,  s+      FFQQ 0FFFFFFrk   r"  r  r  r  r  r  r  z.outputs = [
            Nr$  r  Tr  )r  r  r   rO   r  r   rv   r(  r   r  rn  r   r*  r~  r.  r%  r  r  r  r  r+  r  )rh   r   node_typestrr   r   s        ri   r.  zFusedSchedulerNode.debug_str)  sR   }}xxFF$+FFFFF

 	d	 +  )011    %T%<==  	  #4#3#9D<S#STT	 
   	
 	
 	
 ZZ\\ 	, 	,'')) , ,

3==??++++,	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	, 	c	HJJt++--.... 	H 	H 	HKK7$KGGGGG	H   '')))s$   (?D44D8;D8'E= =&F&%F&c                    | j         t          d | j         D                       S t                                                      S )Nc              3  >   K   | ]}|                                 V  d S ru   )r.  r  s     ri   r   z6FusedSchedulerNode.has_side_effects.<locals>.<genexpr>F  s.      GG4t,,..GGGGGGrk   )r   r   rF  r.  r  s    ri   r.  z#FusedSchedulerNode.has_side_effectsC  s@    ;"GG4;GGGGGGww'')))rk   r~   r\   r   r\   r   r   r  rV  r  r  )r  r  r   r  r   r2  rT  r  r   rf  r  r  rW  r  )r   r/  r  )r   r1   r   r2  r  )%rv   rw   rx   r  ry   rz   rp   r  r  rF   r  r  ru  r  r  r  r~  r  r  r  r   r  r   r  r   r  r!  r#  r  r   r  r  r  r,  r.  r.  rP  rQ  s   @ri   r   r   J  sQ          $###+ + + [+B   , , , ,
    ]"( ( ( (TL L L L L L = = = ]=) ) ) ) N N N ]N   	B 	B 	B 	B/ / / /8 8 8 8 8 8 O O O ]O 
 
 
 ]

   A A A A : : : ]: > > > ]> ; ; ; ]; 9 9 9 ]9    ]    F F F ]F
" " " "" " " "" " " "* * * *4 * * * * * ]* * * * *rk   r   c                  8     e Zd Zd fdZdd	ZddZddZ xZS )FusedMixOrderReductionsr~   r\   r   r   r2  c                   t                               |          s t                               |          sJ ||}}|| _        || _        t	                                          |j        t          |                                          t          |                                          z              t           	                    | j                  | _
        d S ru   )r   r   r~   r   rF  ru  r  r  r   r   numel)rh   r~   r   rI  s      ri   ru  z FusedMixOrderReductions.__init__K  s     33E:: 	($77>>>>> %5E

OT%//"3"344tEOO<M<M7N7NN	
 	
 	
 '00<<


rk   other_nodestuple[BaseSchedulerNode, ...]c                *   t          |t                    rJ t          |t                    rJ | j                            ||d          sdS t                              |          rt                              |          sdS d
d}d
d}|r4 |||f           ||          z  s ||           |||f          z  rdS |                                 p=t          j        t          | j        
                    ||d	                    | j        k    S )a  
        node1 is from the current mix order reduction; node2 is another node we want to fuse in.

        other_nodes are passed in to check if fusion will introduce producer/consumer relationship
        between the inner and outer reduction. If yes, we don't fuse.
        Fallow_mix_order_reductionr  rE  r   r_  c                F    t                      } |j        d | D              S )Nc              3  $   K   | ]}|j         V  d S ru   r  r  s     ri   r   zTFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_ancestors.<locals>.<genexpr>u  s$      ::qq{::::::rk   r   r  r  r   s     ri   _get_ancestorszAFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_ancestorss  s)    ,,C39::E:::;;rk   c                F    t                      } |j        d | D              S )Nc              3  >   K   | ]}|                                 V  d S ru   )r   r  s     ri   r   zZFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_operation_names.<locals>.<genexpr>{  s.      FF1q4466FFFFFFrk   rK  rL  s     ri   _get_operation_nameszGFusedMixOrderReductions.sub_node_can_fuse.<locals>._get_operation_namesw  s+     ,,C39FFFFFGGrk   )count_bytes)r  rE  r   r_  )r   rA  r  r   r   r   r   typingcastr   score_fusion_memoryrC  )rh   r~   r   rD  rM  rP  s         ri   sub_node_can_fusez)FusedMixOrderReductions.sub_node_can_fuseW  ss    e%<=====e%<=====
 ~&&ueu&UU 	5 //
 
 	#66u==	 5	< 	< 	< 	<	H 	H 	H 	H  	u~..1E1Ek1R1RR {++.B.BE5>.R.RR u ""$$$ {T^77uRW7XX  z	
rk   otherc                h   t          |t                    sD|                     | j        || j        f          p!|                     | j        || j        f          S |                     | j        |j        | j        |j        f          o,|                     | j        |j        t                                S ru   )r   rA  rU  r~   r   r   rh   rV  s     ri   can_fuse_withz%FusedMixOrderReductions.can_fuse_with  s    %!899 		K))
EDJ=  J''
EDJ=IIJ ))
EK$*ek)B  K((U[%''JJKrk   c                6   | j                                         }| j                            |          }t	          |t
                    rP|                    | j         |j                   }|                    | j        |j                  }t          ||          S |                     | j         || j        f          r0|                    | j         |          }t          || j                  S |                    | j        |          }t          | j         |          S ru   )	r~   r   r  rc  r   rA  rp   r   rU  )rh   rV  r  backendfused_node1fused_node2r7  s          ri   	fuse_withz!FusedMixOrderReductions.fuse_with  s    &&((.,,V44e455 
	G!,,tz5;??K!,,tz5;??K*;DDD%%dj%$*GG G$\\$*e<<
.z4:FFF$\\$*e<<
.tz:FFFrk   r*  )r~   r\   r   r\   rD  rE  )rV  r\   )rv   rw   rx   ru  rU  rY  r^  rP  rQ  s   @ri   rA  rA  J  s        
= 
= 
= 
= 
= 
=2
 2
 2
 2
h
K 
K 
K 
KG G G G G G G Grk   rA  c                      e Zd ZU dZd,dZd-dZed.d
            Zed/d            Z	 	 	 d0d1 fdZ	ed2d            Z
ed3d            ZeZded<   ed4d            Zed3d             Zd5d!Zd5d"Zd6d#Zd7d$Zd8d&Zd9d(Zd:d+Z xZS );ForeachKernelSchedulerNodez
    This is a schedular node that consists of a set of scheduler nodes that
    has no data dependencies among them and can be executed in parallel.
    producerr\   r   r  c                    |                                 D ]>}|                                | j        v r!| j        |                                         c S ?d S ru   )r~  r  read_to_node)rh   ra  r   s      ri   get_consumer_subnode_forz3ForeachKernelSchedulerNode.get_consumer_subnode_for  s]     '')) 	9 	9C||~~!222(8888 3 trk   consumerc                   t          t                               }|j        j        D ]h}|j        | j        j        vr| j        j        |j                                                 }|| j        v r |	                    | j        |                    it          |          dk    rt          t          |                    S d S Nr   )r   r\   r   r   r   r  r@  r  name_to_noder  r   r  r  )rh   re  	producersrd	node_names        ri   get_producer_subnode_forz3ForeachKernelSchedulerNode.get_producer_subnode_for  s     0133	&, 	< 	<Bwdn888227;LLNNID---d/	:;;; y>>QY(((4rk   rl   c                
   t          |          }                                r|                                rt          j        t                    t          j        t          |          }t          j                  t          |j                  k    }|s |d           |o2t          fdt          j        |j                  D                       S |                                rz	                                r |d           dS t          j        t          |          }|
                              }||j                            |          S  |d           dS                                 rz|	                                r |d           dS t          j        t                                        |          }|j                            ||          S  |d           dS t          d          )	Nzforeach do not have same lengthc              3  T   K   | ]"\  }}j                             ||          V  #d S ru   )r  r   )r   lrra  s      ri   r   z6ForeachKernelSchedulerNode.can_fuse.<locals>.<genexpr>  sN       ) )Aq "++Aq11) ) ) ) ) )rk   zXcandidate producer is a reduction, foreach ops cannot be fused with reductions currentlyFz5candidate producer is not dep of any foreach consumerzXcandidate consumer is a reduction, foreach ops cannot be fused with reductions currentlyz5candidate consumer has no dep in any foreach producerzXAt least one node passed to ForeachKernelSchedulerNode.can_fuse should be a foreach node)r  r'  rR  rS  r`  r   r   r   r  r   rd  r  r   rl  r  )ro   ra  re  whyforeach_matchconsumer_subnodeproducer_subnodes    `     ri   r   z#ForeachKernelSchedulerNode.can_fuse  s<   (++   &	X%8%8%:%: &	{#=xHHH{#=xHHH00C4H4HHM  75666  S ) ) ) )AA) ) ) & &    "" 	$$&& n   u{#=xHHH'@@JJ+)228=MNNNCGHHH5  "" 	$$&& n   u{#=xHHH'@@JJ+)223CXNNNCGHHH5f
 
 	
rk   c                   |                                 s|                                 sJ |                                 r)t          j        t          |          }|j        }|j        }n(t          j        t          |          }|j        }|j        }d }d }|                                 rn|                                 rZt          j        t          |          }t          j        t          |          }d t          |j        |j                  D             }nO|                                 rt          j        t          |          }|                    |          }g }|}d }|j        D ]N}	|	|u r3t          
                    |	|          }
|
}|                    |
           9|                    |	           On|                                 rt          j        t          |          }|                    |          }g }|}d }|j        D ]N}	|	|u r3t          
                    ||	          }
|
}|                    |
           9|                    |	           Ont          d           | |j        |||||          S )Nc                J    g | ] \  }}t                               ||          !S r{   )r   rp   )r   ro  rp  s      ri   r   z3ForeachKernelSchedulerNode.fuse.<locals>.<listcomp>	  s<       Aq #''1--  rk   zTAt least one node passed to ForeachKernelSchedulerNode.fuse should be a foreach node)use_custom_partition_algoprev_node_1prev_node_2enable_autotune)r'  rR  rS  r`  rw  rz  r  r   rl  r   rp   r   rd  r  r  )ro   ra  re  rw  rz  rx  ry  fused_nodesrt  r   new_noders  s               ri   rp   zForeachKernelSchedulerNode.fuse  s    ""$$=(;(;(=(====   	7{#=xHHH(0(J%&6OO{#=xHHH(0(J%&6O   &	X%8%8%:%: &	{#=xHHH{#=xHHH AA  KK   "" 	{#=xHHH'@@JJK"KK  - -+++166tXFFH"*K&&x0000&&t,,,,-   "" 	{#=xHHH'@@JJK"KK  - -+++166xFFH"*K&&x0000&&t,,,,- !f   s&?##+
 
 
 	
rk   NFr  r  r   r  rw  rx  ry  rz  r2  c                    i  _         i  _        ||ht                                          ||           |D ]A}|j        j        D ]}| j         |j        <   |                                D ]}	| j        |	<   Bn| _        | _	        d  _
        g  _                             t          j                            |j        |j        g                     t!           fdt!          j        |j        |j                  D                        j        j        z
   _        t)          |j        |j        g           _        t-          |j        |j        g           _        |                                rt3          |t4                    sJ ||}}
nt3          |t4                    sJ ||}}
|
j         _         j                            |j                   |
j         _        |                                D ]}	| j        |	<   d  j	        D              _        | _        |d                                         }|sJ |tA          j!        d          fff _"        t!          tF          j$        j%                              _&        | _'        d S )Nc              3  R   K   | ]!}|j                                         v|V  "d S ru   r  r  s     ri   r   z6ForeachKernelSchedulerNode.__init__.<locals>.<genexpr>\	  sL         xt'<'<'>'>>>	  ?>>> rk   c                R    i | ]$}|j                                         D ]\  }}||	%S r{   )ri  items)r   r  r  vs       ri   r}  z7ForeachKernelSchedulerNode.__init__.<locals>.<dictcomp>w	  sW     @ @ @%:O:U:U:W:W@ @26!Q1@ @ @ @rk   r   combo_kernel)(rc  rh  rF  ru  r   r   r   r   r  r   r   r  r  r(   r  r  r   r  rn  r  r  ra  r  rb  r'  r   r`  r   r  ri  rw  r   r   Exprr   r  fxNoderb  rz  )rh   r  r   rw  rx  ry  rz  r   rL  r   foreach_noder   r  rI  s   `            ri   ru  z#ForeachKernelSchedulerNode.__init__:	  s    +"5GGY/// 3 3 ,2 8 8D37D%di00 4466 3 3D.2D%d++3	3 'DN DKDI)+DJ  '22 ,k.EF        )/#68V        ")* # !+"79N!OPPDN +"79N!OPPDN%%'' D!+/IJJJJJ+6j!+/IJJJJJ+6j)3DNN!!*"6777 , 9D"6688 5 5*4!$''@ @"&+@ @ @D  *C&%%''v
> : :<>?
!%(-022.rk   r  c                $   d |D             }|r3t                               dt          |          d |D                        d |D             }|r(t                               dt          |                     d |D             }|r(t                               dt          |                     d |D             }d	 |D             }|r(t                               d
t          |                     d |D             }d |D             r)t                               dt                               fd|D             }t          j        rBd |D             }|r(t                               dt          |                     d |D             }|S )Nc                <    g | ]}t          |t                    |S r{   )r   r  r  s     ri   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>	  s(    OOOj4M&N&NO!OOOrk   z/ComboKernels: %d external nodes are filtered %sc                N    g | ]"}|j         	|j                                         #S ru   r   r\  r  s     ri   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>	  s-    UUUTty?T&&((?T?T?Trk   c                <    g | ]}t          |t                    |S r{   )r   r  r  s     ri   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>	  s(    KKKz!5I'J'JK1KKKrk   z+ComboKernels: %d grouped nodes are filteredc                <    g | ]}t          |t                    |S r{   )r   rA  r  s     ri   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>	  s(    PPP1A7N)O)OPQPPPrk   z;ComboKernels: %d FusedMixOrderReductions nodes are filteredc           	     b    g | ],}t          |t          t          t          t          f          *|-S r{   )r   rG  r  r  rA  r  s     ri   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>	  sL     
 
 
*-(+	 

 
 
rk   c                <    g | ]}t          |t                    |S r{   r   r`  r  s     ri   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>	  s8     
 
 
A7Q)R)R

 
 
rk   z+ComboKernels: %d foreach nodes are filteredc                <    g | ]}t          |t                    |S r{   r  r  s     ri   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>	  s8     
 
 
Z;U-V-V

 
 
rk   c                :    g | ]}|                                 |S r{   r  r  s     ri   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>	  s%    GGGq}}G!GGGrk   z0ComboKernels: %d template nodes are filtered: %sc                    g | ]}|v|	S r{   r{   )r   r;  r  s     ri   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>	  s#    OOOq7N7N!7N7N7Nrk   c                :    g | ]}|                                 |S r{   r&  r  s     ri   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>	  s'    MMMQANN<L<LMqMMMrk   zCComboKernels: %d reduction nodes are filtered (pointwise_only mode)c                :    g | ]}|                                 |S r{   r&  r  s     ri   r   z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>	  s'    PPPAq~~?O?OPaPPPrk   )r  r&  r   r&   combo_kernels_pointwise_only)	ro   r  externgrouped	mix_orderfiltered_nodesforeach_nodesreduction_nodesr  s	           @ri   combinable_nodesz+ForeachKernelSchedulerNode.combinable_nodes	  s     POUOOO 	IIAFUUVUUU  
 LKeKKK 	II=G   QPPPP	 	IIMI  

 

 
 

 
%
 
 
  	YIICSEWEWXXX
 
%
 
 
 HG^GGG 	IIBN##  
 POOO^OOO . 	QMM.MMMO 		Y((   QPPPPNrk   list[list[BaseSchedulerNode]]c                  	 |                                  }g }d	t          d |D                       }|D ]}t          t                    }|D ]b}|                                }|r|j        dk    s|j        dk    r/|                                |z  rG||                             |           c|                                D ]@|	                    	fdt          dt                    	          D                        A|S )zS
        Returns a list of lists of nodes that are to be grouped together.
           c                t    g | ]5}|D ]0}t          |t                    |                                D ]}|16S r{   )r   rA  r  )r   r   r   r  s       ri   r   zUForeachKernelSchedulerNode._default_group_nodes_for_combo_kernels.<locals>.<listcomp>	  sw       !  d$;<<	
 !% 5 5 7 7 
 	     rk   mpsr  c                *    g | ]}||z            S r{   r{   )r   r  device_nodesmax_num_nodess     ri   r   zUForeachKernelSchedulerNode._default_group_nodes_for_combo_kernels.<locals>.<listcomp>	  s8        %Q]):%:;  rk   r   )_topological_sort_nodesr   r   r  r   r   r   r   r   r  r  r   )
r  sorted_nodesgrouped_nodesexcluded_buffer_namesr  device_groupsr   r  r  r  s
           @@ri   &_default_group_nodes_for_combo_kernelszAForeachKernelSchedulerNode._default_group_nodes_for_combo_kernels	  st    !88::1; )  2
 2
 " 	 	E D!!   3 3** v{e33v{e7K7K ))++.CC f%,,T2222 !. 4 4 6 6  $$    !&q#l*;*;]!K!K      rk   4Callable[[Scheduler], list[list[BaseSchedulerNode]]]!group_algorithm_for_combo_kernelscustom_group_algorithmc                    | t           _        d S ru   r`  r  )r  s    ri   %set_group_algorithm_for_combo_kernelsz@ForeachKernelSchedulerNode.set_group_algorithm_for_combo_kernels	  s    
 # 	#DDDrk   c                6    t                               |           S ru   r  r  s    ri   group_nodes_for_combo_kernelsz8ForeachKernelSchedulerNode.group_nodes_for_combo_kernels	  s     *KKIVVVrk   c                    t           ru   r4  rg   s    ri   r  z#ForeachKernelSchedulerNode.mark_run
  r6  rk   c                    t           ru   r4  rg   s    ri   rA  z"ForeachKernelSchedulerNode.codegen
  r6  rk   c                    dS rM  r{   rg   s    ri   r'  z%ForeachKernelSchedulerNode.is_foreach	
  r  rk   c                *    t          | j                  S )zeReturns a list of nodes which comprise the combo kernel.
        These nodes may be vertically fused.)r  r   rg   s    ri   get_subkernel_nodesz.ForeachKernelSchedulerNode.get_subkernel_nodes
  s     DK   rk   r  c                x    t          t          j                            d | j        D                                 S )zqReturns all nodes contained in this kernel, unpacking fused nodes
        into their constituent scheduler nodes.c              3  >   K   | ]}|                                 V  d S ru   )r   r  s     ri   r   z7ForeachKernelSchedulerNode.get_nodes.<locals>.<genexpr>
  s*      1U1UA!++--1U1U1U1U1U1Urk   )r  r  r  r  r   rg   s    ri   r   z$ForeachKernelSchedulerNode.get_nodes
  s3     IO111U1U1U1U1UUUVVVrk   r   c                @    | j         d                                         S r   )r   r  rg   s    ri   r  z)ForeachKernelSchedulerNode.get_first_name
  s    {1~,,...rk   r  r  c                z    t          | || j        j                   | j        D ]}|                    |           d S ru   )r  r  r@  r   r  )rh   r  r   s      ri   r  z/ForeachKernelSchedulerNode.prune_redundant_deps
  sO     	d$68RSSSK 	: 	:D%%&89999	: 	:rk   )ra  r\   r   r  )re  r\   r   r  ra  r\   re  r\   r   rl   )ra  r\   re  r\   r   r`  )NNF)r  r  r   r  rw  rl   rx  r  ry  r  rz  rl   r   r2  r  r  r   r  )r  r  r   r  )r  r  r   r2  rV  rW  r   r  r  rT  r  )rv   rw   rx   r  rd  rl  rz   r   rp   ru  r  r  r  r  ry   r  r  r  rA  r'  r  r   r  r  rP  rQ  s   @ri   r`  r`    s         
      & ,
 ,
 ,
 [,
\ >
 >
 >
 [>
J 4837 %F/ F/ F/ F/ F/ F/ F/P ? ? ? [?B * * * \*\ 	/ & / / / / 
 
 
 \
 W W W \W
" " " "" " " "   ! ! ! !
W W W W
/ / / /: : : : : : : :rk   r`  c                       e Zd ZU dZded<   ed"d            Z	 d#d$ fdZd%dZd&dZ	e
d'd            Zd'dZe
d(d            Zd)dZe
d*d            Zd+dZd,dZed-d!            Z xZS ).r  aC  
    This is a "fake" scheduler node that represents a group of scheduler nodes
    that are meant to be *grouped* together (it does not allow another node to be scheduled
    in between its constituent nodes, nor does it allow another node to fuse into any of its constituent nodes).
    The way it does this is by maintaining its unmet dependencies as the union of its constituent nodes.
    Fusion will still happen among the nodes within each GroupedSchedulerNode.
    At codegen time, this scheduler node will be unpacked and codegen is called on each constituent node.
    r  r   r   c                    |d         j         t          fd|D                       sJ  | |          }|D ]}|j        |                                <   |j        |                                <   |S )Nr   c              3  *   K   | ]}|j         u V  d S ru   r  )r   r   r  s     ri   r   z.GroupedSchedulerNode.create.<locals>.<genexpr>1
  s*      BB44>Y.BBBBBBrk   )r  r   r  r  )ro   r   grouped_snoder  r  s       @ri   createzGroupedSchedulerNode.create.
  s    1I'	BBBB6BBBBBBBBIv.. 	K 	KE=JI()9)9::AN	$]%;%;%=%=>rk   Fr  r  temp_groupingrl   r2  c                z    t                                          |           t          | ||           || _        d S ru   )rF  ru  r  r  )rh   r  r   r  rI  s       ri   ru  zGroupedSchedulerNode.__init__8
  s?     	###i000 +rk   c                    | j         r| j        S | j        D ]#}|| j        j        |                                <   $| j        j        |                                 = | j                            | j                  S )z
        Do fusion among nodes within this GroupedSchedulerNode,
        and then unpack this GroupedSchedulerNode into regular nodes.
        )r  r   r  r  r  
fuse_nodes)rh   r  s     ri   unpackzGroupedSchedulerNode.unpackG
  ss    
  	;[ 	H 	HEBGDN-enn.>.>??N-dmmoo>~((555rk   fake_depr1   c                    |                      | j                            |                     | j                            |           d S ru   )r  r   r  rn  r  )rh   r  s     ri   r  z!GroupedSchedulerNode.add_fake_depT
  sD    T-77AABBB##H-----rk   r   c                J    d                     d | j        D                       S )Nr  c                6    g | ]}|                                 S r{   r{  r  s     ri   r   z1GroupedSchedulerNode.get_name.<locals>.<listcomp>Z
  r  rk   r  rg   s    ri   r  zGroupedSchedulerNode.get_nameX
  r	  rk   c                @    | j         d                                         S r   r  rg   s    ri   r  z#GroupedSchedulerNode.get_first_name\
  r  rk   r_  c                <    t          j        d | j        D              S )Nc                6    g | ]}|                                 S r{   r  r  s     ri   r   z9GroupedSchedulerNode.get_buffer_names.<locals>.<listcomp>a
  r  rk   r  rg   s    ri   r  z%GroupedSchedulerNode.get_buffer_names_
  r  rk   rf  c                l    g }| j         D ])}|                    |                                           *|S ru   r  r  s      ri   r~  z GroupedSchedulerNode.get_outputsc
  r  rk   r  c                    t          t          d d |                                 D                                 }t          |          dk    rd S t	          |          }|S )Nc              3     K   | ]@}|                                 s|                                *|                                V  Ad S ru   r  r  s     ri   r   z6GroupedSchedulerNode.estimate_flops.<locals>.<genexpr>o
  r  rk   r   r  r  s      ri   r  z#GroupedSchedulerNode.estimate_flopsi
  r   rk   r  c                    | j         S ru   r  rg   s    ri   r   zGroupedSchedulerNode.get_nodes{
  r  rk   rR  c                R    | j         r| j         d                                         nd S r   )r   r   rg   s    ri   r   zGroupedSchedulerNode.get_device~
  s&    .2kCt{1~((***tCrk   ra  r\   re  c                    dS r  r{   )ro   ra  re  s      ri   r   zGroupedSchedulerNode.can_fuse
  r  rk   )r   r  r   r  )F)r  r  r   r  r  rl   r   r2  r  )r  r1   r   r2  rT  r  r?  r  r  rY  r  )rv   rw   rx   r  ry   rz   r  ru  r  r  rF   r  r  r  r~  r  r   r   r   rP  rQ  s   @ri   r  r  "
  s          $###   [ $	+ + + + + + +6 6 6 6. . . . = = = ]=) ) ) ) N N N ]N       ]"   D D D D    [    rk   r  r{   stride_lengthslist[list[int]]r}  r  priority_idxry  	list[int]c           
     :    t           j        d	 fd            }t          t          t	          t           d                                                 }t          |          dk    r fd|D              t          j        r|                    |           |S )
z
    A heuristic to decide loop iteration orders.  This has not been well
    tuned and may be something we should autotune.
    r  r   br   c                              dk    s         dk    r$t                    dk             dk              S  fdD             }fdD             }t          d t          ||          D                       }t          d t          ||          D                       }||k    rdS ||k    rdS t                     S )Nr   c                :    g | ]}t          |                   S r{   abs)r   slr  s     ri   r   z6pick_loop_order.<locals>.index_cmp.<locals>.<listcomp>
  #    <<<rBqE

<<<rk   c                :    g | ]}t          |                   S r{   r  )r   r  r  s     ri   r   z6pick_loop_order.<locals>.index_cmp.<locals>.<listcomp>
  r  rk   c              3  4   K   | ]\  }}|d k    p||k     V  dS r   Nr{   r   sl_asl_bs      ri   r   z5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>
  D       
 
)3tDAI$
 
 
 
 
 
rk   c              3  4   K   | ]\  }}|d k    p||k     V  dS r  r{   r  s      ri   r   z5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>
  r  rk   r   )rG   r   r  )r  r  stride_len_astride_len_ba_firstb_firstr}  r  s   ``    ri   	index_cmpz"pick_loop_order.<locals>.index_cmp
  s   8q==E!HMMuQx1}eAh!m444 =<<<^<<<<<<<^<<<  
 
7:<7V7V
 
 
 
 
  
 
7:<7V7V
 
 
 
 
 W2W1 1ayyrk   r   c                     g | ]
}|         S r{   r{   )r   pir  s     ri   r   z#pick_loop_order.<locals>.<listcomp>
  s    DDD.,DDDrk   r-  )r  r   r  r   r   r   )		functools
cmp_to_keyr  r   r  r   r&   pick_loop_orderssort)r  r}  r  r  orders   ``   ri   pick_loop_orderr  
  s           4 %N1$5 6 6778899E
<1DDDD|DDD "

y
!!!Lrk   	orig_nodeir.MultiTemplateBufferr|  ir.OperationBufferc                z   |                                 }|                                  }t          |t                    rt          |t                    sJ |                                }|                                 }t          |t                    rt          |t                    sJ t          j        j        |= ||_        t          j        j        |= ||_	        t          j        j
                            |           }t          j        j
                            |           |t          j        j
        |<   |t          j        j        |<   t          j        j                            |           }t          j        j                            |           |t          j        j        |<   |t          j        j        |<   d S ru   )r  r   r   r  rW   r   r  r   
name_to_opoperation_namebuffersr   remove
operations)r  r|  replaced_buf_nameorig_buf_namereplaced_op_nameorig_op_nameorigs          ri   _replace_operation_bufferr  
  sb    !))++&&((MmS))Pj9JC.P.PPPP2244//11LlC((NZ8H#-N-NNNN	01!HM	+,*H7?  ++DGO8$$$$AGOD,4AG=)7##I..DGh''''AGt'/AG|$$$rk   r  c                    |                                 }|                                 }||z
  }||z  }|d|z   z  }||z  S rg  )ro  rq  )r~   r   epilogue_runtimetotal_read_bytestemplate_write_bytesextra_bytesextra_bytes_ratioextra_memory_ratios           ri    _estimate_fused_epilogue_runtimer  
  s^     2244 7799"%99K#&:: +a2C.CD 000rk   c                  V    e Zd ZU ded<   dZded<   dZded<   dd	ZddZddZddZ	dS )NodeUser$Union[BaseSchedulerNode, OutputNode]r   Frl   r,  is_weakr   r   c                h    t          | j                                        | j        | j        f          S ru   )r  r   r  r,  r  rg   s    ri   r   zNodeUser.__hash__
  s*    TY''))4+;T\JKKKrk   rV  objectc                    t          |t                    oI|                                 |                                k    o| j        |j        k    o| j        |j        k    S ru   )r   r  r  r,  r  rX  s     ri   __eq__zNodeUser.__eq__
  sY    uh'' .5>>#3#33. E$55. -		
rk   r   c                4    | j                                         S ru   r0  rg   s    ri   r  zNodeUser.get_name
  r1  rk   c                ~    | j         |j         u sJ t          | j         | j        o|j        | j        o|j                  S ru   )r   r  r,  r  rX  s     ri   rL  zNodeUser.merge
  sH    yEJ&&&&I2!2L*U]
 
 	
rk   NrU  )rV  r  r   rl   rT  )rV  r  r   r  )
rv   rw   rx   ry   r,  r  r   r  r  rL  r{   rk   ri   r  r  
  s         ....K GL L L L
 
 
 
$ $ $ $
 
 
 
 
 
rk   r  rl   c                     t           j        S ru   )r&   r  r{   rk   ri   *used_non_deterministic_runtime_estimationsr    s    33rk   	ir.IRNodeOrderedSet[sympy.Symbol]c                   t                      }|                                 }t          |t          j                  r|                    t          |j                  t          |j                  z  t          |j	                  z             t          |t          j
                  r'|                    t          |j                             n|J d|             |S )z=Get free symbols from a node's layout (size, stride, offset).Nz*Expect layout to be None but found layout=)r   maybe_get_layoutr   r)   Layoutr  r    r   strideoffsetrH  get_layout_symintsr]  )r   free_symbol_usesr&  s      ri   r"  r"    s    1;""$$F&")$$ 
U%%6=))*6=))*	
 	
 	

 fb;<< 	G##$6v}$E$EFFF~~TFTT~~~rk   c                   t          | t                    r% t                      j        d | j        D              S | j        J | j                                        } |j        d | j                                        D               |S )z
    Gets symbols used in a scheduler node, including free symbols from
    the node's operations and layout symints from outputs.
    c              3  4   K   | ]}t          |          V  d S ru   get_scheduler_node_symbol_uses)r   r  s     ri   r   z1get_scheduler_node_symbol_uses.<locals>.<genexpr>   s+      MM,U33MMMMMMrk   Nc              3  4   K   | ]}t          |          V  d S ru   )r"  )r   ir_nodes     ri   r   z1get_scheduler_node_symbol_uses.<locals>.<genexpr>%  s+      	M	M'
W
%
%	M	M	M	M	M	Mrk   )	r   r   r   r  r   r   get_free_symbol_usesr  r~  )r   r#  s     ri   r'  r'    s     $*++ 
!z||!MMMMM
 	
 9   y5577	M	MTY5J5J5L5L	M	M	M  rk   r~   r   c                l    |                                  o t          j        o|                                  S ru   )r#  r&   epilogue_fusionr   s     ri   is_epilogue_fusionr-  *  0    U6#9U%BSBSBUBU>UUrk   c                l    |                                 o t          j        o|                                   S ru   )r#  r&   prologue_fusionr   s     ri   is_prologue_fusionr1  .  r.  rk   c                B    t          | |          pt          | |          S ru   )r-  r1  r   s     ri   is_template_fusionr3  2  s"    eU++O/A%/O/OOrk   c                *    t          | |          r|n| S ru   r-  r   s     ri   template_fusion_pw_noder6  6  s    &ue44?55%?rk   c                  J    e Zd ZdZddZd fdZdd	Zedd            Zej	        dd            ZddZ
ddZddZddZddZddZddZddZddZdd!Zdd#Zdd$Zdd%Zdd&Zdd'Zdd*Z	 ddd/Zdd3Zdd6Zdd7Zdd=Zdd?Z	 dddAZ ddEZ!ddFZ"ddIZ#ddLZ$ddOZ%ddVZ&ddWZ'ddZZ(dd[Z)ddd]Z*dd^Z+dd_Z,dd`Z-ddaZ.ddcZ/dddZ0ddgZ1ddhZ2ddiZ3ddjZ4ddoZ5ddqZ6	 	 dddvZ7ddwZ8ddzZ9ddZ:dddZ;	 	 	 dddZ<ddZ=ddZ>ddZ?ddZ@ddZAddZBddZCddZDddZEddZFddZGeHdd            ZIddZJddZKddZLddZMddZNddZOddZPddZQddZRddZSddZTddZUddZVd dZWddZXddZYddZZddZ[ddZ\ddZ]ddZ^ddZ_ xZ`S (  r  z
    A Scheduler is a graph of BaseSchedulerNodes. It is responsible for
    optimizations such as fusion, reorder, and graph partition.
    r  list[ir.Operation]r   r2  c                    t          d          5  |                     |           d d d            d S # 1 swxY w Y   d S )NzScheduler.__init__)r   _initrh   r  s     ri   ru  zScheduler.__init__@  s    .// 	 	JJu	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   377c           
     l    t                                                        t          j        _        i  _        t          t                     _        t          j
                     _        t                       _        t          g t          j        j                                        t          j        j                                        t          j        j                                                   _         fd|D              _        d  _        d  _                                           j                            t          j        j                                                    j        D ]}|                                 d  _                                          _        d  j        D              _        d  j        D              _         j                                         _        i  _         i  _!        t                       _"        tG          j$         j         j         j                   _         %                                  &                     j                   _         '                                 d  j        D              _         (                                 tR          xj*        tW           j                  z  c_*        ddl,m-}m.}  | j                   tW           j                   _/         0                                  &                     j                   _        t          tb          td          td          f                               _3        th          j5        ti          j5         j                   _        th          j6        r/ddl7m8} |9                                 (                                  :                     j                   _        th          j;        ti          j;         j                   _         <                                  =                                 th          j>        sth          j?        r6t                      r(t          jB        jC        jD        E                                 th          jF        r@t          d	d
d
          5   H                    d            d d d            n# 1 swxY w Y   th          jI        rddlJmI}  | j         j         j        t          t          j        j                                                  t          t          j        K                                                     _        th          jL        sth          jM        rth          jI        sddlJmN}  | j         j                   t                      r`t          jQ        rTth          jR        st          jS        r<d} j        D ]}t          |jU                  rd
} n|rddl#mV}	  |	 j                   t          jW        rddlXmY}
  |
dd  fd           tG          jZ         j                   _         [                                 th          j\        r`th          j]        j^        rOth          j]        j_        r> `                     j                   _         a                     j                   _         b                                 t          jB        j4        jc        jd        r e                                  | j                   t          jf        g                     j                    h                                 t                       _i        i  _j        t          d          l                     fd           t                       _m        d S )Nc                :    g | ]}                     |          S r{   )create_scheduler_noder   r  rh   s     ri   r   z#Scheduler._init.<locals>.<listcomp>S  s'    CCCd0033CCCrk   c                8    i | ]}|                                 |S r{   r{  r  s     ri   r}  z#Scheduler._init.<locals>.<dictcomp>b  s/     ;
 ;
 ;
 !AJJLL!;
 ;
 ;
rk   c                f    i | ].}|                                 D ]}|                                |/S r{   )r~  r  )r   r   r   s      ri   r}  z#Scheduler._init.<locals>.<dictcomp>f  sO     8
 8
 8
$($BRBRBTBT8
 8
;>CLLNNC8
 8
 8
 8
rk   c                8    i | ]}|                                 |S r{   r{  r  s     ri   r}  z#Scheduler._init.<locals>.<dictcomp>  s"    "G"G"Gq1::<<"G"G"Grk   r   )log_ir_post_fusionlog_ir_pre_fusionr   )distributed_autotunez#Scheduler.create_combo_kernel_nodesTlog_pt2_compile_eventlog_waitcounter)num_ck_nodes)reorder_for_peak_memory)1assign_memory_planning_info_for_scheduler_buffersF)6align_runtime_estimations_across_all_distributed_ranks)trace_structuredartifactc                     dddS )N#scheduler_nodes_before_comm_overlapstring)r   encodingr{   r{   rk   ri   rs  z!Scheduler._init.<locals>.<lambda>  s     E$,) ) rk   c                 f    d                     d t           j                  D                       S )Nz

c                z    g | ]8\  }}d | d|                                 z   d|                                 z   9S )zsnode[r$  z buffer_names:)r.  r  r  s      ri   r   z5Scheduler._init.<locals>.<lambda>.<locals>.<listcomp>  sc        !%1 *QMMMkkmm,Eq/A/A/C/CEEF  rk   )r  r  r  rg   s   ri   rs  z!Scheduler._init.<locals>.<lambda>  s=    v{{  )2$*(=(=	  ( ( rk   )metadata_fn
payload_fngraph_statsc                 H     j          j        t           j                  dS )N)graph_idnum_nodes_before_fusionnum_nodes_after_fusion)post_grad_graph_idnum_orig_nodesr   r  rg   s   ri   rs  z!Scheduler._init.<locals>.<lambda>  s&     3+/+>*-dj//  rk   )nrF  ru  rW   r   r  backendsr  _post_grad_graph_counterr\  r  count_graph_partition_counterr   rD  r  r  	constantstorchbind_constantsr  r  previous_nodecurrent_nodeupdate_zero_dim_cpu_tensorr  r  default_device_contextget_donated_buffersr?  rh  r@  copyr  r  re  seen_template_fusionsr%   decide_global_ordering_of_commsrZ   topological_sort_scheduledead_node_eliminationcompute_ancestorsr*   ir_nodes_pre_fusionr   torch._inductor.debugrC  rD  r]  create_foreach_nodesr   r   logged_slow_fusionr&   _pre_fusion_custom_passdistributed_max_autotune_gemmr  rE  scheduler  _post_fusion_custom_passr  finalize_multi_template_buffersmax_autotune_gemmmax_autotuner   r  r  select_algorithmPrecompileThreadPoolshutdown_instancecombo_kernelsr   create_combo_kernel_nodesrJ  memoryget_output_namesdeterministic reorder_for_compute_comm_overlaprK  r  r'   6runtime_estimations_align_across_all_distributed_ranksr  r  rP   r   rL  reorder_sink_verbose_loggingtorch._loggingrM  $reorder_compute_and_comm_for_overlapprocess_grouped_nodesgraph_partitionr   r[   %reorder_for_reducing_graph_partitions&maybe_reorder_for_minimizing_partition,reorder_for_partition_with_simple_dependencycompute_last_usagetest_configstrack_memory_lifecycleinsert_memory_check_nodesr&  graph_diagramdebug_draw_graphbuffer_names_to_freeorigin_to_indexr   add_rowremoved_ops)rh   r  r   rC  rD  rE  rJ  rK  has_collectivesrL  rM  rI  s   `          ri   r:  zScheduler._initD  s    <>"&'?"@"@(1(9(9%5?\\!&0%**,,"'')) ,1133'
 '
# DCCCUCCC
:>9='')))#**17+<+A+A+C+CDDDJ 	 	DOO ?C# $$&& 	#;
 ;
%)Z;
 ;
 ;
8
 8
,0J8
 8
 8
 AE@Q@V@V@X@X 35 13 LL 	"
 :J#
 

 	!!###33DJ??
""$$$"G"GDJ"G"G"G    	##s4:6##OOOOOOOO$*%%%!$*oo!!###33DJ??
",U38_"="?"?)57
CCDJ/ 	%...... ))$///""$$$__TZ00
*68DDDJ,,...$	V(.(;	V&((	V O,ASSUUU 	B5&* $   B B
 ..D.AAAB B B B B B B B B B B B B B B ) 		77777700
 '17/44667717335566 DJ # /	P(O /	P1 UUUUUUAAJ 0  
 ;<<W WW <	W
 $PW #( J  D$TY// *. # W      KJ4:VVV 8 ;;;;;;  ! !           CDJOODJ""$$$ "		W (			W C		W DDTZPPDJJJ4:VVDJ!!!?!.E 	-**,,,4:&&&	dj))) 6@\\! :<''//   	
 	
 	
 -7LLs   -SSS!dict[str, SchedulerDonatedBuffer]c                    i }t           j        j        D ][}t          t           j        j        |         t          j                  r*t          | t           j        j        |         d           ||<   \|S )N)r  )rW   r   graph_inputs_originalr   r)   DonatedBufferr]  )rh   name_to_donated_bufr   s      ri   rh  zScheduler.get_donated_buffers  sq     G1 	 	D!'7=r?OPP ,BG1$7 $- - -#D)
 #"rk   rR  c                $    t           j        j        S ru   rW   r   current_devicerg   s    ri   r  zScheduler.current_device&  s    w%%rk   r  c                (    |t           j        _        d S ru   r  r  s     ri   r  zScheduler.current_device*  s    !'rk   c                    t           j                            dd          dk    rddlm}  || j        d           dS dS )z,Generate an image of the graph for debuggingINDUCTOR_WRITE_SCHEDULER_GRAPHN1r   )draw_buffersT)print_graph)osenvironr  r&  r  r  )rh   r  s     ri   r  zScheduler.debug_draw_graph.  sV    :>>:DAASHH++++++L666666 IHrk   labelr   c                    t                               t          j                  r9t                               d|           | j        D ]}|                                 d S d S )Nz%s:)r  isEnabledForloggingINFOr  r  r  )rh   r  r   s      ri   debug_print_nodeszScheduler.debug_print_nodes5  sh    GL)) 	#HHUE"""
 # #  """"	# 	## #rk   r   rv  r\   c                d   |                                 
J d            |                                rt          | |          S t          |t          j        t          j        f          rt          | |          S t          |t          j                  rt          | |          S t          |          )Nz2All nodes passed to scheduling must have an origin)r\  is_no_oprG  r   r)   r   r`  r   r	  r  r5  r  s     ri   r>  zScheduler.create_scheduler_node;  s    !!--@ .-- ==?? 	,)$555r0"2CDEE 	, t,,,bo.. 	,,T4888%d+++rk   c                    t                      g } j                                        t          j        j                                        D ]~} fd|D             }|s                    |            fd|D             }t          j	        dk    }t           |d|          }|                    |           |D ]}| j        |<   fd j        D             t          |          z    _        d S )Nc                \    g | ](}|v t          j        |         t                    &|)S r{   )r   rh  rG  )r   r   kept_node_namesrh   s     ri   r   z2Scheduler.create_foreach_nodes.<locals>.<listcomp>N  sI       ?**"4#4T#:<RSS + ***rk   c                *    g | ]}j         |         S r{   )rh  r   r   rh   s     ri   r   z2Scheduler.create_foreach_nodes.<locals>.<listcomp>Y  s!    @@@$d'-@@@rk   r   Frw  rz  c                @    g | ]}|                                 v|S r{   r{  )r   r   removed_node_namess     ri   r   z2Scheduler.create_foreach_nodes.<locals>.<listcomp>h  s3     
 
 
4==??BT+T+TD+T+T+Trk   )r   r  r  rW   r   listsr   r  r&   combo_kernels_autotuner`  r   r  r  )	rh   fe_nodesnamesr   rz  fe_noder   r  r  s	   `      @@ri   rq  zScheduler.create_foreach_nodesH  sQ   .8ll16688W]))++ 	8 	8E    !  E  %%e,,,@@@@%@@@F$;a?O0*/ /	  G OOG$$$ 8 807'--8
 
 
 
!Z
 
 
NN


rk   c                B   $%&'  G $fddt           t                             $t          j        $          & j        D ]}|                                D ]}|                                }t          |j        j	        t          j                  r&t          |                                          dk    r`|                                D ]Y}|&v r8|&v r4&|         }&|         }||z   }&D ]}&|         |u s
&|         |u r|&|<   >|&v r&|         &|<   N&|         &|<   Zόd*' fd'	 	 d+d,&'fd}	i }
t          j        j                                        D ]x}t          |t$          j                  r|j        D ]}d|
|<   ,t          |t          j                  r2d |                                D             }|D ]}|j        D ]}d|
|<   yd} j        D ]r}|j        J t/          |j                                        d           }|D ];}t          |t$          j                  sJ d}||
vr|                                |
|<   <s j        D ]}t4                              d|j                   |r|j        J t/          |j                            d          d           }|D ]u}||
v sJ | d|
             |
|         x}V j        |                                         D ]6}|                    t?          |                                                     7vt          |j         j!                  dk    rEtE          tG          |j         j!                            x}rt          |tH                    r|j%        }nd}|                                D ]l}t          |&                                          dk    sJ |&                                D ],} '|          } |	||           |                    t?          ||                     &|         j'        D ]}|                                |                                k    r-t          |j        tP                    sJ |j                                        D ]}}|                                } '|          }||                                v }|                    tS          ||                                |                       |	||d           ~.nt          j        j*        |                                         D ]G} |	||d           |                    tS          ||                                d                     Ht          j        j+        |                                         D ]2} |	||d           |                    t?          |                     3|j         j,        D ]<}t          |tR                    s% |	|j-        ||.                    |                     =|/                     j0                   |                                D ]}|&                                D ]x}|                                 j0         '|          <   |                                 j0        |<    j1        2                    ||           j1        |                                <   yt          j        3                                D ]C}t4                              d|            |	|ti          t?          |                               D|rt          j        j5        D ]}|                    d          D ]}||
v s!J | d|
6                                             |
|         x}rd j        |         7                                D ]D}t4                              d||            |	|ti          t?          |                               E j0        D ]}|t          j        j        v rK |	|ti          t?          |                               t          j        j8        9                    |           `|t          j        j:        v r& |	|ti          t?          |                               d  tw          t          j        j        6                                          D             %%fd!t          j        j8        D             t          j        _<         j        D ]K}|                                D ]4}|=                    &|                                         j'                   5L j>        D ]-} j>        |         =                    &|         j'                   .t                      } | @                    d"           &'                                D ]^\  }}!| A                                5  d# |!j'        D             }"| @                    d$| d%|" d&           ddd           n# 1 swxY w Y   _| @                    d'           | B                                C                                }#t                              d(           t                              d)|#           dS )-zi
        Create dependency edges between nodes, handling aliasing and
        mutation properly.
        c                  6    e Zd ZdZ	 	 ddd	ZddZd fdZdS )1Scheduler.compute_dependencies.<locals>.DedupListan  
            This data structure behaves like a list except it makes sure the
            elements remain unique.
            Normally one could use a OrderedSet/dict for this purpose however
            the list in question gets elements appended as it is being
            iterated over which means that we need to keep the list
            semantics.
            Nr  Optional[list[_T]]
membershipOptional[OrderedSet[_T]]r   r2  c                B    |pg | _         |pt                      | _        d S ru   )r  r   r  )rh   r  r  s      ri   ru  z:Scheduler.compute_dependencies.<locals>.DedupList.__init__|  s#    
 #[b
","<
rk   	node_userr^   c                    || j         v rd S | j                            |           | j                             |           d S ru   )r  r  r   r  )rh   r  s     ri   r   z8Scheduler.compute_dependencies.<locals>.DedupList.append  sF    //F
!!),,,##I.....rk   rV  DedupList[_T]c                     t          j         j        |j                  } j         fd|j        D             z   } ||          S )Nc                &    g | ]}|j         v|S r{   )r  )r   r;  rh   s     ri   r   zMScheduler.compute_dependencies.<locals>.DedupList.__add__.<locals>.<listcomp>  s,     * * *at.F.FA.F.F.Frk   )r   r  r  r  )rh   rV  new_membership	new_items	DedupLists   `   ri   __add__z9Scheduler.compute_dependencies.<locals>.DedupList.__add__  sc    !+!1$/5CS!T!T J * * * *${* * * 	 !yN;;;rk   r  )r  r  r  r  r   r2  )r  r^   r   r2  )rV  r  r   r  )rv   rw   rx   r  ru  r   r  )r  s   ri   r  r  r  sr          -17;= = = = =/ / / /< < < < < < < <rk   r  r   r  r   r   c                F    | j         v r j         |                    S | S ru   )re  )r  r  rh   s    ri   r  z.Scheduler.compute_dependencies.<locals>.rename  s.    D)))vd3A6777Hrk   Fused_by_namer8  r  r,  rl   r  r2  c                n     |                                         t          |||                     d S ru   )r   r  )r  r8  r,  r  name_to_usersr  s       ri   add_userz0Scheduler.compute_dependencies.<locals>.add_user  sE     &&../66K99    rk   Nc                F    g | ]}t          |t          j                  |S r{   )r   r   r  r   rw  s     ri   r   z2Scheduler.compute_dependencies.<locals>.<listcomp>  s)    SSS!Auz9R9RSASSSrk   c                    | j         S ru   r  r  s    ri   rs  z0Scheduler.compute_dependencies.<locals>.<lambda>  s    AF rk   r-  Tzscheduling %s)unbacked_onlyc                    | j         S ru   r  r  s    ri   rs  z0Scheduler.compute_dependencies.<locals>.<lambda>  s    !& rk   z not in )r  )mutating_bufr  )r  )r  zscheduling output %sz+scheduling output %s for unbacked symint %sc                    i | ]\  }}||	S r{   r{   )r   r   r   s      ri   r}  z2Scheduler.compute_dependencies.<locals>.<dictcomp>`  s+     
 
 
'E4D%
 
 
rk   c                     g | ]
}|         S r{   r{   )r   r   	inp_namess     ri   r   z2Scheduler.compute_dependencies.<locals>.<listcomp>c  s*     &
 &
 &
 $IdO&
 &
 &
rk   rX  c                6    g | ]}|                                 S r{   r{  )r   r  s     ri   r   z2Scheduler.compute_dependencies.<locals>.<listcomp>t  r  rk   'z': r#  rY  zBUFFER USER LIST
z===== AFTER SCHEDULING =====
%s)r  r   r   r   )FF)
r  r   r8  r  r,  rl   r  rl   r   r2  )Er	   r^   r  r   r  r~  r  r   r   r&  r)   r=   r   r'  rW   r   r  r   r   r  r    	TensorBoxr  r2  get_unbacked_symbol_defsSymbolr  r&  r*  rh  r  r3   r   r  r  r  r2   r  r)  r  r\   r4   additional_buffer_depsadditional_star_depsr   r   r,  r  re  r  r  r  rG  graph_outputsr  r  mutated_inputsr  rb  r  mutated_input_idxsrM  r?  rO   r  r*  r+  r  compute_dependencies_log)(rh   r   buf1	buf1_name	buf2_namelist1list2combinedr.  r  unbacked_symbol_to_origin_nodevalfssym_sizerw  has_non_input_unbacked_defsunbacked_symbol_defsunbacked_symbol_usesrp  r   r   	node_modealt_namer-  out_buf
other_nameis_aliasadd_deprL  r  r   r   logbufr  r  r   r  r  r  r  s(   `                                   @@@@ri   rZ   zScheduler.compute_dependenciesl  s   	< 	< 	< 	< 	< 	< 	< 	< 	< 	<@ @K?V@
 @
 J 	L 	LD((** L L MMOO	 ty/??D,,..//!33!%!1!1!3!3 L LI M11i=6P6P -i 8 -i 8#(5=#0 > >C -c 2e ; ;#0#5#>#>5=c 2> #m333@3Ki003@3Ki00LL:	 	 	 	 	 	 	 !&!		 		 		 		 		 		 		 		 MO&
 7'..00 
	B 
	BC#uz** 	B* > >B9=2266>C.. B TSs||~~SSS! B BAn B B=A6r::B ',#J 	H 	HD9((( $*	2244:J:J$ $ $  * H H!!U\22222 /3+:::8<215H J V	 V	DIIoty111* Gy,,,'-I222FF((( ( ($
 . G GA >>>>FF&DFF ?>> <A>>K#'#4Q#7#C#C#E#E G GC --gcllnn.E.EFFFF D$+,,11 d&6&=!>!>???S 2sI.. 2  H		 	 '')) E E3,,..//14444 # 1 1 3 3 E EH%vh//HHXt,,,%%ghY&G&G&GHHH -h 7 = E E==??dmmoo==$)$)5FGGGGG'+y'<'<'>'> E EG)0)9)9););J)/
););J (073F3F3H3H'HH -- '$.1408L!" !" !"   %HZtDDDDD%EEE< 79$--//J S S$5555 !!''4==??D"Q"Q"QRRRR77H 4 4$6666!!''"2"23333 (. F F!$00 FHTYd.>.>t.D.DEEE%%d&;<<< ''))   # 1 1 3 3  H>AllnnD)&&*:*:;69llnnD)(3/33HhGG +CLLNN;; 0022 	> 	>HII,h777HXz'(*;*;<<==== ' 	Nw, N N111EE N NA >>>>MM&D&I&I&K&KMM ?>> ;1==q N(,(9!(<(M(M(O(O N NHII M ( !  
 %HXz'(:K:K/L/LMMMMN ) 	: 	:Dqw+++z'$--88999&**40000***z'$--88999
 
+4QW5I5N5N5P5P+Q+Q
 
 
	&
 &
 &
 &
()(>&
 &
 &
"
 J 	C 	CD'')) C CmCLLNN;ABBBBC / 	S 	SD'-77d8K8QRRRR  !!c'--// 	4 	4JC 4 4;;u{;;;2#22%2223334 4 4 4 4 4 4 4 4 4 4 4 4 4 4 	c  ""))++ &&';<<< &&'I3OOOOOs   ).h##h'	*h'	c           
         ddl m}m}m}m} t          t          j        j        	                                          } | j
        |          }t          j        j        j        s | j
         j                   t          t          j                                                  } | j
        ||          \  }}	}	d t#          t%           j
                            D             |D ]~}
|
j        dk    r|
j        dk    r|
j                                        }|
j                 d                             |           |
j                 d                             |           ddlm}  |             d fd}g }t9           j
                  D ]S\  }}|                    |           |                     |||t%           j
                  dz
  k                         T| _
        d S )Nr   )rK  compute_memory_timelineFreeableInputBufferget_freeable_input_bufc                    g | ]}g g fS r{   r{   )r   r  s     ri   r   z7Scheduler.insert_memory_check_nodes.<locals>.<listcomp>  s/     C
 C
 C
RHC
 C
 C
rk   r   )register_check_mem_opstep_idxr   is_final_steprl   r   r  c                Z   |          d         }|          d         }|||g}t          j        t          t          j        d                    t          j        j        j        j        g |d           }dj	        |          
                                 |_        t          |          S )Nr   r   r  )r  c                6    | |d         |d         |d         dfS )Nr   r   r   )alivedeadr   r{   )tensor_argsr  s     ri   rs  zWScheduler.insert_memory_check_nodes.<locals>.construct_mem_check_node.<locals>.<lambda>  s/    !.q!1 -a 0)6q)9 C rk   )r&  r=  r  nontensor_argsunflatten_args
mem_check_)r)   MemoryCheckKernelr=   r  r  r  _inductor_debugcheck_memory_stepdefaultr  r  r  r  )r  r   expected_newly_aliveexpected_newly_deadr  r   rh   step_allocs_deallocss         ri   construct_mem_check_nodezEScheduler.insert_memory_check_nodes.<locals>.construct_mem_check_node  s     $8#A!#D "6x"@"C24GWN'!e)<)<===y0BJ-     D #Qtz(/C/L/L/N/N"P"PD,T4888rk   )r   )r  r   r   rl   r   r  )r  rK  r  r  r  r   rW   r   r  r  r  r  r  r&   rJ  r@  r  r  r   
size_alloc	size_freerP  r  
start_stepr   end_step#torch._inductor.runtime.debug_utilsr  r  )rh   rK  r  r  r  r  name_to_freeable_input_bufr  buf_info_listr  buf_infor  r  r  	new_nodesr  r   r  s   `                @ri   r  z#Scheduler.insert_memory_check_nodes{  sr   	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 	
 )3173G3L3L3N3N(O(O""4:|<< 	# %= 	==
D,   *4AG4L4L4N4N)O)O55J&
 
q!C
 C
#C
OO44C
 C
 C
 & 	H 	HH"a''H,>!,C,C//11H !45a8??III !23A6==hGGGGMMMMMM	9 	9 	9 	9 	9 	9 	92 	 ,, 	 	GAtT"""((1DJRS@S;SUUU    


rk   c                   t           j        sdS g }t          | j                  D ]ddd}                                D ]}t          fd|j        D                       }|rdt                              d	|	                                           t          j        j                            |	                                           d
}                                 o| }|s|                               t                              d	                                           t          j        j                            	                                           j        j        D ]J}|j        | j        v r:| j        |j                 j        }fd|D             | j        |j                 _        Kt+          t          |                    | _        | j        D ]                                 dS )z0
        Remove any nodes without users
        Nr-  r  r   rl   c                Z    | j         p$|                                 t          j        j        v S ru   )r  r  rW   r   r  )r-  s    ri   can_eliminate_userz;Scheduler.dead_node_elimination.<locals>.can_eliminate_user  s!    |Tt}}!':T'TTrk   Fc              3  .   K   | ]} |          V  d S ru   r{   )r   ur  s     ri   r   z2Scheduler.dead_node_elimination.<locals>.<genexpr>  s/      #M#Ma$6$6q$9$9#M#M#M#M#M#Mrk   zremoved dead buffer: %sTzremoved dead operation: %sc                r    g | ]3}|j                                                                         k    1|4S r{   r0  )r   r  r   s     ri   r   z3Scheduler.dead_node_elimination.<locals>.<listcomp>  s>     = = ="#0A0AT]]__0T0TA0T0T0Trk   )r-  r  r   rl   )r&   use_dcer   r  r~  r   r  r  r&  r  rW   r   rE  r  r.  r   r  r   r   r   r@  r  r  )	rh   updated_nodesactive_buffersr   can_eliminaterL  r  r  r   s	          @@ri   rm  zScheduler.dead_node_elimination  s    ~ 	F
 TZ(( 	 	DU U U U #N'')) * * ##M#M#M#M39#M#M#M M M  *II7HHHG+//????%)NN $ 5 5 7 77N<NM  $$T**** 		6HHH*..t}}??? ,2  DyD$444 $ 0 ; A= = = =',= = =(39 (=1122
 J 	# 	#D  """"	# 	#rk   r  Optional[str]rl   c                
    |duS )z:Check if store mode requires cross-thread synchronization.Nr{   )rh   r  s     ri   mode_requires_synchronizationz'Scheduler.mode_requires_synchronization  s    4rk   r  c                    t          t                               t                      g dfd|D ]}|                                D ]}||<   |D ]} |           S )z?
        Ensure nodes is in topologically sorted order
        r  r\   r   r2  c                    | vrf                     |            t          | j        d           D ]"}|j        vr |j                            #                    |            d S d S )Nc                    | j         S ru   r  )ds    ri   rs  zDScheduler.topological_sort_schedule.<locals>.visit.<locals>.<lambda>  s    af rk   r-  )r  r2  rn  r   r   )r  r   rh  r,  seenvisits     ri   r,  z2Scheduler.topological_sort_schedule.<locals>.visit  s    }}!!"6<L<LMMM 2 2Cx|33 E,sx01111a      }rk   )r  r\   r   r2  )r   r\   r  r  )rh   r  r   r   rh  r,  r+  r,  s       @@@@ri   rl  z#Scheduler.topological_sort_schedule  s     +,..59VV*,	! 	! 	! 	! 	! 	! 	! 	! 	!  	* 	*D--// * *%)T""* 	 	DE$KKKKrk   r  c                r    t                      }t          |t          t          t          t
          t          f          r%|j        D ]}|                    |j	                   n t          dt          |           d           fd|D             }t          t           fd|D                                 S )Nz+get_unmet_dep_nodes is not implemented for .c              3  V   K   | ]#}j         |                                         V  $d S ru   )r@  r  r  s     ri   r   z1Scheduler._get_unmet_dep_nodes.<locals>.<genexpr>&  s7      XXc)#.??AAXXXXXXrk   c              3  2   K   | ]}j         |         V  d S ru   r  r?  s     ri   r   z1Scheduler._get_unmet_dep_nodes.<locals>.<genexpr>'  s+      QQat6q9QQQQQQrk   )r   r   r   r  rG  r   r  rn  r  r   RuntimeErrorr   r  )rh   r  
unmet_depsr   unmet_dep_opss   `    ri   _get_unmet_dep_nodeszScheduler._get_unmet_dep_nodes  s    &0ll
)&"$	
 	
 	 / ) )sx(((() Ld5kkLLL   YXXXZXXXJQQQQ=QQQQQRRRrk   r  c                b   g }t                               | j        d          }i }| j        D ]^}|                     |          }t	          |          ||<   |D ]2}|                    |g           }|                    |           |||<   3_d |                                D             }|rx|                    |           |D ]@}	|                    |	g           D ]}
||
xx         dz  cc<   |                    |	           Ad |                                D             }|x|r
J d            |S )zU
        Sort nodes by their topological order, return a list of node lists.
        r   c                $    g | ]\  }}|d k    |S r   r{   r   r  r  s      ri   r   z5Scheduler._topological_sort_nodes.<locals>.<listcomp>8  s!    @@@1a!rk   r   c                $    g | ]\  }}|d k    |S r8  r{   r9  s      ri   r   z5Scheduler._topological_sort_nodes.<locals>.<listcomp>?  s!    DDDDAqQ!VVaVVVrk   zTopological sort failed!)	r  fromkeysr  r5  r   r  r   r  r  )rh   r  r  childrenr   r  r   czero_deg_nodesr  r-  s              ri   r  z!Scheduler._topological_sort_nodes)  sf    dj!,,#%J 	" 	"D,,T22Dd))E$K " "LLb)) !"
 A@@@@ 	ELL(((#  $LLB// % %D$KKK1$KKKK		!DDEKKMMDDDN  	E 44444yrk   c                b   i }| j         D ]|}t                      }|j        D ]F}| j        |j                                                 }|                    |           |||         z  }G|||                                <   ||_        }t          | j                   D ]\  }}||_
        ||_        dS )z.
        Populate each node.ancestors
        N)r  r   rn  r@  r   r  r  r  r   r  ra  rb  )rh   name_to_ancestorsr   r   r   dep_node_namer  s          ri   rn  zScheduler.compute_ancestorsC  s    
 9;J 	' 	'D)3I. > > $ 0 : K K M Mm,,,.}==		1:dmmoo.&DNN$TZ00 	# 	#KE4"DN"DNN	# 	#rk   c                b   t           j        sd S | j        D ]}t          |t          t
          f          r$|                                st           j        dk    rC|                                D ]@}t          |t                    r|	                                r,|
                                 Ad S )Nhalide)r&   re  r  r   r   r   rR   cpu_backendr   r#  r  )rh   r   r  s      ri   r  zScheduler.merge_loopsV  s    0 	FJ 	$ 	$D d]4F$GHH KKMM&,&8H&D&D)) $ $!%77 5;L;L;N;N !!####$	$ 	$rk   c                    t          ddd          5  t          d          D ]}t          |          }t                              d|dz   |           |                     |d          }t          |          }t                              d	|dz   ||           ||k    s|dk    r t                              d
|dz               nt          j        st          j        r|                     |d          }|cddd           S # 1 swxY w Y   dS )zB
        Combine eligible nodes into FusedSchedulerNodes.
        zScheduler.fused_nodesTrF  r  z/===== attempting fusion (%d/10): %d nodes =====r   F)is_reorder_roundz=completed fusion round (%d/10): fused %d nodes into %d nodes
z+===== fusion complete (%d iterations) =====N)	r   r  r   r%  r&  fuse_nodes_oncer&   re  loop_index_inversion_in_fusion)rh   r  r  old_lennew_lens        ri   r  zScheduler.fuse_nodesq  s    #4QU
 
 
 	 	 2YY  e**  EE  
 ,,UU,KKe**  TE	   g%%A$$Eq1u   E	 *6 1K8K ,,UT,JJ;	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   C#DD
Dc                    g }| j         D ]A}|                    t          |t                    r|                                n|g           B|| _         dS )zA
        Unpack GroupedSchedulerNode into regular nodes.
        N)r  r  r   r  r  )rh   r  r   s      ri   r  zScheduler.process_grouped_nodes  se     .0	J 	 	D!+D2F!G!GSdV    


rk   r  tuple[float, str]c                   t          |          dk    sJ |d                                         }|| _        |                     |          }t	          ddd          5  |                    |          cddd           S # 1 swxY w Y   dS )
        Benchmark fused list of nodes and return the execution time
        in milliseconds on randomly generated inputs.
        r   benchmark_fused_nodesTcompile_time_autotune_time_us)rG  dynamo_compile_column_usN)r   r   r  rc  r   rO  )rh   r  r  r[  s       ri   rO  zScheduler.benchmark_fused_nodes  s     5zzA~~~~q$$&&$""6**#"&%D
 
 
 	8 	8
 0077	8 	8 	8 	8 	8 	8 	8 	8 	8 	8 	8 	8 	8 	8 	8 	8 	8 	8s   B  BBNbenchmark_kernelhint_overrideOptional[int]c                   t          |          dk    sJ |d                                         }|| _        |                     |          }t	          d          5  |                    |||          cddd           S # 1 swxY w Y   dS )rN  r   generate_kernel_code_from_nodesrS  N)r   r   r  rc  r   rV  )rh   r  rR  rS  r  r[  s         ri   rV  z)Scheduler.generate_kernel_code_from_nodes  s     5zzA~~~~q$$&&$""6**;<< 	 	::'} ;  	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   B  BBmoduler   r/  c                    || _         |                     |          }t          d          5  |                    |          cddd           S # 1 swxY w Y   dS )rN  benchmark_codegened_moduleN)r  rc  r   rZ  )rh   rX  r  r[  s       ri   rZ  z$Scheduler.benchmark_codegened_module  s     %""6**677 	> 	>55f==	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	>s   AAA
multi_noder  c                   t           j        j        }|sdS t                              d||           |j        D ]}|                                }t          |dd          r||vr,|j        }||         }t          |t          j                  r!|                    |j                   |j        }t          |t          j                  r&||k    r t                              d|||            dS dS )z
        Check if selecting a Triton template would cause layout conflicts.
        Returns True if there's a conflict and we should fall back to ATen.
        FzNode %s has constraints %sr&  NzOLayout conflict detected for %s: template expects %s but layout is frozen to %sT)rW   r   buffer_layout_constraintsr  r&  r  r  r  r&  r   r)   FlexibleLayout freeze_layout_with_exact_stridesr   FixedLayoutr  )rh   r[  constraintsinpinp_namer&  expected_layouts          ri   !_has_layout_conflict_for_templatez+Scheduler._has_layout_conflict_for_template  s    g7 	5		.
KHHH$ 	 	C||~~H3$// 8;3N3NZF)(3O&""344 $ 44_5KLLL&".11 o6O6Oe#	   tturk   c                   t          | j                  D ]\  }}t          |t                    rt          |j        t
          j                  r|j        }t          j        j	        s|
                                \  }}n+t          d |                                D                       }t          |t          j        j        j                  r|                     |          rm|                                D ]*}t          |t          j        j        j                  r|} n+t          |t          j        j        j                  s
J d            t          |t          j        j        j                  rt          j        ri }||d<   t          j        D ]e}|                    |          }	d |	                                D             }
t+          |
                                d           d         }|||<   f|j                            |           n|j                            |           t
          j                            |j                  5  |                                }ddd           n# 1 swxY w Y   |j        }t          |t
          j                  sJ |j        }t          |t
          j                  sJ |j        rtA          ||j                   |j!        |_!        | "                    ||||           dS )	a  
        Finalize a backing choice for MultiTemplateBuffers which did not already have a
        choice finalized through fusion. In the case of an extern choice, this will result
        in replacing the SchedulerNode.

        If a MultiTemplateBuffer did not have any fusion opportunities, finalizing a choice
        will force completion of compilation and benchmarking.
        c              3  b   K   | ]*}t          |t          j        j        j                  &|V  +d S ru   )r   r  r  rz  ExternKernelCaller)r   timings     ri   r   z<Scheduler.finalize_multi_template_buffers.<locals>.<genexpr>	  sT         &) & % @ S   "     rk   zZNo extern kernel detected to fallback to when layout constraints fail for Triton templatesNrW  c                D    i | ]\  }}t          |t                    ||S r{   r   r   )r   r  r  s      ri   r}  z=Scheduler.finalize_multi_template_buffers.<locals>.<dictcomp>4  s?     . . .$(Aq#-a1I#J#J. !1. . .rk   c                    | d         S rg  r{   r  s    ri   rs  z;Scheduler.finalize_multi_template_buffers.<locals>.<lambda>9  s    qQRt rk   r-  r   )#r  r  r   r   r   r)   MultiTemplateBufferr&   r  %force_extern_kernel_in_multi_templateget_min_choicer  choice_timingsr  r  r   re  rz  rh  multi_kernel_hintsr  r  finalize_as_triton_callersfinalize_as_triton_callerr  current_originsrb  output_noder   
StorageBoxOperationBufferorigin_noder8   r&  _replace_node)rh   r  r   r[  min_node_unfusedr  choicecallershinttimingstriton_timingsout_tensorboxout_storage
out_buffers                 ri   rw  z)Scheduler.finalize_multi_template_buffers  s    !,, L	D L	DGAt$.. KD:	214 4 KD "Y
*P *4*C*C*E*E'$aa'+ *4*C*C*E*E  	( 	($ $O&?  
 ==jII &0&?&?&A&A & &F) & % @ S    & 4: 0 %&  *"EO$D$W      y    $O&?  
 0 NQS(8 %+$= 3 3D&0&?&?d&?&S&SG. .,3MMOO. . .N
 &))=)=)?)?^^%T%T%TUV%WF,2GDMM	<<WEEEE	;;<LMMMY..z/ABB C C$4$@$@$B$BMC C C C C C C C C C C C C C C+0!+r}=====(-
!*b.@AAAAA) N&}j6LMMM$.$5
!"":z1dCCCYL	D L	Ds   I77I;	>I;	r  r  r  r   r   c                   t          ||           |                     |          }|| j        |<   || j        |                                <   || j        |                                <   i t          j        |j        j	        |j
                  D ].}| j                            |j        d           x}r
|j        |<   /dfd} ||j
                  |_
         ||j        j	                  |j        _	        t          |                                |                                          D ]-\  }	}
|	| j        |
                                <   |
j        |	_        .|j        |_        |j        |_        |j        |_        |j        |_        d S )Nr  rm  r   c                :    t          fd| D                       S )Nc              3  B   K   | ]}|                               V  d S ru   )r  )r   r   re  s     ri   r   z?Scheduler._replace_node.<locals>.rename_deps.<locals>.<genexpr>c  s0      KKscjj)9::KKKKKKrk   r   )r  re  s    ri   rename_depsz,Scheduler._replace_node.<locals>.rename_depsb  s&    KKKKdKKKKKKrk   )r  rm  r   rm  )r  r>  r  rh  r  r  r  r  r   r   rn  r  r  r   r  r~  r@  r  ra  rb  r   r`  )rh   r  r[  r  r   new_scheduler_noder   	real_namer  new_outold_outre  s              @ri   ry  zScheduler._replace_nodeN  s    	"*j999!77
CC*
1-?$--//*3E0 ?4#3#94;RSS 	7 	7C 377$GGGy 7.1h +	L 	L 	L 	L 	L 	L 1<11
 1
- 0;{*00
 0
&, !$**,,d.>.>.@.@!
 !
 	* 	*GW 4;DW--//0#MGMM'+~$'+~$'+~$(,%%%rk   	node_listc                4    t          d |D                       S )Nc              3     K   | ]Q}t          |j        d           o7|j        duo.t          |j        j        d          o|j        j        j        dk    V  RdS )r   Nscatter_moder  )r<  r   r   r  r  s     ri   r   z,Scheduler._any_atomic_add.<locals>.<genexpr>x  s       
 

 	 AFF## 9d"9^449 (L8	
 
 
 
 
 
rk   )r   )rh   r  s     ri   _any_atomic_addzScheduler._any_atomic_addw  s2     
 

 
 
 
 
 
 	
rk   )tuple[Optional[LambdaFuture], ModuleType]c                2   |                      |d|          }t          j        |          }t          j        j                                        }|                                sd }n.|                    d|          }t          |t                    sJ ||fS )NT)rR  rS  triton_)kernel_namesource_code)rV  r   loadr  r  async_compileAsyncCompileuse_process_poolr   r   r   )rh   r  rS  src_codemodr  futs          ri   compile_kernelzScheduler.compile_kernel  s     77D 8 
 
 x((5BBDD--// 	1CC&&9(&SSCc<00000Szrk   r~   r   ra   c                    !"#$%&'() t          d fD                       }t          j        s|st                              d          S                                 r,t                                          t          j	                  r(
                                s
                                rt                              d          S                                 }|d                                         sJ j        dk    r*t          j        dk    rt                              d          S                                 }t          t!          j        ||                    }                     |          rt                              d          S ddlm t+                    )|d                                         J d fd#|rt          d fD                       r                                dur                                n                                (t          (t          j                  sJ                      (          rt                              d          S i "g  t          j        D ]}(                    |          t5                                          d           D ]\  }}	t          |t8          j        j        j                  s*(                     |          5   !                    |g "                    ||j#                  R            ddd           n# 1 swxY w Y   tI          d          }
d}i } D ]\  }}}	 ||%                                 nh# tL          $ r[}tN          (                    tR          j*                  r-tN          +                    dsdndtY          |                     Y d}~~d}~ww xY w(                     |          5   -                    |          \  }}|||<   ||
k     r|}
|}ddd           n# 1 swxY w Y   |(j.        |<   t          |t^                    sJ |"|<   t          j0        tc          d (j2        D                       }tg                      o o|t          j4        k    !tI          d          tI          d          c%&d$!sa(                                (5                                \  $%t5                                          tm          j7        d                    }nd (j2        D             }r0r 8                    |          n 8                    |          \  &}nAst                              d          S 9                                &tu          &          'g  d}|D ]\  }}t          |t^                    ss!tw          |d          r|j<        (j<        k    r>r|%&z   k    r nq|dz  }|t          j4        k    r nZ(                     |          5   !                    |g "                    |          R            ddd           n# 1 swxY w Y   t{                     dk    rt                              d          S d! !"#$%&'( fd}t          >                    | d         d                   S  "                    |           "                    |           "                    |          d!# )fd}t          >                    |d                   S )"
        If config.benchmark_fusion is False, always return True.
        Otherwise, return True if fusion can brings speedup.
        c              3     K   | ]D}|                                 o+t          |                                t          j                  V  Ed S ru   )r#  r   r  r)   rm  r  s     ri   r   z.Scheduler.speedup_by_fusion.<locals>.<genexpr>  sb        
  
  MMOO J1..00"2HII 
  
  
  
  
  
rk   Tr   r  r   CompilationErrorNms_fusedr  ms1ms2r   r2  c           
        t                               t          j                  r| ||z   k     rXt                               d                                                                t          ||z   | z  d                     d S t                               d                                                                t          | ||z   z  d                     d S d S )Nz9can fuse (benchmark): fusing %s with %s cause %sx speedup.3fz=cannot fuse (benchmark): fusing %s with %s cause %sx slowdown)r%  r  r  DEBUGr&  r  rB   rC   )r  r  r  r~   r   s      ri   
log_fusionz/Scheduler.speedup_by_fusion.<locals>.log_fusion  s    &&w}55 cCi''$$S..00..00"sSyH&<#B#BCC	     $$W..00..00 Hc	$:!@!@AA	     rk   c              3  B   K   | ]}|                                 d uV  d S ru   r  r  s     ri   r   z.Scheduler.speedup_by_fusion.<locals>.<genexpr>  sD       %
 %
23A!!-%
 %
 %
 %
 %
 %
rk   Fc                    | d         S rg  r{   r  s    ri   rs  z-Scheduler.speedup_by_fusion.<locals>.<lambda>  s    aPQd rk   r-  rW  infException in compiling %s: %sr  r  c              3  @   K   | ]}t          |t                    V  d S ru   rk  r   r=  s     ri   r   z.Scheduler.speedup_by_fusion.<locals>.<genexpr>  s>       % %<=
1677% % % % % %rk   r   c                    g | ]}|d fS r8  r{   r  s     ri   r   z/Scheduler.speedup_by_fusion.<locals>.<listcomp>'  s    &J&J&J!1v&J&J&Jrk   allowed_prologue_inpsrl   c            	     Z   t          d          } d }i }r]rt          t          j                  sJ                                                                 \  t          fd          D ]O\  }}}	 ||                                }n s|j        }|	                                 nd }nh# t          $ r[}t                              t          j                  r-t                              dsdndt!          |                     Y d }~d }~ww xY wrW                    |          5                      |          \  }}	|||<   || k     r|} |}d d d            n# 1 swxY w Y   |k    pz   |         z   k    }
|r4t'          |j                  dk    r|j        d         j        d	k    r|
r|} nQr |            r	| z   k     rJ|Ht,          j        r|d <                                  n                    |           |j        d <   d
S dS )Nr  c                     | d                  S r   r{   )r;  rp  s    ri   rs  zKScheduler.speedup_by_fusion.<locals>.benchmark_when_ready.<locals>.<lambda>f  s    nQqT&: rk   r-  r  r  r  r   r   r  TF)r  r   r)   rm  rp  ro  r2  r,  r  
precompiler  r%  r  r  r  r&  r   swap_as_triton_callerrZ  r   	launchersn_spillsr&   rq  rr  rs  _choice_timings)min_ms_fusedms_fused_choicenew_timingsr{  re   	mod_fusedresr   r  pathfusible_choicebench_epiloguerp  r  r,  future_choicesget_choice_timings_async hint_override_best_fusion_choicer  
min_choicer  r  	ms2_fusedr[  rh   s              ri   benchmark_when_readyz9Scheduler.speedup_by_fusion.<locals>.benchmark_when_readyY  s%   $U||"& + %X*ZAW*X*XXXX%/%>%>%@%@N&0&?&?&A&AOJ%+&::::& & &N
 2@ 0" 0"-FFI!!-"(--//CC!/ '"+"3CNN,,,,"&C % ! ! !%227=AA &,, ?2A Q

z #A  
 !! & "'==fEE 	9 	9-1-L-L ) &. .NHd
 3;K/',66/728	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 '&0 N"Sy>&+AI+MM '  	" !$CM 2 2a 7 7 #a 0 9Q > > . !? /5O!E! 7J|S#666 '!*6#)*D*D%10 NAP8>"==<   
 #<<_MMM 8CJ.t44 5s+   >7B66
D ADD5)E**E.	1E.	c                 T   ddl m}  	 d         d         d         fD ]}||                                                     d         
          \  t	          j                  r d           dS                     d         
          \  t	          j                  r d           dS                     d         
          \  t	          j                  r d           dS             t          d          rZz   k    rQfj        vrFj                            f           t          d          
                    fd	           z   k     S # | $ r Y dS 	$ r}d
t          |          v rY d }~dS  d }~ww xY w)Nr   )NoTritonConfigsErrorr   z%register spilling of the first kernelFz&register spilling of the second kernelz%register spilling of the fused kernelslow_fusionc            	     $      z   z  dS )N)kernel1_pathkernel1_latencykernel2_pathkernel2_latencyfused_kernel_pathfused_kernel_latencyslow_down_ratior{   )r  r  r  path1path2
path_fuseds   ri   rs  zKScheduler.speedup_by_fusion.<locals>.benchmark_when_ready.<locals>.<lambda>  s-    053605365?8@3;sSy3I% % rk   Loop-carried variableT))torch._inductor.runtime.triton_heuristicsr  r,  rZ  mathisinfr   rr  r  r   r  r   )r  r  r   r  r  r  r  r  r  r  r  future_and_mod_l1future_and_mod_l1_fusedfuture_and_mod_l2r  rh   rq  s      @@@@@@ri   r  z9Scheduler.speedup_by_fusion.<locals>.benchmark_when_ready  sm        A *!,)!,/2  ) )
 ?JJLLL!%!@!@)!," "JC
 z# %CDDD$u!%!@!@)!," "JC
 z# %DEEE$u+/+J+J/2, ,(Hj
 z(++ %CDDD$uJxc222 0>>$c	11"EN$2III/33UENCCC(77??        
 
 
 $cCi//+ ! ! ! 55'   .#a&&88#ttttts7   A.E> ?>E> ?>E> ?A>E> >F'F'
F"!F""F'rs   )r  r  r  r  r  r  r   r2  rW  )?r   r&   benchmark_fusionra   rp   r#  r   r  r)   TritonTemplateBufferr'  r   r   r   rD  r  r  r  r  triton.compiler.errorsr  r  rm  re  rq  rp  r2  r  r  r  rz  TritonTemplateCallerr  r   r  rS  r  r,  r  r%  r  r  r  r&  r   rZ  r  r   benchmark_epilogue_fusionr   choicesr    max_epilogue_benchmarked_choicesro  operator
itemgetterrO  r  r  r<  r  r   rt   )*rh   r~   r   is_multi_templatenode_list_1node_list_2node_list_fusedrS  r{  r  r  r  r  re   r  r   r  r  num_triton_callerschoice_timings_iterr  triton_choicesunfused_timer  r  r  rp  r  r,  r  r  r  r  r  r  r  r  r  r  r  r[  rq  s*   ```                     @@@@@@@@@@@@@@@@@@ri   speedup_by_fusionzScheduler.speedup_by_fusion  s	       
  
 U^ 
  
  
 
 

 & 	+/@ 	+$$T*** 	+u6688":QRR	+ !!	+ !!		+  $$T***oo''Q**,,v ;%F$6($B$B$$T***oo''y{KHHII
 00 	+$$T***;;;;;;u%% #..00!!!	 	 	 	 	 	 	"  s	 %
 %
8=u~%
 %
 %
 "
 "
 s	 $5577tCO #/''))),,.. 
 j"*@AAAAA55jAA 0#((///  - TVN!'!: *R *R!+!:!:=!I!I!'(<(<(>(>NN!S!S!S  IFA% @ U  ! !#99&AA  &-- &!%!4!4$36CW "5 "" ""                   %U||FJ 1? 5 5-FFI
!!-"MMOOO$ ! ! !%227=AA &,, ?2A Q

z #A  
 !! $99&AA 5 5)-)H)H%v* *$ /7F+#l22+3L.4O5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 =H
*=9!/3KLLLLLBQ0??#=N!$ % %AKAS% % % " "
 )** R&&R&&*QQ % U||U5\\HC15J+ 
K!+!:!:!<!<",";";"="=
C&,"((**0CA0F0F' ' '## 'K&Jz7I&J&J&J# P 'AD..{;;;33K@@ UU ' 4',,U3332244<UE3OO	 TVNN(;  $!&*BCC  ((?@@ 4
8XXX! lcCi&?&?E!#!F$KKKE55f==  "))G$"5"5o"F"FGG                
 >""a''#((///V! V! V! V! V! V! V! V! V! V! V! V! V! V! V! V! V! V! V!p  --$nQ&7&:   !% 3 3K @ @ $ 3 3K @ @&*&9&9/&J&J#F F F F F F F F F F F F FP  --09PQR9S .   sO   4MMM?N
O; AO66O;)QQQ-ZZ		Z		c                @    | j         |                                         S )z0Look up the node in Scheduler name_to_fused_node)r  r  r  s     ri   r6  zScheduler.get_fused_node  s    &t':':'<'<==rk   r{  OrderedSet[BaseSchedulerNode]c                $   t                               d|                                |                                           |                                }|                                |k    sJ |                     |                              ||          |                    |           |                    |           |                               | j        	                    fd
                                D                        S )Nzfusing %s with %sc                :    i | ]}|                                 S r{   r{  )r   r  node3s     ri   r}  z,Scheduler.fuse_two_nodes.<locals>.<dictcomp>  s#    'W'W'W

e'W'W'Wrk   )r%  r&  r  r   rc  rp   r  r  r  r  r   )rh   r~   r   r{  r  r  s        @ri   fuse_two_nodeszScheduler.fuse_two_nodes  s     	,enn.>.>@P@PQQQ!!##!!V++++  ((--eU;;5!!!5!!!&&'W'W'W'WU__EVEV'W'W'WXXXrk   
speedup_fnrq   c                    |                      ||          r9|                     ||          s# |            r|                     |||           dS dS NTF)r   will_fusion_create_cycler  )rh   r~   r   r  r{  s        ri   fuse_if_speedupzScheduler.fuse_if_speedup  sf     MM%''	11%??	 
	
 uk:::4urk   template_fusion_candidates,dict[BaseSchedulerNode, list[PendingFusion]]c                   |rg }i }t                      }|D ]8}||v rt          ||                   dk    sJ ||                             d          }t          ||                   dk    r|                    |           |                                \  }}	|	|k    rt          ||	          sJ |}
n||k    sJ t          ||	          sJ |	}
|                     |
          |
ur|j        r.|j        j        }|J |	                    |           ||f||<   | 
                    ||	|j        |          r|                    |           :t          |          D ]o}||         \  }}| 
                    |                     |j                  |                     |j                  |j        |          r|                    |           p|D ]}|                    |           |dS dS )z
        Evaluate pending template fusions for a set of fusion candidate nodes.
        The fusion candidate nodes are pointwise nodes as potential epilogue
        or prologue fusions
        r   r   N)r   r   r  r  r   r-  r1  r6  re   r   r  rc   r   r~   r   )rh   r  r{  template_futuresfuture_to_pending_fusionfusions_to_remove	candidatepending_fusionr~   r   r  fcands                ri   "_evaluate_pending_template_fusionsz,Scheduler._evaluate_pending_template_fusions-  s    ) 9	2-/  % @J||7 %9 %9	!;;;6yABBaGGGH "<I!F!J!J1!M!M1)<==BB%)))444->>@@uI%%-eU;;;;;$)MM I-----eU;;;;;$)M &&}55]JJ!( 
9&-4A===$++A...3A92M,Q// ++un&@+  9 *--i888 ""233 0 0'?'B$''''(<==''(<==".	  0 &))$///& 2 2*..q1111s ) 9	2 9	2 9	2 9	2 9	2rk   possible_fusion_pairs1list[tuple[BaseSchedulerNode, BaseSchedulerNode]]pending_fusions&dict[BaseSchedulerNode, PendingFusion]template_fusion_nodesrF  c                    d fd}|D ]Y\  }} |||                                 |          }                      |          }t          ||          r||f j        v rX                     |||          r                     ||          sԉ                     ||          }	|	j        t          |	j        |||	j                  }
t          ||          r^||f j        vsJ  j        	                    ||f           t          ||          }||vrg ||<   ||                             |
           n
|
|<   |
|<   9|	j        sB                     ||           [d S )	Nr~   r\   r   r   r2  c                                        |           v s                     |          v r:                                         |                                                    |                              }|J |                                \  }}|j        }                    |d                                |d                                 |          |u sJ                      |          |u sJ  |            r                    | |          r                     ||                                |           v                       |          v 8d S d S ru   )r6  r  r   rc   r  r  r  )	r~   r   r  	node_key1	node_key2
is_speedupr{  r  rh   s	         ri   resolve_pending_fusionsz<Scheduler._try_fusion_pairs.<locals>.resolve_pending_fusions{  s   
 ##E**o==&&u--@@!0!4!4''..#''(;(;E(B(BCC" " &111'5'F'F'H'H$	9+7
##It444##It444**955BBBB**955BBBB!z|| t'D'DUE'R'R ##Iy+FFF+ ##E**o==&&u--@@@@@@rk   )rc   r~   r   re   r*  )r6  r3  rj  r   r  r  rc   r}   re   r  r6  r   rb   r  )rh   r  r  r  r{  rF  r  r~   r   
fusion_resr  template_pw_nodes   ` ` `       ri   _try_fusion_pairszScheduler._try_fusion_pairss  s    	G 	G 	G 	G 	G 	G 	G 	G8 2 +	? +	?LE5 $#E5111''..E''..E #5%00ENd&@@@}}u.  ?33E5AA? "33E5AA
)5%2$.$:##)0	& & &N *%77 
@ %u~T5OOOOO266u~FFF+B5%+P+P(+3HHHFH12BC-.>?FF~VVVV1?.1?.!- ##E5+>>>W+	? +	?rk   c                t   t                      }|                                D ]}|                                \  }}|j        }||v st	          ||          r5|                    |           |                     |          |u sJ |                     |          |u sJ |                     ||||           d S ru   )r   r   r   rc   r3  r  r6  r  )rh   r{  r  seen_pair_speedup_fnr  r
  r  is_speedup_fns           ri   _finish_pending_fusionsz!Scheduler._finish_pending_fusions  s    
 @J|| .4466 	S 	SN#1#B#B#D#D Iy*6M 4448J99 94  $$]333&&y11Y>>>>&&y11Y>>>>  I}kRRRR	S 	Srk   possible_fusionsdeferred_prologue_fusionsc                    t          d |D                       }g }|D ]H\  }}t          ||          r||v r|                    ||f           1|                    ||f           I|}d S )Nc                8    g | ]\  }}t          ||          |S r{   r5  )r   n1n2s      ri   r   z6Scheduler._handle_template_overlap.<locals>.<listcomp>  s,    MMMFB2DR2L2LMRMMMrk   )r   r1  r   )rh   r  r  epilogue_template_nodesnew_possible_fusionsr  r  s          ri   _handle_template_overlapz"Scheduler._handle_template_overlap  s     #-MM.MMM#
 #
  "& 	6 	6FB!"b)) 6b4K.K.K)00"b::::$++RH5555/rk   c                T   |                      |           t          |          }t                              t          j                  rLt                              d           |D ]/}t                              d|                                           0i }i }g }|                     ||          }t          j
        st          j        r.t          j        r"t          j        r|                     ||           |                     |||||           |                     ||           |                     ||           |                                 |r/|                     |||||           |                     ||           t'          |d           }|                     |          }|S )a  
        Combine eligible nodes into FusedSchedulerNodes.

        This relies on two key functions to control the logic:
            - self.can_fuse(): checks if a fusion is legal
            - self.score_fusion(): assigns priority to a given fusion
        zfuse_nodes_once, candidates:z  %sc                    | j         S ru   r  r  s    ri   rs  z+Scheduler.fuse_nodes_once.<locals>.<lambda>3  s    !+ rk   r-  )r  r   r%  r  r  r  r&  r  get_possible_fusionsr&   rx  ry  r0  r,  r  r  r  r  clearr2  rl  )	rh   r  rF  r{  r   r  r  r  r  s	            ri   rG  zScheduler.fuse_nodes_once  s    	!!%((( ''""7=11 	A;<<<# A A  )=)=)?)?@@@@  	
 OQ  	"  44
 
 %	W)/)<	W&	W &	W
 ))*:<UVVV!	
 	
 	
 	$$[/BBB//0E{SSS##%%%$ 	X"")%    334I;WWW{(=(=>>>..u55rk   rI  c                F  	 t          | j                  }d}t          | j                  }t                              d|           t          t                              |                     D ]2\  }}t                              |          }t          |          dk     r4|||k    r n| 	                    |          st                              d|           o|dz  }t          j        dk    }t          |d         j        |d|          	t                              d	t          |          |           |D ]}|                    |           |                    	           | j                            	fd
	                                D                        4t'          |d           | _        |                     | j                  | _        t                              d||t          | j                             |                     | j                   dS )z'
        Groups parallel nodes
        r   z2ComboKernels: Generating with num_ck_nodes = %s...r   Nz)ComboKernels: Not speeding up %d-th groupr   Tr  z0ComboKernels: Combining %d nodes for %d-th groupc                :    i | ]}|                                 S r{   r{  )r   r  r  s     ri   r}  z7Scheduler.create_combo_kernel_nodes.<locals>.<dictcomp>[  s#    LLLq{LLLrk   c                    | j         S ru   r  r  s    ri   rs  z5Scheduler.create_combo_kernel_nodes.<locals>.<lambda>]  s    q{ rk   r-  zDGenerated ComboKernel nodes: %d ComboKernels, totally %d -> %d nodes)r   r  r   r  r&  r  r`  r  r  speedup_by_combo_kernelr&   r  r  r  r  r  r  r  r   r2  rl  r  )
rh   rI  r{  r`  num_nodes_orignumr  rz  r   r  s
            @ri   r~  z#Scheduler.create_combo_kernel_nodes7  s/    !,,TZ		FUUU'&DDTJJ
 
 	 	NC 3CCINNI9~~!!'EL,@,@//	:: 		EsKKKQJE$;a?O4!&*. /	  K HHBI  
 " ) )""4((((OOK(((#**LLLLK4I4I4K4KLLL    K-B-BCCC
33DJ??
R
OO		
 	
 	
 	!!$*-----rk   c                D    |D ]}|                     | j                   d S ru   )r  r  )rh   r  r   s      ri   r  zScheduler.prune_redundant_depsg  s5     	? 	?D%%d&=>>>>	? 	?rk   c                0   
 g 
t          t          t          t          f                              d
 fd}t          j        t
                    }|D ]J}                     |          r|                                D ]}||                             |           K|	                                D ]} ||           t          j        rnt          j        t
                    }|D ]0}t          |dd          }	|	r||	                             |           1|	                                D ]} ||                                
          

                     j        d	           t                               d
t%          
                     
S )z^
        Helper to find all legal fusion opportunities, sorted by self.score_fusion()
        r  r  r   r2  c                   t          |           D ]\  }}| |dz   |dz   t          j        z            D ]}||f}|v r                    |                               ||          r                    |           M|                                s|                                r.                    ||          r                    ||f           d S rg  )r  r&   )max_fusion_buffer_group_pairwise_attemptsr  r   r   r#  r'  )	r  node1_indexr~   r   r.  rF  r  r+  rh   s	        ri   check_all_pairsz7Scheduler.get_possible_fusions.<locals>.check_all_pairsv  s'   &/&6&6 @ @"U"!Ok'F'G G @ @E
 !%.Cd{{ HHSMMM}}UE3CDD @(//4444++-- @1A1A1C1C @u&6J J @ )//???!@@ @rk   r   NT)r.  reversezfound %d possible fusionsr  r  r   r2  )r   r   r\   r  r   r  unfusable_noder   r   r   r&   aggressive_fusionr  *get_possible_fusions_with_highest_priorityr  score_fusion_keyr%  r&  r   )rh   r  rF  r-  buffer_names_groupingr   r   node_groupinggroup_groupingr   r  r+  s   ` `       @@ri   r   zScheduler.get_possible_fusionsk  s    % 13D DEFHH	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@( !, 7 = = 	8 	8D""4(( --// 8 8%c*11$77778299;; 	+ 	+MOM****# 	/(4T::N 7 7gt44 7"5)00666!/!6!6!8!8 / /....JJ
 
 	$"7FFF4c:J6K6KLLLrk   c                    t          t                               d fd|                                j                                        |                                j                                        z  |j        j                                        |j        j                                        z  z
  t           fdD                       }|r t          ||          d           |S )	z~
        Finds whether there's a path from node1 to node2 (or vice-versa)
        caused indirectly by other fusions.
        r   r\   r   rl   c                ,   t          | t                    r}| vry                    |            |                                                               rdS t          | j        z            p#t          fd| j        z
  D                       S dS )NFc              3  D   K   | ]} j         |                   V  d S ru   r1  r   r  
found_pathrh   s     ri   r   zIScheduler.will_fusion_create_cycle.<locals>.found_path.<locals>.<genexpr>  sQ       H H #
4#:1#=>>H H H H H Hrk   )r   r   r  r   issubsetrl   r   r   )r   combined_ancestorscombined_namesr;  rh   visiteds    ri   r;  z6Scheduler.will_fusion_create_cycle.<locals>.found_path  s    $ 233 G8K8KD!!!++--667IJJ  !5   ?@@ C H H H H H!%2D!DH H H E E  5rk   c              3  D   K   | ]} j         |                   V  d S ru   r1  r:  s     ri   r   z5Scheduler.will_fusion_create_cycle.<locals>.<genexpr>  s5      WWqJJt6q9::WWWWWWrk   zwill create cycler  )r   r   r   _dictr  r   r   r  )rh   r~   r   cycler=  r>  r;  r?  s   `   @@@@ri   r  z"Scheduler.will_fusion_create_cycle  s    /022	 	 	 	 	 	 	 	 	 	2 %%''-2244''))/44667 	
 O!&&((5?+@+E+E+G+GG WWWWWDVWWWWW 	9#IeU##$7888rk   c                    ddl m d fd} ||          } ||          }t          fd|D                       }t          fd	|D                       }|                    |          }d
}	|D ]-}
	 |	t	          |
d                   z  }	# t
          $ r Y  dS w xY w                     ||          }t          j        j	        
                    |	d|z            rdS dS )a  
        Return true if fusing the two nodes can potentially increasing peak memory.

        The implementation is more like a heuristic since we don't really know if we are at peak
        or not when trying to fuse these two nodes. The order of nodes may change later which makes the
        peak memory estimation hard.

        Here is how we decide the LOWER BOUND of extra memory allocation if we fuse these 2 nodes:
        1. find all buffers read by each node with a single user. These buffers are supposed to
           be reused if we don't fuses these 2 nodes
        2. find the intersection of these buffers for the two node and sum the total buffer size.
           If we don't fuse these two nodes, we can at lease avoid this much memory allocation.
           Note that the extra memory allocation is not necessarily causing peak memory increase.
           This is just a heuristic.

        We return true only if the saving for fusion can not trade off the extra memory allocation.
        r   )buffer_reuse_keyr   r\   r   list[ir.Buffer]c                   g }| j         j        D ]n}j                            |j                  }|rKt          |j                  dk    r3|j                                        r|	                    |j                   o|S rg  )
r   r   r@  r  r   r   r  r   has_tensor_outputr   )r   ry  rj  r   rh   s       ri   _find_single_user_inputszKScheduler.can_fusion_increase_peak_memory.<locals>._find_single_user_inputs  s~     F&, , ,&**2733 ,3sy>>Q..383M3M3O3O.MM#(+++Mrk   c              3  .   K   | ]} |          V  d S ru   r{   r   r   rD  s     ri   r   z<Scheduler.can_fusion_increase_peak_memory.<locals>.<genexpr>  /      #S#Sc$4$4S$9$9#S#S#S#S#S#Srk   c              3  .   K   | ]} |          V  d S ru   r{   rJ  s     ri   r   z<Scheduler.can_fusion_increase_peak_memory.<locals>.<genexpr>  rK  rk   r   r   F    T)r   r\   r   rE  )r=  rD  r   intersectionr   r  rT  rW   r   r   statically_known_gt)rh   r~   r   rH  lhs_dep_nodesrhs_dep_nodeslhs_reuse_keysrhs_reuse_keyscommon_reuse_keysmemory_overheadr.  	bw_savingrD  s   `           @ri   can_fusion_increase_peak_memoryz)Scheduler.can_fusion_increase_peak_memory  sG   * 	655555	 	 	 	 	 	 10770077##S#S#S#S]#S#S#SSS##S#S#S#S]#S#S#SSS*77GG$ 	 	C3s1v;;.   uuu ,,UE::	 7//iPP 	4us   7B
BB	thresholdc                0   t          d |                                D             d |                                D             z             }t          d |j        j        D                       }t          d |j        j        D                       }||z  }t                      }|j        j        D ]7}	|                     |	j        |          r|                    |	j                   8t          d |j        j        D                       t          d |j        j        D                       z  }
t          d |j        j        D                       t          d |j        j        D                       z  }|
|z
  }||z
  }||z  }t          |          |k    S )	Nc                6    g | ]}|                                 S r{   r{  r  s     ri   r   zFScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<listcomp>  s     ;;;T]]__;;;rk   c                6    g | ]}|                                 S r{   r{  r  s     ri   r   zFScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<listcomp>  s     ===4t}}===rk   c              3  $   K   | ]}|j         V  d S ru   r  r  s     ri   r   zEScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<genexpr>  s$      &T&TCsx&T&T&T&T&T&Trk   c              3  $   K   | ]}|j         V  d S ru   r  r  s     ri   r   zEScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<genexpr>  s$      %R%R3ch%R%R%R%R%R%Rrk   c              3  $   K   | ]}|j         V  d S ru   r  r  s     ri   r   zEScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<genexpr>&  s5       $
 $
CH$
 $
 $
 $
 $
 $
rk   c              3  $   K   | ]}|j         V  d S ru   r  r  s     ri   r   zEScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<genexpr>(  r~  rk   c              3  $   K   | ]}|j         V  d S ru   r  r  s     ri   r   zEScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<genexpr>+  s5       %
 %
CH%
 %
 %
 %
 %
 %
rk   c              3  $   K   | ]}|j         V  d S ru   r  r  s     ri   r   zEScheduler.fusion_prevent_too_many_reads_and_writes.<locals>.<genexpr>-  s$      DDCsxDDDDDDrk   )	r   r   r   r  r   $can_buffer_be_removed_through_fusionr   r  r   )rh   r~   r   rX  fused_node_namesnode1_write_namesnode2_read_namesreads_removed_through_fusionwrites_removed_through_fusionr  all_read_namesall_write_namesunique_readsunique_writesunique_io_bufferss                  ri   (fusion_prevent_too_many_reads_and_writesz2Scheduler.fusion_prevent_too_many_reads_and_writes
  s    &;;):):;;;==5??+<+<===>
 
 '&T&T5;L;S&T&T&TTT%%R%R%:K:Q%R%R%RRR'7:K'K$ :D%*1 	B 	BI88 0  B .11).AAA $ $
 $
 % 1 7$
 $
 $
 
 
CC5+<+BCCCCCD
 % %
 %
 % 1 8%
 %
 %
 
 
DD5+<+CDDDDDE
 &(DD (*GG )=8$%%	11rk   c                    t          t          |j        |j        z
            t          |j        |j        z
                      }|dk    S )aA  
        This function prevents fusion for nodes that can increase memory
        footprint. This problem is more common in horizontal fusion, where nodes
        that are far apart in the original order get fused, lengthening the live
        intervals of tensors. This is very evident in models with activation
        checkpointing, where the recomputed nodes from different checkpointed
        regions get fused and significantly increase the memory footprint.

        The current attempt is a quick, possibly hacky, heuristic to prevent the
        fusion of nodes that are far away in the original order.

        A better but difficult to implement heuristic would be to use live
        intervals of the buffers, find region of peak pressure in the original
        program and prevent fusion that crosses that peak region. We might need
        special care or good approximation in this implementation, as fusion of
        node changes live intervals, and re-computing live intervals and peak
        memory after each fusion can introduce large compilation overhead.
        @   )r  r  ra  rb  )rh   r~   r   proximity_scores       ri   are_long_distant_nodesz Scheduler.are_long_distant_nodes:  sH    * %/122%/122
 
 ##rk   common_buf_names'Union[tuple[str, ...], OrderedSet[str]]c                   i }d |j                                         D             }d |j                                         D             }|D ]}t          j                            |          }||         }	||         }
t          |	t                    rt          |
t                    s&dt          |	           dt          |
           ||<   |	                                |
                                k    r0d|	                                 d|
                                 ||<   t          |	j
                  t          |
j
                  k    rd||<   |	                                }|
                                }||k    rd| d| ||<   H|	                                |
                                k    rd|	 d|
 ||<   d	}t          |t          j                  s
d
|j         }d|	 d|
 d| ||<   t!          |          S )z}
        Try to decide reasons why fusion fail due to no shared memory even though
        there are common buffers.
        c                    i | ]
}|j         |S r{   r  r  s     ri   r}  z7Scheduler.decide_fusion_fail_reason.<locals>.<dictcomp>`      XXXC#(CXXXrk   c                    i | ]
}|j         |S r{   r  r  s     ri   r}  z7Scheduler.decide_fusion_fail_reason.<locals>.<dictcomp>a  rv  rk   znot MemoryDep: r   zdifferent numel: 	broadcastzdifferent offset: zMismatch loop orders: r  zLayout: zUnknown reason: z. )r   r  rW   r   r  r   r2   r   r   rV   r   
get_offsetnormalize_with_stride_orderr)   r  r&  r   )rh   r~   r   rr  reasonsnode1_name2depnode2_name2depr  r   lhs_deprhs_deplhs_offrhs_off
layout_strs                 ri   decide_fusion_fail_reasonz#Scheduler.decide_fusion_fail_reasonU  sY    XX53D3U3U3W3WXXXXX53D3U3U3W3WXXX( ,	 ,	H'$$X..C$X.G$X.Ggy11 GY9W9W Jd7mmJJ4==JJ !   ""g&7&7&9&999X(9(9(;(;XX7CTCTCVCVXX !  W\**mGL.I.III$/!((**G((**G'!! %R$Q$Q$Q$Q! 335566889 9 %VW$U$UG$U$U! Jc2#566 54
44
I7II'IIZII H 7||rk   c                0   t           j        sdS t          d ||fD                       rdS |j                                        }|j                                        }||z  }|sdS t          d |j        D                       }||z
  rdS t          |          dk    rdS t          |j        j                  dk    st          |j        j	                  dk    rdS t          t          |j        j                            }t          t          |j        j	                            }t          |t                    rt          |t                    sdS d |j        j	        D             }	|j        |	vrdS |	|j                 }
t          |
t                    sdS |
                                }
|
j        |j        k    r|
j        |j        k    rdS |j        |j        k    st          |j                  dk    rdS t          |j        j                  dk    rdS |j        j        rdS d|j        j        v rd|j        j        v sJ t          d	 |j                                        D                       }t          |          dk    rdS t          t          |                    }||j        j        d         k    rd}d}n||j        j        d         k    sJ d}d}d
dlm} |j        j        d
         }t          |          dk    rdS g }t4          j                            |          D ]9}|                    t<          j        j         !                    |                     :tE          |          } |||d
                   }|dS |j        j        |         |j        j        |<   ||j        j        |<   |#                    dd           | $                    ||          }t          |tJ                    sJ tL          '                    d|           |S )aW  
        Attempts to enable fusion between two nodes by inverting indexing patterns.

        This optimization targets cases where node1 has a contiguous write and
        node2 has a contiguous write but discontiguous read. By inverting the
        indexing in node2's read and write operations, we can make them compatible
        with node1 for potential fusion.

        Args:
            node1: First scheduler node (source)
            node2: Second scheduler node (target for inversion)

        Returns:
            int: Fusion score if successful, 0 if optimization not applicable
        r   c              3  >   K   | ]}|                                 V  d S ru   r  r  s     ri   r   zAScheduler.shared_data_after_inverting_indexing.<locals>.<genexpr>  s*      22aqxxzz222222rk   c              3  $   K   | ]}|j         V  d S ru   r  r  s     ri   r   zAScheduler.shared_data_after_inverting_indexing.<locals>.<genexpr>  s5       .
 .
CH.
 .
 .
 .
 .
 .
rk   r   c                    i | ]
}|j         |S r{   r  r  s     ri   r}  zBScheduler.shared_data_after_inverting_indexing.<locals>.<dictcomp>  s    JJJ##JJJrk   r   index0index1c              3     K   | ]}|V  d S ru   r{   )r   r  s     ri   r   zAScheduler.shared_data_after_inverting_indexing.<locals>.<genexpr>  s"      %T%Ttd%T%T%T%T%T%Trk   r   )generate_inverse_formulaNTFz!Shared memory after inversion: %d)(r&   rH  r   r   buffer_namesr   rn  r   r   r  r  r  r   r2   r   r_  r   r   r  r   r  	subblocksget_read_exprs$torch._inductor.invert_expr_analysisr  varsr   Add	make_argsr   rW   r   r   combine_modular_indexing_pairsr   rw  rT  r   r%  r  )rh   r~   r   node1_buffer_namesnode2_buffer_namescommon_buffer_namesnode2_unmet_dependencies
node2_readnode2_writenode1_writesnode1_writenode2_read_exprs	read_exprread_expr_indexwrite_expr_indexr  r  simplified_termstermsimplified_read_exprinverse_formulascores                         ri   $shared_data_after_inverting_indexingz.Scheduler.shared_data_after_inverting_indexing  s   & 4 	222E5>22222 	2 #.;;==".;;==03EE" 	2 $. .
 .
 % 8.
 .
 .
 $
 $
  $&88 	2'((1,,2 u &''!++s53D3K/L/Lq/P/P2$u067788
4 1 899::*i00 	
9
 9
 	 2JJ1B1IJJJ?,..2":?3+y11 	2 "++-- !222 K$4442?k...#j6J2K2Kq2P2P2 u{)**a//2 ;  	2 222EK66667
 &%T%Tu{7Q7Q7S7S%T%T%TTT  A%%2.//00	 28<<<&O' :8 DDDDD&O'QQQQQQ[%a(
z??a2I''	22 	 	D## ??EE     ##344223GTUWW "2
 7<k6P7
"?3 8G"#34 	""4///((66%%%%%%;UCCCrk   c                   t           j        rt          d ||fD                       rdS |                                s|                                rdS |j                                        }|j                                        }||z  }|sdS d |j                                        D             }d |j                                        D             }g }|D ]}	||	         }
||	         }|
                                |                                k    rN|                    t          j
        j                            |
                                d          |
|f           t          |          dk    rdS t          |t!          j        d                    \  }}
}t%          |
t&                    rt%          |t&                    sdS |
j        |j        k    rA|
                                |                                k    r|                     |
          S dS d}|                                s|                    |
|          }nk|                                s|                    ||
          }n@t2                              d	|                                |                                           |r.t9          j        t<          |                     ||                    ndS )
a  
        Right now just greedily reorder the loop of node1 to be compatible with node2,
        but ideally we should have some heuristics to reorder the loop for node2
        to be compatible with node1 if that's more efficient.

        Return the amount of shared data re-computed in this method.
        If no such recomputation happens, return -1 (not return 0 since 0 is a valid
        amount of shared data).

        c              3  >   K   | ]}|                                 V  d S ru   r  r  s     ri   r   z>Scheduler.shared_data_after_reordering_loop.<locals>.<genexpr>2  s;       8
 8
AHHJJ8
 8
 8
 8
 8
 8
rk   r   c                    i | ]
}|j         |S r{   r  r  s     ri   r}  z?Scheduler.shared_data_after_reordering_loop.<locals>.<dictcomp>D  rv  rk   c                    i | ]
}|j         |S r{   r  r  s     ri   r}  z?Scheduler.shared_data_after_reordering_loop.<locals>.<dictcomp>E  rv  rk   r   r   r-  Fz?Don't reorder loops since both nodes are reductions: %s v.s. %s) r&   re  r   r#  r   r  r  rz  r   rW   r   r   	size_hintr   r   r  r  r  r   r2   r  r_  dep_size_hintr   r  r  r&  r  rR  rS  r   rT  )rh   r~   r   r  r  r  r|  r}  
candidatesr   r~  r  _numel	reordereds                 ri   !shared_data_after_reordering_loopz+Scheduler.shared_data_after_reordering_loop"  s     0 	C 8
 8
!&8
 8
 8
 5
 5
 	 2
  	%"3"3"5"5 	2".;;==".;;==03EE" 	2XX53D3U3U3W3WXXXXX53D3U3U3W3WXXX 
. 	 	K$[1G$[1G335566889 9 !!(2273D3D3F3FQR2SS   z??a2 $'zx7J17M7M#N#N#N '9-- 	Z5S5S 	2w///
   ""g&7&7&9&999))'2222	!!## 		77IIII##%% 	77IIII##Q       FKT55eUCCDDD	
rk   c                    t          |t          t          f          o)|                                 ot	          |j                   S )z>
        Is this node unfusable under any conditions.
        )r   r  rG  r#  rT   r   r  s     ri   r0  zScheduler.unfusable_node}  sK    
 t79OPQQ C$$&&&C7	BBB	
rk   prologue_noder  rq  r  c                   |                                 t          j        j        k    rdS |                                }|                                }d}|||z  k    r |d           dS t          d |                                D                       }|t          j	        j
        j        j        fk    r |d           dS dd} ||                                j                  r!|                                s |d           dS dS )zT
        Heuristics to avoid benchmarking predictably slow prologue fusions
        T皙?z@prologue fusion will not increase amount of bytes read in kernelFc              3     K   | ]9}|j         0|j                                         D ]}|j        dk    |j        V  :d S )Ncall_function)r   r\  r  r]  )r   r  r   s      ri   r   zEScheduler.check_prologue_fusion_heuristics_fusable.<locals>.<genexpr>  sf       
 
v!V'')) "!t&&	 H '&&&&
 
rk   z\prologue fusion will not increase attempt to fuse in padding bc it increases unaligned readsr  torch.dtyper   rl   c                &    | j         dk    o| j        S )Nr   )itemsizeis_floating_point)r  s    ri   low_prec_fpzGScheduler.check_prologue_fusion_heuristics_fusable.<locals>.low_prec_fp  s    >Q&B5+BBrk   zVprologue fusion that must be upcast to fp32 not profitable for low precision templates)r  r  r   rl   )r   rW   r   invoke_quant_opsro  rq  r   r   r  r  r  constant_pad_ndr  r  r  r
  )	rh   r  r  rq  
read_byteswrite_bytesBYTES_THRESHOLD_MULTIPLIERrb  r  s	            ri   (check_prologue_fusion_heuristics_fusablez2Scheduler.check_prologue_fusion_heuristics_fusable  s[    ,,..!'2JJJ4"88::
#::<< &)"'AABBCRSSS5  
 
",,..
 
 
 
 
 uy~5=???Cn   5	C 	C 	C 	C K@@BBHII	!>>@@	 Ch   5trk   /Optional[tuple[int, SchedulerNode, sympy.Expr]]c                    t          |t                    rt          |t                    sdS t          |j        t          j                  rt          |j        t          j                  sdS |                                s|                                rdS t          j        dk    rdS |j        |j        }}|\  }}|\  }}|	                                s:|	                                s&||k    s t          |          t          |          k    rdS t          |j        j                  dk    st          |j        j                  dk    rdS                      t          t          |j        j                                      }	                     t          t          |j        j                                      }
t!          |	|
          t          j        k    rdS d
 fd} ||          s ||          rdS g }t%          t'          ||                    D ]#\  }\  }}||k    r|                    |           $t          |          dk    rdS |d	         }||         ||         }}t*          j        j                            ||          r|||fS t*          j        j                            ||          r|||fS dS )ao  
        Fusing two small pointwise nodes significantly reduces kernel overhead
        and launch overhead. However, slightly different sizes would prevent fusion.
        Here, we decide if expanding sizes of one node is profitible by allowing
        fusion, and returns the dimension to expand, node with smaller sizes,
        and new size after expand.
        NrC  r   r   r\   r   rl   c                .   | j         j        D ]}|j        j        v rj        |j                 }nj                            |j                  }|rBt          j        j        	                    ||           rt          |j        t                    s dS dS r  )r   r   r   r?  r@  r  rW   r   r:  rF  r   r  rG  )r   rL  rM  rh   s      ri   has_reusable_bufferzIScheduler.get_expand_dim_for_pointwise_nodes.<locals>.has_reusable_buffer  s    (.    9 ;;; $ ;DI FII $ 0 4 4TY ? ?I  ,66y$GG  'y'<>TUU 
  445rk   r   r  )r   r   r   r)   r   r  r&   rD  rU  r   r   r   r  r  r  r  r  small_memory_access_thresholdr  r  r   rW   r   r   statically_known_lt)rh   r~   r   n1_sizesn2_sizesn1_iter_sizesn1_reduce_sizesn2_iter_sizesn2_reduce_sizesnode1_write_memorynode2_write_memoryr  mismatch_dimensionsidxn1_sizen2_sizemismatch_dimmismatch_size1mismatch_size2s   `                  ri   "get_expand_dim_for_pointwise_nodesz,Scheduler.get_expand_dim_for_pointwise_nodes  s    %// 	z%7W7W 	4 uz2#455	5:r'899	 4 ))++ 	u/M/M/O/O 	4 ))4 #\5<()1&)1&  	!!##	 /11=!!S%7%7774 u '((1,,E4E4L0M0MPQ0Q0Q4 "//T%:K:R5S5S0T0TUU!//T%:K:R5S5S0T0TUU"$67723 3 4	 	 	 	 	 	  u%% 	)<)<U)C)C 	4 !'0]M1R1R'S'S 	0 	0#C#'7'!!#**3///"##q((4*1-,',' ' 7//OO 	66W11..QQ 	664rk   FTcan_reorderrH  c                   ||u rdS t          |t                    r|                    |          S t          |t                    rdS t          ||          }|                                r=|                     |                                                              ||          rdS t          |t                    st          |t                    r |d           dS t          |t          t          f          r!|                                s |d           dS t          |t          t          f          r!|                                s |d           dS |                                |j        z  r |d           dS |                                rXt          j        s |d           dS |                                s|                                r |d           dS |                                }t          |t"          j                  s |d	           dS |                                }t)          d
 |j        D                       |z
  }|                                |z  r |d           dS |                                s|                                r |d           dS |                                dd         D ]J}	|	                                }
|
D ]1}t5          fd|j        D                       s |d             dS 2Kt          |t8                    s|gnd |j        D             }t=          |          dk    sJ |d         }t=          d         j                  dk    rNt=          d         j        d         j                  dk    r%d         j        d         j        d         j         |u s |d           dS | !                    |||          sdS |                                rA|                                s |                                st          j"        s |d           dS |                                tF          j$        j%        z  s&|                                tF          j$        j%        z  r |d           dS |                                }|                                }||k    r |d||           dS ~| &                    |||          }t          |tN                    sJ |r:|t          j(        k     r*t          j)        r| *                    ||          }|dk    r|}t          j+        ra| ,                    ||          x}rI|\  }}}|-                    ||           | &                    ||          }t          |tN                    sJ t          j.        r.|t          j(        k     r| /                    ||          }|dk    r|}t`          1                    td          j3                  rAt`          4                    d|5                                |5                                |           tF          j6        7                    | |||          sdS |                                |j        z  ra| 8                    ||          oJtF          j6        8                    | |||          o(|                     |          8                    ||          S tF          j6        9                    | |||          o(|                     |          9                    ||          S )zj
        Determine if it is possible to combine node1 and node2 into a
        single fused node.
        FTz/grouped node must not be fused with other nodesznode1 is extern or nopznode2 is extern or nopznode1 must go before node2zprologue fusion turned offz2prologue fusion only supported for pointwise nodesz2prologue fusion only supported for TritonTemplatesc              3  >   K   | ]}|                                 V  d S ru   r{  )r   rb  s     ri   r   z%Scheduler.can_fuse.<locals>.<genexpr>g  s*      EEc3<<>>EEEEEErk   z;prologue fusion not implemented for kernel for these inputsz:template prologue can only fuse functional pointwise nodesNr   c              3  *   K   | ]}|j         v V  d S ru   r   )r   r-  prologue_nodess     ri   r   z%Scheduler.can_fuse.<locals>.<genexpr>w  s*      QQttyN:QQQQQQrk   z7template prologue can only fuse nodes with a single usec                :    g | ]}|                                 |S r{   r  r  s     ri   r   z&Scheduler.can_fuse.<locals>.<listcomp>~  s%    AAAAAaAAArk   r   r   zEtemplate prologue can only fuse nodes with a single use into templateztemplate epilogue not satisfiedz#fusion for buffer explicit disabledzdevice mismatch (%s vs %s)rG  z%s and %s has %s shared data):r   rA  rY  r  r#  rc  r   can_fuse_multi_outputs_templater  r  rG  r   r   r&   r0  r   r  r)   r  get_allowed_prologue_inpsr   r  r  r  r   r~  r   r  r   r   r   rg  r   r  r,  rW   r   no_fuse_buffer_namesrT  r   score_fusion_memory_thresholdre  r  $expand_dimension_for_pointwise_nodesr  r  rH  r  r  r  r  r  r&  r  r  r   can_fuse_verticalcan_fuse_horizontal)rh   r~   r   r  rH  rq  r  r  unsupported_prologue_argsr   	node_outsr   template_snodestemplate_snoder  device2shared_data_scorenew_shared_data_scoreexpand_analysis
expand_dimsmaller_nodeexpand_sizer  s                         @ri   r   zScheduler.can_fuse$  s#    E>>5e455 	.&&u---e455 	 5u%% 	4#3#3$
 $

)
)%
7
7	 4e122 	j'7
 7
 	 CABBB5u8:PQRR	%%''	 C()))5u8:PQRR	%%''	 C()))5$$&&8 	C,---5 8	) 0111u!!## u'8'8':': HIIIu7799Hh(?@@ HIIIu$,$F$F$H$H! EEX_EEEEE'( &
 %%''*CC QRRRu--// 53Q3Q3S3S PQQQu"__..N&ss+ % % ,,..	$ % %CQQQQsyQQQQQ %UVVV$uuu%% "%);<<BAAAAA 
 ''1,,,,,Q/N N2&.//144r*215;<<AA"2&.q17:?>QQ[   u@@sSS u 	**,,	!!##	 )	
 C12225""$$qw'CC 	""$$qw'CC	 C56665!!##""$$WC,fg>>>5 4454M 5 
 
 +S11111 	:!F$HHH1 I %)$J$J5RW$X$X!$))$9!6 	6#FFueTTTO	6 7F3Z{<<ZUUU $ 8 8 F F/55555 1	:!F$HHH$($M$Mu% %! %))$9!))'-88 	##.    !	   y!!$u6GHH 	5$$&&8 
	M &&ue44 MI//eUDUVVM$$V,,>>ueLL 900eU$5  M""6**>>ueLLMrk   c                   |                                 }t          ||          }t          t                    }|j        D ]o}| j                            |j        |j                  }t          |t                    r| 
                    |||          rT||                             |           p|j        j        D ]}t          |t                    s|                    | j                            |j        |j                            }	|	r0|	D ]-}
|                     |
|          r|	                    |
           .t#          d t$          j                            |                                          D                       }||z  r |d           dS |                                }|D ]D}| j        |                                         }|| j        |         j        z  r |d            dS EdS )a  
        Check if it is legal to fuse a consumer (node2) into a producer (node1).

        We can fuse them if all the reads of node2 either match
        corresponding writes in node1, or are written by nodes that can
        be scheduled before the fusion of node1 and node2.
        c              3  $   K   | ]}|j         V  d S ru   r  r  s     ri   r   z.Scheduler.can_fuse_vertical.<locals>.<genexpr>  s8       $
 $
 H$
 $
 $
 $
 $
 $
rk   zmemory deps did not matchFz(intermediate nodes between node1 & node2T)r  r  r   r  rn  re  r  r   r   r4   r=  r   r   r  r2   fusable_read_and_writer  r   r  r  r  r   r   r@  r  r  r   )rh   r~   r   node1_buf_namesrq  remaining_deps_by_namer   r   cd	remainingrj  remaining_depsnode1_op_namesr  s                 ri   r  zScheduler.can_fuse_vertical  s.     0022u%%7B47H7H+ 	5 	5C(,,SXsx@@D#w'' D,A,A#ue,T,T "4(//4444#* 		- 		-Bb),, .22%))"'27;; I  -# - -B222r:: -!((,,,# $
 $
 445K5R5R5T5TUU$
 $
 $
 
 

 O+ 	
 C+,,,52244" 	 	D&t,==??G 7 @ JJ >???uu trk   weak_depr4   c                B  	
 j         |                                vrdS fd|j        j        D             }t	          |          dk    rdS |d         
t          
t                    rdS t          
t                    sJ t          
j	        t          j                  rdS | j        j                 	|g}t          |t                    r|j        }d}|D ]@}	fd|j        j        D             }|s|dz  }t#          
fd|D                       s dS A|dk    S )NFc                4    g | ]}|j         j        k    |S r{   )r   r  )r   r  r  s     ri   r   z.Scheduler.fusable_weak_dep.<locals>.<listcomp>  s3     
 
 
zX222 222rk   r   r   c                *    g | ]}|j         k    |S r{   r  )r   rL  r  s     ri   r   z.Scheduler.fusable_weak_dep.<locals>.<listcomp>-  s0       9	)) )))rk   c              3     K   | ]Y}t          |t                    o?t          |j        t          j                   o|j        j        k    o|j        j        k    V  Zd S ru   )r   r2   r!   r   r#   TMPr   )r   rL  r  s     ri   r   z-Scheduler.fusable_weak_dep.<locals>.<genexpr>5  s        
 	 4++ ,+DJAAA,J%+-, I+	     rk   )r   r  r   r  r   r   r3   r2   r!   r   r#   r  r  r  r`  r   r   r   )rh   r  r~   r   mutating_writesrelevant_reading_nodesnum_concurrent_readsreading_noderelevant_readsr  r  s    `       @@ri   r=  zScheduler.fusable_weak_dep  s    = 6 6 8 8885
 
 
 
*1
 
 

 1$$5"eW%% 	5%+++++u{DH55 	5+H,AB	"'e788 	2%*\" 2 	 	L   (4:  N
 "  A%     
 +      uu $q((rk   rL  r1   r  r2   c                   t          |t                    r4| j                            |j        |j                  }||j        k    s>t          |j        t          j                  st          |j        t          j                  rdS t          j
        r8|j        |j        k    r(|                                }|                                }|                     |j                  rdS |j        |j        k    oSt          |j                  t          |j                  k    o)|j        d t          |j                           |j        k    S t          |t"                    ri| j                            |j        |j                  }| j                            |j        |j                  }|j        |j        k    r|j        ||k    rdS dS rF  )r   r2   re  r  r   r!   r   r#   r  r&   re  r  r_  r&  r  r   r   r3   )rh   rL  r  	read_name
write_names        ri   r  z Scheduler.fusable_read_and_writeC  s   dI&& #	-11$)TYGGI UZ''&tz48<< (&u{DH== ( u0 *T]en5T5T ~~'')) 11%*== u 
ek) ?	NNc%*oo5?I/EJ/0EJ>
 g&& 	-11$)TYGGI.225:uzJJJ	UZ''J*++turk   r   rQ  c                B    t           j                            ||          S ru   )rW   r   get_dep_size_hint)rh   r   rQ  s      ri   r  zScheduler.dep_size_hintj  s    w((k:::rk   return_is_mix_order_reductionint | tuple[int, bool]c                    fd}|rBt                               |          r't                               |          } ||d          S t          |j        j                  t          |j        j                  z   }t          j        j                  t          j        j                  z   }	t          ||	          dz  t          ||	          k     rV||	k    r|c}fd|j        j        |j        j        z  D             }
 |t           fd|
D                       d          S |j        j        |j        j        z  j        j        j        j        z  z  } |t           fd|D                       d          S )zn
        The first term in our fusion score that estimates number of saved
        memory operations.
        c                    r| |fn| S ru   r{   )r  is_mix_order_reductionr	  s     ri   _construct_return_valuez>Scheduler.score_fusion_memory.<locals>._construct_return_valuez  s      1.//rk   Tr/  c                L    g | ] }|j         j        v s|j         j        v |!S r{   )r   r   r  )r   r   r   s     ri   r   z1Scheduler.score_fusion_memory.<locals>.<listcomp>  sD       %+111SE<M<T5T5T 5T5T5Trk   c              3  D   K   | ]}                     |          V  d S ru   r  )r   r   rQ  rh   s     ri   r   z0Scheduler.score_fusion_memory.<locals>.<genexpr>  s3      IISD&&sK88IIIIIIrk   Fc              3  B   K   | ]}                     |          V  d S ru   r  r  s     ri   r   z0Scheduler.score_fusion_memory.<locals>.<genexpr>  s1      FFC""3''FFFFFFrk   )
r   r   r   r   r   r   r  r  r  r   )rh   r~   r   rQ  r	  rH  r  r  node1_dep_lennode2_dep_lenr  common_memory_depss   ` ```       ri   rT  zScheduler.score_fusion_memorym  s   	 	 	 	 	 % 	8):)C)CE5)Q)Q 	8
 &66ueDDE**5$777E-344s5;L;S7T7TTE-344s5;L;S7T7TT }m,,q03}m3T3TTT},,$eu    ,2U5F5MM  D +*IIIIIDIIIII5   $/58I8PP#e&7&>>
 '&FFFF3EFFFFF
 
 	
rk   c                $   t          |          dk    r|S i }|D ]\  }}|                                |                                k    sJ |                                }t          |                     |                              ||                    }||vr	||fg||<   ||                             ||f           t          |                                t          j	        d                    d         }t          |          dk    sJ |S )Nr   r-  r   )
r   r   r   rc  get_fusion_pair_priorityr   r  r  r  r  )rh   r  "possible_fusions_group_by_priorityr~   r   r  fusion_pair_priority&possible_fusions_with_highest_prioritys           ri   r2  z4Scheduler.get_possible_fusions_with_highest_priority  sY   
   A%%##  	+ - 	 	LE5##%%)9)9););;;;;%%''F#&  ((AA%OO$ $  $+MMMENL23GHH 33GHOOEN    25.4466H<OPQ<R<R2
 2
 2

2. 9::Q>>>>55rk   r   r   c                0    t          j        j        | g|R  S )z-
        Shim for list.sort(key=...)
        )rW   r  score_fusionr;  s     ri   r3  zScheduler.score_fusion_key  s     y%d3U3333rk   c                    t          t          j                                                  }t	          | j                  D ]7}|                    || j                   |                    |j	                   8dS )zg
        Populate node.last_usage recursively (also for the nodes within a FusedSchedulerNode)
        N)
r   rW   r   r  r   r  r  r  r  r`  )rh   r  r   s      ri   r  zScheduler.compute_last_usage  sv    
 ))A)A)C)CDDTZ(( 	8 	8D 3T5LMMM&&t7777	8 	8rk   c                ,   t          | j        t          j        j        z
  t          j        j        j        z
            D ];}|| j        v rK| j        |         }|                                r)t          j        j        	                    |j
                   W|t          j        j        v rt          j        j        |         }t          |t          j                  r%t          j        j        	                    |           t          |t          j                  r|j        }t          |t          j                  r|                                sJ t          j        j        	                    |j                   =| j                                         dS )z*Free any buffers that are no longer neededN)r2  r  rW   r   rE  r:  freedr@  rI  codegen_freer   r  r   r)   r  r  r   rv  is_input_bufferr!  )rh   r   r   rb  storages        ri   free_bufferszScheduler.free_buffers  sb   %g%&g"()
 
 	D 	DD
 t'''&t,<<>> @G(55ch???---g*40c2#566 	DG(55c::::R%677 D!hG"7BM::?F?V?V?X?X XG(55glCCC!'')))))rk   c                    | j                                         D ]}|                                 |                                  d S ru   )r^  r   flushr#  )rh   r[  s     ri   r%  zScheduler.flush  sF    }++-- 	 	GMMOOOOrk   scheduler_noder  c                   t          |t                    sJ t          d         dxx         dz  cc<   t          j        t          d                    5  |                                 |                                 d d d            n# 1 swxY w Y   |j        }t          |t          j
                  sJ dt          |                      |                    t          j        j                   |                                  d S )Nr  extern_callsr   F)increase_kernel_countztype(node)=)r   r  r   rW   set_kernel_handlerr.   rO  r  r   r)   r	  r   rA  r   r:  r#  )rh   r&  r   s      ri   codegen_extern_callzScheduler.codegen_extern_call  s9   .*CDDDDD
 	^,,,1,,,!&u"E"E"EFF 	& 	&00222##%%%	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& "$00BB2BT$ZZ2B2BBB0QW)***s   )B

BBBaseSchedulingc                F   t          |j                  r|j        J | d            t          j                            |           t          |j                  }|t          d|j                   t                      s|j        dk    rKt          j
                            |          x}j        dk     r!t          |t          j                              t          |j                  r+|j        dk    s t!          t          j                               ||           S )Nz( should have been normalized in loweringzUnsupported device type: r      r  )rR   r   r   rW   r   add_device_infor-   r2  r$   r  r   get_device_propertiesmajorr5   inspectcurrentframer6   )rh   r  device_schedulingdevice_propss       ri   create_backendzScheduler.create_backend  s   &+&& 	
&,*B*B??? +C*BB 	
'''5fkBB$H6;HHIII|| 	<v%%%*Z%E%Ef%M%MM\TWXXX(w7K7M7MNNN$$ <V[E-A-A#G$8$:$:;;;  &&&rk   c                p    |J || j         vr|                     |          | j         |<   | j         |         S ru   )r^  r6  r  s     ri   rc  zScheduler.get_backend  sB    !!!&&$($7$7$?$?DM&!}V$$rk   c                4    d	 fdfd|                                 D             }t          |                                          }|rLt          |t	          j        d                    \  }}t          j        j        	                    |           d S d S )
Nr  torch.fx.Noder   r   c                    | j         vr;j                             d t          | j        j                  D                        j         |          S )Nc                    i | ]\  }}||	S r{   r{   r  s      ri   r}  z>Scheduler.enter_context.<locals>.get_order.<locals>.<dictcomp>!  s    ,W,W,WdaQ,W,W,Wrk   )r  r  r  r   r  )r  rh   s    ri   	get_orderz*Scheduler.enter_context.<locals>.get_order  sQ    ,,,$++,W,Wi>V>V,W,W,WXXX'**rk   c                r    i | ]3}|j         	|j                                         D ]} |          |fd 4S ru   r  )r   r  r   r<  s      ri   r}  z+Scheduler.enter_context.<locals>.<dictcomp>%  sY     
 
 
v!V'')) "! Yq\\1t!!!!rk   r   r-  )r  r9  r   r   )
r   r  r  r  r  r  rW   r   r:  enter_context)rh   r   rb  r  lastr<  s   `    @ri   r>  zScheduler.enter_context  s    	+ 	+ 	+ 	+ 	+ 	+
 
 
 
^^%%
 
 
 w||~~&& 	5'x':1'='=>>>GAtG ..t44444	5 	5rk   r   rc  r_  c                    	 | j         |         j        }n# t          $ r Y dS w xY wt          fd|D                       o|| j        vo|| j        vS )NFc              3  R   K   | ]!}|j         p|                                v V  "d S ru   )r  r  )r   r-  rc  s     ri   r   zAScheduler.can_buffer_be_removed_through_fusion.<locals>.<genexpr>8  s8      VVC3C CVVVVVVrk   )r@  r  KeyErrorr   re  r  )rh   r   rc  r  s     ` ri   rb  z.Scheduler.can_buffer_be_removed_through_fusion0  s    	$T*0EE 	 	 	55	 VVVVPUVVVVV 4D114D33	
s    
$$c                   |j         }t          |t          j        j        j                  r]|j        x}rTt          |          \  }}|t          j	        v s|t          j	        v r&t          |t          j
        j                  sJ d| S t          j        j        j        j        st          j        dS t          |t                     r'|j        D ]}|                     |          }|r|c S dS |j         J |                                s|                                 dS t          |j         t          j                  rdS t          |j         t          j                  rdS t/          |j         dd          rdS t1          |j                   rd	S |                     |          x}r|S t          j        j        rt7          |          rd
S dS )z
        Return the reason why we should partition the inductor graph on this node,
        or None if the node is cudagraphable.
        zcustom partition op: Nz6partition includes all ops when cudagraphs is disabledz opszDeviceCopy opszConditional opsunbacked_bindingszunbacked binding opszCUDAGraph-unsafe custom opszdynamic shape ops)r   r   r  r  r)   rI  r  rM   r&   custom_should_partition_ops_ops
OpOverloadr   r[   rE   wrapperr   r   should_partitionrR   r   
DeviceCopyConditionalr  rQ   &_uses_cudagraph_unsafe_unbacked_symintcudagraph_skip_dynamic_graphsr'  )rh   r   r)  r  op_overload_packet_nameop_overload_namer  r!  s           ri   rI  zScheduler.should_partition=  s    )gu1@AA 		B%%B		B 9ER8H8H5#%5'6+MMM#v'III!"ej&;<<<<<A/?AAA &-8	L6>FKKd.// 	 " "..u55 "!MMM"4y$$${{}} 	.oo''----di// 	$##di00 	%$$491488 	*))!$),, 	100@@FFF6 	M =6 	+-d33 +**trk   r  c                   t                      }t          j        s|S | j        D ]}|j        }|t          |t          j        j        j	                  s1|j
        }|;t          |          \  }}|t          j        vr|t          j        vrj|                                D ]a}t          j        j                            |          }t#          |t$          j        t$          j        f          r|                    |           b|S )zc
        Collect output unbacked symints from ops in config.cudagraph_unsafe_unbacked_ops.
        )r   r&   cudagraph_unsafe_unbacked_opsr  r   r   r  r  r)   rI  r  rM   r  rW   r   r   r   r"   r#   UNBACKED_INTUNBACKED_FLOATr  )rh   unsafe_symintsr   r)  r  rN  rO  syms           ri   &_get_cudagraph_unsafe_unbacked_symintsz0Scheduler._get_cudagraph_unsafe_unbacked_symints|  s   
 4><<3 	"!!J 	, 	,DiGgu'9'HII $Bz8DR8H8H5#%5'v/SSS$F,PPP7799 , ,g&//44!#(94;N'OPP ,"&&s+++,
 rk   c                    |                                  }|sd S t          |          }|D ]=}t          j        j                            |          }|j        D ]}||v r	d| c c S >d S )Nz'uses cudagraph-unsafe unbacked symint: )rV  r'  rW   r   r   r   r    )rh   r   rT  node_symbolsrU  simplified_symfree_syms          ri   rL  z0Scheduler._uses_cudagraph_unsafe_unbacked_symint  s     DDFF 	45d;; 	P 	PCW-66s;;N*7 P P~--OXOOOOOOO .P trk   ;dict[str, Union[ir.IRNode, ir.TorchBindObject, sympy.Expr]]c                    i }|                     t          j        j                   | j        D ]+}|j                                        D ]\  }}|j        ||<   ,|S )z~
        Return a mapping from name strings to the corresponding graph inputs or
        base scheduler node outputs.
        )r  rW   r   r  r  ri  r  r   )rh   rh  r   r   scheduler_buffers        ri   get_name_to_nodeszScheduler.get_name_to_nodes  sv     UWAG0111J 	; 	;D*.*>*D*D*F*F ; ;&&%5%:T""; rk   
signatureslist[GraphPartitionSignature]c           	        d t          t          j        j                  D             }d t          t          j                                                  D             }g t          j        _        t          |          D ]\  }}|j        rg }|j        D ]*}|                    |	                    |                     +g }|j
        D ]<}	|                    |	                    |	                                                     =t          j        j                            t          ||||j                             dS )z
        computes a mapping from partition input/output indices to graph input/output
        indices for each partition.
        c                    i | ]\  }}||	S r{   r{   r   r  r   s      ri   r}  z:Scheduler.compute_graph_partition_maps.<locals>.<dictcomp>  s+     %
 %
 %
##tD#%
 %
 %
rk   c                    i | ]\  }}||	S r{   r{   rc  s      ri   r}  z:Scheduler.compute_graph_partition_maps.<locals>.<dictcomp>  s+     &
 &
 &
##tD#&
 &
 &
rk   N)r  rW   r   r  r  partition_mapsskip_cudagraphinput_nodesr   r  output_nodesr  rN   constant_names)
rh   r_  name_to_graph_input_indexname_to_graph_output_indexpartition_id	signatureinput_mappingr   output_mappingr   s
             ri   compute_graph_partition_mapsz&Scheduler.compute_graph_partition_maps  si   %
 %
'01E'F'F%
 %
 %
!&
 &
'01I1I1K1K'L'L&
 &
 &
" "$'0'<'< 	 	#L)' 
 M!- J J$$%>%B%B4%H%HIIIIN!. W W%%&@&D&DT]]__&U&UVVVVG"))! !",	    !	 	rk   	partitionr]   rg  c                   dddd} t                      j        d |D              } |j        fd	|                                D                ||          }t                      }|D ]@}t          j        j                            |          }|                    |j                   At          t          |t          j        d
                              S )ai  
        Returns all symbol inputs which are required to be in scope to successfully
        perform codegen for this graph partition, including:
        - free symbols used in partition nodes
        - free symbols in partition input/node shapes, strides, and offsets. This is needed
          for recording cudagraphs for tensors with dynamic shapes.
        r   0Union[ir.IRNode, sympy.Expr, ir.TorchBindObject]r   r  c                    t          | t          j                  rt                      S t          | t          j                  rt          |           S t          dt          |                      )zW
            Gets symbols used in input node shapes, strides, and offsets.
            zUnsupported input node type: )r   r)   r  r   r  r"  r5  r   r   s    ri   get_input_node_symbolszKScheduler.get_graph_partition_symbol_inputs.<locals>.get_input_node_symbols  sf     $ 233 X!||#D"),, X)$/// **V$t***V*VWWWrk   symbolsc                4    t          d | D                       S )z
            Filters a set of symbols that are required for codegen. Skip symbols
            that are always internal to kernels, such as SymT.TMP, SymT.INDEX,
            and SymT.R0_INDEX.
            c              3     K   | ]B}t          |t          j        t          j        t          j        t          j        f          >|V  Cd S ru   )r"   r#   SIZEFLOATrR  rS  r  s     ri   r   zVScheduler.get_graph_partition_symbol_inputs.<locals>.filter_symbols.<locals>.<genexpr>  sd        !	
)+	      rk   r   )rv  s    ri   filter_symbolszCScheduler.get_graph_partition_symbol_inputs.<locals>.filter_symbols  s2             rk   c              3  4   K   | ]}t          |          V  d S ru   r&  r  s     ri   r   z>Scheduler.get_graph_partition_symbol_inputs.<locals>.<genexpr>  s+      IIt,T22IIIIIIrk   c              3  4   K   | ]\  }} |          V  d S ru   r{   )r   r  r   ru  s      ri   r   z>Scheduler.get_graph_partition_symbol_inputs.<locals>.<genexpr>!  s3      NNwq$$$T**NNNNNNrk   r   r-  )r   rs  r   r  )rv  r  r   r  )r   r  r  r  rW   r   r   r   r    r2  r  
attrgetter)	rh   rq  rg  r{  candidate_symbolsr  rw  symplified_sru  s	           @ri   !get_graph_partition_symbol_inputsz+Scheduler.get_graph_partition_symbol_inputs  s   	X 	X 	X 	X 	 	 	 	, 7Ijll6HIIyIII7
 	! NNNN+:K:K:M:MNNN	
 	
 +N+<==(2" 	2 	2A7+44Q77LJJ|01111&(*=f*E*EFFFGGGrk   
partitionslist[PartitionType]skip_cudagraphs
list[bool]c           	         g }t          t          j                                                  }                                 d fdt          t          |          t          |                    D ]\  }}t                      }|D ].}|                    |j        	                                           /|
                    |          }	t          j                            d |D                       }
t          d |
j        |
j        z  D                       |z
  }t           fd|D                       }t                      |D ]}                    |j                   fd	|z
  D             }|                    |           fd
|D             }fd|D             }fd|D             }|	                    |           t           fd|	D                       }	fd|	D             }d |D             }                     ||          }t%          ||||||          }|                    |           |                    ||	z
            }|ddd         S )z
        Gets signature for each graph partition, including input nodes, output nodes, and
        whether deallocating an input within graph partition.
        r  r   r   rl   c                    j                             | d          }|dS t          |j        j        t
                    r*j                            | d          x}r |          S dS dS )z
            Checks if buf_name resolves to a NoneLayout buffer (following mutation_real_name).
            Buffers with NoneLayout are not allocated so graph partition should not
            take them as inputs or outputs.
            NFT)r@  r  r   r   r&  r=   r  )r  r   r  is_unallocated_bufferrh   s      ri   r  zFScheduler.get_graph_partition_signature.<locals>.is_unallocated_buffer:  sz     "&&x66C{u#(/:66  !% 7 ; ;Hd K KK9 <00;;;t5rk   c                    g | ]	}|j         
S r{   r  r  s     ri   r   z;Scheduler.get_graph_partition_signature.<locals>.<listcomp>]  s    888d!888rk   c                F    g | ]}t          |t                    |j        S r{   )r   r4   r   r  s     ri   r   z;Scheduler.get_graph_partition_signature.<locals>.<listcomp>d  s:       )!W55  rk   c              3  N   K   | ]}j                             ||          V   d S ru   r  r  r  s     ri   r   z:Scheduler.get_graph_partition_signature.<locals>.<genexpr>m  J       / / '++D$77/ / / / / /rk   c                    g | ]}|v |	S r{   r{   r   r   rh  s     ri   r   z;Scheduler.get_graph_partition_signature.<locals>.<listcomp>y  s.     ! ! !<'' '''rk   c                *    i | ]}|v ||         S r{   r{   r  s     ri   r}  z;Scheduler.get_graph_partition_signature.<locals>.<dictcomp>  s5       <'' l4('''rk   c                "    i | ]}|v ||v S r{   r{   r   r   r  rh  s     ri   r}  z;Scheduler.get_graph_partition_signature.<locals>.<dictcomp>  s6     " " "<'' d22'''rk   c                $    g | ]}|v |v
|S r{   r{   r  s     ri   r   z;Scheduler.get_graph_partition_signature.<locals>.<listcomp>  s9     " " "<''D8L,L,L ,L,L,Lrk   c              3  N   K   | ]}j                             ||          V   d S ru   r  r  s     ri   r   z:Scheduler.get_graph_partition_signature.<locals>.<genexpr>  r  rk   c                6    g | ]} |          |         S r{   r{   )r   r   r  rh  s     ri   r   z;Scheduler.get_graph_partition_signature.<locals>.<listcomp>  s?       ,,T22T"  rk   c                8    g | ]}|t           j        j        v |S r{   )rW   r   rb  r   r   s     ri   r   z;Scheduler.get_graph_partition_signature.<locals>.<listcomp>  s.       $!'BS:S:S:S:S:Srk   Nr   )r  r   r   rl   )r   rW   r   r  r^  r  r   r  ri  r  rN  r(   r  r  r   r  r`  r  r:   r   r  )rh   r  r  r_  unmet_output_namesrq  rf  output_namesr   returned_output_namesr   partition_input_namesextra_input_namesrg  input_deallocationextra_output_namesrh  ri  symbol_inputspartition_signaturer  r  rh  s   `                   @@@ri   get_graph_partition_signaturez'Scheduler.get_graph_partition_signature.  s    
'(@(@(B(BCC--//	 	 	 	 	 	 	, *-Z  (?";";*
 *
 g	 g	%I~ -7LLL! A A##D$8$=$=$?$?@@@@$0$=$=>P$Q$Q! '1<<88i888 K  !,!2[5G!G     " %/ / / / /1/ / / % %!
 5?LL ! = =$++DO<<<<
! ! ! !1L@! ! !
 "(():;;;   1  K
" " " " "1" " "" " " " "1" " " "(();<<<$. / / / /1/ / / % %!
    1  L !6  N !BB; M #:"# # 1222!6!<!<"%::" " $$B$rk   rm  r:   c                   d |j                                         D             }d |j                                        D             }d |j        D             }d |j        D             }t          |j        ||||j        |          S )z
        Updates the partition signature by removing buffers specified in
        V.graph.removed_buffers. See [Note: Removed Graph Partition Arguments]
        c                @    i | ]\  }}|t           j        j        v||S r{   rW   r   rE  )r   r   rP  s      ri   r}  zLScheduler.clean_removed_buffer_from_partition_signatures.<locals>.<dictcomp>  s8     
 
 
f17222 &222rk   c                @    i | ]\  }}|t           j        j        v||S r{   r  )r   r   r  s      ri   r}  zLScheduler.clean_removed_buffer_from_partition_signatures.<locals>.<dictcomp>  s8     
 
 
c17222 #222rk   c                \    g | ])}|                                 t          j        j        v'|*S r{   )maybe_get_namerW   r   rE  r  s     ri   r   zLScheduler.clean_removed_buffer_from_partition_signatures.<locals>.<listcomp>  s>     
 
 
""$$AG,CCC CCCrk   c                8    g | ]}|t           j        j        v|S r{   r  r  s     ri   r   zLScheduler.clean_removed_buffer_from_partition_signatures.<locals>.<listcomp>  s2     
 
 
17222 222rk   )rg  r  r  rh  ri  r:   r  rf  )rh   rm  rg  r  rh  ri  s         ri   .clean_removed_buffer_from_partition_signaturesz8Scheduler.clean_removed_buffer_from_partition_signatures  s    
 
 ) 5 ; ; = =
 
 


 
&9??AA
 
 


 
!.
 
 


 
!0
 
 

 '#$
 
 	
rk   c                   	
 ddl t                      
g g d t          |          D             d fd	d	
fd	}|D ]5}t          |j        j                  
|<   
|         dk    r 	|           6g }d}|t          |          k     rsrr:                              \  }}|                    |            ||           :r:                              \  }}|                    |            ||           :|d
z  }|t          |          k     r|t          |          k    rt          d          |S )a  
        Reorder nodes to minimize the number of partitions via a bfs
        topological sort. This is the optimal reordering such that the
        number of partitions cannot be reduced further. This may be
        sub-optimal for other metrics such as peak memory. This does not
        change relative orders of two cudagraphable nodes, nor the
        relative order of two non_cudagraphable nodes.
        r   Nc                    i | ]\  }}||	S r{   r{   )r   r  r   s      ri   r}  z>Scheduler.reorder_for_minimizing_partition.<locals>.<dictcomp>  s    EEEysDsEEErk   r   r\   r   r2  c                    |          | f}                     |           r                    |           d S                     |           d S ru   )rI  heappush)r   node_with_indexcudagraphable_nodesheapqnode_to_indexnon_cudagraphable_nodesrh   s     ri   insert_pending_nodeszHScheduler.reorder_for_minimizing_partition.<locals>.insert_pending_nodes  s_    ,T2D9O$$T** E6HHHHH2ODDDDDrk   c                    | j         j        D ]7}|         dk    sJ |xx         dz  cc<   |         dk    r |           8d S )Nr   r   )rc  
succ_nodes)r   	succ_noder  node_to_indegrees     ri   update_indegreezCScheduler.reorder_for_minimizing_partition.<locals>.update_indegree  su    !]5 4 4	'	2Q6666 +++q0+++#I.!33((333	4 4rk   r   z
                Failed to schedule, while loop ran too long when
                reordering for minimizing the num of partitions
                r   r\   r   r2  )	r  r  r  r   rc  
pred_nodesheappopr   r2  )rh   r  r  r   ru  	num_itersr  r  r  r  r  r  r  s   `      @@@@@@ri    reorder_for_minimizing_partitionz*Scheduler.reorder_for_minimizing_partition  s    	9=CEGIEEIe4D4DEEE	E 	E 	E 	E 	E 	E 	E 	E 	E 	E	4 	4 	4 	4 	4 	4 	4  	+ 	+D%()A%B%BT"%**$$T***,.	#e**$$# %': % * &--(?@@4%%%%%% * &
 & &--(;<<4%%%%%% & &
 NI #e**$$# %': % s5zz!!   rk   c           	     `   ddl m}m} t          t          j                                                  } ||| j        | j        t          t          j        j	        
                                          |          \  }}|                     |          } ||||          \  }}	||dz  k     r|S |S )zx
        Reorder nodes to minimize the number of partitions if this only slightly
        increase peak memory.
        r   )estimate_peak_memoryprepare_planning_infor  )r  r  r  r   rW   r   r  r@  r  r  r  r  )
rh   r  r  r  r  default_peak_memoryr  reordered_nodesreorder_peak_memoryr  s
             ri   r  z0Scheduler.maybe_reorder_for_minimizing_partition   s     	HGGGGGGG"17#;#;#=#=>>:O:O#qw+002233;
 ;
77 ??FF!5!57"
 "
Q
 !4s!:::""rk   c                .   g }g }g }dd}|D ]}|                      |          du}|r.t          |j                  dk    r|                    |           I|r! ||          r|                    |           l|                    |           ||z   |z   S )	a  
        Reorder a node if it should be partitioned and has simple dependency:
        1. move a partitioned node to the front if it has no dependency
        2. move a partitioned node to the back if it is only used by OutputNode
        3. otherwise do not reorder
        r   r\   r   rl   c                    |                                  D ]*}|j        D ] }t          |j        t                    s  dS !+dS rF  )r~  r  r   r   rG  )r   r   rH  s      ri   only_output_userzPScheduler.reorder_for_partition_with_simple_dependency.<locals>.only_output_userM  s\    '')) % %9 % %C%ch
;; %$uuu%% 4rk   Nr   r  )rI  r   rn  r   )rh   r  frontmiddlebackr  r   rI  s           ri   r  z6Scheduler.reorder_for_partition_with_simple_dependency?  s     *,*,(*	 	 	 	  	$ 	$D#44T::$F $C(?$@$@A$E$ET""""! $&6&6t&<&< $D!!!!d####v~$$rk   9tuple[list[PartitionType], list[GraphPartitionSignature]]c                   g }d}g }g }| j         D ]d}|                     |          du}|r2||k    r,|                    |           |                    |           g }|}|                    |           e|r*|                    |           |                    |           |                     ||          }|                     |           |                     ||           ||fS )z
        Given a list of BaseSchedulerNodes, split into a list of
        graph partitions and compute partition input/output signatures.
        TN)r  r  )r  rI  r   r  rp  _log_graph_partitions)rh   r  rf  cur_partitionr  r   node_should_partitionr_  s           ri   r  zScheduler.graph_partition_  s$    +-
')J 	' 	'D$($9$9$$?$?t$K! #3H!H!H!!-000&&~666 "2N  &&&& 	3m,,,"">22277!? 8 
 

 	))*555"":z::::%%rk   c                   t                               t          j                  sd S t	          d t
          j        j        D                       }|sd S t          d |D                       }t          |          |z
  }t           
                    dt          |          ||           t          t          ||                    D ]\  }\  }}t           
                    d|t          |          |j        rdndt          |j                  t          |j                             |j        r|D ]}	|                     |	           d S )Nc              3  4   K   | ]}t          |          V  d S ru   )rR   )r   r  s     ri   r   z2Scheduler._log_graph_partitions.<locals>.<genexpr>  s(      OOVF^^OOOOOOrk   c              3  (   K   | ]}|j         	d V  dS )r   N)rf  r  s     ri   r   z2Scheduler._log_graph_partitions.<locals>.<genexpr>  s*      !P!Pq?O!P!!P!P!P!P!P!Prk   zCCreated %d graph partitions: %d cudagraphable, %d non-cudagraphablez3  Partition %d: %d nodes, %s, inputs=%d, outputs=%dznon-cudagraphablecudagraphable)cudagraphs_logr  r  r  r   rW   r   device_typesr   r   r&  r  r  rf  rg  rh  _log_non_cudagraphable_node)
rh   r  r_  has_gpu_devicecudagraphable_countnon_cudagraphable_countr  rq  rm  r   s
             ri   r  zScheduler._log_graph_partitions  si   
 **7=99 	F OO!':NOOOOO 	F!!P!PZ!P!P!PPP"%j//4G"GQ
OO#		
 	
 	
 *33z:3N3N)O)O 	; 	;%A%	9  EI'0'?T##_I)**I*++   ' ;% ; ;D44T::::	; 	;rk   c                   |                      |          }|sdS |                                }|j        |j                                        nd}d| g}t	          |j                  j        }|                    d|            |G|j         dd                    d |j	        D                        d}|                    d|            t                              d	|d                    |                     |b|j                            d
d          }|rG|                                                    d          D ]!}	t                              d|	           dS dS dS )z)Log details for a non-cudagraphable node.Nzreason=zir=r  r  c              3  4   K   | ]}t          |          V  d S ru   )r   )r   r  s     ri   r   z8Scheduler._log_non_cudagraphable_node.<locals>.<genexpr>  s(      2P2Pa3q662P2P2P2P2P2Prk   r  zfx=z
    %s: %srU  r  z         %s)rI  r  r   r  r   rv   r   r]  r  r4  r  r&  r^  r  stripsplit)
rh   r   r!  rk  r  partsir_typefx_strrU  lines
             ri   r  z%Scheduler._log_non_cudagraphable_node  s   &&t,, 	FMMOO	151F$)++---D#6##$ty//*_7__%%%SS2P2P7<2P2P2P)P)PSSSFLLv(((\9dii6F6FGGG !,**=$??K >'--//55d;; > >D"((====	 > >> >rk   c                    t          d          5  t          j        j        j        r|                                 n|                     | j                  	 cd d d            S # 1 swxY w Y   d S )NzScheduler.codegen)r   r  r  r&   r  _codegen_partitions_codegenr  rg   s    ri   rA  zScheduler.codegen  s    -.. 	 	 ?)9/((***]]4:..	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	s   AA""A&)A&c                   ddl m} t          j        j        }t          | j                  }t          j                                        5  t          j                            dd| ||           | 	                    |           t          t          j        j        |          sJ |                     |          }|t          j        j        _        t          j        j                                         t          j        j        }t          j        j                            t          j        j                  \  }}ddd           n# 1 swxY w Y   t          j        j                            ||           t          j        j                            ||           t          j        j        j                            d |j        D                        dS )z,Codegen a partition given its inputs/outputsr   )SubgraphPythonWrapperCodegenT
partition_)is_subgraphsubgraph_nameparent_wrapper_codepartition_signaturesNc                6    g | ]}|                                 S r{   r{  r  s     ri   r   z8Scheduler._codegen_partition_wrapper.<locals>.<listcomp>  s     @@@T]]__@@@rk   )r=  r  rW   r   r:  r  ra  set_current_wrapper_codeinit_wrapper_coder  r   r  r  write_prefixr   generateis_inferencedefine_subgraph_launcher_fncodegen_partition_call	allocatedr  rh  )	rh   rq  rm  r  r  graph_partition_id
graph_namepartition_coder  s	            ri   _codegen_partition_wrapperz$Scheduler._codegen_partition_wrapper  s    	BAAAAAg2!$"?@@W--// 	T 	TG%% ?+=??$7%.	 &    MM)$$$ ag24PQQQQQKKIVVI8AAG 5G --///J ! 4 = =ag>R S SNA/	T 	T 	T 	T 	T 	T 	T 	T 	T 	T 	T 	T 	T 	T 	T2 	
88^TTT	334F	RRR	&--@@)?@@@	
 	
 	
 	
 	
s   
C2EEE'contextlib.AbstractContextManager[None]c                P     t           j        d fd            } |            S )Nr   Iterator[None]c               3  <  K                                    j        r]t          j        j                  rDj        j        
J d            t
          j        j                            j        j                   	 d V  j        r<t          j        j                  r#t
          j        j        	                                 d _        d S # j        r<t          j        j                  r#t
          j        j        	                                 d _        w xY w)Ndevice should have an index)
%update_graph_partition_default_devicerg  rH   r   r   rW   r   r:  codegen_device_guard_entercodegen_device_guard_exit)r  rh   r_  s   ri   ctxz1Scheduler.use_default_device_context.<locals>.ctx  s@     66z:NNN* /@+00 0  28DD1 EDD $??/5  3. E3D/44 4 E G(BBDDD.2+++	 . E3D/44 4 E G(BBDDD.2+2222s   ?C AD)r   r  )
contextlibcontextmanager)rh   r  r_  r 	  s   ``` ri   use_default_device_contextz$Scheduler.use_default_device_context  sJ     
	"	3 	3 	3 	3 	3 	3 	3 
#	"	3* suurk   c                    t          |          dk    r|d         j        sd S dd}dd
}d }t          ||          D ]\  }}|j        s ||          } n|d S t          ||          D ]\  }}|j        r |||          s d S || _        d S )Nr   r   rq  r]   r   r/  c                B    | d                                          }|J |S r   r   )rq  partition_devices     ri   get_cudagraph_partition_devicezWScheduler.update_graph_partition_default_device.<locals>.get_cudagraph_partition_device  s*    (|6688#///##rk   target_devicerl   c                J    | D ]}|                                 }||k    r dS  dS rF  r	  )rq  r		  r   r  s       ri   all_on_target_devicezMScheduler.update_graph_partition_default_device.<locals>.all_on_target_device  s>     " ! !**]** 55 +4rk   )rq  r]   r   r/  )rq  r]   r		  r/  r   rl   )r   rf  r  rg  )rh   r  r_  r	  r	  cudagraph_partition_devicerq  rm  s           ri   r  z/Scheduler.update_graph_partition_default_device  s    z??a
1(D F	$ 	$ 	$ 	$
	 	 	 	 &*"$'
J$?$? 	 	 Iy+ -K-KI-V-V*
 &-F$'
J$?$? 	 	 Iy' 0D0D51 1  &@###rk   c                6   |                                  \  }}t          |          dk    r(t          d         dxx         t          |          z  cc<   |                     ||          5  t	          ||          D ]e\  }}t          |          dk    sJ dt          |                       |j        r|                     |           O|                     ||           f	 ddd           n# 1 swxY w Y   t          | j	                  }t          j        j                            |           |dk    rat          j        j        J |t          t          j        j                  k    s.J d| dt          t          j        j                               dS dS )	z
        Split nodes into partitions and codegen each partition into separate functions.
        This allows further applying different optimizations (e.g., cudagraph) to
        each function.
        r   r  cudagraph_partitionsz5Each partition must have at least one node but found Nr   zExpect z partition maps but got )r  r   r   r	  r  rf  r  r  r  ra  rW   r   r:  set_all_partition_namesre  )rh   r  r_  rq  rm  num_partitionss         ri   r  zScheduler._codegen_partitions:  s     "&!5!5!7!7
Jz??QZ !7888C
OOK888,,ZDD 		J 		J(+J
(C(C J J$	99~~***\CPYNN\\ +** + JMM),,,,33IyIIIIJ		J 		J 		J 		J 		J 		J 		J 		J 		J 		J 		J 		J 		J 		J 		J d;<<	44^DDD A7)555!S)?%@%@@@@_.__#agF\B]B]__ A@@ @@s   )A7C--C14C1c                (   t           j        rdd l}t          j                    }t                      }t          |          D ]k}|j        dk    r|j        |j	        j
        j        k    r nC|j        |j        f}||vsJ d|j         d|j         d            |                    |           l| j        | _        | j        J | j        r4t           j        j        r#t&          j        j                                         |D ]!}t.                              t2          j                  r	 t.                              d|                                |                                           n=# t<          $ r0 t.                              d|                                           Y nw xY w|                     |           |                                 x}r|| j        k    s(|!                                s|"                                r| #                                 || j        k    r| j        r<tI          | j        j%                  r#t&          j        j        &                                 || _        tI          |j%                  r:|j'        
J d            t&          j        j        (                    |j'                   || _)        | j*        +                    |j,                   |"                                rd|-                    t]          |/                                                    \  }	}
}| 0                    |          1                    |
||	           n|!                                r1te          j3        th          |          }| 5                    |           nG|6                                rte          j3        tn          |          }| 0                    |          }d	d
l8m9} d	dl:m;} ty          |||f          r|}nt{          dtK          |                     |>                    |           nty          |t~                    r)| 0                    |          @                    |           npty          |t          t          f          r)| 0                    |          C                    |           n+ty          |t                    sJ |E                                 t           j        jF        r'| 0                    |          G                                 | jH        +                    |I                                           | jJ        +                    |K                                           ty          |t                    s\|                                 }|F|j%        dk    r;| 0                    |          L                                r| #                                 t          d |/                                D                       r	|| _        d | _        #| j        | j        k    rE| j        J tI          | j        j%                  r#t&          j        j        &                                 d | _        | #                                 d S )Nr   _compile_innerzDuplicate stack frame :zs; did you add a decorator to one of the functions in this stack trace?  If so, try using a context manager instead.z5Generating code for node %s with estimated runtime %fz6Generating code for node %s with estimated runtime 0.0r  r   )CUDACombinedSchedulingro  ztype(self)=r^  c              3  @   K   | ]}t          |t                    V  d S ru   )r   r   r  s     ri   r   z%Scheduler._codegen.<locals>.<genexpr>  s,      JJA:a//JJJJJJrk   )Nr&   "check_stack_no_cycles_TESTING_ONLYtorch._dynamo.convert_frame	tracebackextract_stackr   r   r   filename_dynamoconvert_frame__file__linenor  rg  r  rd  r   autotune_at_compile_timerW   r   r:  write_get_raw_stream_headerr  r  r  r  r&  r  r  r  r>  r   r%  r#  r%  rH   r   r  r   r  re  r  r  r`  r  r  r   rc  codegen_templaterR  rS  r  r+  r'  r`   codegen.cuda_combined_schedulingr	  rs  rp  r   r  codegen_combo_kernelrA  codegen_mix_order_reductionr   r   codegen_noderG  r  debug_sync_kernelcodegen_syncr  r  rD  r   ready_to_flushr   )rh   r  r  stackr+  framer.  r   r  r  r  r  backend_r	  rp  r[  s                   ri   r  zScheduler._codegenZ  s{   4 	....+--E7A||D!%   J"222%-*E*NNNE~u|4$JU^ J Jel J J J '
 "9!))) & 	?6=+Q 	?G <<>>> Y	* Y	*D.. 
IIO2244   
 !   IIP     t$$$***v Vd111~~'' 2'')) 2 JJLLLT000* I/@+00 0 I ,FFHHH*0D'(55 V%|779V777,GGUUU $D%,,T_===!!  484W4W))**5 51-   ((99!8X    !!  {#<dCC((....""  {#=tDD++F33TTTTTT888888h9O(PQQ ;&GG()9DJJ)9)9:::,,T2222D"9::    ((DDTJJJJD#5}"EFF    ((55d;;;;!$(>????? }. 8  ((55777'..t/D/D/F/FGGG%,,T-E-E-G-GHHHd$:;; !**&v--((00??AA . JJLLLJJ9I9IJJJJJ *%)""%)""$"=== &222 !4!9:: A $>>@@@!

s   'A E((7F"!F"(tuple[float, float, list[Optional[str]]]c                    |d                                          }| t          j        _        || _        |J |                     |          }|                    ||          S )rN  r   )r   rW   r   r  r  rc  benchmark_combo_kernel)rh   r  node_benchmark_resultsr  r[  s        ri   r.	  z Scheduler.benchmark_combo_kernel  sa     1((** $!!!""6**--i9OPPPrk   c                   |}|d                                          t          fd|D                       s
J d            t          j        sdS ddlm} dg }}i }t          |          D ]\  }}|                                }	|                     |	          rt          
                    d           	 |                     |	          \  }
}|
|f||<   t          j        |
          rt          
                    d|            d	S n@# |$ r8}d
t          |          v r!t          
                    d           Y d}~ dS  d}~ww xY w||
z  }|                    |           	 |                     ||          \  }}}n?# |$ r7}d
t          |          v r t          
                    d           Y d}~dS  d}~ww xY w||z
  dk     p|dk     }t                              t"          j                  rc||k    s|r.t          
                    dt'          ||z  d                     n-t          
                    dt)          ||z  d                     ||z
  |k     p|S )r  r   c              3  H   K   | ]}|                                 k    V  d S ru   r	  )r   r   r  s     ri   r   z4Scheduler.speedup_by_combo_kernel.<locals>.<genexpr>  s2      KK44??$$.KKKKKKrk   z<All nodes in a combo kernel group must be on the same deviceTr  g        z<ComboKernel: benchmarking may not accurate due to atomic_addz;ComboKernel benchmark: register spilling of %d-th subkernelFr  zCComboKernel benchmark: return True because of loop-carried variableNg333333?z/can fuse (benchmark): fusing causes %sx speedupr  z3cannot fuse (benchmark): fusing causes %sx slowdown)r   r   r&   r.	  r  r  r  r   r  r%  r&  rO  r  r  r   r   r  r  r  rB   rC   )rh   r  subkernel_nodesr  r  
path1_listr/	  r  r  r  r  r  r   r  	ms2_clone_path2_listsmall_kernelr  s                    @ri   r%  z!Scheduler.speedup_by_combo_kernel  s      #..00KKKK?KKKKK 	
 	
J	
 	
K , 	4;;;;;;rZ!#!/22 	$ 	$HAu))I ##I..   R  55i@@D13T
&u-:b>> !$$U   !55! $   *c!ff44$$]    444444 2ICd####	*.*E*E!7+ +'CKK   	 	 	&#a&&00  Y   ttttt	 Y,9c	""7=11 
	SyyLy  E#)1122   
   Ic	//00  
 Y$44s=   5ADE+D?>D??E#E> >F:+F54F55F:r  	ir.Layoutc                `    | j         |         }|j        J |j                                        S ru   )r@  r   
get_layout)rh   r  r   s      ri   get_buffer_layoutzScheduler.get_buffer_layout<  s1    x(x###x""$$$rk   c                   | j         D ]}|                                r|j        j        D ]}t          j        j                            |j                  }|rut          |          dk    rbt          |j        t          t          f          sA|                                g k    r)t          j        j                            |j                   d S r  )r  rR   r   r   rW   r   r  r  r   r9   r   r&  r=   r<   r  zero_dim_cpu_tensor_listr  )rh   r   rL  rP  s       ri   rf  z$Scheduler.update_zero_dim_cpu_tensorA  s    J 	H 	HD{{}} H ,2 
H 
HDW377	BBFH+F33u<< *"MJ8I+J! ! = #OO--338<<TYGGG	H 	Hrk   )r  r8  r   r2  )r   r  rY  )r  rR  r   r2  rV  )r  r   r   r2  )r   rv  r   r\   )r  r$  r   rl   r  )r  r\   r   r  )r   r  r  r  r   rL  ru   r  r  rR  rl   rS  rT  r   r   )rX  r   r  r/  r   rL  )r[  r  r   rl   )
r  r  r[  r  r  r   r   r   r   r2  )r  r  r   rl   )r  r  rS  rT  r   r  )r~   r\   r   r\   r   ra   )r   r\   r   r\   )r~   r\   r   r\   r{  r  r   r\   )r~   r\   r   r\   r  rq   r{  r  )r  r  r{  r  r   r2  )
r  r  r  r  r  r  r{  r  rF  rl   )r{  r  r  r  )r  r  r  r  )r  r  rF  rl   r   r  )rI  rT  r   r2  r/  )r  r  rF  rl   r   r  r  )r~   r\   r   r\   rX  r   r   rl   )r~   r\   r   r\   rr  rs  r   r   r  r  )r  r\   r  r\   rq  r  r   rl   )r~   r\   r   r\   r   r  )FT)
r~   r\   r   r\   r  rl   rH  rl   r   rl   )r  r4   r~   r\   r   r\   r   rl   )rL  r1   r  r2   r   rl   r  )r   r1   rQ  rl   r   r   )TFT)r~   r\   r   r\   rQ  rl   r	  rl   rH  rl   r   r
  )r  r  r   r  )r  r   r   r   )r&  r  r   r2  )r  r/  r   r,  )r  rR  r   r,  r  )r   r   rc  r_  r   rl   )r   r\   r   r$  )r   r  )r   r[  )r_  r`  r   r2  )rq  r]   rg  r[  r   r  )r  r  r  r  r   r`  )rm  r:   r   r:   )r   r  )r  r  r_  r`  r   r2  )rq  r]   rm  r:   r   r2  )r  r  r_  r`  r   r  r  r  r   r,	  )r  r  r   rl   )r  r   r   r7	  )arv   rw   rx   r  ru  r:  rh  propertyr  setterr  r  r>  rq  rZ   r  rm  r&  rl  r5  r  rn  r  r  r  rO  rV  rZ  re  rw  ry  r  r  r  r6  r  r  r  r  r  r  rG  r~  r  r   r  rW  rm  rq  r  r  r  r0  r  r  r   r  r=  r  r  rT  r2  r3  r  r#  r%  r+  r6  rc  r>  rb  rI  rF   rV  rL  r^  rp  r  r  r  r  r  r  r  r  r  rA  r  r	  r  r  r  r.	  r%  r:	  rf  rP  rQ  s   @ri   r  r  :  s8        
   U9 U9 U9 U9 U9 U9n	# 	# 	# 	# & & & X& ( ( ( (7 7 7 7# # # #, , , ," " " "HMP MP MP MP^K K K KZ+# +# +# +#Z          6S S S S*   4# # # #&$ $ $ $6! ! ! !F	 	 	 	8 8 8 8, (,	    &
> 
> 
> 
>$ $ $ $LVD VD VD VDp'8 '8 '8 '8R
 
 
 
 RV     u u u un> > > >   "   "D2 D2 D2 D2LO? O? O? O?bS S S S00 0 0 0(E E E EN.. .. .. .. ..`? ? ? ?6  6  6  6 p, , , ,\7 7 7 7r.2 .2 .2 .2`$ $ $ $6< < < <|M M M M^Y
 Y
 Y
 Y
v
 
 
 
9 9 9 9v` ` ` `L "*.uM uM uM uM uMn3 3 3 3j-) -) -) -)f% % % %N; ; ; ; ; !.3*.3
 3
 3
 3
 3
j6 6 6 6@4 4 4 4	8 	8 	8 	8* * * *4   
   ' ' ' '*% % % %5 5 5 5$
 
 
 
= = = =~ ! ! ! ]!F   "    ' ' ' 'RBH BH BH BHHK  K  K  K Z"
 "
 "
 "
H? ? ? ?B   >% % % %@ &  &  &  &D"; "; "; ";H> > > >0   )
 )
 )
 )
V   6-A -A -A -A^   @B B B BHQ Q Q QN5 N5 N5 N5`% % % %
H H H H H H H Hrk   c                       e Zd Zd: fdZd;dZd<d
Zd=dZd=dZd=dZd>dZ	d?dZ
d@dZ	 dAdBd$ZdCd'ZdDd)Zd;d*ZdEd+Zd;d,ZdFd.ZdGd1ZdHd3ZdId6Z	 dAdJd9Z xZS )Kr,  r  Optional[Scheduler]c                V    t                                                       || _        d S ru   )rF  ru  r  )rh   r  rI  s     ri   ru  zBaseScheduling.__init__R  s$    "rk   r   r2  c                J    | j         r| j                                          d S d S ru   )r  r#  rg   s    ri   free_buffers_in_schedulerz(BaseScheduling.free_buffers_in_schedulerV  s0    > 	*N'')))))	* 	*rk   r  r/  OrderedSet[BackendFeature]c                    t                      S )z0Return a set of .codegen.common.BackendFeature()r   r  s     ri   get_backend_featuresz#BaseScheduling.get_backend_featuresZ  s    ||rk   r~   r\   r   rl   c                    t           )zO
        Check whether node1 and node2 can be vertically fused or not.
        r4  r#  s      ri   r  z BaseScheduling.can_fuse_vertical^  
     "!rk   c                    t           )zQ
        Check whether node1 and node2 can be horizontally fused or not.
        r4  r#  s      ri   r  z"BaseScheduling.can_fuse_horizontalf  rK	  rk   c                    dS )au  
        A Multi-Output Template (referenced in #144012) is a template node
        with MultiOutputLayout, and its output buffers are instances of MultiOutput.
        In this context, we verify whether node1 represents the Multi-Output Template
        and node2 corresponds to one of its outputs. If so, we further check if
        backend supports this fusion.
        Fr{   r#  s      ri   r  z.BaseScheduling.can_fuse_multi_outputs_templaten  s	     urk   r   c                h   |                                 s|                                 rt                              ||          S t                              ||          rt          ||          S t          |t
                    r|                    |          S t                              ||          S )z 
        Fuse two nodes
        )	r'  r`  rp   r   r   rA  r   r^  r   r#  s      ri   rp   zBaseScheduling.fusez  s      	9!1!1!3!3 	9-225%@@@77uEE 	9*5%888677 	9??5)))%**5%888rk   r}  r  "tuple[tuple[sympy.Expr, ...], ...]c                    t           )z[
        Process the iteration sizes in case a transformation needs to be applied.
        r4  )rh   r}  s     ri   rd  zBaseScheduling.group_fn  rK	  rk   r  epilogue_nodesr  r  r$  c                    t           )z
        Given a template node, generate a kernel.

        This function is only available for triton now. If the third-party backend behaves as a sub-class
        of TritonScheduling, it can override it or reuse it.
        r4  )rh   r  rQ	  r  s       ri   r!	  zBaseScheduling.codegen_template  s
     "!rk   Nr  rR  rS  rT  r   c                    t           zD
        Generate a kernel given a list of pre-fused nodes.
        r4  )rh   r  rR  rS  s       ri   rV  z.BaseScheduling.generate_kernel_code_from_nodes  s
     "!rk   r   (Union[FusedSchedulerNode, SchedulerNode]c                    t           rT	  r4  r  s     ri   r%	  zBaseScheduling.codegen_node  
     "!rk   rA  c                    t           ru   r4  r  s     ri   r$	  z*BaseScheduling.codegen_mix_order_reduction  r6  rk   c                    t           )zt
        Generate synchronization code for the kernel. This method depends on the hardware characteristics.
        r4  rg   s    ri   r'	  zBaseScheduling.codegen_sync  rW	  rk   c                    dS )z
        Check whether the backend is requesting the scheduler to flush the generated kernel.
        If not supported, please return False.
        Fr{   rg   s    ri   r(	  zBaseScheduling.ready_to_flush  s	    
 urk   c                    t           )z]
        Flush the generated kernel and python wrapper code to the source code file.
        r4  rg   s    ri   r%  zBaseScheduling.flush  rW	  rk   rL  c                    t           )rN  r4  r;  s     ri   rO  z$BaseScheduling.benchmark_fused_nodes  
     "!rk   rX  r   c                    t           )z
        Benchmark a compiled module and return the execution time
        in milliseconds on randomly generated inputs.
        r4  )rh   rX  s     ri   rZ  z)BaseScheduling.benchmark_codegened_module  s
    
 "!rk   r   c                    dS )z
        Return an unsigned integer which represents the priority of this fusion pair.
        The smaller is with higher priority.
        r   r{   r#  s      ri   r  z'BaseScheduling.get_fusion_pair_priority  s	     qrk   r  r,	  c                    t           )z
        Benchmark the list of nodes to combine and return the execution time
        and memory copy time in milliseconds on randomly generated inputs.
        r4  )rh   r  r/	  s      ri   r.	  z%BaseScheduling.benchmark_combo_kernel  r]	  rk   node_scheduler  c                |    |r9ddl m}  |||          }t          j        j                            ||           d S d S )Nr   )'set_kernel_post_grad_provenance_tracing)rp  rc	  rW   r   r:  write_provenance_debug_handle)rh   ra	  r  rc	  debug_handles        ri   codegen_commentzBaseScheduling.codegen_comment  sn    
  		UUUUUUBB L G >>\    		 		rk   )r  rC	  rV  )r  r/  r   rG	  r  r>  )r}  r  r   rO	  )r  r\   rQ	  r  r  r  r   r$  ru   r>	  )r   rU	  r   r2  )r   rA  r   r2  rW  r=	  )rX  r   r   rL  r  r?	  )ra	  r  r  r$  r   r2  )rv   rw   rx   ru  rF	  rI	  r  r  r  rp   rd  r!	  rV  r%	  r$	  r'	  r(	  r%  rO  rZ  r  r.	  rf	  rP  rQ  s   @ri   r,  r,  Q  s       # # # # # #* * * *   " " " "" " " "
 
 
 
9 9 9 9" " " "" " " "$ (,		" 	" 	" 	" 	"" " " "" " " "" " " "   " " " "" " " "" " " "   " " " " &*        rk   r,  )r   r  )r  r\   r   r   )r  r\   r   r  )r  r\   r   rj  )r+  r   r   r   )r   r\   r  r  r@  rh  r   r2  )r  r  r   r2  )r  r  r  r  r   r  r   r2  )r{   )r  r  r}  r  r  ry  r   r  )r  r  r|  r  r   r2  r  rW  )r   r  r   r  )r   r\   r   r  )r~   r\   r   r\   )
__future__r   r  r	  rZ  r  r2  r  r  r  r  r  r3  r4  r	  rR  r   r   concurrent.futuresr   r   r   r	   r
   r   r   r   r   typing_extensionsr   torch.utils._ordered_setr   r)   r   collections.abcr   r   r   typesr   r   r  torch._inductor.async_compiletorch.utils._pytreer  _pytreer  torch._dynamo.utilsr   r    torch._inductor.autotune_processr   torch._inductor.codecacher   r   torch._inductor.irr   torch._inductor.metricsr   r   %torch.fx.experimental.symbolic_shapesr    torch.utils._sympy.symbolr!   r"   r#   torch.utils._tritonr$   r  r%   r&   r'   r(   r*   analyze_preserves_zero_maskr+   codegen.commonr,   r-   r.   comm_analysisr/   r0   r1   r2   r3   r4   excr5   r6   fx_utilsr7   r8   r9   r:   r;   r<   r=   r  r>   r  r?   r@   runtime.hintsrA   runtime.runtime_utilsrB   rC   r   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   virtualizedrW   	getLoggerrv   r  _logginggetArtifactLoggerr%  r  r  r  r  r]   ry   r^   r_   	dataclassra   r}   r   r  r]  r\   r  r  r  r  r  r  r(  rG  r  r  rG  r   r  r  r   rA  r`  r  r  r  r  r  r`  r_  r  r"  r'  r-  r1  r3  r6  r  r,  r{   rk   ri   <module>r	     s	   " " " " " " "                         				        , , , , , , , , 3 3 3 3 3 3 3 3 S S S S S S S S S S S S S S S S S S ' ' ' ' ' ' / / / / / /        !<<<<<<<<<<         $ $ $ $ $ $ $ $ $ $ $ $ $ 6 6 6 6 6 6 6 6 E E E E E E ? ? ? ? ? ? ? ? 7 7 7 7 7 7 M M M M M M M M > > > > > > O O O O O O O O O O * * * * * * D D D D D D D D D D D D D D D D D D D D D D M M M M M M M M M M        ; : : : : : : : : : : : 2 2 2 2 2 2 2 2 $ $ $ $ $ $                       J J J J J J J J ( ( ( ( ( ( 7 7 7 7 7 7 7 7 & & & & & &                                       (       g!!^--hAA
N44XOO  >;;$   11(LII 34 4 4 4 4WT]]Yt__ D D D D D D D D* ( ( ( ( ( ( ( (V V V V V V V Vr h8 h8 h8 h8 h8 h8 h8 h8V 4 4 4 4 4_ 4 4 4y1 y1 y1 y1 y1 y1 y1 y1x 2 2 2 2   (' ' ' ' # # # #L T"""
 
 
 
 
 
 
 #"
*           *K *K *K *KZW W W W W 1 W W W"5 5 5 5 5. 5 5 5L* L* L* L* L*% L* L* L*^
   $   ,}* }* }* }* }** }* }* }*@[G [G [G [G [G0 [G [G [G|w: w: w: w: w:!3 w: w: w:tb b b b b, b b bP #%+ + + + +\0 0 0 08
1 
1 
1 
1 
 
 
 
 
 
 
 
> +9?,, 4 4 4 4   $   &V V V VV V V VP P P P@ @ @ @T@H T@H T@H T@H T@H T@H T@H T@Hn@e e e e e e e e e erk   