
    Wj*                        d dl Z d dlmZ d dlZd dlmZ d dlmZmZm	Z	m
Z
 d dlmZ ddZd Zdd	Zd
 Ze j        ded         fd            Ze j        ded         fd            Ze j        ded         fd            ZdS )    N)	Generator)global_decomposition_table)_rnn_helpergather_paramsgru_cell	lstm_cell)
while_loopFc                 b   |d         }|d         |r|d         nd}|r|d         ndt          |          dk    r|d         nt          |          dk    r|d         nd|d                             d          }|d                             d          }t          j        j                            | ||          |r                    d          nt          j                            d          gt          |j
        dd                   R |j        |j        d}	fd	}
fd
}t          j        dt          j                  }t          |
|||	||g          \  }}}}|r|                    d          }||                    d          |                    d          ffS )ay  
    1 layer fn for while loop LSTM

    Args:
        inp: Input tensor of shape (seq_len, batch, input_size)
        hidden: Tuple of (hx, cx) hidden states
        params: List of weight and bias tensors
        has_biases: Whether biases are included
        reverse: Whether to process sequence in reverse

    Returns:
        Tuple of (output, (final_hx, final_cx))
    r         N         dtypedevicec                 6    |                      d          k     S Nr   size)iouthxcxprecomputed_inputs       Z/home/longshao/multi-rider-rag/.venv/lib/python3.11/site-packages/torch/export/_patches.pycond_fnz*one_layer_while_loop_lstm.<locals>.cond_fn.       $))!,,,,    c           	      T   |                                  }t          j        |           t          j        |                    d          dz
             t	          |         ||d          \  }}|                                }|                    d          ||<   | dz   |||fS )Nr   r   maxr   )	chunk_dim)itemtorch_check_is_sizer   r   clonesqueeze)	idxr   r   r   r   hh_bias	hh_weight	hr_weightr   s	        r   body_fnz*one_layer_while_loop_lstm.<locals>.body_fn1   s    HHJJQQ$5$:$:1$=$=$ABBBBa "b)WiST
 
 
B iikkAAQwR##r   r   )len	unsqueezer%   nn
functionallinearflipemptyr   tupleshaper   r   tensorint64r	   r(   )inphiddenparams
has_biasesreverse	ih_weightih_biasr   r   step_outputr   r-   cnt_r   final_hxfinal_cxr*   r+   r,   r   s                    @@@@r   one_layer_while_loop_lstmrF   
   s    q	Iq	I%/fQii4G%/fQii4G[[A%%q		Fq8H8H6!99d  
		Q		B			Q		B+223	7KK5<S)..q111BS +q!!	rx|		  hy	  K- - - - -$ $ $ $ $ $ $ $ ,q
,
,
,C!+3R4" "AsHh  hhqkk !!!$$h&6&6q&9&9:::r   c	                    t          |          dk    rt          d          t          |||d                             d          |d                             d          k              }t	          t          |d         |d                             }	t          }
t          | |	||||||||

  
        \  }}t	          t          |           }|t          j	        |d         d          t          j	        |d         d          fS )a  
    LSTM implementation using while_loop for export compatibility.

    This is a drop-in replacement for the default LSTM decomposition that uses
    while_loop instead of Python loops, making it more suitable for torch.export.

    Args:
        input: Input tensor
        hx: Tuple of (h0, c0) hidden states
        params: List of weight and bias tensors
        has_biases: Whether biases are included
        num_layers: Number of LSTM layers
        dropout: Dropout probability
        train: Training mode
        bidirectional: Whether to use bidirectional LSTM
        batch_first: Whether batch dimension is first

    Returns:
        Tuple of (output, h_n, c_n)
    r   zlstm expects two hidden statesr   r   )
r/   AssertionErrorr   r   listziprF   r   r%   stackinputr   r<   r=   
num_layersdropouttrainbidirectionalbatch_firstr;   layer_fnr   final_hiddenss                r   lstm_while_loop_implrU   I   s    > 2ww!||=>>>6:r!uzz!}}1

1/MNNF#beRU##$$F(H$ C m,--MM!,a00%+mA>NPQ2R2RRRr   c                    |d         |d         |r|d         nd|r|d         ndt           j        j                            |           |r                    d          n|                    d          }t          j                            d          gt          |j	        dd                   R |j
        |j        d}fd}fd}t          j        dt           j        	          }	t          |||	||g          \  }
}}|r|                    d          }||                    d          fS )
ad  
    1 layer fn for while loop GRU

    Args:
        inp: Input tensor of shape (seq_len, batch, input_size)
        hidden: Hidden state tensor
        params: List of weight and bias tensors
        has_biases: Whether biases are included
        reverse: Whether to process sequence in reverse

    Returns:
        Tuple of (output, final_hidden)
    r   r   r   Nr   r   c                 6    |                      d          k     S r   r   )r   r   
cur_hiddenr   s      r   r   z)one_layer_while_loop_gru.<locals>.cond_fn   r   r   c                 H   |                                  }t          j        |           t          j        |                    d          dz
             t	          |         |          }|                                }|                    d          ||<   | dz   ||fS )Nr   r   r!   )r$   r%   r&   r   r   r'   r(   )	r)   r   rX   r   r*   r+   r@   r?   r   s	       r   r-   z)one_layer_while_loop_gru.<locals>.body_fn   s    HHJJQQ$5$:$:1$=$=$ABBBBa *i)W
 

 iikk##A&&AQwZ''r   r.   )r%   r1   r2   r3   r4   r0   r5   r   r6   r7   r   r   r8   r9   r	   r(   )r:   r;   r<   r=   r>   rX   rA   r   r-   rB   rC   r   final_hiddenr*   r+   r@   r?   r   s                @@@@@r   one_layer_while_loop_grur[   }   s    q	Iq	I%/fQii4G%/fQii4G+223	7KK5<S)..q111BS!!!$$J +q!!	z#	$	$   	  K- - - - -
( 
( 
( 
( 
( 
( 
( 
( 
( ,q
,
,
,C%gwk:8VWWAsL hhqkk$$Q''''r   c	                     t          ||d          }t          |                    d                    }	t          }
t	          | |	||||||||

  
        \  }}|t          j        |d          fS )a  
    GRU implementation using while_loop for export compatibility.

    This is a drop-in replacement for the default GRU decomposition that uses
    while_loop instead of Python loops, making it more suitable for torch.export.

    Args:
        input: Input tensor
        hx: Hidden state tensor
        params: List of weight and bias tensors
        has_biases: Whether biases are included
        num_layers: Number of GRU layers
        dropout: Dropout probability
        train: Training mode
        bidirectional: Whether to use bidirectional GRU
        batch_first: Whether batch dimension is first

    Returns:
        Tuple of (output, h_n)
    Fr   )r   rI   unbindr[   r   r%   rK   rL   s                r   gru_while_loop_implr^      s|    > 6:u55F"))A,,F'H$ C M1----r   return)NNNc              #     K   t           d         }|                    | d          }| j                            t          j        j        j        d          }	 ||| <   || j        t          j        j        j        <   dV  |||| <   n|                    | d           | || j        t          j        j        j        <   dS | j                            t          j        j        j        d           dS # |||| <   n|                    | d           ||| j        t          j        j        j        <   w | j                            t          j        j        j        d           w xY w)a  
    Generic context manager for registering while_loop-based RNN decompositions.

    Args:
        rnn_op: The aten operation to patch (e.g., torch.ops.aten.lstm.input)
        rnn_impl: The while_loop-based implementation function

    Note:
        This is an internal helper. Use register_lstm_while_loop_decomposition()
        or register_gru_while_loop_decomposition() instead.
    post_autogradN)r   get
py_kernelsr%   _CDispatchKeyCompositeImplicitAutogradpop)rnn_oprnn_implregistryoriginal_decomporiginal_py_kernels        r   &_register_rnn_while_loop_decompositionrm      sq      */:H ll6400O  *..6 X#LT%(.HI &.HV LL&&& )" eh2LMMM
 !!%("6"PRVWWWWW &.HV LL&&& )" eh2LMM
 !!%("6"PRVWWWWs   'C. .A0Ec               #      K   t          t          j        j        j        j        t                    5  dV  ddd           dS # 1 swxY w Y   dS )a  
    Context manager that temporarily registers the while_loop-based LSTM decomposition.

    The while_loop-based decomposition is more suitable for export and graph-based
    execution, as it avoids Python control flow that cannot be captured in the graph.
    This should support dynamic sequence lengths, however as while_loop does not
    support Autograd yet, an ExportedProgram created with this will not be trainable.

    Usage::

        from torch.export._patches import register_lstm_while_loop_decomposition
        from torch.export import export

        with register_lstm_while_loop_decomposition():
            # Export your model with LSTM
            ep = export(model, (x, h0, c0))

    Note:
        This context manager temporarily modifies the global decomposition table
        and py_kernels registration. The original registrations are restored when
        exiting the context.
    N)rm   r%   opsatenlstmrM   rU    r   r   &register_lstm_while_loop_decompositionrs     s      0 
0	!#7
 
   	                    AA
Ac               #      K   t          t          j        j        j        j        t                    5  dV  ddd           dS # 1 swxY w Y   dS )a  
    Context manager that temporarily registers the while_loop-based GRU decomposition.

    The while_loop-based decomposition is more suitable for export and graph-based
    execution, as it avoids Python control flow that cannot be captured in the graph.
    This should support dynamic sequence lengths, however as while_loop does not
    support Autograd yet, an ExportedProgram created with this will not be trainable.

    Usage::

        from torch.export._patches import register_gru_while_loop_decomposition
        from torch.export import export

        with register_gru_while_loop_decomposition():
            # Export your model with GRU
            ep = export(model, (x, h0))

    Note:
        This context manager temporarily modifies the global decomposition table
        and py_kernels registration. The original registrations are restored when
        exiting the context.
    N)rm   r%   ro   rp   grurM   r^   rr   r   r   %register_gru_while_loop_decompositionrw   2  s      0 
0	 "5
 
   	                 rt   )F)
contextlibcollections.abcr   r%   torch._decompr   torch._decomp.decompositionsr   r   r   r   "torch._higher_order_ops.while_loopr	   rF   rU   r[   r^   contextmanagerrm   rs   rw   rr   r   r   <module>r~      se       % % % % % %  4 4 4 4 4 4 X X X X X X X X X X X X 9 9 9 9 9 9<; <; <; <;~1S 1S 1Sh3( 3( 3( 3(l.. .. ..b ,X ,X ,X ,X ,X^ 	:J0K    : y9I/J      r   