
    /j|                       d Z ddlmZ ddlZddlZddlmZ ddlmc mZ	 ddl
mZmZ ddlmZ ddlmZ ddlmZmZmZ d	Z G d
 dej                  Z G d de          Z G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  Z dS )zTransformer modules.    )annotationsN)	constant_xavier_uniform_)
TORCH_1_11   )Conv)_get_clonesinverse_sigmoid#multi_scale_deformable_attn_pytorch)
AIFIMLPDeformableTransformerDecoder!DeformableTransformerDecoderLayerLayerNorm2dMLPBlockMSDeformAttnTransformerBlockTransformerEncoderLayerTransformerLayerc                       e Zd ZdZddd ej                    dfd fdZedd d            Z	 	 	 d!d"dZ		 	 	 d!d"dZ
	 	 	 d!d"dZ xZS )#r   a  A single layer of the transformer encoder.

    This class implements a standard transformer encoder layer with multi-head attention and feedforward network,
    supporting both pre-normalization and post-normalization configurations.

    Attributes:
        ma (nn.MultiheadAttention): Multi-head attention module.
        fc1 (nn.Linear): First linear layer in the feedforward network.
        fc2 (nn.Linear): Second linear layer in the feedforward network.
        norm1 (nn.LayerNorm): Layer normalization after attention.
        norm2 (nn.LayerNorm): Layer normalization after feedforward network.
        dropout (nn.Dropout): Dropout layer for the feedforward network.
        dropout1 (nn.Dropout): Dropout layer after attention.
        dropout2 (nn.Dropout): Dropout layer after feedforward network.
        act (nn.Module): Activation function.
        normalize_before (bool): Whether to apply normalization before attention and feedforward.
                  Fc1intcm	num_headsdropoutfloatact	nn.Modulenormalize_beforeboolc                .   t                                                       ddlm} |st	          d          t          j        |||d          | _        t          j        ||          | _	        t          j        ||          | _
        t          j        |          | _        t          j        |          | _        t          j        |          | _        t          j        |          | _        t          j        |          | _        || _        || _        dS )a  Initialize the TransformerEncoderLayer with specified parameters.

        Args:
            c1 (int): Input dimension.
            cm (int): Hidden dimension in the feedforward network.
            num_heads (int): Number of attention heads.
            dropout (float): Dropout probability.
            act (nn.Module): Activation function.
            normalize_before (bool): Whether to apply normalization before attention and feedforward.
           )	TORCH_1_9z]TransformerEncoderLayer() requires torch>=1.9 to use nn.MultiheadAttention(batch_first=True).T)r   batch_firstN)super__init__utils.torch_utilsr&   ModuleNotFoundErrornnMultiheadAttentionmaLinearfc1fc2	LayerNormnorm1norm2Dropoutr   dropout1dropout2r    r"   )	selfr   r   r   r   r    r"   r&   	__class__s	           g/home/longshao/multi-rider-rag/.venv/lib/python3.11/site-packages/ultralytics/nn/modules/transformer.pyr)   z TransformerEncoderLayer.__init__3   s    & 	222222 	%o   'IwTXYYY9R$$9R$$\"%%
\"%%
z'**
7++
7++ 0    Ntensortorch.Tensorpostorch.Tensor | Nonereturnc                    || n| |z   S )z2Add position embeddings to the tensor if provided. r<   r>   s     r:   with_pos_embedz&TransformerEncoderLayer.with_pos_embed[        vv&3,6r;   srcsrc_masksrc_key_padding_maskc           	        |                      ||          x}}|                     |||||          d         }||                     |          z   }|                     |          }|                     |                     |                     |                     |                                        }||                     |          z   }| 	                    |          S )a  Perform forward pass with post-normalization.

        Args:
            src (torch.Tensor): Input tensor.
            src_mask (torch.Tensor, optional): Mask for the src sequence.
            src_key_padding_mask (torch.Tensor, optional): Mask for the src keys per batch.
            pos (torch.Tensor, optional): Positional encoding.

        Returns:
            (torch.Tensor): Output tensor after attention and feedforward.
        value	attn_maskkey_padding_maskr   )
rD   r.   r6   r3   r1   r   r    r0   r7   r4   )r8   rF   rG   rH   r>   qksrc2s           r:   forward_postz$TransformerEncoderLayer.forward_post`   s    $ ##C---Awwq!3(MawbbcdeDMM$'''jjooxxTXXdhhsmm%<%<==>>DMM$'''zz#r;   c           	        |                      |          }|                     ||          x}}|                     |||||          d         }||                     |          z   }|                     |          }|                     |                     |                     |                     |                                        }|| 	                    |          z   S )a  Perform forward pass with pre-normalization.

        Args:
            src (torch.Tensor): Input tensor.
            src_mask (torch.Tensor, optional): Mask for the src sequence.
            src_key_padding_mask (torch.Tensor, optional): Mask for the src keys per batch.
            pos (torch.Tensor, optional): Positional encoding.

        Returns:
            (torch.Tensor): Output tensor after attention and feedforward.
        rJ   r   )
r3   rD   r.   r6   r4   r1   r   r    r0   r7   )r8   rF   rG   rH   r>   rP   rN   rO   s           r:   forward_prez#TransformerEncoderLayer.forward_prez   s    $ zz###D#...Awwq!48NbwccdefDMM$'''zz#xxTXXdhhtnn%=%=>>??T]]4((((r;   c                p    | j         r|                     ||||          S |                     ||||          S )a  Forward propagate the input through the encoder module.

        Args:
            src (torch.Tensor): Input tensor.
            src_mask (torch.Tensor, optional): Mask for the src sequence.
            src_key_padding_mask (torch.Tensor, optional): Mask for the src keys per batch.
            pos (torch.Tensor, optional): Positional encoding.

        Returns:
            (torch.Tensor): Output tensor after transformer encoder layer.
        )r"   rS   rQ   )r8   rF   rG   rH   r>   s        r:   forwardzTransformerEncoderLayer.forward   sF    $   	N##C3GMMM  h0DcJJJr;   r   r   r   r   r   r   r   r   r    r!   r"   r#   Nr<   r=   r>   r?   r@   r=   NNN)
rF   r=   rG   r?   rH   r?   r>   r?   r@   r=   )__name__
__module____qualname____doc__r,   GELUr)   staticmethodrD   rQ   rS   rU   __classcell__r9   s   @r:   r   r       s         *  !&&1 &1 &1 &1 &1 &1 &1P 7 7 7 7 \7 )-48#'    : )-48#') ) ) ) ): )-48#'K K K K K K K K Kr;   r   c                  r     e Zd ZdZddd ej                    dfd fdZd fdZe	 ddd            Z	 xZ
S ) r   zAIFI transformer layer for 2D data with positional embeddings.

    This class extends TransformerEncoderLayer to work with 2D feature maps by adding 2D sine-cosine positional
    embeddings and handling the spatial dimensions appropriately.
    r   r   r   Fr   r   r   r   r   r   r    r!   r"   r#   c                T    t                                          ||||||           dS )a  Initialize the AIFI instance with specified parameters.

        Args:
            c1 (int): Input dimension.
            cm (int): Hidden dimension in the feedforward network.
            num_heads (int): Number of attention heads.
            dropout (float): Dropout probability.
            act (nn.Module): Activation function.
            normalize_before (bool): Whether to apply normalization before attention and feedforward.
        N)r(   r)   )r8   r   r   r   r   r    r"   r9   s          r:   r)   zAIFI.__init__   s.    & 	RGS:JKKKKKr;   xr=   r@   c                   |j         dd         \  }}}|                     |||          }t                                          |                    d                              ddd          |                    |j        |j                            }|                    ddd          	                    d|||g          
                                S )zForward pass for the AIFI transformer layer.

        Args:
            x (torch.Tensor): Input tensor with shape [B, C, H, W].

        Returns:
            (torch.Tensor): Output tensor with shape [B, C, H, W].
        r   N   r   )devicedtype)r>   )shape"build_2d_sincos_position_embeddingr(   rU   flattenpermutetorg   rh   view
contiguous)r8   rd   chw	pos_embedr9   s         r:   rU   zAIFI.forward   s     '!""+1a;;Aq!DD	GGOOAIIaLL00Aq99y||STS[cdcj|?k?kOllyyAq!!&&Aq!}55@@BBBr;           @rs   rr   	embed_dimtemperaturec                   |dz  dk    s
J d            t          j        | t           j                  }t          j        |t           j                  }t          rt          j        ||d          nt          j        ||          \  }}|dz  }t          j        |t           j                  |z  }d||z  z  }|                                d         |d	         z  }|                                d         |d	         z  }	t          j        t          j        |          t          j        |          t          j        |	          t          j        |	          gd
          d	         S )a~  Build 2D sine-cosine position embedding.

        Args:
            w (int): Width of the feature map.
            h (int): Height of the feature map.
            embed_dim (int): Embedding dimension.
            temperature (float): Temperature for the sine/cosine functions.

        Returns:
            (torch.Tensor): Position embedding with shape [1, h*w, embed_dim].
           r   zHEmbed dimension must be divisible by 4 for 2D sin-cos position embeddingrh   ij)indexingg      ?.NNr   )	torcharangefloat32r   meshgridrl   catsincos)
rs   rr   rw   rx   grid_wgrid_hpos_dimomegaout_wout_hs
             r:   rk   z'AIFI.build_2d_sincos_position_embedding   s<    1}!!!#m!!!au}555au}555JTxFFFFZ_ZhioqwZxZxq.WEM:::WD{E)*  +eDk9  +eDk9y%)E**EIe,<,<ei>N>NPUPYZ_P`P`acdeefjkkr;   rV   rd   r=   r@   r=   )ru   rv   )
rs   r   rr   r   rw   r   rx   r   r@   r=   )rZ   r[   r\   r]   r,   r^   r)   rU   r_   rk   r`   ra   s   @r:   r   r      s           !&L L L L L L L*C C C C C C CJl l l l \l l l l lr;   r   c                  ,     e Zd ZdZd
 fdZdd	Z xZS )r   zeTransformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance).rq   r   r   c                   t                                                       t          j        ||d          | _        t          j        ||d          | _        t          j        ||d          | _        t          j        ||          | _        t          j        ||d          | _	        t          j        ||d          | _
        dS )zInitialize a self-attention mechanism using linear transformations and multi-head attention.

        Args:
            c (int): Input and output channel dimension.
            num_heads (int): Number of attention heads.
        F)bias)rw   r   N)r(   r)   r,   r/   rN   rO   vr-   r.   r0   r1   )r8   rq   r   r9   s      r:   r)   zTransformerLayer.__init__   s     	1ae,,,1ae,,,1ae,,,'!yIII9Q...9Q...r;   rd   r=   r@   c                
   |                      |                     |          |                     |          |                     |                    d         |z   }|                     |                     |                    |z   S )zApply a transformer block to the input x and return the output.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after transformer layer.
        r   )r.   rN   rO   r   r1   r0   r8   rd   s     r:   rU   zTransformerLayer.forward  sb     GGDFF1IItvvayy$&&))44Q7!;xx$$q((r;   )rq   r   r   r   r   rZ   r[   r\   r]   r)   rU   r`   ra   s   @r:   r   r      sW        oo/ / / / / /
) 
) 
) 
) 
) 
) 
) 
)r;   r   c                  ,     e Zd ZdZd fdZddZ xZS )r   a  Vision Transformer block based on https://arxiv.org/abs/2010.11929.

    This class implements a complete transformer block with optional convolution layer for channel adjustment, learnable
    position embedding, and multiple transformer layers.

    Attributes:
        conv (Conv, optional): Convolution layer if input and output channels differ.
        linear (nn.Linear): Learnable position embedding.
        tr (nn.Sequential): Sequential container of transformer layers.
        c2 (int): Output channel dimension.
    r   r   c2r   
num_layersc                ,   t                                                       d| _        |k    rt          |          | _        t	          j                  | _        t	          j        fdt          |          D              | _	        | _
        dS )aL  Initialize a Transformer module with position embedding and specified number of heads and layers.

        Args:
            c1 (int): Input channel dimension.
            c2 (int): Output channel dimension.
            num_heads (int): Number of attention heads.
            num_layers (int): Number of transformer layers.
        Nc              3  8   K   | ]}t                    V  d S rW   )r   ).0_r   r   s     r:   	<genexpr>z,TransformerBlock.__init__.<locals>.<genexpr>-  s.      !]!]a"22y"A"A!]!]!]!]!]!]r;   )r(   r)   convr   r,   r/   linear
Sequentialrangetrr   )r8   r   r   r   r   r9   s     `` r:   r)   zTransformerBlock.__init__  s     		88RDIiB''-!]!]!]!]!]5Q[K\K\!]!]!]^r;   rd   r=   r@   c                \   | j         |                      |          }|j        \  }}}}|                    d                              ddd          }|                     ||                     |          z                                 ddd                              || j        ||          S )zForward propagate the input through the transformer block.

        Args:
            x (torch.Tensor): Input tensor with shape [b, c1, h, w].

        Returns:
            (torch.Tensor): Output tensor with shape [b, c2, h, w].
        Nrf   r   r   )r   rj   rl   rm   r   r   reshaper   )r8   rd   br   rr   rs   ps          r:   rU   zTransformerBlock.forward0  s     9 		!AW
1aIIaLL  Aq))wwq4;;q>>)**221a;;CCAtwPQSTUUUr;   )r   r   r   r   r   r   r   r   r   r   ra   s   @r:   r   r     sc        
 
     "V V V V V V V Vr;   r   c                  :     e Zd ZdZej        fd
 fdZdd	Z xZS )r   z+A single block of a multi-layer perceptron.embedding_dimr   mlp_dimc                    t                                                       t          j        ||          | _        t          j        ||          | _         |            | _        dS )a  Initialize the MLPBlock with specified embedding dimension, MLP dimension, and activation function.

        Args:
            embedding_dim (int): Input and output dimension.
            mlp_dim (int): Hidden dimension.
            act (type): Activation function class.
        N)r(   r)   r,   r/   lin1lin2r    )r8   r   r   r    r9   s       r:   r)   zMLPBlock.__init__C  sR     	ImW55	Ig}55	355r;   rd   r=   r@   c                x    |                      |                     |                     |                              S )zForward pass for the MLPBlock.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after MLP block.
        )r   r    r   r   s     r:   rU   zMLPBlock.forwardP  s,     yy$))A,,//000r;   )r   r   r   r   r   )	rZ   r[   r\   r]   r,   r^   r)   rU   r`   ra   s   @r:   r   r   @  sa        55=?W       	1 	1 	1 	1 	1 	1 	1 	1r;   r   c                  @     e Zd ZdZej        dddfd fdZddZ xZS )r   a  A simple multi-layer perceptron (also called FFN).

    This class implements a configurable MLP with multiple linear layers, activation functions, and optional sigmoid
    output activation.

    Attributes:
        num_layers (int): Number of layers in the MLP.
        layers (nn.ModuleList): List of linear layers.
        sigmoid (bool): Whether to apply sigmoid to the output.
        act (nn.Module): Activation function.
    FN	input_dimr   
hidden_dim
output_dimr   sigmoidr#   residualout_normr!   c	                   t                                                       || _        |g|dz
  z  }	t          j        d t          |g|	g |	|          D                       | _        || _         |            | _        |r||k    rt          d          || _
        t          |t          j                  s|J |pt          j                    | _        dS )a7  Initialize the MLP with specified input, hidden, output dimensions and number of layers.

        Args:
            input_dim (int): Input dimension.
            hidden_dim (int): Hidden dimension.
            output_dim (int): Output dimension.
            num_layers (int): Number of layers.
            act (type): Activation function class.
            sigmoid (bool): Whether to apply sigmoid to the output.
            residual (bool): Whether to use residual connections.
            out_norm (nn.Module, optional): Normalization layer for the output.
        r   c              3  F   K   | ]\  }}t          j        ||          V  d S rW   )r,   r/   )r   nrO   s      r:   r   zMLP.__init__.<locals>.<genexpr>  s0      #g#g1BIaOO#g#g#g#g#g#gr;   z5residual is only supported if input_dim == output_dimN)r(   r)   r   r,   
ModuleListziplayersr   r    
ValueErrorr   
isinstanceModuleIdentityr   )r8   r   r   r   r   r    r   r   r   rr   r9   s             r:   r)   zMLP.__init__i  s    . 	$LJN+m#g#gYOQROUeWXUeZdUe@f@f#g#g#ggg355 	V	Z//TUUU (BI..B(2B2BB 1BKMMr;   rd   r=   r@   c                   |}t          | j                  D ]R\  }}|| j        dz
  k     r4 t          | dt	          j                               ||                    n
 ||          }St          | dd          r||z   } t          | dt	          j                              |          }t          | dd          r|                                n|S )zForward pass for the entire MLP.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after MLP.
        r   r    r   Fr   r   )	enumerater   r   getattrr,   ReLUr   r   )r8   rd   orig_xilayers        r:   rU   zMLP.forward  s     !$+.. 	c 	cHAu=>STAT=T=T/eRWYY//a999Z_Z_`aZbZbAA4U++ 	F
A4GD*bkmm44Q77%dIu==Dqyy{{{1Dr;   )r   r   r   r   r   r   r   r   r   r#   r   r#   r   r!   r   )	rZ   r[   r\   r]   r,   r   r)   rU   r`   ra   s   @r:   r   r   \  sy        
 
$ G""2 "2 "2 "2 "2 "2 "2HE E E E E E E Er;   r   c                  .     e Zd ZdZdd fdZddZ xZS )r   ap  2D Layer Normalization module inspired by Detectron2 and ConvNeXt implementations.

    This class implements layer normalization for 2D feature maps, normalizing across the channel dimension while
    preserving spatial dimensions.

    Attributes:
        weight (nn.Parameter): Learnable scale parameter.
        bias (nn.Parameter): Learnable bias parameter.
        eps (float): Small constant for numerical stability.

    References:
        https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py
        https://github.com/facebookresearch/ConvNeXt/blob/main/models/convnext.py
    ư>num_channelsr   epsr   c                   t                                                       t          j        t	          j        |                    | _        t          j        t	          j        |                    | _        || _	        dS )zInitialize LayerNorm2d with the given parameters.

        Args:
            num_channels (int): Number of channels in the input.
            eps (float): Small constant for numerical stability.
        N)
r(   r)   r,   	Parameterr   onesweightzerosr   r   )r8   r   r   r9   s      r:   r)   zLayerNorm2d.__init__  s\     	l5:l#;#;<<L\!:!:;;	r;   rd   r=   r@   c                   |                     dd          }||z
                      d                               dd          }||z
  t          j        || j        z             z  }| j        ddddf         |z  | j        ddddf         z   S )zPerform forward pass for 2D layer normalization.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Normalized output tensor.
        r   Tkeepdimrf   N)meanpowr   sqrtr   r   r   )r8   rd   uss       r:   rU   zLayerNorm2d.forward  s     FF1dF##UKKNN400UejTX...{111dD=)A-	!!!T4-0HHHr;   )r   )r   r   r   r   r   r   ra   s   @r:   r   r     sh         
 
 
 
 
 
 
I I I I I I I Ir;   r   c                  8     e Zd ZdZdd fd
Zd Z	 dddZ xZS )r   a  Multiscale Deformable Attention Module based on Deformable-DETR and PaddleDetection implementations.

    This module implements multiscale deformable attention that can attend to features at multiple scales with learnable
    sampling locations and attention weights.

    Attributes:
        im2col_step (int): Step size for im2col operations.
        d_model (int): Model dimension.
        n_levels (int): Number of feature levels.
        n_heads (int): Number of attention heads.
        n_points (int): Number of sampling points per attention head per feature level.
        sampling_offsets (nn.Linear): Linear layer for generating sampling offsets.
        attention_weights (nn.Linear): Linear layer for generating attention weights.
        value_proj (nn.Linear): Linear layer for projecting values.
        output_proj (nn.Linear): Linear layer for projecting output.

    References:
        https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/modules/ms_deform_attn.py
    ru   rz   r   d_modelr   n_levelsn_headsn_pointsc                   t                                                       ||z  dk    rt          d| d|           ||z  }||z  |k    s
J d            d| _        || _        || _        || _        || _        t          j	        |||z  |z  dz            | _
        t          j	        |||z  |z            | _        t          j	        ||          | _        t          j	        ||          | _        |                                  dS )a>  Initialize MSDeformAttn with the given parameters.

        Args:
            d_model (int): Model dimension.
            n_levels (int): Number of feature levels.
            n_heads (int): Number of attention heads.
            n_points (int): Number of sampling points per attention head per feature level.
        r   z.d_model must be divisible by n_heads, but got z and z(`d_model` must be divisible by `n_heads`@   rf   N)r(   r)   r   im2col_stepr   r   r   r   r,   r/   sampling_offsetsattention_weights
value_projoutput_proj_reset_parameters)r8   r   r   r   r   _d_per_headr9   s         r:   r)   zMSDeformAttn.__init__  s    	W!!egee\ceefff(W$///1[///   "	'7X3E3PST3T U U!#7Gh4F4Q!R!R)GW559Wg66     r;   c                   t          | j        j        j        d           t	          j        | j        t          j                  dt          j	        z  | j        z  z  }t	          j
        |                                |                                gd          }||                                                    dd          d         z                      | j        ddd	                              d| j        | j        d          }t'          | j                  D ]}|d
d
d
d
|d
d
fxx         |dz   z  cc<    t	          j                    5  t+          j        |                    d                    | j        _        d
d
d
           n# 1 swxY w Y   t          | j        j        j        d           t          | j        j        j        d           t3          | j        j        j                   t          | j        j        j        d           t3          | j        j        j                   t          | j        j        j        d           d
S )zReset module parameters.r   r{   g       @ri   Tr   r   r   rf   N)r   r   r   datar   r   r   r   mathpistackr   r   absmaxro   repeatr   r   r   no_gradr,   r   r   r   r   r   r   )r8   thetas	grid_initr   s       r:   r   zMSDeformAttn._reset_parameters  sA   $'.3S999dl%-@@@C$'MTXT`D`aKvzz|| <bAA	,,R,>>qAAT$,1a((VAt}dmQ77 	
 t}%% 	+ 	+AaaaAqqqj!!!QU*!!!!]__ 	J 	J)+innR6H6H)I)ID!&	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J 	J$(/4c:::$(-2C888.3444$/&+S111(/4555$"',c22222s   2F

FFNqueryr=   
refer_bboxrK   value_shapeslist
value_maskr?   r@   c           	        |j         dd         \  }}|j         d         }t          d |D                       |k    sJ |                     |          }|)|                    |d         t	          d                    }|                    ||| j        | j        | j        z            }|                     |                              ||| j        | j	        | j
        d          }	|                     |                              ||| j        | j	        | j
        z            }
t          j        |
d                              ||| j        | j	        | j
                  }
|j         d         }|dk    rct          j        ||j        |j                                      d          }|	|ddddddddf         z  }|ddddddddddf         |z   }nW|d	k    r>|	| j
        z  |ddddddddddf         z  d
z  }|ddddddddddf         |z   }nt'          d| d          t)          ||||
          }|                     |          S )a  Perform forward pass for multiscale deformable attention.

        Args:
            query (torch.Tensor): Query tensor with shape [bs, query_length, C].
            refer_bbox (torch.Tensor): Reference bounding boxes with shape [bs, query_length, n_levels, 2 or 4], range
                in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area.
            value (torch.Tensor): Value tensor with shape [bs, value_length, C].
            value_shapes (list): List with shape [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})].
            value_mask (torch.Tensor, optional): Mask tensor with shape [bs, value_length], True for padding elements,
                False for non-padding elements.

        Returns:
            (torch.Tensor): Output tensor with shape [bs, Length_{query}, C].

        References:
            https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
        Nrf   r   c              3  8   K   | ]}|d          |d         z  V  dS )r   r   NrB   )r   r   s     r:   r   z'MSDeformAttn.forward.<locals>.<genexpr>-  s.      5511Q4!A$;555555r;   r~   r   ri   )rh   rg   rz   g      ?z5Last dim of reference_points must be 2 or 4, but got .)rj   sumr   masked_fillr   ro   r   r   r   r   r   r   Fsoftmaxr   	as_tensorrh   rg   flipr   r   r   )r8   r   r   rK   r   r   bslen_qlen_vr   r   
num_pointsoffset_normalizeraddsampling_locationsoutputs                   r:   rU   zMSDeformAttn.forward  s   2 KO	EA5555555>>>>&&!%%j&;U1XXFFE

2udlDLDL4PQQ0077<<RVZVceiertuvv 22599>>r5$,X\XehlhuXuvvI&7<<AA"eT\[_[hjnjwxx%b)
?? %EKX]Xd e e e j jkm n n"%6tT4DRSRSRS7S%TTC!+AAAqqq$4,B!Cc!I1__"T]2Z111dAAAtUVUWUW@W5XX[^^C!+AAAqqq$4!,C!Ds!JbU_bbbccc4ULJ\^opp'''r;   )ru   rz   r   rz   )r   r   r   r   r   r   r   r   rW   )r   r=   r   r=   rK   r=   r   r   r   r?   r@   r=   )rZ   r[   r\   r]   r)   r   rU   r`   ra   s   @r:   r   r     sy         (! ! ! ! ! ! !<3 3 36 +/0( 0( 0( 0( 0( 0( 0( 0( 0(r;   r   c                  ~     e Zd ZdZdddd ej                    ddfd$ fdZed%d            Zd&dZ		 	 	 d'd(d#Z
 xZS ))r   a6  Deformable Transformer Decoder Layer inspired by PaddleDetection and Deformable-DETR implementations.

    This class implements a single decoder layer with self-attention, cross-attention using multiscale deformable
    attention, and a feedforward network.

    Attributes:
        self_attn (nn.MultiheadAttention): Self-attention module.
        dropout1 (nn.Dropout): Dropout after self-attention.
        norm1 (nn.LayerNorm): Layer normalization after self-attention.
        cross_attn (MSDeformAttn): Cross-attention module.
        dropout2 (nn.Dropout): Dropout after cross-attention.
        norm2 (nn.LayerNorm): Layer normalization after cross-attention.
        linear1 (nn.Linear): First linear layer in the feedforward network.
        act (nn.Module): Activation function.
        dropout3 (nn.Dropout): Dropout in the feedforward network.
        linear2 (nn.Linear): Second linear layer in the feedforward network.
        dropout4 (nn.Dropout): Dropout after the feedforward network.
        norm3 (nn.LayerNorm): Layer normalization after the feedforward network.

    References:
        https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
        https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/deformable_transformer.py
    ru   r   i   r   rz   r   r   r   d_ffnr   r   r    r!   r   r   c                   t                                                       t          j        |||          | _        t          j        |          | _        t          j        |          | _        t          ||||          | _
        t          j        |          | _        t          j        |          | _        t          j        ||          | _        || _        t          j        |          | _        t          j        ||          | _        t          j        |          | _        t          j        |          | _        dS )a  Initialize the DeformableTransformerDecoderLayer with the given parameters.

        Args:
            d_model (int): Model dimension.
            n_heads (int): Number of attention heads.
            d_ffn (int): Dimension of the feedforward network.
            dropout (float): Dropout probability.
            act (nn.Module): Activation function.
            n_levels (int): Number of feature levels.
            n_points (int): Number of sampling points.
        )r   N)r(   r)   r,   r-   	self_attnr5   r6   r2   r3   r   
cross_attnr7   r4   r/   linear1r    dropout3linear2dropout4norm3)	r8   r   r   r  r   r    r   r   r9   s	           r:   r)   z*DeformableTransformerDecoderLayer.__init__^  s    * 	 .wQQQ
7++\'**
 'w'8LL
7++\'**
 y%00
7++y00
7++\'**


r;   r<   r=   r>   r?   r@   c                    || n| |z   S )z;Add positional embeddings to the input tensor, if provided.rB   rC   s     r:   rD   z0DeformableTransformerDecoderLayer.with_pos_embed  rE   r;   tgtc           	         |                      |                     |                     |                     |                                        }||                     |          z   }|                     |          S )zPerform forward pass through the Feed-Forward Network part of the layer.

        Args:
            tgt (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after FFN.
        )r  r
  r    r	  r  r  )r8   r  tgt2s      r:   forward_ffnz-DeformableTransformerDecoderLayer.forward_ffn  s_     ||DMM$((4<<3D3D*E*EFFGGDMM$'''zz#r;   Nembedr   featsshapesr   padding_maskrL   	query_posc                n   |                      ||          x}}	|                     |                    dd          |	                    dd          |                    dd          |          d                             dd          }
||                     |
          z   }|                     |          }|                     |                      ||          |                    d          |||          }
||                     |
          z   }|                     |          }| 	                    |          S )a?  Perform the forward pass through the entire decoder layer.

        Args:
            embed (torch.Tensor): Input embeddings.
            refer_bbox (torch.Tensor): Reference bounding boxes.
            feats (torch.Tensor): Feature maps.
            shapes (list): Feature shapes.
            padding_mask (torch.Tensor, optional): Padding mask.
            attn_mask (torch.Tensor, optional): Attention mask.
            query_pos (torch.Tensor, optional): Query position embeddings.

        Returns:
            (torch.Tensor): Output tensor after decoder layer.
        r   r   )rL   rf   )
rD   r  	transposer6   r3   r  	unsqueezer7   r4   r  )r8   r  r   r  r  r  rL   r  rN   rO   r  s              r:   rU   z)DeformableTransformerDecoderLayer.forward  s$   2 ##E9555AnnQ[[A..Aq0A0A5??STVWCXCXdmnnn

)Aq// 	 c***

5!! ooy11:3G3G3J3JESY[g
 
 c***

5!! &&&r;   )r   r   r   r   r  r   r   r   r    r!   r   r   r   r   rX   )r  r=   r@   r=   rY   )r  r=   r   r=   r  r=   r  r   r  r?   rL   r?   r  r?   r@   r=   )rZ   r[   r\   r]   r,   r   r)   r_   rD   r  rU   r`   ra   s   @r:   r   r   E  s         4  '+ '+ '+ '+ '+ '+ '+R 7 7 7 \7   & -1)-)-(' (' (' (' (' (' (' (' ('r;   r   c                  4     e Zd ZdZdd fd	Z	 	 dddZ xZS )r   aq  Deformable Transformer Decoder based on PaddleDetection implementation.

    This class implements a complete deformable transformer decoder with multiple decoder layers and prediction heads
    for bounding box regression and classification.

    Attributes:
        layers (nn.ModuleList): List of decoder layers.
        num_layers (int): Number of decoder layers.
        hidden_dim (int): Hidden dimension.
        eval_idx (int): Index of the layer to use during evaluation.

    References:
        https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
    ri   r   r   decoder_layerr!   r   eval_idxc                    t                                                       t          ||          | _        || _        || _        |dk    r|n||z   | _        dS )aL  Initialize the DeformableTransformerDecoder with the given parameters.

        Args:
            hidden_dim (int): Hidden dimension.
            decoder_layer (nn.Module): Decoder layer module.
            num_layers (int): Number of decoder layers.
            eval_idx (int): Index of the layer to use during evaluation.
        r   N)r(   r)   r	   r   r   r   r  )r8   r   r  r   r  r9   s        r:   r)   z%DeformableTransformerDecoder.__init__  sV     	!-<<$$$,MMzH7Lr;   Nr  r=   r   r  r  r   	bbox_head
score_headpos_mlprL   r?   r  c
                (   |}
g }g }d}|                                 }t          | j                  D ]9\  }} ||
||||	| ||                    }
 ||         |
          }t          j         |t	          |          z             }| j        rx|                     ||         |
                     |dk    r|                    |           n~|                    t          j         |t	          |          z                        nF|| j        k    r;|                     ||         |
                     |                    |            n!|}| j        r|                                n|};t          j	        |          t          j	        |          fS )a  Perform the forward pass through the entire decoder.

        Args:
            embed (torch.Tensor): Decoder embeddings.
            refer_bbox (torch.Tensor): Reference bounding boxes.
            feats (torch.Tensor): Image features.
            shapes (list): Feature shapes.
            bbox_head (nn.Module): Bounding box prediction head.
            score_head (nn.Module): Score prediction head.
            pos_mlp (nn.Module): Position MLP.
            attn_mask (torch.Tensor, optional): Attention mask.
            padding_mask (torch.Tensor, optional): Padding mask.

        Returns:
            dec_bboxes (torch.Tensor): Decoded bounding boxes.
            dec_cls (torch.Tensor): Decoded classification scores.
        Nr   )
r   r   r   r   r
   trainingappendr  detachr   )r8   r  r   r  r  r  r   r!  rL   r  r  
dec_bboxesdec_clslast_refined_bboxr   r   bboxrefined_bboxs                     r:   rU   z$DeformableTransformerDecoder.forward  s   : 
 ''))
!$+.. 	R 	RHAuU6:uflIW^W^_iWjWjkkF9Q<''D =
0K0K)KLLL} 	}z!}V4455566%%l3333%%emD?K\;];]4]&^&^____dm##}z!}V44555!!,/// ,26-Q,,...\JJ{:&&G(<(<<<r;   )ri   )r   r   r  r!   r   r   r  r   )NN)r  r=   r   r=   r  r=   r  r   r  r!   r   r!   r!  r!   rL   r?   r  r?   r   ra   s   @r:   r   r     st         M M M M M M M0 *.,06= 6= 6= 6= 6= 6= 6= 6= 6=r;   r   )!r]   
__future__r   r   r   torch.nnr,   torch.nn.functional
functionalr   torch.nn.initr   r   ultralytics.utils.torch_utilsr   r   r   utilsr	   r
   r   __all__r   r   r   r   r   r   r   r   r   r   r   rB   r;   r:   <module>r3     s     " " " " " "                  4 4 4 4 4 4 4 4 4 4 4 4 4 4       T T T T T T T T T THK HK HK HK HKbi HK HK HKVEl El El El El" El El ElP) ) ) ) )ry ) ) )>+V +V +V +V +Vry +V +V +V\1 1 1 1 1ry 1 1 18@E @E @E @E @E") @E @E @EF(I (I (I (I (I") (I (I (IVx( x( x( x( x(29 x( x( x(v|' |' |' |' |'	 |' |' |'~U= U= U= U= U=29 U= U= U= U= U=r;   