
    jT                        d dl mZ d dlZd dlmZ d dlmZ ddlmZ  G d dej                  Z	 G d	 d
ej                  Z
 G d de
          Zd ZdS )    )annotationsN)nn)_get_clones   get_valid_ratioc                  p     e Zd ZdZ	 	 d d! fdZ	 	 	 	 	 	 d"d#dZ	 	 	 	 	 	 	 d$d%dZ	 	 	 	 	 	 	 d$d%dZ xZS )&TransformerEncoderLayera  Transformer encoder layer that performs self-attention followed by cross-attention.

    This layer was previously called TransformerDecoderLayer but was renamed to better reflect its role in the
    architecture. It processes input sequences through self-attention and then cross-attention with another input
    (typically image features).

    The layer supports both pre-norm and post-norm configurations, as well as positional encoding at different stages of
    the attention mechanism.
    Nd_modelintdim_feedforwarddropoutfloatpos_enc_at_attnboolpos_enc_at_cross_attn_keyspos_enc_at_cross_attn_queriespre_normself_attention	nn.Modulecross_attentionc
                &   t                                                       || _        || _        || _        |pt          j        ddd          | _        |	pt          j        ddd          | _        t          j	        ||          | _
        t          j        |          | _        t          j	        ||          | _        t          j        |          | _        t          j        |          | _        t          j        |          | _        t          j        |          | _        t          j        |          | _        t          j        |          | _        t          j                    | _        || _        || _        || _        || _        d| _        dS )a  Initialize a transformer encoder layer.

        Args:
            d_model: Model dimension/hidden size
            dim_feedforward: Dimension of the feedforward network
            dropout: Dropout probability
            pos_enc_at_attn: Whether to add positional encodings at self-attention
            pos_enc_at_cross_attn_keys: Whether to add positional encodings to keys in cross-attention
            pos_enc_at_cross_attn_queries: Whether to add positional encodings to queries in cross-attention
            pre_norm: Whether to use pre-norm (True) or post-norm (False) architecture
            self_attention: Self-attention module
            cross_attention: Cross-attention module for attending to image features
           g?   )	num_headsr   	embed_dimN)super__init__r   r   dropout_valuer   MultiheadAttention	self_attncross_attn_imageLinearlinear1Dropoutr   linear2	LayerNormnorm1norm2norm3dropout1dropout2dropout3ReLU
activationr   r   r   r   	layer_idx)selfr   r   r   r   r   r   r   r   r   	__class__s             h/home/longshao/multi-rider-rag/.venv/lib/python3.11/site-packages/ultralytics/models/sam/sam3/encoder.pyr   z TransformerEncoderLayer.__init__   sJ   2 	.$'i2+@1VYeh+i+i+i / q23HST^amp3q3q3q y/::z'**y'::\'**
\'**
\'**

7++
7++
7++')) .-J**D'    tgttorch.Tensormemorytgt_maskmemory_masktgt_key_padding_maskmemory_key_padding_maskpos	query_posreturnc	           	        | j         r||z   n|x}
}|                     |
||||d          d         }||                     |          z   }|                     |          }|                     | j        r||z   n|| j        r||z   n||||d          d         }||                     |          z   }|                     |          }| 	                    | 
                    |                     |                     |                                        }||                     |          z   }|                     |          }|S )aK  Forward pass for post-norm architecture.

        In post-norm architecture, normalization is applied after attention and feedforward operations.

        Args:
            tgt (torch.Tensor): Input tensor to be processed.
            memory (torch.Tensor): Memory tensor for cross-attention.
            tgt_mask (torch.Tensor): Mask for self-attention.
            memory_mask (torch.Tensor): Mask for cross-attention.
            tgt_key_padding_mask (torch.Tensor): Key padding mask for self-attention.
            memory_key_padding_mask (torch.Tensor): Key padding mask for cross-attention.
            pos (torch.Tensor): Positional encoding for memory.
            query_pos (torch.Tensor): Positional encoding for query.
            **kwargs (Any): Additional keyword arguments.

        Returns:
            Processed tensor
        F)value	attn_maskkey_padding_maskneed_weightsr   )querykeyr@   rA   rB   rC   )r   r!   r+   r(   r"   r   r   r,   r)   r&   r   r/   r$   r-   r*   )r1   r5   r7   r8   r9   r:   r;   r<   r=   kwargsqktgt2s                r3   forward_postz$TransformerEncoderLayer.forward_postO   sV   < $(#7@iS@A ~~qxBVej  
 

 DMM$'''jjoo $$%)%GP#	//S $ ?KV!4 % 
 
  DMM$'''jjoo ||DLLc9J9J)K)KLLMMDMM$'''jjoo
r4   Fdacc
           	        |rF|j         d         dz  dk    sJ ||j         d         dz  d         }
|d|j         d         dz           }|                     |                                          }| j        r||	z   n|x}}|                     |||||          d         }||                     |          z   }|rt          j        ||
fd          }|                     |          }|	                    |j
                                                  }|                     | j        r||	z   n|| j        r||z   n||||          d         }||                     |          z   }|                     |          }|                     |                     |                     |                     |                                        }||                     |          z   }|S )a  Forward pass for pre-norm architecture.

        In pre-norm architecture, normalization is applied before attention and feedforward operations.

        Args:
            tgt: Input tensor to be processed
            memory: Memory tensor for cross-attention
            dac: Whether to use Divide-and-Conquer attention
            tgt_mask: Mask for self-attention
            memory_mask: Mask for cross-attention
            tgt_key_padding_mask: Key padding mask for self-attention
            memory_key_padding_mask: Key padding mask for cross-attention
            pos: Positional encoding for memory
            query_pos: Positional encoding for query

        Returns:
            Processed tensor
        r      N)r@   rA   rB   dim)rD   rE   r@   rA   rB   )shaper(   
contiguousr   r!   r+   torchcatr)   todtyper"   r   r   r,   r*   r&   r   r/   r$   r-   )r1   r5   r7   rK   r8   r9   r:   r;   r<   r=   	other_tgtrI   rG   rH   s                 r3   forward_prez#TransformerEncoderLayer.forward_pre   s   <  	+9Q<!#q((((CIaLA-//0I)	!))*Czz#))++$($8By  dBA~~a$(Ui~jjklmDMM$''' 	5)S),!444Czz#4:&&1133$$&*&HR$""d $ ?KV!4 % 
 
  DMM$'''zz#||DLLd9K9K)L)LMMNNDMM$'''
r4   c
                T    | j         r| j        n| j        }
 |
|||||||||		  	        S )a  Forward pass for the transformer encoder layer.

        Args:
            tgt: Input tensor to be processed
            memory: Memory tensor (e.g., image features) for cross-attention
            dac: Whether to use Divide-and-Conquer attention (only apply self-attention to first half)
            tgt_mask: Mask for self-attention
            memory_mask: Mask for cross-attention
            tgt_key_padding_mask: Key padding mask for self-attention
            memory_key_padding_mask: Key padding mask for cross-attention
            pos: Positional encoding for memory
            query_pos: Positional encoding for query

        Returns:
            Processed tensor after self-attention, cross-attention, and feedforward network
        )rK   r8   r9   r:   r;   r<   r=   )r   rW   rJ   )r1   r5   r7   rK   r8   r9   r:   r;   r<   r=   fwd_fns              r3   forwardzTransformerEncoderLayer.forward   sO    8 &*]I!!8Iv#!5$;
 
 
 	
r4   )NN)r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )NNNNNN)r5   r6   r7   r6   r8   r6   r9   r6   r:   r6   r;   r6   r<   r6   r=   r6   r>   r6   )FNNNNNN)r5   r6   r7   r6   rK   r   r8   r6   r9   r6   r:   r6   r;   r6   r<   r6   r=   r6   r>   r6   )	__name__
__module____qualname____doc__r   rJ   rW   rZ   __classcell__r2   s   @r3   r
   r
      s         & %)%)3 3 3 3 3 3 3r "&$(-104 "&7 7 7 7 7z !%$(-104 "&7 7 7 7 7z !%$(-104 "&)
 )
 )
 )
 )
 )
 )
 )
 )
r4   r
   c                  D     e Zd ZdZ	 	 dd fdZd Z	 	 	 	 	 dddZ xZS ) TransformerEncodera  Transformer encoder that processes multi-level features.

    This encoder takes multi-level features (e.g., from a backbone network) and processes them through a stack of
    transformer encoder layers. It supports features from multiple levels (e.g., different resolutions) and can apply
    activation checkpointing for memory efficiency during training.

    Args:
        layer: The encoder layer to be stacked multiple times
        num_layers: Number of encoder layers to stack
        d_model: Model dimension/hidden size
        num_feature_levels: Number of feature levels to process
        frozen: Whether to freeze the parameters of this module
        use_act_checkpoint: Whether to use activation checkpointing during training
    Flayerr   
num_layersr   r   num_feature_levelsfrozenr   use_act_checkpointc                   t                                                       t          ||          | _        || _        || _        d| _        |dk    r,t          j        t          j
        ||                    | _        |r,|                                 D ]}|                    d           || _        t          | j                  D ]\  }}||_        dS )z#Initialize the transformer encoder.Nr   F)r   r   r   layersrd   re   level_embedr   	ParameterrR   Tensor
parametersrequires_grad_rg   	enumerater0   )
r1   rc   rd   r   re   rf   rg   pr0   r2   s
            r3   r   zTransformerEncoder.__init__   s     	!%44$"4!!!|EL9KW,U,UVVD 	(__&& ( (  ''''"4
 !*$+ 6 6 	( 	(Iu'EOO	( 	(r4   c                   t          |          | j        k    s
J d            g }g }g }g }|duo	|d         du}t          t          |||                    D ]\  }	\  }
}}|
j        \  }}}}||f}|                    |           |
                    d                              dd          }
|r|                    d          }|                    d                              dd          }| j        &|| j        |	         	                    ddd          z   }n|}|                    |           |                    |
           |r|                    |           
t          j        |d          }|rt          j        |d          nd}t          j        |d          }t          j        |t          j        |j                  }t          j        |                    d          |                    d                              d          dd         f          }|r t          j        d	 |D             d          }n4t          j        |j        d         | j        df|j        |j        
          }||||||fS )z5Prepare multi-level features for transformer encoder.z:mismatch between expected and received # of feature levelsNr   rM   r   )rU   device)r   c                ,    g | ]}t          |          S  r   ).0ms     r3   
<listcomp>zCTransformerEncoder._prepare_multilevel_features.<locals>.<listcomp>A  s     'J'J'Jq(:(:'J'J'Jr4   )rs   rU   )lenre   ro   ziprP   appendflatten	transposerj   viewrR   rS   tensorlongrs   	new_zerosprodcumsumstackonesrU   )r1   srcsmasks
pos_embedssrc_flattenmask_flattenlvl_pos_embed_flattenspatial_shapeshas_masklvlsrcmask	pos_embed_hwspatial_shapelvl_pos_embedlevel_start_indexvalid_ratioss                       r3   _prepare_multilevel_featuresz/TransformerEncoder._prepare_multilevel_features  s   4yyD33335q333 "$=q)=+4Suj5Q5Q+R+R 	* 	*'C'#tYJAq!QFM!!-000++a..**1a00C '||A!))!,,66q!<<I+ )D,<S,A,F,Fq!R,P,P P )!((777s### *##D)))iQ//5=Guyq1114 %	*? C CnEJ{Oabbb!I((..##A&&--a00"5
 
  	 ;'J'JE'J'J'JANNLL :"1%t'>B")!'  L !
 	
r4   Nr   list[torch.Tensor]src_key_padding_maskslist[torch.Tensor] | Noner<   promptr6   prompt_key_padding_maskencoder_extra_kwargsdict | Noner>   Ytuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]c                l   t          |          | j        k    s
J d            |t          |          | j        k    sJ |t          |          | j        k    sJ |                     |||          \  }}}	}
}}|}| j        D ]k}i }t	          |t
                    sJ ||d<   ||d<   |	|d<   ||d<   ||d<   | j        r| j        s
J d            ||                    |            |di |}l|	                    d	d
          ||	                    d	d
          nd|		                    d	d
          |
||fS )a  Process multi-level features through the transformer encoder.

        Args:
            src: List of multi-level features, each with shape (batch_size, channels, height, width)
            src_key_padding_masks: List of padding masks for each feature level, each with shape (batch_size, height,
                width)
            pos: List of positional embeddings for each feature level, each with shape (batch_size, channels, height,
                width)
            prompt: Optional text/prompt features to attend to, with shape (seq_len, batch_size, d_model)
            prompt_key_padding_mask: Optional padding mask for prompt, with shape (batch_size, seq_len)
            encoder_extra_kwargs: Optional additional arguments to pass to each encoder layer

        Returns:
            A tuple containing:
            - output: Processed features with shape (seq_len, batch_size, d_model)
            - key_padding_masks_flatten: Flattened padding masks
            - lvl_pos_embed_flatten: Flattened positional embeddings
            - level_start_index: Starting indices for each feature level
            - spatial_shapes: Spatial dimensions of each feature level
            - valid_ratios: Valid ratios for each feature level
        z#must be equal to num_feature_levelsNr7   r;   r=   r5   r:   z&activation ckpt not enabled in encoderr   r   ru   )
ry   re   r   ri   
isinstancer
   trainingrg   updater}   )r1   r   r   r<   r   r   r   r   key_padding_masks_flattenr   r   r   r   outputrc   layer_kwargss                   r3   rZ   zTransformerEncoder.forwardR  s   < 3xx422224Y222 ,,--1HHHHH?s88t66666 --c3H#NN	
%! [ 	+ 	+ELe%<=====%+L"6ML23(=L%"(L3LL/0} Y.XX0XXX.#/##$8999U**\**FF Q"":S:_&00A666ei!++Aq11
 	
r4   )FF)rc   r   rd   r   r   r   re   r   rf   r   rg   r   NNNNN)r   r   r   r   r<   r   r   r6   r   r6   r   r   r>   r   )r[   r\   r]   r^   r   r   rZ   r_   r`   s   @r3   rb   rb      s         * #(( ( ( ( ( ( (>4
 4
 4
r <@)-#04,0E
 E
 E
 E
 E
 E
 E
 E
 E
r4   rb   c                  D     e Zd ZdZ	 	 	 dd fdZ	 	 	 	 	 d d! fdZ xZS )"TransformerEncoderFusiona  Transformer encoder that fuses text and image features.

    This encoder extends TransformerEncoder to handle both text and image features, with the ability to add pooled text
    features to image features for better cross-modal fusion. It supports torch.compile for performance optimization.

    Args:
        layer (nn.Module): The encoder layer to be stacked multiple times.
        num_layers (int): Number of encoder layers to stack.
        d_model (int): Model dimension/hidden size.
        num_feature_levels (int): Number of feature levels to process.
        add_pooled_text_to_img_feat (bool): Whether to add pooled text features to image features.
        pool_text_with_mask (bool): Whether to use the mask when pooling text features.
        compile_mode (str | None): Mode for torch.compile, or None to disable compilation.
        **kwargs (Any): Additional arguments to pass to the parent class.
    TFNrc   r   rd   r   r   re   add_pooled_text_to_img_featr   pool_text_with_maskcompile_mode
str | Nonec                     t                      j        ||||fi | || _        | j        rt          j        ||          | _        || _        |#t          j        | j	        |d          | _	        dS dS )z:Initialize the transformer encoder with text-image fusion.NT)mode	fullgraph)
r   r   r   r   r#   text_pooling_projr   rR   compilerZ   )
r1   rc   rd   r   re   r   r   r   rF   r2   s
            r3   r   z!TransformerEncoderFusion.__init__  s     			
 	

 	
 	
 	
 ,G(+ 	A%'Yw%@%@D"#6 # =LTXYYYDLLL $#r4   r   r   r   r6   src_key_padding_maskr   src_posr   
feat_sizeslist[int] | Noner   r   c           	        |d         j         d         }|t          |          t          |          k    sJ |dgt          |          z  }t          |          D ]\  }	\  }
}||	                             |
||d                              dddd          ||	<   ||	                             |
||d                              dddd          ||	<   ||	         2||	                             |
||                              ddd          nd||	<   n#t          d |D                       s
J d            | j        r?t          ||| j                  | 	                              d	         fd
|D             }t                                          ||||                    dd          ||          \  }}}}}}|||||||dS )z@Forward pass for the transformer encoder with text-image fusion.r   r   Nrr   rM      c              3  ,   K   | ]}|j         d k    V  dS )   NrN   )rv   xs     r3   	<genexpr>z3TransformerEncoderFusion.forward.<locals>.<genexpr>  s(      //aquz//////r4   z&expected list of (bs, c, h, w) tensors).NNc                :    g | ]}|                               S ru   )add_)rv   r   pooled_texts     r3   rx   z4TransformerEncoderFusion.forward.<locals>.<listcomp>  s%    4441166+&&444r4   )r   r<   r   r   r   )r7   padding_maskr   memory_textr   r   r   )rP   ry   ro   reshapepermuteallr   pool_text_featr   r   r   rZ   r}   )r1   r   r   r   r   r   r   r   bsir   r   outr   r   r   r   r   r   r2   s                     @r3   rZ   z TransformerEncoderFusion.forward  s%    V\!_!z??c#hh....#+(,vC'8$&z22  	6AqQ1b"55==aAqIIA$QZ//1b"==EEaAqQQ
 ,A.: )+33Aq"==EEaANNN %Q'' //3/////YY1YYY/+ 	5(1H$JbccK00==oNK4444444C GGOO"6##Aq))$;!5  
 
	
%! 5.!!2,(
 
 	
r4   )TFN)rc   r   rd   r   r   r   re   r   r   r   r   r   r   r   r   )r   r   r   r6   r   r   r   r   r   r6   r   r   r   r   )r[   r\   r]   r^   r   rZ   r_   r`   s   @r3   r   r     s         , -1$)#'Z Z Z Z Z Z Z< ;?-104'+,0:
 :
 :
 :
 :
 :
 :
 :
 :
 :
 :
r4   r   c                T   |s|                      d          S |                                dk    sJ |                                                     dd          d         }t	          j        t	          j        |d          d          }| |z                      d          |z  }|S )z;Mean-pool the prompt embeddings over the valid tokens only.r   rN   rM   r   ).Ng      ?)min)meanrO   r   r   rR   clampsum)r   prompt_maskpool_with_maskis_valid	num_validr   s         r3   r   r     s      "{{q{!!! ??!!!!##%%--a33I>HEIhA666C@@@I H$))a)009<Kr4   )
__future__r   rR   r   ultralytics.nn.modules.utilsr   
model_miscr   Moduler
   rb   r   r   ru   r4   r3   <module>r      s  
 # " " " " "        4 4 4 4 4 4 ' ' ' ' ' '[
 [
 [
 [
 [
bi [
 [
 [
|j
 j
 j
 j
 j
 j
 j
 j
Ze
 e
 e
 e
 e
1 e
 e
 e
P    r4   