
    j$:                        d dl mZ d dlZd dlZd dlmZmZ d dlmZ  G d dej                  Z	 G d dej                  Z
 G d	 d
ej                  ZdS )    )annotationsN)Tensornn)MLPBlockc                  <     e Zd ZdZej        dfd fdZddZ xZS )TwoWayTransformera  A Two-Way Transformer module for simultaneous attention to image and query points.

    This class implements a specialized transformer decoder that attends to an input image using queries with supplied
    positional embeddings. It's useful for tasks like object detection, image segmentation, and point cloud processing.

    Attributes:
        depth (int): Number of layers in the transformer.
        embedding_dim (int): Channel dimension for input embeddings.
        num_heads (int): Number of heads for multihead attention.
        mlp_dim (int): Internal channel dimension for the MLP block.
        layers (nn.ModuleList): List of TwoWayAttentionBlock layers composing the transformer.
        final_attn_token_to_image (Attention): Final attention layer from queries to image.
        norm_final_attn (nn.LayerNorm): Layer normalization applied to final queries.

    Methods:
        forward: Process image and point embeddings through the transformer.

    Examples:
        >>> transformer = TwoWayTransformer(depth=6, embedding_dim=256, num_heads=8, mlp_dim=2048)
        >>> image_embedding = torch.randn(1, 256, 32, 32)
        >>> image_pe = torch.randn(1, 256, 32, 32)
        >>> point_embedding = torch.randn(1, 100, 256)
        >>> output_queries, output_image = transformer(image_embedding, image_pe, point_embedding)
        >>> print(output_queries.shape, output_image.shape)
       depthintembedding_dim	num_headsmlp_dim
activationtype[nn.Module]attention_downsample_ratereturnNonec                   t                                                       || _        || _        || _        || _        t          j                    | _        t          |          D ]3}| j        
                    t          ||||||dk                         4t          |||          | _        t          j        |          | _        dS )ab  Initialize a Two-Way Transformer for simultaneous attention to image and query points.

        Args:
            depth (int): Number of layers in the transformer.
            embedding_dim (int): Channel dimension for input embeddings.
            num_heads (int): Number of heads for multihead attention. Must divide embedding_dim.
            mlp_dim (int): Internal channel dimension for the MLP block.
            activation (type[nn.Module], optional): Activation function to use in the MLP block.
            attention_downsample_rate (int, optional): Downsampling rate for attention mechanism.
        r   )r   r   r   r   r   skip_first_layer_pedownsample_rateN)super__init__r
   r   r   r   r   
ModuleListlayersrangeappendTwoWayAttentionBlock	Attentionfinal_attn_token_to_image	LayerNormnorm_final_attn)	selfr
   r   r   r   r   r   i	__class__s	           o/home/longshao/multi-rider-rag/.venv/lib/python3.11/site-packages/ultralytics/models/sam/modules/transformer.pyr   zTwoWayTransformer.__init__(   s    & 	
*"moou 
	 
	AK$"/'#).G)*a  	 	 	 	 *3=)]v)w)w)w&!|M::    image_embeddingtorch.Tensorimage_pepoint_embedding!tuple[torch.Tensor, torch.Tensor]c                j   |                     d                              ddd          }|                     d                              ddd          }|}|}| j        D ]} |||||          \  }}||z   }||z   }|                     |||          }	||	z   }|                     |          }||fS )a  Process image and point embeddings through the Two-Way Transformer.

        Args:
            image_embedding (torch.Tensor): Image to attend to, with shape (B, embedding_dim, H, W).
            image_pe (torch.Tensor): Positional encoding to add to the image, with same shape as image_embedding.
            point_embedding (torch.Tensor): Embedding to add to query points, with shape (B, N_points, embedding_dim).

        Returns:
            queries (torch.Tensor): Processed point embeddings with shape (B, N_points, embedding_dim).
            keys (torch.Tensor): Processed image embeddings with shape (B, H*W, embedding_dim).
        r	   r      )querieskeysquery_pekey_peqkv)flattenpermuter   r    r"   )
r#   r(   r*   r+   r/   r0   layerr4   r5   attn_outs
             r&   forwardzTwoWayTransformer.forwardQ   s    $ *11!44<<Q1EE##A&&..q!Q77 " [ 	 	E!E(	  MGTT o%8O11Ad1CCH$&&w//}r'   )r
   r   r   r   r   r   r   r   r   r   r   r   r   r   )r(   r)   r*   r)   r+   r)   r   r,   	__name__
__module____qualname____doc__r   ReLUr   r;   __classcell__r%   s   @r&   r   r      sl         @ ')g)*'; '; '; '; '; '; ';R) ) ) ) ) ) ) )r'   r   c                  @     e Zd ZdZdej        ddfd fdZddZ xZS )r   aB  A two-way attention block for simultaneous attention to image and query points.

    This class implements a specialized transformer block with four main layers: self-attention on sparse inputs,
    cross-attention of sparse inputs to dense inputs, MLP block on sparse inputs, and cross-attention of dense inputs to
    sparse inputs.

    Attributes:
        self_attn (Attention): Self-attention layer for queries.
        norm1 (nn.LayerNorm): Layer normalization after self-attention.
        cross_attn_token_to_image (Attention): Cross-attention layer from queries to keys.
        norm2 (nn.LayerNorm): Layer normalization after token-to-image attention.
        mlp (MLPBlock): MLP block for transforming query embeddings.
        norm3 (nn.LayerNorm): Layer normalization after MLP block.
        norm4 (nn.LayerNorm): Layer normalization after image-to-token attention.
        cross_attn_image_to_token (Attention): Cross-attention layer from keys to queries.
        skip_first_layer_pe (bool): Whether to skip positional encoding in the first layer.

    Methods:
        forward: Apply self-attention and cross-attention to queries and keys.

    Examples:
        >>> embedding_dim, num_heads = 256, 8
        >>> block = TwoWayAttentionBlock(embedding_dim, num_heads)
        >>> queries = torch.randn(1, 100, embedding_dim)
        >>> keys = torch.randn(1, 1000, embedding_dim)
        >>> query_pe = torch.randn(1, 100, embedding_dim)
        >>> key_pe = torch.randn(1, 1000, embedding_dim)
        >>> processed_queries, processed_keys = block(queries, keys, query_pe, key_pe)
    i   r	   Fr   r   r   r   r   r   r   r   boolr   r   c                   t                                                       t          ||          | _        t	          j        |          | _        t          |||          | _        t	          j        |          | _        t          |||          | _
        t	          j        |          | _        t	          j        |          | _        t          |||          | _        || _        dS )a  Initialize a TwoWayAttentionBlock for simultaneous attention to image and query points.

        This block implements a specialized transformer layer with four main components: self-attention on sparse
        inputs, cross-attention of sparse inputs to dense inputs, MLP block on sparse inputs, and cross-attention of
        dense inputs to sparse inputs.

        Args:
            embedding_dim (int): Channel dimension of the embeddings.
            num_heads (int): Number of attention heads in the attention layers.
            mlp_dim (int, optional): Hidden dimension of the MLP block.
            activation (type[nn.Module], optional): Activation function for the MLP block.
            attention_downsample_rate (int, optional): Downsampling rate for the attention mechanism.
            skip_first_layer_pe (bool, optional): Whether to skip positional encoding in the first layer.
        r   N)r   r   r   	self_attnr   r!   norm1cross_attn_token_to_imagenorm2r   mlpnorm3norm4cross_attn_image_to_tokenr   )r#   r   r   r   r   r   r   r%   s          r&   r   zTwoWayAttentionBlock.__init__   s    . 	"=)<<\-00
)2=)]v)w)w)w&\-00
M7J??\-00
\-00
)2=)]v)w)w)w&#6   r'   r/   r)   r0   r1   r2   r,   c                   | j         r|                     |||          }n"||z   }|                     |||          }||z   }|                     |          }||z   }||z   }|                     |||          }||z   }|                     |          }|                     |          }||z   }|                     |          }||z   }||z   }|                     |||          }||z   }|                     |          }||fS )a  Apply two-way attention to process query and key embeddings in a transformer block.

        Args:
            queries (torch.Tensor): Query embeddings with shape (B, N_queries, embedding_dim).
            keys (torch.Tensor): Key embeddings with shape (B, N_keys, embedding_dim).
            query_pe (torch.Tensor): Positional encodings for queries with same shape as queries.
            key_pe (torch.Tensor): Positional encodings for keys with same shape as keys.

        Returns:
            queries (torch.Tensor): Processed query embeddings with shape (B, N_queries, embedding_dim).
            keys (torch.Tensor): Processed key embeddings with shape (B, N_keys, embedding_dim).
        r3   )	r   rG   rH   rI   rJ   rK   rL   rN   rM   )	r#   r/   r0   r1   r2   r4   r:   r5   mlp_outs	            r&   r;   zTwoWayAttentionBlock.forward   s/     # 	)nnw'WnEEGG("A~~Q'~::H(G**W%% h6M11Ad1CCH$**W%% ((7##G#**W%% h6M11Ag1FFhzz$}r'   )r   r   r   r   r   r   r   r   r   r   r   rE   r   r   )
r/   r)   r0   r)   r1   r)   r2   r)   r   r,   r<   rC   s   @r&   r   r   }   sr         D &(g)*$)$7 $7 $7 $7 $7 $7 $7L+ + + + + + + +r'   r   c                  b     e Zd ZdZ	 	 dd fdZedd            Zedd            ZddZ xZ	S )r   a  An attention layer with downscaling capability for embedding size after projection.

    This class implements a multi-head attention mechanism with the option to downsample the internal dimension of
    queries, keys, and values.

    Attributes:
        embedding_dim (int): Dimensionality of input embeddings.
        kv_in_dim (int): Dimensionality of key and value inputs.
        internal_dim (int): Internal dimension after downsampling.
        num_heads (int): Number of attention heads.
        q_proj (nn.Linear): Linear projection for queries.
        k_proj (nn.Linear): Linear projection for keys.
        v_proj (nn.Linear): Linear projection for values.
        out_proj (nn.Linear): Linear projection for output.

    Methods:
        _separate_heads: Separate input tensor into attention heads.
        _recombine_heads: Recombine separated attention heads.
        forward: Compute attention output for given query, key, and value tensors.

    Examples:
        >>> attn = Attention(embedding_dim=256, num_heads=8, downsample_rate=2)
        >>> q = torch.randn(1, 100, 256)
        >>> k = v = torch.randn(1, 50, 256)
        >>> output = attn(q, k, v)
        >>> print(output.shape)
        torch.Size([1, 100, 256])
    r.   Nr   r   r   r   	kv_in_dim
int | Noner   r   c                   t                                                       || _        ||n|| _        ||z  | _        || _        | j        |z  dk    s
J d            t          j        || j                  | _        t          j        | j        | j                  | _	        t          j        | j        | j                  | _
        t          j        | j        |          | _        dS )a6  Initialize the Attention module with specified dimensions and settings.

        Args:
            embedding_dim (int): Dimensionality of input embeddings.
            num_heads (int): Number of attention heads.
            downsample_rate (int, optional): Factor by which internal dimensions are downsampled.
            kv_in_dim (int | None, optional): Dimensionality of key and value inputs. If None, uses embedding_dim.

        Raises:
            AssertionError: If num_heads does not evenly divide the internal dim (embedding_dim / downsample_rate).
        Nr   z$num_heads must divide embedding_dim.)r   r   r   rR   internal_dimr   r   Linearq_projk_projv_projout_proj)r#   r   r   r   rR   r%   s        r&   r   zAttention.__init__  s    $ 	*&/&;)_<" 9,1113Y111it/@AAi0ABBi0ABB	$"3]CCr'   xr)   c                z    | j         \  }}}|                     |||||z            } |                     dd          S )zGSeparate the input tensor into the specified number of attention heads.r.   r	   )shapereshape	transpose)r[   r   bncs        r&   _separate_headszAttention._separate_heads,  s@     '1aIIaIqI~66{{1a   r'   r   c                z    | j         \  }}}}|                     dd          } |                     ||||z            S )z9Recombine separated attention heads into a single tensor.r.   r	   )r]   r_   r^   )r[   r`   n_headsn_tokens
c_per_heads        r&   _recombine_headszAttention._recombine_heads3  sB     ,-7(7HjKK1yyHg
&:;;;r'   r4   r5   r6   c                (   |                      |          }|                     |          }|                     |          }|                     || j                  }|                     || j                  }|                     || j                  }|j        \  }}}}||                    dddd          z  }|t          j        |          z  }t          j
        |d          }||z  }|                     |          }|                     |          S )a  Apply multi-head attention to query, key, and value tensors with optional downsampling.

        Args:
            q (torch.Tensor): Query tensor with shape (B, N_q, embedding_dim).
            k (torch.Tensor): Key tensor with shape (B, N_k, kv_in_dim).
            v (torch.Tensor): Value tensor with shape (B, N_k, kv_in_dim).

        Returns:
            (torch.Tensor): Output tensor after attention with shape (B, N_q, embedding_dim).
        r   r.      r	   )dim)rW   rX   rY   rc   r   r]   r8   mathsqrttorchsoftmaxrh   rZ   )r#   r4   r5   r6   _rg   attnouts           r&   r;   zAttention.forward:  s     KKNNKKNNKKNN   DN33  DN33  DN33  g1a199Q1a(((di
+++}Tr*** Qh##C((}}S!!!r'   )r.   N)
r   r   r   r   r   r   rR   rS   r   r   )r[   r)   r   r   r   r)   )r[   r   r   r   )r4   r)   r5   r)   r6   r)   r   r)   )
r=   r>   r?   r@   r   staticmethodrc   rh   r;   rB   rC   s   @r&   r   r      s         B  ! $D D D D D D D< ! ! ! \! < < < \<" " " " " " " "r'   r   )
__future__r   rm   ro   r   r   ultralytics.nn.modulesr   Moduler   r   r    r'   r&   <module>ry      s    # " " " " "           + + + + + +m m m m m	 m m m`p p p p p29 p p pfh" h" h" h" h"	 h" h" h" h" h"r'   