
    j6V                    0   d Z ddlmZ ddlZddlmZ ddlmZ ddlZddl	m
Z
 ddlm
c mZ ddlmc mZ ddlmZ ddlmZ ddlmZmZmZmZmZmZ dd	lmZ d
dlmZ  G d de
j                  Z  G d de
j                  Z! G d de
j                  Z"dS )a@  
ViTDet backbone adapted from Detectron2.
This module implements Vision Transformer (ViT) backbone for object detection.

Rope embedding code adopted from:
1. https://github.com/meta-llama/codellama/blob/main/llama/model.py
2. https://github.com/naver-ai/rope-vit
3. https://github.com/lucidrains/rotary-embedding-torch
    )annotationsN)partial)Callable)Tensor)
PatchEmbed)apply_rotary_enccompute_axial_cisconcat_rel_posget_abs_poswindow_partitionwindow_unpartition)check_requirements   )
LayerScalec                  ^     e Zd ZdZ	 	 	 	 	 	 	 	 	 	 d d! fdZd"d#dZd$d%dZd&dZd'dZ xZ	S )(	AttentionzIMulti-head Attention block with relative position embeddings and 2d-rope.   TFN     @dimint	num_headsqkv_biasbooluse_rel_posrel_pos_zero_init
input_sizetuple[int, int] | None	cls_tokenuse_rope
rope_thetafloatrope_pt_sizerope_interpc                   t                                                       || _        ||z  | _        | j        dz  | _        || _        t          j        ||dz  |          | _        t          j        ||          | _	        || _
        || _        || _        |	| _        |
| _        || _        |                     ||           |                     |           dS )a	  
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            input_size (tuple[int, int] or None): Input resolution for calculating the relative positional parameter
                size or rope size.
            cls_token (bool): Whether a cls_token is present.
            use_rope (bool): Whether to use rope 2d (independent of use_rel_pos, as it can be used together).
            rope_theta (float): Control frequencies of rope.
            rope_pt_size (tuple[int, int] or None): Size of rope in previous stage of training, needed for interpolation
                or tiling.
            rope_interp (bool): Whether to interpolate (or extrapolate) rope to match input size.
                 )biasN)super__init__r   head_dimscaler   nnLinearqkvprojr   r   r   r    r"   r#   _setup_rel_pos_setup_rope_freqs)selfr   r   r   r   r   r   r   r   r    r"   r#   	__class__s               g/home/longshao/multi-rider-rag/.venv/lib/python3.11/site-packages/ultralytics/models/sam/sam3/vitdet.pyr)   zAttention.__init__,   s    < 	"y(]D(
"9S#'999Ic3''	 '$ $(& 	-z:::z*****    returnNonec                   | j         sd| _        d| _        dS |J | j        du s
J d            t	          j        t          j        d|d         z  dz
  | j                            | _        t	          j        t          j        d|d         z  dz
  | j                            | _        |sLt          j	        
                    | j        d           t          j	        
                    | j        d           |\  }}t          j        |          dddf         }t          j        |          dddf         }||z
  |dz
  z   }|                                | _        dS )	z%Setup relative positional embeddings.NFznot supported   r   r   {Gz?std)r   	rel_pos_h	rel_pos_wr   r,   	Parametertorchzerosr*   inittrunc_normal_arangelongrelative_coords)r2   r   r   HWq_coordsk_coordsrF   s           r4   r0   zAttention._setup_rel_pos`   sO    	!DN!DNF%%%~&&&&&&ek!jm2Ca2G&W&WXXek!jm2Ca2G&W&WXX  	<G!!$.d!;;;G!!$.d!;;; 1<??111d7+<??47+#h.1q59.3355r5   c                &   | j         s	d| _        dS |J | j        || _        t          t          | j        | j                  | _        d}| j        r| j        d         |d         z  }|                     |d         |d         |          }| j	        rwt          j        | j        dz  t          j        |j                  }t          j        t          j        |          |          dddf         }t          j        ||gd	          }|| _        dS )
zSetup 2d-rope frequencies.N)r   theta      ?r   r   )end_xend_y	scale_posr9   )dtypedevicer   )r   	freqs_cisr"   r   r	   r*   r    compute_cisr#   r   r@   rA   float32rR   polar	ones_likecat)r2   r   rP   rT   tcls_freqs_ciss         r4   r1   zAttention._setup_rope_freqsx   s4   } 	!DNF%%%$ *D #/
 
 
 	 	=)!,z!}<I$$Q-Q- % 
 
	
 > 	E"m '  A
 "K(:(:A>>tQQQwGM	=)"<!DDDI"r5   tuple[Tensor, Tensor]c                    | j         s||fS | j        J t          ||| j                            |j                            S )zApply 2d-rope to q and k.N)rT   )r   rT   r   torR   )r2   qks      r4   _apply_ropezAttention._apply_rope   sJ    } 	a4K~)))10A0A!(0K0KLLLLr5   xr   c           
        | j         rdnd}|j        dk    r|j        \  }}}}|dk    sJ ||z  }d}n3|j        dk    sJ |j        \  }}}d}t          j        ||z
            x}}|                     |                              ||d| j        d          }	|	                    ddddd          	                    d          \  }
}}| 
                    |
|          \  }
}| j        rt          |
                    dd          |                    dd          ||f|j        dd         | j        | j        d| j                  \  }
}|
                    || j        ||z  d          }
|                    || j        ||z  d          }t#          j        |
||          }|dk    rL|                    || j        ||d                              ddddd                              |||d          }nH|                    || j        |d                              dddd                              ||d          }|                     |          }|S )	z Forward pass of attention block.r   r      r&   r9   T)rescalerF   )r   ndimshapemathsqrtr.   reshaper   permuteunbindra   r   r
   flattenr=   r>   rF   Fscaled_dot_product_attentionviewr/   )r2   rb   sBrG   rH   _Lrg   r.   r_   r`   vs                r4   forwardzAttention.forward   sT   &AAQ6Q;;JAq!Q6666AADD6Q;;;;gGAq!DIa!e$$$A hhqkk!!!Q4>2>>++aAq!,,33A661a 1%%1 	8!		!Q		!QA! $ 4	 	 	DAq 		!T^QUB77A		!T^QUB77A*1a33199q$.!Q33;;Aq!QJJRRSTVWYZ\^__AAq$.!R0088Aq!DDLLQPQSUVVAIIaLLr5   )
r   TFTNFFr   NF)r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r   r#   r   )TN)r   r   r   r   r6   r7   )N)r   r   r6   r7   )r6   r\   rb   r   r6   r   )
__name__
__module____qualname____doc__r)   r0   r1   ra   rw   __classcell__r3   s   @r4   r   r   )   s        SS
 !"&-1#/3!2+ 2+ 2+ 2+ 2+ 2+ 2+h6 6 6 6 60%# %# %# %# %#NM M M M. . . . . . . .r5   r   c                  `     e Zd ZdZdddej        ej        ddddddddddfd$ fdZd%d#Z xZ	S )&Blockz4Transformer blocks with support of window attention.      @T        Fr   Nr   r   r   	mlp_ratior!   r   r   	drop_path
norm_layerCallable[..., nn.Module]	act_layerr   r   window_sizer   r   r   r"   r#   r   dropoutinit_valuesfloat | Nonec                   t                                                       t          d           ddlm}m}  ||          | _        t          |||||	|
dk    r|n|
|
f||||
  
        | _        |rt          ||          nt          j                    | _        |dk    r ||          nt          j                    | _         ||          | _         ||t          ||z            ||df          | _        |rt          ||          nt          j                    | _        t          j        |          | _        |
| _        dS )	a  
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads in each ViT block.
            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            drop_path (float): Stochastic depth rate.
            norm_layer (Callable): Normalization layer constructor.
            act_layer (Callable): Activation layer constructor.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            window_size (int): Window size for window attention blocks. If it equals 0, then not use window attention.
            input_size (tuple[int, int] | None): Input resolution for calculating the relative positional parameter
                size.
            use_rope (bool): Whether to use rope 2d (independent of use_rel_pos, as it can be used together).
            rope_pt_size (tuple[int, int] | None): Size of rope in previous stage of training, needed for interpolation
                or tiling.
            rope_interp (bool): Whether to interpolate (or extrapolate) rope to match target input size, expected to
                specify source size as rope_pt_size.
            cls_token (bool): Whether a cls_token is present.
            dropout (float): Dropout rate.
            init_values (float | None): Layer scale init, None for no layer scale.
        timmr   )DropPathMlp)	r   r   r   r   r   r   r"   r#   r   )r   r   )in_featureshidden_featuresr   dropN)r(   r)   r   timm.layersr   r   norm1r   attnr   r,   Identityls1r   norm2r   mlpls2Dropoutr   r   )r2   r   r   r   r   r   r   r   r   r   r   r   r   r"   r#   r   r   r   r   r   r3   s                       r4   r)   zBlock.__init__   sv   V 	6"""--------Z__
#/%0A%5%5zzK;U%#
 
 
	 @K]:c{;;;;PRP[P]P]09C),,,R[]]Z__
3i003	
 
 
 @K]:c{;;;;PRP[P]P]z'**&r5   rb   r   r6   c                d   |}|                      |          }| j        dk    r2|j        d         |j        d         }}t          || j                  \  }}|                     |                     |                    }| j        dk    rt          || j        |||f          }||                     |                     |                    z   }||                     |                     | 	                    | 
                    |                     |                                                  z   }|S )z&Forward pass of the transformer block.r   r   r9   )r   r   rh   r   r   r   r   r   r   r   r   r   )r2   rb   shortcutrG   rH   pad_hws         r4   rw   zBlock.forward&  s    JJqMMa71:qwqzqA(D,<==IAvHHTYYq\\""a"1d&6AGGAt||DNN1$5$5666T^^DHHTXXdjjmm5L5L,M,MNNOOOr5   )"r   r   r   r   r   r!   r   r   r   r!   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r"   r   r#   r   r   r   r   r!   r   r   rx   )
ry   rz   r{   r|   r,   	LayerNormGELUr)   rw   r}   r~   s   @r4   r   r      s        >> /1|.0g!"&-1/3!$(%I' I' I' I' I' I' I'V       r5   r   c                       e Zd ZdZddddddddd	d
ej        ddddddddddddd	dddddddfdG fd9ZedHd>            ZdIdBZ	dCdCgfdJdFZ
 xZS )KViTzThis module implements Vision Transformer (ViT) backbone in :paper:`vitdet`. "Exploring Plain Vision Transformer
    Backbones for Object Detection", https://arxiv.org/abs/2203.16527.
    i      r&   i      r   Tr   r   )r9      r         FN   img_sizer   
patch_sizein_chans	embed_dimdepthr   r   r!   r   r   drop_path_rater   Callable[..., nn.Module] | strr   r   use_abs_postile_abs_posrel_pos_blockstuple[int, ...] | boolr   r   global_att_blockstuple[int, ...]r   r"   
int | Noneuse_interp_ropepretrain_img_sizepretrain_use_cls_tokenretain_cls_tokenr   return_interm_layersr   r   ln_preln_postbias_patch_embedcompile_mode
str | Noneuse_act_checkpointc            	        t                                                       || _        fdt          |          D             } t	                    | _        dg|z  | _        t          |t                    r|rdg|z  | _        n|D ]}!d| j        |!<   || _	        | j	        rx|sJ t          |           dk    s
J d            t          | j                  dk    s
J d            |dz  }"t          j        |"t          j        dd|          z            | _        t          |
t"                    r$t%          t'          t          |
          d	
          }
t)          ||f||f|||          | _        || _        || _        | j        r	| j        sJ | j        rB||z  ||z  z  }#|r|#dz   n|#}$t          j        t          j        d|$|                    | _        nd| _        d t          j        d|	|          D             }%|| _        || _        t          j                    | _        d}&t          |          D ]}!t?          d"i d|d|d|d|d|%|!         d|
d|d| j        |!         d|d|!| v r|ndd||z  ||z  fd|d|||fn||fd|d| j	        d|d|}'|!| vr|&dz  }&|| _         | j        !                    |'           || _"        |r|gt          | j                  z  n|g| _#        | j        &t          j$        %                    | j        d            |r |
|          nt          j&                    | _'        |r |
|          nt          j&                    | _(        | )                    | j*                   |Gt          j+        | j,        |d!          | _,        | j         r!| j-        rdt          j.        j/        _0        dS dS dS dS )#ak
  
        Args:
            img_size (int): Input image size. Only relevant for rel pos or rope.
            patch_size (int): Patch size.
            in_chans (int): Number of input image channels.
            embed_dim (int): Patch embedding dimension.
            depth (int): Depth of ViT.
            num_heads (int): Number of attention heads in each ViT block.
            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            drop_path_rate (float): Stochastic depth rate.
            norm_layer (Callable or str): Normalization layer constructor or name.
            act_layer (Callable): Activation layer constructor.
            use_abs_pos (bool): If True, use absolute positional embeddings.
            tile_abs_pos (bool): If True, tile absolute positional embeddings instead of interpolation.
            rel_pos_blocks (tuple[int, ...] | bool): Blocks which have rel pos embeddings.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            window_size (int): Window size for window attention blocks.
            global_att_blocks (tuple[int, ...]): Indexes for blocks using global attention (other blocks use window
                attention).
            use_rope (bool): Whether to use rope 2d (independent of rel_pos_blocks, as it can be used together).
            rope_pt_size (int | None): Size of rope in previous stage of training, needed for interpolation or tiling.
            use_interp_rope (bool): Whether to interpolate (or extrapolate) rope to match target input size, expected to
                specify source size as rope_pt_size.
            pretrain_img_size (int): Input image size for pretraining models.
            pretrain_use_cls_token (bool): If True, pretraining models use class token.
            retain_cls_token (bool): Whether cls_token should be retained.
            dropout (float): Dropout rate. Applied in residual blocks of attn, mlp and inside the mlp.
            return_interm_layers (bool): Whether to return intermediate layers (all global attention blocks).
            init_values (float | None): Layer scale init, None for no layer scale.
            ln_pre (bool): If True, apply layer norm before transformer blocks.
            ln_post (bool): If True, apply layer norm after transformer blocks.
            bias_patch_embed (bool): If True, use bias in conv for patch embed.
            compile_mode (str | None): Mode to compile the forward, or None to disable.
            use_act_checkpoint (bool): If True, use activation checkpointing.
        c                    g | ]}|v|	S  r   ).0ir   s     r4   
<listcomp>z ViT.__init__.<locals>.<listcomp>  s$    VVVa1DU;U;U;U;U;Ur5   FTr   z&windowing not supported with cls tokenz$rel pos not supported with cls tokenr%   r   gh㈵>)eps)kernel_sizestrider   r   r'   Nc                6    g | ]}|                                 S r   )item)r   rb   s     r4   r   z ViT.__init__.<locals>.<listcomp>  s     JJJAqvvxxJJJr5   r   r   r   r   r   r   r   r   r   r   r   r   r"   r#   r   r   r   r:   r;   )mode	fullgraphr   )1r(   r)   r   rangelistfull_attn_idsr   
isinstancer   r   lensumr,   r?   r@   randnclass_embeddingstrr   getattrr   patch_embedr   r   rA   	pos_embedlinspacer   r   
ModuleListblocksr   r   appendr   channel_listrB   rC   r   r   r   apply_init_weightscompilerw   training_dynamoconfigoptimize_ddp))r2   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r"   r   r   r   r   r   r   r   r   r   r   r   r   window_block_indexesr   r+   num_patchesnum_positionsdpr	cur_stageblockr3   s)                    `                      r4   r)   zViT.__init__?  s   L 	&<#VVVV5<<VVV!"344$gond++ 	. 	.#'&5.D# . .)-#A&& 0  	V))))+,,1113[111t*++q0002X000tOE#%<Aq)8T8T0T#U#UD j#&& 	D Z!8!8dCCCJ%#Z0
+!
 
 
 )& 	$#### 	",
:?PT^?^_K1GX[1__[M\%+a	*R*RSSDNN!DN KJ>5!I!IJJJ$&moo	u 	& 	&A   I#) $) "	
 a&& &: $) !/22 #4"3 ,-0D+D+DKK! %
2H
4JKK " =I<P{K88WceqVr ,O //   !" (K#E( ,,,Q	&8D#Ku%%%%$8!EYjYK#d.@*A*AAA`i_j>%G!!$.d!;;;/5Hjj+++2;==07Jzz),,,R[]]

4%&&&# =LTXYYYDL& :4= :49$111 $#: : : :r5   m	nn.Moduler6   r7   c                   t          | t          j                  rrt          j                            | j        d           t          | t          j                  r.| j        )t          j                            | j        d           dS dS dS t          | t          j                  rLt          j                            | j        d           t          j                            | j        d           dS dS )zInitialize the weights.r:   r;   Nr   rM   )	r   r,   r-   rB   rC   weightr'   	constant_r   )r   s    r4   r   zViT._init_weights  s     a## 	-G!!!(!555!RY'' -AF,>!!!&!,,,,,- -,>,>2<(( 	-Gafa(((Gah,,,,,	- 	-r5   rb   torch.Tensorlist[torch.Tensor]c                   |                      |          }|j        d         |j        d         }}d}| j        r3t          j        | j        |                    dd          gd          }d}| j        -|t          | j        | j	        ||f| j        | j
                  z   }|                     |          }g }t          | j                  D ]7\  }}| j        r| j        rt!          j        ||d          }n ||          }|| j        d	         k    s| j        r|| j        v r|| j        d	         k    r|                     |          }|dd|df         }|j        d
k    r|                    dddd          }nr|j        dk    sJ t-          j        |j        d                   x}}|                    |j        d         |||j        d	                                       dddd          }|                    |           9|S )z&Vit forward path and get feature maps.r   r9   r   rS   N)tilingF)use_reentrantre   rd   r&   )r   rh   r   r@   rY   r   rn   r   r   r   r   r   	enumerater   r   r   
checkpointr   r   r   rg   rl   ri   rj   rk   r   )	r2   rb   hwrr   outputsr   blkfeatss	            r4   rw   zViT.forward  s   Qwqz171:1  	 	4/1aAqIIIAA>%K+A%(   A KKNN,, 	& 	&FAs& 4= )#qFFFCFFT'+++1J+qTXTfOfOf*2...QA!!!QRR%:??!MM!Q155EE :???? Iek!n555A!MM%+a.!QBPPXXYZ\]_`bcddEu%%%r5   i  imgsz	list[int]c                   | j         D ]}|j        dk    r|j                            |d         | j        z  |d         | j        z  f           |j                            |d         | j        z  |d         | j        z  f           dS )zCSetup rel pos embeddings and rope freqs for a new input image size.r   r   )r   N)r   r   r   r0   r   r1   )r2   r   r   s      r4   	set_imgszzViT.set_imgsz  s    [ 	p 	pE A%%J%%%(do2MuUVx[_[jOj1k%lllJ((U1X5PRWXYRZ^b^mRm4n(oooo		p 	pr5   )>r   r   r   r   r   r   r   r   r   r   r   r   r   r!   r   r   r   r!   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r"   r   r   r   r   r   r   r   r   r   r   r!   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r   r   r6   r7   )rb   r   r6   r   )r   r   )ry   rz   r{   r|   r,   r   r)   staticmethodr   rw   r  r}   r~   s   @r4   r   r   :  s          #5@.0g !1>"&-:#' %!$'+!%%*$(!%#'#'Af: f: f: f: f: f: f:P - - - \-+ + + +Z -1$< p p p p p p p p pr5   r   )#r|   
__future__r   ri   	functoolsr   typingr   r@   torch.nnr,   torch.nn.functional
functionalro   torch.utils.checkpointutilsr   r   %ultralytics.models.sam.modules.blocksr   $ultralytics.models.sam.modules.utilsr   r	   r
   r   r   r   ultralytics.utils.checksr   
model_miscr   Moduler   r   r   r   r5   r4   <module>r     s  
  # " " " " "                              + + + + + + + + +       < < < < < <                8 7 7 7 7 7 " " " " " "l l l l l	 l l l^_ _ _ _ _BI _ _ _Dkp kp kp kp kp") kp kp kp kp kpr5   