
    jU                    \    d Z ddlmZ ddlmZ ddlZddlmZ  G d dej                  Z	dS )zVNecks are the interface between a vision backbone and the rest of the detection model.    )annotations)deepcopyNc                  J     e Zd ZdZ	 	 dd fdZddZddZddgfddZ xZS ) Sam3DualViTDetNeckzbA neck that implements a simple FPN as in ViTDet, with support for dual necks (for SAM3 and SAM2).      @       @      ?      ?Ftrunk	nn.Moduleposition_encodingd_modelintadd_sam2_neckboolc                   t                                                       || _        || _        t	          j                    | _        || _        d}| j        j        d         }t          |          D ]\  }}	t	          j
                    }
|	dk    r|
                    dt	          j        ||dz  dd                     |
                    dt	          j                               |
                    dt	          j        |dz  |d	z  dd                     |d	z  }n|	d
k    r5|
                    dt	          j        ||dz  dd                     |dz  }nO|	dk    r|}nF|	dk    r-|
                    dt	          j        dd                     |}nt          d|	 d          |
                    dt	          j        ||d|                     |
                    dt	          j        ||dd|                     | j                            |
           d| _        |rt'          | j                  | _        dS dS )a  
        SimpleFPN neck a la ViTDet
        (From detectron2, very lightly adapted)
        It supports a "dual neck" setting, where we have two identical necks (for SAM3 and SAM2), with different weights.

        :param trunk: the backbone
        :param position_encoding: the positional encoding to use
        :param d_model: the dimension of the model
        :param scale_factors: tuple of scale factors for each FPN level
        :param add_sam2_neck: whether to add a second neck for SAM2
        Tr   dconv_2x2_0   )kernel_sizestridegeludconv_2x2_1   r	   	dconv_2x2r
   r   maxpool_2x2zscale_factor=z is not supported yet.conv_1x1   )in_channelsout_channelsr   biasconv_3x3   )r    r!   r   paddingr"   N)super__init__r   r   nn
ModuleListconvsscale_factorschannel_list	enumerate
Sequential
add_moduleConvTranspose2dGELU	MaxPool2dNotImplementedErrorConv2dappend
sam2_convsr   )selfr   r   r   r+   r   use_biasdim_scalecurrentout_dim	__class__s               f/home/longshao/multi-rider-rag/.venv/lib/python3.11/site-packages/ultralytics/models/sam/sam3/necks.pyr'   zSam3DualViTDetNeck.__init__   s   & 	
!2]__
*:*2.!-00 5	' 5	'HAumooG||""!&sC1H!ANNN   ""GII   ""!&saxqQRSSS   (#""&sC1H!ANNN   (##""!LQq999   )*W%*W*W*WXXX	 '!( !!	     	 '!( !!  	 	 	 Jg&&&& 	3&tz22DOOO	3 	3    tensor_listlist[torch.Tensor]returnctuple[list[torch.Tensor], list[torch.Tensor], list[torch.Tensor] | None, list[torch.Tensor] | None]c                    |                      |          }|d         }|                     || j                  \  }}| j        ||ddfS |                     || j                  \  }}||||fS )z8Get feature maps and positional encodings from the neck.r   N)r   sam_forward_feature_levelsr*   r6   )r7   rA   xsxsam3_outsam3_possam2_outsam2_poss           r?   forwardzSam3DualViTDetNeck.forwardj   s|     ZZ$$rF!<<Q
KK(?"XtT11!<<QPP(8X55r@   rH   torch.Tensorr*   nn.ModuleList-tuple[list[torch.Tensor], list[torch.Tensor]]c                    g g }}|D ]b} ||          }|                     |           |                     |                     |                              |j                             c||fS )zNRun neck convolutions and compute positional encodings for each feature level.)r5   r   todtype)r7   rH   r*   outspossconvfeats          r?   rF   z-Sam3DualViTDetNeck.sam_forward_feature_levelsv   sz     d 	E 	ED477DKKKK..t4477
CCDDDDTzr@   i  imgsz	list[int]c                :    | j                             |           dS )z*Set the image size for the trunk backbone.N)r   	set_imgsz)r7   rX   s     r?   r[   zSam3DualViTDetNeck.set_imgsz   s    
U#####r@   )r   F)r   r   r   r   r   r   r   r   )rA   rB   rC   rD   )rH   rN   r*   rO   rC   rP   )rX   rY   )	__name__
__module____qualname____doc__r'   rM   rF   r[   __classcell__)r>   s   @r?   r   r      s        ll +#V3 V3 V3 V3 V3 V3 V3p
6 
6 
6 
6	 	 	 	 -1$< $ $ $ $ $ $ $ $ $r@   r   )
r_   
__future__r   copyr   torchtorch.nnr(   Moduler    r@   r?   <module>rg      s   
 ] \ " " " " " "             t$ t$ t$ t$ t$ t$ t$ t$ t$ t$r@   