
    j/                     4   d dl mZ d dlmZ d dlmZ ddlmZmZ ddl	m
Z
 ddlmZmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZmZ ddlmZmZ ddlmZ ddl m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z' ddefdZ(defdZ)dde*de+de+fdZ,dde*defdZ-d dZ.dS )!    N)MLP)
torch_load   )PositionEmbeddingSineRoPEAttention)MemoryEncoder)MemoryAttentionMemoryAttentionLayer)	SAM3Model)TransformerDecoderTransformerDecoderLayer)TransformerEncoderFusionTransformerEncoderLayer)SequenceGeometryEncoder)PixelDecoderUniversalSegmentationHead)DotProductScoringTransformerWrapper)Sam3DualViTDetNeck)SAM3SemanticModel)VETextEncoder)ViT)SAM3VLBackboneTreturnc                     t          dddd          }t          di dddd	d
dddddddddddddddddddddddddd dd!d"d#dd$d%d&dd'd%d(d%d)d%d*| }t          |dg d+||,          S )-z.Create SAM3 visual backbone with ViT and neck.   TN'  num_pos_feats	normalizescaletemperatureimg_size  pretrain_img_sizeiP  
patch_size   	embed_dim   depth    	num_heads   	mlp_ratiog     @
norm_layer	LayerNormdrop_path_rate皙?qkv_biasuse_abs_postile_abs_posglobal_att_blocks)            rel_pos_blocks use_ropeuse_interp_ropewindow_size   pretrain_use_cls_tokenretain_cls_tokenFln_preln_postreturn_interm_layersbias_patch_embedcompile_mode)g      @g       @g      ?g      ?)position_encodingd_modelscale_factorstrunkadd_sam2_neck)r   r   r   )rG   enable_inst_interactivityrH   vit_backbones       f/home/longshao/multi-rider-rag/.venv/lib/python3.11/site-packages/ultralytics/models/sam/build_sam3.py_create_vision_backbonerP      s    .	      # 2 $	
 b " % ; s  D T */ r   !" B#$  $t%& '( t)* +, #U-. /0 "\1L4 +***/       c                  b   t          t          dddddddt          j        dddd          t          j        dddd          	  	        d	dd
dddd          } t	          t          dddt          j        ddd          dd          d	dddddddddddd          }t          | |d          S )z)Create SAM3 detector encoder and decoder.r      r2   TF   )r,   dropoutr(   batch_first)	rI   dim_feedforwardrU   pos_enc_at_attnpos_enc_at_cross_attn_keyspos_enc_at_cross_attn_queriespre_normself_attentioncross_attention   r   )layer
num_layersrI   num_feature_levelsfrozenuse_act_checkpointadd_pooled_text_to_img_featpool_text_with_maskr,   rU   r(   )rI   rW   rU   r]   n_headsuse_text_cross_attention   r   logN)r_   r`   num_queriesreturn_intermediate
box_refinenum_o2m_queriesdacboxRPBrI   rb   interaction_layerdac_use_selfatt_lnrc   presence_token)encoderdecoderrI   )r   r   nnMultiheadAttentionr   r   r   )rt   ru   s     rO   _create_sam3_transformerrx   H   s/   (@%  ',*/0 	   1 	  
 
 
* $) 9) ) )G< #5% 1  
 %)
 
 
  3# # #G8 gwLLLLrQ   Fcheckpoint_pathenable_segmentationcompilec                    	 ddl }n%# t          $ r ddlm}  |d           ddl }Y nw xY w|rdnd}t	          |d          }t          |j                                        dd	d
d          }t          ||d          }t                      }	t          ddt          dddddt          j        d                              }
|r=t          ddddddt          j        ddd          t!          ddd|                    nd}t#          t%          dddd          ddddddt'          ddddddd          ddd          }t)          ||	||dd|
dd	  	        }t+          ||           }|                                 |S ) a  Build SAM3 image model.

    Args:
        checkpoint_path: Optional path to model checkpoint
        enable_segmentation: Whether to enable segmentation head
        compile: Whether to enable compilation of the model

    Returns:
        A SAM3 image model
    r   N)check_requirementsz+git+https://github.com/ultralytics/CLIP.gitdefaultT)rG   rM   r   r)   r-   r@   )	tokenizerrI   widthheadslayersr   )visualtextscalprS      )	input_dim
hidden_dim
output_dimr`   residualout_norm)rI   d_proj
prompt_mlp   FrT   rf   nearest)num_upsampling_stagesinterpolation_moder   rG   )r   upsampling_stages	aux_maskspresence_headdot_product_scoreract_ckptcross_attend_promptpixel_decoderr   r   r2   )rI   rW   rU   rX   r[   rZ   rY   )pos_encencode_boxes_as_pointsboxes_direct_project
boxes_poolboxes_pos_encrI   r`   r_   use_act_ckptadd_clsadd_post_encode_proj)	backbonetransformerinput_geometry_encodersegmentation_headra   o2m_mask_predictdot_prod_scoringuse_instance_querymultimask_output)clipImportErrorultralytics.utils.checksr}   rP   r   simple_tokenizerSimpleTokenizerr   rx   r   r   rv   r0   r   rw   r   r   r   r   r   _load_checkpointeval)ry   rz   r{   r   r}   rG   vision_encodertext_encoderr   r   r   r   r   models                 rO   build_sam3_image_modelr      sn      ??????HIII	 !(199TL,,bfgggN !'7799  L ^,aPPPH +,,K )\#&&
 
 
  D '	!# " 5! ! !
 '&'#,)	  	
 	
 	
 	
( + 2 5%	
 
 
  %!% !*/'+
 
 
 !1  8 5+) 
 
 
E UO44E	JJLLLLs    ))c                 H   t          dddg          }t          dddt          dddddt          dd	d	d
ddg          dt          dd	d	dd
ddgd                    d          }|r t	          d	t          |          d          nd}t          d;i ddd|d|d|dddddddd d!dd"dd#dd$dd%dd&dd'dd(dd)dd*dd+dd,dd-dd.d/d0d	d1dd2dd3dd4dd5dd6t          dd7d89          }t          || d:          }|	                                 |S )<aa  Build the SAM3 Tracker module for video tracking.

    Args:
        checkpoint_path (str): Path to model checkpoint.
        compile (str | None): Compilation mode for the vision backbone.
        with_backbone (bool): Whether to include the vision backbone in the model.

    Returns:
        (SAM3Model): A configured and initialized SAM3 model.
    @   i  )out_diminterpol_sizeTr   rS   r2   Fr   g     @H   )embedding_dimr,   downsample_rate
rope_theta
feat_sizes)r   r,   r   	kv_in_dimr   r   rope_k_repeat)rW   rU   rX   rY   rZ   	self_attnrI   
cross_attn   )rV   rI   pos_enc_at_inputr_   r`   )rG   N)r   r   r   
image_sizer$   image_encodermemory_attentionmemory_encoderbackbone_strider'   num_maskmemr7   sigmoid_scale_for_mem_encg      4@sigmoid_bias_for_mem_encg      $$use_mask_input_as_output_without_samdirectly_add_no_mem_embeduse_high_res_features_in_sammultimask_output_in_samiou_prediction_use_sigmoiduse_obj_ptrs_in_encoderadd_tpos_enc_to_obj_ptrs"only_obj_ptrs_in_the_past_for_evalpred_obj_scorespred_obj_scores_mlpfixed_no_obj_ptrmultimask_output_for_trackinguse_multimask_token_for_obj_ptrmultimask_min_pt_numr   multimask_max_pt_numuse_mlp_for_obj_ptr_projcompile_image_encoderno_obj_embed_spatialproj_tpos_enc_in_obj_ptrsuse_signed_tpos_enc_to_obj_ptrssam_mask_decoder_extra_argsg?g\(\?)dynamic_multimask_via_stability!dynamic_multimask_stability_delta"dynamic_multimask_stability_thresh)interactiver<   )
r   r	   r
   r   r   rP   r   dictr   r   )ry   r{   with_backboner   r   r   r   s          rO   build_interactive_sam3r     s    #2dD\JJJN&" !'+*/#! !"8   $! !"8"  
 
 
0 9  B 	Q'>G'T'T'T[_```` 
  " " "4"h" *)" &~	"
 " A" #'$" "'" .2T" #'$" &*T" !%" $(4" !%" "&"  ,04!"" #"$ !D%"& '"( '+d)"* )-+", Q-". Q/"0 "&1"2 $e3"4 "T5"6 #'$7"8 )-9": %),0.2/3%
 %
 %
 %
;"EJ UOFFFE 
JJLLLLrQ   c                    t          |d          5 }t          |          }ddd           n# 1 swxY w Y   d|v r#t          |d         t                    r|d         }d |                                D             }|r|                    d |                                D                        |                    d |                                D                        |                    d |                                D                        |                    d |                                D                        |                     |d	
           | S )z%Load SAM3 model checkpoint from file.rbNr   c                 J    i | ] \  }}d |v 	|                     dd          |!S )detectorz	detector. replace.0kvs      rO   
<dictcomp>z$_load_checkpoint.<locals>.<dictcomp>e  s8    aaaAQ[_`Q`Q`qyyb111Q`Q`Q`rQ   c                 J    i | ] \  }}d |v 	|                     d d          |!S )zbackbone.vision_backbonezimage_encoder.vision_backboner   r   s      rO   r   z$_load_checkpoint.<locals>.<dictcomp>h  sD       Aq-22 		46UVVXY222rQ   c                 J    i | ] \  }}d |v 	|                     dd          |!S )ztracker.transformerztracker.transformer.encoderr   r   r   s      rO   r   z$_load_checkpoint.<locals>.<dictcomp>o  sC       Aq(A-- 		79KLLa---rQ   c                 J    i | ] \  }}d |v 	|                     d d          |!S )ztracker.maskmem_backboner   r   r   s      rO   r   z$_load_checkpoint.<locals>.<dictcomp>v  sC       Aq-22 		46FGG222rQ   c                 J    i | ] \  }}d |v 	|                     d d          |!S )ztracker.r   r   r   s      rO   r   z$_load_checkpoint.<locals>.<dictcomp>|  s8    iiiAYcghYhYh		*b 9 91YhYhYhrQ   F)strict)openr   
isinstancer   itemsupdateload_state_dict)r   
checkpointr   fckptsam3_image_ckpts         rO   r   r   _  s   	j$		 1!}}              $:d7mT::G}aaaaaO k +1133  	
 	
 	
 	  JJLL  	
 	
 	
 	  JJLL  	
 	
 	
 	iiiiijjj	/%888Ls   -11)NT)TF)F)/torch.nnrv   "ultralytics.nn.modules.transformerr   ultralytics.utils.patchesr   modules.blocksr   r   modules.encodersr   modules.memory_attentionr	   r
   modules.samr   sam3.decoderr   r   sam3.encoderr   r   sam3.geometry_encodersr   sam3.maskformer_segmentationr   r   sam3.model_miscr   r   
sam3.necksr   sam3.sam3_imager   sam3.text_encoder_ver   sam3.vitdetr   sam3.vl_combinerr   rP   rx   strboolr   r   r   r<   rQ   rO   <module>r     s'  
       2 2 2 2 2 2 0 0 0 0 0 0 @ @ @ @ @ @ @ @ + + + + + + K K K K K K K K " " " " " " E E E E E E E E K K K K K K K K ; ; ; ; ; ; Q Q Q Q Q Q Q Q B B B B B B B B * * * * * * . . . . . . / / / / / /       , , , , , ,+ +Rd + + + +\<M"4 <M <M <M <M~x xC xd x\` x x x xvZ ZC ZV_ Z Z Z Zz     rQ   