
    j4                        d dl mZ d dlmZ d dlZd dlmZ d dlmZ ddl	m
Z
 dd	lmZ dd
lmZ ddZ G d dej        j                  ZdS )    )annotations)deepcopyN)inverse_sigmoid)	xywh2xyxy   )	SAM2Model   )Prompt)SAM3VLBackboneTc                <   |r|d         n|| |<   |r|rd| vr,d t          t          |          dz
            D             | d<   t          | d                   t          |          dz
  k    sJ t          | d         |dd                   D ]\  }}|||<   dS dS dS )zLHelper function to update output dictionary with main and auxiliary outputs.aux_outputsc                    g | ]}i S  r   ).0_s     k/home/longshao/multi-rider-rag/.venv/lib/python3.11/site-packages/ultralytics/models/sam/sam3/sam3_image.py
<listcomp>z_update_out.<locals>.<listcomp>   s    !H!H!H"!H!H!H    r	   N)rangelenzip)outout_name	out_value	auxiliary
update_aux
aux_output	aux_values          r   _update_outr       s    %.=IbMMICM -Z -##!H!HeC	NNQ4F.G.G!H!H!HC3}%&&#i..1*<<<<<%(]);Yss^%L%L 	- 	-!J	#,Jx  - - - -	- 	-r   c                       e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 d'd( fdZ	 	 	 d)dZ	 d*d+dZd Z	 	 d,dZd Z		 d*d-d Z
d.d#Zd/d&Z xZS )0SAM3SemanticModelzCSAM3 model for semantic segmentation with vision-language backbone.Nr	   TFr   backboner   use_instance_queryboolmultimask_outputuse_act_checkpoint_seg_headsupervise_joint_box_scoresdetach_presence_in_joint_scoreseparate_scorer_for_instancenum_interactive_steps_valintc                   t                                                       || _        || _        || _        |j        | _        || _        || _        || _	        || _
        |
| _        || _        || _        || _        | j        r)|J || _
        d| _        |rt!          |          | _        nLt"          j                            | j        d          | _        d| _        |rt!          | j                  | _        || _        || _        | j        j        j        }| j        j        j        }|| j        j        j        r|ndk    sJ | j        j        j        | _        || _        |	| _        i | _        g | _        dS )z!Initialize the SAM3SemanticModel.Nr	   r   ) super__init__r#   geometry_encodertransformerd_model
hidden_dimnum_feature_levelssegmentation_heado2m_mask_predictdot_prod_scoringr'   matcherr+   use_dot_prod_scoringinstance_dot_prod_scoringr   torchnnLinearclass_embedinstance_class_embedr(   r)   decodernum_queriesnum_o2m_queriesdacr$   r&   text_embeddingsnames)selfr#   r1   input_geometry_encoderr5   r4   r6   r7   r$   r&   r'   r8   r9   r(   r)   r*   r+   num_o2o_staticnum_o2m_static	__class__s                      r   r/   zSAM3SemanticModel.__init__!   s   ( 	  6&%-"4!2 0 0+F()B&$8!$ 
	G#///$4D!-1D*+ L19:J1K1K.$xtBBD(,D%+ G,4T5E,F,F)*D'.L+ )1=)1AD4D4L4P"W..VWXXXX#+/"4 0!


r   c                   ||d         |z   g}|                      ||||          \  }}	|_t          j        dg|j        dd         R |j                  }t          j        g |	j        dd         dR |	j        |	j                  }t          j        ||gd          }
t          j        |	|gd          }|
|fS )	z(Encode the geometric and visual prompts.Nr   )
geo_prompt	img_feats	img_sizesimg_pos_embedsr   r	   )device)rP   dtypedim)r0   r;   zerosshaperP   rQ   cat)rF   rM   rO   vis_feat_sizesgeometric_promptvisual_prompt_embedvisual_prompt_maskprev_mask_pred	geo_feats	geo_maskspromptprompt_masks               r   _encode_promptz SAM3SemanticModel._encode_prompta   s     %"278I#44'$)	  5  
  
	9 &"'+q.G9?1223F.G.GPYP`"a"a"a!&*)/#2#&*** 'o" " "
 I':;CCCi,> ?QGGG{""r   encoder_extra_kwargsdict | Nonec                   | j                             |                                d|                                ||||          }|d         |d         |d         |d         |d         |||                    d|          |d		}|S )
zRun the transformer encoder.N)srcsrc_key_padding_masksrc_posr^   prompt_key_padding_mask
feat_sizesra   memory	pos_embedpadding_maskspatial_shapesvalid_ratiosmemory_text)	encoder_hidden_statesrj   rk   rl   rm   rW   prompt_before_encprompt_after_encr_   )r1   encodercopyget)	rF   rM   rO   rW   r^   r_   ra   ri   encoder_outs	            r   _run_encoderzSAM3SemanticModel._run_encoder   s     !))  !%"''))$/%!5 * 
 
 &,H%5,">2$%56">2,!' &

=& A A&
 
 r   c                   |j         d         }| j        j        j        j        }	|	                    d                              d|d          }
| j                            |
|||d|d         |d         d||d          \  }}}}|                    dd          }|                    dd          }||                    dd          }|                     ||||||           ||fS )	zRun the transformer decoder.r	   Nrl   rm   F)tgtri   memory_key_padding_maskposreference_boxesrl   rm   tgt_maskrn   text_attention_mask	apply_dacr   )dec_presence_out)	rU   r1   r@   query_embedweight	unsqueezerepeat	transpose_update_scores_and_boxes)rF   rj   ri   src_maskr   r^   r_   ru   bsr   rx   hsr{   r   r   s                  r   _run_decoderzSAM3SemanticModel._run_decoder   s    \!_&.:A##A&&--aQ77373C3K3K$, &'78$^4 + 4L 4
 4
0O-q \\!Q)33Aq99'/99!Q??%%- 	& 	
 	
 	
 Bwr   c           	        |                     d          }| j        r%| j        }	|r| j        | j        }	 |	|||          }
n"| j        }|r| j        | j        } ||          }
| j        j        j        }|r"| j        j        j	        | j        j        j	        } ||          }t          |          }||z                                   }t          |          }|t          |d|d           | j        r|J |                                                                }| j        r|                                }t          |
                                |                    d          z                                dd          }
t          |d	|
ddddd|f         d           t          |d
|ddddd|f         d           t          |d|ddddd|f         d           dS )z9Update output dict with class scores and box predictions.r   Npresence_logit_decF)r   g      $g      $@)minmaxpred_logits
pred_boxespred_boxes_xyxy)sizer9   r7   r:   r>   r?   r1   r@   
bbox_embedinstance_bbox_embedr   sigmoidr   r    r(   cloner)   detachr   clamp)rF   r   r   r{   r^   r_   r   is_instance_promptnum_o2odot_prod_scoring_headoutputs_classclass_embed_headbox_headanchor_box_offsetsreference_boxes_inv_sigoutputs_coordoutputs_boxes_xyxyprob_dec_presence_outs                     r   r   z*SAM3SemanticModel._update_scores_and_boxes   sR    ''!**$ 		1$($9!! Gd&D&P(,(F%11"fkJJMM#/! =d&?&K#'#< ,,R00M #+6 	D$"2":"N"Z'/CH%Xb\\"1/"B"B03EENNPP&}55'13CPUVVVV* 	#///$4$:$:$<$<$D$D$F$F!2 G(=(D(D(F(F%+M,A,A,C,CF[FeFefgFhFh,hiioot p  M 	CaaaHWHn(ERWXXXXC}QQQ8G8^'DQVWWWWC*,>qqq!!!XgX~,N[`aaaaaar   c           	     |   | j         |                    d          }| j        r|n|ddddd|f         }|                      |d         ||||          }	|	                                D ]8\  }
}|
| j         j        v r t          ||
|ddd|f         d           3|||
<   9dS |                    dd           dS )z%Run segmentation heads and get masks.Nr   backbone_fpn)backbone_featsobj_queriesro   r^   r_   F)r   )r5   r   r6   itemsinstance_keysr    pop)rF   r   backbone_outro   r^   r_   r   r   r   seg_head_outputskvs               r   _run_segmentation_headsz)SAM3SemanticModel._run_segmentation_heads  s    !-ggajjG $ 5M""2aaaHWHn;MK#55+N;'&;'  6     )..00  1.<<<Q!!!XgX+%HHHHHCFF	  ^T22222r   r   dict[str, torch.Tensor]text_idstorch.TensorrX   r
   c           
     0   t          j        | |t          |                    \  }}}}|                    d | j                                        D                        t          j                            d          5  | 	                    ||||          \  }}ddd           n# 1 swxY w Y   |d         dd|f         }	|d         |         }
t          j
        |	|gd          }t          j
        |
|gd	          }t          j                            d
          5  |                     |||||          }ddd           n# 1 swxY w Y   d|i}t          j                            d          5  |                     |d         |d         |d         ||||          \  }}ddd           n# 1 swxY w Y   t          j                            d          5  |                     |||d         |||           ddd           n# 1 swxY w Y   |S )zRForward pass for grounding (detection + segmentation) given input images and text.)batchc                    i | ]\  }}||	S r   r   )r   r   r   s      r   
<dictcomp>z7SAM3SemanticModel.forward_grounding.<locals>.<dictcomp>$  s    KKKdaQKKKr   zSAM3Image._encode_promptNlanguage_featureslanguage_maskr   rR   r	   zSAM3Image._run_encoderr   zSAM3Image._run_decoderro   rj   rk   )ri   rj   r   r   r^   r_   ru   z!SAM3Image._run_segmentation_heads)r   r   ro   r^   r_   r   )r   _prepare_backbone_featuresr   updaterD   r   r;   profilerrecord_functionr`   rV   rv   r   r   )rF   r   r   rX   rM   rO   rW   r^   r_   	txt_feats	txt_masksru   r   r   s                 r   forward_groundingz#SAM3SemanticModel.forward_grounding  sc    CLBf,c(mmC
 C
 C
?i 	KKd.B.H.H.J.JKKKLLL^++,FGG 	s 	s"&"5"5iQ_aq"r"rFK	s 	s 	s 	s 	s 	s 	s 	s 	s 	s 	s 	s 	s 	s 	s !!45aaakB	 1(;	Iv.A666iK 8a@@@ ^++,DEE 	l 	l++I~~W]_jkkK	l 	l 	l 	l 	l 	l 	l 	l 	l 	l 	l 	l 	l 	l 	l|, ^++,DEE 		 		''"#:;%k2$^4'' (  GC		 		 		 		 		 		 		 		 		 		 		 		 		 		 		 ^++,OPP 	 	(()&12I&J' )   	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 
sH   ?B''B+.B+!EEE52F33F7:F7"HHHtext	list[str]c                R    | j                             |          | _        || _        dS )z2Set the text embeddings for the given class names.N)r#   forward_textrD   rE   )rF   r   s     r   set_classeszSAM3SemanticModel.set_classesL  s%    #}99$??


r   imgsztuple[int, int]c                :    | j                             |           dS )z!Set the image size for the model.N)r#   	set_imgsz)rF   r   s     r   r   zSAM3SemanticModel.set_imgszQ  s    &&&&&r   )Nr	   TNTTTNTFFFr   )r#   r   r$   r%   r&   r%   r'   r%   r(   r%   r)   r%   r*   r%   r+   r,   )NNN)N)ra   rb   )NF)r   r   r   r   rX   r
   )r   r   )r   r   )__name__
__module____qualname____doc__r/   r`   rv   r   r   r   r   r   r   __classcell__)rJ   s   @r   r"   r"      s:       MM #'!%,0!+0/4-2)*#> > > > > > >L !# # # #L -1" " " " "H) ) )d  0b 0b 0b 0bd3 3 3: im- - - - -^   
' ' ' ' ' ' ' 'r   r"   )TT)
__future__r   rs   r   r;   ultralytics.nn.modules.utilsr   ultralytics.utils.opsr   modules.samr   geometry_encodersr
   vl_combinerr   r    r<   Moduler"   r   r   r   <module>r      s   
 # " " " " "        8 8 8 8 8 8 + + + + + + # # # # # # % % % % % % ' ' ' ' ' '- - - -u' u' u' u' u' u' u' u' u' u'r   