
    /jv"                        d dl mZ d dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d	d
lmZ  G d de          ZdS )    )annotationsN)Image)SegmentationPredictor)DEFAULT_CFG)box_iou)scale_masks)
TORCH_1_10   )adjust_bboxes_to_image_borderc                  J     e Zd ZdZeddfd
 fdZ fdZddZd Zd	 Z	 xZ
S )FastSAMPredictora  FastSAMPredictor is specialized for fast SAM (Segment Anything Model) segmentation prediction tasks.

    This class extends the SegmentationPredictor, customizing the prediction pipeline specifically for fast SAM. It
    adjusts post-processing steps to incorporate mask prediction and non-maximum suppression while optimizing for
    single-class segmentation.

    Attributes:
        prompts (dict): Dictionary containing prompt information for segmentation (bboxes, points, labels, texts).
        device (torch.device): Device on which model and tensors are processed.
        clip (Any, optional): CLIP model used for text-based prompting, loaded on demand.

    Methods:
        postprocess: Apply postprocessing to FastSAM predictions and handle prompts.
        prompt: Perform image segmentation inference based on various prompt types.
        set_prompts: Set prompts to be used during inference.
    N
_callbacksdict | Nonec                \    t                                          |||           i | _        dS )a;  Initialize the FastSAMPredictor with configuration and callbacks.

        This initializes a predictor specialized for Fast SAM (Segment Anything Model) segmentation tasks. The predictor
        extends SegmentationPredictor with custom post-processing for mask prediction and non-maximum suppression
        optimized for single-class segmentation.

        Args:
            cfg (dict): Configuration for the predictor.
            overrides (dict, optional): Configuration overrides.
            _callbacks (dict, optional): Dictionary of callback functions.
        N)super__init__prompts)selfcfg	overridesr   	__class__s       g/home/longshao/multi-rider-rag/.venv/lib/python3.11/site-packages/ultralytics/models/fastsam/predict.pyr   zFastSAMPredictor.__init__#   s,     	i444    c                   | j                             dd          }| j                             dd          }| j                             dd          }| j                             dd          }t                                          |||          }|D ]}	t	          j        dd|	j        d         |	j        d         g|	j        j        j	        t          j
                  }
t          |	j        j        |	j                  }t	          j        t          |
d         |          d	k                                              }|                                dk    r|
|	j        j        |<   |                     |||||
          S )a  Apply postprocessing to FastSAM predictions and handle prompts.

        Args:
            preds (list[torch.Tensor]): Raw predictions from the model.
            img (torch.Tensor): Input image tensor that was fed to the model.
            orig_imgs (list[np.ndarray]): Original images before preprocessing.

        Returns:
            (list[Results]): Processed results with prompts applied.
        bboxesNpointslabelstextsr   r
   )devicedtypeg?)r   r   r   r   )r   popr   postprocesstorchtensor
orig_shapeboxesdatar   float32r   xyxynonzeror   flattennumelprompt)r   predsimg	orig_imgsr   r   r   r   resultsresultfull_boxr&   idxr   s                r   r"   zFastSAMPredictor.postprocess2   sT    !!(D11!!(D11!!(D11  $//''%%eS)<< 	2 	2F|Av(+V->q-AB6<K\Kckpkx  H 2&,2CVEVWWE- > > DEEMMOOCyy{{a)1!#&{{76&W\{]]]r   c                
   ||||S g }t          |t                    s|g}|D ]F}t          |          dk    r|                    |           ,|j        j        j        dd         |j        k    rIt          d         	                                |j                  d         dk    
                                t          j        t          |          t          j        | j                  }|t          j        |t          j        | j                  }|j        dk    r|d         n|}|dddf         |dddf         z
  |dddf         |dddf         z
  z  }	t          j        fd|D                       }
t          j        d	
          }|	dddf         |z   |
z
  }d|t          j        |
|z  d
          <   |t          j        |t          j        | j                  }|j        dk    r|d         n|}|t          j        |j        d                   }t          j        |t          j        | j                  }t          |          t          |          k    s+J dt          |           dt          |           d            |                                dk    r3t          j        t          |          t          j        | j                  n2t          j        t          |          t          j        | j                  }t-          ||          D ]H\  }}t          |          |t          j        dd|d         |d         f         d          d         <   I||z  }|t          |t0                    r|g}g g c}t3          |j        j                                                  D ]\  }}d |D             \  }}}}t:          r|                                         n,|                             d                                          dk    r                    |           ~|j        ||||f         |||||df                                                                          z  }|                    tC          j"        |dddddddf                              | #                    ||          }t          j        |d
          }t                    rVfdtI          t          |                    D             }t          j%        |tM          |                   | j                  }d||<   |                    ||                    H|S )a  Perform image segmentation inference based on cues like bounding boxes, points, and text prompts.

        Args:
            results (Results | list[Results]): Original inference results from FastSAM models without any prompts.
            bboxes (np.ndarray | list, optional): Bounding boxes with shape (N, 4), in XYXY format.
            points (np.ndarray | list, optional): Points indicating object locations with shape (N, 2), in pixels.
            labels (np.ndarray | list, optional): Labels for point prompts, shape (N, ). 1 = foreground, 0 = background.
            texts (str | list[str], optional): Textual prompts, a list containing string objects.

        Returns:
            (list[Results]): Output results filtered and determined by the provided prompts.
        Nr   r
   g      ?)r    r         c                    g | ]?}d d |d         |d         |d         |d         f                              d          @S )Nr
   r6   r   r7   r
   r7   dim)sum).0bmaskss     r   
<listcomp>z+FastSAMPredictor.prompt.<locals>.<listcomp>l   sT    )m)m)mab%1Q4!A$;!qt0K*L*P*PU[*P*\*\)m)m)mr   r9   r:   Tz?Expected `labels` to have the same length as `points`, but got z and .)as_tuplec              3  4   K   | ]}t          |          V  d S )N)int)r=   xs     r   	<genexpr>z*FastSAMPredictor.prompt.<locals>.<genexpr>   s(      %8%8c!ff%8%8%8%8%8%8r   d   c                    g | ]}|v|	S  rJ   )r=   i
filter_idxs     r   r@   z+FastSAMPredictor.prompt.<locals>.<listcomp>   s#    UUUa*ATATATATATr   r   )'
isinstancelistlenappendr?   r'   shaper%   r   floatbyter#   zerosboolr   	as_tensorint32ndimstackr<   argmaxoneszipr*   str	enumerater&   r)   tolistr	   orig_imgcpunumpyr   	fromarray_clip_inferenceranger$   rD   )r   r1   r   r   r   r   prompt_resultsr2   r4   
bbox_areas
mask_areasfull_mask_areasunion	point_idxpointlabelcrop_imsrK   r>   x1y1x2y2crop
similaritytext_idxori_idxsrL   r?   s                              @@r   r-   zFastSAMPredictor.promptM   sW    >fnN'4(( 	 iG 6	/ 6	/F6{{a%%f---L%E{122&"333$U4[%6%6%8%8&:KLLQORUU[[]]+c&kkDKPPPC!u{4;WWW)/)9)9v$QQQTlVAAAqD\9fQQQTlVTUTUTUWXTX\>YZ
"[)m)m)m)mfl)m)m)mnn
"')Ev">">">"111d7+o=
J?CELe!3;;;<!u{4;WWW)/)9)9v>"ZQ88Fu{4;WWW6{{c&kk111vVYZ`VaVavvhklrhshsvvv 211
 zz||q(( Js6{{%*T[QQQQS[[
4;WWW 
 %($7$7 k kLE5_cdi_j_jIemE!!!U1XuQx2G,HSWXXXYZ[\\y  eS)) $"GE')2$*%fl&7&>&>&@&@AA G GDAq%8%8a%8%8%8NBB*4Oa%(,,q//:M:M:O:OTWWW"))!,,, !?2b5"R%<85BrE2b5RVAV;W;[;[;];];c;c;e;eeDOOEODAAAttt4D$E$EFFFF!11(EBB
 <
;;;z?? YUUUU5V+=+=UUUH$|HS]],CDKXXXH $H!!&+....r   c                J    ddl m} t           d          s |d j                   _        t          j         fd|D                       } j                            |          } j                             j        	                    |                    }||j
        z  S )a  Perform CLIP inference to calculate similarity between images and text prompts.

        Args:
            images (list[PIL.Image]): List of source images, each should be PIL.Image with RGB channel order.
            texts (list[str]): List of prompt texts, each should be a string object.

        Returns:
            (torch.Tensor): Similarity matrix between given images and texts with shape (M, N).
        r   )CLIPclipzViT-B/32rM   c                t    g | ]4}j                             |                              j                  5S rJ   )rz   image_preprocesstor   )r=   imager   s     r   r@   z4FastSAMPredictor._clip_inference.<locals>.<listcomp>   s:    dddTYdi88??BB4;OOdddr   )ultralytics.nn.text_modelry   hasattrr   rz   r#   rZ   encode_imageencode_texttokenizeT)r   imagesr   ry   image_featurestext_featuress   `     r   re   z FastSAMPredictor._clip_inference   s     	322222tV$$ 	=Z<<<DIdddd]cdddee//77	--di.@.@.G.GHH~///r   c                    || _         dS )z(Set prompts to be used during inference.N)r   )r   r   s     r   set_promptszFastSAMPredictor.set_prompts   s    r   )r   r   )NNNN)__name__
__module____qualname____doc__r   r   r"   r-   re   r   __classcell__)r   s   @r   r   r      s         " '$RV       ^ ^ ^ ^ ^6J J J JX0 0 0&      r   r   )
__future__r   r#   PILr   ultralytics.models.yolo.segmentr   ultralytics.utilsr   ultralytics.utils.metricsr   ultralytics.utils.opsr   ultralytics.utils.torch_utilsr	   utilsr   r   rJ   r   r   <module>r      s    # " " " " "        A A A A A A ) ) ) ) ) ) - - - - - - - - - - - - 4 4 4 4 4 4 0 0 0 0 0 0] ] ] ] ], ] ] ] ] ]r   