
    /j                     t    d dl Zd dlZd dlmZ d dlmZ d dlmZ  G d de          Z	 G d de	e          Z
dS )	    N)LoadVisualPrompt)DetectionPredictor)SegmentationPredictorc                   V     e Zd ZdZddef fdZd Z fdZd fd	Z fd	Z	d
 Z
 xZS )YOLOEVPDetectPredictoraa  A class extending DetectionPredictor for YOLO-EVP (Enhanced Visual Prompting) predictions.

    This class provides common functionality for YOLO models that use visual prompting, including model setup, prompt
    handling, and preprocessing transformations.

    Attributes:
        model (torch.nn.Module): The YOLO model for inference.
        device (torch.device): Device to run the model on (CPU or CUDA).
        prompts (dict | torch.Tensor): Visual prompts containing class indices and bounding boxes or masks.

    Methods:
        setup_model: Initialize the YOLO model and set it to evaluation mode.
        set_prompts: Set the visual prompts for the model.
        pre_transform: Preprocess images and prompts before inference.
        inference: Run inference with visual prompts.
        get_vpe: Process source to get visual prompt embeddings.
    Tverbosec                 \    t                                          ||           d| _        dS )zSet up the model for prediction.

        Args:
            model (torch.nn.Module): Model to load or use.
            verbose (bool, optional): If True, provides detailed logging.
        )r   TN)supersetup_modeldone_warmup)selfmodelr   	__class__s      j/home/longshao/multi-rider-rag/.venv/lib/python3.11/site-packages/ultralytics/models/yolo/yoloe/predict.pyr   z"YOLOEVPDetectPredictor.setup_model   s0     	E7333    c                     || _         dS )zSet the visual prompts for the model.

        Args:
            prompts (dict): Dictionary containing class indices and bounding boxes or masks. Must include a 'cls' key
                with class indices.
        N)prompts)r   r   s     r   set_promptsz"YOLOEVPDetectPredictor.set_prompts(   s     r   c           	          t                                                     j                            dd           j                            dd          } j        d         t	                    dk    rn                     d         j        dd         d         j        dd         |          }|                    d                               j	                  }nhJ d d	            t          t                    rt          d
 D                       sJ d d	            t          t                    rt          d D                       sJ d d	            t	                    t	                    cxk    rt	                    k    s=n J dt	                     dt	                     dt	                     d	             fdt          t	                              D             }t          j        j        j                            |d                               j	                  } j        j        r|                                n|                                 _        S )a  Preprocess images and prompts before inference.

        This method applies letterboxing to the input image and transforms the visual prompts (bounding boxes or masks)
        accordingly.

        Args:
            im (list): List of input images.

        Returns:
            (list): Preprocessed images ready for model inference.

        Raises:
            ValueError: If neither valid bounding boxes nor masks are provided in the prompts.
        bboxesNmaskscls   r      zExpected bboxes, but got !c              3   J   K   | ]}t          |t          j                  V  d S N
isinstancenpndarray.0bs     r   	<genexpr>z7YOLOEVPDetectPredictor.pre_transform.<locals>.<genexpr>K   s/      3^3^RSJq"*4M4M3^3^3^3^3^3^r   z#Expected list[np.ndarray], but got c              3   J   K   | ]}t          |t          j                  V  d S r   r   r"   s     r   r%   z7YOLOEVPDetectPredictor.pre_transform.<locals>.<genexpr>N   s/      5b5bTUjBJ6O6O5b5b5b5b5b5br   z-Expected same length for all inputs, but got vsc           	          g | ]L}                     |         j        d d         |         j        d d         |         |                   MS )Nr   )_process_single_imageshape)r#   ir   categoryimimgr   s     r   
<listcomp>z8YOLOEVPDetectPredictor.pre_transform.<locals>.<listcomp>T   sf        **3q6<+;RU[!_hWXk[abc[dee  r   T)batch_first)r
   pre_transformr   poplenr)   r*   	unsqueezetodevicer   listallrangetorchnnutilsrnnpad_sequencer   fp16halffloat)	r   r-   r   visualsr   r   r,   r.   r   s	   ``   @@@r   r1   z$YOLOEVPDetectPredictor.pre_transform1   s    gg##B''!!(D11  $//<&s88q==00Qbqb1A2a5;rPQr?T\^dfkllG''**--dk::GG %%'L6'L'L'L%%%fd++ 3^3^W]3^3^3^0^0^  ?f??? ^ h-- #5b5bYa5b5b5b2b2b  AhAAA b r77c(mm::::s6{{:::::jBjj3x==jj\_`f\g\gjjj ;::       s3xx  G hn(55g45PPSSTXT_``G)-Mw||~~~gmmoo
r   Nc           
         |t          |          rt          j        |t          j                  }|j        dk    r|dddf         }t          |d         |d         z  |d         |d         z            }||z  }|ddddfxx         t          |d         t          |d         |z            z
  dz  dz
            z  cc<   |ddddfxx         t          |d         t          |d         |z            z
  dz  dz
            z  cc<   nP|?t                                          |          }t          j	        |          }d||dk    <   nt          d	          t                                          ||||          S )
a  Process a single image by resizing bounding boxes or masks and generating visuals.

        Args:
            dst_shape (tuple): The target shape (height, width) of the image.
            src_shape (tuple): The original shape (height, width) of the image.
            category (list | np.ndarray): The category indices for visual prompts.
            bboxes (list | np.ndarray, optional): A list of bounding boxes in the format [x1, y1, x2, y2].
            masks (np.ndarray, optional): A list of masks corresponding to the image.

        Returns:
            (torch.Tensor): The processed visuals for the image.

        Raises:
            ValueError: If neither `bboxes` nor `masks` are provided.
        N)dtyper   r   .r   g?r   z$Please provide valid bboxes or masks)r3   r    arrayfloat32ndimminroundr
   r1   stack
ValueErrorr   get_visuals)	r   	dst_shape	src_shaper,   r   r   gainresized_masksr   s	           r   r)   z,YOLOEVPDetectPredictor._process_single_image\   s     #f++XfBJ777F{aaaay|il2IaL9Q<4OPPDdNF319	!uYq\D=P7Q7Q(QUV'VY\'\!]!]]319	!uYq\D=P7Q7Q(QUV'VY\'\!]!]]!GG11%88MH]++E"#E%3,CDDD  !!--h	65QQQr   c                 L     t                      j        |g|R d| j        i|S )a&  Run inference with visual prompts.

        Args:
            im (torch.Tensor): Input image tensor.
            *args (Any): Variable length argument list.
            **kwargs (Any): Arbitrary keyword arguments.

        Returns:
            (torch.Tensor): Model prediction results.
        vpe)r
   	inferencer   )r   r-   argskwargsr   s       r   rT   z YOLOEVPDetectPredictor.inference   s3     !uww GGGGGGGGr   c                     |                      |           t          | j                  dk    s
J d            | j        D ]9\  }}}|                     |          }|                     || j        d          c S dS )a  Process the source to get the visual prompt embeddings (VPE).

        Args:
            source (str | Path | int | PIL.Image | np.ndarray | torch.Tensor | list | tuple): The source of the image to
                make predictions on. Accepts various types including file paths, URLs, PIL images, numpy arrays, and
                torch tensors.

        Returns:
            (torch.Tensor): The visual prompt embeddings (VPE) from the model.
        r   z get_vpe only supports one image!T)rS   
return_vpeN)setup_sourcer3   dataset
preprocessr   r   )r   source_im0sr-   s        r   get_vpezYOLOEVPDetectPredictor.get_vpe   s     	&!!!4<  A%%%'I%%%, 	E 	EJAtQ&&B::bdlt:DDDDD	E 	Er   )T)NN)__name__
__module____qualname____doc__boolr   r   r1   r)   rT   r_   __classcell__)r   s   @r   r   r      s         $   $              ) ) ) ) )V"R "R "R "R "R "RHH H H H HE E E E E E Er   r   c                       e Zd ZdZdS )YOLOEVPSegPredictorz\Predictor for YOLO-EVP segmentation tasks combining detection and segmentation capabilities.N)r`   ra   rb   rc    r   r   rg   rg      s        ffDr   rg   )numpyr    r:   ultralytics.data.augmentr   ultralytics.models.yolo.detectr   ultralytics.models.yolo.segmentr   r   rg   rh   r   r   <module>rm      s         5 5 5 5 5 5 = = = = = = A A A A A AQE QE QE QE QE/ QE QE QEh	 	 	 	 	02G 	 	 	 	 	r   