
    j<                    0   d dl mZ d dlmZ d dlmZ d dlZd dlmZ d dl	m
Z
 d dlmZ d dlmZ 	 d dlZn# e$ r  ej        d           d dlZY nw xY w G d	 d
ej                  Z G d de          Z G d de          Z G d de          ZdddZdS )    )annotations)abstractmethod)PathN)Image)checks)smart_inference_modez+git+https://github.com/ultralytics/CLIP.gitc                  N     e Zd ZdZ fdZed             Zed             Z xZS )	TextModela  Abstract base class for text encoding models.

    This class defines the interface for text encoding models used in vision-language tasks. Subclasses must implement
    the tokenize and encode_text methods to provide text tokenization and encoding functionality.

    Methods:
        tokenize: Convert input texts to tokens for model processing.
        encode_text: Encode tokenized texts into normalized feature vectors.
    c                H    t                                                       dS )z$Initialize the TextModel base class.N)super__init__)self	__class__s    ^/home/longshao/multi-rider-rag/.venv/lib/python3.11/site-packages/ultralytics/nn/text_model.pyr   zTextModel.__init__!   s        c                    dS )z3Convert input texts to tokens for model processing.N r   textss     r   tokenizezTextModel.tokenize%   	     	r   c                    dS )z7Encode tokenized texts into normalized feature vectors.Nr   r   r   dtypes      r   encode_textzTextModel.encode_text*   r   r   )	__name__
__module____qualname____doc__r   r   r   r   __classcell__r   s   @r   r
   r
      sx                ^   ^    r   r
   c                       e Zd ZdZd fdZdddZ e            ej        fdd            Z	 e            ej        fdd            Z
 xZS )CLIPa  Implements OpenAI's CLIP (Contrastive Language-Image Pre-training) text encoder.

    This class provides a text encoder based on OpenAI's CLIP model, which can convert text into feature vectors that
    are aligned with corresponding image features in a shared embedding space.

    Attributes:
        model (clip.model.CLIP): The loaded CLIP model.
        image_preprocess (callable): Preprocessing transform for images.
        device (torch.device): Device where the model is loaded.

    Methods:
        tokenize: Convert input texts to CLIP tokens.
        encode_text: Encode tokenized texts into normalized feature vectors.

    Examples:
        >>> import torch
        >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        >>> clip_model = CLIP(size="ViT-B/32", device=device)
        >>> tokens = clip_model.tokenize(["a photo of a cat", "a photo of a dog"])
        >>> text_features = clip_model.encode_text(tokens)
        >>> print(text_features.shape)
    sizestrdevicetorch.devicereturnNonec                    t                                                       t          j        ||          \  | _        | _        |                     |           || _        |                                  dS )a  Initialize the CLIP text encoder.

        This class implements the TextModel interface using OpenAI's CLIP model for text encoding. It loads a
        pre-trained CLIP model of the specified size and prepares it for text encoding tasks.

        Args:
            size (str): Model size identifier (e.g., 'ViT-B/32').
            device (torch.device): Device to load the model on.
        )r&   N)	r   r   cliploadmodelimage_preprocesstor&   eval)r   r$   r&   r   s      r   r   zCLIP.__init__H   s`     	,0Id6,J,J,J)
D)		r   Tr   str | list[str]truncatebooltorch.Tensorc                ^    t          j        ||                              | j                  S )a  Convert input texts to CLIP tokens.

        Args:
            texts (str | list[str]): Input text or list of texts to tokenize.
            truncate (bool, optional): Whether to trim texts that exceed CLIP's context length. Defaults to True to
                avoid RuntimeError from overly long inputs while still allowing explicit opt-out.

        Returns:
            (torch.Tensor): Tokenized text tensor with shape (batch_size, context_length) ready for model processing.

        Examples:
            >>> model = CLIP("ViT-B/32", device="cpu")
            >>> tokens = model.tokenize("a photo of a cat")
            >>> print(tokens.shape)  # torch.Size([1, 77])
            >>> strict_tokens = model.tokenize("a photo of a cat", truncate=False)  # Enforce strict length checks
            >>> print(strict_tokens.shape)  # Same shape/content as tokens since prompt less than 77 tokens
        r2   )r+   r   r/   r&   r   r   r2   s      r   r   zCLIP.tokenizeX   s)    $ }UX66699$+FFFr   r   torch.dtypec                    | j                             |                              |          }||                    ddd          z  }|S )aw  Encode tokenized texts into normalized feature vectors.

        This method processes tokenized text inputs through the CLIP model to generate feature vectors, which are then
        normalized to unit length. These normalized vectors can be used for text-image similarity comparisons.

        Args:
            texts (torch.Tensor): Tokenized text inputs, typically created using the tokenize() method.
            dtype (torch.dtype, optional): Data type for output features.

        Returns:
            (torch.Tensor): Normalized text feature vectors with unit length (L2 norm = 1).

        Examples:
            >>> clip_model = CLIP("ViT-B/32", device="cuda")
            >>> tokens = clip_model.tokenize(["a photo of a cat", "a photo of a dog"])
            >>> features = clip_model.encode_text(tokens)
            >>> features.shape
            torch.Size([2, 512])
           Tpdimkeepdimr-   r   r/   norm)r   r   r   	txt_featss       r   r   zCLIP.encode_textl   sI    * J**51144U;;		D I II	r   imageImage.Image | torch.Tensorc                J   t          |t          j                  r@|                     |                              d                              | j                  }| j                            |                              |          }||                    ddd          z  }|S )a  Encode images into normalized feature vectors.

        This method processes image inputs through the CLIP model to generate feature vectors, which are then
        normalized to unit length. These normalized vectors can be used for text-image similarity comparisons.

        Args:
            image (PIL.Image | torch.Tensor): Image input as a PIL Image or preprocessed tensor. If a PIL Image is
                provided, it will be converted to a tensor using the model's image preprocessing function.
            dtype (torch.dtype, optional): Data type for output features.

        Returns:
            (torch.Tensor): Normalized image feature vectors with unit length (L2 norm = 1).

        Examples:
            >>> from ultralytics.nn.text_model import CLIP
            >>> from PIL import Image
            >>> clip_model = CLIP("ViT-B/32", device="cuda")
            >>> image = Image.open("path/to/image.jpg")
            >>> image_tensor = clip_model.image_preprocess(image).unsqueeze(0).to("cuda")
            >>> features = clip_model.encode_image(image_tensor)
            >>> features.shape
            torch.Size([1, 512])
        r   r:   r;   Tr<   )	
isinstancer   r.   	unsqueezer/   r&   r-   encode_imagerA   )r   rC   r   	img_featss       r   rH   zCLIP.encode_image   s    2 eU[)) 	N))%00::1==@@MMEJ++E2255e<<		D I II	r   r$   r%   r&   r'   r(   r)   T)r   r1   r2   r3   r(   r4   r   r4   r   r8   r(   r4   )rC   rD   r   r8   r(   r4   )r   r   r   r   r   r   r   torchfloat32r   rH   r    r!   s   @r   r#   r#   0   s         .      G G G G G( DIM     0 SXS`         r   r#   c                  r     e Zd ZdZddddddZd fdZddZ e            ej	        fdd            Z
 xZS )
MobileCLIPa?  Implement Apple's MobileCLIP text encoder for efficient text encoding.

    This class implements the TextModel interface using Apple's MobileCLIP model, providing efficient text encoding
    capabilities for vision-language tasks with reduced computational requirements compared to standard CLIP models.

    Attributes:
        model (mobileclip.model.MobileCLIP): The loaded MobileCLIP model.
        tokenizer (callable): Tokenizer function for processing text inputs.
        device (torch.device): Device where the model is loaded.
        config_size_map (dict): Mapping from size identifiers to model configuration names.

    Methods:
        tokenize: Convert input texts to MobileCLIP tokens.
        encode_text: Encode tokenized texts into normalized feature vectors.

    Examples:
        >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        >>> text_encoder = MobileCLIP(size="s0", device=device)
        >>> tokens = text_encoder.tokenize(["a photo of a cat", "a photo of a dog"])
        >>> features = text_encoder.encode_text(tokens)
    s0s1s2b)rQ   rR   rS   rT   bltr$   r%   r&   r'   r(   r)   c                   	 ddl }n(# t          $ r t          j        d           ddl }Y nw xY wt	                                                       | j        |         }d| d}t          |                                          sddl	m
}  |d|            |                    d| ||          d         | _        |                    d|           | _        |                     |           || _        |                                  dS )	aF  Initialize the MobileCLIP text encoder.

        This class implements the TextModel interface using Apple's MobileCLIP model for efficient text encoding.

        Args:
            size (str): Model size identifier (e.g., 's0', 's1', 's2', 'b', 'blt').
            device (torch.device): Device to load the model on.
        r   Nz1git+https://github.com/ultralytics/mobileclip.gitmobileclip_z.pt)downloadzHhttps://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/)
pretrainedr&   )
mobileclipImportErrorr   check_requirementsr   r   config_size_mapr   is_fileultralyticsrX   create_model_and_transformsr-   get_tokenizer	tokenizerr/   r&   r0   )r   r$   r&   rZ   configfilerX   r   s          r   r   zMobileCLIP.__init__   sD   	 	 	 	%&YZZZ	
 	%d+&T&&&Dzz!!## 	h,,,,,,Hf`dffggg;;<R&<R<R_clr;sstuv
#112H2H2HII		s    "--r   	list[str]r4   c                \    |                      |                              | j                  S )a  Convert input texts to MobileCLIP tokens.

        Args:
            texts (list[str]): List of text strings to tokenize.

        Returns:
            (torch.Tensor): Tokenized text inputs with shape (batch_size, sequence_length).

        Examples:
            >>> model = MobileCLIP("s0", "cpu")
            >>> tokens = model.tokenize(["a photo of a cat", "a photo of a dog"])
        rb   r/   r&   r   s     r   r   zMobileCLIP.tokenize   s&     ~~e$$''444r   r   r8   c                    | j                             |                              |          }||                    ddd          z  }|S )az  Encode tokenized texts into normalized feature vectors.

        Args:
            texts (torch.Tensor): Tokenized text inputs.
            dtype (torch.dtype, optional): Data type for output features.

        Returns:
            (torch.Tensor): Normalized text feature vectors with L2 normalization applied.

        Examples:
            >>> model = MobileCLIP("s0", device="cpu")
            >>> tokens = model.tokenize(["a photo of a cat", "a photo of a dog"])
            >>> features = model.encode_text(tokens)
            >>> features.shape
            torch.Size([2, 512])  # Actual dimension depends on model size
        r:   r;   Tr<   r@   )r   r   r   text_featuress       r   r   zMobileCLIP.encode_text   sL    $ 
..u5588??++aR+FFFr   rJ   )r   re   r(   r4   rL   )r   r   r   r   r]   r   r   r   rM   rN   r   r    r!   s   @r   rP   rP      s         , "TCPPO     :5 5 5 5 DIM         r   rP   c                  f     e Zd ZdZdd fdZdddZ e            ej        fdd            Z	 xZ
S )MobileCLIPTSa  Load a TorchScript traced version of MobileCLIP.

    This class implements the TextModel interface using Apple's MobileCLIP model in TorchScript format, providing
    efficient text encoding capabilities for vision-language tasks with optimized inference performance.

    Attributes:
        encoder (torch.jit.ScriptModule): The loaded TorchScript MobileCLIP text encoder.
        tokenizer (callable): Tokenizer function for processing text inputs.
        device (torch.device): Device where the model is loaded.

    Methods:
        tokenize: Convert input texts to MobileCLIP tokens.
        encode_text: Encode tokenized texts into normalized feature vectors.

    Examples:
        >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        >>> text_encoder = MobileCLIPTS(device=device)
        >>> tokens = text_encoder.tokenize(["a photo of a cat", "a photo of a dog"])
        >>> features = text_encoder.encode_text(tokens)
    mobileclip_blt.tsr&   r'   weightr%   c                    t                                                       ddlm} t          j                             ||          |          | _        t          j        j	        | _
        || _        dS )a  Initialize the MobileCLIP TorchScript text encoder.

        This class implements the TextModel interface using Apple's MobileCLIP model in TorchScript format for efficient
        text encoding with optimized inference performance.

        Args:
            device (torch.device): Device to load the model on.
            weight (str): Path to the TorchScript model weights.
        r   )attempt_download_asset)map_locationN)r   r   ultralytics.utils.downloadsro   rM   jitr,   encoderr+   r   rb   r&   )r   r&   rm   ro   r   s       r   r   zMobileCLIPTS.__init__  sj     	FFFFFFy~~&<&<V&D&DSY~ZZ+r   Tr   re   r2   r3   r(   r4   c                `    |                      ||                              | j                  S )a  Convert input texts to MobileCLIP tokens.

        Args:
            texts (list[str]): List of text strings to tokenize.
            truncate (bool, optional): Whether to trim texts that exceed the tokenizer context length. Defaults to True,
                matching CLIP's behavior to prevent runtime failures on long captions.

        Returns:
            (torch.Tensor): Tokenized text inputs with shape (batch_size, sequence_length).

        Examples:
            >>> model = MobileCLIPTS(device=torch.device("cpu"))
            >>> tokens = model.tokenize(["a photo of a cat", "a photo of a dog"])
            >>> strict_tokens = model.tokenize(
            ...     ["a very long caption"], truncate=False
            ... )  # RuntimeError if exceeds 77-token
        r6   rg   r7   s      r   r   zMobileCLIPTS.tokenize(  s*    $ ~~eh~77::4;GGGr   r   r8   c                R    |                      |                              |          S )av  Encode tokenized texts into normalized feature vectors.

        Args:
            texts (torch.Tensor): Tokenized text inputs.
            dtype (torch.dtype, optional): Data type for output features.

        Returns:
            (torch.Tensor): Normalized text feature vectors with L2 normalization applied.

        Examples:
            >>> model = MobileCLIPTS(device="cpu")
            >>> tokens = model.tokenize(["a photo of a cat", "a photo of a dog"])
            >>> features = model.encode_text(tokens)
            >>> features.shape
            torch.Size([2, 512])  # Actual dimension depends on model size
        )rs   r/   r   s      r   r   zMobileCLIPTS.encode_text<  s$    & ||E""%%e,,,r   )rl   )r&   r'   rm   r%   rK   )r   re   r2   r3   r(   r4   rL   )r   r   r   r   r   r   r   rM   rN   r   r    r!   s   @r   rk   rk     s         *      "H H H H H( DIM - - - - - - - - -r   rk   variantr%   r&   r'   r(   c                    |                      d          \  }}|dk    rt          ||          S |dk    rt          |          S |dk    rt          |d          S t          d| d          )	a  Build a text encoding model based on the specified variant.

    Args:
        variant (str): Model variant in format "base:size" (e.g., "clip:ViT-B/32" or "mobileclip:s0").
        device (torch.device, optional): Device to load the model on.

    Returns:
        (TextModel): Instantiated text encoding model.

    Examples:
        >>> model = build_text_model("clip:ViT-B/32", device=torch.device("cuda"))
        >>> model = build_text_model("mobileclip:s0", device=torch.device("cpu"))
    :r+   rZ   mobileclip2zmobileclip2_b.ts)rm   zUnrecognized base model 'z<'. Supported models are 'clip', 'mobileclip', 'mobileclip2'.)splitr#   rk   
ValueError)rv   r&   baser$   s       r   build_text_modelr}   R  s     s##JD$v~~D&!!!			F###			F+=>>>>wTwwwxxxr   )N)rv   r%   r&   r'   r(   r
   )
__future__r   abcr   pathlibr   rM   torch.nnnnPILr   ultralytics.utilsr   ultralytics.utils.torch_utilsr   r+   r[   r\   Moduler
   r#   rP   rk   r}   r   r   r   <module>r      s   # " " " " "                          $ $ $ $ $ $ > > > > > >KKKK   FKLLLKKKKK
    	   4r r r r r9 r r rjY Y Y Y Y Y Y YxN- N- N- N- N-9 N- N- N-by y y y y y ys   5 AA