
    j                        d dl mZ d dlZd dlZd dlmZ d dlmZmZ d dl	m
Z
mZmZ dddZe
dz  ddfddZedk    r ed           dS dS )    )annotationsN)Path)IMG_FORMATSimg2label_paths)DATASETS_DIRLOGGERTQDM皙?
source_dir
str | Pathtrain_ratiofloatreturnr   c                   t          |           }t          | d          }|dz  |dz  }}|                    d           |                    d           |                    d           d |                                D             }t          d |D                       }t	          |           d| d	}t          j        d
| d| d|ddd|z
  dd	           |D ]}	||	j        z                      d           ||	j        z                      d           t          |		                    d                    }
t          j        |
           t          t	          |
          |z            }|
d|         D ]'}t          j        |||	j        z  |j        z             (|
|d         D ]'}t          j        |||	j        z  |j        z             (t          j        d| d           |S )u)  Split classification dataset into train and val directories in a new directory.

    Creates a new directory '{source_dir}_split' with train/val subdirectories, preserving the original class structure
    with an 80/20 split by default.

    Directory structure:
        Before:
            caltech/
            ├── class1/
            │   ├── img1.jpg
            │   ├── img2.jpg
            │   └── ...
            ├── class2/
            │   ├── img1.jpg
            │   └── ...
            └── ...

        After:
            caltech_split/
            ├── train/
            │   ├── class1/
            │   │   ├── img1.jpg
            │   │   └── ...
            │   ├── class2/
            │   │   ├── img1.jpg
            │   │   └── ...
            │   └── ...
            └── val/
                ├── class1/
                │   ├── img2.jpg
                │   └── ...
                ├── class2/
                │   └── ...
                └── ...

    Args:
        source_dir (str | Path): Path to classification dataset root directory.
        train_ratio (float): Ratio for train split, between 0 and 1.

    Returns:
        (Path): Path to the created split directory.

    Examples:
        Split dataset with default 80/20 ratio
        >>> split_classify_dataset("path/to/caltech")

        Split with custom ratio
        >>> split_classify_dataset("path/to/caltech", 0.75)
    _splittrainvalT)exist_okc                :    g | ]}|                                 |S  )is_dir.0ds     [/home/longshao/multi-rider-rag/.venv/lib/python3.11/site-packages/ultralytics/data/split.py
<listcomp>z*split_classify_dataset.<locals>.<listcomp>I   s%    AAAahhjjA!AAA    c              3  t   K   | ]3}t          t          |                    d                               V  4dS )*.*N)lenlistglobr   s     r   	<genexpr>z)split_classify_dataset.<locals>.<genexpr>J   s<      DDAs4u..//DDDDDDr   z
 classes, z imagesz
Splitting z (z) into z.0%z train,    z val...r   NzSplit complete in u    ✅)r   mkdiriterdirsumr    r   infonamer!   r"   randomshuffleintshutilcopy2)r   r   source_path
split_path
train_pathval_path
class_dirstotal_imagesstats	class_dirimage_files	split_idximgs                r   split_classify_datasetr:      sP   d z""K,,,--J%/e1CJ d###d###NNDN!!! BA[0022AAAJDDDDDDDL:??,???E
Ko[ooEoo+oooSTWbSbooooppp D D		in	$++T+:::	IN	"))4)888 9>>%0011{###K((;677	z	z* 	F 	FCLj9>9CHDEEEEyzz* 	D 	DCLh7#(BCCCC	D K5Z555666r   zcoco8/images)g?g?g        Fpathweightstuple[float, float, float]annotated_onlyboolNonec                |   t          |           } t          d |                     d          D                       }t          |          }t	          j        d           t	          j        g d||          }g d}|D ]:}| j        |z                                  r| j        |z  	                                 ;t          j        d|  d|z  z              t          t          ||          |	          D ]\  }}	|rBt          t          t          |	          g          d                                                   r}t!          | j        ||         z  d
d          5 }
|
                    d|	                    | j                                                   dz              ddd           n# 1 swxY w Y   dS )a  Automatically split a dataset into train/val/test splits and save the resulting splits into autosplit_*.txt
    files.

    Args:
        path (Path): Path to images directory.
        weights (tuple[float, float, float]): Train, validation, and test split fractions.
        annotated_only (bool): If True, only images with an associated txt file are used.

    Examples:
        Split images with default weights
        >>> from ultralytics.data.split import autosplit
        >>> autosplit()

        Split with custom weights and annotated images only
        >>> autosplit(path="path/to/images", weights=(0.8, 0.15, 0.05), annotated_only=True)
    c              3  j   K   | ].}|j         d d                                         t          v *|V  /dS )r$   N)suffixlowerr   )r   xs     r   r#   zautosplit.<locals>.<genexpr>x   sB      WW18ABB<3E3E3G3G;3V3V13V3V3V3VWWr   r   r   )r   r$      )r<   k)zautosplit_train.txtzautosplit_val.txtzautosplit_test.txtzAutosplitting images from z!, using *.txt labeled images only)totalazutf-8)encodingz./
N)r   sortedrglobr    r*   seedchoicesparentexistsunlinkr   r(   r	   zipr   stropenwriterelative_toas_posix)r;   r<   r>   filesnindicestxtrE   ir9   fs              r   	autosplitr_   b   s
   * ::DWWdjj//WWWWWEE

A
KNNNnYYY1===G
L
L
LC ' 'K!O##%% 	'[1_$$&&&
K3T336Y\j6jjkkks7E**!444 O O3 	Oos3xxj&A&A!&D!E!E!L!L!N!N 	OdkCF*C'BBB OaFS__T[99BBDDFFMNNNO O O O O O O O O O O O O O OO Os   AF00F4	7F4	__main__
caltech101)r
   )r   r   r   r   r   r   )r;   r   r<   r=   r>   r?   r   r@   )
__future__r   r*   r-   pathlibr   ultralytics.data.utilsr   r   ultralytics.utilsr   r   r	   r:   r_   __name__r   r   r   <module>rg      s    # " " " " "         ? ? ? ? ? ? ? ? 8 8 8 8 8 8 8 8 8 8R R R R Rl .*9 $O $O $O $O $ON z<((((( r   