#!/usr/bin/env python3
"""Prepare a YOLO dataset archive for training on the server.

Always included:
    data/lanov_custom/
Included only when present on disk:
    data/hk_custom/
    data/yolo_visdrone/   (expected to already contain only classes 0-3;
                           label files are copied verbatim, not filtered)

Output: dataset_lanov.zip
    lanov/train/images+labels, lanov/val/images+labels, lanov/dataset.yaml
    best.pt   (only when a best.pt file exists in the working directory)
"""

import random
import shutil
import zipfile
from pathlib import Path

VAL_RATIO = 0.2          # fraction of annotated samples placed in the val split
BG_PER_SPLIT = 50        # maximum number of background (empty-label) samples per split
SEED = 42                # fixed seed so the shuffle, and therefore the split, is reproducible
CLASS_NAMES = ["car", "van", "truck", "bus"]
OUT = Path("dataset_lanov")
ZIP_OUT = Path("dataset_lanov.zip")

# Image extensions tried, in order, when pairing a label file with its image.
IMAGE_SUFFIXES = (".jpg", ".png", ".jpeg")

# ---------------------------------------------------------------------------
# Data sources: (image dir, label dir, prefix used to keep file names unique)
# ---------------------------------------------------------------------------
SOURCES = [
    ("data/lanov_custom/train/images", "data/lanov_custom/train/labels", "lanov"),
]

# Optional sources: (prefix, directory holding "images" and "labels").
# They are assumed to use the same YOLO format and the same class ids.
OPTIONAL_SOURCES = [
    ("hk", "data/hk_custom/train"),
    ("visdrone", "data/yolo_visdrone/train"),
]


def discover_sources(base=None, optional=None):
    """Return the base sources plus every optional source found on disk.

    Args:
        base: list of (img_dir, lbl_dir, prefix) tuples that are always used;
            defaults to SOURCES.
        optional: list of (prefix, directory) pairs that are used only when
            both ``directory/images`` and ``directory/labels`` exist;
            defaults to OPTIONAL_SOURCES.

    Returns:
        A new list of (img_dir, lbl_dir, prefix) tuples. The inputs are not
        modified, so the function can be called repeatedly.
    """
    sources = list(SOURCES if base is None else base)
    for name, path in (OPTIONAL_SOURCES if optional is None else optional):
        img_dir = Path(path) / "images"
        lbl_dir = Path(path) / "labels"
        if img_dir.exists() and lbl_dir.exists():
            # Non-empty label files are the annotated tiles.
            n = sum(1 for lbl in lbl_dir.glob("*.txt") if lbl.stat().st_size > 0)
            print(f"+ {name}: {n} anotovaných dlaždic")
            sources.append((str(img_dir), str(lbl_dir), name))
        else:
            print(f"- {name}: nenalezeno, přeskočeno")
    return sources


def find_image(img_dir, stem):
    """Return the first existing ``img_dir/stem<suffix>`` path, or None."""
    for suffix in IMAGE_SUFFIXES:
        candidate = Path(img_dir) / (stem + suffix)
        if candidate.exists():
            return candidate
    return None


def collect_entries(sources):
    """Pair every label file with its image and sort the pairs by kind.

    Args:
        sources: iterable of (img_dir, lbl_dir, prefix) tuples.

    Returns:
        A tuple ``(annotated, background, orphans)``:
        ``annotated`` and ``background`` are lists of
        (label_path, image_path, prefix) for non-empty and empty label files
        respectively; ``orphans`` lists label paths that have no image.
        Orphans are kept out of the other two lists so they neither consume
        split quota nor end up in the dataset as labels without an image.
    """
    annotated = []
    background = []
    orphans = []
    for img_dir, lbl_dir, prefix in sources:
        # sorted() makes the pre-shuffle order independent of the filesystem.
        for lbl in sorted(Path(lbl_dir).glob("*.txt")):
            img = find_image(img_dir, lbl.stem)
            if img is None:
                orphans.append(lbl)
                continue
            entry = (lbl, img, prefix)
            if lbl.stat().st_size > 0:
                annotated.append(entry)
            else:
                background.append(entry)
    return annotated, background, orphans


def split_entries(annotated, background, val_ratio=VAL_RATIO, bg_per_split=BG_PER_SPLIT):
    """Split already-shuffled samples into train and val parts.

    The first ``max(1, int(len(annotated) * val_ratio))`` annotated samples go
    to val and the rest to train. Background samples are capped: the first
    ``bg_per_split`` go to train and the next ``bg_per_split`` to val.

    Returns:
        ``((train_annotated, train_background), (val_annotated, val_background))``
    """
    n_val = max(1, int(len(annotated) * val_ratio))
    val_ann = annotated[:n_val]
    train_ann = annotated[n_val:]
    train_bg = background[:bg_per_split]
    val_bg = background[bg_per_split:bg_per_split * 2]
    return (train_ann, train_bg), (val_ann, val_bg)


def copy_split(entries, split_dir):
    """Copy label/image pairs into ``split_dir/labels`` and ``split_dir/images``.

    Target file names are prefixed with the source prefix so that files with
    the same stem coming from different sources do not overwrite each other.
    """
    img_out = Path(split_dir) / "images"
    lbl_out = Path(split_dir) / "labels"
    img_out.mkdir(parents=True, exist_ok=True)
    lbl_out.mkdir(parents=True, exist_ok=True)
    for lbl_path, img_path, prefix in entries:
        stem = f"{prefix}_{lbl_path.stem}"
        shutil.copy2(lbl_path, lbl_out / f"{stem}.txt")
        shutil.copy2(img_path, img_out / f"{stem}{img_path.suffix}")


def write_dataset_yaml(out_dir):
    """Write ``dataset.yaml`` describing the splits and classes into out_dir."""
    # NOTE(review): "path: ." is interpreted by the training tool, not by this
    # script -- confirm it resolves to the directory containing dataset.yaml.
    (Path(out_dir) / "dataset.yaml").write_text(
        f"path: .\ntrain: train/images\nval: val/images\nnc: {len(CLASS_NAMES)}\nnames: {CLASS_NAMES}\n",
        encoding="utf-8",
    )


def build_zip(out_dir, zip_path, weights):
    """Pack out_dir under ``lanov/`` into zip_path, plus ``weights`` if it exists."""
    out_dir = Path(out_dir)
    zip_path = Path(zip_path)
    weights = Path(weights)
    if zip_path.exists():
        zip_path.unlink()
    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for f in sorted(out_dir.rglob("*")):
            if f.is_file():
                zf.write(f, "lanov/" + f.relative_to(out_dir).as_posix())
        if weights.exists():
            zf.write(weights, "best.pt")
            print("Přidáno: best.pt")


def main():
    """Build OUT/ from all available sources and pack it into ZIP_OUT."""
    random.seed(SEED)

    # -----------------------------------------------------------------------
    # Load all labels from all sources
    # -----------------------------------------------------------------------
    sources = discover_sources()
    all_annotated, all_background, orphans = collect_entries(sources)
    if orphans:
        print(f"! {len(orphans)} labelů bez obrázku přeskočeno")
    if not all_annotated:
        print("! Nenalezeny žádné anotované vzorky")

    random.shuffle(all_annotated)
    random.shuffle(all_background)
    (train_ann, train_bg), (val_ann, val_bg) = split_entries(all_annotated, all_background)

    print(f"\nAnotované: {len(all_annotated)} (train {len(train_ann)}, val {len(val_ann)})")
    print(f"Background: {len(all_background)} (train {len(train_bg)}, val {len(val_bg)})")

    # -----------------------------------------------------------------------
    # Copy into OUT/ (always rebuilt from scratch)
    # -----------------------------------------------------------------------
    if OUT.exists():
        shutil.rmtree(OUT)
    for split_name, split in [("train", train_ann + train_bg), ("val", val_ann + val_bg)]:
        copy_split(split, OUT / split_name)

    # -----------------------------------------------------------------------
    # dataset.yaml
    # -----------------------------------------------------------------------
    write_dataset_yaml(OUT)

    # -----------------------------------------------------------------------
    # ZIP
    # -----------------------------------------------------------------------
    build_zip(OUT, ZIP_OUT, Path("best.pt"))

    size_mb = ZIP_OUT.stat().st_size / 1e6
    print(f"\nHotovo: {ZIP_OUT} ({size_mb:.1f} MB)")
    print("\nNa serveru spusť:")
    print(" unzip dataset_lanov.zip")
    print(" pip install ultralytics")
    print(" yolo train model=best.pt data=lanov/dataset.yaml epochs=50 imgsz=256 batch=32")


if __name__ == "__main__":
    main()