#!/usr/bin/env python3
"""
Prepare the dataset for training on the server.

Always included:  data/lanov_custom/
Conditionally:    data/hk_custom/       (if it exists)
                  data/yolo_visdrone/   (if it exists, classes 0–3 only)

Output: dataset_lanov.zip
  lanov/train/images+labels, val/images+labels, dataset.yaml
  best.pt (only when the file is present next to this script)
"""

import random
import shutil
import zipfile
from pathlib import Path

# Fraction of annotated tiles that goes to the validation split.
VAL_RATIO = 0.2
# Maximum number of background (empty-label) tiles added to each split.
BG_PER_SPLIT = 50
# Seed for the module-level RNG so the shuffle/split is repeatable.
SEED = 42
# Class names written to dataset.yaml; the list index is the class id.
CLASS_NAMES = ["car", "van", "truck", "bus"]
# Staging directory that is rebuilt on every run, and the archive made from it.
OUT = Path("dataset_lanov")
ZIP_OUT = Path("dataset_lanov.zip")

random.seed(SEED)

# ---------------------------------------------------------------------------
# Data sources: (image dir, label dir, prefix used to keep file names unique)
# ---------------------------------------------------------------------------
SOURCES = [
    ("data/lanov_custom/train/images", "data/lanov_custom/train/labels", "lanov"),
]

# Optional sources — same YOLO format, same classes. A source is added only
# when both its images/ and labels/ directories are present.
for name, path in [
    ("hk", "data/hk_custom/train"),
    ("visdrone", "data/yolo_visdrone/train"),
]:
    img_dir = Path(path) / "images"
    lbl_dir = Path(path) / "labels"
    if img_dir.is_dir() and lbl_dir.is_dir():
        # A non-empty label file counts as an annotated tile.
        n = sum(
            1 for lbl_file in lbl_dir.glob("*.txt") if lbl_file.stat().st_size > 0
        )
        print(f"+ {name}: {n} anotovaných dlaždic")
        SOURCES.append((str(img_dir), str(lbl_dir), name))
    else:
        print(f"- {name}: nenalezeno, přeskočeno")

# ---------------------------------------------------------------------------
# Load every label file from every source and split into train / val
# ---------------------------------------------------------------------------
all_annotated = []   # (lbl_path, img_path, prefix) — label file is non-empty
all_background = []  # same tuple shape — label file is empty

for img_dir, lbl_dir, prefix in SOURCES:
    img_dir = Path(img_dir)
    lbl_dir = Path(lbl_dir)
    # sorted() gives a stable order before the shuffle below.
    for lbl in sorted(lbl_dir.glob("*.txt")):
        # Look for the matching image: .jpg first, then .png.
        candidates = (img_dir / f"{lbl.stem}{ext}" for ext in (".jpg", ".png"))
        img = next((c for c in candidates if c.exists()), None)
        # NOTE(review): a label with no matching image is still kept (with
        # img_path None) and therefore counted in the split sizes printed
        # below — confirm this is intended.
        entry = (lbl, img, prefix)
        if lbl.stat().st_size > 0:
            all_annotated.append(entry)
        else:
            all_background.append(entry)

random.shuffle(all_annotated)
random.shuffle(all_background)

# At least one annotated tile always goes to validation.
n_val = max(1, int(len(all_annotated) * VAL_RATIO))
val_ann = all_annotated[:n_val]
train_ann = all_annotated[n_val:]
# Background tiles: the first BG_PER_SPLIT go to train, the next BG_PER_SPLIT
# to val; any remainder is not used.
train_bg = all_background[:BG_PER_SPLIT]
val_bg = all_background[BG_PER_SPLIT : BG_PER_SPLIT * 2]

print(f"\nAnotované: {len(all_annotated)} (train {len(train_ann)}, val {len(val_ann)})")
print(f"Background: {len(all_background)} (train {len(train_bg)}, val {len(val_bg)})")

# ---------------------------------------------------------------------------
# Copy the selected files into OUT/
# ---------------------------------------------------------------------------
# Start from a clean staging directory so files from a previous run cannot
# leak into the new archive.
if OUT.exists():
    shutil.rmtree(OUT)

for split_name, split in [("train", train_ann + train_bg), ("val", val_ann + val_bg)]:
    img_out = OUT / split_name / "images"
    lbl_out = OUT / split_name / "labels"
    img_out.mkdir(parents=True, exist_ok=True)
    lbl_out.mkdir(parents=True, exist_ok=True)
    for lbl_path, img_path, prefix in split:
        # The source prefix keeps names unique when two sources share a stem.
        stem = f"{prefix}_{lbl_path.stem}"
        shutil.copy2(lbl_path, lbl_out / f"{stem}.txt")
        # img_path is None when no matching image was found for the label.
        if img_path is not None:
            shutil.copy2(img_path, img_out / f"{stem}{img_path.suffix}")

# ---------------------------------------------------------------------------
# dataset.yaml
# ---------------------------------------------------------------------------
# The Python list repr of CLASS_NAMES (['car', ...]) is written as-is; it is
# also a valid YAML flow sequence.
# NOTE(review): "path: ." is relative; depending on the Ultralytics version
# it may be resolved against the working directory or the configured datasets
# directory rather than the folder containing dataset.yaml — verify against
# the "yolo train data=lanov/dataset.yaml" invocation printed by this script.
(OUT / "dataset.yaml").write_text(
    f"path: .\ntrain: train/images\nval: val/images\nnc: {len(CLASS_NAMES)}\nnames: {CLASS_NAMES}\n",
    encoding="utf-8",
)

# ---------------------------------------------------------------------------
# ZIP
# ---------------------------------------------------------------------------
if ZIP_OUT.exists():
    ZIP_OUT.unlink()

# Optional starting weights; bundled at the archive root when present.
weights = Path("best.pt")
with zipfile.ZipFile(ZIP_OUT, "w", zipfile.ZIP_DEFLATED) as zf:
    # Everything under OUT/ is stored under a top-level "lanov/" folder.
    for f in sorted(OUT.rglob("*")):
        if f.is_file():
            # as_posix() yields forward slashes, the separator used in ZIP names.
            zf.write(f, "lanov/" + f.relative_to(OUT).as_posix())
    if weights.exists():
        zf.write(weights, "best.pt")
        print("Přidáno: best.pt")

size_mb = ZIP_OUT.stat().st_size / 1e6
print(f"\nHotovo: {ZIP_OUT} ({size_mb:.1f} MB)")
# Usage hint for the training server.
# NOTE(review): the hint references model=best.pt even when best.pt was not
# found and therefore not added to the archive.
print("\nNa serveru spusť:")
print(f" unzip {ZIP_OUT}")
print(" pip install ultralytics")
print(" yolo train model=best.pt data=lanov/dataset.yaml epochs=50 imgsz=256 batch=32")