Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ export class TrainingExternalService {
// the N1-style "custom attachment" path.
const instancePolicy: protos.google.cloud.batch.v1.AllocationPolicy.IInstancePolicy = {
machineType: mlBatchMachineType,
bootDisk: { sizeGb: String(mlBatchBootDiskGb) },
bootDisk: { sizeGb: String(mlBatchBootDiskGb), type: "pd-ssd" },
provisioningModel: "SPOT",
};
if (mlBatchGpuType && mlBatchGpuCount > 0) {
Expand Down Expand Up @@ -381,7 +381,7 @@ export class TrainingExternalService {
memoryMib: mlBatchTaskMemoryMib,
},
maxRunDuration: { seconds: String(mlBatchMaxRunSeconds) },
maxRetryCount: 3,
maxRetryCount: 10,
lifecyclePolicies: [
{
action: protos.google.cloud.batch.v1.LifecyclePolicy.Action.RETRY_TASK,
Expand Down
44 changes: 32 additions & 12 deletions apps/ml-yolo/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,16 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
RUN pip install --no-cache-dir --upgrade pip

COPY requirements.txt .
# The main env MUST install the CUDA build of torch (the PyPI default,
# which pulls the ~2.5GB nvidia-* wheels). Do NOT add the
# download.pytorch.org/whl/cpu index here: `2.8.0+cpu` satisfies the
# `torch==2.8.0` pin and pip prefers it, but +cpu wheels are compiled
# without CUDA support entirely — torch.cuda.is_available() is then
# false no matter what the host provides, and Ultralytics silently
# trains on CPU. Cloud Batch's installGpuDrivers only supplies the
# kernel driver + libcuda.so; the CUDA user-space libs (cuBLAS, cuDNN,
# NCCL, ...) ship exclusively in these wheels. The tools venv below is
# different: export runs on CPU by design, so +cpu is correct there.
RUN pip install --no-cache-dir --timeout=120 --retries=5 -r requirements.txt

# luxonis/tools — used at export time to convert Ultralytics .pt to a
Expand Down Expand Up @@ -57,7 +67,11 @@ RUN python3 -m venv /opt/tools-venv && \
git checkout edbe7da1a7f75833a71d65caf1028036faa81061 && \
git submodule update --init --recursive && \
PIP_CONSTRAINT=constraints.txt /opt/tools-venv/bin/pip install --no-cache-dir --timeout=120 --retries=5 . && \
rm -rf /tmp/luxonis-tools
rm -rf /tmp/luxonis-tools && \
# Purge heavy build dependencies after compilation and git clones are done
apt-get purge -y build-essential cmake git && \
apt-get autoremove -y && \
rm -rf /var/lib/apt/lists/*

ENV ROBOPIPE_TOOLS_BIN=/opt/tools-venv/bin/tools

Expand Down Expand Up @@ -130,30 +144,36 @@ ENV IN_DOCKER=1
ENV ROBOPIPE_MODELCONVERTER_BIN=/opt/modelconverter-venv/bin/modelconverter
ENV ROBOPIPE_SNPE_ROOT=/opt/snpe

# Pre-download all yolo11 detection + segmentation pretrained weights into the
# image. Ultralytics' YOLO(name) constructor calls safe_download() which has a
# known silent-failure path: if a retry deletes a partial file and the loop
# exits without raising, attempt_download_asset() returns a Path to a
# non-existent file and the next torch.load() raises FileNotFoundError. By
# baking the weights at /app (the WORKDIR), YOLO()'s first existence check
# (`Path(file).exists()` against cwd) hits and skips the download entirely.
# Pin to v8.3.0 — the release tag that hosts the yolo11 family weights.
# Pre-download all yolo11 detection, segmentation, and classification pretrained
# weights into the image. Ultralytics' YOLO(name) constructor calls
# safe_download() which has a known silent-failure path: if a retry deletes a
# partial file and the loop exits without raising, attempt_download_asset()
# returns a Path to a non-existent file and the next torch.load() raises
# FileNotFoundError. By baking the weights at /app (the WORKDIR), YOLO()'s first
# existence check (`Path(file).exists()` against cwd) hits and skips the
# download entirely. Pin to v8.3.0 — the release tag that hosts the yolo11 family.
RUN cd /app && for v in \
yolo11n yolo11s yolo11m yolo11l yolo11x \
yolo11n-seg yolo11s-seg yolo11m-seg yolo11l-seg yolo11x-seg \
yolo11n-cls yolo11s-cls yolo11m-cls yolo11l-cls yolo11x-cls \
; do \
curl -fL --retry 3 -o ${v}.pt \
https://github.com/ultralytics/assets/releases/download/v8.3.0/${v}.pt; \
done

# Bake Ultralytics fonts into the image. Ultralytics downloads Arial.ttf and
# Arial.Unicode.ttf at runtime if missing from $YOLO_CONFIG_DIR. By baking them
# into a persistent /opt/ultralytics path, we save ~3-5s of start-up latency.
ENV YOLO_CONFIG_DIR=/opt/ultralytics
RUN mkdir -p ${YOLO_CONFIG_DIR} && \
curl -fL --retry 3 -o ${YOLO_CONFIG_DIR}/Arial.ttf https://ultralytics.com/assets/Arial.ttf && \
curl -fL --retry 3 -o ${YOLO_CONFIG_DIR}/Arial.Unicode.ttf https://ultralytics.com/assets/Arial.Unicode.ttf

COPY app/ ./app/

RUN mkdir -p /app/export /app/dataset

ENV PYTHONUNBUFFERED=1
# Cloud Batch containers don't always have a writable $HOME; pin Ultralytics'
# settings dir to /tmp to avoid the "not writable" warning on every run.
ENV YOLO_CONFIG_DIR=/tmp/Ultralytics

# Default to Cloud Batch job mode. Docker-compose overrides CMD to run the
# FastAPI server for local HTTP testing.
Expand Down
42 changes: 36 additions & 6 deletions apps/ml-yolo/app/ml/dataset.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import hashlib
import math
import random
import requests
import shutil
import yaml
import os
from concurrent.futures import ThreadPoolExecutor

from ..models.dataset_config import DatasetConfig
from ..models.image import Image
Expand All @@ -23,7 +25,7 @@
def copy_image(image: Image, dest: str):
image_path = image.file_url
if image_path.startswith("http://") or image_path.startswith("https://"):
response = requests.get(image_path, stream=True)
response = requests.get(image_path, stream=True, timeout=30)
if response.status_code == 200:
filename = os.path.basename(image_path)
dest_path = os.path.join(dest, filename)
Expand Down Expand Up @@ -57,9 +59,24 @@ def prepare_dataset_config(config: DatasetConfig, dir: str):
yaml.dump({"names": get_label_mapping(config)}, f)


def iterate_datasets(images: list[Image], config: DatasetConfig):
shuffled_images = list(images)
random.shuffle(shuffled_images)
def split_seed(seed_source: str) -> int:
"""Derive a deterministic RNG seed from a stable string (the model id).

sha256, not the built-in hash() — hash() is salted per process
(PYTHONHASHSEED), so it would produce a different seed on every VM and
defeat the purpose.
"""
return int.from_bytes(hashlib.sha256(seed_source.encode()).digest()[:8], "big")


def iterate_datasets(images: list[Image], config: DatasetConfig, seed: int):
# The split must be identical across Cloud Batch task attempts: a Spot
# retry resumes from a checkpoint, and a re-drawn split would leak
# already-trained images into val/test. Sort by file_url first so the
# result is also independent of payload ordering, then shuffle with a
# seeded RNG isolated from the global `random` state.
shuffled_images = sorted(images, key=lambda image: image.file_url)
random.Random(seed).shuffle(shuffled_images)
train_split, val_split, _ = (s / 100.0 for s in config.dataset_split)
total_images = len(images)
train_end = int(total_images * train_split)
Expand All @@ -86,7 +103,11 @@ def prepare_classification_directory(


def prepare_dataset(
dir: str, images: list[Image], config: DatasetConfig, task_type: ModelType
dir: str,
images: list[Image],
config: DatasetConfig,
task_type: ModelType,
seed: int,
):
global VAL_DIR
dir = f"{dir}/{DATASET_DIR}"
Expand All @@ -102,7 +123,8 @@ def prepare_dataset(
prepare_dirs(label_dir)
prepare_dataset_config(config, dir)

for image, curr_dir in iterate_datasets(images, config):
def _download_task(args):
image, curr_dir = args
if task_type == ModelType.CLASSIFICATION:
label_name = label_mapping[image.labels[0].label.label_number]
copy_image(image, f"{dir}/{curr_dir}/{label_name}")
Expand All @@ -113,3 +135,11 @@ def prepare_dataset(
copy_image(image, f"{image_dir}/{curr_dir}")
with open(f"{label_dir}/{curr_dir}/{label_filename}", "w") as f:
f.write("\n".join(image.labels_str(task_type)))

tasks = list(iterate_datasets(images, config, seed))
max_workers = min(32, len(tasks) or 1)
print(f"[ml-yolo] Downloading {len(tasks)} images using {max_workers} workers...")
with ThreadPoolExecutor(max_workers=max_workers) as executor:
list(executor.map(_download_task, tasks))
print(f"[ml-yolo] Dataset preparation complete.")

Loading