Robopipe · stranavad · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026
diff --git a/apps/api/src/modules/training-external/services/training-external.service.ts b/apps/api/src/modules/training-external/services/training-external.service.ts
@@ -323,7 +323,7 @@ export class TrainingExternalService {
       // the N1-style "custom attachment" path.
       const instancePolicy: protos.google.cloud.batch.v1.AllocationPolicy.IInstancePolicy = {
         machineType: mlBatchMachineType,
-        bootDisk: { sizeGb: String(mlBatchBootDiskGb) },
+        bootDisk: { sizeGb: String(mlBatchBootDiskGb), type: "pd-ssd" },
         provisioningModel: "SPOT",
       };
       if (mlBatchGpuType && mlBatchGpuCount > 0) {
@@ -381,7 +381,7 @@ export class TrainingExternalService {
                 memoryMib: mlBatchTaskMemoryMib,
               },
               maxRunDuration: { seconds: String(mlBatchMaxRunSeconds) },
-              maxRetryCount: 3,
+              maxRetryCount: 10,
               lifecyclePolicies: [
                 {
                   action: protos.google.cloud.batch.v1.LifecyclePolicy.Action.RETRY_TASK,

diff --git a/apps/ml-yolo/Dockerfile b/apps/ml-yolo/Dockerfile
@@ -24,6 +24,16 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 RUN pip install --no-cache-dir --upgrade pip
 
 COPY requirements.txt .
+# The main env MUST install the CUDA build of torch (the PyPI default,
+# which pulls the ~2.5GB nvidia-* wheels). Do NOT add the
+# download.pytorch.org/whl/cpu index here: `2.8.0+cpu` satisfies the
+# `torch==2.8.0` pin and pip prefers it, but +cpu wheels are compiled
+# without CUDA support entirely — torch.cuda.is_available() is then
+# false no matter what the host provides, and Ultralytics silently
+# trains on CPU. Cloud Batch's installGpuDrivers only supplies the
+# kernel driver + libcuda.so; the CUDA user-space libs (cuBLAS, cuDNN,
+# NCCL, ...) ship exclusively in these wheels. The tools venv below is
+# different: export runs on CPU by design, so +cpu is correct there.
 RUN pip install --no-cache-dir --timeout=120 --retries=5 -r requirements.txt
 
 # luxonis/tools — used at export time to convert Ultralytics .pt to a
@@ -57,7 +67,11 @@ RUN python3 -m venv /opt/tools-venv && \
     git checkout edbe7da1a7f75833a71d65caf1028036faa81061 && \
     git submodule update --init --recursive && \
     PIP_CONSTRAINT=constraints.txt /opt/tools-venv/bin/pip install --no-cache-dir --timeout=120 --retries=5 . && \
-    rm -rf /tmp/luxonis-tools
+    rm -rf /tmp/luxonis-tools && \
+    # Purge heavy build dependencies after compilation and git clones are done
+    apt-get purge -y build-essential cmake git && \
+    apt-get autoremove -y && \
+    rm -rf /var/lib/apt/lists/*
 
 ENV ROBOPIPE_TOOLS_BIN=/opt/tools-venv/bin/tools
 
@@ -130,30 +144,36 @@ ENV IN_DOCKER=1
 ENV ROBOPIPE_MODELCONVERTER_BIN=/opt/modelconverter-venv/bin/modelconverter
 ENV ROBOPIPE_SNPE_ROOT=/opt/snpe
 
-# Pre-download all yolo11 detection + segmentation pretrained weights into the
-# image. Ultralytics' YOLO(name) constructor calls safe_download() which has a
-# known silent-failure path: if a retry deletes a partial file and the loop
-# exits without raising, attempt_download_asset() returns a Path to a
-# non-existent file and the next torch.load() raises FileNotFoundError. By
-# baking the weights at /app (the WORKDIR), YOLO()'s first existence check
-# (`Path(file).exists()` against cwd) hits and skips the download entirely.
-# Pin to v8.3.0 — the release tag that hosts the yolo11 family weights.
+# Pre-download all yolo11 detection, segmentation, and classification pretrained
+# weights into the image. Ultralytics' YOLO(name) constructor calls
+# safe_download() which has a known silent-failure path: if a retry deletes a
+# partial file and the loop exits without raising, attempt_download_asset()
+# returns a Path to a non-existent file and the next torch.load() raises
+# FileNotFoundError. By baking the weights at /app (the WORKDIR), YOLO()'s first
+# existence check (`Path(file).exists()` against cwd) hits and skips the
+# download entirely. Pin to v8.3.0 — the release tag that hosts the yolo11 family.
 RUN cd /app && for v in \
     yolo11n yolo11s yolo11m yolo11l yolo11x \
     yolo11n-seg yolo11s-seg yolo11m-seg yolo11l-seg yolo11x-seg \
+    yolo11n-cls yolo11s-cls yolo11m-cls yolo11l-cls yolo11x-cls \
     ; do \
     curl -fL --retry 3 -o ${v}.pt \
         https://github.com/ultralytics/assets/releases/download/v8.3.0/${v}.pt; \
     done
 
+# Bake Ultralytics fonts into the image. Ultralytics downloads Arial.ttf and
+# Arial.Unicode.ttf at runtime if missing from $YOLO_CONFIG_DIR. By baking them
+# into a persistent /opt/ultralytics path, we save ~3-5s of start-up latency.
+ENV YOLO_CONFIG_DIR=/opt/ultralytics
+RUN mkdir -p ${YOLO_CONFIG_DIR} && \
+    curl -fL --retry 3 -o ${YOLO_CONFIG_DIR}/Arial.ttf https://ultralytics.com/assets/Arial.ttf && \
+    curl -fL --retry 3 -o ${YOLO_CONFIG_DIR}/Arial.Unicode.ttf https://ultralytics.com/assets/Arial.Unicode.ttf
+
 COPY app/ ./app/
 
 RUN mkdir -p /app/export /app/dataset
 
 ENV PYTHONUNBUFFERED=1
-# Cloud Batch containers don't always have a writable $HOME; pin Ultralytics'
-# settings dir to /tmp to avoid the "not writable" warning on every run.
-ENV YOLO_CONFIG_DIR=/tmp/Ultralytics
 
 # Default to Cloud Batch job mode. Docker-compose overrides CMD to run the
 # FastAPI server for local HTTP testing.

diff --git a/apps/ml-yolo/app/ml/dataset.py b/apps/ml-yolo/app/ml/dataset.py
@@ -1,9 +1,11 @@
+import hashlib
 import math
 import random
 import requests
 import shutil
 import yaml
 import os
+from concurrent.futures import ThreadPoolExecutor
 
 from ..models.dataset_config import DatasetConfig
 from ..models.image import Image
@@ -23,7 +25,7 @@
 def copy_image(image: Image, dest: str):
     image_path = image.file_url
     if image_path.startswith("http://") or image_path.startswith("https://"):
-        response = requests.get(image_path, stream=True)
+        response = requests.get(image_path, stream=True, timeout=30)
         if response.status_code == 200:
             filename = os.path.basename(image_path)
             dest_path = os.path.join(dest, filename)
@@ -57,9 +59,24 @@ def prepare_dataset_config(config: DatasetConfig, dir: str):
         yaml.dump({"names": get_label_mapping(config)}, f)
 
 
-def iterate_datasets(images: list[Image], config: DatasetConfig):
-    shuffled_images = list(images)
-    random.shuffle(shuffled_images)
+def split_seed(seed_source: str) -> int:
+    """Derive a deterministic RNG seed from a stable string (the model id).
+
+    sha256, not the built-in hash() — hash() is salted per process
+    (PYTHONHASHSEED), so it would produce a different seed on every VM and
+    defeat the purpose.
+    """
+    return int.from_bytes(hashlib.sha256(seed_source.encode()).digest()[:8], "big")
+
+
+def iterate_datasets(images: list[Image], config: DatasetConfig, seed: int):
+    # The split must be identical across Cloud Batch task attempts: a Spot
+    # retry resumes from a checkpoint, and a re-drawn split would leak
+    # already-trained images into val/test. Sort by file_url first so the
+    # result is also independent of payload ordering, then shuffle with a
+    # seeded RNG isolated from the global `random` state.
+    shuffled_images = sorted(images, key=lambda image: image.file_url)
+    random.Random(seed).shuffle(shuffled_images)
     train_split, val_split, _ = (s / 100.0 for s in config.dataset_split)
     total_images = len(images)
     train_end = int(total_images * train_split)
@@ -86,7 +103,11 @@ def prepare_classification_directory(
 
 
 def prepare_dataset(
-    dir: str, images: list[Image], config: DatasetConfig, task_type: ModelType
+    dir: str,
+    images: list[Image],
+    config: DatasetConfig,
+    task_type: ModelType,
+    seed: int,
 ):
     global VAL_DIR
     dir = f"{dir}/{DATASET_DIR}"
@@ -102,7 +123,8 @@ def prepare_dataset(
         prepare_dirs(label_dir)
         prepare_dataset_config(config, dir)
 
-    for image, curr_dir in iterate_datasets(images, config):
+    def _download_task(args):
+        image, curr_dir = args
         if task_type == ModelType.CLASSIFICATION:
             label_name = label_mapping[image.labels[0].label.label_number]
             copy_image(image, f"{dir}/{curr_dir}/{label_name}")
@@ -113,3 +135,11 @@ def prepare_dataset(
             copy_image(image, f"{image_dir}/{curr_dir}")
             with open(f"{label_dir}/{curr_dir}/{label_filename}", "w") as f:
                 f.write("\n".join(image.labels_str(task_type)))
+
+    tasks = list(iterate_datasets(images, config, seed))
+    max_workers = min(32, len(tasks) or 1)
+    print(f"[ml-yolo] Downloading {len(tasks)} images using {max_workers} workers...")
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        list(executor.map(_download_task, tasks))
+    print(f"[ml-yolo] Dataset preparation complete.")
+