-
Notifications
You must be signed in to change notification settings - Fork 21
feat(IS-667): Add automatic cleanup of outdated ExApp Docker images #837
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,96 @@ | ||
| <?php | ||
|
|
||
| declare(strict_types=1); | ||
|
|
||
| /** | ||
| * SPDX-FileCopyrightText: 2026 Nextcloud GmbH and Nextcloud contributors | ||
| * SPDX-License-Identifier: AGPL-3.0-or-later | ||
| */ | ||
|
|
||
| namespace OCA\AppAPI\BackgroundJob; | ||
|
|
||
| use OCA\AppAPI\AppInfo\Application; | ||
| use OCA\AppAPI\Db\DaemonConfig; | ||
| use OCA\AppAPI\DeployActions\DockerActions; | ||
| use OCA\AppAPI\Service\DaemonConfigService; | ||
| use OCP\AppFramework\Utility\ITimeFactory; | ||
| use OCP\BackgroundJob\TimedJob; | ||
| use OCP\IAppConfig; | ||
| use Psr\Log\LoggerInterface; | ||
|
|
||
| class DockerImageCleanupJob extends TimedJob { | ||
| private const SECONDS_PER_DAY = 86400; | ||
|
|
||
public function __construct(
	ITimeFactory $time,
	private readonly IAppConfig $appConfig,
	private readonly DaemonConfigService $daemonConfigService,
	private readonly DockerActions $dockerActions,
	private readonly LoggerInterface $logger,
) {
	parent::__construct($time);
	// The interval is re-read each time the job class is instantiated by cron,
	// so an admin change to the setting takes effect without re-registration.
	$configuredDays = $this->appConfig->getValueInt(
		Application::APP_ID, Application::CONF_IMAGE_CLEANUP_INTERVAL_DAYS, 7, lazy: true
	);
	// Clamp to at least one day: a value of 0 means "disabled", which is
	// enforced inside run(); the job itself still ticks daily in that case.
	$this->setInterval(max(1, $configuredDays) * self::SECONDS_PER_DAY);
}
|
|
||
/**
 * Background-job entry point: prune outdated Docker images on all
 * registered daemons, unless the admin disabled cleanup (interval = 0).
 */
protected function run($argument): void {
	// Interval 0 is the "off" switch: the job still fires, but no-ops here.
	if (!$this->isCleanupDisabled()) {
		$this->pruneImagesOnAllDaemons();
	}
}
|
|
||
/**
 * An interval of exactly 0 days is the admin's way of disabling cleanup.
 */
private function isCleanupDisabled(): bool {
	return 0 === $this->appConfig->getValueInt(
		Application::APP_ID, Application::CONF_IMAGE_CLEANUP_INTERVAL_DAYS, 7, lazy: true
	);
}
|
|
||
/**
 * Prunes unused Docker images on every registered daemon that supports it.
 *
 * Errors on one daemon are logged and do not block cleanup on the others.
 */
private function pruneImagesOnAllDaemons(): void {
	$daemons = $this->daemonConfigService->getRegisteredDaemonConfigs();
	foreach ($daemons as $daemon) {
		if (!$this->supportsPrune($daemon)) {
			continue;
		}

		try {
			$this->dockerActions->initGuzzleClient($daemon);
			$dockerUrl = $this->dockerActions->buildDockerUrl($daemon);
			// dangling=false removes every image not referenced by a container
			// (the equivalent of `docker image prune -a`). With dangling=true,
			// superseded ExApp images remain tagged (e.g. some_exapp:1.0) and
			// would never be reclaimed, defeating the purpose of this job.
			// NOTE(review): this also removes unused non-AppAPI images on a
			// shared Docker host — document this in the admin setting.
			$result = $this->dockerActions->pruneImages($dockerUrl, ['dangling' => ['false']]);

			if (isset($result['error'])) {
				$this->logger->error(sprintf(
					'Image prune failed for daemon "%s": %s',
					$daemon->getName(), $result['error']
				));
			}
		} catch (\Exception $e) {
			// Best-effort per daemon: log and continue with the next one.
			$this->logger->error(sprintf(
				'Exception during image prune for daemon "%s": %s',
				$daemon->getName(), $e->getMessage()
			));
		}
	}
}
|
|
||
/**
 * Whether image pruning can be attempted against the given daemon.
 *
 * Only plain Docker-type daemons qualify; HaRP-fronted daemons are
 * excluded because their proxy does not expose /images/prune yet.
 */
private function supportsPrune(DaemonConfig $daemon): bool {
	if ($daemon->getAcceptsDeployId() !== DockerActions::DEPLOY_ID) {
		return false;
	}

	// HaRP daemons are Docker-type but route through a proxy that doesn't
	// support the /images/prune endpoint yet. Skip until upstream HaRP adds it.
	$deployConfig = $daemon->getDeployConfig();
	if (!empty($deployConfig['harp'])) {
		$this->logger->debug(sprintf(
			'Skipping image prune for HaRP daemon "%s" (not yet supported)',
			$daemon->getName()
		));
		return false;
	}

	return true;
}
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,6 +9,8 @@ | |
|
|
||
| namespace OCA\AppAPI\Command\ExApp; | ||
|
|
||
| use OCA\AppAPI\AppInfo\Application; | ||
| use OCA\AppAPI\Db\DaemonConfig; | ||
| use OCA\AppAPI\DeployActions\DockerActions; | ||
| use OCA\AppAPI\DeployActions\KubernetesActions; | ||
| use OCA\AppAPI\DeployActions\ManualActions; | ||
|
|
@@ -19,6 +21,7 @@ | |
|
|
||
| use OCA\AppAPI\Service\ExAppDeployOptionsService; | ||
| use OCA\AppAPI\Service\ExAppService; | ||
| use OCP\IAppConfig; | ||
| use Psr\Log\LoggerInterface; | ||
| use Symfony\Component\Console\Command\Command; | ||
| use Symfony\Component\Console\Input\InputArgument; | ||
|
|
@@ -39,6 +42,7 @@ public function __construct( | |
| private readonly ExAppArchiveFetcher $exAppArchiveFetcher, | ||
| private readonly ExAppFetcher $exAppFetcher, | ||
| private readonly ExAppDeployOptionsService $exAppDeployOptionsService, | ||
| private readonly IAppConfig $appConfig, | ||
| ) { | ||
| parent::__construct(); | ||
| } | ||
|
|
@@ -208,7 +212,13 @@ private function updateExApp(InputInterface $input, OutputInterface $output, str | |
| $auth = []; | ||
| $harpK8sUrl = null; | ||
| $k8sRoles = []; | ||
| $oldImageName = ''; | ||
| if ($daemonConfig->getAcceptsDeployId() === $this->dockerActions->getAcceptsDeployId()) { | ||
| // Capture old image name before deploying the new version | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I want to make sure I'm reading this right: $oldAppInfo = array_merge($appInfo, ['version' => $exApp->getVersion()]);
$oldDeployParams = $this->dockerActions->buildDeployParams($daemonConfig, $oldAppInfo);
$oldImageName = $this->dockerActions->buildBaseImageName($oldDeployParams['image_params'], $daemonConfig);
Can you describe, in terms of what (a) ExApp author shipped v1.0 as (b) Admin added a new In each case: what image ref is passed into
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for catching this, actually I would say this is an implementation error. (a) $oldImageName resolves to the new image rather than the old one because I am reconstructing it from the new manifest instead of using the data from the original install. (b) The same problem exists here: both the registry mapping and the image tag are derived from the current configuration/new manifest, rather than the originally deployed metadata. Summary: In both cases, there is no damage since consequently, docker returns a 409 error (image in use by the new container), and nothing is deleted, but no cleanup occurs. |
||
| $oldAppInfo = array_merge($appInfo, ['version' => $exApp->getVersion()]); | ||
| $oldDeployParams = $this->dockerActions->buildDeployParams($daemonConfig, $oldAppInfo); | ||
| $oldImageName = $this->dockerActions->buildBaseImageName($oldDeployParams['image_params'], $daemonConfig); | ||
|
|
||
| $deployParams = $this->dockerActions->buildDeployParams($daemonConfig, $appInfo); | ||
| if (boolval($exApp->getDeployConfig()['harp'] ?? false)) { | ||
| $deployResult = $this->dockerActions->deployExAppHarp($exApp, $daemonConfig, $deployParams); | ||
|
|
@@ -314,6 +324,8 @@ private function updateExApp(InputInterface $input, OutputInterface $output, str | |
| $output->writeln(sprintf('ExApp %s update successfully deployed.', $appId)); | ||
| } | ||
|
|
||
| $this->removeOldImageIfEnabled($oldImageName, $daemonConfig, $appId); | ||
|
|
||
| $this->service->dispatchExAppInitInternal($exApp); | ||
| if ($input->getOption('wait-finish')) { | ||
| $error = $this->exAppService->waitInitStepFinish($appId); | ||
|
|
@@ -342,4 +354,28 @@ private function updateExApp(InputInterface $input, OutputInterface $output, str | |
| return 0; | ||
| } | ||
|
|
||
/**
 * Removes the previous ExApp image after a successful update, when the
 * admin has opted in via the image-cleanup-on-update setting.
 *
 * Failures are logged only; cleanup must never block the update itself.
 */
private function removeOldImageIfEnabled(string $oldImageName, DaemonConfig $daemonConfig, string $appId): void {
	if (empty($oldImageName)) {
		return;
	}
	// Image cleanup only applies to Docker daemons
	if ($daemonConfig->getAcceptsDeployId() !== $this->dockerActions->getAcceptsDeployId()) {
		return;
	}

	// HaRP daemons are Docker-type but route through a proxy that doesn't
	// expose the image-removal endpoint yet; skip them, mirroring the
	// supportsPrune() check in DockerImageCleanupJob.
	if (!empty($daemonConfig->getDeployConfig()['harp'])) {
		$this->logger->debug(sprintf(
			'Skipping old image removal for HaRP daemon "%s" (not yet supported)',
			$daemonConfig->getName()
		));
		return;
	}

	$isEnabled = $this->appConfig->getValueBool(
		Application::APP_ID, Application::CONF_IMAGE_CLEANUP_ON_UPDATE, false, lazy: true
	);
	if (!$isEnabled) {
		return;
	}

	$dockerUrl = $this->dockerActions->buildDockerUrl($daemonConfig);
	// removeImage() returns a non-empty error string on failure, empty on success.
	$result = $this->dockerActions->removeImage($dockerUrl, $oldImageName);
	if ($result) {
		$this->logger->warning(sprintf('Old image cleanup for %s: %s', $appId, $result));
	} else {
		$this->logger->info(sprintf('Old image %s removed after updating %s', $oldImageName, $appId));
	}
}
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,65 @@ | ||
| # Plan: Automatic Clean-up of Outdated ExApp Docker Images | ||
|
|
||
| **Spec:** [spec-docker-image-cleanup.md](./spec-docker-image-cleanup.md) | ||
| **Issue:** [nextcloud/app_api#667](https://github.com/nextcloud/app_api/issues/667) | ||
|
|
||
| --- | ||
|
|
||
| ## Phases | ||
|
|
||
| ### Phase 1: Docker API Methods ✅ DONE | ||
|
|
||
| | Task | File | What | | ||
| |------|------|------| | ||
| | `removeImage()` | `DockerActions.php` | `DELETE /images/{id}`, uses `Http::STATUS_*` constants | | ||
| | `pruneImages()` | `DockerActions.php` | `POST /images/prune`, uses `Util::humanFileSize()` | | ||
| | Config constants | `Application.php` | `CONF_IMAGE_CLEANUP_INTERVAL_DAYS`, `CONF_IMAGE_CLEANUP_ON_UPDATE` | | ||
|
|
||
| ### Phase 2: Background Job ✅ DONE | ||
|
|
||
| | Task | File | What | | ||
| |------|------|------| | ||
| | `DockerImageCleanupJob` | New file | Dynamic `setInterval()` from config, `isCleanupDisabled()`, `pruneImagesOnAllDaemons()`, `supportsPrune()` | | ||
| | Register job | `info.xml` | Added to `<background-jobs>` | | ||
|
|
||
| ### Phase 3: Cleanup on Update ✅ DONE | ||
|
|
||
| | Task | File | What | | ||
| |------|------|------| | ||
| | Old image removal | `Update.php` | Added `IAppConfig` dep, captures old image before deploy, `removeOldImageIfEnabled()` after success | | ||
|
|
||
| ### Phase 4: Admin Settings ✅ DONE | ||
|
|
||
| | Task | File | What | | ||
| |------|------|------| | ||
| | Initial state | `Admin.php` | Added both settings to `$adminInitialData` | | ||
| | UI controls | `AdminSettings.vue` | Interval input + checkbox, uses existing `onInput()`/`onCheckboxChanged()` | | ||
|
|
||
| --- | ||
|
|
||
| ## Verification | ||
|
|
||
| - [x] `removeImage()` handles 200, 404, 409 correctly | ||
| - [x] `pruneImages()` sends correct filters, logs space reclaimed | ||
| - [x] Background job disabled when interval = 0 | ||
| - [x] Background job skips non-Docker and HaRP daemons | ||
| - [x] Background job errors don't block other daemons | ||
| - [x] Update command only removes old image when checkbox enabled | ||
| - [x] Update cleanup failure doesn't block the update | ||
| - [x] Admin UI displays and saves both settings | ||
| - [x] Config keys centralized as constants | ||
| - [x] Registered in `info.xml` | ||
| - [x] Psalm: 0 errors | ||
| - [x] CS Fixer: 0 issues | ||
| - [x] Tests: 25 passing (8 + 11 + 6) | ||
|
|
||
| --- | ||
|
|
||
| ## Risks | ||
|
|
||
| | Risk | Mitigation | | ||
| |------|------------| | ||
| | DSP haproxy blocks `/images/prune` | Log error, document in release notes | | ||
| | Prune removes non-AppAPI images | Standard `docker image prune` behavior, only dangling images | | ||
| | HaRP doesn't support prune | Skip + log, follow-up when upstream adds it | | ||
| | Shared image between ExApps | Docker refuses with 409, handled gracefully | |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Two questions on scheduling semantics - as we want to document these for the admin:
image_cleanup_interval_days = 0. Your code doesmax($intervalDays, 1) * SECONDS_PER_DAY-->setInterval(86400), andrun()then no-ops viaisCleanupDisabled(). End to end: does the job (a) not execute at all, or (b) execute every day and exit immediately insiderun()? What doesocc background-job:listshow in each case?interval_days = 1, job runs at T, T+1d, T+2d, T+3d. At T+3.5d admin changes it to7. When does the next execution fire - T+3.5d+7d, T+3d+7d, T+4d with an immediate no-op, or something else?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Q1: It’s (b). The job runs daily and exits immediately via run(). Due to the `max($intervalDays, 1)` clamp, the constructor sets a 24-hour interval even if interval_days = 0. Because Nextcloud TimedJobs are registered in info.xml, they cannot be dynamically unregistered to achieve (a). The overhead is minimal, but the job remains active in occ background-job:list.
Q2: The constructor re-evaluates the interval every cron cycle (every 5 mins). If changed to 7 days at T+3.5d:
Next cron: Updates to setInterval(7 days).
Next run: Scheduled for T+10d (Last run at T+3d + 7d interval).
The change applies nearly instantly, and the countdown resets relative to the last execution, not the setting change.