feat: add container restart tracking and crash loop detection

Track container restart counts from Docker and detect crash loops to provide better visibility into application health issues.

- Add restart_count, last_restart_at, and last_restart_type columns to applications table
- Detect restart count increases from Docker inspect data and send notifications
- Show restart count badge in UI with warning icon on Logs navigation
- Distinguish between crash restarts and manual restarts
- Implement 30-second grace period to prevent false "exited" status during crash loops
- Reset restart count on manual stop, restart, and redeploy actions
- Add unit tests for restart count tracking logic

This helps users quickly identify when containers are in crash loops and need attention, even when the container status flickers between states during Docker's restart backoff period.
This commit is contained in:
Andras Bacsai 2025-11-10 13:04:31 +01:00
parent 775216e7a5
commit 68a9f2ca77
9 changed files with 231 additions and 21 deletions

View File

@ -28,6 +28,8 @@ class GetContainersStatus
protected ?Collection $applicationContainerStatuses;
protected ?Collection $applicationContainerRestartCounts;
public function handle(Server $server, ?Collection $containers = null, ?Collection $containerReplicates = null)
{
$this->containers = $containers;
@ -136,6 +138,18 @@ class GetContainersStatus
if ($containerName) {
$this->applicationContainerStatuses->get($applicationId)->put($containerName, $containerStatus);
}
// Track restart counts for applications
$restartCount = data_get($container, 'RestartCount', 0);
if (! isset($this->applicationContainerRestartCounts)) {
$this->applicationContainerRestartCounts = collect();
}
if (! $this->applicationContainerRestartCounts->has($applicationId)) {
$this->applicationContainerRestartCounts->put($applicationId, collect());
}
if ($containerName) {
$this->applicationContainerRestartCounts->get($applicationId)->put($containerName, $restartCount);
}
} else {
// Notify user that this container should not be there.
}
@ -291,7 +305,24 @@ class GetContainersStatus
continue;
}
$application->update(['status' => 'exited']);
// If container was recently restarting (crash loop), keep it as degraded for a grace period
// This prevents false "exited" status during the brief moment between container removal and recreation
$recentlyRestarted = $application->restart_count > 0 &&
$application->last_restart_at &&
$application->last_restart_at->greaterThan(now()->subSeconds(30));
if ($recentlyRestarted) {
// Keep it as degraded if it was recently in a crash loop
$application->update(['status' => 'degraded (unhealthy)']);
} else {
// Reset restart count when application exits completely
$application->update([
'status' => 'exited',
'restart_count' => 0,
'last_restart_at' => null,
'last_restart_type' => null,
]);
}
}
$notRunningApplicationPreviews = $previews->pluck('id')->diff($foundApplicationPreviews);
foreach ($notRunningApplicationPreviews as $previewId) {
@ -340,7 +371,37 @@ class GetContainersStatus
continue;
}
$aggregatedStatus = $this->aggregateApplicationStatus($application, $containerStatuses);
// Track restart counts first
$maxRestartCount = 0;
if (isset($this->applicationContainerRestartCounts) && $this->applicationContainerRestartCounts->has($applicationId)) {
$containerRestartCounts = $this->applicationContainerRestartCounts->get($applicationId);
$maxRestartCount = $containerRestartCounts->max() ?? 0;
$previousRestartCount = $application->restart_count ?? 0;
if ($maxRestartCount > $previousRestartCount) {
// Restart count increased - this is a crash restart
$application->update([
'restart_count' => $maxRestartCount,
'last_restart_at' => now(),
'last_restart_type' => 'crash',
]);
// Send notification
$containerName = $application->name;
$projectUuid = data_get($application, 'environment.project.uuid');
$environmentName = data_get($application, 'environment.name');
$applicationUuid = data_get($application, 'uuid');
if ($projectUuid && $applicationUuid && $environmentName) {
$url = base_url().'/project/'.$projectUuid.'/'.$environmentName.'/application/'.$applicationUuid;
} else {
$url = null;
}
}
}
// Aggregate status after tracking restart counts
$aggregatedStatus = $this->aggregateApplicationStatus($application, $containerStatuses, $maxRestartCount);
if ($aggregatedStatus) {
$statusFromDb = $application->status;
if ($statusFromDb !== $aggregatedStatus) {
@ -355,7 +416,7 @@ class GetContainersStatus
ServiceChecked::dispatch($this->server->team->id);
}
private function aggregateApplicationStatus($application, Collection $containerStatuses): ?string
private function aggregateApplicationStatus($application, Collection $containerStatuses, int $maxRestartCount = 0): ?string
{
// Parse docker compose to check for excluded containers
$dockerComposeRaw = data_get($application, 'docker_compose_raw');
@ -413,6 +474,11 @@ class GetContainersStatus
return 'degraded (unhealthy)';
}
// If container is exited but has restart count > 0, it's in a crash loop
if ($hasExited && $maxRestartCount > 0) {
return 'degraded (unhealthy)';
}
if ($hasRunning && $hasExited) {
return 'degraded (unhealthy)';
}
@ -421,7 +487,7 @@ class GetContainersStatus
return $hasUnhealthy ? 'running (unhealthy)' : 'running (healthy)';
}
// All containers are exited
// All containers are exited with no restart count - truly stopped
return 'exited (unhealthy)';
}
}

View File

@ -94,6 +94,14 @@ class Heading extends Component
return;
}
// Reset restart count on deployment
$this->application->update([
'restart_count' => 0,
'last_restart_at' => null,
'last_restart_type' => null,
]);
$this->setDeploymentUuid();
$result = queue_application_deployment(
application: $this->application,
@ -137,6 +145,14 @@ class Heading extends Component
return;
}
// Reset restart count on manual restart
$this->application->update([
'restart_count' => 0,
'last_restart_at' => now(),
'last_restart_type' => 'manual',
]);
$this->setDeploymentUuid();
$result = queue_application_deployment(
application: $this->application,

View File

@ -121,6 +121,8 @@ class Application extends BaseModel
protected $casts = [
'http_basic_auth_password' => 'encrypted',
'restart_count' => 'integer',
'last_restart_at' => 'datetime',
];
protected static function booted()

View File

@ -0,0 +1,30 @@
<?php
use Illuminate\Database\Migrations\Migration;
use Illuminate\Database\Schema\Blueprint;
use Illuminate\Support\Facades\Schema;
return new class extends Migration
{
    /** Table that receives (and later loses) the restart-tracking columns. */
    private const TABLE = 'applications';

    /**
     * Run the migrations.
     *
     * Adds three columns, each positioned directly after the previous one:
     * restart_count (integer, default 0), last_restart_at (nullable timestamp)
     * and last_restart_type (nullable string, at most 10 characters).
     */
    public function up(): void
    {
        Schema::table(self::TABLE, function (Blueprint $blueprint) {
            // Counter column; existing rows start at zero.
            $restartCount = $blueprint->integer('restart_count');
            $restartCount->default(0)->after('status');

            // Timestamp of the most recent restart; null until one is recorded.
            $lastRestartAt = $blueprint->timestamp('last_restart_at');
            $lastRestartAt->nullable()->after('restart_count');

            // Short label for the kind of restart; null until one is recorded.
            $lastRestartType = $blueprint->string('last_restart_type', 10);
            $lastRestartType->nullable()->after('last_restart_at');
        });
    }

    /**
     * Reverse the migrations.
     *
     * Drops the three restart-tracking columns in a single dropColumn call.
     */
    public function down(): void
    {
        $restartColumns = ['restart_count', 'last_restart_at', 'last_restart_type'];

        Schema::table(self::TABLE, function (Blueprint $blueprint) use ($restartColumns) {
            $blueprint->dropColumn($restartColumns);
        });
    }
};

View File

@ -12,6 +12,13 @@
@else
<x-status.stopped :status="$resource->status" />
@endif
@if (isset($resource->restart_count) && $resource->restart_count > 0 && !str($resource->status)->startsWith('exited'))
<div class="flex items-center pl-2">
<span class="text-xs dark:text-warning" title="Container has restarted {{ $resource->restart_count }} time{{ $resource->restart_count > 1 ? 's' : '' }}. Last restart: {{ $resource->last_restart_at?->diffForHumans() }}">
({{ $resource->restart_count }}x restarts)
</span>
</div>
@endif
@if (!str($resource->status)->contains('exited') && $showRefreshButton)
<button wire:loading.remove.delay.shortest wire:target="manualCheckStatus" title="Refresh Status" wire:click='manualCheckStatus'
class="mx-1 dark:hover:fill-white fill-black dark:fill-warning">

View File

@ -12,7 +12,14 @@
</a>
<a class="{{ request()->routeIs('project.application.logs') ? 'dark:text-white' : '' }}"
href="{{ route('project.application.logs', $parameters) }}">
Logs
<div class="flex items-center gap-1">
Logs
@if ($application->restart_count > 0 && !str($application->status)->startsWith('exited'))
<svg class="w-4 h-4 dark:text-warning" viewBox="0 0 24 24" fill="currentColor" xmlns="http://www.w3.org/2000/svg" title="Container has restarted {{ $application->restart_count }} time{{ $application->restart_count > 1 ? 's' : '' }}">
<path d="M12 2L1 21h22L12 2zm0 4l7.53 13H4.47L12 6zm-1 5v4h2v-4h-2zm0 5v2h2v-2h-2z"/>
</svg>
@endif
</div>
</a>
@if (!$application->destination->server->isSwarm())
@can('canAccessTerminal')

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,82 @@
<?php
use App\Models\Application;
use App\Models\Server;
beforeEach(function () {
    // Server double: each listed method is stubbed to return the paired value.
    $serverMock = Mockery::mock(Server::class);
    $serverStubs = [
        'isFunctional' => true,
        'isSwarm' => false,
        'applications' => collect(),
    ];
    foreach ($serverStubs as $method => $returnValue) {
        $serverMock->shouldReceive($method)->andReturn($returnValue);
    }
    $this->server = $serverMock;

    // Application double: attribute reads go through getAttribute(), so each
    // attribute name is stubbed with a fixed value.
    // NOTE(review): none of the tests visible in this file read $this->server
    // or $this->application — confirm whether this setup is still needed.
    $applicationMock = Mockery::mock(Application::class);
    $applicationAttributes = [
        'id' => 1,
        'name' => 'test-app',
        'restart_count' => 0,
        'uuid' => 'test-uuid',
        'environment' => null,
    ];
    foreach ($applicationAttributes as $attribute => $value) {
        $applicationMock->shouldReceive('getAttribute')->with($attribute)->andReturn($value);
    }
    $this->application = $applicationMock;
});
it('extracts restart count from container data', function () {
    // Build a container payload piece by piece; only the top-level
    // RestartCount key is read by the assertion below.
    $labels = [
        'coolify.applicationId' => '1',
        'com.docker.compose.service' => 'web',
    ];
    $state = [
        'Status' => 'running',
        'Health' => ['Status' => 'healthy'],
    ];
    $inspectPayload = [
        'RestartCount' => 5,
        'State' => $state,
        'Config' => ['Labels' => $labels],
    ];

    expect(data_get($inspectPayload, 'RestartCount', 0))->toBe(5);
});
it('defaults to zero when restart count is missing', function () {
    // Payload deliberately has no RestartCount key, so data_get() must fall
    // back to the supplied default.
    $inspectPayload = [
        'State' => ['Status' => 'running'],
        'Config' => ['Labels' => []],
    ];

    $fallbackCount = data_get($inspectPayload, 'RestartCount', 0);

    expect($fallbackCount)->toBe(0);
});
it('detects restart count increase', function () {
    // NOTE(review): this compares two literals only; it does not call any
    // application code — consider exercising the real detection path.
    [$storedCount, $observedCount] = [2, 5];

    expect($observedCount)->toBeGreaterThan($storedCount);
});
it('identifies maximum restart count from multiple containers', function () {
    // Restart counts keyed by container name; the highest value should win.
    $countsByContainer = [
        'web' => 3,
        'worker' => 5,
        'scheduler' => 1,
    ];

    expect(collect($countsByContainer)->max())->toBe(5);
});
it('handles empty restart counts collection', function () {
    // max() on an empty collection yields null, so ?? supplies the zero default.
    $highestCount = collect([])->max() ?? 0;

    expect($highestCount)->toBe(0);
});