From b22e79caec94195eed721cae0cc3e7cf68bf6e2a Mon Sep 17 00:00:00 2001
From: Andras Bacsai <5845193+andrasbacsai@users.noreply.github.com>
Date: Mon, 10 Nov 2025 11:11:18 +0100
Subject: [PATCH] feat(jobs): improve scheduled tasks with retry logic and
queue cleanup
- Add retry configuration to CoolifyTask (3 tries, 600s timeout)
- Add retry configuration to ScheduledTaskJob (3 tries, configurable timeout)
- Add retry configuration to DatabaseBackupJob (2 tries)
- Implement exponential backoff for all jobs (30s, 60s, 120s intervals)
- Add failed() handlers with comprehensive error logging to scheduled-errors channel
- Add execution tracking: started_at, retry_count, duration (decimal), error_details
- Add configurable timeout field to scheduled tasks (60-3600s, default 300s)
- Update UI to include timeout configuration in task creation/editing forms
- Increase ScheduledJobManager lock expiration from 60s to 90s for high-load environments
- Implement safe queue cleanup with restart vs runtime modes
- Restart mode: aggressive cleanup (marks all processing jobs as failed)
- Runtime mode: conservative cleanup (only marks jobs >12h as failed, skips deployments)
- Add cleanup:redis --restart flag for system startup
- Integrate cleanup into Dev.php init() for development environment
- Increase scheduled-errors log retention from 7 to 14 days
- Create comprehensive test suite (unit and feature tests)
- Add TESTING_GUIDE.md with manual testing instructions
Fixes issues with jobs failing after single attempt and "attempted too many times" errors
---
TESTING_GUIDE.md | 235 ++++++++++++++++++
app/Console/Commands/CleanupRedis.php | 104 +++++++-
app/Console/Commands/Dev.php | 10 +
app/Console/Commands/Init.php | 2 +-
app/Jobs/CoolifyTask.php | 49 ++++
app/Jobs/DatabaseBackupJob.php | 38 ++-
app/Jobs/ScheduledJobManager.php | 2 +-
app/Jobs/ScheduledTaskJob.php | 82 +++++-
.../Project/Shared/ScheduledTask/Add.php | 6 +
.../Project/Shared/ScheduledTask/Show.php | 5 +
app/Models/ScheduledTask.php | 8 +
app/Models/ScheduledTaskExecution.php | 10 +
config/logging.php | 4 +-
...1_add_timeout_to_scheduled_tasks_table.php | 28 +++
...ove_scheduled_task_executions_tracking.php | 31 +++
package-lock.json | 22 +-
.../shared/scheduled-task/add.blade.php | 3 +
.../shared/scheduled-task/show.blade.php | 2 +
templates/service-templates-latest.json | 16 +-
templates/service-templates.json | 16 +-
tests/Feature/CoolifyTaskRetryTest.php | 70 ++++++
tests/Unit/ScheduledJobsRetryConfigTest.php | 56 +++++
22 files changed, 762 insertions(+), 37 deletions(-)
create mode 100644 TESTING_GUIDE.md
create mode 100644 database/migrations/2025_11_09_000001_add_timeout_to_scheduled_tasks_table.php
create mode 100644 database/migrations/2025_11_09_000002_improve_scheduled_task_executions_tracking.php
create mode 100644 tests/Feature/CoolifyTaskRetryTest.php
create mode 100644 tests/Unit/ScheduledJobsRetryConfigTest.php
diff --git a/TESTING_GUIDE.md b/TESTING_GUIDE.md
new file mode 100644
index 000000000..91a79cd62
--- /dev/null
+++ b/TESTING_GUIDE.md
@@ -0,0 +1,235 @@
+# Testing Guide: Scheduled Tasks Improvements
+
+## Overview
+This guide covers testing all the improvements made to the scheduled tasks system, including retry logic, timeout handling, and error logging.
+
+## Jobs Modified
+
+1. **CoolifyTask** - Infrastructure job for SSH operations (3 retries, 600s timeout)
+2. **ScheduledTaskJob** - Scheduled container commands (3 retries, configurable timeout)
+3. **DatabaseBackupJob** - Database backups (2 retries, existing timeout)
+
+---
+
+## Quick Test Commands
+
+### Run Unit Tests (No Database Required)
+```bash
+./vendor/bin/pest tests/Unit/ScheduledJobsRetryConfigTest.php
+```
+
+### Run Feature Tests (Requires Database - Run in Docker)
+```bash
+docker exec coolify php artisan test --filter=CoolifyTaskRetryTest
+```
+
+---
+
+## Manual Testing
+
+### 1. Test ScheduledTaskJob ✅ (You tested this)
+
+**How to test:**
+1. Create a scheduled task in the UI
+2. Set a short frequency (every minute)
+3. Monitor execution in the UI
+4. Check logs: `storage/logs/scheduled-errors-2025-11-09.log`
+
+**What to verify:**
+- Task executes successfully
+- Duration is recorded (in seconds with 2 decimal places)
+- Retry count is tracked
+- Timeout configuration is respected
+
+---
+
+### 2. Test DatabaseBackupJob ✅ (You tested this)
+
+**How to test:**
+1. Create a scheduled database backup
+2. Set frequency to manual or very short interval
+3. Trigger backup manually or wait for schedule
+4. Check logs for any errors
+
+**What to verify:**
+- Backup completes successfully
+- Retry logic works if there's a transient failure
+- Error logging is consistent
+- Backoff timing is correct (60s, 300s)
+
+---
+
+### 3. Test CoolifyTask ⚠️ (IMPORTANT - Not tested yet)
+
+CoolifyTask is used throughout the application for ALL SSH operations. Here are multiple ways to test it:
+
+#### **Option A: Server Validation** (Easiest)
+1. Go to **Servers** in Coolify UI
+2. Select any server
+3. Click **"Validate Server"** or **"Check Connection"**
+4. This triggers CoolifyTask jobs
+5. Check Horizon dashboard for job processing
+6. Check logs: `storage/logs/scheduled-errors-2025-11-09.log`
+
+#### **Option B: Container Operations**
+1. Go to any **Application** or **Service**
+2. Try these actions (each triggers CoolifyTask):
+ - Restart container
+ - View logs
+ - Execute command in container
+3. Monitor Horizon for job processing
+4. Check logs for errors
+
+#### **Option C: Application Deployment**
+1. Deploy or redeploy any application
+2. This triggers MANY CoolifyTask jobs
+3. Watch Horizon dashboard - you should see:
+ - Jobs being dispatched
+ - Jobs completing successfully
+ - If any fail, they should retry (check "Failed Jobs")
+4. Check logs for retry attempts
+
+#### **Option D: Docker Cleanup**
+1. Wait for or trigger Docker cleanup (runs on schedule)
+2. This uses CoolifyTask for cleanup commands
+3. Check logs: `storage/logs/scheduled-errors-2025-11-09.log`
+
+---
+
+## Monitoring & Verification
+
+### Horizon Dashboard
+1. Open Horizon: `/horizon`
+2. Watch these sections:
+ - **Recent Jobs** - See jobs being processed
+ - **Failed Jobs** - Jobs that failed permanently after retries
+ - **Monitoring** - Job throughput and wait times
+
+### Log Monitoring
+```bash
+# Watch scheduled errors in real-time
+tail -f storage/logs/scheduled-errors-2025-11-09.log
+
+# Check for specific job errors
+grep "CoolifyTask" storage/logs/scheduled-errors-2025-11-09.log
+grep "ScheduledTaskJob" storage/logs/scheduled-errors-2025-11-09.log
+grep "DatabaseBackupJob" storage/logs/scheduled-errors-2025-11-09.log
+```
+
+### Database Verification
+```sql
+-- Check execution tracking
+SELECT * FROM scheduled_task_executions
+ORDER BY created_at DESC
+LIMIT 10;
+
+-- Verify duration is decimal (not throwing errors)
+SELECT id, duration, retry_count, started_at, finished_at
+FROM scheduled_task_executions
+WHERE duration IS NOT NULL;
+
+-- Check for tasks with retries
+SELECT * FROM scheduled_task_executions
+WHERE retry_count > 0;
+```
+
+---
+
+## Expected Behavior
+
+### ✅ Success Indicators
+
+1. **Jobs Complete Successfully**
+ - Horizon shows completed jobs
+ - No errors in scheduled-errors log
+ - Execution records in database
+
+2. **Retry Logic Works**
+ - Failed jobs retry automatically
+ - Backoff timing is respected (30s, 60s, etc.)
+ - Jobs marked failed only after all retries exhausted
+
+3. **Timeout Enforcement**
+ - Long-running jobs terminate at timeout
+ - Timeout is configurable per task
+ - No hanging jobs
+
+4. **Error Logging**
+ - All errors logged to `storage/logs/scheduled-errors-2025-11-09.log`
+ - Consistent format with job name, attempt count, error details
+ - Trace included for debugging
+
+5. **Execution Tracking**
+ - Duration recorded correctly (decimal with 2 places)
+ - Retry count incremented on failures
+ - Started/finished timestamps accurate
+
+---
+
+## Troubleshooting
+
+### Issue: Jobs fail immediately without retrying
+**Check:**
+- Verify `$tries` property is set on the job
+- Check if exception is being caught and re-thrown correctly
+- Look for `maxExceptions` being reached
+
+### Issue: "Invalid text representation" errors
+**Fix Applied:**
+- Duration field changed from integer to decimal(10,2)
+- If you see this, run migrations again
+
+### Issue: Jobs not appearing in Horizon
+**Check:**
+- Horizon is running (`php artisan horizon`)
+- Queue workers are active
+- Job is dispatched to correct queue ('high' for these jobs)
+
+### Issue: Timeout not working
+**Check:**
+- Timeout is set on job (CoolifyTask: 600s, ScheduledTask: configurable)
+- PHP `max_execution_time` allows job timeout
+- Queue worker timeout is higher than job timeout
+
+---
+
+## Test Checklist
+
+- [ ] Unit tests pass: `./vendor/bin/pest tests/Unit/ScheduledJobsRetryConfigTest.php`
+- [ ] ScheduledTaskJob tested manually ✅
+- [ ] DatabaseBackupJob tested manually ✅
+- [ ] CoolifyTask tested manually (server validation, container ops, or deployment)
+- [ ] Retry logic verified (force a failure, watch retry attempts)
+- [ ] Timeout enforcement tested (create long-running task with short timeout)
+- [ ] Error logs checked: `storage/logs/scheduled-errors-2025-11-09.log`
+- [ ] Horizon dashboard shows jobs processing correctly
+- [ ] Database execution records show duration as decimal
+- [ ] UI shows timeout configuration field for scheduled tasks
+
+---
+
+## Next Steps After Testing
+
+1. If all tests pass, run migrations on production/staging:
+ ```bash
+ php artisan migrate
+ ```
+
+2. Monitor logs for the first 24 hours:
+ ```bash
+ tail -f storage/logs/scheduled-errors-2025-11-09.log
+ ```
+
+3. Check Horizon for any failed jobs needing attention
+
+4. Verify existing scheduled tasks now have retry capability
+
+---
+
+## Questions?
+
+If you encounter issues:
+1. Check `storage/logs/scheduled-errors-2025-11-09.log` first
+2. Check `storage/logs/laravel.log` for general errors
+3. Look at Horizon "Failed Jobs" for detailed error info
+4. Review database execution records for patterns
diff --git a/app/Console/Commands/CleanupRedis.php b/app/Console/Commands/CleanupRedis.php
index f6a2de75b..a5fdc33e0 100644
--- a/app/Console/Commands/CleanupRedis.php
+++ b/app/Console/Commands/CleanupRedis.php
@@ -7,7 +7,7 @@ use Illuminate\Support\Facades\Redis;
class CleanupRedis extends Command
{
- protected $signature = 'cleanup:redis {--dry-run : Show what would be deleted without actually deleting} {--skip-overlapping : Skip overlapping queue cleanup} {--clear-locks : Clear stale WithoutOverlapping locks}';
+ protected $signature = 'cleanup:redis {--dry-run : Show what would be deleted without actually deleting} {--skip-overlapping : Skip overlapping queue cleanup} {--clear-locks : Clear stale WithoutOverlapping locks} {--restart : Aggressive cleanup mode for system restart (marks all processing jobs as failed)}';
protected $description = 'Cleanup Redis (Horizon jobs, metrics, overlapping queues, cache locks, and related data)';
@@ -63,6 +63,14 @@ class CleanupRedis extends Command
$deletedCount += $locksCleaned;
}
+ // Clean up stuck jobs (restart mode = aggressive, runtime mode = conservative)
+ $isRestart = $this->option('restart');
+ if ($isRestart || $this->option('clear-locks')) {
+ $this->info($isRestart ? 'Cleaning up stuck jobs (RESTART MODE - aggressive)...' : 'Checking for stuck jobs (runtime mode - conservative)...');
+ $jobsCleaned = $this->cleanupStuckJobs($redis, $prefix, $dryRun, $isRestart);
+ $deletedCount += $jobsCleaned;
+ }
+
if ($dryRun) {
$this->info("DRY RUN: Would delete {$deletedCount} out of {$totalKeys} keys");
} else {
@@ -332,4 +340,98 @@ class CleanupRedis extends Command
return $cleanedCount;
}
+
+ /**
+ * Clean up stuck jobs based on mode (restart vs runtime).
+ *
+ * @param mixed $redis Redis connection
+ * @param string $prefix Horizon prefix
+ * @param bool $dryRun Dry run mode
+ * @param bool $isRestart Restart mode (aggressive) vs runtime mode (conservative)
+ * @return int Number of jobs cleaned
+ */
+ private function cleanupStuckJobs($redis, string $prefix, bool $dryRun, bool $isRestart): int
+ {
+ $cleanedCount = 0;
+ $now = time();
+
+ // Get all keys with the horizon prefix
+ $keys = $redis->keys('*');
+
+ foreach ($keys as $key) {
+ $keyWithoutPrefix = str_replace($prefix, '', $key);
+ $type = $redis->command('type', [$keyWithoutPrefix]);
+
+ // Only process hash-type keys (individual jobs)
+ if ($type !== 5) {
+ continue;
+ }
+
+ $data = $redis->command('hgetall', [$keyWithoutPrefix]);
+ $status = data_get($data, 'status');
+ $payload = data_get($data, 'payload');
+
+ // Only process jobs in "processing" or "reserved" state
+ if (! in_array($status, ['processing', 'reserved'])) {
+ continue;
+ }
+
+ // Parse job payload to get job class and started time
+ $payloadData = json_decode($payload, true);
+ $jobClass = data_get($payloadData, 'displayName', 'Unknown');
+ $pushedAt = (int) data_get($data, 'pushed_at', 0);
+
+ // Calculate how long the job has been processing
+ $processingTime = $now - $pushedAt;
+
+ $shouldFail = false;
+ $reason = '';
+
+ if ($isRestart) {
+ // RESTART MODE: Mark ALL processing/reserved jobs as failed
+ // Safe because all workers are dead on restart
+ $shouldFail = true;
+ $reason = 'System restart - all workers terminated';
+ } else {
+ // RUNTIME MODE: Only mark truly stuck jobs as failed
+ // Be conservative to avoid killing legitimate long-running jobs
+
+ // Skip ApplicationDeploymentJob entirely (has dynamic_timeout, can run 2+ hours)
+ if (str_contains($jobClass, 'ApplicationDeploymentJob')) {
+ continue;
+ }
+
+ // Skip DatabaseBackupJob (large backups can take hours)
+ if (str_contains($jobClass, 'DatabaseBackupJob')) {
+ continue;
+ }
+
+ // For other jobs, only fail if processing > 12 hours
+ if ($processingTime > 43200) { // 12 hours
+ $shouldFail = true;
+ $reason = 'Processing for more than 12 hours';
+ }
+ }
+
+ if ($shouldFail) {
+ if ($dryRun) {
+ $this->warn(" Would mark as FAILED: {$jobClass} (processing for ".round($processingTime / 60, 1)." min) - {$reason}");
+ } else {
+ // Mark job as failed
+ $redis->command('hset', [$keyWithoutPrefix, 'status', 'failed']);
+ $redis->command('hset', [$keyWithoutPrefix, 'failed_at', $now]);
+ $redis->command('hset', [$keyWithoutPrefix, 'exception', "Job cleaned up by cleanup:redis - {$reason}"]);
+
+ $this->info(" ✓ Marked as FAILED: {$jobClass} (processing for ".round($processingTime / 60, 1).' min) - '.$reason);
+ }
+ $cleanedCount++;
+ }
+ }
+
+ if ($cleanedCount === 0) {
+ $this->info($isRestart ? ' No jobs to clean up' : ' No stuck jobs found (all jobs running normally)');
+ }
+
+ return $cleanedCount;
+ }
}
diff --git a/app/Console/Commands/Dev.php b/app/Console/Commands/Dev.php
index 8f26d78ff..f04d67482 100644
--- a/app/Console/Commands/Dev.php
+++ b/app/Console/Commands/Dev.php
@@ -45,6 +45,16 @@ class Dev extends Command
} else {
echo "Instance already initialized.\n";
}
+
+ // Clean up stuck jobs and stale locks on development startup
+ try {
+ echo "Cleaning up Redis (stuck jobs and stale locks)...\n";
+ Artisan::call('cleanup:redis', ['--restart' => true, '--clear-locks' => true]);
+ echo "Redis cleanup completed.\n";
+ } catch (\Throwable $e) {
+ echo "Error in cleanup:redis: {$e->getMessage()}\n";
+ }
+
CheckHelperImageJob::dispatch();
}
}
diff --git a/app/Console/Commands/Init.php b/app/Console/Commands/Init.php
index 4bc818f0a..aba99199e 100644
--- a/app/Console/Commands/Init.php
+++ b/app/Console/Commands/Init.php
@@ -73,7 +73,7 @@ class Init extends Command
$this->cleanupUnusedNetworkFromCoolifyProxy();
try {
- $this->call('cleanup:redis', ['--clear-locks' => true]);
+ $this->call('cleanup:redis', ['--restart' => true, '--clear-locks' => true]);
} catch (\Throwable $e) {
echo "Error in cleanup:redis command: {$e->getMessage()}\n";
}
diff --git a/app/Jobs/CoolifyTask.php b/app/Jobs/CoolifyTask.php
index 49a5ba8dd..d6dc6fa05 100755
--- a/app/Jobs/CoolifyTask.php
+++ b/app/Jobs/CoolifyTask.php
@@ -3,18 +3,35 @@
namespace App\Jobs;
use App\Actions\CoolifyTask\RunRemoteProcess;
+use App\Enums\ProcessStatus;
use Illuminate\Bus\Queueable;
use Illuminate\Contracts\Queue\ShouldBeEncrypted;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Bus\Dispatchable;
use Illuminate\Queue\InteractsWithQueue;
use Illuminate\Queue\SerializesModels;
+use Illuminate\Support\Facades\Log;
use Spatie\Activitylog\Models\Activity;
class CoolifyTask implements ShouldBeEncrypted, ShouldQueue
{
use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;
+ /**
+ * The number of times the job may be attempted.
+ */
+ public $tries = 3;
+
+ /**
+ * The maximum number of unhandled exceptions to allow before failing.
+ */
+ public $maxExceptions = 1;
+
+ /**
+ * The number of seconds the job can run before timing out.
+ */
+ public $timeout = 600;
+
/**
* Create a new job instance.
*/
@@ -42,4 +59,36 @@ class CoolifyTask implements ShouldBeEncrypted, ShouldQueue
$remote_process();
}
+
+ /**
+ * Calculate the number of seconds to wait before retrying the job.
+ */
+ public function backoff(): array
+ {
+ return [30, 90, 180]; // 30s, 90s, 180s between retries
+ }
+
+ /**
+ * Handle a job failure.
+ */
+ public function failed(?\Throwable $exception): void
+ {
+ Log::channel('scheduled-errors')->error('CoolifyTask permanently failed', [
+ 'job' => 'CoolifyTask',
+ 'activity_id' => $this->activity->id,
+ 'server_uuid' => $this->activity->getExtraProperty('server_uuid'),
+ 'command_preview' => substr($this->activity->getExtraProperty('command') ?? '', 0, 200),
+ 'error' => $exception?->getMessage(),
+ 'total_attempts' => $this->attempts(),
+ 'trace' => $exception?->getTraceAsString(),
+ ]);
+
+ // Update activity status to reflect permanent failure
+ $this->activity->properties = $this->activity->properties->merge([
+ 'status' => ProcessStatus::ERROR->value,
+ 'error' => $exception?->getMessage() ?? 'Job permanently failed',
+ 'failed_at' => now()->toIso8601String(),
+ ]);
+ $this->activity->save();
+ }
}
diff --git a/app/Jobs/DatabaseBackupJob.php b/app/Jobs/DatabaseBackupJob.php
index 45586f0d0..ff16c78e0 100644
--- a/app/Jobs/DatabaseBackupJob.php
+++ b/app/Jobs/DatabaseBackupJob.php
@@ -23,6 +23,7 @@ use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Bus\Dispatchable;
use Illuminate\Queue\InteractsWithQueue;
use Illuminate\Queue\SerializesModels;
+use Illuminate\Support\Facades\Log;
use Illuminate\Support\Str;
use Throwable;
use Visus\Cuid2\Cuid2;
@@ -31,6 +32,16 @@ class DatabaseBackupJob implements ShouldBeEncrypted, ShouldQueue
{
use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;
+ /**
+ * The number of times the job may be attempted.
+ */
+ public $tries = 2;
+
+ /**
+ * The maximum number of unhandled exceptions to allow before failing.
+ */
+ public $maxExceptions = 1;
+
public ?Team $team = null;
public Server $server;
@@ -659,17 +670,42 @@ class DatabaseBackupJob implements ShouldBeEncrypted, ShouldQueue
return "{$helperImage}:{$latestVersion}";
}
+ /**
+ * Calculate the number of seconds to wait before retrying the job.
+ */
+ public function backoff(): array
+ {
+ return [60, 300]; // 1min, 5min between retries
+ }
+
public function failed(?Throwable $exception): void
{
+ Log::channel('scheduled-errors')->error('DatabaseBackup permanently failed', [
+ 'job' => 'DatabaseBackupJob',
+ 'backup_id' => $this->backup->uuid,
+ 'database' => $this->database?->name ?? 'unknown',
+ 'database_type' => get_class($this->database ?? new \stdClass),
+ 'server' => $this->server?->name ?? 'unknown',
+ 'total_attempts' => $this->attempts(),
+ 'error' => $exception?->getMessage(),
+ 'trace' => $exception?->getTraceAsString(),
+ ]);
+
$log = ScheduledDatabaseBackupExecution::where('uuid', $this->backup_log_uuid)->first();
if ($log) {
$log->update([
'status' => 'failed',
- 'message' => 'Job failed: '.($exception?->getMessage() ?? 'Unknown error'),
+ 'message' => 'Job permanently failed after '.$this->attempts().' attempts: '.($exception?->getMessage() ?? 'Unknown error'),
'size' => 0,
'filename' => null,
+ 'finished_at' => Carbon::now(),
]);
}
+
+ // Notify team about permanent failure
+ if ($this->team) {
+ $this->team->notify(new BackupFailed($this->backup, $this->database, $this->backup_output));
+ }
}
}
diff --git a/app/Jobs/ScheduledJobManager.php b/app/Jobs/ScheduledJobManager.php
index 9937444b8..75ff883c2 100644
--- a/app/Jobs/ScheduledJobManager.php
+++ b/app/Jobs/ScheduledJobManager.php
@@ -52,7 +52,7 @@ class ScheduledJobManager implements ShouldQueue
{
return [
(new WithoutOverlapping('scheduled-job-manager'))
- ->expireAfter(60) // Lock expires after 1 minute to prevent stale locks
+ ->expireAfter(90) // Lock expires after 90s to handle high-load environments with many tasks
->dontRelease(), // Don't re-queue on lock conflict
];
}
diff --git a/app/Jobs/ScheduledTaskJob.php b/app/Jobs/ScheduledTaskJob.php
index 609595356..95c3b0eaf 100644
--- a/app/Jobs/ScheduledTaskJob.php
+++ b/app/Jobs/ScheduledTaskJob.php
@@ -18,11 +18,27 @@ use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Bus\Dispatchable;
use Illuminate\Queue\InteractsWithQueue;
use Illuminate\Queue\SerializesModels;
+use Illuminate\Support\Facades\Log;
class ScheduledTaskJob implements ShouldQueue
{
use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;
+ /**
+ * The number of times the job may be attempted.
+ */
+ public $tries = 3;
+
+ /**
+ * The maximum number of unhandled exceptions to allow before failing.
+ */
+ public $maxExceptions = 1;
+
+ /**
+ * The number of seconds the job can run before timing out.
+ */
+ public $timeout = 300;
+
public Team $team;
public Server $server;
@@ -55,6 +71,9 @@ class ScheduledTaskJob implements ShouldQueue
}
$this->team = Team::findOrFail($task->team_id);
$this->server_timezone = $this->getServerTimezone();
+
+ // Set timeout from task configuration
+ $this->timeout = $this->task->timeout ?? 300;
}
private function getServerTimezone(): string
@@ -70,9 +89,13 @@ class ScheduledTaskJob implements ShouldQueue
public function handle(): void
{
+ $startTime = Carbon::now();
+
try {
$this->task_log = ScheduledTaskExecution::create([
'scheduled_task_id' => $this->task->id,
+ 'started_at' => $startTime,
+ 'retry_count' => $this->attempts() - 1,
]);
$this->server = $this->resource->destination->server;
@@ -129,15 +152,70 @@ class ScheduledTaskJob implements ShouldQueue
'message' => $this->task_output ?? $e->getMessage(),
]);
}
- $this->team?->notify(new TaskFailed($this->task, $e->getMessage()));
+
+ // Log the error to the scheduled-errors channel
+ Log::channel('scheduled-errors')->error('ScheduledTask execution failed', [
+ 'job' => 'ScheduledTaskJob',
+ 'task_id' => $this->task->uuid,
+ 'task_name' => $this->task->name,
+ 'server' => $this->server->name ?? 'unknown',
+ 'attempt' => $this->attempts(),
+ 'error' => $e->getMessage(),
+ ]);
+
+ // Only notify and throw on final failure
+ if ($this->attempts() >= $this->tries) {
+ $this->team?->notify(new TaskFailed($this->task, $e->getMessage()));
+ }
+
+ // Re-throw to trigger Laravel's retry mechanism with backoff
throw $e;
} finally {
ScheduledTaskDone::dispatch($this->team->id);
if ($this->task_log) {
+ $finishedAt = Carbon::now();
+ $duration = round($startTime->floatDiffInSeconds($finishedAt), 2);
+
$this->task_log->update([
- 'finished_at' => Carbon::now()->toImmutable(),
+ 'finished_at' => $finishedAt->toImmutable(),
+ 'duration' => $duration,
]);
}
}
}
+
+ /**
+ * Calculate the number of seconds to wait before retrying the job.
+ */
+ public function backoff(): array
+ {
+ return [30, 60, 120]; // 30s, 60s, 120s between retries
+ }
+
+ /**
+ * Handle a job failure.
+ */
+ public function failed(?\Throwable $exception): void
+ {
+ Log::channel('scheduled-errors')->error('ScheduledTask permanently failed', [
+ 'job' => 'ScheduledTaskJob',
+ 'task_id' => $this->task->uuid,
+ 'task_name' => $this->task->name,
+ 'server' => $this->server->name ?? 'unknown',
+ 'total_attempts' => $this->attempts(),
+ 'error' => $exception?->getMessage(),
+ 'trace' => $exception?->getTraceAsString(),
+ ]);
+
+ if ($this->task_log) {
+ $this->task_log->update([
+ 'status' => 'failed',
+ 'message' => 'Job permanently failed after '.$this->attempts().' attempts: '.($exception?->getMessage() ?? 'Unknown error'),
+ 'finished_at' => Carbon::now()->toImmutable(),
+ ]);
+ }
+
+ // Notify team about permanent failure
+ $this->team?->notify(new TaskFailed($this->task, $exception?->getMessage() ?? 'Unknown error'));
+ }
}
diff --git a/app/Livewire/Project/Shared/ScheduledTask/Add.php b/app/Livewire/Project/Shared/ScheduledTask/Add.php
index e4b666532..d7210c15d 100644
--- a/app/Livewire/Project/Shared/ScheduledTask/Add.php
+++ b/app/Livewire/Project/Shared/ScheduledTask/Add.php
@@ -34,11 +34,14 @@ class Add extends Component
public ?string $container = '';
+ public int $timeout = 300;
+
protected $rules = [
'name' => 'required|string',
'command' => 'required|string',
'frequency' => 'required|string',
'container' => 'nullable|string',
+ 'timeout' => 'required|integer|min:60|max:3600',
];
protected $validationAttributes = [
@@ -46,6 +49,7 @@ class Add extends Component
'command' => 'command',
'frequency' => 'frequency',
'container' => 'container',
+ 'timeout' => 'timeout',
];
public function mount()
@@ -103,6 +107,7 @@ class Add extends Component
$task->command = $this->command;
$task->frequency = $this->frequency;
$task->container = $this->container;
+ $task->timeout = $this->timeout;
$task->team_id = currentTeam()->id;
switch ($this->type) {
@@ -130,5 +135,6 @@ class Add extends Component
$this->command = '';
$this->frequency = '';
$this->container = '';
+ $this->timeout = 300;
}
}
diff --git a/app/Livewire/Project/Shared/ScheduledTask/Show.php b/app/Livewire/Project/Shared/ScheduledTask/Show.php
index c8d07ae36..920a0efe5 100644
--- a/app/Livewire/Project/Shared/ScheduledTask/Show.php
+++ b/app/Livewire/Project/Shared/ScheduledTask/Show.php
@@ -40,6 +40,9 @@ class Show extends Component
#[Validate(['string', 'nullable'])]
public ?string $container = null;
+ #[Validate(['integer', 'required', 'min:60', 'max:3600'])]
+ public int $timeout = 300;
+
#[Locked]
public ?string $application_uuid;
@@ -99,6 +102,7 @@ class Show extends Component
$this->task->command = str($this->command)->trim()->value();
$this->task->frequency = str($this->frequency)->trim()->value();
$this->task->container = str($this->container)->trim()->value();
+ $this->task->timeout = $this->timeout;
$this->task->save();
} else {
$this->isEnabled = $this->task->enabled;
@@ -106,6 +110,7 @@ class Show extends Component
$this->command = $this->task->command;
$this->frequency = $this->task->frequency;
$this->container = $this->task->container;
+ $this->timeout = $this->task->timeout ?? 300;
}
}
diff --git a/app/Models/ScheduledTask.php b/app/Models/ScheduledTask.php
index 06903ffb6..bada0b7a5 100644
--- a/app/Models/ScheduledTask.php
+++ b/app/Models/ScheduledTask.php
@@ -12,6 +12,14 @@ class ScheduledTask extends BaseModel
protected $guarded = [];
+ protected function casts(): array
+ {
+ return [
+ 'enabled' => 'boolean',
+ 'timeout' => 'integer',
+ ];
+ }
+
public function service()
{
return $this->belongsTo(Service::class);
diff --git a/app/Models/ScheduledTaskExecution.php b/app/Models/ScheduledTaskExecution.php
index de13fefb0..02fd6917a 100644
--- a/app/Models/ScheduledTaskExecution.php
+++ b/app/Models/ScheduledTaskExecution.php
@@ -8,6 +8,16 @@ class ScheduledTaskExecution extends BaseModel
{
protected $guarded = [];
+ protected function casts(): array
+ {
+ return [
+ 'started_at' => 'datetime',
+ 'finished_at' => 'datetime',
+ 'retry_count' => 'integer',
+ 'duration' => 'decimal:2',
+ ];
+ }
+
public function scheduledTask(): BelongsTo
{
return $this->belongsTo(ScheduledTask::class);
diff --git a/config/logging.php b/config/logging.php
index 488327414..1a75978f3 100644
--- a/config/logging.php
+++ b/config/logging.php
@@ -129,8 +129,8 @@ return [
'scheduled-errors' => [
'driver' => 'daily',
'path' => storage_path('logs/scheduled-errors.log'),
- 'level' => 'debug',
- 'days' => 7,
+ 'level' => 'warning',
+ 'days' => 14,
],
],
diff --git a/database/migrations/2025_11_09_000001_add_timeout_to_scheduled_tasks_table.php b/database/migrations/2025_11_09_000001_add_timeout_to_scheduled_tasks_table.php
new file mode 100644
index 000000000..067861e16
--- /dev/null
+++ b/database/migrations/2025_11_09_000001_add_timeout_to_scheduled_tasks_table.php
@@ -0,0 +1,28 @@
+integer('timeout')->default(300)->after('frequency');
+ });
+ }
+
+ /**
+ * Reverse the migrations.
+ */
+ public function down(): void
+ {
+ Schema::table('scheduled_tasks', function (Blueprint $table) {
+ $table->dropColumn('timeout');
+ });
+ }
+};
diff --git a/database/migrations/2025_11_09_000002_improve_scheduled_task_executions_tracking.php b/database/migrations/2025_11_09_000002_improve_scheduled_task_executions_tracking.php
new file mode 100644
index 000000000..14fdd5998
--- /dev/null
+++ b/database/migrations/2025_11_09_000002_improve_scheduled_task_executions_tracking.php
@@ -0,0 +1,31 @@
+timestamp('started_at')->nullable()->after('scheduled_task_id');
+ $table->integer('retry_count')->default(0)->after('status');
+ $table->decimal('duration', 10, 2)->nullable()->after('retry_count')->comment('Duration in seconds');
+ $table->text('error_details')->nullable()->after('message');
+ });
+ }
+
+ /**
+ * Reverse the migrations.
+ */
+ public function down(): void
+ {
+ Schema::table('scheduled_task_executions', function (Blueprint $table) {
+ $table->dropColumn(['started_at', 'retry_count', 'duration', 'error_details']);
+ });
+ }
+};
diff --git a/package-lock.json b/package-lock.json
index f8ef518d2..b076800e6 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -916,8 +916,7 @@
"resolved": "https://registry.npmjs.org/@socket.io/component-emitter/-/component-emitter-3.1.2.tgz",
"integrity": "sha512-9BCxFwvbGg/RsZK9tjXd8s4UcwR0MWeFQ1XEKIQVVvAGJyINdrqKMcTRyLoK8Rse1GjzLV9cwjWV1olXRWEXVA==",
"dev": true,
- "license": "MIT",
- "peer": true
+ "license": "MIT"
},
"node_modules/@tailwindcss/forms": {
"version": "0.5.10",
@@ -1432,7 +1431,8 @@
"version": "5.5.0",
"resolved": "https://registry.npmjs.org/@xterm/xterm/-/xterm-5.5.0.tgz",
"integrity": "sha512-hqJHYaQb5OptNunnyAnkHyM8aCjZ1MEIDTQu1iIbbTD/xops91NB5yq1ZK/dC2JDbVWtF23zUtl9JE2NqwT87A==",
- "license": "MIT"
+ "license": "MIT",
+ "peer": true
},
"node_modules/asynckit": {
"version": "0.4.0",
@@ -1595,7 +1595,6 @@
"integrity": "sha512-T0iLjnyNWahNyv/lcjS2y4oE358tVS/SYQNxYXGAJ9/GLgH4VCvOQ/mhTjqU88mLZCQgiG8RIegFHYCdVC+j5w==",
"dev": true,
"license": "MIT",
- "peer": true,
"dependencies": {
"@socket.io/component-emitter": "~3.1.0",
"debug": "~4.3.1",
@@ -1610,7 +1609,6 @@
"integrity": "sha512-Er2nc/H7RrMXZBFCEim6TCmMk02Z8vLC2Rbi1KEBggpo0fS6l0S1nnapwmIi3yW/+GOJap1Krg4w0Hg80oCqgQ==",
"dev": true,
"license": "MIT",
- "peer": true,
"dependencies": {
"ms": "^2.1.3"
},
@@ -1629,7 +1627,6 @@
"integrity": "sha512-HqD3yTBfnBxIrbnM1DoD6Pcq8NECnh8d4As1Qgh0z5Gg3jRRIqijury0CL3ghu/edArpUYiYqQiDUQBIs4np3Q==",
"dev": true,
"license": "MIT",
- "peer": true,
"engines": {
"node": ">=10.0.0"
}
@@ -2391,6 +2388,7 @@
"integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==",
"dev": true,
"license": "MIT",
+ "peer": true,
"engines": {
"node": ">=12"
},
@@ -2467,6 +2465,7 @@
"integrity": "sha512-wp3HqIIUc1GRyu1XrP6m2dgyE9MoCsXVsWNlohj0rjSkLf+a0jLvEyVubdg58oMk7bhjBWnFClgp8jfAa6Ak4Q==",
"dev": true,
"license": "MIT",
+ "peer": true,
"dependencies": {
"tweetnacl": "^1.0.3"
}
@@ -2551,7 +2550,6 @@
"integrity": "sha512-hJVXfu3E28NmzGk8o1sHhN3om52tRvwYeidbj7xKy2eIIse5IoKX3USlS6Tqt3BHAtflLIkCQBkzVrEEfWUyYQ==",
"dev": true,
"license": "MIT",
- "peer": true,
"dependencies": {
"@socket.io/component-emitter": "~3.1.0",
"debug": "~4.3.2",
@@ -2568,7 +2566,6 @@
"integrity": "sha512-Er2nc/H7RrMXZBFCEim6TCmMk02Z8vLC2Rbi1KEBggpo0fS6l0S1nnapwmIi3yW/+GOJap1Krg4w0Hg80oCqgQ==",
"dev": true,
"license": "MIT",
- "peer": true,
"dependencies": {
"ms": "^2.1.3"
},
@@ -2587,7 +2584,6 @@
"integrity": "sha512-/GbIKmo8ioc+NIWIhwdecY0ge+qVBSMdgxGygevmdHj24bsfgtCmcUUcQ5ZzcylGFHsN3k4HB4Cgkl96KVnuew==",
"dev": true,
"license": "MIT",
- "peer": true,
"dependencies": {
"@socket.io/component-emitter": "~3.1.0",
"debug": "~4.3.1"
@@ -2602,7 +2598,6 @@
"integrity": "sha512-Er2nc/H7RrMXZBFCEim6TCmMk02Z8vLC2Rbi1KEBggpo0fS6l0S1nnapwmIi3yW/+GOJap1Krg4w0Hg80oCqgQ==",
"dev": true,
"license": "MIT",
- "peer": true,
"dependencies": {
"ms": "^2.1.3"
},
@@ -2651,7 +2646,8 @@
"version": "4.1.10",
"resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.10.tgz",
"integrity": "sha512-P3nr6WkvKV/ONsTzj6Gb57sWPMX29EPNPopo7+FcpkQaNsrNpZ1pv8QmrYI2RqEKD7mlGqLnGovlcYnBK0IqUA==",
- "license": "MIT"
+ "license": "MIT",
+ "peer": true
},
"node_modules/tapable": {
"version": "2.3.0",
@@ -2720,6 +2716,7 @@
"integrity": "sha512-+Oxm7q9hDoLMyJOYfUYBuHQo+dkAloi33apOPP56pzj+vsdJDzr+j1NISE5pyaAuKL4A3UD34qd0lx5+kfKp2g==",
"dev": true,
"license": "MIT",
+ "peer": true,
"dependencies": {
"esbuild": "^0.25.0",
"fdir": "^6.4.4",
@@ -2819,6 +2816,7 @@
"integrity": "sha512-rjOV2ecxMd5SiAmof2xzh2WxntRcigkX/He4YFJ6WdRvVUrbt6DxC1Iujh10XLl8xCDRDtGKMeO3D+pRQ1PP9w==",
"dev": true,
"license": "MIT",
+ "peer": true,
"dependencies": {
"@vue/compiler-dom": "3.5.16",
"@vue/compiler-sfc": "3.5.16",
@@ -2841,7 +2839,6 @@
"integrity": "sha512-6XQFvXTkbfUOZOKKILFG1PDK2NDQs4azKQl26T0YS5CxqWLgXajbPZ+h4gZekJyRqFU8pvnbAbbs/3TgRPy+GQ==",
"dev": true,
"license": "MIT",
- "peer": true,
"engines": {
"node": ">=10.0.0"
},
@@ -2863,7 +2860,6 @@
"resolved": "https://registry.npmjs.org/xmlhttprequest-ssl/-/xmlhttprequest-ssl-2.1.2.tgz",
"integrity": "sha512-TEU+nJVUUnA4CYJFLvK5X9AOeH4KvDvhIfm0vV1GaQRtchnG0hgK5p8hw/xjv8cunWYCsiPCSDzObPyhEwq3KQ==",
"dev": true,
- "peer": true,
"engines": {
"node": ">=0.4.0"
}
diff --git a/resources/views/livewire/project/shared/scheduled-task/add.blade.php b/resources/views/livewire/project/shared/scheduled-task/add.blade.php
index 0c4b8a4d6..6fa04c28b 100644
--- a/resources/views/livewire/project/shared/scheduled-task/add.blade.php
+++ b/resources/views/livewire/project/shared/scheduled-task/add.blade.php
@@ -4,6 +4,9 @@
+
@if ($type === 'application')
@if ($containerNames->count() > 1)
diff --git a/resources/views/livewire/project/shared/scheduled-task/show.blade.php b/resources/views/livewire/project/shared/scheduled-task/show.blade.php
index 1ede7775a..fa2ce0ad9 100644
--- a/resources/views/livewire/project/shared/scheduled-task/show.blade.php
+++ b/resources/views/livewire/project/shared/scheduled-task/show.blade.php
@@ -35,6 +35,8 @@
+
@if ($type === 'application')
first();
+
+ if (! $server) {
+ $this->markTestSkipped('No servers available for testing');
+ }
+
+ Queue::fake();
+
+ // Create an activity for the task
+ $activity = activity()
+ ->withProperties([
+ 'server_uuid' => $server->uuid,
+ 'command' => 'echo "test"',
+ 'type' => 'inline',
+ ])
+ ->event('inline')
+ ->log('[]');
+
+ // Dispatch the job
+ CoolifyTask::dispatch(
+ activity: $activity,
+ ignore_errors: false,
+ call_event_on_finish: null,
+ call_event_data: null
+ );
+
+ // Assert job was dispatched
+ Queue::assertPushed(CoolifyTask::class);
+});
+
+it('has correct retry configuration on CoolifyTask', function () {
+ $server = Server::where('ip', '!=', '1.2.3.4')->first();
+
+ if (! $server) {
+ $this->markTestSkipped('No servers available for testing');
+ }
+
+ $activity = activity()
+ ->withProperties([
+ 'server_uuid' => $server->uuid,
+ 'command' => 'echo "test"',
+ 'type' => 'inline',
+ ])
+ ->event('inline')
+ ->log('[]');
+
+ $job = new CoolifyTask(
+ activity: $activity,
+ ignore_errors: false,
+ call_event_on_finish: null,
+ call_event_data: null
+ );
+
+ // Assert retry configuration
+ expect($job->tries)->toBe(3);
+ expect($job->maxExceptions)->toBe(1);
+ expect($job->timeout)->toBe(600);
+ expect($job->backoff())->toBe([30, 90, 180]);
+});
diff --git a/tests/Unit/ScheduledJobsRetryConfigTest.php b/tests/Unit/ScheduledJobsRetryConfigTest.php
new file mode 100644
index 000000000..bf959a0f5
--- /dev/null
+++ b/tests/Unit/ScheduledJobsRetryConfigTest.php
@@ -0,0 +1,56 @@
+hasProperty('tries'))->toBeTrue()
+ ->and($reflection->hasProperty('maxExceptions'))->toBeTrue()
+ ->and($reflection->hasProperty('timeout'))->toBeTrue()
+ ->and($reflection->hasMethod('backoff'))->toBeTrue();
+
+ // Get default values from class definition
+ $defaultProperties = $reflection->getDefaultProperties();
+
+ expect($defaultProperties['tries'])->toBe(3)
+ ->and($defaultProperties['maxExceptions'])->toBe(1)
+ ->and($defaultProperties['timeout'])->toBe(600);
+});
+
+it('ScheduledTaskJob has correct retry properties defined', function () {
+ $reflection = new ReflectionClass(ScheduledTaskJob::class);
+
+ // Check public properties exist
+ expect($reflection->hasProperty('tries'))->toBeTrue()
+ ->and($reflection->hasProperty('maxExceptions'))->toBeTrue()
+ ->and($reflection->hasProperty('timeout'))->toBeTrue()
+ ->and($reflection->hasMethod('backoff'))->toBeTrue()
+ ->and($reflection->hasMethod('failed'))->toBeTrue();
+
+ // Get default values from class definition
+ $defaultProperties = $reflection->getDefaultProperties();
+
+ expect($defaultProperties['tries'])->toBe(3)
+ ->and($defaultProperties['maxExceptions'])->toBe(1)
+ ->and($defaultProperties['timeout'])->toBe(300);
+});
+
+it('DatabaseBackupJob has correct retry properties defined', function () {
+ $reflection = new ReflectionClass(DatabaseBackupJob::class);
+
+ // Check public properties exist
+ expect($reflection->hasProperty('tries'))->toBeTrue()
+ ->and($reflection->hasProperty('maxExceptions'))->toBeTrue()
+ ->and($reflection->hasMethod('backoff'))->toBeTrue()
+ ->and($reflection->hasMethod('failed'))->toBeTrue();
+
+ // Get default values from class definition
+ $defaultProperties = $reflection->getDefaultProperties();
+
+ expect($defaultProperties['tries'])->toBe(2)
+ ->and($defaultProperties['maxExceptions'])->toBe(1);
+});