mirror of
https://github.com/dagu-org/dagu.git
synced 2025-12-28 06:34:22 +00:00
* **New Features**
  * Added `cleanup` command to remove old DAG run history with configurable retention periods
  * Supports `--dry-run` flag to preview which runs would be removed without deleting them
  * Includes `--yes` flag to skip confirmation prompts
528 lines
15 KiB
Go
528 lines
15 KiB
Go
package scheduler
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"sync/atomic"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/dagu-org/dagu/internal/core"
|
|
"github.com/dagu-org/dagu/internal/core/execution"
|
|
"github.com/stretchr/testify/assert"
|
|
"github.com/stretchr/testify/mock"
|
|
)
|
|
|
|
func TestNewZombieDetector(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
dagRunStore := &mockDAGRunStore{}
|
|
procStore := &mockProcStore{}
|
|
|
|
// Test with default interval
|
|
detector := NewZombieDetector(dagRunStore, procStore, 0)
|
|
assert.NotNil(t, detector)
|
|
assert.Equal(t, 45*time.Second, detector.interval)
|
|
|
|
// Test with custom interval
|
|
detector = NewZombieDetector(dagRunStore, procStore, 60*time.Second)
|
|
assert.NotNil(t, detector)
|
|
assert.Equal(t, 60*time.Second, detector.interval)
|
|
}
|
|
|
|
func TestZombieDetector_detectAndCleanZombies(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
ctx := context.Background()
|
|
|
|
t.Run("NoRunningDAGs", func(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
dagRunStore := &mockDAGRunStore{}
|
|
procStore := &mockProcStore{}
|
|
detector := NewZombieDetector(dagRunStore, procStore, time.Second)
|
|
|
|
// No running DAGs
|
|
dagRunStore.On("ListStatuses", ctx, mock.Anything).Return([]*execution.DAGRunStatus{}, nil)
|
|
|
|
detector.detectAndCleanZombies(ctx)
|
|
|
|
dagRunStore.AssertExpectations(t)
|
|
procStore.AssertExpectations(t)
|
|
})
|
|
|
|
t.Run("RunningDAGIsAlive", func(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
dagRunStore := &mockDAGRunStore{}
|
|
procStore := &mockProcStore{}
|
|
detector := NewZombieDetector(dagRunStore, procStore, time.Second)
|
|
|
|
// One running DAG
|
|
runningStatus := &execution.DAGRunStatus{
|
|
Name: "test-dag",
|
|
DAGRunID: "run-123",
|
|
Status: core.Running,
|
|
}
|
|
dagRunStore.On("ListStatuses", ctx, mock.Anything).Return([]*execution.DAGRunStatus{runningStatus}, nil)
|
|
|
|
// Mock attempt
|
|
attempt := &mockDAGRunAttempt{}
|
|
dagRunRef := execution.NewDAGRunRef("test-dag", "run-123")
|
|
dagRunStore.On("FindAttempt", mock.Anything, dagRunRef).Return(attempt, nil)
|
|
|
|
// Mock DAG
|
|
dag := &core.DAG{Name: "test-dag"}
|
|
attempt.On("ReadDAG", mock.Anything).Return(dag, nil)
|
|
|
|
// Process is alive
|
|
procRef := execution.DAGRunRef{
|
|
Name: dag.Name,
|
|
ID: "run-123",
|
|
}
|
|
procStore.On("IsRunAlive", mock.Anything, dag.ProcGroup(), procRef).Return(true, nil)
|
|
|
|
detector.detectAndCleanZombies(ctx)
|
|
|
|
// Should not update status since process is alive
|
|
attempt.AssertNotCalled(t, "Open", mock.Anything)
|
|
attempt.AssertNotCalled(t, "Write", mock.Anything, mock.Anything)
|
|
attempt.AssertNotCalled(t, "Close", mock.Anything)
|
|
|
|
dagRunStore.AssertExpectations(t)
|
|
procStore.AssertExpectations(t)
|
|
attempt.AssertExpectations(t)
|
|
})
|
|
|
|
t.Run("RunningDAGIsZombie", func(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
dagRunStore := &mockDAGRunStore{}
|
|
procStore := &mockProcStore{}
|
|
detector := NewZombieDetector(dagRunStore, procStore, time.Second)
|
|
|
|
// One running DAG
|
|
runningStatus := &execution.DAGRunStatus{
|
|
Name: "test-dag",
|
|
DAGRunID: "run-123",
|
|
Status: core.Running,
|
|
}
|
|
dagRunStore.On("ListStatuses", ctx, mock.Anything).Return([]*execution.DAGRunStatus{runningStatus}, nil)
|
|
|
|
// Mock attempt
|
|
attempt := &mockDAGRunAttempt{}
|
|
dagRunRef := execution.NewDAGRunRef("test-dag", "run-123")
|
|
dagRunStore.On("FindAttempt", mock.Anything, dagRunRef).Return(attempt, nil)
|
|
|
|
// Mock DAG
|
|
dag := &core.DAG{Name: "test-dag"}
|
|
attempt.On("ReadDAG", mock.Anything).Return(dag, nil)
|
|
|
|
// Process is NOT alive (zombie)
|
|
procRef := execution.DAGRunRef{
|
|
Name: dag.Name,
|
|
ID: "run-123",
|
|
}
|
|
procStore.On("IsRunAlive", mock.Anything, dag.ProcGroup(), procRef).Return(false, nil)
|
|
|
|
// Expect status update
|
|
attempt.On("Open", mock.Anything).Return(nil)
|
|
attempt.On("Write", mock.Anything, mock.MatchedBy(func(s execution.DAGRunStatus) bool {
|
|
return s.Status == core.Failed && s.FinishedAt != ""
|
|
})).Return(nil)
|
|
attempt.On("Close", mock.Anything).Return(nil)
|
|
|
|
detector.detectAndCleanZombies(ctx)
|
|
|
|
dagRunStore.AssertExpectations(t)
|
|
procStore.AssertExpectations(t)
|
|
attempt.AssertExpectations(t)
|
|
})
|
|
|
|
t.Run("ErrorListingStatuses", func(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
dagRunStore := &mockDAGRunStore{}
|
|
procStore := &mockProcStore{}
|
|
detector := NewZombieDetector(dagRunStore, procStore, time.Second)
|
|
|
|
// Error listing statuses
|
|
dagRunStore.On("ListStatuses", ctx, mock.Anything).Return([]*execution.DAGRunStatus(nil), errors.New("db error"))
|
|
|
|
// Should handle error gracefully
|
|
detector.detectAndCleanZombies(ctx)
|
|
|
|
dagRunStore.AssertExpectations(t)
|
|
procStore.AssertExpectations(t)
|
|
})
|
|
}
|
|
|
|
func TestZombieDetector_Start(t *testing.T) {
|
|
dagRunStore := &mockDAGRunStore{}
|
|
procStore := &mockProcStore{}
|
|
detector := NewZombieDetector(dagRunStore, procStore, 50*time.Millisecond)
|
|
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
|
|
// Set up expectations
|
|
callCount := atomic.Int32{}
|
|
dagRunStore.On("ListStatuses", ctx, mock.Anything).Return([]*execution.DAGRunStatus{}, nil).Run(func(_ mock.Arguments) {
|
|
callCount.Add(1)
|
|
})
|
|
|
|
// Start detector in background
|
|
go detector.Start(ctx)
|
|
|
|
// Wait for at least 2 ticks
|
|
time.Sleep(150 * time.Millisecond)
|
|
|
|
// Cancel context to stop
|
|
cancel()
|
|
|
|
// Give it time to stop
|
|
time.Sleep(50 * time.Millisecond)
|
|
|
|
// Should have been called at least twice
|
|
assert.GreaterOrEqual(t, callCount.Load(), int32(2))
|
|
|
|
dagRunStore.AssertExpectations(t)
|
|
}
|
|
|
|
func TestZombieDetector_checkAndCleanZombie_errors(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
ctx := context.Background()
|
|
|
|
t.Run("ErrorFindingAttempt", func(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
dagRunStore := &mockDAGRunStore{}
|
|
procStore := &mockProcStore{}
|
|
detector := NewZombieDetector(dagRunStore, procStore, time.Second)
|
|
|
|
status := &execution.DAGRunStatus{
|
|
Name: "test-dag",
|
|
DAGRunID: "run-123",
|
|
Status: core.Running,
|
|
}
|
|
|
|
dagRunRef := execution.NewDAGRunRef("test-dag", "run-123")
|
|
dagRunStore.On("FindAttempt", mock.Anything, dagRunRef).Return((*mockDAGRunAttempt)(nil), errors.New("not found"))
|
|
|
|
err := detector.checkAndCleanZombie(ctx, status)
|
|
assert.Error(t, err)
|
|
assert.Contains(t, err.Error(), "find attempt")
|
|
|
|
dagRunStore.AssertExpectations(t)
|
|
})
|
|
|
|
t.Run("ErrorReadingDAG", func(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
dagRunStore := &mockDAGRunStore{}
|
|
procStore := &mockProcStore{}
|
|
detector := NewZombieDetector(dagRunStore, procStore, time.Second)
|
|
|
|
status := &execution.DAGRunStatus{
|
|
Name: "test-dag",
|
|
DAGRunID: "run-123",
|
|
Status: core.Running,
|
|
}
|
|
|
|
attempt := &mockDAGRunAttempt{}
|
|
dagRunRef := execution.NewDAGRunRef("test-dag", "run-123")
|
|
dagRunStore.On("FindAttempt", mock.Anything, dagRunRef).Return(attempt, nil)
|
|
attempt.On("ReadDAG", mock.Anything).Return((*core.DAG)(nil), errors.New("read error"))
|
|
|
|
err := detector.checkAndCleanZombie(ctx, status)
|
|
assert.Error(t, err)
|
|
assert.Contains(t, err.Error(), "read dag")
|
|
|
|
dagRunStore.AssertExpectations(t)
|
|
attempt.AssertExpectations(t)
|
|
})
|
|
|
|
t.Run("ErrorCheckingIfAlive", func(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
dagRunStore := &mockDAGRunStore{}
|
|
procStore := &mockProcStore{}
|
|
detector := NewZombieDetector(dagRunStore, procStore, time.Second)
|
|
|
|
status := &execution.DAGRunStatus{
|
|
Name: "test-dag",
|
|
DAGRunID: "run-123",
|
|
Status: core.Running,
|
|
}
|
|
|
|
attempt := &mockDAGRunAttempt{}
|
|
dagRunRef := execution.NewDAGRunRef("test-dag", "run-123")
|
|
dagRunStore.On("FindAttempt", mock.Anything, dagRunRef).Return(attempt, nil)
|
|
|
|
dag := &core.DAG{Name: "test-dag"}
|
|
attempt.On("ReadDAG", mock.Anything).Return(dag, nil)
|
|
|
|
procRef := execution.DAGRunRef{
|
|
Name: dag.Name,
|
|
ID: "run-123",
|
|
}
|
|
procStore.On("IsRunAlive", mock.Anything, dag.ProcGroup(), procRef).Return(false, errors.New("check error"))
|
|
|
|
err := detector.checkAndCleanZombie(ctx, status)
|
|
assert.Error(t, err)
|
|
assert.Contains(t, err.Error(), "check alive")
|
|
|
|
dagRunStore.AssertExpectations(t)
|
|
procStore.AssertExpectations(t)
|
|
attempt.AssertExpectations(t)
|
|
})
|
|
|
|
t.Run("ErrorUpdatingStatus", func(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
dagRunStore := &mockDAGRunStore{}
|
|
procStore := &mockProcStore{}
|
|
detector := NewZombieDetector(dagRunStore, procStore, time.Second)
|
|
|
|
status := &execution.DAGRunStatus{
|
|
Name: "test-dag",
|
|
DAGRunID: "run-123",
|
|
Status: core.Running,
|
|
}
|
|
|
|
attempt := &mockDAGRunAttempt{}
|
|
dagRunRef := execution.NewDAGRunRef("test-dag", "run-123")
|
|
dagRunStore.On("FindAttempt", mock.Anything, dagRunRef).Return(attempt, nil)
|
|
|
|
dag := &core.DAG{Name: "test-dag"}
|
|
attempt.On("ReadDAG", mock.Anything).Return(dag, nil)
|
|
|
|
procRef := execution.DAGRunRef{
|
|
Name: dag.Name,
|
|
ID: "run-123",
|
|
}
|
|
procStore.On("IsRunAlive", mock.Anything, dag.ProcGroup(), procRef).Return(false, nil)
|
|
|
|
// Fail to open attempt
|
|
attempt.On("Open", mock.Anything).Return(errors.New("open error"))
|
|
|
|
err := detector.checkAndCleanZombie(ctx, status)
|
|
assert.Error(t, err)
|
|
assert.Contains(t, err.Error(), "update status")
|
|
|
|
dagRunStore.AssertExpectations(t)
|
|
procStore.AssertExpectations(t)
|
|
attempt.AssertExpectations(t)
|
|
})
|
|
}
|
|
|
|
func TestZombieDetector_concurrency(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
dagRunStore := &mockDAGRunStore{}
|
|
procStore := &mockProcStore{}
|
|
detector := NewZombieDetector(dagRunStore, procStore, 10*time.Millisecond)
|
|
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
defer cancel()
|
|
|
|
// Make detectAndCleanZombies take longer than the interval
|
|
slowCallCount := atomic.Int32{}
|
|
dagRunStore.On("ListStatuses", ctx, mock.Anything).Return([]*execution.DAGRunStatus{}, nil).Run(func(_ mock.Arguments) {
|
|
slowCallCount.Add(1)
|
|
time.Sleep(30 * time.Millisecond) // Slower than interval
|
|
})
|
|
|
|
// Start detector
|
|
go detector.Start(ctx)
|
|
|
|
// Let it run for a while
|
|
time.Sleep(100 * time.Millisecond)
|
|
|
|
// Cancel to stop
|
|
cancel()
|
|
|
|
// Should have skipped some calls due to concurrency protection
|
|
// With 100ms runtime and 10ms interval, without protection we'd expect ~10 calls
|
|
// With protection, we should see fewer calls
|
|
callCount := slowCallCount.Load()
|
|
t.Logf("Call count: %d", callCount)
|
|
|
|
// Should be less than what we'd expect without concurrency protection
|
|
assert.Less(t, callCount, int32(8))
|
|
assert.GreaterOrEqual(t, callCount, int32(2))
|
|
|
|
dagRunStore.AssertExpectations(t)
|
|
}
|
|
|
|
// Mock DAGRunStore
|
|
type mockDAGRunStore struct {
|
|
mock.Mock
|
|
}
|
|
|
|
func (m *mockDAGRunStore) CreateAttempt(ctx context.Context, dag *core.DAG, ts time.Time, dagRunID string, opts execution.NewDAGRunAttemptOptions) (execution.DAGRunAttempt, error) {
|
|
args := m.Called(ctx, dag, ts, dagRunID, opts)
|
|
return args.Get(0).(execution.DAGRunAttempt), args.Error(1)
|
|
}
|
|
|
|
func (m *mockDAGRunStore) RecentAttempts(ctx context.Context, name string, itemLimit int) []execution.DAGRunAttempt {
|
|
args := m.Called(ctx, name, itemLimit)
|
|
return args.Get(0).([]execution.DAGRunAttempt)
|
|
}
|
|
|
|
func (m *mockDAGRunStore) LatestAttempt(ctx context.Context, name string) (execution.DAGRunAttempt, error) {
|
|
args := m.Called(ctx, name)
|
|
return args.Get(0).(execution.DAGRunAttempt), args.Error(1)
|
|
}
|
|
|
|
func (m *mockDAGRunStore) ListStatuses(ctx context.Context, opts ...execution.ListDAGRunStatusesOption) ([]*execution.DAGRunStatus, error) {
|
|
args := m.Called(ctx, opts)
|
|
if args.Get(0) == nil {
|
|
return nil, args.Error(1)
|
|
}
|
|
return args.Get(0).([]*execution.DAGRunStatus), args.Error(1)
|
|
}
|
|
|
|
func (m *mockDAGRunStore) FindAttempt(ctx context.Context, dagRun execution.DAGRunRef) (execution.DAGRunAttempt, error) {
|
|
args := m.Called(ctx, dagRun)
|
|
if args.Get(0) == nil {
|
|
return nil, args.Error(1)
|
|
}
|
|
return args.Get(0).(execution.DAGRunAttempt), args.Error(1)
|
|
}
|
|
|
|
func (m *mockDAGRunStore) FindSubAttempt(ctx context.Context, dagRun execution.DAGRunRef, subDAGRunID string) (execution.DAGRunAttempt, error) {
|
|
args := m.Called(ctx, dagRun, subDAGRunID)
|
|
return args.Get(0).(execution.DAGRunAttempt), args.Error(1)
|
|
}
|
|
|
|
func (m *mockDAGRunStore) RemoveOldDAGRuns(ctx context.Context, name string, retentionDays int, opts ...execution.RemoveOldDAGRunsOption) ([]string, error) {
|
|
args := m.Called(ctx, name, retentionDays, opts)
|
|
if args.Get(0) == nil {
|
|
return nil, args.Error(1)
|
|
}
|
|
return args.Get(0).([]string), args.Error(1)
|
|
}
|
|
|
|
func (m *mockDAGRunStore) RenameDAGRuns(ctx context.Context, oldName, newName string) error {
|
|
args := m.Called(ctx, oldName, newName)
|
|
return args.Error(0)
|
|
}
|
|
|
|
func (m *mockDAGRunStore) RemoveDAGRun(ctx context.Context, dagRun execution.DAGRunRef) error {
|
|
args := m.Called(ctx, dagRun)
|
|
return args.Error(0)
|
|
}
|
|
|
|
var _ execution.ProcStore = (*mockProcStore)(nil)
|
|
|
|
// Mock ProcStore
|
|
type mockProcStore struct {
|
|
mock.Mock
|
|
}
|
|
|
|
// Lock implements execution.ProcStore.
|
|
func (m *mockProcStore) Lock(_ context.Context, _ string) error {
|
|
return nil
|
|
}
|
|
|
|
// CountAliveByDAGName implements models.ProcStore.
|
|
func (m *mockProcStore) CountAliveByDAGName(_ context.Context, _, _ string) (int, error) {
|
|
return 0, nil
|
|
}
|
|
|
|
// TryLock implements models.ProcStore.
|
|
func (m *mockProcStore) TryLock(_ context.Context, _ string) error {
|
|
return nil
|
|
}
|
|
|
|
// Unlock implements models.ProcStore.
|
|
func (m *mockProcStore) Unlock(_ context.Context, _ string) {
|
|
}
|
|
|
|
func (m *mockProcStore) Acquire(ctx context.Context, groupName string, dagRun execution.DAGRunRef) (execution.ProcHandle, error) {
|
|
args := m.Called(ctx, groupName, dagRun)
|
|
return args.Get(0).(execution.ProcHandle), args.Error(1)
|
|
}
|
|
|
|
func (m *mockProcStore) CountAlive(ctx context.Context, groupName string) (int, error) {
|
|
args := m.Called(ctx, groupName)
|
|
return args.Int(0), args.Error(1)
|
|
}
|
|
|
|
func (m *mockProcStore) IsRunAlive(ctx context.Context, groupName string, dagRun execution.DAGRunRef) (bool, error) {
|
|
args := m.Called(ctx, groupName, dagRun)
|
|
return args.Bool(0), args.Error(1)
|
|
}
|
|
|
|
func (m *mockProcStore) ListAlive(ctx context.Context, groupName string) ([]execution.DAGRunRef, error) {
|
|
args := m.Called(ctx, groupName)
|
|
if args.Get(0) == nil {
|
|
return nil, args.Error(1)
|
|
}
|
|
return args.Get(0).([]execution.DAGRunRef), args.Error(1)
|
|
}
|
|
|
|
func (m *mockProcStore) ListAllAlive(ctx context.Context) (map[string][]execution.DAGRunRef, error) {
|
|
args := m.Called(ctx)
|
|
if args.Get(0) == nil {
|
|
return nil, args.Error(1)
|
|
}
|
|
return args.Get(0).(map[string][]execution.DAGRunRef), args.Error(1)
|
|
}
|
|
|
|
// Mock DAGRunAttempt
|
|
type mockDAGRunAttempt struct {
|
|
mock.Mock
|
|
}
|
|
|
|
func (m *mockDAGRunAttempt) ID() string {
|
|
args := m.Called()
|
|
return args.String(0)
|
|
}
|
|
|
|
func (m *mockDAGRunAttempt) Open(ctx context.Context) error {
|
|
args := m.Called(ctx)
|
|
return args.Error(0)
|
|
}
|
|
|
|
func (m *mockDAGRunAttempt) Write(ctx context.Context, status execution.DAGRunStatus) error {
|
|
args := m.Called(ctx, status)
|
|
return args.Error(0)
|
|
}
|
|
|
|
func (m *mockDAGRunAttempt) Close(ctx context.Context) error {
|
|
args := m.Called(ctx)
|
|
return args.Error(0)
|
|
}
|
|
|
|
func (m *mockDAGRunAttempt) ReadStatus(ctx context.Context) (*execution.DAGRunStatus, error) {
|
|
args := m.Called(ctx)
|
|
return args.Get(0).(*execution.DAGRunStatus), args.Error(1)
|
|
}
|
|
|
|
func (m *mockDAGRunAttempt) ReadDAG(ctx context.Context) (*core.DAG, error) {
|
|
args := m.Called(ctx)
|
|
return args.Get(0).(*core.DAG), args.Error(1)
|
|
}
|
|
|
|
func (m *mockDAGRunAttempt) Abort(ctx context.Context) error {
|
|
args := m.Called(ctx)
|
|
return args.Error(0)
|
|
}
|
|
|
|
func (m *mockDAGRunAttempt) IsAborting(ctx context.Context) (bool, error) {
|
|
args := m.Called(ctx)
|
|
return args.Bool(0), args.Error(1)
|
|
}
|
|
|
|
func (m *mockDAGRunAttempt) Hide(ctx context.Context) error {
|
|
args := m.Called(ctx)
|
|
return args.Error(0)
|
|
}
|
|
|
|
func (m *mockDAGRunAttempt) Hidden() bool {
|
|
args := m.Called()
|
|
return args.Bool(0)
|
|
}
|