fix: cancel dependent jobs when rerunning jobs (#12756)

When a single Forgejo Actions job should be rerun, its dependent jobs (those that have the job to be rerun in their `needs:`) might still be running. That means they cannot be rerun. Currently, Forgejo ignores and simply skips them. But that is wrong: their outcome is meaningless at best and wrong at worst because it depends on an outdated attempt of the job to be rerun. This is remedied by cancelling them before queueing them for a rerun, too.

Follow-up of https://codeberg.org/forgejo/forgejo/pulls/12141.

## Checklist

The [contributor guide](https://forgejo.org/docs/next/contributor/) contains information that will be helpful to first time contributors. All work and communication must conform to Forgejo's [AI Agreement](https://codeberg.org/forgejo/governance/src/branch/main/AIAgreement.md). There also are a few [conditions for merging Pull Requests in Forgejo repositories](https://codeberg.org/forgejo/governance/src/branch/main/PullRequestsAgreement.md). You are also welcome to join the [Forgejo development chatroom](https://matrix.to/#/#forgejo-development:matrix.org).

### Tests for Go changes

- I added test coverage for Go changes...
  - [x] in their respective `*_test.go` for unit tests.
  - [ ] in the `tests/integration` directory if it involves interactions with a live Forgejo server.
- I ran...
  - [x] `make pr-go` before pushing

### Documentation

- [ ] I created a pull request [to the documentation](https://codeberg.org/forgejo/docs) to explain to Forgejo users how to use this change.
- [x] I did not document these changes and I do not expect someone else to do it.

### Release notes

- [x] This change will be noticed by a Forgejo user or admin (feature, bug fix, performance, etc.). I suggest to include a release note for this change.
- [ ] This change is not visible to a Forgejo user or admin (refactor, dependency upgrade, etc.). I think there is no need to add a release note for this change.

Reviewed-on: https://codeberg.org/forgejo/forgejo/pulls/12756
Reviewed-by: Michael Kriese <michael.kriese@gmx.de>
Reviewed-by: Mathieu Fenniak <mfenniak@noreply.codeberg.org>
This commit is contained in:
Andreas Ahlenstorf 2026-05-29 04:46:57 +02:00 committed by Mathieu Fenniak
commit 531a43a9a5
3 changed files with 103 additions and 19 deletions

View file

@ -13,6 +13,8 @@ import (
"forgejo.org/models/db"
"forgejo.org/models/unit"
"forgejo.org/modules/container"
"forgejo.org/modules/timeutil"
"forgejo.org/modules/util"
"xorm.io/builder"
)
@ -153,16 +155,28 @@ func RerunJob(ctx context.Context, job *actions_model.ActionRunJob) ([]*actions_
}
for _, jobToRerun := range GetAllRerunJobs(job, jobs) {
// If the dependent job is still running, cancel it so that it can be rerun, too. Its results are obsolete
// when the job it depends on is rerun.
if !jobToRerun.Status.IsDone() {
if err := cancelSingleJob(ctx, jobToRerun, actions_model.StatusCancelled); err != nil {
return fmt.Errorf("cannot cancel dependent job %d with status %s: %w",
jobToRerun.ID, jobToRerun.Status, err)
}
// Refresh the job after cancellation.
if jobToRerun, err = actions_model.GetRunJobByID(ctx, jobToRerun.ID); err != nil {
return fmt.Errorf("cannot refresh cancelled dependent job %d: %w", jobToRerun.ID, err)
}
}
canBeRerun, err := jobToRerun.CanBeRerun(ctx)
if err != nil {
return fmt.Errorf("cannot determine whether job %d can be rerun: %w", jobToRerun.ID, err)
}
// Skipping jobs that cannot be rerun is wrong. They should be cancelled and rerun, instead, because they
// are dependent jobs and the old results might be worthless, anyway. But we keep that behaviour for now,
// because changing it requires more rework.
// This should never happen because the run was validated and the job cancelled if it was running.
if !canBeRerun {
continue
return fmt.Errorf("cannot rerun dependent job %d", jobToRerun.ID)
}
// The job that should be rerun cannot be blocked, even if it has needs.
@ -201,3 +215,35 @@ func rerunSingleJob(ctx context.Context, job *actions_model.ActionRunJob, initia
return nil
}
// cancelSingleJob cancels the given job and its associated task, if any. outcomeStatus defines the status that should
// be assigned to the cancelled job and its associated task after cancellation; a non-terminal status will result in an
// error. Nothing happens if the job has already been completed.
func cancelSingleJob(ctx context.Context, job *actions_model.ActionRunJob, outcomeStatus actions_model.Status) error {
if !outcomeStatus.IsDone() {
return fmt.Errorf("outcomeStatus must be a terminal status, but is: %s", outcomeStatus)
}
if job.Status.IsDone() {
return nil
}
return db.WithTx(ctx, func(ctx context.Context) error {
if job.TaskID == 0 {
job.Status = outcomeStatus
job.Stopped = timeutil.TimeStampNow()
_, err := UpdateRunJob(ctx, job, nil, "status", "stopped")
if err != nil {
return fmt.Errorf("could not cancel job %d: %w", job.ID, err)
}
}
// A task might have been created while we're trying to cancel the job. Therefore, always try to stop the task.
if err := StopTask(ctx, job.TaskID, outcomeStatus); err != nil {
if errors.Is(err, util.ErrNotExist) {
return nil
}
return err
}
return nil
})
}

View file

@ -225,6 +225,58 @@ func TestRerun_RerunJob(t *testing.T) {
assert.Equal(t, timeutil.TimeStamp(0), dependentJob.Stopped)
})
t.Run("Rerun job needed by others with cancellation", func(t *testing.T) {
defer unittest.OverrideFixtures("services/actions/TestRerun_RerunJob")()
require.NoError(t, unittest.PrepareTestDatabase())
job := unittest.AssertExistsAndLoadBean(t, &actions_model.ActionRunJob{ID: 683911})
rerunJobs, err := RerunJob(t.Context(), job)
require.NoError(t, err)
assert.Equal(t, int64(683911), rerunJobs[0].ID)
assert.Equal(t, int64(683912), rerunJobs[1].ID)
// Cancel the first job so that we can rerun it.
require.NoError(t, cancelSingleJob(t.Context(), job, actions_model.StatusFailure))
job = unittest.AssertExistsAndLoadBean(t, &actions_model.ActionRunJob{ID: 683911})
assert.Equal(t, int64(2), job.Attempt)
assert.Equal(t, actions_model.StatusFailure, job.Status)
assert.Equal(t, timeutil.TimeStamp(0), job.Started)
assert.NotZero(t, job.Stopped)
dependentJob := unittest.AssertExistsAndLoadBean(t, &actions_model.ActionRunJob{ID: 683912})
assert.Equal(t, int64(2), dependentJob.Attempt)
assert.Equal(t, actions_model.StatusBlocked, dependentJob.Status)
assert.Equal(t, timeutil.TimeStamp(0), dependentJob.Started)
assert.Equal(t, timeutil.TimeStamp(0), dependentJob.Stopped)
// Rerun again. The dependent job should be cancelled first.
rerunJobs, err = RerunJob(t.Context(), job)
require.NoError(t, err)
assert.Len(t, rerunJobs, 2)
assert.Equal(t, int64(683911), rerunJobs[0].ID)
assert.Equal(t, int64(683912), rerunJobs[1].ID)
job = unittest.AssertExistsAndLoadBean(t, &actions_model.ActionRunJob{ID: 683911})
assert.Equal(t, int64(3), job.Attempt)
assert.Equal(t, actions_model.StatusWaiting, job.Status)
assert.Equal(t, timeutil.TimeStamp(0), job.Started)
assert.Equal(t, timeutil.TimeStamp(0), job.Stopped)
dependentJob = unittest.AssertExistsAndLoadBean(t, &actions_model.ActionRunJob{ID: 683912})
assert.Equal(t, int64(3), dependentJob.Attempt)
assert.Equal(t, actions_model.StatusBlocked, dependentJob.Status)
assert.Equal(t, timeutil.TimeStamp(0), dependentJob.Started)
assert.Equal(t, timeutil.TimeStamp(0), dependentJob.Stopped)
})
t.Run("Rerun job with needs", func(t *testing.T) {
defer unittest.OverrideFixtures("services/actions/TestRerun_RerunJob")()
require.NoError(t, unittest.PrepareTestDatabase())

View file

@ -11,7 +11,6 @@ import (
actions_model "forgejo.org/models/actions"
"forgejo.org/models/db"
"forgejo.org/modules/timeutil"
)
func killRun(ctx context.Context, run *actions_model.ActionRun, newStatus actions_model.Status) error {
@ -21,20 +20,7 @@ func killRun(ctx context.Context, run *actions_model.ActionRun, newStatus action
return err
}
for _, job := range jobs {
oldStatus := job.Status
if oldStatus.IsDone() {
continue
}
if job.TaskID == 0 {
job.Status = newStatus
job.Stopped = timeutil.TimeStampNow()
_, err := UpdateRunJob(ctx, job, nil, "status", "stopped")
if err != nil {
return err
}
continue
}
if err := StopTask(ctx, job.TaskID, newStatus); err != nil {
if err := cancelSingleJob(ctx, job, newStatus); err != nil {
return err
}
}