Skip to content

Commit f0e174b

Browse files
committed
Add maximum bound for attempted_by array
As described by #972, it may be possible for huge numbers of snoozes to blow out a job row's `attempted_by` array as a job is locked over and over again. Multiplied across many jobs, this can produce vast quantities of data that get sent over the network. Here, put a ratchet on `attempted_by` so that if the array becomes larger than 100 elements, we knock the oldest one off in favor of the most recent client and the freshest 99. Unfortunately the implementation isn't particularly clean in either Postgres or SQLite. In Postgres it would've been cleaner if we'd kept `attempted_by` in reverse order with the newest client in front, because the built-in array functions would be friendlier to that layout; but since it's not, we have to do something a little hackier involving a `CASE` statement instead. SQLite is even worse. SQLite has no array functions at all, which doesn't help, but moreover every strategy I tried ended up blocked by a sqlc SQLite bug, so after trying everything I could think of, I ended up having to extract the piece that does the array truncation into a SQL template string to get this over the line. This could be removed in the future if any one of a number of outstanding sqlc bugs is fixed (e.g. [1]). [1] sqlc-dev/sqlc#3610
1 parent 4d07c0d commit f0e174b

File tree

12 files changed

+239
-60
lines changed

12 files changed

+239
-60
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
2222
- The reindexer maintenance service now reindexes all `river_job` indexes, including its primary key. This is expected to help in situations where the jobs table has in the past expanded to a very large size (which makes most indexes larger), is now a much more modest size, but has left the indexes in their expanded state. [PR #963](https://github.com/riverqueue/river/pull/963).
2323
- The River CLI now accepts a `--target-version` of 0 with `river migrate-down` to run all down migrations and remove all River tables (previously, -1 was used for this; -1 still works, but now 0 also works). [PR #966](https://github.com/riverqueue/river/pull/966).
2424
- **Breaking change:** The `HookWorkEnd` interface's `WorkEnd` function now receives a `JobRow` parameter in addition to the `error` it received before. Having a `JobRow` to work with is fairly crucial to most functionality that a hook would implement, and its previous omission was entirely an error. [PR #970](https://github.com/riverqueue/river/pull/970).
25+
- Add a maximum bound to each job's `attempted_by` array so that in degenerate cases where a job is locked many, many times (say it's snoozed hundreds of times), it doesn't grow without bound. [PR #974](https://github.com/riverqueue/river/pull/974).
2526

2627
### Fixed
2728

producer.go

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -742,13 +742,18 @@ func (p *producer) dispatchWork(workCtx context.Context, count int, fetchResultC
742742
// back to the queue.
743743
ctx := context.WithoutCancel(workCtx)
744744

745+
// Maximum size of the `attempted_by` array on each job row. This maximum is
746+
// rarely hit, but exists to protect against degenerate cases.
747+
const maxAttemptedBy = 100
748+
745749
jobs, err := p.pilot.JobGetAvailable(ctx, p.exec, p.state, &riverdriver.JobGetAvailableParams{
746-
ClientID: p.config.ClientID,
747-
Max: count,
748-
Now: p.Time.NowUTCOrNil(),
749-
Queue: p.config.Queue,
750-
ProducerID: p.id.Load(),
751-
Schema: p.config.Schema,
750+
ClientID: p.config.ClientID,
751+
Max: count,
752+
MaxAttemptedBy: maxAttemptedBy,
753+
Now: p.Time.NowUTCOrNil(),
754+
Queue: p.config.Queue,
755+
ProducerID: p.id.Load(),
756+
Schema: p.config.Schema,
752757
})
753758
if err != nil {
754759
p.Logger.Error(p.Name+": Error fetching jobs", slog.String("err", err.Error()), slog.String("queue", p.config.Queue))

riverdriver/river_driver_interface.go

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -356,12 +356,13 @@ type JobDeleteBeforeParams struct {
356356
}
357357

358358
type JobGetAvailableParams struct {
359-
ClientID string
360-
Max int
361-
Now *time.Time
362-
ProducerID int64
363-
Queue string
364-
Schema string
359+
ClientID string
360+
Max int
361+
MaxAttemptedBy int
362+
Now *time.Time
363+
ProducerID int64
364+
Queue string
365+
Schema string
365366
}
366367

367368
type JobGetByIDParams struct {

riverdriver/riverdatabasesql/internal/dbsqlc/river_job.sql.go

Lines changed: 19 additions & 7 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

riverdriver/riverdatabasesql/river_database_sql_driver.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -228,10 +228,11 @@ func (e *Executor) JobDeleteBefore(ctx context.Context, params *riverdriver.JobD
228228

229229
func (e *Executor) JobGetAvailable(ctx context.Context, params *riverdriver.JobGetAvailableParams) ([]*rivertype.JobRow, error) {
230230
jobs, err := dbsqlc.New().JobGetAvailable(schemaTemplateParam(ctx, params.Schema), e.dbtx, &dbsqlc.JobGetAvailableParams{
231-
AttemptedBy: params.ClientID,
232-
Max: int32(min(params.Max, math.MaxInt32)), //nolint:gosec
233-
Now: params.Now,
234-
Queue: params.Queue,
231+
AttemptedBy: params.ClientID,
232+
Max: int32(min(params.Max, math.MaxInt32)), //nolint:gosec
233+
MaxAttemptedBy: int32(min(params.MaxAttemptedBy, math.MaxInt32)),
234+
Now: params.Now,
235+
Queue: params.Queue,
235236
})
236237
if err != nil {
237238
return nil, interpretError(err)

riverdriver/riverdrivertest/riverdrivertest.go

Lines changed: 101 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -862,6 +862,11 @@ func Exercise[TTx any](ctx context.Context, t *testing.T,
862862
t.Run("JobGetAvailable", func(t *testing.T) {
863863
t.Parallel()
864864

865+
const (
866+
max = 100
867+
maxAttemptedBy = 10
868+
)
869+
865870
t.Run("Success", func(t *testing.T) {
866871
t.Parallel()
867872

@@ -870,9 +875,10 @@ func Exercise[TTx any](ctx context.Context, t *testing.T,
870875
_ = testfactory.Job(ctx, t, exec, &testfactory.JobOpts{})
871876

872877
jobRows, err := exec.JobGetAvailable(ctx, &riverdriver.JobGetAvailableParams{
873-
ClientID: clientID,
874-
Max: 100,
875-
Queue: rivercommon.QueueDefault,
878+
ClientID: clientID,
879+
Max: max,
880+
MaxAttemptedBy: maxAttemptedBy,
881+
Queue: rivercommon.QueueDefault,
876882
})
877883
require.NoError(t, err)
878884
require.Len(t, jobRows, 1)
@@ -891,9 +897,10 @@ func Exercise[TTx any](ctx context.Context, t *testing.T,
891897

892898
// Two rows inserted but only one found because of the added limit.
893899
jobRows, err := exec.JobGetAvailable(ctx, &riverdriver.JobGetAvailableParams{
894-
ClientID: clientID,
895-
Max: 1,
896-
Queue: rivercommon.QueueDefault,
900+
ClientID: clientID,
901+
Max: 1,
902+
MaxAttemptedBy: maxAttemptedBy,
903+
Queue: rivercommon.QueueDefault,
897904
})
898905
require.NoError(t, err)
899906
require.Len(t, jobRows, 1)
@@ -910,9 +917,10 @@ func Exercise[TTx any](ctx context.Context, t *testing.T,
910917

911918
// Job is in a non-default queue so it's not found.
912919
jobRows, err := exec.JobGetAvailable(ctx, &riverdriver.JobGetAvailableParams{
913-
ClientID: clientID,
914-
Max: 100,
915-
Queue: rivercommon.QueueDefault,
920+
ClientID: clientID,
921+
Max: max,
922+
MaxAttemptedBy: maxAttemptedBy,
923+
Queue: rivercommon.QueueDefault,
916924
})
917925
require.NoError(t, err)
918926
require.Empty(t, jobRows)
@@ -931,10 +939,11 @@ func Exercise[TTx any](ctx context.Context, t *testing.T,
931939

932940
// Job is scheduled a while from now so it's not found.
933941
jobRows, err := exec.JobGetAvailable(ctx, &riverdriver.JobGetAvailableParams{
934-
ClientID: clientID,
935-
Max: 100,
936-
Now: &now,
937-
Queue: rivercommon.QueueDefault,
942+
ClientID: clientID,
943+
Max: max,
944+
MaxAttemptedBy: maxAttemptedBy,
945+
Now: &now,
946+
Queue: rivercommon.QueueDefault,
938947
})
939948
require.NoError(t, err)
940949
require.Empty(t, jobRows)
@@ -956,10 +965,11 @@ func Exercise[TTx any](ctx context.Context, t *testing.T,
956965
})
957966

958967
jobRows, err := exec.JobGetAvailable(ctx, &riverdriver.JobGetAvailableParams{
959-
ClientID: clientID,
960-
Max: 100,
961-
Now: ptrutil.Ptr(now),
962-
Queue: rivercommon.QueueDefault,
968+
ClientID: clientID,
969+
Max: max,
970+
MaxAttemptedBy: maxAttemptedBy,
971+
Now: ptrutil.Ptr(now),
972+
Queue: rivercommon.QueueDefault,
963973
})
964974
require.NoError(t, err)
965975
require.Len(t, jobRows, 1)
@@ -979,9 +989,10 @@ func Exercise[TTx any](ctx context.Context, t *testing.T,
979989
}
980990

981991
jobRows, err := exec.JobGetAvailable(ctx, &riverdriver.JobGetAvailableParams{
982-
ClientID: clientID,
983-
Max: 2,
984-
Queue: rivercommon.QueueDefault,
992+
ClientID: clientID,
993+
Max: 2,
994+
MaxAttemptedBy: maxAttemptedBy,
995+
Queue: rivercommon.QueueDefault,
985996
})
986997
require.NoError(t, err)
987998
require.Len(t, jobRows, 2, "expected to fetch exactly 2 jobs")
@@ -998,15 +1009,82 @@ func Exercise[TTx any](ctx context.Context, t *testing.T,
9981009

9991010
// Should fetch the one remaining job on the next attempt:
10001011
jobRows, err = exec.JobGetAvailable(ctx, &riverdriver.JobGetAvailableParams{
1001-
ClientID: clientID,
1002-
Max: 1,
1003-
Queue: rivercommon.QueueDefault,
1012+
ClientID: clientID,
1013+
Max: 1,
1014+
MaxAttemptedBy: maxAttemptedBy,
1015+
Queue: rivercommon.QueueDefault,
10041016
})
10051017
require.NoError(t, err)
10061018
require.NoError(t, err)
10071019
require.Len(t, jobRows, 1, "expected to fetch exactly 1 job")
10081020
require.Equal(t, 3, jobRows[0].Priority, "expected final job to have priority 3")
10091021
})
1022+
1023+
t.Run("AttemptedByAtMaxTruncated", func(t *testing.T) {
1024+
t.Parallel()
1025+
1026+
exec, _ := setup(ctx, t)
1027+
1028+
attemptedBy := make([]string, maxAttemptedBy)
1029+
for i := range maxAttemptedBy {
1030+
attemptedBy[i] = "attempt_" + strconv.Itoa(i)
1031+
}
1032+
1033+
_ = testfactory.Job(ctx, t, exec, &testfactory.JobOpts{
1034+
AttemptedBy: attemptedBy,
1035+
})
1036+
1037+
// Fetch the job and verify that its `attempted_by`, already at the
// maximum length, had its oldest element knocked off in favor of the
// fetching client.
1038+
jobRows, err := exec.JobGetAvailable(ctx, &riverdriver.JobGetAvailableParams{
1039+
ClientID: clientID,
1040+
Max: max,
1041+
MaxAttemptedBy: maxAttemptedBy,
1042+
Queue: rivercommon.QueueDefault,
1043+
})
1044+
require.NoError(t, err)
1045+
require.Len(t, jobRows, 1)
1046+
1047+
jobRow := jobRows[0]
1048+
require.Equal(t, append(
1049+
attemptedBy[1:],
1050+
clientID,
1051+
), jobRow.AttemptedBy)
1052+
})
1053+
1054+
// Almost identical to the above, but tests that there are more existing
1055+
// `attempted_by` elements than the maximum allowed. There's a fine bug
1056+
// around use of > versus >= in the query's conditional, so make sure to
1057+
// capture both cases to make sure they work.
1058+
t.Run("AttemptedByOverMaxTruncated", func(t *testing.T) {
1059+
t.Parallel()
1060+
1061+
exec, _ := setup(ctx, t)
1062+
1063+
attemptedBy := make([]string, maxAttemptedBy+1)
1064+
for i := range maxAttemptedBy + 1 {
1065+
attemptedBy[i] = "attempt_" + strconv.Itoa(i)
1066+
}
1067+
1068+
_ = testfactory.Job(ctx, t, exec, &testfactory.JobOpts{
1069+
AttemptedBy: attemptedBy,
1070+
})
1071+
1072+
// Fetch the job and verify that its `attempted_by`, which started over
// the maximum length, was truncated back down to the maximum.
1073+
jobRows, err := exec.JobGetAvailable(ctx, &riverdriver.JobGetAvailableParams{
1074+
ClientID: clientID,
1075+
Max: max,
1076+
MaxAttemptedBy: maxAttemptedBy,
1077+
Queue: rivercommon.QueueDefault,
1078+
})
1079+
require.NoError(t, err)
1080+
require.Len(t, jobRows, 1)
1081+
1082+
jobRow := jobRows[0]
1083+
require.Equal(t, append(
1084+
attemptedBy[2:], // drop the first 2: maxAttemptedBy+1 existing elements plus the new client is 2 over the maximum
1085+
clientID,
1086+
), jobRow.AttemptedBy)
1087+
})
10101088
})
10111089

10121090
t.Run("JobGetByID", func(t *testing.T) {

riverdriver/riverpgxv5/internal/dbsqlc/river_job.sql

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,19 @@ SET
146146
state = 'running',
147147
attempt = river_job.attempt + 1,
148148
attempted_at = coalesce(sqlc.narg('now')::timestamptz, now()),
149-
attempted_by = array_append(river_job.attempted_by, @attempted_by::text)
149+
attempted_by = CASE WHEN array_length(river_job.attempted_by, 1) >= @max_attempted_by::int
150+
THEN array_append(
151+
-- +2 instead of +1 because in one of those history
152+
-- making mistakes that's likely in aggregate cost
153+
-- humanity >$10B in bugs and lost productivity by
154+
-- now, like strings, Postgres array indexing starts
155+
-- at 1 instead of 0.
156+
river_job.attempted_by[array_length(river_job.attempted_by,
157+
1) + 2 - @max_attempted_by:],
158+
@attempted_by::text
159+
)
160+
ELSE array_append(river_job.attempted_by, @attempted_by::text)
161+
END
150162
FROM
151163
locked_jobs
152164
WHERE

riverdriver/riverpgxv5/internal/dbsqlc/river_job.sql.go

Lines changed: 19 additions & 7 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)