From 3b4092baa9aae9f01c23532b0889803846cc04ce Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Thu, 6 Jun 2024 06:16:13 +0100 Subject: [PATCH 001/155] Start work on batch fetching jobs --- src/index.ts | 43 ++++++++++++++++++++++++++++++++ src/main.ts | 62 +++++++++++++++++++++++++++++++++++++++++------ src/sql/getJob.ts | 26 ++++++++++---------- 3 files changed, 110 insertions(+), 21 deletions(-) diff --git a/src/index.ts b/src/index.ts index ea21d5c6..54c0074c 100644 --- a/src/index.ts +++ b/src/index.ts @@ -154,6 +154,49 @@ declare global { logger?: Logger; events?: WorkerEvents; + + /** + * To enable processing jobs in batches, set this to an integer larger + * than 1. This will result in jobs being fetched by the pool rather than + * the worker, the pool will fetch (and lock!) `getJobBatchSize` jobs up + * front, and each time a worker requests a job it will be served from + * this list until the list is exhausted, at which point a new set of + * jobs will be fetched (and locked). + * + * This setting can help reduce the load on your database from looking + * for jobs, but is only really effective when there are often many jobs + * queued and ready to go, and can increase the latency of job execution + * because a single worker may lock jobs into its queue leaving other + * workers idle. + * + * @default `1` + */ + getJobBatchSize?: number; + + /** + * The time in milliseconds to wait after a `completeJob` call to see if + * there are any other completeJob calls that can be batched together. A + * setting of `-1` disables this. + * + * Enabling this feature increases the time for which jobs are locked + * past completion, thus increasing the risk of catastrophic failure + * resulting in the jobs being executed again once they expire. + * + * @default `-1` + */ + completeJobBatchDelay?: number; + + /** + * The time in milliseconds to wait after a `failJob` call to see if + * there are any other failJob calls that can be batched together. 
A + * setting of `-1` disables this. + * + * Enabling this feature increases the time for which jobs are locked + * past failure. + * + * @default `-1` + */ + failJobBatchDelay?: number; } interface Preset { worker?: WorkerOptions; diff --git a/src/main.ts b/src/main.ts index ccc653b2..ba6349cf 100644 --- a/src/main.ts +++ b/src/main.ts @@ -545,7 +545,13 @@ export function _runTaskList( ): WorkerPool { const { resolvedPreset: { - worker: { concurrentJobs: baseConcurrency, gracefulShutdownAbortTimeout }, + worker: { + concurrentJobs: baseConcurrency, + gracefulShutdownAbortTimeout, + getJobBatchSize = 1, + completeJobBatchDelay = -1, + failJobBatchDelay = -1, + }, }, _rawOptions: { noHandleSignals = false }, } = compiledSharedOptions; @@ -859,14 +865,54 @@ export function _runTaskList( `You must not set workerId when concurrency > 1; each worker must have a unique identifier`, ); } + let jobQueue: Job[] = []; + let nextJobs: Promise | null = null; const getJob: GetJobFunction = async (workerId, flagsToSkip) => { - return baseGetJob( - compiledSharedOptions, - withPgClient, - tasks, - workerId, - flagsToSkip, - ); + if (flagsToSkip !== null || getJobBatchSize <= 1) { + const jobs = await baseGetJob( + compiledSharedOptions, + withPgClient, + tasks, + workerId, + flagsToSkip, + 1, + ); + return jobs[0]; + } else { + const job = jobQueue.pop(); + if (job) { + // Queue already has a job, run that + return job; + } else { + if (!nextJobs) { + // Queue is empty, no fetch of jobs in progress; let's fetch them + nextJobs = (async () => { + const jobs = await baseGetJob( + compiledSharedOptions, + withPgClient, + tasks, + workerId, + flagsToSkip, + getJobBatchSize, + ); + jobQueue = jobs.reverse(); + // Return true if we fetched the full batch size + const fetchAgain = jobs.length >= getJobBatchSize; + return fetchAgain; + })(); + } + /** If true, the full batch size was fetched, so if the queue is exhausted again it's likely that there will be more jobs */ + const 
fetchAgain = await nextJobs; + const job = jobQueue.pop(); + if (job) { + return job; + } else if (fetchAgain) { + return getJob(workerId, flagsToSkip); + } else { + return undefined; + } + } + } }; for (let i = 0; i < concurrency; i++) { const worker = makeNewWorker(compiledSharedOptions, { diff --git a/src/sql/getJob.ts b/src/sql/getJob.ts index 7f1f2247..eb3bb15b 100644 --- a/src/sql/getJob.ts +++ b/src/sql/getJob.ts @@ -17,7 +17,9 @@ export async function getJob( tasks: TaskList, poolId: string, flagsToSkip: string[] | null, -): Promise { + rawBatchSize: number, +): Promise { + const batchSize = parseInt(String(rawBatchSize), 10) || 1; const { escapedWorkerSchema, workerSchema, @@ -38,7 +40,7 @@ export async function getJob( if (taskDetails.taskIds.length === 0) { logger.error("No tasks found; nothing to do!"); - return undefined; + return []; } let i = 2; @@ -157,7 +159,7 @@ with j as ( ${queueClause} ${flagsClause} order by priority asc, run_at asc - limit 1 + limit ${batchSize} for update skip locked )${updateQueue} @@ -179,23 +181,21 @@ with j as ( ]; const name = !preparedStatements ? undefined - : `get_job${hasFlags ? "F" : ""}${useNodeTime ? "N" : ""}/${workerSchema}`; + : `get_job${batchSize === 1 ? "" : batchSize}${hasFlags ? "F" : ""}${ + useNodeTime ? 
"N" : "" + }/${workerSchema}`; - const { - rows: [jobRow], - } = await withPgClient.withRetries((client) => + const { rows } = await withPgClient.withRetries((client) => client.query({ text, values, name, }), ); - if (jobRow) { - return Object.assign(jobRow, { + return rows.reverse().map((jobRow) => + Object.assign(jobRow, { task_identifier: taskDetails.supportedTaskIdentifierByTaskId[jobRow.task_id], - }); - } else { - return undefined; - } + }), + ); } From ecaa062e8d719b1e203752629ad51fdaa01c2471 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Thu, 6 Jun 2024 06:46:20 +0100 Subject: [PATCH 002/155] Batched job fetching with watermark --- src/index.ts | 2 +- src/main.ts | 69 ++++++++++++++++++++++++++--------------------- src/sql/getJob.ts | 2 +- 3 files changed, 40 insertions(+), 33 deletions(-) diff --git a/src/index.ts b/src/index.ts index 54c0074c..c997c0c5 100644 --- a/src/index.ts +++ b/src/index.ts @@ -169,7 +169,7 @@ declare global { * because a single worker may lock jobs into its queue leaving other * workers idle. * - * @default `1` + * @default `-1` */ getJobBatchSize?: number; diff --git a/src/main.ts b/src/main.ts index ba6349cf..4dba00f8 100644 --- a/src/main.ts +++ b/src/main.ts @@ -548,7 +548,7 @@ export function _runTaskList( worker: { concurrentJobs: baseConcurrency, gracefulShutdownAbortTimeout, - getJobBatchSize = 1, + getJobBatchSize = -1, completeJobBatchDelay = -1, failJobBatchDelay = -1, }, @@ -867,8 +867,42 @@ export function _runTaskList( } let jobQueue: Job[] = []; let nextJobs: Promise | null = null; + let getJobCounter = 0; + let getJobBaseline = 0; + const batchGetJob = async (myFetchId: number): Promise => { + if (!nextJobs) { + // Queue is empty, no fetch of jobs in progress; let's fetch them. + getJobBaseline = getJobCounter; + nextJobs = (async () => { + const jobs = await baseGetJob( + compiledSharedOptions, + withPgClient, + tasks, + workerPool.id, // << NOTE: This is the worker pool id, not the worker id! 
+ null, + getJobBatchSize, + ); + jobQueue = jobs.reverse(); + return jobs.length >= getJobBatchSize; + })().finally(() => { + nextJobs = null; + }); + } + const fetchedMax = await nextJobs; + const job = jobQueue.pop(); + if (job) { + return job; + } else if (fetchedMax || myFetchId > getJobBaseline) { + // Either we fetched as many jobs as we could and there still weren't + // enough, or we requested a job after the request for jobs was sent to + // the database. Either way, let's fetch again. + return batchGetJob(myFetchId); + } else { + return undefined; + } + }; const getJob: GetJobFunction = async (workerId, flagsToSkip) => { - if (flagsToSkip !== null || getJobBatchSize <= 1) { + if (flagsToSkip !== null || getJobBatchSize < 1) { const jobs = await baseGetJob( compiledSharedOptions, withPgClient, @@ -880,37 +914,10 @@ export function _runTaskList( return jobs[0]; } else { const job = jobQueue.pop(); - if (job) { - // Queue already has a job, run that + if (job !== undefined) { return job; } else { - if (!nextJobs) { - // Queue is empty, no fetch of jobs in progress; let's fetch them - nextJobs = (async () => { - const jobs = await baseGetJob( - compiledSharedOptions, - withPgClient, - tasks, - workerId, - flagsToSkip, - getJobBatchSize, - ); - jobQueue = jobs.reverse(); - // Return true if we fetched the full batch size - const fetchAgain = jobs.length >= getJobBatchSize; - return fetchAgain; - })(); - } - /** If true, the full batch size was fetched, so if the queue is exhausted again it's likely that there will be more jobs */ - const fetchAgain = await nextJobs; - const job = jobQueue.pop(); - if (job) { - return job; - } else if (fetchAgain) { - return getJob(workerId, flagsToSkip); - } else { - return undefined; - } + return batchGetJob(++getJobCounter); } } }; diff --git a/src/sql/getJob.ts b/src/sql/getJob.ts index eb3bb15b..73d174a9 100644 --- a/src/sql/getJob.ts +++ b/src/sql/getJob.ts @@ -192,7 +192,7 @@ with j as ( name, }), ); - return 
rows.reverse().map((jobRow) => + return rows.map((jobRow) => Object.assign(jobRow, { task_identifier: taskDetails.supportedTaskIdentifierByTaskId[jobRow.task_id], From 9b2ba39f9643da5d53ec098ad55288fb41658426 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Thu, 6 Jun 2024 07:31:09 +0100 Subject: [PATCH 003/155] Fix getJob call to reflect changes in #469 --- src/main.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main.ts b/src/main.ts index 4dba00f8..ea36d765 100644 --- a/src/main.ts +++ b/src/main.ts @@ -878,7 +878,7 @@ export function _runTaskList( compiledSharedOptions, withPgClient, tasks, - workerPool.id, // << NOTE: This is the worker pool id, not the worker id! + workerPool.id, null, getJobBatchSize, ); @@ -901,13 +901,13 @@ export function _runTaskList( return undefined; } }; - const getJob: GetJobFunction = async (workerId, flagsToSkip) => { + const getJob: GetJobFunction = async (_workerId, flagsToSkip) => { if (flagsToSkip !== null || getJobBatchSize < 1) { const jobs = await baseGetJob( compiledSharedOptions, withPgClient, tasks, - workerId, + workerPool.id, flagsToSkip, 1, ); From 484e02c6de1743451558ae8f64f59108d22fc006 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Thu, 6 Jun 2024 07:56:49 +0100 Subject: [PATCH 004/155] Hoist completeJob and failJob --- src/interfaces.ts | 7 +++++++ src/main.ts | 28 +++++++++++++++++++++++++++- src/worker.ts | 18 +++++++----------- 3 files changed, 41 insertions(+), 12 deletions(-) diff --git a/src/interfaces.ts b/src/interfaces.ts index 47775398..74091a69 100644 --- a/src/interfaces.ts +++ b/src/interfaces.ts @@ -1238,3 +1238,10 @@ export type GetJobFunction = ( workerId: string, flagsToSkip: string[] | null, ) => Promise; + +export type CompleteJobFunction = (job: DbJob) => Promise; +export type FailJobFunction = ( + job: DbJob, + message: string, + replacementPayload: undefined | unknown[], +) => Promise; diff --git a/src/main.ts b/src/main.ts index ea36d765..e714ad43 100644 
--- a/src/main.ts +++ b/src/main.ts @@ -9,7 +9,9 @@ import { makeWithPgClientFromPool, } from "./helpers"; import { + CompleteJobFunction, EnhancedWithPgClient, + FailJobFunction, GetJobFunction, Job, RunOnceOptions, @@ -28,7 +30,8 @@ import { } from "./lib"; import { Logger } from "./logger"; import SIGNALS, { Signal } from "./signals"; -import { failJobs } from "./sql/failJob"; +import { completeJob as baseCompleteJob } from "./sql/completeJob"; +import { failJob as baseFailJob, failJobs } from "./sql/failJob"; import { getJob as baseGetJob } from "./sql/getJob"; import { resetLockedAt } from "./sql/resetLockedAt"; import { makeNewWorker } from "./worker"; @@ -921,6 +924,27 @@ export function _runTaskList( } } }; + + const completeJob: CompleteJobFunction = async (job) => { + return baseCompleteJob( + compiledSharedOptions, + withPgClient, + workerPool.id, + job, + ); + }; + + const failJob: FailJobFunction = async (job, message, replacementPayload) => { + return baseFailJob( + compiledSharedOptions, + withPgClient, + workerPool.id, + job, + message, + replacementPayload, + ); + }; + for (let i = 0; i < concurrency; i++) { const worker = makeNewWorker(compiledSharedOptions, { tasks, @@ -932,6 +956,8 @@ export function _runTaskList( autostart, workerId, getJob, + completeJob, + failJob, }); workerPool._workers.push(worker); const remove = () => { diff --git a/src/worker.ts b/src/worker.ts index cda0837f..355d3045 100644 --- a/src/worker.ts +++ b/src/worker.ts @@ -4,7 +4,9 @@ import { randomBytes } from "crypto"; import deferred from "./deferred"; import { makeJobHelpers } from "./helpers"; import { + CompleteJobFunction, EnhancedWithPgClient, + FailJobFunction, GetJobFunction, Job, PromiseOrDirect, @@ -14,8 +16,6 @@ import { WorkerSharedOptions, } from "./interfaces"; import { coerceError, CompiledSharedOptions } from "./lib"; -import { completeJob } from "./sql/completeJob"; -import { failJob } from "./sql/failJob"; const NO_LOG_SUCCESS = 
!!process.env.NO_LOG_SUCCESS; @@ -31,6 +31,8 @@ export function makeNewWorker( autostart?: boolean; workerId?: string; getJob: GetJobFunction; + completeJob: CompleteJobFunction; + failJob: FailJobFunction; }, ): Worker { const { @@ -43,6 +45,8 @@ export function makeNewWorker( autostart = true, workerId = `worker-${randomBytes(9).toString("hex")}`, getJob, + completeJob, + failJob, } = params; const { events, @@ -343,9 +347,6 @@ export function makeNewWorker( { failure: true, job, error: err, duration }, ); await failJob( - compiledSharedOptions, - withPgClient, - workerPool.id, job, message, // "Batch jobs": copy through only the unsuccessful parts of the payload @@ -374,12 +375,7 @@ export function makeNewWorker( ); } - await completeJob( - compiledSharedOptions, - withPgClient, - workerPool.id, - job, - ); + await completeJob(job); } events.emit("job:complete", { worker, job, error: err }); } catch (fatalError) { From 862251d870a1720bc9bb973a1e842c73aa63a53a Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Thu, 6 Jun 2024 08:55:53 +0100 Subject: [PATCH 005/155] Refactor failJob/completeJob in preparation for batching --- src/main.ts | 54 +++++++++++++++++++++++------------ src/sql/completeJob.ts | 25 +++++++++++----- src/sql/failJob.ts | 65 ++++++++++++++++++++++++++++-------------- 3 files changed, 96 insertions(+), 48 deletions(-) diff --git a/src/main.ts b/src/main.ts index e714ad43..e38891c0 100644 --- a/src/main.ts +++ b/src/main.ts @@ -925,25 +925,41 @@ export function _runTaskList( } }; - const completeJob: CompleteJobFunction = async (job) => { - return baseCompleteJob( - compiledSharedOptions, - withPgClient, - workerPool.id, - job, - ); - }; - - const failJob: FailJobFunction = async (job, message, replacementPayload) => { - return baseFailJob( - compiledSharedOptions, - withPgClient, - workerPool.id, - job, - message, - replacementPayload, - ); - }; + const completeJob: CompleteJobFunction = + completeJobBatchDelay >= 0 + ? 
async (job) => { + return baseCompleteJob( + compiledSharedOptions, + withPgClient, + workerPool.id, + [job], + ); + } + : (job) => + baseCompleteJob(compiledSharedOptions, withPgClient, workerPool.id, [ + job, + ]); + + const failJob: FailJobFunction = + failJobBatchDelay >= 0 + ? async (job, message, replacementPayload) => { + return baseFailJob( + compiledSharedOptions, + withPgClient, + workerPool.id, + [ + { + job, + message, + replacementPayload, + }, + ], + ); + } + : (job, message, replacementPayload) => + baseFailJob(compiledSharedOptions, withPgClient, workerPool.id, [ + { job, message, replacementPayload }, + ]); for (let i = 0; i < concurrency; i++) { const worker = makeNewWorker(compiledSharedOptions, { diff --git a/src/sql/completeJob.ts b/src/sql/completeJob.ts index 404235b6..f64a827b 100644 --- a/src/sql/completeJob.ts +++ b/src/sql/completeJob.ts @@ -5,7 +5,7 @@ export async function completeJob( compiledSharedOptions: CompiledSharedOptions, withPgClient: EnhancedWithPgClient, poolId: string, - job: DbJob, + jobs: ReadonlyArray, ): Promise { const { escapedWorkerSchema, @@ -15,33 +15,44 @@ export async function completeJob( }, } = compiledSharedOptions; + const jobsWithQueues: DbJob[] = []; + const jobsWithoutQueues: DbJob[] = []; + for (const job of jobs) { + if (job.job_queue_id != null) { + jobsWithQueues.push(job); + } else { + jobsWithoutQueues.push(job); + } + } + // TODO: retry logic, in case of server connection interruption - if (job.job_queue_id != null) { + if (jobsWithQueues.length > 0) { await withPgClient.withRetries((client) => client.query({ text: `\ with j as ( delete from ${escapedWorkerSchema}._private_jobs as jobs -where id = $1::bigint +where id = ANY($1::bigint[]) returning * ) update ${escapedWorkerSchema}._private_job_queues as job_queues set locked_by = null, locked_at = null from j where job_queues.id = j.job_queue_id and job_queues.locked_by = $2::text;`, - values: [job.id, poolId], + values: [jobsWithQueues.map((j) => 
j.id), poolId], name: !preparedStatements ? undefined : `complete_job_q/${workerSchema}`, }), ); - } else { + } + if (jobsWithoutQueues.length > 0) { await withPgClient.withRetries((client) => client.query({ text: `\ delete from ${escapedWorkerSchema}._private_jobs as jobs -where id = $1::bigint`, - values: [job.id], +where id = ANY($1::bigint[])`, + values: [jobsWithoutQueues.map((j) => j.id)], name: !preparedStatements ? undefined : `complete_job/${workerSchema}`, }), ); diff --git a/src/sql/failJob.ts b/src/sql/failJob.ts index 63a08884..a86db5e5 100644 --- a/src/sql/failJob.ts +++ b/src/sql/failJob.ts @@ -1,13 +1,16 @@ import { DbJob, EnhancedWithPgClient } from "../interfaces"; import { CompiledSharedOptions } from "../lib"; +interface Spec { + job: DbJob; + message: string; + replacementPayload: undefined | unknown[]; +} export async function failJob( compiledSharedOptions: CompiledSharedOptions, withPgClient: EnhancedWithPgClient, poolId: string, - job: DbJob, - message: string, - replacementPayload: undefined | unknown[], + specs: ReadonlyArray, ): Promise { const { escapedWorkerSchema, @@ -17,56 +20,74 @@ export async function failJob( }, } = compiledSharedOptions; + const specsWithQueues: Spec[] = []; + const specsWithoutQueues: Spec[] = []; + + for (const spec of specs) { + if (spec.job.job_queue_id != null) { + specsWithQueues.push(spec); + } else { + specsWithoutQueues.push(spec); + } + } + // TODO: retry logic, in case of server connection interruption - if (job.job_queue_id != null) { + if (specsWithQueues.length > 0) { await withPgClient.withRetries((client) => client.query({ text: `\ with j as ( update ${escapedWorkerSchema}._private_jobs as jobs set -last_error = $2::text, +last_error = (el->>'message'), run_at = greatest(now(), run_at) + (exp(least(attempts, 10)) * interval '1 second'), locked_by = null, locked_at = null, -payload = coalesce($4::json, jobs.payload) -where id = $1::bigint and locked_by = $3::text +payload = coalesce(el->'payload', 
jobs.payload) +from json_array_elements($2::json) as els(el) +where id = (el->>'jobId')::bigint and locked_by = $1::text returning * ) update ${escapedWorkerSchema}._private_job_queues as job_queues set locked_by = null, locked_at = null from j -where job_queues.id = j.job_queue_id and job_queues.locked_by = $3::text;`, +where job_queues.id = j.job_queue_id and job_queues.locked_by = $1::text;`, values: [ - job.id, - message, poolId, - replacementPayload != null - ? JSON.stringify(replacementPayload) - : null, + JSON.stringify( + specsWithQueues.map(({ job, message, replacementPayload }) => ({ + jobId: job.id, + message, + payload: replacementPayload, + })), + ), ], name: !preparedStatements ? undefined : `fail_job_q/${workerSchema}`, }), ); - } else { + } + if (specsWithoutQueues.length > 0) { await withPgClient.withRetries((client) => client.query({ text: `\ update ${escapedWorkerSchema}._private_jobs as jobs set -last_error = $2::text, +last_error = (el->>'message'), run_at = greatest(now(), run_at) + (exp(least(attempts, 10)) * interval '1 second'), locked_by = null, locked_at = null, -payload = coalesce($4::json, jobs.payload) -where id = $1::bigint and locked_by = $3::text;`, +payload = coalesce(el->'payload', jobs.payload) +from json_array_elements($2::json) as els(el) +where id = (el->>'jobId')::bigint and locked_by = $1::text;`, values: [ - job.id, - message, poolId, - replacementPayload != null - ? JSON.stringify(replacementPayload) - : null, + JSON.stringify( + specsWithoutQueues.map(({ job, message, replacementPayload }) => ({ + jobId: job.id, + message, + payload: replacementPayload, + })), + ), ], name: !preparedStatements ? 
undefined : `fail_job/${workerSchema}`, }), From 2382ed11d2dcd6626a38cd35baea691ee28e8c23 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Thu, 6 Jun 2024 09:04:02 +0100 Subject: [PATCH 006/155] Stub batch function --- src/main.ts | 59 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 22 deletions(-) diff --git a/src/main.ts b/src/main.ts index e38891c0..17565bd6 100644 --- a/src/main.ts +++ b/src/main.ts @@ -927,14 +927,17 @@ export function _runTaskList( const completeJob: CompleteJobFunction = completeJobBatchDelay >= 0 - ? async (job) => { - return baseCompleteJob( - compiledSharedOptions, - withPgClient, - workerPool.id, - [job], - ); - } + ? batch( + completeJobBatchDelay, + (job) => job, + (jobs) => + baseCompleteJob( + compiledSharedOptions, + withPgClient, + workerPool.id, + jobs, + ), + ) : (job) => baseCompleteJob(compiledSharedOptions, withPgClient, workerPool.id, [ job, @@ -942,20 +945,21 @@ export function _runTaskList( const failJob: FailJobFunction = failJobBatchDelay >= 0 - ? async (job, message, replacementPayload) => { - return baseFailJob( - compiledSharedOptions, - withPgClient, - workerPool.id, - [ - { - job, - message, - replacementPayload, - }, - ], - ); - } + ? 
batch( + failJobBatchDelay, + (job, message, replacementPayload) => ({ + job, + message, + replacementPayload, + }), + (specs) => + baseFailJob( + compiledSharedOptions, + withPgClient, + workerPool.id, + specs, + ), + ) : (job, message, replacementPayload) => baseFailJob(compiledSharedOptions, withPgClient, workerPool.id, [ { job, message, replacementPayload }, @@ -1039,3 +1043,14 @@ export const runTaskListOnce = ( return pool; }; + +function batch( + delay: number, + makeSpec: (...args: TArgs) => TSpec, + callback: (specs: ReadonlyArray) => Promise, +): (...args: TArgs) => Promise { + return (...args) => { + const spec = makeSpec(...args); + return callback([spec]); + }; +} From 95382470611411c831810a4ae57689a9d9296327 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Thu, 6 Jun 2024 09:09:04 +0100 Subject: [PATCH 007/155] Implement batching function --- src/main.ts | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/src/main.ts b/src/main.ts index 17565bd6..51ba3a66 100644 --- a/src/main.ts +++ b/src/main.ts @@ -1049,8 +1049,25 @@ function batch( makeSpec: (...args: TArgs) => TSpec, callback: (specs: ReadonlyArray) => Promise, ): (...args: TArgs) => Promise { + let currentBatch: { specs: TSpec[]; promise: Promise } | null = null; return (...args) => { const spec = makeSpec(...args); - return callback([spec]); + if (currentBatch) { + currentBatch.specs.push(spec); + } else { + const specs = [spec]; + currentBatch = { + specs, + promise: (async () => { + await sleep(delay); + currentBatch = null; + return callback(specs); + })(), + }; + } + return currentBatch.promise; }; } + +const sleep = (ms: number) => + new Promise((resolve) => setTimeout(resolve, ms)); From 3752cdc6beab2ac7201e139ab90ccfeedb913c24 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Thu, 6 Jun 2024 09:34:51 +0100 Subject: [PATCH 008/155] Lint fix --- src/main.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.ts b/src/main.ts index 
51ba3a66..487bf8d3 100644 --- a/src/main.ts +++ b/src/main.ts @@ -1044,7 +1044,7 @@ export const runTaskListOnce = ( return pool; }; -function batch( +function batch( delay: number, makeSpec: (...args: TArgs) => TSpec, callback: (specs: ReadonlyArray) => Promise, From 113ef71c9ff1f82d1ffae2d69bb3a26c9376ebc4 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Thu, 6 Jun 2024 10:36:16 +0100 Subject: [PATCH 009/155] Tweak graceful/forceful shutdown handover --- src/interfaces.ts | 2 ++ src/main.ts | 32 +++++++++++++++++++++++++++----- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/src/interfaces.ts b/src/interfaces.ts index 74091a69..05aef3c8 100644 --- a/src/interfaces.ts +++ b/src/interfaces.ts @@ -523,6 +523,8 @@ export interface WorkerPool { /** @internal */ _shuttingDown: boolean; /** @internal */ + _forcefulShuttingDown: boolean; + /** @internal */ _active: boolean; /** @internal */ _workers: Worker[]; diff --git a/src/main.ts b/src/main.ts index 487bf8d3..a59c1326 100644 --- a/src/main.ts +++ b/src/main.ts @@ -601,9 +601,13 @@ export function _runTaskList( unregisterSignalHandlers(); } } else { - logger.error( - `Graphile Worker internal error: terminate() was called twice for worker pool. Ignoring second call; but this indicates a bug - please file an issue.`, - ); + try { + throw new Error( + `Graphile Worker internal error: terminate() was called twice for worker pool. Ignoring second call; but this indicates a bug - please file an issue.`, + ); + } catch (e) { + logger.error(String(e.stack)); + } } } @@ -623,6 +627,7 @@ export function _runTaskList( id: `${continuous ? 
"pool" : "otpool"}-${randomBytes(9).toString("hex")}`, _active: true, _shuttingDown: false, + _forcefulShuttingDown: false, _workers: [], _withPgClient: withPgClient, get worker() { @@ -644,6 +649,12 @@ export function _runTaskList( async gracefulShutdown( message = "Worker pool is shutting down gracefully", ) { + if (workerPool._forcefulShuttingDown) { + logger.error( + `gracefulShutdown called when forcefulShutdown is already in progress`, + ); + return; + } if (workerPool._shuttingDown) { logger.error( `gracefulShutdown called when gracefulShutdown is already in progress`, @@ -744,13 +755,22 @@ export function _runTaskList( }); return this.forcefulShutdown(message); } - terminate(); + if (!terminated) { + terminate(); + } }, /** * Stop accepting jobs and "fail" all currently running jobs. */ async forcefulShutdown(message: string) { + if (workerPool._forcefulShuttingDown) { + logger.error( + `forcefulShutdown called when forcefulShutdown is already in progress`, + ); + return; + } + workerPool._forcefulShuttingDown = true; events.emit("pool:forcefulShutdown", { pool: workerPool, workerPool, @@ -821,7 +841,9 @@ export function _runTaskList( { error: e }, ); } - terminate(); + if (!terminated) { + terminate(); + } }, promise, From 1821691d9cde6328527aae5c7fcc3739dea0b561 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Thu, 6 Jun 2024 10:36:36 +0100 Subject: [PATCH 010/155] Evaluate envvar just once up top --- src/worker.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/worker.ts b/src/worker.ts index 355d3045..ee9a396e 100644 --- a/src/worker.ts +++ b/src/worker.ts @@ -19,6 +19,8 @@ import { coerceError, CompiledSharedOptions } from "./lib"; const NO_LOG_SUCCESS = !!process.env.NO_LOG_SUCCESS; +const NO_LOG_SUCCESS = !!process.env.NO_LOG_SUCCESS; + export function makeNewWorker( compiledSharedOptions: CompiledSharedOptions, params: { From c542aa3d42c078d879acf92af6b69f6c1ea29e0b Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Thu, 6 Jun 2024 
13:07:28 +0100 Subject: [PATCH 011/155] Release batches before releasing worker pool --- src/interfaces.ts | 13 +++- src/main.ts | 167 +++++++++++++++++++++++++++++++++++++--------- src/worker.ts | 4 +- 3 files changed, 148 insertions(+), 36 deletions(-) diff --git a/src/interfaces.ts b/src/interfaces.ts index 05aef3c8..a4747768 100644 --- a/src/interfaces.ts +++ b/src/interfaces.ts @@ -930,6 +930,15 @@ export type WorkerEventMap = { client: PoolClient; }; + /** + * When a worker pool fails to complete/fail a job + */ + "pool:fatalError": { + workerPool: WorkerPool; + error: unknown; + action: string; + }; + /** * When a worker pool is released */ @@ -1241,9 +1250,9 @@ export type GetJobFunction = ( flagsToSkip: string[] | null, ) => Promise; -export type CompleteJobFunction = (job: DbJob) => Promise; +export type CompleteJobFunction = (job: DbJob) => void; export type FailJobFunction = ( job: DbJob, message: string, replacementPayload: undefined | unknown[], -) => Promise; +) => void; diff --git a/src/main.ts b/src/main.ts index a59c1326..da9d8fb3 100644 --- a/src/main.ts +++ b/src/main.ts @@ -583,9 +583,35 @@ export function _runTaskList( const promise = defer(); - function deactivate() { + async function deactivate() { if (workerPool._active) { workerPool._active = false; + // TODO: stop the batch()es and await the promises here + const releaseCompleteJobPromise = releaseCompleteJob?.(); + const releaseFailJobPromise = releaseFailJob?.(); + const [releaseCompleteJobResult, releaseFailJobResult] = + await Promise.allSettled([ + releaseCompleteJobPromise, + releaseFailJobPromise, + ]); + if (releaseCompleteJobResult.status === "rejected") { + // Log but continue regardless + logger.error( + `Releasing complete job batcher failed: ${releaseCompleteJobResult.reason}`, + { + error: releaseCompleteJobResult.reason, + }, + ); + } + if (releaseFailJobResult.status === "rejected") { + // Log but continue regardless + logger.error( + `Releasing failed job batcher failed: 
${releaseFailJobResult.reason}`, + { + error: releaseFailJobResult.reason, + }, + ); + } return onDeactivate?.(); } } @@ -947,7 +973,7 @@ export function _runTaskList( } }; - const completeJob: CompleteJobFunction = + const { release: releaseCompleteJob, fn: completeJob } = ( completeJobBatchDelay >= 0 ? batch( completeJobBatchDelay, @@ -959,13 +985,34 @@ export function _runTaskList( workerPool.id, jobs, ), + (error, jobs) => { + events.emit("pool:fatalError", { + error, + workerPool, + action: "completeJob", + }); + logger.error( + `Failed to complete jobs '${jobs + .map((j) => j.id) + .join("', '")}':\n${String(error)}`, + { fatalError: error, jobs }, + ); + workerPool.gracefulShutdown(); + }, ) - : (job) => - baseCompleteJob(compiledSharedOptions, withPgClient, workerPool.id, [ - job, - ]); + : { + release: null, + fn: (job) => + baseCompleteJob( + compiledSharedOptions, + withPgClient, + workerPool.id, + [job], + ), + } + ) as { release: (() => void) | null; fn: CompleteJobFunction }; - const failJob: FailJobFunction = + const { release: releaseFailJob, fn: failJob } = ( failJobBatchDelay >= 0 ? 
batch( failJobBatchDelay, @@ -981,11 +1028,29 @@ export function _runTaskList( workerPool.id, specs, ), + (error, specs) => { + events.emit("pool:fatalError", { + error, + workerPool, + action: "failJob", + }); + logger.error( + `Failed to fail jobs '${specs + .map((spec) => spec.job.id) + .join("', '")}':\n${String(error)}`, + { fatalError: error, specs }, + ); + workerPool.gracefulShutdown(); + }, ) - : (job, message, replacementPayload) => - baseFailJob(compiledSharedOptions, withPgClient, workerPool.id, [ - { job, message, replacementPayload }, - ]); + : { + release: null, + fn: (job, message, replacementPayload) => + baseFailJob(compiledSharedOptions, withPgClient, workerPool.id, [ + { job, message, replacementPayload }, + ]), + } + ) as { release: (() => void) | null; fn: FailJobFunction }; for (let i = 0; i < concurrency; i++) { const worker = makeNewWorker(compiledSharedOptions, { @@ -1010,8 +1075,7 @@ export function _runTaskList( } workerPool._workers.splice(workerPool._workers.indexOf(worker), 1); if (!continuous && workerPool._workers.length === 0) { - deactivate(); - terminate(); + deactivate().then(terminate, terminate); } }; worker.promise.then( @@ -1070,26 +1134,65 @@ function batch( delay: number, makeSpec: (...args: TArgs) => TSpec, callback: (specs: ReadonlyArray) => Promise, -): (...args: TArgs) => Promise { - let currentBatch: { specs: TSpec[]; promise: Promise } | null = null; - return (...args) => { - const spec = makeSpec(...args); - if (currentBatch) { - currentBatch.specs.push(spec); - } else { - const specs = [spec]; - currentBatch = { - specs, - promise: (async () => { - await sleep(delay); - currentBatch = null; - return callback(specs); - })(), - }; - } - return currentBatch.promise; + errorHandler: ( + error: unknown, + specs: ReadonlyArray, + ) => void | Promise, +): { + release(): void | Promise; + fn: (...args: TArgs) => void; +} { + let pending = 0; + let releasing = false; + let released = false; + const promise = deferred(); + 
let currentBatch: { specs: TSpec[]; promise: Promise } | null = null; + return { + async release() { + if (releasing) { + return; + } + releasing = true; + if (pending === 0) { + released = true; + promise.resolve(); + } + await promise; + }, + fn(...args) { + if (released) { + throw new Error( + "This batcher has been released, and so no more calls can be made.", + ); + } + const spec = makeSpec(...args); + if (currentBatch) { + currentBatch.specs.push(spec); + } else { + const specs = [spec]; + currentBatch = { + specs, + promise: (async () => { + pending++; + try { + await sleep(delay); + currentBatch = null; + await callback(specs); + } catch (error) { + errorHandler(error, specs); + } finally { + pending--; + if (pending === 0 && releasing) { + released = true; + promise.resolve(); + } + } + })(), + }; + } + return; + }, }; } - const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); diff --git a/src/worker.ts b/src/worker.ts index ee9a396e..174bc7c9 100644 --- a/src/worker.ts +++ b/src/worker.ts @@ -348,7 +348,7 @@ export function makeNewWorker( }`, { failure: true, job, error: err, duration }, ); - await failJob( + failJob( job, message, // "Batch jobs": copy through only the unsuccessful parts of the payload @@ -377,7 +377,7 @@ export function makeNewWorker( ); } - await completeJob(job); + completeJob(job); } events.emit("job:complete", { worker, job, error: err }); } catch (fatalError) { From 93fb46230bb7d388e64948c0a217b88cd859017d Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Thu, 6 Jun 2024 15:50:21 +0100 Subject: [PATCH 012/155] Refactor --- src/main.ts | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/src/main.ts b/src/main.ts index da9d8fb3..f4b00b05 100644 --- a/src/main.ts +++ b/src/main.ts @@ -925,19 +925,21 @@ export function _runTaskList( // Queue is empty, no fetch of jobs in progress; let's fetch them. 
getJobBaseline = getJobCounter; nextJobs = (async () => { - const jobs = await baseGetJob( - compiledSharedOptions, - withPgClient, - tasks, - workerPool.id, - null, - getJobBatchSize, - ); - jobQueue = jobs.reverse(); - return jobs.length >= getJobBatchSize; - })().finally(() => { - nextJobs = null; - }); + try { + const jobs = await baseGetJob( + compiledSharedOptions, + withPgClient, + tasks, + workerPool.id, + null, + getJobBatchSize, + ); + jobQueue = jobs.reverse(); + return jobs.length >= getJobBatchSize; + } finally { + nextJobs = null; + } + })(); } const fetchedMax = await nextJobs; const job = jobQueue.pop(); From 1cab6742beda067b017f67bf8e68285e1709f948 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Mon, 10 Jun 2024 13:27:58 +0100 Subject: [PATCH 013/155] Warn about bad batch settings --- src/main.ts | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/main.ts b/src/main.ts index f4b00b05..efa87643 100644 --- a/src/main.ts +++ b/src/main.ts @@ -575,6 +575,12 @@ export function _runTaskList( ); } + if (getJobBatchSize > 0 && getJobBatchSize < concurrency) { + logger.warn( + `Your job batch size (${getJobBatchSize}) is smaller than your concurrency setting (${concurrency}); this may result in drastically lower performance if your jobs can complete quickly. 
Please update to \`getJobBatchSize: ${concurrency}\` to improve performance, or \`getJobBatchSize: -1\` to disable batching.`, + ); + } + let unregisterSignalHandlers: (() => void) | undefined = undefined; if (!noHandleSignals) { // Clean up when certain signals occur From fde43415c197402d678dbd9e02559c1a419f24ca Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Mon, 10 Jun 2024 13:42:09 +0100 Subject: [PATCH 014/155] Refactor in preparation for new queue --- src/batchGetJob.ts | 79 ++++++++++++++++++++++++++++++++++++++++++++++ src/main.ts | 69 ++++++++++------------------------------ 2 files changed, 95 insertions(+), 53 deletions(-) create mode 100644 src/batchGetJob.ts diff --git a/src/batchGetJob.ts b/src/batchGetJob.ts new file mode 100644 index 00000000..944f4e18 --- /dev/null +++ b/src/batchGetJob.ts @@ -0,0 +1,79 @@ +import { + CompiledSharedOptions, + EnhancedWithPgClient, + WorkerPoolOptions, +} from "."; +import { GetJobFunction, Job, TaskList, WorkerPool } from "./interfaces"; +import { getJob as baseGetJob } from "./sql/getJob"; + +export function makeBatchGetJob( + compiledSharedOptions: CompiledSharedOptions, + tasks: TaskList, + withPgClient: EnhancedWithPgClient, + workerPool: WorkerPool, + getJobBatchSize: number, +): GetJobFunction { + let getJobCounter = 0; + let jobQueue: Job[] = []; + let nextJobs: Promise | null = null; + let getJobBaseline = 0; + const getJob: GetJobFunction = async (workerId, flagsToSkip) => { + // Cannot batch if there's flags + if (flagsToSkip !== null) { + const jobs = await baseGetJob( + compiledSharedOptions, + withPgClient, + tasks, + workerPool.id, + flagsToSkip, + 1, + ); + return jobs[0]; + } + + const job = jobQueue.pop(); + if (job !== undefined) { + return job; + } else { + return batchGetJob(++getJobCounter); + } + }; + + const batchGetJob = async (myFetchId: number): Promise => { + // TODO rewrite this so that if we have batch size of 1 we'll still fetch newer jobs in parallel (not queued) + if (!nextJobs) { 
+ // Queue is empty, no fetch of jobs in progress; let's fetch them. + getJobBaseline = getJobCounter; + nextJobs = (async () => { + try { + const jobs = await baseGetJob( + compiledSharedOptions, + withPgClient, + tasks, + workerPool.id, + null, + getJobBatchSize, + ); + jobQueue = jobs.reverse(); + return jobs.length >= getJobBatchSize; + } finally { + nextJobs = null; + } + })(); + } + const fetchedMax = await nextJobs; + const job = jobQueue.pop(); + if (job) { + return job; + } else if (fetchedMax || myFetchId > getJobBaseline) { + // Either we fetched as many jobs as we could and there still weren't + // enough, or we requested a job after the request for jobs was sent to + // the database. Either way, let's fetch again. + return batchGetJob(myFetchId); + } else { + return undefined; + } + }; + + return getJob; +} diff --git a/src/main.ts b/src/main.ts index efa87643..6801d58b 100644 --- a/src/main.ts +++ b/src/main.ts @@ -3,6 +3,7 @@ import { EventEmitter } from "events"; import { Notification, Pool, PoolClient } from "pg"; import { inspect } from "util"; +import { makeBatchGetJob } from "./batchGetJob"; import defer from "./deferred"; import { makeWithPgClientFromClient, @@ -922,64 +923,26 @@ export function _runTaskList( `You must not set workerId when concurrency > 1; each worker must have a unique identifier`, ); } - let jobQueue: Job[] = []; - let nextJobs: Promise | null = null; - let getJobCounter = 0; - let getJobBaseline = 0; - const batchGetJob = async (myFetchId: number): Promise => { - if (!nextJobs) { - // Queue is empty, no fetch of jobs in progress; let's fetch them. - getJobBaseline = getJobCounter; - nextJobs = (async () => { - try { + const getJob: GetJobFunction = + getJobBatchSize >= 1 + ? 
makeBatchGetJob( + compiledSharedOptions, + tasks, + withPgClient, + workerPool, + getJobBatchSize, + ) + : async (_workerId, flagsToSkip) => { const jobs = await baseGetJob( compiledSharedOptions, withPgClient, tasks, workerPool.id, - null, - getJobBatchSize, + flagsToSkip, + 1, ); - jobQueue = jobs.reverse(); - return jobs.length >= getJobBatchSize; - } finally { - nextJobs = null; - } - })(); - } - const fetchedMax = await nextJobs; - const job = jobQueue.pop(); - if (job) { - return job; - } else if (fetchedMax || myFetchId > getJobBaseline) { - // Either we fetched as many jobs as we could and there still weren't - // enough, or we requested a job after the request for jobs was sent to - // the database. Either way, let's fetch again. - return batchGetJob(myFetchId); - } else { - return undefined; - } - }; - const getJob: GetJobFunction = async (_workerId, flagsToSkip) => { - if (flagsToSkip !== null || getJobBatchSize < 1) { - const jobs = await baseGetJob( - compiledSharedOptions, - withPgClient, - tasks, - workerPool.id, - flagsToSkip, - 1, - ); - return jobs[0]; - } else { - const job = jobQueue.pop(); - if (job !== undefined) { - return job; - } else { - return batchGetJob(++getJobCounter); - } - } - }; + return jobs[0]; + }; const { release: releaseCompleteJob, fn: completeJob } = ( completeJobBatchDelay >= 0 @@ -1153,7 +1116,7 @@ function batch( let pending = 0; let releasing = false; let released = false; - const promise = deferred(); + const promise = defer(); let currentBatch: { specs: TSpec[]; promise: Promise } | null = null; return { async release() { From 64f6aabb0cfc7d469b6f3dad59bfe3d56a602076 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Mon, 10 Jun 2024 16:15:38 +0100 Subject: [PATCH 015/155] Refactor again and explain the purpose of LocalQueue and how it works --- src/batchGetJob.ts | 79 ------------------------- src/localQueue.ts | 144 +++++++++++++++++++++++++++++++++++++++++++++ src/main.ts | 32 +++++----- 3 files changed, 162 
insertions(+), 93 deletions(-) delete mode 100644 src/batchGetJob.ts create mode 100644 src/localQueue.ts diff --git a/src/batchGetJob.ts b/src/batchGetJob.ts deleted file mode 100644 index 944f4e18..00000000 --- a/src/batchGetJob.ts +++ /dev/null @@ -1,79 +0,0 @@ -import { - CompiledSharedOptions, - EnhancedWithPgClient, - WorkerPoolOptions, -} from "."; -import { GetJobFunction, Job, TaskList, WorkerPool } from "./interfaces"; -import { getJob as baseGetJob } from "./sql/getJob"; - -export function makeBatchGetJob( - compiledSharedOptions: CompiledSharedOptions, - tasks: TaskList, - withPgClient: EnhancedWithPgClient, - workerPool: WorkerPool, - getJobBatchSize: number, -): GetJobFunction { - let getJobCounter = 0; - let jobQueue: Job[] = []; - let nextJobs: Promise | null = null; - let getJobBaseline = 0; - const getJob: GetJobFunction = async (workerId, flagsToSkip) => { - // Cannot batch if there's flags - if (flagsToSkip !== null) { - const jobs = await baseGetJob( - compiledSharedOptions, - withPgClient, - tasks, - workerPool.id, - flagsToSkip, - 1, - ); - return jobs[0]; - } - - const job = jobQueue.pop(); - if (job !== undefined) { - return job; - } else { - return batchGetJob(++getJobCounter); - } - }; - - const batchGetJob = async (myFetchId: number): Promise => { - // TODO rewrite this so that if we have batch size of 1 we'll still fetch newer jobs in parallel (not queued) - if (!nextJobs) { - // Queue is empty, no fetch of jobs in progress; let's fetch them. 
- getJobBaseline = getJobCounter; - nextJobs = (async () => { - try { - const jobs = await baseGetJob( - compiledSharedOptions, - withPgClient, - tasks, - workerPool.id, - null, - getJobBatchSize, - ); - jobQueue = jobs.reverse(); - return jobs.length >= getJobBatchSize; - } finally { - nextJobs = null; - } - })(); - } - const fetchedMax = await nextJobs; - const job = jobQueue.pop(); - if (job) { - return job; - } else if (fetchedMax || myFetchId > getJobBaseline) { - // Either we fetched as many jobs as we could and there still weren't - // enough, or we requested a job after the request for jobs was sent to - // the database. Either way, let's fetch again. - return batchGetJob(myFetchId); - } else { - return undefined; - } - }; - - return getJob; -} diff --git a/src/localQueue.ts b/src/localQueue.ts new file mode 100644 index 00000000..0bcf871e --- /dev/null +++ b/src/localQueue.ts @@ -0,0 +1,144 @@ +import { + CompiledSharedOptions, + EnhancedWithPgClient, + WorkerPoolOptions, +} from "."; +import { GetJobFunction, Job, TaskList, WorkerPool } from "./interfaces"; +import { getJob as baseGetJob } from "./sql/getJob"; + +/** + * The local queue exists to reduce strain on the database; it works by + * fetching a batch of jobs from the database and distributing them to workers + * as and when necessary. It is also responsible for polling when in use, + * relieving the workers of this responsibility. + * + * The local queue trades latency for throughput: jobs may sit in the local + * queue for a longer time (maximum `localQueueSize` jobs waiting maximum + * `localQueueTTL` milliseconds), but fewer requests to the database are made + * for jobs since more jobs are fetched at once, enabling the worker to reach + * higher levels of performance (and reducing read stress on the DB). 
+ * + * The local queue is always in one of these modes: + * + * - POLLING mode + * - WAITING mode + * - TTL_EXPIRED mode + * + * ## POLLING mode + * + * POLLING mode is the initial state of the local queue. The queue will only be + * in POLLING mode when it contains no cached jobs. + * + * When the queue enters POLLING mode (and when it starts) it will trigger a + * fetch of jobs from the database. + * + * If no jobs were returned then it will wait `pollInterval` ms and then fetch + * again. + * + * If a "new job" notification is received during the polling interval then the + * timer will be cancelled, and a fetch will be fired immediately. + * + * If jobs are returned from a POLLING mode fetch then the queue immediately + * enters WAITING mode. + * + * ## WAITING mode + * + * The local queue can only be in WAITING mode if there are cached jobs. + * + * Any waiting clients are issued any available cached jobs. + * + * If no cached jobs remain, then the local queue enters POLLING mode, + * triggering a fetch. + * + * If cached jobs remain (even if there's just one, even if it has been 30 + * minutes since the last fetch) then the local queue continues to wait for + * a worker to claim the remaining jobs. Once no jobs remain, the local queue + * reverts to POLLING mode, triggering a fetch. + * + * In WAITING mode, all "new job" announcements are ignored. + * + * The local queue can be in WAITING mode for at most `getJobBatchTime` + * milliseconds (default: 30 minutes), after which all unclaimed jobs are + * returned to the pool and the local queue enters TTL_EXPIRED mode. + * + * ## TTL_EXPIRED mode + * + * This mode is used when jobs were queued in WAITING mode for too long. The + * local queue will sit in TTL_EXPIRED mode until a worker asks for a job, + * whereupon the local queue will enter POLLING mode (triggering a fetch). 
+ * + */ + +export class LocalQueue { + getJobCounter = 0; + jobQueue: Job[] = []; + nextJobs: Promise | null = null; + getJobBaseline = 0; + + constructor( + private compiledSharedOptions: CompiledSharedOptions, + private tasks: TaskList, + private withPgClient: EnhancedWithPgClient, + private workerPool: WorkerPool, + private getJobBatchSize: number, + ) {} + + // If you refactor this to be a method rather than a property, make sure that you `.bind(this)` to it. + public getJob: GetJobFunction = async (workerId, flagsToSkip) => { + // Cannot batch if there's flags + if (flagsToSkip !== null) { + const jobs = await baseGetJob( + this.compiledSharedOptions, + this.withPgClient, + this.tasks, + this.workerPool.id, + flagsToSkip, + 1, + ); + return jobs[0]; + } + + const job = this.jobQueue.pop(); + if (job !== undefined) { + return job; + } else { + return this.batchGetJob(++this.getJobCounter); + } + }; + + private async batchGetJob(myFetchId: number): Promise { + // TODO rewrite this so that if we have batch size of 1 we'll still fetch newer jobs in parallel (not queued) + if (!this.nextJobs) { + // Queue is empty, no fetch of jobs in progress; let's fetch them. + this.getJobBaseline = this.getJobCounter; + this.nextJobs = (async () => { + try { + const jobs = await baseGetJob( + this.compiledSharedOptions, + this.withPgClient, + this.tasks, + this.workerPool.id, + null, + this.getJobBatchSize, + ); + this.jobQueue = jobs.reverse(); + return jobs.length >= this.getJobBatchSize; + } finally { + this.nextJobs = null; + } + })(); + } + const fetchedMax = await this.nextJobs; + const job = this.jobQueue.pop(); + if (job) { + return job; + } else if (fetchedMax || myFetchId > this.getJobBaseline) { + // Either we fetched as many jobs as we could and there still weren't + // enough, or we requested a job after the request for jobs was sent to + // the database. Either way, let's fetch again. 
+ return this.batchGetJob(myFetchId); + } else { + return undefined; + } + } +} diff --git a/src/main.ts b/src/main.ts index 6801d58b..29ec3752 100644 --- a/src/main.ts +++ b/src/main.ts @@ -3,7 +3,6 @@ import { EventEmitter } from "events"; import { Notification, Pool, PoolClient } from "pg"; import { inspect } from "util"; -import { makeBatchGetJob } from "./batchGetJob"; import defer from "./deferred"; import { makeWithPgClientFromClient, @@ -29,6 +28,7 @@ import { processSharedOptions, tryParseJson, } from "./lib"; +import { LocalQueue } from "./localQueue"; import { Logger } from "./logger"; import SIGNALS, { Signal } from "./signals"; import { completeJob as baseCompleteJob } from "./sql/completeJob"; @@ -566,6 +566,7 @@ export function _runTaskList( onTerminate, onDeactivate, } = options; + let autostart = rawAutostart; const { logger, events } = compiledSharedOptions; @@ -923,26 +924,29 @@ export function _runTaskList( `You must not set workerId when concurrency > 1; each worker must have a unique identifier`, ); } - const getJob: GetJobFunction = + const localQueue = getJobBatchSize >= 1 - ? makeBatchGetJob( + ? new LocalQueue( compiledSharedOptions, tasks, withPgClient, workerPool, getJobBatchSize, ) - : async (_workerId, flagsToSkip) => { - const jobs = await baseGetJob( - compiledSharedOptions, - withPgClient, - tasks, - workerPool.id, - flagsToSkip, - 1, - ); - return jobs[0]; - }; + : null; + const getJob: GetJobFunction = localQueue + ? 
localQueue.getJob // Already bound + : async (_workerId, flagsToSkip) => { + const jobs = await baseGetJob( + compiledSharedOptions, + withPgClient, + tasks, + workerPool.id, + flagsToSkip, + 1, + ); + return jobs[0]; + }; const { release: releaseCompleteJob, fn: completeJob } = ( completeJobBatchDelay >= 0 From 044aa5a2e7ccc8231166a8f4af97705ac9e4d66c Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Mon, 10 Jun 2024 17:38:38 +0100 Subject: [PATCH 016/155] Implement localQueue --- src/index.ts | 12 +- src/localQueue.ts | 265 +++++++++++++++++++++++++++++++++++-------- src/main.ts | 10 +- src/sql/returnJob.ts | 68 +++++++++++ 4 files changed, 300 insertions(+), 55 deletions(-) create mode 100644 src/sql/returnJob.ts diff --git a/src/index.ts b/src/index.ts index c997c0c5..e377676d 100644 --- a/src/index.ts +++ b/src/index.ts @@ -158,7 +158,7 @@ declare global { /** * To enable processing jobs in batches, set this to an integer larger * than 1. This will result in jobs being fetched by the pool rather than - * the worker, the pool will fetch (and lock!) `getJobBatchSize` jobs up + * the worker, the pool will fetch (and lock!) `localQueueSize` jobs up * front, and each time a worker requests a job it will be served from * this list until the list is exhausted, at which point a new set of * jobs will be fetched (and locked). @@ -171,7 +171,15 @@ declare global { * * @default `-1` */ - getJobBatchSize?: number; + localQueueSize?: number; + + /** + * How long should jobs sit in the local queue before they are returned + * to the database? Defaults to 5 minutes. 
+ * + * @default `300000` + */ + localQueueTtl?: number; /** * The time in milliseconds to wait after a `completeJob` call to see if diff --git a/src/localQueue.ts b/src/localQueue.ts index 0bcf871e..16aedcd6 100644 --- a/src/localQueue.ts +++ b/src/localQueue.ts @@ -1,10 +1,19 @@ +import assert from "assert"; + import { CompiledSharedOptions, EnhancedWithPgClient, WorkerPoolOptions, } from "."; +import { MINUTE, SECOND } from "./cronConstants"; +import defer, { Deferred } from "./deferred"; import { GetJobFunction, Job, TaskList, WorkerPool } from "./interfaces"; import { getJob as baseGetJob } from "./sql/getJob"; +import { returnJob } from "./sql/returnJob"; + +const POLLING = "POLLING"; +const WAITING = "WAITING"; +const TTL_EXPIRED = "TTL_EXPIRED"; /** * The local queue exists to reduce strain on the database; it works by @@ -70,18 +79,208 @@ import { getJob as baseGetJob } from "./sql/getJob"; */ export class LocalQueue { - getJobCounter = 0; - jobQueue: Job[] = []; - nextJobs: Promise | null = null; - getJobBaseline = 0; + readonly ttl: number; + readonly pollInterval: number; + readonly jobQueue: Job[] = []; + readonly workerQueue: Deferred[] = []; + fetchInProgress = false; + ttlExpiredTimer: NodeJS.Timeout | null = null; + fetchTimer: NodeJS.Timeout | null = null; + // Set true to fetch immediately after a fetch completes; typically only used + // when the queue is pulsed during a fetch. 
+ fetchAgain = false; + mode: typeof POLLING | typeof WAITING | typeof TTL_EXPIRED; constructor( - private compiledSharedOptions: CompiledSharedOptions, - private tasks: TaskList, - private withPgClient: EnhancedWithPgClient, - private workerPool: WorkerPool, - private getJobBatchSize: number, - ) {} + private readonly compiledSharedOptions: CompiledSharedOptions, + private readonly tasks: TaskList, + private readonly withPgClient: EnhancedWithPgClient, + private readonly workerPool: WorkerPool, + private readonly getJobBatchSize: number, + ) { + this.ttl = + compiledSharedOptions.resolvedPreset.worker.localQueueTtl ?? 5 * MINUTE; + this.pollInterval = + compiledSharedOptions.resolvedPreset.worker.pollInterval ?? 2 * SECOND; + this.setModePolling(); + } + + private setModePolling() { + assert.ok( + !this.fetchTimer, + "Cannot enter polling mode when a fetch is scheduled", + ); + assert.ok( + !this.fetchInProgress, + "Cannot enter polling mode when fetch is in progress", + ); + assert.equal( + this.jobQueue.length, + 0, + "Cannot enter polling mode when job queue isn't empty", + ); + + if (this.ttlExpiredTimer) { + clearTimeout(this.ttlExpiredTimer); + this.ttlExpiredTimer = null; + } + + this.mode = POLLING; + + this.fetch(); + } + + private setModeWaiting() { + // Can only enter WAITING mode from POLLING mode. + assert.equal(this.mode, POLLING); + assert.ok( + !this.fetchTimer, + "Cannot enter waiting mode when a fetch is scheduled", + ); + assert.ok( + !this.fetchInProgress, + "Cannot enter waiting mode when fetch is in progress", + ); + assert.notEqual( + this.jobQueue.length, + 0, + "Cannot enter waiting mode when job queue is empty", + ); + + if (this.ttlExpiredTimer) { + clearTimeout(this.ttlExpiredTimer); + } + + this.mode = WAITING; + + this.ttlExpiredTimer = setTimeout(() => { + this.setModeTtlExpired(); + }, this.ttl); + } + + private setModeTtlExpired() { + // Can only enter TTL_EXPIRED mode from WAITING mode. 
+ assert.equal(this.mode, WAITING); + assert.ok( + !this.fetchTimer, + "Cannot enter TTL expired mode when a fetch is scheduled", + ); + assert.ok( + !this.fetchInProgress, + "Cannot enter TTL expired mode when fetch is in progress", + ); + assert.notEqual( + this.jobQueue.length, + 0, + "Cannot enter TTL expired mode when job queue is empty", + ); + + if (this.ttlExpiredTimer) { + clearTimeout(this.ttlExpiredTimer); + this.ttlExpiredTimer = null; + } + + this.mode = TTL_EXPIRED; + + // Return jobs to the pool + const jobsToReturn = this.jobQueue.splice(0, this.jobQueue.length); + returnJob( + this.compiledSharedOptions, + this.withPgClient, + this.workerPool.id, + jobsToReturn, + ).catch((e) => { + // TODO: handle this better! + this.compiledSharedOptions.logger.error( + `Failed to return jobs from local queue to database queue`, + { error: e }, + ); + }); + } + + private fetch = (): void => { + this._fetch().catch((e) => { + // This should not happen + this.compiledSharedOptions.logger.error(`Error occurred during fetch`, { + error: e, + }); + }); + }; + + private async _fetch() { + try { + assert.equal(this.mode, POLLING, "Can only fetch when in polling mode"); + assert.equal( + this.fetchInProgress, + false, + "Cannot fetch when a fetch is already in progress", + ); + if (this.fetchTimer) { + clearTimeout(this.fetchTimer); + this.fetchTimer = null; + } + this.fetchAgain = false; + this.fetchInProgress = true; + + const jobs = await baseGetJob( + this.compiledSharedOptions, + this.withPgClient, + this.tasks, + this.workerPool.id, + null, + this.getJobBatchSize, + ); + assert.equal( + this.jobQueue.length, + 0, + "Should not fetch when job queue isn't empty", + ); + const jobCount = jobs.length; + const workerCount = Math.min(jobCount, this.workerQueue.length); + const workers = this.workerQueue.splice(0, workerCount); + for (let i = 0; i < jobCount; i++) { + const job = jobs[i]; + if (i < workerCount) { + workers[i].resolve(job); + } else { + 
this.jobQueue.push(job); + } + } + if (this.jobQueue.length > 0) { + this.setModeWaiting(); + } else { + if (jobCount === this.getJobBatchSize || this.fetchAgain) { + // Maximal fetch; trigger immediate refetch + process.nextTick(this.fetch); + } else { + // Set up the timer + this.fetchTimer = setTimeout(this.fetch, this.pollInterval); + } + } + } catch (e) { + // Error happened; rely on poll interval. + this.compiledSharedOptions.logger.error( + `Error occurred fetching jobs; will try again on next poll interval. Error: ${e}`, + { error: e }, + ); + } finally { + this.fetchInProgress = false; + } + } + + /** Called when a new job becomes available in the DB */ + public pulse() { + // The only situation when this affects anything is if we're running in polling mode. + if (this.mode === POLLING) { + if (this.fetchInProgress) { + this.fetchAgain = true; + } else if (this.fetchTimer) { + clearTimeout(this.fetchTimer); + this.fetchTimer = null; + this.fetch(); + } + } + } // If you refactor this to be a method rather than a property, make sure that you `.bind(this)` to it. public getJob: GetJobFunction = async (workerId, flagsToSkip) => { @@ -98,47 +297,17 @@ export class LocalQueue { return jobs[0]; } - const job = this.jobQueue.pop(); - if (job !== undefined) { - return job; - } else { - return this.batchGetJob(++this.getJobCounter); + if (this.mode === TTL_EXPIRED) { + this.setModePolling(); } - }; - private async batchGetJob(myFetchId: number): Promise { - // TODO rewrite this so that if we have batch size of 1 we'll still fetch newer jobs in parallel (not queued) - if (!this.nextJobs) { - // Queue is empty, no fetch of jobs in progress; let's fetch them. 
- this.getJobBaseline = this.getJobCounter; - this.nextJobs = (async () => { - try { - const jobs = await baseGetJob( - this.compiledSharedOptions, - this.withPgClient, - this.tasks, - this.workerPool.id, - null, - this.getJobBatchSize, - ); - this.jobQueue = jobs.reverse(); - return jobs.length >= this.getJobBatchSize; - } finally { - this.nextJobs = null; - } - })(); - } - const fetchedMax = await this.nextJobs; - const job = this.jobQueue.pop(); - if (job) { + const job = this.jobQueue.shift(); + if (job !== undefined) { return job; - } else if (fetchedMax || myFetchId > this.getJobBaseline) { - // Either we fetched as many jobs as we could and there still weren't - // enough, or we requested a job after the request for jobs was sent to - // the database. Either way, let's fetch again. - return this.batchGetJob(myFetchId); } else { - return undefined; + const d = defer(); + this.workerQueue.push(d); + return d; } - } + }; } diff --git a/src/main.ts b/src/main.ts index 29ec3752..8c327b47 100644 --- a/src/main.ts +++ b/src/main.ts @@ -552,7 +552,7 @@ export function _runTaskList( worker: { concurrentJobs: baseConcurrency, gracefulShutdownAbortTimeout, - getJobBatchSize = -1, + localQueueSize = -1, completeJobBatchDelay = -1, failJobBatchDelay = -1, }, @@ -577,9 +577,9 @@ export function _runTaskList( ); } - if (getJobBatchSize > 0 && getJobBatchSize < concurrency) { + if (localQueueSize > 0 && localQueueSize < concurrency) { logger.warn( - `Your job batch size (${getJobBatchSize}) is smaller than your concurrency setting (${concurrency}); this may result in drastically lower performance if your jobs can complete quickly. Please update to \`getJobBatchSize: ${concurrency}\` to improve performance, or \`getJobBatchSize: -1\` to disable batching.`, + `Your job batch size (${localQueueSize}) is smaller than your concurrency setting (${concurrency}); this may result in drastically lower performance if your jobs can complete quickly. 
Please update to \`localQueueSize: ${concurrency}\` to improve performance, or \`localQueueSize: -1\` to disable batching.`, ); } @@ -925,13 +925,13 @@ export function _runTaskList( ); } const localQueue = - getJobBatchSize >= 1 + localQueueSize >= 1 ? new LocalQueue( compiledSharedOptions, tasks, withPgClient, workerPool, - getJobBatchSize, + localQueueSize, ) : null; const getJob: GetJobFunction = localQueue diff --git a/src/sql/returnJob.ts b/src/sql/returnJob.ts new file mode 100644 index 00000000..f7b22f4f --- /dev/null +++ b/src/sql/returnJob.ts @@ -0,0 +1,68 @@ +import { DbJob, EnhancedWithPgClient } from "../interfaces"; +import { CompiledSharedOptions } from "../lib"; + +export async function returnJob( + compiledSharedOptions: CompiledSharedOptions, + withPgClient: EnhancedWithPgClient, + poolId: string, + jobs: ReadonlyArray, +): Promise { + const { + escapedWorkerSchema, + workerSchema, + resolvedPreset: { + worker: { preparedStatements }, + }, + } = compiledSharedOptions; + + const jobsWithQueues: DbJob[] = []; + const jobsWithoutQueues: DbJob[] = []; + + for (const job of jobs) { + if (job.job_queue_id != null) { + jobsWithQueues.push(job); + } else { + jobsWithoutQueues.push(job); + } + } + + if (jobsWithQueues.length > 0) { + await withPgClient.withRetries((client) => + client.query({ + text: `\ +with j as ( +update ${escapedWorkerSchema}._private_jobs as jobs +set +attempts = GREATEST(0, attempts - 1), +locked_by = null, +locked_at = null +where id = ANY($2::bigint[]) +and locked_by = $1::text +returning * +) +update ${escapedWorkerSchema}._private_job_queues as job_queues +set locked_by = null, locked_at = null +from j +where job_queues.id = j.job_queue_id and job_queues.locked_by = $1::text;`, + values: [poolId, jobsWithQueues.map((job) => job.id)], + name: !preparedStatements ? 
undefined : `return_job_q/${workerSchema}`, + }), + ); + } + if (jobsWithoutQueues.length > 0) { + await withPgClient.withRetries((client) => + client.query({ + text: `\ +update ${escapedWorkerSchema}._private_jobs as jobs +set +attempts = GREATEST(0, attempts - 1), +locked_by = null, +locked_at = null +where id = ANY($2::bigint[]) +and locked_by = $1::text;`, + values: [poolId, jobsWithoutQueues.map((job) => job.id)], + name: !preparedStatements ? undefined : `return_job/${workerSchema}`, + }), + ); + } +} From 33faf355125e1ffbbe5281dae3b213948d668461 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 11 Jun 2024 09:09:59 +0100 Subject: [PATCH 017/155] Return to POLLING from WAITING; RELEASE mode. --- src/localQueue.ts | 88 ++++++++++++++++++++++++++++++++++++++++------- src/main.ts | 25 +++++++++++--- 2 files changed, 96 insertions(+), 17 deletions(-) diff --git a/src/localQueue.ts b/src/localQueue.ts index 16aedcd6..4c4a6680 100644 --- a/src/localQueue.ts +++ b/src/localQueue.ts @@ -14,6 +14,7 @@ import { returnJob } from "./sql/returnJob"; const POLLING = "POLLING"; const WAITING = "WAITING"; const TTL_EXPIRED = "TTL_EXPIRED"; +const RELEASED = "RELEASED"; /** * The local queue exists to reduce strain on the database; it works by @@ -32,6 +33,7 @@ const TTL_EXPIRED = "TTL_EXPIRED"; * - POLLING mode * - WAITING mode * - TTL_EXPIRED mode + * - RELEASED mode * * ## POLLING mode * @@ -76,6 +78,9 @@ const TTL_EXPIRED = "TTL_EXPIRED"; * local queue will sit in TTL_EXPIRED mode until a worker asks for a job, * whereupon the local queue will enter POLLING mode (triggering a fetch). * + * ## RELEASED mode + * + * Triggered on shutdown. */ export class LocalQueue { @@ -89,7 +94,7 @@ export class LocalQueue { // Set true to fetch immediately after a fetch completes; typically only used // when the queue is pulsed during a fetch. 
fetchAgain = false; - mode: typeof POLLING | typeof WAITING | typeof TTL_EXPIRED; + mode: typeof POLLING | typeof WAITING | typeof TTL_EXPIRED | typeof RELEASED; constructor( private readonly compiledSharedOptions: CompiledSharedOptions, @@ -97,6 +102,7 @@ export class LocalQueue { private readonly withPgClient: EnhancedWithPgClient, private readonly workerPool: WorkerPool, private readonly getJobBatchSize: number, + private readonly continuous: boolean, ) { this.ttl = compiledSharedOptions.resolvedPreset.worker.localQueueTtl ?? 5 * MINUTE; @@ -183,6 +189,10 @@ export class LocalQueue { this.mode = TTL_EXPIRED; // Return jobs to the pool + this.returnJobs(); + } + + private returnJobs() { const jobsToReturn = this.jobQueue.splice(0, this.jobQueue.length); returnJob( this.compiledSharedOptions, @@ -208,6 +218,7 @@ export class LocalQueue { }; private async _fetch() { + let fetchedMax = false; try { assert.equal(this.mode, POLLING, "Can only fetch when in polling mode"); assert.equal( @@ -222,6 +233,7 @@ export class LocalQueue { this.fetchAgain = false; this.fetchInProgress = true; + // The ONLY await in this function. 
const jobs = await baseGetJob( this.compiledSharedOptions, this.withPgClient, @@ -230,12 +242,14 @@ export class LocalQueue { null, this.getJobBatchSize, ); + assert.equal( this.jobQueue.length, 0, "Should not fetch when job queue isn't empty", ); const jobCount = jobs.length; + fetchedMax = jobCount >= this.getJobBatchSize; const workerCount = Math.min(jobCount, this.workerQueue.length); const workers = this.workerQueue.splice(0, workerCount); for (let i = 0; i < jobCount; i++) { @@ -246,17 +260,6 @@ export class LocalQueue { this.jobQueue.push(job); } } - if (this.jobQueue.length > 0) { - this.setModeWaiting(); - } else { - if (jobCount === this.getJobBatchSize || this.fetchAgain) { - // Maximal fetch; trigger immediate refetch - process.nextTick(this.fetch); - } else { - // Set up the timer - this.fetchTimer = setTimeout(this.fetch, this.pollInterval); - } - } } catch (e) { // Error happened; rely on poll interval. this.compiledSharedOptions.logger.error( @@ -266,6 +269,23 @@ export class LocalQueue { } finally { this.fetchInProgress = false; } + + // Finally, now that there is no fetch in progress, choose what to do next + if (this.mode === "RELEASED") { + this.returnJobs(); + } else if (this.jobQueue.length > 0) { + this.setModeWaiting(); + } else { + if (fetchedMax || this.fetchAgain) { + // Maximal fetch; trigger immediate refetch + this.fetch(); + } else if (this.continuous) { + // Set up the timer + this.fetchTimer = setTimeout(this.fetch, this.pollInterval); + } else { + this.setModeReleased(); + } + } } /** Called when a new job becomes available in the DB */ @@ -284,6 +304,10 @@ export class LocalQueue { // If you refactor this to be a method rather than a property, make sure that you `.bind(this)` to it. 
public getJob: GetJobFunction = async (workerId, flagsToSkip) => { + if (this.mode === RELEASED) { + return undefined; + } + // Cannot batch if there's flags if (flagsToSkip !== null) { const jobs = await baseGetJob( @@ -303,6 +327,10 @@ export class LocalQueue { const job = this.jobQueue.shift(); if (job !== undefined) { + if (this.jobQueue.length === 0) { + assert.equal(this.mode, WAITING); + this.setModePolling(); + } return job; } else { const d = defer(); @@ -310,4 +338,40 @@ export class LocalQueue { return d; } }; + + public release() { + this.setModeReleased(); + } + + private setModeReleased() { + assert.notEqual( + this.mode, + RELEASED, + "LocalQueue must only be released once", + ); + + const oldMode = this.mode; + this.mode = RELEASED; + + if (oldMode === POLLING) { + if (this.fetchTimer) { + clearTimeout(this.fetchTimer); + this.fetchTimer = null; + this.workerQueue.forEach((w) => w.resolve(undefined)); + } else { + // Rely on checking mode at end of fetch + } + } else if (oldMode === WAITING) { + if (this.ttlExpiredTimer) { + clearTimeout(this.ttlExpiredTimer); + this.ttlExpiredTimer = null; + } + // Trigger the jobs to be released + this.returnJobs(); + } else if (oldMode === TTL_EXPIRED) { + // No action necessary + } + + // TODO: we should await the returnJobs promise, the fetch promise, etc. 
+ } } diff --git a/src/main.ts b/src/main.ts index 8c327b47..f813947b 100644 --- a/src/main.ts +++ b/src/main.ts @@ -597,11 +597,16 @@ export function _runTaskList( // TODO: stop the batch()es and await the promises here const releaseCompleteJobPromise = releaseCompleteJob?.(); const releaseFailJobPromise = releaseFailJob?.(); - const [releaseCompleteJobResult, releaseFailJobResult] = - await Promise.allSettled([ - releaseCompleteJobPromise, - releaseFailJobPromise, - ]); + const releaseLocalQueue = localQueue?.release(); + const [ + releaseCompleteJobResult, + releaseFailJobResult, + releaseLocalQueueResult, + ] = await Promise.allSettled([ + releaseCompleteJobPromise, + releaseFailJobPromise, + releaseLocalQueue, + ]); if (releaseCompleteJobResult.status === "rejected") { // Log but continue regardless logger.error( @@ -620,6 +625,15 @@ export function _runTaskList( }, ); } + if (releaseLocalQueueResult.status === "rejected") { + // Log but continue regardless + logger.error( + `Releasing local queue failed: ${releaseLocalQueueResult.reason}`, + { + error: releaseLocalQueueResult.reason, + }, + ); + } return onDeactivate?.(); } } @@ -932,6 +946,7 @@ export function _runTaskList( withPgClient, workerPool, localQueueSize, + continuous, ) : null; const getJob: GetJobFunction = localQueue From 8ac92e6ca562ea0b5aa8f8b7b8c7f518a6cf0688 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 11 Jun 2024 09:18:23 +0100 Subject: [PATCH 018/155] Fix releasing from POLLING mode --- src/localQueue.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/localQueue.ts b/src/localQueue.ts index 4c4a6680..c009b19c 100644 --- a/src/localQueue.ts +++ b/src/localQueue.ts @@ -354,10 +354,14 @@ export class LocalQueue { this.mode = RELEASED; if (oldMode === POLLING) { + // Release pending workers + const workers = this.workerQueue.splice(0, this.workerQueue.length); + workers.forEach((w) => w.resolve(undefined)); + + // Release next fetch call if 
(this.fetchTimer) { clearTimeout(this.fetchTimer); this.fetchTimer = null; - this.workerQueue.forEach((w) => w.resolve(undefined)); } else { // Rely on checking mode at end of fetch } From d7e381e011cc049bcef3676ae4592edc19b16c38 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 11 Jun 2024 10:35:16 +0100 Subject: [PATCH 019/155] Concurrency 24 works better on my new machine --- perfTest/run.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perfTest/run.js b/perfTest/run.js index 4235ba54..1bb23111 100755 --- a/perfTest/run.js +++ b/perfTest/run.js @@ -6,7 +6,7 @@ const exec = promisify(rawExec); const JOB_COUNT = 20000; const STUCK_JOB_COUNT = 0; const PARALLELISM = 4; -const CONCURRENCY = 10; +const CONCURRENCY = 24; const time = async (cb) => { const start = process.hrtime(); From 1f6a4b01825bed0caf50d0ef59c3daa256efd0e3 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 11 Jun 2024 12:00:56 +0100 Subject: [PATCH 020/155] Wait for background tasks to complete before releasing --- src/interfaces.ts | 2 +- src/localQueue.ts | 81 +++++++++++++++++++++++++++++++++-------------- 2 files changed, 59 insertions(+), 24 deletions(-) diff --git a/src/interfaces.ts b/src/interfaces.ts index a4747768..3af440f1 100644 --- a/src/interfaces.ts +++ b/src/interfaces.ts @@ -1248,7 +1248,7 @@ export interface WorkerPluginContext { export type GetJobFunction = ( workerId: string, flagsToSkip: string[] | null, -) => Promise; +) => PromiseOrDirect; export type CompleteJobFunction = (job: DbJob) => void; export type FailJobFunction = ( diff --git a/src/localQueue.ts b/src/localQueue.ts index c009b19c..d117f4a1 100644 --- a/src/localQueue.ts +++ b/src/localQueue.ts @@ -3,6 +3,7 @@ import assert from "assert"; import { CompiledSharedOptions, EnhancedWithPgClient, + PromiseOrDirect, WorkerPoolOptions, } from "."; import { MINUTE, SECOND } from "./cronConstants"; @@ -95,6 +96,8 @@ export class LocalQueue { // when the queue is pulsed during a fetch. 
fetchAgain = false; mode: typeof POLLING | typeof WAITING | typeof TTL_EXPIRED | typeof RELEASED; + private promise = defer(); + private backgroundCount = 0; constructor( private readonly compiledSharedOptions: CompiledSharedOptions, @@ -111,6 +114,28 @@ export class LocalQueue { this.setModePolling(); } + private decreaseBackgroundCount = () => { + this.backgroundCount--; + if (this.mode === "RELEASED" && this.backgroundCount === 0) { + this.promise.resolve(); + } + }; + + /** + * For promises that happen in the background, but that we want to ensure are + * handled before we release the queue (so that the database pool isn't + * released too early). + */ + private background(promise: Promise) { + if (this.mode === "RELEASED" && this.backgroundCount === 0) { + throw new Error( + `Cannot background something when the queue is already released`, + ); + } + this.backgroundCount++; + promise.then(this.decreaseBackgroundCount, this.decreaseBackgroundCount); + } + private setModePolling() { assert.ok( !this.fetchTimer, @@ -194,27 +219,31 @@ export class LocalQueue { private returnJobs() { const jobsToReturn = this.jobQueue.splice(0, this.jobQueue.length); - returnJob( - this.compiledSharedOptions, - this.withPgClient, - this.workerPool.id, - jobsToReturn, - ).catch((e) => { - // TODO: handle this better! - this.compiledSharedOptions.logger.error( - `Failed to return jobs from local queue to database queue`, - { error: e }, - ); - }); + this.background( + returnJob( + this.compiledSharedOptions, + this.withPgClient, + this.workerPool.id, + jobsToReturn, + ).then((e) => { + // TODO: handle this better! 
+ this.compiledSharedOptions.logger.error( + `Failed to return jobs from local queue to database queue`, + { error: e }, + ); + }), + ); } private fetch = (): void => { - this._fetch().catch((e) => { - // This should not happen - this.compiledSharedOptions.logger.error(`Error occurred during fetch`, { - error: e, - }); - }); + this.background( + this._fetch().catch((e) => { + // This should not happen + this.compiledSharedOptions.logger.error(`Error occurred during fetch`, { + error: e, + }); + }), + ); }; private async _fetch() { @@ -303,14 +332,14 @@ export class LocalQueue { } // If you refactor this to be a method rather than a property, make sure that you `.bind(this)` to it. - public getJob: GetJobFunction = async (workerId, flagsToSkip) => { + public getJob: GetJobFunction = (workerId, flagsToSkip) => { if (this.mode === RELEASED) { return undefined; } // Cannot batch if there's flags if (flagsToSkip !== null) { - const jobs = await baseGetJob( + const jobsPromise = baseGetJob( this.compiledSharedOptions, this.withPgClient, this.tasks, @@ -318,7 +347,7 @@ export class LocalQueue { flagsToSkip, 1, ); - return jobs[0]; + return jobsPromise.then((jobs) => jobs[0]); } if (this.mode === TTL_EXPIRED) { @@ -340,7 +369,10 @@ export class LocalQueue { }; public release() { - this.setModeReleased(); + if (this.mode !== "RELEASED") { + this.setModeReleased(); + } + return this.promise; } private setModeReleased() { @@ -362,6 +394,7 @@ export class LocalQueue { if (this.fetchTimer) { clearTimeout(this.fetchTimer); this.fetchTimer = null; + this.promise.resolve(); } else { // Rely on checking mode at end of fetch } @@ -376,6 +409,8 @@ export class LocalQueue { // No action necessary } - // TODO: we should await the returnJobs promise, the fetch promise, etc. 
+ if (this.backgroundCount === 0) { + this.promise.resolve(); + } } } From 6b51f26651f865530e25677f0fc14e49d43e54d0 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 11 Jun 2024 12:58:51 +0100 Subject: [PATCH 021/155] Default settings --- perfTest/graphile.config.js | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/perfTest/graphile.config.js b/perfTest/graphile.config.js index 32ef26f8..841ba7d6 100644 --- a/perfTest/graphile.config.js +++ b/perfTest/graphile.config.js @@ -15,6 +15,10 @@ const preset = { fileExtensions: [".js", ".cjs", ".mjs"], // fileExtensions: [".js", ".cjs", ".mjs", ".ts", ".cts", ".mts"], gracefulShutdownAbortTimeout: 2500, + localQueueSize: -1, + completeJobBatchDelay: -1, + failJobBatchDelay: -1, }, }; + module.exports = preset; From 016ac3990a59b74c55830eab715077af278179c6 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 11 Jun 2024 13:53:04 +0100 Subject: [PATCH 022/155] Remove double map --- src/sql/completeJob.ts | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/sql/completeJob.ts b/src/sql/completeJob.ts index f64a827b..d3562ae9 100644 --- a/src/sql/completeJob.ts +++ b/src/sql/completeJob.ts @@ -15,18 +15,18 @@ export async function completeJob( }, } = compiledSharedOptions; - const jobsWithQueues: DbJob[] = []; - const jobsWithoutQueues: DbJob[] = []; + const jobIdsWithoutQueue: string[] = []; + const jobIdsWithQueue: string[] = []; for (const job of jobs) { if (job.job_queue_id != null) { - jobsWithQueues.push(job); + jobIdsWithQueue.push(job.id); } else { - jobsWithoutQueues.push(job); + jobIdsWithoutQueue.push(job.id); } } // TODO: retry logic, in case of server connection interruption - if (jobsWithQueues.length > 0) { + if (jobIdsWithQueue.length > 0) { await withPgClient.withRetries((client) => client.query({ text: `\ @@ -39,20 +39,20 @@ update ${escapedWorkerSchema}._private_job_queues as job_queues set locked_by = null, locked_at = null from j where job_queues.id = 
j.job_queue_id and job_queues.locked_by = $2::text;`, - values: [jobsWithQueues.map((j) => j.id), poolId], + values: [jobIdsWithQueue, poolId], name: !preparedStatements ? undefined : `complete_job_q/${workerSchema}`, }), ); } - if (jobsWithoutQueues.length > 0) { + if (jobIdsWithoutQueue.length > 0) { await withPgClient.withRetries((client) => client.query({ text: `\ delete from ${escapedWorkerSchema}._private_jobs as jobs where id = ANY($1::bigint[])`, - values: [jobsWithoutQueues.map((j) => j.id)], + values: [jobIdsWithoutQueue], name: !preparedStatements ? undefined : `complete_job/${workerSchema}`, }), ); From 59f4d99813cc1139553f4ad45bd9b7485b7cb4bc Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 11 Jun 2024 14:12:49 +0100 Subject: [PATCH 023/155] More efficient implementation --- src/main.ts | 46 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/src/main.ts b/src/main.ts index f813947b..b0163686 100644 --- a/src/main.ts +++ b/src/main.ts @@ -1135,8 +1135,18 @@ function batch( let pending = 0; let releasing = false; let released = false; + const incrementPending = () => { + pending++; + }; + const decrementPending = () => { + pending--; + if (pending === 0 && releasing === true) { + released = true; + promise.resolve(); + } + }; const promise = defer(); - let currentBatch: { specs: TSpec[]; promise: Promise } | null = null; + let currentBatch: TSpec[] | null = null; return { async release() { if (releasing) { @@ -1156,33 +1166,21 @@ function batch( ); } const spec = makeSpec(...args); - if (currentBatch) { - currentBatch.specs.push(spec); + if (currentBatch !== null) { + currentBatch.push(spec); } else { const specs = [spec]; - currentBatch = { - specs, - promise: (async () => { - pending++; - try { - await sleep(delay); - currentBatch = null; - await callback(specs); - } catch (error) { - errorHandler(error, specs); - } finally { - pending--; - if (pending === 0 && releasing) { - released = 
true; - promise.resolve(); - } - } - })(), - }; + currentBatch = specs; + incrementPending(); + setTimeout(() => { + currentBatch = null; + callback(specs).then(decrementPending, (error) => { + decrementPending(); + errorHandler(error, specs); + }); + }, delay); } return; }, }; } -const sleep = (ms: number) => - new Promise((resolve) => setTimeout(resolve, ms)); From 52dd450dca07153f47696adf4e0a2356349956bf Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 11 Jun 2024 14:22:24 +0100 Subject: [PATCH 024/155] Neater API --- src/interfaces.ts | 10 +++++----- src/main.ts | 18 +++++------------- src/worker.ts | 11 ++++++----- 3 files changed, 16 insertions(+), 23 deletions(-) diff --git a/src/interfaces.ts b/src/interfaces.ts index 3af440f1..dfa10aed 100644 --- a/src/interfaces.ts +++ b/src/interfaces.ts @@ -1251,8 +1251,8 @@ export type GetJobFunction = ( ) => PromiseOrDirect; export type CompleteJobFunction = (job: DbJob) => void; -export type FailJobFunction = ( - job: DbJob, - message: string, - replacementPayload: undefined | unknown[], -) => void; +export type FailJobFunction = (spec: { + job: DbJob; + message: string; + replacementPayload: undefined | unknown[]; +}) => void; diff --git a/src/main.ts b/src/main.ts index b0163686..0cc403ab 100644 --- a/src/main.ts +++ b/src/main.ts @@ -967,7 +967,6 @@ export function _runTaskList( completeJobBatchDelay >= 0 ? batch( completeJobBatchDelay, - (job) => job, (jobs) => baseCompleteJob( compiledSharedOptions, @@ -1006,11 +1005,6 @@ export function _runTaskList( failJobBatchDelay >= 0 ? 
batch( failJobBatchDelay, - (job, message, replacementPayload) => ({ - job, - message, - replacementPayload, - }), (specs) => baseFailJob( compiledSharedOptions, @@ -1035,9 +1029,9 @@ export function _runTaskList( ) : { release: null, - fn: (job, message, replacementPayload) => + fn: (spec) => baseFailJob(compiledSharedOptions, withPgClient, workerPool.id, [ - { job, message, replacementPayload }, + spec, ]), } ) as { release: (() => void) | null; fn: FailJobFunction }; @@ -1120,9 +1114,8 @@ export const runTaskListOnce = ( return pool; }; -function batch( +function batch( delay: number, - makeSpec: (...args: TArgs) => TSpec, callback: (specs: ReadonlyArray) => Promise, errorHandler: ( error: unknown, @@ -1130,7 +1123,7 @@ function batch( ) => void | Promise, ): { release(): void | Promise; - fn: (...args: TArgs) => void; + fn: (spec: TSpec) => void; } { let pending = 0; let releasing = false; @@ -1159,13 +1152,12 @@ function batch( } await promise; }, - fn(...args) { + fn(spec) { if (released) { throw new Error( "This batcher has been released, and so no more calls can be made.", ); } - const spec = makeSpec(...args); if (currentBatch !== null) { currentBatch.push(spec); } else { diff --git a/src/worker.ts b/src/worker.ts index 174bc7c9..b21d8786 100644 --- a/src/worker.ts +++ b/src/worker.ts @@ -348,14 +348,15 @@ export function makeNewWorker( }`, { failure: true, job, error: err, duration }, ); - failJob( + failJob({ job, message, // "Batch jobs": copy through only the unsuccessful parts of the payload - batchJobFailedPayloads.length > 0 - ? batchJobFailedPayloads - : undefined, - ); + replacementPayload: + batchJobFailedPayloads.length > 0 + ? 
batchJobFailedPayloads + : undefined, + }); } else { try { events.emit("job:success", { worker, job }); From af7a26b6cc4e75b04421423cbe49ee67ca0d5d79 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 11 Jun 2024 14:38:10 +0100 Subject: [PATCH 025/155] Marginally more efficient --- src/sql/completeJob.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/sql/completeJob.ts b/src/sql/completeJob.ts index d3562ae9..5f5b1581 100644 --- a/src/sql/completeJob.ts +++ b/src/sql/completeJob.ts @@ -32,7 +32,8 @@ export async function completeJob( text: `\ with j as ( delete from ${escapedWorkerSchema}._private_jobs as jobs -where id = ANY($1::bigint[]) +from unnest($1::bigint[]) n(n) +where id = n returning * ) update ${escapedWorkerSchema}._private_job_queues as job_queues @@ -51,7 +52,8 @@ where job_queues.id = j.job_queue_id and job_queues.locked_by = $2::text;`, client.query({ text: `\ delete from ${escapedWorkerSchema}._private_jobs as jobs -where id = ANY($1::bigint[])`, +using unnest($1::bigint[]) n(n) +where id = n`, values: [jobIdsWithoutQueue], name: !preparedStatements ? 
undefined : `complete_job/${workerSchema}`, }), From d05c9667a62ee9453781fcaafb91d3b052d80391 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 11 Jun 2024 15:23:56 +0100 Subject: [PATCH 026/155] Need to test more jobs because we're too fast now --- perfTest/run.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perfTest/run.js b/perfTest/run.js index 1bb23111..1903a89b 100755 --- a/perfTest/run.js +++ b/perfTest/run.js @@ -3,7 +3,7 @@ const { execSync, exec: rawExec } = require("child_process"); const { promisify } = require("util"); const exec = promisify(rawExec); -const JOB_COUNT = 20000; +const JOB_COUNT = 200000; const STUCK_JOB_COUNT = 0; const PARALLELISM = 4; const CONCURRENCY = 24; From 9da3ed8b50b6a2e07011961b20c7350aa2307a26 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 11 Jun 2024 15:24:08 +0100 Subject: [PATCH 027/155] Boolean comparison first --- src/main.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.ts b/src/main.ts index 0cc403ab..1c3e78a8 100644 --- a/src/main.ts +++ b/src/main.ts @@ -1133,7 +1133,7 @@ function batch( }; const decrementPending = () => { pending--; - if (pending === 0 && releasing === true) { + if (releasing === true && pending === 0) { released = true; promise.resolve(); } From 99e92318e675205c0f8414a89336ee986f8c7546 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 11 Jun 2024 15:24:57 +0100 Subject: [PATCH 028/155] Single job different statement, plus disabled alternative approach --- src/sql/completeJob.ts | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/src/sql/completeJob.ts b/src/sql/completeJob.ts index 5f5b1581..cba10a3f 100644 --- a/src/sql/completeJob.ts +++ b/src/sql/completeJob.ts @@ -1,6 +1,8 @@ import { DbJob, EnhancedWithPgClient } from "../interfaces"; import { CompiledSharedOptions } from "../lib"; +const manualPrepare = false; + export async function completeJob( compiledSharedOptions: 
CompiledSharedOptions, withPgClient: EnhancedWithPgClient, @@ -47,16 +49,39 @@ where job_queues.id = j.job_queue_id and job_queues.locked_by = $2::text;`, }), ); } - if (jobIdsWithoutQueue.length > 0) { + if (jobIdsWithoutQueue.length === 1) { await withPgClient.withRetries((client) => client.query({ text: `\ delete from ${escapedWorkerSchema}._private_jobs as jobs -using unnest($1::bigint[]) n(n) -where id = n`, - values: [jobIdsWithoutQueue], +where id = $1::bigint`, + values: [jobIdsWithoutQueue[0]], name: !preparedStatements ? undefined : `complete_job/${workerSchema}`, }), ); + } else if (jobIdsWithoutQueue.length > 1) { + if (manualPrepare) { + await withPgClient.withRetries((client) => + client.query({ + text: `\ +prepare gwcj (bigint) as delete from ${escapedWorkerSchema}._private_jobs where id = $1; +${jobIdsWithoutQueue.map((id) => `execute gwcj(${id});`).join("\n")} +deallocate gwcj;`, + }), + ); + } else { + await withPgClient.withRetries((client) => + client.query({ + text: `\ +delete from ${escapedWorkerSchema}._private_jobs as jobs +using unnest($1::bigint[]) n(n) +where id = n`, + values: [jobIdsWithoutQueue], + name: !preparedStatements + ? undefined + : `complete_jobs/${workerSchema}`, + }), + ); + } } } From f757f0df7bb3dbec497e62b3e3173d4d6356b253 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 11 Jun 2024 15:33:11 +0100 Subject: [PATCH 029/155] Listen with new LocalQueue too --- src/interfaces.ts | 2 ++ src/main.ts | 13 ++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/interfaces.ts b/src/interfaces.ts index dfa10aed..41c515eb 100644 --- a/src/interfaces.ts +++ b/src/interfaces.ts @@ -511,6 +511,8 @@ export interface Worker { export interface WorkerPool { id: string; + /** Encourage `n` workers to look for jobs _right now_, cancelling the delay timers. 
*/ + nudge(n: number): void; /** @deprecated Use gracefulShutdown instead */ release: () => Promise; gracefulShutdown: (message?: string) => Promise; diff --git a/src/main.ts b/src/main.ts index 1c3e78a8..64fdb2ba 100644 --- a/src/main.ts +++ b/src/main.ts @@ -448,10 +448,9 @@ export function runTaskListInternal( const payload = tryParseJson<{ count: number; }>(message.payload); - let n = payload?.count ?? 1; + const n = payload?.count ?? 1; if (n > 0) { - // Nudge up to `n` workers - workerPool._workers.some((worker) => worker.nudge() && --n <= 0); + workerPool.nudge(n); } break; } @@ -681,6 +680,14 @@ export function _runTaskList( get worker() { return concurrency === 1 ? this._workers[0] ?? null : null; }, + nudge(this: WorkerPool, n: number) { + if (localQueue) { + localQueue.pulse(); + } else { + // Nudge up to `n` workers + this._workers.some((worker) => worker.nudge() && --n <= 0); + } + }, abortSignal, abortPromise, release() { From 37b5055ceef277cba13faaefc4c7a0ceade27acb Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 11 Jun 2024 16:23:51 +0100 Subject: [PATCH 030/155] Lint fixes --- src/localQueue.ts | 1 - src/main.ts | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/localQueue.ts b/src/localQueue.ts index d117f4a1..383a1490 100644 --- a/src/localQueue.ts +++ b/src/localQueue.ts @@ -3,7 +3,6 @@ import assert from "assert"; import { CompiledSharedOptions, EnhancedWithPgClient, - PromiseOrDirect, WorkerPoolOptions, } from "."; import { MINUTE, SECOND } from "./cronConstants"; diff --git a/src/main.ts b/src/main.ts index 64fdb2ba..9a08d277 100644 --- a/src/main.ts +++ b/src/main.ts @@ -680,10 +680,11 @@ export function _runTaskList( get worker() { return concurrency === 1 ? this._workers[0] ?? 
null : null; }, - nudge(this: WorkerPool, n: number) { + nudge(this: WorkerPool, count: number) { if (localQueue) { localQueue.pulse(); } else { + let n = count; // Nudge up to `n` workers this._workers.some((worker) => worker.nudge() && --n <= 0); } From 222c979bc7f50aae165e7906bcb2b12fbb3c5fab Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 11 Jun 2024 16:34:55 +0100 Subject: [PATCH 031/155] Oops --- src/sql/completeJob.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sql/completeJob.ts b/src/sql/completeJob.ts index cba10a3f..83004575 100644 --- a/src/sql/completeJob.ts +++ b/src/sql/completeJob.ts @@ -34,7 +34,7 @@ export async function completeJob( text: `\ with j as ( delete from ${escapedWorkerSchema}._private_jobs as jobs -from unnest($1::bigint[]) n(n) +using unnest($1::bigint[]) n(n) where id = n returning * ) From 96b212695af97ef4c1797d0852ce353ab9710c88 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 12 Jul 2024 11:11:47 +0100 Subject: [PATCH 032/155] 0.17.0-canary.6c2c85c --- package.json | 2 +- src/version.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/package.json b/package.json index b53dd921..13805e92 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "graphile-worker", - "version": "0.16.6", + "version": "0.17.0-canary.6c2c85c", "type": "commonjs", "description": "Job queue for PostgreSQL", "main": "dist/index.js", diff --git a/src/version.ts b/src/version.ts index 8756a0f5..87b5199a 100644 --- a/src/version.ts +++ b/src/version.ts @@ -1,2 +1,2 @@ // This file is autogenerated by /scripts/postversion.mjs -export const version = "0.16.6"; +export const version = "0.17.0-canary.6c2c85c"; From 333c9c2105ebc69a5fdb85315de14aabee063140 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Wed, 16 Oct 2024 16:33:58 +0100 Subject: [PATCH 033/155] 0.17.0-canary.67dbcb6 --- package.json | 2 +- src/version.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git 
a/package.json b/package.json index 13805e92..1bd159d5 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "graphile-worker", - "version": "0.17.0-canary.6c2c85c", + "version": "0.17.0-canary.67dbcb6", "type": "commonjs", "description": "Job queue for PostgreSQL", "main": "dist/index.js", diff --git a/src/version.ts b/src/version.ts index 87b5199a..76f7fe6f 100644 --- a/src/version.ts +++ b/src/version.ts @@ -1,2 +1,2 @@ // This file is autogenerated by /scripts/postversion.mjs -export const version = "0.17.0-canary.6c2c85c"; +export const version = "0.17.0-canary.67dbcb6"; From 56ca27731265325c793670b3104fae2b230bf981 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 18 Oct 2024 17:02:44 +0100 Subject: [PATCH 034/155] [ci] bump From afd7059df71ed8adce875d5974a90119b413221b Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 18 Oct 2024 17:02:54 +0100 Subject: [PATCH 035/155] [ci] bump From 25f1fe88e42d96c6ec7615950885b699fff90cec Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 18 Oct 2024 17:02:58 +0100 Subject: [PATCH 036/155] [ci] bump From 307683e94bb64cd425ff780d0ca1a16d86d3fde2 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 18 Oct 2024 17:18:37 +0100 Subject: [PATCH 037/155] [ci] bump From 28a173fbdfd317b734038f74eceb4a3bd3ed42d9 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 18 Oct 2024 17:18:41 +0100 Subject: [PATCH 038/155] [ci] bump From 32f829955bdfd9f450e8bcd8d808e64117e1ab2a Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 18 Oct 2024 17:18:44 +0100 Subject: [PATCH 039/155] [ci] bump From a119a8b7ca0cb11e72ba6e25618ade65c081bcca Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 18 Oct 2024 17:39:53 +0100 Subject: [PATCH 040/155] [ci] bump From 2b7bd90a72c0cca2973ea51b6c3b82d6ad502351 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 18 Oct 2024 17:39:58 +0100 Subject: [PATCH 041/155] [ci] bump From c43bdf92bdd2fe9da636230d36de88ceab59531f Mon Sep 17 00:00:00 2001 From: Benjie 
Gillam Date: Fri, 18 Oct 2024 17:40:01 +0100 Subject: [PATCH 042/155] [ci] bump From 6d1cfa4aa6fa0115721412ef36433b1a1245b280 Mon Sep 17 00:00:00 2001 From: Benjie Date: Wed, 30 Oct 2024 18:12:10 +0000 Subject: [PATCH 043/155] Update src/localQueue.ts --- src/localQueue.ts | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/localQueue.ts b/src/localQueue.ts index 383a1490..8299b895 100644 --- a/src/localQueue.ts +++ b/src/localQueue.ts @@ -224,13 +224,16 @@ export class LocalQueue { this.withPgClient, this.workerPool.id, jobsToReturn, - ).then((e) => { - // TODO: handle this better! - this.compiledSharedOptions.logger.error( - `Failed to return jobs from local queue to database queue`, - { error: e }, - ); - }), + ).then( + () => {}, + (e) => { + // TODO: handle this better! + this.compiledSharedOptions.logger.error( + `Failed to return jobs from local queue to database queue`, + { error: e }, + ); + } + ), ); } From 8868d1ffbb63457088347982573337d98da74a0d Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 25 Oct 2024 16:11:48 +0100 Subject: [PATCH 044/155] Move localQueue options into their own object --- perfTest/graphile.config.js | 4 ++- src/index.ts | 50 +++++++++++++++++++------------------ src/localQueue.ts | 6 ++--- src/main.ts | 2 +- 4 files changed, 33 insertions(+), 29 deletions(-) diff --git a/perfTest/graphile.config.js b/perfTest/graphile.config.js index 841ba7d6..62ee0b4c 100644 --- a/perfTest/graphile.config.js +++ b/perfTest/graphile.config.js @@ -15,7 +15,9 @@ const preset = { fileExtensions: [".js", ".cjs", ".mjs"], // fileExtensions: [".js", ".cjs", ".mjs", ".ts", ".cts", ".mts"], gracefulShutdownAbortTimeout: 2500, - localQueueSize: -1, + localQueue: { + size: -1, + }, completeJobBatchDelay: -1, failJobBatchDelay: -1, }, diff --git a/src/index.ts b/src/index.ts index e377676d..ef44d471 100644 --- a/src/index.ts +++ b/src/index.ts @@ -155,31 +155,33 @@ declare global { events?: WorkerEvents; - /** - * 
To enable processing jobs in batches, set this to an integer larger - * than 1. This will result in jobs being fetched by the pool rather than - * the worker, the pool will fetch (and lock!) `localQueueSize` jobs up - * front, and each time a worker requests a job it will be served from - * this list until the list is exhausted, at which point a new set of - * jobs will be fetched (and locked). - * - * This setting can help reduce the load on your database from looking - * for jobs, but is only really effective when there are often many jobs - * queued and ready to go, and can increase the latency of job execution - * because a single worker may lock jobs into its queue leaving other - * workers idle. - * - * @default `-1` - */ - localQueueSize?: number; + localQueue?: { + /** + * To enable processing jobs in batches, set this to an integer larger + * than 1. This will result in jobs being fetched by the pool rather than + * the worker, the pool will fetch (and lock!) `localQueue.size` jobs up + * front, and each time a worker requests a job it will be served from + * this list until the list is exhausted, at which point a new set of + * jobs will be fetched (and locked). + * + * This setting can help reduce the load on your database from looking + * for jobs, but is only really effective when there are often many jobs + * queued and ready to go, and can increase the latency of job execution + * because a single worker may lock jobs into its queue leaving other + * workers idle. + * + * @default `-1` + */ + size: number; - /** - * How long should jobs sit in the local queue before they are returned - * to the database? Defaults to 5 minutes. - * - * @default `300000` - */ - localQueueTtl?: number; + /** + * How long should jobs sit in the local queue before they are returned + * to the database? Defaults to 5 minutes. 
+ * + * @default `300000` + */ + ttl?: number; + }; /** * The time in milliseconds to wait after a `completeJob` call to see if diff --git a/src/localQueue.ts b/src/localQueue.ts index 8299b895..e82d0193 100644 --- a/src/localQueue.ts +++ b/src/localQueue.ts @@ -23,8 +23,8 @@ const RELEASED = "RELEASED"; * relieving the workers of this responsibility. * * The local queue trades latency for throughput: jobs may sit in the local - * queue for a longer time (maximum `localQueueSize` jobs waiting maximum - * `localQueueTTL` milliseconds), but fewer requests to the database are made + * queue for a longer time (maximum `localQueue.size` jobs waiting maximum + * `localQueue.ttl` milliseconds), but fewer requests to the database are made * for jobs since more jobs are fetched at once, enabling the worker to reach * higher levels of performance (and reducing read stress on the DB). * @@ -107,7 +107,7 @@ export class LocalQueue { private readonly continuous: boolean, ) { this.ttl = - compiledSharedOptions.resolvedPreset.worker.localQueueTtl ?? 5 * MINUTE; + compiledSharedOptions.resolvedPreset.worker.localQueue?.ttl ?? 5 * MINUTE; this.pollInterval = compiledSharedOptions.resolvedPreset.worker.pollInterval ?? 
2 * SECOND; this.setModePolling(); diff --git a/src/main.ts b/src/main.ts index 9a08d277..5f73c129 100644 --- a/src/main.ts +++ b/src/main.ts @@ -551,7 +551,7 @@ export function _runTaskList( worker: { concurrentJobs: baseConcurrency, gracefulShutdownAbortTimeout, - localQueueSize = -1, + localQueue: { size: localQueueSize = -1 } = {}, completeJobBatchDelay = -1, failJobBatchDelay = -1, }, From 9767d4869da44fb2cfc33679713ff8a02439b492 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 25 Oct 2024 16:14:02 +0100 Subject: [PATCH 045/155] Introduce STARTING mode --- src/localQueue.ts | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/localQueue.ts b/src/localQueue.ts index e82d0193..fd397616 100644 --- a/src/localQueue.ts +++ b/src/localQueue.ts @@ -11,6 +11,7 @@ import { GetJobFunction, Job, TaskList, WorkerPool } from "./interfaces"; import { getJob as baseGetJob } from "./sql/getJob"; import { returnJob } from "./sql/returnJob"; +const STARTING = "STARTING"; const POLLING = "POLLING"; const WAITING = "WAITING"; const TTL_EXPIRED = "TTL_EXPIRED"; @@ -30,18 +31,24 @@ const RELEASED = "RELEASED"; * * The local queue is always in one of these modes: * + * - STARTING mode * - POLLING mode * - WAITING mode * - TTL_EXPIRED mode * - RELEASED mode * + * ## STARTING mode + * + * STARTING mode is the initial state of the local queue. + * + * Immediately move to POLLING mode. + * * ## POLLING mode * - * POLLING mode is the initial state of the local queue. The queue will only be - * in POLLING mode when it contains no cached jobs. + * The queue will only be in POLLING mode when it contains no cached jobs. * - * When the queue enters POLLING mode (and when it starts) it will trigger a - * fetch of jobs from the database. + * When the queue enters POLLING mode it will trigger a fetch of jobs from the + * database. * * If no jobs were returned then it will wait `pollInterval` ms and then fetch * again. 
@@ -94,7 +101,12 @@ export class LocalQueue { // Set true to fetch immediately after a fetch completes; typically only used // when the queue is pulsed during a fetch. fetchAgain = false; - mode: typeof POLLING | typeof WAITING | typeof TTL_EXPIRED | typeof RELEASED; + mode: + | typeof STARTING + | typeof POLLING + | typeof WAITING + | typeof TTL_EXPIRED + | typeof RELEASED = STARTING; private promise = defer(); private backgroundCount = 0; From 6d19367f5805069b1707b6fca00757e1ca793468 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 25 Oct 2024 17:30:46 +0100 Subject: [PATCH 046/155] Add localQueue refetchDelay feature: be kind on DB when queue is near-empty --- src/index.ts | 42 +++++++++++++ src/localQueue.ts | 148 ++++++++++++++++++++++++++++++++++++++++------ src/main.ts | 2 +- 3 files changed, 174 insertions(+), 18 deletions(-) diff --git a/src/index.ts b/src/index.ts index ef44d471..448e150a 100644 --- a/src/index.ts +++ b/src/index.ts @@ -181,6 +181,48 @@ declare global { * @default `300000` */ ttl?: number; + + /** + * When running at very high scale (multiple worker instances, each + * with some level of concurrency), Worker's polling can cause + * significant load on the database when there are too few jobs in the + * database to keep all worker pools busy - each time a new job comes + * in, each pool may request it, multiplying up the load. To reduce + * this impact, when a pool receives no (or few) results to its query + * for new jobs, we can instigate a "refetch delay" to cause the pool + * to wait before issuing its next poll for jobs, even when new job + * notifications come in. + */ + refetchDelay?: { + /** + * How long in milliseconds to wait, on average, before asking for + * more jobs when a previous fetch results in insufficient jobs to + * fill the local queue. (Causes the local queue to (mostly) ignore + * "new job" notifications.) 
+ * + * When new jobs are coming in but the workers are mostly idle, you + * can expect on average `(1000/durationMs) * INSTANCE_COUNT` "get jobs" + * queries per second to be issued to your database. Increasing this + * decreases database load at the cost of increased latency when there + * are insufficient jobs in the database to keep the local queue full. + */ + durationMs: number; + /** + * How many jobs should a fetch return to trigger the refetchDelay? + * Must be less than the local queue size + * + * @default {0} + */ + threshold?: number; + /** + * How many new jobs, on average, can the pool that's in idle fetch + * delay be notified of before it aborts the refetch delay and fetches + * anyway + * + * @default {5 * localQueue.size} + */ + abortThreshold?: number; + }; }; /** diff --git a/src/localQueue.ts b/src/localQueue.ts index fd397616..e0dc196c 100644 --- a/src/localQueue.ts +++ b/src/localQueue.ts @@ -47,17 +47,25 @@ const RELEASED = "RELEASED"; * * The queue will only be in POLLING mode when it contains no cached jobs. * - * When the queue enters POLLING mode it will trigger a fetch of jobs from the - * database. + * When the queue enters POLLING mode: * - * If no jobs were returned then it will wait `pollInterval` ms and then fetch - * again. + * - if any refetch delay has expired it will trigger a fetch of jobs from the + * database, + * - otherwise it will trigger a refetch to happen once the refetch delay has + * completed. * - * If a "new job" notification is received during the polling interval then the - * timer will be cancelled, and a fetch will be fired immediately. + * When jobs are fetched: * - * If jobs are returned from a POLLING mode fetch then the queue immediately - * enters WAITING mode. + * - if no jobs were returned then it will wait `pollInterval` ms and then + * fetch again. 
+ * - if fewer than `Math.ceil(Math.min(localQueueRefetchDelay.threshold, localQueueSize))` + * jobs were returned then a refetch delay will be set (if configured). + * - if jobs are returned from a POLLING mode fetch then the queue immediately + * enters WAITING mode. + * + * When a "new job" notification is received, once any required refetch delay + * has expired (or immediately if it has already expired) the timer will be + * cancelled, and a fetch will be fired immediately. * * ## WAITING mode * @@ -110,6 +118,12 @@ export class LocalQueue { private promise = defer(); private backgroundCount = 0; + /** If `localQueueRefetchDelay` is configured; set this true if the fetch resulted in a queue size lower than the threshold. */ + private refetchDelayActive = false; + private refetchDelayFetchOnComplete = false; + private refetchDelayTimer: NodeJS.Timeout | null = null; + private refetchDelayCounter: number = 0; + constructor( private readonly compiledSharedOptions: CompiledSharedOptions, private readonly tasks: TaskList, @@ -122,6 +136,17 @@ export class LocalQueue { compiledSharedOptions.resolvedPreset.worker.localQueue?.ttl ?? 5 * MINUTE; this.pollInterval = compiledSharedOptions.resolvedPreset.worker.pollInterval ?? 
2 * SECOND; + const localQueueRefetchDelayDuration = + compiledSharedOptions.resolvedPreset.worker.localQueue?.refetchDelay + ?.durationMs; + if ( + localQueueRefetchDelayDuration != null && + localQueueRefetchDelayDuration > this.pollInterval + ) { + throw new Error( + `Invalid configuration; 'preset.worker.localQueue.refetchDelay.durationMs' (${localQueueRefetchDelayDuration}) must not be larger than 'preset.worker.pollInterval' (${this.pollInterval})`, + ); + } this.setModePolling(); } @@ -250,6 +275,14 @@ export class LocalQueue { } private fetch = (): void => { + if (this.fetchTimer) { + clearTimeout(this.fetchTimer); + this.fetchTimer = null; + } + if (this.refetchDelayActive) { + this.refetchDelayFetchOnComplete = true; + return; + } this.background( this._fetch().catch((e) => { // This should not happen @@ -262,6 +295,9 @@ export class LocalQueue { private async _fetch() { let fetchedMax = false; + let fetchedUnderRefetchDelayThreshold = false; + const refetchDelayOptions = + this.compiledSharedOptions.resolvedPreset.worker.localQueue?.refetchDelay; try { assert.equal(this.mode, POLLING, "Can only fetch when in polling mode"); assert.equal( @@ -269,12 +305,19 @@ export class LocalQueue { false, "Cannot fetch when a fetch is already in progress", ); - if (this.fetchTimer) { - clearTimeout(this.fetchTimer); - this.fetchTimer = null; - } + assert.equal( + this.refetchDelayActive, + false, + "Can not fetch when fetches are meant to be delayed", + ); + assert.equal( + this.jobQueue.length, + 0, + "Should not fetch when job queue isn't empty", + ); this.fetchAgain = false; this.fetchInProgress = true; + this.refetchDelayCounter = 0; // The ONLY await in this function. 
const jobs = await baseGetJob( @@ -289,10 +332,18 @@ export class LocalQueue { assert.equal( this.jobQueue.length, 0, - "Should not fetch when job queue isn't empty", + "Should not fetch when job queue isn't empty (recheck)", ); const jobCount = jobs.length; fetchedMax = jobCount >= this.getJobBatchSize; + fetchedUnderRefetchDelayThreshold = + !fetchedMax && + !!refetchDelayOptions && + jobCount < Math.floor(refetchDelayOptions.threshold ?? 0); + + // NOTE: we don't need to handle `this.mode === RELEASED` here because + // being in that mode guarantees the workerQueue is empty. + const workerCount = Math.min(jobCount, this.workerQueue.length); const workers = this.workerQueue.splice(0, workerCount); for (let i = 0; i < jobCount; i++) { @@ -316,11 +367,31 @@ export class LocalQueue { // Finally, now that there is no fetch in progress, choose what to do next if (this.mode === "RELEASED") { this.returnJobs(); - } else if (this.jobQueue.length > 0) { + return; + } + + if (fetchedUnderRefetchDelayThreshold) { + const ms = + (0.5 + Math.random()) * (refetchDelayOptions?.durationMs ?? 
100); + + this.fetchAgain = false; + this.refetchDelayActive = true; + this.refetchDelayFetchOnComplete = false; + // NOTE: this.refetchDelayCounter is set at the beginning of fetch() to allow for pulse() during fetch() + this.refetchDelayTimer = setTimeout(this.refetchDelayCompleteOrAbort, ms); + } + + if (this.jobQueue.length > 0) { this.setModeWaiting(); } else { if (fetchedMax || this.fetchAgain) { - // Maximal fetch; trigger immediate refetch + // Maximal fetch and all jobs instantly consumed; trigger immediate refetch + // OR: new jobs came in during fetch(); trigger immediate refetch + assert.equal( + this.refetchDelayActive, + false, + "refetchDelayActive should imply didn't fetch max and fetchAgain is false", + ); this.fetch(); } else if (this.continuous) { // Set up the timer @@ -329,10 +400,49 @@ export class LocalQueue { this.setModeReleased(); } } + + // In case the counter was incremented sufficiently during fetch() + this.checkRefetchDelayAbortThreshold(); + } + + private refetchDelayCompleteOrAbort = (): void => { + if (this.refetchDelayTimer) { + clearTimeout(this.refetchDelayTimer); + this.refetchDelayTimer = null; + } + this.refetchDelayActive = false; + if (this.mode === POLLING && this.refetchDelayFetchOnComplete) { + // Cancel poll, do now + if (this.fetchTimer) { + clearTimeout(this.fetchTimer); + this.fetchTimer = null; + } + this.fetch(); + } + }; + + private checkRefetchDelayAbortThreshold() { + if (!this.refetchDelayActive || this.mode === "RELEASED") { + return; + } + const refetchDelayOptions = + this.compiledSharedOptions.resolvedPreset.worker.localQueue?.refetchDelay; + const threshold = Math.min( + refetchDelayOptions?.abortThreshold ?? 
Infinity, + 5 * this.getJobBatchSize, + ); + if (this.refetchDelayCounter >= threshold) { + this.refetchDelayFetchOnComplete = true; + this.refetchDelayCompleteOrAbort(); + } } /** Called when a new job becomes available in the DB */ - public pulse() { + public pulse(count: number) { + this.refetchDelayCounter += count; + + this.checkRefetchDelayAbortThreshold(); + // The only situation when this affects anything is if we're running in polling mode. if (this.mode === POLLING) { if (this.fetchInProgress) { @@ -399,6 +509,11 @@ export class LocalQueue { const oldMode = this.mode; this.mode = RELEASED; + if (this.refetchDelayTimer != null) { + clearTimeout(this.refetchDelayTimer); + this.refetchDelayTimer = null; + } + if (oldMode === POLLING) { // Release pending workers const workers = this.workerQueue.splice(0, this.workerQueue.length); @@ -422,7 +537,6 @@ export class LocalQueue { } else if (oldMode === TTL_EXPIRED) { // No action necessary } - if (this.backgroundCount === 0) { this.promise.resolve(); } diff --git a/src/main.ts b/src/main.ts index 5f73c129..0af9daf6 100644 --- a/src/main.ts +++ b/src/main.ts @@ -682,7 +682,7 @@ export function _runTaskList( }, nudge(this: WorkerPool, count: number) { if (localQueue) { - localQueue.pulse(); + localQueue.pulse(count); } else { let n = count; // Nudge up to `n` workers From a087440f34d343ee344aee25a5dfe2392c5fe454 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 25 Oct 2024 17:35:35 +0100 Subject: [PATCH 047/155] Randomize the abort threshold --- src/localQueue.ts | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/localQueue.ts b/src/localQueue.ts index e0dc196c..7aa1389f 100644 --- a/src/localQueue.ts +++ b/src/localQueue.ts @@ -123,6 +123,7 @@ export class LocalQueue { private refetchDelayFetchOnComplete = false; private refetchDelayTimer: NodeJS.Timeout | null = null; private refetchDelayCounter: number = 0; + private refetchDelayAbortThreshold: number = Infinity; 
constructor( private readonly compiledSharedOptions: CompiledSharedOptions, @@ -373,10 +374,17 @@ export class LocalQueue { if (fetchedUnderRefetchDelayThreshold) { const ms = (0.5 + Math.random()) * (refetchDelayOptions?.durationMs ?? 100); + const threshold = + (0.5 + Math.random()) * + Math.min( + refetchDelayOptions?.abortThreshold ?? Infinity, + 5 * this.getJobBatchSize, + ); this.fetchAgain = false; this.refetchDelayActive = true; this.refetchDelayFetchOnComplete = false; + this.refetchDelayAbortThreshold = threshold; // NOTE: this.refetchDelayCounter is set at the beginning of fetch() to allow for pulse() during fetch() this.refetchDelayTimer = setTimeout(this.refetchDelayCompleteOrAbort, ms); } @@ -425,13 +433,7 @@ export class LocalQueue { if (!this.refetchDelayActive || this.mode === "RELEASED") { return; } - const refetchDelayOptions = - this.compiledSharedOptions.resolvedPreset.worker.localQueue?.refetchDelay; - const threshold = Math.min( - refetchDelayOptions?.abortThreshold ?? 
Infinity, - 5 * this.getJobBatchSize, - ); - if (this.refetchDelayCounter >= threshold) { + if (this.refetchDelayCounter >= this.refetchDelayAbortThreshold) { this.refetchDelayFetchOnComplete = true; this.refetchDelayCompleteOrAbort(); } From e3ec005952653a680b9deaac516f300b7b43922a Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 25 Oct 2024 17:42:49 +0100 Subject: [PATCH 048/155] Cleaner branching without setting fetchAgain by accident --- src/localQueue.ts | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/localQueue.ts b/src/localQueue.ts index 7aa1389f..931a7145 100644 --- a/src/localQueue.ts +++ b/src/localQueue.ts @@ -410,7 +410,7 @@ export class LocalQueue { } // In case the counter was incremented sufficiently during fetch() - this.checkRefetchDelayAbortThreshold(); + this.handleCheckRefetchDelayAbortThreshold(); } private refetchDelayCompleteOrAbort = (): void => { @@ -429,24 +429,25 @@ export class LocalQueue { } }; - private checkRefetchDelayAbortThreshold() { + private handleCheckRefetchDelayAbortThreshold(): boolean { if (!this.refetchDelayActive || this.mode === "RELEASED") { - return; + return false; } if (this.refetchDelayCounter >= this.refetchDelayAbortThreshold) { this.refetchDelayFetchOnComplete = true; this.refetchDelayCompleteOrAbort(); + return true; } + return false; } /** Called when a new job becomes available in the DB */ public pulse(count: number) { this.refetchDelayCounter += count; - this.checkRefetchDelayAbortThreshold(); - - // The only situation when this affects anything is if we're running in polling mode. 
- if (this.mode === POLLING) { + if (this.handleCheckRefetchDelayAbortThreshold()) { + /* handled */ + } else if (this.mode === POLLING) { if (this.fetchInProgress) { this.fetchAgain = true; } else if (this.fetchTimer) { @@ -515,6 +516,7 @@ export class LocalQueue { clearTimeout(this.refetchDelayTimer); this.refetchDelayTimer = null; } + this.refetchDelayActive = false; if (oldMode === POLLING) { // Release pending workers From d98598db831fc3ce863cdf900c4c30d5fa4071e0 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 1 Nov 2024 12:25:31 +0000 Subject: [PATCH 049/155] Lint --- src/localQueue.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/localQueue.ts b/src/localQueue.ts index 931a7145..6db94465 100644 --- a/src/localQueue.ts +++ b/src/localQueue.ts @@ -270,7 +270,7 @@ export class LocalQueue { `Failed to return jobs from local queue to database queue`, { error: e }, ); - } + }, ), ); } From 8f72ececfaec0452a6427cf01f5483734a500a6f Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Mon, 11 Nov 2024 15:42:54 +0000 Subject: [PATCH 050/155] Don't trigger refetch once setting mode to released --- src/localQueue.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/src/localQueue.ts b/src/localQueue.ts index 6db94465..a1abe0d5 100644 --- a/src/localQueue.ts +++ b/src/localQueue.ts @@ -406,6 +406,7 @@ export class LocalQueue { this.fetchTimer = setTimeout(this.fetch, this.pollInterval); } else { this.setModeReleased(); + return; } } From 098d203c8bd16d23259bb655b549585d724df3c8 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Mon, 11 Nov 2024 15:43:40 +0000 Subject: [PATCH 051/155] Fix bug and clarify variable name --- src/localQueue.ts | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/src/localQueue.ts b/src/localQueue.ts index a1abe0d5..90edbcb7 100644 --- a/src/localQueue.ts +++ b/src/localQueue.ts @@ -295,8 +295,19 @@ export class LocalQueue { }; private async _fetch() { + /** + * Did we 
fetch the maximum number of records that we could? (If so, we + * should consider fetching again straight away so there's always jobs to + * be done.) + */ let fetchedMax = false; - let fetchedUnderRefetchDelayThreshold = false; + /** + * Did we fetch more jobs than the refetch delay threshold? (Greater than, + * not equal to.) If false, we should start a refetch delay. + * + * Initialized to `true` so on error we don't enable refetch delay. + */ + let refetchDelayThresholdSurpassed = true; const refetchDelayOptions = this.compiledSharedOptions.resolvedPreset.worker.localQueue?.refetchDelay; try { @@ -337,10 +348,13 @@ export class LocalQueue { ); const jobCount = jobs.length; fetchedMax = jobCount >= this.getJobBatchSize; - fetchedUnderRefetchDelayThreshold = - !fetchedMax && - !!refetchDelayOptions && - jobCount < Math.floor(refetchDelayOptions.threshold ?? 0); + refetchDelayThresholdSurpassed = + // If we've fetched the maximum, we've met the requirement + fetchedMax || + // If refetch delay is disabled, we've met the requirement + !refetchDelayOptions || + // If we fetched more than (**not** equal to) `threshold` jobs, we've met the requirement + jobCount > Math.floor(refetchDelayOptions.threshold ?? 0); // NOTE: we don't need to handle `this.mode === RELEASED` here because // being in that mode guarantees the workerQueue is empty. @@ -371,7 +385,7 @@ export class LocalQueue { return; } - if (fetchedUnderRefetchDelayThreshold) { + if (!refetchDelayThresholdSurpassed) { const ms = (0.5 + Math.random()) * (refetchDelayOptions?.durationMs ?? 
100); const threshold = From b45b59dc5476e4f13dadea23f2e7e70f4e933d0a Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Mon, 11 Nov 2024 15:43:56 +0000 Subject: [PATCH 052/155] Comments and variable renames for clarity --- src/localQueue.ts | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/localQueue.ts b/src/localQueue.ts index 90edbcb7..8a453c07 100644 --- a/src/localQueue.ts +++ b/src/localQueue.ts @@ -329,6 +329,9 @@ export class LocalQueue { ); this.fetchAgain = false; this.fetchInProgress = true; + // NOTE: this.refetchDelayCounter is set here allow for pulse() during + // fetch(). If the refetch delay threshold is surpassed then this value + // is harmlessly ignored. this.refetchDelayCounter = 0; // The ONLY await in this function. @@ -386,9 +389,11 @@ export class LocalQueue { } if (!refetchDelayThresholdSurpassed) { - const ms = + /** How long to avoid any refetches for */ + const refetchDelayMs = (0.5 + Math.random()) * (refetchDelayOptions?.durationMs ?? 100); - const threshold = + /** How many notifications do we need to receive before we abort the "no refetches" behavior? */ + const abortThreshold = (0.5 + Math.random()) * Math.min( refetchDelayOptions?.abortThreshold ?? Infinity, @@ -398,9 +403,13 @@ export class LocalQueue { this.fetchAgain = false; this.refetchDelayActive = true; this.refetchDelayFetchOnComplete = false; - this.refetchDelayAbortThreshold = threshold; - // NOTE: this.refetchDelayCounter is set at the beginning of fetch() to allow for pulse() during fetch() - this.refetchDelayTimer = setTimeout(this.refetchDelayCompleteOrAbort, ms); + this.refetchDelayAbortThreshold = abortThreshold; + // NOTE: this.refetchDelayCounter is set at the beginning of fetch() + // (i.e. 
above) to allow for pulse() during fetch() + this.refetchDelayTimer = setTimeout( + this.refetchDelayCompleteOrAbort, + refetchDelayMs, + ); } if (this.jobQueue.length > 0) { From 835502f11156d929044d8227d128a52a6652c0ae Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Wed, 13 Nov 2024 14:37:31 +0000 Subject: [PATCH 053/155] Clarify and fix behavior of refetch delay --- src/localQueue.ts | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/localQueue.ts b/src/localQueue.ts index 8a453c07..ef3baeaa 100644 --- a/src/localQueue.ts +++ b/src/localQueue.ts @@ -438,7 +438,7 @@ export class LocalQueue { } private refetchDelayCompleteOrAbort = (): void => { - if (this.refetchDelayTimer) { + if (this.refetchDelayTimer != null) { clearTimeout(this.refetchDelayTimer); this.refetchDelayTimer = null; } @@ -453,6 +453,10 @@ export class LocalQueue { } }; + /** + * If no refetch delay is active, returns false; otherwise returns true and + * checks to see if we need to abort the delay and trigger a fetch. + */ private handleCheckRefetchDelayAbortThreshold(): boolean { if (!this.refetchDelayActive || this.mode === "RELEASED") { return false; @@ -460,9 +464,8 @@ export class LocalQueue { if (this.refetchDelayCounter >= this.refetchDelayAbortThreshold) { this.refetchDelayFetchOnComplete = true; this.refetchDelayCompleteOrAbort(); - return true; } - return false; + return true; } /** Called when a new job becomes available in the DB */ @@ -470,11 +473,12 @@ export class LocalQueue { this.refetchDelayCounter += count; if (this.handleCheckRefetchDelayAbortThreshold()) { - /* handled */ + // Refetch delay was enabled; we've incremented the counter and taken + // action if necessary. No further action necessary. 
} else if (this.mode === POLLING) { if (this.fetchInProgress) { this.fetchAgain = true; - } else if (this.fetchTimer) { + } else if (this.fetchTimer != null) { clearTimeout(this.fetchTimer); this.fetchTimer = null; this.fetch(); From abc16f80e9fdfe47ff59fd26a83c0fecc9e8d9cb Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Wed, 13 Nov 2024 14:39:08 +0000 Subject: [PATCH 054/155] Reduce diff --- src/localQueue.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/src/localQueue.ts b/src/localQueue.ts index ef3baeaa..a284a294 100644 --- a/src/localQueue.ts +++ b/src/localQueue.ts @@ -569,6 +569,7 @@ export class LocalQueue { } else if (oldMode === TTL_EXPIRED) { // No action necessary } + if (this.backgroundCount === 0) { this.promise.resolve(); } From 187c4e031fb8fb6658ac53c0b5f844dc78e37cd6 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Wed, 13 Nov 2024 16:28:19 +0000 Subject: [PATCH 055/155] 0.17.0-canary.379fb2e --- package.json | 2 +- src/version.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/package.json b/package.json index 1bd159d5..582156f0 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "graphile-worker", - "version": "0.17.0-canary.67dbcb6", + "version": "0.17.0-canary.379fb2e", "type": "commonjs", "description": "Job queue for PostgreSQL", "main": "dist/index.js", diff --git a/src/version.ts b/src/version.ts index 76f7fe6f..4264f926 100644 --- a/src/version.ts +++ b/src/version.ts @@ -1,2 +1,2 @@ // This file is autogenerated by /scripts/postversion.mjs -export const version = "0.17.0-canary.67dbcb6"; +export const version = "0.17.0-canary.379fb2e"; From f044bfe2cba8f2cdc6e4d850b3adf37076addba0 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Thu, 14 Nov 2024 09:58:39 +0000 Subject: [PATCH 056/155] Update performance results --- perfTest/graphile.config.js | 19 ++-- perfTest/latencyTest.js | 5 +- perfTest/run.js | 13 +-- website/docs/performance.md | 173 +++++++++++++++++++++++++++++------- 4 files 
changed, 158 insertions(+), 52 deletions(-) diff --git a/perfTest/graphile.config.js b/perfTest/graphile.config.js index 62ee0b4c..76a20ee6 100644 --- a/perfTest/graphile.config.js +++ b/perfTest/graphile.config.js @@ -5,21 +5,28 @@ // import { WorkerProPreset } from "../graphile-pro-worker/dist/index.js"; +const CONCURRENT_JOBS = 24; + /** @type {GraphileConfig.Preset} */ const preset = { // extends: [WorkerProPreset], worker: { connectionString: process.env.PERF_DATABASE_URL || "postgres:///graphile_worker_perftest", - concurrentJobs: 3, fileExtensions: [".js", ".cjs", ".mjs"], // fileExtensions: [".js", ".cjs", ".mjs", ".ts", ".cts", ".mts"], gracefulShutdownAbortTimeout: 2500, - localQueue: { - size: -1, - }, - completeJobBatchDelay: -1, - failJobBatchDelay: -1, + + concurrentJobs: CONCURRENT_JOBS, + maxPoolSize: CONCURRENT_JOBS + 1, + + //localQueue: { size: -1 }, + //completeJobBatchDelay: -1, + //failJobBatchDelay: -1, + + localQueue: { size: 500, refetchDelay: { durationMs: 10 } }, + completeJobBatchDelay: 0, + failJobBatchDelay: 0, }, }; diff --git a/perfTest/latencyTest.js b/perfTest/latencyTest.js index 2dd5c685..5b1a37ef 100644 --- a/perfTest/latencyTest.js +++ b/perfTest/latencyTest.js @@ -9,10 +9,7 @@ const preset = require("./graphile.config.js"); const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms)); /** @type {import('../dist/index.js').WorkerPoolOptions} */ -const options = { - concurrency: 1, - preset, -}; +const options = { preset }; async function main() { const pgPool = new Pool({ connectionString: process.env.PERF_DATABASE_URL }); diff --git a/perfTest/run.js b/perfTest/run.js index 1903a89b..a94d353f 100755 --- a/perfTest/run.js +++ b/perfTest/run.js @@ -6,7 +6,6 @@ const exec = promisify(rawExec); const JOB_COUNT = 200000; const STUCK_JOB_COUNT = 0; const PARALLELISM = 4; -const CONCURRENCY = 24; const time = async (cb) => { const start = process.hrtime(); @@ -52,10 +51,7 @@ async function main() { console.log("Timing 
startup/shutdown time..."); let result; const startupTime = await time(async () => { - result = await exec( - `node ../dist/cli.js --once -j ${CONCURRENCY} -m ${CONCURRENCY + 1}`, - execOptions, - ); + result = await exec(`node ../dist/cli.js --once`, execOptions); }); logResult(result); console.log(); @@ -81,12 +77,7 @@ async function main() { const dur = await time(async () => { const promises = []; for (let i = 0; i < PARALLELISM; i++) { - promises.push( - exec( - `node ../dist/cli.js --once -j ${CONCURRENCY} -m ${CONCURRENCY + 1}`, - execOptions, - ), - ); + promises.push(exec(`node ../dist/cli.js --once`, execOptions)); } (await Promise.all(promises)).map(logResult); }); diff --git a/website/docs/performance.md b/website/docs/performance.md index e4f824db..8e8ed7f3 100644 --- a/website/docs/performance.md +++ b/website/docs/performance.md @@ -3,72 +3,183 @@ title: "Performance" sidebar_position: 120 --- +## Quick stats + +Quick stats in optimial conditions: + +- jobs executed per second: ~183,000 +- average latency from add_job to job execution start: 4.16ms (max: 13.84ms) +- jobs queued per second from single add_jobs batch call: ~202,000 +- time to start and immediately shut down the worker: 68ms + +The above stats were achieved with this configuration: + +```ts +const preset = { + worker: { + connectionString: "postgres:///graphile_worker_perftest", + fileExtensions: [".js", ".cjs", ".mjs"], + + concurrentJobs: 24, + maxPoolSize: 25, + + // Batching options (see below) + localQueue: { size: 500 }, + completeJobBatchDelay: 0, + failJobBatchDelay: 0, + }, +}; +``` + +## Performance statement + `graphile-worker` is not intended to replace extremely high performance -dedicated job queues, it's intended to be a very easy way to get a -reasonably performant job queue up and running with Node.js and PostgreSQL. 
But -this doesn't mean it's a slouch by any means — it achieves an -average latency from triggering a job in one process to executing it in another -of under 3ms, and a 12-core database server can queue around 99,600 jobs per -second and can process around 11,800 jobs per second. +dedicated job queues for Facebook scale, it's intended to give regular +organizations the fastest and easiest to set up job queue we can achieve without +needing to expand your infrastructure beyond Node.js and PostgreSQL. But this +doesn't mean it's a slouch by any means — it achieves an average +latency from triggering a job in one process to executing it in another of under +5ms, and a well-specced database server can queue around 172,000 jobs per second +from a single PostgreSQL client, and can process around 196k jobs per second +using a pool of 4 Graphile Worker instances, each with concurrency set to 24. +For many organizations, this is more than they'll ever need. + +## Horizontal scaling `graphile-worker` is horizontally scalable to a point. Each instance has a customizable worker pool, this pool defaults to size 1 (only one job at a time on this worker) but depending on the nature of your tasks (i.e. assuming they're not compute-heavy) you will likely want to set this higher to benefit from Node.js' concurrency. If your tasks are compute heavy you may -still wish to set it higher and then using Node's `child_process` (or Node -v11's `worker_threads`) to share the compute load over multiple cores -without significantly impacting the main worker's run loop. Note, however, -that Graphile Worker is limited by the performance of the underlying Postgres +still wish to set it higher and then using Node's `child_process` or +`worker_threads` to share the compute load over multiple cores without +significantly impacting the main worker's run loop. 
+ +## Enabling batching for highest performance + +Graphile Worker is limited by the performance of the underlying Postgres database, and when you hit this limit performance will start to go down (rather than up) as you add more workers. -To test performance, you can run `yarn perfTest`. This runs three tests: +To mitigate this, we've added batching functionality to many of the internal +methods which you can enable via the configuration. For example using a local +queue enables each pool to pull down a configurable number of jobs up front so +its workers can start a new job the moment their previous one completes without +having to request a new job from the database. This batching also reduces load +on the database since there are fewer total queries per second, but it's a +slight trade-off since more jobs are checked out but not necessarily actively +being worked on, so latency may increase and in the event of a crash more jobs +will be locked. + +## Running the performance tests + +To test performance, you can check out the repository and then run +`yarn perfTest`. This runs three tests: 1. a startup/shutdown test to see how fast the worker can startup and exit if there's no jobs queued (this includes connecting to the database and ensuring the migrations are up to date) -2. a load test — by default this will run 20,000 +2. a load test — by default this will run 200,000 [trivial](https://github.com/graphile/worker/blob/main/perfTest/tasks/log_if_999.js) - jobs with a parallelism of 4 (i.e. 4 node processes) and a concurrency of 10 - (i.e. 10 concurrent jobs running on each node process), but you can configure - this in `perfTest/run.js`. (These settings were optimized for a 12-core - hyper-threading machine running both the tests and the database locally.) + jobs with a parallelism of 4 (i.e. 4 node processes) and a concurrency of 24 + (i.e. 24 concurrent jobs running on each node process), but you can configure + this in `perfTest/run.js`. 
(These settings were optimized for a Intel + i9-14900K with efficiency cores disabled and running both the tests and the + database locally.) 3. a latency test — determining how long between issuing an `add_job` command and the task itself being executed. ## perfTest results: -The test was ran on a 12-core AMD Ryzen 3900 with an M.2 SSD, running both the -workers and the database (and a tonne of Chrome tabs, electron apps, and what -not). Jobs=20000, parallelism=4, concurrency=10. +Executed on +[this machine](https://uk.pcpartpicker.com/user/BenjieGillam/saved/#view=BjtCrH), +running both the workers and the database (and a tonne of Chrome tabs, electron +apps, and what not). + +### With batching + +**Jobs per second: ~184,000** -Conclusion: +```ts +const preset = { + worker: { + connectionString: "postgres:///graphile_worker_perftest", + fileExtensions: [".js", ".cjs", ".mjs"], -- Startup/shutdown: 110ms -- Jobs per second: 11,851 -- Average latency: 2.66ms (min: 2.39ms, max: 12.09ms) + concurrentJobs: 24, + maxPoolSize: 25, + + // Batching options (see below) + localQueue: { size: 500 }, + completeJobBatchDelay: 0, + failJobBatchDelay: 0, + }, +}; +``` ``` Timing startup/shutdown time... -... it took 110ms +... it took 68ms + +Scheduling 200000 jobs +Adding jobs: 988.425ms +... it took 1160ms + + +Timing 200000 job execution... +Found 999! + +... it took 1156ms +Jobs per second: 183895.49 + +Testing latency... +[core] INFO: Worker connected and looking for jobs... 
(task names: 'latency') +Beginning latency test +Latencies - min: 3.24ms, max: 18.18ms, avg: 4.28ms +``` + +### Without batching + +**Jobs per second: ~15,600** + +```ts +const preset = { + worker: { + connectionString: "postgres:///graphile_worker_perftest", + fileExtensions: [".js", ".cjs", ".mjs"], + + concurrentJobs: 24, + maxPoolSize: 25, + + // Batching disabled (default) + localQueue: { size: -1 }, + completeJobBatchDelay: -1, + failJobBatchDelay: -1, + }, +}; +``` + +``` +Timing startup/shutdown time... +... it took 77ms + -Scheduling 20000 jobs -Adding jobs: 200.84ms -... it took 287ms +Scheduling 200000 jobs +Adding jobs: 992.368ms +... it took 1163ms -Timing 20000 job execution... +Timing 200000 job execution... Found 999! -... it took 1797ms -Jobs per second: 11851.90 +... it took 12892ms +Jobs per second: 15606.79 Testing latency... [core] INFO: Worker connected and looking for jobs... (task names: 'latency') Beginning latency test -Latencies - min: 2.39ms, max: 12.09ms, avg: 2.66ms +Latencies - min: 3.40ms, max: 14.13ms, avg: 4.47ms ``` TODO: post perfTest results in a more reasonable configuration, e.g. 
using an From da79374855eced756a995e7778c5cb9cc0c73ed6 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Thu, 14 Nov 2024 12:48:58 +0000 Subject: [PATCH 057/155] Beginnings of tower defence --- package.json | 1 + towerDefence/README.md | 15 +++ towerDefence/crontab | 0 towerDefence/graphile.config.mjs | 33 ++++++ towerDefence/run.mjs | 182 +++++++++++++++++++++++++++++++ towerDefence/tasks/log_if_999.js | 5 + 6 files changed, 236 insertions(+) create mode 100644 towerDefence/README.md create mode 100644 towerDefence/crontab create mode 100644 towerDefence/graphile.config.mjs create mode 100755 towerDefence/run.mjs create mode 100644 towerDefence/tasks/log_if_999.js diff --git a/package.json b/package.json index 582156f0..c169e680 100644 --- a/package.json +++ b/package.json @@ -18,6 +18,7 @@ "depcheck": "depcheck --ignores='graphile-worker,faktory-worker,@google-cloud/tasks,bullmq,jest-environment-node,@docusaurus/*,@fortawesome/*,@mdx-js/*,@types/jest,clsx,eslint_d,graphile,juice,postcss-nested,prism-react-renderer,react,react-dom,svgo,ts-node,@types/debug,tslib'", "db:dump": "./scripts/dump_db", "perfTest": "cd perfTest && node ./run.js", + "towerDefence": "cd towerDefence && node ./run.mjs", "preversion": "grep '^### Pending' RELEASE_NOTES.md && echo \"⚠️ Cannot publish with 'Pending' in RELEASE_NOTES ⚠️\" && exit 1 || true", "version": "node scripts/postversion.mjs && git add src/version.ts", "website": "cd website && yarn run" diff --git a/towerDefence/README.md b/towerDefence/README.md new file mode 100644 index 00000000..f1919514 --- /dev/null +++ b/towerDefence/README.md @@ -0,0 +1,15 @@ +# Tower defence test + +With the advanced options like localQueue and refetchDelay, Graphile Worker gets +quite complex and testing it becomes a challenge. When there's enough work to go +around (as in `perfTest`), testing is easy and the system handles admirably. 
But +things become more complex when there's not enough work to go around: we still +want to execute jobs quickly, but we don't want all 10 Graphile Worker instances +sending a query to the DB each time a new job comes in. + +This folder mounts a "tower defence"-style attack against a cluster of Graphile +Worker instances; it's designed to a) make sure no bugs happen, and b) let us +monitor system metrics under various load conditions. We start with the setup +phase where we build our towers (Graphile Worker instances) and then we send +different "waves" of jobs at the towers to ensure everything continues to work +smoothly. diff --git a/towerDefence/crontab b/towerDefence/crontab new file mode 100644 index 00000000..e69de29b diff --git a/towerDefence/graphile.config.mjs b/towerDefence/graphile.config.mjs new file mode 100644 index 00000000..cd787e63 --- /dev/null +++ b/towerDefence/graphile.config.mjs @@ -0,0 +1,33 @@ +// @ts-check + +/** @typedef {import("../dist/index.js")} Worker */ +// import type {} from "../src/index.js"; + +// import { WorkerProPreset } from "../graphile-pro-worker/dist/index.js"; + +const CONCURRENT_JOBS = 10; + +/** @type {GraphileConfig.Preset} */ +const preset = { + // extends: [WorkerProPreset], + worker: { + connectionString: + process.env.PERF_DATABASE_URL || "postgres:///graphile_worker_perftest", + fileExtensions: [".js", ".cjs", ".mjs"], + // fileExtensions: [".js", ".cjs", ".mjs", ".ts", ".cts", ".mts"], + gracefulShutdownAbortTimeout: 2500, + + concurrentJobs: CONCURRENT_JOBS, + maxPoolSize: CONCURRENT_JOBS + 1, + + //localQueue: { size: -1 }, + //completeJobBatchDelay: -1, + //failJobBatchDelay: -1, + + localQueue: { size: 500, refetchDelay: { durationMs: 10 } }, + completeJobBatchDelay: 0, + failJobBatchDelay: 0, + }, +}; + +export default preset; diff --git a/towerDefence/run.mjs b/towerDefence/run.mjs new file mode 100755 index 00000000..7f8c3045 --- /dev/null +++ b/towerDefence/run.mjs @@ -0,0 +1,182 @@ +#!/usr/bin/env node 
+// @ts-check +import { execSync, spawn } from "child_process"; +import pg from "pg"; +import { promisify } from "util"; + +const STUCK_JOB_COUNT = 0; +const PARALLELISM = 10; +const WAVES = [makeWave([1])]; + +const taskIdentifier = "log_if_999"; + +const __dirname = new URL(".", import.meta.url).pathname; + +// run in this script's parent directory +process.chdir(__dirname); + +process.env.NO_LOG_SUCCESS = "1"; + +// if connection string not provided, assume postgres is available locally +process.env.PERF_DATABASE_URL ??= `${ + process.env.TEST_CONNECTION_STRING || "postgres:///graphile_worker_perftest" +}`; + +const env = { + ...process.env, + DATABASE_URL: process.env.PERF_DATABASE_URL, +}; + +/** @type {import("child_process").CommonExecOptions} */ +const execOptions = { + env, + stdio: ["ignore", "ignore", "inherit"], +}; + +/** @type {import("child_process").CommonSpawnOptions} */ +const spawnOptions = { + env, + stdio: ["ignore", "inherit", "inherit"], +}; + +const pgPool = new pg.Pool({ connectionString: process.env.PERF_DATABASE_URL }); + +const GENERAL_JOBS_PER_SECOND = 15000; +const GENERAL_JOBS_PER_MILLISECOND = GENERAL_JOBS_PER_SECOND / 1000; + +/** @type {(jobBatches: number[]) => () => Promise} */ +function makeWave(jobBatches) { + return async () => { + let totalCount = 0; + for (let i = 0; i < jobBatches.length; i++) { + const jobCount = jobBatches[i]; + const jobs = []; + for (let i = 0; i < jobCount; i++) { + totalCount++; + jobs.push( + `("${taskIdentifier.replace( + /["\\]/g, + "\\$&", + )}","{\\"id\\":${i}}",,,,,,)`, + ); + } + const jobsString = `{"${jobs + .map((j) => j.replace(/["\\]/g, "\\$&")) + .join('","')}"}`; + await pgPool.query( + `select graphile_worker.add_jobs($1::graphile_worker.job_spec[]);`, + [jobsString], + ); + } + + // Give roughly enough time for the jobs to complete + await sleep(totalCount / GENERAL_JOBS_PER_MILLISECOND); + + // And then wait a bit longer + await sleep(10000); + + // And check the jobs table is empty + 
const { + rows: [{ count }], + } = await pgPool.query( + `select count(*) from graphile_worker.jobs where task_identifier <> 'stuck';`, + ); + if (count !== "0") { + throw new Error(`Expected 0 jobs, got ${count}`); + } + }; +} + +/** @type {(ms: number) => Promise} */ +const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms)); + +/** @type {(cb: () => any) => Promise} */ +const time = async (cb) => { + const start = process.hrtime(); + await cb(); + const diff = process.hrtime(start); + const dur = diff[0] * 1e3 + diff[1] * 1e-6; + console.log(`... it took ${dur.toFixed(0)}ms`); + return dur; +}; + +async function main() { + console.log("Building"); + execSync("yarn prepack", execOptions); + + console.log("Dropping and recreating the test database"); + execSync("node ../perfTest/recreateDb.js", execOptions); + + console.log("Installing the schema"); + execSync("node ../dist/cli.js --schema-only", execOptions); + + if (STUCK_JOB_COUNT > 0) { + console.log(`Scheduling ${STUCK_JOB_COUNT} stuck jobs`); + await time(() => { + execSync( + `node ../perfTest/init.js ${STUCK_JOB_COUNT} stuck`, + execOptions, + ); + }); + } + + console.log(); + console.log(); + console.log(`Spawning ${PARALLELISM} workers...`); + /** @type {import("child_process").PromiseWithChild[]} */ + const workerPromises = []; + for (let i = 0; i < PARALLELISM; i++) { + const child = spawn(`node`, [`../dist/cli.js`], spawnOptions); + const promise = Object.assign( + new Promise((resolve, reject) => { + child.on("error", reject); + child.on("exit", resolve); + }), + { child }, + ); + workerPromises.push(promise); + } + + const allDone = Promise.all(workerPromises).then( + () => { + console.log("All workers exited cleanly"); + }, + (e) => { + /** @type {import("child_process").ExecException} */ + const err = e; + if (err.signal === "SIGTERM") { + // all good; we terminated it + } else { + console.dir(err); + process.exit(1); + } + }, + ); + + await sleep(2000); + console.log("The wait is 
over... starting the attack"); + console.log(); + console.log(); + + for (let waveNumber = 0; waveNumber < WAVES.length; waveNumber++) { + const wave = WAVES[waveNumber]; + console.log(`Wave ${waveNumber + 1}...`); + await wave(); + console.log(); + console.log(); + } + + console.log("Waves complete; waiting for workers to finish"); + for (const { child } of workerPromises) { + child.kill("SIGTERM"); + } + + await allDone; + + console.log("Exiting"); +} + +main().catch((e) => { + console.error(e); + process.exit(1); +}); diff --git a/towerDefence/tasks/log_if_999.js b/towerDefence/tasks/log_if_999.js new file mode 100644 index 00000000..a1bc70ec --- /dev/null +++ b/towerDefence/tasks/log_if_999.js @@ -0,0 +1,5 @@ +module.exports = ({ id }) => { + if (id === 999) { + console.log("Found 999!"); + } +}; From 4b8462c60f57f1c41245e5faf59c51553f0ef1a0 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Thu, 14 Nov 2024 12:55:22 +0000 Subject: [PATCH 058/155] More waves and don't wait between waves --- towerDefence/run.mjs | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/towerDefence/run.mjs b/towerDefence/run.mjs index 7f8c3045..01234e15 100755 --- a/towerDefence/run.mjs +++ b/towerDefence/run.mjs @@ -6,7 +6,12 @@ import { promisify } from "util"; const STUCK_JOB_COUNT = 0; const PARALLELISM = 10; -const WAVES = [makeWave([1])]; +const WAVES = [ + makeWave([1]), + makeWave(new Array(1000).fill(1)), + makeWave(new Array(1000).fill(4)), + makeWave(new Array(1000).fill(20)), +]; const taskIdentifier = "log_if_999"; @@ -72,17 +77,22 @@ function makeWave(jobBatches) { // Give roughly enough time for the jobs to complete await sleep(totalCount / GENERAL_JOBS_PER_MILLISECOND); - // And then wait a bit longer - await sleep(10000); - - // And check the jobs table is empty - const { - rows: [{ count }], - } = await pgPool.query( - `select count(*) from graphile_worker.jobs where task_identifier <> 'stuck';`, - ); - if (count !== 
"0") { - throw new Error(`Expected 0 jobs, got ${count}`); + // And wait for the jobs table to be empty + const MAX_ATTEMPTS = 20; + for (let attempts = 0; attempts < MAX_ATTEMPTS; attempts++) { + const { + rows: [{ count }], + } = await pgPool.query( + `select count(*) from graphile_worker.jobs where task_identifier <> 'stuck';`, + ); + if (count === "0") { + break; + } + if (attempts === MAX_ATTEMPTS - 1) { + throw new Error(`Expected 0 jobs, got ${count}`); + } else { + await sleep(50 * (attempts + 1) ** 1.5); + } } }; } From 6c7d92d66ba3b3fb8ace824db30bf440e67fda94 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Thu, 14 Nov 2024 15:06:28 +0000 Subject: [PATCH 059/155] Add some events so we can collect stats from tower defence --- src/interfaces.ts | 78 ++++++++++++++++++++++ src/localQueue.ts | 89 ++++++++++++++++++------- src/sql/{returnJob.ts => returnJobs.ts} | 2 +- towerDefence/graphile.config.mjs | 58 ++++++++++++++++ towerDefence/run.mjs | 6 +- 5 files changed, 205 insertions(+), 28 deletions(-) rename src/sql/{returnJob.ts => returnJobs.ts} (98%) diff --git a/src/interfaces.ts b/src/interfaces.ts index 41c515eb..808dbac9 100644 --- a/src/interfaces.ts +++ b/src/interfaces.ts @@ -15,6 +15,7 @@ import type { Release, ResolvedWorkerPreset, } from "./lib"; +import { LocalQueue } from "./localQueue"; import type { Logger } from "./logger"; import type { Signal } from "./signals"; @@ -1020,6 +1021,73 @@ export type WorkerEventMap = { workerPool: WorkerPool; }; + /** + * When a local queue is created + */ + "localQueue:init": { + localQueue: LocalQueue; + }; + + /** + * When a local queue enters 'polling' mode + */ + "localQueue:setMode": { + localQueue: LocalQueue; + oldMode: LocalQueueMode; + newMode: Exclude; + }; + + /** + * Too few jobs were fetched from the DB, so the local queue is going to + * sleep. 
+   */
+  "localQueue:refetchDelay:start": {
+    localQueue: LocalQueue;
+    /** The number of jobs that were fetched */
+    jobCount: number;
+    /** We needed this number or fewer jobs to trigger */
+    threshold: number;
+    /** How long we should delay for */
+    delayMs: number;
+    /** If we receive this number of nudges, we will abort the delay */
+    abortThreshold: number;
+  };
+
+  /**
+   * Too many nudges happened whilst the local queue was asleep, and it has
+   * been awoken early to deal with the rush!
+   */
+  "localQueue:refetchDelay:abort": {
+    localQueue: LocalQueue;
+    /** How many nudges did we receive during the delay */
+    count: number;
+    /** How many nudges did we need to receive for the abort */
+    abortThreshold: number;
+  };
+
+  /**
+   * The refetchDelay terminated normally.
+   */
+  "localQueue:refetchDelay:expired": {
+    localQueue: LocalQueue;
+  };
+
+  /**
+   * The local queue has completed a fetch of jobs from the database.
+   */
+  "localQueue:getJobs:complete": {
+    localQueue: LocalQueue;
+    jobs: Job[];
+  };
+
+  /**
+   * Jobs in the local queue are being returned to the database.
+ */ + "localQueue:returnJobs": { + localQueue: LocalQueue; + jobs: Job[]; + }; + /** * When a worker is created */ @@ -1258,3 +1326,13 @@ export type FailJobFunction = (spec: { message: string; replacementPayload: undefined | unknown[]; }) => void; + +export const LocalQueueModes = { + STARTING: "STARTING", + POLLING: "POLLING", + WAITING: "WAITING", + TTL_EXPIRED: "TTL_EXPIRED", + RELEASED: "RELEASED", +} as const; + +export type LocalQueueMode = keyof typeof LocalQueueModes; diff --git a/src/localQueue.ts b/src/localQueue.ts index a284a294..02d68a56 100644 --- a/src/localQueue.ts +++ b/src/localQueue.ts @@ -3,19 +3,17 @@ import assert from "assert"; import { CompiledSharedOptions, EnhancedWithPgClient, + LocalQueueMode, + LocalQueueModes, WorkerPoolOptions, } from "."; import { MINUTE, SECOND } from "./cronConstants"; import defer, { Deferred } from "./deferred"; import { GetJobFunction, Job, TaskList, WorkerPool } from "./interfaces"; import { getJob as baseGetJob } from "./sql/getJob"; -import { returnJob } from "./sql/returnJob"; +import { returnJobs } from "./sql/returnJobs"; -const STARTING = "STARTING"; -const POLLING = "POLLING"; -const WAITING = "WAITING"; -const TTL_EXPIRED = "TTL_EXPIRED"; -const RELEASED = "RELEASED"; +const { STARTING, POLLING, WAITING, TTL_EXPIRED, RELEASED } = LocalQueueModes; /** * The local queue exists to reduce strain on the database; it works by @@ -109,12 +107,7 @@ export class LocalQueue { // Set true to fetch immediately after a fetch completes; typically only used // when the queue is pulsed during a fetch. 
fetchAgain = false; - mode: - | typeof STARTING - | typeof POLLING - | typeof WAITING - | typeof TTL_EXPIRED - | typeof RELEASED = STARTING; + public readonly mode: LocalQueueMode = STARTING; private promise = defer(); private backgroundCount = 0; @@ -129,7 +122,7 @@ export class LocalQueue { private readonly compiledSharedOptions: CompiledSharedOptions, private readonly tasks: TaskList, private readonly withPgClient: EnhancedWithPgClient, - private readonly workerPool: WorkerPool, + public readonly workerPool: WorkerPool, private readonly getJobBatchSize: number, private readonly continuous: boolean, ) { @@ -148,9 +141,25 @@ export class LocalQueue { `Invalid configuration; 'preset.worker.localQueue.refetchDelay.durationMs' (${localQueueRefetchDelayDuration}) must not be larger than 'preset.worker.pollInterval' (${this.pollInterval})`, ); } + compiledSharedOptions.events.emit("localQueue:init", { + localQueue: this, + }); this.setModePolling(); } + private setMode( + newMode: Exclude, + ) { + const oldMode = this.mode; + // Override the 'readonly' + (this.mode as LocalQueueMode) = newMode; + this.compiledSharedOptions.events.emit("localQueue:setMode", { + localQueue: this, + oldMode, + newMode, + }); + } + private decreaseBackgroundCount = () => { this.backgroundCount--; if (this.mode === "RELEASED" && this.backgroundCount === 0) { @@ -193,7 +202,7 @@ export class LocalQueue { this.ttlExpiredTimer = null; } - this.mode = POLLING; + this.setMode(POLLING); this.fetch(); } @@ -219,7 +228,7 @@ export class LocalQueue { clearTimeout(this.ttlExpiredTimer); } - this.mode = WAITING; + this.setMode(WAITING); this.ttlExpiredTimer = setTimeout(() => { this.setModeTtlExpired(); @@ -248,7 +257,7 @@ export class LocalQueue { this.ttlExpiredTimer = null; } - this.mode = TTL_EXPIRED; + this.setMode(TTL_EXPIRED); // Return jobs to the pool this.returnJobs(); @@ -256,8 +265,12 @@ export class LocalQueue { private returnJobs() { const jobsToReturn = this.jobQueue.splice(0, 
this.jobQueue.length); + this.compiledSharedOptions.events.emit("localQueue:returnJobs", { + localQueue: this, + jobs: jobsToReturn, + }); this.background( - returnJob( + returnJobs( this.compiledSharedOptions, this.withPgClient, this.workerPool.id, @@ -308,6 +321,7 @@ export class LocalQueue { * Initialized to `true` so on error we don't enable refetch delay. */ let refetchDelayThresholdSurpassed = true; + let jobCount = 0; const refetchDelayOptions = this.compiledSharedOptions.resolvedPreset.worker.localQueue?.refetchDelay; try { @@ -344,12 +358,17 @@ export class LocalQueue { this.getJobBatchSize, ); + this.compiledSharedOptions.events.emit("localQueue:getJobs:complete", { + localQueue: this, + jobs, + }); + assert.equal( this.jobQueue.length, 0, "Should not fetch when job queue isn't empty (recheck)", ); - const jobCount = jobs.length; + jobCount = jobs.length; fetchedMax = jobCount >= this.getJobBatchSize; refetchDelayThresholdSurpassed = // If we've fetched the maximum, we've met the requirement @@ -388,10 +407,10 @@ export class LocalQueue { return; } + /** How long to avoid any refetches for */ + const refetchDelayMs = + (0.5 + Math.random()) * (refetchDelayOptions?.durationMs ?? 100); if (!refetchDelayThresholdSurpassed) { - /** How long to avoid any refetches for */ - const refetchDelayMs = - (0.5 + Math.random()) * (refetchDelayOptions?.durationMs ?? 100); /** How many notifications do we need to receive before we abort the "no refetches" behavior? */ const abortThreshold = (0.5 + Math.random()) * @@ -410,6 +429,13 @@ export class LocalQueue { this.refetchDelayCompleteOrAbort, refetchDelayMs, ); + this.compiledSharedOptions.events.emit("localQueue:refetchDelay:start", { + localQueue: this, + jobCount, + threshold: refetchDelayOptions?.threshold ?? 
0, + delayMs: refetchDelayMs, + abortThreshold: this.refetchDelayAbortThreshold, + }); } if (this.jobQueue.length > 0) { @@ -437,12 +463,27 @@ export class LocalQueue { this.handleCheckRefetchDelayAbortThreshold(); } - private refetchDelayCompleteOrAbort = (): void => { + private refetchDelayCompleteOrAbort = (aborted = false): void => { if (this.refetchDelayTimer != null) { clearTimeout(this.refetchDelayTimer); this.refetchDelayTimer = null; } this.refetchDelayActive = false; + if (aborted) { + this.compiledSharedOptions.events.emit("localQueue:refetchDelay:abort", { + localQueue: this, + count: this.refetchDelayCounter, + abortThreshold: this.refetchDelayAbortThreshold, + }); + } else { + this.compiledSharedOptions.events.emit( + "localQueue:refetchDelay:expired", + { + localQueue: this, + }, + ); + } + if (this.mode === POLLING && this.refetchDelayFetchOnComplete) { // Cancel poll, do now if (this.fetchTimer) { @@ -463,7 +504,7 @@ export class LocalQueue { } if (this.refetchDelayCounter >= this.refetchDelayAbortThreshold) { this.refetchDelayFetchOnComplete = true; - this.refetchDelayCompleteOrAbort(); + this.refetchDelayCompleteOrAbort(true); } return true; } @@ -538,7 +579,7 @@ export class LocalQueue { ); const oldMode = this.mode; - this.mode = RELEASED; + this.setMode(RELEASED); if (this.refetchDelayTimer != null) { clearTimeout(this.refetchDelayTimer); diff --git a/src/sql/returnJob.ts b/src/sql/returnJobs.ts similarity index 98% rename from src/sql/returnJob.ts rename to src/sql/returnJobs.ts index f7b22f4f..7b0e36f8 100644 --- a/src/sql/returnJob.ts +++ b/src/sql/returnJobs.ts @@ -1,7 +1,7 @@ import { DbJob, EnhancedWithPgClient } from "../interfaces"; import { CompiledSharedOptions } from "../lib"; -export async function returnJob( +export async function returnJobs( compiledSharedOptions: CompiledSharedOptions, withPgClient: EnhancedWithPgClient, poolId: string, diff --git a/towerDefence/graphile.config.mjs b/towerDefence/graphile.config.mjs index 
cd787e63..5ccce705 100644 --- a/towerDefence/graphile.config.mjs +++ b/towerDefence/graphile.config.mjs @@ -7,6 +7,63 @@ const CONCURRENT_JOBS = 10; +const stats = { + jobsFetched: 0, + jobsReturned: 0, + timeInMode: {}, + timeInRefetchDelay: 0n, + refetchDelays: 0, + refetchDelayAborted: 0, +}; + +let lastModeStart = process.hrtime.bigint(); +let refetchDelayStart = process.hrtime.bigint(); + +/** @type {GraphileConfig.Plugin} */ +const TowerDefenceResultPlugin = { + name: "TowerDefenceResultPlugin", + version: "0.0.0", + worker: { + hooks: { + init(ctx) { + ctx.events.on("pool:release", (event) => { + console.log(`Pool ${event.workerPool.id} released`); + console.dir(stats); + }); + ctx.events.on("localQueue:getJobs:complete", ({ jobs }) => { + stats.jobsFetched += jobs.length; + }); + ctx.events.on("localQueue:returnJobs", ({ jobs }) => { + stats.jobsReturned += jobs.length; + }); + ctx.events.on("localQueue:init", () => { + lastModeStart = process.hrtime.bigint(); + }); + ctx.events.on("localQueue:setMode", ({ oldMode, newMode }) => { + const now = process.hrtime.bigint(); + const diff = now - lastModeStart; + lastModeStart = now; + stats.timeInMode[oldMode] ??= 0n; + stats.timeInMode[oldMode] += diff; + }); + ctx.events.on("localQueue:refetchDelay:start", () => { + stats.refetchDelays += 1; + refetchDelayStart = process.hrtime.bigint(); + }); + ctx.events.on("localQueue:refetchDelay:abort", () => { + stats.refetchDelaysAborted += 1; + const elapsed = process.hrtime.bigint() - refetchDelayStart; + stats.timeInRefetchDelay += elapsed; + }); + ctx.events.on("localQueue:refetchDelay:expired", () => { + const elapsed = process.hrtime.bigint() - refetchDelayStart; + stats.timeInRefetchDelay += elapsed; + }); + }, + }, + }, +}; + /** @type {GraphileConfig.Preset} */ const preset = { // extends: [WorkerProPreset], @@ -28,6 +85,7 @@ const preset = { completeJobBatchDelay: 0, failJobBatchDelay: 0, }, + plugins: [TowerDefenceResultPlugin], }; export default preset; diff 
--git a/towerDefence/run.mjs b/towerDefence/run.mjs index 01234e15..9894dd53 100755 --- a/towerDefence/run.mjs +++ b/towerDefence/run.mjs @@ -8,9 +8,9 @@ const STUCK_JOB_COUNT = 0; const PARALLELISM = 10; const WAVES = [ makeWave([1]), - makeWave(new Array(1000).fill(1)), - makeWave(new Array(1000).fill(4)), - makeWave(new Array(1000).fill(20)), + makeWave(new Array(10000).fill(1)), + makeWave(new Array(10000).fill(4)), + makeWave(new Array(10000).fill(20)), ]; const taskIdentifier = "log_if_999"; From 24d51c178a9cd7b22c6b176ce0dd85e5aac4fc6b Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Thu, 14 Nov 2024 15:24:57 +0000 Subject: [PATCH 060/155] Easier to read results --- towerDefence/graphile.config.mjs | 35 ++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/towerDefence/graphile.config.mjs b/towerDefence/graphile.config.mjs index 5ccce705..8f02b790 100644 --- a/towerDefence/graphile.config.mjs +++ b/towerDefence/graphile.config.mjs @@ -10,7 +10,7 @@ const CONCURRENT_JOBS = 10; const stats = { jobsFetched: 0, jobsReturned: 0, - timeInMode: {}, + timeInMode: Object.create(null), timeInRefetchDelay: 0n, refetchDelays: 0, refetchDelayAborted: 0, @@ -19,6 +19,23 @@ const stats = { let lastModeStart = process.hrtime.bigint(); let refetchDelayStart = process.hrtime.bigint(); +/** @type {(value: number | string, width?: number, char?: string) => string} */ +const p = (v, w = 10, s = " ") => String(v).padStart(w, s); + +/** @type {(t: bigint) => string} */ +const ms = (t) => { + return `${(Number(t) / 1e6).toFixed(2)}ms`; +}; + +/** @type {() => string} */ +const tim = () => { + let results = []; + for (const m in stats.timeInMode) { + results.push(p(`${p(m)}=${ms(stats.timeInMode[m])}`, 19)); + } + return results.join(","); +}; + /** @type {GraphileConfig.Plugin} */ const TowerDefenceResultPlugin = { name: "TowerDefenceResultPlugin", @@ -27,8 +44,18 @@ const TowerDefenceResultPlugin = { hooks: { init(ctx) { 
ctx.events.on("pool:release", (event) => { - console.log(`Pool ${event.workerPool.id} released`); - console.dir(stats); + console.log( + `\nPool ${event.workerPool.id} released\nFetched=${p( + stats.jobsFetched, + 6, + )}|Returned=${p(stats.jobsReturned, 6)}|TotalDelay=${p( + ms(stats.timeInRefetchDelay), + 11, + )}(Aborted=${p( + `${stats.refetchDelayAborted}/${stats.refetchDelays}`, + 9, + )})|${tim()}\n`, + ); }); ctx.events.on("localQueue:getJobs:complete", ({ jobs }) => { stats.jobsFetched += jobs.length; @@ -81,7 +108,7 @@ const preset = { //completeJobBatchDelay: -1, //failJobBatchDelay: -1, - localQueue: { size: 500, refetchDelay: { durationMs: 10 } }, + localQueue: { size: 500, refetchDelay: { durationMs: 100 } }, completeJobBatchDelay: 0, failJobBatchDelay: 0, }, From e295c72ddf492dbf95740d9ef6d409daf3d72f7d Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Thu, 14 Nov 2024 15:46:58 +0000 Subject: [PATCH 061/155] Extreme values demonstrate problem --- towerDefence/graphile.config.mjs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/towerDefence/graphile.config.mjs b/towerDefence/graphile.config.mjs index 8f02b790..b00c03b0 100644 --- a/towerDefence/graphile.config.mjs +++ b/towerDefence/graphile.config.mjs @@ -13,7 +13,7 @@ const stats = { timeInMode: Object.create(null), timeInRefetchDelay: 0n, refetchDelays: 0, - refetchDelayAborted: 0, + refetchDelaysAborted: 0, }; let lastModeStart = process.hrtime.bigint(); @@ -52,7 +52,7 @@ const TowerDefenceResultPlugin = { ms(stats.timeInRefetchDelay), 11, )}(Aborted=${p( - `${stats.refetchDelayAborted}/${stats.refetchDelays}`, + `${stats.refetchDelaysAborted}/${stats.refetchDelays}`, 9, )})|${tim()}\n`, ); @@ -108,7 +108,8 @@ const preset = { //completeJobBatchDelay: -1, //failJobBatchDelay: -1, - localQueue: { size: 500, refetchDelay: { durationMs: 100 } }, + pollInterval: 15000, + localQueue: { size: CONCURRENT_JOBS, refetchDelay: { durationMs: 15000 } }, completeJobBatchDelay: 0, 
failJobBatchDelay: 0, }, From 5749bed1939836899cc226e956f4da055bbcb819 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Thu, 14 Nov 2024 17:14:52 +0000 Subject: [PATCH 062/155] Produce the problematic behavior I was worried about --- towerDefence/graphile.config.mjs | 21 +++++++++++++-------- towerDefence/run.mjs | 16 +++++++++------- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/towerDefence/graphile.config.mjs b/towerDefence/graphile.config.mjs index b00c03b0..00aaf06d 100644 --- a/towerDefence/graphile.config.mjs +++ b/towerDefence/graphile.config.mjs @@ -8,6 +8,7 @@ const CONCURRENT_JOBS = 10; const stats = { + fetches: 0, jobsFetched: 0, jobsReturned: 0, timeInMode: Object.create(null), @@ -45,19 +46,20 @@ const TowerDefenceResultPlugin = { init(ctx) { ctx.events.on("pool:release", (event) => { console.log( - `\nPool ${event.workerPool.id} released\nFetched=${p( - stats.jobsFetched, + `\nPool ${event.workerPool.id} released\nFetches=${p( + stats.fetches, + 5, + )}|Fetched=${p(stats.jobsFetched, 6)}|Returned=${p( + stats.jobsReturned, 6, - )}|Returned=${p(stats.jobsReturned, 6)}|TotalDelay=${p( - ms(stats.timeInRefetchDelay), - 11, - )}(Aborted=${p( + )}|TotalDelay=${p(ms(stats.timeInRefetchDelay), 11)}(Aborted=${p( `${stats.refetchDelaysAborted}/${stats.refetchDelays}`, 9, )})|${tim()}\n`, ); }); ctx.events.on("localQueue:getJobs:complete", ({ jobs }) => { + stats.fetches += 1; stats.jobsFetched += jobs.length; }); ctx.events.on("localQueue:returnJobs", ({ jobs }) => { @@ -108,8 +110,11 @@ const preset = { //completeJobBatchDelay: -1, //failJobBatchDelay: -1, - pollInterval: 15000, - localQueue: { size: CONCURRENT_JOBS, refetchDelay: { durationMs: 15000 } }, + pollInterval: 2000, + localQueue: { + size: CONCURRENT_JOBS, + refetchDelay: { durationMs: 1000, abortThreshold: CONCURRENT_JOBS * 10 }, + }, completeJobBatchDelay: 0, failJobBatchDelay: 0, }, diff --git a/towerDefence/run.mjs b/towerDefence/run.mjs index 9894dd53..fed3d3f9 100755 --- 
a/towerDefence/run.mjs +++ b/towerDefence/run.mjs @@ -2,15 +2,14 @@ // @ts-check import { execSync, spawn } from "child_process"; import pg from "pg"; -import { promisify } from "util"; const STUCK_JOB_COUNT = 0; const PARALLELISM = 10; const WAVES = [ - makeWave([1]), - makeWave(new Array(10000).fill(1)), - makeWave(new Array(10000).fill(4)), - makeWave(new Array(10000).fill(20)), + //makeWave([1]), + makeWave(new Array(10000).fill(1), 10), + //makeWave(new Array(10000).fill(4)), + //makeWave(new Array(10000).fill(20)), ]; const taskIdentifier = "log_if_999"; @@ -49,8 +48,8 @@ const pgPool = new pg.Pool({ connectionString: process.env.PERF_DATABASE_URL }); const GENERAL_JOBS_PER_SECOND = 15000; const GENERAL_JOBS_PER_MILLISECOND = GENERAL_JOBS_PER_SECOND / 1000; -/** @type {(jobBatches: number[]) => () => Promise} */ -function makeWave(jobBatches) { +/** @type {(jobBatches: number[], sleepDuration?: number) => () => Promise} */ +function makeWave(jobBatches, sleepDuration = -1) { return async () => { let totalCount = 0; for (let i = 0; i < jobBatches.length; i++) { @@ -72,6 +71,9 @@ function makeWave(jobBatches) { `select graphile_worker.add_jobs($1::graphile_worker.job_spec[]);`, [jobsString], ); + if (sleepDuration >= 0) { + await sleep(sleepDuration); + } } // Give roughly enough time for the jobs to complete From 993cf449af6bce332236f32c45b71c0b59d409de Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Thu, 14 Nov 2024 17:22:28 +0000 Subject: [PATCH 063/155] Clarify doc --- src/index.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/index.ts b/src/index.ts index 448e150a..69bb893d 100644 --- a/src/index.ts +++ b/src/index.ts @@ -175,8 +175,8 @@ declare global { size: number; /** - * How long should jobs sit in the local queue before they are returned - * to the database? Defaults to 5 minutes. + * How long (in milliseconds) should jobs sit in the local queue before + * they are returned to the database? Defaults to 5 minutes. 
* * @default `300000` */ From c0d7e9ba527bd649d1431eedc2f4c2e534420a91 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 15 Nov 2024 10:01:31 +0000 Subject: [PATCH 064/155] Rename abortThreshold to maxAbortThreshold and update docs and implementation --- src/index.ts | 30 ++++++++++++++++++++++++++---- src/localQueue.ts | 16 ++++++++++------ towerDefence/graphile.config.mjs | 5 ++++- 3 files changed, 40 insertions(+), 11 deletions(-) diff --git a/src/index.ts b/src/index.ts index 69bb893d..2b5346ce 100644 --- a/src/index.ts +++ b/src/index.ts @@ -207,6 +207,7 @@ declare global { * are insufficient jobs in the database to keep the local queue full. */ durationMs: number; + /** * How many jobs should a fetch return to trigger the refetchDelay? * Must be less than the local queue size @@ -214,14 +215,35 @@ declare global { * @default {0} */ threshold?: number; + /** - * How many new jobs, on average, can the pool that's in idle fetch - * delay be notified of before it aborts the refetch delay and fetches - * anyway + * How many new jobs can a pool that's in refetch delay be notified + * of before it must abort the refetch delay and fetch anyway. + * + * Note that because you may have many different workers in refetch + * delay we take a random number up to this threshold, this means + * that different workers will abort refetch delay at different times + * which a) helps avoid the thundering herd problem, and b) helps to + * reduce the latency of executing a new job when all workers are in + * refetch delay. + * + * We don't know the best value for this, it likely will change based + * on a large number of factors. If you're not sure what to set it + * to, we recommend you start by taking `localQueue.size` and + * multiplying it by the number of Graphile Worker instances you're + * running (ignoring their `concurrency` settings). Then iterate + * based on the behaviors you observe. And report back to us - we'd + * love to hear about what works and what doesn't! 
+ * + * To force the full refetch delay to always apply, set this to + * `Infinity` since `Math.random() * Infinity = Infinity` (except in + * the case that Math.random() is zero, but that's only got a 1 in + * 2^53 chance of happening so you're probably fine, right? Don't + * worry, we handle this.) * * @default {5 * localQueue.size} */ - abortThreshold?: number; + maxAbortThreshold?: number; }; }; diff --git a/src/localQueue.ts b/src/localQueue.ts index 02d68a56..30d685d1 100644 --- a/src/localQueue.ts +++ b/src/localQueue.ts @@ -411,13 +411,17 @@ export class LocalQueue { const refetchDelayMs = (0.5 + Math.random()) * (refetchDelayOptions?.durationMs ?? 100); if (!refetchDelayThresholdSurpassed) { - /** How many notifications do we need to receive before we abort the "no refetches" behavior? */ + /** The configured abort threshold */ + const maxAbortThreshold = + refetchDelayOptions?.maxAbortThreshold ?? 5 * this.getJobBatchSize; + /** + * How many notifications do we need to receive before we abort the "no + * refetches" behavior? Note: this is not + */ const abortThreshold = - (0.5 + Math.random()) * - Math.min( - refetchDelayOptions?.abortThreshold ?? 
Infinity, - 5 * this.getJobBatchSize, - ); + // `|| Infinity` because if `maxAbortThreshold = Infinity` and + // `Math.random() = 0` then we'd get `NaN` (`0 * Infinity = NaN`) + Math.random() * maxAbortThreshold || Infinity; this.fetchAgain = false; this.refetchDelayActive = true; diff --git a/towerDefence/graphile.config.mjs b/towerDefence/graphile.config.mjs index 00aaf06d..ba97edae 100644 --- a/towerDefence/graphile.config.mjs +++ b/towerDefence/graphile.config.mjs @@ -113,7 +113,10 @@ const preset = { pollInterval: 2000, localQueue: { size: CONCURRENT_JOBS, - refetchDelay: { durationMs: 1000, abortThreshold: CONCURRENT_JOBS * 10 }, + refetchDelay: { + durationMs: 1000, + maxAbortThreshold: CONCURRENT_JOBS * 10, + }, }, completeJobBatchDelay: 0, failJobBatchDelay: 0, From 6b9179cfe379e8360cb4d3a6f4254bd1b99405ed Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 15 Nov 2024 10:08:10 +0000 Subject: [PATCH 065/155] More waves --- towerDefence/run.mjs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/towerDefence/run.mjs b/towerDefence/run.mjs index fed3d3f9..60aca6b1 100755 --- a/towerDefence/run.mjs +++ b/towerDefence/run.mjs @@ -6,10 +6,13 @@ import pg from "pg"; const STUCK_JOB_COUNT = 0; const PARALLELISM = 10; const WAVES = [ - //makeWave([1]), - makeWave(new Array(10000).fill(1), 10), - //makeWave(new Array(10000).fill(4)), - //makeWave(new Array(10000).fill(20)), + makeWave([1]), + makeWave(new Array(1000).fill(1), 10), + makeWave(new Array(1000).fill(1), 5), + makeWave(new Array(10000).fill(1), 1), + makeWave(new Array(10000).fill(1)), + makeWave(new Array(10000).fill(4)), + makeWave(new Array(10000).fill(200)), ]; const taskIdentifier = "log_if_999"; From 5126d39d9b66690c26748a92eb01eabeac4e4935 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 15 Nov 2024 10:09:26 +0000 Subject: [PATCH 066/155] Shorter waves --- towerDefence/run.mjs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/towerDefence/run.mjs b/towerDefence/run.mjs index 60aca6b1..d99607a0 100755 --- a/towerDefence/run.mjs +++ b/towerDefence/run.mjs @@ -9,10 +9,10 @@ const WAVES = [ makeWave([1]), makeWave(new Array(1000).fill(1), 10), makeWave(new Array(1000).fill(1), 5), - makeWave(new Array(10000).fill(1), 1), - makeWave(new Array(10000).fill(1)), - makeWave(new Array(10000).fill(4)), - makeWave(new Array(10000).fill(200)), + makeWave(new Array(3000).fill(1), 1), + makeWave(new Array(5000).fill(1)), + makeWave(new Array(5000).fill(4)), + makeWave(new Array(1000).fill(200)), ]; const taskIdentifier = "log_if_999"; From 81bc1bb41874af0629c95bc955b38fc4ca3a1abb Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 15 Nov 2024 10:11:56 +0000 Subject: [PATCH 067/155] Ive we've already been running jobs slowly, don't sleep so long --- towerDefence/run.mjs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/towerDefence/run.mjs b/towerDefence/run.mjs index d99607a0..6ea92f2f 100755 --- a/towerDefence/run.mjs +++ b/towerDefence/run.mjs @@ -55,6 +55,7 @@ const GENERAL_JOBS_PER_MILLISECOND = GENERAL_JOBS_PER_SECOND / 1000; function makeWave(jobBatches, sleepDuration = -1) { return async () => { let totalCount = 0; + let start = Date.now(); for (let i = 0; i < jobBatches.length; i++) { const jobCount = jobBatches[i]; const jobs = []; @@ -80,7 +81,13 @@ function makeWave(jobBatches, sleepDuration = -1) { } // Give roughly enough time for the jobs to complete - await sleep(totalCount / GENERAL_JOBS_PER_MILLISECOND); + const estimatedExecutionTime = totalCount / GENERAL_JOBS_PER_MILLISECOND; + + const elapsed = Date.now() - start; + const timeToSleep = estimatedExecutionTime - elapsed; + if (timeToSleep > 0) { + await sleep(timeToSleep); + } // And wait for the jobs table to be empty const MAX_ATTEMPTS = 20; From 1308d84ec7cc16572de01952c0db299b95928f7f Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 15 Nov 2024 10:21:49 +0000 Subject: [PATCH 068/155] Track 
empty fetches --- towerDefence/graphile.config.mjs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/towerDefence/graphile.config.mjs b/towerDefence/graphile.config.mjs index ba97edae..f162dbed 100644 --- a/towerDefence/graphile.config.mjs +++ b/towerDefence/graphile.config.mjs @@ -9,6 +9,7 @@ const CONCURRENT_JOBS = 10; const stats = { fetches: 0, + emptyFetches: 0, jobsFetched: 0, jobsReturned: 0, timeInMode: Object.create(null), @@ -49,10 +50,13 @@ const TowerDefenceResultPlugin = { `\nPool ${event.workerPool.id} released\nFetches=${p( stats.fetches, 5, - )}|Fetched=${p(stats.jobsFetched, 6)}|Returned=${p( - stats.jobsReturned, + )}(empty=${p(stats.emptyFetches, 5)})|Fetched=${p( + stats.jobsFetched, 6, - )}|TotalDelay=${p(ms(stats.timeInRefetchDelay), 11)}(Aborted=${p( + )}|Returned=${p(stats.jobsReturned, 6)}|TotalDelay=${p( + ms(stats.timeInRefetchDelay), + 11, + )}(Aborted=${p( `${stats.refetchDelaysAborted}/${stats.refetchDelays}`, 9, )})|${tim()}\n`, @@ -60,6 +64,9 @@ const TowerDefenceResultPlugin = { }); ctx.events.on("localQueue:getJobs:complete", ({ jobs }) => { stats.fetches += 1; + if (jobs.length === 0) { + stats.emptyFetches += 1; + } stats.jobsFetched += jobs.length; }); ctx.events.on("localQueue:returnJobs", ({ jobs }) => { From 57b411ee054f8ce062936f8ff916d1b66c510112 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 15 Nov 2024 14:18:13 +0000 Subject: [PATCH 069/155] Use addJobs API and track latency --- towerDefence/graphile.config.mjs | 24 ++++++++++++++++++------ towerDefence/run.mjs | 31 +++++++++++++++++-------------- 2 files changed, 35 insertions(+), 20 deletions(-) diff --git a/towerDefence/graphile.config.mjs b/towerDefence/graphile.config.mjs index f162dbed..219eeb09 100644 --- a/towerDefence/graphile.config.mjs +++ b/towerDefence/graphile.config.mjs @@ -16,6 +16,8 @@ const stats = { timeInRefetchDelay: 0n, refetchDelays: 0, refetchDelaysAborted: 0, + maxLatency: 0, + latencySum: 0, }; let 
lastModeStart = process.hrtime.bigint(); @@ -50,13 +52,16 @@ const TowerDefenceResultPlugin = { `\nPool ${event.workerPool.id} released\nFetches=${p( stats.fetches, 5, - )}(empty=${p(stats.emptyFetches, 5)})|Fetched=${p( - stats.jobsFetched, + )}(empty=${p(stats.emptyFetches, 5)};maxLatency=${p( + stats.maxLatency, + 4, + )}ms;avgLatency=${p( + (stats.latencySum / stats.jobsFetched).toFixed(2), + 8, + )}ms)|Fetched=${p(stats.jobsFetched, 6)}|Returned=${p( + stats.jobsReturned, 6, - )}|Returned=${p(stats.jobsReturned, 6)}|TotalDelay=${p( - ms(stats.timeInRefetchDelay), - 11, - )}(Aborted=${p( + )}|TotalDelay=${p(ms(stats.timeInRefetchDelay), 11)}(Aborted=${p( `${stats.refetchDelaysAborted}/${stats.refetchDelays}`, 9, )})|${tim()}\n`, @@ -95,6 +100,13 @@ const TowerDefenceResultPlugin = { const elapsed = process.hrtime.bigint() - refetchDelayStart; stats.timeInRefetchDelay += elapsed; }); + ctx.events.on("job:start", (event) => { + const l = Date.now() - +event.job.run_at; + stats.latencySum += l; + if (l > stats.maxLatency) { + stats.maxLatency = l; + } + }); }, }, }, diff --git a/towerDefence/run.mjs b/towerDefence/run.mjs index 6ea92f2f..c2a2cf42 100755 --- a/towerDefence/run.mjs +++ b/towerDefence/run.mjs @@ -3,6 +3,8 @@ import { execSync, spawn } from "child_process"; import pg from "pg"; +import { makeWorkerUtils } from "../dist/index.js"; + const STUCK_JOB_COUNT = 0; const PARALLELISM = 10; const WAVES = [ @@ -48,6 +50,10 @@ const spawnOptions = { const pgPool = new pg.Pool({ connectionString: process.env.PERF_DATABASE_URL }); +const workerUtils = await makeWorkerUtils({ + pgPool, +}); + const GENERAL_JOBS_PER_SECOND = 15000; const GENERAL_JOBS_PER_MILLISECOND = GENERAL_JOBS_PER_SECOND / 1000; @@ -55,26 +61,23 @@ const GENERAL_JOBS_PER_MILLISECOND = GENERAL_JOBS_PER_SECOND / 1000; function makeWave(jobBatches, sleepDuration = -1) { return async () => { let totalCount = 0; - let start = Date.now(); + const NOW = new Date(); + let start = +NOW; for (let i = 0; i 
< jobBatches.length; i++) { const jobCount = jobBatches[i]; + /** @type {import("../dist/index.js").AddJobsJobSpec[]} */ const jobs = []; for (let i = 0; i < jobCount; i++) { totalCount++; - jobs.push( - `("${taskIdentifier.replace( - /["\\]/g, - "\\$&", - )}","{\\"id\\":${i}}",,,,,,)`, - ); + jobs.push({ + identifier: taskIdentifier, + payload: { + id: i, + }, + runAt: NOW, + }); } - const jobsString = `{"${jobs - .map((j) => j.replace(/["\\]/g, "\\$&")) - .join('","')}"}`; - await pgPool.query( - `select graphile_worker.add_jobs($1::graphile_worker.job_spec[]);`, - [jobsString], - ); + await workerUtils.addJobs(jobs); if (sleepDuration >= 0) { await sleep(sleepDuration); } From d2e4f1241fe0db12701657abc00f69fcb89049e7 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 15 Nov 2024 14:24:35 +0000 Subject: [PATCH 070/155] Can't delete DB if workerUtils is connected --- towerDefence/run.mjs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/towerDefence/run.mjs b/towerDefence/run.mjs index c2a2cf42..4e988342 100755 --- a/towerDefence/run.mjs +++ b/towerDefence/run.mjs @@ -50,16 +50,12 @@ const spawnOptions = { const pgPool = new pg.Pool({ connectionString: process.env.PERF_DATABASE_URL }); -const workerUtils = await makeWorkerUtils({ - pgPool, -}); - const GENERAL_JOBS_PER_SECOND = 15000; const GENERAL_JOBS_PER_MILLISECOND = GENERAL_JOBS_PER_SECOND / 1000; -/** @type {(jobBatches: number[], sleepDuration?: number) => () => Promise} */ +/** @type {(jobBatches: number[], sleepDuration?: number) => (workerUtils: import("../dist/interfaces.js").WorkerUtils) => Promise} */ function makeWave(jobBatches, sleepDuration = -1) { - return async () => { + return async (workerUtils) => { let totalCount = 0; const NOW = new Date(); let start = +NOW; @@ -135,6 +131,10 @@ async function main() { console.log("Installing the schema"); execSync("node ../dist/cli.js --schema-only", execOptions); + const workerUtils = await makeWorkerUtils({ + pgPool, + 
}); + if (STUCK_JOB_COUNT > 0) { console.log(`Scheduling ${STUCK_JOB_COUNT} stuck jobs`); await time(() => { @@ -186,7 +186,7 @@ async function main() { for (let waveNumber = 0; waveNumber < WAVES.length; waveNumber++) { const wave = WAVES[waveNumber]; console.log(`Wave ${waveNumber + 1}...`); - await wave(); + await wave(workerUtils); console.log(); console.log(); } From daa2a6208ccfc128a90f8cabee6989d63eb439ef Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 15 Nov 2024 14:26:44 +0000 Subject: [PATCH 071/155] Stupid mistake --- towerDefence/run.mjs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/towerDefence/run.mjs b/towerDefence/run.mjs index 4e988342..9106ab62 100755 --- a/towerDefence/run.mjs +++ b/towerDefence/run.mjs @@ -57,9 +57,9 @@ const GENERAL_JOBS_PER_MILLISECOND = GENERAL_JOBS_PER_SECOND / 1000; function makeWave(jobBatches, sleepDuration = -1) { return async (workerUtils) => { let totalCount = 0; - const NOW = new Date(); - let start = +NOW; + let start = Date.now(); for (let i = 0; i < jobBatches.length; i++) { + const NOW = new Date(); const jobCount = jobBatches[i]; /** @type {import("../dist/index.js").AddJobsJobSpec[]} */ const jobs = []; From ee97cde6f0e01f49cb29fa8cf40d45ebba31d7be Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 15 Nov 2024 15:06:44 +0000 Subject: [PATCH 072/155] Fix division by zero error --- towerDefence/graphile.config.mjs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/towerDefence/graphile.config.mjs b/towerDefence/graphile.config.mjs index 219eeb09..8fa04380 100644 --- a/towerDefence/graphile.config.mjs +++ b/towerDefence/graphile.config.mjs @@ -56,7 +56,9 @@ const TowerDefenceResultPlugin = { stats.maxLatency, 4, )}ms;avgLatency=${p( - (stats.latencySum / stats.jobsFetched).toFixed(2), + stats.jobsFetched + ? 
(stats.latencySum / stats.jobsFetched).toFixed(2) + : "-", 8, )}ms)|Fetched=${p(stats.jobsFetched, 6)}|Returned=${p( stats.jobsReturned, From f000ad085df7af81542c5a73bdd10cd5e0f5ff09 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 15 Nov 2024 15:33:28 +0000 Subject: [PATCH 073/155] Add another wave, and increase local queue size by 1 --- towerDefence/graphile.config.mjs | 2 +- towerDefence/run.mjs | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/towerDefence/graphile.config.mjs b/towerDefence/graphile.config.mjs index 8fa04380..44184721 100644 --- a/towerDefence/graphile.config.mjs +++ b/towerDefence/graphile.config.mjs @@ -133,7 +133,7 @@ const preset = { pollInterval: 2000, localQueue: { - size: CONCURRENT_JOBS, + size: CONCURRENT_JOBS + 1, refetchDelay: { durationMs: 1000, maxAbortThreshold: CONCURRENT_JOBS * 10, diff --git a/towerDefence/run.mjs b/towerDefence/run.mjs index 9106ab62..944ac742 100755 --- a/towerDefence/run.mjs +++ b/towerDefence/run.mjs @@ -14,9 +14,15 @@ const WAVES = [ makeWave(new Array(3000).fill(1), 1), makeWave(new Array(5000).fill(1)), makeWave(new Array(5000).fill(4)), - makeWave(new Array(1000).fill(200)), + makeWave(new Array(200).fill(200)), + makeWave(Array.from({ length: 50 }, repeat([2000, 200, 20, 2])), 5), ]; +/** @type {(arr: T[]) => (_: any, i: number) => T} */ +function repeat(arr) { + return (_, i) => arr[i % arr.length]; +} + const taskIdentifier = "log_if_999"; const __dirname = new URL(".", import.meta.url).pathname; From b445bf6a4d793290c0ce39e3aeadfcc830da1a67 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 15 Nov 2024 15:59:48 +0000 Subject: [PATCH 074/155] Use JSON rather than constructing tuples; batch at 1M --- perfTest/init.js | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/perfTest/init.js b/perfTest/init.js index 1cd6f952..f9d779d9 100644 --- a/perfTest/init.js +++ b/perfTest/init.js @@ -36,21 +36,19 @@ $$ language plpgsql;`, } 
else { const jobs = []; for (let i = 0; i < jobCount; i++) { - jobs.push( - `("${taskIdentifier.replace( - /["\\]/g, - "\\$&", - )}","{\\"id\\":${i}}",,,,,,)`, + jobs.push({ identifier: taskIdentifier, payload: { id: i } }); + } + console.time(`Adding jobs`); + while (jobs.length > 0) { + const jobsSlice = jobs.splice(0, 1000000); + const jobsString = JSON.stringify(jobsSlice); + console.log(`Adding ${jobsSlice.length} jobs`); + await pgPool.query( + `select 1 from graphile_worker.add_jobs(array(select json_populate_recordset(null::graphile_worker.job_spec, $1::json)));`, + [jobsString], ); + console.log(`...added`); } - const jobsString = `{"${jobs - .map((j) => j.replace(/["\\]/g, "\\$&")) - .join('","')}"}`; - console.time("Adding jobs"); - await pgPool.query( - `select graphile_worker.add_jobs($1::graphile_worker.job_spec[]);`, - [jobsString], - ); console.timeEnd("Adding jobs"); } From 1afae2cdc40f152d10798d56a27c1a33ca5314e6 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 15 Nov 2024 16:12:46 +0000 Subject: [PATCH 075/155] 0.17.0-canary.6aeb577 --- package.json | 2 +- src/version.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/package.json b/package.json index c169e680..9932a6a2 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "graphile-worker", - "version": "0.17.0-canary.379fb2e", + "version": "0.17.0-canary.6aeb577", "type": "commonjs", "description": "Job queue for PostgreSQL", "main": "dist/index.js", diff --git a/src/version.ts b/src/version.ts index 4264f926..0909b236 100644 --- a/src/version.ts +++ b/src/version.ts @@ -1,2 +1,2 @@ // This file is autogenerated by /scripts/postversion.mjs -export const version = "0.17.0-canary.379fb2e"; +export const version = "0.17.0-canary.6aeb577"; From 51c5648b7442ed718452e2c25ab063ac27da897e Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 15 Nov 2024 17:03:44 +0000 Subject: [PATCH 076/155] 0.17.0-canary.9817f67 --- package.json | 2 +- src/version.ts | 2 
+- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/package.json b/package.json index 9932a6a2..0f18f445 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "graphile-worker", - "version": "0.17.0-canary.6aeb577", + "version": "0.17.0-canary.9817f67", "type": "commonjs", "description": "Job queue for PostgreSQL", "main": "dist/index.js", diff --git a/src/version.ts b/src/version.ts index 0909b236..2b893a05 100644 --- a/src/version.ts +++ b/src/version.ts @@ -1,2 +1,2 @@ // This file is autogenerated by /scripts/postversion.mjs -export const version = "0.17.0-canary.6aeb577"; +export const version = "0.17.0-canary.9817f67"; From 6a7dd036712683a7d27a7bf88a5c81483d662310 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Mon, 18 Nov 2024 10:45:05 +0000 Subject: [PATCH 077/155] "Breaking" change --- sql/000019.sql | 4 ++++ src/generated/sql.ts | 5 +++++ 2 files changed, 9 insertions(+) create mode 100644 sql/000019.sql diff --git a/sql/000019.sql b/sql/000019.sql new file mode 100644 index 00000000..56d7aa8e --- /dev/null +++ b/sql/000019.sql @@ -0,0 +1,4 @@ +--! breaking-change +-- This is just a breaking change marker for the v0.17 worker-centric to +-- pool-centric jump. The migration itself is not breaking. +select 1; diff --git a/src/generated/sql.ts b/src/generated/sql.ts index 6a6fe7ec..df57995a 100644 --- a/src/generated/sql.ts +++ b/src/generated/sql.ts @@ -2359,5 +2359,10 @@ begin return v_job; end; $$; +`, + "000019.sql": String.raw`--! breaking-change +-- This is just a breaking change marker for the v0.17 worker-centric to +-- pool-centric jump. The migration itself is not breaking. 
+select 1; `, }; From c141856e539858ae2d4650ea040aca1a8b3fadda Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Mon, 18 Nov 2024 11:04:43 +0000 Subject: [PATCH 078/155] Fix migration tests --- __tests__/migrate.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/__tests__/migrate.test.ts b/__tests__/migrate.test.ts index eb61cc61..e2d76d1b 100644 --- a/__tests__/migrate.test.ts +++ b/__tests__/migrate.test.ts @@ -14,7 +14,7 @@ import { const options: WorkerSharedOptions = {}; -const MAX_MIGRATION_NUMBER = 18; +const MAX_MIGRATION_NUMBER = 19; test("migration installs schema; second migration does no harm", async () => { await withPgClient(async (pgClient) => { From e7bd5770020a141122ab91ad6d679828244353e5 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Mon, 18 Nov 2024 13:00:10 +0000 Subject: [PATCH 079/155] Fix all the issues from TypeScript strict mode --- src/main.ts | 2 +- src/plugins/LoadTaskFromJsPlugin.ts | 1 + src/worker.ts | 2 -- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/main.ts b/src/main.ts index 0af9daf6..7aba7e10 100644 --- a/src/main.ts +++ b/src/main.ts @@ -653,7 +653,7 @@ export function _runTaskList( `Graphile Worker internal error: terminate() was called twice for worker pool. 
Ignoring second call; but this indicates a bug - please file an issue.`, ); } catch (e) { - logger.error(String(e.stack)); + logger.error(String(coerceError(e).stack)); } } } diff --git a/src/plugins/LoadTaskFromJsPlugin.ts b/src/plugins/LoadTaskFromJsPlugin.ts index 64b1e124..14173bc9 100644 --- a/src/plugins/LoadTaskFromJsPlugin.ts +++ b/src/plugins/LoadTaskFromJsPlugin.ts @@ -4,6 +4,7 @@ import { pathToFileURL } from "url"; import { FileDetails, isValidTask } from "../index.js"; import { coerceError } from "../lib.js"; import { version } from "../version.js"; +import { coerceError } from "../lib.js"; const DEFAULT_EXTENSIONS = [".js", ".mjs", ".cjs"]; diff --git a/src/worker.ts b/src/worker.ts index b21d8786..e1eaaebc 100644 --- a/src/worker.ts +++ b/src/worker.ts @@ -19,8 +19,6 @@ import { coerceError, CompiledSharedOptions } from "./lib"; const NO_LOG_SUCCESS = !!process.env.NO_LOG_SUCCESS; -const NO_LOG_SUCCESS = !!process.env.NO_LOG_SUCCESS; - export function makeNewWorker( compiledSharedOptions: CompiledSharedOptions, params: { From 91f1a5e10f58c6cbef42225bcdd3bb4bee8ccb83 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Mon, 18 Nov 2024 13:55:13 +0000 Subject: [PATCH 080/155] Move release of batching of fail/complete to terminate --- src/main.ts | 124 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 95 insertions(+), 29 deletions(-) diff --git a/src/main.ts b/src/main.ts index 7aba7e10..6e64b211 100644 --- a/src/main.ts +++ b/src/main.ts @@ -593,20 +593,49 @@ export function _runTaskList( async function deactivate() { if (workerPool._active) { workerPool._active = false; - // TODO: stop the batch()es and await the promises here + const errors: Error[] = []; + try { + await localQueue?.release(); + } catch (rawE) { + const e = coerceError(rawE); + errors.push(e); + // Log but continue regardless + logger.error(`Releasing local queue failed: ${e}`, { error: rawE }); + } + try { + // Note: this runs regardless of success of the above + 
await onDeactivate?.(); + } catch (rawE) { + const e = coerceError(rawE); + errors.push(e); + // Log but continue regardless + logger.error(`onDeactivate raised an error: ${e}`, { error: rawE }); + } + + if (errors.length > 0) { + throw new AggregateError( + errors, + "Errors occurred whilst deactivating queue", + ); + } + } + } + + let terminated = false; + async function terminate(error?: Error) { + if (!terminated) { + terminated = true; + const releaseCompleteJobPromise = releaseCompleteJob?.(); const releaseFailJobPromise = releaseFailJob?.(); - const releaseLocalQueue = localQueue?.release(); - const [ - releaseCompleteJobResult, - releaseFailJobResult, - releaseLocalQueueResult, - ] = await Promise.allSettled([ - releaseCompleteJobPromise, - releaseFailJobPromise, - releaseLocalQueue, - ]); + const [releaseCompleteJobResult, releaseFailJobResult] = + await Promise.allSettled([ + releaseCompleteJobPromise, + releaseFailJobPromise, + ]); + const errors: Error[] = error ? [error] : []; if (releaseCompleteJobResult.status === "rejected") { + errors.push(coerceError(releaseCompleteJobResult.reason)); // Log but continue regardless logger.error( `Releasing complete job batcher failed: ${releaseCompleteJobResult.reason}`, @@ -616,6 +645,7 @@ export function _runTaskList( ); } if (releaseFailJobResult.status === "rejected") { + errors.push(coerceError(releaseFailJobResult.reason)); // Log but continue regardless logger.error( `Releasing failed job batcher failed: ${releaseFailJobResult.reason}`, @@ -624,26 +654,27 @@ export function _runTaskList( }, ); } - if (releaseLocalQueueResult.status === "rejected") { - // Log but continue regardless - logger.error( - `Releasing local queue failed: ${releaseLocalQueueResult.reason}`, - { - error: releaseLocalQueueResult.reason, - }, - ); - } - return onDeactivate?.(); - } - } - let terminated = false; - function terminate() { - if (!terminated) { - terminated = true; const idx = allWorkerPools.indexOf(workerPool); 
allWorkerPools.splice(idx, 1); - promise.resolve(onTerminate?.()); + + try { + const result = onTerminate?.(); + promise.resolve(result); + } catch (e) { + errors.push(coerceError(e)); + } + if (errors.length === 1) { + promise.reject(errors[0]); + } else if (errors.length > 1) { + promise.reject( + new AggregateError( + errors, + "Errors occurred whilst terminating queue", + ), + ); + } + if (unregisterSignalHandlers) { unregisterSignalHandlers(); } @@ -732,14 +763,18 @@ export function _runTaskList( try { logger.debug(`Attempting graceful shutdown`); // Stop new jobs being added + // TODO: releasing the job releasers BEFORE we release the workers doesn't make any sense? const deactivatePromise = deactivate(); + const errors: Error[] = []; + // Remove all the workers - we're shutting them down manually const workers = [...workerPool._workers]; const workerPromises = workers.map((worker) => worker.release()); const [deactivateResult, ...workerReleaseResults] = await Promise.allSettled([deactivatePromise, ...workerPromises]); if (deactivateResult.status === "rejected") { + errors.push(coerceError(deactivateResult.reason)); // Log but continue regardless logger.error(`Deactivation failed: ${deactivateResult.reason}`, { error: deactivateResult.reason, @@ -794,6 +829,12 @@ export function _runTaskList( cancelledJobs, }); } + if (errors.length > 0) { + throw new AggregateError( + errors, + "Errors occurred whilst shutting down worker", + ); + } events.emit("pool:gracefulShutdown:complete", { pool: workerPool, workerPool, @@ -1067,7 +1108,32 @@ export function _runTaskList( } workerPool._workers.splice(workerPool._workers.indexOf(worker), 1); if (!continuous && workerPool._workers.length === 0) { - deactivate().then(terminate, terminate); + compiledSharedOptions.events.emit("pool:gracefulShutdown", { + workerPool, + pool: workerPool, + message: + "'Run once' mode processed all available jobs and is now exiting", + }); + deactivate().then( + () => { + 
compiledSharedOptions.events.emit( + "pool:gracefulShutdown:complete", + { + workerPool, + pool: workerPool, + }, + ); + terminate(); + }, + (error) => { + compiledSharedOptions.events.emit("pool:gracefulShutdown:error", { + workerPool, + pool: workerPool, + error, + }); + terminate(error); + }, + ); } }; worker.promise.then( From f4b7c4a71055f65be19d073f053e02a051bdaadc Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Mon, 18 Nov 2024 17:03:02 +0000 Subject: [PATCH 081/155] Test that graceful shutdown works in runOnce --- src/main.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.ts b/src/main.ts index 6e64b211..e30e24b8 100644 --- a/src/main.ts +++ b/src/main.ts @@ -693,7 +693,7 @@ export function _runTaskList( const abortSignal = abortController.signal; const abortPromise = new Promise((_resolve, reject) => { abortSignal.addEventListener("abort", () => { - reject(abortSignal.reason); + reject(coerceError(abortSignal.reason)); }); }); // Make sure Node doesn't get upset about unhandled rejection From aa43296d6ac6b9771f6d6dffe8d863eb810728c8 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 19 Nov 2024 15:43:51 +0000 Subject: [PATCH 082/155] Implement middleware system --- src/index.ts | 68 +++++++++++++++++++++++++- src/lib.ts | 127 ++++++++++++++++++++++++++++++------------------- src/migrate.ts | 103 ++++++++++++++++++++------------------- 3 files changed, 198 insertions(+), 100 deletions(-) diff --git a/src/index.ts b/src/index.ts index 2b5346ce..c8594865 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,5 +1,9 @@ import { Logger } from "@graphile/logger"; -import { PluginHook } from "graphile-config"; +import { + CallbackOrDescriptor, + MiddlewareNext, + PluginHook, +} from "graphile-config"; import type { PoolClient } from "pg"; import { getCronItems } from "./getCronItems"; @@ -7,14 +11,18 @@ import { getTasks } from "./getTasks"; import { FileDetails, PromiseOrDirect, + RunOnceOptions, + SharedOptions, Task, TaskList, 
WithPgClient, Worker, WorkerEvents, WorkerPluginContext, + WorkerSharedOptions, + WorkerUtilsOptions, } from "./interfaces"; -import { CompiledSharedOptions } from "./lib"; +import { CompiledSharedOptions, ResolvedWorkerPreset } from "./lib"; export { parseCronItem, parseCronItems, parseCrontab } from "./crontab"; export * from "./interfaces"; export { @@ -37,6 +45,22 @@ declare global { interface Tasks { /* extend this through declaration merging */ } + interface BootstrapEvent { + /** + * The client used to perform the bootstrap. Replacing this is not officially + * supported, but... + */ + client: PoolClient; + /** + * The Postgres version number, e.g. 120000 for PostgreSQL 12.0 + */ + readonly postgresVersion: number; + /** + * Somewhere to store temporary data from plugins, only used during + * bootstrap and migrate + */ + readonly scratchpad: Record; + } interface MigrateEvent { /** * The client used to run the migration. Replacing this is not officially @@ -278,6 +302,19 @@ declare global { interface Plugin { worker?: { + // TODO: replace with the following once we upgrade graphile-config again + // middleware?: MiddlewareHandlers + middleware?: { + [key in keyof WorkerMiddleware]?: CallbackOrDescriptor< + WorkerMiddleware[key] extends ( + ...args: infer UArgs + ) => infer UResult + ? (next: MiddlewareNext, ...args: UArgs) => UResult + : never + >; + }; + + // TODO: deprecate this, replace with middleware hooks?: { [key in keyof WorkerHooks]?: PluginHook< WorkerHooks[key] extends (...args: infer UArgs) => infer UResult @@ -287,6 +324,33 @@ declare global { }; }; } + + interface WorkerMiddleware { + /** + * Called when Graphile Worker starts up. + */ + init< + T extends + | SharedOptions + | WorkerSharedOptions + | WorkerOptions + | RunOnceOptions + | WorkerUtilsOptions, + >(event: { + resolvedPreset: ResolvedWorkerPreset; + }): CompiledSharedOptions; + + /** + * Called when installing the Graphile Worker DB schema (or upgrading it). 
+ */ + bootstrap(event: GraphileWorker.BootstrapEvent): PromiseOrDirect; + + /** + * Called when migrating the Graphile Worker DB. + */ + migrate(event: GraphileWorker.MigrateEvent): PromiseOrDirect; + } + interface WorkerHooks { /** * Called when Graphile Worker starts up. diff --git a/src/lib.ts b/src/lib.ts index e79bc8b1..7a548165 100644 --- a/src/lib.ts +++ b/src/lib.ts @@ -1,6 +1,12 @@ import * as assert from "assert"; import { EventEmitter } from "events"; -import { applyHooks, AsyncHooks, resolvePresets } from "graphile-config"; +import { + applyHooks, + AsyncHooks, + Middleware, + orderedApply, + resolvePresets, +} from "graphile-config"; import { Client, Pool, PoolClient, PoolConfig } from "pg"; import { makeWorkerPresetWorkerOptions } from "./config"; @@ -64,6 +70,7 @@ export interface CompiledSharedOptions< _rawOptions: T; resolvedPreset: ResolvedWorkerPreset; hooks: AsyncHooks; + middleware: Middleware; } interface ProcessSharedOptionsSettings { @@ -214,58 +221,78 @@ export function processSharedOptions< legacyOptionsToPreset(options), ]) as ResolvedWorkerPreset; - const { - worker: { - minResetLockedInterval, - maxResetLockedInterval, - schema: workerSchema, - logger, - events = new EventEmitter(), - }, - } = resolvedPreset; - - const escapedWorkerSchema = Client.prototype.escapeIdentifier(workerSchema); - if ( - !Number.isFinite(minResetLockedInterval) || - !Number.isFinite(maxResetLockedInterval) || - minResetLockedInterval < 1 || - maxResetLockedInterval < minResetLockedInterval - ) { - throw new Error( - `Invalid values for minResetLockedInterval (${minResetLockedInterval})/maxResetLockedInterval (${maxResetLockedInterval})`, - ); - } - const hooks = new AsyncHooks(); - compiled = { - version, - maxMigrationNumber: MAX_MIGRATION_NUMBER, - breakingMigrationNumbers: BREAKING_MIGRATIONS, - events, - logger, - workerSchema, - escapedWorkerSchema, - _rawOptions: options, - hooks, - resolvedPreset, - }; - applyHooks( + const middleware = new 
Middleware(); + + orderedApply( resolvedPreset.plugins, - (p) => p.worker?.hooks, - (name, fn, plugin) => { - const context: WorkerPluginContext = compiled!; - // eslint-disable-next-line @typescript-eslint/no-explicit-any - const cb = ((...args: any[]) => fn(context, ...args)) as any; - cb.displayName = `${plugin.name}_hook_${name}`; - hooks.hook(name, cb); + (plugin) => plugin.worker?.middleware, + (name, fn, _plugin) => { + middleware.register(name, fn as any); }, ); - _sharedOptionsCache.set(options, compiled); - Promise.resolve(hooks.process("init")).catch((error) => { - logger.error( - `One of the plugins you are using raised an error during 'init'; but errors during 'init' are currently ignored. Continuing. Error: ${error}`, - { error }, - ); - }); + + compiled = middleware.run( + "init", + { resolvedPreset }, + ({ resolvedPreset }) => { + const { + worker: { + minResetLockedInterval, + maxResetLockedInterval, + schema: workerSchema, + logger, + events = new EventEmitter(), + }, + } = resolvedPreset; + + const escapedWorkerSchema = + Client.prototype.escapeIdentifier(workerSchema); + if ( + !Number.isFinite(minResetLockedInterval) || + !Number.isFinite(maxResetLockedInterval) || + minResetLockedInterval < 1 || + maxResetLockedInterval < minResetLockedInterval + ) { + throw new Error( + `Invalid values for minResetLockedInterval (${minResetLockedInterval})/maxResetLockedInterval (${maxResetLockedInterval})`, + ); + } + const hooks = new AsyncHooks(); + const compiled = { + version, + maxMigrationNumber: MAX_MIGRATION_NUMBER, + breakingMigrationNumbers: BREAKING_MIGRATIONS, + events, + logger, + workerSchema, + escapedWorkerSchema, + _rawOptions: options, + hooks, + middleware, + resolvedPreset, + }; + applyHooks( + resolvedPreset.plugins, + (p) => p.worker?.hooks, + (name, fn, plugin) => { + const context: WorkerPluginContext = compiled!; + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const cb = ((...args: any[]) => fn(context, ...args)) 
as any; + cb.displayName = `${plugin.name}_hook_${name}`; + hooks.hook(name, cb); + }, + ); + _sharedOptionsCache.set(options, compiled); + // 'init' hook is deprecated; use middleware instead. + Promise.resolve(hooks.process("init")).catch((error) => { + logger.error( + `One of the plugins you are using raised an error during 'init'; but errors during 'init' are currently ignored. Continuing. Error: ${error}`, + { error }, + ); + }); + return compiled; + }, + ) as CompiledSharedOptions; } if (scope) { return { diff --git a/src/migrate.ts b/src/migrate.ts index dfa544fc..a7132c74 100644 --- a/src/migrate.ts +++ b/src/migrate.ts @@ -34,25 +34,27 @@ export async function installSchema( compiledSharedOptions: CompiledSharedOptions, event: GraphileWorker.MigrateEvent, ) { - const { hooks, escapedWorkerSchema } = compiledSharedOptions; + const { hooks, escapedWorkerSchema, middleware } = compiledSharedOptions; (event as Writeable).postgresVersion = await fetchAndCheckPostgresVersion(event.client); - await hooks.process("prebootstrap", event); - // Change to this query should be reflected in website/docs/schema.md - await event.client.query(` - create schema if not exists ${escapedWorkerSchema}; - create table if not exists ${escapedWorkerSchema}.migrations( - id int primary key, - ts timestamptz default now() not null + await middleware.run("bootstrap", event, async (event) => { + await hooks.process("prebootstrap", event); + // Change to this query should be reflected in website/docs/schema.md + await event.client.query(`\ +create schema if not exists ${escapedWorkerSchema}; +create table if not exists ${escapedWorkerSchema}.migrations( + id int primary key, + ts timestamptz default now() not null +); +alter table ${escapedWorkerSchema}.migrations add column if not exists breaking boolean not null default false; +`); + await event.client.query( + `update ${escapedWorkerSchema}.migrations set breaking = true where id = any($1::int[])`, + [BREAKING_MIGRATIONS], ); - 
alter table ${escapedWorkerSchema}.migrations add column if not exists breaking boolean not null default false; - `); - await event.client.query( - `update ${escapedWorkerSchema}.migrations set breaking = true where id = any($1::int[])`, - [BREAKING_MIGRATIONS], - ); - await hooks.process("postbootstrap", event); + await hooks.process("postbootstrap", event); + }); } /** @internal */ @@ -116,7 +118,8 @@ export async function migrate( compiledSharedOptions: CompiledSharedOptions, client: PoolClient, ) { - const { escapedWorkerSchema, hooks, logger } = compiledSharedOptions; + const { escapedWorkerSchema, hooks, logger, middleware } = + compiledSharedOptions; let latestMigration: number | null = null; let latestBreakingMigration: number | null = null; const event = { client, postgresVersion: 0, scratchpad: Object.create(null) }; @@ -159,40 +162,44 @@ select current_setting('server_version_num') as server_version_num, await sleep(400 + Math.random() * 200); } - await hooks.process("premigrate", event); + await middleware.run("migrate", event, async (event) => { + await hooks.process("premigrate", event); - const migrationFiles = Object.keys(migrations) as (keyof typeof migrations)[]; - let highestMigration = 0; - let migrated = false; - for (const migrationFile of migrationFiles) { - const migrationNumber = parseInt(migrationFile.slice(0, 6), 10); - if (migrationNumber > highestMigration) { - highestMigration = migrationNumber; - } - if (latestMigration == null || migrationNumber > latestMigration) { - migrated = true; - await runMigration( - compiledSharedOptions, - event, - migrationFile, - migrationNumber, - ); + const migrationFiles = Object.keys( + migrations, + ) as (keyof typeof migrations)[]; + let highestMigration = 0; + let migrated = false; + for (const migrationFile of migrationFiles) { + const migrationNumber = parseInt(migrationFile.slice(0, 6), 10); + if (migrationNumber > highestMigration) { + highestMigration = migrationNumber; + } + if 
(latestMigration == null || migrationNumber > latestMigration) { + migrated = true; + await runMigration( + compiledSharedOptions, + event, + migrationFile, + migrationNumber, + ); + } } - } - if (migrated) { - logger.debug(`Migrations complete`); - } + if (migrated) { + logger.debug(`Migrations complete`); + } - if (latestBreakingMigration && highestMigration < latestBreakingMigration) { - process.exitCode = 57; - throw new Error( - `Database is using Graphile Worker schema revision ${latestMigration} which includes breaking migration ${latestBreakingMigration}, but the currently running worker only supports up to revision ${highestMigration}. It would be unsafe to continue; please ensure all versions of Graphile Worker are compatible.`, - ); - } else if (latestMigration && highestMigration < latestMigration) { - logger.warn( - `Database is using Graphile Worker schema revision ${latestMigration}, but the currently running worker only supports up to revision ${highestMigration} which may or may not be compatible. Please ensure all versions of Graphile Worker you're running are compatible, or use Worker Pro which will perform this check for you. Attempting to continue regardless.`, - ); - } - await hooks.process("postmigrate", event); + if (latestBreakingMigration && highestMigration < latestBreakingMigration) { + process.exitCode = 57; + throw new Error( + `Database is using Graphile Worker schema revision ${latestMigration} which includes breaking migration ${latestBreakingMigration}, but the currently running worker only supports up to revision ${highestMigration}. It would be unsafe to continue; please ensure all versions of Graphile Worker are compatible.`, + ); + } else if (latestMigration && highestMigration < latestMigration) { + logger.warn( + `Database is using Graphile Worker schema revision ${latestMigration}, but the currently running worker only supports up to revision ${highestMigration} which may or may not be compatible. 
Please ensure all versions of Graphile Worker you're running are compatible, or use Worker Pro which will perform this check for you. Attempting to continue regardless.`, + ); + } + await hooks.process("postmigrate", event); + }); } From b746b4a4577a5be13cefbd27511e9b677fdb9d7d Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 19 Nov 2024 16:16:25 +0000 Subject: [PATCH 083/155] gracefulShutdown middleware --- src/index.ts | 11 +++ src/interfaces.ts | 4 +- src/main.ts | 199 ++++++++++++++++++++++++---------------------- 3 files changed, 116 insertions(+), 98 deletions(-) diff --git a/src/index.ts b/src/index.ts index c8594865..8b3ed231 100644 --- a/src/index.ts +++ b/src/index.ts @@ -19,6 +19,7 @@ import { Worker, WorkerEvents, WorkerPluginContext, + WorkerPool, WorkerSharedOptions, WorkerUtilsOptions, } from "./interfaces"; @@ -61,6 +62,7 @@ declare global { */ readonly scratchpad: Record; } + interface MigrateEvent { /** * The client used to run the migration. Replacing this is not officially @@ -77,6 +79,11 @@ declare global { */ readonly scratchpad: Record; } + + interface PoolGracefulShutdownEvent { + workerPool: WorkerPool; + message: string; + } } namespace GraphileConfig { @@ -349,6 +356,10 @@ declare global { * Called when migrating the Graphile Worker DB. 
*/ migrate(event: GraphileWorker.MigrateEvent): PromiseOrDirect; + + poolGracefulShutdown( + event: GraphileWorker.PoolGracefulShutdownEvent, + ): PromiseOrDirect; } interface WorkerHooks { diff --git a/src/interfaces.ts b/src/interfaces.ts index 808dbac9..aeedadc8 100644 --- a/src/interfaces.ts +++ b/src/interfaces.ts @@ -516,8 +516,8 @@ export interface WorkerPool { nudge(n: number): void; /** @deprecated Use gracefulShutdown instead */ release: () => Promise; - gracefulShutdown: (message?: string) => Promise; - forcefulShutdown: (message: string) => Promise; + gracefulShutdown: (message?: string) => PromiseOrDirect; + forcefulShutdown: (message: string) => PromiseOrDirect; promise: Promise; /** Fires 'abort' when all running jobs should stop because worker is shutting down. @experimental */ abortSignal: AbortSignal; diff --git a/src/main.ts b/src/main.ts index e30e24b8..db0e0b20 100644 --- a/src/main.ts +++ b/src/main.ts @@ -567,7 +567,7 @@ export function _runTaskList( } = options; let autostart = rawAutostart; - const { logger, events } = compiledSharedOptions; + const { logger, events, middleware } = compiledSharedOptions; if (ENABLE_DANGEROUS_LOGS) { logger.debug( @@ -733,9 +733,7 @@ export function _runTaskList( * Stop accepting jobs, and wait gracefully for the jobs that are in * progress to complete. 
*/ - async gracefulShutdown( - message = "Worker pool is shutting down gracefully", - ) { + gracefulShutdown(message = "Worker pool is shutting down gracefully") { if (workerPool._forcefulShuttingDown) { logger.error( `gracefulShutdown called when forcefulShutdown is already in progress`, @@ -755,106 +753,115 @@ export function _runTaskList( }, gracefulShutdownAbortTimeout); abortTimer.unref(); - events.emit("pool:gracefulShutdown", { - pool: workerPool, - workerPool, - message, - }); - try { - logger.debug(`Attempting graceful shutdown`); - // Stop new jobs being added - // TODO: releasing the job releasers BEFORE we release the workers doesn't make any sense? - const deactivatePromise = deactivate(); - - const errors: Error[] = []; - - // Remove all the workers - we're shutting them down manually - const workers = [...workerPool._workers]; - const workerPromises = workers.map((worker) => worker.release()); - const [deactivateResult, ...workerReleaseResults] = - await Promise.allSettled([deactivatePromise, ...workerPromises]); - if (deactivateResult.status === "rejected") { - errors.push(coerceError(deactivateResult.reason)); - // Log but continue regardless - logger.error(`Deactivation failed: ${deactivateResult.reason}`, { - error: deactivateResult.reason, + return middleware.run( + "poolGracefulShutdown", + { workerPool, message }, + async ({ workerPool, message }) => { + events.emit("pool:gracefulShutdown", { + pool: workerPool, + workerPool, + message, }); - } - const jobsToRelease: Job[] = []; - for (let i = 0; i < workerReleaseResults.length; i++) { - const workerReleaseResult = workerReleaseResults[i]; - if (workerReleaseResult.status === "rejected") { - const worker = workers[i]; - const job = worker.getActiveJob(); - events.emit("pool:gracefulShutdown:workerError", { + try { + logger.debug(`Attempting graceful shutdown`); + // Stop new jobs being added + // TODO: releasing the job releasers BEFORE we release the workers doesn't make any sense? 
+ const deactivatePromise = deactivate(); + + const errors: Error[] = []; + + // Remove all the workers - we're shutting them down manually + const workers = [...workerPool._workers]; + const workerPromises = workers.map((worker) => worker.release()); + const [deactivateResult, ...workerReleaseResults] = + await Promise.allSettled([deactivatePromise, ...workerPromises]); + if (deactivateResult.status === "rejected") { + errors.push(coerceError(deactivateResult.reason)); + // Log but continue regardless + logger.error(`Deactivation failed: ${deactivateResult.reason}`, { + error: deactivateResult.reason, + }); + } + const jobsToRelease: Job[] = []; + for (let i = 0; i < workerReleaseResults.length; i++) { + const workerReleaseResult = workerReleaseResults[i]; + if (workerReleaseResult.status === "rejected") { + const worker = workers[i]; + const job = worker.getActiveJob(); + events.emit("pool:gracefulShutdown:workerError", { + pool: workerPool, + workerPool, + error: workerReleaseResult.reason, + job, + }); + logger.debug( + `Cancelling worker ${worker.workerId} (job: ${ + job?.id ?? 
"none" + }) failed`, + { + worker, + job, + reason: workerReleaseResult.reason, + }, + ); + if (job) { + jobsToRelease.push(job); + } + } + } + if (jobsToRelease.length > 0) { + const workerIds = workers.map((worker) => worker.workerId); + logger.debug( + `Releasing the jobs ${jobsToRelease + .map((j) => j.id) + .join()} (workers: ${workerIds.join(", ")})`, + { + jobs: jobsToRelease, + workerIds, + }, + ); + const cancelledJobs = await failJobs( + compiledSharedOptions, + withPgClient, + workerPool.id, + jobsToRelease, + message, + ); + logger.debug(`Cancelled ${cancelledJobs.length} jobs`, { + cancelledJobs, + }); + } + if (errors.length > 0) { + throw new AggregateError( + errors, + "Errors occurred whilst shutting down worker", + ); + } + events.emit("pool:gracefulShutdown:complete", { pool: workerPool, workerPool, - error: workerReleaseResult.reason, - job, }); - logger.debug( - `Cancelling worker ${worker.workerId} (job: ${ - job?.id ?? "none" - }) failed`, + logger.debug("Graceful shutdown complete"); + } catch (e) { + events.emit("pool:gracefulShutdown:error", { + pool: workerPool, + workerPool, + error: e, + }); + const message = coerceError(e).message; + logger.error( + `Error occurred during graceful shutdown: ${message}`, { - worker, - job, - reason: workerReleaseResult.reason, + error: e, }, ); - if (job) { - jobsToRelease.push(job); - } + return this.forcefulShutdown(message); } - } - if (jobsToRelease.length > 0) { - const workerIds = workers.map((worker) => worker.workerId); - logger.debug( - `Releasing the jobs ${jobsToRelease - .map((j) => j.id) - .join()} (workers: ${workerIds.join(", ")})`, - { - jobs: jobsToRelease, - workerIds, - }, - ); - const cancelledJobs = await failJobs( - compiledSharedOptions, - withPgClient, - workerPool.id, - jobsToRelease, - message, - ); - logger.debug(`Cancelled ${cancelledJobs.length} jobs`, { - cancelledJobs, - }); - } - if (errors.length > 0) { - throw new AggregateError( - errors, - "Errors occurred whilst 
shutting down worker", - ); - } - events.emit("pool:gracefulShutdown:complete", { - pool: workerPool, - workerPool, - }); - logger.debug("Graceful shutdown complete"); - } catch (e) { - events.emit("pool:gracefulShutdown:error", { - pool: workerPool, - workerPool, - error: e, - }); - const message = coerceError(e).message; - logger.error(`Error occurred during graceful shutdown: ${message}`, { - error: e, - }); - return this.forcefulShutdown(message); - } - if (!terminated) { - terminate(); - } + if (!terminated) { + terminate(); + } + }, + ); }, /** From b57766a7023e8bc8fa81aca75b6546bcf6159d0d Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 19 Nov 2024 16:16:45 +0000 Subject: [PATCH 084/155] Shutting down non-continuous worker should go via gracefulShutdown path --- src/main.ts | 27 ++------------------------- 1 file changed, 2 insertions(+), 25 deletions(-) diff --git a/src/main.ts b/src/main.ts index db0e0b20..a346bed0 100644 --- a/src/main.ts +++ b/src/main.ts @@ -1115,31 +1115,8 @@ export function _runTaskList( } workerPool._workers.splice(workerPool._workers.indexOf(worker), 1); if (!continuous && workerPool._workers.length === 0) { - compiledSharedOptions.events.emit("pool:gracefulShutdown", { - workerPool, - pool: workerPool, - message: - "'Run once' mode processed all available jobs and is now exiting", - }); - deactivate().then( - () => { - compiledSharedOptions.events.emit( - "pool:gracefulShutdown:complete", - { - workerPool, - pool: workerPool, - }, - ); - terminate(); - }, - (error) => { - compiledSharedOptions.events.emit("pool:gracefulShutdown:error", { - workerPool, - pool: workerPool, - error, - }); - terminate(error); - }, + workerPool.gracefulShutdown( + "'Run once' mode processed all available jobs and is now exiting", ); } }; From 4e72225254487482d8958077076bf9ae6001726d Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 19 Nov 2024 16:17:30 +0000 Subject: [PATCH 085/155] f --- src/interfaces.ts | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/src/interfaces.ts b/src/interfaces.ts index aeedadc8..b3478bd4 100644 --- a/src/interfaces.ts +++ b/src/interfaces.ts @@ -515,7 +515,7 @@ export interface WorkerPool { /** Encourage `n` workers to look for jobs _right now_, cancelling the delay timers. */ nudge(n: number): void; /** @deprecated Use gracefulShutdown instead */ - release: () => Promise; + release: () => PromiseOrDirect; gracefulShutdown: (message?: string) => PromiseOrDirect; forcefulShutdown: (message: string) => PromiseOrDirect; promise: Promise; From 17ccefd944d16a82d8cdaf8c8aa3e4baac071a74 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 19 Nov 2024 17:01:37 +0000 Subject: [PATCH 086/155] Add middleware for forcefulShutdown, guarantee event emitter, rework init middleware --- src/config.ts | 3 + src/index.ts | 11 ++++ src/lib.ts | 30 +++++----- src/main.ts | 157 ++++++++++++++++++++++++++------------------------ 4 files changed, 112 insertions(+), 89 deletions(-) diff --git a/src/config.ts b/src/config.ts index 08eca53e..7e8f54b6 100644 --- a/src/config.ts +++ b/src/config.ts @@ -1,6 +1,8 @@ import { cosmiconfigSync } from "cosmiconfig"; +import EventEmitter from "events"; import { MINUTE, SECOND } from "./cronConstants"; +import type { WorkerEvents } from "./interfaces"; import { defaultLogger } from "./logger"; const cosmiconfigResult = cosmiconfigSync("graphile-worker").search(); @@ -34,6 +36,7 @@ export const makeWorkerPresetWorkerOptions = () => maxResetLockedInterval: 10 * MINUTE, gracefulShutdownAbortTimeout: 5 * SECOND, useNodeTime: false, + events: new EventEmitter() as WorkerEvents, } satisfies GraphileConfig.WorkerOptions); function enforceStringOrUndefined( diff --git a/src/index.ts b/src/index.ts index 8b3ed231..1d586afc 100644 --- a/src/index.ts +++ b/src/index.ts @@ -84,6 +84,11 @@ declare global { workerPool: WorkerPool; message: string; } + + interface PoolForcefulShutdownEvent { + workerPool: WorkerPool; + message: string; + } } 
namespace GraphileConfig { @@ -344,7 +349,9 @@ declare global { | RunOnceOptions | WorkerUtilsOptions, >(event: { + version: string; resolvedPreset: ResolvedWorkerPreset; + escapedWorkerSchema: string; }): CompiledSharedOptions; /** @@ -360,6 +367,10 @@ declare global { poolGracefulShutdown( event: GraphileWorker.PoolGracefulShutdownEvent, ): PromiseOrDirect; + + poolForcefulShutdown( + event: GraphileWorker.PoolForcefulShutdownEvent, + ): PromiseOrDirect; } interface WorkerHooks { diff --git a/src/lib.ts b/src/lib.ts index 7a548165..b4003cb6 100644 --- a/src/lib.ts +++ b/src/lib.ts @@ -231,22 +231,22 @@ export function processSharedOptions< }, ); + const { + worker: { + minResetLockedInterval, + maxResetLockedInterval, + schema: workerSchema, + logger, + events, + }, + plugins, + } = resolvedPreset; + const escapedWorkerSchema = Client.prototype.escapeIdentifier(workerSchema); + compiled = middleware.run( "init", - { resolvedPreset }, - ({ resolvedPreset }) => { - const { - worker: { - minResetLockedInterval, - maxResetLockedInterval, - schema: workerSchema, - logger, - events = new EventEmitter(), - }, - } = resolvedPreset; - - const escapedWorkerSchema = - Client.prototype.escapeIdentifier(workerSchema); + { resolvedPreset, escapedWorkerSchema, version }, + () => { if ( !Number.isFinite(minResetLockedInterval) || !Number.isFinite(maxResetLockedInterval) || @@ -272,7 +272,7 @@ export function processSharedOptions< resolvedPreset, }; applyHooks( - resolvedPreset.plugins, + plugins, (p) => p.worker?.hooks, (name, fn, plugin) => { const context: WorkerPluginContext = compiled!; diff --git a/src/main.ts b/src/main.ts index a346bed0..f70b645a 100644 --- a/src/main.ts +++ b/src/main.ts @@ -756,7 +756,7 @@ export function _runTaskList( return middleware.run( "poolGracefulShutdown", { workerPool, message }, - async ({ workerPool, message }) => { + async ({ message }) => { events.emit("pool:gracefulShutdown", { pool: workerPool, workerPool, @@ -867,87 +867,96 @@ export 
function _runTaskList( /** * Stop accepting jobs and "fail" all currently running jobs. */ - async forcefulShutdown(message: string) { + forcefulShutdown(message: string) { if (workerPool._forcefulShuttingDown) { logger.error( `forcefulShutdown called when forcefulShutdown is already in progress`, ); return; } - workerPool._forcefulShuttingDown = true; - events.emit("pool:forcefulShutdown", { - pool: workerPool, - workerPool, - message, - }); - try { - logger.debug(`Attempting forceful shutdown`); - // Stop new jobs being added - const deactivatePromise = deactivate(); - - // Release all our workers' jobs - const workers = [...workerPool._workers]; - const jobsInProgress: Array = workers - .map((worker) => worker.getActiveJob()) - .filter((job): job is Job => !!job); - - // Remove all the workers - we're shutting them down manually - const workerPromises = workers.map((worker) => worker.release(true)); - // Ignore the results, we're shutting down anyway - // TODO: add a timeout - const [deactivateResult, ..._ignoreWorkerReleaseResults] = - await Promise.allSettled([deactivatePromise, ...workerPromises]); - if (deactivateResult.status === "rejected") { - // Log but continue regardless - logger.error(`Deactivation failed: ${deactivateResult.reason}`, { - error: deactivateResult.reason, - }); - } - - if (jobsInProgress.length > 0) { - const workerIds = workers.map((worker) => worker.workerId); - logger.debug( - `Releasing the jobs ${jobsInProgress - .map((j) => j.id) - .join()} (workers: ${workerIds.join(", ")})`, - { - jobs: jobsInProgress, - workerIds, - }, - ); - const cancelledJobs = await failJobs( - compiledSharedOptions, - withPgClient, - workerPool.id, - jobsInProgress, + return middleware.run( + "poolForcefulShutdown", + { workerPool: this, message }, + async ({ message }) => { + workerPool._forcefulShuttingDown = true; + events.emit("pool:forcefulShutdown", { + pool: workerPool, + workerPool, message, - ); - logger.debug(`Cancelled ${cancelledJobs.length} 
jobs`, { - cancelledJobs, }); - } else { - logger.debug("No active jobs to release"); - } - events.emit("pool:forcefulShutdown:complete", { - pool: workerPool, - workerPool, - }); - logger.debug("Forceful shutdown complete"); - } catch (e) { - events.emit("pool:forcefulShutdown:error", { - pool: workerPool, - workerPool, - error: e, - }); - const error = coerceError(e); - logger.error( - `Error occurred during forceful shutdown: ${error.message}`, - { error: e }, - ); - } - if (!terminated) { - terminate(); - } + try { + logger.debug(`Attempting forceful shutdown`); + // Stop new jobs being added + const deactivatePromise = deactivate(); + + // Release all our workers' jobs + const workers = [...workerPool._workers]; + const jobsInProgress: Array = workers + .map((worker) => worker.getActiveJob()) + .filter((job): job is Job => !!job); + + // Remove all the workers - we're shutting them down manually + const workerPromises = workers.map((worker) => + worker.release(true), + ); + // Ignore the results, we're shutting down anyway + // TODO: add a timeout + const [deactivateResult, ..._ignoreWorkerReleaseResults] = + await Promise.allSettled([deactivatePromise, ...workerPromises]); + if (deactivateResult.status === "rejected") { + // Log but continue regardless + logger.error(`Deactivation failed: ${deactivateResult.reason}`, { + error: deactivateResult.reason, + }); + } + + if (jobsInProgress.length > 0) { + const workerIds = workers.map((worker) => worker.workerId); + logger.debug( + `Releasing the jobs ${jobsInProgress + .map((j) => j.id) + .join()} (workers: ${workerIds.join(", ")})`, + { + jobs: jobsInProgress, + workerIds, + }, + ); + const cancelledJobs = await failJobs( + compiledSharedOptions, + withPgClient, + workerPool.id, + jobsInProgress, + message, + ); + logger.debug(`Cancelled ${cancelledJobs.length} jobs`, { + cancelledJobs, + }); + } else { + logger.debug("No active jobs to release"); + } + events.emit("pool:forcefulShutdown:complete", { + pool: 
workerPool, + workerPool, + }); + logger.debug("Forceful shutdown complete"); + } catch (e) { + events.emit("pool:forcefulShutdown:error", { + pool: workerPool, + workerPool, + error: e, + }); + logger.error( + `Error occurred during forceful shutdown: ${ + coerceError(e).message + }`, + { error: e }, + ); + } + if (!terminated) { + terminate(); + } + }, + ); }, promise, From 49bcbeec475f82a53b975b8ccb53fccb8c701841 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 19 Nov 2024 17:58:01 +0000 Subject: [PATCH 087/155] Clarify relationship between CompiledSharedOptions and WorkerPluginContext and share relevant types --- src/index.ts | 18 +++++--- src/interfaces.ts | 21 +++++---- src/lib.ts | 113 +++++++++++++++++++++------------------------- src/main.ts | 4 +- src/migrate.ts | 7 ++- 5 files changed, 83 insertions(+), 80 deletions(-) diff --git a/src/index.ts b/src/index.ts index 1d586afc..fa0f1352 100644 --- a/src/index.ts +++ b/src/index.ts @@ -18,12 +18,13 @@ import { WithPgClient, Worker, WorkerEvents, + WorkerPluginBaseContext, WorkerPluginContext, WorkerPool, WorkerSharedOptions, WorkerUtilsOptions, } from "./interfaces"; -import { CompiledSharedOptions, ResolvedWorkerPreset } from "./lib"; +import { CompiledSharedOptions } from "./lib"; export { parseCronItem, parseCronItems, parseCrontab } from "./crontab"; export * from "./interfaces"; export { @@ -46,7 +47,11 @@ declare global { interface Tasks { /* extend this through declaration merging */ } + interface InitEvent { + ctx: WorkerPluginBaseContext; + } interface BootstrapEvent { + ctx: WorkerPluginContext; /** * The client used to perform the bootstrap. Replacing this is not officially * supported, but... @@ -64,6 +69,7 @@ declare global { } interface MigrateEvent { + ctx: WorkerPluginContext; /** * The client used to run the migration. Replacing this is not officially * supported, but... 
@@ -81,11 +87,13 @@ declare global { } interface PoolGracefulShutdownEvent { + ctx: WorkerPluginContext; workerPool: WorkerPool; message: string; } interface PoolForcefulShutdownEvent { + ctx: WorkerPluginContext; workerPool: WorkerPool; message: string; } @@ -348,11 +356,9 @@ declare global { | WorkerOptions | RunOnceOptions | WorkerUtilsOptions, - >(event: { - version: string; - resolvedPreset: ResolvedWorkerPreset; - escapedWorkerSchema: string; - }): CompiledSharedOptions; + >( + event: GraphileWorker.InitEvent, + ): CompiledSharedOptions; /** * Called when installing the Graphile Worker DB schema (or upgrading it). diff --git a/src/interfaces.ts b/src/interfaces.ts index b3478bd4..e9dbea50 100644 --- a/src/interfaces.ts +++ b/src/interfaces.ts @@ -1,7 +1,7 @@ /* eslint-disable @typescript-eslint/ban-types */ import type { EventEmitter } from "events"; import type { Stats } from "fs"; -import { AsyncHooks } from "graphile-config"; +import { AsyncHooks, Middleware } from "graphile-config"; import type { Notification, Pool, @@ -1302,18 +1302,21 @@ export interface FileDetails { export type Writeable = { -readonly [P in keyof T]: T[P] }; -export interface WorkerPluginContext { +// The options available before we connect to the database +export interface WorkerPluginBaseContext { version: string; - maxMigrationNumber: number; - breakingMigrationNumbers: number[]; - events: WorkerEvents; - logger: Logger; + resolvedPreset: ResolvedWorkerPreset; workerSchema: string; escapedWorkerSchema: string; - /** @internal */ - _rawOptions: SharedOptions; + events: WorkerEvents; + logger: Logger; +} +// Once we've connected to the DB, we know more +export interface WorkerPluginContext extends WorkerPluginBaseContext { hooks: AsyncHooks; - resolvedPreset: ResolvedWorkerPreset; + middleware: Middleware; + maxMigrationNumber: number; + breakingMigrationNumbers: number[]; } export type GetJobFunction = ( workerId: string, diff --git a/src/lib.ts b/src/lib.ts index b4003cb6..6396597b 
100644 --- a/src/lib.ts +++ b/src/lib.ts @@ -23,6 +23,7 @@ import { WithPgClient, WorkerEvents, WorkerOptions, + WorkerPluginBaseContext, WorkerPluginContext, WorkerSharedOptions, WorkerUtilsOptions, @@ -52,25 +53,14 @@ export type ResolvedWorkerPreset = GraphileConfig.ResolvedPreset & { }; // NOTE: when you add things here, you may also want to add them to WorkerPluginContext -export interface CompiledSharedOptions< - T extends SharedOptions = SharedOptions, -> { - version: string; - maxMigrationNumber: number; - breakingMigrationNumbers: number[]; - events: WorkerEvents; - logger: Logger; - workerSchema: string; - escapedWorkerSchema: string; +export interface CompiledSharedOptions + extends WorkerPluginContext { /** * DO NOT USE THIS! As we move over to presets this will be removed. * * @internal */ _rawOptions: T; - resolvedPreset: ResolvedWorkerPreset; - hooks: AsyncHooks; - middleware: Middleware; } interface ProcessSharedOptionsSettings { @@ -243,56 +233,55 @@ export function processSharedOptions< } = resolvedPreset; const escapedWorkerSchema = Client.prototype.escapeIdentifier(workerSchema); - compiled = middleware.run( - "init", - { resolvedPreset, escapedWorkerSchema, version }, - () => { - if ( - !Number.isFinite(minResetLockedInterval) || - !Number.isFinite(maxResetLockedInterval) || - minResetLockedInterval < 1 || - maxResetLockedInterval < minResetLockedInterval - ) { - throw new Error( - `Invalid values for minResetLockedInterval (${minResetLockedInterval})/maxResetLockedInterval (${maxResetLockedInterval})`, - ); - } - const hooks = new AsyncHooks(); - const compiled = { - version, - maxMigrationNumber: MAX_MIGRATION_NUMBER, - breakingMigrationNumbers: BREAKING_MIGRATIONS, - events, - logger, - workerSchema, - escapedWorkerSchema, - _rawOptions: options, - hooks, - middleware, - resolvedPreset, - }; - applyHooks( - plugins, - (p) => p.worker?.hooks, - (name, fn, plugin) => { - const context: WorkerPluginContext = compiled!; - // 
eslint-disable-next-line @typescript-eslint/no-explicit-any - const cb = ((...args: any[]) => fn(context, ...args)) as any; - cb.displayName = `${plugin.name}_hook_${name}`; - hooks.hook(name, cb); - }, + const ctx: WorkerPluginBaseContext = { + version, + resolvedPreset, + workerSchema, + escapedWorkerSchema, + events, + logger, + }; + + compiled = middleware.run("init", { ctx }, () => { + if ( + !Number.isFinite(minResetLockedInterval) || + !Number.isFinite(maxResetLockedInterval) || + minResetLockedInterval < 1 || + maxResetLockedInterval < minResetLockedInterval + ) { + throw new Error( + `Invalid values for minResetLockedInterval (${minResetLockedInterval})/maxResetLockedInterval (${maxResetLockedInterval})`, ); - _sharedOptionsCache.set(options, compiled); - // 'init' hook is deprecated; use middleware instead. - Promise.resolve(hooks.process("init")).catch((error) => { - logger.error( - `One of the plugins you are using raised an error during 'init'; but errors during 'init' are currently ignored. Continuing. Error: ${error}`, - { error }, - ); - }); - return compiled; - }, - ) as CompiledSharedOptions; + } + const hooks = new AsyncHooks(); + const compiled: CompiledSharedOptions = Object.assign(ctx, { + hooks, + middleware, + maxMigrationNumber: MAX_MIGRATION_NUMBER, + breakingMigrationNumbers: BREAKING_MIGRATIONS, + _rawOptions: options, + }); + applyHooks( + plugins, + (p) => p.worker?.hooks, + (name, fn, plugin) => { + const context: WorkerPluginContext = compiled!; + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const cb = ((...args: any[]) => fn(context, ...args)) as any; + cb.displayName = `${plugin.name}_hook_${name}`; + hooks.hook(name, cb); + }, + ); + _sharedOptionsCache.set(options, compiled); + // 'init' hook is deprecated; use middleware instead. 
+ Promise.resolve(hooks.process("init")).catch((error) => { + logger.error( + `One of the plugins you are using raised an error during 'init'; but errors during 'init' are currently ignored. Continuing. Error: ${error}`, + { error }, + ); + }); + return compiled; + }) as CompiledSharedOptions; } if (scope) { return { diff --git a/src/main.ts b/src/main.ts index f70b645a..00c7e2f2 100644 --- a/src/main.ts +++ b/src/main.ts @@ -755,7 +755,7 @@ export function _runTaskList( return middleware.run( "poolGracefulShutdown", - { workerPool, message }, + { ctx: compiledSharedOptions, workerPool, message }, async ({ message }) => { events.emit("pool:gracefulShutdown", { pool: workerPool, @@ -876,7 +876,7 @@ export function _runTaskList( } return middleware.run( "poolForcefulShutdown", - { workerPool: this, message }, + { ctx: compiledSharedOptions, workerPool: this, message }, async ({ message }) => { workerPool._forcefulShuttingDown = true; events.emit("pool:forcefulShutdown", { diff --git a/src/migrate.ts b/src/migrate.ts index a7132c74..fde2dd79 100644 --- a/src/migrate.ts +++ b/src/migrate.ts @@ -122,7 +122,12 @@ export async function migrate( compiledSharedOptions; let latestMigration: number | null = null; let latestBreakingMigration: number | null = null; - const event = { client, postgresVersion: 0, scratchpad: Object.create(null) }; + const event = { + ctx: compiledSharedOptions, + client, + postgresVersion: 0, + scratchpad: Object.create(null), + }; for (let attempts = 0; attempts < 2; attempts++) { try { const { From 8035788221f0d293e826fab5ffc1bc7765a992e0 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 19 Nov 2024 18:00:48 +0000 Subject: [PATCH 088/155] Fix lint issues --- src/lib.ts | 5 ++--- src/plugins/LoadTaskFromJsPlugin.ts | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/lib.ts b/src/lib.ts index 6396597b..440c9c72 100644 --- a/src/lib.ts +++ b/src/lib.ts @@ -1,5 +1,4 @@ import * as assert from "assert"; -import { 
EventEmitter } from "events"; import { applyHooks, AsyncHooks, @@ -21,14 +20,13 @@ import { RunOnceOptions, SharedOptions, WithPgClient, - WorkerEvents, WorkerOptions, WorkerPluginBaseContext, WorkerPluginContext, WorkerSharedOptions, WorkerUtilsOptions, } from "./interfaces"; -import { Logger, LogScope } from "./logger"; +import { LogScope } from "./logger"; import { migrate } from "./migrate"; import { WorkerPreset } from "./preset"; import { version } from "./version"; @@ -217,6 +215,7 @@ export function processSharedOptions< resolvedPreset.plugins, (plugin) => plugin.worker?.middleware, (name, fn, _plugin) => { + // eslint-disable-next-line @typescript-eslint/no-explicit-any middleware.register(name, fn as any); }, ); diff --git a/src/plugins/LoadTaskFromJsPlugin.ts b/src/plugins/LoadTaskFromJsPlugin.ts index 14173bc9..64b1e124 100644 --- a/src/plugins/LoadTaskFromJsPlugin.ts +++ b/src/plugins/LoadTaskFromJsPlugin.ts @@ -4,7 +4,6 @@ import { pathToFileURL } from "url"; import { FileDetails, isValidTask } from "../index.js"; import { coerceError } from "../lib.js"; import { version } from "../version.js"; -import { coerceError } from "../lib.js"; const DEFAULT_EXTENSIONS = [".js", ".mjs", ".cjs"]; From ddce61f31c0772a28d547d8fa7f4812f67f32214 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 19 Nov 2024 18:02:43 +0000 Subject: [PATCH 089/155] Only shutdown if we're not already doing so --- src/main.ts | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/main.ts b/src/main.ts index 00c7e2f2..86fd2fd9 100644 --- a/src/main.ts +++ b/src/main.ts @@ -1124,9 +1124,11 @@ export function _runTaskList( } workerPool._workers.splice(workerPool._workers.indexOf(worker), 1); if (!continuous && workerPool._workers.length === 0) { - workerPool.gracefulShutdown( - "'Run once' mode processed all available jobs and is now exiting", - ); + if (!workerPool._shuttingDown) { + workerPool.gracefulShutdown( + "'Run once' mode processed all available jobs 
and is now exiting", + ); + } } }; worker.promise.then( From 18835b111527811b7895afe35e8367277689f0e0 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Wed, 20 Nov 2024 16:34:00 +0000 Subject: [PATCH 090/155] Fix onTerminate handling --- src/main.ts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main.ts b/src/main.ts index 86fd2fd9..a8139dca 100644 --- a/src/main.ts +++ b/src/main.ts @@ -659,8 +659,7 @@ export function _runTaskList( allWorkerPools.splice(idx, 1); try { - const result = onTerminate?.(); - promise.resolve(result); + await onTerminate?.(); } catch (e) { errors.push(coerceError(e)); } @@ -673,6 +672,8 @@ export function _runTaskList( "Errors occurred whilst terminating queue", ), ); + } else { + promise.resolve(); } if (unregisterSignalHandlers) { From c83069afbc55cab2e2a81ffaa562437092c58adb Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Wed, 20 Nov 2024 16:34:41 +0000 Subject: [PATCH 091/155] Forceful shutdown should error if something went wrong --- src/main.ts | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/main.ts b/src/main.ts index a8139dca..7a5def4d 100644 --- a/src/main.ts +++ b/src/main.ts @@ -859,7 +859,7 @@ export function _runTaskList( return this.forcefulShutdown(message); } if (!terminated) { - terminate(); + await terminate(); } }, ); @@ -946,15 +946,18 @@ export function _runTaskList( workerPool, error: e, }); + const error = coerceError(e); logger.error( - `Error occurred during forceful shutdown: ${ - coerceError(e).message - }`, + `Error occurred during forceful shutdown: ${error.message}`, { error: e }, ); + if (!terminated) { + await terminate(error); + } + throw e; } if (!terminated) { - terminate(); + await terminate(); } }, ); From 40afb61a2ec05a6b1e27c425a7b1fab762018add Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Wed, 20 Nov 2024 16:35:56 +0000 Subject: [PATCH 092/155] Forceful shutdown should result in promise rejection --- src/main.ts | 3 ++- 1 file 
changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main.ts b/src/main.ts index 7a5def4d..3877b9b2 100644 --- a/src/main.ts +++ b/src/main.ts @@ -856,6 +856,7 @@ export function _runTaskList( error: e, }, ); + // Note: we now rely on forcefulShutdown to handle terminate() return this.forcefulShutdown(message); } if (!terminated) { @@ -957,7 +958,7 @@ export function _runTaskList( throw e; } if (!terminated) { - await terminate(); + await terminate(new Error("Forceful shutdown")); } }, ); From cf503925fc1d6be1a2e24ae49deb226e2c91eaa0 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Wed, 20 Nov 2024 16:47:41 +0000 Subject: [PATCH 093/155] Refactoring to ensure all cases are handled --- src/localQueue.ts | 69 +++++++++++++++++++++++++++++------------------ 1 file changed, 43 insertions(+), 26 deletions(-) diff --git a/src/localQueue.ts b/src/localQueue.ts index 30d685d1..da779bc5 100644 --- a/src/localQueue.ts +++ b/src/localQueue.ts @@ -109,6 +109,7 @@ export class LocalQueue { fetchAgain = false; public readonly mode: LocalQueueMode = STARTING; private promise = defer(); + /** A count of the number of "background" processes such as fetching or returning jobs */ private backgroundCount = 0; /** If `localQueueRefetchDelay` is configured; set this true if the fetch resulted in a queue size lower than the threshold. 
*/ @@ -576,13 +577,8 @@ export class LocalQueue { } private setModeReleased() { - assert.notEqual( - this.mode, - RELEASED, - "LocalQueue must only be released once", - ); - const oldMode = this.mode; + assert.notEqual(oldMode, RELEASED, "LocalQueue must only be released once"); this.setMode(RELEASED); if (this.refetchDelayTimer != null) { @@ -591,28 +587,49 @@ export class LocalQueue { } this.refetchDelayActive = false; - if (oldMode === POLLING) { - // Release pending workers - const workers = this.workerQueue.splice(0, this.workerQueue.length); - workers.forEach((w) => w.resolve(undefined)); - - // Release next fetch call - if (this.fetchTimer) { - clearTimeout(this.fetchTimer); - this.fetchTimer = null; - this.promise.resolve(); - } else { - // Rely on checking mode at end of fetch + switch (oldMode) { + case POLLING: { + // Release pending workers + const futureJobs = this.workerQueue.splice(0, this.workerQueue.length); + futureJobs.forEach((futureJob) => futureJob.resolve(undefined)); + + // Release next fetch call + if (this.fetchTimer) { + clearTimeout(this.fetchTimer); + this.fetchTimer = null; + this.promise.resolve(); + } else { + // Rely on checking mode at end of fetch + } + // No need to return jobs + break; } - } else if (oldMode === WAITING) { - if (this.ttlExpiredTimer) { - clearTimeout(this.ttlExpiredTimer); - this.ttlExpiredTimer = null; + case WAITING: { + if (this.ttlExpiredTimer) { + clearTimeout(this.ttlExpiredTimer); + this.ttlExpiredTimer = null; + } + // Trigger the jobs to be released + // NOTE: this will add to backgroundCount + this.returnJobs(); + break; + } + case TTL_EXPIRED: { + // No action necessary + break; + } + case STARTING: { + // From STARTING to RELEASED directly? This should never happen! + break; + } + case RELEASED: { + // Explicitly ruled against via assertion above. 
+ break; + } + default: { + const never: never = oldMode; + throw new Error(`Unhandled mode: ${never}`); } - // Trigger the jobs to be released - this.returnJobs(); - } else if (oldMode === TTL_EXPIRED) { - // No action necessary } if (this.backgroundCount === 0) { From 6afe77361034fac448f3d1e247773e1f044788d1 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Wed, 20 Nov 2024 16:47:57 +0000 Subject: [PATCH 094/155] We don't release job releasers here any more. --- src/main.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main.ts b/src/main.ts index 3877b9b2..4dd33861 100644 --- a/src/main.ts +++ b/src/main.ts @@ -766,7 +766,6 @@ export function _runTaskList( try { logger.debug(`Attempting graceful shutdown`); // Stop new jobs being added - // TODO: releasing the job releasers BEFORE we release the workers doesn't make any sense? const deactivatePromise = deactivate(); const errors: Error[] = []; From 4bb7227d65a08c646abd85dcdaa6a9c0448591d4 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Wed, 20 Nov 2024 17:33:14 +0000 Subject: [PATCH 095/155] Ensure that LocalQueue exits with the correct status (e.g. rejects if returning jobs fails) --- src/localQueue.ts | 114 +++++++++++++++++++++++++++++++++------------- 1 file changed, 82 insertions(+), 32 deletions(-) diff --git a/src/localQueue.ts b/src/localQueue.ts index da779bc5..41a78f9f 100644 --- a/src/localQueue.ts +++ b/src/localQueue.ts @@ -10,6 +10,7 @@ import { import { MINUTE, SECOND } from "./cronConstants"; import defer, { Deferred } from "./deferred"; import { GetJobFunction, Job, TaskList, WorkerPool } from "./interfaces"; +import { coerceError } from "./lib"; import { getJob as baseGetJob } from "./sql/getJob"; import { returnJobs } from "./sql/returnJobs"; @@ -108,7 +109,9 @@ export class LocalQueue { // when the queue is pulsed during a fetch. 
fetchAgain = false; public readonly mode: LocalQueueMode = STARTING; - private promise = defer(); + /** The promise that resolves/rejects when the queue is disposed of */ + private _finPromise = defer(); + private errors: Error[] = []; /** A count of the number of "background" processes such as fetching or returning jobs */ private backgroundCount = 0; @@ -161,10 +164,37 @@ export class LocalQueue { }); } + private fin() { + assert.equal(this.mode, "RELEASED"); + assert.equal(this.backgroundCount, 0); + if (this.errors.length === 1) { + this._finPromise.reject(this.errors[0]); + } else if (this.errors.length > 1) { + this._finPromise.reject(new AggregateError(this.errors)); + } else { + this._finPromise.resolve(); + } + } + private decreaseBackgroundCount = () => { this.backgroundCount--; if (this.mode === "RELEASED" && this.backgroundCount === 0) { - this.promise.resolve(); + this.fin(); + } + }; + + private decreaseBackgroundCountWithError = (e: unknown) => { + this.backgroundCount--; + if (this.mode === "RELEASED") { + this.errors.push(coerceError(e)); + if (this.backgroundCount === 0) { + this.fin(); + } + } else { + this.compiledSharedOptions.logger.error( + `Backgrounding should never yield errors when the queue is not RELEASED`, + { error: e }, + ); } }; @@ -172,6 +202,9 @@ export class LocalQueue { * For promises that happen in the background, but that we want to ensure are * handled before we release the queue (so that the database pool isn't * released too early). + * + * IMPORTANT: never raise an error from background unless mode === "RELEASED" - you + * need to handle errors yourself! 
*/ private background(promise: Promise) { if (this.mode === "RELEASED" && this.backgroundCount === 0) { @@ -180,7 +213,10 @@ export class LocalQueue { ); } this.backgroundCount++; - promise.then(this.decreaseBackgroundCount, this.decreaseBackgroundCount); + promise.then( + this.decreaseBackgroundCount, + this.decreaseBackgroundCountWithError, + ); } private setModePolling() { @@ -265,7 +301,11 @@ export class LocalQueue { } private returnJobs() { - const jobsToReturn = this.jobQueue.splice(0, this.jobQueue.length); + const l = this.jobQueue.length; + if (l === 0) { + return; + } + const jobsToReturn = this.jobQueue.splice(0, l); this.compiledSharedOptions.events.emit("localQueue:returnJobs", { localQueue: this, jobs: jobsToReturn, @@ -279,16 +319,39 @@ export class LocalQueue { ).then( () => {}, (e) => { - // TODO: handle this better! - this.compiledSharedOptions.logger.error( - `Failed to return jobs from local queue to database queue`, - { error: e }, - ); + if (this.mode === "RELEASED") { + throw new Error( + `Error occurred whilst returning jobs from local queue to database queue: ${ + coerceError(e).message + }`, + ); + } else { + // Return the jobs to the queue; MUST NOT HAPPEN IN RELEASED MODE. 
+ this.receivedJobs(jobsToReturn); + this.compiledSharedOptions.logger.error( + `Failed to return jobs from local queue to database queue`, + { error: e }, + ); + } }, ), ); } + private receivedJobs(jobs: Job[]) { + const jobCount = jobs.length; + const workerCount = Math.min(jobCount, this.workerQueue.length); + const workers = this.workerQueue.splice(0, workerCount); + for (let i = 0; i < jobCount; i++) { + const job = jobs[i]; + if (i < workerCount) { + workers[i].resolve(job); + } else { + this.jobQueue.push(job); + } + } + } + private fetch = (): void => { if (this.fetchTimer) { clearTimeout(this.fetchTimer); @@ -364,11 +427,6 @@ export class LocalQueue { jobs, }); - assert.equal( - this.jobQueue.length, - 0, - "Should not fetch when job queue isn't empty (recheck)", - ); jobCount = jobs.length; fetchedMax = jobCount >= this.getJobBatchSize; refetchDelayThresholdSurpassed = @@ -381,17 +439,7 @@ export class LocalQueue { // NOTE: we don't need to handle `this.mode === RELEASED` here because // being in that mode guarantees the workerQueue is empty. - - const workerCount = Math.min(jobCount, this.workerQueue.length); - const workers = this.workerQueue.splice(0, workerCount); - for (let i = 0; i < jobCount; i++) { - const job = jobs[i]; - if (i < workerCount) { - workers[i].resolve(job); - } else { - this.jobQueue.push(job); - } - } + this.receivedJobs(jobs); } catch (e) { // Error happened; rely on poll interval. 
this.compiledSharedOptions.logger.error( @@ -573,7 +621,7 @@ export class LocalQueue { if (this.mode !== "RELEASED") { this.setModeReleased(); } - return this.promise; + return this._finPromise; } private setModeReleased() { @@ -594,14 +642,16 @@ export class LocalQueue { futureJobs.forEach((futureJob) => futureJob.resolve(undefined)); // Release next fetch call - if (this.fetchTimer) { + if (this.fetchTimer != null) { + // No need to return jobs in POLLING mode clearTimeout(this.fetchTimer); this.fetchTimer = null; - this.promise.resolve(); } else { - // Rely on checking mode at end of fetch + // There's a fetch in progress, so backgroundCount will not be 0, and + // fetch handles calling returnJobs if it completes when in RELEASED + // mode. } - // No need to return jobs + break; } case WAITING: { @@ -615,7 +665,7 @@ export class LocalQueue { break; } case TTL_EXPIRED: { - // No action necessary + // No action necessary, jobs are already returned break; } case STARTING: { @@ -633,7 +683,7 @@ export class LocalQueue { } if (this.backgroundCount === 0) { - this.promise.resolve(); + this.fin(); } } } From 4b6b5dc739659e60327e23be9ced370222af4c96 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Wed, 20 Nov 2024 17:34:10 +0000 Subject: [PATCH 096/155] Fix types in a test --- __tests__/migrate.test.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/__tests__/migrate.test.ts b/__tests__/migrate.test.ts index e2d76d1b..03b00514 100644 --- a/__tests__/migrate.test.ts +++ b/__tests__/migrate.test.ts @@ -238,6 +238,7 @@ test("throws helpful error message in migration 11", async () => { // Manually run the first 10 migrations const event = { + ctx: compiledSharedOptions, client: pgClient, postgresVersion: 120000, // TODO: use the actual postgres version scratchpad: Object.create(null), From 9848f8cd9f46a954ce592954eac3c07780cff39b Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Wed, 20 Nov 2024 17:55:30 +0000 Subject: [PATCH 097/155] I threw it myself, I know what it 
is --- src/main.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.ts b/src/main.ts index 4dd33861..ef89554d 100644 --- a/src/main.ts +++ b/src/main.ts @@ -685,7 +685,7 @@ export function _runTaskList( `Graphile Worker internal error: terminate() was called twice for worker pool. Ignoring second call; but this indicates a bug - please file an issue.`, ); } catch (e) { - logger.error(String(coerceError(e).stack)); + logger.error(String((e as Error).stack)); } } } From b62b8e396b530859b3171fc744e449fc5281f363 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Wed, 20 Nov 2024 18:07:11 +0000 Subject: [PATCH 098/155] Ensure deactivate happens at most once, and yields same errors if called a second time (e.g. from forcefulShutdown) --- src/main.ts | 61 +++++++++++++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/src/main.ts b/src/main.ts index ef89554d..c9f945d4 100644 --- a/src/main.ts +++ b/src/main.ts @@ -590,35 +590,42 @@ export function _runTaskList( const promise = defer(); - async function deactivate() { - if (workerPool._active) { - workerPool._active = false; - const errors: Error[] = []; - try { - await localQueue?.release(); - } catch (rawE) { - const e = coerceError(rawE); - errors.push(e); - // Log but continue regardless - logger.error(`Releasing local queue failed: ${e}`, { error: rawE }); - } - try { - // Note: this runs regardless of success of the above - await onDeactivate?.(); - } catch (rawE) { - const e = coerceError(rawE); - errors.push(e); - // Log but continue regardless - logger.error(`onDeactivate raised an error: ${e}`, { error: rawE }); - } + let deactivatePromise: Promise | null = null; - if (errors.length > 0) { - throw new AggregateError( - errors, - "Errors occurred whilst deactivating queue", - ); - } + function deactivate() { + if (!deactivatePromise) { + deactivatePromise = (async () => { + if (workerPool._active) { + workerPool._active = false; + const 
errors: Error[] = []; + try { + await localQueue?.release(); + } catch (rawE) { + const e = coerceError(rawE); + errors.push(e); + // Log but continue regardless + logger.error(`Releasing local queue failed: ${e}`, { error: rawE }); + } + try { + // Note: this runs regardless of success of the above + await onDeactivate?.(); + } catch (rawE) { + const e = coerceError(rawE); + errors.push(e); + // Log but continue regardless + logger.error(`onDeactivate raised an error: ${e}`, { error: rawE }); + } + + if (errors.length > 0) { + throw new AggregateError( + errors, + "Errors occurred whilst deactivating queue", + ); + } + } + })(); } + return deactivatePromise; } let terminated = false; From 0bdcd9fa60db7ab20451c938225b0465dc0b6ddf Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Wed, 20 Nov 2024 18:19:52 +0000 Subject: [PATCH 099/155] Ensure forcefulShutdown and gracefulShutdown yield the same promises when called a second time --- src/main.ts | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/src/main.ts b/src/main.ts index c9f945d4..7c41928c 100644 --- a/src/main.ts +++ b/src/main.ts @@ -14,6 +14,7 @@ import { FailJobFunction, GetJobFunction, Job, + PromiseOrDirect, RunOnceOptions, TaskList, WorkerEventMap, @@ -707,6 +708,9 @@ export function _runTaskList( // Make sure Node doesn't get upset about unhandled rejection abortPromise.then(null, () => /* noop */ void 0); + let gracefulShutdownPromise: PromiseOrDirect; + let forcefulShutdownPromise: PromiseOrDirect; + // This is a representation of us that can be interacted with externally const workerPool: WorkerPool = { // "otpool" - "one time pool" @@ -746,22 +750,17 @@ export function _runTaskList( logger.error( `gracefulShutdown called when forcefulShutdown is already in progress`, ); - return; + return forcefulShutdownPromise; } if (workerPool._shuttingDown) { logger.error( `gracefulShutdown called when gracefulShutdown is already in progress`, ); - return; + 
return gracefulShutdownPromise; } - workerPool._shuttingDown = true; - - const abortTimer = setTimeout(() => { - abortController.abort(); - }, gracefulShutdownAbortTimeout); - abortTimer.unref(); - return middleware.run( + workerPool._shuttingDown = true; + gracefulShutdownPromise = middleware.run( "poolGracefulShutdown", { ctx: compiledSharedOptions, workerPool, message }, async ({ message }) => { @@ -870,6 +869,13 @@ export function _runTaskList( } }, ); + + const abortTimer = setTimeout(() => { + abortController.abort(); + }, gracefulShutdownAbortTimeout); + abortTimer.unref(); + + return gracefulShutdownPromise; }, /** @@ -880,13 +886,14 @@ export function _runTaskList( logger.error( `forcefulShutdown called when forcefulShutdown is already in progress`, ); - return; + return forcefulShutdownPromise; } - return middleware.run( + + workerPool._forcefulShuttingDown = true; + forcefulShutdownPromise = middleware.run( "poolForcefulShutdown", { ctx: compiledSharedOptions, workerPool: this, message }, async ({ message }) => { - workerPool._forcefulShuttingDown = true; events.emit("pool:forcefulShutdown", { pool: workerPool, workerPool, @@ -968,6 +975,8 @@ export function _runTaskList( } }, ); + + return forcefulShutdownPromise; }, promise, From 6bc93febb796ef98b7a2cb190352c6044c609b20 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Wed, 20 Nov 2024 18:20:15 +0000 Subject: [PATCH 100/155] Graceful shutdown should not complete if forceful shutdown has begun --- src/main.ts | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/main.ts b/src/main.ts index 7c41928c..ed172c34 100644 --- a/src/main.ts +++ b/src/main.ts @@ -815,7 +815,7 @@ export function _runTaskList( } } } - if (jobsToRelease.length > 0) { + if (!this._forcefulShuttingDown && jobsToRelease.length > 0) { const workerIds = workers.map((worker) => worker.workerId); logger.debug( `Releasing the jobs ${jobsToRelease @@ -837,7 +837,16 @@ export function _runTaskList( 
cancelledJobs, }); } - if (errors.length > 0) { + if (this._forcefulShuttingDown) { + errors.push( + new Error( + "forcefulShutdown was initiated whilst gracefulShutdown was still executing.", + ), + ); + } + if (errors.length === 1) { + throw errors[0]; + } else if (errors.length > 1) { throw new AggregateError( errors, "Errors occurred whilst shutting down worker", From b3150ac30b73be3c6d79ea2815b873cd7299ef7f Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Wed, 20 Nov 2024 18:21:47 +0000 Subject: [PATCH 101/155] Cleanup gracefulShutdown handover to forcefulShutdown --- src/main.ts | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/main.ts b/src/main.ts index ed172c34..0e82ef0a 100644 --- a/src/main.ts +++ b/src/main.ts @@ -837,6 +837,7 @@ export function _runTaskList( cancelledJobs, }); } + if (this._forcefulShuttingDown) { errors.push( new Error( @@ -844,6 +845,7 @@ export function _runTaskList( ), ); } + if (errors.length === 1) { throw errors[0]; } else if (errors.length > 1) { @@ -870,8 +872,13 @@ export function _runTaskList( error: e, }, ); - // Note: we now rely on forcefulShutdown to handle terminate() - return this.forcefulShutdown(message); + // NOTE: we now rely on forcefulShutdown to handle terminate() + if (this._forcefulShuttingDown) { + // Skip the warning about double shutdown + return forcefulShutdownPromise; + } else { + return this.forcefulShutdown(message); + } } if (!terminated) { await terminate(); From a561febe93205a3390b9fdc6c07e8b7bb16b7b0b Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Wed, 20 Nov 2024 18:29:04 +0000 Subject: [PATCH 102/155] More consistently handle errors in forcefulShutdown --- src/main.ts | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/src/main.ts b/src/main.ts index 0e82ef0a..6a5b0ab5 100644 --- a/src/main.ts +++ b/src/main.ts @@ -854,6 +854,7 @@ export function _runTaskList( "Errors occurred whilst shutting down 
worker", ); } + events.emit("pool:gracefulShutdown:complete", { pool: workerPool, workerPool, @@ -868,9 +869,7 @@ export function _runTaskList( const message = coerceError(e).message; logger.error( `Error occurred during graceful shutdown: ${message}`, - { - error: e, - }, + { error: e }, ); // NOTE: we now rely on forcefulShutdown to handle terminate() if (this._forcefulShuttingDown) { @@ -920,6 +919,8 @@ export function _runTaskList( // Stop new jobs being added const deactivatePromise = deactivate(); + const errors: Error[] = []; + // Release all our workers' jobs const workers = [...workerPool._workers]; const jobsInProgress: Array = workers @@ -939,6 +940,7 @@ export function _runTaskList( logger.error(`Deactivation failed: ${deactivateResult.reason}`, { error: deactivateResult.reason, }); + errors.push(coerceError(deactivateResult.reason)); } if (jobsInProgress.length > 0) { @@ -952,19 +954,33 @@ export function _runTaskList( workerIds, }, ); - const cancelledJobs = await failJobs( - compiledSharedOptions, - withPgClient, - workerPool.id, - jobsInProgress, - message, - ); - logger.debug(`Cancelled ${cancelledJobs.length} jobs`, { - cancelledJobs, - }); + try { + const cancelledJobs = await failJobs( + compiledSharedOptions, + withPgClient, + workerPool.id, + jobsInProgress, + message, + ); + logger.debug(`Cancelled ${cancelledJobs.length} jobs`, { + cancelledJobs, + }); + } catch (e) { + errors.push(coerceError(e)); + } } else { logger.debug("No active jobs to release"); } + + if (errors.length === 1) { + throw errors[0]; + } else if (errors.length > 1) { + throw new AggregateError( + errors, + "Errors occurred whilst forcefully shutting down worker", + ); + } + events.emit("pool:forcefulShutdown:complete", { pool: workerPool, workerPool, From 3c3a237e357244285837946b7fda3bb21f23ae0d Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Wed, 20 Nov 2024 18:43:26 +0000 Subject: [PATCH 103/155] Move promise handling outside of the gracefulShutdown/forcefulShutdown 
hooks --- src/index.ts | 4 ++-- src/main.ts | 65 +++++++++++++++++++++++++++++++--------------------- 2 files changed, 41 insertions(+), 28 deletions(-) diff --git a/src/index.ts b/src/index.ts index fa0f1352..76628deb 100644 --- a/src/index.ts +++ b/src/index.ts @@ -372,11 +372,11 @@ declare global { poolGracefulShutdown( event: GraphileWorker.PoolGracefulShutdownEvent, - ): PromiseOrDirect; + ): Promise; poolForcefulShutdown( event: GraphileWorker.PoolForcefulShutdownEvent, - ): PromiseOrDirect; + ): Promise; } interface WorkerHooks { diff --git a/src/main.ts b/src/main.ts index 6a5b0ab5..611c695d 100644 --- a/src/main.ts +++ b/src/main.ts @@ -14,7 +14,6 @@ import { FailJobFunction, GetJobFunction, Job, - PromiseOrDirect, RunOnceOptions, TaskList, WorkerEventMap, @@ -589,7 +588,7 @@ export function _runTaskList( unregisterSignalHandlers = registerSignalHandlers(logger, events); } - const promise = defer(); + const _finPromise = defer(); let deactivatePromise: Promise | null = null; @@ -671,21 +670,14 @@ export function _runTaskList( } catch (e) { errors.push(coerceError(e)); } + if (errors.length === 1) { - promise.reject(errors[0]); + throw errors[0]; } else if (errors.length > 1) { - promise.reject( - new AggregateError( - errors, - "Errors occurred whilst terminating queue", - ), + throw new AggregateError( + errors, + "Errors occurred whilst terminating queue", ); - } else { - promise.resolve(); - } - - if (unregisterSignalHandlers) { - unregisterSignalHandlers(); } } else { try { @@ -708,8 +700,22 @@ export function _runTaskList( // Make sure Node doesn't get upset about unhandled rejection abortPromise.then(null, () => /* noop */ void 0); - let gracefulShutdownPromise: PromiseOrDirect; - let forcefulShutdownPromise: PromiseOrDirect; + let gracefulShutdownPromise: Promise | null = null; + let forcefulShutdownPromise: Promise | null = null; + + const finWithError = (e: unknown) => { + const error = e != null ? 
coerceError(e) : null; + if (error) { + _finPromise.reject(error); + } else { + _finPromise.resolve(); + } + + if (unregisterSignalHandlers) { + unregisterSignalHandlers(); + } + }; + const fin = () => finWithError(null); // This is a representation of us that can be interacted with externally const workerPool: WorkerPool = { @@ -750,13 +756,13 @@ export function _runTaskList( logger.error( `gracefulShutdown called when forcefulShutdown is already in progress`, ); - return forcefulShutdownPromise; + return forcefulShutdownPromise!; } if (workerPool._shuttingDown) { logger.error( `gracefulShutdown called when gracefulShutdown is already in progress`, ); - return gracefulShutdownPromise; + return gracefulShutdownPromise!; } workerPool._shuttingDown = true; @@ -874,7 +880,7 @@ export function _runTaskList( // NOTE: we now rely on forcefulShutdown to handle terminate() if (this._forcefulShuttingDown) { // Skip the warning about double shutdown - return forcefulShutdownPromise; + return forcefulShutdownPromise!; } else { return this.forcefulShutdown(message); } @@ -885,6 +891,8 @@ export function _runTaskList( }, ); + gracefulShutdownPromise.then(fin, finWithError); + const abortTimer = setTimeout(() => { abortController.abort(); }, gracefulShutdownAbortTimeout); @@ -901,7 +909,7 @@ export function _runTaskList( logger.error( `forcefulShutdown called when forcefulShutdown is already in progress`, ); - return forcefulShutdownPromise; + return forcefulShutdownPromise!; } workerPool._forcefulShuttingDown = true; @@ -998,29 +1006,34 @@ export function _runTaskList( { error: e }, ); if (!terminated) { + // Guaranteed to throw await terminate(error); } - throw e; + throw error; } if (!terminated) { + // Guaranteed to throw await terminate(new Error("Forceful shutdown")); } }, ); + // This should never call fin() since forceful shutdown always errors + forcefulShutdownPromise.then(fin, finWithError); + return forcefulShutdownPromise; }, - promise, + promise: _finPromise, 
then(onfulfilled, onrejected) { - return promise.then(onfulfilled, onrejected); + return _finPromise.then(onfulfilled, onrejected); }, catch(onrejected) { - return promise.catch(onrejected); + return _finPromise.catch(onrejected); }, finally(onfinally) { - return promise.finally(onfinally); + return _finPromise.finally(onfinally); }, _start: autostart ? null @@ -1031,7 +1044,7 @@ export function _runTaskList( }, }; - promise.finally(() => { + _finPromise.finally(() => { events.emit("pool:release", { pool: workerPool, workerPool }); }); From 28f3b0eaa88e6600e4bf6e8828f08f71d1e5d94a Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Wed, 20 Nov 2024 18:43:59 +0000 Subject: [PATCH 104/155] Only finish once --- src/main.ts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/main.ts b/src/main.ts index 611c695d..1a79d3cf 100644 --- a/src/main.ts +++ b/src/main.ts @@ -703,7 +703,12 @@ export function _runTaskList( let gracefulShutdownPromise: Promise | null = null; let forcefulShutdownPromise: Promise | null = null; + let finished = false; const finWithError = (e: unknown) => { + if (finished) { + return; + } + finished = true; const error = e != null ? 
coerceError(e) : null; if (error) { _finPromise.reject(error); From 7aa5955eec8f11b1d78c893083d527a8d1d71d1b Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 22 Nov 2024 11:02:54 +0000 Subject: [PATCH 105/155] The _moment_ that deactivate is called, all future getJob calls should immediately return undefined --- src/main.ts | 63 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 27 deletions(-) diff --git a/src/main.ts b/src/main.ts index 1a79d3cf..a1a90bc9 100644 --- a/src/main.ts +++ b/src/main.ts @@ -1,3 +1,4 @@ +import * as assert from "assert"; import { randomBytes } from "crypto"; import { EventEmitter } from "events"; import { Notification, Pool, PoolClient } from "pg"; @@ -594,34 +595,34 @@ export function _runTaskList( function deactivate() { if (!deactivatePromise) { + assert.equal(workerPool._active, true); + workerPool._active = false; + deactivatePromise = (async () => { - if (workerPool._active) { - workerPool._active = false; - const errors: Error[] = []; - try { - await localQueue?.release(); - } catch (rawE) { - const e = coerceError(rawE); - errors.push(e); - // Log but continue regardless - logger.error(`Releasing local queue failed: ${e}`, { error: rawE }); - } - try { - // Note: this runs regardless of success of the above - await onDeactivate?.(); - } catch (rawE) { - const e = coerceError(rawE); - errors.push(e); - // Log but continue regardless - logger.error(`onDeactivate raised an error: ${e}`, { error: rawE }); - } + const errors: Error[] = []; + try { + await localQueue?.release(); + } catch (rawE) { + const e = coerceError(rawE); + errors.push(e); + // Log but continue regardless + logger.error(`Releasing local queue failed: ${e}`, { error: rawE }); + } + try { + // Note: this runs regardless of success of the above + await onDeactivate?.(); + } catch (rawE) { + const e = coerceError(rawE); + errors.push(e); + // Log but continue regardless + logger.error(`onDeactivate raised an error: ${e}`, 
{ error: rawE }); + } - if (errors.length > 0) { - throw new AggregateError( - errors, - "Errors occurred whilst deactivating queue", - ); - } + if (errors.length > 0) { + throw new AggregateError( + errors, + "Errors occurred whilst deactivating queue", + ); } })(); } @@ -1085,8 +1086,16 @@ export function _runTaskList( ) : null; const getJob: GetJobFunction = localQueue - ? localQueue.getJob // Already bound + ? async (workerId, flagsToSkip) => { + if (!workerPool._active) { + return undefined; + } + return localQueue.getJob(workerId, flagsToSkip); + } : async (_workerId, flagsToSkip) => { + if (!workerPool._active) { + return undefined; + } const jobs = await baseGetJob( compiledSharedOptions, withPgClient, From 174c76c4ac47930566f4ad429bfbabaaea4c87e5 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 22 Nov 2024 11:06:25 +0000 Subject: [PATCH 106/155] Apply timeouts to forceful shutdown --- src/main.ts | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/main.ts b/src/main.ts index a1a90bc9..50a704e6 100644 --- a/src/main.ts +++ b/src/main.ts @@ -930,8 +930,18 @@ export function _runTaskList( }); try { logger.debug(`Attempting forceful shutdown`); + const timeout = new Promise((_resolve, reject) => { + const t = setTimeout( + () => reject(new Error("Timed out")), + 5000 /* TODO: make configurable */, + ); + t.unref(); + }); + // Stop new jobs being added - const deactivatePromise = deactivate(); + // NOTE: deactivate() immediately stops getJob working, even if the + // promise takes a while to resolve. 
+ const deactivatePromise = Promise.race([deactivate(), timeout]); const errors: Error[] = []; @@ -943,10 +953,12 @@ export function _runTaskList( // Remove all the workers - we're shutting them down manually const workerPromises = workers.map((worker) => - worker.release(true), + // Note force=true means that this completes immediately _except_ + // it still calls the `stopWorker` async hook, so we must still + // handle a timeout. + Promise.race([worker.release(true), timeout]), ); // Ignore the results, we're shutting down anyway - // TODO: add a timeout const [deactivateResult, ..._ignoreWorkerReleaseResults] = await Promise.allSettled([deactivatePromise, ...workerPromises]); if (deactivateResult.status === "rejected") { @@ -1024,7 +1036,6 @@ export function _runTaskList( }, ); - // This should never call fin() since forceful shutdown always errors forcefulShutdownPromise.then(fin, finWithError); return forcefulShutdownPromise; From afcaaed64936ed778acd03d998cfc42b30d46328 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 22 Nov 2024 14:24:41 +0000 Subject: [PATCH 107/155] forcefulShutdown can be successful; channel errors separately --- src/index.ts | 4 +- src/interfaces.ts | 4 +- src/main.ts | 192 ++++++++++++++++++++++++++++++++-------------- 3 files changed, 138 insertions(+), 62 deletions(-) diff --git a/src/index.ts b/src/index.ts index 76628deb..641b018e 100644 --- a/src/index.ts +++ b/src/index.ts @@ -372,11 +372,11 @@ declare global { poolGracefulShutdown( event: GraphileWorker.PoolGracefulShutdownEvent, - ): Promise; + ): ReturnType; poolForcefulShutdown( event: GraphileWorker.PoolForcefulShutdownEvent, - ): Promise; + ): ReturnType; } interface WorkerHooks { diff --git a/src/interfaces.ts b/src/interfaces.ts index e9dbea50..558d771d 100644 --- a/src/interfaces.ts +++ b/src/interfaces.ts @@ -517,7 +517,9 @@ export interface WorkerPool { /** @deprecated Use gracefulShutdown instead */ release: () => PromiseOrDirect; gracefulShutdown: (message?: 
string) => PromiseOrDirect; - forcefulShutdown: (message: string) => PromiseOrDirect; + forcefulShutdown: (message: string) => PromiseOrDirect<{ + forceFailedJobs: readonly Job[]; + }>; promise: Promise; /** Fires 'abort' when all running jobs should stop because worker is shutting down. @experimental */ abortSignal: AbortSignal; diff --git a/src/main.ts b/src/main.ts index 50a704e6..3bc28be9 100644 --- a/src/main.ts +++ b/src/main.ts @@ -589,6 +589,8 @@ export function _runTaskList( unregisterSignalHandlers = registerSignalHandlers(logger, events); } + /* Errors that should be raised from the workerPool.promise (i.e. _finPromise) */ + const _finErrors: Error[] = []; const _finPromise = defer(); let deactivatePromise: Promise | null = null; @@ -630,10 +632,13 @@ export function _runTaskList( } let terminated = false; - async function terminate(error?: Error) { + async function terminate() { if (!terminated) { terminated = true; + /* Errors that should be raised from terminate() itself */ + const terminateErrors: Error[] = []; + const releaseCompleteJobPromise = releaseCompleteJob?.(); const releaseFailJobPromise = releaseFailJob?.(); const [releaseCompleteJobResult, releaseFailJobResult] = @@ -641,9 +646,10 @@ export function _runTaskList( releaseCompleteJobPromise, releaseFailJobPromise, ]); - const errors: Error[] = error ? 
[error] : []; if (releaseCompleteJobResult.status === "rejected") { - errors.push(coerceError(releaseCompleteJobResult.reason)); + const error = coerceError(releaseCompleteJobResult.reason); + _finErrors.push(error); + terminateErrors.push(error); // Log but continue regardless logger.error( `Releasing complete job batcher failed: ${releaseCompleteJobResult.reason}`, @@ -653,7 +659,9 @@ export function _runTaskList( ); } if (releaseFailJobResult.status === "rejected") { - errors.push(coerceError(releaseFailJobResult.reason)); + const error = coerceError(releaseFailJobResult.reason); + _finErrors.push(error); + terminateErrors.push(error); // Log but continue regardless logger.error( `Releasing failed job batcher failed: ${releaseFailJobResult.reason}`, @@ -669,14 +677,15 @@ export function _runTaskList( try { await onTerminate?.(); } catch (e) { - errors.push(coerceError(e)); + _finErrors.push(coerceError(e)); + terminateErrors.push(coerceError(e)); } - if (errors.length === 1) { - throw errors[0]; - } else if (errors.length > 1) { + if (terminateErrors.length === 1) { + throw terminateErrors[0]; + } else if (terminateErrors.length > 1) { throw new AggregateError( - errors, + terminateErrors, "Errors occurred whilst terminating queue", ); } @@ -701,8 +710,12 @@ export function _runTaskList( // Make sure Node doesn't get upset about unhandled rejection abortPromise.then(null, () => /* noop */ void 0); - let gracefulShutdownPromise: Promise | null = null; - let forcefulShutdownPromise: Promise | null = null; + let gracefulShutdownPromise: ReturnType< + WorkerPool["gracefulShutdown"] + > | null = null; + let forcefulShutdownPromise: ReturnType< + WorkerPool["forcefulShutdown"] + > | null = null; let finished = false; const finWithError = (e: unknown) => { @@ -710,9 +723,13 @@ export function _runTaskList( return; } finished = true; - const error = e != null ? 
coerceError(e) : null; - if (error) { - _finPromise.reject(error); + if (e != null) { + _finErrors.push(coerceError(e)); + } + if (_finErrors.length === 1) { + _finPromise.reject(_finErrors[0]); + } else if (_finErrors.length > 1) { + _finPromise.reject(new AggregateError(_finErrors)); } else { _finPromise.resolve(); } @@ -758,18 +775,20 @@ export function _runTaskList( * progress to complete. */ gracefulShutdown(message = "Worker pool is shutting down gracefully") { - if (workerPool._forcefulShuttingDown) { - logger.error( - `gracefulShutdown called when forcefulShutdown is already in progress`, - ); - return forcefulShutdownPromise!; - } if (workerPool._shuttingDown) { logger.error( `gracefulShutdown called when gracefulShutdown is already in progress`, ); return gracefulShutdownPromise!; } + if (workerPool._forcefulShuttingDown) { + logger.error( + `gracefulShutdown called when forcefulShutdown is already in progress`, + ); + return Promise.resolve(forcefulShutdownPromise).then(() => { + throw new Error("Forceful shutdown already initiated"); + }); + } workerPool._shuttingDown = true; gracefulShutdownPromise = middleware.run( @@ -786,7 +805,7 @@ export function _runTaskList( // Stop new jobs being added const deactivatePromise = deactivate(); - const errors: Error[] = []; + const gracefulShutdownErrors: Error[] = []; // Remove all the workers - we're shutting them down manually const workers = [...workerPool._workers]; @@ -794,7 +813,9 @@ export function _runTaskList( const [deactivateResult, ...workerReleaseResults] = await Promise.allSettled([deactivatePromise, ...workerPromises]); if (deactivateResult.status === "rejected") { - errors.push(coerceError(deactivateResult.reason)); + const error = coerceError(deactivateResult.reason); + _finErrors.push(error); + gracefulShutdownErrors.push(error); // Log but continue regardless logger.error(`Deactivation failed: ${deactivateResult.reason}`, { error: deactivateResult.reason, @@ -851,18 +872,19 @@ export function 
_runTaskList( } if (this._forcefulShuttingDown) { - errors.push( + // Do _not_ add to _finErrors + gracefulShutdownErrors.push( new Error( "forcefulShutdown was initiated whilst gracefulShutdown was still executing.", ), ); } - if (errors.length === 1) { - throw errors[0]; - } else if (errors.length > 1) { + if (gracefulShutdownErrors.length === 1) { + throw gracefulShutdownErrors[0]; + } else if (gracefulShutdownErrors.length > 1) { throw new AggregateError( - errors, + gracefulShutdownErrors, "Errors occurred whilst shutting down worker", ); } @@ -883,13 +905,17 @@ export function _runTaskList( `Error occurred during graceful shutdown: ${message}`, { error: e }, ); - // NOTE: we now rely on forcefulShutdown to handle terminate() - if (this._forcefulShuttingDown) { + + const forcefulPromise = // Skip the warning about double shutdown - return forcefulShutdownPromise!; - } else { - return this.forcefulShutdown(message); - } + this._forcefulShuttingDown + ? forcefulShutdownPromise! + : this.forcefulShutdown(message); + + // NOTE: we now rely on forcefulShutdown to handle terminate() + return Promise.resolve(forcefulPromise).then(() => { + throw e; + }); } if (!terminated) { await terminate(); @@ -897,7 +923,7 @@ export function _runTaskList( }, ); - gracefulShutdownPromise.then(fin, finWithError); + Promise.resolve(gracefulShutdownPromise).then(fin, finWithError); const abortTimer = setTimeout(() => { abortController.abort(); @@ -938,18 +964,16 @@ export function _runTaskList( t.unref(); }); + const wasAlreadyDeactivating = deactivatePromise != null; // Stop new jobs being added // NOTE: deactivate() immediately stops getJob working, even if the // promise takes a while to resolve. 
- const deactivatePromise = Promise.race([deactivate(), timeout]); + const deactiveateOrTimeout = Promise.race([deactivate(), timeout]); - const errors: Error[] = []; + const forcefulShutdownErrors: Error[] = []; // Release all our workers' jobs const workers = [...workerPool._workers]; - const jobsInProgress: Array = workers - .map((worker) => worker.getActiveJob()) - .filter((job): job is Job => !!job); // Remove all the workers - we're shutting them down manually const workerPromises = workers.map((worker) => @@ -959,24 +983,54 @@ export function _runTaskList( Promise.race([worker.release(true), timeout]), ); // Ignore the results, we're shutting down anyway - const [deactivateResult, ..._ignoreWorkerReleaseResults] = - await Promise.allSettled([deactivatePromise, ...workerPromises]); + const [deactivateResult, ...workerReleaseResults] = + await Promise.allSettled([ + deactiveateOrTimeout, + ...workerPromises, + ]); if (deactivateResult.status === "rejected") { // Log but continue regardless logger.error(`Deactivation failed: ${deactivateResult.reason}`, { error: deactivateResult.reason, }); - errors.push(coerceError(deactivateResult.reason)); + const error = coerceError(deactivateResult.reason); + if (!wasAlreadyDeactivating) { + // Add this to _finErrors unless it's already there + _finErrors.push(error); + } + forcefulShutdownErrors.push(error); } - if (jobsInProgress.length > 0) { + const workerProblems = workers + .map((worker, i) => { + const result = workerReleaseResults[i]; + const activeJob = worker.getActiveJob(); + if (result.status === "rejected") { + return [ + worker, + coerceError(result.reason), + activeJob, + ] as const; + } else if (activeJob) { + return [worker, null, activeJob] as const; + } else { + return null; + } + }) + .filter((t: T | null): t is T => t != null); + + const forceFailedJobs = workerProblems + .map(([, , job]) => job) + .filter((job): job is Job => !!job); + + if (forceFailedJobs.length > 0) { const workerIds = 
workers.map((worker) => worker.workerId); logger.debug( - `Releasing the jobs ${jobsInProgress + `Releasing the jobs ${forceFailedJobs .map((j) => j.id) .join()} (workers: ${workerIds.join(", ")})`, { - jobs: jobsInProgress, + jobs: forceFailedJobs, workerIds, }, ); @@ -985,24 +1039,44 @@ export function _runTaskList( compiledSharedOptions, withPgClient, workerPool.id, - jobsInProgress, + forceFailedJobs, message, ); + logger.debug(`Cancelled ${cancelledJobs.length} jobs`, { cancelledJobs, }); } catch (e) { - errors.push(coerceError(e)); + const error = coerceError(e); + _finErrors.push(error); + forcefulShutdownErrors.push(error); } } else { logger.debug("No active jobs to release"); } - if (errors.length === 1) { - throw errors[0]; - } else if (errors.length > 1) { + for (const [worker, error, job] of workerProblems) { + // These are not a failure of forcefulShutdown, so do not go into + // forcefulShutdownErrors. + _finErrors.push( + new Error( + `Worker ${worker.workerId} ${ + job ? `with active job ${job.id}` : "" + } ${ + error + ? 
`failed to release, error: ${error})` + : `failed to stop working` + }`, + { cause: error }, + ), + ); + } + + if (forcefulShutdownErrors.length === 1) { + throw forcefulShutdownErrors[0]; + } else if (forcefulShutdownErrors.length > 1) { throw new AggregateError( - errors, + forcefulShutdownErrors, "Errors occurred whilst forcefully shutting down worker", ); } @@ -1012,6 +1086,9 @@ export function _runTaskList( workerPool, }); logger.debug("Forceful shutdown complete"); + return { + forceFailedJobs, + }; } catch (e) { events.emit("pool:forcefulShutdown:error", { pool: workerPool, @@ -1019,24 +1096,21 @@ export function _runTaskList( error: e, }); const error = coerceError(e); + _finErrors.push(error); logger.error( `Error occurred during forceful shutdown: ${error.message}`, { error: e }, ); + throw error; + } finally { if (!terminated) { - // Guaranteed to throw - await terminate(error); + await terminate(); } - throw error; - } - if (!terminated) { - // Guaranteed to throw - await terminate(new Error("Forceful shutdown")); } }, ); - forcefulShutdownPromise.then(fin, finWithError); + Promise.resolve(forcefulShutdownPromise).then(fin, finWithError); return forcefulShutdownPromise; }, From fdd041cee6f7858509cfd96c306e1fc14d8b922b Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 22 Nov 2024 17:58:36 +0000 Subject: [PATCH 108/155] On unexpected worker exit, shut down entire pool (gracefully) --- src/main.ts | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/main.ts b/src/main.ts index 3bc28be9..ff6810aa 100644 --- a/src/main.ts +++ b/src/main.ts @@ -1282,12 +1282,24 @@ export function _runTaskList( workerPool._workers.push(worker); const remove = () => { if (continuous && workerPool._active && !workerPool._shuttingDown) { + // TODO: user should choose how to handle this, maybe via a middleware: + // - graceful shutdown (implemented) + // - forceful shutdown (probably best after a delay) + // - boot up a replacement worker + /* 
middleware.run("poolWorkerPrematureExit", {}, () => { */ logger.error( `Worker exited, but pool is in continuous mode, is active, and is not shutting down... Did something go wrong?`, ); + _finErrors.push( + new Error(`Worker ${worker.workerId} exited unexpectedly`), + ); + workerPool.gracefulShutdown( + "Something went wrong, one of the workers exited prematurely. Shutting down.", + ); + /* }) */ } workerPool._workers.splice(workerPool._workers.indexOf(worker), 1); - if (!continuous && workerPool._workers.length === 0) { + if (workerPool._workers.length === 0) { if (!workerPool._shuttingDown) { workerPool.gracefulShutdown( "'Run once' mode processed all available jobs and is now exiting", From 84fb704514cad18a59d9dd2e73912d55d104c0f5 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 6 Dec 2024 17:40:57 +0000 Subject: [PATCH 109/155] Upgrade graphile-config, simplify middleware handlers --- package.json | 2 +- src/index.ts | 18 ++---------------- yarn.lock | 8 ++++---- 3 files changed, 7 insertions(+), 21 deletions(-) diff --git a/package.json b/package.json index 0f18f445..9c9b51ac 100644 --- a/package.json +++ b/package.json @@ -54,7 +54,7 @@ "@types/debug": "^4.1.10", "@types/pg": "^8.10.5", "cosmiconfig": "^8.3.6", - "graphile-config": "^0.0.1-beta.11", + "graphile-config": "^0.0.1-beta.12", "json5": "^2.2.3", "pg": "^8.11.3", "tslib": "^2.6.2", diff --git a/src/index.ts b/src/index.ts index 641b018e..2a02e529 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,9 +1,5 @@ import { Logger } from "@graphile/logger"; -import { - CallbackOrDescriptor, - MiddlewareNext, - PluginHook, -} from "graphile-config"; +import { MiddlewareHandlers, PluginHook } from "graphile-config"; import type { PoolClient } from "pg"; import { getCronItems } from "./getCronItems"; @@ -322,17 +318,7 @@ declare global { interface Plugin { worker?: { - // TODO: replace with the following once we upgrade graphile-config again - // middleware?: MiddlewareHandlers - middleware?: { - [key 
in keyof WorkerMiddleware]?: CallbackOrDescriptor< - WorkerMiddleware[key] extends ( - ...args: infer UArgs - ) => infer UResult - ? (next: MiddlewareNext, ...args: UArgs) => UResult - : never - >; - }; + middleware?: MiddlewareHandlers; // TODO: deprecate this, replace with middleware hooks?: { diff --git a/yarn.lock b/yarn.lock index d81fd84d..991bd5df 100644 --- a/yarn.lock +++ b/yarn.lock @@ -6355,10 +6355,10 @@ graphemer@^1.4.0: resolved "https://registry.yarnpkg.com/graphemer/-/graphemer-1.4.0.tgz#fb2f1d55e0e3a1849aeffc90c4fa0dd53a0e66c6" integrity sha512-EtKwoO6kxCL9WO5xipiHTZlSzBm7WLT627TqC/uVRd0HKmq8NXyebnNYxDoBi7wt8eTWrUrKXCOVaFq9x1kgag== -graphile-config@^0.0.1-beta.4, graphile-config@^0.0.1-beta.11: - version "0.0.1-beta.11" - resolved "https://registry.yarnpkg.com/graphile-config/-/graphile-config-0.0.1-beta.11.tgz#4bd2ffd1fee6834f2e5dedc64016e7a3a9eda151" - integrity sha512-+2QLPpihQQvSYd6sSXcDrwHMMSygUrK41qWhak7u3vsXj2AGwVwl+kVvlBwuoovaoUPDsGF8zy5IevTAMgzg5Q== +graphile-config@^0.0.1-beta.12, graphile-config@^0.0.1-beta.4: + version "0.0.1-beta.12" + resolved "https://registry.yarnpkg.com/graphile-config/-/graphile-config-0.0.1-beta.12.tgz#e31e12077366f3cbe55708ec20452e5027177627" + integrity sha512-th7C2fM29dhra5gCmykWUJQMCAzA6C5W+dF8DZa0BWLImmHnSUK+AO4qPCR6bZKR5JKTW2onZweqP4ZHVLPQFw== dependencies: "@types/interpret" "^1.1.1" "@types/node" "^20.5.7" From 88dba0f5e60abcc446e0af5bb4a79199d019f16a Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 6 Dec 2024 17:50:03 +0000 Subject: [PATCH 110/155] New event emitter each time --- src/config.ts | 1 - src/lib.ts | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/config.ts b/src/config.ts index 7e8f54b6..cc39efb6 100644 --- a/src/config.ts +++ b/src/config.ts @@ -36,7 +36,6 @@ export const makeWorkerPresetWorkerOptions = () => maxResetLockedInterval: 10 * MINUTE, gracefulShutdownAbortTimeout: 5 * SECOND, useNodeTime: false, - events: new EventEmitter() as WorkerEvents, } 
satisfies GraphileConfig.WorkerOptions); function enforceStringOrUndefined( diff --git a/src/lib.ts b/src/lib.ts index 440c9c72..ea67a3c1 100644 --- a/src/lib.ts +++ b/src/lib.ts @@ -1,4 +1,5 @@ import * as assert from "assert"; +import EventEmitter from "events"; import { applyHooks, AsyncHooks, @@ -226,7 +227,7 @@ export function processSharedOptions< maxResetLockedInterval, schema: workerSchema, logger, - events, + events = new EventEmitter(), }, plugins, } = resolvedPreset; From fc3f4a5b62d3f31663845d4bdb5a048d838fa4c3 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 6 Dec 2024 17:50:11 +0000 Subject: [PATCH 111/155] Move to resolvePreset --- src/config.ts | 2 -- src/lib.ts | 14 ++++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/config.ts b/src/config.ts index cc39efb6..08eca53e 100644 --- a/src/config.ts +++ b/src/config.ts @@ -1,8 +1,6 @@ import { cosmiconfigSync } from "cosmiconfig"; -import EventEmitter from "events"; import { MINUTE, SECOND } from "./cronConstants"; -import type { WorkerEvents } from "./interfaces"; import { defaultLogger } from "./logger"; const cosmiconfigResult = cosmiconfigSync("graphile-worker").search(); diff --git a/src/lib.ts b/src/lib.ts index ea67a3c1..d2702943 100644 --- a/src/lib.ts +++ b/src/lib.ts @@ -5,7 +5,7 @@ import { AsyncHooks, Middleware, orderedApply, - resolvePresets, + resolvePreset, } from "graphile-config"; import { Client, Pool, PoolClient, PoolConfig } from "pg"; @@ -204,11 +204,13 @@ export function processSharedOptions< | CompiledSharedOptions | undefined; if (!compiled) { - const resolvedPreset = resolvePresets([ - WorkerPreset, - // Explicit options override the preset - legacyOptionsToPreset(options), - ]) as ResolvedWorkerPreset; + const resolvedPreset = resolvePreset({ + extends: [ + WorkerPreset, + // Explicit options override the preset + legacyOptionsToPreset(options), + ], + }) as ResolvedWorkerPreset; const middleware = new Middleware(); From 
d880d2ff8a5fb85796dc84ed4b1de8eaaff0be1d Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 6 Dec 2024 18:02:13 +0000 Subject: [PATCH 112/155] Rename completeJob -> completeJobs --- src/main.ts | 6 +++--- src/sql/{completeJob.ts => completeJobs.ts} | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) rename src/sql/{completeJob.ts => completeJobs.ts} (98%) diff --git a/src/main.ts b/src/main.ts index ff6810aa..7cfc53be 100644 --- a/src/main.ts +++ b/src/main.ts @@ -32,7 +32,7 @@ import { import { LocalQueue } from "./localQueue"; import { Logger } from "./logger"; import SIGNALS, { Signal } from "./signals"; -import { completeJob as baseCompleteJob } from "./sql/completeJob"; +import { completeJobs as baseCompleteJobs } from "./sql/completeJobs"; import { failJob as baseFailJob, failJobs } from "./sql/failJob"; import { getJob as baseGetJob } from "./sql/getJob"; import { resetLockedAt } from "./sql/resetLockedAt"; @@ -1197,7 +1197,7 @@ export function _runTaskList( ? batch( completeJobBatchDelay, (jobs) => - baseCompleteJob( + baseCompleteJobs( compiledSharedOptions, withPgClient, workerPool.id, @@ -1221,7 +1221,7 @@ export function _runTaskList( : { release: null, fn: (job) => - baseCompleteJob( + baseCompleteJobs( compiledSharedOptions, withPgClient, workerPool.id, diff --git a/src/sql/completeJob.ts b/src/sql/completeJobs.ts similarity index 98% rename from src/sql/completeJob.ts rename to src/sql/completeJobs.ts index 83004575..6912e48b 100644 --- a/src/sql/completeJob.ts +++ b/src/sql/completeJobs.ts @@ -3,7 +3,7 @@ import { CompiledSharedOptions } from "../lib"; const manualPrepare = false; -export async function completeJob( +export async function completeJobs( compiledSharedOptions: CompiledSharedOptions, withPgClient: EnhancedWithPgClient, poolId: string, From 3a6195eb811fca6754a2767b262747533f2fdb6f Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 6 Dec 2024 18:03:44 +0000 Subject: [PATCH 113/155] Rename getJob -> getJobs --- 
src/localQueue.ts | 6 +++--- src/main.ts | 4 ++-- src/sql/{getJob.ts => getJobs.ts} | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) rename src/sql/{getJob.ts => getJobs.ts} (99%) diff --git a/src/localQueue.ts b/src/localQueue.ts index 41a78f9f..c36e9d2b 100644 --- a/src/localQueue.ts +++ b/src/localQueue.ts @@ -11,7 +11,7 @@ import { MINUTE, SECOND } from "./cronConstants"; import defer, { Deferred } from "./deferred"; import { GetJobFunction, Job, TaskList, WorkerPool } from "./interfaces"; import { coerceError } from "./lib"; -import { getJob as baseGetJob } from "./sql/getJob"; +import { getJobs as baseGetJobs } from "./sql/getJobs"; import { returnJobs } from "./sql/returnJobs"; const { STARTING, POLLING, WAITING, TTL_EXPIRED, RELEASED } = LocalQueueModes; @@ -413,7 +413,7 @@ export class LocalQueue { this.refetchDelayCounter = 0; // The ONLY await in this function. - const jobs = await baseGetJob( + const jobs = await baseGetJobs( this.compiledSharedOptions, this.withPgClient, this.tasks, @@ -588,7 +588,7 @@ export class LocalQueue { // Cannot batch if there's flags if (flagsToSkip !== null) { - const jobsPromise = baseGetJob( + const jobsPromise = baseGetJobs( this.compiledSharedOptions, this.withPgClient, this.tasks, diff --git a/src/main.ts b/src/main.ts index 7cfc53be..67393806 100644 --- a/src/main.ts +++ b/src/main.ts @@ -34,7 +34,7 @@ import { Logger } from "./logger"; import SIGNALS, { Signal } from "./signals"; import { completeJobs as baseCompleteJobs } from "./sql/completeJobs"; import { failJob as baseFailJob, failJobs } from "./sql/failJob"; -import { getJob as baseGetJob } from "./sql/getJob"; +import { getJobs as baseGetJobs } from "./sql/getJobs"; import { resetLockedAt } from "./sql/resetLockedAt"; import { makeNewWorker } from "./worker"; @@ -1181,7 +1181,7 @@ export function _runTaskList( if (!workerPool._active) { return undefined; } - const jobs = await baseGetJob( + const jobs = await baseGetJobs( compiledSharedOptions, 
withPgClient, tasks, diff --git a/src/sql/getJob.ts b/src/sql/getJobs.ts similarity index 99% rename from src/sql/getJob.ts rename to src/sql/getJobs.ts index 73d174a9..2dbc2ee2 100644 --- a/src/sql/getJob.ts +++ b/src/sql/getJobs.ts @@ -11,7 +11,7 @@ export function isPromise(t: T | Promise): t is Promise { ); } -export async function getJob( +export async function getJobs( compiledSharedOptions: CompiledSharedOptions, withPgClient: EnhancedWithPgClient, tasks: TaskList, From c3946cd5fe54a90e4cc0a60480030c7f21a787d9 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 6 Dec 2024 18:08:42 +0000 Subject: [PATCH 114/155] Rename failJob -> batchFailJobs --- src/main.ts | 6 +++--- src/sql/{failJob.ts => failJobs.ts} | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) rename src/sql/{failJob.ts => failJobs.ts} (99%) diff --git a/src/main.ts b/src/main.ts index 67393806..5b5dbb39 100644 --- a/src/main.ts +++ b/src/main.ts @@ -33,7 +33,7 @@ import { LocalQueue } from "./localQueue"; import { Logger } from "./logger"; import SIGNALS, { Signal } from "./signals"; import { completeJobs as baseCompleteJobs } from "./sql/completeJobs"; -import { failJob as baseFailJob, failJobs } from "./sql/failJob"; +import { batchFailJobs as baseFailJobs, failJobs } from "./sql/failJobs"; import { getJobs as baseGetJobs } from "./sql/getJobs"; import { resetLockedAt } from "./sql/resetLockedAt"; import { makeNewWorker } from "./worker"; @@ -1235,7 +1235,7 @@ export function _runTaskList( ? 
batch( failJobBatchDelay, (specs) => - baseFailJob( + baseFailJobs( compiledSharedOptions, withPgClient, workerPool.id, @@ -1259,7 +1259,7 @@ export function _runTaskList( : { release: null, fn: (spec) => - baseFailJob(compiledSharedOptions, withPgClient, workerPool.id, [ + baseFailJobs(compiledSharedOptions, withPgClient, workerPool.id, [ spec, ]), } diff --git a/src/sql/failJob.ts b/src/sql/failJobs.ts similarity index 99% rename from src/sql/failJob.ts rename to src/sql/failJobs.ts index a86db5e5..a459c7af 100644 --- a/src/sql/failJob.ts +++ b/src/sql/failJobs.ts @@ -6,7 +6,7 @@ interface Spec { replacementPayload: undefined | unknown[]; } -export async function failJob( +export async function batchFailJobs( compiledSharedOptions: CompiledSharedOptions, withPgClient: EnhancedWithPgClient, poolId: string, From 6b8cbab4a2da3dcec3b861855e396cfff0d385e0 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Fri, 6 Dec 2024 18:11:45 +0000 Subject: [PATCH 115/155] Rename to batch for consistency --- src/localQueue.ts | 6 +++--- src/main.ts | 16 ++++++++-------- src/sql/completeJobs.ts | 2 +- src/sql/getJobs.ts | 2 +- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/localQueue.ts b/src/localQueue.ts index c36e9d2b..793cd6e0 100644 --- a/src/localQueue.ts +++ b/src/localQueue.ts @@ -11,7 +11,7 @@ import { MINUTE, SECOND } from "./cronConstants"; import defer, { Deferred } from "./deferred"; import { GetJobFunction, Job, TaskList, WorkerPool } from "./interfaces"; import { coerceError } from "./lib"; -import { getJobs as baseGetJobs } from "./sql/getJobs"; +import { batchGetJobs } from "./sql/getJobs"; import { returnJobs } from "./sql/returnJobs"; const { STARTING, POLLING, WAITING, TTL_EXPIRED, RELEASED } = LocalQueueModes; @@ -413,7 +413,7 @@ export class LocalQueue { this.refetchDelayCounter = 0; // The ONLY await in this function. 
- const jobs = await baseGetJobs( + const jobs = await batchGetJobs( this.compiledSharedOptions, this.withPgClient, this.tasks, @@ -588,7 +588,7 @@ export class LocalQueue { // Cannot batch if there's flags if (flagsToSkip !== null) { - const jobsPromise = baseGetJobs( + const jobsPromise = batchGetJobs( this.compiledSharedOptions, this.withPgClient, this.tasks, diff --git a/src/main.ts b/src/main.ts index 5b5dbb39..8878535a 100644 --- a/src/main.ts +++ b/src/main.ts @@ -32,9 +32,9 @@ import { import { LocalQueue } from "./localQueue"; import { Logger } from "./logger"; import SIGNALS, { Signal } from "./signals"; -import { completeJobs as baseCompleteJobs } from "./sql/completeJobs"; -import { batchFailJobs as baseFailJobs, failJobs } from "./sql/failJobs"; -import { getJobs as baseGetJobs } from "./sql/getJobs"; +import { batchCompleteJobs } from "./sql/completeJobs"; +import { batchFailJobs, failJobs } from "./sql/failJobs"; +import { batchGetJobs } from "./sql/getJobs"; import { resetLockedAt } from "./sql/resetLockedAt"; import { makeNewWorker } from "./worker"; @@ -1181,7 +1181,7 @@ export function _runTaskList( if (!workerPool._active) { return undefined; } - const jobs = await baseGetJobs( + const jobs = await batchGetJobs( compiledSharedOptions, withPgClient, tasks, @@ -1197,7 +1197,7 @@ export function _runTaskList( ? batch( completeJobBatchDelay, (jobs) => - baseCompleteJobs( + batchCompleteJobs( compiledSharedOptions, withPgClient, workerPool.id, @@ -1221,7 +1221,7 @@ export function _runTaskList( : { release: null, fn: (job) => - baseCompleteJobs( + batchCompleteJobs( compiledSharedOptions, withPgClient, workerPool.id, @@ -1235,7 +1235,7 @@ export function _runTaskList( ? 
batch( failJobBatchDelay, (specs) => - baseFailJobs( + batchFailJobs( compiledSharedOptions, withPgClient, workerPool.id, @@ -1259,7 +1259,7 @@ export function _runTaskList( : { release: null, fn: (spec) => - baseFailJobs(compiledSharedOptions, withPgClient, workerPool.id, [ + batchFailJobs(compiledSharedOptions, withPgClient, workerPool.id, [ spec, ]), } diff --git a/src/sql/completeJobs.ts b/src/sql/completeJobs.ts index 6912e48b..3154a161 100644 --- a/src/sql/completeJobs.ts +++ b/src/sql/completeJobs.ts @@ -3,7 +3,7 @@ import { CompiledSharedOptions } from "../lib"; const manualPrepare = false; -export async function completeJobs( +export async function batchCompleteJobs( compiledSharedOptions: CompiledSharedOptions, withPgClient: EnhancedWithPgClient, poolId: string, diff --git a/src/sql/getJobs.ts b/src/sql/getJobs.ts index 2dbc2ee2..0ceac97a 100644 --- a/src/sql/getJobs.ts +++ b/src/sql/getJobs.ts @@ -11,7 +11,7 @@ export function isPromise(t: T | Promise): t is Promise { ); } -export async function getJobs( +export async function batchGetJobs( compiledSharedOptions: CompiledSharedOptions, withPgClient: EnhancedWithPgClient, tasks: TaskList, From e922c1341191e1f0513869887b070ed87e5e598d Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Sat, 7 Dec 2024 11:32:04 +0000 Subject: [PATCH 116/155] Add and correct LocalQueue comments --- src/localQueue.ts | 136 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 124 insertions(+), 12 deletions(-) diff --git a/src/localQueue.ts b/src/localQueue.ts index 793cd6e0..1d208943 100644 --- a/src/localQueue.ts +++ b/src/localQueue.ts @@ -55,12 +55,14 @@ const { STARTING, POLLING, WAITING, TTL_EXPIRED, RELEASED } = LocalQueueModes; * * When jobs are fetched: * - * - if no jobs were returned then it will wait `pollInterval` ms and then - * fetch again. * - if fewer than `Math.ceil(Math.min(localQueueRefetchDelay.threshold, localQueueSize))` * jobs were returned then a refetch delay will be set (if configured). 
- * - if jobs are returned from a POLLING mode fetch then the queue immediately - * enters WAITING mode. + * - if jobs were returned then it will supply as many as possible to any + * waiting workers (`workerQueue`) + * - if all workers are busy and jobs still remain it will store them to + * `jobQueue` and immediately enter WAITING mode + * - otherwise (if no jobs remain: `jobQueue` is empty) we'll wait + * `pollInterval` ms and then fetch again. * * When a "new job" notification is received, once any required refetch delay * has expired (or immediately if it has already expired) the timer will be @@ -96,30 +98,123 @@ const { STARTING, POLLING, WAITING, TTL_EXPIRED, RELEASED } = LocalQueueModes; * * Triggered on shutdown. */ - export class LocalQueue { + /** + * The configured time (in milliseconds) that a job may sit unclaimed in the + * local queue before being returned to the database. + */ readonly ttl: number; + + /** + * The time interval (in milliseconds) between fetch requests when in + * `POLLING` mode. + */ readonly pollInterval: number; + + /** + * The jobs that have been pulled from the database that are waiting for a + * worker to claim them. Once claimed, a job will be removed from this list. + * This should be empty in POLLING and TTL_EXPIRED modes. + */ + readonly jobQueue: Job[] = []; + /** + * Workers waiting for jobs are represented by deferred promises in this + * list. When a job becomes available, first it attempts to satisfy one of + * these from the workerQueue, and only if this is empty does it then add the + * job to the `jobQueue`. + */ readonly workerQueue: Deferred[] = []; + + /** + * Are we currently fetching jobs from the DB? Prevents double-fetches. + */ fetchInProgress = false; + + /** + * When we enter WAITING mode (i.e. there are jobs in `jobQueue`), we set up + * this timer. When the timer fires, we will release any remaining jobs in + * jobQueue back to the database (and enter TTL_EXPIRED mode). 
Note: all jobs + * are fetched at once, and no further jobs are fetched, so the TTL for all + * jobs will expire at the same time - we'll only return to POLLING mode once + * all jobs have been executed. + */ ttlExpiredTimer: NodeJS.Timeout | null = null; + + /** + * The timer associated with the next fetch poll (see also `pollInterval`). + */ fetchTimer: NodeJS.Timeout | null = null; - // Set true to fetch immediately after a fetch completes; typically only used - // when the queue is pulsed during a fetch. + + /** + * Should we fetch again once the current fetch is complete? This is + * generally used to indicate that we received a "new job" notification (the + * queue is "pulsed") whilst we were already fetching, so our fetch may not + * have included that job. + */ fetchAgain = false; + + /** + * The mode that the queue is in; must only be changed via `setMode`, which + * itself must only be called by the `setMode*()` methods. + */ public readonly mode: LocalQueueMode = STARTING; - /** The promise that resolves/rejects when the queue is disposed of */ + + /** + * The promise that resolves/rejects when the local queue has been released. + * Will not resolve until all locally queued jobs have been returned to the + * pool (or may reject if this process fails) and all active fetches and + * other background tasks are complete. This is important, otherwise we might + * release the pg.Pool that we're using before jobs are returned to the + * database, which would be something we couldn't recover from! + * + * If it rejects, may reject with a regular Error or an AggregateError + * representing multiple failures. + */ private _finPromise = defer(); + + /** + * Errors that occurred causing the shutdown or during the shutdown of this + * local queue instance. 
+ */ private errors: Error[] = []; - /** A count of the number of "background" processes such as fetching or returning jobs */ + + /** + * A count of the number of "background" processes such as fetching or + * returning jobs such that we can avoid exiting until all background tasks + * have completed. + */ private backgroundCount = 0; - /** If `localQueueRefetchDelay` is configured; set this true if the fetch resulted in a queue size lower than the threshold. */ + /** + * If `localQueueRefetchDelay` is configured; set this true if the fetch + * resulted in a queue size lower than the threshold. + */ private refetchDelayActive = false; + + /** + * If true, when the refetch delay expires in POLLING mode (or when we next + * enter POLLING mode after it expires), immediately trigger a fetch. If + * false, just wait for the regular POLLING timeouts. + */ private refetchDelayFetchOnComplete = false; + + /** The timer tracking when the refetch delay has expired. */ private refetchDelayTimer: NodeJS.Timeout | null = null; + + /** + * The number of new jobs received during the fetch or the resulting refetch + * delay; see also `refetchDelayAbortThreshold`. + */ private refetchDelayCounter: number = 0; + + /** + * A random number between 0 and either + * `preset.worker.localQueue.refetchDelay.maxAbortThreshold` or + * `5*preset.worker.localQueue.size`; when we've been informed of this many + * jobs via pulse(), we must abort the refetch delay and trigger an immediate + * fetch. + */ private refetchDelayAbortThreshold: number = Infinity; constructor( @@ -127,7 +222,12 @@ export class LocalQueue { private readonly tasks: TaskList, private readonly withPgClient: EnhancedWithPgClient, public readonly workerPool: WorkerPool, + /** How many jobs to fetch at once */ private readonly getJobBatchSize: number, + /** + * If false, exit once the DB seems to have been exhausted of jobs, even if + * for just a moment. (I.e. 
`runOnce()`) + */ private readonly continuous: boolean, ) { this.ttl = @@ -148,9 +248,13 @@ export class LocalQueue { compiledSharedOptions.events.emit("localQueue:init", { localQueue: this, }); + // Immediately enter polling mode. this.setModePolling(); } + /** + * Only call this from `setMode*()` helpers. + */ private setMode( newMode: Exclude, ) { @@ -164,9 +268,15 @@ export class LocalQueue { }); } + /** + * Called when the LocalQueue is completely finished and released: no + * background tasks, no jobs in job queue. Resolves (or rejects) + * `_finPromise`. + */ private fin() { assert.equal(this.mode, "RELEASED"); assert.equal(this.backgroundCount, 0); + assert.equal(this.jobQueue.length, 0); if (this.errors.length === 1) { this._finPromise.reject(this.errors[0]); } else if (this.errors.length > 1) { @@ -191,15 +301,17 @@ export class LocalQueue { this.fin(); } } else { + // If we're not shutting down, view this as a temporary error (but give + // Benjie a wrist slap anyway). this.compiledSharedOptions.logger.error( - `Backgrounding should never yield errors when the queue is not RELEASED`, + `GraphileWorkerInternalError: Backgrounding should never yield errors when the queue is not RELEASED`, { error: e }, ); } }; /** - * For promises that happen in the background, but that we want to ensure are + * Track promises that happen in the background, but that we want to ensure are * handled before we release the queue (so that the database pool isn't * released too early). 
* From 4149e90f1855d3b11134c57a9301c87a0be206e9 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Sat, 7 Dec 2024 12:59:38 +0000 Subject: [PATCH 117/155] Overhaul error handling in returnJobs --- src/localQueue.ts | 116 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 90 insertions(+), 26 deletions(-) diff --git a/src/localQueue.ts b/src/localQueue.ts index 1d208943..999d6b86 100644 --- a/src/localQueue.ts +++ b/src/localQueue.ts @@ -10,7 +10,7 @@ import { import { MINUTE, SECOND } from "./cronConstants"; import defer, { Deferred } from "./deferred"; import { GetJobFunction, Job, TaskList, WorkerPool } from "./interfaces"; -import { coerceError } from "./lib"; +import { coerceError, sleep } from "./lib"; import { batchGetJobs } from "./sql/getJobs"; import { returnJobs } from "./sql/returnJobs"; @@ -346,6 +346,7 @@ export class LocalQueue { "Cannot enter polling mode when job queue isn't empty", ); + // There's no jobs, so there's no need for ttlExpired timer any more. if (this.ttlExpiredTimer) { clearTimeout(this.ttlExpiredTimer); this.ttlExpiredTimer = null; @@ -353,12 +354,15 @@ export class LocalQueue { this.setMode(POLLING); + // This won't necessarily fetch, it will respect refetchDelay this.fetch(); } - private setModeWaiting() { - // Can only enter WAITING mode from POLLING mode. - assert.equal(this.mode, POLLING); + private setModeWaiting(causedByErrorHandling = false) { + if (!causedByErrorHandling) { + // Can only enter WAITING mode from POLLING mode. 
+ assert.equal(this.mode, POLLING); + } assert.ok( !this.fetchTimer, "Cannot enter waiting mode when a fetch is scheduled", @@ -367,6 +371,11 @@ export class LocalQueue { !this.fetchInProgress, "Cannot enter waiting mode when fetch is in progress", ); + assert.equal( + this.workerQueue.length, + 0, + "Cannot enter waiting mode when the worker queue is not empty", + ); assert.notEqual( this.jobQueue.length, 0, @@ -379,12 +388,10 @@ export class LocalQueue { this.setMode(WAITING); - this.ttlExpiredTimer = setTimeout(() => { - this.setModeTtlExpired(); - }, this.ttl); + this.ttlExpiredTimer = setTimeout(this.setModeTtlExpired, this.ttl); } - private setModeTtlExpired() { + private setModeTtlExpired = () => { // Can only enter TTL_EXPIRED mode from WAITING mode. assert.equal(this.mode, WAITING); assert.ok( @@ -410,7 +417,7 @@ export class LocalQueue { // Return jobs to the pool this.returnJobs(); - } + }; private returnJobs() { const l = this.jobQueue.length; @@ -418,10 +425,80 @@ export class LocalQueue { return; } const jobsToReturn = this.jobQueue.splice(0, l); + this.compiledSharedOptions.events.emit("localQueue:returnJobs", { localQueue: this, jobs: jobsToReturn, }); + + let attempts = 1; + let initialError: Error; + const MAX_ATTEMPTS = 20; + const onError = (e: unknown): void | Promise => { + if (attempts === 1) { + initialError = coerceError(e); + } + + this.compiledSharedOptions.logger.error( + `Failed to return jobs from local queue to database queue (attempt ${attempts}/${MAX_ATTEMPTS})`, + { error: e, attempts, maxAttempts: MAX_ATTEMPTS }, + ); + + // NOTE: the mode now may not be the mode that we were in when + // returnJobs was called. An error happened... we need to deal with + // this error gracefully. 
+ switch (this.mode) { + case "RELEASED": { + throw new Error( + `Error occurred whilst returning jobs from local queue to database queue: ${initialError.message}`, + ); + } + + // NOTE: considered doing `this.receivedJobs(jobsToReturn)`; but I + // simply trying to release them again seems safer and more correct. + default: { + if (attempts < MAX_ATTEMPTS) { + /** Minimum delay between attempts (milliseconds); can actually be half this due to jitter */ + const minDelay = 200; + /** Maximum delay between attempts (milliseconds) - can actually be 1.5x this due to jitter */ + const maxDelay = 30_000; // Maximum delay in milliseconds + /** `multiplier ^ attempts` */ + const multiplier = 1.5; + /** Prevent the thundering herd problem by offsetting randomly */ + const jitter = Math.random(); + const delay = + Math.min( + minDelay * Math.pow(multiplier, attempts - 1), + maxDelay, + ) * + (0.5 + jitter); + + // Be sure to increment attempts to avoid infinite loop! + ++attempts; + return sleep(delay).then(() => + returnJobs( + this.compiledSharedOptions, + this.withPgClient, + this.workerPool.id, + jobsToReturn, + ).then(noop, onError), + ); + } else { + // TODO: is this the correct way to handle this? Are we allowed to + // trigger shut down internally? + this.release(); + // Now we're in release mode, throwing the error will be tracked + // automatically by `this.background()` + throw new Error( + `Error occurred whilst returning jobs from local queue to database queue; aborting after ${attempts} attempts. Initial error: ${initialError.message}`, + ); + } + } + } + }; + + // NOTE: the `this.background` call covers all of the re-attempts via + // `onError` above, since `onError` returns the next promise each time. 
this.background( returnJobs( this.compiledSharedOptions, @@ -429,23 +506,8 @@ export class LocalQueue { this.workerPool.id, jobsToReturn, ).then( - () => {}, - (e) => { - if (this.mode === "RELEASED") { - throw new Error( - `Error occurred whilst returning jobs from local queue to database queue: ${ - coerceError(e).message - }`, - ); - } else { - // Return the jobs to the queue; MUST NOT HAPPEN IN RELEASED MODE. - this.receivedJobs(jobsToReturn); - this.compiledSharedOptions.logger.error( - `Failed to return jobs from local queue to database queue`, - { error: e }, - ); - } - }, + noop, // No action necessary on success + onError, ), ); } @@ -799,3 +861,5 @@ export class LocalQueue { } } } + +function noop() {} From 0c713748221a09cb6fc92baf81a3a5242f012c5f Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Sat, 7 Dec 2024 14:16:38 +0000 Subject: [PATCH 118/155] Minor refactoring and more comments --- src/localQueue.ts | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/localQueue.ts b/src/localQueue.ts index 999d6b86..8fea5053 100644 --- a/src/localQueue.ts +++ b/src/localQueue.ts @@ -559,6 +559,7 @@ export class LocalQueue { * Initialized to `true` so on error we don't enable refetch delay. */ let refetchDelayThresholdSurpassed = true; + /** How many jobs did we fetch? (Initialize to zero in case of error.) */ let jobCount = 0; const refetchDelayOptions = this.compiledSharedOptions.resolvedPreset.worker.localQueue?.refetchDelay; @@ -592,7 +593,7 @@ export class LocalQueue { this.withPgClient, this.tasks, this.workerPool.id, - null, + null, // `flagsToSkip` is not set, see `LocalQueue.getJob` this.getJobBatchSize, ); @@ -609,7 +610,7 @@ export class LocalQueue { // If refetch delay is disabled, we've met the requirement !refetchDelayOptions || // If we fetched more than (**not** equal to) `threshold` jobs, we've met the requirement - jobCount > Math.floor(refetchDelayOptions.threshold ?? 
0); + jobCount > (refetchDelayOptions.threshold ?? 0); // NOTE: we don't need to handle `this.mode === RELEASED` here because // being in that mode guarantees the workerQueue is empty. @@ -696,7 +697,11 @@ export class LocalQueue { this.refetchDelayTimer = null; } this.refetchDelayActive = false; + if (aborted) { + // Force refetch because we've been notified of so many jobs! + this.refetchDelayFetchOnComplete = true; + this.compiledSharedOptions.events.emit("localQueue:refetchDelay:abort", { localQueue: this, count: this.refetchDelayCounter, @@ -713,7 +718,7 @@ export class LocalQueue { if (this.mode === POLLING && this.refetchDelayFetchOnComplete) { // Cancel poll, do now - if (this.fetchTimer) { + if (this.fetchTimer != null) { clearTimeout(this.fetchTimer); this.fetchTimer = null; } @@ -730,7 +735,6 @@ export class LocalQueue { return false; } if (this.refetchDelayCounter >= this.refetchDelayAbortThreshold) { - this.refetchDelayFetchOnComplete = true; this.refetchDelayCompleteOrAbort(true); } return true; @@ -754,7 +758,8 @@ export class LocalQueue { } } - // If you refactor this to be a method rather than a property, make sure that you `.bind(this)` to it. + // If you refactor this to be a method rather than a property, make sure that + // you `.bind(this)` to it. public getJob: GetJobFunction = (workerId, flagsToSkip) => { if (this.mode === RELEASED) { return undefined; @@ -762,6 +767,7 @@ export class LocalQueue { // Cannot batch if there's flags if (flagsToSkip !== null) { + // PERF: we could actually batch for similar flags, I guess. 
const jobsPromise = batchGetJobs( this.compiledSharedOptions, this.withPgClient, @@ -829,7 +835,7 @@ export class LocalQueue { break; } case WAITING: { - if (this.ttlExpiredTimer) { + if (this.ttlExpiredTimer != null) { clearTimeout(this.ttlExpiredTimer); this.ttlExpiredTimer = null; } @@ -839,7 +845,7 @@ export class LocalQueue { break; } case TTL_EXPIRED: { - // No action necessary, jobs are already returned + // No action necessary, jobs are already returned, no jobs, no pending workers break; } case STARTING: { From 3a0161cd6ccb6009e43993629199b2430d80cc08 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Mon, 9 Dec 2024 16:21:12 +0000 Subject: [PATCH 119/155] Tidy --- src/main.ts | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/main.ts b/src/main.ts index 8878535a..58964c21 100644 --- a/src/main.ts +++ b/src/main.ts @@ -653,9 +653,7 @@ export function _runTaskList( // Log but continue regardless logger.error( `Releasing complete job batcher failed: ${releaseCompleteJobResult.reason}`, - { - error: releaseCompleteJobResult.reason, - }, + { error: releaseCompleteJobResult.reason }, ); } if (releaseFailJobResult.status === "rejected") { @@ -665,9 +663,7 @@ export function _runTaskList( // Log but continue regardless logger.error( `Releasing failed job batcher failed: ${releaseFailJobResult.reason}`, - { - error: releaseFailJobResult.reason, - }, + { error: releaseFailJobResult.reason }, ); } From 31fc5c115bc30156dc022f19006b10415ad731cf Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Mon, 9 Dec 2024 16:21:21 +0000 Subject: [PATCH 120/155] Add to graceful shutdown errors --- src/main.ts | 44 ++++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/src/main.ts b/src/main.ts index 58964c21..368cc83d 100644 --- a/src/main.ts +++ b/src/main.ts @@ -845,26 +845,30 @@ export function _runTaskList( } } if (!this._forcefulShuttingDown && jobsToRelease.length > 0) { - const workerIds = 
workers.map((worker) => worker.workerId); - logger.debug( - `Releasing the jobs ${jobsToRelease - .map((j) => j.id) - .join()} (workers: ${workerIds.join(", ")})`, - { - jobs: jobsToRelease, - workerIds, - }, - ); - const cancelledJobs = await failJobs( - compiledSharedOptions, - withPgClient, - workerPool.id, - jobsToRelease, - message, - ); - logger.debug(`Cancelled ${cancelledJobs.length} jobs`, { - cancelledJobs, - }); + try { + const workerIds = workers.map((worker) => worker.workerId); + logger.debug( + `Releasing the jobs ${jobsToRelease + .map((j) => j.id) + .join()} (workers: ${workerIds.join(", ")})`, + { + jobs: jobsToRelease, + workerIds, + }, + ); + const cancelledJobs = await failJobs( + compiledSharedOptions, + withPgClient, + workerPool.id, + jobsToRelease, + message, + ); + logger.debug(`Cancelled ${cancelledJobs.length} jobs`, { + cancelledJobs, + }); + } catch (e) { + gracefulShutdownErrors.push(coerceError(e)); + } } if (this._forcefulShuttingDown) { From 16fb4cda7322995e399068ca394db3662889e3ee Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Mon, 9 Dec 2024 16:30:37 +0000 Subject: [PATCH 121/155] Make code clearer --- src/main.ts | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main.ts b/src/main.ts index 368cc83d..1c3d9e7e 100644 --- a/src/main.ts +++ b/src/main.ts @@ -956,7 +956,7 @@ export function _runTaskList( }); try { logger.debug(`Attempting forceful shutdown`); - const timeout = new Promise((_resolve, reject) => { + const timeout = new Promise((_resolve, reject) => { const t = setTimeout( () => reject(new Error("Timed out")), 5000 /* TODO: make configurable */, @@ -976,17 +976,17 @@ export function _runTaskList( const workers = [...workerPool._workers]; // Remove all the workers - we're shutting them down manually - const workerPromises = workers.map((worker) => + const workerReleasePromises = workers.map((worker) => { // Note force=true means that this completes immediately _except_ // it still calls 
the `stopWorker` async hook, so we must still // handle a timeout. - Promise.race([worker.release(true), timeout]), - ); + return Promise.race([worker.release(true), timeout]); + }); // Ignore the results, we're shutting down anyway const [deactivateResult, ...workerReleaseResults] = await Promise.allSettled([ deactiveateOrTimeout, - ...workerPromises, + ...workerReleasePromises, ]); if (deactivateResult.status === "rejected") { // Log but continue regardless From 5a75d769ecd45395d1e04b928e59e8aa8f12e55a Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Mon, 9 Dec 2024 17:35:02 +0000 Subject: [PATCH 122/155] Give 'batch' retry logic with backpressure --- src/lib.ts | 24 +++++++++ src/localQueue.ts | 39 +++++++------- src/main.ts | 127 +++++++++++++++++++++++++++++++++++++--------- 3 files changed, 147 insertions(+), 43 deletions(-) diff --git a/src/lib.ts b/src/lib.ts index d2702943..cc3e5877 100644 --- a/src/lib.ts +++ b/src/lib.ts @@ -597,3 +597,27 @@ export function isPromiseLike(v: PromiseLike | T): v is PromiseLike { // eslint-disable-next-line @typescript-eslint/no-explicit-any return v != null && typeof (v as any).then === "function"; } + +export interface RetryOptions { + maxAttempts: number; + /** Minimum delay between attempts (milliseconds); can actually be half this due to jitter */ + minDelay: number; + /** Maximum delay between attempts (milliseconds) - can actually be 1.5x this due to jitter */ + maxDelay: number; + /** `multiplier ^ attempts` */ + multiplier: number; +} + +export function calculateDelay( + previousAttempts: number, + retryOptions: RetryOptions, +) { + const { minDelay = 200, maxDelay = 30_000, multiplier = 1.5 } = retryOptions; + /** Prevent the thundering herd problem by offsetting randomly */ + const jitter = Math.random(); + + return ( + Math.min(minDelay * Math.pow(multiplier, previousAttempts), maxDelay) * + (0.5 + jitter) + ); +} diff --git a/src/localQueue.ts b/src/localQueue.ts index 8fea5053..a86e2ccb 100644 --- 
a/src/localQueue.ts +++ b/src/localQueue.ts @@ -10,10 +10,17 @@ import { import { MINUTE, SECOND } from "./cronConstants"; import defer, { Deferred } from "./deferred"; import { GetJobFunction, Job, TaskList, WorkerPool } from "./interfaces"; -import { coerceError, sleep } from "./lib"; +import { calculateDelay, coerceError, RetryOptions, sleep } from "./lib"; import { batchGetJobs } from "./sql/getJobs"; import { returnJobs } from "./sql/returnJobs"; +const RETURN_JOBS_RETRY_OPTIONS: RetryOptions = { + maxAttempts: 20, + minDelay: 200, + maxDelay: 30_000, + multiplier: 1.5, +}; + const { STARTING, POLLING, WAITING, TTL_EXPIRED, RELEASED } = LocalQueueModes; /** @@ -433,15 +440,19 @@ export class LocalQueue { let attempts = 1; let initialError: Error; - const MAX_ATTEMPTS = 20; + const { maxAttempts } = RETURN_JOBS_RETRY_OPTIONS; const onError = (e: unknown): void | Promise => { if (attempts === 1) { initialError = coerceError(e); } this.compiledSharedOptions.logger.error( - `Failed to return jobs from local queue to database queue (attempt ${attempts}/${MAX_ATTEMPTS})`, - { error: e, attempts, maxAttempts: MAX_ATTEMPTS }, + `Failed to return jobs from local queue to database queue (attempt ${attempts}/${maxAttempts})`, + { + error: e, + attempts, + returnJobsRetryOptions: RETURN_JOBS_RETRY_OPTIONS, + }, ); // NOTE: the mode now may not be the mode that we were in when @@ -457,21 +468,11 @@ export class LocalQueue { // NOTE: considered doing `this.receivedJobs(jobsToReturn)`; but I // simply trying to release them again seems safer and more correct. 
default: { - if (attempts < MAX_ATTEMPTS) { - /** Minimum delay between attempts (milliseconds); can actually be half this due to jitter */ - const minDelay = 200; - /** Maximum delay between attempts (milliseconds) - can actually be 1.5x this due to jitter */ - const maxDelay = 30_000; // Maximum delay in milliseconds - /** `multiplier ^ attempts` */ - const multiplier = 1.5; - /** Prevent the thundering herd problem by offsetting randomly */ - const jitter = Math.random(); - const delay = - Math.min( - minDelay * Math.pow(multiplier, attempts - 1), - maxDelay, - ) * - (0.5 + jitter); + if (attempts < maxAttempts) { + const delay = calculateDelay( + attempts - 1, + RETURN_JOBS_RETRY_OPTIONS, + ); // Be sure to increment attempts to avoid infinite loop! ++attempts; diff --git a/src/main.ts b/src/main.ts index 1c3d9e7e..c627e8e8 100644 --- a/src/main.ts +++ b/src/main.ts @@ -4,7 +4,7 @@ import { EventEmitter } from "events"; import { Notification, Pool, PoolClient } from "pg"; import { inspect } from "util"; -import defer from "./deferred"; +import defer, { Deferred } from "./deferred"; import { makeWithPgClientFromClient, makeWithPgClientFromPool, @@ -23,10 +23,13 @@ import { WorkerPoolOptions, } from "./interfaces"; import { + calculateDelay, coerceError, CompiledSharedOptions, makeEnhancedWithPgClient, processSharedOptions, + RetryOptions, + sleep, tryParseJson, } from "./lib"; import { LocalQueue } from "./localQueue"; @@ -38,6 +41,13 @@ import { batchGetJobs } from "./sql/getJobs"; import { resetLockedAt } from "./sql/resetLockedAt"; import { makeNewWorker } from "./worker"; +const BATCH_RETRY_OPTIONS: RetryOptions = { + maxAttempts: 20, + minDelay: 200, + maxDelay: 30_000, + multiplier: 1.5, +}; + const ENABLE_DANGEROUS_LOGS = process.env.GRAPHILE_ENABLE_DANGEROUS_LOGS === "1"; const NO_LOG_SUCCESS = !!process.env.NO_LOG_SUCCESS; @@ -1086,9 +1096,7 @@ export function _runTaskList( workerPool, }); logger.debug("Forceful shutdown complete"); - return { - 
forceFailedJobs, - }; + return { forceFailedJobs }; } catch (e) { events.emit("pool:forcefulShutdown:error", { pool: workerPool, @@ -1195,6 +1203,7 @@ export function _runTaskList( const { release: releaseCompleteJob, fn: completeJob } = ( completeJobBatchDelay >= 0 ? batch( + "completeJobs", completeJobBatchDelay, (jobs) => batchCompleteJobs( @@ -1217,6 +1226,7 @@ export function _runTaskList( ); workerPool.gracefulShutdown(); }, + BATCH_RETRY_OPTIONS, ) : { release: null, @@ -1233,6 +1243,7 @@ export function _runTaskList( const { release: releaseFailJob, fn: failJob } = ( failJobBatchDelay >= 0 ? batch( + "failJobs", failJobBatchDelay, (specs) => batchFailJobs( @@ -1255,6 +1266,7 @@ export function _runTaskList( ); workerPool.gracefulShutdown(); }, + BATCH_RETRY_OPTIONS, ) : { release: null, @@ -1359,16 +1371,21 @@ export const runTaskListOnce = ( return pool; }; +/** + * On error we'll retry according to retryOptions. + */ function batch( + opName: string, delay: number, - callback: (specs: ReadonlyArray) => Promise, + rawCallback: (specs: ReadonlyArray) => Promise, errorHandler: ( error: unknown, specs: ReadonlyArray, ) => void | Promise, + retryOptions?: RetryOptions, ): { release(): void | Promise; - fn: (spec: TSpec) => void; + fn: (spec: TSpec) => void | Promise; } { let pending = 0; let releasing = false; @@ -1384,7 +1401,84 @@ function batch( } }; const promise = defer(); + + let backpressure: Deferred | null = null; + function holdup() { + if (!backpressure) { + incrementPending(); + backpressure = defer(); + } + } + function allgood() { + if (backpressure) { + backpressure.resolve(); + // Bump a tick to give the things held up by backpressure a chance to register. + process.nextTick(decrementPending); + } + } + + const callback = retryOptions + ? 
async (specs: ReadonlyArray): Promise => { + let lastError: Error | undefined; + for ( + let previousAttempts = 0; + previousAttempts < retryOptions.maxAttempts; + previousAttempts++ + ) { + if (previousAttempts > 0) { + const delay = calculateDelay(previousAttempts - 1, retryOptions); + console.error( + `${opName}: attempt ${previousAttempts}/${ + retryOptions.maxAttempts + } failed; retrying after ${delay.toFixed( + 0, + )}ms. Error: ${lastError}`, + ); + await sleep(delay); + } + try { + const result = await rawCallback(specs); + // We succeeded - remove backpressure. + allgood(); + return result; + } catch (e) { + // Tell other callers to wait until we're successful again (i.e. apply backpressure) + holdup(); + lastError = coerceError(e); + throw e; + } + } + throw ( + lastError ?? + new Error(`Failed after ${retryOptions.maxAttempts} attempts`) + ); + } + : rawCallback; + let currentBatch: TSpec[] | null = null; + function handleSpec(spec: TSpec) { + if (released) { + throw new Error( + "This batcher has been released, and so no more calls can be made.", + ); + } + if (currentBatch !== null) { + currentBatch.push(spec); + } else { + const specs = [spec]; + currentBatch = specs; + incrementPending(); + setTimeout(() => { + currentBatch = null; + callback(specs).then(decrementPending, (error) => { + decrementPending(); + errorHandler(error, specs); + allgood(); + }); + }, delay); + } + return; + } return { async release() { if (releasing) { @@ -1398,26 +1492,11 @@ function batch( await promise; }, fn(spec) { - if (released) { - throw new Error( - "This batcher has been released, and so no more calls can be made.", - ); - } - if (currentBatch !== null) { - currentBatch.push(spec); + if (backpressure) { + return backpressure.then(() => handleSpec(spec)); } else { - const specs = [spec]; - currentBatch = specs; - incrementPending(); - setTimeout(() => { - currentBatch = null; - callback(specs).then(decrementPending, (error) => { - decrementPending(); - 
errorHandler(error, specs); - }); - }, delay); + return handleSpec(spec); } - return; }, }; } From eb9c69de4689365c3ca9e6187c25edadc0730095 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Mon, 9 Dec 2024 17:52:40 +0000 Subject: [PATCH 123/155] Reason for internal shutdown trigger --- src/main.ts | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/main.ts b/src/main.ts index c627e8e8..ca60e092 100644 --- a/src/main.ts +++ b/src/main.ts @@ -1224,7 +1224,11 @@ export function _runTaskList( .join("', '")}':\n${String(error)}`, { fatalError: error, jobs }, ); - workerPool.gracefulShutdown(); + // This is the reason for shutdown + _finErrors.push(coerceError(error)); + workerPool.gracefulShutdown( + `Could not completeJobs; queue is in an inconsistent state; aborting.`, + ); }, BATCH_RETRY_OPTIONS, ) @@ -1264,7 +1268,11 @@ export function _runTaskList( .join("', '")}':\n${String(error)}`, { fatalError: error, specs }, ); - workerPool.gracefulShutdown(); + // This is the reason for shutdown + _finErrors.push(coerceError(error)); + workerPool.gracefulShutdown( + `Could not failJobs; queue is in an inconsistent state; aborting.`, + ); }, BATCH_RETRY_OPTIONS, ) From d7693bc6a0be7acb09917b903c83632d77a51d3f Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Mon, 9 Dec 2024 17:54:07 +0000 Subject: [PATCH 124/155] Only trigger shutdown if not already shutting down --- src/main.ts | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/main.ts b/src/main.ts index ca60e092..b6388fe6 100644 --- a/src/main.ts +++ b/src/main.ts @@ -1224,11 +1224,13 @@ export function _runTaskList( .join("', '")}':\n${String(error)}`, { fatalError: error, jobs }, ); - // This is the reason for shutdown - _finErrors.push(coerceError(error)); - workerPool.gracefulShutdown( - `Could not completeJobs; queue is in an inconsistent state; aborting.`, - ); + if (!_shuttingDownGracefully && !_shuttingDownForcefully) { + // This is the 
reason for shutdown + _finErrors.push(coerceError(error)); + workerPool.gracefulShutdown( + `Could not completeJobs; queue is in an inconsistent state; aborting.`, + ); + } }, BATCH_RETRY_OPTIONS, ) @@ -1268,11 +1270,13 @@ export function _runTaskList( .join("', '")}':\n${String(error)}`, { fatalError: error, specs }, ); - // This is the reason for shutdown - _finErrors.push(coerceError(error)); - workerPool.gracefulShutdown( - `Could not failJobs; queue is in an inconsistent state; aborting.`, - ); + if (!_shuttingDownGracefully && !_shuttingDownForcefully) { + // This is the reason for shutdown + _finErrors.push(coerceError(error)); + workerPool.gracefulShutdown( + `Could not failJobs; queue is in an inconsistent state; aborting.`, + ); + } }, BATCH_RETRY_OPTIONS, ) From 53f458067585b54207a826c08734ffb6afdffb69 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Mon, 9 Dec 2024 18:21:55 +0000 Subject: [PATCH 125/155] Trigger gracefulShutdown on forcefulShutdown --- src/main.ts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/main.ts b/src/main.ts index b6388fe6..a4c0fe5f 100644 --- a/src/main.ts +++ b/src/main.ts @@ -953,6 +953,9 @@ export function _runTaskList( ); return forcefulShutdownPromise!; } + if (!workerPool._shuttingDown) { + Promise.resolve(this.gracefulShutdown()).then(null, () => {}); + } workerPool._forcefulShuttingDown = true; forcefulShutdownPromise = middleware.run( From 6d344f0c6e6170617655b3be228e3cfbc3220857 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Mon, 9 Dec 2024 18:22:14 +0000 Subject: [PATCH 126/155] Use correct variable --- src/main.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main.ts b/src/main.ts index a4c0fe5f..48e7000f 100644 --- a/src/main.ts +++ b/src/main.ts @@ -1227,7 +1227,7 @@ export function _runTaskList( .join("', '")}':\n${String(error)}`, { fatalError: error, jobs }, ); - if (!_shuttingDownGracefully && !_shuttingDownForcefully) { + if (!workerPool._shuttingDown) { // This is 
the reason for shutdown _finErrors.push(coerceError(error)); workerPool.gracefulShutdown( @@ -1273,7 +1273,7 @@ export function _runTaskList( .join("', '")}':\n${String(error)}`, { fatalError: error, specs }, ); - if (!_shuttingDownGracefully && !_shuttingDownForcefully) { + if (!workerPool._shuttingDown) { // This is the reason for shutdown _finErrors.push(coerceError(error)); workerPool.gracefulShutdown( From f43f47f82a49f979466f44328f94f919c88340e4 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Mon, 9 Dec 2024 18:25:16 +0000 Subject: [PATCH 127/155] Give users the ability to change how premature worker shutdown is handled --- src/index.ts | 11 ++++++++++ src/main.ts | 59 ++++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 54 insertions(+), 16 deletions(-) diff --git a/src/index.ts b/src/index.ts index 2a02e529..9da74472 100644 --- a/src/index.ts +++ b/src/index.ts @@ -93,6 +93,13 @@ declare global { workerPool: WorkerPool; message: string; } + + interface PoolWorkerPrematureExitEvent { + ctx: WorkerPluginContext; + workerPool: WorkerPool; + worker: Worker; + replaceWithNewWorker(): void; + } } namespace GraphileConfig { @@ -363,6 +370,10 @@ declare global { poolForcefulShutdown( event: GraphileWorker.PoolForcefulShutdownEvent, ): ReturnType; + + poolWorkerPrematureExit( + event: GraphileWorker.PoolWorkerPrematureExitEvent, + ): void; } interface WorkerHooks { diff --git a/src/main.ts b/src/main.ts index 48e7000f..ea308dd1 100644 --- a/src/main.ts +++ b/src/main.ts @@ -1292,7 +1292,7 @@ export function _runTaskList( } ) as { release: (() => void) | null; fn: FailJobFunction }; - for (let i = 0; i < concurrency; i++) { + const createNewWorkerInPool = () => { const worker = makeNewWorker(compiledSharedOptions, { tasks, withPgClient, @@ -1308,24 +1308,48 @@ export function _runTaskList( }); workerPool._workers.push(worker); const remove = () => { + // Remove worker from the pool + workerPool._workers.splice(workerPool._workers.indexOf(worker), 
1); if (continuous && workerPool._active && !workerPool._shuttingDown) { - // TODO: user should choose how to handle this, maybe via a middleware: - // - graceful shutdown (implemented) - // - forceful shutdown (probably best after a delay) - // - boot up a replacement worker - /* middleware.run("poolWorkerPrematureExit", {}, () => { */ logger.error( `Worker exited, but pool is in continuous mode, is active, and is not shutting down... Did something go wrong?`, ); - _finErrors.push( - new Error(`Worker ${worker.workerId} exited unexpectedly`), - ); - workerPool.gracefulShutdown( - "Something went wrong, one of the workers exited prematurely. Shutting down.", - ); - /* }) */ + try { + let called = false; + const replaceWithNewWorker = () => { + if (called) { + // Ignore additional calls + return; + } + called = true; + createNewWorkerInPool(); + }; + + // Allows user to choose how to handle this; for example: + // - graceful shutdown (default behavior) + // - forceful shutdown (probably best after a delay?) + // - boot up a replacement worker via `createNewWorker` + middleware.runSync( + "poolWorkerPrematureExit", + { + ctx: compiledSharedOptions, + workerPool, + worker, + replaceWithNewWorker, + }, + () => { + throw new Error(`Worker ${worker.workerId} exited unexpectedly`); + }, + ); + } catch (e) { + if (!workerPool._shuttingDown) { + _finErrors.push(coerceError(e)); + workerPool.gracefulShutdown( + "Something went wrong, one of the workers exited prematurely. 
Shutting down.", + ); + } + } } - workerPool._workers.splice(workerPool._workers.indexOf(worker), 1); if (workerPool._workers.length === 0) { if (!workerPool._shuttingDown) { workerPool.gracefulShutdown( @@ -1344,9 +1368,12 @@ export function _runTaskList( logger.error(`Worker exited with error: ${error}`, { error }); }, ); - } + return worker; + }; - // TODO: handle when a worker shuts down (spawn a new one) + for (let i = 0; i < concurrency; i++) { + createNewWorkerInPool(); + } return workerPool; } From 9c411ddee61e3dd769307fa006e3473c889b720d Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Mon, 9 Dec 2024 18:26:53 +0000 Subject: [PATCH 128/155] Refactor --- src/main.ts | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/main.ts b/src/main.ts index ea308dd1..e4a9a392 100644 --- a/src/main.ts +++ b/src/main.ts @@ -557,6 +557,7 @@ export function _runTaskList( onTerminate?: () => Promise | void; }, ): WorkerPool { + const ctx = compiledSharedOptions; const { resolvedPreset: { worker: { @@ -799,7 +800,7 @@ export function _runTaskList( workerPool._shuttingDown = true; gracefulShutdownPromise = middleware.run( "poolGracefulShutdown", - { ctx: compiledSharedOptions, workerPool, message }, + { ctx, workerPool, message }, async ({ message }) => { events.emit("pool:gracefulShutdown", { pool: workerPool, @@ -960,7 +961,7 @@ export function _runTaskList( workerPool._forcefulShuttingDown = true; forcefulShutdownPromise = middleware.run( "poolForcefulShutdown", - { ctx: compiledSharedOptions, workerPool: this, message }, + { ctx, workerPool, message }, async ({ message }) => { events.emit("pool:forcefulShutdown", { pool: workerPool, @@ -1314,6 +1315,7 @@ export function _runTaskList( logger.error( `Worker exited, but pool is in continuous mode, is active, and is not shutting down... 
Did something go wrong?`, ); + try { let called = false; const replaceWithNewWorker = () => { @@ -1331,12 +1333,7 @@ export function _runTaskList( // - boot up a replacement worker via `createNewWorker` middleware.runSync( "poolWorkerPrematureExit", - { - ctx: compiledSharedOptions, - workerPool, - worker, - replaceWithNewWorker, - }, + { ctx, workerPool, worker, replaceWithNewWorker }, () => { throw new Error(`Worker ${worker.workerId} exited unexpectedly`); }, From 37ea0136f47b834bc4597d14791b11ef5ae1e2f5 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Mon, 9 Dec 2024 18:31:30 +0000 Subject: [PATCH 129/155] Clarify message --- src/main.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/main.ts b/src/main.ts index e4a9a392..00eda59a 100644 --- a/src/main.ts +++ b/src/main.ts @@ -1350,7 +1350,9 @@ export function _runTaskList( if (workerPool._workers.length === 0) { if (!workerPool._shuttingDown) { workerPool.gracefulShutdown( - "'Run once' mode processed all available jobs and is now exiting", + continuous + ? 
"There are no remaining workers; exiting" + : "'Run once' mode processed all available jobs and is now exiting", ); } } From a5ff74dc7e444d3198804281f96aa387871d9d95 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Mon, 9 Dec 2024 18:39:09 +0000 Subject: [PATCH 130/155] Give pool error handlers --- towerDefence/run.mjs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/towerDefence/run.mjs b/towerDefence/run.mjs index 944ac742..350d91d8 100755 --- a/towerDefence/run.mjs +++ b/towerDefence/run.mjs @@ -55,6 +55,8 @@ const spawnOptions = { }; const pgPool = new pg.Pool({ connectionString: process.env.PERF_DATABASE_URL }); +pgPool.on("error", () => {}); +pgPool.on("connect", (client) => void client.on("error", () => {})); const GENERAL_JOBS_PER_SECOND = 15000; const GENERAL_JOBS_PER_MILLISECOND = GENERAL_JOBS_PER_SECOND / 1000; From 01a083f49276988932f3beacfc993b34395c615f Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 10 Dec 2024 15:51:23 +0000 Subject: [PATCH 131/155] Comments --- src/index.ts | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/src/index.ts b/src/index.ts index 60b5feea..6f431c9b 100644 --- a/src/index.ts +++ b/src/index.ts @@ -48,15 +48,18 @@ declare global { } interface BootstrapEvent { ctx: WorkerPluginContext; + /** * The client used to perform the bootstrap. Replacing this is not officially * supported, but... */ client: PoolClient; + /** * The Postgres version number, e.g. 120000 for PostgreSQL 12.0 */ readonly postgresVersion: number; + /** * Somewhere to store temporary data from plugins, only used during * bootstrap and migrate @@ -66,6 +69,7 @@ declare global { interface MigrateEvent { ctx: WorkerPluginContext; + /** * The client used to run the migration. Replacing this is not officially * supported, but... @@ -100,6 +104,14 @@ declare global { ctx: WorkerPluginContext; workerPool: WorkerPool; worker: Worker; + /** + * Use this to spin up a new Worker in place of the old one that failed. 
+ * Generally a Worker fails due to some underlying network or database + * issue, and just spinning up a new one in its place may simply mask the + * issue, so this is not recommended. + * + * Only the first call to this method (per event) will have any effect. + */ replaceWithNewWorker(): void; } } @@ -250,6 +262,11 @@ declare global { */ events?: WorkerEvents; + /** + * If you're running in high concurrency, you will likely want to reduce + * the load on the database by using a local queue to distribute jobs to + * workers rather than having each ask the database directly. + */ localQueue?: { /** * To enable processing jobs in batches, set this to an integer larger @@ -369,10 +386,12 @@ declare global { } interface Preset { + /** Options for Graphile Worker */ worker?: WorkerOptions; } interface Plugin { + /** Plugin hooks and middleware for Graphile Worker */ worker?: { middleware?: MiddlewareHandlers; @@ -412,14 +431,28 @@ declare global { */ migrate(event: GraphileWorker.MigrateEvent): PromiseOrDirect; + /** + * Called when performing a graceful shutdown on a WorkerPool. + */ poolGracefulShutdown( event: GraphileWorker.PoolGracefulShutdownEvent, ): ReturnType; + /** + * Called when performing a forceful shutdown on a WorkerPool. + */ poolForcefulShutdown( event: GraphileWorker.PoolForcefulShutdownEvent, ): ReturnType; + /** + * Called when a Worker inside a WorkerPool exits unexpectedly; + * allows user to choose how to handle this; for example: + * + * - graceful shutdown (default behavior) + * - forceful shutdown (probably best after a delay?) 
+ * - boot up a replacement worker via `createNewWorker` + */ poolWorkerPrematureExit( event: GraphileWorker.PoolWorkerPrematureExitEvent, ): void; From 0bd0b77975e0e06c00d002ea60531cb6120f8836 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 10 Dec 2024 16:15:09 +0000 Subject: [PATCH 132/155] Use new retry delay calculator --- src/lib.ts | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/lib.ts b/src/lib.ts index cc3e5877..22625744 100644 --- a/src/lib.ts +++ b/src/lib.ts @@ -535,11 +535,11 @@ export function tryParseJson( /** @see {@link https://www.postgresql.org/docs/current/mvcc-serialization-failure-handling.html} */ const RETRYABLE_ERROR_CODES = [ - { code: "40001", backoffMS: 50 }, // serialization_failure - { code: "40P01", backoffMS: 50 }, // deadlock_detected - { code: "57P03", backoffMS: 3000 }, // cannot_connect_now - { code: "EHOSTUNREACH", backoffMS: 3000 }, // no connection to the server - { code: "ETIMEDOUT", backoffMS: 3000 }, // timeout + { code: "40001", minDelay: 50, maxDelay: 5_000 }, // serialization_failure + { code: "40P01", minDelay: 50, maxDelay: 5_000 }, // deadlock_detected + { code: "57P03", minDelay: 3000, maxDelay: 120_000 }, // cannot_connect_now + { code: "EHOSTUNREACH", minDelay: 3000, maxDelay: 120_000 }, // no connection to the server + { code: "ETIMEDOUT", minDelay: 3000, maxDelay: 120_000 }, // timeout ]; const MAX_RETRIES = 100; @@ -565,8 +565,14 @@ export function makeEnhancedWithPgClient( ); if (retryable) { lastError = e; + const delay = calculateDelay(attempts, { + maxAttempts: MAX_RETRIES, + minDelay: retryable.minDelay, + maxDelay: retryable.maxDelay, + multiplier: 1.5, + }); // Try again in backoffMS - await sleep(retryable.backoffMS * Math.sqrt(attempts + 1)); + await sleep(delay); } else { throw e; } From c695bcc330a4047d513b9e95ebad3bf8982d728b Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 10 Dec 2024 16:22:26 +0000 Subject: [PATCH 133/155] Ensure errors are 
escalated to batch() so backpressure applies --- src/interfaces.ts | 4 +--- src/main.ts | 15 +++++++++------ src/sql/completeJobs.ts | 13 ++++++------- src/sql/failJobs.ts | 10 ++++------ 4 files changed, 20 insertions(+), 22 deletions(-) diff --git a/src/interfaces.ts b/src/interfaces.ts index 75b34b50..52e8b40f 100644 --- a/src/interfaces.ts +++ b/src/interfaces.ts @@ -39,9 +39,7 @@ export interface WithPgClient { export interface EnhancedWithPgClient extends WithPgClient { /** **Experimental**; see https://github.com/graphile/worker/issues/387 */ - withRetries: ( - callback: (pgClient: PoolClient) => Promise, - ) => Promise; + withRetries: WithPgClient; } /** diff --git a/src/main.ts b/src/main.ts index 00eda59a..c163f572 100644 --- a/src/main.ts +++ b/src/main.ts @@ -1212,7 +1212,7 @@ export function _runTaskList( (jobs) => batchCompleteJobs( compiledSharedOptions, - withPgClient, + withPgClient, // batch handles retries and adds backpressure workerPool.id, jobs, ), @@ -1243,7 +1243,7 @@ export function _runTaskList( fn: (job) => batchCompleteJobs( compiledSharedOptions, - withPgClient, + withPgClient.withRetries, workerPool.id, [job], ), @@ -1258,7 +1258,7 @@ export function _runTaskList( (specs) => batchFailJobs( compiledSharedOptions, - withPgClient, + withPgClient, // batch handles retries and adds backpressure workerPool.id, specs, ), @@ -1287,9 +1287,12 @@ export function _runTaskList( : { release: null, fn: (spec) => - batchFailJobs(compiledSharedOptions, withPgClient, workerPool.id, [ - spec, - ]), + batchFailJobs( + compiledSharedOptions, + withPgClient.withRetries, + workerPool.id, + [spec], + ), } ) as { release: (() => void) | null; fn: FailJobFunction }; diff --git a/src/sql/completeJobs.ts b/src/sql/completeJobs.ts index 3154a161..8e861aaa 100644 --- a/src/sql/completeJobs.ts +++ b/src/sql/completeJobs.ts @@ -1,11 +1,11 @@ -import { DbJob, EnhancedWithPgClient } from "../interfaces"; +import { DbJob, WithPgClient } from "../interfaces"; import { 
CompiledSharedOptions } from "../lib"; const manualPrepare = false; export async function batchCompleteJobs( compiledSharedOptions: CompiledSharedOptions, - withPgClient: EnhancedWithPgClient, + withPgClient: WithPgClient, poolId: string, jobs: ReadonlyArray, ): Promise { @@ -27,9 +27,8 @@ export async function batchCompleteJobs( } } - // TODO: retry logic, in case of server connection interruption if (jobIdsWithQueue.length > 0) { - await withPgClient.withRetries((client) => + await withPgClient((client) => client.query({ text: `\ with j as ( @@ -50,7 +49,7 @@ where job_queues.id = j.job_queue_id and job_queues.locked_by = $2::text;`, ); } if (jobIdsWithoutQueue.length === 1) { - await withPgClient.withRetries((client) => + await withPgClient((client) => client.query({ text: `\ delete from ${escapedWorkerSchema}._private_jobs as jobs @@ -61,7 +60,7 @@ where id = $1::bigint`, ); } else if (jobIdsWithoutQueue.length > 1) { if (manualPrepare) { - await withPgClient.withRetries((client) => + await withPgClient((client) => client.query({ text: `\ prepare gwcj (bigint) as delete from ${escapedWorkerSchema}._private_jobs where id = $1; @@ -70,7 +69,7 @@ deallocate gwcj;`, }), ); } else { - await withPgClient.withRetries((client) => + await withPgClient((client) => client.query({ text: `\ delete from ${escapedWorkerSchema}._private_jobs as jobs diff --git a/src/sql/failJobs.ts b/src/sql/failJobs.ts index a459c7af..96bc1d00 100644 --- a/src/sql/failJobs.ts +++ b/src/sql/failJobs.ts @@ -1,4 +1,4 @@ -import { DbJob, EnhancedWithPgClient } from "../interfaces"; +import { DbJob, EnhancedWithPgClient, WithPgClient } from "../interfaces"; import { CompiledSharedOptions } from "../lib"; interface Spec { job: DbJob; @@ -8,7 +8,7 @@ interface Spec { export async function batchFailJobs( compiledSharedOptions: CompiledSharedOptions, - withPgClient: EnhancedWithPgClient, + withPgClient: WithPgClient, poolId: string, specs: ReadonlyArray, ): Promise { @@ -31,9 +31,8 @@ export async 
function batchFailJobs( } } - // TODO: retry logic, in case of server connection interruption if (specsWithQueues.length > 0) { - await withPgClient.withRetries((client) => + await withPgClient((client) => client.query({ text: `\ with j as ( @@ -67,7 +66,7 @@ where job_queues.id = j.job_queue_id and job_queues.locked_by = $1::text;`, ); } if (specsWithoutQueues.length > 0) { - await withPgClient.withRetries((client) => + await withPgClient((client) => client.query({ text: `\ update ${escapedWorkerSchema}._private_jobs as jobs @@ -110,7 +109,6 @@ export async function failJobs( }, } = compiledSharedOptions; - // TODO: retry logic, in case of server connection interruption const { rows: failedJobs } = await withPgClient.withRetries((client) => client.query({ text: `\ From ad686d9be710d751bc971c4f8279c1cd9a6906e6 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 10 Dec 2024 16:29:06 +0000 Subject: [PATCH 134/155] Refactor --- src/lib.ts | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/lib.ts b/src/lib.ts index 22625744..8388455d 100644 --- a/src/lib.ts +++ b/src/lib.ts @@ -534,13 +534,20 @@ export function tryParseJson( } /** @see {@link https://www.postgresql.org/docs/current/mvcc-serialization-failure-handling.html} */ -const RETRYABLE_ERROR_CODES = [ - { code: "40001", minDelay: 50, maxDelay: 5_000 }, // serialization_failure - { code: "40P01", minDelay: 50, maxDelay: 5_000 }, // deadlock_detected - { code: "57P03", minDelay: 3000, maxDelay: 120_000 }, // cannot_connect_now - { code: "EHOSTUNREACH", minDelay: 3000, maxDelay: 120_000 }, // no connection to the server - { code: "ETIMEDOUT", minDelay: 3000, maxDelay: 120_000 }, // timeout -]; +export const RETRYABLE_ERROR_CODES: Record< + string, + Omit | undefined +> = { + // @ts-ignore + __proto__: null, + + "40001": { minDelay: 50, maxDelay: 5_000 }, // serialization_failure + "40P01": { minDelay: 50, maxDelay: 5_000 }, // deadlock_detected + "57P03": { 
minDelay: 3000, maxDelay: 120_000 }, // cannot_connect_now + EHOSTUNREACH: { minDelay: 3000, maxDelay: 120_000 }, // no connection to the server + ETIMEDOUT: { minDelay: 3000, maxDelay: 120_000 }, // timeout +}; + const MAX_RETRIES = 100; export function makeEnhancedWithPgClient( @@ -560,9 +567,7 @@ export function makeEnhancedWithPgClient( return await withPgClient(...args); } catch (rawE) { const e = coerceError(rawE); - const retryable = RETRYABLE_ERROR_CODES.find( - ({ code }) => code === e.code, - ); + const retryable = RETRYABLE_ERROR_CODES[e.code as string]; if (retryable) { lastError = e; const delay = calculateDelay(attempts, { From e4545e1416c5410e6f3b08202ab1f801b35ef82d Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 10 Dec 2024 16:34:06 +0000 Subject: [PATCH 135/155] Use delays from retryables in batch --- src/main.ts | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/main.ts b/src/main.ts index c163f572..db616420 100644 --- a/src/main.ts +++ b/src/main.ts @@ -28,6 +28,7 @@ import { CompiledSharedOptions, makeEnhancedWithPgClient, processSharedOptions, + RETRYABLE_ERROR_CODES, RetryOptions, sleep, tryParseJson, @@ -1463,20 +1464,26 @@ function batch( const callback = retryOptions ? async (specs: ReadonlyArray): Promise => { - let lastError: Error | undefined; + let lastError: ReturnType | undefined; for ( let previousAttempts = 0; previousAttempts < retryOptions.maxAttempts; previousAttempts++ ) { if (previousAttempts > 0) { - const delay = calculateDelay(previousAttempts - 1, retryOptions); + const code = lastError?.code as string; + const retryable = RETRYABLE_ERROR_CODES[code]; + const delay = calculateDelay(previousAttempts - 1, { + ...retryOptions, + // NOTE: `retryable` might be undefined, in which case `retryOptions` wins + ...retryable, + }); console.error( `${opName}: attempt ${previousAttempts}/${ retryOptions.maxAttempts - } failed; retrying after ${delay.toFixed( - 0, - )}ms. 
Error: ${lastError}`, + } failed${ + code ? ` with code ${JSON.stringify(code)}` : `` + }; retrying after ${delay.toFixed(0)}ms. Error: ${lastError}`, ); await sleep(delay); } From 2b250e9d6aa10ae8b78cd6ae7afd8b7f53b75663 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 10 Dec 2024 16:40:33 +0000 Subject: [PATCH 136/155] Handle our own errors via onError --- src/localQueue.ts | 26 ++++++++++++++++++-------- src/sql/returnJobs.ts | 8 ++++---- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/src/localQueue.ts b/src/localQueue.ts index a86e2ccb..896c020a 100644 --- a/src/localQueue.ts +++ b/src/localQueue.ts @@ -10,7 +10,13 @@ import { import { MINUTE, SECOND } from "./cronConstants"; import defer, { Deferred } from "./deferred"; import { GetJobFunction, Job, TaskList, WorkerPool } from "./interfaces"; -import { calculateDelay, coerceError, RetryOptions, sleep } from "./lib"; +import { + calculateDelay, + coerceError, + RETRYABLE_ERROR_CODES, + RetryOptions, + sleep, +} from "./lib"; import { batchGetJobs } from "./sql/getJobs"; import { returnJobs } from "./sql/returnJobs"; @@ -442,8 +448,9 @@ export class LocalQueue { let initialError: Error; const { maxAttempts } = RETURN_JOBS_RETRY_OPTIONS; const onError = (e: unknown): void | Promise => { + const lastError = coerceError(e); if (attempts === 1) { - initialError = coerceError(e); + initialError = lastError; } this.compiledSharedOptions.logger.error( @@ -469,17 +476,20 @@ export class LocalQueue { // simply trying to release them again seems safer and more correct. 
default: { if (attempts < maxAttempts) { - const delay = calculateDelay( - attempts - 1, - RETURN_JOBS_RETRY_OPTIONS, - ); + const code = lastError?.code as string; + const retryable = RETRYABLE_ERROR_CODES[code]; + const delay = calculateDelay(attempts - 1, { + ...RETURN_JOBS_RETRY_OPTIONS, + // NOTE: `retryable` might be undefined, in which case `RETURN_JOBS_RETRY_OPTIONS` wins + ...retryable, + }); // Be sure to increment attempts to avoid infinite loop! ++attempts; return sleep(delay).then(() => returnJobs( this.compiledSharedOptions, - this.withPgClient, + this.withPgClient, // We'll handle the retries via onError this.workerPool.id, jobsToReturn, ).then(noop, onError), @@ -503,7 +513,7 @@ export class LocalQueue { this.background( returnJobs( this.compiledSharedOptions, - this.withPgClient, + this.withPgClient, // We'll handle the retries via onError this.workerPool.id, jobsToReturn, ).then( diff --git a/src/sql/returnJobs.ts b/src/sql/returnJobs.ts index 7b0e36f8..988c5f64 100644 --- a/src/sql/returnJobs.ts +++ b/src/sql/returnJobs.ts @@ -1,9 +1,9 @@ -import { DbJob, EnhancedWithPgClient } from "../interfaces"; +import { DbJob, WithPgClient } from "../interfaces"; import { CompiledSharedOptions } from "../lib"; export async function returnJobs( compiledSharedOptions: CompiledSharedOptions, - withPgClient: EnhancedWithPgClient, + withPgClient: WithPgClient, poolId: string, jobs: ReadonlyArray, ): Promise { @@ -27,7 +27,7 @@ export async function returnJobs( } if (jobsWithQueues.length > 0) { - await withPgClient.withRetries((client) => + await withPgClient((client) => client.query({ text: `\ with j as ( @@ -50,7 +50,7 @@ where job_queues.id = j.job_queue_id and job_queues.locked_by = $1::text;`, ); } if (jobsWithoutQueues.length > 0) { - await withPgClient.withRetries((client) => + await withPgClient((client) => client.query({ text: `\ update ${escapedWorkerSchema}._private_jobs as jobs From abf2463a99a2a30395d12d31d92a3f288bf4bf69 Mon Sep 17 00:00:00 2001 
From: Benjie Gillam Date: Tue, 10 Dec 2024 17:07:28 +0000 Subject: [PATCH 137/155] Move parallelism into config --- towerDefence/graphile.config.mjs | 3 ++- towerDefence/run.mjs | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/towerDefence/graphile.config.mjs b/towerDefence/graphile.config.mjs index 44184721..9ac71160 100644 --- a/towerDefence/graphile.config.mjs +++ b/towerDefence/graphile.config.mjs @@ -6,6 +6,7 @@ // import { WorkerProPreset } from "../graphile-pro-worker/dist/index.js"; const CONCURRENT_JOBS = 10; +export const PARALLELISM = 10; const stats = { fetches: 0, @@ -136,7 +137,7 @@ const preset = { size: CONCURRENT_JOBS + 1, refetchDelay: { durationMs: 1000, - maxAbortThreshold: CONCURRENT_JOBS * 10, + maxAbortThreshold: CONCURRENT_JOBS * PARALLELISM, }, }, completeJobBatchDelay: 0, diff --git a/towerDefence/run.mjs b/towerDefence/run.mjs index 350d91d8..8c4f2acf 100755 --- a/towerDefence/run.mjs +++ b/towerDefence/run.mjs @@ -4,9 +4,9 @@ import { execSync, spawn } from "child_process"; import pg from "pg"; import { makeWorkerUtils } from "../dist/index.js"; +import { PARALLELISM } from "./graphile.config.mjs"; const STUCK_JOB_COUNT = 0; -const PARALLELISM = 10; const WAVES = [ makeWave([1]), makeWave(new Array(1000).fill(1), 10), From 79d2b7252e876b54a527d59d7acf74db3f1f187e Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 10 Dec 2024 17:07:44 +0000 Subject: [PATCH 138/155] Have tasks sleep for 250ms --- towerDefence/run.mjs | 10 +++++++--- towerDefence/tasks/log_if_999.js | 4 +++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/towerDefence/run.mjs b/towerDefence/run.mjs index 8c4f2acf..fbe48108 100755 --- a/towerDefence/run.mjs +++ b/towerDefence/run.mjs @@ -4,7 +4,9 @@ import { execSync, spawn } from "child_process"; import pg from "pg"; import { makeWorkerUtils } from "../dist/index.js"; -import { PARALLELISM } from "./graphile.config.mjs"; +import config, { PARALLELISM } from "./graphile.config.mjs"; + 
+const CONCURRENCY = config.worker?.concurrentJobs ?? 1; const STUCK_JOB_COUNT = 0; const WAVES = [ @@ -48,17 +50,19 @@ const execOptions = { stdio: ["ignore", "ignore", "inherit"], }; -/** @type {import("child_process").CommonSpawnOptions} */ +/** @type {import("child_process").SpawnOptions} */ const spawnOptions = { env, stdio: ["ignore", "inherit", "inherit"], + detached: false, }; const pgPool = new pg.Pool({ connectionString: process.env.PERF_DATABASE_URL }); pgPool.on("error", () => {}); pgPool.on("connect", (client) => void client.on("error", () => {})); -const GENERAL_JOBS_PER_SECOND = 15000; +//const GENERAL_JOBS_PER_SECOND = 15000; +const GENERAL_JOBS_PER_SECOND = CONCURRENCY * PARALLELISM * (1000 / 250); const GENERAL_JOBS_PER_MILLISECOND = GENERAL_JOBS_PER_SECOND / 1000; /** @type {(jobBatches: number[], sleepDuration?: number) => (workerUtils: import("../dist/interfaces.js").WorkerUtils) => Promise} */ diff --git a/towerDefence/tasks/log_if_999.js b/towerDefence/tasks/log_if_999.js index a1bc70ec..dbfa58d6 100644 --- a/towerDefence/tasks/log_if_999.js +++ b/towerDefence/tasks/log_if_999.js @@ -1,4 +1,6 @@ -module.exports = ({ id }) => { +const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms)); +module.exports = async ({ id }) => { + await sleep(250); if (id === 999) { console.log("Found 999!"); } From 4905a93ec0b532f729ba2c00fd90913c10fa8009 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 10 Dec 2024 17:24:46 +0000 Subject: [PATCH 139/155] Centralize configuration of sleepTime --- towerDefence/run.mjs | 8 +++++++- towerDefence/tasks/log_if_999.js | 6 ++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/towerDefence/run.mjs b/towerDefence/run.mjs index fbe48108..beb4a4a9 100755 --- a/towerDefence/run.mjs +++ b/towerDefence/run.mjs @@ -61,8 +61,13 @@ const pgPool = new pg.Pool({ connectionString: process.env.PERF_DATABASE_URL }); pgPool.on("error", () => {}); pgPool.on("connect", (client) => void 
client.on("error", () => {})); +const SLEEP_TIME = 20; + //const GENERAL_JOBS_PER_SECOND = 15000; -const GENERAL_JOBS_PER_SECOND = CONCURRENCY * PARALLELISM * (1000 / 250); +const GENERAL_JOBS_PER_SECOND = Math.min( + 15000, + CONCURRENCY * PARALLELISM * (1000 / (SLEEP_TIME + 0.1)), +); const GENERAL_JOBS_PER_MILLISECOND = GENERAL_JOBS_PER_SECOND / 1000; /** @type {(jobBatches: number[], sleepDuration?: number) => (workerUtils: import("../dist/interfaces.js").WorkerUtils) => Promise} */ @@ -81,6 +86,7 @@ function makeWave(jobBatches, sleepDuration = -1) { identifier: taskIdentifier, payload: { id: i, + sleepTime: SLEEP_TIME, }, runAt: NOW, }); diff --git a/towerDefence/tasks/log_if_999.js b/towerDefence/tasks/log_if_999.js index dbfa58d6..bcda790d 100644 --- a/towerDefence/tasks/log_if_999.js +++ b/towerDefence/tasks/log_if_999.js @@ -1,7 +1,9 @@ const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms)); -module.exports = async ({ id }) => { - await sleep(250); +module.exports = ({ id, sleepTime }) => { if (id === 999) { console.log("Found 999!"); } + if (sleepTime) { + return sleep(sleepTime); + } }; From ba43590c0ed68859b9c7d84c1c7f646537374bdc Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 10 Dec 2024 17:45:15 +0000 Subject: [PATCH 140/155] Add a base sleep duration estimate --- towerDefence/run.mjs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/towerDefence/run.mjs b/towerDefence/run.mjs index beb4a4a9..02453ca2 100755 --- a/towerDefence/run.mjs +++ b/towerDefence/run.mjs @@ -7,6 +7,8 @@ import { makeWorkerUtils } from "../dist/index.js"; import config, { PARALLELISM } from "./graphile.config.mjs"; const CONCURRENCY = config.worker?.concurrentJobs ?? 
1; +/** How long each individual task sleeps for */ +const SLEEP_TIME = 50; const STUCK_JOB_COUNT = 0; const WAVES = [ @@ -61,8 +63,6 @@ const pgPool = new pg.Pool({ connectionString: process.env.PERF_DATABASE_URL }); pgPool.on("error", () => {}); pgPool.on("connect", (client) => void client.on("error", () => {})); -const SLEEP_TIME = 20; - //const GENERAL_JOBS_PER_SECOND = 15000; const GENERAL_JOBS_PER_SECOND = Math.min( 15000, @@ -71,7 +71,7 @@ const GENERAL_JOBS_PER_SECOND = Math.min( const GENERAL_JOBS_PER_MILLISECOND = GENERAL_JOBS_PER_SECOND / 1000; /** @type {(jobBatches: number[], sleepDuration?: number) => (workerUtils: import("../dist/interfaces.js").WorkerUtils) => Promise} */ -function makeWave(jobBatches, sleepDuration = -1) { +function makeWave(jobBatches, extraSleepDuration = -1) { return async (workerUtils) => { let totalCount = 0; let start = Date.now(); @@ -92,6 +92,9 @@ function makeWave(jobBatches, sleepDuration = -1) { }); } await workerUtils.addJobs(jobs); + const sleepDuration = + Math.floor((jobCount * SLEEP_TIME) / (CONCURRENCY * PARALLELISM)) + + extraSleepDuration; if (sleepDuration >= 0) { await sleep(sleepDuration); } From f6a688988e95ba896c40351b7a23075bf4f1efd8 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Tue, 10 Dec 2024 17:53:23 +0000 Subject: [PATCH 141/155] Set threshold to one less than the local queue size --- towerDefence/graphile.config.mjs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/towerDefence/graphile.config.mjs b/towerDefence/graphile.config.mjs index 9ac71160..e3bd3dbe 100644 --- a/towerDefence/graphile.config.mjs +++ b/towerDefence/graphile.config.mjs @@ -115,6 +115,8 @@ const TowerDefenceResultPlugin = { }, }; +const localQueueSize = CONCURRENT_JOBS + 1; + /** @type {GraphileConfig.Preset} */ const preset = { // extends: [WorkerProPreset], @@ -134,9 +136,10 @@ const preset = { pollInterval: 2000, localQueue: { - size: CONCURRENT_JOBS + 1, + size: localQueueSize, refetchDelay: { 
durationMs: 1000, + threshold: localQueueSize - 1, maxAbortThreshold: CONCURRENT_JOBS * PARALLELISM, }, }, From 698f5224cc058b08b1017e92b66becdcc01a711a Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Wed, 11 Dec 2024 10:39:22 +0000 Subject: [PATCH 142/155] 0.17.0-canary.f6a6889 --- package.json | 2 +- src/version.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/package.json b/package.json index ad5305d6..72ccbe35 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "graphile-worker", - "version": "0.17.0-canary.9817f67", + "version": "0.17.0-canary.f6a6889", "type": "commonjs", "description": "Job queue for PostgreSQL", "main": "dist/index.js", diff --git a/src/version.ts b/src/version.ts index 2b893a05..0a602ee2 100644 --- a/src/version.ts +++ b/src/version.ts @@ -1,2 +1,2 @@ // This file is autogenerated by /scripts/postversion.mjs -export const version = "0.17.0-canary.9817f67"; +export const version = "0.17.0-canary.f6a6889"; From 2ae2f2b581ba5d88c134fbf1a270c30698448b2b Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Wed, 11 Dec 2024 14:07:11 +0000 Subject: [PATCH 143/155] All event handlers should provide ctx --- src/cron.ts | 12 +++++- src/interfaces.ts | 102 +++++++++++++++++++++++++++++++++++++--------- src/localQueue.ts | 7 ++++ src/main.ts | 54 ++++++++++++++++-------- src/runner.ts | 4 +- src/worker.ts | 24 ++++++----- 6 files changed, 152 insertions(+), 51 deletions(-) diff --git a/src/cron.ts b/src/cron.ts index 4b6cfe4c..512e84eb 100644 --- a/src/cron.ts +++ b/src/cron.ts @@ -179,6 +179,7 @@ async function scheduleCronJobs( * performs backfilling on any crontab tasks that need it. 
*/ async function registerAndBackfillItems( + ctx: CompiledSharedOptions, { pgPool, events, cron }: { pgPool: Pool; events: WorkerEvents; cron: Cron }, escapedWorkerSchema: string, parsedCronItems: ParsedCronItem[], @@ -261,6 +262,7 @@ async function registerAndBackfillItems( // At this time it's not expected that backfilling will be sufficiently // expensive to justify optimising this further. events.emit("cron:backfill", { + ctx, cron, itemsToBackfill, timestamp: ts, @@ -338,11 +340,13 @@ export const runCron = ( } const start = new Date(); - events.emit("cron:starting", { cron, start }); + const ctx = compiledSharedOptions; + events.emit("cron:starting", { ctx, cron, start }); // We must backfill BEFORE scheduling any new jobs otherwise backfill won't // work due to known_crontabs.last_execution having been updated. await registerAndBackfillItems( + ctx, { pgPool, events, cron }, escapedWorkerSchema, parsedCronItems, @@ -350,7 +354,7 @@ export const runCron = ( useNodeTime, ); - events.emit("cron:started", { cron, start }); + events.emit("cron:started", { ctx, cron, start }); if (!cron._active) { return stop(); @@ -411,6 +415,7 @@ export const runCron = ( }, ); events.emit("cron:prematureTimer", { + ctx, cron, currentTimestamp, expectedTimestamp, @@ -427,6 +432,7 @@ export const runCron = ( )}s behind)`, ); events.emit("cron:overdueTimer", { + ctx, cron, currentTimestamp, expectedTimestamp, @@ -449,6 +455,7 @@ export const runCron = ( // Finally actually run the jobs. 
if (jobsAndIdentifiers.length) { events.emit("cron:schedule", { + ctx, cron, timestamp: expectedTimestamp, jobsAndIdentifiers, @@ -461,6 +468,7 @@ export const runCron = ( useNodeTime, ); events.emit("cron:scheduled", { + ctx, cron, timestamp: expectedTimestamp, jobsAndIdentifiers, diff --git a/src/interfaces.ts b/src/interfaces.ts index 52e8b40f..8bb2ffd1 100644 --- a/src/interfaces.ts +++ b/src/interfaces.ts @@ -900,23 +900,32 @@ export type WorkerEventMap = { /** * When a worker pool is created */ - "pool:create": { workerPool: WorkerPool }; + "pool:create": { ctx: WorkerPluginContext; workerPool: WorkerPool }; /** * When a worker pool attempts to connect to PG ready to issue a LISTEN * statement */ - "pool:listen:connecting": { workerPool: WorkerPool; attempts: number }; + "pool:listen:connecting": { + ctx: WorkerPluginContext; + workerPool: WorkerPool; + attempts: number; + }; /** * When a worker pool starts listening for jobs via PG LISTEN */ - "pool:listen:success": { workerPool: WorkerPool; client: PoolClient }; + "pool:listen:success": { + ctx: WorkerPluginContext; + workerPool: WorkerPool; + client: PoolClient; + }; /** * When a worker pool faces an error on their PG LISTEN client */ "pool:listen:error": { + ctx: WorkerPluginContext; workerPool: WorkerPool; error: unknown; }; @@ -925,6 +934,7 @@ export type WorkerEventMap = { * When a worker pool receives a notification */ "pool:listen:notification": { + ctx: WorkerPluginContext; workerPool: WorkerPool; message: Notification; client: PoolClient; @@ -934,6 +944,7 @@ export type WorkerEventMap = { * When a worker pool listening client is no longer available */ "pool:listen:release": { + ctx: WorkerPluginContext; workerPool: WorkerPool; /** If you use this client, be careful to handle errors - it may be in an invalid state (errored, disconnected, etc). 
*/ client: PoolClient; @@ -943,6 +954,7 @@ export type WorkerEventMap = { * When a worker pool fails to complete/fail a job */ "pool:fatalError": { + ctx: WorkerPluginContext; workerPool: WorkerPool; error: unknown; action: string; @@ -952,6 +964,7 @@ export type WorkerEventMap = { * When a worker pool is released */ "pool:release": { + ctx: WorkerPluginContext; /** @deprecated Use workerPool for consistency */ pool: WorkerPool; workerPool: WorkerPool; @@ -961,6 +974,7 @@ export type WorkerEventMap = { * When a worker pool starts a graceful shutdown */ "pool:gracefulShutdown": { + ctx: WorkerPluginContext; /** @deprecated Use workerPool for consistency */ pool: WorkerPool; workerPool: WorkerPool; @@ -971,6 +985,7 @@ export type WorkerEventMap = { * When a worker pool graceful shutdown throws an error */ "pool:gracefulShutdown:error": { + ctx: WorkerPluginContext; /** @deprecated Use workerPool for consistency */ pool: WorkerPool; workerPool: WorkerPool; @@ -982,6 +997,7 @@ export type WorkerEventMap = { * throws an error from release() */ "pool:gracefulShutdown:workerError": { + ctx: WorkerPluginContext; /** @deprecated Use workerPool for consistency */ pool: WorkerPool; workerPool: WorkerPool; @@ -993,6 +1009,7 @@ export type WorkerEventMap = { * When a worker pool graceful shutdown throws an error */ "pool:gracefulShutdown:complete": { + ctx: WorkerPluginContext; /** @deprecated Use workerPool for consistency */ pool: WorkerPool; workerPool: WorkerPool; @@ -1002,6 +1019,7 @@ export type WorkerEventMap = { * When a worker pool starts a forceful shutdown */ "pool:forcefulShutdown": { + ctx: WorkerPluginContext; /** @deprecated Use workerPool for consistency */ pool: WorkerPool; workerPool: WorkerPool; @@ -1012,6 +1030,7 @@ export type WorkerEventMap = { * When a worker pool forceful shutdown throws an error */ "pool:forcefulShutdown:error": { + ctx: WorkerPluginContext; /** @deprecated Use workerPool for consistency */ pool: WorkerPool; workerPool: WorkerPool; @@ 
-1022,6 +1041,7 @@ export type WorkerEventMap = { * When a worker pool forceful shutdown throws an error */ "pool:forcefulShutdown:complete": { + ctx: WorkerPluginContext; /** @deprecated Use workerPool for consistency */ pool: WorkerPool; workerPool: WorkerPool; @@ -1030,14 +1050,13 @@ export type WorkerEventMap = { /** * When a local queue is created */ - "localQueue:init": { - localQueue: LocalQueue; - }; + "localQueue:init": { ctx: WorkerPluginContext; localQueue: LocalQueue }; /** * When a local queue enters 'polling' mode */ "localQueue:setMode": { + ctx: WorkerPluginContext; localQueue: LocalQueue; oldMode: LocalQueueMode; newMode: Exclude; @@ -1048,6 +1067,7 @@ export type WorkerEventMap = { * sleep. */ "localQueue:refetchDelay:start": { + ctx: WorkerPluginContext; localQueue: LocalQueue; /** The number of jobs that were fetched */ jobCount: number; @@ -1064,6 +1084,7 @@ export type WorkerEventMap = { * been awoken early to deal with the rush! */ "localQueue:refetchDelay:abort": { + ctx: WorkerPluginContext; localQueue: LocalQueue; /** How many nudges did we receive during the delay */ count: number; @@ -1075,6 +1096,7 @@ export type WorkerEventMap = { * The refetchDelay terminated normally. */ "localQueue:refetchDelay:expired": { + ctx: WorkerPluginContext; localQueue: LocalQueue; }; @@ -1082,6 +1104,7 @@ export type WorkerEventMap = { * The refetchDelay terminated normally. */ "localQueue:getJobs:complete": { + ctx: WorkerPluginContext; localQueue: LocalQueue; jobs: Job[]; }; @@ -1090,6 +1113,7 @@ export type WorkerEventMap = { * The refetchDelay terminated normally. 
*/ "localQueue:returnJobs": { + ctx: WorkerPluginContext; localQueue: LocalQueue; jobs: Job[]; }; @@ -1097,37 +1121,46 @@ export type WorkerEventMap = { /** * When a worker is created */ - "worker:create": { worker: Worker; tasks: TaskList }; + "worker:create": { + ctx: WorkerPluginContext; + worker: Worker; + tasks: TaskList; + }; /** * When a worker release is requested */ - "worker:release": { worker: Worker }; + "worker:release": { ctx: WorkerPluginContext; worker: Worker }; /** * When a worker stops (normally after a release) */ - "worker:stop": { worker: Worker; error?: unknown }; + "worker:stop": { ctx: WorkerPluginContext; worker: Worker; error?: unknown }; /** * When a worker is about to ask the database for a job to execute */ - "worker:getJob:start": { worker: Worker }; + "worker:getJob:start": { ctx: WorkerPluginContext; worker: Worker }; /** * When a worker calls get_job but there are no available jobs */ - "worker:getJob:error": { worker: Worker; error: unknown }; + "worker:getJob:error": { + ctx: WorkerPluginContext; + worker: Worker; + error: unknown; + }; /** * When a worker calls get_job but there are no available jobs */ - "worker:getJob:empty": { worker: Worker }; + "worker:getJob:empty": { ctx: WorkerPluginContext; worker: Worker }; /** * When a worker is created */ "worker:fatalError": { + ctx: WorkerPluginContext; worker: Worker; error: unknown; jobError: unknown | null; @@ -1136,17 +1169,18 @@ export type WorkerEventMap = { /** * When a job is retrieved by get_job */ - "job:start": { worker: Worker; job: Job }; + "job:start": { ctx: WorkerPluginContext; worker: Worker; job: Job }; /** * When a job completes successfully */ - "job:success": { worker: Worker; job: Job }; + "job:success": { ctx: WorkerPluginContext; worker: Worker; job: Job }; /** * When a job throws an error */ "job:error": { + ctx: WorkerPluginContext; worker: Worker; job: Job; error: unknown; @@ -1157,6 +1191,7 @@ export type WorkerEventMap = { * When a job fails permanently 
(emitted after job:error when appropriate) */ "job:failed": { + ctx: WorkerPluginContext; worker: Worker; job: Job; error: unknown; @@ -1167,16 +1202,22 @@ export type WorkerEventMap = { * When a job has finished executing and the result (success or failure) has * been written back to the database */ - "job:complete": { worker: Worker; job: Job; error: unknown }; + "job:complete": { + ctx: WorkerPluginContext; + worker: Worker; + job: Job; + error: unknown; + }; /** **Experimental** When the cron starts working (before backfilling) */ - "cron:starting": { cron: Cron; start: Date }; + "cron:starting": { ctx: WorkerPluginContext; cron: Cron; start: Date }; /** **Experimental** When the cron starts working (after backfilling completes) */ - "cron:started": { cron: Cron; start: Date }; + "cron:started": { ctx: WorkerPluginContext; cron: Cron; start: Date }; /** **Experimental** When a number of jobs need backfilling for a particular timestamp. */ "cron:backfill": { + ctx: WorkerPluginContext; cron: Cron; itemsToBackfill: JobAndCronIdentifierWithDetails[]; timestamp: string; @@ -1187,6 +1228,7 @@ export type WorkerEventMap = { * clock was adjusted) and we try again a little later. */ "cron:prematureTimer": { + ctx: WorkerPluginContext; cron: Cron; currentTimestamp: number; expectedTimestamp: number; @@ -1198,6 +1240,7 @@ export type WorkerEventMap = { * went to sleep) and we need to catch up. */ "cron:overdueTimer": { + ctx: WorkerPluginContext; cron: Cron; currentTimestamp: number; expectedTimestamp: number; @@ -1209,6 +1252,7 @@ export type WorkerEventMap = { * database write.) */ "cron:schedule": { + ctx: WorkerPluginContext; cron: Cron; timestamp: number; jobsAndIdentifiers: JobAndCronIdentifier[]; @@ -1220,6 +1264,7 @@ export type WorkerEventMap = { * database write.) 
*/ "cron:scheduled": { + ctx: WorkerPluginContext; cron: Cron; timestamp: number; jobsAndIdentifiers: JobAndCronIdentifier[]; @@ -1230,6 +1275,7 @@ export type WorkerEventMap = { * (currently every 8-10 minutes) */ "resetLocked:started": { + ctx: WorkerPluginContext; /** @internal Not sure this'll stay on pool */ workerPool: WorkerPool; }; @@ -1239,6 +1285,7 @@ export type WorkerEventMap = { * successfully. */ "resetLocked:success": { + ctx: WorkerPluginContext; /** * The number of milliseconds until resetLocked runs again (or null if we * won't because the pool is exiting) @@ -1253,6 +1300,7 @@ export type WorkerEventMap = { * **Experimental** When the `resetLocked` process has failed. */ "resetLocked:failure": { + ctx: WorkerPluginContext; error: Error; /** @@ -1268,21 +1316,35 @@ export type WorkerEventMap = { /** * When the runner is terminated by a signal */ - gracefulShutdown: { signal: Signal }; + gracefulShutdown: { ctx: WorkerPluginContext; signal: Signal }; /** * When the runner is terminated by a signal _again_ after 5 seconds */ - forcefulShutdown: { signal: Signal }; + forcefulShutdown: { ctx: WorkerPluginContext; signal: Signal }; /** * When the runner is stopped */ - stop: Record; + stop: { ctx: WorkerPluginContext }; }; export type WorkerEvents = TypedEventEmitter; +export type GlobalEventMap = { + /** + * When the runner is terminated by a signal + */ + gracefulShutdown: { signal: Signal }; + + /** + * When the runner is terminated by a signal _again_ after 5 seconds + */ + forcefulShutdown: { signal: Signal }; +}; + +export type GlobalEvents = TypedEventEmitter; + /** * The digest of a timestamp into the component parts that a cron schedule cares about. 
*/ diff --git a/src/localQueue.ts b/src/localQueue.ts index 896c020a..1e659491 100644 --- a/src/localQueue.ts +++ b/src/localQueue.ts @@ -259,6 +259,7 @@ export class LocalQueue { ); } compiledSharedOptions.events.emit("localQueue:init", { + ctx: compiledSharedOptions, localQueue: this, }); // Immediately enter polling mode. @@ -275,6 +276,7 @@ export class LocalQueue { // Override the 'readonly' (this.mode as LocalQueueMode) = newMode; this.compiledSharedOptions.events.emit("localQueue:setMode", { + ctx: this.compiledSharedOptions, localQueue: this, oldMode, newMode, @@ -440,6 +442,7 @@ export class LocalQueue { const jobsToReturn = this.jobQueue.splice(0, l); this.compiledSharedOptions.events.emit("localQueue:returnJobs", { + ctx: this.compiledSharedOptions, localQueue: this, jobs: jobsToReturn, }); @@ -609,6 +612,7 @@ export class LocalQueue { ); this.compiledSharedOptions.events.emit("localQueue:getJobs:complete", { + ctx: this.compiledSharedOptions, localQueue: this, jobs, }); @@ -669,6 +673,7 @@ export class LocalQueue { refetchDelayMs, ); this.compiledSharedOptions.events.emit("localQueue:refetchDelay:start", { + ctx: this.compiledSharedOptions, localQueue: this, jobCount, threshold: refetchDelayOptions?.threshold ?? 
0, @@ -714,6 +719,7 @@ export class LocalQueue { this.refetchDelayFetchOnComplete = true; this.compiledSharedOptions.events.emit("localQueue:refetchDelay:abort", { + ctx: this.compiledSharedOptions, localQueue: this, count: this.refetchDelayCounter, abortThreshold: this.refetchDelayAbortThreshold, @@ -722,6 +728,7 @@ export class LocalQueue { this.compiledSharedOptions.events.emit( "localQueue:refetchDelay:expired", { + ctx: this.compiledSharedOptions, localQueue: this, }, ); diff --git a/src/main.ts b/src/main.ts index db616420..fc5b3963 100644 --- a/src/main.ts +++ b/src/main.ts @@ -14,6 +14,8 @@ import { EnhancedWithPgClient, FailJobFunction, GetJobFunction, + GlobalEventMap, + GlobalEvents, Job, RunOnceOptions, TaskList, @@ -66,7 +68,7 @@ export { allWorkerPools as _allWorkerPools }; * gracefulShutdown to all the pools' events; we use this event emitter to * aggregate these requests. */ -const _signalHandlersEventEmitter: WorkerEvents = new EventEmitter(); +const _signalHandlersEventEmitter: GlobalEvents = new EventEmitter(); /** * Only register the signal handlers once _globally_. @@ -88,7 +90,7 @@ let _registeredSignalHandlersCount = 0; * future calls will register the events but take no further actions. 
*/ function registerSignalHandlers( - logger: Logger, + ctx: CompiledSharedOptions, events: WorkerEvents, ): () => void { if (_shuttingDownGracefully || _shuttingDownForcefully) { @@ -97,13 +99,13 @@ function registerSignalHandlers( ); } - const gscb = (o: WorkerEventMap["gracefulShutdown"]) => - events.emit("gracefulShutdown", o); - const fscb = (o: WorkerEventMap["forcefulShutdown"]) => - events.emit("forcefulShutdown", o); + const gscb = (o: GlobalEventMap["gracefulShutdown"]) => + events.emit("gracefulShutdown", { ctx, ...o }); + const fscb = (o: GlobalEventMap["forcefulShutdown"]) => + events.emit("forcefulShutdown", { ctx, ...o }); if (!_registeredSignalHandlers) { - _reallyRegisterSignalHandlers(logger); + _reallyRegisterSignalHandlers(ctx.logger); } _registeredSignalHandlersCount++; @@ -262,6 +264,7 @@ export function runTaskListInternal( tasks: TaskList, pgPool: Pool, ): WorkerPool { + const ctx = compiledSharedOptions; const { events, logger, @@ -329,10 +332,10 @@ export function runTaskListInternal( resetLockedAtPromise = undefined; if (workerPool._active) { const delay = resetLockedDelay(); - events.emit("resetLocked:success", { workerPool, delay }); + events.emit("resetLocked:success", { ctx, workerPool, delay }); resetLockedTimeout = setTimeout(resetLocked, delay); } else { - events.emit("resetLocked:success", { workerPool, delay: null }); + events.emit("resetLocked:success", { ctx, workerPool, delay: null }); } }, (e) => { @@ -341,6 +344,7 @@ export function runTaskListInternal( if (workerPool._active) { const delay = resetLockedDelay(); events.emit("resetLocked:failure", { + ctx, workerPool, error: e, delay, @@ -354,6 +358,7 @@ export function runTaskListInternal( ); } else { events.emit("resetLocked:failure", { + ctx, workerPool, error: e, delay: null, @@ -367,7 +372,7 @@ export function runTaskListInternal( } }, ); - events.emit("resetLocked:started", { workerPool }); + events.emit("resetLocked:started", { ctx, workerPool }); }; // Reset locked in 
the first 60 seconds, not immediately because we don't @@ -389,7 +394,7 @@ export function runTaskListInternal( } const reconnectWithExponentialBackoff = (err: Error) => { - events.emit("pool:listen:error", { workerPool, error: err }); + events.emit("pool:listen:error", { ctx, workerPool, error: err }); attempts++; @@ -411,7 +416,7 @@ export function runTaskListInternal( reconnectTimeout = setTimeout(() => { reconnectTimeout = null; - events.emit("pool:listen:connecting", { workerPool, attempts }); + events.emit("pool:listen:connecting", { ctx, workerPool, attempts }); pgPool.connect(listenForChanges); }, delay); }; @@ -451,6 +456,7 @@ export function runTaskListInternal( function handleNotification(message: Notification) { if (changeListener?.client === client && !workerPool._shuttingDown) { events.emit("pool:listen:notification", { + ctx, workerPool, message, client, @@ -495,7 +501,7 @@ export function runTaskListInternal( client.removeListener("notification", handleNotification); // TODO: ideally we'd only stop handling errors once all pending queries are complete; but either way we shouldn't try again! client.removeListener("error", onErrorReleaseClientAndTryAgain); - events.emit("pool:listen:release", { workerPool, client }); + events.emit("pool:listen:release", { ctx, workerPool, client }); try { await client.query( 'UNLISTEN "jobs:insert"; UNLISTEN "worker:migrate";', @@ -515,7 +521,7 @@ export function runTaskListInternal( //---------------------------------------- changeListener = { client, release }; - events.emit("pool:listen:success", { workerPool, client }); + events.emit("pool:listen:success", { ctx, workerPool, client }); client.on("notification", handleNotification); // Subscribe to jobs:insert message @@ -536,7 +542,7 @@ export function runTaskListInternal( }; // Create a client dedicated to listening for new jobs. 
- events.emit("pool:listen:connecting", { workerPool, attempts }); + events.emit("pool:listen:connecting", { ctx, workerPool, attempts }); pgPool.connect(listenForChanges); return workerPool; @@ -598,7 +604,10 @@ export function _runTaskList( let unregisterSignalHandlers: (() => void) | undefined = undefined; if (!noHandleSignals) { // Clean up when certain signals occur - unregisterSignalHandlers = registerSignalHandlers(logger, events); + unregisterSignalHandlers = registerSignalHandlers( + compiledSharedOptions, + events, + ); } /* Errors that should be raised from the workerPool.promise (i.e. _finPromise) */ @@ -804,6 +813,7 @@ export function _runTaskList( { ctx, workerPool, message }, async ({ message }) => { events.emit("pool:gracefulShutdown", { + ctx, pool: workerPool, workerPool, message, @@ -836,6 +846,7 @@ export function _runTaskList( const worker = workers[i]; const job = worker.getActiveJob(); events.emit("pool:gracefulShutdown:workerError", { + ctx, pool: workerPool, workerPool, error: workerReleaseResult.reason, @@ -902,12 +913,14 @@ export function _runTaskList( } events.emit("pool:gracefulShutdown:complete", { + ctx, pool: workerPool, workerPool, }); logger.debug("Graceful shutdown complete"); } catch (e) { events.emit("pool:gracefulShutdown:error", { + ctx, pool: workerPool, workerPool, error: e, @@ -965,6 +978,7 @@ export function _runTaskList( { ctx, workerPool, message }, async ({ message }) => { events.emit("pool:forcefulShutdown", { + ctx, pool: workerPool, workerPool, message, @@ -1097,6 +1111,7 @@ export function _runTaskList( } events.emit("pool:forcefulShutdown:complete", { + ctx, pool: workerPool, workerPool, }); @@ -1104,6 +1119,7 @@ export function _runTaskList( return { forceFailedJobs }; } catch (e) { events.emit("pool:forcefulShutdown:error", { + ctx, pool: workerPool, workerPool, error: e, @@ -1149,7 +1165,7 @@ export function _runTaskList( }; _finPromise.finally(() => { - events.emit("pool:release", { pool: workerPool, 
workerPool }); + events.emit("pool:release", { ctx, pool: workerPool, workerPool }); }); abortSignal.addEventListener("abort", () => { @@ -1160,7 +1176,7 @@ export function _runTaskList( // Ensure that during a forced shutdown we get cleaned up too allWorkerPools.push(workerPool); - events.emit("pool:create", { workerPool }); + events.emit("pool:create", { ctx, workerPool }); // Spawn our workers; they can share clients from the pool. const workerId = @@ -1219,6 +1235,7 @@ export function _runTaskList( ), (error, jobs) => { events.emit("pool:fatalError", { + ctx, error, workerPool, action: "completeJob", @@ -1265,6 +1282,7 @@ export function _runTaskList( ), (error, specs) => { events.emit("pool:fatalError", { + ctx, error, workerPool, action: "failJob", diff --git a/src/runner.ts b/src/runner.ts index 9274a878..9c9f02f5 100644 --- a/src/runner.ts +++ b/src/runner.ts @@ -6,6 +6,7 @@ import { Runner, RunnerOptions, TaskList, + WorkerPluginContext, } from "./interfaces"; import { coerceError, @@ -149,6 +150,7 @@ function buildRunner(input: { release: () => PromiseOrDirect; }): Runner { const { compiledOptions, taskList, parsedCronItems, release } = input; + const ctx: WorkerPluginContext = compiledOptions; const { events, pgPool, releasers, addJob, logger } = compiledOptions; const cron = runCron(compiledOptions, parsedCronItems, { pgPool, events }); @@ -166,7 +168,7 @@ function buildRunner(input: { compiledOptions.logger.debug("Runner stopping"); if (running) { running = false; - events.emit("stop", {}); + events.emit("stop", { ctx }); try { const promises: Array> = []; if (cron._active) { diff --git a/src/worker.ts b/src/worker.ts index e1eaaebc..66bab137 100644 --- a/src/worker.ts +++ b/src/worker.ts @@ -35,6 +35,7 @@ export function makeNewWorker( failJob: FailJobFunction; }, ): Worker { + const ctx = compiledSharedOptions; const { tasks, withPgClient, @@ -72,10 +73,10 @@ export function makeNewWorker( promise.then( () => { - events.emit("worker:stop", { worker 
}); + events.emit("worker:stop", { ctx, worker }); }, (error) => { - events.emit("worker:stop", { worker, error }); + events.emit("worker:stop", { ctx, worker, error }); }, ); let activeJob: Job | null = null; @@ -94,7 +95,7 @@ export function makeNewWorker( const release = (force = false) => { if (active) { active = false; - events.emit("worker:release", { worker }); + events.emit("worker:release", { ctx, worker }); if (cancelDoNext()) { workerDeferred.resolve(); @@ -136,7 +137,7 @@ export function makeNewWorker( }, }; - events.emit("worker:create", { worker, tasks }); + events.emit("worker:create", { ctx, worker, tasks }); logger.debug(`Spawned`); @@ -176,7 +177,7 @@ export function makeNewWorker( flagsToSkip = event.flagsToSkip; } - events.emit("worker:getJob:start", { worker }); + events.emit("worker:getJob:start", { ctx, worker }); const jobRow = await getJob(workerPool.id, flagsToSkip); // `doNext` cannot be executed concurrently, so we know this is safe. @@ -184,13 +185,13 @@ export function makeNewWorker( activeJob = jobRow && jobRow.id ? 
jobRow : null; if (activeJob) { - events.emit("job:start", { worker, job: activeJob }); + events.emit("job:start", { ctx, worker, job: activeJob }); } else { - events.emit("worker:getJob:empty", { worker }); + events.emit("worker:getJob:empty", { ctx, worker }); } } catch (rawErr) { const err = coerceError(rawErr); - events.emit("worker:getJob:error", { worker, error: err }); + events.emit("worker:getJob:error", { ctx, worker, error: err }); if (continuous) { contiguousErrors++; logger.debug( @@ -298,6 +299,7 @@ export function makeNewWorker( if (err) { try { events.emit("job:error", { + ctx, worker, job, error: err, @@ -313,6 +315,7 @@ export function makeNewWorker( try { // Failed forever events.emit("job:failed", { + ctx, worker, job, error: err, @@ -357,7 +360,7 @@ export function makeNewWorker( }); } else { try { - events.emit("job:success", { worker, job }); + events.emit("job:success", { ctx, worker, job }); } catch (e) { logger.error( "Error occurred in event emitter for 'job:success'; this is an issue in your application code and you should fix it", @@ -378,10 +381,11 @@ export function makeNewWorker( completeJob(job); } - events.emit("job:complete", { worker, job, error: err }); + events.emit("job:complete", { ctx, worker, job, error: err }); } catch (fatalError) { try { events.emit("worker:fatalError", { + ctx, worker, error: fatalError, jobError: err, From f676c3d829b2b04eb0579955d2e2ca12d63fae29 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Wed, 11 Dec 2024 14:43:20 +0000 Subject: [PATCH 144/155] Kill command --- src/interfaces.ts | 3 +++ src/runner.ts | 9 +++++++++ 2 files changed, 12 insertions(+) diff --git a/src/interfaces.ts b/src/interfaces.ts index 8bb2ffd1..160a969c 100644 --- a/src/interfaces.ts +++ b/src/interfaces.ts @@ -567,7 +567,10 @@ export interface WorkerPool { } export interface Runner { + /** Attempts to cleanly shut down the runner */ stop: () => Promise; + /** Use .stop() instead, unless you know what you're doing */ + kill: 
() => Promise; addJob: AddJobFunction; promise: Promise; events: WorkerEvents; diff --git a/src/runner.ts b/src/runner.ts index 9c9f02f5..bef2c463 100644 --- a/src/runner.ts +++ b/src/runner.ts @@ -190,6 +190,14 @@ function buildRunner(input: { throw new Error("Runner is already stopped"); } }; + const kill = async () => { + if (running) { + stop().catch(() => {}); + } + if (workerPool._active) { + await workerPool.forcefulShutdown(`Terminated through .kill() command`); + } + }; workerPool.promise.finally(() => { if (running) { @@ -222,6 +230,7 @@ function buildRunner(input: { return { stop, + kill, addJob, promise, events, From 676f6e9efee21bcf1482aa98061d21a0da7bcb43 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Wed, 11 Dec 2024 15:39:11 +0000 Subject: [PATCH 145/155] Aggregate error messages --- src/localQueue.ts | 4 +++- src/main.ts | 7 ++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/localQueue.ts b/src/localQueue.ts index 1e659491..7cc2c1f7 100644 --- a/src/localQueue.ts +++ b/src/localQueue.ts @@ -295,7 +295,9 @@ export class LocalQueue { if (this.errors.length === 1) { this._finPromise.reject(this.errors[0]); } else if (this.errors.length > 1) { - this._finPromise.reject(new AggregateError(this.errors)); + this._finPromise.reject( + new AggregateError(this.errors, "Worker did not exit cleanly"), + ); } else { this._finPromise.resolve(); } diff --git a/src/main.ts b/src/main.ts index fc5b3963..1f63f282 100644 --- a/src/main.ts +++ b/src/main.ts @@ -746,7 +746,12 @@ export function _runTaskList( if (_finErrors.length === 1) { _finPromise.reject(_finErrors[0]); } else if (_finErrors.length > 1) { - _finPromise.reject(new AggregateError(_finErrors)); + _finPromise.reject( + new AggregateError( + _finErrors, + `Worker pool '${workerPool.id}' failed to shut down cleanly`, + ), + ); } else { _finPromise.resolve(); } From 1d9bb102f17b9e013d62f76aad1394d08d60e0e2 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Wed, 11 Dec 2024 
15:39:37 +0000 Subject: [PATCH 146/155] .finally() promises still need error handling --- src/main.ts | 10 +++++++--- src/runner.ts | 6 +++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/main.ts b/src/main.ts index 1f63f282..f638b605 100644 --- a/src/main.ts +++ b/src/main.ts @@ -1169,9 +1169,11 @@ export function _runTaskList( }, }; - _finPromise.finally(() => { - events.emit("pool:release", { ctx, pool: workerPool, workerPool }); - }); + _finPromise + .finally(() => { + events.emit("pool:release", { ctx, pool: workerPool, workerPool }); + }) + .catch(noop); abortSignal.addEventListener("abort", () => { if (!workerPool._shuttingDown) { @@ -1574,3 +1576,5 @@ function batch( }, }; } + +function noop() {} diff --git a/src/runner.ts b/src/runner.ts index bef2c463..6d7f3855 100644 --- a/src/runner.ts +++ b/src/runner.ts @@ -199,18 +199,18 @@ function buildRunner(input: { } }; - workerPool.promise.finally(() => { + const wp = workerPool.promise.finally(() => { if (running) { stop(); } }); - cron.promise.finally(() => { + const cp = cron.promise.finally(() => { if (running) { stop(); } }); - const promise = Promise.all([cron.promise, workerPool.promise]).then( + const promise = Promise.all([cp, wp]).then( () => { /* noop */ }, From 852b279260d08743b9b281a56126c1b912226bea Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Wed, 11 Dec 2024 15:41:22 +0000 Subject: [PATCH 147/155] .finally() promises still need error handling (more) --- src/main.ts | 40 ++++++++++++++++++++++------------------ src/worker.ts | 10 +++++++--- 2 files changed, 29 insertions(+), 21 deletions(-) diff --git a/src/main.ts b/src/main.ts index f638b605..1b45210a 100644 --- a/src/main.ts +++ b/src/main.ts @@ -182,16 +182,18 @@ function _reallyRegisterSignalHandlers(logger: Logger) { allWorkerPools.map((pool) => pool.gracefulShutdown(`Graceful worker shutdown due to ${signal}`), ), - ).finally(() => { - clearTimeout(switchTimeout); - process.removeListener(signal, 
gracefulHandler); - if (!_shuttingDownForcefully) { - logger.info( - `Global graceful shutdown complete; killing self via ${signal}`, - ); - process.kill(process.pid, signal); - } - }); + ) + .finally(() => { + clearTimeout(switchTimeout); + process.removeListener(signal, gracefulHandler); + if (!_shuttingDownForcefully) { + logger.info( + `Global graceful shutdown complete; killing self via ${signal}`, + ); + process.kill(process.pid, signal); + } + }) + .catch(noop); }; const forcefulHandler = function (signal: Signal) { if (_shuttingDownForcefully) { @@ -213,14 +215,16 @@ function _reallyRegisterSignalHandlers(logger: Logger) { allWorkerPools.map((pool) => pool.forcefulShutdown(`Forced worker shutdown due to ${signal}`), ), - ).finally(() => { - removeForcefulHandler(); - clearTimeout(removeTimeout); - logger.error( - `Global forceful shutdown completed; killing self via ${signal}`, - ); - process.kill(process.pid, signal); - }); + ) + .finally(() => { + removeForcefulHandler(); + clearTimeout(removeTimeout); + logger.error( + `Global forceful shutdown completed; killing self via ${signal}`, + ); + process.kill(process.pid, signal); + }) + .catch(noop); }; logger.debug( diff --git a/src/worker.ts b/src/worker.ts index 66bab137..5a3fad39 100644 --- a/src/worker.ts +++ b/src/worker.ts @@ -67,9 +67,11 @@ export function makeNewWorker( const promise: Promise & { /** @internal */ worker?: Worker; - } = workerDeferred.finally(() => { - return hooks.process("stopWorker", { worker, withPgClient }); - }); + } = workerDeferred + .finally(() => { + return hooks.process("stopWorker", { worker, withPgClient }); + }) + .catch(noop); promise.then( () => { @@ -427,3 +429,5 @@ export function makeNewWorker( return worker; } + +function noop() {} From 2f8d3781fbd76e3976b55bdb92b7a7a0df338f01 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Wed, 11 Dec 2024 16:34:04 +0000 Subject: [PATCH 148/155] Add migration table data to dump --- __tests__/schema.sql | 22 
++++++++++++++++++++++ scripts/dump_db | 1 + 2 files changed, 23 insertions(+) diff --git a/__tests__/schema.sql b/__tests__/schema.sql index f8b77b65..bc655cf6 100644 --- a/__tests__/schema.sql +++ b/__tests__/schema.sql @@ -374,3 +374,25 @@ ALTER TABLE graphile_worker._private_job_queues ENABLE ROW LEVEL SECURITY; ALTER TABLE graphile_worker._private_jobs ENABLE ROW LEVEL SECURITY; ALTER TABLE graphile_worker._private_known_crontabs ENABLE ROW LEVEL SECURITY; ALTER TABLE graphile_worker._private_tasks ENABLE ROW LEVEL SECURITY; +SELECT pg_catalog.set_config('search_path', '', false); +COPY graphile_worker.migrations (id, ts, breaking) FROM stdin; +1 2024-12-11 16:33:34.832099+00 t +2 2024-12-11 16:33:34.869094+00 f +3 2024-12-11 16:33:34.879306+00 t +4 2024-12-11 16:33:34.881577+00 f +5 2024-12-11 16:33:34.885313+00 f +6 2024-12-11 16:33:34.888433+00 f +7 2024-12-11 16:33:34.903467+00 f +8 2024-12-11 16:33:34.906565+00 f +9 2024-12-11 16:33:34.922042+00 f +10 2024-12-11 16:33:34.924261+00 f +11 2024-12-11 16:33:34.926218+00 t +12 2024-12-11 16:33:35.036279+00 f +13 2024-12-11 16:33:35.039057+00 t +14 2024-12-11 16:33:35.042399+00 t +15 2024-12-11 16:33:35.044275+00 f +16 2024-12-11 16:33:35.045737+00 t +17 2024-12-11 16:33:35.048717+00 f +18 2024-12-11 16:33:35.051647+00 f +19 2024-12-11 16:33:35.054562+00 t +\. 
diff --git a/scripts/dump_db b/scripts/dump_db index 1ee62a79..1e6059ac 100755 --- a/scripts/dump_db +++ b/scripts/dump_db @@ -7,5 +7,6 @@ psql template1 -c "CREATE USER graphile_worker_role WITH SUPERUSER PASSWORD 'pas createdb graphile_worker_dump -O graphile_worker_role PGUSER=graphile_worker_role PGPASSWORD=password PGHOST=127.0.0.1 ts-node src/cli.ts -c postgres:///graphile_worker_dump --schema-only pg_dump --schema-only --no-owner graphile_worker_dump | sed -e '/^--/d' -e '/^\s*$/d' -e '/^SET /d' -e 's/EXECUTE FUNCTION/EXECUTE PROCEDURE/g' > __tests__/schema.sql +pg_dump --data-only --no-owner --table graphile_worker.migrations graphile_worker_dump | sed -e '/^--/d' -e '/^\s*$/d' -e '/^SET /d' -e 's/EXECUTE FUNCTION/EXECUTE PROCEDURE/g' >> __tests__/schema.sql dropdb graphile_worker_dump dropuser graphile_worker_role From 0e88c16420949557f576f0301518275ca8ae712e Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Wed, 11 Dec 2024 17:13:20 +0000 Subject: [PATCH 149/155] Upgrade to latest graphile-config --- package.json | 2 +- yarn.lock | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/package.json b/package.json index 72ccbe35..df1ad194 100644 --- a/package.json +++ b/package.json @@ -56,7 +56,7 @@ "@types/debug": "^4.1.10", "@types/pg": "^8.10.5", "cosmiconfig": "^8.3.6", - "graphile-config": "^0.0.1-beta.12", + "graphile-config": "^0.0.1-beta.14", "json5": "^2.2.3", "pg": "^8.11.3", "tslib": "^2.6.2", diff --git a/yarn.lock b/yarn.lock index 991bd5df..adf1323a 100644 --- a/yarn.lock +++ b/yarn.lock @@ -6355,10 +6355,10 @@ graphemer@^1.4.0: resolved "https://registry.yarnpkg.com/graphemer/-/graphemer-1.4.0.tgz#fb2f1d55e0e3a1849aeffc90c4fa0dd53a0e66c6" integrity sha512-EtKwoO6kxCL9WO5xipiHTZlSzBm7WLT627TqC/uVRd0HKmq8NXyebnNYxDoBi7wt8eTWrUrKXCOVaFq9x1kgag== -graphile-config@^0.0.1-beta.12, graphile-config@^0.0.1-beta.4: - version "0.0.1-beta.12" - resolved 
"https://registry.yarnpkg.com/graphile-config/-/graphile-config-0.0.1-beta.12.tgz#e31e12077366f3cbe55708ec20452e5027177627" - integrity sha512-th7C2fM29dhra5gCmykWUJQMCAzA6C5W+dF8DZa0BWLImmHnSUK+AO4qPCR6bZKR5JKTW2onZweqP4ZHVLPQFw== +graphile-config@^0.0.1-beta.14, graphile-config@^0.0.1-beta.4: + version "0.0.1-beta.14" + resolved "https://registry.yarnpkg.com/graphile-config/-/graphile-config-0.0.1-beta.14.tgz#6238ad5960ccc20b19718726da7c3b1a6c48d831" + integrity sha512-3FlhyRKz4LvIbY4AXn4EI8DSTdSYsg0WRfX6U9QeytGta9aiefF1QqSiC1ocXUlNJUMBfm28dy0eL669ljYRwg== dependencies: "@types/interpret" "^1.1.1" "@types/node" "^20.5.7" From e3fa6987f65a96d2a45cf01121ce7b8f6622ff5f Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Wed, 11 Dec 2024 17:18:30 +0000 Subject: [PATCH 150/155] 0.17.0-canary.0e88c16 --- package.json | 2 +- src/version.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/package.json b/package.json index df1ad194..a1a9edae 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "graphile-worker", - "version": "0.17.0-canary.f6a6889", + "version": "0.17.0-canary.0e88c16", "type": "commonjs", "description": "Job queue for PostgreSQL", "main": "dist/index.js", diff --git a/src/version.ts b/src/version.ts index 0a602ee2..caad9af0 100644 --- a/src/version.ts +++ b/src/version.ts @@ -1,2 +1,2 @@ // This file is autogenerated by /scripts/postversion.mjs -export const version = "0.17.0-canary.f6a6889"; +export const version = "0.17.0-canary.0e88c16"; From 209dbf69f659b8057ce8421ec4101218b3122cd9 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Wed, 11 Dec 2024 17:25:01 +0000 Subject: [PATCH 151/155] Move tsconfig to devDependencies --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index a1a9edae..f99371a3 100644 --- a/package.json +++ b/package.json @@ -52,7 +52,6 @@ "homepage": "https://github.com/graphile/worker#readme", "dependencies": { "@graphile/logger": 
"^0.2.0", - "@tsconfig/node18": "^18.2.4", "@types/debug": "^4.1.10", "@types/pg": "^8.10.5", "cosmiconfig": "^8.3.6", @@ -73,6 +72,7 @@ "@fortawesome/free-solid-svg-icons": "^6.5.1", "@fortawesome/react-fontawesome": "^0.2.0", "@mdx-js/react": "^1.6.22", + "@tsconfig/node18": "^18.2.4", "@types/jest": "^26.0.0", "@types/json5": "^2.2.0", "@types/node": "^20.8.7", From 01036341874b44aa8cba6fd2238ceceb089c6ab4 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Wed, 11 Dec 2024 18:00:19 +0000 Subject: [PATCH 152/155] Trim grant and stable timestamps --- __tests__/schema.sql | 38 +++++++++++++++++++------------------- scripts/dump_db | 4 ++-- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/__tests__/schema.sql b/__tests__/schema.sql index bc655cf6..1e545a57 100644 --- a/__tests__/schema.sql +++ b/__tests__/schema.sql @@ -376,23 +376,23 @@ ALTER TABLE graphile_worker._private_known_crontabs ENABLE ROW LEVEL SECURITY; ALTER TABLE graphile_worker._private_tasks ENABLE ROW LEVEL SECURITY; SELECT pg_catalog.set_config('search_path', '', false); COPY graphile_worker.migrations (id, ts, breaking) FROM stdin; -1 2024-12-11 16:33:34.832099+00 t -2 2024-12-11 16:33:34.869094+00 f -3 2024-12-11 16:33:34.879306+00 t -4 2024-12-11 16:33:34.881577+00 f -5 2024-12-11 16:33:34.885313+00 f -6 2024-12-11 16:33:34.888433+00 f -7 2024-12-11 16:33:34.903467+00 f -8 2024-12-11 16:33:34.906565+00 f -9 2024-12-11 16:33:34.922042+00 f -10 2024-12-11 16:33:34.924261+00 f -11 2024-12-11 16:33:34.926218+00 t -12 2024-12-11 16:33:35.036279+00 f -13 2024-12-11 16:33:35.039057+00 t -14 2024-12-11 16:33:35.042399+00 t -15 2024-12-11 16:33:35.044275+00 f -16 2024-12-11 16:33:35.045737+00 t -17 2024-12-11 16:33:35.048717+00 f -18 2024-12-11 16:33:35.051647+00 f -19 2024-12-11 16:33:35.054562+00 t +1 1970-01-01 00:00:00.000000+00 t +2 1970-01-01 00:00:00.000000+00 f +3 1970-01-01 00:00:00.000000+00 t +4 1970-01-01 00:00:00.000000+00 f +5 1970-01-01 00:00:00.000000+00 f +6 1970-01-01 
00:00:00.000000+00 f +7 1970-01-01 00:00:00.000000+00 f +8 1970-01-01 00:00:00.000000+00 f +9 1970-01-01 00:00:00.000000+00 f +10 1970-01-01 00:00:00.000000+00 f +11 1970-01-01 00:00:00.000000+00 t +12 1970-01-01 00:00:00.000000+00 f +13 1970-01-01 00:00:00.000000+00 t +14 1970-01-01 00:00:00.000000+00 t +15 1970-01-01 00:00:00.000000+00 f +16 1970-01-01 00:00:00.000000+00 t +17 1970-01-01 00:00:00.000000+00 f +18 1970-01-01 00:00:00.000000+00 f +19 1970-01-01 00:00:00.000000+00 t \. diff --git a/scripts/dump_db b/scripts/dump_db index 1e6059ac..4dc03203 100755 --- a/scripts/dump_db +++ b/scripts/dump_db @@ -6,7 +6,7 @@ dropuser graphile_worker_role || true psql template1 -c "CREATE USER graphile_worker_role WITH SUPERUSER PASSWORD 'password';" createdb graphile_worker_dump -O graphile_worker_role PGUSER=graphile_worker_role PGPASSWORD=password PGHOST=127.0.0.1 ts-node src/cli.ts -c postgres:///graphile_worker_dump --schema-only -pg_dump --schema-only --no-owner graphile_worker_dump | sed -e '/^--/d' -e '/^\s*$/d' -e '/^SET /d' -e 's/EXECUTE FUNCTION/EXECUTE PROCEDURE/g' > __tests__/schema.sql -pg_dump --data-only --no-owner --table graphile_worker.migrations graphile_worker_dump | sed -e '/^--/d' -e '/^\s*$/d' -e '/^SET /d' -e 's/EXECUTE FUNCTION/EXECUTE PROCEDURE/g' >> __tests__/schema.sql +pg_dump --schema-only --no-owner graphile_worker_dump | sed -E -e '/^--/d' -e '/^\s*$/d' -e '/^SET /d' -e 's/EXECUTE FUNCTION/EXECUTE PROCEDURE/g' -e '/^(REVOKE|GRANT) .* ON SCHEMA public (FROM|TO) PUBLIC;$/d' > __tests__/schema.sql +pg_dump --data-only --no-owner graphile_worker_dump --table=graphile_worker.migrations --table=graphile_worker._private_pro_migrations | sed -E -e '/^--/d' -e '/^\s*$/d' -e 's/\b2[0-9]{3}-[0-9]{2}-[0-9]{2}\s[0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{1,6}\+00/1970-01-01 00:00:00.000000+00/g' -e '/^SET /d' >> __tests__/schema.sql dropdb graphile_worker_dump dropuser graphile_worker_role From 6f1217aa2c7bccebb8c5c43aa41f92b53d4aa095 Mon Sep 17 00:00:00 2001 
From: Benjie Gillam Date: Wed, 11 Dec 2024 18:00:46 +0000 Subject: [PATCH 153/155] Lint --- src/main.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main.ts b/src/main.ts index 1b45210a..65a5292a 100644 --- a/src/main.ts +++ b/src/main.ts @@ -19,7 +19,6 @@ import { Job, RunOnceOptions, TaskList, - WorkerEventMap, WorkerEvents, WorkerPool, WorkerPoolOptions, From 1fcb2a02ab072c4e999c9941231fc726a2376a11 Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Wed, 11 Dec 2024 18:15:14 +0000 Subject: [PATCH 154/155] Enable more clash codes --- src/migrate.ts | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/migrate.ts b/src/migrate.ts index fde2dd79..e679c4fc 100644 --- a/src/migrate.ts +++ b/src/migrate.ts @@ -97,7 +97,10 @@ export async function runMigration( const error = coerceError(rawError); await event.client.query("rollback"); await hooks.process("migrationError", { ...event, error }); - if (!migrationInsertComplete && error.code === "23505") { + if ( + !migrationInsertComplete && + CLASH_CODES.includes(error.code as string) + ) { // Someone else did this migration! Success! logger.debug( `Some other worker has performed migration ${migrationFile}; continuing.`, @@ -146,13 +149,13 @@ select current_setting('server_version_num') as server_version_num, break; } catch (rawE) { const e = coerceError(rawE); - if (attempts === 0 && (e.code === "42P01" || e.code === "42703")) { + if (attempts === 0 && NX_CODES.includes(e.code as string)) { try { await installSchema(compiledSharedOptions, event); break; } catch (rawE2) { const e2 = coerceError(rawE2); - if (e2.code === "23505") { + if (CLASH_CODES.includes(e2.code as string)) { // Another instance installed this concurrently? Go around again. 
} else { throw e2; @@ -208,3 +211,8 @@ select current_setting('server_version_num') as server_version_num, await hooks.process("postmigrate", event); }); } + +/** Doesn't exist */ +const NX_CODES = ["42P01", "42703"]; +/** Someone else created */ +const CLASH_CODES = ["23505", "42P06", "42P07", "42710"]; From c665c80d6f49485836fef0a9e78c3b61c9a93b7a Mon Sep 17 00:00:00 2001 From: Benjie Gillam Date: Wed, 11 Dec 2024 18:23:35 +0000 Subject: [PATCH 155/155] 0.17.0-canary.1fcb2a0 --- package.json | 2 +- src/version.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/package.json b/package.json index f99371a3..f99ecdb1 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "graphile-worker", - "version": "0.17.0-canary.0e88c16", + "version": "0.17.0-canary.1fcb2a0", "type": "commonjs", "description": "Job queue for PostgreSQL", "main": "dist/index.js", diff --git a/src/version.ts b/src/version.ts index caad9af0..989fa0d2 100644 --- a/src/version.ts +++ b/src/version.ts @@ -1,2 +1,2 @@ // This file is autogenerated by /scripts/postversion.mjs -export const version = "0.17.0-canary.0e88c16"; +export const version = "0.17.0-canary.1fcb2a0";