From dc5365601f37f1620c179aa32a7dbc979de09a92 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Thu, 5 Jun 2025 14:19:07 -0700 Subject: [PATCH 01/13] update API to ingest yuri's protos changes --- client/src/raw.rs | 9 + core/src/worker/client.rs | 4 + core/src/worker/mod.rs | 1 + .../machines/cancel_external_state_machine.rs | 2 +- .../machines/child_workflow_state_machine.rs | 1 + .../machines/signal_external_state_machine.rs | 2 +- sdk-core-protos/protos/api_upstream/README.md | 4 + .../api_upstream/openapi/openapiv2.json | 657 +++++++++++++++--- .../api_upstream/openapi/openapiv3.yaml | 653 +++++++++++++++-- .../temporal/api/batch/v1/message.proto | 8 +- .../temporal/api/command/v1/message.proto | 10 +- .../temporal/api/common/v1/message.proto | 4 +- .../temporal/api/deployment/v1/message.proto | 33 +- .../temporal/api/enums/v1/common.proto | 11 +- .../temporal/api/enums/v1/deployment.proto | 26 + .../temporal/api/enums/v1/failed_cause.proto | 2 + .../temporal/api/enums/v1/reset.proto | 8 +- .../temporal/api/failure/v1/message.proto | 4 +- .../temporal/api/history/v1/message.proto | 78 ++- .../temporal/api/nexus/v1/message.proto | 8 +- .../temporal/api/schedule/v1/message.proto | 4 +- .../temporal/api/worker/v1/message.proto | 126 ++++ .../temporal/api/workflow/v1/message.proto | 25 +- .../workflowservice/v1/request_response.proto | 168 ++++- .../api/workflowservice/v1/service.proto | 21 + sdk-core-protos/src/history_builder.rs | 1 - sdk-core-protos/src/lib.rs | 7 + 27 files changed, 1612 insertions(+), 265 deletions(-) create mode 100644 sdk-core-protos/protos/api_upstream/temporal/api/worker/v1/message.proto diff --git a/client/src/raw.rs b/client/src/raw.rs index d5bdb791f..e77eef487 100644 --- a/client/src/raw.rs +++ b/client/src/raw.rs @@ -1339,6 +1339,15 @@ proxier! 
{ r.extensions_mut().insert(labels); } ); + ( + record_worker_heartbeat, + RecordWorkerHeartbeatRequest, + RecordWorkerHeartbeatResponse, + |r| { + let labels = namespaced_request!(r); + r.extensions_mut().insert(labels); + } + ); } proxier! { diff --git a/core/src/worker/client.rs b/core/src/worker/client.rs index fd204ff1f..238d5a9b7 100644 --- a/core/src/worker/client.rs +++ b/core/src/worker/client.rs @@ -267,6 +267,7 @@ impl WorkerClient for WorkerClientBag { binary_checksum: self.binary_checksum(), worker_version_capabilities: self.worker_version_capabilities(), deployment_options: self.deployment_options(), + worker_heartbeat: None, } .into_request(); request.extensions_mut().insert(IsWorkerTaskLongPoll); @@ -303,6 +304,7 @@ impl WorkerClient for WorkerClientBag { }), worker_version_capabilities: self.worker_version_capabilities(), deployment_options: self.deployment_options(), + worker_heartbeat: None, } .into_request(); request.extensions_mut().insert(IsWorkerTaskLongPoll); @@ -335,6 +337,7 @@ impl WorkerClient for WorkerClientBag { identity: self.identity.clone(), worker_version_capabilities: self.worker_version_capabilities(), deployment_options: self.deployment_options(), + worker_heartbeat: None, } .into_request(); request.extensions_mut().insert(IsWorkerTaskLongPoll); @@ -612,6 +615,7 @@ impl WorkerClient for WorkerClientBag { identity: self.identity.clone(), sticky_task_queue, reason: "graceful shutdown".to_string(), + worker_heartbeat: None, }; Ok( diff --git a/core/src/worker/mod.rs b/core/src/worker/mod.rs index 7e3c6a3a0..c1b16993c 100644 --- a/core/src/worker/mod.rs +++ b/core/src/worker/mod.rs @@ -307,6 +307,7 @@ impl Worker { task_pollers: TaskPollers, telem_instance: Option<&TelemetryInstance>, ) -> Self { + // TODO: Use existing MetricsContext or a new meter to record and export these metrics, possibly through the same MetricsCallBuffer let (metrics, meter) = if let Some(ti) = telem_instance { ( 
MetricsContext::top_level(config.namespace.clone(), config.task_queue.clone(), ti), diff --git a/core/src/worker/workflow/machines/cancel_external_state_machine.rs b/core/src/worker/workflow/machines/cancel_external_state_machine.rs index 2f389472d..ef79cd709 100644 --- a/core/src/worker/workflow/machines/cancel_external_state_machine.rs +++ b/core/src/worker/workflow/machines/cancel_external_state_machine.rs @@ -69,7 +69,7 @@ pub(super) fn new_external_cancel( namespace: workflow_execution.namespace, workflow_id: workflow_execution.workflow_id, run_id: workflow_execution.run_id, - // Apparently this is effectively deprecated at this point + #[allow(deprecated)] control: "".to_string(), child_workflow_only: only_child, reason, diff --git a/core/src/worker/workflow/machines/child_workflow_state_machine.rs b/core/src/worker/workflow/machines/child_workflow_state_machine.rs index 14145e329..19d66dd75 100644 --- a/core/src/worker/workflow/machines/child_workflow_state_machine.rs +++ b/core/src/worker/workflow/machines/child_workflow_state_machine.rs @@ -714,6 +714,7 @@ impl WFMachinesAdapter for ChildWorkflowMachine { run_id: self.shared_state.run_id.clone(), child_workflow_only: true, reason, + #[allow(deprecated)] control: "".to_string(), } .into(), diff --git a/core/src/worker/workflow/machines/signal_external_state_machine.rs b/core/src/worker/workflow/machines/signal_external_state_machine.rs index 08e084c5f..9d76045c2 100644 --- a/core/src/worker/workflow/machines/signal_external_state_machine.rs +++ b/core/src/worker/workflow/machines/signal_external_state_machine.rs @@ -104,7 +104,7 @@ pub(super) fn new_external_signal( }, signal_name: attrs.signal_name, input: attrs.args.into_payloads(), - // Is deprecated + #[allow(deprecated)] control: "".to_string(), child_workflow_only: only_child, }, diff --git a/sdk-core-protos/protos/api_upstream/README.md b/sdk-core-protos/protos/api_upstream/README.md index cdf431261..0ab8d0ae0 100644 --- 
a/sdk-core-protos/protos/api_upstream/README.md +++ b/sdk-core-protos/protos/api_upstream/README.md @@ -6,6 +6,10 @@ This repository contains both the protobuf descriptors and OpenAPI documentation Install as git submodule to the project. +## Contribution + +Make your change to the temporal/proto files, and run `make` to update the openapi definitions. + ## License MIT License, please see [LICENSE](LICENSE) for details. diff --git a/sdk-core-protos/protos/api_upstream/openapi/openapiv2.json b/sdk-core-protos/protos/api_upstream/openapi/openapiv2.json index 5e0c615af..005a4d5d1 100644 --- a/sdk-core-protos/protos/api_upstream/openapi/openapiv2.json +++ b/sdk-core-protos/protos/api_upstream/openapi/openapiv2.json @@ -1510,7 +1510,7 @@ }, { "name": "taskQueueType", - "description": "Deprecated. Use `ENHANCED` mode with `task_queue_types`. Ignored in `ENHANCED` mode.\nIf unspecified (TASK_QUEUE_TYPE_UNSPECIFIED), then default value (TASK_QUEUE_TYPE_WORKFLOW) will be used.\n\n - TASK_QUEUE_TYPE_WORKFLOW: Workflow type of task queue.\n - TASK_QUEUE_TYPE_ACTIVITY: Activity type of task queue.\n - TASK_QUEUE_TYPE_NEXUS: Task queue type for dispatching Nexus requests.", + "description": "If unspecified (TASK_QUEUE_TYPE_UNSPECIFIED), then default value (TASK_QUEUE_TYPE_WORKFLOW) will be used.\nOnly supported in default mode (use `task_queue_types` in ENHANCED mode instead).\n\n - TASK_QUEUE_TYPE_WORKFLOW: Workflow type of task queue.\n - TASK_QUEUE_TYPE_ACTIVITY: Activity type of task queue.\n - TASK_QUEUE_TYPE_NEXUS: Task queue type for dispatching Nexus requests.", "in": "query", "required": false, "type": "string", @@ -1522,16 +1522,23 @@ ], "default": "TASK_QUEUE_TYPE_UNSPECIFIED" }, + { + "name": "reportStats", + "description": "Report stats for the requested task queue type(s).", + "in": "query", + "required": false, + "type": "boolean" + }, { "name": "includeTaskQueueStatus", - "description": "Deprecated. 
Ignored in `ENHANCED` mode.", + "description": "Deprecated, use `report_stats` instead.\nIf true, the task queue status will be included in the response.", "in": "query", "required": false, "type": "boolean" }, { "name": "apiMode", - "description": "All options except `task_queue_type` and `include_task_queue_status` are only available in the `ENHANCED` mode.\n\n - DESCRIBE_TASK_QUEUE_MODE_UNSPECIFIED: Unspecified means legacy behavior.\n - DESCRIBE_TASK_QUEUE_MODE_ENHANCED: Enhanced mode reports aggregated results for all partitions, supports Build IDs, and reports richer info.", + "description": "Deprecated. ENHANCED mode is also being deprecated.\nSelect the API mode to use for this request: DEFAULT mode (if unset) or ENHANCED mode.\nConsult the documentation for each field to understand which mode it is supported in.\n\n - DESCRIBE_TASK_QUEUE_MODE_UNSPECIFIED: Unspecified means legacy behavior.\n - DESCRIBE_TASK_QUEUE_MODE_ENHANCED: Enhanced mode reports aggregated results for all partitions, supports Build IDs, and reports richer info.", "in": "query", "required": false, "type": "string", @@ -1568,7 +1575,7 @@ }, { "name": "taskQueueTypes", - "description": "Task queue types to report info about. If not specified, all types are considered.\n\n - TASK_QUEUE_TYPE_WORKFLOW: Workflow type of task queue.\n - TASK_QUEUE_TYPE_ACTIVITY: Activity type of task queue.\n - TASK_QUEUE_TYPE_NEXUS: Task queue type for dispatching Nexus requests.", + "description": "Deprecated (as part of the ENHANCED mode deprecation).\nTask queue types to report info about. 
If not specified, all types are considered.\n\n - TASK_QUEUE_TYPE_WORKFLOW: Workflow type of task queue.\n - TASK_QUEUE_TYPE_ACTIVITY: Activity type of task queue.\n - TASK_QUEUE_TYPE_NEXUS: Task queue type for dispatching Nexus requests.", "in": "query", "required": false, "type": "array", @@ -1583,23 +1590,16 @@ }, "collectionFormat": "multi" }, - { - "name": "reportStats", - "description": "Report stats for the requested task queue types and versions", - "in": "query", - "required": false, - "type": "boolean" - }, { "name": "reportPollers", - "description": "Report list of pollers for requested task queue types and versions", + "description": "Deprecated (as part of the ENHANCED mode deprecation).\nReport list of pollers for requested task queue types and versions.", "in": "query", "required": false, "type": "boolean" }, { "name": "reportTaskReachability", - "description": "Report task reachability for the requested versions and all task types (task reachability is not reported\nper task type).", + "description": "Deprecated (as part of the ENHANCED mode deprecation).\nReport task reachability for the requested versions and all task types (task reachability is not reported\nper task type).", "in": "query", "required": false, "type": "boolean" @@ -2184,6 +2184,98 @@ ] } }, + "/api/v1/namespaces/{namespace}/workers": { + "get": { + "summary": "ListWorkers is a visibility API to list worker status information in a specific namespace.", + "operationId": "ListWorkers2", + "responses": { + "200": { + "description": "A successful response.", + "schema": { + "$ref": "#/definitions/v1ListWorkersResponse" + } + }, + "default": { + "description": "An unexpected error response.", + "schema": { + "$ref": "#/definitions/rpcStatus" + } + } + }, + "parameters": [ + { + "name": "namespace", + "in": "path", + "required": true, + "type": "string" + }, + { + "name": "pageSize", + "in": "query", + "required": false, + "type": "integer", + "format": "int32" + }, + { + "name": 
"nextPageToken", + "in": "query", + "required": false, + "type": "string", + "format": "byte" + }, + { + "name": "query", + "description": "`query` in ListWorkers is used to filter workers based on worker status info.\nThe following worker status attributes are expected are supported as part of the query:\n* WorkerInstanceKey\n* WorkerIdentity\n* HostName\n* TaskQueue\n* DeploymentName\n* BuildId\n* SdkName\n* SdkVersion\n* StartTime\n* LastHeartbeatTime\n* Status\nCurrently metrics are not supported as a part of ListWorkers query.", + "in": "query", + "required": false, + "type": "string" + } + ], + "tags": [ + "WorkflowService" + ] + } + }, + "/api/v1/namespaces/{namespace}/workers/heartbeat": { + "post": { + "summary": "WorkerHeartbeat receive heartbeat request from the worker.", + "operationId": "RecordWorkerHeartbeat2", + "responses": { + "200": { + "description": "A successful response.", + "schema": { + "$ref": "#/definitions/v1RecordWorkerHeartbeatResponse" + } + }, + "default": { + "description": "An unexpected error response.", + "schema": { + "$ref": "#/definitions/rpcStatus" + } + } + }, + "parameters": [ + { + "name": "namespace", + "description": "Namespace this worker belongs to.", + "in": "path", + "required": true, + "type": "string" + }, + { + "name": "body", + "in": "body", + "required": true, + "schema": { + "$ref": "#/definitions/WorkflowServiceRecordWorkerHeartbeatBody" + } + } + ], + "tags": [ + "WorkflowService" + ] + } + }, "/api/v1/namespaces/{namespace}/workflow-count": { "get": { "summary": "CountWorkflowExecutions is a visibility API to count of workflow executions in a specific namespace.", @@ -5058,7 +5150,7 @@ }, { "name": "taskQueueType", - "description": "Deprecated. Use `ENHANCED` mode with `task_queue_types`. 
Ignored in `ENHANCED` mode.\nIf unspecified (TASK_QUEUE_TYPE_UNSPECIFIED), then default value (TASK_QUEUE_TYPE_WORKFLOW) will be used.\n\n - TASK_QUEUE_TYPE_WORKFLOW: Workflow type of task queue.\n - TASK_QUEUE_TYPE_ACTIVITY: Activity type of task queue.\n - TASK_QUEUE_TYPE_NEXUS: Task queue type for dispatching Nexus requests.", + "description": "If unspecified (TASK_QUEUE_TYPE_UNSPECIFIED), then default value (TASK_QUEUE_TYPE_WORKFLOW) will be used.\nOnly supported in default mode (use `task_queue_types` in ENHANCED mode instead).\n\n - TASK_QUEUE_TYPE_WORKFLOW: Workflow type of task queue.\n - TASK_QUEUE_TYPE_ACTIVITY: Activity type of task queue.\n - TASK_QUEUE_TYPE_NEXUS: Task queue type for dispatching Nexus requests.", "in": "query", "required": false, "type": "string", @@ -5070,16 +5162,23 @@ ], "default": "TASK_QUEUE_TYPE_UNSPECIFIED" }, + { + "name": "reportStats", + "description": "Report stats for the requested task queue type(s).", + "in": "query", + "required": false, + "type": "boolean" + }, { "name": "includeTaskQueueStatus", - "description": "Deprecated. Ignored in `ENHANCED` mode.", + "description": "Deprecated, use `report_stats` instead.\nIf true, the task queue status will be included in the response.", "in": "query", "required": false, "type": "boolean" }, { "name": "apiMode", - "description": "All options except `task_queue_type` and `include_task_queue_status` are only available in the `ENHANCED` mode.\n\n - DESCRIBE_TASK_QUEUE_MODE_UNSPECIFIED: Unspecified means legacy behavior.\n - DESCRIBE_TASK_QUEUE_MODE_ENHANCED: Enhanced mode reports aggregated results for all partitions, supports Build IDs, and reports richer info.", + "description": "Deprecated. 
ENHANCED mode is also being deprecated.\nSelect the API mode to use for this request: DEFAULT mode (if unset) or ENHANCED mode.\nConsult the documentation for each field to understand which mode it is supported in.\n\n - DESCRIBE_TASK_QUEUE_MODE_UNSPECIFIED: Unspecified means legacy behavior.\n - DESCRIBE_TASK_QUEUE_MODE_ENHANCED: Enhanced mode reports aggregated results for all partitions, supports Build IDs, and reports richer info.", "in": "query", "required": false, "type": "string", @@ -5116,7 +5215,7 @@ }, { "name": "taskQueueTypes", - "description": "Task queue types to report info about. If not specified, all types are considered.\n\n - TASK_QUEUE_TYPE_WORKFLOW: Workflow type of task queue.\n - TASK_QUEUE_TYPE_ACTIVITY: Activity type of task queue.\n - TASK_QUEUE_TYPE_NEXUS: Task queue type for dispatching Nexus requests.", + "description": "Deprecated (as part of the ENHANCED mode deprecation).\nTask queue types to report info about. If not specified, all types are considered.\n\n - TASK_QUEUE_TYPE_WORKFLOW: Workflow type of task queue.\n - TASK_QUEUE_TYPE_ACTIVITY: Activity type of task queue.\n - TASK_QUEUE_TYPE_NEXUS: Task queue type for dispatching Nexus requests.", "in": "query", "required": false, "type": "array", @@ -5131,23 +5230,16 @@ }, "collectionFormat": "multi" }, - { - "name": "reportStats", - "description": "Report stats for the requested task queue types and versions", - "in": "query", - "required": false, - "type": "boolean" - }, { "name": "reportPollers", - "description": "Report list of pollers for requested task queue types and versions", + "description": "Deprecated (as part of the ENHANCED mode deprecation).\nReport list of pollers for requested task queue types and versions.", "in": "query", "required": false, "type": "boolean" }, { "name": "reportTaskReachability", - "description": "Report task reachability for the requested versions and all task types (task reachability is not reported\nper task type).", + "description": 
"Deprecated (as part of the ENHANCED mode deprecation).\nReport task reachability for the requested versions and all task types (task reachability is not reported\nper task type).", "in": "query", "required": false, "type": "boolean" @@ -5693,6 +5785,98 @@ ] } }, + "/namespaces/{namespace}/workers": { + "get": { + "summary": "ListWorkers is a visibility API to list worker status information in a specific namespace.", + "operationId": "ListWorkers", + "responses": { + "200": { + "description": "A successful response.", + "schema": { + "$ref": "#/definitions/v1ListWorkersResponse" + } + }, + "default": { + "description": "An unexpected error response.", + "schema": { + "$ref": "#/definitions/rpcStatus" + } + } + }, + "parameters": [ + { + "name": "namespace", + "in": "path", + "required": true, + "type": "string" + }, + { + "name": "pageSize", + "in": "query", + "required": false, + "type": "integer", + "format": "int32" + }, + { + "name": "nextPageToken", + "in": "query", + "required": false, + "type": "string", + "format": "byte" + }, + { + "name": "query", + "description": "`query` in ListWorkers is used to filter workers based on worker status info.\nThe following worker status attributes are expected are supported as part of the query:\n* WorkerInstanceKey\n* WorkerIdentity\n* HostName\n* TaskQueue\n* DeploymentName\n* BuildId\n* SdkName\n* SdkVersion\n* StartTime\n* LastHeartbeatTime\n* Status\nCurrently metrics are not supported as a part of ListWorkers query.", + "in": "query", + "required": false, + "type": "string" + } + ], + "tags": [ + "WorkflowService" + ] + } + }, + "/namespaces/{namespace}/workers/heartbeat": { + "post": { + "summary": "WorkerHeartbeat receive heartbeat request from the worker.", + "operationId": "RecordWorkerHeartbeat", + "responses": { + "200": { + "description": "A successful response.", + "schema": { + "$ref": "#/definitions/v1RecordWorkerHeartbeatResponse" + } + }, + "default": { + "description": "An unexpected error response.", + 
"schema": { + "$ref": "#/definitions/rpcStatus" + } + } + }, + "parameters": [ + { + "name": "namespace", + "description": "Namespace this worker belongs to.", + "in": "path", + "required": true, + "type": "string" + }, + { + "name": "body", + "in": "body", + "required": true, + "schema": { + "$ref": "#/definitions/WorkflowServiceRecordWorkerHeartbeatBody" + } + } + ], + "tags": [ + "WorkflowService" + ] + } + }, "/namespaces/{namespace}/workflow-count": { "get": { "summary": "CountWorkflowExecutions is a visibility API to count of workflow executions in a specific namespace.", @@ -6812,6 +6996,18 @@ }, "routingConfig": { "$ref": "#/definitions/v1RoutingConfig" + }, + "latestVersionSummary": { + "$ref": "#/definitions/WorkerDeploymentInfoWorkerDeploymentVersionSummary", + "description": "Summary of the version that was added most recently in the Worker Deployment." + }, + "currentVersionSummary": { + "$ref": "#/definitions/WorkerDeploymentInfoWorkerDeploymentVersionSummary", + "description": "Summary of the current version of the Worker Deployment." + }, + "rampingVersionSummary": { + "$ref": "#/definitions/WorkerDeploymentInfoWorkerDeploymentVersionSummary", + "description": "Summary of the ramping version of the Worker Deployment." } }, "title": "A subset of WorkerDeploymentInfo" @@ -6922,7 +7118,7 @@ "properties": { "operationId": { "type": "string", - "description": "Deprecated: Renamed to operation_token." + "description": "Deprecated. Renamed to operation_token." }, "links": { "type": "array", @@ -7095,6 +7291,10 @@ "type": "string", "description": "Deprecated. Use `deployment_version`." }, + "status": { + "$ref": "#/definitions/v1WorkerDeploymentVersionStatus", + "description": "The status of the Worker Deployment Version." + }, "deploymentVersion": { "$ref": "#/definitions/v1WorkerDeploymentVersion", "description": "Required." 
@@ -7104,7 +7304,37 @@ "format": "date-time" }, "drainageStatus": { - "$ref": "#/definitions/v1VersionDrainageStatus" + "$ref": "#/definitions/v1VersionDrainageStatus", + "description": "Deprecated. Use `drainage_info` instead." + }, + "drainageInfo": { + "$ref": "#/definitions/v1VersionDrainageInfo", + "title": "Information about workflow drainage to help the user determine when it is safe\nto decommission a Version. Not present while version is current or ramping" + }, + "currentSinceTime": { + "type": "string", + "format": "date-time", + "description": "Unset if not current." + }, + "rampingSinceTime": { + "type": "string", + "format": "date-time", + "description": "Unset if not ramping. Updated when the version first starts ramping, not on each ramp change." + }, + "routingUpdateTime": { + "type": "string", + "format": "date-time", + "description": "Last time `current_since_time`, `ramping_since_time, or `ramp_percentage` of this version changed." + }, + "firstActivationTime": { + "type": "string", + "format": "date-time", + "description": "Timestamp when this version first became current or ramping." + }, + "lastDeactivationTime": { + "type": "string", + "format": "date-time", + "description": "Timestamp when this version last stopped being current or ramping." } } }, @@ -7339,6 +7569,18 @@ } } }, + "WorkflowServiceRecordWorkerHeartbeatBody": { + "type": "object", + "properties": { + "identity": { + "type": "string", + "description": "The identity of the client who initiated this request." 
+ }, + "workerHeartbeat": { + "$ref": "#/definitions/v1WorkerHeartbeat" + } + } + }, "WorkflowServiceRequestCancelWorkflowExecutionBody": { "type": "object", "properties": { @@ -7407,6 +7649,10 @@ "jitter": { "type": "string", "title": "If set, and activity is in backoff, the activity will start at a random time within the specified jitter duration.\n(unless it is paused and keep_paused is set)" + }, + "restoreOriginalOptions": { + "type": "boolean", + "description": "If set, the activity options will be restored to the defaults.\nDefault options are then options activity was created with.\nThey are part of the first SCHEDULE event." } } }, @@ -7437,7 +7683,7 @@ }, "resetReapplyType": { "$ref": "#/definitions/v1ResetReapplyType", - "title": "Event types to be reapplied (deprecated)\nDefault: RESET_REAPPLY_TYPE_SIGNAL" + "title": "Deprecated. Use `options`.\nDefault: RESET_REAPPLY_TYPE_SIGNAL" }, "resetReapplyExcludeTypes": { "type": "array", @@ -7764,7 +8010,7 @@ }, "control": { "type": "string", - "title": "Deprecated" + "description": "Deprecated." }, "retryPolicy": { "$ref": "#/definitions/v1RetryPolicy", @@ -7835,7 +8081,7 @@ }, "control": { "type": "string", - "title": "Deprecated" + "description": "Deprecated." }, "header": { "$ref": "#/definitions/v1Header", @@ -8144,6 +8390,10 @@ "type": { "type": "string", "description": "Update all running activities of this type." + }, + "restoreOriginal": { + "type": "boolean", + "description": "If set, the activity options will be restored to the default.\nDefault options are then options activity was created with.\nThey are part of the first SCHEDULE event.\nThis flag cannot be combined with any other option; if you supply\nrestore_original together with other options, the request will be rejected." } } }, @@ -8221,6 +8471,10 @@ "type": "string" }, "description": "List of keys to remove from the metadata." + }, + "identity": { + "type": "string", + "description": "Optional. 
The identity of the client who initiated this request." } }, "description": "Used to update the user-defined metadata of a Worker Deployment Version." @@ -8917,11 +9171,11 @@ }, "resetType": { "$ref": "#/definitions/v1ResetType", - "description": "Reset type (deprecated, use `options`)." + "description": "Deprecated. Use `options`." }, "resetReapplyType": { "$ref": "#/definitions/v1ResetReapplyType", - "description": "History event reapply options (deprecated, use `options`)." + "description": "Deprecated. Use `options`." }, "postResetOperations": { "type": "array", @@ -9220,7 +9474,7 @@ }, "operationId": { "type": "string", - "description": "Operation ID as originally generated by a Handler.\n\nDeprecated: Renamed to operation_token." + "description": "Operation ID as originally generated by a Handler.\n\nDeprecated. Renamed to operation_token." }, "operationToken": { "type": "string", @@ -9695,7 +9949,7 @@ }, "inheritBuildId": { "type": "boolean", - "description": "If this is set, the new execution inherits the Build ID of the current execution. Otherwise,\nthe assignment rules will be used to independently assign a Build ID to the new execution." + "description": "If this is set, the new execution inherits the Build ID of the current execution. Otherwise,\nthe assignment rules will be used to independently assign a Build ID to the new execution.\nDeprecated. Only considered for versioning v0.2." } } }, @@ -10027,23 +10281,26 @@ "items": { "type": "object", "$ref": "#/definitions/v1PollerInfo" - }, - "description": "Deprecated. Use `versions_info.types_info.pollers` with `ENHANCED` mode instead.\nNot set in `ENHANCED` mode." + } + }, + "stats": { + "$ref": "#/definitions/v1TaskQueueStats", + "description": "Statistics for the task queue. Only populated when `report_stats` is set to true in the request." 
+ }, + "versioningInfo": { + "$ref": "#/definitions/v1TaskQueueVersioningInfo", + "description": "Specifies which Worker Deployment Version(s) Server routes this Task Queue's tasks to.\nWhen not present, it means the tasks are routed to Unversioned workers (workers with\nUNVERSIONED or unspecified WorkerVersioningMode.)\nTask Queue Versioning info is updated indirectly by calling SetWorkerDeploymentCurrentVersion\nand SetWorkerDeploymentRampingVersion on Worker Deployments.\nNote: This information is not relevant to Pinned workflow executions and their activities as\nthey are always routed to their Pinned Deployment Version. However, new workflow executions\nare typically not Pinned until they complete their first task (unless they are started with\na Pinned VersioningOverride or are Child Workflows of a Pinned parent)." }, "taskQueueStatus": { "$ref": "#/definitions/v1TaskQueueStatus", - "description": "Deprecated. Not set in `ENHANCED` mode." + "description": "Deprecated.\nStatus of the task queue. Only populated when `include_task_queue_status` is set to true in the request." }, "versionsInfo": { "type": "object", "additionalProperties": { "$ref": "#/definitions/v1TaskQueueVersionInfo" }, - "description": "This map contains Task Queue information for each Build ID. Empty string as key value means unversioned.\nOnly set in `ENHANCED` mode." - }, - "versioningInfo": { - "$ref": "#/definitions/v1TaskQueueVersioningInfo", - "description": "Specifies which Worker Deployment Version(s) Server routes this Task Queue's tasks to.\nWhen not present, it means the tasks are routed to Unversioned workers (workers with\nUNVERSIONED or unspecified WorkerVersioningMode.)\nTask Queue Versioning info is updated indirectly by calling SetWorkerDeploymentCurrentVersion\nand SetWorkerDeploymentRampingVersion on Worker Deployments.\nNote: This information is not relevant to Pinned workflow executions and their activities as\nthey are always routed to their Pinned Deployment Version. 
However, new workflow executions\nare typically not Pinned until they complete their first task (unless they are started with\na Pinned VersioningOverride or are Child Workflows of a Pinned parent)." + "description": "Deprecated.\nOnly returned in ENHANCED mode.\nThis map contains Task Queue information for each Build ID. Empty string as key value means unversioned." } } }, @@ -10324,7 +10581,7 @@ }, "control": { "type": "string", - "title": "Deprecated" + "description": "Deprecated." } } }, @@ -11113,6 +11370,23 @@ } } }, + "v1ListWorkersResponse": { + "type": "object", + "properties": { + "workersInfo": { + "type": "array", + "items": { + "type": "object", + "$ref": "#/definitions/v1WorkerInfo" + } + }, + "nextPageToken": { + "type": "string", + "format": "byte", + "title": "Next page token" + } + } + }, "v1ListWorkflowExecutionsResponse": { "type": "object", "properties": { @@ -11642,7 +11916,7 @@ }, "operationId": { "type": "string", - "description": "Operation ID - may be empty if the operation completed synchronously.\n\nDeprecated: Renamed to operation_token." + "description": "Operation ID - may be empty if the operation completed synchronously.\n\nDeprecated. Renamed to operation_token." }, "operationToken": { "type": "string", @@ -11850,15 +12124,15 @@ "useWorkflowBuildId": { "type": "object", "properties": {}, - "description": "When present, it means this activity is assigned to the build ID of its workflow." + "description": "Deprecated. When present, it means this activity is assigned to the build ID of its workflow." }, "lastIndependentlyAssignedBuildId": { "type": "string", - "description": "This means the activity is independently versioned and not bound to the build ID of its workflow.\nThe activity will use the build id in this field instead.\nIf the task fails and is scheduled again, the assigned build ID may change according to the latest versioning\nrules." + "description": "Deprecated. 
This means the activity is independently versioned and not bound to the build ID of its workflow.\nThe activity will use the build id in this field instead.\nIf the task fails and is scheduled again, the assigned build ID may change according to the latest versioning\nrules." }, "lastWorkerVersionStamp": { "$ref": "#/definitions/v1WorkerVersionStamp", - "title": "The version stamp of the worker to whom this activity was most recently dispatched\nDeprecated. This field should be cleaned up when versioning-2 API is removed. [cleanup-experimental-wv]" + "title": "Deprecated. The version stamp of the worker to whom this activity was most recently dispatched\nThis field should be cleaned up when versioning-2 API is removed. [cleanup-experimental-wv]" }, "currentRetryInterval": { "type": "string", @@ -11955,7 +12229,7 @@ }, "operationId": { "type": "string", - "description": "Operation ID. Only set for asynchronous operations after a successful StartOperation call.\n\nDeprecated: Renamed to operation_token." + "description": "Operation ID. Only set for asynchronous operations after a successful StartOperation call.\n\nDeprecated. Renamed to operation_token." }, "scheduleToCloseTimeout": { "type": "string", @@ -12446,6 +12720,9 @@ } } }, + "v1RecordWorkerHeartbeatResponse": { + "type": "object" + }, "v1RegisterNamespaceRequest": { "type": "object", "properties": { @@ -12598,7 +12875,7 @@ }, "control": { "type": "string", - "title": "Deprecated" + "description": "Deprecated." } } }, @@ -12622,7 +12899,7 @@ }, "control": { "type": "string", - "title": "Deprecated" + "description": "Deprecated." }, "childWorkflowOnly": { "type": "boolean", @@ -12693,7 +12970,7 @@ }, "resetReapplyType": { "$ref": "#/definitions/v1ResetReapplyType", - "title": "Event types to be reapplied (deprecated)\nDefault: RESET_REAPPLY_TYPE_SIGNAL" + "title": "Deprecated. 
Use `options`.\nDefault: RESET_REAPPLY_TYPE_SIGNAL" }, "currentRunOnly": { "type": "boolean", @@ -12718,7 +12995,7 @@ }, "binaryChecksum": { "type": "string", - "description": "A worker binary version identifier (deprecated)." + "description": "Deprecated. A worker binary version identifier." }, "runId": { "type": "string", @@ -12778,7 +13055,7 @@ "RESET_REAPPLY_TYPE_ALL_ELIGIBLE" ], "default": "RESET_REAPPLY_TYPE_UNSPECIFIED", - "description": "Event types to include when reapplying events. Deprecated: applications\nshould use ResetReapplyExcludeType to specify exclusions from this set, and\nnew event types should be added to ResetReapplyExcludeType instead of here.\n\n - RESET_REAPPLY_TYPE_SIGNAL: Signals are reapplied when workflow is reset.\n - RESET_REAPPLY_TYPE_NONE: No events are reapplied when workflow is reset.\n - RESET_REAPPLY_TYPE_ALL_ELIGIBLE: All eligible events are reapplied when workflow is reset." + "description": "Deprecated: applications should use ResetReapplyExcludeType to specify\nexclusions from this set, and new event types should be added to ResetReapplyExcludeType\ninstead of here.\n\n - RESET_REAPPLY_TYPE_SIGNAL: Signals are reapplied when workflow is reset.\n - RESET_REAPPLY_TYPE_NONE: No events are reapplied when workflow is reset.\n - RESET_REAPPLY_TYPE_ALL_ELIGIBLE: All eligible events are reapplied when workflow is reset." }, "v1ResetStickyTaskQueueResponse": { "type": "object" @@ -12791,7 +13068,7 @@ "RESET_TYPE_LAST_WORKFLOW_TASK" ], "default": "RESET_TYPE_UNSPECIFIED", - "description": "Reset type options. Deprecated, see temporal.api.common.v1.ResetOptions.\n\n - RESET_TYPE_FIRST_WORKFLOW_TASK: Resets to event of the first workflow task completed, or if it does not exist, the event after task scheduled.\n - RESET_TYPE_LAST_WORKFLOW_TASK: Resets to event of the last workflow task completed, or if it does not exist, the event after task scheduled." 
+ "description": "Deprecated, see temporal.api.common.v1.ResetOptions.\n\n - RESET_TYPE_FIRST_WORKFLOW_TASK: Resets to event of the first workflow task completed, or if it does not exist, the event after task scheduled.\n - RESET_TYPE_LAST_WORKFLOW_TASK: Resets to event of the last workflow task completed, or if it does not exist, the event after task scheduled." }, "v1ResetWorkflowExecutionResponse": { "type": "object", @@ -13157,7 +13434,8 @@ "format": "date-time" }, "invalidScheduleError": { - "type": "string" + "type": "string", + "description": "Deprecated." } } }, @@ -13346,7 +13624,7 @@ "type": "object", "$ref": "#/definitions/v1CalendarSpec" }, - "description": "Any timestamps matching any of exclude_* will be skipped.\n\nuse exclude_structured_calendar" + "description": "Any timestamps matching any of exclude_* will be skipped.\nDeprecated. Use exclude_structured_calendar." }, "excludeStructuredCalendar": { "type": "array", @@ -13557,7 +13835,7 @@ }, "control": { "type": "string", - "title": "Deprecated" + "description": "Deprecated." } } }, @@ -13589,7 +13867,7 @@ }, "control": { "type": "string", - "title": "Deprecated" + "description": "Deprecated." }, "childWorkflowOnly": { "type": "boolean", @@ -13678,7 +13956,7 @@ }, "inheritBuildId": { "type": "boolean", - "description": "If this is set, the child workflow inherits the Build ID of the parent. Otherwise, the assignment\nrules of the child's Task Queue will be used to independently assign a Build ID to it." + "description": "If this is set, the child workflow inherits the Build ID of the parent. Otherwise, the assignment\nrules of the child's Task Queue will be used to independently assign a Build ID to it.\nDeprecated. Only considered for versioning v0.2." }, "priority": { "$ref": "#/definitions/v1Priority", @@ -13716,7 +13994,7 @@ }, "control": { "type": "string", - "title": "Deprecated" + "description": "Deprecated." 
}, "initiatedEventId": { "type": "string", @@ -13770,7 +14048,7 @@ }, "control": { "type": "string", - "title": "Deprecated" + "description": "Deprecated." }, "workflowTaskCompletedEventId": { "type": "string", @@ -13799,7 +14077,7 @@ }, "inheritBuildId": { "type": "boolean", - "description": "If this is set, the child workflow inherits the Build ID of the parent. Otherwise, the assignment\nrules of the child's Task Queue will be used to independently assign a Build ID to it." + "description": "If this is set, the child workflow inherits the Build ID of the parent. Otherwise, the assignment\nrules of the child's Task Queue will be used to independently assign a Build ID to it.\nDeprecated. Only considered for versioning v0.2." }, "priority": { "$ref": "#/definitions/v1Priority", @@ -14834,7 +15112,7 @@ "description": "Required if behavior is `PINNED`. Must be absent if behavior is not `PINNED`.\nIdentifies the worker deployment version to pin the workflow to, in the format\n\".\".\nDeprecated. Use `override.pinned.version`." } }, - "description": "Used to override the versioning behavior (and pinned deployment version, if applicable) of a\nspecific workflow execution. If set, takes precedence over the worker-sent values. See\n`WorkflowExecutionInfo.VersioningInfo` for more information. To remove the override, call\n`UpdateWorkflowExecutionOptions` with a null `VersioningOverride`, and use the `update_mask`\nto indicate that it should be mutated." + "description": "Used to override the versioning behavior (and pinned deployment version, if applicable) of a\nspecific workflow execution. If set, takes precedence over the worker-sent values. See\n`WorkflowExecutionInfo.VersioningInfo` for more information. 
To remove the override, call\n`UpdateWorkflowExecutionOptions` with a null `VersioningOverride`, and use the `update_mask`\nto indicate that it should be mutated.\nPinned overrides are automatically inherited by child workflows, continue-as-new workflows,\nworkflow retries, and cron workflows." }, "v1WaitPolicy": { "type": "object", @@ -14914,6 +15192,10 @@ "type": "string", "description": "Deprecated. Use `deployment_version`." }, + "status": { + "$ref": "#/definitions/v1WorkerDeploymentVersionStatus", + "description": "The status of the Worker Deployment Version." + }, "deploymentVersion": { "$ref": "#/definitions/v1WorkerDeploymentVersion", "description": "Required." @@ -14933,12 +15215,22 @@ "currentSinceTime": { "type": "string", "format": "date-time", - "description": "\nNil if not current." + "description": "\nUnset if not current." }, "rampingSinceTime": { "type": "string", "format": "date-time", - "description": "\nNil if not ramping. Updated when the version first starts ramping, not on each ramp change." + "description": "\nUnset if not ramping. Updated when the version first starts ramping, not on each ramp change." + }, + "firstActivationTime": { + "type": "string", + "format": "date-time", + "description": "Timestamp when this version first became current or ramping." + }, + "lastDeactivationTime": { + "type": "string", + "format": "date-time", + "description": "Timestamp when this version last stopped being current or ramping." }, "rampPercentage": { "type": "number", @@ -14964,6 +15256,206 @@ }, "description": "A Worker Deployment Version (Version, for short) represents all workers of the same \ncode and config within a Deployment. Workers of the same Version are expected to \nbehave exactly the same so when executions move between them there are no \nnon-determinism issues.\nWorker Deployment Versions are created in Temporal server automatically when \ntheir first poller arrives to the server.\nExperimental. 
Worker Deployments are experimental and might significantly change in the future." }, + "v1WorkerDeploymentVersionStatus": { + "type": "string", + "enum": [ + "WORKER_DEPLOYMENT_VERSION_STATUS_UNSPECIFIED", + "WORKER_DEPLOYMENT_VERSION_STATUS_INACTIVE", + "WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT", + "WORKER_DEPLOYMENT_VERSION_STATUS_RAMPING", + "WORKER_DEPLOYMENT_VERSION_STATUS_DRAINING", + "WORKER_DEPLOYMENT_VERSION_STATUS_DRAINED" + ], + "default": "WORKER_DEPLOYMENT_VERSION_STATUS_UNSPECIFIED", + "description": "Specify the status of a Worker Deployment Version.\nExperimental. Worker Deployments are experimental and might significantly change in the future.\n\n - WORKER_DEPLOYMENT_VERSION_STATUS_INACTIVE: The Worker Deployment Version has been created inside the Worker Deployment but is not used by any\nworkflow executions. These Versions can still have workflows if they have an explicit Versioning Override targeting\nthis Version. Such Versioning Override could be set at workflow start time, or at a later time via `UpdateWorkflowExecutionOptions`.\n - WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT: The Worker Deployment Version is the current version of the Worker Deployment. All new workflow executions \nand tasks of existing unversioned or AutoUpgrade workflows are routed to this version.\n - WORKER_DEPLOYMENT_VERSION_STATUS_RAMPING: The Worker Deployment Version is the ramping version of the Worker Deployment. A subset of new Pinned workflow executions are \nrouted to this version. Moreover, a portion of existing unversioned or AutoUpgrade workflow executions are also routed to this version.\n - WORKER_DEPLOYMENT_VERSION_STATUS_DRAINING: The Worker Deployment Version is not used by new workflows but is still used by\nopen pinned workflows. The version cannot be decommissioned safely.\n - WORKER_DEPLOYMENT_VERSION_STATUS_DRAINED: The Worker Deployment Version is not used by new or open workflows, but might be still needed by\nQueries sent to closed workflows. 
The version can be decommissioned safely if user does\nnot query closed workflows. If the user does query closed workflows for some time x after\nworkflows are closed, they should decommission the version after it has been drained for that duration." + }, + "v1WorkerHeartbeat": { + "type": "object", + "properties": { + "workerInstanceKey": { + "type": "string", + "description": "Worker identifier, should be unique for the namespace.\nIt is distinct from worker identity, which is not necessarily namespace-unique." + }, + "workerIdentity": { + "type": "string", + "description": "Worker identity, set by the client, may not be unique.\nUsually host_name+(user group name)+process_id, but can be overwritten by the user." + }, + "hostInfo": { + "$ref": "#/definitions/v1WorkerHostInfo", + "description": "Worker host information." + }, + "taskQueue": { + "type": "string", + "description": "Task queue this worker is polling for tasks." + }, + "deploymentVersion": { + "$ref": "#/definitions/v1WorkerDeploymentVersion" + }, + "sdkName": { + "type": "string" + }, + "sdkVersion": { + "type": "string" + }, + "status": { + "$ref": "#/definitions/v1WorkerStatus", + "description": "Worker status. Defined by SDK." + }, + "startTime": { + "type": "string", + "format": "date-time", + "title": "Worker start time.\nIt can be used to determine worker uptime. (current time - start time)" + }, + "heartbeatTime": { + "type": "string", + "format": "date-time", + "description": "Timestamp of this heartbeat, coming from the worker. Worker should set it to \"now\".\nNote that this timestamp comes directly from the worker and is subject to workers' clock skew." + }, + "elapsedSinceLastHeartbeat": { + "type": "string", + "description": "Elapsed time since the last heartbeat from the worker." 
+ }, + "workflowTaskSlotsInfo": { + "$ref": "#/definitions/v1WorkerSlotsInfo" + }, + "activityTaskSlotsInfo": { + "$ref": "#/definitions/v1WorkerSlotsInfo" + }, + "nexusTaskSlotsInfo": { + "$ref": "#/definitions/v1WorkerSlotsInfo" + }, + "localActivitySlotsInfo": { + "$ref": "#/definitions/v1WorkerSlotsInfo" + }, + "workflowPollerInfo": { + "$ref": "#/definitions/v1WorkerPollerInfo" + }, + "workflowStickyPollerInfo": { + "$ref": "#/definitions/v1WorkerPollerInfo" + }, + "activityPollerInfo": { + "$ref": "#/definitions/v1WorkerPollerInfo" + }, + "nexusPollerInfo": { + "$ref": "#/definitions/v1WorkerPollerInfo" + }, + "totalStickyCacheHit": { + "type": "integer", + "format": "int32", + "description": "A Workflow Task found a cached Workflow Execution to run against." + }, + "totalStickyCacheMiss": { + "type": "integer", + "format": "int32", + "description": "A Workflow Task did not find a cached Workflow execution to run against." + }, + "currentStickyCacheSize": { + "type": "integer", + "format": "int32", + "description": "Current cache size, expressed in number of Workflow Executions." + } + }, + "description": "Worker info message, contains information about the worker and its current state.\nAll information is provided by the worker itself." + }, + "v1WorkerHostInfo": { + "type": "object", + "properties": { + "hostName": { + "type": "string", + "description": "Worker host identifier." + }, + "processId": { + "type": "string", + "description": "Worker process identifier, should be unique for the host." + }, + "currentHostCpuUsage": { + "type": "number", + "format": "float", + "description": "System used CPU as a float in the range [0.0, 1.0] where 1.0 is defined as all\ncores on the host pegged." + }, + "currentHostMemUsage": { + "type": "number", + "format": "float", + "description": "System used memory as a float in the range [0.0, 1.0] where 1.0 is defined as\nall available memory on the host is used." 
+ } + }, + "title": "Holds everything needed to identify the worker host/process context" + }, + "v1WorkerInfo": { + "type": "object", + "properties": { + "workerHeartbeat": { + "$ref": "#/definitions/v1WorkerHeartbeat" + } + } + }, + "v1WorkerPollerInfo": { + "type": "object", + "properties": { + "currentPollers": { + "type": "integer", + "format": "int32", + "description": "Number of polling RPCs that are currently in flight." + }, + "lastSuccessfulPollTime": { + "type": "string", + "format": "date-time" + }, + "isAutoscaling": { + "type": "boolean", + "title": "Set true if the number of concurrent pollers is auto-scaled" + } + } + }, + "v1WorkerSlotsInfo": { + "type": "object", + "properties": { + "currentAvailableSlots": { + "type": "integer", + "format": "int32", + "description": "Number of slots available for the worker to specific tasks.\nMay be -1 if the upper bound is not known." + }, + "currentUsedSlots": { + "type": "integer", + "format": "int32", + "description": "Number of slots used by the worker for specific tasks." + }, + "slotSupplierKind": { + "type": "string", + "title": "Kind of the slot supplier, which is used to determine how the slots are allocated.\nPossible values: \"Fixed | ResourceBased | Custom String\"" + }, + "totalProcessedTasks": { + "type": "integer", + "format": "int32", + "description": "Total number of tasks processed (completed both successfully and unsuccesfully, or any other way)\nby the worker since the worker started. This is a cumulative counter." + }, + "totalFailedTasks": { + "type": "integer", + "format": "int32", + "description": "Total number of failed tasks processed by the worker so far." + }, + "lastIntervalProcessedTasks": { + "type": "integer", + "format": "int32", + "description": "Number of tasks processed in since the last heartbeat from the worker.\nThis is a cumulative counter, and it is reset to 0 each time the worker sends a heartbeat.\nContains both successful and failed tasks." 
+ }, + "lastIntervalFailureTasks": { + "type": "integer", + "format": "int32", + "description": "Number of failed tasks processed since the last heartbeat from the worker." + } + } + }, + "v1WorkerStatus": { + "type": "string", + "enum": [ + "WORKER_STATUS_UNSPECIFIED", + "WORKER_STATUS_RUNNING", + "WORKER_STATUS_SHUTTING_DOWN", + "WORKER_STATUS_SHUTDOWN" + ], + "default": "WORKER_STATUS_UNSPECIFIED" + }, "v1WorkerVersionCapabilities": { "type": "object", "properties": { @@ -15146,7 +15638,7 @@ }, "inheritBuildId": { "type": "boolean", - "description": "If this is set, the new execution inherits the Build ID of the current execution. Otherwise,\nthe assignment rules will be used to independently assign a Build ID to the new execution." + "description": "If this is set, the new execution inherits the Build ID of the current execution. Otherwise,\nthe assignment rules will be used to independently assign a Build ID to the new execution.\nDeprecated. Only considered for versioning v0.2." } } }, @@ -15367,11 +15859,11 @@ }, "header": { "$ref": "#/definitions/v1Header", - "description": "Headers that were passed by the sender of the signal and copied by temporal \nserver into the workflow task." + "description": "Headers that were passed by the sender of the signal and copied by temporal\nserver into the workflow task." }, "skipGenerateWorkflowTask": { "type": "boolean", - "description": "This field is deprecated and never respected. It should always be set to false." + "description": "Deprecated. This field is never respected and should always be set to false." }, "externalWorkflowExecution": { "$ref": "#/definitions/v1WorkflowExecution", @@ -15422,7 +15914,7 @@ }, "continuedExecutionRunId": { "type": "string", - "description": "Run id of the previous workflow which continued-as-new or retired or cron executed into this\nworkflow." + "description": "Run id of the previous workflow which continued-as-new or retried or cron executed into this\nworkflow." 
}, "initiator": { "$ref": "#/definitions/v1ContinueAsNewInitiator" @@ -15509,19 +16001,19 @@ }, "versioningOverride": { "$ref": "#/definitions/v1VersioningOverride", - "description": "Versioning override applied to this workflow when it was started." + "description": "Versioning override applied to this workflow when it was started.\nChildren, crons, retries, and continue-as-new will inherit source run's override if pinned\nand if the new workflow's Task Queue belongs to the override version." }, "parentPinnedWorkerDeploymentVersion": { "type": "string", - "description": "When present, it means this is a child workflow of a parent that is Pinned to this Worker\nDeployment Version. In this case, child workflow will start as Pinned to this Version instead\nof starting on the Current Version of its Task Queue.\nThis is set only if the child workflow is starting on a Task Queue belonging to the same\nWorker Deployment Version.\nDeprecated. Use `parent_pinned_deployment_version`." - }, - "parentPinnedDeploymentVersion": { - "$ref": "#/definitions/v1WorkerDeploymentVersion", - "description": "When present, it means this is a child workflow of a parent that is Pinned to this Worker\nDeployment Version. In this case, child workflow will start as Pinned to this Version instead\nof starting on the Current Version of its Task Queue.\nThis is set only if the child workflow is starting on a Task Queue belonging to the same\nWorker Deployment Version." + "description": "When present, it means this is a child workflow of a parent that is Pinned to this Worker\nDeployment Version. In this case, child workflow will start as Pinned to this Version instead\nof starting on the Current Version of its Task Queue.\nThis is set only if the child workflow is starting on a Task Queue belonging to the same\nWorker Deployment Version.\nDeprecated. Use `parent_versioning_info`." 
}, "priority": { "$ref": "#/definitions/v1Priority", "title": "Priority metadata" + }, + "inheritedPinnedVersion": { + "$ref": "#/definitions/v1WorkerDeploymentVersion", + "description": "If present, the new workflow should start on this version with pinned base behavior.\nChild of pinned parent will inherit the parent's version if the Child's Task Queue belongs to that version.\n\nNew run initiated by workflow ContinueAsNew of pinned run, will inherit the previous run's version if the\nnew run's Task Queue belongs to that version.\n\nNew run initiated by workflow Cron will never inherit.\n\nNew run initiated by workflow Retry will only inherit if the retried run is effectively pinned at the time\nof retry, and the retried run inherited a pinned version when it started (ie. it is a child of a pinned\nparent, or a CaN of a pinned run, and is running on a Task Queue in the inherited version).\n\nPinned override is inherited if Task Queue of new run is compatible with the override version.\nOverride is inherited separately and takes precedence over inherited base version." } }, "title": "Always the first event in workflow history" @@ -15653,7 +16145,7 @@ "properties": { "behavior": { "$ref": "#/definitions/v1VersioningBehavior", - "description": "Versioning behavior determines how the server should treat this execution when workers are\nupgraded. When present it means this workflow execution is versioned; UNSPECIFIED means\nunversioned. See the comments in `VersioningBehavior` enum for more info about different\nbehaviors.\nThis field is first set after an execution completes its first workflow task on a versioned\nworker, and set again on completion of every subsequent workflow task.\nFor child workflows of Pinned parents, this will be set to Pinned (along with `version`) when\nthe the child starts so that child's first workflow task goes to the same Version as the\nparent. 
After the first workflow task, it depends on the child workflow itself if it wants\nto stay pinned or become unpinned (according to Versioning Behavior set in the worker).\nNote that `behavior` is overridden by `versioning_override` if the latter is present." + "description": "Versioning behavior determines how the server should treat this execution when workers are\nupgraded. When present it means this workflow execution is versioned; UNSPECIFIED means\nunversioned. See the comments in `VersioningBehavior` enum for more info about different\nbehaviors.\nThis field is first set after an execution completes its first workflow task on a versioned\nworker, and set again on completion of every subsequent workflow task.\nFor child workflows of Pinned parents, this will be set to Pinned (along with `deployment_version`) when\nthe the child starts so that child's first workflow task goes to the same Version as the\nparent. After the first workflow task, it depends on the child workflow itself if it wants\nto stay pinned or become unpinned (according to Versioning Behavior set in the worker).\nNote that `behavior` is overridden by `versioning_override` if the latter is present." }, "deployment": { "$ref": "#/definitions/v1Deployment", @@ -15669,7 +16161,7 @@ }, "versioningOverride": { "$ref": "#/definitions/v1VersioningOverride", - "description": "Present if user has set an execution-specific versioning override. This override takes\nprecedence over SDK-sent `behavior` (and `version` when override is PINNED). An\noverride can be set when starting a new execution, as well as afterwards by calling the\n`UpdateWorkflowExecutionOptions` API.\nPinned overrides are automatically inherited by child workflows." + "description": "Present if user has set an execution-specific versioning override. This override takes\nprecedence over SDK-sent `behavior` (and `version` when override is PINNED). 
An\noverride can be set when starting a new execution, as well as afterwards by calling the\n`UpdateWorkflowExecutionOptions` API.\nPinned overrides are automatically inherited by child workflows, continue-as-new workflows,\nworkflow retries, and cron workflows." }, "deploymentTransition": { "$ref": "#/definitions/v1DeploymentTransition", @@ -15677,7 +16169,7 @@ }, "versionTransition": { "$ref": "#/definitions/v1DeploymentVersionTransition", - "description": "When present, indicates the workflow is transitioning to a different deployment version\n(which may belong to the same deployment name or another). Can indicate one of the following\ntransitions: unversioned -> versioned, versioned -> versioned\non a different deployment version, or versioned -> unversioned.\nNot applicable to workflows with PINNED behavior.\nWhen a workflow with AUTO_UPGRADE behavior creates a new workflow task, it will automatically\nstart a transition to the task queue's current version if the task queue's current version is\ndifferent from the workflow's current deployment version.\nIf the AUTO_UPGRADE workflow is stuck due to backlogged activity or workflow tasks, those\ntasks will be redirected to the task queue's current version. As soon as a poller from\nthat deployment version is available to receive the task, the workflow will automatically\nstart a transition to that version and continue execution there.\nA version transition can only exist while there is a pending or started workflow task.\nOnce the pending workflow task completes on the transition's target version, the\ntransition completes and the workflow's `behavior`, and `version` fields are updated per the\nworker's task completion response.\nPending activities will not start new attempts during a transition. Once the transition is\ncompleted, pending activities will start their next attempt on the new version." 
+ "description": "When present, indicates the workflow is transitioning to a different deployment version\n(which may belong to the same deployment name or another). Can indicate one of the following\ntransitions: unversioned -> versioned, versioned -> versioned\non a different deployment version, or versioned -> unversioned.\nNot applicable to workflows with PINNED behavior.\nWhen a workflow with AUTO_UPGRADE behavior creates a new workflow task, it will automatically\nstart a transition to the task queue's current version if the task queue's current version is\ndifferent from the workflow's current deployment version.\nIf the AUTO_UPGRADE workflow is stuck due to backlogged activity or workflow tasks, those\ntasks will be redirected to the task queue's current version. As soon as a poller from\nthat deployment version is available to receive the task, the workflow will automatically\nstart a transition to that version and continue execution there.\nA version transition can only exist while there is a pending or started workflow task.\nOnce the pending workflow task completes on the transition's target version, the\ntransition completes and the workflow's `behavior`, and `deployment_version` fields are updated per the\nworker's task completion response.\nPending activities will not start new attempts during a transition. Once the transition is\ncompleted, pending activities will start their next attempt on the new version." } }, "description": "Holds all the information about worker versioning for a particular workflow execution.\nExperimental. Versioning info is experimental and might change in the future." 
@@ -15968,10 +16460,11 @@ "WORKFLOW_TASK_FAILED_CAUSE_BAD_SCHEDULE_NEXUS_OPERATION_ATTRIBUTES", "WORKFLOW_TASK_FAILED_CAUSE_PENDING_NEXUS_OPERATIONS_LIMIT_EXCEEDED", "WORKFLOW_TASK_FAILED_CAUSE_BAD_REQUEST_CANCEL_NEXUS_OPERATION_ATTRIBUTES", - "WORKFLOW_TASK_FAILED_CAUSE_FEATURE_DISABLED" + "WORKFLOW_TASK_FAILED_CAUSE_FEATURE_DISABLED", + "WORKFLOW_TASK_FAILED_CAUSE_GRPC_MESSAGE_TOO_LARGE" ], "default": "WORKFLOW_TASK_FAILED_CAUSE_UNSPECIFIED", - "description": "Workflow tasks can fail for various reasons. Note that some of these reasons can only originate\nfrom the server, and some of them can only originate from the SDK/worker.\n\n - WORKFLOW_TASK_FAILED_CAUSE_UNHANDLED_COMMAND: Between starting and completing the workflow task (with a workflow completion command), some\nnew command (like a signal) was processed into workflow history. The outstanding task will be\nfailed with this reason, and a worker must pick up a new task.\n - WORKFLOW_TASK_FAILED_CAUSE_RESET_STICKY_TASK_QUEUE: The worker wishes to fail the task and have the next one be generated on a normal, not sticky\nqueue. Generally workers should prefer to use the explicit `ResetStickyTaskQueue` RPC call.\n - WORKFLOW_TASK_FAILED_CAUSE_NON_DETERMINISTIC_ERROR: The worker encountered a mismatch while replaying history between what was expected, and\nwhat the workflow code actually did.\n - WORKFLOW_TASK_FAILED_CAUSE_PENDING_CHILD_WORKFLOWS_LIMIT_EXCEEDED: We send the below error codes to users when their requests would violate a size constraint\nof their workflow. We do this to ensure that the state of their workflow does not become too\nlarge because that can cause severe performance degradation. 
You can modify the thresholds for\neach of these errors within your dynamic config.\n\nSpawning a new child workflow would cause this workflow to exceed its limit of pending child\nworkflows.\n - WORKFLOW_TASK_FAILED_CAUSE_PENDING_ACTIVITIES_LIMIT_EXCEEDED: Starting a new activity would cause this workflow to exceed its limit of pending activities\nthat we track.\n - WORKFLOW_TASK_FAILED_CAUSE_PENDING_SIGNALS_LIMIT_EXCEEDED: A workflow has a buffer of signals that have not yet reached their destination. We return this\nerror when sending a new signal would exceed the capacity of this buffer.\n - WORKFLOW_TASK_FAILED_CAUSE_PENDING_REQUEST_CANCEL_LIMIT_EXCEEDED: Similarly, we have a buffer of pending requests to cancel other workflows. We return this error\nwhen our capacity for pending cancel requests is already reached.\n - WORKFLOW_TASK_FAILED_CAUSE_BAD_UPDATE_WORKFLOW_EXECUTION_MESSAGE: Workflow execution update message (update.Acceptance, update.Rejection, or update.Response)\nhas wrong format, or missing required fields.\n - WORKFLOW_TASK_FAILED_CAUSE_UNHANDLED_UPDATE: Similar to WORKFLOW_TASK_FAILED_CAUSE_UNHANDLED_COMMAND, but for updates.\n - WORKFLOW_TASK_FAILED_CAUSE_BAD_SCHEDULE_NEXUS_OPERATION_ATTRIBUTES: A workflow task completed with an invalid ScheduleNexusOperation command.\n - WORKFLOW_TASK_FAILED_CAUSE_PENDING_NEXUS_OPERATIONS_LIMIT_EXCEEDED: A workflow task completed requesting to schedule a Nexus Operation exceeding the server configured limit.\n - WORKFLOW_TASK_FAILED_CAUSE_BAD_REQUEST_CANCEL_NEXUS_OPERATION_ATTRIBUTES: A workflow task completed with an invalid RequestCancelNexusOperation command.\n - WORKFLOW_TASK_FAILED_CAUSE_FEATURE_DISABLED: A workflow task completed requesting a feature that's disabled on the server (either system wide or - typically -\nfor the workflow's namespace).\nCheck the workflow task failure message for more information." + "description": "Workflow tasks can fail for various reasons. 
Note that some of these reasons can only originate\nfrom the server, and some of them can only originate from the SDK/worker.\n\n - WORKFLOW_TASK_FAILED_CAUSE_UNHANDLED_COMMAND: Between starting and completing the workflow task (with a workflow completion command), some\nnew command (like a signal) was processed into workflow history. The outstanding task will be\nfailed with this reason, and a worker must pick up a new task.\n - WORKFLOW_TASK_FAILED_CAUSE_RESET_STICKY_TASK_QUEUE: The worker wishes to fail the task and have the next one be generated on a normal, not sticky\nqueue. Generally workers should prefer to use the explicit `ResetStickyTaskQueue` RPC call.\n - WORKFLOW_TASK_FAILED_CAUSE_NON_DETERMINISTIC_ERROR: The worker encountered a mismatch while replaying history between what was expected, and\nwhat the workflow code actually did.\n - WORKFLOW_TASK_FAILED_CAUSE_PENDING_CHILD_WORKFLOWS_LIMIT_EXCEEDED: We send the below error codes to users when their requests would violate a size constraint\nof their workflow. We do this to ensure that the state of their workflow does not become too\nlarge because that can cause severe performance degradation. You can modify the thresholds for\neach of these errors within your dynamic config.\n\nSpawning a new child workflow would cause this workflow to exceed its limit of pending child\nworkflows.\n - WORKFLOW_TASK_FAILED_CAUSE_PENDING_ACTIVITIES_LIMIT_EXCEEDED: Starting a new activity would cause this workflow to exceed its limit of pending activities\nthat we track.\n - WORKFLOW_TASK_FAILED_CAUSE_PENDING_SIGNALS_LIMIT_EXCEEDED: A workflow has a buffer of signals that have not yet reached their destination. We return this\nerror when sending a new signal would exceed the capacity of this buffer.\n - WORKFLOW_TASK_FAILED_CAUSE_PENDING_REQUEST_CANCEL_LIMIT_EXCEEDED: Similarly, we have a buffer of pending requests to cancel other workflows. 
We return this error\nwhen our capacity for pending cancel requests is already reached.\n - WORKFLOW_TASK_FAILED_CAUSE_BAD_UPDATE_WORKFLOW_EXECUTION_MESSAGE: Workflow execution update message (update.Acceptance, update.Rejection, or update.Response)\nhas wrong format, or missing required fields.\n - WORKFLOW_TASK_FAILED_CAUSE_UNHANDLED_UPDATE: Similar to WORKFLOW_TASK_FAILED_CAUSE_UNHANDLED_COMMAND, but for updates.\n - WORKFLOW_TASK_FAILED_CAUSE_BAD_SCHEDULE_NEXUS_OPERATION_ATTRIBUTES: A workflow task completed with an invalid ScheduleNexusOperation command.\n - WORKFLOW_TASK_FAILED_CAUSE_PENDING_NEXUS_OPERATIONS_LIMIT_EXCEEDED: A workflow task completed requesting to schedule a Nexus Operation exceeding the server configured limit.\n - WORKFLOW_TASK_FAILED_CAUSE_BAD_REQUEST_CANCEL_NEXUS_OPERATION_ATTRIBUTES: A workflow task completed with an invalid RequestCancelNexusOperation command.\n - WORKFLOW_TASK_FAILED_CAUSE_FEATURE_DISABLED: A workflow task completed requesting a feature that's disabled on the server (either system wide or - typically -\nfor the workflow's namespace).\nCheck the workflow task failure message for more information.\n - WORKFLOW_TASK_FAILED_CAUSE_GRPC_MESSAGE_TOO_LARGE: A workflow task failed because a grpc message was too large." }, "v1WorkflowTaskFailedEventAttributes": { "type": "object", @@ -16012,7 +16505,7 @@ }, "binaryChecksum": { "type": "string", - "title": "DEPRECATED since 1.21 - This field should be cleaned up when versioning-2 API is removed. [cleanup-experimental-wv]\nIf a worker explicitly failed this task, its binary id" + "title": "Deprecated. This field should be cleaned up when versioning-2 API is removed. 
[cleanup-experimental-wv]\nIf a worker explicitly failed this task, its binary id" }, "workerVersion": { "$ref": "#/definitions/v1WorkerVersionStamp", diff --git a/sdk-core-protos/protos/api_upstream/openapi/openapiv3.yaml b/sdk-core-protos/protos/api_upstream/openapi/openapiv3.yaml index 28aab1cfb..1b454569e 100644 --- a/sdk-core-protos/protos/api_upstream/openapi/openapiv3.yaml +++ b/sdk-core-protos/protos/api_upstream/openapi/openapiv3.yaml @@ -1418,8 +1418,8 @@ paths: - name: taskQueueType in: query description: |- - Deprecated. Use `ENHANCED` mode with `task_queue_types`. Ignored in `ENHANCED` mode. - If unspecified (TASK_QUEUE_TYPE_UNSPECIFIED), then default value (TASK_QUEUE_TYPE_WORKFLOW) will be used. + If unspecified (TASK_QUEUE_TYPE_UNSPECIFIED), then default value (TASK_QUEUE_TYPE_WORKFLOW) will be used. + Only supported in default mode (use `task_queue_types` in ENHANCED mode instead). schema: enum: - TASK_QUEUE_TYPE_UNSPECIFIED @@ -1428,14 +1428,24 @@ paths: - TASK_QUEUE_TYPE_NEXUS type: string format: enum + - name: reportStats + in: query + description: Report stats for the requested task queue type(s). + schema: + type: boolean - name: includeTaskQueueStatus in: query - description: Deprecated. Ignored in `ENHANCED` mode. + description: |- + Deprecated, use `report_stats` instead. + If true, the task queue status will be included in the response. schema: type: boolean - name: apiMode in: query - description: All options except `task_queue_type` and `include_task_queue_status` are only available in the `ENHANCED` mode. + description: |- + Deprecated. ENHANCED mode is also being deprecated. + Select the API mode to use for this request: DEFAULT mode (if unset) or ENHANCED mode. + Consult the documentation for each field to understand which mode it is supported in. schema: enum: - DESCRIBE_TASK_QUEUE_MODE_UNSPECIFIED @@ -1463,7 +1473,9 @@ paths: type: boolean - name: taskQueueTypes in: query - description: Task queue types to report info about. 
If not specified, all types are considered. + description: |- + Deprecated (as part of the ENHANCED mode deprecation). + Task queue types to report info about. If not specified, all types are considered. schema: type: array items: @@ -1474,20 +1486,18 @@ paths: - TASK_QUEUE_TYPE_NEXUS type: string format: enum - - name: reportStats - in: query - description: Report stats for the requested task queue types and versions - schema: - type: boolean - name: reportPollers in: query - description: Report list of pollers for requested task queue types and versions + description: |- + Deprecated (as part of the ENHANCED mode deprecation). + Report list of pollers for requested task queue types and versions. schema: type: boolean - name: reportTaskReachability in: query description: |- - Report task reachability for the requested versions and all task types (task reachability is not reported + Deprecated (as part of the ENHANCED mode deprecation). + Report task reachability for the requested versions and all task types (task reachability is not reported per task type). schema: type: boolean @@ -1972,6 +1982,92 @@ paths: application/json: schema: $ref: '#/components/schemas/Status' + /api/v1/namespaces/{namespace}/workers: + get: + tags: + - WorkflowService + description: ListWorkers is a visibility API to list worker status information in a specific namespace. + operationId: ListWorkers + parameters: + - name: namespace + in: path + required: true + schema: + type: string + - name: pageSize + in: query + schema: + type: integer + format: int32 + - name: nextPageToken + in: query + schema: + type: string + format: bytes + - name: query + in: query + description: |- + `query` in ListWorkers is used to filter workers based on worker status info. 
+ The following worker status attributes are expected are supported as part of the query: + * WorkerInstanceKey + * WorkerIdentity + * HostName + * TaskQueue + * DeploymentName + * BuildId + * SdkName + * SdkVersion + * StartTime + * LastHeartbeatTime + * Status + Currently metrics are not supported as a part of ListWorkers query. + schema: + type: string + responses: + "200": + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/ListWorkersResponse' + default: + description: Default error response + content: + application/json: + schema: + $ref: '#/components/schemas/Status' + /api/v1/namespaces/{namespace}/workers/heartbeat: + post: + tags: + - WorkflowService + description: WorkerHeartbeat receive heartbeat request from the worker. + operationId: RecordWorkerHeartbeat + parameters: + - name: namespace + in: path + description: Namespace this worker belongs to. + required: true + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/RecordWorkerHeartbeatRequest' + required: true + responses: + "200": + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/RecordWorkerHeartbeatResponse' + default: + description: Default error response + content: + application/json: + schema: + $ref: '#/components/schemas/Status' /api/v1/namespaces/{namespace}/workflow-count: get: tags: @@ -4588,8 +4684,8 @@ paths: - name: taskQueueType in: query description: |- - Deprecated. Use `ENHANCED` mode with `task_queue_types`. Ignored in `ENHANCED` mode. - If unspecified (TASK_QUEUE_TYPE_UNSPECIFIED), then default value (TASK_QUEUE_TYPE_WORKFLOW) will be used. + If unspecified (TASK_QUEUE_TYPE_UNSPECIFIED), then default value (TASK_QUEUE_TYPE_WORKFLOW) will be used. + Only supported in default mode (use `task_queue_types` in ENHANCED mode instead). 
schema: enum: - TASK_QUEUE_TYPE_UNSPECIFIED @@ -4598,14 +4694,24 @@ paths: - TASK_QUEUE_TYPE_NEXUS type: string format: enum + - name: reportStats + in: query + description: Report stats for the requested task queue type(s). + schema: + type: boolean - name: includeTaskQueueStatus in: query - description: Deprecated. Ignored in `ENHANCED` mode. + description: |- + Deprecated, use `report_stats` instead. + If true, the task queue status will be included in the response. schema: type: boolean - name: apiMode in: query - description: All options except `task_queue_type` and `include_task_queue_status` are only available in the `ENHANCED` mode. + description: |- + Deprecated. ENHANCED mode is also being deprecated. + Select the API mode to use for this request: DEFAULT mode (if unset) or ENHANCED mode. + Consult the documentation for each field to understand which mode it is supported in. schema: enum: - DESCRIBE_TASK_QUEUE_MODE_UNSPECIFIED @@ -4633,7 +4739,9 @@ paths: type: boolean - name: taskQueueTypes in: query - description: Task queue types to report info about. If not specified, all types are considered. + description: |- + Deprecated (as part of the ENHANCED mode deprecation). + Task queue types to report info about. If not specified, all types are considered. schema: type: array items: @@ -4644,20 +4752,18 @@ paths: - TASK_QUEUE_TYPE_NEXUS type: string format: enum - - name: reportStats - in: query - description: Report stats for the requested task queue types and versions - schema: - type: boolean - name: reportPollers in: query - description: Report list of pollers for requested task queue types and versions + description: |- + Deprecated (as part of the ENHANCED mode deprecation). + Report list of pollers for requested task queue types and versions. 
schema: type: boolean - name: reportTaskReachability in: query description: |- - Report task reachability for the requested versions and all task types (task reachability is not reported + Deprecated (as part of the ENHANCED mode deprecation). + Report task reachability for the requested versions and all task types (task reachability is not reported per task type). schema: type: boolean @@ -5109,6 +5215,92 @@ paths: application/json: schema: $ref: '#/components/schemas/Status' + /namespaces/{namespace}/workers: + get: + tags: + - WorkflowService + description: ListWorkers is a visibility API to list worker status information in a specific namespace. + operationId: ListWorkers + parameters: + - name: namespace + in: path + required: true + schema: + type: string + - name: pageSize + in: query + schema: + type: integer + format: int32 + - name: nextPageToken + in: query + schema: + type: string + format: bytes + - name: query + in: query + description: |- + `query` in ListWorkers is used to filter workers based on worker status info. + The following worker status attributes are expected are supported as part of the query: + * WorkerInstanceKey + * WorkerIdentity + * HostName + * TaskQueue + * DeploymentName + * BuildId + * SdkName + * SdkVersion + * StartTime + * LastHeartbeatTime + * Status + Currently metrics are not supported as a part of ListWorkers query. + schema: + type: string + responses: + "200": + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/ListWorkersResponse' + default: + description: Default error response + content: + application/json: + schema: + $ref: '#/components/schemas/Status' + /namespaces/{namespace}/workers/heartbeat: + post: + tags: + - WorkflowService + description: WorkerHeartbeat receive heartbeat request from the worker. + operationId: RecordWorkerHeartbeat + parameters: + - name: namespace + in: path + description: Namespace this worker belongs to. 
+ required: true + schema: + type: string + requestBody: + content: + application/json: + schema: + $ref: '#/components/schemas/RecordWorkerHeartbeatRequest' + required: true + responses: + "200": + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/RecordWorkerHeartbeatResponse' + default: + description: Default error response + content: + application/json: + schema: + $ref: '#/components/schemas/Status' /namespaces/{namespace}/workflow-count: get: tags: @@ -6399,7 +6591,7 @@ components: - RESET_TYPE_FIRST_WORKFLOW_TASK - RESET_TYPE_LAST_WORKFLOW_TASK type: string - description: Reset type (deprecated, use `options`). + description: Deprecated. Use `options`. format: enum resetReapplyType: enum: @@ -6408,7 +6600,7 @@ components: - RESET_REAPPLY_TYPE_NONE - RESET_REAPPLY_TYPE_ALL_ELIGIBLE type: string - description: History event reapply options (deprecated, use `options`). + description: Deprecated. Use `options`. format: enum postResetOperations: type: array @@ -7296,20 +7488,10 @@ components: type: array items: $ref: '#/components/schemas/PollerInfo' - description: |- - Deprecated. Use `versions_info.types_info.pollers` with `ENHANCED` mode instead. - Not set in `ENHANCED` mode. - taskQueueStatus: + stats: allOf: - - $ref: '#/components/schemas/TaskQueueStatus' - description: Deprecated. Not set in `ENHANCED` mode. - versionsInfo: - type: object - additionalProperties: - $ref: '#/components/schemas/TaskQueueVersionInfo' - description: |- - This map contains Task Queue information for each Build ID. Empty string as key value means unversioned. - Only set in `ENHANCED` mode. + - $ref: '#/components/schemas/TaskQueueStats' + description: Statistics for the task queue. Only populated when `report_stats` is set to true in the request. versioningInfo: allOf: - $ref: '#/components/schemas/TaskQueueVersioningInfo' @@ -7323,6 +7505,20 @@ components: they are always routed to their Pinned Deployment Version. 
However, new workflow executions are typically not Pinned until they complete their first task (unless they are started with a Pinned VersioningOverride or are Child Workflows of a Pinned parent). + taskQueueStatus: + allOf: + - $ref: '#/components/schemas/TaskQueueStatus' + description: |- + Deprecated. + Status of the task queue. Only populated when `include_task_queue_status` is set to true in the request. + versionsInfo: + type: object + additionalProperties: + $ref: '#/components/schemas/TaskQueueVersionInfo' + description: |- + Deprecated. + Only returned in ENHANCED mode. + This map contains Task Queue information for each Build ID. Empty string as key value means unversioned. DescribeWorkerDeploymentResponse: type: object properties: @@ -7548,7 +7744,7 @@ components: $ref: '#/components/schemas/WorkflowExecution' control: type: string - description: Deprecated + description: Deprecated. FailoverStatus: type: object properties: @@ -8251,7 +8447,30 @@ components: format: date-time routingConfig: $ref: '#/components/schemas/RoutingConfig' + latestVersionSummary: + allOf: + - $ref: '#/components/schemas/WorkerDeploymentInfo_WorkerDeploymentVersionSummary' + description: Summary of the version that was added most recently in the Worker Deployment. + currentVersionSummary: + allOf: + - $ref: '#/components/schemas/WorkerDeploymentInfo_WorkerDeploymentVersionSummary' + description: Summary of the current version of the Worker Deployment. + rampingVersionSummary: + allOf: + - $ref: '#/components/schemas/WorkerDeploymentInfo_WorkerDeploymentVersionSummary' + description: Summary of the ramping version of the Worker Deployment. 
description: A subset of WorkerDeploymentInfo + ListWorkersResponse: + type: object + properties: + workersInfo: + type: array + items: + $ref: '#/components/schemas/WorkerInfo' + nextPageToken: + type: string + description: Next page token + format: bytes ListWorkflowExecutionsResponse: type: object properties: @@ -8674,7 +8893,7 @@ components: description: |- Operation ID - may be empty if the operation completed synchronously. - Deprecated: Renamed to operation_token. + Deprecated. Renamed to operation_token. operationToken: type: string description: Operation token - may be empty if the operation completed synchronously. @@ -8917,7 +9136,7 @@ components: lastIndependentlyAssignedBuildId: type: string description: |- - This means the activity is independently versioned and not bound to the build ID of its workflow. + Deprecated. This means the activity is independently versioned and not bound to the build ID of its workflow. The activity will use the build id in this field instead. If the task fails and is scheduled again, the assigned build ID may change according to the latest versioning rules. @@ -8925,8 +9144,8 @@ components: allOf: - $ref: '#/components/schemas/WorkerVersionStamp' description: |- - The version stamp of the worker to whom this activity was most recently dispatched - Deprecated. This field should be cleaned up when versioning-2 API is removed. [cleanup-experimental-wv] + Deprecated. The version stamp of the worker to whom this activity was most recently dispatched + This field should be cleaned up when versioning-2 API is removed. [cleanup-experimental-wv] currentRetryInterval: pattern: ^-?(?:0|[1-9][0-9]{0,11})(?:\.[0-9]{1,9})?s$ type: string @@ -9030,7 +9249,7 @@ components: description: |- Operation ID. Only set for asynchronous operations after a successful StartOperation call. - Deprecated: Renamed to operation_token. + Deprecated. Renamed to operation_token. 
scheduleToCloseTimeout: pattern: ^-?(?:0|[1-9][0-9]{0,11})(?:\.[0-9]{1,9})?s$ type: string @@ -9462,6 +9681,20 @@ components: description: |- Will be set to true if the activity was reset. Applies only to the current run. + RecordWorkerHeartbeatRequest: + type: object + properties: + namespace: + type: string + description: Namespace this worker belongs to. + identity: + type: string + description: The identity of the client who initiated this request. + workerHeartbeat: + $ref: '#/components/schemas/WorkerHeartbeat' + RecordWorkerHeartbeatResponse: + type: object + properties: {} RegisterNamespaceRequest: type: object properties: @@ -9560,7 +9793,7 @@ components: corresponds to control: type: string - description: Deprecated + description: Deprecated. RequestCancelExternalWorkflowExecutionInitiatedEventAttributes: type: object properties: @@ -9578,7 +9811,7 @@ components: $ref: '#/components/schemas/WorkflowExecution' control: type: string - description: Deprecated + description: Deprecated. childWorkflowOnly: type: boolean description: |- @@ -9728,6 +9961,12 @@ components: description: |- If set, and activity is in backoff, the activity will start at a random time within the specified jitter duration. (unless it is paused and keep_paused is set) + restoreOriginalOptions: + type: boolean + description: |- + If set, the activity options will be restored to the defaults. + Default options are then options activity was created with. + They are part of the first SCHEDULE event. ResetActivityResponse: type: object properties: {} @@ -9756,7 +9995,7 @@ components: - RESET_REAPPLY_TYPE_ALL_ELIGIBLE type: string description: |- - Event types to be reapplied (deprecated) + Deprecated. Use `options`. Default: RESET_REAPPLY_TYPE_SIGNAL format: enum currentRunOnly: @@ -9787,7 +10026,7 @@ components: description: Worker build id. binaryChecksum: type: string - description: A worker binary version identifier (deprecated). + description: Deprecated. 
A worker binary version identifier. runId: type: string description: The first run ID in the execution chain that was touched by this worker build. @@ -9848,7 +10087,7 @@ components: - RESET_REAPPLY_TYPE_ALL_ELIGIBLE type: string description: |- - Event types to be reapplied (deprecated) + Deprecated. Use `options`. Default: RESET_REAPPLY_TYPE_SIGNAL format: enum resetReapplyExcludeTypes: @@ -10278,6 +10517,7 @@ components: format: date-time invalidScheduleError: type: string + description: Deprecated. ScheduleListEntry: type: object properties: @@ -10432,7 +10672,9 @@ components: type: array items: $ref: '#/components/schemas/CalendarSpec' - description: Any timestamps matching any of exclude_* will be skipped. + description: |- + Any timestamps matching any of exclude_* will be skipped. + Deprecated. Use exclude_structured_calendar. excludeStructuredCalendar: type: array items: @@ -10727,7 +10969,7 @@ components: type: string control: type: string - description: Deprecated + description: Deprecated. SignalExternalWorkflowExecutionInitiatedEventAttributes: type: object properties: @@ -10752,7 +10994,7 @@ components: description: Serialized arguments to provide to the signal handler control: type: string - description: Deprecated + description: Deprecated. childWorkflowOnly: type: boolean description: |- @@ -10832,7 +11074,7 @@ components: description: Serialized value(s) to provide with the signal control: type: string - description: Deprecated + description: Deprecated. retryPolicy: allOf: - $ref: '#/components/schemas/RetryPolicy' @@ -10908,7 +11150,7 @@ components: description: Used to de-dupe sent signals control: type: string - description: Deprecated + description: Deprecated. header: allOf: - $ref: '#/components/schemas/Header' @@ -11001,7 +11243,7 @@ components: format: enum control: type: string - description: Deprecated + description: Deprecated. 
initiatedEventId: type: string description: Id of the `START_CHILD_WORKFLOW_EXECUTION_INITIATED` event which this event corresponds to @@ -11049,7 +11291,7 @@ components: format: enum control: type: string - description: Deprecated + description: Deprecated. workflowTaskCompletedEventId: type: string description: The `WORKFLOW_TASK_COMPLETED` event which this command was reported with @@ -11079,6 +11321,7 @@ components: description: |- If this is set, the child workflow inherits the Build ID of the parent. Otherwise, the assignment rules of the child's Task Queue will be used to independently assign a Build ID to it. + Deprecated. Only considered for versioning v0.2. priority: allOf: - $ref: '#/components/schemas/Priority' @@ -11747,6 +11990,14 @@ components: type: type: string description: Update all running activities of this type. + restoreOriginal: + type: boolean + description: |- + If set, the activity options will be restored to the default. + Default options are then options activity was created with. + They are part of the first SCHEDULE event. + This flag cannot be combined with any other option; if you supply + restore_original together with other options, the request will be rejected. UpdateActivityOptionsResponse: type: object properties: @@ -11914,6 +12165,9 @@ components: items: type: string description: List of keys to remove from the metadata. + identity: + type: string + description: Optional. The identity of the client who initiated this request. description: Used to update the user-defined metadata of a Worker Deployment Version. UpdateWorkerDeploymentVersionMetadataResponse: type: object @@ -12145,6 +12399,8 @@ components: `WorkflowExecutionInfo.VersioningInfo` for more information. To remove the override, call `UpdateWorkflowExecutionOptions` with a null `VersioningOverride`, and use the `update_mask` to indicate that it should be mutated. 
+ Pinned overrides are automatically inherited by child workflows, continue-as-new workflows, + workflow retries, and cron workflows. VersioningOverride_PinnedOverride: type: object properties: @@ -12208,6 +12464,17 @@ components: version: type: string description: Deprecated. Use `deployment_version`. + status: + enum: + - WORKER_DEPLOYMENT_VERSION_STATUS_UNSPECIFIED + - WORKER_DEPLOYMENT_VERSION_STATUS_INACTIVE + - WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT + - WORKER_DEPLOYMENT_VERSION_STATUS_RAMPING + - WORKER_DEPLOYMENT_VERSION_STATUS_DRAINING + - WORKER_DEPLOYMENT_VERSION_STATUS_DRAINED + type: string + description: The status of the Worker Deployment Version. + format: enum deploymentVersion: allOf: - $ref: '#/components/schemas/WorkerDeploymentVersion' @@ -12221,7 +12488,40 @@ components: - VERSION_DRAINAGE_STATUS_DRAINING - VERSION_DRAINAGE_STATUS_DRAINED type: string + description: Deprecated. Use `drainage_info` instead. format: enum + drainageInfo: + allOf: + - $ref: '#/components/schemas/VersionDrainageInfo' + description: |- + Information about workflow drainage to help the user determine when it is safe + to decommission a Version. Not present while version is current or ramping + currentSinceTime: + type: string + description: |- + Unset if not current. + (-- api-linter: core::0140::prepositions=disabled + aip.dev/not-precedent: 'Since' captures the field semantics despite being a preposition. --) + format: date-time + rampingSinceTime: + type: string + description: |- + Unset if not ramping. Updated when the version first starts ramping, not on each ramp change. + (-- api-linter: core::0140::prepositions=disabled + aip.dev/not-precedent: 'Since' captures the field semantics despite being a preposition. --) + format: date-time + routingUpdateTime: + type: string + description: Last time `current_since_time`, `ramping_since_time, or `ramp_percentage` of this version changed. 
+ format: date-time + firstActivationTime: + type: string + description: Timestamp when this version first became current or ramping. + format: date-time + lastDeactivationTime: + type: string + description: Timestamp when this version last stopped being current or ramping. + format: date-time WorkerDeploymentOptions: type: object properties: @@ -12272,6 +12572,17 @@ components: version: type: string description: Deprecated. Use `deployment_version`. + status: + enum: + - WORKER_DEPLOYMENT_VERSION_STATUS_UNSPECIFIED + - WORKER_DEPLOYMENT_VERSION_STATUS_INACTIVE + - WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT + - WORKER_DEPLOYMENT_VERSION_STATUS_RAMPING + - WORKER_DEPLOYMENT_VERSION_STATUS_DRAINING + - WORKER_DEPLOYMENT_VERSION_STATUS_DRAINED + type: string + description: The status of the Worker Deployment Version. + format: enum deploymentVersion: allOf: - $ref: '#/components/schemas/WorkerDeploymentVersion' @@ -12290,14 +12601,22 @@ components: description: |- (-- api-linter: core::0140::prepositions=disabled aip.dev/not-precedent: 'Since' captures the field semantics despite being a preposition. --) - Nil if not current. + Unset if not current. format: date-time rampingSinceTime: type: string description: |- (-- api-linter: core::0140::prepositions=disabled aip.dev/not-precedent: 'Since' captures the field semantics despite being a preposition. --) - Nil if not ramping. Updated when the version first starts ramping, not on each ramp change. + Unset if not ramping. Updated when the version first starts ramping, not on each ramp change. + format: date-time + firstActivationTime: + type: string + description: Timestamp when this version first became current or ramping. + format: date-time + lastDeactivationTime: + type: string + description: Timestamp when this version last stopped being current or ramping. 
format: date-time rampPercentage: type: number @@ -12345,6 +12664,169 @@ components: - TASK_QUEUE_TYPE_NEXUS type: string format: enum + WorkerHeartbeat: + type: object + properties: + workerInstanceKey: + type: string + description: |- + Worker identifier, should be unique for the namespace. + It is distinct from worker identity, which is not necessarily namespace-unique. + workerIdentity: + type: string + description: |- + Worker identity, set by the client, may not be unique. + Usually host_name+(user group name)+process_id, but can be overwritten by the user. + hostInfo: + allOf: + - $ref: '#/components/schemas/WorkerHostInfo' + description: Worker host information. + taskQueue: + type: string + description: Task queue this worker is polling for tasks. + deploymentVersion: + $ref: '#/components/schemas/WorkerDeploymentVersion' + sdkName: + type: string + sdkVersion: + type: string + status: + enum: + - WORKER_STATUS_UNSPECIFIED + - WORKER_STATUS_RUNNING + - WORKER_STATUS_SHUTTING_DOWN + - WORKER_STATUS_SHUTDOWN + type: string + description: Worker status. Defined by SDK. + format: enum + startTime: + type: string + description: |- + Worker start time. + It can be used to determine worker uptime. (current time - start time) + format: date-time + heartbeatTime: + type: string + description: |- + Timestamp of this heartbeat, coming from the worker. Worker should set it to "now". + Note that this timestamp comes directly from the worker and is subject to workers' clock skew. + format: date-time + elapsedSinceLastHeartbeat: + pattern: ^-?(?:0|[1-9][0-9]{0,11})(?:\.[0-9]{1,9})?s$ + type: string + description: Elapsed time since the last heartbeat from the worker. 
+ workflowTaskSlotsInfo: + $ref: '#/components/schemas/WorkerSlotsInfo' + activityTaskSlotsInfo: + $ref: '#/components/schemas/WorkerSlotsInfo' + nexusTaskSlotsInfo: + $ref: '#/components/schemas/WorkerSlotsInfo' + localActivitySlotsInfo: + $ref: '#/components/schemas/WorkerSlotsInfo' + workflowPollerInfo: + $ref: '#/components/schemas/WorkerPollerInfo' + workflowStickyPollerInfo: + $ref: '#/components/schemas/WorkerPollerInfo' + activityPollerInfo: + $ref: '#/components/schemas/WorkerPollerInfo' + nexusPollerInfo: + $ref: '#/components/schemas/WorkerPollerInfo' + totalStickyCacheHit: + type: integer + description: A Workflow Task found a cached Workflow Execution to run against. + format: int32 + totalStickyCacheMiss: + type: integer + description: A Workflow Task did not find a cached Workflow execution to run against. + format: int32 + currentStickyCacheSize: + type: integer + description: Current cache size, expressed in number of Workflow Executions. + format: int32 + description: |- + Worker info message, contains information about the worker and its current state. + All information is provided by the worker itself. + (-- api-linter: core::0140::prepositions=disabled + aip.dev/not-precedent: Removing those words make names less clear. --) + WorkerHostInfo: + type: object + properties: + hostName: + type: string + description: Worker host identifier. + processId: + type: string + description: Worker process identifier, should be unique for the host. + currentHostCpuUsage: + type: number + description: |- + System used CPU as a float in the range [0.0, 1.0] where 1.0 is defined as all + cores on the host pegged. + format: float + currentHostMemUsage: + type: number + description: |- + System used memory as a float in the range [0.0, 1.0] where 1.0 is defined as + all available memory on the host is used. 
+ format: float + description: Holds everything needed to identify the worker host/process context + WorkerInfo: + type: object + properties: + workerHeartbeat: + $ref: '#/components/schemas/WorkerHeartbeat' + WorkerPollerInfo: + type: object + properties: + currentPollers: + type: integer + description: Number of polling RPCs that are currently in flight. + format: int32 + lastSuccessfulPollTime: + type: string + format: date-time + isAutoscaling: + type: boolean + description: Set true if the number of concurrent pollers is auto-scaled + WorkerSlotsInfo: + type: object + properties: + currentAvailableSlots: + type: integer + description: |- + Number of slots available for the worker to specific tasks. + May be -1 if the upper bound is not known. + format: int32 + currentUsedSlots: + type: integer + description: Number of slots used by the worker for specific tasks. + format: int32 + slotSupplierKind: + type: string + description: |- + Kind of the slot supplier, which is used to determine how the slots are allocated. + Possible values: "Fixed | ResourceBased | Custom String" + totalProcessedTasks: + type: integer + description: |- + Total number of tasks processed (completed both successfully and unsuccesfully, or any other way) + by the worker since the worker started. This is a cumulative counter. + format: int32 + totalFailedTasks: + type: integer + description: Total number of failed tasks processed by the worker so far. + format: int32 + lastIntervalProcessedTasks: + type: integer + description: |- + Number of tasks processed in since the last heartbeat from the worker. + This is a cumulative counter, and it is reset to 0 each time the worker sends a heartbeat. + Contains both successful and failed tasks. + format: int32 + lastIntervalFailureTasks: + type: integer + description: Number of failed tasks processed since the last heartbeat from the worker. 
+ format: int32 WorkerVersionCapabilities: type: object properties: @@ -12640,6 +13122,7 @@ components: description: |- If this is set, the new execution inherits the Build ID of the current execution. Otherwise, the assignment rules will be used to independently assign a Build ID to the new execution. + Deprecated. Only considered for versioning v0.2. WorkflowExecutionExtendedInfo: type: object properties: @@ -12870,10 +13353,12 @@ components: header: allOf: - $ref: '#/components/schemas/Header' - description: "Headers that were passed by the sender of the signal and copied by temporal \n server into the workflow task." + description: |- + Headers that were passed by the sender of the signal and copied by temporal + server into the workflow task. skipGenerateWorkflowTask: type: boolean - description: This field is deprecated and never respected. It should always be set to false. + description: Deprecated. This field is never respected and should always be set to false. externalWorkflowExecution: allOf: - $ref: '#/components/schemas/WorkflowExecution' @@ -12920,7 +13405,7 @@ components: continuedExecutionRunId: type: string description: |- - Run id of the previous workflow which continued-as-new or retired or cron executed into this + Run id of the previous workflow which continued-as-new or retried or cron executed into this workflow. initiator: enum: @@ -13035,7 +13520,10 @@ components: versioningOverride: allOf: - $ref: '#/components/schemas/VersioningOverride' - description: Versioning override applied to this workflow when it was started. + description: |- + Versioning override applied to this workflow when it was started. + Children, crons, retries, and continue-as-new will inherit source run's override if pinned + and if the new workflow's Task Queue belongs to the override version. parentPinnedWorkerDeploymentVersion: type: string description: |- @@ -13044,20 +13532,29 @@ components: of starting on the Current Version of its Task Queue. 
This is set only if the child workflow is starting on a Task Queue belonging to the same Worker Deployment Version. - Deprecated. Use `parent_pinned_deployment_version`. - parentPinnedDeploymentVersion: - allOf: - - $ref: '#/components/schemas/WorkerDeploymentVersion' - description: |- - When present, it means this is a child workflow of a parent that is Pinned to this Worker - Deployment Version. In this case, child workflow will start as Pinned to this Version instead - of starting on the Current Version of its Task Queue. - This is set only if the child workflow is starting on a Task Queue belonging to the same - Worker Deployment Version. + Deprecated. Use `parent_versioning_info`. priority: allOf: - $ref: '#/components/schemas/Priority' description: Priority metadata + inheritedPinnedVersion: + allOf: + - $ref: '#/components/schemas/WorkerDeploymentVersion' + description: |- + If present, the new workflow should start on this version with pinned base behavior. + Child of pinned parent will inherit the parent's version if the Child's Task Queue belongs to that version. + + New run initiated by workflow ContinueAsNew of pinned run, will inherit the previous run's version if the + new run's Task Queue belongs to that version. + + New run initiated by workflow Cron will never inherit. + + New run initiated by workflow Retry will only inherit if the retried run is effectively pinned at the time + of retry, and the retried run inherited a pinned version when it started (ie. it is a child of a pinned + parent, or a CaN of a pinned run, and is running on a Task Queue in the inherited version). + + Pinned override is inherited if Task Queue of new run is compatible with the override version. + Override is inherited separately and takes precedence over inherited base version. description: Always the first event in workflow history WorkflowExecutionTerminatedEventAttributes: type: object @@ -13178,7 +13675,7 @@ components: behaviors. 
This field is first set after an execution completes its first workflow task on a versioned worker, and set again on completion of every subsequent workflow task. - For child workflows of Pinned parents, this will be set to Pinned (along with `version`) when + For child workflows of Pinned parents, this will be set to Pinned (along with `deployment_version`) when the the child starts so that child's first workflow task goes to the same Version as the parent. After the first workflow task, it depends on the child workflow itself if it wants to stay pinned or become unpinned (according to Versioning Behavior set in the worker). @@ -13219,7 +13716,8 @@ components: precedence over SDK-sent `behavior` (and `version` when override is PINNED). An override can be set when starting a new execution, as well as afterwards by calling the `UpdateWorkflowExecutionOptions` API. - Pinned overrides are automatically inherited by child workflows. + Pinned overrides are automatically inherited by child workflows, continue-as-new workflows, + workflow retries, and cron workflows. deploymentTransition: allOf: - $ref: '#/components/schemas/DeploymentTransition' @@ -13260,7 +13758,7 @@ components: start a transition to that version and continue execution there. A version transition can only exist while there is a pending or started workflow task. Once the pending workflow task completes on the transition's target version, the - transition completes and the workflow's `behavior`, and `version` fields are updated per the + transition completes and the workflow's `behavior`, and `deployment_version` fields are updated per the worker's task completion response. Pending activities will not start new attempts during a transition. Once the transition is completed, pending activities will start their next attempt on the new version. 
@@ -13580,6 +14078,7 @@ components: - WORKFLOW_TASK_FAILED_CAUSE_PENDING_NEXUS_OPERATIONS_LIMIT_EXCEEDED - WORKFLOW_TASK_FAILED_CAUSE_BAD_REQUEST_CANCEL_NEXUS_OPERATION_ATTRIBUTES - WORKFLOW_TASK_FAILED_CAUSE_FEATURE_DISABLED + - WORKFLOW_TASK_FAILED_CAUSE_GRPC_MESSAGE_TOO_LARGE type: string format: enum failure: @@ -13601,7 +14100,7 @@ components: binaryChecksum: type: string description: |- - DEPRECATED since 1.21 - This field should be cleaned up when versioning-2 API is removed. [cleanup-experimental-wv] + Deprecated. This field should be cleaned up when versioning-2 API is removed. [cleanup-experimental-wv] If a worker explicitly failed this task, its binary id workerVersion: allOf: diff --git a/sdk-core-protos/protos/api_upstream/temporal/api/batch/v1/message.proto b/sdk-core-protos/protos/api_upstream/temporal/api/batch/v1/message.proto index b807fd570..7c79c149d 100644 --- a/sdk-core-protos/protos/api_upstream/temporal/api/batch/v1/message.proto +++ b/sdk-core-protos/protos/api_upstream/temporal/api/batch/v1/message.proto @@ -77,10 +77,10 @@ message BatchOperationReset { // Describes what to reset to and how. If set, `reset_type` and `reset_reapply_type` are ignored. temporal.api.common.v1.ResetOptions options = 4; - // Reset type (deprecated, use `options`). - temporal.api.enums.v1.ResetType reset_type = 1; - // History event reapply options (deprecated, use `options`). - temporal.api.enums.v1.ResetReapplyType reset_reapply_type = 2; + // Deprecated. Use `options`. + temporal.api.enums.v1.ResetType reset_type = 1 [deprecated = true]; + // Deprecated. Use `options`. + temporal.api.enums.v1.ResetReapplyType reset_reapply_type = 2 [deprecated = true]; // Operations to perform after the workflow has been reset. These operations will be applied // to the *new* run of the workflow execution in the order they are provided. 
// All operations are applied to the workflow before the first new workflow task is generated diff --git a/sdk-core-protos/protos/api_upstream/temporal/api/command/v1/message.proto b/sdk-core-protos/protos/api_upstream/temporal/api/command/v1/message.proto index cd6549fb6..d5ccaefc3 100644 --- a/sdk-core-protos/protos/api_upstream/temporal/api/command/v1/message.proto +++ b/sdk-core-protos/protos/api_upstream/temporal/api/command/v1/message.proto @@ -106,7 +106,7 @@ message RequestCancelExternalWorkflowExecutionCommandAttributes { string workflow_id = 2; string run_id = 3; // Deprecated. - string control = 4; + string control = 4 [deprecated = true]; // Set this to true if the workflow being cancelled is a child of the workflow originating this // command. The request will be rejected if it is set to true and the target workflow is *not* // a child of the requesting workflow. @@ -123,7 +123,7 @@ message SignalExternalWorkflowExecutionCommandAttributes { // Serialized value(s) to provide with the signal. temporal.api.common.v1.Payloads input = 4; // Deprecated - string control = 5; + string control = 5 [deprecated = true]; // Set this to true if the workflow being cancelled is a child of the workflow originating this // command. The request will be rejected if it is set to true and the target workflow is *not* // a child of the requesting workflow. @@ -176,7 +176,8 @@ message ContinueAsNewWorkflowExecutionCommandAttributes { temporal.api.common.v1.SearchAttributes search_attributes = 14; // If this is set, the new execution inherits the Build ID of the current execution. Otherwise, // the assignment rules will be used to independently assign a Build ID to the new execution. - bool inherit_build_id = 15; + // Deprecated. Only considered for versioning v0.2. + bool inherit_build_id = 15 [deprecated = true]; // `workflow_execution_timeout` is omitted as it shouldn't be overridden from within a workflow. 
} @@ -206,7 +207,8 @@ message StartChildWorkflowExecutionCommandAttributes { temporal.api.common.v1.SearchAttributes search_attributes = 16; // If this is set, the child workflow inherits the Build ID of the parent. Otherwise, the assignment // rules of the child's Task Queue will be used to independently assign a Build ID to it. - bool inherit_build_id = 17; + // Deprecated. Only considered for versioning v0.2. + bool inherit_build_id = 17 [deprecated = true]; // Priority metadata. If this message is not present, or any fields are not // present, they inherit the values from the workflow. temporal.api.common.v1.Priority priority = 18; diff --git a/sdk-core-protos/protos/api_upstream/temporal/api/common/v1/message.proto b/sdk-core-protos/protos/api_upstream/temporal/api/common/v1/message.proto index 884192f2d..58d109142 100644 --- a/sdk-core-protos/protos/api_upstream/temporal/api/common/v1/message.proto +++ b/sdk-core-protos/protos/api_upstream/temporal/api/common/v1/message.proto @@ -154,9 +154,9 @@ message ResetOptions { string build_id = 4; } - // Event types to be reapplied (deprecated) + // Deprecated. Use `options`. // Default: RESET_REAPPLY_TYPE_SIGNAL - temporal.api.enums.v1.ResetReapplyType reset_reapply_type = 10; + temporal.api.enums.v1.ResetReapplyType reset_reapply_type = 10 [deprecated = true]; // If true, limit the reset to only within the current run. (Applies to build_id targets and // possibly others in the future.) diff --git a/sdk-core-protos/protos/api_upstream/temporal/api/deployment/v1/message.proto b/sdk-core-protos/protos/api_upstream/temporal/api/deployment/v1/message.proto index ead82fcdf..b35d7ec9e 100644 --- a/sdk-core-protos/protos/api_upstream/temporal/api/deployment/v1/message.proto +++ b/sdk-core-protos/protos/api_upstream/temporal/api/deployment/v1/message.proto @@ -97,6 +97,9 @@ message WorkerDeploymentVersionInfo { // Deprecated. Use `deployment_version`. 
string version = 1 [deprecated = true]; + // The status of the Worker Deployment Version. + temporal.api.enums.v1.WorkerDeploymentVersionStatus status = 14; + // Required. WorkerDeploymentVersion deployment_version = 11; string deployment_name = 2; @@ -107,14 +110,19 @@ message WorkerDeploymentVersionInfo { // (-- api-linter: core::0140::prepositions=disabled // aip.dev/not-precedent: 'Since' captures the field semantics despite being a preposition. --) - // Nil if not current. + // Unset if not current. google.protobuf.Timestamp current_since_time = 5; // (-- api-linter: core::0140::prepositions=disabled // aip.dev/not-precedent: 'Since' captures the field semantics despite being a preposition. --) - // Nil if not ramping. Updated when the version first starts ramping, not on each ramp change. + // Unset if not ramping. Updated when the version first starts ramping, not on each ramp change. google.protobuf.Timestamp ramping_since_time = 6; + // Timestamp when this version first became current or ramping. + google.protobuf.Timestamp first_activation_time = 12; + // Timestamp when this version last stopped being current or ramping. + google.protobuf.Timestamp last_deactivation_time = 13; + // Range: [0, 100]. Must be zero if the version is not ramping (i.e. `ramping_since_time` is nil). // Can be in the range [0, 100] if the version is ramping. float ramp_percentage = 7; @@ -190,10 +198,31 @@ message WorkerDeploymentInfo { // Deprecated. Use `deployment_version`. string version = 1 [deprecated = true]; + // The status of the Worker Deployment Version. + temporal.api.enums.v1.WorkerDeploymentVersionStatus status = 11; + // Required. WorkerDeploymentVersion deployment_version = 4; google.protobuf.Timestamp create_time = 2; + // Deprecated. Use `drainage_info` instead. enums.v1.VersionDrainageStatus drainage_status = 3; + // Information about workflow drainage to help the user determine when it is safe + // to decommission a Version. 
 Not present while version is current or ramping
+        VersionDrainageInfo drainage_info = 5;
+        // Unset if not current.
+        // (-- api-linter: core::0140::prepositions=disabled
+        //     aip.dev/not-precedent: 'Since' captures the field semantics despite being a preposition. --)
+        google.protobuf.Timestamp current_since_time = 6;
+        // Unset if not ramping. Updated when the version first starts ramping, not on each ramp change.
+        // (-- api-linter: core::0140::prepositions=disabled
+        //     aip.dev/not-precedent: 'Since' captures the field semantics despite being a preposition. --)
+        google.protobuf.Timestamp ramping_since_time = 7;
+        // Last time `current_since_time`, `ramping_since_time`, or `ramp_percentage` of this version changed.
+        google.protobuf.Timestamp routing_update_time = 8;
+        // Timestamp when this version first became current or ramping.
+        google.protobuf.Timestamp first_activation_time = 9;
+        // Timestamp when this version last stopped being current or ramping.
+        google.protobuf.Timestamp last_deactivation_time = 10;
     }
 }
diff --git a/sdk-core-protos/protos/api_upstream/temporal/api/enums/v1/common.proto b/sdk-core-protos/protos/api_upstream/temporal/api/enums/v1/common.proto
index c45174b77..192c1d75b 100644
--- a/sdk-core-protos/protos/api_upstream/temporal/api/enums/v1/common.proto
+++ b/sdk-core-protos/protos/api_upstream/temporal/api/enums/v1/common.proto
@@ -96,4 +96,13 @@ enum ApplicationErrorCategory {
     APPLICATION_ERROR_CATEGORY_UNSPECIFIED = 0;
     // Expected application error with little/no severity.
     APPLICATION_ERROR_CATEGORY_BENIGN = 1;
-}
\ No newline at end of file
+}
+
+// (-- api-linter: core::0216::synonyms=disabled
+//     aip.dev/not-precedent: It seems we have both state and status, and status is a better fit for workers.
 --)
+enum WorkerStatus {
+    WORKER_STATUS_UNSPECIFIED = 0;
+    WORKER_STATUS_RUNNING = 1;
+    WORKER_STATUS_SHUTTING_DOWN = 2;
+    WORKER_STATUS_SHUTDOWN = 3;
+}
diff --git a/sdk-core-protos/protos/api_upstream/temporal/api/enums/v1/deployment.proto b/sdk-core-protos/protos/api_upstream/temporal/api/enums/v1/deployment.proto
index 4cf3944d6..710d7c38d 100644
--- a/sdk-core-protos/protos/api_upstream/temporal/api/enums/v1/deployment.proto
+++ b/sdk-core-protos/protos/api_upstream/temporal/api/enums/v1/deployment.proto
@@ -72,3 +72,29 @@ enum WorkerVersioningMode {
     // VersioningBehavior enum.)
     WORKER_VERSIONING_MODE_VERSIONED = 2;
 }
+
+// (-- api-linter: core::0216::synonyms=disabled
+//     aip.dev/not-precedent: Call this status because it is a status, not a state. --)
+// Specify the status of a Worker Deployment Version.
+// Experimental. Worker Deployments are experimental and might significantly change in the future.
+enum WorkerDeploymentVersionStatus {
+    WORKER_DEPLOYMENT_VERSION_STATUS_UNSPECIFIED = 0;
+    // The Worker Deployment Version has been created inside the Worker Deployment but is not used by any
+    // workflow executions. These Versions can still have workflows if they have an explicit Versioning Override targeting
+    // this Version. Such Versioning Override could be set at workflow start time, or at a later time via `UpdateWorkflowExecutionOptions`.
+    WORKER_DEPLOYMENT_VERSION_STATUS_INACTIVE = 1;
+    // The Worker Deployment Version is the current version of the Worker Deployment. All new workflow executions
+    // and tasks of existing unversioned or AutoUpgrade workflows are routed to this version.
+    WORKER_DEPLOYMENT_VERSION_STATUS_CURRENT = 2;
+    // The Worker Deployment Version is the ramping version of the Worker Deployment. A subset of new Pinned workflow executions are
+    // routed to this version. Moreover, a portion of existing unversioned or AutoUpgrade workflow executions are also routed to this version.
+ WORKER_DEPLOYMENT_VERSION_STATUS_RAMPING = 3; + // The Worker Deployment Version is not used by new workflows but is still used by + // open pinned workflows. The version cannot be decommissioned safely. + WORKER_DEPLOYMENT_VERSION_STATUS_DRAINING = 4; + // The Worker Deployment Version is not used by new or open workflows, but might be still needed by + // Queries sent to closed workflows. The version can be decommissioned safely if user does + // not query closed workflows. If the user does query closed workflows for some time x after + // workflows are closed, they should decommission the version after it has been drained for that duration. + WORKER_DEPLOYMENT_VERSION_STATUS_DRAINED = 5; +} diff --git a/sdk-core-protos/protos/api_upstream/temporal/api/enums/v1/failed_cause.proto b/sdk-core-protos/protos/api_upstream/temporal/api/enums/v1/failed_cause.proto index 192757487..606d967e6 100644 --- a/sdk-core-protos/protos/api_upstream/temporal/api/enums/v1/failed_cause.proto +++ b/sdk-core-protos/protos/api_upstream/temporal/api/enums/v1/failed_cause.proto @@ -79,6 +79,8 @@ enum WorkflowTaskFailedCause { // for the workflow's namespace). // Check the workflow task failure message for more information. WORKFLOW_TASK_FAILED_CAUSE_FEATURE_DISABLED = 35; + // A workflow task failed because a grpc message was too large. + WORKFLOW_TASK_FAILED_CAUSE_GRPC_MESSAGE_TOO_LARGE = 36; } enum StartChildWorkflowExecutionFailedCause { diff --git a/sdk-core-protos/protos/api_upstream/temporal/api/enums/v1/reset.proto b/sdk-core-protos/protos/api_upstream/temporal/api/enums/v1/reset.proto index 79b885cfd..33ced5cf9 100644 --- a/sdk-core-protos/protos/api_upstream/temporal/api/enums/v1/reset.proto +++ b/sdk-core-protos/protos/api_upstream/temporal/api/enums/v1/reset.proto @@ -22,9 +22,9 @@ enum ResetReapplyExcludeType { RESET_REAPPLY_EXCLUDE_TYPE_CANCEL_REQUEST = 4 [deprecated=true]; } -// Event types to include when reapplying events. 
Deprecated: applications -// should use ResetReapplyExcludeType to specify exclusions from this set, and -// new event types should be added to ResetReapplyExcludeType instead of here. +// Deprecated: applications should use ResetReapplyExcludeType to specify +// exclusions from this set, and new event types should be added to ResetReapplyExcludeType +// instead of here. enum ResetReapplyType { RESET_REAPPLY_TYPE_UNSPECIFIED = 0; // Signals are reapplied when workflow is reset. @@ -35,7 +35,7 @@ enum ResetReapplyType { RESET_REAPPLY_TYPE_ALL_ELIGIBLE = 3; } -// Reset type options. Deprecated, see temporal.api.common.v1.ResetOptions. +// Deprecated, see temporal.api.common.v1.ResetOptions. enum ResetType { RESET_TYPE_UNSPECIFIED = 0; // Resets to event of the first workflow task completed, or if it does not exist, the event after task scheduled. diff --git a/sdk-core-protos/protos/api_upstream/temporal/api/failure/v1/message.proto b/sdk-core-protos/protos/api_upstream/temporal/api/failure/v1/message.proto index cbf28e7a6..3b27f65ff 100644 --- a/sdk-core-protos/protos/api_upstream/temporal/api/failure/v1/message.proto +++ b/sdk-core-protos/protos/api_upstream/temporal/api/failure/v1/message.proto @@ -77,8 +77,8 @@ message NexusOperationFailureInfo { string operation = 4; // Operation ID - may be empty if the operation completed synchronously. // - // Deprecated: Renamed to operation_token. - string operation_id = 5; + // Deprecated. Renamed to operation_token. + string operation_id = 5 [deprecated = true]; // Operation token - may be empty if the operation completed synchronously. 
string operation_token = 6; } diff --git a/sdk-core-protos/protos/api_upstream/temporal/api/history/v1/message.proto b/sdk-core-protos/protos/api_upstream/temporal/api/history/v1/message.proto index a5f1ebc01..2c77ba904 100644 --- a/sdk-core-protos/protos/api_upstream/temporal/api/history/v1/message.proto +++ b/sdk-core-protos/protos/api_upstream/temporal/api/history/v1/message.proto @@ -46,7 +46,7 @@ message WorkflowExecutionStartedEventAttributes { google.protobuf.Duration workflow_run_timeout = 8; // Timeout of a single workflow task. google.protobuf.Duration workflow_task_timeout = 9; - // Run id of the previous workflow which continued-as-new or retired or cron executed into this + // Run id of the previous workflow which continued-as-new or retried or cron executed into this // workflow. string continued_execution_run_id = 10; temporal.api.enums.v1.ContinueAsNewInitiator initiator = 11; @@ -84,7 +84,7 @@ message WorkflowExecutionStartedEventAttributes { // If this workflow intends to use anything other than the current overall default version for // the queue, then we include it here. // Deprecated. [cleanup-experimental-wv] - temporal.api.common.v1.WorkerVersionStamp source_version_stamp = 29; + temporal.api.common.v1.WorkerVersionStamp source_version_stamp = 29 [deprecated = true]; // Completion callbacks attached when this workflow was started. repeated temporal.api.common.v1.Callback completion_callbacks = 30; @@ -117,26 +117,40 @@ message WorkflowExecutionStartedEventAttributes { temporal.api.common.v1.WorkflowExecution root_workflow_execution = 31; // When present, this execution is assigned to the build ID of its parent or previous execution. // Deprecated. This field should be cleaned up when versioning-2 API is removed. [cleanup-experimental-wv] - string inherited_build_id = 32; + string inherited_build_id = 32 [deprecated = true]; // Versioning override applied to this workflow when it was started. 
+ // Children, crons, retries, and continue-as-new will inherit source run's override if pinned + // and if the new workflow's Task Queue belongs to the override version. temporal.api.workflow.v1.VersioningOverride versioning_override = 33; // When present, it means this is a child workflow of a parent that is Pinned to this Worker // Deployment Version. In this case, child workflow will start as Pinned to this Version instead // of starting on the Current Version of its Task Queue. // This is set only if the child workflow is starting on a Task Queue belonging to the same // Worker Deployment Version. - // Deprecated. Use `parent_pinned_deployment_version`. + // Deprecated. Use `parent_versioning_info`. string parent_pinned_worker_deployment_version = 34 [deprecated = true]; - // When present, it means this is a child workflow of a parent that is Pinned to this Worker - // Deployment Version. In this case, child workflow will start as Pinned to this Version instead - // of starting on the Current Version of its Task Queue. - // This is set only if the child workflow is starting on a Task Queue belonging to the same - // Worker Deployment Version. - temporal.api.deployment.v1.WorkerDeploymentVersion parent_pinned_deployment_version = 36; - // Priority metadata temporal.api.common.v1.Priority priority = 35; + + reserved 36; + reserved "parent_pinned_deployment_version"; + + // If present, the new workflow should start on this version with pinned base behavior. + // Child of pinned parent will inherit the parent's version if the Child's Task Queue belongs to that version. + // + // New run initiated by workflow ContinueAsNew of pinned run, will inherit the previous run's version if the + // new run's Task Queue belongs to that version. + // + // New run initiated by workflow Cron will never inherit. 
+ // + // New run initiated by workflow Retry will only inherit if the retried run is effectively pinned at the time + // of retry, and the retried run inherited a pinned version when it started (ie. it is a child of a pinned + // parent, or a CaN of a pinned run, and is running on a Task Queue in the inherited version). + // + // Pinned override is inherited if Task Queue of new run is compatible with the override version. + // Override is inherited separately and takes precedence over inherited base version. + temporal.api.deployment.v1.WorkerDeploymentVersion inherited_pinned_version = 37; } message WorkflowExecutionCompletedEventAttributes { @@ -183,7 +197,7 @@ message WorkflowExecutionContinuedAsNewEventAttributes { // Deprecated. If a workflow's retry policy would cause a new run to start when the current one // has failed, this field would be populated with that failure. Now (when supported by server // and sdk) the final event will be `WORKFLOW_EXECUTION_FAILED` with `new_execution_run_id` set. - temporal.api.failure.v1.Failure failure = 10; + temporal.api.failure.v1.Failure failure = 10 [deprecated = true]; // TODO: Is this the result of *this* workflow as it continued-as-new? temporal.api.common.v1.Payloads last_completion_result = 11; temporal.api.common.v1.Header header = 12; @@ -191,7 +205,8 @@ message WorkflowExecutionContinuedAsNewEventAttributes { temporal.api.common.v1.SearchAttributes search_attributes = 14; // If this is set, the new execution inherits the Build ID of the current execution. Otherwise, // the assignment rules will be used to independently assign a Build ID to the new execution. - bool inherit_build_id = 15; + // Deprecated. Only considered for versioning v0.2. + bool inherit_build_id = 15 [deprecated = true]; // workflow_execution_timeout is omitted as it shouldn't be overridden from within a workflow. } @@ -301,7 +316,7 @@ message WorkflowTaskFailedEventAttributes { string new_run_id = 7; // TODO: ? 
int64 fork_event_version = 8; - // DEPRECATED since 1.21 - This field should be cleaned up when versioning-2 API is removed. [cleanup-experimental-wv] + // Deprecated. This field should be cleaned up when versioning-2 API is removed. [cleanup-experimental-wv] // If a worker explicitly failed this task, its binary id string binary_checksum = 9 [deprecated = true]; // Version info of the worker who processed this workflow task. If present, the `build_id` field @@ -512,7 +527,7 @@ message WorkflowExecutionSignaledEventAttributes { // Headers that were passed by the sender of the signal and copied by temporal // server into the workflow task. temporal.api.common.v1.Header header = 4; - // This field is deprecated and never respected. It should always be set to false. + // Deprecated. This field is never respected and should always be set to false. bool skip_generate_workflow_task = 5 [deprecated = true]; // When signal origin is a workflow execution, this field is set. temporal.api.common.v1.WorkflowExecution external_workflow_execution = 6; @@ -534,8 +549,8 @@ message RequestCancelExternalWorkflowExecutionInitiatedEventAttributes { string namespace = 2; string namespace_id = 7; temporal.api.common.v1.WorkflowExecution workflow_execution = 3; - // Deprecated - string control = 4; + // Deprecated. + string control = 4 [deprecated = true]; // Workers are expected to set this to true if the workflow they are requesting to cancel is // a child of the workflow which issued the request bool child_workflow_only = 5; @@ -555,8 +570,8 @@ message RequestCancelExternalWorkflowExecutionFailedEventAttributes { // id of the `REQUEST_CANCEL_EXTERNAL_WORKFLOW_EXECUTION_INITIATED` event this failure // corresponds to int64 initiated_event_id = 5; - // Deprecated - string control = 6; + // Deprecated. 
+ string control = 6 [deprecated = true]; } message ExternalWorkflowExecutionCancelRequestedEventAttributes { @@ -582,8 +597,8 @@ message SignalExternalWorkflowExecutionInitiatedEventAttributes { string signal_name = 4; // Serialized arguments to provide to the signal handler temporal.api.common.v1.Payloads input = 5; - // Deprecated - string control = 6; + // Deprecated. + string control = 6 [deprecated = true]; // Workers are expected to set this to true if the workflow they are requesting to cancel is // a child of the workflow which issued the request bool child_workflow_only = 7; @@ -600,8 +615,8 @@ message SignalExternalWorkflowExecutionFailedEventAttributes { string namespace_id = 7; temporal.api.common.v1.WorkflowExecution workflow_execution = 4; int64 initiated_event_id = 5; - // Deprecated - string control = 6; + // Deprecated. + string control = 6 [deprecated = true]; } message ExternalWorkflowExecutionSignaledEventAttributes { @@ -612,8 +627,8 @@ message ExternalWorkflowExecutionSignaledEventAttributes { string namespace = 2; string namespace_id = 5; temporal.api.common.v1.WorkflowExecution workflow_execution = 3; - // Deprecated - string control = 4; + // Deprecated. + string control = 4 [deprecated = true]; } message UpsertWorkflowSearchAttributesEventAttributes { @@ -648,8 +663,8 @@ message StartChildWorkflowExecutionInitiatedEventAttributes { google.protobuf.Duration workflow_task_timeout = 8; // Default: PARENT_CLOSE_POLICY_TERMINATE. temporal.api.enums.v1.ParentClosePolicy parent_close_policy = 9; - // Deprecated - string control = 10; + // Deprecated. + string control = 10 [deprecated = true]; // The `WORKFLOW_TASK_COMPLETED` event which this command was reported with int64 workflow_task_completed_event_id = 11; // Default: WORKFLOW_ID_REUSE_POLICY_ALLOW_DUPLICATE. 
@@ -662,7 +677,8 @@ message StartChildWorkflowExecutionInitiatedEventAttributes { temporal.api.common.v1.SearchAttributes search_attributes = 17; // If this is set, the child workflow inherits the Build ID of the parent. Otherwise, the assignment // rules of the child's Task Queue will be used to independently assign a Build ID to it. - bool inherit_build_id = 19; + // Deprecated. Only considered for versioning v0.2. + bool inherit_build_id = 19 [deprecated = true]; // Priority metadata temporal.api.common.v1.Priority priority = 20; } @@ -675,8 +691,8 @@ message StartChildWorkflowExecutionFailedEventAttributes { string workflow_id = 2; temporal.api.common.v1.WorkflowType workflow_type = 3; temporal.api.enums.v1.StartChildWorkflowExecutionFailedCause cause = 4; - // Deprecated - string control = 5; + // Deprecated. + string control = 5 [deprecated = true]; // Id of the `START_CHILD_WORKFLOW_EXECUTION_INITIATED` event which this event corresponds to int64 initiated_event_id = 6; // The `WORKFLOW_TASK_COMPLETED` event which this command was reported with @@ -893,7 +909,7 @@ message NexusOperationStartedEventAttributes { // This ID is used when canceling the operation. // // Deprecated: Renamed to operation_token. - string operation_id = 3; + string operation_id = 3 [deprecated = true]; // The request ID allocated at schedule time. string request_id = 4; diff --git a/sdk-core-protos/protos/api_upstream/temporal/api/nexus/v1/message.proto b/sdk-core-protos/protos/api_upstream/temporal/api/nexus/v1/message.proto index 6a6eb5641..f39890ad5 100644 --- a/sdk-core-protos/protos/api_upstream/temporal/api/nexus/v1/message.proto +++ b/sdk-core-protos/protos/api_upstream/temporal/api/nexus/v1/message.proto @@ -68,8 +68,8 @@ message CancelOperationRequest { string operation = 2; // Operation ID as originally generated by a Handler. // - // Deprecated: Renamed to operation_token. - string operation_id = 3; + // Deprecated. Renamed to operation_token. 
+ string operation_id = 3 [deprecated = true]; // Operation token as originally generated by a Handler. string operation_token = 4; @@ -103,8 +103,8 @@ message StartOperationResponse { // The operation will complete asynchronously. // The returned ID can be used to reference this operation. message Async { - // Deprecated: Renamed to operation_token. - string operation_id = 1; + // Deprecated. Renamed to operation_token. + string operation_id = 1 [deprecated = true]; repeated Link links = 2; string operation_token = 3; } diff --git a/sdk-core-protos/protos/api_upstream/temporal/api/schedule/v1/message.proto b/sdk-core-protos/protos/api_upstream/temporal/api/schedule/v1/message.proto index 6f9cb206b..4f633fa92 100644 --- a/sdk-core-protos/protos/api_upstream/temporal/api/schedule/v1/message.proto +++ b/sdk-core-protos/protos/api_upstream/temporal/api/schedule/v1/message.proto @@ -167,7 +167,8 @@ message ScheduleSpec { // Interval-based specifications of times. repeated IntervalSpec interval = 2; // Any timestamps matching any of exclude_* will be skipped. - repeated CalendarSpec exclude_calendar = 3 [deprecated = true]; // use exclude_structured_calendar + // Deprecated. Use exclude_structured_calendar. + repeated CalendarSpec exclude_calendar = 3 [deprecated = true]; repeated StructuredCalendarSpec exclude_structured_calendar = 9; // If start_time is set, any timestamps before start_time will be skipped. // (Together, start_time and end_time make an inclusive interval.) @@ -342,6 +343,7 @@ message ScheduleInfo { google.protobuf.Timestamp create_time = 6; google.protobuf.Timestamp update_time = 7; + // Deprecated. 
string invalid_schedule_error = 8 [deprecated = true];
 }
 
diff --git a/sdk-core-protos/protos/api_upstream/temporal/api/worker/v1/message.proto b/sdk-core-protos/protos/api_upstream/temporal/api/worker/v1/message.proto
new file mode 100644
index 000000000..024357ce5
--- /dev/null
+++ b/sdk-core-protos/protos/api_upstream/temporal/api/worker/v1/message.proto
@@ -0,0 +1,126 @@
+syntax = "proto3";
+
+package temporal.api.worker.v1;
+
+option go_package = "go.temporal.io/api/worker/v1;worker";
+option java_package = "io.temporal.api.worker.v1";
+option java_multiple_files = true;
+option java_outer_classname = "MessageProto";
+option ruby_package = "Temporalio::Api::Worker::V1";
+option csharp_namespace = "Temporalio.Api.Worker.V1";
+
+import "google/protobuf/duration.proto";
+import "google/protobuf/timestamp.proto";
+import "temporal/api/deployment/v1/message.proto";
+import "temporal/api/enums/v1/common.proto";
+
+message WorkerPollerInfo {
+    // Number of polling RPCs that are currently in flight.
+    int32 current_pollers = 1;
+
+    google.protobuf.Timestamp last_successful_poll_time = 2;
+
+    // Set true if the number of concurrent pollers is auto-scaled
+    bool is_autoscaling = 3;
+}
+
+message WorkerSlotsInfo {
+    // Number of slots available to the worker for specific tasks.
+    // May be -1 if the upper bound is not known.
+    int32 current_available_slots = 1;
+    // Number of slots used by the worker for specific tasks.
+    int32 current_used_slots = 2;
+
+    // Kind of the slot supplier, which is used to determine how the slots are allocated.
+    // Possible values: "Fixed | ResourceBased | Custom String"
+    string slot_supplier_kind = 3;
+
+    // Total number of tasks processed (completed both successfully and unsuccessfully, or any other way)
+    // by the worker since the worker started. This is a cumulative counter.
+    int32 total_processed_tasks = 4;
+    // Total number of failed tasks processed by the worker so far.
+    int32 total_failed_tasks = 5;
+
+    // Number of tasks processed since the last heartbeat from the worker.
+    // This is a cumulative counter, and it is reset to 0 each time the worker sends a heartbeat.
+    // Contains both successful and failed tasks.
+    int32 last_interval_processed_tasks = 6;
+    // Number of failed tasks processed since the last heartbeat from the worker.
+    int32 last_interval_failure_tasks = 7;
+}
+
+// Holds everything needed to identify the worker host/process context
+message WorkerHostInfo {
+    // Worker host identifier.
+    string host_name = 1;
+
+    // Worker process identifier, should be unique for the host.
+    string process_id = 2;
+
+    // System used CPU as a float in the range [0.0, 1.0] where 1.0 is defined as all
+    // cores on the host pegged.
+    float current_host_cpu_usage = 3;
+    // System used memory as a float in the range [0.0, 1.0] where 1.0 is defined as
+    // all available memory on the host is used.
+    float current_host_mem_usage = 4;
+}
+
+// Worker info message, contains information about the worker and its current state.
+// All information is provided by the worker itself.
+// (-- api-linter: core::0140::prepositions=disabled
+//     aip.dev/not-precedent: Removing those words makes names less clear. --)
+message WorkerHeartbeat {
+    // Worker identifier, should be unique for the namespace.
+    // It is distinct from worker identity, which is not necessarily namespace-unique.
+    string worker_instance_key = 1;
+
+    // Worker identity, set by the client, may not be unique.
+    // Usually host_name+(user group name)+process_id, but can be overwritten by the user.
+    string worker_identity = 2;
+
+
+    // Worker host information.
+    WorkerHostInfo host_info = 3;
+
+    // Task queue this worker is polling for tasks.
+    string task_queue = 4;
+
+    temporal.api.deployment.v1.WorkerDeploymentVersion deployment_version = 5;
+
+    string sdk_name = 6;
+    string sdk_version = 7;
+
+    // Worker status. Defined by SDK.
+ temporal.api.enums.v1.WorkerStatus status = 8; + + // Worker start time. + // It can be used to determine worker uptime. (current time - start time) + google.protobuf.Timestamp start_time = 9; + + // Timestamp of this heartbeat, coming from the worker. Worker should set it to "now". + // Note that this timestamp comes directly from the worker and is subject to workers' clock skew. + google.protobuf.Timestamp heartbeat_time = 10; + // Elapsed time since the last heartbeat from the worker. + google.protobuf.Duration elapsed_since_last_heartbeat = 11; + + WorkerSlotsInfo workflow_task_slots_info = 12; + WorkerSlotsInfo activity_task_slots_info = 13; + WorkerSlotsInfo nexus_task_slots_info = 14; + WorkerSlotsInfo local_activity_slots_info = 15; + + WorkerPollerInfo workflow_poller_info = 16; + WorkerPollerInfo workflow_sticky_poller_info = 17; + WorkerPollerInfo activity_poller_info = 18; + WorkerPollerInfo nexus_poller_info = 19; + + // A Workflow Task found a cached Workflow Execution to run against. + int32 total_sticky_cache_hit = 20; + // A Workflow Task did not find a cached Workflow execution to run against. + int32 total_sticky_cache_miss = 21; + // Current cache size, expressed in number of Workflow Executions. + int32 current_sticky_cache_size = 22; +} + +message WorkerInfo { + WorkerHeartbeat worker_heartbeat = 1; +} \ No newline at end of file diff --git a/sdk-core-protos/protos/api_upstream/temporal/api/workflow/v1/message.proto b/sdk-core-protos/protos/api_upstream/temporal/api/workflow/v1/message.proto index 98134eb14..1d5737c79 100644 --- a/sdk-core-protos/protos/api_upstream/temporal/api/workflow/v1/message.proto +++ b/sdk-core-protos/protos/api_upstream/temporal/api/workflow/v1/message.proto @@ -45,7 +45,7 @@ message WorkflowExecutionInfo { int64 history_size_bytes = 15; // If set, the most recent worker version stamp that appeared in a workflow task completion // Deprecated. This field should be cleaned up when versioning-2 API is removed. 
[cleanup-experimental-wv] - temporal.api.common.v1.WorkerVersionStamp most_recent_worker_version_stamp = 16; + temporal.api.common.v1.WorkerVersionStamp most_recent_worker_version_stamp = 16 [deprecated = true]; // Workflow execution duration is defined as difference between close time and execution time. // This field is only populated if the workflow is closed. google.protobuf.Duration execution_duration = 17; @@ -137,7 +137,7 @@ message WorkflowExecutionVersioningInfo { // behaviors. // This field is first set after an execution completes its first workflow task on a versioned // worker, and set again on completion of every subsequent workflow task. - // For child workflows of Pinned parents, this will be set to Pinned (along with `version`) when + // For child workflows of Pinned parents, this will be set to Pinned (along with `deployment_version`) when // the the child starts so that child's first workflow task goes to the same Version as the // parent. After the first workflow task, it depends on the child workflow itself if it wants // to stay pinned or become unpinned (according to Versioning Behavior set in the worker). @@ -167,7 +167,8 @@ message WorkflowExecutionVersioningInfo { // precedence over SDK-sent `behavior` (and `version` when override is PINNED). An // override can be set when starting a new execution, as well as afterwards by calling the // `UpdateWorkflowExecutionOptions` API. - // Pinned overrides are automatically inherited by child workflows. + // Pinned overrides are automatically inherited by child workflows, continue-as-new workflows, + // workflow retries, and cron workflows. VersioningOverride versioning_override = 3; // When present, indicates the workflow is transitioning to a different deployment. Can // indicate one of the following transitions: unversioned -> versioned, versioned -> versioned @@ -202,7 +203,7 @@ message WorkflowExecutionVersioningInfo { // start a transition to that version and continue execution there. 
// A version transition can only exist while there is a pending or started workflow task. // Once the pending workflow task completes on the transition's target version, the - // transition completes and the workflow's `behavior`, and `version` fields are updated per the + // transition completes and the workflow's `behavior`, and `deployment_version` fields are updated per the // worker's task completion response. // Pending activities will not start new attempts during a transition. Once the transition is // completed, pending activities will start their next attempt on the new version. @@ -260,16 +261,16 @@ message PendingActivityInfo { // independently-assigned build ID to the database. This case heals automatically once the task is dispatched. // Deprecated. This field should be cleaned up when versioning-2 API is removed. [cleanup-experimental-wv] oneof assigned_build_id { - // When present, it means this activity is assigned to the build ID of its workflow. + // Deprecated. When present, it means this activity is assigned to the build ID of its workflow. google.protobuf.Empty use_workflow_build_id = 13 [deprecated = true]; - // This means the activity is independently versioned and not bound to the build ID of its workflow. + // Deprecated. This means the activity is independently versioned and not bound to the build ID of its workflow. // The activity will use the build id in this field instead. // If the task fails and is scheduled again, the assigned build ID may change according to the latest versioning // rules. string last_independently_assigned_build_id = 14 [deprecated = true]; } - // The version stamp of the worker to whom this activity was most recently dispatched - // Deprecated. This field should be cleaned up when versioning-2 API is removed. [cleanup-experimental-wv] + // Deprecated. The version stamp of the worker to whom this activity was most recently dispatched + // This field should be cleaned up when versioning-2 API is removed. 
[cleanup-experimental-wv] temporal.api.common.v1.WorkerVersionStamp last_worker_version_stamp = 15 [deprecated = true]; // The time activity will wait until the next retry. @@ -370,7 +371,7 @@ message ResetPoints { message ResetPointInfo { // Worker build id. string build_id = 7; - // A worker binary version identifier (deprecated). + // Deprecated. A worker binary version identifier. string binary_checksum = 1 [deprecated = true]; // The first run ID in the execution chain that was touched by this worker build. string run_id = 2; @@ -465,8 +466,8 @@ message PendingNexusOperationInfo { // Operation ID. Only set for asynchronous operations after a successful StartOperation call. // - // Deprecated: Renamed to operation_token. - string operation_id = 4; + // Deprecated. Renamed to operation_token. + string operation_id = 4 [deprecated = true]; // Schedule-to-close timeout for this operation. // This is the only timeout settable by a workflow. @@ -535,6 +536,8 @@ message WorkflowExecutionOptions { // `WorkflowExecutionInfo.VersioningInfo` for more information. To remove the override, call // `UpdateWorkflowExecutionOptions` with a null `VersioningOverride`, and use the `update_mask` // to indicate that it should be mutated. +// Pinned overrides are automatically inherited by child workflows, continue-as-new workflows, +// workflow retries, and cron workflows. message VersioningOverride { // Indicates whether to override the workflow to be AutoUpgrade or Pinned. 
oneof override { diff --git a/sdk-core-protos/protos/api_upstream/temporal/api/workflowservice/v1/request_response.proto b/sdk-core-protos/protos/api_upstream/temporal/api/workflowservice/v1/request_response.proto index 43993e246..652cc257c 100644 --- a/sdk-core-protos/protos/api_upstream/temporal/api/workflowservice/v1/request_response.proto +++ b/sdk-core-protos/protos/api_upstream/temporal/api/workflowservice/v1/request_response.proto @@ -40,6 +40,7 @@ import "temporal/api/batch/v1/message.proto"; import "temporal/api/sdk/v1/task_complete_metadata.proto"; import "temporal/api/sdk/v1/user_metadata.proto"; import "temporal/api/nexus/v1/message.proto"; +import "temporal/api/worker/v1/message.proto"; import "google/protobuf/duration.proto"; import "google/protobuf/field_mask.proto"; @@ -254,17 +255,20 @@ message PollWorkflowTaskQueueRequest { temporal.api.taskqueue.v1.TaskQueue task_queue = 2; // The identity of the worker/client who is polling this task queue string identity = 3; - // DEPRECATED since 1.21 - use `deployment_options` instead. + // Deprecated. Use deployment_options instead. // Each worker process should provide an ID unique to the specific set of code it is running // "checksum" in this field name isn't very accurate, it should be though of as an id. string binary_checksum = 4 [deprecated = true]; + // Deprecated. Use deployment_options instead. // Information about this worker's build identifier and if it is choosing to use the versioning // feature. See the `WorkerVersionCapabilities` docstring for more. - // Deprecated. Replaced by deployment_options. temporal.api.common.v1.WorkerVersionCapabilities worker_version_capabilities = 5 [deprecated = true]; // Worker deployment options that user has set in the worker. // Experimental. Worker Deployments are experimental and might significantly change in the future. temporal.api.deployment.v1.WorkerDeploymentOptions deployment_options = 6; + + // Worker info to be sent to the server. 
+ temporal.api.worker.v1.WorkerHeartbeat worker_heartbeat = 7; } message PollWorkflowTaskQueueResponse { @@ -339,7 +343,7 @@ message RespondWorkflowTaskCompletedRequest { // something useful, but cannot complete it within the workflow task timeout. Local activities // which run for longer than the task timeout being the prime example. bool force_create_new_workflow_task = 6; - // DEPRECATED since 1.21 - use `deployment_options` instead. + // Deprecated. Use `deployment_options` instead. // Worker process' unique binary id string binary_checksum = 7 [deprecated = true]; // Responses to the `queries` field in the task being responded to @@ -402,7 +406,7 @@ message RespondWorkflowTaskFailedRequest { temporal.api.failure.v1.Failure failure = 3; // The identity of the worker/client string identity = 4; - // DEPRECATED since 1.21 - use `deployment_options` instead. + // Deprecated. Use `deployment_options` instead. // Worker process' unique binary id string binary_checksum = 5 [deprecated = true]; string namespace = 6; @@ -436,6 +440,10 @@ message PollActivityTaskQueueRequest { temporal.api.common.v1.WorkerVersionCapabilities worker_version_capabilities = 5 [deprecated = true]; // Worker deployment options that user has set in the worker. temporal.api.deployment.v1.WorkerDeploymentOptions deployment_options = 6; + + // Worker info to be sent to the server. + temporal.api.worker.v1.WorkerHeartbeat worker_heartbeat = 7; + } message PollActivityTaskQueueResponse { @@ -713,8 +721,8 @@ message SignalWorkflowExecutionRequest { string identity = 5; // Used to de-dupe sent signals string request_id = 6; - // Deprecated - string control = 7; + // Deprecated. + string control = 7 [deprecated = true]; // Headers that are passed with the signal to the processing workflow. // These can include things like auth or tracing tokens. 
temporal.api.common.v1.Header header = 8; @@ -760,8 +768,8 @@ message SignalWithStartWorkflowExecutionRequest { string signal_name = 12; // Serialized value(s) to provide with the signal temporal.api.common.v1.Payloads signal_input = 13; - // Deprecated - string control = 14; + // Deprecated. + string control = 14 [deprecated = true]; // Retry policy for the workflow temporal.api.common.v1.RetryPolicy retry_policy = 15; // See https://docs.temporal.io/docs/content/what-is-a-temporal-cron-job/ @@ -809,9 +817,9 @@ message ResetWorkflowExecutionRequest { int64 workflow_task_finish_event_id = 4; // Used to de-dupe reset requests string request_id = 5; - // Event types to be reapplied (deprecated) + // Deprecated. Use `options`. // Default: RESET_REAPPLY_TYPE_SIGNAL - temporal.api.enums.v1.ResetReapplyType reset_reapply_type = 6; + temporal.api.enums.v1.ResetReapplyType reset_reapply_type = 6 [deprecated = true]; // Event types not to be reapplied repeated temporal.api.enums.v1.ResetReapplyExcludeType reset_reapply_exclude_types = 7; // Operations to perform after the workflow has been reset. These operations will be applied @@ -993,6 +1001,8 @@ message ShutdownWorkerRequest { string sticky_task_queue = 2; string identity = 3; string reason = 4; + + temporal.api.worker.v1.WorkerHeartbeat worker_heartbeat = 5; } message ShutdownWorkerResponse { @@ -1032,44 +1042,52 @@ message DescribeWorkflowExecutionResponse { // aip.dev/not-precedent: field_behavior annotation not available in our gogo fork --) message DescribeTaskQueueRequest { string namespace = 1; - // Sticky queues are not supported in `ENHANCED` mode. + + // Sticky queues are not supported in deprecated ENHANCED mode. temporal.api.taskqueue.v1.TaskQueue task_queue = 2; - // Deprecated. Use `ENHANCED` mode with `task_queue_types`. Ignored in `ENHANCED` mode. + // If unspecified (TASK_QUEUE_TYPE_UNSPECIFIED), then default value (TASK_QUEUE_TYPE_WORKFLOW) will be used. 
+ // Only supported in default mode (use `task_queue_types` in ENHANCED mode instead). temporal.api.enums.v1.TaskQueueType task_queue_type = 3; - // Deprecated. Ignored in `ENHANCED` mode. - bool include_task_queue_status = 4; - // All options except `task_queue_type` and `include_task_queue_status` are only available in the `ENHANCED` mode. - temporal.api.enums.v1.DescribeTaskQueueMode api_mode = 5; + // Report stats for the requested task queue type(s). + bool report_stats = 8; + + // Deprecated, use `report_stats` instead. + // If true, the task queue status will be included in the response. + bool include_task_queue_status = 4 [deprecated = true]; + + // Deprecated. ENHANCED mode is also being deprecated. + // Select the API mode to use for this request: DEFAULT mode (if unset) or ENHANCED mode. + // Consult the documentation for each field to understand which mode it is supported in. + temporal.api.enums.v1.DescribeTaskQueueMode api_mode = 5 [deprecated = true]; + // Deprecated (as part of the ENHANCED mode deprecation). // Optional. If not provided, the result for the default Build ID will be returned. The default Build ID is the one // mentioned in the first unconditional Assignment Rule. If there is no default Build ID, the result for the // unversioned queue will be returned. // (-- api-linter: core::0140::prepositions --) - temporal.api.taskqueue.v1.TaskQueueVersionSelection versions = 6; + temporal.api.taskqueue.v1.TaskQueueVersionSelection versions = 6 [deprecated = true]; + // Deprecated (as part of the ENHANCED mode deprecation). // Task queue types to report info about. If not specified, all types are considered. 
- repeated temporal.api.enums.v1.TaskQueueType task_queue_types = 7; - // Report stats for the requested task queue types and versions - bool report_stats = 8; - // Report list of pollers for requested task queue types and versions - bool report_pollers = 9; + repeated temporal.api.enums.v1.TaskQueueType task_queue_types = 7 [deprecated = true]; + + // Deprecated (as part of the ENHANCED mode deprecation). + // Report list of pollers for requested task queue types and versions. + bool report_pollers = 9 [deprecated = true]; + + // Deprecated (as part of the ENHANCED mode deprecation). // Report task reachability for the requested versions and all task types (task reachability is not reported // per task type). - bool report_task_reachability = 10; + bool report_task_reachability = 10 [deprecated = true]; } message DescribeTaskQueueResponse { - // Deprecated. Use `versions_info.types_info.pollers` with `ENHANCED` mode instead. - // Not set in `ENHANCED` mode. repeated temporal.api.taskqueue.v1.PollerInfo pollers = 1; - // Deprecated. Not set in `ENHANCED` mode. - temporal.api.taskqueue.v1.TaskQueueStatus task_queue_status = 2; - // This map contains Task Queue information for each Build ID. Empty string as key value means unversioned. - // Only set in `ENHANCED` mode. - map versions_info = 3; + // Statistics for the task queue. Only populated when `report_stats` is set to true in the request. + temporal.api.taskqueue.v1.TaskQueueStats stats = 5; // Specifies which Worker Deployment Version(s) Server routes this Task Queue's tasks to. // When not present, it means the tasks are routed to Unversioned workers (workers with @@ -1081,6 +1099,15 @@ message DescribeTaskQueueResponse { // are typically not Pinned until they complete their first task (unless they are started with // a Pinned VersioningOverride or are Child Workflows of a Pinned parent). temporal.api.taskqueue.v1.TaskQueueVersioningInfo versioning_info = 4; + + // Deprecated. + // Status of the task queue. 
Only populated when `include_task_queue_status` is set to true in the request. + temporal.api.taskqueue.v1.TaskQueueStatus task_queue_status = 2 [deprecated = true]; + + // Deprecated. + // Only returned in ENHANCED mode. + // This map contains Task Queue information for each Build ID. Empty string as key value means unversioned. + map versions_info = 3 [deprecated = true]; } message GetClusterInfoRequest { @@ -1153,6 +1180,7 @@ message GetSystemInfoResponse { // True if the server supports Nexus operations. // This flag is dependent both on server version and for Nexus to be enabled via server configuration. bool nexus = 11; + } } @@ -1737,6 +1765,9 @@ message PollNexusTaskQueueRequest { temporal.api.common.v1.WorkerVersionCapabilities worker_version_capabilities = 4 [deprecated = true]; // Worker deployment options that user has set in the worker. temporal.api.deployment.v1.WorkerDeploymentOptions deployment_options = 6; + + // Worker info to be sent to the server. + temporal.api.worker.v1.WorkerHeartbeat worker_heartbeat = 7; } message PollNexusTaskQueueResponse { @@ -1835,13 +1866,20 @@ message UpdateActivityOptionsRequest { // Controls which fields from `activity_options` will be applied google.protobuf.FieldMask update_mask = 5; - // either activity id or activity type must be provided - oneof activity { - // Only activity with this ID will be updated. - string id = 6; - // Update all running activities of this type. - string type = 7; - } + // either activity id or activity type must be provided + oneof activity { + // Only activity with this ID will be updated. + string id = 6; + // Update all running activities of this type. + string type = 7; + } + + // If set, the activity options will be restored to the default. + // Default options are then options activity was created with. + // They are part of the first SCHEDULE event. 
+ // This flag cannot be combined with any other option; if you supply + // restore_original together with other options, the request will be rejected. + bool restore_original = 8; } message UpdateActivityOptionsResponse { @@ -1932,6 +1970,12 @@ message ResetActivityRequest { // If set, and activity is in backoff, the activity will start at a random time within the specified jitter duration. // (unless it is paused and keep_paused is set) google.protobuf.Duration jitter = 8; + + // If set, the activity options will be restored to the defaults. + // Default options are then options activity was created with. + // They are part of the first SCHEDULE event. + bool restore_original_options = 9; + } message ResetActivityResponse { @@ -2146,6 +2190,12 @@ message ListWorkerDeploymentsResponse { string name = 1; google.protobuf.Timestamp create_time = 2; temporal.api.deployment.v1.RoutingConfig routing_config = 3; + // Summary of the version that was added most recently in the Worker Deployment. + temporal.api.deployment.v1.WorkerDeploymentInfo.WorkerDeploymentVersionSummary latest_version_summary = 4; + // Summary of the current version of the Worker Deployment. + temporal.api.deployment.v1.WorkerDeploymentInfo.WorkerDeploymentVersionSummary current_version_summary = 5; + // Summary of the ramping version of the Worker Deployment. + temporal.api.deployment.v1.WorkerDeploymentInfo.WorkerDeploymentVersionSummary ramping_version_summary = 6; } } @@ -2193,6 +2243,8 @@ message UpdateWorkerDeploymentVersionMetadataRequest { map upsert_entries = 3; // List of keys to remove from the metadata. repeated string remove_entries = 4; + // Optional. The identity of the client who initiated this request. + string identity = 6; } message UpdateWorkerDeploymentVersionMetadataResponse { @@ -2308,3 +2360,45 @@ message TriggerWorkflowRuleResponse { // True is the rule was applied, based on the rule conditions (predicate/visibility_query). 
bool applied = 1;
}
+message RecordWorkerHeartbeatRequest {
+    // Namespace this worker belongs to.
+    string namespace = 1;
+
+    // The identity of the client who initiated this request.
+    string identity = 2;
+
+    temporal.api.worker.v1.WorkerHeartbeat worker_heartbeat = 3;
+}
+
+message RecordWorkerHeartbeatResponse {
+
+}
+
+message ListWorkersRequest {
+    string namespace = 1;
+    int32 page_size = 2;
+    bytes next_page_token = 3;
+
+    // `query` in ListWorkers is used to filter workers based on worker status info.
+    // The following worker status attributes are supported as part of the query:
+    //* WorkerInstanceKey
+    //* WorkerIdentity
+    //* HostName
+    //* TaskQueue
+    //* DeploymentName
+    //* BuildId
+    //* SdkName
+    //* SdkVersion
+    //* StartTime
+    //* LastHeartbeatTime
+    //* Status
+    // Currently, metrics are not supported as part of the ListWorkers query.
+    string query = 4;
+}
+
+message ListWorkersResponse {
+    repeated temporal.api.worker.v1.WorkerInfo workers_info = 1;
+
+    // Next page token
+    bytes next_page_token = 2;
+}
diff --git a/sdk-core-protos/protos/api_upstream/temporal/api/workflowservice/v1/service.proto b/sdk-core-protos/protos/api_upstream/temporal/api/workflowservice/v1/service.proto
index 865386506..bfa622dcd 100644
--- a/sdk-core-protos/protos/api_upstream/temporal/api/workflowservice/v1/service.proto
+++ b/sdk-core-protos/protos/api_upstream/temporal/api/workflowservice/v1/service.proto
@@ -1173,4 +1173,25 @@ service WorkflowService {
         };
     }
 
+    // RecordWorkerHeartbeat receives a heartbeat request from the worker.
+    rpc RecordWorkerHeartbeat (RecordWorkerHeartbeatRequest) returns (RecordWorkerHeartbeatResponse) {
+        option (google.api.http) = {
+            post: "/namespaces/{namespace}/workers/heartbeat"
+            body: "*"
+            additional_bindings {
+                post: "/api/v1/namespaces/{namespace}/workers/heartbeat"
+                body: "*"
+            }
+        };
+    };
+
+    // ListWorkers is a visibility API to list worker status information in a specific namespace.
+ rpc ListWorkers (ListWorkersRequest) returns (ListWorkersResponse) { + option (google.api.http) = { + get: "/namespaces/{namespace}/workers" + additional_bindings { + get: "/api/v1/namespaces/{namespace}/workers" + } + }; + } } diff --git a/sdk-core-protos/src/history_builder.rs b/sdk-core-protos/src/history_builder.rs index 653cc1d6e..152eb6b81 100644 --- a/sdk-core-protos/src/history_builder.rs +++ b/sdk-core-protos/src/history_builder.rs @@ -367,7 +367,6 @@ impl TestHistoryBuilder { run_id: run_id.into(), }), signal_name: signal_name.into(), - control: "".to_string(), ..Default::default() }) } diff --git a/sdk-core-protos/src/lib.rs b/sdk-core-protos/src/lib.rs index 23a011407..51c02d0a8 100644 --- a/sdk-core-protos/src/lib.rs +++ b/sdk-core-protos/src/lib.rs @@ -1834,6 +1834,7 @@ pub mod temporal { retry_policy: s.retry_policy.map(Into::into), cron_schedule: s.cron_schedule.clone(), parent_close_policy: s.parent_close_policy, + #[allow(deprecated)] inherit_build_id, priority: s.priority, }, @@ -1887,6 +1888,7 @@ pub mod temporal { } else { Some(c.search_attributes.into()) }, + #[allow(deprecated)] inherit_build_id, ..Default::default() }, @@ -2429,6 +2431,11 @@ pub mod temporal { tonic::include_proto!("temporal.api.version.v1"); } } + pub mod worker { + pub mod v1 { + tonic::include_proto!("temporal.api.worker.v1"); + } + } pub mod workflow { pub mod v1 { tonic::include_proto!("temporal.api.workflow.v1"); From 7b83f8ebc7515f63651b12da4e0cf548371944be Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Thu, 26 Jun 2025 16:31:42 -0700 Subject: [PATCH 02/13] Need to figure out how to pass client to timer thread, tried to pass heartbeat to clientBag, but that doesn't work bc clientBag not in mutex. 
Might need to revert back to passing client to heartbeat_info, but that causes the original problem of the timer thread not having the actual client --- core-api/src/worker.rs | 6 + core/Cargo.toml | 1 + core/src/core_tests/activity_tasks.rs | 36 ++--- core/src/core_tests/child_workflows.rs | 8 +- core/src/core_tests/determinism.rs | 12 +- core/src/core_tests/local_activities.rs | 40 ++--- core/src/core_tests/mod.rs | 6 +- core/src/core_tests/queries.rs | 30 ++-- core/src/core_tests/replay_flag.rs | 6 +- core/src/core_tests/updates.rs | 6 +- core/src/core_tests/workers.rs | 125 +++++++++++--- core/src/core_tests/workflow_tasks.rs | 73 ++++----- core/src/lib.rs | 12 +- core/src/pollers/poll_buffer.rs | 4 +- core/src/replay/mod.rs | 6 +- core/src/test_help/mod.rs | 9 +- core/src/worker/activities.rs | 8 +- .../activities/activity_heartbeat_manager.rs | 14 +- core/src/worker/client.rs | 52 +++++- core/src/worker/client/mocks.rs | 6 +- core/src/worker/heartbeat.rs | 152 ++++++++++++++++++ core/src/worker/mod.rs | 33 +++- core/src/worker/workflow/history_update.rs | 26 +-- .../upsert_search_attributes_state_machine.rs | 4 +- 24 files changed, 497 insertions(+), 178 deletions(-) create mode 100644 core/src/worker/heartbeat.rs diff --git a/core-api/src/worker.rs b/core-api/src/worker.rs index f3757bfd1..aa513e9eb 100644 --- a/core-api/src/worker.rs +++ b/core-api/src/worker.rs @@ -161,6 +161,12 @@ pub struct WorkerConfig { /// A versioning strategy for this worker. pub versioning_strategy: WorkerVersioningStrategy, + + /// The interval in which the worker will send a heartbeat. + /// The timer is reset on each existing RPC call that also happens to send this data, like + /// `PollWorkflowTaskQueueRequest`. 
+ #[builder(default = "Some(Duration::from_secs(1))")] + pub heartbeat_interval: Option, } impl WorkerConfig { diff --git a/core/Cargo.toml b/core/Cargo.toml index 89ee57c6b..374e1fdf4 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -35,6 +35,7 @@ enum-iterator = "2" flate2 = { version = "1.0", optional = true } futures-util = { version = "0.3", default-features = false } futures-channel = { version = "0.3", default-features = false, features = ["std"] } +gethostname = "1.0.2" governor = "0.8" http-body-util = { version = "0.1", optional = true } hyper = { version = "1.2", optional = true } diff --git a/core/src/core_tests/activity_tasks.rs b/core/src/core_tests/activity_tasks.rs index a01e2d605..c011b4f46 100644 --- a/core/src/core_tests/activity_tasks.rs +++ b/core/src/core_tests/activity_tasks.rs @@ -6,7 +6,7 @@ use crate::{ gen_assert_and_reply, mock_manual_poller, mock_poller, mock_poller_from_resps, mock_sdk_cfg, mock_worker, poll_and_reply, single_hist_mock_sg, test_worker_cfg, }, - worker::client::mocks::{mock_manual_workflow_client, mock_workflow_client}, + worker::client::mocks::{mock_manual_worker_client, mock_worker_client}, }; use futures_util::FutureExt; use itertools::Itertools; @@ -86,7 +86,7 @@ fn three_tasks() -> VecDeque { async fn max_activities_respected() { let _task_q = "q"; let mut tasks = three_tasks(); - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client .expect_poll_activity_task() .times(3) @@ -122,7 +122,7 @@ async fn max_activities_respected() { #[tokio::test] async fn activity_not_found_returns_ok() { - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); // Mock won't even be called, since we weren't tracking activity mock_client.expect_complete_activity_task().times(0); @@ -139,7 +139,7 @@ async fn activity_not_found_returns_ok() { #[tokio::test] async fn heartbeats_report_cancels_only_once() { - let mut mock_client = 
mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client .expect_record_activity_heartbeat() .times(2) @@ -265,7 +265,7 @@ async fn activity_cancel_interrupts_poll() { .times(3) .returning(move || poll_resps.pop_front().unwrap()); - let mut mock_client = mock_manual_workflow_client(); + let mut mock_client = mock_manual_worker_client(); mock_client .expect_record_activity_heartbeat() .times(1) @@ -323,7 +323,7 @@ async fn activity_cancel_interrupts_poll() { #[tokio::test] async fn activity_poll_timeout_retries() { - let mock_client = mock_workflow_client(); + let mock_client = mock_worker_client(); let mut calls = 0; let mut mock_act_poller = mock_poller(); mock_act_poller.expect_poll().times(3).returning(move || { @@ -352,7 +352,7 @@ async fn many_concurrent_heartbeat_cancels() { // them after a few successful heartbeats const CONCURRENCY_NUM: usize = 5; - let mut mock_client = mock_manual_workflow_client(); + let mut mock_client = mock_manual_worker_client(); let mut poll_resps = VecDeque::from( (0..CONCURRENCY_NUM) .map(|i| { @@ -516,7 +516,7 @@ async fn activity_timeout_no_double_resolve() { #[tokio::test] async fn can_heartbeat_acts_during_shutdown() { - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client .expect_record_activity_heartbeat() .times(1) @@ -567,7 +567,7 @@ async fn can_heartbeat_acts_during_shutdown() { #[tokio::test] async fn complete_act_with_fail_flushes_heartbeat() { let last_hb = 50; - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); let last_seen_payload = Rc::new(RefCell::new(None)); let lsp = last_seen_payload.clone(); mock_client @@ -622,7 +622,7 @@ async fn complete_act_with_fail_flushes_heartbeat() { #[tokio::test] async fn max_tq_acts_set_passed_to_poll_properly() { let rate = 9.28; - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client .expect_poll_activity_task() 
.returning(move |_, ao| { @@ -659,7 +659,7 @@ async fn no_eager_activities_requested_when_worker_options_disable_it( let num_eager_requested = Arc::new(AtomicUsize::new(0)); let num_eager_requested_clone = num_eager_requested.clone(); - let mut mock = mock_workflow_client(); + let mut mock = mock_worker_client(); mock.expect_complete_workflow_task() .times(1) .returning(move |req| { @@ -747,7 +747,7 @@ async fn activity_tasks_from_completion_are_delivered() { // Clone it to move into the callback below let num_eager_requested_clone = num_eager_requested.clone(); - let mut mock = mock_workflow_client(); + let mut mock = mock_worker_client(); mock.expect_complete_workflow_task() .times(1) .returning(move |req| { @@ -876,7 +876,7 @@ async fn activity_tasks_from_completion_reserve_slots() { t.add_full_wf_task(); t.add_workflow_execution_completed(); - let mut mock = mock_workflow_client(); + let mut mock = mock_worker_client(); // Set up two tasks to be returned via normal activity polling let act_tasks = VecDeque::from(vec![ PollActivityTaskQueueResponse { @@ -1004,7 +1004,7 @@ async fn activity_tasks_from_completion_reserve_slots() { #[tokio::test] async fn retryable_net_error_exhaustion_is_nonfatal() { - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client .expect_complete_activity_task() .times(1) @@ -1033,7 +1033,7 @@ async fn retryable_net_error_exhaustion_is_nonfatal() { #[tokio::test] async fn cant_complete_activity_with_unset_result_payload() { - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client .expect_poll_activity_task() .returning(move |_, _| { @@ -1076,7 +1076,7 @@ async fn graceful_shutdown(#[values(true, false)] at_max_outstanding: bool) { .times(1) .returning(move || None); // They shall all be reported as failed - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client .expect_fail_activity_task() 
.times(3) @@ -1153,7 +1153,7 @@ async fn activities_must_be_flushed_to_server_on_shutdown(#[values(true, false)] .expect_poll() .times(1) .returning(move || None); - let mut mock_client = mock_manual_workflow_client(); + let mut mock_client = mock_manual_worker_client(); mock_client .expect_complete_activity_task() .times(1) @@ -1251,7 +1251,7 @@ async fn pass_activity_summary_to_metadata() { #[tokio::test] async fn heartbeat_response_can_be_paused() { - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); // First heartbeat returns pause only mock_client .expect_record_activity_heartbeat() diff --git a/core/src/core_tests/child_workflows.rs b/core/src/core_tests/child_workflows.rs index c1bf6a923..1593acf94 100644 --- a/core/src/core_tests/child_workflows.rs +++ b/core/src/core_tests/child_workflows.rs @@ -4,7 +4,7 @@ use crate::{ MockPollCfg, ResponseType, build_fake_sdk, canned_histories, mock_sdk, mock_sdk_cfg, mock_worker, single_hist_mock_sg, }, - worker::client::mocks::mock_workflow_client, + worker::client::mocks::mock_worker_client, }; use temporal_client::WorkflowOptions; use temporal_sdk::{ChildWorkflowOptions, Signal, WfContext, WorkflowResult}; @@ -32,7 +32,7 @@ async fn signal_child_workflow(#[case] serial: bool) { let wf_id = "fakeid"; let wf_type = DEFAULT_WORKFLOW_TYPE; let t = canned_histories::single_child_workflow_signaled("child-id-1", SIGNAME); - let mock = mock_workflow_client(); + let mock = mock_worker_client(); let mut worker = mock_sdk(MockPollCfg::from_resp_batches( wf_id, t, @@ -130,7 +130,7 @@ async fn cancel_child_workflow_lang_thinks_not_started_but_is( } _ => canned_histories::single_child_workflow_cancelled("child-id-1"), }; - let mock = mock_workflow_client(); + let mock = mock_worker_client(); let mock = single_hist_mock_sg("fakeid", t, [ResponseType::AllHistory], mock, true); let core = mock_worker(mock); let act = core.poll_workflow_activation().await.unwrap(); @@ -179,7 +179,7 @@ async fn 
cancel_child_workflow_lang_thinks_not_started_but_is( #[tokio::test] async fn cancel_already_complete_child_ignored() { let t = canned_histories::single_child_workflow("child-id-1"); - let mock = mock_workflow_client(); + let mock = mock_worker_client(); let mock = single_hist_mock_sg("fakeid", t, [ResponseType::AllHistory], mock, true); let core = mock_worker(mock); let act = core.poll_workflow_activation().await.unwrap(); diff --git a/core/src/core_tests/determinism.rs b/core/src/core_tests/determinism.rs index 3e552e0bc..4c3ac90a4 100644 --- a/core/src/core_tests/determinism.rs +++ b/core/src/core_tests/determinism.rs @@ -2,7 +2,7 @@ use crate::{ internal_flags::CoreInternalFlags, replay::DEFAULT_WORKFLOW_TYPE, test_help::{MockPollCfg, ResponseType, canned_histories, mock_sdk, mock_sdk_cfg}, - worker::client::mocks::mock_workflow_client, + worker::client::mocks::mock_worker_client, }; use std::{ sync::atomic::{AtomicBool, AtomicUsize, Ordering}, @@ -40,7 +40,7 @@ async fn test_panic_wf_task_rejected_properly() { let wf_id = "fakeid"; let wf_type = DEFAULT_WORKFLOW_TYPE; let t = canned_histories::workflow_fails_with_failure_after_timer("1"); - let mock = mock_workflow_client(); + let mock = mock_worker_client(); let mut mh = MockPollCfg::from_resp_batches(wf_id, t, [1, 2, 2], mock); // We should see one wft failure which has unspecified cause, since panics don't have a defined // type. 
@@ -72,7 +72,7 @@ async fn test_wf_task_rejected_properly_due_to_nondeterminism(#[case] use_cache: let wf_id = "fakeid"; let wf_type = DEFAULT_WORKFLOW_TYPE; let t = canned_histories::single_timer_wf_completes("1"); - let mock = mock_workflow_client(); + let mock = mock_worker_client(); let mut mh = MockPollCfg::from_resp_batches( wf_id, t, @@ -131,7 +131,7 @@ async fn activity_id_or_type_change_is_nondeterministic( canned_histories::single_activity("1") }; t.set_flags_first_wft(&[CoreInternalFlags::IdAndTypeDeterminismChecks as u32], &[]); - let mock = mock_workflow_client(); + let mock = mock_worker_client(); let mut mh = MockPollCfg::from_resp_batches( wf_id, t, @@ -214,7 +214,7 @@ async fn child_wf_id_or_type_change_is_nondeterministic( let wf_type = DEFAULT_WORKFLOW_TYPE; let mut t = canned_histories::single_child_workflow("1"); t.set_flags_first_wft(&[CoreInternalFlags::IdAndTypeDeterminismChecks as u32], &[]); - let mock = mock_workflow_client(); + let mock = mock_worker_client(); let mut mh = MockPollCfg::from_resp_batches( wf_id, t, @@ -289,7 +289,7 @@ async fn repro_channel_missing_because_nondeterminism() { let _ts = t.add_by_type(EventType::TimerStarted); t.add_workflow_task_scheduled_and_started(); - let mock = mock_workflow_client(); + let mock = mock_worker_client(); let mut mh = MockPollCfg::from_resp_batches(wf_id, t, [1.into(), ResponseType::AllHistory], mock); mh.num_expected_fails = 1; diff --git a/core/src/core_tests/local_activities.rs b/core/src/core_tests/local_activities.rs index 327eb81bd..d524d0317 100644 --- a/core/src/core_tests/local_activities.rs +++ b/core/src/core_tests/local_activities.rs @@ -5,7 +5,7 @@ use crate::{ MockPollCfg, ResponseType, WorkerExt, build_mock_pollers, hist_to_poll_resp, mock_sdk, mock_sdk_cfg, mock_worker, single_hist_mock_sg, }, - worker::{LEGACY_QUERY_ID, client::mocks::mock_workflow_client}, + worker::{LEGACY_QUERY_ID, client::mocks::mock_worker_client}, }; use anyhow::anyhow; use 
crossbeam_queue::SegQueue; @@ -69,7 +69,7 @@ async fn local_act_two_wfts_before_marker(#[case] replay: bool, #[case] cached: t.add_workflow_execution_completed(); let wf_id = "fakeid"; - let mock = mock_workflow_client(); + let mock = mock_worker_client(); let resps = if replay { vec![ResponseType::AllHistory] } else { @@ -140,7 +140,7 @@ async fn local_act_many_concurrent() { t.add_workflow_execution_completed(); let wf_id = "fakeid"; - let mock = mock_workflow_client(); + let mock = mock_worker_client(); let mh = MockPollCfg::from_resp_batches(wf_id, t, [1, 2, 3], mock); let mut worker = mock_sdk(mh); @@ -178,7 +178,7 @@ async fn local_act_heartbeat(#[case] shutdown_middle: bool) { t.add_workflow_task_scheduled_and_started(); let wf_id = "fakeid"; - let mock = mock_workflow_client(); + let mock = mock_worker_client(); let mut mh = MockPollCfg::from_resp_batches(wf_id, t, [1, 2, 2, 2], mock); mh.enforce_correct_number_of_polls = false; let mut worker = mock_sdk_cfg(mh, |wc| { @@ -240,7 +240,7 @@ async fn local_act_fail_and_retry(#[case] eventually_pass: bool) { t.add_workflow_task_scheduled_and_started(); let wf_id = "fakeid"; - let mock = mock_workflow_client(); + let mock = mock_worker_client(); let mh = MockPollCfg::from_resp_batches(wf_id, t, [1], mock); let mut worker = mock_sdk(mh); @@ -316,7 +316,7 @@ async fn local_act_retry_long_backoff_uses_timer() { t.add_workflow_execution_completed(); let wf_id = "fakeid"; - let mock = mock_workflow_client(); + let mock = mock_worker_client(); let mh = MockPollCfg::from_resp_batches( wf_id, t, @@ -376,7 +376,7 @@ async fn local_act_null_result() { t.add_workflow_execution_completed(); let wf_id = "fakeid"; - let mock = mock_workflow_client(); + let mock = mock_worker_client(); let mh = MockPollCfg::from_resp_batches(wf_id, t, [ResponseType::AllHistory], mock); let mut worker = mock_sdk_cfg(mh, |w| w.max_cached_workflows = 1); @@ -418,7 +418,7 @@ async fn local_act_command_immediately_follows_la_marker() { 
t.add_full_wf_task(); let wf_id = "fakeid"; - let mock = mock_workflow_client(); + let mock = mock_worker_client(); // Bug only repros when seeing history up to third wft let mh = MockPollCfg::from_resp_batches(wf_id, t, [3], mock); let mut worker = mock_sdk_cfg(mh, |w| w.max_cached_workflows = 0); @@ -489,7 +489,7 @@ async fn query_during_wft_heartbeat_doesnt_accidentally_fail_to_continue_heartbe ), ), ]; - let mock = mock_workflow_client(); + let mock = mock_worker_client(); let mut mock = single_hist_mock_sg(wfid, t, tasks, mock, true); mock.worker_cfg(|wc| wc.max_cached_workflows = 1); let core = mock_worker(mock); @@ -605,7 +605,7 @@ async fn la_resolve_during_legacy_query_does_not_combine(#[case] impossible_quer pr }, ]; - let mut mock = mock_workflow_client(); + let mut mock = mock_worker_client(); if impossible_query_in_task { mock.expect_respond_legacy_query() .times(1) @@ -712,7 +712,7 @@ async fn test_schedule_to_start_timeout() { t.add_full_wf_task(); let wf_id = "fakeid"; - let mock = mock_workflow_client(); + let mock = mock_worker_client(); let mh = MockPollCfg::from_resp_batches(wf_id, t, [ResponseType::ToTaskNum(1)], mock); let mut worker = mock_sdk_cfg(mh, |w| w.max_cached_workflows = 1); @@ -791,7 +791,7 @@ async fn test_schedule_to_start_timeout_not_based_on_original_time( t.add_workflow_task_scheduled_and_started(); let wf_id = "fakeid"; - let mock = mock_workflow_client(); + let mock = mock_worker_client(); let mh = MockPollCfg::from_resp_batches(wf_id, t, [ResponseType::AllHistory], mock); let mut worker = mock_sdk_cfg(mh, |w| w.max_cached_workflows = 1); @@ -868,7 +868,7 @@ async fn start_to_close_timeout_allows_retries(#[values(true, false)] la_complet t.add_workflow_execution_completed(); let wf_id = "fakeid"; - let mock = mock_workflow_client(); + let mock = mock_worker_client(); let mh = MockPollCfg::from_resp_batches( wf_id, t, @@ -947,7 +947,7 @@ async fn wft_failure_cancels_running_las() { t.add_workflow_task_scheduled_and_started(); 
let wf_id = "fakeid"; - let mock = mock_workflow_client(); + let mock = mock_worker_client(); let mut mh = MockPollCfg::from_resp_batches(wf_id, t, [1, 2], mock); mh.num_expected_fails = 1; let mut worker = mock_sdk_cfg(mh, |w| w.max_cached_workflows = 1); @@ -1007,7 +1007,7 @@ async fn resolved_las_not_recorded_if_wft_fails_many_times() { t.add_workflow_task_scheduled_and_started(); let wf_id = "fakeid"; - let mock = mock_workflow_client(); + let mock = mock_worker_client(); let mut mh = MockPollCfg::from_resp_batches( wf_id, t, @@ -1058,7 +1058,7 @@ async fn local_act_records_nonfirst_attempts_ok() { t.add_workflow_task_scheduled_and_started(); let wf_id = "fakeid"; - let mock = mock_workflow_client(); + let mock = mock_worker_client(); let mut mh = MockPollCfg::from_resp_batches(wf_id, t, [1, 2, 3], mock); let nonfirst_counts = Arc::new(SegQueue::new()); let nfc_c = nonfirst_counts.clone(); @@ -1125,7 +1125,7 @@ async fn local_activities_can_be_delivered_during_shutdown() { t.add_timer_fired(timer_started_event_id, "1".to_string()); t.add_workflow_task_scheduled_and_started(); - let mock = mock_workflow_client(); + let mock = mock_worker_client(); let mut mock = single_hist_mock_sg( wfid, t, @@ -1214,7 +1214,7 @@ async fn queries_can_be_received_while_heartbeating() { pr }, ]; - let mut mock = mock_workflow_client(); + let mut mock = mock_worker_client(); mock.expect_respond_legacy_query() .times(1) .returning(move |_, _| Ok(Default::default())); @@ -1291,7 +1291,7 @@ async fn local_activity_after_wf_complete_is_discarded() { t.add_full_wf_task(); t.add_workflow_task_scheduled_and_started(); - let mock = mock_workflow_client(); + let mock = mock_worker_client(); let mut mock_cfg = MockPollCfg::from_resp_batches( wfid, t, @@ -1385,7 +1385,7 @@ async fn local_act_retry_explicit_delay() { t.add_workflow_task_scheduled_and_started(); let wf_id = "fakeid"; - let mock = mock_workflow_client(); + let mock = mock_worker_client(); let mh = 
MockPollCfg::from_resp_batches(wf_id, t, [1], mock); let mut worker = mock_sdk(mh); diff --git a/core/src/core_tests/mod.rs b/core/src/core_tests/mod.rs index 2f922f0b3..62fc66a02 100644 --- a/core/src/core_tests/mod.rs +++ b/core/src/core_tests/mod.rs @@ -13,7 +13,7 @@ use crate::{ Worker, errors::PollError, test_help::{MockPollCfg, build_mock_pollers, canned_histories, mock_worker, test_worker_cfg}, - worker::client::mocks::{mock_manual_workflow_client, mock_workflow_client}, + worker::client::mocks::{mock_manual_worker_client, mock_worker_client}, }; use futures_util::FutureExt; use std::{sync::LazyLock, time::Duration}; @@ -24,7 +24,7 @@ use tokio::{sync::Barrier, time::sleep}; #[tokio::test] async fn after_shutdown_server_is_not_polled() { let t = canned_histories::single_timer("fake_timer"); - let mh = MockPollCfg::from_resp_batches("fake_wf_id", t, [1], mock_workflow_client()); + let mh = MockPollCfg::from_resp_batches("fake_wf_id", t, [1], mock_worker_client()); let mut mock = build_mock_pollers(mh); // Just so we don't have to deal w/ cache overflow mock.worker_cfg(|cfg| cfg.max_cached_workflows = 1); @@ -49,7 +49,7 @@ static BARR: LazyLock = LazyLock::new(|| Barrier::new(3)); #[tokio::test] async fn shutdown_interrupts_both_polls() { - let mut mock_client = mock_manual_workflow_client(); + let mut mock_client = mock_manual_worker_client(); mock_client .expect_poll_activity_task() .times(1) diff --git a/core/src/core_tests/queries.rs b/core/src/core_tests/queries.rs index 4c9a9528f..5e2289321 100644 --- a/core/src/core_tests/queries.rs +++ b/core/src/core_tests/queries.rs @@ -3,7 +3,7 @@ use crate::{ MockPollCfg, MocksHolder, ResponseType, WorkerExt, build_mock_pollers, canned_histories, hist_to_poll_resp, mock_worker, single_hist_mock_sg, }, - worker::{LEGACY_QUERY_ID, client::mocks::mock_workflow_client}, + worker::{LEGACY_QUERY_ID, client::mocks::mock_worker_client}, }; use futures_util::stream; use std::{ @@ -64,7 +64,7 @@ async fn legacy_query(#[case] 
include_history: bool) { }, hist_to_poll_resp(&t, wfid.to_owned(), 2.into()), ]; - let mut mock = MockPollCfg::from_resp_batches(wfid, t, tasks, mock_workflow_client()); + let mut mock = MockPollCfg::from_resp_batches(wfid, t, tasks, mock_worker_client()); mock.num_expected_legacy_query_resps = 1; let mut mock = build_mock_pollers(mock); if !include_history { @@ -156,9 +156,9 @@ async fn new_queries(#[values(1, 3)] num_queries: usize) { } pr }]); - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client.expect_respond_legacy_query().times(0); - let mut mh = MockPollCfg::from_resp_batches(wfid, t, tasks, mock_workflow_client()); + let mut mh = MockPollCfg::from_resp_batches(wfid, t, tasks, mock_worker_client()); mh.completion_mock_fn = Some(Box::new(move |c| { // If the completion is the one ending the workflow, make sure it includes the query resps if c.commands[0].command_type() == CommandType::CompleteWorkflowExecution { @@ -235,7 +235,7 @@ async fn legacy_query_failure_on_wft_failure() { pr.history = Some(History { events: vec![] }); pr }]); - let mut mock = MockPollCfg::from_resp_batches(wfid, t, tasks, mock_workflow_client()); + let mut mock = MockPollCfg::from_resp_batches(wfid, t, tasks, mock_worker_client()); mock.num_expected_legacy_query_resps = 1; let mut mock = build_mock_pollers(mock); mock.worker_cfg(|wc| wc.max_cached_workflows = 10); @@ -298,7 +298,7 @@ async fn query_failure_because_nondeterminism(#[values(true, false)] legacy: boo } pr }]; - let mut mock = MockPollCfg::from_resp_batches(wfid, t, tasks, mock_workflow_client()); + let mut mock = MockPollCfg::from_resp_batches(wfid, t, tasks, mock_worker_client()); if legacy { mock.num_expected_legacy_query_resps = 1; } else { @@ -350,7 +350,7 @@ async fn legacy_query_after_complete(#[values(false, true)] full_history: bool) }; tasks.extend([query_with_hist_task.clone(), query_with_hist_task]); - let mut mock = MockPollCfg::from_resp_batches(wfid, t, 
tasks, mock_workflow_client()); + let mut mock = MockPollCfg::from_resp_batches(wfid, t, tasks, mock_worker_client()); mock.num_expected_legacy_query_resps = 2; let mut mock = build_mock_pollers(mock); mock.worker_cfg(|wc| wc.max_cached_workflows = 10); @@ -443,7 +443,7 @@ async fn query_cache_miss_causes_page_fetch_dont_reply_wft_too_early( ); pr }]); - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); if !matches!(hist_type, QueryHists::Full) { mock_client .expect_get_workflow_execution_history() @@ -534,7 +534,7 @@ async fn query_replay_with_continue_as_new_doesnt_reply_empty_command() { pr }; let tasks = VecDeque::from(vec![query_with_hist_task]); - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client .expect_complete_workflow_task() .times(1) @@ -618,7 +618,7 @@ async fn legacy_query_response_gets_not_found_not_fatal() { }); pr }]; - let mut mock = mock_workflow_client(); + let mut mock = mock_worker_client(); mock.expect_respond_legacy_query() .times(1) .returning(move |_, _| Err(tonic::Status::not_found("Query gone boi"))); @@ -671,7 +671,7 @@ async fn new_query_fail() { ); pr }]); - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client .expect_complete_workflow_task() .times(1) @@ -774,7 +774,7 @@ async fn legacy_query_combined_with_timer_fire_repro() { pr }, ]; - let mut mock = mock_workflow_client(); + let mut mock = mock_worker_client(); mock.expect_respond_legacy_query() .times(1) .returning(move |_, _| Ok(Default::default())); @@ -875,9 +875,9 @@ async fn build_id_set_properly_on_query_on_first_task() { ); pr }]); - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client.expect_respond_legacy_query().times(0); - let mh = MockPollCfg::from_resp_batches(wfid, t, tasks, mock_workflow_client()); + let mh = MockPollCfg::from_resp_batches(wfid, t, tasks, 
mock_worker_client()); let mut mock = build_mock_pollers(mh); mock.worker_cfg(|wc| { wc.max_cached_workflows = 10; @@ -962,7 +962,7 @@ async fn queries_arent_lost_in_buffer_void(#[values(false, true)] buffered_becau hist_to_poll_resp(&t, wfid.to_owned(), 2.into()), ]); - let mut mock = mock_workflow_client(); + let mut mock = mock_worker_client(); mock.expect_complete_workflow_task() .returning(|_| Ok(Default::default())); mock.expect_respond_legacy_query() diff --git a/core/src/core_tests/replay_flag.rs b/core/src/core_tests/replay_flag.rs index db4599f32..bb17f4414 100644 --- a/core/src/core_tests/replay_flag.rs +++ b/core/src/core_tests/replay_flag.rs @@ -3,7 +3,7 @@ use crate::{ MockPollCfg, ResponseType, build_fake_sdk, build_mock_pollers, canned_histories, hist_to_poll_resp, mock_worker, }, - worker::{LEGACY_QUERY_ID, client::mocks::mock_workflow_client}, + worker::{LEGACY_QUERY_ID, client::mocks::mock_worker_client}, }; use rstest::{fixture, rstest}; use std::{collections::VecDeque, time::Duration}; @@ -94,7 +94,7 @@ async fn replay_flag_correct_with_query() { }, hist_to_poll_resp(&t, wfid.to_owned(), 2.into()), ]); - let mut mock = MockPollCfg::from_resp_batches(wfid, t, tasks, mock_workflow_client()); + let mut mock = MockPollCfg::from_resp_batches(wfid, t, tasks, mock_worker_client()); mock.num_expected_legacy_query_resps = 1; let mut mock = build_mock_pollers(mock); mock.worker_cfg(|wc| wc.max_cached_workflows = 10); @@ -139,7 +139,7 @@ async fn replay_flag_correct_signal_before_query_ending_on_wft_completed() { pr }; - let mut mock = MockPollCfg::from_resp_batches(wfid, t, [task], mock_workflow_client()); + let mut mock = MockPollCfg::from_resp_batches(wfid, t, [task], mock_worker_client()); mock.num_expected_legacy_query_resps = 1; let mut mock = build_mock_pollers(mock); mock.worker_cfg(|wc| wc.max_cached_workflows = 10); diff --git a/core/src/core_tests/updates.rs b/core/src/core_tests/updates.rs index 854d85d4a..778e6dfb4 100644 --- 
a/core/src/core_tests/updates.rs +++ b/core/src/core_tests/updates.rs @@ -4,7 +4,7 @@ use crate::{ MockPollCfg, PollWFTRespExt, ResponseType, build_mock_pollers, hist_to_poll_resp, mock_worker, }, - worker::client::mocks::mock_workflow_client, + worker::client::mocks::mock_worker_client, }; use temporal_sdk_core_api::Worker; use temporal_sdk_core_protos::{ @@ -108,7 +108,7 @@ async fn initial_request_sent_back(#[values(false, true)] reject: bool) { let mut poll_resp = hist_to_poll_resp(&t, wfid, ResponseType::AllHistory); let upd_req_body = poll_resp.add_update_request(update_id, 1); - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client .expect_complete_workflow_task() .times(1) @@ -171,7 +171,7 @@ async fn speculative_wft_with_command_event() { EventType::ActivityTaskScheduled as i32 ); - let mock_client = mock_workflow_client(); + let mock_client = mock_worker_client(); let mut mh = MockPollCfg::from_resp_batches( wfid, real_hist, diff --git a/core/src/core_tests/workers.rs b/core/src/core_tests/workers.rs index f4ea8482c..483585a12 100644 --- a/core/src/core_tests/workers.rs +++ b/core/src/core_tests/workers.rs @@ -1,19 +1,16 @@ -use crate::{ - PollError, prost_dur, - test_help::{ - MockPollCfg, MockWorkerInputs, MocksHolder, ResponseType, WorkerExt, build_fake_worker, - build_mock_pollers, canned_histories, mock_worker, test_worker_cfg, +use crate::{PollError, prost_dur, test_help::{ + MockPollCfg, MockWorkerInputs, MocksHolder, ResponseType, WorkerExt, build_fake_worker, + build_mock_pollers, canned_histories, mock_worker, test_worker_cfg, +}, worker::{ + self, + client::{ + MockWorkerClient, + mocks::{DEFAULT_TEST_CAPABILITIES, DEFAULT_WORKERS_REGISTRY, mock_worker_client}, }, - worker::{ - self, - client::{ - MockWorkerClient, - mocks::{DEFAULT_TEST_CAPABILITIES, DEFAULT_WORKERS_REGISTRY, mock_workflow_client}, - }, - }, -}; +}, advance_fut}; use futures_util::{stream, stream::StreamExt}; use 
std::{cell::RefCell, time::Duration}; +use mockall::mock; use temporal_sdk_core_api::{Worker, worker::PollerBehavior}; use temporal_sdk_core_protos::{ coresdk::{ @@ -27,6 +24,7 @@ use temporal_sdk_core_protos::{ }; use temporal_sdk_core_test_utils::{WorkerTestHelpers, start_timer_cmd}; use tokio::sync::{Barrier, watch}; +use temporal_sdk_core_protos::temporal::api::workflowservice::v1::{PollActivityTaskQueueResponse, RecordWorkerHeartbeatResponse}; #[tokio::test] async fn after_shutdown_of_worker_get_shutdown_err() { @@ -106,7 +104,7 @@ async fn worker_shutdown_during_poll_doesnt_deadlock() { )) }); let mw = MockWorkerInputs::new(stream.boxed()); - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client .expect_complete_workflow_task() .returning(|_| Ok(RespondWorkflowTaskCompletedResponse::default())); @@ -126,7 +124,7 @@ async fn worker_shutdown_during_poll_doesnt_deadlock() { #[tokio::test] async fn can_shutdown_local_act_only_worker_when_act_polling() { let t = canned_histories::single_timer("1"); - let mock = mock_workflow_client(); + let mock = mock_worker_client(); let mh = MockPollCfg::from_resp_batches("fakeid", t, [1], mock); let mut mock = build_mock_pollers(mh); mock.worker_cfg(|w| { @@ -166,7 +164,7 @@ async fn can_shutdown_local_act_only_worker_when_act_polling() { #[tokio::test] async fn complete_with_task_not_found_during_shutdown() { let t = canned_histories::single_timer("1"); - let mut mock = mock_workflow_client(); + let mut mock = mock_worker_client(); mock.expect_complete_workflow_task() .times(1) .returning(|_| Err(tonic::Status::not_found("Workflow task not found."))); @@ -209,7 +207,7 @@ async fn complete_eviction_after_shutdown_doesnt_panic() { "fakeid", t, [1], - mock_workflow_client(), + mock_worker_client(), )); mh.make_wft_stream_interminable(); let core = mock_worker(mh); @@ -236,7 +234,7 @@ async fn complete_eviction_after_shutdown_doesnt_panic() { #[tokio::test] async fn 
worker_does_not_panic_on_retry_exhaustion_of_nonfatal_net_err() { let t = canned_histories::single_timer("1"); - let mut mock = mock_workflow_client(); + let mut mock = mock_worker_client(); // Return a failure that counts as retryable, and hence we want to be swallowed mock.expect_complete_workflow_task() .times(1) @@ -264,7 +262,7 @@ async fn worker_does_not_panic_on_retry_exhaustion_of_nonfatal_net_err() { #[rstest::rstest] #[tokio::test] async fn worker_can_shutdown_after_never_polling_ok(#[values(true, false)] poll_workflow: bool) { - let mut mock = mock_workflow_client(); + let mut mock = mock_worker_client(); mock.expect_poll_activity_task() .returning(|_, _| Err(tonic::Status::permission_denied("you shall not pass"))); if poll_workflow { @@ -361,3 +359,92 @@ async fn worker_shutdown_api(#[case] use_cache: bool, #[case] api_success: bool) ); }); } + +#[rstest::rstest] +#[tokio::test] +async fn worker_heartbeat() { + let mut mock = mock_worker_client(); // mock worker client + mock + .expect_record_worker_heartbeat() + .times(1) + .returning(|heartbeat| { + let host_info = heartbeat.host_info.clone().unwrap(); + println!("heartbeat: {:?}", heartbeat); + assert_eq!(heartbeat.worker_identity, "TODO"); + assert_eq!(heartbeat.worker_instance_key, "TODO"); + assert_eq!(host_info.host_name, gethostname::gethostname().to_string_lossy().to_string()); + assert_eq!(host_info.process_id, std::process::id().to_string()); + assert_eq!(heartbeat.sdk_name, "test-core"); + assert_eq!(heartbeat.sdk_version, "0.0.0"); + // TODO: assert_eq!(heartbeat.task_queue, tasks); + assert!(heartbeat.heartbeat_time.is_some()); + assert!(heartbeat.start_time.is_some()); + + + Ok(RecordWorkerHeartbeatResponse {}) + }); + mock + .expect_poll_activity_task() + // .times(1) + .returning(move |_, _| Ok(PollActivityTaskQueueResponse { + task_token: vec![1], + ..Default::default() + })); + + // or let worker = mock_worker(MocksHolder::from_mock_worker(mock_client, mw)); + let worker = 
worker::Worker::new_test( + test_worker_cfg() + .activity_task_poller_behavior(PollerBehavior::SimpleMaximum(1_usize)) + .build() + .unwrap(), + mock, + ); + // Give time for worker heartbeat timer to fire + tokio::time::sleep(Duration::from_millis(3000)).await; + worker.poll_activity_task().await.unwrap(); + assert!(false); +} + +// #[tokio::test] +// async fn worker_heartbeat1() { +// let mut mock = mock_worker_client(); // mock worker client +// mock +// .expect_record_worker_heartbeat() +// .times(1) +// .returning(|heartbeat| { +// let host_info = heartbeat.host_info.clone().unwrap(); +// println!("heartbeat: {:?}", heartbeat); +// assert_eq!(heartbeat.worker_identity, "TODO"); +// assert_eq!(heartbeat.worker_instance_key, "TODO"); +// assert_eq!(host_info.host_name, gethostname::gethostname().to_string_lossy().to_string()); +// assert_eq!(host_info.process_id, std::process::id().to_string()); +// assert_eq!(heartbeat.sdk_name, "test-core"); +// assert_eq!(heartbeat.sdk_version, "0.0.0"); +// // TODO: assert_eq!(heartbeat.task_queue, tasks); +// assert!(heartbeat.heartbeat_time.is_some()); +// assert!(heartbeat.start_time.is_some()); +// +// +// Ok(RecordWorkerHeartbeatResponse {}) +// }); +// mock +// .expect_poll_activity_task() +// .times(1) +// .returning(move |_, _| Ok(PollActivityTaskQueueResponse { +// task_token: vec![1], +// ..Default::default() +// })); +// +// // or let worker = mock_worker(MocksHolder::from_mock_worker(mock_client, mw)); +// let worker = worker::Worker::new_test( +// test_worker_cfg() +// .activity_task_poller_behavior(PollerBehavior::SimpleMaximum(1_usize)) +// .build() +// .unwrap(), +// mock, +// ); +// // Give time for worker heartbeat timer to fire +// tokio::time::sleep(Duration::from_millis(3000)).await; +// worker.poll_activity_task().await.unwrap(); +// assert!(false); +// } \ No newline at end of file diff --git a/core/src/core_tests/workflow_tasks.rs b/core/src/core_tests/workflow_tasks.rs index 1b7ea2760..ea3530345 
100644 --- a/core/src/core_tests/workflow_tasks.rs +++ b/core/src/core_tests/workflow_tasks.rs @@ -13,7 +13,7 @@ use crate::{ }, worker::{ TunerBuilder, - client::mocks::{mock_manual_workflow_client, mock_workflow_client}, + client::mocks::{mock_manual_worker_client, mock_worker_client}, }, }; use futures_util::{FutureExt, stream}; @@ -499,7 +499,7 @@ async fn abandoned_activities_ignore_start_and_complete(hist_batches: &'static [ t.add_timer_fired(timer_started_event_id, "2".to_string()); t.add_full_wf_task(); t.add_workflow_execution_completed(); - let mock = mock_workflow_client(); + let mock = mock_worker_client(); let mut worker = mock_sdk(MockPollCfg::from_resp_batches(wfid, t, hist_batches, mock)); worker.register_wf(wf_type.to_owned(), |ctx: WfContext| async move { @@ -1156,7 +1156,7 @@ async fn wft_timeout_repro(hist_batches: &'static [usize]) { async fn complete_after_eviction() { let wfid = "fake_wf_id"; let t = canned_histories::single_timer("1"); - let mut mock = mock_workflow_client(); + let mut mock = mock_worker_client(); mock.expect_complete_workflow_task().times(0); let mock = single_hist_mock_sg(wfid, t, [2], mock, true); let core = mock_worker(mock); @@ -1194,7 +1194,7 @@ async fn sends_appropriate_sticky_task_queue_responses() { // include the information that tells the server to enqueue the next task on a sticky queue. 
let wfid = "fake_wf_id"; let t = canned_histories::single_timer("1"); - let mut mock = mock_workflow_client(); + let mut mock = mock_worker_client(); mock.expect_complete_workflow_task() .withf(|comp| comp.sticky_attributes.is_some()) .times(1) @@ -1218,7 +1218,7 @@ async fn sends_appropriate_sticky_task_queue_responses() { async fn new_server_work_while_eviction_outstanding_doesnt_overwrite_activation() { let wfid = "fake_wf_id"; let t = canned_histories::single_timer("1"); - let mock = single_hist_mock_sg(wfid, t, [1, 2], mock_workflow_client(), false); + let mock = single_hist_mock_sg(wfid, t, [1, 2], mock_worker_client(), false); let taskmap = mock.outstanding_task_map.clone().unwrap(); let core = mock_worker(mock); @@ -1279,7 +1279,7 @@ async fn buffered_work_drained_on_shutdown() { tasks.extend( std::iter::repeat_with(|| hist_to_poll_resp(&t, wfid.to_owned(), 2.into()).resp).take(50), ); - let mut mock = mock_workflow_client(); + let mut mock = mock_worker_client(); mock.expect_complete_workflow_task() .returning(|_| Ok(RespondWorkflowTaskCompletedResponse::default())); let mut mock = MocksHolder::from_wft_stream(mock, stream::iter(tasks)); @@ -1323,7 +1323,7 @@ async fn fail_wft_then_recover() { t, // We need to deliver all of history twice because of eviction [ResponseType::AllHistory, ResponseType::AllHistory], - mock_workflow_client(), + mock_worker_client(), ); mh.num_expected_fails = 1; mh.expect_fail_wft_matcher = @@ -1388,7 +1388,7 @@ async fn poll_response_triggers_wf_error() { "fake_wf_id", t, [ResponseType::AllHistory], - mock_workflow_client(), + mock_worker_client(), ); // Fail wft will be called when auto-failing. 
mh.num_expected_fails = 1; @@ -1418,7 +1418,7 @@ async fn lang_slower_than_wft_timeouts() { t.add_full_wf_task(); t.add_workflow_execution_completed(); - let mut mock = mock_workflow_client(); + let mut mock = mock_worker_client(); mock.expect_complete_workflow_task() .times(1) .returning(|_| Err(tonic::Status::not_found("Workflow task not found."))); @@ -1476,7 +1476,7 @@ async fn tries_cancel_of_completed_activity() { t.add_activity_task_completed(scheduled_event_id, started_event_id, Default::default()); t.add_workflow_task_scheduled_and_started(); - let mock = mock_workflow_client(); + let mock = mock_worker_client(); let mut mock = single_hist_mock_sg("fake_wf_id", t, [1, 2], mock, true); mock.worker_cfg(|cfg| cfg.max_cached_workflows = 1); let core = mock_worker(mock); @@ -1524,7 +1524,7 @@ async fn failing_wft_doesnt_eat_permit_forever() { t.add_by_type(EventType::WorkflowExecutionStarted); t.add_workflow_task_scheduled_and_started(); - let mock = mock_workflow_client(); + let mock = mock_worker_client(); let mut mock = MockPollCfg::from_resp_batches("fake_wf_id", t, [1, 1, 1], mock); mock.num_expected_fails = 1; let mut mock = build_mock_pollers(mock); @@ -1586,7 +1586,7 @@ async fn cache_miss_will_fetch_history() { "fake_wf_id", t, [ResponseType::ToTaskNum(1), ResponseType::OneTask(2)], - mock_workflow_client(), + mock_worker_client(), ); mh.mock_client .expect_get_workflow_execution_history() @@ -1682,7 +1682,7 @@ async fn history_byte_size_and_can_suggestion_in_activation() { "fake_wf_id", t, [ResponseType::ToTaskNum(1), ResponseType::OneTask(2)], - mock_workflow_client(), + mock_worker_client(), ); let mut mock = build_mock_pollers(mh); mock.worker_cfg(|cfg| cfg.max_cached_workflows = 1); @@ -1713,7 +1713,7 @@ async fn tasks_from_completion_are_delivered() { t.add_full_wf_task(); t.add_workflow_execution_completed(); - let mut mock = mock_workflow_client(); + let mut mock = mock_worker_client(); let complete_resp = RespondWorkflowTaskCompletedResponse { 
workflow_task: Some(hist_to_poll_resp(&t, wfid.to_owned(), 2.into()).resp), activity_tasks: vec![], @@ -1758,7 +1758,7 @@ async fn pagination_works_with_tasks_from_completion() { t.add_we_signaled("sig", vec![]); t.add_workflow_task_scheduled_and_started(); - let mut mock = mock_workflow_client(); + let mut mock = mock_worker_client(); let mut needs_pag_resp = hist_to_poll_resp(&t, wfid.to_owned(), ResponseType::OneTask(2)).resp; needs_pag_resp.next_page_token = vec![1]; let complete_resp = RespondWorkflowTaskCompletedResponse { @@ -1812,7 +1812,7 @@ async fn poll_faster_than_complete_wont_overflow_cache() { response_batches: vec![ResponseType::ToTaskNum(1)], }) .collect(); - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client .expect_complete_workflow_task() .times(3) @@ -1937,7 +1937,7 @@ async fn poll_faster_than_complete_wont_overflow_cache() { async fn eviction_waits_until_replay_finished() { let wfid = "fake_wf_id"; let t = canned_histories::long_sequential_timers(3); - let mock = mock_workflow_client(); + let mock = mock_worker_client(); let mock = single_hist_mock_sg(wfid, t, [3], mock, true); let core = mock_worker(mock); @@ -1998,7 +1998,7 @@ async fn autocompletes_wft_no_work() { let started_event_id = t.add_activity_task_started(scheduled_event_id); t.add_activity_task_completed(scheduled_event_id, started_event_id, Default::default()); t.add_full_wf_task(); - let mock = mock_workflow_client(); + let mock = mock_worker_client(); let mut mock = single_hist_mock_sg(wfid, t, [1, 2, 3, 4], mock, true); mock.worker_cfg(|w| w.max_cached_workflows = 1); let core = mock_worker(mock); @@ -2052,7 +2052,7 @@ async fn autocompletes_wft_no_work() { #[tokio::test] async fn no_race_acquiring_permits() { let wfid = "fake_wf_id"; - let mut mock_client = mock_manual_workflow_client(); + let mut mock_client = mock_manual_worker_client(); // We need to allow two polls to happen by triggering two processing events in the 
workflow // stream, but then delivering the actual tasks after that let task_barr: &'static Barrier = Box::leak(Box::new(Barrier::new(2))); @@ -2127,7 +2127,7 @@ async fn continue_as_new_preserves_some_values() { wes_attrs.memo = Some(memo); wes_attrs.search_attributes = Some(search); wes_attrs.retry_policy = Some(retry_policy); - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); let t = { let mut t = TestHistoryBuilder::default(); t.add(wes_attrs.clone()); @@ -2185,7 +2185,7 @@ async fn ignorable_events_are_ok(#[values(true, false)] attribs_unset: bool) { }); t.add_workflow_task_scheduled_and_started(); - let mock = mock_workflow_client(); + let mock = mock_worker_client(); let mock = single_hist_mock_sg("wheee", t, [ResponseType::AllHistory], mock, true); let core = mock_worker(mock); @@ -2198,7 +2198,7 @@ async fn ignorable_events_are_ok(#[values(true, false)] attribs_unset: bool) { #[tokio::test] async fn fetching_to_continue_replay_works() { - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); let mut t = TestHistoryBuilder::default(); t.add_by_type(EventType::WorkflowExecutionStarted); t.add_full_wf_task(); @@ -2269,7 +2269,7 @@ async fn fetching_to_continue_replay_works() { #[tokio::test] async fn fetching_error_evicts_wf() { - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); let mut t = TestHistoryBuilder::default(); t.add_by_type(EventType::WorkflowExecutionStarted); t.add_workflow_task_scheduled_and_started(); @@ -2336,7 +2336,7 @@ async fn ensure_fetching_fail_during_complete_sends_task_failure() { next_page.history.as_mut().unwrap().events.truncate(9); next_page.next_page_token = vec![2]; - let mut mock = mock_workflow_client(); + let mut mock = mock_worker_client(); mock.expect_get_workflow_execution_history() .returning(move |_, _, _| { error!("Called fetch!"); @@ -2392,7 +2392,7 @@ async fn lang_internal_flags() { 
"fake_wf_id", t, [ResponseType::ToTaskNum(2), ResponseType::AllHistory], - mock_workflow_client(), + mock_worker_client(), ); mh.completion_mock_fn = Some(Box::new(|c| { assert_matches!(c.sdk_metadata.lang_used_flags.as_slice(), &[2]); @@ -2433,7 +2433,7 @@ async fn lang_internal_flag_with_update() { "fake_wf_id", t, [ResponseType::AllHistory], - mock_workflow_client(), + mock_worker_client(), ); let mut mock = build_mock_pollers(mh); mock.worker_cfg(|wc| wc.max_cached_workflows = 1); @@ -2483,7 +2483,7 @@ async fn core_internal_flags() { "fake_wf_id", t, [ResponseType::ToTaskNum(1)], - mock_workflow_client(), + mock_worker_client(), ); mh.completion_mock_fn = Some(Box::new(move |c| { assert_eq!( @@ -2585,7 +2585,7 @@ async fn _do_post_terminal_commands_test( t: TestHistoryBuilder, ) { let mut mh = - MockPollCfg::from_resp_batches("fake_wf_id", t, response_types, mock_workflow_client()); + MockPollCfg::from_resp_batches("fake_wf_id", t, response_types, mock_worker_client()); if let Some(expected_command_types) = expected_command_types { mh.num_expected_completions = Some(TimesRange::from(1)); mh.completion_mock_fn = Some(Box::new(move |c| { @@ -2636,7 +2636,7 @@ async fn jobs_are_in_appropriate_order() { "fake_wf_id", t, [ResponseType::AllHistory], - mock_workflow_client(), + mock_worker_client(), ); let mut mock = build_mock_pollers(mh); mock.worker_cfg(|wc| wc.max_cached_workflows = 1); @@ -2705,7 +2705,7 @@ async fn history_length_with_fail_and_timeout( t.add_full_wf_task(); t.add_workflow_execution_completed(); - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); let history_responses = match history_responses_case { 1 => vec![ResponseType::AllHistory], 2 => vec![ @@ -2779,7 +2779,7 @@ async fn poller_wont_run_ahead_of_task_slots() { ) .resp }); - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client .expect_poll_workflow_task() .returning(move |_, _| 
Ok(bunch_of_first_tasks.next().unwrap())); @@ -2837,7 +2837,7 @@ async fn poller_wont_run_ahead_of_task_slots() { #[tokio::test] async fn poller_wont_poll_until_lang_polls() { - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); let (tx, rx) = sync_channel(101); // Normally you'd just not set any expectations, but the problem is since we never poll // the WFT stream, we'll never join the tasks running the pollers and thus the error @@ -2878,7 +2878,7 @@ async fn use_compatible_version_flag( #[values("activity", "child_wf", "continue_as_new")] command_type: &'static str, ) { let wfid = "fake_wf_id"; - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); let t = { let mut t = TestHistoryBuilder::default(); t.add_by_type(EventType::WorkflowExecutionStarted); @@ -2974,7 +2974,7 @@ async fn sets_build_id_from_wft_complete() { t.add_timer_fired(timer_started_event_id, "2".to_string()); t.add_workflow_task_scheduled_and_started(); - let mock = mock_workflow_client(); + let mock = mock_worker_client(); let mut worker = mock_sdk_cfg( MockPollCfg::from_resp_batches(wfid, t, [ResponseType::AllHistory], mock), |cfg| { @@ -3026,7 +3026,7 @@ async fn slot_provider_cant_hand_out_more_permits_than_cache_size() { ) .resp }); - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client .expect_poll_workflow_task() .returning(move |_, _| Ok(bunch_of_first_tasks.next().unwrap())); @@ -3189,7 +3189,7 @@ async fn both_normal_and_sticky_pollers_poll_concurrently() { .resp }); - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); // Track normal vs sticky poll requests and return actual workflow tasks let cc = Arc::clone(&counters); @@ -3261,6 +3261,7 @@ async fn both_normal_and_sticky_pollers_poll_concurrently() { Some("stickytq".to_string()), Arc::new(mock_client), None, + None, ); for _ in 1..50 { diff --git 
a/core/src/lib.rs b/core/src/lib.rs index 8e43bf82c..c80883b81 100644 --- a/core/src/lib.rs +++ b/core/src/lib.rs @@ -62,6 +62,7 @@ use crate::{ use anyhow::bail; use futures_util::Stream; use std::sync::Arc; +use parking_lot::Mutex; use temporal_client::{ConfiguredClient, NamespacedClient, TemporalServiceClientWithMetrics}; use temporal_sdk_core_api::{ Worker as WorkerTrait, @@ -98,18 +99,27 @@ where } let client_ident = client.get_identity().to_owned(); let sticky_q = sticky_q_name_for_worker(&client_ident, &worker_config); - let client_bag = Arc::new(WorkerClientBag::new( + let mut client_bag = Arc::new(WorkerClientBag::new( client, worker_config.namespace.clone(), client_ident, worker_config.versioning_strategy.clone(), )); + let heartbeat_info = Arc::new(Mutex::new(worker::WorkerHeartbeatInfo::new(worker_config.clone(), client_bag.clone()))); + + + // TODO: Adding this afterwards feels a little clunky + client_bag.add_heartbeat_info(heartbeat_info.clone()); + + + Ok(Worker::new( worker_config, sticky_q, client_bag, Some(&runtime.telemetry), + Some(heartbeat_info), )) } diff --git a/core/src/pollers/poll_buffer.rs b/core/src/pollers/poll_buffer.rs index 47ef08924..4053f784d 100644 --- a/core/src/pollers/poll_buffer.rs +++ b/core/src/pollers/poll_buffer.rs @@ -697,7 +697,7 @@ mod tests { use super::*; use crate::{ abstractions::tests::fixed_size_permit_dealer, - worker::client::mocks::mock_manual_workflow_client, + worker::client::mocks::mock_manual_worker_client, }; use futures_util::FutureExt; use std::time::Duration; @@ -705,7 +705,7 @@ mod tests { #[tokio::test] async fn only_polls_once_with_1_poller() { - let mut mock_client = mock_manual_workflow_client(); + let mut mock_client = mock_manual_worker_client(); mock_client .expect_poll_workflow_task() .times(2) diff --git a/core/src/replay/mod.rs b/core/src/replay/mod.rs index a2c8b4764..650070b20 100644 --- a/core/src/replay/mod.rs +++ b/core/src/replay/mod.rs @@ -6,7 +6,7 @@ use crate::{ Worker, worker::{ 
PostActivateHookData, - client::mocks::{MockManualWorkerClient, mock_manual_workflow_client}, + client::mocks::{MockManualWorkerClient, mock_manual_worker_client}, }, }; use futures_util::{FutureExt, Stream, StreamExt}; @@ -73,7 +73,7 @@ where let mut client = if let Some(c) = self.client_override { c } else { - mock_manual_workflow_client() + mock_manual_worker_client() }; let hist_allow_tx = historator.replay_done_tx.clone(); @@ -114,7 +114,7 @@ where hist_allow_tx.send("Failed".to_string()).unwrap(); async move { Ok(RespondWorkflowTaskFailedResponse::default()) }.boxed() }); - let mut worker = Worker::new(self.config, None, Arc::new(client), None); + let mut worker = Worker::new(self.config, None, Arc::new(client), None, None); worker.set_post_activate_hook(post_activate); shutdown_tok(worker.shutdown_token()); Ok(worker) diff --git a/core/src/test_help/mod.rs b/core/src/test_help/mod.rs index 72c167f4a..9690a9283 100644 --- a/core/src/test_help/mod.rs +++ b/core/src/test_help/mod.rs @@ -9,7 +9,7 @@ use crate::{ worker::{ TaskPollers, client::{ - MockWorkerClient, WorkerClient, WorkflowTaskCompletion, mocks::mock_workflow_client, + MockWorkerClient, WorkerClient, WorkflowTaskCompletion, mocks::mock_worker_client, }, }, }; @@ -179,6 +179,7 @@ pub(crate) fn mock_worker(mocks: MocksHolder) -> Worker { .unwrap_or_else(|| mock_poller_from_resps([])), }, None, + None, // TODO: set this up properly ) } @@ -426,7 +427,7 @@ impl MockPollCfg { enforce_correct_number_of_polls, num_expected_fails, num_expected_legacy_query_resps: 0, - mock_client: mock_workflow_client(), + mock_client: mock_worker_client(), expect_fail_wft_matcher: Box::new(|_, _, _| true), completion_mock_fn: None, num_expected_completions: None, @@ -439,14 +440,14 @@ impl MockPollCfg { pub(crate) fn from_hist_builder(t: TestHistoryBuilder) -> Self { let full_hist_info = t.get_full_history_info().unwrap(); let tasks = 1..=full_hist_info.wf_task_count(); - Self::from_resp_batches("fake_wf_id", t, tasks, 
mock_workflow_client()) + Self::from_resp_batches("fake_wf_id", t, tasks, mock_worker_client()) } pub(crate) fn from_resps( t: TestHistoryBuilder, resps: impl IntoIterator>, ) -> Self { - Self::from_resp_batches("fake_wf_id", t, resps, mock_workflow_client()) + Self::from_resp_batches("fake_wf_id", t, resps, mock_worker_client()) } pub(crate) fn from_resp_batches( diff --git a/core/src/worker/activities.rs b/core/src/worker/activities.rs index 0faf06bf7..505d5c840 100644 --- a/core/src/worker/activities.rs +++ b/core/src/worker/activities.rs @@ -726,14 +726,14 @@ mod tests { abstractions::tests::fixed_size_permit_dealer, pollers::{ActivityTaskOptions, LongPollBuffer}, prost_dur, - worker::client::mocks::mock_workflow_client, + worker::client::mocks::mock_worker_client, }; use temporal_sdk_core_api::worker::PollerBehavior; use temporal_sdk_core_protos::coresdk::activity_result::ActivityExecutionResult; #[tokio::test] async fn per_worker_ratelimit() { - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client .expect_poll_activity_task() .times(1) @@ -812,7 +812,7 @@ mod tests { #[tokio::test] async fn local_timeouts() { - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client .expect_poll_activity_task() .times(1) @@ -902,7 +902,7 @@ mod tests { #[tokio::test] async fn local_timeout_heartbeating() { - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client .expect_poll_activity_task() .times(1) diff --git a/core/src/worker/activities/activity_heartbeat_manager.rs b/core/src/worker/activities/activity_heartbeat_manager.rs index 88c0e902a..145c80ef6 100644 --- a/core/src/worker/activities/activity_heartbeat_manager.rs +++ b/core/src/worker/activities/activity_heartbeat_manager.rs @@ -423,7 +423,7 @@ impl HeartbeatStreamState { mod test { use super::*; - use crate::worker::client::mocks::mock_workflow_client; + use 
crate::worker::client::mocks::mock_worker_client; use std::time::Duration; use temporal_sdk_core_protos::temporal::api::{ common::v1::Payload, workflowservice::v1::RecordActivityTaskHeartbeatResponse, @@ -434,7 +434,7 @@ mod test { /// every 1/2 of the heartbeat timeout. #[tokio::test] async fn process_heartbeats_and_shutdown() { - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client .expect_record_activity_heartbeat() .returning(|_, _| Ok(RecordActivityTaskHeartbeatResponse::default())) @@ -456,7 +456,7 @@ mod test { #[tokio::test] async fn send_heartbeats_less_frequently_than_throttle_interval() { - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client .expect_record_activity_heartbeat() .returning(|_, _| Ok(RecordActivityTaskHeartbeatResponse::default())) @@ -475,7 +475,7 @@ mod test { /// Ensure that heartbeat can be called from a tight loop and correctly throttle #[tokio::test] async fn process_tight_loop_and_shutdown() { - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client .expect_record_activity_heartbeat() .returning(|_, _| Ok(RecordActivityTaskHeartbeatResponse::default())) @@ -495,7 +495,7 @@ mod test { /// This test reports one heartbeat and waits for the throttle_interval to elapse before sending another #[tokio::test] async fn report_heartbeat_after_timeout() { - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client .expect_record_activity_heartbeat() .returning(|_, _| Ok(RecordActivityTaskHeartbeatResponse::default())) @@ -513,7 +513,7 @@ mod test { #[tokio::test] async fn evict_works() { - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client .expect_record_activity_heartbeat() .returning(|_, _| Ok(RecordActivityTaskHeartbeatResponse::default())) @@ -534,7 +534,7 @@ mod test { #[tokio::test] async fn 
evict_immediate_after_record() { - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client .expect_record_activity_heartbeat() .returning(|_, _| Ok(RecordActivityTaskHeartbeatResponse::default())) diff --git a/core/src/worker/client.rs b/core/src/worker/client.rs index 238d5a9b7..9ffae56a9 100644 --- a/core/src/worker/client.rs +++ b/core/src/worker/client.rs @@ -1,7 +1,7 @@ //! Worker-specific client needs pub(crate) mod mocks; -use parking_lot::RwLock; +use parking_lot::{Mutex, RwLock}; use std::{sync::Arc, time::Duration}; use temporal_client::{ Client, IsWorkerTaskLongPoll, Namespace, NamespacedClient, NoRetryOnMatching, RetryClient, @@ -31,6 +31,8 @@ use temporal_sdk_core_protos::{ }, }; use tonic::IntoRequest; +use temporal_sdk_core_protos::temporal::api::worker::v1::WorkerHeartbeat; +use crate::worker::heartbeat::WorkerHeartbeatInfo; type Result = std::result::Result; @@ -40,6 +42,7 @@ pub(crate) struct WorkerClientBag { namespace: String, identity: String, worker_versioning_strategy: WorkerVersioningStrategy, + heartbeat_info: Option>>, } impl WorkerClientBag { @@ -54,6 +57,7 @@ impl WorkerClientBag { namespace, identity, worker_versioning_strategy, + heartbeat_info: None, } } @@ -114,6 +118,29 @@ impl WorkerClientBag { None } } + + pub(crate) fn add_heartbeat_info(&mut self, heartbeat_info: Arc>) { + self.heartbeat_info = Some(heartbeat_info); + } + + fn capture_heartbeat(&self) -> Option { + if let Some(heartbeat_info) = self.heartbeat_info.as_ref() { + Some(heartbeat_info.lock().capture_heartbeat()) + } else { + warn!("WorkerClientBag missing client, unable to send worker heartbeat"); + None + } + } + + /// Wrap the `record_worker_heartbeat` call to allow mocking of the trait call for testing + pub(crate) async fn worker_heartbeat(&self) -> Result { + if let Some(heartbeat) = self.capture_heartbeat() { + self.record_worker_heartbeat(heartbeat).await + } else { + // TODO: + 
Ok(RecordWorkerHeartbeatResponse::default()) + } + } } /// This trait contains everything workers need to interact with Temporal, and hence provides a @@ -203,6 +230,8 @@ pub trait WorkerClient: Sync + Send { async fn describe_namespace(&self) -> Result; /// Shutdown the worker async fn shutdown_worker(&self, sticky_task_queue: String) -> Result; + /// Record a worker heartbeat + async fn record_worker_heartbeat(&self, heartbeat: WorkerHeartbeat) -> Result; /// Replace the underlying client fn replace_client(&self, new_client: RetryClient); @@ -267,7 +296,7 @@ impl WorkerClient for WorkerClientBag { binary_checksum: self.binary_checksum(), worker_version_capabilities: self.worker_version_capabilities(), deployment_options: self.deployment_options(), - worker_heartbeat: None, + worker_heartbeat: self.capture_heartbeat(), } .into_request(); request.extensions_mut().insert(IsWorkerTaskLongPoll); @@ -304,7 +333,7 @@ impl WorkerClient for WorkerClientBag { }), worker_version_capabilities: self.worker_version_capabilities(), deployment_options: self.deployment_options(), - worker_heartbeat: None, + worker_heartbeat: self.capture_heartbeat(), } .into_request(); request.extensions_mut().insert(IsWorkerTaskLongPoll); @@ -337,7 +366,7 @@ impl WorkerClient for WorkerClientBag { identity: self.identity.clone(), worker_version_capabilities: self.worker_version_capabilities(), deployment_options: self.deployment_options(), - worker_heartbeat: None, + worker_heartbeat: self.capture_heartbeat(), } .into_request(); request.extensions_mut().insert(IsWorkerTaskLongPoll); @@ -615,7 +644,7 @@ impl WorkerClient for WorkerClientBag { identity: self.identity.clone(), sticky_task_queue, reason: "graceful shutdown".to_string(), - worker_heartbeat: None, + worker_heartbeat: self.capture_heartbeat(), }; Ok( @@ -630,6 +659,19 @@ impl WorkerClient for WorkerClientBag { *replaceable_client = new_client; } + async fn record_worker_heartbeat(&self, heartbeat: WorkerHeartbeat) -> Result { + 
println!("[record_worker_heartbeat] received heartbeat {:?}", heartbeat); + Ok(self + .cloned_client() + .record_worker_heartbeat(RecordWorkerHeartbeatRequest { + namespace: self.namespace.clone(), + identity: self.identity.clone(), + worker_heartbeat: Some(heartbeat), + }) + .await? + .into_inner()) + } + fn capabilities(&self) -> Option { let client = self.replaceable_client.read(); client.get_client().inner().capabilities().cloned() diff --git a/core/src/worker/client/mocks.rs b/core/src/worker/client/mocks.rs index c626d384a..dc2ed21f3 100644 --- a/core/src/worker/client/mocks.rs +++ b/core/src/worker/client/mocks.rs @@ -22,7 +22,7 @@ pub(crate) static DEFAULT_TEST_CAPABILITIES: &Capabilities = &Capabilities { #[cfg(test)] /// Create a mock client primed with basic necessary expectations -pub(crate) fn mock_workflow_client() -> MockWorkerClient { +pub(crate) fn mock_worker_client() -> MockWorkerClient { let mut r = MockWorkerClient::new(); r.expect_capabilities() .returning(|| Some(*DEFAULT_TEST_CAPABILITIES)); @@ -37,7 +37,7 @@ pub(crate) fn mock_workflow_client() -> MockWorkerClient { } /// Create a mock manual client primed with basic necessary expectations -pub(crate) fn mock_manual_workflow_client() -> MockManualWorkerClient { +pub(crate) fn mock_manual_worker_client() -> MockManualWorkerClient { let mut r = MockManualWorkerClient::new(); r.expect_capabilities() .returning(|| Some(*DEFAULT_TEST_CAPABILITIES)); @@ -146,6 +146,8 @@ mockall::mock! 
{ fn shutdown_worker<'a, 'b>(&self, sticky_task_queue: String) -> impl Future> + Send + 'b where 'a: 'b, Self: 'b; + fn record_worker_heartbeat<'a, 'b>(&self, heartbeat: WorkerHeartbeat) -> impl Future> + Send + 'b where 'a: 'b, Self: 'b; + fn replace_client(&self, new_client: RetryClient); fn capabilities(&self) -> Option; fn workers(&self) -> Arc; diff --git a/core/src/worker/heartbeat.rs b/core/src/worker/heartbeat.rs new file mode 100644 index 000000000..092de6790 --- /dev/null +++ b/core/src/worker/heartbeat.rs @@ -0,0 +1,152 @@ +use std::sync::Arc; +use std::time::{Duration, SystemTime}; +use futures_util::future; +use futures_util::future::AbortHandle; +use gethostname::gethostname; +use parking_lot::Mutex; +use prost_types::Duration as PbDuration; +use uuid::Uuid; +use temporal_sdk_core_api::worker::WorkerConfig; +use temporal_sdk_core_protos::temporal::api::worker::v1::{WorkerHeartbeat, WorkerHostInfo}; +use crate::worker::client::WorkerClientBag; +use crate::WorkerClient; + +type Result = std::result::Result; + +/// Heartbeat information +/// +/// Note: Experimental +pub struct WorkerHeartbeatInfo { + pub(crate) data: Arc>, + timer_abort: AbortHandle, + client: Arc, + interval: Option, +} + +impl WorkerHeartbeatInfo { + /// Create a new WorkerHeartbeatInfo. A timer is immediately started to track the worker + /// heartbeat interval. + pub(crate) fn new(worker_config: WorkerConfig, client: Arc) -> Self { + // spawn heartbeat things here, then on capture_heartbeat, send signal to thread + let (abort_handle, _) = AbortHandle::new_pair(); + + let mut heartbeat = Self { + data: Arc::new(Mutex::new(WorkerHeartbeatData::new(worker_config.clone()))), + timer_abort: abort_handle, + client, + interval: worker_config.heartbeat_interval, + }; + heartbeat.create_new_timer(); + heartbeat + } + + // TODO: This is called by Client when it sends other requests. + /// Transform heartbeat data into `WorkerHeartbeat` we can send in gRPC request. 
Some + /// metrics are also cached for future calls of this function. + pub(crate) fn capture_heartbeat(&mut self) -> WorkerHeartbeat { + self.create_new_timer(); + + self.data.lock().capture_heartbeat() + } + + fn create_new_timer(&mut self) { + println!("[create_new_timer]"); + self.timer_abort.abort(); + + let (abort_handle, abort_reg) = AbortHandle::new_pair(); + let client = self.client.clone(); + let interval = if let Some(dur) = self.interval { + dur + } else { + Duration::from_secs(60) + }; + let data = self.data.clone(); + let client = self.client.clone(); + tokio::spawn(future::Abortable::new( + async move { + println!("sleeping for {:?}", interval); + tokio::time::sleep(interval).await; + println!("sleep done"); + + if let Err(e) = client.clone().record_worker_heartbeat(data.lock().capture_heartbeat()).await { + warn!(error=?e, "Network error while sending worker heartbeat"); + } + + }, + abort_reg, + )); + + self.timer_abort = abort_handle; + } + + // pub(crate) fn add_client(&mut self, client: Arc) { + // println!("[add_client]"); + // self.client = Some(client); + // } +} + +#[derive(Debug, Clone)] +pub(crate) struct WorkerHeartbeatData { + worker_instance_key: String, + pub(crate) worker_identity: String, + host_info: WorkerHostInfo, + // Time of the last heartbeat. 
This is used to both for heartbeat_time and last_heartbeat_time + pub(crate) heartbeat_time: Option, + pub(crate) task_queue: String, + /// SDK name + pub(crate) sdk_name: String, + /// SDK version + pub(crate) sdk_version: String, + /// Worker start time + pub(crate) start_time: SystemTime, +} + +impl WorkerHeartbeatData { + fn new(worker_config: WorkerConfig) -> Self { + Self { + worker_identity: worker_config.client_identity_override.clone().unwrap_or_default(), + host_info: WorkerHostInfo { + host_name: gethostname().to_string_lossy().to_string(), + process_id: std::process::id().to_string(), + ..Default::default() + }, + sdk_name: String::new(), + sdk_version: String::new(), + task_queue: worker_config.task_queue.clone(), + start_time: SystemTime::now(), + heartbeat_time: None, + worker_instance_key: Uuid::new_v4().to_string(), + } + } + + fn capture_heartbeat(&mut self) -> WorkerHeartbeat { + let now = SystemTime::now(); + let elapsed_since_last_heartbeat = if let Some(heartbeat_time) = self.heartbeat_time { + let dur = now.duration_since(heartbeat_time).unwrap_or(Duration::ZERO); // TODO: do we want to fall back to ZERO? 
+ Some(PbDuration { + seconds: dur.as_secs() as i64, + nanos: dur.subsec_nanos() as i32, + }) + } else { + None + }; + + self.heartbeat_time = Some(now.into()); + + let heartbeat = WorkerHeartbeat { + worker_instance_key: self.worker_instance_key.clone(), + worker_identity: self.worker_identity.clone(), + host_info: Some(self.host_info.clone()), + task_queue: self.task_queue.clone(), + sdk_name: self.sdk_name.clone(), + sdk_version: self.sdk_version.clone(), + status: 0, + start_time: Some(self.start_time.into()), + heartbeat_time: Some(SystemTime::now().into()), + elapsed_since_last_heartbeat, + ..Default::default() + }; + println!("[hb]: {:#?}", heartbeat); + heartbeat + } +} \ No newline at end of file diff --git a/core/src/worker/mod.rs b/core/src/worker/mod.rs index c1b16993c..c96f12127 100644 --- a/core/src/worker/mod.rs +++ b/core/src/worker/mod.rs @@ -4,6 +4,7 @@ mod nexus; mod slot_provider; pub(crate) mod tuner; mod workflow; +mod heartbeat; pub use temporal_sdk_core_api::worker::{WorkerConfig, WorkerConfigBuilder}; pub use tuner::{ @@ -54,6 +55,7 @@ use std::{ }, time::Duration, }; +use std::time::SystemTime; use temporal_client::{ConfiguredClient, TemporalServiceClientWithMetrics, WorkerKey}; use temporal_sdk_core_api::{ errors::{CompleteNexusError, WorkerValidationError}, @@ -93,6 +95,7 @@ use { PollActivityTaskQueueResponse, PollNexusTaskQueueResponse, }, }; +pub(crate) use crate::worker::heartbeat::WorkerHeartbeatInfo; /// A worker polls on a certain task queue pub struct Worker { @@ -271,6 +274,7 @@ impl Worker { sticky_queue_name: Option, client: Arc, telem_instance: Option<&TelemetryInstance>, + heartbeat_info: Option>>, ) -> Self { info!(task_queue=%config.task_queue, namespace=%config.namespace, "Initializing worker"); @@ -280,6 +284,7 @@ impl Worker { client, TaskPollers::Real, telem_instance, + heartbeat_info, ) } @@ -297,7 +302,10 @@ impl Worker { #[cfg(test)] pub(crate) fn new_test(config: WorkerConfig, client: impl WorkerClient + 'static) 
-> Self { - Self::new(config, None, Arc::new(client), None) + let heartbeat_info = Arc::new(Mutex::new(WorkerHeartbeatInfo::new(config.clone()))); + let client = Arc::new(client); + heartbeat_info.lock().add_client(client.clone()); + Self::new(config, None, client, None, Some(heartbeat_info)) } pub(crate) fn new_with_pollers( @@ -306,6 +314,7 @@ impl Worker { client: Arc, task_pollers: TaskPollers, telem_instance: Option<&TelemetryInstance>, + heartbeat_info: Option>>, ) -> Self { // TODO: Use existing MetricsContext or a new meter to record and export these metrics, possibly through the same MetricsCallBuffer let (metrics, meter) = if let Some(ti) = telem_instance { @@ -438,17 +447,17 @@ impl Worker { }; let (hb_tx, hb_rx) = unbounded_channel(); - let la_pemit_dealer = MeteredPermitDealer::new( + let la_permit_dealer = MeteredPermitDealer::new( tuner.local_activity_slot_supplier(), metrics.with_new_attrs([local_activity_worker_type()]), None, - slot_context_data, + slot_context_data.clone(), meter.clone(), ); - let la_permits = la_pemit_dealer.get_extant_count_rcv(); + let la_permits = la_permit_dealer.get_extant_count_rcv(); let local_act_mgr = Arc::new(LocalActivityManager::new( config.namespace.clone(), - la_pemit_dealer, + la_permit_dealer, hb_tx, metrics.clone(), )); @@ -485,6 +494,14 @@ impl Worker { ); let worker_key = Mutex::new(client.workers().register(Box::new(provider))); let sdk_name_and_ver = client.sdk_name_and_version(); + if let Some(heartbeat_info) = heartbeat_info { + let mut heartbeat_info = heartbeat_info.lock(); + let mut data = heartbeat_info.data.lock(); + data.sdk_name = sdk_name_and_ver.0.clone(); + data.sdk_version = sdk_name_and_ver.1.clone(); + data.start_time = SystemTime::now(); + } + Self { worker_key, client: client.clone(), @@ -884,7 +901,7 @@ mod tests { use crate::{ advance_fut, test_help::test_worker_cfg, - worker::client::mocks::{mock_manual_workflow_client, mock_workflow_client}, + 
worker::client::mocks::{mock_manual_worker_client, mock_worker_client}, }; use futures_util::FutureExt; use temporal_sdk_core_api::worker::PollerBehavior; @@ -892,7 +909,7 @@ mod tests { #[tokio::test] async fn activity_timeouts_maintain_permit() { - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client .expect_poll_activity_task() .returning(|_, _| Ok(PollActivityTaskQueueResponse::default())); @@ -914,7 +931,7 @@ mod tests { async fn activity_errs_dont_eat_permits() { // Return one error followed by simulating waiting on the poll, otherwise the poller will // loop very fast and be in some indeterminate state. - let mut mock_client = mock_manual_workflow_client(); + let mut mock_client = mock_manual_worker_client(); mock_client .expect_poll_activity_task() .returning(|_, _| async { Err(tonic::Status::internal("ahhh")) }.boxed()) diff --git a/core/src/worker/workflow/history_update.rs b/core/src/worker/workflow/history_update.rs index f929fc503..3568a0c0a 100644 --- a/core/src/worker/workflow/history_update.rs +++ b/core/src/worker/workflow/history_update.rs @@ -803,7 +803,7 @@ mod tests { use crate::{ replay::{HistoryInfo, TestHistoryBuilder}, test_help::{MockPollCfg, ResponseType, canned_histories, hist_to_poll_resp, mock_sdk_cfg}, - worker::client::mocks::mock_workflow_client, + worker::client::mocks::mock_worker_client, }; use futures_util::{StreamExt, TryStreamExt}; use std::sync::atomic::{AtomicUsize, Ordering}; @@ -965,7 +965,7 @@ mod tests { let wft_started = hinfo.workflow_task_started_event_id(); let full_hist = hinfo.into_events(); let initial_hist = full_hist.chunks(chunk_size).next().unwrap().to_vec(); - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); let mut npt = 1; mock_client @@ -1162,7 +1162,7 @@ mod tests { // Chop off the last event, which is WFT started, which server doesn't return in get // history history_from_get.history.as_mut().map(|h| 
h.events.pop()); - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client .expect_get_workflow_execution_history() .returning(move |_, _, _| Ok(history_from_get.clone())); @@ -1220,7 +1220,7 @@ mod tests { let partial_task = timer_hist.get_one_wft(2).unwrap(); let prev_started_wft_id = partial_task.previous_started_event_id(); let wft_started_id = partial_task.workflow_task_started_event_id(); - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client .expect_get_workflow_execution_history() .returning(move |_, _, _| Ok(Default::default())); @@ -1247,7 +1247,7 @@ mod tests { let wft_started_id = partial_task.workflow_task_started_event_id(); let full_resp: GetWorkflowExecutionHistoryResponse = timer_hist.get_full_history_info().unwrap().into(); - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client .expect_get_workflow_execution_history() .returning(move |_, _, _| { @@ -1296,7 +1296,7 @@ mod tests { timer_hist.get_full_history_info().unwrap().into(); full_resp_with_npt.next_page_token = vec![1]; - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client .expect_get_workflow_execution_history() .returning(move |_, _, _| Ok(full_resp_with_npt.clone())) @@ -1375,7 +1375,7 @@ mod tests { resp_1.next_page_token = vec![1]; resp_1.history.as_mut().unwrap().events.truncate(4); - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client .expect_get_workflow_execution_history() .returning(move |_, _, _| Ok(resp_1.clone())) @@ -1486,7 +1486,7 @@ mod tests { t.add_we_signaled("hi", vec![]); t.add_workflow_task_scheduled_and_started(); - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); let events: Vec = t.get_full_history_info().unwrap().into_events(); let first_event = events[0].clone(); @@ -1602,7 
+1602,7 @@ mod tests { let events: Vec = t.get_full_history_info().unwrap().into_events(); let first_event = events[0].clone(); - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); for (i, event) in events.into_iter().enumerate() { // Add an empty page @@ -1722,7 +1722,7 @@ mod tests { t.get_full_history_info().unwrap().into(); resp_1.next_page_token = vec![2]; - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client .expect_get_workflow_execution_history() .returning(move |_, _, _| Ok(resp_1.clone())) @@ -1765,7 +1765,7 @@ mod tests { let workflow_task = t.get_full_history_info().unwrap(); let prev_started_wft_id = workflow_task.previous_started_event_id(); let wft_started_id = workflow_task.workflow_task_started_event_id(); - let mock_client = mock_workflow_client(); + let mock_client = mock_worker_client(); let mut paginator = HistoryPaginator::new( workflow_task.into(), prev_started_wft_id, @@ -1802,7 +1802,7 @@ mod tests { let full_resp: GetWorkflowExecutionHistoryResponse = t.get_full_history_info().unwrap().into(); - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); mock_client .expect_get_workflow_execution_history() .returning(move |_, _, _| Ok(full_resp.clone())) @@ -1839,7 +1839,7 @@ mod tests { let incremental_task = hist_to_poll_resp(&t, "wfid".to_owned(), ResponseType::OneTask(3)).resp; - let mut mock_client = mock_workflow_client(); + let mut mock_client = mock_worker_client(); let mut one_task_resp: GetWorkflowExecutionHistoryResponse = t.get_history_info(1).unwrap().into(); one_task_resp.next_page_token = vec![1]; diff --git a/core/src/worker/workflow/machines/upsert_search_attributes_state_machine.rs b/core/src/worker/workflow/machines/upsert_search_attributes_state_machine.rs index 5d3309e06..dafdbffbd 100644 --- a/core/src/worker/workflow/machines/upsert_search_attributes_state_machine.rs +++ 
b/core/src/worker/workflow/machines/upsert_search_attributes_state_machine.rs @@ -183,7 +183,7 @@ mod tests { replay::TestHistoryBuilder, test_help::{MockPollCfg, ResponseType, build_fake_sdk, build_mock_pollers, mock_worker}, worker::{ - client::mocks::mock_workflow_client, + client::mocks::mock_worker_client, workflow::machines::patch_state_machine::VERSION_SEARCH_ATTR_KEY, }, }; @@ -328,7 +328,7 @@ mod tests { "fakeid", t, [ResponseType::ToTaskNum(1), ResponseType::ToTaskNum(2)], - mock_workflow_client(), + mock_worker_client(), ); // Ensure the upsert command has an empty map when not using the patched command if !with_patched_cmd { From a1d60a614d5f6ce769ca4450fa0479eb0f0e788f Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Fri, 27 Jun 2025 15:49:53 -0700 Subject: [PATCH 03/13] Changed it back to passing client to heartbeat_info, test seems to pass. Now i need to write a test to verify the timer mechanism with an existing gRPC request. Or maybe heartbeating is enough, that it runs twice? 
--- core/src/core_tests/workers.rs | 52 +++------------------------------- core/src/lib.rs | 11 ++++--- core/src/worker/client.rs | 18 +++++------- core/src/worker/heartbeat.rs | 52 ++++++++++++++++++---------------- 4 files changed, 43 insertions(+), 90 deletions(-) diff --git a/core/src/core_tests/workers.rs b/core/src/core_tests/workers.rs index 483585a12..3c8eb97ff 100644 --- a/core/src/core_tests/workers.rs +++ b/core/src/core_tests/workers.rs @@ -370,17 +370,17 @@ async fn worker_heartbeat() { .returning(|heartbeat| { let host_info = heartbeat.host_info.clone().unwrap(); println!("heartbeat: {:?}", heartbeat); - assert_eq!(heartbeat.worker_identity, "TODO"); - assert_eq!(heartbeat.worker_instance_key, "TODO"); + // TODO + assert_eq!(heartbeat.worker_identity, ""); + assert!(!heartbeat.worker_instance_key.is_empty()); assert_eq!(host_info.host_name, gethostname::gethostname().to_string_lossy().to_string()); assert_eq!(host_info.process_id, std::process::id().to_string()); assert_eq!(heartbeat.sdk_name, "test-core"); assert_eq!(heartbeat.sdk_version, "0.0.0"); - // TODO: assert_eq!(heartbeat.task_queue, tasks); + assert_eq!(heartbeat.task_queue, "q"); assert!(heartbeat.heartbeat_time.is_some()); assert!(heartbeat.start_time.is_some()); - Ok(RecordWorkerHeartbeatResponse {}) }); mock @@ -404,47 +404,3 @@ async fn worker_heartbeat() { worker.poll_activity_task().await.unwrap(); assert!(false); } - -// #[tokio::test] -// async fn worker_heartbeat1() { -// let mut mock = mock_worker_client(); // mock worker client -// mock -// .expect_record_worker_heartbeat() -// .times(1) -// .returning(|heartbeat| { -// let host_info = heartbeat.host_info.clone().unwrap(); -// println!("heartbeat: {:?}", heartbeat); -// assert_eq!(heartbeat.worker_identity, "TODO"); -// assert_eq!(heartbeat.worker_instance_key, "TODO"); -// assert_eq!(host_info.host_name, gethostname::gethostname().to_string_lossy().to_string()); -// assert_eq!(host_info.process_id, 
std::process::id().to_string()); -// assert_eq!(heartbeat.sdk_name, "test-core"); -// assert_eq!(heartbeat.sdk_version, "0.0.0"); -// // TODO: assert_eq!(heartbeat.task_queue, tasks); -// assert!(heartbeat.heartbeat_time.is_some()); -// assert!(heartbeat.start_time.is_some()); -// -// -// Ok(RecordWorkerHeartbeatResponse {}) -// }); -// mock -// .expect_poll_activity_task() -// .times(1) -// .returning(move |_, _| Ok(PollActivityTaskQueueResponse { -// task_token: vec![1], -// ..Default::default() -// })); -// -// // or let worker = mock_worker(MocksHolder::from_mock_worker(mock_client, mw)); -// let worker = worker::Worker::new_test( -// test_worker_cfg() -// .activity_task_poller_behavior(PollerBehavior::SimpleMaximum(1_usize)) -// .build() -// .unwrap(), -// mock, -// ); -// // Give time for worker heartbeat timer to fire -// tokio::time::sleep(Duration::from_millis(3000)).await; -// worker.poll_activity_task().await.unwrap(); -// assert!(false); -// } \ No newline at end of file diff --git a/core/src/lib.rs b/core/src/lib.rs index c80883b81..851e0a8fd 100644 --- a/core/src/lib.rs +++ b/core/src/lib.rs @@ -99,18 +99,17 @@ where } let client_ident = client.get_identity().to_owned(); let sticky_q = sticky_q_name_for_worker(&client_ident, &worker_config); - let mut client_bag = Arc::new(WorkerClientBag::new( + let heartbeat_info = Arc::new(Mutex::new(worker::WorkerHeartbeatInfo::new(worker_config.clone()))); + + let client_bag = Arc::new(WorkerClientBag::new( client, worker_config.namespace.clone(), client_ident, worker_config.versioning_strategy.clone(), + heartbeat_info.clone(), )); - - let heartbeat_info = Arc::new(Mutex::new(worker::WorkerHeartbeatInfo::new(worker_config.clone(), client_bag.clone()))); - - // TODO: Adding this afterwards feels a little clunky - client_bag.add_heartbeat_info(heartbeat_info.clone()); + heartbeat_info.lock().add_client(client_bag.clone()); diff --git a/core/src/worker/client.rs b/core/src/worker/client.rs index 
9ffae56a9..1b647a721 100644 --- a/core/src/worker/client.rs +++ b/core/src/worker/client.rs @@ -42,7 +42,7 @@ pub(crate) struct WorkerClientBag { namespace: String, identity: String, worker_versioning_strategy: WorkerVersioningStrategy, - heartbeat_info: Option>>, + heartbeat_info: Arc>, } impl WorkerClientBag { @@ -51,13 +51,14 @@ impl WorkerClientBag { namespace: String, identity: String, worker_versioning_strategy: WorkerVersioningStrategy, + heartbeat_info: Arc> ) -> Self { Self { replaceable_client: RwLock::new(client), namespace, identity, worker_versioning_strategy, - heartbeat_info: None, + heartbeat_info, } } @@ -119,17 +120,12 @@ impl WorkerClientBag { } } - pub(crate) fn add_heartbeat_info(&mut self, heartbeat_info: Arc>) { - self.heartbeat_info = Some(heartbeat_info); - } + // pub(crate) fn add_heartbeat_info(&mut self, heartbeat_info: Arc>) { + // self.heartbeat_info = heartbeat_info; + // } fn capture_heartbeat(&self) -> Option { - if let Some(heartbeat_info) = self.heartbeat_info.as_ref() { - Some(heartbeat_info.lock().capture_heartbeat()) - } else { - warn!("WorkerClientBag missing client, unable to send worker heartbeat"); - None - } + Some(self.heartbeat_info.lock().capture_heartbeat()) } /// Wrap the `record_worker_heartbeat` call to allow mocking of the trait call for testing diff --git a/core/src/worker/heartbeat.rs b/core/src/worker/heartbeat.rs index 092de6790..cf0a3ead5 100644 --- a/core/src/worker/heartbeat.rs +++ b/core/src/worker/heartbeat.rs @@ -19,24 +19,23 @@ type Result = std::result::Result; pub struct WorkerHeartbeatInfo { pub(crate) data: Arc>, timer_abort: AbortHandle, - client: Arc, + client: Option>, interval: Option, } impl WorkerHeartbeatInfo { /// Create a new WorkerHeartbeatInfo. A timer is immediately started to track the worker /// heartbeat interval. 
- pub(crate) fn new(worker_config: WorkerConfig, client: Arc) -> Self { + pub(crate) fn new(worker_config: WorkerConfig) -> Self { // spawn heartbeat things here, then on capture_heartbeat, send signal to thread let (abort_handle, _) = AbortHandle::new_pair(); - let mut heartbeat = Self { + let heartbeat = Self { data: Arc::new(Mutex::new(WorkerHeartbeatData::new(worker_config.clone()))), timer_abort: abort_handle, - client, + client: None, interval: worker_config.heartbeat_interval, }; - heartbeat.create_new_timer(); heartbeat } @@ -54,35 +53,37 @@ impl WorkerHeartbeatInfo { self.timer_abort.abort(); let (abort_handle, abort_reg) = AbortHandle::new_pair(); - let client = self.client.clone(); let interval = if let Some(dur) = self.interval { dur } else { Duration::from_secs(60) }; let data = self.data.clone(); - let client = self.client.clone(); - tokio::spawn(future::Abortable::new( - async move { - println!("sleeping for {:?}", interval); - tokio::time::sleep(interval).await; - println!("sleep done"); - - if let Err(e) = client.clone().record_worker_heartbeat(data.lock().capture_heartbeat()).await { - warn!(error=?e, "Network error while sending worker heartbeat"); - } - - }, - abort_reg, - )); - + if let Some(client) = self.client.clone() { + tokio::spawn(future::Abortable::new( + async move { + println!("sleeping for {:?}", interval); + tokio::time::sleep(interval).await; + println!("sleep done"); + + if let Err(e) = client.clone().record_worker_heartbeat(data.lock().capture_heartbeat()).await { + warn!(error=?e, "Network error while sending worker heartbeat"); + } + + }, + abort_reg, + )); + } else { + warn!("No client attached to heartbeat_info") + }; self.timer_abort = abort_handle; } - // pub(crate) fn add_client(&mut self, client: Arc) { - // println!("[add_client]"); - // self.client = Some(client); - // } + pub(crate) fn add_client(&mut self, client: Arc) { + println!("[add_client]"); + self.client = Some(client); + self.create_new_timer(); + } } 
#[derive(Debug, Clone)] @@ -104,6 +105,7 @@ pub(crate) struct WorkerHeartbeatData { impl WorkerHeartbeatData { fn new(worker_config: WorkerConfig) -> Self { Self { + // TODO: Is this right for worker_identity? worker_identity: worker_config.client_identity_override.clone().unwrap_or_default(), host_info: WorkerHostInfo { host_name: gethostname().to_string_lossy().to_string(), From aef076ee218c104889726883a35f64905a1ae764 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Tue, 1 Jul 2025 11:08:46 -0700 Subject: [PATCH 04/13] Added worker_heartbeat test. Need to test heartbeat timer cancel still --- core/src/core_tests/workers.rs | 110 ++++++++++++++++++++++++++++----- core/src/worker/client.rs | 1 - core/src/worker/mod.rs | 4 +- 3 files changed, 97 insertions(+), 18 deletions(-) diff --git a/core/src/core_tests/workers.rs b/core/src/core_tests/workers.rs index 3c8eb97ff..621e7c74a 100644 --- a/core/src/core_tests/workers.rs +++ b/core/src/core_tests/workers.rs @@ -7,10 +7,9 @@ use crate::{PollError, prost_dur, test_help::{ MockWorkerClient, mocks::{DEFAULT_TEST_CAPABILITIES, DEFAULT_WORKERS_REGISTRY, mock_worker_client}, }, -}, advance_fut}; +}}; use futures_util::{stream, stream::StreamExt}; use std::{cell::RefCell, time::Duration}; -use mockall::mock; use temporal_sdk_core_api::{Worker, worker::PollerBehavior}; use temporal_sdk_core_protos::{ coresdk::{ @@ -24,7 +23,9 @@ use temporal_sdk_core_protos::{ }; use temporal_sdk_core_test_utils::{WorkerTestHelpers, start_timer_cmd}; use tokio::sync::{Barrier, watch}; -use temporal_sdk_core_protos::temporal::api::workflowservice::v1::{PollActivityTaskQueueResponse, RecordWorkerHeartbeatResponse}; +use temporal_sdk_core_protos::coresdk::activity_result::ActivityExecutionResult; +use temporal_sdk_core_protos::coresdk::ActivityTaskCompletion; +use temporal_sdk_core_protos::temporal::api::workflowservice::v1::{PollActivityTaskQueueResponse, RecordWorkerHeartbeatResponse, RespondActivityTaskCompletedResponse}; #[tokio::test] 
async fn after_shutdown_of_worker_get_shutdown_err() { @@ -366,12 +367,10 @@ async fn worker_heartbeat() { let mut mock = mock_worker_client(); // mock worker client mock .expect_record_worker_heartbeat() - .times(1) + .times(2) .returning(|heartbeat| { let host_info = heartbeat.host_info.clone().unwrap(); - println!("heartbeat: {:?}", heartbeat); - // TODO - assert_eq!(heartbeat.worker_identity, ""); + assert!(heartbeat.worker_identity.is_empty()); assert!(!heartbeat.worker_instance_key.is_empty()); assert_eq!(host_info.host_name, gethostname::gethostname().to_string_lossy().to_string()); assert_eq!(host_info.process_id, std::process::id().to_string()); @@ -385,22 +384,103 @@ async fn worker_heartbeat() { }); mock .expect_poll_activity_task() - // .times(1) + .times(1) .returning(move |_, _| Ok(PollActivityTaskQueueResponse { - task_token: vec![1], - ..Default::default() - })); + task_token: vec![1], + activity_id: "act1".to_string(), + ..Default::default() + },)); + mock + .expect_complete_activity_task() + .returning(|_, _| Ok(RespondActivityTaskCompletedResponse::default())); - // or let worker = mock_worker(MocksHolder::from_mock_worker(mock_client, mw)); let worker = worker::Worker::new_test( test_worker_cfg() .activity_task_poller_behavior(PollerBehavior::SimpleMaximum(1_usize)) + .max_outstanding_activities(1_usize) + .heartbeat_interval(Duration::from_millis(200)) .build() .unwrap(), mock, ); // Give time for worker heartbeat timer to fire - tokio::time::sleep(Duration::from_millis(3000)).await; - worker.poll_activity_task().await.unwrap(); - assert!(false); + tokio::time::sleep(Duration::from_millis(500)).await; + let task = worker.poll_activity_task().await.unwrap(); + worker + .complete_activity_task(ActivityTaskCompletion { + task_token: task.task_token, + result: Some(ActivityExecutionResult::ok(vec![1].into())), + }) + .await + .unwrap(); + worker.drain_activity_poller_and_shutdown().await; + // assert!(false); + } + +// +// #[rstest::rstest] +// 
#[tokio::test] +// async fn worker_heartbeat1(#[values(true, false)] poll_activity: bool) { +// let mut mock = mock_worker_client(); +// let heartbeat_calls = if poll_activity { 1 } else { 2 }; +// // let heartbeat_calls = if poll_activity { 2 } else { 1 }; +// let activity_poll_calls = if poll_activity { 1 } else { 0 }; +// mock +// .expect_record_worker_heartbeat() +// .returning(|heartbeat| { +// println!("expect_record_worker_heartbeatexpect_record_worker_heartbeatexpect_record_worker_heartbeatexpect_record_worker_heartbeatexpect_record_worker_heartbeatexpect_record_worker_heartbeat"); +// let host_info = heartbeat.host_info.clone().unwrap(); +// assert!(heartbeat.worker_identity.is_empty()); +// assert!(!heartbeat.worker_instance_key.is_empty()); +// assert_eq!(host_info.host_name, gethostname::gethostname().to_string_lossy().to_string()); +// assert_eq!(host_info.process_id, std::process::id().to_string()); +// assert_eq!(heartbeat.sdk_name, "test-core"); +// assert_eq!(heartbeat.sdk_version, "0.0.0"); +// assert_eq!(heartbeat.task_queue, "q"); +// assert!(heartbeat.heartbeat_time.is_some()); +// assert!(heartbeat.start_time.is_some()); +// +// Ok(RecordWorkerHeartbeatResponse {}) +// }) +// .times(heartbeat_calls); +// mock +// .expect_poll_activity_task() +// .times(activity_poll_calls) +// .returning(move |_, _| Ok(PollActivityTaskQueueResponse { +// task_token: vec![1], +// ..Default::default() +// })); +// mock // We can end up polling again - just return nothing. 
+// .expect_poll_activity_task() +// .returning(|_, _| Ok(Default::default())); +// mock +// .expect_complete_activity_task() +// .returning(|_, _| Ok(RespondActivityTaskCompletedResponse::default())); +// +// let worker = worker::Worker::new_test( +// test_worker_cfg() +// .activity_task_poller_behavior(PollerBehavior::SimpleMaximum(1_usize)) +// .max_outstanding_activities(1_usize) +// .heartbeat_interval(Duration::from_millis(200)) +// .build() +// .unwrap(), +// mock, +// ); +// // Give time for worker heartbeat timer to fire +// println!("// Give time for worker heartbeat timer to fire"); +// tokio::time::sleep(Duration::from_millis(250)).await; +// if poll_activity { +// println!("// Poll activity"); +// let task = worker.poll_activity_task().await.unwrap(); +// worker +// .complete_activity_task(ActivityTaskCompletion { +// task_token: task.task_token, +// result: Some(ActivityExecutionResult::ok(vec![1].into())), +// }) +// .await +// .unwrap(); +// } +// tokio::time::sleep(Duration::from_millis(150)).await; +// worker.drain_activity_poller_and_shutdown().await; +// } diff --git a/core/src/worker/client.rs b/core/src/worker/client.rs index 1b647a721..a70f732a5 100644 --- a/core/src/worker/client.rs +++ b/core/src/worker/client.rs @@ -656,7 +656,6 @@ impl WorkerClient for WorkerClientBag { } async fn record_worker_heartbeat(&self, heartbeat: WorkerHeartbeat) -> Result { - println!("[record_worker_heartbeat] received heartbeat {:?}", heartbeat); Ok(self .cloned_client() .record_worker_heartbeat(RecordWorkerHeartbeatRequest { diff --git a/core/src/worker/mod.rs b/core/src/worker/mod.rs index c96f12127..46f388eb6 100644 --- a/core/src/worker/mod.rs +++ b/core/src/worker/mod.rs @@ -494,8 +494,8 @@ impl Worker { ); let worker_key = Mutex::new(client.workers().register(Box::new(provider))); let sdk_name_and_ver = client.sdk_name_and_version(); - if let Some(heartbeat_info) = heartbeat_info { - let mut heartbeat_info = heartbeat_info.lock(); + if let Some(ref 
heartbeat_info) = heartbeat_info { + let heartbeat_info = heartbeat_info.lock(); let mut data = heartbeat_info.data.lock(); data.sdk_name = sdk_name_and_ver.0.clone(); data.sdk_version = sdk_name_and_ver.1.clone(); From 261201c03e43b221cd863eee598520c83f0ca6e1 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Tue, 1 Jul 2025 15:55:02 -0700 Subject: [PATCH 05/13] Add timer cancel feature to test --- core-api/src/worker.rs | 2 +- core/src/core_tests/workers.rs | 127 --------------------------------- core/src/worker/client.rs | 14 ---- core/src/worker/heartbeat.rs | 125 ++++++++++++++++++++++++++++---- 4 files changed, 111 insertions(+), 157 deletions(-) diff --git a/core-api/src/worker.rs b/core-api/src/worker.rs index aa513e9eb..c866eaf84 100644 --- a/core-api/src/worker.rs +++ b/core-api/src/worker.rs @@ -165,7 +165,7 @@ pub struct WorkerConfig { /// The interval in which the worker will send a heartbeat. /// The timer is reset on each existing RPC call that also happens to send this data, like /// `PollWorkflowTaskQueueRequest`. 
- #[builder(default = "Some(Duration::from_secs(1))")] + #[builder(default)] pub heartbeat_interval: Option, } diff --git a/core/src/core_tests/workers.rs b/core/src/core_tests/workers.rs index 621e7c74a..94e41f632 100644 --- a/core/src/core_tests/workers.rs +++ b/core/src/core_tests/workers.rs @@ -23,9 +23,6 @@ use temporal_sdk_core_protos::{ }; use temporal_sdk_core_test_utils::{WorkerTestHelpers, start_timer_cmd}; use tokio::sync::{Barrier, watch}; -use temporal_sdk_core_protos::coresdk::activity_result::ActivityExecutionResult; -use temporal_sdk_core_protos::coresdk::ActivityTaskCompletion; -use temporal_sdk_core_protos::temporal::api::workflowservice::v1::{PollActivityTaskQueueResponse, RecordWorkerHeartbeatResponse, RespondActivityTaskCompletedResponse}; #[tokio::test] async fn after_shutdown_of_worker_get_shutdown_err() { @@ -360,127 +357,3 @@ async fn worker_shutdown_api(#[case] use_cache: bool, #[case] api_success: bool) ); }); } - -#[rstest::rstest] -#[tokio::test] -async fn worker_heartbeat() { - let mut mock = mock_worker_client(); // mock worker client - mock - .expect_record_worker_heartbeat() - .times(2) - .returning(|heartbeat| { - let host_info = heartbeat.host_info.clone().unwrap(); - assert!(heartbeat.worker_identity.is_empty()); - assert!(!heartbeat.worker_instance_key.is_empty()); - assert_eq!(host_info.host_name, gethostname::gethostname().to_string_lossy().to_string()); - assert_eq!(host_info.process_id, std::process::id().to_string()); - assert_eq!(heartbeat.sdk_name, "test-core"); - assert_eq!(heartbeat.sdk_version, "0.0.0"); - assert_eq!(heartbeat.task_queue, "q"); - assert!(heartbeat.heartbeat_time.is_some()); - assert!(heartbeat.start_time.is_some()); - - Ok(RecordWorkerHeartbeatResponse {}) - }); - mock - .expect_poll_activity_task() - .times(1) - .returning(move |_, _| Ok(PollActivityTaskQueueResponse { - task_token: vec![1], - activity_id: "act1".to_string(), - ..Default::default() - },)); - mock - .expect_complete_activity_task() - 
.returning(|_, _| Ok(RespondActivityTaskCompletedResponse::default())); - - let worker = worker::Worker::new_test( - test_worker_cfg() - .activity_task_poller_behavior(PollerBehavior::SimpleMaximum(1_usize)) - .max_outstanding_activities(1_usize) - .heartbeat_interval(Duration::from_millis(200)) - .build() - .unwrap(), - mock, - ); - // Give time for worker heartbeat timer to fire - tokio::time::sleep(Duration::from_millis(500)).await; - let task = worker.poll_activity_task().await.unwrap(); - worker - .complete_activity_task(ActivityTaskCompletion { - task_token: task.task_token, - result: Some(ActivityExecutionResult::ok(vec![1].into())), - }) - .await - .unwrap(); - worker.drain_activity_poller_and_shutdown().await; - // assert!(false); - -} - -// -// #[rstest::rstest] -// #[tokio::test] -// async fn worker_heartbeat1(#[values(true, false)] poll_activity: bool) { -// let mut mock = mock_worker_client(); -// let heartbeat_calls = if poll_activity { 1 } else { 2 }; -// // let heartbeat_calls = if poll_activity { 2 } else { 1 }; -// let activity_poll_calls = if poll_activity { 1 } else { 0 }; -// mock -// .expect_record_worker_heartbeat() -// .returning(|heartbeat| { -// println!("expect_record_worker_heartbeatexpect_record_worker_heartbeatexpect_record_worker_heartbeatexpect_record_worker_heartbeatexpect_record_worker_heartbeatexpect_record_worker_heartbeat"); -// let host_info = heartbeat.host_info.clone().unwrap(); -// assert!(heartbeat.worker_identity.is_empty()); -// assert!(!heartbeat.worker_instance_key.is_empty()); -// assert_eq!(host_info.host_name, gethostname::gethostname().to_string_lossy().to_string()); -// assert_eq!(host_info.process_id, std::process::id().to_string()); -// assert_eq!(heartbeat.sdk_name, "test-core"); -// assert_eq!(heartbeat.sdk_version, "0.0.0"); -// assert_eq!(heartbeat.task_queue, "q"); -// assert!(heartbeat.heartbeat_time.is_some()); -// assert!(heartbeat.start_time.is_some()); -// -// Ok(RecordWorkerHeartbeatResponse {}) -// }) 
-// .times(heartbeat_calls); -// mock -// .expect_poll_activity_task() -// .times(activity_poll_calls) -// .returning(move |_, _| Ok(PollActivityTaskQueueResponse { -// task_token: vec![1], -// ..Default::default() -// })); -// mock // We can end up polling again - just return nothing. -// .expect_poll_activity_task() -// .returning(|_, _| Ok(Default::default())); -// mock -// .expect_complete_activity_task() -// .returning(|_, _| Ok(RespondActivityTaskCompletedResponse::default())); -// -// let worker = worker::Worker::new_test( -// test_worker_cfg() -// .activity_task_poller_behavior(PollerBehavior::SimpleMaximum(1_usize)) -// .max_outstanding_activities(1_usize) -// .heartbeat_interval(Duration::from_millis(200)) -// .build() -// .unwrap(), -// mock, -// ); -// // Give time for worker heartbeat timer to fire -// println!("// Give time for worker heartbeat timer to fire"); -// tokio::time::sleep(Duration::from_millis(250)).await; -// if poll_activity { -// println!("// Poll activity"); -// let task = worker.poll_activity_task().await.unwrap(); -// worker -// .complete_activity_task(ActivityTaskCompletion { -// task_token: task.task_token, -// result: Some(ActivityExecutionResult::ok(vec![1].into())), -// }) -// .await -// .unwrap(); -// } -// tokio::time::sleep(Duration::from_millis(150)).await; -// worker.drain_activity_poller_and_shutdown().await; -// } diff --git a/core/src/worker/client.rs b/core/src/worker/client.rs index a70f732a5..f57acfe62 100644 --- a/core/src/worker/client.rs +++ b/core/src/worker/client.rs @@ -120,23 +120,9 @@ impl WorkerClientBag { } } - // pub(crate) fn add_heartbeat_info(&mut self, heartbeat_info: Arc>) { - // self.heartbeat_info = heartbeat_info; - // } - fn capture_heartbeat(&self) -> Option { Some(self.heartbeat_info.lock().capture_heartbeat()) } - - /// Wrap the `record_worker_heartbeat` call to allow mocking of the trait call for testing - pub(crate) async fn worker_heartbeat(&self) -> Result { - if let Some(heartbeat) = 
self.capture_heartbeat() { - self.record_worker_heartbeat(heartbeat).await - } else { - // TODO: - Ok(RecordWorkerHeartbeatResponse::default()) - } - } } /// This trait contains everything workers need to interact with Temporal, and hence provides a diff --git a/core/src/worker/heartbeat.rs b/core/src/worker/heartbeat.rs index cf0a3ead5..0a56666a4 100644 --- a/core/src/worker/heartbeat.rs +++ b/core/src/worker/heartbeat.rs @@ -8,11 +8,8 @@ use prost_types::Duration as PbDuration; use uuid::Uuid; use temporal_sdk_core_api::worker::WorkerConfig; use temporal_sdk_core_protos::temporal::api::worker::v1::{WorkerHeartbeat, WorkerHostInfo}; -use crate::worker::client::WorkerClientBag; use crate::WorkerClient; -type Result = std::result::Result; - /// Heartbeat information /// /// Note: Experimental @@ -21,6 +18,8 @@ pub struct WorkerHeartbeatInfo { timer_abort: AbortHandle, client: Option>, interval: Option, + #[cfg(test)] + heartbeats_sent: Arc>, } impl WorkerHeartbeatInfo { @@ -35,11 +34,12 @@ impl WorkerHeartbeatInfo { timer_abort: abort_handle, client: None, interval: worker_config.heartbeat_interval, + #[cfg(test)] + heartbeats_sent: Arc::new(Mutex::new(0)), }; heartbeat } - // TODO: This is called by Client when it sends other requests. /// Transform heartbeat data into `WorkerHeartbeat` we can send in gRPC request. Some /// metrics are also cached for future calls of this function. 
pub(crate) fn capture_heartbeat(&mut self) -> WorkerHeartbeat { @@ -49,7 +49,6 @@ impl WorkerHeartbeatInfo { } fn create_new_timer(&mut self) { - println!("[create_new_timer]"); self.timer_abort.abort(); let (abort_handle, abort_reg) = AbortHandle::new_pair(); @@ -59,28 +58,33 @@ impl WorkerHeartbeatInfo { Duration::from_secs(60) }; let data = self.data.clone(); + #[cfg(test)] + let heartbeats_sent = self.heartbeats_sent.clone(); + self.timer_abort = abort_handle.clone(); if let Some(client) = self.client.clone() { tokio::spawn(future::Abortable::new( async move { - println!("sleeping for {:?}", interval); - tokio::time::sleep(interval).await; - println!("sleep done"); - - if let Err(e) = client.clone().record_worker_heartbeat(data.lock().capture_heartbeat()).await { - warn!(error=?e, "Network error while sending worker heartbeat"); + loop { + tokio::time::sleep(interval).await; + #[cfg(test)] + { + let mut num = heartbeats_sent.lock(); + *num += 1; + } + + if let Err(e) = client.clone().record_worker_heartbeat(data.lock().capture_heartbeat()).await { + warn!(error=?e, "Network error while sending worker heartbeat"); + } } - }, abort_reg, )); } else { warn!("No client attached to heartbeat_info") }; - self.timer_abort = abort_handle; } pub(crate) fn add_client(&mut self, client: Arc) { - println!("[add_client]"); self.client = Some(client); self.create_new_timer(); } @@ -148,7 +152,98 @@ impl WorkerHeartbeatData { elapsed_since_last_heartbeat, ..Default::default() }; - println!("[hb]: {:#?}", heartbeat); heartbeat } +} + +#[cfg(test)] +mod tests { + use crate::test_help::WorkerExt; + use temporal_sdk_core_api::Worker; + use std::ops::Deref; + use crate::{worker}; + use std::time::Duration; + use crate::worker::WorkerHeartbeatInfo; + use crate::worker::client::mocks::mock_worker_client; + use std::sync::Arc; + use parking_lot::Mutex; + use temporal_sdk_core_api::worker::PollerBehavior; + use 
temporal_sdk_core_protos::coresdk::activity_result::ActivityExecutionResult; + use temporal_sdk_core_protos::coresdk::ActivityTaskCompletion; + use temporal_sdk_core_protos::temporal::api::workflowservice::v1::{PollActivityTaskQueueResponse, RecordWorkerHeartbeatResponse, RespondActivityTaskCompletedResponse}; + use crate::test_help::test_worker_cfg; + + #[rstest::rstest] + #[tokio::test] + async fn worker_heartbeat(#[values(true, false)] extra_heartbeat: bool) { + let mut mock = mock_worker_client(); + let record_heartbeat_calls = if extra_heartbeat { 2 } else { 3 }; + mock + .expect_record_worker_heartbeat() + .times(record_heartbeat_calls) + .returning(|heartbeat| { + let host_info = heartbeat.host_info.clone().unwrap(); + assert!(heartbeat.worker_identity.is_empty()); + assert!(!heartbeat.worker_instance_key.is_empty()); + assert_eq!(host_info.host_name, gethostname::gethostname().to_string_lossy().to_string()); + assert_eq!(host_info.process_id, std::process::id().to_string()); + assert_eq!(heartbeat.sdk_name, "test-core"); + assert_eq!(heartbeat.sdk_version, "0.0.0"); + assert_eq!(heartbeat.task_queue, "q"); + assert!(heartbeat.heartbeat_time.is_some()); + assert!(heartbeat.start_time.is_some()); + + Ok(RecordWorkerHeartbeatResponse {}) + }); + mock + .expect_poll_activity_task() + .times(1) + .returning(move |_, _| Ok(PollActivityTaskQueueResponse { + task_token: vec![1], + activity_id: "act1".to_string(), + ..Default::default() + },)); + mock + .expect_complete_activity_task() + .returning(|_, _| Ok(RespondActivityTaskCompletedResponse::default())); + + let config = test_worker_cfg() + .activity_task_poller_behavior(PollerBehavior::SimpleMaximum(1_usize)) + .max_outstanding_activities(1_usize) + .heartbeat_interval(Duration::from_millis(100)) + .build() + .unwrap(); + + let heartbeat_info = Arc::new(Mutex::new(WorkerHeartbeatInfo::new(config.clone()))); + let client = Arc::new(mock); + heartbeat_info.lock().add_client(client.clone()); + let worker = 
worker::Worker::new(config, None, client, None, Some(heartbeat_info.clone())); + let _ = heartbeat_info.lock().capture_heartbeat(); + + // heartbeat timer fires once + tokio::time::sleep(Duration::from_millis(150)).await; + if extra_heartbeat { + // reset heartbeat timer + heartbeat_info.lock().capture_heartbeat(); + } + // heartbeat timer fires once + tokio::time::sleep(Duration::from_millis(180)).await; + + if extra_heartbeat { + assert_eq!(2, *heartbeat_info.lock().heartbeats_sent.lock().deref()); + } else { + assert_eq!(3, *heartbeat_info.lock().heartbeats_sent.lock().deref()); + } + + let task = worker.poll_activity_task().await.unwrap(); + worker + .complete_activity_task(ActivityTaskCompletion { + task_token: task.task_token, + result: Some(ActivityExecutionResult::ok(vec![1].into())), + }) + .await + .unwrap(); + worker.drain_activity_poller_and_shutdown().await; + } + } \ No newline at end of file From ebc6910796a8dcb20005e729547b0669249fcaba Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Thu, 3 Jul 2025 14:46:31 -0700 Subject: [PATCH 06/13] PR feedback, remove WorkerHeartbeatInfo, switch timer process to long-running, use channel to reset timer, --- core-api/src/worker.rs | 6 +- core/src/lib.rs | 8 +- core/src/worker/client.rs | 14 +-- core/src/worker/heartbeat.rs | 183 ++++++++--------------------------- core/src/worker/mod.rs | 89 +++++++++++++---- 5 files changed, 125 insertions(+), 175 deletions(-) diff --git a/core-api/src/worker.rs b/core-api/src/worker.rs index 74397ca1b..18638c1fb 100644 --- a/core-api/src/worker.rs +++ b/core-api/src/worker.rs @@ -162,11 +162,11 @@ pub struct WorkerConfig { /// A versioning strategy for this worker. pub versioning_strategy: WorkerVersioningStrategy, - /// The interval in which the worker will send a heartbeat. + /// The interval within which the worker will send a heartbeat. /// The timer is reset on each existing RPC call that also happens to send this data, like /// `PollWorkflowTaskQueueRequest`. 
- #[builder(default)] - pub heartbeat_interval: Option, + #[builder(default = "Duration::from_secs(60)")] + pub heartbeat_interval: Duration, } impl WorkerConfig { diff --git a/core/src/lib.rs b/core/src/lib.rs index e58408cf1..d4b881894 100644 --- a/core/src/lib.rs +++ b/core/src/lib.rs @@ -99,7 +99,7 @@ where } let client_ident = client.get_identity().to_owned(); let sticky_q = sticky_q_name_for_worker(&client_ident, &worker_config); - let heartbeat_info = Arc::new(Mutex::new(worker::WorkerHeartbeatInfo::new( + let data = Arc::new(Mutex::new(worker::WorkerHeartbeatData::new( worker_config.clone(), ))); @@ -108,17 +108,15 @@ where worker_config.namespace.clone(), client_ident, worker_config.versioning_strategy.clone(), - heartbeat_info.clone(), + data.clone(), )); - // TODO: Adding this afterwards feels a little clunky - heartbeat_info.lock().add_client(client_bag.clone()); Ok(Worker::new( worker_config, sticky_q, client_bag, Some(&runtime.telemetry), - Some(heartbeat_info), + Some(data), )) } diff --git a/core/src/worker/client.rs b/core/src/worker/client.rs index da519d5ca..14f4dc16d 100644 --- a/core/src/worker/client.rs +++ b/core/src/worker/client.rs @@ -2,7 +2,7 @@ pub(crate) mod mocks; use crate::protosext::legacy_query_failure; -use crate::worker::heartbeat::WorkerHeartbeatInfo; +use crate::worker::heartbeat::WorkerHeartbeatData; use parking_lot::{Mutex, RwLock}; use std::{sync::Arc, time::Duration}; use temporal_client::{ @@ -48,7 +48,7 @@ pub(crate) struct WorkerClientBag { namespace: String, identity: String, worker_versioning_strategy: WorkerVersioningStrategy, - heartbeat_info: Arc>, + heartbeat_data: Arc>, } impl WorkerClientBag { @@ -57,14 +57,14 @@ impl WorkerClientBag { namespace: String, identity: String, worker_versioning_strategy: WorkerVersioningStrategy, - heartbeat_info: Arc>, + heartbeat_data: Arc>, ) -> Self { Self { replaceable_client: RwLock::new(client), namespace, identity, worker_versioning_strategy, - heartbeat_info, + 
heartbeat_data, } } @@ -127,7 +127,7 @@ impl WorkerClientBag { } fn capture_heartbeat(&self) -> Option { - Some(self.heartbeat_info.lock().capture_heartbeat()) + self.heartbeat_data.lock().capture_heartbeat_if_needed() } } @@ -287,7 +287,7 @@ impl WorkerClient for WorkerClientBag { binary_checksum: self.binary_checksum(), worker_version_capabilities: self.worker_version_capabilities(), deployment_options: self.deployment_options(), - worker_heartbeat: self.capture_heartbeat(), + worker_heartbeat: None, } .into_request(); request.extensions_mut().insert(IsWorkerTaskLongPoll); @@ -324,7 +324,7 @@ impl WorkerClient for WorkerClientBag { }), worker_version_capabilities: self.worker_version_capabilities(), deployment_options: self.deployment_options(), - worker_heartbeat: self.capture_heartbeat(), + worker_heartbeat: None, } .into_request(); request.extensions_mut().insert(IsWorkerTaskLongPoll); diff --git a/core/src/worker/heartbeat.rs b/core/src/worker/heartbeat.rs index 44f3a5ae4..e09d81fb7 100644 --- a/core/src/worker/heartbeat.rs +++ b/core/src/worker/heartbeat.rs @@ -1,97 +1,13 @@ -use crate::WorkerClient; -use futures_util::future; -use futures_util::future::AbortHandle; use gethostname::gethostname; -use parking_lot::Mutex; use prost_types::Duration as PbDuration; -use std::sync::Arc; use std::time::{Duration, SystemTime}; use temporal_sdk_core_api::worker::WorkerConfig; use temporal_sdk_core_protos::temporal::api::worker::v1::{WorkerHeartbeat, WorkerHostInfo}; +use tokio::sync::watch; use uuid::Uuid; -/// Heartbeat information -/// -/// Note: Experimental -pub struct WorkerHeartbeatInfo { - pub(crate) data: Arc>, - timer_abort: AbortHandle, - client: Option>, - interval: Option, - #[cfg(test)] - heartbeats_sent: Arc>, -} - -impl WorkerHeartbeatInfo { - /// Create a new WorkerHeartbeatInfo. A timer is immediately started to track the worker - /// heartbeat interval. 
- pub(crate) fn new(worker_config: WorkerConfig) -> Self { - // unused abort handle, will be replaced with a new one when we start a new timer - let (abort_handle, _) = AbortHandle::new_pair(); - - Self { - data: Arc::new(Mutex::new(WorkerHeartbeatData::new(worker_config.clone()))), - timer_abort: abort_handle, - client: None, - interval: worker_config.heartbeat_interval, - #[cfg(test)] - heartbeats_sent: Arc::new(Mutex::new(0)), - } - } - - /// Transform heartbeat data into `WorkerHeartbeat` we can send in gRPC request. Some - /// metrics are also cached for future calls of this function. - pub(crate) fn capture_heartbeat(&mut self) -> WorkerHeartbeat { - self.create_new_timer(); - - self.data.lock().capture_heartbeat() - } - - fn create_new_timer(&mut self) { - self.timer_abort.abort(); - - let (abort_handle, abort_reg) = AbortHandle::new_pair(); - let interval = if let Some(dur) = self.interval { - dur - } else { - Duration::from_secs(60) - }; - let data = self.data.clone(); - #[cfg(test)] - let heartbeats_sent = self.heartbeats_sent.clone(); - self.timer_abort = abort_handle.clone(); - if let Some(client) = self.client.clone() { - tokio::spawn(future::Abortable::new( - async move { - loop { - tokio::time::sleep(interval).await; - #[cfg(test)] - { - let mut num = heartbeats_sent.lock(); - *num += 1; - } - - let heartbeat = data.lock().capture_heartbeat(); - if let Err(e) = client.clone().record_worker_heartbeat(heartbeat).await { - warn!(error=?e, "Network error while sending worker heartbeat"); - } - } - }, - abort_reg, - )); - } else { - warn!("No client attached to heartbeat_info") - }; - } - - pub(crate) fn add_client(&mut self, client: Arc) { - self.client = Some(client); - self.create_new_timer(); - } -} - #[derive(Debug, Clone)] -pub(crate) struct WorkerHeartbeatData { +pub struct WorkerHeartbeatData { worker_instance_key: String, pub(crate) worker_identity: String, host_info: WorkerHostInfo, @@ -104,10 +20,12 @@ pub(crate) struct WorkerHeartbeatData { 
pub(crate) sdk_version: String, /// Worker start time pub(crate) start_time: SystemTime, + pub(crate) heartbeat_interval: Duration, + pub(crate) reset_tx: Option>, } impl WorkerHeartbeatData { - fn new(worker_config: WorkerConfig) -> Self { + pub fn new(worker_config: WorkerConfig) -> Self { Self { // TODO: Is this right for worker_identity? worker_identity: worker_config @@ -125,13 +43,22 @@ impl WorkerHeartbeatData { start_time: SystemTime::now(), heartbeat_time: None, worker_instance_key: Uuid::new_v4().to_string(), + heartbeat_interval: worker_config.heartbeat_interval, + reset_tx: None, } } - fn capture_heartbeat(&mut self) -> WorkerHeartbeat { + pub fn capture_heartbeat_if_needed(&mut self) -> Option { let now = SystemTime::now(); let elapsed_since_last_heartbeat = if let Some(heartbeat_time) = self.heartbeat_time { - let dur = now.duration_since(heartbeat_time).unwrap_or(Duration::ZERO); // TODO: do we want to fall back to ZERO? + let dur = now.duration_since(heartbeat_time).unwrap_or(Duration::ZERO); + + // Only send poll data if it's nearly been a full interval since this data has been sent + // In this case, "nearly" is 90% of the interval + if dur.as_secs_f64() < 0.9 * self.heartbeat_interval.as_secs_f64() { + println!("Heartbeat interval not yet elapsed, not sending poll data"); + return None; + } Some(PbDuration { seconds: dur.as_secs() as i64, nanos: dur.subsec_nanos() as i32, @@ -141,8 +68,15 @@ impl WorkerHeartbeatData { }; self.heartbeat_time = Some(now); + if let Some(reset_tx) = &self.reset_tx { + let _ = reset_tx.send(()); + } else { + warn!( + "No reset_tx attached to heartbeat_info, worker heartbeat was not properly setup" + ); + } - WorkerHeartbeat { + Some(WorkerHeartbeat { worker_instance_key: self.worker_instance_key.clone(), worker_identity: self.worker_identity.clone(), host_info: Some(self.host_info.clone()), @@ -154,38 +88,33 @@ impl WorkerHeartbeatData { heartbeat_time: Some(SystemTime::now().into()), elapsed_since_last_heartbeat, 
..Default::default() - } + }) + } + + pub(crate) fn set_reset_tx(&mut self, reset_tx: watch::Sender<()>) { + self.reset_tx = Some(reset_tx); } } #[cfg(test)] mod tests { + use super::*; use crate::test_help::WorkerExt; use crate::test_help::test_worker_cfg; use crate::worker; - use crate::worker::WorkerHeartbeatInfo; use crate::worker::client::mocks::mock_worker_client; use parking_lot::Mutex; - use std::ops::Deref; use std::sync::Arc; use std::time::Duration; - use temporal_sdk_core_api::Worker; use temporal_sdk_core_api::worker::PollerBehavior; - use temporal_sdk_core_protos::coresdk::ActivityTaskCompletion; - use temporal_sdk_core_protos::coresdk::activity_result::ActivityExecutionResult; - use temporal_sdk_core_protos::temporal::api::workflowservice::v1::{ - PollActivityTaskQueueResponse, RecordWorkerHeartbeatResponse, - RespondActivityTaskCompletedResponse, - }; + use temporal_sdk_core_protos::temporal::api::workflowservice::v1::RecordWorkerHeartbeatResponse; - #[rstest::rstest] #[tokio::test] - async fn worker_heartbeat(#[values(true, false)] extra_heartbeat: bool) { + async fn worker_heartbeat() { let mut mock = mock_worker_client(); - let record_heartbeat_calls = if extra_heartbeat { 2 } else { 3 }; mock.expect_record_worker_heartbeat() - .times(record_heartbeat_calls) - .returning(|heartbeat| { + .times(2) + .returning(move |heartbeat| { let host_info = heartbeat.host_info.clone().unwrap(); assert!(heartbeat.worker_identity.is_empty()); assert!(!heartbeat.worker_instance_key.is_empty()); @@ -202,54 +131,26 @@ mod tests { Ok(RecordWorkerHeartbeatResponse {}) }); - mock.expect_poll_activity_task() - .times(1) - .returning(move |_, _| { - Ok(PollActivityTaskQueueResponse { - task_token: vec![1], - activity_id: "act1".to_string(), - ..Default::default() - }) - }); - mock.expect_complete_activity_task() - .returning(|_, _| Ok(RespondActivityTaskCompletedResponse::default())); let config = test_worker_cfg() 
.activity_task_poller_behavior(PollerBehavior::SimpleMaximum(1_usize)) .max_outstanding_activities(1_usize) - .heartbeat_interval(Duration::from_millis(100)) + .heartbeat_interval(Duration::from_millis(200)) .build() .unwrap(); - let heartbeat_info = Arc::new(Mutex::new(WorkerHeartbeatInfo::new(config.clone()))); + let heartbeat_data = Arc::new(Mutex::new(WorkerHeartbeatData::new(config.clone()))); let client = Arc::new(mock); - heartbeat_info.lock().add_client(client.clone()); - let worker = worker::Worker::new(config, None, client, None, Some(heartbeat_info.clone())); - let _ = heartbeat_info.lock().capture_heartbeat(); + let worker = worker::Worker::new(config, None, client, None, Some(heartbeat_data.clone())); + let _ = heartbeat_data.lock().capture_heartbeat_if_needed(); // heartbeat timer fires once - tokio::time::sleep(Duration::from_millis(150)).await; - if extra_heartbeat { - // reset heartbeat timer - heartbeat_info.lock().capture_heartbeat(); - } + tokio::time::sleep(Duration::from_millis(300)).await; + // it hasn't been >90% of the interval since the last heartbeat, so no data should be returned here + assert_eq!(None, heartbeat_data.lock().capture_heartbeat_if_needed()); // heartbeat timer fires once - tokio::time::sleep(Duration::from_millis(180)).await; - - if extra_heartbeat { - assert_eq!(2, *heartbeat_info.lock().heartbeats_sent.lock().deref()); - } else { - assert_eq!(3, *heartbeat_info.lock().heartbeats_sent.lock().deref()); - } + tokio::time::sleep(Duration::from_millis(150)).await; - let task = worker.poll_activity_task().await.unwrap(); - worker - .complete_activity_task(ActivityTaskCompletion { - task_token: task.task_token, - result: Some(ActivityExecutionResult::ok(vec![1].into())), - }) - .await - .unwrap(); worker.drain_activity_poller_and_shutdown().await; } } diff --git a/core/src/worker/mod.rs b/core/src/worker/mod.rs index 99a7b3834..fd8d7bf07 100644 --- a/core/src/worker/mod.rs +++ b/core/src/worker/mod.rs @@ -20,6 +20,7 @@ 
pub(crate) use activities::{ pub(crate) use wft_poller::WFTPollerShared; pub(crate) use workflow::LEGACY_QUERY_ID; +pub(crate) use crate::worker::heartbeat::WorkerHeartbeatData; use crate::{ ActivityHeartbeat, CompleteActivityError, PollError, WorkerTrait, abstractions::{MeteredPermitDealer, PermitDealerContextData, dbg_panic}, @@ -42,6 +43,10 @@ use crate::{ }, }, }; +use crate::{ + pollers::{ActivityTaskOptions, LongPollBuffer}, + worker::workflow::wft_poller, +}; use activities::WorkerActivityTasks; use futures_util::{StreamExt, stream}; use parking_lot::Mutex; @@ -77,14 +82,9 @@ use temporal_sdk_core_protos::{ }, }; use tokio::sync::{mpsc::unbounded_channel, watch}; +use tokio::time::MissedTickBehavior; use tokio_stream::wrappers::UnboundedReceiverStream; use tokio_util::sync::CancellationToken; - -pub(crate) use crate::worker::heartbeat::WorkerHeartbeatInfo; -use crate::{ - pollers::{ActivityTaskOptions, LongPollBuffer}, - worker::workflow::wft_poller, -}; #[cfg(test)] use { crate::{ @@ -122,6 +122,8 @@ pub struct Worker { local_activities_complete: Arc, /// Used to track all permits have been released all_permits_tracker: tokio::sync::Mutex, + /// Used to track and shutdown worker heartbeat process + worker_heartbeat_handle: Option>, } struct AllPermitsTracker { @@ -274,7 +276,7 @@ impl Worker { sticky_queue_name: Option, client: Arc, telem_instance: Option<&TelemetryInstance>, - heartbeat_info: Option>>, + heartbeat_data: Option>>, ) -> Self { info!(task_queue=%config.task_queue, namespace=%config.namespace, "Initializing worker"); @@ -284,7 +286,7 @@ impl Worker { client, TaskPollers::Real, telem_instance, - heartbeat_info, + heartbeat_data, ) } @@ -302,10 +304,9 @@ impl Worker { #[cfg(test)] pub(crate) fn new_test(config: WorkerConfig, client: impl WorkerClient + 'static) -> Self { - let heartbeat_info = Arc::new(Mutex::new(WorkerHeartbeatInfo::new(config.clone()))); + let heartbeat_data = Arc::new(Mutex::new(WorkerHeartbeatData::new(config.clone()))); let 
client = Arc::new(client); - heartbeat_info.lock().add_client(client.clone()); - Self::new(config, None, client, None, Some(heartbeat_info)) + Self::new(config, None, client, None, Some(heartbeat_data)) } pub(crate) fn new_with_pollers( @@ -314,7 +315,7 @@ impl Worker { client: Arc, task_pollers: TaskPollers, telem_instance: Option<&TelemetryInstance>, - heartbeat_info: Option>>, + heartbeat_data: Option>>, ) -> Self { let (metrics, meter) = if let Some(ti) = telem_instance { ( @@ -493,13 +494,24 @@ impl Worker { ); let worker_key = Mutex::new(client.workers().register(Box::new(provider))); let sdk_name_and_ver = client.sdk_name_and_version(); - if let Some(ref heartbeat_info) = heartbeat_info { - let heartbeat_info = heartbeat_info.lock(); - let mut data = heartbeat_info.data.lock(); - data.sdk_name = sdk_name_and_ver.0.clone(); - data.sdk_version = sdk_name_and_ver.1.clone(); - data.start_time = SystemTime::now(); - } + + let worker_heartbeat_handle = if let Some(heartbeat_data) = heartbeat_data { + let (reset_tx, reset_rx) = watch::channel(()); + { + let mut data = heartbeat_data.lock(); + data.sdk_name = sdk_name_and_ver.0.clone(); + data.sdk_version = sdk_name_and_ver.1.clone(); + data.start_time = SystemTime::now(); + data.set_reset_tx(reset_tx); + } + Some(create_worker_heartbeat_process( + heartbeat_data.clone(), + client.clone(), + reset_rx, + )) + } else { + None + }; Self { worker_key, @@ -557,6 +569,7 @@ impl Worker { la_permits, }), nexus_mgr, + worker_heartbeat_handle, } } @@ -601,6 +614,9 @@ impl Worker { dbg_panic!("Waiting for all slot permits to release took too long!"); } } + if let Some(jh) = self.worker_heartbeat_handle.as_ref() { + jh.abort(); + } } /// Finish shutting down by consuming the background pollers and freeing all resources @@ -894,6 +910,41 @@ fn wft_poller_behavior(config: &WorkerConfig, is_sticky: bool) -> PollerBehavior } } +fn create_worker_heartbeat_process( + data: Arc>, + client: Arc, + reset_rx: watch::Receiver<()>, +) -> 
tokio::task::JoinHandle<()> { + tokio::spawn(async move { + let mut reset_rx = reset_rx; + let mut ticker = tokio::time::interval(data.lock().heartbeat_interval); + ticker.set_missed_tick_behavior(MissedTickBehavior::Delay); + loop { + tokio::select! { + _ = ticker.tick() => { + let heartbeat = if let Some(heartbeat) = data.lock().capture_heartbeat_if_needed() { + heartbeat + } else { + continue + }; + if let Err(e) = client.clone().record_worker_heartbeat(heartbeat).await { + warn!(error=?e, "Network error while sending worker heartbeat"); + if matches!( + e.code(), + tonic::Code::Unimplemented + ) { + return; + } + } + } + _ = reset_rx.changed() => { + ticker.reset(); + } + } + } + }) +} + #[cfg(test)] mod tests { use super::*; From 207feaae4b53be925a37836732e45c0deed42bc5 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Thu, 3 Jul 2025 15:23:08 -0700 Subject: [PATCH 07/13] clean up worker_identity --- core/src/lib.rs | 5 +++++ core/src/worker/client.rs | 6 ++++++ core/src/worker/client/mocks.rs | 5 +++++ core/src/worker/heartbeat.rs | 15 +++++++-------- core/src/worker/mod.rs | 7 +++++-- 5 files changed, 28 insertions(+), 10 deletions(-) diff --git a/core/src/lib.rs b/core/src/lib.rs index d4b881894..1e6619bbf 100644 --- a/core/src/lib.rs +++ b/core/src/lib.rs @@ -101,8 +101,13 @@ where let sticky_q = sticky_q_name_for_worker(&client_ident, &worker_config); let data = Arc::new(Mutex::new(worker::WorkerHeartbeatData::new( worker_config.clone(), + client_ident.clone(), ))); + if client_ident == "" { + bail!("Client identity cannot be empty. 
Either lang or user should be setting this value"); + } + let client_bag = Arc::new(WorkerClientBag::new( client, worker_config.namespace.clone(), diff --git a/core/src/worker/client.rs b/core/src/worker/client.rs index 14f4dc16d..d61a6137c 100644 --- a/core/src/worker/client.rs +++ b/core/src/worker/client.rs @@ -234,6 +234,8 @@ pub trait WorkerClient: Sync + Send { fn is_mock(&self) -> bool; /// Return name and version of the SDK fn sdk_name_and_version(&self) -> (String, String); + /// Get worker identity + fn get_identity(&self) -> String; } /// Configuration options shared by workflow, activity, and Nexus polling calls @@ -695,6 +697,10 @@ impl WorkerClient for WorkerClientBag { let opts = lock.get_client().inner().options(); (opts.client_name.clone(), opts.client_version.clone()) } + + fn get_identity(&self) -> String { + self.identity.clone() + } } impl NamespacedClient for WorkerClientBag { diff --git a/core/src/worker/client/mocks.rs b/core/src/worker/client/mocks.rs index 029ef1af9..dc09db75b 100644 --- a/core/src/worker/client/mocks.rs +++ b/core/src/worker/client/mocks.rs @@ -33,6 +33,8 @@ pub(crate) fn mock_worker_client() -> MockWorkerClient { .returning(|_| Ok(ShutdownWorkerResponse {})); r.expect_sdk_name_and_version() .returning(|| ("test-core".to_string(), "0.0.0".to_string())); + r.expect_get_identity() + .returning(|| "test-identity".to_string()); r } @@ -46,6 +48,8 @@ pub(crate) fn mock_manual_worker_client() -> MockManualWorkerClient { r.expect_is_mock().returning(|| true); r.expect_sdk_name_and_version() .returning(|| ("test-core".to_string(), "0.0.0".to_string())); + r.expect_get_identity() + .returning(|| "test-identity".to_string()); r } @@ -153,5 +157,6 @@ mockall::mock! 
{ fn workers(&self) -> Arc; fn is_mock(&self) -> bool; fn sdk_name_and_version(&self) -> (String, String); + fn get_identity(&self) -> String; } } diff --git a/core/src/worker/heartbeat.rs b/core/src/worker/heartbeat.rs index e09d81fb7..2f8edfac6 100644 --- a/core/src/worker/heartbeat.rs +++ b/core/src/worker/heartbeat.rs @@ -25,13 +25,9 @@ pub struct WorkerHeartbeatData { } impl WorkerHeartbeatData { - pub fn new(worker_config: WorkerConfig) -> Self { + pub fn new(worker_config: WorkerConfig, worker_identity: String) -> Self { Self { - // TODO: Is this right for worker_identity? - worker_identity: worker_config - .client_identity_override - .clone() - .unwrap_or_default(), + worker_identity, host_info: WorkerHostInfo { host_name: gethostname().to_string_lossy().to_string(), process_id: std::process::id().to_string(), @@ -116,7 +112,7 @@ mod tests { .times(2) .returning(move |heartbeat| { let host_info = heartbeat.host_info.clone().unwrap(); - assert!(heartbeat.worker_identity.is_empty()); + assert_eq!("test_identity", heartbeat.worker_identity); assert!(!heartbeat.worker_instance_key.is_empty()); assert_eq!( host_info.host_name, @@ -139,7 +135,10 @@ mod tests { .build() .unwrap(); - let heartbeat_data = Arc::new(Mutex::new(WorkerHeartbeatData::new(config.clone()))); + let heartbeat_data = Arc::new(Mutex::new(WorkerHeartbeatData::new( + config.clone(), + "test_identity".to_string(), + ))); let client = Arc::new(mock); let worker = worker::Worker::new(config, None, client, None, Some(heartbeat_data.clone())); let _ = heartbeat_data.lock().capture_heartbeat_if_needed(); diff --git a/core/src/worker/mod.rs b/core/src/worker/mod.rs index fd8d7bf07..06d88cfe8 100644 --- a/core/src/worker/mod.rs +++ b/core/src/worker/mod.rs @@ -304,8 +304,11 @@ impl Worker { #[cfg(test)] pub(crate) fn new_test(config: WorkerConfig, client: impl WorkerClient + 'static) -> Self { - let heartbeat_data = Arc::new(Mutex::new(WorkerHeartbeatData::new(config.clone()))); let client = 
Arc::new(client); + let heartbeat_data = Arc::new(Mutex::new(WorkerHeartbeatData::new( + config.clone(), + client.get_identity(), + ))); Self::new(config, None, client, None, Some(heartbeat_data)) } @@ -335,7 +338,7 @@ impl Worker { let shutdown_token = CancellationToken::new(); let slot_context_data = Arc::new(PermitDealerContextData { task_queue: config.task_queue.clone(), - worker_identity: config.client_identity_override.clone().unwrap_or_default(), + worker_identity: client.get_identity(), worker_deployment_version: config.computed_deployment_version(), }); let wft_slots = MeteredPermitDealer::new( From 35d996fe43ea8af7d0c480f8d0e477f48f406c48 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Tue, 8 Jul 2025 11:22:40 -0700 Subject: [PATCH 08/13] use OnceLock to get rid of unrepresentable state --- core/src/lib.rs | 13 ++-- core/src/worker/client.rs | 16 +++-- core/src/worker/heartbeat.rs | 133 ++++++++++++++++++++++++++--------- core/src/worker/mod.rs | 81 +++++---------------- 4 files changed, 133 insertions(+), 110 deletions(-) diff --git a/core/src/lib.rs b/core/src/lib.rs index 1e6619bbf..a70a9ee76 100644 --- a/core/src/lib.rs +++ b/core/src/lib.rs @@ -61,8 +61,7 @@ use crate::{ }; use anyhow::bail; use futures_util::Stream; -use parking_lot::Mutex; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use temporal_client::{ConfiguredClient, NamespacedClient, TemporalServiceClientWithMetrics}; use temporal_sdk_core_api::{ Worker as WorkerTrait, @@ -99,21 +98,19 @@ where } let client_ident = client.get_identity().to_owned(); let sticky_q = sticky_q_name_for_worker(&client_ident, &worker_config); - let data = Arc::new(Mutex::new(worker::WorkerHeartbeatData::new( - worker_config.clone(), - client_ident.clone(), - ))); if client_ident == "" { bail!("Client identity cannot be empty. 
Either lang or user should be setting this value"); } + let heartbeat_fn = Arc::new(OnceLock::new()); + let client_bag = Arc::new(WorkerClientBag::new( client, worker_config.namespace.clone(), client_ident, worker_config.versioning_strategy.clone(), - data.clone(), + heartbeat_fn.clone(), )); Ok(Worker::new( @@ -121,7 +118,7 @@ where sticky_q, client_bag, Some(&runtime.telemetry), - Some(data), + Some(heartbeat_fn), )) } diff --git a/core/src/worker/client.rs b/core/src/worker/client.rs index d61a6137c..f532e2ab5 100644 --- a/core/src/worker/client.rs +++ b/core/src/worker/client.rs @@ -1,9 +1,10 @@ //! Worker-specific client needs pub(crate) mod mocks; +use crate::abstractions::dbg_panic; use crate::protosext::legacy_query_failure; -use crate::worker::heartbeat::WorkerHeartbeatData; -use parking_lot::{Mutex, RwLock}; +use parking_lot::RwLock; +use std::sync::OnceLock; use std::{sync::Arc, time::Duration}; use temporal_client::{ Client, IsWorkerTaskLongPoll, Namespace, NamespacedClient, NoRetryOnMatching, RetryClient, @@ -48,7 +49,7 @@ pub(crate) struct WorkerClientBag { namespace: String, identity: String, worker_versioning_strategy: WorkerVersioningStrategy, - heartbeat_data: Arc>, + heartbeat_data: Arc Option + Send + Sync>>>, } impl WorkerClientBag { @@ -57,7 +58,7 @@ impl WorkerClientBag { namespace: String, identity: String, worker_versioning_strategy: WorkerVersioningStrategy, - heartbeat_data: Arc>, + heartbeat_data: Arc Option + Send + Sync>>>, ) -> Self { Self { replaceable_client: RwLock::new(client), @@ -127,7 +128,12 @@ impl WorkerClientBag { } fn capture_heartbeat(&self) -> Option { - self.heartbeat_data.lock().capture_heartbeat_if_needed() + if let Some(hb) = self.heartbeat_data.get() { + hb() + } else { + dbg_panic!("Heartbeat function never set"); + None + } } } diff --git a/core/src/worker/heartbeat.rs b/core/src/worker/heartbeat.rs index 2f8edfac6..11efc8b2e 100644 --- a/core/src/worker/heartbeat.rs +++ b/core/src/worker/heartbeat.rs @@ -1,31 
+1,109 @@ +use crate::WorkerClient; +use crate::abstractions::dbg_panic; use gethostname::gethostname; +use parking_lot::Mutex; use prost_types::Duration as PbDuration; +use std::sync::{Arc, OnceLock}; use std::time::{Duration, SystemTime}; use temporal_sdk_core_api::worker::WorkerConfig; use temporal_sdk_core_protos::temporal::api::worker::v1::{WorkerHeartbeat, WorkerHostInfo}; use tokio::sync::watch; +use tokio::task::JoinHandle; +use tokio::time::MissedTickBehavior; use uuid::Uuid; +pub(crate) struct WorkerHeartbeatManager { + heartbeat_handle: JoinHandle<()>, +} + +impl WorkerHeartbeatManager { + pub(crate) fn new( + config: WorkerConfig, + identity: String, + heartbeat_fn: Arc Option + Send + Sync>>>, + client: Arc, + ) -> Self { + let sdk_name_and_ver = client.sdk_name_and_version(); + let (reset_tx, reset_rx) = watch::channel(()); + let data = Arc::new(Mutex::new(WorkerHeartbeatData::new( + config, + identity, + sdk_name_and_ver, + reset_tx, + ))); + let data_clone = data.clone(); + + let heartbeat_handle = tokio::spawn(async move { + let mut reset_rx = reset_rx; + let mut ticker = tokio::time::interval(data_clone.lock().heartbeat_interval); + ticker.set_missed_tick_behavior(MissedTickBehavior::Delay); + loop { + tokio::select! 
{ + _ = ticker.tick() => { + let heartbeat = if let Some(heartbeat) = data_clone.lock().capture_heartbeat_if_needed() { + heartbeat + } else { + continue + }; + if let Err(e) = client.clone().record_worker_heartbeat(heartbeat).await { + warn!(error=?e, "Network error while sending worker heartbeat"); + if matches!( + e.code(), + tonic::Code::Unimplemented + ) { + return; + } + } + } + _ = reset_rx.changed() => { + ticker.reset(); + } + } + } + }); + + let data_clone = data.clone(); + if let Err(_) = heartbeat_fn.set(Box::new(move || { + data_clone.lock().capture_heartbeat_if_needed() + })) { + dbg_panic!( + "Failed to set heartbeat_fn, heartbeat_fn should only be set once, when a singular WorkerHeartbeatInfo is created" + ); + } + + Self { heartbeat_handle } + } + + pub(crate) fn shutdown(&self) { + self.heartbeat_handle.abort() + } +} + #[derive(Debug, Clone)] -pub struct WorkerHeartbeatData { +struct WorkerHeartbeatData { worker_instance_key: String, - pub(crate) worker_identity: String, + worker_identity: String, host_info: WorkerHostInfo, // Time of the last heartbeat. 
This is used to both for heartbeat_time and last_heartbeat_time - pub(crate) heartbeat_time: Option, - pub(crate) task_queue: String, + heartbeat_time: Option, + task_queue: String, /// SDK name - pub(crate) sdk_name: String, + sdk_name: String, /// SDK version - pub(crate) sdk_version: String, + sdk_version: String, /// Worker start time - pub(crate) start_time: SystemTime, - pub(crate) heartbeat_interval: Duration, - pub(crate) reset_tx: Option>, + start_time: SystemTime, + heartbeat_interval: Duration, + reset_tx: watch::Sender<()>, } impl WorkerHeartbeatData { - pub fn new(worker_config: WorkerConfig, worker_identity: String) -> Self { + fn new( + worker_config: WorkerConfig, + worker_identity: String, + sdk_name_and_ver: (String, String), + reset_tx: watch::Sender<()>, + ) -> Self { Self { worker_identity, host_info: WorkerHostInfo { @@ -33,18 +111,18 @@ impl WorkerHeartbeatData { process_id: std::process::id().to_string(), ..Default::default() }, - sdk_name: String::new(), - sdk_version: String::new(), + sdk_name: sdk_name_and_ver.0, + sdk_version: sdk_name_and_ver.1, task_queue: worker_config.task_queue.clone(), start_time: SystemTime::now(), heartbeat_time: None, worker_instance_key: Uuid::new_v4().to_string(), heartbeat_interval: worker_config.heartbeat_interval, - reset_tx: None, + reset_tx, } } - pub fn capture_heartbeat_if_needed(&mut self) -> Option { + fn capture_heartbeat_if_needed(&mut self) -> Option { let now = SystemTime::now(); let elapsed_since_last_heartbeat = if let Some(heartbeat_time) = self.heartbeat_time { let dur = now.duration_since(heartbeat_time).unwrap_or(Duration::ZERO); @@ -64,13 +142,8 @@ impl WorkerHeartbeatData { }; self.heartbeat_time = Some(now); - if let Some(reset_tx) = &self.reset_tx { - let _ = reset_tx.send(()); - } else { - warn!( - "No reset_tx attached to heartbeat_info, worker heartbeat was not properly setup" - ); - } + + let _ = self.reset_tx.send(()); Some(WorkerHeartbeat { worker_instance_key: 
self.worker_instance_key.clone(), @@ -86,10 +159,6 @@ impl WorkerHeartbeatData { ..Default::default() }) } - - pub(crate) fn set_reset_tx(&mut self, reset_tx: watch::Sender<()>) { - self.reset_tx = Some(reset_tx); - } } #[cfg(test)] @@ -99,7 +168,6 @@ mod tests { use crate::test_help::test_worker_cfg; use crate::worker; use crate::worker::client::mocks::mock_worker_client; - use parking_lot::Mutex; use std::sync::Arc; use std::time::Duration; use temporal_sdk_core_api::worker::PollerBehavior; @@ -112,7 +180,7 @@ mod tests { .times(2) .returning(move |heartbeat| { let host_info = heartbeat.host_info.clone().unwrap(); - assert_eq!("test_identity", heartbeat.worker_identity); + assert_eq!("test-identity", heartbeat.worker_identity); assert!(!heartbeat.worker_instance_key.is_empty()); assert_eq!( host_info.host_name, @@ -135,18 +203,15 @@ mod tests { .build() .unwrap(); - let heartbeat_data = Arc::new(Mutex::new(WorkerHeartbeatData::new( - config.clone(), - "test_identity".to_string(), - ))); + let heartbeat_fn = Arc::new(OnceLock::new()); let client = Arc::new(mock); - let worker = worker::Worker::new(config, None, client, None, Some(heartbeat_data.clone())); - let _ = heartbeat_data.lock().capture_heartbeat_if_needed(); + let worker = worker::Worker::new(config, None, client, None, Some(heartbeat_fn.clone())); + heartbeat_fn.get().unwrap()(); // heartbeat timer fires once tokio::time::sleep(Duration::from_millis(300)).await; // it hasn't been >90% of the interval since the last heartbeat, so no data should be returned here - assert_eq!(None, heartbeat_data.lock().capture_heartbeat_if_needed()); + assert_eq!(None, heartbeat_fn.get().unwrap()()); // heartbeat timer fires once tokio::time::sleep(Duration::from_millis(150)).await; diff --git a/core/src/worker/mod.rs b/core/src/worker/mod.rs index 06d88cfe8..14a86c386 100644 --- a/core/src/worker/mod.rs +++ b/core/src/worker/mod.rs @@ -20,7 +20,7 @@ pub(crate) use activities::{ pub(crate) use wft_poller::WFTPollerShared; 
pub(crate) use workflow::LEGACY_QUERY_ID; -pub(crate) use crate::worker::heartbeat::WorkerHeartbeatData; +use crate::worker::heartbeat::WorkerHeartbeatManager; use crate::{ ActivityHeartbeat, CompleteActivityError, PollError, WorkerTrait, abstractions::{MeteredPermitDealer, PermitDealerContextData, dbg_panic}, @@ -51,7 +51,7 @@ use activities::WorkerActivityTasks; use futures_util::{StreamExt, stream}; use parking_lot::Mutex; use slot_provider::SlotProvider; -use std::time::SystemTime; +use std::sync::OnceLock; use std::{ convert::TryInto, future, @@ -66,6 +66,7 @@ use temporal_sdk_core_api::{ errors::{CompleteNexusError, WorkerValidationError}, worker::PollerBehavior, }; +use temporal_sdk_core_protos::temporal::api::worker::v1::WorkerHeartbeat; use temporal_sdk_core_protos::{ TaskToken, coresdk::{ @@ -82,7 +83,6 @@ use temporal_sdk_core_protos::{ }, }; use tokio::sync::{mpsc::unbounded_channel, watch}; -use tokio::time::MissedTickBehavior; use tokio_stream::wrappers::UnboundedReceiverStream; use tokio_util::sync::CancellationToken; #[cfg(test)] @@ -122,8 +122,8 @@ pub struct Worker { local_activities_complete: Arc, /// Used to track all permits have been released all_permits_tracker: tokio::sync::Mutex, - /// Used to track and shutdown worker heartbeat process - worker_heartbeat_handle: Option>, + /// Used to shutdown the worker heartbeat task + worker_heartbeat: Option, } struct AllPermitsTracker { @@ -276,7 +276,7 @@ impl Worker { sticky_queue_name: Option, client: Arc, telem_instance: Option<&TelemetryInstance>, - heartbeat_data: Option>>, + heartbeat_fn: Option Option + Send + Sync>>>>, ) -> Self { info!(task_queue=%config.task_queue, namespace=%config.namespace, "Initializing worker"); @@ -286,7 +286,7 @@ impl Worker { client, TaskPollers::Real, telem_instance, - heartbeat_data, + heartbeat_fn, ) } @@ -305,11 +305,8 @@ impl Worker { #[cfg(test)] pub(crate) fn new_test(config: WorkerConfig, client: impl WorkerClient + 'static) -> Self { let client = 
Arc::new(client); - let heartbeat_data = Arc::new(Mutex::new(WorkerHeartbeatData::new( - config.clone(), - client.get_identity(), - ))); - Self::new(config, None, client, None, Some(heartbeat_data)) + let heartbeat_fn = Arc::new(OnceLock::new()); + Self::new(config, None, client, None, Some(heartbeat_fn)) } pub(crate) fn new_with_pollers( @@ -318,7 +315,7 @@ impl Worker { client: Arc, task_pollers: TaskPollers, telem_instance: Option<&TelemetryInstance>, - heartbeat_data: Option>>, + heartbeat_fn: Option Option + Send + Sync>>>>, ) -> Self { let (metrics, meter) = if let Some(ti) = telem_instance { ( @@ -498,19 +495,12 @@ impl Worker { let worker_key = Mutex::new(client.workers().register(Box::new(provider))); let sdk_name_and_ver = client.sdk_name_and_version(); - let worker_heartbeat_handle = if let Some(heartbeat_data) = heartbeat_data { - let (reset_tx, reset_rx) = watch::channel(()); - { - let mut data = heartbeat_data.lock(); - data.sdk_name = sdk_name_and_ver.0.clone(); - data.sdk_version = sdk_name_and_ver.1.clone(); - data.start_time = SystemTime::now(); - data.set_reset_tx(reset_tx); - } - Some(create_worker_heartbeat_process( - heartbeat_data.clone(), + let worker_heartbeat = if let Some(heartbeat_fn) = heartbeat_fn { + Some(WorkerHeartbeatManager::new( + config.clone(), + client.get_identity(), + heartbeat_fn, client.clone(), - reset_rx, )) } else { None @@ -572,7 +562,7 @@ impl Worker { la_permits, }), nexus_mgr, - worker_heartbeat_handle, + worker_heartbeat, } } @@ -617,8 +607,8 @@ impl Worker { dbg_panic!("Waiting for all slot permits to release took too long!"); } } - if let Some(jh) = self.worker_heartbeat_handle.as_ref() { - jh.abort(); + if let Some(heartbeat) = self.worker_heartbeat.as_ref() { + heartbeat.shutdown(); } } @@ -913,41 +903,6 @@ fn wft_poller_behavior(config: &WorkerConfig, is_sticky: bool) -> PollerBehavior } } -fn create_worker_heartbeat_process( - data: Arc>, - client: Arc, - reset_rx: watch::Receiver<()>, -) -> 
tokio::task::JoinHandle<()> { - tokio::spawn(async move { - let mut reset_rx = reset_rx; - let mut ticker = tokio::time::interval(data.lock().heartbeat_interval); - ticker.set_missed_tick_behavior(MissedTickBehavior::Delay); - loop { - tokio::select! { - _ = ticker.tick() => { - let heartbeat = if let Some(heartbeat) = data.lock().capture_heartbeat_if_needed() { - heartbeat - } else { - continue - }; - if let Err(e) = client.clone().record_worker_heartbeat(heartbeat).await { - warn!(error=?e, "Network error while sending worker heartbeat"); - if matches!( - e.code(), - tonic::Code::Unimplemented - ) { - return; - } - } - } - _ = reset_rx.changed() => { - ticker.reset(); - } - } - } - }) -} - #[cfg(test)] mod tests { use super::*; From 4b902b69b84cc51ed8a95f3af05c9f9012f02a4f Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Tue, 8 Jul 2025 11:40:14 -0700 Subject: [PATCH 09/13] cargo lint --- core/src/lib.rs | 2 +- core/src/worker/client.rs | 5 +++-- core/src/worker/heartbeat.rs | 8 +++++--- core/src/worker/mod.rs | 23 +++++++++-------------- 4 files changed, 18 insertions(+), 20 deletions(-) diff --git a/core/src/lib.rs b/core/src/lib.rs index a70a9ee76..5c2d2301f 100644 --- a/core/src/lib.rs +++ b/core/src/lib.rs @@ -99,7 +99,7 @@ where let client_ident = client.get_identity().to_owned(); let sticky_q = sticky_q_name_for_worker(&client_ident, &worker_config); - if client_ident == "" { + if client_ident.is_empty(){ bail!("Client identity cannot be empty. 
Either lang or user should be setting this value"); } diff --git a/core/src/worker/client.rs b/core/src/worker/client.rs index f532e2ab5..bf209b40f 100644 --- a/core/src/worker/client.rs +++ b/core/src/worker/client.rs @@ -35,6 +35,7 @@ use temporal_sdk_core_protos::{ }, }; use tonic::IntoRequest; +use crate::worker::heartbeat::HeartbeatFn; type Result = std::result::Result; @@ -49,7 +50,7 @@ pub(crate) struct WorkerClientBag { namespace: String, identity: String, worker_versioning_strategy: WorkerVersioningStrategy, - heartbeat_data: Arc Option + Send + Sync>>>, + heartbeat_data: Arc>, } impl WorkerClientBag { @@ -58,7 +59,7 @@ impl WorkerClientBag { namespace: String, identity: String, worker_versioning_strategy: WorkerVersioningStrategy, - heartbeat_data: Arc Option + Send + Sync>>>, + heartbeat_data: Arc>, ) -> Self { Self { replaceable_client: RwLock::new(client), diff --git a/core/src/worker/heartbeat.rs b/core/src/worker/heartbeat.rs index 11efc8b2e..c9620f19f 100644 --- a/core/src/worker/heartbeat.rs +++ b/core/src/worker/heartbeat.rs @@ -12,6 +12,8 @@ use tokio::task::JoinHandle; use tokio::time::MissedTickBehavior; use uuid::Uuid; +pub(crate) type HeartbeatFn = Box Option + Send + Sync>; + pub(crate) struct WorkerHeartbeatManager { heartbeat_handle: JoinHandle<()>, } @@ -20,7 +22,7 @@ impl WorkerHeartbeatManager { pub(crate) fn new( config: WorkerConfig, identity: String, - heartbeat_fn: Arc Option + Send + Sync>>>, + heartbeat_fn: Arc>, client: Arc, ) -> Self { let sdk_name_and_ver = client.sdk_name_and_version(); @@ -63,9 +65,9 @@ impl WorkerHeartbeatManager { }); let data_clone = data.clone(); - if let Err(_) = heartbeat_fn.set(Box::new(move || { + if heartbeat_fn.set(Box::new(move || { data_clone.lock().capture_heartbeat_if_needed() - })) { + })).is_err() { dbg_panic!( "Failed to set heartbeat_fn, heartbeat_fn should only be set once, when a singular WorkerHeartbeatInfo is created" ); diff --git a/core/src/worker/mod.rs b/core/src/worker/mod.rs index 
14a86c386..b02ed774e 100644 --- a/core/src/worker/mod.rs +++ b/core/src/worker/mod.rs @@ -20,7 +20,7 @@ pub(crate) use activities::{ pub(crate) use wft_poller::WFTPollerShared; pub(crate) use workflow::LEGACY_QUERY_ID; -use crate::worker::heartbeat::WorkerHeartbeatManager; +use crate::worker::heartbeat::{HeartbeatFn, WorkerHeartbeatManager}; use crate::{ ActivityHeartbeat, CompleteActivityError, PollError, WorkerTrait, abstractions::{MeteredPermitDealer, PermitDealerContextData, dbg_panic}, @@ -66,7 +66,6 @@ use temporal_sdk_core_api::{ errors::{CompleteNexusError, WorkerValidationError}, worker::PollerBehavior, }; -use temporal_sdk_core_protos::temporal::api::worker::v1::WorkerHeartbeat; use temporal_sdk_core_protos::{ TaskToken, coresdk::{ @@ -276,7 +275,7 @@ impl Worker { sticky_queue_name: Option, client: Arc, telem_instance: Option<&TelemetryInstance>, - heartbeat_fn: Option Option + Send + Sync>>>>, + heartbeat_fn: Option>>, ) -> Self { info!(task_queue=%config.task_queue, namespace=%config.namespace, "Initializing worker"); @@ -315,7 +314,7 @@ impl Worker { client: Arc, task_pollers: TaskPollers, telem_instance: Option<&TelemetryInstance>, - heartbeat_fn: Option Option + Send + Sync>>>>, + heartbeat_fn: Option>>, ) -> Self { let (metrics, meter) = if let Some(ti) = telem_instance { ( @@ -495,16 +494,12 @@ impl Worker { let worker_key = Mutex::new(client.workers().register(Box::new(provider))); let sdk_name_and_ver = client.sdk_name_and_version(); - let worker_heartbeat = if let Some(heartbeat_fn) = heartbeat_fn { - Some(WorkerHeartbeatManager::new( - config.clone(), - client.get_identity(), - heartbeat_fn, - client.clone(), - )) - } else { - None - }; + let worker_heartbeat = heartbeat_fn.map(|heartbeat_fn| WorkerHeartbeatManager::new( + config.clone(), + client.get_identity(), + heartbeat_fn, + client.clone(), + )); Self { worker_key, From ba8836084d08131a923284b40fb63258c1bdb168 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Tue, 8 Jul 2025 16:05:42 
-0700 Subject: [PATCH 10/13] cargo fmt, and test that manually creates MockWorkerClient --- core/src/core_tests/workers.rs | 2 ++ core/src/lib.rs | 2 +- core/src/worker/client.rs | 2 +- core/src/worker/heartbeat.rs | 9 ++++++--- core/src/worker/mod.rs | 14 ++++++++------ 5 files changed, 18 insertions(+), 11 deletions(-) diff --git a/core/src/core_tests/workers.rs b/core/src/core_tests/workers.rs index 2625ea6b1..475f3d1d3 100644 --- a/core/src/core_tests/workers.rs +++ b/core/src/core_tests/workers.rs @@ -314,6 +314,8 @@ async fn worker_shutdown_api(#[case] use_cache: bool, #[case] api_success: bool) mock.expect_is_mock().returning(|| true); mock.expect_sdk_name_and_version() .returning(|| ("test-core".to_string(), "0.0.0".to_string())); + mock.expect_get_identity() + .returning(|| "test-identity".to_string()); if use_cache { if api_success { mock.expect_shutdown_worker() diff --git a/core/src/lib.rs b/core/src/lib.rs index 5c2d2301f..12629c59b 100644 --- a/core/src/lib.rs +++ b/core/src/lib.rs @@ -99,7 +99,7 @@ where let client_ident = client.get_identity().to_owned(); let sticky_q = sticky_q_name_for_worker(&client_ident, &worker_config); - if client_ident.is_empty(){ + if client_ident.is_empty() { bail!("Client identity cannot be empty. 
Either lang or user should be setting this value"); } diff --git a/core/src/worker/client.rs b/core/src/worker/client.rs index bf209b40f..6e4e41609 100644 --- a/core/src/worker/client.rs +++ b/core/src/worker/client.rs @@ -3,6 +3,7 @@ pub(crate) mod mocks; use crate::abstractions::dbg_panic; use crate::protosext::legacy_query_failure; +use crate::worker::heartbeat::HeartbeatFn; use parking_lot::RwLock; use std::sync::OnceLock; use std::{sync::Arc, time::Duration}; @@ -35,7 +36,6 @@ use temporal_sdk_core_protos::{ }, }; use tonic::IntoRequest; -use crate::worker::heartbeat::HeartbeatFn; type Result = std::result::Result; diff --git a/core/src/worker/heartbeat.rs b/core/src/worker/heartbeat.rs index c9620f19f..d59c7be3d 100644 --- a/core/src/worker/heartbeat.rs +++ b/core/src/worker/heartbeat.rs @@ -65,9 +65,12 @@ impl WorkerHeartbeatManager { }); let data_clone = data.clone(); - if heartbeat_fn.set(Box::new(move || { - data_clone.lock().capture_heartbeat_if_needed() - })).is_err() { + if heartbeat_fn + .set(Box::new(move || { + data_clone.lock().capture_heartbeat_if_needed() + })) + .is_err() + { dbg_panic!( "Failed to set heartbeat_fn, heartbeat_fn should only be set once, when a singular WorkerHeartbeatInfo is created" ); diff --git a/core/src/worker/mod.rs b/core/src/worker/mod.rs index b02ed774e..f21d203a9 100644 --- a/core/src/worker/mod.rs +++ b/core/src/worker/mod.rs @@ -494,12 +494,14 @@ impl Worker { let worker_key = Mutex::new(client.workers().register(Box::new(provider))); let sdk_name_and_ver = client.sdk_name_and_version(); - let worker_heartbeat = heartbeat_fn.map(|heartbeat_fn| WorkerHeartbeatManager::new( - config.clone(), - client.get_identity(), - heartbeat_fn, - client.clone(), - )); + let worker_heartbeat = heartbeat_fn.map(|heartbeat_fn| { + WorkerHeartbeatManager::new( + config.clone(), + client.get_identity(), + heartbeat_fn, + client.clone(), + ) + }); Self { worker_key, From 51d0de0fd8f6c13d0ad5dbae56ea6b392c62d51a Mon Sep 17 00:00:00 2001 
From: Andrew Yuan Date: Tue, 8 Jul 2025 16:20:24 -0700 Subject: [PATCH 11/13] use notify instead of watch::channel --- core/src/worker/heartbeat.rs | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/core/src/worker/heartbeat.rs b/core/src/worker/heartbeat.rs index d59c7be3d..f4e00a96b 100644 --- a/core/src/worker/heartbeat.rs +++ b/core/src/worker/heartbeat.rs @@ -7,7 +7,7 @@ use std::sync::{Arc, OnceLock}; use std::time::{Duration, SystemTime}; use temporal_sdk_core_api::worker::WorkerConfig; use temporal_sdk_core_protos::temporal::api::worker::v1::{WorkerHeartbeat, WorkerHostInfo}; -use tokio::sync::watch; +use tokio::sync::Notify; use tokio::task::JoinHandle; use tokio::time::MissedTickBehavior; use uuid::Uuid; @@ -26,17 +26,16 @@ impl WorkerHeartbeatManager { client: Arc, ) -> Self { let sdk_name_and_ver = client.sdk_name_and_version(); - let (reset_tx, reset_rx) = watch::channel(()); + let reset_notify = Arc::new(Notify::new()); let data = Arc::new(Mutex::new(WorkerHeartbeatData::new( config, identity, sdk_name_and_ver, - reset_tx, + reset_notify.clone(), ))); let data_clone = data.clone(); let heartbeat_handle = tokio::spawn(async move { - let mut reset_rx = reset_rx; let mut ticker = tokio::time::interval(data_clone.lock().heartbeat_interval); ticker.set_missed_tick_behavior(MissedTickBehavior::Delay); loop { @@ -57,7 +56,7 @@ impl WorkerHeartbeatManager { } } } - _ = reset_rx.changed() => { + _ = reset_notify.notified() => { ticker.reset(); } } @@ -99,7 +98,7 @@ struct WorkerHeartbeatData { /// Worker start time start_time: SystemTime, heartbeat_interval: Duration, - reset_tx: watch::Sender<()>, + reset_notify: Arc, } impl WorkerHeartbeatData { @@ -107,7 +106,7 @@ impl WorkerHeartbeatData { worker_config: WorkerConfig, worker_identity: String, sdk_name_and_ver: (String, String), - reset_tx: watch::Sender<()>, + reset_notify: Arc, ) -> Self { Self { worker_identity, @@ -123,7 +122,7 @@ impl WorkerHeartbeatData { 
heartbeat_time: None, worker_instance_key: Uuid::new_v4().to_string(), heartbeat_interval: worker_config.heartbeat_interval, - reset_tx, + reset_notify, } } @@ -148,7 +147,7 @@ impl WorkerHeartbeatData { self.heartbeat_time = Some(now); - let _ = self.reset_tx.send(()); + self.reset_notify.notify_one(); Some(WorkerHeartbeat { worker_instance_key: self.worker_instance_key.clone(), From ac3503342070a68b879e4f2a57fef0b39d5041a2 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Wed, 9 Jul 2025 13:28:47 -0700 Subject: [PATCH 12/13] Update API, use Vec instead of Optional --- core/src/worker/client.rs | 4 ++-- sdk-core-protos/protos/api_upstream/buf.yaml | 2 -- .../protos/api_upstream/openapi/openapiv2.json | 12 ++++++++++-- .../protos/api_upstream/openapi/openapiv3.yaml | 16 ++++++++++++++-- .../temporal/api/worker/v1/message.proto | 12 ++++++++++-- .../workflowservice/v1/request_response.proto | 4 ++-- 6 files changed, 38 insertions(+), 12 deletions(-) diff --git a/core/src/worker/client.rs b/core/src/worker/client.rs index 6e4e41609..8ba1502a4 100644 --- a/core/src/worker/client.rs +++ b/core/src/worker/client.rs @@ -366,7 +366,7 @@ impl WorkerClient for WorkerClientBag { identity: self.identity.clone(), worker_version_capabilities: self.worker_version_capabilities(), deployment_options: self.deployment_options(), - worker_heartbeat: self.capture_heartbeat(), + worker_heartbeat: self.capture_heartbeat().into_iter().collect(), } .into_request(); request.extensions_mut().insert(IsWorkerTaskLongPoll); @@ -679,7 +679,7 @@ impl WorkerClient for WorkerClientBag { .record_worker_heartbeat(RecordWorkerHeartbeatRequest { namespace: self.namespace.clone(), identity: self.identity.clone(), - worker_heartbeat: Some(heartbeat), + worker_heartbeat: vec![heartbeat], }) .await? 
.into_inner()) diff --git a/sdk-core-protos/protos/api_upstream/buf.yaml b/sdk-core-protos/protos/api_upstream/buf.yaml index 9f94a9edc..e984c1439 100644 --- a/sdk-core-protos/protos/api_upstream/buf.yaml +++ b/sdk-core-protos/protos/api_upstream/buf.yaml @@ -13,8 +13,6 @@ breaking: - WIRE_JSON ignore: - google - # TODO (yuri) remove this - - temporal/api/workflow/v1/message.proto lint: use: - DEFAULT diff --git a/sdk-core-protos/protos/api_upstream/openapi/openapiv2.json b/sdk-core-protos/protos/api_upstream/openapi/openapiv2.json index 0a169fbc6..6da2b5b0f 100644 --- a/sdk-core-protos/protos/api_upstream/openapi/openapiv2.json +++ b/sdk-core-protos/protos/api_upstream/openapi/openapiv2.json @@ -7606,7 +7606,11 @@ "description": "The identity of the client who initiated this request." }, "workerHeartbeat": { - "$ref": "#/definitions/v1WorkerHeartbeat" + "type": "array", + "items": { + "type": "object", + "$ref": "#/definitions/v1WorkerHeartbeat" + } } } }, @@ -15410,9 +15414,13 @@ "type": "string", "description": "Worker host identifier." }, + "processKey": { + "type": "string", + "title": "Worker process identifier. This id should be unique for all _processes_\nrunning workers in the namespace, and should be shared by all workers\nin the same process.\nThis will be used to build the worker command nexus task queue name:\n\"temporal-sys/worker-commands/{process_key}\"" + }, "processId": { "type": "string", - "description": "Worker process identifier, should be unique for the host." + "description": "Worker process identifier. Unlike process_key, this id only needs to be unique\nwithin one host (so using e.g. a unix pid would be appropriate)." 
}, "currentHostCpuUsage": { "type": "number", diff --git a/sdk-core-protos/protos/api_upstream/openapi/openapiv3.yaml b/sdk-core-protos/protos/api_upstream/openapi/openapiv3.yaml index 6bb75b4f0..5c3cccb7b 100644 --- a/sdk-core-protos/protos/api_upstream/openapi/openapiv3.yaml +++ b/sdk-core-protos/protos/api_upstream/openapi/openapiv3.yaml @@ -9723,7 +9723,9 @@ components: type: string description: The identity of the client who initiated this request. workerHeartbeat: - $ref: '#/components/schemas/WorkerHeartbeat' + type: array + items: + $ref: '#/components/schemas/WorkerHeartbeat' RecordWorkerHeartbeatResponse: type: object properties: {} @@ -12797,9 +12799,19 @@ components: hostName: type: string description: Worker host identifier. + processKey: + type: string + description: |- + Worker process identifier. This id should be unique for all _processes_ + running workers in the namespace, and should be shared by all workers + in the same process. + This will be used to build the worker command nexus task queue name: + "temporal-sys/worker-commands/{process_key}" processId: type: string - description: Worker process identifier, should be unique for the host. + description: |- + Worker process identifier. Unlike process_key, this id only needs to be unique + within one host (so using e.g. a unix pid would be appropriate). currentHostCpuUsage: type: number description: |- diff --git a/sdk-core-protos/protos/api_upstream/temporal/api/worker/v1/message.proto b/sdk-core-protos/protos/api_upstream/temporal/api/worker/v1/message.proto index 024357ce5..f5ad9ebe0 100644 --- a/sdk-core-protos/protos/api_upstream/temporal/api/worker/v1/message.proto +++ b/sdk-core-protos/protos/api_upstream/temporal/api/worker/v1/message.proto @@ -54,7 +54,16 @@ message WorkerHostInfo { // Worker host identifier. string host_name = 1; - // Worker process identifier, should be unique for the host. + + // Worker process identifier. 
This id should be unique for all _processes_ + // running workers in the namespace, and should be shared by all workers + // in the same process. + // This will be used to build the worker command nexus task queue name: + // "temporal-sys/worker-commands/{process_key}" + string process_key = 5; + + // Worker process identifier. Unlike process_key, this id only needs to be unique + // within one host (so using e.g. a unix pid would be appropriate). string process_id = 2; // System used CPU as a float in the range [0.0, 1.0] where 1.0 is defined as all @@ -78,7 +87,6 @@ message WorkerHeartbeat { // Usually host_name+(user group name)+process_id, but can be overwritten by the user. string worker_identity = 2; - // Worker host information. WorkerHostInfo host_info = 3; diff --git a/sdk-core-protos/protos/api_upstream/temporal/api/workflowservice/v1/request_response.proto b/sdk-core-protos/protos/api_upstream/temporal/api/workflowservice/v1/request_response.proto index 718ddc666..180dbba75 100644 --- a/sdk-core-protos/protos/api_upstream/temporal/api/workflowservice/v1/request_response.proto +++ b/sdk-core-protos/protos/api_upstream/temporal/api/workflowservice/v1/request_response.proto @@ -1772,7 +1772,7 @@ message PollNexusTaskQueueRequest { temporal.api.deployment.v1.WorkerDeploymentOptions deployment_options = 6; // Worker info to be sent to the server. - temporal.api.worker.v1.WorkerHeartbeat worker_heartbeat = 7; + repeated temporal.api.worker.v1.WorkerHeartbeat worker_heartbeat = 7; } message PollNexusTaskQueueResponse { @@ -2384,7 +2384,7 @@ message RecordWorkerHeartbeatRequest { // The identity of the client who initiated this request. 
string identity = 2; - temporal.api.worker.v1.WorkerHeartbeat worker_heartbeat = 3; + repeated temporal.api.worker.v1.WorkerHeartbeat worker_heartbeat = 3; } message RecordWorkerHeartbeatResponse { From 0190cb77d82668c1abb25c765f78cd23be2fade7 Mon Sep 17 00:00:00 2001 From: Andrew Yuan Date: Wed, 9 Jul 2025 15:06:02 -0700 Subject: [PATCH 13/13] Use tokio::time::advance, only log warning if server supports feature --- core/Cargo.toml | 1 + core/src/worker/heartbeat.rs | 13 +++++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/core/Cargo.toml b/core/Cargo.toml index b8e18b16e..a62fa20b8 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -96,6 +96,7 @@ criterion = { version = "0.6", features = ["async", "async_tokio"] } rstest = "0.25" temporal-sdk-core-test-utils = { path = "../test-utils" } temporal-sdk = { path = "../sdk" } +tokio = { version = "1.37", features = ["rt", "rt-multi-thread", "parking_lot", "time", "fs", "process", "test-util"] } tokio-stream = { version = "0.1", features = ["net"] } [[test]] diff --git a/core/src/worker/heartbeat.rs b/core/src/worker/heartbeat.rs index f4e00a96b..4ca01ff67 100644 --- a/core/src/worker/heartbeat.rs +++ b/core/src/worker/heartbeat.rs @@ -47,13 +47,13 @@ impl WorkerHeartbeatManager { continue }; if let Err(e) = client.clone().record_worker_heartbeat(heartbeat).await { - warn!(error=?e, "Network error while sending worker heartbeat"); if matches!( e.code(), tonic::Code::Unimplemented ) { return; } + warn!(error=?e, "Network error while sending worker heartbeat"); } } _ = reset_notify.notified() => { @@ -134,7 +134,6 @@ impl WorkerHeartbeatData { // Only send poll data if it's nearly been a full interval since this data has been sent // In this case, "nearly" is 90% of the interval if dur.as_secs_f64() < 0.9 * self.heartbeat_interval.as_secs_f64() { - println!("Heartbeat interval not yet elapsed, not sending poll data"); return None; } Some(PbDuration { @@ -213,12 +212,18 @@ mod tests { 
heartbeat_fn.get().unwrap()(); // heartbeat timer fires once - tokio::time::sleep(Duration::from_millis(300)).await; + advance_time(Duration::from_millis(300)).await; // it hasn't been >90% of the interval since the last heartbeat, so no data should be returned here assert_eq!(None, heartbeat_fn.get().unwrap()()); // heartbeat timer fires once - tokio::time::sleep(Duration::from_millis(150)).await; + advance_time(Duration::from_millis(300)).await; worker.drain_activity_poller_and_shutdown().await; } + + async fn advance_time(dur: Duration) { + tokio::time::pause(); + tokio::time::advance(dur).await; + tokio::time::resume(); + } }