Skip to content

Commit ca6e7a2

Browse files
Merge pull request #9101 from romayalon/romy-5.18-backports
Backports to 5.18.6 | NC
2 parents 020bde4 + a3fc7ed commit ca6e7a2

File tree

13 files changed

+501
-200
lines changed

13 files changed

+501
-200
lines changed

config.js

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1003,6 +1003,8 @@ config.ENDPOINT_SSL_PORT = Number(process.env.ENDPOINT_SSL_PORT) || 6443;
10031003
// Remove the NSFS condition when NSFS starts to support STS.
10041004
config.ENDPOINT_SSL_STS_PORT = Number(process.env.ENDPOINT_SSL_STS_PORT) || (process.env.NC_NSFS_NO_DB_ENV === 'true' ? -1 : 7443);
10051005
config.ENDPOINT_SSL_IAM_PORT = Number(process.env.ENDPOINT_SSL_IAM_PORT) || -1;
1006+
// each fork will get port in range [ENDPOINT_FORK_PORT_BASE, ENDPOINT_FORK_PORT_BASE + number of forks - 1)]
1007+
config.ENDPOINT_FORK_PORT_BASE = Number(process.env.ENDPOINT_FORK_PORT_BASE) || 6002;
10061008
config.ALLOW_HTTP = false;
10071009
config.ALLOW_HTTP_METRICS = true;
10081010
config.ALLOW_HTTPS_METRICS = true;
@@ -1015,6 +1017,8 @@ config.VIRTUAL_HOSTS = process.env.VIRTUAL_HOSTS || '';
10151017

10161018
config.NC_HEALTH_ENDPOINT_RETRY_COUNT = 3;
10171019
config.NC_HEALTH_ENDPOINT_RETRY_DELAY = 10;
1020+
config.NC_FORK_SERVER_TIMEOUT = 5; // 5 minutes
1021+
config.NC_FORK_SERVER_RETRIES = 10;
10181022

10191023

10201024
/** @type {'file' | 'executable'} */
@@ -1060,6 +1064,7 @@ config.NC_LIFECYCLE_BUCKET_BATCH_SIZE = 10000;
10601064
config.NC_LIFECYCLE_GPFS_ILM_ENABLED = true;
10611065
config.NC_LIFECYCLE_GPFS_ALLOW_SCAN_ON_REMOTE = true;
10621066
config.NC_GPFS_BIN_DIR = '/usr/lpp/mmfs/bin/';
1067+
config.NC_LIFECYCLE_GPFS_MMAPPLY_ILM_POLICY_CONCURRENCY = 1;
10631068

10641069
////////// GPFS //////////
10651070
config.GPFS_DOWN_DELAY = 1000;

src/endpoint/endpoint.js

Lines changed: 54 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ const { SemaphoreMonitor } = require('../server/bg_services/semaphore_monitor');
4343
const prom_reporting = require('../server/analytic_services/prometheus_reporting');
4444
const { PersistentLogger } = require('../util/persistent_logger');
4545
const { get_notification_logger } = require('../util/notifications_util');
46+
const { is_nc_environment } = require('../nc/nc_utils');
4647
const NoobaaEvent = require('../manage_nsfs/manage_nsfs_events_utils').NoobaaEvent;
4748
const cluster = /** @type {import('node:cluster').Cluster} */ (
4849
/** @type {unknown} */ (require('node:cluster'))
@@ -57,7 +58,8 @@ const SERVICES_TYPES_ENUM = Object.freeze({
5758
S3: 'S3',
5859
STS: 'STS',
5960
IAM: 'IAM',
60-
METRICS: 'METRICS'
61+
METRICS: 'METRICS',
62+
FORK_HEALTH: 'FORK_HEALTH',
6163
});
6264

6365
const new_umask = process.env.NOOBAA_ENDPOINT_UMASK || 0o000;
@@ -117,11 +119,11 @@ async function main(options = {}) {
117119
const https_metrics_port = options.https_metrics_port || config.EP_METRICS_SERVER_SSL_PORT;
118120
/**
119121
* Please notice that we can run the main in 2 states:
120-
* 1. Only the primary process runs the main (fork is 0 or undefined) - everything that
122+
* 1. Only the primary process runs the main (fork is 0 or undefined) - everything that
121123
* is implemented here would be run by this process.
122-
* 2. A primary process with multiple forks (IMPORTANT) - if there is implementation that
123-
* in only relevant to the primary process it should be implemented in
124-
* fork_utils.start_workers because the primary process returns after start_workers
124+
* 2. A primary process with multiple forks (IMPORTANT) - if there is implementation that
125+
* in only relevant to the primary process it should be implemented in
126+
* fork_utils.start_workers because the primary process returns after start_workers
125127
* and the forks will continue executing the code lines in this function
126128
* */
127129
const is_workers_started_from_primary = await fork_utils.start_workers(http_metrics_port, https_metrics_port,
@@ -202,14 +204,29 @@ async function main(options = {}) {
202204
{ ...options, https_port: https_port_s3, http_port: http_port_s3, virtual_hosts, bucket_logger, notification_logger });
203205
await start_endpoint_server_and_cert(SERVICES_TYPES_ENUM.STS, init_request_sdk, { https_port: https_port_sts, virtual_hosts });
204206
await start_endpoint_server_and_cert(SERVICES_TYPES_ENUM.IAM, init_request_sdk, { https_port: https_port_iam });
205-
207+
const is_nc = is_nc_environment();
208+
// fork health server currently runs only on non containerized enviorment
209+
if (is_nc) {
210+
// current process is the primary and only fork. start the fork server directly with the base port
211+
if (cluster.isPrimary) {
212+
await fork_message_request_handler({
213+
nsfs_config_root: options.nsfs_config_root,
214+
health_port: config.ENDPOINT_FORK_PORT_BASE
215+
});
216+
// current process is a worker so we listen to get the port from the primary process.
217+
} else {
218+
process.on('message', fork_message_request_handler);
219+
//send a message to the primary process that we are ready to receive messages
220+
process.send({ready_to_start_fork_server: true});
221+
}
222+
}
206223

207224
// START METRICS SERVER
208225
if ((http_metrics_port > 0 || https_metrics_port > 0) && cluster.isPrimary) {
209226
await prom_reporting.start_server(http_metrics_port, https_metrics_port, false, options.nsfs_config_root);
210227
}
211228

212-
// TODO: currently NC NSFS deployments don't have internal_rpc_client nor db,
229+
// TODO: currently NC NSFS deployments don't have internal_rpc_client nor db,
213230
// there for namespace monitor won't be registered
214231
if (internal_rpc_client && config.NAMESPACE_MONITOR_ENABLED) {
215232
endpoint_stats_collector.instance().set_rpc_client(internal_rpc_client);
@@ -293,8 +310,6 @@ function create_endpoint_handler(server_type, init_request_sdk, { virtual_hosts,
293310
return blob_rest_handler(req, res);
294311
} else if (req.url.startsWith('/total_fork_count')) {
295312
return fork_count_handler(req, res);
296-
} else if (req.url.startsWith('/endpoint_fork_id')) {
297-
return endpoint_fork_id_handler(req, res);
298313
} else if (req.url.startsWith('/_/')) {
299314
// internals non S3 requests
300315
const api = req.url.slice('/_/'.length);
@@ -535,8 +550,38 @@ function unavailable_handler(req, res) {
535550
res.end(reply);
536551
}
537552

553+
/**
554+
* handler for the inidivdual fork server. used to handle requests the get the worker id
555+
* currently used to check if fork is alive by the health script
556+
* @param {EndpointRequest} req
557+
* @param {import('http').ServerResponse} res
558+
*/
559+
function fork_main_handler(req, res) {
560+
endpoint_utils.set_noobaa_server_header(res);
561+
endpoint_utils.prepare_rest_request(req);
562+
if (req.url.startsWith('/endpoint_fork_id')) {
563+
return endpoint_fork_id_handler(req, res);
564+
} else {
565+
return internal_api_error(req, res, `Unknown API call ${req.url}`);
566+
}
567+
}
568+
569+
/**
570+
* fork_message_request_handler is used to handle messages from the primary process.
571+
* the primary process sends a message with the designated port to start the fork server.
572+
* @param {Object} msg
573+
*/
574+
async function fork_message_request_handler(msg) {
575+
await http_utils.start_https_server(msg.health_port,
576+
SERVICES_TYPES_ENUM.FORK_HEALTH,
577+
fork_main_handler,
578+
msg.nsfs_config_root
579+
);
580+
}
581+
538582
exports.main = main;
539583
exports.create_endpoint_handler = create_endpoint_handler;
540584
exports.create_init_request_sdk = create_init_request_sdk;
585+
exports.endpoint_fork_id_handler = endpoint_fork_id_handler;
541586

542587
if (require.main === module) main();

src/manage_nsfs/health.js

Lines changed: 36 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ process.env.AWS_SDK_JS_SUPPRESS_MAINTENANCE_MODE_MESSAGE = '1';
101101
class NSFSHealth {
102102
constructor(options) {
103103
this.https_port = options.https_port;
104+
this.fork_base_port = options.fork_base_port;
104105
this.all_account_details = options.all_account_details;
105106
this.all_bucket_details = options.all_bucket_details;
106107
this.all_connection_details = options.all_connection_details;
@@ -241,10 +242,10 @@ class NSFSHealth {
241242
return service_health;
242243
}
243244

244-
async make_endpoint_health_request(url_path) {
245+
async make_endpoint_health_request(url_path, port = this.https_port) {
245246
const response = await make_https_request({
246-
HOSTNAME,
247-
port: this.https_port,
247+
hostname: HOSTNAME,
248+
port,
248249
path: url_path,
249250
method: 'GET',
250251
rejectUnauthorized: false,
@@ -260,43 +261,37 @@ class NSFSHealth {
260261
let url_path = '/total_fork_count';
261262
const worker_ids = [];
262263
let total_fork_count = 0;
264+
let fork_count_response;
263265
let response;
264266
try {
265-
const fork_count_response = await this.make_endpoint_health_request(url_path);
266-
if (!fork_count_response) {
267-
return {
268-
response: fork_response_code.NOT_RUNNING,
269-
total_fork_count: total_fork_count,
270-
running_workers: worker_ids,
271-
};
272-
}
273-
total_fork_count = fork_count_response.fork_count;
274-
if (total_fork_count > 0) {
275-
url_path = '/endpoint_fork_id';
276-
await P.retry({
277-
attempts: total_fork_count * 2,
278-
delay_ms: 1,
279-
func: async () => {
280-
const fork_id_response = await this.make_endpoint_health_request(url_path);
281-
if (fork_id_response.worker_id && !worker_ids.includes(fork_id_response.worker_id)) {
282-
worker_ids.push(fork_id_response.worker_id);
283-
}
284-
if (worker_ids.length < total_fork_count) {
285-
throw new Error('Number of running forks is less than the expected fork count.');
286-
}
287-
}
288-
});
289-
if (worker_ids.length === total_fork_count) {
290-
response = fork_response_code.RUNNING;
291-
} else {
292-
response = fork_response_code.MISSING_FORKS;
293-
}
294-
} else {
295-
response = fork_response_code.RUNNING;
296-
}
267+
fork_count_response = await this.make_endpoint_health_request(url_path);
297268
} catch (err) {
298-
dbg.log1('Error while pinging endpoint host :' + HOSTNAME + ', port ' + this.https_port, err);
299-
response = fork_response_code.NOT_RUNNING;
269+
dbg.log0('Error while pinging endpoint host :' + HOSTNAME, err);
270+
}
271+
if (!fork_count_response) {
272+
return {
273+
response: fork_response_code.NOT_RUNNING,
274+
total_fork_count: total_fork_count,
275+
running_workers: worker_ids,
276+
};
277+
}
278+
279+
total_fork_count = fork_count_response.fork_count;
280+
url_path = '/endpoint_fork_id';
281+
for (let i = 0; i < total_fork_count; i++) {
282+
const port = this.fork_base_port + i;
283+
try {
284+
const fork_id_response = await this.make_endpoint_health_request(url_path, port);
285+
worker_ids.push(fork_id_response.worker_id);
286+
} catch (err) {
287+
dbg.log0('Error while pinging fork :' + HOSTNAME + ', port ' + port, err);
288+
}
289+
}
290+
if (worker_ids.length < total_fork_count) {
291+
dbg.log0('Number of running forks is less than the expected fork count.');
292+
response = fork_response_code.MISSING_FORKS;
293+
} else {
294+
response = fork_response_code.RUNNING;
300295
}
301296
return {
302297
response: response,
@@ -637,6 +632,7 @@ class NSFSHealth {
637632
async function get_health_status(argv, config_fs) {
638633
try {
639634
const https_port = Number(argv.https_port) || config.ENDPOINT_SSL_PORT;
635+
const fork_base_port = Number(argv.fork_base_port) || config.ENDPOINT_FORK_PORT_BASE;
640636
const deployment_type = argv.deployment_type || 'nc';
641637
const all_account_details = get_boolean_or_string_value(argv.all_account_details);
642638
const all_bucket_details = get_boolean_or_string_value(argv.all_bucket_details);
@@ -645,8 +641,9 @@ async function get_health_status(argv, config_fs) {
645641
const lifecycle = get_boolean_or_string_value(argv.lifecycle);
646642

647643
if (deployment_type === 'nc') {
648-
const health = new NSFSHealth({ https_port,
649-
all_account_details, all_bucket_details, all_connection_details, notif_storage_threshold, lifecycle, config_fs });
644+
const health = new NSFSHealth({ https_port, fork_base_port,
645+
all_account_details, all_bucket_details, all_connection_details,
646+
notif_storage_threshold, lifecycle, config_fs });
650647
const health_status = await health.nc_nsfs_health();
651648
write_stdout_response(ManageCLIResponse.HealthStatus, health_status);
652649
} else {

0 commit comments

Comments
 (0)