|
7 | 7 | from kubernetes.client import Configuration
|
8 | 8 |
|
9 | 9 | from ydb.tools.ydbd_slice import nodes
|
10 |
| -from ydb.tools.ydbd_slice.kube import api, kubectl, yaml, generate, cms, dynconfig, docker |
| 10 | +from ydb.tools.ydbd_slice.kube import api, kubectl, yaml, generate, cms, dynconfig, docker, utils |
11 | 11 |
|
12 | 12 |
|
13 | 13 | logger = logging.getLogger(__name__)
|
@@ -86,6 +86,36 @@ def get_all_manifests(directory):
|
86 | 86 | return result
|
87 | 87 |
|
88 | 88 |
|
def get_job_manifests(directory):
    """Load all Kubernetes manifests from yaml files directly inside *directory*.

    Scans the directory (non-recursively) for ``.yaml``/``.yml`` files, parses
    each one, and keeps only documents that pass ``utils.is_kubernetes_manifest``.
    Files that fail to open or parse are logged and skipped rather than aborting
    the whole scan.

    Args:
        directory (str): Path of the directory containing job manifest files.

    Returns:
        list[tuple]: One ``(path, api_version, kind, namespace, name, data)``
        tuple per manifest. ``kind`` is lower-cased; ``namespace`` may be
        ``None`` when the manifest does not specify one.

    Raises:
        RuntimeError: If no valid Kubernetes manifest is found in the directory.
    """
    result = []
    for entry in os.listdir(directory):
        path = os.path.abspath(os.path.join(directory, entry))
        if not entry.endswith(('.yaml', '.yml')):
            logger.info('skipping file: %s, not yaml file extension', path)
            continue
        try:
            # NOTE: distinct name for the handle; the original shadowed the
            # loop variable (and the py2 builtin) with ``file``.
            with open(path) as manifest_file:
                data = yaml.load(manifest_file)
        except Exception as e:
            logger.error('failed to open and parse file: %s, error: %s', path, str(e))
            continue

        if not utils.is_kubernetes_manifest(data):
            logger.info('skipping file: %s, not kubernetes manifest', path)
            continue

        result.append((
            path,
            data['apiVersion'],
            data['kind'].lower(),
            data['metadata'].get('namespace'),
            data['metadata']['name'],
            data,
        ))

    if not result:
        raise RuntimeError(f'failed to find any manifests in {os.path.abspath(directory)}')

    return result
| 117 | + |
| 118 | + |
89 | 119 | def validate_components_selector(value):
|
90 | 120 | if not re.match(r'^[a-zA-Z][a-zA-Z0-9\-]*$', value):
|
91 | 121 | raise ValueError('invalid value: %s' % value)
|
@@ -157,6 +187,45 @@ def get_domain(api_client, project_path, manifests):
|
157 | 187 | return data['spec']['domain']
|
158 | 188 |
|
159 | 189 |
|
def get_namespace_nodeclaim_image(manifests):
    """
    Extracts the namespace, name, and image name from the first suitable nodeclaim manifest.

    Args:
        manifests (list): A list of tuples, where each tuple contains:
            - path (str): The file path of the manifest.
            - api_version (str): The API version of the manifest.
            - kind (str): The kind of the manifest.
            - namespace (str): The namespace of the manifest.
            - name (str): The name of the manifest.
            - data (dict): The data of the manifest.

    Returns:
        tuple: A tuple containing:
            - namespace (str): The namespace of the nodeclaim.
            - name (str): The name of the nodeclaim.
            - image_name (str): The image name specified in the nodeclaim.

    Raises:
        RuntimeError: If no suitable nodeclaim or storage manifest is found.
    """
    nodeclaim_namespace, nodeclaim_name, image_name = "", "", ""
    for _path, _, kind, namespace, name, data in utils.filter_manifests(manifests, 'ydb.tech/v1alpha1', ['nodeclaim', 'storage']):
        if kind == 'nodeclaim' and not nodeclaim_name:
            # Remember the first nodeclaim we see.
            nodeclaim_namespace = namespace
            nodeclaim_name = name
        elif kind == 'storage' and not image_name:
            try:
                image_name = data['spec']['image']['name']
            except KeyError:
                # Storage manifest without an explicit image; keep looking.
                pass
        # BUG FIX: the original tested the loop variable ``namespace`` (the
        # namespace of the manifest iterated *last*) instead of the namespace
        # captured from the nodeclaim manifest, and could also fall off the
        # end of the function returning None, or raise NameError on an empty
        # manifest list.
        if nodeclaim_namespace and nodeclaim_name and image_name:
            return nodeclaim_namespace, nodeclaim_name, image_name

    # Reaching here means at least one of the three values was never found.
    raise RuntimeError(
        f"No suitable nodeclaim or storage manifest found. "
        f"Namespace: {nodeclaim_namespace}, NodeClaim: {nodeclaim_name}, Image: {image_name}"
    )
| 227 | + |
| 228 | + |
160 | 229 | def manifests_ydb_set_image(project_path, manifests, image):
|
161 | 230 | for (path, api_version, kind, namespace, name, data) in manifests:
|
162 | 231 | if not (kind in ['storage', 'database'] and api_version in ['ydb.tech/v1alpha1']):
|
@@ -254,21 +323,78 @@ def slice_nodeclaim_nodes(api_client, project_path, manifests):
|
254 | 323 |
|
255 | 324 |
|
def slice_nodeclaim_format(api_client, project_path, manifests):
    """
    Formats and processes node claims by creating, waiting for completion, and deleting Kubernetes jobs.

    This function performs the following steps:
    1. Creates a directory for job manifests if it doesn't exist.
    2. Retrieves the namespace, node claim, and YDB image from the provided manifests.
    3. Retrieves the list of nodes.
    4. Generates obliterate jobs and saves them to the jobs directory.
    5. Creates and waits for the completion of each job.
    6. Deletes the jobs and their associated pods.

    Args:
        api_client (object): The Kubernetes API client.
        project_path (str): The path to the project directory.
        manifests (list): The manifests containing the namespace, node claim, and YDB image information.

    Raises:
        SystemExit: If no namespace or node claim is found, or if no nodes are found,
            or if job processing/cleanup fails.
        TimeoutError: If some jobs do not complete within the expected time.

    Note:
        This function logs errors and exits the program if critical issues are encountered.
    """
    jobs_path = os.path.join(project_path, 'jobs')
    os.makedirs(jobs_path, exist_ok=True)

    namespace, nodeclaim, ydb_image = get_namespace_nodeclaim_image(manifests)
    if not namespace or not nodeclaim:
        logger.error("No namespace or nodeclaim found, nothing to format.")
        sys.exit(2)

    node_list = get_nodes(api_client, project_path, manifests)
    if len(node_list) == 0:
        logger.error("No nodes found, nothing to format.")
        sys.exit(2)

    # save obliterate jobs to project_path/jobs for debug purposes and to be able to rerun them manually
    generate.generate_obliterate(jobs_path, namespace, nodeclaim, ydb_image, node_list)
    jobs = get_job_manifests(jobs_path)

    # Pre-bind ``name`` so the broad except handler below cannot hit a
    # NameError when the failure happens before the first loop iteration.
    name = None
    try:
        for (_, _, _, namespace, name, data) in utils.filter_manifests(jobs, 'batch/v1', ['job']):
            api.create_job(api_client, namespace, data)

        # wait for job completion and job pods completion
        api.wait_jobs_completed(api_client, namespace)

        # cleanup jobs and pods
        for (_, _, _, namespace, name, data) in utils.filter_manifests(jobs, 'batch/v1', ['job']):
            api.wait_job_pods_completed(api_client, namespace, name)
            api.delete_job(api_client, namespace, name)
            api.delete_job_pods(api_client, namespace, name)

    except TimeoutError as e:
        logger.error("Some jobs did not complete within the expected time. "
                     "Please check the job manifests in %s for manual rerun.", jobs_path)
        sys.exit(e.args[0])
    except Exception as e:
        logger.error("An error occurred: %s", e)
        sys.exit(f"""
    An error occurred while processing the jobs. This might indicate an issue with the job's execution or cleanup process.
    To investigate further, you can check the status of the jobs and pods by running:
        kubectl get jobs -n {namespace}
        kubectl get pods -n {namespace} -l job-name={name}
    You may need to manually delete these jobs or pods if they are stuck or not terminating properly.
""")
265 | 393 |
|
266 | 394 |
|
267 | 395 | def slice_nodeclaim_delete(api_client, project_path, manifests):
|
268 | 396 | nodeclaims = []
|
269 |
| - for (path, api_version, kind, namespace, name, data) in manifests: |
270 |
| - if not (kind in ['nodeclaim'] and api_version in ['ydb.tech/v1alpha1']): |
271 |
| - continue |
| 397 | + for (path, api_version, kind, namespace, name, data) in utils.filter_manifests(manifests, 'ydb.tech/v1alpha1', ['nodeclaim']): |
272 | 398 | namespace = data['metadata']['namespace']
|
273 | 399 | name = data['metadata']['name']
|
274 | 400 | api.delete_nodeclaim(api_client, namespace, name)
|
|
0 commit comments