diff --git a/.github/workflows/automated-publish-docs.yaml b/.github/workflows/automated-publish-docs.yaml index 97c38ef47d..4823a8d9ac 100644 --- a/.github/workflows/automated-publish-docs.yaml +++ b/.github/workflows/automated-publish-docs.yaml @@ -19,11 +19,13 @@ jobs: steps: - name: Checkout code uses: actions/checkout@v4 + with: + fetch-depth: 0 - name: Get all v*.* branches id: calculate-env run: | - BRANCHES=$(git branch --list --all | grep -v master | grep 'origin/v*.*' | sed -n -E 's:.*/(v[0-9]+\.[0-9]+).*:\1:p' | sort -Vu) + BRANCHES=$(git branch -r | grep -E '^ *origin/v[0-9]{1,2}\.[0-9]{1,2}$' | sort -Vu | sed 's/origin\///g' | sed 's/ //g') NEWEST_VERSION=$(printf '%s\n' "${BRANCHES[@]}" | sort -V | tail -n 1) CURRENT_BRANCH=${GITHUB_REF#refs/heads/} ALIAS=$CURRENT_BRANCH-alias @@ -48,7 +50,6 @@ jobs: uses: actions/checkout@v4 with: ref: ${{ needs.env.outputs.CURRENT_BRANCH }} - fetch-depth: 0 - name: setup python uses: actions/setup-python@v5 @@ -97,4 +98,5 @@ jobs: SLACK_MESSAGE_ON_SUCCESS: "Docs were updated successfully for version ${{ needs.env.outputs.TITLE }}" SLACK_MESSAGE_ON_FAILURE: "Docs update FAILED for version ${{ needs.env.outputs.TITLE }}" MSG_MINIMAL: true - SLACK_FOOTER: "" \ No newline at end of file + SLACK_FOOTER: "" + diff --git a/docs/Researcher/Walkthroughs/quickstart-overview.md b/docs/Researcher/Walkthroughs/quickstart-overview.md index 7e8ada7d47..ef25b7a14f 100644 --- a/docs/Researcher/Walkthroughs/quickstart-overview.md +++ b/docs/Researcher/Walkthroughs/quickstart-overview.md @@ -7,7 +7,6 @@ Follow the Quickstart documents below to learn more: * [Interactive build sessions with externalized services](walkthrough-build-ports.md) * [Using GPU Fractions](walkthrough-fractions.md) * [Distributed Training](walkthrough-distributed-training.md) -* [Hyperparameter Optimization](walkthrough-hpo.md) * [Over-Quota, Basic Fairness & Bin Packing](walkthrough-overquota.md) * [Fairness](walkthrough-queue-fairness.md) * [Inference](quickstart-inference.md) diff --git a/docs/Researcher/best-practices/env-variables.md b/docs/Researcher/best-practices/env-variables.md index a235e989a6..a131a32a28 100644 --- a/docs/Researcher/best-practices/env-variables.md +++ b/docs/Researcher/best-practices/env-variables.md @@ -13,13 +13,6 @@ Run:ai provides the following environment variables: Note that the Job can be deleted and then recreated with the same name. A Job UUID will be different even if the Job names are the same. -## Identifying a Pod - -With [Hyperparameter Optimization](../Walkthroughs/walkthrough-hpo.md), experiments are run as _Pods_ within the Job. Run:ai provides the following environment variables to identify the Pod. - -* ``POD_INDEX`` - An index number (0, 1, 2, 3....) for a specific Pod within the Job. This is useful for Hyperparameter Optimization to allow easy mapping to individual experiments. The Pod index will remain the same if restarted (due to a failure or preemption). Therefore, it can be used by the Researcher to identify experiments. -* ``POD_UUID`` - a unique identifier for the Pod. if the Pod is restarted, the Pod UUID will change. - ## GPU Allocation Run:ai provides an environment variable, visible inside the container, to help identify the number of GPUs allocated for the container. 
Use `RUNAI_NUM_OF_GPUS` diff --git a/docs/Researcher/cli-reference/runai-submit.md b/docs/Researcher/cli-reference/runai-submit.md index fe026c01ab..4426884676 100644 --- a/docs/Researcher/cli-reference/runai-submit.md +++ b/docs/Researcher/cli-reference/runai-submit.md @@ -50,14 +50,6 @@ runai submit --name frac05 -i gcr.io/run-ai-demo/quickstart -g 0.5 (see: [GPU fractions Quickstart](../Walkthroughs/walkthrough-fractions.md)). -Hyperparameter Optimization - -```console -runai submit --name hpo1 -i gcr.io/run-ai-demo/quickstart-hpo -g 1 \ - --parallelism 3 --completions 12 -v /nfs/john/hpo:/hpo -``` - -(see: [hyperparameter optimization Quickstart](../Walkthroughs/walkthrough-hpo.md)). Submit a Job without a name (automatically generates a name) diff --git a/docs/Researcher/scheduling/the-runai-scheduler.md b/docs/Researcher/scheduling/the-runai-scheduler.md index b8f0550a8d..804fea145a 100644 --- a/docs/Researcher/scheduling/the-runai-scheduler.md +++ b/docs/Researcher/scheduling/the-runai-scheduler.md @@ -226,5 +226,3 @@ To search for good hyperparameters, Researchers typically start a series of smal With HPO, the Researcher provides a single script that is used with multiple, varying, parameters. Each run is a *pod* (see definition above). Unlike Gang Scheduling, with HPO, pods are **independent**. They are scheduled independently, started, and end independently, and if preempted, the other pods are unaffected. The scheduling behavior for individual pods is exactly as described in the Scheduler Details section above for Jobs. In case node pools are enabled, if the HPO workload has been assigned with more than one node pool, the different pods might end up running on different node pools. - -For more information on Hyperparameter Optimization in Run:ai see [here](../Walkthroughs/walkthrough-hpo.md) diff --git a/docs/Researcher/user-interface/workspaces/blocks/Existing PVC.md b/docs/Researcher/user-interface/workspaces/blocks/Existing PVC.md new file mode 100644 index 0000000000..2dbc829f56 --- /dev/null +++ b/docs/Researcher/user-interface/workspaces/blocks/Existing PVC.md @@ -0,0 +1,90 @@ +# Persistent volume claim (PVCs) + +# Persistent Volumes (PVs) & Persistent Volume Claims (PVCs) + +Persistent Volumes (PVs) and Persistent Volume Claims (PVCs) are concepts in Kubernetes for managing storage. A PV is a piece of storage in the cluster, provisioned by an administrator or dynamically by Kubernetes using a StorageClass. It is a resource in the cluster, just like a node is a cluster resource. + +PVCs are requests for storage by a user. They are similar to pods, in that pods consume node resources and PVCs consume PV resources. PVCs allow users to request specific sizes and access modes (for example, read/write once, read-only many) without needing to know the details of the underlying storage infrastructure. + +Using PVs and PVCs in Kubernetes is essential for AI workloads as they provide a reliable and consistent way to manage storage that persists beyond the lifecycle of individual pods. This ensures that data generated by AI workloads is not lost when pods are rescheduled or updated, providing a seamless and efficient storage solution that can handle the large datasets typically associated with AI projects. + +## Data source of type Persistent Volume Claim (PVC) + +At Run:ai, a data source of type PVC is an abstraction, mapping directly to a Kubernetes PVC. 
This type of integration allows you to specify and manage your data storage requirements within the Run:ai platform, while using familiar Kubernetes storage concepts.
+
+By leveraging PVCs as data sources, Run:ai enables access to persistent storage for workloads, ensuring that data remains consistent and accessible across different compute resources and workload executions.
+
+## Creating a data source of type PVC via the UI
+
+Like any other asset, when creating a data source, the user can select the scope of the data source, based on their permissions set in Run:ai’s Role Based Access Control (RBAC) system.
+
+For example: By selecting Department B as the scope of the asset, any user with a role that allows them to view data sources in Department B or any of its subordinate units (current and future) can view this PVC.
+![][image1]
+
+There are two different ways of creating a data source of type PVC:
+
+1. **Existing PVC** \- Data source of type PVC using an existing PVC in the cluster
+1. **New PVC** \- Data source of type PVC by creating a new PVC in the cluster![][image2]
+
+**NOTE**: If there are no existing PVCs that Run:ai has visibility or authorization to use, this option is disabled in the Run:ai platform. For details on providing visibility and authorization, see Existing PVC below.
+
+## Existing PVC
+
+To select an existing PVC in the Run:ai platform, the admin is responsible for performing a number of actions prior to creating the data source via the Run:ai UI (or API). These actions provide Run:ai with access to the existing PVC and authorization to share it across the selected scope, and eventually result in exposing the existing PVC in the UI for the user to select.
+
+Click the link for more information on [creating a data source of type PVC via API](https://envinaclickstaging.staging.run.ai/api/docs\#tag/PVC/operation/create\_pvc\_asset).
+
+The actions taken by the admin are based on the scope (cluster, department or project) that the admin wants for the data source of type PVC.
+
+### For a cluster scope
+
+1. Locate the PVC in the runai namespace
+1. Provide Run:ai with visibility and authorization to share the PVC to your selected scope by implementing the following label:
+    `run.ai/cluster-wide: "true"`
+    **NOTE:** This step is also relevant for creating the data source of type PVC via API.
+
+    In the Run:ai platform, finish creating the data source of type PVC:
+
+1. Select your cluster as a scope
+    1. Select the existing PVC
+    1. Complete all mandatory fields
+    1. Click Create
+
+### For a department scope:
+
+1. Locate the PVC in the runai namespace
+1. Provide Run:ai with visibility and authorization to share the PVC to your selected scope by implementing the following label:
+    `run.ai/department: "\"`
+1. In the Run:ai platform, finish creating the data source of type PVC:
+    1. Select your department as a scope (the same one as in the label)
+    1. Select the existing PVC
+    1. Complete all mandatory fields
+    1. Click Create
+
+### For a project scope:
+
+**NOTE**: For project scope, no labels are required.
+
+1. In the Run:ai platform, finish creating the data source of type PVC:
+    1. Select your project as a scope
+    1. Select the existing PVC
+    1. Complete all mandatory fields
+    1. Click Create
+
+## Creating a new PVC
+
+When creating a data source of type PVC using a new PVC, Run:ai creates the PVC for you in the cluster.
+
+1. Select your scope of choice
+1. Select new PVC
+1. Complete all mandatory fields
+1. 
Click Create + +**NOTES**: + +* When creating data source of type PVC using a new PVC, the PVC is created immediately in the cluster runai namespace (even if no workload has requested to use this PVC) +* A PVC created in the cluster by selecting the option “New PVC” never appears as a PVC in the “Existing PVC“ option. + +[image1]: + +[image2]: \ No newline at end of file diff --git a/docs/admin/aiinitiatives/img/assigning.png b/docs/admin/aiinitiatives/img/assigning.png new file mode 100644 index 0000000000..3a9daa05bb Binary files /dev/null and b/docs/admin/aiinitiatives/img/assigning.png differ diff --git a/docs/admin/aiinitiatives/img/bu.png b/docs/admin/aiinitiatives/img/bu.png new file mode 100644 index 0000000000..252c5674dd Binary files /dev/null and b/docs/admin/aiinitiatives/img/bu.png differ diff --git a/docs/admin/aiinitiatives/img/groupbyhardware.png b/docs/admin/aiinitiatives/img/groupbyhardware.png new file mode 100644 index 0000000000..f4290225fe Binary files /dev/null and b/docs/admin/aiinitiatives/img/groupbyhardware.png differ diff --git a/docs/admin/aiinitiatives/img/groupbytopology.png b/docs/admin/aiinitiatives/img/groupbytopology.png new file mode 100644 index 0000000000..8bdbacc512 Binary files /dev/null and b/docs/admin/aiinitiatives/img/groupbytopology.png differ diff --git a/docs/admin/aiinitiatives/img/individuals.png b/docs/admin/aiinitiatives/img/individuals.png new file mode 100644 index 0000000000..d0d85fc40c Binary files /dev/null and b/docs/admin/aiinitiatives/img/individuals.png differ diff --git a/docs/admin/aiinitiatives/img/org.png b/docs/admin/aiinitiatives/img/org.png new file mode 100644 index 0000000000..5a566d05d5 Binary files /dev/null and b/docs/admin/aiinitiatives/img/org.png differ diff --git a/docs/admin/aiinitiatives/overview.md b/docs/admin/aiinitiatives/overview.md new file mode 100644 index 0000000000..799d01d15a --- /dev/null +++ b/docs/admin/aiinitiatives/overview.md @@ -0,0 +1,98 @@ +# AI Initiatives + +AI initiatives refer to advancing research, development, and implementation of AI technologies. These initiatives represent your business needs and involve collaboration between individuals, teams, and other stakeholders. AI initiatives require compute resources and a methodology to effectively and efficiently use those compute resources and split them among the different AI initiatives stakeholders. The building blocks of AI compute resources are GPUs, CPUs, and CPU memory, which are built into nodes (servers) and can be further grouped into node pools. Nodes and node pools are part of a Kubernetes Cluster. + +To manage AI initiatives in Run:ai you should: + +* Map your organization and initiatives to projects and optionally departments +* Map compute resources (node pools and quotas) to projects and optionally departments +* Assign users (e.g. AI practitioners, ML engineers, Admins) to projects and departments + +## Mapping your organization + +The way you map your AI initiatives and organization into Run:ai projects and departments should reflect your organization’s structure and Project management practices. There are multiple options, and we provide you here with 3 examples of typical forms in which to map your organization, initiatives, and users into Run:ai, but of course, other ways that suit your requirements are also acceptable. + +### Based on individuals + +A typical use case would be students (individual practitioners) within a faculty (business unit) - an individual practitioner may be involved in one or more initiatives. 
In this example, the resources are accounted for by the student (project) and aggregated per faculty (department).
+Department = business unit / Project = individual practitioner
+
+![](img/individuals.png)
+
+### Based on business units
+
+A typical use case would be an AI service (business unit) split into AI capabilities (initiatives) - an individual practitioner may be involved in several initiatives. In this example, the resources are accounted for by Initiative (project) and aggregated per AI service (department).
+
+Department = business unit / Project = initiative
+
+![](img/bu.png)
+
+### Based on the organizational structure
+
+A typical use case would be a business unit split into teams - an individual practitioner is involved in a single team (project) but the team may be involved in several AI initiatives. In this example, the resources are accounted for by team (project) and aggregated per business unit (department).
+
+Department = business unit / Project = team
+
+![](img/org.png)
+
+## Mapping your resources
+
+AI initiatives require compute resources such as GPUs and CPUs to run. Compute resources in any organization are limited, whether by the number of servers (nodes) the organization owns, or by the budget it can spend on leasing cloud resources or buying in-house servers. Every organization strives to maximize the utilization of its resources while meeting the needs of all its users. Therefore, the organization needs to split resources according to its internal priorities and budget constraints. Even after the resources are split, the orchestration layer should still provide fairness between resource consumers and allow access to unused resources, to minimize scenarios of idle resources.
+
+Another aspect of resource management is how to group your resources effectively, especially in large environments or environments made of heterogeneous hardware types, where some users need to use specific hardware types, or where hardware that is critical to certain users or initiatives should not be occupied by others.
+
+Run:ai assists you with all of these complex issues by allowing you to map your cluster resources to node pools, then assign each Project and Department a quota allocation per node pool, and set access rights to unused resources (Over quota) per node pool.
+
+### Grouping your resources
+
+There are several reasons why you would group resources (nodes) into node pools:
+
+* **Control the GPU type to use in a heterogeneous hardware environment** - in many cases, AI models can be optimized for the hardware type they will use, e.g. a training workload that is optimized for an H100 does not necessarily run optimally on an A100, and vice versa. Therefore, segmenting into node pools, each with a different hardware type, gives the AI researcher and ML engineer better control of where to run.
+* **Quota control** - splitting into node pools allows the admin to set a specific quota per hardware type, e.g. give a high-priority project guaranteed access to advanced GPU hardware, while keeping a lower-priority project with a lower quota, or even no quota at all, for that high-end GPU, and giving it “best-effort” access only (i.e. only when the high-priority project is not using those resources). 
+* **Multi-region or multi-availability-zone cloud environments** - if some or all of your clusters run on the cloud (or even on-premise) but any of your clusters uses different physical locations or different topologies (e.g. racks), you probably want to segment your resources per region/zone/topology to be able to control where to run your workloads, how much quota to assign to specific environments (per project, per department), even if all those locations are all using the same hardware type. This methodology can help in optimizing the performance of your workloads because of the superior performance of local computing such as the locality of distributed workloads, local storage etc. +* **Explainability and predictability** - large environments are complex to understand, this becomes even more complex when an environment is loaded. To maintain users’ satisfaction and their understanding of the resources state, as well as to keep predictability of your workload chances to get scheduled, segmenting your cluster into smaller pools may significantly help. +* **Scale** - Run:ai implementation of node pools has many benefits, one of the main of them is scale. Each node pool has its own scheduler instance, therefore allowing the cluster to handle more nodes and schedule workloads faster when segmented into node pools vs. one large cluster. To allow your workloads to use any resource within a cluster that is split to node pools, a second-level Scheduler is in charge of scheduling workloads between node pools according to your preferences and resource availability. +* **Prevent mutual exclusion** - Some AI workloads consume CPU-only resources, to prevent those workloads from consuming the CPU resources of GPU nodes and thus block GPU workloads from using those nodes, it is recommended to group CPU-only nodes into a dedicated node pool(s) and assign a quota for CPU projects to CPU node-pools only while keeping GPU node-pools with zero quota and optionally “best-effort” over-quota access for CPU-only projects. + +#### Grouping Examples + +Set out below are illustrations of different grouping options. + +Example: grouping nodes by topology + +![](img/groupbytopology.png) + + +Example: grouping nodes by hardware type + +![](img/groupbyhardware.png) + +### Assigning your resources + +After the initial grouping of resources, it is time to associate resources to AI initiatives, this is performed by assigning quotas to projects and optionally to departments. Assigning GPU quota to a project, on a node pool basis, means that the workloads submitted by that project are entitled to use those GPUs as guaranteed resources and can use them for all workload types. + +However, what happens if the project requires more resources than its quota? This depends on the type of workloads that the user wants to submit. If the user requires more resources for non-preemptible workloads, then the quota must be increased, because non-preemptible workloads require guaranteed resources. On the other hand, if the type of workload is, for example, a model Training workload that is preemptible - in this case the project can exploit unused resources of other projects, as long as the other projects don’t need them. Over-quota is set per project on a node-pool basis and per department. + +Administrators can use quota allocations to prioritize resources between users, teams, and AI initiatives. 
The administrator can completely prevent the use of certain node pools by a project or department by setting the node pool quota to 0 and disabling over quota for that node pool, or it can keep the quota to 0 and enable over-quota to that node pool and allow access based on resource availability only (e.g. unused GPUs). However, when a project with a non-zero quota needs to use those resources, the Scheduler reclaims those resources back and preempts the preemptible workloads of over-quota projects. As an administrator, you can also have an impact on the amount of over-quota resources a project or department uses. + +It is essential to make sure that the sum of all projects' quota does NOT surpass that of the Department, and that the sum of all departments does not surpass the number of physical resources, per node pool and for the entire cluster (we call such behavior - ‘over-subscription’). The reason over-subscription is not recommended is that it may produce unexpected scheduling decisions, especially those that might preempt ‘non-preemptive’ workloads or fail to schedule workloads within quota, either non-preemptible or preemptible, thus quota cannot be considered anymore as ‘guaranteed’. Admins can opt-in a system flag that helps to prevent over-subscription scenarios. + +Example: assigning resources to projects + +![](img/assigning.png) + + + +## Assigning users to projects and departments + +Run:ai system is using ‘Role Based Access Control’ (RBAC) to manage users’ access rights to the different objects of the system, its resources, and the set of allowed actions. +To allow AI researchers, ML engineers, Project Admins, or any other stakeholder of your AI initiatives to access projects and use AI compute resources with their AI initiatives, the administrator needs to assign users to projects. After a user is assigned to a project with the proper role, e.g. ‘L1 Researcher’, the user can submit and monitor its workloads under that project. Assigning users to departments is usually done to assign ‘Department Admin’ to manage a specific department. Other roles, such as ‘L1 Researcher’, can also be assigned to departments, this allows the researcher access to all projects within that department. + +## Submitting workloads + +Now that resources are grouped into node pools, organizational units or business initiatives are mapped into projects and departments, projects’ quota parameters are set per node pool, and users are assigned to projects, you can finally submit workloads from a project and use compute resources to run your AI initiatives. + +When a workload is submitted, it goes to the chosen Kubernetes cluster, and the Run:ai Scheduler handles it. + +The Scheduler’s main role is to find the best-suited node or nodes for each submitted workload, so that those nodes match the resources and other characteristics requested by the workload while adhering to the quota and fairness principles of the Run:ai system. A workload can be a single pod running on a single node, or a distributed workload using multiple pods, each running on a node (or part of a node). It is not rare to find large training workloads using 128 nodes and even more, or inference workloads using multiple pods and nodes. There are numerous types of workloads, some are Kubernetes native and some are 3rd party extensions on top of Kubernetes native pods. The Run:ai Scheduler schedules any Kubernetes native workloads, Run:ai workloads, or any type of 3rd party workload. 
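+
+For illustration only, a minimal sketch of submitting a single-GPU training workload from a project via the Run:ai CLI is shown below. The workload name, project name (`team-a`), and image are placeholder values; the `--name`, `-i` and `-g` flags follow the `runai submit` examples in the CLI reference, and `-p` is assumed here to select the project:
+
+```console
+# Submit a 1-GPU workload under project "team-a"; the Scheduler places it
+# on a node in a node pool that the project is entitled to use.
+runai submit --name my-training -p team-a -i gcr.io/run-ai-demo/quickstart -g 1
+```
+
+From that point, the project’s per-node-pool quota, over-quota settings, and the fairness principles described above determine where and when the workload runs.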
+ diff --git a/docs/admin/runai-setup/maintenance/alert-monitoring.md b/docs/admin/runai-setup/maintenance/alert-monitoring.md index bbe9d54937..4616c259eb 100644 --- a/docs/admin/runai-setup/maintenance/alert-monitoring.md +++ b/docs/admin/runai-setup/maintenance/alert-monitoring.md @@ -1,156 +1,316 @@ ---- -title: Setting up Alert Monitoring for Run:ai Using Alertmanager in Prometheus -summary: This article describes how to set up and configure Alertmanager in Prometheus. -authors: - - Jason Novich - - Viktor Koukouliev -date: 2024-Jan-15 ---- +This article explains how to configure Run:ai to generate health alerts and to connect these alerts to alert-management systems within your organization. Alerts are generated for Run:ai clusters. -## Introduction +## Alert infrastructure -This documentation outlines the steps required to set up Alertmanager within the Prometheus Operator ecosystem. It also provides guidance on configuring Prometheus to send alerts to Alertmanager and customizing Alertmanager to trigger alerts based on specific **Run.ai** conditions. +Run:ai uses Prometheus for externalizing metrics and providing visibility to end-users. The Run:ai Cluster installation includes Prometheus or can connect to an existing Prometheus instance used in your organization. The alerts are based on the Prometheus AlertManager. Once installed, it is enabled by default. + +This document explains how to: + +* Configure alert destinations - triggered alerts send data to specified destinations +* Understand the out-of-the-box cluster alerts, provided by Run:ai +* Add additional custom alerts ## Prerequisites -* A Kubernetes cluster with the necessary permissions and manage resources. -* `kubectl` command-line tool installed and configured to interact with the cluster. -* Basic knowledge of Kubernetes resources and manifests. -* up and running Prometheus Operator -* Up and running Run.ai environment +* A Kubernetes cluster with the necessary permissions +* Up and running Run:ai environment, including Prometheus Operator +* [kubectl](https://kubernetes.io/docs/reference/kubectl/) command-line tool installed and configured to interact with the cluster + +## Set-up + +Use the steps below to set up monitoring alerts. + +### Validating Prometheus operator installed + +1. Verify that the Prometheus Operator Deployment is running + Copy the following command and paste it in your terminal, where you have access to the Kubernetes cluster: + `kubectl get deployment kube-prometheus-stack-operator -n monitoring` + In your terminal, you can see an output indicating the deployment's status, including the number of replicas and their current state. +1. Verify that Prometheus instances are running + Copy the following command and paste it in your terminal: + `kubectl get prometheus -n runai` + You can see the Prometheus instance(s) listed along with their status. + +### Enabling Prometheus AlertManager + +In each of the steps in this section, copy the content of the code snippet to a new YAML file (e.g., `step1.yaml`). 
+ +* Copy the following command to your terminal, to apply the YAML file to the cluster: + +kubectl apply -f step1.yaml +Copy the following command to your terminal to create the AlertManager CustomResource, to enable AlertManager: + +``` yaml +apiVersion: monitoring.coreos.com/v1 +kind: Alertmanager +metadata: + name: runai + namespace: runai +spec: + replicas: 1 + alertmanagerConfigSelector: + matchLabels: + alertmanagerConfig: runai +``` + +* Copy the following command to your terminal to validate that the AlertManager instance has started: + `kubectl get alertmanager -n runai` +* Copy the following command to your terminal to validate that the Prometheus operator has created a Service for AlertManager: + `kubectl get svc alertmanager-operated -n runai` + +### Configuring Prometheus to send alerts + +1. Open the terminal on your local machine or another machine that has access to your Kubernetes cluster +1. Copy and paste the following command in your terminal to edit the Prometheus configuration for the `runai` Namespace: +``` +kubectl edit prometheus runai -n runai +``` +This command opens the Prometheus configuration file in your default text editor (usually `vi` or `nano`). + +3. Copy and paste the following text to your terminal to change the configuration file: +``` yaml +alerting: + alertmanagers: + - namespace: runai + name: alertmanager-operated + port: web +``` +4. Save the changes and exit the text editor. + +!!! Note + To save changes using `vi`, type `:wq` and press Enter. + The changes are applied to the Prometheus configuration in the cluster. + +## Alert destinations + +Set out below are the various alert destinations. + +### Configuring AlertManager for custom email alerts + +In each step, copy the contents of the code snippets to a new file and apply it to the cluster using `kubectl apply -f`. + +Add your smtp password as a secret: + +``` yaml +apiVersion: v1 +kind: Secret +metadata: + name: alertmanager-smtp-password + namespace: runai +stringData: + password: "your_smtp_password" +``` + +Replace the relevant smtp details with your own, then apply the `alertmanagerconfig` using `kubectl apply`. + +``` yaml + apiVersion: monitoring.coreos.com/v1alpha1 + kind: AlertmanagerConfig + metadata: + name: runai + namespace: runai + labels: + alertmanagerConfig: runai + spec: + route: + continue: true + groupBy: + - alertname + + groupWait: 30s + groupInterval: 5m + repeatInterval: 1h + + matchers: + - matchType: =~ + name: alertname + value: Runai.* + + receiver: email + + receivers: + - name: 'email' + emailConfigs: + - to: '' + from: '' + smarthost: 'smtp.gmail.com:587' + authUsername: '' + authPassword: + name: alertmanager-smtp-password + key: password +``` + +Save and exit the editor. The configuration is automatically reloaded. + +### Third-party alert destinations + +Prometheus AlertManager provides a structured way to connect to alert-management systems. There are built-in plugins for popular systems such as PagerDuty and OpsGenie, including a generic Webhook. + +#### Example: Integrating Run:ai with a Webhook + +1. Use [webhook.site](https://webhook.site/) to get a unique URL. +1. Use the upgrade cluster instructions to modify the values file: + Edit the values file to add the following, and replace `` with the URL from [webhook.site](http://webhook.site). + +``` yaml +codekube-prometheus-stack: + ... 
+ alertmanager: + enabled: true + config: + global: + resolve_timeout: 5m + receivers: + - name: "null" + - name: webhook-notifications + webhook_configs: + - url: + send_resolved: true + route: + group_by: + - alertname + group_interval: 5m + group_wait: 30s + receiver: 'null' + repeat_interval: 10m + routes: + - receiver: webhook-notifications +``` +3. Verify that you are receiving alerts on the [webhook.site](https://webhook.site/), in the left pane: + +![](img/monitoring-webhook.png) + +### Built-in alerts + +A Run:ai cluster comes with several built-in alerts. Each alert notifies on a specific functionality of a Run:ai’s entity. There is also a single, inclusive alert: `Run:ai Critical Problems`, which aggregates all component-based alerts into a single cluster health test. + +Runai agent cluster info push rate low + +| Meaning | The `cluster-sync` Pod in the `runai` namespace might not be functioning properly | +| :---- | :---- | +| **Impact** | Possible impact - no info/partial info from the cluster is being synced back to the control-plane | +| **Severity** | Critical | +| **Diagnosis** | `kubectl get pod -n runai` to see if the `cluster-sync` pod is running | +| **Troubleshooting/Mitigation** | To diagnose issues with the `cluster-sync` pod, follow these steps: **Paste the following command to your terminal, to receive detailed information about the** `cluster-sync` deployment:`kubectl describe deployment cluster-sync -n runai` **Check the Logs**: Use the following command to view the logs of the `cluster-sync` deployment:`kubectl logs deployment/cluster-sync -n runai` **Analyze the Logs and Pod Details**: From the information provided by the logs and the deployment details, attempt to identify the reason why the `cluster-sync` pod is not functioning correctly **Check Connectivity**: Ensure there is a stable network connection between the cluster and the Run:ai Control Plane. A connectivity issue may be the root cause of the problem. **Contact Support**: If the network connection is stable and you are still unable to resolve the issue, contact Run:ai support for further assistance | + +Runai agent pull rate low + +| Meaning | The `runai-agent` pod may be too loaded, is slow in processing data (possible in very big clusters), or the `runai-agent` pod itself in the `runai` namespace may not be functioning properly. | +| :---- | :---- | +| **Impact** | Possible impact - no info/partial info from the control-plane is bein[g synced i](http://running.To)n the cluster | +| **Severity** | Critical | +| **Diagnosis** | Run: `kubectl get pod -n runai` And see if the `runai-agent` pod is running. | +| **Troubleshooting/Mitigation** | To diagnose issues with the `runai-agent` pod, follow these steps: **Describe the Deployment**: Run the following command to get detailed information about the `runai-agent` deployment:`kubectl describe deployment runai-agent -n runai` **Check the Logs**: Use the following command to view the logs of the `runai-agent` deployment:`kubectl logs deployment/runai-agent -n runai` **Analyze the Logs and Pod Details**: From the information provided by the logs and the deployment details, attempt to identify the reason why the `runai-agent` pod is not functioning correctly. There may be a connectivity issue with the control plane. **Check Connectivity**: Ensure there is a stable network connection between the `runai-agent` and the control plane. A connectivity issue may be the root cause of the problem. 
**Consider Cluster Load**: If the `runai-agent` appears to be functioning properly but the cluster is very large and heavily loaded, it may take more time for the agent to process data from the control plane. **Adjust Alert Threshold**: If the cluster load is causing the alert to fire, you can adjust the threshold at which the alert triggers. The default value is 0.05. You can try changing it to a lower value (e.g., 0.045 or 0.04).To edit the value, paste the following in your terminal:`kubectl edit runaiconfig -n runai`In the editor, navigate to:spec: prometheus: agentPullPushRateMinForAlert: If the `agentPullPushRateMinForAlert` value does not exist, add it under `spec -> prometheus` | + +Runai container memory usage critical + +| Meaning | `Runai` container is using more than 90% of its Memory limit | +| :---- | :---- | +| **Impact** | The container might run out of memory and crash. | +| **Severity** | Critical | +| **Diagnosis** | Calculate the memory usage, this is performed by pasting the following to your terminal: `container_memory_usage_bytes{namespace=~"runai|runai-backend"}` | +| **Troubleshooting/Mitigation** | Add more memory resources to the container. If the issue persists, contact Run:ai | + +Runai container memory usage warning + +| Meaning | Runai container is using more than 80% of its memory limit | +| :---- | :---- | +| **Impact** | The container might run out of memory and crash | +| **Severity** | Warning | +| **Diagnosis** | Calculate the memory usage, this can be done by pasting the following to your terminal: `container_memory_usage_bytes{namespace=~"runai|runai-backend"}` | +| **Troubleshooting/Mitigation** | Add more memory resources to the container. If the issue persists, contact Run:ai | + +Runai container restarting + +| Meaning | `Runai` container has restarted more than twice in the last 10 min | +| :---- | :---- | +| **Impact** | The container might become unavailable and impact the Run:ai system | +| **Severity** | Warning | +| **Diagnosis** | To diagnose the issue and identify the problematic pods, paste this into your terminal: `kubectl get pods -n runai kubectl get pods -n runai-backend`One or more of the pods have a restart count >= 2. | +| **Troubleshooting/Mitigation** | Paste this into your terminal:`kubectl logs -n NAMESPACE POD_NAME`Replace `NAMESPACE` and `POD_NAME` with the relevant pod information from the previous step. Check the logs for any standout issues and verify that the container has sufficient resources. If you need further assistance, contact Run:ai | + +Runai CPU usage warning + +| Meaning | `runai` container is using more than 80% of its CPU limit | +| :---- | :---- | +| **Impact** | This might cause slowness in the operation of certain Run:ai features. | +| **Severity** | Warning | +| **Diagnosis** | Paste the following query to your terminal in order to calculate the CPU usage: `rate(container_cpu_usage_seconds_total{namespace=~"runai|runai-backend"}[2m])` | +| **Troubleshooting/Mitigation** | Add more CPU resources to the container. If the issue persists, please contact Run:ai. 
|
+
+Runai critical problem
+
+| Meaning | One of the critical Run:ai alerts is currently active |
+| :---- | :---- |
+| **Impact** | Impact is based on the active alert |
+| **Severity** | Critical |
+| **Diagnosis** | Check Run:ai alerts in Prometheus to identify any active critical alerts |
+
+Runai daemonSet rollout stuck / Runai DaemonSet unavailable on nodes
+
+| Meaning | There are currently 0 available pods for the `runai` daemonset on the relevant node |
+| :---- | :---- |
+| **Impact** | No fractional GPU workloads support |
+| **Severity** | Critical |
+| **Diagnosis** | Paste the following command to your terminal: `kubectl get daemonset -n runai-backend` In the result of this command, identify the daemonset(s) that don’t have any running pods |
+| **Troubleshooting/Mitigation** | Paste the following command to your terminal, where `daemonsetX` is the problematic daemonset from the previous step: `kubectl describe daemonset daemonsetX -n runai` Then look for the specific error which prevents it from creating pods. Possible reasons might be: **Node Resource Constraints**: The nodes in the cluster may lack sufficient resources (CPU, memory, etc.) to accommodate new pods from the daemonset. **Node Selector or Affinity Rules**: The daemonset may have node selector or affinity rules that are not matching with any nodes currently available in the cluster, thus preventing pod creation. |
+
+Runai deployment insufficient replicas / Runai deployment no available replicas / RunaiDeploymentUnavailableReplicas
+
+| Meaning | `Runai` deployment has one or more unavailable pods |
+| :---- | :---- |
+| **Impact** | When this happens, there may be scale issues. Additionally, new versions cannot be deployed, potentially resulting in missing features. |
+| **Severity** | Critical |
+| **Diagnosis** | Paste the following commands to your terminal, in order to get the status of the deployments in the `runai` and `runai-backend` namespaces: `kubectl get deployment -n runai kubectl get deployment -n runai-backend` Identify any deployments that have missing pods. Look for discrepancies in the `DESIRED` and `AVAILABLE` columns. If the number of `AVAILABLE` pods is less than the `DESIRED` pods, it indicates that there are missing pods. |
+| **Troubleshooting/Mitigation** | Paste the following commands to your terminal, to receive detailed information about the problematic deployment: `kubectl describe deployment -n runai kubectl describe deployment -n runai-backend` Paste the following commands to your terminal, to check the replicaset details associated with the deployment: `kubectl describe replicaset -n runai kubectl describe replicaset -n runai-backend` Paste the following commands to your terminal to retrieve the logs for the deployment to identify any errors or issues: `kubectl logs deployment/ -n runai kubectl logs deployment/ -n runai-backend` From the logs and the detailed information provided by the `describe` commands, analyze the reasons why the deployment is unable to create pods. Look for common issues such as: Resource constraints (CPU, memory) Misconfigured deployment settings or replicasets Node selector or affinity rules preventing pod scheduling. If the issue persists, contact Run:ai. |
+
+Runai project controller reconcile failure
+
+| Meaning | The `project-controller` in `runai` namespace had errors while reconciling projects |
+| :---- | :---- |
+| **Impact** | Some projects might not be in the “Ready” state. 
This means that they are not fully operational and may not have all the necessary components running or configured correctly. | +| **Severity** | Critical | +| **Diagnosis** | Retrieve the logs for the `project-controller` deployment by pasting the following command in your terminal:`kubectl logs deployment/project-controller -n runai` Carefully examine the logs for any errors or warning messages. These logs help you understand what might be going wrong with the project controller. | +| **Troubleshooting/Mitigation** | Once errors in the log have been identified, follow these steps to mitigate the issue: The error messages in the logs should provide detailed information about the problem. Read through them to understand the nature of the issue. If the logs indicate which project failed to reconcile, you can further investigate by checking the status of that specific project. Run the following command, replacing `` with the name of the problematic project:`kubectl get project -o yaml` Review the status section in the YAML output. This section describes the current state of the project and provide insights into what might be causing the failure.If the issue persists, contact Run:ai. | + +Runai StatefulSet insufficient replicas / Runai StatefulSet no available replicas + +| Meaning | `Runai` statefulset has no available pods | +| :---- | :---- | +| **Impact** | Absence of Metrics Database Unavailability | +| **Severity** | Critical | +| **Diagnosis** | To diagnose the issue, follow these steps: Check the status of the stateful sets in the `runai-backend` namespace by running the following command:`kubectl get statefulset -n runai-backend` Identify any stateful sets that have no running pods. These are the ones that might be causing the problem. | +| **Troubleshooting/Mitigation** | Once you've identified the problematic stateful sets, follow these steps to mitigate the issue: Describe the stateful set to get detailed information on why it cannot create pods. Replace `X` with the name of the stateful set:`kubectl describe statefulset X -n runai-backend` Review the description output to understand the root cause of the issue. Look for events or error messages that explain why the pods are not being created. If you're unable to resolve the issue based on the information gathered, contact Run:ai support for further assistance. | + +### Adding a custom alert + +You can add additional alerts from Run:ai. Alerts are triggered by using the Prometheus query language with any Run:ai metric. + +To create an alert, follow these steps using Prometheus query language with Run:ai Metrics: + +* **Modify Values File:** Use the upgrade cluster instructions to modify the values file. +* **Add Alert Structure:** Incorporate alerts according to the structure outlined below. Replace placeholders ``, ``, ``, ``, and `` with appropriate values for your alert, as described below. + +``` yaml +kube-prometheus-stack: + additionalPrometheusRulesMap: + custom-runai: + groups: + - name: custom-runai-rules + rules: + - alert: + annotations: + summary: + expr: + for: + labels: + severity: +``` +* ``: Choose a descriptive name for your alert, such as `HighCPUUsage` or `LowMemory`. +* ``: Provide a brief summary of what the alert signifies, for example, `High CPU usage detected` or `Memory usage below threshold`. +* ``: Construct a Prometheus query (PROMQL) that defines the conditions under which the alert should trigger. This query should evaluate to a boolean value (`1` for alert, `0` for no alert). 
+* ``: Optionally, specify a duration in seconds (`s`), minutes (`m`), or hours (`h`) that the alert condition should persist before triggering an alert. If not specified, the alert triggers as soon as the condition is met. +* ``: Assign a severity level to the alert, indicating its importance. Choose between `critical` for severe issues requiring immediate attention, or `warning` for less critical issues that still need monitoring. + +You can find an example in the [Prometheus documentation](https://prometheus.io/docs/prometheus/latest/querying/examples/). -## Validate Prometheus Operator Installed - -1. Verify that the Prometheus Operator deployment is running: - - `kubectl get deployment prometheus-operator -n runai` - - You should see output indicating the deployment's status, including the number of replicas and their current state. - -2. Check if Prometheus instances are running: - - `kubectl get prometheus -n runai` - - You should see the Prometheus instance(s) listed along with their status. - -## Enabling Alertmanager - -1. Create an `AlertmanagerConfig` file that triggers alerts on Run.ai events: - - cat < # (1) ``` diff --git a/docs/admin/runai-setup/self-hosted/ocp/backend.md b/docs/admin/runai-setup/self-hosted/ocp/backend.md index 03b2754906..b87392b3e2 100644 --- a/docs/admin/runai-setup/self-hosted/ocp/backend.md +++ b/docs/admin/runai-setup/self-hosted/ocp/backend.md @@ -12,7 +12,7 @@ Run the helm command below: ``` bash helm repo add runai-backend https://runai.jfrog.io/artifactory/cp-charts-prod helm repo update - helm upgrade -i runai-backend -n runai-backend runai-backend/control-plane --version "~2.17.0" \ + helm upgrade -i runai-backend -n runai-backend runai-backend/control-plane --version "~2.18.0" \ --set global.domain=runai.apps. \ # (1) --set global.config.kubernetesDistribution=openshift ``` diff --git a/docs/admin/troubleshooting/cluster-health-check.md b/docs/admin/troubleshooting/cluster-health-check.md index 78e7018616..3c204fd850 100644 --- a/docs/admin/troubleshooting/cluster-health-check.md +++ b/docs/admin/troubleshooting/cluster-health-check.md @@ -186,7 +186,7 @@ kubectl get cm runai-public -oyaml ### Resources not deployed / System unavailable / Reconciliation failed -1. Run the [Preinstall diagnostic script](cluster-prerequisites.md#pre-install-script) and check for issues. +1. Run the [Preinstall diagnostic script](../runai-setup/cluster-setup/cluster-prerequisites.md#pre-install-script) and check for issues. 2. Run ``` diff --git a/docs/admin/workloads/README.md b/docs/admin/workloads/README.md index 57e63f52a3..125df62201 100644 --- a/docs/admin/workloads/README.md +++ b/docs/admin/workloads/README.md @@ -121,12 +121,12 @@ To get the full experience of Run:ai’s environment and platform use the follow * [Workspaces](../../Researcher/user-interface/workspaces/overview.md#getting-familiar-with-workspaces) * [Trainings](../../Researcher/user-interface/trainings.md#trainings) (Only available when using the *Jobs* view) -* [Distributed trainings](../../Researcher/user-interface/trainings.md#trainings) -* [Deployment](../admin-ui-setup/deployments.md#viewing-and-submitting-deployments) +* [Distributed training](../../Researcher/user-interface/trainings.md#trainings) +* Deployments. -## Supported integrations +## Workload-related Integrations -To assist you with other platforms, and other types of workloads use the integrations listed below. +To assist you with other platforms, and other types of workloads use the integrations listed below. 
These integrations are not regularly tested by Run:ai and are hence provided on an as-is basis. The link below point to the Run:ai customer portal. 1. [Airflow](https://runai.my.site.com/community/s/article/How-to-integrate-Run-ai-with-Apache-Airflow){target=_blank} 2. [MLflow](https://runai.my.site.com/community/s/article/How-to-integrate-Run-ai-with-MLflow){target=_blank} diff --git a/docs/admin/workloads/inference-overview.md b/docs/admin/workloads/inference-overview.md index 5c84085b91..5bf8e4e147 100644 --- a/docs/admin/workloads/inference-overview.md +++ b/docs/admin/workloads/inference-overview.md @@ -30,13 +30,12 @@ Run:ai provides *Inference* services as an equal part together with the other tw * Multiple replicas will appear in Run:ai as a single *Inference* workload. The workload will appear in all Run:ai dashboards and views as well as the Command-line interface. -* Inference workloads can be submitted via Run:ai [user interface](../admin-ui-setup/deployments.md) as well as [Run:ai API](../../developer/cluster-api/workload-overview-dev.md). Internally, spawning an Inference workload also creates a Kubernetes *Service*. The service is an end-point to which clients can connect. +* Inference workloads can be submitted via Run:ai user interface as well as [Run:ai API](../../developer/cluster-api/workload-overview-dev.md). Internally, spawning an Inference workload also creates a Kubernetes *Service*. The service is an end-point to which clients can connect. ## Autoscaling To withstand SLA, *Inference* workloads are typically set with *auto scaling*. Auto-scaling is the ability to add more computing power (Kubernetes pods) when the load increases and shrink allocated resources when the system is idle. - -There are a number of ways to trigger autoscaling. Run:ai supports the following: +There are several ways to trigger autoscaling. Run:ai supports the following: | Metric | Units | Run:ai name | |-----------------|--------------|-----------------| @@ -45,7 +44,7 @@ There are a number of ways to trigger autoscaling. Run:ai supports the following The Minimum and Maximum number of replicas can be configured as part of the autoscaling configuration. -Autoscaling also supports a scale to zero policy with *Throughput* and *Concurrency* metrics, meaning that given enough time under the target threshold, the number of replicas will be scaled down to 0. +Autoscaling also supports a scale-to-zero policy with *Throughput* and *Concurrency* metrics, meaning that given enough time under the target threshold, the number of replicas will be scaled down to 0. This has the benefit of conserving resources at the risk of a delay from "cold starting" the model when traffic resumes. diff --git a/docs/admin/workloads/policies/README.md b/docs/admin/workloads/policies/README.md index f76540e3ad..bdb3a90afb 100644 --- a/docs/admin/workloads/policies/README.md +++ b/docs/admin/workloads/policies/README.md @@ -8,16 +8,16 @@ date: 2023-Dec-12 ## Introduction -*Policies* allow administrators to impose restrictions and set default values for researcher workloads. Restrictions and default values can be placed on CPUs, GPUs, and other resources or entities. Enabling the *New Policy Manager* provides information about resources that are non-compliant to applied policies. Resources that are non-compliant will appear greyed out. To see how a resource is not compliant, press on the clipboard icon in the upper right hand corner of the resource. 
+*Policies* allow administrators to impose restrictions and set default values for researcher workloads. Restrictions and default values can be placed on CPUs, GPUs, and other resources or entities. Enabling the *New Policy Manager* provides information about resources that are non-compliant to applied policies. Resources that are non-compliant will appear greyed out. To see how a resource is not compliant, press on the clipboard icon in the upper right-hand corner of the resource. !!! Note - Policies from Run:ai versions 2.15 or lower will still work after enabling the *New Policy Manager*. However, showing non-compliant policy rules will not be available. For more information about policies for version 2.15 or lower, see [What are Policies](policies.md#what-are-policies). + Policies from Run:ai versions 2.17 or lower will still work after enabling the New Policy Manager. For more information about policies for version 2.17 or lower, see [What are Policies](policies.md#what-are-policies). For example, an administrator can create and apply a policy that will restrict researchers from requesting more than 2 GPUs, or less than 1GB of memory per type of workload. Another example is an administrator who wants to set different amounts of CPU, GPUs and memory for different kinds of workloads. A training workload can have a default of 1 GB of memory, or an interactive workload can have a default amount of GPUs. -Policies are created for each Run:ai project (Kubernetes namespace). When a policy is created in the `runai` namespace, it will take effect when there is no project-specific policy for the workloads of the same kind. +Policies are created for each Run:ai project (Kubernetes namespace). When a policy is created in the `runai` namespace, it will take effect when there is no project-specific policy for workloads of the same kind. In interactive workloads or workspaces, applied policies will only allow researchers access to resources that are permitted in the policy. This can include compute resources as well as node pools and node pool priority. @@ -47,7 +47,7 @@ A policy configured to a specific scope, is applied to all elements in that scop ### Policy Editor UI -Policies are added to the system using the policy editor and are written in YAML format. YAML™ is a human-friendly, cross language, Unicode based data serialization language designed around the common native data types of dynamic programming languages. It is useful for programming needs ranging from configuration files to internet messaging to object persistence to data auditing and visualization. For more information, see [YAML.org](https://yaml.org/){target=_blank}. +Policies are added to the system using the policy editor and are written in YAML format. YAML™ is a human-friendly, cross-language, Unicode-based data serialization language designed around the common native data types of dynamic programming languages. It is useful for programming needs ranging from configuration files to internet messaging to object persistence to data auditing and visualization. For more information, see [YAML.org](https://yaml.org/){target=_blank}. 
### Policy API @@ -59,50 +59,47 @@ The following is an example of a workspace policy you can apply in your platform ```YAML defaults: - environment: - allowPrivilegeEscalation: false - createHomeDir: true - environmentVariables: - - name: MY_ENV - value: my_value - workspace: - allowOverQuota: true + createHomeDir: true + environmentVariables: + instances: + - name: MY_ENV + value: my_value + security: + allowPrivilegeEscalation: false rules: - compute: - cpuCoreLimit: - min: 0 - max: 9 - required: true - gpuPortionRequest: - min: 0 - max: 10 + imagePullPolicy: + required: true + options: + - value: Always + displayed: Always + - value: Never + displayed: Never + createHomeDir: + canEdit: false + security: + runAsUid: + min: 1 + max: 32700 + allowPrivilegeEscalation: + canEdit: false + compute: + cpuCoreLimit: + required: true + min: 0 + max: 9 + gpuPortionRequest: + min: 0 + max: 10 + storage: + nfs: + instances: + canAdd: false s3: - url: - options: - - displayed: "https://www.google.com" - value: "https://www.google.com" - - displayed: "https://www.yahoo.com" - value: "https://www.yahoo.com" - environment: - imagePullPolicy: - options: - - displayed: "Always" - value: "Always" - - displayed: "Never" - value: "Never" - required: true - runAsUid: - min: 1 - max: 32700 - createHomeDir: - canEdit: false - allowPrivilegeEscalation: - canEdit: false - workspace: - allowOverQuota: - canEdit: false - imposedAssets: - dataSources: - nfs: - canAdd: false + attributes: + url: + options: + - value: https://www.google.com + displayed: https://www.google.com + - value: https://www.yahoo.com + displayed: https://www.yahoo.com ``` diff --git a/docs/home/whats-new-2-18.md b/docs/home/whats-new-2-18.md index 6a4c840d3c..427ed90289 100644 --- a/docs/home/whats-new-2-18.md +++ b/docs/home/whats-new-2-18.md @@ -22,11 +22,11 @@ date: 2024-June-14 * Added new *Data sources* of type *Secret* to workload form. *Data sources* of type *Secret* are used to hide 3rd party access credentials when submitting workloads. For more information, see [Submitting Workloads](../admin/workloads/submitting-workloads.md#how-to-submit-a-workload). -* Added new graphs for *Inference* workloads. The new graphs provide more information for *Inference* workloads to help analyze performance of the workloads. New graphs include Latency, Throughput, and number of replicas. For more information, see [Workloads View](../admin/workloads/README.md#workloads-view) (Requires minimum cluster version v2.18). +* Added new graphs for *Inference* workloads. The new graphs provide more information for *Inference* workloads to help analyze performance of the workloads. New graphs include Latency, Throughput, and number of replicas. For more information, see [Workloads View](../admin/workloads/README.md#workloads-view). (Requires minimum cluster version v2.18). * Added latency metric for autoscaling. This feature allows automatic scale-up/down the number of replicas of a Run:ai inference workload based on the threshold set by the ML Engineer. This ensures that response time is kept under the target SLA. (Requires minimum cluster version v2.18). -* Improved autoscaling for inference models by taking out ChatBot UI from models images. By moving ChatBot UI to predefined *Environments*, autoscaling is more accurate by taking into account all types of requests (API, and ChatBot UI). Adding a ChatBot UI environment preset by Run:ai allows AI practitioners to easily connect them to workloads. 
+* Improved autoscaling for inference models by taking out ChatBot UI from models images. By moving ChatBot UI to predefined *Environments*, autoscaling is more accurate by taking into account all types of requests (API, and ChatBot UI). Adding a ChatBot UI environment preset by Run:ai allows AI practitioners to easily connect them to workloads. * Added more precision to trigger auto-scaling to zero. Now users can configure a precise consecutive idle threshold custom setting to trigger Run:ai inference workloads to scale-to-zero. (Requires minimum cluster version v2.18). @@ -101,7 +101,7 @@ date: 2024-June-14 #### Policy for distributed and inference workloads in the API -Added a new API for creating distributed training workload policies and inference workload policies. These new policies in the API allow to set defaults, enforce rules and impose setup on distributed training and inference workloads. For distributed policies, worker and master may require different rules due to their different specifications. The new capability is currently available via API only. Documentation on submitting policies to follow shortly. +* Added a new API for creating distributed training workload policies and inference workload policies. These new policies in the API allow you to set defaults, enforce rules and impose setup on distributed training and inference workloads. For distributed policies, worker and master may require different rules due to their different specifications. The new capability is currently available via API only. Documentation on submitting policies to follow shortly. ## Deprecation Notifications diff --git a/mkdocs.yml b/mkdocs.yml index dd0620c3f3..d3b55256cb 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -113,9 +113,6 @@ plugins: 'admin/runai-setup/cluster-setup/researcher-authentication.md' : 'admin/runai-setup/authentication/sso.md' 'admin/researcher-setup/cli-troubleshooting.md' : 'admin/troubleshooting/troubleshooting.md' 'developer/deprecated/inference/submit-via-yaml.md' : 'developer/cluster-api/other-resources.md' - 'Researcher/researcher-library/rl-hpo-support.md' : 'Researcher/scheduling/hpo.md' - 'Researcher/researcher-library/researcher-library-overview.md' : 'Researcher/scheduling/hpo.md' - nav: - Home: - 'Overview': 'index.md' @@ -189,7 +186,7 @@ nav: - 'Email and System Notifications': 'admin/runai-setup/notifications/notifications.md' - 'Maintenance' : - 'Node Downtime' : 'admin/runai-setup/maintenance/node-downtime.md' - - 'Monitoring Cluster Health' : 'admin/runai-setup/maintenance/alert-monitoring.md' + - 'System Monitoring' : 'admin/runai-setup/maintenance/alert-monitoring.md' - 'Audit Log' : 'admin/runai-setup/maintenance/audit-log.md' - 'Researcher Setup' : - 'Introduction' : 'admin/researcher-setup/researcher-setup-intro.md' @@ -209,6 +206,8 @@ nav: - 'Secrets' : 'admin/workloads/secrets.md' - 'Inference' : 'admin/workloads/inference-overview.md' - 'Submitting Workloads' : 'admin/workloads/submitting-workloads.md' + - 'Managing AI Initiatives' : + - 'Overview' : 'admin/aiinitiatives/overview.md' - 'User Interface' : - 'Overview' : 'admin/admin-ui-setup/overview.md' - 'Users' : 'admin/admin-ui-setup/admin-ui-users.md' @@ -217,29 +216,11 @@ - 'Dashboard Analysis' : 'admin/admin-ui-setup/dashboard-analysis.md' - 'Jobs' : 'admin/admin-ui-setup/jobs.md' - 'Credentials' : 'admin/admin-ui-setup/credentials-setup.md' - - 'Deployments' : 'admin/admin-ui-setup/deployments.md' - 'Templates': 'admin/admin-ui-setup/templates.md' - 'Troubleshooting' : - 'Cluster Health' : 
'admin/troubleshooting/cluster-health-check.md' - 'Troubleshooting' : 'admin/troubleshooting/troubleshooting.md' - 'Diagnostics' : 'admin/troubleshooting/diagnostics.md' - - 'Alert Manager Alerts' : - - 'admin/troubleshooting/alertmanager/README.md' - - 'Runai Agent Cluster Info Push Rate Low' : 'admin/troubleshooting/alertmanager/RunaiAgentClusterInfoPushRateLow.md' - - 'Runai Agent Pull Rate Low' : 'admin/troubleshooting/alertmanager/RunaiAgentPullRateLow.md' - - 'Runai Container Memory Usage Critical' : 'admin/troubleshooting/alertmanager/RunaiContainerMemoryUsageCritical.md' - - 'Runai Container Memory Usage Warning' : 'admin/troubleshooting/alertmanager/RunaiContainerMemoryUsageWarning.md' - - 'Runai Container Restarting' : 'admin/troubleshooting/alertmanager/RunaiContainerRestarting.md' - - 'Runai Cpu Usage Warning' : 'admin/troubleshooting/alertmanager/RunaiCpuUsageWarning.md' - - 'Runai Critical Problem' : 'admin/troubleshooting/alertmanager/RunaiCriticalProblem.md' - - 'Runai DaemonSet Rollout Stuck' : 'admin/troubleshooting/alertmanager/RunaiDaemonSetRolloutStuck.md' - - 'Runai DaemonSet Unavailable On Nodes' : 'admin/troubleshooting/alertmanager/RunaiDaemonSetUnavailableOnNodes.md' - - 'Runai Deployment Insufficient Replicas' : 'admin/troubleshooting/alertmanager/RunaiDeploymentInsufficientReplicas.md' - - 'Runai Deployment NoAvailable Replicas' : 'admin/troubleshooting/alertmanager/RunaiDeploymentNoAvailableReplicas.md' - - 'Runai Deployment Unavailable Replicas' : 'admin/troubleshooting/alertmanager/RunaiDeploymentUnavailableReplicas.md' - - 'Runai Project Controller Reconcile Failure' : 'admin/troubleshooting/alertmanager/RunaiProjectControllerReconcileFailure.md' - - 'Runai StatefulSet Insufficient Replicas' : 'admin/troubleshooting/alertmanager/RunaiStatefulSetInsufficientReplicas.md' - - 'Runai StatefulSet No Available Replicas' : 'admin/troubleshooting/alertmanager/RunaiStatefulSetNoAvailableReplicas.md' - 'Best Practices' : - 'From Docker to Run:ai ' : 'admin/researcher-setup/docker-to-runai.md' - 'Researcher' :