diff --git a/.github/workflows/mkdocs-release.yml b/.github/workflows/mkdocs-release.yml index 6b83c4ae7..eecee5773 100644 --- a/.github/workflows/mkdocs-release.yml +++ b/.github/workflows/mkdocs-release.yml @@ -29,5 +29,5 @@ jobs: git config --global user.email mike@docs.hopsworks.ai # Put this back and increment version when cutting a new release branch - # - name: mike deploy docs - # run: mike deploy 3.0 latest -u --push + - name: mike deploy docs + run: mike deploy 4.3 latest -u --push diff --git a/docs/assets/images/guides/feature_group/credentials_selection.png b/docs/assets/images/guides/feature_group/credentials_selection.png new file mode 100644 index 000000000..8cbb87d0e Binary files /dev/null and b/docs/assets/images/guides/feature_group/credentials_selection.png differ diff --git a/docs/assets/images/guides/feature_group/data_source.png b/docs/assets/images/guides/feature_group/data_source.png new file mode 100644 index 000000000..9db62e434 Binary files /dev/null and b/docs/assets/images/guides/feature_group/data_source.png differ diff --git a/docs/assets/images/guides/feature_group/ext_table_selection.png b/docs/assets/images/guides/feature_group/ext_table_selection.png new file mode 100644 index 000000000..8b418bfd0 Binary files /dev/null and b/docs/assets/images/guides/feature_group/ext_table_selection.png differ diff --git a/docs/assets/images/guides/feature_group/primary_key_selection.png b/docs/assets/images/guides/feature_group/primary_key_selection.png new file mode 100644 index 000000000..05f68753f Binary files /dev/null and b/docs/assets/images/guides/feature_group/primary_key_selection.png differ diff --git a/docs/assets/images/guides/feature_group/validation_ext_feature_group.png b/docs/assets/images/guides/feature_group/validation_ext_feature_group.png new file mode 100644 index 000000000..d996f16ba Binary files /dev/null and b/docs/assets/images/guides/feature_group/validation_ext_feature_group.png differ diff --git a/docs/assets/images/guides/mlops/serving/deployment_external_access.png b/docs/assets/images/guides/mlops/serving/deployment_external_access.png new file mode 100644 index 000000000..076f2faa9 Binary files /dev/null and b/docs/assets/images/guides/mlops/serving/deployment_external_access.png differ diff --git a/docs/assets/images/guides/mlops/serving/deployment_external_access_edit.png b/docs/assets/images/guides/mlops/serving/deployment_external_access_edit.png new file mode 100644 index 000000000..328c51d29 Binary files /dev/null and b/docs/assets/images/guides/mlops/serving/deployment_external_access_edit.png differ diff --git a/docs/assets/images/guides/mlops/serving/deployment_external_api_key.png b/docs/assets/images/guides/mlops/serving/deployment_external_api_key.png new file mode 100644 index 000000000..7a9f0da02 Binary files /dev/null and b/docs/assets/images/guides/mlops/serving/deployment_external_api_key.png differ diff --git a/docs/assets/images/guides/mlops/serving/deployment_external_code_snippets.png b/docs/assets/images/guides/mlops/serving/deployment_external_code_snippets.png new file mode 100644 index 000000000..3e3f23cdc Binary files /dev/null and b/docs/assets/images/guides/mlops/serving/deployment_external_code_snippets.png differ diff --git a/docs/assets/images/guides/mlops/serving/deployment_external_groups.png b/docs/assets/images/guides/mlops/serving/deployment_external_groups.png new file mode 100644 index 000000000..20494dead Binary files /dev/null and 
b/docs/assets/images/guides/mlops/serving/deployment_external_groups.png differ diff --git a/docs/assets/images/guides/mlops/serving/deployment_external_list.png b/docs/assets/images/guides/mlops/serving/deployment_external_list.png new file mode 100644 index 000000000..083cc5c5c Binary files /dev/null and b/docs/assets/images/guides/mlops/serving/deployment_external_list.png differ diff --git a/docs/assets/images/guides/mlops/serving/deployment_overview.png b/docs/assets/images/guides/mlops/serving/deployment_overview.png index 0c67c7fe6..bc75dcb28 100644 Binary files a/docs/assets/images/guides/mlops/serving/deployment_overview.png and b/docs/assets/images/guides/mlops/serving/deployment_overview.png differ diff --git a/docs/assets/images/guides/mlops/serving/login_external_idp.png b/docs/assets/images/guides/mlops/serving/login_external_idp.png new file mode 100644 index 000000000..9eb44127f Binary files /dev/null and b/docs/assets/images/guides/mlops/serving/login_external_idp.png differ diff --git a/docs/setup_installation/admin/auth.md b/docs/setup_installation/admin/auth.md index 2d986b963..7c6c5b676 100644 --- a/docs/setup_installation/admin/auth.md +++ b/docs/setup_installation/admin/auth.md @@ -1,7 +1,7 @@ # Authentication Methods ## Introduction -Hopsworks can be configured to use different type of authentication methods. In this guide we will look at the +Hopsworks can be configured to use different types of authentication methods. In this guide we will look at the different authentication methods available in Hopsworks. ## Prerequisites diff --git a/docs/user_guides/fs/feature_group/create_external.md b/docs/user_guides/fs/feature_group/create_external.md index 9c0bbceb1..04a6921fa 100644 --- a/docs/user_guides/fs/feature_group/create_external.md +++ b/docs/user_guides/fs/feature_group/create_external.md @@ -134,18 +134,40 @@ Nevertheless, external feature groups defined top of any storage connector can b ## Create using the UI -You can also create a new feature group through the UI. For this, navigate to the `Feature Groups` section and press the `Create` button at the top-right corner. +You can also create a new feature group through the UI. For this, navigate to the `Data Source` section and select existing credentials or create new ones for your preferred data source.

- List of Feature Groups + Data Source UI

-Subsequently, you will be able to define its properties (such as name, mode, features, and more). Refer to the documentation above for an explanation of the parameters available, they are the same as when you create a feature group using the SDK. Finally, complete the creation by clicking `Create New Feature Group` at the bottom of the page. +If you have existing credentials, simply proceed by clicking `Next: Select Tables`. If you do not, create and save the credentials first.

- Create new Feature Group + setup credentials in Data Sources +
+

+ +The database navigation structure depends on your specific data source. You'll navigate through the appropriate hierarchy for your platform, such as Database → Schema → Table for Snowflake, or Project → Dataset → Table for BigQuery. In the UI you can select one or more tables; for each selected table, you must designate one or more primary keys before proceeding. You can also review the names and data types of the individual columns you want to include. + +

+

+ Select Table in Data Sources for External feature Group +
+

+ +

+

+ select details of external feature group +
+

+ +Complete the creation by clicking `Next: Review Configuration` at the bottom of the page. You will then be prompted with a final validation window where you can enter a name for your external feature group. + +

+

+ Validate the creation of a new external feature group

diff --git a/docs/user_guides/mlops/serving/external-access.md b/docs/user_guides/mlops/serving/external-access.md new file mode 100644 index 000000000..d32130dea --- /dev/null +++ b/docs/user_guides/mlops/serving/external-access.md @@ -0,0 +1,138 @@ +--- +description: Documentation on how to configure external access to a model deployment +--- + +# How To Configure External Access To A Model Deployment + +## Introduction + +Hopsworks supports role-based access control (RBAC) for project members within a project, where a project's ML assets can only be accessed by Hopsworks users that are members of that project (See [governance](../../../concepts/projects/governance.md)). + +However, there are cases where you might want to grant ==external users== access to specific model deployments without them having to register in Hopsworks or join the project, which would give them access to all project ML assets. For these cases, Hopsworks supports fine-grained access control to model deployments based on ==user groups== managed by an external Identity Provider. + +!!! info "Authentication methods" + Hopsworks can be configured to use different types of authentication methods including OAuth2, LDAP and Kerberos. See the [Authentication Methods Guide](../../../setup_installation/admin/auth.md) for more information. + +## GUI (for Hopsworks users) + +### Step 1: Navigate to a model deployment + +If you have at least one model deployment already created, navigate to the model deployments page by clicking on the `Deployments` tab on the navigation menu on the left. +

+

+ Deployments navigation tab +
Deployments navigation tab
+
+

+ +Once on the model deployments page, find the model deployment you want to configure external access for and click on the name of the deployment to open the model deployment overview page. + +

+

+ Deployment overview +
Deployment overview
+
+

+ +### Step 2: Go to External Access + +You can find the external access configuration by clicking on `External access` on the navigation menu on the left or scrolling down to the external access section. + +

+

+ Deployment external access +
External access configuration
+
+

+ +### Step 3: Add or remove user groups + +In this section, you can add and remove user groups by clicking on `edit external user groups` and typing the group name in the **free-text** input field or **selecting** one of the existing ones in the dropdown list. After that, click on the `save` button to persist the changes. + + +!!! warning "Case sensitivity" + Inference requests are authorized using a ==case-sensitive exact match== between the group names of the user making the request and the group names granted access to the model deployment. Therefore, a user assigned to the group `lab1` won't have access to a model deployment accessible by group `LAB1`. + +

+

+ Deployment external access +
External access configuration
+
+

+ +## GUI (for external users) + +### Step 1: Login with the external identity provider + +Navigate to Hopsworks, and click on the `Login with` button to sign in using the configured external identity provider (e.g., Keycloak in this example). + +

+

+ Login external identity provider +
Login with External Identity Provider
+
+

+ +### Step 2: Explore the model deployments you are granted access to + +Once you sign in to Hopsworks, you can see the list of model deployments you are granted access to based on your assigned groups. + +

+

+ Deployments list +
Deployments with external access
+
+

+ +### Step 3: Inspect your current groups + +You can find the current groups you are assigned to at the top of the page. + +

+

+ External user groups +
External user groups
+
+

+ +### Step 4: Get an API key + +Inference requests to model deployments are authenticated and authorized based on your external user and user groups. You can create API keys to authenticate your inference requests by clicking on the `Create API Key` button. + +!!! info "Authorization header" + API keys are set in the `Authorization` header following the format `ApiKey <api-key-value>` + +

+

+ Get API key +
Get API key
+
+

+ +### Step 5: Send inference requests + +Depending on the type of model deployment, the URI of the model server can differ (e.g., `/chat/completions` for LLM deployments or `/predict` for traditional model deployments). You can find the corresponding URI on every model deployment card. + +In addition to the `Authorization` header containing the API key, the `Host` header needs to be set according to the model deployment where the inference requests are sent. This header is used by the ingress to route the inference requests to the corresponding model deployment. You can find the `Host` header value in the model deployment card. + +!!! tip "Code snippets" + For clients sending inference requests with curl-like tools or OpenAI API-compatible libraries (e.g., LangChain), you can find code snippet examples by clicking on the `Curl >_` and `LangChain >_` buttons. + +
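+
+As a minimal sketch, the Python snippet below sends an inference request with both headers set. The base URL, `Host` value, API key and request payload are placeholders; replace them with the values shown on your model deployment card.
+
+```python
+import requests
+
+# Placeholder values -- copy the real ones from the model deployment card
+BASE_URL = "https://hopsworks.example.com"          # Hopsworks cluster URL
+HOST_HEADER = "mydeployment.hopsworks.example.com"  # Host value from the deployment card
+API_KEY = "<api-key-value>"                         # API key created in the previous step
+
+response = requests.post(
+    f"{BASE_URL}/predict",  # or /chat/completions for LLM deployments
+    headers={
+        "Authorization": f"ApiKey {API_KEY}",  # API key authentication
+        "Host": HOST_HEADER,                   # used by the ingress to route the request
+    },
+    json={"inputs": [[1.0, 2.0, 3.0]]},  # payload format depends on the model server
+)
+print(response.json())
+```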

+

+ Deployment endpoint +
Deployment endpoint
+
+

+ +## Refreshing External User Groups + +Every time an external user signs in to Hopsworks using a pre-configured [authentication method](../../../setup_installation/admin/auth.md), Hopsworks fetches the external user groups and updates the internal state accordingly. Given that groups can be added/removed from users at any time by the Identity Provider, Hopsworks needs to periodically fetch the external user groups to keep the state updated. + +Therefore, external users that want to access model deployments are **required to login periodically** to ensure they are still part of the allowed groups. The timespan between logins is controlled by the configuration parameter `requireExternalUserLoginAfterHours` available during the Hopsworks installation and upgrade. + +The `requireExternalUserLoginAfterHours` configuration parameter controls the ==number of hours== after which external users are required to sign in to Hopsworks to refresh their external user groups. + +!!! info "Configuring `requireExternalUserLoginAfterHours`" + Allowed values are -1, 0 and greater than 0, where -1 disables the periodic login requirement and 0 disables external access completely for every model deployment. diff --git a/docs/user_guides/mlops/serving/index.md b/docs/user_guides/mlops/serving/index.md index 1ab46e000..dc0915bd9 100644 --- a/docs/user_guides/mlops/serving/index.md +++ b/docs/user_guides/mlops/serving/index.md @@ -26,4 +26,8 @@ Configure the predictor to log inference requests and predictions, see the [Infe ### Troubleshooting -Inspect the model server logs to troubleshoot your model deployments, see the [Troubleshooting Guide](troubleshooting.md). \ No newline at end of file +Inspect the model server logs to troubleshoot your model deployments, see the [Troubleshooting Guide](troubleshooting.md). + +### External access + +Grant users authenticated by an external Identity Provider access to model deployments, see the [External Access Guide](external-access.md). \ No newline at end of file diff --git a/docs/user_guides/projects/jobs/notebook_job.md b/docs/user_guides/projects/jobs/notebook_job.md index a17788651..364b5900e 100644 --- a/docs/user_guides/projects/jobs/notebook_job.md +++ b/docs/user_guides/projects/jobs/notebook_job.md @@ -82,7 +82,7 @@ It is possible to also set following configuration settings for a `PYTHON` job. * `Environment`: The python environment to use * `Container memory`: The amount of memory in MB to be allocated to the Jupyter Notebook script * `Container cores`: The number of cores to be allocated for the Jupyter Notebook script -* `Additional files`: List of files that will be locally accessible by the application +* `Additional files`: List of files that will be locally accessible in the working directory of the application. Only recommended to use if project datasets are not mounted under `/hopsfs`. You can always modify the arguments in the job settings.

@@ -142,7 +142,7 @@ In this snippet we get the `JobsApi` object to get the default job configuration ```python -jobs_api = project.get_jobs_api() +jobs_api = project.get_job_api() notebook_job_config = jobs_api.get_configuration("PYTHON") @@ -166,7 +166,33 @@ In this code snippet, we execute the job with arguments and wait until it reache execution = job.run(args='-p a 2 -p b 5', await_termination=True) ``` -### API Reference +## Configuration +The following table describes the JSON payload returned by `jobs_api.get_configuration("PYTHON")` + +| Field | Type | Description | Default | +|-------------------------|----------------|------------------------------------------------------|--------------------------| +| `type` | string | Type of the job configuration | `"pythonJobConfiguration"` | +| `appPath` | string | Project path to notebook (e.g `Resources/foo.ipynb`) | `null` | +| `environmentName` | string | Name of the python environment | `"pandas-training-pipeline"` | +| `resourceConfig.cores` | number (float) | Number of CPU cores to be allocated | `1.0` | +| `resourceConfig.memory` | number (int) | Number of MBs to be allocated | `2048` | +| `resourceConfig.gpus` | number (int) | Number of GPUs to be allocated | `0` | +| `logRedirection` | boolean | Whether logs are redirected | `true` | +| `jobType` | string | Type of job | `"PYTHON"` | + + +## Accessing project data +!!! notice "Recommended approach if `/hopsfs` is mounted" + If your Hopsworks installation is configured to mount the project datasets under `/hopsfs`, which it is in most cases, then please refer to this section instead of the `Additional files` property to reference file resources. + +### Absolute paths +The project datasets are mounted under `/hopsfs`, so you can access `data.csv` from the `Resources` dataset using `/hopsfs/Resources/data.csv` in your notebook. + +### Relative paths +The notebook's working directory is the folder it is located in. For example, if it is located in the `Resources` dataset, and you have a file named `data.csv` in that dataset, you simply access it using `data.csv`. Also, if you write a local file, for example `output.txt`, it will be saved in the `Resources` dataset. 
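+
+As a minimal sketch, assuming a `data.csv` file exists in the `Resources` dataset and that pandas is available in the selected Python environment, the two access styles look like this:
+
+```python
+import pandas as pd
+
+# Absolute path: project datasets are mounted under /hopsfs
+df = pd.read_csv("/hopsfs/Resources/data.csv")
+
+# Relative path: resolved against the dataset the notebook is stored in (here: Resources)
+df = pd.read_csv("data.csv")
+
+# Local writes end up in the same dataset
+df.head(10).to_csv("output.txt", index=False)
+```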
+ + +## API Reference [Jobs](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/jobs/) diff --git a/docs/user_guides/projects/jobs/pyspark_job.md b/docs/user_guides/projects/jobs/pyspark_job.md index 3cc9e3030..c0cb7e804 100644 --- a/docs/user_guides/projects/jobs/pyspark_job.md +++ b/docs/user_guides/projects/jobs/pyspark_job.md @@ -8,7 +8,7 @@ description: Documentation on how to configure and execute a PySpark job on Hops All members of a project in Hopsworks can launch the following types of applications through a project's Jobs service: -- Python (*Hopsworks Enterprise only*) +- Python - Apache Spark Launching a job of any type is very similar process, what mostly differs between job types is @@ -179,7 +179,7 @@ In this snippet we get the `JobsApi` object to get the default job configuration ```python -jobs_api = project.get_jobs_api() +jobs_api = project.get_job_api() spark_config = jobs_api.get_configuration("PYSPARK") @@ -211,7 +211,45 @@ print(f_err.read()) ``` -### API Reference +## Configuration +The following table describes the JSON payload returned by `jobs_api.get_configuration("PYSPARK")` + +| Field | Type | Description | Default | +| ------------------------------------------ | -------------- |-----------------------------------------------------| -------------------------- | +| `type` | string | Type of the job configuration | `"sparkJobConfiguration"` | +| `appPath` | string | Project path to script (e.g `Resources/foo.py`) | `null` | +| `environmentName` | string | Name of the project spark environment | `"spark-feature-pipeline"` | +| `spark.driver.cores` | number (float) | Number of CPU cores allocated for the driver | `1.0` | +| `spark.driver.memory` | number (int) | Memory allocated for the driver (in MB) | `2048` | +| `spark.executor.instances` | number (int) | Number of executor instances | `1` | +| `spark.executor.cores` | number (float) | Number of CPU cores per executor | `1.0` | +| `spark.executor.memory` | number (int) | Memory allocated per executor (in MB) | `4096` | +| `spark.dynamicAllocation.enabled` | boolean | Enable dynamic allocation of executors | `true` | +| `spark.dynamicAllocation.minExecutors` | number (int) | Minimum number of executors with dynamic allocation | `1` | +| `spark.dynamicAllocation.maxExecutors` | number (int) | Maximum number of executors with dynamic allocation | `2` | +| `spark.dynamicAllocation.initialExecutors` | number (int) | Initial number of executors with dynamic allocation | `1` | +| `spark.blacklist.enabled` | boolean | Whether executor/node blacklisting is enabled | `false` | + + +## Accessing project data + +### Read directly from the filesystem (recommended) + +To read a dataset in your project using Spark, use the full filesystem path where the data is stored. For example, to read a CSV file named `data.csv` located in the `Resources` dataset of a project called `my_project`: + +```python +df = spark.read.csv("/Projects/my_project/Resources/data.csv", header=True, inferSchema=True) +df.show() +``` + +### Additional files + +Different file types can be attached to the spark job and made available in the `/srv/hops/artifacts` folder when the PySpark job is started. This configuration is mainly useful when you need to add additional setup, such as jars that needs to be added to the CLASSPATH. 
+ +When reading data in your Spark job it is recommended to use the Spark read API as previously demonstrated, since this reads from the filesystem directly, whereas `Additional files` configuration options will download the files in its entirety and is not a scalable option. + + +## API Reference [Jobs](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/jobs/) diff --git a/docs/user_guides/projects/jobs/python_job.md b/docs/user_guides/projects/jobs/python_job.md index 4fa58cfa6..420e38e49 100644 --- a/docs/user_guides/projects/jobs/python_job.md +++ b/docs/user_guides/projects/jobs/python_job.md @@ -81,7 +81,8 @@ It is possible to also set following configuration settings for a `PYTHON` job. * `Environment`: The python environment to use * `Container memory`: The amount of memory in MB to be allocated to the Python script * `Container cores`: The number of cores to be allocated for the Python script -* `Additional files`: List of files that will be locally accessible by the application +* `Additional files`: List of files that will be locally accessible in the working directory of the application. Only recommended to use if project datasets are not mounted under `/hopsfs`. + You can always modify the arguments in the job settings.

@@ -129,7 +130,7 @@ In this snippet we get the `JobsApi` object to get the default job configuration ```python -jobs_api = project.get_jobs_api() +jobs_api = project.get_job_api() py_job_config = jobs_api.get_configuration("PYTHON") @@ -163,7 +164,33 @@ print(f_err.read()) ``` -### API Reference +## Configuration +The following table describes the JSON payload returned by `jobs_api.get_configuration("PYTHON")` + +| Field | Type | Description | Default | +|-------------------------|----------------|-------------------------------------------------|--------------------------| +| `type` | string | Type of the job configuration | `"pythonJobConfiguration"` | +| `appPath` | string | Project path to script (e.g `Resources/foo.py`) | `null` | +| `environmentName` | string | Name of the project python environment | `"pandas-training-pipeline"` | +| `resourceConfig.cores` | number (float) | Number of CPU cores to be allocated | `1.0` | +| `resourceConfig.memory` | number (int) | Number of MBs to be allocated | `2048` | +| `resourceConfig.gpus` | number (int) | Number of GPUs to be allocated | `0` | +| `logRedirection` | boolean | Whether logs are redirected | `true` | +| `jobType` | string | Type of job | `"PYTHON"` | + + +## Accessing project data +!!! notice "Recommended approach if `/hopsfs` is mounted" + If your Hopsworks installation is configured to mount the project datasets under `/hopsfs`, which it is in most cases, then please refer to this section instead of the `Additional files` property to reference file resources. + +### Absolute paths +The project datasets are mounted under `/hopsfs`, so you can access `data.csv` from the `Resources` dataset using `/hopsfs/Resources/data.csv` in your script. + +### Relative paths +The script's working directory is the folder it is located in. For example, if it is located in the `Resources` dataset, and you have a file named `data.csv` in that dataset, you simply access it using `data.csv`. Also, if you write a local file, for example `output.txt`, it will be saved in the `Resources` dataset. + + +## API Reference [Jobs](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/jobs/) diff --git a/docs/user_guides/projects/jobs/ray_job.md b/docs/user_guides/projects/jobs/ray_job.md index 99312f4a2..1e919fac6 100644 --- a/docs/user_guides/projects/jobs/ray_job.md +++ b/docs/user_guides/projects/jobs/ray_job.md @@ -8,7 +8,7 @@ description: Documentation on how to configure and execute a Ray job on Hopswork All members of a project in Hopsworks can launch the following types of applications through a project's Jobs service: -- Python (*Hopsworks Enterprise only*) +- Python - Apache Spark - Ray @@ -168,7 +168,7 @@ In this snippet we get the `JobsApi` object to get the default job configuration ```python -jobs_api = project.get_jobs_api() +jobs_api = project.get_job_api() ray_config = jobs_api.get_configuration("RAY") @@ -203,7 +203,12 @@ print(f_err.read()) ``` -### API Reference +## Accessing project data + +The project datasets are mounted under `/hopsfs`, so you can access `data.csv` from the `Resources` dataset using `/hopsfs/Resources/data.csv` in your script. 
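+
+As a minimal sketch, assuming a `data.csv` file exists in the `Resources` dataset and that pandas is available in the Ray environment, both the driver script and remote tasks can read from the mount:
+
+```python
+import pandas as pd
+import ray
+
+ray.init()  # in a Hopsworks Ray job this typically attaches to the cluster started for the job
+
+@ray.remote
+def row_count(path: str) -> int:
+    # the project datasets are also mounted under /hopsfs in the Ray containers
+    return len(pd.read_csv(path))
+
+print(ray.get(row_count.remote("/hopsfs/Resources/data.csv")))
+```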
+ + +## API Reference [Jobs](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/jobs/) diff --git a/docs/user_guides/projects/jobs/spark_job.md b/docs/user_guides/projects/jobs/spark_job.md index 66be8c001..6d0f0510b 100644 --- a/docs/user_guides/projects/jobs/spark_job.md +++ b/docs/user_guides/projects/jobs/spark_job.md @@ -183,7 +183,7 @@ In this snippet we get the `JobsApi` object to get the default job configuration ```python -jobs_api = project.get_jobs_api() +jobs_api = project.get_job_api() spark_config = jobs_api.get_configuration("SPARK") @@ -212,7 +212,48 @@ print(f_err.read()) ``` -### API Reference +## Configuration +The following table describes the JSON payload returned by `jobs_api.get_configuration("SPARK")` + +| Field | Type | Description | Default | +|--------------------------------------------| -------------- |---------------------------------------------------------| -------------------------- | +| `type` | string | Type of the job configuration | `"sparkJobConfiguration"` | +| `appPath` | string | Project path to spark program (e.g `Resources/foo.jar`) | `null` | +| `mainClass` | string | Name of the main class to run (e.g `org.company.Main`) | `null` | +| `environmentName` | string | Name of the project spark environment | `"spark-feature-pipeline"` | +| `spark.driver.cores` | number (float) | Number of CPU cores allocated for the driver | `1.0` | +| `spark.driver.memory` | number (int) | Memory allocated for the driver (in MB) | `2048` | +| `spark.executor.instances` | number (int) | Number of executor instances | `1` | +| `spark.executor.cores` | number (float) | Number of CPU cores per executor | `1.0` | +| `spark.executor.memory` | number (int) | Memory allocated per executor (in MB) | `4096` | +| `spark.dynamicAllocation.enabled` | boolean | Enable dynamic allocation of executors | `true` | +| `spark.dynamicAllocation.minExecutors` | number (int) | Minimum number of executors with dynamic allocation | `1` | +| `spark.dynamicAllocation.maxExecutors` | number (int) | Maximum number of executors with dynamic allocation | `2` | +| `spark.dynamicAllocation.initialExecutors` | number (int) | Initial number of executors with dynamic allocation | `1` | +| `spark.blacklist.enabled` | boolean | Whether executor/node blacklisting is enabled | `false` | + +## Accessing project data + +### Read directly from the filesystem (recommended) + +To read a dataset in your project using Spark, use the full filesystem path where the data is stored. For example, to read a CSV file named `data.csv` located in the `Resources` dataset of a project called `my_project`: + +```java +Dataset df = spark.read() + .option("header", "true") // CSV has header + .option("inferSchema", "true") // Infer data types + .csv("/Projects/my_project/Resources/data.csv"); + +df.show(); +``` + +### Additional files + +Different file types can be attached to the spark job and made available in the `/srv/hops/artifacts` folder when the Spark job is started. This configuration is mainly useful when you need to add additional configuration such as jars that needs to be added to the CLASSPATH. + +When reading data in your Spark job it is recommended to use the Spark read API as previously demonstrated, since this reads from the filesystem directly, whereas `Additional files` configuration options will download the files in its entirety and is not a scalable option. 
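+
+As a minimal sketch of tuning the configuration fields from the table above before submitting a job, assuming the `create_job` helper of the Hopsworks Python library and a hypothetical `Resources/my_program.jar` with main class `org.company.Main`:
+
+```python
+import hopsworks
+
+project = hopsworks.login()
+jobs_api = project.get_job_api()
+
+# Fetch the default Spark configuration and adjust the fields documented above
+spark_config = jobs_api.get_configuration("SPARK")
+spark_config["appPath"] = "Resources/my_program.jar"    # hypothetical jar in the project
+spark_config["mainClass"] = "org.company.Main"          # hypothetical main class
+spark_config["spark.executor.memory"] = 8192            # MB per executor
+spark_config["spark.dynamicAllocation.maxExecutors"] = 4
+
+job = jobs_api.create_job("my_spark_job", spark_config)
+execution = job.run(await_termination=True)
+```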
+ +## API Reference [Jobs](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/jobs/) diff --git a/docs/user_guides/projects/jupyter/python_notebook.md b/docs/user_guides/projects/jupyter/python_notebook.md index 3412a0d96..409faa6d5 100644 --- a/docs/user_guides/projects/jupyter/python_notebook.md +++ b/docs/user_guides/projects/jupyter/python_notebook.md @@ -5,7 +5,7 @@ Jupyter is provided as a service in Hopsworks, providing the same user experience and features as if run on your laptop. * Supports JupyterLab and the classic Jupyter front-end -* Configured with Python and PySpark kernels +* Configured with Python3, PySpark and Ray kernels ## Step 1: Jupyter dashboard @@ -82,6 +82,17 @@ Start the Jupyter instance by clicking the `Run Jupyter` button.

+## Accessing project data +!!! notice "Recommended approach if `/hopsfs` is mounted" + If your Hopsworks installation is configured to mount the project datasets under `/hopsfs`, which it is in most cases, then please refer to this section. + If the file system is not mounted, then project files can be localized using the [download api](https://docs.hopsworks.ai/hopsworks-api/{{{ hopsworks_version }}}/generated/api/datasets/#download) to localize files in the current working directory. + +### Absolute paths +The project datasets are mounted under `/hopsfs`, so you can access `data.csv` from the `Resources` dataset using `/hopsfs/Resources/data.csv` in your notebook. + +### Relative paths +The notebook's working directory is the folder it is located in. For example, if it is located in the `Resources` dataset, and you have a file named `data.csv` in that dataset, you simply access it using `data.csv`. Also, if you write a local file, for example `output.txt`, it will be saved in the `Resources` dataset. + ## Going Further diff --git a/docs/user_guides/projects/jupyter/ray_notebook.md b/docs/user_guides/projects/jupyter/ray_notebook.md index d6d4eae3e..07077a11a 100644 --- a/docs/user_guides/projects/jupyter/ray_notebook.md +++ b/docs/user_guides/projects/jupyter/ray_notebook.md @@ -139,4 +139,8 @@ In the Ray Dashboard, you can monitor the resources used by code you are runnin Access Ray Dashboard
Access Ray Dashboard for Jupyter Ray session
-

\ No newline at end of file +

+ +## Accessing project data + +The project datasets are mounted under `/hopsfs` in the Ray containers, so you can access `data.csv` from the `Resources` dataset using `/hopsfs/Resources/data.csv`. diff --git a/docs/user_guides/projects/jupyter/spark_notebook.md b/docs/user_guides/projects/jupyter/spark_notebook.md index c358bee61..689df54ba 100644 --- a/docs/user_guides/projects/jupyter/spark_notebook.md +++ b/docs/user_guides/projects/jupyter/spark_notebook.md @@ -135,6 +135,23 @@ Navigate back to Hopsworks and a Spark session will have appeared, click on the

+## Accessing project data + +### Read directly from the filesystem (recommended) + +To read a dataset in your project using Spark, use the full filesystem path where the data is stored. For example, to read a CSV file named `data.csv` located in the `Resources` dataset of a project called `my_project`: + +```python +df = spark.read.csv("/Projects/my_project/Resources/data.csv", header=True, inferSchema=True) +df.show() +``` + +### Additional files + +Different files can be attached to the jupyter session and made available in the `/srv/hops/artifacts` folder when the PySpark kernel is started. This configuration is mainly useful when you need to add additional configuration such as jars that needs to be added to the CLASSPATH. + +When reading data in your Spark application, it is recommended to use the Spark read API as previously demonstrated, since this reads from the filesystem directly, whereas `Additional files` configuration options will download the files in its entirety and is not a scalable option. + ## Going Further You can learn how to [install a library](../python/python_install.md) so that it can be used in a notebook. diff --git a/mkdocs.yml b/mkdocs.yml index a84b46b95..4e7f707c1 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -4,8 +4,6 @@ site_author: "Hopsworks" site_url: "https://docs.hopsworks.ai/" # Repository -repo_name: logicalclocks/hopsworks -repo_url: https://github.com/logicalclocks/hopsworks edit_uri: "" strict: false @@ -152,11 +150,11 @@ nav: - Run Ray Notebook: user_guides/projects/jupyter/ray_notebook.md - Remote Filesystem Driver: user_guides/projects/jupyter/remote_filesystem_driver.md - Jobs: + - Run Python Job: user_guides/projects/jobs/python_job.md + - Run Jupyter Notebook Job: user_guides/projects/jobs/notebook_job.md - Run PySpark Job: user_guides/projects/jobs/pyspark_job.md - Run Spark Job: user_guides/projects/jobs/spark_job.md - - Run Python Job: user_guides/projects/jobs/python_job.md - Run Ray Job: user_guides/projects/jobs/ray_job.md - - Run Jupyter Notebook Job: user_guides/projects/jobs/notebook_job.md - Scheduling: user_guides/projects/jobs/schedule_job.md - Kubernetes Scheduling: user_guides/projects/scheduling/kube_scheduler.md - Airflow: user_guides/projects/airflow/airflow.md @@ -201,6 +199,7 @@ nav: - Inference Batcher: user_guides/mlops/serving/inference-batcher.md - API Protocol: user_guides/mlops/serving/api-protocol.md - Troubleshooting: user_guides/mlops/serving/troubleshooting.md + - External Access: user_guides/mlops/serving/external-access.md - Vector Database: user_guides/mlops/vector_database/index.md - Provenance: user_guides/mlops/provenance/provenance.md - Migration: @@ -245,7 +244,7 @@ nav: - Audit: - Access Audit Logs: setup_installation/admin/audit/audit-logs.md - Export Audit Logs: setup_installation/admin/audit/export-audit-logs.md - - : https://docs.hopsworks.ai + - : https://docs.hopsworks.ai - Community ↗: https://community.hopsworks.ai/ theme: @@ -268,7 +267,7 @@ theme: - navigation.indexes extra: - hopsworks_version: dev + hopsworks_version: 4.3 version: provider: mike default: latest