Commit 03fc722

Add engine parameter to hopsworks.login (4.1) (#384)
* Add engine parameter to hopsworks.login
* Fix the docstring for the engine parameter
* Fix typing
* Remove engine param from get_feature_store
* Remove redundant code from get_feature_store
1 parent 64c246d commit 03fc722

File tree: 3 files changed (+16, -24 lines)


python/hopsworks/__init__.py

Lines changed: 12 additions & 1 deletion
@@ -22,6 +22,7 @@
 import tempfile
 import warnings
 from pathlib import Path
+from typing import Literal, Union
 
 from hopsworks import client, constants, project, version
 from hopsworks.client.exceptions import (
@@ -83,6 +84,7 @@ def login(
     api_key_file: str = None,
     hostname_verification: bool = False,
     trust_store_path: str = None,
+    engine: Union[None, Literal["spark"], Literal["python"], Literal["training"]] = None,
 ) -> project.Project:
     """Connect to [Serverless Hopsworks](https://app.hopsworks.ai) by calling the `hopsworks.login()` function with no arguments.
 
@@ -122,6 +124,13 @@ def login(
         api_key_file: Path to file wih Api Key
         hostname_verification: Whether to verify Hopsworks' certificate
         trust_store_path: Path on the file system containing the Hopsworks certificates
+        engine: Which engine to use, `"spark"`, `"python"` or `"training"`. Defaults to `None`,
+            which initializes the engine to Spark if the environment provides Spark, for
+            example on Hopsworks and Databricks, or falls back to Python if Spark is not
+            available, e.g. on local Python environments or AWS SageMaker. This option
+            allows you to override this behaviour. `"training"` engine is useful when only
+            feature store metadata is needed, for example training dataset location and label
+            information when Hopsworks training experiment is conducted.
     # Returns
         `Project`: The Project object to perform operations on
     # Raises
@@ -138,7 +147,7 @@ def login(
 
     # If inside hopsworks, just return the current project for now
     if "REST_ENDPOINT" in os.environ:
-        _hw_connection = _hw_connection(hostname_verification=hostname_verification)
+        _hw_connection = _hw_connection(hostname_verification=hostname_verification, engine=engine)
         _connected_project = _hw_connection.get_project()
         _initialize_module_apis()
         print("\nLogged in to project, explore it here " + _connected_project.get_url())
@@ -207,6 +216,7 @@ def login(
         _hw_connection = _hw_connection(
             host=host,
             port=port,
+            engine=engine,
             api_key_file=api_key_path,
             hostname_verification=hostname_verification,
             trust_store_path=trust_store_path,
@@ -246,6 +256,7 @@ def login(
         _hw_connection = _hw_connection(
             host=host,
             port=port,
+            engine=engine,
            api_key_value=api_key,
             hostname_verification=hostname_verification,
             trust_store_path=trust_store_path,
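
To illustrate the change above, a minimal usage sketch of the new parameter follows; the hostname and API-key path are placeholders, not part of this commit, and omitting `engine` keeps the auto-detection described in the docstring.

import hopsworks

# Force the Python engine even where a Spark context would be available.
# host and api_key_file are placeholder values for this sketch.
project = hopsworks.login(
    host="my.cluster.hopsworks.ai",
    api_key_file="/path/to/api_key",
    engine="python",
)

# The "training" engine exposes only feature store metadata, e.g. training
# dataset locations and label information for a Hopsworks training experiment:
# project = hopsworks.login(api_key_file="/path/to/api_key", engine="training")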

python/hopsworks_common/connection.py

Lines changed: 2 additions & 18 deletions
@@ -100,7 +100,7 @@ class Connection:
             Defaults to `None`.
         engine: Which engine to use, `"spark"`, `"python"` or `"training"`. Defaults to `None`,
             which initializes the engine to Spark if the environment provides Spark, for
-            example on Hopsworks and Databricks, or falls back on Hive in Python if Spark is not
+            example on Hopsworks and Databricks, or falls back to Python if Spark is not
             available, e.g. on local Python environments or AWS SageMaker. This option
             allows you to override this behaviour. `"training"` engine is useful when only
             feature store metadata is needed, for example training dataset location and label
@@ -151,7 +151,6 @@ def __init__(
     def get_feature_store(
         self,
         name: Optional[str] = None,
-        engine: Optional[str] = None,
     ):  # -> feature_store.FeatureStore
         # the typing is commented out due to circular dependency, it breaks auto_doc.py
         """Get a reference to a feature store to perform operations on.
@@ -161,25 +160,10 @@ def get_feature_store(
 
         # Arguments
             name: The name of the feature store, defaults to `None`.
-            engine: Which engine to use, `"spark"`, `"python"` or `"training"`. Defaults to `None`,
-                which initializes the engine to Spark if the environment provides Spark, for
-                example on Hopsworks and Databricks, or falls back on Hive in Python if Spark is not
-                available, e.g. on local Python environments or AWS SageMaker. This option
-                allows you to override this behaviour. `"training"` engine is useful when only
-                feature store metadata is needed, for example training dataset location and label
-                information when Hopsworks training experiment is conducted.
 
         # Returns
             `FeatureStore`. A feature store handle object to perform operations on.
         """
-        # Ensure the engine is initialized and of right type
-        from hsfs import engine as hsfs_engine
-
-        if engine:
-            global _hsfs_engine_type
-            _hsfs_engine_type = engine
-        hsfs_engine.get_instance()
-
         if not name:
             name = client.get_instance()._project_name
         return self._feature_store_api.get(util.append_feature_store_suffix(name))
@@ -532,7 +516,7 @@ def connection(
                 Defaults to `None`.
             engine: Which engine to use, `"spark"`, `"python"` or `"training"`. Defaults to `None`,
                 which initializes the engine to Spark if the environment provides Spark, for
-                example on Hopsworks and Databricks, or falls back on Hive in Python if Spark is not
+                example on Hopsworks and Databricks, or falls back to Python if Spark is not
                 available, e.g. on local Python environments or AWS SageMaker. This option
                 allows you to override this behaviour. `"training"` engine is useful when only
                 feature store metadata is needed, for example training dataset location and label
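
Since `get_feature_store()` no longer accepts an `engine` argument, the engine is fixed when the connection is created. A sketch, assuming the `hopsworks.connection()` factory referenced in the docstrings and using placeholder host, project, and API-key values:

import hopsworks

# Choose the engine once, at connection time ("training" keeps access to
# feature store metadata only).
conn = hopsworks.connection(
    host="my.cluster.hopsworks.ai",   # placeholder
    project="my_project",             # placeholder
    api_key_file="/path/to/api_key",  # placeholder
    engine="training",
)

# After this commit, only the optional feature store name can be passed here.
fs = conn.get_feature_store()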

python/hopsworks_common/project.py

Lines changed: 2 additions & 5 deletions
@@ -109,7 +109,7 @@ def project_namespace(self):
         return self._project_namespace
 
     def get_feature_store(
-        self, name: Optional[str] = None, engine: Optional[str] = None
+        self, name: Optional[str] = None
     ):  # -> hsfs.feature_store.FeatureStore
         """Connect to Project's Feature Store.
 
@@ -127,15 +127,12 @@ def get_feature_store(
 
         # Arguments
             name: Project name of the feature store.
-            engine: Which engine to use, `"spark"`, `"python"` or `"training"`.
-                Defaults to `"python"` when connected to [Serverless Hopsworks](https://app.hopsworks.ai).
-                See [`hopsworks.connection`](connection.md#connection) documentation for more information.
         # Returns
             `hsfs.feature_store.FeatureStore`: The Feature Store API
         # Raises
             `RestAPIError`: If unable to connect
         """
-        return client.get_connection().get_feature_store(name, engine)
+        return client.get_connection().get_feature_store(name)
 
     def get_model_registry(self):
         """Connect to Project's Model Registry API.
