
Commit b610eb2

bubriks and SirOibaf authored
[FSTORE-1626] Spark client without hopsfs (#419) (#431)
Co-authored-by: Fabio Buso <dev.siroibaf@gmail.com>
1 parent af881fb commit b610eb2

File tree

5 files changed: +22 −14 lines

python/hopsworks/__init__.py

Lines changed: 7 additions & 8 deletions
@@ -84,7 +84,7 @@ def login(
     api_key_file: str = None,
     hostname_verification: bool = False,
     trust_store_path: str = None,
-    engine: Union[None, Literal["spark"], Literal["python"], Literal["training"]] = None,
+    engine: Union[None, Literal["spark"], Literal["python"], Literal["training"], Literal["spark-no-metastore"], Literal["spark-delta"]] = None,
 ) -> project.Project:
     """Connect to [Serverless Hopsworks](https://app.hopsworks.ai) by calling the `hopsworks.login()` function with no arguments.

@@ -124,13 +124,12 @@ def login(
         api_key_file: Path to file with Api Key
         hostname_verification: Whether to verify Hopsworks' certificate
         trust_store_path: Path on the file system containing the Hopsworks certificates
-        engine: Which engine to use, `"spark"`, `"python"` or `"training"`. Defaults to `None`,
-            which initializes the engine to Spark if the environment provides Spark, for
-            example on Hopsworks and Databricks, or falls back to Python if Spark is not
-            available, e.g. on local Python environments or AWS SageMaker. This option
-            allows you to override this behaviour. `"training"` engine is useful when only
-            feature store metadata is needed, for example training dataset location and label
-            information when Hopsworks training experiment is conducted.
+        engine: Specifies the engine to use. Possible options are "spark", "python", "training", "spark-no-metastore", or "spark-delta". The default value is None, which automatically selects the engine based on the environment:
+            "spark": Used if Spark is available, such as in Hopsworks or Databricks environments.
+            "python": Used in local Python environments or AWS SageMaker when Spark is not available.
+            "training": Used when only feature store metadata is needed, such as for obtaining training dataset locations and label information during Hopsworks training experiments.
+            "spark-no-metastore": Functions like "spark" but does not rely on the Hive metastore.
+            "spark-delta": Minimizes dependencies further by avoiding both Hive metastore and HopsFS.
     # Returns
         `Project`: The Project object to perform operations on
     # Raises
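
For context, a minimal usage sketch of the new option (the hostname and key below are placeholders, not part of this commit):

import hopsworks

# Placeholder values; only the engine argument is specific to this change.
project = hopsworks.login(
    host="my-cluster.hopsworks.ai",
    api_key_value="<api-key>",
    engine="spark-delta",  # Spark without Hive metastore or HopsFS
)
fs = project.get_feature_store()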

python/hopsworks_common/client/external.py

Lines changed: 9 additions & 0 deletions
@@ -148,6 +148,15 @@ def provide_project(self, project):
             for conf_key, conf_value in configuration_dict.items():
                 _spark_session._jsc.hadoopConfiguration().set(conf_key, conf_value)
 
+        elif self._engine == "spark-delta":
+            _logger.debug(
+                "Running in Spark environment with no metastore and hopsfs, initializing Spark session"
+            )
+            _spark_session = SparkSession.builder \
+                .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
+                .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
+                .getOrCreate()
+
         hopsworks_common.client.get_connection()._provide_project()
 
     def download_certs(self):
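
The two config calls above are the standard Delta Lake session settings. As a standalone sketch of what they enable, assuming the delta-spark package matching your Spark version is installed (the table path is hypothetical):

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# Path-based Delta I/O works without a Hive metastore or HopsFS:
df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])
df.write.format("delta").mode("overwrite").save("/tmp/delta_demo")  # hypothetical path
spark.read.format("delta").load("/tmp/delta_demo").show()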

python/hopsworks_common/connection.py

Lines changed: 4 additions & 4 deletions
@@ -99,8 +99,8 @@ class Connection:
             defaults to the project from where the client is run from.
             Defaults to `None`.
         engine: Specifies the engine to use. Possible options are "spark", "python", "training", "spark-no-metastore", or "spark-delta". The default value is None, which automatically selects the engine based on the environment:
-            "spark": Used if Spark is available and the connection is not to serverless Hopsworks, such as in Hopsworks or Databricks environments.
-            "python": Used in local Python environments or AWS SageMaker when Spark is not available or the connection is done to serverless Hopsworks.
+            "spark": Used if Spark is available, such as in Hopsworks or Databricks environments.
+            "python": Used in local Python environments or AWS SageMaker when Spark is not available.
             "training": Used when only feature store metadata is needed, such as for obtaining training dataset locations and label information during Hopsworks training experiments.
             "spark-no-metastore": Functions like "spark" but does not rely on the Hive metastore.
             "spark-delta": Minimizes dependencies further by avoiding both Hive metastore and HopsFS.

@@ -361,7 +361,7 @@ def connect(self) -> None:
         else:
             raise ConnectionError(
                 "Engine you are trying to initialize is unknown. "
-                "Supported engines are `'spark'`, `'python'` and `'training'`."
+                "Supported engines are `'spark'`, `'python'`, `'training'`, `'spark-no-metastore'`, and `'spark-delta'`."
             )
 
         # init client

@@ -518,7 +518,7 @@ def connection(
         project: The name of the project to connect to. When running on Hopsworks, this
             defaults to the project from where the client is run from.
             Defaults to `None`.
-        engine: Which engine to use, `"spark"`, `"python"` or `"training"`. Defaults to `None`,
+        engine: Which engine to use, `"spark"`, `"python"`, `"training"`, `"spark-no-metastore"` or `"spark-delta"`. Defaults to `None`,
             which initializes the engine to Spark if the environment provides Spark, for
             example on Hopsworks and Databricks, or falls back to Python if Spark is not
             available, e.g. on local Python environments or AWS SageMaker. This option
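
A usage sketch of the connection() factory documented above with one of the new engine values (all connection parameters are placeholders, and get_feature_store() is assumed to be available on the returned object):

from hopsworks_common.connection import Connection

conn = Connection.connection(
    host="my-cluster.hopsworks.ai",   # placeholder
    project="demo_project",           # placeholder
    api_key_value="<api-key>",        # placeholder
    engine="spark-no-metastore",
)
fs = conn.get_feature_store()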

python/hsfs/engine/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ def init(engine_type: str) -> None:
         raise ValueError(
             "Hive engine is not supported in hopsworks client version >= 4.0."
         )
-    elif engine_type == "spark-no-metastore":
+    elif engine_type == "spark-no-metastore" or engine_type == "spark-delta":
         _engine = spark_no_metastore.Engine()
     elif engine_type in python_types:
         try:
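
Both new Spark variants dispatch to the same engine class. A small illustration (init() is normally invoked internally during connect(); the direct call and the get_instance() accessor are shown on the assumption they behave as in current hsfs):

from hsfs import engine

engine.init("spark-delta")          # resolves to spark_no_metastore.Engine
print(type(engine.get_instance()))  # assumed accessor for the module-level _engine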

python/hsfs/engine/spark_no_metastore.py

Lines changed: 1 addition & 1 deletion
@@ -32,6 +32,6 @@ def __init__(self) -> None:
 
         super().__init__()
 
-    def _sql_offline(self, sql_query):
+    def _sql_offline(self, sql_query, feature_store):
         # Spark no metastore does not require the
         return self._spark_session.sql(sql_query)
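
A plausible reading of this signature change (an assumption; the diff itself does not say): the metastore-backed Spark engine needs the feature store name to select a Hive database, so the no-metastore variant adopts the same signature for interface compatibility while ignoring the argument. A hypothetical call site:

# Hypothetical names; illustrates the now-uniform signature across Spark engines.
result_df = engine.get_instance()._sql_offline(
    "SELECT `id`, `value` FROM `my_fg_1`",  # example query
    "my_project_featurestore",              # accepted but unused without a metastore
)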
