
[SPARK-52168] Support `to` for `DataFrame` #156

Closed
wants to merge 1 commit
14 changes: 14 additions & 0 deletions Sources/SparkConnect/DataFrame.swift
@@ -94,6 +94,7 @@ import Synchronization
/// - ``show(_:_:_:)``
///
/// ### Transformation Operations
/// - ``to(_:)``
/// - ``toDF(_:)``
/// - ``toJSON()``
/// - ``select(_:)``
@@ -478,6 +479,19 @@ public actor DataFrame: Sendable {
    return df
  }

  /// Returns a new DataFrame where each row is reconciled to match the specified schema.
  /// - Parameter schema: The target schema as a DDL-formatted string, e.g. `"id INT, name STRING"`.
  /// - Returns: A ``DataFrame`` with the given schema.
  public func to(_ schema: String) async throws -> DataFrame {
    // Validate the DDL string by parsing it; surface failures as InvalidTypeException.
    do {
      let dataType = try await sparkSession.client.ddlParse(schema)
      return DataFrame(spark: self.spark, plan: SparkConnectClient.getToSchema(self.plan.root, dataType))
    } catch {
      throw SparkConnectError.InvalidTypeException
    }
  }

  /// Returns the content of the Dataset as a Dataset of JSON strings.
  /// - Returns: A ``DataFrame`` with a single string column whose content is JSON.
  public func toJSON() -> DataFrame {
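For reference (not part of this diff), a minimal usage sketch of the new `to(_:)` API. It assumes a reachable Spark Connect server behind the default `SparkSession.builder` configuration and an enclosing async context; the column names and the malformed-schema string are illustrative only.

```swift
import SparkConnect

// Assumption: a Spark Connect server is running and reachable, as in the test suite.
let spark = try await SparkSession.builder.getOrCreate()

// Reconcile the LONG `id` column of range(1) to a SHORT column named `shortID`,
// using a DDL-formatted schema string.
let reconciled = try await spark.range(1).to("shortID SHORT")
print(try await reconciled.schema)

// A string that cannot be parsed as a DDL schema is surfaced as
// SparkConnectError.InvalidTypeException (see the catch clause above).
do {
  _ = try await spark.range(1).to("this is not a DDL schema")
} catch {
  print("rejected schema: \(error)")
}

await spark.stop()
```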
11 changes: 11 additions & 0 deletions Sources/SparkConnect/SparkConnectClient.swift
@@ -505,6 +505,17 @@ public actor SparkConnectClient {
    return plan
  }

  static func getToSchema(_ child: Relation, _ schema: Spark_Connect_DataType) -> Plan {
    var toSchema = Spark_Connect_ToSchema()
    toSchema.input = child
    toSchema.schema = schema
    var relation = Relation()
    relation.toSchema = toSchema
    var plan = Plan()
    plan.opType = .root(relation)
    return plan
  }

  static func getProjectExprs(_ child: Relation, _ exprs: [String]) -> Plan {
    var project = Project()
    project.input = child
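The new builder follows the same shape as the neighboring plan factories such as `getProjectExprs`: wrap the child `Relation` in a relation-specific message, then wrap that in a fresh `Relation` and a root `Plan`. For orientation, a short sketch of how the two hunks in this PR fit together inside `DataFrame.to(_:)` (it simply mirrors the DataFrame.swift change above):

```swift
// Inside DataFrame.to(_:): parse the DDL string into a Spark_Connect_DataType
// on the server, then build a ToSchema plan rooted at the current relation.
let dataType = try await sparkSession.client.ddlParse(schema)        // schema, e.g. "id INT"
let plan = SparkConnectClient.getToSchema(self.plan.root, dataType)  // Relation.toSchema wrapped in a root Plan
return DataFrame(spark: self.spark, plan: plan)
```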
20 changes: 20 additions & 0 deletions Tests/SparkConnectTests/DataFrameTests.swift
@@ -201,6 +201,26 @@ struct DataFrameTests {
    await spark.stop()
  }

  @Test
  func to() async throws {
    let spark = try await SparkSession.builder.getOrCreate()

    let schema1 = try await spark.range(1).to("shortID SHORT").schema
    #expect(
      schema1
        == #"{"struct":{"fields":[{"name":"shortID","dataType":{"short":{}},"nullable":true}]}}"#
    )

    let schema2 = try await spark.sql("SELECT '1'").to("id INT").schema
    print(schema2)
    #expect(
      schema2
        == #"{"struct":{"fields":[{"name":"id","dataType":{"integer":{}},"nullable":true}]}}"#
    )

    await spark.stop()
  }

  @Test
  func selectMultipleColumns() async throws {
    let spark = try await SparkSession.builder.getOrCreate()
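The test above covers the happy path only. A hedged sketch of a complementary negative-path test (not part of this commit; it assumes the same Swift Testing setup and that the malformed string fails `ddlParse` on the server):

```swift
// Sketch only: an unparsable DDL string should surface as a SparkConnectError,
// per the catch clause in DataFrame.to(_:).
@Test
func toWithInvalidSchema() async throws {
  let spark = try await SparkSession.builder.getOrCreate()
  await #expect(throws: SparkConnectError.self) {
    try await spark.range(1).to("definitely not a DDL schema").schema
  }
  await spark.stop()
}
```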