
Commit 9fb185f

Bump to Spark 3.2.2 (#170)

* Bumping to Spark 3.2.2
* Page update
* Fixing SPARK-35132
* Bumping netty to 4.1.68.Final
1 parent a0c9ad1 commit 9fb185f

13 files changed (+58 -24 lines)

build.sbt (+15 -4)

@@ -3,7 +3,7 @@ import sbtassembly.AssemblyPlugin.autoImport.ShadeRule
 import scala.util.Properties

 name := """sequila"""
-val DEFAULT_SPARK_3_VERSION = "3.1.2"
+val DEFAULT_SPARK_3_VERSION = "3.2.2"
 lazy val sparkVersion = Properties.envOrElse("SPARK_VERSION", DEFAULT_SPARK_3_VERSION)

 version := s"${sys.env.getOrElse("VERSION", "0.1.0")}"

@@ -18,6 +18,16 @@ val DEFAULT_HADOOP_VERSION = "3.1.2"

 lazy val hadoopVersion = Properties.envOrElse("SPARK_HADOOP_VERSION", DEFAULT_HADOOP_VERSION)

+val nettyVersion = "4.1.68.Final"
+dependencyOverrides += "io.netty" % "netty-all" % nettyVersion
+dependencyOverrides += "io.netty" % "netty-buffer" % nettyVersion
+dependencyOverrides += "io.netty" % "netty-codec" % nettyVersion
+dependencyOverrides += "io.netty" % "netty-common" % nettyVersion
+dependencyOverrides += "io.netty" % "netty-handler" % nettyVersion
+dependencyOverrides += "io.netty" % "netty-resolver" % nettyVersion
+dependencyOverrides += "io.netty" % "netty-transport" % nettyVersion
+dependencyOverrides += "io.netty" % "netty-transport-native-epoll" % nettyVersion
+dependencyOverrides += "io.netty" % "netty-transport-native-unix-common" % nettyVersion
 dependencyOverrides += "com.google.guava" % "guava" % "15.0"
 dependencyOverrides += "org.apache.orc" % "orc-core" % "1.6.9"
 dependencyOverrides += "org.apache.logging.log4j" % "log4j-core" % "2.3"

@@ -30,7 +40,7 @@ libraryDependencies += "org.apache.spark" %% "spark-core" % sparkVersion
 libraryDependencies += "org.apache.spark" %% "spark-sql" % sparkVersion
 libraryDependencies += "com.github.mrpowers" %% "spark-fast-tests" % "0.21.3"
 libraryDependencies += "com.github.mrpowers" %% "spark-daria" % "0.38.2"
-libraryDependencies += "com.holdenkarau" %% "spark-testing-base" % "3.1.2_1.1.0" % "test" excludeAll ExclusionRule(organization = "javax.servlet") excludeAll (ExclusionRule("org.apache.hadoop"))
+libraryDependencies += "com.holdenkarau" %% "spark-testing-base" % "3.2.0_1.2.0" % "test" excludeAll ExclusionRule(organization = "javax.servlet") excludeAll (ExclusionRule("org.apache.hadoop"))
 libraryDependencies += "org.bdgenomics.adam" %% "adam-core-spark3" % "0.36.0" excludeAll (ExclusionRule("org.seqdoop"))
 libraryDependencies += "org.bdgenomics.adam" %% "adam-apis-spark3" % "0.36.0" excludeAll (ExclusionRule("org.seqdoop"))
 libraryDependencies += "org.bdgenomics.adam" %% "adam-cli-spark3" % "0.36.0" excludeAll (ExclusionRule("org.seqdoop"))

@@ -156,7 +166,8 @@ publishTo := {
   if (!version.value.toLowerCase.contains("snapshot")) {
     sonatypePublishToBundle.value
   } else {
-    val nexus = "http://zsibio.ii.pw.edu.pl/nexus/repository/"
+    val nexus = "https://zsibio.ii.pw.edu.pl/nexus/repository/"
     Some("snapshots" at nexus + "maven-snapshots")
   }
-}
+}
+ThisBuild / useCoursier := true
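
A note on the netty block above: in sbt, dependencyOverrides only steers version conflict resolution for artifacts already on the dependency graph (here, netty modules pulled in transitively by Spark); it never adds a dependency by itself, which is why each module is pinned individually. A minimal, hedged sketch of the mechanism (a hypothetical build, not this repo's):

    // build.sbt -- illustrative only; spark-core is just an example host for
    // transitive netty modules. The override forces whichever io.netty modules
    // land on the graph to resolve to 4.1.68.Final, without adding a direct
    // dependency of its own.
    libraryDependencies += "org.apache.spark" %% "spark-core" % "3.2.2"
    dependencyOverrides += "io.netty" % "netty-handler" % "4.1.68.Final"

The trailing ThisBuild / useCoursier := true makes the Coursier-based resolver (sbt's default since 1.3) explicit; that is the resolver these overrides are evaluated under.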

page/content/en/_index.html (+6 -6)

@@ -82,14 +82,14 @@ <h2 class="text-center">Getting started</h2>
 <div class="text-left">
 {{< highlight bash>}}
 # ensure you have Apache Spark and GraalVM installed
-sdk install spark 3.1.2
+sdk install spark 3.2.2
 sdk install java 21.3.0.r11-grl
 # set Apache Spark and JDK using sdkman
-sdk use spark 3.1.2
+sdk use spark 3.2.2
 # use GraalVM for best performance
 sdk use java 21.3.0.r11-grl
 # in case you prefer to use a Python interface
-pip install pysequila==0.3.3
+pip install pysequila==0.4.0
 # download sample data
 mkdir -p data
 #BAM

@@ -130,7 +130,7 @@ <h2 class="text-center">Getting started</h2>
 {{< highlight bash>}}
 pyspark --master local[1] \
   --driver-memory 4g \
-  --packages org.biodatageeks:sequila_2.12:1.0.0
+  --packages org.biodatageeks:sequila_2.12:1.1.0
 {{< / highlight >}}
 {{< highlight python>}}
 from pysequila import SequilaSession

@@ -187,7 +187,7 @@ <h2 class="text-center">Getting started</h2>
 {{< highlight bash>}}
 pyspark --master local[1] \
   --driver-memory 4g \
-  --packages org.biodatageeks:sequila_2.12:1.0.0
+  --packages org.biodatageeks:sequila_2.12:1.1.0
 {{< / highlight >}}
 {{< highlight python>}}
 from pysequila import SequilaSession

@@ -256,7 +256,7 @@ <h2 class="text-center">Getting started</h2>
 {{< highlight bash>}}
 pyspark --master local[1] \
   --driver-memory 4g \
-  --packages org.biodatageeks:sequila_2.12:1.0.0
+  --packages org.biodatageeks:sequila_2.12:1.1.0
 {{< / highlight >}}
 {{< highlight python>}}
 targets_df = ss.read\
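
The landing-page edits above only move version pins (Spark 3.1.2 to 3.2.2, pysequila 0.3.3 to 0.4.0, sequila 1.0.0 to 1.1.0). For readers on the Scala side, a hedged equivalent of the pyspark snippets (the import path is assumed from this repo's source tree, and the query is a placeholder):

    // Launch: spark-shell --master local[1] --driver-memory 4g \
    //   --packages org.biodatageeks:sequila_2.12:1.1.0
    import org.apache.spark.sql.SequilaSession  // assumed package for SequilaSession
    val ss = SequilaSession(spark)              // wraps the shell's SparkSession
    ss.sql("SHOW TABLES").show()                // placeholder query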

project/plugins.sbt (+2)

@@ -9,3 +9,5 @@ addSbtPlugin("com.julianpeeters" % "sbt-avrohugger" % "2.0.0-RC19")
 addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.9.4")
 addSbtPlugin("com.jsuereth" % "sbt-pgp" % "2.0.1")
 addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0")
+
+addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.10.0-RC1")

src/main/scala/org/biodatageeks/sequila/datasources/BAM/SequilaDataSourceStrategy.scala (+3)

@@ -102,6 +102,7 @@ case class SequilaDataSourceStrategy(spark: SparkSession) extends Strategy
       l.output.toStructType,
       Set.empty,
       Set.empty,
+      None,
       toCatalystRDD(l, baseRelation.buildScan()),
       baseRelation,
       None) :: Nil

@@ -215,6 +216,7 @@ case class SequilaDataSourceStrategy(spark: SparkSession) extends Strategy
       projects.map(_.toAttribute).toStructType,
       Set.empty,
       Set.empty,
+      None,
       scanBuilder(requestedColumns, candidatePredicates, pushedFilters),
       relation.relation,
       relation.catalogTable.map(_.identifier))

@@ -229,6 +231,7 @@ case class SequilaDataSourceStrategy(spark: SparkSession) extends Strategy
       requestedColumns.toStructType,
       Set.empty,
       Set.empty,
+      None,
      scanBuilder(requestedColumns, candidatePredicates, pushedFilters),
      relation.relation,
      relation.catalogTable.map(_.identifier))
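
The three inserted None arguments line up with an Option-typed parameter that RowDataSourceScanExec appears to have gained in Spark 3.2 (the aggregate push-down slot from SPARK-34952). SeQuiLa does not push aggregates down, so None preserves the old behavior. A hedged sketch of the assumed 3.2 constructor shape (parameter names approximate, not this repo's code):

    RowDataSourceScanExec(
      output,          // Seq[Attribute]
      requiredSchema,  // StructType
      Set.empty,       // pushed filters
      Set.empty,       // handled filters
      None,            // new in 3.2: Option[Aggregation] for aggregate push-down
      rdd,             // RDD[InternalRow]
      baseRelation,
      tableIdentifier)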

src/main/scala/org/biodatageeks/sequila/pileup/PileupStrategy.scala (+1)

@@ -119,4 +119,5 @@ case class PileupPlan [T<:BDGAlignInputFormat](plan:LogicalPlan, spark:SparkSess
     conf
   }

+  override protected def withNewChildrenInternal(newChildren: IndexedSeq[SparkPlan]): SparkPlan = this
 }
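
This one-line override, repeated in the five operator files below, appears to stem from Spark 3.2's refactor of TreeNode children handling (SPARK-34906), which made withNewChildrenInternal a required member of every plan node. Returning `this` is only safe for a node with no children (cf. GenomicIntervalPlan further down, which declares children = Nil); nodes with children must return a copy wired to the new children, since plan trees are immutable. A self-contained sketch of both patterns with hypothetical operators (not from this repo):

    import org.apache.spark.rdd.RDD
    import org.apache.spark.sql.catalyst.InternalRow
    import org.apache.spark.sql.catalyst.expressions.Attribute
    import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan}

    // Childless node: ignoring newChildren is correct, nothing to rewire.
    case class MySourceExec(output: Seq[Attribute]) extends SparkPlan {
      override def children: Seq[SparkPlan] = Nil
      override protected def doExecute(): RDD[InternalRow] =
        sparkContext.emptyRDD[InternalRow] // placeholder body
      override protected def withNewChildrenInternal(
          newChildren: IndexedSeq[SparkPlan]): SparkPlan = this
    }

    // Binary node: BinaryExecNode supplies the two-argument variant, and the
    // new children go in via copy, as in the interval-join plans below.
    case class MyJoinExec(left: SparkPlan, right: SparkPlan) extends BinaryExecNode {
      override def output: Seq[Attribute] = left.output ++ right.output
      override protected def doExecute(): RDD[InternalRow] =
        left.execute().union(right.execute()) // placeholder body
      override protected def withNewChildrenInternal(
          newLeft: SparkPlan, newRight: SparkPlan): SparkPlan =
        copy(left = newLeft, right = newRight)
    }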

src/main/scala/org/biodatageeks/sequila/rangejoins/methods/IntervalTree/IntervalTreeJoinOptim.scala (+2)

@@ -100,4 +100,6 @@ case class IntervalTreeJoinOptim(left: SparkPlan,
   }

 }
+
+  override protected def withNewChildrenInternal(newLeft: SparkPlan, newRight: SparkPlan): SparkPlan = copy(left = newLeft, right = newRight)
 }

src/main/scala/org/biodatageeks/sequila/rangejoins/methods/IntervalTree/IntervalTreeJoinOptimChromosome.scala (+2)

@@ -115,4 +115,6 @@ case class IntervalTreeJoinOptimChromosome(left: SparkPlan,
     .stats
     .sizeInBytes != Long.MaxValue)
 }
+
+  override protected def withNewChildrenInternal(newLeft: SparkPlan, newRight: SparkPlan): SparkPlan = copy(left = newLeft, right = newRight)
 }

src/main/scala/org/biodatageeks/sequila/rangejoins/methods/genApp/IntervalTreeJoin.scala (+2)

@@ -62,4 +62,6 @@ case class IntervalTreeJoin(left: SparkPlan,
   }

 }
+
+  override protected def withNewChildrenInternal(newLeft: SparkPlan, newRight: SparkPlan): SparkPlan = copy(left = newLeft, right = newRight)
 }

src/main/scala/org/biodatageeks/sequila/rangejoins/methods/genApp/IntervalTreeJoinChromosome.scala (+2)

@@ -65,4 +65,6 @@ IntervalTreeJoinChromosome(left: SparkPlan,
   }

 }
+
+  override protected def withNewChildrenInternal(newLeft: SparkPlan, newRight: SparkPlan): SparkPlan = copy(left = newLeft, right = newRight)
 }

src/main/scala/org/biodatageeks/sequila/utvf/GenomicIntervalStrategy.scala (+2)

@@ -31,4 +31,6 @@ case class GenomicIntervalPlan(plan: LogicalPlan, spark: SparkSession,interval:G
   )
 }
 def children: Seq[SparkPlan] = Nil
+
+  override protected def withNewChildrenInternal(newChildren: IndexedSeq[SparkPlan]): SparkPlan = this
 }

src/main/scala/org/biodatageeks/sequila/utvf/ResolveTableValuedFunctionsSeq.scala (+5 -5)

@@ -47,7 +47,7 @@ object ResolveTableValuedFunctionsSeq extends Rule[LogicalPlan] {
 def implicitCast(values: Seq[Expression]): Option[Seq[Expression]] = {
   if (args.length == values.length) {
     val casted = values.zip(args).map { case (value, (_, expectedType)) =>
-      TypeCoercion.ImplicitTypeCasts.implicitCast(value, expectedType)
+      TypeCoercion.implicitCast(value, expectedType)
     }
     if (casted.forall(_.isDefined)) {
       return Some(casted.map(_.get))

@@ -130,7 +130,7 @@ object ResolveTableValuedFunctionsSeq extends Rule[LogicalPlan] {

 override def apply(plan: LogicalPlan): LogicalPlan = plan transformUp {
   case u: UnresolvedTableValuedFunction if u.functionArgs.forall(_.resolved) =>
-    val resolvedFunc = builtinFunctions.get(u.functionName.toLowerCase(Locale.ROOT)) match {
+    val resolvedFunc = builtinFunctions.get(u.name.funcName.toLowerCase(Locale.ROOT)) match {
       case Some(tvf) =>
         val resolved = tvf.flatMap { case (argList, resolver) =>
           argList.implicitCast(u.functionArgs) match {

@@ -143,12 +143,12 @@ object ResolveTableValuedFunctionsSeq extends Rule[LogicalPlan] {
 resolved.headOption.getOrElse {
   val argTypes = u.functionArgs.map(_.dataType.typeName).mkString(", ")
   u.failAnalysis(
-    s"""error: table-valued function ${u.functionName} with alternatives:
+    s"""error: table-valued function ${u.name.funcName} with alternatives:
       |${tvf.keys.map(_.toString).toSeq.sorted.map(x => s" ($x)").mkString("\n")}
      |cannot be applied to: (${argTypes})""".stripMargin)
 }
 case _ =>
-  u.failAnalysis(s"could not resolve `${u.functionName}` to a table-valued function")
+  u.failAnalysis(s"could not resolve `${u.name.funcName}` to a table-valued function")
 }

 // If alias names assigned, add `Project` with the aliases

@@ -157,7 +157,7 @@ object ResolveTableValuedFunctionsSeq extends Rule[LogicalPlan] {
 // Checks if the number of the aliases is equal to expected one
 if (u.output.size != outputAttrs.size) {
   u.failAnalysis(s"Number of given aliases does not match number of output columns. " +
-    s"Function name: ${u.functionName}; number of aliases: " +
+    s"Function name: ${u.name.funcName}; number of aliases: " +
     s"${u.output.size}; number of output columns: ${outputAttrs.size}.")
 }
 val aliases = outputAttrs.zip(u.output).map {
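
Both substitutions in this file track Spark 3.2 API moves rather than behavior changes: the implicit-cast helper appears to have been promoted from TypeCoercion.ImplicitTypeCasts to the TypeCoercion object itself (still returning Option[Expression], as the surrounding forall(_.isDefined) relies on), and UnresolvedTableValuedFunction now seems to carry a FunctionIdentifier instead of a bare string name, hence u.name.funcName throughout. A sketch of the assumed shapes, drawn from the diff above:

    // u: UnresolvedTableValuedFunction -- the identifier nests the bare name:
    val key: String = u.name.funcName.toLowerCase(java.util.Locale.ROOT)
    // implicitCast: (Expression, AbstractDataType) => Option[Expression]
    val coerced = TypeCoercion.implicitCast(value, expectedType)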

src/main/scala/org/biodatageeks/sequila/utvf/SeQuiLaAnalyzer.scala (+13 -7)

@@ -24,7 +24,7 @@ class SeQuiLaAnalyzer(session: SparkSession) extends
 new ResolveSQLOnFile(session) +:
 new FallBackFileSourceV2(session) +:
 new ResolveSessionCatalog(
-  catalogManager, catalog.isTempView, catalog.isTempFunction) +:
+  catalogManager) +:
 ResolveEncodersInScalaAgg +: session.extensions.buildResolutionRules(session)

@@ -72,9 +72,11 @@ class SeQuiLaAnalyzer(session: SparkSession) extends
 ResolveRelations ::
 ResolveTables ::
 ResolvePartitionSpec ::
+ResolveAlterTableCommands ::
 AddMetadataColumns ::
+DeduplicateRelations ::
 ResolveReferences ::
-ResolveCreateNamedStruct ::
+ResolveExpressionsWithNamePlaceholders ::
 ResolveDeserializer ::
 ResolveNewInstance ::
 ResolveUpCast ::

@@ -97,21 +99,23 @@ class SeQuiLaAnalyzer(session: SparkSession) extends
 GlobalAggregates ::
 ResolveAggregateFunctions ::
 TimeWindowing ::
+SessionWindowing ::
 ResolveInlineTables ::
-ResolveHigherOrderFunctions(v1SessionCatalog) ::
+ResolveHigherOrderFunctions(catalogManager) ::
 ResolveLambdaVariables ::
 ResolveTimeZone ::
 ResolveRandomSeed ::
 ResolveBinaryArithmetic ::
 ResolveUnion ::
-TypeCoercion.typeCoercionRules ++
+typeCoercionRules ++
+Seq(ResolveWithCTE) ++
 extendedResolutionRules : _*),
+Batch("Remove TempResolvedColumn", Once, RemoveTempResolvedColumn),
 Batch("Apply Char Padding", Once,
   ApplyCharTypePadding),
 Batch("Post-Hoc Resolution", Once,
-  Seq(ResolveNoopDropTable) ++
+  Seq(ResolveCommandsWithIfExists) ++
   postHocResolutionRules: _*),
-Batch("Normalize Alter Table", Once, ResolveAlterTableChanges),
 Batch("Remove Unresolved Hints", Once,
   new ResolveHints.RemoveAllHints),
 Batch("Nondeterministic", Once,

@@ -124,6 +128,8 @@ class SeQuiLaAnalyzer(session: SparkSession) extends
 Batch("Subquery", Once,
   UpdateOuterReferences),
 Batch("Cleanup", fixedPoint,
-  CleanupAliases)
+  CleanupAliases),
+Batch("HandleAnalysisOnlyCommand", Once,
+  HandleAnalysisOnlyCommand)
 )
 }
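
SeQuiLaAnalyzer hand-copies the batch list of Spark's built-in Analyzer, so every Spark bump means re-syncing it by hand: the edits above add rules that appear new in 3.2 (DeduplicateRelations, SessionWindowing, ResolveWithCTE, RemoveTempResolvedColumn, HandleAnalysisOnlyCommand), swap renamed or re-signed ones (ResolveHigherOrderFunctions now takes the CatalogManager; ResolveNoopDropTable becomes ResolveCommandsWithIfExists), and drop batches that no longer exist. For custom rules that do not need to replace the built-in batches wholesale, the stable alternative is the extensions API; a minimal sketch with a hypothetical rule:

    import org.apache.spark.sql.{SparkSession, SparkSessionExtensions}
    import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
    import org.apache.spark.sql.catalyst.rules.Rule

    // Hypothetical no-op rule, for illustration only.
    case class MyRule(session: SparkSession) extends Rule[LogicalPlan] {
      override def apply(plan: LogicalPlan): LogicalPlan = plan
    }

    // Registered e.g. via spark.sql.extensions=com.example.MyExtensions (name hypothetical).
    class MyExtensions extends (SparkSessionExtensions => Unit) {
      override def apply(ext: SparkSessionExtensions): Unit =
        ext.injectResolutionRule(MyRule)
    }

SeQuiLa presumably cannot take that route here, since it interposes on the resolution order itself.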

src/main/scala/org/biodatageeks/sequila/utvf/SequilaSession.scala (+3 -2)

@@ -5,7 +5,7 @@ import htsjdk.samtools.ValidationStringency
 import org.apache.log4j.Logger
 import org.apache.spark.sql.catalyst.analysis.{Analyzer, SeQuiLaAnalyzer}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
-import org.apache.spark.sql.execution.QueryExecution
+import org.apache.spark.sql.execution.{CommandExecutionMode, QueryExecution}
 import org.apache.spark.sql.execution.datasources.SequilaDataSourceStrategy
 import org.apache.spark.sql.functions.{lit, typedLit}
 import org.apache.spark.sql.internal.SessionState

@@ -120,6 +120,7 @@ case class SequilaSessionState(sparkSession: SparkSession, customAnalyzer: Analy
 sparkSession.sessionState.conf,
 sparkSession.sessionState.experimentalMethods,
 sparkSession.sessionState.functionRegistry,
+sparkSession.sessionState.tableFunctionRegistry,
 sparkSession.sessionState.udfRegistration,
 () => sparkSession.sessionState.catalog,
 sparkSession.sessionState.sqlParser,

@@ -129,7 +130,7 @@ case class SequilaSessionState(sparkSession: SparkSession, customAnalyzer: Analy
 () => sparkSession.sessionState.streamingQueryManager,
 sparkSession.sessionState.listenerManager,
 () => sparkSession.sessionState.resourceLoader,
-executePlan,
+sparkSession.sessionState.executePlan,
 (sparkSession: SparkSession, sessionState: SessionState) => sessionState.clone(sparkSession),
 sparkSession.sessionState.columnarRules,
 sparkSession.sessionState.queryStagePrepRules
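
The two functional edits here track Spark 3.2's SessionState: its constructor appears to have gained a tableFunctionRegistry slot (table-valued functions got a registry of their own in 3.2), and query execution is now built through a factory parameterized by CommandExecutionMode (eager vs. deferred command execution), which is why the plain executePlan value no longer fit the expected type and the session state's own method is forwarded instead. A hedged sketch of the assumed 3.2 shape:

    // Assumed (approximate) Spark 3.2 signature on SessionState:
    //   def executePlan(
    //       plan: LogicalPlan,
    //       mode: CommandExecutionMode.Value = CommandExecutionMode.ALL): QueryExecution
    // Eta-expansion of this two-argument method is what satisfies the factory slot
    //   createQueryExecution: (LogicalPlan, CommandExecutionMode.Value) => QueryExecution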
