Skip to content

Commit e3f3c1b

Browse files
Addison Higham, falaki
Addison Higham
authored and committed
Allow for maxCharsPerCol to be set
Currently, the maxCharactersPerColumn value is hardcoded to 100,000. I (sadly) have CSV fields with more than 100k characters, so I need to be able to configure this. This allows the value to be passed the same way as other parameters while keeping a backward-compatible default of 100k. Author: Addison Higham <ahigham@instructure.com>. Closes #307 from addisonj/master.
1 parent 1aebd64 commit e3f3c1b

File tree

6 files changed

+72
-11
lines changed

6 files changed

+72
-11
lines changed

src/main/scala/com/databricks/spark/csv/CsvParser.scala

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ class CsvParser extends Serializable {
4242
private var codec: String = null
4343
private var nullValue: String = ""
4444
private var dateFormat: String = null
45+
private var maxCharsPerCol: Int = 100000
4546

4647
def withUseHeader(flag: Boolean): CsvParser = {
4748
this.useHeader = flag
@@ -123,6 +124,11 @@ class CsvParser extends Serializable {
123124
this
124125
}
125126

127+
/**
 * Sets the maximum number of characters allowed in a single column value.
 *
 * The value is stored on this parser and passed along as the last argument
 * when the parser builds its relation. If this method is never called, the
 * field keeps its initial value of 100000.
 *
 * @param maxCharsPerCol per-column character limit to store on this parser
 * @return this parser, so that calls can be chained
 */
def withMaxCharsPerCol(maxCharsPerCol: Int): CsvParser = {
128+
this.maxCharsPerCol = maxCharsPerCol
129+
this
130+
}
131+
126132
/** Returns a Schema RDD for the given CSV path. */
127133
@throws[RuntimeException]
128134
def csvFile(sqlContext: SQLContext, path: String): DataFrame = {
@@ -143,7 +149,8 @@ class CsvParser extends Serializable {
143149
inferSchema,
144150
codec,
145151
nullValue,
146-
dateFormat)(sqlContext)
152+
dateFormat,
153+
maxCharsPerCol)(sqlContext)
147154
sqlContext.baseRelationToDataFrame(relation)
148155
}
149156

@@ -165,7 +172,8 @@ class CsvParser extends Serializable {
165172
inferSchema,
166173
codec,
167174
nullValue,
168-
dateFormat)(sqlContext)
175+
dateFormat,
176+
maxCharsPerCol)(sqlContext)
169177
sqlContext.baseRelationToDataFrame(relation)
170178
}
171179
}

src/main/scala/com/databricks/spark/csv/CsvRelation.scala

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,8 @@ case class CsvRelation protected[spark] (
4949
inferCsvSchema: Boolean,
5050
codec: String = null,
5151
nullValue: String = "",
52-
dateFormat: String = null)(@transient val sqlContext: SQLContext)
52+
dateFormat: String = null,
53+
maxCharsPerCol: Int = 100000)(@transient val sqlContext: SQLContext)
5354
extends BaseRelation with TableScan with PrunedScan with InsertableRelation {
5455

5556
// Share date format object as it is expensive to parse date pattern.
@@ -287,7 +288,8 @@ case class CsvRelation protected[spark] (
287288

288289
new BulkCsvReader(iter, split,
289290
headers = header, fieldSep = delimiter,
290-
quote = quoteChar, escape = escapeVal, commentMarker = commentChar)
291+
quote = quoteChar, escape = escapeVal,
292+
commentMarker = commentChar, maxCharsPerCol = maxCharsPerCol)
291293
}
292294
}, true)
293295

src/main/scala/com/databricks/spark/csv/DefaultSource.scala

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,14 @@ class DefaultSource
142142

143143
val codec = parameters.getOrElse("codec", null)
144144

145+
val maxCharsPerColStr = parameters.getOrElse("maxCharsPerCol", "100000")
146+
val maxCharsPerCol = try {
147+
maxCharsPerColStr.toInt
148+
} catch {
149+
case e: Exception => throw new Exception("maxCharsPerCol must be a valid integer")
150+
}
151+
152+
145153
CsvRelation(
146154
() => TextFile.withCharset(sqlContext.sparkContext, path, charset),
147155
Some(path),
@@ -159,7 +167,8 @@ class DefaultSource
159167
inferSchemaFlag,
160168
codec,
161169
nullValue,
162-
dateFormat)(sqlContext)
170+
dateFormat,
171+
maxCharsPerCol)(sqlContext)
163172
}
164173

165174
override def createRelation(

src/main/scala/com/databricks/spark/csv/readers/readers.scala

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,8 @@ private[readers] abstract class CsvReader(
4444
ignoreTrailingSpace: Boolean = true,
4545
headers: Seq[String],
4646
inputBufSize: Int = 128,
47-
maxCols: Int = 20480) {
47+
maxCols: Int = 20480,
48+
maxCharsPerCol: Int = 100000) {
4849
protected lazy val parser: CsvParser = {
4950
val settings = new CsvParserSettings()
5051
val format = settings.getFormat
@@ -59,7 +60,7 @@ private[readers] abstract class CsvReader(
5960
settings.setInputBufferSize(inputBufSize)
6061
settings.setMaxColumns(maxCols)
6162
settings.setNullValue("")
62-
settings.setMaxCharsPerColumn(100000)
63+
settings.setMaxCharsPerColumn(maxCharsPerCol)
6364
if (headers != null) settings.setHeaders(headers: _*)
6465

6566
new CsvParser(settings)
@@ -86,7 +87,8 @@ private[csv] class LineCsvReader(
8687
ignoreLeadingSpace: Boolean = true,
8788
ignoreTrailingSpace: Boolean = true,
8889
inputBufSize: Int = 128,
89-
maxCols: Int = 20480)
90+
maxCols: Int = 20480,
91+
maxCharsPerCol: Int = 100000)
9092
extends CsvReader(
9193
fieldSep,
9294
lineSep,
@@ -97,7 +99,8 @@ private[csv] class LineCsvReader(
9799
ignoreTrailingSpace,
98100
null,
99101
inputBufSize,
100-
maxCols) {
102+
maxCols,
103+
maxCharsPerCol) {
101104
/**
102105
* parse a line
103106
* @param line a String with no newline at the end
@@ -136,7 +139,8 @@ private[csv] class BulkCsvReader(
136139
ignoreTrailingSpace: Boolean = true,
137140
headers: Seq[String],
138141
inputBufSize: Int = 128,
139-
maxCols: Int = 20480)
142+
maxCols: Int = 20480,
143+
maxCharsPerCol: Int = 100000)
140144
extends CsvReader(
141145
fieldSep,
142146
lineSep,
@@ -147,7 +151,8 @@ private[csv] class BulkCsvReader(
147151
ignoreTrailingSpace,
148152
headers,
149153
inputBufSize,
150-
maxCols)
154+
maxCols,
155+
maxCharsPerCol)
151156
with Iterator[Array[String]] {
152157

153158
private val reader = new StringIteratorReader(iter)

src/test/resources/long-cols.csv

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
id,text
2+
1,bacon is yummy
3+
2,Bacon ipsum dolor amet dolore ipsum boudin jowl. Frankfurter flank turkey incididunt minim elit tongue id t-bone labore magna bresaola alcatra hamburger. Ribeye alcatra in shank, turkey ground round sunt short ribs reprehenderit labore tongue. Qui consectetur salami labore. Est sed venison sausage, kielbasa cupidatat porchetta ad beef. Ex capicola short loin drumstick shank, pork reprehenderit shankle beef ribs proident short ribs kevin sausage ribeye chicken. Sirloin kielbasa capicola sausage tenderloin, aliquip brisket short ribs ham duis filet mignon. Boudin turkey jerky in ullamco rump turducken bresaola kevin qui andouille tri-tip pastrami. Tenderloin ut frankfurter hamburger in pork belly cow proident short loin veniam shank filet mignon beef pastrami. Pariatur porchetta in swine officia spare ribs sunt. Landjaeger fatback nisi tenderloin kielbasa. Swine jowl filet mignon fatback. Turducken picanha reprehenderit, ham hock pancetta consectetur sirloin sed excepteur jowl qui commodo corned beef lorem. Non rump nostrud sunt alcatra aute. Nulla irure aliqua commodo laboris. Fugiat do ad occaecat in, ut porchetta commodo. Ham hock cillum in cow salami minim in. Beef pancetta reprehenderit, leberkas excepteur swine qui cow magna chicken exercitation esse short ribs.Nulla jowl esse aute, quis fugiat ipsum tail commodo ex anim kevin. Ribeye quis porchetta meatball, shankle pork loin jowl. Landjaeger andouille dolore jowl ut drumstick pariatur. Et pork chop id laborum landjaeger pancetta occaecat swine. Porchetta picanha flank consectetur shankle reprehenderit pork chop short ribs. Brisket dolore ribeye ea drumstick landjaeger. Dolore consectetur flank, tenderloin occaecat do reprehenderit.Pig elit in dolore, ullamco nostrud biltong kevin culpa venison labore swine tri-tip. Brisket laborum short ribs id capicola venison. Velit picanha nostrud strip steak exercitation, tongue pig duis id ad labore tempor. 
Veniam frankfurter sed hamburger short ribs kevin cupim sint lorem Bacon ipsum dolor amet dolore ipsum boudin jowl. Frankfurter flank turkey incididunt minim elit tongue id t-bone labore magna bresaola alcatra hamburger. Ribeye alcatra in shank, turkey ground round sunt short ribs reprehenderit labore tongue. Qui consectetur salami labore. Est sed venison sausage, kielbasa cupidatat porchetta ad beef. Ex capicola short loin drumstick shank, pork reprehenderit shankle beef ribs proident short ribs kevin sausage ribeye chicken. Sirloin kielbasa capicola sausage tenderloin, aliquip brisket short ribs ham duis filet mignon. Boudin turkey jerky in ullamco rump turducken bresaola kevin qui andouille tri-tip pastrami. Tenderloin ut frankfurter hamburger in pork belly cow proident short loin veniam shank filet mignon beef pastrami. Pariatur porchetta in swine officia spare ribs sunt. Landjaeger fatback nisi tenderloin kielbasa. Swine jowl filet mignon fatback. Turducken picanha reprehenderit, ham hock pancetta consectetur sirloin sed excepteur jowl qui commodo corned beef lorem. Non rump nostrud sunt alcatra aute. Nulla irure aliqua commodo laboris. Fugiat do ad occaecat in, ut porchetta commodo. Ham hock cillum in cow salami minim in. Beef pancetta reprehenderit, leberkas excepteur swine qui cow magna chicken exercitation esse short ribs.Nulla jowl esse aute, quis fugiat ipsum tail commodo ex anim kevin. Ribeye quis porchetta meatball, shankle pork loin jowl. Landjaeger andouille dolore jowl ut drumstick pariatur. Et pork chop id laborum landjaeger pancetta occaecat swine. Porchetta picanha flank consectetur shankle reprehenderit pork chop short ribs. Brisket dolore ribeye ea drumstick landjaeger. Dolore consectetur flank, tenderloin occaecat do reprehenderit.Pig elit in dolore, ullamco nostrud biltong kevin culpa venison labore swine tri-tip. Brisket laborum short ribs id capicola venison. 
Velit picanha nostrud strip steak exercitation, tongue pig duis id ad labore tempor. Veniam frankfurter sed hamburger short ribs kevin cupim sint lorem Bacon ipsum dolor amet dolore ipsum boudin jowl. Frankfurter flank turkey incididunt minim elit tongue id t-bone labore magna bresaola alcatra hamburger. Ribeye alcatra in shank, turkey ground round sunt short ribs reprehenderit labore tongue. Qui consectetur salami labore. Est sed venison sausage, kielbasa cupidatat porchetta ad beef. Ex capicola short loin drumstick shank, pork reprehenderit shankle beef ribs proident short ribs kevin sausage ribeye chicken. Sirloin kielbasa capicola sausage tenderloin, aliquip brisket short ribs ham duis filet mignon. Boudin turkey jerky in ullamco rump turducken bresaola kevin qui andouille tri-tip pastrami. Tenderloin ut frankfurter hamburger in pork belly cow proident short loin veniam shank filet mignon beef pastrami. Pariatur porchetta in swine officia spare ribs sunt. Landjaeger fatback nisi tenderloin kielbasa. Swine jowl filet mignon fatback. Turducken picanha reprehenderit, ham hock pancetta consectetur sirloin sed excepteur jowl qui commodo corned beef lorem. Non rump nostrud sunt alcatra aute. Nulla irure aliqua commodo laboris. Fugiat do ad occaecat in, ut porchetta commodo. Ham hock cillum in cow salami minim in. Beef pancetta reprehenderit, leberkas excepteur swine qui cow magna chicken exercitation esse short ribs.Nulla jowl esse aute, quis fugiat ipsum tail commodo ex anim kevin. Ribeye quis porchetta meatball, shankle pork loin jowl. Landjaeger andouille dolore jowl ut drumstick pariatur. Et pork chop id laborum landjaeger pancetta occaecat swine. Porchetta picanha flank consectetur shankle reprehenderit pork chop short ribs. Brisket dolore ribeye ea drumstick landjaeger. Dolore consectetur flank, tenderloin occaecat do reprehenderit.Pig elit in dolore, ullamco nostrud biltong kevin culpa venison labore swine tri-tip. 
Brisket laborum short ribs id capicola venison. Velit picanha nostrud strip steak exercitation, tongue pig duis id ad labore tempor. Veniam frankfurter sed hamburger short ribs kevin cupim sint lorem Bacon ipsum dolor amet dolore ipsum boudin jowl. Frankfurter flank turkey incididunt minim elit tongue id t-bone labore magna bresaola alcatra hamburger. Ribeye alcatra in shank, turkey ground round sunt short ribs reprehenderit labore tongue. Qui consectetur salami labore. Est sed venison sausage, kielbasa cupidatat porchetta ad beef. Ex capicola short loin drumstick shank, pork reprehenderit shankle beef ribs proident short ribs kevin sausage ribeye chicken. Sirloin kielbasa capicola sausage tenderloin, aliquip brisket short ribs ham duis filet mignon. Boudin turkey jerky in ullamco rump turducken bresaola kevin qui andouille tri-tip pastrami. Tenderloin ut frankfurter hamburger in pork belly cow proident short loin veniam shank filet mignon beef pastrami. Pariatur porchetta in swine officia spare ribs sunt. Landjaeger fatback nisi tenderloin kielbasa. Swine jowl filet mignon fatback. Turducken picanha reprehenderit, ham hock pancetta consectetur sirloin sed excepteur jowl qui commodo corned beef lorem. Non rump nostrud sunt alcatra aute. Nulla irure aliqua commodo laboris. Fugiat do ad occaecat in, ut porchetta commodo. Ham hock cillum in cow salami minim in. Beef pancetta reprehenderit, leberkas excepteur swine qui cow magna chicken exercitation esse short ribs.Nulla jowl esse aute, quis fugiat ipsum tail commodo ex anim kevin. Ribeye quis porchetta meatball, shankle pork loin jowl. Landjaeger andouille dolore jowl ut drumstick pariatur. Et pork chop id laborum landjaeger pancetta occaecat swine. Porchetta picanha flank consectetur shankle reprehenderit pork chop short ribs. Brisket dolore ribeye ea drumstick landjaeger. 
Dolore consectetur flank, tenderloin occaecat do reprehenderit.Pig elit in dolore, ullamco nostrud biltong kevin culpa venison labore swine tri-tip. Brisket laborum short ribs id capicola venison. Velit picanha nostrud strip steak exercitation, tongue pig duis id ad labore tempor. Veniam frankfurter sed hamburger short ribs kevin cupim sint lorem Bacon ipsum dolor amet dolore ipsum boudin jowl. Frankfurter flank turkey incididunt minim elit tongue id t-bone labore magna bresaola alcatra hamburger. Ribeye alcatra in shank, turkey ground round sunt short ribs reprehenderit labore tongue. Qui consectetur salami labore. Est sed venison sausage, kielbasa cupidatat porchetta ad beef. Ex capicola short loin drumstick shank, pork reprehenderit shankle beef ribs proident short ribs kevin sausage ribeye chicken. Sirloin kielbasa capicola sausage tenderloin, aliquip brisket short ribs ham duis filet mignon. Boudin turkey jerky in ullamco rump turducken bresaola kevin qui andouille tri-tip pastrami. Tenderloin ut frankfurter hamburger in pork belly cow proident short loin veniam shank filet mignon beef pastrami. Pariatur porchetta in swine officia spare ribs sunt. Landjaeger fatback nisi tenderloin kielbasa. Swine jowl filet mignon fatback. Turducken picanha reprehenderit, ham hock pancetta consectetur sirloin sed excepteur jowl qui commodo corned beef lorem. Non rump nostrud sunt alcatra aute. Nulla irure aliqua commodo laboris. Fugiat do ad occaecat in, ut porchetta commodo. Ham hock cillum in cow salami minim in. Beef pancetta reprehenderit, leberkas excepteur swine qui cow magna chicken exercitation esse short ribs.Nulla jowl esse aute, quis fugiat ipsum tail commodo ex anim kevin. Ribeye quis porchetta meatball, shankle pork loin jowl. Landjaeger andouille dolore jowl ut drumstick pariatur. Et pork chop id laborum landjaeger pancetta occaecat swine. Porchetta picanha flank consectetur shankle reprehenderit pork chop short ribs. 
Brisket dolore ribeye ea drumstick landjaeger. Dolore consectetur flank, tenderloin occaecat do reprehenderit.Pig elit in dolore, ullamco nostrud biltong kevin culpa venison labore swine tri-tip. Brisket laborum short ribs id capicola venison. Velit picanha nostrud strip steak exercitation, tongue pig duis id ad labore tempor. Veniam frankfurter sed hamburger short ribs kevin cupim sint lorem
4+
3,pork is the best

src/test/scala/com/databricks/spark/csv/CsvSuite.scala

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ abstract class AbstractCsvSuite extends FunSuite with BeforeAndAfterAll {
4747
val disableCommentsFile = "src/test/resources/disable_comments.csv"
4848
val boolFile = "src/test/resources/bool.csv"
4949
val datesFile = "src/test/resources/dates.csv"
50+
val longColsFile = "src/test/resources/long-cols.csv"
5051
private val simpleDatasetFile = "src/test/resources/simple.csv"
5152

5253
val numCars = 3
@@ -857,6 +858,38 @@ abstract class AbstractCsvSuite extends FunSuite with BeforeAndAfterAll {
857858
assert(results.toSeq.map(_.toSeq) === expected)
858859
}
859860

861+
// Reads long-cols.csv in FAILFAST mode with the per-column limit lowered to 5000
// characters and expects the read to fail with a malformed-line error on row 2,
// the row carrying the long text field.
test("DSL allows for setting maxCharsPerCol and expects error") {
862+
val parser = new CsvParser()
863+
.withDelimiter(',')
864+
.withUseHeader(true)
865+
.withParseMode(ParseModes.FAIL_FAST_MODE)
866+
.withMaxCharsPerCol(5000)
867+
868+
val exception = intercept[SparkException]{
869+
parser.csvFile(sqlContext, longColsFile)
870+
.select("text")
871+
.collect()
872+
}
873+
874+
assert(exception.getMessage.contains("Malformed line in FAILFAST mode: 2,Bacon ipsum dolor amet dolore"))
875+
}
876+
877+
// Reads long-cols.csv with the per-column limit set to 15000 characters and
// checks that all three rows come back, including the long text field on row 2.
test("DSL allows for setting maxCharsPerCol and succeeds") {
878+
val parser = new CsvParser()
879+
.withDelimiter(',')
880+
.withUseHeader(true)
881+
.withMaxCharsPerCol(15000)
882+
883+
val res = parser.csvFile(sqlContext, longColsFile)
884+
.collect()
885+
886+
assert(res.size === 3)
887+
assert(res(0).toSeq === Seq("1", "bacon is yummy"))
888+
assert(res(1).getAs[String](0) === "2")
889+
assert(res(1).getAs[String](1).startsWith("Bacon ipsum dolor amet dolore"))
890+
assert(res(2).toSeq === Seq("3", "pork is the best"))
891+
}
892+
860893
test("DSL load csv from rdd") {
861894
val csvRdd = sqlContext.sparkContext.parallelize(Seq("age,height", "20,1.8", "16,1.7"))
862895
val df = new CsvParser()

0 commit comments

Comments
 (0)