Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,22 @@ class UnivocityParser(

private def parseLine(line: String): Array[String] = {
try {
tokenizer.parseLine(line)
val tokens = tokenizer.parseLine(line)
// SPARK-46959: When escape is set to "" (mapped to '\u0000'), univocity's parser
// correctly parses mid-line empty quoted fields ("") as empty strings, but misparses
// the last column's "" as a literal '"' character. This happens because there is no
// delimiter after the last field to signal field boundary, and univocity interprets
// the second '"' as content rather than a closing quote. We fix this by replacing
// a single quote-char token in the last position with the configured emptyValue.
if (tokens != null && tokens.length > 0 && options.escape == '\u0000') {
val lastIdx = tokens.length - 1
val lastToken = tokens(lastIdx)
if (lastToken != null && lastToken.length == 1 &&
lastToken.charAt(0) == options.quote) {
tokens(lastIdx) = options.emptyValueInRead
}
}
tokens
}
catch {
case e: TextParsingException if e.getCause.isInstanceOf[ArrayIndexOutOfBoundsException] =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,14 @@ object DataSourceUtils extends PredicateHelper {
}

/**
 * Decides whether `e` should be swallowed by the ignore-corrupt-files code path
 * (presumably gated by `spark.sql.files.ignoreCorruptFiles` — confirm at call sites).
 *
 * RuntimeException, IOException and InternalError are normally treated as signals of a
 * corrupt file. The one exception is a vectorized-reader memory-reservation failure:
 * that indicates resource exhaustion, not data corruption, so it must propagate to the
 * caller instead of being silently ignored.
 *
 * @param e the throwable raised while reading a file
 * @return true if the file should be treated as corrupt and skipped, false otherwise
 */
def shouldIgnoreCorruptFileException(e: Throwable): Boolean = e match {
  case _: RuntimeException | _: IOException | _: InternalError =>
    // A null message cannot carry the memory-reservation marker, so `forall` on
    // None correctly yields true (treat as corrupt file).
    Option(e.getMessage).forall(!_.contains(
      "Cannot reserve additional contiguous bytes in the vectorized reader"))
  case _ => false
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,42 @@ abstract class CSVSuite
verifyCars(cars, withHeader = true, checkTypes = true)
}

test("SPARK-46959: CSV reader reads data inconsistently depending on column position") {
  // With escape="" configured, an empty quoted field ("") must parse identically no
  // matter where it appears on the line; nullValue="" then maps it to null. Prior to
  // the fix, a trailing "" in the last column came back as a literal '"' character.
  import testImplicits._
  val header = "\"a\";\"b\";\"c\";\"d\""
  val lines = Seq(
    header,
    "10;100,00;\"Some;String\";\"ok\"",
    "20;200,00;\"\";\"still ok\"",
    "30;300,00;\"also ok\";\"\"",
    "40;400,00;\"\";\"\""
  )
  val df = spark.read
    .option("header", "true")
    .option("sep", ";")
    .option("nullValue", "")
    .option("quote", "\"")
    .option("escape", "")
    .csv(spark.createDataset(lines))

  val results = df.collect()
  assert(results.length == 4)
  // Expected (c, d) pairs per row; null marks an empty quoted field.
  // Row 2 (value + trailing empty) is the previously buggy case.
  val expected = Seq(
    ("Some;String", "ok"),       // both populated
    (null, "still ok"),          // empty mid-line
    ("also ok", null),           // empty in last column — the bug
    (null, null)                 // both empty
  )
  expected.zipWithIndex.foreach { case ((c, d), i) =>
    assert(results(i).getString(2) == c)
    assert(results(i).getString(3) == d)
  }
}

test("simple csv test with string dataset") {
val csvDataset = spark.read.text(testFile(carsFile)).as[String]
val cars = spark.read
Expand Down