diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala index 2073c5922feae..3ad4585ad1d10 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala @@ -316,7 +316,22 @@ class UnivocityParser( private def parseLine(line: String): Array[String] = { try { - tokenizer.parseLine(line) + val tokens = tokenizer.parseLine(line) + // SPARK-46959: When escape is set to "" (mapped to '\u0000'), univocity's parser + // correctly parses mid-line empty quoted fields ("") as empty strings, but misparses + // the last column's "" as a literal '"' character. This happens because there is no + // delimiter after the last field to signal field boundary, and univocity interprets + // the second '"' as content rather than a closing quote. We fix this by replacing + // a single quote-char token in the last position with the configured emptyValue.
+ if (tokens != null && tokens.length > 0 && options.escape == '\u0000') { + val lastIdx = tokens.length - 1 + val lastToken = tokens(lastIdx) + if (lastToken != null && lastToken.length == 1 && + lastToken.charAt(0) == options.quote) { + tokens(lastIdx) = options.emptyValueInRead + } + } + tokens } catch { case e: TextParsingException if e.getCause.isInstanceOf[ArrayIndexOutOfBoundsException] => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceUtils.scala index 3fd82573f001a..df3d86b167277 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceUtils.scala @@ -200,7 +200,14 @@ object DataSourceUtils extends PredicateHelper { } def shouldIgnoreCorruptFileException(e: Throwable): Boolean = e match { - case _: RuntimeException | _: IOException | _: InternalError => true + case _: RuntimeException | _: IOException | _: InternalError => + val msg = e.getMessage + if (msg != null && msg.contains( + "Cannot reserve additional contiguous bytes in the vectorized reader")) { + false + } else { + true + } case _ => false } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index a53c7be0bae7d..3e4e9d3376bfd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -162,6 +162,42 @@ abstract class CSVSuite verifyCars(cars, withHeader = true, checkTypes = true) } + test("SPARK-46959: CSV reader reads data inconsistently depending on column position") { + // When escape="" is set, empty quoted strings ("") should be parsed consistently + // as empty (and 
mapped to null via nullValue="") regardless of column position. + // Before the fix, the last column on a line would parse "" as a literal '"'. + import testImplicits._ + val data = Seq( + """"a";"b";"c";"d"""", + """10;100,00;"Some;String";"ok"""", + """20;200,00;"";"still ok"""", + """30;300,00;"also ok";"" """.trim, + """40;400,00;"";"" """.trim + ) + val df = spark.read + .option("header", "true") + .option("sep", ";") + .option("nullValue", "") + .option("quote", "\"") + .option("escape", "") + .csv(spark.createDataset(data)) + + val results = df.collect() + assert(results.length == 4) + // Row 0: both c and d have values + assert(results(0).getString(2) == "Some;String") + assert(results(0).getString(3) == "ok") + // Row 1: c is empty (mid-line), d has a value + assert(results(1).getString(2) == null) + assert(results(1).getString(3) == "still ok") + // Row 2: c has a value, d is empty (last column) - this was the buggy case + assert(results(2).getString(2) == "also ok") + assert(results(2).getString(3) == null) + // Row 3: both c and d are empty + assert(results(3).getString(2) == null) + assert(results(3).getString(3) == null) + } + test("simple csv test with string dataset") { val csvDataset = spark.read.text(testFile(carsFile)).as[String] val cars = spark.read