Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,22 @@ class UnivocityParser(

private def parseLine(line: String): Array[String] = {
try {
tokenizer.parseLine(line)
val tokens = tokenizer.parseLine(line)
// SPARK-46959: When escape is set to "" (mapped to '\u0000'), univocity's parser
// correctly parses mid-line empty quoted fields ("") as empty strings, but misparses
// the last column's "" as a literal '"' character. This happens because there is no
// delimiter after the last field to signal field boundary, and univocity interprets
// the second '"' as content rather than a closing quote. We fix this by replacing
// a single quote-char token in the last position with the configured emptyValue.
if (tokens != null && tokens.length > 0 && options.escape == '\u0000') {
val lastIdx = tokens.length - 1
val lastToken = tokens(lastIdx)
if (lastToken != null && lastToken.length == 1 &&
lastToken.charAt(0) == options.quote) {
tokens(lastIdx) = options.emptyValueInRead
}
}
tokens
}
catch {
case e: TextParsingException if e.getCause.isInstanceOf[ArrayIndexOutOfBoundsException] =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,14 @@ object DataSourceUtils extends PredicateHelper {
}

/**
 * Decides whether `e` should be swallowed by the ignore-corrupt-files code path
 * (presumably gated by `spark.sql.files.ignoreCorruptFiles` — confirm at call sites).
 *
 * RuntimeException, IOException and InternalError are normally treated as signals of a
 * corrupt file. The one exception is a vectorized-reader memory-reservation failure:
 * that indicates resource exhaustion, not data corruption, so it must propagate to the
 * caller instead of being silently ignored.
 *
 * @param e the throwable raised while reading a file
 * @return true if the file should be treated as corrupt and skipped, false otherwise
 */
def shouldIgnoreCorruptFileException(e: Throwable): Boolean = e match {
  case _: RuntimeException | _: IOException | _: InternalError =>
    // A null message cannot carry the memory-reservation marker, so `forall` on
    // None correctly yields true (treat as corrupt file).
    Option(e.getMessage).forall(!_.contains(
      "Cannot reserve additional contiguous bytes in the vectorized reader"))
  case _ => false
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,42 @@ abstract class CSVSuite
verifyCars(cars, withHeader = true, checkTypes = true)
}

test("SPARK-46959: CSV reader reads data inconsistently depending on column position") {
  // With escape="" configured, an empty quoted field ("") must parse identically no
  // matter where it appears on the line; nullValue="" then maps it to null. Prior to
  // the fix, a trailing "" in the last column came back as a literal '"' character.
  import testImplicits._
  val header = "\"a\";\"b\";\"c\";\"d\""
  val lines = Seq(
    header,
    "10;100,00;\"Some;String\";\"ok\"",
    "20;200,00;\"\";\"still ok\"",
    "30;300,00;\"also ok\";\"\"",
    "40;400,00;\"\";\"\""
  )
  val df = spark.read
    .option("header", "true")
    .option("sep", ";")
    .option("nullValue", "")
    .option("quote", "\"")
    .option("escape", "")
    .csv(spark.createDataset(lines))

  val results = df.collect()
  assert(results.length == 4)
  // Expected (c, d) pairs per row; null marks an empty quoted field.
  // Row 2 (value + trailing empty) is the previously buggy case.
  val expected = Seq(
    ("Some;String", "ok"),       // both populated
    (null, "still ok"),          // empty mid-line
    ("also ok", null),           // empty in last column — the bug
    (null, null)                 // both empty
  )
  expected.zipWithIndex.foreach { case ((c, d), i) =>
    assert(results(i).getString(2) == c)
    assert(results(i).getString(3) == d)
  }
}

test("simple csv test with string dataset") {
val csvDataset = spark.read.text(testFile(carsFile)).as[String]
val cars = spark.read
Expand Down