knime · HedgehogCode · Jan 28, 2026 · Feb 2, 2026 · Feb 2, 2026 · Jan 29, 2026
@@ -14,7 +14,7 @@ pixi run format       # Ruff (target: Python 3.8)
 
 ## Repository-Specific Conventions
 - **Path setup**: Root `conftest.py` adds all plugin paths to `sys.path` - **never modify `sys.path` in tests**
-- **Test style**: Function-based tests only (`pytest.ini` disables class discovery)
+- **Test style**: Primarily function-based tests. Class-based tests (`Test*` and `*Test` patterns) are enabled in `pytest.ini` to support parameterized tests.
 - **Multi-version**: Changes must work across Python 3.8-3.14 - test with `pixi run test-all`
 - **PyArrow versions**: Tightly coupled to Python version in `pixi.toml` - don't update independently
 

@@ -74,6 +74,7 @@
     _generate_test_data_frame,
     _apply_to_array,
     _register_extension_types,
+    _generate_arrow_table,
 )
 
 
@@ -942,6 +943,49 @@ def test_struct_dict_encoded_logical_type_extension_type(self):
         self.assertEqual(df["Name"].iloc[5], "LINESTRING (40 20, 10 30, 35 40)")
         self.assertEqual(df["Name"].iloc[6], "LINESTRING (30 10, 10 30, 40 40)")
 
+    def test_struct_dict_encoding_with_chunks_regression(self):
+        """
+        Regression test for struct dict encoded array corruption during round-trip conversion.
+
+        Bug Context:
+        KNIME uses struct dict encoding to compress data: values are stored only on first
+        occurrence in a struct {key: uint, value: T}, with subsequent occurrences referencing
+        them by key. This creates an encoded array where keys point to indices in the data
+        array, and the data array contains actual values at those indices.
+
+        The Problem:
+        In pandas 2.1.0+, pd.concat changed and now calls KnimePandasExtensionArray.take with
+        indices, which re-batches ChunkedArrays at different boundaries. When an already-encoded
+        struct dict array gets split incorrectly:
+        - Chunk 1 might contain data array with actual values at indices [0,1,2...]
+        - Chunk 2 gets keys [0,0,0...] referencing index 0, but chunk 2's data array has
+          null at index 0 (the actual value is in chunk 1)
+
+        This causes reads to fail with "Cannot read DataCell with empty type information"
+        because the dictionary structure is broken across chunks.
+
+        What This Test Does:
+        1. Loads structDictEncodedDataCellsWithBatches.zip which
+           - has 3 batches
+           - contains a struct dict encoded column for generic data cells
+        2. Performs arrow → pandas → arrow round-trip conversion
+        3. Asserts all columns remain equal after round-trip
+
+        This was fixed by adding a shortcut to KnimePandasExtensionArray.take that returns a copy
+        instead of taking from the storage array when the indices select the full array in order:
+        ```
+        if len(indices) == len(self) and np.all(indices == np.arange(len(self))):
+            return self.copy()
+        ```
+        """
+        arrow_table = _generate_arrow_table("structDictEncodedDataCellsWithBatches.zip")
+        df = kap.arrow_data_to_pandas_df(arrow_table)
+        arrow_table_2 = kap.pandas_df_to_arrow(df)
+
+        self.assertEqual(len(arrow_table_2.columns), 2)
+        self.assertEqual(arrow_table.column(0), arrow_table_2.column(0))
+        self.assertEqual(arrow_table.column(1), arrow_table_2.column(1))
+
     def test_chunk_calculation(self):
         def _get_chunked_array_for_start_indices(chunk_start_indices):
             chunk_list = []