Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,9 @@ public BaseLargeVariableWidthVector(Field field, final BufferAllocator allocator
lastValueCapacity = INITIAL_VALUE_ALLOCATION - 1;
valueCount = 0;
lastSet = -1;
offsetBuffer = allocator.getEmpty();
// Allocate offset buffer with at least OFFSET_WIDTH capacity to ensure
// offset[0] is always available according to Arrow spec.
offsetBuffer = allocateOffsetBuffer(OFFSET_WIDTH);
validityBuffer = allocator.getEmpty();
valueBuffer = allocator.getEmpty();
}
Expand Down Expand Up @@ -373,14 +375,29 @@ private void setReaderAndWriterIndex() {
valueBuffer.readerIndex(0);
if (valueCount == 0) {
validityBuffer.writerIndex(0);
offsetBuffer.writerIndex(0);
valueBuffer.writerIndex(0);
} else {
final long lastDataOffset = getStartOffset(valueCount);
validityBuffer.writerIndex(BitVectorHelper.getValidityBufferSizeFromCount(valueCount));
offsetBuffer.writerIndex((long) (valueCount + 1) * OFFSET_WIDTH);
valueBuffer.writerIndex(lastDataOffset);
}
// The IPC serializer determines the readable bytes from `readerIndex` and `writerIndex`.
// If both are set to 0, zero bytes are written to the IPC stream, which will crash IPC
// readers in other libraries. According to the Arrow spec, we should still output the
// offset buffer, which is [0].
final long requiredOffsetBufferSize = (long) (valueCount + 1) * OFFSET_WIDTH;
if (offsetBuffer.capacity() < requiredOffsetBufferSize) {
// Allocate a new buffer with sufficient capacity. This can happen when vector
// was loaded via loadFieldBuffers() with an empty offset buffer.
ArrowBuf newOffsetBuffer = allocateOffsetBuffer(requiredOffsetBufferSize);
// Copy existing data if any
if (offsetBuffer.capacity() > 0) {
newOffsetBuffer.setBytes(0, offsetBuffer, 0, offsetBuffer.capacity());
}
offsetBuffer.getReferenceManager().release();
offsetBuffer = newOffsetBuffer;
}
offsetBuffer.writerIndex(requiredOffsetBufferSize);
}

/** Same as {@link #allocateNewSafe()}. */
Expand Down Expand Up @@ -492,7 +509,9 @@ private void allocateBytes(final long valueBufferSize, final int valueCount) {

/* allocate offset buffer */
private ArrowBuf allocateOffsetBuffer(final long size) {
ArrowBuf offsetBuffer = allocator.buffer(size);
// Ensure at least OFFSET_WIDTH capacity according to Arrow spec
final long curSize = Math.max(size, OFFSET_WIDTH);
ArrowBuf offsetBuffer = allocator.buffer(curSize);
offsetBuffer.readerIndex(0);
offsetBuffer.setZero(0, offsetBuffer.capacity());
return offsetBuffer;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,9 @@ public BaseVariableWidthVector(Field field, final BufferAllocator allocator) {
lastValueCapacity = INITIAL_VALUE_ALLOCATION - 1;
valueCount = 0;
lastSet = -1;
offsetBuffer = allocator.getEmpty();
// Allocate offset buffer with at least OFFSET_WIDTH capacity to ensure
// offset[0] is always available according to Arrow spec.
offsetBuffer = allocateOffsetBuffer(OFFSET_WIDTH);
validityBuffer = allocator.getEmpty();
valueBuffer = allocator.getEmpty();
}
Expand Down Expand Up @@ -389,14 +391,29 @@ private void setReaderAndWriterIndex() {
valueBuffer.readerIndex(0);
if (valueCount == 0) {
validityBuffer.writerIndex(0);
offsetBuffer.writerIndex(0);
valueBuffer.writerIndex(0);
} else {
final int lastDataOffset = getStartOffset(valueCount);
validityBuffer.writerIndex(BitVectorHelper.getValidityBufferSizeFromCount(valueCount));
offsetBuffer.writerIndex((long) (valueCount + 1) * OFFSET_WIDTH);
valueBuffer.writerIndex(lastDataOffset);
}
// The IPC serializer determines the readable bytes from `readerIndex` and `writerIndex`.
// If both are set to 0, zero bytes are written to the IPC stream, which will crash IPC
// readers in other libraries. According to the Arrow spec, we should still output the
// offset buffer, which is [0].
final long requiredOffsetBufferSize = (long) (valueCount + 1) * OFFSET_WIDTH;
if (offsetBuffer.capacity() < requiredOffsetBufferSize) {
// Allocate a new buffer with sufficient capacity. This can happen when vector
// was loaded via loadFieldBuffers() with an empty offset buffer.
ArrowBuf newOffsetBuffer = allocateOffsetBuffer(requiredOffsetBufferSize);
// Copy existing data if any
if (offsetBuffer.capacity() > 0) {
newOffsetBuffer.setBytes(0, offsetBuffer, 0, offsetBuffer.capacity());
}
offsetBuffer.getReferenceManager().release();
offsetBuffer = newOffsetBuffer;
}
offsetBuffer.writerIndex(requiredOffsetBufferSize);
}

/** Same as {@link #allocateNewSafe()}. */
Expand Down Expand Up @@ -509,7 +526,8 @@ private void allocateBytes(final long valueBufferSize, final int valueCount) {

/* allocate offset buffer */
private ArrowBuf allocateOffsetBuffer(final long size) {
final int curSize = (int) size;
// Ensure at least OFFSET_WIDTH capacity according to Arrow spec
final int curSize = (int) Math.max(size, OFFSET_WIDTH);
ArrowBuf offsetBuffer = allocator.buffer(curSize);
offsetBuffer.readerIndex(0);
offsetBuffer.setZero(0, offsetBuffer.capacity());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3940,4 +3940,42 @@ public void testVectorLoadUnloadOnNonVariadicVectors() {
}
}
}

@Test
public void testEmptyVarCharOffsetBuffer() {
  // An empty VarCharVector (valueCount == 0) must still expose an offset buffer that holds
  // at least one offset entry, offset[0] = 0, as required by the Arrow specification.
  try (VarCharVector emptyVector = newVarCharVector("varchar", allocator)) {
    emptyVector.allocateNew();
    emptyVector.setValueCount(0);

    // Field buffers are ordered [validity, offset, data]; index 1 is the offset buffer.
    final ArrowBuf offsetFieldBuffer = emptyVector.getFieldBuffers().get(1);
    final String tooSmallMessage =
        "Offset buffer should have at least "
            + BaseVariableWidthVector.OFFSET_WIDTH
            + " bytes for offset[0]";

    assertTrue(
        offsetFieldBuffer.readableBytes() >= BaseVariableWidthVector.OFFSET_WIDTH,
        tooSmallMessage);

    // The single offset entry of an empty vector has to be zero.
    assertEquals(0, emptyVector.getOffsetBuffer().getInt(0));
  }
}

@Test
public void testEmptyLargeVarCharOffsetBuffer() {
  // An empty LargeVarCharVector (valueCount == 0) must still expose an offset buffer that
  // holds at least one offset entry, offset[0] = 0, as required by the Arrow specification.
  try (LargeVarCharVector emptyVector = new LargeVarCharVector("largevarchar", allocator)) {
    emptyVector.allocateNew();
    emptyVector.setValueCount(0);

    // Field buffers are ordered [validity, offset, data]; index 1 is the offset buffer.
    final ArrowBuf offsetFieldBuffer = emptyVector.getFieldBuffers().get(1);
    final String tooSmallMessage =
        "Offset buffer should have at least "
            + BaseLargeVariableWidthVector.OFFSET_WIDTH
            + " bytes for offset[0]";

    assertTrue(
        offsetFieldBuffer.readableBytes() >= BaseLargeVariableWidthVector.OFFSET_WIDTH,
        tooSmallMessage);

    // The single offset entry of an empty vector has to be zero.
    assertEquals(0, emptyVector.getOffsetBuffer().getLong(0));
  }
}
}
Loading