Skip to content

feat: Add support for List data types #39

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions Arrow/Sources/Arrow/ArrowArray.swift
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,8 @@ public class ArrowArrayHolderImpl: ArrowArrayHolder {
return try ArrowArrayHolderImpl(BinaryArray(with))
case .strct:
return try ArrowArrayHolderImpl(StructArray(with))
case .list:
return try ArrowArrayHolderImpl(ListArray(with))
default:
throw ArrowError.invalid("Array not found for type: \(arrowType)")
}
Expand Down Expand Up @@ -405,3 +407,69 @@ public class StructArray: ArrowArray<[Any?]> {
return output
}
}

public class ListArray: ArrowArray<[Any?]> {
Copy link
Contributor

@abandy abandy Jun 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we use a generic here: like ListArray<T>: ArrowArray<[T?]>?

public private(set) var values: ArrowArrayHolder?

/// Creates a list array over `arrowData` and loads its only child as the element values.
///
/// - Parameter arrowData: Backing data; it must carry exactly one child and its
///   type must be an `ArrowTypeList`.
/// - Throws: `ArrowError.invalid` when the child count is not one or the type is
///   not an `ArrowTypeList`; errors from the superclass initializer and from
///   `ArrowArrayHolderImpl.loadArray` are propagated unchanged.
public required init(_ arrowData: ArrowData) throws {
    try super.init(arrowData)

    let children = arrowData.children
    if children.count != 1 {
        throw ArrowError.invalid("List array must have exactly one child")
    }

    guard let declaredType = arrowData.type as? ArrowTypeList else {
        throw ArrowError.invalid("Expected ArrowTypeList")
    }

    // The element array is built from the declared element type and the single child.
    let elementHolder = try ArrowArrayHolderImpl.loadArray(declaredType.elementType, with: children[0])
    self.values = elementHolder
}

/// Returns the elements of the list stored at `index`.
///
/// The result is `nil` when the child values have not been loaded or when the
/// slot at `index` is null. Otherwise the two consecutive `Int32` offsets at
/// `index` and `index + 1` (read from `buffers[1]`) delimit the half-open range
/// of child elements that make up the list.
public override subscript(_ index: UInt) -> [Any?]? {
    guard let elements = self.values, !self.arrowData.isNull(index) else {
        return nil
    }

    let stride = MemoryLayout<Int32>.stride
    let slot = self.arrowData.buffers[1].rawPointer.advanced(by: Int(index) * stride)
    let lower = slot.load(as: Int32.self)
    let upper = slot.advanced(by: stride).load(as: Int32.self)

    let items: [Any?] = (lower..<upper).map { position -> Any? in
        elements.array.asAny(UInt(position))
    }
    return items
}

/// Renders the list at `index` as `[a,b,...]`.
///
/// A null list (or one whose values are unavailable) is rendered as `null`.
/// Each element is rendered as `null` when it is `nil`, through
/// `AsString.asString(0)` when it conforms to `AsString`, and through string
/// interpolation otherwise. Elements are separated by a bare comma.
public override func asString(_ index: UInt) -> String {
    guard let list = self[index] else {
        return "null"
    }

    let rendered: [String] = list.map { element -> String in
        guard let value = element else {
            return "null"
        }
        if let printable = value as? AsString {
            return printable.asString(0)
        }
        return "\(value)"
    }

    return "[" + rendered.joined(separator: ",") + "]"
}
}
47 changes: 45 additions & 2 deletions Arrow/Sources/Arrow/ArrowArrayBuilder.swift
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ public class StructArrayBuilder: ArrowArrayBuilder<StructBufferBuilder, StructAr
public init(_ fields: [ArrowField], builders: [any ArrowArrayHolderBuilder]) throws {
self.fields = fields
self.builders = builders
try super.init(ArrowNestedType(ArrowType.ArrowStruct, fields: fields))
try super.init(ArrowTypeStruct(ArrowType.ArrowStruct, fields: fields))
self.bufferBuilder.initializeTypeInfo(fields)
}

Expand All @@ -143,7 +143,7 @@ public class StructArrayBuilder: ArrowArrayBuilder<StructBufferBuilder, StructAr
}

self.builders = builders
try super.init(ArrowNestedType(ArrowType.ArrowStruct, fields: fields))
try super.init(ArrowTypeStruct(ArrowType.ArrowStruct, fields: fields))
}

public override func append(_ values: [Any?]?) {
Expand Down Expand Up @@ -174,6 +174,31 @@ public class StructArrayBuilder: ArrowArrayBuilder<StructBufferBuilder, StructAr
}
}

/// Builder that accumulates optional `[Any?]` rows into a `ListArray`.
///
/// Row-level bookkeeping is delegated to the `ListBufferBuilder`, while the
/// elements of every non-nil row are forwarded, in order, to a child builder
/// created for the element type.
public class ListArrayBuilder: ArrowArrayBuilder<ListBufferBuilder, ListArray> {
    /// Child builder receiving the elements of every appended list.
    let valueBuilder: any ArrowArrayHolderBuilder

    /// Creates a list builder whose elements are of `elementType`.
    ///
    /// - Throws: Whatever `ArrowArrayBuilders.loadBuilder(arrowType:)` or the
    ///   superclass initializer throws.
    public override init(_ elementType: ArrowType) throws {
        let elementBuilder = try ArrowArrayBuilders.loadBuilder(arrowType: elementType)
        self.valueBuilder = elementBuilder
        try super.init(ArrowTypeList(elementType))
    }

    /// Appends one list row; a `nil` row adds no child elements.
    public override func append(_ values: [Any?]?) {
        self.bufferBuilder.append(values)
        guard let elements = values else {
            return
        }
        elements.forEach { self.valueBuilder.appendAny($0) }
    }

    /// Produces the `ListArray` from the list buffers and the child builder's data.
    public override func finish() throws -> ListArray {
        let listBuffers = self.bufferBuilder.finish()
        let elementHolder = try self.valueBuilder.toHolder()
        let listData = try ArrowData(
            self.type,
            buffers: listBuffers,
            children: [elementHolder.array.arrowData],
            nullCount: self.nullCount,
            length: self.length
        )
        return try ListArray(listData)
    }
}

public class ArrowArrayBuilders {
public static func loadBuilder( // swiftlint:disable:this cyclomatic_complexity
_ builderType: Any.Type) throws -> ArrowArrayHolderBuilder {
Expand Down Expand Up @@ -290,6 +315,16 @@ public class ArrowArrayBuilders {
throw ArrowError.invalid("Expected arrow type for \(arrowType.id) not found")
}
return try TimestampArrayBuilder(timestampType.unit)
case .list:
guard let listType = arrowType as? ArrowTypeList else {
throw ArrowError.invalid("Expected ArrowTypeList for \(arrowType.id)")
}
return try ListArrayBuilder(listType.elementType)
case .strct:
guard let structType = arrowType as? ArrowTypeStruct else {
throw ArrowError.invalid("Expected ArrowStructType for \(arrowType.id)")
}
return try StructArrayBuilder(structType.fields)
Comment on lines +318 to +327
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you use struct -> list order like others?

Suggested change
case .list:
guard let listType = arrowType as? ArrowTypeList else {
throw ArrowError.invalid("Expected ArrowTypeList for \(arrowType.id)")
}
return try ListArrayBuilder(listType.elementType)
case .strct:
guard let structType = arrowType as? ArrowTypeStruct else {
throw ArrowError.invalid("Expected ArrowStructType for \(arrowType.id)")
}
return try StructArrayBuilder(structType.fields)
case .strct:
guard let structType = arrowType as? ArrowTypeStruct else {
throw ArrowError.invalid("Expected ArrowStructType for \(arrowType.id)")
}
return try StructArrayBuilder(structType.fields)
case .list:
guard let listType = arrowType as? ArrowTypeList else {
throw ArrowError.invalid("Expected ArrowTypeList for \(arrowType.id)")
}
return try ListArrayBuilder(listType.elementType)

default:
throw ArrowError.unknownType("Builder not found for arrow type: \(arrowType.id)")
}
Expand Down Expand Up @@ -353,4 +388,12 @@ public class ArrowArrayBuilders {
/// Creates a `TimestampArrayBuilder` for `unit`, forwarding the optional `timezone`.
///
/// - Throws: Whatever the `TimestampArrayBuilder` initializer throws.
public static func loadTimestampArrayBuilder(_ unit: ArrowTimestampUnit, timezone: String? = nil) throws -> TimestampArrayBuilder {
    let builder = try TimestampArrayBuilder(unit, timezone: timezone)
    return builder
}

/// Creates a `StructArrayBuilder` for the given `fields`.
///
/// - Throws: Whatever the `StructArrayBuilder` initializer throws.
public static func loadStructArrayBuilder(_ fields: [ArrowField]) throws -> StructArrayBuilder {
    let builder = try StructArrayBuilder(fields)
    return builder
}

/// Creates a `ListArrayBuilder` whose elements are of `elementType`.
///
/// - Throws: Whatever the `ListArrayBuilder` initializer throws.
public static func loadListArrayBuilder(_ elementType: ArrowType) throws -> ListArrayBuilder {
    let builder = try ListArrayBuilder(elementType)
    return builder
}
}
63 changes: 61 additions & 2 deletions Arrow/Sources/Arrow/ArrowBufferBuilder.swift
Original file line number Diff line number Diff line change
Expand Up @@ -338,14 +338,14 @@ public class Date64BufferBuilder: AbstractWrapperBufferBuilder<Date, Int64> {

public final class StructBufferBuilder: BaseBufferBuilder, ArrowBufferBuilder {
public typealias ItemType = [Any?]
var info: ArrowNestedType?
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A nested type should be able to handle a list without introducing separate types for list and struct: a list can be expressed as a nested type with only a single field. Is there a reason you are choosing to move away from nested types?

var info: ArrowTypeStruct?
public init() throws {
let nulls = ArrowBuffer.createBuffer(0, size: UInt(MemoryLayout<UInt8>.stride))
super.init(nulls)
}

public func initializeTypeInfo(_ fields: [ArrowField]) {
info = ArrowNestedType(ArrowType.ArrowStruct, fields: fields)
info = ArrowTypeStruct(ArrowType.ArrowStruct, fields: fields)
}

public func append(_ newValue: [Any?]?) {
Expand Down Expand Up @@ -379,3 +379,62 @@ public final class StructBufferBuilder: BaseBufferBuilder, ArrowBufferBuilder {
return [nulls]
}
}

public class ListBufferBuilder: BaseBufferBuilder, ArrowBufferBuilder {
public typealias ItemType = [Any?]
var offsets: ArrowBuffer

/// Creates an empty builder: an offsets buffer with one `Int32` slot, a null
/// buffer of length zero, and a leading offset of `0`.
public required init() throws {
    let offsetStride = UInt(MemoryLayout<Int32>.stride)
    self.offsets = ArrowBuffer.createBuffer(1, size: offsetStride)
    super.init(ArrowBuffer.createBuffer(0, size: UInt(MemoryLayout<UInt8>.stride)))
    // The first list starts at offset zero.
    self.offsets.rawPointer.storeBytes(of: 0, as: Int32.self)
}

public func append(_ newValue: [Any?]?) {
let index = UInt(self.length)
self.length += 1

if length >= self.offsets.length {
self.resize(length + 1)
Comment on lines +398 to +399
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we use self.length here?

Suggested change
if length >= self.offsets.length {
self.resize(length + 1)
if self.length >= self.offsets.length {
self.resize(self.length + 1)

}

let offsetIndex = Int(index) * MemoryLayout<Int32>.stride
let currentOffset = self.offsets.rawPointer.advanced(by: offsetIndex).load(as: Int32.self)

if let vals = newValue {
BitUtility.setBit(index + self.offset, buffer: self.nulls)
let newOffset = currentOffset + Int32(vals.count)
self.offsets.rawPointer.advanced(by: offsetIndex + MemoryLayout<Int32>.stride).storeBytes(of: newOffset, as: Int32.self)
} else {
self.nullCount += 1
BitUtility.clearBit(index + self.offset, buffer: self.nulls)
self.offsets.rawPointer.advanced(by: offsetIndex + MemoryLayout<Int32>.stride).storeBytes(of: currentOffset, as: Int32.self)
}
}

/// Reports whether the slot at `index` is null, i.e. its bit (shifted by the
/// builder's `offset`) is not set in the null buffer.
public override func isNull(_ index: UInt) -> Bool {
    let bitPosition = index + self.offset
    let isValid = BitUtility.isSet(bitPosition, buffer: self.nulls)
    return !isValid
}

/// Grows the offsets and null buffers when `length` exceeds the current
/// offsets length; otherwise does nothing.
///
/// The new size comes from `resizeLength(_:)` applied to the offsets buffer,
/// and the existing contents of both buffers are copied into the replacements.
public func resize(_ length: UInt) {
    guard length > self.offsets.length else {
        return
    }

    let grownLength = self.resizeLength(self.offsets)
    var grownOffsets = ArrowBuffer.createBuffer(grownLength, size: UInt(MemoryLayout<Int32>.size))
    var grownNulls = ArrowBuffer.createBuffer(grownLength/8 + 1, size: UInt(MemoryLayout<UInt8>.size))
    ArrowBuffer.copyCurrent(self.offsets, to: &grownOffsets, len: self.offsets.capacity)
    ArrowBuffer.copyCurrent(self.nulls, to: &grownNulls, len: self.nulls.capacity)
    self.offsets = grownOffsets
    self.nulls = grownNulls
}

/// Copies the accumulated state into freshly created buffers and returns
/// them as `[nulls, offsets]`.
///
/// The null buffer is created with `length/8 + 1` bytes and the offsets buffer
/// with `length + 1` `Int32` entries.
public func finish() -> [ArrowBuffer] {
    let rowCount = self.length
    var packedNulls = ArrowBuffer.createBuffer(rowCount/8 + 1, size: UInt(MemoryLayout<UInt8>.size))
    var packedOffsets = ArrowBuffer.createBuffer(rowCount + 1, size: UInt(MemoryLayout<Int32>.size))
    ArrowBuffer.copyCurrent(self.nulls, to: &packedNulls, len: packedNulls.capacity)
    ArrowBuffer.copyCurrent(self.offsets, to: &packedOffsets, len: packedOffsets.capacity)
    return [packedNulls, packedOffsets]
}
}
44 changes: 39 additions & 5 deletions Arrow/Sources/Arrow/ArrowReader.swift
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,35 @@ public class ArrowReader { // swiftlint:disable:this type_body_length
rbLength: UInt(loadInfo.batchData.recordBatch.length))
}

/// Loads a list column (or nested list) from the record batch.
///
/// Consumes, in order, the next field node, the validity buffer and the offsets
/// buffer from `loadInfo.batchData`, then loads the single child field and
/// assembles the list holder.
///
/// - Parameters:
///   - loadInfo: Reader state supplying nodes, buffers and the file data.
///   - field: Flatbuffer field describing the list; it must have exactly one child.
/// - Returns: The list array holder, or an `ArrowError` when a node/buffer is
///   missing, the child count is not one, or loading the child fails.
private func loadListData(_ loadInfo: DataLoadInfo, field: org_apache_arrow_flatbuf_Field) -> Result<ArrowArrayHolder, ArrowError> {
    guard let node = loadInfo.batchData.nextNode() else {
        return .failure(.invalid("Node not found"))
    }

    guard let nullBuffer = loadInfo.batchData.nextBuffer() else {
        return .failure(.invalid("Null buffer not found"))
    }

    guard let offsetBuffer = loadInfo.batchData.nextBuffer() else {
        return .failure(.invalid("Offset buffer not found"))
    }

    // One validity bit per list slot, rounded up to whole bytes.
    let nullLength = UInt(ceil(Double(node.length) / 8))
    let arrowNullBuffer = makeBuffer(nullBuffer, fileData: loadInfo.fileData, length: nullLength, messageOffset: loadInfo.messageOffset)
    // A list with N slots carries N + 1 offsets.
    let arrowOffsetBuffer = makeBuffer(offsetBuffer, fileData: loadInfo.fileData, length: UInt(node.length + 1), messageOffset: loadInfo.messageOffset)

    guard field.childrenCount == 1, let childField = field.children(at: 0) else {
        return .failure(.invalid("List must have exactly one child"))
    }

    switch loadField(loadInfo, field: childField) {
    case .success(let childHolder):
        // Use this node's own slot count as the array length: for a list nested
        // inside another list it differs from the record batch row count.
        return makeArrayHolder(
            field,
            buffers: [arrowNullBuffer, arrowOffsetBuffer],
            nullCount: UInt(node.nullCount),
            children: [childHolder.array.arrowData],
            rbLength: UInt(node.length)
        )
    case .failure(let error):
        return .failure(error)
    }
}

private func loadPrimitiveData(
_ loadInfo: DataLoadInfo,
field: org_apache_arrow_flatbuf_Field)
Expand Down Expand Up @@ -178,12 +207,17 @@ public class ArrowReader { // swiftlint:disable:this type_body_length
_ loadInfo: DataLoadInfo,
field: org_apache_arrow_flatbuf_Field)
-> Result<ArrowArrayHolder, ArrowError> {
if isNestedType(field.typeType) {
switch field.typeType {
case .struct_:
return loadStructData(loadInfo, field: field)
} else if isFixedPrimitive(field.typeType) {
return loadPrimitiveData(loadInfo, field: field)
} else {
return loadVariableData(loadInfo, field: field)
case .list:
return loadListData(loadInfo, field: field)
default:
if isFixedPrimitive(field.typeType) {
return loadPrimitiveData(loadInfo, field: field)
} else {
return loadVariableData(loadInfo, field: field)
}
}
}

Expand Down
36 changes: 26 additions & 10 deletions Arrow/Sources/Arrow/ArrowReaderHelper.swift
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,23 @@ func makeStructHolder(
}
}

/// Wraps the given buffers and children in a `ListArray` holder.
///
/// Builds an `ArrowData` of `field.type` with length `rbLength` and constructs a
/// `ListArray` from it. An `ArrowError` thrown along the way is returned as-is;
/// any other error is reported as `.unknownError` carrying its description.
func makeListHolder(
    _ field: ArrowField,
    buffers: [ArrowBuffer],
    nullCount: UInt,
    children: [ArrowData],
    rbLength: UInt
) -> Result<ArrowArrayHolder, ArrowError> {
    let attempt = Result<ArrowArrayHolder, Error> {
        let listData = try ArrowData(
            field.type,
            buffers: buffers,
            children: children,
            nullCount: nullCount,
            length: rbLength
        )
        return ArrowArrayHolderImpl(try ListArray(listData))
    }

    return attempt.mapError { failure -> ArrowError in
        if let arrowFailure = failure as? ArrowError {
            return arrowFailure
        }
        return .unknownError("\(failure)")
    }
}

func makeArrayHolder(
_ field: org_apache_arrow_flatbuf_Field,
buffers: [ArrowBuffer],
Expand Down Expand Up @@ -208,6 +225,8 @@ func makeArrayHolder( // swiftlint:disable:this cyclomatic_complexity
return makeTimestampHolder(field, buffers: buffers, nullCount: nullCount)
case .strct:
return makeStructHolder(field, buffers: buffers, nullCount: nullCount, children: children!, rbLength: rbLength)
case .list:
return makeListHolder(field, buffers: buffers, nullCount: nullCount, children: children!, rbLength: rbLength)
default:
return .failure(.unknownType("Type \(typeId) currently not supported"))
}
Expand All @@ -230,15 +249,6 @@ func isFixedPrimitive(_ type: org_apache_arrow_flatbuf_Type_) -> Bool {
}
}

/// Reports whether `type` is a nested flatbuffer type; only `.struct_` qualifies.
func isNestedType(_ type: org_apache_arrow_flatbuf_Type_) -> Bool {
    if case .struct_ = type {
        return true
    }
    return false
}

func findArrowType( // swiftlint:disable:this cyclomatic_complexity function_body_length
_ field: org_apache_arrow_flatbuf_Field) -> ArrowType {
let type = field.typeType
Expand Down Expand Up @@ -307,7 +317,13 @@ func findArrowType( // swiftlint:disable:this cyclomatic_complexity function_bod
ArrowField(childField.name ?? "", type: childType, isNullable: childField.nullable))
}

return ArrowNestedType(ArrowType.ArrowStruct, fields: fields)
return ArrowTypeStruct(ArrowType.ArrowStruct, fields: fields)
case .list:
guard field.childrenCount == 1, let childField = field.children(at: 0) else {
return ArrowType(ArrowType.ArrowUnknown)
}
let childType = findArrowType(childField)
return ArrowTypeList(childType)
default:
return ArrowType(ArrowType.ArrowUnknown)
}
Expand Down
Loading