Skip to content

Commit 005c946

Browse files
committed
Implement C Data integration
This starts work towards supporting the C data interface for the arrow format, as documented [here](https://arrow.apache.org/docs/format/CDataInterface.html#). Currently in this PR, it includes struct definitions and basic methods to allow getting a pointer to an `ArrowSchema`/`ArrowArray` C-compatible struct that can then be populated by another implementation. For example, with this PR, you can do:
```julia
using Arrow, PyCall
pd = pyimport("pandas")
pa = pyimport("pyarrow")
df = pd.DataFrame(py"""{'a': [1, 2, 3, 4, 5], 'b': ['a', 'b', 'c', 'd', 'e']}"""o)
rb = pa.record_batch(df)
sch = Arrow.CData.getschema() do ptr
    rb.schema._export_to_c(Int(ptr))
end
arr = Arrow.CData.getarray() do ptr
    rb._export_to_c(Int(ptr))
end
```
Currently, these `ArrowSchema`/`ArrowArray` structs are pretty bare bones, but it at least lays some ground work for integration. Things we still need/want to make all this nicer to use/work with:
* Type format string parsing/converting: we need to parse the type format strings as outlined [here](https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings) to figure out what type of data we'll get in the arrays. It'd probably be best to add a `type` field to the ArrowSchema struct that we'd populate when converting from `CArrowSchema` -> `ArrowSchema`
* Add a method like `Arrow.ArrowVector(::ArrowSchema, ::ArrowArray)` that produces a concrete `ArrowVector` subtype, like `Arrow.Primitive`, `Arrow.List`, etc. This will be a bit tricky, because we have to follow all the same columnar layout trickery that we currently handle for IPC in the table.jl `build` methods. Perhaps we can refactor all that so we can re-use some code? Otherwise, we might just need to reimplement a bunch of that logic specific to converting `ArrowArray`s.
* That should give a robust consuming story; for producing, we probably need a definition like `Arrow.ArrowSchema(a::Arrow.ArrowVector)` that produces a valid `ArrowSchema`, and then overloads per `ArrowVector` subtype like `Arrow.ArrowArray(x::Arrow.Primitive)` that produce the right `ArrowArray` for a concrete arrow array
* Then the last piece we need is just figuring out the right mechanics for providing a pointer to the `CArrowSchema`, `CArrowArray` structs once they're populated

If anyone would like to help out, I'm happy to provide as much guidance as possible so others can get their feet wet in some arrow spec nitty-gritty.
1 parent bdd0e54 commit 005c946

2 files changed

Lines changed: 166 additions & 0 deletions

File tree

src/Arrow.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ include("arraytypes/arraytypes.jl")
9292
include("eltypes.jl")
9393
include("table.jl")
9494
include("write.jl")
95+
include("cinterface.jl")
9596

9697
const LZ4_FRAME_COMPRESSOR = LZ4FrameCompressor[]
9798
const ZSTD_COMPRESSOR = ZstdCompressor[]

src/cinterface.jl

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
module CData
2+
3+
export ArrowSchema, ArrowArray, getschema, getarray
4+
5+
# Flag constants for the `flags` member of a schema struct. Each one occupies
# its own bit, so several flags can be combined with `|` and tested with `&`.
const ARROW_FLAG_DICTIONARY_ORDERED = 1 << 0
const ARROW_FLAG_NULLABLE = 1 << 1
const ARROW_FLAG_MAP_KEYS_SORTED = 1 << 2
8+
9+
"""
    CArrowSchema

C-compatible, field-for-field mirror of the `ArrowSchema` struct of the Arrow
C Data Interface. All members are raw pointers or 64-bit integers, so a
`Ptr{CArrowSchema}` can be handed to another implementation to be populated.

`release` and `private_data` are deliberately left out of `propertynames`;
read them with `getfield` when needed.
"""
struct CArrowSchema
    format::Ptr{UInt8}
    name::Ptr{UInt8}
    metadata::Ptr{UInt8}
    flags::Int64
    n_children::Int64
    children::Ptr{Ptr{CArrowSchema}}
    dictionary::Ptr{CArrowSchema}
    release::Ptr{Cvoid}
    private_data::Ptr{Cvoid}
end

"""
    CArrowSchema()

Return a `CArrowSchema` with every pointer set to `C_NULL` and every integer
set to zero.
"""
CArrowSchema() =
    CArrowSchema(C_NULL, C_NULL, C_NULL, 0, 0, C_NULL, C_NULL, C_NULL, C_NULL)

# Only the members with a decoded, user-facing representation are advertised.
Base.propertynames(::CArrowSchema) =
    (:format, :name, :metadata, :flags, :n_children, :children, :dictionary)
24+
25+
"""
    readmetadata(ptr::Ptr{UInt8}) -> Dict{String, String}

Decode the binary key/value metadata block located at `ptr`.

The block is read as: an `Int32` entry count, then for each entry an `Int32`
key length, the key bytes, an `Int32` value length and the value bytes. The
integers are read with `unsafe_load`, i.e. in native byte order, and the
strings are not NUL-terminated. A `C_NULL` pointer yields an empty `Dict`.

The caller must guarantee that `ptr` is either null or points to a complete,
well-formed block; nothing is bounds-checked.
"""
function readmetadata(ptr::Ptr{UInt8})
    meta = Dict{String, String}()
    ptr == C_NULL && return meta
    n_entries = unsafe_load(convert(Ptr{Int32}, ptr))
    ptr += sizeof(Int32)
    for _ in 1:n_entries
        keylen = unsafe_load(convert(Ptr{Int32}, ptr))
        ptr += sizeof(Int32)
        key = unsafe_string(ptr, keylen)
        ptr += keylen
        vallen = unsafe_load(convert(Ptr{Int32}, ptr))
        ptr += sizeof(Int32)
        val = unsafe_string(ptr, vallen)
        ptr += vallen
        meta[key] = val
    end
    return meta
end
45+
46+
"""
    getproperty(x::CArrowSchema, nm::Symbol)

Decode member `nm` of the raw C struct into a Julia value:

- `:format` -> `String` (throws `ArgumentError` if the pointer is null)
- `:name` -> `String`; a null pointer is returned as `""`
- `:metadata` -> `Dict{String, String}` via `readmetadata`
- `:flags`, `:n_children` -> `Int64`
- `:children` -> `Vector{CArrowSchema}`, a copy of each child struct
- `:dictionary` -> `CArrowSchema`, or `nothing` if the pointer is null

Any other name raises an error; use `getfield` for the raw members (including
`release` and `private_data`).
"""
function Base.getproperty(x::CArrowSchema, nm::Symbol)
    if nm === :format
        return unsafe_string(getfield(x, :format))
    elseif nm === :name
        # `name` is optional in the C Data Interface, so it may be null.
        p = getfield(x, :name)
        return p == C_NULL ? "" : unsafe_string(p)
    elseif nm === :metadata
        return readmetadata(getfield(x, :metadata))
    elseif nm === :flags
        return getfield(x, :flags)
    elseif nm === :n_children
        return getfield(x, :n_children)
    elseif nm === :children
        c = getfield(x, :children)
        n = getfield(x, :n_children)
        (c == C_NULL || n <= 0) && return CArrowSchema[]
        # `children` is an array of *pointers*; the child structs themselves
        # need not be contiguous, so every pointer is dereferenced on its own.
        return CArrowSchema[unsafe_load(unsafe_load(c, i)) for i in 1:n]
    elseif nm === :dictionary
        d = getfield(x, :dictionary)
        return d == C_NULL ? nothing : unsafe_load(d)
    end
    error("unknown property requested: $nm")
end
66+
67+
"""
    ArrowSchema

Julia-side view of a `CArrowSchema`: strings, metadata, children and the
optional dictionary are held as ordinary Julia values, and `carrowschema`
keeps a reference to the underlying C struct.

The type is mutable so that a finalizer can be attached to an instance.
"""
mutable struct ArrowSchema
    format::String
    name::String
    metadata::Dict{String,String}
    flags::Int64
    n_children::Int64
    children::Vector{ArrowSchema}
    dictionary::Union{Nothing,ArrowSchema}
    carrowschema::Ref{CArrowSchema}
end
77+
78+
# Shared conversion: decode every member of `s` once and recurse into the
# children and the dictionary. `ref` is stored as the `carrowschema` field.
function _toarrowschema(s::CArrowSchema, ref::Ref{CArrowSchema})
    dict = s.dictionary
    return ArrowSchema(
        s.format,
        s.name,
        s.metadata,
        s.flags,
        s.n_children,
        ArrowSchema[ArrowSchema(child) for child in s.children],
        dict === nothing ? nothing : ArrowSchema(dict),
        ref,
    )
end

"""
    ArrowSchema(s::Ref{CArrowSchema})
    ArrowSchema(s::CArrowSchema)

Convert a populated C struct into an `ArrowSchema`, recursively converting its
children and dictionary.

The `Ref` method keeps `s` itself in the `carrowschema` field. The by-value
method stores a fresh `Ref` holding a copy of `s`, so the field never refers
to uninitialized memory.
"""
ArrowSchema(s::Ref{CArrowSchema}) = _toarrowschema(s[], s)
ArrowSchema(s::CArrowSchema) = _toarrowschema(s, Ref(s))
80+
81+
# Invoke the `release` callback stored in the struct behind `ref`, if any.
function _releaseschema!(ref::Ref{CArrowSchema})
    r = getfield(ref[], :release)
    if r != C_NULL
        ccall(r, Cvoid, (Ptr{CArrowSchema},), ref)
    end
    return nothing
end

"""
    getschema(f) -> ArrowSchema

Allocate a `CArrowSchema`, call `f(ptr)` with a `Ptr{CArrowSchema}` to it so
that another implementation can populate it, and return the result converted
to an `ArrowSchema`. Works with `do`-block syntax.

The struct starts out zeroed (all pointers `C_NULL`), so a callback that does
not populate it leads to an ordinary exception rather than a read through
garbage pointers. A finalizer on the returned object invokes the struct's
`release` callback when that callback is non-null; if the conversion itself
fails, the callback is invoked immediately and the error is rethrown.
"""
function getschema(f)
    schref = Ref(CArrowSchema(C_NULL, C_NULL, C_NULL, 0, 0, C_NULL, C_NULL, C_NULL, C_NULL))
    # Keep `schref` rooted for as long as the raw pointer is in use.
    GC.@preserve schref begin
        f(Base.unsafe_convert(Ptr{CArrowSchema}, schref))
    end
    sch = try
        ArrowSchema(schref)
    catch
        # Do not leak whatever the producer exported.
        _releaseschema!(schref)
        rethrow()
    end
    finalizer(x -> _releaseschema!(x.carrowschema), sch)
    return sch
end
94+
95+
"""
    CArrowArray

C-compatible, field-for-field mirror of the `ArrowArray` struct of the Arrow
C Data Interface. All members are 64-bit integers or raw pointers.

`release` and `private_data` are not listed by `propertynames`; read them with
`getfield` when needed.
"""
struct CArrowArray
    length::Int64
    null_count::Int64
    offset::Int64
    n_buffers::Int64
    n_children::Int64
    buffers::Ptr{Ptr{UInt8}}
    children::Ptr{Ptr{CArrowArray}}
    dictionary::Ptr{CArrowArray}
    release::Ptr{Cvoid}
    private_data::Ptr{Cvoid}
end

# Instance with all counters at zero and all pointers null.
function CArrowArray()
    null = C_NULL
    return CArrowArray(0, 0, 0, 0, 0, null, null, null, null, null)
end

# Only the members with a decoded, user-facing representation are advertised.
function Base.propertynames(::CArrowArray)
    return (
        :length, :null_count, :offset, :n_buffers, :n_children,
        :buffers, :children, :dictionary,
    )
end
111+
112+
"""
    getproperty(x::CArrowArray, nm::Symbol)

Decode member `nm` of the raw C struct into a Julia value:

- `:length`, `:null_count`, `:offset`, `:n_buffers`, `:n_children` -> `Int64`
- `:buffers` -> `Vector{Ptr{UInt8}}` wrapping (not copying) the C pointer
  array; empty if the pointer is null
- `:children` -> `Vector{CArrowArray}`, a copy of each child struct
- `:dictionary` -> `CArrowArray`, or `nothing` if the pointer is null

Any other name raises an error; use `getfield` for the raw members (including
`release` and `private_data`).
"""
function Base.getproperty(x::CArrowArray, nm::Symbol)
    if nm === :length
        return getfield(x, :length)
    elseif nm === :null_count
        return getfield(x, :null_count)
    elseif nm === :offset
        return getfield(x, :offset)
    elseif nm === :n_buffers
        return getfield(x, :n_buffers)
    elseif nm === :n_children
        return getfield(x, :n_children)
    elseif nm === :buffers
        b = getfield(x, :buffers)
        return b == C_NULL ? Ptr{UInt8}[] : unsafe_wrap(Array, b, getfield(x, :n_buffers))
    elseif nm === :children
        c = getfield(x, :children)
        n = getfield(x, :n_children)
        (c == C_NULL || n <= 0) && return CArrowArray[]
        # `children` is an array of *pointers*; the child structs themselves
        # need not be contiguous, so every pointer is dereferenced on its own.
        return CArrowArray[unsafe_load(unsafe_load(c, i)) for i in 1:n]
    elseif nm === :dictionary
        d = getfield(x, :dictionary)
        return d == C_NULL ? nothing : unsafe_load(d)
    end
    error("unknown property requested: $nm")
end
135+
136+
"""
    ArrowArray

Julia-side view of a `CArrowArray`: the counters are copied, `buffers` holds
the raw buffer pointers, children and the optional dictionary are converted
recursively, and `carrowarray` keeps a reference to the underlying C struct.

The type is mutable so that a finalizer can be attached to an instance.
"""
mutable struct ArrowArray
    length::Int64
    null_count::Int64
    offset::Int64
    n_buffers::Int64
    n_children::Int64
    buffers::Vector{Ptr{UInt8}}
    children::Vector{ArrowArray}
    dictionary::Union{Nothing,ArrowArray}
    carrowarray::Ref{CArrowArray}
end
147+
148+
# Shared conversion: decode every member of `a` once and recurse into the
# children and the dictionary. `ref` is stored as the `carrowarray` field.
function _toarrowarray(a::CArrowArray, ref::Ref{CArrowArray})
    dict = a.dictionary
    return ArrowArray(
        a.length,
        a.null_count,
        a.offset,
        a.n_buffers,
        a.n_children,
        a.buffers,
        ArrowArray[ArrowArray(child) for child in a.children],
        dict === nothing ? nothing : ArrowArray(dict),
        ref,
    )
end

"""
    ArrowArray(a::Ref{CArrowArray})
    ArrowArray(a::CArrowArray)

Convert a populated C struct into an `ArrowArray`, recursively converting its
children and dictionary.

The `Ref` method keeps `a` itself in the `carrowarray` field. The by-value
method stores a fresh `Ref` holding a copy of `a`, so the field never refers
to uninitialized memory.
"""
ArrowArray(a::Ref{CArrowArray}) = _toarrowarray(a[], a)
ArrowArray(a::CArrowArray) = _toarrowarray(a, Ref(a))
150+
151+
# Invoke the `release` callback stored in the struct behind `ref`, if any.
function _releasearray!(ref::Ref{CArrowArray})
    r = getfield(ref[], :release)
    if r != C_NULL
        ccall(r, Cvoid, (Ptr{CArrowArray},), ref)
    end
    return nothing
end

"""
    getarray(f) -> ArrowArray

Allocate a `CArrowArray`, call `f(ptr)` with a `Ptr{CArrowArray}` to it so
that another implementation can populate it, and return the result converted
to an `ArrowArray`. Works with `do`-block syntax.

The struct starts out zeroed (all pointers `C_NULL`), so a callback that does
not populate it yields an empty array description instead of reads through
garbage pointers. A finalizer on the returned object invokes the struct's
`release` callback when that callback is non-null; if the conversion itself
fails, the callback is invoked immediately and the error is rethrown.
"""
function getarray(f)
    arrref = Ref(CArrowArray(0, 0, 0, 0, 0, C_NULL, C_NULL, C_NULL, C_NULL, C_NULL))
    # Keep `arrref` rooted for as long as the raw pointer is in use.
    GC.@preserve arrref begin
        f(Base.unsafe_convert(Ptr{CArrowArray}, arrref))
    end
    arr = try
        ArrowArray(arrref)
    catch
        # Do not leak whatever the producer exported.
        _releasearray!(arrref)
        rethrow()
    end
    finalizer(x -> _releasearray!(x.carrowarray), arr)
    return arr
end
164+
165+
end # module

0 commit comments

Comments
 (0)