1
- function read_dataframe (tablegroup:: HDF5.Group )
1
+ function read_dataframe (tablegroup:: HDF5.Group ; separate_index = true , kwargs ... )
2
2
columns = read_attribute (tablegroup, " column-order" )
3
3
4
- havecat = false
5
- if haskey (tablegroup, " __categories" )
6
- havecat = true
7
- catcols = tablegroup[" __categories" ]
8
- end
9
-
10
- if haskey (attributes (tablegroup), " _index" )
4
+ if separate_index && haskey (attributes (tablegroup), " _index" )
11
5
indexdsetname = read_attribute (tablegroup, " _index" )
12
6
rownames = read (tablegroup[indexdsetname])
13
7
else
@@ -17,26 +11,42 @@ function read_dataframe(tablegroup::HDF5.Group)
17
11
df = DataFrame ()
18
12
19
13
for col in columns
20
- column = read (tablegroup, col)
21
- if havecat && haskey (catcols, col)
22
- cats = read (catcols, col)
23
- column = compress (CategoricalArray (map (x -> cats[x + 1 ], column)))
14
+ column = read_matrix (tablegroup[col])
15
+ if sum (size (column) .> 1 ) > 1
16
+ @warn " column $col has more than 1 dimension for data frame $(HDF5. name (tablegroup)) , skipping"
24
17
end
25
18
df[! , col] = column
26
19
end
27
20
28
21
return df, rownames
29
22
end
30
23
31
"""
    read_matrix(f::HDF5.Dataset; kwargs...)

Read the dataset `f` and return its contents.

- A zero-dimensional dataset is returned exactly as read.
- A dataset with more than one dimension is wrapped in a `PermutedDimsArray` with the
  dimension order reversed (transpose for h5py compatibility).
- If `f` has a `"categories"` attribute, the attribute is dereferenced to the categories
  dataset, the stored values are shifted by one and used as references into a
  `CategoricalPool` built from those categories, and a compressed `CategoricalArray` is
  returned. The pool is ordered only if the categories dataset has an `"ordered"`
  attribute that compares equal to `true`.

Keyword arguments are accepted but not used by this method.
"""
function read_matrix(f::HDF5.Dataset; kwargs...)
    data = read(f)
    ndims(f) == 0 && return data

    if ndims(f) > 1
        # transpose for h5py compatibility
        data = PermutedDimsArray(data, ndims(data):-1:1)
    end

    # Plain (non-categorical) datasets are done at this point.
    haskey(attributes(f), "categories") || return data

    catdset = f[read_attribute(f, "categories")]
    ordered =
        haskey(attributes(catdset), "ordered") && read_attribute(catdset, "ordered") == true
    catlevels = read(catdset)

    # Shift the stored codes by one before using them as pool references.
    codes = data .+ 0x1
    pool = CategoricalPool{eltype(catlevels),eltype(codes)}(catlevels, ordered)
    return compress(CategoricalArray{eltype(catlevels),ndims(codes)}(codes, pool))
end
38
48
39
- function read_matrix (f:: HDF5.Group )
49
+ function read_matrix (f:: HDF5.Group ; kwargs ... )
40
50
enctype = read_attribute (f, " encoding-type" )
41
51
42
52
if enctype == " csc_matrix" || enctype == " csr_matrix"
@@ -61,30 +71,46 @@ function read_matrix(f::HDF5.Group)
61
71
mat = SparseMatrixCSC (shape... , indptr, indices, data)
62
72
return iscsr ? mat' : mat
63
73
else
64
- throw (" unknown encoding $enctype " )
74
+ error (" unknown encoding $enctype " )
65
75
end
66
76
end
67
77
68
"""
    read_dict_of_matrices(f::HDF5.Group; kwargs...)

Read every member of the group `f` with `read_matrix` and collect the results in a
`Dict{String, AbstractArray{<:Number}}` keyed by member name. Keyword arguments are
forwarded to each `read_matrix` call.
"""
function read_dict_of_matrices(f::HDF5.Group; kwargs...)
    matrices = Dict{String,AbstractArray{<:Number}}()
    for key in keys(f)
        matrices[key] = read_matrix(f[key]; kwargs...)
    end
    return matrices
end
71
83
72
"""
    read_auto(f::HDF5.Dataset; kwargs...)

Read the dataset `f` with `read_matrix` and return `(value, nothing)`. Keyword arguments
are forwarded to `read_matrix`.
"""
read_auto(f::HDF5.Dataset; kwargs...) = (read_matrix(f; kwargs...), nothing)

"""
    read_auto(f::HDF5.Group; kwargs...)

Read the group `f` according to its `"encoding-type"` attribute.

- `"dataframe"`: returns whatever `read_dataframe(f; kwargs...)` returns.
- any encoding ending in `"matrix"`: returns `(read_matrix(f; kwargs...), nothing)`.
- any other encoding: raises an error (`"unknown encoding ..."`).
- no `"encoding-type"` attribute: the group is read with `read_dict_of_mixed` and
  `(dict, nothing)` is returned.

Keyword arguments are forwarded unchanged to the reader that is selected.
"""
function read_auto(f::HDF5.Group; kwargs...)
    if !haskey(attributes(f), "encoding-type")
        # Groups without an encoding are treated as a plain collection of members.
        return read_dict_of_mixed(f; kwargs...), nothing
    end

    enctype = read_attribute(f, "encoding-type")
    if enctype == "dataframe"
        return read_dataframe(f; kwargs...)
    elseif endswith(enctype, "matrix")
        # Splat the keywords; `read_matrix(f; kwargs)` would instead pass a single
        # keyword literally named `kwargs`.
        return read_matrix(f; kwargs...), nothing
    else
        error("unknown encoding $enctype")
    end
end
83
99
84
"""
    read_dict_of_mixed(f::HDF5.Group; kwargs...)

Read every member of the group `f` with `read_auto` and return a `Dict` keyed by member
name. Only the first element of each `read_auto` result is kept. Keyword arguments are
forwarded to each `read_auto` call.
"""
function read_dict_of_mixed(f::HDF5.Group; kwargs...)
    entries = Dict{
        String,
        Union{
            DataFrame,
            <:AbstractArray{<:Number},
            <:AbstractArray{<:AbstractString},
            <:AbstractString,
            <:Number,
            Dict,
        },
    }()
    for member in keys(f)
        # assume data frames are properly aligned, so we can discard rownames
        entries[member] = first(read_auto(f[member]; kwargs...))
    end
    return entries
end
@@ -96,6 +122,15 @@ function write_attr(parent::Union{HDF5.File, HDF5.Group}, name::AbstractString,
96
122
write_impl (parent, name, data; kwargs... )
97
123
end
98
124
125
"""
    write_impl(parent, name, data::Union{<:Number, <:AbstractString}; kwargs...)

Store the scalar number or string `data` under `name` in `parent` by plain indexed
assignment and return `data`. Keyword arguments are accepted but not used.
"""
function write_impl(
    parent::Union{HDF5.File,HDF5.Group},
    name::AbstractString,
    data::Union{<:Number,<:AbstractString};
    kwargs...,
)
    setindex!(parent, data, name)
    return data
end
133
+
99
134
function write_impl (
100
135
parent:: Union{HDF5.File, HDF5.Group} ,
101
136
name:: AbstractString ,
@@ -113,19 +148,20 @@ end
113
148
"""
    write_impl(parent, name, data::CategoricalArray; kwargs...)

Write the categorical array `data` to `parent`:

1. the references of `data`, shifted down by one, are written under `name`;
2. the levels of `data` are written under `"__categories/<name>"`;
3. the levels dataset gets an `"ordered"` attribute (`UInt8` of `isordered(data)`);
4. the dataset under `name` gets a `"categories"` attribute holding an HDF5 reference to
   the levels dataset.

Keyword arguments are forwarded to the two nested `write_impl` calls.
"""
function write_impl(
    parent::Union{HDF5.File,HDF5.Group},
    name::AbstractString,
    data::CategoricalArray;
    kwargs...,
)
    catpath = "__categories/$name"

    # Codes first, then the levels they index into.
    write_impl(parent, name, data.refs .- 0x1; kwargs...)
    write_impl(parent, catpath, levels(data); kwargs...)

    attributes(parent[catpath])["ordered"] = UInt8(isordered(data))
    # Last expression: link the codes dataset to its categories.
    attributes(parent[name])["categories"] = HDF5.Reference(parent["__categories"], name)
end
123
159
124
160
function write_impl (
125
161
parent:: Union{HDF5.File, HDF5.Group} ,
126
162
name:: AbstractString ,
127
163
data:: AbstractDataFrame ;
128
- index:: AbstractVector{<:AbstractString} ,
164
+ index:: AbstractVector{<:AbstractString} = nothing ,
129
165
kwargs... ,
130
166
)
131
167
g = create_group (parent, name)
@@ -140,8 +176,18 @@ function write_impl(
140
176
141
177
idxname = " _index"
142
178
columns = names (data)
143
- while idxname ∈ columns
144
- idxname = " _" * idxname
179
+ if ! isnothing (index)
180
+ while idxname ∈ columns
181
+ idxname = " _" * idxname
182
+ end
183
+ else
184
+ if idxname ∈ columns
185
+ index = data[! , idxname]
186
+ select! (data, Not (idxname))
187
+ else
188
+ @warn " Data frame $(HDF5. name (parent)) /$name does not have an _index column, a row number index will be written"
189
+ index = 1 : nrow (data)
190
+ end
145
191
end
146
192
g = parent[name]
147
193
write_impl (g, idxname, values (index); kwargs... )
@@ -157,22 +203,16 @@ write_impl(
157
203
extensible:: Bool = false ,
158
204
compress:: UInt8 = UInt8 (9 ),
159
205
kwargs... ,
160
- ) = write_impl (
161
- parent,
162
- name,
163
- Int8 .(data);
164
- extensible= extensible,
165
- compress= compress,
166
- kwargs... ,
167
- )
206
+ ) = write_impl (parent, name, Int8 .(data); extensible= extensible, compress= compress, kwargs... )
168
207
169
208
"""
    write_impl(parent, name, data::SubArray; kwargs...)

Copy the view `data` and delegate to the `write_impl` method for the copied value,
forwarding all keyword arguments.
"""
function write_impl(
    parent::Union{HDF5.File,HDF5.Group},
    name::AbstractString,
    data::SubArray;
    kwargs...,
)
    materialized = copy(data)
    return write_impl(parent, name, materialized; kwargs...)
end
171
210
172
211
function write_impl (
173
212
parentgrp:: Union{HDF5.File, HDF5.Group} ,
174
213
name:: AbstractString ,
175
- data:: AbstractArray ,;
214
+ data:: AbstractArray ,
215
+ ;
176
216
extensible:: Bool = false ,
177
217
compress:: UInt8 = 0x9 ,
178
218
kwargs... ,
0 commit comments