
Commit c1cbbc5

implement read/write of .uns
1 parent bb87633 commit c1cbbc5

3 files changed: +107 −47 lines

src/anndata.jl

Lines changed: 14 additions & 4 deletions
@@ -19,6 +19,8 @@ mutable struct AnnData <: AbstractAnnData
 
     layers::AbstractAlignedMapping{Tuple{1 => 1, 2 => 2}, String}
 
+    uns::Dict{<:AbstractString, <:Any}
+
     function AnnData(file::Union{HDF5.File, HDF5.Group}, backed=true, checkversion=true)
         if checkversion
             attrs = attributes(file)
@@ -71,6 +73,11 @@ mutable struct AnnData <: AbstractAnnData
             adata.layers = BackedAlignedMapping{Tuple{1 => 1, 2 => 2}}(adata, adata.file, "layers")
         end
 
+        # unstructured
+        adata.uns =
+            haskey(file, "uns") ? read_dict_of_mixed(file["uns"], separate_index=false) :
+            Dict{String, Any}()
+
         return adata
     end
 
@@ -91,6 +98,7 @@ mutable struct AnnData <: AbstractAnnData
         obsp::Union{AbstractDict{<:AbstractString, <:AbstractMatrix{<:Number}}, Nothing}=nothing,
         varp::Union{AbstractDict{<:AbstractString, <:AbstractMatrix{<:Number}}, Nothing}=nothing,
         layers::Union{AbstractDict{<:AbstractString, <:AbstractMatrix{<:Number}}, Nothing}=nothing,
+        uns::Union{AbstractDict{<:AbstractString, <:Any}, Nothing}=nothing,
     )
         m, n = size(X)
         if isnothing(obs)
@@ -122,6 +130,7 @@ mutable struct AnnData <: AbstractAnnData
         adata.varm = StrAlignedMapping{Tuple{1 => 2}}(adata, varm)
         adata.varp = StrAlignedMapping{Tuple{1 => 2, 2 => 2}}(adata, varp)
         adata.layers = StrAlignedMapping{Tuple{1 => 1, 2 => 2}}(adata, layers)
+        adata.uns = isnothing(uns) ? Dict{String, Any}() : uns
 
         return adata
     end
@@ -190,24 +199,25 @@ function Base.write(parent::Union{HDF5.File, HDF5.Group}, adata::AbstractAnnData)
     else
         write_attr(parent, "X", adata.X)
         write_attr(parent, "layers", adata.layers)
-        write_unbacked(parent, adata)
+        write_metadata(parent, adata)
     end
 end
 
 function Base.write(adata)
     if file(adata) === nothing
-        throw("adata is not backed, need somewhere to write to")
+        error("adata is not backed, need somewhere to write to")
    end
-    write_unbacked(file(adata), adata)
+    write_metadata(file(adata), adata)
 end
 
-function write_unbacked(parent::Union{HDF5.File, HDF5.Group}, adata::AbstractAnnData)
+function write_metadata(parent::Union{HDF5.File, HDF5.Group}, adata::AbstractAnnData)
     write_attr(parent, "obs", adata.obs, index=adata.obs_names)
     write_attr(parent, "obsm", adata.obsm, index=adata.obs_names)
     write_attr(parent, "obsp", adata.obsp)
     write_attr(parent, "var", adata.var, index=adata.var_names)
     write_attr(parent, "varm", adata.varm, index=adata.var_names)
     write_attr(parent, "varp", adata.varp)
+    write_attr(parent, "uns", adata.uns)
 end
 # FileIO support
 load(f::File{format"h5ad"}) = readh5ad(filename(f), backed=false) # I suppose this is more consistent with the rest of FileIO?
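
The hunks above add an uns field to AnnData, fill it from the file's "uns" group when one is present, and write it back via write_attr(parent, "uns", adata.uns). A minimal usage sketch, not part of the commit: it assumes X and uns are accepted as keywords of the constructor whose tail is shown above, and the file name is illustrative.

    using Muon, HDF5

    # In-memory AnnData carrying some unstructured metadata.
    adata = AnnData(X=rand(10, 5), uns=Dict{String, Any}("normalization" => "log1p", "k" => 15))

    # Base.write from src/anndata.jl now also serializes the "uns" group.
    h5open("example.h5ad", "w") do h5
        write(h5, adata)
    end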

src/hdf5_io.jl

Lines changed: 82 additions & 42 deletions
@@ -1,13 +1,7 @@
-function read_dataframe(tablegroup::HDF5.Group)
+function read_dataframe(tablegroup::HDF5.Group; separate_index=true, kwargs...)
     columns = read_attribute(tablegroup, "column-order")
 
-    havecat = false
-    if haskey(tablegroup, "__categories")
-        havecat = true
-        catcols = tablegroup["__categories"]
-    end
-
-    if haskey(attributes(tablegroup), "_index")
+    if separate_index && haskey(attributes(tablegroup), "_index")
         indexdsetname = read_attribute(tablegroup, "_index")
         rownames = read(tablegroup[indexdsetname])
     else
@@ -17,26 +11,42 @@ function read_dataframe(tablegroup::HDF5.Group)
     df = DataFrame()
 
     for col in columns
-        column = read(tablegroup, col)
-        if havecat && haskey(catcols, col)
-            cats = read(catcols, col)
-            column = compress(CategoricalArray(map(x -> cats[x + 1], column)))
+        column = read_matrix(tablegroup[col])
+        if sum(size(column) .> 1) > 1
+            @warn "column $col has more than 1 dimension for data frame $(HDF5.name(tablegroup)), skipping"
         end
         df[!, col] = column
     end
 
     return df, rownames
 end
 
-function read_matrix(f::HDF5.Dataset)
+function read_matrix(f::HDF5.Dataset; kwargs...)
     mat = read(f)
-    if ndims(mat) > 1
+    if ndims(f) == 0
+        return mat
+    end
+    if ndims(f) > 1
         mat = PermutedDimsArray(mat, ndims(mat):-1:1) # transpose for h5py compatibility
     end
+    if haskey(attributes(f), "categories")
+        categories = f[read_attribute(f, "categories")]
+        ordered =
+            haskey(attributes(categories), "ordered") &&
+            read_attribute(categories, "ordered") == true
+        cats = read(categories)
+        mat = mat .+ 0x1
+        mat = compress(
+            CategoricalArray{eltype(cats), ndims(mat)}(
+                mat,
+                CategoricalPool{eltype(cats), eltype(mat)}(cats, ordered),
+            ),
+        )
+    end
     return mat
 end
 
-function read_matrix(f::HDF5.Group)
+function read_matrix(f::HDF5.Group; kwargs...)
     enctype = read_attribute(f, "encoding-type")
 
     if enctype == "csc_matrix" || enctype == "csr_matrix"
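
The new branch in read_matrix for HDF5.Dataset rebuilds categorical columns: h5ad stores them as zero-based integer codes plus a reference to a "categories" dataset holding the level labels and an optional "ordered" attribute. A standalone sketch of that decoding step with made-up values, mirroring the constructor calls in the hunk above:

    using CategoricalArrays

    codes = UInt8[0, 2, 1, 0]        # zero-based codes as stored on disk
    cats = ["low", "mid", "high"]    # contents of the referenced "categories" dataset
    ordered = true                   # the "ordered" attribute, if present

    refs = codes .+ 0x1              # CategoricalArrays uses one-based reference codes
    col = compress(
        CategoricalArray{eltype(cats), 1}(
            refs,
            CategoricalPool{eltype(cats), eltype(refs)}(cats, ordered),
        ),
    )
    levels(col)                      # ["low", "mid", "high"]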
@@ -61,30 +71,46 @@ function read_matrix(f::HDF5.Group)
         mat = SparseMatrixCSC(shape..., indptr, indices, data)
         return iscsr ? mat' : mat
     else
-        throw("unknown encoding $enctype")
+        error("unknown encoding $enctype")
     end
 end
 
-function read_dict_of_matrices(f::HDF5.Group)
-    return Dict{String, AbstractArray{<:Number}}(key => read_matrix(f[key]) for key in keys(f))
+function read_dict_of_matrices(f::HDF5.Group; kwargs...)
+    return Dict{String, AbstractArray{<:Number}}(
+        key => read_matrix(f[key]; kwargs...) for key in keys(f)
+    )
 end
 
-read_auto(f::HDF5.Dataset) = (read_matrix(f), nothing)
-function read_auto(f::HDF5.Group)
-    enctype = read_attribute(f, "encoding-type")
-    if enctype == "dataframe"
-        return read_dataframe(f)
-    elseif endswith(enctype, "matrix")
-        return read_matrix(f), nothing
+read_auto(f::HDF5.Dataset; kwargs...) = (read_matrix(f; kwargs...), nothing)
+function read_auto(f::HDF5.Group; kwargs...)
+    if haskey(attributes(f), "encoding-type")
+        enctype = read_attribute(f, "encoding-type")
+        if enctype == "dataframe"
+            return read_dataframe(f; kwargs...)
+        elseif endswith(enctype, "matrix")
+            return read_matrix(f; kwargs), nothing
+        else
+            error("unknown encoding $enctype")
+        end
     else
-        throw("unknown encoding $enctype")
+        return read_dict_of_mixed(f; kwargs...), nothing
     end
 end
 
-function read_dict_of_mixed(f::HDF5.Group)
-    ret = Dict{String, Union{DataFrame, AbstractArray{<:Number}}}()
+function read_dict_of_mixed(f::HDF5.Group; kwargs...)
+    ret = Dict{
+        String,
+        Union{
+            DataFrame,
+            <:AbstractArray{<:Number},
+            <:AbstractArray{<:AbstractString},
+            <:AbstractString,
+            <:Number,
+            Dict,
+        },
+    }()
     for k in keys(f)
-        ret[k] = read_auto(f[k])[1] # assume data frames are properly aligned, so we can discard rownames
+        ret[k] = read_auto(f[k]; kwargs...)[1] # assume data frames are properly aligned, so we can discard rownames
     end
     return ret
 end
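
read_dict_of_mixed is what the AnnData and MuData constructors now call for the "uns" group: subgroups without an encoding-type attribute are recursed into as nested Dicts, encoded data frames and matrices are dispatched by read_auto, and scalars come back as numbers or strings. A sketch of calling it directly on an already-written file; read_dict_of_mixed is an internal, unexported helper (hence the Muon. prefix) and the path is illustrative:

    using Muon, HDF5

    h5open("example.h5ad", "r") do h5
        uns = Muon.read_dict_of_mixed(h5["uns"], separate_index=false)
        # e.g. Dict("normalization" => "log1p", "k" => 15)
    end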
@@ -96,6 +122,15 @@ function write_attr(parent::Union{HDF5.File, HDF5.Group}, name::AbstractString,
     write_impl(parent, name, data; kwargs...)
 end
 
+function write_impl(
+    parent::Union{HDF5.File, HDF5.Group},
+    name::AbstractString,
+    data::Union{<:Number, <:AbstractString};
+    kwargs...,
+)
+    parent[name] = data
+end
+
 function write_impl(
     parent::Union{HDF5.File, HDF5.Group},
     name::AbstractString,
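
The first new write_impl method above covers scalars (a lone number or string stored in .uns) by assigning directly to the file or group, which HDF5.jl turns into a scalar dataset. The equivalent plain HDF5.jl operation, as a sketch with illustrative names:

    using HDF5

    h5open("uns_scalars.h5", "w") do h5
        h5["n_neighbors"] = 15     # scalar integer dataset
        h5["method"] = "umap"      # scalar string dataset
    end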
@@ -113,19 +148,20 @@
 function write_impl(
     parent::Union{HDF5.File, HDF5.Group},
     name::AbstractString,
-    data::CategoricalVector;
+    data::CategoricalArray;
     kwargs...,
 )
     write_impl(parent, name, data.refs .- 0x1; kwargs...)
     write_impl(parent, "__categories/$name", levels(data); kwargs...)
+    attributes(parent["__categories/$name"])["ordered"] = UInt8(isordered(data))
     attributes(parent[name])["categories"] = HDF5.Reference(parent["__categories"], name)
 end
 
 function write_impl(
     parent::Union{HDF5.File, HDF5.Group},
     name::AbstractString,
     data::AbstractDataFrame;
-    index::AbstractVector{<:AbstractString},
+    index::AbstractVector{<:AbstractString}=nothing,
     kwargs...,
 )
     g = create_group(parent, name)
@@ -140,8 +176,18 @@ function write_impl(
 
     idxname = "_index"
     columns = names(data)
-    while idxname ∈ columns
-        idxname = "_" * idxname
+    if !isnothing(index)
+        while idxname ∈ columns
+            idxname = "_" * idxname
+        end
+    else
+        if idxname ∈ columns
+            index = data[!, idxname]
+            select!(data, Not(idxname))
+        else
+            @warn "Data frame $(HDF5.name(parent))/$name does not have an _index column, a row number index will be written"
+            index = 1:nrow(data)
+        end
     end
     g = parent[name]
     write_impl(g, idxname, values(index); kwargs...)
@@ -157,22 +203,16 @@ write_impl(
     extensible::Bool=false,
     compress::UInt8=UInt8(9),
     kwargs...,
-) = write_impl(
-    parent,
-    name,
-    Int8.(data);
-    extensible=extensible,
-    compress=compress,
-    kwargs...,
-)
+) = write_impl(parent, name, Int8.(data); extensible=extensible, compress=compress, kwargs...)
 
 write_impl(parent::Union{HDF5.File, HDF5.Group}, name::AbstractString, data::SubArray; kwargs...) =
     write_impl(parent, name, copy(data); kwargs...)
 
 function write_impl(
     parentgrp::Union{HDF5.File, HDF5.Group},
     name::AbstractString,
-    data::AbstractArray,;
+    data::AbstractArray,
+    ;
     extensible::Bool=false,
     compress::UInt8=0x9,
     kwargs...,

src/mudata.jl

Lines changed: 11 additions & 1 deletion
@@ -16,6 +16,8 @@ mutable struct MuData <: AbstractMuData
     varp::StrAlignedMapping{Tuple{1 => 2, 2 => 2}, MuData}
     varmap::StrAlignedMapping{Tuple{1 => 2}, MuData}
 
+    uns::Dict{<:AbstractString, <:Any}
+
     function MuData(file::Union{HDF5.File, HDF5.Group}, backed=true, checkversion=true)
         if checkversion
             attrs = attributes(file)
@@ -62,6 +64,11 @@ mutable struct MuData <: AbstractMuData
             haskey(file, "varmap") ? read_dict_of_matrices(file["varmap"]) : nothing,
         )
 
+        # unstructured
+        mdata.uns =
+            haskey(file, "uns") ? read_dict_of_mixed(file["uns"], separate_index=false) :
+            Dict{String, Any}()
+
         # Modalities
         mdata.mod = Dict{String, AnnData}()
         mods = HDF5.keys(file["mod"])
@@ -90,6 +97,7 @@ mutable struct MuData <: AbstractMuData
         varp::Union{AbstractDict{<:AbstractString, <:AbstractMatrix{<:Number}}, Nothing}=nothing,
         obsmap::Union{AbstractDict{<:AbstractString, <:AbstractVector{<:Integer}}, Nothing}=nothing,
         varmap::Union{AbstractDict{<:AbstractString, <:AbstractVector{<:Integer}}, Nothing}=nothing,
+        uns::Union{AbstractDict{<:AbstractString, <:Any}, Nothing}=nothing,
         do_update=true,
     )
         mdata = new(nothing, Dict{String, AnnData}())
@@ -108,6 +116,7 @@ mutable struct MuData <: AbstractMuData
         mdata.varp = StrAlignedMapping{Tuple{1 => 2, 2 => 2}}(mdata, varp)
         mdata.obsmap = StrAlignedMapping{Tuple{1 => 1}}(mdata, obsmap)
         mdata.varmap = StrAlignedMapping{Tuple{1 => 2}}(mdata, varmap)
+        mdata.uns = isnothing(uns) ? Dict{String, Any}() : uns
 
         if do_update
             update!(mdata)
@@ -197,11 +206,12 @@ function write_metadata(parent::Union{HDF5.File, HDF5.Group}, mudata::AbstractMuData)
     write_attr(parent, "obs", shrink_attr(mudata, :obs), index=mudata.obs_names)
     write_attr(parent, "obsm", mudata.obsm, index=mudata.obs_names)
     write_attr(parent, "obsp", mudata.obsp)
-    write_attr(parent, "obsmap", mudata.obsmap,)
+    write_attr(parent, "obsmap", mudata.obsmap)
     write_attr(parent, "var", shrink_attr(mudata, :var), index=mudata.var_names)
     write_attr(parent, "varm", mudata.varm, index=mudata.var_names)
     write_attr(parent, "varp", mudata.varp)
     write_attr(parent, "varmap", mudata.varmap)
+    write_attr(parent, "uns", mudata.uns)
 end
 
 # FileIO support
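
With the same field added to MuData, unstructured metadata is carried at the container level too. A hypothetical sketch; it assumes mod is accepted as a constructor keyword alongside the uns keyword added above:

    using Muon

    ad = AnnData(X=rand(10, 5))
    mdata = MuData(
        mod=Dict("rna" => ad),
        uns=Dict{String, Any}("pipeline_version" => "0.1.0"),
    )
    mdata.uns["pipeline_version"]   # "0.1.0"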
