Skip to content

Commit 9716ccc

Browse files
committed
auto chunk size, tolerate ~ filename
1 parent c00d0de commit 9716ccc

File tree

8 files changed

+76
-37
lines changed

8 files changed

+76
-37
lines changed

auto_chunk_size.m

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
function csize = auto_chunk_size(dims)
% AUTO_CHUNK_SIZE  heuristically choose an HDF5 / NetCDF4 chunk shape.
%
%   csize = auto_chunk_size(dims)
%
% Input
%   dims   vector of positive integers: the dataset shape
% Output
%   csize  vector with the same number of elements as dims: the chunk shape
%
% Behavior
%   * 1-D shapes, datasets smaller than CHUNK_MIN bytes, and datasets smaller
%     than 1e6 bytes are returned unchanged (the whole dataset is one chunk).
%   * Otherwise the axes are halved in round-robin order (rounding up) until
%     the chunk is close enough to a target byte size derived from the
%     dataset size.
%
% Element size is assumed to be 8 bytes (real64) regardless of the real type.
%
% based on https://github.com/h5py/h5py/blob/master/h5py/_hl/filters.py
% refer to https://support.hdfgroup.org/HDF5/Tutor/layout.html
narginchk(1,1)
validateattributes(dims, {'numeric'}, {'vector', 'integer', 'positive'})

CHUNK_BASE = 16000;    % Multiplier by which chunks are adjusted
CHUNK_MIN = 8000;      % lower limit: 8 kbyte
CHUNK_MAX = 1000000;   % upper limit: 1 Mbyte
TYPESIZE = 8;          % bytes, assume real64 for simplicity

csize = dims;
Ndim = length(dims);

% byte arithmetic is done in double so integer-class dims cannot saturate prod()
dset_size = prod(double(dims)) * TYPESIZE;

if Ndim == 1 || dset_size < CHUNK_MIN
  return
end

scale = log10(dset_size / 1e6);
if scale < 0
  % Dataset smaller than 1e6 bytes: keep the whole dataset as a single chunk.
  % Previously this outcome came implicitly from a negative target size;
  % it is now explicit so the target can be safely clamped below.
  return
end

% NOTE(review): the referenced h5py heuristic uses 2^log10(...) (a power),
% whereas this port multiplies by 2. Kept unchanged because the expected
% values asserted in test_hdf5.m correspond to this form -- confirm intent.
target_size = CHUNK_BASE * (2.*scale);

% Clamp the target. Without the lower bound a dataset of exactly 1e6 bytes
% gives target_size == 0: the relative-error test divides by zero and the
% loop shrinks the chunk to a single element.
target_size = min(max(target_size, CHUNK_MIN), CHUNK_MAX);

i = 0;
while true
  % Repeatedly loop over the axes, dividing them by 2.
  % Stop when:
  %  1a. We're smaller than the target chunk size, OR
  %  1b. We're within 50% of the target chunk size, AND
  %   2. The chunk is smaller than the maximum chunk size

  chunk_bytes = prod(double(csize)) * TYPESIZE;

  if chunk_bytes < target_size || ...
     (2*(abs(chunk_bytes - target_size) / target_size) < 1 && ...
      chunk_bytes < CHUNK_MAX)
    break
  end

  if prod(double(csize)) == 1
    % cannot shrink further: a single element is the smallest possible chunk
    break
  end

  % halve the next axis in round-robin order, rounding up so no axis hits 0
  j = mod(i, Ndim) + 1;
  csize(j) = ceil(csize(j) / 2);
  i = i+1;
end

end % function

h5save.m

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@ function h5save(filename, varname, A, sizeA, dtype)
22

33
narginchk(3, 5)
44

5+
% Matlab can't cope with tilde in many cases, especially on Windows
6+
filename = expanduser(filename);
7+
58
if nargin < 4 || isempty(sizeA)
69
if isvector(A)
710
sizeA = length(A);
@@ -18,8 +21,6 @@ function h5save(filename, varname, A, sizeA, dtype)
1821
sizeA = size(A);
1922
end
2023

21-
22-
2324
if h5exists(filename, varname)
2425
exist_file(filename, varname, A, sizeA)
2526
else
@@ -54,26 +55,20 @@ function exist_file(filename, varname, A, sizeA)
5455

5556
function new_file(filename, varname, A, sizeA)
5657

57-
if ~ismatrix(A)
58+
if isvector(A)
59+
h5create(filename, varname, sizeA, 'DataType', class(A))
60+
else
5861
% enable Gzip compression--remember Matlab's dim order is flipped from
5962
% C / Python
60-
switch length(sizeA)
61-
case 4, chunksize = [sizeA(1), sizeA(2), 1, sizeA(4)];
62-
case 3, chunksize = [sizeA(1), sizeA(2), 1];
63-
otherwise, error('h5save:fixme', '%s is bigger than 4 dims', varname)
64-
end
6563
h5create(filename, varname, sizeA, 'DataType', class(A), ...
66-
'Deflate', 1, 'Fletcher32', true, 'Shuffle', true, 'ChunkSize', chunksize)
67-
else
68-
h5create(filename, varname, sizeA, 'DataType', class(A))
64+
'Deflate', 1, 'Fletcher32', true, 'Shuffle', true, 'ChunkSize', auto_chunk_size(sizeA))
6965
end % if
7066

7167
h5write(filename, varname, A)
7268

7369
end % function
7470

7571

76-
7772
% Copyright 2020 Michael Hirsch, Ph.D.
7873

7974
% Licensed under the Apache License, Version 2.0 (the "License");

h5variables.m

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
% get dataset names and groups in an HDF5 file
33
narginchk(1,1)
44

5+
filename = expanduser(filename);
6+
57
% use temporary variable to be R2017b OK
68
finf = h5info(filename);
79
ds = finf.Datasets;

is_file.m

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,10 @@
1212
try
1313
ret = isfile(path);
1414
catch excp
15-
if any(strcmp(excp.identifier, {'MATLAB:UndefinedFunction', 'Octave:undefined-function'}))
16-
ret = exist(path, 'file') == 2;
17-
else
15+
if ~any(strcmp(excp.identifier, {'MATLAB:UndefinedFunction', 'Octave:undefined-function'}))
1816
rethrow(excp)
1917
end
18+
ret = exist(path, 'file') == 2;
2019
end
2120

2221
end % function

ncsave.m

Lines changed: 4 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ function ncsave(filename, varname, A, ncdims, dtype)
22

33
narginchk(3, 5)
44

5+
filename = expanduser(filename);
6+
57
if nargin >= 4 && ~isempty(ncdims)
68
for i = 2:2:length(ncdims)
79
sizeA(i/2) = ncdims{i};
@@ -37,24 +39,13 @@ function ncsave(filename, varname, A, ncdims, dtype)
3739

3840
if any(strcmp(vars, varname))
3941
exist_file(filename, varname, A, sizeA)
40-
% catch excp
41-
% if any(strcmp(excp.identifier, {'MATLAB:imagesci:netcdf:unableToOpenFileforRead', 'MATLAB:imagesci:netcdf:unknownLocation'}))
42-
% % pass Matlab
43-
% elseif any(strcmp(excp.message, {'No such file or directory', 'NetCDF: Variable not found'}))
44-
% % pass Octave
45-
% else
46-
% disp(['failed create ', varname])
47-
% rethrow(excp)
48-
% end
4942
else
5043
new_file(filename, varname, A, sizeA, ncdims)
5144
end
5245

5346
end % function
5447

5548

56-
57-
5849
function exist_file(filename, varname, A, sizeA)
5950
narginchk(4,4)
6051

@@ -76,20 +67,15 @@ function new_file(filename, varname, A, sizeA, ncdims)
7667

7768
if isscalar(A)
7869
nccreate(filename, varname, 'Datatype', class(A), 'Format', 'netcdf4')
79-
elseif isvector(A) || ismatrix(A)
70+
elseif isvector(A)
8071
nccreate(filename, varname, 'Dimensions', ncdims, 'Datatype', class(A), 'Format', 'netcdf4')
8172
else
8273
% enable Gzip compression--remember Matlab's dim order is flipped from
8374
% C / Python
84-
switch length(sizeA)
85-
case 4, chunksize = [sizeA(1), sizeA(2), 1, sizeA(4)];
86-
case 3, chunksize = [sizeA(1), sizeA(2), 1];
87-
otherwise, error('ncsave:fixme', '%s is bigger than 4 dims', varname)
88-
end
8975
% "Datatype" to be Octave case-sensitive keyword compatible
9076
nccreate(filename, varname, 'Dimensions', ncdims, ...
9177
'Datatype', class(A), ...
92-
'DeflateLevel', 1, 'Shuffle', true, 'ChunkSize', chunksize, 'Format', 'netcdf4')
78+
'DeflateLevel', 1, 'Shuffle', true, 'ChunkSize', auto_chunk_size(sizeA), 'Format', 'netcdf4')
9379
end
9480

9581
ncwrite(filename, varname, A)

ncsize.m

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
narginchk(2,2)
1010
validateattributes(varname, {'char'}, {'vector'}, 2)
1111

12+
filename = expanduser(filename);
13+
1214
vinf = ncinfo(filename, varname);
1315
fsize = vinf.Size;
1416

@@ -17,4 +19,4 @@
1719
fsize = 1;
1820
end
1921

20-
end
22+
end

ncvariables.m

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
narginchk(1,1)
44

55
% use temporary variable to be R2017b OK
6-
finf = ncinfo(filename);
6+
finf = ncinfo(expanduser(filename));
77
ds = finf.Variables(:);
88
names = {ds(:).Name};
99

10-
end % function
10+
end % function

test_hdf5.m

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,10 @@
99

1010
basic = fullfile(tempdir, 'basic.h5');
1111
if is_file(basic), delete(basic), end
12-
12+
%% test_auto_chunk_size
13+
assert(isequal(auto_chunk_size([1500,2500,1000,500,100]), [12,20,8,8,2]), '5D chunk fail')
14+
assert(isequal(auto_chunk_size([15,250,100]), [2,32,25]), '3D chunk fail')
15+
assert(isequal(auto_chunk_size([15,250]), [15,250]), '2D small chunk fail')
1316
%% test_write_basic
1417
h5save(basic, '/A0', A0)
1518
h5save(basic, '/A1', A1)

0 commit comments

Comments
 (0)