From 792d14492e894bba04fb4a48cb4846253db2535c Mon Sep 17 00:00:00 2001 From: Kris Fedorenko <1648280+krisfed@users.noreply.github.com> Date: Thu, 19 Jun 2025 11:20:43 -0400 Subject: [PATCH 1/6] First draft of partial read --- PythonModule/ZarrPy.py | 12 ++++- Zarr.m | 56 ++++++++++++++++++- test/dataFiles/grp_v2/smallArr/.zarray | 1 + test/dataFiles/grp_v2/smallArr/0.0 | Bin 0 -> 96 bytes test/tZarrRead.m | 71 +++++++++++++++++++++++++ zarrread.m | 26 +++++++-- 6 files changed, 157 insertions(+), 9 deletions(-) create mode 100644 test/dataFiles/grp_v2/smallArr/.zarray create mode 100644 test/dataFiles/grp_v2/smallArr/0.0 diff --git a/PythonModule/ZarrPy.py b/PythonModule/ZarrPy.py index 3e51709..0f61b34 100644 --- a/PythonModule/ZarrPy.py +++ b/PythonModule/ZarrPy.py @@ -81,12 +81,16 @@ def writeZarr (kvstore_schema, data): zarr_file[...] = data -def readZarr (kvstore_schema): +def readZarr (kvstore_schema, starts, ends, strides): """ Reads a subset of data from a Zarr file. Parameters: - kvstore_schema (dictionary): Schema for the file store (local or remote) + - starts (list): Array of start indices for each dimension (0-based) + - ends (list): Array of end indices for each dimension (elements + at the end index will not be read) + - strides (list): Array of strides for each dimensions Returns: - numpy.ndarray: The subset of the data read from the Zarr file. @@ -96,6 +100,10 @@ def readZarr (kvstore_schema): 'kvstore': kvstore_schema, }).result() + # Construct the indexing slices + slices = tuple(slice(start, end, stride) for start, end, stride in zip(starts, ends, strides)) + # Read a subset of the data - data = zarr_file[...].read().result() + data = zarr_file[slices].read().result() + #data = zarr_file[...].read().result() return data diff --git a/Zarr.m b/Zarr.m index e8acc5a..ddd56b9 100644 --- a/Zarr.m +++ b/Zarr.m @@ -53,6 +53,38 @@ isZgroup = isfile(fullfile(path, '.zgroup')); end + function newParams = validatePartialReadParams(params, dims, defaultValues) + % Validate the parameters for partial read (Start, Stride, + % Count) + + arguments (Output) + newParams (1,:) int64 + end + + if isempty(params) + newParams = defaultValues; + return + end + + % Allow using a scalar value for indexing into row or column + % datasets + if isscalar(params) && any(dims==1) && numel(dims)==2 + newParams = defaultValues; + % use the provided value for the non-scalar dimension + newParams(dims~=1) = params; + return + end + + if numel(params) ~= numel(dims) + error("MATLAB:Zarr:badPartialReadDimensions",... + "Length of parameters for partial reading " +... + "(Start, Stride, Count) must be the same "+... + "as the number of dataset dimensions.") + end + + newParams = params; + end + function resolvedPath = getFullPath(path) % Given a path, resolves it to a full path. The trailing % directories do not have to exist. @@ -200,7 +232,7 @@ function makeZarrGroups(existingParentPath, newGroupsPath) end - function data = read(obj) + function data = read(obj, start, count, stride) % Function to read the Zarr array % If the Zarr array is local, verify that it is a valid folder @@ -214,7 +246,27 @@ function makeZarrGroups(existingParentPath, newGroupsPath) end end - ndArrayData = py.ZarrPy.readZarr(obj.KVStoreSchema); + % Validate partial read parameters + info = zarrinfo(obj.Path); + numDims = numel(info.shape); + start = Zarr.validatePartialReadParams(start, info.shape,... + ones([1,numDims])); + stride = Zarr.validatePartialReadParams(stride, info.shape,... + ones([1,numDims])); + maxCount = (int64(info.shape') - start + 1)./stride; % has to be a row vector + count = Zarr.validatePartialReadParams(count, info.shape,... + maxCount); + + % Convert partial read parameters to tensorstore-style + % indexing + start = start - 1; % tensorstore is 0-based + % Tensorstore uses end index instead of count + % (it does NOT include element at the end index) + endInds = start + stride.*count; + + % Read the data + ndArrayData = py.ZarrPy.readZarr(obj.KVStoreSchema,... + start, endInds, stride); % Store the datatype obj.Datatype = ZarrDatatype.fromTensorstoreType(ndArrayData.dtype.name); diff --git a/test/dataFiles/grp_v2/smallArr/.zarray b/test/dataFiles/grp_v2/smallArr/.zarray new file mode 100644 index 0000000..b0f27fa --- /dev/null +++ b/test/dataFiles/grp_v2/smallArr/.zarray @@ -0,0 +1 @@ +{"chunks":[3,4],"compressor":null,"dimension_separator":".","dtype":" Date: Thu, 19 Jun 2025 12:04:42 -0400 Subject: [PATCH 2/6] mustBeRow not available before R2024b - for now allow other shapes and reshape them into row vectors --- PythonModule/ZarrPy.py | 8 ++++---- test/tZarrRead.m | 12 ------------ zarrread.m | 6 +++--- 3 files changed, 7 insertions(+), 19 deletions(-) diff --git a/PythonModule/ZarrPy.py b/PythonModule/ZarrPy.py index 0f61b34..6a182b9 100644 --- a/PythonModule/ZarrPy.py +++ b/PythonModule/ZarrPy.py @@ -87,10 +87,10 @@ def readZarr (kvstore_schema, starts, ends, strides): Parameters: - kvstore_schema (dictionary): Schema for the file store (local or remote) - - starts (list): Array of start indices for each dimension (0-based) - - ends (list): Array of end indices for each dimension (elements + - starts (numpy.ndarray): Array of start indices for each dimension (0-based) + - ends (numpy.ndarray): Array of end indices for each dimension (elements at the end index will not be read) - - strides (list): Array of strides for each dimensions + - strides (numpy.ndarray): Array of strides for each dimensions Returns: - numpy.ndarray: The subset of the data read from the Zarr file. @@ -105,5 +105,5 @@ def readZarr (kvstore_schema, starts, ends, strides): # Read a subset of the data data = zarr_file[slices].read().result() - #data = zarr_file[...].read().result() + return data diff --git a/test/tZarrRead.m b/test/tZarrRead.m index 5ecd9c7..b54071a 100644 --- a/test/tZarrRead.m +++ b/test/tZarrRead.m @@ -139,18 +139,6 @@ function invalidPartialReadParams(testcase) @()zarrread(zpath,Count=wrongNumberOfDimensions),... errID); - errID = 'MATLAB:validators:mustBeRow'; - wrongSize = [1; 1]; - testcase.verifyError(... - @()zarrread(zpath,Start=wrongSize),... - errID); - testcase.verifyError(... - @()zarrread(zpath,Stride=wrongSize),... - errID); - testcase.verifyError(... - @()zarrread(zpath,Count=wrongSize),... - errID); - %TODO: negative values, wrong datatypes, out of bounds end diff --git a/zarrread.m b/zarrread.m index 7691a49..c7ca127 100644 --- a/zarrread.m +++ b/zarrread.m @@ -22,9 +22,9 @@ arguments filepath {mustBeTextScalar, mustBeNonzeroLengthText} - options.Start {mustBeInteger, mustBePositive, mustBeRow} = double.empty(1,0); - options.Count {mustBeInteger, mustBePositive, mustBeRow} = double.empty(1,0); - options.Stride {mustBeInteger, mustBePositive, mustBeRow} = double.empty(1,0); + options.Start (1,:) {mustBeInteger, mustBePositive} = []; + options.Count (1,:) {mustBeInteger, mustBePositive} = []; + options.Stride (1,:) {mustBeInteger, mustBePositive} = []; end zarrObj = Zarr(filepath); From 603eb9beede7626575438d4ec5f181d450fe051d Mon Sep 17 00:00:00 2001 From: Kris Fedorenko <1648280+krisfed@users.noreply.github.com> Date: Thu, 19 Jun 2025 12:26:53 -0400 Subject: [PATCH 3/6] Adding test for scalar Start/Stride/Count --- test/dataFiles/grp_v2/vectorData/.zarray | 1 + test/dataFiles/grp_v2/vectorData/0.0 | Bin 0 -> 80 bytes test/tZarrRead.m | 13 +++++++++++++ 3 files changed, 14 insertions(+) create mode 100644 test/dataFiles/grp_v2/vectorData/.zarray create mode 100644 test/dataFiles/grp_v2/vectorData/0.0 diff --git a/test/dataFiles/grp_v2/vectorData/.zarray b/test/dataFiles/grp_v2/vectorData/.zarray new file mode 100644 index 0000000..d3eb18f --- /dev/null +++ b/test/dataFiles/grp_v2/vectorData/.zarray @@ -0,0 +1 @@ +{"chunks":[1,10],"compressor":null,"dimension_separator":".","dtype":" Date: Thu, 19 Jun 2025 12:47:24 -0400 Subject: [PATCH 4/6] Included info about defaults for Start/Stride/Count in M-help --- test/tZarrRead.m | 2 +- zarrread.m | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/test/tZarrRead.m b/test/tZarrRead.m index bb7bef3..262c77b 100644 --- a/test/tZarrRead.m +++ b/test/tZarrRead.m @@ -60,7 +60,7 @@ function verifyPartialArrayData(testcase) % Start, Stride, and Count actData = zarrread(zpath,... - Start=[2, 1], Stride=[1, 2], Count=[1,2]); + Start=[2, 1], Stride=[1, 2], Count=[1, 2]); expData = [2, 8]; testcase.verifyEqual(actData,expData,... 'Failed to verify reading with Start, Stride, and Count.'); diff --git a/zarrread.m b/zarrread.m index c7ca127..f8f93ad 100644 --- a/zarrread.m +++ b/zarrread.m @@ -6,17 +6,22 @@ % % DATA = ZARRREAD(FILEPATH, Start=start) retrieves a subset of the data % from the Zarr array located at FILEPATH. Start is a row vector of -% one-based indices of the first element to be read in each dimension. +% one-based indices of the first element to be read in each dimension. +% Default is to read all the elements starting from the first (Start= +% [1,1,..]. % % DATA = ZARRREAD(FILEPATH, Count=count) retrieves a subset of the data % from the Zarr array located at FILEPATH. Count is a row vector -% of number of elements to be read in each dimension. +% of number of elements to be read in each dimension. Default is to read +% all the available elements (based on dimension size and the specified +% Start and Stride). % % DATA = ZARRREAD(FILEPATH, Stride=stride) retrieves a subset of the data % from the Zarr array located at FILEPATH. Stride is a row vector of % spaces between indices along each dimension. A value of 1 accesses % adjacent elements in the corresponding dimension, a value of 2 % accesses every other element in the corresponding dimension, etc. +% Default is to read all elements without skipping (Stride=[1,1,...]) % Copyright 2025 The MathWorks, Inc. From 9936f798b0caf6a8b7826b5b93ea9594f5a3de0b Mon Sep 17 00:00:00 2001 From: Kris Fedorenko <1648280+krisfed@users.noreply.github.com> Date: Fri, 20 Jun 2025 11:49:40 -0400 Subject: [PATCH 5/6] Error message update --- Zarr.m | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Zarr.m b/Zarr.m index ddd56b9..5018aab 100644 --- a/Zarr.m +++ b/Zarr.m @@ -77,9 +77,9 @@ if numel(params) ~= numel(dims) error("MATLAB:Zarr:badPartialReadDimensions",... - "Length of parameters for partial reading " +... - "(Start, Stride, Count) must be the same "+... - "as the number of dataset dimensions.") + "Number of elements in " +... + "Start/Stride/Count must be the same "+... + "as the number of Zarr array dimensions.") end newParams = params; From 78892035145507ec2b1e0cdf05f2985e2c814678 Mon Sep 17 00:00:00 2001 From: Kris Fedorenko <1648280+krisfed@users.noreply.github.com> Date: Fri, 27 Jun 2025 06:57:33 -0400 Subject: [PATCH 6/6] Address feedback and add doc --- Zarr.m | 30 +++++++++++++++++++----------- doc/documentation.md | 25 +++++++++++++++++++++++-- doc/examples.md | 12 ++++++++++++ zarrread.m | 38 +++++++++++++++++++------------------- 4 files changed, 73 insertions(+), 32 deletions(-) diff --git a/Zarr.m b/Zarr.m index 5018aab..3088425 100644 --- a/Zarr.m +++ b/Zarr.m @@ -53,12 +53,19 @@ isZgroup = isfile(fullfile(path, '.zgroup')); end - function newParams = validatePartialReadParams(params, dims, defaultValues) - % Validate the parameters for partial read (Start, Stride, + function newParams = processPartialReadParams(params, dims,... + defaultValues, paramName) + % Process the parameters for partial read (Start, Stride, % Count) + arguments (Input) + params % Start/Stride/Count parameter to be validated + dims (1,:) double % Zarr array dimensions + defaultValues (1,:) + paramName (1,1) string + end arguments (Output) - newParams (1,:) int64 + newParams (1,:) int64 % must be integers for tensorstore end if isempty(params) @@ -78,8 +85,9 @@ if numel(params) ~= numel(dims) error("MATLAB:Zarr:badPartialReadDimensions",... "Number of elements in " +... - "Start/Stride/Count must be the same "+... - "as the number of Zarr array dimensions.") + "%s must be the same "+... + "as the number of Zarr array dimensions.",... + paramName) end newParams = params; @@ -249,13 +257,13 @@ function makeZarrGroups(existingParentPath, newGroupsPath) % Validate partial read parameters info = zarrinfo(obj.Path); numDims = numel(info.shape); - start = Zarr.validatePartialReadParams(start, info.shape,... - ones([1,numDims])); - stride = Zarr.validatePartialReadParams(stride, info.shape,... - ones([1,numDims])); + start = Zarr.processPartialReadParams(start, info.shape,... + ones([1,numDims]), "Start"); + stride = Zarr.processPartialReadParams(stride, info.shape,... + ones([1,numDims]), "Stride"); maxCount = (int64(info.shape') - start + 1)./stride; % has to be a row vector - count = Zarr.validatePartialReadParams(count, info.shape,... - maxCount); + count = Zarr.processPartialReadParams(count, info.shape,... + maxCount, "Count"); % Convert partial read parameters to tensorstore-style % indexing diff --git a/doc/documentation.md b/doc/documentation.md index 9d6a556..6ae5757 100644 --- a/doc/documentation.md +++ b/doc/documentation.md @@ -68,11 +68,32 @@ do not already exist. If `FILEPATH` exists already, the contents are overwritten Write the MATLAB variable data (specified by DATA) to the path specified by `FILEPATH`. The size of `DATA` must match the size of the Zarr array specified during creation. -## `DATA = zarrread(FILEPATH)` -Retrieve all the data from the Zarr array located at `FILEPATH`. +## `DATA = zarrread(FILEPATH, Name=Value)` +Retrieve data from the Zarr array located at `FILEPATH`. The datatype of DATA is the MATLAB equivalent of the Zarr datatype of the array located at `FILEPATH`. +### Name - Value Pairs +``` +Start - a row vector of one-based indices of the first + elements to be read in each dimension. If you + do not specify start, then the function starts + reading the dataset from the first index along + each dimension. + +Count - a row vector of numbers of elements to + be read in each dimension. If you do not specify + count, then the function reads data until the end + of each dimension. + +Stride - a row vector of differences between indices along + each dimension. A value of 1 accesses adjacent elements + in the corresponding dimension, a value of 2 accesses + every other element in the corresponding dimension, and + so on. If you do not specify stride, then the function + reads data without skipping indices along each dimension. +``` + ## `INFO = zarrinfo(FILEPATH)` Read the metadata associated with a Zarr array or group located at `FILEPATH` and return the information in a structure INFO, whose fields are the names of the metadata keys. If `FILEPATH` is a Zarr array (has a valid `.zarray` file), the value of `node_type` is "array"; if `FILEPATH` is a Zarr group (has a valid `.zgroup` file), the value of the field `node_type` is "group". diff --git a/doc/examples.md b/doc/examples.md index 3233ea1..a573822 100644 --- a/doc/examples.md +++ b/doc/examples.md @@ -6,6 +6,18 @@ filepath = "group1\dset1"; data = zarrread(filepath) ``` +### Read a subset of Zarr array +Read a 2x2 subset of a 3x4 array, starting with the second element in the second dimension, and only reading every other element in the second dimension. +``` MATLAB +filepath = "grp_v2/smallArr"; +d = zarrread(filepath, Start=[1,2], Stride=[1,2], Count=[2,2]) + +d = + + 4 10 + 5 11 +``` + ### Create and write to a Zarr array ``` MATLAB filepath = "myZarrfiles\singleDset"; diff --git a/zarrread.m b/zarrread.m index f8f93ad..fbfd458 100644 --- a/zarrread.m +++ b/zarrread.m @@ -3,25 +3,25 @@ % DATA = ZARRREAD(FILEPATH) retrieves all the data from the Zarr array % located at FILEPATH. The datatype of DATA is the MATLAB equivalent of % the Zarr datatype of the array located at FILEPATH. -% -% DATA = ZARRREAD(FILEPATH, Start=start) retrieves a subset of the data -% from the Zarr array located at FILEPATH. Start is a row vector of -% one-based indices of the first element to be read in each dimension. -% Default is to read all the elements starting from the first (Start= -% [1,1,..]. -% -% DATA = ZARRREAD(FILEPATH, Count=count) retrieves a subset of the data -% from the Zarr array located at FILEPATH. Count is a row vector -% of number of elements to be read in each dimension. Default is to read -% all the available elements (based on dimension size and the specified -% Start and Stride). -% -% DATA = ZARRREAD(FILEPATH, Stride=stride) retrieves a subset of the data -% from the Zarr array located at FILEPATH. Stride is a row vector of -% spaces between indices along each dimension. A value of 1 accesses -% adjacent elements in the corresponding dimension, a value of 2 -% accesses every other element in the corresponding dimension, etc. -% Default is to read all elements without skipping (Stride=[1,1,...]) +% +% DATA = ZARRREAD(..., Start=start) retrieves a subset of the data from +% the Zarr array. Specify start as a row vector of one-based indices of +% the first elements to be read in each dimension. If you do not specify +% start, then the function starts reading the dataset from the first +% index along each dimension. +% +% DATA = ZARRREAD(..., Count=count) retrieves a subset of the data from +% the Zarr array. Specify count as a row vector of numbers of elements to +% be read in each dimension. If you do not specify count, then the +% function reads data until the end of each dimension. +% +% DATA = ZARRREAD(..., Stride=stride) retrieves a subset of the data from +% the Zarr array. Specify stride as a row vector of differences between +% indices along each dimension. A value of 1 accesses adjacent elements +% in the corresponding dimension, a value of 2 accesses every other +% element in the corresponding dimension, and so on. If you do not +% specify stride, then the function reads data without skipping indices +% along each dimension. % Copyright 2025 The MathWorks, Inc.