diff --git a/data-types/fixed-length-ascii/README.md b/data-types/fixed-length-ascii/README.md new file mode 100644 index 0000000..befc01a --- /dev/null +++ b/data-types/fixed-length-ascii/README.md @@ -0,0 +1,33 @@ +# Fixed-length ASCII data type + +Defines a data type for fixed-length ASCII strings. + +## Permitted fill values + +The value of the `fill_value` metadata key must be a string. + +## Example + +For example, the array metadata below specifies that the array contains fixed-length ASCII strings: + +```json +{ + "data_type": "fixed-length-ascii", + "fill_value": "", + "configuration": { + "length_bits": 24 + } +} +``` + +## Notes + +TBD + +## Change log + +No changes yet. + +## Current maintainers + +* [zarr-python core development team](https://github.com/orgs/zarr-developers/teams/python-core-devs) \ No newline at end of file diff --git a/data-types/fixed-length-ascii/schema.json b/data-types/fixed-length-ascii/schema.json new file mode 100644 index 0000000..53fb75f --- /dev/null +++ b/data-types/fixed-length-ascii/schema.json @@ -0,0 +1,26 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "oneOf": [ + { + "type": "object", + "properties": { + "name": { + "const": "fixed-length-ascii" + }, + "configuration": { + "type": "object", + "properties": { + "length_bits": { + "type": "integer" + } + }, + "required": ["length_bits"], + "additionalProperties": false + } + }, + "required": ["name", "configuration"], + "additionalProperties": false + }, + { "const": "fixed-length-ascii" } + ] +} \ No newline at end of file diff --git a/data-types/fixed-length-bytes/README.md b/data-types/fixed-length-bytes/README.md new file mode 100644 index 0000000..107dcc8 --- /dev/null +++ b/data-types/fixed-length-bytes/README.md @@ -0,0 +1,33 @@ +# Fixed-length bytes data type + +Defines a data type for fixed-length byte strings. + +## Permitted fill values + +The value of the `fill_value` metadata key must be a string. 
+ +## Example + +For example, the array metadata below specifies that the array contains fixed-length byte strings: + +```json +{ + "data_type": "fixed-length-bytes", + "fill_value": "", + "configuration": { + "length_bits": 24 + } +} +``` + +## Notes + +TBD + +## Change log + +No changes yet. + +## Current maintainers + +* [zarr-python core development team](https://github.com/orgs/zarr-developers/teams/python-core-devs) \ No newline at end of file diff --git a/data-types/fixed-length-bytes/schema.json b/data-types/fixed-length-bytes/schema.json new file mode 100644 index 0000000..22861cd --- /dev/null +++ b/data-types/fixed-length-bytes/schema.json @@ -0,0 +1,26 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "oneOf": [ + { + "type": "object", + "properties": { + "name": { + "const": "fixed-length-bytes" + }, + "configuration": { + "type": "object", + "properties": { + "length_bits": { + "type": "integer" + } + }, + "required": ["length_bits"], + "additionalProperties": false + } + }, + "required": ["name", "configuration"], + "additionalProperties": false + }, + { "const": "fixed-length-bytes" } + ] +} \ No newline at end of file diff --git a/data-types/fixed-length-utf32/README.md b/data-types/fixed-length-utf32/README.md new file mode 100644 index 0000000..81ab3ed --- /dev/null +++ b/data-types/fixed-length-utf32/README.md @@ -0,0 +1,54 @@ +# `fixed_length_utf32` data type + +This document defines a data type for fixed-length, null-terminated Unicode strings encoded using [UTF-32](https://www.unicode.org/versions/Unicode5.0.0/appC.pdf#M9.19040.HeadingAppendix.C2.Encoding.Forms.in.ISOIEC.10646). UTF-32, also known as UCS4, is an encoding of Unicode strings that allocates 4 bytes to each Unicode code point. + +"Fixed length" as used here means that the `fixed_length_utf32` data type is parameterized by an integral length, which sets a fixed length for every scalar belonging to that data type. 
+ +"Null-terminated" as used here means that, for an integral length `L`, a `fixed_length_utf32` data type parameterized with `L` can represent a string shorter than `L` by adding null bytes to the end of that string until it has length `L`. + +### Name + +The name of this data type is the string `"fixed_length_utf32"`. + +### Configuration + +This data type requires a configuration. The configuration for this data type is a JSON object with the following fields: + +| field name | type | required | notes | +|------------|----------|---|---| +| `"length_bytes"` | integer | yes | The number MUST represent an integer divisible by 4 in the inclusive range `[0, 2147483644]` | + +> Note: the maximum length of 2147483644 was chosen to match the semantics of the [NumPy `"U"` data type](https://numpy.org/devdocs/reference/arrays.scalars.html#numpy.str_), which as of this writing has a maximum length in code points of 536870911, i.e. 2147483644 / 4. + +> Note: given a particular `fixed_length_utf32` data type, the length of an array element in Unicode code points is the value of the `length_bytes` field divided by 4. + +### Examples + +```json +{ + "name": "fixed_length_utf32", + "configuration": { + "length_bytes": 4 + } +} +``` + +## Fill value representation + +The value of the `fill_value` metadata key must be a string. When encoded in UTF-32, the fill value MUST have a length in bytes less than or equal to the value of the `length_bytes` specified in the `configuration` of this data type. + +## Codec compatibility + +This data type is compatible with any codec that supports arrays with fixed-size data types. + +## Notes + +This data type is designed for NumPy compatibility. UTF-32 is not a good fit for many applications that need to model arrays of strings, as real string datasets are often composed of variable-length strings. A variable-length string data type should be preferred in these cases. + +## Change log + +No changes yet. 
+ +## Current maintainers + +* [zarr-python core development team](https://github.com/orgs/zarr-developers/teams/python-core-devs) \ No newline at end of file diff --git a/data-types/fixed-length-utf32/schema.json b/data-types/fixed-length-utf32/schema.json new file mode 100644 index 0000000..3135246 --- /dev/null +++ b/data-types/fixed-length-utf32/schema.json @@ -0,0 +1,26 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "oneOf": [ + { + "type": "object", + "properties": { + "name": { + "const": "fixed_length_utf32" + }, + "configuration": { + "type": "object", + "properties": { + "length_bytes": { + "type": "integer" + } + }, + "required": ["length_bytes"], + "additionalProperties": false + } + }, + "required": ["name", "configuration"], + "additionalProperties": false + }, + { "const": "fixed_length_utf32" } + ] +} \ No newline at end of file