Skip to content

Commit 03bdb2a

Browse files
GH-40278: [C++] Support casting string to duration in CSV converter (#46035)
### Rationale for this change

Currently, the Arrow C++ CSV converter does not support parsing strings into `duration` types. This limits CSV ingestion capabilities when handling datasets with time-based intervals represented as numeric strings (e.g., `1000`, `2000000`). This PR adds support for parsing such strings into Arrow's `DurationType`.

Note: Human-readable duration formats such as `1s`, `2m`, or `3h` are not supported in this PR. Support for those formats may be considered in a future enhancement.

### What changes are included in this PR?

- Added `DurationValueDecoder` using `StringConverter<DurationType>`
- Registered support in both standard and dictionary converters
- Added unit tests covering:
  - Basic parsing across all time units (s, ms, µs, ns)
  - Null and custom null values
  - Whitespace handling and error cases

### Are these changes tested?

Yes, conversion logic is fully covered by new tests in `converter_test.cc`.

### Are there any user-facing changes?

Yes, users can now convert duration strings in CSV files to Arrow `duration` arrays by specifying the appropriate schema type.

* GitHub Issue: #40278

Lead-authored-by: Zihan Qi <zihan.qi@tum.de>
Co-authored-by: Antoine Pitrou <pitrou@free.fr>
Signed-off-by: Antoine Pitrou <antoine@python.org>
1 parent d62a521 commit 03bdb2a

File tree

4 files changed

+97
-1
lines changed

4 files changed

+97
-1
lines changed

cpp/src/arrow/csv/converter.cc

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -470,6 +470,32 @@ struct MultipleParsersTimestampValueDecoder : public ValueDecoder {
470470
std::vector<const TimestampParser*> parsers_;
471471
};
472472

473+
//
474+
// Value decoder for durations
475+
//
476+
struct DurationValueDecoder : public ValueDecoder {
477+
using value_type = int64_t;
478+
479+
explicit DurationValueDecoder(const std::shared_ptr<DataType>& type,
480+
const ConvertOptions& options)
481+
: ValueDecoder(type, options),
482+
concrete_type_(checked_cast<const DurationType&>(*type)),
483+
string_converter_() {}
484+
485+
Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* out) {
486+
TrimWhiteSpace(&data, &size);
487+
if (ARROW_PREDICT_FALSE(!string_converter_.Convert(
488+
concrete_type_, reinterpret_cast<const char*>(data), size, out))) {
489+
return GenericConversionError(type_, data, size);
490+
}
491+
return Status::OK();
492+
}
493+
494+
protected:
495+
const DurationType& concrete_type_;
496+
arrow::internal::StringConverter<DurationType> string_converter_;
497+
};
498+
473499
/////////////////////////////////////////////////////////////////////////
474500
// Concrete Converter hierarchy
475501

@@ -702,6 +728,7 @@ Result<std::shared_ptr<Converter>> Converter::Make(const std::shared_ptr<DataTyp
702728
NUMERIC_CONVERTER_CASE(Type::DATE64, Date64Type)
703729
NUMERIC_CONVERTER_CASE(Type::TIME32, Time32Type)
704730
NUMERIC_CONVERTER_CASE(Type::TIME64, Time64Type)
731+
NUMERIC_CONVERTER_CASE(Type::DURATION, DurationType)
705732
CONVERTER_CASE(Type::BOOL, (PrimitiveConverter<BooleanType, BooleanValueDecoder>))
706733
CONVERTER_CASE(Type::BINARY,
707734
(PrimitiveConverter<BinaryType, BinaryValueDecoder<false>>))
@@ -785,6 +812,7 @@ Result<std::shared_ptr<DictionaryConverter>> DictionaryConverter::Make(
785812
CONVERTER_CASE(Type::UINT64, UInt64Type, NumericValueDecoder<UInt64Type>)
786813
CONVERTER_CASE(Type::FLOAT, FloatType, NumericValueDecoder<FloatType>)
787814
CONVERTER_CASE(Type::DOUBLE, DoubleType, NumericValueDecoder<DoubleType>)
815+
CONVERTER_CASE(Type::DURATION, DurationType, DurationValueDecoder)
788816
REAL_CONVERTER_CASE(Type::DECIMAL, Decimal128Type, DecimalValueDecoder)
789817
CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryType,
790818
FixedSizeBinaryValueDecoder)

cpp/src/arrow/csv/converter_test.cc

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -660,6 +660,68 @@ TEST(TimestampConversion, UserDefinedParsersWithZone) {
660660
AssertConversionError(type, {"01/02/1970,1970-01-03T00:00:00+0000\n"}, {0}, options);
661661
}
662662

663+
// Plain integer cells convert to duration values for every time unit.
// Each CSV row has two columns; expected values are listed per column.
TEST(DurationConversion, Basics) {
  const auto seconds = duration(TimeUnit::SECOND);
  AssertConversion<DurationType, int64_t>(
      seconds, {"1,120\n", "10800,345600\n", "-1,-120\n", "-10800,-345600\n"},
      {{1, 10800, -1, -10800}, {120, 345600, -120, -345600}});

  const auto millis = duration(TimeUnit::MILLI);
  AssertConversion<DurationType, int64_t>(
      millis, {"1000,120000\n", "10800000,345600000\n", "500,0\n", "-1000,-120000\n"},
      {{1000, 10800000, 500, -1000}, {120000, 345600000, 0, -120000}});

  const auto micros = duration(TimeUnit::MICRO);
  AssertConversion<DurationType, int64_t>(
      micros, {"1000000,500000\n", "120000000,10800000000\n", "-500000,-1000000\n"},
      {{1000000, 120000000, -500000}, {500000, 10800000000, -1000000}});

  const auto nanos = duration(TimeUnit::NANO);
  AssertConversion<DurationType, int64_t>(
      nanos,
      {"1000000000,500000000\n", "120000000000,10800000000000\n", "7000,9\n",
       "-7000,-9\n"},
      {{1000000000, 120000000000, 7000, -7000}, {500000000, 10800000000000, 9, -9}});
}
686+
687+
// Cells spelled "N/A" or left empty become nulls under the default options;
// the value slot of a null entry is expected to hold 0.
TEST(DurationConversion, Nulls) {
  const auto millis = duration(TimeUnit::MILLI);
  AssertConversion<DurationType, int64_t>(millis, {"1000,N/A\n", ",10800000\n"},
                                          {{1000, 0}, {0, 10800000}},
                                          {{true, false}, {false, true}});
}
693+
694+
// User-supplied null spellings are honoured for duration columns.
TEST(DurationConversion, CustomNulls) {
  auto convert_options = ConvertOptions::Defaults();
  convert_options.null_values = {"xxx", "zzz"};

  const auto seconds = duration(TimeUnit::SECOND);

  // An unquoted custom null spelling yields a null entry.
  AssertConversion<DurationType, int64_t>(seconds, {"1,xxx\n"}, {{1}, {0}},
                                          {{true}, {false}}, convert_options);

  // With quoted nulls disabled, a quoted "xxx" in column 1 is a conversion error.
  convert_options.quoted_strings_can_be_null = false;
  AssertConversionError(seconds, {"\"1\",\"xxx\"\n"}, {1}, convert_options);

  // Unquoted null spellings keep working under the same options.
  AssertConversion<DurationType, int64_t>(seconds, {"1,xxx\n", "zzz,120\n"},
                                          {{1, 0}, {0, 120}},
                                          {{true, false}, {false, true}},
                                          convert_options);
}
709+
710+
// Blanks surrounding the digits of a cell do not affect the converted value.
TEST(DurationConversion, Whitespace) {
  const auto millis = duration(TimeUnit::MILLI);
  AssertConversion<DurationType, int64_t>(
      millis, {" 1000 , 120000 \n", " 500 , 10800000 \n"},
      {{1000, 500}, {120000, 10800000}});
}
716+
717+
// Cells that are not plain integers must be reported as errors in column 0.
TEST(DurationConversion, Invalid) {
  const auto seconds = duration(TimeUnit::SECOND);

  // Not a number at all.
  AssertConversionError(seconds, {"xyz\n"}, {0});
  // Digits followed by trailing garbage.
  AssertConversionError(seconds, {"123abc\n"}, {0});
  // Floats not allowed.
  AssertConversionError(seconds, {"1.5\n"}, {0});
  // Bad format: unit letter in front of the digits.
  AssertConversionError(seconds, {"s1\n"}, {0});
}
724+
663725
Decimal128 Dec128(std::string_view value) {
664726
Decimal128 dec;
665727
int32_t scale = 0;

docs/source/cpp/csv.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,7 @@ can be chosen from the following list:
265265
* Binary and Large Binary
266266
* String and Large String (with optional UTF8 input validation)
267267
* Fixed-Size Binary
268+
* Duration (numeric strings matching the schema unit, e.g., "60000" for duration[ms])
268269
* Dictionary with index type Int32 and value type one of the following:
269270
Binary, String, LargeBinary, LargeString, Int32, UInt32, Int64, UInt64,
270271
Float32, Float64, Decimal128

docs/source/python/csv.rst

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ The features currently offered are the following:
3030
* fetching column names from the first row in the CSV file
3131
* column-wise type inference and conversion to one of ``null``, ``int64``,
3232
``float64``, ``date32``, ``time32[s]``, ``timestamp[s]``, ``timestamp[ns]``,
33-
``string`` or ``binary`` data
33+
``duration`` (from numeric strings), ``string`` or ``binary`` data
3434
* opportunistic dictionary encoding of ``string`` and ``binary`` columns
3535
(disabled by default)
3636
* detecting various spellings of null values such as ``NaN`` or ``#N/A``
@@ -125,6 +125,11 @@ a :class:`ConvertOptions` instance and pass it to :func:`read_csv`::
125125
}
126126
))
127127

128+
.. note::
129+
To assign a column as ``duration``, the CSV values must be integer strings
130+
that match the expected unit (e.g. ``60000`` for 60 seconds when
131+
using ``duration[ms]``).
132+
128133
Available convert options are:
129134

130135
.. autosummary::

0 commit comments

Comments
 (0)