Skip to content

feat: implement Primitive type Literal #117

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 18 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/iceberg/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ set(ICEBERG_SOURCES
arrow_c_data_internal.cc
catalog/in_memory_catalog.cc
demo.cc
datum.cc
expression/expression.cc
file_reader.cc
json_internal.cc
Expand Down
103 changes: 103 additions & 0 deletions src/iceberg/datum.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#include "iceberg/datum.h"

#include <sstream>

#include "iceberg/exception.h"

namespace iceberg {

// Constructor
PrimitiveLiteral::PrimitiveLiteral(PrimitiveLiteralValue value,
std::shared_ptr<PrimitiveType> type)
: value_(std::move(value)), type_(std::move(type)) {}

// Factory methods
/// Creates a literal holding `value` as a `bool`, tagged with a fresh BooleanType.
PrimitiveLiteral PrimitiveLiteral::Boolean(bool value) {
  PrimitiveLiteralValue payload = value;
  std::shared_ptr<PrimitiveType> boolean_type = std::make_shared<BooleanType>();
  return PrimitiveLiteral(std::move(payload), std::move(boolean_type));
}

/// Creates a literal holding `value` as an `int32_t`, tagged with a fresh IntType.
PrimitiveLiteral PrimitiveLiteral::Integer(int32_t value) {
  PrimitiveLiteralValue payload = value;
  std::shared_ptr<PrimitiveType> int_type = std::make_shared<IntType>();
  return PrimitiveLiteral(std::move(payload), std::move(int_type));
}

/// Creates a literal holding `value` as an `int64_t`, tagged with a fresh LongType.
PrimitiveLiteral PrimitiveLiteral::Long(int64_t value) {
  PrimitiveLiteralValue payload = value;
  std::shared_ptr<PrimitiveType> long_type = std::make_shared<LongType>();
  return PrimitiveLiteral(std::move(payload), std::move(long_type));
}

/// Creates a literal holding `value` as a `float`, tagged with a fresh FloatType.
PrimitiveLiteral PrimitiveLiteral::Float(float value) {
  PrimitiveLiteralValue payload = value;
  std::shared_ptr<PrimitiveType> float_type = std::make_shared<FloatType>();
  return PrimitiveLiteral(std::move(payload), std::move(float_type));
}

/// Creates a literal holding `value` as a `double`, tagged with a fresh DoubleType.
PrimitiveLiteral PrimitiveLiteral::Double(double value) {
  PrimitiveLiteralValue payload = value;
  std::shared_ptr<PrimitiveType> double_type = std::make_shared<DoubleType>();
  return PrimitiveLiteral(std::move(payload), std::move(double_type));
}

/// Creates a literal that takes ownership of `value` as a `std::string`, tagged with
/// a fresh StringType. The argument is moved, not copied, into the literal.
PrimitiveLiteral PrimitiveLiteral::String(std::string value) {
  PrimitiveLiteralValue payload = std::move(value);
  std::shared_ptr<PrimitiveType> string_type = std::make_shared<StringType>();
  return PrimitiveLiteral(std::move(payload), std::move(string_type));
}

/// Creates a literal that takes ownership of `value` as a byte vector, tagged with a
/// fresh BinaryType. The argument is moved, not copied, into the literal.
PrimitiveLiteral PrimitiveLiteral::Binary(std::vector<uint8_t> value) {
  PrimitiveLiteralValue payload = std::move(value);
  std::shared_ptr<PrimitiveType> binary_type = std::make_shared<BinaryType>();
  return PrimitiveLiteral(std::move(payload), std::move(binary_type));
}

/// Placeholder for decoding a literal from its serialized bytes.
///
/// The input is currently ignored; every call yields a NotImplemented error.
Result<PrimitiveLiteral> PrimitiveLiteral::Deserialize(
    [[maybe_unused]] std::span<const uint8_t> data) {
  Result<PrimitiveLiteral> outcome =
      NotImplemented("Deserialization of PrimitiveLiteral is not implemented yet");
  return outcome;
}

/// Placeholder for encoding this literal as bytes.
///
/// The stored value is not inspected; every call yields a NotImplemented error.
Result<std::vector<uint8_t>> PrimitiveLiteral::Serialize() const {
  Result<std::vector<uint8_t>> outcome =
      NotImplemented("Serialization of PrimitiveLiteral is not implemented yet");
  return outcome;
}

// Getters
/// Read-only access to the stored variant; the reference refers to a member of this
/// literal.
const PrimitiveLiteralValue& PrimitiveLiteral::value() const {
  return this->value_;
}

const std::shared_ptr<PrimitiveType>& PrimitiveLiteral::type() const { return type_; }

/// Converts this literal to `target_type`.
///
/// Only the identity cast is supported so far: when `*type_ == *target_type` a copy of
/// the value tagged with `target_type` is returned. Any other combination produces a
/// NotImplemented error naming both types.
///
/// `target_type` (and this literal's own type) must be non-null: both are dereferenced
/// unconditionally.
Result<PrimitiveLiteral> PrimitiveLiteral::CastTo(
    const std::shared_ptr<PrimitiveType>& target_type) const {
  const bool is_identity_cast = (*type_ == *target_type);
  if (!is_identity_cast) {
    return NotImplemented("Cast from {} to {} is not implemented", type_->ToString(),
                          target_type->ToString());
  }

  // Same type: hand back a copy of the current value.
  return PrimitiveLiteral(value_, target_type);
}

/// Three-way comparison of two literals.
///
/// - Literals whose types have different type ids are `unordered`.
/// - Literals of the same type id whose stored variants compare equal are
///   `equivalent`.
/// - Ordering of unequal values of the same type is not implemented yet.
///
/// \throws IcebergError when both literals share a type id but their values differ.
///
/// Both literals must carry a non-null type; `type_` is dereferenced unconditionally.
std::partial_ordering PrimitiveLiteral::operator<=>(const PrimitiveLiteral& other) const {
  // If types are different, comparison is unordered
  if (type_->type_id() != other.type_->type_id()) {
    return std::partial_ordering::unordered;
  }
  if (value_ == other.value_) {
    return std::partial_ordering::equivalent;
  }
  // Reaching this point means the types MATCH and only the values differ, so the
  // message must not claim the types are different.
  throw IcebergError(
      "Not implemented: ordering of unequal primitive literals of the same type");
}

std::string PrimitiveLiteral::ToString() const {
throw NotImplemented("ToString for PrimitiveLiteral is not implemented yet");
}

} // namespace iceberg
108 changes: 108 additions & 0 deletions src/iceberg/datum.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#pragma once

#include <compare>
#include <cstdint>
#include <memory>
#include <span>
#include <string>
#include <variant>
#include <vector>

#include "iceberg/result.h"
#include "iceberg/type.h"

namespace iceberg {

/// \brief Marker type for a value that is below the minimum allowed value of a
/// primitive type.
///
/// This is an empty tag used as one alternative of PrimitiveLiteralValue; it is not an
/// exception class. All instances compare equal to each other.
///
/// NOTE(review): presumably produced when casting to a narrower primitive type and the
/// source value is smaller than the minimum of the destination type -- confirm once
/// casts are implemented.
struct BelowMin {
bool operator==(const BelowMin&) const = default;
std::strong_ordering operator<=>(const BelowMin&) const = default;
};

/// \brief Marker type for a value that is above the maximum allowed value of a
/// primitive type.
///
/// This is an empty tag used as one alternative of PrimitiveLiteralValue; it is not an
/// exception class. All instances compare equal to each other.
///
/// NOTE(review): presumably produced when casting to a narrower primitive type and the
/// source value is larger than the maximum of the destination type -- confirm once
/// casts are implemented.
struct AboveMax {
bool operator==(const AboveMax&) const = default;
std::strong_ordering operator<=>(const AboveMax&) const = default;
};

using PrimitiveLiteralValue =
std::variant<bool, // for boolean
int32_t, // for int, date
int64_t, // for long, timestamp, timestamp_tz, time
float, // for float
double, // for double
std::string, // for string
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

C++ std::string does not require UTF-8, so I wonder whether it is possible to use std::vector<uint8_t> for string as well

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Small String Optimization (SSO) will make PrimitiveLiteralValue 40 bytes; maybe use a unique_ptr to own the string?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a nit, Literal might be used as vector, so more bytes is not critical?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the data part of vector will not be in Literal's layout?
If we are not going to use something like std::vector, I think it's fine with the current design.

Copy link
Member Author

@mapleFU mapleFU Jun 6, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, if unique_ptr is to be added, a tiny wrapper like the one below should be defined. I'm also OK with that approach.

  struct StringLiteralValue {
    StringLiteralValue(std::string value);
    StringLiteralValue(const StringLiteralValue&);
    StringLiteralValue(StringLiteralValue&&) noexcept;
    const std::string& str() const {
      return *value_;
    }
    bool operator==(const StringLiteralValue&) const;
    std::strong_ordering operator<=>(const StringLiteralValue&) const;
  private:
    std::unique_ptr<std::string> value_;
  };

With libc++, sizeof(PrimitiveLiteralValue) == 48; since std::vector is large, it is still 48B after changing to StringLiteralValue.

After change to below:

  template <typename T>
  struct LiteralValueWrapper {
    LiteralValueWrapper(T value);
    LiteralValueWrapper(const LiteralValueWrapper&);
    LiteralValueWrapper(LiteralValueWrapper&&) noexcept = default;
    const std::string& str() const {
      return *value_;
    }
    bool operator==(const LiteralValueWrapper<T>&) const;
    std::strong_ordering operator<=>(const LiteralValueWrapper<T>&) const;
  private:
    std::unique_ptr<T> value_;
  };

  using PrimitiveLiteralValue =
      std::variant<bool,                     // for boolean
                   int32_t,                  // for int, date
                   int64_t,                  // for long, timestamp, timestamp_tz, time
                   float,                    // for float
                   double,                   // for double
                   LiteralValueWrapper<std::string>,              // for string
                   LiteralValueWrapper<std::vector<uint8_t>>,     // for binary, fixed
                   std::array<uint8_t, 16>,  // for uuid and decimal
                   BelowMin, AboveMax>;

sizeof(PrimitiveLiteralValue) goes from 48B to 40B, which eliminates just 8B 😅. I think this is because C++ doesn't do any niche optimization for std::variant... If the largest element in PrimitiveLiteralValue is 8B, sizeof(PrimitiveLiteralValue) can be 32B. So I think std::string is enough here for now?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the analysis, let's keep std::string :)

std::vector<uint8_t>, // for binary, fixed, decimal and uuid
BelowMin, AboveMax>;

/// \brief PrimitiveLiteral is owned literal of a primitive type.
class PrimitiveLiteral {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rename to Datum since you are using datum.h/cc as the filename?
Personally I prefer the following:

using PrimitiveLiteral = std::variant<bool, ...
class Datum {
    PrimitiveLiteral literal_,
    std::shared_ptr<PrimitiveType>
};

public:
/// Construct a literal from a value and the Iceberg primitive type describing it.
///
/// Both arguments are taken by value. The constructor does not verify that the active
/// alternative of `value` matches `type`.
explicit PrimitiveLiteral(PrimitiveLiteralValue value,
std::shared_ptr<PrimitiveType> type);

// Factory methods for primitive types
// Each factory pairs the given value with a newly created instance of the matching
// Iceberg type (BooleanType, IntType, LongType, FloatType, DoubleType, StringType,
// BinaryType respectively).
static PrimitiveLiteral Boolean(bool value);
static PrimitiveLiteral Integer(int32_t value);
static PrimitiveLiteral Long(int64_t value);
static PrimitiveLiteral Float(float value);
static PrimitiveLiteral Double(double value);
static PrimitiveLiteral String(std::string value);
static PrimitiveLiteral Binary(std::vector<uint8_t> value);

/// Create iceberg value from bytes.
///
/// See [this spec](https://iceberg.apache.org/spec/#binary-single-value-serialization)
/// for reference.
///
/// NOTE(review): no target type is passed in, so it is unclear how the primitive type
/// of the result would be determined from the bytes alone -- confirm the intended
/// signature before implementing.
static Result<PrimitiveLiteral> Deserialize(std::span<const uint8_t> data);
/// Serialize iceberg value to bytes.
///
/// See [this spec](https://iceberg.apache.org/spec/#binary-single-value-serialization)
/// for reference.
Result<std::vector<uint8_t>> Serialize() const;

/// Get the value as a variant
const PrimitiveLiteralValue& value() const;

/// Get the Iceberg Type of the literal
const std::shared_ptr<PrimitiveType>& type() const;

/// Cast the literal to a specific type
///
/// \param target_type the type to cast to; it is dereferenced, so it must not be null.
/// \return the cast literal, or an error if the cast is not supported.
Result<PrimitiveLiteral> CastTo(
const std::shared_ptr<PrimitiveType>& target_type) const;

/// Three-way comparison of two literals.
///
/// Literals whose types have different type ids compare as unordered; literals of the
/// same type id with equal values compare as equivalent. Ordering of unequal values of
/// the same type is not implemented yet.
std::partial_ordering operator<=>(const PrimitiveLiteral& other) const;

/// Get a string representation of the literal (not implemented yet).
std::string ToString() const;

private:
/// The stored value; which alternative is active depends on how the literal was built.
PrimitiveLiteralValue value_;
/// The Iceberg primitive type associated with value_.
std::shared_ptr<PrimitiveType> type_;
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know whether std::shared_ptr<PrimitiveType> is too heavy for this type

};

} // namespace iceberg
2 changes: 1 addition & 1 deletion src/iceberg/expression/expression.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

#pragma once

/// \file iceberg/expression.h
/// \file iceberg/expression/expression.h
/// Expression interface for Iceberg table operations.

#include <memory>
Expand Down
Loading