Commit 3eb3798

chore: Vendor parquet2 instead (#15422)
* chore: Vendor parquet2 instead
* Add files
* Fix license
* ignore typos from upstream
* Format toml
* Format arrow
* Remove tests
* Fix test fail under all features

Signed-off-by: Xuanwo <github@xuanwo.io>
1 parent 4e2227c commit 3eb3798


101 files changed: +14799 -33 lines

.typos.toml

Lines changed: 2 additions & 0 deletions
@@ -21,4 +21,6 @@ extend-exclude = [
   "src/meta",
   "src/query",
   "src/binaries",
+  # Forked from upstream
+  "src/common/parquet2"
 ]

Cargo.lock

Lines changed: 33 additions & 20 deletions
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -29,6 +29,7 @@ members = [
   "src/common/storage",
   "src/common/vector",
   "src/common/license",
+  "src/common/parquet2",
   # Query
   "src/query/ast",
   "src/query/async_functions",
@@ -271,7 +272,6 @@ rpath = false
 [patch.crates-io]
 # If there are dependencies that need patching, they can be listed below.
 arrow-format = { git = "https://github.com/everpcpc/arrow-format", rev = "ad8f2dd" }
-parquet2 = { git = "https://github.com/jorgecarleitao/parquet2", rev = "b0e6545" }
 metrics = { git = "https://github.com/datafuse-extras/metrics.git", rev = "fc2ecd1" }
 icelake = { git = "https://github.com/icelake-io/icelake", rev = "54fd72f" }
 micromarshal = { git = "https://github.com/ariesdevil/opensrv", rev = "6c96813" }

src/common/arrow/Cargo.toml

Lines changed: 15 additions & 12 deletions
@@ -1,10 +1,10 @@
 [package]
+description = "Arrow implementation forked from arrow2 and native format implementation forked from strawboat."
 edition = "2021"
 license = "Apache-2.0"
 name = "databend-common-arrow"
 publish = false
 version = "0.1.0"
-description = "Arrow implementation forked from arrow2 and native format implementation forked from strawboat."
 
 [lib]
 doctest = false
@@ -14,9 +14,9 @@ test = true
 default = ["arrow-default", "parquet-default"]
 
 arrow = ["arrow-buffer", "arrow-schema", "arrow-data", "arrow-array"]
+io_flight = ["io_ipc", "arrow-format/flight-data"]
 io_ipc = []
 io_ipc_compression = []
-io_flight = ["io_ipc", "arrow-format/flight-data"]
 
 # base64 + io_ipc because arrow schemas are stored as base64-encoded ipc format.
 io_parquet = ["io_ipc", "base64", "streaming-iterator", "fallible-streaming-iterator"]
@@ -34,21 +34,15 @@ io_parquet_compression = [
 io_parquet_sample_test = ["io_parquet_async"]
 
 # compression backends
-io_parquet_zstd = ["parquet2/zstd"]
+io_parquet_brotli = ["parquet2/brotli"]
 io_parquet_gzip = ["parquet2/gzip"]
-io_parquet_snappy = ["parquet2/snappy"]
 io_parquet_lz4 = ["parquet2/lz4"]
-io_parquet_brotli = ["parquet2/brotli"]
+io_parquet_snappy = ["parquet2/snappy"]
+io_parquet_zstd = ["parquet2/zstd"]
 
 # parquet bloom filter functions
 io_parquet_bloom_filter = ["parquet2/bloom_filter"]
 
-compute_aggregate = []
-compute_cast = ["lexical-core", "compute_take"]
-compute_concatenate = []
-compute_merge_sort = ["itertools", "compute_sort"]
-compute_sort = ["compute_take"]
-compute_take = []
 compute = [
   "compute_aggregate",
   "compute_cast",
@@ -57,6 +51,12 @@ compute = [
   "compute_sort",
   "compute_take",
 ]
+compute_aggregate = []
+compute_cast = ["lexical-core", "compute_take"]
+compute_concatenate = []
+compute_merge_sort = ["itertools", "compute_sort"]
+compute_sort = ["compute_take"]
+compute_take = []
 
 serde_types = ["serde", "serde_derive"]
 simd = []
@@ -104,7 +104,10 @@ dyn-clone = "1"
 either = "1.9"
 foreign_vec = "0.1.0"
 num-traits = "0.2"
-parquet2 = { version = "0.17.0", default_features = false, features = ["serde_types", "async"] }
+parquet2 = { package = "databend-common-parquet2", path = "../parquet2", default_features = false, features = [
+  "serde_types",
+  "async",
+] }
 
 # for decimal i256
 ethnum = { workspace = true }
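
The parquet2 dependency line above is the heart of the vendoring: the `package = "databend-common-parquet2"` key points Cargo at the vendored workspace crate while the dependency keeps its old `parquet2` name, and the vendored manifest below additionally sets `[lib] name = "parquet2"`. Existing call sites therefore compile unchanged. A minimal sketch of why, assuming the `parquet2::error::Error` path from upstream parquet2:

use parquet2::error::Error;

// `parquet2` here resolves to the vendored databend-common-parquet2 crate
// via the `package = ...` rename; no import paths change anywhere in the tree.
fn describe(e: &Error) -> String {
    format!("parquet2 error: {e}")
}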

src/common/parquet2/.gitignore

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+target
+Cargo.lock
+.idea
+venv
+fixtures/

src/common/parquet2/Cargo.toml

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+[package]
+description = "Safe implementation of parquet IO, forked from parquet2."
+edition = "2021"
+license = "Apache-2.0"
+name = "databend-common-parquet2"
+version = "0.1.0"
+
+[lib]
+bench = false
+name = "parquet2"
+
+[dependencies]
+parquet-format-safe = "0.2"
+seq-macro = { version = "0.3", default-features = false }
+streaming-decompression = "0.1"
+
+async-stream = { version = "0.3.3", optional = true }
+futures = { version = "0.3", optional = true }
+
+brotli = { version = "^3.3", optional = true }
+flate2 = { version = "^1.0", optional = true, default-features = false }
+lz4 = { version = "1.24", optional = true }
+serde = { version = "^1.0", optional = true, features = ["derive"] }
+snap = { version = "^1.1", optional = true }
+zstd = { version = "^0.12", optional = true, default-features = false }
+
+xxhash-rust = { version = "0.8", optional = true, features = ["xxh64"] }
+
+[dev-dependencies]
+criterion = "0.4"
+rand = "0.8"
+tokio = { version = "1", features = ["macros", "rt"] }
+
+[features]
+async = ["async-stream", "futures", "parquet-format-safe/async"]
+bloom_filter = ["xxhash-rust"]
+default = ["snappy", "gzip", "lz4", "zstd", "brotli", "bloom_filter"]
+full = ["snappy", "gzip", "lz4", "zstd", "brotli", "bloom_filter", "async"]
+gzip = ["flate2/rust_backend"]
+gzip_zlib_ng = ["flate2/zlib-ng"]
+serde_types = ["serde"]
+snappy = ["snap"]

src/common/parquet2/LICENSE

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+Copyright [2021] [Jorge C Leitao]
+Copyright 2021 Datafuse Labs
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.

src/common/parquet2/src/bloom_filter/hash.rs

Lines changed: 32 additions & 0 deletions

@@ -0,0 +1,32 @@
+// Copyright [2021] [Jorge C Leitao]
+// Copyright 2021 Datafuse Labs
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use xxhash_rust::xxh64::xxh64;
+
+use crate::types::NativeType;
+
+const SEED: u64 = 0;
+
+/// (xxh64) hash of a [`NativeType`].
+#[inline]
+pub fn hash_native<T: NativeType>(value: T) -> u64 {
+    xxh64(value.to_le_bytes().as_ref(), SEED)
+}
+
+/// (xxh64) hash of a sequence of bytes (e.g. ByteArray).
+#[inline]
+pub fn hash_byte<A: AsRef<[u8]>>(value: A) -> u64 {
+    xxh64(value.as_ref(), SEED)
+}
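
A short usage sketch of the two hashers above, assuming the module is exposed as `parquet2::bloom_filter` with the crate's default `bloom_filter` feature (as upstream does): both are seed-0 xxh64 over the value's raw bytes, little-endian for natives.

use parquet2::bloom_filter::hash_byte;
use parquet2::bloom_filter::hash_native;
use xxhash_rust::xxh64::xxh64;

fn main() {
    // hash_native hashes the value's little-endian bytes with seed 0 ...
    assert_eq!(hash_native(42i64), xxh64(&42i64.to_le_bytes(), 0));
    // ... and hash_byte hashes the given bytes directly.
    assert_eq!(hash_byte("abc"), xxh64(b"abc", 0));
}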

src/common/parquet2/src/bloom_filter/mod.rs

Lines changed: 86 additions & 0 deletions

@@ -0,0 +1,86 @@
+// Copyright [2021] [Jorge C Leitao]
+// Copyright 2021 Datafuse Labs
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! API to read and use bloom filters
+mod hash;
+mod read;
+mod split_block;
+
+pub use hash::hash_byte;
+pub use hash::hash_native;
+pub use read::read;
+pub use split_block::insert;
+pub use split_block::is_in_set;
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn basics() {
+        let mut bitset = vec![0; 32];
+
+        // insert
+        for a in 0..10i64 {
+            let hash = hash_native(a);
+            insert(&mut bitset, hash);
+        }
+
+        // bloom filter produced by parquet-mr/spark for a column of i64 (0..=10)
+        // import pyspark.sql  // 3.2.1
+        // spark = pyspark.sql.SparkSession.builder.getOrCreate()
+        // spark.conf.set("parquet.bloom.filter.enabled", True)
+        // spark.conf.set("parquet.bloom.filter.expected.ndv", 10)
+        // spark.conf.set("parquet.bloom.filter.max.bytes", 32)
+        //
+        // data = [(i % 10,) for i in range(100)]
+        // df = spark.createDataFrame(data, ["id"]).repartition(1)
+        //
+        // df.write.parquet("bla.parquet", mode = "overwrite")
+        let expected: &[u8] = &[
+            24, 130, 24, 8, 134, 8, 68, 6, 2, 101, 128, 10, 64, 2, 38, 78, 114, 1, 64, 38, 1, 192,
+            194, 152, 64, 70, 0, 36, 56, 121, 64, 0,
+        ];
+        assert_eq!(bitset, expected);
+
+        // check
+        for a in 0..11i64 {
+            let hash = hash_native(a);
+
+            let valid = is_in_set(&bitset, hash);
+
+            assert_eq!(a < 10, valid);
+        }
+    }
+
+    #[test]
+    fn binary() {
+        let mut bitset = vec![0; 32];
+
+        // insert
+        for a in 0..10i64 {
+            let value = format!("a{}", a);
+            let hash = hash_byte(value);
+            insert(&mut bitset, hash);
+        }
+
+        // bloom filter produced by parquet-mr/spark for a column of i64 f"a{i}" for i in 0..10
+        let expected: &[u8] = &[
+            200, 1, 80, 20, 64, 68, 8, 109, 6, 37, 4, 67, 144, 80, 96, 32, 8, 132, 43, 33, 0, 5,
+            99, 65, 2, 0, 224, 44, 64, 78, 96, 4,
+        ];
+        assert_eq!(bitset, expected);
+    }
+}
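
The `insert`/`is_in_set` pair re-exported above lives in `split_block`, which this excerpt does not show. For orientation, a minimal sketch of the membership check, assuming the fork keeps the standard Parquet split-block bloom filter (SBBF) layout: 32-byte blocks of eight little-endian 32-bit words, with the salt constants from the Parquet specification.

// Salt constants from the Parquet SBBF spec (assumed unchanged in the fork).
const SALT: [u32; 8] = [
    0x47b6137b, 0x44974d91, 0x8824ad5b, 0xa2b7289d, 0x705495c7, 0x2df1424b,
    0x9efc4947, 0x5c6bfb31,
];

// One bit per 32-bit word of a block, derived from the low half of the hash.
fn block_mask(hash: u64) -> [u32; 8] {
    let h = hash as u32;
    let mut mask = [0u32; 8];
    for i in 0..8 {
        mask[i] = 1 << (h.wrapping_mul(SALT[i]) >> 27);
    }
    mask
}

// The high half of the hash selects the block: floor((h >> 32) * n / 2^32).
fn block_index(hash: u64, num_blocks: usize) -> usize {
    (((hash >> 32) * num_blocks as u64) >> 32) as usize
}

fn is_in_set(bitset: &[u8], hash: u64) -> bool {
    let base = block_index(hash, bitset.len() / 32) * 32;
    block_mask(hash).iter().enumerate().all(|(i, mask)| {
        let bytes = &bitset[base + 4 * i..base + 4 * i + 4];
        let word = u32::from_le_bytes(bytes.try_into().unwrap());
        word & mask == *mask
    })
}

Insertion is the same block/mask computation with the mask OR-ed into the block instead of tested, which is why the 32-byte `bitset` in the tests above is exactly one block.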
