Skip to content

Commit 670ee45

Browse files
authored
branch-3.0: [feat](test)add some be ut for orc/parquet reader (#49418) (#50414)
bp #49418
1 parent c28a1eb commit 670ee45

6 files changed

+1391
-6
lines changed

be/src/vec/exec/format/orc/vorc_reader.cpp

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -430,6 +430,8 @@ bool OrcReader::_check_acid_schema(const orc::Type& type) {
430430
return false;
431431
}
432432
}
433+
} else {
434+
return false;
433435
}
434436
return true;
435437
}
@@ -1425,15 +1427,9 @@ Status OrcReader::_fill_doris_data_column(const std::string& col_name,
14251427
case TypeIndex::Decimal128V3:
14261428
return _decode_decimal_column<Decimal128V3, is_filter>(col_name, data_column, data_type,
14271429
cvb, num_values);
1428-
case TypeIndex::Date:
1429-
return _decode_time_column<VecDateTimeValue, Int64, orc::LongVectorBatch, is_filter>(
1430-
col_name, data_column, cvb, num_values);
14311430
case TypeIndex::DateV2:
14321431
return _decode_time_column<DateV2Value<DateV2ValueType>, UInt32, orc::LongVectorBatch,
14331432
is_filter>(col_name, data_column, cvb, num_values);
1434-
case TypeIndex::DateTime:
1435-
return _decode_time_column<VecDateTimeValue, Int64, orc::TimestampVectorBatch, is_filter>(
1436-
col_name, data_column, cvb, num_values);
14371433
case TypeIndex::DateTimeV2:
14381434
return _decode_time_column<DateV2Value<DateTimeV2ValueType>, UInt64,
14391435
orc::TimestampVectorBatch, is_filter>(col_name, data_column, cvb,
Lines changed: 237 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,237 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#include <gtest/gtest.h>
19+
20+
#include <memory>
21+
22+
#include "orc/ColumnPrinter.hh"
23+
#include "vec/columns/column_array.h"
24+
#include "vec/columns/column_nullable.h"
25+
#include "vec/columns/column_string.h"
26+
#include "vec/columns/column_struct.h"
27+
#include "vec/exec/format/orc/vorc_reader.h"
28+
29+
namespace doris {
30+
namespace vectorized {
31+
class OrcReaderConvertDictTest : public ::testing::Test {
32+
protected:
33+
void SetUp() override {}
34+
35+
void TearDown() override {}
36+
};
37+
38+
std::unique_ptr<orc::EncodedStringVectorBatch> create_encoded_string_batch(
39+
const std::vector<std::string>& dict_values) {
40+
auto batch =
41+
std::make_unique<orc::EncodedStringVectorBatch>(1024 * 1024, *orc::getDefaultPool());
42+
batch->dictionary = std::make_unique<orc::StringDictionary>(*orc::getDefaultPool());
43+
44+
// Fill dictionary data
45+
int sz = 0;
46+
for (const auto& value : dict_values) {
47+
sz += value.length();
48+
}
49+
50+
batch->dictionary->dictionaryBlob.resize(sz + 1024);
51+
batch->dictionary->dictionaryOffset.resize(dict_values.size() + 10);
52+
int x = 0;
53+
for (const auto& value : dict_values) {
54+
batch->dictionary->dictionaryOffset[x + 1] =
55+
batch->dictionary->dictionaryOffset[x] + value.size();
56+
int y = batch->dictionary->dictionaryOffset[x];
57+
for (auto ch : value) {
58+
batch->dictionary->dictionaryBlob[y] = ch;
59+
y++;
60+
}
61+
x++;
62+
}
63+
64+
return batch;
65+
}
66+
67+
// STRING dictionary, no null map: each row must come back as the dictionary entry its
// index refers to.
TEST_F(OrcReaderConvertDictTest, ConvertDictColumnToStringColumnBasic) {
    // Dictionary entries, the per-row indices into them, and the rows expected back.
    std::vector<std::string> dictionary_entries = {"hello", "world", "doris", "test"};
    const std::vector<int32_t> row_indices = {0, 1, 2, 3, 1, 0};
    const std::vector<std::string> expected_rows = {"hello", "world", "doris",
                                                    "test",  "world", "hello"};

    auto encoded_batch = create_encoded_string_batch(dictionary_entries);

    // Column of dictionary indices, one per row.
    auto index_column = ColumnInt32::create();
    for (auto idx : row_indices) {
        index_column->insert(idx);
    }

    auto orc_string_type = createPrimitiveType(orc::TypeKind::STRING);

    TFileScanRangeParams scan_params;
    TFileRangeDesc range_desc;
    auto orc_reader = OrcReader::create_unique(scan_params, range_desc, "", nullptr, true);

    // Run the conversion under test (no null map).
    auto converted = orc_reader->_convert_dict_column_to_string_column(
            index_column.get(), nullptr, encoded_batch.get(), orc_string_type.get());

    // Check the row count first, then every row against the expectation table.
    auto* strings = assert_cast<const ColumnString*>(converted.get());
    ASSERT_EQ(strings->size(), expected_rows.size());
    for (size_t row = 0; row < expected_rows.size(); ++row) {
        ASSERT_EQ(strings->get_data_at(row).to_string(), expected_rows[row]);
    }
}
101+
102+
// STRING dictionary with a null map: rows flagged as null are expected to come back as
// empty strings, all other rows as their dictionary entry.
TEST_F(OrcReaderConvertDictTest, ConvertDictColumnToStringColumnWithNulls) {
    std::vector<std::string> dictionary_entries = {"hello", "world", "doris"};
    const std::vector<int32_t> row_indices = {0, 1, 2, 1, 0};
    // The 2nd and 5th rows are marked null.
    NullMap null_flags = {0, 1, 0, 0, 1};
    // Null rows are expected as "".
    const std::vector<std::string> expected_rows = {"hello", "", "doris", "world", ""};

    auto encoded_batch = create_encoded_string_batch(dictionary_entries);

    // Column of dictionary indices, one per row.
    auto index_column = ColumnInt32::create();
    for (auto idx : row_indices) {
        index_column->insert(idx);
    }

    auto orc_string_type = createPrimitiveType(orc::TypeKind::STRING);

    TFileScanRangeParams scan_params;
    TFileRangeDesc range_desc;
    auto orc_reader = OrcReader::create_unique(scan_params, range_desc, "", nullptr, true);

    // Run the conversion under test, this time passing the null map.
    auto converted = orc_reader->_convert_dict_column_to_string_column(
            index_column.get(), &null_flags, encoded_batch.get(), orc_string_type.get());

    // Check the row count first, then every row against the expectation table.
    auto* strings = assert_cast<const ColumnString*>(converted.get());
    ASSERT_EQ(strings->size(), expected_rows.size());
    for (size_t row = 0; row < expected_rows.size(); ++row) {
        ASSERT_EQ(strings->get_data_at(row).to_string(), expected_rows[row]);
    }
}
137+
138+
// CHAR dictionary: entries carry right-padding spaces, and the converted rows are expected
// without the trailing spaces.
TEST_F(OrcReaderConvertDictTest, ConvertDictColumnToStringColumnChar) {
    // CHAR values are right-padded with spaces in the dictionary.
    std::vector<std::string> dictionary_entries = {"hello ", "world ", "test "};
    const std::vector<int32_t> row_indices = {0, 1, 2, 1};
    // Expected rows have the trailing spaces removed.
    const std::vector<std::string> expected_rows = {"hello", "world", "test", "world"};

    auto encoded_batch = create_encoded_string_batch(dictionary_entries);

    // Column of dictionary indices, one per row.
    auto index_column = ColumnInt32::create();
    for (auto idx : row_indices) {
        index_column->insert(idx);
    }

    // The ORC type is CHAR rather than STRING for this case.
    auto orc_char_type = createPrimitiveType(orc::TypeKind::CHAR);

    TFileScanRangeParams scan_params;
    TFileRangeDesc range_desc;
    auto orc_reader = OrcReader::create_unique(scan_params, range_desc, "", nullptr, true);

    // Run the conversion under test (no null map).
    auto converted = orc_reader->_convert_dict_column_to_string_column(
            index_column.get(), nullptr, encoded_batch.get(), orc_char_type.get());

    // Check the row count first, then every row against the expectation table.
    auto* strings = assert_cast<const ColumnString*>(converted.get());
    ASSERT_EQ(strings->size(), expected_rows.size());
    for (size_t row = 0; row < expected_rows.size(); ++row) {
        ASSERT_EQ(strings->get_data_at(row).to_string(), expected_rows[row]);
    }
}
168+
169+
// Dictionary holding a single empty string: every row refers to it and must come back
// as "".
TEST_F(OrcReaderConvertDictTest, ConvertDictColumnToStringColumnEmpty) {
    std::vector<std::string> dictionary_entries = {""};
    const std::vector<int32_t> row_indices = {0, 0, 0};
    const std::vector<std::string> expected_rows = {"", "", ""};

    auto encoded_batch = create_encoded_string_batch(dictionary_entries);

    // Column of dictionary indices, one per row.
    auto index_column = ColumnInt32::create();
    for (auto idx : row_indices) {
        index_column->insert(idx);
    }

    auto orc_string_type = createPrimitiveType(orc::TypeKind::STRING);

    TFileScanRangeParams scan_params;
    TFileRangeDesc range_desc;
    auto orc_reader = OrcReader::create_unique(scan_params, range_desc, "", nullptr, true);

    // Run the conversion under test (no null map).
    auto converted = orc_reader->_convert_dict_column_to_string_column(
            index_column.get(), nullptr, encoded_batch.get(), orc_string_type.get());

    // Check the row count first, then every row against the expectation table.
    auto* strings = assert_cast<const ColumnString*>(converted.get());
    ASSERT_EQ(strings->size(), expected_rows.size());
    for (size_t row = 0; row < expected_rows.size(); ++row) {
        ASSERT_EQ(strings->get_data_at(row).to_string(), expected_rows[row]);
    }
}
197+
198+
// Dictionary with entries of different lengths (including an empty one) combined with a
// partial null map: null rows are expected as "", the rest as their dictionary entry.
TEST_F(OrcReaderConvertDictTest, ConvertDictColumnToStringColumnMixed) {
    std::vector<std::string> dictionary_entries = {"", "a", "ab", "abc", "abcd"};
    const std::vector<int32_t> row_indices = {0, 1, 2, 3, 4, 2, 1, 0};
    // Rows 2 and 5 (0-based) are marked null.
    NullMap null_flags = {0, 0, 1, 0, 0, 1, 0, 0};
    // Null rows are expected as ""; rows 0 and 7 are "" because they refer to the empty entry.
    const std::vector<std::string> expected_rows = {"", "a", "", "abc", "abcd", "", "a", ""};

    auto encoded_batch = create_encoded_string_batch(dictionary_entries);

    // Column of dictionary indices, one per row.
    auto index_column = ColumnInt32::create();
    for (auto idx : row_indices) {
        index_column->insert(idx);
    }

    auto orc_string_type = createPrimitiveType(orc::TypeKind::STRING);

    TFileScanRangeParams scan_params;
    TFileRangeDesc range_desc;
    auto orc_reader = OrcReader::create_unique(scan_params, range_desc, "", nullptr, true);

    // Run the conversion under test, passing the null map.
    auto converted = orc_reader->_convert_dict_column_to_string_column(
            index_column.get(), &null_flags, encoded_batch.get(), orc_string_type.get());

    // Check the row count first, then every row against the expectation table.
    auto* strings = assert_cast<const ColumnString*>(converted.get());
    ASSERT_EQ(strings->size(), expected_rows.size());
    for (size_t row = 0; row < expected_rows.size(); ++row) {
        ASSERT_EQ(strings->get_data_at(row).to_string(), expected_rows[row]);
    }
}
234+
235+
} // namespace vectorized
236+
237+
} // namespace doris

0 commit comments

Comments
 (0)