Skip to content

Commit d7985dc

Browse files
mergify[bot] and wyb authored
[Enhancement] Support parquet list legacy encodings in files() (backport #64160) (#64223)
Signed-off-by: wyb <[email protected]>
Co-authored-by: wyb <[email protected]>
1 parent 7c0bd61 commit d7985dc

File tree

4 files changed

+153
-16
lines changed

4 files changed

+153
-16
lines changed

be/src/exec/parquet_schema_builder.cpp

Lines changed: 93 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -147,32 +147,110 @@ LIST must always annotate a 3-level structure:
147147
The outer-most level must be a group annotated with LIST that contains a single field named list. The repetition of this level must be either optional or required and determines whether the list is nullable.
148148
The middle level, named list, must be a repeated group with a single field named element.
149149
The element field encodes the list's element type and repetition. Element repetition must be required or optional.
150-
*/
151150
151+
Support legacy encodings:
152+
1. List<Integer> (nullable list, non-null elements)
153+
optional group my_list (LIST) {
154+
repeated int32 element;
155+
}
156+
157+
2. List<Tuple<String, Integer>> (nullable list, non-null elements)
158+
optional group my_list (LIST) {
159+
repeated group element {
160+
required binary str (STRING);
161+
required int32 num;
162+
}
163+
}
164+
165+
3. List<List<Integer>> (nullable outer list, non-null elements)
166+
optional group my_list (LIST) {
167+
repeated group array (LIST) {
168+
repeated int32 array;
169+
}
170+
}
171+
*/
152172
static Status get_parquet_type_from_list(const ::parquet::schema::NodePtr& node, TypeDescriptor* type_desc) {
153173
// 1st level.
154-
// <list-repetition> group <name> (LIST)
155174
DCHECK(node->is_group());
156175
DCHECK(node->logical_type()->is_list());
157176

158177
auto group_node = std::static_pointer_cast<::parquet::schema::GroupNode>(node);
159-
DCHECK(group_node->field_count() == 1);
178+
if (group_node->field_count() != 1) {
179+
return Status::NotSupported(fmt::format("list 1st level group {} must have exactly one child, but got {}",
180+
group_node->name(), group_node->field_count()));
181+
}
160182

161183
// 2nd level.
162-
// repeated group list {
163184
auto list_node = group_node->field(0);
164-
auto list_group_node = std::static_pointer_cast<::parquet::schema::GroupNode>(list_node);
165-
DCHECK(list_group_node->field_count() == 1);
166-
DCHECK(list_group_node->is_group());
167-
168-
// 3rd level.
169-
// <list-repetition> group <name> (LIST)
170-
const auto& child_node = list_group_node->field(0);
171-
TypeDescriptor child_type_desc;
172-
RETURN_IF_ERROR(get_parquet_type(child_node, &child_type_desc));
173-
*type_desc = TypeDescriptor::create_array_type(child_type_desc);
185+
if (!list_node->is_repeated()) {
186+
return Status::NotSupported(fmt::format("list 2nd level node {} is not repeated", list_node->name()));
187+
}
174188

175-
return Status::OK();
189+
if (list_node->is_group()) {
190+
auto list_group_node = std::static_pointer_cast<::parquet::schema::GroupNode>(list_node);
191+
int field_count = list_group_node->field_count();
192+
193+
if (field_count > 1) {
194+
// The inner type of the list should be a struct when there are multiple fields in the repeated group
195+
//
196+
// List<Tuple<String, Integer>> (nullable list, non-null elements)
197+
// optional group my_list (LIST) {
198+
// repeated group element {
199+
// required binary str (STRING);
200+
// required int32 num;
201+
// }
202+
// }
203+
TypeDescriptor child_type_desc;
204+
RETURN_IF_ERROR(try_to_infer_struct_type(list_group_node, &child_type_desc));
205+
*type_desc = TypeDescriptor::create_array_type(child_type_desc);
206+
return Status::OK();
207+
} else if (field_count == 1) {
208+
const auto& child_node = list_group_node->field(0);
209+
if (list_group_node->logical_type()->is_list() && child_node->is_repeated()) {
210+
// The inner type might be a list with two-level encoding
211+
//
212+
// List<List<Integer>> (nullable outer list, non-null elements)
213+
// optional group my_list (LIST) {
214+
// repeated group array (LIST) {
215+
// repeated int32 array;
216+
// }
217+
// }
218+
TypeDescriptor child_type_desc;
219+
RETURN_IF_ERROR(get_parquet_type(list_group_node, &child_type_desc));
220+
*type_desc = TypeDescriptor::create_array_type(child_type_desc);
221+
return Status::OK();
222+
} else {
223+
// 3-level encoding
224+
//
225+
// List<String> (list non-null, elements nullable)
226+
// required group my_list (LIST) {
227+
// repeated group list {
228+
// optional binary element (STRING);
229+
// }
230+
// }
231+
//
232+
// 3rd level.
233+
TypeDescriptor child_type_desc;
234+
RETURN_IF_ERROR(get_parquet_type(child_node, &child_type_desc));
235+
*type_desc = TypeDescriptor::create_array_type(child_type_desc);
236+
return Status::OK();
237+
}
238+
} else {
239+
return Status::NotSupported(
240+
fmt::format("list 2nd level group {} must have at least one child", list_group_node->name()));
241+
}
242+
} else {
243+
// 2-level encoding
244+
//
245+
// List<Integer> (nullable list, non-null elements)
246+
// optional group my_list (LIST) {
247+
// repeated int32 element;
248+
// }
249+
TypeDescriptor child_type_desc;
250+
RETURN_IF_ERROR(get_parquet_type(list_node, &child_type_desc));
251+
*type_desc = TypeDescriptor::create_array_type(child_type_desc);
252+
return Status::OK();
253+
}
176254
}
177255

178256
/*
@@ -191,7 +269,6 @@ The middle level, named key_value, must be a repeated group with a key field for
191269
The key field encodes the map's key type. This field must have repetition required and must always be present.
192270
The value field encodes the map's value type and repetition. This field can be required, optional, or omitted.
193271
*/
194-
195272
static Status get_parquet_type_from_map(const ::parquet::schema::NodePtr& node, TypeDescriptor* type_desc) {
196273
// 1st level.
197274
// <map-repetition> group <name> (MAP) {
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
-- name: test_parquet_list_legacy_encoding
2+
3+
create database db_${uuid0};
4+
use db_${uuid0};
5+
6+
shell: ossutil64 mkdir oss://${oss_bucket}/test_files/parquet_format/${uuid0} >/dev/null || echo "exit 0" >/dev/null
7+
8+
shell: ossutil64 cp --force ./sql/test_files/parquet_format/list_legacy_encoding.parquet oss://${oss_bucket}/test_files/parquet_format/${uuid0}/ | grep -Pv "(average|elapsed)"
9+
-- result:
10+
0
11+
12+
Succeed: Total num: 1, size: 2,796. OK num: 1(upload 1 files).
13+
-- !result
14+
15+
16+
desc files(
17+
"path" = "oss://${oss_bucket}/test_files/parquet_format/${uuid0}/*",
18+
"format" = "parquet",
19+
"aws.s3.access_key" = "${oss_ak}",
20+
"aws.s3.secret_key" = "${oss_sk}",
21+
"aws.s3.endpoint" = "${oss_endpoint}");
22+
-- result:
23+
field1 struct<list1 array<varchar(1048576)>> YES
24+
-- !result
25+
26+
select count(*) from files(
27+
"path" = "oss://${oss_bucket}/test_files/parquet_format/${uuid0}/*",
28+
"format" = "parquet",
29+
"aws.s3.access_key" = "${oss_ak}",
30+
"aws.s3.secret_key" = "${oss_sk}",
31+
"aws.s3.endpoint" = "${oss_endpoint}");
32+
-- result:
33+
100
34+
-- !result
35+
36+
37+
shell: ossutil64 rm -rf oss://${oss_bucket}/test_files/parquet_format/${uuid0}/ > /dev/null
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
-- name: test_parquet_list_legacy_encoding
2+
3+
create database db_${uuid0};
4+
use db_${uuid0};
5+
6+
shell: ossutil64 mkdir oss://${oss_bucket}/test_files/parquet_format/${uuid0} >/dev/null || echo "exit 0" >/dev/null
7+
shell: ossutil64 cp --force ./sql/test_files/parquet_format/list_legacy_encoding.parquet oss://${oss_bucket}/test_files/parquet_format/${uuid0}/ | grep -Pv "(average|elapsed)"
8+
9+
desc files(
10+
"path" = "oss://${oss_bucket}/test_files/parquet_format/${uuid0}/*",
11+
"format" = "parquet",
12+
"aws.s3.access_key" = "${oss_ak}",
13+
"aws.s3.secret_key" = "${oss_sk}",
14+
"aws.s3.endpoint" = "${oss_endpoint}");
15+
16+
select count(*) from files(
17+
"path" = "oss://${oss_bucket}/test_files/parquet_format/${uuid0}/*",
18+
"format" = "parquet",
19+
"aws.s3.access_key" = "${oss_ak}",
20+
"aws.s3.secret_key" = "${oss_sk}",
21+
"aws.s3.endpoint" = "${oss_endpoint}");
22+
23+
shell: ossutil64 rm -rf oss://${oss_bucket}/test_files/parquet_format/${uuid0}/ > /dev/null
2.73 KB
Binary file not shown.

0 commit comments

Comments
 (0)