@@ -147,32 +147,110 @@ LIST must always annotate a 3-level structure:
The outer-most level must be a group annotated with LIST that contains a single field named list. The repetition of this level must be either optional or required and determines whether the list is nullable.
The middle level, named list, must be a repeated group with a single field named element.
The element field encodes the list's element type and repetition. Element repetition must be required or optional.

Legacy encodings are also supported:
1. List<Integer> (nullable list, non-null elements)
   optional group my_list (LIST) {
     repeated int32 element;
   }

2. List<Tuple<String, Integer>> (nullable list, non-null elements)
   optional group my_list (LIST) {
     repeated group element {
       required binary str (STRING);
       required int32 num;
     }
   }

3. List<List<Integer>> (nullable outer list, non-null elements)
   optional group my_list (LIST) {
     repeated group array (LIST) {
       repeated int32 array;
     }
   }
*/
152172static Status get_parquet_type_from_list (const ::parquet::schema::NodePtr& node, TypeDescriptor* type_desc) {
153173 // 1st level.
154- // <list-repetition> group <name> (LIST)
155174 DCHECK (node->is_group ());
156175 DCHECK (node->logical_type ()->is_list ());
157176
158177 auto group_node = std::static_pointer_cast<::parquet::schema::GroupNode>(node);
159- DCHECK (group_node->field_count () == 1 );
178+ if (group_node->field_count () != 1 ) {
179+ return Status::NotSupported (fmt::format (" list 1st level group {} must have exactly one child, but got {}" ,
180+ group_node->name (), group_node->field_count ()));
181+ }
160182
161183 // 2nd level.
162- // repeated group list {
163184 auto list_node = group_node->field (0 );
164- auto list_group_node = std::static_pointer_cast<::parquet::schema::GroupNode>(list_node);
165- DCHECK (list_group_node->field_count () == 1 );
166- DCHECK (list_group_node->is_group ());
167-
168- // 3rd level.
169- // <list-repetition> group <name> (LIST)
170- const auto & child_node = list_group_node->field (0 );
171- TypeDescriptor child_type_desc;
172- RETURN_IF_ERROR (get_parquet_type (child_node, &child_type_desc));
173- *type_desc = TypeDescriptor::create_array_type (child_type_desc);
185+ if (!list_node->is_repeated ()) {
186+ return Status::NotSupported (fmt::format (" list 2nd level node {} is not repeated" , list_node->name ()));
187+ }
174188
175- return Status::OK ();
189+ if (list_node->is_group ()) {
190+ auto list_group_node = std::static_pointer_cast<::parquet::schema::GroupNode>(list_node);
191+ int field_count = list_group_node->field_count ();
192+
193+ if (field_count > 1 ) {
194+ // The inner type of the list should be a struct when there are multiple fields in the repeated group
195+ //
196+ // List<Tuple<String, Integer>> (nullable list, non-null elements)
197+ // optional group my_list (LIST) {
198+ // repeated group element {
199+ // required binary str (STRING);
200+ // required int32 num;
201+ // }
202+ // }
203+ TypeDescriptor child_type_desc;
204+ RETURN_IF_ERROR (try_to_infer_struct_type (list_group_node, &child_type_desc));
205+ *type_desc = TypeDescriptor::create_array_type (child_type_desc);
206+ return Status::OK ();
207+ } else if (field_count == 1 ) {
208+ const auto & child_node = list_group_node->field (0 );
209+ if (list_group_node->logical_type ()->is_list () && child_node->is_repeated ()) {
210+ // The inner type might be a list with two-level encoding
211+ //
212+ // List<List<Integer>> (nullable outer list, non-null elements)
213+ // optional group my_list (LIST) {
214+ // repeated group array (LIST) {
215+ // repeated int32 array;
216+ // }
217+ // }
218+ TypeDescriptor child_type_desc;
219+ RETURN_IF_ERROR (get_parquet_type (list_group_node, &child_type_desc));
220+ *type_desc = TypeDescriptor::create_array_type (child_type_desc);
221+ return Status::OK ();
222+ } else {
223+ // 3-level encoding
224+ //
225+ // List<String> (list non-null, elements nullable)
226+ // required group my_list (LIST) {
227+ // repeated group list {
228+ // optional binary element (STRING);
229+ // }
230+ // }
231+ //
232+ // 3rd level.
233+ TypeDescriptor child_type_desc;
234+ RETURN_IF_ERROR (get_parquet_type (child_node, &child_type_desc));
235+ *type_desc = TypeDescriptor::create_array_type (child_type_desc);
236+ return Status::OK ();
237+ }
238+ } else {
239+ return Status::NotSupported (
240+ fmt::format (" list 2nd level group {} must have at least one child" , list_group_node->name ()));
241+ }
242+ } else {
243+ // 2-level encoding
244+ //
245+ // List<Integer> (nullable list, non-null elements)
246+ // optional group my_list (LIST) {
247+ // repeated int32 element;
248+ // }
249+ TypeDescriptor child_type_desc;
250+ RETURN_IF_ERROR (get_parquet_type (list_node, &child_type_desc));
251+ *type_desc = TypeDescriptor::create_array_type (child_type_desc);
252+ return Status::OK ();
253+ }
176254}
177255
178256/*
@@ -191,7 +269,6 @@ The middle level, named key_value, must be a repeated group with a key field for
The key field encodes the map's key type. This field must have repetition required and must always be present.
The value field encodes the map's value type and repetition. This field can be required, optional, or omitted.
*/
194-
195272static Status get_parquet_type_from_map (const ::parquet::schema::NodePtr& node, TypeDescriptor* type_desc) {
196273 // 1st level.
197274 // <map-repetition> group <name> (MAP) {
0 commit comments