diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py
index 676bc445238..04b11a77782 100644
--- a/python/pyarrow/parquet/core.py
+++ b/python/pyarrow/parquet/core.py
@@ -2347,6 +2350,10 @@ def read_metadata(where, memory_map=False, decryption_properties=None,
         If nothing passed, will be inferred based on path.
         Path will try to be found in the local on-disk filesystem otherwise
         it will be parsed as an URI to determine the filesystem.
+    arrow_extensions_enabled : bool, default True
+        If True, read Parquet logical types as Arrow extension types where
+        possible (e.g. UUID as the canonical `arrow.uuid` extension type).
+        If False, the underlying storage types are used instead.
 
     Returns
     -------
@@ -2382,7 +2389,7 @@
 
 
 def read_schema(where, memory_map=False, decryption_properties=None,
-                filesystem=None):
+                filesystem=None, arrow_extensions_enabled=True):
     """
     Read effective Arrow schema from Parquet file metadata.
 
@@ -2422,11 +2429,15 @@ def read_schema(where, memory_map=False, decryption_properties=None,
     with file_ctx:
         file = ParquetFile(
-            where, memory_map=memory_map,
-            decryption_properties=decryption_properties)
+            where,
+            memory_map=memory_map,
+            decryption_properties=decryption_properties,
+            arrow_extensions_enabled=arrow_extensions_enabled,
+        )
+        if arrow_extensions_enabled:
+            return file.schema_arrow
         return file.schema.to_arrow_schema()
-
 
 
 __all__ = (
     "ColumnChunkMetaData",
     "ColumnSchema",
diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py
index c546bc1532a..571343472ac 100644
--- a/python/pyarrow/tests/parquet/test_data_types.py
+++ b/python/pyarrow/tests/parquet/test_data_types.py
@@ -604,6 +604,25 @@ def test_uuid_extension_type():
                      store_schema=False)
 
 
+def test_read_schema_uuid_extension_type(tmp_path):
+    data = [
+        b'\xe4`\xf9p\x83QGN\xac\x7f\xa4g>K\xa8\xcb',
+        b'\x1et\x14\x95\xee\xd5C\xea\x9b\xd7s\xdc\x91BK\xaf',
+        None,
+    ]
+    table = pa.table([pa.array(data, type=pa.uuid())], names=["ext"])
+
+    file_path = tmp_path / "uuid.parquet"
+    file_path_str = str(file_path)
+    pq.write_table(table, file_path_str, store_schema=False)
+
+    schema_default = pq.read_schema(file_path_str)
+    assert schema_default.field("ext").type == pa.uuid()
+
+    schema_disabled = pq.read_schema(file_path_str, arrow_extensions_enabled=False)
+    assert schema_disabled.field("ext").type == pa.binary(16)
+
+
 def test_undefined_logical_type(parquet_test_datadir):
     test_file = f"{parquet_test_datadir}/unknown-logical-type.parquet"
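A minimal usage sketch of the behaviour this patch adds, for reviewers (illustrative only, not part of the diff; the file path and sample values below are assumptions, the asserted types follow the new test above):

    import pyarrow as pa
    import pyarrow.parquet as pq

    # Write a uuid column without the serialized Arrow schema (store_schema=False),
    # as in the new test, so the reader has to map the Parquet UUID logical type
    # back to an Arrow type on its own.
    table = pa.table({"id": pa.array([b"\x00" * 16, None], type=pa.uuid())})
    path = "/tmp/uuid_example.parquet"  # illustrative path
    pq.write_table(table, path, store_schema=False)

    # read_schema now goes through ParquetFile with the same flag, so the two agree.
    pf = pq.ParquetFile(path)
    assert pq.read_schema(path).field("id").type == pa.uuid()
    assert pf.schema_arrow.field("id").type == pq.read_schema(path).field("id").type

    # Opting out falls back to the underlying storage type.
    disabled = pq.read_schema(path, arrow_extensions_enabled=False)
    assert disabled.field("id").type == pa.binary(16)  # fixed_size_binary(16)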