@@ -1,9 +1,14 @@
 from __future__ import annotations
 
 import pickle
-from collections.abc import Iterator
-from typing import Any
+from collections.abc import Iterable, Iterator
+from typing import TYPE_CHECKING, Any
+
+from toolz import first
 
 from distributed.protocol.utils import pack_frames_prelude, unpack_frames
 
+if TYPE_CHECKING:
+    import pandas as pd
+
 
 def pickle_bytelist(obj: object, prelude: bool = True) -> list[pickle.PickleBuffer]:
     """Variant of :func:`serialize_bytelist` that doesn't support compression, locally
@@ -39,3 +44,76 @@ def unpickle_bytestream(b: bytes | bytearray | memoryview) -> Iterator[Any]:
         if remainder.nbytes == 0:
             break
         b = remainder
+
+
+def pickle_dataframe_shard(
+    input_part_id: int,
+    shard: pd.DataFrame,
+) -> list[pickle.PickleBuffer]:
+    """Optimized pickler for pandas DataFrames. Discards all unnecessary metadata
+    (like the columns header).
+
+    Parameters
+    ----------
+    input_part_id:
+        ID of the input partition that the shard was sliced from
+    shard:
+        pandas DataFrame shard to pickle
+    """
+    return pickle_bytelist(
+        (input_part_id, shard.index, *shard._mgr.blocks), prelude=False
+    )
+
+
+def unpickle_and_concat_dataframe_shards(
+    parts: Iterable[Any], meta: pd.DataFrame
+) -> pd.DataFrame:
+    """Optimized unpickler for pandas DataFrames.
+
+    Parameters
+    ----------
+    parts:
+        Output of ``unpickle_bytestream(b)``, where ``b`` is the memory-mapped
+        blob of pickled data, i.e. the concatenation of the outputs of
+        :func:`pickle_dataframe_shard` in arbitrary order
+    meta:
+        Empty DataFrame header, with the expected columns and dtypes
+
+    Returns
+    -------
+    Reconstructed output shard, sorted by input partition ID
+
+    **Roundtrip example**
+
+    .. code-block:: python
+
+        import random
+        import pandas as pd
+
+        df = pd.DataFrame(...)  # Input partition
+        meta = df.iloc[:0]
+        shards = df.iloc[0:10], df.iloc[10:20], ...
+        frames = [pickle_dataframe_shard(i, shard) for i, shard in enumerate(shards)]
+        random.shuffle(frames)  # Simulate the frames arriving in arbitrary order
+        frames = [f for fs in frames for f in fs]  # Flatten
+        blob = bytearray(b"".join(frames))  # Simulate disk roundtrip
+        parts = unpickle_bytestream(blob)
+        df2 = unpickle_and_concat_dataframe_shards(parts, meta)
+
+    """
+    import pandas as pd
+    from pandas.core.internals import BlockManager
+
+    # [(input_part_id, index, *blocks), ...]
+    parts = sorted(parts, key=first)
+    shards = []
+    for _, idx, *blocks in parts:
+        axes = [meta.columns, idx]
+        df = pd.DataFrame._from_mgr(  # type: ignore[attr-defined]
+            BlockManager(blocks, axes, verify_integrity=False), axes
+        )
+        shards.append(df)
+
+    # Actually load memory-mapped buffers into memory and close the file
+    # descriptors
+    return pd.concat(shards, copy=True)
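
A minimal roundtrip sketch for the generic helpers, assuming ``pickle_bytelist``
and ``unpickle_bytestream`` from this module are in scope and that, as the loop
at the end of ``unpickle_bytestream`` suggests, each record written with
``pickle_bytelist(obj, prelude=True)`` carries its own length prelude, so
several records can be concatenated into one buffer and streamed back out. The
sample objects are made up for illustration:

.. code-block:: python

    # Two records, each framed with its own prelude (the default)
    frames = pickle_bytelist({"a": 1}) + pickle_bytelist([2, 3])
    blob = bytearray(b"".join(frames))  # Simulate a disk roundtrip
    assert list(unpickle_bytestream(blob)) == [{"a": 1}, [2, 3]]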
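And a hedged sketch of what :func:`pickle_dataframe_shard` actually records and
how ``unpickle_and_concat_dataframe_shards`` rebuilds a shard from it without
the column header. It mirrors the reconstruction code above; the sample frame is
made up, and the pandas internals used (``_mgr``, ``_from_mgr``,
``BlockManager``) follow the source and may change across pandas versions:

.. code-block:: python

    import pandas as pd
    from pandas.core.internals import BlockManager

    df = pd.DataFrame({"x": [1, 2], "y": [3.0, 4.0]})
    meta = df.iloc[:0]  # header: column names and dtypes, no rows

    # The tuple that pickle_dataframe_shard pickles: index and blocks,
    # but no column metadata
    payload = (0, df.index, *df._mgr.blocks)

    # The inverse step performed per shard by
    # unpickle_and_concat_dataframe_shards, reattaching the header from meta
    _, idx, *blocks = payload
    axes = [meta.columns, idx]
    rebuilt = pd.DataFrame._from_mgr(
        BlockManager(blocks, axes, verify_integrity=False), axes
    )
    assert rebuilt.equals(df)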