Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions conda-envs/environment-alternative-backends.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ dependencies:
- jaxlib>=0.4.28
- libblas=*=*mkl
- mkl-service
- narwhals>=2.11.0
- numpy>=1.25.0
- numpyro>=0.8.0
- pandas>=0.24.0
Expand Down
1 change: 1 addition & 0 deletions conda-envs/environment-dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ dependencies:
- blas
- cachetools>=4.2.1
- cloudpickle
- narwhals>=2.11.0
- numpy>=1.25.0
- pandas>=0.24.0
- pip
Expand Down
1 change: 1 addition & 0 deletions conda-envs/environment-docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ dependencies:
- arviz>=0.13.0
- cachetools>=4.2.1
- cloudpickle
- narwhals>=2.11.0
- numpy>=1.25.0
- pandas>=0.24.0
- pip
Expand Down
1 change: 1 addition & 0 deletions conda-envs/environment-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ dependencies:
- cachetools>=4.2.1
- cloudpickle
- jax
- narwhals>=2.11.0
- numpy>=1.25.0
- pandas>=0.24.0
- pip
Expand Down
1 change: 1 addition & 0 deletions conda-envs/windows-environment-dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ dependencies:
- blas
- cachetools>=4.2.1
- cloudpickle
- narwhals>=2.11.0
- numpy>=1.25.0
- pandas>=0.24.0
- pip
Expand Down
1 change: 1 addition & 0 deletions conda-envs/windows-environment-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ dependencies:
- cloudpickle
- libpython
- mkl-service>=2.3.0
- narwhals>=2.11.0
- numpy>=1.25.0
- pandas>=0.24.0
- pip
Expand Down
226 changes: 172 additions & 54 deletions pymc/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,23 +11,25 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib
import io
import typing
import urllib.request

from collections.abc import Sequence
from copy import copy
from functools import singledispatch
from typing import Union, cast

import narwhals as nw
import numpy as np
import pandas as pd
import pytensor
import pytensor.tensor as pt
import xarray as xr

from narwhals.typing import IntoFrameT, IntoLazyFrameT, IntoSeriesT
from pytensor.compile import SharedVariable
from pytensor.compile.builders import OpFromGraph
from pytensor.compile.sharedvalue import SharedVariable
from pytensor.graph.basic import Variable
from pytensor.raise_op import Assert
from pytensor.tensor.random.basic import IntegersRV
Expand Down Expand Up @@ -161,65 +163,178 @@ def Minibatch(variable: TensorVariable, *variables: TensorVariable, batch_size:
return mb_tensors if len(variables) else mb_tensors[0]


def _handle_none_dims(
dims: Sequence[str | None] | None, ndim: int
) -> Sequence[str | None] | Sequence[None]:
if dims is None:
return [None] * ndim
else:
return dims


@singledispatch
def determine_coords(
model,
value: pd.DataFrame | pd.Series | xr.DataArray,
dims: Sequence[str] | None = None,
value,
model: "Model",
dims: Sequence[str | None] | None = None,
coords: dict[str, Sequence | np.ndarray] | None = None,
) -> tuple[dict[str, Sequence | np.ndarray], Sequence[str] | Sequence[None]]:
) -> tuple[dict[str, Sequence | np.ndarray], Sequence[str | None] | Sequence[None]]:
"""Determine coordinate values from data or the model (via ``dims``)."""
raise NotImplementedError(
f"Cannot determine coordinates for data of type {type(value)}, please provide `coords` explicitly or "
f"convert the data to a supported type"
)


@determine_coords.register(np.ndarray)
def determine_array_coords(
value: np.ndarray,
model: "Model",
dims: Sequence[str] | None = None,
coords: dict[str, Sequence | np.ndarray] | None = None,
) -> tuple[dict[str, Sequence | np.ndarray], Sequence[str | None] | Sequence[None]]:
if coords is None:
coords = {}

dim_name = None
# If value is a df or a series, we interpret the index as coords:
if hasattr(value, "index"):
if dims is not None:
dim_name = dims[0]
if dim_name is None and value.index.name is not None:
dim_name = value.index.name
if dim_name is not None:
coords[dim_name] = value.index

# If value is a df, we also interpret the columns as coords:
if hasattr(value, "columns"):
if dims is not None:
dim_name = dims[1]
if dim_name is None and value.columns.name is not None:
dim_name = value.columns.name
if dim_name is not None:
coords[dim_name] = value.columns

if isinstance(value, xr.DataArray):
if dims is not None:
for dim in dims:
dim_name = dim
# str is applied because dim entries may be None
coords[str(dim_name)] = cast(xr.DataArray, value[dim]).to_numpy()

if isinstance(value, np.ndarray) and dims is not None:
if len(dims) != value.ndim:
raise ShapeError(
"Invalid data shape. The rank of the dataset must match the length of `dims`.",
actual=value.shape,
expected=value.ndim,
)
for size, dim in zip(value.shape, dims):
coord = model.coords.get(dim, None)
if coord is None and dim is not None:
coords[dim] = range(size)
if dims is None:
return coords, _handle_none_dims(dims, value.ndim)

if len(dims) != value.ndim:
raise ShapeError(
"Invalid data shape. The rank of the dataset must match the length of `dims`.",
actual=value.shape,
expected=len(value.shape),
)

for size, dim in zip(value.shape, dims):
coord = model.coords.get(dim, None)
if coord is None and dim is not None:
coords[dim] = range(size)

return coords, _handle_none_dims(dims, value.ndim)


@determine_coords.register(xr.DataArray)
def determine_xarray_coords(
value: xr.DataArray,
model: "Model",
dims: Sequence[str | None] | None = None,
coords: dict[str, Sequence | np.ndarray] | None = None,
) -> tuple[dict[str, Sequence | np.ndarray], Sequence[str | None] | Sequence[None]]:
if coords is None:
coords = {}

if dims is None:
# TODO: Also determine dim names from the index
new_dims: Sequence[str] | Sequence[None] = [None] * np.ndim(value)
else:
new_dims = dims
return coords, new_dims
return coords, _handle_none_dims(dims, value.ndim)

for dim in dims:
dim_name = dim
# str is applied because dim entries may be None
coords[str(dim_name)] = cast(xr.DataArray, value[dim]).to_numpy()

return coords, _handle_none_dims(dims, value.ndim)


def _dataframe_agnostic_coords(
    value: IntoFrameT | IntoLazyFrameT | nw.DataFrame | nw.LazyFrame,
    model: "Model",
    ndim_in: int = 2,
    dims: Sequence[str | None] | None = None,
    coords: dict[str, Sequence | np.ndarray] | None = None,
) -> tuple[dict[str, Sequence | np.ndarray], Sequence[str | None] | Sequence[None]]:
    """Determine coordinates for tabular data through the Narwhals API.

    ``value`` is wrapped with Narwhals and lazy frames are collected. When the native
    object exposes an index whose name is a string, the index values are added to the
    frame as a column of that name so that they can be looked up through ``dims``.

    Parameters
    ----------
    value : Narwhals-compatible DataFrame or LazyFrame
        The data to inspect. Series are rejected (``allow_series=False``).
    model : Model
        Model whose ``coords`` are consulted when ``dims[0]`` is not a column of the frame.
    ndim_in : int, default 2
        Rank that ``dims`` must have. Also the number of ``None`` placeholders returned
        when ``dims`` is ``None``.
    dims : sequence of (str or None), optional
        Dimension names. ``dims[0]`` names the index dimension and ``dims[1]`` (if
        present) names the column dimension.
    coords : dict, optional
        Mapping that is updated in place and returned. A new dict is created when ``None``.

    Returns
    -------
    coords : dict
        The (possibly updated) coordinate mapping.
    dims : sequence of (str or None)
        ``dims`` unchanged, or ``ndim_in`` ``None`` entries when ``dims`` is ``None``.

    Raises
    ------
    ShapeError
        If ``len(dims)`` differs from ``ndim_in``.
    ValueError
        If ``dims[0]`` is neither a column of the frame nor a key of ``model.coords``.
    """
    # TODO(review): add a simple test for this function.
    if coords is None:
        coords = {}

    value = cast(nw.DataFrame | nw.LazyFrame, nw.from_native(value, allow_series=False))  # type: ignore[type-var]
    if isinstance(value, nw.LazyFrame):
        value = value.collect()

    index = nw.maybe_get_index(value)
    # The index name is used as a keyword argument below, so it must be a string.
    # Unnamed indexes (e.g. the default pandas RangeIndex has ``name is None``) would
    # otherwise raise ``TypeError: keywords must be strings``; they are simply not
    # materialised as a column.
    if index is not None and isinstance(index.name, str):
        value = value.with_columns(**{index.name: index.to_numpy()})

    if dims is None:
        return coords, _handle_none_dims(dims, ndim_in)

    if len(dims) != ndim_in:
        raise ShapeError(
            "Invalid data shape. The rank of the dataset must match the length of `dims`.",
            actual=value.shape,
            expected=len(dims),
        )

    index_dim = dims[0]
    if index_dim is not None:
        if index_dim in value.columns:
            # Prefer values found in the frame (including a materialised index).
            coords[index_dim] = tuple(value.select(nw.col(index_dim)).to_numpy().flatten())
        elif index_dim in model.coords:
            coords[index_dim] = model.coords[index_dim]  # type: ignore[assignment]
        else:
            raise ValueError(
                f"Dimension '{index_dim}' not found in DataFrame columns or model coordinates. Cannot infer "
                "index coordinates."
            )

    if len(dims) > 1:
        column_dim = dims[1]
        if column_dim is not None:
            # Column coordinates are the column names, minus the index-dimension column.
            select_expr = nw.exclude(index_dim) if index_dim is not None else nw.all()
            coords[column_dim] = value.select(select_expr).columns

    return coords, _handle_none_dims(dims, ndim_in)


def _series_agnostic_coords(
    value: IntoSeriesT,
    model: "Model",
    dims: Sequence[str | None] | None = None,
    coords: dict[str, Sequence | np.ndarray] | None = None,
) -> tuple[dict[str, Sequence | np.ndarray], Sequence[str | None] | Sequence[None]]:
    """Determine coordinates for a Narwhals-compatible Series.

    The series is wrapped with Narwhals (``series_only=True``), converted to a frame
    with ``to_frame()`` and delegated to ``_dataframe_agnostic_coords`` with
    ``ndim_in=1``.

    Parameters
    ----------
    value : Narwhals-compatible Series
        The one-dimensional data to inspect.
    model : Model
        Forwarded to ``_dataframe_agnostic_coords``.
    dims : sequence of (str or None), optional
        Forwarded to ``_dataframe_agnostic_coords``.
    coords : dict, optional
        Forwarded to ``_dataframe_agnostic_coords``.

    Returns
    -------
    tuple
        Whatever ``_dataframe_agnostic_coords`` returns: the coordinate mapping and
        the dimension names.
    """
    series = cast(nw.Series, nw.from_native(value, series_only=True))
    frame = cast(nw.DataFrame | nw.LazyFrame, series.to_frame())
    return _dataframe_agnostic_coords(
        frame,
        model=model,
        ndim_in=1,
        dims=dims,
        coords=coords,
    )  # type: ignore[arg-type]


def _register_dataframe_backend(library_name: str) -> None:
    """Register ``determine_coords`` handlers for an optional dataframe library.

    The library is imported by name; if the import fails the function returns without
    registering anything. Otherwise the library's ``Series`` and ``DataFrame`` types are
    registered with the ``determine_coords`` single-dispatch function and routed to the
    Narwhals-based helpers.

    Parameters
    ----------
    library_name : str
        Importable module name exposing ``Series`` and ``DataFrame`` attributes.
    """
    # Keep the ``try`` limited to what depends on the optional library being present,
    # so that an ImportError from unrelated code is not silently swallowed.
    try:
        library = importlib.import_module(library_name)
        series_type = library.Series
        frame_type = library.DataFrame
    except ImportError:
        # Dataframe backends are optional
        return

    # NOTE(review): confirm that ``nw.from_native(..., series_only=True)`` accepts the
    # Series type of every registered backend (lazy-only backends may not).
    @determine_coords.register(series_type)
    def determine_series_coords(
        value: IntoSeriesT,
        model: "Model",
        dims: Sequence[str | None] | None = None,
        coords: dict[str, Sequence | np.ndarray] | None = None,
    ) -> tuple[dict[str, Sequence | np.ndarray], Sequence[str | None] | Sequence[None]]:
        return _series_agnostic_coords(value, model=model, dims=dims, coords=coords)

    @determine_coords.register(frame_type)
    def determine_dataframe_coords(
        value: IntoFrameT,
        model: "Model",
        dims: Sequence[str | None] | None = None,
        coords: dict[str, Sequence | np.ndarray] | None = None,
    ) -> tuple[dict[str, Sequence | np.ndarray], Sequence[str | None] | Sequence[None]]:
        return _dataframe_agnostic_coords(value, model=model, dims=dims, coords=coords)


_register_dataframe_backend("pandas")
_register_dataframe_backend("polars")
_register_dataframe_backend("dask.dataframe")


def Data(
name: str,
value,
value: IntoFrameT | IntoSeriesT | xr.DataArray | np.ndarray,
*,
dims: Sequence[str] | None = None,
coords: dict[str, Sequence | np.ndarray] | None = None,
Expand Down Expand Up @@ -248,11 +363,11 @@ def Data(
----------
name : str
The name for this variable.
value : array_like or pandas.Series, pandas.Dataframe
value : array_like or Narwhals-compatible Series or DataFrame
A value to associate with this variable.
dims : str, tuple of str or tuple of None, optional
Dimension names of the random variables (as opposed to the shapes of these
random variables). Use this when ``value`` is a pandas Series or DataFrame. The
random variables). Use this when ``value`` is a Series or DataFrame. The
``dims`` will then be the name of the Series / DataFrame's columns. See ArviZ
documentation for more information about dimensions and coordinates:
:ref:`arviz:quickstart`.
Expand All @@ -265,6 +380,9 @@ def Data(
infer_dims_and_coords : bool, default=False
If True, the ``Data`` container will try to infer what the coordinates
and dimension names should be if there is an index in ``value``.
model : pymc.Model, optional
Model to which to add the data variable. If not specified, the data variable
will be added to the model on the context stack.
**kwargs : dict, optional
Extra arguments passed to :func:`pytensor.shared`.

Expand Down Expand Up @@ -333,9 +451,9 @@ def Data(
expected=x.ndim,
)

new_dims: Sequence[str] | Sequence[None] | None
new_dims: Sequence[str | None] | Sequence[None] | None
if infer_dims_and_coords:
coords, new_dims = determine_coords(model, value, dims)
coords, new_dims = determine_coords(value, model, dims)
else:
new_dims = dims

Expand Down
35 changes: 29 additions & 6 deletions pymc/pytensorf.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib
import warnings

from collections.abc import Iterable, Sequence
from typing import cast

import narwhals as nw
import numpy as np
import pandas as pd
import pytensor
import pytensor.tensor as pt
import scipy.sparse as sps
Expand Down Expand Up @@ -128,11 +129,33 @@ def convert_data(data) -> np.ndarray | Variable:
return smarttypeX(ret)


@_as_tensor_variable.register(pd.Series)
@_as_tensor_variable.register(pd.DataFrame)
def dataframe_to_tensor_variable(df: pd.DataFrame, *args, **kwargs) -> TensorVariable:
return pt.as_tensor_variable(df.to_numpy(), *args, **kwargs)

# Optional registrations for DataFrame packages
def _register_dataframe_backend(library_name: str) -> None:
    """Register ``_as_tensor_variable`` conversions for an optional dataframe library.

    The library is imported by name; if the import fails the function returns without
    registering anything. Otherwise handlers are registered for the library's ``Series``
    and ``DataFrame`` types. Each handler wraps its input with Narwhals, collects it if
    the result is a ``LazyFrame``, converts it with ``to_numpy()`` and passes the array
    (plus any extra arguments) to ``pt.as_tensor_variable``.

    Parameters
    ----------
    library_name : str
        Importable module name exposing ``Series`` and ``DataFrame`` attributes.
    """
    # Keep the ``try`` limited to what depends on the optional library being present,
    # so that an ImportError from unrelated code is not silently swallowed.
    try:
        library = importlib.import_module(library_name)
        series_type = library.Series
        frame_type = library.DataFrame
    except ImportError:
        # Data backends are optional. Take no action if not installed.
        return

    @_as_tensor_variable.register(series_type)
    def series_to_tensor_variable(s: library.Series, *args, **kwargs) -> TensorVariable:
        s = nw.from_native(s, allow_series=True)
        if isinstance(s, nw.LazyFrame):
            s = s.collect()
        return pt.as_tensor_variable(s.to_numpy(), *args, **kwargs)

    @_as_tensor_variable.register(frame_type)
    def dataframe_to_tensor_variable(df: library.DataFrame, *args, **kwargs) -> TensorVariable:
        df = nw.from_native(df, allow_series=False)
        if isinstance(df, nw.LazyFrame):
            df = df.collect()
        return pt.as_tensor_variable(df.to_numpy(), *args, **kwargs)


_register_dataframe_backend("pandas")
_register_dataframe_backend("polars")
_register_dataframe_backend("dask.dataframe")

_cheap_eval_mode = Mode(linker="py", optimizer="minimum_compile")

Expand Down
Loading