pymc-devs · jessegrabowski · Sep 26, 2025 · Nov 17, 2025 · Nov 17, 2025 · Nov 17, 2025
diff --git a/conda-envs/environment-alternative-backends.yml b/conda-envs/environment-alternative-backends.yml
@@ -18,6 +18,7 @@ dependencies:
 - jaxlib>=0.4.28
 - libblas=*=*mkl
 - mkl-service
+- narwhals>=2.11.0
 - numpy>=1.25.0
 - numpyro>=0.8.0
 - pandas>=0.24.0

diff --git a/conda-envs/environment-dev.yml b/conda-envs/environment-dev.yml
@@ -9,6 +9,7 @@ dependencies:
 - blas
 - cachetools>=4.2.1
 - cloudpickle
+- narwhals>=2.11.0
 - numpy>=1.25.0
 - pandas>=0.24.0
 - pip

diff --git a/conda-envs/environment-docs.yml b/conda-envs/environment-docs.yml
@@ -8,6 +8,7 @@ dependencies:
 - arviz>=0.13.0
 - cachetools>=4.2.1
 - cloudpickle
+- narwhals>=2.11.0
 - numpy>=1.25.0
 - pandas>=0.24.0
 - pip

diff --git a/conda-envs/environment-test.yml b/conda-envs/environment-test.yml
@@ -10,6 +10,7 @@ dependencies:
 - cachetools>=4.2.1
 - cloudpickle
 - jax
+- narwhals>=2.11.0
 - numpy>=1.25.0
 - pandas>=0.24.0
 - pip

diff --git a/conda-envs/windows-environment-dev.yml b/conda-envs/windows-environment-dev.yml
@@ -9,6 +9,7 @@ dependencies:
 - blas
 - cachetools>=4.2.1
 - cloudpickle
+- narwhals>=2.11.0
 - numpy>=1.25.0
 - pandas>=0.24.0
 - pip

diff --git a/conda-envs/windows-environment-test.yml b/conda-envs/windows-environment-test.yml
@@ -11,6 +11,7 @@ dependencies:
 - cloudpickle
 - libpython
 - mkl-service>=2.3.0
+- narwhals>=2.11.0
 - numpy>=1.25.0
 - pandas>=0.24.0
 - pip

diff --git a/pymc/data.py b/pymc/data.py
@@ -11,23 +11,25 @@
 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #   See the License for the specific language governing permissions and
 #   limitations under the License.
-
+import importlib
 import io
 import typing
 import urllib.request
 
 from collections.abc import Sequence
 from copy import copy
+from functools import singledispatch
 from typing import Union, cast
 
+import narwhals as nw
 import numpy as np
-import pandas as pd
 import pytensor
 import pytensor.tensor as pt
 import xarray as xr
 
+from narwhals.typing import IntoFrameT, IntoLazyFrameT, IntoSeriesT
+from pytensor.compile import SharedVariable
 from pytensor.compile.builders import OpFromGraph
-from pytensor.compile.sharedvalue import SharedVariable
 from pytensor.graph.basic import Variable
 from pytensor.raise_op import Assert
 from pytensor.tensor.random.basic import IntegersRV
@@ -161,65 +163,178 @@ def Minibatch(variable: TensorVariable, *variables: TensorVariable, batch_size:
     return mb_tensors if len(variables) else mb_tensors[0]
 
 
+def _handle_none_dims(
+    dims: Sequence[str | None] | None, ndim: int
+) -> Sequence[str | None] | Sequence[None]:
+    """Return ``dims`` unchanged, or ``ndim`` ``None`` placeholders when ``dims`` is ``None``."""
+    if dims is not None:
+        return dims
+    return [None] * ndim
+
+
+@singledispatch  # handlers are selected by the runtime type of the first argument, ``value``
 def determine_coords(
-    model,
-    value: pd.DataFrame | pd.Series | xr.DataArray,
-    dims: Sequence[str] | None = None,
+    value,
+    model: "Model",  # NOTE(review): order changed from (model, value, ...) - confirm positional callers
+    dims: Sequence[str | None] | None = None,
     coords: dict[str, Sequence | np.ndarray] | None = None,
-) -> tuple[dict[str, Sequence | np.ndarray], Sequence[str] | Sequence[None]]:
+) -> tuple[dict[str, Sequence | np.ndarray], Sequence[str | None] | Sequence[None]]:
     """Determine coordinate values from data or the model (via ``dims``)."""
+    raise NotImplementedError(  # fallback: no handler is registered for ``type(value)``
+        f"Cannot determine coordinates for data of type {type(value)}, please provide `coords` explicitly or "
+        f"convert the data to a supported type"
+    )
+
+
+@determine_coords.register(np.ndarray)
+def determine_array_coords(
+    value: np.ndarray,
+    model: "Model",
+    dims: Sequence[str | None] | None = None,
+    coords: dict[str, Sequence | np.ndarray] | None = None,
+) -> tuple[dict[str, Sequence | np.ndarray], Sequence[str | None] | Sequence[None]]:
+    """Assign ``range(size)`` coords to named dims that have no coords yet in ``model.coords``."""
     if coords is None:
         coords = {}
 
-    dim_name = None
-    # If value is a df or a series, we interpret the index as coords:
-    if hasattr(value, "index"):
-        if dims is not None:
-            dim_name = dims[0]
-        if dim_name is None and value.index.name is not None:
-            dim_name = value.index.name
-        if dim_name is not None:
-            coords[dim_name] = value.index
-
-    # If value is a df, we also interpret the columns as coords:
-    if hasattr(value, "columns"):
-        if dims is not None:
-            dim_name = dims[1]
-        if dim_name is None and value.columns.name is not None:
-            dim_name = value.columns.name
-        if dim_name is not None:
-            coords[dim_name] = value.columns
-
-    if isinstance(value, xr.DataArray):
-        if dims is not None:
-            for dim in dims:
-                dim_name = dim
-                # str is applied because dim entries may be None
-                coords[str(dim_name)] = cast(xr.DataArray, value[dim]).to_numpy()
-
-    if isinstance(value, np.ndarray) and dims is not None:
-        if len(dims) != value.ndim:
-            raise ShapeError(
-                "Invalid data shape. The rank of the dataset must match the length of `dims`.",
-                actual=value.shape,
-                expected=value.ndim,
-            )
-        for size, dim in zip(value.shape, dims):
-            coord = model.coords.get(dim, None)
-            if coord is None and dim is not None:
-                coords[dim] = range(size)
+    if dims is None:
+        return coords, _handle_none_dims(dims, value.ndim)
+
+    if len(dims) != value.ndim:
+        raise ShapeError(
+            "Invalid data shape. The rank of the dataset must match the length of `dims`.",
+            actual=value.shape,
+            # Report the length of ``dims``: that is what the data's rank is checked against.
+            expected=len(dims),
+        )
+    for size, dim in zip(value.shape, dims):
+        # Dims that already have coords in the model are left alone; ``None`` dims get none.
+        if dim is not None and model.coords.get(dim) is None:
+            coords[dim] = range(size)
+    return coords, dims
+
+
+@determine_coords.register(xr.DataArray)
+def determine_xarray_coords(
+    value: xr.DataArray,
+    model: "Model",
+    dims: Sequence[str | None] | None = None,
+    coords: dict[str, Sequence | np.ndarray] | None = None,
+) -> tuple[dict[str, Sequence | np.ndarray], Sequence[str | None] | Sequence[None]]:
+    """For each entry of ``dims``, store ``value[dim]`` as a NumPy array under that name."""
+    if coords is None:
+        coords = {}
 
     if dims is None:
-        # TODO: Also determine dim names from the index
-        new_dims: Sequence[str] | Sequence[None] = [None] * np.ndim(value)
-    else:
-        new_dims = dims
-    return coords, new_dims
+        return coords, _handle_none_dims(dims, value.ndim)
+
+    for dim in dims:
+        dim_name = dim
+        # str() keeps the key a string; NOTE(review): a None dim is still passed to value[dim].
+        coords[str(dim_name)] = cast(xr.DataArray, value[dim]).to_numpy()
+    return coords, _handle_none_dims(dims, value.ndim)
+
+
+def _dataframe_agnostic_coords(
+    value: IntoFrameT | IntoLazyFrameT | nw.DataFrame | nw.LazyFrame,
+    model: "Model",
+    ndim_in: int = 2,
+    dims: Sequence[str | None] | None = None,
+    coords: dict[str, Sequence | np.ndarray] | None = None,
+) -> tuple[dict[str, Sequence | np.ndarray], Sequence[str | None] | Sequence[None]]:
+    """Infer coordinates from a Narwhals-compatible (lazy) frame.
+
+    ``dims[0]`` names the row labels: a column of the frame (a named pandas-like index counts as
+    one), else an entry of ``model.coords``; otherwise ``ValueError``. ``dims[1]``, if given, is
+    mapped to the column names other than ``dims[0]``. ``ShapeError`` if ``len(dims) != ndim_in``.
+    """
+    if coords is None:
+        coords = {}
+    if dims is None:
+        return coords, _handle_none_dims(dims, ndim_in)  # nothing to infer, nothing to collect
+
+    value = cast(nw.DataFrame | nw.LazyFrame, nw.from_native(value, allow_series=False))  # type: ignore[type-var]
+    if isinstance(value, nw.LazyFrame):
+        value = value.collect()
+    # An unnamed index cannot be a ``with_columns`` keyword (keywords must be strings): skip it.
+    index = nw.maybe_get_index(value)
+    if index is not None and isinstance(index.name, str):
+        value = value.with_columns(**{index.name: index.to_numpy()})
+    if len(dims) != ndim_in:
+        raise ShapeError(
+            "Invalid data shape. The rank of the dataset must match the length of `dims`.",
+            actual=value.shape,
+            expected=len(dims),
+        )
+    index_dim = dims[0]
+    if index_dim is not None:
+        if index_dim in value.columns:
+            coords[index_dim] = tuple(value.select(nw.col(index_dim)).to_numpy().flatten())
+        elif index_dim in model.coords:
+            coords[index_dim] = model.coords[index_dim]  # type: ignore[assignment]
+        else:
+            raise ValueError(
+                f"Dimension '{index_dim}' not found in DataFrame columns or model coordinates. Cannot infer "
+                "index coordinates."
+            )
+    column_dim = dims[1] if len(dims) > 1 else None
+    if column_dim is not None:
+        select_expr = nw.exclude(index_dim) if index_dim is not None else nw.all()
+        coords[column_dim] = value.select(select_expr).columns
+    return coords, _handle_none_dims(dims, ndim_in)
+
+
+def _series_agnostic_coords(
+    value: IntoSeriesT,
+    model: "Model",
+    dims: Sequence[str | None] | None = None,
+    coords: dict[str, Sequence | np.ndarray] | None = None,
+) -> tuple[dict[str, Sequence | np.ndarray], Sequence[str | None] | Sequence[None]]:
+    """Infer coords for a Narwhals-compatible Series by handling it as a one-column frame."""
+    # ``ndim_in=1`` makes the frame helper require exactly one dim name (the row labels).
+    value = cast(nw.Series, nw.from_native(value, series_only=True))  # type: ignore[assignment]
+    frame = cast(nw.DataFrame | nw.LazyFrame, value.to_frame())  # type: ignore[attr-defined]
+
+    return _dataframe_agnostic_coords(
+        frame, model=model, ndim_in=1, dims=dims, coords=coords
+    )  # type: ignore[arg-type]
+
+
+def _register_dataframe_backend(library_name: str):
+    """Register ``determine_coords`` handlers for a library's Series/DataFrame, if importable."""
+    try:
+        module = importlib.import_module(library_name)
+        series_cls, frame_cls = module.Series, module.DataFrame
+    except ImportError:
+        return  # Dataframe backends are optional
+
+    @determine_coords.register(series_cls)
+    def determine_series_coords(
+        value: IntoSeriesT,
+        model: "Model",
+        dims: Sequence[str] | None = None,
+        coords: dict[str, Sequence | np.ndarray] | None = None,
+    ) -> tuple[dict[str, Sequence | np.ndarray], Sequence[str | None] | Sequence[None]]:
+        return _series_agnostic_coords(value, model=model, dims=dims, coords=coords)
+
+    @determine_coords.register(frame_cls)
+    def determine_dataframe_coords(
+        value: IntoFrameT,
+        model: "Model",
+        dims: Sequence[str] | None = None,
+        coords: dict[str, Sequence | np.ndarray] | None = None,
+    ) -> tuple[dict[str, Sequence | np.ndarray], Sequence[str | None] | Sequence[None]]:
+        return _dataframe_agnostic_coords(value, model=model, dims=dims, coords=coords)
+
+
+_register_dataframe_backend("pandas")  # each call does nothing if the library fails to import
+_register_dataframe_backend("polars")
+_register_dataframe_backend("dask.dataframe")  # NOTE(review): confirm narwhals can wrap dask Series
 
 
 def Data(
     name: str,
-    value,
+    value: IntoFrameT | IntoSeriesT | xr.DataArray | np.ndarray,
     *,
     dims: Sequence[str] | None = None,
     coords: dict[str, Sequence | np.ndarray] | None = None,
@@ -248,11 +363,11 @@ def Data(
     ----------
     name : str
         The name for this variable.
-    value : array_like or pandas.Series, pandas.Dataframe
+    value : array_like or Narwhals-compatible Series or DataFrame
         A value to associate with this variable.
     dims : str, tuple of str or tuple of None, optional
         Dimension names of the random variables (as opposed to the shapes of these
-        random variables). Use this when ``value`` is a pandas Series or DataFrame. The
+        random variables). Use this when ``value`` is a Series or DataFrame. The
         ``dims`` will then be the name of the Series / DataFrame's columns. See ArviZ
         documentation for more information about dimensions and coordinates:
         :ref:`arviz:quickstart`.
@@ -265,6 +380,9 @@ def Data(
     infer_dims_and_coords : bool, default=False
         If True, the ``Data`` container will try to infer what the coordinates
         and dimension names should be if there is an index in ``value``.
+    model : pymc.Model, optional
+        Model to which to add the data variable. If not specified, the data variable
+        will be added to the model on the context stack.
     **kwargs : dict, optional
         Extra arguments passed to :func:`pytensor.shared`.
 
@@ -333,9 +451,9 @@ def Data(
             expected=x.ndim,
         )
 
-    new_dims: Sequence[str] | Sequence[None] | None
+    new_dims: Sequence[str | None] | Sequence[None] | None
     if infer_dims_and_coords:
-        coords, new_dims = determine_coords(model, value, dims)
+        coords, new_dims = determine_coords(value, model, dims)
     else:
         new_dims = dims
 

diff --git a/pymc/pytensorf.py b/pymc/pytensorf.py
@@ -11,13 +11,14 @@
 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #   See the License for the specific language governing permissions and
 #   limitations under the License.
+import importlib
 import warnings
 
 from collections.abc import Iterable, Sequence
 from typing import cast
 
+import narwhals as nw
 import numpy as np
-import pandas as pd
 import pytensor
 import pytensor.tensor as pt
 import scipy.sparse as sps
@@ -128,11 +129,33 @@ def convert_data(data) -> np.ndarray | Variable:
     return smarttypeX(ret)
 
 
-@_as_tensor_variable.register(pd.Series)
-@_as_tensor_variable.register(pd.DataFrame)
-def dataframe_to_tensor_variable(df: pd.DataFrame, *args, **kwargs) -> TensorVariable:
-    return pt.as_tensor_variable(df.to_numpy(), *args, **kwargs)
-
+# Optional registrations for DataFrame packages
+def _register_dataframe_backend(library_name: str):
+    """Register Series/DataFrame-to-tensor converters for ``library_name`` if it can be imported."""
+    try:
+        library = importlib.import_module(library_name)
+        series_cls, frame_cls = library.Series, library.DataFrame
+    except ImportError:
+        # Data backends are optional. Take no action if not installed.
+        return
+
+    @_as_tensor_variable.register(series_cls)
+    def series_to_tensor_variable(s: series_cls, *args, **kwargs) -> TensorVariable:
+        # ``series_only=True`` yields an ``nw.Series`` or raises: there is no lazy frame to collect.
+        s = nw.from_native(s, series_only=True)
+        return pt.as_tensor_variable(s.to_numpy(), *args, **kwargs)
+
+    @_as_tensor_variable.register(frame_cls)
+    def dataframe_to_tensor_variable(df: frame_cls, *args, **kwargs) -> TensorVariable:
+        df = nw.from_native(df, allow_series=False)
+        if isinstance(df, nw.LazyFrame):
+            df = df.collect()
+        return pt.as_tensor_variable(df.to_numpy(), *args, **kwargs)
+
+
+_register_dataframe_backend("pandas")  # each call does nothing if the library fails to import
+_register_dataframe_backend("polars")
+_register_dataframe_backend("dask.dataframe")  # NOTE(review): confirm narwhals can wrap dask Series
 
 _cheap_eval_mode = Mode(linker="py", optimizer="minimum_compile")