
Commit a8a4a77

zhengruifeng authored and dongjoon-hyun committed
[SPARK-54233][PYTHON][DOCS] Fix the doctest of cogrouped applyInArrow and applyInPandas
### What changes were proposed in this pull request?
Fix the doctest of cogrouped applyInArrow and applyInPandas.

### Why are the changes needed?
To improve test coverage.

### Does this PR introduce _any_ user-facing change?
Yes, doc-only changes.

### How was this patch tested?
CI.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes #52932 from zhengruifeng/doc_cogrouped_apply.

Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
1 parent 6da54d9 commit a8a4a77
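These doctests had been disabled with `# doctest: +SKIP` because the output of a cogrouped apply arrives in nondeterministic partition order, so `show()` output could not be matched verbatim. The fix pins the row order with an explicit `sort()` and drops the skip directive, so the examples now actually run under CI. A minimal standalone sketch of the idea, using a hypothetical toy DataFrame rather than code from this patch:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()
df = spark.createDataFrame([(1, "a"), (2, "b"), (1, "c")], ("id", "v"))

# After a shuffle (groupBy, cogroup, ...), row order is not guaranteed,
# so a doctest comparing show() output verbatim would be flaky.
# Sorting first makes the rendered table deterministic.
df.groupBy("id").count().sort("id").show()
```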

File tree

1 file changed (+11, -7 lines)


python/pyspark/sql/pandas/group_ops.py

Lines changed: 11 additions & 7 deletions
```diff
@@ -1026,6 +1026,7 @@ def applyInPandas(
 
         Examples
         --------
+        >>> import pandas as pd
         >>> df1 = spark.createDataFrame(
         ...     [(20000101, 1, 1.0), (20000101, 2, 2.0), (20000102, 1, 3.0), (20000102, 2, 4.0)],
         ...     ("time", "id", "v1"))
```
```diff
@@ -1037,7 +1038,7 @@ def applyInPandas(
         ...
         >>> df1.groupby("id").cogroup(df2.groupby("id")).applyInPandas(
         ...     asof_join, schema="time int, id int, v1 double, v2 string"
-        ... ).show()  # doctest: +SKIP
+        ... ).sort("id", "time").show()
         +--------+---+---+---+
         |    time| id| v1| v2|
         +--------+---+---+---+
```
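Assembled outside the docstring, this first fixed example runs roughly as below. The diff elides the definitions of `df2` and `asof_join`; the versions shown here follow the surrounding docstring in `group_ops.py` (an as-of merge on `time` within each `id`), so treat them as assumptions. An active `spark` session is also assumed:

```python
import pandas as pd

df1 = spark.createDataFrame(
    [(20000101, 1, 1.0), (20000101, 2, 2.0), (20000102, 1, 3.0), (20000102, 2, 4.0)],
    ("time", "id", "v1"))
df2 = spark.createDataFrame(
    [(20000101, 1, "x"), (20000101, 2, "y")], ("time", "id", "v2"))

def asof_join(l, r):
    # Each call receives the pandas chunks for one cogrouped key.
    return pd.merge_asof(l, r, on="time", by="id")

# sort() pins the row order so the doctest output is reproducible.
df1.groupby("id").cogroup(df2.groupby("id")).applyInPandas(
    asof_join, schema="time int, id int, v1 double, v2 string"
).sort("id", "time").show()
```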
```diff
@@ -1060,7 +1061,8 @@ def applyInPandas(
         ...         return pd.DataFrame(columns=['time', 'id', 'v1', 'v2'])
         ...
         >>> df1.groupby("id").cogroup(df2.groupby("id")).applyInPandas(
-        ...     asof_join, "time int, id int, v1 double, v2 string").show()  # doctest: +SKIP
+        ...     asof_join, "time int, id int, v1 double, v2 string"
+        ... ).sort("time").show()
         +--------+---+---+---+
         |    time| id| v1| v2|
         +--------+---+---+---+
```
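This second fixed example uses the three-argument form, where the function also receives the grouping key and can return an empty frame to drop a key from the result. The key check below is an assumption based on the standard docstring; the diff context only shows the empty-frame branch:

```python
def asof_join(k, l, r):
    # k is the grouping key as a tuple; returning an empty DataFrame
    # with the right columns removes that key from the output.
    if k == (1,):
        return pd.merge_asof(l, r, on="time", by="id")
    else:
        return pd.DataFrame(columns=['time', 'id', 'v1', 'v2'])

df1.groupby("id").cogroup(df2.groupby("id")).applyInPandas(
    asof_join, "time int, id int, v1 double, v2 string"
).sort("time").show()
```

Since only one key survives the filter, sorting on `time` alone is enough to make the output deterministic.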
```diff
@@ -1124,17 +1126,17 @@ def applyInArrow(
 
         Examples
         --------
-        >>> import pyarrow  # doctest: +SKIP
+        >>> import pyarrow as pa
         >>> df1 = spark.createDataFrame([(1, 1.0), (2, 2.0), (1, 3.0), (2, 4.0)], ("id", "v1"))
         >>> df2 = spark.createDataFrame([(1, "x"), (2, "y")], ("id", "v2"))
         >>> def summarize(l, r):
-        ...     return pyarrow.Table.from_pydict({
+        ...     return pa.Table.from_pydict({
         ...         "left": [l.num_rows],
         ...         "right": [r.num_rows]
         ...     })
         >>> df1.groupby("id").cogroup(df2.groupby("id")).applyInArrow(
         ...     summarize, schema="left long, right long"
-        ... ).show()  # doctest: +SKIP
+        ... ).show()
         +----+-----+
         |left|right|
         +----+-----+
```
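The same applyInArrow example, lifted out of the docstring into a plain script (a sketch; assumes an active `spark` session):

```python
import pyarrow as pa

df1 = spark.createDataFrame([(1, 1.0), (2, 2.0), (1, 3.0), (2, 4.0)], ("id", "v1"))
df2 = spark.createDataFrame([(1, "x"), (2, "y")], ("id", "v2"))

def summarize(l, r):
    # l and r are pyarrow.Table chunks holding all rows for one key
    # from each side of the cogroup.
    return pa.Table.from_pydict({
        "left": [l.num_rows],
        "right": [r.num_rows],
    })

df1.groupby("id").cogroup(df2.groupby("id")).applyInArrow(
    summarize, schema="left long, right long"
).show()
```

Note that no `sort()` was needed here: both keys produce the identical row (2 left rows, 1 right row), so output order cannot change the rendered table.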
```diff
@@ -1149,14 +1151,14 @@ def applyInArrow(
         in as two `pyarrow.Table`\\s containing all columns from the original Spark DataFrames.
 
         >>> def summarize(key, l, r):
-        ...     return pyarrow.Table.from_pydict({
+        ...     return pa.Table.from_pydict({
         ...         "key": [key[0].as_py()],
         ...         "left": [l.num_rows],
         ...         "right": [r.num_rows]
         ...     })
         >>> df1.groupby("id").cogroup(df2.groupby("id")).applyInArrow(
         ...     summarize, schema="key long, left long, right long"
-        ... ).show()  # doctest: +SKIP
+        ... ).sort("key").show()
         +---+----+-----+
         |key|left|right|
         +---+----+-----+
```
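The three-argument form receives the grouping key as a tuple of `pyarrow` scalars, one per grouping column, which is why the doctest calls `as_py()`. A standalone illustration of that conversion, independent of Spark:

```python
import pyarrow as pa

key = (pa.scalar(1, type=pa.int64()),)  # shape of the key passed for id == 1
print(key[0])          # a pyarrow Int64Scalar
print(key[0].as_py())  # 1, as a plain Python int
```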
```diff
@@ -1205,9 +1207,11 @@ def _test() -> None:
     if not have_pandas or not have_pyarrow:
         del pyspark.sql.pandas.group_ops.PandasGroupedOpsMixin.apply.__doc__
         del pyspark.sql.pandas.group_ops.PandasGroupedOpsMixin.applyInPandas.__doc__
+        del pyspark.sql.pandas.group_ops.PandasCogroupedOps.applyInPandas.__doc__
 
     if not have_pyarrow:
         del pyspark.sql.pandas.group_ops.PandasGroupedOpsMixin.applyInArrow.__doc__
+        del pyspark.sql.pandas.group_ops.PandasCogroupedOps.applyInArrow.__doc__
 
     spark = SparkSession.builder.master("local[4]").appName("sql.pandas.group tests").getOrCreate()
     globs["spark"] = spark
```
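The `_test()` change extends an existing guard: deleting a function's `__doc__` removes its doctests from collection, so the newly enabled cogrouped examples are skipped cleanly on builds without pandas or pyarrow instead of failing at runtime. A minimal sketch of the pattern, with standalone names rather than Spark's:

```python
import doctest

def add(a, b):
    """
    >>> add(1, 2)
    3
    """
    return a + b

have_dependency = False  # stand-in for have_pandas / have_pyarrow
if not have_dependency:
    # With __doc__ gone, doctest finds no examples on add().
    del add.__doc__

results = doctest.testmod()
print(results.attempted)  # 0 when the doctest was stripped
```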
