>>> import pandas as pd
>>>
>>> def summary(key, lpdf, rpdf):
...     (key,) = key
...     return pd.DataFrame({
...         "id": [key],
...         "diff": [lpdf.v.mean() - rpdf.v.mean()],
...         "count": [lpdf.v.count() + rpdf.v.count()],
...     })
...
>>> df1 = spark.createDataFrame([(1, 1.0), (1, 2.0), (2, 3.0)], ["id", "v"])
>>> df2 = spark.createDataFrame([(1, 5.0)], ["id", "v"])
>>> df1.groupby("id").cogroup(df2.groupby("id")).applyInPandas(
...     summary, schema="id long, diff double, count long"
... ).sort("id").show()
+---+----+-----+
| id|diff|count|
+---+----+-----+
|  1|-3.5|    3|
|  2|NULL|    1|
+---+----+-----+
