Coverage for polypandas/factory.py: 86%
101 statements
« prev ^ index » next coverage.py v7.6.1, created at 2026-02-24 14:21 -0500
« prev ^ index » next coverage.py v7.6.1, created at 2026-02-24 14:21 -0500
1"""PandasFactory class for generating pandas DataFrames."""
3import functools
4from abc import ABC
5from dataclasses import asdict, is_dataclass
6from typing import Any, Dict, List, Optional, Type, TypeVar
8from polyfactory.factories import DataclassFactory
10try:
11 from pydantic import BaseModel
12except ImportError:
13 BaseModel = None # type: ignore[assignment, misc]
15from polypandas.exceptions import PandasNotAvailableError
16from polypandas.protocols import is_pandas_available, is_pyarrow_available
17from polypandas.schema import has_nested_structs, infer_pyarrow_schema, infer_schema
19T = TypeVar("T")
def _instances_to_dicts(instances: List[Any]) -> List[Dict[str, Any]]:
    """Convert a list of model instances to a list of plain dicts.

    Each instance is converted by the first rule that matches:

    1. dataclass instance -> ``dataclasses.asdict``
    2. ``dict`` -> shallow copy
    3. Pydantic model (only when pydantic could be imported) -> ``model_dump()``
    4. anything the ``dict()`` constructor accepts
    5. otherwise a shallow copy of the instance's ``__dict__``

    Every row in the result is a new dict object, so mutating a returned row
    never mutates the instance (or input dict) it was built from.

    Args:
        instances: Model instances to convert.

    Returns:
        One dict per instance, in input order.
    """
    rows: List[Dict[str, Any]] = []
    for instance in instances:
        if is_dataclass(instance):
            rows.append(asdict(instance))  # type: ignore[arg-type]
        elif isinstance(instance, dict):
            # Copy instead of aliasing the caller's dict.
            rows.append(dict(instance))
        elif BaseModel is not None and isinstance(instance, BaseModel):
            rows.append(instance.model_dump())
        else:
            try:
                rows.append(dict(instance))  # type: ignore[call-overload]
            except (TypeError, ValueError):
                # Plain object: snapshot its attributes rather than exposing
                # the live ``__dict__``.
                rows.append(dict(instance.__dict__))
    return rows
def _rows_to_dataframe(rows: List[Dict[str, Any]], dtypes: Any) -> Any:
    """Build a DataFrame from row dicts and cast it with a dtype mapping.

    Shared by ``PandasFactory.build_dataframe`` and
    ``PandasFactory.create_dataframe_from_dicts`` so that both produce the
    same frame for the same rows. Kept at module level (not on the class)
    because those methods may be invoked with a ``cls`` that does not inherit
    from ``PandasFactory``.

    Callers are expected to have checked that pandas is importable.

    Args:
        rows: Row dictionaries; may be empty.
        dtypes: Result of ``infer_schema`` -- presumably a column name -> dtype
            mapping (verify against ``polypandas.schema``). A falsy value
            skips casting.

    Returns:
        A pandas DataFrame.
    """
    import pandas as pd

    if not dtypes:
        return pd.DataFrame(rows)
    if not rows and isinstance(dtypes, dict):
        # With no rows pandas cannot discover any columns, and ``astype``
        # raises KeyError for mapping keys that are not columns. Declare the
        # columns up front so an empty result still carries the dtypes.
        return pd.DataFrame(columns=list(dtypes)).astype(dtypes)
    return pd.DataFrame(rows).astype(dtypes)


class PandasFactory(DataclassFactory[T], ABC):
    """Factory for generating pandas DataFrames from models.

    Subclasses polyfactory's ``DataclassFactory``.

    NOTE(review): this docstring previously advertised Pydantic models and
    TypedDicts as well; confirm against polyfactory before relying on that
    through this class directly.

    Example:
        ```python
        from dataclasses import dataclass
        from polypandas import PandasFactory

        @dataclass
        class User:
            id: int
            name: str
            email: str

        class UserFactory(PandasFactory[User]):
            __model__ = User

        df = UserFactory.build_dataframe(size=100)
        ```
    """

    __is_base_factory__ = True

    @classmethod
    def build_dataframe(
        cls,
        size: int = 10,
        schema: Optional[Dict[str, Any]] = None,
        use_pyarrow: Optional[bool] = None,
        **kwargs: Any,
    ) -> Any:
        """Build a pandas DataFrame with generated data.

        When PyArrow is installed (pip install polypandas[pyarrow]) and the model
        has nested structs, the DataFrame uses PyArrow-backed dtypes for proper
        nested columns. Set use_pyarrow=False to always use the standard path.

        Args:
            size: Number of rows to generate. ``0`` yields an empty frame that
                still has the schema's columns and dtypes.
            schema: Optional explicit dtype dict (column name -> dtype). If None,
                inferred from model. Ignored when the PyArrow schema is used.
            use_pyarrow: If None, use PyArrow when available and model has nested
                structs. If True, use PyArrow when available. If False, never
                use PyArrow.
            **kwargs: Additional keyword arguments passed to the factory.

        Returns:
            A pandas DataFrame with generated data.

        Raises:
            PandasNotAvailableError: If pandas is not installed.
        """
        if not is_pandas_available():
            raise PandasNotAvailableError()

        import pandas as pd

        model = cls.__model__
        data = cls.build_dicts(size=size, **kwargs)

        if use_pyarrow is None:
            use_pyarrow = is_pyarrow_available() and has_nested_structs(model)

        # An explicit use_pyarrow=True still requires PyArrow to be importable;
        # otherwise fall through to the standard path.
        if use_pyarrow and is_pyarrow_available():
            pa_schema = infer_pyarrow_schema(model)
            if pa_schema is not None:
                import pyarrow as pa

                table = pa.Table.from_pylist(data, schema=pa_schema)
                # ArrowDtype is missing on older pandas; use the default
                # conversion there.
                if hasattr(pd, "ArrowDtype"):
                    return table.to_pandas(types_mapper=pd.ArrowDtype)
                return table.to_pandas()

        return _rows_to_dataframe(data, infer_schema(model, schema))

    @classmethod
    def build_dicts(
        cls,
        size: int = 10,
        **kwargs: Any,
    ) -> List[Dict[str, Any]]:
        """Build a list of dictionaries with generated data.

        Does not require pandas. Use create_dataframe_from_dicts() later to
        convert to DataFrame.

        Args:
            size: Number of records to generate.
            **kwargs: Additional keyword arguments passed to the factory.

        Returns:
            A list of dictionaries with generated data.
        """
        instances = cls.batch(size=size, **kwargs)
        return _instances_to_dicts(instances)

    @classmethod
    def create_dataframe_from_dicts(
        cls,
        data: List[Dict[str, Any]],
        schema: Optional[Dict[str, Any]] = None,
    ) -> Any:
        """Convert pre-generated dictionary data to a pandas DataFrame.

        Uses the same construction and casting as the non-PyArrow path of
        ``build_dataframe``.

        Args:
            data: List of dictionaries to convert; may be empty.
            schema: Optional explicit dtype dict.

        Returns:
            A pandas DataFrame.

        Raises:
            PandasNotAvailableError: If pandas is not installed.
        """
        if not is_pandas_available():
            raise PandasNotAvailableError()

        return _rows_to_dataframe(data, infer_schema(cls.__model__, schema))
def build_pandas_dataframe(
    model: Type[T],
    size: int = 10,
    schema: Optional[Dict[str, Any]] = None,
    use_pyarrow: Optional[bool] = None,
    **kwargs: Any,
) -> Any:
    """Convenience function to build a DataFrame without creating a factory class.

    A throwaway factory class named ``<Model>Factory`` is created on each call.
    For Pydantic models the factory is based on polyfactory's ``ModelFactory``
    (as ``pandas_factory`` does) and borrows the ``PandasFactory`` method
    implementations; for everything else it subclasses ``PandasFactory``.

    Args:
        model: The model type (dataclass, Pydantic, TypedDict).
        size: Number of rows to generate.
        schema: Optional explicit dtype dict.
        use_pyarrow: If None, use PyArrow when available and model has nested structs.
            If True/False, use or skip PyArrow. See PandasFactory.build_dataframe.
        **kwargs: Additional keyword arguments for data generation.

    Returns:
        A pandas DataFrame with generated data.
    """
    bases: tuple = (PandasFactory,)
    namespace: Dict[str, Any] = {"__model__": model, "__is_base_factory__": False}

    if BaseModel is not None and isinstance(model, type) and issubclass(model, BaseModel):
        try:
            from polyfactory.factories.pydantic_factory import ModelFactory as PydanticModelFactory
        except ImportError:
            # No Pydantic factory available: keep the PandasFactory base.
            pass
        else:
            # PandasFactory is a DataclassFactory, so Pydantic models need the
            # Pydantic factory as base. Re-bind the underlying functions as
            # classmethods of the new factory so ``cls`` is that factory.
            bases = (PydanticModelFactory,)
            for method_name in ("build_dataframe", "build_dicts", "create_dataframe_from_dicts"):
                implementation = getattr(PandasFactory, method_name).__func__
                namespace[method_name] = classmethod(implementation)

    factory_class = type(f"{model.__name__}Factory", bases, namespace)
    return factory_class.build_dataframe(  # type: ignore[attr-defined]
        size=size, schema=schema, use_pyarrow=use_pyarrow, **kwargs
    )
def pandas_factory(cls: Type[T]) -> Type[T]:
    """Decorator to add factory methods directly to a model class.

    Adds classmethods: build_dataframe, build_dicts, create_dataframe_from_dicts.
    The generated factory class is also stored on the model as
    ``_polypandas_factory``.

    Pydantic models get a factory based on polyfactory's ``ModelFactory`` that
    delegates to the ``PandasFactory`` implementations; all other models get a
    direct ``PandasFactory`` subclass.

    Example:
        ```python
        from dataclasses import dataclass
        from polypandas import pandas_factory

        @pandas_factory
        @dataclass
        class User:
            id: int
            name: str
            email: str

        df = User.build_dataframe(size=100)
        dicts = User.build_dicts(size=50)
        ```

    Args:
        cls: The model class to decorate.

    Returns:
        The same class object, with the factory methods attached.
    """
    factory_base: Any = PandasFactory

    if BaseModel is not None and isinstance(cls, type) and issubclass(cls, BaseModel):
        # Only the import is guarded: an ImportError raised while *building*
        # the factory class is a real error and must not be mistaken for
        # "pydantic factory unavailable".
        try:
            from polyfactory.factories.pydantic_factory import ModelFactory as PydanticModelFactory
        except ImportError:
            pass  # keep the PandasFactory base
        else:

            class _PydanticPandasFactory(PydanticModelFactory):
                # Not a PandasFactory subclass, so each method forwards to the
                # underlying PandasFactory function with this class as ``cls``.
                __is_base_factory__ = True

                @classmethod
                def build_dataframe(cls, *args: Any, **kwargs: Any) -> Any:
                    return PandasFactory.build_dataframe.__func__(cls, *args, **kwargs)  # type: ignore[attr-defined]

                @classmethod
                def build_dicts(cls, *args: Any, **kwargs: Any) -> Any:
                    return PandasFactory.build_dicts.__func__(cls, *args, **kwargs)  # type: ignore[attr-defined]

                @classmethod
                def create_dataframe_from_dicts(cls, *args: Any, **kwargs: Any) -> Any:
                    return PandasFactory.create_dataframe_from_dicts.__func__(cls, *args, **kwargs)  # type: ignore[attr-defined]

            factory_base = _PydanticPandasFactory

    # Single construction point for the concrete factory.
    factory_class = type(
        f"_{cls.__name__}Factory",
        (factory_base,),
        {"__model__": cls, "__is_base_factory__": False},
    )

    @classmethod  # type: ignore[misc]
    @functools.wraps(PandasFactory.build_dataframe)
    def build_dataframe(
        model_cls: Type[T],
        size: int = 10,
        schema: Optional[Dict[str, Any]] = None,
        use_pyarrow: Optional[bool] = None,
        **kwargs: Any,
    ) -> Any:
        return factory_class.build_dataframe(  # type: ignore[attr-defined]
            size=size, schema=schema, use_pyarrow=use_pyarrow, **kwargs
        )

    @classmethod  # type: ignore[misc]
    @functools.wraps(PandasFactory.build_dicts)
    def build_dicts(
        model_cls: Type[T],
        size: int = 10,
        **kwargs: Any,
    ) -> Any:
        return factory_class.build_dicts(size=size, **kwargs)  # type: ignore[attr-defined]

    @classmethod  # type: ignore[misc]
    @functools.wraps(PandasFactory.create_dataframe_from_dicts)
    def create_dataframe_from_dicts(
        model_cls: Type[T],
        data: List[Dict[str, Any]],
        schema: Optional[Dict[str, Any]] = None,
    ) -> Any:
        return factory_class.create_dataframe_from_dicts(data, schema=schema)  # type: ignore[attr-defined]

    cls.build_dataframe = build_dataframe  # type: ignore[attr-defined]
    cls.build_dicts = build_dicts  # type: ignore[attr-defined]
    cls.create_dataframe_from_dicts = create_dataframe_from_dicts  # type: ignore[attr-defined]
    cls._polypandas_factory = factory_class  # type: ignore[attr-defined]

    return cls