Coverage for polypandas/factory.py: 86%

101 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2026-02-24 14:21 -0500

1"""PandasFactory class for generating pandas DataFrames.""" 

2 

3import functools 

4from abc import ABC 

5from dataclasses import asdict, is_dataclass 

6from typing import Any, Dict, List, Optional, Type, TypeVar 

7 

8from polyfactory.factories import DataclassFactory 

9 

10try: 

11 from pydantic import BaseModel 

12except ImportError: 

13 BaseModel = None # type: ignore[assignment, misc] 

14 

15from polypandas.exceptions import PandasNotAvailableError 

16from polypandas.protocols import is_pandas_available, is_pyarrow_available 

17from polypandas.schema import has_nested_structs, infer_pyarrow_schema, infer_schema 

18 

19T = TypeVar("T") 

20 

21 

def _instances_to_dicts(instances: List[Any]) -> List[Dict[str, Any]]:
    """Turn generated model instances into a list of row dictionaries.

    Each instance is converted by the first rule that matches:

    1. dataclass instances go through ``dataclasses.asdict``;
    2. Pydantic models (only checked when pydantic was importable) go
       through ``model_dump()``;
    3. ``dict`` instances are passed through as the same object;
    4. anything else is tried with ``dict(instance)`` and, if that raises
       ``TypeError`` or ``ValueError``, its ``__dict__`` is used as-is
       (not copied).

    Args:
        instances: Model instances to convert.

    Returns:
        One dictionary per instance, in input order.
    """

    def _as_record(obj: Any) -> Dict[str, Any]:
        # Guard clauses, checked in the order documented above.
        if is_dataclass(obj):
            return asdict(obj)  # type: ignore[arg-type]
        if BaseModel is not None and isinstance(obj, BaseModel):
            return obj.model_dump()
        if isinstance(obj, dict):
            return obj
        try:
            return dict(obj)  # type: ignore[call-overload]
        except (TypeError, ValueError):
            # Not a mapping / iterable of pairs: fall back to attributes.
            return obj.__dict__

    return [_as_record(item) for item in instances]

38 

39 

class PandasFactory(DataclassFactory[T], ABC):
    """Factory for generating pandas DataFrames from models.

    Works with dataclasses, Pydantic models, and TypedDicts.

    Subclasses set ``__model__`` to the model type; rows are produced with
    the inherited ``batch()`` and converted to dictionaries before being
    loaded into pandas.

    Example:
        ```python
        from dataclasses import dataclass
        from polypandas import PandasFactory

        @dataclass
        class User:
            id: int
            name: str
            email: str

        class UserFactory(PandasFactory[User]):
            __model__ = User

        df = UserFactory.build_dataframe(size=100)
        ```
    """

    __is_base_factory__ = True

    @classmethod
    def build_dataframe(
        cls,
        size: int = 10,
        schema: Optional[Dict[str, Any]] = None,
        use_pyarrow: Optional[bool] = None,
        **kwargs: Any,
    ) -> Any:
        """Build a pandas DataFrame with generated data.

        When PyArrow is installed (pip install polypandas[pyarrow]) and the model
        has nested structs, the DataFrame uses PyArrow-backed dtypes for proper
        nested columns. Set use_pyarrow=False to always use the standard path.

        Args:
            size: Number of rows to generate. ``size=0`` yields an empty frame
                that still carries the inferred columns and dtypes.
            schema: Optional explicit dtype dict (column name -> dtype). If None, inferred from model.
                Ignored when use_pyarrow is True and PyArrow schema is used.
            use_pyarrow: If None, use PyArrow when available and model has nested structs.
                If True, use PyArrow when available. If False, never use PyArrow.
            **kwargs: Additional keyword arguments passed to the factory.

        Returns:
            A pandas DataFrame with generated data.

        Raises:
            PandasNotAvailableError: If pandas is not installed.
        """
        if not is_pandas_available():
            raise PandasNotAvailableError()

        import pandas as pd

        model = cls.__model__
        data = cls.build_dicts(size=size, **kwargs)

        if use_pyarrow is None:
            use_pyarrow = is_pyarrow_available() and has_nested_structs(model)

        # An explicit use_pyarrow=True is still only honoured when PyArrow is
        # importable; otherwise we fall through to the plain pandas path.
        if use_pyarrow and is_pyarrow_available():
            pa_schema = infer_pyarrow_schema(model)
            if pa_schema is not None:
                import pyarrow as pa

                table = pa.Table.from_pylist(data, schema=pa_schema)
                # pd.ArrowDtype is not present on every pandas version.
                if hasattr(pd, "ArrowDtype"):
                    return table.to_pandas(types_mapper=pd.ArrowDtype)
                return table.to_pandas()

        dtypes = infer_schema(model, schema)
        # NOTE: this logic is deliberately inline rather than in a ``cls``
        # helper: pandas_factory rebinds this function (via ``__func__``) onto
        # factories that do not inherit from PandasFactory.
        if not data and dtypes:
            # With no rows pandas cannot discover the columns, and astype()
            # with a column-keyed mapping raises KeyError on a column-less
            # frame. Seed the columns from the dtype mapping instead.
            df = pd.DataFrame(columns=list(dtypes))
        else:
            df = pd.DataFrame(data)
        if dtypes:
            df = df.astype(dtypes)
        return df

    @classmethod
    def build_dicts(
        cls,
        size: int = 10,
        **kwargs: Any,
    ) -> List[Dict[str, Any]]:
        """Build a list of dictionaries with generated data.

        Does not require pandas. Use create_dataframe_from_dicts() later to convert to DataFrame.

        Args:
            size: Number of records to generate.
            **kwargs: Additional keyword arguments passed to the factory.

        Returns:
            A list of dictionaries with generated data.
        """
        instances = cls.batch(size=size, **kwargs)
        return _instances_to_dicts(instances)

    @classmethod
    def create_dataframe_from_dicts(
        cls,
        data: List[Dict[str, Any]],
        schema: Optional[Dict[str, Any]] = None,
    ) -> Any:
        """Convert pre-generated dictionary data to a pandas DataFrame.

        The frame is first built with ``dtype=object`` and then cast to the
        dtypes returned by ``infer_schema`` for the factory's model.

        Args:
            data: List of dictionaries to convert. An empty list yields an
                empty frame that still carries the inferred columns.
            schema: Optional explicit dtype dict.

        Returns:
            A pandas DataFrame.

        Raises:
            PandasNotAvailableError: If pandas is not installed.
        """
        if not is_pandas_available():
            raise PandasNotAvailableError()

        import pandas as pd

        dtypes = infer_schema(cls.__model__, schema)
        if not data and dtypes:
            # Same empty-input handling as build_dataframe: seed the columns
            # so the dtype cast below has something to apply to.
            df = pd.DataFrame(columns=list(dtypes), dtype=object)
        else:
            df = pd.DataFrame(data, dtype=object)
        # Guarded like build_dataframe: skip the cast when no dtypes were
        # inferred instead of handing a falsy value to astype().
        if dtypes:
            df = df.astype(dtypes)
        return df

165 

166 

def build_pandas_dataframe(
    model: Type[T],
    size: int = 10,
    schema: Optional[Dict[str, Any]] = None,
    use_pyarrow: Optional[bool] = None,
    **kwargs: Any,
) -> Any:
    """Convenience function to build a DataFrame without creating a factory class.

    A throwaway factory named ``"<Model>Factory"`` is created for ``model``
    and its ``build_dataframe`` is called. Pydantic models are routed through
    polyfactory's ``ModelFactory`` (the same routing ``pandas_factory``
    applies); every other model uses ``PandasFactory``.

    Args:
        model: The model type (dataclass, Pydantic, TypedDict).
        size: Number of rows to generate.
        schema: Optional explicit dtype dict.
        use_pyarrow: If None, use PyArrow when available and model has nested structs.
            If True/False, use or skip PyArrow. See PandasFactory.build_dataframe.
        **kwargs: Additional keyword arguments for data generation.

    Returns:
        A pandas DataFrame with generated data.
    """
    namespace: Dict[str, Any] = {"__model__": model, "__is_base_factory__": False}
    base: Any = PandasFactory

    if BaseModel is not None and isinstance(model, type) and issubclass(model, BaseModel):
        try:
            from polyfactory.factories.pydantic_factory import ModelFactory
        except ImportError:
            # Keep the previous behaviour when the Pydantic factory is missing.
            pass
        else:
            # PandasFactory derives from DataclassFactory, so a Pydantic model
            # needs the Pydantic base instead. Re-bind the DataFrame helpers
            # onto the new class so they run with it as ``cls``.
            base = ModelFactory
            for method_name in (
                "build_dataframe",
                "build_dicts",
                "create_dataframe_from_dicts",
            ):
                namespace[method_name] = classmethod(
                    getattr(PandasFactory, method_name).__func__
                )

    factory_class = type(f"{model.__name__}Factory", (base,), namespace)
    return factory_class.build_dataframe(  # type: ignore[attr-defined]
        size=size, schema=schema, use_pyarrow=use_pyarrow, **kwargs
    )

195 

196 

def pandas_factory(cls: Type[T]) -> Type[T]:
    """Decorator to add factory methods directly to a model class.

    Adds classmethods: build_dataframe, build_dicts, create_dataframe_from_dicts.
    The generated factory is also stored on the class as ``_polypandas_factory``.

    Pydantic models get a factory based on polyfactory's ``ModelFactory``
    (falling back to ``PandasFactory`` on ``ImportError``); all other classes
    get a ``PandasFactory`` subclass.

    Example:
        ```python
        from dataclasses import dataclass
        from polypandas import pandas_factory

        @pandas_factory
        @dataclass
        class User:
            id: int
            name: str
            email: str

        df = User.build_dataframe(size=100)
        dicts = User.build_dicts(size=50)
        ```
    """
    factory_name = f"_{cls.__name__}Factory"

    def _new_factory(base: Any) -> Any:
        # A fresh namespace dict per call, bound to the decorated model.
        return type(
            factory_name,
            (base,),
            {"__model__": cls, "__is_base_factory__": False},
        )

    factory_class: Any = None
    wants_pydantic_base = (
        BaseModel is not None and isinstance(cls, type) and issubclass(cls, BaseModel)
    )

    if wants_pydantic_base:
        # The whole construction stays inside the try so that an ImportError
        # raised anywhere in it triggers the PandasFactory fallback below.
        try:
            from polyfactory.factories.pydantic_factory import ModelFactory as PydanticModelFactory

            class _PydanticPandasFactory(PydanticModelFactory):
                __is_base_factory__ = True

                # Each method forwards to the PandasFactory implementation,
                # passing this (non-PandasFactory) class as ``cls``.
                @classmethod
                def build_dataframe(cls, *args: Any, **kwargs: Any) -> Any:
                    return PandasFactory.build_dataframe.__func__(cls, *args, **kwargs)  # type: ignore[attr-defined]

                @classmethod
                def build_dicts(cls, *args: Any, **kwargs: Any) -> Any:
                    return PandasFactory.build_dicts.__func__(cls, *args, **kwargs)  # type: ignore[attr-defined]

                @classmethod
                def create_dataframe_from_dicts(cls, *args: Any, **kwargs: Any) -> Any:
                    return PandasFactory.create_dataframe_from_dicts.__func__(cls, *args, **kwargs)  # type: ignore[attr-defined]

            factory_class = _new_factory(_PydanticPandasFactory)
        except ImportError:
            factory_class = None

    if factory_class is None:
        factory_class = _new_factory(PandasFactory)

    # NOTE(review): ``model_cls`` is not used by the wrappers below; they
    # always delegate to the factory built for the decorated class, so a
    # subclass inheriting them appears to generate rows for the decorated
    # model rather than for itself. Confirm this is intended.

    @classmethod  # type: ignore[misc]
    @functools.wraps(PandasFactory.build_dataframe)
    def build_dataframe(
        model_cls: Type[T],
        size: int = 10,
        schema: Optional[Dict[str, Any]] = None,
        use_pyarrow: Optional[bool] = None,
        **kwargs: Any,
    ) -> Any:
        return factory_class.build_dataframe(  # type: ignore[attr-defined]
            size=size, schema=schema, use_pyarrow=use_pyarrow, **kwargs
        )

    @classmethod  # type: ignore[misc]
    @functools.wraps(PandasFactory.build_dicts)
    def build_dicts(
        model_cls: Type[T],
        size: int = 10,
        **kwargs: Any,
    ) -> Any:
        return factory_class.build_dicts(size=size, **kwargs)  # type: ignore[attr-defined]

    @classmethod  # type: ignore[misc]
    @functools.wraps(PandasFactory.create_dataframe_from_dicts)
    def create_dataframe_from_dicts(
        model_cls: Type[T],
        data: List[Dict[str, Any]],
        schema: Optional[Dict[str, Any]] = None,
    ) -> Any:
        return factory_class.create_dataframe_from_dicts(data, schema=schema)  # type: ignore[attr-defined]

    # Attach the public helpers first, then the factory handle.
    for attr_name, attr_value in (
        ("build_dataframe", build_dataframe),
        ("build_dicts", build_dicts),
        ("create_dataframe_from_dicts", create_dataframe_from_dicts),
        ("_polypandas_factory", factory_class),
    ):
        setattr(cls, attr_name, attr_value)

    return cls