Coverage for tests / unit / no_torch / test_dataframe_serialization.py: 100%
66 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-21 22:18 -0700
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-21 22:18 -0700
1"""Tests for pandas DataFrame serialization edge cases and regression prevention."""
3from __future__ import annotations
5from pathlib import Path
7import numpy as np
8import pandas as pd
10from zanj import ZANJ
# Directory under which the tests in this module write their ``.zanj`` files.
TEST_DATA_PATH: Path = Path("tests/junk_data")
def test_dataframe_detection_logic() -> None:
    """Verify the module + class name detection works for pandas DataFrames.

    This test would have caught the pandas 3.0 regression where the MRO string
    changed from 'pandas.core.frame.DataFrame' to 'pandas.DataFrame'.
    """
    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

    # These are the exact checks used in serializing.py
    assert "pandas" in df.__class__.__module__, (
        f"Expected 'pandas' in module, got {df.__class__.__module__}"
    )
    assert df.__class__.__name__ == "DataFrame", (
        f"Expected class name 'DataFrame', got {df.__class__.__name__}"
    )
def test_small_dataframe_roundtrip() -> None:
    """Test DataFrame with fewer rows than external_list_threshold (256).

    Saves a 10-row frame with int, float and string columns, reads it back,
    and requires the recovered object to be a DataFrame equal to the original.
    """
    df = pd.DataFrame(
        {
            "int_col": list(range(10)),
            "float_col": [x * 0.1 for x in range(10)],
            "str_col": [f"row_{x}" for x in range(10)],
        }
    )

    z = ZANJ()
    path = TEST_DATA_PATH / "test_small_dataframe.zanj"
    z.save({"df": df}, path)
    recovered = z.read(path)

    assert isinstance(recovered["df"], pd.DataFrame), (
        f"Expected DataFrame, got {type(recovered['df'])}"
    )
    assert df.equals(recovered["df"]), "DataFrames should be equal"
def test_single_row_dataframe() -> None:
    """Test DataFrame with a single row (minimal case).

    After a save/read round-trip the result must still be a DataFrame with
    exactly one row and the original column names in the original order.
    """
    df = pd.DataFrame({"a": [1], "b": [2]})

    z = ZANJ()
    path = TEST_DATA_PATH / "test_single_row_dataframe.zanj"
    z.save({"df": df}, path)
    recovered = z.read(path)

    assert isinstance(recovered["df"], pd.DataFrame), (
        f"Expected DataFrame, got {type(recovered['df'])}"
    )
    assert len(recovered["df"]) == 1, "DataFrame should have 1 row"
    assert list(recovered["df"].columns) == ["a", "b"], "Columns should be preserved"
def test_empty_dataframe() -> None:
    """Test DataFrame with zero rows.

    After a save/read round-trip the result must still be a DataFrame with
    no rows and the original column names in the original order.
    """
    df = pd.DataFrame({"a": [], "b": []})

    z = ZANJ()
    path = TEST_DATA_PATH / "test_empty_dataframe.zanj"
    z.save({"df": df}, path)
    recovered = z.read(path)

    assert isinstance(recovered["df"], pd.DataFrame), (
        f"Expected DataFrame, got {type(recovered['df'])}"
    )
    assert len(recovered["df"]) == 0, "DataFrame should be empty"
    assert list(recovered["df"].columns) == ["a", "b"], "Columns should be preserved"
def test_dataframe_dtype_preservation() -> None:
    """Verify that column values of several dtypes survive the round-trip.

    Columns are built with explicit int64, float64, object and bool dtypes.
    Only the per-column values are compared after reading back; the dtypes
    themselves are not asserted because they may change due to JSON
    serialization.
    """
    df = pd.DataFrame(
        {
            "int_col": pd.array([1, 2, 3], dtype="int64"),
            "float_col": pd.array([1.1, 2.2, 3.3], dtype="float64"),
            "str_col": pd.array(["a", "b", "c"], dtype="object"),
            "bool_col": pd.array([True, False, True], dtype="bool"),
        }
    )

    z = ZANJ()
    path = TEST_DATA_PATH / "test_dataframe_dtypes.zanj"
    z.save({"df": df}, path)
    recovered = z.read(path)

    assert isinstance(recovered["df"], pd.DataFrame)

    # Check values are preserved (dtypes may change due to JSON serialization)
    for col in df.columns:
        original_vals = df[col].tolist()
        recovered_vals = recovered["df"][col].tolist()
        assert original_vals == recovered_vals, (
            f"Column {col} values don't match: {original_vals} != {recovered_vals}"
        )
def test_dataframe_with_nan_values() -> None:
    """Test DataFrame containing NaN and None values.

    Saves a frame with a NaN-bearing column, a None-bearing column and a plain
    integer column, reads it back, and checks that the missing entries are
    still missing while the surrounding values are unchanged.
    """
    df = pd.DataFrame(
        {
            "with_nan": [1.0, np.nan, 3.0],
            "with_none": [1, None, 3],
            "normal": [1, 2, 3],
        }
    )

    z = ZANJ()
    path = TEST_DATA_PATH / "test_dataframe_nan.zanj"
    z.save({"df": df}, path)
    recovered = z.read(path)

    assert isinstance(recovered["df"], pd.DataFrame)
    recovered_df = recovered["df"]

    # Check NaN is preserved (use isna() for comparison, since NaN != NaN)
    assert pd.isna(recovered_df["with_nan"].iloc[1]), "NaN should be preserved"
    assert recovered_df["with_nan"].iloc[0] == 1.0
    assert recovered_df["with_nan"].iloc[2] == 3.0

    # The None entry must also come back as a missing value; isna() accepts
    # both None and NaN, so this does not depend on which one is returned.
    assert pd.isna(recovered_df["with_none"].iloc[1]), "None should be preserved"
    assert recovered_df["with_none"].iloc[0] == 1
    assert recovered_df["with_none"].iloc[2] == 3

    # The column without missing data must be unaffected by its neighbours.
    assert recovered_df["normal"].tolist() == [1, 2, 3], (
        "Column without missing values should be unchanged"
    )
def test_dataframe_special_column_names() -> None:
    """Test DataFrame with unusual column names.

    Column names containing spaces, dashes, a leading digit and punctuation
    must come back unchanged and in order, and the recovered frame must equal
    the original.
    """
    df = pd.DataFrame(
        {
            "normal_name": [1, 2],
            "with spaces": [3, 4],
            "with-dashes": [5, 6],
            "123_numeric_start": [7, 8],
            "special!@#chars": [9, 10],
        }
    )

    z = ZANJ()
    path = TEST_DATA_PATH / "test_dataframe_special_cols.zanj"
    z.save({"df": df}, path)
    recovered = z.read(path)

    assert isinstance(recovered["df"], pd.DataFrame)
    assert list(recovered["df"].columns) == list(df.columns), (
        "Special column names should be preserved"
    )
    assert df.equals(recovered["df"]), "DataFrames should be equal"