Coverage for tests/unit/no_torch/test_dataframe_serialization.py: 100%

66 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-21 22:18 -0700

1"""Tests for pandas DataFrame serialization edge cases and regression prevention.""" 

2 

3from __future__ import annotations 

4 

5from pathlib import Path 

6 

7import numpy as np 

8import pandas as pd 

9 

10from zanj import ZANJ 

11 

# Relative directory under which every test in this module writes its .zanj file.
TEST_DATA_PATH: Path = Path("tests") / "junk_data"

13 

14 

def test_dataframe_detection_logic():
    """Check that a DataFrame is identifiable by its class module and class name.

    Guards against regressions like the pandas 3.0 one, where the MRO string
    changed from 'pandas.core.frame.DataFrame' to 'pandas.DataFrame'.
    """
    frame = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

    # Pull out the two attributes the checks below rely on.
    frame_cls = frame.__class__
    module_name = frame_cls.__module__
    class_name = frame_cls.__name__

    # Per the original author's note, these mirror the checks in serializing.py.
    assert "pandas" in module_name, f"Expected 'pandas' in module, got {module_name}"
    assert class_name == "DataFrame", (
        f"Expected class name 'DataFrame', got {class_name}"
    )

30 

31 

def test_small_dataframe_roundtrip():
    """Round-trip a 10-row DataFrame, i.e. below external_list_threshold (256)."""
    row_ids = range(10)
    original = pd.DataFrame(
        {
            "int_col": list(row_ids),
            "float_col": [i * 0.1 for i in row_ids],
            "str_col": [f"row_{i}" for i in row_ids],
        }
    )

    # Save the frame wrapped in a dict, then read the same file back.
    archive = TEST_DATA_PATH / "test_small_dataframe.zanj"
    zanj = ZANJ()
    zanj.save({"df": original}, archive)
    restored = zanj.read(archive)["df"]

    assert isinstance(restored, pd.DataFrame), (
        f"Expected DataFrame, got {type(restored)}"
    )
    assert original.equals(restored), "DataFrames should be equal"

51 

52 

def test_single_row_dataframe():
    """Round-trip a one-row DataFrame and check its type, length and columns."""
    original = pd.DataFrame({"a": [1], "b": [2]})

    archive = TEST_DATA_PATH / "test_single_row_dataframe.zanj"
    zanj = ZANJ()
    zanj.save({"df": original}, archive)
    restored = zanj.read(archive)["df"]

    assert isinstance(restored, pd.DataFrame), (
        f"Expected DataFrame, got {type(restored)}"
    )
    assert len(restored) == 1, "DataFrame should have 1 row"

    column_names = list(restored.columns)
    assert column_names == ["a", "b"], "Columns should be preserved"

67 

68 

def test_empty_dataframe():
    """Round-trip a zero-row DataFrame and check its type, length and columns."""
    original = pd.DataFrame({"a": [], "b": []})

    archive = TEST_DATA_PATH / "test_empty_dataframe.zanj"
    zanj = ZANJ()
    zanj.save({"df": original}, archive)
    restored = zanj.read(archive)["df"]

    assert isinstance(restored, pd.DataFrame), (
        f"Expected DataFrame, got {type(restored)}"
    )
    assert len(restored) == 0, "DataFrame should be empty"

    # The column labels must come back even though there are no rows.
    column_names = list(restored.columns)
    assert column_names == ["a", "b"], "Columns should be preserved"

83 

84 

def test_dataframe_dtype_preservation():
    """Round-trip int/float/str/bool columns and compare their values.

    Despite the test name, only the per-column values are asserted; the dtypes
    themselves are not compared, since they may change through JSON
    serialization.
    """
    columns = {
        "int_col": pd.array([1, 2, 3], dtype="int64"),
        "float_col": pd.array([1.1, 2.2, 3.3], dtype="float64"),
        "str_col": pd.array(["a", "b", "c"], dtype="object"),
        "bool_col": pd.array([True, False, True], dtype="bool"),
    }
    original = pd.DataFrame(columns)

    archive = TEST_DATA_PATH / "test_dataframe_dtypes.zanj"
    zanj = ZANJ()
    zanj.save({"df": original}, archive)
    restored = zanj.read(archive)["df"]

    assert isinstance(restored, pd.DataFrame)

    # Compare each column as a plain Python list so a dtype change does not matter.
    for column in original.columns:
        expected = original[column].tolist()
        actual = restored[column].tolist()
        assert expected == actual, (
            f"Column {column} values don't match: {expected} != {actual}"
        )

110 

111 

def test_dataframe_with_nan_values():
    """Round-trip a DataFrame holding NaN and None entries.

    Only the "with_nan" column is asserted on after the round-trip.
    """
    original = pd.DataFrame(
        {
            "with_nan": [1.0, np.nan, 3.0],
            "with_none": [1, None, 3],
            "normal": [1, 2, 3],
        }
    )

    archive = TEST_DATA_PATH / "test_dataframe_nan.zanj"
    zanj = ZANJ()
    zanj.save({"df": original}, archive)
    restored = zanj.read(archive)["df"]

    assert isinstance(restored, pd.DataFrame)

    # NaN never compares equal to itself, so the missing entry is checked with isna().
    # NOTE(review): "with_none" and "normal" are written but never asserted on.
    nan_column = restored["with_nan"]
    assert pd.isna(nan_column.iloc[1]), "NaN should be preserved"
    assert nan_column.iloc[0] == 1.0
    assert nan_column.iloc[2] == 3.0

133 

134 

def test_dataframe_special_column_names():
    """Round-trip a DataFrame whose column labels contain unusual characters."""
    # Labels with spaces, dashes, a leading digit and punctuation, plus a plain one.
    columns = {
        "normal_name": [1, 2],
        "with spaces": [3, 4],
        "with-dashes": [5, 6],
        "123_numeric_start": [7, 8],
        "special!@#chars": [9, 10],
    }
    original = pd.DataFrame(columns)

    archive = TEST_DATA_PATH / "test_dataframe_special_cols.zanj"
    zanj = ZANJ()
    zanj.save({"df": original}, archive)
    restored = zanj.read(archive)["df"]

    assert isinstance(restored, pd.DataFrame)

    expected_names = list(original.columns)
    assert list(restored.columns) == expected_names, (
        "Special column names should be preserved"
    )
    assert original.equals(restored), "DataFrames should be equal"