Coverage for intelligence_toolkit/helpers/df_functions.py: 100%
33 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-16 13:41 -0300
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-16 13:41 -0300
1# Copyright (c) 2024 Microsoft Corporation. All rights reserved.
2# Licensed under the MIT license. See LICENSE file in the project.
3#
4import sys
6import numpy as np
7import pandas as pd
10def fix_null_ints(in_df: pd.DataFrame) -> pd.DataFrame:
11 df = in_df.copy()
12 for col, dt in zip(df.columns, df.dtypes, strict=False):
13 if dt == "float64":
14 idf = df[[col]].copy()
15 idf["float"] = [x if not np.isnan(x) else 0 for x in idf[col]]
16 idf["int"] = [int(x) if not np.isnan(x) else 0 for x in idf[col]]
17 idf["float_s"] = [x if not np.isnan(x) else -sys.maxsize for x in idf[col]]
18 idf["int_s"] = [
19 int(x) if not np.isnan(x) else -sys.maxsize for x in idf[col]
20 ]
21 fsum = idf["float"].sum()
22 isum = idf["int"].sum()
23 if int(fsum) == int(isum):
24 df[col] = idf["int_s"]
25 df[col] = df[col].astype("Int64")
26 df[col] = df[col].replace(-sys.maxsize, np.nan)
28 return df.astype(str).replace("nan", "").replace("<NA>", "")
def get_current_time() -> str:
    """Return the current local time as a compact ``YYYYMMDDHHMMSS`` string."""
    stamp_format = "%Y%m%d%H%M%S"
    now = pd.Timestamp.now()
    return now.strftime(stamp_format)
34def suppress_boolean_binary(
35 input_df: pd.DataFrame, output_df: pd.DataFrame | None = None
36) -> pd.DataFrame:
37 if output_df is None:
38 output_df = input_df.copy()
40 for col in input_df.columns:
41 unique_values = [str(x) for x in input_df[col].unique()]
42 is_three_with_none = len(unique_values) == 3 and input_df[col].isna().any()
43 if len(unique_values) <= 2 or is_three_with_none:
44 if "0" in unique_values or "0.0" in unique_values:
45 output_df[col] = (
46 input_df[col]
47 .astype(str)
48 .replace("0", np.nan)
49 .replace("0.0", np.nan)
50 )
51 elif "False" in unique_values:
52 output_df[col] = input_df[col].astype(str).replace("False", np.nan)
53 return output_df