Coverage for intelligence_toolkit/helpers/df_functions.py: 100%

33 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-16 13:41 -0300

1# Copyright (c) 2024 Microsoft Corporation. All rights reserved. 

2# Licensed under the MIT license. See LICENSE file in the project. 

3# 

4import sys 

5 

6import numpy as np 

7import pandas as pd 

8 

9 

def fix_null_ints(in_df: pd.DataFrame) -> pd.DataFrame:
    """Return a string-typed copy of *in_df* with whole-number floats shown as ints.

    A ``float64`` column is converted to the nullable ``Int64`` dtype only
    when every non-missing value is finite, has no fractional part, and fits
    in a signed 64-bit integer.  Columns that hold genuine fractions (or
    infinities) are left as floats so no data is truncated.

    Every column is then cast to ``str`` and the textual forms of missing
    values (``"nan"`` and ``"<NA>"``) are replaced with the empty string.

    Args:
        in_df: Frame to convert.  It is copied, never modified.

    Returns:
        A new DataFrame whose cells are strings, with missing values rendered
        as ``""``.
    """
    df = in_df.copy()
    int64_limit = 2.0**63  # magnitudes at or beyond this cannot be cast to Int64

    for col, dtype in df.dtypes.items():
        if dtype == "float64":
            present = df[col].dropna().to_numpy()
            # Check each value individually.  Comparing column sums instead
            # would accept columns whose fractional parts cancel out or add
            # up to less than one, and would then truncate real data.
            whole = (
                np.isfinite(present)
                & (present == np.trunc(present))
                & (np.abs(present) < int64_limit)
            )
            if whole.all():
                # The nullable dtype keeps missing entries as <NA> without
                # needing a sentinel value.
                df[col] = df[col].astype("Int64")

    # The trailing fillna covers pandas versions in which astype(str) keeps
    # missing values as missing instead of rendering them as text.
    return df.astype(str).replace("nan", "").replace("<NA>", "").fillna("")

29 

30 

def get_current_time() -> str:
    """Return the current local time as a compact ``YYYYMMDDHHMMSS`` string."""
    timestamp_format = "%Y%m%d%H%M%S"
    now = pd.Timestamp.now()
    return now.strftime(timestamp_format)

33 

34def suppress_boolean_binary( 

35 input_df: pd.DataFrame, output_df: pd.DataFrame | None = None 

36) -> pd.DataFrame: 

37 if output_df is None: 

38 output_df = input_df.copy() 

39 

40 for col in input_df.columns: 

41 unique_values = [str(x) for x in input_df[col].unique()] 

42 is_three_with_none = len(unique_values) == 3 and input_df[col].isna().any() 

43 if len(unique_values) <= 2 or is_three_with_none: 

44 if "0" in unique_values or "0.0" in unique_values: 

45 output_df[col] = ( 

46 input_df[col] 

47 .astype(str) 

48 .replace("0", np.nan) 

49 .replace("0.0", np.nan) 

50 ) 

51 elif "False" in unique_values: 

52 output_df[col] = input_df[col].astype(str).replace("False", np.nan) 

53 return output_df