Coverage for src / autoencodix / utils / _screader.py: 32%

28 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-21 10:09 +0200

1import scanpy as sc # type: ignore 

2import mudata as md # type: ignore 

3from anndata import AnnData # type: ignore 

4from typing import Dict, Any, TYPE_CHECKING 

5from autoencodix.configs.default_config import DefaultConfig 

6 

# ``MuData`` is a real type only for static analysis; at runtime the name is
# bound to ``Any`` so annotations that mention it still evaluate.
if TYPE_CHECKING:
    import mudata as md  # type: ignore

    # ``md.MuData`` is already the class; it has no nested ``MuData`` attribute.
    MuData = md.MuData
else:
    MuData = Any

13 

14 

class SingleCellDataReader:
    """Reader for multi-modal single-cell data."""

    @staticmethod
    def read_data(
        config: DefaultConfig,
    ) -> Dict[str, MuData]:  # ty: ignore[invalid-type-form]
        """Read the single-cell modalities listed in ``config``.

        Every entry of ``config.data_config.data_info`` whose
        ``is_single_cell`` flag is truthy is loaded from its ``file_path``
        with ``scanpy.read_h5ad``; all other entries are skipped.

        Args:
            config: Configuration object containing data paths and parameters.

        Returns:
            A dict with the single key ``"multi_sc"``.

            * If ``config.requires_paired`` is truthy, the value is a MuData
              object built from the modalities after restricting each one to
              the cells (``obs_names``) present in all of them, in sorted
              order.
            * Otherwise the value is a dict mapping each modality key to the
              AnnData object exactly as loaded.

        Raises:
            ValueError: If ``config.requires_paired`` is truthy but no
                single-cell modality is configured.
        """
        modalities: Dict[str, AnnData] = {
            mod_key: sc.read_h5ad(mod_info.file_path)
            for mod_key, mod_info in config.data_config.data_info.items()
            if mod_info.is_single_cell
        }

        if not config.requires_paired:
            return {"multi_sc": modalities}

        # set.intersection() with no operands raises an opaque TypeError, so
        # fail early with a message that names the actual problem.
        if not modalities:
            raise ValueError(
                "config.requires_paired is set, but no single-cell modality "
                "was found in config.data_config.data_info"
            )

        common_cells = sorted(
            set.intersection(
                *(set(adata.obs_names) for adata in modalities.values())
            )
        )

        # Subset EACH modality individually with the same sorted cell list so
        # that all modalities end up aligned to one common order.
        aligned_modalities = {
            mod_key: adata[common_cells].copy()
            for mod_key, adata in modalities.items()
        }
        mdata = md.MuData(aligned_modalities)

        print(f"Number of common cells: {len(common_cells)}")

        # Clean the obs *column* names: keep only the text after the last ":"
        # (presumably the modality prefix MuData adds -- TODO confirm).
        mdata.obs.columns = [
            name.split(":")[-1] if ":" in name else name
            for name in mdata.obs.columns
        ]

        # Stripping prefixes can make columns collide; keep the first of each.
        mdata.obs = mdata.obs.loc[:, ~mdata.obs.columns.duplicated(keep="first")]

        return {"multi_sc": mdata}