Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/_emf.py: 97%

31 statements  

« prev     ^ index     » next       coverage.py v7.5.1, created at 2024-06-12 16:31 -0400

1# parser for Ensembl Multi Format (EMF) FLATFILE DUMPS 

2# we limit this to the Compara flavoured version 

3 

4import typing 

5 

6from cogent3 import open_ 

7 

8from ensembl_lite._name import EmfName 

9from ensembl_lite._util import PathType 

10 

11 

12# TODO spaces are optional between columns representing SEQ and SCORE lines 

13# gah discuss with Ensembl 

14def _get_block_seqnames(data) -> dict[str, str]: 

15 names = [] 

16 for i, line in enumerate(data): 

17 if line.startswith("SEQ"): 

18 names.append(EmfName(*line.strip().split()[1:])) 

19 elif line.startswith("DATA"): 

20 break 

21 else: 

22 raise RuntimeError("missing DATA block") 

23 

24 # EMF compara alignments store one alignment column per line 

25 # with the order corresponding to SEQ order 

26 num_seqs = len(names) 

27 seq_data = [aln_col[:num_seqs] for aln_col in data[i + 1 :]] 

28 # they also include ancestral sequences, which exclude 

29 return { 

30 n: "".join(s) 

31 for n, *s in zip(names, *seq_data) 

32 if n.species != "ancestral_sequences" 

33 } 

34 

35 

36def _iter_blocks(data: typing.Iterable[str]) -> list[tuple[int, int]]: 

37 # find block boundaries 

38 start = 0 

39 blocks = [] 

40 for i, line in enumerate(data): 

41 if line.startswith("//"): 

42 blocks.append((start, i)) 

43 start = i + 1 

44 

45 return blocks 

46 

47 

48# we need a raw parser 

def parse_emf(
    path: PathType,
    check_format: bool = True,
    extract_data: typing.Callable = _get_block_seqnames,
) -> typing.Iterator[dict[EmfName, str]]:
    """yield data for each alignment block in an EMF file

    Parameters
    ----------
    path
        location of emf file
    check_format
        if True, checks that the first line of the file starts with
        "##FORMAT (compara)"
    extract_data
        callable applied to the list of lines of each block, its return
        value is what gets yielded

    Yields
    ------
    With the default extract_data, {EmfName(): <seq string>, ...} for
    each block

    Notes
    -----
    The key (EmfName) has useful attributes, including the python
    coordinates for the sequence, coord name, species, etc...

    The whole file is read into memory. As this is a generator, the file
    is not opened, and the format not checked, until iteration starts.

    Raises
    ------
    NotImplementedError if not compara emf format (including an empty file)
    """
    with open_(path) as infile:
        data = infile.readlines()

    if check_format:
        # an empty file has no header line; report it as an unsupported
        # format instead of failing with an IndexError on data[0]
        header = data[0] if data else ""
        if not header.startswith("##FORMAT (compara)"):
            raise NotImplementedError(
                f"only compara format supported, not {header.strip()!r}"
            )

    for start, end in _iter_blocks(data):
        # end is the index of the block terminator line, which is excluded
        yield extract_data(data[start:end])