Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/emf.py: 97%

31 statements  

« prev     ^ index     » next       coverage.py v7.2.3, created at 2023-12-25 11:36 +1100

1# parser for Ensembl Multi Format (EMF) FLATFILE DUMPS 

2# we limit this to the Compara flavoured version 

3import os 

4import typing 

5 

6from cogent3 import open_ 

7 

8from ensembl_lite.name import EmfName 

9 

10 

11# TODO spaces are optional between columns representing SEQ and SCORE lines 

12# gah discuss with Ensembl 

13def _get_block_seqnames(data) -> dict[str, str]: 

14 names = [] 

15 for i, line in enumerate(data): 

16 if line.startswith("SEQ"): 

17 names.append(EmfName(*line.strip().split()[1:])) 

18 elif line.startswith("DATA"): 

19 break 

20 else: 

21 raise RuntimeError("missing DATA block") 

22 

23 # EMF compara alignments store one alignment column per line 

24 # with the order corresponding to SEQ order 

25 num_seqs = len(names) 

26 seq_data = [aln_col[:num_seqs] for aln_col in data[i + 1 :]] 

27 # they also include ancestral sequences, which exclude 

28 return { 

29 n: "".join(s) 

30 for n, *s in zip(names, *seq_data) 

31 if n.species != "ancestral_sequences" 

32 } 

33 

34 

35def _iter_blocks(data: typing.Iterable[str]) -> list[tuple[int, int]]: 

36 # find block boundaries 

37 start = 0 

38 blocks = [] 

39 for i, line in enumerate(data): 

40 if line.startswith("//"): 

41 blocks.append((start, i)) 

42 start = i + 1 

43 

44 return blocks 

45 

46 

47# we need a raw parser 

def parse_emf(
    path: typing.Union[str, os.PathLike],
    check_format: bool = True,
    extract_data: typing.Callable = _get_block_seqnames,
) -> typing.Iterator[dict[EmfName, str]]:
    """yield data for alignment from EMF files

    Parameters
    ----------
    path
        location of emf file
    check_format
        checks whether the first line of the file starts with
        "##FORMAT (compara)"
    extract_data
        callable applied to the lines of each block, its result is what
        gets yielded

    Returns
    -------
    a generator that yields, for each block terminated by a "//" line,
    the result of extract_data. With the default extract_data this is
    {EmfName(): <seq string>, ...}

    Notes
    -----
    The key (EmfName) has useful attributes, including the python
    coordinates for the sequence, coord name, species, etc...

    The entire file is read into memory. As this is a generator, nothing
    is read, and no exception is raised, until iteration starts.

    Raises
    ------
    NotImplementedError if not compara emf format, which includes an
    empty file when check_format is True
    """
    with open_(path) as infile:
        data = infile.readlines()

    if check_format:
        # an empty file has no header line, treat it as an invalid format
        # instead of failing with an IndexError
        header = data[0] if data else ""
        if not header.startswith("##FORMAT (compara)"):
            raise NotImplementedError(
                f"only compara format supported, not {header.strip()!r}"
            )

    # end is the index of the "//" line, so the slice excludes it
    for start, end in _iter_blocks(data):
        yield extract_data(data[start:end])