Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/_emf.py: 97%
31 statements
« prev ^ index » next coverage.py v7.5.1, created at 2024-06-12 16:31 -0400
« prev ^ index » next coverage.py v7.5.1, created at 2024-06-12 16:31 -0400
1# parser for Ensembl Multi Format (EMF) FLATFILE DUMPS
2# we limit this to the Compara flavoured version
4import typing
6from cogent3 import open_
8from ensembl_lite._name import EmfName
9from ensembl_lite._util import PathType
# TODO(gah): spaces between the columns for SEQ and SCORE lines are optional,
# discuss with Ensembl
14def _get_block_seqnames(data) -> dict[str, str]:
15 names = []
16 for i, line in enumerate(data):
17 if line.startswith("SEQ"):
18 names.append(EmfName(*line.strip().split()[1:]))
19 elif line.startswith("DATA"):
20 break
21 else:
22 raise RuntimeError("missing DATA block")
24 # EMF compara alignments store one alignment column per line
25 # with the order corresponding to SEQ order
26 num_seqs = len(names)
27 seq_data = [aln_col[:num_seqs] for aln_col in data[i + 1 :]]
28 # they also include ancestral sequences, which exclude
29 return {
30 n: "".join(s)
31 for n, *s in zip(names, *seq_data)
32 if n.species != "ancestral_sequences"
33 }
36def _iter_blocks(data: typing.Iterable[str]) -> list[tuple[int, int]]:
37 # find block boundaries
38 start = 0
39 blocks = []
40 for i, line in enumerate(data):
41 if line.startswith("//"):
42 blocks.append((start, i))
43 start = i + 1
45 return blocks
48# we need a raw parser
def parse_emf(
    path: PathType,
    check_format: bool = True,
    extract_data: typing.Callable = _get_block_seqnames,
) -> typing.Iterator[dict[EmfName, str]]:
    """yield data for each alignment block in an EMF file

    Parameters
    ----------
    path
        location of emf file
    check_format
        checks whether the first line of the file declares the compara
        flavour of EMF, i.e. starts with "##FORMAT (compara)"
    extract_data
        callable applied to the lines of each block (excluding the "//"
        terminator line), its result is what gets yielded

    Yields
    ------
    {EmfName(): <seq string>, ...} for each block when the default
    extract_data is used

    Notes
    -----
    The key (EmfName) has useful attributes, including the python
    coordinates for the sequence, coord name, species, etc...

    This is a generator, so the file is not read until iteration begins.

    Raises
    ------
    NotImplementedError if not compara emf format, which includes an
    empty file when check_format is True
    """
    with open_(path) as infile:
        data = infile.readlines()

    if check_format:
        # an empty file has no header line, treat that as "not compara"
        # instead of failing with an IndexError
        header = data[0] if data else ""
        if not header.startswith("##FORMAT (compara)"):
            raise NotImplementedError(
                f"only compara format supported, not {header.strip()!r}"
            )

    for start, end in _iter_blocks(data):
        yield extract_data(data[start:end])