Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/emf.py: 97%
31 statements
« prev ^ index » next coverage.py v7.2.3, created at 2023-12-25 11:36 +1100
« prev ^ index » next coverage.py v7.2.3, created at 2023-12-25 11:36 +1100
1# parser for Ensembl Multi Format (EMF) FLATFILE DUMPS
2# we limit this to the Compara flavoured version
3import os
4import typing
6from cogent3 import open_
8from ensembl_lite.name import EmfName
11# TODO spaces are optional between columns representing SEQ and SCORE lines
12# gah discuss with Ensembl
13def _get_block_seqnames(data) -> dict[str, str]:
14 names = []
15 for i, line in enumerate(data):
16 if line.startswith("SEQ"):
17 names.append(EmfName(*line.strip().split()[1:]))
18 elif line.startswith("DATA"):
19 break
20 else:
21 raise RuntimeError("missing DATA block")
23 # EMF compara alignments store one alignment column per line
24 # with the order corresponding to SEQ order
25 num_seqs = len(names)
26 seq_data = [aln_col[:num_seqs] for aln_col in data[i + 1 :]]
27 # they also include ancestral sequences, which exclude
28 return {
29 n: "".join(s)
30 for n, *s in zip(names, *seq_data)
31 if n.species != "ancestral_sequences"
32 }
35def _iter_blocks(data: typing.Iterable[str]) -> list[tuple[int, int]]:
36 # find block boundaries
37 start = 0
38 blocks = []
39 for i, line in enumerate(data):
40 if line.startswith("//"):
41 blocks.append((start, i))
42 start = i + 1
44 return blocks
47# we need a raw parser
def parse_emf(
    path: typing.Union[str, os.PathLike],
    check_format: bool = True,
    extract_data: typing.Callable = _get_block_seqnames,
) -> typing.Iterator[dict[EmfName, str]]:
    """yield data for each alignment block from an EMF file

    Parameters
    ----------
    path
        location of emf file
    check_format
        checks whether the first line of the file starts with
        "##FORMAT (compara)"
    extract_data
        callable applied to the list of lines of each block, its result
        is what gets yielded

    Yields
    ------
    With the default extract_data, one
    {EmfName(): <seq string>, ...} per block

    Notes
    -----
    The key (EmfName) has useful attributes, including the python
    coordinates for the sequence, coord name, species, etc...

    This is a generator, so the file is only read, and the format only
    checked, when iteration starts.

    Raises
    ------
    NotImplementedError if not compara emf format, which includes the
    case of an empty file when check_format is True
    """
    with open_(path) as infile:
        data = infile.readlines()

    if check_format:
        # an empty file has no header line; treat it as an unsupported
        # format rather than failing with an IndexError on data[0]
        header = data[0] if data else ""
        if not header.startswith("##FORMAT (compara)"):
            raise NotImplementedError(
                f"only compara format supported, not {header.strip()!r}"
            )

    for start, end in _iter_blocks(data):
        yield extract_data(data[start:end])