Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/_maf.py: 97%
39 statements
« prev ^ index » next coverage.py v7.2.3, created at 2024-03-25 13:40 +1100
« prev ^ index » next coverage.py v7.2.3, created at 2024-03-25 13:40 +1100
1# parser for MAF, defined at
2# https://genome.ucsc.edu/FAQ/FAQformat.html#format5
4import os
5import typing
7from cogent3 import open_
9from ensembl_lite._name import MafName
12def _get_alignment_block_indices(data: list[str]) -> list[tuple[int, int]]:
13 blocks = []
14 start = None
15 for i, line in enumerate(data):
16 if line.startswith("a"):
17 if start is not None:
18 blocks.append((start, i))
19 start = i
21 if start is None:
22 return []
24 blocks.append((start, i))
25 return blocks
def process_maf_line(line: str) -> tuple[MafName, str]:
    """Split one MAF "s" line into its name record and its sequence text.

    Parameters
    ----------
    line
        a whitespace delimited line with exactly seven fields: the line
        token, ``<species>.<seqid>``, start, size, strand, source length
        and the sequence

    Returns
    -------
    The ``MafName`` built from the coordinate fields, and the sequence string.

    Raises
    ------
    ValueError
        if the line does not have seven fields, the source field contains no
        ".", or a numeric field cannot be converted to int
    """
    # fields after the leading token: src (species.seqid), start, size,
    # strand, source length, sequence
    fields = line.strip().split()
    _, source, begin, span, strand, total, seq = fields
    # only the first "." separates species from seqid; later ones stay in seqid
    species, seqid = source.split(".", maxsplit=1)
    begin, span, total = (int(value) for value in (begin, span, total))

    if strand == "-":
        # NOTE(review): presumably converts a reverse-strand offset into a
        # forward-strand start -- confirm against the MAF specification
        begin = total - (begin + span)

    name = MafName(
        species=species,
        seqid=seqid,
        start=begin,
        stop=begin + span,
        strand=strand,
        coord_length=total,
    )
    return name, seq
def _get_seqs(lines: list[str]) -> dict[MafName, str]:
    """Collect the sequences from the lines of one alignment block.

    Only lines starting with "s" are used, and any such line with the text
    "ancestral" within its first 100 characters is skipped.

    Parameters
    ----------
    lines
        the lines belonging to a single alignment block

    Returns
    -------
    Mapping of the name record to the sequence for each retained line. If two
    lines produce equal names, the later one wins.
    """
    return dict(
        process_maf_line(record)
        for record in lines
        if record.startswith("s") and "ancestral" not in record[:100]
    )
def parse(path: os.PathLike) -> typing.Iterable[dict[MafName, str]]:
    """Yield the sequences of each alignment block in a MAF file.

    Parameters
    ----------
    path
        location of the file, opened with cogent3's ``open_``

    Yields
    ------
    One dict per alignment block, mapping the name record to the sequence.

    Notes
    -----
    All lines are read into memory before the first block is produced.
    """
    with open_(path) as handle:
        records = handle.readlines()

    spans = _get_alignment_block_indices(records)
    yield from (_get_seqs(records[first:last]) for first, last in spans)