Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/_maf.py: 96%
69 statements
« prev ^ index » next coverage.py v7.5.1, created at 2024-06-12 16:31 -0400
« prev ^ index » next coverage.py v7.5.1, created at 2024-06-12 16:31 -0400
1# parser for MAF, defined at
2# https://genome.ucsc.edu/FAQ/FAQformat.html#format5
3from __future__ import annotations
5import typing
7import numpy
9from cogent3 import make_seq, open_
10from cogent3.app.composable import LOADER, define_app
11from cogent3.app.typing import IdentifierType
13from ensembl_lite._aligndb import AlignRecord
14from ensembl_lite._name import MafName
15from ensembl_lite._util import PathType
18def _get_alignment_block_indices(data: list[str]) -> list[tuple[int, int]]:
19 blocks = []
20 start = None
21 for i, line in enumerate(data):
22 if line.startswith("a"):
23 if start is not None:
24 blocks.append((start, i))
25 start = i
27 if start is None:
28 return []
30 blocks.append((start, i))
31 return blocks
def process_maf_line(line: str) -> tuple[MafName, str]:
    """Convert one whitespace-delimited MAF "s" line into a name and sequence.

    Parameters
    ----------
    line
        a line with exactly seven whitespace separated fields: the leading
        token, src.seqid, start, size, strand, src_size and the sequence

    Returns
    -------
    The MafName built from the coordinate fields and the sequence string.

    Raises
    ------
    ValueError
        if the line does not have seven fields, or a numeric field is not
        an integer
    """
    fields = line.strip().split()
    _, src_coord, start_txt, size_txt, strand, src_size_txt, seq = fields
    # only the first "." separates species from seqid, so seqid may contain "."
    species, seqid = src_coord.split(".", maxsplit=1)

    begin = int(start_txt)
    span = int(size_txt)
    coord_length = int(src_size_txt)
    if strand == "-":
        # re-express the start by measuring from the other end of the source
        begin = coord_length - (begin + span)

    name = MafName(
        species=species,
        seqid=seqid,
        start=begin,
        stop=begin + span,
        strand=strand,
        coord_length=coord_length,
    )
    return name, seq
def _get_seqs(lines: list[str]) -> dict[MafName, str]:
    """Collect the sequences from the lines of a single alignment block.

    Lines that do not start with "s" are ignored, as are lines that have
    "ancestral" within their first 100 characters. If two lines produce an
    equal name, the later one replaces the earlier.
    """
    return dict(
        process_maf_line(entry)
        for entry in lines
        if entry.startswith("s") and "ancestral" not in entry[:100]
    )
def parse(path: PathType) -> typing.Iterable[dict[MafName, str]]:
    """Yield the sequences of each alignment block in a MAF file.

    The whole file is read into memory before any block is produced.

    Parameters
    ----------
    path
        location of the MAF file, opened with cogent3's open_

    Yields
    ------
    One dict per alignment block, mapping MafName to its sequence string.
    """
    with open_(path) as handle:
        all_lines = handle.readlines()

    for first, last in _get_alignment_block_indices(all_lines):
        block_lines = all_lines[first:last]
        yield _get_seqs(block_lines)
def seq2gaps(record: dict) -> AlignRecord:
    """Replace the "seq" entry of a record with its gap data.

    Parameters
    ----------
    record
        keyword arguments for AlignRecord plus a "seq" entry. This dict is
        modified in place: "seq" is removed and "gap_spans" is added.

    Returns
    -------
    An AlignRecord constructed from the modified record.
    """
    gapped = make_seq(record.pop("seq"))
    gap_map, _ = gapped.parse_out_gaps()
    if not gap_map.num_gaps:
        # no gaps: an empty 1D array, unlike the 2-column array used otherwise
        record["gap_spans"] = numpy.array([], dtype=numpy.int32)
        return AlignRecord(**record)

    # one row per gap, columns are gap position and gap length
    pos_and_length = numpy.array(
        [gap_map.gap_pos, gap_map.get_gap_lengths()], dtype=numpy.int32
    )
    record["gap_spans"] = pos_and_length.T
    return AlignRecord(**record)
@define_app(app_type=LOADER)
class load_align_records:
    """Loader app that converts a MAF file into a list of AlignRecord."""

    def __init__(self, species: set[str] | None = None):
        """
        Parameters
        ----------
        species
            names of the species to keep. If None or empty, records for
            every species are kept.
        """
        # only truthiness and membership of this attribute are used below
        self.species = species or {}

    def main(self, path: IdentifierType) -> list[AlignRecord]:
        """Return the records from all alignment blocks in path.

        Every record is given a "block_id" of the form "<file name>-<n>",
        where n is the position of its block in the file, and a "source"
        equal to the file name.
        """
        records = []
        for block_num, block in enumerate(parse(path)):
            for name, seq in block.items():
                wanted = not self.species or name.species in self.species
                if not wanted:
                    continue

                fields = name.to_dict()
                fields.update(
                    block_id=f"{path.name}-{block_num}",
                    source=path.name,
                    seq=seq,
                )
                records.append(seq2gaps(fields))
        return records