Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/maf.py: 71%
45 statements
« prev ^ index » next coverage.py v7.2.3, created at 2023-12-25 11:36 +1100
« prev ^ index » next coverage.py v7.2.3, created at 2023-12-25 11:36 +1100
1# parser for MAF, defined at
2# https://genome.ucsc.edu/FAQ/FAQformat.html#format5
import os
from collections.abc import Iterator

from cogent3 import open_

from ensembl_lite.name import MafName
11def _get_alignment_block_indices(data: list[str]) -> list[tuple[int]]:
12 blocks = []
13 start = None
14 for i, line in enumerate(data):
15 if line.startswith("a"):
16 if start is not None:
17 blocks.append((start, i))
18 start = i
20 if start is None:
21 return []
23 blocks.append((start, i))
24 return blocks
def _get_seqs(lines: list[str]) -> dict[MafName, str]:
    """Extract the aligned sequences from the lines of one alignment block.

    Only lines starting with "s" are used; any such line containing
    "ancestral" within its first 100 characters is skipped.

    Parameters
    ----------
    lines
        the lines of a single MAF alignment block

    Returns
    -------
    A dict mapping a MafName, built from the fields of each "s" line, to the
    aligned sequence string on that line.

    Raises
    ------
    ValueError
        if an "s" line does not split into exactly seven whitespace
        separated fields, or a numeric field is not an integer
    """
    alignment = {}
    for line in lines:
        if not line.startswith("s") or "ancestral" in line[:100]:
            continue
        # after the s token we have src.coord_name, start, size, strand, src_size, seq
        _, src_coord, start, size, strand, coord_length, seq = line.strip().split()
        species, coord = src_coord.split(".", maxsplit=1)
        start, size, coord_length = int(start), int(size), int(coord_length)
        name = MafName(
            species=species,
            coord_name=coord,
            start=start,
            # size is the length of the aligned region, so the end
            # coordinate is start + size
            end=start + size,
            strand=strand,
            coord_length=coord_length,
        )
        alignment[name] = seq
    return alignment
def parse(path: os.PathLike) -> Iterator[dict[MafName, str]]:
    """Yield the sequences of each alignment block in a MAF file.

    The whole file is read into memory before any block is produced.

    Parameters
    ----------
    path
        location of the MAF file, opened with ``open_``

    Yields
    ------
    One dict per alignment block, as produced by ``_get_seqs``, in the order
    the blocks occur in the file.
    """
    with open_(path) as infile:
        data = infile.readlines()

    for block_start, block_end in _get_alignment_block_indices(data):
        yield _get_seqs(data[block_start:block_end])
def demo(
    path,
    *,
    outpath="~/Desktop/Outbox/sample.maf",
    num_blocks=4,
    seq_length=50,
):
    """Write a small sample MAF file from the leading blocks of a larger one.

    Everything up to the end of the last selected block is kept, including
    any lines that precede the first block. Every line is capped at 1000
    characters and the sequence on each "s" line is shortened.

    Parameters
    ----------
    path
        MAF file to sample from
    outpath
        where the sample is written
    num_blocks
        number of leading alignment blocks to keep; if the file has fewer,
        all of them are kept
    seq_length
        maximum number of sequence characters kept on each "s" line

    Raises
    ------
    ValueError
        if num_blocks is less than 1
    IndexError
        if the file contains no alignment blocks
    """
    if num_blocks < 1:
        raise ValueError(f"num_blocks must be at least 1, got {num_blocks}")

    with open_(path) as infile:
        data = infile.readlines()

    blocks = _get_alignment_block_indices(data)
    if not blocks:
        raise IndexError(f"no alignment blocks found in {path}")

    # end index of the last block to keep
    _, end = blocks[min(num_blocks, len(blocks)) - 1]
    lines = [line[:1000] for line in data[:end]]
    # we now shorten the sequences to the exact same length
    for i, line in enumerate(lines):
        if line.startswith("s"):
            seq = line.split()[-1]
            line = line.replace(seq, seq[:seq_length])

        # capping a line can remove its line ending, so normalise every line
        lines[i] = line.strip() + "\n"

    with open_(outpath, "w") as out:
        out.writelines(lines)