Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/_maf.py: 97%

39 statements  

« prev     ^ index     » next       coverage.py v7.2.3, created at 2024-03-25 13:40 +1100

1# parser for MAF, defined at 

2# https://genome.ucsc.edu/FAQ/FAQformat.html#format5 

3 

4import os 

5import typing 

6 

7from cogent3 import open_ 

8 

9from ensembl_lite._name import MafName 

10 

11 

12def _get_alignment_block_indices(data: list[str]) -> list[tuple[int, int]]: 

13 blocks = [] 

14 start = None 

15 for i, line in enumerate(data): 

16 if line.startswith("a"): 

17 if start is not None: 

18 blocks.append((start, i)) 

19 start = i 

20 

21 if start is None: 

22 return [] 

23 

24 blocks.append((start, i)) 

25 return blocks 

26 

27 

def process_maf_line(line: str) -> tuple[MafName, str]:
    """Convert one MAF "s" line into a ``(MafName, sequence)`` pair.

    The line is stripped and split on whitespace into exactly seven fields:
    the "s" token, ``src.seqid``, start, size, strand, source size and the
    sequence text. ``src.seqid`` is split at its first "." into species and
    seqid.

    Raises ``ValueError`` if the line does not have seven fields, if the
    source field has no ".", or if a numeric field is not an integer.
    """
    fields = line.strip().split()
    _, source, begin, length, strand, total, seq = fields
    species, seqid = source.split(".", maxsplit=1)
    begin = int(begin)
    length = int(length)
    total = int(total)

    # For "-" strand records the start is replaced by
    # coord_length - (start + size); presumably this re-expresses the
    # interval in plus-strand coordinates -- confirm against MafName usage.
    if strand == "-":
        begin = total - (begin + length)

    name = MafName(
        species=species,
        seqid=seqid,
        start=begin,
        stop=begin + length,
        strand=strand,
        coord_length=total,
    )
    return name, seq

46 

47 

def _get_seqs(lines: list[str]) -> dict[MafName, str]:
    """Build the name -> sequence mapping for the lines of one block.

    Only lines that start with "s" are parsed, and of those any with
    "ancestral" within their first 100 characters are skipped. If two lines
    produce equal names, the later one wins.
    """
    wanted = (
        row
        for row in lines
        if row.startswith("s") and "ancestral" not in row[:100]
    )
    # process_maf_line yields (name, seq) pairs, which dict() consumes in order
    return dict(process_maf_line(row) for row in wanted)

56 

57 

def parse(path: os.PathLike) -> typing.Iterable[dict[MafName, str]]:
    """Yield one ``{MafName: sequence}`` mapping per alignment block in a MAF file.

    This is a generator: nothing is read until iteration begins, at which
    point the whole file is loaded into memory with ``readlines()`` and the
    file is closed before the first block is yielded.

    Parameters
    ----------
    path
        location of the MAF file, opened via cogent3's ``open_``
    """
    with open_(path) as handle:
        records = handle.readlines()

    for first, last in _get_alignment_block_indices(records):
        yield _get_seqs(records[first:last])