Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/_maf.py: 96%

69 statements  

« prev     ^ index     » next       coverage.py v7.5.1, created at 2024-06-12 16:31 -0400

1# parser for MAF, defined at 

2# https://genome.ucsc.edu/FAQ/FAQformat.html#format5 

3from __future__ import annotations 

4 

5import typing 

6 

7import numpy 

8 

9from cogent3 import make_seq, open_ 

10from cogent3.app.composable import LOADER, define_app 

11from cogent3.app.typing import IdentifierType 

12 

13from ensembl_lite._aligndb import AlignRecord 

14from ensembl_lite._name import MafName 

15from ensembl_lite._util import PathType 

16 

17 

18def _get_alignment_block_indices(data: list[str]) -> list[tuple[int, int]]: 

19 blocks = [] 

20 start = None 

21 for i, line in enumerate(data): 

22 if line.startswith("a"): 

23 if start is not None: 

24 blocks.append((start, i)) 

25 start = i 

26 

27 if start is None: 

28 return [] 

29 

30 blocks.append((start, i)) 

31 return blocks 

32 

33 

def process_maf_line(line: str) -> tuple[MafName, str]:
    """Split one MAF "s" line into a name record and its sequence string.

    Parameters
    ----------
    line
        a whitespace-delimited line with exactly seven fields: the leading
        token, src.seqid, start, size, strand, src_size and the sequence

    Returns
    -------
    The ``MafName`` built from the coordinate fields, and the sequence text
    exactly as it appears on the line.

    Raises
    ------
    ValueError
        if the line does not have seven fields, the source field contains
        no ".", or a numeric field is not an integer
    """
    fields = line.strip().split()
    _, src_coord, start, size, strand, coord_length, seq = fields
    # only the first "." separates species from seqid; later dots stay in seqid
    species, seqid = src_coord.split(".", maxsplit=1)
    begin = int(start)
    span = int(size)
    total = int(coord_length)
    if strand == "-":
        # minus-strand start is re-expressed from the opposite end of the
        # source sequence (presumably plus-strand coordinates -- confirm)
        begin = total - (begin + span)

    name = MafName(
        species=species,
        seqid=seqid,
        start=begin,
        stop=begin + span,
        strand=strand,
        coord_length=total,
    )
    return name, seq

52 

53 

54def _get_seqs(lines: list[str]) -> dict[MafName, str]: 

55 alignment = {} 

56 for line in lines: 

57 if not line.startswith("s") or "ancestral" in line[:100]: 

58 continue 

59 n, seq = process_maf_line(line) 

60 alignment[n] = seq 

61 return alignment 

62 

63 

def parse(path: PathType) -> typing.Iterable[dict[MafName, str]]:
    """Yield one name-to-sequence mapping per alignment block in a MAF file.

    The whole file is read into memory before the first block is yielded.

    Parameters
    ----------
    path
        location of the MAF file, opened with cogent3's ``open_``
    """
    with open_(path) as handle:
        lines = handle.readlines()

    for first, last in _get_alignment_block_indices(lines):
        yield _get_seqs(lines[first:last])

71 

72 

def seq2gaps(record: dict) -> AlignRecord:
    """Swap the "seq" entry of a record for its gap spans and build an AlignRecord.

    Parameters
    ----------
    record
        keyword arguments for ``AlignRecord`` plus a "seq" entry holding the
        gapped sequence. The dict is modified in place: "seq" is removed and
        "gap_spans" is added.

    Returns
    -------
    An ``AlignRecord`` built from the modified record. "gap_spans" is an
    int32 array with one row per gap, holding gap position and gap length,
    or an empty int32 array when the sequence has no gaps.
    """
    gapped = make_seq(record.pop("seq"))
    indel_map, _ = gapped.parse_out_gaps()
    if not indel_map.num_gaps:
        spans = numpy.array([], dtype=numpy.int32)
    else:
        # stack positions and lengths as rows, then transpose to one row per gap
        position_and_length = [indel_map.gap_pos, indel_map.get_gap_lengths()]
        spans = numpy.array(position_and_length, dtype=numpy.int32).T

    record["gap_spans"] = spans
    return AlignRecord(**record)

83 

84 

@define_app(app_type=LOADER)
class load_align_records:
    """Loader app that converts a MAF file into a list of AlignRecord."""

    def __init__(self, species: set[str] | None = None):
        """
        Parameters
        ----------
        species
            species names to keep; when empty or None, every species
            is kept
        """
        # NOTE(review): the fallback is an empty dict, not a set as the
        # annotation suggests; it is only ever used for truthiness and
        # membership tests in this class.
        self.species = species or {}

    def main(self, path: IdentifierType) -> list[AlignRecord]:
        """Return the records for every retained sequence in the file.

        Each record is tagged with a "block_id" of the form
        "<file name>-<block index>" and a "source" equal to the file name.
        """
        # NOTE(review): relies on path having a .name attribute -- confirm
        # callers never pass a plain str.
        records = []
        for block_index, block in enumerate(parse(path)):
            for maf_name, seq in block.items():
                keep = not self.species or maf_name.species in self.species
                if not keep:
                    continue

                record = maf_name.to_dict()
                record.update(
                    block_id=f"{path.name}-{block_index}",
                    source=path.name,
                    seq=seq,
                )
                records.append(seq2gaps(record))
        return records