Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/maf.py: 71%

45 statements  

« prev     ^ index     » next       coverage.py v7.2.3, created at 2023-12-25 11:36 +1100

1# parser for MAF, defined at 

2# https://genome.ucsc.edu/FAQ/FAQformat.html#format5 

3 

import os
from collections.abc import Iterator

from cogent3 import open_

from ensembl_lite.name import MafName

9 

10 

11def _get_alignment_block_indices(data: list[str]) -> list[tuple[int]]: 

12 blocks = [] 

13 start = None 

14 for i, line in enumerate(data): 

15 if line.startswith("a"): 

16 if start is not None: 

17 blocks.append((start, i)) 

18 start = i 

19 

20 if start is None: 

21 return [] 

22 

23 blocks.append((start, i)) 

24 return blocks 

25 

26 

def _get_seqs(lines: list[str]) -> dict[MafName, str]:
    """Extract the sequences from the lines of one alignment block.

    Only lines starting with "s" are used. Lines with "ancestral" in their
    first 100 characters are skipped.

    Parameters
    ----------
    lines
        the lines belonging to a single alignment block

    Returns
    -------
    A dict mapping a MafName, built from the fields of each "s" line, to the
    sequence text on that line.

    Raises
    ------
    ValueError
        if an "s" line does not have exactly seven whitespace separated
        fields, or a numeric field is not an integer
    """
    alignment = {}
    for line in lines:
        if not line.startswith("s") or "ancestral" in line[:100]:
            continue
        # after the s token we have src.coord_name, start, size, strand, src_size, seq
        _, src_coord, start, size, strand, coord_length, seq = line.strip().split()
        # only split on the first "." as the coord name may itself contain one
        species, coord = src_coord.split(".", maxsplit=1)
        start, size, coord_length = int(start), int(size), int(coord_length)
        # NOTE(review): start is used exactly as given in the file, so it is
        # relative to the stated strand -- confirm that is what MafName expects
        n = MafName(
            species=species,
            coord_name=coord,
            start=start,
            # the aligned segment covers size positions starting at start
            end=start + size,
            strand=strand,
            coord_length=coord_length,
        )
        alignment[n] = seq
    return alignment

46 

47 

def parse(path: os.PathLike) -> Iterator[dict[MafName, str]]:
    """Yield the sequences of each alignment block in a MAF file.

    Parameters
    ----------
    path
        location of the MAF file, opened with ``open_``

    Yields
    ------
    One dict per alignment block, in file order, as built by ``_get_seqs``.

    Notes
    -----
    This is a generator, so the file is not opened until the first block is
    requested. All of its lines are then read into memory at once.
    """
    with open_(path) as infile:
        data = infile.readlines()

    for block_start, block_end in _get_alignment_block_indices(data):
        yield _get_seqs(data[block_start:block_end])

55 

56 

def demo(path):
    """Write a small sample MAF file made from the start of ``path``.

    Keeps every line up to the end of the fourth alignment block (or of the
    last block when the file has fewer), keeps only the first 1000 characters
    of each line, cuts every sequence down to its first 50 characters and
    writes the result to ``~/Desktop/Outbox/sample.maf``.

    Parameters
    ----------
    path
        location of the MAF file to sample from

    Raises
    ------
    IndexError
        if ``path`` contains no alignment blocks
    """
    with open_(path) as infile:
        data = infile.readlines()

    blocks = _get_alignment_block_indices(data)
    if not blocks:
        raise IndexError(f"no alignment blocks found in {path}")

    # take four blocks when available, otherwise as many as the file has
    _, end = blocks[min(len(blocks), 4) - 1]
    lines = [line[:1000] for line in data[:end]]
    # we now shorten the sequences to at most 50 characters
    for i, line in enumerate(lines):
        if line.startswith("s"):
            seq = line.split()[-1]
            # cut at the sequence field itself; str.replace would also rewrite
            # any earlier text in the line that happens to match the sequence
            line = line[: line.rindex(seq)] + seq[:50]

        lines[i] = line.strip() + "\n"

    with open_("~/Desktop/Outbox/sample.maf", "w") as out:
        out.writelines(lines)