Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/_faster_fasta.py: 100%

24 statements  

« prev     ^ index     » next       coverage.py v7.5.1, created at 2024-06-12 16:31 -0400

1import pathlib 

2import typing 

3 

4import numpy 

5 

6from cogent3 import get_moltype, open_ 

7 

8 

# old style moltype
# Character set whose members, in order, are mapped to the integers
# 0..len(alphabet)-1 by `converter` below.
# NOTE(review): presumably the DNA bases plus degenerate codes and gap
# characters, going by the attribute name -- confirm against cogent3.
alphabet = get_moltype("dna").alphabets.degen_gapped

11 

12 

class converter:
    """Callable that turns raw sequence bytes into a numpy array of indices.

    Every character of the module-level ``alphabet`` is mapped to its
    position within that alphabet. Spaces, newlines and carriage returns
    are dropped from the input. Any other byte that is not part of the
    alphabet is passed through with its original value.
    """

    def __init__(self, dtype=numpy.uint8):
        """Build the translation table.

        Parameters
        ----------
        dtype
            numpy dtype of the arrays produced when the instance is called
        """
        self.dtype = dtype
        # source characters and the index each one is replaced by
        symbols = "".join(alphabet).encode("utf8")
        indices = bytes(range(len(alphabet)))
        self._tr = bytes.maketrans(symbols, indices)

    def __call__(self, seq: bytes) -> numpy.ndarray:
        """Return ``seq`` as an array of alphabet indices with dtype ``self.dtype``."""
        # one pass: strip whitespace and swap characters for their indices
        indexed = seq.translate(self._tr, delete=b" \n\r")
        view = memoryview(indexed)
        return numpy.array(view, dtype=self.dtype)

27 

28 

# Shared converter instance producing uint8 arrays (the class default dtype);
# used as the default `converter` argument of quicka_parser.
bytes_to_array = converter()

30 

31 

def quicka_parser(
    path: pathlib.Path,
    converter: typing.Callable[[bytes], typing.Any] = bytes_to_array,
) -> typing.Iterator[typing.Tuple[str, typing.Any]]:
    """generator yielding sequence labels and converted sequences from a fasta file

    Parameters
    ----------
    path
        location of the fasta file
    converter
        a callable that converts sequence characters into nominated bytes,
        deleting unwanted characters. Must handle newlines. Whatever type this
        callable returns will be the type of the sequence returned.

    Yields
    ------
    the sequence label as a string and the sequence as transformed by converter

    Notes
    -----
    The entire file is read into memory. A record begins only where ">" is
    the first character of a line, so a ">" inside a header line is kept as
    part of the label. Anything before the first record (e.g. leading blank
    lines) is ignored. A header on the final line of the file that has no
    terminating newline is skipped.
    """
    with open_(path, mode="rb") as infile:
        data: bytes = infile.read()

    # Split only on a ">" that starts a line; a bare split on b">" would
    # break up header lines that themselves contain ">".
    chunks = data.split(b"\n>")
    if chunks[0].startswith(b">"):
        # first record: drop its leading ">" so it looks like the others
        chunks[0] = chunks[0][1:]
    else:
        # preamble before the first record (or an empty file): not a record
        del chunks[0]

    final = len(chunks) - 1
    for index, chunk in enumerate(chunks):
        header, newline, body = chunk.partition(b"\n")
        if not newline and index == final:
            # unterminated header at the very end of the file
            continue
        # For a non-final chunk without a newline, the newline ending the
        # header was consumed by the split: a header-only record whose
        # sequence is empty.
        yield header.strip().decode("utf8"), converter(body)

60 yield label, seq