Coverage for /Users/gavin/repos/EnsemblLite/src/ensembl_lite/_faster_fasta.py: 100%
24 statements
« prev ^ index » next coverage.py v7.5.1, created at 2024-06-12 16:31 -0400
« prev ^ index » next coverage.py v7.5.1, created at 2024-06-12 16:31 -0400
1import pathlib
2import typing
4import numpy
6from cogent3 import get_moltype, open_
# old style moltype
# The degenerate + gapped DNA alphabet from cogent3. `converter` uses the
# position of each character in this alphabet as that character's integer code.
alphabet = get_moltype("dna").alphabets.degen_gapped
class converter:
    """Callable that maps sequence bytes onto alphabet indices.

    Each character of the module-level ``alphabet`` is mapped to its position
    in that alphabet. Spaces, newlines and carriage returns are removed
    before conversion. Bytes that are not in the alphabet are left
    unchanged by the translation table.

    Parameters
    ----------
    dtype
        numpy dtype of the arrays produced when an instance is called
    """

    def __init__(self, dtype=numpy.uint8):
        self.dtype = dtype
        # the i-th alphabet character is translated to the byte with value i
        characters = "".join(alphabet).encode("utf8")
        indices = bytes(range(len(alphabet)))
        self._tr = bytes.maketrans(characters, indices)

    def __call__(self, seq: bytes) -> numpy.ndarray:
        """Return ``seq`` as a numpy array of alphabet indices."""
        # a single pass both re-codes the characters and drops whitespace
        translated = seq.translate(self._tr, delete=b" \n\r")
        return numpy.array(memoryview(translated), dtype=self.dtype)
# Shared module-level converter instance (default uint8 output); it is the
# default `converter` argument of `quicka_parser`.
bytes_to_array = converter()
def quicka_parser(
    path: pathlib.Path,
    converter: typing.Callable[[bytes], typing.Any] = bytes_to_array,
) -> typing.Iterator[typing.Tuple[str, typing.Any]]:
    """generator yielding sequence labels and converted sequences from a fasta file

    Parameters
    ----------
    path
        location of the fasta file
    converter
        a callable that converts sequence characters into nominated bytes,
        deleting unwanted characters. Must handle newlines. Whatever type this
        callable returns will be the type of the sequence returned.

    Yields
    ------
    the sequence label as a string and the sequence as transformed by converter

    Notes
    -----
    The whole file is read into memory before any record is produced.
    Anything that precedes the first ">" is not a record and is ignored.
    A chunk that contains no newline (a header with no sequence lines
    following it) is skipped.
    """
    with open_(path, mode="rb") as infile:
        data: bytes = infile.read()

    records = iter(data.split(b">"))
    # The first chunk is whatever sits before the first ">" (empty for a well
    # formed file, otherwise leading blank lines or comments). It is never a
    # sequence record, so discard it instead of yielding an empty label.
    next(records, None)
    for record in records:
        eol = record.find(b"\n")
        if eol == -1:
            # header without a terminating newline, so no sequence data
            continue
        label = record[:eol].strip().decode("utf8")
        yield label, converter(record[eol + 1 :])
60 yield label, seq