# Copyright (C) 2025-2026 Genome Research Ltd.
from dataclasses import dataclass
import numpy as np
import struct
import sys
import time
import typing
COMPLEMENT_MAP = {"A": "T", "C": "G", "G": "C", "T": "A", "N": "N"}
ENCODING_MAP = {"A": 0, "C": 1, "G": 2, "T": 3, "N": 4}
ERROR_STR = np.uint64(0xFFFFFFFFFFFFFFFF)
FILE_HEADER_FORMAT = "<BL"
FILE_VERSION = np.uint16(3)
HEADER_SIZE = struct.calcsize(FILE_HEADER_FORMAT)
METADATA_FORMAT = "<QQQB30s30s"
METADATA_SIZE = struct.calcsize(METADATA_FORMAT)
PADDING_FORMAT = "<BBB"
[docs]
def get_guides(
guidesfile_handle: typing.BinaryIO, verbose: bool = False
) -> np.ndarray:
"""Get array of guides from the binary guides file.
:param guidesfile_handle: The file handle of the guides file
:param verbose: A boolean to print verbose output
:return: A numpy array of guides
"""
start = time.time()
guidesfile_handle.seek(HEADER_SIZE)
# read the number of sequences in the header
number_of_guides = struct.unpack("<Q", guidesfile_handle.read(8))[0]
guidesfile_handle.seek(
HEADER_SIZE + METADATA_SIZE + struct.calcsize(PADDING_FORMAT)
)
guides = np.fromfile(guidesfile_handle, dtype=np.uint64, count=-1)
if number_of_guides != guides.size:
raise ValueError("Invalid number of guides")
if verbose:
print(
f"Loading took {time.time() - start:.2f} seconds", file=sys.stderr
)
return guides
[docs]
def sequence_to_binary_encoding(sequence: str, pam_right: int) -> np.uint64:
"""Convert a string DNA sequence to bits accounting for pam right or left.
:param sequence: The string DNA sequence to convert
:param pam_right: An integer indicating if PAM is on the right (1) or left (0)
:return: A 64-bit unsigned integer representing the sequence
"""
bits = np.uint64(pam_right)
for character in sequence:
if ENCODING_MAP[character] == 4:
bits = ERROR_STR # set the error flag if we encounter an 'N'
break
else:
bits = np.uint64(bits << 2) # shift left to make room for new char
bits = np.uint64(
bits | ENCODING_MAP[character]
) # add our new char to the end
return bits
[docs]
def reverse_complement(sequence: str) -> str:
"""Return the reverse complement of a DNA sequence.
:param sequence: The string DNA sequence to reverse complement.
:return: The reverse complemented string DNA sequence
"""
return "".join([COMPLEMENT_MAP[base] for base in reversed(sequence)])
[docs]
def parse_c_string(data: bytes) -> str:
"""Parse a null-terminated C string from a byte array.
:param data: The byte array to parse
:returns: The parsed string
"""
null = data.find(b"\x00")
if null < 0:
return ""
cstr = data[:null].decode("utf-8")
return cstr