#######################################################################
#
# Copyright (C) 2021 David Palao
#
# This file is part of PacBioDataProcessing.
#
# PacBioDataProcessing is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# PacBio data processing is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with PacBioDataProcessing. If not, see <http://www.gnu.org/licenses/>.
#
#######################################################################
from math import log10
from typing import Tuple, Optional, Union
from pathlib import Path
from functools import cached_property
from hashlib import md5
import Bio.Seq
import Bio.SeqIO
from Bio.SeqIO import SeqRecord
from pyfaidx import Faidx
from .bam import BamFile
def combine_scores(scores):
    """Combine several Phred-like quality scores into a single one.

    Every score ``q`` is interpreted as an error probability
    ``10**(-q/10)``. The combined score is the one corresponding to the
    probability that *at least one* of the items is wrong, ie
    ``-10*log10(1-p_ok)`` where ``p_ok`` is the product of all the
    ``1-10**(-q/10)``.

    If the scores are so high that ``1-p_ok`` is exactly ``0`` in
    floating point arithmetic, the logarithm is not defined; in that
    case the lowest input score is returned (as a float).

    @params:
    * scores - iterable with the numeric scores to combine; one-shot
      iterators (eg, generators) are accepted too
    @returns:
    * the combined score (float)
    @raises:
    * ValueError - if ``scores`` is empty

    >>> combine_scores([10])
    10.0
    >>> q = combine_scores([10, 12, 14])
    >>> print(round(q, 6))
    7.204355
    >>> q = combine_scores([30, 20, 100, 92])
    >>> print(round(q, 6))
    19.590023
    >>> q_500 = combine_scores([30, 20, 500])
    >>> q_no_500 = combine_scores([30, 20])
    >>> q_500 == q_no_500
    True
    >>> combine_scores([200, 300, 500])
    200.0
    >>> combine_scores(q for q in (200, 300, 500))
    200.0
    """
    # The fallback branch below needs a second pass over the data, which
    # would silently see an exhausted iterator if a generator was given.
    scores = list(scores)
    if not scores:
        raise ValueError("combine_scores() needs at least one score")
    p_ok = 1
    for q in scores:
        p_ok *= 1-10**(-q/10)
    try:
        combined_score = -10*log10(1-p_ok)
    except ValueError:
        # 1-p_ok underflowed to 0: log10(0) is a math domain error.
        combined_score = float(min(scores))
    return combined_score
class DNASeq:
    """Wrapper around 'Bio.Seq.Seq'.

    The wrapped sequence is stored in ``_seq`` and any attribute that
    is not defined by ``DNASeq`` itself is looked up in it (see
    ``__getattr__``). Besides the sequence, the instance carries a
    ``name``, a ``description`` and ``fasta_name`` (the fasta file the
    sequence was read from, or ``None``).
    """

    def __init__(self, raw_seq, name: str = "", description: str = ""):
        """
        @params:
        * raw_seq - the data passed on to ``Bio.Seq.Seq``
        * name - name of the sequence
        * description - free text description of the sequence
        """
        self._seq = Bio.Seq.Seq(raw_seq)
        self.name = name
        self.description = description
        self.fasta_name = None

    def __eq__(self, other):
        """Case insensitive comparison: the *upper* versions of both
        operands are compared. ``other`` must provide ``upper()``.
        """
        return self._seq.upper() == other.upper()

    def __getattr__(self, attr):
        """Delegates to the wrapped sequence the attributes that are
        not found through the normal lookup.
        """
        # Without this guard, an instance that has no '_seq' yet (eg,
        # one created with __new__ while being copied or unpickled)
        # would re-enter __getattr__ endlessly through 'self._seq'.
        if attr == "_seq":
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {attr!r}"
            )
        return getattr(self._seq, attr)

    def upper(self):
        """Returns the result of ``upper()`` on the wrapped sequence
        (not a ``DNASeq``).
        """
        return self._seq.upper()

    def __getitem__(self, idx):
        """Returns a new DNASeq with the selected part of the sequence.
        The name and the description are not carried over.
        """
        return DNASeq(self._seq[idx])

    def __len__(self):
        return len(self._seq)

    @classmethod
    def from_fasta(cls, fasta_name: str):
        """Returns a DNASeq from the first DNA sequence stored in
        the fasta named 'fasta_name'.

        The name and description of the record are kept, and
        ``fasta_name`` is stored in the attribute with the same name.
        ``next`` is called without a default on the records iterator,
        ie, an input without records is not handled here.
        """
        seq_record_iter = Bio.SeqIO.parse(fasta_name, "fasta")
        record = next(seq_record_iter)
        seq = cls(
            record.seq, name=record.name, description=record.description
        )
        seq.fasta_name = fasta_name
        return seq

    def pi_shifted(self):
        """Method to return a pi-shifted DNASeq from the original one.
        pi-shifted means that a circular topology is assumed in the
        DNA sequence and a shift in the origin is done by π radians, ie
        the sequence is split in two parts and both parts are permuted.

        For a sequence of length ``N`` the new sequence starts at the
        (0-based) index ``N//2`` of the original one. The name is kept
        and ``" (pi-shifted)"`` is appended to the description.
        """
        N = len(self)
        original = self._seq
        return DNASeq(
            original[N//2:]+original[:N//2], name=self.name,
            description=self.description+" (pi-shifted)"
        )

    def write_fasta(self, output_file_name):
        """Writes the sequence as a single record to the fasta file
        ``output_file_name`` and instantiates ``Faidx`` on the result
        (which is expected to produce the corresponding index).
        """
        # NOTE: the record is built from scratch, therefore only the
        # name (as id) and the description are written. Keeping other
        # metadata would require re-reading the record from the fasta
        # the sequence came from and replacing its sequence.
        rec = SeqRecord(self._seq, id=self.name, description=self.description)
        Bio.SeqIO.write([rec], output_file_name, "fasta")
        Faidx(output_file_name)

    @cached_property
    def md5sum(self) -> str:
        """It returns the MD5 checksum's hexdigest of the *upper*
        version of the sequence as a string.
        The value is computed once and cached in the instance.
        """
        return md5(str(self._seq).upper().encode()).hexdigest()
class Partition:
    """A Partition is a class that helps answering the following
    question: assuming that we are interested in processing a fraction
    of a BamFile, does the molecule ID ``mol_id`` belong to that
    fraction, or not?

    A prior implementation consisted in storing all the molecule IDs
    in the ``BamFile`` for a given partition in a set, and the answer
    is just obtained by querying if a molecule ID belongs to the set or
    not.
    That former implementation is not enough for the case of multiple
    alignment processes for the same raw ``BamFile`` (eg, when a
    combined analysis of the so-called 'straight' and 'pi-shifted'
    variants is performed). In that case the partition is decided with
    one file. And all molecule IDs belonging to the non-empty
    intersection with the other file must be unambiguously accommodated
    in a certain partition. This class has been designed to solve
    that problem.

    To achieve that, a partition is a *range* of molecule IDs (as
    ints) and membership is tested with ``mol_id in partition``.
    """

    def __init__(
            self,
            partition_specification: Optional[Tuple[int, int]],
            bamfile: "BamFile"):
        """
        @params:
        * partition_specification - pair ``(current, num_partitions)``
          with the 1-based number of this partition and the total
          number of partitions; ``None`` means one single partition
        * bamfile - object providing ``num_molecules`` and
          ``all_molecules``, used to decide the limits
        """
        try:
            current, num_partitions = partition_specification
        except TypeError:
            # None (or anything that cannot be unpacked): no partition
            current, num_partitions = 1, 1
        self._current = current
        self._num_partitions = num_partitions
        self._bamfile = bamfile
        self._delimit_partitions()
        self._set_current_limits()

    def _delimit_partitions(self):
        """[Internal method]
        This method decides what are the limits of all partitions
        given the number of partitions.
        The method sets an internal mapping, ``self._lower_limits``, of
        the type ``{partition number [int]: lower limit [int]}``
        with that information.
        This mapping is populated with *all* the partition numbers and
        corresponding values.

        The first partition starts at 0. Partition ``i`` starts at the
        molecule ID found at position ``(i-1)*(nmols//num_partitions)``
        in the sorted list of molecule IDs; hence the molecules that
        do not fit in an even split end up in the last partition.
        """
        num_partitions = self._num_partitions
        mols_per_part = self._bamfile.num_molecules//num_partitions
        all_mols = sorted(int(_) for _ in self._bamfile.all_molecules)
        if not all_mols:
            # No molecule to take a limit from. With all the limits at
            # 0 only the last partition is not empty, which is also
            # what happens when there are less molecules than
            # partitions.
            self._lower_limits = dict.fromkeys(
                range(1, num_partitions+1), 0
            )
            return
        lower_limits = {1: 0}
        for ipart in range(2, num_partitions+1):
            lower_limits[ipart] = all_mols[mols_per_part*(ipart-1)]
        self._lower_limits = lower_limits

    def _set_current_limits(self):
        """[Internal method]
        Auxiliary method for __contains__
        Here it is determined what is the range of molecule IDs, as
        ints, that belong to the partition.
        The method sets two integer attributes, namely:

        - ``_lower_limit_current``: the minimum molecule ID of the
          current partition (included in the partition), and
        - ``_higher_limit_current``: the lower limit of the next
          partition (excluded from the current one); it can be
          ``None``, meaning that there is no maximum (last partition).
        """
        self._lower_limit_current = self._lower_limits[self._current]
        if self._current == self._num_partitions:
            self._higher_limit_current = None
        else:
            self._higher_limit_current = self._lower_limits[self._current+1]

    def __contains__(self, mol_id: Union[bytes, int, str]):
        """Main method of the Partition class. It decides whether a
        given molecule ID, ``mol_id`` is within the limits of the
        partition.
        ``mol_id`` is converted with ``int`` before the comparison.
        """
        mol_id = int(mol_id)
        if mol_id < self._lower_limit_current:
            return False
        upper = self._higher_limit_current
        return upper is None or mol_id < upper
def find_gatc_positions(seq: str, offset: int=0) -> set[int]:
    """Collect the (0-based) start positions of every GATC motif
    present in ``seq``. The search is case insensitive.

    ``offset`` is added to each position found.

    >>> find_gatc_positions('AAAGAGAGATCGCGCGATC') == {7, 15}
    True
    >>> find_gatc_positions('AAAGAGAGTCGCGCCATC')
    set()
    >>> find_gatc_positions('AAAGAGAGATCGgaTcCGCGATC') == {7, 12, 19}
    True
    >>> s = find_gatc_positions('AAAGAGAGATCGgaTcCGCGATC', offset=23)
    >>> s == {30, 35, 42}
    True
    """
    haystack = seq.upper()
    positions = set()
    start = 0
    while True:
        hit = haystack.find("GATC", start)
        if hit == -1:
            break
        positions.add(hit+offset)
        # resume the search right after the beginning of this match
        start = hit+1
    return positions
def shift_me_back(pos: int, nbp: int) -> int:
    """Maps a position referring to a *pi-shifted* sequence back to
    the corresponding position in the original reference. The shift
    is assumed to be half of the number of base pairs (integer
    division), so sequences with an odd length are supported too.

    @params:
    * pos - 1-based position of a base pair to unshift
    * nbp - number of base pairs in the reference
    @returns:
    * unshifted position

    Some examples:

    >>> shift_me_back(3, 10)
    8
    >>> shift_me_back(1, 20)
    11
    >>> shift_me_back(3, 7)
    6
    >>> shift_me_back(4, 7)
    7
    >>> shift_me_back(5, 7)
    1
    >>> shift_me_back(7, 7)
    3
    >>> shift_me_back(1, 7)
    4

    How it works: take a reference with 7 base pairs, whose indices
    in the natural order are

    1 2 3 4 5 6 7

    Once the sequence is *pi-shifted* the base pairs are reordered;
    the new indices are (former indices in parenthesis):

    1'(=4) 2'(=5) 3'(=6) 4'(=7) 5'(=1) 6'(=2) 7'(=3)

    This function takes *primed* indices and gives back the
    *unprimed* ones, ie, the returned positions refer to the original
    reference.
    """
    half = nbp//2
    # number of base pairs that were moved to the front by the shift
    moved_to_front = nbp-half
    if pos > moved_to_front:
        return pos-moved_to_front
    return pos+half
def pishift_back_positions_in_gff(gff_path: Union[str, Path]) -> None:
    """Rewrites, in place, a GFF file (assumed to be a valid `GFF3`_
    file) whose positions refer to a *pi-shifted* origin, so that they
    refer to the original origin instead.

    The positions are taken from the 4th and 5th tab separated columns
    of the lines whose first column is the name of a known sequence.
    A sequence is known once its ``##sequence-region`` *pragma* (one
    of the *GFF3 directives*, ie lines starting by ``##``) has been
    read: the length of the sequence is the last field of that line.
    All other lines are copied unchanged.

    This function can handle the case of multiple sequences.

    NOTE: each position is unshifted on its own, therefore the
    resulting start can be larger than the end if a feature spans the
    original origin.

    Warning! The function overwrites the input ``gff_path``.

    .. _GFF3: https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
    """
    with open(gff_path) as ingff:
        inlines = ingff.readlines()

    seq_lens = {}
    outlines = []
    for line in inlines:
        if line.startswith("##sequence-region"):
            # fields: pragma, sequence name, start, end (=length)
            _, name, _start, seq_len = line.strip().split()
            seq_lens[name] = int(seq_len)
        else:
            pieces = line.split("\t")
            if pieces[0] in seq_lens:
                nbp = seq_lens[pieces[0]]
                pieces[3:5] = [
                    str(shift_me_back(int(_), nbp)) for _ in pieces[3:5]
                ]
                line = "\t".join(pieces)
        outlines.append(line)

    # The file is only reopened for writing after all the lines have
    # been processed.
    with open(gff_path, "w") as outgff:
        outgff.writelines(outlines)