Source code for integron_finder.prot_db

# -*- coding: utf-8 -*-

####################################################################################
# Integron_Finder - Integron Finder aims at detecting integrons in DNA sequences   #
# by finding particular features of the integron:                                  #
#   - the attC sites                                                               #
#   - the integrase                                                                #
#   - and when possible attI site and promoters.                                   #
#                                                                                  #
# Authors: Jean Cury, Bertrand Neron, Eduardo PC Rocha                             #
# Copyright (c) 2015 - 2024  Institut Pasteur, Paris and CNRS.                     #
# See the COPYRIGHT file for details                                               #
#                                                                                  #
# integron_finder is free software: you can redistribute it and/or modify          #
# it under the terms of the GNU General Public License as published by             #
# the Free Software Foundation, either version 3 of the License, or                #
# (at your option) any later version.                                              #
#                                                                                  #
# integron_finder is distributed in the hope that it will be useful,               #
# but WITHOUT ANY WARRANTY; without even the implied warranty of                   #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                    #
# GNU General Public License for more details.                                     #
#                                                                                  #
# You should have received a copy of the GNU General Public License                #
# along with this program (COPYING file).                                          #
# If not, see <http://www.gnu.org/licenses/>.                                      #
####################################################################################

from abc import ABC, abstractmethod
import os
import subprocess
import shlex
from collections import namedtuple
import re
import importlib.util
from enum import Enum


import colorlog
import pandas as pd
from Bio import SeqIO, Seq
from integron_finder import IntegronError

# Module-level logger, named after this module.
_log = colorlog.getLogger(__name__)


# Lightweight record returned by the ``get_description`` methods of the
# protein databases below: protein identifier, strand, start and stop.
"""Sequence description with fields: id strand start stop"""
SeqDesc = namedtuple('SeqDesc', ('id', 'strand', 'start', 'stop'))


class ProteinDB(ABC):
    """
    Abstract base class describing the interface of a protein database.

    A ProteinDB is an abstraction giving access to the proteins
    which correspond to the CDS of a replicon/contig.
    """

    def __init__(self, replicon, cfg, prot_file=None):
        """
        :param replicon: the replicon/contig the proteins belong to
        :param cfg: the configuration object
        :param prot_file: forwarded as *path* to :meth:`_make_protfile`
        """
        self.replicon = replicon
        self.cfg = cfg
        # the protein file must exist before it can be indexed
        self._prot_file = self._make_protfile(path=prot_file)
        self._prot_db = self._make_db()

    @abstractmethod
    def __getitem__(self, prot_seq_id):
        """
        :param str prot_seq_id: the id of a protein sequence
        :return: The Sequence corresponding to the prot_seq_id.
        :rtype: :class:`Bio.SeqRecord` object
        :raise KeyError: when seq_id does not match any sequence in DB
        """
        pass

    @abstractmethod
    def __iter__(self):
        """
        :return: a generator which iterates over the protein seq_id of the contig.
        :rtype: generator
        """
        pass

    @abstractmethod
    def _make_protfile(self, path=None):
        """
        Create the fasta file holding the proteins corresponding to the nucleic sequence (replicon)

        :return: the path of the created protein file
        :rtype: str
        """
        pass

    def _make_db(self):
        """
        :return: an index of the sequences contained in the protein file of the replicon
        """
        try:
            # biopython < 1.78: an explicit alphabet can still be given
            return SeqIO.index(self._prot_file, "fasta", alphabet=Seq.IUPAC.extended_protein)
        except AttributeError:
            # biopython > 1.76: alphabets are gone, index without one
            return SeqIO.index(self._prot_file, "fasta")

    @abstractmethod
    def get_description(self, gene_id):
        """
        :param str gene_id: a protein/gene identifier
        :return: The description of the protein corresponding to the gene_id
        :rtype: :class:`SeqDesc` namedtuple object
        :raise IntegronError: when gene_id is not a valid Gembase gene identifier
        :raise KeyError: if gene_id is not found in GembaseDB instance
        """
        pass

    @property
    def protfile(self):
        """
        :return: The path of the protein file, as returned by :meth:`_make_protfile`
        :rtype: str
        """
        return self._prot_file
class GembaseType(Enum):
    """
    Model the Gembase flavour: genome kind (Complete/Draft) and format version.

    Member names follow the ``<KIND>_<version>`` convention.
    """

    COMPLETE_2plus = 1
    DRAFT_2plus = 2
    COMPLETE_2 = 3
    DRAFT_2 = 4
    COMPLETE_1 = 5
    DRAFT_1 = 6

    def __str__(self):
        kind, _, vers = self.name.partition('_')
        return f"vers: {vers} {kind.capitalize()}"

    @property
    def complete(self) -> bool:
        """
        :return: True if the GembaseType is a Complete genome, False if it is a Draft
        :rtype: bool
        """
        return self.name.partition('_')[0] == 'COMPLETE'

    @property
    def version(self) -> str:
        """
        :return: the gembase version (the part of the member name after the underscore)
        :rtype: str
        """
        return self.name.rsplit('_', 1)[-1]


class RepliconType(Enum):
    """
    Model the replicon types found in a Gembase.
    """

    CHROMOSOME = 1
    PLASMID = 2
    PHAGE = 3
    OTHER = 4
    DRAFT = 5

    def __str__(self):
        return self.name.capitalize()

    def topology(self):
        """
        :return: the default topology of this replicon type 'circ' | 'lin'
        :rtype: str
        """
        # only drafts default to linear, every other replicon type is circular
        if self is RepliconType.DRAFT:
            return 'lin'
        return 'circ'
class GembaseDB(ProteinDB):
    """
    Implements :class:`ProteinDB` from a Gembase.
    Manages proteins from the Proteins directory corresponding to a replicon/contig
    """

    # Gene identifier patterns, one per Gembase flavour.
    # gembase_sniffer tries them in this order and keeps the first match.
    # NOTE(review): in the two "2plus" patterns the first '.' (right after
    # the leading (\w+)) is not escaped and therefore matches any character;
    # presumably r'\.' was intended -- confirm before changing.
    _gene_patterns = {
        GembaseType.COMPLETE_2plus: r'(\w+).(\d{4})\.(\d{5})\.(\d{3})(?P<g_type>[CPVO])_(\w+)_(\d{5})',
        GembaseType.DRAFT_2plus: r'(\w+).(\d{4})\.(\d{5})\.(\d{3,4})(?P<g_type>D)_([ib])_(\d{5})',
        GembaseType.COMPLETE_2: r'(\w{7})\.(\d{4})\.(\d{5})\.(\d{3})(?P<g_type>[CPVO])_(\d{5})',
        GembaseType.DRAFT_2: r'(\w{4})\.(\d{4})\.(\d{5})\.(\d{4})(?P<g_type>[ib])_(\d{5})',
        GembaseType.COMPLETE_1: r'(\w{7})\.(\w|\d{4})\.(\d{5})\.(?P<g_type>[CPVO])(\d{3})_(\d{5})',
        GembaseType.DRAFT_1: r'(\w{4})\.(\d{4})\.(\d{5})\.(?P<g_type>[ib])(\d{4})_(\d{5})'
    }

    # Replicon (sequence) identifier patterns, tried in order by get_replicon_type.
    # The last one has no 'g_type' group: a match on it is treated as a draft.
    _rep_patterns = [
        r'(\w+)\.(\d{4})\.(\d{5})\.(\d{3,4})(?P<g_type>[CPVOD])',  # Complete + Draft V2_plus
        r'(\w{7})\.(\d{4})\.(\d{5})\.(\d{3})(?P<g_type>[CPVO])',  # Complete V2
        r'(\w{7})\.(\w|\d{4})\.(\d{5})\.(?P<g_type>[CPVO])(\d{3})',  # Complete V1
        r'(\w{4})\.(\d{4})\.(\d{5})\.(\d{4})',  # Draft V1 and V2
    ]

    def __init__(self, replicon, cfg, gembase_path=None, prot_file=None):
        """
        :param replicon: The replicon used to create ProteinDB (protein files and extra information)
        :type replicon: :class:`Bio.SeqRecord` object with a extra attribute path
        :param cfg: The integron_finder configuration
        :type cfg: :class:`integron_finder.config.Config` object
        :param gembase_path: The path to the root of the Gembase. If None, it is deduced from
                             *cfg.input_seq_path* (the grand-parent directory of the input file).
        :param prot_file: The path to a protein file in fasta format which is the translation of the replicon.
                          If None, the protein file is extracted from the Gembase.
        :raise IntegronError: if the LST directory or the lst file cannot be found,
                              or if the Gembase flavour cannot be detected

        .. warning::
            The replicon is a modified Bio.SeqRecord object.
            The attribute *path* must be injected in the object
            This attribute represent the path to a fasta file representing this replicon
        """
        _log.debug(f"call GembaseDB with gembase_path= {gembase_path}")
        self.cfg = cfg
        if gembase_path is None:
            self._gembase_path = os.path.dirname(os.path.dirname(os.path.realpath(self.cfg.input_seq_path)))
        else:
            self._gembase_path = os.path.realpath(gembase_path)
        self.replicon = replicon
        # in GemBase Draft the files are based on replicon id
        # but one file can contain several contigs
        # the sequence id contains the contig number
        # for instance ACBA.0917.00019 vs ACBA.0917.00019.0001
        # the filenames are based on replicon id
        # but in code the replicon id contains the contig number
        # for gembase complete both filename and seq_id contains contig number
        self._lst_dir = self.get_lst_dir(self._gembase_path)
        self._gembase_file_basename = self.find_gembase_file_basename(self._gembase_path, self.cfg.input_seq_path)
        self._lst_path = self._get_lst_path()
        self._gembase_type = self.gembase_sniffer(self._lst_path)
        self._replicon_name = os.path.splitext(os.path.basename(self.cfg.input_seq_path))[0]
        self._info = self._parse_lst(self._lst_path)
        if self._info.empty:
            msg = f"No CDS reported in {self._lst_path} for the replicon {replicon.id} ."
            _log.warning(msg)
        self.replicon_type = self.get_replicon_type(seq_id=self.replicon.id)
        # historical misspelling, kept as an alias so existing callers keep working
        self.replicon_tye = self.replicon_type
        if prot_file is None:
            self._prot_file = self._make_protfile()
        else:
            self._prot_file = prot_file
        self._prot_db = self._make_db()

    @staticmethod
    def get_lst_dir(gembase_path):
        """
        Look for the first existing directory among 'LST', 'LSTINF', 'LSTINFO' in *gembase_path*.

        :param str gembase_path: the path to the root of the Gembase
        :return: The path to the Gembase LST directory
        :rtype: str
        :raise IntegronError: if none of these directories exists
        """
        for lst_dirname in ('LST', 'LSTINF', 'LSTINFO'):
            lst_dir_path = os.path.join(gembase_path, lst_dirname)
            if os.path.exists(lst_dir_path):
                return lst_dir_path
        raise IntegronError(f"Neither 'LST' nor 'LSTINF' nor 'LSTINFO' directory found in '{gembase_path}' .")

    def _get_lst_path(self):
        """
        :return: the path of the lst file matching the gembase file basename
        :rtype: str
        :raise IntegronError: if this file does not exist in the LST directory
        """
        lst_dir_path = self._lst_dir
        lst_path = os.path.join(lst_dir_path, self._gembase_file_basename + '.lst')
        if not os.path.exists(lst_path):
            raise IntegronError(f"Do not find {lst_path} in {lst_dir_path}")
        return lst_path

    def _parse_lst(self, lst_path):
        """
        Parse the LSTINFO file and extract information specific to the replicon.
        The parser is chosen according to the sniffed gembase type.

        :param str lst_path: the path to the lst file
        :return: `class`:pandas.DataFrame` object
        :raise IntegronError: when the gembase type is unknown or the file cannot be parsed
        """
        if self.gembase_type == GembaseType.DRAFT_1:
            prots_info = self.gembase1_draft_parser(lst_path, self.replicon.id)
        elif self.gembase_type == GembaseType.COMPLETE_1:
            prots_info = self.gembase1_complete_parser(lst_path, self.replicon.id)
        elif self.gembase_type.version in ('2', '2plus'):
            # V2 and V2plus share the same lst layout
            prots_info = self.gembase2_parser(lst_path, self.replicon.id)
        else:
            msg = f"Unknow gembase format: {self.gembase_type}"
            _log.critical(msg)
            raise IntegronError(msg)
        return prots_info

    @classmethod
    def find_gembase_file_basename(cls, gembase_path, input_seq_path):
        """
        from the input file name, try to retrieve the basename which is used in gembase
        This specially useful when IF is run in parallel. The input sequence is split
        in chunks and treat in parallel. But in this case the name of the chunk
        does not match neither the lstinfo file nor the protein file.
        So this method try retrieve the original basename without extension
        for instance: ::

            ACBA.0917.00019.fna => ACBA.0917.00019
            ACBA.0917.00019.0001.fna => ACBA.0917.00019
            ESCO001.C.00001.C001.fst => ESCO001.C.00001.C001
            ESCO001.C.00001.C001_chunk_1.fst => ESCO001.C.00001.C001

        :param str gembase_path: the path to the root of the Gembase
        :param str input_seq_path: the path of the input sequence file
        :return: the gembase basename corresponding to the input file
        :rtype: string
        :raise IntegronError: if the LST directory cannot be found
        :raise FileNotFoundError: if no lst file matches the input file
        """
        gembase_file_basename = os.path.splitext(os.path.basename(input_seq_path))[0]
        # when IF is run through nextflow & parallel_integron_finder
        # the input data is split, the name of the chunks can vary
        # it can be
        #   the name of input file with suffix _chunk_<id>
        #   the name of the contig
        # so we need to find the original name to find the
        #   - lstinfo file
        #   - protein file
        match = re.search(r"_chunk_\d+$", gembase_file_basename)
        if match:
            gembase_file_basename = gembase_file_basename[:match.start()]

        lst_dir = cls.get_lst_dir(gembase_path)
        if os.path.exists(os.path.join(lst_dir, gembase_file_basename + '.lst')):
            # it is a complete genome
            return gembase_file_basename

        # it is a contig
        # let's find the draft genome lst file: strip the trailing contig number
        draft_basename = os.path.splitext(gembase_file_basename)[0]
        # The lst file lives in the LST directory; the Gembase root is still
        # probed afterwards so that lookups which used to succeed keep working.
        for base_dir in (lst_dir, gembase_path):
            if os.path.exists(os.path.join(base_dir, draft_basename + '.lst')):
                return draft_basename
        raise FileNotFoundError(f"cannot find lst file matching {input_seq_path} sequence")

    def _make_protfile(self, path=None):
        """
        Create fasta file with protein corresponding to this sequence, from the corresponding Gembase protfile
        This step is necessary because in Gembase1&2 Draft
        One nucleic file can contain several contigs, but all proteins are in the same file.
        and in Gembase2 replicons file can contain several chromosomes and plasmids

        :param path: if given, nothing is created and this path is returned as is
        :return: the path of the created protein file
        :rtype: str
        """
        if path:
            return path

        all_prot_path = os.path.join(self._gembase_path, 'Proteins', self._gembase_file_basename + '.prt')
        try:
            # biopython < 1.78
            all_prots = SeqIO.index(all_prot_path, "fasta", alphabet=Seq.IUPAC.extended_protein)
        except AttributeError:
            # biopython > 1.76
            all_prots = SeqIO.index(all_prot_path, "fasta")
        try:
            tmp_dir = self.cfg.tmp_dir(self.replicon.id)
            os.makedirs(tmp_dir, exist_ok=True)
            prot_file_path = os.path.join(tmp_dir, self.replicon.id + '.prt')
            with open(prot_file_path, 'w') as prot_file:
                for seq_id in self._info[4]:
                    try:
                        seq = all_prots[seq_id]
                    except KeyError:
                        _log.warning(f'Sequence describe in LSTINF file {seq_id} is not present in {all_prot_path}')
                    else:
                        SeqIO.write(seq, prot_file, 'fasta')
        finally:
            # release the file handle held by the index
            all_prots.close()
        return prot_file_path

    @classmethod
    def gembase_sniffer(cls, lst_path):
        """
        Detect the type of gembase from the gene identifier (5th field)
        of the first line of the lst file.

        :param str lst_path: the path to the LSTINFO file corresponding to the nucleic sequence
        :returns: the gembase type (Complete or Draft, and version)
        :rtype: :class:`GembaseType`
        :raise IntegronError: if the genome seems empty or if the gembase type cannot be detected
        """
        with open(lst_path) as lst_file:
            # an empty file yields no line at all
            first_line = next(lst_file, '')
        fields = first_line.split()

        if len(fields) > 4:
            gene_id = fields[4]
            for gb_type, pattern in cls._gene_patterns.items():
                if re.match(pattern, gene_id):
                    _log.debug(f"GembaseDB sniff GemBase version:{gb_type}")
                    return gb_type

        if len(fields) >= 3 and fields[0] == fields[1] == '0':
            # start == end == 0
            seq_id = fields[2]
            msg = f"The genome {seq_id} seems empty: see {lst_path}"
            _log.critical(msg)
            raise IntegronError(msg)
        msg = f"Cannot detect GemBase version, check lst file '{lst_path}'."
        raise IntegronError(msg)

    @property
    def gembase_type(self):
        """
        :return: the gembase type sniffed from the lst file
        :rtype: :class:`GembaseType`
        """
        return self._gembase_type

    @classmethod
    def get_replicon_type(cls, seq_id='', rep_id=''):
        """
        Exactly one of *seq_id* or *rep_id* must be provided.
        *seq_id* is matched against the replicon patterns,
        *rep_id* against the gene patterns.

        :param seq_id: the sequence id to parse
        :type seq_id: str
        :param rep_id: the replicon identifier
        :type rep_id: str
        :return: the kind of genome, it can be either:

            * Chromosome
            * Plasmid
            * Phage
            * Other
            * Draft

        :rtype: :class:`RepliconType` object
        :raise IntegronError: if both or none of the ids are provided, or if the id matches no pattern
        """
        if bool(seq_id) == bool(rep_id):
            # both provided or none provided
            raise IntegronError(f'{cls.__name__}.get_replicon_type you must provide either a seqid or a rep_id')

        if seq_id:
            patterns, _id = cls._rep_patterns, seq_id
        else:
            patterns, _id = cls._gene_patterns.values(), rep_id

        for pattern in patterns:
            match = re.match(pattern, _id)
            if match:
                try:
                    genome_type_letter = match.group('g_type')
                except IndexError:
                    # there is no group
                    # so a seq_id has been provided
                    # and it match a Draft
                    genome_type_letter = 'i'
                break
        else:
            msg = f"Cannot detect GemBase version, from {'seq_id' if seq_id else 'rep_id'}: '{_id}'."
            _log.error(msg)
            raise IntegronError(msg)

        letter_2_type = {
            'C': RepliconType.CHROMOSOME,
            'P': RepliconType.PLASMID,
            'V': RepliconType.PHAGE,
            'O': RepliconType.OTHER,
            'D': RepliconType.DRAFT,  # V2_plus
            'i': RepliconType.DRAFT,  # V2
            'b': RepliconType.DRAFT   # V2
        }
        return letter_2_type[genome_type_letter]

    @staticmethod
    def gembase1_complete_parser(lst_path, sequence_id):
        """
        :param str lst_path: the path of the LSTINFO file Gembase Complet
        :param str sequence_id: the id of the genomic sequence to analyse.
                                It is used as a regular expression searched in the gene ids.
        :return: the information related to the 'valid' CDS corresponding to the sequence_id
        :rtype: `class`:pandas.DataFrame` object
        """
        dtype = {0: 'int',  # start
                 1: 'int',  # end
                 2: 'str',  # strand C/D
                 3: 'str',  # type (CDS, ncRNA, tRNA,...)
                 4: 'str',  # seq id
                 5: 'str',  # Valid
                 6: 'str',  # gene name
                 7: 'str'}  # description
        lst_data = []
        with open(lst_path) as lst_file:
            for line in lst_file:
                # the description is free text: everything after the 7th field
                start, end, strand, gene_type, seq_id, valid, gene_name, *description = line.strip().split()
                lst_data.append([start, end, strand, gene_type, seq_id, valid, gene_name, ' '.join(description)])
        lst = pd.DataFrame(lst_data)
        lst = lst.astype(dtype)
        genome_info = lst.loc[lst[4].str.contains(sequence_id, regex=True)]
        prots_info = genome_info.loc[(genome_info[3] == 'CDS') & (genome_info[5] == 'Valid')]
        return prots_info

    @staticmethod
    def gembase1_draft_parser(lst_path, replicon_id):
        """
        :param str lst_path: the path of the LSTINFO file from a Gembase Draft
        :param str replicon_id: the id of the genomic sequence to analyse
                                (4 dot separated fields, the last one being the contig number)
        :return: the information related to the CDS corresponding to the replicon_id
        :rtype: `class`:pandas.DataFrame` object
        :raise IntegronError: if the file cannot be parsed or is not in gembase V1 draft format
        """
        try:
            lst = pd.read_csv(lst_path, header=None, sep="\t")
        except Exception as err:
            msg = f"Error while parsing {lst_path} file: {err}"
            _log.error(msg)
            raise IntegronError(msg) from err
        specie, date, strain, contig = replicon_id.split('.')
        # in gene ids the contig number is prefixed by the letter 'b' or 'i'
        pattern = fr'{specie}\.{date}\.{strain}\.[bi]{contig}'
        try:
            genome_info = lst.loc[lst[4].str.contains(pattern, regex=True)]
        except KeyError:
            msg = f"The LST file '{lst_path}' seems not to be in gembase V1 draft format."
            _log.error(msg)
            raise IntegronError(msg) from None
        prots_info = genome_info.loc[genome_info[3] == 'CDS']
        return prots_info

    @staticmethod
    def gembase2_parser(lst_path, replicon_id):
        """
        :param str lst_path: the path of the LSTINFO file from a Gembase V2
        :param str replicon_id: the id of the genomic sequence to analyse.
                                It is used as a regular expression searched in the gene ids.
        :return: the information related to the CDS corresponding to the replicon_id
        :rtype: `class`:pandas.DataFrame` object
        :raise IntegronError: if the file cannot be parsed or is not in the expected format
        """
        try:
            lst = pd.read_csv(lst_path, header=None, sep="\t")
        except Exception as err:
            msg = f"Error while parsing {lst_path} file: {err}"
            _log.error(msg)
            raise IntegronError(msg) from err
        try:
            # keep only the columns labelled 0 to 7
            lst = lst.loc[:, :7]
            dtype = {i: 'str' for i in range(lst.shape[1])}
            dtype[0] = dtype[1] = 'int'  # start, end
            lst = lst.astype(dtype)
            genome_info = lst.loc[lst[4].str.contains(replicon_id, regex=True)]
            prots_info = genome_info.loc[genome_info[3] == 'CDS']
        except Exception:
            msg = f"The LST file '{lst_path}' seems not to be in gembase V2 draft format."
            _log.error(msg)
            raise IntegronError(msg) from None
        return prots_info

    def __getitem__(self, prot_seq_id):
        """
        :param str prot_seq_id: the id of a protein sequence
        :return: The Sequence corresponding to the prot_seq_id.
        :rtype: :class:`Bio.SeqRecord` object
        """
        return self._prot_db[prot_seq_id]

    def __iter__(self):
        """
        :return: a generator which iterate on the protein seq_id which constitute the contig.
        :rtype: generator
        """
        return (seq_id for seq_id in self._info[4])

    def get_description(self, gene_id):
        """
        :param str gene_id: a protein/gene identifier
        :return: The description of the protein corresponding to the gene_id
        :rtype: :class:`SeqDesc` namedtuple object
        :raise IntegronError: when gene_id is not a valid Gembase gene identifier
        :raise KeyError: if gene_id is not found in GembaseDB instance
        """
        try:
            specie, date, strain, contig_gene = gene_id.split('.')
            contig_gene = contig_gene[1:]  # remove the first letter b/i
        except ValueError:
            raise IntegronError(f"'{gene_id}' is not a valid Gembase protein identifier.")
        pattern = fr'{specie}\.{date}\.{strain}\.\w?{contig_gene}'
        seq_info = self._info.loc[self._info[4].str.contains(pattern, regex=True)]
        if seq_info.empty:
            raise KeyError(gene_id)
        # column 2 holds the strand: "D" is mapped to 1, anything else to -1
        return SeqDesc(seq_info[4].values[0],
                       1 if seq_info[2].values[0] == "D" else -1,
                       seq_info[0].values[0],
                       seq_info[1].values[0],
                       )
class ProdigalDB(ProteinDB):
    """
    Creates proteins from Replicon/contig using prodigal and provide facilities to access them.
    """

    def _make_protfile(self, path=None):
        """
        Use `prodigal` to generate proteins corresponding to the replicon.
        prodigal is run only if no *path* is given and the protein file
        does not already exist in the temporary directory of the replicon.

        :param path: if given, nothing is generated and this path is returned as is
        :return: the path of the created protfile
        :rtype: str
        :raise RuntimeError: if prodigal cannot be run or ends with a non zero return code
        """
        assert self.cfg.prodigal, f"'prodigal' not found."
        if path:
            return path

        tmp_dir = self.cfg.tmp_dir(self.replicon.id)
        os.makedirs(tmp_dir, exist_ok=True)
        prot_file_path = os.path.join(tmp_dir, self.replicon.id + ".prt")
        if os.path.exists(prot_file_path):
            # reuse the proteins generated by a previous run
            return prot_file_path

        # Build the argument vector directly instead of escaping a string and
        # splitting it again: paths with spaces, quotes or backslashes are
        # passed to prodigal untouched.
        prodigal_args = [self.cfg.prodigal]
        if len(self.replicon) <= 200000:
            # '-p meta' is used unless the replicon is longer than 200000
            prodigal_args += ['-p', 'meta']
        prodigal_args += ['-i', self.replicon.path,
                          '-a', prot_file_path,
                          '-o', os.devnull,
                          '-q']
        # printable form, only used in log and error messages
        prodigal_cmd = ' '.join(shlex.quote(arg) for arg in prodigal_args)
        try:
            _log.debug("run prodigal: {}".format(prodigal_cmd))
            completed_process = subprocess.run(prodigal_args)
        except Exception as err:
            raise RuntimeError(f"{prodigal_cmd} : failed : {err}") from err
        if completed_process.returncode != 0:
            raise RuntimeError(f"{prodigal_cmd} : failed : prodigal returncode = {completed_process.returncode}")
        return prot_file_path

    def __getitem__(self, prot_seq_id):
        """
        :param str prot_seq_id: the id of a protein sequence
        :return: The Sequence corresponding to the prot_seq_id.
        :rtype: :class:`Bio.SeqRecord` object
        :raise IntegronError: if prot_seq_id is not found in the protein file
        """
        try:
            return self._prot_db[prot_seq_id]
        except KeyError:
            raise IntegronError(f"protein file does not contains '{prot_seq_id}' id. "
                                f"Try again with removing previous results dir {self.cfg.result_dir}")

    def __iter__(self):
        """
        :return: a generator which iterate on the protein seq_id which constitute the contig.
        :rtype: generator
        """
        return (seq_id for seq_id in self._prot_db)

    def get_description(self, gene_id):
        """
        Parse the description of the protein, expected to be formatted as
        ``id # start # stop # strand # ...``

        :param str gene_id: a protein/gene identifier
        :returns: The description of the protein corresponding to the gene_id
        :rtype: :class:`SeqDesc` namedtuple object
        :raise IntegronError: when gene_id is not found in ProdigalDB instance or
                              when its description is not a valid Prodigal description
        """
        seq = self[gene_id]
        try:
            id_, start, stop, strand, *_ = seq.description.split(" # ")
            # non numeric fields are reported like missing ones
            start = int(start)
            stop = int(stop)
            strand = int(strand)
        except ValueError:
            raise IntegronError(f"'{gene_id}' is not a valid Prodigal protein identifier.")
        return SeqDesc(id_, strand, start, stop)
class CustomDB(ProteinDB):
    """
    Gives access to the proteins of a protein file provided by the user.
    The description of each protein is parsed by the function `description_parser`
    loaded from the python file given by *cfg.annot_parser*.
    """

    def __init__(self, replicon, cfg, prot_file):
        """
        :param replicon: the replicon/contig the proteins belong to
        :param cfg: the configuration object, *cfg.annot_parser* is the path of the custom parser
        :param prot_file: the path to the protein file (mandatory)
        :raise IntegronError: if prot_file is None
        :raise RuntimeError: if the custom parser cannot be imported
        """
        super().__init__(replicon, cfg, prot_file=prot_file)
        # read outside the try block: the error message below needs it
        parser_path = self.cfg.annot_parser
        try:
            spec = importlib.util.spec_from_file_location('custom_module', parser_path)
            custom_module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(custom_module)
            self._parser = custom_module.description_parser
        except Exception as err:
            raise RuntimeError(f"Cannot import custom --annot-parser '{parser_path}': {err}") from err

    def _make_protfile(self, path=None):
        """
        Nothing is created, the protein file must be provided.

        :param path: the path to the protein file
        :return: *path* unchanged
        :raise IntegronError: if path is None
        """
        if path is None:
            raise IntegronError("If use CustomDB prot_file must be specified")
        return path

    def __getitem__(self, prot_seq_id):
        """
        :param str prot_seq_id: the id of a protein sequence
        :return: The Sequence corresponding to the prot_seq_id.
        :rtype: :class:`Bio.SeqRecord` object
        :raise IntegronError: if prot_seq_id is not found in the protein file
        """
        try:
            return self._prot_db[prot_seq_id]
        except KeyError:
            raise IntegronError(f"protein file does not contains '{prot_seq_id}' id. "
                                f"Check if it's the right proteins file {self._prot_file} "
                                f"or remove previous results dir {self.cfg.result_dir}")

    def __iter__(self):
        """
        :return: a generator which iterate on the protein seq_id which constitute the contig.
        :rtype: generator
        """
        return (seq_id for seq_id in self._prot_db)

    def get_description(self, gene_id):
        """
        Parse the description of the protein with the custom parser and check
        the returned values: id must be a str, start and stop must be int >= 0
        and strand must be 1 or -1.

        :param str gene_id: a protein/gene identifier
        :returns: The description of the protein corresponding to the gene_id
        :rtype: :class:`SeqDesc` namedtuple object
        :raise IntegronError: when gene_id is not found in CustomDB instance,
                              when the custom parser fails
                              or when it returns values which do not pass the checks above
        """
        def check_id(x):
            return isinstance(x, str)

        def check_strand(s):
            return s == 1 or s == -1

        def check_start(s):
            return isinstance(s, int) and s >= 0

        check_stop = check_start

        seq = self[gene_id]
        try:
            id_, start, stop, strand = self._parser(seq.description)
        except ValueError:
            # raised by the parser itself or by the unpacking
            msg = f"'{gene_id}' protein is not compliant with custom --annot-parser '{self.cfg.annot_parser}'."
            _log.critical(msg)
            raise IntegronError(msg)
        except Exception as err:
            msg = f"Cannot parse protein file '{self._prot_file}' with annot-parser '{self.cfg.annot_parser}': {err}"
            _log.critical(msg)
            raise IntegronError(msg) from None
        if not all((check_id(id_), check_start(start), check_stop(stop), check_strand(strand))):
            msg = "Error during protein file parsing: expected seq_id: str, start: positive int, stop: positive int, " \
                  f"strand 1/-1. got: {id_}, {start}, {stop}, {strand}"
            _log.critical(msg)
            raise IntegronError(msg)
        return SeqDesc(id_, strand, start, stop)