Source code for pacbio_data_processing.split_bam

#######################################################################
#
# Copyright (C) 2020 David Velázquez
# Copyright (C) 2021 David Palao
#
# This file is part of PacBio data processing.
#
#  PacBioDataProcessing is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  PacBio data processing is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with PacBioDataProcessing. If not, see <http://www.gnu.org/licenses/>.
#
#######################################################################

# flake8: noqa

import subprocess as sp
# import pandas as pd

# Shell command lines used by this script. The input file name
# ('body11.bam') is hard-coded into the first two commands.

# Run by main() to capture the header text of the input file.
SAM_HEADER = 'samtools view -H body11.bam'
# Run by main() to capture the alignment records, one per output line.
SAM_BODY = 'samtools view body11.bam'
# Only referenced from commented-out code in this module; presumably
# meant to convert SAM text back into BAM -- TODO confirm before use.
SAMTOOLS_WRITE_BAM = 'samtools view -S -bh'


def get_groups_lists(id_list, number):
    """Partition ``id_list`` into consecutive chunks of similar size.

    The chunk length is ``round(len(id_list) / number)``, but never less
    than 1. Because of the rounding, the number of chunks returned is
    close to, but not always exactly, ``number``: for instance 7 items
    with ``number=3`` produce 4 chunks of sizes 2, 2, 2 and 1.

    Args:
        id_list: A sliceable sequence, e.g. a 1-D numpy array or a list.
        number: Desired number of groups; must be at least 1.

    Returns:
        A list with the consecutive slices of ``id_list``, in order. The
        list is empty when ``id_list`` is empty.

    Raises:
        ValueError: If ``number`` is smaller than 1.
    """
    if number < 1:
        raise ValueError(f"number must be >= 1, got {number!r}")
    # len() instead of .size so that plain sequences are accepted too;
    # both agree for 1-D arrays.
    length = len(id_list)
    # round() yields 0 for an empty input or when there are fewer than
    # number/2 items, and range() rejects a zero step: fall back to one
    # item per chunk in that case.
    step = max(1, round(length / number))
    # Slicing clips at the end of the sequence, so the last chunk simply
    # holds whatever is left.
    return [id_list[start:start + step] for start in range(0, length, step)]
def main():
    """Split the records of the input BAM file into per-group text files.

    The function:

    1. runs ``SAM_HEADER`` and prints the captured header;
    2. runs ``SAM_BODY`` and splits every output line on whitespace;
    3. collects the unique values found in column 24 of the records
       (treated as the molecule identifier, going by the variable
       names -- TODO confirm the column meaning) and partitions them
       with ``get_groups_lists`` into about ``split_number`` groups;
    4. for every group, writes the records whose column-24 value belongs
       to the group to a tab-separated file named ``1``, ``2``, ... in
       the current directory. The header is not written to these files.

    Raises:
        subprocess.CalledProcessError: If a samtools command exits with
            a non-zero status.
    """
    import numpy as np

    # check=True: without it a failing samtools call only shows up later
    # as an unrelated indexing error on the empty record array.
    bam_header = sp.run(
        SAM_HEADER, capture_output=True, shell=True, text=True, check=True)
    bam_header_data = bam_header.stdout
    print(bam_header_data)

    # Read the BAM body, one list of fields per record.
    bam_body = sp.run(
        SAM_BODY, capture_output=True, shell=True, text=True, check=True)
    bam_body_data = [line.split() for line in bam_body.stdout.splitlines()]
    # The 2-D indexing below requires every record to have the same
    # number of fields (and at least 25 of them).
    bam_body_data_np = np.array(bam_body_data)

    unique_molecules_np = np.unique(bam_body_data_np[:, 24])
    split_number = 4
    split_ls = get_groups_lists(unique_molecules_np, split_number)

    # ###### method 1
    # file_name = 1
    # for s in split_ls:
    #     names = str(file_name) + '.sam'
    #     listX = bam_body_data_np[np.isin(bam_body_data_np[:,24], s)]
    #     file_name += file_name
    #     with open(names,'w') as f:
    #         f.write(bam_header_data)
    #         apnd = np.savetxt(f, listX, fmt='%s', delimiter='\t')
    #         f.write(str(apnd))

    # ###### method 2
    for file_name, group in enumerate(split_ls, start=1):
        names = str(file_name)
        in_group = np.isin(bam_body_data_np[:, 24], group)
        np.savetxt(names, bam_body_data_np[in_group],
                   fmt='%s', delimiter='\t')
    # file_name = 1
    # for s in split_ls:
    #     names = str(file_name)
    #     listX = bam_body_data_np[np.isin(bam_body_data_np[:,24], s)]
    #     with open(names, "w") as out_file:
    #         with sp.Popen(SAMTOOLS_WRITE_BAM + out_file, stdin=sp.PIPE,
    #                       stdout=out_file, bufsize=1) as new_bam_stream:
    #             apnd = np.savetxt(
    #                 new_bam_stream, listX, fmt='%s', delimiter='\t')
    #             new_bam_stream.stdin.write(bam_header_data)
    #             new_bam_stream.stdin.write(apnd)