Source code for pacbio_data_processing.split_bam
#######################################################################
#
# Copyright (C) 2020 David Velázquez
# Copyright (C) 2021 David Palao
#
# This file is part of PacBio data processing.
#
# PacBioDataProcessing is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# PacBio data processing is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with PacBioDataProcessing. If not, see <http://www.gnu.org/licenses/>.
#
#######################################################################
# flake8: noqa
import subprocess as sp
# import pandas as pd
# Shell command lines handed to ``subprocess.run(..., shell=True)`` by main().
# The input file name ``body11.bam`` is hard-coded; no working directory is
# set by the caller, so it is looked up in the current working directory.

# ``samtools view -H``: print only the header of the BAM file, as SAM text.
SAM_HEADER = 'samtools view -H body11.bam'
# ``samtools view``: print the alignment records of the BAM file, as SAM text.
SAM_BODY = 'samtools view body11.bam'
# ``samtools view -S -bh``: read SAM input and emit BAM including the header.
# NOTE(review): not used by any active code visible in this module.
SAMTOOLS_WRITE_BAM = 'samtools view -S -bh'
def get_groups_lists(id_list, number):
    """Split *id_list* into at most *number* contiguous, balanced groups.

    The elements keep their original order. Group sizes differ by at most
    one, with the larger groups coming first (the same distribution that
    ``numpy.array_split`` uses). Groups that would be empty, which happens
    when *id_list* has fewer than *number* elements, are left out, so an
    empty input yields an empty list instead of raising.

    Args:
        id_list: a sliceable sequence that supports ``len()``; in this
            module it is the 1-D array returned by ``numpy.unique``.
        number: how many groups are wanted.

    Returns:
        A list of slices of *id_list* (slices of a NumPy array are views).
        A negative *number* gives an empty list.

    Raises:
        ZeroDivisionError: if *number* is 0.
    """
    length = len(id_list)
    # ``base`` elements go to every group; the first ``extra`` groups take
    # one more so that nothing is left over for an additional group.
    base, extra = divmod(length, number)
    groups = []
    start = 0
    for index in range(number):
        stop = start + base + (1 if index < extra else 0)
        if stop > start:
            groups.append(id_list[start:stop])
        start = stop
    return groups
def main():
    """Split the records of the hard-coded BAM file into text files.

    Steps:

    1. Run ``SAM_HEADER`` and print the captured header.
    2. Run ``SAM_BODY`` and split every output line on whitespace.
    3. Take the distinct values found in column 24 of the records and
       partition them with ``get_groups_lists`` (``split_number`` is 4).
    4. For each partition, write the records whose column 24 belongs to it
       to a tab-separated text file named ``1``, ``2``, ... in the current
       working directory. The header is printed but not written to them.

    Raises:
        subprocess.CalledProcessError: if a samtools command exits with a
            non-zero status. Without this check the failure would only
            show up later as an indexing error on an empty array.
    """
    import numpy as np

    # Saving the header
    bam_header = sp.run(
        SAM_HEADER, capture_output=True, shell=True, text=True, check=True
    )
    bam_header_data = bam_header.stdout
    print(bam_header_data)

    # Open the bam body
    bam_body = sp.run(
        SAM_BODY, capture_output=True, shell=True, text=True, check=True
    )
    bam_body_data = [line.split() for line in bam_body.stdout.splitlines()]
    bam_body_data_np = np.array(bam_body_data)

    # NOTE(review): column 24 is presumably the molecule identifier (going
    # by the variable names) -- confirm against the BAM layout. The 2-D
    # indexing also assumes every record has the same number of fields.
    molecule_column = bam_body_data_np[:, 24]
    unique_molecules_np = np.unique(molecule_column)

    split_number = 4
    split_ls = get_groups_lists(unique_molecules_np, split_number)

    # NOTE: two earlier experiments used to live here as commented-out
    # code: writing '<n>.sam' files that start with the header, and piping
    # the rows through SAMTOOLS_WRITE_BAM to produce BAM files. Only the
    # plain-text variant below is active.
    for file_name, molecules in enumerate(split_ls, start=1):
        rows = bam_body_data_np[np.isin(molecule_column, molecules)]
        np.savetxt(str(file_name), rows, fmt='%s', delimiter='\t')