#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import singlecellmultiomics.features
import matplotlib.pyplot as plt
import numpy as np
import itertools
import pandas as pd
import importlib
import singlecellmultiomics.universalBamTagger.universalBamTagger as ut
import pysamiterators.iterators as pyts
import matplotlib.lines as mlines
import os
import sys
import pysam
import collections
import argparse
import gzip
import pickle
import matplotlib
matplotlib.rcParams['figure.dpi'] = 160
matplotlib.use('Agg')
[docs]def nlaIII_molecule_acceptance_function(molecule):
first_read = molecule[0][0]
if first_read.mapping_quality < 60:
return False
reject = False
if first_read.has_tag('XA'):
for alt_align in first_read.get_tag('XA').split(';'):
if len(alt_align) == 0: # Sometimes this tag is empty for some reason
continue
hchrom, hpos, hcigar, hflag = alt_align.split(',')
if not hchrom.endswith('_alt'):
reject = True
break
return reject
if __name__ == '__main__':
argparser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description='Visualise feature density of a bam file. (Coverage around stop codons, start codons, genes etc)')
argparser.add_argument('bamFile', type=str)
argparser.add_argument(
'-head',
type=int,
help='Process this many molecules')
argparser.add_argument('-binSize', type=int, default=30)
argparser.add_argument(
'-maxfs',
type=int,
default=900,
help='X axis limit of fragment size plot')
argparser.add_argument('-maxOverseq', type=int, default=4)
argparser.add_argument('--notstrict', action='store_true')
argparser.add_argument('-o', type=str, default='RT_dist')
args = argparser.parse_args()
def dd():
return collections.defaultdict(collections.Counter)
fragment_distribution = collections.Counter()
fragment_distribution_raw = collections.defaultdict(
collections.Counter) # overseq -> fragmentisze -> counts
# Read size fragments
fragment_distribution_raw_rf = collections.defaultdict(
dd) # lib -> overseq -> fragmentisze (span) -> counts
gc_distribution = collections.Counter()
gc_frag_distribution = collections.defaultdict(
collections.Counter) # fragment size -> observed gc/at+gc ratio
# fragmentSize -> umi obs
rt_frag_distribution = collections.defaultdict(collections.Counter)
# fragment size -> amount of RT reactions -> count
observed_cuts = collections.defaultdict()
used = 0
used_reads = 0
gc_capture = False
with pysam.AlignmentFile(args.bamFile) as a:
for i, molecule in enumerate(
ut.MoleculeIterator_OLD(
a, umi_hamming_distance=1)):
if not args.notstrict and not nlaIII_molecule_acceptance_function(
molecule):
continue
if args.head is not None and used > args.head:
print('Stoppping, saw enough molecules')
break
rt_reactions = ut.molecule_to_random_primer_dict(molecule)
amount_of_rt_reactions = len(rt_reactions)
# this obtains the maximum fragment size:
frag_chrom, frag_start, frag_end = pyts.getListSpanningCoordinates(
[v for v in itertools.chain.from_iterable(molecule) if v is not None])
# Obtain the fragment sizes of all RT reactions:
rt_sizes = []
for (rt_end, hexamer), fragment in rt_reactions.items():
rt_chrom, rt_start, rt_end = pyts.getListSpanningCoordinates(
itertools.chain.from_iterable(fragment))
rt_sizes.append([rt_end - rt_start])
first_read = molecule[0][0]
site = first_read.get_tag('DS')
strand = first_read.get_tag('RS')
library = first_read.get_tag('LY')
# if gc_capture:
# sequence = reference.fetch(frag_chrom, frag_start, frag_end)
# gc = sequence.count('C')+ sequence.count('G')
# length = len(sequence)
used += 1
# if gc_capture:
# gc_distribution[gc/length] += 1
# gc_frag_distribution[fragment_size][gc/length] += 1
# fragment_distribution_raw[len(molecule)][fragment_size]+=1
if len(rt_sizes) == 0:
mean_rt_size = 0
else:
mean_rt_size = int(np.mean(rt_sizes))
fragment_distribution_raw_rf[library][len(
molecule)][mean_rt_size] += 1
rt_frag_distribution[mean_rt_size][len(rt_reactions)] += 1
used_reads += len(molecule)
bin_size = args.binSize
m_overseq = args.maxOverseq
with gzip.open(f'{args.o}_raw_data.pickle.gz', 'wb') as fo:
pickle.dump(fragment_distribution_raw_rf, fo)
for library in fragment_distribution_raw_rf:
fig, axes = plt.subplots(1, 1, figsize=(10, 7))
ax = axes
ax.set_title(
f'Read fragment size distribution\n{used} molecules / {used_reads} fragments analysed\n{library}')
table = {}
for overseq in range(1, m_overseq):
try:
# Rebin in 10bp bins:
rebinned = collections.Counter()
for f_size, obs in fragment_distribution_raw_rf[library][overseq].most_common(
):
rebinned[int(f_size / bin_size) * bin_size] += obs
obs_dist_fsize = np.array(list(rebinned.keys()))
obs_dist_freq = np.array(list(rebinned.values()))
sorting_order = np.argsort(obs_dist_fsize)
obs_dist_fsize = obs_dist_fsize[sorting_order]
obs_dist_freq = obs_dist_freq[sorting_order]
obs_dist_density = obs_dist_freq / np.sum(obs_dist_freq)
for i, x in enumerate(obs_dist_fsize):
table[(overseq, x)] = {'obs_dist_freq': obs_dist_freq[i],
'obs_dist_density': obs_dist_density[i]}
ax.plot(
obs_dist_fsize, obs_dist_density, c=(
overseq / m_overseq, 0, 0), label=f'{overseq} read fragments / umi')
ax.set_xlim(-10, args.maxfs)
ax.legend()
ax.set_xlabel('fragment size')
ax.set_ylabel('density')
except Exception as e:
print(e)
pass
plt.tight_layout()
plt.savefig(f'{args.o}_{library}.png')
pd.DataFrame(table).to_csv(f'{args.o}_{library}.csv')
try:
plt.close()
except Exception as e:
pass
# Export the table: