#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from matplotlib.ticker import MaxNLocator
import matplotlib.pyplot as plt
from .statistic import StatisticHistogram
import singlecellmultiomics.pyutils as pyutils
import collections
import matplotlib
matplotlib.rcParams['figure.dpi'] = 160
matplotlib.use('Agg')
[docs]class OversequencingHistogram(StatisticHistogram):
def __init__(self, args):
StatisticHistogram.__init__(self, args)
self.histogram = collections.Counter()
[docs] def processRead(self, R1,R2=None):
for read in [R1, R2]:
if read is None:
continue
if read.has_tag('RC'):
overseq = read.get_tag('RC')
self.histogram[overseq] += 1
if overseq > 1:
self.histogram[overseq - 1] -= 1
# Compatibility with picard --TAG_DUPLICATE_SET_MEMBERS
"""
java -jar `which picard.jar` MarkDuplicates I=sorted.bam O=marked_duplicates.bam M=marked_dup_metrics.txt TAG_DUPLICATE_SET_MEMBERS=1
"""
elif read.has_tag('PG'):
if read.has_tag('DS') and not read.is_duplicate:
self.histogram[read.get_tag('DS')] += 1
else:
self.histogram[1] += 1
break
def __repr__(self):
return f'The average oversequencing is {pyutils.meanOfCounter(self.histogram)}, SD:{pyutils.varianceOfCounter(self.histogram)}'
[docs] def plot(self, target_path, title=None):
fig, ax = plt.subplots()
overseqRange = list(range(1, 20))
ax.scatter(overseqRange, [self.histogram[x] for x in overseqRange])
if title is not None:
ax.set_title(title)
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
ax.set_xlabel("Amount of fragments associated with molecule")
ax.set_ylabel("# Molecules")
plt.tight_layout()
plt.savefig(target_path)
ax = plt.gca()
ax.set_ylim(1, None)
ax.set_yscale('log')
plt.savefig(target_path.replace('.png', '.log.png'))
plt.close()