Source code for singlecellmultiomics.statistic.readcount

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import numpy as np
import pandas as pd
import collections
import matplotlib.pyplot as plt
from .statistic import Statistic
import singlecellmultiomics.pyutils as pyutils
import matplotlib
matplotlib.rcParams['figure.dpi'] = 160
matplotlib.use('Agg')


[docs]class ReadCount(Statistic): def __init__(self, args): Statistic.__init__(self, args) self.totalMappedReads = collections.Counter() self.rawReadCount = None self.unmappedReads = collections.Counter() self.totalDedupReads = collections.Counter() self.totalAssignedSiteReads = collections.Counter({'R1': 0, 'R2': 0}) self.rejectionReasons = collections.Counter() self.demuxReadCount = 0 self.rawReadCount = 0
[docs] def get_df(self): df = pd.DataFrame.from_dict(dict(self)) # They are paired and only present one time in the dict but are # expanded by pandas df['Raw reads'] /= 2 df['Demultiplexed reads'] /= 2 # same df = df[['Raw reads', 'Demultiplexed reads', 'Mapped reads', 'AssignedSiteReads', 'Deduplicated reads']] return df
[docs] def to_csv(self, path): self.get_df().to_csv(path)
[docs] def plot(self, target_path, title=None): df = self.get_df() # ,'UnmappedReads']] print(df) df.plot.bar(figsize=(10, 4)).legend(bbox_to_anchor=(1, 0.98)) if title is not None: plt.title(title) plt.subplots_adjust(right=0.6) plt.tight_layout() plt.savefig(target_path) ax = plt.gca() ax.set_ylim(1, None) ax.set_ylabel('Frequency') ax.set_yscale('log') plt.tight_layout() plt.savefig(target_path.replace('.png', '.log.png')) plt.close()
[docs] def processRead(self, R1,R2): for read in [R1,R2]: if read is None: continue # Count every read only once if read.is_supplementary or read.is_secondary: return if not read.is_unmapped: if read.is_read1: self.totalMappedReads['R1'] += 1 elif read.is_read2: self.totalMappedReads['R2'] += 1 else: self.totalMappedReads['R?'] += 1 else: if read.is_read1: self.unmappedReads['R1'] += 1 elif read.is_read2: self.unmappedReads['R2'] += 1 else: self.unmappedReads['R?'] += 1 # Compatibility with picard --TAG_DUPLICATE_SET_MEMBERS """ java -jar `which picard.jar` MarkDuplicates I=sorted.bam O=marked_duplicates.bam M=marked_dup_metrics.txt TAG_DUPLICATE_SET_MEMBERS=1 """ if not read.has_tag('MX'): # Bulk sample if not read.is_unmapped: if read.is_duplicate: pass else: if read.is_read1: self.totalDedupReads['R1'] += 1 else: self.totalDedupReads['R2'] += 1 else: if not read.is_unmapped: if (read.has_tag('DS') or read.has_tag('GN')) and not read.is_qcfail: if read.is_read1: self.totalAssignedSiteReads['R1'] += 1 elif read.is_read2: self.totalAssignedSiteReads['R2'] += 1 else: self.totalAssignedSiteReads['R?'] += 1 if not read.is_duplicate: if read.is_read1: self.totalDedupReads['R1'] += 1 elif read.is_read2: self.totalDedupReads['R2'] += 1 else: self.totalDedupReads['R?'] += 1
[docs] def setRawReadCount(self, readCount, paired=True): self.rawReadCount = readCount * (2 if paired else 1)
[docs] def setRawDemuxCount(self, readCount, paired=True): self.demuxReadCount = readCount * (2 if paired else 1)
[docs] def mappability(self): if self.rawReadCount > 0: return sum(self.totalMappedReads.values()) / self.rawReadCount return None
def __repr__(self): return f"Input:{int(self.rawReadCount)} raw reads, demultiplexed:{self.demuxReadCount}, final mapped: {self.totalMappedReads['R1']} R1 reads, {self.totalMappedReads['R2']} R2 reads, {self.totalMappedReads['R?']} unknown pairing reads, mappability:{self.mappability() if self.mappability() else 'Cannot be determined'}" def __iter__(self): yield 'Raw reads', self.rawReadCount yield 'Demultiplexed reads', self.demuxReadCount yield 'Mapped reads', self.totalMappedReads yield 'UnmappedReads', self.unmappedReads # if self.totalAssignedSiteReads['R1']>0 or # self.totalAssignedSiteReads['R2']>0 or # self.totalAssignedSiteReads['R?']>0: yield 'AssignedSiteReads', self.totalAssignedSiteReads yield 'Deduplicated reads', self.totalDedupReads