Source code for singlecellmultiomics.statistic.readcount

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import numpy as np
import pandas as pd
import collections
import matplotlib.pyplot as plt
from .statistic import Statistic
import singlecellmultiomics.pyutils as pyutils
import matplotlib
matplotlib.rcParams['figure.dpi'] = 160
matplotlib.use('Agg')


[docs]class ReadCount(Statistic):
    def __init__(self, args):
        Statistic.__init__(self, args)
        self.totalMappedReads = collections.Counter()
        self.rawReadCount = None
        self.unmappedReads = collections.Counter()
        self.totalDedupReads = collections.Counter()
        self.totalAssignedSiteReads = collections.Counter({'R1': 0, 'R2': 0})
        self.rejectionReasons = collections.Counter()
        self.demuxReadCount = 0
        self.rawReadCount = 0

[docs]    def get_df(self):
        df = pd.DataFrame.from_dict(dict(self))
        # They are paired and only present one time in the dict but are
        # expanded by pandas
        df['Raw reads'] /= 2
        df['Demultiplexed reads'] /= 2  # same

        df = df[['Raw reads',
                 'Demultiplexed reads',
                 'Mapped reads',
                 'AssignedSiteReads',
                 'Deduplicated reads']]
        return df

[docs]    def to_csv(self, path):
        self.get_df().to_csv(path)

[docs]    def plot(self, target_path, title=None):
        df = self.get_df()  # ,'UnmappedReads']]

        print(df)

        df.plot.bar(figsize=(10, 4)).legend(bbox_to_anchor=(1, 0.98))
        if title is not None:
            plt.title(title)

        plt.subplots_adjust(right=0.6)
        plt.tight_layout()
        plt.savefig(target_path)

        ax = plt.gca()
        ax.set_ylim(1, None)
        ax.set_ylabel('Frequency')
        ax.set_yscale('log')
        plt.tight_layout()
        plt.savefig(target_path.replace('.png', '.log.png'))

        plt.close()

[docs]    def processRead(self, R1,R2):

        for read in [R1,R2]:
            if read is None:
                continue
            # Count every read only once
            if read.is_supplementary or read.is_secondary:
                return

            if not read.is_unmapped:

                if read.is_read1:
                    self.totalMappedReads['R1'] += 1
                elif read.is_read2:
                    self.totalMappedReads['R2'] += 1
                else:
                    self.totalMappedReads['R?'] += 1
            else:
                if read.is_read1:
                    self.unmappedReads['R1'] += 1
                elif read.is_read2:
                    self.unmappedReads['R2'] += 1
                else:
                    self.unmappedReads['R?'] += 1

            # Compatibility with picard --TAG_DUPLICATE_SET_MEMBERS
            """
            java -jar `which picard.jar` MarkDuplicates I=sorted.bam O=marked_duplicates.bam M=marked_dup_metrics.txt TAG_DUPLICATE_SET_MEMBERS=1
            """
            if not read.has_tag('MX'):  # Bulk sample
                if not read.is_unmapped:
                    if read.is_duplicate:
                        pass
                    else:
                        if read.is_read1:
                            self.totalDedupReads['R1'] += 1
                        else:
                            self.totalDedupReads['R2'] += 1

            else:
                if not read.is_unmapped:
                    if (read.has_tag('DS') or read.has_tag('GN')) and not read.is_qcfail:
                        if read.is_read1:
                            self.totalAssignedSiteReads['R1'] += 1
                        elif read.is_read2:
                            self.totalAssignedSiteReads['R2'] += 1
                        else:
                            self.totalAssignedSiteReads['R?'] += 1

                    if not read.is_duplicate:

                        if read.is_read1:
                            self.totalDedupReads['R1'] += 1
                        elif read.is_read2:
                            self.totalDedupReads['R2'] += 1
                        else:
                            self.totalDedupReads['R?'] += 1

[docs]    def setRawReadCount(self, readCount, paired=True):
        self.rawReadCount = readCount * (2 if paired else 1)

[docs]    def setRawDemuxCount(self, readCount, paired=True):
        self.demuxReadCount = readCount * (2 if paired else 1)

[docs]    def mappability(self):
        if self.rawReadCount > 0:
            return sum(self.totalMappedReads.values()) / self.rawReadCount
        return None

    def __repr__(self):
        return f"Input:{int(self.rawReadCount)} raw reads, demultiplexed:{self.demuxReadCount}, final mapped: {self.totalMappedReads['R1']} R1 reads, {self.totalMappedReads['R2']} R2 reads, {self.totalMappedReads['R?']} unknown pairing reads, mappability:{self.mappability() if self.mappability() else 'Cannot be determined'}"

    def __iter__(self):
        yield 'Raw reads', self.rawReadCount
        yield 'Demultiplexed reads', self.demuxReadCount
        yield 'Mapped reads', self.totalMappedReads
        yield 'UnmappedReads', self.unmappedReads
        # if self.totalAssignedSiteReads['R1']>0 or
        # self.totalAssignedSiteReads['R2']>0 or
        # self.totalAssignedSiteReads['R?']>0:
        yield 'AssignedSiteReads', self.totalAssignedSiteReads
        yield 'Deduplicated reads', self.totalDedupReads