Source code for singlecellmultiomics.libraryProcessing.libraryStatistics

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from singlecellmultiomics.statistic import *
import pandas as pd
import matplotlib.pyplot as plt
import os
import sys
import pysam
import collections
import argparse

from singlecellmultiomics.bamProcessing import bam_is_processed_by_program

from colorama import Fore
from colorama import Back
from colorama import Style
import singlecellmultiomics.pyutils as pyutils
from singlecellmultiomics.tagtools import tagtools
from pysamiterators import MatePairIteratorIncludingNonProper
import gzip
import pickle
import subprocess
from glob import glob

import matplotlib
matplotlib.rcParams['figure.dpi'] = 160
matplotlib.use('Agg')


[docs]def select_bam_file(lookup):
    for l in lookup:
        if os.path.exists(l):
            print(f'Found file at {l}')
            return l

    return None


[docs]def select_fastq_file(lookup):
    for paths in lookup:
        if isinstance(paths, tuple):
            for path in paths:
                if os.path.exists(path):
                    return paths
        elif os.path.exists(paths):
            return (paths,)
    return None


if __name__ == '__main__':
    argparser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Obtain statistics from your libraries')
    argparser.add_argument(
        'libraries',
        type=str,
        nargs='*',
        help="either a library structured folder, or the tagged BAM file")
    argparser.add_argument('-t', type=str, default='all-stats',
                           help="type of staistics to produce. options are \
            'demult-stats', 'meth-stats', 'chic-stats' and 'all-stats'")
    argparser.add_argument('-o', type=str, help="output file prefix")
    argparser.add_argument(
        '--plotsOnly',
        action='store_true',
        help="only make plots")
    argparser.add_argument(
        '--fatal',
        action='store_true',
        help="Fatal error on any issue")
    argparser.add_argument(
        '--sl',
        action='store_true',
        help="Show lookup paths")


    argparser.add_argument(
        '--tablesOnly',
        action='store_true',
        help="only make tables")

    argparser.add_argument('-head', type=int)
    argparser.add_argument(
        '-tagged_bam',
        type=str,
        help='Alias of subpath to tagged bam file. For example /tagged/sorted.bam')
    argparser.add_argument('--v', action='store_true')
    argparser.add_argument('--nort', action='store_true')
    argparser.add_argument('--nolorenz', action='store_true')
    args = argparser.parse_args()

    for library in args.libraries:
        if library.endswith('.bam'):
            # the library is a bam file..
            bamFile = os.path.abspath(library)
            library = os.path.dirname(os.path.abspath(bamFile))
            library_name = os.path.basename(os.path.abspath(bamFile))
            print("Bam file was supplied:")
            print(bamFile)
        else:
            print("A library was supplied, automatically detecting files ..")
            bamFile = None
            library_name = os.path.basename(library)
        rc = ReadCount(args)  # Is also mappability

        statistics = [
            rc,
            FragmentSizeHistogram(args),
            RejectionReasonHistogram(args),
            MappingQualityHistogram(args),
            OversequencingHistogram(args),
            CellReadCount(args)
        ]


        full_file_statistics = []

        if not args.nolorenz:
            full_file_statistics.append( Lorenz(args) )

        if(args.t in ['meth-stats', 'all-stats']):
            statistics.extend([
                MethylationContextHistogram(args),
                ConversionMatrix(args)
            ])

        if(args.t in ['chic-stats', 'all-stats']):
            statistics.extend([ScCHICLigation(args)])

        if(args.t in ['demult-stats', 'all-stats']):
            statistics.extend([
                TrimmingStats(args),
                AlleleHistogram(args),
                DataTypeHistogram(args),
                TagHistogram(args),
                PlateStatistic(args),
                PlateStatistic2(args)
            ])

        demuxFastqFilesLookup = [
            (f'{library}/demultiplexedR1.fastq.gz',
             f'{library}/demultiplexedR2.fastq.gz'),
            (f'{library}/demultiplexedR1_val_1.fq.gz',
             f'{library}/demultiplexedR2_val_2.fq.gz'),
            (f'{library}/demultiplexedR1_val_1.fq',
             f'{library}/demultiplexedR2_val_2.fq')]

        rejectFilesLookup = [
            (f'{library}/rejectsR1.fastq.gz', f'{library}/rejectsR2.fastq.gz'),
            (f'{library}/rejectsR1.fastq', f'{library}/rejectsR2.fastq'),
            (f'{library}/rejectsR1.fq.gz', f'{library}/rejectsR2.fq.gz'),
            (f'{library}/rejectsR1.fq', f'{library}/rejectsR2.fq'),
        ]

        taggedFilesLookup = [
            f'{library}/tagged.bam',
            f'{library}/tagged/tagged.bam',
            f'{library}/tagged/marked_duplicates.bam',
            f'{library}/tagged/resorted.featureCounts.bam',
            f'{library}/tagged/STAR_mappedAligned.sortedByCoord.out.featureCounts.bam',
            f'{library}/tagged/STAR_mappedAligned.sortedByCoord.out.bam',
            f'{library}/tagged/sorted.bam']
        if args.tagged_bam:
            taggedFilesLookup = [
                library + '/' + args.tagged_bam] + taggedFilesLookup

        if args.sl:
            print(f'{Style.BRIGHT}Demux file lookup paths{Style.RESET_ALL}')
            print(demuxFastqFilesLookup)
            print(f'{Style.BRIGHT}Reject fastq file lookup paths{Style.RESET_ALL}')
            print(rejectFilesLookup)
            print(f'{Style.BRIGHT}Tagged bam lookup paths{Style.RESET_ALL}')
            print(taggedFilesLookup)

        if 'cluster' in library:
            continue
        print(f'{Style.BRIGHT}Library {library}{Style.RESET_ALL}')
        # Check if the bam file is present
        if bamFile is None:
            bamFile = select_bam_file(taggedFilesLookup)

        if bamFile is None:
            # Perform glob expansion
            bams = list(glob(f'{library}/*.bam'))+list(glob(f'{library}/*/*.bam'))
            for bam_path in bams:
                print(f"Trying {bam_path}",end="\t")
                try:
                    with pysam.AlignmentFile(bam_path) as a:
                        if bam_is_processed_by_program(a, program='bamtagmultiome'):
                            bamFile = bam_path
                            print("[TAGGED]")
                            break
                        else:
                            print("[NOT TAGGED]")
                except Exception as e:
                    print(f"[ERROR] {e}")

        if bamFile is None:
            print(f'{Fore.RED}BAM FILE MISSING {library}{Style.RESET_ALL}')
            exit()
        else:
            print(f'{Fore.GREEN}Bam file at {bamFile}{Style.RESET_ALL}')

        demuxFastqFiles = select_fastq_file(demuxFastqFilesLookup)
        rejectFastqFiles = select_fastq_file(rejectFilesLookup)

        print("Selected files:")
        print(f'demultiplexed reads: {demuxFastqFiles}')
        print(f'rejected reads: {rejectFastqFiles}')
        print(f'tagged bam: {bamFile}')

        demuxReads = None
        rejectedReads = None
        if demuxFastqFiles is not None:
            firstMate = demuxFastqFiles[0]
            print(f'\tDemuxed > {firstMate}')
            if firstMate.endswith('.gz'):
                demuxReads = pyutils.wccountgz(firstMate) / 4
            else:
                demuxReads = pyutils.wccount(firstMate) / 4

        if rejectFastqFiles is not None:
            firstMate = rejectFastqFiles[0]
            print(f'\tRejects > {firstMate}')
            if firstMate.endswith('.gz'):
                rejectedReads = pyutils.wccountgz(firstMate) / 4
            else:
                rejectedReads = pyutils.wccount(firstMate) / 4

        if demuxReads is not None:
            rc.setRawDemuxCount(demuxReads, paired=True)

            if rejectedReads is not None:
                rc.setRawReadCount(rejectedReads + demuxReads, paired=True)

        if bamFile is not None and os.path.exists(bamFile):
            print(f'\tTagged > {bamFile}')
            with pysam.AlignmentFile(bamFile) as f:

                for i, (R1,R2) in enumerate(MatePairIteratorIncludingNonProper(f)):
                    for statistic in statistics:
                        statistic.processRead(R1,R2)
                    if args.head is not None and i >= (args.head - 1):
                        break
        else:
            print(f'Did not find a bam file at {bamFile}')

        statDict = {}

        if os.path.exists(
                f'{library}/tagged/STAR_mappedAligned.sortedByCoord.out.featureCounts.bam'):
            cmd = f'samtools view {bamFile} -F 4 -f 64 | cut -f 1 | sort | uniq | wc -l'
            out = subprocess.Popen(cmd,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT,
                                   shell=True
                                   ).communicate()[0]
            read1mapped = int(out.partition(b' ')[0])
            cmd = f'samtools view {bamFile} -F 4 -f 128 | cut -f 1 | sort | uniq | wc -l'
            out = subprocess.Popen(cmd,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT,
                                   shell=True
                                   ).communicate()[0]
            read2mapped = int(out.partition(b' ')[0])

            rc.totalMappedReads['R1'] = read1mapped
            rc.totalMappedReads['R2'] = read2mapped
            # Deduplicated reads have RC:i:1 set, -f 64 selects for R2
            cmd = f'samtools view {bamFile} -F 4 -f 64 | grep RC:i:1 | cut -f 1 | sort | uniq | wc -l'
            out = subprocess.Popen(cmd,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT,
                                   shell=True
                                   ).communicate()[0]
            read1mappeddedup = int(out.partition(b' ')[0])
            # Deduplicated reads have RC:i:1 set, -f 128 selects for R2
            cmd = f'samtools view {bamFile} -F 4 -f 128 | grep RC:i:1 |  cut -f 1 | sort | uniq | wc -l'
            out = subprocess.Popen(cmd,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT,
                                   shell=True
                                   ).communicate()[0]
            read2mappeddedup = int(out.partition(b' ')[0])

            rc.totalDedupReads['R1'] = read1mappeddedup
            rc.totalDedupReads['R2'] = read2mappeddedup

        for statistic in statistics:
            try:
                print(f'\t{statistic.__class__.__name__}')
                print(f'\t\t{statistic}\n')
                statDict[statistic.__class__.__name__] = dict(statistic)
                print(dict(statistic))
            except Exception as e:
                if args.v:
                    print(e)
                if args.fatal:
                    raise

        if bamFile is not None:
            for statistic in full_file_statistics:
                try:
                    with pysam.AlignmentFile(bamFile ) as alignments:
                        statistic.process_file(alignments)
                except Exception as e:
                    if args.v:
                        print(e)
                    if args.fatal:
                        raise



        # Make plots:
        if args.o is None:
            plot_dir = f'{library}/plots'
            table_dir = f'{library}/tables'
            statFile = f'{library}/statistics.pickle.gz'
        else:
            plot_dir = f'{args.o}/plots'
            table_dir = f'{args.o}/tables'
            statFile = f'{args.o}/statistics.pickle.gz'

        if not args.tablesOnly:

            if not os.path.exists(plot_dir):
                os.makedirs(plot_dir)
            for statistic in statistics + full_file_statistics:
                if not hasattr(statistic, 'plot'):
                    print(
                        f'Not making a plot for {statistic.__class__.__name__} as no plot method is defined')
                    continue
                try:
                    statistic.plot(
                        f'{plot_dir}/{statistic.__class__.__name__}.png',
                        title=library_name)
                except Exception as e:
                    if args.v:
                        import traceback
                        traceback.print_exc()

        # Make tables:
        if not args.plotsOnly:
            with gzip.open(statFile, 'wb') as f:
                pickle.dump(statDict, f)
            if os.path.exists(statFile):
                with gzip.open(statFile, 'rb') as f:
                    try:
                        statDict.update(pickle.load(f))
                    except Exception as e:
                        if args.fatal:
                            raise


            if not os.path.exists(table_dir):
                os.makedirs(table_dir)
            for statistic in statistics + full_file_statistics:
                if not hasattr(statistic, 'to_csv'):
                    print(
                        f'Not making a table for {statistic.__class__.__name__} as to_csv method is not defined')
                    continue
                try:
                    statistic.to_csv(
                        f'{table_dir}/{statistic.__class__.__name__}_{library_name}.csv')
                except Exception as e:
                    if args.fatal:
                        raise
                    if args.v:
                        import traceback
                        traceback.print_exc()


        # Make RT reaction plot:
        if bamFile is not None and os.path.exists(bamFile):
            if not args.nort:
                os.system(
                    f"bamPlotRTstats.py {bamFile} -head 2_000_000 --notstrict -o {plot_dir}/RT_")