Source code for singlecellmultiomics.libraryProcessing.libraryStatistics

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from singlecellmultiomics.statistic import *
import pandas as pd
import matplotlib.pyplot as plt
import os
import sys
import pysam
import collections
import argparse

from colorama import Fore
from colorama import Back
from colorama import Style
import singlecellmultiomics.pyutils as pyutils
from singlecellmultiomics.tagtools import tagtools
import pysamiterators.iterators as pysamIterators
import gzip
import pickle
import subprocess

import matplotlib
matplotlib.rcParams['figure.dpi'] = 160
matplotlib.use('Agg')


[docs]def select_bam_file(lookup): for l in lookup: print(f'Found file at {l}') if os.path.exists(l): return l return None
[docs]def select_fastq_file(lookup): for paths in lookup: if isinstance(paths, tuple): for path in paths: if os.path.exists(path): return paths elif os.path.exists(paths): return (paths,) return None
if __name__ == '__main__': argparser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description='Obtain statistics from your libraries') argparser.add_argument( 'libraries', type=str, nargs='*', help="either a library structured folder, or the tagged BAM file") argparser.add_argument('-t', type=str, default='all-stats', help="type of staistics to produce. options are \ 'demult-stats', 'meth-stats', 'chic-stats' and 'all-stats'") argparser.add_argument('-o', type=str, help="output file prefix") argparser.add_argument( '--plotsOnly', action='store_true', help="only make plots") argparser.add_argument( '--sl', action='store_true', help="Show lookup paths") argparser.add_argument( '--tablesOnly', action='store_true', help="only make tables") argparser.add_argument('-head', type=int) argparser.add_argument( '-tagged_bam', type=str, help='Alias of subpath to tagged bam file. For example /tagged/sorted.bam') argparser.add_argument('--v', action='store_true') argparser.add_argument('--nort', action='store_true') args = argparser.parse_args() for library in args.libraries: library_name = library if library.endswith('.bam'): # the library is a bam file.. bamFile = os.path.abspath(library) library = os.path.dirname(os.path.abspath(bamFile)) library_name = os.path.basename(os.path.abspath(bamFile)) print("Bam file was supplied:") print(bamFile) else: print("A library was supplied, automatically detecting files ..") bamFile = None rc = ReadCount(args) # Is also mappability statistics = [ rc, FragmentSizeHistogram(args), RejectionReasonHistogram(args), MappingQualityHistogram(args), OversequencingHistogram(args), CellReadCount(args) ] if(args.t in ['meth-stats', 'all-stats']): statistics.extend([ MethylationContextHistogram(args), ConversionMatrix(args) ]) if(args.t in ['chic-stats', 'all-stats']): statistics.extend([ScCHICLigation(args)]) if(args.t in ['demult-stats', 'all-stats']): statistics.extend([ TrimmingStats(args), AlleleHistogram(args), DataTypeHistogram(args), TagHistogram(args), PlateStatistic(args) ]) demuxFastqFilesLookup = [ (f'{library}/demultiplexedR1.fastq.gz', f'{library}/demultiplexedR2.fastq.gz'), (f'{library}/demultiplexedR1_val_1.fq.gz', f'{library}/demultiplexedR2_val_2.fq.gz'), (f'{library}/demultiplexedR1_val_1.fq', f'{library}/demultiplexedR2_val_2.fq')] rejectFilesLookup = [ (f'{library}/rejectsR1.fastq.gz', f'{library}/rejectsR2.fastq.gz'), (f'{library}/rejectsR1.fastq', f'{library}/rejectsR2.fastq'), (f'{library}/rejectsR1.fq.gz', f'{library}/rejectsR2.fq.gz'), (f'{library}/rejectsR1.fq', f'{library}/rejectsR2.fq'), ] taggedFilesLookup = [ f'{library}/tagged.bam', f'{library}/tagged/tagged.bam', f'{library}/tagged/marked_duplicates.bam', f'{library}/tagged/resorted.featureCounts.bam', f'{library}/tagged/STAR_mappedAligned.sortedByCoord.out.featureCounts.bam', f'{library}/tagged/STAR_mappedAligned.sortedByCoord.out.bam', f'{library}/tagged/sorted.bam'] if args.tagged_bam: taggedFilesLookup = [ library + '/' + args.tagged_bam] + taggedFilesLookup if args.sl: print(f'{Style.BRIGHT}Demux file lookup paths{Style.RESET_ALL}') print(demuxFastqFilesLookup) print(f'{Style.BRIGHT}Reject fastq file lookup paths{Style.RESET_ALL}') print(rejectFilesLookup) print(f'{Style.BRIGHT}Tagged bam lookup paths{Style.RESET_ALL}') print(taggedFilesLookup) if 'cluster' in library: continue print(f'{Style.BRIGHT}Library {library}{Style.RESET_ALL}') # Check if the bam file is present if bamFile is None: bamFile = select_bam_file(taggedFilesLookup) if bamFile is None: print(f'{Fore.RED}BAM FILE MISSING{library}{Style.RESET_ALL}') exit() else: print(f'{Fore.GREEN}Bam file at {bamFile}{Style.RESET_ALL}') demuxFastqFiles = select_fastq_file(demuxFastqFilesLookup) rejectFastqFiles = select_fastq_file(rejectFilesLookup) print("Selected files:") print(f'demultiplexed reads: {demuxFastqFiles}') print(f'rejected reads: {rejectFastqFiles}') print(f'tagged bam: {bamFile}') demuxReads = None rejectedReads = None if demuxFastqFiles is not None: firstMate = demuxFastqFiles[0] print(f'\tDemuxed > {firstMate}') if firstMate.endswith('.gz'): demuxReads = pyutils.wccountgz(firstMate) / 4 else: demuxReads = pyutils.wccount(firstMate) / 4 if rejectFastqFiles is not None: firstMate = rejectFastqFiles[0] print(f'\tRejects > {firstMate}') if firstMate.endswith('.gz'): rejectedReads = pyutils.wccountgz(firstMate) / 4 else: rejectedReads = pyutils.wccount(firstMate) / 4 if demuxReads is not None: rc.setRawDemuxCount(demuxReads, paired=True) if rejectedReads is not None: rc.setRawReadCount(rejectedReads + demuxReads, paired=True) if bamFile is not None and os.path.exists(bamFile): print(f'\tTagged > {bamFile}') with pysam.AlignmentFile(bamFile) as f: for i, read in enumerate(f): for statistic in statistics: statistic.processRead(read) if args.head is not None and i >= (args.head - 1): break else: print(f'Did not find a bam file at {bamFile}') statDict = {} if os.path.exists( f'{library}/tagged/STAR_mappedAligned.sortedByCoord.out.featureCounts.bam'): cmd = f'samtools view {bamFile} -F 4 -f 64 | cut -f 1 | sort | uniq | wc -l' out = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True ).communicate()[0] read1mapped = int(out.partition(b' ')[0]) cmd = f'samtools view {bamFile} -F 4 -f 128 | cut -f 1 | sort | uniq | wc -l' out = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True ).communicate()[0] read2mapped = int(out.partition(b' ')[0]) rc.totalMappedReads['R1'] = read1mapped rc.totalMappedReads['R2'] = read2mapped # Deduplicated reads have RC:i:1 set, -f 64 selects for R2 cmd = f'samtools view {bamFile} -F 4 -f 64 | grep RC:i:1 | cut -f 1 | sort | uniq | wc -l' out = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True ).communicate()[0] read1mappeddedup = int(out.partition(b' ')[0]) # Deduplicated reads have RC:i:1 set, -f 128 selects for R2 cmd = f'samtools view {bamFile} -F 4 -f 128 | grep RC:i:1 | cut -f 1 | sort | uniq | wc -l' out = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True ).communicate()[0] read2mappeddedup = int(out.partition(b' ')[0]) rc.totalDedupReads['R1'] = read1mappeddedup rc.totalDedupReads['R2'] = read2mappeddedup for statistic in statistics: try: print(f'\t{statistic.__class__.__name__}') print(f'\t\t{statistic}\n') statDict[statistic.__class__.__name__] = dict(statistic) print(dict(statistic)) except Exception as e: if args.v: print(e) # Make plots: if args.o is None: plot_dir = f'{library}/plots' table_dir = f'{library}/tables' statFile = f'{library}/statistics.pickle.gz' else: plot_dir = args.o table_dir = f'{args.o}/tables' statFile = f'{args.o}/statistics.pickle.gz' if not args.tablesOnly: if not os.path.exists(plot_dir): os.makedirs(plot_dir) for statistic in statistics: if not hasattr(statistic, 'plot'): print( f'Not making a plot for {statistic.__class__.__name__} as no plot method is defined') continue try: statistic.plot( f'{plot_dir}/{statistic.__class__.__name__}.png', title=library_name) except Exception as e: if args.v: import traceback traceback.print_exc() # Make tables: if not args.plotsOnly: with gzip.open(statFile, 'wb') as f: pickle.dump(statDict, f) if os.path.exists(statFile): with gzip.open(statFile, 'rb') as f: try: statDict.update(pickle.load(f)) except Exception as e: pass if not os.path.exists(table_dir): os.makedirs(table_dir) for statistic in statistics: if not hasattr(statistic, 'to_csv'): print( f'Not making a table for {statistic.__class__.__name__} as to_csv method is not defined') continue try: statistic.to_csv( f'{table_dir}/{statistic.__class__.__name__}_{library_name}.csv') except Exception as e: if args.v: import traceback traceback.print_exc() # Make RT reaction plot: if bamFile is not None and os.path.exists(bamFile): if not args.nort: os.system( f"bamPlotRTstats.py {bamFile} -head 2_000_000 --notstrict -o {plot_dir}/RT_")