Source code for singlecellmultiomics.universalBamTagger.tapsTagger

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pysam
import singlecellmultiomics.molecule
import singlecellmultiomics.fragment
import gzip
import collections
import matplotlib.pyplot as plt
import pandas as pd
import argparse
import pysamiterators
import sys
import os
import uuid
import singlecellmultiomics.bamProcessing.bamFunctions as bf
import singlecellmultiomics.features
import colorama
import numpy as np


[docs]class Fraction:
    def __init__(self):
        self.values = [0, 0]

    def __setitem__(self, key, value):
        self.values[key] = value

    def __getitem__(self, key):
        return self.values[key]

    def __float__(self):
        if sum(self.values) == 0:
            return np.nan
        return self.values[1] / (self.values[1] + self.values[0])


if __name__ == '__main__':

    argparser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Add methylation information to BAM file')

    argparser.add_argument('alignmentfile', type=str)
    argparser.add_argument(
        '-o',
        type=str,
        help="output BAM path",
        required=True)
    argparser.add_argument(
        '-bed',
        type=str,
        help="output bed file (base-level methylation status)",
        default=None,
        required=False)
    # these options need to go <- VB
    argparser.add_argument(
        '-table',
        type=str,
        help="output table alias",
        default=None,
        required=False)
    argparser.add_argument('-bin_size', type=int, default=250_000)
    argparser.add_argument('-sliding_increment', type=int, default=50_000)
    #
    argparser.add_argument(
        '-ref',
        type=str,
        required=False,
        help='path to reference fasta file ')
    argparser.add_argument(
        '-min_mq',
        default=20,
        type=int,
        help='Min mapping qual')
    argparser.add_argument('-uhd', default=1, type=int,
                           help='Umi hamming distance')
    argparser.add_argument(
        '-mem',
        default=40,
        type=int,
        help='Memory used per job')
    argparser.add_argument(
        '-time',
        default=52,
        type=int,
        help='Time requested per job')
    argparser.add_argument('-head', type=int)
    argparser.add_argument('-contig', type=str, help='contig to run on')
    argparser.add_argument('-method', type=str, help='nla or chic')
    argparser.add_argument(
        '-samples',
        type=str,
        help='Samples to select, separate with comma. For example CellA,CellC,CellZ',
        default=None)
    argparser.add_argument(
        '-context',
        type=str,
        help='Context to select, separate with comma. For example Z,H,X',
        default=None)

    argparser.add_argument('--stranded', action='store_true')
    argparser.add_argument(
        '--cluster',
        action='store_true',
        help='split by chromosomes and submit the job on cluster')
    argparser.add_argument(
        '--no_sort_index',
        action='store_true',
        help='do not sort and index the output bam')

    # Transcriptome splitting mode
    tr = argparser.add_argument_group('transcriptome specific settings')
    tr.add_argument(
        '--transcriptome',
        action='store_true',
        help='Label transcripts, requires exons and introns')
    tr.add_argument('-exons', type=str, help='Exon GTF file')
    tr.add_argument(
        '-introns',
        type=str,
        help='Intron GTF file, use exonGTF_to_intronGTF.py to create this file')
    tr.add_argument(
        '-recovery_umi_pool_radius',
        type=int,
        help='BP radius. When assigning transcripts without NLA site found, use this radius for molecule pooling',
        default=4)
    args = argparser.parse_args()

    samples = None if args.samples is None else set(args.samples.split(','))
    contexts = None if args.context is None else set(args.context.split(','))
#    set(
#        [x.upper() for x in  args.context.split(',')] +
#        [x.lower() for x in  args.context.split(',')])

    alignments = pysam.AlignmentFile(args.alignmentfile)
    # Auto detect reference:
    if args.ref is None:
        args.ref = bf.get_reference_from_pysam_alignmentFile(alignments)
        if args.ref is None:
            raise ValueError("Supply reference, -ref")

    if args.transcriptome:
        print(
            colorama.Style.BRIGHT +
            'Running in transcriptome recovery mode' +
            colorama.Style.RESET_ALL)
        if args.exons is None or args.introns is None:
            raise ValueError("Please supply both intron and exon GTF files")

    if args.cluster:
        if args.contig is None:
            # Create jobs for all chromosomes:
            temp_prefix = os.path.abspath(
                os.path.dirname(args.o)) + '/' + str(uuid.uuid4())
            hold_merge = []
            for chrom in alignments.references:
                if chrom.startswith('KN') or chrom.startswith('KZ') or chrom.startswith(
                        'chrUn') or chrom.endswith('_random') or 'ERCC' in chrom:
                    continue
                temp_bam_path = f'{temp_prefix}_{chrom}.bam'
                temp_bed_path = f'{temp_prefix}_{chrom}.bed'

                arguments = " ".join([x for x in sys.argv if not x == args.o and x != '-o']) + \
                    f" -contig {chrom} -o {temp_bam_path} -bed {temp_bed_path}"

                job = f'TAPS_{str(uuid.uuid4())}'
                os.system(
                    f'submission.py --silent' +
                    f' -y --py36 -time {args.time} -t 1 -m {args.mem} -N {job} " {arguments};"')
                hold_merge.append(job)

            hold = ','.join(hold_merge)
            os.system(
                f'submission.py --silent' +
                f' -y --py36 -time {args.time} -t 1 -m 10 -N {job} -hold {hold} " samtools merge {args.o} {temp_prefix}*.bam; samtools index {args.o}; rm {temp_prefix}*.ba*; cat {temp_prefix}*.bed > {args.bed}; rm {temp_prefix}*.bed"')
            exit()

    reference = pysamiterators.iterators.CachedFasta(pysam.FastaFile(args.ref))
    taps = singlecellmultiomics.molecule.TAPS(reference=reference)
    temp_out = f'{args.o}.temp.out.bam'

    # Obtain contig sizes:
    ref_lengths = {r: alignments.get_reference_length(
        r) for r in alignments.references}

    # Methylation dictionary: site->cell->value
    binned_data = collections.defaultdict(
        lambda: collections.defaultdict(Fraction))
    cell_count = collections.Counter()

    # Define molecule class arguments
    molecule_class_args = {
        'reference': reference,
        'taps': taps,
        'min_max_mapping_quality': args.min_mq
    }

    fragment_class_args = {'umi_hamming_distance': args.uhd}

    # transcriptome mode specific arguments: ####
    if args.transcriptome:

        transcriptome_features = singlecellmultiomics.features.FeatureContainer()
        print("Loading exons", end='\r')
        transcriptome_features.loadGTF(
            args.exons,
            select_feature_type=['exon'],
            identifierFields=(
                'exon_id',
                'gene_id'),
            store_all=True,
            contig=args.contig,
            head=None)

        print("Loading introns", end='\r')
        transcriptome_features.loadGTF(
            args.introns,
            select_feature_type=['intron'],
            identifierFields=['transcript_id'],
            store_all=True,
            contig=args.contig,
            head=None)
        print("All features loaded")

        rejected_reads = []  # Store all rejected, potential transcript reads

        # Add more molecule class arguments
        molecule_class_args.update({
            'features': transcriptome_features
        })

    # Method specific arguments
    if args.method == 'nla':
        molecule_class_args.update({'site_has_to_be_mapped': True})
    elif args.method == 'chic':
        fragment_class_args.update({'invert_strand': True})

    if args.transcriptome:
        if args.method == 'nla':
            molecule_class = singlecellmultiomics.molecule.AnnotatedTAPSNlaIIIMolecule
            fragment_class = singlecellmultiomics.fragment.NlaIIIFragment
        elif args.method == 'chic':
            molecule_class = singlecellmultiomics.molecule.AnnotatedTAPSCHICMolecule
            fragment_class = singlecellmultiomics.fragment.CHICFragment
        else:
            raise ValueError("Supply 'nla' or 'chic' for -method")
    else:
        if args.method == 'nla':
            molecule_class = singlecellmultiomics.molecule.TAPSNlaIIIMolecule
            fragment_class = singlecellmultiomics.fragment.NlaIIIFragment
        elif args.method == 'chic':
            molecule_class = singlecellmultiomics.molecule.TAPSCHICMolecule
            fragment_class = singlecellmultiomics.fragment.CHICFragment
        else:
            raise ValueError("Supply 'nla' or 'chic' for -method")

    ###############################################
    statistics = collections.defaultdict(collections.Counter)
    mcs = collections.Counter()  # methylation calls seen
    print(
        colorama.Style.BRIGHT +
        "Running TAPS tagging" +
        colorama.Style.RESET_ALL)
    if args.bed is not None:
        bed = open(args.bed, "w")
    with pysam.AlignmentFile(temp_out, "wb", header=alignments.header) as output:
        for i, molecule in enumerate(
                singlecellmultiomics.molecule.MoleculeIterator(
                    alignments=alignments,
                    molecule_class=molecule_class,
                    fragment_class=fragment_class,
                    fragment_class_args=fragment_class_args,
                    yield_invalid=True,
                    molecule_class_args=molecule_class_args,
                    contig=args.contig
                )):

            if args.head is not None and i >= args.head:
                print(
                    colorama.Style.BRIGHT +
                    colorama.Fore.RED +
                    f"Head was supplied, stopped at {i} molecules" +
                    colorama.Style.RESET_ALL)
                break
            statistics['Input']['molecules'] += 1
            statistics['Input']['fragments'] += len(molecule)

            # Set (chromosome) unique identifier
            molecule.set_meta('mi', f'NLA_{i}')
            if args.transcriptome:
                molecule.set_intron_exon_features()

            if samples is not None and molecule.sample not in samples:
                molecule.set_rejection_reason('sample_not_selected')
                if output is not None:
                    molecule.write_pysam(output)
                continue

            if args.transcriptome:
                if not molecule.is_valid():
                    if molecule.is_multimapped() or molecule.get_max_mapping_qual() < args.min_mq:
                        molecule.set_meta('RF', 'rejected_molecule_mq')
                        molecule.write_tags()
                        molecule.write_pysam(output)
                        statistics['Filtering']['low mapping quality'] += 1
                        statistics['Filtering']['rejected'] += 1
                        continue

                    rejected_reads.append(molecule[0].reads)
                    continue
                statistics['Filtering'][f'valid {args.method} molecule'] += 1
                if len(molecule.junctions):
                    molecule.set_meta('RF', 'transcript_junction')
                    molecule.set_meta('dt', 'RNA')
                    statistics['Data type detection']['RNA because junction found'] += 1
                else:
                    if len(molecule.genes) == 0:
                        molecule.set_meta('dt', 'DNA')
                        statistics['Data type detection']['DNA not mapping to gene'] += 1
                    else:
                        # Check if NLA III sites are skipped...
                        if args.method == 'nla':
                            skipped = molecule.get_undigested_site_count()
                            if skipped == 0:
                                molecule.set_meta('dt', 'DNA')
                            else:
                                molecule.set_meta('dt', 'RNA or DNA')
                        else:
                            molecule.set_meta('dt', 'RNA or DNA')
            else:
                if not molecule.is_valid():
                    statistics['Filtering'][f'not valid {args.method}'] += 1
                    molecule.set_meta('RF', 'rejected_molecule')
                    molecule.write_tags()
                    molecule.write_pysam(output)
                    continue
                statistics['Filtering'][f'valid {args.method} molecule'] += 1
                molecule.set_meta('RF', 'accepted_molecule')

            got_context_hit = False
            methylated_hits = 0
            unmethylated_hits = 0
            readString = []
            genomeString = []
            for (chromosome, location), call in molecule.methylation_call_dict.items():
                if call['context'] == '.':  # Only use calls concerning C's
                    continue

                got_context_hit += 1
                mcs[call['context']] += 1
                if call['context'].isupper():
                    methylated_hits += 1
                    readString.append(call['consensus'])
                    genomeString.append(call['reference_base'])
                else:
                    unmethylated_hits += 1
# NEED TO REMOVE THIS CODE ENTIRELY!! <- VB
                if args.table is not None:
                    for binIdx in singlecellmultiomics.utils.coordinate_to_bins(
                            location, args.bin_size, args.sliding_increment):
                        bin_start, bin_end = binIdx
                        if bin_start < 0 or bin_end > ref_lengths[molecule.chromosome]:
                            continue

                        if args.stranded:
                            binned_data[(chromosome, molecule.get_strand_repr(
                            ), binIdx)][molecule.get_sample()][call['context'].isupper()] += 1
                            cell_count[molecule.get_sample()] += 1
                        else:
                            binned_data[(chromosome, binIdx)][molecule.get_sample(
                            )][call['context'].isupper()] += 1
                            cell_count[molecule.get_sample()] += 1
###
                if args.bed is not None:
                    # Skip non-selected contexts only for table
                    if contexts is not None and call['context'] not in contexts:
                        continue
                    else:
                        # name = cell barcode + context
                        name = ":".join(
                            [molecule.sample.split("_")[-1], call['context']])
                        bed.write(
                            f'{chromosome}\t{location}\t{location+1}\t{name}\t1\t{molecule.get_strand_repr()}\n')

            refbase = '' if not genomeString else max(
                set(genomeString), key=genomeString.count)
            readbase = '' if not readString else max(
                set(readString), key=readString.count)

            readConversionString = None
            genomeConversionString = None
            # OT
            readConversionString = None
            genomeConversionString = None
            if readbase == 'T' and refbase == 'C' and molecule.get_strand() == 1:  # '+'
                readConversionString = 'CT'
                genomeConversionString = 'CT'
            # OB
            elif readbase == 'A' and refbase == 'G' and molecule.get_strand() == 0:  # '-'
                readConversionString = 'CT'
                genomeConversionString = 'GA'
            # CTOT
            elif readbase == 'A' and refbase == 'C' and molecule.get_strand() == 1:  # '+'
                readConversionString = 'GA'
                genomeConversionString = 'CT'
            # CTOB
            elif readbase == 'A' and refbase == 'G' and molecule.get_strand() == 0:  # '-'
                readConversionString = 'GA'
                genomeConversionString = 'GA'

            if readConversionString is not None:
                molecule.set_meta('XR', readConversionString)
            if genomeConversionString is not None:
                molecule.set_meta('XG', genomeConversionString)

            molecule.set_meta('ME', methylated_hits)
            molecule.set_meta('um', unmethylated_hits)
            statistics['Methylation']['methylated Cs'] += methylated_hits
            statistics['Methylation']['unmethylated Cs'] += unmethylated_hits
            molecule.write_tags()
            molecule.write_pysam(output)
# close bed
    if args.bed is not None:
        bed.close()

    if args.transcriptome:
        print(
            colorama.Style.BRIGHT +
            f"Running transcriptome recovery on {len(rejected_reads)} reads")
        for i, molecule in enumerate(
            singlecellmultiomics.molecule.MoleculeIterator(
                # plug in the possible_transcripts as read source
                alignments=rejected_reads,
                # Drop the TAPS and NLAIII checks
                molecule_class=singlecellmultiomics.molecule.FeatureAnnotatedMolecule,
                # Plain fragment, no NLAIII
                fragment_class=singlecellmultiomics.fragment.Fragment,
                fragment_class_args={
                    'umi_hamming_distance': args.uhd,
                    # this is the amount of bases R1 can shift to be assigned to the same molecule
                    'assignment_radius': args.recovery_umi_pool_radius
                },

                yield_invalid=True,
                molecule_class_args={
                        'features': transcriptome_features,
                    'reference': reference,
                            'min_max_mapping_quality': 20
                }
            )):
            if not molecule.is_valid():
                statistics['Filtering']['rejected at transcriptome recovery step'] += 1
                statistics['Filtering']['rejected'] += 1
                molecule.set_meta('RF', 'rejected_recovery_invalid')
                molecule.write_tags()
                molecule.write_pysam(output)
                continue

            molecule.set_meta('mi', f'TRAN_{i}')

            # Add gene annotations:
            molecule.annotate(0)
            molecule.set_intron_exon_features()
            if len(molecule.genes) == 0:
                molecule.set_meta('RF', 'rejected_recovery_no_gene')
                statistics['Filtering']['rejected_recovery_no_gene'] += 1
                molecule.write_tags()
                molecule.write_pysam(output)
                continue
            if len(molecule.junctions):
                molecule.set_meta('RF', 'recovered_transcript_junction')
                statistics['Filtering']['recovered_transcript_junction'] += 1
                statistics['Data type detection'][
                    f'RNA because junction found and no {args.method} site mapped'] += 1
            else:
                molecule.set_meta('RF', 'recovered_transcript_gene')
                statistics['Data type detection'][
                    f'RNA because gene found and no {args.method} site mapped'] += 1
                statistics['Filtering']['recovered_transcript_gene'] += 1
            molecule.set_meta('dt', 'RNA')
            statistics['Data type detection']['RNA'] += 1
            molecule.write_tags()
            molecule.write_pysam(output)

    # Show statistics:
    print(
        '\n' +
        colorama.Style.BRIGHT +
        'Statistics' +
        colorama.Style.RESET_ALL)
    for statistic_class in [
        'Input',
        'Filtering',
        'Data type detection',
            'Methylation']:
        print(f'{colorama.Style.BRIGHT} {statistic_class} {colorama.Style.RESET_ALL}')
        for statistic, value in statistics[statistic_class].most_common():
            print(f'  {statistic}\t{value}')

    print(f'{colorama.Style.BRIGHT} Methylation calls {colorama.Style.RESET_ALL}')
    for call, description in zip('zZxXhH',
                                 ['unmethylated C in CpG context (CG)',
                                  'methylated C in CpG context (CG)',
                                  'unmethylated C in CHG context ( C[ACT]G )',
                                  'methylated C in CHG context   ( C[ACT]G )',
                                  'unmethylated C in CHH context ( C[ACT][ACT] )',
                                  'methylated C in CHH context ( C[ACT][ACT] )'
                                  ]):

        if call.isupper():
            print(
                f'  {colorama.Style.BRIGHT}{call}{colorama.Style.RESET_ALL}\t{mcs[call]}',
                end='\t')
        else:
            print(f'  {call}\t{mcs[call]}', end='\t')
        print(f'{colorama.Style.DIM}{description}{colorama.Style.RESET_ALL}')

    print('\n')

    if args.table is not None:
        print(
            colorama.Style.BRIGHT +
            'Writing raw unmethylated counts' +
            colorama.Style.RESET_ALL)
        # Write raw counts:
        df = pd.DataFrame(
            {loc: {sample: binned_data[loc][sample][0] for sample in binned_data[loc]} for loc in binned_data})
        df.to_pickle(f'{args.table}_unmethylated_{args.contig}.pickle.gz')
        df.to_csv(f'{args.table}_unmethylated_{args.contig}.csv')
        del df

        print(
            colorama.Style.BRIGHT +
            'Writing raw methylated counts' +
            colorama.Style.RESET_ALL)
        df = pd.DataFrame(
            {loc: {sample: binned_data[loc][sample][1] for sample in binned_data[loc]} for loc in binned_data})
        df.to_pickle(f'{args.table}_methylated_{args.contig}.pickle.gz')
        df.to_csv(f'{args.table}_methylated_{args.contig}.csv')
        del df

        print(
            colorama.Style.BRIGHT +
            'Writing ratio tables' +
            colorama.Style.RESET_ALL)
        # cast all fractions to float
        for loc in binned_data:
            for sample in binned_data[loc]:
                binned_data[loc][sample] = float(binned_data[loc][sample])
        df = pd.DataFrame(binned_data)
        del binned_data
        if args.contig:
            df.to_pickle(f'{args.table}_ratio_{args.contig}.pickle.gz')
            df.to_csv(f'{args.table}_ratio_{args.contig}.csv')
        else:
            df.to_pickle(f'{args.table}_ratio.pickle.gz')
            df.to_csv(f'{args.table}_ratio.csv')

    print(
        colorama.Style.BRIGHT +
        'Sorting and indexing final file' +
        colorama.Style.RESET_ALL)
    # Sort and index
    # Perform a reheading, sort and index
    if not args.no_sort_index:
        cmd = f"""samtools sort {temp_out} > {args.o}; samtools index {args.o};
        rm {temp_out};
        """
    else:
        cmd = f"mv {temp_out} {args.o}"
    os.system(cmd)
    print("All done.")