Source code for singlecellmultiomics.bamProcessing.bamToCountTable

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import singlecellmultiomics.modularDemultiplexer
import os
import sys
import pysam
import collections
import argparse
import pandas as pd
import numpy as np
import itertools
import singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods
import gzip  # for loading blacklist bedfiles
TagDefinitions = singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods.TagDefinitions


def coordinate_to_sliding_bin_locations(dp, bin_size, sliding_increment):
    """
    Convert a single coordinate to the range of overlapping sliding bins

    Parameters
    ----------
    dp : int
        coordinate to look up
    bin_size : int
        bin size
    sliding_increment : int
        sliding window offset, this is the increment between bins

    Returns
    -------
    start : int
        the start coordinate of the first overlapping bin
    end : int
        the end of the last overlapping bin
    start_id : int
        the index of the first overlapping bin
    end_id : int
        the index of the last overlapping bin
    """
    start_id = int(np.ceil((dp - bin_size) / sliding_increment))
    start = start_id * sliding_increment
    end_id = int(np.floor(dp / sliding_increment))
    end = end_id * sliding_increment + bin_size
    return start, end, start_id, end_id
def coordinate_to_bins(point, bin_size, sliding_increment):
    """
    Convert a single value to a list of overlapping bins

    Parameters
    ----------
    point : int
        coordinate to look up
    bin_size : int
        bin size
    sliding_increment : int
        sliding window offset, this is the increment between bins

    Returns
    -------
    list: [(bin_start, bin_end), .. ]
    """
    start, end, start_id, end_id = coordinate_to_sliding_bin_locations(
        point, bin_size, sliding_increment)
    return [(i * sliding_increment, i * sliding_increment + bin_size)
            for i in range(start_id, end_id + 1)]
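
# Worked example (illustrative values): with bin_size=1000 and
# sliding_increment=500, coordinate 1200 overlaps two sliding bins:
#   coordinate_to_bins(1200, 1000, 500) == [(500, 1500), (1000, 2000)]
# When sliding_increment equals bin_size the bins do not overlap and each
# coordinate maps to exactly one bin.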
def read_has_alternative_hits_to_non_alts(read):
    if read.has_tag('XA'):
        for alt_align in read.get_tag('XA').split(';'):
            if len(alt_align) == 0:  # Sometimes this tag is empty for some reason
                continue
            hchrom, hpos, hcigar, hflag = alt_align.split(',')
            if not hchrom.endswith('_alt'):
                return True
    return False
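
# For reference, a bwa-style XA tag holds a ';'-separated list of alternative
# alignments, each formatted as "chrom,pos,CIGAR,NM", e.g. (made-up value):
#   XA:Z:chr1,+15000,76M,1;chr1_alt,-3200,76M,2;
# The function above reports True only when at least one alternative hit lies
# on a contig whose name does not end in '_alt'.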
def readTag(read, tag, defective='None'):
    try:
        value = singlecellmultiomics.modularDemultiplexer.metaFromRead(read, tag)
    except Exception:
        value = defective
    return value
# define if a read should be used
def read_should_be_counted(read, args, blacklist_dic=None):
    """
    Check if a read should be counted given the filter arguments

    Parameters
    ----------
    read : pysam.AlignedSegment or None
        read to check if it should be counted

    Returns
    -------
    bool
    """
    # Reject missing, QC-failed, unmapped and (with --dedup) duplicated reads
    # first, before inspecting any other alignment attribute
    if read is None or read.is_qcfail:
        return False

    if read.is_unmapped or (args.dedup and (
            read.has_tag("RR") or read.is_duplicate)):
        return False

    if args.r1only and read.is_read2:
        return False
    if args.r2only and read.is_read1:
        return False

    if args.filterMP:
        if not read.has_tag('mp'):
            return False
        if read.get_tag('mp') != 'unique':
            return False

    # Mapping quality below threshold
    if read.mapping_quality < args.minMQ:
        return False

    if args.proper_pairs_only and not read.is_proper_pair:
        return False

    if args.no_indels and ('I' in read.cigarstring or 'D' in read.cigarstring):
        return False

    if args.max_base_edits is not None and read.has_tag('NM') and int(
            read.get_tag('NM')) > args.max_base_edits:
        return False

    if args.no_softclips and 'S' in read.cigarstring:
        return False

    # Read has alternative hits
    if args.filterXA:
        if read_has_alternative_hits_to_non_alts(read):
            return False

    # Read is in blacklist
    if blacklist_dic is not None:
        if read.reference_name in blacklist_dic:
            # iterate through (start, end) tuples to check
            for startend in blacklist_dic[read.reference_name]:
                # start is 0-based inclusive, end is 0-based exclusive
                start_bad = startend[0] <= read.reference_start < startend[1]
                end_bad = startend[0] <= read.reference_end < startend[1]
                if start_bad or end_bad:
                    return False

    return True
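
# Illustrative usage (hypothetical file name): `args` is an argparse.Namespace
# carrying the filter attributes defined by the CLI below (r1only, r2only,
# filterMP, minMQ, dedup, ...), e.g.
#   with pysam.AlignmentFile('tagged.bam') as handle:
#       kept = [r for r in handle if read_should_be_counted(r, args)]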
def tagToHumanName(tag, TagDefinitions):
    if tag not in TagDefinitions:
        return tag
    return TagDefinitions[tag].humanName
def assignReads(
        read,
        countTable,
        args,
        joinFeatures,
        featureTags,
        sampleTags,
        more_args=[],
        blacklist_dic=None):
    assigned = 0
    if not read_should_be_counted(read, args, blacklist_dic):
        return assigned

    # Get the sample to which this read belongs
    sample = tuple(readTag(read, tag) for tag in sampleTags)

    # Decide how many counts this read yields
    countToAdd = 1
    if args.r1only or args.r2only:
        countToAdd = 1
    elif not args.doNotDivideFragments:
        # If the read is paired and the mate mapped, count 0.5 per read so the
        # fragment ends up with 1 in total
        countToAdd = (0.5 if (read.is_paired and not read.mate_is_unmapped) else 1)

    assigned += 1

    if args.divideMultimapping:
        if read.has_tag('XA'):
            countToAdd = countToAdd / len(read.get_tag('XA').split(';'))
        elif read.has_tag('NH'):
            countToAdd = countToAdd / int(read.get_tag('NH'))

    # Define what counts to add to what samples
    # [ (sample, feature, increment), .. ]
    count_increment = []

    feature_dict = {}
    joined_feature = []
    used_features = []
    for tag in featureTags:
        feat = str(readTag(read, tag))
        feature_dict[tag] = feat
        if args.bin is not None and args.binTag == tag:
            continue
        if args.byValue is not None and tag == args.byValue:
            continue
        joined_feature.append(feat)
        used_features.append(tag)

    if joinFeatures:
        if args.splitFeatures:
            # Obtain all key states:
            key_states = []
            for tag in featureTags:
                value = feature_dict.get(tag)
                key_states.append(value.split(args.featureDelimiter))

            for state in itertools.product(*key_states):
                joined_feature = []
                feature_dict = {
                    feature: value for feature, value in zip(featureTags, state)
                }
                for feature, value in zip(featureTags, state):
                    if args.bin is not None and args.binTag == feature:
                        continue
                    if len(value) > 0:
                        joined_feature.append(value)
                    else:
                        joined_feature.append('None')

                if args.byValue is not None:
                    raise NotImplementedError(
                        'By value is not implemented for --splitFeatures')
                else:
                    count_increment.append({
                        'key': tuple(joined_feature),
                        'features': feature_dict,
                        'samples': [sample],
                        'increment': countToAdd})
        else:
            if args.byValue is not None:
                try:
                    add = float(feature_dict.get(args.byValue, 0))
                except ValueError:
                    add = 0
                count_increment.append({
                    'key': tuple(joined_feature),
                    'features': feature_dict,
                    'samples': [sample],
                    'increment': add})
            else:
                count_increment.append({
                    'key': tuple(joined_feature),
                    'features': feature_dict,
                    'samples': [sample],
                    'increment': countToAdd})
    else:
        if args.bin is not None:
            raise NotImplementedError('Try using -joinedFeatureTags')
        for feature, value in feature_dict.items():
            if args.byValue is not None and feature == args.byValue:
                try:
                    add = float(feature_dict.get(args.byValue, 0))
                except ValueError:
                    add = 0
                count_increment.append({
                    'key': tuple(joined_feature),
                    'features': feature_dict,
                    'samples': [sample],
                    'increment': add})
            elif args.splitFeatures:
                for f in value.split(args.featureDelimiter):
                    count_increment.append({
                        'key': (f, ),
                        'features': {feature: f},
                        'samples': [sample],
                        'increment': countToAdd})
            else:
                count_increment.append({
                    'key': (value, ),
                    'features': {feature: value},
                    'samples': [sample],
                    'increment': countToAdd})

    """
    Now we have a list of dicts:
    {
        'key': feature,
        'features': {feature: value},
        'samples': [sample],
        'increment': countToAdd
    }
    """

    # increment the count table accordingly:
    if args.bin is not None:
        for dtable in count_increment:
            key = dtable['key']
            countToAdd = dtable['increment']
            samples = dtable['samples']
            value_to_be_binned = dtable['features'].get(args.binTag, None)

            if value_to_be_binned is None or value_to_be_binned == 'None':
                continue

            for start, end in coordinate_to_bins(
                    int(value_to_be_binned), args.bin, args.sliding):
                # Reject bins outside boundary
                if not args.keepOverBounds and (
                        start < 0 or end > args.ref_lengths[read.reference_name]):
                    continue
                for sample in samples:
                    countTable[sample][tuple(list(key) + [start, end])] += countToAdd

    elif args.bedfile is not None:
        for dtable in count_increment:
            if args.byValue:
                key = [args.byValue]
            else:
                key = dtable['key']
            countToAdd = dtable['increment']
            samples = dtable['samples']

            # Get features from bedfile
            start, end, bname = more_args[0], more_args[1], more_args[2]
            jfeat = tuple(list(key) + [start, end, bname])
            if len(key):
                countTable[sample][jfeat] += countToAdd
            # else: this would also emit non-assigned reads
            # countTable[sample]['None'] += countToAdd
    else:
        for dtable in count_increment:
            key = dtable['key']
            countToAdd = dtable['increment']
            samples = dtable['samples']
            for sample in samples:
                if len(key) == 1:
                    countTable[sample][key[0]] += countToAdd
                else:
                    countTable[sample][key] += countToAdd
    return assigned
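
# Shape of the resulting table (illustrative): countTable maps each sample
# tuple to a Counter keyed by the (possibly binned) feature key, for example
#   {('sampleA',): Counter({('1', 0, 100000): 12.5, ('1', 100000, 200000): 3})}
# pd.DataFrame.from_dict() in create_count_table turns the outer keys into
# columns and the Counter keys into the row index.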
def create_count_table(args, return_df=False):
    if len(args.alignmentfiles) == 0:
        raise ValueError("Supply at least one bam file")

    if args.bedfile is not None:
        assert os.path.isfile(args.bedfile)

    if args.sliding is None:
        args.sliding = args.bin

    if not return_df and args.o is None and args.alignmentfiles is not None:
        args.showtags = True

    if args.showtags:
        # Find which tags are available in the file:
        head = 1000
        tagObs = collections.Counter()
        for bamFile in args.alignmentfiles:
            with pysam.AlignmentFile(bamFile) as f:
                for i, read in enumerate(f):
                    tagObs += collections.Counter(
                        [k for k, v in read.get_tags(with_value_type=False)])
                    if i == (head - 1):
                        break
        import colorama

        print(f'{colorama.Style.BRIGHT}Tags seen in the supplied bam file(s):{colorama.Style.RESET_ALL}')
        for observedTag in tagObs:
            tag = observedTag
            if observedTag in TagDefinitions:
                t = TagDefinitions[observedTag]
                humanName = t.humanName
                isPhred = t.isPhred
            else:
                t = observedTag
                isPhred = False
                humanName = f'{colorama.Style.RESET_ALL}<No information available>'

            allReadsHaveTag = (tagObs[tag] == head)
            color = colorama.Fore.GREEN if allReadsHaveTag else colorama.Fore.YELLOW
            print(
                f'{color}{colorama.Style.BRIGHT}{tag}{colorama.Style.RESET_ALL}\t{color + humanName}\t{colorama.Style.DIM}{"PHRED" if isPhred else ""}{colorama.Style.RESET_ALL}' +
                f'\t{"" if allReadsHaveTag else "Not all reads have this tag"}')

        print(f'{colorama.Style.BRIGHT}\nAVO lab tag list:{colorama.Style.RESET_ALL}')
        for tag, t in TagDefinitions.items():
            print(f'{colorama.Style.BRIGHT}{tag}{colorama.Style.RESET_ALL}\t{t.humanName}\t{colorama.Style.DIM}{"PHRED" if t.isPhred else ""}{colorama.Style.RESET_ALL}')
        exit()

    if args.o is None and not return_df:
        raise ValueError('Supply an output file')

    if args.alignmentfiles is None:
        raise ValueError('Supply alignment (BAM) files')

    joinFeatures = False
    featureTags = []
    if args.featureTags is not None:
        featureTags = args.featureTags.split(',')

    if args.joinedFeatureTags is not None:
        featureTags = args.joinedFeatureTags.split(',')
        joinFeatures = True

        # When -byValue is used and joined feature tags are supplied,
        # automatically append the args.byValue tag, otherwise it will get lost
        if args.byValue is not None and len(featureTags) > 0 and args.byValue not in featureTags:
            featureTags.append(args.byValue)

    if args.bin is not None and args.binTag not in featureTags:
        print("The bin tag was not supplied as feature, automatically appending the bin feature.")
        featureTags.append(args.binTag)

    if len(featureTags) == 0:
        raise ValueError(
            "No features supplied! Please supply -featureTags, -joinedFeatureTags and/or -binTag")

    sampleTags = args.sampleTags.split(',')
    countTable = collections.defaultdict(
        collections.Counter)  # cell -> feature -> count

    if args.blacklist is not None:
        # Create a blacklist dictionary {chromosome: [(start1, end1), ..., (startN, endN)]},
        # used to check each read and exclude it when it falls within any of these intervals
        blacklist_dic = {}
        print("Creating blacklist dictionary:")
        with open(args.blacklist, mode='r') as blfile:
            for row in blfile:
                parts = row.strip().split()
                chromo, start, end = parts[0], int(parts[1]), int(parts[2])
                if chromo not in blacklist_dic:  # init chromo
                    blacklist_dic[chromo] = []  # list of tuples
                blacklist_dic[chromo].append((start, end))
        print(blacklist_dic)
    else:
        blacklist_dic = None

    assigned = 0
    for bamFile in args.alignmentfiles:
        with pysam.AlignmentFile(bamFile) as f:
            i = 0  # make sure i is defined

            if args.bin:
                # Obtain the reference sequence lengths
                ref_lengths = {r: f.get_reference_length(r) for r in f.references}
                args.ref_lengths = ref_lengths

            if args.bedfile is None:
                # for adding counts associated with a tag OR with binning
                if args.contig is not None:
                    pysam_iterator = f.fetch(args.contig)
                else:
                    pysam_iterator = f

                for i, read in enumerate(pysam_iterator):
                    if i % 1_000_000 == 0:
                        print(f"{bamFile} Processed {i} reads, assigned {assigned}, completion:{100*(i/(0.001+f.mapped+f.unmapped+f.nocoordinate))}%")

                    if args.head is not None and i > args.head:
                        break

                    assigned += assignReads(read, countTable, args, joinFeatures,
                                            featureTags, sampleTags,
                                            blacklist_dic=blacklist_dic)
            else:  # args.bedfile is not None
                # for adding counts associated with a bedfile
                with open(args.bedfile, "r") as bfile:
                    # breader = csv.reader(bfile, delimiter="\t")
                    for row in bfile:
                        parts = row.strip().split()
                        chromo, start, end, bname = parts[0], int(
                            float(parts[1])), int(float(parts[2])), parts[3]
                        if args.contig is not None and chromo != args.contig:
                            continue
                        for i, read in enumerate(f.fetch(chromo, start, end)):
                            if i % 1_000_000 == 0:
                                print(f"{bamFile} Processed {i} reads, assigned {assigned}, completion:{100*(i/(0.001+f.mapped+f.unmapped+f.nocoordinate))}%")
                            assigned += assignReads(read, countTable, args,
                                                    joinFeatures, featureTags,
                                                    sampleTags,
                                                    more_args=[start, end, bname],
                                                    blacklist_dic=blacklist_dic)
                            if args.head is not None and i > args.head:
                                break

            print(f"Finished: {bamFile} Processed {i} reads, assigned {assigned}")

    print(f"Finished counting, now exporting to {args.o}")
    df = pd.DataFrame.from_dict(countTable)

    # Set names of indices
    if not args.noNames:
        df.columns.set_names([tagToHumanName(t, TagDefinitions)
                              for t in sampleTags], inplace=True)
        try:
            if args.bin is not None:
                index_names = [tagToHumanName(t, TagDefinitions)
                               for t in featureTags if t != args.binTag] + ['start', 'end']
                df.index.set_names(index_names, inplace=True)
            elif args.bedfile is not None:
                index_names = [tagToHumanName(t, TagDefinitions)
                               for t in featureTags if t != args.binTag] + ['start', 'end', 'bname']
                df.index.set_names(index_names, inplace=True)
            elif joinFeatures:
                index_names = [tagToHumanName(t, TagDefinitions) for t in featureTags]
                df.index.set_names(index_names, inplace=True)
            else:
                index_names = ','.join(
                    [tagToHumanName(t, TagDefinitions) for t in featureTags])
                df.index.set_names(index_names, inplace=True)
            print(index_names)
        except Exception:
            pass

    if return_df:
        return df

    if args.bulk:
        sum_row = df.sum(axis=1)
        df = pd.DataFrame(sum_row)
        df.columns = ["Bulkseq"]

    if args.o.endswith('.pickle') or args.o.endswith('.pickle.gz'):
        df.to_pickle(args.o)
    else:
        df.to_csv(args.o)
    print("Finished export.")
    return args.o
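
# Programmatic use (illustrative, not part of the CLI): calling
# create_count_table(args, return_df=True) returns the pandas DataFrame
# directly instead of writing it to args.o. `args` must then be a Namespace
# providing the attributes the argument parser below defines, e.g. (sketch,
# attribute list not exhaustive):
#   from argparse import Namespace
#   df = create_count_table(Namespace(alignmentfiles=['tagged.bam'], o=None, ...),
#                           return_df=True)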
if __name__ == '__main__':
    argparser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Convert BAM file(s) which have been annotated using featureCounts or otherwise to a count matrix')
    argparser.add_argument(
        '-o',
        type=str,
        help="output csv path, or a pickled pandas dataframe if the path ends with .pickle or .pickle.gz",
        required=False)
    argparser.add_argument(
        '-featureTags',
        type=str,
        default=None,
        help='Tag(s) used for defining the ROWS (index) of the matrix. Single dimension.')
    argparser.add_argument(
        '-joinedFeatureTags',
        type=str,
        default=None,
        help='These define the ROWS (index) of your matrix. For example if you want allele (DA) and restriction site (DS) use DA,DS. If you want a row level containing the chromosome mapped to, use "chrom" as feature. Use this argument if you want a multidimensional index')
    argparser.add_argument(
        '-sampleTags',
        type=str,
        default='SM',
        help='Comma separated tags defining the names of the COLUMNS in the output matrix')
    argparser.add_argument('alignmentfiles', type=str, nargs='*')
    argparser.add_argument(
        '-head',
        type=int,
        help='Run the algorithm only on the first N reads to check if the result looks like what you expect.')
    argparser.add_argument(
        '--bulk',
        action='store_true',
        help='Include this when making a count table for bulk sequencing data. This operation sums the counts of all sampleTags into a single column.')
    argparser.add_argument(
        '--r1only', action='store_true', help='Only count R1')
    argparser.add_argument(
        '--r2only', action='store_true', help='Only count R2')
    argparser.add_argument(
        '--splitFeatures',
        action='store_true',
        help='Split features by the feature delimiter (default ","). For example, if a read has the feature Foo,Bar the counts of both Foo and Bar are increased')
    argparser.add_argument('-featureDelimiter', type=str, default=',')
    argparser.add_argument(
        '-contig', type=str, help='Run only on this chromosome')

    multimapping_args = argparser.add_argument_group('Multimapping', '')
    multimapping_args.add_argument(
        '--divideMultimapping',
        action='store_true',
        help='Divide multimapping reads over all targets. Requires the XA or NH tag to be set.')
    multimapping_args.add_argument(
        '--doNotDivideFragments',
        action='store_true',
        help='When set, every read is counted once; a properly paired fragment then counts as two reads instead of 0.5 per read.')

    binning_args = argparser.add_argument_group('Binning', '')
    # binning_args.add_argument('-offset', type=int, default=0, help="Add offset to bin. If bin=1000, offset=200, f=1999 -> 1200. f=4199 -> 3200")
    binning_args.add_argument(
        '-sliding',
        type=int,
        help="Make bins overlapping; the step size equals the value supplied here. If nothing is supplied this value equals the bin size")
    binning_args.add_argument(
        '--keepOverBounds',
        action='store_true',
        help="Keep bins which extend beyond chromosome bounds (start < 0 or end > chromosome length)")
    binning_args.add_argument(
        '-bin',
        type=int,
        help="Divide and floor to bin features. If bin=1000, f=1999 -> 1000.")
    # binning_args.add_argument('--showBinEnd', action='store_true', help="If True, then show DS column as 120000-220000, otherwise 120000 only. This specifies the bin range in which the read was counted")  # this is now always on!
    binning_args.add_argument('-binTag', default='DS')
    binning_args.add_argument(
        '-byValue',
        type=str,
        help='Extract the value from the supplied tag and use this as the count to add')
    binning_args.add_argument(
        '-blacklist',
        metavar='BED FILE BLACKLIST TO FILTER OUT',
        help='Bedfile of blacklist regions to exclude',
        default=None)

    bed_args = argparser.add_argument_group('Bedfiles', '')
    bed_args.add_argument(
        '-bedfile',
        type=str,
        help="Bed file containing at least 4 columns (chromosome, start, end, name); counts are fetched per region")

    filters = argparser.add_argument_group('Filters', '')
    filters.add_argument(
        '--proper_pairs_only',
        action='store_true',
        help='Only count reads mapped in a proper pair (within the expected insert size range)')
    filters.add_argument(
        '--no_softclips',
        action='store_true',
        help='Only count reads without softclips')
    filters.add_argument(
        '-max_base_edits',
        type=int,
        help='Only count reads with at most this many bases differing from the reference (NM tag)')
    filters.add_argument(
        '--no_indels',
        action='store_true',
        help='Only count reads without indels')
    filters.add_argument(
        '--dedup',
        action='store_true',
        help='Count only the first occurrence of a molecule. Requires the RC tag to be set. Reads without the RC tag will be ignored!')
    filters.add_argument(
        '-minMQ', type=int, default=0, help="Minimum mapping quality")
    filters.add_argument(
        '--filterMP',
        action='store_true',
        help="Filter out reads which are not uniquely mappable; this is based on the presence of the `mp` tag, which can be generated by bamtagmultiome.py")
    filters.add_argument(
        '--filterXA',
        action='store_true',
        help="Do not count reads where the XA (alternative hits) tag has been set for a non-alternative locus.")

    argparser.add_argument(
        '--noNames',
        action='store_true',
        help='Do not set names of the index and columns; this simplifies the resulting CSV/pickle file')
    argparser.add_argument(
        '--showtags',
        action='store_true',
        help='Show a list of commonly used tags')

    args = argparser.parse_args()
    create_count_table(args)
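
# Example invocation (illustrative file names; flags as defined above):
#   bamToCountTable.py tagged.bam -o counts.csv \
#       -joinedFeatureTags chrom,DS -bin 100000 -sampleTags SM --dedup
# This writes a CSV with one column per sample (SM tag) and one row per
# (chromosome, bin start, bin end) combination.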