#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import singlecellmultiomics.modularDemultiplexer
import os
import sys
import pysam
import collections
import argparse
import pandas as pd
import numpy as np
import itertools
import singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods
TagDefinitions = singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods.TagDefinitions
def coordinate_to_sliding_bin_locations(dp, bin_size, sliding_increment):
"""
Map a coordinate to the range of sliding-window bins that overlap it
Parameters
----------
dp : int
coordinate to look up
bin_size : int
bin size
sliding_increment : int
sliding window offset, this is the increment between bins
Returns
-------
start : int
the start coordinate of the first overlapping bin
end : int
the end of the last overlapping bin
start_id : int
the index of the first overlapping bin
end_id : int
the index of the last overlapping bin
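Example (illustrative): with bin_size=1000 and sliding_increment=500,
the coordinate 1200 overlaps the bins starting at 500 and 1000:
>>> coordinate_to_sliding_bin_locations(1200, 1000, 500)
(500, 2000, 1, 2)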
"""
start_id = int(np.ceil(((dp - bin_size) / sliding_increment)))
start = start_id * sliding_increment
end_id = int(np.floor(dp / sliding_increment))
end = end_id * sliding_increment + bin_size
return start, end, start_id, end_id
def coordinate_to_bins(point, bin_size, sliding_increment):
"""
Convert a single value to a list of overlapping bins
Parameters
----------
point : int
coordinate to look up
bin_size : int
bin size
sliding_increment : int
sliding window offset, this is the increment between bins
Returns
-------
list: [(bin_start, bin_end), ...]
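Example (illustrative): with bin_size=1000 and sliding_increment=500,
the coordinate 1200 falls into two overlapping bins:
>>> coordinate_to_bins(1200, 1000, 500)
[(500, 1500), (1000, 2000)]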
"""
start, end, start_id, end_id = coordinate_to_sliding_bin_locations(
point, bin_size, sliding_increment)
return [(i * sliding_increment, i * sliding_increment + bin_size)
for i in range(start_id, end_id + 1)]
def read_has_alternative_hits_to_non_alts(read):
if read.has_tag('XA'):
for alt_align in read.get_tag('XA').split(';'):
if len(alt_align) == 0:  # the XA string is ';'-terminated, so splitting yields a trailing empty entry
continue
hchrom, hpos, hcigar, hflag = alt_align.split(',')
if not hchrom.endswith('_alt'):
return True
return False
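# Illustrative example (hypothetical values): a BWA-style XA tag stores
# semicolon-terminated entries of the form chrom,pos,CIGAR,NM, e.g.
#   XA:Z:chr1,+10500,50M,0;chr1_alt,-20300,50M,1;
# Here the first entry points at a non-ALT contig, so the function returns
# True; if every entry ended in '_alt' it would return False.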
def readTag(read, tag, defective='None'):
try:
value = singlecellmultiomics.modularDemultiplexer.metaFromRead(
read, tag)
except Exception:
value = defective
return value
# define if a read should be used
def read_should_be_counted(read, args, blacklist_dic = None):
"""
Check if a read should be counted given the filter arguments
Parameters
----------
read : pysam.AlignedSegment or None
read to check if it should be counted
Returns
-------
bool
"""
# Reject None, QC-fail and unmapped reads first: the attribute and
# cigarstring checks below would raise on None or unmapped reads
if read is None or read.is_qcfail or read.is_unmapped:
return False
if args.r1only and read.is_read2:
return False
if args.r2only and read.is_read1:
return False
if args.filterMP:
if not read.has_tag('mp'):
return False
if read.get_tag('mp')!='unique':
return False
# Mapping quality below threshold
if read.mapping_quality < args.minMQ:
return False
if args.proper_pairs_only and not read.is_proper_pair:
return False
if args.no_indels and ('I' in read.cigarstring or 'D' in read.cigarstring):
return False
if args.max_base_edits is not None and read.has_tag('NM') and int(read.get_tag('NM'))>args.max_base_edits:
return False
if args.no_softclips and 'S' in read.cigarstring:
return False
# Read has alternative hits
if args.filterXA:
if read_has_alternative_hits_to_non_alts(read):
return False
# Read is a duplicate
if args.dedup and (read.has_tag("RR") or read.is_duplicate):
return False
# Read is in blacklist
if blacklist_dic is not None:
if read.reference_name in blacklist_dic:
# iterate through (start, end) tuples and reject any overlap
for bl_start, bl_end in blacklist_dic[read.reference_name]:
# start is 0-based inclusive, end is 0-based exclusive; a read overlaps
# the region iff it starts before the region end and ends after the region start
if read.reference_start < bl_end and read.reference_end > bl_start:
return False
return True
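# Minimal usage sketch (hedged; not executed here, all values hypothetical):
# build an in-memory pysam read and check it against a filter namespace.
#
#   import pysam, argparse
#   header = pysam.AlignmentHeader.from_dict(
#       {'HD': {'VN': '1.6'}, 'SQ': [{'SN': 'chr1', 'LN': 1_000_000}]})
#   read = pysam.AlignedSegment(header)
#   read.query_name = 'example_read'
#   read.reference_id = 0
#   read.reference_start = 100
#   read.query_sequence = 'A' * 50
#   read.cigarstring = '50M'
#   read.mapping_quality = 60
#   args = argparse.Namespace(
#       r1only=False, r2only=False, filterMP=False, minMQ=20,
#       proper_pairs_only=False, no_indels=False, max_base_edits=None,
#       no_softclips=False, filterXA=False, dedup=False)
#   read_should_be_counted(read, args)  # -> True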
def tagToHumanName(tag, TagDefinitions):
if tag not in TagDefinitions:
return tag
return TagDefinitions[tag].humanName
def assignReads(
read,
countTable,
args,
joinFeatures,
featureTags,
sampleTags,
more_args=(),  # immutable default; callers pass [start, end, bname]
blacklist_dic = None):
assigned = 0
if not read_should_be_counted(read, args, blacklist_dic):
return assigned
# Get the sample to which this read belongs
sample = tuple(readTag(read, tag) for tag in sampleTags)
# Decide how many counts this read yields
countToAdd = 1
if args.r1only or args.r2only:
countToAdd = 1
elif not args.doNotDivideFragments:
# If the read is paired and its mate mapped, count 0.5 per read so the
# fragment sums to 1 in total
countToAdd = (0.5 if (read.is_paired and not read.mate_is_unmapped) else 1)
assigned += 1
if args.divideMultimapping:
if read.has_tag('XA'):
# the XA string is typically ';'-terminated, so splitting yields one empty
# trailing element; len() therefore equals the number of alternative hits
# plus one (the primary alignment), which is the desired divisor
countToAdd = countToAdd / len(read.get_tag('XA').split(';'))
elif read.has_tag('NH'):
countToAdd = countToAdd / int(read.get_tag('NH'))
# Define what counts to add to what samples
# [ (sample, feature, increment), .. ]
count_increment = []
feature_dict = {}
joined_feature = []
used_features = []
for tag in featureTags:
feat = str(readTag(read, tag))
feature_dict[tag] = feat
if args.bin is not None and args.binTag == tag:
continue
if args.byValue is not None and tag == args.byValue:
continue
joined_feature.append(feat)
used_features.append(tag)
if joinFeatures:
if args.splitFeatures:
# Obtain all key states:
key_states = []
for tag in featureTags:
value = feature_dict.get(tag)
key_states.append(value.split(args.featureDelimiter))
for state in itertools.product(*key_states):
joined_feature = []
feature_dict = {
feature: value
for feature, value in zip(featureTags, state)
}
for feature, value in zip(featureTags, state):
if args.bin is not None and args.binTag == feature:
continue
if len(value) > 0:
joined_feature.append(value)
else:
joined_feature.append('None')
if args.byValue is not None:
raise NotImplementedError(
'By value is not implemented for --splitFeatures')
else:
count_increment.append({
'key': tuple(joined_feature),
'features': feature_dict,
'samples': [sample],
'increment': countToAdd})
else:
if args.byValue is not None:
try:
add = float(feature_dict.get(args.byValue, 0))
except ValueError:
add = 0
count_increment.append({
'key': tuple(joined_feature),
'features': feature_dict,
'samples': [sample],
'increment': add})
else:
count_increment.append({
'key': tuple(joined_feature),
'features': feature_dict,
'samples': [sample],
'increment': countToAdd})
else:
if args.bin is not None:
raise NotImplementedError('Try using -joinedFeatureTags')
for feature, value in feature_dict.items():
if args.byValue is not None and feature == args.byValue:
try:
add = float(feature_dict.get(args.byValue, 0))
except ValueError:
add = 0
count_increment.append({
'key': tuple(joined_feature),
'features': feature_dict,
'samples': [sample],
'increment': add})
elif args.splitFeatures:
for f in value.split(args.featureDelimiter):
count_increment.append({
'key': (f, ),  # one-element tuple, consistent with the other branches
'features': {feature: f},
'samples': [sample],
'increment': countToAdd
})
else:
count_increment.append({
'key': (value, ),
'features': {feature: value},
'samples': [sample],
'increment': countToAdd})
"""
Now we have a list of dicts:
{
'key':feature,
'features':{feature:value},
'samples':[sample],
'increment':countToAdd})
}
"""
# increment the count table accordingly:
if args.bin is not None:
for dtable in count_increment:
key = dtable['key']
countToAdd = dtable['increment']
samples = dtable['samples']
value_to_be_binned = dtable['features'].get(args.binTag, None)
if value_to_be_binned is None or value_to_be_binned == 'None':
continue
for start, end in coordinate_to_bins(
int(value_to_be_binned), args.bin, args.sliding):
# Reject bins outside boundary
if not args.keepOverBounds and (
start < 0 or end > args.ref_lengths[read.reference_name]):
continue
for sample in samples:
countTable[sample][tuple(
list(key) + [start, end])] += countToAdd
elif args.bedfile is not None:
for dtable in count_increment:
if args.byValue is not None:
key = [args.byValue]
else:
key = dtable['key']
countToAdd = dtable['increment']
samples = dtable['samples']
# Get features from bedfile
start, end, bname = more_args[0], more_args[1], more_args[2]
jfeat = tuple(list(key) + [start, end, bname])
if len(key):
for sample in samples:
countTable[sample][jfeat] += countToAdd
# else: this will also emit non assigned reads
# countTable[sample][ 'None' ] += countToAdd
else:
for dtable in count_increment:
key = dtable['key']
countToAdd = dtable['increment']
samples = dtable['samples']
for sample in samples:
if len(key) == 1:
countTable[sample][key[0]] += countToAdd
else:
countTable[sample][key] += countToAdd
return assigned
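# Illustrative sketch (hypothetical values): after processing, countTable is a
# {sample_tuple: Counter} mapping such as
#   {('cellA',): collections.Counter({('GeneX',): 1.0, ('GeneY',): 0.5})}
# pd.DataFrame.from_dict(countTable) later turns this into a matrix with one
# column per sample and one row per feature key.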
def create_count_table(args, return_df=False):
if len(args.alignmentfiles) == 0:
raise ValueError("Supply at least one bam file")
if args.bedfile is not None:
assert(os.path.isfile(args.bedfile))
if args.sliding is None:
args.sliding = args.bin
if not return_df and args.o is None and args.alignmentfiles is not None:
args.showtags = True
if args.showtags:
# Find which tags are available in the file(s):
head = 1000
tagObs = collections.Counter()
inspected = 0  # total number of reads inspected across all files
for bamFile in args.alignmentfiles:
with pysam.AlignmentFile(bamFile) as f:
for i, read in enumerate(f):
tagObs += collections.Counter(
[k for k, v in read.get_tags(with_value_type=False)])
inspected += 1
if i == (head - 1):
break
import colorama
print(
f'{colorama.Style.BRIGHT}Tags seen in the supplied bam file(s):{colorama.Style.RESET_ALL}')
for observedTag in tagObs:
tag = observedTag
if observedTag in TagDefinitions:
t = TagDefinitions[observedTag]
humanName = t.humanName
isPhred = t.isPhred
else:
t = observedTag
isPhred = False
humanName = f'{colorama.Style.RESET_ALL}<No information available>'
allReadsHaveTag = (tagObs[tag] == inspected)
color = colorama.Fore.GREEN if allReadsHaveTag else colorama.Fore.YELLOW
print(
f'{color}{ colorama.Style.BRIGHT}{tag}{colorama.Style.RESET_ALL}\t{color+humanName}\t{colorama.Style.DIM}{"PHRED" if isPhred else ""}{colorama.Style.RESET_ALL}' +
f'\t{"" if allReadsHaveTag else "Not all reads have this tag"}')
print(f'{colorama.Style.BRIGHT}\nAVO lab tag list:{colorama.Style.RESET_ALL}')
for tag, t in TagDefinitions.items():
print(f'{colorama.Style.BRIGHT}{tag}{colorama.Style.RESET_ALL}\t{t.humanName}\t{colorama.Style.DIM}{"PHRED" if t.isPhred else ""}{colorama.Style.RESET_ALL}')
sys.exit()
if args.o is None and not return_df:
raise ValueError('Supply an output file')
if args.alignmentfiles is None:
raise ValueError('Supply alignment (BAM) files')
joinFeatures = False
featureTags = []
if args.featureTags is not None:
featureTags = args.featureTags.split(',')
if args.joinedFeatureTags is not None:
featureTags = args.joinedFeatureTags.split(',')
joinFeatures = True
# When -byValue is used, and joined feature tags are supplied, automatically append the args.byValue tag, otherwise it will get lost
if args.byValue is not None and len(featureTags)>0 and args.byValue not in featureTags:
featureTags.append(args.byValue)
if args.bin is not None and args.binTag not in featureTags:
print("The bin tag was not supplied as feature, automatically appending the bin feature.")
featureTags.append(args.binTag)
if len(featureTags) == 0:
raise ValueError(
"No features supplied! Please supply -featureTags, -joinedFeatureTags and/or -binTag")
sampleTags = args.sampleTags.split(',')
countTable = collections.defaultdict(
collections.Counter) # cell->feature->count
if args.blacklist is not None:
# create blacklist dictionary {chromosome : [ (start1, end1), ..., (startN, endN) ]}
# used to check each read and exclude if it is within any of these start end sites
#
blacklist_dic = {}
print("Creating blacklist dictionary:")
with open(args.blacklist, mode='r') as blfile:
for row in blfile:
parts = row.strip().split()
chromo, start, end = parts[0], int(parts[1]), int(parts[2])
if chromo not in blacklist_dic:
# init chromo
blacklist_dic[chromo] = [] # list of tuples
blacklist_dic[chromo].append( (start, end) )
print(f"Blacklist loaded: {sum(len(v) for v in blacklist_dic.values())} regions on {len(blacklist_dic)} contigs")
else:
blacklist_dic = None
assigned = 0
for bamFile in args.alignmentfiles:
with pysam.AlignmentFile(bamFile) as f:
i = 0 # make sure i is defined
if args.bin:
# Obtain the reference sequence lengths
ref_lengths = {
r: f.get_reference_length(r) for r in f.references}
args.ref_lengths = ref_lengths
if args.bedfile is None:
# for adding counts associated with a tag OR with binning
if args.contig is not None:
pysam_iterator = f.fetch(args.contig)
else:
pysam_iterator = f
for i, read in enumerate(pysam_iterator):
if i % 1_000_000 == 0:
print(
f"{bamFile} Processed {i} reads, assigned {assigned}, completion:{100*(i/(0.001+f.mapped+f.unmapped+f.nocoordinate))}%")
if args.head is not None and i > args.head:
break
assigned += assignReads(read,
countTable,
args,
joinFeatures,
featureTags,
sampleTags,
blacklist_dic = blacklist_dic)
else: # args.bedfile is not None
# for adding counts associated with a bedfile
with open(args.bedfile, "r") as bfile:
#breader = csv.reader(bfile, delimiter = "\t")
for row in bfile:
parts = row.strip().split()
chromo, start, end, bname = parts[0], int(float(
parts[1])), int(float(parts[2])), parts[3]
if args.contig is not None and chromo != args.contig:
continue
for i, read in enumerate(f.fetch(chromo, start, end)):
if i % 1_000_000 == 0:
print(
f"{bamFile} Processed {i} reads, assigned {assigned}, completion:{100*(i/(0.001+f.mapped+f.unmapped+f.nocoordinate))}%")
assigned += assignReads(read,
countTable,
args,
joinFeatures,
featureTags,
sampleTags,
more_args=[start,
end,
bname],
blacklist_dic = blacklist_dic)
if args.head is not None and i > args.head:
break
print(
f"Finished: {bamFile} Processed {i} reads, assigned {assigned}")
print(f"Finished counting, now exporting to {args.o}")
df = pd.DataFrame.from_dict(countTable)
# Set names of indices
if not args.noNames:
df.columns.set_names([tagToHumanName(t, TagDefinitions)
for t in sampleTags], inplace=True)
index_names = None
try:
if args.bin is not None:
index_names = [tagToHumanName(
t, TagDefinitions) for t in featureTags if t != args.binTag] + ['start', 'end']
df.index.set_names(index_names, inplace=True)
elif args.bedfile is not None:
index_names = [tagToHumanName(
t, TagDefinitions) for t in featureTags if t != args.binTag] + ['start', 'end', 'bname']
df.index.set_names(index_names, inplace=True)
elif joinFeatures:
index_names = [
tagToHumanName(
t, TagDefinitions) for t in featureTags]
df.index.set_names(index_names, inplace=True)
else:
index_names = ','.join(
[tagToHumanName(t, TagDefinitions) for t in featureTags])
df.index.set_names(index_names, inplace=True)
except Exception:
# setting index names is cosmetic; never fail the export over it
pass
print(index_names)
if return_df:
return df
if args.bulk:
sum_row = df.sum(axis=1)
df = pd.DataFrame(sum_row)
df.columns = ["Bulkseq"]
if args.o.endswith('.pickle') or args.o.endswith('.pickle.gz'):
df.to_pickle(args.o)
else:
df.to_csv(args.o)
print("Finished export.")
return args.o
if __name__ == '__main__':
argparser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description='Convert BAM file(s), annotated using featureCounts or otherwise, into a count matrix')
argparser.add_argument(
'-o',
type=str,
help="output csv path, or pandas dataframe if path ends with pickle.gz",
required=False)
argparser.add_argument(
'-featureTags',
type=str,
default=None,
help='Tag(s) used for defining the ROWS of the matrix. Single dimension.')
argparser.add_argument(
'-joinedFeatureTags',
type=str,
default=None,
help='These define the ROWS of your matrix. For example, if you want allele (DA) and restriction site (DS), use DA,DS. If you want a row level containing the mapped chromosome, use "chrom" as a feature. Use this argument if you want a multidimensional index')
argparser.add_argument(
'-sampleTags',
type=str,
default='SM',
help='Comma separated tags defining the names of the COLUMNS in the output matrix')
argparser.add_argument('alignmentfiles', type=str, nargs='*')
argparser.add_argument(
'-head',
type=int,
help='Run the algorithm only on the first N reads to check if the result looks like what you expect.')
argparser.add_argument(
'--bulk',
action='store_true',
help='Include this when making a count table for bulkseq data. This operation will sum the counts of all sampleTags into a single column.')
argparser.add_argument(
'--r1only',
action='store_true',
help='Only count R1')
argparser.add_argument(
'--r2only',
action='store_true',
help='Only count R2')
argparser.add_argument(
'--splitFeatures',
action='store_true',
help='Split features on the feature delimiter (default ","). For example, if a read has the feature Foo,Bar, increment counts for both Foo and Bar')
argparser.add_argument('-featureDelimiter', type=str, default=',')
argparser.add_argument(
'-contig',
type=str,
help='Run only on this chromosome')
multimapping_args = argparser.add_argument_group('Multimapping', '')
multimapping_args.add_argument(
'--divideMultimapping',
action='store_true',
help='Divide multimapping reads over all targets. Requires the XA or NH tag to be set.')
multimapping_args.add_argument(
'--doNotDivideFragments',
action='store_true',
help='When used, every read is counted once, so a mapped fragment (read pair) counts as two. Otherwise each mate counts 0.5 and a fragment sums to one.')
binning_args = argparser.add_argument_group('Binning', '')
#binning_args.add_argument('-offset', type=int, default=0, help="Add offset to bin. If bin=1000, offset=200, f=1999 -> 1200. f=4199 -> 3200")
binning_args.add_argument(
'-sliding',
type=int,
help="make bins overlapping, the stepsize is equal to the supplied value here. If nothing is supplied this value equals the bin size")
binning_args.add_argument(
'--keepOverBounds',
action='store_true',
help="Keep bins which go over chromsome bounds (start<0) end > chromsome length")
binning_args.add_argument(
'-bin',
type=int,
help="Devide and floor to bin features. If bin=1000, f=1999 -> 1000.")
# binning_args.add_argument('--showBinEnd', action='store_true', help="If
# True, then show DS column as 120000-220000, otherwise 120000 only. This
# specifies the bin range in which the read was counted" ) this is now
# always on!
binning_args.add_argument('-binTag', default='DS')
binning_args.add_argument(
'-byValue',
type=str,
help='Extract the value from the supplied tag and use it as the count to add')
binning_args.add_argument('-blacklist', metavar='BED FILE BLACKLIST TO FILTER OUT', help = 'Bedfile of blacklist regions to exclude', default=None)
bed_args = argparser.add_argument_group('Bedfiles', '')
bed_args.add_argument(
'-bedfile',
type=str,
help="Bed file containing 3 columns, chromo, start, end to be read for fetching counts")
filters = argparser.add_argument_group('Filters', '')
filters.add_argument(
'--proper_pairs_only',
action='store_true',
help='Only count reads mapped in a proper pair (within the expected insert size range)')
filters.add_argument(
'--no_softclips',
action='store_true',
help='Only count reads without softclips')
filters.add_argument(
'-max_base_edits',
type=int,
help='Only count reads with at most this many bases differing from the reference (based on the NM tag)')
filters.add_argument(
'--no_indels',
action='store_true',
help='Only count reads without indels')
filters.add_argument(
'--dedup',
action='store_true',
help='Count only the first occurrence of a molecule. Reads flagged as duplicates or carrying the RR tag are skipped.')
filters.add_argument(
'-minMQ',
type=int,
default=0,
help="minimum mapping quality")
filters.add_argument(
'--filterMP',
action= 'store_true',
help="Filter reads which are not uniquely mappable, this is based on presence on the `mp` tag, which can be generated by bamtagmultiome.py ")
filters.add_argument(
'--filterXA',
action='store_true',
help="Do not count reads where the XA (alternative hits) tag has been set for a non-alternative locus.")
argparser.add_argument(
'--noNames',
action='store_true',
help='Do not set names of the index and columns, this simplifies the resulting CSV/pickle file')
argparser.add_argument(
'--showtags',
action='store_true',
help='Show a list of commonly used tags')
args = argparser.parse_args()
create_count_table(args)
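# Example invocation (illustrative; tag availability depends on how the BAM
# was annotated, and the installed script name may differ):
#   python bamToCountTable.py input.bam -o counts.csv \
#       -sampleTags SM -joinedFeatureTags chrom,DS -bin 1000 -sliding 500
# This yields one column per cell (SM tag) and one row per
# (chromosome, bin start, bin end) combination.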