#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import pysam
import time
import sys
import multiprocessing
import subprocess
import os
from singlecellmultiomics.utils.sequtils import hamming_distance, phred_to_prob
from singlecellmultiomics.universalBamTagger.taps import TAPSFlagger
from singlecellmultiomics.universalBamTagger.tag import TagFlagger
from singlecellmultiomics.universalBamTagger.scar import ScarFlagger
from singlecellmultiomics.universalBamTagger.mspjI import MSPJIFlagger
from singlecellmultiomics.universalBamTagger.scchic import ChicSeqFlagger
from singlecellmultiomics.universalBamTagger.nlaIII import NlaIIIFlagger
from singlecellmultiomics.universalBamTagger.digest import DigestFlagger
from singlecellmultiomics.universalBamTagger.rna import RNA_Flagger
import warnings
import numpy as np
import math
import singlecellmultiomics.features
import singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods
from singlecellmultiomics.tagtools import tagtools
import argparse
import pysamiterators.iterators as pysamIterators
import gzip
import pickle
from singlecellmultiomics.alleleTools import alleleTools
import uuid
import collections
import glob
c = 1_000 # !!! PLEASE USE PYTHON 3.6 OR HIGHER !!!
complement = str.maketrans('ATGC', 'TACG')
warnings.warn(
"universalBamTagger is deprecated. Please switch to use bamtagmultiome.py ",
DeprecationWarning)
# Modules:
if __name__ == "__main__":
argparser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description="""Add allelic NLA and/or MSPJI digest information to a demultiplexed bam file
Adds the following tags:
RC: number of oversequenced molecules
DT: Source type (NLA/MSPJI/RNA.. )
DA: Digest allele
RP: Recognized sequence
RS: Recognized strand '-','?','+'
Which tags are filled in depends on which files you supply, for example if you do not supply an allelic VCF file, then no allele tag will be added
""")
argparser.add_argument('bamfiles', metavar='bamfile', type=str, nargs='+')
argparser.add_argument(
'--local',
action='store_true',
help='Do not qsub sorting and indexing [deprecated]')
argparser.add_argument(
'-tmpprefix',
metavar='PREFIX.nnnn.bam',
help='When sorting, specify temp dir to prevent home directory filling up',
default='~/')
#argparser.add_argument('--noSort', action='store_true', help='Do not sort and index')
#argparser.add_argument('--cluster', action='store_true', help='Do not qsub sorting and indexing')
tagAlgs = argparser.add_argument_group('Tagging algorithms', '')
tagAlgs.add_argument(
'--ftag',
action='store_true',
help='!DEPRECATED, this flag is now always used!')
tagAlgs.add_argument(
'--atag',
action='store_true',
help='Add only allele tags')
tagAlgs.add_argument(
'--mspji',
action='store_true',
help='Look for mspji digest')
tagAlgs.add_argument(
'--nla',
action='store_true',
help='Look for nlaIII digest')
tagAlgs.add_argument(
'--chic',
action='store_true',
help='Look for mnase (scCHIC) digest')
tagAlgs.add_argument(
'--scar',
action='store_true',
help='Create R1 start, cigar sequence based DS tags')
tagAlgs.add_argument(
'--taps',
action='store_true',
help='Add TAPS based methylation state ')
tagAlgs.add_argument(
'-tag',
type=str,
default=None,
help='Determine oversequencing based on a tag (for example XT to count RNA oversequencing for featureCounts counted transcripts. chrom for chromosome/contig count)')
# RNA options
tagAlgs.add_argument(
'--rna',
action='store_true',
help='Assign RNA molecules, requires a intronic and exonic GTF file')
argparser.add_argument(
'-introns',
type=str,
help='Intronic GTF file, use exonGTFtoIntronGTF.py to generate such a GTF file')
argparser.add_argument('-exons', type=str, help='Exonic GTF file.')
"""
How much bases of the fragment fall inside the intron(s)
How much bases of the fragment fall inside the exon(s)
is the fragment spliced? How many bases are spliced out?
"""
argparser.add_argument(
'-moleculeRadius',
type=int,
help='Search radius for each molecule, if a cut site is found within this range with the same umi, it will be counted as the same molecule',
default=0)
argparser.add_argument(
'-alleles',
type=str,
help='VCF file. Add allelic info flag based on vcf file')
argparser.add_argument(
'--loadAllelesToMem',
action='store_true',
help='Load allele data completely into memory')
argparser.add_argument('-ref', type=str, help='reference FASTA file.')
argparser.add_argument(
'-o',
type=str,
default='./tagged',
help='output folder')
argparser.add_argument(
'--dedup',
action='store_true',
help='Create deduplicated bam files')
argparser.add_argument('-chr', help='run only on this chromosome')
argparser.add_argument(
'-head',
type=int,
default=None,
help='Only process first n reads of every bam file')
devArgs = argparser.add_argument_group('Development options', '')
devArgs.add_argument(
'--fatal',
action='store_true',
help='Crash upon any encountered error')
devArgs.add_argument('--verbose', action='store_true', help='Be verbose')
devArgs.add_argument(
'--knh',
action='store_true',
help='Keep non-headered bam file')
args = argparser.parse_args()
if not args.mspji and not args.nla and not args.chic and not args.ftag and not args.rna and args.tag is None and args.atag is None and args.taps is None:
raise ValueError(
'Please supply any or a combination of --ftag --nla --chic --mspji --taps')
if args.rna and (args.introns is None or args.exons is None):
raise ValueError(
'Please supply exonic and intronic GTF files -introns -exons')
"""
Corrected Tags
BC: This should be used for the sample-barcode so that there is no conflict in case that both sample- and molecular-barcodes are used
QT: This should be the quality for the sample-barcode BC
New Tags
RX: The raw sequence bases of the molecular-barcode(s) of the read(s) can be placed in this tag
QX: The original qualites (as QUAL) of the bases in RX
BX: Sequence bases of the unique molecular identifier, may be corrected or raw
BZ: Quality score the unique molecular identifier. May be corrected or raw. See BX tag for qualities
MI: Molecular-barcode sequence or identifier, may include non-sequence bases (Here, it is BC+RX)
QM: Molecular-barcode qualities, QT+QX
DS: coordinate of site
RC: number of oversequenced molecules
DT: Source type (NLA/MSPJI/RNA..? )
DA: Digest allele
RP: Recognized sequence
RS: Recognized strand '-','?','+'
LI: ligation sequence
"""
# 1: read1 2: read2
[docs]def molecule_to_random_primer_dict(molecule, primer_length=6, primer_read=2):
rp = collections.defaultdict(list)
for fragment in molecule:
if fragment[primer_read - 1] is not None:
hstart, hseq = tagtools.getRandomPrimerHash(
fragment[primer_read - 1], onStart=True, primerLength=6)
rp[hstart, hseq].append(fragment)
return rp
# for k,p in rp.items():
# yield p
[docs]class MoleculeIterator_OLD():
"""
Iterate over molecules in a bam file
Parameters
----------
alignmentfile : pysam.AlignmentFile
file to read the molecules from
look_around_radius : int
buffer to accumulate molecules in. All fragments belonging to one molecule should fit this radius
umi_hamming_distance : int
Edit distance on UMI, 0: only exact match, 1: single base distance
sample_select : iterable
Iterable of samples to only select molecules from
Yields
----------
list of molecules : list [ pysam.AlignedSegment ]
[ (R1,R2), (R1,R2) ... ]
"""
def __init__(self,
alignmentfile,
look_around_radius=100_000,
umi_hamming_distance=0, # 0: only exact match, 1: single base distance
sample_select=None, # iterable of samples to only select molecules from
# when a string is supplied only a single sample is selected
**pysam_kwargs
):
self.alignmentfile = alignmentfile
self.look_around_radius = look_around_radius
self.current_position = None
self.current_chromosome = None
self.molecules_yielded = 0
self.umi_hamming_distance = umi_hamming_distance
self.pysam_kwargs = pysam_kwargs
self.sample_select = sample_select
self.sample_tag = 'SM'
self._clear()
self.filter_function = None # function which results given two reads if it is usable
def _clear(self):
self.molecule_cache = collections.defaultdict(lambda: collections.defaultdict(
list)) # position -> cell,umi,strand,allele.. -> reads
[docs] def sample_assignment_function(self, fragment):
for read in fragment:
if read is not None:
if read.has_tag(self.sample_tag):
return read.get_tag(self.sample_tag)
return None
# Returns postion unique identifier for a fragment
[docs] def localisation_function(self, fragment):
if not fragment[0].has_tag('DS'):
return None
return fragment[0].get_tag('DS')
def __repr__(self):
return f"""Molecule iterator
current position: {self.current_chromosome}:{self.current_position}
yielded {self.molecules_yielded} so far
currently {len(self.molecule_cache)} positions cached
keeping {self.get_cached_fragment_count()} fragments cached
"""
[docs] def get_cached_fragment_count(self):
total_fragments = 0
for position, data_per_molecule in self.molecule_cache.items():
for molecule_id, molecule_reads in data_per_molecule.items():
total_fragments += len(molecule_reads)
return total_fragments
[docs] def assignment_function(self, fragment):
return fragment[0].get_tag('SM'), fragment[0].get_tag(
'RX'), fragment[0].get_tag('RS')
# Returns True if two fragment ids are identical or close enough
[docs] def eq_function(self, assignment_a, assignment_b):
if assignment_a is None or assignment_b is None:
return False
sample_A, umi_A, strand_A = assignment_a
sample_B, umi_B, strand_B = assignment_b
if sample_A != sample_B or strand_A != strand_B:
return False
if self.umi_hamming_distance == 0:
return umi_A == umi_B
else:
return hamming_distance(umi_A, umi_B) <= self.umi_hamming_distance
def _yield_all_in_current_cache(self):
for position, data_per_molecule in self.molecule_cache.items():
for molecule_id, molecule_reads in data_per_molecule.items():
self.molecules_yielded += 1
yield molecule_reads
self._clear()
def _purge(self):
drop = []
for pos in self.molecule_cache:
if pos < (self.current_position - self.look_around_radius):
# purge this coordinate:
for molecule_id, molecule_reads in self.molecule_cache[pos].items(
):
self.molecules_yielded += 1
yield molecule_reads
drop.append(pos)
for pos in drop:
del self.molecule_cache[pos]
[docs] def get_fragment_chromosome(self, fragment):
for read in fragment:
if read is not None:
return read.reference_name
def __iter__(self):
for fragment in pysamIterators.MatePairIterator(
self.alignmentfile,
**self.pysam_kwargs,
performProperPairCheck=False):
if fragment[0] is None or fragment[0].reference_name is None:
continue
# Check if we want to drop the fragment because its corresponding
# sample is not selected
if self.sample_select is not None:
sample = self.sample_assignment_function(fragment)
if isinstance(self.sample_select, str): # single sample
if self.sample_select != sample:
continue
else: # list or set of samples
if sample not in self.sample_select:
continue
# We went to another chromsome, purge all in cache:
if self.get_fragment_chromosome(
fragment) != self.current_chromosome and self.current_chromosome is not None:
for molecule in self._yield_all_in_current_cache():
yield molecule
# Check if we need to purge and yield results:
for molecule in self._purge():
yield molecule
position = self.localisation_function(fragment)
if position is None:
continue
molecule_id = self.assignment_function(fragment)
# Check if there is a molecule to assign to already present:
assigned = False
for existing_molecule in self.molecule_cache[position]:
if self.eq_function(existing_molecule, molecule_id):
assigned = True
self.molecule_cache[position][existing_molecule].append(
fragment)
if not assigned:
self.molecule_cache[position][molecule_id].append(fragment)
self.current_chromosome = self.get_fragment_chromosome(fragment)
self.current_position = self.localisation_function(
fragment) # [0].reference_end
# yield everything which was not yielded yet
for molecule in self._yield_all_in_current_cache():
yield molecule
"""
Iterate over transcripts in a bam file
Parameters
----------
alignmentfile : pysam.AlignmentFile
file to read the molecules from
look_around_radius : int
buffer to accumulate molecules in. All fragments belonging to one molecule should fit this radius
informative_read : int
which read is used to define the mapping coordinate of the fragment, 1 or 2.
umi_hamming_distance : int
Edit distance on UMI, 0: only exact match, 1: single base distance, 2 base distance ...
assignment_radius : int
tolerance on fragment starting coordinates
sample_select : iterable
Iterable of samples to only select molecules from
Yields
----------
list of molecules : list [ pysam.AlignedSegment ]
[ (R1,R2), (R1,R2) ... ]
"""
[docs]class TranscriptIterator(MoleculeIterator_OLD):
def __init__(
self,
look_around_radius=100,
informative_read=2,
assignment_radius=10,
**kwargs):
MoleculeIterator_OLD.__init__(
self, look_around_radius=look_around_radius, **kwargs)
self.informative_read = informative_read
self.assignment_radius = assignment_radius
[docs] def assignment_function(self, fragment):
return fragment[self.informative_read -
1].get_tag('SM'), fragment[self.informative_read -
1].get_tag('RX'), fragment[self.informative_read -
1].is_reverse
[docs] def localisation_function(self, fragment):
return int((fragment[self.informative_read - 1].reference_start) /
self.assignment_radius) * self.assignment_radius
[docs]class QueryNameFlagger(DigestFlagger):
def __init__(self, **kwargs):
self.assignedReadGroups = set()
DigestFlagger.__init__(self, **kwargs)
[docs] def digest(self, reads):
for read in reads:
if read is None:
continue
if read.has_tag('SM'): # It is already tagged
return
if read.query_name.startswith('UMI'): # Old format
import tagBamFile # this is not included anymore
tagBamFile.recodeRead(read)
else:
tr = singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods.TaggedRecord(
singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods.TagDefinitions
)
tr.fromTaggedBamRecord(read)
newHeader = tr.asIlluminaHeader()
read.query_name = newHeader
tr.tagPysamRead(read)
rg = f"{read.get_tag('Fc') if read.has_tag('Fc') else 'NONE'}.{read.get_tag('La') if read.has_tag('La') else 'NONE'}.{read.get_tag('SM') if read.has_tag('SM') else 'NONE'}"
self.assignedReadGroups.add(rg)
# Add read group:
read.set_tag('RG', rg)
[docs]class AlleleTagger(DigestFlagger):
def __init__(self, **kwargs):
DigestFlagger.__init__(self, **kwargs)
[docs] def digest(self, reads):
nonNullReads = [read for read in reads if read is not None]
self.addAlleleInfo(nonNullReads)
if __name__ == "__main__":
# These data sources are fed to all flaggers
flaggerArguments = {
'reference': None if args.ref is None else pysamIterators.CachedFasta(
pysam.FastaFile(args.ref)),
'alleleResolver': None if args.alleles is None else alleleTools.AlleleResolver(
args.alleles, lazyLoad=not args.loadAllelesToMem),
'moleculeRadius': args.moleculeRadius,
'verbose': args.verbose,
'exon_gtf': args.exons,
'intron_gtf': args.introns
}
pairedEnd = False
flaggers = []
qFlagger = None
# if args.ftag:
# Now we ALWAYS put reads through qflagger
qFlagger = QueryNameFlagger(**flaggerArguments)
flaggers.append(qFlagger)
if args.nla:
flaggers.append(NlaIIIFlagger(**flaggerArguments))
pairedEnd = True
if args.mspji:
flaggers.append(MSPJIFlagger(**flaggerArguments))
pairedEnd = True
if args.chic:
flaggers.append(ChicSeqFlagger(**flaggerArguments))
pairedEnd = True
if args.scar:
flaggers.append(ScarFlagger(**flaggerArguments))
pairedEnd = True
if args.tag:
flaggers.append(TagFlagger(tag=args.tag))
if args.atag:
print("Running allele tagging")
flaggers.append(AlleleTagger(**flaggerArguments))
if args.rna:
flaggers.append(RNA_Flagger(**flaggerArguments))
if args.taps:
flaggers.append(TAPSFlagger(**flaggerArguments))
pairedEnd = True
if args.alleles is not None:
# Check if the variant file is valid..
if not (args.alleles.endswith('.vcf.gz')
or args.alleles.endswith('.bcf.gz')):
raise ValueError(f"""Please supply an indexed (bg)zipped VCF file.
You can convert your file using: bcftools view {args.alleles} -O b -o {args.alleles}.gz;
then index using bcftools index {args.alleles}.gz """)
if not (
os.path.exists(
args.alleles +
'.csi') or os.path.exists(
args.alleles +
'.tbi')):
raise ValueError(f"""Please supply an indexed (bg)zipped VCF file.
Index using: bcftools index {args.alleles} """)
if pairedEnd:
print('Assuming the input is paired end')
else:
print('Assuming the input is single end ( Has no influence on ftag)')
if not os.path.exists(args.o):
try:
os.makedirs(args.o)
except Exception as e:
pass
# Here we walk through the bamfiles and fetch
tmpprefix = f'{args.tmpprefix}'
for bamFilePath in args.bamfiles:
bamFile = pysam.AlignmentFile(bamFilePath, "rb")
header = bamFile.header.copy()
outPathTemp = f'{args.o}/{os.path.basename(bamFilePath)}.unsorted'
outPathTempWithHeader = f'{args.o}/{os.path.basename(bamFilePath)}.unsorted.with_header.bam'
outPath = f"{args.o}/{os.path.basename(bamFilePath).replace('.bam', '')}.bam"
print(f'Now reading {bamFilePath}, writing to {outPath}')
if args.dedup:
dedupOutPathTemp = f'{args.o}/{os.path.basename(bamFilePath)}.dedup.unsorted'
dedupOutPathTempWithHeader = f'{args.o}/{os.path.basename(bamFilePath)}.dedup.with_header.bam'
dedupOutPath = f"{args.o}/{os.path.basename(bamFilePath).replace('.bam', '')}.dedup.bam"
dedupOutputFile = pysam.AlignmentFile(
dedupOutPathTemp, "wb", header=header)
outputFile = pysam.AlignmentFile(outPathTemp, "wb", header=header)
bamName = os.path.basename(bamFilePath).replace('.bam', '')
# if os.path.exists(outPath+'.bai'):
# continue
i = 0
if pairedEnd:
it = pysamIterators.MatePairIterator(
bamFile, performProperPairCheck=False, contig=args.chr)
for i, (R1, R2) in enumerate(it):
for flagger in flaggers:
try:
flagger.digest([R1, R2])
except Exception as e:
print(e)
if args.fatal:
print(R1, R2)
raise e
try:
if R1 is not None:
outputFile.write(R1)
if R2 is not None:
outputFile.write(R2)
except Exception as e:
for flagger in flaggers:
print(flagger)
raise e
if args.dedup and (
(R1 is not None and R1.has_tag('RC') and R1.get_tag('RC') == 1) or (
R2 is not None and R2.has_tag('RC') and R2.get_tag('RC') == 1)):
if R1 is not None:
dedupOutputFile.write(R1)
if R2 is not None:
dedupOutputFile.write(R2)
if args.head is not None and i >= args.head:
break
else:
iterator = bamFile.fetch(
contig=args.chr) if args.chr is not None else bamFile
for i, R1 in enumerate(iterator):
for flagger in flaggers:
try:
flagger.digest([R1])
except Exception as e:
if args.fatal:
raise e
print(e)
outputFile.write(R1)
if args.head is not None and i >= args.head:
break
print(f'{bamFilePath}: processed {i} reads')
print("Finishing files (forcing close)")
outputFile.close()
if args.dedup:
dedupOutputFile.close()
if qFlagger is not None:
readGroups = {}
for readGroup in qFlagger.assignedReadGroups:
flowCell, lane, sampleLib = readGroup.split('.')
library, sample = sampleLib.rsplit('_', 1)
readGroups[readGroup] = {
'ID': readGroup,
'LB': library,
'PL': 'ILLUMINA',
'SM': sample,
'PU': readGroup}
headerSamFilePath = outPath.replace('.bam', '') + '.header.sam'
hCopy = header.to_dict()
hCopy['RG'] = list(readGroups.values())
with gzip.open(outPathTemp.replace('.bam', '') + '.readGroups.pickle.gz', 'wb') as rgzip, pysam.AlignmentFile(headerSamFilePath, 'w', header=hCopy) as headerSam:
pickle.dump(readGroups, rgzip)
# for readGroup, fields in readGroups.items():
# headerSam.write(f'@RG\tID:{readGroup}\tPL:{fields["PL"]}\tSM:{fields["SM"]}\tPU:{fields["PU"]}\tLB:{fields["LB"]}\n')
# Clear the read groups
qFlagger.assignedReadGroups = set()
# Perform a reheading, sort and index
rehead_cmd = f"""{{ cat {headerSamFilePath}; samtools view {outPathTemp}; }} | samtools view -b > {outPathTempWithHeader} ;
rm {outPathTemp};
samtools sort -T {tmpprefix} {outPathTempWithHeader} > {outPath}; samtools index {outPath};
rm {outPathTempWithHeader};
"""
print(f"Adding read groups to header and sorting.")
os.system(rehead_cmd)
# Same procedure for dedup:
if args.dedup:
rehead_cmd = f"""{{ cat {headerSamFilePath}; samtools view {dedupOutPathTemp}; }} | samtools view -b > {dedupOutPathTempWithHeader} ;
rm {dedupOutPathTemp};
samtools sort -T {tmpprefix} {dedupOutPathTempWithHeader} > {dedupOutPath}; samtools index {dedupOutPath};
rm {dedupOutPathTempWithHeader};
"""
print(f"Dedup file: adding read groups to header and sorting.")
os.system(rehead_cmd)
else:
# we cannot assign readgroups...
rehead_cmd = f"""
samtools sort -T {tmpprefix} {outPathTemp} > {outPath}; samtools index {outPath};
rm {outPathTemp};
"""
print(f"Adding read groups to header and sorting.")
os.system(rehead_cmd)
if args.dedup:
rehead_cmd = f"""
samtools sort -T {tmpprefix} {dedupOutPathTemp} > {dedupOutPath}; samtools index {dedupOutPath};
rm {dedupOutPathTemp};
"""
print(f"Dedup file: adding read groups to header and sorting.")
os.system(rehead_cmd)
"""
Is:NS500414;RN:455;Fc:HYLVHBGX5;La:3;Ti:13601;CX:9882;CY:17671;Fi:N;CN:0;aa:CCGTCC;aA:CCGTCC;aI:16;LY:A3-P15-1-1;RX:ACG;RQ:GGG;BI:17;bc:ACTCGATG;BC:ACTCGATG;QT:GGKKKKKK;MX:NLAIII384C8U3;A2:TGG;AQ:E6E
for f in $(ls *.bam | cut -f 1,2,3 -d '-' | sort | uniq | grep -v bam); do submission.py -y -time 20 -t 1 "addTagsToLennartNLABam.py $f*cell*.bam"; done
"""