#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import pysam
import time
import sys
import multiprocessing
import subprocess
import os
from singlecellmultiomics.utils.sequtils import hamming_distance, phred_to_prob
from singlecellmultiomics.universalBamTagger.taps import TAPSFlagger
from singlecellmultiomics.universalBamTagger.tag import TagFlagger
from singlecellmultiomics.universalBamTagger.scar import ScarFlagger
from singlecellmultiomics.universalBamTagger.mspjI import MSPJIFlagger
from singlecellmultiomics.universalBamTagger.scchic import ChicSeqFlagger
from singlecellmultiomics.universalBamTagger.nlaIII import NlaIIIFlagger
from singlecellmultiomics.universalBamTagger.digest import DigestFlagger
from singlecellmultiomics.universalBamTagger.rna import RNA_Flagger
import warnings
import numpy as np
import math
import singlecellmultiomics.features
import singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods
from singlecellmultiomics.tagtools import tagtools
import argparse
import pysamiterators.iterators as pysamIterators
import gzip
import pickle
from singlecellmultiomics.alleleTools import alleleTools
import uuid
import collections
import glob
c = 1_000  # !!! PLEASE USE PYTHON 3.6 OR HIGHER !!!
complement = str.maketrans('ATGC', 'TACG')
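# usage note: seq.translate(complement)[::-1] yields the reverse complement of seq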


warnings.warn(
    "universalBamTagger is deprecated. Please switch to bamtagmultiome.py",
    DeprecationWarning)


# Modules:


if __name__ == "__main__":
    argparser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""Add allelic NLA and/or MSPJI digest information to a demultiplexed bam file
    Adds the following tags:
    RC: number of oversequenced molecules
    DT: Source type (NLA/MSPJI/RNA.. )
    DA: Digest allele
    RP: Recognized sequence
    RS: Recognized strand '-','?','+'

    Which tags are filled in depends on which files you supply, for example if you do not supply an allelic VCF file, then no allele tag will be added
    """)

    argparser.add_argument('bamfiles', metavar='bamfile', type=str, nargs='+')
    argparser.add_argument(
        '--local',
        action='store_true',
        help='Do not qsub sorting and indexing [deprecated]')
    argparser.add_argument(
        '-tmpprefix',
        metavar='PREFIX.nnnn.bam',
        help='When sorting, specify temp dir to prevent home directory filling up',
        default='~/')
    #argparser.add_argument('--noSort', action='store_true', help='Do not sort and index')
    #argparser.add_argument('--cluster', action='store_true', help='Do not qsub sorting and indexing')

    tagAlgs = argparser.add_argument_group('Tagging algorithms', '')
    tagAlgs.add_argument(
        '--ftag',
        action='store_true',
        help='!DEPRECATED, this flag is now always used!')
    tagAlgs.add_argument(
        '--atag',
        action='store_true',
        help='Add only allele tags')
    tagAlgs.add_argument(
        '--mspji',
        action='store_true',
        help='Look for mspji digest')
    tagAlgs.add_argument(
        '--nla',
        action='store_true',
        help='Look for nlaIII digest')
    tagAlgs.add_argument(
        '--chic',
        action='store_true',
        help='Look for mnase (scCHIC) digest')
    tagAlgs.add_argument(
        '--scar',
        action='store_true',
        help='Create DS tags based on the R1 start and cigar sequence')
    tagAlgs.add_argument(
        '--taps',
        action='store_true',
        help='Add TAPS-based methylation state')
    tagAlgs.add_argument(
        '-tag',
        type=str,
        default=None,
        help='Determine oversequencing based on a tag (for example XT to count RNA oversequencing for featureCounts-counted transcripts, or chrom for a chromosome/contig count)')

    # RNA options
    tagAlgs.add_argument(
        '--rna',
        action='store_true',
        help='Assign RNA molecules, requires an intronic and an exonic GTF file')
    argparser.add_argument(
        '-introns',
        type=str,
        help='Intronic GTF file, use exonGTFtoIntronGTF.py to generate such a GTF file')
    argparser.add_argument('-exons', type=str, help='Exonic GTF file.')
    """
    How much bases of the fragment fall inside the intron(s)
    How much bases of the fragment fall inside the exon(s)
    is the fragment spliced? How many bases are spliced out?
    """

    argparser.add_argument(
        '-moleculeRadius',
        type=int,
        help='Search radius for each molecule; if a cut site with the same UMI is found within this range, it is counted as the same molecule',
        default=0)

    argparser.add_argument(
        '-alleles',
        type=str,
        help='VCF file. Adds an allele information tag based on this file')
    argparser.add_argument(
        '--loadAllelesToMem',
        action='store_true',
        help='Load allele data completely into memory')

    argparser.add_argument('-ref', type=str, help='reference FASTA file.')
    argparser.add_argument(
        '-o',
        type=str,
        default='./tagged',
        help='output folder')
    argparser.add_argument(
        '--dedup',
        action='store_true',
        help='Create deduplicated bam files')

    argparser.add_argument('-chr', help='run only on this chromosome')

    argparser.add_argument(
        '-head',
        type=int,
        default=None,
        help='Only process first n reads of every bam file')

    devArgs = argparser.add_argument_group('Development options', '')
    devArgs.add_argument(
        '--fatal',
        action='store_true',
        help='Crash upon any encountered error')
    devArgs.add_argument('--verbose', action='store_true', help='Be verbose')
    devArgs.add_argument(
        '--knh',
        action='store_true',
        help='Keep non-headered bam file')

    args = argparser.parse_args()

    if not args.mspji and not args.nla and not args.chic and not args.ftag and not args.rna and args.tag is None and not args.atag and not args.taps:
        raise ValueError(
            'Please supply any or a combination of --ftag --nla --chic --mspji --taps --rna --atag -tag')

    if args.rna and (args.introns is None or args.exons is None):
        raise ValueError(
            'Please supply exonic and intronic GTF files -introns -exons')

    """
    Corrected Tags
    BC: This should be used for the sample-barcode so that there is no conflict in case that both sample- and molecular-barcodes are used
    QT: This should be the quality for the sample-barcode BC

    New Tags
    RX: The raw sequence bases of the molecular-barcode(s) of the read(s) can be placed in this tag
    QX: The original qualites (as QUAL) of the bases in RX

    BX: Sequence bases of the unique molecular identifier, may be corrected or raw
    BZ: Quality score the unique molecular identifier. May be corrected or raw. See BX tag for qualities

    MI: Molecular-barcode sequence or identifier, may include non-sequence bases (Here, it is BC+RX)
    QM: Molecular-barcode qualities, QT+QX

    DS: coordinate of site
    RC: number of oversequenced molecules
    DT: Source type (NLA/MSPJI/RNA..? )
    DA: Digest allele
    RP: Recognized sequence
    RS: Recognized strand '-','?','+'
    LI: ligation sequence

    """


# 1: read1 2: read2
def molecule_to_random_primer_dict(molecule, primer_length=6, primer_read=2):
    rp = collections.defaultdict(list)
    for fragment in molecule:
        if fragment[primer_read - 1] is not None:
            hstart, hseq = tagtools.getRandomPrimerHash(
                fragment[primer_read - 1], onStart=True, primerLength=primer_length)
            rp[hstart, hseq].append(fragment)
    return rp
# for k,p in rp.items(): # yield p
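# Hedged convenience sketch (not in the original module): iterate over the
# random-primer groups produced by molecule_to_random_primer_dict, as the
# commented-out loop above suggests was once intended.
def iter_random_primer_groups(molecule, primer_length=6, primer_read=2):
    for fragments in molecule_to_random_primer_dict(
            molecule, primer_length=primer_length, primer_read=primer_read).values():
        yield fragments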
class MoleculeIterator_OLD():
    """ Iterate over molecules in a bam file

    Parameters
    ----------
    alignmentfile : pysam.AlignmentFile
        file to read the molecules from

    look_around_radius : int
        buffer to accumulate molecules in. All fragments belonging to one
        molecule should fit this radius

    umi_hamming_distance : int
        Edit distance on UMI, 0: only exact match, 1: single base distance

    sample_select : iterable
        Iterable of samples to only select molecules from

    Yields
    ----------
    list of molecules : list [ pysam.AlignedSegment ]
        [ (R1,R2), (R1,R2) ... ]
    """

    def __init__(self,
                 alignmentfile,
                 look_around_radius=100_000,
                 umi_hamming_distance=0,  # 0: only exact match, 1: single base distance
                 sample_select=None,  # iterable of samples to only select molecules from;
                 # when a string is supplied only a single sample is selected
                 **pysam_kwargs
                 ):
        self.alignmentfile = alignmentfile
        self.look_around_radius = look_around_radius

        self.current_position = None
        self.current_chromosome = None
        self.molecules_yielded = 0
        self.umi_hamming_distance = umi_hamming_distance
        self.pysam_kwargs = pysam_kwargs
        self.sample_select = sample_select
        self.sample_tag = 'SM'
        self._clear()
        # function which decides, given two reads, whether the fragment is usable
        self.filter_function = None

    def _clear(self):
        # position -> (cell, umi, strand, allele, ...) -> reads
        self.molecule_cache = collections.defaultdict(
            lambda: collections.defaultdict(list))
    def sample_assignment_function(self, fragment):
        for read in fragment:
            if read is not None:
                if read.has_tag(self.sample_tag):
                    return read.get_tag(self.sample_tag)
        return None
    # Returns a position-unique identifier for a fragment
    def localisation_function(self, fragment):
        if not fragment[0].has_tag('DS'):
            return None
        return fragment[0].get_tag('DS')
    def __repr__(self):
        return f"""Molecule iterator
        current position: {self.current_chromosome}:{self.current_position}
        yielded {self.molecules_yielded} so far
        currently {len(self.molecule_cache)} positions cached
        keeping {self.get_cached_fragment_count()} fragments cached
        """
    def get_cached_fragment_count(self):
        total_fragments = 0
        for position, data_per_molecule in self.molecule_cache.items():
            for molecule_id, molecule_reads in data_per_molecule.items():
                total_fragments += len(molecule_reads)
        return total_fragments
    def assignment_function(self, fragment):
        return fragment[0].get_tag('SM'), fragment[0].get_tag(
            'RX'), fragment[0].get_tag('RS')
# Returns True if two fragment ids are identical or close enough
    def eq_function(self, assignment_a, assignment_b):
        if assignment_a is None or assignment_b is None:
            return False
        sample_A, umi_A, strand_A = assignment_a
        sample_B, umi_B, strand_B = assignment_b
        if sample_A != sample_B or strand_A != strand_B:
            return False

        if self.umi_hamming_distance == 0:
            return umi_A == umi_B
        else:
            return hamming_distance(umi_A, umi_B) <= self.umi_hamming_distance
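    # Worked example (illustrative): with umi_hamming_distance=1 the
    # assignments ('cellA', 'ACGT', '+') and ('cellA', 'ACGA', '+') compare
    # equal (one mismatch), while 'ACGT' vs 'TCGA' differ at two positions
    # and therefore do not.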
    def _yield_all_in_current_cache(self):
        for position, data_per_molecule in self.molecule_cache.items():
            for molecule_id, molecule_reads in data_per_molecule.items():
                self.molecules_yielded += 1
                yield molecule_reads
        self._clear()

    def _purge(self):
        drop = []
        for pos in self.molecule_cache:
            if pos < (self.current_position - self.look_around_radius):
                # purge this coordinate:
                for molecule_id, molecule_reads in self.molecule_cache[pos].items():
                    self.molecules_yielded += 1
                    yield molecule_reads
                drop.append(pos)

        for pos in drop:
            del self.molecule_cache[pos]
    def get_fragment_chromosome(self, fragment):
        for read in fragment:
            if read is not None:
                return read.reference_name
    def __iter__(self):
        for fragment in pysamIterators.MatePairIterator(
                self.alignmentfile,
                **self.pysam_kwargs,
                performProperPairCheck=False):

            if fragment[0] is None or fragment[0].reference_name is None:
                continue

            # Check if we want to drop the fragment because its corresponding
            # sample is not selected
            if self.sample_select is not None:
                sample = self.sample_assignment_function(fragment)
                if isinstance(self.sample_select, str):  # single sample
                    if self.sample_select != sample:
                        continue
                else:  # list or set of samples
                    if sample not in self.sample_select:
                        continue

            # We went to another chromosome, purge all in cache:
            if self.get_fragment_chromosome(
                    fragment) != self.current_chromosome and self.current_chromosome is not None:
                for molecule in self._yield_all_in_current_cache():
                    yield molecule

            # Check if we need to purge and yield results:
            for molecule in self._purge():
                yield molecule

            position = self.localisation_function(fragment)
            if position is None:
                continue
            molecule_id = self.assignment_function(fragment)

            # Check if there is a molecule to assign to already present:
            assigned = False
            for existing_molecule in self.molecule_cache[position]:
                if self.eq_function(existing_molecule, molecule_id):
                    assigned = True
                    self.molecule_cache[position][existing_molecule].append(
                        fragment)
            if not assigned:
                self.molecule_cache[position][molecule_id].append(fragment)

            self.current_chromosome = self.get_fragment_chromosome(fragment)
            self.current_position = self.localisation_function(fragment)

        # yield everything which was not yielded yet
        for molecule in self._yield_all_in_current_cache():
            yield molecule
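
# Hedged usage sketch (not part of the original module): collect molecules of a
# single sample from a tagged bam file; 'tagged.bam' and the sample name are
# hypothetical placeholders.
def example_iterate_molecules(bam_path='tagged.bam', sample='A3-P15-1-1'):
    with pysam.AlignmentFile(bam_path) as handle:
        for molecule in MoleculeIterator_OLD(
                handle, umi_hamming_distance=1, sample_select=sample):
            # each molecule is a list of (R1, R2) fragments sharing position,
            # sample, UMI (within one mismatch) and strand
            yield molecule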
""" Iterate over transcripts in a bam file Parameters ---------- alignmentfile : pysam.AlignmentFile file to read the molecules from look_around_radius : int buffer to accumulate molecules in. All fragments belonging to one molecule should fit this radius informative_read : int which read is used to define the mapping coordinate of the fragment, 1 or 2. umi_hamming_distance : int Edit distance on UMI, 0: only exact match, 1: single base distance, 2 base distance ... assignment_radius : int tolerance on fragment starting coordinates sample_select : iterable Iterable of samples to only select molecules from Yields ---------- list of molecules : list [ pysam.AlignedSegment ] [ (R1,R2), (R1,R2) ... ] """
[docs]class TranscriptIterator(MoleculeIterator_OLD): def __init__( self, look_around_radius=100, informative_read=2, assignment_radius=10, **kwargs): MoleculeIterator_OLD.__init__( self, look_around_radius=look_around_radius, **kwargs) self.informative_read = informative_read self.assignment_radius = assignment_radius
    def assignment_function(self, fragment):
        informative_read = fragment[self.informative_read - 1]
        return informative_read.get_tag('SM'), informative_read.get_tag(
            'RX'), informative_read.is_reverse
    def localisation_function(self, fragment):
        return int(fragment[self.informative_read - 1].reference_start /
                   self.assignment_radius) * self.assignment_radius
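
# Worked note (a hedged sketch, not part of the original module): the
# localisation_function above bins fragment start coordinates into windows of
# size assignment_radius, so nearby starts share one bin and can be grouped.
def _example_bin_start(reference_start, assignment_radius=10):
    # mirrors TranscriptIterator.localisation_function for a bare coordinate
    return int(reference_start / assignment_radius) * assignment_radius
# _example_bin_start(1234) == _example_bin_start(1239) == 1230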
class QueryNameFlagger(DigestFlagger):
    def __init__(self, **kwargs):
        self.assignedReadGroups = set()
        DigestFlagger.__init__(self, **kwargs)
    def digest(self, reads):
        for read in reads:
            if read is None:
                continue
            if read.has_tag('SM'):  # It is already tagged
                return

            if read.query_name.startswith('UMI'):  # Old format
                import tagBamFile  # this module is not included anymore
                tagBamFile.recodeRead(read)
            else:
                tr = singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods.TaggedRecord(
                    singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods.TagDefinitions)
                tr.fromTaggedBamRecord(read)
                newHeader = tr.asIlluminaHeader()
                read.query_name = newHeader
                tr.tagPysamRead(read)

            rg = f"{read.get_tag('Fc') if read.has_tag('Fc') else 'NONE'}." \
                f"{read.get_tag('La') if read.has_tag('La') else 'NONE'}." \
                f"{read.get_tag('SM') if read.has_tag('SM') else 'NONE'}"
            self.assignedReadGroups.add(rg)
            # Add read group:
            read.set_tag('RG', rg)
class AlleleTagger(DigestFlagger):
    def __init__(self, **kwargs):
        DigestFlagger.__init__(self, **kwargs)
    def digest(self, reads):
        nonNullReads = [read for read in reads if read is not None]
        self.addAlleleInfo(nonNullReads)
if __name__ == "__main__": # These data sources are fed to all flaggers flaggerArguments = { 'reference': None if args.ref is None else pysamIterators.CachedFasta( pysam.FastaFile(args.ref)), 'alleleResolver': None if args.alleles is None else alleleTools.AlleleResolver( args.alleles, lazyLoad=not args.loadAllelesToMem), 'moleculeRadius': args.moleculeRadius, 'verbose': args.verbose, 'exon_gtf': args.exons, 'intron_gtf': args.introns } pairedEnd = False flaggers = [] qFlagger = None # if args.ftag: # Now we ALWAYS put reads through qflagger qFlagger = QueryNameFlagger(**flaggerArguments) flaggers.append(qFlagger) if args.nla: flaggers.append(NlaIIIFlagger(**flaggerArguments)) pairedEnd = True if args.mspji: flaggers.append(MSPJIFlagger(**flaggerArguments)) pairedEnd = True if args.chic: flaggers.append(ChicSeqFlagger(**flaggerArguments)) pairedEnd = True if args.scar: flaggers.append(ScarFlagger(**flaggerArguments)) pairedEnd = True if args.tag: flaggers.append(TagFlagger(tag=args.tag)) if args.atag: print("Running allele tagging") flaggers.append(AlleleTagger(**flaggerArguments)) if args.rna: flaggers.append(RNA_Flagger(**flaggerArguments)) if args.taps: flaggers.append(TAPSFlagger(**flaggerArguments)) pairedEnd = True if args.alleles is not None: # Check if the variant file is valid.. if not (args.alleles.endswith('.vcf.gz') or args.alleles.endswith('.bcf.gz')): raise ValueError(f"""Please supply an indexed (bg)zipped VCF file. You can convert your file using: bcftools view {args.alleles} -O b -o {args.alleles}.gz; then index using bcftools index {args.alleles}.gz """) if not ( os.path.exists( args.alleles + '.csi') or os.path.exists( args.alleles + '.tbi')): raise ValueError(f"""Please supply an indexed (bg)zipped VCF file. Index using: bcftools index {args.alleles} """) if pairedEnd: print('Assuming the input is paired end') else: print('Assuming the input is single end ( Has no influence on ftag)') if not os.path.exists(args.o): try: os.makedirs(args.o) except Exception as e: pass # Here we walk through the bamfiles and fetch tmpprefix = f'{args.tmpprefix}' for bamFilePath in args.bamfiles: bamFile = pysam.AlignmentFile(bamFilePath, "rb") header = bamFile.header.copy() outPathTemp = f'{args.o}/{os.path.basename(bamFilePath)}.unsorted' outPathTempWithHeader = f'{args.o}/{os.path.basename(bamFilePath)}.unsorted.with_header.bam' outPath = f"{args.o}/{os.path.basename(bamFilePath).replace('.bam', '')}.bam" print(f'Now reading {bamFilePath}, writing to {outPath}') if args.dedup: dedupOutPathTemp = f'{args.o}/{os.path.basename(bamFilePath)}.dedup.unsorted' dedupOutPathTempWithHeader = f'{args.o}/{os.path.basename(bamFilePath)}.dedup.with_header.bam' dedupOutPath = f"{args.o}/{os.path.basename(bamFilePath).replace('.bam', '')}.dedup.bam" dedupOutputFile = pysam.AlignmentFile( dedupOutPathTemp, "wb", header=header) outputFile = pysam.AlignmentFile(outPathTemp, "wb", header=header) bamName = os.path.basename(bamFilePath).replace('.bam', '') # if os.path.exists(outPath+'.bai'): # continue i = 0 if pairedEnd: it = pysamIterators.MatePairIterator( bamFile, performProperPairCheck=False, contig=args.chr) for i, (R1, R2) in enumerate(it): for flagger in flaggers: try: flagger.digest([R1, R2]) except Exception as e: print(e) if args.fatal: print(R1, R2) raise e try: if R1 is not None: outputFile.write(R1) if R2 is not None: outputFile.write(R2) except Exception as e: for flagger in flaggers: print(flagger) raise e if args.dedup and ( (R1 is not None and R1.has_tag('RC') and R1.get_tag('RC') == 1) or 
( R2 is not None and R2.has_tag('RC') and R2.get_tag('RC') == 1)): if R1 is not None: dedupOutputFile.write(R1) if R2 is not None: dedupOutputFile.write(R2) if args.head is not None and i >= args.head: break else: iterator = bamFile.fetch( contig=args.chr) if args.chr is not None else bamFile for i, R1 in enumerate(iterator): for flagger in flaggers: try: flagger.digest([R1]) except Exception as e: if args.fatal: raise e print(e) outputFile.write(R1) if args.head is not None and i >= args.head: break print(f'{bamFilePath}: processed {i} reads') print("Finishing files (forcing close)") outputFile.close() if args.dedup: dedupOutputFile.close() if qFlagger is not None: readGroups = {} for readGroup in qFlagger.assignedReadGroups: flowCell, lane, sampleLib = readGroup.split('.') library, sample = sampleLib.rsplit('_', 1) readGroups[readGroup] = { 'ID': readGroup, 'LB': library, 'PL': 'ILLUMINA', 'SM': sample, 'PU': readGroup} headerSamFilePath = outPath.replace('.bam', '') + '.header.sam' hCopy = header.to_dict() hCopy['RG'] = list(readGroups.values()) with gzip.open(outPathTemp.replace('.bam', '') + '.readGroups.pickle.gz', 'wb') as rgzip, pysam.AlignmentFile(headerSamFilePath, 'w', header=hCopy) as headerSam: pickle.dump(readGroups, rgzip) # for readGroup, fields in readGroups.items(): # headerSam.write(f'@RG\tID:{readGroup}\tPL:{fields["PL"]}\tSM:{fields["SM"]}\tPU:{fields["PU"]}\tLB:{fields["LB"]}\n') # Clear the read groups qFlagger.assignedReadGroups = set() # Perform a reheading, sort and index rehead_cmd = f"""{{ cat {headerSamFilePath}; samtools view {outPathTemp}; }} | samtools view -b > {outPathTempWithHeader} ; rm {outPathTemp}; samtools sort -T {tmpprefix} {outPathTempWithHeader} > {outPath}; samtools index {outPath}; rm {outPathTempWithHeader}; """ print(f"Adding read groups to header and sorting.") os.system(rehead_cmd) # Same procedure for dedup: if args.dedup: rehead_cmd = f"""{{ cat {headerSamFilePath}; samtools view {dedupOutPathTemp}; }} | samtools view -b > {dedupOutPathTempWithHeader} ; rm {dedupOutPathTemp}; samtools sort -T {tmpprefix} {dedupOutPathTempWithHeader} > {dedupOutPath}; samtools index {dedupOutPath}; rm {dedupOutPathTempWithHeader}; """ print(f"Dedup file: adding read groups to header and sorting.") os.system(rehead_cmd) else: # we cannot assign readgroups... rehead_cmd = f""" samtools sort -T {tmpprefix} {outPathTemp} > {outPath}; samtools index {outPath}; rm {outPathTemp}; """ print(f"Adding read groups to header and sorting.") os.system(rehead_cmd) if args.dedup: rehead_cmd = f""" samtools sort -T {tmpprefix} {dedupOutPathTemp} > {dedupOutPath}; samtools index {dedupOutPath}; rm {dedupOutPathTemp}; """ print(f"Dedup file: adding read groups to header and sorting.") os.system(rehead_cmd) """ Is:NS500414;RN:455;Fc:HYLVHBGX5;La:3;Ti:13601;CX:9882;CY:17671;Fi:N;CN:0;aa:CCGTCC;aA:CCGTCC;aI:16;LY:A3-P15-1-1;RX:ACG;RQ:GGG;BI:17;bc:ACTCGATG;BC:ACTCGATG;QT:GGKKKKKK;MX:NLAIII384C8U3;A2:TGG;AQ:E6E for f in $(ls *.bam | cut -f 1,2,3 -d '-' | sort | uniq | grep -v bam); do submission.py -y -time 20 -t 1 "addTagsToLennartNLABam.py $f*cell*.bam"; done """