Source code for singlecellmultiomics.universalBamTagger.bamtagmultiome

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pysam
from singlecellmultiomics.molecule import MoleculeIterator
import singlecellmultiomics
import singlecellmultiomics.molecule
import singlecellmultiomics.fragment
from singlecellmultiomics.bamProcessing.bamFunctions import sorted_bam_file, get_reference_from_pysam_alignmentFile, write_program_tag, MapabilityReader, verify_and_fix_bam

from singlecellmultiomics.utils import is_main_chromosome
import singlecellmultiomics.alleleTools
from singlecellmultiomics.universalBamTagger.customreads import CustomAssingmentQueryNameFlagger
import singlecellmultiomics.features
import pysamiterators
import argparse
import uuid
import os
import sys
import colorama
import sklearn
import pkg_resources
import pickle
from datetime import datetime
import traceback

available_consensus_models = pkg_resources.resource_listdir('singlecellmultiomics','molecule/consensus_model')

argparser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    description='Assign molecules, set sample tags, set alleles')
argparser.add_argument('bamin', type=str)
argparser.add_argument('-o', type=str, help="output bam file", required=True)
argparser.add_argument(
    '-method',
    type=str,
    default=None,
    help="Protocol to tag, select from:nla, qflag, chic, nla_transcriptome, vasa, cs, nla_taps ,chic_taps, nla_no_overhang")
argparser.add_argument(
    '-qflagger',
    type=str,
    default=None,
    help="Query flagging algorithm")
argparser.add_argument('-custom_flags', type=str, default="MI,RX,BI,SM")
argparser.add_argument(
    '-ref',
    type=str,
    default=None,
    help="Path to reference fast (autodected if not supplied)")
argparser.add_argument('-umi_hamming_distance', type=int, default=1)
argparser.add_argument('-head', type=int)
argparser.add_argument('-contig', type=str, help='Contig to only process')
argparser.add_argument('-region_start', type=int, help='Zero based start coordinate of region to process')
argparser.add_argument('-region_end', type=int, help='Zero based end coordinate of region to process')

argparser.add_argument('-alleles', type=str, help="Phased allele file (VCF)")
argparser.add_argument(
    '-allele_samples',
    type=str,
    help="Comma separated samples to extract from the VCF file. For example B6,SPRET")
argparser.add_argument(
    '-unphased_alleles',
    type=str,
    help="Unphased allele file (VCF)")
argparser.add_argument(
    '--every_fragment_as_molecule',
    action='store_true',
    help='Assign every fragment as a molecule, this effectively disables UMI deduplication')
argparser.add_argument(
    '--ignore_bam_issues',
    action='store_true',
    help='Ignore truncation')

argparser.add_argument(
    '-mapfile',
    type=str,
    help='Path to *.safe.bgzf file, used to decide if molecules are uniquely mappable, generate one using createMapabilityIndex.py ')

argparser.add_argument(
    '-annotmethod',
    type=int,
    default=1,
    help="Annotation resolving method. 0: molecule consensus aligned blocks. 1: per read per aligned base")
cluster = argparser.add_argument_group('cluster execution')
cluster.add_argument(
    '--cluster',
    action='store_true',
    help='split by chromosomes and submit the job on cluster')
cluster.add_argument(
    '--no_rejects',
    action='store_true',
    help='Write rejected reads to output file')
cluster.add_argument(
    '-mem',
    default=40,
    type=int,
    help='Memory requested per job')
cluster.add_argument(
    '-time',
    default=52,
    type=int,
    help='Time requested per job')

cluster.add_argument(
    '-clusterdir',
    type=str,
    default=None,
    help='Folder to store cluster files in (scripts and sterr/out, when not specified a "cluster" folder will be made in same directory as -o')


tr = argparser.add_argument_group('transcriptome specific settings')
tr.add_argument('-exons', type=str, help='Exon GTF file')
tr.add_argument(
    '-introns',
    type=str,
    help='Intron GTF file, use exonGTF_to_intronGTF.py to create this file')

cg = argparser.add_argument_group('molecule consensus specific settings')
cg.add_argument(
    '--consensus',
    action='store_true',
    help='Calculate molecule consensus read, this feature is _VERY_ experimental')
cg.add_argument('-consensus_mask_variants', type=str,
                help='variants to mask during training (VCF file)')
cg.add_argument(
    '-consensus_model',
    type=str,
    help=f'Name of or path to consensus classifier, built-in available models are:{", ".join(available_consensus_models)}',
    default=None)
cg.add_argument(
    '-consensus_n_train',
    type=int,
    help='Amount of bases used for training',
    default=500_000)

cg.add_argument(
    '-consensus_k_rad',
    type=int,
    help='consensus model k radius',
    default=3)

cg.add_argument('--no_source_reads', action='store_true',
                help='Do not write original reads, only consensus ')

cg.add_argument('--consensus_allow_train_location_oversampling', action='store_true',
                help='Allow to train the consensus model multiple times for a single genomic location')




[docs]def run_multiome_tagging_cmd(commandline): args = argparser.parse_args(commandline) run_multiome_tagging(args)
[docs]def run_multiome_tagging(args): """ Run multiome tagging adds molecule information Arguments: bamin (str) : bam file to process o(str) : path to output bam file method(str): Protocol to tag, select from:nla, qflag, chic, nla_transcriptome, vasa, cs, nla_taps ,chic_taps, nla_no_overhang qflagger(str): Query flagging algorithm to use, this algorithm extracts UMI and sample information from your reads. When no query flagging algorithm is specified, the `singlecellmultiomics.universalBamTagger.universalBamTagger.QueryNameFlagger` is used method(str) : Method name, what kind of molecules need to be extracted. Select from: nla qflag chic nla_transcriptome vasa cs nla_taps chic_taps custom_flags(str): Arguments passed to the query name flagger, comma separated "MI,RX,BI,SM" ref(str) : Path to reference fasta file, autodected from bam header when not supplied umi_hamming_distance(int) : Max hamming distance on UMI's head (int) : Amount of molecules to process contig (str) : only process this contig region_start(int) : Zero based start coordinate of single region to process region_end(int) : Zero based end coordinate of single region to process, None: all contigs when contig is not set, complete contig when contig is set. alleles (str) : path to allele VCF allele_samples(str): Comma separated samples to extract from the VCF file. For example B6,SPRET unphased_alleles(str) : Path to VCF containing unphased germline SNPs mapfile (str) : 'Path to \*.safe.bgzf file, used to decide if molecules are uniquely mappable, generate one using createMapabilityIndex.py annotmethod (int) : Annotation resolving method. 0: molecule consensus aligned blocks. 1: per read per aligned base cluster (bool) : Run contigs in separate cluster jobs no_rejects(bool) : Do not write rejected reads mem (int) : Amount of gigabytes to request for cluster jobs time(int) : amount of wall clock hours to request for cluster jobs exons(str): Path to exon annotation GTF file introns(str): Path to intron annotation GTF file consensus(bool) : Calculate molecule consensus read, this feature is _VERY_ experimental consensus_model(str) : Path to consensus calling model, when none specified, this is learned based on the supplied bam file, ignoring sites supplied by -consensus_mask_variants consensus_mask_variants(str): Path VCF file masked for training on consensus caller consensus_n_train(int) : Amount of bases used for training the consensus model no_source_reads(bool) : Do not write original reads, only consensus """ MISC_ALT_CONTIGS_SCMO = 'MISC_ALT_CONTIGS_SCMO' every_fragment_as_molecule = args.every_fragment_as_molecule if not args.o.endswith('.bam'): raise ValueError( "Supply an output which ends in .bam, for example -o output.bam") # Verify wether the input file is indexed and sorted... if not args.ignore_bam_issues: verify_and_fix_bam(args.bamin) if os.path.exists(args.o): print(f"Removing existing file {args.o}") os.remove(args.o) input_bam = pysam.AlignmentFile(args.bamin, "rb", ignore_truncation=args.ignore_bam_issues) # autodetect reference: reference = None if args.ref is None: args.ref = get_reference_from_pysam_alignmentFile(input_bam) if args.ref is not None: try: reference = pysamiterators.iterators.CachedFasta( pysam.FastaFile(args.ref)) except Exception as e: print("Error when loading the reference file, continuing without a reference") reference = None ##### Define fragment and molecule class arguments and instances: #### queryNameFlagger = None if args.qflagger is not None: if args.qflagger == 'custom_flags': queryNameFlagger = CustomAssingmentQueryNameFlagger( args.custom_flags.split(',')) else: raise ValueError("Select from 'custom_flags, ..' ") molecule_class_args = { 'umi_hamming_distance': args.umi_hamming_distance, 'reference': reference } fragment_class_args = {} yield_invalid = True # if invalid reads should be written if args.no_rejects: yield_invalid = False ignore_conversions = None if args.method == 'nla_taps' or args.method == 'chic_taps': ignore_conversions = set([('C', 'T'), ('G', 'A')]) if args.alleles is not None: molecule_class_args['allele_resolver'] = singlecellmultiomics.alleleTools.AlleleResolver( args.alleles, select_samples=args.allele_samples.split(',') if args.allele_samples is not None else None, lazyLoad=True, ignore_conversions=ignore_conversions) if args.mapfile is not None: molecule_class_args['mapability_reader'] = MapabilityReader( args.mapfile) ### Transcriptome configuration ### if args.method in ('nla_transcriptome', 'cs', 'vasa'): print( colorama.Style.BRIGHT + 'Running in transcriptome annotation mode' + colorama.Style.RESET_ALL) if args.exons is None or args.introns is None: raise ValueError("Please supply both intron and exon GTF files") transcriptome_features = singlecellmultiomics.features.FeatureContainer() print("Loading exons", end='\r') transcriptome_features.loadGTF( args.exons, select_feature_type=['exon'], identifierFields=( 'exon_id', 'gene_id'), store_all=True, contig=args.contig, head=None) print("Loading introns", end='\r') transcriptome_features.loadGTF( args.introns, select_feature_type=['intron'], identifierFields=['transcript_id'], store_all=True, contig=args.contig, head=None) print("All features loaded") # Add more molecule class arguments molecule_class_args.update({ 'features': transcriptome_features, 'auto_set_intron_exon_features': True }) ### Method specific configuration ### if args.method == 'qflag': moleculeClass = singlecellmultiomics.molecule.Molecule fragmentClass = singlecellmultiomics.fragment.Fragment # Write all reads yield_invalid = True elif args.method == 'chic': moleculeClass = singlecellmultiomics.molecule.CHICMolecule fragmentClass = singlecellmultiomics.fragment.CHICFragment elif args.method == 'nla' or args.method == 'nla_no_overhang': moleculeClass = singlecellmultiomics.molecule.NlaIIIMolecule fragmentClass = singlecellmultiomics.fragment.NLAIIIFragment if args.method == 'nla_no_overhang': assert reference is not None, 'Supply a reference fasta using -ref!' fragment_class_args.update({ 'reference': reference, 'no_overhang': True }) elif args.method == 'nla_transcriptome': moleculeClass = singlecellmultiomics.molecule.AnnotatedNLAIIIMolecule fragmentClass = singlecellmultiomics.fragment.NLAIIIFragment molecule_class_args.update({ 'pooling_method': 1, # all data from the same cell can be dealt with separately 'stranded': None # data is not stranded }) elif args.method == 'nla_taps': moleculeClass = singlecellmultiomics.molecule.TAPSNlaIIIMolecule fragmentClass = singlecellmultiomics.fragment.NLAIIIFragment molecule_class_args.update({ 'reference': reference, 'taps': singlecellmultiomics.molecule.TAPS(reference=reference) }) elif args.method == 'chic_taps': molecule_class_args.update({ 'reference': reference, 'taps': singlecellmultiomics.molecule.TAPS(reference=reference) }) moleculeClass = singlecellmultiomics.molecule.TAPSCHICMolecule fragmentClass = singlecellmultiomics.fragment.CHICFragment elif args.method == 'vasa' or args.method == 'cs': moleculeClass = singlecellmultiomics.molecule.VASA fragmentClass = singlecellmultiomics.fragment.SingleEndTranscript molecule_class_args.update({ 'pooling_method': 1, # all data from the same cell can be dealt with separately 'stranded': 1 # data is stranded }) else: raise ValueError("Supply a valid method") # This decides what molecules we will traverse if args.contig == MISC_ALT_CONTIGS_SCMO: contig = None else: contig = args.contig # This decides to only extract a single genomic region: if args.region_start is not None: if args.region_end is None: raise ValueError('When supplying -region_start then also supply -region_end') region_start = args.region_start region_end = args.region_end else: region_start = None region_end = None molecule_iterator_args = { 'alignments': input_bam, 'queryNameFlagger': queryNameFlagger, 'moleculeClass': moleculeClass, 'fragmentClass': fragmentClass, 'molecule_class_args': molecule_class_args, 'fragment_class_args': fragment_class_args, 'yield_invalid': yield_invalid, 'start':region_start, 'end':region_end, 'contig': contig, 'every_fragment_as_molecule': every_fragment_as_molecule } if args.contig == MISC_ALT_CONTIGS_SCMO: # When MISC_ALT_CONTIGS_SCMO is set as argument, all molecules with reads # mapping to a contig returning True from the is_main_chromosome # function are used def Misc_contig_molecule_generator(molecule_iterator_args): for reference in input_bam.references: if not is_main_chromosome(reference): molecule_iterator_args['contig'] = reference yield from MoleculeIterator(**molecule_iterator_args) molecule_iterator = Misc_contig_molecule_generator( molecule_iterator_args) else: molecule_iterator = MoleculeIterator(**molecule_iterator_args) ##### consensus_model_path = None if args.consensus: # Load from path if available: if args.consensus_model is not None: if os.path.exists(args.consensus_model): model_path = args.consensus_model else: model_path = pkg_resources.resource_filename( 'singlecellmultiomics', f'molecule/consensus_model/{args.consensus_model}') if model_path.endswith('.h5'): from tensorflow.keras.models import load_model consensus_model = load_model(model_path) else: with open(model_path, 'rb') as f: consensus_model = pickle.load(f) else: skip_already_covered_bases = not args.consensus_allow_train_location_oversampling if args.consensus_mask_variants is None: mask_variants = None else: mask_variants = pysam.VariantFile(args.consensus_mask_variants) print("Fitting consensus model, this may take a long time") consensus_model = singlecellmultiomics.molecule.train_consensus_model( molecule_iterator, mask_variants=mask_variants, n_train=args.consensus_n_train, skip_already_covered_bases=skip_already_covered_bases ) # Write the consensus model to disk consensus_model_path = os.path.abspath( os.path.dirname(args.o)) + '/consensus_model.pickle.gz' print(f'Writing consensus model to {consensus_model_path}') with open(consensus_model_path, 'wb') as f: pickle.dump(consensus_model, f) # We needed to check if every argument is properly placed. If so; the jobs # can be sent to the cluster if args.cluster: if args.contig is None: # Create jobs for all chromosomes: temp_prefix = os.path.abspath(os.path.dirname( args.o)) + '/SCMO_' + str(uuid.uuid4()) hold_merge = [] ## Create folder to store cluster files: if args.clusterdir is None: cluster_file_folder = os.path.abspath(os.path.dirname( args.o)) + '/cluster' else: cluster_file_folder = args.clusterdir print(f'Writing cluster scripts and standard out and error to {cluster_file_folder}') if not os.path.exists(cluster_file_folder): try: os.makedirs(cluster_file_folder,exist_ok=True) except Exception as e: print(e) pass found_alts = 0 for chrom in list(input_bam.references) + [MISC_ALT_CONTIGS_SCMO]: if not is_main_chromosome(chrom): found_alts += 1 continue if chrom == MISC_ALT_CONTIGS_SCMO and found_alts == 0: continue temp_bam_path = f'{temp_prefix}_{chrom}.bam' arguments = " ".join( [x for x in sys.argv if not x == args.o and x != '-o']) + f" -contig {chrom} -o {temp_bam_path}" if consensus_model_path is not None: arguments += f' -consensus_model {consensus_model_path}' job = f'SCMULTIOMICS_{str(uuid.uuid4())}' os.system( f'submission.py --silent' + f' -y -s {cluster_file_folder} --py36 -time {args.time} -t 1 -m {args.mem} -N {job} " {arguments};"') hold_merge.append(job) hold = ','.join(hold_merge) os.system( f'submission.py --silent' + f' -y --py36 -s {cluster_file_folder} -time {args.time} -t 1 -m 10 -N {job} -hold {hold} " samtools merge -c {args.o} {temp_prefix}*.bam; samtools index {args.o}; rm {temp_prefix}*.ba*"') exit() ##### # Load unphased variants to memory unphased_allele_resolver = None if args.unphased_alleles is not None: unphased_allele_resolver = singlecellmultiomics.alleleTools.AlleleResolver( phased=False, ignore_conversions=ignore_conversions) try: for i, variant in enumerate( pysam.VariantFile( args.unphased_alleles).fetch( args.contig)): if 'PASS' not in list(variant.filter): continue if not all( len(allele) == 1 for allele in variant.alleles) or len( variant.alleles) != 2: continue if sum([len(set(variant.samples[sample].alleles)) == 2 for sample in variant.samples]) < 2: # Not heterozygous continue unphased_allele_resolver.locationToAllele[variant.chrom][variant.pos - 1] = { variant.alleles[0]: {'U'}, variant.alleles[1]: {'V'}} except Exception as e: # todo catch this more nicely print(e) out_bam_path = args.o # Copy the header input_header = input_bam.header.as_dict() # Write provenance information to BAM header write_program_tag( input_header, program_name='bamtagmultiome', command_line=" ".join( sys.argv), version=singlecellmultiomics.__version__, description=f'SingleCellMultiOmics molecule processing, executed at {datetime.now().strftime("%d/%m/%Y %H:%M:%S")}') print(f'Started writing to {out_bam_path}') read_groups = set() # Store unique read groups in this set with sorted_bam_file(out_bam_path, header=input_header, read_groups=read_groups) as out: for i, molecule in enumerate(molecule_iterator): # Stop when enough molecules are processed if args.head is not None and (i - 1) >= args.head: break # set unique molecule identifier molecule.set_meta('mi', f'{molecule.get_a_reference_id()}_{i}') # Write tag values molecule.write_tags() if unphased_allele_resolver is not None: # write unphased allele tag: molecule.write_allele_phasing_information_tag( unphased_allele_resolver, 'ua') # Update read groups for fragment in molecule: read_groups.add(fragment.get_read_group()) # Calculate molecule consensus if args.consensus: try: consensus_reads = molecule.deduplicate_to_single_CIGAR_spaced( out, f'consensus_{molecule.get_a_reference_id()}_{i}', consensus_model, NUC_RADIUS=args.consensus_k_rad ) for consensus_read in consensus_reads: consensus_read.set_tag('RG', molecule[0].get_read_group()) consensus_read.set_tag('mi', i) out.write(consensus_read) except Exception as e: #traceback.print_exc() #print(e) molecule.set_rejection_reason('CONSENSUS_FAILED',set_qcfail=True) molecule.write_pysam(out) # Write the reads to the output file if not args.no_source_reads: molecule.write_pysam(out)
if __name__ == '__main__': args = argparser.parse_args() run_multiome_tagging(args)