Source code for singlecellmultiomics.universalBamTagger.tapsTabulator

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import singlecellmultiomics.molecule
import singlecellmultiomics.fragment
import pysamiterators
import pysam
import argparse
import singlecellmultiomics.bamProcessing.bamFunctions as bf
from singlecellmultiomics.features import FeatureContainer
import os
from singlecellmultiomics.alleleTools import AlleleResolver


[docs]def finish_bam(output, args, temp_out): output.close() # Sort and index # Perform a reheading, sort and index cmd = f"""samtools sort {temp_out} > {args.bamout}; samtools index {args.bamout}; rm {temp_out}; """ os.system(cmd)
# sy zcat chr18 | sort -S 4G -k 3,3 -n -T ./TMP | gzip > # chr18_sorted.gzip" -m 5 if __name__ == '__main__': argparser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description='Extract TAPS methylation calls from BAM file') argparser.add_argument('alignmentfile', type=str) argparser.add_argument( '-ref', type=str, help='path to reference fasta file, auto detected from bamfile') argparser.add_argument( '--every_fragment_as_molecule', action='store_true', help='Assign every fragment as a molecule, this effectively disables UMI deduplication and consensus calls based on multiple fragments') argparser.add_argument( '-head', type=int, help='Tabulate the first N valid molecules') argparser.add_argument( '-features', type=str, help='Annotate cut locations with these features, path to gtf file') argparser.add_argument( '-min_phred_score', type=int, help='Do not call methylation for bases with a phred score lower than "min_phred_score"',default=None) argparser.add_argument( '-dove_R1_distance', type=int, help='Do not call methylation N bases form the end of R1',default=8) argparser.add_argument( '-dove_R2_distance', type=int, help='Do not call methylation N bases form the end of R2',default=8) argparser.add_argument( '-skip_last_n_cycles_R1', type=int, help='Do not call methylation N bases form the end of R1',default=5) argparser.add_argument( '-skip_first_n_cycles_R1', type=int, help='Do not call methylation N bases form the start of R1',default=5) argparser.add_argument( '-skip_last_n_cycles_R2', type=int, help='Do not call methylation N bases form the end of R2',default=5) argparser.add_argument( '-skip_first_n_cycles_R2', type=int, help='Do not call methylation N bases form the start of R2',default=5) argparser.add_argument('-minmq', type=int, default=50) argparser.add_argument( '-contig', type=str, help='contig to run on, all when not specified') argparser.add_argument('-method', type=str, default='nla', help='nla, or chic') argparser.add_argument( '-fmt', type=str, default='table', help="""output format (options are: 'bed' or 'table' or 'table_more'), Columns are: table: sample, cut_site, molecule_span, umi, strand, chromosome, location (1 based), context long format (slower): sample, cut_site, molecule_span, mean_R1_cyle, mean_R2_cycle, mean_phred_score, umi, strand, chromosome, location (1 based), context """) argparser.add_argument( '-moleculeNameSep', type=str, help='Separator to use in molecule name', default=':') argparser.add_argument( '-samples', type=str, help='Samples to select, separate with comma. For example CellA,CellC,CellZ', default=None) argparser.add_argument( '-context', type=str, help='Contexts to select, separate with comma. For example Z,H,X', default=None) argparser.add_argument( '-bamout', type=str, help="optional (tagged) output BAM path") argparser.add_argument( '--allow_single_end', action='store_true', help='Allow single end reads') allele_gr = argparser.add_argument_group('alleles') allele_gr.add_argument('-alleles', type=str, help="Phased allele file (VCF)") allele_gr.add_argument( '-allele_samples', type=str, help="Comma separated samples to extract from the VCF file. For example B6,SPRET") allele_gr.add_argument( '-unphased_alleles', type=str, help="Unphased allele file (VCF)") allele_gr.add_argument( '--haplo_molecule_assignment', action='store_true', help='Take allele information into account during molecule assignment ') allele_gr.add_argument( '--set_allele_resolver_verbose', action='store_true', help='Makes the allele resolver print more') allele_gr.add_argument( '--use_allele_cache', action='store_true', help='''Write and use a cache file for the allele information. NOTE: THIS IS NOT THREAD SAFE! Meaning you should not use this function on multiple libraries at the same time when the cache files are not yet available. Once they are available there is not thread safety issue anymore''') args = argparser.parse_args() alignments = pysam.AlignmentFile(args.alignmentfile) if args.bamout is not None: temp_out = f'{args.bamout}.unsorted.bam' output = pysam.AlignmentFile(temp_out, "wb", header=alignments.header) else: output = None samples = None if args.samples is None else set(args.samples.split(',')) contexts = None if args.context is None else set( [x.upper() for x in args.context.split(',')] + [x.lower() for x in args.context.split(',')]) if args.ref is None: args.ref = bf.get_reference_from_pysam_alignmentFile(alignments) if args.ref is None: raise ValueError("Supply reference, -ref") reference = pysamiterators.iterators.CachedFasta(pysam.FastaFile(args.ref)) taps = singlecellmultiomics.molecule.TAPS() if args.features is not None: features = FeatureContainer() features.loadGTF(args.features,thirdOnly='gene',store_all=True) else: features = None fragment_class_args={'umi_hamming_distance': 1, 'no_umi_cigar_processing':False} molecule_class_args = { 'reference': reference, 'taps': taps, 'taps_strand':'R', 'min_max_mapping_quality': args.minmq } if args.method == 'nla': molecule_class = singlecellmultiomics.molecule.TAPSNlaIIIMolecule fragment_class = singlecellmultiomics.fragment.NlaIIIFragment molecule_class_args.update({'site_has_to_be_mapped': True}) elif args.method == 'chic': molecule_class = singlecellmultiomics.molecule.TAPSCHICMolecule fragment_class = singlecellmultiomics.fragment.CHICFragment else: raise ValueError("Supply 'nla' or 'chic' for -method") molecule_class_args['methylation_consensus_kwargs'] = { 'skip_first_n_cycles_R1':args.skip_first_n_cycles_R1, 'skip_first_n_cycles_R2':args.skip_first_n_cycles_R2, 'dove_R1_distance':args.dove_R1_distance, 'dove_R2_distance':args.dove_R2_distance, 'skip_last_n_cycles_R1':args.skip_last_n_cycles_R1, 'skip_last_n_cycles_R2':args.skip_last_n_cycles_R2, 'min_phred_score':args.min_phred_score } ignore_conversions = set([('C', 'T'), ('G', 'A')]) if args.alleles is not None and args.alleles!='none': molecule_class_args['allele_resolver'] = AlleleResolver( args.alleles, select_samples=args.allele_samples.split(',') if args.allele_samples is not None else None, lazyLoad=True, use_cache=args.use_allele_cache, verbose = args.set_allele_resolver_verbose, ignore_conversions=ignore_conversions) ar = molecule_class_args.get('allele_resolver') if args.allow_single_end: # Single end base calls are "unsafe", allow them : molecule_class_args['allow_unsafe_base_calls'] = True fragment_class_args['single_end'] = True s = args.moleculeNameSep try: for i, molecule in enumerate(singlecellmultiomics.molecule.MoleculeIterator( alignments=alignments, molecule_class=molecule_class, yield_invalid=(output is not None), every_fragment_as_molecule=args.every_fragment_as_molecule, fragment_class=fragment_class, fragment_class_args=fragment_class_args, perform_allele_clustering = args.haplo_molecule_assignment and molecule_class_args.get('allele_resolver', None) is not None, molecule_class_args=molecule_class_args, contig=args.contig)): molecule.set_meta('mi',i) if args.head and (i - 1) >= args.head: break if not molecule.is_valid(set_rejection_reasons=True): if output is not None: molecule.write_pysam(output) continue # Skip sample if not selected if samples is not None and molecule.sample not in samples: molecule.set_rejection_reason('sample_not_selected') if output is not None: molecule.write_pysam(output) continue if args.fmt == "table_more": consensus = molecule.get_consensus() cut_contig,CUT_SITE,cut_strand = molecule.get_cut_site() if features is not None: features.findFeaturesAt(cut_contig, CUT_SITE) genes = [] for hit_start,hit_end,gene,strand,meta in features.findFeaturesAt(cut_contig, CUT_SITE): g = dict(meta).get('gene_name',gene) genes.append(g) additional=f"\t{','.join(genes)}" else: additional = "" if ar is not None: # Allele resolver is defined allele = molecule.allele if allele is not None: additional+=f'\t{allele}' else: additional+=f'\tnone' for (chromosome, location), call in molecule.methylation_call_dict.items(): if call['context'] == '.': # Only print calls concerning C's continue # Skip non-selected contexts if contexts is not None and call['context'] not in contexts: continue if args.fmt == "table": print( f"{molecule.sample}{s}{i}{s}{CUT_SITE}{s}{molecule.estimated_max_length}{s}{molecule.umi}{s}{molecule.get_strand_repr()}\t{chromosome}\t{location+1}\t{call['context']}\t{molecule.ligation_motif}{additional}") elif args.fmt == "table_more": qual = call['qual'] cov = call['cov'] print( f"{molecule.sample}{s}{i}{s}{CUT_SITE}{s}{molecule.estimated_max_length}{s}{molecule.umi}{s}{molecule.get_strand_repr()}\t{chromosome}\t{location+1}\t{call['context']}\t{qual}\t{cov}\t{molecule.ligation_motif}{additional}") elif args.fmt == "bed": sample = molecule.sample.split("_")[-1] print( f'{chromosome}\t{location}\t{location+1}\t{sample}\t1\t{molecule.get_strand_repr()}') if output is not None: molecule.write_tags() molecule.write_pysam(output) except (KeyboardInterrupt, BrokenPipeError) as e: pass if output is not None: finish_bam(output, args, temp_out)