Source code for singlecellmultiomics.molecule.iterator

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from singlecellmultiomics.molecule import Molecule
from singlecellmultiomics.fragment import Fragment
from singlecellmultiomics.universalBamTagger import QueryNameFlagger
import pysamiterators.iterators
import collections
import pysam


[docs]class MoleculeIterator(): """Iterate over molecules in pysam.AlignmentFile or reads from a generator or list Example: >>> !wget https://github.com/BuysDB/SingleCellMultiOmics/blob/master/data/mini_nla_test.bam?raw=true -O mini_nla_test.bam >>> !wget https://github.com/BuysDB/SingleCellMultiOmics/blob/master/data/mini_nla_test.bam.bai?raw=true -O mini_nla_test.bam.bai >>> import pysam >>> from singlecellmultiomics.molecule import NlaIIIMolecule, MoleculeIterator >>> from singlecellmultiomics.fragment import NLAIIIFragment >>> import pysamiterators >>> alignments = pysam.AlignmentFile('mini_nla_test.bam') >>> for molecule in MoleculeIterator( >>> alignments=alignments, >>> moleculeClass=singlecellmultiomics.molecule.NlaIIIMolecule, >>> fragmentClass=singlecellmultiomics.fragment.NLAIIIFragment, >>> ): >>> break >>> molecule NlaIIIMolecule with 1 assinged fragments Allele :No allele assigned Fragment: sample:APKS1P25-NLAP2L2_57 umi:CCG span:chr1 164834728-164834868 strand:+ has R1: yes has R2: no randomer trimmed: no DS:164834865 RS:0 RZ:CAT Restriction site:('chr1', 164834865) It is also possible to supply and iterator instead of a SAM/BAM file handle Example: >>> from singlecellmultiomics.molecule import MoleculeIterator >>> from singlecellmultiomics.fragment import Fragment >>> import pysam >>> # Create SAM file to write some example reads to: >>> test_sam = pysam.AlignmentFile('test.sam','w',reference_names=['chr1','chr2'],reference_lengths=[1000,1000]) >>> read_A = pysam.AlignedSegment(test_sam.header) >>> read_A.set_tag('SM','CELL_1') >>> read_A.set_tag('RX','CAT') >>> read_A.reference_name = 'chr1' >>> read_A.reference_start = 100 >>> read_A.query_sequence = 'ATCGGG' >>> read_A.cigarstring = '6M' >>> read_A.mapping_quality = 60 >>> # Create a second read which is a duplicate of the previous >>> read_B = pysam.AlignedSegment(test_sam.header) >>> read_B.set_tag('SM','CELL_1') >>> read_B.set_tag('RX','CAT') >>> read_B.reference_name = 'chr1' >>> read_B.reference_start = 100 >>> read_B.query_sequence = 'ATCGG' >>> read_B.cigarstring = '5M' >>> read_B.mapping_quality = 60 >>> # Create a thids read which is belonging to another cell >>> read_C = pysam.AlignedSegment(test_sam.header) >>> read_C.set_tag('SM','CELL_2') >>> read_C.set_tag('RX','CAT') >>> read_C.reference_name = 'chr1' >>> read_C.reference_start = 100 >>> read_C.query_sequence = 'ATCGG' >>> read_C.cigarstring = '5M' >>> read_C.mapping_quality = 60 >>> # Set up an iterable containing the reads: >>> reads = [ read_A,read_B,read_C ] >>> molecules = [] >>> for molecule in MoleculeIterator( reads ): >>> print(molecule) Molecule with 2 assinged fragments Allele :No allele assigned Fragment: sample:CELL_1 umi:CAT span:chr1 100-106 strand:+ has R1: yes has R2: no randomer trimmed: no Fragment: sample:CELL_1 umi:CAT span:chr1 100-105 strand:+ has R1: yes has R2: no randomer trimmed: no Molecule with 1 assinged fragments Allele :No allele assigned Fragment: sample:CELL_2 umi:CAT span:chr1 100-105 strand:+ has R1: yes has R2: no randomer trimmed: no Warning: Always make sure the reads being supplied to the MoleculeIterator sorted by genomic coordinate! """ def __init__(self, alignments, moleculeClass=Molecule, fragmentClass=Fragment, check_eject_every=10_000, # bigger sizes are very speed benificial molecule_class_args={}, # because the relative amount of molecules # which can be ejected will be much higher fragment_class_args={}, perform_qflag=True, pooling_method=1, yield_invalid=False, queryNameFlagger=None, every_fragment_as_molecule=False, yield_secondary = False, yield_supplementary= False, **pysamArgs): """Iterate over molecules in pysam.AlignmentFile Args: alignments (pysam.AlignmentFile) or iterable yielding tuples: Alignments to extract molecules from moleculeClass (pysam.FastaFile): Class to use for molecules. fragmentClass (pysam.FastaFile): Class to use for fragments. check_eject_every (int): Check for yielding every N reads. molecule_class_args (dict): arguments to pass to moleculeClass. fragment_class_args (dict): arguments to pass to fragmentClass. perform_qflag (bool): Make sure the sample/umi etc tags are copied from the read name into bam tags pooling_method(int) : 0: no pooling, 1: only compare molecules with the same sample id and hash yield_invalid (bool) : When true all fragments which are invalid will be yielded as a molecule queryNameFlagger(class) : class which contains the method digest(self, reads) which accepts pysam.AlignedSegments and adds at least the SM and RX tags every_fragment_as_molecule(bool): When set to true all valid fragments are emitted as molecule with one associated fragment, this is a way to disable deduplication. yield_secondary (bool): When true all secondary alignments will be yielded as a molecule **kwargs: arguments to pass to the pysam.AlignmentFile.fetch function Yields: molecule (Molecule): Molecule """ if queryNameFlagger is None: queryNameFlagger = QueryNameFlagger() self.queryNameFlagger = queryNameFlagger self.alignments = alignments self.moleculeClass = moleculeClass self.fragmentClass = fragmentClass self.check_eject_every = check_eject_every self.molecule_class_args = molecule_class_args self.fragment_class_args = fragment_class_args self.perform_qflag = perform_qflag self.pysamArgs = pysamArgs self.matePairIterator = None self.pooling_method = pooling_method self.yield_invalid = yield_invalid self.every_fragment_as_molecule = every_fragment_as_molecule self._clear_cache() def _clear_cache(self): """Clear cache containing non yielded molecules""" self.waiting_fragments = 0 self.yielded_fragments = 0 self.yielded_molecules = 0 self.check_ejection_iter = 0 if self.pooling_method == 0: self.molecules = [] elif self.pooling_method == 1: self.molecules_per_cell = collections.defaultdict( list) # {hash:[], :} else: raise NotImplementedError() def __repr__(self): return f"""Molecule Iterator, generates fragments from {self.fragmentClass} into molecules based on {self.moleculeClass}. Yielded {self.yielded_fragments} fragments, {self.waiting_fragments} fragments are waiting to be ejected. {self.get_molecule_cache_size()} molecules cached. Mate pair iterator: {str(self.matePairIterator)}"""
[docs] def get_molecule_cache_size(self): if self.pooling_method == 0: return len(self.molecules) elif self.pooling_method == 1: return sum(len(cell_molecules) for cell, cell_molecules in self.molecules_per_cell.items()) else: raise NotImplementedError()
def __iter__(self): if self.perform_qflag: qf = self.queryNameFlagger self._clear_cache() self.waiting_fragments = 0 # prepare the source iterator which generates the read pairs: if isinstance(self.alignments, pysam.libcalignmentfile.AlignmentFile): self.matePairIterator = pysamiterators.iterators.MatePairIterator( self.alignments, performProperPairCheck=False, **self.pysamArgs) else: # If an iterable is provided use this as read source: self.matePairIterator = self.alignments for reads in self.matePairIterator: if isinstance(reads, pysam.AlignedSegment): R1 = reads R2 = None elif len(reads) == 2: R1, R2 = reads elif (isinstance(reads, list) or isinstance(reads, tuple)) and len(reads) == 1: R1 = reads[0] R2 = None else: raise ValueError( 'Iterable not understood, supply either pysam.AlignedSegment or lists of pysam.AlignedSegment') # Make sure the sample/umi etc tags are placed: if self.perform_qflag: qf.digest([R1, R2]) fragment = self.fragmentClass([R1, R2], **self.fragment_class_args) if not fragment.is_valid(): if self.yield_invalid: m = self.moleculeClass( fragment, **self.molecule_class_args) m.__finalise__() yield m continue if self.every_fragment_as_molecule: m = self.moleculeClass(fragment, **self.molecule_class_args) m.__finalise__() yield m continue added = False if self.pooling_method == 0: for molecule in self.molecules: if molecule.add_fragment(fragment, use_hash=False): added = True break elif self.pooling_method == 1: for molecule in self.molecules_per_cell[fragment.match_hash]: if molecule.add_fragment(fragment, use_hash=True): added = True break if not added: if self.pooling_method == 0: self.molecules.append(self.moleculeClass( fragment, **self.molecule_class_args)) else: self.molecules_per_cell[fragment.match_hash].append( self.moleculeClass(fragment, **self.molecule_class_args) ) self.waiting_fragments += 1 self.check_ejection_iter += 1 if self.check_ejection_iter > self.check_eject_every: current_chrom, _, current_position = fragment.get_span() if current_chrom is None: continue self.check_ejection_iter = 0 if self.pooling_method == 0: to_pop = [] for i, m in enumerate(self.molecules): if m.can_be_yielded(current_chrom, current_position): to_pop.append(i) self.waiting_fragments -= len(m) self.yielded_fragments += len(m) for i, j in enumerate(to_pop): m = self.molecules.pop(i - j) m.__finalise__() yield m else: for hash_group, molecules in self.molecules_per_cell.items(): to_pop = [] for i, m in enumerate(molecules): if m.can_be_yielded( current_chrom, current_position): to_pop.append(i) self.waiting_fragments -= len(m) self.yielded_fragments += len(m) for i, j in enumerate(to_pop): m = self.molecules_per_cell[hash_group].pop(i - j) m.__finalise__() yield m # Yield remains if self.pooling_method == 0: for m in self.molecules: m.__finalise__() yield from iter(self.molecules) else: for hash_group, molecules in self.molecules_per_cell.items(): for i, m in enumerate(molecules): m.__finalise__() yield m self._clear_cache()