Source code for singlecellmultiomics.molecule.iterator

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from singlecellmultiomics.molecule import Molecule
from singlecellmultiomics.fragment import Fragment
from singlecellmultiomics.universalBamTagger import QueryNameFlagger
import pysamiterators.iterators
import collections
import pysam


[docs]class MoleculeIterator():
    """Iterate over molecules in pysam.AlignmentFile or reads from a generator or list

    Example:
        >>> !wget https://github.com/BuysDB/SingleCellMultiOmics/blob/master/data/mini_nla_test.bam?raw=true -O mini_nla_test.bam
        >>> !wget https://github.com/BuysDB/SingleCellMultiOmics/blob/master/data/mini_nla_test.bam.bai?raw=true -O mini_nla_test.bam.bai
        >>> import pysam
        >>> from singlecellmultiomics.molecule import NlaIIIMolecule, MoleculeIterator
        >>> from singlecellmultiomics.fragment import NLAIIIFragment
        >>> import pysamiterators
        >>> alignments = pysam.AlignmentFile('mini_nla_test.bam')
        >>> for molecule in MoleculeIterator(
        >>>             alignments=alignments,
        >>>             moleculeClass=singlecellmultiomics.molecule.NlaIIIMolecule,
        >>>             fragmentClass=singlecellmultiomics.fragment.NLAIIIFragment,
        >>>         ):
        >>>     break
        >>> molecule
        NlaIIIMolecule
        with 1 assinged fragments
        Allele :No allele assigned
            Fragment:
            sample:APKS1P25-NLAP2L2_57
            umi:CCG
            span:chr1 164834728-164834868
            strand:+
            has R1: yes
            has R2: no
            randomer trimmed: no
            DS:164834865
        RS:0
        RZ:CAT
        Restriction site:('chr1', 164834865)


    It is also possible to supply and iterator instead of a SAM/BAM file handle
    Example:

        >>> from singlecellmultiomics.molecule import MoleculeIterator
        >>> from singlecellmultiomics.fragment import Fragment
        >>> import pysam
        >>> # Create SAM file to write some example reads to:
        >>> test_sam = pysam.AlignmentFile('test.sam','w',reference_names=['chr1','chr2'],reference_lengths=[1000,1000])
        >>> read_A = pysam.AlignedSegment(test_sam.header)
        >>> read_A.set_tag('SM','CELL_1')
        >>> read_A.set_tag('RX','CAT')
        >>> read_A.reference_name = 'chr1'
        >>> read_A.reference_start = 100
        >>> read_A.query_sequence = 'ATCGGG'
        >>> read_A.cigarstring = '6M'
        >>> read_A.mapping_quality = 60
        >>> # Create a second read which is a duplicate of the previous
        >>> read_B = pysam.AlignedSegment(test_sam.header)
        >>> read_B.set_tag('SM','CELL_1')
        >>> read_B.set_tag('RX','CAT')
        >>> read_B.reference_name = 'chr1'
        >>> read_B.reference_start = 100
        >>> read_B.query_sequence = 'ATCGG'
        >>> read_B.cigarstring = '5M'
        >>> read_B.mapping_quality = 60
        >>> # Create a thids read which is belonging to another cell
        >>> read_C = pysam.AlignedSegment(test_sam.header)
        >>> read_C.set_tag('SM','CELL_2')
        >>> read_C.set_tag('RX','CAT')
        >>> read_C.reference_name = 'chr1'
        >>> read_C.reference_start = 100
        >>> read_C.query_sequence = 'ATCGG'
        >>> read_C.cigarstring = '5M'
        >>> read_C.mapping_quality = 60
        >>> # Set up an iterable containing the reads:
        >>> reads = [  read_A,read_B,read_C ]
        >>> molecules = []
        >>> for molecule in MoleculeIterator( reads ):
        >>>     print(molecule)
        Molecule
            with 2 assinged fragments
            Allele :No allele assigned
                Fragment:
                sample:CELL_1
                umi:CAT
                span:chr1 100-106
                strand:+
                has R1: yes
                has R2: no
                randomer trimmed: no
            Fragment:
                sample:CELL_1
                umi:CAT
                span:chr1 100-105
                strand:+
                has R1: yes
                has R2: no
                randomer trimmed: no
        Molecule
                with 1 assinged fragments
                Allele :No allele assigned
                    Fragment:
                    sample:CELL_2
                    umi:CAT
                    span:chr1 100-105
                    strand:+
                    has R1: yes
                    has R2: no
                    randomer trimmed: no

    Warning:
        Always make sure the reads being supplied to the MoleculeIterator sorted by genomic coordinate!
    """

    def __init__(self, alignments, moleculeClass=Molecule,
                 fragmentClass=Fragment,
                 check_eject_every=10_000,  # bigger sizes are very speed benificial

                 molecule_class_args={},  # because the relative amount of molecules
                 # which can be ejected will be much higher
                 fragment_class_args={},
                 perform_qflag=True,
                 pooling_method=1,
                 yield_invalid=False,
                 queryNameFlagger=None,
                 every_fragment_as_molecule=False,
                 yield_secondary =  False,
                 yield_supplementary= False,
                 **pysamArgs):
        """Iterate over molecules in pysam.AlignmentFile

        Args:
            alignments (pysam.AlignmentFile) or iterable yielding tuples: Alignments to extract molecules from

            moleculeClass (pysam.FastaFile): Class to use for molecules.

            fragmentClass (pysam.FastaFile): Class to use for fragments.

            check_eject_every (int): Check for yielding every N reads.

            molecule_class_args (dict): arguments to pass to moleculeClass.

            fragment_class_args (dict): arguments to pass to fragmentClass.

            perform_qflag (bool):  Make sure the sample/umi etc tags are copied
                from the read name into bam tags

            pooling_method(int) : 0: no  pooling, 1: only compare molecules with the same sample id and hash

            yield_invalid (bool) : When true all fragments which are invalid will be yielded as a molecule

            queryNameFlagger(class) : class which contains the method digest(self, reads) which accepts pysam.AlignedSegments and adds at least the SM and RX tags

            every_fragment_as_molecule(bool): When set to true all valid fragments are emitted as molecule with one associated fragment, this is a way to disable deduplication.

            yield_secondary (bool):  When true all secondary alignments will be yielded as a molecule

            **kwargs: arguments to pass to the pysam.AlignmentFile.fetch function

        Yields:
            molecule (Molecule): Molecule
        """
        if queryNameFlagger is None:
            queryNameFlagger = QueryNameFlagger()
        self.queryNameFlagger = queryNameFlagger

        self.alignments = alignments
        self.moleculeClass = moleculeClass
        self.fragmentClass = fragmentClass
        self.check_eject_every = check_eject_every
        self.molecule_class_args = molecule_class_args
        self.fragment_class_args = fragment_class_args
        self.perform_qflag = perform_qflag
        self.pysamArgs = pysamArgs
        self.matePairIterator = None
        self.pooling_method = pooling_method
        self.yield_invalid = yield_invalid
        self.every_fragment_as_molecule = every_fragment_as_molecule
        self._clear_cache()

    def _clear_cache(self):
        """Clear cache containing non yielded molecules"""
        self.waiting_fragments = 0
        self.yielded_fragments = 0
        self.yielded_molecules = 0
        self.check_ejection_iter = 0
        if self.pooling_method == 0:
            self.molecules = []
        elif self.pooling_method == 1:
            self.molecules_per_cell = collections.defaultdict(
                list)  # {hash:[], :}
        else:
            raise NotImplementedError()

    def __repr__(self):
        return f"""Molecule Iterator, generates fragments from {self.fragmentClass} into molecules based on {self.moleculeClass}.
        Yielded {self.yielded_fragments} fragments, {self.waiting_fragments} fragments are waiting to be ejected.
        {self.get_molecule_cache_size()} molecules cached.
        Mate pair iterator: {str(self.matePairIterator)}"""

[docs]    def get_molecule_cache_size(self):
        if self.pooling_method == 0:
            return len(self.molecules)
        elif self.pooling_method == 1:
            return sum(len(cell_molecules) for cell,
                       cell_molecules in self.molecules_per_cell.items())

        else:
            raise NotImplementedError()

    def __iter__(self):
        if self.perform_qflag:
            qf = self.queryNameFlagger

        self._clear_cache()
        self.waiting_fragments = 0
        # prepare the source iterator which generates the read pairs:
        if isinstance(self.alignments, pysam.libcalignmentfile.AlignmentFile):
            self.matePairIterator = pysamiterators.iterators.MatePairIterator(
                self.alignments,
                performProperPairCheck=False,
                **self.pysamArgs)
        else:
            # If an iterable is provided use this as read source:
            self.matePairIterator = self.alignments

        for reads in self.matePairIterator:

            if isinstance(reads, pysam.AlignedSegment):
                R1 = reads
                R2 = None
            elif len(reads) == 2:
                R1, R2 = reads
            elif (isinstance(reads, list) or isinstance(reads, tuple)) and len(reads) == 1:
                R1 = reads[0]
                R2 = None
            else:
                raise ValueError(
                    'Iterable not understood, supply either  pysam.AlignedSegment or lists of  pysam.AlignedSegment')
            # Make sure the sample/umi etc tags are placed:
            if self.perform_qflag:
                qf.digest([R1, R2])

            fragment = self.fragmentClass([R1, R2], **self.fragment_class_args)

            if not fragment.is_valid():
                if self.yield_invalid:
                    m = self.moleculeClass(
                        fragment, **self.molecule_class_args)
                    m.__finalise__()
                    yield m
                continue

            if self.every_fragment_as_molecule:
                m = self.moleculeClass(fragment, **self.molecule_class_args)
                m.__finalise__()
                yield m
                continue

            added = False
            if self.pooling_method == 0:
                for molecule in self.molecules:
                    if molecule.add_fragment(fragment, use_hash=False):
                        added = True
                        break
            elif self.pooling_method == 1:
                for molecule in self.molecules_per_cell[fragment.match_hash]:
                    if molecule.add_fragment(fragment, use_hash=True):
                        added = True
                        break

            if not added:
                if self.pooling_method == 0:
                    self.molecules.append(self.moleculeClass(
                        fragment, **self.molecule_class_args))
                else:
                    self.molecules_per_cell[fragment.match_hash].append(
                        self.moleculeClass(fragment, **self.molecule_class_args)
                    )

            self.waiting_fragments += 1
            self.check_ejection_iter += 1
            if self.check_ejection_iter > self.check_eject_every:
                current_chrom, _, current_position = fragment.get_span()
                if current_chrom is None:
                    continue

                self.check_ejection_iter = 0
                if self.pooling_method == 0:
                    to_pop = []
                    for i, m in enumerate(self.molecules):
                        if m.can_be_yielded(current_chrom, current_position):
                            to_pop.append(i)
                            self.waiting_fragments -= len(m)
                            self.yielded_fragments += len(m)

                    for i, j in enumerate(to_pop):
                        m = self.molecules.pop(i - j)
                        m.__finalise__()
                        yield m
                else:
                    for hash_group, molecules in self.molecules_per_cell.items():

                        to_pop = []
                        for i, m in enumerate(molecules):
                            if m.can_be_yielded(
                                    current_chrom, current_position):
                                to_pop.append(i)
                                self.waiting_fragments -= len(m)
                                self.yielded_fragments += len(m)

                        for i, j in enumerate(to_pop):
                            m = self.molecules_per_cell[hash_group].pop(i - j)
                            m.__finalise__()
                            yield m

        # Yield remains
        if self.pooling_method == 0:
            for m in self.molecules:
                m.__finalise__()
            yield from iter(self.molecules)
        else:

            for hash_group, molecules in self.molecules_per_cell.items():
                for i, m in enumerate(molecules):
                    m.__finalise__()
                    yield m
        self._clear_cache()