Source code for singlecellmultiomics.molecule.featureannotatedmolecule

from singlecellmultiomics.molecule.molecule import Molecule
import collections
import pandas as pd


[docs]class FeatureAnnotatedMolecule(Molecule):
    """Molecule which is annotated with features (genes/exons/introns, .. )
    """

    def __init__(
            self,
            fragment,
            features,
            stranded=None,
            auto_set_intron_exon_features=False,
            **kwargs):
        """
            Args:
                fragments (singlecellmultiomics.fragment.Fragment): Fragments to associate to the molecule
                features (singlecellmultiomics.features.FeatureContainer) : container to use to obtain features from
                stranded : None; not stranded, False: same strand as R1, True: other strand
                auto_set_intron_exon_features(bool) : obtain intron_exon_features upon initialising
                **kwargs: extra args

        """
        Molecule.__init__(self, fragment, **kwargs)
        self.features = features
        self.hits = collections.defaultdict(set)  # feature -> hit_bases
        self.stranded = stranded
        self.is_annotated = False

        self.junctions = set()
        self.genes = set()
        self.introns = set()
        self.exons = set()
        self.exon_hit_gene_names = set()  # readable names
        self.is_spliced = None

        if auto_set_intron_exon_features:
            self.set_intron_exon_features()

[docs]    def set_spliced(self, is_spliced):
        """ Set wether the transcript is spliced, False has priority over True """
        if self.is_spliced and not is_spliced:
            # has already been set
            self.is_spliced = False
        else:
            self.is_spliced = is_spliced

[docs]    def set_intron_exon_features(self):
        if not self.is_annotated:
            self.annotate()

        # Collect all hits:
        hits = self.hits.keys()

        # (gene, transcript) -> set( exon_id  .. )
        exon_hits = collections.defaultdict(set)
        intron_hits = collections.defaultdict(set)

        for hit, locations in self.hits.items():
            if not isinstance(hit, tuple):
                continue
            meta = dict(list(hit))
            if 'gene_id' not in meta:
                continue
            if meta.get('type') == 'exon':
                if 'transcript_id' not in meta:
                    continue
                self.genes.add(meta['gene_id'])
                self.exons.add(meta['exon_id'])
                if 'transcript_id' not in meta:
                    raise ValueError(
                        "Please use an Intron GTF file generated using -id 'transcript_id'")
                exon_hits[(meta['gene_id'], meta['transcript_id'])].add(
                    meta['exon_id'])
                if 'gene_name' in meta:
                    self.exon_hit_gene_names.add(meta['gene_name'])
            elif meta.get('type') == 'intron':
                self.genes.add(meta['gene_id'])
                self.introns.add(meta['gene_id'])

        # Find junctions and add all annotations to annotation sets
        debug = []

        for (gene, transcript), exons_overlapping in exon_hits.items():
            # If two exons are detected from the same gene we detected a
            # junction:
            if len(exons_overlapping) > 1:
                self.junctions.add(gene)

                # We found two exons and an intron:
                if gene in self.introns:
                    self.set_spliced(False)
                else:
                    self.set_spliced(True)

            debug.append(
                f'{gene}_{transcript}:' +
                ','.join(
                    list(exons_overlapping)))

        # Write exon dictionary:
        self.set_meta('DB', ';'.join(debug))

[docs]    def get_hit_df(self):
        """Obtain dataframe with hits
        Returns:
            pd.DataFrame
        """
        if not self.is_annotated:
            self.set_intron_exon_features()

        d = {}
        tabulated_hits = []
        for hit, locations in self.hits.items():
            if not isinstance(hit, tuple):
                continue
            meta = dict(list(hit))
            for location in locations:
                location_dict = {
                    'chromosome': location[0],
                    'start': location[1][0],
                    'end': location[1][1]}
                location_dict.update(meta)
                tabulated_hits.append(location_dict)

        return pd.DataFrame(tabulated_hits)

[docs]    def write_tags(self):
        Molecule.write_tags(self)

        if len(self.exons) > 0:
            self.set_meta('EX', ','.join(sorted([str(x) for x in self.exons])))
        if len(self.introns) > 0:
            self.set_meta('IN', ','.join(
                sorted([str(x) for x in self.introns])))
        if len(self.genes) > 0:
            self.set_meta('GN', ','.join(sorted([str(x) for x in self.genes])))
        if len(self.junctions) > 0:
            self.set_meta('JN', ','.join(
                sorted([str(x) for x in self.junctions])))
            # Is transcriptome
            self.set_meta('IT', 1)
        elif len(self.genes) > 0:
            # Maps to gene but not junction
            self.set_meta('IT', 0.5)
        else:
            # Doesn't map to gene
            self.set_meta('IT', 0)
        if self.is_spliced is True:
            self.set_meta('SP', True)
        elif self.is_spliced is False:
            self.set_meta('SP', False)
        if len(self.exon_hit_gene_names):
            self.set_meta('gn', ';'.join(list(self.exon_hit_gene_names)))

[docs]    def annotate(self, method=0):
        """
            Args:
                method (int) : 0, obtain blocks and then obtain features. 1, try to obtain features for every aligned base

        """
        # When self.stranded is None, set to None strand. If self.stranded is
        # True reverse the strand, otherwise copy the strand
        strand = None if self.stranded is None else '+-'[
            (not self.strand if self.stranded else self.strand)]
        self.is_annotated = True
        if method == 0:

            # Obtain all blocks:
            try:
                for start, end in self.get_aligned_blocks():
                    for hit in self.features.findFeaturesBetween(
                            chromosome=self.chromosome, sampleStart=start, sampleEnd=end, strand=strand):
                        hit_start, hit_end, hit_id, hit_strand, hit_ids = hit
                        self.hits[hit_ids].add(
                            (self.chromosome, (hit_start, hit_end)))

            except TypeError:
                # This happens when no reads map
                pass
        else:

            for read in self.iter_reads():
                for q_pos, ref_pos in read.get_aligned_pairs(
                        matches_only=True, with_seq=False):
                    for hit in self.features.findFeaturesAt(
                            chromosome=read.reference_name, lookupCoordinate=ref_pos, strand=strand):
                        hit_start, hit_end, hit_id, hit_strand, hit_ids = hit
                        self.hits[hit_ids].add((read.reference_name, ref_pos))