from singlecellmultiomics.molecule.molecule import Molecule
import collections
import pandas as pd
[docs]class FeatureAnnotatedMolecule(Molecule):
"""Molecule which is annotated with features (genes/exons/introns, .. )
"""
def __init__(
self,
fragment,
features,
stranded=None,
auto_set_intron_exon_features=False,
**kwargs):
"""
Args:
fragments (singlecellmultiomics.fragment.Fragment): Fragments to associate to the molecule
features (singlecellmultiomics.features.FeatureContainer) : container to use to obtain features from
stranded : None; not stranded, False: same strand as R1, True: other strand
auto_set_intron_exon_features(bool) : obtain intron_exon_features upon initialising
**kwargs: extra args
"""
Molecule.__init__(self, fragment, **kwargs)
self.features = features
self.hits = collections.defaultdict(set) # feature -> hit_bases
self.stranded = stranded
self.is_annotated = False
self.junctions = set()
self.genes = set()
self.introns = set()
self.exons = set()
self.exon_hit_gene_names = set() # readable names
self.is_spliced = None
if auto_set_intron_exon_features:
self.set_intron_exon_features()
[docs] def set_spliced(self, is_spliced):
""" Set wether the transcript is spliced, False has priority over True """
if self.is_spliced and not is_spliced:
# has already been set
self.is_spliced = False
else:
self.is_spliced = is_spliced
[docs] def set_intron_exon_features(self):
if not self.is_annotated:
self.annotate()
# Collect all hits:
hits = self.hits.keys()
# (gene, transcript) -> set( exon_id .. )
exon_hits = collections.defaultdict(set)
intron_hits = collections.defaultdict(set)
for hit, locations in self.hits.items():
if not isinstance(hit, tuple):
continue
meta = dict(list(hit))
if 'gene_id' not in meta:
continue
if meta.get('type') == 'exon':
if 'transcript_id' not in meta:
continue
self.genes.add(meta['gene_id'])
self.exons.add(meta['exon_id'])
if 'transcript_id' not in meta:
raise ValueError(
"Please use an Intron GTF file generated using -id 'transcript_id'")
exon_hits[(meta['gene_id'], meta['transcript_id'])].add(
meta['exon_id'])
if 'gene_name' in meta:
self.exon_hit_gene_names.add(meta['gene_name'])
elif meta.get('type') == 'intron':
self.genes.add(meta['gene_id'])
self.introns.add(meta['gene_id'])
# Find junctions and add all annotations to annotation sets
debug = []
for (gene, transcript), exons_overlapping in exon_hits.items():
# If two exons are detected from the same gene we detected a
# junction:
if len(exons_overlapping) > 1:
self.junctions.add(gene)
# We found two exons and an intron:
if gene in self.introns:
self.set_spliced(False)
else:
self.set_spliced(True)
debug.append(
f'{gene}_{transcript}:' +
','.join(
list(exons_overlapping)))
# Write exon dictionary:
self.set_meta('DB', ';'.join(debug))
[docs] def get_hit_df(self):
"""Obtain dataframe with hits
Returns:
pd.DataFrame
"""
if not self.is_annotated:
self.set_intron_exon_features()
d = {}
tabulated_hits = []
for hit, locations in self.hits.items():
if not isinstance(hit, tuple):
continue
meta = dict(list(hit))
for location in locations:
location_dict = {
'chromosome': location[0],
'start': location[1][0],
'end': location[1][1]}
location_dict.update(meta)
tabulated_hits.append(location_dict)
return pd.DataFrame(tabulated_hits)
[docs] def annotate(self, method=0):
"""
Args:
method (int) : 0, obtain blocks and then obtain features. 1, try to obtain features for every aligned base
"""
# When self.stranded is None, set to None strand. If self.stranded is
# True reverse the strand, otherwise copy the strand
strand = None if self.stranded is None else '+-'[
(not self.strand if self.stranded else self.strand)]
self.is_annotated = True
if method == 0:
# Obtain all blocks:
try:
for start, end in self.get_aligned_blocks():
for hit in self.features.findFeaturesBetween(
chromosome=self.chromosome, sampleStart=start, sampleEnd=end, strand=strand):
hit_start, hit_end, hit_id, hit_strand, hit_ids = hit
self.hits[hit_ids].add(
(self.chromosome, (hit_start, hit_end)))
except TypeError:
# This happens when no reads map
pass
else:
for read in self.iter_reads():
for q_pos, ref_pos in read.get_aligned_pairs(
matches_only=True, with_seq=False):
for hit in self.features.findFeaturesAt(
chromosome=read.reference_name, lookupCoordinate=ref_pos, strand=strand):
hit_start, hit_end, hit_id, hit_strand, hit_ids = hit
self.hits[hit_ids].add((read.reference_name, ref_pos))