from singlecellmultiomics.fragment import Fragment
from singlecellmultiomics.utils.sequtils import hamming_distance
[docs]class CHICFragment(Fragment):
def __init__(self,
reads,
R1_primer_length=4,
R2_primer_length=6, # Is automatically detected now
assignment_radius=0,
umi_hamming_distance=1,
invert_strand=False,
no_umi_cigar_processing=False,
**kwargs
):
self.invert_strand = invert_strand
# Perform random primer autodect,
# When the demux profile is MX:Z:scCHIC384C8U3l, then there is no random primer
for read in reads:
if read is not None and read.has_tag('MX'):
if read.get_tag('MX')[-1]=='l':
R2_primer_length = 0
break
Fragment.__init__(self,
reads,
assignment_radius=assignment_radius,
R1_primer_length=R1_primer_length,
R2_primer_length=R2_primer_length,
umi_hamming_distance=umi_hamming_distance,
max_NUC_stretch = 18,
**kwargs
)
# set CHIC cut site given reads
self.no_umi_cigar_processing = no_umi_cigar_processing
self.strand = None
self.ligation_motif = None
self.site_location = None
self.cut_site_strand = None
self.found_valid_site = False
self.identify_site()
if self.is_valid():
if self.assignment_radius==0:
self.match_hash = (
self.strand,
self.cut_site_strand,
self.site_location[0],
self.site_location[1],
self.sample)
else:
self.match_hash = (
self.cut_site_strand,
self.site_location[0],
self.sample)
else:
self.match_hash = None
[docs] def set_site(
self,
site_chrom,
site_pos,
site_strand=None,
is_trimmed=False):
if self.found_valid_site:
self.set_meta('DS', site_pos)
if site_strand is not None:
self.set_meta('RS', site_strand)
self.set_strand(site_strand)
self.site_location = (site_chrom, site_pos)
self.cut_site_strand = site_strand
[docs] def identify_site(self):
self.found_valid_site = False
R1 = self.get_R1()
if R1 is None:
self.set_rejection_reason("R1_undefined")
return None
if R1.has_tag('lh'):
self.ligation_motif = R1.get_tag('lh')
""" Valid configs:
Observed read pair:
T######## R1 ########## ^ ########## R2 ##########
Real molecule:
A###########################################################
Real molecule: and adapter:
ADAPTER: 5' NNNNT 3'
A{A:80%,N:20%}NNN [CHIC MOLECULE]
^ real cut site
"""
is_trimmed = (R1.has_tag('MX') and R1.get_tag(
'MX').startswith('scCHIC'))
if R1.is_unmapped:
self.set_rejection_reason("R1_unmapped")
return(None)
### Check if the orientation of the reads makes sense, reads need to point inwards
# | ----- > < ---- |
if self.has_R2():
R2 = self.get_R2()
if not R2.is_unmapped:
if R1.is_reverse and not R2.is_reverse:
pass
elif not R1.is_reverse and R2.is_reverse:
pass
else:
# it does not make sense ...
self.set_rejection_reason("orientation")
return(None)
# Identify the start coordinate of Read 1 by reading the amount of softclips on the start of the read
r1_start =(R1.reference_end if R1.is_reverse else R1.reference_start)
if not self.no_umi_cigar_processing:
if R1.is_reverse:
if R1.cigartuples[-1][0]==4: # softclipped at end
r1_start+=R1.cigartuples[-1][1]
else:
if R1.cigartuples[0][0]==4: # softclipped at start
r1_start-=R1.cigartuples[0][1]
if is_trimmed:
# The first base of the read has been taken off and the lh tag is
# already set, this can be copied to RZ
self.found_valid_site = True
self.set_site(
site_strand=not R1.is_reverse if self.invert_strand else R1.is_reverse,
# We sequence the other strand (Starting with a T, this is an A in the molecule), the digestion thus happened on the other strand
# On the next line we asume that the mnsase cut is one base after the ligated A, but it can be more bases upstream
site_chrom=R1.reference_name,
site_pos=r1_start,
is_trimmed=True
)
else:
self.found_valid_site = True
self.set_site(
site_strand=not R1.is_reverse if self.invert_strand else R1.is_reverse,
# We sequence the other strand (Starting with a T, this is an A in the molecule), the digestion thus happened on the other strand
# On the next line we asume that the mnsase cut is one base after the ligated A, but it can be more bases upstream
site_chrom=R1.reference_name,
site_pos=(r1_start - 1 if R1.is_reverse else r1_start + 1),
is_trimmed=False)
[docs] def is_valid(self):
if self.qcfail:
return False
if self.max_fragment_size is not None:
try:
size = self.get_fragment_size()
if size>self.max_fragment_size:
return False
except Exception:
pass
return self.found_valid_site
[docs] def get_site_location(self):
if self.site_location is not None:
return self.site_location
else:
# We need some kind of coordinate...
for read in self:
if read is not None and read.reference_name is not None and read.reference_start is not None:
return read.reference_name, read.reference_start
def __repr__(self):
return Fragment.__repr__(
self) + f'\n\tMNase cut site:{self.get_site_location()}'
def __eq__(self, other):
# Make sure fragments map to the same strand, cheap comparisons
if self.match_hash != other.match_hash:
return False
if self.site_location is None or other.site_location is None:
return False
# Check distance between the fragments:
if self.assignment_radius>0 and abs(self.site_location[1]-other.site_location[1])>self.assignment_radius:
return False
# Make sure UMI's are similar enough, more expensive hamming distance
# calculation
return self.umi_eq(other)