import collections
from singlecellmultiomics.universalBamTagger.digest import DigestFlagger
import pysamiterators.iterators
import itertools
complement = str.maketrans('ATGC', 'TACG')
[docs]class TAPSFlagger(DigestFlagger):
def __init__(self, reference, **kwargs):
DigestFlagger.__init__(self, **kwargs)
self.overlap_tag = 'XM'
self.reference = reference
if self.reference is None:
raise ValueError('A reference fasta file is required!')
"""
z unmethylated C in CpG context (CG)
Z methylated C in CpG context (CG)
x unmethylated C in CHG context ( C[ACT]G )
X methylated C in CHG context ( C[ACT]G )
h unmethylated C in CHH context ( C[ACT][ACT] )
H methylated C in CHH context ( C[ACT][ACT] )
"""
self.context_mapping = {}
self.context_mapping[False] = {
'CGA': 'z',
'CGT': 'z',
'CGC': 'z',
'CGG': 'z',
# CHG:
'CAG': 'x',
'CCG': 'x',
'CTG': 'x',
}
self.context_mapping[True] = {
context: letter.upper() for context,
letter in self.context_mapping[False].items()}
# CHH:
# h unmethylated C in CHH context ( C[ACT][ACT] )
for x in list(itertools.product('ACT', repeat=2)):
self.context_mapping[True][''.join(['C'] + list(x))] = 'H'
self.context_mapping[False][''.join(['C'] + list(x))] = 'h'
# print(self.context_mapping)
[docs] def digest(self, reads):
#fragment_contig, fragment_start, fragment_end = pysamiterators.iterators.getListSpanningCoordinates([r for r in reads if r is not None])
# if fragment_contig is None or fragment_end is None or fragment_start is None:
# return
#fragment_size = fragment_end-fragment_start
modified_contexts = []
unmodified_contexts = []
modified_5b_contexts = []
unmodified_5b_contexts = []
fragment_methylated_count = 0
for read in reads:
if read is None:
continue
if read.is_unmapped:
continue
if not read.has_tag('RS'):
continue
strand = read.get_tag('RS')
methylationBlocks = []
readCoversionString = []
genomeConversionString = []
for qpos, rpos, ref_base in read.get_aligned_pairs(with_seq=True):
methylationStateString = '.'
if qpos is None:
continue
if ref_base is None:
continue
if rpos is None:
continue
ref_base = ref_base.upper()
qbase = read.seq[qpos].upper()
methylated = False
if ref_base == 'C' and strand == '+':
context = self.reference.fetch(
read.reference_name, rpos, rpos + 3).upper()
three_context = self.reference.fetch(
read.reference_name, rpos - 1, rpos + 2).upper()
penta_context = self.reference.fetch(
read.reference_name, rpos - 2, rpos + 3).upper()
#print(ref_base, qbase, context)
if qbase == 'T':
methylated = True
methylationStateString = self.context_mapping[methylated].get(context, 'uU'[
methylated])
if methylated:
modified_contexts.append(three_context)
modified_5b_contexts.append(penta_context)
readCoversionString.append('CT')
genomeConversionString.append('CT')
else:
unmodified_contexts.append(three_context)
unmodified_5b_contexts.append(penta_context)
elif ref_base == 'G' and strand == '-':
origin = self.reference.fetch(
read.reference_name, rpos - 2, rpos + 1).upper()
context = origin.translate(complement)[::-1]
three_context = self.reference.fetch(
read.reference_name,
rpos -
1,
rpos +
2).translate(complement)[
::-
1].upper()
penta_context = self.reference.fetch(
read.reference_name,
rpos -
2,
rpos +
3).translate(complement)[
::-
1].upper()
if qbase == 'A':
methylated = True
methylationStateString = self.context_mapping[methylated].get(context, 'uU'[
methylated])
if methylated:
modified_contexts.append(three_context)
modified_5b_contexts.append(penta_context)
readCoversionString.append('CT')
genomeConversionString.append('GA')
else:
unmodified_contexts.append(three_context)
unmodified_5b_contexts.append(penta_context)
if methylated:
fragment_methylated_count += 1
methylationBlocks.append(methylationStateString)
if len(methylationBlocks) > 0:
read.set_tag('XM', ''.join(methylationBlocks))
#read.set_tag('XR',max(set(readCoversionString), key = readConversionString.count))
#read.set_tag('XG',max(set(genomeConversionString), key = genomeConversionString.count))
if len(modified_contexts):
read.set_tag('Cm', ','.join(list(sorted(modified_contexts))))
read.set_tag(
'Qm', ','.join(
list(
sorted(modified_5b_contexts))))
if len(unmodified_contexts):
read.set_tag('Cu', ','.join(list(sorted(unmodified_contexts))))
read.set_tag(
'Qu', ','.join(
list(
sorted(unmodified_5b_contexts))))
read.set_tag('MC', fragment_methylated_count)