Source code for singlecellmultiomics.modularDemultiplexer.demultiplexModules.restrictionbisulfite

import sys
# sys.path.append("/hpc/hub_oudenaarden/group_scripts/")
# sys.path.append("/hpc/hub_oudenaarden/group_scripts/modularDemultiplexer/")

from singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods import UmiBarcodeDemuxMethod, IlluminaBaseDemultiplexer

from singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods import IlluminaBaseDemultiplexer
from singlecellmultiomics.modularDemultiplexer.baseDemultiplexMethods import NonMultiplexable

# TO do, currently not implemented:
# enzyme ID correction based on Hamming Distance
# same for ISPCR sequence


[docs]class Base_RestrictionBisulfiteDemuxMethod(UmiBarcodeDemuxMethod): def __init__(self, umiRead=0, umiStart=0, umiLength=8, # default settings UMI barcodeRead=0, barcodeStart=8, barcodeLength=8, # default settings Barcode enzymeRead=0, enzymeStart=16, enzymeLength=3, # default settings Enzyme ID ispcrRead=0, ispcrStart=19, ispcrLength=15, # default settings ISPCR ispcrSeq="CAGTGGTATCAGAGT", barcodeFileParser=None, # compatible, no need to change barcodeFileAlias=None, # passed from lower-level Classes, e.g. "reBS_nla384w" indexFileParser=None, # compatible, no need to change **kwargs): # additional arguments self.description = 'base class for restriction bisulfite' self.barcodeFileAlias = barcodeFileAlias # description , e.g. "maya_384NLA" self.barcodeFileParser = barcodeFileParser # Namespace for barcode file parse UmiBarcodeDemuxMethod.__init__( self, umiRead=umiRead, umiStart=umiStart, umiLength=umiLength, barcodeRead=barcodeRead, barcodeStart=barcodeStart, barcodeLength=barcodeLength, barcodeFileAlias=self.barcodeFileAlias, barcodeFileParser=barcodeFileParser, **kwargs) self.barcodeSummary = self.barcodeFileAlias self.umiRead = umiRead # 0:Read 1, 1: Read 2 etc self.umiStart = umiStart # First base self.umiLength = umiLength self.shortName = 'RB' self.longName = 'base class for restriction bisulfite' self.illumina_mux = IlluminaBaseDemultiplexer( indexFileParser=indexFileParser, indexFileAlias='illumina_merged_ThruPlex48S_RP') self.barcodeRead = barcodeRead self.barcodeStart = barcodeStart self.barcodeLength = barcodeLength self.enzymeRead = enzymeRead self.enzymeStart = enzymeStart self.enzymeLength = enzymeLength self.ispcrRead = ispcrRead self.ispcrStart = ispcrStart self.ispcrLength = ispcrLength self.autoDetectable = False self.sequenceCapture = [slice(None), slice(None)] # ranges # TAKE OUT IF STATEMENT if umiLength == 0: # if there is a barcode only if barcodeStart != 0: raise NotImplementedError( 'Complicated slice where we need to capture around a region') self.sequenceCapture[barcodeRead] = slice(barcodeLength, None) else: if umiRead != barcodeRead: raise NotImplementedError() if not(umiStart == 0 or barcodeStart == 0): raise NotImplementedError( 'Complicated slice where we need to capture around a region') self.sequenceCapture[barcodeRead] = slice( barcodeLength + umiLength + enzymeLength + ispcrLength, None) def __repr__(self): return f'{self.longName} {self.description}'
[docs] def demultiplex(self, records, **kwargs): # Check if the supplied reads are mate-pair: if len(records) != 2: raise NonMultiplexable('Not mate pair') # Perform first pass demultiplexing of the illumina fragments: try: taggedRecords = self.illumina_mux.demultiplex( records, inherited=True, **kwargs) except NonMultiplexable: raise rawBarcode = records[self.barcodeRead].sequence[self.barcodeStart: self.barcodeStart + self.barcodeLength] barcodeQual = records[self.barcodeRead].qual[self.barcodeStart: self.barcodeStart + self.barcodeLength] barcodeIdentifier, barcode, hammingDistance = self.barcodeFileParser.getIndexCorrectedBarcodeAndHammingDistance( alias=self.barcodeFileAlias, barcode=rawBarcode) #print(barcodeIdentifier, barcode, hammingDistance) if barcodeIdentifier is None: raise NonMultiplexable('Illumina barcode not set') if self.umiLength != 0: umi = records[self.umiRead].sequence[self.umiStart:self.umiStart + self.umiLength] umiQual = records[self.umiRead].qual[self.umiStart:self.umiStart + self.umiLength] if self.enzymeLength != 0: enz = records[self.enzymeRead].sequence[self.enzymeStart: self.enzymeStart + self.enzymeLength] enzQual = records[self.enzymeRead].qual[self.enzymeStart: self.enzymeStart + self.enzymeLength] if self.ispcrLength != 0: ispcr = records[self.ispcrRead].sequence[self.ispcrStart: self.ispcrStart + self.ispcrLength] ispcrQual = records[self.ispcrRead].qual[self.ispcrStart: self.ispcrStart + self.ispcrLength] for tr in taggedRecords: #tr.addTagByTag('uL', self.umiLength, isPhred=False) if self.umiLength == 0: #tr.addTagByTag('MI', barcode, isPhred=False) #tr.addTagByTag('QM', barcodeQual, isPhred=True) pass else: tr.addTagByTag('RX', umi, isPhred=False, make_safe=False) tr.addTagByTag('RQ', umiQual, isPhred=True) #tr.addTagByTag('MI', barcode+umi, isPhred=False) #tr.addTagByTag('QM', barcodeQual+umiQual, isPhred=True) tr.addTagByTag( 'bi', barcodeIdentifier, isPhred=False, make_safe=False) tr.addTagByTag('bc', rawBarcode, isPhred=False, make_safe=False) #tr.addTagByTag('hd', hammingDistance, isPhred=False) tr.addTagByTag('BC', barcode, isPhred=False, make_safe=False) tr.addTagByTag('QT', barcodeQual, isPhred=True) if len(barcode) != len(barcodeQual): raise ValueError() # Enzyme sequence tr.addTagByTag( 'ES', enz, isPhred=False, make_safe=False) # Enzyme sequence # Enzyme quality tr.addTagByTag('eq', enzQual, isPhred=True) # ISPCR sequence tr.addTagByTag('IS', ispcr, isPhred=False, make_safe=False) # ISPCR quality #tr.addTagByTag('is', ispcrQual, isPhred=True) tr.addTagByTag( 'MX', self.shortName, make_safe=False, isPhred=False) for rid, (record, taggedRecord) in enumerate( zip(records, taggedRecords)): taggedRecord.sequence = record.sequence[self.sequenceCapture[rid]] taggedRecord.qualities = record.qual[self.sequenceCapture[rid]] taggedRecord.plus = record.plus return taggedRecords
# Add information and rebuild header #header = f'@UMI:{umi};UMIQ:{umiQual};CBI:{barcodeIdentifier};CB:{barcode};CBQ:{barcodeQual};' # return fastqIterator.FastqRecord(header, records[1].sequence, # records[1].plus, records[1].qual ) # NLAIII, 384 well format with 3bp UMI
[docs]class Nla_384w_u8_c8_ad3_is15(Base_RestrictionBisulfiteDemuxMethod): def __init__(self, barcodeFileParser, **kwargs): self.barcodeFileAlias = 'nla_bisulfite' Base_RestrictionBisulfiteDemuxMethod.__init__( self, barcodeFileAlias=self.barcodeFileAlias, barcodeFileParser=barcodeFileParser, **kwargs) self.shortName = 'RBSN' #'ReBsNla384C8U8E3I15' self.longName = 'Restriction-BS, NlaIII; 384w; UMI: 8 bp, CB: 8bp, Enz. ID: 3bp, ISPCR: 15 bp' self.autoDetectable = True self.description = 'Bisulfite-compatible barcoded Nla Adapters (384w). R1 contains UMI, BC, Enzyme ID and ISPCR.'