#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import pysam
import collections
import pandas as pd
import numpy as np
import os
[docs]def obtain_approximate_reference_cut_position(site, contig, alt_spans):
#contig, cut_start, strand = molecule.get_cut_site()
alt_contig, alt_start, alt_end = alt_spans[contig]
return contig, site + alt_start
if __name__ == '__main__':
argparser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description='Create methylation and copy number tables from bam file, ALT contig aware')
argparser.add_argument('alignmentfile', type=str)
argparser.add_argument('-head', type=int, default=None)
argparser.add_argument('-bin_size', type=int, default=250_000)
argparser.add_argument(
'-min_mq',
type=int,
default=30,
help='Minimum mapping quality')
argparser.add_argument('--met', action='store_true')
argparser.add_argument('--cnv', action='store_true')
argparser.add_argument('-ref', type=str, required=True)
args = argparser.parse_args()
#chromosomes = [f'chr{x}' for x in range(1,23)] + ['chrX']
output_alias = args.alignmentfile + f'.table.bs_{args.bin_size}'
# Load alternative contigs if available:
alt_path = f'{args.ref}.64.alt'
alt_spans = None
if os.path.exists(alt_path):
print(f'Loading ALT data from {alt_path}')
with pysam.AlignmentFile(alt_path) as alts:
alt_spans = {}
for alt in alts:
alt_spans[alt.query_name] = (
alt.reference_name, alt.reference_start, alt.reference_end)
# cell -> (allele, chromosome, bin) -> umi_count
fragment_abundance = collections.defaultdict(collections.Counter)
# cell -> (context, chromosome, bin) -> methylation
methylation_pos = collections.defaultdict(collections.Counter)
# cell -> (context, chromosome, bin) -> methylation
methylation_neg = collections.defaultdict(collections.Counter)
# cell -> (allele, chromosome, bin) -> raw_count
fragment_read_abundance = collections.defaultdict(collections.Counter)
with pysam.AlignmentFile(args.alignmentfile, threads=4) as alignments:
min_mq = 30
wrote = 0
# for chromosome in chromosomes:
# for i,read in enumerate(alignments.fetch(chromosome)):
for read in alignments:
if read.is_duplicate:
continue
if read.has_tag('mp') and read.get_tag('mp') != 'unique':
continue
if read.mapping_quality < args.min_mq or not read.has_tag('DS'):
continue
if args.head is not None and wrote >= (args.head - 1):
break
if read.has_tag('RZ') and read.get_tag('RZ') != 'CATG':
continue
contig = read.reference_name
site = int(read.get_tag('DS'))
if alt_spans is not None and contig in alt_spans:
contig, site = obtain_approximate_reference_cut_position(
site, contig, alt_spans)
wrote += 1
bin_i = int(site / args.bin_size)
allele = 'unk'
if read.has_tag('DA'):
allele = read.get_tag('DA')
if args.met:
for context in 'xhz':
if read.has_tag(f's{context}'):
methylation_neg[read.get_tag('SM')][(
context, allele, contig, bin_i)] += int(read.get_tag(f's{context}'))
if read.has_tag(f's{context.upper()}'):
methylation_pos[read.get_tag('SM')][(
context, allele, contig, bin_i)] += int(read.get_tag(f's{context.upper()}'))
fragment_abundance[read.get_tag(
'SM')][(allele, contig, bin_i)] += 1
fragment_read_abundance[read.get_tag('SM')][(
allele, contig, bin_i)] += read.get_tag('af')
if args.cnv:
print('writing count table')
pd.DataFrame(fragment_abundance).to_pickle(
f'{output_alias}.CNV_umis.pickle.gz')
pd.DataFrame(fragment_read_abundance).to_pickle(
f'{output_alias}.CNV_reads.pickle.gz')
if args.met:
print('writing count table for methylation status')
methylation_pos = pd.DataFrame(
methylation_pos).T.fillna(0).sort_index()
methylation_neg = pd.DataFrame(
methylation_neg).T.fillna(0).sort_index()
for context in 'xhz':
methylation_pos[context].T.to_pickle(
f'{output_alias}.{context.upper()}.pickle.gz')
methylation_neg[context].T.to_pickle(
f'{output_alias}.{context}.pickle.gz')
(methylation_pos[context] / (methylation_neg[context] + methylation_pos[context])
).T.to_pickle(f'{output_alias}.{context}.beta.pickle.gz')