Source code for singlecellmultiomics.bamProcessing.bamExtractRandomPrimerStats

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import singlecellmultiomics
from singlecellmultiomics.molecule import MoleculeIterator, NlaIIIMolecule
from singlecellmultiomics.fragment import NlaIIIFragment
import pysam
import collections
import pysamiterators
import pandas as pd
import numpy as np
import re
import more_itertools


[docs]def get_random_primer_histogram( molecule_source, min_mq, max_size, size_bin_size, head=None): """ get_random_primer_histogram This method counts the frequencies of the random primers which present in the supplied molecule_source. Args: molecule_source : iterable yielding Molecules min_mq (int) : minimum mean mapping quality of a fragment to be taken into account max_size(int) : maximum fragment size size_bin_size (int) : bin size in basepairs of the histogram head (int) : amount of random sequences to process. When not supplied all random primer sequences in the iterable are counted (expect for the ones with a mapping quality which is too low). """ used = 0 random_primer_obs = [collections.Counter() for fs in range( int(max_size / size_bin_size) + 1)] # RS for i, molecule in enumerate(molecule_source): if molecule.get_mean_mapping_qual() < args.min_mq: continue for frag in molecule: if not frag.has_R2(): continue fs = abs(frag.span[2] - frag.span[1]) size = int(min(max_size - 1, fs) / size_bin_size) rp = frag.random_primer_sequence if 'N' in rp: continue random_primer_obs[size][rp] += 1 used += 1 if args.head is not None and used > (args.head - 1): break qf = pd.DataFrame(random_primer_obs).fillna(0).T qf[-1] = qf.sum(1) qf = qf.T.sort_values(-1, 1).drop(-1) qf.index = [bin_index * size_bin_size for bin_index in qf.index] return qf
if __name__ == "__main__": import argparse f'Please use an environment with python 3.6 or higher!' argparser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=""" Known variant locations extraction tool """) argparser.add_argument('bamfile') argparser.add_argument( '-max_size', type=int, default=800, help='Maximum fragment size to clip to') argparser.add_argument( '-size_bin_size', help="Bin size in basepairs", type=int, default=5) argparser.add_argument( '-head', type=int, help='Amount of random sequences to count, when not specified all random primers are counted') argparser.add_argument('-min_mq', type=int, default=50) argparser.add_argument( '-o', type=str, default='./randomer_usage.pickle.gz', help='Output pickle/csv path') args = argparser.parse_args() with pysam.AlignmentFile(args.bamfile) as alignments: molecule_source = MoleculeIterator( alignments, molecule_class=NlaIIIMolecule, fragment_class=NlaIIIFragment, ) qf = get_random_primer_histogram( molecule_source, args.min_mq, args.max_size, args.size_bin_size, head=args.head) print('Writing dataframe to disk') if args.o.endswith('csv') or args.o.endswith('csv.gz'): qf.to_csv(args.o) else: qf.to_pickle(args.o) print('All done')