Source code for singlecellmultiomics.bamProcessing.bamFeatureDensityVisualisation

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
import pysam
import collections
import argparse
import gzip
import pickle
import matplotlib

import numpy as np

import pandas as pd
import singlecellmultiomics.features


[docs]def distance_to_hit( lookup_coordinate, hit_start, hit_end, hit_strand, lookup_strand=None, distance_from_begin=True): distance = None if distance_from_begin: if hit_strand == '+': distance = lookup_coordinate - hit_start else: distance = hit_start - lookup_coordinate else: if hit_strand == '+': distance = lookup_coordinate - hit_end else: distance = hit_end - lookup_coordinate return distance
# check if lookup coordinate falls within a feature:
[docs]def distance_to_feature_start( chromosome, lookup_coordinate, feature_container, lookup_strand=None): distance = None min_distance = None for hit in feature_container.findFeaturesAt( chromosome=chromosome, lookupCoordinate=lookup_coordinate, strand=lookup_strand): hit_start, hit_end, hit_id, hit_strand, hit_ids = hit # calculate distance relative to selected point distance = distance_to_hit( lookup_coordinate, hit_start, hit_end, hit_strand, lookup_strand) if min_distance is None or abs(distance) < abs(min_distance): min_distance = distance if distance is None: for hit in feature_container.findNearestFeature( chromosome=chromosome, lookupCoordinate=lookup_coordinate, strand=lookup_strand): hit_start, hit_end, hit_id, hit_strand, hit_ids = hit distance = distance_to_hit( lookup_coordinate, hit_start, hit_end, hit_strand, lookup_strand) if min_distance is None or abs(distance) < abs(min_distance): min_distance = distance return distance
[docs]def bam_to_histogram( bam_path, add_to, feature_container, site_mode=False, bin_size=25, min_mq=30, max_distance=15_000, head=None, quick=True): histogram = add_to with pysam.AlignmentFile(bam_path) as f: i = 0 for read in f: if head is not None and i > head: break if read.is_unmapped: continue if read.mapping_quality < min_mq: continue if read.has_tag('LY'): library = read.get_tag('LY') else: library = 'No_library_defined' chromosome = read.reference_name if quick or site_mode: if site_mode: if not read.has_tag('DS'): continue i += 1 distance = distance_to_feature_start(chromosome, int( read.get_tag('DS')), feature_container=feature_container) if distance is not None and abs(distance) < max_distance: histogram[library][np.floor( distance / bin_size) * bin_size] += 1 else: i += 1 for distance in [ distance_to_feature_start( chromosome, read.reference_start, feature_container=feature_container), distance_to_feature_start( chromosome, read.reference_end, feature_container=feature_container)]: if distance is not None and abs( distance) < max_distance: histogram[library][np.floor( distance / bin_size) * bin_size] += 1 else: for q_pos, ref_pos in read.get_aligned_pairs( matches_only=True, with_seq=False): distance = distance_to_feature_start( chromosome, ref_pos, feature_container=feature_container) if distance is not None and abs(distance) < max_distance: histogram[library][np.floor(distance / bin_size)] += 1 print(f'{bam_path} read {i} reads')
if __name__ == '__main__': matplotlib.use('Agg') import matplotlib.pyplot as plt matplotlib.rcParams['figure.dpi'] = 160 argparser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description='Visualise feature density of a bam file. (Coverage around stop codons, start codons, genes etc)') argparser.add_argument( '-o', type=str, help="output plot folder path, every library will be visualised separately", default='./plots/') argparser.add_argument( '-d', type=str, help="output data folder path, the data used for plotting will be exported to this folder", default='./tables/') argparser.add_argument( '-features', type=str, default='stop_codon,start_codon,exon,transcript', help="features to plot, separate by comma without space") argparser.add_argument('alignmentfiles', type=str, nargs='*') argparser.add_argument( '-gtf', type=str, required=True, help="GTF file containing the features to plot") argparser.add_argument( '-head', type=int, default=100_000_000, help="Use this amount of reads per bam file (or less if the bam file has less reads)") argparser.add_argument( '--bySite', action='store_true', help="Use the DS tag to count density") argparser.add_argument('--binSize', type=int, default=25) argparser.add_argument( '--maxDistance', type=int, default=15_000, help='Size of the window') argparser.add_argument( '--minMQ', type=int, default=30, help='Minimum mapping quality') args = argparser.parse_args() features = args.features.split(',') if not os.path.exists(args.o): os.makedirs(args.o) if not os.path.exists(args.d): os.makedirs(args.d) # Load the GTF files with the desired annotations annotations = {} for feature in features: annotations[feature] = singlecellmultiomics.features.FeatureContainer() annotations[feature].loadGTF(args.gtf, select_feature_type=[feature]) if len(annotations[feature]) == 0: print(f"""Feature {feature} is not present in the suplied GTF file! Make sure to select feature types which are present in the GTF file!""") histograms = collections.defaultdict( lambda: collections.defaultdict( collections.Counter)) # feature->library->hist # Create histogram per feature / library for feature in features: if len(annotations[feature]) == 0: print(f"Skipping {feature} as it is not present in the GTF file") continue for bam_path in args.alignmentfiles: print(f"Now reading {bam_path} for annotation type {feature}") bam_to_histogram( bam_path, histograms[feature], annotations[feature], site_mode=args.bySite, bin_size=args.binSize, max_distance=args.maxDistance, min_mq=args.minMQ, head=args.head) # Write print('Writing tables') for feature, feature_data in histograms.items(): for library, histogram in feature_data.items(): df = pd.DataFrame( [list(histogram.keys()), list(histogram.values())]).transpose() df.columns = ['bin', 'observations'] df.to_csv( f'{args.d}/{feature}_{library}_{args.maxDistance}_{args.binSize}.csv') # Plot the histograms print('Plotting') for feature, feature_data in histograms.items(): for library, histogram in feature_data.items(): print(f'{feature} {library}') fig, ax = plt.subplots(figsize=(12, 3)) ax.scatter( list( histogram.keys()), list( histogram.values()), s=5, alpha=0.6) ax.set_xlabel(f'Distance from {feature} codon [bp]') ax.set_ylabel('# bases') ax.set_title(f'{library}') ax.set_ylim(bottom=-10) plt.tight_layout() plt.savefig( f'{args.o}/{feature}_{library}_{args.maxDistance}_{args.binSize}.png') plt.close()