#!/usr/bin/env python3
import collections
import os
import sys
import glob
import argparse
import itertools
import gzip
[docs]def decodeKvPairs(kv):
keyValues = {}
for part in kv.split(';'):
kv = part.strip().split()
if len(kv) == 2:
key = kv[0]
value = kv[1].replace('"', '')
keyValues[key] = value
return keyValues
[docs]def exonGTF_to_intronGTF(exon_path, id):
# (gene,chrom,strand)-> [range(start,end), range(start,end)]
geneToExonRanges = collections.defaultdict(list)
prevChrom = None
capture = list(set(['gene_id',
'gene_version',
'transcript_version',
'gene_name',
'gene_source',
'gene_biotype',
'transcript_name',
'transcript_source',
'transcript_biotype',
'transcript_support_level']) - set([id]))
print(capture)
id_to_features = {}
with gzip.open(exon_path, 'rt') if exon_path.endswith('.gz') else open(exon_path) as f:
for line in f:
if line.startswith('#'):
yield line
continue
chromosome, source, feature_type, feature_start, feature_end, at5, strand, at7, kv = line.split(
None, 8)
if feature_type != 'exon':
continue
meta = decodeKvPairs(kv)
id_to_features[meta[id]] = {
c: meta.get(c) for c in capture if c in meta
}
geneToExonRanges[(
meta[id], chromosome, strand, source
)
].append(range(int(feature_start), int(feature_end) + 1))
if prevChrom is not None and prevChrom != chromosome:
for intr in generate_introns(
geneToExonRanges, id, id_to_features):
yield intr
geneToExonRanges = collections.defaultdict(list)
prevChrom = chromosome
if prevChrom is not None:
for intr in generate_introns(geneToExonRanges, id, id_to_features):
yield intr
geneToExonRanges = collections.defaultdict(list)
[docs]def generate_introns(geneToExonRanges, id, id_to_features):
for g in geneToExonRanges:
if len(geneToExonRanges[g]) == 0:
continue
geneStart = min([min(r) for r in geneToExonRanges[g]])
geneEnd = max([max(r) for r in geneToExonRanges[g]])
gene_identifier_or_other, chromosome, strand, source = g
introns = []
in_intron = None
current_intron_start = None
rest_data = id_to_features.get(gene_identifier_or_other, {})
if len(rest_data) > 0:
rest = '; '.join(
[f'{key} {value}' for key, value in rest_data.items()])
else:
rest = ''
for i in sorted(
[geneStart, geneEnd] +
[i.start for i in geneToExonRanges[g]] +
[i.stop for i in geneToExonRanges[g]]):
if any([i in r for r in geneToExonRanges[g]]):
if current_intron_start is not None and in_intron is True:
introns.append([current_intron_start, i])
yield '\t'.join([str(x) for x in [
chromosome,
source,
'intron',
current_intron_start,
i - 1,
'.',
strand,
'.', f'{id} "{gene_identifier_or_other}";{rest}']]) + '\n'
in_intron = False
else:
if in_intron is False:
in_intron = True
current_intron_start = i
if __name__ == '__main__':
argparser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description="Convert exonic GTF file to intronic GTF file")
argparser.add_argument('gffin')
argparser.add_argument('-id', help='identifier', default='transcript_id')
argparser.add_argument(
'-o',
help="Output GTF file",
type=str,
required=True)
args = argparser.parse_args()
with gzip.open(args.o, 'wt') if args.o.endswith('.gz') else open(args.o, 'w') as o:
for intron in exonGTF_to_intronGTF(args.gffin, args.id):
if intron is not None:
o.write(intron)