# HG changeset patch # User Jim Johnson # Date 1423618552 21600 # Node ID 4353f776dfa3198eb13bea7fc7964dcfc56732d5 # Parent a004033614d486629b3b7bd97d38f3a04488ec6f Add defuse_trinity_analysis diff -r a004033614d4 -r 4353f776dfa3 defuse_trinity_analysis.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/defuse_trinity_analysis.py Tue Feb 10 19:35:52 2015 -0600 @@ -0,0 +1,243 @@ +#!/usr/bin/env python +""" +# +#------------------------------------------------------------------------------ +# University of Minnesota +# Copyright 2014, Regents of the University of Minnesota +#------------------------------------------------------------------------------ +# Author: +# +# James E Johnson +# +#------------------------------------------------------------------------------ +""" + + +""" +This tool takes the defuse results.tsv tab-delimited file, trinity +and creates a tabular report +""" + +import sys,re,os.path +import optparse +from optparse import OptionParser + +revcompl = lambda x: ''.join([{'A':'T','C':'G','G':'C','T':'A','a':'t','c':'g','g':'c','t':'a','N':'N','n':'n'}[B] for B in x][::-1]) + +def read_fasta(fp): + name, seq = None, [] + for line in fp: + line = line.rstrip() + if line.startswith(">"): + if name: yield (name, ''.join(seq)) + name, seq = line, [] + else: + seq.append(line) + if name: yield (name, ''.join(seq)) + + +def test_rcomplement(seq, target): + try: + comp = revcompl(seq) + return comp in target + except: + pass + return False + +def test_reverse(seq,target): + return options.test_reverse and seq and seq[::-1] in target + +def cmp_alphanumeric(s1,s2): + if s1 == s2: + return 0 + a1 = re.findall("\d+|[a-zA-Z]+",s1) + a2 = re.findall("\d+|[a-zA-Z]+",s2) + for i in range(min(len(a1),len(a2))): + if a1[i] == a2[i]: + continue + if a1[i].isdigit() and a2[i].isdigit(): + return int(a1[i]) - int(a2[i]) + return 1 if a1[i] > a2[i] else -1 + return len(a1) - len(a2) + + +def parse_defuse_results(inputFile): + columns = [] + defuse_results = [] + # {cluster_id : { field : value}) + try: + for linenum,line in enumerate(inputFile): + ## print >> sys.stderr, "%d: %s\n" % (linenum,line) + fields = line.strip().split('\t') + if line.startswith('cluster_id'): + columns = fields + ## print >> sys.stderr, "columns: %s\n" % columns + continue + cluster_dict = dict() + cluster_id = fields[columns.index('cluster_id')] + cluster_dict['cluster_id'] = fields[columns.index('cluster_id')] + cluster_dict['gene_chromosome1'] = fields[columns.index('gene_chromosome1')] + cluster_dict['gene_chromosome2'] = fields[columns.index('gene_chromosome2')] + cluster_dict['genomic_strand1'] = fields[columns.index('genomic_strand1')] + cluster_dict['genomic_strand2'] = fields[columns.index('genomic_strand2')] + cluster_dict['gene1'] = fields[columns.index('gene1')] + cluster_dict['gene2'] = fields[columns.index('gene2')] + cluster_dict['gene_name1'] = fields[columns.index('gene_name1')] + cluster_dict['gene_name2'] = fields[columns.index('gene_name2')] + cluster_dict['gene_location1'] = fields[columns.index('gene_location1')] + cluster_dict['gene_location2'] = fields[columns.index('gene_location2')] + cluster_dict['expression1'] = int(fields[columns.index('expression1')]) + cluster_dict['expression2'] = int(fields[columns.index('expression2')]) + cluster_dict['genomic_break_pos1'] = int(fields[columns.index('genomic_break_pos1')]) + cluster_dict['genomic_break_pos2'] = int(fields[columns.index('genomic_break_pos2')]) + cluster_dict['breakpoint_homology'] = int(fields[columns.index('breakpoint_homology')]) + cluster_dict['orf'] = fields[columns.index('orf')] == 'Y' + cluster_dict['exonboundaries'] = fields[columns.index('exonboundaries')] == 'Y' + cluster_dict['read_through'] = fields[columns.index('read_through')] == 'Y' + cluster_dict['interchromosomal'] = fields[columns.index('interchromosomal')] == 'Y' + cluster_dict['adjacent'] = fields[columns.index('adjacent')] == 'Y' + cluster_dict['altsplice'] = fields[columns.index('altsplice')] == 'Y' + cluster_dict['deletion'] = fields[columns.index('deletion')] == 'Y' + cluster_dict['eversion'] = fields[columns.index('eversion')] == 'Y' + cluster_dict['inversion'] = fields[columns.index('inversion')] == 'Y' + cluster_dict['span_count'] = int(fields[columns.index('span_count')]) + cluster_dict['splitr_count'] = int(fields[columns.index('splitr_count')]) + cluster_dict['splice_score'] = int(fields[columns.index('splice_score')]) + cluster_dict['probability'] = float(fields[columns.index('probability')] if columns.index('probability') else 'nan') + cluster_dict['splitr_sequence'] = fields[columns.index('splitr_sequence')] + defuse_results.append(cluster_dict) + except Exception, e: + print >> sys.stderr, "failed: %s" % e + sys.exit(1) + return defuse_results + +## deFuse params to the mapping application? + +def __main__(): + #Parse Command Line + parser = optparse.OptionParser() + # files + parser.add_option( '-i', '--input', dest='input', help='The input defuse results.tsv file (else read from stdin)' ) + parser.add_option( '-t', '--transcripts', dest='transcripts', default=None, help='Trinity transcripts' ) + parser.add_option( '-p', '--peptides', dest='peptides', default=None, help='Trinity ORFs' ) + parser.add_option( '-o', '--output', dest='output', help='The output report (else write to stdout)' ) + parser.add_option( '-a', '--transcript_alignment', dest='transcript_alignment', help='The output alignment file' ) + parser.add_option( '-A', '--orf_alignment', dest='orf_alignment', help='The output alignment file' ) + parser.add_option( '-N', '--nbases', dest='nbases', type='int', default=12, help='Number of bases on either side of the fusion to compare' ) + parser.add_option( '-L', '--min_pep_len', dest='min_pep_len', type='int', default=100, help='Minimum length of peptide to report' ) + parser.add_option( '-T', '--ticdist', dest='ticdist', type='int', default=1000000, help='Maximum intrachromosomal distance to be classified a Transcription-induced chimera (TIC)' ) + parser.add_option( '-P', '--prior_aa', dest='prior_aa', type='int', default=11, help='Number of protein AAs to show preceeding fusion point' ) + # min_orf_len + # split_na_len + # tic_len = 1000000 + # prior + # deFuse direction reversed + # in frame ? + # contain known protein elements + # what protein change + # trinity provides full transctipt, defuse doesn't show full + #parser.add_option( '-r', '--reference', dest='reference', default=None, help='The genomic reference fasta' ) + #parser.add_option( '-g', '--gtf', dest='gtf', default=None, help='The genomic reference gtf feature file') + (options, args) = parser.parse_args() + + # results.tsv input + if options.input != None: + try: + inputPath = os.path.abspath(options.input) + inputFile = open(inputPath, 'r') + except Exception, e: + print >> sys.stderr, "failed: %s" % e + exit(2) + else: + inputFile = sys.stdin + # vcf output + if options.output != None: + try: + outputPath = os.path.abspath(options.output) + outputFile = open(outputPath, 'w') + except Exception, e: + print >> sys.stderr, "failed: %s" % e + exit(3) + else: + outputFile = sys.stdout + + ## Read defuse results + fusions = parse_defuse_results(inputFile) + ## Create a field with the 12 nt before and after the fusion point. + ## Create a field with the reverse complement of the 24 nt fusion point field. + ## Add fusion type filed (INTER, INTRA, TIC) + for i,fusion in enumerate(fusions): + fusion['ordinal'] = i + 1 + split_seqs = fusion['splitr_sequence'].split('|') + fusion['split_seqs'] = split_seqs + fwd_seq = split_seqs[0][-(min(abs(options.nbases),len(split_seqs[0]))):] + split_seqs[1][:min(abs(options.nbases),len(split_seqs[1]))] + rev_seq = revcompl(fwd_seq) + fusion['fwd_seq'] = fwd_seq + fusion['rev_seq'] = rev_seq + fusion_type = 'inter' if fusion['gene_chromosome1'] != fusion['gene_chromosome2'] else 'intra' if abs(fusion['genomic_break_pos1'] - fusion['genomic_break_pos2']) > options.ticdist else 'TIC' + fusion['fusion_type'] = fusion_type + fusion['transcripts'] = [] + fusion['Transcript'] = 'No' + fusion['Protein'] = 'No' + print >> sys.stdout, "%4d\t%6s\t%s\t%s\t%s\t%s\t%s" % (i,fusion['cluster_id'],fwd_seq,rev_seq,fusion_type,fusion['gene_name1'],fusion['gene_name2']) + inputFile.close() + + ## Process Trinity data and compare to deFuse + matched_transcripts = dict() + matched_orfs = dict() + fusions_with_transcripts = set() + fusions_with_orfs = set() + n = 0 + if options.transcripts: + with open(options.transcripts) as fp: + for name, seq in read_fasta(fp): + n += 1 + for i,fusion in enumerate(fusions): + if fusion['fwd_seq'] in seq or fusion['rev_seq'] in seq: + fusions_with_transcripts.add(i) + matched_transcripts[name] = seq + fusion['transcripts'].append(name) + fusion['Transcript'] = 'Yes' + print >> sys.stdout, "fusions_with_transcripts: %d %s\n matched_transcripts: %d" % (len(fusions_with_transcripts),fusions_with_transcripts,len(matched_transcripts)) + for i,fusion in enumerate(fusions): + print >> sys.stdout, "%4d\t%6s\t%s\t%s\t%s\t%s\t%s\t%s" % (i,fusion['cluster_id'],fusion['fwd_seq'],fusion['rev_seq'],fusion['fusion_type'],fusion['gene_name1'],fusion['gene_name2'], fusion['transcripts']) + ## Process ORFs and compare to matched deFuse and Trinity data. + ## Proteins must be at least 100 aa long, starting at the first "M" and must end with an "*". + if options.peptides: + with open(options.peptides) as fp: + for name, seq in read_fasta(fp): + n += 1 + if len(seq) < options.min_pep_len: + continue + for i,fusion in enumerate(fusions): + if len(fusion['transcripts']) > 0: + for id_string in fusion['transcripts']: + tx_id = id_string.lstrip('>').split()[0] + if tx_id in name: + pep_len = len(seq) + start = seq.find('M') + if pep_len - start < options.min_pep_len: + continue + fusions_with_orfs.add(i) + matched_orfs[name] = seq + fusion['Protein'] = 'Yes' + # fwd or reverse + tx_seq = matched_transcripts(tx_id) + pos = tx_seq.find(fusion['fwd_seq']) + if pos < 0: + pos = tx_seq.find(fusion['rev_seq']) + # locate fusion in transcript + # locate fusion in ORF + fusion['prior_pep_seq'] = '' + fusion['novel_pep_seq'] = '' + print >> sys.stdout, "fusions_with_orfs: %d %s\n matched_orfs: %d" % (len(fusions_with_orfs),fusions_with_orfs,len(matched_orfs)) + ## Write reports + report_fields = ['gene_name1','gene_name2','span_count','probability','gene_chromosome1','gene_location1','gene_chromosome2','gene_location2','fusion_type','Transcript','Protein'] + report_colnames = {'gene_name1':'Gene 1','gene_name2':'Gene 2','span_count':'Span cnt','probability':'Probability','gene_chromosome1':'From Chr','gene_location1':'Fusion point','gene_chromosome2':'To Chr','gene_location2':'Fusion point','fusion_type':'Type','Transcript':'Transcript?','Protein':'Protein?' } + print >> outputFile,"%s\t%s" % ('#','\t'.join([report_colnames[x] for x in report_fields])) + for i,fusion in enumerate(fusions): + print >> outputFile,"%s\t%s" % (i + 1,'\t'.join([str(fusion[x]) for x in report_fields])) + # print >> outputFile, "%d\t%s\t%s\t%d\t%f\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (i,fusion['gene_name1'],fusion['gene_name2'],fusion['span_count'],fusion['probability'],fusion['gene_chromosome1'],fusion['gene_location1'],fusion['gene_chromosome2'],fusion['gene_location2'],fusion['fusion_type'],fusion['Transcript'],fusion['Protein']) + +if __name__ == "__main__" : __main__() + diff -r a004033614d4 -r 4353f776dfa3 defuse_trinity_analysis.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/defuse_trinity_analysis.xml Tue Feb 10 19:35:52 2015 -0600 @@ -0,0 +1,37 @@ + + + verify fusions with trinity + defuse_trinity_analysis.py --input $defuse_results --transcripts $trinity_transcripts --peptides $trinity_orfs + --nbases $nbases --min_pep_len $min_pep_len --ticdist $ticdist + --output $output + + + + + + + + + + + + + + + + + + + + + + +**Defuse Results ** + +Verifies DeFuse_ results.tsv with TrinityRNAseq_ assembled transcripts and ORFs. + +This program relies on the header line of the results.tsv to determine which columns to use for analysis. +.. _DeFuse: http://sourceforge.net/apps/mediawiki/defuse +.. _TrinityRNAseq: http://trinityrnaseq.github.io/ + +