comparison defuse_trinity_analysis.py @ 40:ed07bcc39f6e

Provide a matched tabular output
author Jim Johnson <jj@umn.edu>
date Wed, 06 May 2015 14:31:57 -0500
parents 90127ee1eae5
children
39:90127ee1eae5 40:ed07bcc39f6e
14 14
15 15
16 """ 16 """
17 This tool takes the deFuse results.tsv tab-delimited file and the Trinity transcripts 17 This tool takes the deFuse results.tsv tab-delimited file and the Trinity transcripts
18 and creates a tabular report 18 and creates a tabular report
19
20 Would it be possible to create 2 additional files from the deFuse-Trinity comparison program?
21 One containing all the Trinity records matched to deFuse records (with the deFuse ID number),
22 and the other with the ORF records matching back to the Trinity records in the first file.
23
24 M045_Report.csv
25 "","deFuse_subset.count","deFuse.gene_name1","deFuse.gene_name2","deFuse.span_count","deFuse.probability","deFuse.gene_chromosome1","deFuse.gene_location1","deFuse.gene_chromosome2","deFuse.gene_location2","deFuse_subset.type"
26 "1",1,"Rps6","Dennd4c",7,0.814853504,"4","coding","4","coding","TIC "
27
28
29
30 OS03_Matched_Rev.csv
31 "count","gene1","gene2","breakpoint","fusion","Trinity_transcript_ID","Trinity_transcript","ID1","protein"
32
33 "","deFuse.splitr_sequence","deFuse.gene_chromosome1","deFuse.gene_chromosome2","deFuse.gene_location1","deFuse.gene_location2","deFuse.gene_name1","deFuse.gene_name2","deFuse.span_count","deFuse.probability","word1","word2","fusion_part_1","fusion_part_2","fusion_point","fusion_point_rc","count","transcript"
34
19 """ 35 """
20 36
21 import sys,re,os.path 37 import sys,re,os.path,math
38 import textwrap
22 import optparse 39 import optparse
23 from optparse import OptionParser 40 from optparse import OptionParser
24 41
25 revcompl = lambda x: ''.join([{'A':'T','C':'G','G':'C','T':'A','a':'t','c':'g','g':'c','t':'a','N':'N','n':'n'}[B] for B in x][::-1]) 42 revcompl = lambda x: ''.join([{'A':'T','C':'G','G':'C','T':'A','a':'t','c':'g','g':'c','t':'a','N':'N','n':'n'}[B] for B in x][::-1])
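# revcompl reverses the sequence and complements each base; N/n map to themselves, e.g.
#   revcompl('AACGT') -> 'ACGTT'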
43
44 codon_map = {"UUU":"F", "UUC":"F", "UUA":"L", "UUG":"L",
45 "UCU":"S", "UCC":"S", "UCA":"S", "UCG":"S",
46 "UAU":"Y", "UAC":"Y", "UAA":"*", "UAG":"*",
47 "UGU":"C", "UGC":"C", "UGA":"*", "UGG":"W",
48 "CUU":"L", "CUC":"L", "CUA":"L", "CUG":"L",
49 "CCU":"P", "CCC":"P", "CCA":"P", "CCG":"P",
50 "CAU":"H", "CAC":"H", "CAA":"Q", "CAG":"Q",
51 "CGU":"R", "CGC":"R", "CGA":"R", "CGG":"R",
52 "AUU":"I", "AUC":"I", "AUA":"I", "AUG":"M",
53 "ACU":"T", "ACC":"T", "ACA":"T", "ACG":"T",
54 "AAU":"N", "AAC":"N", "AAA":"K", "AAG":"K",
55 "AGU":"S", "AGC":"S", "AGA":"R", "AGG":"R",
56 "GUU":"V", "GUC":"V", "GUA":"V", "GUG":"V",
57 "GCU":"A", "GCC":"A", "GCA":"A", "GCG":"A",
58 "GAU":"D", "GAC":"D", "GAA":"E", "GAG":"E",
59 "GGU":"G", "GGC":"G", "GGA":"G", "GGG":"G",}
60
61 def translate(seq) :
62 rna = seq.upper().replace('T','U')
63 aa = []
64 for i in range(0,len(rna) - 2, 3):
65 codon = rna[i:i+3]
66 aa.append(codon_map[codon] if codon in codon_map else 'X')
67 return ''.join(aa)
68
69 def get_stop_codons(seq) :
70 rna = seq.upper().replace('T','U')
71 stop_codons = []
72 for i in range(0,len(rna) - 2, 3):
73 codon = rna[i:i+3]
74 aa = codon_map[codon] if codon in codon_map else 'X'
75 if aa == '*':
76 stop_codons.append(codon)
77 return stop_codons
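# Quick sanity examples for the two helpers above (T is converted to U before lookup,
# unknown codons translate to 'X'):
#   translate('ATGTTTTAA')          -> 'MF*'
#   get_stop_codons('ATGTAAGGGTGA') -> ['UAA', 'UGA']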
26 78
27 def read_fasta(fp): 79 def read_fasta(fp):
28 name, seq = None, [] 80 name, seq = None, []
29 for line in fp: 81 for line in fp:
30 line = line.rstrip() 82 line = line.rstrip()
58 if a1[i].isdigit() and a2[i].isdigit(): 110 if a1[i].isdigit() and a2[i].isdigit():
59 return int(a1[i]) - int(a2[i]) 111 return int(a1[i]) - int(a2[i])
60 return 1 if a1[i] > a2[i] else -1 112 return 1 if a1[i] > a2[i] else -1
61 return len(a1) - len(a2) 113 return len(a1) - len(a2)
62 114
63
64 def parse_defuse_results(inputFile): 115 def parse_defuse_results(inputFile):
116 defuse_results = []
65 columns = [] 117 columns = []
66 defuse_results = [] 118 coltype_int = ['expression1', 'expression2', 'gene_start1', 'gene_start2', 'gene_end1', 'gene_end2', 'genomic_break_pos1', 'genomic_break_pos2', 'breakpoint_homology', 'span_count', 'splitr_count', 'splice_score']
67 # {cluster_id : { field : value}}
120 coltype_yn = [ 'orf', 'exonboundaries', 'read_through', 'interchromosomal', 'adjacent', 'altsplice', 'deletion', 'eversion', 'inversion']
68 try: 121 try:
69 for linenum,line in enumerate(inputFile): 122 for linenum,line in enumerate(inputFile):
70 ## print >> sys.stderr, "%d: %s\n" % (linenum,line) 123 ## print >> sys.stderr, "%d: %s\n" % (linenum,line)
71 fields = line.strip().split('\t') 124 fields = line.strip().split('\t')
72 if line.startswith('cluster_id'): 125 if line.startswith('cluster_id'):
73 columns = fields 126 columns = fields
74 ## print >> sys.stderr, "columns: %s\n" % columns 127 ## print >> sys.stderr, "columns: %s\n" % columns
75 continue 128 continue
76 cluster_dict = dict() 129 elif fields and len(fields) == len(columns):
77 cluster_id = fields[columns.index('cluster_id')] 130 cluster_id = fields[columns.index('cluster_id')]
78 cluster_dict['cluster_id'] = fields[columns.index('cluster_id')] 131 cluster = dict()
79 cluster_dict['gene_chromosome1'] = fields[columns.index('gene_chromosome1')] 132 flags = []
80 cluster_dict['gene_chromosome2'] = fields[columns.index('gene_chromosome2')] 133 defuse_results.append(cluster)
81 cluster_dict['genomic_strand1'] = fields[columns.index('genomic_strand1')] 134 for i,v in enumerate(columns):
82 cluster_dict['genomic_strand2'] = fields[columns.index('genomic_strand2')] 135 if v in coltype_int:
83 cluster_dict['gene1'] = fields[columns.index('gene1')] 136 cluster[v] = int(fields[i])
84 cluster_dict['gene2'] = fields[columns.index('gene2')] 137 elif v in coltype_float:
85 cluster_dict['gene_name1'] = fields[columns.index('gene_name1')] 138 cluster[v] = float(fields[i])
86 cluster_dict['gene_name2'] = fields[columns.index('gene_name2')] 139 elif v in coltype_yn:
87 cluster_dict['gene_location1'] = fields[columns.index('gene_location1')] 140 cluster[v] = fields[i] == 'Y'
88 cluster_dict['gene_location2'] = fields[columns.index('gene_location2')] 141 if cluster[v]:
89 cluster_dict['expression1'] = int(fields[columns.index('expression1')]) 142 flags.append(columns[i])
90 cluster_dict['expression2'] = int(fields[columns.index('expression2')]) 143 else:
91 cluster_dict['genomic_break_pos1'] = int(fields[columns.index('genomic_break_pos1')]) 144 cluster[v] = fields[i]
92 cluster_dict['genomic_break_pos2'] = int(fields[columns.index('genomic_break_pos2')]) 145 cluster['flags'] = ','.join(flags)
93 cluster_dict['breakpoint_homology'] = int(fields[columns.index('breakpoint_homology')])
94 cluster_dict['orf'] = fields[columns.index('orf')] == 'Y'
95 cluster_dict['exonboundaries'] = fields[columns.index('exonboundaries')] == 'Y'
96 cluster_dict['read_through'] = fields[columns.index('read_through')] == 'Y'
97 cluster_dict['interchromosomal'] = fields[columns.index('interchromosomal')] == 'Y'
98 cluster_dict['adjacent'] = fields[columns.index('adjacent')] == 'Y'
99 cluster_dict['altsplice'] = fields[columns.index('altsplice')] == 'Y'
100 cluster_dict['deletion'] = fields[columns.index('deletion')] == 'Y'
101 cluster_dict['eversion'] = fields[columns.index('eversion')] == 'Y'
102 cluster_dict['inversion'] = fields[columns.index('inversion')] == 'Y'
103 cluster_dict['span_count'] = int(fields[columns.index('span_count')])
104 cluster_dict['splitr_count'] = int(fields[columns.index('splitr_count')])
105 cluster_dict['splice_score'] = int(fields[columns.index('splice_score')])
106 cluster_dict['probability'] = float(fields[columns.index('probability')] if columns.index('probability') else 'nan')
107 cluster_dict['splitr_sequence'] = fields[columns.index('splitr_sequence')]
108 defuse_results.append(cluster_dict)
109 except Exception, e: 146 except Exception, e:
110 print >> sys.stderr, "failed: %s" % e 147 print >> sys.stderr, "failed to read cluster_dict: %s" % e
111 sys.exit(1) 148 exit(1)
112 return defuse_results 149 return defuse_results
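# Each entry of defuse_results is a dict keyed by the results.tsv column names, with the
# columns listed in coltype_int/coltype_float/coltype_yn coerced to int/float/bool and a
# derived comma-joined 'flags' string of the Y/N columns that were 'Y'.  A sketch with
# illustrative values only:
#   {'cluster_id': '42', 'gene_name1': 'Rps6', 'gene_name2': 'Dennd4c', 'span_count': 7,
#    'probability': 0.814853504, 'interchromosomal': False, ..., 'flags': 'adjacent,deletion'}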
113 150
114 ## deFuse params to the mapping application? 151 ## deFuse params to the mapping application?
115 152
116 def __main__(): 153 def __main__():
117 #Parse Command Line 154 #Parse Command Line
118 parser = optparse.OptionParser() 155 parser = optparse.OptionParser()
119 # files 156 # files
120 parser.add_option( '-i', '--input', dest='input', help='The input defuse results.tsv file (else read from stdin)' ) 157 parser.add_option( '-i', '--input', dest='input', default=None, help='The input defuse results.tsv file (else read from stdin)' )
121 parser.add_option( '-t', '--transcripts', dest='transcripts', default=None, help='Trinity transcripts' ) 158 parser.add_option( '-t', '--transcripts', dest='transcripts', default=None, help='Trinity transcripts' )
122 parser.add_option( '-p', '--peptides', dest='peptides', default=None, help='Trinity ORFs' ) 159 parser.add_option( '-p', '--peptides', dest='peptides', default=None, help='Trinity ORFs' )
123 parser.add_option( '-o', '--output', dest='output', help='The output report (else write to stdout)' ) 160 parser.add_option( '-o', '--output', dest='output', default=None, help='The output report (else write to stdout)' )
124 parser.add_option( '-a', '--transcript_alignment', dest='transcript_alignment', help='The output alignment file' ) 161 parser.add_option( '-m', '--matched', dest='matched', default=None, help='The output matched report' )
125 parser.add_option( '-A', '--orf_alignment', dest='orf_alignment', help='The output alignment file' ) 162 parser.add_option( '-a', '--transcript_alignment', dest='transcript_alignment', default=None, help='The output alignment file' )
163 parser.add_option( '-A', '--orf_alignment', dest='orf_alignment', default=None, help='The output ORF alignment file' )
126 parser.add_option( '-N', '--nbases', dest='nbases', type='int', default=12, help='Number of bases on either side of the fusion to compare' ) 164 parser.add_option( '-N', '--nbases', dest='nbases', type='int', default=12, help='Number of bases on either side of the fusion to compare' )
127 parser.add_option( '-L', '--min_pep_len', dest='min_pep_len', type='int', default=100, help='Minimum length of peptide to report' ) 165 parser.add_option( '-L', '--min_pep_len', dest='min_pep_len', type='int', default=100, help='Minimum length of peptide to report' )
128 parser.add_option( '-T', '--ticdist', dest='ticdist', type='int', default=1000000, help='Maximum intrachromosomal distance to be classified as a Transcription-induced chimera (TIC)' ) 166 parser.add_option( '-T', '--ticdist', dest='ticdist', type='int', default=1000000, help='Maximum intrachromosomal distance to be classified as a Transcription-induced chimera (TIC)' )
129 parser.add_option( '-P', '--prior_aa', dest='prior_aa', type='int', default=11, help='Number of protein AAs to show preceding the fusion point' ) 167 parser.add_option( '-P', '--prior_aa', dest='prior_aa', type='int', default=11, help='Number of protein AAs to show preceding the fusion point' )
168 parser.add_option( '-I', '--incomplete_orfs', dest='incomplete_orfs', action='store_true', default=False, help='Count incomplete ORFs' )
169 parser.add_option( '-O', '--orf_type', dest='orf_type', action='append', default=['complete','5prime_partial'], choices=['complete','5prime_partial','3prime_partial','internal'], help='ORF types to report' )
170 parser.add_option( '-r', '--readthrough', dest='readthrough', type='int', default=3, help='Number of stop_codons to read through' )
130 # min_orf_len 171 # min_orf_len
131 # split_na_len 172 # split_na_len
132 # tic_len = 1000000 173 # tic_len = 1000000
133 # prior 174 # prior
134 # deFuse direction reversed 175 # deFuse direction reversed
158 except Exception, e: 199 except Exception, e:
159 print >> sys.stderr, "failed: %s" % e 200 print >> sys.stderr, "failed: %s" % e
160 exit(3) 201 exit(3)
161 else: 202 else:
162 outputFile = sys.stdout 203 outputFile = sys.stdout
204 outputTxFile = None
205 outputOrfFile = None
206 if options.transcript_alignment:
207 try:
208 outputTxFile = open(options.transcript_alignment,'w')
209 except Exception, e:
210 print >> sys.stderr, "failed: %s" % e
211 exit(3)
212 if options.orf_alignment:
213 try:
214 outputOrfFile = open(options.orf_alignment,'w')
215 except Exception, e:
216 print >> sys.stderr, "failed: %s" % e
217 exit(3)
218 # Add percent match after transcript
219 report_fields = ['gene_name1','gene_name2','span_count','probability','gene_chromosome1','gene_location1','gene_chromosome2','gene_location2','fusion_type','Transcript','coverage','Protein','flags','alignments1','alignments2']
220 report_fields = ['cluster_id','gene_name1','gene_name2','span_count','probability','genomic_bkpt1','gene_location1','genomic_bkpt2','gene_location2','fusion_type','Transcript','coverage','Protein','flags','alignments1','alignments2']
221 report_colnames = {'gene_name1':'Gene 1','gene_name2':'Gene 2','span_count':'Span cnt','probability':'Probability','gene_chromosome1':'From Chr','gene_location1':'Fusion point','gene_chromosome2':'To Chr','gene_location2':'Fusion point', 'cluster_id':'cluster_id', 'splitr_sequence':'splitr_sequence', 'splitr_count':'splitr_count', 'splitr_span_pvalue':'splitr_span_pvalue', 'splitr_pos_pvalue':'splitr_pos_pvalue', 'splitr_min_pvalue':'splitr_min_pvalue', 'adjacent':'adjacent', 'altsplice':'altsplice', 'break_adj_entropy1':'break_adj_entropy1', 'break_adj_entropy2':'break_adj_entropy2', 'break_adj_entropy_min':'break_adj_entropy_min', 'breakpoint_homology':'breakpoint_homology', 'breakseqs_estislands_percident':'breakseqs_estislands_percident', 'cdna_breakseqs_percident':'cdna_breakseqs_percident', 'deletion':'deletion', 'est_breakseqs_percident':'est_breakseqs_percident', 'eversion':'eversion', 'exonboundaries':'exonboundaries', 'expression1':'expression1', 'expression2':'expression2', 'gene1':'gene1', 'gene2':'gene2', 'gene_align_strand1':'gene_align_strand1', 'gene_align_strand2':'gene_align_strand2', 'gene_end1':'gene_end1', 'gene_end2':'gene_end2', 'gene_start1':'gene_start1', 'gene_start2':'gene_start2', 'gene_strand1':'gene_strand1', 'gene_strand2':'gene_strand2', 'genome_breakseqs_percident':'genome_breakseqs_percident', 'genomic_break_pos1':'genomic_break_pos1', 'genomic_break_pos2':'genomic_break_pos2', 'genomic_strand1':'genomic_strand1', 'genomic_strand2':'genomic_strand2', 'interchromosomal':'interchromosomal', 'interrupted_index1':'interrupted_index1', 'interrupted_index2':'interrupted_index2', 'inversion':'inversion', 'library_name':'library_name', 'max_map_count':'max_map_count', 'max_repeat_proportion':'max_repeat_proportion', 'mean_map_count':'mean_map_count', 'min_map_count':'min_map_count', 'num_multi_map':'num_multi_map', 'num_splice_variants':'num_splice_variants', 'orf':'orf', 'read_through':'read_through', 'repeat_proportion1':'repeat_proportion1', 'repeat_proportion2':'repeat_proportion2', 'span_coverage1':'span_coverage1', 'span_coverage2':'span_coverage2', 'span_coverage_max':'span_coverage_max', 'span_coverage_min':'span_coverage_min', 'splice_score':'splice_score', 'splicing_index1':'splicing_index1', 'splicing_index2':'splicing_index2', 'fusion_type':'Type', 'coverage':'fusion%','Transcript':'Transcript?','Protein':'Protein?','flags':'descriptions','fwd_seq':'fusion','alignments1':'alignments1','alignments2':'alignments2','genomic_bkpt1':'From Chr', 'genomic_bkpt2':'To Chr'}
163 222
164 ## Read defuse results 223 ## Read defuse results
165 fusions = parse_defuse_results(inputFile) 224 fusions = parse_defuse_results(inputFile)
166 ## Create a field with the 12 nt before and after the fusion point. 225 ## Create a field with the 12 nt before and after the fusion point.
167 ## Create a field with the reverse complement of the 24 nt fusion point field. 226 ## Create a field with the reverse complement of the 24 nt fusion point field.
168 ## Add fusion type field (inter, intra, TIC) 227 ## Add fusion type field (inter, intra, TIC)
169 for i,fusion in enumerate(fusions): 228 for i,fusion in enumerate(fusions):
170 fusion['ordinal'] = i + 1 229 fusion['ordinal'] = i + 1
230 fusion['genomic_bkpt1'] = "%s:%d" % (fusion['gene_chromosome1'], fusion['genomic_break_pos1'])
231 fusion['genomic_bkpt2'] = "%s:%d" % (fusion['gene_chromosome2'], fusion['genomic_break_pos2'])
232 fusion['alignments1'] = "%s%s%s" % (fusion['genomic_strand1'], fusion['gene_strand1'], fusion['gene_align_strand1'])
233 fusion['alignments2'] = "%s%s%s" % (fusion['genomic_strand2'], fusion['gene_strand2'], fusion['gene_align_strand2'])
171 split_seqs = fusion['splitr_sequence'].split('|') 234 split_seqs = fusion['splitr_sequence'].split('|')
172 fusion['split_seqs'] = split_seqs 235 fusion['split_seqs'] = split_seqs
173 fwd_seq = split_seqs[0][-(min(abs(options.nbases),len(split_seqs[0]))):] + split_seqs[1][:min(abs(options.nbases),len(split_seqs[1]))] 236 fusion['split_seqs'] = split_seqs
237 fusion['split_seq_lens'] = [len(split_seqs[0]),len(split_seqs[1])]
238 fusion['split_max_lens'] = [len(split_seqs[0]),len(split_seqs[1])]
239 fwd_off = min(abs(options.nbases),len(split_seqs[0]))
240 rev_off = min(abs(options.nbases),len(split_seqs[1]))
241 fusion['fwd_off'] = fwd_off
242 fusion['rev_off'] = rev_off
243 fwd_seq = split_seqs[0][-fwd_off:] + split_seqs[1][:rev_off]
174 rev_seq = revcompl(fwd_seq) 244 rev_seq = revcompl(fwd_seq)
175 fusion['fwd_seq'] = fwd_seq 245 fusion['fwd_seq'] = fwd_seq
176 fusion['rev_seq'] = rev_seq 246 fusion['rev_seq'] = rev_seq
177 fusion_type = 'inter' if fusion['gene_chromosome1'] != fusion['gene_chromosome2'] else 'intra' if abs(fusion['genomic_break_pos1'] - fusion['genomic_break_pos2']) > options.ticdist else 'TIC' 247 fusion_type = 'inter' if fusion['gene_chromosome1'] != fusion['gene_chromosome2'] else 'intra' if abs(fusion['genomic_break_pos1'] - fusion['genomic_break_pos2']) > options.ticdist else 'TIC'
178 fusion['fusion_type'] = fusion_type 248 fusion['fusion_type'] = fusion_type
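# Classification above, with the default --ticdist of 1000000 (distances illustrative):
#   breakpoints on different chromosomes          -> 'inter'
#   same chromosome, breakpoints 2.5 Mb apart     -> 'intra'
#   same chromosome, breakpoints 40 kb apart      -> 'TIC'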
179 fusion['transcripts'] = [] 249 fusion['transcripts'] = dict()
180 fusion['Transcript'] = 'No' 250 fusion['Transcript'] = 'No'
251 fusion['coverage'] = 0
181 fusion['Protein'] = 'No' 252 fusion['Protein'] = 'No'
182 #print >> sys.stdout, "%4d\t%6s\t%s\t%s\t%s\t%s\t%s" % (i,fusion['cluster_id'],fwd_seq,rev_seq,fusion_type,fusion['gene_name1'],fusion['gene_name2']) 253 # print >> sys.stdout, "%4d\t%6s\t%s\t%s\t%s\t%s\t%s" % (i,fusion['cluster_id'],fwd_seq,rev_seq,fusion_type,fusion['gene_name1'],fusion['gene_name2'])
183 inputFile.close() 254 inputFile.close()
184 255
185 ## Process Trinity data and compare to deFuse 256 ## Process Trinity data and compare to deFuse
186 matched_transcripts = dict() 257 matched_transcripts = dict()
187 matched_orfs = dict() 258 matched_orfs = dict()
259 transcript_orfs = dict()
188 fusions_with_transcripts = set() 260 fusions_with_transcripts = set()
189 fusions_with_orfs = set() 261 fusions_with_orfs = set()
262 ## fusion['transcripts'][tx_id] { revcompl:?, bkpt:n, seq1: , seq2: , match1:n, match2:n}
190 n = 0 263 n = 0
191 if options.transcripts: 264 if options.transcripts:
192 with open(options.transcripts) as fp: 265 with open(options.transcripts) as fp:
193 for name, seq in read_fasta(fp): 266 for tx_full_id, seq in read_fasta(fp):
194 n += 1 267 n += 1
195 for i,fusion in enumerate(fusions): 268 for i,fusion in enumerate(fusions):
196 if fusion['fwd_seq'] in seq or fusion['rev_seq'] in seq: 269 if fusion['fwd_seq'] in seq or fusion['rev_seq'] in seq:
197 fusions_with_transcripts.add(i) 270 fusions_with_transcripts.add(i)
198 matched_transcripts[name] = seq
199 fusion['transcripts'].append(name)
200 fusion['Transcript'] = 'Yes' 271 fusion['Transcript'] = 'Yes'
201 #print >> sys.stdout, "fusions_with_transcripts: %d %s\n matched_transcripts: %d" % (len(fusions_with_transcripts),fusions_with_transcripts,len(matched_transcripts)) 272 tx_id = tx_full_id.lstrip('>').split()[0]
202 print >> sys.stdout, "fusions_with_transcripts: %d unique_transcripts: %d" % (len(fusions_with_transcripts),len(matched_transcripts)) 273 matched_transcripts[tx_full_id] = seq
203 #for i,fusion in enumerate(fusions): 274 fusion['transcripts'][tx_id] = dict()
204 # print >> sys.stdout, "%4d\t%6s\t%s\t%s\t%s\t%s\t%s\t%s" % (i,fusion['cluster_id'],fusion['fwd_seq'],fusion['rev_seq'],fusion['fusion_type'],fusion['gene_name1'],fusion['gene_name2'], fusion['transcripts']) 275 fusion['transcripts'][tx_id]['seq'] = seq
276 fusion['transcripts'][tx_id]['full_id'] = tx_full_id
277 pos = seq.find(fusion['fwd_seq'])
278 if pos >= 0:
279 tx_bkpt = pos + fusion['fwd_off']
280 # fusion['transcripts'][tx_full_id] = tx_bkpt
281 if tx_bkpt > fusion['split_max_lens'][0]:
282 fusion['split_max_lens'][0] = tx_bkpt
283 len2 = len(seq) - tx_bkpt
284 if len2 > fusion['split_max_lens'][1]:
285 fusion['split_max_lens'][1] = len2
286 fusion['transcripts'][tx_id]['bkpt'] = tx_bkpt
287 fusion['transcripts'][tx_id]['revcompl'] = False
288 fusion['transcripts'][tx_id]['seq1'] = seq[:tx_bkpt]
289 fusion['transcripts'][tx_id]['seq2'] = seq[tx_bkpt:]
290 else:
291 pos = seq.find(fusion['rev_seq'])
292 tx_bkpt = pos + fusion['rev_off']
293 # fusion['transcripts'][tx_full_id] = -tx_bkpt
294 if tx_bkpt > fusion['split_max_lens'][1]:
295 fusion['split_max_lens'][1] = tx_bkpt
296 len2 = len(seq) - tx_bkpt
297 if len2 > fusion['split_max_lens'][0]:
298 fusion['split_max_lens'][0] = len2
299 rseq = revcompl(seq)
300 pos = rseq.find(fusion['fwd_seq'])
301 tx_bkpt = pos + fusion['fwd_off']
302 fusion['transcripts'][tx_id]['bkpt'] = tx_bkpt
303 fusion['transcripts'][tx_id]['revcompl'] = True
304 fusion['transcripts'][tx_id]['seq1'] = rseq[:tx_bkpt]
305 fusion['transcripts'][tx_id]['seq2'] = rseq[tx_bkpt:]
306 fseq = fusion['split_seqs'][0]
307 tseq = fusion['transcripts'][tx_id]['seq1']
308 mlen = min(len(fseq),len(tseq))
309 fusion['transcripts'][tx_id]['match1'] = mlen
310 for j in range(1,mlen+1):
311 if fseq[-j] != tseq[-j]:
312 fusion['transcripts'][tx_id]['match1'] = j - 1
313 break
314 fseq = fusion['split_seqs'][1]
315 tseq = fusion['transcripts'][tx_id]['seq2']
316 mlen = min(len(fseq),len(tseq))
317 fusion['transcripts'][tx_id]['match2'] = mlen
318 for j in range(mlen):
319 if fseq[j] != tseq[j]:
320 fusion['transcripts'][tx_id]['match2'] = j
321 break
322 # coverage = math.floor(float(fusion['transcripts'][tx_id]['match1'] + fusion['transcripts'][tx_id]['match2']) * 100. / len(fusion['split_seqs'][0]+fusion['split_seqs'][1]))
323 coverage = int((fusion['transcripts'][tx_id]['match1'] + fusion['transcripts'][tx_id]['match2']) * 1000. / len(fusion['split_seqs'][0]+fusion['split_seqs'][1])) * .1
324 # print >> sys.stderr, "%s\t%d\t%d\t%d\%s\t\t%d\t%d\t%d\t%d" % (tx_id,fusion['transcripts'][tx_id]['match1'],fusion['transcripts'][tx_id]['match2'],len(fusion['split_seqs'][0]+fusion['split_seqs'][1]),coverage,len( fusion['split_seqs'][0]),len(fusion['transcripts'][tx_id]['seq1']),len(fusion['split_seqs'][1]),len(fusion['transcripts'][tx_id]['seq2']))
325 fusion['coverage'] = max(coverage,fusion['coverage'])
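# coverage above is the percent of the splitr_sequence reproduced by the matched transcript,
# kept to one decimal place.  Worked example (numbers illustrative): match1=40, match2=35
# against split seqs totalling 100 nt gives int((40 + 35) * 1000. / 100) * .1 == 75.0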
326 print >> sys.stdout, "fusions_with_transcripts: %d %s\n matched_transcripts: %d" % (len(fusions_with_transcripts),fusions_with_transcripts,len(matched_transcripts))
327 ##for i,fusion in enumerate(fusions):
328 ## print >> sys.stdout, "%4d\t%6s\t%s\t%s\t%s\t%s\t%s\t%s" % (i,fusion['cluster_id'],fusion['fwd_seq'],fusion['rev_seq'],fusion['fusion_type'],fusion['gene_name1'],fusion['gene_name2'], fusion['transcripts'])
205 ## Process ORFs and compare to matched deFuse and Trinity data. 329 ## Process ORFs and compare to matched deFuse and Trinity data.
206 ## Proteins must be at least 100 aa long, starting at the first "M" and must end with an "*". 330 ## Proteins must be at least 100 aa long, starting at the first "M" and must end with an "*".
207 if options.peptides: 331 if options.peptides:
208 with open(options.peptides) as fp: 332 with open(options.peptides) as fp:
209 for name, seq in read_fasta(fp): 333 for orf_full_id, seq in read_fasta(fp):
210 n += 1 334 n += 1
211 if len(seq) < options.min_pep_len: 335 if len(seq) < options.min_pep_len:
212 continue 336 continue
337 orf_type = re.match('^.* type:(\S+) .*$',orf_full_id).groups()[0]
338 ## if not seq[-1] == '*' and not options.incomplete_orfs:
339 ## if not orf_type 'complete' and not options.incomplete_orfs:
340 if orf_type not in options.orf_type:
341 continue
213 for i,fusion in enumerate(fusions): 342 for i,fusion in enumerate(fusions):
214 if len(fusion['transcripts']) > 0: 343 if len(fusion['transcripts']) > 0:
215 for id_string in fusion['transcripts']: 344 for tx_id in fusion['transcripts']:
216 tx_id = id_string.lstrip('>').split()[0] 345 ## >m.196252 g.196252 ORF g.196252 m.196252 type:complete len:237 (+) comp100000_c5_seq2:315-1025(+)
217 if tx_id in name: 346 ## >m.134565 g.134565 ORF g.134565 m.134565 type:5prime_partial len:126 (-) comp98702_c1_seq21:52-429(-)
347 if tx_id+':' not in orf_full_id:
348 continue
349 m = re.match("^.*%s:(\d+)-(\d+)[(]([+-])[)].*" % re.sub('([|.{}()$?^])','[\\1]',tx_id),orf_full_id)
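# For the first example header above and tx_id 'comp100000_c5_seq2', m.groups() would be
# ('315', '1025', '+'): the ORF's start, end and strand on the Trinity transcript.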
350 if m:
351 if not m.groups() or len(m.groups()) < 3 or m.groups()[0] == None:
352 print >> sys.stderr, "Error:\n%s\n%s\n" % (tx_id,orf_full_id)
353 orf_id = orf_full_id.lstrip('>').split()[0]
354 if not tx_id in transcript_orfs:
355 transcript_orfs[tx_id] = []
356 alignments = "%s%s%s %s%s%s" % (fusion['genomic_strand1'], fusion['gene_strand1'], fusion['gene_align_strand1'], fusion['genomic_strand2'], fusion['gene_strand2'], fusion['gene_align_strand2'])
357 # print >> sys.stdout, "%d %s bkpt:%d %s rc:%s (%s) %s" % (fusion['ordinal'], tx_id, int(fusion['transcripts'][tx_id]['bkpt']), str(m.groups()), str(fusion['transcripts'][tx_id]['revcompl']), alignments, orf_full_id)
358 start = seq.find('M')
218 pep_len = len(seq) 359 pep_len = len(seq)
219 start = seq.find('M')
220 if pep_len - start < options.min_pep_len: 360 if pep_len - start < options.min_pep_len:
221 continue 361 continue
362 orf_dict = dict()
363 transcript_orfs[tx_id].append(orf_dict)
222 fusions_with_orfs.add(i) 364 fusions_with_orfs.add(i)
223 matched_orfs[name] = seq 365 matched_orfs[orf_full_id] = seq
224 fusion['Protein'] = 'Yes' 366 fusion['Protein'] = 'Yes'
225 """ 367 tx_start = int(m.groups()[0])
226 # fwd or reverse 368 tx_end = int(m.groups()[1])
227 tx_seq = matched_transcripts(tx_id) 369 tx_strand = m.groups()[2]
228 pos = tx_seq.find(fusion['fwd_seq']) 370 tx_bkpt = fusion['transcripts'][tx_id]['bkpt']
229 if pos < 0: 371 orf_dict['orf_id'] = orf_id
230 pos = tx_seq.find(fusion['rev_seq']) 372 orf_dict['tx_start'] = tx_start
231 # locate fusion in transcript 373 orf_dict['tx_end'] = tx_end
232 # locate fusion in ORF 374 orf_dict['tx_strand'] = tx_strand
233 fusion['prior_pep_seq'] = '' 375 orf_dict['tx_bkpt'] = tx_bkpt
234 fusion['novel_pep_seq'] = '' 376 orf_dict['seq'] = seq[:start].lower() + seq[start:] if start > 0 else seq
235 """ 377 ## >m.208656 g.208656 ORF g.208656 m.208656 type:5prime_partial len:303 (+) comp100185_c2_seq9:2-910(+)
236 #print >> sys.stdout, "fusions_with_orfs: %d %s\n matched_orfs: %d" % (len(fusions_with_orfs),fusions_with_orfs,len(matched_orfs)) 378 ## translate(tx34[1:910])
237 print >> sys.stdout, "fusions_with_orfs: %d unique_orfs: %d" % (len(fusions_with_orfs),len(matched_orfs)) 379 ## translate(tx34[1:2048])
380 ## comp99273_c1_seq1 len=3146 (-2772)
381 ## >m.158338 g.158338 ORF g.158338 m.158338 type:complete len:785 (-) comp99273_c1_seq1:404-2758(-)
382 ## translate(tx[-2758:-403])
383 ## comp100185_c2_seq9 len=2048 (904)
384 ## novel protein sequence
385 ## find first novel AA
386 ## get prior n AAs
387 ## get novel AA seq thru n stop codons
388 ### tx_seq = matched_transcripts[tx_full_id] if tx_bkpt >= 0 else revcompl(tx_seq)
389 tx_seq = fusion['transcripts'][tx_id]['seq']
390 orf_dict['tx_seq'] = tx_seq
391 novel_tx_seq = tx_seq[tx_start - 1:] if tx_strand == '+' else revcompl(tx_seq[:tx_end])
392 read_thru_pep = translate(novel_tx_seq)
393 # fusion['transcripts'][tx_id]['revcompl'] = True
394 # tx_bkpt = fusion['transcripts'][tx_id]['bkpt']
395 # bkpt_aa_pos = tx_bkpt - tx_start - 1
396 # bkpt_aa_pos = (tx_bkpt - tx_start - 1) / 3 if tx_strand == '+' else tx_end
397 # print >> sys.stdout, "%s\n%s" % (seq,read_thru_pep)
398 stop_codons = get_stop_codons(novel_tx_seq)
399 if options.readthrough:
400 readthrough = options.readthrough + 1
401 read_thru_pep = '*'.join(read_thru_pep.split('*')[:readthrough])
402 stop_codons = stop_codons[:readthrough]
403 orf_dict['read_thru_pep'] = read_thru_pep
404 orf_dict['stop_codons'] = ','.join(stop_codons)
405 print >> sys.stdout, "fusions_with_orfs: %d %s\n matched_orfs: %d" % (len(fusions_with_orfs),fusions_with_orfs,len(matched_orfs))
406 ## Alignments: 3 columns; seq columns padded out to the longest seq; matching bases UPPERCASE, differences lowercase
407 ### defuse_id pre_split_seq post_split_seq
408 ### trinity_id pre_split_seq post_split_seq
409 ## Transcripts alignment output
410 ## Peptide alignment output
238 ## Write reports 411 ## Write reports
239 report_fields = ['gene_name1','gene_name2','span_count','probability','gene_chromosome1','gene_location1','gene_chromosome2','gene_location2','fusion_type','Transcript','Protein'] 412 ## OS03_Matched_Rev.csv
240 report_colnames = {'gene_name1':'Gene 1','gene_name2':'Gene 2','span_count':'Span cnt','probability':'Probability','gene_chromosome1':'From Chr','gene_location1':'Fusion point','gene_chromosome2':'To Chr','gene_location2':'Fusion point','fusion_type':'Type','Transcript':'Transcript?','Protein':'Protein?' } 413 ## "count","gene1","gene2","breakpoint","fusion","Trinity_transcript_ID","Trinity_transcript","ID1","protein"
414 if options.transcripts and options.matched:
415 #match_fields = ['ordinal','gene_name1','gene_name2','fwd_seq']
416 outputMatchFile = open(options.matched,'w')
417 #print >> outputMatchFile, '\t'.join(["#fusion_id","cluster_id","gene1","gene2","breakpoint","fusion","Trinity_transcript_ID","Trinity_transcript","Trinity_ORF_Transcript","Trinity_ORF_ID","protein","read_through","stop_codons"])
418 print >> outputMatchFile, '\t'.join(["#fusion_id","cluster_id","gene1","gene2","breakpoint","fusion","Trinity_transcript_ID","Trinity_transcript","Trinity_ORF_Transcript","Trinity_ORF_ID","protein","stop_codons"])
419 for i,fusion in enumerate(fusions):
420 if len(fusion['transcripts']) > 0:
421 for tx_id in fusion['transcripts'].keys():
422 if tx_id in transcript_orfs:
423 for orf_dict in transcript_orfs[tx_id]:
424 if 'tx_seq' not in orf_dict:
425 print >> sys.stderr, "orf_dict %s" % orf_dict
426 #fields = [str(fusion['ordinal']),str(fusion['cluster_id']),fusion['gene_name1'],fusion['gene_name2'],fusion['fwd_seq'],fusion['splitr_sequence'],tx_id, fusion['transcripts'][tx_id]['seq1']+'|'+fusion['transcripts'][tx_id]['seq2'],orf_dict['tx_seq'],orf_dict['orf_id'],orf_dict['seq'],orf_dict['read_thru_pep'],orf_dict['stop_codons']]
427 fields = [str(fusion['ordinal']),str(fusion['cluster_id']),fusion['gene_name1'],fusion['gene_name2'],fusion['fwd_seq'],fusion['splitr_sequence'],tx_id, fusion['transcripts'][tx_id]['seq1']+'|'+fusion['transcripts'][tx_id]['seq2'],orf_dict['tx_seq'],orf_dict['orf_id'],orf_dict['read_thru_pep'],orf_dict['stop_codons']]
428 print >> outputMatchFile, '\t'.join(fields)
429 outputMatchFile.close()
430 if options.transcripts and options.transcript_alignment:
431 if outputTxFile:
432 id_fields = ['gene_name1','alignments1','gene_name2','alignments2','span_count','probability','gene_chromosome1','gene_location1','gene_chromosome2','gene_location2','fusion_type','Transcript','Protein','flags']
433 fa_width = 80
434 for i,fusion in enumerate(fusions):
435 if len(fusion['transcripts']) > 0:
436 alignments1 = "%s%s%s" % (fusion['genomic_strand1'], fusion['gene_strand1'], fusion['gene_align_strand1'])
437 alignments2 = "%s%s%s" % (fusion['genomic_strand2'], fusion['gene_strand2'], fusion['gene_align_strand2'])
438 alignments = "%s%s%s %s%s%s" % (fusion['genomic_strand1'], fusion['gene_strand1'], fusion['gene_align_strand1'], fusion['genomic_strand2'], fusion['gene_strand2'], fusion['gene_align_strand2'])
439 fusion_id = "%s (%s) %s" % (i + 1,alignments,' '.join([str(fusion[x]) for x in report_fields]))
440 for tx_id in fusion['transcripts'].keys():
441 m1 = fusion['transcripts'][tx_id]['match1']
442 f_seq1 = fusion['split_seqs'][0][:-m1].lower() + fusion['split_seqs'][0][-m1:]
443 t_seq1 = fusion['transcripts'][tx_id]['seq1'][:-m1].lower() + fusion['transcripts'][tx_id]['seq1'][-m1:]
444 if len(f_seq1) > len(t_seq1):
445 t_seq1 = t_seq1.rjust(len(f_seq1),'.')
446 elif len(f_seq1) < len(t_seq1):
447 f_seq1 = f_seq1.rjust(len(t_seq1),'.')
448 m2 = fusion['transcripts'][tx_id]['match2']
449 f_seq2 = fusion['split_seqs'][1][:m2] + fusion['split_seqs'][1][m2:].lower()
450 t_seq2 = fusion['transcripts'][tx_id]['seq2'][:m2] + fusion['transcripts'][tx_id]['seq2'][m2:].lower()
451 if len(f_seq2) > len(t_seq2):
452 t_seq2 = t_seq2.ljust(len(f_seq2),'.')
453 elif len(f_seq2) < len(t_seq2):
454 f_seq2 = f_seq2.ljust(len(t_seq2),'.')
455 print >> outputTxFile, ">%s\n%s\n%s" % (fusion_id,'\n'.join(textwrap.wrap(f_seq1,fa_width)),'\n'.join(textwrap.wrap(f_seq2,fa_width)))
456 print >> outputTxFile, "%s bkpt:%d rev_compl:%s\n%s\n%s" % (fusion['transcripts'][tx_id]['full_id'],fusion['transcripts'][tx_id]['bkpt'],str(fusion['transcripts'][tx_id]['revcompl']),'\n'.join(textwrap.wrap(t_seq1,fa_width)),'\n'.join(textwrap.wrap(t_seq2,fa_width)))
457 """
458 if options.peptides and options.orf_alignment:
459 pass
460 """
241 print >> outputFile,"%s\t%s" % ('#','\t'.join([report_colnames[x] for x in report_fields])) 461 print >> outputFile,"%s\t%s" % ('#','\t'.join([report_colnames[x] for x in report_fields]))
242 for i,fusion in enumerate(fusions): 462 for i,fusion in enumerate(fusions):
243 print >> outputFile,"%s\t%s" % (i + 1,'\t'.join([str(fusion[x]) for x in report_fields])) 463 print >> outputFile,"%s\t%s" % (i + 1,'\t'.join([str(fusion[x]) for x in report_fields]))
244 # print >> outputFile, "%d\t%s\t%s\t%d\t%f\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (i,fusion['gene_name1'],fusion['gene_name2'],fusion['span_count'],fusion['probability'],fusion['gene_chromosome1'],fusion['gene_location1'],fusion['gene_chromosome2'],fusion['gene_location2'],fusion['fusion_type'],fusion['Transcript'],fusion['Protein'])
245 464
246 if __name__ == "__main__" : __main__() 465 if __name__ == "__main__" : __main__()
247 466