| 36 | 1 #!/usr/bin/env python | 
|  | 2 """ | 
|  | 3 # | 
|  | 4 #------------------------------------------------------------------------------ | 
|  | 5 #                         University of Minnesota | 
|  | 6 #         Copyright 2014, Regents of the University of Minnesota | 
|  | 7 #------------------------------------------------------------------------------ | 
|  | 8 # Author: | 
|  | 9 # | 
|  | 10 #  James E Johnson | 
|  | 11 # | 
|  | 12 #------------------------------------------------------------------------------ | 
|  | 13 """ | 
|  | 14 | 
|  | 15 | 
|  | 16 """ | 
|  | 17 This tool takes the defuse results.tsv  tab-delimited file, trinity | 
|  | 18 and creates a tabular report | 
| 40 | 19 | 
|  | 20 Would it be possible to create 2 additional files from the deFuse-Trinity comparison program. | 
|  | 21 One containing all the Trinity records matched to deFuse records (with the deFuse ID number), | 
|  | 22 and the other with the ORFs records matching back to the Trinity records in the first files? | 
|  | 23 | 
|  | 24 M045_Report.csv | 
|  | 25 "","deFuse_subset.count","deFuse.gene_name1","deFuse.gene_name2","deFuse.span_count","deFuse.probability","deFuse.gene_chromosome1","deFuse.gene_location1","deFuse.gene_chromosome2","deFuse.gene_location2","deFuse_subset.type" | 
|  | 26 "1",1,"Rps6","Dennd4c",7,0.814853504,"4","coding","4","coding","TIC  " | 
|  | 27 | 
|  | 28 | 
|  | 29 | 
|  | 30 OS03_Matched_Rev.csv | 
|  | 31 "count","gene1","gene2","breakpoint","fusion","Trinity_transcript_ID","Trinity_transcript","ID1","protein" | 
|  | 32 | 
|  | 33 "","deFuse.splitr_sequence","deFuse.gene_chromosome1","deFuse.gene_chromosome2","deFuse.gene_location1","deFuse.gene_location2","deFuse.gene_name1","deFuse.gene_name2","deFuse.span_count","deFuse.probability","word1","word2","fusion_part_1","fusion_part_2","fusion_point","fusion_point_rc","count","transcript" | 
|  | 34 | 
| 36 | 35 """ | 
|  | 36 | 
| 40 | 37 import sys,re,os.path,math | 
|  | 38 import textwrap | 
| 36 | 39 import optparse | 
|  | 40 from optparse import OptionParser | 
|  | 41 | 
|  | 42 revcompl = lambda x: ''.join([{'A':'T','C':'G','G':'C','T':'A','a':'t','c':'g','g':'c','t':'a','N':'N','n':'n'}[B] for B in x][::-1]) | 
|  | 43 | 
| 40 | 44 codon_map = {"UUU":"F", "UUC":"F", "UUA":"L", "UUG":"L", | 
|  | 45     "UCU":"S", "UCC":"S", "UCA":"S", "UCG":"S", | 
|  | 46     "UAU":"Y", "UAC":"Y", "UAA":"*", "UAG":"*", | 
|  | 47     "UGU":"C", "UGC":"C", "UGA":"*", "UGG":"W", | 
|  | 48     "CUU":"L", "CUC":"L", "CUA":"L", "CUG":"L", | 
|  | 49     "CCU":"P", "CCC":"P", "CCA":"P", "CCG":"P", | 
|  | 50     "CAU":"H", "CAC":"H", "CAA":"Q", "CAG":"Q", | 
|  | 51     "CGU":"R", "CGC":"R", "CGA":"R", "CGG":"R", | 
|  | 52     "AUU":"I", "AUC":"I", "AUA":"I", "AUG":"M", | 
|  | 53     "ACU":"T", "ACC":"T", "ACA":"T", "ACG":"T", | 
|  | 54     "AAU":"N", "AAC":"N", "AAA":"K", "AAG":"K", | 
|  | 55     "AGU":"S", "AGC":"S", "AGA":"R", "AGG":"R", | 
|  | 56     "GUU":"V", "GUC":"V", "GUA":"V", "GUG":"V", | 
|  | 57     "GCU":"A", "GCC":"A", "GCA":"A", "GCG":"A", | 
|  | 58     "GAU":"D", "GAC":"D", "GAA":"E", "GAG":"E", | 
|  | 59     "GGU":"G", "GGC":"G", "GGA":"G", "GGG":"G",} | 
|  | 60 | 
|  | 61 def translate(seq) : | 
|  | 62   rna = seq.upper().replace('T','U') | 
|  | 63   aa = [] | 
|  | 64   for i in range(0,len(rna) - 2, 3): | 
|  | 65     codon = rna[i:i+3] | 
|  | 66     aa.append(codon_map[codon] if codon in codon_map else 'X') | 
|  | 67   return ''.join(aa) | 
|  | 68 | 
|  | 69 def get_stop_codons(seq) : | 
|  | 70   rna = seq.upper().replace('T','U') | 
|  | 71   stop_codons = [] | 
|  | 72   for i in range(0,len(rna) - 2, 3): | 
|  | 73     codon = rna[i:i+3] | 
|  | 74     aa = codon_map[codon] if codon in codon_map else 'X' | 
|  | 75     if aa == '*': | 
|  | 76       stop_codons.append(codon) | 
|  | 77   return stop_codons | 
|  | 78 | 
| 36 | 79 def read_fasta(fp): | 
|  | 80     name, seq = None, [] | 
|  | 81     for line in fp: | 
|  | 82         line = line.rstrip() | 
|  | 83         if line.startswith(">"): | 
|  | 84             if name: yield (name, ''.join(seq)) | 
|  | 85             name, seq = line, [] | 
|  | 86         else: | 
|  | 87             seq.append(line) | 
|  | 88     if name: yield (name, ''.join(seq)) | 
|  | 89 | 
|  | 90 | 
|  | 91 def test_rcomplement(seq, target): | 
|  | 92   try: | 
|  | 93     comp = revcompl(seq) | 
|  | 94     return comp in target | 
|  | 95   except: | 
|  | 96     pass | 
|  | 97   return False | 
|  | 98 | 
|  | 99 def test_reverse(seq,target): | 
|  | 100   return options.test_reverse and seq and seq[::-1] in target | 
|  | 101 | 
|  | 102 def cmp_alphanumeric(s1,s2): | 
|  | 103   if s1 == s2: | 
|  | 104     return 0 | 
|  | 105   a1 = re.findall("\d+|[a-zA-Z]+",s1) | 
|  | 106   a2 = re.findall("\d+|[a-zA-Z]+",s2) | 
|  | 107   for i in range(min(len(a1),len(a2))): | 
|  | 108     if a1[i] == a2[i]: | 
|  | 109       continue | 
|  | 110     if a1[i].isdigit() and a2[i].isdigit(): | 
|  | 111       return int(a1[i]) - int(a2[i]) | 
|  | 112     return 1 if a1[i] >  a2[i] else -1 | 
|  | 113   return len(a1) - len(a2) | 
|  | 114 | 
|  | 115 def parse_defuse_results(inputFile): | 
| 40 | 116   defuse_results = [] | 
| 36 | 117   columns = [] | 
| 40 | 118   coltype_int = ['expression1', 'expression2', 'gene_start1', 'gene_start2', 'gene_end1', 'gene_end2', 'genomic_break_pos1', 'genomic_break_pos2', 'breakpoint_homology', 'span_count', 'splitr_count', 'splice_score'] | 
|  | 119   coltype_float = ['probability'] | 
|  | 120   coltype_yn = [ 'orf', 'exonboundaries', 'read_through', 'interchromosomal', 'adjacent', 'altsplice', 'deletion', 'eversion', 'inversion'] | 
| 36 | 121   try: | 
|  | 122     for linenum,line in enumerate(inputFile): | 
|  | 123       ## print >> sys.stderr, "%d: %s\n" % (linenum,line) | 
|  | 124       fields = line.strip().split('\t') | 
|  | 125       if line.startswith('cluster_id'): | 
|  | 126         columns = fields | 
|  | 127         ## print >> sys.stderr, "columns: %s\n" % columns | 
|  | 128         continue | 
| 40 | 129       elif fields and len(fields) == len(columns): | 
|  | 130         cluster_id = fields[columns.index('cluster_id')] | 
|  | 131         cluster = dict() | 
|  | 132         flags = [] | 
|  | 133         defuse_results.append(cluster) | 
|  | 134         for i,v in enumerate(columns): | 
|  | 135           if v in coltype_int: | 
|  | 136             cluster[v] = int(fields[i]) | 
|  | 137           elif v in coltype_float: | 
|  | 138             cluster[v] = float(fields[i]) | 
|  | 139           elif v in coltype_yn: | 
|  | 140             cluster[v] = fields[i] == 'Y' | 
|  | 141             if cluster[v]: | 
|  | 142               flags.append(columns[i]) | 
|  | 143           else: | 
|  | 144             cluster[v] = fields[i] | 
|  | 145         cluster['flags'] = ','.join(flags) | 
| 36 | 146   except Exception, e: | 
| 40 | 147     print >> sys.stderr, "failed to read cluster_dict: %s" % e | 
|  | 148     exit(1) | 
| 36 | 149   return defuse_results | 
|  | 150 | 
|  | 151 ## deFuse params to the mapping application? | 
|  | 152 | 
|  | 153 def __main__(): | 
|  | 154   #Parse Command Line | 
|  | 155   parser = optparse.OptionParser() | 
|  | 156   # files | 
| 40 | 157   parser.add_option( '-i', '--input', dest='input', default=None, help='The input defuse results.tsv file (else read from stdin)' ) | 
| 36 | 158   parser.add_option( '-t', '--transcripts', dest='transcripts', default=None, help='Trinity transcripts' ) | 
|  | 159   parser.add_option( '-p', '--peptides', dest='peptides', default=None, help='Trinity ORFs' ) | 
| 40 | 160   parser.add_option( '-o', '--output', dest='output', default=None, help='The output report (else write to stdout)' ) | 
|  | 161   parser.add_option( '-m', '--matched', dest='matched', default=None, help='The output matched report' ) | 
|  | 162   parser.add_option( '-a', '--transcript_alignment', dest='transcript_alignment', default=None, help='The output alignment file' ) | 
|  | 163   parser.add_option( '-A', '--orf_alignment', dest='orf_alignment', default=None, help='The output ORF alignment file' ) | 
| 36 | 164   parser.add_option( '-N', '--nbases', dest='nbases', type='int', default=12, help='Number of bases on either side of the fusion to compare' ) | 
|  | 165   parser.add_option( '-L', '--min_pep_len', dest='min_pep_len', type='int', default=100, help='Minimum length of peptide to report' ) | 
|  | 166   parser.add_option( '-T', '--ticdist', dest='ticdist', type='int', default=1000000, help='Maximum intrachromosomal distance to be classified a Transcription-induced chimera (TIC)' ) | 
|  | 167   parser.add_option( '-P', '--prior_aa', dest='prior_aa', type='int', default=11, help='Number of protein AAs to show preceeding fusion point' ) | 
| 40 | 168   parser.add_option( '-I', '--incomplete_orfs', dest='incomplete_orfs', action='store_true', default=False, help='Count incomplete ORFs'  ) | 
|  | 169   parser.add_option( '-O', '--orf_type', dest='orf_type', action='append', default=['complete','5prime_partial'], choices=['complete','5prime_partial','3prime_partial','internal'], help='ORF types to report'  ) | 
|  | 170   parser.add_option( '-r', '--readthrough', dest='readthrough', type='int', default=3, help='Number of stop_codons to read through' ) | 
| 36 | 171   # min_orf_len | 
|  | 172   # split_na_len | 
|  | 173   # tic_len = 1000000 | 
|  | 174   # prior | 
|  | 175   # deFuse direction reversed | 
|  | 176   # in frame ? | 
|  | 177   # contain known protein elements | 
|  | 178   # what protein change | 
|  | 179   # trinity provides full transctipt, defuse doesn't show full | 
|  | 180   #parser.add_option( '-r', '--reference', dest='reference', default=None, help='The genomic reference fasta' ) | 
|  | 181   #parser.add_option( '-g', '--gtf', dest='gtf', default=None, help='The genomic reference gtf feature file') | 
|  | 182   (options, args) = parser.parse_args() | 
|  | 183 | 
|  | 184   # results.tsv input | 
|  | 185   if options.input != None: | 
|  | 186     try: | 
|  | 187       inputPath = os.path.abspath(options.input) | 
|  | 188       inputFile = open(inputPath, 'r') | 
|  | 189     except Exception, e: | 
|  | 190       print >> sys.stderr, "failed: %s" % e | 
|  | 191       exit(2) | 
|  | 192   else: | 
|  | 193     inputFile = sys.stdin | 
|  | 194   # vcf output | 
|  | 195   if options.output != None: | 
|  | 196     try: | 
|  | 197       outputPath = os.path.abspath(options.output) | 
|  | 198       outputFile = open(outputPath, 'w') | 
|  | 199     except Exception, e: | 
|  | 200       print >> sys.stderr, "failed: %s" % e | 
|  | 201       exit(3) | 
|  | 202   else: | 
|  | 203     outputFile = sys.stdout | 
| 40 | 204   outputTxFile = None | 
|  | 205   outputOrfFile = None | 
|  | 206   if options.transcript_alignment: | 
|  | 207     try: | 
|  | 208       outputTxFile = open(options.transcript_alignment,'w') | 
|  | 209     except Exception, e: | 
|  | 210       print >> sys.stderr, "failed: %s" % e | 
|  | 211       exit(3) | 
|  | 212   if options.orf_alignment: | 
|  | 213     try: | 
|  | 214       outputOrfFile = open(options.orf_alignment,'w') | 
|  | 215     except Exception, e: | 
|  | 216       print >> sys.stderr, "failed: %s" % e | 
|  | 217       exit(3) | 
|  | 218   # Add percent match after transcript | 
|  | 219   report_fields = ['gene_name1','gene_name2','span_count','probability','gene_chromosome1','gene_location1','gene_chromosome2','gene_location2','fusion_type','Transcript','coverage','Protein','flags','alignments1','alignments2'] | 
|  | 220   report_fields = ['cluster_id','gene_name1','gene_name2','span_count','probability','genomic_bkpt1','gene_location1','genomic_bkpt2','gene_location2','fusion_type','Transcript','coverage','Protein','flags','alignments1','alignments2'] | 
|  | 221   report_colnames = {'gene_name1':'Gene 1','gene_name2':'Gene 2','span_count':'Span cnt','probability':'Probability','gene_chromosome1':'From Chr','gene_location1':'Fusion point','gene_chromosome2':'To Chr','gene_location2':'Fusion point', 'cluster_id':'cluster_id', 'splitr_sequence':'splitr_sequence', 'splitr_count':'splitr_count', 'splitr_span_pvalue':'splitr_span_pvalue', 'splitr_pos_pvalue':'splitr_pos_pvalue', 'splitr_min_pvalue':'splitr_min_pvalue', 'adjacent':'adjacent', 'altsplice':'altsplice', 'break_adj_entropy1':'break_adj_entropy1', 'break_adj_entropy2':'break_adj_entropy2', 'break_adj_entropy_min':'break_adj_entropy_min', 'breakpoint_homology':'breakpoint_homology', 'breakseqs_estislands_percident':'breakseqs_estislands_percident', 'cdna_breakseqs_percident':'cdna_breakseqs_percident', 'deletion':'deletion', 'est_breakseqs_percident':'est_breakseqs_percident', 'eversion':'eversion', 'exonboundaries':'exonboundaries', 'expression1':'expression1', 'expression2':'expression2', 'gene1':'gene1', 'gene2':'gene2', 'gene_align_strand1':'gene_align_strand1', 'gene_align_strand2':'gene_align_strand2', 'gene_end1':'gene_end1', 'gene_end2':'gene_end2', 'gene_start1':'gene_start1', 'gene_start2':'gene_start2', 'gene_strand1':'gene_strand1', 'gene_strand2':'gene_strand2', 'genome_breakseqs_percident':'genome_breakseqs_percident', 'genomic_break_pos1':'genomic_break_pos1', 'genomic_break_pos2':'genomic_break_pos2', 'genomic_strand1':'genomic_strand1', 'genomic_strand2':'genomic_strand2', 'interchromosomal':'interchromosomal', 'interrupted_index1':'interrupted_index1', 'interrupted_index2':'interrupted_index2', 'inversion':'inversion', 'library_name':'library_name', 'max_map_count':'max_map_count', 'max_repeat_proportion':'max_repeat_proportion', 'mean_map_count':'mean_map_count', 'min_map_count':'min_map_count', 'num_multi_map':'num_multi_map', 'num_splice_variants':'num_splice_variants', 'orf':'orf', 'read_through':'read_through', 'repeat_proportion1':'repeat_proportion1', 'repeat_proportion2':'repeat_proportion2', 'span_coverage1':'span_coverage1', 'span_coverage2':'span_coverage2', 'span_coverage_max':'span_coverage_max', 'span_coverage_min':'span_coverage_min', 'splice_score':'splice_score', 'splicing_index1':'splicing_index1', 'splicing_index2':'splicing_index2', 'fusion_type':'Type', 'coverage':'fusion%','Transcript':'Transcript?','Protein':'Protein?','flags':'descriptions','fwd_seq':'fusion','alignments1':'alignments1','alignments2':'alignments2','genomic_bkpt1':'From Chr', 'genomic_bkpt2':'To Chr'} | 
| 36 | 222 | 
|  | 223   ## Read defuse results | 
|  | 224   fusions = parse_defuse_results(inputFile) | 
|  | 225   ## Create a field with the 12 nt before and after the fusion point. | 
|  | 226   ## Create a field with the reverse complement of the 24 nt fusion point field. | 
|  | 227   ## Add fusion type filed (INTER, INTRA, TIC) | 
|  | 228   for i,fusion in enumerate(fusions): | 
|  | 229       fusion['ordinal'] = i + 1 | 
| 40 | 230       fusion['genomic_bkpt1'] = "%s:%d" % (fusion['gene_chromosome1'], fusion['genomic_break_pos1']) | 
|  | 231       fusion['genomic_bkpt2'] = "%s:%d" % (fusion['gene_chromosome2'], fusion['genomic_break_pos2']) | 
|  | 232       fusion['alignments1'] = "%s%s%s" % (fusion['genomic_strand1'], fusion['gene_strand1'], fusion['gene_align_strand1']) | 
|  | 233       fusion['alignments2'] = "%s%s%s" % (fusion['genomic_strand2'], fusion['gene_strand2'], fusion['gene_align_strand2']) | 
| 36 | 234       split_seqs = fusion['splitr_sequence'].split('|') | 
|  | 235       fusion['split_seqs'] = split_seqs | 
| 40 | 236       fusion['split_seqs'] = split_seqs | 
|  | 237       fusion['split_seq_lens'] = [len(split_seqs[0]),len(split_seqs[1])] | 
|  | 238       fusion['split_max_lens'] = [len(split_seqs[0]),len(split_seqs[1])] | 
|  | 239       fwd_off = min(abs(options.nbases),len(split_seqs[0])) | 
|  | 240       rev_off = min(abs(options.nbases),len(split_seqs[1])) | 
|  | 241       fusion['fwd_off'] = fwd_off | 
|  | 242       fusion['rev_off'] = rev_off | 
|  | 243       fwd_seq = split_seqs[0][-fwd_off:] + split_seqs[1][:rev_off] | 
| 36 | 244       rev_seq =  revcompl(fwd_seq) | 
|  | 245       fusion['fwd_seq'] = fwd_seq | 
|  | 246       fusion['rev_seq'] = rev_seq | 
|  | 247       fusion_type = 'inter' if fusion['gene_chromosome1'] != fusion['gene_chromosome2'] else 'intra' if abs(fusion['genomic_break_pos1'] - fusion['genomic_break_pos2']) > options.ticdist else 'TIC' | 
|  | 248       fusion['fusion_type'] = fusion_type | 
| 40 | 249       fusion['transcripts'] = dict() | 
| 36 | 250       fusion['Transcript'] = 'No' | 
| 40 | 251       fusion['coverage'] = 0 | 
| 36 | 252       fusion['Protein'] = 'No' | 
| 40 | 253       # print >> sys.stdout, "%4d\t%6s\t%s\t%s\t%s\t%s\t%s" % (i,fusion['cluster_id'],fwd_seq,rev_seq,fusion_type,fusion['gene_name1'],fusion['gene_name2']) | 
| 36 | 254   inputFile.close() | 
|  | 255 | 
|  | 256   ## Process Trinity data and compare to deFuse | 
|  | 257   matched_transcripts = dict() | 
|  | 258   matched_orfs = dict() | 
| 40 | 259   transcript_orfs = dict() | 
| 36 | 260   fusions_with_transcripts = set() | 
|  | 261   fusions_with_orfs = set() | 
| 40 | 262   ## fusion['transcripts'][tx_id] { revcompl:?, bkpt:n, seq1: ,  seq2: , match1:n, match2:n} | 
| 36 | 263   n = 0 | 
|  | 264   if options.transcripts: | 
|  | 265     with open(options.transcripts) as fp: | 
| 40 | 266       for tx_full_id, seq in read_fasta(fp): | 
| 36 | 267         n += 1 | 
|  | 268         for i,fusion in enumerate(fusions): | 
|  | 269           if fusion['fwd_seq'] in seq or fusion['rev_seq'] in seq: | 
|  | 270             fusions_with_transcripts.add(i) | 
|  | 271             fusion['Transcript'] = 'Yes' | 
| 40 | 272             tx_id = tx_full_id.lstrip('>').split()[0] | 
|  | 273             matched_transcripts[tx_full_id] = seq | 
|  | 274             fusion['transcripts'][tx_id] = dict() | 
|  | 275             fusion['transcripts'][tx_id]['seq'] = seq | 
|  | 276             fusion['transcripts'][tx_id]['full_id'] = tx_full_id | 
|  | 277             pos = seq.find(fusion['fwd_seq']) | 
|  | 278             if pos >= 0: | 
|  | 279               tx_bkpt = pos + fusion['fwd_off'] | 
|  | 280               # fusion['transcripts'][tx_full_id] = tx_bkpt | 
|  | 281               if tx_bkpt > fusion['split_max_lens'][0]: | 
|  | 282                 fusion['split_max_lens'][0] = tx_bkpt | 
|  | 283               len2 = len(seq) - tx_bkpt | 
|  | 284               if len2 > fusion['split_max_lens'][1]: | 
|  | 285                 fusion['split_max_lens'][1] = len2 | 
|  | 286               fusion['transcripts'][tx_id]['bkpt'] = tx_bkpt | 
|  | 287               fusion['transcripts'][tx_id]['revcompl'] = False | 
|  | 288               fusion['transcripts'][tx_id]['seq1'] = seq[:tx_bkpt] | 
|  | 289               fusion['transcripts'][tx_id]['seq2'] = seq[tx_bkpt:] | 
|  | 290             else: | 
|  | 291               pos = seq.find(fusion['rev_seq']) | 
|  | 292               tx_bkpt = pos + fusion['rev_off'] | 
|  | 293               # fusion['transcripts'][tx_full_id] = -tx_bkpt | 
|  | 294               if tx_bkpt > fusion['split_max_lens'][1]: | 
|  | 295                 fusion['split_max_lens'][1] = tx_bkpt | 
|  | 296               len2 = len(seq) - tx_bkpt | 
|  | 297               if len2 > fusion['split_max_lens'][0]: | 
|  | 298                 fusion['split_max_lens'][0] = len2 | 
|  | 299               rseq = revcompl(seq) | 
|  | 300               pos = rseq.find(fusion['fwd_seq']) | 
|  | 301               tx_bkpt = pos + fusion['fwd_off'] | 
|  | 302               fusion['transcripts'][tx_id]['bkpt'] = tx_bkpt | 
|  | 303               fusion['transcripts'][tx_id]['revcompl'] = True | 
|  | 304               fusion['transcripts'][tx_id]['seq1'] = rseq[:tx_bkpt] | 
|  | 305               fusion['transcripts'][tx_id]['seq2'] = rseq[tx_bkpt:] | 
|  | 306             fseq = fusion['split_seqs'][0] | 
|  | 307             tseq = fusion['transcripts'][tx_id]['seq1'] | 
|  | 308             mlen = min(len(fseq),len(tseq)) | 
|  | 309             fusion['transcripts'][tx_id]['match1'] = mlen | 
|  | 310             for j in range(1,mlen+1): | 
|  | 311               if fseq[-j] != tseq[-j]: | 
|  | 312                 fusion['transcripts'][tx_id]['match1'] = j - 1 | 
|  | 313                 break | 
|  | 314             fseq = fusion['split_seqs'][1] | 
|  | 315             tseq = fusion['transcripts'][tx_id]['seq2'] | 
|  | 316             mlen = min(len(fseq),len(tseq)) | 
|  | 317             fusion['transcripts'][tx_id]['match2'] = mlen | 
|  | 318             for j in range(mlen): | 
|  | 319               if fseq[j] != tseq[j]: | 
|  | 320                 fusion['transcripts'][tx_id]['match2'] = j | 
|  | 321                 break | 
|  | 322             # coverage = math.floor(float(fusion['transcripts'][tx_id]['match1'] + fusion['transcripts'][tx_id]['match2']) * 100. / len(fusion['split_seqs'][0]+fusion['split_seqs'][1])) | 
|  | 323             coverage = int((fusion['transcripts'][tx_id]['match1'] + fusion['transcripts'][tx_id]['match2']) * 1000. / len(fusion['split_seqs'][0]+fusion['split_seqs'][1])) * .1 | 
|  | 324             # print >> sys.stderr, "%s\t%d\t%d\t%d\%s\t\t%d\t%d\t%d\t%d" % (tx_id,fusion['transcripts'][tx_id]['match1'],fusion['transcripts'][tx_id]['match2'],len(fusion['split_seqs'][0]+fusion['split_seqs'][1]),coverage,len( fusion['split_seqs'][0]),len(fusion['transcripts'][tx_id]['seq1']),len(fusion['split_seqs'][1]),len(fusion['transcripts'][tx_id]['seq2'])) | 
|  | 325             fusion['coverage'] = max(coverage,fusion['coverage']) | 
|  | 326     print >> sys.stdout, "fusions_with_transcripts: %d  %s\n matched_transcripts: %d" % (len(fusions_with_transcripts),fusions_with_transcripts,len(matched_transcripts)) | 
|  | 327     ##for i,fusion in enumerate(fusions): | 
|  | 328     ##  print >> sys.stdout, "%4d\t%6s\t%s\t%s\t%s\t%s\t%s\t%s" % (i,fusion['cluster_id'],fusion['fwd_seq'],fusion['rev_seq'],fusion['fusion_type'],fusion['gene_name1'],fusion['gene_name2'], fusion['transcripts']) | 
| 36 | 329     ## Process ORFs and compare to matched deFuse and Trinity data. | 
|  | 330     ## Proteins must be at least 100 aa long, starting at the first "M" and must end with an "*". | 
|  | 331     if options.peptides: | 
|  | 332       with open(options.peptides) as fp: | 
| 40 | 333         for orf_full_id, seq in read_fasta(fp): | 
| 36 | 334           n += 1 | 
|  | 335           if len(seq) < options.min_pep_len: | 
|  | 336             continue | 
| 40 | 337           orf_type = re.match('^.* type:(\S+) .*$',orf_full_id).groups()[0] | 
|  | 338           ## if not seq[-1] == '*' and not options.incomplete_orfs: | 
|  | 339           ## if not orf_type 'complete' and not options.incomplete_orfs: | 
|  | 340           if orf_type not in options.orf_type: | 
|  | 341             continue | 
| 36 | 342           for i,fusion in enumerate(fusions): | 
|  | 343             if len(fusion['transcripts']) > 0: | 
| 40 | 344               for tx_id in fusion['transcripts']: | 
|  | 345                 ## >m.196252 g.196252  ORF g.196252 m.196252 type:complete len:237 (+) comp100000_c5_seq2:315-1025(+) | 
|  | 346                 ## >m.134565 g.134565  ORF g.134565 m.134565 type:5prime_partial len:126 (-) comp98702_c1_seq21:52-429(-) | 
|  | 347                 if tx_id+':' not in orf_full_id: | 
|  | 348                   continue | 
|  | 349                 m = re.match("^.*%s:(\d+)-(\d+)[(]([+-])[)].*" % re.sub('([|.{}()$?^])','[\\1]',tx_id),orf_full_id) | 
|  | 350                 if m: | 
|  | 351                   if not m.groups() or len(m.groups()) < 3 or m.groups()[0] == None: | 
|  | 352                     print >> sys.stderr, "Error:\n%s\n%s\n" % (tx_id,orf_full_id) | 
|  | 353                   orf_id = orf_full_id.lstrip('>').split()[0] | 
|  | 354                   if not tx_id in transcript_orfs: | 
|  | 355                     transcript_orfs[tx_id] = [] | 
|  | 356                   alignments = "%s%s%s %s%s%s" % (fusion['genomic_strand1'], fusion['gene_strand1'], fusion['gene_align_strand1'], fusion['genomic_strand2'], fusion['gene_strand2'], fusion['gene_align_strand2']) | 
|  | 357                   # print >> sys.stdout, "%d %s bkpt:%d %s rc:%s (%s)   %s" % (fusion['ordinal'], tx_id, int(fusion['transcripts'][tx_id]['bkpt']), str(m.groups()), str(fusion['transcripts'][tx_id]['revcompl']), alignments, orf_full_id) | 
|  | 358                   start = seq.find('M') | 
| 36 | 359                   pep_len = len(seq) | 
|  | 360                   if pep_len - start < options.min_pep_len: | 
|  | 361                     continue | 
| 40 | 362                   orf_dict = dict() | 
|  | 363                   transcript_orfs[tx_id].append(orf_dict) | 
| 36 | 364                   fusions_with_orfs.add(i) | 
| 40 | 365                   matched_orfs[orf_full_id] = seq | 
| 36 | 366                   fusion['Protein'] = 'Yes' | 
| 40 | 367                   tx_start = int(m.groups()[0]) | 
|  | 368                   tx_end = int(m.groups()[1]) | 
|  | 369                   tx_strand = m.groups()[2] | 
|  | 370                   tx_bkpt = fusion['transcripts'][tx_id]['bkpt'] | 
|  | 371                   orf_dict['orf_id'] = orf_id | 
|  | 372                   orf_dict['tx_start'] = tx_start | 
|  | 373                   orf_dict['tx_end'] = tx_end | 
|  | 374                   orf_dict['tx_strand'] = tx_strand | 
|  | 375                   orf_dict['tx_bkpt'] = tx_bkpt | 
|  | 376                   orf_dict['seq'] = seq[:start].lower() + seq[start:] if start > 0 else seq | 
|  | 377                   ## >m.208656 g.208656  ORF g.208656 m.208656 type:5prime_partial len:303 (+) comp100185_c2_seq9:2-910(+) | 
|  | 378                   ## translate(tx34[1:910]) | 
|  | 379                   ## translate(tx34[1:2048]) | 
|  | 380                   ## comp99273_c1_seq1 len=3146 (-2772) | 
|  | 381                   ## >m.158338 g.158338  ORF g.158338 m.158338 type:complete len:785 (-) comp99273_c1_seq1:404-2758(-) | 
|  | 382                   ##  translate(tx[-2758:-403]) | 
|  | 383                   ## comp100185_c2_seq9 len=2048 (904) | 
|  | 384                   ## novel protein sequence | 
|  | 385                   ## find first novel AA | 
|  | 386                   ## get prior n AAs | 
|  | 387                   ## get novel AA seq thru n stop codons | 
|  | 388                   ### tx_seq = matched_transcripts[tx_full_id] if tx_bkpt >= 0 else revcompl(tx_seq) | 
|  | 389                   tx_seq = fusion['transcripts'][tx_id]['seq'] | 
|  | 390                   orf_dict['tx_seq'] = tx_seq | 
|  | 391                   novel_tx_seq = tx_seq[tx_start - 1:] if tx_strand == '+' else revcompl(tx_seq[:tx_end]) | 
|  | 392                   read_thru_pep = translate(novel_tx_seq) | 
|  | 393                   # fusion['transcripts'][tx_id]['revcompl'] = True | 
|  | 394                   # tx_bkpt = fusion['transcripts'][tx_id]['bkpt'] | 
|  | 395                   # bkpt_aa_pos = tx_bkpt - tx_start - 1 | 
|  | 396                   # bkpt_aa_pos = (tx_bkpt - tx_start - 1) / 3 if tx_strand == '+' else tx_end | 
|  | 397                   # print >> sys.stdout, "%s\n%s" % (seq,read_thru_pep) | 
|  | 398                   stop_codons = get_stop_codons(novel_tx_seq) | 
|  | 399                   if options.readthrough: | 
|  | 400                     readthrough = options.readthrough + 1 | 
|  | 401                     read_thru_pep = '*'.join(read_thru_pep.split('*')[:readthrough]) | 
|  | 402                     stop_codons = stop_codons[:readthrough] | 
|  | 403                   orf_dict['read_thru_pep'] = read_thru_pep | 
|  | 404                   orf_dict['stop_codons'] = ','.join(stop_codons) | 
|  | 405       print >> sys.stdout, "fusions_with_orfs: %d  %s\n matched_orfs: %d" % (len(fusions_with_orfs),fusions_with_orfs,len(matched_orfs)) | 
|  | 406   ## Alignments 3 columns, seq columns padded out to longest seq, UPPERCASE_match  diffs lowercase | 
|  | 407   ### defuse_id		pre_split_seq		post_split_seq | 
|  | 408   ### trinity_id	pre_split_seq		post_split_seq | 
|  | 409   ## Transcripts alignment output | 
|  | 410   ## Peptide alignment output | 
| 36 | 411   ## Write reports | 
| 40 | 412   ## OS03_Matched_Rev.csv | 
|  | 413   ## "count","gene1","gene2","breakpoint","fusion","Trinity_transcript_ID","Trinity_transcript","ID1","protein" | 
|  | 414   if options.transcripts and options.matched: | 
|  | 415     #match_fields = ['ordinal','gene_name1','gene_name2','fwd_seq'] | 
|  | 416     outputMatchFile = open(options.matched,'w') | 
|  | 417     #print >> outputMatchFile, '\t'.join(["#fusion_id","cluster_id","gene1","gene2","breakpoint","fusion","Trinity_transcript_ID","Trinity_transcript","Trinity_ORF_Transcript","Trinity_ORF_ID","protein","read_through","stop_codons"]) | 
|  | 418     print >> outputMatchFile, '\t'.join(["#fusion_id","cluster_id","gene1","gene2","breakpoint","fusion","Trinity_transcript_ID","Trinity_transcript","Trinity_ORF_Transcript","Trinity_ORF_ID","protein","stop_codons"]) | 
|  | 419     for i,fusion in enumerate(fusions): | 
|  | 420       if len(fusion['transcripts']) > 0: | 
|  | 421         for tx_id in fusion['transcripts'].keys(): | 
|  | 422           if tx_id in transcript_orfs: | 
|  | 423             for orf_dict in transcript_orfs[tx_id]: | 
|  | 424               if 'tx_seq' not in orf_dict: | 
|  | 425                 print >> sys.stderr, "orf_dict %s" % orf_dict | 
|  | 426               #fields = [str(fusion['ordinal']),str(fusion['cluster_id']),fusion['gene_name1'],fusion['gene_name2'],fusion['fwd_seq'],fusion['splitr_sequence'],tx_id, fusion['transcripts'][tx_id]['seq1']+'|'+fusion['transcripts'][tx_id]['seq2'],orf_dict['tx_seq'],orf_dict['orf_id'],orf_dict['seq'],orf_dict['read_thru_pep'],orf_dict['stop_codons']] | 
|  | 427               fields = [str(fusion['ordinal']),str(fusion['cluster_id']),fusion['gene_name1'],fusion['gene_name2'],fusion['fwd_seq'],fusion['splitr_sequence'],tx_id, fusion['transcripts'][tx_id]['seq1']+'|'+fusion['transcripts'][tx_id]['seq2'],orf_dict['tx_seq'],orf_dict['orf_id'],orf_dict['read_thru_pep'],orf_dict['stop_codons']] | 
|  | 428               print >> outputMatchFile, '\t'.join(fields) | 
|  | 429     outputMatchFile.close() | 
|  | 430   if options.transcripts and options.transcript_alignment: | 
|  | 431     if outputTxFile: | 
|  | 432       id_fields = ['gene_name1','alignments1','gene_name2','alignments2','span_count','probability','gene_chromosome1','gene_location1','gene_chromosome2','gene_location2','fusion_type','Transcript','Protein','flags'] | 
|  | 433       fa_width = 80 | 
|  | 434       for i,fusion in enumerate(fusions): | 
|  | 435         if len(fusion['transcripts']) > 0: | 
|  | 436           alignments1 = "%s%s%s" % (fusion['genomic_strand1'], fusion['gene_strand1'], fusion['gene_align_strand1']) | 
|  | 437           alignments2 = "%s%s%s" % (fusion['genomic_strand2'], fusion['gene_strand2'], fusion['gene_align_strand2']) | 
|  | 438           alignments = "%s%s%s %s%s%s" % (fusion['genomic_strand1'], fusion['gene_strand1'], fusion['gene_align_strand1'], fusion['genomic_strand2'], fusion['gene_strand2'], fusion['gene_align_strand2']) | 
|  | 439           fusion_id = "%s (%s) %s" % (i + 1,alignments,' '.join([str(fusion[x]) for x in report_fields])) | 
|  | 440           for tx_id in fusion['transcripts'].keys(): | 
|  | 441             m1 = fusion['transcripts'][tx_id]['match1'] | 
|  | 442             f_seq1 = fusion['split_seqs'][0][:-m1].lower() +  fusion['split_seqs'][0][-m1:] | 
|  | 443             t_seq1 = fusion['transcripts'][tx_id]['seq1'][:-m1].lower() + fusion['transcripts'][tx_id]['seq1'][-m1:] | 
|  | 444             if len(f_seq1) > len(t_seq1): | 
|  | 445               t_seq1 = t_seq1.rjust(len(f_seq1),'.') | 
|  | 446             elif len(f_seq1) < len(t_seq1): | 
|  | 447               f_seq1 = f_seq1.rjust(len(t_seq1),'.') | 
|  | 448             m2 = fusion['transcripts'][tx_id]['match2'] | 
|  | 449             f_seq2 = fusion['split_seqs'][1][:m2] +  fusion['split_seqs'][1][m2:].lower() | 
|  | 450             t_seq2 = fusion['transcripts'][tx_id]['seq2'][:m2] + fusion['transcripts'][tx_id]['seq2'][m2:].lower() | 
|  | 451             if len(f_seq2) > len(t_seq2): | 
|  | 452               t_seq2 = t_seq2.ljust(len(f_seq2),'.') | 
|  | 453             elif len(f_seq2) < len(t_seq2): | 
|  | 454               f_seq2 = f_seq2.ljust(len(t_seq2),'.') | 
|  | 455             print >> outputTxFile, ">%s\n%s\n%s" % (fusion_id,'\n'.join(textwrap.wrap(f_seq1,fa_width)),'\n'.join(textwrap.wrap(f_seq2,fa_width))) | 
|  | 456             print >> outputTxFile, "%s bkpt:%d rev_compl:%s\n%s\n%s" % (fusion['transcripts'][tx_id]['full_id'],fusion['transcripts'][tx_id]['bkpt'],str(fusion['transcripts'][tx_id]['revcompl']),'\n'.join(textwrap.wrap(t_seq1,fa_width)),'\n'.join(textwrap.wrap(t_seq2,fa_width))) | 
|  | 457   """ | 
|  | 458   if options.peptides and options.orf_alignment: | 
|  | 459     pass | 
|  | 460   """ | 
| 36 | 461   print >> outputFile,"%s\t%s" % ('#','\t'.join([report_colnames[x] for x in report_fields])) | 
|  | 462   for i,fusion in enumerate(fusions): | 
|  | 463     print >> outputFile,"%s\t%s" % (i + 1,'\t'.join([str(fusion[x]) for x in report_fields])) | 
|  | 464 | 
|  | 465 if __name__ == "__main__" : __main__() | 
|  | 466 |