annotate defuse_trinity_analysis.py @ 36:4353f776dfa3

Add defuse_trinity_analysis
author Jim Johnson <jj@umn.edu>
date Tue, 10 Feb 2015 19:35:52 -0600
parents
children 90127ee1eae5
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
36
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
1 #!/usr/bin/env python
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
2 """
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
3 #
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
4 #------------------------------------------------------------------------------
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
5 # University of Minnesota
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
6 # Copyright 2014, Regents of the University of Minnesota
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
7 #------------------------------------------------------------------------------
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
8 # Author:
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
9 #
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
10 # James E Johnson
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
11 #
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
12 #------------------------------------------------------------------------------
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
13 """
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
14
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
15
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
16 """
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
17 This tool takes the defuse results.tsv tab-delimited file, trinity
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
18 and creates a tabular report
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
19 """
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
20
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
21 import sys,re,os.path
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
22 import optparse
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
23 from optparse import OptionParser
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
24
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
25 revcompl = lambda x: ''.join([{'A':'T','C':'G','G':'C','T':'A','a':'t','c':'g','g':'c','t':'a','N':'N','n':'n'}[B] for B in x][::-1])
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
26
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
27 def read_fasta(fp):
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
28 name, seq = None, []
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
29 for line in fp:
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
30 line = line.rstrip()
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
31 if line.startswith(">"):
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
32 if name: yield (name, ''.join(seq))
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
33 name, seq = line, []
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
34 else:
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
35 seq.append(line)
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
36 if name: yield (name, ''.join(seq))
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
37
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
38
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
39 def test_rcomplement(seq, target):
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
40 try:
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
41 comp = revcompl(seq)
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
42 return comp in target
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
43 except:
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
44 pass
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
45 return False
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
46
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
47 def test_reverse(seq,target):
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
48 return options.test_reverse and seq and seq[::-1] in target
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
49
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
50 def cmp_alphanumeric(s1,s2):
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
51 if s1 == s2:
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
52 return 0
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
53 a1 = re.findall("\d+|[a-zA-Z]+",s1)
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
54 a2 = re.findall("\d+|[a-zA-Z]+",s2)
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
55 for i in range(min(len(a1),len(a2))):
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
56 if a1[i] == a2[i]:
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
57 continue
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
58 if a1[i].isdigit() and a2[i].isdigit():
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
59 return int(a1[i]) - int(a2[i])
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
60 return 1 if a1[i] > a2[i] else -1
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
61 return len(a1) - len(a2)
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
62
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
63
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
64 def parse_defuse_results(inputFile):
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
65 columns = []
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
66 defuse_results = []
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
67 # {cluster_id : { field : value})
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
68 try:
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
69 for linenum,line in enumerate(inputFile):
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
70 ## print >> sys.stderr, "%d: %s\n" % (linenum,line)
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
71 fields = line.strip().split('\t')
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
72 if line.startswith('cluster_id'):
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
73 columns = fields
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
74 ## print >> sys.stderr, "columns: %s\n" % columns
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
75 continue
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
76 cluster_dict = dict()
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
77 cluster_id = fields[columns.index('cluster_id')]
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
78 cluster_dict['cluster_id'] = fields[columns.index('cluster_id')]
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
79 cluster_dict['gene_chromosome1'] = fields[columns.index('gene_chromosome1')]
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
80 cluster_dict['gene_chromosome2'] = fields[columns.index('gene_chromosome2')]
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
81 cluster_dict['genomic_strand1'] = fields[columns.index('genomic_strand1')]
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
82 cluster_dict['genomic_strand2'] = fields[columns.index('genomic_strand2')]
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
83 cluster_dict['gene1'] = fields[columns.index('gene1')]
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
84 cluster_dict['gene2'] = fields[columns.index('gene2')]
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
85 cluster_dict['gene_name1'] = fields[columns.index('gene_name1')]
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
86 cluster_dict['gene_name2'] = fields[columns.index('gene_name2')]
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
87 cluster_dict['gene_location1'] = fields[columns.index('gene_location1')]
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
88 cluster_dict['gene_location2'] = fields[columns.index('gene_location2')]
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
89 cluster_dict['expression1'] = int(fields[columns.index('expression1')])
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
90 cluster_dict['expression2'] = int(fields[columns.index('expression2')])
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
91 cluster_dict['genomic_break_pos1'] = int(fields[columns.index('genomic_break_pos1')])
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
92 cluster_dict['genomic_break_pos2'] = int(fields[columns.index('genomic_break_pos2')])
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
93 cluster_dict['breakpoint_homology'] = int(fields[columns.index('breakpoint_homology')])
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
94 cluster_dict['orf'] = fields[columns.index('orf')] == 'Y'
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
95 cluster_dict['exonboundaries'] = fields[columns.index('exonboundaries')] == 'Y'
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
96 cluster_dict['read_through'] = fields[columns.index('read_through')] == 'Y'
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
97 cluster_dict['interchromosomal'] = fields[columns.index('interchromosomal')] == 'Y'
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
98 cluster_dict['adjacent'] = fields[columns.index('adjacent')] == 'Y'
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
99 cluster_dict['altsplice'] = fields[columns.index('altsplice')] == 'Y'
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
100 cluster_dict['deletion'] = fields[columns.index('deletion')] == 'Y'
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
101 cluster_dict['eversion'] = fields[columns.index('eversion')] == 'Y'
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
102 cluster_dict['inversion'] = fields[columns.index('inversion')] == 'Y'
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
103 cluster_dict['span_count'] = int(fields[columns.index('span_count')])
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
104 cluster_dict['splitr_count'] = int(fields[columns.index('splitr_count')])
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
105 cluster_dict['splice_score'] = int(fields[columns.index('splice_score')])
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
106 cluster_dict['probability'] = float(fields[columns.index('probability')] if columns.index('probability') else 'nan')
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
107 cluster_dict['splitr_sequence'] = fields[columns.index('splitr_sequence')]
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
108 defuse_results.append(cluster_dict)
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
109 except Exception, e:
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
110 print >> sys.stderr, "failed: %s" % e
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
111 sys.exit(1)
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
112 return defuse_results
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
113
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
114 ## deFuse params to the mapping application?
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
115
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
116 def __main__():
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
117 #Parse Command Line
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
118 parser = optparse.OptionParser()
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
119 # files
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
120 parser.add_option( '-i', '--input', dest='input', help='The input defuse results.tsv file (else read from stdin)' )
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
121 parser.add_option( '-t', '--transcripts', dest='transcripts', default=None, help='Trinity transcripts' )
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
122 parser.add_option( '-p', '--peptides', dest='peptides', default=None, help='Trinity ORFs' )
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
123 parser.add_option( '-o', '--output', dest='output', help='The output report (else write to stdout)' )
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
124 parser.add_option( '-a', '--transcript_alignment', dest='transcript_alignment', help='The output alignment file' )
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
125 parser.add_option( '-A', '--orf_alignment', dest='orf_alignment', help='The output alignment file' )
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
126 parser.add_option( '-N', '--nbases', dest='nbases', type='int', default=12, help='Number of bases on either side of the fusion to compare' )
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
127 parser.add_option( '-L', '--min_pep_len', dest='min_pep_len', type='int', default=100, help='Minimum length of peptide to report' )
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
128 parser.add_option( '-T', '--ticdist', dest='ticdist', type='int', default=1000000, help='Maximum intrachromosomal distance to be classified a Transcription-induced chimera (TIC)' )
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
129 parser.add_option( '-P', '--prior_aa', dest='prior_aa', type='int', default=11, help='Number of protein AAs to show preceeding fusion point' )
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
130 # min_orf_len
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
131 # split_na_len
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
132 # tic_len = 1000000
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
133 # prior
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
134 # deFuse direction reversed
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
135 # in frame ?
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
136 # contain known protein elements
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
137 # what protein change
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
138 # trinity provides full transctipt, defuse doesn't show full
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
139 #parser.add_option( '-r', '--reference', dest='reference', default=None, help='The genomic reference fasta' )
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
140 #parser.add_option( '-g', '--gtf', dest='gtf', default=None, help='The genomic reference gtf feature file')
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
141 (options, args) = parser.parse_args()
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
142
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
143 # results.tsv input
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
144 if options.input != None:
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
145 try:
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
146 inputPath = os.path.abspath(options.input)
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
147 inputFile = open(inputPath, 'r')
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
148 except Exception, e:
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
149 print >> sys.stderr, "failed: %s" % e
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
150 exit(2)
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
151 else:
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
152 inputFile = sys.stdin
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
153 # vcf output
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
154 if options.output != None:
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
155 try:
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
156 outputPath = os.path.abspath(options.output)
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
157 outputFile = open(outputPath, 'w')
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
158 except Exception, e:
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
159 print >> sys.stderr, "failed: %s" % e
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
160 exit(3)
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
161 else:
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
162 outputFile = sys.stdout
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
163
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
164 ## Read defuse results
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
165 fusions = parse_defuse_results(inputFile)
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
166 ## Create a field with the 12 nt before and after the fusion point.
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
167 ## Create a field with the reverse complement of the 24 nt fusion point field.
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
168 ## Add fusion type filed (INTER, INTRA, TIC)
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
169 for i,fusion in enumerate(fusions):
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
170 fusion['ordinal'] = i + 1
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
171 split_seqs = fusion['splitr_sequence'].split('|')
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
172 fusion['split_seqs'] = split_seqs
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
173 fwd_seq = split_seqs[0][-(min(abs(options.nbases),len(split_seqs[0]))):] + split_seqs[1][:min(abs(options.nbases),len(split_seqs[1]))]
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
174 rev_seq = revcompl(fwd_seq)
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
175 fusion['fwd_seq'] = fwd_seq
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
176 fusion['rev_seq'] = rev_seq
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
177 fusion_type = 'inter' if fusion['gene_chromosome1'] != fusion['gene_chromosome2'] else 'intra' if abs(fusion['genomic_break_pos1'] - fusion['genomic_break_pos2']) > options.ticdist else 'TIC'
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
178 fusion['fusion_type'] = fusion_type
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
179 fusion['transcripts'] = []
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
180 fusion['Transcript'] = 'No'
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
181 fusion['Protein'] = 'No'
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
182 print >> sys.stdout, "%4d\t%6s\t%s\t%s\t%s\t%s\t%s" % (i,fusion['cluster_id'],fwd_seq,rev_seq,fusion_type,fusion['gene_name1'],fusion['gene_name2'])
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
183 inputFile.close()
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
184
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
185 ## Process Trinity data and compare to deFuse
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
186 matched_transcripts = dict()
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
187 matched_orfs = dict()
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
188 fusions_with_transcripts = set()
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
189 fusions_with_orfs = set()
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
190 n = 0
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
191 if options.transcripts:
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
192 with open(options.transcripts) as fp:
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
193 for name, seq in read_fasta(fp):
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
194 n += 1
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
195 for i,fusion in enumerate(fusions):
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
196 if fusion['fwd_seq'] in seq or fusion['rev_seq'] in seq:
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
197 fusions_with_transcripts.add(i)
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
198 matched_transcripts[name] = seq
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
199 fusion['transcripts'].append(name)
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
200 fusion['Transcript'] = 'Yes'
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
201 print >> sys.stdout, "fusions_with_transcripts: %d %s\n matched_transcripts: %d" % (len(fusions_with_transcripts),fusions_with_transcripts,len(matched_transcripts))
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
202 for i,fusion in enumerate(fusions):
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
203 print >> sys.stdout, "%4d\t%6s\t%s\t%s\t%s\t%s\t%s\t%s" % (i,fusion['cluster_id'],fusion['fwd_seq'],fusion['rev_seq'],fusion['fusion_type'],fusion['gene_name1'],fusion['gene_name2'], fusion['transcripts'])
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
204 ## Process ORFs and compare to matched deFuse and Trinity data.
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
205 ## Proteins must be at least 100 aa long, starting at the first "M" and must end with an "*".
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
206 if options.peptides:
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
207 with open(options.peptides) as fp:
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
208 for name, seq in read_fasta(fp):
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
209 n += 1
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
210 if len(seq) < options.min_pep_len:
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
211 continue
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
212 for i,fusion in enumerate(fusions):
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
213 if len(fusion['transcripts']) > 0:
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
214 for id_string in fusion['transcripts']:
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
215 tx_id = id_string.lstrip('>').split()[0]
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
216 if tx_id in name:
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
217 pep_len = len(seq)
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
218 start = seq.find('M')
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
219 if pep_len - start < options.min_pep_len:
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
220 continue
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
221 fusions_with_orfs.add(i)
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
222 matched_orfs[name] = seq
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
223 fusion['Protein'] = 'Yes'
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
224 # fwd or reverse
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
225 tx_seq = matched_transcripts(tx_id)
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
226 pos = tx_seq.find(fusion['fwd_seq'])
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
227 if pos < 0:
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
228 pos = tx_seq.find(fusion['rev_seq'])
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
229 # locate fusion in transcript
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
230 # locate fusion in ORF
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
231 fusion['prior_pep_seq'] = ''
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
232 fusion['novel_pep_seq'] = ''
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
233 print >> sys.stdout, "fusions_with_orfs: %d %s\n matched_orfs: %d" % (len(fusions_with_orfs),fusions_with_orfs,len(matched_orfs))
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
234 ## Write reports
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
235 report_fields = ['gene_name1','gene_name2','span_count','probability','gene_chromosome1','gene_location1','gene_chromosome2','gene_location2','fusion_type','Transcript','Protein']
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
236 report_colnames = {'gene_name1':'Gene 1','gene_name2':'Gene 2','span_count':'Span cnt','probability':'Probability','gene_chromosome1':'From Chr','gene_location1':'Fusion point','gene_chromosome2':'To Chr','gene_location2':'Fusion point','fusion_type':'Type','Transcript':'Transcript?','Protein':'Protein?' }
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
237 print >> outputFile,"%s\t%s" % ('#','\t'.join([report_colnames[x] for x in report_fields]))
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
238 for i,fusion in enumerate(fusions):
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
239 print >> outputFile,"%s\t%s" % (i + 1,'\t'.join([str(fusion[x]) for x in report_fields]))
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
240 # print >> outputFile, "%d\t%s\t%s\t%d\t%f\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (i,fusion['gene_name1'],fusion['gene_name2'],fusion['span_count'],fusion['probability'],fusion['gene_chromosome1'],fusion['gene_location1'],fusion['gene_chromosome2'],fusion['gene_location2'],fusion['fusion_type'],fusion['Transcript'],fusion['Protein'])
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
241
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
242 if __name__ == "__main__" : __main__()
4353f776dfa3 Add defuse_trinity_analysis
Jim Johnson <jj@umn.edu>
parents:
diff changeset
243