annotate cuffcompare_wrapper.py @ 2:5ac3ca7ad18b draft default tip

Updated Test Data
author geert-vandeweyer
date Sat, 02 Aug 2014 10:24:42 -0400
parents 15437246f588
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
1 #!/usr/bin/env python
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
2
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
3 # Supports Cuffcompare versions v1.3.0 and newer.
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
4
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
5 import optparse, os, shutil, subprocess, sys, tempfile
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
6
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
7 def stop_err( msg ):
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
8 sys.stderr.write( '%s\n' % msg )
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
9 sys.exit()
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
10
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
11 def __main__():
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
12 #Parse Command Line
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
13 parser = optparse.OptionParser()
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
14 parser.add_option( '-r', dest='ref_annotation', help='An optional "reference" annotation GTF. Each sample is matched against this file, and sample isoforms are tagged as overlapping, matching, or novel where appropriate. See the refmap and tmap output file descriptions below.' )
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
15 parser.add_option( '-R', action="store_true", dest='ignore_nonoverlap_reference', help='If -r was specified, this option causes cuffcompare to ignore reference transcripts that are not overlapped by any transcript in one of cuff1.gtf,...,cuffN.gtf. Useful for ignoring annotated transcripts that are not present in your RNA-Seq samples and thus adjusting the "sensitivity" calculation in the accuracy report written in the transcripts accuracy file' )
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
16 parser.add_option( '-Q', action="store_true", dest='ignore_nonoverlap_transfrag', help='If -r was specified, this option causes cuffcompare to consider only the input transcripts that overlap any of the reference transcripts (Sp correction); Warning: this will discard all "novel" loci!)' )
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
17
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
18 parser.add_option( '-s', dest='use_seq_data', action="store_true", help='Causes cuffcompare to look into for fasta files with the underlying genomic sequences (one file per contig) against which your reads were aligned for some optional classification functions. For example, Cufflinks transcripts consisting mostly of lower-case bases are classified as repeats. Note that <seq_dir> must contain one fasta file per reference chromosome, and each file must be named after the chromosome, and have a .fa or .fasta extension.')
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
19
1
15437246f588 corrected boolean argument handling in wrapper.py
geert-vandeweyer
parents: 0
diff changeset
20 parser.add_option( '-M', action="store_true", dest='discard_single_exon_all', help='discard (ignore) single-exon transfrags and reference transcript')
15437246f588 corrected boolean argument handling in wrapper.py
geert-vandeweyer
parents: 0
diff changeset
21 parser.add_option( '-N', action="store_true", dest='discard_single_exon_ref', help='discard (ignore) single-exon reference transcripts')
0
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
22 parser.add_option( '-e', dest='max_dist_exon', help='Max. Distance for assessing exon accuracy" help="max. distance (range) allowed from free ends of terminal exons of reference transcripts when assessing exon accuracy. Default: 100')
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
23 parser.add_option( '-d', dest='max_dist_group', help='Max.Distance for transcript grouping" help="max. distance (range) for grouping transcript start sites. Default: 100')
1
15437246f588 corrected boolean argument handling in wrapper.py
geert-vandeweyer
parents: 0
diff changeset
24 parser.add_option( '-F', action="store_true", dest='discard_redundant_intron_transfrags', help='Discard intron-redundant transfrags if they share the 5-prime end (if they differ only at the 3-prime end)')
0
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
25
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
26 # Wrapper / Galaxy options.
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
27 parser.add_option( '', '--index', dest='index', help='The path of the reference genome' )
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
28 parser.add_option( '', '--ref_file', dest='ref_file', help='The reference dataset from the history' )
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
29
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
30 # Outputs.
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
31 parser.add_option( '', '--combined-transcripts', dest='combined_transcripts' )
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
32
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
33 (options, args) = parser.parse_args()
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
34
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
35 # output version # of tool
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
36 try:
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
37 tmp = tempfile.NamedTemporaryFile().name
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
38 tmp_stdout = open( tmp, 'wb' )
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
39 proc = subprocess.Popen( args='cuffcompare 2>&1', shell=True, stdout=tmp_stdout )
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
40 tmp_stdout.close()
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
41 returncode = proc.wait()
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
42 stdout = None
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
43 for line in open( tmp_stdout.name, 'rb' ):
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
44 if line.lower().find( 'cuffcompare v' ) >= 0:
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
45 stdout = line.strip()
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
46 break
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
47 if stdout:
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
48 sys.stdout.write( '%s\n' % stdout )
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
49 else:
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
50 raise Exception
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
51 except:
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
52 sys.stdout.write( 'Could not determine Cuffcompare version\n' )
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
53
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
54 # Set/link to sequence file.
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
55 if options.use_seq_data:
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
56 if options.ref_file:
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
57 # Sequence data from history.
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
58 # Create symbolic link to ref_file so that index will be created in working directory.
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
59 seq_path = "ref.fa"
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
60 os.symlink( options.ref_file, seq_path )
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
61 else:
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
62 if not os.path.exists( options.index ):
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
63 stop_err( 'Reference genome %s not present, request it by reporting this error.' % options.index )
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
64 seq_path = options.index
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
65
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
66 # Build command.
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
67
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
68 # Base.
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
69 cmd = "cuffcompare -o cc_output "
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
70
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
71 # Add options.
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
72 if options.ref_annotation:
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
73 cmd += " -r %s " % options.ref_annotation
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
74 if options.ignore_nonoverlap_reference:
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
75 cmd += " -R "
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
76 if options.ignore_nonoverlap_transfrag:
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
77 cmd += " -Q "
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
78 if options.use_seq_data:
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
79 cmd += " -s %s " % seq_path
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
80 if options.discard_single_exon_all:
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
81 cmd += " -M "
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
82 if options.discard_single_exon_ref:
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
83 cmd += " -N "
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
84 if options.max_dist_exon:
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
85 cmd += " -e %i " % int( options.max_dist_exon )
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
86 if options.max_dist_group:
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
87 cmd += " -d %i " % int( options.max_dist_group )
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
88 if options.discard_redundant_intron_transfrags:
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
89 cmd += " -F "
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
90 # Add input files.
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
91
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
92 # Need to symlink inputs so that output files are written to temp directory.
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
93 for i, arg in enumerate( args ):
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
94 input_file_name = "./input%i" % ( i+1 )
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
95 os.symlink( arg, input_file_name )
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
96 cmd += "%s " % input_file_name
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
97
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
98 # Debugging.
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
99 print cmd
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
100
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
101 # Run command.
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
102 try:
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
103 tmp_name = tempfile.NamedTemporaryFile( dir="." ).name
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
104 tmp_stderr = open( tmp_name, 'wb' )
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
105 proc = subprocess.Popen( args=cmd, shell=True, stderr=tmp_stderr.fileno() )
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
106 returncode = proc.wait()
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
107 tmp_stderr.close()
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
108
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
109 # Get stderr, allowing for case where it's very large.
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
110 tmp_stderr = open( tmp_name, 'rb' )
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
111 stderr = ''
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
112 buffsize = 1048576
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
113 try:
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
114 while True:
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
115 stderr += tmp_stderr.read( buffsize )
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
116 if not stderr or len( stderr ) % buffsize != 0:
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
117 break
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
118 except OverflowError:
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
119 pass
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
120 tmp_stderr.close()
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
121
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
122 # Error checking.
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
123 if returncode != 0:
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
124 raise Exception, stderr
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
125
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
126 # Copy outputs.
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
127 shutil.copyfile( "cc_output.combined.gtf" , options.combined_transcripts )
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
128
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
129 # check that there are results in the output file
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
130 cc_output_fname = "cc_output.stats"
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
131 if len( open( cc_output_fname, 'rb' ).read().strip() ) == 0:
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
132 raise Exception, 'The main output file is empty, there may be an error with your input file or settings.'
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
133 except Exception, e:
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
134 stop_err( 'Error running cuffcompare. ' + str( e ) )
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
135
d4455014134b Uploaded
geert-vandeweyer
parents:
diff changeset
136 if __name__=="__main__": __main__()