comparison FastQC/rgFastQC.py @ 0:57f890a5fa73 draft

Uploaded
author fubar
date Tue, 04 Jun 2013 00:01:51 -0400
parents
children 71899f689406
comparison
equal deleted inserted replaced
-1:000000000000 0:57f890a5fa73
1 """
2 # May 2013 ross added check for bogus gz extension - fastqc gets confused
3 # added sanitizer for user supplied name
4 # removed shell and make cl a sequence for Popen call
5 # ross lazarus August 10 2012 in response to anon insecurity report
6 wrapper for fastqc
7
8 called as
9 <command interpreter="python">
10 rgFastqc.py -i $input_file -d $html_file.files_path -o $html_file -n "$out_prefix"
11 </command>
12
13
14
15 Current release seems overly intolerant of sam/bam header strangeness
16 Author notified...
17
18
19 """
20 import re
21 import os
22 import sys
23 import subprocess
24 import optparse
25 import shutil
26 import tempfile
27 import zipfile
28 import gzip
29
class FastQC(object):
    """Run FastQC on one input file and build the HTML report Galaxy shows.

    The options object given to the constructor is read for these attributes:
    input, inputfilename, outputdir, informat, contaminants and executable.
    """

    def __init__(self, opts=None):
        """Remember the parsed command line options.

        opts -- options object (e.g. from optparse); must not be None.
        Raises AssertionError when opts is None.
        """
        assert opts is not None, 'FastQC needs an options object'
        self.opts = opts

    @staticmethod
    def getFileString(fpath, outpath):
        """Return fpath followed by a human readable size, e.g. 'x.png  (1.2 KB)'.

        fpath   -- file name relative to outpath
        outpath -- directory the file is expected to live in

        When outpath/fpath is not an existing file, fpath is returned unchanged.
        This is a staticmethod so it can be called both as
        FastQC.getFileString(...) and as self.getFileString(...).
        """
        size = ''
        fp = os.path.join(outpath, fpath)
        s = fpath
        if os.path.isfile(fp):
            n = float(os.path.getsize(fp))
            if n > 2**20:
                size = ' (%1.1f MB)' % (n / 2**20)
            elif n > 2**10:
                size = ' (%1.1f KB)' % (n / 2**10)
            elif n > 0:
                size = ' (%d B)' % (int(n))
            s = '%s %s' % (fpath, size)
        return s

    def _extension_is_bogus(self, infname):
        """Return True if infname claims a compression the input file lacks.

        infname is only inspected for its extension; the bytes that are
        probed come from self.opts.input. A first line is read through the
        matching decompressor, and any failure to do so means the extension
        is bogus. Zip files are checked with zipfile.is_zipfile instead.
        """
        # Imported here so the module's import block does not need to change.
        import bz2
        linf = infname.lower()
        if linf.endswith('.gz') or linf.endswith('.gzip'):
            opener = gzip.open
        elif linf.endswith('bz2'):
            # BZ2File exists on both Python 2 and 3; bz2.open is Python 3 only.
            opener = bz2.BZ2File
        elif linf.endswith('.zip'):
            return not zipfile.is_zipfile(self.opts.input)
        else:
            return False
        f = opener(self.opts.input, 'rb')
        try:
            f.readline()
        except Exception:
            # The decompressors raise different exception types depending on
            # the Python version; any failure to read means "not compressed".
            return True
        finally:
            f.close()
        return False

    def _find_fastqc_dir(self):
        """Return the path of a '*_fastqc' directory in outputdir, or None.

        When several directories match, the last one returned by os.listdir
        wins.
        """
        odpath = None
        for f in os.listdir(self.opts.outputdir):
            d = os.path.join(self.opts.outputdir, f)
            if os.path.isdir(d) and d.endswith('_fastqc'):
                odpath = d
        return odpath

    def run_fastqc(self):
        """Run the FastQC executable and return (html, retval, serr).

        In batch mode fastqc behaves not very nicely - it writes to a new
        folder called [infilebasename]_fastqc, holding fastqc_report.html,
        fastqc_data.txt, summary.txt and a set of png images.

        Returns a 3-tuple:
          html   -- list of lines for the Galaxy HTML output; on failure this
                    is an explanatory page that embeds the run log
          retval -- exit status of the FastQC process, or 1 when no report
                    could be read
          serr   -- the run log as one string on failure, '' otherwise

        Exceptions from creating the symlink or starting the process are not
        caught here.
        """
        serr = ''
        outdir = self.opts.outputdir
        cl = [self.opts.executable, '--outdir=%s' % outdir]
        if self.opts.informat in ['sam', 'bam']:
            cl.append('--f=%s' % self.opts.informat)
        if self.opts.contaminants is not None:
            cl.append('--contaminants=%s' % self.opts.contaminants)
        # patch suggested by bwlang https://bitbucket.org/galaxy/galaxy-central/pull-request/30
        # use a symlink in a temporary directory so that the FastQC report
        # reflects the history input file name.
        # note this exposes a bug in the EBI_SRA download tool which leaves
        # bogus .gz extensions on uncompressed files which fastqc helpfully
        # tries to uncompress again - so bogus extensions are trimmed here.
        # Fall back to the input path when no display name was supplied.
        infname = self.opts.inputfilename or self.opts.input
        if self._extension_is_bogus(infname):
            infname = os.path.splitext(infname)[0]
        # sanitise the user supplied name before it is used as a file name
        fastqinfilename = re.sub(r'[^a-zA-Z0-9_\-\.]', '_', os.path.basename(infname))
        link_name = os.path.join(outdir, fastqinfilename)
        cl.append(link_name)
        fd, tlog = tempfile.mkstemp(prefix='rgFastQC', suffix=".log", dir=outdir)
        # Wrap the descriptor mkstemp returned instead of opening the path a
        # second time, so that descriptor is not leaked.
        sout = os.fdopen(fd, 'w')
        try:
            os.symlink(self.opts.input, link_name)
            try:
                sout.write('# FastQC cl = %s\n' % ' '.join(cl))
                sout.flush()
                p = subprocess.Popen(cl, shell=False, stderr=sout, stdout=sout, cwd=outdir)
                retval = p.wait()
            finally:
                # remove the link even if the process could not be started
                os.unlink(link_name)
        finally:
            sout.close()
        with open(tlog, 'r') as logfile:
            runlog = logfile.readlines()
        # fastqc plays games with its output directory name. eesh
        odpath = self._find_fastqc_dir()
        hpath = None
        rep = None
        if odpath is not None:
            hpath = os.path.join(odpath, 'fastqc_report.html')
            try:
                # lines for our new html file; our additions are spliced in later
                with open(hpath, 'r') as report:
                    rep = report.readlines()
            except (IOError, OSError):
                rep = None
        if rep is None:
            # Covers both "no output directory" and "report could not be read".
            serr = '\n'.join(runlog)
            res = ['## odpath=%s: No output found in %s. Output for the run was:<pre>\n' % (odpath, hpath)]
            res += runlog
            res += ['</pre>\n',
                    'Please read the above for clues<br/>\n',
                    'If you selected a sam/bam format file, it might not have headers or they may not start with @HD?<br/>\n',
                    'It is also possible that the log shows that fastqc is not installed?<br/>\n',
                    'If that is the case, please tell the relevant Galaxy administrator that it can be snarfed from<br/>\n',
                    'http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/<br/>\n']
            return res, 1, serr
        self.fix_fastqcimages(odpath)
        excludefiles = ['tick.png', 'warning.png', 'fastqc_icon.png', 'error.png']
        # listed after fix_fastqcimages so the moved files are included
        flist = [x for x in os.listdir(outdir) if x not in excludefiles]
        # need to fix links to Icons and Images subdirectories in latest fastqc code - ugh
        rep = [line.replace('Icons/', '').replace('Images/', '') for line in rep]
        html = self.fix_fastqc(rep, flist, runlog)
        return html, retval, serr

    def fix_fastqc(self, rep=None, flist=None, runlog=None):
        """Add a table of the files FastQC created to the report lines.

        rep    -- list of lines of the FastQC html report
        flist  -- names of files in the output directory; not modified
        runlog -- accepted for compatibility, currently unused

        Returns a new list: rep without its second to last line, followed by
        the file table, attribution text and finally that removed line.
        """
        rep = [] if rep is None else list(rep)
        names = sorted(flist) if flist else []
        if len(rep) >= 2:
            # hope they don't change this: the second to last line is taken
            # to be the footer and is re-added after our additions
            footer = rep[-2]
            body = rep[:-2] + rep[-1:]
        else:
            footer = None
            body = rep
        res = ['<div class="module"><h2>Files created by FastQC</h2><table cellspacing="2" cellpadding="2">\n']
        for f in names:
            # names are relative to the output directory, not to the cwd
            if not os.path.isdir(os.path.join(self.opts.outputdir, f)):
                fn = os.path.split(f)[-1]
                res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn, self.getFileString(fn, self.opts.outputdir)))
        res.append('</table>\n')
        res.append('<a href="http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/">FastQC documentation and full attribution is here</a><br/><hr/>\n')
        res.append('FastQC was run by Galaxy using the rgenetics rgFastQC wrapper - see http://rgenetics.org for details and licensing\n</div>')
        if footer is not None:
            res.append(footer)
        # NOTE(review): the additions land after the last report line, as they
        # always have; presumably that line holds the closing tags - confirm
        # whether inserting before it was intended.
        return body + res  # with our additions

    def fix_fastqcimages(self, odpath):
        """Move FastQC's files up into outputdir and remove its directories.

        Galaxy wants everything in the same files_dir, so every regular file
        in odpath/Icons, odpath/Images and odpath itself is moved to
        self.opts.outputdir and each of those directories is then removed.
        os.rmdir raises OSError if a directory is not empty afterwards.
        """
        icpath = os.path.join(odpath, 'Icons')
        impath = os.path.join(odpath, 'Images')
        # odpath comes last so its two subdirectories are already gone
        for adir in [icpath, impath, odpath]:
            if os.path.exists(adir):
                for f in os.listdir(adir):  # get all files created
                    sauce = os.path.join(adir, f)
                    if not os.path.isdir(sauce):
                        dest = os.path.join(self.opts.outputdir, f)
                        shutil.move(sauce, dest)
                os.rmdir(adir)
186
187
188
if __name__ == '__main__':
    # Command line entry point: parse the options, run FastQC and write the
    # resulting HTML page to the path given with -o.
    op = optparse.OptionParser()
    op.add_option('-i', '--input', default=None)
    op.add_option('-j', '--inputfilename', default=None)
    op.add_option('-o', '--htmloutput', default=None)
    op.add_option('-d', '--outputdir', default="/tmp/shortread")
    op.add_option('-f', '--informat', default='fastq')
    op.add_option('-n', '--namejob', default='rgFastQC')
    op.add_option('-c', '--contaminants', default=None)
    op.add_option('-e', '--executable', default='fastqc')
    opts, args = op.parse_args()
    # Validate before doing any work. sys.exit(message) writes the message to
    # stderr and exits with status 1 and, unlike assert, still runs under -O.
    if opts.input is None:
        sys.exit('##rgFastQC.py error - no input file supplied (-i)')
    if opts.htmloutput is None:
        sys.exit('##rgFastQC.py error - no html output path supplied (-o)')
    if not os.path.isfile(opts.executable):
        sys.exit('##rgFastQC.py error - cannot find executable %s' % opts.executable)
    if not os.path.exists(opts.outputdir):
        os.makedirs(opts.outputdir)
    runner = FastQC(opts)
    html, retval, serr = runner.run_fastqc()
    # the with block closes the file even if the write fails
    with open(opts.htmloutput, 'w') as outf:
        outf.write(''.join(html))
    if retval != 0:
        sys.stderr.write('%s\n' % serr)  # indicate failure
211
212
213