Mercurial > repos > charles_s_test > seqsero2
comparison run_seqsero_batch_galaxy.py @ 0:6895de35a263 draft
planemo upload commit 844a891e4eaf732830043204ac636907eefb011d-dirty
author | charles_s_test |
---|---|
date | Thu, 19 Oct 2017 18:16:51 -0400 |
parents | |
children | 0d65b71ff8df |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:6895de35a263 |
---|---|
#!/usr/bin/python
'''
Galaxy wrapper that runs SeqSero on a batch of FASTQ datasets.

The script receives one command-line argument holding comma-separated dataset
paths, looks up each dataset's display name in the Galaxy database, runs
SeqSero per sequencing run and writes Seqsero_result.txt / Seqsero_result.html.
'''

# Printed before the imports below so that a job which dies on a missing
# module still leaves a trace of having started.
print('monkey wonders where he is')

import os

# os.system('source /nfs/sw/apps/galaxy-dev/galaxy/.venv/bin/activate')

import re
import sys
import time
import datetime
import subprocess
from subprocess import Popen, PIPE

import psycopg2
import sqlalchemy
from sqlalchemy import *
# NOTE: this rebinds the name `datetime` from the module (imported above) to
# the class, exactly as the original import order did.
from datetime import datetime

print('monkey found some files %s' % (sys.argv,))

# SECURITY: the connection URL below embeds a database password in source
# control.  Set SEQSERO_DB_URL in the job environment instead; the literal is
# kept only as a backward-compatible fallback and the credential should be
# rotated and removed from this file.
_LEGACY_DB_URL = 'postgresql+psycopg2://galaxy:cF$cl0udh9c@galaxyprod.cvkyaz9id4ml.us-east-1.rds.amazonaws.com:5432/galaxy'
engine = create_engine(os.environ.get('SEQSERO_DB_URL', _LEGACY_DB_URL))
print('monkey says "vroom vroom"')
connection = engine.connect()
print("monkey made a connection!")
20 | |
# database = '/nfs/sw/apps/galaxy-dev/galaxy/database/universe.sqlite'
# Installation-specific locations: the SeqSero entry point, the Galaxy file
# store and the scratch directory that receives renamed FASTQ copies.
seqsero = '/nfs/sw/apps/galaxy-dev/galaxy/tools/seqsero/SeqSero.py'
out_path = '/nfs/sw/apps/galaxy-dev/galaxy/database/files/000'
path2sample = '/nfs/sw/apps/galaxy-dev/galaxy/tools/seqsero/fastq_tmp'

# Debug logs; test_out also receives every line of SeqSero output later on.
test_out = open("/nfs/sw/apps/galaxy-dev/galaxy/tools/seqsero/test.txt", 'w')
test_out2 = open("/nfs/sw/apps/galaxy-dev/galaxy/tools/seqsero/test2.txt", 'w')

test_out.write("monkey ")
test_out.write("\t".join(sys.argv)+'\n')
test_out.write(str(len(sys.argv))+"\n")

# Everything after the script name; empty when no argument was supplied.
fq_list1 = sys.argv[1:]
if fq_list1:
    test_out.write("\t".join(sys.argv)+'\n')

test_out.write(str(len(sys.argv))+"\n")

if not fq_list1:
    # Previously this fell through to fq_list1[0] and died with an IndexError.
    test_out.close()
    test_out2.close()
    sys.exit('No input files given: expected one argument containing '
             'comma-separated fastq paths.')

# fq_list1[0] is a string with commas between the path/filenames.  Empty
# fields (doubled or trailing commas) are dropped instead of being treated as
# paths.
fastq_files = [path for path in fq_list1[0].split(",") if path]
42 | |
def print_time():
    '''
    Writes the current local time, formatted by time.asctime, to the
    module-level test_out log.  No newline is added after the timestamp.
    '''
    # asctime() without an argument formats the current local time.
    timestamp = time.asctime()
    test_out.write(timestamp)
45 | |
# Scratch directory that receives the renamed copies of the input files.
tmp_path = path2sample
if not os.path.isdir(tmp_path):
    # os.makedirs replaces the former shell `mkdir`, whose failure (missing
    # parent, permissions) was silently ignored.
    try:
        os.makedirs(tmp_path)
    except OSError:
        # Tolerate another job creating the directory between the check and
        # the call; any other failure is a real error and is re-raised.
        if not os.path.isdir(tmp_path):
            raise

test_out.write(str(len(fastq_files))+"\n")
51 | |
def list_runs(fastq_files):
    '''
    Creates dict with runs as keys and list with filenames as values.

    For every readable path in fastq_files:
      1. The run id is taken from the first line of the file: its first
         whitespace-delimited token, with '@' removed, cut at the first '.'.
      2. The dataset id is the text after the last '_' in the file name,
         minus a trailing '.dat'.  It is used to look up the dataset name in
         history_dataset_association through the module-level connection.
      3. The file is copied into tmp_path as '<name without .dat>_<dataset
         name>' unless a file of that name is already there.  A dataset name
         ending in 'fasta' is changed to end in 'fastq' when the first line
         starts with '@'.
      4. The copied path is recorded under its run id.

    Unreadable inputs are reported on stdout and skipped.

    Returns a dict mapping run id -> list of copied paths, without
    duplicates, in input order.
    '''
    # Function-scope imports keep this block self-contained; sqlalchemy is
    # already a dependency of this file.
    import shutil
    from sqlalchemy import text

    # Bound parameter instead of string concatenation: the id comes from a
    # file name and must not be spliced into the SQL text.
    name_query = text('SELECT name FROM history_dataset_association '
                      'WHERE dataset_id = :dataset_id')

    run2fastqs = {}
    for path in fastq_files:
        print('monkey found %s' % path)

        # Only the first line is needed; `with` closes the handle again.
        try:
            with open(path, 'r') as fastq:
                header = fastq.readline().rstrip("\n")
        except IOError:
            print("Data not found. It is possible for a deleted file to still be listed "
                  "in a Galaxy library. Please confirm that the data still exists on this "
                  "server. You may need to upload it again.")
            continue

        # Decided per file; previously one non-fastq input marked every later
        # file as non-fastq too.
        is_fastq = header.startswith('@')
        run = re.split(r"\s", header, 1)[0]
        run = run.replace('@', '')
        run = run.split('.')[0]

        file1 = os.path.basename(path)
        dataset_id = file1.split('_')[-1]
        if dataset_id.endswith('.dat'):
            dataset_id = dataset_id[:-len('.dat')]
        print('%s %s' % (dataset_id, type(dataset_id)))

        result = connection.execute(name_query, {'dataset_id': dataset_id})
        original_filename = ''
        for row in result:
            print('monkey says the original_filename is something like %s' % row[0])
            original_filename = row[0]

        if re.search(r'fasta$', original_filename):
            if is_fastq:
                # Only the trailing 'fasta' is rewritten, not every occurrence.
                original_filename = re.sub(r'fasta$', 'fastq', original_filename)
            else:
                print('The input file is not a fastq file.')

        # Plain suffix handling: the dataset name is inserted literally, so
        # backslashes in it cannot be read as regex replacement escapes.
        if file1.endswith('.dat'):
            file2 = file1[:-len('.dat')] + '_' + original_filename
        else:
            file2 = file1
        print('monkey renamed the file %s' % file2)

        new_path_file = os.path.join(tmp_path, file2)
        if not os.path.exists(new_path_file):
            print('%s %s' % (path, new_path_file))
            # shutil.copy instead of a shell `cp`, so names containing spaces
            # or shell metacharacters are copied correctly.
            try:
                shutil.copy(path, new_path_file)
            except (IOError, OSError) as err:
                print('Could not copy %s to %s: %s' % (path, new_path_file, err))
                continue

        # The duplicate check now compares the staged path that is actually
        # stored; it used to compare the source path and never matched.
        staged = run2fastqs.setdefault(run, [])
        if new_path_file not in staged:
            staged.append(new_path_file)
    return run2fastqs
105 | |
def run_seqsero(run2fastqs):
    '''
    Takes files from run2fastqs and runs SeqSero.

    run2fastqs maps a run id to its list of fastq paths.  A run with two
    files is passed to SeqSero with '-m 2', a run with one file with '-m 1'.
    Runs with any other number of files are reported on stdout and skipped;
    they used to reach Popen with an empty command and crash.

    Returns a list with one Popen.communicate() result per executed run,
    i.e. (stdout, stderr) tuples.  Only stdout is piped, so stderr is None.
    '''
    outputs = []
    for run in run2fastqs:
        fastqs = run2fastqs[run]
        print('%s %s' % (run, fastqs))
        if len(fastqs) == 2:
            mode = '2'
        elif len(fastqs) == 1:
            mode = '1'
        else:
            print('Skipping run %s: expected 1 or 2 fastq files, found %d.'
                  % (run, len(fastqs)))
            continue
        seqsero_cmd = ['python', seqsero, '-m', mode, '-i'] + list(fastqs)
        print(seqsero_cmd)
        p = Popen(seqsero_cmd, stdout=PIPE)
        outputs.append(p.communicate())
    return outputs
123 | |
def get_serotypes(outputs, log=None):
    '''
    Parses raw SeqSero output into per-sample predictions and comments.

    outputs is a list of samples; each sample is an iterable of text chunks
    (as returned by Popen.communicate()).  None chunks, such as the unpiped
    stderr slot, are skipped.  Every output line is echoed to log, which
    defaults to the module-level test_out.

    Lines are split on tabs.  'Input files:' starts a sample and its value
    becomes the dict key; the five prediction labels fill slots 0-4:
        0  O antigen prediction:
        1  H1 antigen prediction(fliC):
        2  H2 antigen prediction(fljB):
        3  Predicted antigenic profile:
        4  Predicted serotype(s):
    A label without a tab-separated value stores '' instead of raising, and
    a prediction seen before any 'Input files:' line is ignored.

    Any other line longer than 7 characters is kept as a comment for the
    current sample, provided the sample key contains a word character.

    Returns (fastq2serotype, fastq2comment): key -> list of five strings,
    and key -> list of comment lines.
    '''
    if log is None:
        log = test_out

    slot_for_label = {
        'O antigen prediction:': 0,
        'H1 antigen prediction(fliC):': 1,
        'H2 antigen prediction(fljB):': 2,
        'Predicted antigenic profile:': 3,
        'Predicted serotype(s):': 4,
    }

    fastq2comment = {}
    fastq2serotype = {}
    for sample in outputs:
        fastqs = ''
        lines_used = set()
        for chunk in sample:  # chunk is actually the entire seqsero output.
            if chunk is None:
                continue
            for element in str(chunk).split("\n"):  # a line of seqsero output.
                log.write(element+"\n")
                fields = element.split("\t")
                label = fields[0]
                value = fields[1] if len(fields) > 1 else ''
                if label == 'Input files:':
                    fastqs = value
                    lines_used.add(element)
                    if fastqs not in fastq2serotype:
                        fastq2serotype[fastqs] = ['']*5
                        fastq2comment[fastqs] = []
                elif label in slot_for_label and fastqs in fastq2serotype:
                    lines_used.add(element)
                    fastq2serotype[fastqs][slot_for_label[label]] = value
                if (element not in lines_used and re.search(r"\w", fastqs)
                        and len(element) > 7):
                    fastq2comment[fastqs].append(element)
    return fastq2serotype, fastq2comment
166 | |
def print_html(fastq2serotype, fastq2comment):
    '''
    Takes dict and prints to html file.

    Writes two files into the current working directory and echoes the
    tabular content to stdout:

    Seqsero_result.txt
        Tab-delimited.  A header line (spaces replaced by underscores), one
        line per key of fastq2serotype, then for every key of fastq2comment
        a blank line, the key and its comment lines.
    Seqsero_result.html
        One table with the same header and rows, followed by one paragraph
        per commented sample (key, then one comment per line).  Values are
        HTML-escaped.

    fastq2serotype maps a sample key to its list of prediction strings;
    fastq2comment maps a sample key to its list of comment lines.
    '''
    def escape(value):
        # '&' first so the entities produced for '<' and '>' are not
        # escaped a second time.
        return value.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')

    header_l = ['Input Files', 'O antigen prediction', 'H1 antigen prediction(fliC)', 'H2 antigen prediction(fljB)', 'Predicted antigenic profile', 'Predicted serotype(s)']
    header = "\t".join(header_l).replace(' ', '_')

    # `with` closes both files, including the text file that was previously
    # left open, even if writing fails part-way.
    with open('Seqsero_result.txt', 'w') as tab_out, \
            open('Seqsero_result.html', 'w') as html_out:
        html_out.write('<!DOCTYPE html>\n')
        html_out.write('<html>\n')
        html_out.write('<head>\n')
        html_out.write('<title>SeqSero Results</title>\n')
        html_out.write('</head>\n')
        # A single <body>; the page used to open two.
        html_out.write('<body style="font-family:Helvetica;">\n')
        html_out.write('<table border=1>\n')
        html_out.write('<tr>\n')
        for element in header_l:
            html_out.write('<td>'+element+'</td>\n')
        html_out.write('</tr>\n')
        tab_out.write(header+"\n")
        print("\n\n %s" % header)

        for fastq in fastq2serotype:
            line_to_print = fastq+'\t'+"\t".join(fastq2serotype[fastq])
            tab_out.write(line_to_print+"\n")
            html_out.write('<tr>\n')
            html_out.write('<td>'+escape(fastq)+'</td>\n')
            for antigen in fastq2serotype[fastq]:
                html_out.write('<td>'+escape(antigen)+'</td>\n')
            html_out.write('</tr>\n')
            print(line_to_print)
        html_out.write('</table>\n')
        print("\n")

        # Comments follow the table as paragraphs.  They used to be written
        # as <tr>/<td> after </table>, which is not valid markup.
        for fastq in fastq2comment:
            tab_out.write("\n"+fastq+"\n")
            html_out.write('<p>\n')
            html_out.write(escape(fastq)+'<br>\n')
            for line in fastq2comment[fastq]:
                html_out.write(escape(line)+'<br>\n')
                tab_out.write(line+"\n")
                print(line)
            print("\n")
            html_out.write('</p>\n')
        html_out.write('</body>\n')
        html_out.write('</html>\n')
219 | |
# Pipeline: stage the inputs per run, run SeqSero on each run, parse its
# output and write the reports.  The finally clause releases the debug logs
# and the database connection, which were previously never closed.
try:
    run2fastqs = list_runs(fastq_files)
    #print run2fastqs
    outputs = run_seqsero(run2fastqs)
    fastq2serotype, fastq2comment = get_serotypes(outputs)
    print_html(fastq2serotype, fastq2comment)
    print_time()
finally:
    test_out.close()
    test_out2.close()
    connection.close()
226 | |
227 |