Mercurial > repos > charles_s_test > seqsero2
comparison run_seqsero_batch_galaxy.py @ 0:6895de35a263 draft
planemo upload commit 844a891e4eaf732830043204ac636907eefb011d-dirty
| author | charles_s_test |
|---|---|
| date | Thu, 19 Oct 2017 18:16:51 -0400 |
| parents | |
| children | 0d65b71ff8df |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:6895de35a263 |
|---|---|
| 1 #!/usr/bin/python | |
| 2 | |
| 3 print 'monkey wonders where he is' | |
| 4 | |
| 5 import os | |
| 6 | |
| 7 # os.system('source /nfs/sw/apps/galaxy-dev/galaxy/.venv/bin/activate') | |
| 8 | |
| 9 import re, sys, time, datetime | |
| 10 import subprocess, psycopg2, sqlalchemy | |
| 11 from subprocess import Popen, PIPE | |
| 12 from sqlalchemy import * | |
| 13 from datetime import datetime | |
| 14 | |
# Debug trace of the command-line arguments the tool was started with.
print('monkey found some files %s' % (sys.argv,))

# The connection URL embeds the database password, so it is read from the
# environment instead of being committed with the tool.
# NOTE: any credential that was previously stored in this file should be
# treated as exposed and rotated.
database_url = os.environ.get('GALAXY_CONFIG_DATABASE_CONNECTION')
if not database_url:
    sys.exit('GALAXY_CONFIG_DATABASE_CONNECTION is not set; '
             'cannot connect to the Galaxy database.')
engine = create_engine(database_url)
print('monkey says "vroom vroom"')
# Module-level connection shared by the functions below.
connection = engine.connect()
print("monkey made a connection!")
| 20 | |
| 21 # database = '/nfs/sw/apps/galaxy-dev/galaxy/database/universe.sqlite' | |
# Location of the SeqSero entry point that is run once per sequencing run.
seqsero = '/nfs/sw/apps/galaxy-dev/galaxy/tools/seqsero/SeqSero.py'
# Debug log; the functions below append to it through this module-level handle.
test_out = open("/nfs/sw/apps/galaxy-dev/galaxy/tools/seqsero/test.txt", 'w')
out_path = '/nfs/sw/apps/galaxy-dev/galaxy/database/files/000'
# Second debug file; opening it truncates any previous content.
test_out2 = open("/nfs/sw/apps/galaxy-dev/galaxy/tools/seqsero/test2.txt", 'w')

# Directory the input files are copied into under their original names.
path2sample = '/nfs/sw/apps/galaxy-dev/galaxy/tools/seqsero/fastq_tmp'

test_out.write("monkey ")
test_out.write("\t".join(sys.argv)+'\n')

fq_list1 = []

test_out.write(str(len(sys.argv))+"\n")
if len(sys.argv) >= 2:
    test_out.write("\t".join(sys.argv)+'\n')
    fq_list1 = sys.argv[1:]

test_out.write(str(len(sys.argv))+"\n")

# Without an argument there is nothing to split; stop with a usage message
# instead of failing with an IndexError on fq_list1[0].
if not fq_list1:
    test_out.close()
    test_out2.close()
    sys.exit('usage: run_seqsero_batch_galaxy.py FILE1[,FILE2,...]')

# fq_list1[0] is a string with commas between the path/filenames.
fastq_files = fq_list1[0].split(",")
| 42 | |
def print_time():
    '''
    Writes the current local time (time.asctime format, no trailing newline)
    to the test_out debug log.
    '''
    now = time.localtime(time.time())
    stamp = time.asctime(now)
    test_out.write(stamp)
| 45 | |
# Staging directory that receives the renamed copies of the input files.
tmp_path = path2sample
if not os.path.exists(tmp_path):
    # os.makedirs needs no shell (safe for any path) and also creates
    # missing parent directories.
    os.makedirs(tmp_path)

# Log how many input files were passed on the command line.
test_out.write(str(len(fastq_files))+"\n")
| 51 | |
def _read_first_line(path):
    '''
    Returns the first line of path without its trailing newline, or '' when
    the file is empty. Raises IOError when the file cannot be opened.
    '''
    with open(path, 'r') as handle:
        for line in handle:
            return line.rstrip("\n")
    return ''


def _lookup_original_name(dataset_id):
    '''
    Returns the name recorded in history_dataset_association for dataset_id.
    When several rows match the last one wins; '' is returned when none does.
    Raises ValueError when dataset_id is not an integer.
    '''
    from sqlalchemy import text
    # Bound parameter instead of string concatenation: the id is parsed from
    # a file name and must never be spliced into the SQL text.
    query = text('SELECT name FROM history_dataset_association '
                 'WHERE dataset_id = :dataset_id')
    original_filename = ''
    for row in connection.execute(query, {'dataset_id': int(dataset_id)}):
        print('monkey says the original_filename is something like %s' % (row[0],))
        original_filename = row[0]
    return original_filename


def list_runs(fastq_files):
    '''
    Creates dict with runs as keys and list with filenames as values.

    For every path in fastq_files the run id is taken from the first line of
    the file (leading token, '@' removed, text before the first '.'). The
    file is copied into tmp_path under "<basename without .dat>_<original
    name>", where the original name is looked up in the Galaxy database by
    the dataset id found in the file name. The values of the returned dict
    are the paths of those copies.

    Files that cannot be opened or copied are reported on stdout and skipped.
    '''
    import shutil

    run2fastqs = {}
    for path in fastq_files:
        print('monkey found %s' % (path,))
        try:
            header = _read_first_line(path)
        except IOError:
            print("Data not found. It is possible for a deleted file to still be listed "
                  "in a Galaxy library. Please confirm that the data still exists on this "
                  "server. You may need to upload it again.")
            continue

        run = re.split(r"\s", header)[0]
        run = re.sub('@', '', run)
        run = re.split(r"\.", run)[0]
        # Decided per file, so one non-fastq input cannot affect the others.
        is_fastq = header.startswith('@')

        file1 = path.split('/')[-1]
        # The dataset id is the last '_'-separated piece without '.dat'.
        dataset_id = re.sub(r'\.dat$', '', file1.split('_')[-1])
        print('%s %s' % (dataset_id, type(dataset_id)))
        original_filename = _lookup_original_name(dataset_id)

        if re.search('fasta$', original_filename):
            if is_fastq:
                # Only the extension is rewritten, not other 'fasta' substrings.
                original_filename = re.sub('fasta$', 'fastq', original_filename)
            else:
                print('The input file is not a fastq file.')

        # Plain slicing avoids treating backslashes in the original name as
        # regex replacement escapes.
        if file1.endswith('.dat'):
            file2 = file1[:-len('.dat')] + '_' + original_filename
        else:
            file2 = file1
        print('monkey renamed the file %s' % (file2,))
        new_path_file = tmp_path + '/' + file2

        if not os.path.exists(new_path_file):
            print('%s %s' % (path, new_path_file))
            try:
                # shutil.copy needs no shell, so spaces or metacharacters in
                # either path are harmless.
                shutil.copy(path, new_path_file)
            except (IOError, OSError) as err:
                print('Could not copy %s to %s: %s' % (path, new_path_file, err))
                continue

        # Compare against the staged path (what the list actually holds) so a
        # file listed twice is not registered twice.
        staged = run2fastqs.setdefault(run, [])
        if new_path_file not in staged:
            staged.append(new_path_file)
    return run2fastqs
| 105 | |
def run_seqsero(run2fastqs):
    '''
    Takes files from run2fastqs and runs SeqSero.

    run2fastqs maps a run id to a list of fastq paths. Runs with two files
    are started with '-m 2', runs with one file with '-m 1'. Runs with any
    other number of files are reported on stdout and skipped, because no
    SeqSero command can be built for them.

    Returns a list with one (stdout, stderr) tuple per executed run, as
    returned by Popen.communicate(); stderr is not captured and is None.
    '''
    outputs = []
    for run, fastqs in run2fastqs.items():
        print('%s %s' % (run, fastqs))
        if len(fastqs) == 2:
            mode = '2'
        elif len(fastqs) == 1:
            mode = '1'
        else:
            # Previously an empty command reached Popen and crashed the batch.
            print('Skipping run %s: expected 1 or 2 fastq files, found %d.'
                  % (run, len(fastqs)))
            continue
        seqsero_cmd = ['python', seqsero, '-m', mode, '-i'] + list(fastqs)
        print(seqsero_cmd)
        p = Popen(seqsero_cmd, stdout=PIPE)
        outputs.append(p.communicate())
    return outputs
| 123 | |
def get_serotypes(outputs):
    '''
    Parses SeqSero output into per-input serotype records and comments.

    outputs is a list of (stdout, stderr) tuples as produced by
    Popen.communicate(). Every line is split on tabs; the first field selects
    what the second field means:

        'Input files:'                  starts a record keyed by the value
        'O antigen prediction:'         record[0]
        'H1 antigen prediction(fliC):'  record[1]
        'H2 antigen prediction(fljB):'  record[2]
        'Predicted antigenic profile:'  record[3]
        'Predicted serotype(s):'        record[4]

    Any other line longer than 7 characters that follows an 'Input files:'
    line is kept as a comment for that input. Every line is also echoed to
    the test_out debug log.

    Returns (fastq2serotype, fastq2comment): dicts keyed by the 'Input files:'
    value, holding a 5-element list and a list of comment lines respectively.
    '''
    # Position in the serotype record filled by each labelled line.
    label2column = {
        'O antigen prediction:': 0,
        'H1 antigen prediction(fliC):': 1,
        'H2 antigen prediction(fljB):': 2,
        'Predicted antigenic profile:': 3,
        'Predicted serotype(s):': 4,
    }
    fastq2comment = {}
    fastq2serotype = {}
    for sample in outputs:
        fastqs = ''
        lines_used = set()
        for stream in sample:  # each stream is an entire seqsero output.
            if stream is None:
                # Uncaptured stderr; there is nothing to parse or log.
                continue
            for element in str(stream).split("\n"):  # a line of seqsero output.
                test_out.write(element+"\n")
                elementl = element.split("\t")
                label = elementl[0]
                # A label without a tab-separated value cannot be stored.
                has_value = len(elementl) > 1
                if label == 'Input files:' and has_value:
                    fastqs = elementl[1]
                    lines_used.add(element)
                    if fastqs not in fastq2serotype:
                        fastq2serotype[fastqs] = ['']*5
                        fastq2comment[fastqs] = []
                elif label in label2column and has_value and fastqs in fastq2serotype:
                    # Requires a preceding 'Input files:' line, otherwise
                    # there is no record to fill.
                    lines_used.add(element)
                    fastq2serotype[fastqs][label2column[label]] = elementl[1]
                if element not in lines_used and re.search(r"\w", fastqs) and len(element) > 7:
                    fastq2comment[fastqs].append(element)
    return fastq2serotype, fastq2comment
| 166 | |
def print_html(fastq2serotype, fastq2comment):
    '''
    Takes dict and prints to html file.

    Writes two files into the current working directory and echoes the
    tabular part to stdout:

        Seqsero_result.txt   tab-separated header and one row per input,
                             followed by the comment lines of every input
        Seqsero_result.html  the same content as an HTML table plus comments

    fastq2serotype maps an input name to a list of strings (one per result
    column); fastq2comment maps an input name to a list of comment lines.
    Both files are closed when the function returns, also on error.
    '''
    header_l = ['Input Files', 'O antigen prediction', 'H1 antigen prediction(fliC)',
                'H2 antigen prediction(fljB)', 'Predicted antigenic profile',
                'Predicted serotype(s)']
    # The text header is the same column list with spaces replaced.
    header = re.sub(' ', '_', "\t".join(header_l))

    with open('Seqsero_result.txt', 'w') as tab_out, \
            open('Seqsero_result.html', 'w') as html_out:
        # The markup below is emitted unchanged so existing output stays
        # byte-identical.
        html_out.write('<!DOCTYPE html>\n')
        html_out.write('<html>\n')
        html_out.write('<head>\n')
        html_out.write('<title>SeqSero Results</title>\n')
        html_out.write('</head>\n')
        html_out.write('<body>\n')
        html_out.write('<body style="font-family:Helvetica;">\n')
        html_out.write('<p style="font-size:10px">\n')
        html_out.write('<table border=1>\n')

        html_out.write('<tr>\n')
        for element in header_l:
            html_out.write('<td>'+element+'</td>\n')
        html_out.write('</tr>\n')
        tab_out.write(header+"\n")
        print("\n\n" + header)

        # One table row per input.
        for fastq in fastq2serotype:
            line_to_print = fastq+'\t'+"\t".join(fastq2serotype[fastq])
            tab_out.write(line_to_print+"\n")
            html_out.write('<tr>\n')
            html_out.write('<td>'+fastq+'</td>\n')
            for antigen in fastq2serotype[fastq]:
                html_out.write('<td>'+antigen+'</td>\n')
            html_out.write('</tr>\n')
            print(line_to_print)
        html_out.write('</table>\n')
        print("\n")

        # Comment lines per input; these rows are written after the table
        # has been closed, exactly as before.
        for fastq in fastq2comment:
            tab_out.write("\n"+fastq+"\n")
            html_out.write('<tr>\n')
            html_out.write('<p>\n')
            html_out.write('<td>'+fastq+"<br></td>\n")
            for line in fastq2comment[fastq]:
                html_out.write('<td>'+line+'</td>\n')
                tab_out.write(line+"\n")
                print(line)
            print("\n")
            html_out.write('</p>\n')
            html_out.write('</tr>\n')
        html_out.write('</body>\n')
        html_out.write('</html>\n')
| 219 | |
# Pipeline: stage the inputs per run, run SeqSero on every run, parse its
# output and write the text/HTML reports.
run2fastqs = list_runs(fastq_files)
outputs = run_seqsero(run2fastqs)
fastq2serotype, fastq2comment = get_serotypes(outputs)
print_html(fastq2serotype, fastq2comment)
print_time()

# Release what the script opened at start-up: this flushes the debug logs
# and returns the database connection.
test_out.close()
test_out2.close()
connection.close()
| 226 | |
| 227 |
