Mercurial > repos > charles_s_test > seqsero2
diff run_seqsero_batch_galaxy.py @ 0:6895de35a263 draft
planemo upload commit 844a891e4eaf732830043204ac636907eefb011d-dirty
author | charles_s_test |
---|---|
date | Thu, 19 Oct 2017 18:16:51 -0400 |
parents | |
children | 0d65b71ff8df |
line wrap: on
line diff
#!/usr/bin/python
"""Run SeqSero over a batch of Galaxy FASTQ datasets and summarise the results.

Invoked as ``run_seqsero_batch_galaxy.py path1.dat,path2.dat,...``.  Each
dataset is copied into a scratch directory under a name derived from its
Galaxy history name, datasets are grouped by the run identifier found on
their first line, SeqSero is run once per group, and the predictions are
written to ``Seqsero_result.txt`` and ``Seqsero_result.html`` in the current
working directory.

The script is written with single-argument ``print(...)`` calls and ``%``
formatting so that the same source runs on Python 2.7 and Python 3.
"""

import os
import re
import shutil
import subprocess  # kept from the original import list
import sys
import time
from datetime import datetime  # kept from the original import list
from subprocess import PIPE, Popen

try:
    from html import escape as _html_escape
except ImportError:  # Python 2 has no html.escape
    from cgi import escape as _html_escape

# database = '/nfs/sw/apps/galaxy-dev/galaxy/database/universe.sqlite'
seqsero = '/nfs/sw/apps/galaxy-dev/galaxy/tools/seqsero/SeqSero.py'
out_path = '/nfs/sw/apps/galaxy-dev/galaxy/database/files/000'
path2sample = '/nfs/sw/apps/galaxy-dev/galaxy/tools/seqsero/fastq_tmp'
tmp_path = path2sample

_DEBUG_LOG = "/nfs/sw/apps/galaxy-dev/galaxy/tools/seqsero/test.txt"
_DEBUG_LOG2 = "/nfs/sw/apps/galaxy-dev/galaxy/tools/seqsero/test2.txt"

# SECURITY: this URL embeds a database password in source control.  Rotate
# the password and provide the URL through the environment variable below;
# the literal is only kept as a fallback so existing deployments keep working.
_DB_URL_ENV = 'SEQSERO_GALAXY_DB_URL'
_DEFAULT_DB_URL = 'postgresql+psycopg2://galaxy:cF$cl0udh9c@galaxyprod.cvkyaz9id4ml.us-east-1.rds.amazonaws.com:5432/galaxy'

_DATA_NOT_FOUND = (
    "Data not found. It is possible for a deleted file to still be listed "
    "in a Galaxy library. Please confirm that the data still exists on this "
    "server. You may need to upload it again."
)

# SeqSero output label -> position in the per-sample result list.
_SEROTYPE_FIELDS = {
    'O antigen prediction:': 0,
    'H1 antigen prediction(fliC):': 1,
    'H2 antigen prediction(fljB):': 2,
    'Predicted antigenic profile:': 3,
    'Predicted serotype(s):': 4,
}

# Both are populated by main(); they stay None when the module is imported.
test_out = None
connection = None


def _debug(text):
    """Write *text* to the debug log if main() has opened it."""
    if test_out is not None:
        test_out.write(text)


def _get_connection():
    """Return the shared database connection, opening it on first use."""
    global connection
    if connection is None:
        # Imported here so the rest of the module can be used without a
        # database driver installed.
        from sqlalchemy import create_engine
        engine = create_engine(os.environ.get(_DB_URL_ENV, _DEFAULT_DB_URL))
        print('monkey says "vroom vroom"')
        connection = engine.connect()
    return connection


def _lookup_original_name(dataset_id):
    """Return the history name stored for *dataset_id*, or '' if none.

    The id is bound as a query parameter rather than concatenated into the
    SQL text.  When several rows match, the last one wins.
    """
    from sqlalchemy import text
    query = text('SELECT name FROM history_dataset_association '
                 'WHERE dataset_id = :dataset_id')
    name = ''
    for row in _get_connection().execute(query, {'dataset_id': int(dataset_id)}):
        print('monkey says the original_filename is something like %s' % (row[0],))
        name = row[0] or ''
    return name


def print_time():
    """Append the current local time to the debug log."""
    _debug(time.asctime(time.localtime(time.time())))


def _read_run(source_path):
    """Return ``(run, is_fastq)`` taken from the first line of *source_path*.

    ``run`` is the first whitespace-delimited token with every '@' removed,
    cut at the first '.'.  ``is_fastq`` is False only when a first line
    exists and does not start with '@'.
    """
    with open(source_path, 'r') as handle:
        first_line = handle.readline()
    if not first_line:
        return '', True
    first_line = first_line.rstrip("\n")
    run = re.split(r"\s", first_line)[0].replace('@', '')
    return run.split(".")[0], first_line.startswith('@')


def _staged_name(base_name, original_filename):
    """Swap a trailing '.dat' on *base_name* for '_<original_filename>'."""
    if base_name.endswith('.dat'):
        return base_name[:-len('.dat')] + '_' + original_filename
    return base_name


def list_runs(fastq_files, lookup_name=None, dest_dir=None):
    '''
    Creates dict with runs as keys and list with filenames as values.

    fastq_files -- iterable of dataset paths (e.g. ".../dataset_123.dat").
    lookup_name -- callable mapping a dataset id string to the original
                   file name; defaults to a query against the Galaxy
                   database.
    dest_dir    -- directory the datasets are copied into; defaults to the
                   module-level ``tmp_path``.  Created if missing.

    Files that cannot be read or copied are reported on stdout and skipped.
    '''
    if lookup_name is None:
        lookup_name = _lookup_original_name
    if dest_dir is None:
        dest_dir = tmp_path
    if not os.path.isdir(dest_dir):
        os.makedirs(dest_dir)

    run2fastqs = {}
    for source_path in fastq_files:
        print('monkey found %s' % (source_path,))
        try:
            # is_fastq is evaluated per file so that one non-FASTQ input
            # does not change how later files are renamed.
            run, is_fastq = _read_run(source_path)
        except (IOError, OSError):
            print(_DATA_NOT_FOUND)
            continue

        base_name = source_path.split('/')[-1]
        dataset_id = base_name.split('_')[-1]
        if dataset_id.endswith('.dat'):
            dataset_id = dataset_id[:-len('.dat')]
        print('%s %s' % (dataset_id, type(dataset_id)))

        if dataset_id.isdigit():
            original_filename = lookup_name(dataset_id) or ''
        else:
            # Never send a non-numeric fragment of a file name to the database.
            print('Cannot derive a dataset id from %s' % (base_name,))
            original_filename = ''
        # The name becomes part of a path; keep it inside dest_dir.
        original_filename = original_filename.replace('/', '_')

        if original_filename.endswith('fasta'):
            if is_fastq:
                original_filename = original_filename[:-len('fasta')] + 'fastq'
            else:
                print('The input file is not a fastq file.')

        staged_name = _staged_name(base_name, original_filename)
        print('monkey renamed the file %s' % (staged_name,))
        new_path_file = os.path.join(dest_dir, staged_name)
        if not os.path.exists(new_path_file):
            print('%s %s' % (source_path, new_path_file))
            try:
                shutil.copy(source_path, new_path_file)
            except (IOError, OSError) as err:
                print('Could not copy %s to %s: %s' % (source_path, new_path_file, err))
                continue

        staged = run2fastqs.setdefault(run, [])
        if new_path_file not in staged:
            staged.append(new_path_file)
    return run2fastqs


def run_seqsero(run2fastqs):
    '''
    Takes files from run2fastqs and runs SeqSero.

    Runs with one file use SeqSero mode 1 and runs with two files use
    mode 2; any other count is reported and skipped.  Returns a list with
    one ``(stdout, stderr)`` tuple per SeqSero invocation (stderr is not
    captured, so it is None).
    '''
    outputs = []
    for run in run2fastqs:
        fastqs = run2fastqs[run]
        print('%s %s' % (run, fastqs))
        if len(fastqs) == 2:
            seqsero_cmd = ['python', seqsero, '-m', '2', '-i', fastqs[0], fastqs[1]]
        elif len(fastqs) == 1:
            seqsero_cmd = ['python', seqsero, '-m', '1', '-i', fastqs[0]]
        else:
            # An empty command list would make Popen raise.
            print('Skipping run %s: expected 1 or 2 files, found %d' % (run, len(fastqs)))
            continue
        print(seqsero_cmd)
        # universal_newlines gives text output on Python 3 as well.
        p = Popen(seqsero_cmd, stdout=PIPE, universal_newlines=True)
        outputs.append(p.communicate())
    return outputs


def get_serotypes(outputs):
    '''
    Parses SeqSero output into per-sample predictions and comments.

    outputs -- list of ``(stdout, stderr)`` tuples as returned by
               run_seqsero; None streams are ignored.

    Returns ``(fastq2serotype, fastq2comment)``.  Both are keyed by the
    value of the "Input files:" line.  fastq2serotype maps to a 5-item list
    [O antigen, H1 antigen, H2 antigen, antigenic profile, serotype];
    fastq2comment maps to the remaining lines longer than 7 characters.
    A label without a tab-separated value yields '', and prediction lines
    seen before any "Input files:" line are ignored.
    '''
    fastq2comment = {}
    fastq2serotype = {}
    for sample in outputs:
        fastqs = ''
        lines_used = set()
        for stream in sample:
            if stream is None:
                continue
            if isinstance(stream, bytes) and not isinstance(stream, str):
                stream = stream.decode('utf-8', 'replace')
            for element in stream.split("\n"):
                _debug(element + "\n")
                fields = element.split("\t")
                label = fields[0]
                value = fields[1] if len(fields) > 1 else ''
                if label == 'Input files:':
                    fastqs = value
                    lines_used.add(element)
                    if fastqs not in fastq2serotype:
                        fastq2serotype[fastqs] = [''] * 5
                        fastq2comment[fastqs] = []
                elif label in _SEROTYPE_FIELDS:
                    lines_used.add(element)
                    if fastqs in fastq2serotype:
                        fastq2serotype[fastqs][_SEROTYPE_FIELDS[label]] = value
                if (element not in lines_used and re.search(r"\w", fastqs)
                        and len(element) > 7):
                    fastq2comment[fastqs].append(element)
    return fastq2serotype, fastq2comment


def print_html(fastq2serotype, fastq2comment,
               tab_path='Seqsero_result.txt', html_path='Seqsero_result.html'):
    '''
    Takes dict and prints to html file.

    Writes a tab-separated report to *tab_path* and an HTML report to
    *html_path*, and echoes the table rows and comments to stdout.  Values
    are HTML-escaped, and comments are written as paragraphs after the
    table instead of as table rows outside of it.
    '''
    header_l = ['Input Files', 'O antigen prediction',
                'H1 antigen prediction(fliC)', 'H2 antigen prediction(fljB)',
                'Predicted antigenic profile', 'Predicted serotype(s)']
    header = "\t".join(title.replace(' ', '_') for title in header_l)

    with open(tab_path, 'w') as tab_out, open(html_path, 'w') as html_out:
        html_out.write('<!DOCTYPE html>\n')
        html_out.write('<html>\n')
        html_out.write('<head>\n')
        html_out.write('<title>SeqSero Results</title>\n')
        html_out.write('</head>\n')
        html_out.write('<body style="font-family:Helvetica;">\n')
        html_out.write('<table border=1>\n')
        html_out.write('<tr>\n')
        for element in header_l:
            html_out.write('<td>' + element + '</td>\n')
        html_out.write('</tr>\n')
        tab_out.write(header + "\n")
        print("\n\n" + header)

        for fastq in fastq2serotype:
            line_to_print = fastq + '\t' + "\t".join(fastq2serotype[fastq])
            tab_out.write(line_to_print + "\n")
            html_out.write('<tr>\n')
            html_out.write('<td>' + _html_escape(fastq) + '</td>\n')
            for antigen in fastq2serotype[fastq]:
                html_out.write('<td>' + _html_escape(antigen) + '</td>\n')
            html_out.write('</tr>\n')
            print(line_to_print)
        html_out.write('</table>\n')
        print("\n")

        for fastq in fastq2comment:
            tab_out.write("\n" + fastq + "\n")
            html_out.write('<p>\n')
            html_out.write(_html_escape(fastq) + '<br>\n')
            for line in fastq2comment[fastq]:
                html_out.write(_html_escape(line) + '<br>\n')
                tab_out.write(line + "\n")
                print(line)
            print("\n")
            html_out.write('</p>\n')
        html_out.write('</body>\n')
        html_out.write('</html>\n')


def main(argv=None):
    """Run the whole pipeline for the comma-separated paths in ``argv[1]``.

    Returns the process exit status: 0 on success, 2 when no input list
    was supplied.
    """
    global test_out, connection
    if argv is None:
        argv = sys.argv

    print('monkey wonders where he is')
    print('monkey found some files %s' % (argv,))
    if len(argv) < 2 or not argv[1]:
        sys.stderr.write('usage: %s FASTQ[,FASTQ...]\n' % (argv[0] if argv else 'run_seqsero_batch_galaxy.py',))
        return 2

    _get_connection()
    print("monkey made a connection!")

    test_out = open(_DEBUG_LOG, 'w')
    # The original script truncated this second log without writing to it.
    open(_DEBUG_LOG2, 'w').close()
    try:
        test_out.write("monkey ")
        test_out.write("\t".join(argv) + '\n')
        test_out.write(str(len(argv)) + "\n")
        test_out.write("\t".join(argv) + '\n')
        test_out.write(str(len(argv)) + "\n")

        # argv[1] is a string with commas between the path/filenames.
        fastq_files = argv[1].split(",")
        test_out.write(str(len(fastq_files)) + "\n")

        run2fastqs = list_runs(fastq_files)
        outputs = run_seqsero(run2fastqs)
        fastq2serotype, fastq2comment = get_serotypes(outputs)
        print_html(fastq2serotype, fastq2comment)
        print_time()
    finally:
        test_out.close()
        test_out = None
        if connection is not None:
            connection.close()
            connection = None
    return 0


if __name__ == '__main__':
    sys.exit(main())