view run_seqsero_batch_galaxy.py @ 2:0d65b71ff8df draft

planemo upload commit 464b391afaa5819bc681452e85bea9d882730eb6
author charles_s_test
date Sun, 12 Nov 2017 02:27:32 -0500
parents 6895de35a263
children 38ad1130d077
line wrap: on
line source

#!/usr/bin/python
# Galaxy wrapper script (Python 2: it uses print statements) that runs SeqSero
# on the fastq files passed on the command line.  Everything below executes at
# import time, in order; the print statements are progress/debug output.

print 'monkey wonders where he is'

import os

# os.system('source /nfs/sw/apps/galaxy-dev/galaxy/.venv/bin/activate')

import re, sys, time, datetime
import subprocess, psycopg2, sqlalchemy
from subprocess import Popen, PIPE
# The wildcard import is what brings create_engine (used below) into scope.
from sqlalchemy import *
# Rebinds the name `datetime` from the module (imported above) to the class.
from datetime import datetime

print 'monkey found some files', sys.argv
# NOTE(review): the database user and password are hard-coded in this URL and
# therefore live in version control.  They should be rotated and loaded from
# configuration or the environment instead -- confirm with the deployment owner.
engine = create_engine('postgresql+psycopg2://galaxy:cF$cl0udh9c@galaxydev.cvkyaz9id4ml.us-east-1.rds.amazonaws.com:5432/galaxy')
print 'monkey says "vroom vroom"'
# Module-level connection; list_runs() uses it to look up dataset names.
connection = engine.connect()
print "monkey made a connection!"

# database = '/nfs/sw/apps/galaxy-dev/galaxy/database/universe.sqlite'
# Fixed locations inside the galaxy-dev installation.
seqsero = '/nfs/sw/apps/galaxy-dev/galaxy/tools/seqsero/SeqSero.py'
# Debug log; the functions below write to it as a module-level name.
test_out = open("/nfs/sw/apps/galaxy-dev/galaxy/tools/seqsero/test.txt", 'w')
out_path = '/nfs/sw/apps/galaxy-dev/galaxy/database/files/000'
test_out2 = open("/nfs/sw/apps/galaxy-dev/galaxy/tools/seqsero/test2.txt", 'w')

# Directory the input files are staged into under their original names.
path2sample = '/nfs/sw/apps/galaxy-dev/galaxy/tools/seqsero/fastq_tmp'

# Record the raw command line in the debug log.
test_out.write("monkey ")
test_out.write("\t".join(sys.argv)+'\n')

fq_list1 = []

test_out.write(str(len(sys.argv))+"\n")
if len(sys.argv) >= 2:
    test_out.write("\t".join(sys.argv)+'\n')
    fq_list1 = sys.argv[1:]

test_out.write(str(len(sys.argv))+"\n")

if not fq_list1:
    # Without this guard the indexing below fails with a bare IndexError.
    test_out.write("no input files given\n")
    sys.exit('No input given: expected a comma-separated list of fastq '
             'paths as the first argument.')

# fq_list1[0] is a string with commas between the path/filenames.
fastq_files = fq_list1[0].split(",")

def print_time():
    """Write the current local time, as formatted by time.asctime, to test_out.

    No newline is appended after the timestamp.
    """
    now = time.localtime(time.time())
    stamp = time.asctime(now)
    test_out.write(stamp)

tmp_path = path2sample
# Create the staging directory without going through a shell, and fail loudly
# if it cannot be created instead of ignoring the exit status of `mkdir`.
try:
    os.makedirs(tmp_path)
except OSError:
    # An already existing directory (e.g. from an earlier or concurrent run)
    # is fine; any other failure is a real error.
    if not os.path.isdir(tmp_path):
        raise

# Number of input paths, for the debug log.
test_out.write(str(len(fastq_files))+"\n")

def list_runs(fastq_files):
    """Stage Galaxy datasets under their original names and group them by run.

    For every path in *fastq_files*:

    * the first line is read to derive the run identifier: its first
      whitespace-delimited token, with '@' removed, cut at the first '.';
    * the dataset id is taken from the file name (the text after the last
      '_', without the '.dat' suffix) and used to look up the dataset name
      in ``history_dataset_association`` via the module-level ``connection``;
    * the file is copied into the module-level ``tmp_path`` as
      ``<file name without .dat>_<dataset name>`` unless that copy exists.

    A dataset name ending in 'fasta' is renamed to end in 'fastq' when the
    first line of the file starts with '@'.

    Returns a dict mapping run identifier -> list of staged paths, in input
    order.  Inputs that cannot be read or copied are reported on stdout and
    skipped.
    """
    import shutil  # local import: not among the file's top-level imports

    run2fastqs = {}
    for fastq_path in fastq_files:
        print('monkey found %s' % fastq_path)

        # Only the first line is needed; the handle is closed right away.
        try:
            with open(fastq_path, 'r') as fastq:
                header = fastq.readline().rstrip("\n")
        except IOError:
            print("Data not found. It is possible for a deleted file to still be listed "
                  "in a Galaxy library. Please confirm that the data still exists on this "
                  "server. You may need to upload it again.")
            continue

        # Decided per file, so one non-fastq input cannot affect later ones.
        is_fastq = header.startswith('@')
        run = re.split(r"\s", header)[0].replace('@', '').split('.')[0]

        file1 = fastq_path.split('/')[-1]
        dataset_id = re.sub(r'\.dat$', '', file1.split('_')[-1])
        print('%s %s' % (dataset_id, type(dataset_id)))

        original_filename = ''
        if dataset_id.isdigit():
            # Bound parameter: the id comes from a file name and must never
            # be spliced into the SQL text.
            result = connection.execute(
                sqlalchemy.text('SELECT name FROM history_dataset_association '
                                'WHERE dataset_id = :dataset_id'),
                {'dataset_id': int(dataset_id)})
            for row in result:
                print('monkey says the original_filename is something like %s' % row[0])
                original_filename = row[0] or ''
        else:
            print('Could not derive a numeric dataset id from %s' % file1)

        if original_filename.endswith('fasta'):
            if is_fastq:
                original_filename = original_filename[:-len('fasta')] + 'fastq'
            else:
                print('The input file is not a fastq file.')

        # A '/' in the dataset name would point outside tmp_path.
        original_filename = original_filename.replace('/', '_')
        # Plain string handling: the dataset name must not be interpreted as
        # a regex replacement template.
        if file1.endswith('.dat'):
            file2 = file1[:-len('.dat')] + '_' + original_filename
        else:
            file2 = file1
        print('monkey renamed the file %s' % file2)
        new_path_file = tmp_path + '/' + file2

        if not os.path.exists(new_path_file):
            print('%s %s' % (fastq_path, new_path_file))
            try:
                # No shell involved, so spaces in names are safe.
                shutil.copy(fastq_path, new_path_file)
            except (IOError, OSError) as err:
                print('Could not copy %s to %s: %s' % (fastq_path, new_path_file, err))
                continue

        # Compare staged paths with staged paths so duplicates are detected.
        staged = run2fastqs.setdefault(run, [])
        if new_path_file not in staged:
            staged.append(new_path_file)
    return run2fastqs

def run_seqsero(run2fastqs):
    """Run SeqSero once per run and collect what each process returned.

    *run2fastqs* maps a run identifier to a list of fastq paths.  Two files
    are passed to SeqSero with ``-m 2``, one file with ``-m 1``.  Runs with
    any other number of files are reported on stdout and skipped.

    Returns a list with one ``Popen.communicate()`` result per executed run.
    Only stdout is piped, so each entry is ``(stdout, None)``.
    """
    outputs = []
    for run in run2fastqs:
        fastqs = run2fastqs[run]
        print('%s %s' % (run, fastqs))
        if len(fastqs) == 2:
            mode = '2'
        elif len(fastqs) == 1:
            mode = '1'
        else:
            # An empty command would make Popen raise and abort the batch.
            print('Skipping run %s: expected 1 or 2 fastq files, found %d.'
                  % (run, len(fastqs)))
            continue
        # `seqsero` is the module-level path to SeqSero.py.
        seqsero_cmd = ['python', seqsero, '-m', mode, '-i'] + list(fastqs)
        print(seqsero_cmd)
        p = Popen(seqsero_cmd, stdout=PIPE)
        outputs.append(p.communicate())
    return outputs

def get_serotypes(outputs):
    """Parse SeqSero output into per-sample predictions and comments.

    *outputs* is a list of ``(stdout, stderr)`` tuples as returned by
    ``run_seqsero``; entries that are ``None`` are ignored.  Every output
    line is echoed to the module-level ``test_out`` log and split on tabs
    into a label and a value.

    Returns ``(fastq2serotype, fastq2comment)``, both keyed by the value of
    the 'Input files:' line:

    * ``fastq2serotype[key]`` is a five-item list: O antigen, H1 antigen
      (fliC), H2 antigen (fljB), antigenic profile, serotype(s); items with
      no matching line stay ''.
    * ``fastq2comment[key]`` lists the remaining lines longer than seven
      characters that follow the 'Input files:' line.
    """
    # Label at the start of a line -> position in the prediction list.
    label2slot = {
        'O antigen prediction:': 0,
        'H1 antigen prediction(fliC):': 1,
        'H2 antigen prediction(fljB):': 2,
        'Predicted antigenic profile:': 3,
        'Predicted serotype(s):': 4,
    }
    fastq2comment = {}
    fastq2serotype = {}
    for sample in outputs:
        fastqs = ''
        lines_used = []
        for stream in sample:
            if stream is None:
                # A stream that was not piped has nothing to parse.
                continue
            for element in str(stream).split("\n"):
                test_out.write(element + "\n")
                fields = element.split("\t")
                label = fields[0]
                # A label without a tab-separated value is not a result line.
                has_value = len(fields) > 1
                if has_value and label == 'Input files:':
                    fastqs = fields[1]
                    lines_used.append(element)
                    if fastqs not in fastq2serotype:
                        fastq2serotype[fastqs] = [''] * 5
                        fastq2comment[fastqs] = []
                elif (has_value and label in label2slot
                        and fastqs in fastq2serotype):
                    # Only recorded once the sample's 'Input files:' line
                    # has been seen, so the key is guaranteed to exist.
                    lines_used.append(element)
                    fastq2serotype[fastqs][label2slot[label]] = fields[1]
                if (element not in lines_used and re.search(r"\w", fastqs)
                        and len(element) > 7):
                    fastq2comment[fastqs].append(element)
    return fastq2serotype, fastq2comment

def print_html(fastq2serotype, fastq2comment):
    """Write the results to Seqsero_result.txt and Seqsero_result.html.

    *fastq2serotype* maps an input-file label to its list of predictions and
    *fastq2comment* maps the same labels to lists of comment lines.  Both
    files are created in the current working directory, and the tabular
    content is echoed to stdout.

    The text file holds a tab-separated header (spaces replaced by '_'), one
    row per sample, then one blank-line-separated section per sample with its
    comment lines.  The HTML file holds the same data in two tables; all cell
    text is HTML-escaped.
    """
    from xml.sax.saxutils import escape  # escapes &, < and >

    header_l = ['Input Files', 'O antigen prediction',
                'H1 antigen prediction(fliC)', 'H2 antigen prediction(fljB)',
                'Predicted antigenic profile', 'Predicted serotype(s)']
    header = "\t".join(header_l).replace(' ', '_')

    # Both files are closed (and flushed) even if writing fails part-way.
    with open('Seqsero_result.txt', 'w') as tab_out, \
            open('Seqsero_result.html', 'w') as html_out:
        html_out.write('<!DOCTYPE html>\n')
        html_out.write('<html>\n')
        html_out.write('<head>\n')
        html_out.write('<title>SeqSero Results</title>\n')
        html_out.write('</head>\n')
        html_out.write('<body style="font-family:Helvetica;">\n')

        # Predictions: one row per sample.
        html_out.write('<table border=1>\n')
        html_out.write('<tr>\n')
        for element in header_l:
            html_out.write('<td>' + escape(element) + '</td>\n')
        html_out.write('</tr>\n')
        tab_out.write(header + "\n")
        print("\n\n" + header)
        for fastq in fastq2serotype:
            line_to_print = fastq + '\t' + "\t".join(fastq2serotype[fastq])
            tab_out.write(line_to_print + "\n")
            html_out.write('<tr>\n')
            html_out.write('<td>' + escape(fastq) + '</td>\n')
            for antigen in fastq2serotype[fastq]:
                html_out.write('<td>' + escape(antigen) + '</td>\n')
            html_out.write('</tr>\n')
            print(line_to_print)
        html_out.write('</table>\n')
        print("\n")

        # Comments: rows must live inside a table to be valid HTML.
        if fastq2comment:
            html_out.write('<table border=1>\n')
        for fastq in fastq2comment:
            tab_out.write("\n" + fastq + "\n")
            html_out.write('<tr>\n')
            html_out.write('<td>' + escape(fastq) + "<br></td>\n")
            for line in fastq2comment[fastq]:
                html_out.write('<td>' + escape(line) + '</td>\n')
                tab_out.write(line + "\n")
                print(line)
            print("\n")
            html_out.write('</tr>\n')
        if fastq2comment:
            html_out.write('</table>\n')

        html_out.write('</body>\n')
        html_out.write('</html>\n')

# Pipeline: stage the inputs, run SeqSero, parse its output, write reports,
# then stamp the debug log with the finish time.
try:
    run2fastqs = list_runs(fastq_files)
    outputs = run_seqsero(run2fastqs)
    fastq2serotype, fastq2comment = get_serotypes(outputs)
    print_html(fastq2serotype, fastq2comment)
    print_time()
finally:
    # Release the log files and the database connection even when a step
    # fails, rather than relying on interpreter shutdown.
    test_out.close()
    test_out2.close()
    connection.close()