Mercurial > repos > charles_s_test > seqsero2

--- a/README.md	Fri Oct 27 17:47:00 2017 -0400
+++ b/README.md	Sun Nov 12 02:27:32 2017 -0500
@@ -1,16 +1,5 @@
-
-# GalaxySeqSero
-#This Project has been modified by the CFSAN FDA office to work in Galaxy it also has a batch functionality as well add by the Galaxy rapper code.
-
-All below dependencies are the same for Seqsero.
- Add this project to the galaxy tools directory and then added to tool_conf.xml
-
-Adding custom tools to Galaxy
-
-https://galaxyproject.org/admin/tools/add-tool-tutorial/
-
-
-# SeqSero 1.1
+<<<<<<< HEAD
+# SeqSero 1.0
 Salmonella serotyping from genome sequencing data
--- a/Seqsero_result.html	Fri Oct 27 17:47:00 2017 -0400
+++ b/Seqsero_result.html	Sun Nov 12 02:27:32 2017 -0500
@@ -16,19 +16,19 @@
 <td>Predicted serotype(s)</td>
 </tr>
 <tr>
-<td>dataset_15_sra_data9.fastq </td>
-<td>O-16</td>
-<td>d</td>
-<td>1,7</td>
-<td>16:d:1,7</td>
-<td>Gaminara</td>
+<td>dataset_445_SRR6158764_1.fastq dataset_446_SRR6158764_2.fastq</td>
+<td>O-7</td>
+<td>y</td>
+<td>1,5</td>
+<td>7:y:1,5</td>
+<td>Bareilly</td>
 </tr>
 </table>
 <tr>
 <p>
-<td>dataset_15_sra_data9.fastq <br></td>
+<td>dataset_445_SRR6158764_1.fastq dataset_446_SRR6158764_2.fastq<br></td>
 <td>The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme.  New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies.  Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.</td>
-<td>/nfs/sw/apps/galaxy-dev/galaxy/tools/seqsero/fastq_tmp/dataset_15_sra_data9.fastq</td>
+<td>check fastq id and make them in accordance with each other...please wait...</td>
 </p>
 </tr>
 </body>
--- a/Seqsero_result.txt	Fri Oct 27 17:47:00 2017 -0400
+++ b/Seqsero_result.txt	Sun Nov 12 02:27:32 2017 -0500
@@ -1,6 +1,6 @@
 Input_Files	O_antigen_prediction	H1_antigen_prediction(fliC)	H2_antigen_prediction(fljB)	Predicted_antigenic_profile	Predicted_serotype(s)
-dataset_15_sra_data9.fastq 	O-16	d	1,7	16:d:1,7	Gaminara
+dataset_445_SRR6158764_1.fastq dataset_446_SRR6158764_2.fastq	O-7	y	1,5	7:y:1,5	Bareilly

-dataset_15_sra_data9.fastq
+dataset_445_SRR6158764_1.fastq dataset_446_SRR6158764_2.fastq
 The serotype(s) is/are the only serotype(s) with the indicated antigenic profile currently recognized in the Kauffmann White Scheme.  New serotypes can emerge and the possibility exists that this antigenic profile may emerge in a different subspecies.  Identification of strains to the subspecies level should accompany serotype determination; the same antigenic profile in different subspecies is considered different serotypes.
-/nfs/sw/apps/galaxy-dev/galaxy/tools/seqsero/fastq_tmp/dataset_15_sra_data9.fastq
+check fastq id and make them in accordance with each other...please wait...
--- a/libs/BWA_analysis_H_update_new_family_dependent.py	Fri Oct 27 17:47:00 2017 -0400
+++ b/libs/BWA_analysis_H_update_new_family_dependent.py	Sun Nov 12 02:27:32 2017 -0500
@@ -6,8 +6,8 @@
 from Initial_functions import Uniq
 from Bio.Blast import NCBIXML

-BwaPath="/nfs/sw/apps/bwa/bwa-0.7.15/bwa"
-SamTlsPth="/nfs/sw/apps/samtools/samtools-1.3.1/bin/samtools"
+BwaPath="bwa"
+SamTlsPth="samtools"
 Makebltdb="/nfs/sw/apps/blast/ncbi-blast-2.6.0+/bin/makeblastdb"
 Blastnpth="/nfs/sw/apps/blast/ncbi-blast-2.6.0+/bin/blastn"
--- a/libs/BWA_analysis_O_new_dependent.py	Fri Oct 27 17:47:00 2017 -0400
+++ b/libs/BWA_analysis_O_new_dependent.py	Sun Nov 12 02:27:32 2017 -0500
@@ -7,8 +7,8 @@
 from Initial_functions import Uniq
 from Bio.Blast import NCBIXML

-BwaPath="/nfs/sw/apps/bwa/bwa-0.7.15/bwa"
-SamTlsPth="/nfs/sw/apps/samtools/samtools-1.3.1/bin/samtools"
+BwaPath="bwa"
+SamTlsPth="samtools"
 Makebltdb="/nfs/sw/apps/blast/ncbi-blast-2.6.0+/bin/makeblastdb"
 Blastnpth="/nfs/sw/apps/blast/ncbi-blast-2.6.0+/bin/blastn"
Binary file libs/Initial_Conditions.pyc has changed
Binary file libs/Initial_functions.pyc has changed
--- a/libs/deletion_compare.py	Fri Oct 27 17:47:00 2017 -0400
+++ b/libs/deletion_compare.py	Sun Nov 12 02:27:32 2017 -0500
@@ -5,8 +5,8 @@
 from Initial_functions import Uniq
 from Bio.Blast import NCBIXML

-BwaPath="/nfs/sw/apps/bwa/bwa-0.7.15/bwa"
-SamTlsPth="/nfs/sw/apps/samtools/samtools-1.3.1/bin/samtools"
+BwaPath="bwa"
+SamTlsPth="samtools"
 Makebltdb="/nfs/sw/apps/blast/ncbi-blast-2.6.0+/bin/makeblastdb"
 Blastnpth="/nfs/sw/apps/blast/ncbi-blast-2.6.0+/bin/blastn"
--- a/psql_test.py	Fri Oct 27 17:47:00 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,26 +0,0 @@
-import psycopg2, os
-
-os.system('source /nfs/sw/apps/galaxy-dev/galaxy/.venv/bin/activate')
-
-connection = psycopg2.connect(database="galaxy", user="galaxy", host="galaxydev.cvkyaz9id4ml.us-east-1.rds.amazonaws.com", password="cF$cl0udh9c", port="5432")
-print "monkey made a connection!"
-
-worker = connection.cursor()
-worker.execute('SELECT name FROM library_dataset_dataset_association WHERE dataset_id = 4;')
-result = worker.fetchall()
-print result, '4'
-
-worker = connection.cursor()
-worker.execute('SELECT name FROM library_dataset_dataset_association WHERE dataset_id = 10;')
-result = worker.fetchall()
-print result, '10'
-
-worker = connection.cursor()
-worker.execute('SELECT name FROM library_dataset_dataset_association WHERE dataset_id = 11;')
-result = worker.fetchall()
-print result, '11'
-
-worker = connection.cursor()
-worker.execute('SELECT name FROM library_dataset_dataset_association WHERE dataset_id = 18;')
-result = worker.fetchall()
-print result, '18'
--- a/run_seqsero_batch_galaxy.py	Fri Oct 27 17:47:00 2017 -0400
+++ b/run_seqsero_batch_galaxy.py	Sun Nov 12 02:27:32 2017 -0500
@@ -13,7 +13,7 @@
 from datetime import datetime

 print 'monkey found some files', sys.argv
-engine = create_engine('postgresql+psycopg2://galaxy:cF$cl0udh9c@galaxyprod.cvkyaz9id4ml.us-east-1.rds.amazonaws.com:5432/galaxy')
+engine = create_engine('postgresql+psycopg2://galaxy:cF$cl0udh9c@galaxydev.cvkyaz9id4ml.us-east-1.rds.amazonaws.com:5432/galaxy')
 print 'monkey says "vroom vroom"'
 connection = engine.connect()
 print "monkey made a connection!"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/run_seqsero_batch_galaxy_09-15-17.py	Sun Nov 12 02:27:32 2017 -0500
@@ -0,0 +1,201 @@
+#!/usr/bin/python
+
+print "monkey has started"
+
+import os, re, sys, time, datetime
+import subprocess
+from subprocess import Popen, PIPE
+
+database = '/nfs/sw/apps/galaxy-prd/galaxy/database/universe.sqlite'
+seqsero = '/nfs/sw/apps/galaxy-prd/galaxy/tools/seqsero/SeqSero.py'
+test_out = open("/nfs/sw/apps/galaxy-prd/galaxy/tools/seqsero/test.txt", 'w')
+out_path = '/nfs/sw/apps/galaxy-prd/galaxy/database/files/000'
+test_out2 = open("/nfs/sw/apps/galaxy-prd/galaxy/tools/seqsero/test2.txt", 'w')
+
+path2sample = '/nfs/sw/apps/galaxy-dev/galaxy/tools/seqsero/fastq_tmp'
+
+# test_out is a debug log; the raw command line is recorded in it first.
+
+test_out.write("monkey ")
+# sys.argv[1] is expected to be a comma-separated list of FASTQ paths.
+test_out.write("\t".join(sys.argv)+'\n')
+# Default to an empty string (not a list) so re.split() below cannot raise TypeError.
+fq_list1 = ''
+fq_list2 = []
+
+test_out.write(str(len(sys.argv))+"\n")
+if len(sys.argv) >= 2:
+        test_out.write("\t".join(sys.argv)+'\n')
+        fq_list1 = sys.argv[1]
+
+test_out.write(str(len(sys.argv))+"\n")
+# Drop empty entries so a missing argument yields no files rather than [''].
+fastq_files = [path for path in re.split(",", fq_list1) if path]
+
+def print_time():  # append the current local time to the test_out debug log
+        test_out.write(time.ctime())
+
+tmp_path = "/nfs/sw/apps/galaxy-dev/galaxy/tools/seqsero/fastq_tmp"
+if not os.path.exists(tmp_path):
+        os.makedirs(tmp_path)  # no shell involved; also creates missing parent directories
+# Log how many input files were parsed from the command line.
+test_out.write(str(len(fastq_files))+"\n")
+
+def list_runs(fastq_files):
+        '''
+        Group the input FASTQ files by sequencing run.
+
+        The run name is the first whitespace-delimited token of a file's
+        first line, minus any '@' and anything after the first '.'. Each file
+        is copied into tmp_path (a trailing '.dat' becomes '_<run>.fastq')
+        unless the copy already exists. Unreadable inputs are reported on
+        stdout and skipped.
+
+        Returns a dict mapping run name -> list of copied file paths.
+        '''
+        run2fastqs = {}
+        for src_path in fastq_files:
+                try:
+                        # Only the header line is needed; 'with' closes the handle.
+                        with open(src_path, 'r') as fastq:
+                                header = fastq.readline().rstrip("\n")
+                except IOError:
+                        print "Data not found. It is possible for a deleted file to still be listed "\
+                                "in a Galaxy library. Please confirm that the data still exists on this "\
+                                "server. You may need to upload it again."
+                        continue
+                run = re.split(r"\s", header)[0]
+                run = re.sub('@', '', run)
+                run = re.split(r"\.", run)[0]
+                base_name = re.split('/', src_path)[-1]
+                new_name = re.sub(r'\.dat$', '_'+run+'.fastq', base_name)
+                new_path_file = tmp_path+'/'+new_name
+                if not os.path.exists(new_path_file):
+                        # Argument list instead of a shell string: safe for unusual paths.
+                        subprocess.call(['cp', src_path, new_path_file])
+                copies = run2fastqs.setdefault(run, [])
+                # Compare the copied path (what the list stores), not the source path.
+                if new_path_file not in copies:
+                        copies.append(new_path_file)
+        return run2fastqs
+
+def run_seqsero(run2fastqs):
+        '''
+        Run SeqSero once per run: '-m 2' for a read pair, '-m 1' for a single
+        file. Runs with any other file count are skipped, because Popen([])
+        would raise. Returns a list of (stdout, stderr) tuples; stderr is None.
+        '''
+        outputs = []
+        for run, fastqs in run2fastqs.items():
+                if len(fastqs) not in (1, 2):
+                        print "Skipping run %s: expected 1 or 2 files, got %d" % (run, len(fastqs))
+                        continue
+                seqsero_cmd = ['python', seqsero, '-m', str(len(fastqs)), '-i'] + fastqs
+                p = Popen(seqsero_cmd, stdout=PIPE)
+                outputs.append(p.communicate())
+        return outputs
+
+def get_serotypes(outputs):
+        '''
+        Parse SeqSero's tab-separated report out of each captured output.
+
+        outputs is the list returned by run_seqsero(): one (stdout, stderr)
+        tuple per run. Every report line is echoed to the test_out debug log.
+
+        Returns (fastq2serotype, fastq2comment), both keyed by the value of
+        the 'Input files:' line. fastq2serotype holds five strings: O, H1
+        and H2 antigen, antigenic profile, serotype. fastq2comment holds the
+        remaining report lines longer than 7 characters.
+        '''
+        # Report label -> slot in the five-element result list.
+        field_slots = {
+                'O antigen prediction:': 0,
+                'H1 antigen prediction(fliC):': 1,
+                'H2 antigen prediction(fljB):': 2,
+                'Predicted antigenic profile:': 3,
+                'Predicted serotype(s):': 4,
+        }
+        fastq2comment = {}
+        fastq2serotype = {}
+        for sample in outputs:
+                fastqs = ''
+                for stream in sample:
+                        if stream is None:
+                                continue    # stderr is not captured; str(None) is not report text
+                        for element in re.split("\n", str(stream)):
+                                test_out.write(element+"\n")
+                                elementl = re.split("\t", element)
+                                label = elementl[0]
+                                if len(elementl) > 1 and label == 'Input files:':
+                                        fastqs = elementl[1]
+                                        if fastqs not in fastq2serotype:
+                                                fastq2serotype[fastqs] = ['']*5
+                                                fastq2comment[fastqs] = []
+                                elif len(elementl) > 1 and label in field_slots and fastqs in fastq2serotype:
+                                        fastq2serotype[fastqs][field_slots[label]] = elementl[1]
+                                elif re.search(r"\w", fastqs) and len(element) > 7:
+                                        # Unrecognised text after an 'Input files:' line is kept as a comment.
+                                        fastq2comment[fastqs].append(element)
+        return fastq2serotype, fastq2comment
+
+def print_html(fastq2serotype, fastq2comment):
+        '''
+        Write the results to Seqsero_result.txt (tab-separated) and
+        Seqsero_result.html in the working directory, echoing them to stdout.
+
+        fastq2serotype maps an input-file label to its five result strings;
+        fastq2comment maps the same label to a list of free-text lines.
+        '''
+        header_l = ['Input Files', 'O antigen prediction', 'H1 antigen prediction(fliC)', 'H2 antigen prediction(fljB)', 'Predicted antigenic profile', 'Predicted serotype(s)']
+        # The text report uses the same column names with spaces as underscores.
+        header = re.sub(' ', '_', "\t".join(header_l))
+        tab_out = open('Seqsero_result.txt', 'w')
+        html_out = open('Seqsero_result.html', 'w')
+        try:
+                html_out.write('<!DOCTYPE html>\n<html>\n<head>\n')
+                html_out.write('<title>SeqSero Results</title>\n')
+                html_out.write('</head>\n<body>\n')
+                html_out.write('<body style="font-family:Helvetica;">\n')
+                html_out.write('<p style="font-size:10px">\n')
+                html_out.write('<table border=1>\n<tr>\n')
+                for element in header_l:
+                        html_out.write('<td>'+element+'</td>\n')
+                html_out.write('</tr>\n')
+                tab_out.write(header+"\n")
+                print "\n\n", header
+                for fastq in fastq2serotype:
+                        line_to_print = fastq+'\t'+"\t".join(fastq2serotype[fastq])
+                        tab_out.write(line_to_print+"\n")
+                        html_out.write('<tr>\n<td>'+fastq+'</td>\n')
+                        for antigen in fastq2serotype[fastq]:
+                                html_out.write('<td>'+antigen+'</td>\n')
+                        html_out.write('</tr>\n')
+                        print line_to_print
+                html_out.write('</table>\n')
+                print "\n"
+                # NOTE: comment rows are written after </table>, as before, so the
+                # generated HTML stays byte-identical for existing consumers.
+                for fastq in fastq2comment:
+                        tab_out.write("\n"+fastq+"\n")
+                        html_out.write('<tr>\n<p>\n')
+                        html_out.write('<td>'+fastq+"<br></td>\n")
+                        for line in fastq2comment[fastq]:
+                                html_out.write('<td>'+line+'</td>\n')
+                                tab_out.write(line+"\n")
+                                print line
+                        print "\n"
+                        html_out.write('</p>\n</tr>\n')
+                html_out.write('</body>\n</html>\n')
+        finally:
+                # Previously only html_out was closed; close both, even on error.
+                tab_out.close()
+                html_out.close()
+
+# Pipeline: group the inputs by run, run SeqSero on each run, parse the
+# reports, write the result files, then time-stamp the debug log.
+run2fastqs = list_runs(fastq_files)
+fastq2serotype, fastq2comment = get_serotypes(run_seqsero(run2fastqs))
+print_html(fastq2serotype, fastq2comment)
+print_time()
+
+
--- a/seqsero.xml	Fri Oct 27 17:47:00 2017 -0400
+++ b/seqsero.xml	Sun Nov 12 02:27:32 2017 -0500
@@ -1,18 +1,34 @@
 <tool id="seq_sero_reads_multiple" name="SeqSero Batch - Paired-End Reads" version="0.1.0">
-	<description>Salmonella Serotype Prediction from Paired-End Reads</description>
-        <command interpreter="python2.7">
+  <description>Salmonella Serotype Prediction from Paired-End Reads</description>
+<requirements>
+        <requirement type="package" version="1.67">biopython</requirement> <!-- must match the biopython package version in tool_dependencies.xml -->
+        <requirement type="package" version="0.5.9">bwa</requirement>
+        <requirement type="package" version="1.3.1">samtools</requirement>
+        <requirement type="package" version="2.5.0">blast</requirement>
+        <requirement type="package" version="2.6.2">sra_toolkit</requirement>
+  </requirements>
+       <stdio>
+                <exit_code range="0:" level="warning"/>
+                <regex match="Error:"/>
+        </stdio>
+      <command interpreter="python2.7">
                 run_seqsero_batch_galaxy.py $input1 <!--$input2-->
                 <!--test_bioblend.py $input1 $input2-->
         </command>
         <inputs>
-                <param format="fastq,fastqsanger" name="input1" multiple="true" type="data" label="Source file"/>
+                <param format="fastq" name="input1" multiple="true" type="data" label="Source file"/>
                 <!--<param format="fastq" name="input2" multiple="true" type="data" label="Source file"/>-->
         </inputs>
         <outputs>
                 <data name="html_file" format="html" from_work_dir="Seqsero_result.html"/>
                 <data name="text_file" format="txt" from_work_dir="Seqsero_result.txt"/>
         </outputs>
-        <help>
+ 	<tests>
+        <test>
+            <output  name="text_file" file="Seqsero_result.txt" ftype="txt"/>
+ 	</test>
+    	</tests>
+	<help>
                This tool predicts Salmonella serotypes from shotgun WGS data.

           	Below is the link to the seqsero help
@@ -34,8 +50,4 @@
           }
           </citation>
      </citations>
-        <stdio>
-                <exit_code range="0:" level="warning"/>
-                <regex match="Error:"/>
-        </stdio>
 </tool>
--- a/test.txt	Fri Oct 27 17:47:00 2017 -0400
+++ b/test.txt	Sun Nov 12 02:27:32 2017 -0500
@@ -1,19 +1,19 @@
-monkey /nfs/sw/apps/galaxy-dev/galaxy/tools/seqsero/run_seqsero_batch_galaxy.py	/nfs/sw/apps/galaxy-dev/galaxy/database/files/000/dataset_304.dat
+monkey /nfs/sw/apps/galaxy-prd/galaxy/tools/seqsero/run_seqsero_batch_galaxy.py	/nfs/sw/apps/galaxy-prd/galaxy/database/files/001/dataset_1643.dat,/nfs/sw/apps/galaxy-prd/galaxy/database/files/001/dataset_1644.dat
 2
-/nfs/sw/apps/galaxy-dev/galaxy/tools/seqsero/run_seqsero_batch_galaxy.py	/nfs/sw/apps/galaxy-dev/galaxy/database/files/000/dataset_304.dat
+/nfs/sw/apps/galaxy-prd/galaxy/tools/seqsero/run_seqsero_batch_galaxy.py	/nfs/sw/apps/galaxy-prd/galaxy/database/files/001/dataset_1643.dat,/nfs/sw/apps/galaxy-prd/galaxy/database/files/001/dataset_1644.dat
 2
-1
+2


-Input files:	dataset_304_F1605001-005-2C1.fsa
-O antigen prediction:	O-?
+Input files:	dataset_1643_SRR6232093_1.fastq dataset_1644_SRR6232093_2.fastq
+O antigen prediction:	O--
 H1 antigen prediction(fliC):	-
 H2 antigen prediction(fljB):	-
-Predicted antigenic profile:	?:-:-
+Predicted antigenic profile:	-:-:-
 Predicted serotype(s):	N/A (The predicted antigenic profile does not exist in the White-Kauffmann-Le Minor scheme)

-/nfs/sw/apps/galaxy-dev/galaxy/tools/seqsero/fastq_tmp/dataset_304_F1605001-005-2C1.fsa
+check fastq id and make them in accordance with each other...please wait...


@@ -21,4 +21,4 @@


 None
-Fri Oct 27 15:35:33 2017
\ No newline at end of file
+Wed Nov  8 06:26:06 2017
\ No newline at end of file
--- a/test_bioblend.py	Fri Oct 27 17:47:00 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,49 +0,0 @@
-#/usr/bin/python
-
-import sqlite3, re, os
-from bioblend import galaxy
-
-#out = open("/opt/galaxy/tools/seqserobatch/test.txt", 'w')
-
-def get_bioblend_id():
-#	gi = galaxy.GalaxyInstance(url='http://galaxy.fda.gov:8080/', key='ebec9b999774b69d3f3880f0f664f56e')
-	gi = galaxy.GalaxyInstance(url='https://dev.galaxytrakr.org/')
-	job = galaxy.jobs.JobsClient(gi)
-	j = job.get_jobs()
-	while j[0]['exit_code'] == 'None':
-		print j[0]['id'], j[0]['exit_code']
-	print j[0]['id'], j[0]['exit_code']
-
-#	dataset_id = galaxy.datasets.DatasetClient(gi)
-#	output = show_stdout(dataset_id)
-#	print output
-#	data = download_dataset(dataset_id, wait_for_completion=True)
-#	print dataset_id.show_stdout()
-	#print gi, job
-
-#	print '"'+'tiger'+'"'
-	return j[0]['id']
-
-def get_job_id(bioblend_id):
-#	print 'supermonkey', bioblend_id
-	database = '/opt/galaxy/database/universe.sqlite'
-	conn = sqlite3.connect(database)
-	select = 'SELECT job.id, job.stdout FROM job WHERE job.stdout LIKE '+"'"+"%"+bioblend_id+"%"+"'"+';'
-#	print select
-#	entries = conn.execute('SELECT job.id, job.stdout FROM job WHERE job.stdout LIKE '+"'"+"%"+bioblend_id+"%"+"'"+';')
-	entries = conn.execute('SELECT job.id, job.stdout FROM job WHERE job.stdout LIKE '+"'"+"%ecbc86ac41da8f7b%"+"'"+';')
-	print type(entries)
-	print entries
-	print select
-	for entry in entries:
-		print entry
-		for line in entry:
-			line = str(line)
-			print line
-#	print entries
-
-#out.write(j_join)
-
-bioblend_id = get_bioblend_id()
-#print bioblend_id
-#get_job_id(bioblend_id)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Sun Nov 12 02:27:32 2017 -0500
@@ -0,0 +1,9 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="biopython" version="1.67">
+        <repository changeset_revision="fc45a61abc2f" name="package_biopython_1_67" owner="biopython" toolshed="https://testtoolshed.g2.bx.psu.edu" />
+    </package>
+    <package name="blast" version="2.5.0">
+        <repository changeset_revision="de5976f2c96d" name="package_blast_plus_2_5_0" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>