Galaxy |

Changeset 37:72170c3f515a (2013-12-03)

Previous changeset 36:8f9023b30384 (2013-12-02) Next changeset 38:2f7fac29bb3c (2014-01-15)

Commit message:
Uploaded v0.0.22f, fix for new style dummy IDs in BLAST XML output

modified:
tools/ncbi_blast_plus/blastxml_to_tabular.py
tools/ncbi_blast_plus/blastxml_to_tabular.xml
tools/ncbi_blast_plus/ncbi_macros.xml

added:
test-data/blastn_arabidopsis.extended.tabular
test-data/blastn_arabidopsis.standard.tabular
test-data/blastn_arabidopsis.xml

diff -r 8f9023b30384 -r 72170c3f515a test-data/blastn_arabidopsis.extended.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastn_arabidopsis.extended.tabular Tue Dec 03 10:02:17 2013 -0500

@@ -0,0 +1,1 @@
+chunk_of_plant chrIII 100.00 630 0 0 1 630 4341 4970 0.0 1164 chrIII 630 630 630 0 100.00 1 1 GATGTTCAATACTGTTTCCAACAAAAAGATTGTTGTCCTCGAGTTCGCCTTCAAGAAAGACACGAGAGAGACTCCAGCCATTGACGTCTGCAAAGGTTTGTTAGGAGACAAGGCCCGAATCAGCATCTATGATCCACAAGTCACGGAAGAACAAATCCAAAGAGACTTAACCATGAACACATTCGACTGGGACCATCCACTTCACCTCCAACCCATGAGTCCAACCACTGTGAAACAAGTCTCAGTTGCTTGGGACGCTTACGCTGCCACCAAAGACGCCCACGGAATCTGCTTGTTAACCGAGTGGGACGAGTATAAGACGCTTGACTATGAGCGGATTTTTGAAAACATGCAGAAACCAGCGTTTGTCTTCGATGGCAGAAATGTTTTTGATGCAGAGAAGCTGAGGAAGATAGGGTTTATTGTTTACTCTATTGGTAAGCCGTTGGACCAGTGGCACATGCCTGCTCTTGCTTAGCTCAGACTCTTTGCCCTTTCTCAAGATTTGGATTGTTTTTCTCTCTGTTGCTTATATCAAATAATTTGTTCTGTTTCTTCTTGACGAGATATTTTCCTATACTTATTATGTTGGTTAGAACAAGAGACTAGGTTTGGTTATTATTGCTAACT GATGTTCAATACTGTTTCCAACAAAAAGATTGTTGTCCTCGAGTTCGCCTTCAAGAAAGACACGAGAGAGACTCCAGCCATTGACGTCTGCAAAGGTTTGTTAGGAGACAAGGCCCGAATCAGCATCTATGATCCACAAGTCACGGAAGAACAAATCCAAAGAGACTTAACCATGAACACATTCGACTGGGACCATCCACTTCACCTCCAACCCATGAGTCCAACCACTGTGAAACAAGTCTCAGTTGCTTGGGACGCTTACGCTGCCACCAAAGACGCCCACGGAATCTGCTTGTTAACCGAGTGGGACGAGTATAAGACGCTTGACTATGAGCGGATTTTTGAAAACATGCAGAAACCAGCGTTTGTCTTCGATGGCAGAAATGTTTTTGATGCAGAGAAGCTGAGGAAGATAGGGTTTATTGTTTACTCTATTGGTAAGCCGTTGGACCAGTGGCACATGCCTGCTCTTGCTTAGCTCAGACTCTTTGCCCTTTCTCAAGATTTGGATTGTTTTTCTCTCTGTTGCTTATATCAAATAATTTGTTCTGTTTCTTCTTGACGAGATATTTTCCTATACTTATTATGTTGGTTAGAACAAGAGACTAGGTTTGGTTATTATTGCTAACT 630 23459830 gi|240255695|ref|NC_003074.8| Arabidopsis thaliana chromosome 3, complete sequence

diff -r 8f9023b30384 -r 72170c3f515a test-data/blastn_arabidopsis.standard.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastn_arabidopsis.standard.tabular Tue Dec 03 10:02:17 2013 -0500

@@ -0,0 +1,1 @@
+chunk_of_plant chrIII 100.00 630 0 0 1 630 4341 4970 0.0 1164

diff -r 8f9023b30384 -r 72170c3f515a test-data/blastn_arabidopsis.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastn_arabidopsis.xml Tue Dec 03 10:02:17 2013 -0500

@@ -0,0 +1,71 @@
+<?xml version="1.0"?>
+<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">
+<BlastOutput>
+  <BlastOutput_program>blastn</BlastOutput_program>
+  <BlastOutput_version>BLASTN 2.2.28+</BlastOutput_version>
+  <BlastOutput_reference>Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb Miller (2000), "A greedy algorithm for aligning DNA sequences", J Comput Biol 2000; 7(1-2):203-14.</BlastOutput_reference>
+  <BlastOutput_db>/mnt/galaxy/galaxy-central/database/files/000/dataset_857_files/blastdb</BlastOutput_db>
+  <BlastOutput_query-ID>Query_1</BlastOutput_query-ID>
+  <BlastOutput_query-def>chunk_of_plant</BlastOutput_query-def>
+  <BlastOutput_query-len>630</BlastOutput_query-len>
+  <BlastOutput_param>
+    <Parameters>
+      <Parameters_expect>0.001</Parameters_expect>
+      <Parameters_sc-match>1</Parameters_sc-match>
+      <Parameters_sc-mismatch>-2</Parameters_sc-mismatch>
+      <Parameters_gap-open>0</Parameters_gap-open>
+      <Parameters_gap-extend>0</Parameters_gap-extend>
+      <Parameters_filter>L;m;</Parameters_filter>
+    </Parameters>
+  </BlastOutput_param>
+<BlastOutput_iterations>
+<Iteration>
+  <Iteration_iter-num>1</Iteration_iter-num>
+  <Iteration_query-ID>Query_1</Iteration_query-ID>
+  <Iteration_query-def>chunk_of_plant</Iteration_query-def>
+  <Iteration_query-len>630</Iteration_query-len>
+<Iteration_hits>
+<Hit>
+  <Hit_num>1</Hit_num>
+  <Hit_id>gnl|BL_ORD_ID|2</Hit_id>
+  <Hit_def>chrIII gi|240255695|ref|NC_003074.8| Arabidopsis thaliana chromosome 3, complete sequence</Hit_def>
+  <Hit_accession>2</Hit_accession>
+  <Hit_len>23459830</Hit_len>
+  <Hit_hsps>
+    <Hsp>
+      <Hsp_num>1</Hsp_num>
+      <Hsp_bit-score>1164.51</Hsp_bit-score>
+      <Hsp_score>630</Hsp_score>
+      <Hsp_evalue>0</Hsp_evalue>
+      <Hsp_query-from>1</Hsp_query-from>
+      <Hsp_query-to>630</Hsp_query-to>
+      <Hsp_hit-from>4341</Hsp_hit-from>
+      <Hsp_hit-to>4970</Hsp_hit-to>
+      <Hsp_query-frame>1</Hsp_query-frame>
+      <Hsp_hit-frame>1</Hsp_hit-frame>
+      <Hsp_identity>630</Hsp_identity>
+      <Hsp_positive>630</Hsp_positive>
+      <Hsp_gaps>0</Hsp_gaps>
+      <Hsp_align-len>630</Hsp_align-len>
+      <Hsp_qseq>GATGTTCAATACTGTTTCCAACAAAAAGATTGTTGTCCTCGAGTTCGCCTTCAAGAAAGACACGAGAGAGACTCCAGCCATTGACGTCTGCAAAGGTTTGTTAGGAGACAAGGCCCGAATCAGCATCTATGATCCACAAGTCACGGAAGAACAAATCCAAAGAGACTTAACCATGAACACATTCGACTGGGACCATCCACTTCACCTCCAACCCATGAGTCCAACCACTGTGAAACAAGTCTCAGTTGCTTGGGACGCTTACGCTGCCACCAAAGACGCCCACGGAATCTGCTTGTTAACCGAGTGGGACGAGTATAAGACGCTTGACTATGAGCGGATTTTTGAAAACATGCAGAAACCAGCGTTTGTCTTCGATGGCAGAAATGTTTTTGATGCAGAGAAGCTGAGGAAGATAGGGTTTATTGTTTACTCTATTGGTAAGCCGTTGGACCAGTGGCACATGCCTGCTCTTGCTTAGCTCAGACTCTTTGCCCTTTCTCAAGATTTGGATTGTTTTTCTCTCTGTTGCTTATATCAAATAATTTGTTCTGTTTCTTCTTGACGAGATATTTTCCTATACTTATTATGTTGGTTAGAACAAGAGACTAGGTTTGGTTATTATTGCTAACT</Hsp_qseq>
+      <Hsp_hseq>GATGTTCAATACTGTTTCCAACAAAAAGATTGTTGTCCTCGAGTTCGCCTTCAAGAAAGACACGAGAGAGACTCCAGCCATTGACGTCTGCAAAGGTTTGTTAGGAGACAAGGCCCGAATCAGCATCTATGATCCACAAGTCACGGAAGAACAAATCCAAAGAGACTTAACCATGAACACATTCGACTGGGACCATCCACTTCACCTCCAACCCATGAGTCCAACCACTGTGAAACAAGTCTCAGTTGCTTGGGACGCTTACGCTGCCACCAAAGACGCCCACGGAATCTGCTTGTTAACCGAGTGGGACGAGTATAAGACGCTTGACTATGAGCGGATTTTTGAAAACATGCAGAAACCAGCGTTTGTCTTCGATGGCAGAAATGTTTTTGATGCAGAGAAGCTGAGGAAGATAGGGTTTATTGTTTACTCTATTGGTAAGCCGTTGGACCAGTGGCACATGCCTGCTCTTGCTTAGCTCAGACTCTTTGCCCTTTCTCAAGATTTGGATTGTTTTTCTCTCTGTTGCTTATATCAAATAATTTGTTCTGTTTCTTCTTGACGAGATATTTTCCTATACTTATTATGTTGGTTAGAACAAGAGACTAGGTTTGGTTATTATTGCTAACT</Hsp_hseq>
+      <Hsp_midline>||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||</Hsp_midline>
+    </Hsp>
+  </Hit_hsps>
+</Hit>
+</Iteration_hits>
+  <Iteration_stat>
+    <Statistics>
+      <Statistics_db-num>5</Statistics_db-num>
+      <Statistics_db-len>119146348</Statistics_db-len>
+      <Statistics_hsp-len>26</Statistics_hsp-len>
+      <Statistics_eff-space>71964315672</Statistics_eff-space>
+      <Statistics_kappa>0.46</Statistics_kappa>
+      <Statistics_lambda>1.28</Statistics_lambda>
+      <Statistics_entropy>0.85</Statistics_entropy>
+    </Statistics>
+  </Iteration_stat>
+</Iteration>
+</BlastOutput_iterations>
+</BlastOutput>
+

diff -r 8f9023b30384 -r 72170c3f515a tools/ncbi_blast_plus/blastxml_to_tabular.py
--- a/tools/ncbi_blast_plus/blastxml_to_tabular.py Mon Dec 02 10:27:05 2013 -0500
+++ b/tools/ncbi_blast_plus/blastxml_to_tabular.py Tue Dec 03 10:02:17 2013 -0500

[

@@ -158,6 +158,11 @@
             # <Hit_accession>Subject_1</Hit_accession>
             #
             #apparently depending on the parse_deflines switch
+            #
+            #Or, with BLAST 2.2.28+ can get this,
+            # <Hit_id>gnl|BL_ORD_ID|2</Hit_id>
+            # <Hit_def>chrIII gi|240255695|ref|NC_003074.8| Arabidopsis thaliana chromosome 3, complete sequence</Hit_def>
+            # <Hit_accession>2</Hit_accession>
             sseqid = hit.findtext("Hit_id").split(None,1)[0]
             hit_def = sseqid + " " + hit.findtext("Hit_def")
             if re_default_subject_id.match(sseqid) \
@@ -165,6 +170,11 @@
                 #Place holder ID, take the first word of the subject definition
                 hit_def = hit.findtext("Hit_def")
                 sseqid = hit_def.split(None,1)[0]
+            if sseqid.startswith("gnl|BL_ORD_ID|") \
+            and sseqid == "gnl|BL_ORD_ID|" + hit.findtext("Hit_accession"):
+                #Alternative place holder ID, again take the first word of hit_def
+                hit_def = hit.findtext("Hit_def")
+                sseqid = hit_def.split(None,1)[0]
             # for every <Hsp> within <Hit>
             for hsp in hit.findall("Hit_hsps/Hsp"):
                 nident = hsp.findtext("Hsp_identity")

diff -r 8f9023b30384 -r 72170c3f515a tools/ncbi_blast_plus/blastxml_to_tabular.xml
--- a/tools/ncbi_blast_plus/blastxml_to_tabular.xml Mon Dec 02 10:27:05 2013 -0500
+++ b/tools/ncbi_blast_plus/blastxml_to_tabular.xml Tue Dec 03 10:02:17 2013 -0500

@@ -70,6 +70,16 @@
             
             <output name="tabular_file" file="blastp_human_vs_pdb_seg_no_converted_ext.tabular" ftype="tabular" />
         </test>
+        <test>
+            <param name="blastxml_file" value="blastn_arabidopsis.xml" ftype="blastxml" />
+            <param name="out_format" value="std" />
+            <output name="tabular_file" file="blastn_arabidopsis.standard.tabular" ftype="tabular" />
+        </test>
+        <test>
+            <param name="blastxml_file" value="blastn_arabidopsis.xml" ftype="blastxml" />
+            <param name="out_format" value="ext" />
+            <output name="tabular_file" file="blastn_arabidopsis.extended.tabular" ftype="tabular" />
+        </test>
     </tests>
     <help>

@@ -122,7 +132,7 @@
     22 sseq          Aligned part of subject sequence
     23 qlen          Query sequence length
     24 slen          Subject sequence length
-    25 salltitles    All subject titles, separated by '&lt;&gt;'
+    25 salltitles    All subject title(s), separated by '&lt;&gt;'
====== ============= ===========================================

Beware that the XML file (and thus the conversion) and the tabular output

diff -r 8f9023b30384 -r 72170c3f515a tools/ncbi_blast_plus/ncbi_macros.xml
--- a/tools/ncbi_blast_plus/ncbi_macros.xml Mon Dec 02 10:27:05 2013 -0500
+++ b/tools/ncbi_blast_plus/ncbi_macros.xml Tue Dec 03 10:02:17 2013 -0500

@@ -13,7 +13,7 @@
     <xml name="input_out_format">
         <param name="out_format" type="select" label="Output format">
             <option value="6">Tabular (standard 12 columns)</option>
-            <option value="ext" selected="True">Tabular (extended 24 columns)</option>
+            <option value="ext" selected="True">Tabular (extended 25 columns)</option>
             <option value="5">BLAST XML</option>
             <option value="0">Pairwise text</option>
             <option value="0 -html">Pairwise HTML</option>
@@ -351,7 +351,7 @@
     22 sseq          Aligned part of subject sequence
     23 qlen          Query sequence length
     24 slen          Subject sequence length
-    25 salltitles    All subject titles, separated by '&lt;&gt;'
+    25 salltitles    All subject title(s), separated by '&lt;&gt;'
====== ============= ===========================================

The third option is BLAST XML output, which is designed to be parsed by