changeset 37:72170c3f515a draft

Uploaded v0.0.22f, fix for new style dummy IDs in BLAST XML output
author peterjc
date Tue, 03 Dec 2013 10:02:17 -0500
parents 8f9023b30384
children 2f7fac29bb3c
files test-data/blastn_arabidopsis.extended.tabular test-data/blastn_arabidopsis.standard.tabular test-data/blastn_arabidopsis.xml tools/ncbi_blast_plus/blastxml_to_tabular.py tools/ncbi_blast_plus/blastxml_to_tabular.xml tools/ncbi_blast_plus/ncbi_macros.xml
diffstat 6 files changed, 96 insertions(+), 3 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastn_arabidopsis.extended.tabular	Tue Dec 03 10:02:17 2013 -0500
@@ -0,0 +1,1 @@
+chunk_of_plant	chrIII	100.00	630	0	0	1	630	4341	4970	0.0	1164	chrIII	630	630	630	0	100.00	1	1	GATGTTCAATACTGTTTCCAACAAAAAGATTGTTGTCCTCGAGTTCGCCTTCAAGAAAGACACGAGAGAGACTCCAGCCATTGACGTCTGCAAAGGTTTGTTAGGAGACAAGGCCCGAATCAGCATCTATGATCCACAAGTCACGGAAGAACAAATCCAAAGAGACTTAACCATGAACACATTCGACTGGGACCATCCACTTCACCTCCAACCCATGAGTCCAACCACTGTGAAACAAGTCTCAGTTGCTTGGGACGCTTACGCTGCCACCAAAGACGCCCACGGAATCTGCTTGTTAACCGAGTGGGACGAGTATAAGACGCTTGACTATGAGCGGATTTTTGAAAACATGCAGAAACCAGCGTTTGTCTTCGATGGCAGAAATGTTTTTGATGCAGAGAAGCTGAGGAAGATAGGGTTTATTGTTTACTCTATTGGTAAGCCGTTGGACCAGTGGCACATGCCTGCTCTTGCTTAGCTCAGACTCTTTGCCCTTTCTCAAGATTTGGATTGTTTTTCTCTCTGTTGCTTATATCAAATAATTTGTTCTGTTTCTTCTTGACGAGATATTTTCCTATACTTATTATGTTGGTTAGAACAAGAGACTAGGTTTGGTTATTATTGCTAACT	GATGTTCAATACTGTTTCCAACAAAAAGATTGTTGTCCTCGAGTTCGCCTTCAAGAAAGACACGAGAGAGACTCCAGCCATTGACGTCTGCAAAGGTTTGTTAGGAGACAAGGCCCGAATCAGCATCTATGATCCACAAGTCACGGAAGAACAAATCCAAAGAGACTTAACCATGAACACATTCGACTGGGACCATCCACTTCACCTCCAACCCATGAGTCCAACCACTGTGAAACAAGTCTCAGTTGCTTGGGACGCTTACGCTGCCACCAAAGACGCCCACGGAATCTGCTTGTTAACCGAGTGGGACGAGTATAAGACGCTTGACTATGAGCGGATTTTTGAAAACATGCAGAAACCAGCGTTTGTCTTCGATGGCAGAAATGTTTTTGATGCAGAGAAGCTGAGGAAGATAGGGTTTATTGTTTACTCTATTGGTAAGCCGTTGGACCAGTGGCACATGCCTGCTCTTGCTTAGCTCAGACTCTTTGCCCTTTCTCAAGATTTGGATTGTTTTTCTCTCTGTTGCTTATATCAAATAATTTGTTCTGTTTCTTCTTGACGAGATATTTTCCTATACTTATTATGTTGGTTAGAACAAGAGACTAGGTTTGGTTATTATTGCTAACT	630	23459830	gi|240255695|ref|NC_003074.8| Arabidopsis thaliana chromosome 3, complete sequence
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastn_arabidopsis.standard.tabular	Tue Dec 03 10:02:17 2013 -0500
@@ -0,0 +1,1 @@
+chunk_of_plant	chrIII	100.00	630	0	0	1	630	4341	4970	0.0	1164
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastn_arabidopsis.xml	Tue Dec 03 10:02:17 2013 -0500
@@ -0,0 +1,71 @@
+<?xml version="1.0"?>
+<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">
+<BlastOutput>
+  <BlastOutput_program>blastn</BlastOutput_program>
+  <BlastOutput_version>BLASTN 2.2.28+</BlastOutput_version>
+  <BlastOutput_reference>Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb Miller (2000), &quot;A greedy algorithm for aligning DNA sequences&quot;, J Comput Biol 2000; 7(1-2):203-14.</BlastOutput_reference>
+  <BlastOutput_db>/mnt/galaxy/galaxy-central/database/files/000/dataset_857_files/blastdb</BlastOutput_db>
+  <BlastOutput_query-ID>Query_1</BlastOutput_query-ID>
+  <BlastOutput_query-def>chunk_of_plant</BlastOutput_query-def>
+  <BlastOutput_query-len>630</BlastOutput_query-len>
+  <BlastOutput_param>
+    <Parameters>
+      <Parameters_expect>0.001</Parameters_expect>
+      <Parameters_sc-match>1</Parameters_sc-match>
+      <Parameters_sc-mismatch>-2</Parameters_sc-mismatch>
+      <Parameters_gap-open>0</Parameters_gap-open>
+      <Parameters_gap-extend>0</Parameters_gap-extend>
+      <Parameters_filter>L;m;</Parameters_filter>
+    </Parameters>
+  </BlastOutput_param>
+<BlastOutput_iterations>
+<Iteration>
+  <Iteration_iter-num>1</Iteration_iter-num>
+  <Iteration_query-ID>Query_1</Iteration_query-ID>
+  <Iteration_query-def>chunk_of_plant</Iteration_query-def>
+  <Iteration_query-len>630</Iteration_query-len>
+<Iteration_hits>
+<Hit>
+  <Hit_num>1</Hit_num>
+  <Hit_id>gnl|BL_ORD_ID|2</Hit_id>
+  <Hit_def>chrIII gi|240255695|ref|NC_003074.8| Arabidopsis thaliana chromosome 3, complete sequence</Hit_def>
+  <Hit_accession>2</Hit_accession>
+  <Hit_len>23459830</Hit_len>
+  <Hit_hsps>
+    <Hsp>
+      <Hsp_num>1</Hsp_num>
+      <Hsp_bit-score>1164.51</Hsp_bit-score>
+      <Hsp_score>630</Hsp_score>
+      <Hsp_evalue>0</Hsp_evalue>
+      <Hsp_query-from>1</Hsp_query-from>
+      <Hsp_query-to>630</Hsp_query-to>
+      <Hsp_hit-from>4341</Hsp_hit-from>
+      <Hsp_hit-to>4970</Hsp_hit-to>
+      <Hsp_query-frame>1</Hsp_query-frame>
+      <Hsp_hit-frame>1</Hsp_hit-frame>
+      <Hsp_identity>630</Hsp_identity>
+      <Hsp_positive>630</Hsp_positive>
+      <Hsp_gaps>0</Hsp_gaps>
+      <Hsp_align-len>630</Hsp_align-len>
+      <Hsp_qseq>GATGTTCAATACTGTTTCCAACAAAAAGATTGTTGTCCTCGAGTTCGCCTTCAAGAAAGACACGAGAGAGACTCCAGCCATTGACGTCTGCAAAGGTTTGTTAGGAGACAAGGCCCGAATCAGCATCTATGATCCACAAGTCACGGAAGAACAAATCCAAAGAGACTTAACCATGAACACATTCGACTGGGACCATCCACTTCACCTCCAACCCATGAGTCCAACCACTGTGAAACAAGTCTCAGTTGCTTGGGACGCTTACGCTGCCACCAAAGACGCCCACGGAATCTGCTTGTTAACCGAGTGGGACGAGTATAAGACGCTTGACTATGAGCGGATTTTTGAAAACATGCAGAAACCAGCGTTTGTCTTCGATGGCAGAAATGTTTTTGATGCAGAGAAGCTGAGGAAGATAGGGTTTATTGTTTACTCTATTGGTAAGCCGTTGGACCAGTGGCACATGCCTGCTCTTGCTTAGCTCAGACTCTTTGCCCTTTCTCAAGATTTGGATTGTTTTTCTCTCTGTTGCTTATATCAAATAATTTGTTCTGTTTCTTCTTGACGAGATATTTTCCTATACTTATTATGTTGGTTAGAACAAGAGACTAGGTTTGGTTATTATTGCTAACT</Hsp_qseq>
+      <Hsp_hseq>GATGTTCAATACTGTTTCCAACAAAAAGATTGTTGTCCTCGAGTTCGCCTTCAAGAAAGACACGAGAGAGACTCCAGCCATTGACGTCTGCAAAGGTTTGTTAGGAGACAAGGCCCGAATCAGCATCTATGATCCACAAGTCACGGAAGAACAAATCCAAAGAGACTTAACCATGAACACATTCGACTGGGACCATCCACTTCACCTCCAACCCATGAGTCCAACCACTGTGAAACAAGTCTCAGTTGCTTGGGACGCTTACGCTGCCACCAAAGACGCCCACGGAATCTGCTTGTTAACCGAGTGGGACGAGTATAAGACGCTTGACTATGAGCGGATTTTTGAAAACATGCAGAAACCAGCGTTTGTCTTCGATGGCAGAAATGTTTTTGATGCAGAGAAGCTGAGGAAGATAGGGTTTATTGTTTACTCTATTGGTAAGCCGTTGGACCAGTGGCACATGCCTGCTCTTGCTTAGCTCAGACTCTTTGCCCTTTCTCAAGATTTGGATTGTTTTTCTCTCTGTTGCTTATATCAAATAATTTGTTCTGTTTCTTCTTGACGAGATATTTTCCTATACTTATTATGTTGGTTAGAACAAGAGACTAGGTTTGGTTATTATTGCTAACT</Hsp_hseq>
+      <Hsp_midline>||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||</Hsp_midline>
+    </Hsp>
+  </Hit_hsps>
+</Hit>
+</Iteration_hits>
+  <Iteration_stat>
+    <Statistics>
+      <Statistics_db-num>5</Statistics_db-num>
+      <Statistics_db-len>119146348</Statistics_db-len>
+      <Statistics_hsp-len>26</Statistics_hsp-len>
+      <Statistics_eff-space>71964315672</Statistics_eff-space>
+      <Statistics_kappa>0.46</Statistics_kappa>
+      <Statistics_lambda>1.28</Statistics_lambda>
+      <Statistics_entropy>0.85</Statistics_entropy>
+    </Statistics>
+  </Iteration_stat>
+</Iteration>
+</BlastOutput_iterations>
+</BlastOutput>
+
--- a/tools/ncbi_blast_plus/blastxml_to_tabular.py	Mon Dec 02 10:27:05 2013 -0500
+++ b/tools/ncbi_blast_plus/blastxml_to_tabular.py	Tue Dec 03 10:02:17 2013 -0500
@@ -158,6 +158,11 @@
             # <Hit_accession>Subject_1</Hit_accession>
             #
             #apparently depending on the parse_deflines switch
+            #
+            #Or, with BLAST 2.2.28+ you can get this:
+            # <Hit_id>gnl|BL_ORD_ID|2</Hit_id>
+            # <Hit_def>chrIII gi|240255695|ref|NC_003074.8| Arabidopsis thaliana chromosome 3, complete sequence</Hit_def>
+            # <Hit_accession>2</Hit_accession>
             sseqid = hit.findtext("Hit_id").split(None,1)[0]
             hit_def = sseqid + " " + hit.findtext("Hit_def")
             if re_default_subject_id.match(sseqid) \
@@ -165,6 +170,11 @@
                 #Place holder ID, take the first word of the subject definition
                 hit_def = hit.findtext("Hit_def")
                 sseqid = hit_def.split(None,1)[0]
+            if sseqid.startswith("gnl|BL_ORD_ID|") \
+            and sseqid == "gnl|BL_ORD_ID|" + hit.findtext("Hit_accession"):
+                #Alternative place holder ID ("gnl|BL_ORD_ID|" followed by the Hit_accession), again take the first word of hit_def
+                hit_def = hit.findtext("Hit_def")
+                sseqid = hit_def.split(None,1)[0]
             # for every <Hsp> within <Hit>
             for hsp in hit.findall("Hit_hsps/Hsp"):
                 nident = hsp.findtext("Hsp_identity")
--- a/tools/ncbi_blast_plus/blastxml_to_tabular.xml	Mon Dec 02 10:27:05 2013 -0500
+++ b/tools/ncbi_blast_plus/blastxml_to_tabular.xml	Tue Dec 03 10:02:17 2013 -0500
@@ -70,6 +70,16 @@
             <!-- Note this has some white space differences from the actual blastp output -->
             <output name="tabular_file" file="blastp_human_vs_pdb_seg_no_converted_ext.tabular" ftype="tabular" />
         </test>
+        <test>
+            <param name="blastxml_file" value="blastn_arabidopsis.xml" ftype="blastxml" />
+            <param name="out_format" value="std" />
+            <output name="tabular_file" file="blastn_arabidopsis.standard.tabular" ftype="tabular" />
+        </test>
+        <test>
+            <param name="blastxml_file" value="blastn_arabidopsis.xml" ftype="blastxml" />
+            <param name="out_format" value="ext" />
+            <output name="tabular_file" file="blastn_arabidopsis.extended.tabular" ftype="tabular" />
+        </test>
     </tests>
     <help>
     
@@ -122,7 +132,7 @@
     22 sseq          Aligned part of subject sequence
     23 qlen          Query sequence length
     24 slen          Subject sequence length
-    25 salltitles    All subject titles, separated by '&lt;&gt;'
+    25 salltitles    All subject title(s), separated by '&lt;&gt;'
 ====== ============= ===========================================
 
 Beware that the XML file (and thus the conversion) and the tabular output
--- a/tools/ncbi_blast_plus/ncbi_macros.xml	Mon Dec 02 10:27:05 2013 -0500
+++ b/tools/ncbi_blast_plus/ncbi_macros.xml	Tue Dec 03 10:02:17 2013 -0500
@@ -13,7 +13,7 @@
     <xml name="input_out_format">
         <param name="out_format" type="select" label="Output format">
             <option value="6">Tabular (standard 12 columns)</option>
-            <option value="ext" selected="True">Tabular (extended 24 columns)</option>
+            <option value="ext" selected="True">Tabular (extended 25 columns)</option>
             <option value="5">BLAST XML</option>
             <option value="0">Pairwise text</option>
             <option value="0 -html">Pairwise HTML</option>
@@ -351,7 +351,7 @@
     22 sseq          Aligned part of subject sequence
     23 qlen          Query sequence length
     24 slen          Subject sequence length
-    25 salltitles    All subject titles, separated by '&lt;&gt;'
+    25 salltitles    All subject title(s), separated by '&lt;&gt;'
 ====== ============= ===========================================
 
 The third option is BLAST XML output, which is designed to be parsed by