# HG changeset patch
# User peterjc
# Date 1386082937 18000
# Node ID 72170c3f515ae4a358c2dc1137ab0f12b3617135
# Parent 8f9023b30384a542365a2feac4dcb99d7bc39b9c
Uploaded v0.0.22f, fix for new style dummy IDs in BLAST XML output
diff -r 8f9023b30384 -r 72170c3f515a test-data/blastn_arabidopsis.extended.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastn_arabidopsis.extended.tabular Tue Dec 03 10:02:17 2013 -0500
@@ -0,0 +1,1 @@
+chunk_of_plant chrIII 100.00 630 0 0 1 630 4341 4970 0.0 1164 chrIII 630 630 630 0 100.00 1 1 GATGTTCAATACTGTTTCCAACAAAAAGATTGTTGTCCTCGAGTTCGCCTTCAAGAAAGACACGAGAGAGACTCCAGCCATTGACGTCTGCAAAGGTTTGTTAGGAGACAAGGCCCGAATCAGCATCTATGATCCACAAGTCACGGAAGAACAAATCCAAAGAGACTTAACCATGAACACATTCGACTGGGACCATCCACTTCACCTCCAACCCATGAGTCCAACCACTGTGAAACAAGTCTCAGTTGCTTGGGACGCTTACGCTGCCACCAAAGACGCCCACGGAATCTGCTTGTTAACCGAGTGGGACGAGTATAAGACGCTTGACTATGAGCGGATTTTTGAAAACATGCAGAAACCAGCGTTTGTCTTCGATGGCAGAAATGTTTTTGATGCAGAGAAGCTGAGGAAGATAGGGTTTATTGTTTACTCTATTGGTAAGCCGTTGGACCAGTGGCACATGCCTGCTCTTGCTTAGCTCAGACTCTTTGCCCTTTCTCAAGATTTGGATTGTTTTTCTCTCTGTTGCTTATATCAAATAATTTGTTCTGTTTCTTCTTGACGAGATATTTTCCTATACTTATTATGTTGGTTAGAACAAGAGACTAGGTTTGGTTATTATTGCTAACT GATGTTCAATACTGTTTCCAACAAAAAGATTGTTGTCCTCGAGTTCGCCTTCAAGAAAGACACGAGAGAGACTCCAGCCATTGACGTCTGCAAAGGTTTGTTAGGAGACAAGGCCCGAATCAGCATCTATGATCCACAAGTCACGGAAGAACAAATCCAAAGAGACTTAACCATGAACACATTCGACTGGGACCATCCACTTCACCTCCAACCCATGAGTCCAACCACTGTGAAACAAGTCTCAGTTGCTTGGGACGCTTACGCTGCCACCAAAGACGCCCACGGAATCTGCTTGTTAACCGAGTGGGACGAGTATAAGACGCTTGACTATGAGCGGATTTTTGAAAACATGCAGAAACCAGCGTTTGTCTTCGATGGCAGAAATGTTTTTGATGCAGAGAAGCTGAGGAAGATAGGGTTTATTGTTTACTCTATTGGTAAGCCGTTGGACCAGTGGCACATGCCTGCTCTTGCTTAGCTCAGACTCTTTGCCCTTTCTCAAGATTTGGATTGTTTTTCTCTCTGTTGCTTATATCAAATAATTTGTTCTGTTTCTTCTTGACGAGATATTTTCCTATACTTATTATGTTGGTTAGAACAAGAGACTAGGTTTGGTTATTATTGCTAACT 630 23459830 gi|240255695|ref|NC_003074.8| Arabidopsis thaliana chromosome 3, complete sequence
diff -r 8f9023b30384 -r 72170c3f515a test-data/blastn_arabidopsis.standard.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastn_arabidopsis.standard.tabular Tue Dec 03 10:02:17 2013 -0500
@@ -0,0 +1,1 @@
+chunk_of_plant chrIII 100.00 630 0 0 1 630 4341 4970 0.0 1164
diff -r 8f9023b30384 -r 72170c3f515a test-data/blastn_arabidopsis.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastn_arabidopsis.xml Tue Dec 03 10:02:17 2013 -0500
@@ -0,0 +1,71 @@
+
+
+
+ blastn
+ BLASTN 2.2.28+
+ Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb Miller (2000), "A greedy algorithm for aligning DNA sequences", J Comput Biol 2000; 7(1-2):203-14.
+ /mnt/galaxy/galaxy-central/database/files/000/dataset_857_files/blastdb
+ Query_1
+ chunk_of_plant
+ 630
+
+
+ 0.001
+ 1
+ -2
+ 0
+ 0
+ L;m;
+
+
+
+
+ 1
+ Query_1
+ chunk_of_plant
+ 630
+
+
+ 1
+ gnl|BL_ORD_ID|2
+ chrIII gi|240255695|ref|NC_003074.8| Arabidopsis thaliana chromosome 3, complete sequence
+ 2
+ 23459830
+
+
+ 1
+ 1164.51
+ 630
+ 0
+ 1
+ 630
+ 4341
+ 4970
+ 1
+ 1
+ 630
+ 630
+ 0
+ 630
+ GATGTTCAATACTGTTTCCAACAAAAAGATTGTTGTCCTCGAGTTCGCCTTCAAGAAAGACACGAGAGAGACTCCAGCCATTGACGTCTGCAAAGGTTTGTTAGGAGACAAGGCCCGAATCAGCATCTATGATCCACAAGTCACGGAAGAACAAATCCAAAGAGACTTAACCATGAACACATTCGACTGGGACCATCCACTTCACCTCCAACCCATGAGTCCAACCACTGTGAAACAAGTCTCAGTTGCTTGGGACGCTTACGCTGCCACCAAAGACGCCCACGGAATCTGCTTGTTAACCGAGTGGGACGAGTATAAGACGCTTGACTATGAGCGGATTTTTGAAAACATGCAGAAACCAGCGTTTGTCTTCGATGGCAGAAATGTTTTTGATGCAGAGAAGCTGAGGAAGATAGGGTTTATTGTTTACTCTATTGGTAAGCCGTTGGACCAGTGGCACATGCCTGCTCTTGCTTAGCTCAGACTCTTTGCCCTTTCTCAAGATTTGGATTGTTTTTCTCTCTGTTGCTTATATCAAATAATTTGTTCTGTTTCTTCTTGACGAGATATTTTCCTATACTTATTATGTTGGTTAGAACAAGAGACTAGGTTTGGTTATTATTGCTAACT
+ GATGTTCAATACTGTTTCCAACAAAAAGATTGTTGTCCTCGAGTTCGCCTTCAAGAAAGACACGAGAGAGACTCCAGCCATTGACGTCTGCAAAGGTTTGTTAGGAGACAAGGCCCGAATCAGCATCTATGATCCACAAGTCACGGAAGAACAAATCCAAAGAGACTTAACCATGAACACATTCGACTGGGACCATCCACTTCACCTCCAACCCATGAGTCCAACCACTGTGAAACAAGTCTCAGTTGCTTGGGACGCTTACGCTGCCACCAAAGACGCCCACGGAATCTGCTTGTTAACCGAGTGGGACGAGTATAAGACGCTTGACTATGAGCGGATTTTTGAAAACATGCAGAAACCAGCGTTTGTCTTCGATGGCAGAAATGTTTTTGATGCAGAGAAGCTGAGGAAGATAGGGTTTATTGTTTACTCTATTGGTAAGCCGTTGGACCAGTGGCACATGCCTGCTCTTGCTTAGCTCAGACTCTTTGCCCTTTCTCAAGATTTGGATTGTTTTTCTCTCTGTTGCTTATATCAAATAATTTGTTCTGTTTCTTCTTGACGAGATATTTTCCTATACTTATTATGTTGGTTAGAACAAGAGACTAGGTTTGGTTATTATTGCTAACT
+ ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
+
+
+
+
+
+
+ 5
+ 119146348
+ 26
+ 71964315672
+ 0.46
+ 1.28
+ 0.85
+
+
+
+
+
+
diff -r 8f9023b30384 -r 72170c3f515a tools/ncbi_blast_plus/blastxml_to_tabular.py
--- a/tools/ncbi_blast_plus/blastxml_to_tabular.py Mon Dec 02 10:27:05 2013 -0500
+++ b/tools/ncbi_blast_plus/blastxml_to_tabular.py Tue Dec 03 10:02:17 2013 -0500
@@ -158,6 +158,11 @@
# Subject_1
#
#apparently depending on the parse_deflines switch
+ #
+ #Or, with BLAST 2.2.28+ can get this,
+ # <Hit_id>gnl|BL_ORD_ID|2</Hit_id>
+ # <Hit_def>chrIII gi|240255695|ref|NC_003074.8| Arabidopsis thaliana chromosome 3, complete sequence</Hit_def>
+ # <Hit_accession>2</Hit_accession>
sseqid = hit.findtext("Hit_id").split(None,1)[0]
hit_def = sseqid + " " + hit.findtext("Hit_def")
if re_default_subject_id.match(sseqid) \
@@ -165,6 +170,11 @@
#Place holder ID, take the first word of the subject definition
hit_def = hit.findtext("Hit_def")
sseqid = hit_def.split(None,1)[0]
+ if sseqid.startswith("gnl|BL_ORD_ID|") \
+ and sseqid == "gnl|BL_ORD_ID|" + hit.findtext("Hit_accession"):
+ #Alternative place holder ID, again take the first word of hit_def
+ hit_def = hit.findtext("Hit_def")
+ sseqid = hit_def.split(None,1)[0]
# for every <Hsp> within <Hit>
for hsp in hit.findall("Hit_hsps/Hsp"):
nident = hsp.findtext("Hsp_identity")
diff -r 8f9023b30384 -r 72170c3f515a tools/ncbi_blast_plus/blastxml_to_tabular.xml
--- a/tools/ncbi_blast_plus/blastxml_to_tabular.xml Mon Dec 02 10:27:05 2013 -0500
+++ b/tools/ncbi_blast_plus/blastxml_to_tabular.xml Tue Dec 03 10:02:17 2013 -0500
@@ -70,6 +70,16 @@
+
+
+
+
+
+
+
+
+
+
@@ -122,7 +132,7 @@
22 sseq Aligned part of subject sequence
23 qlen Query sequence length
24 slen Subject sequence length
- 25 salltitles All subject titles, separated by '<>'
+ 25 salltitles All subject title(s), separated by '<>'
====== ============= ===========================================
Beware that the XML file (and thus the conversion) and the tabular output
diff -r 8f9023b30384 -r 72170c3f515a tools/ncbi_blast_plus/ncbi_macros.xml
--- a/tools/ncbi_blast_plus/ncbi_macros.xml Mon Dec 02 10:27:05 2013 -0500
+++ b/tools/ncbi_blast_plus/ncbi_macros.xml Tue Dec 03 10:02:17 2013 -0500
@@ -13,7 +13,7 @@
-
+
@@ -351,7 +351,7 @@
22 sseq Aligned part of subject sequence
23 qlen Query sequence length
24 slen Subject sequence length
- 25 salltitles All subject titles, separated by '<>'
+ 25 salltitles All subject title(s), separated by '<>'
====== ============= ===========================================
The third option is BLAST XML output, which is designed to be parsed by