Previous changeset 36:8f9023b30384 (2013-12-02) Next changeset 38:2f7fac29bb3c (2014-01-15) |
Commit message:
Uploaded v0.0.22f, fix for new style dummy IDs in BLAST XML output |
modified:
tools/ncbi_blast_plus/blastxml_to_tabular.py tools/ncbi_blast_plus/blastxml_to_tabular.xml tools/ncbi_blast_plus/ncbi_macros.xml |
added:
test-data/blastn_arabidopsis.extended.tabular test-data/blastn_arabidopsis.standard.tabular test-data/blastn_arabidopsis.xml |
b |
diff -r 8f9023b30384 -r 72170c3f515a test-data/blastn_arabidopsis.extended.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/blastn_arabidopsis.extended.tabular Tue Dec 03 10:02:17 2013 -0500 |
b |
@@ -0,0 +1,1 @@ +chunk_of_plant chrIII 100.00 630 0 0 1 630 4341 4970 0.0 1164 chrIII 630 630 630 0 100.00 1 1 GATGTTCAATACTGTTTCCAACAAAAAGATTGTTGTCCTCGAGTTCGCCTTCAAGAAAGACACGAGAGAGACTCCAGCCATTGACGTCTGCAAAGGTTTGTTAGGAGACAAGGCCCGAATCAGCATCTATGATCCACAAGTCACGGAAGAACAAATCCAAAGAGACTTAACCATGAACACATTCGACTGGGACCATCCACTTCACCTCCAACCCATGAGTCCAACCACTGTGAAACAAGTCTCAGTTGCTTGGGACGCTTACGCTGCCACCAAAGACGCCCACGGAATCTGCTTGTTAACCGAGTGGGACGAGTATAAGACGCTTGACTATGAGCGGATTTTTGAAAACATGCAGAAACCAGCGTTTGTCTTCGATGGCAGAAATGTTTTTGATGCAGAGAAGCTGAGGAAGATAGGGTTTATTGTTTACTCTATTGGTAAGCCGTTGGACCAGTGGCACATGCCTGCTCTTGCTTAGCTCAGACTCTTTGCCCTTTCTCAAGATTTGGATTGTTTTTCTCTCTGTTGCTTATATCAAATAATTTGTTCTGTTTCTTCTTGACGAGATATTTTCCTATACTTATTATGTTGGTTAGAACAAGAGACTAGGTTTGGTTATTATTGCTAACT GATGTTCAATACTGTTTCCAACAAAAAGATTGTTGTCCTCGAGTTCGCCTTCAAGAAAGACACGAGAGAGACTCCAGCCATTGACGTCTGCAAAGGTTTGTTAGGAGACAAGGCCCGAATCAGCATCTATGATCCACAAGTCACGGAAGAACAAATCCAAAGAGACTTAACCATGAACACATTCGACTGGGACCATCCACTTCACCTCCAACCCATGAGTCCAACCACTGTGAAACAAGTCTCAGTTGCTTGGGACGCTTACGCTGCCACCAAAGACGCCCACGGAATCTGCTTGTTAACCGAGTGGGACGAGTATAAGACGCTTGACTATGAGCGGATTTTTGAAAACATGCAGAAACCAGCGTTTGTCTTCGATGGCAGAAATGTTTTTGATGCAGAGAAGCTGAGGAAGATAGGGTTTATTGTTTACTCTATTGGTAAGCCGTTGGACCAGTGGCACATGCCTGCTCTTGCTTAGCTCAGACTCTTTGCCCTTTCTCAAGATTTGGATTGTTTTTCTCTCTGTTGCTTATATCAAATAATTTGTTCTGTTTCTTCTTGACGAGATATTTTCCTATACTTATTATGTTGGTTAGAACAAGAGACTAGGTTTGGTTATTATTGCTAACT 630 23459830 gi|240255695|ref|NC_003074.8| Arabidopsis thaliana chromosome 3, complete sequence |
b |
diff -r 8f9023b30384 -r 72170c3f515a test-data/blastn_arabidopsis.standard.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/blastn_arabidopsis.standard.tabular Tue Dec 03 10:02:17 2013 -0500 |
b |
@@ -0,0 +1,1 @@ +chunk_of_plant chrIII 100.00 630 0 0 1 630 4341 4970 0.0 1164 |
b |
diff -r 8f9023b30384 -r 72170c3f515a test-data/blastn_arabidopsis.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/blastn_arabidopsis.xml Tue Dec 03 10:02:17 2013 -0500 |
b |
@@ -0,0 +1,71 @@ +<?xml version="1.0"?> +<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd"> +<BlastOutput> + <BlastOutput_program>blastn</BlastOutput_program> + <BlastOutput_version>BLASTN 2.2.28+</BlastOutput_version> + <BlastOutput_reference>Zheng Zhang, Scott Schwartz, Lukas Wagner, and Webb Miller (2000), "A greedy algorithm for aligning DNA sequences", J Comput Biol 2000; 7(1-2):203-14.</BlastOutput_reference> + <BlastOutput_db>/mnt/galaxy/galaxy-central/database/files/000/dataset_857_files/blastdb</BlastOutput_db> + <BlastOutput_query-ID>Query_1</BlastOutput_query-ID> + <BlastOutput_query-def>chunk_of_plant</BlastOutput_query-def> + <BlastOutput_query-len>630</BlastOutput_query-len> + <BlastOutput_param> + <Parameters> + <Parameters_expect>0.001</Parameters_expect> + <Parameters_sc-match>1</Parameters_sc-match> + <Parameters_sc-mismatch>-2</Parameters_sc-mismatch> + <Parameters_gap-open>0</Parameters_gap-open> + <Parameters_gap-extend>0</Parameters_gap-extend> + <Parameters_filter>L;m;</Parameters_filter> + </Parameters> + </BlastOutput_param> +<BlastOutput_iterations> +<Iteration> + <Iteration_iter-num>1</Iteration_iter-num> + <Iteration_query-ID>Query_1</Iteration_query-ID> + <Iteration_query-def>chunk_of_plant</Iteration_query-def> + <Iteration_query-len>630</Iteration_query-len> +<Iteration_hits> +<Hit> + <Hit_num>1</Hit_num> + <Hit_id>gnl|BL_ORD_ID|2</Hit_id> + <Hit_def>chrIII gi|240255695|ref|NC_003074.8| Arabidopsis thaliana chromosome 3, complete sequence</Hit_def> + <Hit_accession>2</Hit_accession> + <Hit_len>23459830</Hit_len> + <Hit_hsps> + <Hsp> + <Hsp_num>1</Hsp_num> + <Hsp_bit-score>1164.51</Hsp_bit-score> + <Hsp_score>630</Hsp_score> + <Hsp_evalue>0</Hsp_evalue> + <Hsp_query-from>1</Hsp_query-from> + <Hsp_query-to>630</Hsp_query-to> + <Hsp_hit-from>4341</Hsp_hit-from> + <Hsp_hit-to>4970</Hsp_hit-to> + <Hsp_query-frame>1</Hsp_query-frame> + <Hsp_hit-frame>1</Hsp_hit-frame> + <Hsp_identity>630</Hsp_identity> + <Hsp_positive>630</Hsp_positive> + <Hsp_gaps>0</Hsp_gaps> + <Hsp_align-len>630</Hsp_align-len> + <Hsp_qseq>GATGTTCAATACTGTTTCCAACAAAAAGATTGTTGTCCTCGAGTTCGCCTTCAAGAAAGACACGAGAGAGACTCCAGCCATTGACGTCTGCAAAGGTTTGTTAGGAGACAAGGCCCGAATCAGCATCTATGATCCACAAGTCACGGAAGAACAAATCCAAAGAGACTTAACCATGAACACATTCGACTGGGACCATCCACTTCACCTCCAACCCATGAGTCCAACCACTGTGAAACAAGTCTCAGTTGCTTGGGACGCTTACGCTGCCACCAAAGACGCCCACGGAATCTGCTTGTTAACCGAGTGGGACGAGTATAAGACGCTTGACTATGAGCGGATTTTTGAAAACATGCAGAAACCAGCGTTTGTCTTCGATGGCAGAAATGTTTTTGATGCAGAGAAGCTGAGGAAGATAGGGTTTATTGTTTACTCTATTGGTAAGCCGTTGGACCAGTGGCACATGCCTGCTCTTGCTTAGCTCAGACTCTTTGCCCTTTCTCAAGATTTGGATTGTTTTTCTCTCTGTTGCTTATATCAAATAATTTGTTCTGTTTCTTCTTGACGAGATATTTTCCTATACTTATTATGTTGGTTAGAACAAGAGACTAGGTTTGGTTATTATTGCTAACT</Hsp_qseq> + <Hsp_hseq>GATGTTCAATACTGTTTCCAACAAAAAGATTGTTGTCCTCGAGTTCGCCTTCAAGAAAGACACGAGAGAGACTCCAGCCATTGACGTCTGCAAAGGTTTGTTAGGAGACAAGGCCCGAATCAGCATCTATGATCCACAAGTCACGGAAGAACAAATCCAAAGAGACTTAACCATGAACACATTCGACTGGGACCATCCACTTCACCTCCAACCCATGAGTCCAACCACTGTGAAACAAGTCTCAGTTGCTTGGGACGCTTACGCTGCCACCAAAGACGCCCACGGAATCTGCTTGTTAACCGAGTGGGACGAGTATAAGACGCTTGACTATGAGCGGATTTTTGAAAACATGCAGAAACCAGCGTTTGTCTTCGATGGCAGAAATGTTTTTGATGCAGAGAAGCTGAGGAAGATAGGGTTTATTGTTTACTCTATTGGTAAGCCGTTGGACCAGTGGCACATGCCTGCTCTTGCTTAGCTCAGACTCTTTGCCCTTTCTCAAGATTTGGATTGTTTTTCTCTCTGTTGCTTATATCAAATAATTTGTTCTGTTTCTTCTTGACGAGATATTTTCCTATACTTATTATGTTGGTTAGAACAAGAGACTAGGTTTGGTTATTATTGCTAACT</Hsp_hseq> + <Hsp_midline>||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||</Hsp_midline> + </Hsp> + </Hit_hsps> +</Hit> +</Iteration_hits> + <Iteration_stat> + <Statistics> + <Statistics_db-num>5</Statistics_db-num> + <Statistics_db-len>119146348</Statistics_db-len> + <Statistics_hsp-len>26</Statistics_hsp-len> + <Statistics_eff-space>71964315672</Statistics_eff-space> + <Statistics_kappa>0.46</Statistics_kappa> + <Statistics_lambda>1.28</Statistics_lambda> + <Statistics_entropy>0.85</Statistics_entropy> + </Statistics> + </Iteration_stat> +</Iteration> +</BlastOutput_iterations> +</BlastOutput> + |
b |
diff -r 8f9023b30384 -r 72170c3f515a tools/ncbi_blast_plus/blastxml_to_tabular.py --- a/tools/ncbi_blast_plus/blastxml_to_tabular.py Mon Dec 02 10:27:05 2013 -0500 +++ b/tools/ncbi_blast_plus/blastxml_to_tabular.py Tue Dec 03 10:02:17 2013 -0500 |
[ |
@@ -158,6 +158,11 @@ # <Hit_accession>Subject_1</Hit_accession> # #apparently depending on the parse_deflines switch + # + #Or, with BLAST 2.2.28+ can get this, + # <Hit_id>gnl|BL_ORD_ID|2</Hit_id> + # <Hit_def>chrIII gi|240255695|ref|NC_003074.8| Arabidopsis thaliana chromosome 3, complete sequence</Hit_def> + # <Hit_accession>2</Hit_accession> sseqid = hit.findtext("Hit_id").split(None,1)[0] hit_def = sseqid + " " + hit.findtext("Hit_def") if re_default_subject_id.match(sseqid) \ @@ -165,6 +170,11 @@ #Place holder ID, take the first word of the subject definition hit_def = hit.findtext("Hit_def") sseqid = hit_def.split(None,1)[0] + if sseqid.startswith("gnl|BL_ORD_ID|") \ + and sseqid == "gnl|BL_ORD_ID|" + hit.findtext("Hit_accession"): + #Alternative place holder ID, again take the first word of hit_def + hit_def = hit.findtext("Hit_def") + sseqid = hit_def.split(None,1)[0] # for every <Hsp> within <Hit> for hsp in hit.findall("Hit_hsps/Hsp"): nident = hsp.findtext("Hsp_identity") |
b |
diff -r 8f9023b30384 -r 72170c3f515a tools/ncbi_blast_plus/blastxml_to_tabular.xml --- a/tools/ncbi_blast_plus/blastxml_to_tabular.xml Mon Dec 02 10:27:05 2013 -0500 +++ b/tools/ncbi_blast_plus/blastxml_to_tabular.xml Tue Dec 03 10:02:17 2013 -0500 |
b |
@@ -70,6 +70,16 @@ <!-- Note this has some white space differences from the actual blastp output --> <output name="tabular_file" file="blastp_human_vs_pdb_seg_no_converted_ext.tabular" ftype="tabular" /> </test> + <test> + <param name="blastxml_file" value="blastn_arabidopsis.xml" ftype="blastxml" /> + <param name="out_format" value="std" /> + <output name="tabular_file" file="blastn_arabidopsis.standard.tabular" ftype="tabular" /> + </test> + <test> + <param name="blastxml_file" value="blastn_arabidopsis.xml" ftype="blastxml" /> + <param name="out_format" value="ext" /> + <output name="tabular_file" file="blastn_arabidopsis.extended.tabular" ftype="tabular" /> + </test> </tests> <help> @@ -122,7 +132,7 @@ 22 sseq Aligned part of subject sequence 23 qlen Query sequence length 24 slen Subject sequence length - 25 salltitles All subject titles, separated by '<>' + 25 salltitles All subject title(s), separated by '<>' ====== ============= =========================================== Beware that the XML file (and thus the conversion) and the tabular output |
b |
diff -r 8f9023b30384 -r 72170c3f515a tools/ncbi_blast_plus/ncbi_macros.xml --- a/tools/ncbi_blast_plus/ncbi_macros.xml Mon Dec 02 10:27:05 2013 -0500 +++ b/tools/ncbi_blast_plus/ncbi_macros.xml Tue Dec 03 10:02:17 2013 -0500 |
b |
@@ -13,7 +13,7 @@ <xml name="input_out_format"> <param name="out_format" type="select" label="Output format"> <option value="6">Tabular (standard 12 columns)</option> - <option value="ext" selected="True">Tabular (extended 24 columns)</option> + <option value="ext" selected="True">Tabular (extended 25 columns)</option> <option value="5">BLAST XML</option> <option value="0">Pairwise text</option> <option value="0 -html">Pairwise HTML</option> @@ -351,7 +351,7 @@ 22 sseq Aligned part of subject sequence 23 qlen Query sequence length 24 slen Subject sequence length - 25 salltitles All subject titles, separated by '<>' + 25 salltitles All subject title(s), separated by '<>' ====== ============= =========================================== The third option is BLAST XML output, which is designed to be parsed by |