Mercurial > repos > greg > bwa_color

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bwa_color_wrapper.xml	Thu Jan 17 16:44:14 2013 -0500
@@ -0,0 +1,485 @@
+<tool id="bwa_color_wrapper" name="Map with BWA for SOLiD" version="1.0.2">
+  <requirements>
+    <requirement type="package" version="0.5.9">bwa</requirement>
+  </requirements>
+  <description></description>
+  <parallelism method="basic"></parallelism>
+  <command interpreter="python">
+    bwa_wrapper.py
+      --threads="4"
+      --color-space
+
+      ## reference source
+      --fileSource=$genomeSource.refGenomeSource
+      #if $genomeSource.refGenomeSource == "history":
+        ##build index on the fly
+        --ref="${genomeSource.ownFile}"
+        --dbkey=$dbkey
+      #else:
+        ##use precomputed indexes
+        --ref="${genomeSource.indices.fields.path}"
+        --do_not_build_index
+      #end if
+
+      ## input file(s); the second input is only passed for paired libraries
+      --input1=$paired.input1
+      #if $paired.sPaired == "paired":
+        --input2=$paired.input2
+      #end if
+
+      ## output file
+      --output=$output
+
+      ## run parameters; the individual settings are only passed when the
+      ## "Commonly Used" preset is not selected
+      --genAlignType=$paired.sPaired
+      --params=$params.source_select
+      #if $params.source_select != "pre_set":
+        --maxEditDist=$params.maxEditDist
+        --fracMissingAligns=$params.fracMissingAligns
+        --maxGapOpens=$params.maxGapOpens
+        --maxGapExtens=$params.maxGapExtens
+        --disallowLongDel=$params.disallowLongDel
+        --disallowIndel=$params.disallowIndel
+        --seed=$params.seed
+        --maxEditDistSeed=$params.maxEditDistSeed
+        --mismatchPenalty=$params.mismatchPenalty
+        --gapOpenPenalty=$params.gapOpenPenalty
+        --gapExtensPenalty=$params.gapExtensPenalty
+        --suboptAlign="${params.suboptAlign}"
+        --noIterSearch=$params.noIterSearch
+        --outputTopN=$params.outputTopN
+        --outputTopNDisc=$params.outputTopNDisc
+        --maxInsertSize=$params.maxInsertSize
+        --maxOccurPairing=$params.maxOccurPairing
+        ## read group fields, only when the user chose to specify a read group
+        #if $params.readGroup.specReadGroup == "yes":
+          --rgid="$params.readGroup.rgid"
+          --rgcn="$params.readGroup.rgcn"
+          --rgds="$params.readGroup.rgds"
+          --rgdt="$params.readGroup.rgdt"
+          --rgfo="$params.readGroup.rgfo"
+          --rgks="$params.readGroup.rgks"
+          --rglb="$params.readGroup.rglb"
+          --rgpg="$params.readGroup.rgpg"
+          --rgpi="$params.readGroup.rgpi"
+          --rgpl="$params.readGroup.rgpl"
+          --rgpu="$params.readGroup.rgpu"
+          --rgsm="$params.readGroup.rgsm"
+        #end if
+      #end if
+
+      ## suppress output SAM header
+      --suppressHeader=$suppressHeader
+  </command>
+  <inputs>
+    <!-- Reference genome choice: a built-in index listed in the bwa_indexes_color data table, or a FASTA dataset taken from the history. -->
+    <conditional name="genomeSource">
+      <param name="refGenomeSource" type="select" label="Will you select a reference genome from your history or use a built-in index?">
+        <option value="indexed">Use a built-in index</option>
+        <option value="history">Use one from the history</option>
+      </param>
+      <when value="indexed">
+        <param name="indices" type="select" label="Select a reference genome">
+          <options from_data_table="bwa_indexes_color">
+            <filter type="sort_by" column="2" />
+            <validator type="no_options" message="No indexes are available for the selected input dataset" />
+          </options>
+        </param>
+      </when>
+      <when value="history">
+        <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select a reference from history" />
+      </when>
+    </conditional>
+    <!-- Library layout: single-end takes one FASTQ input, paired-end takes a forward and a reverse FASTQ input. -->
+    <conditional name="paired">
+      <param name="sPaired" type="select" label="Is this library mate-paired?">
+        <option value="single">Single-end</option>
+        <option value="paired">Paired-end</option>
+      </param>
+      <when value="single">
+        <param name="input1" type="data" format="fastqcssanger" label="FASTQ file (Nucleotide-space recoded from color-space)">
+          <help>Convert color-space data to nucleotide-space (see help section below for steps). Must have Sanger-scaled quality values with ASCII offset 33</help>
+        </param>
+      </when>
+      <when value="paired">
+        <!-- Help is given once, as a child element, exactly as for the single-end input. -->
+        <param name="input1" type="data" format="fastqcssanger" label="Forward FASTQ file (Nucleotide-space recoded from color-space)">
+          <help>Convert color-space data to nucleotide-space (see help section below for steps). Must have Sanger-scaled quality values with ASCII offset 33</help>
+        </param>
+        <param name="input2" type="data" format="fastqcssanger" label="Reverse FASTQ file (Nucleotide-space recoded from color-space)">
+          <help>Convert color-space data to nucleotide-space (see help section below for steps). Must have Sanger-scaled quality values with ASCII offset 33</help>
+        </param>
+      </when>
+    </conditional>
+    <conditional name="params">
+      <param name="source_select" type="select" label="BWA settings to use" help="For most mapping needs use Commonly Used settings. If you want full control use Full Parameter List">
+        <option value="pre_set">Commonly Used</option>
+        <option value="full">Full Parameter List</option>
+      </param>
+      <when value="pre_set" />
+      <when value="full">
+        <param name="maxEditDist" type="integer" value="0" label="Maximum edit distance (aln -n)" help="Enter this value OR a fraction of missing alignments, not both" />
+        <param name="fracMissingAligns" type="float" value="0.04" label="Fraction of missing alignments given 2% uniform base error rate (aln -n)" help="Enter this value OR maximum edit distance, not both" />
+        <param name="maxGapOpens" type="integer" value="1" label="Maximum number of gap opens (aln -o)" />
+        <param name="maxGapExtens" type="integer" value="-1" label="Maximum number of gap extensions (aln -e)" help="-1 for k-difference mode (disallowing long gaps)" />
+        <param name="disallowLongDel" type="integer" value="16" label="Disallow long deletion within [value] bp towards the 3'-end (aln -d)" />
+        <param name="disallowIndel" type="integer" value="5" label="Disallow insertion/deletion within [value] bp towards the end (aln -i)" />
+        <param name="seed" type="integer" value="-1" label="Number of first subsequences to take as seed (aln -l)" help="Enter -1 for infinity" />
+        <param name="maxEditDistSeed" type="integer" value="2" label="Maximum edit distance in the seed (aln -k)" />
+        <param name="mismatchPenalty" type="integer" value="3" label="Mismatch penalty (aln -M)" help="BWA will not search for suboptimal hits with a score lower than [value]" />
+        <param name="gapOpenPenalty" type="integer" value="11" label="Gap open penalty (aln -O)" />
+        <param name="gapExtensPenalty" type="integer" value="4" label="Gap extension penalty (aln -E)" />
+        <param name="suboptAlign" type="integer" optional="True" label="Proceed with suboptimal alignments if there are no more than INT equally best hits. (aln -R)" help="For paired-end reads only. By default, BWA only searches for suboptimal alignments if the top hit is unique. Using this option has no effect on accuracy for single-end reads. It is mainly designed for improving the alignment accuracy of paired-end reads. However, the pairing procedure will be slowed down, especially for very short reads (~32bp)" />
+        <param name="noIterSearch" type="boolean" truevalue="true" falsevalue="false" checked="no" label="Disable iterative search (aln -N)" help="All hits with no more than maxDiff differences will be found. This mode is much slower than the default" />
+        <param name="outputTopN" type="integer" value="3" label="Maximum number of alignments to output in the XA tag for reads paired properly (samse/sampe -n)" help="If a read has more than INT hits, the XA tag will not be written" />
+        <param name="outputTopNDisc" type="integer" value="10" label="Maximum number of alignments to output in the XA tag for disconcordant read pairs (excluding singletons) (sampe -N)" help="For paired-end reads only. If a read has more than INT hits, the XA tag will not be written" />
+        <param name="maxInsertSize" type="integer" value="500" label="Maximum insert size for a read pair to be considered as being mapped properly (sampe -a)" help="For paired-end reads only. Only used when there are not enough good alignments to infer the distribution of insert sizes" />
+        <param name="maxOccurPairing" type="integer" value="100000" label="Maximum occurrences of a read for pairing (sampe -o)" help="For paired-end reads only. A read with more occurrences will be treated as a single-end read. Reducing this parameter helps faster pairing" />
+        <!-- Optional @RG read-group fields (samse/sampe -r); shown only when the user chooses to specify a read group. -->
+        <conditional name="readGroup">
+          <param name="specReadGroup" type="select" label="Specify the read group for this file? (samse/sampe -r)">
+            <option value="yes">Yes</option>
+            <option value="no" selected="True">No</option>
+          </param>
+          <when value="yes">
+            <param name="rgid" type="text" size="25" label="Read group identifier (ID). Each @RG line must have a unique ID. The value of ID is used in the RG tags of alignment records. Must be unique among all read groups in header section." help="Required if RG specified. Read group IDs may be modified when merging SAM files in order to handle collisions." />
+            <param name="rgcn" type="text" size="25" label="Sequencing center that produced the read (CN)" help="Optional" />
+            <param name="rgds" type="text" size="25" label="Description (DS)" help="Optional" />
+            <param name="rgdt" type="text" size="25" label="Date that run was produced (DT)" help="Optional. ISO8601 format date or date/time, like YYYY-MM-DD" />
+            <param name="rgfo" type="text" size="25" label="Flow order (FO). The array of nucleotide bases that correspond to the nucleotides used for each flow of each read." help="Optional. Multi-base flows are encoded in IUPAC format, and non-nucleotide flows by various other characters. Format : /\*|[ACMGRSVTWYHKDBN]+/" />
+            <param name="rgks" type="text" size="25" label="The array of nucleotide bases that correspond to the key sequence of each read (KS)" help="Optional" />
+            <param name="rglb" type="text" size="25" label="Library name (LB)" help="Required if RG specified" />
+            <param name="rgpg" type="text" size="25" label="Programs used for processing the read group (PG)" help="Optional" />
+            <param name="rgpi" type="text" size="25" label="Predicted median insert size (PI)" help="Optional" />
+            <param name="rgpl" type="text" size="25" label="Platform/technology used to produce the reads (PL)" help="Required if RG specified. Valid values : CAPILLARY, LS454, ILLUMINA, SOLID, HELICOS, IONTORRENT and PACBIO" />
+            <param name="rgpu" type="text" size="25" label="Platform unit (PU)" help="Optional. Unique identifier (e.g. flowcell-barcode.lane for Illumina or slide for SOLiD)" />
+            <param name="rgsm" type="text" size="25" label="Sample (SM)" help="Required if RG specified. Use pool name where a pool is being sequenced" />
+          </when>
+          <when value="no" />
+        </conditional>
+      </when>
+    </conditional>
+    <param name="suppressHeader" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Suppress the header in the output SAM file" help="BWA produces SAM with several lines of header information" />
+  </inputs>
+  <outputs>
+    <!-- Single SAM output. Its dbkey metadata follows the chosen reference source:
+         built-in index: column 1 of the bwa_indexes_color row whose column 0 matches the selected index
+         (rows whose column 0 starts with "#" are filtered out first);
+         history reference: the dbkey attribute of the selected ownFile dataset. -->
+    <data format="sam" name="output" label="${tool.name} on ${on_string}: mapped reads">
+      <actions>
+        <conditional name="genomeSource.refGenomeSource">
+          <when value="indexed">
+            <action type="metadata" name="dbkey">
+              <option type="from_data_table" name="bwa_indexes_color" column="1">
+                <filter type="param_value" column="0" value="#" compare="startswith" keep="False"/>
+                <filter type="param_value" ref="genomeSource.indices" column="0" />
+              </option>
+            </action>
+          </when>
+          <when value="history">
+            <action type="metadata" name="dbkey">
+              <option type="from_param" name="genomeSource.ownFile" param_attribute="dbkey" />
+            </action>
+          </when>
+        </conditional>
+      </actions>
+    </data>
+  </outputs>
+  <tests>
+    <test>
+      <!--
+      BWA commands:
+      cp test-data/hg19chrX_midpart.fasta hg19chrX_midpart.fasta
+      bwa index -c -a is hg19chrX_midpart.fasta
+      bwa aln -t 4 -c hg19chrX_midpart.fasta test-data/bwa_wrapper_in4.fastqcssanger > bwa_wrapper_out4.sai
+      bwa samse hg19chrX_midpart.fasta bwa_wrapper_out4.sai test-data/bwa_wrapper_in4.fastqcssanger > bwa_wrapper_out4.u.sam
+      hg19chrX_midpart.fasta is the prefix for the reference files (hg19chrX_midpart.fasta.amb, hg19chrX_midpart.fasta.ann, hg19chrX_midpart.fasta.bwt, ...)
+      It's just part of hg19 chrX, from the middle of the chromosome
+      plain old sort doesn't handle underscores like python:
+      python -c "import sys; lines=file(sys.argv[1],'rb').readlines(); lines.sort(); file(sys.argv[2],'wb').write(''.join(lines))" bwa_wrapper_out4.u.sam bwa_wrapper_out4.sam
+      -->
+      <param name="refGenomeSource" value="history" />
+      <param name="ownFile" value="hg19chrX_midpart.fasta" />
+      <param name="sPaired" value="single" />
+      <param name="input1" value="bwa_wrapper_in4.fastqcssanger" ftype="fastqcssanger" />
+      <param name="source_select" value="pre_set" />
+      <param name="suppressHeader" value="false" />
+      <output name="output" file="bwa_wrapper_out4.sam" ftype="sam" sort="True" lines_diff="2" />
+    </test>
+    <test>
+      <!--
+      BWA commands:
+      bwa aln -t 4 -c equCab2chrM_cs.fa test-data/bwa_wrapper_in5.fastqcssanger > bwa_wrapper_out5a.sai
+      bwa aln -t 4 -c equCab2chrM_cs.fa test-data/bwa_wrapper_in6.fastqcssanger > bwa_wrapper_out5b.sai
+      bwa sampe equCab2chrM_cs.fa bwa_wrapper_out5a.sai bwa_wrapper_out5b.sai test-data/bwa_wrapper_in5.fastqcssanger test-data/bwa_wrapper_in6.fastqcssanger > bwa_wrapper_out5.u.sam
+      equCab2chrM_cs.fa is the prefix of the index files (equCab2chrM_cs.fa.amb, equCab2chrM_cs.fa.ann, ...)
+      remove the comment lines (beginning with '@') from the resulting sam file
+      plain old sort doesn't handle underscores like python:
+      python -c "import sys; lines=file(sys.argv[1],'rb').readlines(); lines.sort(); file(sys.argv[2],'wb').write(''.join(lines))" bwa_wrapper_out5.u.sam bwa_wrapper_out5.sam
+      -->
+      <param name="refGenomeSource" value="indexed" />
+      <param name="indices" value="equCab2chrM" />
+      <param name="sPaired" value="paired" />
+      <param name="input1" value="bwa_wrapper_in5.fastqcssanger" ftype="fastqcssanger" />
+      <param name="input2" value="bwa_wrapper_in6.fastqcssanger" ftype="fastqcssanger" />
+      <param name="source_select" value="pre_set" />
+      <param name="suppressHeader" value="true" />
+      <output name="output" file="bwa_wrapper_out5.sam" ftype="sam" sort="True" />
+    </test>
+    <test>
+      <!--
+      BWA commands:
+      bwa aln -n 0.04 -o 1 -e -1 -d 16 -i 5 -k 2 -t 4 -M 3 -O 11 -E 4 -R -N -c hg19chrX_midpart.fasta test-data/bwa_wrapper_in4.fastqcssanger > bwa_wrapper_out6.sai
+      bwa samse -n 3 -r "@RG\tID:474747\tDS:description\tDT:2011-03-14\tLB:lib-child-1-A\tPI:200\tPL:SOLID\tSM:child-1" hg19chrX_midpart.fasta bwa_wrapper_out6.sai test-data/bwa_wrapper_in4.fastqcssanger > bwa_wrapper_out6.u.sam
+      hg19chrX_midpart_cs.fa is the prefix of the index files (hg19chrX_midpart.fa.amb, hg19chrX_midpart.fa.ann, ...)
+      (It's just part of hg19 chrX, from the middle of the chromosome)
+      plain old sort doesn't handle underscores like python:
+      python -c "import sys; lines=file(sys.argv[1],'rb').readlines(); lines.sort(); file(sys.argv[2],'wb').write(''.join(lines))" bwa_wrapper_out6.u.sam bwa_wrapper_out6.sam
+      -->
+      <param name="refGenomeSource" value="indexed" />
+      <param name="indices" value="hg19chrX_midpart" />
+      <param name="sPaired" value="single" />
+      <param name="input1" value="bwa_wrapper_in4.fastqcssanger" ftype="fastqcssanger" />
+      <param name="source_select" value="full" />
+      <param name="maxEditDist" value="0" />
+      <param name="fracMissingAligns" value="0.04" />
+      <param name="maxGapOpens" value="1" />
+      <param name="maxGapExtens" value="-1" />
+      <param name="disallowLongDel" value="16" />
+      <param name="disallowIndel" value="5" />
+      <param name="seed" value="-1" />
+      <param name="maxEditDistSeed" value="2" />
+      <param name="mismatchPenalty" value="3" />
+      <param name="gapOpenPenalty" value="11" />
+      <param name="gapExtensPenalty" value="4" />
+      <param name="suboptAlign" value="" />
+      <param name="noIterSearch" value="true" />
+      <param name="outputTopN" value="3" />
+      <param name="outputTopNDisc" value="10" />
+      <param name="maxInsertSize" value="500" />
+      <param name="maxOccurPairing" value="100000" />
+      <param name="specReadGroup" value="yes" />
+      <param name="rgid" value="474747" />
+      <param name="rgcn" value="" />
+      <param name="rgds" value="description" />
+      <param name="rgdt" value="2011-03-14" />
+      <param name="rgfo" value="" />
+      <param name="rgks" value="" />
+      <param name="rglb" value="lib-child-1-A" />
+      <param name="rgpg" value="" />
+      <param name="rgpi" value="200" />
+      <param name="rgpl" value="SOLID" />
+      <param name="rgpu" value="" />
+      <param name="rgsm" value="child-1" />
+      <param name="suppressHeader" value="false" />
+      <output name="output" file="bwa_wrapper_out6.sam" ftype="sam" sort="True" lines_diff="2" />
+    </test>
+    <test>
+      <!--
+      BWA commands:
+      cp test-data/chr_m.fasta chr_m.fasta
+      bwa index -c -a is chr_m.fasta
+      bwa aln -n 0.04 -o 1 -e -1 -d 16 -i 5 -k 2 -t 4 -M 3 -O 11 -E 4 -R -N -c chr_m.fasta test-data/bwa_wrapper_in5.fastqcssanger > bwa_wrapper_out7a.sai
+      bwa aln -n 0.04 -o 1 -e -1 -d 16 -i 5 -k 2 -t 4 -M 3 -O 11 -E 4 -R -N -c chr_m.fasta test-data/bwa_wrapper_in6.fastqcssanger > bwa_wrapper_out7b.sai
+      bwa sampe -a 100 -o 2 -n 3 -N 10 chr_m.fasta bwa_wrapper_out7a.sai bwa_wrapper_out7b.sai test-data/bwa_wrapper_in5.fastqcssanger test-data/bwa_wrapper_in6.fastqcssanger > bwa_wrapper_out7.u.sam
+      chr_m.fasta is the prefix of the index files (chr_m.fasta.amb, chr_m.fasta.ann, ...)
+      plain old sort doesn't handle underscores like python:
+      python -c "import sys; lines=file(sys.argv[1],'rb').readlines(); lines.sort(); file(sys.argv[2],'wb').write(''.join(lines))" bwa_wrapper_out7.u.sam bwa_wrapper_out7.sam
+      -->
+      <param name="refGenomeSource" value="history" />
+      <param name="ownFile" value="chr_m.fasta" />
+      <param name="sPaired" value="paired" />
+      <param name="input1" value="bwa_wrapper_in5.fastqcssanger" ftype="fastqcssanger" />
+      <param name="input2" value="bwa_wrapper_in6.fastqcssanger" ftype="fastqcssanger" />
+      <param name="source_select" value="full" />
+      <param name="maxEditDist" value="0" />
+      <param name="fracMissingAligns" value="0.04" />
+      <param name="maxGapOpens" value="1" />
+      <param name="maxGapExtens" value="-1" />
+      <param name="disallowLongDel" value="16" />
+      <param name="disallowIndel" value="5" />
+      <param name="seed" value="-1" />
+      <param name="maxEditDistSeed" value="2" />
+      <param name="mismatchPenalty" value="3" />
+      <param name="gapOpenPenalty" value="11" />
+      <param name="gapExtensPenalty" value="4" />
+      <param name="suboptAlign" value="" />
+      <param name="noIterSearch" value="true" />
+      <param name="outputTopN" value="3" />
+      <param name="outputTopNDisc" value="10" />
+      <param name="maxInsertSize" value="100" />
+      <param name="maxOccurPairing" value="2" />
+      <param name="specReadGroup" value="no" />
+      <param name="suppressHeader" value="false" />
+      <output name="output" file="bwa_wrapper_out7.sam" ftype="sam" sort="True" lines_diff="2" />
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+BWA is a fast, lightweight tool that aligns relatively short sequences (queries) to a large sequence database, such as the human reference genome. It is developed by Heng Li at the Sanger Institute. Li H. and Durbin R. (2009) Fast and accurate short read alignment with Burrows-Wheeler transform. Bioinformatics, 25, 1754-60.
+
+------
+
+**Know what you are doing**
+
+.. class:: warningmark
+
+There is no such thing (yet) as an automated gearshift in short read mapping. It is all like stick-shift driving in San Francisco. In other words, running this tool with default parameters will probably not give you meaningful results. A way to deal with this is to **understand** the parameters by carefully reading the `documentation`__ and experimenting. Fortunately, Galaxy makes experimenting easy.
+
+ .. __: http://bio-bwa.sourceforge.net/
+
+------
+
+**Input formats**
+
+BWA accepts files in Sanger FASTQ format. Use the FASTQ Groomer to prepare your files, set to either FASTQ Sanger or FASTQ Color Space Sanger as appropriate.
+
+If you have Color Space Sanger, it must be converted to nucleotide-space first. To do this, use the Manipulate FASTQ tool under NGS: QC and manipulation, with the following settings:
+    Manipulate reads on Sequence Content, choosing Change Adapter Base, and having the text box empty.
+    Manipulate reads on Sequence Content, doing a String Translate from "01234." to "ACGTN".
+
+
+------
+
+**A Note on Built-in Reference Genomes**
+
+The default variant for all genomes is "Full", defined as all primary chromosomes (or scaffolds/contigs) including mitochondrial plus associated unmapped, plasmid, and other segments. When only one version of a genome is available in this tool, it represents the default "Full" variant. Some genomes will have more than one variant available. The "Canonical Male" or sometimes simply "Canonical" variant contains the primary chromosomes for a genome. For example a human "Canonical" variant contains chr1-chr22, chrX, chrY, and chrM. The "Canonical Female" variant contains the primary chromosomes excluding chrY.
+
+------
+
+**Outputs**
+
+The output is in SAM format, and has the following columns::
+
+    Column  Description
+  --------  --------------------------------------------------------
+  1  QNAME  Query (pair) NAME
+  2  FLAG   bitwise FLAG
+  3  RNAME  Reference sequence NAME
+  4  POS    1-based leftmost POSition/coordinate of clipped sequence
+  5  MAPQ   MAPping Quality (Phred-scaled)
+  6  CIGAR  extended CIGAR string
+  7  MRNM   Mate Reference sequence NaMe ('=' if same as RNAME)
+  8  MPOS   1-based Mate POSition
+  9  ISIZE  Inferred insert SIZE
+  10 SEQ    query SEQuence on the same strand as the reference
+  11 QUAL   query QUALity (ASCII-33 gives the Phred base quality)
+  12 OPT    variable OPTional fields in the format TAG:VTYPE:VALUE
+
+The flags are as follows::
+
+    Flag  Description
+  ------  -------------------------------------
+  0x0001  the read is paired in sequencing
+  0x0002  the read is mapped in a proper pair
+  0x0004  the query sequence itself is unmapped
+  0x0008  the mate is unmapped
+  0x0010  strand of the query (1 for reverse)
+  0x0020  strand of the mate
+  0x0040  the read is the first read in a pair
+  0x0080  the read is the second read in a pair
+  0x0100  the alignment is not primary
+
+It looks like this (scroll sideways to see the entire example)::
+
+  QNAME	FLAG	RNAME	POS	MAPQ	CIGAR	MRNM	MPOS	ISIZE	SEQ	QUAL	OPT
+  HWI-EAS91_1_30788AAXX:1:1:1761:343	4	*	0	0	*	*	0	0	AAAAAAANNAAAAAAAAAAAAAAAAAAAAAAAAAAACNNANNGAGTNGNNNNNNNGCTTCCCACAGNNCTGG	hhhhhhh;;hhhhhhhhhhh^hOhhhhghhhfhhhgh;;h;;hhhh;h;;;;;;;hhhhhhghhhh;;Phhh
+  HWI-EAS91_1_30788AAXX:1:1:1578:331	4	*	0	0	*	*	0	0	GTATAGANNAATAAGAAAAAAAAAAATGAAGACTTTCNNANNTCTGNANNNNNNNTCTTTTTTCAGNNGTAG	hhhhhhh;;hhhhhhhhhhhhhhhhhhhhhhhhhhhh;;h;;hhhh;h;;;;;;;hhhhhhhhhhh;;hhVh
+
+-------
+
+**BWA settings**
+
+All of the options have a default value. You can change any of them. All of the options in BWA have been implemented here.
+
+------
+
+**BWA parameter list**
+
+This is an exhaustive list of BWA options:
+
+For **aln**::
+
+  -n NUM  Maximum edit distance if the value is INT, or the fraction of missing
+          alignments given 2% uniform base error rate if FLOAT. In the latter
+          case, the maximum edit distance is automatically chosen for different
+          read lengths. [0.04]
+  -o INT  Maximum number of gap opens [1]
+  -e INT  Maximum number of gap extensions, -1 for k-difference mode
+          (disallowing long gaps) [-1]
+  -d INT  Disallow a long deletion within INT bp towards the 3'-end [16]
+  -i INT  Disallow an indel within INT bp towards the ends [5]
+  -l INT  Take the first INT subsequence as seed. If INT is larger than the
+          query sequence, seeding will be disabled. For long reads, this option
+          is typically ranged from 25 to 35 for '-k 2'. [inf]
+  -k INT  Maximum edit distance in the seed [2]
+  -t INT  Number of threads (multi-threading mode) [1]
+  -M INT  Mismatch penalty. BWA will not search for suboptimal hits with a score
+          lower than (bestScore-misMsc). [3]
+  -O INT  Gap open penalty [11]
+  -E INT  Gap extension penalty [4]
+  -c      Reverse query but not complement it, which is required for alignment
+          in the color space.
+  -R      Proceed with suboptimal alignments even if the top hit is a repeat. By
+          default, BWA only searches for suboptimal alignments if the top hit is
+          unique. Using this option has no effect on accuracy for single-end
+          reads. It is mainly designed for improving the alignment accuracy of
+          paired-end reads. However, the pairing procedure will be slowed down,
+          especially for very short reads (~32bp).
+  -N      Disable iterative search. All hits with no more than maxDiff
+          differences will be found. This mode is much slower than the default.
+
+For **samse**::
+
+  -n INT  Maximum number of alignments to output in the XA tag for reads paired
+          properly. If a read has more than INT hits, the XA tag will not be
+          written. [3]
+  -r STR  Specify the read group in a format like '@RG\tID:foo\tSM:bar' [null]
+
+For **sampe**::
+
+  -a INT  Maximum insert size for a read pair to be considered as being mapped
+          properly. Since version 0.4.5, this option is only used when there
+          are not enough good alignment to infer the distribution of insert
+          sizes. [500]
+  -n INT  Maximum number of alignments to output in the XA tag for reads paired
+          properly. If a read has more than INT hits, the XA tag will not be
+          written. [3]
+  -N INT  Maximum number of alignments to output in the XA tag for disconcordant
+          read pairs (excluding singletons). If a read has more than INT hits,
+          the XA tag will not be written. [10]
+  -o INT  Maximum occurrences of a read for pairing. A read with more
+          occurrences will be treated as a single-end read. Reducing this
+          parameter helps faster pairing. [100000]
+  -r STR  Specify the read group in a format like '@RG\tID:foo\tSM:bar' [null]
+
+For specifying the read group in **samse** or **sampe**, use the following::
+
+  @RG   Read group. Unordered multiple @RG lines are allowed.
+  ID    Read group identifier. Each @RG line must have a unique ID. The value of
+        ID is used in the RG tags of alignment records. Must be unique among all
+        read groups in header section. Read group IDs may be modified when
+        merging SAM files in order to handle collisions.
+  CN    Name of sequencing center producing the read.
+  DS    Description.
+  DT    Date the run was produced (ISO8601 date or date/time).
+  FO    Flow order. The array of nucleotide bases that correspond to the
+        nucleotides used for each flow of each read. Multi-base flows are encoded
+        in IUPAC format, and non-nucleotide flows by various other characters.
+        Format : /\*|[ACMGRSVTWYHKDBN]+/
+  KS    The array of nucleotide bases that correspond to the key sequence of each read.
+  LB    Library.
+  PG    Programs used for processing the read group.
+  PI    Predicted median insert size.
+  PL    Platform/technology used to produce the reads. Valid values : CAPILLARY,
+        LS454, ILLUMINA, SOLID, HELICOS, IONTORRENT and PACBIO.
+  PU    Platform unit (e.g. flowcell-barcode.lane for Illumina or slide for
+        SOLiD). Unique identifier.
+  SM    Sample. Use pool name where a pool is being sequenced.
+
+  </help>
+</tool>
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bwa_wrapper.py	Thu Jan 17 16:44:14 2013 -0500
@@ -0,0 +1,342 @@
+#!/usr/bin/env python
+
+"""
+Runs BWA on single-end or paired-end data.
+Produces a SAM file containing the mappings.
+Works with BWA version 0.5.9.
+
+usage: bwa_wrapper.py [options]
+
+See below for options
+"""
+
+import optparse, os, shutil, subprocess, sys, tempfile
+
+def stop_err( msg ):
+    sys.stderr.write( '%s\n' % msg )
+    sys.exit()
+
+def check_is_double_encoded( fastq ):
+    # check that first read is bases, not one base followed by numbers
+    bases = [ 'A', 'C', 'G', 'T', 'a', 'c', 'g', 't', 'N' ]
+    nums = [ '0', '1', '2', '3' ]
+    for line in file( fastq, 'rb'):
+        if not line.strip() or line.startswith( '@' ):
+            continue
+        if len( [ b for b in line.strip() if b in nums ] ) > 0:
+            return False
+        elif line.strip()[0] in bases and len( [ b for b in line.strip() if b in bases ] ) == len( line.strip() ):
+            return True
+        else:
+            raise Exception, 'First line in first read does not appear to be a valid FASTQ read in either base-space or color-space'
+    raise Exception, 'There is no non-comment and non-blank line in your FASTQ file'
+
+def __main__():
+    #Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option( '-t', '--threads', dest='threads', help='The number of threads to use' )
+    parser.add_option( '-c', '--color-space', dest='color_space', action='store_true', help='If the input files are SOLiD format' )
+    parser.add_option( '-r', '--ref', dest='ref', help='The reference genome to use or index' )
+    parser.add_option( '-f', '--input1', dest='fastq', help='The (forward) fastq file to use for the mapping' )
+    parser.add_option( '-F', '--input2', dest='rfastq', help='The reverse fastq file to use for mapping if paired-end data' )
+    parser.add_option( '-u', '--output', dest='output', help='The file to save the output (SAM format)' )
+    parser.add_option( '-g', '--genAlignType', dest='genAlignType', help='The type of pairing (single or paired)' )
+    parser.add_option( '-p', '--params', dest='params', help='Parameter setting to use (pre_set or full)' )
+    parser.add_option( '-s', '--fileSource', dest='fileSource', help='Whether to use a previously indexed reference sequence or one form history (indexed or history)' )
+    parser.add_option( '-n', '--maxEditDist', dest='maxEditDist', help='Maximum edit distance if integer' )
+    parser.add_option( '-m', '--fracMissingAligns', dest='fracMissingAligns', help='Fraction of missing alignments given 2% uniform base error rate if fraction' )
+    parser.add_option( '-o', '--maxGapOpens', dest='maxGapOpens', help='Maximum number of gap opens' )
+    parser.add_option( '-e', '--maxGapExtens', dest='maxGapExtens', help='Maximum number of gap extensions' )
+    parser.add_option( '-d', '--disallowLongDel', dest='disallowLongDel', help='Disallow a long deletion within specified bps' )
+    parser.add_option( '-i', '--disallowIndel', dest='disallowIndel', help='Disallow indel within specified bps' )
+    parser.add_option( '-l', '--seed', dest='seed', help='Take the first specified subsequences' )
+    parser.add_option( '-k', '--maxEditDistSeed', dest='maxEditDistSeed', help='Maximum edit distance to the seed' )
+    parser.add_option( '-M', '--mismatchPenalty', dest='mismatchPenalty', help='Mismatch penalty' )
+    parser.add_option( '-O', '--gapOpenPenalty', dest='gapOpenPenalty', help='Gap open penalty' )
+    parser.add_option( '-E', '--gapExtensPenalty', dest='gapExtensPenalty', help='Gap extension penalty' )
+    parser.add_option( '-R', '--suboptAlign', dest='suboptAlign', default=None, help='Proceed with suboptimal alignments even if the top hit is a repeat' )
+    parser.add_option( '-N', '--noIterSearch', dest='noIterSearch', help='Disable iterative search' )
+    parser.add_option( '-T', '--outputTopN', dest='outputTopN', help='Maximum number of alignments to output in the XA tag for reads paired properly' )
+    parser.add_option( '', '--outputTopNDisc', dest='outputTopNDisc', help='Maximum number of alignments to output in the XA tag for disconcordant read pairs (excluding singletons)' )
+    parser.add_option( '-S', '--maxInsertSize', dest='maxInsertSize', help='Maximum insert size for a read pair to be considered mapped good' )
+    parser.add_option( '-P', '--maxOccurPairing', dest='maxOccurPairing', help='Maximum occurrences of a read for pairings' )
+    parser.add_option( '', '--rgid', dest='rgid', help='Read group identifier' )
+    parser.add_option( '', '--rgcn', dest='rgcn', help='Sequencing center that produced the read' )
+    parser.add_option( '', '--rgds', dest='rgds', help='Description' )
+    parser.add_option( '', '--rgdt', dest='rgdt', help='Date that run was produced (ISO8601 format date or date/time, like YYYY-MM-DD)' )
+    parser.add_option( '', '--rgfo', dest='rgfo', help='Flow order' )
+    parser.add_option( '', '--rgks', dest='rgks', help='The array of nucleotide bases that correspond to the key sequence of each read' )
+    parser.add_option( '', '--rglb', dest='rglb', help='Library name' )
+    parser.add_option( '', '--rgpg', dest='rgpg', help='Programs used for processing the read group' )
+    parser.add_option( '', '--rgpi', dest='rgpi', help='Predicted median insert size' )
+    parser.add_option( '', '--rgpl', dest='rgpl', choices=[ 'CAPILLARY', 'LS454', 'ILLUMINA', 'SOLID', 'HELICOS', 'IONTORRENT' and 'PACBIO' ], help='Platform/technology used to produce the reads' )
+    parser.add_option( '', '--rgpu', dest='rgpu', help='Platform unit (e.g. flowcell-barcode.lane for Illumina or slide for SOLiD)' )
+    parser.add_option( '', '--rgsm', dest='rgsm', help='Sample' )
+    parser.add_option( '-D', '--dbkey', dest='dbkey', help='Dbkey for reference genome' )
+    parser.add_option( '-X', '--do_not_build_index', dest='do_not_build_index', action='store_true', help="Don't build index" )
+    parser.add_option( '-H', '--suppressHeader', dest='suppressHeader', help='Suppress header' )
+    parser.add_option( '-I', '--illumina1.3', dest='illumina13qual', help='Input FASTQ files have Illuina 1.3 quality scores' )
+    (options, args) = parser.parse_args()
+
+    # output version # of tool
+    try:
+        tmp = tempfile.NamedTemporaryFile().name
+        tmp_stdout = open( tmp, 'wb' )
+        proc = subprocess.Popen( args='bwa 2>&1', shell=True, stdout=tmp_stdout )
+        tmp_stdout.close()
+        returncode = proc.wait()
+        stdout = None
+        for line in open( tmp_stdout.name, 'rb' ):
+            if line.lower().find( 'version' ) >= 0:
+                stdout = line.strip()
+                break
+        if stdout:
+            sys.stdout.write( 'BWA %s\n' % stdout )
+        else:
+            raise Exception
+    except:
+        sys.stdout.write( 'Could not determine BWA version\n' )
+
+    # check for color space fastq that's not double-encoded and exit if appropriate
+    if options.color_space:
+        if not check_is_double_encoded( options.fastq ):
+            stop_err( 'Your file must be double-encoded (it must be converted from "numbers" to "bases"). See the help section for details' )
+        if options.genAlignType == 'paired':
+            if not check_is_double_encoded( options.rfastq ):
+                stop_err( 'Your reverse reads file must also be double-encoded (it must be converted from "numbers" to "bases"). See the help section for details' )
+
+    fastq = options.fastq
+    if options.rfastq:
+         rfastq = options.rfastq
+
+    # set color space variable
+    if options.color_space:
+        color_space = '-c'
+    else:
+        color_space = ''
+
+    # make temp directory for placement of indices
+    tmp_index_dir = tempfile.mkdtemp()
+    tmp_dir = tempfile.mkdtemp()
+    # index if necessary
+    if options.fileSource == 'history' and not options.do_not_build_index:
+        ref_file = tempfile.NamedTemporaryFile( dir=tmp_index_dir )
+        ref_file_name = ref_file.name
+        ref_file.close()
+        os.symlink( options.ref, ref_file_name )
+        # determine which indexing algorithm to use, based on size
+        try:
+            size = os.stat( options.ref ).st_size
+            if size <= 2**30:
+                indexingAlg = 'is'
+            else:
+                indexingAlg = 'bwtsw'
+        except:
+            indexingAlg = 'is'
+        indexing_cmds = '%s -a %s' % ( color_space, indexingAlg )
+        cmd1 = 'bwa index %s %s' % ( indexing_cmds, ref_file_name )
+        try:
+            tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name
+            tmp_stderr = open( tmp, 'wb' )
+            proc = subprocess.Popen( args=cmd1, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() )
+            returncode = proc.wait()
+            tmp_stderr.close()
+            # get stderr, allowing for case where it's very large
+            tmp_stderr = open( tmp, 'rb' )
+            stderr = ''
+            buffsize = 1048576
+            try:
+                while True:
+                    stderr += tmp_stderr.read( buffsize )
+                    if not stderr or len( stderr ) % buffsize != 0:
+                        break
+            except OverflowError:
+                pass
+            tmp_stderr.close()
+            if returncode != 0:
+                raise Exception, stderr
+        except Exception, e:
+            # clean up temp dirs
+            if os.path.exists( tmp_index_dir ):
+                shutil.rmtree( tmp_index_dir )
+            if os.path.exists( tmp_dir ):
+                shutil.rmtree( tmp_dir )
+            stop_err( 'Error indexing reference sequence. ' + str( e ) )
+    else:
+        ref_file_name = options.ref
+    if options.illumina13qual:
+        illumina_quals = "-I"
+    else:
+        illumina_quals = ""
+
+    # set up aligning and generate aligning command options
+    if options.params == 'pre_set':
+        aligning_cmds = '-t %s %s %s' % ( options.threads, color_space, illumina_quals )
+        gen_alignment_cmds = ''
+    else:
+        if options.maxEditDist != '0':
+            editDist = options.maxEditDist
+        else:
+            editDist = options.fracMissingAligns
+        if options.seed != '-1':
+            seed = '-l %s' % options.seed
+        else:
+            seed = ''
+        if options.suboptAlign:
+            suboptAlign = '-R "%s"' % ( options.suboptAlign )
+        else:
+            suboptAlign = ''
+        if options.noIterSearch == 'true':
+            noIterSearch = '-N'
+        else:
+            noIterSearch = ''
+        aligning_cmds = '-n %s -o %s -e %s -d %s -i %s %s -k %s -t %s -M %s -O %s -E %s %s %s %s %s' % \
+                        ( editDist, options.maxGapOpens, options.maxGapExtens, options.disallowLongDel,
+                          options.disallowIndel, seed, options.maxEditDistSeed, options.threads,
+                          options.mismatchPenalty, options.gapOpenPenalty, options.gapExtensPenalty,
+                          suboptAlign, noIterSearch, color_space, illumina_quals )
+        if options.genAlignType == 'paired':
+            gen_alignment_cmds = '-a %s -o %s' % ( options.maxInsertSize, options.maxOccurPairing )
+            if options.outputTopNDisc:
+                gen_alignment_cmds += ' -N %s' % options.outputTopNDisc
+        else:
+            gen_alignment_cmds = ''
+        if options.rgid:
+            if not options.rglb or not options.rgpl or not options.rgsm:
+                stop_err( 'If you want to specify read groups, you must include the ID, LB, PL, and SM tags.' )
+            readGroup = '@RG\tID:%s\tLB:%s\tPL:%s\tSM:%s' % ( options.rgid, options.rglb, options.rgpl, options.rgsm )
+            if options.rgcn:
+                readGroup += '\tCN:%s' % options.rgcn
+            if options.rgds:
+                readGroup += '\tDS:%s' % options.rgds
+            if options.rgdt:
+                readGroup += '\tDT:%s' % options.rgdt
+            if options.rgfo:
+                readGroup += '\tFO:%s' % options.rgfo
+            if options.rgks:
+                readGroup += '\tKS:%s' % options.rgks
+            if options.rgpg:
+                readGroup += '\tPG:%s' % options.rgpg
+            if options.rgpi:
+                readGroup += '\tPI:%s' % options.rgpi
+            if options.rgpu:
+                readGroup += '\tPU:%s' % options.rgpu
+            gen_alignment_cmds += ' -r "%s"' % readGroup
+        if options.outputTopN:
+            gen_alignment_cmds += ' -n %s' % options.outputTopN
+    # set up output files
+    tmp_align_out = tempfile.NamedTemporaryFile( dir=tmp_dir )
+    tmp_align_out_name = tmp_align_out.name
+    tmp_align_out.close()
+    tmp_align_out2 = tempfile.NamedTemporaryFile( dir=tmp_dir )
+    tmp_align_out2_name = tmp_align_out2.name
+    tmp_align_out2.close()
+    # prepare actual aligning and generate aligning commands
+    cmd2 = 'bwa aln %s %s %s > %s' % ( aligning_cmds, ref_file_name, fastq, tmp_align_out_name )
+    cmd2b = ''
+    if options.genAlignType == 'paired':
+        cmd2b = 'bwa aln %s %s %s > %s' % ( aligning_cmds, ref_file_name, rfastq, tmp_align_out2_name )
+        cmd3 = 'bwa sampe %s %s %s %s %s %s >> %s' % ( gen_alignment_cmds, ref_file_name, tmp_align_out_name, tmp_align_out2_name, fastq, rfastq, options.output )
+    else:
+        cmd3 = 'bwa samse %s %s %s %s >> %s' % ( gen_alignment_cmds, ref_file_name, tmp_align_out_name, fastq, options.output )
+    # perform alignments
+    buffsize = 1048576
+    try:
+        # need to nest try-except in try-finally to handle 2.4
+        try:
+            # align
+            try:
+                tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
+                tmp_stderr = open( tmp, 'wb' )
+                proc = subprocess.Popen( args=cmd2, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
+                returncode = proc.wait()
+                tmp_stderr.close()
+                # get stderr, allowing for case where it's very large
+                tmp_stderr = open( tmp, 'rb' )
+                stderr = ''
+                try:
+                    while True:
+                        stderr += tmp_stderr.read( buffsize )
+                        if not stderr or len( stderr ) % buffsize != 0:
+                            break
+                except OverflowError:
+                    pass
+                tmp_stderr.close()
+                if returncode != 0:
+                    raise Exception, stderr
+            except Exception, e:
+                raise Exception, 'Error aligning sequence. ' + str( e )
+            # and again if paired data
+            try:
+                if cmd2b:
+                    tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
+                    tmp_stderr = open( tmp, 'wb' )
+                    proc = subprocess.Popen( args=cmd2b, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
+                    returncode = proc.wait()
+                    tmp_stderr.close()
+                    # get stderr, allowing for case where it's very large
+                    tmp_stderr = open( tmp, 'rb' )
+                    stderr = ''
+                    try:
+                        while True:
+                            stderr += tmp_stderr.read( buffsize )
+                            if not stderr or len( stderr ) % buffsize != 0:
+                                break
+                    except OverflowError:
+                        pass
+                    tmp_stderr.close()
+                    if returncode != 0:
+                        raise Exception, stderr
+            except Exception, e:
+                raise Exception, 'Error aligning second sequence. ' + str( e )
+            # generate align
+            try:
+                tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
+                tmp_stderr = open( tmp, 'wb' )
+                proc = subprocess.Popen( args=cmd3, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
+                returncode = proc.wait()
+                tmp_stderr.close()
+                # get stderr, allowing for case where it's very large
+                tmp_stderr = open( tmp, 'rb' )
+                stderr = ''
+                try:
+                    while True:
+                        stderr += tmp_stderr.read( buffsize )
+                        if not stderr or len( stderr ) % buffsize != 0:
+                            break
+                except OverflowError:
+                    pass
+                tmp_stderr.close()
+                if returncode != 0:
+                    raise Exception, stderr
+            except Exception, e:
+                raise Exception, 'Error generating alignments. ' + str( e )
+            # remove header if necessary
+            if options.suppressHeader == 'true':
+                tmp_out = tempfile.NamedTemporaryFile( dir=tmp_dir)
+                tmp_out_name = tmp_out.name
+                tmp_out.close()
+                try:
+                    shutil.move( options.output, tmp_out_name )
+                except Exception, e:
+                    raise Exception, 'Error moving output file before removing headers. ' + str( e )
+                fout = file( options.output, 'w' )
+                for line in file( tmp_out.name, 'r' ):
+                    if not ( line.startswith( '@HD' ) or line.startswith( '@SQ' ) or line.startswith( '@RG' ) or line.startswith( '@PG' ) or line.startswith( '@CO' ) ):
+                        fout.write( line )
+                fout.close()
+            # check that there are results in the output file
+            if os.path.getsize( options.output ) > 0:
+                sys.stdout.write( 'BWA run on %s-end data' % options.genAlignType )
+            else:
+                raise Exception, 'The output file is empty. You may simply have no matches, or there may be an error with your input file or settings.'
+        except Exception, e:
+            stop_err( 'The alignment failed.\n' + str( e ) )
+    finally:
+        # clean up temp dir
+        if os.path.exists( tmp_index_dir ):
+            shutil.rmtree( tmp_index_dir )
+        if os.path.exists( tmp_dir ):
+            shutil.rmtree( tmp_dir )
+
+if __name__=="__main__": __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/bwa_index.loc.sample	Thu Jan 17 16:44:14 2013 -0500
@@ -0,0 +1,38 @@
+#This is a sample file distributed with Galaxy that enables tools
+#to use a directory of BWA indexed sequences data files. You will need
+#to create these data files and then create a bwa_index.loc file
+#similar to this one (store it in this directory) that points to
+#the directories in which those files are stored. The bwa_index.loc
+#file has this format (longer white space characters are TAB characters):
+#
+#<unique_build_id>   <dbkey>   <display_name>   <file_path>
+#
+#So, for example, if you had phiX indexed stored in
+#/depot/data2/galaxy/phiX/base/,
+#then the bwa_index.loc entry would look like this:
+#
+#phiX174   phiX   phiX Pretty   /depot/data2/galaxy/phiX/base/phiX.fa
+#
+#and your /depot/data2/galaxy/phiX/base/ directory
+#would contain phiX.fa.* files:
+#
+#-rw-r--r--  1 james    universe 830134 2005-09-13 10:12 phiX.fa.amb
+#-rw-r--r--  1 james    universe 527388 2005-09-13 10:12 phiX.fa.ann
+#-rw-r--r--  1 james    universe 269808 2005-09-13 10:12 phiX.fa.bwt
+#...etc...
+#
+#Your bwa_index.loc file should include an entry per line for each
+#index set you have stored. The "file" in the path does not actually
+#exist, but it is the prefix for the actual index files.  For example:
+#
+#phiX174				phiX	phiX174			/depot/data2/galaxy/phiX/base/phiX.fa
+#hg18canon				hg18	hg18 Canonical	/depot/data2/galaxy/hg18/base/hg18canon.fa
+#hg18full				hg18	hg18 Full		/depot/data2/galaxy/hg18/base/hg18full.fa
+#/orig/path/hg19.fa		hg19	hg19			/depot/data2/galaxy/hg19/base/hg19.fa
+#...etc...
+#
+#Note that for backwards compatibility with workflows, the unique ID of
+#an entry must be the path that was in the original loc file, because that
+#is the value stored in the workflow for that parameter. That is why the
+#hg19 entry above looks odd. New genomes can be better-looking.
+#
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/bwa_index_color.loc.sample	Thu Jan 17 16:44:14 2013 -0500
@@ -0,0 +1,38 @@
+#This is a sample file distributed with Galaxy that enables tools
+#to use a directory of BWA indexed sequences data files. You will need
+#to create these data files and then create a bwa_index_color.loc file
+#similar to this one (store it in this directory) that points to
+#the directories in which those files are stored. The bwa_index_color.loc
+#file has this format (longer white space characters are TAB characters):
+#
+#<unique_build_id>	<dbkey>		<display_name>	<file_path>
+#
+#So, for example, if you had phiX indexed stored in
+#/depot/data2/galaxy/phiX/color/,
+#then the bwa_index_color.loc entry would look like this:
+#
+#phiX174   phiX   phiX Pretty   /depot/data2/galaxy/phiX/color/phiX.fa
+#
+#and your /depot/data2/galaxy/phiX/color/ directory
+#would contain phiX.fa.* files:
+#
+#-rw-r--r--  1 james    universe 830134 2005-09-13 10:12 phiX.fa.amb
+#-rw-r--r--  1 james    universe 527388 2005-09-13 10:12 phiX.fa.ann
+#-rw-r--r--  1 james    universe 269808 2005-09-13 10:12 phiX.fa.bwt
+#...etc...
+#
+#Your bwa_index_color.loc file should include an entry per line for each
+#index set you have stored. The "file" in the path does not actually
+#exist, but it is the prefix for the actual index files.  For example:
+#
+#phiX174				phiX	phiX174				/depot/data2/galaxy/phiX/color/phiX.fa
+#hg18canon				hg18	hg18 Canonical		/depot/data2/galaxy/hg18/color/hg18canon.fa
+#hg18full				hg18	hg18 Full			/depot/data2/galaxy/hg18/color/hg18full.fa
+#/orig/path/hg19.fa		hg19	hg19				/depot/data2/galaxy/hg19/color/hg19.fa
+#...etc...
+#
+#Note that for backwards compatibility with workflows, the unique ID of
+#an entry must be the path that was in the original loc file, because that
+#is the value stored in the workflow for that parameter. That is why the
+#hg19 entry above looks odd. New genomes can be better-looking.
+#
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Thu Jan 17 16:44:14 2013 -0500
@@ -0,0 +1,13 @@
+<!-- Use the file tool_data_table_conf.xml.oldlocstyle if you don't want to update your loc files as changed in revision 4550:535d276c92bc-->
+<!--
+    Each table below names its columns and the .loc file it reads.
+    The column list (value, dbkey, name, path) follows the field order given
+    in the sample .loc files:
+    <unique_build_id> <dbkey> <display_name> <file_path>.
+    Lines starting with the comment_char ("#") are comments in those files.
+-->
+<tables>
+    <!-- Locations of indexes in the BWA mapper format -->
+    <table name="bwa_indexes" comment_char="#">
+        <columns>value, dbkey, name, path</columns>
+        <file path="tool-data/bwa_index.loc" />
+    </table>
+    <!-- Locations of indexes in the BWA color-space mapper format -->
+    <table name="bwa_indexes_color" comment_char="#">
+        <columns>value, dbkey, name, path</columns>
+        <file path="tool-data/bwa_index_color.loc" />
+    </table>
+</tables>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Thu Jan 17 16:44:14 2013 -0500
@@ -0,0 +1,21 @@
+<?xml version="1.0"?>
+<!-- Installation recipe for the bwa 0.5.9 package. -->
+<tool_dependency>
+    <package name="bwa" version="0.5.9">
+        <install version="1.0">
+            <!-- The actions are listed in the order: download, build, install binary, set PATH -->
+            <actions>
+                <!-- Source archive for this bwa release -->
+                <action type="download_by_url">http://downloads.sourceforge.net/project/bio-bwa/bwa-0.5.9.tar.bz2</action>
+                <!-- Build with make; see the readme below for the required system libraries -->
+                <action type="shell_command">make</action>
+                <!-- Put the bwa file into the install directory's bin folder -->
+                <action type="move_file">
+                    <source>bwa</source>
+                    <destination>$INSTALL_DIR/bin</destination>
+                </action>
+                <!-- Prepend that bin folder to PATH -->
+                <action type="set_environment">
+                    <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/bin</environment_variable>
+                </action>
+            </actions>
+        </install>
+        <readme>
+Compiling BWA requires zlib and libpthread to be present on your system.
+        </readme>
+    </package>
+</tool_dependency>
\ No newline at end of file