changeset 2:f6ba0f12cca2 draft

Untested work-in-progress GMAP wrappers v3.0.0, from JJ back in June 2013
author peterjc
date Wed, 28 Sep 2016 10:43:44 -0400
parents 74391fc6e3f2
children 488e9d642566
files datatypes_conf.xml gmap.xml gmap_build.xml gsnap.xml iit_store.xml lib/galaxy/datatypes/gmap.py snpindex.xml tool_dependencies.xml
diffstat 8 files changed, 383 insertions(+), 76 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes_conf.xml	Wed Sep 28 10:43:44 2016 -0400
@@ -0,0 +1,24 @@
+<?xml version="1.0"?>
+<datatypes>
+    <datatype_files>
+        <datatype_file name="gmap.py"/>
+    </datatype_files>
+    <registration>
+        <datatype extension="gmapdb" type="galaxy.datatypes.gmap:GmapDB" display_in_upload="False"/>
+        <datatype extension="gmapsnpindex" type="galaxy.datatypes.gmap:GmapSnpIndex" display_in_upload="False"/>
+        <datatype extension="iit" type="galaxy.datatypes.gmap:IntervalIndexTree" display_in_upload="True"/>
+        <datatype extension="splicesites.iit" type="galaxy.datatypes.gmap:SpliceSitesIntervalIndexTree" display_in_upload="True"/>
+        <datatype extension="introns.iit" type="galaxy.datatypes.gmap:IntronsIntervalIndexTree" display_in_upload="True"/>
+        <datatype extension="snps.iit" type="galaxy.datatypes.gmap:SNPsIntervalIndexTree" display_in_upload="True"/>
+        <datatype extension="gmap_annotation" type="galaxy.datatypes.gmap:IntervalAnnotation" display_in_upload="False"/>
+        <datatype extension="gmap_splicesites" type="galaxy.datatypes.gmap:SpliceSiteAnnotation" display_in_upload="True"/>
+        <datatype extension="gmap_introns" type="galaxy.datatypes.gmap:IntronAnnotation" display_in_upload="True"/>
+        <datatype extension="gmap_snps" type="galaxy.datatypes.gmap:SNPAnnotation" display_in_upload="True"/>
+    </registration>
+    <sniffers>
+        <sniffer type="galaxy.datatypes.gmap:IntervalAnnotation"/>
+        <sniffer type="galaxy.datatypes.gmap:SpliceSiteAnnotation"/>
+        <sniffer type="galaxy.datatypes.gmap:IntronAnnotation"/>
+        <sniffer type="galaxy.datatypes.gmap:SNPAnnotation"/>
+    </sniffers>
+</datatypes>
--- a/gmap.xml	Fri Oct 05 13:08:43 2012 -0500
+++ b/gmap.xml	Wed Sep 28 10:43:44 2016 -0400
@@ -1,7 +1,7 @@
-<tool id="gmap" name="GMAP" version="2.0.1">
+<tool id="gmap" name="GMAP" version="3.0.0">
   <description>Genomic Mapping and Alignment Program for mRNA and EST sequences</description>
   <requirements>
-    <requirement type="package" version="2011-11-30">gmap</requirement>
+    <requirement type="package" version="2013-05-09">gmap</requirement>
   </requirements>
   <version_string>gmap --version</version_string>
   <command>
@@ -11,8 +11,7 @@
     #if $refGenomeSource.genomeSource == "history":
       --gseg=$refGenomeSource.ownFile
     #elif $refGenomeSource.genomeSource == "gmapdb":
-      #set $gmapdb = $os.listdir($refGenomeSource.gmapdb.extra_files_path)[0]
-      --dir=$refGenomeSource.gmapdb.extra_files_path --db=$gmapdb
+      --dir=$refGenomeSource.gmapdb.extra_files_path --db=$refGenomeSource.gmapdb.metadata.db_name
       #if $refGenomeSource.kmer != None and len($refGenomeSource.kmer.__str__) == 2:
         --kmer=$refGenomeSource.kmer
       #end if
@@ -43,6 +42,9 @@
     #elif $result.format == "sam":
       --format=$result.sam_paired_read
       $result.no_sam_headers 
+      $result.sam_use_0M
+      $result.force_xs_dir
+      $result.md_lowercase_snp
       #* Removed in gmap version 2011-11-30
       #if len($result.noncanonical_splices.__str__) > 0
          --noncanonical-splices=$result.noncanonical_splices
@@ -65,6 +67,7 @@
     #end if
     #if $computation.options == "advanced":
       $computation.nosplicing
+      $computation.find_shifted_canonical
       $computation.cross_species
       #if len($computation.min_intronlength.__str__) > 0
         --min-intronlength=$computation.min_intronlength
@@ -170,6 +173,15 @@
             <filter type="sort_by" column="3"/>
           </options>
         </param>
+        <!--
+         basesize=INT                 Base size to use in genome database.  If not specified, the program
+                                   will find the highest available base size in the genome database
+                                   within selected k-mer size
+         sampling=INT                 Sampling to use in genome database.  If not specified, the program
+                                   will find the smallest available sampling value in the genome database
+                                   within selected basesize and k-mer size
+
+        -->
         <param name="map" type="select" data_ref="gmapindex" label="Look for splicing involving known sites or known introns" help="">
           <options from_file="gmap_indices.loc">
             <column name="name" index="4"/>
@@ -224,7 +236,7 @@
          <validator type="in_range" message="totallength must be positive" min="0" />
        </param>
        <param name="chimera_margin" type="integer" value="" optional="true" label="Amount of unaligned sequence that triggers search for a chimera" 
-              help=" default is 40, To turn off, set to a large value (greater than the query length)" >	
+              help=" default is 40, To turn off, set to 0" >	
          <validator type="in_range" message="chimera_margin must be positive" min="0" />
        </param>
        <param name="direction"  type="select" label="cDNA direction">	
@@ -237,6 +249,7 @@
        <param name="trimendexons"  type="integer" value="" optional="true" label="Trim end exons with fewer than given number of matches (in nt, default 12)" >	
          <validator type="in_range" message="trimendexons must be positive" min="1" />
        </param>
+       <param name="find_shifted_canonical" type="boolean" truevalue="--find-shifted-canonical-species" falsevalue="" checked="false" label="find-shifted-canonical Use a more sensitive search for canonical splicing" help=""/>
        <param name="cross_species" type="boolean" truevalue="--cross-species" falsevalue="" checked="false" label="Cross-species alignment" help="For cross-species alignments, use a more sensitive search for canonical splicing"/>
        
        <param name="canonical"  type="select" label="Reward for canonical and semi-canonical introns">	
@@ -328,7 +341,7 @@
       <option value="splicesites">splicesites output (for GSNAP)</option>
       <option value="introns">introns output (for GSNAP)</option>
       <option value="map_exons">IIT FASTA exon map format</option>
-      <option value="map_genes">IIT FASTA map format</option>
+      <option value="map_ranges">IIT FASTA map format</option>
       <option value="coords">coords in table format</option>
       <option value="sam" selected="true">SAM format</option>
     </param>
@@ -353,7 +366,7 @@
       <when value="splicesites"/>
       <when value="introns"/>
       <when value="map_exons"/>
-      <when value="map_genes"/>
+      <when value="map_ranges"/>
       <when value="coords"/>
       <when value="sam">
         <param name="sam_paired_read" type="boolean" truevalue="sampe" falsevalue="samse" checked="false" label="SAM paired reads"/>
@@ -369,6 +382,11 @@
         <param name="read_group_name" type="text" value="" label="Value to put into read-group name (RG-SM) field"/>
         <param name="read_group_library" type="text" value="" label="Value to put into read-group library (RG-LB) field"/>
         <param name="read_group_platform" type="text" value="" label="Value to put into read-group library platform (RG-PL) field"/>
+        <param name="sam_use_0M" type="boolean" truevalue="--sam-use-0M" falsevalue="" checked="false" label="Insert 0M in CIGAR between adjacent insertions and deletions" help="Required by Picard, but can cause errors in other tools"/>
+        <param name="force_xs_dir" type="boolean" truevalue="--force-xs-dir" falsevalue="" checked="false" label="Force direction (disallow XS:A:?)" 
+               help="For RNA-Seq alignments, disallows XS:A:? when the sense direction is unclear, and replaces this value arbitrarily with XS:A:+. May be useful for some programs, such as Cufflinks, that cannot handle XS:A:?.  However, if you use this flag, the reported value of XS:A:+ in these cases will not be meaningful."/>
+        <param name="md_lowercase_snp" type="boolean" truevalue="--md-lowercase-snp" falsevalue="" checked="false" label="MD lowercase SNP" 
+               help="In MD string, when known SNPs are given by the -v flag, prints difference nucleotides as lower-case when they, differ from reference but match a known alternate allele"/>
       </when>
     </conditional> <!-- name="result" -->
 
@@ -383,6 +401,18 @@
       print-comment    Show comment line for each hit
     -->
 
+    <!-- 
+  min-trimmed-coverage=FLOAT   Do not print alignments with trimmed coverage less
+                                   this value (default=0.0, which means no filtering)
+                                   Note that chimeric alignments will be output regardless
+                                   of this filter
+  min-identity=FLOAT           Do not print alignments with identity less
+                                   this value (default=0.0, which means no filtering)
+                                   Note that chimeric alignments will be output regardless
+                                   of this filter
+    -->
+
+
 
   </inputs>
   <outputs>
--- a/gmap_build.xml	Fri Oct 05 13:08:43 2012 -0500
+++ b/gmap_build.xml	Wed Sep 28 10:43:44 2016 -0400
@@ -1,10 +1,10 @@
-<tool id="gmap_build" name="GMAP Build" version="2.0.0">
+<tool id="gmap_build" name="GMAP Build" version="3.0.0">
   <description>a database genome index for GMAP and GSNAP</description>
   <requirements>
-      <requirement type="package" version="2011-11-30">gmap</requirement>
+      <requirement type="package" version="2013-05-09">gmap</requirement>
   </requirements>
   <version_string>gmap --version</version_string>
-  <command interpreter="command"> /bin/bash $shscript 2>1 1> $output </command>
+  <command interpreter="command"> /bin/bash $shscript > $output </command>
   <inputs>
     <!-- Name for this gmapdb -->
     <param name="refname" type="text" label="Name you want to give this gmap database" help="">
@@ -15,12 +15,17 @@
       <param name="input" type="data" format="fasta" label="reference sequence fasta" />
     </repeat>
 
-    <param name="kmer" type="select" multiple="true" force_select="true" label="kmer size" help="">
-      <option value="12">12</option>
-      <option value="13">13</option>
-      <option value="14">14</option>
-      <option value="15" selected="true">15</option>
+    <param name="circular_chroms" type="text" value="" optional="true" label="Names of circular chromosomes" 
+           help="a list of chromosomes, separated by commas, allow GSNAP and GMAP to align reads across the ends of the chromosome">
+    </param>
+
+    <param name="sort" type="select" label="Sort chromosomes" help="">
+      <option value="none">none - use chromosomes as found in FASTA file(s)</option>
+      <option value="alpha">alpha - sort chromosomes alphabetically (chr10 before chr 1)</option>
+      <option value="numeric-alpha">numeric-alpha - chr1, chr1U, chr2, chrM, chrU, chrX, chrY</option>
+      <option value="chrom">chrom - chr1, chr2, chrM, chrX, chrY, chr1U, chrU</option>
     </param>  
+
     <param name="cmetindex" type="boolean" checked="true" truevalue="yes" falsevalue="no" label="Create cmetindex to process reads from bisulfite-treated DNA"/>
     <param name="atoiindex" type="boolean" checked="true" truevalue="yes" falsevalue="no" label="Create atoiindex to process reads under RNA-editing tolerance"/>
     <conditional name="splicesite">
@@ -51,6 +56,7 @@
         <option value="none"></option>
         <option value="snpTable">UCSC SNP Table</option>
         <option value="snpFile">GMAP SNP File</option>
+        <option value="vcfFile">VCF File</option>
       </param>
       <when value="none"/>
       <when value="snpTable">
@@ -77,8 +83,33 @@
                 sequence, that SNP will be ignored in subsequent processing as a probable error.
                 The N stands for any other allele." />
       </when>
+      <when value="vcfFile">
+        <param name="snps" type="data" format="vcf" optional="true" label="VCF SNPs file" 
+               help="Example: ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/00-All.vcf.gz
+                     The VCF file contains multiple versions of dbSNP, so if you want a
+                     particular version, such as 135.  The vcf_iit program tries to pick 
+                     a subset of SNPs that somewhat parallel
+                     the ones without exceptions in the UCSC dbSNP file.  It keeps all SNPs
+                     that have been validated (marked in the VCF file as &#34;VLD&#34;) or have a
+                     submitter link-out (&#34;SLO&#34;).  Otherwise, it excludes SNPs that are
+                     individual genotypes (&#34;GNO&#34;).  If none of these conditions hold, then
+                     the SNP is allowed.  "/>
+        <param name="vcf_version" type="text" value="" optional="true" label="dbSNP version" 
+           help="The VCF file contains multiple versions of dbSNP, so if you want a particular version, such as 135"/>
+      </when>
     </conditional> 
+
+    <param name="kmer" type="select" multiple="true" force_select="true" label="kmer size" help="Use smaller values when building indexes on machines with limited RAM">
+      <option value="12">12 (64MB RAM)</option>
+      <option value="13">13 (256MB RAM)</option>
+      <option value="14">14 (1GB RAM)</option>
+      <option value="15" selected="true">15 (4GB RAM)</option>
+    </param>
+
   </inputs>
+  <stdio>
+    <exit_code range="1"  level="fatal"   description="Error running gmap_build" />
+  </stdio>
   <outputs>
     <!--
     <data format="txt" name="log" label="${tool.name} on ${on_string}: log"/>
@@ -97,15 +128,23 @@
   ## #set $ref_files = $ref_files $i.input
 ## #end for
 ## echo $ref_files
+#set circular = ""
+#if $circular_chroms.__str__.strip() != '':
+#set circular = ('').join([' -c ','"', $circular_chroms.__str__,'"'])
+#end if
 #import os.path
 #set $gmapdb = $output.extra_files_path
 #set $mapsdir = $os.path.join($os.path.join($gmapdb,str($refname)), str($refname) + '.maps')
 mkdir -p $gmapdb
 ## export GMAPDB required for cmetindex  and atoiindex
 export GMAPDB=$gmapdb
+#if $kmer:
 #for $k in $kmer.__str__.split(','):
-gmap_build -D $gmapdb -d $refname -s numeric-alpha -k $k #for i in $inputs# ${i.input}#end for#
+gmap_build -D $gmapdb -d $refname -s $sort $circular -k $k #for i in $inputs# ${i.input}#end for#
 #end for
+#else:
+gmap_build -D $gmapdb -d $refname -s $sort $circular #for i in $inputs# ${i.input}#end for#
+#end if
 get-genome -D $gmapdb -d '?' | sed 's/^Available .*/gmap db: /' 
 echo "kmers: " $kmer 
 #if $splicesite.splice_source == 'refGeneTable':
@@ -125,17 +164,23 @@
 #end if
 #end if
 #if $dbsnp.snp_source != 'none' and $dbsnp.snps.__str__ != 'None':
-#if $dbsnp.snp_source == 'snpTable':
-#if $dbsnp.snpsex.__str__ != 'None':
-cat $dbsnp.snps | dbsnp_iit -w $dbsnp.weight -e $dbsnp.snpsex | iit_store -o  $os.path.join($mapsdir,'snps')
-#else:
-cat $dbsnp.snps | dbsnp_iit -w $dbsnp.weight | iit_store -o  $os.path.join($mapsdir,'snps')
-#end if
-#else:
-cat $dbsnp.snps | iit_store -o  $os.path.join($mapsdir,'snps')
-#end if
-snpindex -d $refname -v snps
-echo "snpindex" -d  $refname -v snps
+ #if $dbsnp.snp_source == 'snpTable':
+  #if $dbsnp.snpsex.__str__ != 'None':
+   cat $dbsnp.snps | dbsnp_iit -w $dbsnp.weight -e $dbsnp.snpsex | iit_store -o  $os.path.join($mapsdir,'snps')
+  #else:
+   cat $dbsnp.snps | dbsnp_iit -w $dbsnp.weight | iit_store -o  $os.path.join($mapsdir,'snps')
+  #end if
+ #elif $dbsnp.snp_source == 'vcfFile':
+  #if $dbsnp.vcf_version  and len($dbsnp.vcf_version.__str__.strip()) > 0:
+   cat $dbsnp.snps | vcf_iit -v $dbsnp.vcf_version.__str__.strip() | iit_store -o  $os.path.join($mapsdir,'snps')
+  #else:
+   cat $dbsnp.snps | vcf_iit | iit_store -o  $os.path.join($mapsdir,'snps')
+  #end if
+ #else:
+  cat $dbsnp.snps | iit_store -o  $os.path.join($mapsdir,'snps')
+ #end if
+ snpindex -d $refname -v snps
+ echo "snpindex" -d  $refname -v snps
 #end if
 #if $cmetindex.__str__ == 'yes':
 cmetindex -d $refname
@@ -154,10 +199,9 @@
 
   <help>
 
-
 **GMAP Build**
 
-GMAP Build creates an index of a genomic sequence for mapping and alignment using GMAP_ (Genomic Mapping and Alignment Program for mRNA and EST sequences) and GSNAP_ (Genomic Short-read Nucleotide Alignment Program).  (GMAP Build uses GMSP commands: gmap_build, iit_store, psl_splicesites, psl_introns, gtf_splicesites, gtf_introns, gff3_splicesites, gff3_introns, dbsnp_iit, snpindex, cmetindex, and atoiindex.)
+GMAP Build creates an index of a genomic sequence for mapping and alignment using GMAP_ (Genomic Mapping and Alignment Program for mRNA and EST sequences) and GSNAP_ (Genomic Short-read Nucleotide Alignment Program).  (GMAP Build uses GMAP commands: gmap_build, iit_store, psl_splicesites, psl_introns, gtf_splicesites, gtf_introns, gff3_splicesites, gff3_introns, dbsnp_iit, snpindex, cmetindex, and atoiindex.)
 
 You will want to read the README_
 
@@ -169,6 +213,129 @@
 .. _Publication: http://bioinformatics.oxfordjournals.org/cgi/content/full/21/9/1859
 
 
+**circular chromosomes**
+
+Finally, you can provide information to gmap_build that certain
+chromosomes are circular, with the -c or -\-circular flag.  The value
+for these flags is a list of chromosomes, separated by commas.  The
+gmap_build program will then allow GSNAP and GMAP to align reads
+across the ends of the chromosome.  For example, the mitochondrial
+genome in human beings is circular.
+
+
+**Detecting known and novel splice sites in GSNAP**
+
+GSNAP can detect splice junctions in individual reads.  
+GSNAP allows for known splicing at two levels: at the level of known
+splice sites and at the level of known introns.  At the site level,
+GSNAP finds splicing between arbitrary combinations of donor and
+acceptor splice sites, meaning that it can find alternative splicing
+events.  At the intron level, GSNAP finds splicing only between the
+set of given donor-acceptor pairs, so it is constrained not to find
+alternative splicing events, only introns included in the given list.
+For most purposes, I would recommend using known splice sites, rather
+than known introns, unless you are certain that all alternative
+splicing events are known are represented in your file.
+
+Splice site files can be generated from a GTF file 
+or from  refGenes table from UCSC.  
+
+
+**SNP-tolerant alignment in GSNAP**
+
+GSNAP has the ability to align to a reference space of all possible
+major and minor alleles in a set of known SNPs provided by the user.
+
+
+Process known SNP data, either from older dbSNP files or from newer
+files in VCF format.  The older dbSNP files can be obtained from UCSC,
+either from the Galaxy UCSC table browser or downloaded:
+
+    ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130.txt.gz
+
+For versions before snp132, you may also want to exclude exceptions,
+which will require this file:
+
+    ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130Exceptions.txt.gz
+
+The option "-w weight" makes use of the dbSNP item weight, a value
+from 1 to 3, where lower weight means higher confidence.  Items will
+be included if the item weight is the given value weight or less.
+The default value of -w is 1, which is the criterion UCSC uses to
+build its ambiguous version of the genome.  To allow all item weights,
+specify "-w 3".
+
+The more recent SNP data are provided in VCF format, and can be
+retrieved like this:
+
+    ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/00-All.vcf.gz
+
+The VCF file contains multiple versions of dbSNP, so if you want a
+particular version, such as 135, you would use the flag "-v 135".  The
+vcf_iit program tries to pick a subset of SNPs that somewhat parallel
+the ones without exceptions in the UCSC dbSNP file.  It keeps all SNPs
+that have been validated (marked in the VCF file as "VLD") or have a
+submitter link-out ("SLO").  Otherwise, it excludes SNPs that are
+individual genotypes ("GNO").  If none of these conditions hold, then
+the SNP is allowed.  These rules might not be the best ones; I made
+them up by trying to compare version 135 of the VCF data with
+version 135 of the UCSC dbSNP data.
+
+**Alignment of reads from bisulfite-treated DNA in GSNAP**
+
+GSNAP has the ability to align reads from bisulfite-treated DNA, which
+converts unmethylated cytosines to uracils that appear as thymines in
+reads.  GSNAP is able to identify genomic-T to read-C mismatches, 
+if a cmetindex is generated.
+
+**RNA-editing tolerance in GSNAP**
+
+Just as GSNAP has a program cmetindex and a mode called "cmet" for
+tolerance to C-to-T changes, it can be tolerant to A-to-G changes
+using the program atoiindex and a mode called "atoi".  This mode is
+designed to facilitate alignments that are tolerant to RNA editing
+where A's are converted to I's, which appear as G's to a sequencer.
+
+To process reads under RNA-editing tolerance, you will first need to
+create th atoi index.
+
+
+
+**K-mer size**
+
+You can control the k-mer size
+for the genomic index with the -k flag, which can range from 12 to 15.
+The default value for -k is 15, but this requires your machine to have
+4 GB of RAM to build the indices.  If you do not have 4 GB of RAM,
+then you will need to reduce the value of -k or find another machine.
+Here are the RAM requirements for building various indices::
+
+    k-mer of 12: 64 MB
+    k-mer of 13: 256 MB
+    k-mer of 14: 1 GB
+    k-mer of 15: 4 GB
+
+These are the RAM requirements for building indices, but not to run
+the GMAP/GSNAP programs once the indices are built, because the
+genomic indices are compressed.  For example, the genomic index for a
+k-mer of 15 gives a gammaptrs file of 64 MB and an offsetscomp file of
+about 350 MB, much smaller than the 4 GB that would otherwise be
+required.  Therefore, you may want to build your genomic index on a
+computer with sufficient RAM, and distribute that index to be used by
+computers with less RAM.
+
+The amount of compression can be controlled using the -b or -\-basesize
+parameter to gmap_build.  By default, the value for k-mer size is 15,
+and the value for basesize is 12.  If you select a different value for
+k-mer size, then basesize is made by default to be equal to that k-mer
+size.
+
+If you want to build your genomic databases with more than one k-mer
+size, you can re-run gmap_build with different values of -k.  This
+will overwrite only the identical files from the previous runs.  You
+can then choose the k-mer size at run-time by using the -k flag for
+either GMAP or GSNAP.
+
   </help>
 </tool>
 
--- a/gsnap.xml	Fri Oct 05 13:08:43 2012 -0500
+++ b/gsnap.xml	Wed Sep 28 10:43:44 2016 -0400
@@ -1,7 +1,7 @@
-<tool id="gsnap" name="GSNAP" version="2.0.1">
+<tool id="gsnap" name="GSNAP" version="3.0.0">
   <description>Genomic Short-read Nucleotide Alignment Program</description>
   <requirements>
-      <requirement type="package" version="2011-11-30">gmap</requirement>
+      <requirement type="package" version="2013-05-09">gmap</requirement>
   </requirements>
   <version_string>gsnap --version</version_string>
   <command>
@@ -9,7 +9,6 @@
     gsnap
     --nthreads="4" --ordered
     #if $refGenomeSource.genomeSource == "gmapdb":
-      #set $gmapdb = $os.listdir($refGenomeSource.gmapdb.extra_files_path)[0]
       --dir=$refGenomeSource.gmapdb.extra_files_path --db=$refGenomeSource.gmapdb.metadata.db_name
     #else:
       --dir=$os.path.dirname($refGenomeSource.gmapindex.value) --db=$os.path.basename($refGenomeSource.gmapindex.value)
@@ -140,6 +139,9 @@
       #if $output.npath.__str__ != '':
         --npath=$output.npath
       #end if
+      #if $output.maxsearch.__str__ != '':
+        --maxsearch=$output.maxsearch 
+      #end if
       $output.quiet_if_excessive
       $output.show_refdiff
       $output.clip_overlap
@@ -248,7 +250,13 @@
         <param name="fastq" type="data" format="fastq" label="Select a fastq dataset" />
         <conditional name="paired">
           <param name="ispaired" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="Use Paired Reads?"/>
-          <when value="no"/>
+          <when value="no">
+          <!--
+            Could allow multiple input fastq and set the force-single-end flag
+            force-single-end             When multiple FASTQ files are provided on the command line, GSNAP assumes
+                                    they are match paired-end files.  This flag treats each file as single-end.
+          -->
+          </when>
           <when value="yes">
             <param name="fastq" type="data" format="fastq" label="Select the paired reads reverse dataset" />
             <param name="orientation" type="select" label="Orientation of paired-end reads" help="">
@@ -471,8 +479,11 @@
       <when value="default"/>
       <when value="advanced">
          <param name="max_mismatches" type="float" value="" optional="true" label="Maximum number of mismatches allowed (uses default when negative)" 
-              help="Defaults to the ultrafast level of ((readlength+2)/12 - 2)).
-                    If specified between 0.0 and 1.0, then treated as a fraction
+              help="Maximum number of mismatches allowed (if not specified, then
+                    defaults to the ultrafast level of ((readlength+index_interval-1)/kmer - 2))
+                    (By default, the genome index interval is 3, but this can be changed
+                    by providing a different value for -q to gmap_build when processing the genome.)
+		    If specified between 0.0 and 1.0, then treated as a fraction
                     of each read length.  Otherwise, treated as an integral number
                     of mismatches (including indel and splicing penalties)
                     For RNA-Seq, you may need to increase this value slightly
@@ -481,12 +492,24 @@
          </param>
          <param name="query_unk_mismatch" type="boolean" checked="false" truevalue="--query-unk-mismatch=1" falsevalue="" label="Count unknown (N) characters in the query as a mismatch"/>
          <param name="genome_unk_mismatch" type="boolean" checked="true" truevalue="" falsevalue="--genome-unk-mismatch=0" label="Count unknown (N) characters in the genome as a mismatch"/>
-         <param name="terminal_threshold"  type="integer" value="" optional="true" label="Threshold for searching for a terminal alignment (default 2)" 
-                help="(from one end of the read to the best possible position at the other end).   For example, if this value is 2, then if GSNAP finds an exact or
+         <param name="maxsearch" type="integer" value="" optional="true" label="Maximum number of alignments to find (default 1000)"
+                help="Must be larger than paths, which is the number to report.
+                      Keeping this number large will allow for random selection among multiple alignments.
+                      Reducing this number can speed up the program. "/>
+
+         <param name="terminal_threshold"  type="integer" value="" optional="true" label="Threshold for searching for a terminal alignment" 
+                help="Threshold for searching for a terminal alignment (from one end of the
+                                   read to the best possible position at the other end) (default 2
+                                   for standard, atoi-stranded, and atoi-nonstranded mode; default 100
+                                   for cmet-stranded and cmet-nonstranded mode).
+                                   For example, if this value is 2, then if GSNAP finds an exact or
                                    1-mismatch alignment, it will not try to find a terminal alignment.
                                    Note that this default value may not be low enough if you want to
                                    obtain terminal alignments for very short reads, although such reads
-                                   probably don't have enough specificity for terminal alignments anyway." />
+                                   probably don't have enough specificity for terminal alignments anyway.
+                                   To turn off terminal alignments, set this to a high value, greater
+                                   than the value for max-mismatches.
+                                   "/>
          <param name="indel_penalty"  type="integer" value="" optional="true" label="Penalty for an indel (default 2)" 
                 help="Counts against mismatches allowed.  To find indels, make indel-penalty less than or equal to max-mismatches.  A value &lt; 2 can lead to false positives at read ends" />
          <param name="indel_endlength"  type="integer" value="" optional="true" label="Minimum length at end required for indel alignments (default 4)" />
@@ -497,9 +520,10 @@
          <param name="suboptimal_levels"  type="integer" value="" optional="true" label="Report suboptimal hits beyond best hit (default 0)"
                 help="All hits with best score plus suboptimal-levels are reported" />
          <param name="adapter_strip"  type="select" label="Method for removing adapters from reads" 
-                help="paired removes adapters from paired-end reads if a concordant or paired alignment cannot be found from the original read">
-           <option value="paired" selected="true">paired</option>
-           <option value="off">off</option>
+                help="Default is 'off'.  To turn on, specify 'paired', which removes adapters
+                                   from paired-end reads if they appear to be present.">
+           <option value="paired">paired</option>
+           <option value="off" selected="true">off</option>
          </param>
          <param name="trim_mismatch_score" type="integer" value="" optional="true" label="Score to use for mismatches when trimming at ends (default is -3)" 
                 help="to turn off trimming, specify 0 (Warning: turning trimming off will give false positive mismatches at the ends of reads)"/>
@@ -523,6 +547,7 @@
           <param name="gmap_mode" type="select" multiple="true" optional="true" display="checkboxes" label="Cases to use GMAP for complex alignments containing multiple splices or indels" 
                  help="Default: pairsearch,terminal,improve">
             <option value="pairsearch" selected="true">pairsearch</option>
+            <option value="indel_knownsplice" selected="true">indel_knownsplice</option>
             <option value="terminal" selected="true">terminal</option>
             <option value="improve" selected="true">improve</option>
           </param>
--- a/iit_store.xml	Fri Oct 05 13:08:43 2012 -0500
+++ b/iit_store.xml	Wed Sep 28 10:43:44 2016 -0400
@@ -1,7 +1,7 @@
-<tool id="gmap_iit_store" name="GMAP IIT" version="2.0.0">
+<tool id="gmap_iit_store" name="GMAP IIT" version="3.0.0">
   <description>Create a map store for known genes or SNPs</description>
   <requirements>
-      <requirement type="package" version="2011-11-30">gmap</requirement>
+      <requirement type="package" version="2013-05-09">gmap</requirement>
   </requirements>
   <version_string>iit_store --version</version_string>
   <command interpreter="command"> /bin/bash $shscript 2> $log </command>
--- a/lib/galaxy/datatypes/gmap.py	Fri Oct 05 13:08:43 2012 -0500
+++ b/lib/galaxy/datatypes/gmap.py	Wed Sep 28 10:43:44 2016 -0400
@@ -2,7 +2,7 @@
 GMAP indexes
 """
 import logging
-import os,os.path,re
+import os,os.path,re,sys
 import galaxy.datatypes.data
 from galaxy.datatypes.data import Text
 from galaxy import util
@@ -15,11 +15,14 @@
     A GMAP DB for indexes
     """
     MetadataElement( name="db_name", desc="The db name for this index set", default='unknown', set_in_upload=True, readonly=True )
+    MetadataElement( name="chromosomes", desc="The chromosomes or contigs", no_value=[], readonly=False )
+    MetadataElement( name="circular", desc="cirular chromosomes", no_value=[], readonly=False )
+    MetadataElement( name="chromlength", desc="Chromosome lengths", no_value=[], readonly=False )
     MetadataElement( name="basesize", default="12", desc="The basesize for offsetscomp", visible=True, readonly=True )
-    MetadataElement( name="kmers", default=[''], desc="The kmer sizes for indexes", visible=True, no_value=[''], readonly=True )
+    MetadataElement( name="kmers", desc="The kmer sizes for indexes", visible=True, no_value=[''], readonly=True )
     MetadataElement( name="map_dir", desc="The maps directory", default='unknown', set_in_upload=True, readonly=True )
-    MetadataElement( name="maps", default=[''], desc="The names of maps stored for this gmap gmapdb", visible=True, no_value=[''], readonly=True )
-    MetadataElement( name="snps", default=[''], desc="The names of SNP indexes stored for this gmapdb", visible=True, no_value=[''], readonly=True )
+    MetadataElement( name="maps", desc="The names of maps stored for this gmap gmapdb", visible=True, no_value=[''], readonly=True )
+    MetadataElement( name="snps", desc="The names of SNP indexes stored for this gmapdb", visible=True, no_value=[''], readonly=True )
     MetadataElement( name="cmet", default=False, desc="Has a cmet index", visible=True, readonly=True )
     MetadataElement( name="atoi", default=False, desc="Has a atoi index", visible=True, readonly=True )
     
@@ -41,10 +44,24 @@
         """
         bn = dataset.metadata.db_name
         log.info( "GmapDB regenerate_primary_file %s" % (bn))
-        rval = ['<html><head><title>GMAPDB %s</title></head><p/><H3>GMAPDB %s</H3><p/>cmet %s<br>atoi %s<H4>Maps:</H4><ul>' % (bn,bn,dataset.metadata.cmet,dataset.metadata.atoi)]
-        for i,name in enumerate(dataset.metadata.maps):
-            rval.append( '<li>%s' % name)
-        rval.append( '</ul></html>' )
+        rval = []
+        rval.append("GMAPDB: %s" % dataset.metadata.db_name)
+        if dataset.metadata.chromosomes:
+            rval.append("chromosomes: %s" % dataset.metadata.chromosomes)
+        if dataset.metadata.chromlength and len(dataset.metadata.chromlength) == len(dataset.metadata.chromosomes):
+            rval.append( 'chrom\tlength' )
+            for i,name in enumerate(dataset.metadata.chromosomes):
+                rval.append( '%s\t%d' % (dataset.metadata.chromosomes[i],dataset.metadata.chromlength[i]))
+        if dataset.metadata.circular:
+            rval.append("circular: %s" % dataset.metadata.circular)
+        if dataset.metadata.kmers:
+            rval.append("kmers: %s" % dataset.metadata.kmers)
+        rval.append("cmetindex: %s  atoiindex: %s" % (dataset.metadata.cmet,dataset.metadata.atoi))
+        if dataset.metadata.maps and len(dataset.metadata.maps) > 0:
+            rval.append( 'Maps:')
+            for i,name in enumerate(dataset.metadata.maps):
+                if name.strip() != '':
+                    rval.append( ' %s' % name)
         f = file(dataset.file_name,'w')
         f.write("\n".join( rval ))
         f.write('\n')
@@ -53,7 +70,7 @@
     def set_peek( self, dataset, is_multi_byte=False ):
         log.info( "GmapDB set_peek %s" % (dataset))
         if not dataset.dataset.purged:
-            dataset.peek  = "GMAPDB index %s\n cmet %s\n atoi %s\n maps %s" % ( dataset.metadata.db_name,dataset.metadata.cmet,dataset.metadata.atoi,dataset.metadata.maps )
+            dataset.peek  = "GMAPDB index %s\n chroms %s\n kmers %s cmet %s atoi %s\n maps %s" % ( dataset.metadata.db_name,dataset.metadata.chromosomes,dataset.metadata.kmers,dataset.metadata.cmet,dataset.metadata.atoi,dataset.metadata.maps )
             dataset.blurb = "GMAPDB %s" % ( dataset.metadata.db_name )
         else:
             dataset.peek = 'file does not exist'
@@ -68,6 +85,26 @@
         return False
     def set_meta( self, dataset, overwrite = True, **kwd ):
         """
+        extra_files_path/<db_name>/GRCh37_19
+        extra_files_path/<db_name>/GRCh37_19/GRCh37_19.a2iag12123offsetscomp
+        extra_files_path/<db_name>/GRCh37_19/GRCh37_19.a2iag123positions
+        extra_files_path/<db_name>/GRCh37_19/GRCh37_19.a2itc12123offsetscomp
+        extra_files_path/<db_name>/GRCh37_19/GRCh37_19.a2itc123positions
+        extra_files_path/<db_name>/GRCh37_19/GRCh37_19.chromosome
+        extra_files_path/<db_name>/GRCh37_19/GRCh37_19.chromosome.iit
+        extra_files_path/<db_name>/GRCh37_19/GRCh37_19.chrsubset
+        extra_files_path/<db_name>/GRCh37_19/GRCh37_19.contig
+        extra_files_path/<db_name>/GRCh37_19/GRCh37_19.contig.iit
+        extra_files_path/<db_name>/GRCh37_19/GRCh37_19.genomecomp
+        extra_files_path/<db_name>/GRCh37_19/GRCh37_19.maps
+        extra_files_path/<db_name>/GRCh37_19/GRCh37_19.metct12123offsetscomp
+        extra_files_path/<db_name>/GRCh37_19/GRCh37_19.metct123positions
+        extra_files_path/<db_name>/GRCh37_19/GRCh37_19.metga12123offsetscomp
+        extra_files_path/<db_name>/GRCh37_19/GRCh37_19.metga123positions
+        extra_files_path/<db_name>/GRCh37_19/GRCh37_19.ref12123offsetscomp
+        extra_files_path/<db_name>/GRCh37_19/GRCh37_19.ref123positions
+        extra_files_path/<db_name>/GRCh37_19/GRCh37_19.version
+
         Expecting:
         extra_files_path/<db_name>/db_name>.ref<basesize><kmer>3<index>
         extra_files_path/db_name/db_name.ref1[2345]1[2345]3offsetscomp
@@ -77,7 +114,9 @@
         extra_files_path/db_name/db_name.maps/*.iit
         """
         log.info( "GmapDB set_meta %s %s" % (dataset,dataset.extra_files_path))
-        pat = '(.*)\.((ref)|(met)[atgc][atgc]|(a2i)[atgc][atgc])((\d\d)(\d\d))?3positions(\.(.+))?'
+        chrom_pat = '^(.+).chromosome$'
+        #pat = '(.*)\.((ref)|(met)[atgc][atgc]|(a2i)[atgc][atgc])((\d\d)(\d\d))?3positions(\.(.+))?'
+        pat = '(.*)\.((ref)|(met)[atgc][atgc]|(a2i)[atgc][atgc])((\d\d)(\d\d))?(\d)(offsetscomp)'
         efp = dataset.extra_files_path
         flist = os.listdir(efp)
         for i,fname in enumerate(flist):
@@ -85,24 +124,60 @@
             fpath = os.path.join(efp,fname)
             if os.path.isdir(fpath):
                 ilist = os.listdir(fpath)
-                kmers = {'':'default'} # HACK  '' empty key  added so user has default choice when selecting kmer from metadata
+                # kmers = {'':'default'} # HACK  '' empty key  added so user has default choice when selecting kmer from metadata
+                kmers = dict()
                 for j,iname in enumerate(ilist):
                     log.info( "GmapDB set_meta file %s %s" % (j,iname))
                     ipath = os.path.join(fpath,iname)
+                    print >> sys.stderr, "GmapDB set_meta file %s %s %s" % (j,iname,ipath)
                     if os.path.isdir(ipath):  # find maps
                         dataset.metadata.map_dir = iname
+                        maps = []
+                        snps = []
                         for mapfile in os.listdir(ipath):
                             mapname = mapfile.replace('.iit','')
                             log.info( "GmapDB set_meta map %s %s" % (mapname,mapfile))
-                            dataset.metadata.maps.append(mapname)
+                            print >> sys.stderr, "GmapDB set_meta map %s %s " % (mapname,mapfile)
+                            maps.append(mapname)
+                            if mapname.find('snp') >= 0: 
+                                snps.append(mapname)
+                        if len(maps) > 0:
+                            dataset.metadata.maps = maps
+                        if len(snps) > 0:
+                            dataset.metadata.snps = snps
                     else: 
+                        m = re.match(chrom_pat,iname)
+                        if m and len(m.groups()) == 1:
+                            dataset.metadata.db_name = m.groups()[0]
+                            print >> sys.stderr, "GmapDB set_meta file %s %s %s" % (j,iname,ipath)
+                            try:
+                                fh = open(ipath)
+                                dataset.metadata.chromosomes = []
+                                dataset.metadata.circular = []
+                                dataset.metadata.chromlength = []
+                                for k,line in enumerate(fh):
+                                   fields = line.strip().split('\t')
+                                   print >> sys.stderr, "GmapDB set_meta chrom %s fields %s"  % (line,fields)
+                                   if len(fields) > 2:
+                                       dataset.metadata.chromosomes.append(str(fields[0]))
+                                       dataset.metadata.chromlength.append(int(fields[2]))
+                                   if len(fields) > 3 and fields[3] == 'circular':
+                                       dataset.metadata.circular.append(str(fields[0]))
+                                print >> sys.stderr, "GmapDB set_meta db_name %s chromosomes %s  circular %s" % (dataset.metadata.db_name,dataset.metadata.chromosomes,dataset.metadata.circular)
+                            except Exception, e:
+                                log.info( "GmapDB set_meta error %s %s " % (iname, e))
+                                print >> sys.stderr, "GmapDB set_meta file %s Error %s"  % (ipath,e)
+                            finally:
+                                if fh:
+                                    fh.close()
+                            continue
                         m = re.match(pat,iname)
                         if m:
                             log.info( "GmapDB set_meta m %s %s " % (iname, m))
+                            print >> sys.stderr, "GmapDB set_meta iname %s  %s" % (iname,m.groups())
                             assert len(m.groups()) == 10
-                            dataset.metadata.db_name = fname
                             if m.groups()[2] == 'ref':
-                                if m.groups()[-1] != None:
+                                if m.groups()[-1] != None and m.groups()[-1] != 'offsetscomp':
                                     dataset.metadata.snps.append(m.groups()[-1])
                                 else:
                                     if m.groups()[-3] != None:
@@ -115,6 +190,7 @@
                             elif m.groups()[4] == 'a2i':
                                 dataset.metadata.atoi = True
                 dataset.metadata.kmers = kmers.keys()
+        self.regenerate_primary_file(dataset)
 
 class GmapSnpIndex( Text ):
     """
--- a/snpindex.xml	Fri Oct 05 13:08:43 2012 -0500
+++ b/snpindex.xml	Wed Sep 28 10:43:44 2016 -0400
@@ -1,7 +1,7 @@
-<tool id="gmap_snpindex" name="GMAP SNP Index" version="2.0.0">
+<tool id="gmap_snpindex" name="GMAP SNP Index" version="3.0.0">
   <description>build index files for known SNPs</description>
   <requirements>
-      <requirement type="package" version="2011-11-30">gmap</requirement>
+      <requirement type="package" version="2013-05-09">gmap</requirement>
   </requirements>
   <version_string>snpindex --version</version_string>
   <command interpreter="command"> /bin/bash $shscript 2>1 1> $output </command>
--- a/tool_dependencies.xml	Fri Oct 05 13:08:43 2012 -0500
+++ b/tool_dependencies.xml	Wed Sep 28 10:43:44 2016 -0400
@@ -1,21 +1,6 @@
 <?xml version="1.0"?>
 <tool_dependency>
-    <package name="gmap" version="2011-11-30">
-        <install version="1.0">
-            <actions>
-                <action type="shell_command">wget http://research-pub.gene.com/gmap/src/gmap-gsnap-2011-11-30.tar.gz</action>
-                <action type="shell_command"> ./configure --prefix=bin --with-gmapdb=../gmapdb</action>
-                <action type="shell_command">make</action>
-                <action type="move_directory_files">
-                    <source_directory>bin</source_directory>
-                    <destination_directory>$INSTALL_DIR/bin</destination_directory>
-                </action>
-                <action type="set_environment">
-                    <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/bin</environment_variable>
-                </action>
-            </actions>
-        </install>
-        <readme>
-        </readme>
+    <package name="gmap" version="2013-05-09">
+        <repository changeset_revision="8bacc25e2023" name="package_gmap_2013_05_09" owner="jjohnson" toolshed="https://testtoolshed.g2.bx.psu.edu" />
     </package>
 </tool_dependency>