Mercurial > repos > jjohnson > gmap
changeset 2:f6ba0f12cca2 draft
Untested work-in-progress GMAP wrappers v3.0.0, from JJ back in June 2013
author | peterjc |
---|---|
date | Wed, 28 Sep 2016 10:43:44 -0400 |
parents | 74391fc6e3f2 |
children | 488e9d642566 |
files | datatypes_conf.xml gmap.xml gmap_build.xml gsnap.xml iit_store.xml lib/galaxy/datatypes/gmap.py snpindex.xml tool_dependencies.xml |
diffstat | 8 files changed, 383 insertions(+), 76 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datatypes_conf.xml Wed Sep 28 10:43:44 2016 -0400 @@ -0,0 +1,24 @@ +<?xml version="1.0"?> +<datatypes> + <datatype_files> + <datatype_file name="gmap.py"/> + </datatype_files> + <registration> + <datatype extension="gmapdb" type="galaxy.datatypes.gmap:GmapDB" display_in_upload="False"/> + <datatype extension="gmapsnpindex" type="galaxy.datatypes.gmap:GmapSnpIndex" display_in_upload="False"/> + <datatype extension="iit" type="galaxy.datatypes.gmap:IntervalIndexTree" display_in_upload="True"/> + <datatype extension="splicesites.iit" type="galaxy.datatypes.gmap:SpliceSitesIntervalIndexTree" display_in_upload="True"/> + <datatype extension="introns.iit" type="galaxy.datatypes.gmap:IntronsIntervalIndexTree" display_in_upload="True"/> + <datatype extension="snps.iit" type="galaxy.datatypes.gmap:SNPsIntervalIndexTree" display_in_upload="True"/> + <datatype extension="gmap_annotation" type="galaxy.datatypes.gmap:IntervalAnnotation" display_in_upload="False"/> + <datatype extension="gmap_splicesites" type="galaxy.datatypes.gmap:SpliceSiteAnnotation" display_in_upload="True"/> + <datatype extension="gmap_introns" type="galaxy.datatypes.gmap:IntronAnnotation" display_in_upload="True"/> + <datatype extension="gmap_snps" type="galaxy.datatypes.gmap:SNPAnnotation" display_in_upload="True"/> + </registration> + <sniffers> + <sniffer type="galaxy.datatypes.gmap:IntervalAnnotation"/> + <sniffer type="galaxy.datatypes.gmap:SpliceSiteAnnotation"/> + <sniffer type="galaxy.datatypes.gmap:IntronAnnotation"/> + <sniffer type="galaxy.datatypes.gmap:SNPAnnotation"/> + </sniffers> +</datatypes>
--- a/gmap.xml Fri Oct 05 13:08:43 2012 -0500 +++ b/gmap.xml Wed Sep 28 10:43:44 2016 -0400 @@ -1,7 +1,7 @@ -<tool id="gmap" name="GMAP" version="2.0.1"> +<tool id="gmap" name="GMAP" version="3.0.0"> <description>Genomic Mapping and Alignment Program for mRNA and EST sequences</description> <requirements> - <requirement type="package" version="2011-11-30">gmap</requirement> + <requirement type="package" version="2013-05-09">gmap</requirement> </requirements> <version_string>gmap --version</version_string> <command> @@ -11,8 +11,7 @@ #if $refGenomeSource.genomeSource == "history": --gseg=$refGenomeSource.ownFile #elif $refGenomeSource.genomeSource == "gmapdb": - #set $gmapdb = $os.listdir($refGenomeSource.gmapdb.extra_files_path)[0] - --dir=$refGenomeSource.gmapdb.extra_files_path --db=$gmapdb + --dir=$refGenomeSource.gmapdb.extra_files_path --db=$refGenomeSource.gmapdb.metadata.db_name #if $refGenomeSource.kmer != None and len($refGenomeSource.kmer.__str__) == 2: --kmer=$refGenomeSource.kmer #end if @@ -43,6 +42,9 @@ #elif $result.format == "sam": --format=$result.sam_paired_read $result.no_sam_headers + $result.sam_use_0M + $result.force_xs_dir + $result.md_lowercase_snp #* Removed in gmap version 2011-11-30 #if len($result.noncanonical_splices.__str__) > 0 --noncanonical-splices=$result.noncanonical_splices @@ -65,6 +67,7 @@ #end if #if $computation.options == "advanced": $computation.nosplicing + $computation.find_shifted_canonical $computation.cross_species #if len($computation.min_intronlength.__str__) > 0 --min-intronlength=$computation.min_intronlength @@ -170,6 +173,15 @@ <filter type="sort_by" column="3"/> </options> </param> + <!-- + basesize=INT Base size to use in genome database. If not specified, the program + will find the highest available base size in the genome database + within selected k-mer size + sampling=INT Sampling to use in genome database. If not specified, the program + will find the smallest available sampling value in the genome database + within selected basesize and k-mer size + + --> <param name="map" type="select" data_ref="gmapindex" label="Look for splicing involving known sites or known introns" help=""> <options from_file="gmap_indices.loc"> <column name="name" index="4"/> @@ -224,7 +236,7 @@ <validator type="in_range" message="totallength must be positive" min="0" /> </param> <param name="chimera_margin" type="integer" value="" optional="true" label="Amount of unaligned sequence that triggers search for a chimera" - help=" default is 40, To turn off, set to a large value (greater than the query length)" > + help=" default is 40, To turn off, set to 0" > <validator type="in_range" message="chimera_margin must be positive" min="0" /> </param> <param name="direction" type="select" label="cDNA direction"> @@ -237,6 +249,7 @@ <param name="trimendexons" type="integer" value="" optional="true" label="Trim end exons with fewer than given number of matches (in nt, default 12)" > <validator type="in_range" message="trimendexons must be positive" min="1" /> </param> + <param name="find_shifted_canonical" type="boolean" truevalue="--find-shifted-canonical-species" falsevalue="" checked="false" label="find-shifted-canonical Use a more sensitive search for canonical splicing" help=""/> <param name="cross_species" type="boolean" truevalue="--cross-species" falsevalue="" checked="false" label="Cross-species alignment" help="For cross-species alignments, use a more sensitive search for canonical splicing"/> <param name="canonical" type="select" label="Reward for canonical and semi-canonical introns"> @@ -328,7 +341,7 @@ <option value="splicesites">splicesites output (for GSNAP)</option> <option value="introns">introns output (for GSNAP)</option> <option value="map_exons">IIT FASTA exon map format</option> - <option value="map_genes">IIT FASTA map format</option> + <option value="map_ranges">IIT FASTA map format</option> <option value="coords">coords in table format</option> <option value="sam" selected="true">SAM format</option> </param> @@ -353,7 +366,7 @@ <when value="splicesites"/> <when value="introns"/> <when value="map_exons"/> - <when value="map_genes"/> + <when value="map_ranges"/> <when value="coords"/> <when value="sam"> <param name="sam_paired_read" type="boolean" truevalue="sampe" falsevalue="samse" checked="false" label="SAM paired reads"/> @@ -369,6 +382,11 @@ <param name="read_group_name" type="text" value="" label="Value to put into read-group name (RG-SM) field"/> <param name="read_group_library" type="text" value="" label="Value to put into read-group library (RG-LB) field"/> <param name="read_group_platform" type="text" value="" label="Value to put into read-group library platform (RG-PL) field"/> + <param name="sam_use_0M" type="boolean" truevalue="--sam-use-0M" falsevalue="" checked="false" label="Insert 0M in CIGAR between adjacent insertions and deletions" help="Required by Picard, but can cause errors in other tools"/> + <param name="force_xs_dir" type="boolean" truevalue="--force-xs-dir" falsevalue="" checked="false" label="Force direction (disallow XS:A:?)" + help="For RNA-Seq alignments, disallows XS:A:? when the sense direction is unclear, and replaces this value arbitrarily with XS:A:+. May be useful for some programs, such as Cufflinks, that cannot handle XS:A:?. However, if you use this flag, the reported value of XS:A:+ in these cases will not be meaningful."/> + <param name="md_lowercase_snp" type="boolean" truevalue="--md-lowercase-snp" falsevalue="" checked="false" label="MD lowercase SNP" + help="In MD string, when known SNPs are given by the -v flag, prints difference nucleotides as lower-case when they, differ from reference but match a known alternate allele"/> </when> </conditional> <!-- name="result" --> @@ -383,6 +401,18 @@ print-comment Show comment line for each hit --> + <!-- + min-trimmed-coverage=FLOAT Do not print alignments with trimmed coverage less + this value (default=0.0, which means no filtering) + Note that chimeric alignments will be output regardless + of this filter + min-identity=FLOAT Do not print alignments with identity less + this value (default=0.0, which means no filtering) + Note that chimeric alignments will be output regardless + of this filter + --> + + </inputs> <outputs>
--- a/gmap_build.xml Fri Oct 05 13:08:43 2012 -0500 +++ b/gmap_build.xml Wed Sep 28 10:43:44 2016 -0400 @@ -1,10 +1,10 @@ -<tool id="gmap_build" name="GMAP Build" version="2.0.0"> +<tool id="gmap_build" name="GMAP Build" version="3.0.0"> <description>a database genome index for GMAP and GSNAP</description> <requirements> - <requirement type="package" version="2011-11-30">gmap</requirement> + <requirement type="package" version="2013-05-09">gmap</requirement> </requirements> <version_string>gmap --version</version_string> - <command interpreter="command"> /bin/bash $shscript 2>1 1> $output </command> + <command interpreter="command"> /bin/bash $shscript > $output </command> <inputs> <!-- Name for this gmapdb --> <param name="refname" type="text" label="Name you want to give this gmap database" help=""> @@ -15,12 +15,17 @@ <param name="input" type="data" format="fasta" label="reference sequence fasta" /> </repeat> - <param name="kmer" type="select" multiple="true" force_select="true" label="kmer size" help=""> - <option value="12">12</option> - <option value="13">13</option> - <option value="14">14</option> - <option value="15" selected="true">15</option> + <param name="circular_chroms" type="text" value="" optional="true" label="Names of circular chromosomes" + help="a list of chromosomes, separated by commas, allow GSNAP and GMAP to align reads across the ends of the chromosome"> + </param> + + <param name="sort" type="select" label="Sort chromosomes" help=""> + <option value="none">none - use chromosomes as found in FASTA file(s)</option> + <option value="alpha">alpha - sort chromosomes alphabetically (chr10 before chr 1)</option> + <option value="numeric-alpha">numeric-alpha - chr1, chr1U, chr2, chrM, chrU, chrX, chrY</option> + <option value="chrom">chrom - chr1, chr2, chrM, chrX, chrY, chr1U, chrU</option> </param> + <param name="cmetindex" type="boolean" checked="true" truevalue="yes" falsevalue="no" label="Create cmetindex to process reads from bisulfite-treated DNA"/> <param name="atoiindex" type="boolean" checked="true" truevalue="yes" falsevalue="no" label="Create atoiindex to process reads under RNA-editing tolerance"/> <conditional name="splicesite"> @@ -51,6 +56,7 @@ <option value="none"></option> <option value="snpTable">UCSC SNP Table</option> <option value="snpFile">GMAP SNP File</option> + <option value="vcfFile">VCF File</option> </param> <when value="none"/> <when value="snpTable"> @@ -77,8 +83,33 @@ sequence, that SNP will be ignored in subsequent processing as a probable error. The N stands for any other allele." /> </when> + <when value="vcfFile"> + <param name="snps" type="data" format="vcf" optional="true" label="VCF SNPs file" + help="Example: ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/00-All.vcf.gz + The VCF file contains multiple versions of dbSNP, so if you want a + particular version, such as 135. The vcf_iit program tries to pick + a subset of SNPs that somewhat parallel + the ones without exceptions in the UCSC dbSNP file. It keeps all SNPs + that have been validated (marked in the VCF file as "VLD") or have a + submitter link-out ("SLO"). Otherwise, it excludes SNPs that are + individual genotypes ("GNO"). If none of these conditions hold, then + the SNP is allowed. "/> + <param name="vcf_version" type="text" value="" optional="true" label="dbSNP version" + help="The VCF file contains multiple versions of dbSNP, so if you want a particular version, such as 135"/> + </when> </conditional> + + <param name="kmer" type="select" multiple="true" force_select="true" label="kmer size" help="Use smaller values when building indexes on machines with limited RAM"> + <option value="12">12 (64MB RAM)</option> + <option value="13">13 (256MB RAM)</option> + <option value="14">14 (1GB RAM)</option> + <option value="15" selected="true">15 (4GB RAM)</option> + </param> + </inputs> + <stdio> + <exit_code range="1" level="fatal" description="Error running gmap_build" /> + </stdio> <outputs> <!-- <data format="txt" name="log" label="${tool.name} on ${on_string}: log"/> @@ -97,15 +128,23 @@ ## #set $ref_files = $ref_files $i.input ## #end for ## echo $ref_files +#set circular = "" +#if $circular_chroms.__str__.strip() != '': +#set circular = ('').join([' -c ','"', $circular_chroms.__str__,'"']) +#end if #import os.path #set $gmapdb = $output.extra_files_path #set $mapsdir = $os.path.join($os.path.join($gmapdb,str($refname)), str($refname) + '.maps') mkdir -p $gmapdb ## export GMAPDB required for cmetindex and atoiindex export GMAPDB=$gmapdb +#if $kmer: #for $k in $kmer.__str__.split(','): -gmap_build -D $gmapdb -d $refname -s numeric-alpha -k $k #for i in $inputs# ${i.input}#end for# +gmap_build -D $gmapdb -d $refname -s $sort $circular -k $k #for i in $inputs# ${i.input}#end for# #end for +#else: +gmap_build -D $gmapdb -d $refname -s $sort $circular #for i in $inputs# ${i.input}#end for# +#end if get-genome -D $gmapdb -d '?' | sed 's/^Available .*/gmap db: /' echo "kmers: " $kmer #if $splicesite.splice_source == 'refGeneTable': @@ -125,17 +164,23 @@ #end if #end if #if $dbsnp.snp_source != 'none' and $dbsnp.snps.__str__ != 'None': -#if $dbsnp.snp_source == 'snpTable': -#if $dbsnp.snpsex.__str__ != 'None': -cat $dbsnp.snps | dbsnp_iit -w $dbsnp.weight -e $dbsnp.snpsex | iit_store -o $os.path.join($mapsdir,'snps') -#else: -cat $dbsnp.snps | dbsnp_iit -w $dbsnp.weight | iit_store -o $os.path.join($mapsdir,'snps') -#end if -#else: -cat $dbsnp.snps | iit_store -o $os.path.join($mapsdir,'snps') -#end if -snpindex -d $refname -v snps -echo "snpindex" -d $refname -v snps + #if $dbsnp.snp_source == 'snpTable': + #if $dbsnp.snpsex.__str__ != 'None': + cat $dbsnp.snps | dbsnp_iit -w $dbsnp.weight -e $dbsnp.snpsex | iit_store -o $os.path.join($mapsdir,'snps') + #else: + cat $dbsnp.snps | dbsnp_iit -w $dbsnp.weight | iit_store -o $os.path.join($mapsdir,'snps') + #end if + #elif $dbsnp.snp_source == 'vcfFile': + #if $dbsnp.vcf_version and len($dbsnp.vcf_version.__str__.strip()) > 0: + cat $dbsnp.snps | vcf_iit -v $dbsnp.vcf_version.__str__.strip() | iit_store -o $os.path.join($mapsdir,'snps') + #else: + cat $dbsnp.snps | vcf_iit | iit_store -o $os.path.join($mapsdir,'snps') + #end if + #else: + cat $dbsnp.snps | iit_store -o $os.path.join($mapsdir,'snps') + #end if + snpindex -d $refname -v snps + echo "snpindex" -d $refname -v snps #end if #if $cmetindex.__str__ == 'yes': cmetindex -d $refname @@ -154,10 +199,9 @@ <help> - **GMAP Build** -GMAP Build creates an index of a genomic sequence for mapping and alignment using GMAP_ (Genomic Mapping and Alignment Program for mRNA and EST sequences) and GSNAP_ (Genomic Short-read Nucleotide Alignment Program). (GMAP Build uses GMSP commands: gmap_build, iit_store, psl_splicesites, psl_introns, gtf_splicesites, gtf_introns, gff3_splicesites, gff3_introns, dbsnp_iit, snpindex, cmetindex, and atoiindex.) +GMAP Build creates an index of a genomic sequence for mapping and alignment using GMAP_ (Genomic Mapping and Alignment Program for mRNA and EST sequences) and GSNAP_ (Genomic Short-read Nucleotide Alignment Program). (GMAP Build uses GMAP commands: gmap_build, iit_store, psl_splicesites, psl_introns, gtf_splicesites, gtf_introns, gff3_splicesites, gff3_introns, dbsnp_iit, snpindex, cmetindex, and atoiindex.) You will want to read the README_ @@ -169,6 +213,129 @@ .. _Publication: http://bioinformatics.oxfordjournals.org/cgi/content/full/21/9/1859 +**circular chromosomes** + +Finally, you can provide information to gmap_build that certain +chromosomes are circular, with the -c or -\-circular flag. The value +for these flags is a list of chromosomes, separated by commas. The +gmap_build program will then allow GSNAP and GMAP to align reads +across the ends of the chromosome. For example, the mitochondrial +genome in human beings is circular. + + +**Detecting known and novel splice sites in GSNAP** + +GSNAP can detect splice junctions in individual reads. +GSNAP allows for known splicing at two levels: at the level of known +splice sites and at the level of known introns. At the site level, +GSNAP finds splicing between arbitrary combinations of donor and +acceptor splice sites, meaning that it can find alternative splicing +events. At the intron level, GSNAP finds splicing only between the +set of given donor-acceptor pairs, so it is constrained not to find +alternative splicing events, only introns included in the given list. +For most purposes, I would recommend using known splice sites, rather +than known introns, unless you are certain that all alternative +splicing events are known are represented in your file. + +Splice site files can be generated from a GTF file +or from refGenes table from UCSC. + + +**SNP-tolerant alignment in GSNAP** + +GSNAP has the ability to align to a reference space of all possible +major and minor alleles in a set of known SNPs provided by the user. + + +Process known SNP data, either from older dbSNP files or from newer +files in VCF format. The older dbSNP files can be obtained from UCSC, +either from the Galaxy UCSC table browser or downloaded: + + ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130.txt.gz + +For versions before snp132, you may also want to exclude exceptions, +which will require this file: + + ftp://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/snp130Exceptions.txt.gz + +The option "-w weight" makes use of the dbSNP item weight, a value +from 1 to 3, where lower weight means higher confidence. Items will +be included if the item weight is the given value weight or less. +The default value of -w is 1, which is the criterion UCSC uses to +build its ambiguous version of the genome. To allow all item weights, +specify "-w 3". + +The more recent SNP data are provided in VCF format, and can be +retrieved like this: + + ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/00-All.vcf.gz + +The VCF file contains multiple versions of dbSNP, so if you want a +particular version, such as 135, you would use the flag "-v 135". The +vcf_iit program tries to pick a subset of SNPs that somewhat parallel +the ones without exceptions in the UCSC dbSNP file. It keeps all SNPs +that have been validated (marked in the VCF file as "VLD") or have a +submitter link-out ("SLO"). Otherwise, it excludes SNPs that are +individual genotypes ("GNO"). If none of these conditions hold, then +the SNP is allowed. These rules might not be the best ones; I made +them up by trying to compare version 135 of the VCF data with +version 135 of the UCSC dbSNP data. + +**Alignment of reads from bisulfite-treated DNA in GSNAP** + +GSNAP has the ability to align reads from bisulfite-treated DNA, which +converts unmethylated cytosines to uracils that appear as thymines in +reads. GSNAP is able to identify genomic-T to read-C mismatches, +if a cmetindex is generated. + +**RNA-editing tolerance in GSNAP** + +Just as GSNAP has a program cmetindex and a mode called "cmet" for +tolerance to C-to-T changes, it can be tolerant to A-to-G changes +using the program atoiindex and a mode called "atoi". This mode is +designed to facilitate alignments that are tolerant to RNA editing +where A's are converted to I's, which appear as G's to a sequencer. + +To process reads under RNA-editing tolerance, you will first need to +create th atoi index. + + + +**K-mer size** + +You can control the k-mer size +for the genomic index with the -k flag, which can range from 12 to 15. +The default value for -k is 15, but this requires your machine to have +4 GB of RAM to build the indices. If you do not have 4 GB of RAM, +then you will need to reduce the value of -k or find another machine. +Here are the RAM requirements for building various indices:: + + k-mer of 12: 64 MB + k-mer of 13: 256 MB + k-mer of 14: 1 GB + k-mer of 15: 4 GB + +These are the RAM requirements for building indices, but not to run +the GMAP/GSNAP programs once the indices are built, because the +genomic indices are compressed. For example, the genomic index for a +k-mer of 15 gives a gammaptrs file of 64 MB and an offsetscomp file of +about 350 MB, much smaller than the 4 GB that would otherwise be +required. Therefore, you may want to build your genomic index on a +computer with sufficient RAM, and distribute that index to be used by +computers with less RAM. + +The amount of compression can be controlled using the -b or -\-basesize +parameter to gmap_build. By default, the value for k-mer size is 15, +and the value for basesize is 12. If you select a different value for +k-mer size, then basesize is made by default to be equal to that k-mer +size. + +If you want to build your genomic databases with more than one k-mer +size, you can re-run gmap_build with different values of -k. This +will overwrite only the identical files from the previous runs. You +can then choose the k-mer size at run-time by using the -k flag for +either GMAP or GSNAP. + </help> </tool>
--- a/gsnap.xml Fri Oct 05 13:08:43 2012 -0500 +++ b/gsnap.xml Wed Sep 28 10:43:44 2016 -0400 @@ -1,7 +1,7 @@ -<tool id="gsnap" name="GSNAP" version="2.0.1"> +<tool id="gsnap" name="GSNAP" version="3.0.0"> <description>Genomic Short-read Nucleotide Alignment Program</description> <requirements> - <requirement type="package" version="2011-11-30">gmap</requirement> + <requirement type="package" version="2013-05-09">gmap</requirement> </requirements> <version_string>gsnap --version</version_string> <command> @@ -9,7 +9,6 @@ gsnap --nthreads="4" --ordered #if $refGenomeSource.genomeSource == "gmapdb": - #set $gmapdb = $os.listdir($refGenomeSource.gmapdb.extra_files_path)[0] --dir=$refGenomeSource.gmapdb.extra_files_path --db=$refGenomeSource.gmapdb.metadata.db_name #else: --dir=$os.path.dirname($refGenomeSource.gmapindex.value) --db=$os.path.basename($refGenomeSource.gmapindex.value) @@ -140,6 +139,9 @@ #if $output.npath.__str__ != '': --npath=$output.npath #end if + #if $output.maxsearch.__str__ != '': + --maxsearch=$output.maxsearch + #end if $output.quiet_if_excessive $output.show_refdiff $output.clip_overlap @@ -248,7 +250,13 @@ <param name="fastq" type="data" format="fastq" label="Select a fastq dataset" /> <conditional name="paired"> <param name="ispaired" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="Use Paired Reads?"/> - <when value="no"/> + <when value="no"> + <!-- + Could allow multiple input fastq and set the force-single-end flag + force-single-end When multiple FASTQ files are provided on the command line, GSNAP assumes + they are match paired-end files. This flag treats each file as single-end. + --> + </when> <when value="yes"> <param name="fastq" type="data" format="fastq" label="Select the paired reads reverse dataset" /> <param name="orientation" type="select" label="Orientation of paired-end reads" help=""> @@ -471,8 +479,11 @@ <when value="default"/> <when value="advanced"> <param name="max_mismatches" type="float" value="" optional="true" label="Maximum number of mismatches allowed (uses default when negative)" - help="Defaults to the ultrafast level of ((readlength+2)/12 - 2)). - If specified between 0.0 and 1.0, then treated as a fraction + help="Maximum number of mismatches allowed (if not specified, then + defaults to the ultrafast level of ((readlength+index_interval-1)/kmer - 2)) + (By default, the genome index interval is 3, but this can be changed + by providing a different value for -q to gmap_build when processing the genome.) + If specified between 0.0 and 1.0, then treated as a fraction of each read length. Otherwise, treated as an integral number of mismatches (including indel and splicing penalties) For RNA-Seq, you may need to increase this value slightly @@ -481,12 +492,24 @@ </param> <param name="query_unk_mismatch" type="boolean" checked="false" truevalue="--query-unk-mismatch=1" falsevalue="" label="Count unknown (N) characters in the query as a mismatch"/> <param name="genome_unk_mismatch" type="boolean" checked="true" truevalue="" falsevalue="--genome-unk-mismatch=0" label="Count unknown (N) characters in the genome as a mismatch"/> - <param name="terminal_threshold" type="integer" value="" optional="true" label="Threshold for searching for a terminal alignment (default 2)" - help="(from one end of the read to the best possible position at the other end). For example, if this value is 2, then if GSNAP finds an exact or + <param name="maxsearch" type="integer" value="" optional="true" label="Maximum number of alignments to find (default 1000)" + help="Must be larger than paths, which is the number to report. + Keeping this number large will allow for random selection among multiple alignments. + Reducing this number can speed up the program. "/> + + <param name="terminal_threshold" type="integer" value="" optional="true" label="Threshold for searching for a terminal alignment" + help="Threshold for searching for a terminal alignment (from one end of the + read to the best possible position at the other end) (default 2 + for standard, atoi-stranded, and atoi-nonstranded mode; default 100 + for cmet-stranded and cmet-nonstranded mode). + For example, if this value is 2, then if GSNAP finds an exact or 1-mismatch alignment, it will not try to find a terminal alignment. Note that this default value may not be low enough if you want to obtain terminal alignments for very short reads, although such reads - probably don't have enough specificity for terminal alignments anyway." /> + probably don't have enough specificity for terminal alignments anyway. + To turn off terminal alignments, set this to a high value, greater + than the value for max-mismatches. + "/> <param name="indel_penalty" type="integer" value="" optional="true" label="Penalty for an indel (default 2)" help="Counts against mismatches allowed. To find indels, make indel-penalty less than or equal to max-mismatches. A value < 2 can lead to false positives at read ends" /> <param name="indel_endlength" type="integer" value="" optional="true" label="Minimum length at end required for indel alignments (default 4)" /> @@ -497,9 +520,10 @@ <param name="suboptimal_levels" type="integer" value="" optional="true" label="Report suboptimal hits beyond best hit (default 0)" help="All hits with best score plus suboptimal-levels are reported" /> <param name="adapter_strip" type="select" label="Method for removing adapters from reads" - help="paired removes adapters from paired-end reads if a concordant or paired alignment cannot be found from the original read"> - <option value="paired" selected="true">paired</option> - <option value="off">off</option> + help="Default is 'off'. To turn on, specify 'paired', which removes adapters + from paired-end reads if they appear to be present."> + <option value="paired">paired</option> + <option value="off" selected="true">off</option> </param> <param name="trim_mismatch_score" type="integer" value="" optional="true" label="Score to use for mismatches when trimming at ends (default is -3)" help="to turn off trimming, specify 0 (Warning: turning trimming off will give false positive mismatches at the ends of reads)"/> @@ -523,6 +547,7 @@ <param name="gmap_mode" type="select" multiple="true" optional="true" display="checkboxes" label="Cases to use GMAP for complex alignments containing multiple splices or indels" help="Default: pairsearch,terminal,improve"> <option value="pairsearch" selected="true">pairsearch</option> + <option value="indel_knownsplice" selected="true">indel_knownsplice</option> <option value="terminal" selected="true">terminal</option> <option value="improve" selected="true">improve</option> </param>
--- a/iit_store.xml Fri Oct 05 13:08:43 2012 -0500 +++ b/iit_store.xml Wed Sep 28 10:43:44 2016 -0400 @@ -1,7 +1,7 @@ -<tool id="gmap_iit_store" name="GMAP IIT" version="2.0.0"> +<tool id="gmap_iit_store" name="GMAP IIT" version="3.0.0"> <description>Create a map store for known genes or SNPs</description> <requirements> - <requirement type="package" version="2011-11-30">gmap</requirement> + <requirement type="package" version="2013-05-09">gmap</requirement> </requirements> <version_string>iit_store --version</version_string> <command interpreter="command"> /bin/bash $shscript 2> $log </command>
--- a/lib/galaxy/datatypes/gmap.py Fri Oct 05 13:08:43 2012 -0500 +++ b/lib/galaxy/datatypes/gmap.py Wed Sep 28 10:43:44 2016 -0400 @@ -2,7 +2,7 @@ GMAP indexes """ import logging -import os,os.path,re +import os,os.path,re,sys import galaxy.datatypes.data from galaxy.datatypes.data import Text from galaxy import util @@ -15,11 +15,14 @@ A GMAP DB for indexes """ MetadataElement( name="db_name", desc="The db name for this index set", default='unknown', set_in_upload=True, readonly=True ) + MetadataElement( name="chromosomes", desc="The chromosomes or contigs", no_value=[], readonly=False ) + MetadataElement( name="circular", desc="cirular chromosomes", no_value=[], readonly=False ) + MetadataElement( name="chromlength", desc="Chromosome lengths", no_value=[], readonly=False ) MetadataElement( name="basesize", default="12", desc="The basesize for offsetscomp", visible=True, readonly=True ) - MetadataElement( name="kmers", default=[''], desc="The kmer sizes for indexes", visible=True, no_value=[''], readonly=True ) + MetadataElement( name="kmers", desc="The kmer sizes for indexes", visible=True, no_value=[''], readonly=True ) MetadataElement( name="map_dir", desc="The maps directory", default='unknown', set_in_upload=True, readonly=True ) - MetadataElement( name="maps", default=[''], desc="The names of maps stored for this gmap gmapdb", visible=True, no_value=[''], readonly=True ) - MetadataElement( name="snps", default=[''], desc="The names of SNP indexes stored for this gmapdb", visible=True, no_value=[''], readonly=True ) + MetadataElement( name="maps", desc="The names of maps stored for this gmap gmapdb", visible=True, no_value=[''], readonly=True ) + MetadataElement( name="snps", desc="The names of SNP indexes stored for this gmapdb", visible=True, no_value=[''], readonly=True ) MetadataElement( name="cmet", default=False, desc="Has a cmet index", visible=True, readonly=True ) MetadataElement( name="atoi", default=False, desc="Has a atoi index", visible=True, readonly=True ) @@ -41,10 +44,24 @@ """ bn = dataset.metadata.db_name log.info( "GmapDB regenerate_primary_file %s" % (bn)) - rval = ['<html><head><title>GMAPDB %s</title></head><p/><H3>GMAPDB %s</H3><p/>cmet %s<br>atoi %s<H4>Maps:</H4><ul>' % (bn,bn,dataset.metadata.cmet,dataset.metadata.atoi)] - for i,name in enumerate(dataset.metadata.maps): - rval.append( '<li>%s' % name) - rval.append( '</ul></html>' ) + rval = [] + rval.append("GMAPDB: %s" % dataset.metadata.db_name) + if dataset.metadata.chromosomes: + rval.append("chromosomes: %s" % dataset.metadata.chromosomes) + if dataset.metadata.chromlength and len(dataset.metadata.chromlength) == len(dataset.metadata.chromosomes): + rval.append( 'chrom\tlength' ) + for i,name in enumerate(dataset.metadata.chromosomes): + rval.append( '%s\t%d' % (dataset.metadata.chromosomes[i],dataset.metadata.chromlength[i])) + if dataset.metadata.circular: + rval.append("circular: %s" % dataset.metadata.circular) + if dataset.metadata.kmers: + rval.append("kmers: %s" % dataset.metadata.kmers) + rval.append("cmetindex: %s atoiindex: %s" % (dataset.metadata.cmet,dataset.metadata.atoi)) + if dataset.metadata.maps and len(dataset.metadata.maps) > 0: + rval.append( 'Maps:') + for i,name in enumerate(dataset.metadata.maps): + if name.strip() != '': + rval.append( ' %s' % name) f = file(dataset.file_name,'w') f.write("\n".join( rval )) f.write('\n') @@ -53,7 +70,7 @@ def set_peek( self, dataset, is_multi_byte=False ): log.info( "GmapDB set_peek %s" % (dataset)) if not dataset.dataset.purged: - dataset.peek = "GMAPDB index %s\n cmet %s\n atoi %s\n maps %s" % ( dataset.metadata.db_name,dataset.metadata.cmet,dataset.metadata.atoi,dataset.metadata.maps ) + dataset.peek = "GMAPDB index %s\n chroms %s\n kmers %s cmet %s atoi %s\n maps %s" % ( dataset.metadata.db_name,dataset.metadata.chromosomes,dataset.metadata.kmers,dataset.metadata.cmet,dataset.metadata.atoi,dataset.metadata.maps ) dataset.blurb = "GMAPDB %s" % ( dataset.metadata.db_name ) else: dataset.peek = 'file does not exist' @@ -68,6 +85,26 @@ return False def set_meta( self, dataset, overwrite = True, **kwd ): """ + extra_files_path/<db_name>/GRCh37_19 + extra_files_path/<db_name>/GRCh37_19/GRCh37_19.a2iag12123offsetscomp + extra_files_path/<db_name>/GRCh37_19/GRCh37_19.a2iag123positions + extra_files_path/<db_name>/GRCh37_19/GRCh37_19.a2itc12123offsetscomp + extra_files_path/<db_name>/GRCh37_19/GRCh37_19.a2itc123positions + extra_files_path/<db_name>/GRCh37_19/GRCh37_19.chromosome + extra_files_path/<db_name>/GRCh37_19/GRCh37_19.chromosome.iit + extra_files_path/<db_name>/GRCh37_19/GRCh37_19.chrsubset + extra_files_path/<db_name>/GRCh37_19/GRCh37_19.contig + extra_files_path/<db_name>/GRCh37_19/GRCh37_19.contig.iit + extra_files_path/<db_name>/GRCh37_19/GRCh37_19.genomecomp + extra_files_path/<db_name>/GRCh37_19/GRCh37_19.maps + extra_files_path/<db_name>/GRCh37_19/GRCh37_19.metct12123offsetscomp + extra_files_path/<db_name>/GRCh37_19/GRCh37_19.metct123positions + extra_files_path/<db_name>/GRCh37_19/GRCh37_19.metga12123offsetscomp + extra_files_path/<db_name>/GRCh37_19/GRCh37_19.metga123positions + extra_files_path/<db_name>/GRCh37_19/GRCh37_19.ref12123offsetscomp + extra_files_path/<db_name>/GRCh37_19/GRCh37_19.ref123positions + extra_files_path/<db_name>/GRCh37_19/GRCh37_19.version + Expecting: extra_files_path/<db_name>/db_name>.ref<basesize><kmer>3<index> extra_files_path/db_name/db_name.ref1[2345]1[2345]3offsetscomp @@ -77,7 +114,9 @@ extra_files_path/db_name/db_name.maps/*.iit """ log.info( "GmapDB set_meta %s %s" % (dataset,dataset.extra_files_path)) - pat = '(.*)\.((ref)|(met)[atgc][atgc]|(a2i)[atgc][atgc])((\d\d)(\d\d))?3positions(\.(.+))?' + chrom_pat = '^(.+).chromosome$' + #pat = '(.*)\.((ref)|(met)[atgc][atgc]|(a2i)[atgc][atgc])((\d\d)(\d\d))?3positions(\.(.+))?' + pat = '(.*)\.((ref)|(met)[atgc][atgc]|(a2i)[atgc][atgc])((\d\d)(\d\d))?(\d)(offsetscomp)' efp = dataset.extra_files_path flist = os.listdir(efp) for i,fname in enumerate(flist): @@ -85,24 +124,60 @@ fpath = os.path.join(efp,fname) if os.path.isdir(fpath): ilist = os.listdir(fpath) - kmers = {'':'default'} # HACK '' empty key added so user has default choice when selecting kmer from metadata + # kmers = {'':'default'} # HACK '' empty key added so user has default choice when selecting kmer from metadata + kmers = dict() for j,iname in enumerate(ilist): log.info( "GmapDB set_meta file %s %s" % (j,iname)) ipath = os.path.join(fpath,iname) + print >> sys.stderr, "GmapDB set_meta file %s %s %s" % (j,iname,ipath) if os.path.isdir(ipath): # find maps dataset.metadata.map_dir = iname + maps = [] + snps = [] for mapfile in os.listdir(ipath): mapname = mapfile.replace('.iit','') log.info( "GmapDB set_meta map %s %s" % (mapname,mapfile)) - dataset.metadata.maps.append(mapname) + print >> sys.stderr, "GmapDB set_meta map %s %s " % (mapname,mapfile) + maps.append(mapname) + if mapname.find('snp') >= 0: + snps.append(mapname) + if len(maps) > 0: + dataset.metadata.maps = maps + if len(snps) > 0: + dataset.metadata.snps = snps else: + m = re.match(chrom_pat,iname) + if m and len(m.groups()) == 1: + dataset.metadata.db_name = m.groups()[0] + print >> sys.stderr, "GmapDB set_meta file %s %s %s" % (j,iname,ipath) + try: + fh = open(ipath) + dataset.metadata.chromosomes = [] + dataset.metadata.circular = [] + dataset.metadata.chromlength = [] + for k,line in enumerate(fh): + fields = line.strip().split('\t') + print >> sys.stderr, "GmapDB set_meta chrom %s fields %s" % (line,fields) + if len(fields) > 2: + dataset.metadata.chromosomes.append(str(fields[0])) + dataset.metadata.chromlength.append(int(fields[2])) + if len(fields) > 3 and fields[3] == 'circular': + dataset.metadata.circular.append(str(fields[0])) + print >> sys.stderr, "GmapDB set_meta db_name %s chromosomes %s circular %s" % (dataset.metadata.db_name,dataset.metadata.chromosomes,dataset.metadata.circular) + except Exception, e: + log.info( "GmapDB set_meta error %s %s " % (iname, e)) + print >> sys.stderr, "GmapDB set_meta file %s Error %s" % (ipath,e) + finally: + if fh: + fh.close() + continue m = re.match(pat,iname) if m: log.info( "GmapDB set_meta m %s %s " % (iname, m)) + print >> sys.stderr, "GmapDB set_meta iname %s %s" % (iname,m.groups()) assert len(m.groups()) == 10 - dataset.metadata.db_name = fname if m.groups()[2] == 'ref': - if m.groups()[-1] != None: + if m.groups()[-1] != None and m.groups()[-1] != 'offsetscomp': dataset.metadata.snps.append(m.groups()[-1]) else: if m.groups()[-3] != None: @@ -115,6 +190,7 @@ elif m.groups()[4] == 'a2i': dataset.metadata.atoi = True dataset.metadata.kmers = kmers.keys() + self.regenerate_primary_file(dataset) class GmapSnpIndex( Text ): """
--- a/snpindex.xml Fri Oct 05 13:08:43 2012 -0500 +++ b/snpindex.xml Wed Sep 28 10:43:44 2016 -0400 @@ -1,7 +1,7 @@ -<tool id="gmap_snpindex" name="GMAP SNP Index" version="2.0.0"> +<tool id="gmap_snpindex" name="GMAP SNP Index" version="3.0.0"> <description>build index files for known SNPs</description> <requirements> - <requirement type="package" version="2011-11-30">gmap</requirement> + <requirement type="package" version="2013-05-09">gmap</requirement> </requirements> <version_string>snpindex --version</version_string> <command interpreter="command"> /bin/bash $shscript 2>1 1> $output </command>
--- a/tool_dependencies.xml Fri Oct 05 13:08:43 2012 -0500 +++ b/tool_dependencies.xml Wed Sep 28 10:43:44 2016 -0400 @@ -1,21 +1,6 @@ <?xml version="1.0"?> <tool_dependency> - <package name="gmap" version="2011-11-30"> - <install version="1.0"> - <actions> - <action type="shell_command">wget http://research-pub.gene.com/gmap/src/gmap-gsnap-2011-11-30.tar.gz</action> - <action type="shell_command"> ./configure --prefix=bin --with-gmapdb=../gmapdb</action> - <action type="shell_command">make</action> - <action type="move_directory_files"> - <source_directory>bin</source_directory> - <destination_directory>$INSTALL_DIR/bin</destination_directory> - </action> - <action type="set_environment"> - <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/bin</environment_variable> - </action> - </actions> - </install> - <readme> - </readme> + <package name="gmap" version="2013-05-09"> + <repository changeset_revision="8bacc25e2023" name="package_gmap_2013_05_09" owner="jjohnson" toolshed="https://testtoolshed.g2.bx.psu.edu" /> </package> </tool_dependency>