# HG changeset patch # User peterjc # Date 1475074048 14400 # Node ID 488e9d642566aa1d7477d4406d2f9740f3575eb5 # Parent f6ba0f12cca2d93b20f39a6fa060556363b3a037 GMAP wrappers v3.0.1 after linting and cleanup, still untested work-in-progress diff -r f6ba0f12cca2 -r 488e9d642566 gmap.xml --- a/gmap.xml Wed Sep 28 10:43:44 2016 -0400 +++ b/gmap.xml Wed Sep 28 10:47:28 2016 -0400 @@ -1,9 +1,9 @@ - + Genomic Mapping and Alignment Program for mRNA and EST sequences gmap - gmap --version + gmap --version #import os,os.path gmap @@ -41,7 +41,7 @@ --protein_gen #elif $result.format == "sam": --format=$result.sam_paired_read - $result.no_sam_headers + $result.no_sam_headers $result.sam_use_0M $result.force_xs_dir $result.md_lowercase_snp @@ -127,7 +127,7 @@ ${i.added_input} #end for #if $split_output == True - 2> $gmap_stderr + 2> $gmap_stderr #else 2> $gmap_stderr > $output #end if @@ -194,7 +194,7 @@ - @@ -208,12 +208,12 @@ - - + @@ -223,56 +223,56 @@ - + - + - + - + - + - + - + - - + + - + - - + + - + - @@ -293,25 +293,25 @@ - + - + + label="Maximum number of paths to show. Ignored if negative. If 0, prints two paths if chimera detected, else one." > + help="By default the program prints all paths found." > - + - @@ -383,9 +383,9 @@ - - @@ -393,7 +393,7 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #!/bin/bash @@ -145,8 +34,8 @@ #else: gmap_build -D $gmapdb -d $refname -s $sort $circular #for i in $inputs# ${i.input}#end for# #end if -get-genome -D $gmapdb -d '?' | sed 's/^Available .*/gmap db: /' -echo "kmers: " $kmer +get-genome -D $gmapdb -d '?' | sed 's/^Available .*/gmap db: /' +echo "kmers: " $kmer #if $splicesite.splice_source == 'refGeneTable': #if $splicesite.refGenes.__str__ != 'None': cat $splicesite.refGenes | psl_splicesites -s $splicesite.col_skip | iit_store -o $os.path.join($mapsdir,'splicesites') @@ -190,15 +79,122 @@ atoiindex -d $refname echo "atoiindex" -d $refname #end if -get-genome -D $gmapdb -d $refname -m '?' | sed 's/^Available maps .*/maps: /' +get-genome -D $gmapdb -d $refname -m '?' | sed 's/^Available maps .*/maps: /' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + - **GMAP Build** GMAP Build creates an index of a genomic sequence for mapping and alignment using GMAP_ (Genomic Mapping and Alignment Program for mRNA and EST sequences) and GSNAP_ (Genomic Short-read Nucleotide Alignment Program). (GMAP Build uses GMAP commands: gmap_build, iit_store, psl_splicesites, psl_introns, gtf_splicesites, gtf_introns, gff3_splicesites, gff3_introns, dbsnp_iit, snpindex, cmetindex, and atoiindex.) @@ -225,7 +221,7 @@ **Detecting known and novel splice sites in GSNAP** -GSNAP can detect splice junctions in individual reads. +GSNAP can detect splice junctions in individual reads. GSNAP allows for known splicing at two levels: at the level of known splice sites and at the level of known introns. At the site level, GSNAP finds splicing between arbitrary combinations of donor and @@ -237,8 +233,8 @@ than known introns, unless you are certain that all alternative splicing events are known are represented in your file. -Splice site files can be generated from a GTF file -or from refGenes table from UCSC. +Splice site files can be generated from a GTF file +or from refGenes table from UCSC. **SNP-tolerant alignment in GSNAP** @@ -285,7 +281,7 @@ GSNAP has the ability to align reads from bisulfite-treated DNA, which converts unmethylated cytosines to uracils that appear as thymines in -reads. GSNAP is able to identify genomic-T to read-C mismatches, +reads. GSNAP is able to identify genomic-T to read-C mismatches, if a cmetindex is generated. **RNA-editing tolerance in GSNAP** @@ -335,7 +331,8 @@ will overwrite only the identical files from the previous runs. You can then choose the k-mer size at run-time by using the -k flag for either GMAP or GSNAP. - + + 10.1093/bioinformatics/bti310 + - diff -r f6ba0f12cca2 -r 488e9d642566 gmap_v3.0.0_from_JJ.tar.gz Binary file gmap_v3.0.0_from_JJ.tar.gz has changed diff -r f6ba0f12cca2 -r 488e9d642566 gsnap.xml --- a/gsnap.xml Wed Sep 28 10:43:44 2016 -0400 +++ b/gsnap.xml Wed Sep 28 10:47:28 2016 -0400 @@ -1,9 +1,9 @@ - + Genomic Short-read Nucleotide Alignment Program gmap - gsnap --version + gsnap --version #import os.path, re gsnap @@ -140,7 +140,7 @@ --npath=$output.npath #end if #if $output.maxsearch.__str__ != '': - --maxsearch=$output.maxsearch + --maxsearch=$output.maxsearch #end if $output.quiet_if_excessive $output.show_refdiff @@ -266,15 +266,15 @@ - - - - - + @@ -350,8 +350,8 @@ - @@ -359,7 +359,7 @@ - @@ -384,7 +384,7 @@ - @@ -403,7 +403,7 @@ - @@ -420,8 +420,8 @@ - @@ -429,10 +429,10 @@ - @@ -442,7 +442,7 @@ @@ -455,7 +455,7 @@ - @@ -478,7 +478,7 @@ - - - @@ -519,17 +519,17 @@ - - - - - + - - - - - - @@ -574,7 +574,7 @@ - + @@ -587,8 +587,8 @@ - @@ -602,11 +602,11 @@ - - - @@ -640,8 +640,8 @@ - + @@ -655,8 +655,8 @@ - + @@ -671,8 +671,8 @@ - + @@ -808,18 +808,18 @@ - + **What it does** -GSNAP_ (Genomic Short-read Nucleotide Alignment Program) is a short read aligner which can align both single- and paired-end reads as short as 14nt and of arbitrarily long length. It can detect short- and long-distance splicing, including interchromosomal splicing, in individual reads, using probabilistic models or a database of known splice sites. Our program also permits SNP-tolerant alignment to a reference space of all possible combinations of major and minor alleles, and can align reads from bisulfite-treated DNA for the study of methylation state. It is developed by Thomas D. Wu of Genentech, Inc. +GSNAP_ (Genomic Short-read Nucleotide Alignment Program) is a short read aligner which can align both single- and paired-end reads as short as 14nt and of arbitrarily long length. It can detect short- and long-distance splicing, including interchromosomal splicing, in individual reads, using probabilistic models or a database of known splice sites. Our program also permits SNP-tolerant alignment to a reference space of all possible combinations of major and minor alleles, and can align reads from bisulfite-treated DNA for the study of methylation state. It is developed by Thomas D. Wu of Genentech, Inc. Publication_ citation: Thomas D. Wu, Serban Nacu "Fast and SNP-tolerant detection of complex variants and splicing in short reads. Bioinformatics. 2010 Apr 1;26(7):873-81. Epub 2010 Feb 10. .. _GSNAP: http://research-pub.gene.com/gmap/ .. _Publication: http://bioinformatics.oupjournals.org/cgi/content/full/26/7/873 -http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2844994/?tool=pubmed +https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2844994/?tool=pubmed ------ @@ -835,10 +835,10 @@ **Input formats** -Input to GSNAP should be either in FASTQ or FASTA format. +Input to GSNAP should be either in FASTQ or FASTA format. The FASTQ input may include quality scores, which will then be included in SAM -output, if that output format is selected. +output, if that output format is selected. For FASTA format, you should include one line per read (or end of a paired-end read). The same FASTA file can have a mixture of @@ -880,10 +880,9 @@ Default GSNAP format See the README_ - - - - + + 10.1093/bioinformatics/btq057 + diff -r f6ba0f12cca2 -r 488e9d642566 iit_store.xml --- a/iit_store.xml Wed Sep 28 10:43:44 2016 -0400 +++ b/iit_store.xml Wed Sep 28 10:47:28 2016 -0400 @@ -1,109 +1,10 @@ - + Create a map store for known genes or SNPs gmap - iit_store --version + iit_store --version /bin/bash $shscript 2> $log - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - (map['type'] == 'genes' and 'splicesites' in map['maps']) - - - (map['type'] == 'genes' and 'introns' in map['maps']) - - - (map['type'] == 'snps') - - - (map['type'] == 'gmap') - - #!/bin/bash @@ -146,23 +47,121 @@ #if $map.src.snpsex.__str__ != 'None': $catcmd $map.src.snps | dbsnp_iit -w $map.src.weight -e $map.src.snpsex | iit_store -o $snps_iit #else: - $catcmd $map.src.snps | dbsnp_iit -w $map.src.weight | iit_store -o $snps_iit + $catcmd $map.src.snps | dbsnp_iit -w $map.src.weight | iit_store -o $snps_iit #end if #else: - $catcmd $map.src.snps | iit_store -o $map_iit + $catcmd $map.src.snps | iit_store -o $map_iit #end if - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (map['type'] == 'genes' and 'splicesites' in map['maps']) + + + (map['type'] == 'genes' and 'introns' in map['maps']) + + + (map['type'] == 'snps') + + + (map['type'] == 'gmap') + + - + **iit_store** -GMAP IIT creates an Interval Index Tree map of known splice sites, introns, or SNPs (it uses iit_store described in the GMAP documentation). The maps can be used in GMAP_ (Genomic Mapping and Alignment Program for mRNA and EST sequences) and GSNAP_ (Genomic Short-read Nucleotide Alignment Program). Maps are typically used for known splice sites, introns, or SNPs. +GMAP IIT creates an Interval Index Tree map of known splice sites, introns, or SNPs (it uses iit_store described in the GMAP documentation). The maps can be used in GMAP_ (Genomic Mapping and Alignment Program for mRNA and EST sequences) and GSNAP_ (Genomic Short-read Nucleotide Alignment Program). Maps are typically used for known splice sites, introns, or SNPs. You will want to read the README_ @@ -177,5 +176,8 @@ **inputs** + + 10.1093/bioinformatics/bti310 + diff -r f6ba0f12cca2 -r 488e9d642566 lib/galaxy/datatypes/gmap.py --- a/lib/galaxy/datatypes/gmap.py Wed Sep 28 10:43:44 2016 -0400 +++ b/lib/galaxy/datatypes/gmap.py Wed Sep 28 10:47:28 2016 -0400 @@ -2,88 +2,109 @@ GMAP indexes """ import logging -import os,os.path,re,sys -import galaxy.datatypes.data +import os +import os.path +import re +import sys +from galaxy.datatypes import data from galaxy.datatypes.data import Text from galaxy import util from galaxy.datatypes.metadata import MetadataElement log = logging.getLogger(__name__) -class GmapDB( Text ): + +class GmapDB(Text): + """ A GMAP DB for indexes """ - MetadataElement( name="db_name", desc="The db name for this index set", default='unknown', set_in_upload=True, readonly=True ) - MetadataElement( name="chromosomes", desc="The chromosomes or contigs", no_value=[], readonly=False ) - MetadataElement( name="circular", desc="cirular chromosomes", no_value=[], readonly=False ) - MetadataElement( name="chromlength", desc="Chromosome lengths", no_value=[], readonly=False ) - MetadataElement( name="basesize", default="12", desc="The basesize for offsetscomp", visible=True, readonly=True ) - MetadataElement( name="kmers", desc="The kmer sizes for indexes", visible=True, no_value=[''], readonly=True ) - MetadataElement( name="map_dir", desc="The maps directory", default='unknown', set_in_upload=True, readonly=True ) - MetadataElement( name="maps", desc="The names of maps stored for this gmap gmapdb", visible=True, no_value=[''], readonly=True ) - MetadataElement( name="snps", desc="The names of SNP indexes stored for this gmapdb", visible=True, no_value=[''], readonly=True ) - MetadataElement( name="cmet", default=False, desc="Has a cmet index", visible=True, readonly=True ) - MetadataElement( name="atoi", default=False, desc="Has a atoi index", visible=True, readonly=True ) - + MetadataElement(name="db_name", desc="The db name for this index set", + default='unknown', set_in_upload=True, readonly=True) + MetadataElement( + name="chromosomes", desc="The chromosomes or contigs", no_value=[], readonly=False) + MetadataElement( + name="circular", desc="cirular chromosomes", no_value=[], readonly=False) + MetadataElement( + name="chromlength", desc="Chromosome lengths", no_value=[], readonly=False) + MetadataElement(name="basesize", default="12", + desc="The basesize for offsetscomp", visible=True, readonly=True) + MetadataElement(name="kmers", desc="The kmer sizes for indexes", + visible=True, no_value=[''], readonly=True) + MetadataElement(name="map_dir", desc="The maps directory", + default='unknown', set_in_upload=True, readonly=True) + MetadataElement(name="maps", desc="The names of maps stored for this gmap gmapdb", + visible=True, no_value=[''], readonly=True) + MetadataElement(name="snps", desc="The names of SNP indexes stored for this gmapdb", + visible=True, no_value=[''], readonly=True) + MetadataElement(name="cmet", default=False, + desc="Has a cmet index", visible=True, readonly=True) + MetadataElement(name="atoi", default=False, + desc="Has a atoi index", visible=True, readonly=True) + file_ext = 'gmapdb' is_binary = True composite_type = 'auto_primary_file' allow_datatype_change = False - def generate_primary_file( self, dataset = None ): - """ + def generate_primary_file(self, dataset=None): + """ This is called only at upload to write the html file cannot rename the datasets here - they come with the default unfortunately """ return 'AutoGenerated Primary File for Composite Dataset' - - def regenerate_primary_file(self,dataset): + + def regenerate_primary_file(self, dataset): """ - cannot do this until we are setting metadata + cannot do this until we are setting metadata """ bn = dataset.metadata.db_name - log.info( "GmapDB regenerate_primary_file %s" % (bn)) + log.info("GmapDB regenerate_primary_file %s" % (bn)) rval = [] rval.append("GMAPDB: %s" % dataset.metadata.db_name) if dataset.metadata.chromosomes: rval.append("chromosomes: %s" % dataset.metadata.chromosomes) if dataset.metadata.chromlength and len(dataset.metadata.chromlength) == len(dataset.metadata.chromosomes): - rval.append( 'chrom\tlength' ) - for i,name in enumerate(dataset.metadata.chromosomes): - rval.append( '%s\t%d' % (dataset.metadata.chromosomes[i],dataset.metadata.chromlength[i])) + rval.append('chrom\tlength') + for i, name in enumerate(dataset.metadata.chromosomes): + rval.append( + '%s\t%d' % (dataset.metadata.chromosomes[i], dataset.metadata.chromlength[i])) if dataset.metadata.circular: rval.append("circular: %s" % dataset.metadata.circular) if dataset.metadata.kmers: rval.append("kmers: %s" % dataset.metadata.kmers) - rval.append("cmetindex: %s atoiindex: %s" % (dataset.metadata.cmet,dataset.metadata.atoi)) + rval.append("cmetindex: %s atoiindex: %s" % + (dataset.metadata.cmet, dataset.metadata.atoi)) if dataset.metadata.maps and len(dataset.metadata.maps) > 0: - rval.append( 'Maps:') - for i,name in enumerate(dataset.metadata.maps): + rval.append('Maps:') + for i, name in enumerate(dataset.metadata.maps): if name.strip() != '': - rval.append( ' %s' % name) - f = file(dataset.file_name,'w') - f.write("\n".join( rval )) + rval.append(' %s' % name) + f = open(dataset.file_name, 'w') + f.write("\n".join(rval)) f.write('\n') f.close() - def set_peek( self, dataset, is_multi_byte=False ): - log.info( "GmapDB set_peek %s" % (dataset)) + def set_peek(self, dataset, is_multi_byte=False): + log.info("GmapDB set_peek %s" % (dataset)) if not dataset.dataset.purged: - dataset.peek = "GMAPDB index %s\n chroms %s\n kmers %s cmet %s atoi %s\n maps %s" % ( dataset.metadata.db_name,dataset.metadata.chromosomes,dataset.metadata.kmers,dataset.metadata.cmet,dataset.metadata.atoi,dataset.metadata.maps ) - dataset.blurb = "GMAPDB %s" % ( dataset.metadata.db_name ) + dataset.peek = "GMAPDB index %s\n chroms %s\n kmers %s cmet %s atoi %s\n maps %s" % ( + dataset.metadata.db_name, dataset.metadata.chromosomes, dataset.metadata.kmers, dataset.metadata.cmet, dataset.metadata.atoi, dataset.metadata.maps) + dataset.blurb = "GMAPDB %s" % (dataset.metadata.db_name) else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' - def display_peek( self, dataset ): + + def display_peek(self, dataset): try: return dataset.peek except: return "GMAP index file" - def sniff( self, filename ): + def sniff(self, filename): return False - def set_meta( self, dataset, overwrite = True, **kwd ): + + def set_meta(self, dataset, overwrite=True, **kwd): """ extra_files_path//GRCh37_19 extra_files_path//GRCh37_19/GRCh37_19.a2iag12123offsetscomp @@ -110,81 +131,96 @@ extra_files_path/db_name/db_name.ref1[2345]1[2345]3offsetscomp extra_files_path/db_name/db_name.ref1[2345]1[2345]3positions extra_files_path/db_name/db_name.ref1[2345]1[2345]3gammaptrs - index maps: + index maps: extra_files_path/db_name/db_name.maps/*.iit """ - log.info( "GmapDB set_meta %s %s" % (dataset,dataset.extra_files_path)) + log.info("GmapDB set_meta %s %s" % (dataset, dataset.extra_files_path)) chrom_pat = '^(.+).chromosome$' - #pat = '(.*)\.((ref)|(met)[atgc][atgc]|(a2i)[atgc][atgc])((\d\d)(\d\d))?3positions(\.(.+))?' + # pat = '(.*)\.((ref)|(met)[atgc][atgc]|(a2i)[atgc][atgc])((\d\d)(\d\d))?3positions(\.(.+))?' pat = '(.*)\.((ref)|(met)[atgc][atgc]|(a2i)[atgc][atgc])((\d\d)(\d\d))?(\d)(offsetscomp)' efp = dataset.extra_files_path flist = os.listdir(efp) - for i,fname in enumerate(flist): - log.info( "GmapDB set_meta %s %s" % (i,fname)) - fpath = os.path.join(efp,fname) + for i, fname in enumerate(flist): + log.info("GmapDB set_meta %s %s" % (i, fname)) + fpath = os.path.join(efp, fname) if os.path.isdir(fpath): ilist = os.listdir(fpath) - # kmers = {'':'default'} # HACK '' empty key added so user has default choice when selecting kmer from metadata + # kmers = {'':'default'} # HACK '' empty key added so user + # has default choice when selecting kmer from metadata kmers = dict() - for j,iname in enumerate(ilist): - log.info( "GmapDB set_meta file %s %s" % (j,iname)) - ipath = os.path.join(fpath,iname) - print >> sys.stderr, "GmapDB set_meta file %s %s %s" % (j,iname,ipath) + for j, iname in enumerate(ilist): + log.info("GmapDB set_meta file %s %s" % (j, iname)) + ipath = os.path.join(fpath, iname) + print >> sys.stderr, "GmapDB set_meta file %s %s %s" % ( + j, iname, ipath) if os.path.isdir(ipath): # find maps dataset.metadata.map_dir = iname maps = [] snps = [] for mapfile in os.listdir(ipath): - mapname = mapfile.replace('.iit','') - log.info( "GmapDB set_meta map %s %s" % (mapname,mapfile)) - print >> sys.stderr, "GmapDB set_meta map %s %s " % (mapname,mapfile) + mapname = mapfile.replace('.iit', '') + log.info("GmapDB set_meta map %s %s" % + (mapname, mapfile)) + print >> sys.stderr, "GmapDB set_meta map %s %s " % ( + mapname, mapfile) maps.append(mapname) - if mapname.find('snp') >= 0: + if mapname.find('snp') >= 0: snps.append(mapname) if len(maps) > 0: dataset.metadata.maps = maps if len(snps) > 0: dataset.metadata.snps = snps - else: - m = re.match(chrom_pat,iname) + else: + m = re.match(chrom_pat, iname) if m and len(m.groups()) == 1: dataset.metadata.db_name = m.groups()[0] - print >> sys.stderr, "GmapDB set_meta file %s %s %s" % (j,iname,ipath) + print >> sys.stderr, "GmapDB set_meta file %s %s %s" % ( + j, iname, ipath) try: fh = open(ipath) dataset.metadata.chromosomes = [] dataset.metadata.circular = [] dataset.metadata.chromlength = [] - for k,line in enumerate(fh): - fields = line.strip().split('\t') - print >> sys.stderr, "GmapDB set_meta chrom %s fields %s" % (line,fields) - if len(fields) > 2: - dataset.metadata.chromosomes.append(str(fields[0])) - dataset.metadata.chromlength.append(int(fields[2])) - if len(fields) > 3 and fields[3] == 'circular': - dataset.metadata.circular.append(str(fields[0])) - print >> sys.stderr, "GmapDB set_meta db_name %s chromosomes %s circular %s" % (dataset.metadata.db_name,dataset.metadata.chromosomes,dataset.metadata.circular) - except Exception, e: - log.info( "GmapDB set_meta error %s %s " % (iname, e)) - print >> sys.stderr, "GmapDB set_meta file %s Error %s" % (ipath,e) + for k, line in enumerate(fh): + fields = line.strip().split('\t') + print >> sys.stderr, "GmapDB set_meta chrom %s fields %s" % ( + line, fields) + if len(fields) > 2: + dataset.metadata.chromosomes.append( + str(fields[0])) + dataset.metadata.chromlength.append( + int(fields[2])) + if len(fields) > 3 and fields[3] == 'circular': + dataset.metadata.circular.append( + str(fields[0])) + print >> sys.stderr, "GmapDB set_meta db_name %s chromosomes %s circular %s" % ( + dataset.metadata.db_name, dataset.metadata.chromosomes, dataset.metadata.circular) + except Exception as e: + log.info( + "GmapDB set_meta error %s %s " % (iname, e)) + print >> sys.stderr, "GmapDB set_meta file %s Error %s" % ( + ipath, e) finally: if fh: fh.close() continue - m = re.match(pat,iname) + m = re.match(pat, iname) if m: - log.info( "GmapDB set_meta m %s %s " % (iname, m)) - print >> sys.stderr, "GmapDB set_meta iname %s %s" % (iname,m.groups()) + log.info("GmapDB set_meta m %s %s " % (iname, m)) + print >> sys.stderr, "GmapDB set_meta iname %s %s" % ( + iname, m.groups()) assert len(m.groups()) == 10 if m.groups()[2] == 'ref': - if m.groups()[-1] != None and m.groups()[-1] != 'offsetscomp': - dataset.metadata.snps.append(m.groups()[-1]) + if m.groups()[-1] is not None and m.groups()[-1] != 'offsetscomp': + dataset.metadata.snps.append( + m.groups()[-1]) else: - if m.groups()[-3] != None: + if m.groups()[-3] is not None: k = int(m.groups()[-3]) kmers[k] = k - if m.groups()[-4] != None: - dataset.metadata.basesize = int( m.groups()[-4]) + if m.groups()[-4] is not None: + dataset.metadata.basesize = int( + m.groups()[-4]) elif m.groups()[3] == 'met': dataset.metadata.cmet = True elif m.groups()[4] == 'a2i': @@ -192,56 +228,66 @@ dataset.metadata.kmers = kmers.keys() self.regenerate_primary_file(dataset) -class GmapSnpIndex( Text ): + +class GmapSnpIndex(Text): + """ A GMAP SNP index created by snpindex """ - MetadataElement( name="db_name", desc="The db name for this index set", default='unknown', set_in_upload=True, readonly=True ) - MetadataElement( name="snps_name", default='snps', desc="The name of SNP index", visible=True, no_value='', readonly=True ) - + MetadataElement(name="db_name", desc="The db name for this index set", + default='unknown', set_in_upload=True, readonly=True) + MetadataElement(name="snps_name", default='snps', + desc="The name of SNP index", visible=True, no_value='', readonly=True) + file_ext = 'gmapsnpindex' is_binary = True composite_type = 'auto_primary_file' allow_datatype_change = False - def generate_primary_file( self, dataset = None ): - """ + def generate_primary_file(self, dataset=None): + """ This is called only at upload to write the html file cannot rename the datasets here - they come with the default unfortunately """ return 'AutoGenerated Primary File for Composite Dataset' - - def regenerate_primary_file(self,dataset): + + def regenerate_primary_file(self, dataset): """ - cannot do this until we are setting metadata + cannot do this until we are setting metadata """ bn = dataset.metadata.db_name - log.info( "GmapDB regenerate_primary_file %s" % (bn)) - rval = ['GMAPDB %s

GMAPDB %s

cmet %s
atoi %s

Maps:

    ' % (bn,bn,dataset.metadata.cmet,dataset.metadata.atoi)] - for i,name in enumerate(dataset.metadata.maps): - rval.append( '
  • %s' % name) - rval.append( '
' ) - f = file(dataset.file_name,'w') - f.write("\n".join( rval )) + log.info("GmapDB regenerate_primary_file %s" % (bn)) + rval = ['GMAPDB %s

GMAPDB %s

cmet %s
atoi %s

Maps:

    ' % + (bn, bn, dataset.metadata.cmet, dataset.metadata.atoi)] + for i, name in enumerate(dataset.metadata.maps): + rval.append('
  • %s' % name) + rval.append('
') + f = open(dataset.file_name, 'w') + f.write("\n".join(rval)) f.write('\n') f.close() - def set_peek( self, dataset, is_multi_byte=False ): - log.info( "GmapSnpIndex set_peek %s" % (dataset)) + + def set_peek(self, dataset, is_multi_byte=False): + log.info("GmapSnpIndex set_peek %s" % (dataset)) if not dataset.dataset.purged: - dataset.peek = "GMAP SNPindex %s on %s\n" % ( dataset.metadata.snps_name,dataset.metadata.db_name) - dataset.blurb = "GMAP SNPindex %s on %s\n" % ( dataset.metadata.snps_name,dataset.metadata.db_name) + dataset.peek = "GMAP SNPindex %s on %s\n" % ( + dataset.metadata.snps_name, dataset.metadata.db_name) + dataset.blurb = "GMAP SNPindex %s on %s\n" % ( + dataset.metadata.snps_name, dataset.metadata.db_name) else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' - def display_peek( self, dataset ): + + def display_peek(self, dataset): try: return dataset.peek except: return "GMAP SNP index" - def sniff( self, filename ): + def sniff(self, filename): return False - def set_meta( self, dataset, overwrite = True, **kwd ): + + def set_meta(self, dataset, overwrite=True, **kwd): """ Expecting: extra_files_path/snp_name.iit @@ -249,21 +295,21 @@ extra_files_path/db_name/db_name.ref1[2345]1[2345]3positions.snp_name extra_files_path/db_name/db_name.ref1[2345]1[2345]3gammaptrs.snp_name """ - log.info( "GmapSnpIndex set_meta %s %s" % (dataset,dataset.extra_files_path)) + log.info("GmapSnpIndex set_meta %s %s" % + (dataset, dataset.extra_files_path)) pat = '(.*)\.(ref((\d\d)(\d\d))?3positions)\.(.+)?' efp = dataset.extra_files_path flist = os.listdir(efp) - for i,fname in enumerate(flist): - m = re.match(pat,fname) + for i, fname in enumerate(flist): + m = re.match(pat, fname) if m: assert len(m.groups()) == 6 dataset.metadata.db_name = m.groups()[0] dataset.metadata.snps_name = m.groups()[-1] - +class IntervalIndexTree(Text): -class IntervalIndexTree( Text ): """ A GMAP Interval Index Tree Map created by iit_store @@ -272,35 +318,45 @@ file_ext = 'iit' is_binary = True -class SpliceSitesIntervalIndexTree( IntervalIndexTree ): + +class SpliceSitesIntervalIndexTree(IntervalIndexTree): + """ - A GMAP Interval Index Tree Map + A GMAP Interval Index Tree Map created by iit_store """ file_ext = 'splicesites.iit' -class IntronsIntervalIndexTree( IntervalIndexTree ): + +class IntronsIntervalIndexTree(IntervalIndexTree): + """ A GMAP Interval Index Tree Map created by iit_store """ file_ext = 'introns.iit' -class SNPsIntervalIndexTree( IntervalIndexTree ): + +class SNPsIntervalIndexTree(IntervalIndexTree): + """ A GMAP Interval Index Tree Map created by iit_store """ file_ext = 'snps.iit' -class TallyIntervalIndexTree( IntervalIndexTree ): + +class TallyIntervalIndexTree(IntervalIndexTree): + """ A GMAP Interval Index Tree Map created by iit_store """ file_ext = 'tally.iit' -class IntervalAnnotation( Text ): + +class IntervalAnnotation(Text): + """ Class describing a GMAP Interval format: >label coords optional_tag @@ -311,38 +367,42 @@ """ file_ext = 'gmap_annotation' """Add metadata elements""" - MetadataElement( name="annotations", default=0, desc="Number of interval annotations", readonly=True, optional=True, visible=False, no_value=0 ) + MetadataElement(name="annotations", default=0, desc="Number of interval annotations", + readonly=True, optional=True, visible=False, no_value=0) - def set_meta( self, dataset, **kwd ): + def set_meta(self, dataset, **kwd): """ Set the number of annotations and the number of data lines in dataset. """ data_lines = 0 annotations = 0 - for line in file( dataset.file_name ): + for line in open(dataset.file_name): line = line.strip() - if line and line.startswith( '>' ): + if line and line.startswith('>'): annotations += 1 - data_lines +=1 + data_lines += 1 else: data_lines += 1 dataset.metadata.data_lines = data_lines dataset.metadata.annotations = annotations - def set_peek( self, dataset, is_multi_byte=False ): + + def set_peek(self, dataset, is_multi_byte=False): if not dataset.dataset.purged: - dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte ) + dataset.peek = data.get_file_peek( + dataset.file_name, is_multi_byte=is_multi_byte) if dataset.metadata.annotations: - dataset.blurb = "%s annotations" % util.commaify( str( dataset.metadata.annotations ) ) + dataset.blurb = "%s annotations" % util.commaify( + str(dataset.metadata.annotations)) else: - dataset.blurb = data.nice_size( dataset.get_size() ) + dataset.blurb = util.nice_size(dataset.get_size()) else: dataset.peek = 'file does not exist' dataset.blurb = 'file purged from disk' - def sniff( self, filename ): + def sniff(self, filename): """ Determines whether the file is a gmap annotation file - Format: + Format: >label coords optional_tag optional_annotation (which may be zero, one, or multiple lines) For example, the label may be an EST accession, with the coords @@ -356,23 +416,25 @@ reverse direction, then may be less than . """ try: - pat = '>(\S+)\s((\S+):(\d+)(\.\.(\d+))?(\s.(.+))?$' #>label chr:position[..endposition][ optional_tag] - fh = open( filename ) + # >label chr:position[..endposition][ optional_tag] + pat = '>(\S+)\s((\S+):(\d+)(\.\.(\d+))?(\s.(.+))?$' + fh = open(filename) count = 0 while True and count < 10: line = fh.readline() if not line: - break #EOF + break # EOF line = line.strip() - if line: #first non-empty line - if line.startswith( '>' ): + if line: # first non-empty line + if line.startswith('>'): count += 1 - if re.match(pat,line) == None: # Failed to match + if re.match(pat, line) is None: # Failed to match return False finally: fh.close() return False + class SpliceSiteAnnotation(IntervalAnnotation): file_ext = 'gmap_splicesites' """ @@ -399,27 +461,30 @@ in the database; GSNAP will use the longest intron distance reported in searching for long introns. """ - def sniff( self, filename ): # TODO + + def sniff(self, filename): # TODO """ Determines whether the file is a gmap splice site annotation file """ try: - pat = '>(\S+\.intron\d+)\s((\S+):(\d+)\.\.(\d+))\s(donor|acceptor)(\s(\d+))?$' #>label chr:position..position donor|acceptor[ intron_dist] - fh = open( filename ) + # >label chr:position..position donor|acceptor[ intron_dist] + pat = '>(\S+\.intron\d+)\s((\S+):(\d+)\.\.(\d+))\s(donor|acceptor)(\s(\d+))?$' + fh = open(filename) count = 0 while True and count < 10: line = fh.readline() if not line: - break #EOF + break # EOF line = line.strip() - if line: #first non-empty line + if line: # first non-empty line count += 1 - if re.match(pat,line) == None: # Failed to match + if re.match(pat, line) is None: # Failed to match return False finally: fh.close() return False + class IntronAnnotation(IntervalAnnotation): file_ext = 'gmap_introns' """ @@ -432,27 +497,30 @@ surrounding the intron, with the first coordinate being from the donor exon and the second one being from the acceptor exon. """ - def sniff( self, filename ): # TODO + + def sniff(self, filename): # TODO """ Determines whether the file is a gmap Intron annotation file """ try: - pat = '>(\S+\.intron\d+)\s((\S+):(\d+)\.\.(\d+)(\s(.)+)?$' #>label chr:position - fh = open( filename ) + # >label chr:position + pat = '>(\S+\.intron\d+)\s((\S+):(\d+)\.\.(\d+)(\s(.)+)?$' + fh = open(filename) count = 0 while True and count < 10: line = fh.readline() if not line: - break #EOF + break # EOF line = line.strip() - if line: #first non-empty line + if line: # first non-empty line count += 1 - if re.match(pat,line) == None: # Failed to match + if re.match(pat, line) is None: # Failed to match return False finally: fh.close() return False + class SNPAnnotation(IntervalAnnotation): file_ext = 'gmap_snps' """ @@ -471,7 +539,7 @@ one of these two letters does not match the allele in the reference sequence, that SNP will be ignored in subsequent processing as a probable error. - + GSNAP also supports the idea of a wildcard SNP. A wildcard SNP allows all nucleotides to match at that position, not just a given reference and alternate allele. It is essentially as if an "N" were recorded at @@ -483,22 +551,24 @@ where "W" is the reference allele and "X" and "Y" are two different alternate alleles. """ - def sniff( self, filename ): + + def sniff(self, filename): """ Determines whether the file is a gmap SNP annotation file """ try: - pat = '>(\S+)\s((\S+):(\d+)\s([TACGW][TACGN])$' #>label chr:position ATCG - fh = open( filename ) + # >label chr:position ATCG + pat = '>(\S+)\s((\S+):(\d+)\s([TACGW][TACGN])$' + fh = open(filename) count = 0 while True and count < 10: line = fh.readline() if not line: - break #EOF + break # EOF line = line.strip() - if line: #first non-empty line + if line: # first non-empty line count += 1 - if re.match(pat,line) == None: # Failed to match + if re.match(pat, line) is None: # Failed to match return False finally: fh.close() @@ -517,32 +587,35 @@ C2 0.889,0.912,0.889,0.889,0.933,0.912,0.912,0.889,0.889,0.889 -2.66,-2.89,-2.66,-2.66,-3.16,-2.89,-2.89,-2.66,-2.66,-2.66 C1 T1 0.888,0.9,0.888,0.9,0.913,0.9,0.911,0.888,0.9,0.913 -2.66,-2.78,-2.66,-2.78,-2.91,-2.78,-2.89,-2.66,-2.78,-2.91 """ - def sniff( self, filename ): # TODO + + def sniff(self, filename): # TODO """ Determines whether the file is a gmap splice site annotation file """ try: - pat = '^>(\d+)\s((\S+):(\d+)\.\.(\d+))$' #>total chr:position..position - pat2 = '^[GATCN]\d.*$' #BaseCountDeatails - fh = open( filename ) + # >total chr:position..position + pat = '^>(\d+)\s((\S+):(\d+)\.\.(\d+))$' + pat2 = '^[GATCN]\d.*$' # BaseCountDeatails + fh = open(filename) count = 0 while True and count < 10: line = fh.readline() if not line: - break #EOF + break # EOF line = line.strip() - if line: #first non-empty line + if line: # first non-empty line count += 1 - if re.match(pat,line) == None and re.match(pat2,line) == None: # Failed to match + # Failed to match + if re.match(pat, line) is None and re.match(pat2, line) is None: return False finally: fh.close() return False -class GsnapResult( Text ): + +class GsnapResult(Text): + """ The default output format for gsnap. Can be used as input for gsnap_tally. """ file_ext = 'gsnap' - - diff -r f6ba0f12cca2 -r 488e9d642566 snpindex.xml --- a/snpindex.xml Wed Sep 28 10:43:44 2016 -0400 +++ b/snpindex.xml Wed Sep 28 10:47:28 2016 -0400 @@ -1,77 +1,10 @@ - + build index files for known SNPs gmap - snpindex --version + snpindex --version /bin/bash $shscript 2>1 1> $output - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #!/bin/bash @@ -84,7 +17,7 @@ #set $gmapdb = $refGenomeSource.gmapdb.extra_files_path #set $refname = $refGenomeSource.gmapdb.metadata.db_name #else: -#set $gmapdb = $os.path.dirname($refGenomeSource.gmapindex.value) +#set $gmapdb = $os.path.dirname($refGenomeSource.gmapindex.value) $refname = $os.path.basename($refGenomeSource.gmapindex.value) #end if #set $gmapsnpdir = $output.extra_files_path @@ -110,16 +43,79 @@ #end if - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + - - **GMAP SNP Index** -GMAP SNP Index (snpindex in the GMAP documentaion) creates an index for known SNPs allowing for SNP tolerant mapping and alignment when using GMAP_ (Genomic Mapping and Alignment Program for mRNA and EST sequences) and GSNAP_ (Genomic Short-read Nucleotide Alignment Program). +GMAP SNP Index (snpindex in the GMAP documentaion) creates an index for known SNPs allowing for SNP tolerant mapping and alignment when using GMAP_ (Genomic Mapping and Alignment Program for mRNA and EST sequences) and GSNAP_ (Genomic Short-read Nucleotide Alignment Program). You will want to read the README_ @@ -129,8 +125,8 @@ .. _GSNAP: http://research-pub.gene.com/gmap/ .. _README: http://research-pub.gene.com/gmap/src/README .. _Publication: http://bioinformatics.oxfordjournals.org/cgi/content/full/21/9/1859 - - + + 10.1093/bioinformatics/bti310 + - diff -r f6ba0f12cca2 -r 488e9d642566 tool-data/datatypes_conf.xml --- a/tool-data/datatypes_conf.xml Wed Sep 28 10:43:44 2016 -0400 +++ b/tool-data/datatypes_conf.xml Wed Sep 28 10:47:28 2016 -0400 @@ -6,6 +6,7 @@ +