# HG changeset patch # User dpryan79 # Date 1492540378 14400 # Node ID 6f9dd98d641a54d0befb60c6ff6e0b0fc358e51d # Parent ab5869799e9ff20960877e6e13d83aba6eb75adb Upload files diff -r ab5869799e9f -r 6f9dd98d641a README.md --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.md Tue Apr 18 14:32:58 2017 -0400 @@ -0,0 +1,40 @@ +##What it does## + +This is a Galaxy datamanager for the rna STAR gap-aware RNA aligner. It's a hack of Dan Blankenberg's BWA data manager +and works on any fasta file you have already downloaded with the all fasta data manager - start there! + +Warning - this is not well tested and there are some complexities to do with splice junction annotation in rna star +indexes - feedback welcomed. Send code. + +Note, currently you'll need a small patch to prevent an error when you try to generate splice junction indexes described at +https://bitbucket.org/galaxy/galaxy-central/pull-request/510/fix-for-data-manager-failure-to-update-a#comment-3265356 + +Please read the fine manual - that and the google group are the places to learn about the options above. + +*Note on sjdbOverhang* + +From https://groups.google.com/forum/#!topic/rna-star/h9oh10UlvhI:: + + James is right, using large enough --sjdbOverhang is safer and should not generally cause any problems with reads of varying length. If your reads are very short, <50b, then I would strongly recommend using optimum --sjdbOverhang=mateLength-1 + By mate length I mean the length of one of the ends of the read, i.e. it's 100 for 2x100b PE or 1x100b SE. For longer reads you can simply use generic --sjdbOverhang 100. + It is a bit confusing because of the way I named this parameter. --sjdbOverhang Noverhang is only used at the genome generation step for constructing the reference sequence out of the annotations. 
+ Basically, the Noverhang exonic bases from the donor site and Noverhang exonic bases from the acceptor site are spliced together for each of the junctions, and these spliced sequences are added to the genome sequence. + + At the mapping stage, the reads are aligned to both genomic and splice sequences simultaneously. If a read maps to one of spliced sequences and crosses the "junction" in the middle of it, the coordinates of two spliced pieces are translated back to genomic space and added to the collection of mapped pieces, which are then all "stitched" together to form the final alignment. Since in the process of "maximal mapped length" search the read is split into pieces of no longer than --seedSearchStartLmax (=50 by default) bases, even if the read (mate) is longer than --sjdbOverhang, it can still be mapped to the spliced reference, as long as --sjdbOverhang > --seedSearchStartLmax. + + Cheers + Alex + +*Note on gene model requirements for splice junctions* + +From https://groups.google.com/forum/#!msg/rna-star/3Y_aaTuzBrE/lUylTB8h5vMJ:: + + When you generate a genome with annotations, you need to specify --sjdbOverhang value, which ideally should be equal to (oneMateLength-1), or you could use a generic value of ~100. + + Your gtf lines look fine to me. STAR needs 3 features from a GTF file: + 1. Chromosome names in col.1 that agree with chromosome names in genome .fasta files. If you have "chr2L" names in the genome .fasta files, and "2L" in the .gtf file, then you need to use --sjdbGTFchrPrefix chr option. + 2. 'exon' in col.3 for the exons of all transcripts (this name can be changed with --sjdbGTFfeatureExon) + 3. 
'transcript_id' attribute that assigns each exon to a transcript (--this name can be changed with --sjdbGTFtagExonParentTranscript) + + Cheers + Alex diff -r ab5869799e9f -r 6f9dd98d641a data_manager/macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/macros.xml Tue Apr 18 14:32:58 2017 -0400 @@ -0,0 +1,20 @@ + + + + star + samtools + + + + --readFilesCommand zcat + + + + 10.1093/bioinformatics/bts635 + + + + + + + diff -r ab5869799e9f -r 6f9dd98d641a data_manager/rna_star_index_builder.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/rna_star_index_builder.py Tue Apr 18 14:32:58 2017 -0400 @@ -0,0 +1,30 @@ +#!/usr/bin/env python + +import json +import optparse + + +def main(): + parser = optparse.OptionParser() + parser.add_option( '--config-file', dest='config_file', action='store', type="string") + parser.add_option( '--value', dest='value', action='store', type="string" ) + parser.add_option( '--dbkey', dest='dbkey', action='store', type="string" ) + parser.add_option( '--name', dest='name', action='store', type="string" ) + parser.add_option( '--subdir', dest='subdir', action='store', type="string" ) + parser.add_option( '--data-table', dest='data_table', action='store', type="string" ) + parser.add_option( '--withGTF', dest='withGTF', action='store_true' ) + (options, args) = parser.parse_args() + + if options.dbkey in [ None, '', '?' ]: + raise Exception( '"%s" is not a valid dbkey. You must specify a valid dbkey.' 
% ( options.dbkey ) ) + + withGTF = "0" + if options.withGTF: + withGTF = "1" + + data_manager_dict = {'data_tables': {options.data_table: [dict( value=options.value, dbkey=options.dbkey, name=options.name, path=options.subdir, withGTF=withGTF )]}} + open( options.config_file, 'wb' ).write( json.dumps( data_manager_dict ) ) + + +if __name__ == "__main__": + main() diff -r ab5869799e9f -r 6f9dd98d641a data_manager/rna_star_index_builder.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/rna_star_index_builder.xml Tue Apr 18 14:32:58 2017 -0400 @@ -0,0 +1,125 @@ + + builder + + + macros.xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.. class:: infomark + + --seedSearchStartLmax. + + Cheers + Alex + +*Note on gene model requirements for splice junctions* + +From https://groups.google.com/forum/#!msg/rna-star/3Y_aaTuzBrE/lUylTB8h5vMJ:: + + When you generate a genome with annotations, you need to specify --sjdbOverhang value, which ideally should be equal to (oneMateLength-1), or you could use a generic value of ~100. + + Your gtf lines look fine to me. STAR needs 3 features from a GTF file: + 1. Chromosome names in col.1 that agree with chromosome names in genome .fasta files. If you have "chr2L" names in the genome .fasta files, and "2L" in the .gtf file, then you need to use --sjdbGTFchrPrefix chr option. + 2. 'exon' in col.3 for the exons of all transcripts (this name can be changed with --sjdbGTFfeatureExon) + 3. 'transcript_id' attribute that assigns each exon to a transcript (--this name can be changed with --sjdbGTFtagExonParentTranscript) + + Cheers + Alex + +**Notice:** If you leave name, description, or id blank, it will be generated automatically. 
+]]> + + + diff -r ab5869799e9f -r 6f9dd98d641a tool-data/all_fasta.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/all_fasta.loc.sample Tue Apr 18 14:32:58 2017 -0400 @@ -0,0 +1,18 @@ +#This file lists the locations and dbkeys of all the fasta files +#under the "genome" directory (a directory that contains a directory +#for each build). The script extract_fasta.py will generate the file +#all_fasta.loc. This file has the format (white space characters are +#TAB characters): +# +# +# +#So, all_fasta.loc could look something like this: +# +#apiMel3 apiMel3 Honeybee (Apis mellifera): apiMel3 /path/to/genome/apiMel3/apiMel3.fa +#hg19canon hg19 Human (Homo sapiens): hg19 Canonical /path/to/genome/hg19/hg19canon.fa +#hg19full hg19 Human (Homo sapiens): hg19 Full /path/to/genome/hg19/hg19full.fa +# +#Your all_fasta.loc file should contain an entry for each individual +#fasta file. So there will be multiple fasta files for each build, +#such as with hg19 above. +# diff -r ab5869799e9f -r 6f9dd98d641a tool-data/rnastar_index2.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/rnastar_index2.loc.sample Tue Apr 18 14:32:58 2017 -0400 @@ -0,0 +1,23 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use a directory of rna-star indexed sequences data files. You will +#need to create these data files and then create a rnastar_index2.loc +#file similar to this one (store it in this directory) that points to +#the directories in which those files are stored. The rnastar_index2.loc +#file has this format (longer white space characters are TAB characters): +# +# +# +#The withGTF column should be 1 or 0, indicating whether the index was made +#with an annotation (i.e., --sjdbGTFfile and --sjdbOverhang were used) or not, +#respectively. +# +#Note that STAR indices can become quite large. 
Consequently, it is only +#advisable to create indices with annotations if it's known ahead of time that +#(A) the annotations won't be frequently updated and (B) the read lengths used +#will also rarely vary. If either of these is not the case, it's advisable to +#create indices without annotations and then specify an annotation file and +#maximum read length (minus 1) when running STAR. +# +#hg19 hg19 hg19 full /mnt/galaxyIndices/genomes/hg19/rnastar 0 +#hg19Ensembl hg19Ensembl hg19 full with Ensembl annotation /mnt/galaxyIndices/genomes/hg19Ensembl/rnastar 1 + diff -r ab5869799e9f -r 6f9dd98d641a tool_data_table_conf.xml.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Tue Apr 18 14:32:58 2017 -0400 @@ -0,0 +1,12 @@ + + + + value, dbkey, name, path + +

+ + + value, dbkey, name, path, withGTF + +