RSEM Reference %s

# HG changeset patch # User jjohnson # Date 1384196083 18000 # Node ID 64d45f9593038d400c26fff91e46d38f03be07bb Uploaded diff -r 000000000000 -r 64d45f959303 datatypes_conf.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datatypes_conf.xml Mon Nov 11 13:54:43 2013 -0500 @@ -0,0 +1,10 @@ + + + + + + + + + + diff -r 000000000000 -r 64d45f959303 rsem.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rsem.py Mon Nov 11 13:54:43 2013 -0500 @@ -0,0 +1,83 @@ +""" +SnpEff datatypes +""" +import os,os.path,re,sys +import galaxy.datatypes.data +from galaxy.datatypes.images import Html +from galaxy.datatypes.metadata import MetadataElement + +class RsemReference( Html ): + """Class describing an RSEM reference""" + MetadataElement( name='reference_name', default='galaxy_generated_bowtie_index', desc='RSEM Reference Name', readonly=True, visible=True, no_value=None ) + + file_ext = 'rsem_ref' + is_binary = True + composite_type = 'auto_primary_file' + allow_datatype_change = False + + def generate_primary_file( self, dataset = None ): + """ + This is called only at upload to write the html file + cannot rename the datasets here - they come with the default unfortunately + """ + return 'AutoGenerated Primary File for RSEM Reference Composite Dataset' + + def regenerate_primary_file(self,dataset): + """ + cannot do this until we are setting metadata + """ + refname = dataset.metadata.reference_name + flist = os.listdir(dataset.extra_files_path) + rval = ['RSEM Reference %s

Comprises the following files:

%s' % ( sfname, sfname ) ) + rval.append( '

' ) + f = file(dataset.file_name,'w') + f.write("\n".join( rval )) + f.write('\n') + f.close() + + def set_peek( self, dataset, is_multi_byte=False ): + if not dataset.dataset.purged: + dataset.peek = "RSEM Reference (%s)" % ( dataset.metadata.reference_name ) + dataset.blurb = "RSEM Reference (%s)" % ( dataset.metadata.reference_name ) + else: + dataset.peek = 'RSEM Reference (%s) does not exist' % ( dataset.metadata.reference_name ) + dataset.blurb = 'RSEM Reference (%s) purged from disk' % ( dataset.metadata.reference_name ) + + def display_peek( self, dataset ): + try: + return dataset.peek + except: + return "RSEM Reference" + + def set_meta( self, dataset, overwrite = True, **kwd ): + """ + Expecting files: + extra_files_path/.grp + extra_files_path/.ti + extra_files_path/.seq + extra_files_path/.transcripts.fa + Optionally includes files: + extra_files_path/.chrlist + extra_files_path/.idx.fa + extra_files_path/.4.ebwt + extra_files_path/.3.ebwt + extra_files_path/.2.ebwt + extra_files_path/.1.ebwt + extra_files_path/.rev.2.ebwt + extra_files_path/.rev.1.ebwt + """ + log.info( "RSEM reference set_meta %s %s" % (dataset,dataset.extra_files_path)) + pat = '^(.*)\.grp$' + efp = dataset.extra_files_path + flist = os.listdir(efp) + for i,fname in enumerate(flist): + m = re.match(pat,fname) + if m: + dataset.metadata.reference_name = m.groups()[0] + break + self.regenerate_primary_file(dataset) + + diff -r 000000000000 -r 64d45f959303 rsem_calculate_expression.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rsem_calculate_expression.xml Mon Nov 11 13:54:43 2013 -0500 @@ -0,0 +1,583 @@ + + RNA-Seq by Expectation-Maximization + + rsem + samtools + bowtie + + + rsem-calculate-expression + --calc-ci $useci.ci + --fragment-length-mean $fraglenmean + --fragment-length-min $fraglenmin + --fragment-length-sd $fraglensd + --fragment-length-max $fraglenmax + --bowtie-e $bowtie_e + --bowtie-m $bowtie_m + + #if $input.format=="fastq" + ## IF FASTQ AND SINGLE END READS 
(DEFAULTS) + #if $input.fastqmatepair.matepair=="single" #rsem-wrapper-1.1.17.pl --bam_genome $bam_genome --bamtype $bamtype + --seed-length $seedlength $input.fastq_select --estimate-rspd $rspd --forward-prob + $fprob -p $cpus --bowtie-n $bowtie_mis --output-genome-bam --single_fastq $singlefastq + --output $output --isoformfile $isoforms --bamfile $bam_res --log $log + --sampling-for-bam $sampling_for_bam --reference ${index.fields.path} + #end if + ## IF FASTQ AND PAIRED END READS (DEFAULTS) + #if $input.fastqmatepair.matepair=="paired" #rsem-wrapper-1.1.17.pl --bam_genome $bam_genome --bamtype $bamtype + --paired-end --seed-length $seedlength --estimate-rspd $rspd $input.fastq_select --forward-prob $fprob -p $cpus + --bowtie-n $bowtie_mis --output-genome-bam --fastq1 $fastq1 --fastq2 $fastq2 --output + $output --isoformfile $isoforms --bamfile $bam_res --log $log --sampling-for-bam + $sampling_for_bam --reference ${index.fields.path} + #end if + #end if + #if $input.format=="fasta" + ## IF FASTA AND SINGLE END READS (DEFAULTS) + #if $input.fastamatepair.matepair=="single" #rsem-wrapper-1.1.17.pl --bam_genome $bam_genome --bamtype $bamtype + --no-qualities --seed-length $seedlength --estimate-rspd $rspd --forward-prob $fprob -p $cpus --bowtie-n $bowtie_mis + --output-genome-bam --single_fasta $single_fasta --output $output --isoformfile + $isoforms --bamfile $bam_res --log $log --sampling-for-bam $sampling_for_bam --reference + ${index.fields.path} + #end if + ## IF FASTA AND PAIRED END READS (DEFAULTS) + #if $input.fastamatepair.matepair=="paired" #rsem-wrapper-1.1.17.pl --bam_genome $bam_genome --bamtype $bamtype + --no-qualities --paired-end --seed-length $seedlength --estimate-rspd $rspd --forward-prob $fprob -p $cpus + --bowtie-n $bowtie_mis --output-genome-bam --fasta1 $fasta1 --fasta2 $fasta2 --output + $output --isoformfile $isoforms --bamfile $bam_res --log $log --sampling-for-bam + $sampling_for_bam --reference ${index.fields.path} + #end if + #end if 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Is the library strand specific? + + + + + + + + + + + + + + + + + + + + Sample Bam File + + + + + Estimate and correct for a non-uniform read start position distribution (RSPD) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + bamtype == "yes" + + + + + + + +RSEM HOME PAGE - http://deweylab.biostat.wisc.edu/rsem/ + +NAME + rsem-calculate-expression + +SYNOPSIS + rsem-calculate-expression [options] upstream_read_file(s) reference_name sample_name + rsem-calculate-expression [options] --paired-end upstream_read_file/s downstream_read_file/s reference_name sample_name + rsem-calculate-expression [options] --sam/--bam [--paired-end] input reference_name sample_name + +ARGUMENTS + upstream_read_files/s + Comma-separated list of files containing single-end reads or + upstream reads for paired-end data. By default, these files are + assumed to be in FASTQ format. If the --no-qualities option is + specified, then FASTA format is expected. + + downstream_read_file/s + Comma-separated list of files containing downstream reads which are + paired with the upstream reads. By default, these files are assumed + to be in FASTQ format. If the --no-qualities option is specified, + then FASTA format is expected. + + input + SAM/BAM formatted input file. If "-" is specified for the filename, + SAM/BAM input is instead assumed to come from standard input. RSEM + requires all alignments of the same read group together. For + paired-end reads, RSEM also requires the two mates of any alignment + be adjacent. See Description section for how to make input file obey + RSEM's requirements. + + reference_name + The name of the reference used. The user must have run + 'rsem-prepare-reference' with this reference_name before running + this program. + + sample_name + The name of the sample analyzed. 
All output files are prefixed by + this name (e.g., sample_name.genes.results) + +OPTIONS + + --paired-end + Input reads are paired-end reads. (Default: off) + + --no-qualities + Input reads do not contain quality scores. (Default: off) + + --strand-specific + The RNA-Seq protocol used to generate the reads is strand specific, + i.e., all (upstream) reads are derived from the forward strand. This + option is equivalent to --forward-prob=1.0. With this option set, if + RSEM runs the Bowtie aligner, the '--norc' Bowtie option will be + used, which disables alignment to the reverse strand of transcripts. + (Default: off) + + --sam + Input file is in SAM format. (Default: off) + + --bam + Input file is in BAM format. (Default: off) + + --sam-header-info [file] + RSEM reads header information from input by default. If this option + is on, header information is read from the specified file. For the + format of the file, please see the official SAM website. (Default: "") + + -p/--num-threads [int] + Number of threads to use. Both Bowtie and expression estimation will + use this many threads. (Default: 1) + + --no-bam-output + Do not output any BAM file. (Default: off) + + --output-genome-bam + Generate a BAM file, 'sample_name.genome.bam', with alignments + mapped to genomic coordinates and annotated with their posterior + probabilities. In addition, RSEM will call samtools (included in + RSEM package) to sort and index the bam file. + 'sample_name.genome.sorted.bam' and + 'sample_name.genome.sorted.bam.bai' will be generated. (Default: + off) + + --sampling-for-bam + When RSEM generates a BAM file, instead of outputting all alignments + a read has with their posterior probabilities, one alignment is + sampled and output according to the posterior probabilities. If + the sampling result is that the read comes from the "noise" + transcript, nothing is output. (Default: off) + + --calc-ci + Calculate 95% credibility intervals and posterior mean estimates. 
+ (Default: off) + + --seed-length [int] + Seed length used by the read aligner. Providing the correct value is + important for RSEM. If RSEM runs Bowtie, it uses this value for + Bowtie's seed length parameter. Any read with its or at least one of + its mates' (for paired-end reads) length less than this value will + be ignored. If poly(A) tails are not added to the references, the + minimum allowed value is 5; otherwise, the minimum allowed value is + 25. Note that this script will only check that the value is greater than or equal to + 5 and give a warning message if the value is less than 25 but greater than or equal to + 5. (Default: 25) + + --tag [string] + The name of the optional field used in the SAM input for identifying + a read with too many valid alignments. The field should have the + format [tagName]:i:[value], where a [value] bigger than 0 indicates + a read with too many alignments. (Default: "") + + --bowtie-path [path] + The path to the bowtie executables. (Default: the path to the bowtie + executables is assumed to be in the user's PATH environment + variable) + + --bowtie-n [int] + (Bowtie parameter) max # of mismatches in the seed. (Range: 0-3, + Default: 2) + + --bowtie-e [int] + (Bowtie parameter) max sum of mismatch quality scores across the + alignment. (Default: 99999999) + + --bowtie-m [int] + (Bowtie parameter) suppress all alignments for a read if more than [int] + valid alignments exist. (Default: 200) + + --bowtie-chunkmbs [int] + (Bowtie parameter) memory allocated for best first alignment + calculation (Default: 0 - use bowtie's default) + + --phred33-quals + Input quality scores are encoded as Phred+33. (Default: on) + + --phred64-quals + Input quality scores are encoded as Phred+64 (default for GA + Pipeline ver. 1.3 or later). (Default: off) + + --solexa-quals + Input quality scores are solexa encoded (from GA Pipeline ver. less + than 1.3). 
(Default: off) + + --forward-prob [double] + Probability of generating a read from the forward strand of a + transcript. Set to 1 for a strand-specific protocol where all + (upstream) reads are derived from the forward strand, 0 for a + strand-specific protocol where all (upstream) read are derived from + the reverse strand, or 0.5 for a non-strand-specific protocol. + (Default: 0.5) + + --fragment-length-min [int] + Minimum read/insert length allowed. This is also the value for the + bowtie -I option. (Default: 1) + + --fragment-length-max [int] + Maximum read/insert length allowed. This is also the value for the + bowtie -X option. (Default: 1000) + + --fragment-length-mean [double] + (single-end data only) The mean of the fragment length distribution, + which is assumed to be a Gaussian. (Default: -1, which disables use + of the fragment length distribution) + + --fragment-length-sd [double] + (single-end data only) The standard deviation of the fragment length + distribution, which is assumed to be a Gaussian. (Default: 0, which + assumes that all fragments are of the same length, given by the + rounded value of --fragment-length-mean) + + --estimate-rspd + Set this option if you want to estimate the read start position + distribution (RSPD) from data. Otherwise, RSEM will use a uniform + RSPD. (Default: off) + + --num-rspd-bins [int] + Number of bins in the RSPD. Only relevant when '--estimate-rspd' is + specified. Use of the default setting is recommended. (Default: 20) + + --ci-memory [int] + Maximum size (in memory, MB) of the auxiliary buffer used for + computing credibility intervals (CI). Set it larger for a faster CI + calculation. However, leaving 2 GB memory free for other usage is + recommended. (Default: 1024) + + --keep-intermediate-files + Keep temporary files generated by RSEM. RSEM creates a temporary + directory, 'sample_name.temp', into which it puts all intermediate + output files. 
If this directory already exists, RSEM overwrites all + files generated by previous RSEM runs inside of it. By default, + after RSEM finishes, the temporary directory is deleted. Set this + option to prevent the deletion of this directory and the + intermediate files inside of it. (Default: off) + + --time + Output time consumed by each step of RSEM to 'sample_name.time'. + (Default: off) + + -q/--quiet + Suppress the output of logging information. (Default: off) + + -h/--help + Show help information. + +DESCRIPTION + In its default mode, this program aligns input reads against a reference + transcriptome with Bowtie and calculates expression values using the + alignments. RSEM assumes the data are single-end reads with quality + scores, unless the '--paired-end' or '--no-qualities' options are + specified. Users may use an alternative aligner by specifying one of the + --sam and --bam options, and providing an alignment file in the + specified format. However, users should make sure that they align + against the indices generated by 'rsem-prepare-reference' and the + alignment file satisfies the requirements mentioned in the ARGUMENTS + section. + + One simple way to make the alignment file satisfy RSEM's requirements + (assuming the aligner used puts mates in a paired-end read adjacent) is + to use the 'convert-sam-for-rsem' script. This script only accepts SAM format + files as input. If a BAM format file is obtained, please use samtools to + convert it to a SAM file first. For example, if '/ref/mouse_125' is the + 'reference_name' and the SAM file is named 'input.sam', you can run the + following command: + + convert-sam-for-rsem /ref/mouse_125 input.sam -o input_for_rsem.sam + + For details, please refer to 'convert-sam-for-rsem's documentation page. + + The SAM/BAM format RSEM uses is v1.4. It is compatible with the old + SAM/BAM format; however, RSEM cannot recognize 0x100 in the FLAG field. + In addition, RSEM requires that SEQ and QUAL are not '*'. 
+ + The user must run 'rsem-prepare-reference' with the appropriate + reference before using this program. + + For single-end data, it is strongly recommended that the user provide + the fragment length distribution parameters (--fragment-length-mean and + --fragment-length-sd). For paired-end data, RSEM will automatically + learn a fragment length distribution from the data. + + Please note that some of the default values for the Bowtie parameters + are not the same as those defined for Bowtie itself. + + The temporary directory and all intermediate files will be removed when + RSEM finishes unless '--keep-intermediate-files' is specified. + + With the '--calc-ci' option, 95% credibility intervals and posterior + mean estimates will be calculated in addition to maximum likelihood + estimates. + +OUTPUT + sample_name.genes.results + File containing gene level expression estimates. The format of each + line in this file is: + + gene_id expected_counts tau_value [pmc_value tau_pme_value + tau_ci_lower_bound tau_ci_upper_bound] transcript_id_list + + Fields are separated by the tab character. Fields within "[]" are + only presented if '--calc-ci' is set. pme stands for posterior mean + estimation. pmc stands for posterior mean counts. ci_lower_bound(l) + means the lower bound of the credibility intervals, + ci_upper_bound(u) means the upper bound of the credibility + intervals. So the credibility interval is [l, u]. + 'transcript_id_list' is a space-separated list of transcript_ids + belonging to the gene. If no gene information is provided, this file + has the same content as 'sample_name.isoforms.results'. + + sample_name.isoforms.results + File containing isoform level expression values. The format of each + line in this file is: + + transcript_id expected_counts tau_value [pmc_value tau_pme_value + tau_ci_lower_bound tau_ci_upper_bound] gene_id + + Fields are separated by the tab character. 'gene_id' is the gene_id + of the gene which this transcript belongs to. 
If no gene information + is provided, 'gene_id' and 'transcript_id' are the same. + + sample_name.transcript.bam, sample_name.transcript.sorted.bam and + sample_name.transcript.sorted.bam.bai + Only generated when --no-bam-output is not specified. + + 'sample_name.transcript.bam' is a BAM-formatted file of read + alignments in transcript coordinates. The MAPQ field of each + alignment is set to min(100, floor(-10 * log10(1.0 - w) + 0.5)), + where w is the posterior probability of that alignment being the + true mapping of a read. In addition, RSEM pads a new tag ZW:f:value, + where value is a single precision floating number representing the + posterior probability. + + 'sample_name.transcript.sorted.bam' and + 'sample_name.transcript.sorted.bam.bai' are the sorted BAM file and + indices generated by samtools (included in RSEM package). + + sample_name.genome.bam, sample_name.genome.sorted.bam and + sample_name.genome.sorted.bam.bai + Only generated when --no-bam-output is not specified and + --output-genome-bam is specified. + + 'sample_name.genome.bam' is a BAM-formatted file of read alignments + in genomic coordinates. Alignments of reads that have identical + genomic coordinates (i.e., alignments to different isoforms that + share the same genomic region) are collapsed into one alignment. The + MAPQ field of each alignment is set to min(100, floor(-10 * + log10(1.0 - w) + 0.5)), where w is the posterior probability of that + alignment being the true mapping of a read. In addition, RSEM pads a + new tag ZW:f:value, where value is a single precision floating + number representing the posterior probability. If an alignment is + spliced, a XS:A:value tag is also added, where value is either '+' + or '-' indicating the strand of the transcript it aligns to. + + 'sample_name.genome.sorted.bam' and + 'sample_name.genome.sorted.bam.bai' are the sorted BAM file and + indices generated by samtools (included in RSEM package). 
+ + sample_name.sam.gz + Only generated when the input files are raw reads instead of SAM/BAM + format files + + It is the gzipped SAM output produced by the bowtie aligner. + + sample_name.time + Only generated when --time is specified. + + It contains the time (in seconds) consumed by aligning reads, estimating + expression levels and calculating credibility intervals. + + sample_name.stat + This is a folder instead of a file. All model related statistics are + stored in this folder. 'rsem-plot-model' can generate plots + using this folder. + +EXAMPLES + Assume the path to the bowtie executables is in the user's PATH + environment variable. Reference files are under '/ref' with name + 'mouse_125'. + + 1) '/data/mmliver.fq', single-end reads with quality scores. Quality + scores are encoded as for 'GA pipeline version >= 1.3'. We want to use 8 + threads and generate a genome BAM file: + + rsem-calculate-expression --phred64-quals \ + -p 8 \ + --output-genome-bam \ + /data/mmliver.fq \ + /ref/mouse_125 \ + mmliver_single_quals + + 2) '/data/mmliver_1.fq' and '/data/mmliver_2.fq', paired-end reads with + quality scores. Quality scores are in SANGER format. We want to use 8 + threads and not generate a genome BAM file: + + rsem-calculate-expression -p 8 \ + --paired-end \ + /data/mmliver_1.fq \ + /data/mmliver_2.fq \ + /ref/mouse_125 \ + mmliver_paired_end_quals + + 3) '/data/mmliver.fa', single-end reads without quality scores. We want + to use 8 threads: + + rsem-calculate-expression -p 8 \ + --no-qualities \ + /data/mmliver.fa \ + /ref/mouse_125 \ + mmliver_single_without_quals + + 4) Data are the same as 1). We want to take a fragment length + distribution into consideration. We set the fragment length mean to 150 + and the standard deviation to 35. In addition to a BAM file, we also + want to generate credibility intervals. 
We allow RSEM to use 1GB of + memory for CI calculation: + + rsem-calculate-expression --bowtie-path /sw/bowtie \ + --phred64-quals \ + --fragment-length-mean 150.0 \ + --fragment-length-sd 35.0 \ + -p 8 \ + --output-genome-bam \ + --calc-ci \ + --ci-memory 1024 \ + /data/mmliver.fq \ + /ref/mouse_125 \ + mmliver_single_quals + + 5) '/data/mmliver_paired_end_quals.bam', paired-end reads with quality + scores. We want to use 8 threads: + + rsem-calculate-expression --paired-end \ + --bam \ + -p 8 \ + /data/mmliver_paired_end_quals.bam \ + /ref/mouse_125 \ + mmliver_paired_end_quals + + diff -r 000000000000 -r 64d45f959303 rsem_prepare_reference.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rsem_prepare_reference.xml Mon Nov 11 13:54:43 2013 -0500 @@ -0,0 +1,114 @@ + + + + rsem + bowtie + + + rsem-prepare-reference + #if $polya.polya_use == 'add': + #if $polya.polya_length: + --polyA-length $polya.polya_length + #end if + #elif $polya.polya_use == 'subset': + --no-polyA-subset $polya.no_polya_subset + #if $polya.polya_length: + --polyA-length $polya.polya_length + #end if + #elif $polya.polya_use == 'none': + --no-polyA + #end if + $ntog + #if $transcript_to_gene_map: + --transcript-to-gene-map $transcript_to_gene_map + #end if + #if $reference.ref_type == 'transcripts': + $reference.reference_fasta_file + #else: + --gtf $reference.gtf + $reference.reference_fasta_file + #end if + $reference_name + + + + + + + + + + + + + + + + + + + + Each line of should be of the form: gene_id transcript_id ( with the two fields separated by a tab character ) + The map can be obtained from the UCSC table browser + group: Genes and Gene Prediction Tracks + table: knownIsoforms + Without a map: + If a reference genome and gtf is used, then RSEM uses the "gene_id" and "transcript_id" attributes in the GTF file. + Otherwise, RSEM assumes that each sequence in the reference sequence files is a separate gene. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +RSEM HOME PAGE - http://deweylab.biostat.wisc.edu/rsem/ + +NAME + rsem-prepare-reference + +SYNOPSIS + rsem-prepare-reference [options] reference_fasta_file(s) reference_name + +DESCRIPTION + The rsem-prepare-reference program extracts/preprocesses the reference sequences and builds Bowtie indices using default parameters. + This program is used in conjunction with the 'rsem-calculate-expression' program. + +INPUTS + + + + + diff -r 000000000000 -r 64d45f959303 tool-data/rsem_indices.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/rsem_indices.loc.sample Mon Nov 11 13:54:43 2013 -0500 @@ -0,0 +1,14 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use a directory of Bowtie indexed sequences data files. You will +#need to create these data files and then create a bowtie_indices.loc +#file similar to this one (store it in this directory) that points to +#the directories in which those files are stored. The bowtie_indices.loc +#file has this format (longer white space characters are TAB characters): +# +# +# +#So, for example, if you had hg18 indexed stored in +#/depot/data2/galaxy/bowtie/hg18/, +#then the bowtie_indices.loc entry would look like this: +# +#hg18 hg18 hg18 /depot/data2/galaxy/bowtie/hg18/hg18 diff -r 000000000000 -r 64d45f959303 tool_data_table_conf.xml.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Mon Nov 11 13:54:43 2013 -0500 @@ -0,0 +1,8 @@ + + + + value, dbkey, name, path + +

+ + diff -r 000000000000 -r 64d45f959303 tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Mon Nov 11 13:54:43 2013 -0500 @@ -0,0 +1,12 @@ + + + + + + + + + + + +