# HG changeset patch # User greg # Date 1326905120 18000 # Node ID a41241d67693508e98df76a0d1fbd01643ea3db2 Uploaded diff -r 000000000000 -r a41241d67693 grinder-fa1fa683bcf1/.hg_archival.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/grinder-fa1fa683bcf1/.hg_archival.txt Wed Jan 18 11:45:20 2012 -0500 @@ -0,0 +1,5 @@ +repo: b35ec780aac1e8535f35e34a541830aaf4d1676b +node: fa1fa683bcf178923f14d9aaf077187da76f0d6f +branch: default +latesttag: null +latesttagdistance: 12 diff -r 000000000000 -r a41241d67693 grinder-fa1fa683bcf1/Galaxy_readme.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/grinder-fa1fa683bcf1/Galaxy_readme.txt Wed Jan 18 11:45:20 2012 -0500 @@ -0,0 +1,5 @@ +This is an XML wrapper that provides a GUI for Grinder in Galaxy (http://galaxy.psu.edu/). + +Place these files in your Galaxy directory. More information at http://wiki.g2.bx.psu.edu/FrontPage. + +Note: The Grinder wrapper uses Galaxy builtin datasets located in the 'all_fasta' data table. diff -r 000000000000 -r a41241d67693 grinder-fa1fa683bcf1/all_fasta.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/grinder-fa1fa683bcf1/all_fasta.loc.sample Wed Jan 18 11:45:20 2012 -0500 @@ -0,0 +1,15 @@ +#This file lists the locations and dbkeys of all the fasta files +#under the "genome" directory (a directory that contains a directory +#for each build). The script extract_fasta.py will generate the file +#all_fasta.loc. +#IMPORTANT: EACH LINE OF THIS FILE HAS TO BE TAB-DELIMITED! 
## Galaxy command template for the Grinder tool.
## Grinder is launched through stderr_wrapper.py. Each option below is emitted
## only when its parameter passes the guard shown (a non-empty string, a value
## other than "None", or '1' for fastq_output).
stderr_wrapper.py
grinder
## Reference sequences: either a built-in entry looked up by value in the
## 'all_fasta' data table (last field of the matching row), or the user's
## own dataset.
#if $reference_file.specify == "builtin":
  -reference_file ${ filter( lambda x: str( x[0] ) == str( $reference_file.value ), $__app__.tool_data_tables[ 'all_fasta' ].get_fields() )[0][-1] }
#else if $reference_file.specify == "uploaded":
  -reference_file $reference_file.value
#end if
#if str($coverage_fold):
  -coverage_fold $coverage_fold
#end if
#if str($total_reads):
  -total_reads $total_reads
#end if
#if str($read_dist):
  -read_dist $read_dist
#end if
#if str($insert_dist):
  -insert_dist $insert_dist
#end if
#if str($mate_orientation):
  -mate_orientation $mate_orientation
#end if
#if str($exclude_chars):
  -exclude_chars $exclude_chars
#end if
#if str($delete_chars):
  -delete_chars $delete_chars
#end if
#if str($forward_reverse) != "None":
  -forward_reverse $forward_reverse
#end if
#if str($unidirectional):
  -unidirectional $unidirectional
#end if
#if str($length_bias):
  -length_bias $length_bias
#end if
#if str($copy_bias):
  -copy_bias $copy_bias
#end if
#if str($mutation_dist):
  -mutation_dist $mutation_dist
#end if
#if str($mutation_ratio):
  -mutation_ratio $mutation_ratio
#end if
#if str($homopolymer_dist):
  -homopolymer_dist $homopolymer_dist
#end if
#if str($chimera_perc):
  -chimera_perc $chimera_perc
#end if
#if str($chimera_dist):
  -chimera_dist $chimera_dist
#end if
#if str($chimera_kmer):
  -chimera_kmer $chimera_kmer
#end if
#if str($abundance_file) != "None":
  -abundance_file $abundance_file
#end if
#if str($abundance_model):
  -abundance_model $abundance_model
#end if
#if str($num_libraries):
  -num_libraries $num_libraries
#end if
#if str($multiplex_ids) != "None":
  -multiplex_ids $multiplex_ids
#end if
#if str($diversity):
  -diversity $diversity
#end if
#if str($shared_perc):
  -shared_perc $shared_perc
#end if
#if str($permuted_perc):
  -permuted_perc $permuted_perc
#end if
#if str($random_seed):
  -random_seed $random_seed
#end if
## Guard on desc_track itself. This previously tested $permuted_perc, so the
## option was dropped whenever permuted_perc was left empty.
#if str($desc_track):
  -desc_track $desc_track
#end if
#if str($qual_levels):
  -qual_levels $qual_levels
#end if
#if str($fastq_output) == '1':
  -fastq_output $fastq_output
#end if
#if str($profile_file) != "None":
  -profile_file $profile_file.value
#end if
int(str(num_libraries)) >= 2 and fastq_output == 1 + + + + + int(str(num_libraries)) >= 3 + + + int(str(num_libraries)) >= 3 and fastq_output == 0 + + + int(str(num_libraries)) >= 3 and str(qual_levels) and fastq_output == 0 + + + int(str(num_libraries)) >= 3 and fastq_output == 1 + + + + + int(str(num_libraries)) >= 4 + + + int(str(num_libraries)) >= 4 and fastq_output == 0 + + + int(str(num_libraries)) >= 4 and str(qual_levels) and fastq_output == 0 + + + int(str(num_libraries)) >= 4 and fastq_output == 1 + + + + + int(str(num_libraries)) >= 5 + + + int(str(num_libraries)) >= 5 and fastq_output == 0 + + + int(str(num_libraries)) >= 5 and str(qual_levels) and fastq_output == 0 + + + int(str(num_libraries)) >= 5 and fastq_output == 1 + + + + + int(str(num_libraries)) >= 6 + + + int(str(num_libraries)) >= 6 and fastq_output == 0 + + + int(str(num_libraries)) >= 6 and str(qual_levels) and fastq_output == 0 + + + int(str(num_libraries)) >= 6 and fastq_output == 1 + + + + + int(str(num_libraries)) >= 7 + + + int(str(num_libraries)) >= 7 and fastq_output == 0 + + + int(str(num_libraries)) >= 7 and str(qual_levels) and fastq_output == 0 + + + int(str(num_libraries)) >= 7 and fastq_output == 1 + + + + + int(str(num_libraries)) >= 8 + + + int(str(num_libraries)) >= 8 and fastq_output == 0 + + + int(str(num_libraries)) >= 8 and str(qual_levels) and fastq_output == 0 + + + int(str(num_libraries)) >= 8 and fastq_output == 1 + + + + + int(str(num_libraries)) >= 9 + + + int(str(num_libraries)) >= 9 and fastq_output == 0 + + + int(str(num_libraries)) >= 9 and str(qual_levels) and fastq_output == 0 + + + int(str(num_libraries)) >= 9 and fastq_output == 1 + + + + + int(str(num_libraries)) >= 10 + + + int(str(num_libraries)) >= 10 and fastq_output == 0 + + + int(str(num_libraries)) >= 10 and str(qual_levels) and fastq_output == 0 + + + int(str(num_libraries)) >= 10 and fastq_output == 1 + + + + + + + + + + + +**What it does** + +Grinder is a program to create random shotgun and 
amplicon sequence libraries +based on reference sequences in a FASTA file. Features include: + + * omic support: genomic, metagenomic, transcriptomic, metatranscriptomic, + proteomic and metaproteomic + * shotgun library or amplicon library + * arbitrary read length distribution and number of reads + * simulation of PCR and sequencing errors (chimeras, point mutations, homopolymers) + * support for creating paired-end (mate pair) datasets + * specific rank-abundance settings or manually given abundance for each genome + * creation of datasets with a given richness (alpha diversity) + * independent datasets can share a variable number of genomes (beta diversity) + * modeling of the bias created by varying genome lengths or gene copy number + * profile mechanism to store preferred options + * API to automate the creation of a large number of simulated datasets + + +**Input** + +A variety of FASTA databases containing genes or genomes can be used as input +for Grinder, such as the NCBI RefSeq collection (ftp://ftp.ncbi.nih.gov/refseq/release/microbial/), +the GreenGenes 16S rRNA database (http://greengenes.lbl.gov/Download/Sequence_Data/Fasta_data_files/Isolated_named_strains_16S_aligned.fasta), the human genome and transcriptome (ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/RefSeqGene/, ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/human.rna.fna.gz), ... + +These input files can either be provided as a Galaxy dataset, or can be uploaded +by Galaxy users in their history. + + +**Output** + +For each library requested, a first file contains the abundance of the species +in the simulated community created, e.g.:: + + # rank seqID rel. 
#!/usr/bin/env python

"""
Wrapper that executes a program with its arguments but reports standard error
messages only if the program exit status was not 0. This prevents Galaxy from
treating a run as failed merely because something (e.g. a warning) was printed
on stderr.

When the wrapped program fails, its stderr is relayed on this wrapper's stderr,
prefixed with "Error: ", and the wrapper exits with the program's own non-zero
status. When the program cannot be started at all, the exit status is 1.

Example: ./stderr_wrapper.py myprog arg1 -f arg2
Author: Florent Angly
"""

import subprocess
import sys

# The code below deliberately sticks to syntax that is valid from Python 2.4
# up to Python 3 (no "except ... as", no "with", no conditional expressions).
assert sys.version_info[:2] >= (2, 4)


def stop_err(msg, status=1):
    """Write *msg* plus a newline to stderr and terminate the process.

    msg    -- text to report on standard error.
    status -- process exit status; defaults to 1 so that a reported error is
              never accompanied by a "success" exit code.

    Raises SystemExit (through sys.exit).
    """
    sys.stderr.write("%s\n" % msg)
    sys.exit(status)


def __main__():
    """Run the command given on the command line, hiding stderr on success.

    The command is taken from sys.argv[1:] and executed without a shell; its
    stdout is inherited unchanged. Nothing happens when no command is given.
    On a non-zero exit status, or when the command cannot be launched, an
    "Error: ..." message is written to stderr and SystemExit is raised.
    """
    # Work on a copy so that the global sys.argv is left untouched.
    args = sys.argv[1:]
    if not args:
        return

    try:
        proc = subprocess.Popen(args=args, shell=False, stderr=subprocess.PIPE)
        # communicate() drains the pipe while waiting for the child. Calling
        # wait() first would deadlock as soon as the child fills the OS pipe
        # buffer with stderr output.
        stderr = proc.communicate()[1]
        returncode = proc.returncode
    except Exception:
        # Typically OSError when the program cannot be found or executed.
        # sys.exc_info() keeps this compatible with both Python 2.4 and 3.
        stop_err('Error: ' + str(sys.exc_info()[1]))

    # Python 3 hands back bytes; decode leniently so that unexpected bytes in
    # the child's output can never hide the real error message.
    if not isinstance(stderr, str):
        stderr = stderr.decode('utf-8', 'replace')

    # Only a failing program gets its stderr relayed.
    if returncode != 0:
        stop_err('Error: ' + stderr, returncode)


if __name__ == "__main__":
    __main__()