# HG changeset patch # User greg # Date 1489683285 14400 # Node ID eda72de4717127bca65ed4485a40db6dcf6e1a9e # Parent 77fe1183f3e3dc6ef30dd1444ca7906c7b1097cc Uploaded diff -r 77fe1183f3e3 -r eda72de47171 assembly_post_processor.xml --- a/assembly_post_processor.xml Fri Mar 03 11:19:25 2017 -0500 +++ b/assembly_post_processor.xml Thu Mar 16 12:54:45 2017 -0400 @@ -1,5 +1,5 @@ - - into putative coding sequences + + post-processes de novo transcriptome assembly plant_tribes_assembly_post_processor @@ -39,9 +39,9 @@ ]]> - + - + @@ -51,20 +51,20 @@ - + - + - + @@ -76,16 +76,16 @@ - - + + - + - + @@ -123,56 +123,160 @@ -This tool is one of the PlantTribes' collection of automated modular analysis pipelines that utilize objective classifications of -complete protein sequences from sequenced plant genomes to perform comparative evolutionary studies. It postprocesses de novo assembly -transcripts into putative coding sequences and their corresponding amino acid translations, locally assembling targeted gene families. +This tool is one of the PlantTribes' collection of automated modular analysis pipelines for comparative and +evolutionary analyses of genome-scale gene families and transcriptomes. This tool post-processes de novo +assembled transcripts into putative coding sequences and their corresponding amino acid translations and +optionally assigns transcripts to circumscribed gene families ("orthogroups")[2]. After transcripts have been +assigned to gene families, overlapping contigs can be identified and merged to reduce fragmentation in the +de novo assembly. ----- -**Options** +**Required options** + + * **Transcriptome assembly fasta file** - either de novo or reference-guided transcriptome assembly fasta file selected from your history. + * **Coding regions prediction method** - method for finding coding regions within transcripts. Available methods are TransDecoder[3] and ESTScan[4]. 
+ * **Scores matrices** - Scores matrices, based on a related species, are required when ESTScan is used to find coding regions. Details of how to create species-specific scores matrices can be found on the ESTScan website (http://estscan.sourceforge.net). Matrices of some organisms are also available to download. + +**Other options** - * **Prediction method for coding regions** - The prediction method for coding regions; one of ESTScan or TransDecoder. - * **Scores matrices** - Scores matrices that reflect the codons preferences in the studied organisms. - * **Target gene family assembly?** - Select 'Yes' to target gene families to assemble. - * **List of orthogroup identifiers for target gene families to assemble** - History item with a list of orthogroup identifiers for target gene families to assemble. - * **Orthogroups or gene families proteins scaffold** - PlantTribes scaffolds data installed into Galaxy by the PlantTribes Scaffolds Download Data Manager tool. - * **Protein clustering method** - One of GFam (domain architecture based clustering), OrthoFinder (broadly defined clusters) or OrthoMCL (narrowly defined clusters). - * **Remove sites in alignments with gaps of** - Removes gappy sites in alignments (i.e. 0.1 removes sites with 90% gaps): [0.0 to 1.0] - * **De novo transcriptome assembly was performed with strand-specific library?** - Select 'Yes' if de novo transcriptome assembly was performed with strand-specific library. - * **Remove repeated sequences in predicted coding regions?** - Select yes to remove repeated sequences in predicted coding regions. - * **Minimum sequence length of predicted coding regions** - Set the minimum sequence length of predicted coding regions. + * **Perform targeted gene assembly?** - Selecting 'Yes' enables local assembly of one or more targeted gene families in a specific scaffold. Scaffolds are defined in PlantTribes as clusters of paralogous/orthologous sequences from a specified set of proteomes[5-7]. 
+ * **Targeted gene families** - A selected history item containing a list of targeted orthogroup identifiers corresponding to the gene family classification from a specified scaffold. Gene identifiers can be obtained from the function annotation table ("Orthogroup ID" field of *.summary file) of scaffold data installed into Galaxy via the PlantTribes Scaffolds Download Data Manager tool. + * **Gene family scaffold** - one of the PlantTribes gene family scaffolds (installed into Galaxy by the PlantTribes Scaffolds Download Data Manager tool) whose orthogroup(s) are targeted for the localized assembly. + * **Protein clustering method** - gene family scaffold protein clustering method. Each PlantTribes scaffold data has up to three sets of clusters - GFam[8] (clusters of consensus domain architecture), OrthoFinder[9] (broadly defined clusters) or OrthoMCL[10] (narrowly defined clusters). You can also install your own data scaffold created using a different clustering method as long as it conforms to the PlantTribes scaffold data format. + * **Trim alignments** - trim gene family multiple sequence alignments that include scaffold backbone genes and locally assembled transcripts to remove non-conserved regions (gappy sites)[11]. The trimmed alignments are used in assigning scores to locally assembled transcripts to determine how well they compare to the backbone gene models. The default setting of 0.1 removes sites that have gaps in 90% of the sequences in the multiple sequence alignment. This option is restricted to the range 0.0 - 1.0. + * **Strand-specific assembly?** - select 'Yes' if transcriptome library sequences were strand-specific. If 'Yes' is selected, transcripts from the minority strand (antisense) are removed. + * **Remove duplicate sequences?** - select 'Yes' to remove duplicated and exact subsequences[12]. + * **Minimum sequence length** - set the minimum sequence length of predicted coding regions. The default is 200 bp. 
@unpublished{None, + key = {1}, author = {Eric Wafula}, - title = {None}, + title = {Manuscript in preparation}, year = {None}, url = {https://github.com/dePamphilis/PlantTribes} - } + } + + + @published{PloS one, + key = {2}, + author = {Honaas, L.A., Wafula, E.K., Wickett, N.J., Der, J.P., Zhang, Y., Edger, P.P., Altman, N.S., Pires, J.C. and Leebens-Mack, J.H.}, + title = {Selecting superior de novo transcriptome assemblies: lessons learned by leveraging the best plant genome}, + year = {2016}, + volume = {11}, + number = {1}, + pages = {e0146062} + } + - @published{Proc Int Conf Intell Syst Mol Biol, + @published{Nature protocols, + key = {3}, + author = {Haas, B.J., Papanicolaou, A., Yassour, M., Grabherr, M., Blood, P.D., Bowden, J., Couger, M.B., Eccles, D., Li, B., Lieber, M. and MacManes, M.D.}, + title = {De novo transcript sequence reconstruction from RNA-seq using the Trinity platform for reference generation and analysis}, + year = {2013}, + volume = {8}, + number = {8}, + pages = {1494-1512} + } + + + @published{ISMB, + key = {4}, author = {Iseli C, Jongeneel CV, Bucher P.}, title = {ESTScan: a program for detecting, evaluating, and reconstructing potential coding regions in EST sequences.}, year = {1999}, - url = {http://estscan.sourceforge.net/} - } - 10.1038/nprot.2013.084 - 10.1109/tcbb.2013.68 + volume = {99}, + pages = {138-148}, + url = {http://estscan.sourceforge.net} + } + + + @published{Genome research, + key = {5}, + author = {Huang X, Madan A}, + title = {CAP3: A DNA sequence assembly program}, + year = {1999}, + volume = {9}, + number = {9}, + pages = {868-877}, + url = {http://seq.cs.iastate.edu/cap3.html} + } + + + @published{Genome Inform, + key = {6}, + author = {Eddy, S.R.}, + title = {A new generation of homology search tools based on probabilistic inference}, + year = {2009}, + volume = {23}, + number = {1}, + pages = {205-211} + } + + + @published{Nucleic acids research, + key = {7}, + author = {Sasidharan, R., Nepusz, T., 
Swarbreck, D., Huala, E. and Paccanaro, A.}, + title = {GFam: a platform for automatic annotation of gene families}, + year = {2012}, + pages = {gks631} + } + + + @published{Genome research, + key = {8}, + author = {Li, L., Stoeckert, C.J. and Roos, D.S.}, + title = {OrthoMCL: identification of ortholog groups for eukaryotic genomes}, + year = {2003}, + volume = {13}, + number = {9}, + pages = {2178-2189} + } + + + @published{Genome biology, + key = {9}, + author = {Emms, D.M. and Kelly, S.}, + title = {OrthoFinder: solving fundamental biases in whole genome comparisons dramatically improves orthogroup inference accuracy}, + year = {2015}, + volume = {16}, + number = {1}, + pages = {157} + } + + + @published{Bioinformatics, + key = {10}, + author = {Capella-Gutiérrez, S., Silla-Martínez, J.M. and Gabaldón, T.}, + title = {trimAl: a tool for automated alignment trimming in large-scale phylogenetic analyses}, + year = {2009}, + volume = {25}, + number = {15}, + pages = {1972-1973} + } + + + @published{IEEE/ACM Transactions on Computational Biology and Bioinformatics, + key = {11}, + author = {Gremme, G., Steinbiss, S. and Kurtz, S.}, + title = {GenomeTools: a comprehensive software library for efficient processing of structured genome annotations}, + year = {2013}, + volume = {10}, + number = {3}, + pages = {645-656} + } + @unpublished{None, + key = {12}, author = {None}, title = {HMMER 3.1+ hmmscan search sequence(s) against a profile database}, year = {2013}, url = {http://hmmer.org/} - } - - @published{Genome Research, - author = {Huang X, Madan A}, - title = {CAP3: A DNA sequence assembly program}, - year = {1999}, - url = {http://seq.cs.iastate.edu/cap3.html} - } + } +