Mercurial > repos > greg > assembly_post_processor

--- a/assembly_post_processor.xml	Fri Mar 03 11:19:25 2017 -0500
+++ b/assembly_post_processor.xml	Thu Mar 16 12:54:45 2017 -0400
@@ -1,5 +1,5 @@
-<tool id="plant_tribes_assembly_post_processor" name="Postprocess de novo assembly transcripts" version="0.4.0">
-    <description>into putative coding sequences</description>
+<tool id="plant_tribes_assembly_post_processor" name="AssemblyPostProcessor" version="0.4.0">
+    <description>post-processes de novo transcriptome assembly</description>
     <requirements>
         <requirement type="package" version="0.4">plant_tribes_assembly_post_processor</requirement>
     </requirements>
@@ -39,9 +39,9 @@
         ]]>
     </command>
     <inputs>
-        <param name="input" format="fasta" type="data" label="De novo transcriptome assembly fasta file"/>
+        <param name="input" format="fasta" type="data" label="Transcriptome assembly fasta file"/>
         <conditional name="prediction_method_cond">
-            <param name="prediction_method" type="select" label="Prediction method for coding regions">
+            <param name="prediction_method" type="select" label="Coding regions prediction method">
                 <option value="transdecoder" selected="true">TransDecoder</option>
                 <option value="estscan">ESTScan</option>
             </param>
@@ -51,20 +51,20 @@
             </when>
         </conditional>
         <conditional name="options_type">
-            <param name="options_type_selector" type="select" label="Options Configuration">
+            <param name="options_type_selector" type="select" label="Options configuration">
                 <option value="basic" selected="true">Basic</option>
                 <option value="advanced">Advanced</option>
             </param>
             <when value="basic" />
             <when value="advanced">
                 <conditional name="target_gene_family_assembly_cond">
-                    <param name="target_gene_family_assembly" type="select" label="Target gene family assembly?">
+                    <param name="target_gene_family_assembly" type="select" label="Perform targeted gene assembly?">
                         <option value="no" selected="true">No</option>
                         <option value="yes">Yes</option>
                     </param>
                     <when value="no" />
                     <when value="yes">
-                        <param name="orthogroups" format="tabular" type="data" label="List of orthogroup identifiers for target gene families to assemble"/>
+                        <param name="orthogroups" format="tabular" type="data" label="Targeted gene families"/>
                         <param name="scaffold" type="select" label="Orthogroups or gene families proteins scaffold">
                             <options from_data_table="plant_tribes_scaffolds" />
                             <validator type="no_options" message="No PlantTribes scaffolds are available.  Use the PlantTribes Scaffolds Download Data Manager tool in Galaxy to install and populate the PlantTribes scaffolds data table."/>
@@ -76,16 +76,16 @@
                         </param>
                     </when>
                 </conditional>
-                <param name="gap_trimming" type="float" value="0.1" min="0" max="1.0" label="Remove sites in alignments with gaps of" help="0.1 removes sites with 90% gaps"/>
-                <param name="strand_specific" type="select" label="De novo transcriptome assembly was performed with strand-specific library?">
+                <param name="gap_trimming" type="float" value="0.1" min="0" max="1.0" label="Trim alignments"/>
+                <param name="strand_specific" type="select" label="Strand-specific assembly?">
                     <option value="no" selected="true">No</option>
                     <option value="yes">Yes</option>
                 </param>
-                <param name="dereplicate" type="select" label="Remove repeated sequences in predicted coding regions?">
+                <param name="dereplicate" type="select" label="Remove duplicate sequences?">
                     <option value="no" selected="true">No</option>
                     <option value="yes">Yes</option>
                 </param>
-                <param name="min_length" type="integer" value="200" label="Minimum sequence length of predicted coding regions"/>
+                <param name="min_length" type="integer" value="200" label="Minimum sequence length"/>
             </when>
         </conditional>
     </inputs>
@@ -123,56 +123,160 @@
         </test>
     </tests>
     <help>
-This tool is one of the PlantTribes' collection of automated modular analysis pipelines that utilize objective classifications of
-complete protein sequences from sequenced plant genomes to perform comparative evolutionary studies. It postprocesses de novo assembly
-transcripts into putative coding sequences and their corresponding amino acid translations, locally assembling targeted gene families.
+This tool is one of the PlantTribes' collection of automated modular analysis pipelines for comparative and
+evolutionary analyses of genome-scale gene families and transcriptomes.  This tool post-processes de novo
+assembled transcripts into putative coding sequences and their corresponding amino acid translations and
+optionally assigns transcripts to circumscribed gene families ("orthogroups")[2].  After transcripts have been
+assigned to gene families, overlapping contigs can be identified and merged to reduce fragmentation in the
+de novo assembly.

 -----

-**Options**
+**Required options**
+
+ * **Transcriptome assembly fasta file** - either de novo or reference-guided transcriptome assembly fasta file selected from your history.
+ * **Coding regions prediction method** - method for finding coding regions within transcripts.  Available methods are TransDecoder[3] and ESTScan[4].
+ * **Scores matrices** - Scores matrices, based on a related species, are required when ESTScan is used to find coding regions.  Details of how to create species-specific scores matrices can be found on the ESTScan website (http://estscan.sourceforge.net).  Matrices of some organisms are also available to download.
+
+**Other options**

- * **Prediction method for coding regions** - The prediction method for coding regions; one of ESTScan or TransDecoder.
- * **Scores matrices** - Scores matrices that reflect the codons preferences in the studied organisms.
- * **Target gene family assembly?** - Select 'Yes' to target gene families to assemble.
- * **List of orthogroup identifiers for target gene families to assemble** - History item with a list of orthogroup identifiers for target gene families to assemble.
- * **Orthogroups or gene families proteins scaffold** - PlantTribes scaffolds data installed into Galaxy by the PlantTribes Scaffolds Download Data Manager tool.
- * **Protein clustering method** - One of GFam (domain architecture based clustering), OrthoFinder (broadly defined clusters) or OrthoMCL (narrowly defined clusters).
- * **Remove sites in alignments with gaps of** - Removes gappy sites in alignments (i.e. 0.1 removes sites with 90% gaps): [0.0 to 1.0]
- * **De novo transcriptome assembly was performed with strand-specific library?** - Select 'Yes' if de novo transcriptome assembly was performed with strand-specific library.
- * **Remove repeated sequences in predicted coding regions?** - Select yes to remove repeated sequences in predicted coding regions.
- * **Minimum sequence length of predicted coding regions** - Set the minimum sequence length of predicted coding regions.
+ * **Perform targeted gene assembly?** - Selecting 'Yes' enables local assembly of one or more targeted gene families in a specific scaffold.  Scaffolds are defined in PlantTribes as clusters of paralogous/orthologous sequences from a specified set of proteomes[5-7].
+ * **Targeted gene families** - A selected history item containing a list of targeted orthogroup identifiers corresponding to the gene family classification from a specified scaffold.  Gene identifiers can be obtained from the function annotation table ("Orthogroup ID" field of *.summary file) of scaffold data installed into Galaxy via the PlantTribes Scaffolds Download Data Manager tool.
+ * **Gene family scaffold** - one of the PlantTribes gene family scaffolds (installed into Galaxy by the PlantTribes Scaffolds Download Data Manager tool) whose orthogroup(s) are targeted for the localized assembly.
+ * **Protein clustering method** - gene family scaffold protein clustering method.  Each PlantTribes scaffold data has up to three sets of clusters - GFam[7] (clusters of consensus domain architecture), OrthoFinder[9] (broadly defined clusters) or OrthoMCL[8] (narrowly defined clusters).  You can also install your own data scaffold created using a different clustering method as long as it conforms to the PlantTribes scaffold data format.
+ * **Trim alignments** - trim gene family multiple sequence alignments that include scaffold backbone genes and locally assembled transcripts to remove non-conserved regions (gappy sites)[10].  The trimmed alignments are used in assigning scores to locally assembled transcripts to determine how well they compare to the backbone gene models.  The default setting of 0.1 removes sites that have gaps in 90% of the sequences in the multiple sequence alignment.  This option is restricted to the range 0.0 - 1.0.
+ * **Strand-specific assembly?** - select 'Yes' if transcriptome library sequences were strand-specific.  If 'Yes' is selected, transcripts from the minority strand (antisense) are removed.
+ * **Remove duplicate sequences?** - select 'Yes' to remove duplicated and exact subsequences[11].
+ * **Minimum sequence length** - set the minimum sequence length of predicted coding regions. The default is 200 bp.

     </help>
     <citations>
         <citation type="bibtex">
             @unpublished{None,
+            key = {1},
             author = {Eric Wafula},
-            title = {None},
+            title = {Manuscript in preparation},
             year = {None},
             url = {https://github.com/dePamphilis/PlantTribes}
-        }</citation>
+            }
+        </citation>
+        <citation type="bibtex">
+            @published{PloS one,
+            key = {2},
+            author = {Honaas, L.A., Wafula, E.K., Wickett, N.J., Der, J.P., Zhang, Y., Edger, P.P., Altman, N.S., Pires, J.C. and Leebens-Mack, J.H.},
+            title = {Selecting superior de novo transcriptome assemblies: lessons learned by leveraging the best plant genome},
+            year = {2016},
+            volume = {11},
+            number = {1},
+            pages = {e0146062}
+            }
+        </citation>
         <citation type="bibtex">
-            @published{Proc Int Conf Intell Syst Mol Biol,
+            @published{Nature protocols,
+            key = {3},
+            author = {Haas, B.J., Papanicolaou, A., Yassour, M., Grabherr, M., Blood, P.D., Bowden, J., Couger, M.B., Eccles, D., Li, B., Lieber, M. and MacManes, M.D.},
+            title = {De novo transcript sequence reconstruction from RNA-seq using the Trinity platform for reference generation and analysis},
+            year = {2013},
+            volume = {8},
+            number = {8},
+            pages = {1494-1512}
+            }
+        </citation>
+        <citation type="bibtex">
+            @published{ISMB,
+            key = {4},
             author = {Iseli C, Jongeneel CV, Bucher P.},
             title = {ESTScan: a program for detecting, evaluating, and reconstructing potential coding regions in EST sequences.},
             year = {1999},
-            url = {http://estscan.sourceforge.net/}
-        }</citation>
-        <citation type="doi">10.1038/nprot.2013.084</citation>
-        <citation type="doi">10.1109/tcbb.2013.68</citation>
+            volume = {99},
+            pages = {138-148},
+            url = {http://estscan.sourceforge.net}
+            }
+        </citation>
+        <citation type="bibtex">
+            @published{Genome research,
+            key = {5},
+            author = {Huang X, Madan A},
+            title = {CAP3: A DNA sequence assembly program},
+            year = {1999},
+            volume = {9},
+            number = {9},
+            pages = {868-877},
+            url = {http://seq.cs.iastate.edu/cap3.html}
+            }
+        </citation>
+        <citation type="bibtex">
+            @published{Genome Inform,
+            key = {6},
+            author = {Eddy, S.R.},
+            title = {A new generation of homology search tools based on probabilistic inference},
+            year = {2009},
+            volume = {23},
+            number = {1},
+            pages = {205-211}
+            }
+        </citation>
+        <citation type="bibtex">
+            @published{Nucleic acids research,
+            key = {7},
+            author = {Sasidharan, R., Nepusz, T., Swarbreck, D., Huala, E. and Paccanaro, A.},
+            title = {GFam: a platform for automatic annotation of gene families},
+            year = {2012},
+            pages = {gks631}
+            }
+        </citation>
+        <citation type="bibtex">
+            @published{Genome research,
+            key = {8},
+            author = {Li, L., Stoeckert, C.J. and Roos, D.S.},
+            title = {OrthoMCL: identification of ortholog groups for eukaryotic genomes},
+            year = {2003},
+            volume = {13},
+            number = {9},
+            pages = {2178-2189}
+            }
+        </citation>
+        <citation type="bibtex">
+            @published{Genome biology,
+            key = {9},
+            author = {Emms, D.M. and Kelly, S.},
+            title = {OrthoFinder: solving fundamental biases in whole genome comparisons dramatically improves orthogroup inference accuracy},
+            year = {2015},
+            volume = {16},
+            number = {1},
+            pages = {157}
+            }
+        </citation>
+        <citation type="bibtex">
+            @published{Bioinformatics,
+            key = {10},
+            author = {Capella-Gutiérrez, S., Silla-Martínez, J.M. and Gabaldón, T.},
+            title = {trimAl: a tool for automated alignment trimming in large-scale phylogenetic analyses},
+            year = {2009},
+            volume = {25},
+            number = {15},
+            pages = {1972-1973}
+            }
+        </citation>
+        <citation type="bibtex">
+            @published{IEEE/ACM Transactions on Computational Biology and Bioinformatics,
+            key = {11},
+            author = {Gremme, G., Steinbiss, S. and Kurtz, S.},
+            title = {GenomeTools: a comprehensive software library for efficient processing of structured genome annotations},
+            year = {2013},
+            volume = {10},
+            number = {3},
+            pages = {645-656}
+            }
+        </citation>
         <citation type="bibtex">
             @unpublished{None,
+            key = {12},
             author = {None},
             title = {HMMER 3.1+ hmmscan search sequence(s) against a profile database},
             year = {2013},
             url = {http://hmmer.org/}
-        }</citation>
-        <citation type="bibtex">
-            @published{Genome Research,
-            author = {Huang X, Madan A},
-            title = {CAP3: A DNA sequence assembly program},
-            year = {1999},
-            url = {http://seq.cs.iastate.edu/cap3.html}
-        }</citation>
+            }
+        </citation>
     </citations>
 </tool>