changeset 0:6ade7ba67f5d draft

planemo upload
author leomrtns
date Thu, 23 May 2019 12:59:34 -0400
parents
children e9b5ad7dffde
files macros.xml super_distance.xml test-data/HOG1.tre test-data/HOG2.tre test-data/HOG3.tre test-data/HOG4.tre test-data/HOG5.tre test-data/out_avge.tre test-data/out_nodal.tre test-data/species_names.txt
diffstat 10 files changed, 156 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Thu May 23 12:59:34 2019 -0400
@@ -0,0 +1,17 @@
+<macros> <!-- Shared fragments for the super_distance wrapper: one token plus three expandable macros. -->
+  <token name="@VERSION@">1.1.0</token> <!-- Used below as the package version and in super_distance.xml as the tool version. -->
+  <xml name="requirements">
+    <requirements>
+      <requirement type="package" version="@VERSION@">super_distance</requirement>
+      <yield/>
+    </requirements>
+  </xml> <!-- "requirements": the super_distance package dependency; the yield slot is for content supplied at the expand site. -->
+  <xml name="version_command">
+    <version_command>super_distance --version</version_command>
+  </xml> <!-- "version_command": the command line that reports the installed program version. -->
+  <xml name="citations">
+    <citations>
+      <yield />
+    </citations>
+  </xml> <!-- "citations": an empty wrapper; super_distance.xml expands it without supplying any citation. -->
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/super_distance.xml	Thu May 23 12:59:34 2019 -0400
@@ -0,0 +1,112 @@
+<tool id="super_distance" name="Super_distance" version="@VERSION@">
+  <description>Supertree estimation using matrix representation with distances </description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <expand macro="version_command" />
+    <command detect_errors="exit_code"><![CDATA[
+      #set $spaced_input = ''
+      #for $i_file in $inputtre:
+        #set $spaced_input = $spaced_input + " '{}'".format($i_file)
+      #end for
+      super_distance $fast
+      ## Optional arguments: an unset optional dataset is None (its str() is "None", never ""); an unset float is "".
+      #if $spnames
+      -s '$spnames'
+      #end if
+      #if str($epslon) != ""
+      -e $epslon
+      #end if
+      -o ./all.tre ${spaced_input} &&
+      #if str($fast) != ""
+        sed -n '1p' ./all.tre > ./nodal.tre &&
+        sed -n '2p' ./all.tre > ./average.tre
+      #else
+        sed -n '3p' ./all.tre > ./nodal.tre &&
+        sed -n '9p' ./all.tre > ./average.tre
+      #end if
+    ]]></command>
+  <inputs>
+    <param name="inputtre" multiple="true" type="data" format="txt,nhx" label="input (gene) trees in newick format (can have more than one tree per file)"/>
+    <!-- Optional species list; the command template passes it to super_distance with -s. -->
+    <param name="spnames" type="data" format="txt,tabular" label="File with list of species names" optional="true">
+      <help><![CDATA[
+        This file is optional, but highly recommended unless all trees came from the same dataset (leaves have the same names etc.).
+        This list will define the leaves in the species tree, and allow for paralogs (i.e. several leaves in a gene tree with the same label).
+        ]]></help>
+    </param>
+    <!-- NOTE(review): with fast off, the command template copies lines 3 and 9 of all.tre; confirm the "first two trees" wording below. -->
+    <param name="fast" type="boolean" truevalue="--fast" falsevalue="" checked="true" label="Fast estimation (for large supertrees)">
+      <help><![CDATA[
+        If you don't choose fast estimation, a lot of trees will be output to the same file, which must then be inspected by the user.
+        The first two trees from this file are collected as individual tree files. The fast option is in theory 18x faster than the regular run, since
+        it estimates only 2 instead of 36 species trees (assuming UPGMA/NJ is the overwhelming step).
+        ]]></help>
+    </param>
+    <!-- The parameter name is spelled "epslon"; presumably it corresponds to the epsilon option of super_distance (TODO confirm). -->
+    <param name="epslon" type="float" label="polytomy tolerance" min="0.0" optional="true">
+      <help><![CDATA[
+        Small value, below which a branch length is considered zero for nodal distances. Default is 1e-7.
+        ]]></help>
+    </param>
+  </inputs>
+  <outputs>
+    <!-- Each dataset is collected from the file that the command template writes into the job working directory. -->
+    <data name="alltrees" from_work_dir="all.tre" format="nhx" label="All supertrees"/>
+    <data name="nodaltree" from_work_dir="nodal.tre" format="nhx" label="Nodal supertree"/>
+    <data name="averagetree" from_work_dir="average.tre" format="nhx" label="Average supertree"/>
+  </outputs>
+  <tests>
+    <test> <!-- Fast mode with an explicit species list and the five sample gene trees. -->
+      <param name="spnames" value="species_names.txt"/>
+      <param name="fast" value="true"/>
+      <param name="inputtre" value="HOG1.tre,HOG2.tre,HOG3.tre,HOG4.tre,HOG5.tre"/>
+      <output name="nodaltree"> <!-- Literal match: read as a regular expression, "[F00]" is a character class (any single F or 0). -->
+        <assert_contents><has_text text="[F00]"/></assert_contents>
+      </output>
+      <output name="averagetree"> <!-- Literal match of the label that opens the average tree line. -->
+        <assert_contents><has_text text="[F01]"/></assert_contents>
+      </output>
+    </test>
+  </tests>
+    <help><![CDATA[
+      Matrix Representation with Distances: calculates pairwise distances between gene leaves, and estimates species trees from summary distance matrices.
+
+      This software implements a class of methods called Matrix Representation with Distances (MRD), with emphasis on whole gene families (i.e. gene trees that may contain paralogs) for species tree inference.
+
+      The two main output trees are the **Nodal** and the **Average** supertrees. If the "--fast" option was not selected, then the file **All** supertrees will have several other estimates (otherwise it will have only the two main supertrees).
+
+      The "Nodal" supertree is estimated from nodal distances between gene tree leaves (equivalent to assuming equal branch lengths), but its final branch lengths
+      are estimated by least squares using the average branch lengths. The "Average" supertree uses this information directly, and both supertrees are estimated by UPGMA.
+
+      ----
+
+      super_distance  [-h|--help] [-v|--version] [-F|--fast] [-e|--epsilon=<double>] [-s|--species=<species names>] [-o|--output=<newick>] <file> [<file>]...
+
+      Based on several rescaled patristic distances, the program takes the average matrix between genes and estimates
+      the species tree using bioNJ, UPGMA and single-linkage after scaling back to the original values (more below). The program 
+      also uses a distance matrix to project branch lengths on species trees missing lengths; 
+
+      The branch length rescaling per gene can be the minimum, the average, the total sum, etc. and at the end these values
+      averaged over trees are scaled back in the final distance matrix, such that lengths in the supertree (species tree) are interpretable.
+      One exception is the nodal distance, which is based on the number of nodes between two leaves (e.g. NJst). In this case it may make
+      more sense to use another distance matrix to infer the branch lengths. Option 'F' uses average distances projected on nodal-estimated tree;
+      it uses fewer scalings/options, providing a fast estimation. We avoid using individual gene trees since they may have 
+      missing information (missing species or species pairs). For missing comparisons (when two species are never seen in the same gene tree)
+      we use the ultrametric condition (comparison to a common species) to estimate its value.
+
+      If a file with species names is given, the program allows for paralogs; otherwise it assumes orthology and that *at least* one tree has no missing data:
+
+      * Paralogy: the species names will be mapped to individual gene tree leaves (e.g. ``ECOLI_a`` and ``ECOLI_b`` will both map to species ``ECOLI``).
+        Each gene tree can therefore have several copies of each species, and can also have missing species.
+      * Orthology: if a file with species names is not given, however, it is assumed that each species is represented at most once per gene, and
+        furthermore that the leaf names represent the species, and are thus identical across trees. This mode is the underlying assumption behind
+        most tree comparison software, although here missing data for some trees (not all) is allowed. I.e. as long as one tree has full information (for all species), then others can have some absent species.
+
+      With paralogs or not, it is not recommended to have missing entries in the distance matrix (e.g. when a species pair does not appear in any tree), and matrix representation with distances methods work better with more 'complete' gene trees.
+      If there are no paralogs, many supertrees will be equivalent, as well as if the input trees lack lengths (only topological information).
+
+    ]]></help>
+    <expand macro="citations" />
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/HOG1.tre	Thu May 23 12:59:34 2019 -0400
@@ -0,0 +1,1 @@
+(DROWI_18:0.3789018505,(DROGR_7:0.2443018419,(DROMO_10:0.1316525082,DROVI_17:0.0984391551):0.122729):0.236619,((DROPE_11:0.0000023111,DROPS_12:0.0039732231):0.158232,((DROAN_0:0.0523037343,DROBP_2:0.0541929634):0.154840,(DROKI_8:0.1754046509,((((((DROYA_19:0.0454568263,DROER_4:0.0221802414):0.009433,(DROME_9:0.0154108890,(DROSI_15:0.0104437030,DROSE_14:0.0103446291):0.006553):0.007669):0.085090,DROEU_5:0.0719460572):0.010227,(DROBM_1:0.0475379790,DROTK_16:0.0428214000):0.019343):0.011966,(DRORH_13:0.0980624594,DROEL_3:0.0426114893):0.026189):0.012613,DROFC_6:0.0771805483):0.062752):0.055134):0.088303):0.111929);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/HOG2.tre	Thu May 23 12:59:34 2019 -0400
@@ -0,0 +1,1 @@
+(DROEU_7:2.6707832803,(((DROVI_17:0.1659062151,(DROMO_12:0.1671047483,DROGR_9:0.1479550015):0.082093):0.201114,DROWI_18:0.2630970288):0.050379,((DROAN_0:0.0182475133,(DROBP_3:0.0125985179,DROBP_2:0.0046837882):0.028555):0.077966,(DROKI_10:0.1054285481,(((((DROFC_8:0.0754595688,DROEL_4:0.0367458921):0.017312,(DROEU_6:0.0409251258,DROBM_1:0.0200906079):0.027882):0.027118,DROYA_19:0.0159778433):0.006906,DROER_5:0.0115646703):0.003304,(DROME_11:0.0196755938,(DROSE_15:0.0070216727,DROSI_16:0.0115885956):0.010218):0.022794):0.062511):0.038220):0.038254):0.053899,(DROPE_13:0.0150971917,DROPS_14:0.0156580887):0.110366);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/HOG3.tre	Thu May 23 12:59:34 2019 -0400
@@ -0,0 +1,1 @@
+(DROWI_18:1.3986144626,(DROGR_7:0.5020534626,(DROMO_10:0.2072433617,DROVI_17:0.1213631563):0.176415):0.400812,((DROPE_11:0.1255463088,DROPS_12:0.0062041894):0.374687,((DROAN_0:0.0529721442,DROBP_2:0.0482152226):0.260691,(DROKI_8:0.3015377200,((DROBM_1:0.1737538555,DROTK_16:0.1098459459):0.038728,(((DROFC_6:0.1814201709,(DRORH_13:0.0699182415,DROEL_3:0.0757597195):0.049046):0.028239,((DROME_9:0.0113417559,(DROSE_14:0.0148692844,DROSI_15:0.0157735255):0.009843):0.075691,(DROER_4:0.0731933903,DROYA_19:0.0512225143):0.039711):0.096678):0.030802,DROEU_5:0.1451575357):0.016826):0.081440):0.089075):0.094121):0.149309);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/HOG4.tre	Thu May 23 12:59:34 2019 -0400
@@ -0,0 +1,1 @@
+(DROWI_18:0.5868382482,(((((DRORH_13:0.6652950208,DROEL_3:0.3789058064):0.132222,(DROFC_6:0.5481301714,((DROBM_1:0.3103870563,DROTK_16:0.1457783218):0.087793,(DROEU_5:0.3107900511,((DROER_4:0.0980109981,DROYA_19:0.0651305578):0.028048,(DROME_9:0.0344411278,(DROSE_14:0.0152745521,DROSI_15:0.0192488188):0.002744):0.036213):0.054291):0.045368):0.031325):0.040055):0.271090,DROKI_8:0.5026965046):0.029784,(DROAN_0:0.0293865573,DROBP_2:0.0275804120):0.132855):0.104278,(DROPE_11:0.0166948636,DROPS_12:0.0069147115):0.192926):0.248620,(DROGR_7:0.3367199283,(DROMO_10:0.2926223708,DROVI_17:0.1374444833):0.107221):0.358964);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/HOG5.tre	Thu May 23 12:59:34 2019 -0400
@@ -0,0 +1,1 @@
+(DROWI_18:0.1885012154,((DROPE_12:0.0930361130,DROPS_13:0.0048427559):0.043936,((((DROBP_3:0.4208376176,(DROEL_5:0.0966381344,DRORH_14:0.0376203310):0.046772):0.033880,DROFC_9:0.2852242842):0.027955,(DROEU_8:0.0752885945,DROTK_17:0.0885953577):0.027643):0.065468,DROBM_2:0.0290972856):3.313400):0.054171,(((((DROEL_4:0.1366401683,DROBM_1:0.0779371466):0.016714,DROEU_7:0.0209150267):0.008409,(DROYA_19:0.0159753214,(DROER_6:0.0126351355,((DROME_11:0.0047702525,DROSE_15:0.0015834254):0.000002,DROSI_16:0.0015862134):0.007376):0.004328):0.017018):0.026086,DROKI_10:0.0982019427):0.040511,DROAN_0:0.0582216690):0.055033);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/out_avge.tre	Thu May 23 12:59:34 2019 -0400
@@ -0,0 +1,1 @@
+[F01] (((((((((((DROSI:0.0114239190162,DROSE:0.0114239190162):0.00665311785271,DROME:0.0180770368689):0.0332289457345,(DROYA:0.0425777500307,DROER:0.0425777500307):0.00872823257268):0.0702215561382,DROEU:0.121527538742):0.0370743470615,DROEL:0.158601885803):0.00637747827539,DRORH:0.164979364078):0.119690548008,((DROTK:0.120113878085,DROBM:0.120113878085):0.08498980613,DROFC:0.205103684215):0.0795662278716):0.0226805993692,DROBP:0.307350511456):0.096308094249,((DROPS:0.0263888258223,DROPE:0.0263888258223):0.224545399278,(DROKI:0.238733687263,DROAN:0.238733687263):0.0122005378374):0.152724380604):0.0758046642449,DROWI:0.47946326995):0.133079153946,((DROVI:0.185662999163,DROMO:0.185662999163):0.127661752359,DROGR:0.313324751521):0.299217672374):1;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/out_nodal.tre	Thu May 23 12:59:34 2019 -0400
@@ -0,0 +1,1 @@
+[F00] ((((((DROBP:0.532439087059,DROAN:0.276636337932):0.0503083375465,DROKI:0.37993216902):0,(DROPS:0.00822020949675,DROPE:0.0445574421478):0.354616460118):0.00309136864897,(((DROVI:0.143658061175,DROMO:0.22766793715):0.103501886683,DROGR:0.337484617197):0.307155918674,DROWI:0.686976061347):0.128297584692):0.10425066443,(((DROTK:0.43557192824,DROBM:0.2093492096):0.00760527041108,DROEU:0.467252999325):0,((DRORH:0.549712406883,DROEL:0.203706841614):0,DROFC:0.310554224165):0.160342042579):0.286455631907):0.114252911488,(((DROSI:0.0123172334665,DROSE:0.0105306045659):0.00678507767638,DROME:0.0179450770452):0.0279199528065,(DROYA:0.0431176263202,DROER:0.0420378737413):0.0155720673903):0.114252911488):0;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/species_names.txt	Thu May 23 12:59:34 2019 -0400
@@ -0,0 +1,20 @@
+DROAN
+DROBM
+DROBP
+DROEL
+DROER
+DROEU
+DROFC
+DROGR
+DROKI
+DROME
+DROMO
+DROPE
+DROPS
+DRORH
+DROSE
+DROSI
+DROTK
+DROVI
+DROWI
+DROYA