Mercurial > repos > leomrtns > super_distance
changeset 0:6ade7ba67f5d draft
planemo upload
author | leomrtns |
---|---|
date | Thu, 23 May 2019 12:59:34 -0400 |
parents | |
children | e9b5ad7dffde |
files | macros.xml super_distance.xml test-data/HOG1.tre test-data/HOG2.tre test-data/HOG3.tre test-data/HOG4.tre test-data/HOG5.tre test-data/out_avge.tre test-data/out_nodal.tre test-data/species_names.txt |
diffstat | 10 files changed, 156 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Thu May 23 12:59:34 2019 -0400
@@ -0,0 +1,17 @@
+<macros>
+  <token name="@VERSION@">1.1.0</token> <!-- single source for the version string used by the requirement below -->
+  <xml name="requirements"> <!-- package requirement; callers may append further requirements through the yield -->
+    <requirements>
+      <requirement version="@VERSION@" type="package">super_distance</requirement>
+      <yield />
+    </requirements>
+  </xml>
+  <xml name="version_command"> <!-- command used to report the installed program version -->
+    <version_command>super_distance --version</version_command>
+  </xml>
+  <xml name="citations"> <!-- citations wrapper; its content is supplied by the caller through the yield -->
+    <citations>
+      <yield/>
+    </citations>
+  </xml>
+</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/super_distance.xml Thu May 23 12:59:34 2019 -0400
@@ -0,0 +1,112 @@
+<tool id="super_distance" name="Super_distance" version="@VERSION@">
+  <description>Supertree estimation using matrix representation with distances </description>
+  <macros>
+    <import>macros.xml</import>
+  </macros>
+  <expand macro="requirements" />
+  <expand macro="version_command" />
+  <command detect_errors="exit_code"><![CDATA[
+    #set $spaced_input = ''
+    #for $i_file in $inputtre:
+      #set $_input_file = "'{}'".format($i_file)
+      #set $spaced_input = $spaced_input + ' ' + $_input_file
+    #end for
+    ## Optional arguments are emitted only when the corresponding parameter is set.
+    super_distance $fast
+    #if $spnames
+      -s '$spnames'
+    #end if
+    #if str($epslon) != ""
+      -e $epslon
+    #end if
+    -o ./all.tre
+    ${spaced_input} &&
+    #if str($fast) != ""
+      sed -n '1p' ./all.tre > ./nodal.tre &&
+      sed -n '2p' ./all.tre > ./average.tre
+    #else
+      sed -n '3p' ./all.tre > ./nodal.tre &&
+      sed -n '9p' ./all.tre > ./average.tre
+    #end if
+  ]]></command>
+  <inputs>
+    <param name="inputtre" multiple="true" type="data" format="txt,nhx" label="input (gene) trees in newick format (can have more than one tree per file)"/>
+    <!-- Optional dataset; when one is selected it is passed to super_distance as -s. -->
+    <param name="spnames" type="data" format="txt,tabular" label="File with list of species names" optional="true">
+      <help><![CDATA[
+        This file is optional, but highly recommended unless all trees came from same dataset (leaves have same names etc.).
+        This list will define leaves in species tree, and allow for paralogs (i.e. several leaves in gene tree with same label).
+      ]]></help>
+    </param>
+    <!-- Boolean whose truevalue is the command-line flag itself; unchecked it renders as an empty string. -->
+    <param name="fast" type="boolean" truevalue="--fast" falsevalue="" checked="true" label="Fast estimation (for large supertrees)">
+      <help><![CDATA[
+        If you don't choose fast estimation, a lot of trees will be output to same file, which must then be inspected by user.
+        The first two trees from this file are collected as individual tree files. The fast option is in theory 18x faster than the regular run, since
+        it estimates only 2 instead of 36 species trees (assuming UPGMA/NJ is the overwhelming step).
+      ]]></help>
+    </param>
+    <!-- Optional float; when a value is given it is passed to super_distance as -e (the name "epslon" is kept for compatibility). -->
+    <param name="epslon" type="float" label="polytomy tolerance" min="0.0" optional="true">
+      <help><![CDATA[
+        Small value, below which a branch length is considered zero for nodal distances. Default is 1e-7.
+      ]]></help>
+    </param>
+  </inputs>
+  <outputs>
+    <data name="alltrees" format="nhx" label="All supertrees" from_work_dir="all.tre" />
+    <data name="nodaltree" format="nhx" label="Nodal supertree" from_work_dir="nodal.tre" />
+    <data name="averagetree" format="nhx" label="Average supertree" from_work_dir="average.tre" />
+  </outputs>
+  <tests> <!-- the expressions are regular expressions, so the literal brackets of the tree labels are escaped -->
+    <test>
+      <param name="spnames" value="species_names.txt"/>
+      <param name="fast" value="true"/>
+      <param name="inputtre" value="HOG1.tre,HOG2.tre,HOG3.tre,HOG4.tre,HOG5.tre"/>
+      <output name='nodaltree'>
+        <assert_contents><has_text_matching expression="\[F00\]"/></assert_contents>
+      </output>
+      <output name='averagetree'>
+        <assert_contents><has_text_matching expression="\[F01\]"/></assert_contents>
+      </output>
+    </test>
+  </tests>
+  <help><![CDATA[
+Matrix Representation with Distances: calculates pairwise distances between gene leaves, and estimates species trees from summary distance matrices <br/>
+This software implements a class of methods called Matrix Representation with Distances (MRD), with emphasis on whole
+gene families (i.e. gene trees that may contain paralogs) for species tree inference. <br/>
+The two main output trees are the <b>Nodal</b> and the <b>Average</b> supertrees. If the "--fast" option was not selected, then the file <b>All</b>
+supertrees will have several other estimates (otherwise it will have only the two main supertrees).<br/><br/>
+
+The "Nodal" supertree is estimated from nodal distances between gene tree leaves (equivalent to assuming equal branch lengths), but its final branch lengths
+are estimated by least squares using the average branch lengths. The "Average" supertree uses this information directly, and both supertrees are estimated by
+UPGMA.<br/>
+<hr>
+<br/>
+super_distance [-h|--help] [-v|--version] [-F|--fast] [-e|--epsilon=<double>] [-s|--species=<species names>] [-o|--output=<newick>] <file> [<file>]...
+
+Based on several rescaled patristic distances, the program takes the average matrix between genes and estimates
+the species tree using bioNJ, UPGMA and single-linkage after scaling back to the original values (more below). The program
+also uses a distance matrix to project branch lengths on species trees missing lengths;
+
+The branch length rescaling per gene can be the minimum, the average, the total sum, etc. and at the end these values
+averaged over trees are scaled back in the final distance matrix, such that lengths in the supertree (species tree) are interpretable.
+One exception is the nodal distance, which is based on the number of nodes between two leaves (e.g. NJst). In this case it may make
+more sense to use another distance matrix to infer the branch lengths. Option 'F' uses average distances projected on nodal-estimated tree;
+it uses fewer scalings/options, providing a fast estimation. We avoid using individual gene trees since they may have
+missing information (missing species or species pairs). For missing comparisons (when two species are never seen in the same gene tree)
+we use the ultrametric condition (comparison to a common species) to estimate its value.
+
+If a file with species names is given, the program allows for paralogs; otherwise it assumes orthology and that _at_least_ one tree has no missing data:
+* Paralogy: the species names will be mapped to individual gene tree leaves (e.g. `ECOLI_a` and `ECOLI_b` will both map to species `ECOLI`).
+  Each gene tree can therefore have several copies of each species, and can also have missing species.
+* Orthology: if a file with species names is not given, however, it is assumed that each species is represented at most once per gene, and
+  furthermore that the leaf names represent the species, and are thus identical across trees. This mode is the underlying assumption behind
+  most tree comparison software, although here missing data for some trees (not all) is allowed. I.e. as long as one tree has full information
+  (for all species), then others can have some absent species.
+With paralogs or not, it is not recommended to have missing entries in the distance matrix (e.g. when a species pair does not appear in any tree),
+and matrix representation with distances methods work better with more 'complete' gene trees. If there are no paralogs, many supertrees will be equivalent,
+as well as if the input trees lack lengths (only topological information).
+  ]]></help>
+  <expand macro="citations" />
+</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/HOG1.tre Thu May 23 12:59:34 2019 -0400 @@ -0,0 +1,1 @@ +(DROWI_18:0.3789018505,(DROGR_7:0.2443018419,(DROMO_10:0.1316525082,DROVI_17:0.0984391551):0.122729):0.236619,((DROPE_11:0.0000023111,DROPS_12:0.0039732231):0.158232,((DROAN_0:0.0523037343,DROBP_2:0.0541929634):0.154840,(DROKI_8:0.1754046509,((((((DROYA_19:0.0454568263,DROER_4:0.0221802414):0.009433,(DROME_9:0.0154108890,(DROSI_15:0.0104437030,DROSE_14:0.0103446291):0.006553):0.007669):0.085090,DROEU_5:0.0719460572):0.010227,(DROBM_1:0.0475379790,DROTK_16:0.0428214000):0.019343):0.011966,(DRORH_13:0.0980624594,DROEL_3:0.0426114893):0.026189):0.012613,DROFC_6:0.0771805483):0.062752):0.055134):0.088303):0.111929);
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/HOG2.tre Thu May 23 12:59:34 2019 -0400 @@ -0,0 +1,1 @@ +(DROEU_7:2.6707832803,(((DROVI_17:0.1659062151,(DROMO_12:0.1671047483,DROGR_9:0.1479550015):0.082093):0.201114,DROWI_18:0.2630970288):0.050379,((DROAN_0:0.0182475133,(DROBP_3:0.0125985179,DROBP_2:0.0046837882):0.028555):0.077966,(DROKI_10:0.1054285481,(((((DROFC_8:0.0754595688,DROEL_4:0.0367458921):0.017312,(DROEU_6:0.0409251258,DROBM_1:0.0200906079):0.027882):0.027118,DROYA_19:0.0159778433):0.006906,DROER_5:0.0115646703):0.003304,(DROME_11:0.0196755938,(DROSE_15:0.0070216727,DROSI_16:0.0115885956):0.010218):0.022794):0.062511):0.038220):0.038254):0.053899,(DROPE_13:0.0150971917,DROPS_14:0.0156580887):0.110366);
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/HOG3.tre Thu May 23 12:59:34 2019 -0400 @@ -0,0 +1,1 @@ +(DROWI_18:1.3986144626,(DROGR_7:0.5020534626,(DROMO_10:0.2072433617,DROVI_17:0.1213631563):0.176415):0.400812,((DROPE_11:0.1255463088,DROPS_12:0.0062041894):0.374687,((DROAN_0:0.0529721442,DROBP_2:0.0482152226):0.260691,(DROKI_8:0.3015377200,((DROBM_1:0.1737538555,DROTK_16:0.1098459459):0.038728,(((DROFC_6:0.1814201709,(DRORH_13:0.0699182415,DROEL_3:0.0757597195):0.049046):0.028239,((DROME_9:0.0113417559,(DROSE_14:0.0148692844,DROSI_15:0.0157735255):0.009843):0.075691,(DROER_4:0.0731933903,DROYA_19:0.0512225143):0.039711):0.096678):0.030802,DROEU_5:0.1451575357):0.016826):0.081440):0.089075):0.094121):0.149309);
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/HOG4.tre Thu May 23 12:59:34 2019 -0400 @@ -0,0 +1,1 @@ +(DROWI_18:0.5868382482,(((((DRORH_13:0.6652950208,DROEL_3:0.3789058064):0.132222,(DROFC_6:0.5481301714,((DROBM_1:0.3103870563,DROTK_16:0.1457783218):0.087793,(DROEU_5:0.3107900511,((DROER_4:0.0980109981,DROYA_19:0.0651305578):0.028048,(DROME_9:0.0344411278,(DROSE_14:0.0152745521,DROSI_15:0.0192488188):0.002744):0.036213):0.054291):0.045368):0.031325):0.040055):0.271090,DROKI_8:0.5026965046):0.029784,(DROAN_0:0.0293865573,DROBP_2:0.0275804120):0.132855):0.104278,(DROPE_11:0.0166948636,DROPS_12:0.0069147115):0.192926):0.248620,(DROGR_7:0.3367199283,(DROMO_10:0.2926223708,DROVI_17:0.1374444833):0.107221):0.358964);
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/HOG5.tre Thu May 23 12:59:34 2019 -0400 @@ -0,0 +1,1 @@ +(DROWI_18:0.1885012154,((DROPE_12:0.0930361130,DROPS_13:0.0048427559):0.043936,((((DROBP_3:0.4208376176,(DROEL_5:0.0966381344,DRORH_14:0.0376203310):0.046772):0.033880,DROFC_9:0.2852242842):0.027955,(DROEU_8:0.0752885945,DROTK_17:0.0885953577):0.027643):0.065468,DROBM_2:0.0290972856):3.313400):0.054171,(((((DROEL_4:0.1366401683,DROBM_1:0.0779371466):0.016714,DROEU_7:0.0209150267):0.008409,(DROYA_19:0.0159753214,(DROER_6:0.0126351355,((DROME_11:0.0047702525,DROSE_15:0.0015834254):0.000002,DROSI_16:0.0015862134):0.007376):0.004328):0.017018):0.026086,DROKI_10:0.0982019427):0.040511,DROAN_0:0.0582216690):0.055033);
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/out_avge.tre Thu May 23 12:59:34 2019 -0400 @@ -0,0 +1,1 @@ +[F01] (((((((((((DROSI:0.0114239190162,DROSE:0.0114239190162):0.00665311785271,DROME:0.0180770368689):0.0332289457345,(DROYA:0.0425777500307,DROER:0.0425777500307):0.00872823257268):0.0702215561382,DROEU:0.121527538742):0.0370743470615,DROEL:0.158601885803):0.00637747827539,DRORH:0.164979364078):0.119690548008,((DROTK:0.120113878085,DROBM:0.120113878085):0.08498980613,DROFC:0.205103684215):0.0795662278716):0.0226805993692,DROBP:0.307350511456):0.096308094249,((DROPS:0.0263888258223,DROPE:0.0263888258223):0.224545399278,(DROKI:0.238733687263,DROAN:0.238733687263):0.0122005378374):0.152724380604):0.0758046642449,DROWI:0.47946326995):0.133079153946,((DROVI:0.185662999163,DROMO:0.185662999163):0.127661752359,DROGR:0.313324751521):0.299217672374):1;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/out_nodal.tre Thu May 23 12:59:34 2019 -0400 @@ -0,0 +1,1 @@ +[F00] ((((((DROBP:0.532439087059,DROAN:0.276636337932):0.0503083375465,DROKI:0.37993216902):0,(DROPS:0.00822020949675,DROPE:0.0445574421478):0.354616460118):0.00309136864897,(((DROVI:0.143658061175,DROMO:0.22766793715):0.103501886683,DROGR:0.337484617197):0.307155918674,DROWI:0.686976061347):0.128297584692):0.10425066443,(((DROTK:0.43557192824,DROBM:0.2093492096):0.00760527041108,DROEU:0.467252999325):0,((DRORH:0.549712406883,DROEL:0.203706841614):0,DROFC:0.310554224165):0.160342042579):0.286455631907):0.114252911488,(((DROSI:0.0123172334665,DROSE:0.0105306045659):0.00678507767638,DROME:0.0179450770452):0.0279199528065,(DROYA:0.0431176263202,DROER:0.0420378737413):0.0155720673903):0.114252911488):0;