Mercurial > repos > eschen42 > w4mclassfilter
changeset 5:ae791fe4fbe9 draft
planemo upload for repository https://github.com/HegemanLab/w4mclassfilter_galaxy_wrapper/tree/master commit a2d17eac4a1343a34cf2908d9ab31b3202a21f64
author | eschen42 |
---|---|
date | Thu, 07 Sep 2017 17:32:09 -0400 |
parents | 2495d0019dbe |
children | f1eabb5973b1 |
files | test-data/input_nofilter_dataMatrix.tsv w4mclassfilter.xml w4mclassfilter_wrapper.R |
diffstat | 3 files changed, 309 insertions(+), 48 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/input_nofilter_dataMatrix.tsv Thu Sep 07 17:32:09 2017 -0400 @@ -0,0 +1,16 @@ +dataMatrix HU_017 HU_028 HU_034 HU_051 HU_060 HU_078 HU_091 HU_093 HU_099 HU_110 HU_130 HU_134 HU_138 HU_149 HU_152 HU_175 HU_178 HU_185 HU_204 HU_208 +HMDB03193 76043 412165 44943 27242 436566 173175 242549 57066 559869 3732 339188 471368 262271 127285 451270 212500 79673 NA 891129 43907 +HMDB01101 30689 6877586 52217 3158 10789748 229568 4763576 3878773 976436 831937 608298 1605075 72021 442510 1107705 1464339 31250 2724553 891129 32742 +HMDB01101 6877586 52217 3158 10789748 229568 4763576 3878773 976436 831937 608298 1605075 72021 442510 1107705 1464339 31250 2724553 72900 891129 30689 +HMDB10348 47259 544877 60885 34582 529874 168264 176500 76457 610110 16262 279156 524468 451573 591487 433529 161069 214392 13781 891129 39315 +HMDB59717 357351 1030464 301983 67604 306862 1028110 1530493 270027 1378535 289677 808334 1132813 871209 895435 715190 1563158 784738 146195 891129 239030 +HMDB00822 14627 14627 14627 14627 14627 14627 14627 14627 14627 14627 14627 14627 14627 14627 14627 14627 14627 14627 14627 14627 +HMDB00299 250551 1046138 456162 159386 1013302 808657 614370 250403 768004 242085 504108 1014041 1362408 1057660 1110050 566050 411886 142233 891129 284775 +HMDB00191 560002 771533 575790 392284 888498 785428 645785 591569 960658 910201 639437 1092885 1409045 2292023 1246459 1945577 710519 773384 891129 622898 +HMDB00518 -34236 58249 85944 NA 342102 129886 175800 13154 230242 NA 440223 315368 10657 419508 48673 28361 514579 23108 891129 73831 +HMDB00715 1252089 2547452 905408 371059 4983588 5140022 2658555 814523 2558923 859466 4184204 3865723 3236644 2615560 3820724 3577833 2295288 625924 891129 1341900 +HMDB01032 2569205 26023086 1604999 430453 8103558 26222916 257139 675754 59906109 263055 31151730 18648127 14989438 1554658 20249262 5588731 871010 15920 891129 44276 +HMDB00208 747080 13420742 595872 1172376 7172632 
3143654 4059767 1433702 5593888 5402629 2477288 3346077 4230072 7621236 8960828 10335722 7037373 1574738 891129 2540044 +HMDB04824 374028 1144386 539206 178517 1046190 959381 605191 310260 1253319 477259 477995 825691 1157093 1089284 1411802 1020206 782673 346761 891129 387811 +HMDB00512 NA 319783 280560 85009 1333877 556003 590779 209285 342532 198512 569970 525240 246282 1140422 542345 1171008 827723 222953 891129 85554 +HMDB00251 368600 616555 94936 622468 180988 293988 352855 767894 268331 167246 310918 1248919 577184 10985 335711 403815 80614 63393 891129 616061
--- a/w4mclassfilter.xml Thu May 11 00:01:40 2017 -0400 +++ b/w4mclassfilter.xml Thu Sep 07 17:32:09 2017 -0400 @@ -1,10 +1,10 @@ -<tool id="w4mclassfilter" name="Sample_Subset" version="0.98.1"> +<tool id="w4mclassfilter" name="Sample_Subset" version="0.98.3"> <description>Filter W4M data by sample class</description> <requirements> - <requirement type="package" version="3.3.1">r-base</requirement> + <requirement type="package" version="3.3.2">r-base</requirement> <requirement type="package" version="1.1_4">r-batch</requirement> - <requirement type="package" version="0.98.1">w4mclassfilter</requirement> + <requirement type="package" version="0.98.3">w4mclassfilter</requirement> </requirements> <stdio> @@ -14,29 +14,58 @@ <command detect_errors="aggressive"><![CDATA[ Rscript $__tool_directory__/w4mclassfilter_wrapper.R - dataMatrix_in "$dataMatrix_in" - sampleMetadata_in "$sampleMetadata_in" - variableMetadata_in "$variableMetadata_in" - sampleclassNames "$sampleclassNames" - inclusive "$inclusive" - classnameColumn "$classnameColumn" - samplenameColumn "$samplenameColumn" - dataMatrix_out "$dataMatrix_out" - sampleMetadata_out "$sampleMetadata_out" - variableMetadata_out "$variableMetadata_out" + dataMatrix_in '$dataMatrix_in' + sampleMetadata_in '$sampleMetadata_in' + variableMetadata_in '$variableMetadata_in' + sampleclassNames '$sampleclassNames' + inclusive '$inclusive' + wildcards '$wildcards' + classnameColumn '$classnameColumn' + samplenameColumn '$samplenameColumn' + dataMatrix_out '$dataMatrix_out' + sampleMetadata_out '$sampleMetadata_out' + variableMetadata_out '$variableMetadata_out' ]]></command> <inputs> <param name="dataMatrix_in" label="Data matrix file" type="data" format="tabular" help="variable x sample, decimal: '.', missing: NA, mode: numerical, separator: tab" /> <param name="sampleMetadata_in" label="Sample metadata file" type="data" format="tabular" help="sample x metadata columns, separator: tab" /> <param name="variableMetadata_in" 
label="Variable metadata file" type="data" format="tabular" help="variable x metadata columns, separator: tab" /> - <param name="sampleclassNames" label="Names of sample classes" type="text" value = "" help="comma-separated names of sample classes to filter in or out; defaults to no names" /> + <param name="samplenameColumn" label="Column that names the sample" type="text" value = "sampleMetadata" help="name of the column in the sample metadata file that has the name of the sample - defaults to 'sampleMetadata'" /> + <param name="classnameColumn" label="Column that names the sample-class" type="text" value = "class" help="name of the column in sample metadata that has the values to be tested against the 'classes' input parameter - defaults to 'class'" /> + <param name="sampleclassNames" label="Names of sample classes" type="text" value = "" help="comma-separated names (or comma-less regular expressions to match names) of sample-classes to filter in or out; defaults to no names"> + <sanitizer> + <valid initial="string.letters"> + <add preset="string.digits"/> + <add value="$" /> <!-- dollar, dollar-sign --> + <add value="(" /> <!-- left-paren --> + <add value=")" /> <!-- right-paren --> + <add value="*" /> <!-- splat, asterisk --> + <add value="+" /> <!-- plus --> + <add value="," /> <!-- comma --> + <add value="." /> <!-- dot, period --> + <add value=":" /> <!-- colon --> + <add value=";" /> <!-- semi, semicolon --> + <add value="?" 
/> <!-- what, question mark --> + <add value="[" /> <!-- l-squib, left-squre-bracket --> + <add value="\" /> <!-- whack, backslash --> + <add value="]" /> <!-- r-squib, right-squre-bracket --> + <add value="^" /> <!-- hat, caret --> + <add value="{" /> <!-- l-cube, left-curly-bracket --> + <add value="|" /> <!-- pipe --> + <add value="}" /> <!-- r-cube, right-curly-bracket --> + </valid> + </sanitizer> + + </param> + <param name="wildcards" label="Use wild-cards or regular-expressions" type="select" help="wild-cards (the default) - use '*' and '?' to match class names; regular-expressions - use comma-less regular expressions to match class names"> + <option value="TRUE" selected="true">wild-cards</option> + <option value="FALSE">regular-expressions</option> + </param> <param name="inclusive" label="Include named classes" type="select" help="filter-in - include only the named sample classes; filter-out (the default) - exclude only the named sample classes"> <option value="TRUE">filter-in</option> <option value="FALSE" selected="true">filter-out</option> </param> - <param name="classnameColumn" label="Column that names the sample-class" type="text" value = "class" help="name of the column in sample metadata that has the values to be tested against the 'classes' input parameter - defaults to 'class'" /> - <param name="samplenameColumn" label="Column that names the sample" type="text" value = "sampleMetadata" help="name of the column in sample metadata that has the name of the sample - defaults to 'sampleMetadata'" /> </inputs> <outputs> <data name="dataMatrix_out" label="${tool.name}_${dataMatrix_in.name}" format="tabular" ></data> @@ -99,7 +128,42 @@ <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/> <param name="variableMetadata_in" value="input_variableMetadata.tsv"/> <param name="classnameColumn" value="gender"/> + <param name="sampleclassNames" value="*"/> + <param name="wildcards" value="TRUE"/> + <param name="samplenameColumn" 
value="sampleMetadata"/> + <param name="inclusive" value="filter-in"/> + <output name="sampleMetadata_out"> + <assert_contents> + <not_has_text text="HU_204" /> + <has_text text="HU_028" /> + <has_text text="HU_051" /> + <has_text text="HU_060" /> + <has_text text="HU_110" /> + <has_text text="HU_149" /> + <has_text text="HU_152" /> + <has_text text="HU_175" /> + <has_text text="HU_178" /> + <has_text text="HU_185" /> + <has_text text="HU_208" /> + <has_text text="HU_017" /> + <has_text text="HU_034" /> + <has_text text="HU_078" /> + <has_text text="HU_091" /> + <has_text text="HU_093" /> + <has_text text="HU_099" /> + <has_text text="HU_130" /> + <has_text text="HU_134" /> + <has_text text="HU_138" /> + </assert_contents> + </output> + </test> + <test> + <param name="dataMatrix_in" value="input_dataMatrix.tsv"/> + <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/> + <param name="variableMetadata_in" value="input_variableMetadata.tsv"/> + <param name="classnameColumn" value="gender"/> <param name="sampleclassNames" value="M"/> + <param name="wildcards" value="FALSE"/> <param name="samplenameColumn" value="sampleMetadata"/> <param name="inclusive" value="filter-in"/> <output name="sampleMetadata_out"> @@ -133,6 +197,7 @@ <param name="variableMetadata_in" value="input_variableMetadata.tsv"/> <param name="classnameColumn" value="gender"/> <param name="sampleclassNames" value="M"/> + <param name="wildcards" value="FALSE"/> <param name="samplenameColumn" value="sampleMetadata"/> <param name="inclusive" value="filter-in"/> <output name="variableMetadata_out"> @@ -156,20 +221,115 @@ </assert_contents> </output> </test> + <test> + <param name="dataMatrix_in" value="input_nofilter_dataMatrix.tsv"/> + <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/> + <param name="variableMetadata_in" value="input_variableMetadata.tsv"/> + <param name="classnameColumn" value="gender"/> + <param name="sampleclassNames" value="M"/> + <param name="wildcards" 
value="FALSE"/> + <param name="samplenameColumn" value="sampleMetadata"/> + <param name="inclusive" value="filter-in"/> + <output name="variableMetadata_out"> + <assert_contents> + <has_text text="HMDB03193" /> + <not_has_text text="HMDB00822" /> + <has_text text="HMDB01101" /> + <has_text text="HMDB01101.1" /> + <has_text text="HMDB10348" /> + <has_text text="HMDB59717" /> + <not_has_text text="HMDB13189" /> + <has_text text="HMDB00299" /> + <has_text text="HMDB00191" /> + <has_text text="HMDB00518" /> + <has_text text="HMDB00715" /> + <has_text text="HMDB01032" /> + <has_text text="HMDB00208" /> + <has_text text="HMDB04824" /> + <has_text text="HMDB00512" /> + <has_text text="HMDB00251" /> + </assert_contents> + </output> + </test> + <test> + <param name="dataMatrix_in" value="input_dataMatrix.tsv"/> + <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/> + <param name="variableMetadata_in" value="input_variableMetadata.tsv"/> + <param name="classnameColumn" value="gender"/> + <param name="sampleclassNames" value="[Mm],[fF]"/> + <param name="wildcards" value="FALSE"/> + <param name="samplenameColumn" value="sampleMetadata"/> + <param name="inclusive" value="filter-in"/> + <output name="sampleMetadata_out"> + <assert_contents> + <has_text text="HU_028" /> + <has_text text="HU_051" /> + <has_text text="HU_060" /> + <has_text text="HU_110" /> + <has_text text="HU_149" /> + <has_text text="HU_152" /> + <has_text text="HU_175" /> + <has_text text="HU_178" /> + <has_text text="HU_185" /> + <not_has_text text="HU_204" /> + <has_text text="HU_208" /> + <has_text text="HU_017" /> + <has_text text="HU_034" /> + <has_text text="HU_078" /> + <has_text text="HU_091" /> + <has_text text="HU_093" /> + <has_text text="HU_099" /> + <has_text text="HU_130" /> + <has_text text="HU_134" /> + <has_text text="HU_138" /> + </assert_contents> + </output> + </test> + <test> + <param name="dataMatrix_in" value="input_dataMatrix.tsv"/> + <param name="sampleMetadata_in" 
value="input_sampleMetadata.tsv"/> + <param name="variableMetadata_in" value="input_variableMetadata.tsv"/> + <param name="classnameColumn" value=""/> + <param name="sampleclassNames" value="M"/> + <param name="wildcards" value="FALSE"/> + <param name="samplenameColumn" value="sampleMetadata"/> + <param name="inclusive" value="filter-in"/> + <output name="sampleMetadata_out"> + <assert_contents> + <has_text text="HU_028" /> + <has_text text="HU_051" /> + <has_text text="HU_060" /> + <has_text text="HU_110" /> + <has_text text="HU_149" /> + <has_text text="HU_152" /> + <has_text text="HU_175" /> + <has_text text="HU_178" /> + <has_text text="HU_185" /> + <not_has_text text="HU_204" /> + <has_text text="HU_208" /> + <has_text text="HU_017" /> + <has_text text="HU_034" /> + <has_text text="HU_078" /> + <has_text text="HU_091" /> + <has_text text="HU_093" /> + <has_text text="HU_099" /> + <has_text text="HU_130" /> + <has_text text="HU_134" /> + <has_text text="HU_138" /> + </assert_contents> + </output> + </test> </tests> - <help> - <![CDATA[ + <help><![CDATA[ -.. class:: infomark -**Author** Arthur Eschenlauer (University of Minnesota, esch0041@umn.edu) +**Author** Arthur Eschenlauer (University of Minnesota, esch0041@umn.edu) -------------------------------------------------------------------------- -.. class:: infomark **R package** @@ -177,7 +337,6 @@ ----------------------------------------------------------------------------------------------------------------------------------------- -.. 
class:: infomark **Tool updates** @@ -199,22 +358,22 @@ Workflow Position ----------------- - - Upstream tool category: Preprocessing - - Downstream tool categories: Normalisation, Statistical Analysis, Quality Control +- Upstream tool category: Preprocessing +- Downstream tool categories: Normalisation, Statistical Analysis, Quality Control, Filter and Sort ---------- Motivation ---------- -GC-MS1 and LC-MS1 experiments seek to resolve chemicals as features that have distinct chromatographic behavior and (after ionization) mass-to-charge ratio. +GC-MS1 and LC-MS1 experiments seek to resolve chemicals as features that have distinct chromatographic behavior and (after ionization) mass-to-charge ratio. Data for a sample are collected as MS intensities, each of which is associated with a position on a 2D plane with dimensions of m/z ratio and chromatographic retention time. -Ideally, features would be sufficiently reproducible from sample-run to sample-run to identify features that are commmon among samples and those that differ. +Ideally, features would be sufficiently reproducible from sample-run to sample-run to identify features that are commmon among samples and those that differ. However, the chromatographic retention time for a chemical can vary from one run to another. -In the Workflow4Metabolomics (W4M, [Giacomoni *et al.*, 2014]) "flavor" of Galaxy, the XCMS [Smith *et al.*, 2006] preprocessing tools provide for "retention time correction" to align features among samples, but features may be better aligned if pooled samples and blanks are included. +In the Workflow4Metabolomics (W4M, [Giacomoni *et al.*, 2014, Guitton *et al.* 2017]) "flavor" of Galaxy, the XCMS [Smith *et al.*, 2006] preprocessing tools provide for "retention time correction" to align features among samples, but features may be better aligned if pooled samples and blanks are included. 
Multivariate statistical techniques may be used to discover clusters of similar samples, and sometimes it is desirable to apply clustering iteratively to smaller and smaller subsets of samples until observable separation of clusters is no longer significant. Once feature-alignment has been achieved among samples in GC-MS and LC-MS datasets, however, the presence of pools and blanks may confound identification and separation of clusters. -Multivariate statistical algorithms also may be impacted by missing values or dimensions that have zero variance. +Multivariate statistical algorithms also may be impacted by missing values or dimensions that have zero variance (Thévenot *et al.*, 2015). The w4mclassfilter tool provides a way to choose subsets of samples for further analysis. The tool takes as input the data matrix, sample metadata, and variable metadata Galaxy datasets produced by W4M and produces the same trio of datasets with data only for the selected samples. @@ -224,7 +383,6 @@ Next, missing and negative intensites for features of the remaining samples are imputed to zero. Finally, samples or features with zero variance are eliminated. - ----------- Input files ----------- @@ -256,23 +414,28 @@ | variable x metadata **variableMetadata** (tabular separated values) file of the numeric and/or character variable metadata, with . 
as decimal and NA for missing values | +Column that names the sample (default = '``sampleMetadata``') + | name of the column in sample metadata that has the name of the sample + | + +Column that names the sample-class (default = '``class``') + | name of the column in sample metadata that has the values to be tested against the '``classes``' input parameter + | + Names of sample classes (default = no names) | comma-separated names of sample classes to include or exclude | -Include named classes (default = filter-out) - | *filter-in* - include only the named sample classes - | *filter-out* - exclude only the named sample classes +Wild-cards (default = '``wild-cards``') + | '``wild-cards``' - use wild-cards to match names of sample classes (see 'Wild card patterns to match class-names' below) + | '``regular-expressions``' - use regular expressions to match names of sample classes (see 'Regular expression patterns to match class-names' below) | - -Column that names the sample-class (default = 'class') - | name of the column in sample metadata that has the values to be tested against the 'classes' input parameter +Include named classes (default = '``filter-out``') + | '``filter-in``' - include only the named sample classes + | '``filter-out``' - exclude only the named sample classes | -Column that names the sample (default = 'sampleMetadata') - | name of the column in sample metadata that has the name of the sample - | ------------ @@ -293,6 +456,63 @@ | +--------------------------------------- +Wild card patterns to match class-names +--------------------------------------- + +Beginning with v0.98.2, w4mclassfilter supports use of R "wild card" patterns to select class-names. 
+ +- use '``?``' to match a single character +- use '``*``' to match zero or more characters +- the entire pattern must match the sample-class name + +For example + +- '``??.samp*``' matches '``my.sample``' but not '``my.own.sample``' +- '``*.sample``' matches '``my.sample``' and '``my.own.sample``' +- '``*.sampl``' matches neither '``my.sample``' nor '``my.own.sample``' + +------------------------------------------------ +Regular expression patterns to match class-names +------------------------------------------------ + +Beginning with v0.98.2, w4mclassfilter supports use of R "regular expression" patterns to select class-names. + +R uses POSIX 1003.2 standard regular expressions, which allow precise pattern-matching and are exhaustively defined at: +http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html + +However, only a few basic building blocks of regular expressions need to be mastered for most cases: + +- '``^``' matches the beginning of a class-name +- '``$``' matches the end of a class-name +- '``.``' outside of square brackets matches a single character +- '``*``' matches the immediately preceding character zero or more times +- square brackets specify a set of characters to be matched. + +Within square brackets + +- '``^``' as the first character specifies that the listed characters are those that should **not** be matched. 
+- '``-``' is used to specify ranges of characters + +Caveat: The tool wrapper uses the comma ('``,``') to split a list of sample-class names, so **commas may not be used within regular expressions for this tool** + +First Example: Consider a field of class-names consisting of '``marq3,marq6,marq9,marq12,front3,front6,front9,front12``' + +- The regular expression '``^front[0-9][0-9]*$``' will match the same sample-classes as '``front3,front6,front9,front12``' +- The regular expression '``^[a-z][a-z]3$``' will match the same sample-classes as '``front3,marq3``' +- The regular expression '``^[a-z][a-z]12$``' will match the same sample-classes as '``front12,marq12``' +- The regular expression '``^[a-z][a-z][0-9]$``' will match the same sample-classes as '``front3,front6,front9,marq3,marq6,marq9``' + +Second Example: Consider these regular expression patterns as possible matches to a sample-class name '``AB0123``': + +- '``^[A-Z][A-Z][0-9][0-9]*$``' - MATCHES '``**^AB0123$**``' +- '``^[A-Z][A-Z]*[0-9][0-9]*$``' - MATCHES '``**^AB0123$**``' +- '``^[A-Z][0-9]*``' - MATCHES '``**^A** B0123$``' - first character is a letter, '``*``' can specify zero characters, and end of line did not need to be matched. +- '``^[A-Z][A-Z][0-9]``' - MATCHES '``**^AB0** 123$``' - first two characters are letters and the third is a digit. +- '``^[A-Z][A-Z]*[0-9][0-9]$``' - NO MATCH - the name does not end with the pattern '``[A-Z][0-9][0-9]$``', i.e., it ends with four digits, not two. +- '``^[A-Z][0-9]*$``' - NO MATCH - the pattern specifies that second character and all those that follow, if present, must be digits. + + --------------- Working example --------------- @@ -348,6 +568,28 @@ NEWS ---- +CHANGES IN VERSION 0.98.3 +========================= + +INTERNAL MODIFICATIONS + +* Improved input handling. +* Now uses w4mclassfilter R package v0.98.3, although that version has no functional implications for this tool. +* Improved reference-list. 
+ +CHANGES IN VERSION 0.98.2 +========================= + +NEW FEATURES + +* Added support for R-flavored regular expression pattern-matching when selecting names of sample-classes. +* Empty classes argument or zero-length class_column result in no samples filtered out. + +INTERNAL MODIFICATIONS + +* Support and tests for new features. + + CHANGES IN VERSION 0.98.1 ========================= @@ -363,11 +605,16 @@ none - ]]> - </help> + ]]></help> <citations> + <!-- Giacomoni_2014 W4M 2.5 --> + <citation type="doi">10.1093/bioinformatics/btu813</citation> + <!-- Guitton_2017 W4M 3.0 --> + <citation type="doi">10.1016/j.biocel.2017.07.002</citation> + <!-- Smith_2006 XCMS --> <citation type="doi">10.1021/ac051437y</citation> - <citation type="doi">10.1093/bioinformatics/btu813</citation> + <!-- Th_venot_2015 Urinary metabolome statistics --> + <citation type="doi">10.1021/acs.jproteome.5b00354</citation> </citations> <!-- vim:noet:sw=4:ts=4
--- a/w4mclassfilter_wrapper.R Thu May 11 00:01:40 2017 -0400 +++ b/w4mclassfilter_wrapper.R Thu Sep 07 17:32:09 2017 -0400 @@ -83,15 +83,13 @@ # other parameters sampleclassNames <- as.character(argVc["sampleclassNames"]) -# if (sampleclassNames == "NONE_SPECIFIED") { -# sampleclassNames <- as.character(c()) -# -# } else { -# sampleclassNames <- strsplit(x = sampleclassNames, split = ",", fixed = TRUE)[[1]] -# } +wildcards <- as.logical(argVc["wildcards"]) sampleclassNames <- strsplit(x = sampleclassNames, split = ",", fixed = TRUE)[[1]] +if (wildcards) { + sampleclassNames <- gsub("[.]", "[.]", sampleclassNames) + sampleclassNames <- utils::glob2rx(sampleclassNames, trim.tail = FALSE) +} inclusive <- as.logical(argVc["inclusive"]) -# print(sprintf("inclusive = '%s'", as.character(inclusive))) classnameColumn <- as.character(argVc["classnameColumn"]) samplenameColumn <- as.character(argVc["samplenameColumn"])