Mercurial > repos > melpetera > utilitaires
changeset 19:1b14b1ffdcb4 draft
Uploaded
author | melpetera |
---|---|
date | Fri, 08 Mar 2019 04:50:45 -0500 |
parents | 9f3ef38b6bad |
children | 94327d2cddbd |
files | ACF/Analytic_correlation_filtration.pl ACF/IonFiltration.pm ACF/analytic_correlation_filtration.xml ACF/data/default_list.csv ACF/static/images/Adduct_fragment_list.JPG ACF/static/images/Correlation_matrix.JPG Intchecks/RcheckLibrary.R Intchecks/Script_intensity_check.R Intchecks/wrapper_intensity_check.R Intchecks/xml_intensity_check.xml |
diffstat | 10 files changed, 1186 insertions(+), 832 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ACF/Analytic_correlation_filtration.pl Fri Mar 08 04:50:45 2019 -0500 @@ -0,0 +1,82 @@ +#!usr/bin/perl + +### Perl modules +use warnings; +use strict; +use Getopt::Long qw(GetOptions); #Creation of script options +use Pod::Usage qw(pod2usage); #Creation of script options + +#Personnal packages +use FindBin ; ## Allows you to locate the directory of original perl script +use lib $FindBin::Bin; +use lib "$FindBin::Bin/../lib"; +use IonFiltration; + +my ($file, $mass_file, $opt, $dataMatrix, $combined_DMVM, $repres_opt, $rt_threshold, $mass_threshold, $output_sif, $output_tabular, $correl_threshold, $intensity_threshold, $intensity_pourc); #Options to complete + +######################## +### Options and help ### +######################## + +GetOptions("f=s"=>\$file, "m=s"=>\$mass_file, "o=s"=>\$opt, "d=s"=>\$dataMatrix, "v=s"=>\$combined_DMVM, "r=s"=>\$repres_opt, "rt=f"=>\$rt_threshold, "mass=f"=>\$mass_threshold, "output_sif=s"=>\$output_sif, "output_tabular=s"=>\$output_tabular, "correl=s"=>\$correl_threshold, "IT=f"=>\$intensity_threshold, "IP=f"=>\$intensity_pourc) or pod2usage(2); + +### Check required parameters : +pod2usage({-message=>q{Mandatory argument '-f' is missing}, -exitval=>1, -verbose=>0}) unless $file; +#pod2usage({-message=>q{Mandatory argument '-m' is missing}, -exitval=>1, -verbose=>0}) unless $mass_file; +pod2usage({-message=>q{Mandatory argument '-o' is missing. It correspond to the grouping method for analytical correlation groups formation. +#It should be a number (1 ; 2 or 3) : +# 1 : Don't take into acount mass information (only RT) +# 2 : Check that all mass differences are include in a specific list and taking into acount RT information +# 3 : Check that all mass differences are include in a specific list, ignoring RT information}, -exitval=>1, -verbose=>0}) unless $opt; +pod2usage({-message=>q{Mandatory argument '-r' is missing. It correspond to the group representent choosing method for analytical correlation groups formation. +It should be one of the 3 options below : + "mass" : choose the ion with the highest mass as the representant + "intensity" : choose the ion with the highest intensity as the representant + "mixt" : choose the ion with the highest (mass^2 * intensity) as the representant}, -exitval=>1, -verbose=>0}) unless $repres_opt; +pod2usage({-message=>q{Mandatory argument '-d' is missing}, -exitval=>1, -verbose=>0}) unless $dataMatrix; +pod2usage({-message=>q{Mandatory argument '-v' is missing}, -exitval=>1, -verbose=>0}) unless $combined_DMVM; +#pod2usage({-message=>q{Mandatory argument '-rt' is missing}, -exitval=>1, -verbose=>0}) unless $rt_threshold; +#pod2usage({-message=>q{Mandatory argument '-mass' is missing}, -exitval=>1, -verbose=>0}) unless $mass_threshold; +pod2usage({-message=>q{Mandatory argument '-correl' is missing}, -exitval=>1, -verbose=>0}) unless $correl_threshold; +pod2usage({-message=>q{Mandatory argument '-output_tabular' is missing}, -exitval=>1, -verbose=>0}) unless $output_tabular; +pod2usage({-message=>q{Mandatory argument '-output_sif' is missing}, -exitval=>1, -verbose=>0}) unless $output_sif; + + +#if(($opt != 1) && ($opt != 2) && ($opt != 3)){ +# print "you must indicate \"1\", \"2\" or \"3\" for the --o otpion\n"; +# exit; +#} + + + +if(($repres_opt ne "mass") && ($repres_opt ne "intensity") && ($repres_opt ne "mixt") && ($repres_opt ne "max_intensity_max_mass")){ + print "you must indicate \"mass\", \"intensity\", \"mix\" or \"max_intensity_max_mass\" for the --r otpion\n"; + exit; +} + + + +######################################################################### +#### Création of a hash containing all adduits and fragments possible ### +######################################################################### + +my %hmass; +if($opt != 1){ + %hmass = IonFiltration::MassCollecting($mass_file); + print "Hash table done\n"; +} + + + +######################################################## +### Creation of a sif table + correlation filtration ### +######################################################## + + +($output_sif, $output_tabular) = IonFiltration::sifTableCreation($file, $output_sif, $opt, $rt_threshold, $mass_threshold, $correl_threshold, $dataMatrix, $output_tabular, $combined_DMVM, $repres_opt, $intensity_threshold, $intensity_pourc, \%hmass); +print "sif table done\n"; + + + +print "All steps done\n"; +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ACF/IonFiltration.pm Fri Mar 08 04:50:45 2019 -0500 @@ -0,0 +1,668 @@ +#!usr/bin/perl +package IonFiltration; + +### Perl modules +use strict; +use warnings; + + + + + + +######################################################################## +### Création of a hash containing all adduits and fragments possible ### +######################################################################## + + +sub MassCollecting{ + + my $mass_file = $_[0]; + my %hmass; + + open (F1, $mass_file); + + while(my $line = <F1>){ + chomp $line; + my @tline = split(/[\t;]/, $line); + if(defined($hmass{$tline[2]})){ + print "Il existe déjà une même différence de masse : $tline[2] !\n"; + } + $hmass{$tline[1]}{$tline[2]}=$tline[0]; + } + + close F1; +# my $count=0; +# foreach my $k (keys %hmass){ +# $count++; +# } + + return %hmass; + + +} + + + + + + + +######################################################## +### Creation of a sif table + correlation filtration ### +######################################################## + + +sub sifTableCreation{ + + my $file = $_[0]; + my $output_sif = $_[1]; + my $opt = $_[2]; + my $rt_threshold = $_[3]; + my $mass_threshold = $_[4]; + my $correl_threshold = $_[5]; + my $dataMatrix = $_[6]; + my $output_tabular = $_[7]; + my $combined_DMVM = $_[8]; + my $repres_opt = $_[9]; + my $intensity_threshold = $_[10]; + my $intensity_pourc = $_[11]; + my $refhmass = $_[12]; + + + + + my %hheader_file; + my %hduplicate; + + my %hcorrelgroup; + my $groupct=1; + + + my $linenb3=0; + my %hheader_line; + my %hrtmz; + + open (F5, $combined_DMVM); + while(my $line = <F5>){ + chomp $line; + my @tline = split(/\t/, $line); + + if($linenb3 == 0){ + for(my $i=0; $i<scalar(@tline);$i++){ + my $a = $tline[$i]; + $hheader_line{$a}=$i; + } + } + else{ + if(defined($hheader_line{mzmed})){ + my $b = $tline[$hheader_line{mzmed}]; + $hrtmz{$tline[0]}{mz}=$b; + } + else{ + my $b = $tline[$hheader_line{mz}]; + $hrtmz{$tline[0]}{mz}=$b; + } + if(defined($hheader_line{rtmed})){ + my $d = $tline[$hheader_line{rtmed}]; + $hrtmz{$tline[0]}{rt}=$d; + } + else{ + my $d = $tline[$hheader_line{rt}]; + $hrtmz{$tline[0]}{rt}=$d; + } + } + + $linenb3 ++; + } + close F5; + + + my $linenb=0; + + open (F1, $file) or die "Impossible to open $file\n"; + open(F2, ">$output_sif") or die "Impossible to open $output_sif\n"; + + while(my $line = <F1>){ + chomp $line; + my @tline = split(/\t/, $line); + + + ############################### + ### Création of a sif table ### + ############################### + + if($linenb == 0){ + for(my $i=0; $i<scalar(@tline);$i++){ + my $a = $tline[$i]; + $hheader_file{$i}=$a; + } + } + else{ + for(my $i=1; $i<scalar(@tline);$i++){ + my $a=$tline[0]; + my $b=$hheader_file{$i}; + my $coef=$tline[$i]; + + + if($a eq $b){ + # print "This is a correlation between A ($a) and A ($b) !\n" + } + else{ + + + ######################### + ### Remove duplicates ### + ######################### + + my $y = $a."/".$b; + my $z = $b."/".$a; + + if((!(defined($hduplicate{$y}))) && (!(defined($hduplicate{$z})))){ + + $hduplicate{$y}=1; + + if($coef > $correl_threshold){ + + print F2 "$a\t$coef\t$b\n"; + + my $count=0; + + ###################################################### + ### Analytic correlation filtrering follow options ### + ###################################################### + + + + my $amass=$hrtmz{$a}{mz}; + my $atemp=$hrtmz{$a}{rt}; + my $bmass= $hrtmz{$b}{mz}; + my $btemp=$hrtmz{$b}{rt}; + my $diff = $amass-$bmass; + $diff = abs($diff); + + ### Option 1: Don't take into acount mass information ### + + if($opt == 1){ + my $btplus = $btemp + $rt_threshold; + my $btmoins = $btemp - $rt_threshold; + if(($btmoins <= $atemp) && ($atemp <= $btplus)){ + foreach my $k (keys %hcorrelgroup){ + + if((defined($hcorrelgroup{$k}{$a})) || (defined($hcorrelgroup{$k}{$b}))){ + $hcorrelgroup{$k}{$a}=1; + $hcorrelgroup{$k}{$b}=1; + $count++; + last; + } + } + if($count == 0){ + my $groupnb="group".$groupct; + $hcorrelgroup{$groupnb}{$a}=1; + $hcorrelgroup{$groupnb}{$b}=1; + $groupct ++; + } + + } + } + + + ### Option 2: Check that all mass differences are include in a specific list taking into account RT information ### + + elsif($opt == 2){ + + my $print = 0; + foreach my $s (keys %$refhmass){ + foreach my $r (keys %{%$refhmass{$s}}){ + my $rm = $r - $mass_threshold; + my $rp = $r + $mass_threshold; + if(($diff <= $rp) && ($diff >= $rm)){ + if($print == 0){ + my $btplus = $btemp + $rt_threshold; + my $btmoins = $btemp - $rt_threshold; + + if(($btmoins <= $atemp) && ($atemp <= $btplus)){ +# foreach my $s (keys %{%$refhmass{$r}}){ + # print F2 "$line\t$hmass{$r}{$s} : $s ($r)\n"; + foreach my $k (keys %hcorrelgroup){ + if((defined($hcorrelgroup{$k}{$a})) || (defined($hcorrelgroup{$k}{$b}))){ + $hcorrelgroup{$k}{$a}=1; + $hcorrelgroup{$k}{$b}=1; +# $hcorrelgroup{$k}{frag}.="#".$s; + $count++; + last; + } + } + if($count == 0){ + my $groupnb="group".$groupct; + $hcorrelgroup{$groupnb}{$a}=1; + $hcorrelgroup{$groupnb}{$b}=1; +# $hcorrelgroup{$groupnb}{frag}.="#".$s; + $groupct ++; + + } + $print = 1; + } + + } + } + } + } + } + + + ### Option 3: Check that all mass differences are include in a specific list, ignoring RT information ### + + elsif($opt == 3){ + + my $print = 0; + foreach my $s (keys %$refhmass){ + foreach my $r (keys %{%$refhmass{$s}}){ + my $rm = $r - $mass_threshold; + my $rp = $r + $mass_threshold; + if(($diff <= $rp) && ($diff >= $rm)){ + if($print == 0){ + +# foreach my $s (keys %{%$refhmass{$r}}){ + # print F2 "$line\t$hmass{$r}{$s} : $s ($r)\n"; + foreach my $k (keys %hcorrelgroup){ + if((defined($hcorrelgroup{$k}{$a})) || (defined($hcorrelgroup{$k}{$b}))){ + $hcorrelgroup{$k}{$a}=1; + $hcorrelgroup{$k}{$b}=1; +# $hcorrelgroup{$k}{frag}.="#".$s; + $count++; + last; + } + } + if($count == 0){ + my $groupnb="group".$groupct; + $hcorrelgroup{$groupnb}{$a}=1; + $hcorrelgroup{$groupnb}{$b}=1; +# $hcorrelgroup{$groupnb}{frag}.="#".$s; + $groupct ++; + } + $print = 1; + } + } + } + } + } + } + + + } + } + } + } + $linenb ++; + } + close F1; + close F2; + + + + + ############################################# + ### Join groups that have been subdivided ### + ############################################# + + my @tdelete; + + foreach my $k (keys %hcorrelgroup){ + foreach my $i (keys %{$hcorrelgroup{$k}}){ + foreach my $v (keys %hcorrelgroup){ + my $count = 0; + if ($v ne $k){ + foreach my $w (keys %{$hcorrelgroup{$v}}){ +# if($w ne "frag"){ + if($w eq $i){ + $count = 1; + push(@tdelete, $v); + } +# } + } + } + if($count == 1){ + foreach my $w (keys %{$hcorrelgroup{$v}}){ + $hcorrelgroup{$k}{$w}=$hcorrelgroup{$v}{$w}; + } + delete($hcorrelgroup{$v}); + } + } + } + } + + foreach my $t (@tdelete){ + delete($hcorrelgroup{$t}); + } + + + ### Do it twice to see if it fix the problem of unmerge groups + + foreach my $k (keys %hcorrelgroup){ + foreach my $i (keys %{$hcorrelgroup{$k}}){ + foreach my $v (keys %hcorrelgroup){ + my $count = 0; + if ($v ne $k){ + foreach my $w (keys %{$hcorrelgroup{$v}}){ +# if($w ne "frag"){ + if($w eq $i){ + $count = 1; + push(@tdelete, $v); + } +# } + } + } + if($count == 1){ + foreach my $w (keys %{$hcorrelgroup{$v}}){ + $hcorrelgroup{$k}{$w}=$hcorrelgroup{$v}{$w}; + } + delete($hcorrelgroup{$v}); + } + } + } + } + + foreach my $t (@tdelete){ + delete($hcorrelgroup{$t}); + } + + + + ########################################## + ### Addition of annotation information ### + ########################################## + + foreach my $k (keys %hcorrelgroup){ + foreach my $i (keys %{$hcorrelgroup{$k}}){ + foreach my $j (keys %{$hcorrelgroup{$k}}){ + my $count = 0; + if ($i ne $j){ + + my $a = $hrtmz{$i}{mz}; + my $b = $hrtmz{$j}{mz}; + + my $diff = $a - $b; + my $sign; + if($diff>0){ + $sign="+"; + } + if($diff<0){ + $sign="-"; + } + $diff = abs($diff); + + foreach my $z (keys %$refhmass){ + + foreach my $y (keys %{%$refhmass{$z}}){ + my $ym = $y - $mass_threshold; + my $yp = $y + $mass_threshold; + + + if(($diff <= $yp) && ($diff >= $ym)){ + my $diff_list = $diff - $y; + $diff_list = abs($diff_list); + $diff_list = sprintf ("%0.6f", $diff_list); + + if($hcorrelgroup{$k}{$i} eq 1){ + my $val = "#".$j."|".$sign."(".$z.")(".$diff_list.")|"; + $hcorrelgroup{$k}{$i}=$val; + $count ++; + } + else{ + if($count == 0){ + my $val = "#".$j."|".$sign."(".$z.")(".$diff_list.")|"; + $hcorrelgroup{$k}{$i}.=$val; + $count ++; + } + else{ + my $val = $sign."(".$z.")(".$diff_list.")|"; + $hcorrelgroup{$k}{$i}.=$val; + $count ++; + } + } + } + } + } + } + } + } + } + + + + #################################################### + ### Choose the representative ion for each group ### + #################################################### + + my %hgrouprepres; + + open(F3, $dataMatrix); + + while (my $line = <F3>){ + chomp $line; + + my @tline = split (/\t/, $line); + + + + foreach my $k (keys %hcorrelgroup){ + foreach my $i (keys %{$hcorrelgroup{$k}}){ + + if($tline[0] eq $i){ + $hgrouprepres{$k}{$i}{mass}=$hrtmz{$tline[0]}{mz}; + + my $intensity; + my $nbsubjects=0; + for(my $y=1;$y<scalar(@tline);$y++){ + $intensity += $tline[$y]; + $nbsubjects ++; + } + my $meanintensity = $intensity/$nbsubjects; + $hgrouprepres{$k}{$i}{intensity}=$meanintensity; + $hgrouprepres{$k}{$i}{squaredmassint}=($hgrouprepres{$k}{$i}{mass}**2)/($hgrouprepres{$k}{$i}{intensity}); + } + } + } + } + close F3; + + foreach my $z (keys %hgrouprepres){ + my $max_intensity = 0; + my $max_int_ion = ""; + my $max_mass = 0; + my $max_mass_ion = ""; + my $max_squared = 0; + my $max_squared_ion = ""; + foreach my $w (keys %{$hgrouprepres{$z}}){ + if($hgrouprepres{$z}{$w}{intensity} > $max_intensity){ + $max_intensity = $hgrouprepres{$z}{$w}{intensity}; + $max_int_ion = $w; + + } + if($hgrouprepres{$z}{$w}{mass} > $max_mass){ + $max_mass = $hgrouprepres{$z}{$w}{mass}; + $max_mass_ion = $w; + } + if($hgrouprepres{$z}{$w}{squaredmassint} > $max_squared){ + $max_squared = $hgrouprepres{$z}{$w}{squaredmassint}; + $max_squared_ion = $w; + } + } + + my $max_int_max_mass_ion=""; + + if($repres_opt eq "max_intensity_max_mass"){ + my %hfirst; + my $first=0; + foreach my $w (reverse sort {$hgrouprepres{$z}{$a}{intensity} <=> $hgrouprepres{$z}{$b}{intensity} } keys %{$hgrouprepres{$z}}){ + $first ++; + if ($first <= 3){ + $hfirst{$w} = $hgrouprepres{$z}{$w}{intensity}; + } + } + + my $first_2 = 0; + my $intens_max = 0; + my $mass_max = 0; + + foreach my $y (reverse sort {$hfirst{$a} <=> $hfirst{$b}} keys %hfirst){ + + + + $first_2 ++; + if($first_2 == 1){ + $intens_max = $hfirst{$y}; + if($intensity_threshold > $intens_max){ + $intensity_threshold = 0; + } + $max_int_max_mass_ion = $y; + $mass_max = $hgrouprepres{$z}{$y}{mass}; + } + if($hgrouprepres{$z}{$y}{mass} > $mass_max){ + if($hfirst{$y}>$intensity_threshold){ + my $a = $intens_max * $intensity_pourc; + if($hfirst{$y} > $a){ + $max_int_max_mass_ion = $y; + $mass_max = $hgrouprepres{$z}{$y}{mass}; + } + } + } + + } + } + + + $hgrouprepres{$z}{max_int}=$max_int_ion; + $hgrouprepres{$z}{max_mass}=$max_mass_ion; + $hgrouprepres{$z}{max_squared}=$max_squared_ion; + $hgrouprepres{$z}{max_int_max_mass}=$max_int_max_mass_ion; + + } + + + ###################### + ### Print result ! ### + ###################### + + open(F4, ">$output_tabular"); + open(F5, $combined_DMVM); + + my $line_nb = 0; + my %hheader; + while (my $line = <F5>){ + chomp $line; + + + my @tline = split (/\t/, $line); + + if($line_nb == 0){ + print F4 "$line\tACF_groups"; + if($opt == 1){ + if($repres_opt eq "intensity"){print F4 "\tintensity_repres\tACF_filter\n"} + if($repres_opt eq "mass"){print F4 "\tmass_repres\tACF_filter\n"} + if($repres_opt eq "mixt"){print F4 "\tmass2intens_repres\tACF_filter\n"} + if($repres_opt eq "max_intensity_max_mass"){print F4 "\tmax_intensity_max_mass_repres\tACF_filter\n"} + + } + else{ + if($repres_opt eq "intensity"){print F4 "\tintensity_repres\tisotopes_adducts_fragments_[#id|annotation(delta_annotation)]\tACF_filter\n"} + if($repres_opt eq "mass"){print F4 "\tmass_repres\tisotopes_adducts_fragments_[#id|annotation(delta_annotation)]\tACF_filter\n"} + if($repres_opt eq "mixt"){print F4 "\tmass2intens_repres\tisotopes_adducts_fragments_[#id|annotation(delta_annotation)]\tACF_filter\n"} + if($repres_opt eq "max_intensity_max_mass"){print F4 "\tmax_intensity_max_mass_repres\tisotopes_adducts_fragments_[#id|annotation(delta_annotation)]\tACF_filter\n"} + } + + + ### Creation of a header hash + for(my $i=0; $i<scalar(@tline);$i++){ + my $a = $tline[$i]; + $hheader{$a}=$i; + } + } + + else{ + my $find = 0; + foreach my $v (keys %hcorrelgroup){ + if(defined($hgrouprepres{$v}{$tline[0]})){ + print F4 "$line\t$v"; + + + if($repres_opt eq "intensity"){print F4 "\t$hgrouprepres{$v}{max_int}\t"} + if($repres_opt eq "mass"){print F4 "\t$hgrouprepres{$v}{max_mass}\t"} + if($repres_opt eq "mixt"){print F4 "\t$hgrouprepres{$v}{max_squared}\t"} + if($repres_opt eq "max_intensity_max_mass"){print F4 "\t$hgrouprepres{$v}{max_int_max_mass}\t"} + + + if($opt != 1){ + if(defined($hcorrelgroup{$v}{$tline[0]})){ + print F4 "$hcorrelgroup{$v}{$tline[0]}\t"; + } + else{ + print F4 "\t"; + } + } + + if($repres_opt eq "intensity"){ + if($tline[0] eq $hgrouprepres{$v}{max_int}){ + print F4 "conserved\n"; + } + else{ + print F4 "deleted\n"; + } + $find = 1; + } + if($repres_opt eq "mass"){ + if($tline[0] eq $hgrouprepres{$v}{max_mass}){ + print F4 "conserved\n"; + } + else{ + print F4 "deleted\n"; + } + $find = 1; + } + if($repres_opt eq "mixt"){ + if($tline[0] eq $hgrouprepres{$v}{max_squared}){ + print F4 "conserved\n"; + } + else{ + print F4 "deleted\n"; + } + $find = 1; + } + if($repres_opt eq "max_intensity_max_mass"){ + if($tline[0] eq $hgrouprepres{$v}{max_int_max_mass}){ + print F4 "conserved\n"; + } + else{ + print F4 "deleted\n"; + } + $find = 1; + } + } + } + if($find == 0){ + $groupct ++; + my $group = "group".$groupct; + if($opt != 1){ + print F4 "$line\t$group\t-\t-\t-\n"; + } + else{ + print F4 "$line\t$group\t-\t-\n"; + } + } + + + } + + $line_nb ++; + } + + + return ($output_sif, $output_tabular); +} + + + + + +1; \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ACF/analytic_correlation_filtration.xml Fri Mar 08 04:50:45 2019 -0500 @@ -0,0 +1,211 @@ +<tool id="Analytic_correlation_filtration" name="Analytic correlation filtration" version="2018-11-08"> + <description> + : Detect analytic correlation among data and remove them. + </description> + + <requirements> + <requirement type="package">perl-statistics</requirement> + <requirement type="package" version="1.19">perl-soap-lite</requirement> + </requirements> + + <command><![CDATA[ + + + perl $__tool_directory__/Analytic_correlation_filtration.pl + + + #if str($mass_file.mass_choice)=="false": + #if str($rt_cond.rt_choice)=="false": + perl $__tool_directory__/Analytic_correlation_filtration.pl -f "$file_in" -o 1 -d "$dataMatrix_in" -v "$variableMetadata_in" -rt 9999999999 + #else: + perl $__tool_directory__/Analytic_correlation_filtration.pl -f "$file_in" -o 1 -d "$dataMatrix_in" -v "$variableMetadata_in" -rt "$rt_cond.rt_threshold" + #end if + #else: + #if str($mass_file.liste.mass_list)=="true": + #if str($rt_cond.rt_choice)=="true": + perl $__tool_directory__/Analytic_correlation_filtration.pl -f "$file_in" -m "$mass_file.liste.mass_file_in" -o 2 -d "$dataMatrix_in" -v "$variableMetadata_in" -rt "$rt_cond.rt_threshold" -mass "$mass_file.mass_threshold" + #end if + #if str($rt_cond.rt_choice)=="false": + perl $__tool_directory__/Analytic_correlation_filtration.pl -f "$file_in" -m "$mass_file.liste.mass_file_in" -o 3 -d "$dataMatrix_in" -v "$variableMetadata_in" -mass "$mass_file.mass_threshold" + #end if + #else + #if str($rt_cond.rt_choice)=="true": + perl $__tool_directory__/Analytic_correlation_filtration.pl -f "$file_in" -m $__tool_directory__/data/default_list.csv -o 2 -d "$dataMatrix_in" -v "$variableMetadata_in" -rt "$rt_cond.rt_threshold" -mass "$mass_file.mass_threshold" + #end if + #if str($rt_cond.rt_choice)=="false": + perl $__tool_directory__/Analytic_correlation_filtration.pl -f "$file_in" -m $__tool_directory__/data/default_list.csv -o 3 -d "$dataMatrix_in" -v "$variableMetadata_in" -mass "$mass_file.mass_threshold" + #end if + #end if + #end if + + -r "$repres_opt.repres_opt_selector" + + #if str($repres_opt.repres_opt_selector)=="max_intensity_max_mass": + -IT $repres_opt.int_threshold + -IP $repres_opt.int_percentage + #end if + -correl "$correl_threshold" + -output_sif "$sif_out" + -output_tabular "$variableMetadata_out" + + ]]></command> + + <inputs> + <param type="data" name="file_in" format="txt" help="The .txt correlation table (you can obtain it by using the Between-table Correlation tool or for exemple the cor() function in R) " label="Correlation table file" /> + <param type="data" name="dataMatrix_in" format="tabular" help="" label="dataMatrix file" /> + <param type="data" name="variableMetadata_in" format="tabular" help="" label="variableMetadata file" /> + + <param help="Define the minimum correlation threshold accepted to determine analytic correlation" label="Correlation threshold" type="float" name="correl_threshold" value="0.90"/> + + <conditional name="mass_file"> + <param name="mass_choice" checked="true" falsevalue="false" help="'YES' if you want to take it into account; 'NO' if you don't want to take into account mass information" label="Do you want to take into account mass differences between 2 ions?" truevalue="true" type="boolean"/> + <when value="true"> + <conditional name="liste"> + <param name="mass_list" checked="true" falsevalue="false" help="'YES' if you have your own list to upload; 'NO' if you want to use a default list" label="Do you have your own list of mass differences or do you want to use a default list ?" truevalue="true" type="boolean"/> + <when value="false"> + + </when> + <when value="true"> + <param type="data" name="mass_file_in" format="tabular,csv" help="The file containing all your report and known mass differences (cf help for file example) " label="Mass differences table (format: tabular or csv) " /> + </when> + </conditional> + <param help="2 ions need to have a difference mass included in the list at +/- mass difference range to be considered as analytically correlated | Value recommendation : 0.005" label="Mass difference range" type="float" name="mass_threshold" value="0.005"/> + </when> + <when value="false"> + + </when> + </conditional> + + <conditional name="rt_cond"> + <param checked="true" falsevalue="false" help="'YES' if want to take into account retention time information; 'NO' if you don't want to take into account retention time information" label="Do you want to take into account retention time differences between 2 ions? " name="rt_choice" truevalue="true" type="boolean"/> + <when value="true"> + <param help="Choose a retention time difference threshold between 2 ions considered as analytically correlated | Value recommendation : 0.1" label="Retention time difference threshold" type="float" name="rt_threshold" value="0.1"/> + </when> + <when value="false"> + + </when> + </conditional> + + <conditional name="repres_opt"> + <param name="repres_opt_selector" label="Which representative ion do you want to select for each group" type="select" display="radio" help=""> + <option value="intensity">Highest intensity</option> + <option value="mass">Highest mass</option> + <option value="mixt">Highest (mass2 x intensity) </option> + <option value="max_intensity_max_mass">Highest mass between the 3 highest intensity (following intensity threshold and rules ==> see help) </option> + </param> + <when value="max_intensity_max_mass"> + <param help="" label="Minimum intensity threshold for the representative ion" type="float" name="int_threshold" value="1000"/> + <param help="Example: ion A have the highest intensity of a group but not the highest mass, B is an ion that have the second highest intensity in the group and a highest mass than A, to choose B as a representative ion for the group his intensity need to be at list 50% of the A intensity." label="Percentage of highest intensity of the group accept for the new representative ion. This option allow to avoid isotope selection. " type="float" name="int_percentage" value="0.5"/> + </when> + <when value="intensity"> + </when> + <when value="mass"> + </when> + <when value="mixt"> + </when> + </conditional> + + </inputs> + + <outputs> + <data format="sif" label="${file_in.name}_sif" name="sif_out"/> + <data format="tabular" label="${variableMetadata_in.name}_representative_ion" name="variableMetadata_out"/> + </outputs> + + <help><![CDATA[ + +.. class:: infomark + +**Authors** : **Stephanie Monnerie** (stephanie.monnerie@inra.fr) wrote this tool for analytic correlation detection. + +--------------------------------------------------- + +.. class:: infomark + +**References** : + +--------------------------------------------------- + +----------- +Input files +----------- + ++-----------------------------------------+---------------+ +| File | Format | ++=========================================+===============+ +| 1) Correlation matrix | txt | ++-----------------------------------------+---------------+ +| 2) Data matrix | tabular | ++-----------------------------------------+---------------+ +| 3) Variable metadata | tabular | ++-----------------------------------------+---------------+ +| **Optional file** | **Format** | ++-----------------------------------------+---------------+ +| 4) Optional : Mass differences list | csv/tabular | ++-----------------------------------------+---------------+ + +--------------------------------------------------- + +------------- +Files content +------------- + +Correlation matrix + * File organisation : on line by correlation pairs with the first ion ID, the correlation value and the second ion ID, tabular separated ==> Fist_Ion_ID \\t Correlation_Value \\t Second_Ion_ID + * Example: + +.. image:: Correlation_matrix.JPG + :width: 800 + +Data matrix file + * "variable x sample" **dataMatrix** : tabular separated file of the numeric data matrix, with . as decimal, and NA for missing values; the table must not contain metadata apart from row and column names; the row and column names must be identical to the rownames of the variable metadata (see below) + +Variable metadata file + * "variable x metadata" **variableMetadata** tabular separated file of the numeric and/or character variable metadata, with . as decimal and NA for missing values + +.. class:: warningmark + +For more information about input files, refer to the corresponding "W4M HowTo" page: +W4M_table_format_for_Galaxy_ +.. _W4M_table_format_for_Galaxy: http://workflow4metabolomics.org/sites/workflow4metabolomics.org/files/files/w4m_TableFormatForGalaxy_150908.pdf + + +Mass differences list + * A file containing list of known adducts, fragments or isotopes with the mass differences linked to them + * Example: + +.. image:: Adduct_fragment_list.JPG + :width: 800 + +--------------------------------------------------- + +---------- +Parameters +---------- + +Take into account mass diffrences between 2 ions : + * You can enter a list of mass differences that are known. The file must be organized with a first column for the mass difference type (isotope, fragment, etc...), a second column with the mass difference chemical formula (H+, -2H+K, etc...) and a third column for the mass difference value + * If you are choosing to use a mass differences table, you have to choose a mass difference range that will be a threshold to accept or not a difference value as true (recognize a mass difference value in the file +/- this threshold). + +Take into acount retention time : + * You can use retention time as a criteria to group ions. You have to choose a value that will be use as intervalle : 2 ions are group when their retention time is equal +/- the threshold. + +Choose the representative ion for each group, there are 3 possibilities to determine the representative ion : + * The ion with the highest intensity (recommandated for LC/MS) + * The ion with the highest mass + * The ion with the highest "mass2 * intensity" value + * The ion with the highest mass between the 3 highest intensity of the group, except if the highest mass ion have an intensity < determined percentage of the highest intensity ion one (for exemple 50%) (recommandated for GC/MS) + + +--------------------------------------------------- + +-------------- +Example of use +-------------- + +Add exemples according to the ppt presentation ! + + + + ]]></help> +</tool> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ACF/data/default_list.csv Fri Mar 08 04:50:45 2019 -0500 @@ -0,0 +1,225 @@ +adduit -2H+Na+K 59.9378259 +adduit H 1.007825032 +adduit -H+K 37.95588165 +adduit -H+Na 21.98194425 +adduit -3H+3Na 65.94583274 +adduit -4H+4K 151.8235266 +adduit -4H+4Na 87.92777699 +adduit -3H+3K 113.8676449 +adduit -2H+2K 75.9117633 +adduit -2H+2Na 43.9638885 +adduit 2H 2.015650064 +adduit Cl 34.96885268 +adduit -2H+Ca 37.94694092 +isotope 13C db 0.501677419 +isotope 13C 1.003354838 +isotope 15N 0.997034893 +isotope 18O 2.00424638 +isotope 34S 1.9957959 +isotope 41K 1.99811908 +isotope 37Cl 1.99704991 +isotope 13C2 2.006709676 +isotope 13C3 3.010064513 +isotope 13C+37Cl 3.000404748 +isotope 13C+18O 3.007601218 +isotope 13C+34S 2.999150738 +isotope 44Ca 3.99289082 +adduit CH3OH 32.02621475 +adduit CH3CN 41.0265491 +adduit H2O 18.01056468 +adduit 2(H2O 36.02112937 +adduit NaCl 57.95862196 +adduit HCOOH 46.0054793 +adduit +(HCOOH)+(HCOOK) 129.9668403 +adduit +(HCOOH)+(HCOONa) 113.9929029 +adduit +(HCOOH)+2(HCOONa) 181.9803264 +adduit HCOOK 83.96136095 +adduit +(HCOOK)+(HCOONa) 151.9487845 +adduit HCOONa 67.98742355 +adduit 2(HCOOH) 92.01095861 +adduit +2(HCOOH)+(HCOOK) 175.9723196 +adduit +2(HCOOH)+(HCOONa) 159.9983822 +adduit 2(HCOOK) 167.9227219 +adduit 2(HCOONa) 135.9748471 +fragment C11H18O9 294.0950822 +fragment C12H16O12 352.064176 +fragment C12H20O9 308.1107322 +fragment C2H2O 42.01056468 +fragment C2H3. 27.0229265 +fragment C2H3N 41.0265491 +fragment C2H3NO3 89.01129296 +fragment C2H3O. 43.01784112 +fragment C2H4 28.03130013 +fragment C2H4N. 42.03382553 +fragment C2H4O 44.02621475 +fragment C2H5. 29.03857656 +fragment C2H5N 43.04219916 +fragment C2H5NO2 75.0320284 +fragment C2H5O. 45.03349118 +fragment C2H5O6P 155.9823745 +fragment C2H6 30.04695019 +fragment C2H7N 45.05784922 +fragment C2HNO2 71.00072827 +fragment C3H4O3 88.01604399 +fragment C3H5. 41.03857656 +fragment C3H5NO2 87.0320284 +fragment -(C3H5O2NS)-(NH3) 136.0306485 +fragment C3H5O2NS 119.0040994 +fragment C3H6 42.04695019 +fragment C3H6O3 90.03169405 +fragment C3H7. 43.05422662 +fragment C3H7O2N 89.04767846 +fragment C3H7O2NS 121.0197495 +fragment C3H7O6P 169.9980246 +fragment C4H6 54.04695019 +fragment C4H6O2 86.03677943 +fragment C4H6O4 118.0266087 +fragment C4H7. 55.05422662 +fragment C4H8O3 104.0473441 +fragment C4H9 57.07042529 +fragment C5H7O3N 129.0425931 +fragment C5H8O3NS 162.0224891 +fragment C5H8O4 132.0422587 +fragment C6H10O4 146.0579088 +fragment -(C6H10O5)-(H2O) 180.0633881 +fragment C6H10O5 162.0528234 +fragment C6H10O7 194.0426527 +fragment C6H8O6 176.032088 +fragment CH2O 30.01056468 +fragment -(CH2S)-(HCOOH) 91.99320037 +fragment -(CH2S)-(NH3) 63.01427016 +fragment CH2S 45.98772106 +fragment CH3. 15.0229265 +fragment CH3COO. 59.01275574 +fragment CH3COOH 60.02112937 +fragment CH3N 29.0265491 +fragment CH3O. 31.01784112 +fragment CH3OH 32.02621475 +fragment CH4 16.03130013 +fragment CH4N. 30.03382553 +fragment -(CH4S)-(HCOOH) 94.00885043 +fragment -(CH4S)-(NH3) 65.02992022 +fragment CH4S 48.00337113 +fragment CH5N 31.04219916 +fragment Cl. 34.96830408 +fragment CO 27.99491462 +fragment -(CO2)-(CO) 71.98474386 +fragment CO2 43.98982924 +fragment -(H2)-(NH3) 19.04219916 +fragment H2 2.015650064 +fragment -(H2O)-(CO2) 62.00039392 +fragment -(H2O)-(HCOOH) 64.01604399 +fragment -(H2O)-(NH3) 35.03711378 +fragment H2O 18.01056468 +fragment -(H2O)-2(CO2) 105.9902232 +fragment -(H2S)-(H2O) 51.99828575 +fragment H2S 33.98772106 +fragment H2SO4 97.96737954 +fragment H3PO4 97.97689521 +fragment HCl 35.97667771 +fragment HCN 27.01089903 +fragment -(HCOOH)-(HCN) 73.01637834 +fragment HCOOH 46.0054793 +fragment HS. 32.97934743 +fragment -(NC3H9)-(CH3COOH) 119.0946287 +fragment -(NC3H9)-(H2O) 77.08406397 +fragment -(NC3H9)-(HCOOH) 105.0789786 +fragment NC3H9 59.07349929 +fragment NaCl 57.95862196 +fragment NH2CO. 44.01309008 +fragment -(NH3)-(CO2)-(H2O) 79.02694302 +fragment -(NH3)-(CO2) 61.01637834 +fragment -(NH3)-(CONH) 60.03236275 +fragment -(NH3)-(HCOOH) 63.0320284 +fragment NH3 17.0265491 +fragment NH3CO 45.02146372 +fragment NHCO 43.00581365 +fragment OH. 17.00219105 +fragment PO3 78.95850549 +fragment SO2 63.96190024 +fragment SO3 79.95681486 +fragment -2(H2O)-(CO2) 80.01095861 +fragment -2(H2O)-(HCOOH)-(NH3) 99.05315777 +fragment -2(H2O)-(HCOOH) 82.02660867 +fragment 2(H2O) 36.02112937 +fragment 2(HCOOH) 92.01095861 +fragment -2(NH3)-(CO)-(CO2) 106.0378421 +fragment -2(NH3)-(CO) 62.04801281 +fragment 2(NH3) 34.05309819 +fragment 3(H2O) 54.03169405 +fragment 3(NH3) 51.07964729 +fragment 4(H2O) 72.04225874 +fragment C10H11O3N5 249.0861892 +fragment C10H13O4N5 267.0967539 +fragment C10H14O7N5P 347.0630844 +fragment C10H15O5N5 285.1073186 +fragment C2H3NO2 73.01637834 +fragment C2H4O2 60.02112937 +fragment C2H5NO3 91.02694302 +fragment C2H6O2 62.03677943 +fragment C2H6O3 78.03169405 +fragment -(C2H6O3)-(H2O) 96.04225874 +fragment C2H6O4 94.02660867 +fragment C2H7NO2 77.04767846 +fragment C3H10O5 126.0528234 +fragment -(C3H6O3)-(CHNO) 133.0375077 +fragment C3H6O4 106.0266087 +fragment C3H8O3 92.04734412 +fragment C3H8O4 108.0422587 +fragment C4H10O5 138.0528234 +fragment C4H5NO3 115.026943 +fragment C4H8O4 120.0422587 +fragment C5H10O4 134.0579088 +fragment C5H13O4N 151.0844579 +fragment C6H11O4N 161.0688078 +fragment C6H11O5N 177.0637225 +fragment C6H13O5N 179.0793725 +fragment C5H10O5 150.0528234 +fragment C5H10O6 166.047738 +fragment C5H12O2 104.0837296 +fragment -(C5H12O2)-(H2O) 122.0942943 +fragment C5H5N5 135.0544952 +fragment C5H5ON5 151.0494098 +fragment C5H6O2 98.03677943 +fragment C5H7O2N5 169.0599745 +fragment -(C5H7O3N)-(CO2) 173.0324223 +fragment -(C5H7O3N)-(H2O) 147.0531578 +fragment C5H8N3 110.0718223 +fragment C5H8O3 116.0473441 +fragment C5H8O5N5P 249.026305 +fragment C5H9O3 117.0551691 +fragment C5H9O6P 196.0136746 +fragment C5H9O7P 212.0085893 +fragment C6H10O3 130.0629942 +fragment -(C6H10O3)-(H2O) 148.0735589 +fragment C6H11O4N3PS 252.0207885 +fragment C6H11O4NPS 224.0146405 +fragment C6H12O5 164.0684735 +fragment C6H14O6 182.0790382 +fragment C6H14O7 198.0739528 +fragment C6H16O7 200.0896029 +fragment C6H16O8 216.0845175 +fragment C6H8N3 122.0718223 +fragment C6H8NS 126.0377453 +fragment C7H5ON5 175.0494098 +fragment C7H6ON6 190.0603088 +fragment C7H7O2N5 193.0599745 +fragment C7H11O6N 205.0586371 +fragment C8H14O7 222.0739528 +fragment C8H5O3N5 219.039239 +fragment C8H7O4N5 237.0498037 +fragment C9H10O4N2 210.0640568 +fragment C9H11O3N3 209.0800412 +fragment C9H11O4N3 225.0749558 +fragment C9H12O5N2 228.0746215 +fragment C9H12O6N3P 289.0463717 +fragment C9H13O4N3 227.0906059 +fragment C9H14O7N3P 307.0569364 +fragment C9H16O8 252.0845175 +fragment CH2N2 42.02179806 +fragment -(CH2O)-(H2O) 48.02112937 +fragment CH5NO 47.03711378 +fragment -(H3PO4)-(CHNO) 140.9827089 +fragment -(H3PO4)-(H2O) 115.9874599 +fragment -(H3PO4)-(NH3) 115.0034443 +fragment HPO3 79.96633052
--- a/Intchecks/RcheckLibrary.R Thu Mar 07 08:34:33 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,124 +0,0 @@ -###################################################### -# R check library -# Coded by: M.Petera, -# - - -# R functions to use in R scripts -# (management of various generic subroutines) -# - - -# V0: script structure + first functions -# V1: More detailed error messages in match functions -###################################################### - - -# Generic function to return an error if problems have been encountered - - - - - -check.err <- function(err.stock){ - - # err.stock = vector of results returned by check functions - - if(length(err.stock)!=0){ stop("\n- - - - - - - - -\n",err.stock,"\n- - - - - - - - -\n") } - -} - - - - -# Table match check functions - - - - - - - - - - - - - - - - - - - - - - - - - - -# To check if dataMatrix and (variable or sample)Metadata match regarding identifiers -match2 <- function(dataMatrix, Metadata, Mtype){ - - # dataMatrix = data.frame containing dataMatrix - # Metadata = data.frame containing sampleMetadata or variableMetadata - # Mtype = "sample" or "variable" depending on Metadata content - - err.stock <- NULL # error vector - - id2 <- Metadata[,1] - if(Mtype=="sample"){ id1 <- colnames(dataMatrix)[-1] } - if(Mtype=="variable"){ id1 <- dataMatrix[,1] } - - if( length(which(id1%in%id2))!=length(id1) || length(which(id2%in%id1))!=length(id2) ){ - err.stock <- c("\nData matrix and ",Mtype," metadata do not match regarding ",Mtype," identifiers.") - if(length(which(id1%in%id2))!=length(id1)){ - if(length(which(!(id1%in%id2)))<4){ err.stock <- c(err.stock,"\n The ") - }else{ err.stock <- c(err.stock,"\n For example, the ") } - err.stock <- c(err.stock,"following identifiers found in the data matrix\n", - " do not appear in the ",Mtype," metadata file:\n") - identif <- id1[which(!(id1%in%id2))][1:min(3,length(which(!(id1%in%id2))))] - err.stock <- c(err.stock," ",paste(identif,collapse="\n "),"\n") - } - if(length(which(id2%in%id1))!=length(id2)){ - if(length(which(!(id2%in%id1)))<4){ err.stock <- c(err.stock,"\n The ") - }else{ err.stock <- c(err.stock,"\n For example, the ") } - err.stock <- c(err.stock,"following identifiers found in the ",Mtype," metadata file\n", - " do not appear in the data matrix:\n") - identif <- id2[which(!(id2%in%id1))][1:min(3,length(which(!(id2%in%id1))))] - err.stock <- c(err.stock," ",paste(identif,collapse="\n "),"\n") - } - err.stock <- c(err.stock,"\nPlease check your data.\n") - } - - return(err.stock) - -} - -# To check if the 3 standard tables match regarding identifiers -match3 <- function(dataMatrix, sampleMetadata, variableMetadata){ - - # dataMatrix = data.frame containing dataMatrix - # sampleMetadata = data.frame containing sampleMetadata - # variableMetadata = data.frame containing variableMetadata - - err.stock <- NULL # error vector - - id1 <- colnames(dataMatrix)[-1] - id2 <- sampleMetadata[,1] - id3 <- dataMatrix[,1] - id4 <- variableMetadata[,1] - - if( length(which(id1%in%id2))!=length(id1) || length(which(id2%in%id1))!=length(id2) ){ - err.stock <- c(err.stock,"\nData matrix and sample metadata do not match regarding sample identifiers.") - if(length(which(id1%in%id2))!=length(id1)){ - if(length(which(!(id1%in%id2)))<4){ err.stock <- c(err.stock,"\n The ") - }else{ err.stock <- c(err.stock,"\n For example, the ") } - err.stock <- c(err.stock,"following identifiers found in the data matrix\n", - " do not appear in the sample metadata file:\n") - identif <- id1[which(!(id1%in%id2))][1:min(3,length(which(!(id1%in%id2))))] - err.stock <- c(err.stock," ",paste(identif,collapse="\n "),"\n") - } - if(length(which(id2%in%id1))!=length(id2)){ - if(length(which(!(id2%in%id1)))<4){ err.stock <- c(err.stock,"\n The ") - }else{ err.stock <- c(err.stock,"\n For example, the ") } - err.stock <- c(err.stock,"following identifiers found in the sample metadata file\n", - " do not appear in the data matrix:\n") - identif <- id2[which(!(id2%in%id1))][1:min(3,length(which(!(id2%in%id1))))] - err.stock <- c(err.stock," ",paste(identif,collapse="\n "),"\n") - } - } - - if( length(which(id3%in%id4))!=length(id3) || length(which(id4%in%id3))!=length(id4) ){ - err.stock <- c(err.stock,"\nData matrix and variable metadata do not match regarding variable identifiers.") - if(length(which(id3%in%id4))!=length(id3)){ - if(length(which(!(id3%in%id4)))<4){ err.stock <- c(err.stock,"\n The ") - }else{ err.stock <- c(err.stock,"\n For example, the ") } - err.stock <- c(err.stock,"following identifiers found in the data matrix\n", - " do not appear in the variable metadata file:\n") - identif <- id3[which(!(id3%in%id4))][1:min(3,length(which(!(id3%in%id4))))] - err.stock <- c(err.stock," ",paste(identif,collapse="\n "),"\n") - } - if(length(which(id4%in%id3))!=length(id4)){ - if(length(which(!(id4%in%id3)))<4){ err.stock <- c(err.stock,"\n The ") - }else{ err.stock <- c(err.stock,"\n For example, the ") } - err.stock <- c(err.stock,"following identifiers found in the variable metadata file\n", - " do not appear in the data matrix:\n") - identif <- id4[which(!(id4%in%id3))][1:min(3,length(which(!(id4%in%id3))))] - err.stock <- c(err.stock," ",paste(identif,collapse="\n "),"\n") - } - } - - if(length(err.stock)!=0){ err.stock <- c(err.stock,"\nPlease check your data.\n") } - - return(err.stock) - -} \ No newline at end of file
--- a/Intchecks/Script_intensity_check.R Thu Mar 07 08:34:33 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,437 +0,0 @@ -######################################################################### -# SCRIPT INTENSITY CHECK # -# # -# Input: Data Matrix, VariableMetadata, SampleMetadata # -# Output: VariableMetadata, Graphics # -# # -# Dependencies: RcheckLibrary.R # -# # -######################################################################### - - -# Parameters (for dev) -if(FALSE){ - - rm(list = ls()) - setwd("Y:\\Developpement\\Intensity check\\Pour tests\\Tests_global") - - DM.name <- "DM_NA.tabular" - SM.name <- "SM_NA.tabular" - VM.name <- "vM_NA.tabular" - method <- "one_class" - chosen.stat <- "mean,sd,quartile,decile,NA" - class.col <- "2" - test.fold <- "Yes" - class1 <- "Pools" - fold.frac <- "Top" - logarithm <- "log10" - VM.output <- "new_VM.txt" - graphs.output <- "Barplots_and_Boxplots.pdf" -} - - - - -intens_check <- function(DM.name, SM.name, VM.name, method, chosen.stat, class.col, test.fold, class1, fold.frac, - logarithm, VM.output, graphs.output){ - - # This function allows to check the intensities with various statistics, number of missing values and mean fold change - # - # Three methods proposed: - # - global: tests for each variable without distinction between samples - # - one class: one class versus all the remaining samples - # - each class: if the class columns contains at least three classes and you want to test each of them - # - # Parameters: - # DM.name, SM.name, VM.name: dataMatrix, sampleMetadata, variableMetadata files access - # method: "global", "one_class", "each_class" - # chosen.stat: character listing the chosen analysis (comma-separated) - # class.col: number of the sampleMetadata's column with classes (if method = one_class or each_class) - # test.fold: "yes" or "no" (if method = one_class or each_class) - # class1: name of the class (if method = one_class) - # fold.frac: "Top" -> class1/other or "Bottom" -> other/class1 (if method = one_class) - # logarithm: "log2", "log10" or "none" (if method = one_class or each_class) - # VM.output: output file's access (VM with new columns) - # graphs.output: pdf file's access with barplots for the proportion of NA and boxplots with the folds values - - - - - # Input --------------------------------------------------------------------------------------------------- - - DM <- read.table(DM.name, header=TRUE, sep="\t", check.names=FALSE) - SM <- read.table(SM.name, header=TRUE, sep="\t", check.names=FALSE) - VM <- read.table(VM.name, header=TRUE, sep="\t", check.names=FALSE) - - - - # Table match check with Rchecklibrary - table.check <- match3(DM, SM, VM) - check.err(table.check) - - - rownames(DM) <- DM[,1] - var_names <- DM[,1] - DM <- DM[,-1] - DM <- data.frame(t(DM)) - - stat.list <- strsplit(chosen.stat,",")[[1]] - - - # check class.col, class1 and the number of classes --------------------------------------------------------- - - #set 1 class for all samples in case of method = no_class - if(method=="no_class"){ - c_class <- rep("global", length=nrow(DM)) - classnames <- "global" - nb_class=1 - test.fold <- "No" - } - - - if(method != "no_class"){ - - class.col <- colnames(SM)[as.numeric(class.col)] - - if(!(class.col %in% colnames(SM))){ - stop("\n- - - - - - - - -\n", "The column ",class.col, " is not a part of the specify sample Metadata","\n- - - - - - - - -\n") - } - - c_class <- SM[,class.col] - c_class <- as.factor(c_class) - nb_class <- nlevels(c_class) - classnames <- levels(c_class) - - if((nb_class < 2)&&(test.fold=="Yes")){ - err.1class <- c("\n The column",class.col, "contains only one class, fold calculation could not be executed \n") - cat(err.1class) - } - - if((nb_class > (nrow(SM))/3)&&(method == "each_class")){ - class.err <- c("\n There are too many classes, think about reducing the number of classes and excluding those - with few samples \n") - cat(class.err) - } - - - if(method == "one_class"){ - if(!(class1 %in% classnames)){ - list.class1 <- c("\n Classes:",classnames,"\n") - cat(list.class1) - err.class1 <- c("The class ",class1, " does not appear in the column ", class.col) - stop("\n- - - - - - - - -\n", err.class1,"\n- - - - - - - - -\n") - } - - #If method is "one_class", change others classes in "other" - for(i in 1:length(c_class)){ - if(c_class[i]!=class1){ - c_class <- as.character(c_class) - c_class[i] <- "Other" - c_class <- as.factor(c_class) - nb_class <- nlevels(c_class) - classnames <- c(class1,"Other") - } - } - } - - } - - - # Statistics ------------------------------------------------------------------------------------------------ - - - ### Initialization - - DM <- cbind(c_class,DM) - - stat.res <- t(DM[0,-1,drop=FALSE]) - names <- NULL - - mean.res <- NULL - mean.names <- NULL - - sd.res <- NULL - sd.names <- NULL - - med.res <- NULL - med.names <- NULL - - quart.res <- NULL - quart.names <- NULL - - dec.res <- NULL - dec.names <- NULL - - NA.res <- NULL - NA.names <- NULL - pct_NA.res <- NULL - pct_NA.names <- NULL - - fold.res <- NULL - fold.names <- NULL - - if(("NA" %in% stat.list)||(test.fold=="Yes")){ - graphs <- 1 - }else{ - graphs=0 - } - - data_bp <- data.frame() #table for NA barplot - - - - ### Computation - - - for(j in 1:nb_class){ - - # Mean --------- - - if("mean" %in% stat.list){ - mean.res <- cbind(mean.res, colMeans(DM[which(DM$c_class==classnames[j]),-1],na.rm=TRUE)) - mean.names <- cbind(mean.names, paste("Mean",classnames[j], sep="_")) - if(j == nb_class){ - stat.res <- cbind(stat.res, mean.res) - names <- cbind(names, mean.names) - } - } - - # Standard deviation ----- - - if("sd" %in% stat.list){ - sd.res <- cbind(sd.res, apply(DM[which(DM$c_class==classnames[j]),-1],2,sd,na.rm=TRUE)) - sd.names <- cbind(sd.names, paste("Sd",classnames[j], sep="_")) - if(j == nb_class){ - stat.res <- cbind(stat.res, sd.res) - names <- cbind(names, sd.names) - } - } - - # Median --------- - - if(("median" %in% stat.list)&&(!("quartile" %in% stat.list))){ - med.res <- cbind(med.res, apply(DM[which(DM$c_class==classnames[j]),-1],2,median,na.rm=TRUE)) - med.names <- cbind(med.names, paste("Median",classnames[j], sep="_")) - if(j == nb_class){ - stat.res <- cbind(stat.res, med.res) - names <- cbind(names, med.names) - } - } - - # Quartiles ------ - - if("quartile" %in% stat.list){ - quart.res <- cbind(quart.res,t(apply(DM[which(DM$c_class==classnames[j]),-1],2,quantile,na.rm=TRUE))) - quart.names <- cbind(quart.names, paste("Min",classnames[j], sep="_"),paste("Q1",classnames[j], sep="_"), - paste("Median",classnames[j],sep="_"),paste("Q3",classnames[j],sep="_"), - paste("Max",classnames[j],sep="_")) - if(j == nb_class){ - stat.res <- cbind(stat.res, quart.res) - names <- cbind(names, quart.names) - } - } - - # Deciles ------ - - if("decile" %in% stat.list){ - dec.res <- cbind(dec.res,t(apply(DM[which(DM$c_class==classnames[j]),-1],2,quantile,na.rm=TRUE,seq(0,1,0.1)))) - dec.names <- cbind(dec.names, t(matrix(paste((paste("D",seq(0,10,1),sep="")),classnames[j],sep="_")))) - if(j == nb_class){ - stat.res <- cbind(stat.res, dec.res) - names <- cbind(names, dec.names) - } - } - - # Missing values ------------ - - if("NA" %in% stat.list){ - - nb_NA <- apply(DM[which(DM$c_class==classnames[j]),-1],2,function(x) sum(is.na(x))) - pct_NA <- nb_NA/nrow(DM[which(DM$c_class==classnames[j]),-1])*100 - NA.res <- cbind(NA.res,nb_NA) - pct_NA.res <- cbind(pct_NA.res,pct_NA) - NA.names <- cbind(NA.names, paste("NA",classnames[j], sep="_")) - pct_NA.names <- cbind(pct_NA.names,paste("Pct_NA", classnames[j], sep="_")) - if(j == nb_class){ - stat.res <- cbind(stat.res, NA.res,pct_NA.res) - names <- cbind(names, NA.names,pct_NA.names) - } - - #for barplots - Nb_NA_0_20 <- 0 - Nb_NA_20_40 <- 0 - Nb_NA_40_60 <- 0 - Nb_NA_60_80 <- 0 - Nb_NA_80_100 <- 0 - - for (i in 1:length(pct_NA)){ - - if ((0<=pct_NA[i])&(pct_NA[i]<20)){ - Nb_NA_0_20=Nb_NA_0_20+1} - - if ((20<=pct_NA[i])&(pct_NA[i]<40)){ - Nb_NA_20_40=Nb_NA_20_40+1} - - if ((40<=pct_NA[i])&(pct_NA[i]<60)){ - Nb_NA_40_60=Nb_NA_40_60+1} - - if ((60<=pct_NA[i])&(pct_NA[i]<80)){ - Nb_NA_60_80=Nb_NA_60_80+1} - - if ((80<=pct_NA[i])&(pct_NA[i]<=100)){ - Nb_NA_80_100=Nb_NA_80_100+1} - } - data_bp[1,j] <- Nb_NA_0_20 - data_bp[2,j] <- Nb_NA_20_40 - data_bp[3,j] <- Nb_NA_40_60 - data_bp[4,j] <- Nb_NA_60_80 - data_bp[5,j] <- Nb_NA_80_100 - rownames(data_bp) <- c("0%-20%", "20%-40%", "40%-60%", "60%-80%", "80%-100%") - - if(j == nb_class){ - - # Alert message if there is no missing value in data matrix - sum_total <- sum(NA.res) - alerte <- NULL - if(sum_total==0){ - alerte <- c(alerte, "Data Matrix contains no NA.\n") - } - if(length(alerte) != 0){ - cat(alerte,"\n") - } - - - colnames(data_bp) <- classnames - data_bp <- as.matrix(data_bp) - } - } - - - # Mean fold change ------------ - - if(test.fold=="Yes"){ - if(nb_class >= 2){ - if(j!=nb_class){ - ratio1 <- NULL - ratio2 <- NULL - if(method=="each_class"){ - fold.frac <- "Top" - } - for(k in (j+1):nb_class) { - if(fold.frac=="Bottom"){ - ratio1 <- classnames[k] - ratio2 <- classnames[j] - }else{ - ratio1 <- classnames[j] - ratio2 <- classnames[k] - } - fold <- colMeans(DM[which(DM$c_class==ratio1),-1],na.rm=TRUE)/ - colMeans(DM[which(DM$c_class==ratio2),-1],na.rm=TRUE) - if(logarithm=="log2"){ - fold.res <- cbind(fold.res,log2(fold)) - }else if(logarithm=="log10"){ - fold.res <- cbind(fold.res,log10(fold)) - }else{ - fold.res <- cbind(fold.res, fold) - } - if(logarithm == "none"){ - fold.names <- cbind(fold.names,paste("fold",ratio1,"VS", ratio2, sep="_")) - }else{ - fold.names <- cbind(fold.names,paste(logarithm, "fold", ratio1, "VS", ratio2, sep="_")) - } - } - - }else{ - stat.res <- cbind(stat.res,fold.res) - names <- cbind(names, fold.names) - } - } - } - - } - - ############ - - - # check columns names in variableMetadata - - VM.names <- colnames(VM) - for (i in 1:length(VM.names)){ - for (j in 1:length(names)){ - if (VM.names[i]==names[j]){ - names[j] <- paste(names[j], "2", sep="_") - } - } - } - - colnames(stat.res) <- names - - - - - - - # Output --------------------------------------------------------------------------------------------------- - - VM <-cbind(VM,stat.res) - - write.table(VM, VM.output,sep="\t", quote=FALSE, row.names=FALSE) - - - ### graphics pdf - - if(graphs == 1){ - - pdf(graphs.output) - - - #Barplots for NA - if("NA" %in% stat.list){ - graph.colors <- c("green3","palegreen3","lightblue","orangered","red") - par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=TRUE) - - bp=barplot(data_bp, col=graph.colors, main="Proportion of NA", xlab="Classes", ylab="Variables") - legend("topright", fill=graph.colors,rownames(data_bp), inset=c(-0.3,0)) - - stock=0 - for (i in 1:nrow(data_bp)){ - text(bp, stock+data_bp[i,]/2, data_bp[i,], col="white", cex=0.7) - stock <- stock+data_bp[i,] - } - - } - - # Boxplots for fold test - - if((test.fold=="Yes")&&(nb_class >= 2)){ - - clean_fold <- fold.res - for(i in 1:nrow(clean_fold)){ - for(j in 1:ncol(clean_fold)){ - if(is.infinite(clean_fold[i,j])){ - clean_fold[i,j] <- NA - } - } - } - for (j in 1:ncol(clean_fold)){ - title <- paste(fold.names[j]) - boxplot(clean_fold[,j], main=title) - } - } - - dev.off() - - }else{ - pdf(graphs.output) - plot.new() - legend("center","You did not select any test with graphical output.") - dev.off() - } - - } - - - - - - \ No newline at end of file
--- a/Intchecks/wrapper_intensity_check.R Thu Mar 07 08:34:33 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,69 +0,0 @@ -#!/usr/bin/Rscript --vanilla --slave --no-site-file - - -############################################################################# -# WRAPPER for INTENSITY CHECK # -# # -#Script: Script_intensity_check.R # -#Xml: xml_intensity_check.xml # -# # -# # -# Input: Data Matrix, VariableMetadata, SampleMetadata # -# Output: VariableMetadata, Graphics # -# # -# # -############################################################################# - - -library(batch) #necessary for parseCommandArgs function -args = parseCommandArgs(evaluate=FALSE) #interpretation of arguments given in command line as an R list of objects - -source_local <- function(...){ - argv <- commandArgs(trailingOnly = FALSE) - base_dir <- dirname(substring(argv[grep("--file=", argv)], 8)) - for(i in 1:length(list(...))){source(paste(base_dir, list(...)[[i]], sep="/"))} -} -#Import the different functions -source_local("Script_intensity_check.R", "RcheckLibrary.R") - - -if(length(args) < 7){ stop("NOT enough argument !!!") } - -class_col <- NULL -test_fold <- NULL -class1 <- NULL -fold_frac <- NULL -logarithm <- NULL -if(args$method == "each_class"){ - class_col <- args$class_col - test_fold <- args$test_fold - if(args$test_fold == "Yes"){ - logarithm <- args$logarithm - } -} -if(args$method == "one_class"){ - class_col <- args$class_col - class1 <- args$class1 - test_fold <- args$test_fold - if(args$test_fold == "Yes"){ - fold_frac <- args$fold_frac - logarithm <- args$logarithm - } -} - - - -#print args -cat("\n-------------------------------\nIntensity Check parameters:\n\n") -print(args) -cat("--------------------------------\n") - - -intens_check(args$dataMatrix_in, args$sampleMetadata_in, args$variableMetadata_in, args$method, args$chosen_stat, - class_col, test_fold, class1, fold_frac, logarithm, args$variableMetadata_out, args$graphs_out) - -sessionInfo() -cat("--------------------------------\n") - -#delete the parameters to avoid the passage to the next tool in .RData image -rm(args)
--- a/Intchecks/xml_intensity_check.xml Thu Mar 07 08:34:33 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,202 +0,0 @@ -<tool id="intens_check" name="Intensity Check" version="xx"> - <description>Statistical measures, number of missing values and mean fold change</description> - <requirements> - <requirement type="package" version="1.1_5">r-batch</requirement> - </requirements> - <command interpreter="Rscript"> - - wrapper_intensity_check.R - - dataMatrix_in "$dataMatrix_in" - sampleMetadata_in "$sampleMetadata_in" - variableMetadata_in "$variableMetadata_in" - - method "${method_cond.method}" - - chosen_stat "${method_cond.chosen_stat}" - - #if $method_cond.method == "each_class" : - class_col "${method_cond.class_col}" - test_fold "${method_cond.test_fold_cond.test_fold}" - #if $method_cond.test_fold_cond.test_fold == "Yes" : - logarithm "${method_cond.test_fold_cond.logarithm}" - #end if - #end if - - #if $method_cond.method == "one_class" : - class_col "${method_cond.class_col}" - class1 "${method_cond.class1}" - test_fold "${method_cond.test_fold_cond.test_fold}" - #if $method_cond.test_fold_cond.test_fold == "Yes" : - fold_frac "${method_cond.test_fold_cond.fold_frac}" - logarithm "${method_cond.test_fold_cond.logarithm}" - #end if - #end if - - variableMetadata_out "$variableMetadata_out" - graphs_out "$graphs_out" - - - - </command> - - <inputs> - <param name="dataMatrix_in" type="data" label="Data Matrix file" help="" format="tabular" /> - <param name="sampleMetadata_in" type="data" label="Sample metadata file" help="" format="tabular" /> - <param name="variableMetadata_in" type="data" label="Variable metadata file" help="" format="tabular" /> - - <conditional name="method_cond"> - <param name="method" type="select" label="Method" help="Select the first method if you don't want to take into account any class of samples"> - <option value="no_class">Tests without distinction between samples </option> - <option value="each_class">Tests for each class of samples </option> - <option value="one_class">Tests between one class and all the remaining samples </option> - - </param> - - <when value="no_class"> - <param name="chosen_stat" type="select" display="checkboxes" multiple="True" label="Statistics"> - <validator type="no_options" message="Please choose at least one statistic representation" /> - <option value="mean">Mean</option> - <option value="sd">Standard deviation</option> - <option value="median">Median</option> - <option value="quartile">Quartile</option> - <option value="decile">Decile</option> - <option value="NA">Missing values</option> - </param> - </when> - - <when value="each_class"> - <param name="class_col" type="data_column" data_ref="sampleMetadata_in" use_header_names="true" label="Class column" help="Class column in Sample metadata" /> - <param name="chosen_stat" type="select" display="checkboxes" multiple="True" label="Statistics"> - <option value="mean">Mean</option> - <option value="sd">Standard deviation</option> - <option value="median">Median</option> - <option value="quartile">Quartile</option> - <option value="decile">Decile</option> - <option value="NA">Missing values</option> - </param> - <conditional name="test_fold_cond"> - <param name="test_fold" type="select" display="radio" label="Calculate the mean fold change"> - <option value="Yes">Yes</option> - <option value="No">No</option> - </param> - <when value="Yes"> - <param name="logarithm" type="select" label="Logarithm" help="Choose if you want the mean fold change to be converted into a log mean fold change (log2 or log10)"> - <option value="none">none </option> - <option value="log2">log2 </option> - <option value="log10">log10 </option> - </param> - </when> - </conditional> - </when> - - <when value="one_class"> - <param name="class_col" type="data_column" data_ref="sampleMetadata_in" use_header_names="true" label="Class column" help="Class column in Sample metadata" /> - <param name="class1" type="text" label="Selected class" help="Name of the class to test versus all the remaining samples." /> - <param name="chosen_stat" type="select" display="checkboxes" multiple="True" label="Statistics"> - <option value="mean">Mean</option> - <option value="sd">Standard deviation</option> - <option value="median">Median</option> - <option value="quartile">Quartile</option> - <option value="decile">Decile</option> - <option value="NA">Missing values</option> - </param> - <conditional name="test_fold_cond"> - <param name="test_fold" type="select" display="radio" label="Calculate the mean fold change"> - <option value="Yes">Yes</option> - <option value="No">No</option> - </param> - <when value="Yes"> - <param name="fold_frac" type="select" label="Where should the class be placed for the mean fold change calculation?" display="radio"> - <option value="Top">Numerator (Top) </option> - <option value="Bottom">Denominator (Bottom) </option> - </param> - <param name="logarithm" type="select" label="Logarithm" help="Choose if you want the mean fold change to be converted into a log mean fold change (log2 or log10)"> - <option value="none">none </option> - <option value="log2">log2 </option> - <option value="log10">log10 </option> - </param> - </when> - </conditional> - </when> - </conditional> - </inputs> - - - <outputs> - <data name="variableMetadata_out" label="IC_${variableMetadata_in.name}" format="tabular" /> - <data name="graphs_out" label="IC_Graphs" format="pdf" /> - </outputs> - - <help> - -.. class:: infomark - -**Authors** - | Anthony Fernandes - PFEM ; INRA - ---------------------------------------------------- - -======================== -Intensity Check -======================== - ------------ -Input files ------------ - -+----------------------------+------------+ -| Parameter | Format | -+============================+============+ -| 1 : Data Matrix file | tabular | -+----------------------------+------------+ -| 2 : Sample metadata file | tabular | -+----------------------------+------------+ -| 3 : Variable metadata file | tabular | -+----------------------------+------------+ - ----------- -Parameters ----------- - -**Method** - | - **Tests without distinction between samples:** calculates chosen statistic(s) for each variable. - | - **Tests for each class of samples:** separates samples between each class (class column to specified). Chosen statistic(s) and/or mean fold change are calculated for each of them. - | - **Tests between one class versus all the remaining samples:** If you want to test only one class versus all the remaining samples without class distinction. - - | In the case of two classes: "each class" and "one class" give the same results for statistical measures. We recommend to choose "one class" for mean fold change calculation in order to select the class you want to put as numerator or denominator (see below). - -**Statistics** - | Select the statistical measures you want to add in the variable metadata table. If the method is "each class" or "one class", you could choose no statistical measure if you only want to calculate the mean fold change (see below). - -**Class column** - | Select the class column in sample metadata table. - -**Selected class** - | If the method is "one class", specify it. Remaining samples will be named "Other". - -**Calculate the mean fold change** - | Choose if you want to calculate the mean fold change. If the method is "each class": mean fold change will be calculated for all combinations of classes. If the method is "one class": it will be calculated between the selected class (see above) and the remaining samples. - -**Where should the class be placed for the mean fold change calculation?** - | If the method is "one class", choose "top" or "bottom" to put the selected class as numerator or denominator (respectively) for the mean fold change calculation. - -**Logarithm** - | Choose if you want to transform the mean fold change with a log2 or log10. - ------------- -Output file ------------- - -**Variable metadata file** - | Contains the previous columns in variable metadata file and the new ones. - | In the column names for fold, the first class specified is the one used like numerator for the ratio. - -**Graphs file** - | Contains barplots with the proportion of NA considering classes and boxplots with the fold values. - - </help> - - -</tool> - \ No newline at end of file