Mercurial > repos > melpetera > acorrf
changeset 0:cfe4b819911b draft
Uploaded
author | melpetera |
---|---|
date | Thu, 10 Oct 2019 12:20:11 -0400 |
parents | |
children | 86ee1a3d5723 |
files | ACF/Analytic_correlation_filtration.pl ACF/README.md ACF/analytic_correlation_filtration.xml ACF/data/default_list.csv ACF/lib/IonFiltration.pm ACF/static/images/Adduct_fragment_list.JPG ACF/static/images/Correlation_matrix.JPG |
diffstat | 7 files changed, 1301 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/perl
#
# ACF/Analytic_correlation_filtration.pl
#
# Analytic correlation filtration (ACorF): groups the ions that are linked by
# the pairs listed in the .sif file (and, depending on '-o', by a close
# retention time and/or a known mass difference), annotates the members of
# each group and flags one representative ion per group in a copy of the
# variable metadata.
#
# Processing steps:
#   1. Parse and check the command-line options.
#   2. Load the list of known mass differences (options 2 and 3 only).
#   3. Call IonFiltration::sifTableCreation, which returns the path of the
#      pair (.sif) file and the m/z and retention time of each ion.
#   4. Build the groups as connected components of the accepted pairs.
#   5. Annotate every ion with the known mass differences separating it from
#      the other members of its group.
#   6. Compute per-ion statistics from the data matrix and choose the
#      representative ion of each group.
#   7. Annotate every ion relative to the representative of its group.
#   8. Write the variable metadata with the additional ACorF columns.

### Perl modules
use warnings;
use strict;
use Getopt::Long qw(GetOptions);    # Creation of script options
use Pod::Usage qw(pod2usage);       # Creation of script options

# Personal packages
use FindBin;                        # Locates the directory of the running script
use lib "$FindBin::Bin/lib";
use IonFiltration;

# Number of most intense ions that compete in "max_intensity_max_mass" mode.
use constant TOP_INTENSITY_CANDIDATES => 3;

# Name of the result column holding the representative, for each '-r' value.
my %repres_column_for = (
    intensity              => 'intensity_repres',
    mass                   => 'mass_repres',
    mixt                   => 'mass2intens_repres',
    max_intensity_max_mass => 'max_intensity_max_mass_repres',
);

# Options to complete
my ($file, $mass_file, $opt, $dataMatrix, $combined_DMVM, $repres_opt,
    $rt_threshold, $mass_threshold, $output_sif, $output_tabular,
    $correl_threshold, $intensity_threshold, $intensity_pourc);

########################
### Options and help ###
########################

GetOptions(
    "f=s"              => \$file,
    "m=s"              => \$mass_file,
    "o=s"              => \$opt,
    "d=s"              => \$dataMatrix,
    "v=s"              => \$combined_DMVM,
    "r=s"              => \$repres_opt,
    "rt=f"             => \$rt_threshold,
    "mass=f"           => \$mass_threshold,
    "output_sif=s"     => \$output_sif,
    "output_tabular=s" => \$output_tabular,
    "correl=s"         => \$correl_threshold,
    "IT=f"             => \$intensity_threshold,
    "IP=f"             => \$intensity_pourc,
) or pod2usage(2);

my $grouping_help = q{Mandatory argument '-o' is missing. It correspond to the grouping method for analytical correlation groups formation.
#It should be a number (1 ; 2 or 3) :
#	1 : Don't take into acount mass information (only RT) ;
#	2 : Check that all mass differences are include in a specific list and taking into acount RT information
#	3 : Check that all mass differences are include in a specific list, ignoring RT information
#To use the tool without takinf into account mass and RT information, use option 1 and define the RT threshold to 999999999.};

# The original text announced 3 options and described "max_intensity_max_mass"
# as the highest intensity among 5 ions; it now matches what the code does.
my $repres_help = q{Mandatory argument '-r' is missing. It correspond to the group representent choosing method for analytical correlation groups formation.
It should be one of the 4 options below :
	"mass" : choose the ion with the highest mass as the representant
	"intensity" : choose the ion with the highest intensity as the representant
	"mixt" : choose the ion with the highest (mass^2 * intensity) as the representant
	"max_intensity_max_mass" : choose the ion with the highest mass among the 3 most intense ions of the group};

### Check required parameters :
usage_error(q{Mandatory argument '-f' is missing})              unless $file;
usage_error($grouping_help)                                     unless $opt;
usage_error($repres_help)                                       unless $repres_opt;
usage_error(q{Mandatory argument '-d' is missing})              unless $dataMatrix;
usage_error(q{Mandatory argument '-v' is missing})              unless $combined_DMVM;
usage_error(q{Mandatory argument '-correl' is missing})         unless $correl_threshold;
usage_error(q{Mandatory argument '-output_tabular' is missing}) unless $output_tabular;
usage_error(q{Mandatory argument '-output_sif' is missing})     unless $output_sif;

# '-o' is compared numerically everywhere below, so reject anything else early.
if ($opt !~ /\A[123]\z/) {
    usage_error(qq{Invalid value '$opt' for argument '-o': it should be 1, 2 or 3});
}

if (!exists $repres_column_for{$repres_opt}) {
    print STDERR "you must indicate \"mass\", \"intensity\", \"mixt\" or \"max_intensity_max_mass\" for the --r otpion\n";
    exit 1;
}

my $use_mass = ($opt != 1);    # options 2 and 3 require a known mass difference
my $use_rt   = ($opt != 3);    # options 1 and 2 require close retention times

# Thresholds used by this script; an omitted option counts as 0, which is how
# Perl evaluated the undefined value before. The values handed to
# IonFiltration::sifTableCreation below are left untouched.
my $rt_tolerance   = defined $rt_threshold   ? $rt_threshold   : 0;
my $mass_tolerance = defined $mass_threshold ? $mass_threshold : 0;

##########################################################################
#### Creation of a hash containing all adducts and fragments possible ###
##########################################################################

# Used below as $hmass{label}{mass difference}.
my %hmass;
if ($use_mass) {
    %hmass = IonFiltration::MassCollecting($mass_file);
}

# Flat, deterministically ordered view of the list: [ label, mass difference ].
my @mass_entries;
for my $label (sort keys %hmass) {
    for my $ref_mass (sort { $a <=> $b } keys %{ $hmass{$label} }) {
        push @mass_entries, [ $label, $ref_mass ];
    }
}

print "Création of a hash containing all adduits and fragments possible\n";

########################################################
### Creation of a sif table + correlation filtration ###
########################################################

# Used below as $hrtmz{ion}{mz} and $hrtmz{ion}{rt}.
my %hrtmz;
($output_sif, %hrtmz) = IonFiltration::sifTableCreation(
    $file, $output_sif, $opt, $rt_threshold, $mass_threshold,
    $correl_threshold, $dataMatrix, $output_tabular, $combined_DMVM,
    $repres_opt, $intensity_threshold, $intensity_pourc, \%hmass);
print "Creation of a sif table + correlation filtration done\n";

######################################################
### Analytic correlation filtrering follow options ###
######################################################

# Union-find forest over the ions of every accepted pair. Two ions end up in
# the same group exactly when a chain of accepted pairs connects them, so no
# separate "join subdivided groups" pass is needed afterwards.
my %parent_of;
my @ions_seen;    # ions in order of first appearance, for stable group names

open(my $sif_fh, '<', $output_sif)
    or die "Impossible to open $output_sif: $!\n";

PAIR:
while (my $line = <$sif_fh>) {
    chomp $line;
    my @fields = split /\t/, $line;
    next PAIR if @fields < 3;

    # Pair lines are read as: first ion, <middle field>, second ion.
    my ($ion_a, $ion_b) = @fields[0, 2];
    if (!exists $hrtmz{$ion_a} || !exists $hrtmz{$ion_b}) {
        warn "Pair '$ion_a' / '$ion_b' skipped: no m/z and RT known for one of the ions\n";
        next PAIR;
    }

    if ($use_rt) {
        my $rt_a = $hrtmz{$ion_a}{rt};
        my $rt_b = $hrtmz{$ion_b}{rt};
        next PAIR if $rt_a < $rt_b - $rt_tolerance || $rt_a > $rt_b + $rt_tolerance;
    }

    if ($use_mass) {
        my $delta = abs($hrtmz{$ion_a}{mz} - $hrtmz{$ion_b}{mz});
        next PAIR if !mass_matches(\@mass_entries, $delta, $mass_tolerance);
    }

    for my $ion ($ion_a, $ion_b) {
        next if exists $parent_of{$ion};
        $parent_of{$ion} = $ion;
        push @ions_seen, $ion;
    }
    my $root_a = find_root(\%parent_of, $ion_a);
    my $root_b = find_root(\%parent_of, $ion_b);
    $parent_of{$root_b} = $root_a if $root_a ne $root_b;
}
close $sif_fh;

print "Analytic correlation filtrering follow options done\n";

#############################################
### Join groups that have been subdivided ###
#############################################

my %group_of;       # ion   => group name
my %members_of;     # group => [ ions ]
my @group_names;    # group names in creation order
my %name_of_root;
my $group_count = 0;

for my $ion (@ions_seen) {
    my $root = find_root(\%parent_of, $ion);
    if (!exists $name_of_root{$root}) {
        $name_of_root{$root} = 'group' . ++$group_count;
        push @group_names, $name_of_root{$root};
    }
    my $name = $name_of_root{$root};
    $group_of{$ion} = $name;
    push @{ $members_of{$name} }, $ion;
}

print "Join groups that have been subdivided done\n";

#######################################################
### Addition of annotation information among groups ###
#######################################################

# ion => "@other|sign(label)(offset)|sign(label)(offset)|@other2|..."
my %annotation_of;

if ($use_mass) {
    for my $group (@group_names) {
        my @members = sort @{ $members_of{$group} };
        for my $ion (@members) {
            for my $other (@members) {
                next if $ion eq $other;

                my $delta = $hrtmz{$ion}{mz} - $hrtmz{$other}{mz};
                my $sign  = sign_of($delta);
                my @hits  = mass_matches(\@mass_entries, abs($delta), $mass_tolerance);
                next if !@hits;

                # The "@id|" prefix is written once per partner ion.
                my $text = '@' . $other . '|';
                for my $hit (@hits) {
                    my ($label, $ref_mass) = @{$hit};
                    my $offset = sprintf("%0.6f", abs(abs($delta) - $ref_mass));
                    $text .= $sign . '(' . $label . ')(' . $offset . ')|';
                }
                $annotation_of{$ion} .= $text;
            }
        }
    }
}

print "Addition of annotation information among groups done\n";

####################################################
### Choose the representative ion for each group ###
####################################################

# group => ion => { mass, intensity, squaredmassint }
my %stats_of;

open(my $matrix_fh, '<', $dataMatrix)
    or die "Impossible to open $dataMatrix: $!\n";
while (my $row = <$matrix_fh>) {
    chomp $row;
    my ($ion, @values) = split /\t/, $row;
    next if !defined $ion;

    my $group = $group_of{$ion};
    next if !defined $group;

    my $mass      = defined $hrtmz{$ion}{mz} ? $hrtmz{$ion}{mz} : 0;
    my $intensity = mean_intensity(@values);
    $stats_of{$group}{$ion} = {
        mass      => $mass,
        intensity => $intensity,
        # mass^2 * intensity, as stated by the '-r' usage text. The previous
        # code divided by the intensity, which contradicted that text and
        # died on a zero mean intensity.
        squaredmassint => ($mass**2) * $intensity,
    };
}
close $matrix_fh;

my %representative_of;    # group => representative ion ('' when none qualifies)
for my $group (keys %stats_of) {
    my $stats = $stats_of{$group};
    $representative_of{$group}
        = $repres_opt eq 'mass'      ? ion_with_highest($stats, 'mass')
        : $repres_opt eq 'intensity' ? ion_with_highest($stats, 'intensity')
        : $repres_opt eq 'mixt'      ? ion_with_highest($stats, 'squaredmassint')
        :                              heaviest_among_most_intense(
                                           $stats, $intensity_threshold, $intensity_pourc);
}

print "Choose the representative ion for each group done\n";

#############################################################################
### Addition of annotation information relative to the representative ion ###
#############################################################################

# ion => "M" for the representative itself, "[M sign(label)]|..." otherwise
my %relation_of;

if ($use_mass) {
    for my $group (@group_names) {
        my $representative = $representative_of{$group};
        next if !defined $representative;

        for my $ion (sort @{ $members_of{$group} }) {
            if ($ion eq $representative) {
                $relation_of{$ion} = "M";
                next;
            }
            next if $representative eq q{};

            my $delta = $hrtmz{$ion}{mz} - $hrtmz{$representative}{mz};
            my $sign  = sign_of($delta);
            for my $hit (mass_matches(\@mass_entries, abs($delta), $mass_tolerance)) {
                $relation_of{$ion} .= "[M " . $sign . "(" . $hit->[0] . ")]|";
            }
        }
    }
}

print "Addition of annotation information relative to the representative ion done\n";

##############################
### Print in result file ! ###
##############################

open(my $out_fh, '>', $output_tabular)
    or die "Impossible to open $output_tabular: $!\n";
open(my $meta_fh, '<', $combined_DMVM)
    or die "Impossible to open $combined_DMVM: $!\n";

my $is_header = 1;
while (my $line = <$meta_fh>) {
    chomp $line;

    if ($is_header) {
        my @titles = ($line, 'ACorF_groups');
        push @titles, 'isotopes_adducts_fragments_[@id|annotation(delta_annotation)]'
            if $use_mass;
        push @titles, 'ACorF_filter', $repres_column_for{$repres_opt};
        push @titles, 'annotation_relative_to_representative' if $use_mass;
        print {$out_fh} join("\t", @titles), "\n";
        $is_header = 0;
        next;
    }

    my ($ion) = split /\t/, $line;
    $ion = q{} if !defined $ion;
    my $group = $group_of{$ion};

    # An ion counts as grouped only when it was also found in the data matrix.
    if (defined $group && exists $stats_of{$group} && exists $stats_of{$group}{$ion}) {
        my $representative = $representative_of{$group};
        my @cells = ($line, $group);
        push @cells, (defined $annotation_of{$ion} ? $annotation_of{$ion} : '-')
            if $use_mass;
        # Every cell is tab separated, including for '-o 1', where the group
        # name and the filter flag used to be written without a separator.
        push @cells, ($ion eq $representative ? 1 : 0), $representative;
        push @cells, (defined $relation_of{$ion} ? $relation_of{$ion} : '-')
            if $use_mass;
        print {$out_fh} join("\t", @cells), "\n";
    }
    else {
        # Ungrouped ions each get a group of their own.
        my $group_name = 'group' . ++$group_count;
        my @dashes     = ('-') x ($use_mass ? 4 : 2);
        print {$out_fh} join("\t", $line, $group_name, @dashes), "\n";
    }
}
close $meta_fh;
close $out_fh or die "Impossible to write $output_tabular: $!\n";

print "Print in result file done\n";

print "All steps done\n";

exit 0;

###################
### Subroutines ###
###################

# Prints $message through pod2usage and exits with status 1.
sub usage_error {
    my ($message) = @_;
    pod2usage({ -message => $message, -exitval => 1, -verbose => 0 });
    return;
}

# Returns the [ label, mass ] entries of $entries whose mass lies within
# +/- $tolerance of $delta (their count in scalar context).
sub mass_matches {
    my ($entries, $delta, $tolerance) = @_;
    return grep {
        $delta >= $_->[1] - $tolerance && $delta <= $_->[1] + $tolerance
    } @{$entries};
}

# Returns the root of $ion in the union-find forest $parent_of and shortens
# the path walked on the way.
sub find_root {
    my ($parent_of, $ion) = @_;
    my $root = $ion;
    $root = $parent_of->{$root} while $parent_of->{$root} ne $root;
    while ($parent_of->{$ion} ne $root) {
        my $next = $parent_of->{$ion};
        $parent_of->{$ion} = $root;
        $ion = $next;
    }
    return $root;
}

# Returns "+", "-" or the empty string for a positive, negative or zero $delta.
sub sign_of {
    my ($delta) = @_;
    return $delta > 0 ? "+"
         : $delta < 0 ? "-"
         :              q{};
}

# Returns the mean of @values, or 0 for an empty list. Non-numeric cells such
# as "NA" count as 0, as before, but no longer raise a warning each.
sub mean_intensity {
    my @values = @_;
    return 0 if !@values;
    my $sum = 0;
    {
        no warnings 'numeric';
        $sum += $_ for @values;
    }
    return $sum / @values;
}

# Returns the ion of $stats (ion => { field => value }) with the highest
# strictly positive $field, or '' when there is none. Ties go to the ion whose
# name sorts first.
sub ion_with_highest {
    my ($stats, $field) = @_;
    my ($best_ion, $best_value) = (q{}, 0);
    for my $ion (sort keys %{$stats}) {
        if ($stats->{$ion}{$field} > $best_value) {
            ($best_ion, $best_value) = ($ion, $stats->{$ion}{$field});
        }
    }
    return $best_ion;
}

# "max_intensity_max_mass" rule: among the TOP_INTENSITY_CANDIDATES most
# intense ions of $stats, start from the most intense one and replace it by a
# heavier candidate whose intensity exceeds both $min_intensity and
# $min_fraction times the top intensity. $min_intensity is ignored for a group
# whose top intensity is below it; this is decided per group, whereas the
# shared threshold used to be reset to 0 for all following groups.
sub heaviest_among_most_intense {
    my ($stats, $min_intensity, $min_fraction) = @_;
    $min_intensity = 0 if !defined $min_intensity;
    $min_fraction  = 0 if !defined $min_fraction;

    my @ranked = sort {
        $stats->{$b}{intensity} <=> $stats->{$a}{intensity} || $a cmp $b
    } keys %{$stats};
    return q{} if !@ranked;
    splice @ranked, TOP_INTENSITY_CANDIDATES if @ranked > TOP_INTENSITY_CANDIDATES;

    my $leader        = shift @ranked;
    my $top_intensity = $stats->{$leader}{intensity};
    $min_intensity = 0 if $min_intensity > $top_intensity;

    my ($chosen, $chosen_mass) = ($leader, $stats->{$leader}{mass});
    for my $ion (@ranked) {
        my $candidate = $stats->{$ion};
        if (   $candidate->{mass} > $chosen_mass
            && $candidate->{intensity} > $min_intensity
            && $candidate->{intensity} > $top_intensity * $min_fraction)
        {
            ($chosen, $chosen_mass) = ($ion, $candidate->{mass});
        }
    }
    return $chosen;
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ACF/README.md Thu Oct 10 12:20:11 2019 -0400 @@ -0,0 +1,45 @@ +Analytical Correlation Filtration +======= + +Metadata +----------- + + * **@name**: ACorF + * **@version**: 2019-06-20 + * **@authors**: <stephanie.monnerie@inra.fr> + * **@date creation**: 2018/11/17 + * **@main usage**: Reduction of analytical redundancies in Metabolomics data + + +Configuration +----------- + +### Requirement: + * perl + + +### Deploy: + + +### Warnings: + + +Services provided +----------- + + + +Technical description +----------- + + +Notes +----------- + + + + +License (optional) +----------- + +This code is published under CECILL 2.1.
<tool id="Analytic_correlation_filtration" name="Analytic correlation filtration" version="2019-06-20">
    <description>
        : Detect analytic correlation among data and remove them.
    </description>

    <!--
        The script is invoked exactly once. The grouping method (-o) follows
        from the two yes/no questions of the form:
            mass differences ignored                -> -o 1 (the RT threshold is
                                                       set to 9999999999 when RT
                                                       is ignored as well)
            mass differences + retention time used  -> -o 2
            mass differences used, RT ignored       -> -o 3
    -->
    <command><![CDATA[
        perl "$__tool_directory__/Analytic_correlation_filtration.pl"
            -f "$file_in"
            -d "$dataMatrix_in"
            -v "$variableMetadata_in"

        #if str($mass_file.mass_choice)=="false":
            -o 1
            #if str($rt_cond.rt_choice)=="false":
                -rt 9999999999
            #else:
                -rt "$rt_cond.rt_threshold"
            #end if
        #else:
            #if str($mass_file.liste.mass_list)=="true":
                -m "$mass_file.liste.mass_file_in"
            #else:
                -m "$__tool_directory__/data/default_list.csv"
            #end if
            -mass "$mass_file.mass_threshold"
            #if str($rt_cond.rt_choice)=="true":
                -o 2
                -rt "$rt_cond.rt_threshold"
            #else:
                -o 3
            #end if
        #end if

        -r "$repres_opt.repres_opt_selector"

        #if str($repres_opt.repres_opt_selector)=="max_intensity_max_mass":
            -IT "$repres_opt.int_threshold"
            -IP "$repres_opt.int_percentage"
        #end if

        -correl "$correl_threshold"
        -output_sif "$sif_out"
        -output_tabular "$variableMetadata_out"
    ]]></command>

    <inputs>
        <param type="data" name="file_in" format="txt" help="The .txt correlation table (you can obtain it by using the Between-table Correlation tool or for exemple the cor() function in R) " label="Correlation table file" />
        <param type="data" name="dataMatrix_in" format="tabular" help="" label="dataMatrix file" />
        <param type="data" name="variableMetadata_in" format="tabular" help="" label="variableMetadata file" />

        <param help="Define the minimum correlation threshold accepted to determine analytic correlation" label="Correlation threshold" type="float" name="correl_threshold" value="0.90"/>

        <conditional name="mass_file">
            <param name="mass_choice" checked="true" falsevalue="false" help="'YES' if you want to take it into account; 'NO' if you don't want to take into account mass information" label="Do you want to take into account mass differences between 2 ions?" truevalue="true" type="boolean"/>
            <when value="true">
                <conditional name="liste">
                    <param name="mass_list" checked="true" falsevalue="false" help="'YES' if you have your own list to upload; 'NO' if you want to use a default list" label="Do you have your own list of mass differences or do you want to use a default list ?" truevalue="true" type="boolean"/>
                    <when value="false">

                    </when>
                    <when value="true">
                        <param type="data" name="mass_file_in" format="tabular,csv" help="The file containing all your report and known mass differences (cf help for file example) " label="Mass differences table (format: tabular or csv) " />
                    </when>
                </conditional>
                <param help="2 ions need to have a difference mass included in the list at +/- mass difference range to be considered as analytically correlated | Value recommendation : 0.005" label="Mass difference range" type="float" name="mass_threshold" value="0.005"/>
            </when>
            <when value="false">

            </when>
        </conditional>

        <conditional name="rt_cond">
            <param checked="true" falsevalue="false" help="'YES' if want to take into account retention time information; 'NO' if you don't want to take into account retention time information" label="Do you want to take into account retention time differences between 2 ions? " name="rt_choice" truevalue="true" type="boolean"/>
            <when value="true">
                <param help="Choose a retention time difference threshold between 2 ions considered as analytically correlated | Value recommendation : 0.1" label="Retention time difference threshold" type="float" name="rt_threshold" value="0.1"/>
            </when>
            <when value="false">

            </when>
        </conditional>

        <conditional name="repres_opt">
            <param name="repres_opt_selector" label="Which representative ion do you want to select for each group" type="select" display="radio" help="">
                <option value="intensity">Highest intensity</option>
                <option value="mass">Highest mass</option>
                <option value="mixt">Highest (mass2 x intensity) </option>
                <option value="max_intensity_max_mass">Highest mass between the 3 highest intensity (following intensity threshold and rules ==> see help) </option>
            </param>
            <when value="max_intensity_max_mass">
                <param help="" label="Minimum intensity threshold for the representative ion" type="float" name="int_threshold" value="1000"/>
                <param help="Example: ion A have the highest intensity of a group but not the highest mass, B is an ion that have the second highest intensity in the group and a highest mass than A, to choose B as a representative ion for the group his intensity need to be at list 50% of the A intensity." label="Percentage of highest intensity of the group accept for the new representative ion. This option allow to avoid isotope selection. " type="float" name="int_percentage" value="0.5"/>
            </when>
            <when value="intensity">
            </when>
            <when value="mass">
            </when>
            <when value="mixt">
            </when>
        </conditional>

    </inputs>

    <outputs>
        <data format="sif" label="${file_in.name}_sif" name="sif_out"/>
        <data format="tabular" label="${variableMetadata_in.name}_representative_ion" name="variableMetadata_out"/>
    </outputs>

    <help><![CDATA[

.. class:: infomark

**Authors** : **Stephanie Monnerie** (stephanie.monnerie@inra.fr) wrote this tool for analytic correlation detection.

---------------------------------------------------

.. class:: infomark

**References** :

---------------------------------------------------

-----------
Input files
-----------

+-----------------------------------------+---------------+
| File                                    | Format        |
+=========================================+===============+
| 1) Correlation matrix                   | txt           |
+-----------------------------------------+---------------+
| 2) Data matrix                          | tabular       |
+-----------------------------------------+---------------+
| 3) Variable metadata                    | tabular       |
+-----------------------------------------+---------------+
| **Optional file**                       | **Format**    |
+-----------------------------------------+---------------+
| 4) Optional : Mass differences list     | csv/tabular   |
+-----------------------------------------+---------------+

---------------------------------------------------

-------------
Files content
-------------

Correlation matrix
    * File organisation : on line by correlation pairs with the first ion ID, the correlation value and the second ion ID, tabular separated ==> Fist_Ion_ID \\t Correlation_Value \\t Second_Ion_ID
    * Example:

.. image:: Correlation_matrix.JPG
    :width: 800

Data matrix file
    * "variable x sample" **dataMatrix** : tabular separated file of the numeric data matrix, with . as decimal, and NA for missing values; the table must not contain metadata apart from row and column names; the row and column names must be identical to the rownames of the variable metadata (see below)

Variable metadata file
    * "variable x metadata" **variableMetadata** tabular separated file of the numeric and/or character variable metadata, with . as decimal and NA for missing values

.. class:: warningmark

For more information about input files, refer to the corresponding "W4M HowTo" page:
http://workflow4metabolomics.org/sites/workflow4metabolomics.org/files/files/w4m_TableFormatForGalaxy_150908.pdf


Mass differences list
    * A file containing list of known adducts, fragments or isotopes with the mass differences linked to them
    * Example:

.. image:: Adduct_fragment_list.JPG
    :width: 350

---------------------------------------------------

----------
Parameters
----------

Take into account mass diffrences between 2 ions :
    * You can enter a list of mass differences that are known. The file must be organized with a first column for the mass difference type (isotope, fragment, etc...), a second column with the mass difference chemical formula (H+, -2H+K, etc...) and a third column for the mass difference value
    * If you are choosing to use a mass differences table, you have to choose a mass difference range that will be a threshold to accept or not a difference value as true (recognize a mass difference value in the file +/- this threshold).

Take into acount retention time :
    * You can use retention time as a criteria to group ions. You have to choose a value that will be use as intervalle : 2 ions are group when their retention time is equal +/- the threshold.

Choose the representative ion for each group, there are 4 possibilities to determine the representative ion :
    * The ion with the highest intensity (recommandated for LC/MS)
    * The ion with the highest mass
    * The ion with the highest "mass2 * intensity" value
    * The ion with the highest mass between the 3 highest intensity of the group, except if the highest mass ion have an intensity < determined percentage of the highest intensity ion one (for exemple 50%) (recommandated for GC/MS)


---------------------------------------------------

--------------
Example of use
--------------

Add exemples according to the ppt presentation !

    ]]></help>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ACF/data/default_list.csv Thu Oct 10 12:20:11 2019 -0400 @@ -0,0 +1,225 @@ +adduit -2H+Na+K 59.9378259 +adduit H 1.007825032 +adduit -H+K 37.95588165 +adduit -H+Na 21.98194425 +adduit -3H+3Na 65.94583274 +adduit -4H+4K 151.8235266 +adduit -4H+4Na 87.92777699 +adduit -3H+3K 113.8676449 +adduit -2H+2K 75.9117633 +adduit -2H+2Na 43.9638885 +adduit 2H 2.015650064 +adduit Cl 34.96885268 +adduit -2H+Ca 37.94694092 +isotope 13C db 0.501677419 +isotope 13C 1.003354838 +isotope 15N 0.997034893 +isotope 18O 2.00424638 +isotope 34S 1.9957959 +isotope 41K 1.99811908 +isotope 37Cl 1.99704991 +isotope 13C2 2.006709676 +isotope 13C3 3.010064513 +isotope 13C+37Cl 3.000404748 +isotope 13C+18O 3.007601218 +isotope 13C+34S 2.999150738 +isotope 44Ca 3.99289082 +adduit CH3OH 32.02621475 +adduit CH3CN 41.0265491 +adduit H2O 18.01056468 +adduit 2(H2O) 36.02112937 +adduit NaCl 57.95862196 +adduit HCOOH 46.0054793 +adduit +(HCOOH)+(HCOOK) 129.9668403 +adduit +(HCOOH)+(HCOONa) 113.9929029 +adduit +(HCOOH)+2(HCOONa) 181.9803264 +adduit HCOOK 83.96136095 +adduit +(HCOOK)+(HCOONa) 151.9487845 +adduit HCOONa 67.98742355 +adduit 2(HCOOH) 92.01095861 +adduit +2(HCOOH)+(HCOOK) 175.9723196 +adduit +2(HCOOH)+(HCOONa) 159.9983822 +adduit 2(HCOOK) 167.9227219 +adduit 2(HCOONa) 135.9748471 +fragment C11H18O9 294.0950822 +fragment C12H16O12 352.064176 +fragment C12H20O9 308.1107322 +fragment C2H2O 42.01056468 +fragment C2H3. 27.0229265 +fragment C2H3N 41.0265491 +fragment C2H3NO3 89.01129296 +fragment C2H3O. 43.01784112 +fragment C2H4 28.03130013 +fragment C2H4N. 42.03382553 +fragment C2H4O 44.02621475 +fragment C2H5. 29.03857656 +fragment C2H5N 43.04219916 +fragment C2H5NO2 75.0320284 +fragment C2H5O. 45.03349118 +fragment C2H5O6P 155.9823745 +fragment C2H6 30.04695019 +fragment C2H7N 45.05784922 +fragment C2HNO2 71.00072827 +fragment C3H4O3 88.01604399 +fragment C3H5. 
41.03857656 +fragment C3H5NO2 87.0320284 +fragment -(C3H5O2NS)-(NH3) 136.0306485 +fragment C3H5O2NS 119.0040994 +fragment C3H6 42.04695019 +fragment C3H6O3 90.03169405 +fragment C3H7. 43.05422662 +fragment C3H7O2N 89.04767846 +fragment C3H7O2NS 121.0197495 +fragment C3H7O6P 169.9980246 +fragment C4H6 54.04695019 +fragment C4H6O2 86.03677943 +fragment C4H6O4 118.0266087 +fragment C4H7. 55.05422662 +fragment C4H8O3 104.0473441 +fragment C4H9 57.07042529 +fragment C5H7O3N 129.0425931 +fragment C5H8O3NS 162.0224891 +fragment C5H8O4 132.0422587 +fragment C6H10O4 146.0579088 +fragment -(C6H10O5)-(H2O) 180.0633881 +fragment C6H10O5 162.0528234 +fragment C6H10O7 194.0426527 +fragment C6H8O6 176.032088 +fragment CH2O 30.01056468 +fragment -(CH2S)-(HCOOH) 91.99320037 +fragment -(CH2S)-(NH3) 63.01427016 +fragment CH2S 45.98772106 +fragment CH3. 15.0229265 +fragment CH3COO. 59.01275574 +fragment CH3COOH 60.02112937 +fragment CH3N 29.0265491 +fragment CH3O. 31.01784112 +fragment CH3OH 32.02621475 +fragment CH4 16.03130013 +fragment CH4N. 30.03382553 +fragment -(CH4S)-(HCOOH) 94.00885043 +fragment -(CH4S)-(NH3) 65.02992022 +fragment CH4S 48.00337113 +fragment CH5N 31.04219916 +fragment Cl. 34.96830408 +fragment CO 27.99491462 +fragment -(CO2)-(CO) 71.98474386 +fragment CO2 43.98982924 +fragment -(H2)-(NH3) 19.04219916 +fragment H2 2.015650064 +fragment -(H2O)-(CO2) 62.00039392 +fragment -(H2O)-(HCOOH) 64.01604399 +fragment -(H2O)-(NH3) 35.03711378 +fragment H2O 18.01056468 +fragment -(H2O)-2(CO2) 105.9902232 +fragment -(H2S)-(H2O) 51.99828575 +fragment H2S 33.98772106 +fragment H2SO4 97.96737954 +fragment H3PO4 97.97689521 +fragment HCl 35.97667771 +fragment HCN 27.01089903 +fragment -(HCOOH)-(HCN) 73.01637834 +fragment HCOOH 46.0054793 +fragment HS. 32.97934743 +fragment -(NC3H9)-(CH3COOH) 119.0946287 +fragment -(NC3H9)-(H2O) 77.08406397 +fragment -(NC3H9)-(HCOOH) 105.0789786 +fragment NC3H9 59.07349929 +fragment NaCl 57.95862196 +fragment NH2CO. 
44.01309008 +fragment -(NH3)-(CO2)-(H2O) 79.02694302 +fragment -(NH3)-(CO2) 61.01637834 +fragment -(NH3)-(CONH) 60.03236275 +fragment -(NH3)-(HCOOH) 63.0320284 +fragment NH3 17.0265491 +fragment NH3CO 45.02146372 +fragment NHCO 43.00581365 +fragment OH. 17.00219105 +fragment PO3 78.95850549 +fragment SO2 63.96190024 +fragment SO3 79.95681486 +fragment -2(H2O)-(CO2) 80.01095861 +fragment -2(H2O)-(HCOOH)-(NH3) 99.05315777 +fragment -2(H2O)-(HCOOH) 82.02660867 +fragment 2(H2O) 36.02112937 +fragment 2(HCOOH) 92.01095861 +fragment -2(NH3)-(CO)-(CO2) 106.0378421 +fragment -2(NH3)-(CO) 62.04801281 +fragment 2(NH3) 34.05309819 +fragment 3(H2O) 54.03169405 +fragment 3(NH3) 51.07964729 +fragment 4(H2O) 72.04225874 +fragment C10H11O3N5 249.0861892 +fragment C10H13O4N5 267.0967539 +fragment C10H14O7N5P 347.0630844 +fragment C10H15O5N5 285.1073186 +fragment C2H3NO2 73.01637834 +fragment C2H4O2 60.02112937 +fragment C2H5NO3 91.02694302 +fragment C2H6O2 62.03677943 +fragment C2H6O3 78.03169405 +fragment -(C2H6O3)-(H2O) 96.04225874 +fragment C2H6O4 94.02660867 +fragment C2H7NO2 77.04767846 +fragment C3H10O5 126.0528234 +fragment -(C3H6O3)-(CHNO) 133.0375077 +fragment C3H6O4 106.0266087 +fragment C3H8O3 92.04734412 +fragment C3H8O4 108.0422587 +fragment C4H10O5 138.0528234 +fragment C4H5NO3 115.026943 +fragment C4H8O4 120.0422587 +fragment C5H10O4 134.0579088 +fragment C5H13O4N 151.0844579 +fragment C6H11O4N 161.0688078 +fragment C6H11O5N 177.0637225 +fragment C6H13O5N 179.0793725 +fragment C5H10O5 150.0528234 +fragment C5H10O6 166.047738 +fragment C5H12O2 104.0837296 +fragment -(C5H12O2)-(H2O) 122.0942943 +fragment C5H5N5 135.0544952 +fragment C5H5ON5 151.0494098 +fragment C5H6O2 98.03677943 +fragment C5H7O2N5 169.0599745 +fragment -(C5H7O3N)-(CO2) 173.0324223 +fragment -(C5H7O3N)-(H2O) 147.0531578 +fragment C5H8N3 110.0718223 +fragment C5H8O3 116.0473441 +fragment C5H8O5N5P 249.026305 +fragment C5H9O3 117.0551691 +fragment C5H9O6P 196.0136746 +fragment C5H9O7P 212.0085893 
+fragment C6H10O3 130.0629942 +fragment -(C6H10O3)-(H2O) 148.0735589 +fragment C6H11O4N3PS 252.0207885 +fragment C6H11O4NPS 224.0146405 +fragment C6H12O5 164.0684735 +fragment C6H14O6 182.0790382 +fragment C6H14O7 198.0739528 +fragment C6H16O7 200.0896029 +fragment C6H16O8 216.0845175 +fragment C6H8N3 122.0718223 +fragment C6H8NS 126.0377453 +fragment C7H5ON5 175.0494098 +fragment C7H6ON6 190.0603088 +fragment C7H7O2N5 193.0599745 +fragment C7H11O6N 205.0586371 +fragment C8H14O7 222.0739528 +fragment C8H5O3N5 219.039239 +fragment C8H7O4N5 237.0498037 +fragment C9H10O4N2 210.0640568 +fragment C9H11O3N3 209.0800412 +fragment C9H11O4N3 225.0749558 +fragment C9H12O5N2 228.0746215 +fragment C9H12O6N3P 289.0463717 +fragment C9H13O4N3 227.0906059 +fragment C9H14O7N3P 307.0569364 +fragment C9H16O8 252.0845175 +fragment CH2N2 42.02179806 +fragment -(CH2O)-(H2O) 48.02112937 +fragment CH5NO 47.03711378 +fragment -(H3PO4)-(CHNO) 140.9827089 +fragment -(H3PO4)-(H2O) 115.9874599 +fragment -(H3PO4)-(NH3) 115.0034443 +fragment HPO3 79.96633052
#!/usr/bin/perl
package IonFiltration;

# ACF/lib/IonFiltration.pm
# Helper routines used by the analytic correlation filtration script:
#   - MassCollecting   : loads the list of known mass differences;
#   - sifTableCreation : turns a correlation matrix into a filtered sif table
#                        and collects the m/z and retention time of each ion.

### Perl modules
use strict;
use warnings;
use Scalar::Util qw(looks_like_number);


# _open_file($mode, $path)
#
# Opens $path with the three-argument form of open and returns a lexical
# filehandle, or nothing when the file cannot be opened.
# An undefined path is rejected explicitly: three-argument open on undef
# would otherwise succeed by opening an anonymous temporary file.
sub _open_file {
    my ($mode, $path) = @_;

    return unless defined $path;
    open(my $fh, $mode, $path) or return;
    return $fh;
}


########################################################################
### Creation of a hash containing all possible adducts and fragments ###
########################################################################

# MassCollecting($mass_file)
#
# Reads the list of known mass differences. Every line is split on tabs or
# semicolons; the first three fields are the type, the name and the mass
# difference. Lines with fewer than three fields (blank lines included) are
# skipped. A message is printed on STDOUT each time a mass difference that
# was already read shows up again.
#
# Returns a flattened hash such that $hmass{name}{mass} = type. When two
# lines share both name and mass, the last one read wins.
# When the file cannot be opened a warning is emitted and the returned hash
# is empty.
sub MassCollecting {
    my ($mass_file) = @_;

    my %hmass;
    my %seen_mass;

    my $fh = _open_file('<', $mass_file);
    if (!$fh) {
        warn "Impossible to open the mass difference list"
            . (defined $mass_file ? " $mass_file" : "") . "\n";
        return %hmass;
    }

    while (my $line = <$fh>) {
        $line =~ s/[\r\n]+\z//;    # handles both LF and CRLF files
        my ($type, $name, $mass) = split(/[\t;]/, $line);
        next unless defined $mass;

        # %hmass is keyed by name first, so masses already met are tracked
        # separately to be able to report duplicates.
        if ($seen_mass{$mass}++) {
            print "The mass difference already exists : $mass !\n";
        }
        $hmass{$name}{$mass} = $type;
    }

    close $fh;
    return %hmass;
}


# _read_rt_mz($combined_DMVM)
#
# Reads the tab-separated table $combined_DMVM (first line = header, first
# column = ion identifier) and returns a flattened hash such that
# $hrtmz{id}{mz} and $hrtmz{id}{rt} hold the values found in the "mzmed"
# (or, failing that, "mz") and "rtmed" (or, failing that, "rt") columns.
# A missing column yields undef values and a single warning. When the file
# cannot be opened a warning is emitted and the returned hash is empty.
sub _read_rt_mz {
    my ($combined_DMVM) = @_;

    my %hrtmz;

    my $fh = _open_file('<', $combined_DMVM);
    if (!$fh) {
        warn "Impossible to open the combined table"
            . (defined $combined_DMVM ? " $combined_DMVM" : "") . "\n";
        return %hrtmz;
    }

    my $header = <$fh>;
    if (!defined $header) {
        close $fh;
        return %hrtmz;
    }
    $header =~ s/[\r\n]+\z//;

    # Column name -> column index (for a repeated name the last index wins).
    my @names = split(/\t/, $header);
    my %col_of;
    @col_of{@names} = (0 .. $#names);

    # The columns to read are the same for every line: resolve them once.
    my $mz_col = exists $col_of{mzmed} ? $col_of{mzmed} : $col_of{mz};
    my $rt_col = exists $col_of{rtmed} ? $col_of{rtmed} : $col_of{rt};
    warn "No 'mzmed' or 'mz' column found in $combined_DMVM\n" unless defined $mz_col;
    warn "No 'rtmed' or 'rt' column found in $combined_DMVM\n" unless defined $rt_col;

    while (my $line = <$fh>) {
        $line =~ s/[\r\n]+\z//;
        my @fields = split(/\t/, $line);
        next unless @fields;    # blank line

        my $id = $fields[0];
        $hrtmz{$id}{mz} = defined $mz_col ? $fields[$mz_col] : undef;
        $hrtmz{$id}{rt} = defined $rt_col ? $fields[$rt_col] : undef;
    }

    close $fh;
    return %hrtmz;
}


########################################################
### Creation of a sif table + correlation filtration ###
########################################################

# sifTableCreation(@args)
#
# Positional arguments actually read:
#   $_[0]  $file             : tab-separated correlation matrix (the first
#                              line holds the column ids, the first column
#                              holds the row ids)
#   $_[1]  $output_sif       : path of the sif file to write
#   $_[5]  $correl_threshold : only coefficients strictly greater than this
#                              value are written
#   $_[8]  $combined_DMVM    : tab-separated table providing m/z and
#                              retention time (see _read_rt_mz)
# The other positions (2-4, 6, 7 and 9-12) are accepted and ignored here.
#
# One line "row_id<TAB>coefficient<TAB>column_id" is written per retained
# pair. Self-correlations are skipped, a pair is considered only the first
# time it is met (whatever the order of its two ids), and coefficients that
# are not numbers (e.g. "NA") are skipped. Negative correlations are not
# retained (the coefficient itself, not its absolute value, is compared to
# the threshold).
#
# Dies when $file or $output_sif cannot be opened, or when the sif file
# cannot be closed properly.
# Returns the list ($output_sif, %hrtmz).
sub sifTableCreation {
    my $file             = $_[0];
    my $output_sif       = $_[1];
    my $correl_threshold = $_[5];
    my $combined_DMVM    = $_[8];

    my %hrtmz = _read_rt_mz($combined_DMVM);

    my $in  = _open_file('<', $file)       or die "Impossible to open $file\n";
    my $out = _open_file('>', $output_sif) or die "Impossible to open $output_sif\n";

    # Header line: ids of the columns of the correlation matrix.
    my @col_ids;
    my $header = <$in>;
    if (defined $header) {
        $header =~ s/[\r\n]+\z//;
        @col_ids = split(/\t/, $header);
    }

    my %seen_pair;

    while (my $line = <$in>) {
        $line =~ s/[\r\n]+\z//;
        my @cells = split(/\t/, $line);
        next unless @cells;    # blank line

        my $row_id = $cells[0];

        for my $i (1 .. $#cells) {
            my $col_id = $col_ids[$i];
            my $coef   = $cells[$i];

            next unless defined $col_id;    # row longer than the header
            next if $row_id eq $col_id;     # correlation of an ion with itself

            #########################
            ### Remove duplicates ###
            #########################

            # Order-independent key; ids cannot contain a tab because the
            # lines were split on tabs.
            my $pair = $row_id lt $col_id
                ? join("\t", $row_id, $col_id)
                : join("\t", $col_id, $row_id);
            next if $seen_pair{$pair}++;

            next unless looks_like_number($coef);

            # Compare abs($coef) instead to also keep negative correlations.
            if ($coef > $correl_threshold) {
                print {$out} "$row_id\t$coef\t$col_id\n";
            }
        }
    }

    close $in;
    close $out or die "Impossible to write $output_sif\n";

    return ($output_sif, %hrtmz);
}


1;