Mercurial > repos > melpetera > utilitaires
view ACF/lib/IonFiltration.pm @ 22:fff7de4f9fed draft
Uploaded
author | melpetera |
---|---|
date | Thu, 09 May 2019 08:17:53 -0400 |
parents | |
children |
line wrap: on
line source
#!usr/bin/perl package IonFiltration; ### Perl modules use strict; use warnings; ######################################################################## ### Création of a hash containing all adduits and fragments possible ### ######################################################################## sub MassCollecting{ my $mass_file = $_[0]; my %hmass; open (F1, $mass_file); while(my $line = <F1>){ chomp $line; my @tline = split(/[\t;]/, $line); if(defined($hmass{$tline[2]})){ print "Il existe déjà une même différence de masse : $tline[2] !\n"; } $hmass{$tline[1]}{$tline[2]}=$tline[0]; } close F1; # my $count=0; # foreach my $k (keys %hmass){ # $count++; # } return %hmass; } ######################################################## ### Creation of a sif table + correlation filtration ### ######################################################## sub sifTableCreation{ my $file = $_[0]; my $output_sif = $_[1]; my $opt = $_[2]; my $rt_threshold = $_[3]; my $mass_threshold = $_[4]; my $correl_threshold = $_[5]; my $dataMatrix = $_[6]; my $output_tabular = $_[7]; my $combined_DMVM = $_[8]; my $repres_opt = $_[9]; my $intensity_threshold = $_[10]; my $intensity_pourc = $_[11]; my $refhmass = $_[12]; my %hheader_file; my %hduplicate; my %hcorrelgroup; my $groupct=1; my $linenb3=0; my %hheader_line; my %hrtmz; open (F5, $combined_DMVM); while(my $line = <F5>){ chomp $line; my @tline = split(/\t/, $line); if($linenb3 == 0){ for(my $i=0; $i<scalar(@tline);$i++){ my $a = $tline[$i]; $hheader_line{$a}=$i; } } else{ if(defined($hheader_line{mzmed})){ my $b = $tline[$hheader_line{mzmed}]; $hrtmz{$tline[0]}{mz}=$b; } else{ my $b = $tline[$hheader_line{mz}]; $hrtmz{$tline[0]}{mz}=$b; } if(defined($hheader_line{rtmed})){ my $d = $tline[$hheader_line{rtmed}]; $hrtmz{$tline[0]}{rt}=$d; } else{ my $d = $tline[$hheader_line{rt}]; $hrtmz{$tline[0]}{rt}=$d; } } $linenb3 ++; } close F5; my $linenb=0; open (F1, $file) or die "Impossible to open $file\n"; open(F2, ">$output_sif") or die "Impossible to open $output_sif\n"; while(my $line = <F1>){ chomp $line; my @tline = split(/\t/, $line); ############################### ### Création of a sif table ### ############################### if($linenb == 0){ for(my $i=0; $i<scalar(@tline);$i++){ my $a = $tline[$i]; $hheader_file{$i}=$a; } } else{ for(my $i=1; $i<scalar(@tline);$i++){ my $a=$tline[0]; my $b=$hheader_file{$i}; my $coef=$tline[$i]; if($a eq $b){ # print "This is a correlation between A ($a) and A ($b) !\n" } else{ ######################### ### Remove duplicates ### ######################### my $y = $a."/".$b; my $z = $b."/".$a; if((!(defined($hduplicate{$y}))) && (!(defined($hduplicate{$z})))){ $hduplicate{$y}=1; if($coef > $correl_threshold){ print F2 "$a\t$coef\t$b\n"; my $count=0; ###################################################### ### Analytic correlation filtrering follow options ### ###################################################### my $amass=$hrtmz{$a}{mz}; my $atemp=$hrtmz{$a}{rt}; my $bmass= $hrtmz{$b}{mz}; my $btemp=$hrtmz{$b}{rt}; my $diff = $amass-$bmass; $diff = abs($diff); ### Option 1: Don't take into acount mass information ### if($opt == 1){ my $btplus = $btemp + $rt_threshold; my $btmoins = $btemp - $rt_threshold; if(($btmoins <= $atemp) && ($atemp <= $btplus)){ foreach my $k (keys %hcorrelgroup){ if((defined($hcorrelgroup{$k}{$a})) || (defined($hcorrelgroup{$k}{$b}))){ $hcorrelgroup{$k}{$a}=1; $hcorrelgroup{$k}{$b}=1; $count++; last; } } if($count == 0){ my $groupnb="group".$groupct; $hcorrelgroup{$groupnb}{$a}=1; $hcorrelgroup{$groupnb}{$b}=1; $groupct ++; } } } ### Option 2: Check that all mass differences are include in a specific list taking into account RT information ### elsif($opt == 2){ my $print = 0; foreach my $s (keys %{$refhmass}){ foreach my $r (keys %{$refhmass->{$s}}){ my $rm = $r - $mass_threshold; my $rp = $r + $mass_threshold; if(($diff <= $rp) && ($diff >= $rm)){ if($print == 0){ my $btplus = $btemp + $rt_threshold; my $btmoins = $btemp - $rt_threshold; if(($btmoins <= $atemp) && ($atemp <= $btplus)){ # foreach my $s (keys %{%$refhmass{$r}}){ # print F2 "$line\t$hmass{$r}{$s} : $s ($r)\n"; foreach my $k (keys %hcorrelgroup){ if((defined($hcorrelgroup{$k}{$a})) || (defined($hcorrelgroup{$k}{$b}))){ $hcorrelgroup{$k}{$a}=1; $hcorrelgroup{$k}{$b}=1; # $hcorrelgroup{$k}{frag}.="#".$s; $count++; last; } } if($count == 0){ my $groupnb="group".$groupct; $hcorrelgroup{$groupnb}{$a}=1; $hcorrelgroup{$groupnb}{$b}=1; # $hcorrelgroup{$groupnb}{frag}.="#".$s; $groupct ++; } $print = 1; } } } } } } ### Option 3: Check that all mass differences are include in a specific list, ignoring RT information ### elsif($opt == 3){ my $print = 0; foreach my $s (keys %{$refhmass}){ foreach my $r (keys %{$refhmass->{$s}}){ my $rm = $r - $mass_threshold; my $rp = $r + $mass_threshold; if(($diff <= $rp) && ($diff >= $rm)){ if($print == 0){ # foreach my $s (keys %{%$refhmass{$r}}){ # print F2 "$line\t$hmass{$r}{$s} : $s ($r)\n"; foreach my $k (keys %hcorrelgroup){ if((defined($hcorrelgroup{$k}{$a})) || (defined($hcorrelgroup{$k}{$b}))){ $hcorrelgroup{$k}{$a}=1; $hcorrelgroup{$k}{$b}=1; # $hcorrelgroup{$k}{frag}.="#".$s; $count++; last; } } if($count == 0){ my $groupnb="group".$groupct; $hcorrelgroup{$groupnb}{$a}=1; $hcorrelgroup{$groupnb}{$b}=1; # $hcorrelgroup{$groupnb}{frag}.="#".$s; $groupct ++; } $print = 1; } } } } } } } } } } $linenb ++; } close F1; close F2; ############################################# ### Join groups that have been subdivided ### ############################################# my @tdelete; foreach my $k (keys %hcorrelgroup){ foreach my $i (keys %{$hcorrelgroup{$k}}){ foreach my $v (keys %hcorrelgroup){ my $count = 0; if ($v ne $k){ foreach my $w (keys %{$hcorrelgroup{$v}}){ # if($w ne "frag"){ if($w eq $i){ $count = 1; push(@tdelete, $v); } # } } } if($count == 1){ foreach my $w (keys %{$hcorrelgroup{$v}}){ $hcorrelgroup{$k}{$w}=$hcorrelgroup{$v}{$w}; } delete($hcorrelgroup{$v}); } } } } foreach my $t (@tdelete){ delete($hcorrelgroup{$t}); } ### Do it twice to see if it fix the problem of unmerge groups foreach my $k (keys %hcorrelgroup){ foreach my $i (keys %{$hcorrelgroup{$k}}){ foreach my $v (keys %hcorrelgroup){ my $count = 0; if ($v ne $k){ foreach my $w (keys %{$hcorrelgroup{$v}}){ # if($w ne "frag"){ if($w eq $i){ $count = 1; push(@tdelete, $v); } # } } } if($count == 1){ foreach my $w (keys %{$hcorrelgroup{$v}}){ $hcorrelgroup{$k}{$w}=$hcorrelgroup{$v}{$w}; } delete($hcorrelgroup{$v}); } } } } foreach my $t (@tdelete){ delete($hcorrelgroup{$t}); } ########################################## ### Addition of annotation information ### ########################################## foreach my $k (keys %hcorrelgroup){ foreach my $i (keys %{$hcorrelgroup{$k}}){ foreach my $j (keys %{$hcorrelgroup{$k}}){ my $count = 0; if ($i ne $j){ my $a = $hrtmz{$i}{mz}; my $b = $hrtmz{$j}{mz}; my $diff = $a - $b; my $sign; if($diff>0){ $sign="+"; } if($diff<0){ $sign="-"; } $diff = abs($diff); foreach my $z (keys %{$refhmass}){ foreach my $y (keys %{$refhmass->{$z}}){ my $ym = $y - $mass_threshold; my $yp = $y + $mass_threshold; if(($diff <= $yp) && ($diff >= $ym)){ my $diff_list = $diff - $y; $diff_list = abs($diff_list); $diff_list = sprintf ("%0.6f", $diff_list); if($hcorrelgroup{$k}{$i} eq 1){ my $val = "#".$j."|".$sign."(".$z.")(".$diff_list.")|"; $hcorrelgroup{$k}{$i}=$val; $count ++; } else{ if($count == 0){ my $val = "#".$j."|".$sign."(".$z.")(".$diff_list.")|"; $hcorrelgroup{$k}{$i}.=$val; $count ++; } else{ my $val = $sign."(".$z.")(".$diff_list.")|"; $hcorrelgroup{$k}{$i}.=$val; $count ++; } } } } } } } } } #################################################### ### Choose the representative ion for each group ### #################################################### my %hgrouprepres; open(F3, $dataMatrix); while (my $line = <F3>){ chomp $line; my @tline = split (/\t/, $line); foreach my $k (keys %hcorrelgroup){ foreach my $i (keys %{$hcorrelgroup{$k}}){ if($tline[0] eq $i){ $hgrouprepres{$k}{$i}{mass}=$hrtmz{$tline[0]}{mz}; my $intensity; my $nbsubjects=0; for(my $y=1;$y<scalar(@tline);$y++){ $intensity += $tline[$y]; $nbsubjects ++; } my $meanintensity = $intensity/$nbsubjects; $hgrouprepres{$k}{$i}{intensity}=$meanintensity; $hgrouprepres{$k}{$i}{squaredmassint}=($hgrouprepres{$k}{$i}{mass}**2)/($hgrouprepres{$k}{$i}{intensity}); } } } } close F3; foreach my $z (keys %hgrouprepres){ my $max_intensity = 0; my $max_int_ion = ""; my $max_mass = 0; my $max_mass_ion = ""; my $max_squared = 0; my $max_squared_ion = ""; foreach my $w (keys %{$hgrouprepres{$z}}){ if($hgrouprepres{$z}{$w}{intensity} > $max_intensity){ $max_intensity = $hgrouprepres{$z}{$w}{intensity}; $max_int_ion = $w; } if($hgrouprepres{$z}{$w}{mass} > $max_mass){ $max_mass = $hgrouprepres{$z}{$w}{mass}; $max_mass_ion = $w; } if($hgrouprepres{$z}{$w}{squaredmassint} > $max_squared){ $max_squared = $hgrouprepres{$z}{$w}{squaredmassint}; $max_squared_ion = $w; } } my $max_int_max_mass_ion=""; if($repres_opt eq "max_intensity_max_mass"){ my %hfirst; my $first=0; foreach my $w (reverse sort {$hgrouprepres{$z}{$a}{intensity} <=> $hgrouprepres{$z}{$b}{intensity} } keys %{$hgrouprepres{$z}}){ $first ++; if ($first <= 3){ $hfirst{$w} = $hgrouprepres{$z}{$w}{intensity}; } } my $first_2 = 0; my $intens_max = 0; my $mass_max = 0; foreach my $y (reverse sort {$hfirst{$a} <=> $hfirst{$b}} keys %hfirst){ $first_2 ++; if($first_2 == 1){ $intens_max = $hfirst{$y}; if($intensity_threshold > $intens_max){ $intensity_threshold = 0; } $max_int_max_mass_ion = $y; $mass_max = $hgrouprepres{$z}{$y}{mass}; } if($hgrouprepres{$z}{$y}{mass} > $mass_max){ if($hfirst{$y}>$intensity_threshold){ my $a = $intens_max * $intensity_pourc; if($hfirst{$y} > $a){ $max_int_max_mass_ion = $y; $mass_max = $hgrouprepres{$z}{$y}{mass}; } } } } } $hgrouprepres{$z}{max_int}=$max_int_ion; $hgrouprepres{$z}{max_mass}=$max_mass_ion; $hgrouprepres{$z}{max_squared}=$max_squared_ion; $hgrouprepres{$z}{max_int_max_mass}=$max_int_max_mass_ion; } ###################### ### Print result ! ### ###################### open(F4, ">$output_tabular"); open(F5, $combined_DMVM); my $line_nb = 0; my %hheader; while (my $line = <F5>){ chomp $line; my @tline = split (/\t/, $line); if($line_nb == 0){ print F4 "$line\tACF_groups"; if($opt == 1){ if($repres_opt eq "intensity"){print F4 "\tintensity_repres\tACF_filter\n"} if($repres_opt eq "mass"){print F4 "\tmass_repres\tACF_filter\n"} if($repres_opt eq "mixt"){print F4 "\tmass2intens_repres\tACF_filter\n"} if($repres_opt eq "max_intensity_max_mass"){print F4 "\tmax_intensity_max_mass_repres\tACF_filter\n"} } else{ if($repres_opt eq "intensity"){print F4 "\tintensity_repres\tisotopes_adducts_fragments_[#id|annotation(delta_annotation)]\tACF_filter\n"} if($repres_opt eq "mass"){print F4 "\tmass_repres\tisotopes_adducts_fragments_[#id|annotation(delta_annotation)]\tACF_filter\n"} if($repres_opt eq "mixt"){print F4 "\tmass2intens_repres\tisotopes_adducts_fragments_[#id|annotation(delta_annotation)]\tACF_filter\n"} if($repres_opt eq "max_intensity_max_mass"){print F4 "\tmax_intensity_max_mass_repres\tisotopes_adducts_fragments_[#id|annotation(delta_annotation)]\tACF_filter\n"} } ### Creation of a header hash for(my $i=0; $i<scalar(@tline);$i++){ my $a = $tline[$i]; $hheader{$a}=$i; } } else{ my $find = 0; foreach my $v (keys %hcorrelgroup){ if(defined($hgrouprepres{$v}{$tline[0]})){ print F4 "$line\t$v"; if($repres_opt eq "intensity"){print F4 "\t$hgrouprepres{$v}{max_int}\t"} if($repres_opt eq "mass"){print F4 "\t$hgrouprepres{$v}{max_mass}\t"} if($repres_opt eq "mixt"){print F4 "\t$hgrouprepres{$v}{max_squared}\t"} if($repres_opt eq "max_intensity_max_mass"){print F4 "\t$hgrouprepres{$v}{max_int_max_mass}\t"} if($opt != 1){ if(defined($hcorrelgroup{$v}{$tline[0]})){ print F4 "$hcorrelgroup{$v}{$tline[0]}\t"; } else{ print F4 "\t"; } } if($repres_opt eq "intensity"){ if($tline[0] eq $hgrouprepres{$v}{max_int}){ print F4 "conserved\n"; } else{ print F4 "deleted\n"; } $find = 1; } if($repres_opt eq "mass"){ if($tline[0] eq $hgrouprepres{$v}{max_mass}){ print F4 "conserved\n"; } else{ print F4 "deleted\n"; } $find = 1; } if($repres_opt eq "mixt"){ if($tline[0] eq $hgrouprepres{$v}{max_squared}){ print F4 "conserved\n"; } else{ print F4 "deleted\n"; } $find = 1; } if($repres_opt eq "max_intensity_max_mass"){ if($tline[0] eq $hgrouprepres{$v}{max_int_max_mass}){ print F4 "conserved\n"; } else{ print F4 "deleted\n"; } $find = 1; } } } if($find == 0){ $groupct ++; my $group = "group".$groupct; if($opt != 1){ print F4 "$line\t$group\t-\t-\t-\n"; } else{ print F4 "$line\t$group\t-\t-\n"; } } } $line_nb ++; } return ($output_sif, $output_tabular); } 1;