view ACF/lib/IonFiltration.pm @ 22:fff7de4f9fed draft

Uploaded
author melpetera
date Thu, 09 May 2019 08:17:53 -0400
parents
children
line wrap: on
line source

#!usr/bin/perl
package IonFiltration;

### Perl modules
use strict;
use warnings;






########################################################################
### Création of a hash containing all adduits and fragments possible ###
########################################################################


sub MassCollecting{
	
	my $mass_file = $_[0];
	my %hmass;

	open (F1, $mass_file);
	
	while(my $line = <F1>){
		chomp $line;
		my @tline = split(/[\t;]/, $line);
		if(defined($hmass{$tline[2]})){
			print "Il existe déjà une même différence de masse : $tline[2] !\n";
		}
		$hmass{$tline[1]}{$tline[2]}=$tline[0];
	}
	
	close F1;
#	my $count=0;
#	foreach my $k (keys %hmass){
#		$count++;
#	}
	
	return %hmass;
	
	
}







########################################################
### Creation of a sif table + correlation filtration ###
########################################################


sub sifTableCreation{
	
	my $file = $_[0];
	my $output_sif = $_[1];
	my $opt = $_[2];
	my $rt_threshold = $_[3];
	my $mass_threshold = $_[4];
	my $correl_threshold = $_[5];
	my $dataMatrix = $_[6];
	my $output_tabular = $_[7];
	my $combined_DMVM = $_[8];
	my $repres_opt = $_[9];
	my $intensity_threshold = $_[10];
	my $intensity_pourc = $_[11];
	my $refhmass = $_[12];
	
	
	
	
	my %hheader_file;
	my %hduplicate;
	
	my %hcorrelgroup;
	my $groupct=1;

	
	my $linenb3=0;
	my %hheader_line;
	my %hrtmz;
	
	open (F5, $combined_DMVM);
	while(my $line = <F5>){
		chomp $line;
		my @tline = split(/\t/, $line);
		
		if($linenb3 == 0){
			for(my $i=0; $i<scalar(@tline);$i++){
				my $a = $tline[$i];
				$hheader_line{$a}=$i;
			}
		}
		else{
			if(defined($hheader_line{mzmed})){
				my $b = $tline[$hheader_line{mzmed}];
				$hrtmz{$tline[0]}{mz}=$b;
			}
			else{
				my $b = $tline[$hheader_line{mz}];
				$hrtmz{$tline[0]}{mz}=$b;
			}
			if(defined($hheader_line{rtmed})){
				my $d = $tline[$hheader_line{rtmed}];
				$hrtmz{$tline[0]}{rt}=$d;
			}
			else{
				my $d = $tline[$hheader_line{rt}];
				$hrtmz{$tline[0]}{rt}=$d;
			}
		}
		
		$linenb3 ++;
	}
	close F5;
	
	
	my $linenb=0;
	
	open (F1, $file) or die "Impossible to open $file\n";
	open(F2, ">$output_sif") or die "Impossible to open $output_sif\n";
	
	while(my $line = <F1>){
		chomp $line;
		my @tline = split(/\t/, $line);
		
		
		###############################
		### Création of a sif table ###
		###############################
		
		if($linenb == 0){
			for(my $i=0; $i<scalar(@tline);$i++){
				my $a = $tline[$i];
				$hheader_file{$i}=$a;
			}
		}
		else{
			for(my $i=1; $i<scalar(@tline);$i++){
				my $a=$tline[0];
				my $b=$hheader_file{$i};
				my $coef=$tline[$i];
				
				
				if($a eq $b){
	#				print "This is a correlation between A ($a) and A ($b) !\n"
				}
				else{
					
					
					#########################
					### Remove duplicates ###
					#########################
					
					my $y = $a."/".$b;
					my $z = $b."/".$a;
					
					if((!(defined($hduplicate{$y}))) && (!(defined($hduplicate{$z})))){
						
						$hduplicate{$y}=1;
						
						if($coef > $correl_threshold){
						
							print F2 "$a\t$coef\t$b\n";
							
							my $count=0;
							
							######################################################
							### Analytic correlation filtrering follow options ###
							######################################################
							
							
							
								my $amass=$hrtmz{$a}{mz};
								my $atemp=$hrtmz{$a}{rt};
									my $bmass= $hrtmz{$b}{mz};
									my $btemp=$hrtmz{$b}{rt};
									my $diff = $amass-$bmass;
									$diff = abs($diff);
								
									### Option 1: Don't take into acount mass information ###
					
									if($opt == 1){
										my $btplus = $btemp + $rt_threshold;
										my $btmoins = $btemp - $rt_threshold;
										if(($btmoins <= $atemp) && ($atemp <= $btplus)){
											foreach my $k (keys %hcorrelgroup){
												
												if((defined($hcorrelgroup{$k}{$a})) || (defined($hcorrelgroup{$k}{$b}))){
													$hcorrelgroup{$k}{$a}=1;
													$hcorrelgroup{$k}{$b}=1;
													$count++;
													last;
												}
											}
											if($count == 0){
												my $groupnb="group".$groupct;
												$hcorrelgroup{$groupnb}{$a}=1;
												$hcorrelgroup{$groupnb}{$b}=1;
												$groupct ++;
											}
											
										}
									}
									
								
									### Option 2: Check that all mass differences are include in a specific list taking into account RT information ###
									
									elsif($opt == 2){
										
										my $print = 0;
										foreach my $s (keys %{$refhmass}){
											foreach my $r (keys %{$refhmass->{$s}}){
												my $rm = $r - $mass_threshold;
												my $rp = $r + $mass_threshold;
												if(($diff <= $rp) && ($diff >= $rm)){
													if($print == 0){
														my $btplus = $btemp + $rt_threshold;
														my $btmoins = $btemp - $rt_threshold;
														
														if(($btmoins <= $atemp) && ($atemp <= $btplus)){
#														foreach my $s (keys %{%$refhmass{$r}}){
						#									print F2 "$line\t$hmass{$r}{$s} : $s ($r)\n";
															foreach my $k (keys %hcorrelgroup){
																if((defined($hcorrelgroup{$k}{$a})) || (defined($hcorrelgroup{$k}{$b}))){
																	$hcorrelgroup{$k}{$a}=1;
																	$hcorrelgroup{$k}{$b}=1;
#																	$hcorrelgroup{$k}{frag}.="#".$s;
																	$count++;
																	last;
																}
															}
															if($count == 0){
																my $groupnb="group".$groupct;
																$hcorrelgroup{$groupnb}{$a}=1;
																$hcorrelgroup{$groupnb}{$b}=1;
#																$hcorrelgroup{$groupnb}{frag}.="#".$s;
																$groupct ++;
																
															}
															$print = 1;
														}
														
													}
												}
											}
										}
									}
									
								
									### Option 3: Check that all mass differences are include in a specific list, ignoring RT information ###
					
									elsif($opt == 3){
										
										my $print = 0;
										foreach my $s (keys %{$refhmass}){
											foreach my $r (keys %{$refhmass->{$s}}){
												my $rm = $r - $mass_threshold;
												my $rp = $r + $mass_threshold;
												if(($diff <= $rp) && ($diff >= $rm)){
													if($print == 0){
														
#													foreach my $s (keys %{%$refhmass{$r}}){
						#								print F2 "$line\t$hmass{$r}{$s} : $s ($r)\n";
														foreach my $k (keys %hcorrelgroup){
															if((defined($hcorrelgroup{$k}{$a})) || (defined($hcorrelgroup{$k}{$b}))){
																$hcorrelgroup{$k}{$a}=1;
																$hcorrelgroup{$k}{$b}=1;
#																$hcorrelgroup{$k}{frag}.="#".$s;
																$count++;
																last;
															}
														}
														if($count == 0){
															my $groupnb="group".$groupct;
															$hcorrelgroup{$groupnb}{$a}=1;
															$hcorrelgroup{$groupnb}{$b}=1;
#															$hcorrelgroup{$groupnb}{frag}.="#".$s;
															$groupct ++;
														}
														$print = 1;
													}
												}
											}
										}
									}
								}


					}
				}
			}
		}
		$linenb ++;
	}
	close F1;
	close F2;
	

	
	
	#############################################
	### Join groups that have been subdivided ###
	#############################################
	
	my @tdelete;
	
	foreach my $k (keys %hcorrelgroup){
		foreach my $i (keys %{$hcorrelgroup{$k}}){
			foreach my $v (keys %hcorrelgroup){
				my $count = 0;
				if ($v ne $k){
					foreach my $w (keys %{$hcorrelgroup{$v}}){
#						if($w ne "frag"){
							if($w eq $i){
								$count = 1;
								push(@tdelete, $v);
							}	
#						}
					}
				}
				if($count == 1){
					foreach my $w (keys %{$hcorrelgroup{$v}}){
						$hcorrelgroup{$k}{$w}=$hcorrelgroup{$v}{$w};
					}
					delete($hcorrelgroup{$v});
				}
			}
		}
	}
	
	foreach my $t (@tdelete){
		delete($hcorrelgroup{$t});
	}
	
	
	### Do it twice to see if it fix the problem of unmerge groups
	
	foreach my $k (keys %hcorrelgroup){
		foreach my $i (keys %{$hcorrelgroup{$k}}){
			foreach my $v (keys %hcorrelgroup){
				my $count = 0;
				if ($v ne $k){
					foreach my $w (keys %{$hcorrelgroup{$v}}){
#						if($w ne "frag"){
							if($w eq $i){
								$count = 1;
								push(@tdelete, $v);
							}	
#						}
					}
				}
				if($count == 1){
					foreach my $w (keys %{$hcorrelgroup{$v}}){
						$hcorrelgroup{$k}{$w}=$hcorrelgroup{$v}{$w};
					}
					delete($hcorrelgroup{$v});
				}
			}
		}
	}
	
	foreach my $t (@tdelete){
		delete($hcorrelgroup{$t});
	}
	
	
	
	##########################################
	### Addition of annotation information ###
	##########################################
	
	foreach my $k (keys %hcorrelgroup){
		foreach my $i (keys %{$hcorrelgroup{$k}}){
			foreach my $j (keys %{$hcorrelgroup{$k}}){
				my $count = 0;
				if ($i ne $j){
					
					my $a = $hrtmz{$i}{mz};
					my $b = $hrtmz{$j}{mz};
					
					my $diff = $a - $b;
					my $sign;
					if($diff>0){
						$sign="+";
					}
					if($diff<0){
						$sign="-";
					}
					$diff = abs($diff);
					
					foreach my $z (keys %{$refhmass}){
						
						foreach my $y (keys %{$refhmass->{$z}}){
							my $ym = $y - $mass_threshold;
							my $yp = $y + $mass_threshold;
							
							
							if(($diff <= $yp) && ($diff >= $ym)){
								my $diff_list = $diff - $y;
								$diff_list = abs($diff_list);
								$diff_list = sprintf ("%0.6f", $diff_list);
								
								if($hcorrelgroup{$k}{$i} eq 1){
									my $val = "#".$j."|".$sign."(".$z.")(".$diff_list.")|";
									$hcorrelgroup{$k}{$i}=$val;
									$count ++;
								}
								else{
									if($count == 0){
										my $val = "#".$j."|".$sign."(".$z.")(".$diff_list.")|";
										$hcorrelgroup{$k}{$i}.=$val;
										$count ++;
									}
									else{
										my $val = $sign."(".$z.")(".$diff_list.")|";
										$hcorrelgroup{$k}{$i}.=$val;
										$count ++;
									}
								}
							}
						}
					}
				}
			}
		}
	}
	
	
	
	####################################################
	### Choose the representative ion for each group ###
	####################################################
	
	my %hgrouprepres;
		
	open(F3, $dataMatrix);
	
	while (my $line = <F3>){
		chomp $line;
		
		my @tline = split (/\t/, $line);
		
		
		
			foreach my $k (keys %hcorrelgroup){
				foreach my $i (keys %{$hcorrelgroup{$k}}){
					
					if($tline[0] eq $i){
						$hgrouprepres{$k}{$i}{mass}=$hrtmz{$tline[0]}{mz};
					
						my $intensity;
						my $nbsubjects=0;
						for(my $y=1;$y<scalar(@tline);$y++){
							$intensity += $tline[$y];
							$nbsubjects ++;
						}
						my $meanintensity = $intensity/$nbsubjects;
						$hgrouprepres{$k}{$i}{intensity}=$meanintensity;
						$hgrouprepres{$k}{$i}{squaredmassint}=($hgrouprepres{$k}{$i}{mass}**2)/($hgrouprepres{$k}{$i}{intensity});
					}
				}
			}
	}
	close F3;
	
	foreach my $z (keys %hgrouprepres){
		my $max_intensity =  0;
		my $max_int_ion = "";
		my $max_mass = 0;
		my $max_mass_ion = "";
		my $max_squared = 0;
		my $max_squared_ion = "";
		foreach my $w (keys %{$hgrouprepres{$z}}){
			if($hgrouprepres{$z}{$w}{intensity} > $max_intensity){
				$max_intensity = $hgrouprepres{$z}{$w}{intensity};
				$max_int_ion = $w;
				
			}
			if($hgrouprepres{$z}{$w}{mass} > $max_mass){
				$max_mass = $hgrouprepres{$z}{$w}{mass};
				$max_mass_ion = $w;
			}
			if($hgrouprepres{$z}{$w}{squaredmassint} > $max_squared){
				$max_squared = $hgrouprepres{$z}{$w}{squaredmassint};
				$max_squared_ion = $w;
			}
		}
		
		my $max_int_max_mass_ion="";
		
		if($repres_opt eq "max_intensity_max_mass"){
			my %hfirst;
			my $first=0;
			foreach my $w (reverse sort {$hgrouprepres{$z}{$a}{intensity} <=> $hgrouprepres{$z}{$b}{intensity} } keys %{$hgrouprepres{$z}}){
				$first ++;
				if ($first <= 3){
					$hfirst{$w} = $hgrouprepres{$z}{$w}{intensity};
				}
			}
			
			my $first_2 = 0;
			my $intens_max = 0;
			my $mass_max = 0;
			
			foreach my $y (reverse sort {$hfirst{$a} <=> $hfirst{$b}} keys %hfirst){
				
				
				
				$first_2 ++;
				if($first_2 == 1){
					$intens_max = $hfirst{$y};
					if($intensity_threshold > $intens_max){
						$intensity_threshold = 0;
					}
					$max_int_max_mass_ion = $y;
					$mass_max = $hgrouprepres{$z}{$y}{mass};
				}
				if($hgrouprepres{$z}{$y}{mass} > $mass_max){
					if($hfirst{$y}>$intensity_threshold){
						my $a = $intens_max * $intensity_pourc;
						if($hfirst{$y} > $a){
							$max_int_max_mass_ion = $y;
							$mass_max = $hgrouprepres{$z}{$y}{mass};
						}
					}
				}
				
			}
		}
		
		
		$hgrouprepres{$z}{max_int}=$max_int_ion;
		$hgrouprepres{$z}{max_mass}=$max_mass_ion;
		$hgrouprepres{$z}{max_squared}=$max_squared_ion;
		$hgrouprepres{$z}{max_int_max_mass}=$max_int_max_mass_ion;
		
	}
	
	
	######################
	### Print result ! ###
	######################
	
	open(F4, ">$output_tabular");
	open(F5, $combined_DMVM);
	
	my $line_nb = 0;
	my %hheader;
	while (my $line = <F5>){
		chomp $line;
		
		
		my @tline = split (/\t/, $line);
		
		if($line_nb == 0){
			print F4 "$line\tACF_groups";
			if($opt == 1){
				if($repres_opt eq "intensity"){print F4 "\tintensity_repres\tACF_filter\n"}
				if($repres_opt eq "mass"){print F4 "\tmass_repres\tACF_filter\n"}
				if($repres_opt eq "mixt"){print F4 "\tmass2intens_repres\tACF_filter\n"}
				if($repres_opt eq "max_intensity_max_mass"){print F4 "\tmax_intensity_max_mass_repres\tACF_filter\n"}

			}
			else{
				if($repres_opt eq "intensity"){print F4 "\tintensity_repres\tisotopes_adducts_fragments_[#id|annotation(delta_annotation)]\tACF_filter\n"}
				if($repres_opt eq "mass"){print F4 "\tmass_repres\tisotopes_adducts_fragments_[#id|annotation(delta_annotation)]\tACF_filter\n"}
				if($repres_opt eq "mixt"){print F4 "\tmass2intens_repres\tisotopes_adducts_fragments_[#id|annotation(delta_annotation)]\tACF_filter\n"}
				if($repres_opt eq "max_intensity_max_mass"){print F4 "\tmax_intensity_max_mass_repres\tisotopes_adducts_fragments_[#id|annotation(delta_annotation)]\tACF_filter\n"}
			}
			
			
			### Creation of a header hash
			for(my $i=0; $i<scalar(@tline);$i++){
				my $a = $tline[$i];
				$hheader{$a}=$i;
			}
		}
		
		else{
			my $find = 0;
			foreach my $v (keys %hcorrelgroup){
					if(defined($hgrouprepres{$v}{$tline[0]})){
						print F4 "$line\t$v";
						
						
						if($repres_opt eq "intensity"){print F4 "\t$hgrouprepres{$v}{max_int}\t"}
						if($repres_opt eq "mass"){print F4 "\t$hgrouprepres{$v}{max_mass}\t"}
						if($repres_opt eq "mixt"){print F4 "\t$hgrouprepres{$v}{max_squared}\t"}
						if($repres_opt eq "max_intensity_max_mass"){print F4 "\t$hgrouprepres{$v}{max_int_max_mass}\t"}
						

						if($opt != 1){
							if(defined($hcorrelgroup{$v}{$tline[0]})){
								print F4 "$hcorrelgroup{$v}{$tline[0]}\t";
							}
							else{
								print F4 "\t";
							}
						}
						
						if($repres_opt eq "intensity"){
							if($tline[0] eq $hgrouprepres{$v}{max_int}){
								print F4 "conserved\n";
							}
							else{
								print F4 "deleted\n";
							}
							$find = 1;
						}
						if($repres_opt eq "mass"){
							if($tline[0] eq $hgrouprepres{$v}{max_mass}){
								print F4 "conserved\n";
							}
							else{
								print F4 "deleted\n";
							}
							$find = 1;
						}
						if($repres_opt eq "mixt"){
							if($tline[0] eq $hgrouprepres{$v}{max_squared}){
								print F4 "conserved\n";
							}
							else{
								print F4 "deleted\n";
							}
							$find = 1;
						}
						if($repres_opt eq "max_intensity_max_mass"){
							if($tline[0] eq $hgrouprepres{$v}{max_int_max_mass}){
								print F4 "conserved\n";
							}
							else{
								print F4 "deleted\n";
							}
							$find = 1;
						}
					}
			}
			if($find == 0){
				$groupct ++;
				my $group = "group".$groupct;
				if($opt != 1){
					print F4 "$line\t$group\t-\t-\t-\n";
				}
				else{
					print F4 "$line\t$group\t-\t-\n";
				}	
			}
			
			
		}
		
		$line_nb ++;
	}
	
	
	return ($output_sif, $output_tabular);
}





1;