Mercurial > repos > elixir-it > vinyl_optimizer
diff score_complete_alt_M.pl @ 0:4c6529d120c3 draft
Uploaded
author | elixir-it |
---|---|
date | Tue, 09 Jun 2020 16:07:19 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/score_complete_alt_M.pl Tue Jun 09 16:07:19 2020 +0000 @@ -0,0 +1,453 @@ +#!/usr/bin/perl -w +#use strict; + +%arguments= +( +"AD"=>"T", #value mandatory, T==TRUE F==FALSE +"XL"=>"F", #value mandatory, F==FALSE T==TRUE +"vcf"=>"", #file mandatory, provided at runtime +"disease"=>"", #name optional +"similarD"=>"", #file optional +"lgenes"=>"", #file optional +"leQTL"=>"", #file optional +"keywords"=>"kfile", #file mandatory, but default value +"effects"=>"efile", #file mandatory, but default value +"disease_clinvar"=>8, #numeric mandadory, but default value +"score_AF"=>4, #numeric mandatory, but default value +"score_functional"=>8, #numeric mandatory, but default value +"score_NS"=>6, #numeric mandatory, but default value +"score_nIND"=>8, #numeric mandatory, but default value +"AF"=>0.0001, #numeric mandatory, but default value +"scoreeQTL"=>1, #numeric mandatory, but default value +"nind"=>5, #numeric mandatory, but default value +"scoreG"=>2, #numeric mandatory, but default value +"ifile"=>"", #file optional +"scoreT"=>1, #numeric mandatory, but default value +"scoreGW"=>1, #numeric mandatory but default value +"scoreM"=>1, #numeric mandatory but default value +"scoreR"=>1, #numeric mandatory but default value +"scoreSP"=>1, #numeric mandatory but default value +#####OUTPUT file############################################# +"ofile"=>"final_res.csv", #file #OUTPUT #tabulare +"ovcfile"=>"final_res.vcf",#, #file #OUTOUT #vcf +"osummary"=>"detailed_final_res.csv" +); + +@arguments=@ARGV; +for ($i=0;$i<=$#ARGV;$i+=2) +{ + $act=$ARGV[$i]; + $act=~s/-//g; + $val=$ARGV[$i+1]; + if (exists $arguments{$act}) + { + $arguments{$act}=$val; + }else{ + warn("$act: unknown argument\n"); + @valid=keys %arguments; + warn("Valid arguments are @valid\n"); + die("All those moments will be lost in time, like tears in rain.\n Time to die!\n"); + } +} + +$ofile_name=$arguments{"ofile"}; +open(O,">$ofile_name"); +$ovcfile=$arguments{"ovcfile"}; +open(OV,">$ovcfile"); +$osummary_name=$arguments{"osummary"}; +open(OS,">$osummary_name"); +print OS "chr\tstart\tref\talt\tNhom\tNhet\tNind\tGene\tScoreG\tScoreCV\tScoreOth\tScoreAF\tScoreEff\tScoreSP\tScoreTF\tScoremir\tScoreREG\tScoreGWAS\tScoreNS\tScoreQTL\tScoreNi\tScoreT\n"; + +$ifile=$arguments{"ifile"}; +if (-e $ifile) +{ + open(IN,$ifile); + while(<IN>) + { + #($G1,$G2)=(split()); + #push (@{$interact{$G1}},$G2); + } +} + + +$file=$arguments{"vcf"} ; +die ("input file $file not found!\n") unless -e $file; + +$lgenes=$arguments{"lgenes"}; +if (-e $lgenes) +{ + open(IN,$lgenes); + while(<IN>) + { + chomp; + $G=(split())[0]; + $Lgenes{$G}=1; + } +} + + +$kfile=$arguments{"keywords"}; +die ("keyword file $kfile not found!\n") unless -e $kfile; +open(IN,$kfile); +while(<IN>) +{ + chomp; + ($k,$category)=(split())[0,1]; + $specialKeys{$k}=$category; + push (@{$annotKeys{$category}},$k); +} + +$diseaseO=$arguments{"disease"} ? $arguments{"disease"} : "GinocchioValgoDellaLavandaiaZoppa"; +@diseaseO=split(/#/,$diseaseO); + +$sfile=$arguments{"similarD"}; +if (-e $sfile) +{ + open(IN,$sfile); + while(<IN>) + { + chomp; + $Sw=lc($_); + $Sw=~s/\s+//; + push(@kw,$Sw); + } +} + +$efile=$arguments{"effects"}; +die ("effect file $efile not found!\n") unless -e $efile; +open(IN,$efile); +while(<IN>) +{ + chomp; + $effects{$_}=1; +} + +$leQTLfile=$arguments{"leQTL"}; +if (-e $leQTLfile) +{ + open(IN,$leQTLfile); + while(<IN>) + { + chomp(); + $Qlist{$_}=1; + } +} + +$disease_clinvar=$arguments{"disease_clinvar"}; +$score_AF=$arguments{"score_AF"}; +$score_functional=$arguments{"score_functional"}; +$score_NS=$arguments{"score_NS"}; +$score_nIND=$arguments{"score_nIND"}; +$score_QTL=$arguments{"scoreeQTL"}; +$scoreG=$arguments{"scoreG"}; +$scoreM=$arguments{"scoreM"}; +$scoreT=$arguments{"scoreT"}; +$scoreR=$arguments{"scoreR"}; +$scoreGW=$arguments{"scoreGW"}; +$scoreSP=$arguments{"scoreSP"}; + +#print O "CHR\tstart\tgene\tref\talt\tAC\tNhom\tNhet\tNind\tGene\tScoreCV\tScoreOth\tScoreAF\tScoreEff\tScoreNS\tScoreQTL\tScoreNi\tScoreT\n"; +print O "CHR\tstart\tgene\tref\talt\tAC\t"; + +foreach $k (sort keys %specialKeys) +{ + print O "$k\t"; +} +print O "VINYL_score\n"; + + +open(IN,$file); + +#print "$gene_score $disease_HGMD $disease_clinvar $score_functional $score_AF $score_NS $score_eQTL $score_nIND\n"; +while(<IN>) +{ + if ($_=~/^#/) + { + print OV; + next; + } + chomp(); + $summary_line=""; + @val=(split(/\t/)); + $chr=$val[0]; + $start=$val[1]; + $pstart=$val[2]; + $b1=$val[3]; + $b2=$val[4]; + $qscore=$val[5]; + $pb2=$val[6]; + $gene="na"; + $annot=$val[7]; + $gt=$val[8]; + @samples=@val[9..$#val]; + $Nhom=0; + $Nhet=0; + $summary_line.="$chr\t$start\t$b1\t$b2\t"; + foreach $s (@samples) + { + $sid=(split(/\:/,$s))[0]; + next if $sid eq "."; + if ($sid=~/\|/) + { + ($h1,$h2)=(split(/\|/,$sid))[0,1]; + }elsif($sid=~/\//){ + ($h1,$h2)=(split(/\//,$sid))[0,1]; + } + $Nhom++ if $h1==$h2 && $h1 !=0; + $Nhet++ if $h1!=$h2 && ($h1!=0 && $h2!=0) ; + } + $summary_line.="$Nhom\t$Nhet\t"; + $samples=(join("\t",@samples)); + @terms=(split(/\;/,$annot)); + $DIS=0; + if ($_=~/;AC=(\d+);/ || $_=~/\tAC=(\d+);/) + { + $nind=$1; + } + if ($_=~/Gene.refGene=(\w+);/) + { + $gene=$1; + } + $summary_line.="$nind\t$gene\t"; + next if $nind==0; + $gene=(split(/\,/,$gene))[0]; + $i=0; + %keep=(); + $score=0; + %riserva=(7569521=>1, 32974391=>1, 228557681=>1, 228527758=>1, 228525823=>1, 156106964=>1 ); + $score+=6 if $riserva{$start}; + $G=0; + if ($Lgenes{$gene}) + { + $score+=$scoreG; + $G=$scoreG; + } + $summary_line.="$G\t"; + foreach $t (@terms) + { + next unless $t; + ($keep,$value)=(split(/\=/,$t))[0,1]; + $value="." unless ($value); + $keep{$keep}=$value; + } + if ($keep{"CLNSIG"} ne "."){ + $scoreO=0; + $scoreC=0; + $add=0; + $add=$disease_clinvar if ($keep{"CLNSIG"} eq "Pathogenic" || $keep{"CLNSIG"} eq "Pathogenic,_other,_risk_factor" || $keep{"CLNSIG"} eq "pathogenic" || $keep{"CLNSIG"} eq "Pathogenic/Likely_pathogenic" ); + $add=$disease_clinvar/2 if ($keep{"CLNSIG"} eq "Likely_pathogenic" || $keep{"CLNSIG"} eq "Conflicting_interpretations_of_pathogenicity" || $keep{"CLNSIG"} eq "likely-pathogenic"); + $add-=$disease_clinvar/4 if ($keep{"CLNSIG"} eq "Likely_benign" || $keep{"CLNSIG"} eq "Benign/Likely_benign"); + $add-=$disease_clinvar/2 if ($keep{"CLNSIG"} eq "Benign"); + @diseases=split(/\|/,$keep{"CLNDN"}); + @databases=split(/\|/,$keep{"CLNDISDB"}); + for ($i=0;$i<=$#diseases;$i++) + { + $dis=lc $diseases[$i]; + $dat=$databases[$i]; + $MDO=0; + foreach $disOL (@diseaseO) + { + $disOL=lc $disOL; + if ($dis=~ /$disOL/) + { + if ($dat=~/OMIM/) + { + $scoreO=$add; + }else{ + $scoreC=$add; + } + $MDO=1; + last; + } + } + if ($MDO==0) + { + foreach $kv (@kw) + { + if ($dis=~/$kv/) + { + $DIS=1 if $add>0; + if ($dat=~/OMIM/) + { + $scoreO=$add; + }else{ + $scoreC=$add; + } + } + } + } + } + $score+=$scoreO+$scoreC; + $summary_line.="$scoreO\t$scoreC\t"; + #duplicate this block for KW annotated + }else{ + $summary_line.="0\t0\t"; + } + @AFkeys=@{$annotKeys{"AF"}}; + $AF=0; + foreach $AFK (@AFkeys) + { + $LOC_af=$keep{$AFK} ? $keep{$AFK} : 0 ; + $LOC_af=0 if $LOC_af eq "."; + $AF=$LOC_af if $LOC_af>$AF; + } + $AF=$AF/20 if ($chr eq "chr1" && ($start==228557681 || $start==228527758 || $start==228525823)); + if ($AF<=$arguments{"AF"}) #0,00002 + { + $score+=$score_AF; + $summary_line.="$score_AF\t"; + }elsif($AF>$arguments{"AF"} && $AF<=$arguments{"AF"}*4){ + $score+=$score_AF/2; + $summary_line.=$score_AF/2 ."\t"; + }elsif($AF>$arguments{"AF"}*4 && $AF<=0.01){ + $summary_line.="0\t"; + }elsif($AF>0.01){ #commonSNP + $score-=$score_AF/2; + $summary_line.=-$score_AF/2 ."\t"; + } + + @EFFkeys=@{$annotKeys{"Effect"}}; + $EFFS=0; + foreach $EFF (@EFFkeys) + { + $effectO=(split(/\;/,$keep{$EFF}))[0]; + $score+=$score_functional if $effects{$effectO}; + $EFFS+=$score_functional if $effects{$effectO}; + } + $summary_line.="$EFFS\t"; + + @SPKeys=@{$annotKeys{"Splice"}}; + $SPs=0; + foreach $SPk (@SPKeys) + { + next if ($keep{$SPk} eq "."); + if($keep{$SPk}>0.6) + { + $score+=$scoreSP/($#SPKeys+1); + $SPs+=$scoreSP/($#SPKeys+1); + } + } + $summary_line.="$SPs\t"; + + $TFscore=0; + @TFBSkeys=@{$annotKeys{"tfbs"}}; + foreach $T (@TFBSkeys) + { + $score+=$scoreT if $keep{$T} ne "."; + $TFscore+=$scoreT if $keep{$T} ne "."; + } + $summary_line.="$TFscore\t"; + + $mirscore=0; + @mirkeys=@{$annotKeys{"mirna"}}; + foreach $M (@mirkeys) + { + $score+=$scoreM if $keep{$M} ne "."; + $mirscore+=$scoreM if $keep{$M} ne "."; + } + $summary_line.="$mirscore\t"; + + $REGscore=0; + @REGkeys=@{$annotKeys{"Reg"}}; + foreach $R (@REGkeys) + { + $score+=$scoreR if $keep{$R} ne "."; + $REGscore+=$scoreR if $keep{$R} ne "."; + } + $summary_line.="$REGscore\t"; + + $GWscore=0; + if ($keep{"GWAS"} ne ".") + { + $GW=$keep{"GWAS"}; + $GW=lc($GW); + @Gs=(split(/\|/,$GW)); + foreach $Gword (@Gs) + { + next if $Gword eq " "; + $Gword=~s/-/ /g; + foreach $kv (@kw) + { + if ($Gword=~/$kv/) + { + $score+=$scoreGW; + $GWscore+=$scoreGW; + last; + } + } + } + + } + $summary_line.="$GWscore\t"; + + $NSa=0; + #print "Exon " . $keep{"ExonicFunc.refGene"}. " $score\n"; #modify here to use kwords in conf file + if ($keep{"ExonicFunc.refGene"} eq "nonsynonymous_SNV") + { + $ND=0; + @NStools=@{$annotKeys{"NStool"}}; + foreach $t (@NStools) + { + $NSscore=$keep{$t}; + next if $NSscore eq "."; + $NSscore="D" if $t=~/CADD/ && $NSscore>=5; + $ND++ if $NSscore eq "D"; + } + + if ($ND==$#NStools+1) + { + $score+=$score_NS; + $NSa+=$score_NS; + }elsif($ND>=($#NStools+1)/2){ + $score+=$score_NS/(($#NStools+1)/2); + $NSa+=$score_NS/(($#NStools+1)/2); + }elsif($ND==0){ + $score+=$score_NS/($#NStools+1); #da commentare. o NS/4 + $NSa+=$score_NS/($#NStools+1); + } + } + $summary_line.="$NSa\t"; + $iQTL=0; + foreach $QTL (keys %Qlist) + { + next unless $keep{$QTL}; + if ($keep{$QTL} ne ".") + { + $score+=$score_QTL; + $iQTL+=$score_QTL; #if ($keep{$QTL} ne "."); + } + + } + $summary_line.="$iQTL\t"; + if ($nind>=$arguments{"nind"} && $AF<=0.01) + { + $score+=$score_nIND; + $summary_line.="$score_nIND\t"; + }elsif($nind>=$arguments{"nind"}/2 && $nind<$arguments{"nind"} && $AF<=0.01){ + $score+=$score_nIND/2; + $summary_line.=$score_nIND/2 ."\t"; + }else{ + $summary_line.="0\t"; + } + chomp(); + chop($_); + $outL=""; + $IS_MAL=0; + foreach $k (sort keys %specialKeys) + { + unless($keep{$k}) + { + #warn("Malformed $_\n"); + warn("please check $k is missing\n"); + $IS_MAL=1; + } + $outL.="$keep{$k}\t"; + } + next if $IS_MAL==1; + $score=$score/2 if $arguments{"AD"} eq "F" && $Nhom==0; + $score=$score/2 if $arguments{"XL"} eq "T" && $chr ne "chrX"; + $summary_line.="$score\n"; + print O "$chr\t$start\t$gene\t$b1\t$b2\t$nind\t$outL$score\n"; #if $score>=12 || $DIS==1; + print OV "$chr\t$start\t$pstart\t$b1\t$b2\t$qscore\t$pb2\t$annot;VINYL_score=$score\t$gt\t$samples\n"; + print OS "$summary_line"; +}