Mercurial > repos > plus91-technologies-pvt-ltd > ss_test_tool
comparison 2.4/script/Merge_SV.pl @ 0:00b9898b8510 draft
Uploaded
| author | plus91-technologies-pvt-ltd |
|---|---|
| date | Wed, 04 Jun 2014 03:41:27 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:00b9898b8510 |
|---|---|
| 1 #!/usr/bin/perl -w | |
| 2 use Getopt::Long; | |
| 3 use List::Util qw(min max); | |
| 4 | |
| 5 | |
| 6 #Declare variables | |
| 7 my ($window,$tmpSpace,$usage,$help,$outFile); | |
| 8 | |
| 9 GetOptions( | |
| 10 'v=s{2,}' => \@VCF, | |
| 11 'o:s' => \$outFile, | |
| 12 'w:s' => \$window, | |
| 13 'h|help' => \$help | |
| 14 ); | |
| 15 | |
| 16 if((!@VCF)||($help)){&usage();exit} | |
| 17 | |
| 18 | |
| 19 if (!$window) { | |
| 20 $window=500; | |
| 21 } | |
| 22 if (!$outFile) { | |
| 23 $outFile="merged.vcf.out"; | |
| 24 } | |
| 25 ########################################### | |
| 26 # Protect against merging too many results | |
| 27 ########################################### | |
| 28 $tmpSpace='temporarySV_merge'; | |
| 29 if (-e $tmpSpace) { | |
| 30 #Delete temp file if it exists | |
| 31 unlink $tmpSpace; | |
| 32 } | |
| 33 ########################################### | |
| 34 #For each VCF, create a BEDPE file | |
| 35 ########################################### | |
| 36 | |
| 37 open(OUT,">>$tmpSpace") or die "Can't write in this directory\n"; | |
| 38 for (my $i=0;$i<@VCF;$i++){ | |
| 39 #print STDERR "opening $VCF[$i]\n"; | |
| 40 open(VCF,$VCF[$i]) or die &usage(); | |
| 41 while (<VCF>) { | |
| 42 next if ($_=~/^#/); | |
| 43 chomp; | |
| 44 @line=split("\t",$_); | |
| 45 $mate=$line[4]; | |
| 46 $mate=~s/[A-L]|[N-W]|[Z]|\[|\]//g; | |
| 47 @mate=split(/:/,$mate); | |
| 48 $end1a=$line[1]-$window; | |
| 49 $end1b=$line[1]+$window; | |
| 50 $end2a=$mate[1]-$window; | |
| 51 $end2b=$mate[1]+$window; | |
| 52 next if (($end1a<0)||($end2a<0)); | |
| 53 if (($line[0]=~/^chr$/)||($mate[0]=~/^chr$/)) { | |
| 54 next; | |
| 55 } | |
| 56 print OUT "$line[0]\t$end1a\t$end1b\t$mate[0]\t$end2a\t$end2b\n"; | |
| 57 print OUT "$mate[0]\t$end2a\t$end2b\t$line[0]\t$end1a\t$end1b\n"; | |
| 58 } | |
| 59 } | |
| 60 close OUT; | |
| 61 | |
| 62 ########################################### | |
| 63 #Now merge the BEDPE into a unique BEDPE | |
| 64 ########################################### | |
| 65 #Make sure the BEDPE is sorted | |
| 66 #print "Make sure the BEDPE is sorted\n"; | |
| 67 my $tmpSpace2=join("",$tmpSpace,".2"); | |
| 68 system("cat $tmpSpace|sort -k1,1 -k2,3n -k4,4 -k5,5n -u > $tmpSpace2"); | |
| 69 unlink($tmpSpace); | |
| 70 | |
| 71 #Create output files for the left and right merged BEDPE | |
| 72 my $tmpSpace3=join("",$tmpSpace,".3"); | |
| 73 my $tmpSpace4=join("",$tmpSpace,".4"); | |
| 74 | |
| 75 open (OUT1,">$tmpSpace3") or die "Cant write in this directory\n"; | |
| 76 open (OUT2,">$tmpSpace4") or die "Cant write in this directory\n"; | |
| 77 | |
| 78 open(BEDPE,"$tmpSpace2") or die "$tmpSpace2 has already been deleted\n"; | |
| 79 #Initialize positions | |
| 80 #my ($chr1,$pos2,$pos3,$chr2,$pos3,$pos4); | |
| 81 my (@chr,@pos1,@pos2,@chr2,@pos3,@pos4); | |
| 82 while (<BEDPE>) { | |
| 83 ($chr1,$pos2,$pos3,$chr2,$pos3,$pos4)=split("\t",$_); | |
| 84 if(!$Echr1){ | |
| 85 ($Echr1,$Epos1,$Epos2,$Echr2,$Epos3,$Epos4)=split("\t",$_); | |
| 86 } | |
| 87 while ( | |
| 88 ($chr1 =~ /^$Echr1$/)&& | |
| 89 ($pos2 <= $Epos2+$window)&& | |
| 90 ($chr2 =~ /^$Echr2$/)&& | |
| 91 ($pos3 <= $Epos3+$window) | |
| 92 ) | |
| 93 {$nextline = <BEDPE> ; | |
| 94 last if (!$nextline); | |
| 95 $nextline=~chomp; | |
| 96 ($chr1,$pos1,$pos2,$chr2,$pos3,$pos4)=split("\t",$nextline); | |
| 97 #print "NEXTLINE=$nextline"; | |
| 98 push (@chr1,$chr1); | |
| 99 push (@pos1,$pos1); | |
| 100 push (@pos2,$pos2); | |
| 101 push (@chr2,$chr2); | |
| 102 push (@pos3,$pos3); | |
| 103 push (@pos4,$pos4); | |
| 104 } | |
| 105 ($Echr1,$Epos1,$Epos2,$Echr2,$Epos3,$Epos4)=($chr1[0],min(@pos1),max(@pos2),$chr2[-2],min(@pos3),$pos4[-2]); | |
| 106 #print join("\t",$Echr1,$Epos1,$Epos2,$Echr2,$Epos3,$Epos4); | |
| 107 if($pos1>$pos2){my $tmp=$pos1;$pos1=$pos2;$pos2=$tmp} | |
| 108 if($pos1>$pos2){my $tmp=$pos3;$pos3=$pos4;$pos4=$tmp} | |
| 109 print OUT1 join ("\t",$chr1,$pos1,$pos2)."\n"; | |
| 110 print OUT2 join ("\t",$chr2,$pos3,$pos4); | |
| 111 ($Echr1,$Epos1,$Epos2,$Echr2,$Epos3,$Epos4)=($chr1,$pos1,$pos2,$chr2,$pos3,$pos4); | |
| 112 } | |
| 113 close BEDPE; | |
| 114 close OUT; | |
| 115 unlink ($tmpSpace2); | |
| 116 | |
| 117 ##################################################################### | |
| 118 #Now find out for each Unique BEDPE, how many Samples was the SV in? | |
| 119 ##################################################################### | |
| 120 #FOR EACH VCF | |
| 121 #get NAME | |
| 122 | |
| 123 my $tmpSpace5=join("",$tmpSpace,".5"); | |
| 124 my $tmpSpace6=join("",$tmpSpace,".6"); | |
| 125 my $tmpSpace7=join("",$tmpSpace,".7"); | |
| 126 my $tmpSpace8=join("",$tmpSpace,".8"); | |
| 127 my $tmpSpace9=join("",$tmpSpace,".9"); | |
| 128 | |
| 129 #Create a placeholder file | |
| 130 system("paste $tmpSpace3 $tmpSpace4| awk '{OFS=\"\\t\"}{print \$1,\$2,\$3,\$4,\$5,\$6,0,\"NA\"}' > $tmpSpace7"); | |
| 131 #Convert the VCF into a BED PE | |
| 132 for (my $i=0;$i<@VCF;$i++){ | |
| 133 open (OUT,">$tmpSpace5") or die "Cant write in this directory\n"; | |
| 134 open(VCF,$VCF[$i]) ; | |
| 135 print STDERR "Starting on $VCF[$i]\n"; | |
| 136 while (<VCF>) { | |
| 137 next if ($_=~/^#/); | |
| 138 chomp; | |
| 139 @line=split("\t",$_); | |
| 140 $mate=$line[4]; | |
| 141 $mate=~s/[A-L]|[N-W]|[Z]|\[|\]//g; | |
| 142 @mate=split(/:/,$mate); | |
| 143 $end1a=$line[1]-$window; | |
| 144 $end1b=$line[1]+$window; | |
| 145 $end2a=$mate[1]-$window; | |
| 146 $end2b=$mate[1]+$window; | |
| 147 next if (($end1a<0)||($end2a<0)); | |
| 148 if (($line[0]=~/^chr$/)||($mate[0]=~/^chr$/)) { | |
| 149 #print "$_\n"; | |
| 150 next; | |
| 151 } | |
| 152 print OUT "$line[0]\t$end1a\t$end1b\t$mate[0]\t$end2a\t$end2b\n"; | |
| 153 print OUT "$mate[0]\t$end2a\t$end2b\t$line[0]\t$end1a\t$end1b\n"; | |
| 154 } | |
| 155 close VCF; | |
| 156 close OUT; | |
| 157 #for each row in $tmpSpace3, count the number of overlaps on both sides | |
| 158 my $left=join("",$tmpSpace,".left"); | |
| 159 my $right=join("",$tmpSpace,".right"); | |
| 160 system("intersectBed -a $tmpSpace3 -b $tmpSpace5 -loj -c > $left"); | |
| 161 system("intersectBed -a $tmpSpace4 -b $tmpSpace5 -loj -c > $right"); | |
| 162 | |
| 163 my $Lcount=`wc -l $left|cut -f1 -d" "`; | |
| 164 my $Rcount=`wc -l $right|cut -f1 -d" "`; | |
| 165 if ($Lcount != $Rcount){die "Need to check for errors in $left and $right\n\n"} | |
| 166 system("paste $left $right > $tmpSpace5"); | |
| 167 system ("rm $left $right"); | |
| 168 open (IN,"$tmpSpace5") or die "Cant find $tmpSpace5\n"; | |
| 169 open (OUT,">$tmpSpace6") or die "Cant write in this directory\n"; | |
| 170 while(<IN>){ | |
| 171 $_=~chomp; | |
| 172 @lines=split("\t",$_); | |
| 173 if(($lines[3] > 0)&&($lines[6] > 0)){print OUT "1\t$VCF[$i]\n"}else{print OUT "0\t.\n"} | |
| 174 } | |
| 175 close IN; | |
| 176 close OUT; | |
| 177 | |
| 178 system("paste $tmpSpace7 $tmpSpace6 > $tmpSpace8"); | |
| 179 #system("head $tmpSpace7 $tmpSpace8"); | |
| 180 open (IN,"$tmpSpace8") or die "Cant find $tmpSpace8\n"; | |
| 181 open (OUT,">$tmpSpace9") or die "Cant write in this directory\n"; | |
| 182 my ($Samples,$NumSamples,$EVENT); | |
| 183 while(<IN>){ | |
| 184 $_=~chomp; | |
| 185 @lines=split("\t",$_); | |
| 186 | |
| 187 if ($lines[8] > 0){ | |
| 188 $Samples=$lines[7].";".$lines[9]; | |
| 189 $Samples=~s/^NA;//; | |
| 190 $NumSamples=$lines[6]+$lines[8]; | |
| 191 } | |
| 192 else{ | |
| 193 $Samples=$lines[7]; | |
| 194 $NumSamples=$lines[6]; | |
| 195 } | |
| 196 print OUT join ("\t",@lines[0..5],$NumSamples,$Samples)."\n"; | |
| 197 } | |
| 198 close IN; | |
| 199 close OUT; | |
| 200 print STDERR "completed with $VCF[$i]\n"; | |
| 201 system("cp $tmpSpace9 $tmpSpace7"); | |
| 202 } | |
| 203 | |
| 204 system("cp $tmpSpace7 $outFile"); | |
| 205 unlink ($tmpSpace9, $tmpSpace8, $tmpSpace7, $tmpSpace9,$tmpSpace3, $tmpSpace4, $tmpSpace5, $tmpSpace6); | |
| 206 print STDERR "Your results are in $outFile\n"; | |
| 207 | |
| 208 | |
| 209 sub usage(){ | |
| 210 print " | |
| 211 ### | |
| 212 ### This script will merge multiple SoftSearch VCF files | |
| 213 ### | |
| 214 | |
| 215 Usage: Merge_SV.pl -v <vcf1> <vcf2> <vcfN> -w [500] -o <output file> | |
| 216 | |
| 217 Note: Must have bedtools installed and in your path\n\n\n"; | |
| 218 } |
