Mercurial > repos > plus91-technologies-pvt-ltd > ss_test_tool
comparison 2.4/src/SoftSearch.multi.pl @ 0:00b9898b8510 draft
Uploaded
| author | plus91-technologies-pvt-ltd |
|---|---|
| date | Wed, 04 Jun 2014 03:41:27 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:00b9898b8510 |
|---|---|
| 1 #!/usr/bin/perl | |
| 2 | |
| 3 #### | |
| 4 #### Usage: SoftSearch.pl [-lqrmsd] -b <BAM> -f <Genome.fa> -sam <samtools path> -bed <bedtools path> | |
| 5 #### Created 1-30-2012 by Steven Hart, PhD | |
| 6 #### hart.steven@mayo.edu | |
| 7 #### Required bedtools & samtools to be in path | |
| 8 | |
| 9 #use FindBin; | |
| 10 #use lib "$FindBin::Bin/lib"; | |
| 11 use lib "/home/plus91/shed_tools/toolshed.g2.bx.psu.edu/repos/plus91-technologies-pvt-ltd/softsearch/2.4/lib" ; | |
| 12 use Getopt::Long; | |
| 13 use strict; | |
| 14 use warnings; | |
| 15 use Data::Dumper; | |
| 16 use LevD; | |
| 17 use File::Basename; | |
| 18 use List::Util qw(min max); | |
| 19 | |
| 20 my (@INPUT_BAM,$INPUT_FASTA,$OUTPUT_FILE,$minSoft,$minSoftReads,$dist_To_Soft,$bedtools,$samtools); | |
| 21 my ($minRP, $temp_output, $num_sd, $MapQ, $chrom, $unmated_pairs, $minBQ, $pair_only, $disable_RP_only); | |
| 22 my ($levD_local_threshold, $levD_distl_threshold,$pe_upper_limit,$high_qual,$sv_only,$blacklist,$genome_file,$verbose); | |
| 23 | |
| 24 my $cmd = ""; | |
| 25 | |
| 26 #Declare variables | |
| 27 GetOptions( | |
| 28 'b=s{1,}' => \@INPUT_BAM, | |
| 29 'f=s' => \$INPUT_FASTA, | |
| 30 'o:s' => \$OUTPUT_FILE, | |
| 31 'm:i' => \$minRP, | |
| 32 'l:i' => \$minSoft, | |
| 33 'r:i' => \$minSoftReads, | |
| 34 't:i' => \$temp_output, | |
| 35 's:s' => \$num_sd, | |
| 36 'd:i' => \$dist_To_Soft, | |
| 37 'q:i' => \$MapQ, | |
| 38 'c:s' => \$chrom, | |
| 39 'u:s' => \$unmated_pairs, | |
| 40 'x:s' => \$minBQ, | |
| 41 'p' => \$pair_only, | |
| 42 'g' => \$disable_RP_only, #ignore softclips | |
| 43 'j:s' => \$levD_local_threshold, | |
| 44 'k:s' => \$levD_distl_threshold, | |
| 45 'a:s' => \$pe_upper_limit, | |
| 46 'e:s' => \$high_qual, | |
| 47 'L' => \$sv_only, | |
| 48 'v' => \$verbose, | |
| 49 'blacklist:s' => \$blacklist, | |
| 50 'genome_file:s' => \$genome_file, | |
| 51 "help|h|?" => \&usage); | |
| 52 #print "Using @INPUT_BAM as INPUT_BAM\n"; | |
| 53 unless($sv_only){$sv_only=""}; | |
| 54 my $INPUT_BAM=$INPUT_BAM[0]; | |
| 55 #print "MY NEW INPUT BAM=$INPUT_BAM[0]\n\n";die; | |
| 56 if(defined($INPUT_BAM)){$INPUT_BAM=$INPUT_BAM} else {print usage();die "Where is the BAM file?\n\n"} | |
| 57 if(defined($INPUT_FASTA)){$INPUT_FASTA=$INPUT_FASTA} else {print usage();die "Where is the fasta file?\n\n"} | |
| 58 my ($fn,$pathname) = fileparse($INPUT_BAM,".bam"); | |
| 59 #my $index=`ls $pathname/$fn*bai|head -1`; | |
| 60 #my $index =`ls \${INPUT_BAM%.bam}*bai`; | |
| 61 #print "INDEX=$index\n"; | |
| 62 #if(!$index){die "\n\nERROR: you need index your BAM file\n\n"} | |
| 63 my $index=""; | |
| 64 ### get current time | |
| 65 print "Start Time : " . &spGetCurDateTime() . "\n"; | |
| 66 my $now = time; | |
| 67 | |
| 68 #if(defined($OUTPUT_FILE)){$OUTPUT_FILE=$OUTPUT_FILE} else {$OUTPUT_FILE="output.vcf"; print "\nNo outfile specified. Using output.vcf as default\n\n"} | |
| 69 if(defined($minSoft)){$minSoft=$minSoft} else {$minSoft=5} | |
| 70 if(defined($minRP)){$minRP=$minRP} else {$minRP=5} | |
| 71 if(defined($minSoftReads)){$minSoftReads=$minSoftReads} else {$minSoftReads=5} | |
| 72 if(defined($dist_To_Soft)){$dist_To_Soft=$dist_To_Soft} else {$dist_To_Soft=300} | |
| 73 if(defined($num_sd)){$num_sd=$num_sd} else {$num_sd=6} | |
| 74 if(defined($MapQ)){$MapQ=$MapQ} else {$MapQ=20} | |
| 75 | |
| 76 unless (defined $pe_upper_limit) { $pe_upper_limit = 10000; } | |
| 77 unless (defined $levD_local_threshold) { $levD_local_threshold = 0.05; } | |
| 78 unless (defined $levD_distl_threshold) { $levD_distl_threshold = 0.05; } | |
| 79 #Get sample name if available | |
| 80 my $SAMPLE_NAME=""; | |
| 81 my $OUTNAME =""; | |
| 82 $SAMPLE_NAME=`samtools view -f2 -H $INPUT_BAM|awk '{if(\$1~/^\@RG/){sub("ID:","",\$2);name=\$2;print name}}'|head -1`; | |
| 83 $SAMPLE_NAME=~s/\n//g; | |
| 84 if (!$OUTPUT_FILE){ | |
| 85 if($SAMPLE_NAME ne ""){$OUTNAME=$SAMPLE_NAME.".vcf"} | |
| 86 else {$OUTNAME="output.vcf"} | |
| 87 } | |
| 88 else{$OUTNAME=$OUTPUT_FILE} | |
| 89 | |
| 90 print "Writing results to $OUTNAME\n"; | |
| 91 | |
| 92 | |
| 93 ##Make sure if submitting on SGE, to prepned the "chr". Not all referecne FAST files require "chr", so we shouldn't force the issue. | |
| 94 if(!defined($chrom)){$chrom=""} | |
| 95 if(!defined($unmated_pairs)){$unmated_pairs=0} | |
| 96 | |
| 97 my $badQualValue=chr($MapQ); | |
| 98 if(defined($minBQ)){ $badQualValue=chr($minBQ); } | |
| 99 | |
| 100 if($badQualValue eq "#"){$badQualValue="\#"} | |
| 101 | |
| 102 # adding and cheking for samtools and bedtools in the PATh | |
| 103 ## check for bedtools and samtools in the path | |
| 104 $bedtools=`which intersectBed` ; | |
| 105 if(!defined($bedtools)){die "\nError:\n\tno bedtools. Please install bedtools and add to the path\n";} | |
| 106 #$samtools=`samtools 2>&1`; | |
| 107 $samtools=`which samtools`; | |
| 108 if($samtools !~ /(samtools)/i){die "\nError:\n\tno samtools. Please install samtools and add to the path\n";} | |
| 109 | |
| 110 print "Usage = SoftSearch.pl -l $minSoft -q $MapQ -r $minSoftReads -d $dist_To_Soft -m $minRP -s $num_sd -c $chrom -b @INPUT_BAM -f $INPUT_FASTA -o $OUTNAME \n\n"; | |
| 111 sub usage { | |
| 112 print "\nusage: SoftSearch.pl [-cqlrmsd] -b <BAM> -f <Genome.fa> \n"; | |
| 113 print "\t-q\t\tMinimum mapping quality [20]\n"; | |
| 114 print "\t-l\t\tMinimum length of soft-clipped segment [5]\n"; | |
| 115 print "\t-r\t\tMinimum depth of soft-clipped reads at position [5]\n"; | |
| 116 print "\t-m\t\tMinimum number of discordant read pairs [5]\n"; | |
| 117 print "\t-s\t\tNumber of sd away from mean to be considered discordant [6]\n"; | |
| 118 print "\t-u\t\tNumber of unmated pairs [0]\n"; | |
| 119 print "\t-d\t\tMax distance between soft-clipped segments and discordant read pairs [300]\n"; | |
| 120 print "\t-o\t\tOutput file name [output.vcf]\n"; | |
| 121 print "\t-t\t\tPrint temp files for debugging [no|yes]\n"; | |
| 122 print "\t-c\t\tuse only this chrom or chr:pos1-pos2\n"; | |
| 123 print "\t-p\t\tuse paired-end mode only \n"; | |
| 124 print "\t-g\t\tEnable paired-only seach. This will look for discordant read pairs even without soft clips.\n"; | |
| 125 print "\t-a\t\tset the minimum distance for a discordant read pair without soft-clipping info [10000]\n"; | |
| 126 print "\t-L\t\tFlag to print out even small deletions (low quality)\n"; | |
| 127 print "\t-e\t\tdisable strict quality filtering of base qualities in soft-clipped reads [no]\n"; | |
| 128 print "\t-blacklist\tareas of the genome to skip calling. Requires -genome_file\n"; | |
| 129 print "\t-genome_file\ttab seperated value of chromosome name and length. Only used with -blacklist option\n\n"; | |
| 130 | |
| 131 exit 1; | |
| 132 } | |
| 133 | |
| 134 | |
| 135 ############################################################# | |
| 136 # create temporary variable name | |
| 137 ############################################################# | |
| 138 srand (time ^ $$ ^ unpack "%L*", `ps axww | gzip -f`); | |
| 139 our $random_name=join "", map { ("a".."z")[rand 26] } 1..8; | |
| 140 | |
| 141 ############################################################# | |
| 142 ## create green list | |
| 143 ############################################################## | |
| 144 # | |
| 145 my $new_blacklist=""; | |
| 146 if($blacklist){ | |
| 147 if(!$genome_file){die "if using a blacklist, you must also specify the location of a genome_file | |
| 148 The format of the genome_file should be | |
| 149 chrom size | |
| 150 chr1 249250621 | |
| 151 chr2 243199373 | |
| 152 ... | |
| 153 | |
| 154 If using hg19, you can ge the genome file by | |
| 155 mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e \"select chrom, size from hg19.chromInfo\" > hg19.genome";} | |
| 156 | |
| 157 $cmd=join("","complementBed -i $blacklist -g $genome_file >",$random_name,".bed") ; | |
| 158 system ($cmd); | |
| 159 $new_blacklist=join(""," -L ",$random_name,".bed "); | |
| 160 } | |
| 161 | |
| 162 if($verbose){print "CMD=$cmd\nBlacklist is $new_blacklist\n";} | |
| 163 | |
| 164 | |
| 165 | |
| 166 | |
| 167 | |
| 168 ############################################################# | |
| 169 # Calcualte insert size distribution of properly mated reads | |
| 170 ############################################################# | |
| 171 | |
| 172 #Change for compatability with other operating systems | |
| 173 #my $metrics=`samtools view -q $MapQ -f2 $INPUT_BAM $chrom|cut -f9|head -10000|awk '{if (\$1<0){\$1=-\$1}else{\$1=\$1} sum+=\$1; sumsq+=\$1*\$1} END {print sum/NR, sqrt(sumsq/NR - (sum/NR)**2)}'`; | |
| 174 #print "samtools view -q $MapQ -f2 $INPUT_BAM $chrom|cut -f9|head -10000|awk '{if (\$1<0){\$1=-\$1}else{\$1=\$1} sum+=\$1; sumsq+=\$1*\$1} END {print sum/NR, sqrt(sumsq/NR - (sum/NR)^2)}'\n"; | |
| 175 my $metrics=`samtools view -q $MapQ -f2 $INPUT_BAM $chrom|cut -f9|head -10000|awk '{if (\$1<0){\$1=-\$1}else{\$1=\$1} sum+=\$1; sumsq+=\$1*\$1} END {print sum/NR, sqrt(sumsq/NR - (sum/NR)^2)}'`; | |
| 176 #my ($mean,$stdev)=split(/ /,$metrics); | |
| 177 my ($mean,$stdev)=split(/\s/,$metrics); | |
| 178 $stdev=~s/\n//; | |
| 179 | |
| 180 #print "MEAN=$mean\tSTDEV=$stdev\n\n"; | |
| 181 | |
| 182 my $upper_limit=int($mean+($num_sd*$stdev)); | |
| 183 my $lower_limit=int($mean-($num_sd*$stdev)); | |
| 184 die if (!$mean); | |
| 185 print qq{The mean insert size is $mean +/- $stdev (sd) | |
| 186 The upper limit = $upper_limit | |
| 187 The lower limit = $lower_limit\n | |
| 188 }; | |
| 189 if($lower_limit<0){ | |
| 190 print "Warning!! Given this insert size distribution, we can not call small indels. No other data will be affected\n\n"; | |
| 191 $lower_limit=1; | |
| 192 } | |
| 193 my $tmp_name=join ("",$random_name,".tmp.bam"); | |
| 194 my $random_file_sc = ""; | |
| 195 my $command = ""; | |
| 196 | |
| 197 ############################################################# | |
| 198 # Make sam file that has soft clipped reads | |
| 199 ############################################################# | |
| 200 #give file a name | |
| 201 if(!defined($pair_only)){ | |
| 202 foreach my $bam(@INPUT_BAM){ | |
| 203 $random_file_sc=join ("",$random_name,".sc.sam"); | |
| 204 $command=join ("","samtools view -q $MapQ -F 1024 $bam $chrom $new_blacklist| awk '{OFS=\"\\t\"}{c=0;if(\$6~/S/){++c};if(c == 1){print}}' | perl -ane '\$TR=(\@F[10]=~tr/\#//);if(\$TR<2){print}' >> ", $random_file_sc); | |
| 205 print "Making SAM file of soft-clipped reads from $bam\n"; | |
| 206 if($verbose){ print "$command\n";} | |
| 207 system("$command"); | |
| 208 } | |
| 209 ############################################################# | |
| 210 # Find areas that have deep enough soft-clip coverage | |
| 211 print "Identifying soft-clipped regions that are at least $minSoft bp long iin $random_file_sc\n"; | |
| 212 open (FILE,"$random_file_sc")||die "Can't open soft-clipped sam file $random_file_sc\n"; | |
| 213 | |
| 214 my $tmpfile=join("",$random_file_sc,".sc.passfilter"); | |
| 215 open (OUT,">$tmpfile")||die "Can't write files here!\n"; | |
| 216 | |
| 217 while(<FILE>){ | |
| 218 @_ = split(/\t/, $_); | |
| 219 #### parse CIGAR string and create a hash of array of each operation | |
| 220 my @CIGAR = split(/([0-9]+[SMIDNHXP])/, $_[5]); | |
| 221 my $hash; | |
| 222 map { push(@{$hash->{$2}}, $1) if (/(\d+)([SMIDNHXP])/) } @CIGAR; | |
| 223 | |
| 224 #for ($i=0; $i<=$#softclip_pos; $i++) { | |
| 225 foreach my $softclip (@{$hash->{S}}) { | |
| 226 #if ($CIGAR[$softclip_pos[$i]] > $minSoft){ | |
| 227 if ($softclip > $minSoft){ | |
| 228 ###############Make sure base qualities don't have more than 2 bad marks | |
| 229 my $qual=$_[10]; | |
| 230 my $TR=($qual=~tr/$badQualValue//); | |
| 231 if($badQualValue eq "#"){ $TR=($qual=~tr/\#//); } | |
| 232 #Skip the soft clip if there is more than 2 bad qual values | |
| 233 #next if($TR > 2); | |
| 234 # if (!$high_qual){next if($TR > 2);} | |
| 235 print OUT; | |
| 236 last; | |
| 237 } | |
| 238 } | |
| 239 } | |
| 240 close FILE; | |
| 241 close OUT; | |
| 242 | |
| 243 $command=join(" ","mv",$tmpfile,$random_file_sc); | |
| 244 if($verbose){ print "$command\n";} | |
| 245 system("$command"); | |
| 246 } | |
| 247 | |
| 248 ######################################################### | |
| 249 #Stack up SoftClips | |
| 250 ######################################################### | |
| 251 my $random_file=join("",$random_name,".sc.direction.bed"); | |
| 252 if(!defined($pair_only)){ | |
| 253 open (FILE,"$random_file_sc")|| die "Can't open sam file\n"; | |
| 254 #$random_file=join("",$random_name,".sc.direction"); | |
| 255 | |
| 256 print "Calling sides of soft-clips from $random_file_sc\n"; | |
| 257 #\nTMPOUT=$random_file\tINPUT=$random_file_sc\n\n"; | |
| 258 open (TMPOUT,">$random_file")|| die "Can't create tmp file\n"; | |
| 259 | |
| 260 while (<FILE>){ | |
| 261 @_ = split(/\t/, $_); | |
| 262 #### parse CIGAR string and create a hash of array of each operation | |
| 263 my @CIGAR = split(/([0-9]+[SMIDNHXP])/, $_[5]); | |
| 264 my $hash; | |
| 265 map { push(@{$hash->{$2}}, $1) if (/(\d+)([SMIDNHXP])/) } @CIGAR; | |
| 266 | |
| 267 #### next if softclips on each end | |
| 268 next if ($_[5] =~ /^[0-9]+S.*S$/); | |
| 269 | |
| 270 #### next softclip occurs in the middle | |
| 271 next if ($_[5] =~ /^[0-9]+[^S][0-9].*S.+$/); | |
| 272 | |
| 273 my $softclip = $hash->{S}[0]; | |
| 274 | |
| 275 my $end1 = 0; | |
| 276 my $end2 = 0; | |
| 277 my $softBases = ""; | |
| 278 my $right_corrected="";my $left_corrected=""; | |
| 279 | |
| 280 if ($softclip > $minSoft) { | |
| 281 | |
| 282 ####If the soft clip occurs at end of read and its on the minus strand, then it's a right clip | |
| 283 if ($_[5] =~ /^.*S$/) { | |
| 284 $end1=$_[3]+length($_[9])-$softclip-1; | |
| 285 $end2=$end1+1; | |
| 286 next if ($end1<0); | |
| 287 #RIGHT clip on Minus | |
| 288 $softBases=substr($_[9], length($_[9])-$softclip, length($_[9])); | |
| 289 #Right clips don't always get clipped correctly, so fix that | |
| 290 # Check to see if sc base matches ref | |
| 291 $right_corrected=baseCheck($_[2],$end2,"right",$softBases); | |
| 292 print TMPOUT "$right_corrected\n" | |
| 293 | |
| 294 } else { | |
| 295 #### Begins with S (left clip) | |
| 296 $end1=$_[3]-$softclip; | |
| 297 next if ($end1<0); | |
| 298 | |
| 299 $softBases=substr($_[9], 0,$softclip);#print "TMP=$softBases\n"; | |
| 300 $left_corrected=baseCheck($_[2],$end1,"left",$softBases); | |
| 301 if(!$left_corrected){print "baseCheck($_[2],$end1,left,$softBases)\n";next} | |
| 302 print TMPOUT "$left_corrected\n"; | |
| 303 #print "\nSEQ=$_[9]\t\n"; | |
| 304 | |
| 305 } | |
| 306 } | |
| 307 } | |
| 308 close FILE; | |
| 309 close TMPOUT; | |
| 310 } | |
| 311 sub baseCheck{ | |
| 312 my ($chrom,$pos,$direction,$softBases)=@_; | |
| 313 #skip if position is less than 0, which is caused by MT DNA | |
| 314 return if ($pos<0); | |
| 315 my $exit=""; | |
| 316 | |
| 317 while(!$exit){ | |
| 318 if($direction=~/right/){ | |
| 319 my $refBase=getSeq($chrom,$pos,$INPUT_FASTA); | |
| 320 my $softBase=substr($softBases,0,1); | |
| 321 if ($softBase !~ /$refBase/){ | |
| 322 my $value=join("\t",$chrom,$pos,$pos+1,join("|",$softBases,$direction)); | |
| 323 $exit="STOP"; | |
| 324 return $value; | |
| 325 } | |
| 326 else{ | |
| 327 $pos=$pos+1; | |
| 328 $softBases=substr($softBases, 1,length($softBases)); | |
| 329 } | |
| 330 } | |
| 331 else{ | |
| 332 my $refBase=getSeq($chrom,$pos+1,$INPUT_FASTA); | |
| 333 my $softBase=substr($softBases,-1,1); | |
| 334 if ($softBase !~ /$refBase/){ | |
| 335 $pos=$pos-1+length($softBases); | |
| 336 my $value=join("\t",$chrom,$pos-1,$pos,join("|",$softBases,$direction)); | |
| 337 $exit="STOP"; | |
| 338 return $value; | |
| 339 } | |
| 340 else{ | |
| 341 $pos=$pos-1; | |
| 342 $softBases=substr($softBases, 0, -1); | |
| 343 #print "Trying again $softBases\n"; | |
| 344 } | |
| 345 | |
| 346 } | |
| 347 | |
| 348 } | |
| 349 } | |
| 350 #Remove SAM files to conserve space | |
| 351 unlink($random_file_sc); | |
| 352 | |
| 353 | |
| 354 | |
| 355 ### | |
| 356 # | |
| 357 ###################################################### | |
| 358 # Transform Read pair groups into softclip equivalents | |
| 359 ###################################################### | |
| 360 # | |
| 361 # | |
| 362 # | |
| 363 my $v=""; | |
| 364 #if($disable_RP_only){ | |
| 365 print "Running Bam2pair.pl\n"; | |
| 366 print "Looking for discordant read pairs without requiring soft-clipping information\n"; | |
| 367 use FindBin qw($Bin); | |
| 368 my $path=$Bin; | |
| 369 # print"\n\nPATH=$path\n\n"; | |
| 370 if($verbose){$v="-v"} | |
| 371 foreach my $random_file_disc(@INPUT_BAM){ | |
| 372 my $tmp_out=join("",$random_name,"RP.out"); | |
| 373 $command=join("","perl ",$path,"/Bam2pair.pl -b $random_file_disc -o $tmp_out -isize $pe_upper_limit -winsize $dist_To_Soft -min $minRP -chrom $chrom -prefix $random_name -q $MapQ -blacklist $random_name.bed $v"); | |
| 374 if($verbose){ print "$command\n"}; | |
| 375 system("$command"); | |
| 376 $command=join("","perl -ane '\$end1=\@F[1];\$end2=\@F[3];print join(\"\\t\",\@F[0..1],\$end1,\"unknown|left\");print \"\\n\";print join(\"\\t\",\@F[2..3],\$end2,\"unknown|left\");print \"\\n\"' ", $tmp_out," >> ",$random_file); | |
| 377 if($verbose){print "$command\n"}; | |
| 378 system($command); | |
| 379 unlink($tmp_out); | |
| 380 #} | |
| 381 } | |
| 382 | |
| 383 | |
| 384 ###################################################### | |
| 385 unlink("$random_file","$tmp_name","$random_file","$index","$random_name","$new_blacklist") if (-z $random_file || ! -e $random_file) ; | |
| 386 if (-z $random_file || ! -e $random_file){ | |
| 387 print "Softclipped file is empty($random_file).\nNo soft clipping found using desired paramters\n\n"; | |
| 388 open (OUT,">$OUTNAME")||die "Can't write files here!\n"; | |
| 389 &print_header(); | |
| 390 close OUT; | |
| 391 exit; | |
| 392 } | |
| 393 | |
| 394 | |
| 395 ############################################################# | |
| 396 # Make sure there are enough soft-clippped supporting reads | |
| 397 ############################################################# | |
| 398 my $outfile=join("",$random_file,".sc.merge.bed"); | |
| 399 #sortbed -i .sc.direction | mergeBed -nms -d 25 -i stdin > .sc.merge.bed | |
| 400 $command=join(" ","sortBed -i",$random_file," | mergeBed -nms -i stdin","|grep \";\"","|awk '{OFS=\"\t\"}(NF==4)'",">",$outfile); | |
| 401 | |
| 402 #print "$command\n"; | |
| 403 system("$command"); | |
| 404 | |
| 405 if (-z $outfile || ! -e $outfile){ | |
| 406 unlink("$tmp_name","$random_file","$outfile","$index","$random_name","$new_blacklist"); | |
| 407 print "mergeBed file is empty.\nNo strucutral variants found\n\n" ; | |
| 408 open (OUT,">$OUTNAME")||die "Can't write files here!\n"; | |
| 409 &print_header(); | |
| 410 close OUT; | |
| 411 exit; | |
| 412 } | |
| 413 | |
| 414 print "completed mergeBed\n"; | |
| 415 | |
| 416 ############################################################### | |
| 417 # If left and right are on the same line, make into 2 lines | |
| 418 ############################################################### | |
| 419 open (INFILE,$outfile)||die "couldn't open temp file : $. \n\n"; | |
| 420 my $tmp2=join("",$random_name,".sc.fixed.merge.bed"); | |
| 421 #print "INFILE=$outfile\tOUTFILE=$tmp2\n\n"; | |
| 422 #INPUT FORMAT=chr9\t131467\t131473\tATGCTTATTAAAA|left;TTATTAAAAGCATA|left | |
| 423 open (OUTFILE,">$tmp2")||die "couldn't create temp file : $. \n\n"; | |
| 424 while(<INFILE>){ | |
| 425 chomp $_; | |
| 426 my $l = $_; | |
| 427 | |
| 428 my @a = split(/\t/, $l); | |
| 429 my $info = $a[3]; | |
| 430 my @info_arr = split(/\;/, $info); | |
| 431 my @left_arr=(); | |
| 432 my @right_arr=(); | |
| 433 @left_arr = grep(/left/, @info_arr); | |
| 434 @right_arr = grep(/right/, @info_arr); | |
| 435 | |
| 436 #New | |
| 437 my $left = join(";", @left_arr); | |
| 438 my $right = join(";", @right_arr); | |
| 439 $info = join(";", @info_arr); | |
| 440 | |
| 441 if((@left_arr) && (@right_arr)){ | |
| 442 print OUTFILE "$a[0]\t$a[1]\t$a[2]\t$left\n$a[0]\t$a[1]\t$a[2]\t$right\n"; | |
| 443 } else{ | |
| 444 my $all=join("\t",@a[0..2],$info); | |
| 445 print OUTFILE "$all\n"; | |
| 446 } | |
| 447 } | |
| 448 | |
| 449 # make sure output file name is $outfile | |
| 450 $command=join(" ","sed -e '/ /s//\t/g'", $tmp2,"|awk 'BEGIN{OFS=\"\\t\"}(NF==4)'", "|perl -pne 's/ /\t/g'>",$outfile); | |
| 451 system("$command"); | |
| 452 if($verbose){print "$command\n"}; | |
| 453 unlink("$tmp_name","$random_file","$tmp2","$outfile","$index","random_name","$new_blacklist") if (-z $outfile || ! -e $outfile) ; | |
| 454 if (-z $outfile || ! -e $outfile){ | |
| 455 print "Fixed mergeBed file is empty($outfile).\nNo strucutral variants found\n\n"; | |
| 456 open (OUT,">$OUTNAME")||die "Can't write files here!\n"; | |
| 457 &print_header(); | |
| 458 close OUT; | |
| 459 exit; | |
| 460 } | |
| 461 | |
| 462 print "completed fixing mergeBed\n\n"; | |
| 463 | |
| 464 ############################################################### | |
| 465 # Seperate directions of soft clips | |
| 466 ############################################################### | |
| 467 my $left_sc = join("", "left", $tmp2); | |
| 468 my $right_sc = join("", "right", $tmp2); | |
| 469 use FindBin qw($Bin); | |
| 470 #my $path=$Bin; | |
| 471 | |
| 472 $command=join("","grep left ", $tmp2, " |sed -e '/left /s//left\;/g' | sed -e '/ /s//\t/g'|perl ".$path."/direction_filter.pl - >",$left_sc); | |
| 473 system("$command"); | |
| 474 #print "$command\n"; | |
| 475 $command=join("","grep right ", $tmp2, " |sed -e '/right /s//right\;/g' | sed -e '/ /s//\t/g'|perl ".$path."/direction_filter.pl - >",$right_sc); | |
| 476 #$command=join(" ","grep right ", $tmp2, " |sed -e '/right /s//right\;/g' | sed -e '/ /s//\t/g' >",$right_sc); | |
| 477 system("$command"); | |
| 478 #print "$command\n"; | |
| 479 #die "CHECK $right_sc\n"; | |
| 480 | |
| 481 ############################################################### | |
| 482 # Count the number and identify directions of soft clips | |
| 483 ############################################################### | |
| 484 print "Count the number and identify directions of soft clips\n"; | |
| 485 #print "looking in $outfile\n"; | |
| 486 $outfile=join("",$random_name,".sc.fixed.merge.bed"); | |
| 487 #system("ls -lhrt"); | |
| 488 open (INFILE,$outfile)||die "couldn't open temp file\n\n"; | |
| 489 my $tmp3 = join("", $random_file, "predSV"); | |
| 490 open (OUTFILE, ">$tmp3")||die "couldn't create temp file\n\n"; | |
| 491 while(<INFILE>){ | |
| 492 chomp; | |
| 493 @_=split(/\t/,$_); | |
| 494 my $count=tr/\;//; | |
| 495 $count=$count+1; | |
| 496 my $left=0; | |
| 497 my $right=0; | |
| 498 | |
| 499 while ($_ =~ /left/g) { $left++ } # count number of right clips | |
| 500 while ($_ =~ /right/g) { $right++ } # count number of left clips | |
| 501 | |
| 502 ############################################################### | |
| 503 if ($count >= $minSoftReads){ | |
| 504 ####get longets soft-clipped read | |
| 505 my @clips=split(/\;|\|/,$_[3]); | |
| 506 | |
| 507 my ($max, $temp, $temp2, $temp3, $dir, $maxSclip) = (0) x 6; | |
| 508 for (my $i=0; $i<$count; $i++) { | |
| 509 my $plus1=$i+1; | |
| 510 $temp=length($clips[$i]); | |
| 511 $temp2=$clips[$plus1]; | |
| 512 $temp3=$clips[$i]; | |
| 513 | |
| 514 if ($temp > $max){ | |
| 515 $maxSclip=$temp3; | |
| 516 $max =$temp; | |
| 517 $dir=$temp2; | |
| 518 } else { | |
| 519 $max=$max; | |
| 520 $dir=$dir; | |
| 521 $maxSclip=$maxSclip; | |
| 522 } | |
| 523 $i++; | |
| 524 } | |
| 525 my $order2 = join("|", $left, $right); | |
| 526 #print join ("\t",@_[0..2],$maxSclip,$max,$dir,$count,$order2) . "\n"; | |
| 527 print OUTFILE join ("\t",@_[0..2],$maxSclip,$max,$dir,$count,$order2) . "\n"; | |
| 528 } elsif($_=~/unknown/){ | |
| 529 print OUTFILE join ("\t",@_[0..2],"NA","NA","left","NA","NA|NA") . "\n"; | |
| 530 print OUTFILE join ("\t",@_[0..2],"NA","NA","right","NA","NA|NA") . "\n"; | |
| 531 } | |
| 532 ####Format is Chrom,start, end,longest Soft-clip,length of longest Soft-clip, direction of longest soft-clip,#supporting softclips,#right Sclips|#left Sclips | |
| 533 } | |
| 534 close INFILE; | |
| 535 close OUTFILE; | |
| 536 | |
| 537 unlink("$tmp2","$tmp_name","$random_file","$tmp3","$outfile","$index","$random_name","$right_sc","$left_sc","$new_blacklist") if (-z $tmp3 || !-e $tmp3) ; | |
| 538 | |
| 539 if (-z $tmp3 || !-e $tmp3){ | |
| 540 print "No structural variants found while Counting the number and identify directions of soft clips.\n" ; | |
| 541 | |
| 542 # open (OUT,">$OUTNAME")||die "Can't write files here!\n"; | |
| 543 # &print_header(); | |
| 544 # close OUT; | |
| 545 exit; | |
| 546 } | |
| 547 | |
| 548 print "Done counting Softclipped reads\n"; | |
| 549 ############################################################### | |
| 550 #### Print header information | |
| 551 ############################################################### | |
| 552 | |
| 553 | |
| 554 foreach my $random_file_disc(@INPUT_BAM){ | |
| 555 print "Making the header for $random_file_disc\n"; | |
| 556 $SAMPLE_NAME=`samtools view -f2 -H $random_file_disc|awk '{if(\$1~/^\@RG/){sub("ID:","",\$2);name=\$2;print name}}'|head -1`; | |
| 557 $SAMPLE_NAME=~s/\n//g; | |
| 558 if($chrom){$SAMPLE_NAME.=".".$chrom} | |
| 559 | |
| 560 $SAMPLE_NAME.=".vcf"; | |
| 561 open (OUT,">$SAMPLE_NAME")||die "Can't write files here!\n"; | |
| 562 &print_header(); | |
| 563 | |
| 564 # DO the bulk of the work | |
| 565 open (FILE,"$tmp3")|| die "Can't open file\n"; | |
| 566 | |
| 567 while (<FILE>){ | |
| 568 #If left clip {+- or -- or -+ }{+- are uninformative b/c they go upstream} | |
| 569 #If right clip {++ or -- or +-} | |
| 570 chomp $_; | |
| 571 my @res=();my $res; | |
| 572 my $line = $_; | |
| 573 my @info = split(/\t/, $_); | |
| 574 my $i=0; | |
| 575 my $basename=basename($random_file_disc);$i=0; | |
| 576 if($info[5] eq "left") { | |
| 577 $res=bulk_work("left", $line, $random_file_disc); | |
| 578 if(!$res){$res=join("\t",".",".",".",".",".",".",".",".",".",".")}; | |
| 579 $i++; | |
| 580 } | |
| 581 elsif ($info[5] eq "right") { | |
| 582 $res=bulk_work("right", $line, $random_file_disc); | |
| 583 if(!$res){$res=join("\t",".",".",".",".",".",".",".",".",".",".")}; | |
| 584 $i++; | |
| 585 } | |
| 586 if($res){@res=split("\t",$res); | |
| 587 print OUT join("\t",@res)."\n"; | |
| 588 }} | |
| 589 close FILE; | |
| 590 close OUT; | |
| 591 print "Done with $random_file_disc\n\n"; | |
| 592 } | |
| 593 | |
| 594 | |
| 595 | |
| 596 ############################################################################### | |
| 597 ############################################################################### | |
| 598 #### Delete temp files | |
| 599 my $meregedBed=join("",$random_name,".sc.direction.bed.sc.merge.bed"); | |
| 600 | |
| 601 if(defined($temp_output)){$temp_output=$temp_output} else {$temp_output="no"} | |
| 602 | |
| 603 if ($temp_output eq "no"){ | |
| 604 unlink("$tmp_name","$random_file","$tmp2",,"$tmp3","$outfile","$index","$random_name","$right_sc","$left_sc","$meregedBed","$random_name.bed"); | |
| 605 } | |
| 606 ####Sort VCF | |
| 607 #my $tmp=join(".",$random_name,"tmp"); | |
| 608 #Get header | |
| 609 #$cmd="grep \"#\" $OUTNAME > $tmp"; | |
| 610 #system($cmd); | |
| 611 #sort results | |
| 612 #$cmd="grep -v \"#\" $OUTNAME|perl -pne 's/chr//'|sort -k1,1n -k2,2n|perl -ne 'print \"chr\".\$_' >>$tmp"; | |
| 613 #system($cmd); | |
| 614 #$cmd="mv $tmp $OUTNAME"; | |
| 615 #system($cmd); | |
| 616 #remove entries next to each other | |
| 617 | |
| 618 | |
| 619 print "Analysis Completed\n\nYou did it!!!\n"; | |
| 620 print "Finish Time : " . &spGetCurDateTime() . "\n"; | |
| 621 $now = time - $now; | |
| 622 printf("\n\nTotal running time: %02d:%02d:%02d\n\n", int($now / 3600), int(($now % 3600) / 60), | |
| 623 int($now % 60)); | |
| 624 | |
| 625 exit; | |
| 626 | |
| 627 ############################################################################### | |
| 628 sub rev_comp { | |
| 629 my $dna = shift; | |
| 630 my $revcomp = reverse($dna); | |
| 631 $revcomp =~ tr/ACGTacgt/TGCAtgca/; | |
| 632 return $revcomp; | |
| 633 } | |
| 634 | |
| 635 | |
| 636 ############################################################################### | |
| 637 #### to get reference base | |
| 638 sub getSeq{ | |
| 639 my ($chr,$pos,$fasta)=@_; | |
| 640 #don't require chr | |
| 641 #if($chr !~ /^chr/){die "$chr is not correct\n";} | |
| 642 # die "$pos is not a number\n" if ($pos <0); | |
| 643 my @result=(); | |
| 644 if ($pos <0){print "$pos is not a valid position (likely caused by circular MT chromosome)\n";return;} | |
| 645 | |
| 646 @result = `samtools faidx $fasta $chr:$pos-$pos`; | |
| 647 if($result[1]){chomp($result[1]); | |
| 648 return uc($result[1]); | |
| 649 } | |
| 650 return("NA"); | |
| 651 #### after return will not be printed | |
| 652 ####print "RESULTS=@result\n"; | |
| 653 } | |
| 654 | |
| 655 sub getBases{ | |
| 656 my ($chr,$pos1,$pos2,$fasta)=@_; | |
| 657 #don't require chr | |
| 658 #if($chr !~ /^chr/){die "$chr is not correct\n";} | |
| 659 my @result=(); | |
| 660 if ($pos1 <0){print "$pos1 is not a valid position (likely caused by circular MT chromosome)\n";return;}; | |
| 661 | |
| 662 @result = `samtools faidx $fasta $chr:$pos1-$pos2`; | |
| 663 if(!$result[1]){$result[1]="NA"}; | |
| 664 chomp($result[1]); | |
| 665 return uc($result[1]); | |
| 666 | |
| 667 #### after return will not be printed | |
| 668 ####print "RESULTS=@result\n"; | |
| 669 } | |
| 670 ############################################################################### | |
| 671 #### to get time | |
| 672 sub spGetCurDateTime { | |
| 673 my ($sec, $min, $hour, $mday, $mon, $year) = localtime(); | |
| 674 my $curDateTime = sprintf "%4d-%02d-%02d %02d:%02d:%02d", | |
| 675 $year+1900, $mon+1, $mday, $hour, $min, $sec; | |
| 676 return ($curDateTime); | |
| 677 } | |
| 678 | |
| 679 | |
| 680 ############################################################################### | |
| 681 #### print header | |
| 682 sub print_header { | |
| 683 my $date=&spGetCurDateTime(); | |
| 684 my $header = qq{##fileformat=VCFv4.1 | |
| 685 ##fileDate=$date | |
| 686 ##source=SoftSearch.pl | |
| 687 ##reference=$INPUT_FASTA | |
| 688 ##Usage= SoftSearch.pl -l $minSoft -q $MapQ -r $minSoftReads -d $dist_To_Soft -m $minRP -u $unmated_pairs -s $num_sd -b @INPUT_BAM -f $INPUT_FASTA -o $OUTNAME | |
| 689 ##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant"> | |
| 690 ##INFO=<ID=EVENT,Number=1,Type=String,Description="ID of event associated to breakend"> | |
| 691 ##FORMAT=<ID=lSC,Number=1,Type=Integer,Description="Length of the longest soft clips supporting the BND"> | |
| 692 ##FORMAT=<ID=nSC,Number=1,Type=Integer,Description="Number of supporting soft-clips\"> | |
| 693 ##FORMAT=<ID=uRP,Number=1,Type=Integer,Description="Number of unmated read pairs nearby Soft-Clips"> | |
| 694 ##FORMAT=<ID=levD_local,Number=1,Type=Float,Description="Levenstein distance between soft-clipped bases and the area around the original soft-clipped site"> | |
| 695 ##FORMAT=<ID=levD_distl,Number=1,Type=Float,Description="Levenstein distance between the soft-clipped bases and mate location"> | |
| 696 ##FORMAT=<ID=CTX,Number=1,Type=Integer,Description="Number of chromosomal translocations"> | |
| 697 ##FORMAT=<ID=DEL,Number=1,Type=Integer,Description="Number of reads supporting Large Deletions"> | |
| 698 ##FORMAT=<ID=INS,Number=1,Type=Integer,Description="Number of reads supporting Large insertions"> | |
| 699 ##FORMAT=<ID=NOV_INS,Number=1,Type=Integer,Description="Number of reads supporting novel sequence insertion"> | |
| 700 ##FORMAT=<ID=INV,Number=1,Type=Integer,Description="Number of reads supporting inversions"> | |
| 701 ##FORMAT=<ID=sDEL,Number=1,Type=Integer,Description="Number of reads supporting novel sequence insertion"> | |
| 702 ##INFO=<ID=NO_MATE_SC,Number=1,Type=Flag,Description="When there is no softclipping of the mate read location, an appromiate position is used"> | |
| 703 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Dummy value for maintaining VCF-Spec"> | |
| 704 #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t$SAMPLE_NAME\n}; | |
| 705 | |
| 706 print OUT $header; | |
| 707 } | |
| 708 | |
| 709 | |
| 710 ############################################################################### | |
| 711 sub bulk_work { | |
| 712 my ($side, $line, $file) = @_; | |
| 713 my $local_levD = 0; | |
| 714 my $distl_levD = 0; | |
| 715 | |
| 716 #my @info = split(/\t/, $line); | |
| 717 my @plus_Reads = split(/\t/, $line); | |
| 718 $plus_Reads[7] =~ s/\n//g; | |
| 719 | |
| 720 #### softclip length and softclip size. | |
| 721 my $lSC = $plus_Reads[4]; | |
| 722 my $nSC = $plus_Reads[6]; | |
| 723 | |
| 724 | |
| 725 #Get all types of compatible reads | |
| 726 #Get improperly paired reads (@ max distance) | |
| 727 | |
| 728 #### default value for left SIDE. | |
| 729 #If left-clip, then look downstream for match of softclipped reads to define a deletion, but look for DRPs upstream | |
| 730 my $sv_type = "SVTYPE=BND"; | |
| 731 my $start_local=0; my $end_local=0;my $target_local="";my $target_drp="";my $start_drp="";my $end_drp=""; | |
| 732 if ($side =~ /left/) { | |
| 733 $start_local = $plus_Reads[1]-$dist_To_Soft; | |
| 734 $end_local = $plus_Reads[2]; | |
| 735 $start_drp = $plus_Reads[1]; | |
| 736 $end_drp = $plus_Reads[1]+$dist_To_Soft; | |
| 737 | |
| 738 } | |
| 739 else{ | |
| 740 $start_local = $plus_Reads[1]; | |
| 741 $end_local = $plus_Reads[1]+$dist_To_Soft; | |
| 742 $start_drp = $plus_Reads[1]-$dist_To_Soft; | |
| 743 $end_drp = $plus_Reads[1]; | |
| 744 } | |
| 745 | |
| 746 $target_local=join("", $plus_Reads[0], ":", $start_local, "-", $end_local); | |
| 747 $target_drp=join("", $plus_Reads[0], ":", $start_drp, "-", $end_drp); | |
| 748 my $num_unmapped_pairs=""; | |
| 749 if ($side =~ /right/) { | |
| 750 $num_unmapped_pairs=`samtools view $new_blacklist -q $MapQ -f8 -F 1536 -c $file $target_drp`; | |
| 751 } else { | |
| 752 $num_unmapped_pairs=`samtools view $new_blacklist -q $MapQ -f24 -F 1536 -c $file $target_drp`; | |
| 753 } | |
| 754 if($verbose){print "samtools view $new_blacklist -q $MapQ -f24 -F 1536 -c $file $target_drp\n";} | |
| 755 | |
| 756 $num_unmapped_pairs=~s/\n//; | |
| 757 if($verbose){print "NUM UNMAPPED PAIRS= $num_unmapped_pairs\n";} | |
| 758 my $REF1_base = ""; | |
| 759 my $REF2_base = ""; | |
| 760 my $INFO_1 = ""; | |
| 761 my $INFO_2 = ""; | |
| 762 my $ALT_1 = ""; | |
| 763 my $ALT_2 = ""; | |
| 764 my $isize = 0; | |
| 765 my $QUAL = ""; | |
| 766 my $FORMAT = "GT:"; | |
| 767 | |
| 768 #### get 8 bit rand id | |
| 769 my $BND1_name = join "", map { ("a".."z")[rand 26] } 1..8; | |
| 770 my $BND2_name = join "", map { ("a".."z")[rand 26] } 1..8; | |
| 771 $BND1_name=join "_","BND",$BND1_name; | |
| 772 $BND2_name=join "_","BND",$BND2_name; | |
| 773 | |
| 774 my $counts = {CTX => 0, DEL => 0, INS => 0, INV => 0, TDUP => 0, NOV_INS => 0 }; | |
| 775 my $event_mate_info = {CTX => "", DEL => "", INS => "", INV => "", TDUP => "", NOV_INS => "" }; | |
| 776 | |
| 777 #### get mate pair info and counts per event | |
| 778 foreach my $e (sort keys %{$counts}) { | |
| 779 my $h = get_counts_n_info($e, $side, $MapQ, $file, $dist_To_Soft, $target_drp, $upper_limit, $lower_limit); | |
| 780 | |
| 781 $counts->{$e} = $h->{count}; | |
| 782 $event_mate_info->{$e} = $h->{info}; | |
| 783 } | |
| 784 | |
| 785 my $max = 0; | |
| 786 my $type = "UNKNOWN"; | |
| 787 my $nRP = 0; | |
| 788 my $mate_info = "NA\tNA\tNA\tNA"; | |
| 789 my $summary = "GT:"; | |
| 790 | |
| 791 #### find max count of events and set type, nRP and info to corresponding | |
| 792 #### max count event. | |
| 793 #### also create a summary string of all counts to be added to VCF file. | |
| 794 foreach my $e (sort keys %{$counts}){ | |
| 795 # if ($counts->{$e} >=i $max){ | |
| 796 if ($counts->{$e} > $max){ | |
| 797 $type = $e .",". $counts->{$e}; | |
| 798 $nRP = $counts->{$e}; | |
| 799 | |
| 800 $max = $counts->{$e}; | |
| 801 | |
| 802 if (length($event_mate_info->{$e})) { | |
| 803 $mate_info = $event_mate_info->{$e}; | |
| 804 } | |
| 805 } | |
| 806 | |
| 807 $summary .= $e .",". $counts->{$e} .":"; | |
| 808 } | |
| 809 #print "done with Summaryi=$summary\n"; | |
| 810 #### remove last colon ":" from | |
| 811 $summary =~ s/:$//; | |
| 812 if (($minRP > $max)&&(!$disable_RP_only )){return}; | |
| 813 | |
| 814 #### Run Levenstein distance on softclip in target region to find out if its a small deletion/insetion | |
| 815 #### passing 1: clip_seq, 2: chr, 3: start, 4: end, 5: ref file. | |
| 816 my $levD = new LevD; | |
| 817 ######################################################## | |
| 818 ######################################################## | |
| 819 ######################################################## | |
| 820 | |
| 821 #### redefine start and end location for LevD calc. | |
| 822 # $start = $plus_Reads[1]-$dist_To_Soft; | |
| 823 # $end = $plus_Reads[2]; | |
| 824 my $num_bases_to_loc=0; | |
| 825 my $new_start=0; | |
| 826 my $new_end=0; | |
| 827 my $del_seq=""; | |
| 828 my $start = $start_local; | |
| 829 my $end = $end_local; | |
| 830 if ($lSC=~/NA/){$lSC=0} | |
| 831 | |
| 832 if ($side =~ /right/) { | |
| 833 $levD->search($plus_Reads[3], $plus_Reads[0], $start, $end, $INPUT_FASTA); | |
| 834 $local_levD = sprintf("%.2f", $levD->{relative_edit_dist}); | |
| 835 $num_bases_to_loc=$levD->{index}; | |
| 836 $new_start = $plus_Reads[2]; | |
| 837 if ($plus_Reads[2]=~/^[0-9]/){$new_end=$plus_Reads[2]+$lSC}; | |
| 838 } | |
| 839 else{ | |
| 840 $levD->search($plus_Reads[3], $plus_Reads[0], $start, $end, $INPUT_FASTA); | |
| 841 $local_levD = sprintf("%.2f", $levD->{relative_edit_dist}); | |
| 842 $num_bases_to_loc=$levD->{index}; | |
| 843 if ($plus_Reads[2]=~/^[0-9]/){$new_start=$plus_Reads[2]-$lSC}; | |
| 844 $new_end = $plus_Reads[2]; | |
| 845 } | |
| 846 return if((!$new_start)||(!$new_end)); | |
| 847 return if ($new_start<0); | |
| 848 $del_seq=getBases($plus_Reads[0], $new_start,$new_end,$INPUT_FASTA); | |
| 849 ############################################################################## | |
| 850 # #If there is a match, where is the start position of the match? | |
| 851 # | |
| 852 ############################################################################## | |
| 853 | |
| 854 | |
| 855 #if $plus_Reads[3] eq "NA", then it was found without soft-clipped reads | |
| 856 if($plus_Reads[3] !~ /NA/){ | |
| 857 if (($local_levD < $levD_local_threshold)) { | |
| 858 return if (!$sv_only); | |
| 859 #### add value to summary to be written to vcf file. | |
| 860 $summary = "GT:sDel," . $plus_Reads[6]; | |
| 861 $type = "sDEL"; | |
| 862 ########################################################################### | |
| 863 ##### Printing output | |
| 864 | |
| 865 ######################################### | |
| 866 ##### Get DNA info | |
| 867 ######################################### | |
| 868 #$REF1_base = getSeq($plus_Reads[0], $plus_Reads[1], $INPUT_FASTA); | |
| 869 $REF1_base = substr($del_seq, 0, 1); | |
| 870 | |
| 871 #### this is alt ref. for softclip its $plus_Reads[3] | |
| 872 $REF2_base = $del_seq; | |
| 873 $QUAL = 1/($local_levD + 0.001); | |
| 874 $QUAL = sprintf("%.2f",$QUAL); | |
| 875 $isize = length($del_seq); | |
| 876 | |
| 877 #### svtype = none for sDEL | |
| 878 #### isize = length($info[3]); | |
| 879 #### nRP = NA | |
| 880 #### mate_id = NA | |
| 881 #### CTX,:DEL,:....sDEL,## | |
| 882 $INFO_1=join(";", "SVTYPE=NA", "EVENT=$type", "ISIZE=$isize"); | |
| 883 | |
| 884 #Add Sample infomration | |
| 885 my $FORMAT="GT:sDEL"; | |
| 886 $FORMAT .= ":lSC:nSC:uRP:levD_local"; | |
| 887 my $SAMPLE= "0/1:"; | |
| 888 $SAMPLE .= "$plus_Reads[6]:$lSC:$nSC:$num_unmapped_pairs:$local_levD"; | |
| 889 | |
| 890 #### remove any white spaces. | |
| 891 $INFO_1=~s/\s//g; | |
| 892 $INFO_2=~s/\s//g; | |
| 893 | |
| 894 $BND1_name =~ s/^BND/LEVD/; | |
| 895 # If left, then the start position is plus_Reads[1]-isize | |
| 896 my $start_pos=0; | |
| 897 #Make sure Ref1 and Ref2 bases are different | |
| 898 if($REF2_base eq $REF1_base){$REF1_base="NA"} | |
| 899 if($side=~/left/){$start_pos=$plus_Reads[1]-$isize}else{$start_pos=$plus_Reads[1]}; | |
| 900 my $var=join("\t", $plus_Reads[0], $start_pos, $BND1_name, $REF2_base, $REF1_base, $QUAL, "PASS", $INFO_1,$FORMAT,$SAMPLE); | |
| 901 return $var; | |
| 902 #print OUT join ("\t", $plus_Reads[0], $start_pos, $BND1_name, $REF2_base, $REF1_base, $QUAL, "PASS", $INFO_1,$FORMAT,$SAMPLE, "\n"); | |
| 903 # return; | |
| 904 } | |
| 905 } | |
| 906 | |
| 907 #### Otherwise, look for DRP mate info | |
| 908 #if($nRP=~/NA/){print "MATE_INFO=$mate_info\tSide=$side\tline=$line\n";} | |
| 909 my @mate_info_arr = split(/\t/, $mate_info); | |
| 910 $nRP = $mate_info_arr[3]; | |
| 911 my $mate_chr=$mate_info_arr[0]; | |
| 912 | |
| 913 if((! defined $nRP) || ($nRP =~ /na/i) || ($mate_chr =~ /NA/) ){ | |
| 914 #PRINT UNKNOWN | |
| 915 return if ($nRP =~ /na/i); | |
| 916 #print "There is an unknown\nNRP=$nRP Mate_CHR=$mate_chr minRP=$minRP\n";die; | |
| 917 $summary .= ":unknown," . $plus_Reads[6]; | |
| 918 $type = "unknown"; | |
| 919 $REF1_base = getSeq($plus_Reads[0], $plus_Reads[1], $INPUT_FASTA); | |
| 920 $REF2_base = $plus_Reads[3]; | |
| 921 $BND1_name =~ s/^BND/UNKNOWN/; | |
| 922 $QUAL = 1/($local_levD + 0.001); | |
| 923 $QUAL = sprintf("%.2f",$QUAL); | |
| 924 $INFO_1=join(";", "SVTYPE=unknown", "EVENT=unknown", "ISIZE=unknown"); | |
| 925 #Add Sample infomration | |
| 926 my $FORMAT="GT:sDEL"; | |
| 927 $FORMAT .= ":lSC:nSC:uRP:levD_local"; | |
| 928 my $SAMPLE = "0/1:"; | |
| 929 $SAMPLE .= "$plus_Reads[6]:$lSC:$nSC:$num_unmapped_pairs:$local_levD"; | |
| 930 #### remove any white spaces. | |
| 931 $INFO_1=~s/\s//g; | |
| 932 #print join ("\t", $plus_Reads[0], $plus_Reads[1], $REF2_base, $REF1_base, $QUAL, "PASS", $INFO_1,$FORMAT,$SAMPLE, "\n"); | |
| 933 | |
| 934 #print OUT join ("\t", $plus_Reads[0], $plus_Reads[1], $BND1_name, $REF1_base, $REF2_base, $QUAL, "PASS", $INFO_1,$FORMAT,$SAMPLE, "\n"); | |
| 935 my $var=join("\t", $plus_Reads[0], $plus_Reads[1], $BND1_name, $REF1_base, $REF2_base, $QUAL, "PASS", $INFO_1,$FORMAT,$SAMPLE); | |
| 936 return $var; | |
| 937 | |
| 938 } | |
| 939 | |
| 940 #### end if there is no mate info or nRP+uRP<minRP | |
| 941 return if (($nRP<$minRP)&&($unmated_pairs > ($num_unmapped_pairs+$nRP))); | |
| 942 | |
| 943 ################################################################################## | |
| 944 # Find out if mates have nearby soft-clips (to refine the breakpoints) | |
| 945 ################################################################################## | |
| 946 #Look for evidence of soft-clipping near mate | |
| 947 my @mate_soft_arr = (); | |
| 948 my $mate_start = 0; | |
| 949 my $mate_soft = ""; | |
| 950 | |
| 951 @mate_info_arr = split(/\t/, $mate_info); | |
| 952 | |
| 953 #### mate start and end locations. | |
| 954 my $filename = $right_sc; | |
| 955 | |
| 956 $start = $mate_info_arr[1] - $dist_To_Soft; | |
| 957 $end = $mate_info_arr[1]; | |
| 958 | |
| 959 if ($side =~ /right/) { | |
| 960 $start = $mate_info_arr[2]; | |
| 961 $end = $mate_info_arr[2] + $dist_To_Soft; | |
| 962 | |
| 963 $filename = $left_sc; | |
| 964 } | |
| 965 | |
| 966 #### add levenstein distance to Summary | |
| 967 #print "Calc distal Levd\n"; | |
| 968 $levD->search(rev_comp($plus_Reads[3]), $mate_info_arr[0], $start, $end, $INPUT_FASTA); | |
| 969 $distl_levD = sprintf("%.2f", $levD->{relative_edit_dist}); | |
| 970 $distl_levD = "NA" if($plus_Reads[3] =~ /NA/); | |
| 971 #If there is no softclips to string match, then give 0 as quality value | |
| 972 if ($plus_Reads[3] !~ /NA/){ | |
| 973 $QUAL=1/($distl_levD + 0.001); | |
| 974 } | |
| 975 else { | |
| 976 $QUAL=0; | |
| 977 }; | |
| 978 $QUAL=sprintf("%.2f",$QUAL); | |
| 979 #### looking for softclips to refine break point | |
| 980 #### if left look in right and vice-versa. | |
| 981 $cmd = qq{echo -e "$mate_info_arr[0]\t$start\t$end"}; | |
| 982 $cmd .= qq{ | awk -F'\t' 'NR==3' | intersectBed -a stdin -b $filename | head -1}; | |
| 983 | |
| 984 $mate_soft = `$cmd`; | |
| 985 | |
| 986 $mate_soft =~ s/\n//g; | |
| 987 @mate_soft_arr = split(/\s/, $mate_soft); | |
| 988 my $NO_MATE_SC=""; | |
| 989 if(@mate_soft_arr){ | |
| 990 $mate_chr = $mate_soft_arr[0]; | |
| 991 $mate_start = $mate_soft_arr[1]; | |
| 992 $NO_MATE_SC="APPROXIMATE"; | |
| 993 | |
| 994 } else{ | |
| 995 @mate_info_arr = split(/\s/,$mate_info); | |
| 996 $mate_chr = $mate_info_arr[0]; | |
| 997 $mate_start = $mate_info_arr[1]; | |
| 998 } | |
| 999 | |
| 1000 #end if there is no mate info | |
| 1001 return if ($mate_chr eq ""); | |
| 1002 #end if there is no mate info and !disable_RP_only | |
| 1003 return if (($lSC =~/NA/)&&(!$disable_RP_only)); | |
| 1004 | |
| 1005 | |
| 1006 ########################################################################### | |
| 1007 ##### Printing output | |
| 1008 | |
| 1009 ######################################### | |
| 1010 # Get DNA info | |
| 1011 ######################################### | |
| 1012 #print "PLUS_READS=$plus_Reads[0],$plus_Reads[1]\nMATE=$mate_chr,$mate_start,$INPUT_FASTA\n"; | |
| 1013 $REF1_base = getSeq($plus_Reads[0], $plus_Reads[1], $INPUT_FASTA); | |
| 1014 | |
| 1015 ### this is alt ref. for softclip its $plus_Reads[3] | |
| 1016 $REF2_base = getSeq($mate_chr, $mate_start, $INPUT_FASTA); | |
| 1017 | |
| 1018 ######################################### | |
| 1019 # print in VCF format | |
| 1020 ######################################### | |
| 1021 | |
| 1022 #### abs value to account for left and right reads. | |
| 1023 $isize = abs($plus_Reads[1]-$mate_start); | |
| 1024 | |
| 1025 my $event_type=$type; | |
| 1026 $event_type=~ s/,|[0-9]//g; | |
| 1027 $INFO_1=join(";", "$sv_type", "EVENT=$event_type", "ISIZE=$isize","MATE_ID=$BND2_name"); | |
| 1028 $INFO_2=join(";", "$sv_type", "EVENT=$event_type", "ISIZE=$isize","MATE_ID=$BND1_name"); | |
| 1029 | |
| 1030 #### remove any white spaces. | |
| 1031 #### ask: did you mean to remove space from ends? eg. trim() | |
| 1032 $INFO_1=~s/\s//g; | |
| 1033 $INFO_2=~s/\s//g; | |
| 1034 | |
| 1035 $FORMAT=$summary; | |
| 1036 $FORMAT=~ s/,|[0-9]//g; | |
| 1037 $FORMAT .= ":lSC:nSC:uRP:distl_levD"; | |
| 1038 if($NO_MATE_SC){$INFO_2 .= ":NO_MATE_SC"} | |
| 1039 my $SAMPLE="0/1:"; | |
| 1040 $SAMPLE .=$summary; | |
| 1041 # if($NO_MATE_SC){$SAMPLE.= ":$NO_MATE_SC"} | |
| 1042 | |
| 1043 $SAMPLE=~s/[A-Z|,|_]//g; | |
| 1044 my $MATE_SAMPLE=$SAMPLE; | |
| 1045 $SAMPLE .= ":$lSC:$nSC:$num_unmapped_pairs:$distl_levD"; | |
| 1046 $MATE_SAMPLE .=":NA:NA:NA:NA"; | |
| 1047 $SAMPLE=~s/::/:/g; | |
| 1048 $MATE_SAMPLE=~s/::/:/g; | |
| 1049 | |
| 1050 if($type !~ /INV/){ | |
| 1051 $ALT_1 = join("","]",$mate_chr,":",$mate_start,"]",$REF1_base); | |
| 1052 $ALT_2 = join("",$REF2_base,"[",$plus_Reads[0],":",$plus_Reads[1],"["); | |
| 1053 # 2 321682 bnd_V T ]13:123456]T 6 PASS SVTYPE=BND | |
| 1054 # 13 123456 bnd_U C C[2:321682[ 6 PASS SVTYPE=BND | |
| 1055 } else { | |
| 1056 $ALT_1 = join("", "]", $plus_Reads[0], ":", $plus_Reads[1], "]", $REF2_base); | |
| 1057 $ALT_2 = join("", $REF1_base, "[", $mate_chr, ":", $mate_start, "["); | |
| 1058 } | |
| 1059 | |
| 1060 if(($mate_chr) && ($plus_Reads[0])){ | |
| 1061 # print OUT join ("\t", $plus_Reads[0], $plus_Reads[1], $BND1_name, $REF1_base, $ALT_1, $QUAL,"PASS", $INFO_1, $FORMAT,$SAMPLE,"\n"); | |
| 1062 # print OUT join ("\t", $mate_chr, $mate_start, $BND2_name, $REF2_base, $ALT_2, $QUAL, "PASS", $INFO_2, $FORMAT,$MATE_SAMPLE,"\n"); | |
| 1063 my $var=join ("\t", $plus_Reads[0], $plus_Reads[1], $BND1_name, $REF1_base, $ALT_1, $QUAL,"PASS", $INFO_1, $FORMAT,$SAMPLE); | |
| 1064 return $var; | |
| 1065 } | |
| 1066 } | |
| 1067 | |
| 1068 ############################################################################### | |
| 1069 ############################################################################### | |
| 1070 sub get_counts_n_info { | |
| 1071 my ($event, $side, $mapQ, $file, $dist, $target, $upL, $lwL) = @_; | |
| 1072 | |
| 1073 my $mate_info = ""; | |
| 1074 my $cmd = ""; | |
| 1075 | |
| 1076 if ($event =~ /^CTX$/i) { | |
| 1077 #print "CTX side $side\n"; | |
| 1078 if ($side =~ /right/i) { | |
| 1079 $cmd = qq{samtools view $new_blacklist -q $mapQ -f 32 -F 1536 $file $target}; | |
| 1080 $cmd .= qq{ | perl -ane 'if(\$F[6] ne "="){\$end=\$F[7]+1; print join ("\\t",\$F[6],\$F[7],\$end,"\\n")}'}; | |
| 1081 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1}; | |
| 1082 if($verbose){print "$cmd\n"} | |
| 1083 $mate_info=`$cmd`; | |
| 1084 } else { | |
| 1085 $cmd = qq{ samtools view $new_blacklist -q $mapQ -f 16 -F 1536 $file $target}; | |
| 1086 $cmd .= qq{ | perl -ane 'if(\$F[6] ne "="){\$end=\$F[7]+1; print join ("\\t",\$F[6],\$F[7],\$end,"\\n")}'}; | |
| 1087 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1}; | |
| 1088 if($verbose){print "$cmd\n"} | |
| 1089 $mate_info=`$cmd`; | |
| 1090 } | |
| 1091 } elsif ($event =~ /^DEL$/i) { | |
| 1092 #print "DEL side $side\n"; | |
| 1093 if ($side =~ /right/i) { | |
| 1094 $cmd = qq{samtools view $new_blacklist -q $mapQ -f 32 -F 1552 $file $target}; | |
| 1095 $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)&&(\$9>$upL)){end=\$8+1;print \$3,\$8,end}}'}; | |
| 1096 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1}; | |
| 1097 if($verbose){print "$cmd\n"} | |
| 1098 $mate_info=`$cmd`; | |
| 1099 } else { | |
| 1100 $cmd = qq{samtools view $new_blacklist -q $mapQ -F 1568 -f 16 $file $target}; | |
| 1101 $cmd .= qq{ | awk '{OFS="\\t"} {if((\$7 ~ /=/)&&(\$9<-$upL)){end=\$8+1;print \$3,\$8,end}}'}; | |
| 1102 $cmd .= qq{ | sortBed | mergeBed -d $dist_To_Soft -n | sort -k4nr | head -1}; | |
| 1103 if($verbose){print "$cmd\n"} | |
| 1104 | |
| 1105 $mate_info=`$cmd`; | |
| 1106 } | |
| 1107 } elsif ($event =~ /^INS$/i) { | |
| 1108 #print "INS side $side\n"; | |
| 1109 if ($side =~ /right/i) { | |
| 1110 $cmd = qq{samtools view $new_blacklist -q $mapQ -f 32 -F 1552 $file $target}; | |
| 1111 $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)&&(\$9<$lwL && \$9 > 0 )){end=\$8+1;print \$3,\$8,end}}'}; | |
| 1112 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1}; | |
| 1113 if($verbose){print "$cmd\n"} | |
| 1114 $mate_info = `$cmd`; | |
| 1115 } else { | |
| 1116 $cmd = qq {samtools view $new_blacklist -q $mapQ -f 16 -F 1568 $file $target}; | |
| 1117 $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)&&(\$9>-$lwL && \$9 < 0 )){end=\$8+1;print \$3,\$8,end}}'}; | |
| 1118 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1}; | |
| 1119 if($verbose){print "$cmd\n"} | |
| 1120 $mate_info = `$cmd`; | |
| 1121 } | |
| 1122 } elsif ($event =~ /^INV$/i) { | |
| 1123 #print "INV side $side\n"; | |
| 1124 if ($side =~ /right/i) { | |
| 1125 $cmd = qq{samtools view $new_blacklist -q $mapQ -F 1596 $file $target}; | |
| 1126 $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)){end=\$8+1;print \$3,\$8,end}}'}; | |
| 1127 $cmd .= qq{ | sortBed | mergeBed -d $dist_To_Soft -n | sort -k4nr | head -1}; | |
| 1128 if($verbose){print "$cmd\n"} | |
| 1129 $mate_info = `$cmd`; | |
| 1130 } else { | |
| 1131 $cmd = qq{samtools view $new_blacklist -q $mapQ -f 48 -F 1548 $file $target}; | |
| 1132 $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)){end=\$8+1;print \$3,\$8,end}}'}; | |
| 1133 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1}; | |
| 1134 if($verbose){print "$cmd\n"} | |
| 1135 $mate_info = `$cmd`; | |
| 1136 } | |
| 1137 } elsif ($event =~ /^TDUP$/i) { | |
| 1138 #print "TDUP side $side\n"; | |
| 1139 if ($side =~ /right/i) { | |
| 1140 $cmd = qq{samtools view $new_blacklist -q $mapQ -f 32 -F 1552 $file $target}; | |
| 1141 # $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)&&(\$9>$upL)){end=\$8+1;print \$3,\$8,end}}'}; | |
| 1142 $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)&&(\$4>\$8)&&(\$9<0)){end=\$8+1;print \$3,\$8,end}}'}; | |
| 1143 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1}; | |
| 1144 if($verbose){print "$cmd\n"} | |
| 1145 $mate_info = `$cmd`; | |
| 1146 } else { | |
| 1147 $cmd = qq{samtools view $new_blacklist -q $mapQ -f 16 -F 1568 $file $target}; | |
| 1148 # $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)&&(\$9<-$upL )){end=\$8+1;print \$3,\$8,end}}'}; | |
| 1149 $cmd .= qq{ | awk '{OFS="\\t"}{if((\$7 ~ /=/)&&(\$4<\$8)&&(\$9>0)){end=\$8+1;print \$3,\$8,end}}'}; | |
| 1150 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1}; | |
| 1151 if($verbose){print "$cmd\n"} | |
| 1152 $mate_info = `$cmd`; | |
| 1153 } | |
| 1154 } elsif ($event =~ /^NOV_INS$/i) { | |
| 1155 #print "NOV_INS side $side\n"; | |
| 1156 if ($side =~ /right/i) { | |
| 1157 $cmd = qq{samtools view $new_blacklist -q $mapQ -f 8 -F 1552 $file $target}; | |
| 1158 $cmd .= qq{ | awk '{OFS="\\t"}{end=\$8+1;print \$3,\$8,end}'}; | |
| 1159 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1}; | |
| 1160 if($verbose){print "$cmd\n"} | |
| 1161 $mate_info = `$cmd`; | |
| 1162 } else { | |
| 1163 $cmd = qq{samtools view $new_blacklist -q $mapQ -f 24 -F 1536 $file $target}; | |
| 1164 $cmd .= qq{ | awk '{OFS="\\t"}{end=\$8+1;print \$3,\$8,end}'}; | |
| 1165 $cmd .= qq{ | sortBed | mergeBed -d $dist -n | sort -k4nr | head -1}; | |
| 1166 if($verbose){print "$cmd\n"} | |
| 1167 $mate_info = `$cmd`; | |
| 1168 } | |
| 1169 } | |
| 1170 | |
| 1171 $mate_info=~s/\n//g; | |
| 1172 my @tmp=split(/\t/, $mate_info); | |
| 1173 | |
| 1174 my $counts = 0; | |
| 1175 | |
| 1176 if (defined $tmp[3]) { | |
| 1177 $tmp[3] =~ s/\n//g; | |
| 1178 | |
| 1179 $counts = $tmp[3] if (length($tmp[3])); | |
| 1180 } | |
| 1181 return ({count=>$counts, info=>$mate_info}); | |
| 1182 } | |
| 1183 |
