Mercurial > repos > plus91-technologies-pvt-ltd > ss_test_tool
comparison 2.4/src/standalone_blat2.pl @ 0:00b9898b8510 draft
Uploaded
| author | plus91-technologies-pvt-ltd |
|---|---|
| date | Wed, 04 Jun 2014 03:41:27 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:00b9898b8510 |
|---|---|
| 1 #!/usr/bin/perl -sw | |
| 2 use Getopt::Long; | |
| 3 sub usage(){ | |
| 4 print " | |
| 5 Usage: <VCF> -g <genome.2bit> -seq|s <seq.fa> -f genome.fa | |
| 6 -o out.vcf | |
| 7 -n contig.names | |
| 8 -dist how wide of a window to look for bp [50]\n | |
| 9 -v verbose option | |
| 10 Requires samtools,bedTools, and blat in your path\n; | |
| 11 "; | |
| 12 die; | |
| 13 } | |
| 14 #Initialize values | |
| 15 my ($blat,$genome,$tei_bed,$vntr_bed,$out_vcf,$contig_names,$contig,$fasta,$uninformative_contigs,$dist,$verbose,$bedTools,$samtools); | |
| 16 GetOptions ("genome|g=s" => \$genome, | |
| 17 "o|out:s" => \$out_vcf, | |
| 18 "names|n:s" => \$contig_names, | |
| 19 "seq|s=s" => \$contig, | |
| 20 "f|fasta:s" => \$fasta, | |
| 21 "b|bad:s" => \$uninformative_contigs, | |
| 22 "dist:s" => \$dist, | |
| 23 "v" => \$verbose | |
| 24 ); | |
| 25 #$genome="/data2/bsi/reference/db/hg19.2bit"" | |
| 26 #$blat="/projects/bsi/bictools/apps/alignment/blat/34/blat" ; | |
| 27 #TEI.bed=egrep "LINE|SINE|LTR" /data5/bsi/epibreast/m087494.couch/Reference_Data/Annotations/hg19.repeatMasker.bed >TEI.bed | |
| 28 #VNTR_BED=egrep "Satellite|Simple_repeat" /data5/bsi/epibreast/m087494.couch/Reference_Data/Annotations/hg19.repeatMasker.bed > VNTR.bed | |
| 29 | |
| 30 | |
| 31 $blat=`which blat`; | |
| 32 if (!$blat) {die "Your do not have BLAT in your path\n\n"} | |
| 33 $samtools=`which samtools`; | |
| 34 if (!$samtools) {die "Your do not have samtools in your path\n\n"} | |
| 35 $bedTools=`which sortBed`; | |
| 36 if (!$bedTools) {die "Your do not have bedTools in your path\n\n"} | |
| 37 | |
| 38 | |
| 39 if (!$dist) {$dist=50} | |
| 40 if (!$out_vcf) {$out_vcf="out.vcf"} | |
| 41 if (!$contig_names) {$contig_names="contig.names"} | |
| 42 if (!$uninformative_contigs) {$uninformative_contigs="uninformative.contigs"} | |
| 43 | |
| 44 if ((!$genome)||(!$contig)||(!$fasta)){&usage;die;} | |
| 45 | |
| 46 | |
| 47 open(VCF,"$ARGV[0]") or die "must specify VCF file\n\n"; | |
| 48 open(OUT_VCF,">",$out_vcf) or die "can't open the output VCF\n"; | |
| 49 open(CONTIG_LIST,">",$contig_names) or die "can't open the contig names\n"; | |
| 50 open(BAD_CONTIG_LIST,">",$uninformative_contigs) or die "can't open the contig names\n"; | |
| 51 #print "writing to CONTIG_LIST=$contig_names\n"; | |
| 52 while (<VCF>) { | |
| 53 if($_=~/^#/){ | |
| 54 if ($.==1) { | |
| 55 print OUT_VCF $_; | |
| 56 print OUT_VCF "##INFO=<ID=STRAND,Number=1,Type=String,Description=\"Strand to which assembled contig aligned\">\n"; | |
| 57 print OUT_VCF "##INFO=<ID=CONTIG,Number=1,Type=String,Description=\"Name of assembeled contig matching event\">\n"; | |
| 58 print OUT_VCF "##INFO=<ID=MECHANISM,Number=1,Type=String,Description=\"Proposed mechanism of how the event arose\">\n"; | |
| 59 print OUT_VCF "##INFO=<ID=INSLEN,Number=1,Type=Integer,Description=\"Length of insertion\">\n"; | |
| 60 print OUT_VCF "##INFO=<ID=HOM_LEN,Number=1,Type=Integer,Description=\"Length of microhomology\">\n"; | |
| 61 next; | |
| 62 } | |
| 63 else { | |
| 64 print OUT_VCF $_; | |
| 65 next; | |
| 66 } | |
| 67 }; | |
| 68 chomp; | |
| 69 | |
| 70 ##look for exact location of BP | |
| 71 @line=split("\t",$_); | |
| 72 my($left_chr,$start,$end); | |
| 73 | |
| 74 #Get left position | |
| 75 $left_chr=$line[0]; | |
| 76 $start=$line[1]-$dist; | |
| 77 $end=$line[1]+$dist; | |
| 78 | |
| 79 #Get right position | |
| 80 my ($mate_pos,@mate,$mate_chr,$mate_start,$mate_end); | |
| 81 $mate_pos=$line[4]; | |
| 82 $mate_pos=~s/[\[|\]|A-Z]//g; | |
| 83 #print "mate_pos=$mate_pos\n"; | |
| 84 @mate=split(/:/,$mate_pos); | |
| 85 $mate_chr=$mate[0]; $mate_pos=$mate[1]; | |
| 86 $mate_start=$mate_pos-$dist;$mate_end=$mate_pos+$dist; | |
| 87 #print "$left_chr:$start-$end\n$mate_chr:$mate_start-$mate_end\n"; | |
| 88 | |
| 89 #Run through blat | |
| 90 my ($result1,$result2); | |
| 91 my $target1=join("",$left_chr,":",$start,"-",$end); | |
| 92 my $target2=join("",$mate_chr,":",$mate_start,"-",$mate_end); | |
| 93 #print "target1=$target1\ttarget2=$target2\n";die; | |
| 94 $result1=get_result($target1); | |
| 95 $result2=get_result($target2); | |
| 96 | |
| 97 | |
| 98 my $NOV_INS=""; | |
| 99 #If there is a NOV_INS, then there shouldn't be any output, so trick the results | |
| 100 if ($_=~/EVENT=NOV_INS/) { | |
| 101 $mate_start=$start; | |
| 102 $NOV_INS="true"; | |
| 103 if (!$result1) {$result1=join("\t","0","0","0","0","0","0","0","0","+","UNKNOWN_NODE","0","0",$dist);} | |
| 104 if (!$result2) {$result2=join("\t","0","0","0","0","0","0","0","0","+","UNKNOWN_NODE","0","0",$dist);} | |
| 105 } | |
| 106 | |
| 107 #Skip over events that aren't supported | |
| 108 if ((!$result1)||(!$result2)){ | |
| 109 my @tmp1=split("\t",$result1); | |
| 110 my @tmp2=split("\t",$result2); | |
| 111 if ($tmp1[9]) {print BAD_CONTIG_LIST "$tmp1[9]\n"} | |
| 112 if ($tmp2[9]) {print BAD_CONTIG_LIST "$tmp2[9]\n" } | |
| 113 next; | |
| 114 } | |
| 115 #Parse blat results | |
| 116 my @result1=split("\t",$result1); | |
| 117 my @result2=split("\t",$result2); | |
| 118 if($result2[9] ne $result1[9]){print "$result2[9] != $result1[9]\n";next} | |
| 119 #print "@result1\n@result2\n";die; | |
| 120 my $pos1=$start+($result1[12]-$result1[11]); | |
| 121 my $pos2=$mate_start+($result2[12]-$result2[11]); | |
| 122 #print "$_\n$pos1\t$pos2\n"; | |
| 123 | |
| 124 ############################################################## | |
| 125 ### Build Classifier | |
| 126 | |
| 127 my ($QSTART1,$QEND1,$QSTART2,$QEND2,$len,$MECHANISM, $INSERTION, $DELETION, $bed_res1,$bed_res2); | |
| 128 $MECHANISM="UNKNOWN"; | |
| 129 $len="UNKNOWN"; | |
| 130 #Make sure the later event is second | |
| 131 if ($result1[11] < $result2[11]){ | |
| 132 $QSTART1=$result1[11]; | |
| 133 $QEND1=$result1[12]; | |
| 134 $QSTART2=$result2[11]; | |
| 135 $QEND2=$result2[12]; | |
| 136 } | |
| 137 else{ | |
| 138 $QSTART1=$result2[11]; | |
| 139 $QEND1=$result2[12]; | |
| 140 $QSTART2=$result1[11]; | |
| 141 $QEND2=$result1[12]; | |
| 142 } | |
| 143 #Now calculate the difference between $QEND1 and QSTART2 | |
| 144 if($verbose){print "QEND1=$QEND1\tQSTART2=$QSTART2\n";} | |
| 145 $len=$QEND1-$QSTART2; | |
| 146 #Check for TEI | |
| 147 if($_=~/MECHANISM=TEI/){$MECHANISM="TEI"} | |
| 148 elsif($_=~/MECHANISM=VNTR/){$MECHANISM="VNTR"} | |
| 149 else{ | |
| 150 if ($len==0) {$MECHANISM="NHEJ"} | |
| 151 else{ | |
| 152 if ($len>0){$INSERTION="true"} | |
| 153 if ($len<0){$DELETION="true"} | |
| 154 if ($INSERTION){ | |
| 155 if ($len>10) {$MECHANISM="FOSTES"} | |
| 156 else{$MECHANISM="NHEJ"} | |
| 157 } | |
| 158 elsif ($DELETION){ | |
| 159 if ($len>100) {$MECHANISM="NAHR"} | |
| 160 elsif ($len > 2){$MECHANISM="altEJ"} | |
| 161 else{$MECHANISM="NHEJ"} | |
| 162 } | |
| 163 } | |
| 164 } | |
| 165 | |
| 166 | |
| 167 #if ($verbose){print "@result1";print "@result2";} | |
| 168 | |
| 169 #print out VCF | |
| 170 ############################################################# | |
| 171 # create temporary variable name | |
| 172 ############################################################# | |
| 173 srand (time ^ $$ ^ unpack "%L*", `ps axww | gzip -f`); | |
| 174 my $random_name=join "", map { ("a".."z")[rand 26] } 1..8; | |
| 175 my $random_name2=join "", map { ("a".."z")[rand 26] } 1..8; | |
| 176 | |
| 177 #Get Ref Base | |
| 178 my ($ref_base,$alt_base,$tmp_mate_pos); | |
| 179 $ref_base=getBases($left_chr,$pos1,$fasta); | |
| 180 $alt_base=getBases($mate_chr,$pos2,$fasta);#print "ALT=$alt_base\n"; | |
| 181 #Substitute the new mate position and base | |
| 182 $tmp_mate_pos=$line[4]; | |
| 183 $tmp_mate_pos=~s/$mate_pos/$pos2/; | |
| 184 $tmp_mate_pos=~s/[A-Z]/$alt_base/; | |
| 185 #split apart the INFO field to adjust the ISIZE and MATEID | |
| 186 my $NEW_INFO=""; | |
| 187 my @INFO=split(/;/,$line[7]); | |
| 188 for (my $i=0;$i<@INFO;$i++){ | |
| 189 if ($INFO[$i] =~ /^ISIZE=/){ | |
| 190 my @tmp=split(/=/,$INFO[$i]); | |
| 191 $NEW_INFO.="ISIZE="; | |
| 192 my $new_ISZIE=$pos2-$pos1; | |
| 193 $NEW_INFO.=$new_ISZIE | |
| 194 } | |
| 195 elsif($INFO[$i] =~ /^MATE_ID=/){ | |
| 196 $NEW_INFO.=";MATE_ID=".$random_name2 . ";"; | |
| 197 } | |
| 198 else{ | |
| 199 $NEW_INFO.=$INFO[$i].";"; | |
| 200 } | |
| 201 } | |
| 202 #ADD in strand and name | |
| 203 $NEW_INFO.="STRAND=".$result1[8]; | |
| 204 $NEW_INFO.=";CONTIG=".$result1[9]; | |
| 205 if($MECHANISM!~/TEI|VNTR/){$NEW_INFO.=";MECHANISM=".$MECHANISM;} | |
| 206 $NEW_INFO.=";HOM_LEN=".$len; | |
| 207 #don't pring contig nage if its a novel insertion | |
| 208 if(!$NOV_INS){print CONTIG_LIST "$result1[9]\n";}#else{print "I'm not printing $result1[9]\n";} | |
| 209 print OUT_VCF "$left_chr\t$pos1\t$random_name\t$ref_base\t$tmp_mate_pos\t1000\tPASS\t$NEW_INFO\t$line[8]\t$line[9]\n"; | |
| 210 #Now go through and fill info in for mate | |
| 211 #Substitute the new mate position and base | |
| 212 $tmp_mate_pos=$line[4]; | |
| 213 $tmp_mate_pos=~s/$mate_pos/$pos1/; | |
| 214 $tmp_mate_pos=~s/[A-Z]/$ref_base/; | |
| 215 $tmp_mate_pos=~s/$mate_chr/$left_chr/; | |
| 216 $NEW_INFO=""; | |
| 217 @INFO=split(/;/,$line[7]); | |
| 218 for (my $i=0;$i<@INFO;$i++){ | |
| 219 if ($INFO[$i] =~ /^ISIZE=/){ | |
| 220 my @tmp=split(/=/,$INFO[$i]); | |
| 221 $NEW_INFO.="ISIZE="; | |
| 222 my $new_ISZIE=$pos2-$pos1; | |
| 223 $NEW_INFO.=$new_ISZIE | |
| 224 } | |
| 225 elsif($INFO[$i] =~ /^MATE_ID=/){ | |
| 226 $NEW_INFO.=";MATE_ID=".$random_name.";"; | |
| 227 } | |
| 228 else{ | |
| 229 $NEW_INFO.=$INFO[$i].";"; | |
| 230 } | |
| 231 } | |
| 232 #ADD in strand and name | |
| 233 $NEW_INFO.="STRAND=".$result2[8]; | |
| 234 $NEW_INFO.=";CONTIG=".$result2[9]; | |
| 235 if ($MECHANISM!~/TEI|VNTR/){$NEW_INFO.=";MECHANISM=".$MECHANISM;} | |
| 236 $NEW_INFO.=";HOM_LEN=".$len; | |
| 237 | |
| 238 #don't pring contig nage if its a novel insertion | |
| 239 if(!$NOV_INS){print CONTIG_LIST "$result2[9]\n";} #else{print "I'm not printing $result1[9]\n";} | |
| 240 print OUT_VCF "$mate_chr\t$pos2\t$random_name2\t$alt_base\t$tmp_mate_pos\t1000\tPASS\t$NEW_INFO\t$line[8]\t$line[9]\n"; | |
| 241 if ($verbose){print "$mate_chr\t$pos2\t$random_name2\t$alt_base\t$tmp_mate_pos\t1000\tPASS\t$NEW_INFO\t$line[8]\t$line[9]\n";} | |
| 242 } | |
| 243 close VCF; | |
| 244 close OUT_VCF; | |
| 245 close CONTIG_LIST; | |
| 246 close BAD_CONTIG_LIST; | |
| 247 sub get_result{ | |
| 248 my $target=($_[0]); | |
| 249 if($verbose){print "target=$target\n"}#;die; | |
| 250 my $cmd="blat $genome:$target $contig /dev/stdout -t=dna -q=dna -noHead|egrep -v \"Searched|Loaded\" |head -1"; | |
| 251 | |
| 252 if ($verbose){print "$cmd\n"} #print "$cmd\n";die; | |
| 253 my $result=`$cmd`; | |
| 254 next if (!$cmd); | |
| 255 return ($result); | |
| 256 } | |
| 257 sub getBases{ | |
| 258 my ($chr,$pos1,$fasta)=@_; | |
| 259 my @result=(); | |
| 260 if ($pos1 <0){print "$pos1 is not a valid position (likely caused by circular MT chromosome)\n";$result[1]="NA";}; | |
| 261 @result = `samtools faidx $fasta $chr:$pos1-$pos1`; | |
| 262 if(!$result[1]){$result[1]="NA"}; | |
| 263 chomp($result[1]); | |
| 264 return uc($result[1]); | |
| 265 } | |
| 266 | |
| 267 |
