Mercurial > repos > jeremie > breakdancer_test
changeset 18:d26b184866b4 draft
Deleted selected files
author | jeremie |
---|---|
date | Tue, 01 Jul 2014 09:28:30 -0400 |
parents | e7cb65b58ede |
children | 87a6ec1e2400 |
files | bam2cfg.pl breakdancer2vcf.py |
diffstat | 2 files changed, 0 insertions(+), 896 deletions(-) [+] |
line wrap: on
line diff
--- a/bam2cfg.pl Tue Jul 01 09:28:20 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,769 +0,0 @@ -#!/opt/local/bin/perl -#Create a BreakDancer configuration file from a set of bam files - -use strict; -use warnings; -use Getopt::Std; -use Statistics::Descriptive; -use GD::Graph::histogram; -use FindBin qw($Bin); -use lib "$FindBin::Bin"; -#use lib '/gscuser/kchen/1000genomes/analysis/scripts/'; -use AlnParser; - -my %opts = (q=>35, n=>10000, v=>1, c=>4, b=>50, s=>50); -getopts('q:n:c:b:p:hmf:gCv:', \%opts); -die(" -Usage: bam2cfg.pl <bam files> -Options: - -q INT Minimum mapping quality [$opts{q}] - -m Using mapping quality instead of alternative mapping quality - -s Minimal mean insert size [$opts{s}] - -C Change default system from Illumina to SOLiD - -c FLOAT Cutoff in unit of standard deviation [$opts{c}] - -n INT Number of observation required to estimate mean and s.d. insert size [$opts{n}] - -v FLOAT Cutoff on coefficients of variation [$opts{v}] - -f STRING A two column tab-delimited text file (RG, LIB) specify the RG=>LIB mapping, useful when BAM header is incomplete - -b INT Number of bins in the histogram [$opts{b}] - -g Output mapping flag distribution - -h Plot insert size histogram for each BAM library -\n -") unless (@ARGV); - -my $AP=new AlnParser(platform=>$opts{C}); - -my %cRGlib; my %clibs; -if($opts{f}){ - open(RGLIB,"<$opts{f}") || die "unable to open $opts{f}\n"; - while(<RGLIB>){ - chomp; - my ($rg,$lib)=split; - $clibs{$lib}=1; - $cRGlib{$rg}=$lib; - } -} - -foreach my $fbam(@ARGV){ - my %RGlib; - my %RGplatform; - my %libs; - my %insert_stat; - my %readlen_stat; - my %libpos; - my %flagHgram; - my $recordcounter=0; - my $expected_max; - if(defined $opts{f}){ - %RGlib=%cRGlib; - %libs=%clibs; - } - my $ppos=0; - open(BAM,"samtools view -h $fbam |") || die "unable to open $fbam\n"; - while(<BAM>){ - chomp; - if(/^\@RG/){ #getting RG=>LIB mapping from the bam header - my ($id)=($_=~/ID\:(\S+)/); - my ($lib)=($_=~/LB\:(\S+)/); - my ($platform)=($_=~/PL\:(\S+)/); - my ($sample)=($_=~/SM\:(\S+)/); - my ($insertsize)=($_=~/PI\:(\d+)/); - #if(defined $insertsize && $insertsize>0){ - #$lib=$sample . '_'. $lib; - $libs{$lib}=1; - $RGlib{$id}=$lib; - $RGplatform{$id}=$platform; - #} - } - else{ - next if(/^\@/); - my @libas=keys %libs; - my @selected_libs=keys %insert_stat; - if($#libas<0){ - if($#selected_libs>=0){ - last; - } - else{ - $libs{'NA'}=1; - $RGlib{'NA'}='NA'; - $RGplatform{'NA'}=($opts{C})?'solid':'illumina'; - } - } - if(!defined $expected_max || $expected_max<=0){ - $expected_max=3*($#libas+1)*$opts{n}; - } - last if($recordcounter>$expected_max); - - my $t=$AP->in($_,'sam',\%RGplatform,$opts{m}); - #die "Please sort bam by position\n" if($t->{pos}<$ppos); - next if($t->{pos}<$ppos); - $ppos=$t->{pos}; - my $lib; - if(defined $t->{readgroup}){ - if(defined $RGlib{$t->{readgroup}}){ - $lib=$RGlib{$t->{readgroup}}; - } - else{ - $lib=$t->{readgroup}; - $libs{$t->{readgroup}}=1; - $RGlib{$t->{readgroup}}=$t->{readgroup}; - } - } - else{ - $lib='NA'; - } - #my $lib=($t->{readgroup})?$RGlib{$t->{readgroup}}:'NA'; #when multiple libraries are in a BAM file - next unless(defined $lib && $libs{$lib}); - $readlen_stat{$lib}=Statistics::Descriptive::Full->new() if(!defined $readlen_stat{$lib}); - $readlen_stat{$lib}->add_data($t->{readlen}); - next if ($t->{qual}<=$opts{q}); #skip low quality mapped reads - $recordcounter++; - $libpos{$lib}++; - if(defined $t->{readgroup}){ - $flagHgram{$t->{readgroup}}{$t->{flag}}++; - $flagHgram{$t->{readgroup}}{all}++; - } - - my $nreads=(defined $insert_stat{$lib})?$insert_stat{$lib}->count():1; - if($nreads/$libpos{$lib}<1e-4){ #single-end lane - delete $libs{$lib}; - delete $insert_stat{$lib}; - } - next unless(($t->{flag}==18 || $t->{flag}==20) && $t->{dist}>=0); - - $insert_stat{$lib}=Statistics::Descriptive::Full->new() if(!defined $insert_stat{$lib}); - $insert_stat{$lib}->add_data($t->{dist}); - if($insert_stat{$lib}->count()>$opts{n}){ - delete $libs{$lib}; - } - } - } - close(BAM); - - my %stdms; - my %stdps; - - foreach my $lib(keys %insert_stat){ - my $readlen=$readlen_stat{$lib}->mean(); - my @isize=$insert_stat{$lib}->get_data(); - my $mean=$insert_stat{$lib}->mean(); - my $std=$insert_stat{$lib}->standard_deviation(); - - delete $insert_stat{$lib}; - my $insertsize=Statistics::Descriptive::Full->new(); - foreach my $x(@isize){ - next if($x>$mean+5*$std); - $insertsize->add_data($x); - } - - $mean=$insertsize->mean(); - $std=$insertsize->standard_deviation(); - next if($mean<$opts{s}); - my $cv=$std/$mean; - if($cv>=$opts{v}){ - print STDERR "Coefficient of variation $cv in library $lib is larger than the cutoff $opts{v}, poor quality data, excluding from further analysis.\n"; - next; - } - - my $num=$insertsize->count(); - next if($num<100); - - my ($stdm,$stdp)=(0,0); - my ($nstdm,$nstdp)=(0,0); - foreach my $x($insertsize->get_data()){ - if($x>$mean){ - $stdp+=($x-$mean)**2; - $nstdp++; - } - else{ - $stdm+=($x-$mean)**2; - $nstdm++; - } - } - $stdm=sqrt($stdm/($nstdm-1)); - $stdp=sqrt($stdp/($nstdp-1)); - - $stdms{$lib}=$stdm; - $stdps{$lib}=$stdp; - $insert_stat{$lib}=$insertsize; - } - - foreach my $rg(keys %RGlib){ - my $lib=$RGlib{$rg}; - my $platform=$RGplatform{$rg} || 'illumina'; #default illumina - next unless($insert_stat{$lib}); - my $readlen=$readlen_stat{$lib}->mean(); - my $mean=$insert_stat{$lib}->mean(); - my $std=$insert_stat{$lib}->standard_deviation(); - my $num=$insert_stat{$lib}->count(); - - my $upper=$mean+$opts{c}*$stdps{$lib} if(defined $opts{c}); - my $lower=$mean-$opts{c}*$stdms{$lib} if(defined $opts{c}); - $lower=0 if(defined $lower && $lower<0); - - printf "readgroup\:%s\tplatform:%s\tmap\:%s\treadlen\:%.2f\tlib\:%s\tnum:%d",$rg,$platform,$fbam,$readlen,$lib,$num; - printf "\tlower\:%.2f\tupper\:%.2f",$lower,$upper if(defined $upper && defined $lower); - printf "\tmean\:%.2f\tstd\:%.2f",$mean,$std; - - - # compute the normality - my @data=$insert_stat{$lib}->get_data(); - my $n_data = @data; - @data = sort {$a <=> $b} @data; - my $n_data_ = @data; - my $p_value = ShapiroWilk(\@data); - if($p_value>0) { - my $log_p_value = log($p_value)/log(10); - printf "\tSWnormality\:%.2f",$log_p_value; - } - elsif($p_value == -1) { - printf "\tSWnormality\:data not qualified -1"; - } - elsif($p_value == -2.1){ - printf "\tSWnormality\:data not qualified -2.1"; - } - elsif($p_value == -2.2){ - printf "\tSWnormality\:data not qualified -2.2"; - } - elsif($p_value == -2.3){ - printf "\tSWnormality\:data not qualified -2.3"; - } - elsif($p_value == 0) { - printf "\tSWnormality\:minus infinity"; - } - - - - if($opts{g}){ - printf "\tflag:"; - foreach my $f(sort keys %{$flagHgram{$rg}}){ - next if($f eq 'all'); - printf "%d(%.2f%%)",$f,($flagHgram{$rg}{$f} || 0)*100/$flagHgram{$rg}{all}; - } - printf "%d",$flagHgram{$rg}{all}; - } - printf "\texe:samtools view\n"; - } - - if($opts{h}){ # plot insert size histogram for each library - foreach my $lib(keys %insert_stat){ - my $graph = new GD::Graph::histogram(1000,600); - my $library="$fbam.$lib"; - $graph->set( - x_label => 'Insert Size (bp)', - y_label => 'Count', - title => $library, - x_labels_vertical => 1, - bar_spacing => 0, - shadow_depth => 1, - shadowclr => 'dred', - transparent => 0, - histogram_bins => $opts{b}, - ) or warn $graph->error; - my @data=$insert_stat{$lib}->get_data(); - my $gd = $graph->plot(\@data) or die $graph->error; - - $library=~s/.*\///g; - my $imagefile="$library.insertsize_histogram.png"; - open(IMG, ">$imagefile") or die $!; - binmode IMG; - print IMG $gd->png; - - my $datafile="$library.insertsize_histogram"; - open(OUT,">$datafile"); - foreach my $x(@data){ - print OUT "$x\n"; - } - close(OUT); - } - } -} - -###################################################### Shapiro Wilk Normality test (including functions ppnd, alnorm, poly, min, sign and asin) ######################################################### -sub ShapiroWilk { - -# read input -my ($x) = @_; - -my $n=@$x; -my $w = 0; -my $ifault = 2; -my $init = 0; -my $n2 = $n/2; -my @a; -my $n1 = $n; - -my $upper = 1; -my @c1 = (0.0, 0.221157, -0.147981, -2.07119, 4.434685, -2.706056); -my @c2 = (0.0, 0.042981, -0.293762, -1.752461, 5.682633, -3.582633); -my @c3 = (0.5440, -0.39978, 0.025054, -0.6714*(10**(-3))); -my @c4 = (1.3822, -0.77857, 0.062767, -0.0020322); -my @c5 = (-1.5861, -0.31082, -0.083751, 0.0038915); -my @c6 = (-0.4803, -0.082676, 0.0030302); -my @c7 = (0.164, 0.533); -my @c8 = (0.1736, 0.315); -my @c9 = (0.256, -0.00635); -my @g = (-2.273, 0.459); -my $z90 = 1.2816; -my $z95 = 1.6449; -my $z99 = 2.3263; -my $zm = 1.7509; -my $zss = 0.56268; -my $bf1 = 0.8378; -my $xx90 = 0.556; -my $xx95 = 0.622; -my $zero = 0.0; -my $one = 1.0; -my $two = 2.0; -my $three = 3.0; -my $sqrth = 0.70711; -my $qtr = 0.25; -my $th = 0.375; -my $small = 1*(10**(-19)); -my $pi6 = 1.909859; -my $stqr = 1.047198; - -my $summ2; -my $ssumm2; -my $fac; -my $rsn; -my $an; -my $an25; -my $a1; -my $a2; -my $delta; -my $range; -my $sa; -my $sx; -my $ssx; -my $ssa; -my $sax; -my $asa; -my $xsx; -my $ssassx; -my $w1; -my $y; -my $xx; -my $xi; -my $gamma; -my $m; -my $s; -my $ld; -my $bf; -my $z90f; -my $z95f; -my $z99f; -my $zfm; -my $zsd; -my $zbar; -my $ncens; -my $nn2; -my $i; -my $i1; -my $j; - -my $pw = $one; -if($w >= $zero) {$w = $one;} -$an = $n; -$ifault = 3; -$nn2 = $n/2; -if($n2 < $nn2) {return (-1);} -$ifault = 1; -if($n < 3) {return (-1);} - -# If INIT is false, calculates coefficients for the test - -if(!$init){ - if($n == 3) { - $a[1-1] = $sqrth;} - else { - $an25 = $an + $qtr; - $summ2 = $zero; - for(my $i = 1; $i <= $n2; $i++){ - ($a[$i-1], $ifault) = ppnd(($i - $th)/$an25); - my $a_val = $a[$i-1]; - $summ2 = $summ2 + $a[$i-1] ** 2; - } - $summ2 = $summ2 * $two; - $ssumm2 = $summ2 ** 0.5; - $rsn = $one / ($an ** 0.5); - my $poly_result = poly_(\@c1, 6, $rsn); - $a1 = $poly_result - $a[1-1] / $ssumm2; - -# Normalize coefficients - - if($n > 5) { - $i1 = 3; - $a2 = -$a[2-1]/$ssumm2 + poly_(\@c2, 6, $rsn); - my $a_val1_new = $a[1-1]; - my $a_val2_new = $a[2-1]; - my $temp1 = $summ2 - $two * $a[1-1] ** 2 - $two * $a[2-1] ** 2; - my $temp2 = $one - $two * $a1 ** 2 - $two * $a2 ** 2; - $fac = (($summ2 - $two * $a[1-1] ** 2 - $two * $a[2-1] ** 2)/($one - $two * $a1 ** 2 - $two * $a2 ** 2)) ** 0.5; - $a[1-1] = $a1; - $a[2-1] = $a2; - } - else { - $i1 = 2; - $fac = (($summ2 - $two * $a[1-1] ** 2)/ ($one - $two * $a1 ** 2)) ** 0.5; - $a[1-1] = $a1; - } - for($i = $i1; $i <= $nn2; $i++){ - $a[$i-1] = -$a[$i-1]/$fac; - my $a_val = $a[$i-1]; - } - for($i = 1; $i <= $nn2; $i++){ - my $a_value = $a[$i-1]; - } - } - $init = 1; -} -if($n1 < 3) {return (-1);} -$ncens = $n - $n1; -$ifault = 4; -if($ncens < 0 || ($ncens > 0 && $n < 20)) {return (-1);} -$ifault = 5; -$delta = $ncens/$an; -if($delta > 0.8) {return (-2.1);} - -# If W input as negative, calculate significance level of -W - -if($w < $zero) { - $w1 = $one + $w; - $ifault = 0; -} -else{ -# Check for zero range - $ifault = 6; - $range = @$x[$n1-1] - @$x[1-1]; - if($range < $small) {return (-2.2);} - -# Check for correct sort order on range - scaled X - - $ifault = 7; - $xx = @$x[1-1]/$range; - $sx = $xx; - $sa = -$a[1-1]; - $j = $n - 1; - for($i = 2; $i <= $n1; $i++){ - $xi = @$x[$i-1]/$range; - if($xx-$xi > $small){ - return (-2.3); - } - $sx = $sx + $xi; - if($i != $j) { - my $sign_value = sign($i - $j); - my $min_value = $a[min($i, $j)-1]; - my $min_ = min($i, $j); - $sa = $sa + sign($i - $j) * $a[min($i, $j)-1]; - } - $xx = $xi; - $j = $j - 1; - } - $ifault = 0; - if($n > 5000) {$ifault = 2}; - -# Calculate W statistic as squared correlation between data and coefficients - $sa = $sa/$n1; - $sx = $sx/$n1; - $ssa = $zero; - $ssx = $zero; - $sax = $zero; - $j = $n; - for($i = 1; $i <= $n1; $i ++){ - if($i != $j) { - my $sign_result = sign($i - $j); - my $min_result = min($i, $j); - $asa = sign($i - $j) * $a[min($i, $j)-1] - $sa; - } - else { - $asa = -$sa; - } - $xsx = @$x[$i-1]/$range - $sx; - $ssa = $ssa + $asa * $asa; - $ssx = $ssx + $xsx * $xsx; - $sax = $sax + $asa * $xsx; - $j = $j - 1; - } - -# W1 equals (1-W) claculated to avoid excessive rounding error for W very near 1 (a potential problem in very large samples) - $ssassx = ($ssa * $ssx) ** 0.5; - $w1 = ($ssassx - $sax) * ($ssassx + $sax)/($ssa * $ssx); -} - -$w = $one - $w1; - -# Calculate significance level for W (exact for N=3) - -if($n == 3){ - $pw = $pi6 * (asin($w**0.5) - $stqr); - return ($pw); -} -$y = log($w1); -$xx = log($an); -$m = $zero; -$s = $one; -if($n <= 11){ - $gamma = poly_(\@g, 2, $an); - if($y >= $gamma){ - $pw = $small; - return ($pw); - } - $y = -log($gamma - $y); - $m = poly_(\@c3, 4, $an); - $s = exp(poly_(\@c4, 4, $an)) -} -else{ - $m = poly_(\@c5, 4, $xx); - $s = exp(poly_(\@c6, 3, $xx)); -} -if($ncens > 0){ - -# Censoring by proportion NCENS/N. Calculate mean and sd of normal equivalent deviate of W. - - $ld = -log($delta); - $bf = $one + $xx * $bf1; - $z90f = $z90 + $bf * poly_(\@c7, 2, $xx90 ** $xx) ** $ld; - $z95f = $z95 + $bf * poly_(\@c8, 2, $xx95 ** $xx) ** $ld; - $z99f = $z99 + $bf * poly_(\@c9, 2, $xx) ** $ld; - -# Regress Z90F,...,Z99F on normal deviates Z90,...,Z99 to get pseudo-mean and pseudo-sd of z as the slope and intercept - - $zfm = ($z90f + $z95f + $z99f)/$three; - $zsd = ($z90*($z90f-$zfm)+$z95*($z95f-$zfm)+$z99*($z99f-$zfm))/$zss; - $zbar = $zfm - $zsd * $zm; - $m = $m + $zbar * $s; - $s = $s * $zsd; -} -$pw = alnorm(($y - $m)/$s, $upper); -return ($pw); - -} - -############################################### ppnd function############################################################ -sub ppnd { - -# read input -my ($p) = @_; - -# define output -my $ifault; -my $normal_dev; - -# Local variables - -my $zero = 0.0; -my $one = 1.0; -my $half = 0.5; -my $split1 = 0.425; -my $split2 = 5.0; -my $const1 = 0.180625; -my $const2 = 1.6; -my $q; -my $r; - -# Coefficients for P close to 0.5 - -my $a0 = 3.3871327179; -my $a1 = 5.0434271938*10; -my $a2 = 1.5929113202*10**2; -my $a3 = 5.9109374720*10; -my $b1 = 1.7895169469*10; -my $b2 = 7.8757757664*10; -my $b3 = 6.7187563600*10; -# HASH SUM AB 32.3184577772 - -# Coefficients for P not close to 0, 0.5 or 1. - -my $c0 = 1.4234372777; -my $c1 = 2.7568153900; -my $c2 = 1.3067284816; -my $c3 = 1.7023821103*(10**(-1)); -my $d1 = 7.3700164250*(10**(-1)); -my $d2 = 1.2021132975*(10**(-1)); -# HASH SUM CD 15.7614929821 - -# Coefficients for P near 0 or 1. - -my $e0 = 6.6579051150; -my $e1 = 3.0812263860; -my $e2 = 4.2868294337*(10**(-1)); -my $e3 = 1.7337203997*(10**(-2)); -my $f1 = 2.4197894225*(10**(-1)); -my $f2 = 1.2258202635*(10**(-2)); -# HASH SUM EF 19.4052910204 - -$ifault = 0; -$q = $p - $half; -if(abs($q) <= $split1) { - $r = $const1 - $q * $q; - $normal_dev = $q * ((($a3 * $r + $a2) * $r + $a1) * $r + $a0) / ((($b3 * $r + $b2) * $r + $b1) * $r + $one); -} -else{ - if($q < $zero) { - $r = $p;} - else { - $r = $one - $p; - } - if($r <= $zero) { - $ifault = 1; - $normal_dev = $zero; - return ($normal_dev, $ifault); - } - $r = (-log($r))**0.5; - if($r <= $split2) { - $r = $r - $const2; - $normal_dev = ((($c3 * $r + $c2) * $r + $c1) * $r + $c0) / (($d2 * $r + $d1) * $r + $one); - } - else{ - $r = $r - $split2; - $normal_dev = ((($e3 * $r + $e2) * $r + $e1) * $r + $e0) / (($f2 * $r + $f1) * $r + $one); - } - if($q < $zero) {$normal_dev = - $normal_dev;} - return ($normal_dev, $ifault); -} -} - - -################################ alnorm function ##################################################################### -#FUNCTION alnorm(x, upper) RESULT(fn_val) - -# Evaluates the tail area of the standardised normal curve from x to infinity if upper is .true. or -# from minus infinity to x if upper is .false. - -sub alnorm { - -# get the inputs -my ($x, $upper) = @_; - -# define the output -my $fn_val; - -my $zero = 0.0; -my $one = 1.0; -my $half = 0.5; -my $con = 1.28; -my $z; -my $y; -my $up; - -#!*** machine dependent constants -my $ltone = 7.0; -my $utzero = 18.66; - -my $p = 0.398942280444; -my $q = 0.39990348504; -my $r = 0.398942280385; -my $a1 = 5.75885480458; -my $a2 = 2.62433121679; -my $a3 = 5.92885724438; -my $b1 = -29.8213557807; -my $b2 = 48.6959930692; -my $c1 = -3.8052*(10**(-8)); -my $c2 = 3.98064794*(10**(-4)); -my $c3 = -0.151679116635; -my $c4 = 4.8385912808; -my $c5 = 0.742380924027; -my $c6 = 3.99019417011; -my $d1 = 1.00000615302; -my $d2 = 1.98615381364; -my $d3 = 5.29330324926; -my $d4 = -15.1508972451; -my $d5 = 30.789933034; - -$up = $upper; -$z = $x; -if($z < $zero){ - if($up == 0){$up = 1;} - else {$up = 0;} - $z = -$z; -} -if($z <= $ltone || $up && $z <= $utzero){ - $y = $half*$z*$z; - if($z > $con) {$fn_val = $r*exp(-$y)/($z+$c1+$d1/($z+$c2+$d2/($z+$c3+$d3/($z+$c4+$d4/($z+$c5+$d5/($z+$c6))))));} - else {$fn_val = $half - $z*($p - $q*$y/($y+$a1+$b1/($y+$a2+$b2/($y+$a3))));} -} -else{ - $fn_val = $zero; -} -if(!$up) {$fn_val = $one - $fn_val}; -return $fn_val; -} - -##################################### poly function ######################################################### - -# FUNCTION poly(c, nord, x) RESULT(fn_val) - -# Calculates the algebraic polynomial of order nored-1 with array of coefficients c. Zero order coefficient is c(1) - -sub poly_ { - -my ($c, $nord, $x) = @_; - -my $fn_val; - -my $i; -my $j; -my $n2; -my $p; - -$fn_val = @$c[1-1]; -if($nord == 1) {return $fn_val;} -$p = $x*@$c[$nord-1]; -if($nord == 2) { - $fn_val = $fn_val + $p; -} -else { - $n2 = $nord - 2; - $j = $n2 + 1; - for($i = 1; $i <= $n2; $i++){ - $p = ($p + @$c[$j-1])*$x; - $j = $j - 1; - } - $fn_val = $fn_val + $p; -} - -return $fn_val; -} - -###################################### min function ############################################ - -# FUNCTION min(a,b) RESULT min of a, b - -sub min { - -my ($a, $b) = @_; - -if($a <= $b) { return $a; } -else { return $b; } -} - -###################################### sign function ############################################ - -# FUNCTION sign(a,b) RESULT sign of a, b - -sub sign { - -my ($a) = @_; - -if($a >= 0) { return 1; } -else { return -1; } -} - -###################################### asin function ############################################ - -# FUNCTION asin(a) RESULT arcsin of a - -sub asin { - -my ($a) = @_; -if($a > 1 || $a < -1) { - print "error: abs sin value > 1"; - return 0; } -else { - my $b = (1-$a**2)**0.5; - my $fn_val = atan2($a, (1-$a**2)**0.5); - my $fn_test = (1-$a**2)**0.5; - return $fn_val; -} -} -
--- a/breakdancer2vcf.py Tue Jul 01 09:28:20 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,127 +0,0 @@ -#!/usr/bin/env python - -__copyright__ = """ -Copyright (C) 2013 - Tim te Beek -Copyright (C) 2013 - Wai Yi Leung -Copyright (C) 2013 AllBio (see AUTHORS file) -""" - -__desc__ = """Convert breakdancer output to pseudo .vcf file format.""" -__created__ = "Mar 18, 2013" -__author__ = "tbeek" - -import argparse -import csv -import os.path -import sys - - -def main(tsvfile, vcffile): - ''' -:param tsvfile: filename of input file.tsv -:type tsvfile: string -:param vcffile: filename of output file.vcf -:type vcffile: string -''' - with open(tsvfile) as reader: - # Parse file - dictreader = _parse_tsvfile(reader) - print dictreader.fieldnames - - # Write out file - _format_vcffile(dictreader, vcffile) - - # Quick output - with open(vcffile) as reader: - print reader.read(1000) - - -def _parse_tsvfile(readable): - ''' -Read readable using csv.Sniffer and csv.DictReader -:param readable: open file.tsv handle to read with csv.DictReader -:type readable: file -''' - prev, curr = 0, 0 - while True: - line = readable.readline() - if not line.startswith('#'): - # lets start from prev # line, without the hash sign - readable.seek(prev + 1) - break - else: - prev = curr - curr = readable.tell() - - # Determine dialect - curr = readable.tell() - #dialect = csv.Sniffer().sniff(readable.read(3000)) - dialect = 'excel-tab' - readable.seek(curr) - - # Read file - dictreader = csv.DictReader(readable, dialect=dialect) - return dictreader - - -_tsv_fields = ('Chr1', 'Pos1', 'Orientation1', - 'Chr2', 'Pos2', 'Orientation2', - 'Type', 'Size', 'Score', - 'num_Reads', 'num_Reads_lib', - 'ERR031544.sort.bam') -# 'Chr1': '1', -# 'Pos1': '269907', -# 'Orientation1': '39+39-', -# 'Chr2': '1', -# 'Pos2': '270919', -# 'Orientation2': '39+39-', -# 'Type': 'DEL', -# 'Size': '99', -# 'Score': '99', -# 'num_Reads': '38', -# 'num_Reads_lib': '/home/allbio/ERR031544.sort.bam|38', -# 'ERR031544.sort.bam': 'NA' - -_vcf_fields = ('CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO') - - -def _format_vcffile(dictreader, vcffile): - ''' -Create a pseudo .vcf file based on values read from DictReader instance. -:param dictreader: DictReader instance to read data from -:type dictreader: csv.DictRedaer -:param vcffile: output file.vcf filename -:type vcffile: string -''' - with open(vcffile, mode='w') as writer: - writer.write('#{}\n'.format('\t'.join(_vcf_fields))) - output_vcf = [] - for line in dictreader: - CHROM = line['Chr1'] - # TODO Figure out whether we have zero or one based positioning - POS = int(line['Pos1']) - SVEND = int(line['Pos2']) - INFO = 'PROGRAM=breakdancer;SVTYPE={};SVLEN={}'.format(line['Type'], - 0 - int(line['Size'])) - if line['Type'] not in ['CTX']: - INFO += ";SVEND={}".format(SVEND) - - # Create record - output_vcf.append([CHROM, POS, '.', '.', '.', '.', 'PASS', INFO]) - - # Sort all results - output_vcf.sort() - output = "\n".join(["\t".join(map(str,vcf_row)) for vcf_row in output_vcf]) - # Write record - writer.write(output) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-i', '--breakdancertsv', dest='breakdancertsv', type=str, - help='Breakdancer TSV outputfile') - parser.add_argument('-o', '--outputvcf', dest='outputvcf', type=str, - help='Output vcf to') - - args = parser.parse_args() - main(args.breakdancertsv, args.outputvcf)