Mercurial > repos > biomonika > linkyx
comparison scripts/bam_analysis.pl @ 9:695d28139f3e
toolshed8
| author | biomonika <biomonika@psu.edu> |
|---|---|
| date | Tue, 09 Sep 2014 14:31:02 -0400 |
| parents | 1955f03f092e |
| children |
comparison
equal
deleted
inserted
replaced
| 8:983278b1fdb2 | 9:695d28139f3e |
|---|---|
| 1 #!/usr/bin/perl | |
| 2 #NOTE: The following code is adapted from the post http://www.biostars.org/p/44867/#44906 | |
| 3 | |
| 4 use strict; | |
| 5 use warnings; | |
| 6 | |
| 7 use Bio::DB::Sam; | |
| 8 open (MP, '>mpileup'); | |
| 9 | |
| 10 # all necessary inputs (I use Getopt::Long to obtain input parameters) | |
| 11 my $opt_i = $ARGV[0]; #"mother.bam"; # should have an index file named input.bam.bai | |
| 12 my $opt_f = $ARGV[1]; #"reference.fasta"; | |
| 13 my $chr_id = $ARGV[2]; #'comp1_c0_seq1'; # your chromosome-id | |
| 14 my $snp_pos = $ARGV[3]; # position you want to call all variants in this position | |
| 15 # create the object | |
| 16 my $sam = Bio::DB::Sam->new( -fasta => $opt_f, -bam => $opt_i ); | |
| 17 | |
| 18 # get all reads that cover this SNP -- therefore, start and end set to $snp_pos | |
| 19 my @alignments = $sam->get_features_by_location(-seq_id => $chr_id, -start => $snp_pos, -end => $snp_pos ); | |
| 20 | |
| 21 my $depth=@alignments; | |
| 22 print MP "DEPTH:".$depth."\n"; | |
| 23 | |
| 24 my %res; # output hash that'll contain the count of each available nucleotide (or blank if the read covering the SNP is spliced in this position). | |
| 25 # loop over all alignments | |
| 26 for my $cAlign (@alignments) { | |
| 27 # parameters we'll need from our bam file | |
| 28 | |
| 29 my $start = $cAlign->start; # get start coordinate | |
| 30 my $end = $cAlign->end; # get end coordinate | |
| 31 my $ref_seq = $cAlign->dna; # get reference DNA sequence | |
| 32 my $read_seq = $cAlign->query->dna; # get query DNA sequence | |
| 33 | |
| 34 #print $read_seq."\n"; | |
| 35 #print "start: ".$start." end: ".$end."\n"; | |
| 36 | |
| 37 my $cigar = $cAlign->cigar_str; # get CIGAR sequence | |
| 38 my $cigar_ref = $cAlign->cigar_array; # probably the important useful of all. splits cigar to array of array reference | |
| 39 | |
| 40 #print $cigar."\n"; # Ex: $cigar = 20M100N50M => $cigar_ref = [ [ 'M' 20 ] [ 'N' 100] ['M' 50] ] | |
| 41 | |
| 42 my $ref_cntr = $start; # $ref_cntr = assign start to counter variable for reference. This will ultimately | |
| 43 # keep track of the current position on the chromosome | |
| 44 my $read_cntr = 0; # $read_cntr = computes offset on the read | |
| 45 my $read_snp_pos = ""; # variable to hold base at $snp_pos from current read | |
| 46 | |
| 47 foreach my $deref ( @$cigar_ref ) { | |
| 48 my $cigar_chr = $deref->[0]; # cigar character (ex: M, N, I, D etc...) | |
| 49 my $len_chr = $deref->[1]; # number corresponding to `this` cigar character ( ex: 20, 100 ) | |
| 50 | |
| 51 # NOTE: I => insertion in to the reference, meaning the read has the base(s) but the REFERENCE fasta does NOT | |
| 52 # D => deletion from the reference, meaning the READ does NOT have the base(s), but the reference does | |
| 53 | |
| 54 # modify reference counter only on M, N or D occurrence | |
| 55 if( $cigar_chr eq "M" || $cigar_chr eq "N" || $cigar_chr eq "D") { | |
| 56 $ref_cntr += $len_chr; | |
| 57 } | |
| 58 | |
| 59 if( $cigar_chr eq "S" ) { | |
| 60 #print "soft_clipped.\n"; | |
| 61 } | |
| 62 | |
| 63 # modify offset only on M or I occurrence | |
| 64 if( $cigar_chr eq "M" || $cigar_chr eq "I" || $cigar_chr eq "S") { | |
| 65 $read_cntr += $len_chr; | |
| 66 } | |
| 67 | |
| 68 | |
| 69 # now, check if the current operation takes ref_cntr past the SNP position. If it does, | |
| 70 # 1) If the current operation is NOT "M", then its either "N" or "D". Both of them mean | |
| 71 # that the read doesn't have a base at the SNP. So, this read is either spliced or has | |
| 72 # a deletion at that point and is not useful for your SNP location. | |
| 73 # 2) If the current position IS "M", then the current operation has gotten is past SNP | |
| 74 # location. So, we FIRST SUBTRACT what we added in this operation for ref_cntr and read_cntr, | |
| 75 # and then just add the difference ($snp_pos - $ref_cntr + 1) | |
| 76 | |
| 77 #print "read_cntr: ".$read_cntr."\nref_cntr: ".$ref_cntr." cigar: ".$cigar_chr."\n"; | |
| 78 | |
| 79 if( $ref_cntr > $snp_pos ) { | |
| 80 if( $cigar_chr eq "M" || $cigar_chr eq "S") { | |
| 81 $ref_cntr -= $len_chr; | |
| 82 $read_cntr -= $len_chr; | |
| 83 | |
| 84 #print "backing\n"; | |
| 85 #print "read_cntr: ".$read_cntr."\nref_cntr: ".$ref_cntr."\n"; | |
| 86 | |
| 87 #IMPORTANT | |
| 88 my $pos = $snp_pos - $ref_cntr + $read_cntr; | |
| 89 | |
| 90 #print "SNP position: ".$snp_pos; | |
| 91 #print "position: ".$pos; | |
| 92 | |
| 93 $read_snp_pos = substr( $read_seq, $pos, 1 ); | |
| 94 | |
| 95 #print "This is what we add: ".$read_snp_pos."\n"; | |
| 96 } | |
| 97 # if $cigar_chr is "N" or "D", do nothing. $read_snp_pos is set to "" | |
| 98 | |
| 99 if( $cigar_chr eq "N" || $cigar_chr eq "D") { | |
| 100 print MP "NAN\n"; | |
| 101 } | |
| 102 | |
| 103 # add value of $read_snp_pos to hash and get out of loop - to next read | |
| 104 $res{$read_snp_pos}++; | |
| 105 last; | |
| 106 } | |
| 107 } | |
| 108 } | |
| 109 | |
| 110 #Here, I am just printing to output. | |
| 111 #print "Count\n"; | |
| 112 foreach my $key ( keys %res ) { | |
| 113 print MP "$key:$res{$key}\n"; | |
| 114 } | |
| 115 #print "--\n"; | |
| 116 close MP; |
