Mercurial > repos > eduardo > blobplot
comparison gc_cov_annotate.pl @ 0:0101d7d1211f
initial upload
| author | Eduardo <eduardoalves@abdn.ac.uk> |
|---|---|
| date | Wed, 25 Nov 2015 11:26:59 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:0101d7d1211f |
|---|---|
| 1 #!/usr/bin/perl | |
| 2 | |
| 3 =head1 NAME | |
| 4 gc_cov_annotate.pl | |
| 5 =head1 SYNOPSIS | |
| 6 gc_cov_annotate.pl --blasttaxid CONTIGTAXIDFILE --assembly ASSEMBLYFASTAFILE [--taxdump TAXDUMPDIR] [--bam BAMFILE...] | |
| 7 =head1 AUTHORS | |
| 8 sujai.kumar@zoo.ox.ac.uk 2013.09.15 | |
| 9 =cut | |
| 10 | |
| 11 use strict; | |
| 12 use warnings; | |
| 13 use Getopt::Long qw(:config pass_through no_ignore_case); | |
| 14 | |
| 15 my ($blasttaxid_file, $taxdump_dir, $assembly_file, $output_file) = ("",".","",""); | |
| 16 my @tax_list; | |
| 17 my @bam_files; | |
| 18 my @cov_files; | |
| 19 | |
| 20 GetOptions ( | |
| 21 "blasttaxid=s" => \$blasttaxid_file, | |
| 22 "assembly=s" => \$assembly_file, | |
| 23 "out:s" => \$output_file, | |
| 24 "taxdump:s" => \$taxdump_dir, | |
| 25 "bam:s{,}" => \@bam_files, | |
| 26 "cov:s{,}" => \@cov_files, | |
| 27 "taxlist:s{,}" => \@tax_list, | |
| 28 ); | |
| 29 | |
| 30 if (not @tax_list) {@tax_list = ("species","order","phylum","superkingdom")}; | |
| 31 my %tax_levels; | |
| 32 foreach (@tax_list) {$tax_levels{$_}=1} | |
| 33 | |
| 34 if (not $output_file) {$output_file = $assembly_file . ".txt"}; | |
| 35 | |
| 36 print "Usage: gc_cov_annotate.pl --blasttaxid CONTIGTAXIDFILE --assembly ASSEMBLYFASTAFILE [--taxdump TAXDUMPDIR] [--bam BAMFILE...] [--cov COVFILES...] [--taxlist species...]\n" . | |
| 37 "--taxdump is '.' by default, i.e. the files nodes.dmp and names.dmp from the NCBI taxonomy database ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz are expected to be in the current directory\n" . | |
| 38 "--taxlist: default is: species order phylum superkingdom, but you can add any other NCBI taxlevel such as class family suborder etc\n" unless | |
| 39 (-r $blasttaxid_file or $blasttaxid_file eq "-") and -r "$taxdump_dir/nodes.dmp" and -r "$taxdump_dir/names.dmp" and | |
| 40 (-r $assembly_file or $assembly_file eq "-"); | |
| 41 | |
| 42 die "Please specify --blasttaxid\n" unless (-r $blasttaxid_file or $blasttaxid_file eq "-"); | |
| 43 die "Please specify --taxdump\n" unless -r "$taxdump_dir/nodes.dmp" and -r "$taxdump_dir/names.dmp"; | |
| 44 die "Please specify --assembly\n" unless (-r $assembly_file or $assembly_file eq "-"); | |
| 45 | |
| 46 #----------------------------------------------------------------- | |
| 47 # Get taxon annotation info from contig-taxid file | |
| 48 #----------------------------------------------------------------- | |
| 49 | |
| 50 my (%taxid_has_parent, %taxid_has_taxlevel, %taxid_has_name); | |
| 51 print STDERR scalar localtime() . " - Loading $taxdump_dir/nodes.dmp and $taxdump_dir/names.dmp into memory ...\n"; | |
| 52 &load_nodes_names ("$taxdump_dir/nodes.dmp","$taxdump_dir/names.dmp"); | |
| 53 print STDERR scalar localtime() . " - Loading $taxdump_dir/nodes.dmp and $taxdump_dir/names.dmp into memory ... DONE\n"; | |
| 54 | |
| 55 my $blasttaxid_fh = &read_fh($blasttaxid_file); | |
| 56 my %contig_taxinfo; | |
| 57 while (<$blasttaxid_fh>) { | |
| 58 die "Contig-taxid file $blasttaxid_file does not seem to have two cols with the seqid in the first col and taxid in the second col" unless | |
| 59 /^(\S+)\t(\d+)/; | |
| 60 $contig_taxinfo{$1} = &taxonomy_report($2); | |
| 61 } | |
| 62 close $blasttaxid_fh; | |
| 63 | |
| 64 #----------------------------------------------------------------- | |
| 65 # Calculate GC, len, for assembly file, get coverage from bamfiles | |
| 66 #----------------------------------------------------------------- | |
| 67 | |
| 68 # load assembly file into memory: | |
| 69 print STDERR scalar localtime() . " - Loading assembly fasta file $assembly_file into memory ...\n"; | |
| 70 my $fastahash = &fastafile2hash ($assembly_file); | |
| 71 print STDERR scalar localtime() . " - Loading assembly fasta file $assembly_file into memory ... DONE\n"; | |
| 72 | |
| 73 # calculate coverage for each bam file | |
| 74 for my $bam_file (@bam_files) { | |
| 75 my $reads_processed = 0; | |
| 76 my ($refstart, $refend, $readid, @F); # declaring here to avoid having to declare in each instance of loop | |
| 77 open SAM, "samtools view $bam_file |" or die $!; | |
| 78 print STDERR scalar localtime() . " - Reading $bam_file ...\n"; | |
| 79 while (<SAM>) { | |
| 80 # ignore comment lines/ SAM headers | |
| 81 next if /^\s*$/ or /^@/ or /^#/; | |
| 82 chomp; | |
| 83 @F=split/\t/; | |
| 84 if (($F[1] & 4) != 4) { | |
| 85 $refstart = $F[3]; | |
| 86 $refend = $F[3] - 1; | |
| 87 while ($F[5] =~ /(\d+)[MIDNP]/g) { $refend += $1 } # calculate coverage on reference sequence from CIGAR | |
| 88 die "ContigID $F[2] in bamfile $bam_file, but not in assembly file $assembly_file\n" if not exists $$fastahash{$F[2]}; | |
| 89 $$fastahash{$F[2]}{$bam_file} += ($refend - $refstart + 1)/$$fastahash{$F[2]}{len}; | |
| 90 } | |
| 91 $reads_processed++; | |
| 92 print STDERR "Processed $reads_processed reads by " . localtime() . "...\n" if $reads_processed % 1000000 == 0; | |
| 93 } | |
| 94 print STDERR scalar localtime() . " - Reading $bam_file ... DONE\n"; | |
| 95 } | |
| 96 | |
| 97 # calculate coverage for each cov file (two col file with seqid in first col, average read coverage depth in second col) | |
| 98 for my $cov_file (@cov_files) { | |
| 99 open COV, $cov_file or die $!; | |
| 100 print STDERR scalar localtime() . " - Reading $cov_file ...\n"; | |
| 101 while (<COV>) { | |
| 102 /^(\S+)\s+(\S+)/ and $$fastahash{$1}{$cov_file} = $2; | |
| 103 } | |
| 104 print STDERR scalar localtime() . " - Reading $cov_file ... DONE\n"; | |
| 105 } | |
| 106 | |
| 107 # print len gc cov (one for each bam, and each cov) and taxon annotation info in one large file (that can be used for plotting by R) | |
| 108 print STDERR scalar localtime() . " - Making len gc cov taxon annotation data file $assembly_file.txt for R ...\n"; | |
| 109 open LENCOVGC, ">$output_file" or die $!; | |
| 110 | |
| 111 # header row: | |
| 112 print LENCOVGC "seqid\tlen\tgc"; | |
| 113 foreach (@bam_files) {print LENCOVGC "\tcov_$_"}; | |
| 114 foreach (@cov_files) {print LENCOVGC "\tcov_$_"}; | |
| 115 foreach (@tax_list) {print LENCOVGC "\ttaxlevel_$_"}; | |
| 116 print LENCOVGC "\n"; | |
| 117 | |
| 118 # for each contig/seqid: | |
| 119 my ($seqid, $length, $gccount, $nonatgc, $totalcov, $cov); # declared outside loop for efficiency | |
| 120 for $seqid (keys %{$fastahash}) { | |
| 121 $length = $$fastahash{$seqid}{len}; | |
| 122 $gccount = $$fastahash{$seqid}{gc}; | |
| 123 $nonatgc = $$fastahash{$seqid}{nonatgc}; | |
| 124 print LENCOVGC "$seqid\t$length\t". ($gccount/($length-$nonatgc)); | |
| 125 for my $bam_file (@bam_files) { | |
| 126 $cov = (exists($$fastahash{$seqid}{$bam_file}) ? $$fastahash{$seqid}{$bam_file} : 0); | |
| 127 print LENCOVGC "\t" . $cov; | |
| 128 } | |
| 129 for my $cov_file (@cov_files) { | |
| 130 $cov = (exists($$fastahash{$seqid}{$cov_file}) ? $$fastahash{$seqid}{$cov_file} : 0); | |
| 131 print LENCOVGC "\t" . $cov; | |
| 132 } | |
| 133 for my $tax_level (@tax_list) { | |
| 134 print LENCOVGC "\t" . (exists(${$contig_taxinfo{$seqid}}{$tax_level}) ? ${$contig_taxinfo{$seqid}}{$tax_level} : "Not annotated"); | |
| 135 } | |
| 136 print LENCOVGC "\n"; | |
| 137 } | |
| 138 print STDERR scalar localtime() . " - Making len gc cov taxon annotation data file $assembly_file.txt for R ... DONE\n"; | |
| 139 | |
| 140 ############################################################ | |
| 141 | |
| 142 sub taxonomy_report { | |
| 143 my $hit_taxid = shift @_; | |
| 144 my @parents = &get_parents($hit_taxid); | |
| 145 # convert @parents to tax names: | |
| 146 my %taxinfo; | |
| 147 # my $taxonomy_report_string = ""; | |
| 148 for my $parent (@parents) { | |
| 149 if (exists $taxid_has_taxlevel{$parent} and exists $tax_levels{$taxid_has_taxlevel{$parent}}) { | |
| 150 $taxinfo{$taxid_has_taxlevel{$parent}} = $taxid_has_name{$parent}; | |
| 151 } | |
| 152 } | |
| 153 return \%taxinfo; | |
| 154 # for my $tax_level (keys %hit_counts) { | |
| 155 # for my $tax_name (keys %{$hit_counts{$tax_level}}) { | |
| 156 # $taxonomy_report_string .= "$tax_level\t$tax_name\t"; | |
| 157 # } | |
| 158 # } | |
| 159 # return $taxonomy_report_string . "\n"; | |
| 160 } | |
| 161 | |
| 162 ############################################################ | |
| 163 | |
| 164 sub get_parents { | |
| 165 my @all = @_; | |
| 166 my $current_id = $all[0]; | |
| 167 if (exists $taxid_has_parent{$current_id} and $current_id ne $taxid_has_parent{$current_id}) { | |
| 168 unshift @all, $taxid_has_parent{$current_id}; | |
| 169 @all = &get_parents(@all); | |
| 170 } | |
| 171 return @all; | |
| 172 } | |
| 173 | |
| 174 ############################################################ | |
| 175 | |
| 176 sub load_nodes_names { | |
| 177 my $fh; | |
| 178 my $nodesfile = shift @_; | |
| 179 my $namesfile = shift @_; | |
| 180 $fh = &read_fh($nodesfile); | |
| 181 while (my $line = <$fh>) { | |
| 182 # line in nodes.dmp should match the regexp below. | |
| 183 # Change the regexp if NCBI changes their file format | |
| 184 next if $line !~ /^(\d+)\s*\|\s*(\d+)\s*\|\s*(.+?)\s*\|/; | |
| 185 $taxid_has_parent{$1} = $2; | |
| 186 $taxid_has_taxlevel{$1} = $3; | |
| 187 } | |
| 188 close $fh; | |
| 189 | |
| 190 $fh = &read_fh($namesfile); | |
| 191 while (my $line = <$fh>) { | |
| 192 next unless $line =~ /^(\d+)\s*\|\s*(.+?)\s*\|.+scientific name/; | |
| 193 $taxid_has_name{$1} = $2; | |
| 194 } | |
| 195 } | |
| 196 | |
| 197 ############################################################ | |
| 198 | |
| 199 sub fastafile2hash { | |
| 200 my $fastafile = shift @_; | |
| 201 my %sequences; | |
| 202 my $fh = &read_fh($fastafile); | |
| 203 my $seqid; | |
| 204 while (my $line = <$fh>) { | |
| 205 if ($line =~ /^>(\S+)(.*)/) { | |
| 206 $seqid = $1; | |
| 207 $sequences{$seqid}{desc} = $2; | |
| 208 } | |
| 209 else { | |
| 210 chomp $line; | |
| 211 $sequences{$seqid}{seq} .= $line; | |
| 212 $sequences{$seqid}{len} += length $line; | |
| 213 $sequences{$seqid}{gc} += ($line =~ tr/gcGC/gcGC/); | |
| 214 $line =~ s/[^atgc]/N/ig; | |
| 215 $sequences{$seqid}{nonatgc} += ($line =~ tr/N/N/); | |
| 216 } | |
| 217 } | |
| 218 close $fh; | |
| 219 return \%sequences; | |
| 220 } | |
| 221 | |
| 222 ############################################################ | |
| 223 | |
| 224 sub read_fh { | |
| 225 my $filename = shift @_; | |
| 226 my $filehandle; | |
| 227 if ($filename =~ /gz$/) { | |
| 228 open $filehandle, "gunzip -dc $filename |" or die $!; | |
| 229 } | |
| 230 else { | |
| 231 open $filehandle, "<$filename" or die $!; | |
| 232 } | |
| 233 return $filehandle; | |
| 234 } | |
| 235 | |
| 236 ############################################################ |
