blobplot: gc_cov_annotate.pl comparison

comparison gc_cov_annotate.pl @ 0:0101d7d1211f

initial upload

author	Eduardo <eduardoalves@abdn.ac.uk>
date	Wed, 25 Nov 2015 11:26:59 +0000
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:0101d7d1211f
+#!/usr/bin/perl
+=head1 NAME
+gc_cov_annotate.pl
+=head1 SYNOPSIS
+gc_cov_annotate.pl --blasttaxid CONTIGTAXIDFILE --assembly ASSEMBLYFASTAFILE [--taxdump TAXDUMPDIR] [--bam BAMFILE...]
+=head1 AUTHORS
+sujai.kumar@zoo.ox.ac.uk 2013.09.15
+=cut
+use strict;
+use warnings;
+use Getopt::Long qw(:config pass_through no_ignore_case);
+my ($blasttaxid_file, $taxdump_dir, $assembly_file, $output_file) = ("",".","","");
+my @tax_list;
+my @bam_files;
+my @cov_files;
+GetOptions (
+"blasttaxid=s" => \$blasttaxid_file,
+"assembly=s"   => \$assembly_file,
+"out:s"        => \$output_file,
+"taxdump:s"    => \$taxdump_dir,
+"bam:s{,}"     => \@bam_files,
+"cov:s{,}"     => \@cov_files,
+"taxlist:s{,}" => \@tax_list,
+);
+if (not @tax_list) {@tax_list = ("species","order","phylum","superkingdom")};
+my %tax_levels;
+foreach (@tax_list) {$tax_levels{$_}=1}
+if (not $output_file) {$output_file = $assembly_file . ".txt"};
+print "Usage: gc_cov_annotate.pl --blasttaxid CONTIGTAXIDFILE --assembly ASSEMBLYFASTAFILE [--taxdump TAXDUMPDIR] [--bam BAMFILE...] [--cov COVFILES...] [--taxlist species...]\n" .
+"--taxdump is '.' by default, i.e. the files nodes.dmp and names.dmp from the NCBI taxonomy database ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz are expected to be in the current directory\n" .
+"--taxlist: default is: species order phylum superkingdom, but you can add any other NCBI taxlevel such as class family suborder etc\n" unless
+(-r $blasttaxid_file or $blasttaxid_file eq "-") and -r "$taxdump_dir/nodes.dmp" and -r "$taxdump_dir/names.dmp" and
+(-r $assembly_file or $assembly_file eq "-");
+die "Please specify --blasttaxid\n" unless (-r $blasttaxid_file or $blasttaxid_file eq "-");
+die "Please specify --taxdump\n" unless -r "$taxdump_dir/nodes.dmp" and -r "$taxdump_dir/names.dmp";
+die "Please specify --assembly\n" unless (-r $assembly_file or $assembly_file eq "-");
+#-----------------------------------------------------------------
+# Get taxon annotation info from contig-taxid file
+#-----------------------------------------------------------------
+my (%taxid_has_parent, %taxid_has_taxlevel, %taxid_has_name);
+print STDERR scalar localtime() . " - Loading $taxdump_dir/nodes.dmp and $taxdump_dir/names.dmp into memory ...\n";
+&load_nodes_names ("$taxdump_dir/nodes.dmp","$taxdump_dir/names.dmp");
+print STDERR scalar localtime() . " - Loading $taxdump_dir/nodes.dmp and $taxdump_dir/names.dmp into memory ... DONE\n";
+my $blasttaxid_fh = &read_fh($blasttaxid_file);
+my %contig_taxinfo;
+while (<$blasttaxid_fh>) {
+die "Contig-taxid file $blasttaxid_file does not seem to have two cols with the seqid in the first col and taxid in the second col" unless
+/^(\S+)\t(\d+)/;
+$contig_taxinfo{$1} = &taxonomy_report($2);
+}
+close $blasttaxid_fh;
+#-----------------------------------------------------------------
+# Calculate GC, len, for assembly file, get coverage from bamfiles
+#-----------------------------------------------------------------
+# load assembly file into memory:
+print STDERR scalar localtime() . " - Loading assembly fasta file $assembly_file into memory ...\n";
+my $fastahash = &fastafile2hash ($assembly_file);
+print STDERR scalar localtime() . " - Loading assembly fasta file $assembly_file into memory ... DONE\n";
+# calculate coverage for each bam file
+for my $bam_file (@bam_files) {
+my $reads_processed = 0;
+my ($refstart, $refend, $readid, @F); # declaring here to avoid having to declare in each instance of loop
+open SAM, "samtools view $bam_file |" or die $!;
+print STDERR scalar localtime() . " - Reading $bam_file ...\n";
+while (<SAM>) {
+# ignore comment lines/ SAM headers
+next if /^\s*$/ or /^@/ or /^#/;
+chomp;
+@F=split/\t/;
+if (($F[1] & 4) != 4) {
+$refstart =  $F[3];
+$refend   =  $F[3] - 1;
+while ($F[5] =~ /(\d+)[MIDNP]/g) { $refend += $1 } # calculate coverage on reference sequence from CIGAR
+die "ContigID $F[2] in bamfile $bam_file, but not in assembly file $assembly_file\n" if not exists $$fastahash{$F[2]};
+$$fastahash{$F[2]}{$bam_file} += ($refend - $refstart + 1)/$$fastahash{$F[2]}{len};
+}
+$reads_processed++;
+print STDERR "Processed $reads_processed reads by " . localtime() . "...\n" if $reads_processed % 1000000 == 0;
+}
+print STDERR scalar localtime() . " - Reading $bam_file ... DONE\n";
+}
+# calculate coverage for each cov file (two col file with seqid in first col, average read coverage depth in second col)
+for my $cov_file (@cov_files) {
+open COV, $cov_file or die $!;
+print STDERR scalar localtime() . " - Reading $cov_file ...\n";
+while (<COV>) {
+/^(\S+)\s+(\S+)/ and $$fastahash{$1}{$cov_file} = $2;
+}
+print STDERR scalar localtime() . " - Reading $cov_file ... DONE\n";
+}
+# print len gc cov (one for each bam, and each cov) and taxon annotation info in one large file (that can be used for plotting by R)
+print STDERR scalar localtime() . " - Making len gc cov taxon annotation data file $assembly_file.txt for R ...\n";
+open  LENCOVGC, ">$output_file" or die $!;
+# header row:
+print LENCOVGC "seqid\tlen\tgc";
+foreach (@bam_files) {print LENCOVGC "\tcov_$_"};
+foreach (@cov_files) {print LENCOVGC "\tcov_$_"};
+foreach (@tax_list)  {print LENCOVGC "\ttaxlevel_$_"};
+print LENCOVGC "\n";
+# for each contig/seqid:
+my ($seqid, $length, $gccount, $nonatgc, $totalcov, $cov); # declared outside loop for efficiency
+for $seqid (keys %{$fastahash}) {
+$length   = $$fastahash{$seqid}{len};
+$gccount  = $$fastahash{$seqid}{gc};
+$nonatgc  = $$fastahash{$seqid}{nonatgc};
+print LENCOVGC "$seqid\t$length\t". ($gccount/($length-$nonatgc));
+for my $bam_file (@bam_files) {
+$cov = (exists($$fastahash{$seqid}{$bam_file}) ? $$fastahash{$seqid}{$bam_file} : 0);
+print LENCOVGC "\t" . $cov;
+}
+for my $cov_file (@cov_files) {
+$cov = (exists($$fastahash{$seqid}{$cov_file}) ? $$fastahash{$seqid}{$cov_file} : 0);
+print LENCOVGC "\t" . $cov;
+}
+for my $tax_level (@tax_list) {
+print LENCOVGC "\t" . (exists(${$contig_taxinfo{$seqid}}{$tax_level}) ? ${$contig_taxinfo{$seqid}}{$tax_level} : "Not annotated");
+}
+print LENCOVGC "\n";
+}
+print STDERR scalar localtime() . " - Making len gc cov taxon annotation data file $assembly_file.txt for R ... DONE\n";
+############################################################
+sub taxonomy_report {
+my $hit_taxid = shift @_;
+my @parents = &get_parents($hit_taxid);
+# convert @parents to tax names:
+my %taxinfo;
+# my $taxonomy_report_string = "";
+for my $parent (@parents) {
+if (exists $taxid_has_taxlevel{$parent} and exists $tax_levels{$taxid_has_taxlevel{$parent}}) {
+$taxinfo{$taxid_has_taxlevel{$parent}} = $taxid_has_name{$parent};
+}
+}
+return \%taxinfo;
+# for my $tax_level (keys %hit_counts) {
+# for my $tax_name (keys %{$hit_counts{$tax_level}}) {
+# $taxonomy_report_string .= "$tax_level\t$tax_name\t";
+# }
+# }
+# return $taxonomy_report_string . "\n";
+}
+############################################################
+sub get_parents {
+my @all = @_;
+my $current_id = $all[0];
+if (exists $taxid_has_parent{$current_id} and $current_id ne $taxid_has_parent{$current_id}) {
+unshift @all, $taxid_has_parent{$current_id};
+@all = &get_parents(@all);
+}
+return @all;
+}
+############################################################
+sub load_nodes_names {
+my $fh;
+my $nodesfile = shift @_;
+my $namesfile = shift @_;
+$fh = &read_fh($nodesfile);
+while (my $line = <$fh>) {
+# line in nodes.dmp should match the regexp below.
+# Change the regexp if NCBI changes their file format
+next if $line !~ /^(\d+)\s*\|\s*(\d+)\s*\|\s*(.+?)\s*\|/;
+$taxid_has_parent{$1} = $2;
+$taxid_has_taxlevel{$1} = $3;
+}
+close $fh;
+$fh = &read_fh($namesfile);
+while (my $line = <$fh>) {
+next unless $line =~ /^(\d+)\s*\|\s*(.+?)\s*\|.+scientific name/;
+$taxid_has_name{$1} = $2;
+}
+}
+############################################################
+sub fastafile2hash {
+my $fastafile = shift @_;
+my %sequences;
+my $fh = &read_fh($fastafile);
+my $seqid;
+while (my $line = <$fh>) {
+if ($line =~ /^>(\S+)(.*)/) {
+$seqid = $1;
+$sequences{$seqid}{desc} = $2;
+}
+else {
+chomp $line;
+$sequences{$seqid}{seq}     .= $line;
+$sequences{$seqid}{len}     += length $line;
+$sequences{$seqid}{gc}      += ($line =~ tr/gcGC/gcGC/);
+$line =~ s/[^atgc]/N/ig;
+$sequences{$seqid}{nonatgc} += ($line =~ tr/N/N/);
+}
+}
+close $fh;
+return \%sequences;
+}
+############################################################
+sub read_fh {
+my $filename = shift @_;
+my $filehandle;
+if ($filename =~ /gz$/) {
+open $filehandle, "gunzip -dc $filename |" or die $!;
+}
+else {
+open $filehandle, "<$filename" or die $!;
+}
+return $filehandle;
+}
+############################################################

Mercurial > repos > eduardo > blobplot

comparison gc_cov_annotate.pl @ 0:0101d7d1211f