diff gc_cov_annotate.pl @ 0:0101d7d1211f

initial upload
author Eduardo <eduardoalves@abdn.ac.uk>
date Wed, 25 Nov 2015 11:26:59 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gc_cov_annotate.pl	Wed Nov 25 11:26:59 2015 +0000
@@ -0,0 +1,236 @@
+#!/usr/bin/perl
+
+=head1 NAME
+gc_cov_annotate.pl
+=head1 SYNOPSIS
+gc_cov_annotate.pl --blasttaxid CONTIGTAXIDFILE --assembly ASSEMBLYFASTAFILE [--taxdump TAXDUMPDIR] [--bam BAMFILE...]
+=head1 AUTHORS
+sujai.kumar@zoo.ox.ac.uk 2013.09.15
+=cut
+
+use strict;
+use warnings;
+use Getopt::Long qw(:config pass_through no_ignore_case);
+
+my ($blasttaxid_file, $taxdump_dir, $assembly_file, $output_file) = ("",".","","");
+my @tax_list;
+my @bam_files;
+my @cov_files;
+
+GetOptions (
+    "blasttaxid=s" => \$blasttaxid_file,
+    "assembly=s"   => \$assembly_file,
+    "out:s"        => \$output_file,
+    "taxdump:s"    => \$taxdump_dir,
+    "bam:s{,}"     => \@bam_files,
+    "cov:s{,}"     => \@cov_files,
+    "taxlist:s{,}" => \@tax_list,
+);
+
+if (not @tax_list) {@tax_list = ("species","order","phylum","superkingdom")};
+my %tax_levels;
+foreach (@tax_list) {$tax_levels{$_}=1}
+
+if (not $output_file) {$output_file = $assembly_file . ".txt"};
+
+print "Usage: gc_cov_annotate.pl --blasttaxid CONTIGTAXIDFILE --assembly ASSEMBLYFASTAFILE [--taxdump TAXDUMPDIR] [--bam BAMFILE...] [--cov COVFILES...] [--taxlist species...]\n" .
+    "--taxdump is '.' by default, i.e. the files nodes.dmp and names.dmp from the NCBI taxonomy database ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz are expected to be in the current directory\n" . 
+    "--taxlist: default is: species order phylum superkingdom, but you can add any other NCBI taxlevel such as class family suborder etc\n" unless
+    (-r $blasttaxid_file or $blasttaxid_file eq "-") and -r "$taxdump_dir/nodes.dmp" and -r "$taxdump_dir/names.dmp" and 
+    (-r $assembly_file or $assembly_file eq "-");
+
+die "Please specify --blasttaxid\n" unless (-r $blasttaxid_file or $blasttaxid_file eq "-");
+die "Please specify --taxdump\n" unless -r "$taxdump_dir/nodes.dmp" and -r "$taxdump_dir/names.dmp";
+die "Please specify --assembly\n" unless (-r $assembly_file or $assembly_file eq "-");
+
+#-----------------------------------------------------------------
+# Get taxon annotation info from contig-taxid file
+#-----------------------------------------------------------------
+
+my (%taxid_has_parent, %taxid_has_taxlevel, %taxid_has_name);
+print STDERR scalar localtime() . " - Loading $taxdump_dir/nodes.dmp and $taxdump_dir/names.dmp into memory ...\n";
+&load_nodes_names ("$taxdump_dir/nodes.dmp","$taxdump_dir/names.dmp");
+print STDERR scalar localtime() . " - Loading $taxdump_dir/nodes.dmp and $taxdump_dir/names.dmp into memory ... DONE\n";
+
+my $blasttaxid_fh = &read_fh($blasttaxid_file);
+my %contig_taxinfo;
+while (<$blasttaxid_fh>) {
+    die "Contig-taxid file $blasttaxid_file does not seem to have two cols with the seqid in the first col and taxid in the second col" unless 
+        /^(\S+)\t(\d+)/;
+    $contig_taxinfo{$1} = &taxonomy_report($2);
+}
+close $blasttaxid_fh;
+
+#-----------------------------------------------------------------
+# Calculate GC, len, for assembly file, get coverage from bamfiles
+#-----------------------------------------------------------------
+
+# load assembly file into memory:
+print STDERR scalar localtime() . " - Loading assembly fasta file $assembly_file into memory ...\n";
+my $fastahash = &fastafile2hash ($assembly_file);
+print STDERR scalar localtime() . " - Loading assembly fasta file $assembly_file into memory ... DONE\n";
+
+# calculate coverage for each bam file
+for my $bam_file (@bam_files) {
+    my $reads_processed = 0;
+    my ($refstart, $refend, $readid, @F); # declaring here to avoid having to declare in each instance of loop
+    open SAM, "samtools view $bam_file |" or die $!; 
+    print STDERR scalar localtime() . " - Reading $bam_file ...\n";
+    while (<SAM>) {
+        # ignore comment lines/ SAM headers
+        next if /^\s*$/ or /^@/ or /^#/;
+        chomp;
+        @F=split/\t/;
+        if (($F[1] & 4) != 4) {
+            $refstart =  $F[3];
+            $refend   =  $F[3] - 1;
+            while ($F[5] =~ /(\d+)[MIDNP]/g) { $refend += $1 } # calculate coverage on reference sequence from CIGAR
+            die "ContigID $F[2] in bamfile $bam_file, but not in assembly file $assembly_file\n" if not exists $$fastahash{$F[2]};
+            $$fastahash{$F[2]}{$bam_file} += ($refend - $refstart + 1)/$$fastahash{$F[2]}{len};
+        }
+        $reads_processed++;
+        print STDERR "Processed $reads_processed reads by " . localtime() . "...\n" if $reads_processed % 1000000 == 0;
+    }
+    print STDERR scalar localtime() . " - Reading $bam_file ... DONE\n";
+}
+
+# calculate coverage for each cov file (two col file with seqid in first col, average read coverage depth in second col)
+for my $cov_file (@cov_files) {
+    open COV, $cov_file or die $!;
+    print STDERR scalar localtime() . " - Reading $cov_file ...\n";
+    while (<COV>) {
+        /^(\S+)\s+(\S+)/ and $$fastahash{$1}{$cov_file} = $2;
+    }
+    print STDERR scalar localtime() . " - Reading $cov_file ... DONE\n";
+}
+
+# print len gc cov (one for each bam, and each cov) and taxon annotation info in one large file (that can be used for plotting by R)
+print STDERR scalar localtime() . " - Making len gc cov taxon annotation data file $assembly_file.txt for R ...\n";
+open  LENCOVGC, ">$output_file" or die $!;
+
+# header row:
+print LENCOVGC "seqid\tlen\tgc";
+foreach (@bam_files) {print LENCOVGC "\tcov_$_"};
+foreach (@cov_files) {print LENCOVGC "\tcov_$_"};
+foreach (@tax_list)  {print LENCOVGC "\ttaxlevel_$_"};
+print LENCOVGC "\n";
+
+# for each contig/seqid:
+my ($seqid, $length, $gccount, $nonatgc, $totalcov, $cov); # declared outside loop for efficiency
+for $seqid (keys %{$fastahash}) { 
+    $length   = $$fastahash{$seqid}{len};
+    $gccount  = $$fastahash{$seqid}{gc};
+    $nonatgc  = $$fastahash{$seqid}{nonatgc};
+    print LENCOVGC "$seqid\t$length\t". ($gccount/($length-$nonatgc));
+    for my $bam_file (@bam_files) {
+        $cov = (exists($$fastahash{$seqid}{$bam_file}) ? $$fastahash{$seqid}{$bam_file} : 0);
+        print LENCOVGC "\t" . $cov;
+    }
+    for my $cov_file (@cov_files) {
+        $cov = (exists($$fastahash{$seqid}{$cov_file}) ? $$fastahash{$seqid}{$cov_file} : 0);
+        print LENCOVGC "\t" . $cov;
+    }
+    for my $tax_level (@tax_list) {
+        print LENCOVGC "\t" . (exists(${$contig_taxinfo{$seqid}}{$tax_level}) ? ${$contig_taxinfo{$seqid}}{$tax_level} : "Not annotated"); 
+    }
+    print LENCOVGC "\n";
+}
+print STDERR scalar localtime() . " - Making len gc cov taxon annotation data file $assembly_file.txt for R ... DONE\n";
+
+############################################################
+
+sub taxonomy_report {
+    my $hit_taxid = shift @_;
+    my @parents = &get_parents($hit_taxid);
+    # convert @parents to tax names:
+    my %taxinfo;
+    # my $taxonomy_report_string = "";
+    for my $parent (@parents) {
+        if (exists $taxid_has_taxlevel{$parent} and exists $tax_levels{$taxid_has_taxlevel{$parent}}) {
+            $taxinfo{$taxid_has_taxlevel{$parent}} = $taxid_has_name{$parent};
+        }
+    }
+    return \%taxinfo;
+    # for my $tax_level (keys %hit_counts) {
+        # for my $tax_name (keys %{$hit_counts{$tax_level}}) {
+            # $taxonomy_report_string .= "$tax_level\t$tax_name\t";
+        # }
+    # }
+    # return $taxonomy_report_string . "\n";
+}
+
+############################################################
+
+sub get_parents {
+    my @all = @_;
+    my $current_id = $all[0];
+    if (exists $taxid_has_parent{$current_id} and $current_id ne $taxid_has_parent{$current_id}) {
+        unshift @all, $taxid_has_parent{$current_id};
+        @all = &get_parents(@all);
+    }
+    return @all;
+}
+
+############################################################
+
+sub load_nodes_names {
+    my $fh;
+    my $nodesfile = shift @_;
+    my $namesfile = shift @_;
+    $fh = &read_fh($nodesfile);
+    while (my $line = <$fh>) {
+        # line in nodes.dmp should match the regexp below.
+        # Change the regexp if NCBI changes their file format
+        next if $line !~ /^(\d+)\s*\|\s*(\d+)\s*\|\s*(.+?)\s*\|/;
+        $taxid_has_parent{$1} = $2;
+        $taxid_has_taxlevel{$1} = $3;
+    }
+    close $fh;
+    
+    $fh = &read_fh($namesfile);
+    while (my $line = <$fh>) {
+        next unless $line =~ /^(\d+)\s*\|\s*(.+?)\s*\|.+scientific name/;
+        $taxid_has_name{$1} = $2;
+    }
+}
+
+############################################################
+
+sub fastafile2hash {
+    my $fastafile = shift @_;
+    my %sequences;
+    my $fh = &read_fh($fastafile);
+    my $seqid;
+    while (my $line = <$fh>) {
+        if ($line =~ /^>(\S+)(.*)/) {
+            $seqid = $1;
+            $sequences{$seqid}{desc} = $2;
+        }
+        else {
+            chomp $line;
+            $sequences{$seqid}{seq}     .= $line;
+            $sequences{$seqid}{len}     += length $line;
+            $sequences{$seqid}{gc}      += ($line =~ tr/gcGC/gcGC/);
+            $line =~ s/[^atgc]/N/ig;
+            $sequences{$seqid}{nonatgc} += ($line =~ tr/N/N/);
+        }
+    }
+    close $fh;
+    return \%sequences;
+}
+
+############################################################
+
+sub read_fh {
+    my $filename = shift @_;
+    my $filehandle;
+    if ($filename =~ /gz$/) {
+        open $filehandle, "gunzip -dc $filename |" or die $!;
+    }
+    else {
+        open $filehandle, "<$filename" or die $!;
+    }
+    return $filehandle;
+}
+
+############################################################