Mercurial > repos > eduardo > blobplot
diff gc_cov_annotate.pl @ 0:0101d7d1211f
initial upload
| author | Eduardo <eduardoalves@abdn.ac.uk> |
|---|---|
| date | Wed, 25 Nov 2015 11:26:59 +0000 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gc_cov_annotate.pl Wed Nov 25 11:26:59 2015 +0000 @@ -0,0 +1,236 @@ +#!/usr/bin/perl + +=head1 NAME +gc_cov_annotate.pl +=head1 SYNOPSIS +gc_cov_annotate.pl --blasttaxid CONTIGTAXIDFILE --assembly ASSEMBLYFASTAFILE [--taxdump TAXDUMPDIR] [--bam BAMFILE...] +=head1 AUTHORS +sujai.kumar@zoo.ox.ac.uk 2013.09.15 +=cut + +use strict; +use warnings; +use Getopt::Long qw(:config pass_through no_ignore_case); + +my ($blasttaxid_file, $taxdump_dir, $assembly_file, $output_file) = ("",".","",""); +my @tax_list; +my @bam_files; +my @cov_files; + +GetOptions ( + "blasttaxid=s" => \$blasttaxid_file, + "assembly=s" => \$assembly_file, + "out:s" => \$output_file, + "taxdump:s" => \$taxdump_dir, + "bam:s{,}" => \@bam_files, + "cov:s{,}" => \@cov_files, + "taxlist:s{,}" => \@tax_list, +); + +if (not @tax_list) {@tax_list = ("species","order","phylum","superkingdom")}; +my %tax_levels; +foreach (@tax_list) {$tax_levels{$_}=1} + +if (not $output_file) {$output_file = $assembly_file . ".txt"}; + +print "Usage: gc_cov_annotate.pl --blasttaxid CONTIGTAXIDFILE --assembly ASSEMBLYFASTAFILE [--taxdump TAXDUMPDIR] [--bam BAMFILE...] [--cov COVFILES...] [--taxlist species...]\n" . + "--taxdump is '.' by default, i.e. the files nodes.dmp and names.dmp from the NCBI taxonomy database ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz are expected to be in the current directory\n" . + "--taxlist: default is: species order phylum superkingdom, but you can add any other NCBI taxlevel such as class family suborder etc\n" unless + (-r $blasttaxid_file or $blasttaxid_file eq "-") and -r "$taxdump_dir/nodes.dmp" and -r "$taxdump_dir/names.dmp" and + (-r $assembly_file or $assembly_file eq "-"); + +die "Please specify --blasttaxid\n" unless (-r $blasttaxid_file or $blasttaxid_file eq "-"); +die "Please specify --taxdump\n" unless -r "$taxdump_dir/nodes.dmp" and -r "$taxdump_dir/names.dmp"; +die "Please specify --assembly\n" unless (-r $assembly_file or $assembly_file eq "-"); + +#----------------------------------------------------------------- +# Get taxon annotation info from contig-taxid file +#----------------------------------------------------------------- + +my (%taxid_has_parent, %taxid_has_taxlevel, %taxid_has_name); +print STDERR scalar localtime() . " - Loading $taxdump_dir/nodes.dmp and $taxdump_dir/names.dmp into memory ...\n"; +&load_nodes_names ("$taxdump_dir/nodes.dmp","$taxdump_dir/names.dmp"); +print STDERR scalar localtime() . " - Loading $taxdump_dir/nodes.dmp and $taxdump_dir/names.dmp into memory ... DONE\n"; + +my $blasttaxid_fh = &read_fh($blasttaxid_file); +my %contig_taxinfo; +while (<$blasttaxid_fh>) { + die "Contig-taxid file $blasttaxid_file does not seem to have two cols with the seqid in the first col and taxid in the second col" unless + /^(\S+)\t(\d+)/; + $contig_taxinfo{$1} = &taxonomy_report($2); +} +close $blasttaxid_fh; + +#----------------------------------------------------------------- +# Calculate GC, len, for assembly file, get coverage from bamfiles +#----------------------------------------------------------------- + +# load assembly file into memory: +print STDERR scalar localtime() . " - Loading assembly fasta file $assembly_file into memory ...\n"; +my $fastahash = &fastafile2hash ($assembly_file); +print STDERR scalar localtime() . " - Loading assembly fasta file $assembly_file into memory ... DONE\n"; + +# calculate coverage for each bam file +for my $bam_file (@bam_files) { + my $reads_processed = 0; + my ($refstart, $refend, $readid, @F); # declaring here to avoid having to declare in each instance of loop + open SAM, "samtools view $bam_file |" or die $!; + print STDERR scalar localtime() . " - Reading $bam_file ...\n"; + while (<SAM>) { + # ignore comment lines/ SAM headers + next if /^\s*$/ or /^@/ or /^#/; + chomp; + @F=split/\t/; + if (($F[1] & 4) != 4) { + $refstart = $F[3]; + $refend = $F[3] - 1; + while ($F[5] =~ /(\d+)[MIDNP]/g) { $refend += $1 } # calculate coverage on reference sequence from CIGAR + die "ContigID $F[2] in bamfile $bam_file, but not in assembly file $assembly_file\n" if not exists $$fastahash{$F[2]}; + $$fastahash{$F[2]}{$bam_file} += ($refend - $refstart + 1)/$$fastahash{$F[2]}{len}; + } + $reads_processed++; + print STDERR "Processed $reads_processed reads by " . localtime() . "...\n" if $reads_processed % 1000000 == 0; + } + print STDERR scalar localtime() . " - Reading $bam_file ... DONE\n"; +} + +# calculate coverage for each cov file (two col file with seqid in first col, average read coverage depth in second col) +for my $cov_file (@cov_files) { + open COV, $cov_file or die $!; + print STDERR scalar localtime() . " - Reading $cov_file ...\n"; + while (<COV>) { + /^(\S+)\s+(\S+)/ and $$fastahash{$1}{$cov_file} = $2; + } + print STDERR scalar localtime() . " - Reading $cov_file ... DONE\n"; +} + +# print len gc cov (one for each bam, and each cov) and taxon annotation info in one large file (that can be used for plotting by R) +print STDERR scalar localtime() . " - Making len gc cov taxon annotation data file $assembly_file.txt for R ...\n"; +open LENCOVGC, ">$output_file" or die $!; + +# header row: +print LENCOVGC "seqid\tlen\tgc"; +foreach (@bam_files) {print LENCOVGC "\tcov_$_"}; +foreach (@cov_files) {print LENCOVGC "\tcov_$_"}; +foreach (@tax_list) {print LENCOVGC "\ttaxlevel_$_"}; +print LENCOVGC "\n"; + +# for each contig/seqid: +my ($seqid, $length, $gccount, $nonatgc, $totalcov, $cov); # declared outside loop for efficiency +for $seqid (keys %{$fastahash}) { + $length = $$fastahash{$seqid}{len}; + $gccount = $$fastahash{$seqid}{gc}; + $nonatgc = $$fastahash{$seqid}{nonatgc}; + print LENCOVGC "$seqid\t$length\t". ($gccount/($length-$nonatgc)); + for my $bam_file (@bam_files) { + $cov = (exists($$fastahash{$seqid}{$bam_file}) ? $$fastahash{$seqid}{$bam_file} : 0); + print LENCOVGC "\t" . $cov; + } + for my $cov_file (@cov_files) { + $cov = (exists($$fastahash{$seqid}{$cov_file}) ? $$fastahash{$seqid}{$cov_file} : 0); + print LENCOVGC "\t" . $cov; + } + for my $tax_level (@tax_list) { + print LENCOVGC "\t" . (exists(${$contig_taxinfo{$seqid}}{$tax_level}) ? ${$contig_taxinfo{$seqid}}{$tax_level} : "Not annotated"); + } + print LENCOVGC "\n"; +} +print STDERR scalar localtime() . " - Making len gc cov taxon annotation data file $assembly_file.txt for R ... DONE\n"; + +############################################################ + +sub taxonomy_report { + my $hit_taxid = shift @_; + my @parents = &get_parents($hit_taxid); + # convert @parents to tax names: + my %taxinfo; + # my $taxonomy_report_string = ""; + for my $parent (@parents) { + if (exists $taxid_has_taxlevel{$parent} and exists $tax_levels{$taxid_has_taxlevel{$parent}}) { + $taxinfo{$taxid_has_taxlevel{$parent}} = $taxid_has_name{$parent}; + } + } + return \%taxinfo; + # for my $tax_level (keys %hit_counts) { + # for my $tax_name (keys %{$hit_counts{$tax_level}}) { + # $taxonomy_report_string .= "$tax_level\t$tax_name\t"; + # } + # } + # return $taxonomy_report_string . "\n"; +} + +############################################################ + +sub get_parents { + my @all = @_; + my $current_id = $all[0]; + if (exists $taxid_has_parent{$current_id} and $current_id ne $taxid_has_parent{$current_id}) { + unshift @all, $taxid_has_parent{$current_id}; + @all = &get_parents(@all); + } + return @all; +} + +############################################################ + +sub load_nodes_names { + my $fh; + my $nodesfile = shift @_; + my $namesfile = shift @_; + $fh = &read_fh($nodesfile); + while (my $line = <$fh>) { + # line in nodes.dmp should match the regexp below. + # Change the regexp if NCBI changes their file format + next if $line !~ /^(\d+)\s*\|\s*(\d+)\s*\|\s*(.+?)\s*\|/; + $taxid_has_parent{$1} = $2; + $taxid_has_taxlevel{$1} = $3; + } + close $fh; + + $fh = &read_fh($namesfile); + while (my $line = <$fh>) { + next unless $line =~ /^(\d+)\s*\|\s*(.+?)\s*\|.+scientific name/; + $taxid_has_name{$1} = $2; + } +} + +############################################################ + +sub fastafile2hash { + my $fastafile = shift @_; + my %sequences; + my $fh = &read_fh($fastafile); + my $seqid; + while (my $line = <$fh>) { + if ($line =~ /^>(\S+)(.*)/) { + $seqid = $1; + $sequences{$seqid}{desc} = $2; + } + else { + chomp $line; + $sequences{$seqid}{seq} .= $line; + $sequences{$seqid}{len} += length $line; + $sequences{$seqid}{gc} += ($line =~ tr/gcGC/gcGC/); + $line =~ s/[^atgc]/N/ig; + $sequences{$seqid}{nonatgc} += ($line =~ tr/N/N/); + } + } + close $fh; + return \%sequences; +} + +############################################################ + +sub read_fh { + my $filename = shift @_; + my $filehandle; + if ($filename =~ /gz$/) { + open $filehandle, "gunzip -dc $filename |" or die $!; + } + else { + open $filehandle, "<$filename" or die $!; + } + return $filehandle; +} + +############################################################
