comparison gc_cov_annotate.pl @ 0:0101d7d1211f

initial upload
author Eduardo <eduardoalves@abdn.ac.uk>
date Wed, 25 Nov 2015 11:26:59 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:0101d7d1211f
1 #!/usr/bin/perl
2
3 =head1 NAME
4 gc_cov_annotate.pl
5 =head1 SYNOPSIS
6 gc_cov_annotate.pl --blasttaxid CONTIGTAXIDFILE --assembly ASSEMBLYFASTAFILE [--taxdump TAXDUMPDIR] [--bam BAMFILE...]
7 =head1 AUTHORS
8 sujai.kumar@zoo.ox.ac.uk 2013.09.15
9 =cut
10
11 use strict;
12 use warnings;
13 use Getopt::Long qw(:config pass_through no_ignore_case);
14
15 my ($blasttaxid_file, $taxdump_dir, $assembly_file, $output_file) = ("",".","","");
16 my @tax_list;
17 my @bam_files;
18 my @cov_files;
19
20 GetOptions (
21 "blasttaxid=s" => \$blasttaxid_file,
22 "assembly=s" => \$assembly_file,
23 "out:s" => \$output_file,
24 "taxdump:s" => \$taxdump_dir,
25 "bam:s{,}" => \@bam_files,
26 "cov:s{,}" => \@cov_files,
27 "taxlist:s{,}" => \@tax_list,
28 );
29
30 if (not @tax_list) {@tax_list = ("species","order","phylum","superkingdom")};
31 my %tax_levels;
32 foreach (@tax_list) {$tax_levels{$_}=1}
33
34 if (not $output_file) {$output_file = $assembly_file . ".txt"};
35
36 print "Usage: gc_cov_annotate.pl --blasttaxid CONTIGTAXIDFILE --assembly ASSEMBLYFASTAFILE [--taxdump TAXDUMPDIR] [--bam BAMFILE...] [--cov COVFILES...] [--taxlist species...]\n" .
37 "--taxdump is '.' by default, i.e. the files nodes.dmp and names.dmp from the NCBI taxonomy database ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz are expected to be in the current directory\n" .
38 "--taxlist: default is: species order phylum superkingdom, but you can add any other NCBI taxlevel such as class family suborder etc\n" unless
39 (-r $blasttaxid_file or $blasttaxid_file eq "-") and -r "$taxdump_dir/nodes.dmp" and -r "$taxdump_dir/names.dmp" and
40 (-r $assembly_file or $assembly_file eq "-");
41
42 die "Please specify --blasttaxid\n" unless (-r $blasttaxid_file or $blasttaxid_file eq "-");
43 die "Please specify --taxdump\n" unless -r "$taxdump_dir/nodes.dmp" and -r "$taxdump_dir/names.dmp";
44 die "Please specify --assembly\n" unless (-r $assembly_file or $assembly_file eq "-");
45
46 #-----------------------------------------------------------------
47 # Get taxon annotation info from contig-taxid file
48 #-----------------------------------------------------------------
49
50 my (%taxid_has_parent, %taxid_has_taxlevel, %taxid_has_name);
51 print STDERR scalar localtime() . " - Loading $taxdump_dir/nodes.dmp and $taxdump_dir/names.dmp into memory ...\n";
52 &load_nodes_names ("$taxdump_dir/nodes.dmp","$taxdump_dir/names.dmp");
53 print STDERR scalar localtime() . " - Loading $taxdump_dir/nodes.dmp and $taxdump_dir/names.dmp into memory ... DONE\n";
54
55 my $blasttaxid_fh = &read_fh($blasttaxid_file);
56 my %contig_taxinfo;
57 while (<$blasttaxid_fh>) {
58 die "Contig-taxid file $blasttaxid_file does not seem to have two cols with the seqid in the first col and taxid in the second col" unless
59 /^(\S+)\t(\d+)/;
60 $contig_taxinfo{$1} = &taxonomy_report($2);
61 }
62 close $blasttaxid_fh;
63
64 #-----------------------------------------------------------------
65 # Calculate GC, len, for assembly file, get coverage from bamfiles
66 #-----------------------------------------------------------------
67
68 # load assembly file into memory:
69 print STDERR scalar localtime() . " - Loading assembly fasta file $assembly_file into memory ...\n";
70 my $fastahash = &fastafile2hash ($assembly_file);
71 print STDERR scalar localtime() . " - Loading assembly fasta file $assembly_file into memory ... DONE\n";
72
73 # calculate coverage for each bam file
74 for my $bam_file (@bam_files) {
75 my $reads_processed = 0;
76 my ($refstart, $refend, $readid, @F); # declaring here to avoid having to declare in each instance of loop
77 open SAM, "samtools view $bam_file |" or die $!;
78 print STDERR scalar localtime() . " - Reading $bam_file ...\n";
79 while (<SAM>) {
80 # ignore comment lines/ SAM headers
81 next if /^\s*$/ or /^@/ or /^#/;
82 chomp;
83 @F=split/\t/;
84 if (($F[1] & 4) != 4) {
85 $refstart = $F[3];
86 $refend = $F[3] - 1;
87 while ($F[5] =~ /(\d+)[MIDNP]/g) { $refend += $1 } # calculate coverage on reference sequence from CIGAR
88 die "ContigID $F[2] in bamfile $bam_file, but not in assembly file $assembly_file\n" if not exists $$fastahash{$F[2]};
89 $$fastahash{$F[2]}{$bam_file} += ($refend - $refstart + 1)/$$fastahash{$F[2]}{len};
90 }
91 $reads_processed++;
92 print STDERR "Processed $reads_processed reads by " . localtime() . "...\n" if $reads_processed % 1000000 == 0;
93 }
94 print STDERR scalar localtime() . " - Reading $bam_file ... DONE\n";
95 }
96
97 # calculate coverage for each cov file (two col file with seqid in first col, average read coverage depth in second col)
98 for my $cov_file (@cov_files) {
99 open COV, $cov_file or die $!;
100 print STDERR scalar localtime() . " - Reading $cov_file ...\n";
101 while (<COV>) {
102 /^(\S+)\s+(\S+)/ and $$fastahash{$1}{$cov_file} = $2;
103 }
104 print STDERR scalar localtime() . " - Reading $cov_file ... DONE\n";
105 }
106
107 # print len gc cov (one for each bam, and each cov) and taxon annotation info in one large file (that can be used for plotting by R)
108 print STDERR scalar localtime() . " - Making len gc cov taxon annotation data file $assembly_file.txt for R ...\n";
109 open LENCOVGC, ">$output_file" or die $!;
110
111 # header row:
112 print LENCOVGC "seqid\tlen\tgc";
113 foreach (@bam_files) {print LENCOVGC "\tcov_$_"};
114 foreach (@cov_files) {print LENCOVGC "\tcov_$_"};
115 foreach (@tax_list) {print LENCOVGC "\ttaxlevel_$_"};
116 print LENCOVGC "\n";
117
118 # for each contig/seqid:
119 my ($seqid, $length, $gccount, $nonatgc, $totalcov, $cov); # declared outside loop for efficiency
120 for $seqid (keys %{$fastahash}) {
121 $length = $$fastahash{$seqid}{len};
122 $gccount = $$fastahash{$seqid}{gc};
123 $nonatgc = $$fastahash{$seqid}{nonatgc};
124 print LENCOVGC "$seqid\t$length\t". ($gccount/($length-$nonatgc));
125 for my $bam_file (@bam_files) {
126 $cov = (exists($$fastahash{$seqid}{$bam_file}) ? $$fastahash{$seqid}{$bam_file} : 0);
127 print LENCOVGC "\t" . $cov;
128 }
129 for my $cov_file (@cov_files) {
130 $cov = (exists($$fastahash{$seqid}{$cov_file}) ? $$fastahash{$seqid}{$cov_file} : 0);
131 print LENCOVGC "\t" . $cov;
132 }
133 for my $tax_level (@tax_list) {
134 print LENCOVGC "\t" . (exists(${$contig_taxinfo{$seqid}}{$tax_level}) ? ${$contig_taxinfo{$seqid}}{$tax_level} : "Not annotated");
135 }
136 print LENCOVGC "\n";
137 }
138 print STDERR scalar localtime() . " - Making len gc cov taxon annotation data file $assembly_file.txt for R ... DONE\n";
139
140 ############################################################
141
142 sub taxonomy_report {
143 my $hit_taxid = shift @_;
144 my @parents = &get_parents($hit_taxid);
145 # convert @parents to tax names:
146 my %taxinfo;
147 # my $taxonomy_report_string = "";
148 for my $parent (@parents) {
149 if (exists $taxid_has_taxlevel{$parent} and exists $tax_levels{$taxid_has_taxlevel{$parent}}) {
150 $taxinfo{$taxid_has_taxlevel{$parent}} = $taxid_has_name{$parent};
151 }
152 }
153 return \%taxinfo;
154 # for my $tax_level (keys %hit_counts) {
155 # for my $tax_name (keys %{$hit_counts{$tax_level}}) {
156 # $taxonomy_report_string .= "$tax_level\t$tax_name\t";
157 # }
158 # }
159 # return $taxonomy_report_string . "\n";
160 }
161
162 ############################################################
163
164 sub get_parents {
165 my @all = @_;
166 my $current_id = $all[0];
167 if (exists $taxid_has_parent{$current_id} and $current_id ne $taxid_has_parent{$current_id}) {
168 unshift @all, $taxid_has_parent{$current_id};
169 @all = &get_parents(@all);
170 }
171 return @all;
172 }
173
174 ############################################################
175
176 sub load_nodes_names {
177 my $fh;
178 my $nodesfile = shift @_;
179 my $namesfile = shift @_;
180 $fh = &read_fh($nodesfile);
181 while (my $line = <$fh>) {
182 # line in nodes.dmp should match the regexp below.
183 # Change the regexp if NCBI changes their file format
184 next if $line !~ /^(\d+)\s*\|\s*(\d+)\s*\|\s*(.+?)\s*\|/;
185 $taxid_has_parent{$1} = $2;
186 $taxid_has_taxlevel{$1} = $3;
187 }
188 close $fh;
189
190 $fh = &read_fh($namesfile);
191 while (my $line = <$fh>) {
192 next unless $line =~ /^(\d+)\s*\|\s*(.+?)\s*\|.+scientific name/;
193 $taxid_has_name{$1} = $2;
194 }
195 }
196
197 ############################################################
198
199 sub fastafile2hash {
200 my $fastafile = shift @_;
201 my %sequences;
202 my $fh = &read_fh($fastafile);
203 my $seqid;
204 while (my $line = <$fh>) {
205 if ($line =~ /^>(\S+)(.*)/) {
206 $seqid = $1;
207 $sequences{$seqid}{desc} = $2;
208 }
209 else {
210 chomp $line;
211 $sequences{$seqid}{seq} .= $line;
212 $sequences{$seqid}{len} += length $line;
213 $sequences{$seqid}{gc} += ($line =~ tr/gcGC/gcGC/);
214 $line =~ s/[^atgc]/N/ig;
215 $sequences{$seqid}{nonatgc} += ($line =~ tr/N/N/);
216 }
217 }
218 close $fh;
219 return \%sequences;
220 }
221
222 ############################################################
223
224 sub read_fh {
225 my $filename = shift @_;
226 my $filehandle;
227 if ($filename =~ /gz$/) {
228 open $filehandle, "gunzip -dc $filename |" or die $!;
229 }
230 else {
231 open $filehandle, "<$filename" or die $!;
232 }
233 return $filehandle;
234 }
235
236 ############################################################