|
0
|
1 #!/usr/bin/perl
|
|
|
2
|
|
|
3 =head1 NAME
|
|
|
4 gc_cov_annotate.pl
|
|
|
5 =head1 SYNOPSIS
|
|
|
6 gc_cov_annotate.pl --blasttaxid CONTIGTAXIDFILE --assembly ASSEMBLYFASTAFILE [--taxdump TAXDUMPDIR] [--bam BAMFILE...]
|
|
|
7 =head1 AUTHORS
|
|
|
8 sujai.kumar@zoo.ox.ac.uk 2013.09.15
|
|
|
9 =cut
|
|
|
10
|
|
|
11 use strict;
|
|
|
12 use warnings;
|
|
|
13 use Getopt::Long qw(:config pass_through no_ignore_case);
|
|
|
14
|
|
|
15 my ($blasttaxid_file, $taxdump_dir, $assembly_file, $output_file) = ("",".","","");
|
|
|
16 my @tax_list;
|
|
|
17 my @bam_files;
|
|
|
18 my @cov_files;
|
|
|
19
|
|
|
20 GetOptions (
|
|
|
21 "blasttaxid=s" => \$blasttaxid_file,
|
|
|
22 "assembly=s" => \$assembly_file,
|
|
|
23 "out:s" => \$output_file,
|
|
|
24 "taxdump:s" => \$taxdump_dir,
|
|
|
25 "bam:s{,}" => \@bam_files,
|
|
|
26 "cov:s{,}" => \@cov_files,
|
|
|
27 "taxlist:s{,}" => \@tax_list,
|
|
|
28 );
|
|
|
29
|
|
|
30 if (not @tax_list) {@tax_list = ("species","order","phylum","superkingdom")};
|
|
|
31 my %tax_levels;
|
|
|
32 foreach (@tax_list) {$tax_levels{$_}=1}
|
|
|
33
|
|
|
34 if (not $output_file) {$output_file = $assembly_file . ".txt"};
|
|
|
35
|
|
|
36 print "Usage: gc_cov_annotate.pl --blasttaxid CONTIGTAXIDFILE --assembly ASSEMBLYFASTAFILE [--taxdump TAXDUMPDIR] [--bam BAMFILE...] [--cov COVFILES...] [--taxlist species...]\n" .
|
|
|
37 "--taxdump is '.' by default, i.e. the files nodes.dmp and names.dmp from the NCBI taxonomy database ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz are expected to be in the current directory\n" .
|
|
|
38 "--taxlist: default is: species order phylum superkingdom, but you can add any other NCBI taxlevel such as class family suborder etc\n" unless
|
|
|
39 (-r $blasttaxid_file or $blasttaxid_file eq "-") and -r "$taxdump_dir/nodes.dmp" and -r "$taxdump_dir/names.dmp" and
|
|
|
40 (-r $assembly_file or $assembly_file eq "-");
|
|
|
41
|
|
|
42 die "Please specify --blasttaxid\n" unless (-r $blasttaxid_file or $blasttaxid_file eq "-");
|
|
|
43 die "Please specify --taxdump\n" unless -r "$taxdump_dir/nodes.dmp" and -r "$taxdump_dir/names.dmp";
|
|
|
44 die "Please specify --assembly\n" unless (-r $assembly_file or $assembly_file eq "-");
|
|
|
45
|
|
|
46 #-----------------------------------------------------------------
|
|
|
47 # Get taxon annotation info from contig-taxid file
|
|
|
48 #-----------------------------------------------------------------
|
|
|
49
|
|
|
50 my (%taxid_has_parent, %taxid_has_taxlevel, %taxid_has_name);
|
|
|
51 print STDERR scalar localtime() . " - Loading $taxdump_dir/nodes.dmp and $taxdump_dir/names.dmp into memory ...\n";
|
|
|
52 &load_nodes_names ("$taxdump_dir/nodes.dmp","$taxdump_dir/names.dmp");
|
|
|
53 print STDERR scalar localtime() . " - Loading $taxdump_dir/nodes.dmp and $taxdump_dir/names.dmp into memory ... DONE\n";
|
|
|
54
|
|
|
55 my $blasttaxid_fh = &read_fh($blasttaxid_file);
|
|
|
56 my %contig_taxinfo;
|
|
|
57 while (<$blasttaxid_fh>) {
|
|
|
58 die "Contig-taxid file $blasttaxid_file does not seem to have two cols with the seqid in the first col and taxid in the second col" unless
|
|
|
59 /^(\S+)\t(\d+)/;
|
|
|
60 $contig_taxinfo{$1} = &taxonomy_report($2);
|
|
|
61 }
|
|
|
62 close $blasttaxid_fh;
|
|
|
63
|
|
|
64 #-----------------------------------------------------------------
|
|
|
65 # Calculate GC, len, for assembly file, get coverage from bamfiles
|
|
|
66 #-----------------------------------------------------------------
|
|
|
67
|
|
|
68 # load assembly file into memory:
|
|
|
69 print STDERR scalar localtime() . " - Loading assembly fasta file $assembly_file into memory ...\n";
|
|
|
70 my $fastahash = &fastafile2hash ($assembly_file);
|
|
|
71 print STDERR scalar localtime() . " - Loading assembly fasta file $assembly_file into memory ... DONE\n";
|
|
|
72
|
|
|
73 # calculate coverage for each bam file
|
|
|
74 for my $bam_file (@bam_files) {
|
|
|
75 my $reads_processed = 0;
|
|
|
76 my ($refstart, $refend, $readid, @F); # declaring here to avoid having to declare in each instance of loop
|
|
|
77 open SAM, "samtools view $bam_file |" or die $!;
|
|
|
78 print STDERR scalar localtime() . " - Reading $bam_file ...\n";
|
|
|
79 while (<SAM>) {
|
|
|
80 # ignore comment lines/ SAM headers
|
|
|
81 next if /^\s*$/ or /^@/ or /^#/;
|
|
|
82 chomp;
|
|
|
83 @F=split/\t/;
|
|
|
84 if (($F[1] & 4) != 4) {
|
|
|
85 $refstart = $F[3];
|
|
|
86 $refend = $F[3] - 1;
|
|
|
87 while ($F[5] =~ /(\d+)[MIDNP]/g) { $refend += $1 } # calculate coverage on reference sequence from CIGAR
|
|
|
88 die "ContigID $F[2] in bamfile $bam_file, but not in assembly file $assembly_file\n" if not exists $$fastahash{$F[2]};
|
|
|
89 $$fastahash{$F[2]}{$bam_file} += ($refend - $refstart + 1)/$$fastahash{$F[2]}{len};
|
|
|
90 }
|
|
|
91 $reads_processed++;
|
|
|
92 print STDERR "Processed $reads_processed reads by " . localtime() . "...\n" if $reads_processed % 1000000 == 0;
|
|
|
93 }
|
|
|
94 print STDERR scalar localtime() . " - Reading $bam_file ... DONE\n";
|
|
|
95 }
|
|
|
96
|
|
|
97 # calculate coverage for each cov file (two col file with seqid in first col, average read coverage depth in second col)
|
|
|
98 for my $cov_file (@cov_files) {
|
|
|
99 open COV, $cov_file or die $!;
|
|
|
100 print STDERR scalar localtime() . " - Reading $cov_file ...\n";
|
|
|
101 while (<COV>) {
|
|
|
102 /^(\S+)\s+(\S+)/ and $$fastahash{$1}{$cov_file} = $2;
|
|
|
103 }
|
|
|
104 print STDERR scalar localtime() . " - Reading $cov_file ... DONE\n";
|
|
|
105 }
|
|
|
106
|
|
|
107 # print len gc cov (one for each bam, and each cov) and taxon annotation info in one large file (that can be used for plotting by R)
|
|
|
108 print STDERR scalar localtime() . " - Making len gc cov taxon annotation data file $assembly_file.txt for R ...\n";
|
|
|
109 open LENCOVGC, ">$output_file" or die $!;
|
|
|
110
|
|
|
111 # header row:
|
|
|
112 print LENCOVGC "seqid\tlen\tgc";
|
|
|
113 foreach (@bam_files) {print LENCOVGC "\tcov_$_"};
|
|
|
114 foreach (@cov_files) {print LENCOVGC "\tcov_$_"};
|
|
|
115 foreach (@tax_list) {print LENCOVGC "\ttaxlevel_$_"};
|
|
|
116 print LENCOVGC "\n";
|
|
|
117
|
|
|
118 # for each contig/seqid:
|
|
|
119 my ($seqid, $length, $gccount, $nonatgc, $totalcov, $cov); # declared outside loop for efficiency
|
|
|
120 for $seqid (keys %{$fastahash}) {
|
|
|
121 $length = $$fastahash{$seqid}{len};
|
|
|
122 $gccount = $$fastahash{$seqid}{gc};
|
|
|
123 $nonatgc = $$fastahash{$seqid}{nonatgc};
|
|
|
124 print LENCOVGC "$seqid\t$length\t". ($gccount/($length-$nonatgc));
|
|
|
125 for my $bam_file (@bam_files) {
|
|
|
126 $cov = (exists($$fastahash{$seqid}{$bam_file}) ? $$fastahash{$seqid}{$bam_file} : 0);
|
|
|
127 print LENCOVGC "\t" . $cov;
|
|
|
128 }
|
|
|
129 for my $cov_file (@cov_files) {
|
|
|
130 $cov = (exists($$fastahash{$seqid}{$cov_file}) ? $$fastahash{$seqid}{$cov_file} : 0);
|
|
|
131 print LENCOVGC "\t" . $cov;
|
|
|
132 }
|
|
|
133 for my $tax_level (@tax_list) {
|
|
|
134 print LENCOVGC "\t" . (exists(${$contig_taxinfo{$seqid}}{$tax_level}) ? ${$contig_taxinfo{$seqid}}{$tax_level} : "Not annotated");
|
|
|
135 }
|
|
|
136 print LENCOVGC "\n";
|
|
|
137 }
|
|
|
138 print STDERR scalar localtime() . " - Making len gc cov taxon annotation data file $assembly_file.txt for R ... DONE\n";
|
|
|
139
|
|
|
140 ############################################################
|
|
|
141
|
|
|
142 sub taxonomy_report {
|
|
|
143 my $hit_taxid = shift @_;
|
|
|
144 my @parents = &get_parents($hit_taxid);
|
|
|
145 # convert @parents to tax names:
|
|
|
146 my %taxinfo;
|
|
|
147 # my $taxonomy_report_string = "";
|
|
|
148 for my $parent (@parents) {
|
|
|
149 if (exists $taxid_has_taxlevel{$parent} and exists $tax_levels{$taxid_has_taxlevel{$parent}}) {
|
|
|
150 $taxinfo{$taxid_has_taxlevel{$parent}} = $taxid_has_name{$parent};
|
|
|
151 }
|
|
|
152 }
|
|
|
153 return \%taxinfo;
|
|
|
154 # for my $tax_level (keys %hit_counts) {
|
|
|
155 # for my $tax_name (keys %{$hit_counts{$tax_level}}) {
|
|
|
156 # $taxonomy_report_string .= "$tax_level\t$tax_name\t";
|
|
|
157 # }
|
|
|
158 # }
|
|
|
159 # return $taxonomy_report_string . "\n";
|
|
|
160 }
|
|
|
161
|
|
|
162 ############################################################
|
|
|
163
|
|
|
164 sub get_parents {
|
|
|
165 my @all = @_;
|
|
|
166 my $current_id = $all[0];
|
|
|
167 if (exists $taxid_has_parent{$current_id} and $current_id ne $taxid_has_parent{$current_id}) {
|
|
|
168 unshift @all, $taxid_has_parent{$current_id};
|
|
|
169 @all = &get_parents(@all);
|
|
|
170 }
|
|
|
171 return @all;
|
|
|
172 }
|
|
|
173
|
|
|
174 ############################################################
|
|
|
175
|
|
|
176 sub load_nodes_names {
|
|
|
177 my $fh;
|
|
|
178 my $nodesfile = shift @_;
|
|
|
179 my $namesfile = shift @_;
|
|
|
180 $fh = &read_fh($nodesfile);
|
|
|
181 while (my $line = <$fh>) {
|
|
|
182 # line in nodes.dmp should match the regexp below.
|
|
|
183 # Change the regexp if NCBI changes their file format
|
|
|
184 next if $line !~ /^(\d+)\s*\|\s*(\d+)\s*\|\s*(.+?)\s*\|/;
|
|
|
185 $taxid_has_parent{$1} = $2;
|
|
|
186 $taxid_has_taxlevel{$1} = $3;
|
|
|
187 }
|
|
|
188 close $fh;
|
|
|
189
|
|
|
190 $fh = &read_fh($namesfile);
|
|
|
191 while (my $line = <$fh>) {
|
|
|
192 next unless $line =~ /^(\d+)\s*\|\s*(.+?)\s*\|.+scientific name/;
|
|
|
193 $taxid_has_name{$1} = $2;
|
|
|
194 }
|
|
|
195 }
|
|
|
196
|
|
|
197 ############################################################
|
|
|
198
|
|
|
199 sub fastafile2hash {
|
|
|
200 my $fastafile = shift @_;
|
|
|
201 my %sequences;
|
|
|
202 my $fh = &read_fh($fastafile);
|
|
|
203 my $seqid;
|
|
|
204 while (my $line = <$fh>) {
|
|
|
205 if ($line =~ /^>(\S+)(.*)/) {
|
|
|
206 $seqid = $1;
|
|
|
207 $sequences{$seqid}{desc} = $2;
|
|
|
208 }
|
|
|
209 else {
|
|
|
210 chomp $line;
|
|
|
211 $sequences{$seqid}{seq} .= $line;
|
|
|
212 $sequences{$seqid}{len} += length $line;
|
|
|
213 $sequences{$seqid}{gc} += ($line =~ tr/gcGC/gcGC/);
|
|
|
214 $line =~ s/[^atgc]/N/ig;
|
|
|
215 $sequences{$seqid}{nonatgc} += ($line =~ tr/N/N/);
|
|
|
216 }
|
|
|
217 }
|
|
|
218 close $fh;
|
|
|
219 return \%sequences;
|
|
|
220 }
|
|
|
221
|
|
|
222 ############################################################
|
|
|
223
|
|
|
224 sub read_fh {
|
|
|
225 my $filename = shift @_;
|
|
|
226 my $filehandle;
|
|
|
227 if ($filename =~ /gz$/) {
|
|
|
228 open $filehandle, "gunzip -dc $filename |" or die $!;
|
|
|
229 }
|
|
|
230 else {
|
|
|
231 open $filehandle, "<$filename" or die $!;
|
|
|
232 }
|
|
|
233 return $filehandle;
|
|
|
234 }
|
|
|
235
|
|
|
236 ############################################################
|