annotate gc_cov_annotate.pl @ 4:f35d462f57b9

added toolshed and changeset_revision
author Eduardo <eduardoalves@abdn.ac.uk>
date Wed, 25 Nov 2015 13:39:32 +0000
parents 0101d7d1211f
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
1 #!/usr/bin/perl
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
2
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
3 =head1 NAME
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
4 gc_cov_annotate.pl
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
5 =head1 SYNOPSIS
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
6 gc_cov_annotate.pl --blasttaxid CONTIGTAXIDFILE --assembly ASSEMBLYFASTAFILE [--taxdump TAXDUMPDIR] [--bam BAMFILE...]
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
7 =head1 AUTHORS
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
8 sujai.kumar@zoo.ox.ac.uk 2013.09.15
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
9 =cut
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
10
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
11 use strict;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
12 use warnings;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
13 use Getopt::Long qw(:config pass_through no_ignore_case);
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
14
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
15 my ($blasttaxid_file, $taxdump_dir, $assembly_file, $output_file) = ("",".","","");
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
16 my @tax_list;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
17 my @bam_files;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
18 my @cov_files;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
19
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
20 GetOptions (
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
21 "blasttaxid=s" => \$blasttaxid_file,
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
22 "assembly=s" => \$assembly_file,
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
23 "out:s" => \$output_file,
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
24 "taxdump:s" => \$taxdump_dir,
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
25 "bam:s{,}" => \@bam_files,
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
26 "cov:s{,}" => \@cov_files,
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
27 "taxlist:s{,}" => \@tax_list,
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
28 );
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
29
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
30 if (not @tax_list) {@tax_list = ("species","order","phylum","superkingdom")};
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
31 my %tax_levels;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
32 foreach (@tax_list) {$tax_levels{$_}=1}
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
33
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
34 if (not $output_file) {$output_file = $assembly_file . ".txt"};
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
35
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
36 print "Usage: gc_cov_annotate.pl --blasttaxid CONTIGTAXIDFILE --assembly ASSEMBLYFASTAFILE [--taxdump TAXDUMPDIR] [--bam BAMFILE...] [--cov COVFILES...] [--taxlist species...]\n" .
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
37 "--taxdump is '.' by default, i.e. the files nodes.dmp and names.dmp from the NCBI taxonomy database ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz are expected to be in the current directory\n" .
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
38 "--taxlist: default is: species order phylum superkingdom, but you can add any other NCBI taxlevel such as class family suborder etc\n" unless
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
39 (-r $blasttaxid_file or $blasttaxid_file eq "-") and -r "$taxdump_dir/nodes.dmp" and -r "$taxdump_dir/names.dmp" and
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
40 (-r $assembly_file or $assembly_file eq "-");
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
41
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
42 die "Please specify --blasttaxid\n" unless (-r $blasttaxid_file or $blasttaxid_file eq "-");
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
43 die "Please specify --taxdump\n" unless -r "$taxdump_dir/nodes.dmp" and -r "$taxdump_dir/names.dmp";
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
44 die "Please specify --assembly\n" unless (-r $assembly_file or $assembly_file eq "-");
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
45
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
46 #-----------------------------------------------------------------
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
47 # Get taxon annotation info from contig-taxid file
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
48 #-----------------------------------------------------------------
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
49
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
50 my (%taxid_has_parent, %taxid_has_taxlevel, %taxid_has_name);
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
51 print STDERR scalar localtime() . " - Loading $taxdump_dir/nodes.dmp and $taxdump_dir/names.dmp into memory ...\n";
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
52 &load_nodes_names ("$taxdump_dir/nodes.dmp","$taxdump_dir/names.dmp");
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
53 print STDERR scalar localtime() . " - Loading $taxdump_dir/nodes.dmp and $taxdump_dir/names.dmp into memory ... DONE\n";
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
54
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
55 my $blasttaxid_fh = &read_fh($blasttaxid_file);
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
56 my %contig_taxinfo;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
57 while (<$blasttaxid_fh>) {
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
58 die "Contig-taxid file $blasttaxid_file does not seem to have two cols with the seqid in the first col and taxid in the second col" unless
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
59 /^(\S+)\t(\d+)/;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
60 $contig_taxinfo{$1} = &taxonomy_report($2);
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
61 }
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
62 close $blasttaxid_fh;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
63
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
64 #-----------------------------------------------------------------
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
65 # Calculate GC, len, for assembly file, get coverage from bamfiles
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
66 #-----------------------------------------------------------------
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
67
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
68 # load assembly file into memory:
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
69 print STDERR scalar localtime() . " - Loading assembly fasta file $assembly_file into memory ...\n";
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
70 my $fastahash = &fastafile2hash ($assembly_file);
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
71 print STDERR scalar localtime() . " - Loading assembly fasta file $assembly_file into memory ... DONE\n";
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
72
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
73 # calculate coverage for each bam file
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
74 for my $bam_file (@bam_files) {
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
75 my $reads_processed = 0;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
76 my ($refstart, $refend, $readid, @F); # declaring here to avoid having to declare in each instance of loop
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
77 open SAM, "samtools view $bam_file |" or die $!;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
78 print STDERR scalar localtime() . " - Reading $bam_file ...\n";
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
79 while (<SAM>) {
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
80 # ignore comment lines/ SAM headers
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
81 next if /^\s*$/ or /^@/ or /^#/;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
82 chomp;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
83 @F=split/\t/;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
84 if (($F[1] & 4) != 4) {
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
85 $refstart = $F[3];
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
86 $refend = $F[3] - 1;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
87 while ($F[5] =~ /(\d+)[MIDNP]/g) { $refend += $1 } # calculate coverage on reference sequence from CIGAR
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
88 die "ContigID $F[2] in bamfile $bam_file, but not in assembly file $assembly_file\n" if not exists $$fastahash{$F[2]};
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
89 $$fastahash{$F[2]}{$bam_file} += ($refend - $refstart + 1)/$$fastahash{$F[2]}{len};
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
90 }
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
91 $reads_processed++;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
92 print STDERR "Processed $reads_processed reads by " . localtime() . "...\n" if $reads_processed % 1000000 == 0;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
93 }
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
94 print STDERR scalar localtime() . " - Reading $bam_file ... DONE\n";
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
95 }
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
96
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
97 # calculate coverage for each cov file (two col file with seqid in first col, average read coverage depth in second col)
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
98 for my $cov_file (@cov_files) {
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
99 open COV, $cov_file or die $!;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
100 print STDERR scalar localtime() . " - Reading $cov_file ...\n";
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
101 while (<COV>) {
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
102 /^(\S+)\s+(\S+)/ and $$fastahash{$1}{$cov_file} = $2;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
103 }
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
104 print STDERR scalar localtime() . " - Reading $cov_file ... DONE\n";
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
105 }
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
106
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
107 # print len gc cov (one for each bam, and each cov) and taxon annotation info in one large file (that can be used for plotting by R)
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
108 print STDERR scalar localtime() . " - Making len gc cov taxon annotation data file $assembly_file.txt for R ...\n";
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
109 open LENCOVGC, ">$output_file" or die $!;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
110
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
111 # header row:
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
112 print LENCOVGC "seqid\tlen\tgc";
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
113 foreach (@bam_files) {print LENCOVGC "\tcov_$_"};
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
114 foreach (@cov_files) {print LENCOVGC "\tcov_$_"};
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
115 foreach (@tax_list) {print LENCOVGC "\ttaxlevel_$_"};
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
116 print LENCOVGC "\n";
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
117
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
118 # for each contig/seqid:
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
119 my ($seqid, $length, $gccount, $nonatgc, $totalcov, $cov); # declared outside loop for efficiency
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
120 for $seqid (keys %{$fastahash}) {
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
121 $length = $$fastahash{$seqid}{len};
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
122 $gccount = $$fastahash{$seqid}{gc};
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
123 $nonatgc = $$fastahash{$seqid}{nonatgc};
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
124 print LENCOVGC "$seqid\t$length\t". ($gccount/($length-$nonatgc));
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
125 for my $bam_file (@bam_files) {
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
126 $cov = (exists($$fastahash{$seqid}{$bam_file}) ? $$fastahash{$seqid}{$bam_file} : 0);
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
127 print LENCOVGC "\t" . $cov;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
128 }
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
129 for my $cov_file (@cov_files) {
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
130 $cov = (exists($$fastahash{$seqid}{$cov_file}) ? $$fastahash{$seqid}{$cov_file} : 0);
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
131 print LENCOVGC "\t" . $cov;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
132 }
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
133 for my $tax_level (@tax_list) {
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
134 print LENCOVGC "\t" . (exists(${$contig_taxinfo{$seqid}}{$tax_level}) ? ${$contig_taxinfo{$seqid}}{$tax_level} : "Not annotated");
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
135 }
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
136 print LENCOVGC "\n";
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
137 }
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
138 print STDERR scalar localtime() . " - Making len gc cov taxon annotation data file $assembly_file.txt for R ... DONE\n";
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
139
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
140 ############################################################
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
141
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
142 sub taxonomy_report {
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
143 my $hit_taxid = shift @_;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
144 my @parents = &get_parents($hit_taxid);
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
145 # convert @parents to tax names:
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
146 my %taxinfo;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
147 # my $taxonomy_report_string = "";
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
148 for my $parent (@parents) {
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
149 if (exists $taxid_has_taxlevel{$parent} and exists $tax_levels{$taxid_has_taxlevel{$parent}}) {
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
150 $taxinfo{$taxid_has_taxlevel{$parent}} = $taxid_has_name{$parent};
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
151 }
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
152 }
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
153 return \%taxinfo;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
154 # for my $tax_level (keys %hit_counts) {
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
155 # for my $tax_name (keys %{$hit_counts{$tax_level}}) {
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
156 # $taxonomy_report_string .= "$tax_level\t$tax_name\t";
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
157 # }
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
158 # }
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
159 # return $taxonomy_report_string . "\n";
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
160 }
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
161
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
162 ############################################################
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
163
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
164 sub get_parents {
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
165 my @all = @_;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
166 my $current_id = $all[0];
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
167 if (exists $taxid_has_parent{$current_id} and $current_id ne $taxid_has_parent{$current_id}) {
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
168 unshift @all, $taxid_has_parent{$current_id};
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
169 @all = &get_parents(@all);
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
170 }
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
171 return @all;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
172 }
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
173
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
174 ############################################################
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
175
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
176 sub load_nodes_names {
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
177 my $fh;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
178 my $nodesfile = shift @_;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
179 my $namesfile = shift @_;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
180 $fh = &read_fh($nodesfile);
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
181 while (my $line = <$fh>) {
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
182 # line in nodes.dmp should match the regexp below.
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
183 # Change the regexp if NCBI changes their file format
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
184 next if $line !~ /^(\d+)\s*\|\s*(\d+)\s*\|\s*(.+?)\s*\|/;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
185 $taxid_has_parent{$1} = $2;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
186 $taxid_has_taxlevel{$1} = $3;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
187 }
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
188 close $fh;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
189
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
190 $fh = &read_fh($namesfile);
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
191 while (my $line = <$fh>) {
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
192 next unless $line =~ /^(\d+)\s*\|\s*(.+?)\s*\|.+scientific name/;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
193 $taxid_has_name{$1} = $2;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
194 }
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
195 }
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
196
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
197 ############################################################
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
198
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
199 sub fastafile2hash {
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
200 my $fastafile = shift @_;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
201 my %sequences;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
202 my $fh = &read_fh($fastafile);
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
203 my $seqid;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
204 while (my $line = <$fh>) {
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
205 if ($line =~ /^>(\S+)(.*)/) {
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
206 $seqid = $1;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
207 $sequences{$seqid}{desc} = $2;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
208 }
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
209 else {
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
210 chomp $line;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
211 $sequences{$seqid}{seq} .= $line;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
212 $sequences{$seqid}{len} += length $line;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
213 $sequences{$seqid}{gc} += ($line =~ tr/gcGC/gcGC/);
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
214 $line =~ s/[^atgc]/N/ig;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
215 $sequences{$seqid}{nonatgc} += ($line =~ tr/N/N/);
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
216 }
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
217 }
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
218 close $fh;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
219 return \%sequences;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
220 }
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
221
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
222 ############################################################
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
223
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
224 sub read_fh {
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
225 my $filename = shift @_;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
226 my $filehandle;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
227 if ($filename =~ /gz$/) {
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
228 open $filehandle, "gunzip -dc $filename |" or die $!;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
229 }
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
230 else {
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
231 open $filehandle, "<$filename" or die $!;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
232 }
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
233 return $filehandle;
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
234 }
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
235
0101d7d1211f initial upload
Eduardo <eduardoalves@abdn.ac.uk>
parents:
diff changeset
236 ############################################################