comparison variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/WuBlastIndexer.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =pod
2
3 =head1 LICENSE
4
5 Copyright (c) 1999-2012 The European Bioinformatics Institute and
6 Genome Research Limited. All rights reserved.
7
8 This software is distributed under a modified Apache license.
9 For license details, please see
10
11 http://www.ensembl.org/info/about/code_licence.html
12
13 =head1 CONTACT
14
15 Please email comments or questions to the public Ensembl
16 developers list at <dev@ensembl.org>.
17
18 Questions may also be sent to the Ensembl help desk at
19 <helpdesk@ensembl.org>.
20
21 =head1 NAME
22
23 Bio::EnsEMBL::Pipeline::FASTA::WuBlastIndexer
24
25 =head1 DESCRIPTION
26
27 Creates WUBlast indexes of the given GZipped file. The resulting index
28 is created beneath the location given by the I<base_path> parameter, inside a
29 C<blast> directory and then a sub-directory named after the type of dump. The
30 type of dump also changes the generated file name: genomic dumps have their
31 release number replaced with the last repeat masked date.
32
33 Allowed parameters are:
34
35 =over 8
36
37 =item file - The file to index
38
39 =item program - The location of the xdformat program
40
41 =item molecule - The type of molecule to index. I<dna> and I<pep> are allowed
42
43 =item type - Type of index we are creating. I<genomic> and I<genes> are allowed
44
45 =item base_path - The base of the dumps
46
47 =item release - Required for correct DB naming
48
49 =back
50
51 =cut
52
53 package Bio::EnsEMBL::Pipeline::FASTA::WuBlastIndexer;
54
55 use strict;
56 use warnings;
57 use base qw/Bio::EnsEMBL::Pipeline::FASTA::Indexer/;
58
59 use Bio::EnsEMBL::Utils::Exception qw/throw/;
60 use File::Copy qw/copy/;
61 use File::Spec;
62 use POSIX qw/strftime/;
63
# Default parameter values for this runnable.
#
# Only 'program' (the indexing executable) has a default. 'molecule' and
# 'type' have none; fetch_input() rejects any job that does not supply them.
#
# Returns a hash reference of parameter name => default value.
sub param_defaults {
  my ($self) = @_;
  my %defaults = (program => 'xdformat');
  return \%defaults;
}
72
# Validates the job parameters before any indexing work is attempted.
#
# Throws if 'molecule' is not 'dna' or 'pep', or if 'type' is not 'genomic'
# or 'genes'. An unset (undefined) parameter is rejected with the same
# message, without first emitting "uninitialized value" warnings from the
# string comparisons.
#
# Afterwards calls assert_executable() for the configured 'program' and for
# 'gunzip'.
sub fetch_input {
  my ($self) = @_;

  my $mol = $self->param('molecule');
  if (!defined $mol || ($mol ne 'dna' && $mol ne 'pep')) {
    throw("param 'molecule' must be set to 'dna' or 'pep'");
  }

  my $type = $self->param('type');
  if (!defined $type || ($type ne 'genomic' && $type ne 'genes')) {
    throw("param 'type' must be set to 'genomic' or 'genes'");
  }

  $self->assert_executable($self->param('program'));
  $self->assert_executable('gunzip');
}
86
# Flows a single output id down branch 1 describing the index just built.
#
# The output id carries the current values of the 'species', 'type',
# 'molecule' and 'index_base' parameters. Returns nothing.
sub write_output {
  my ($self) = @_;
  my %output_id = map { ($_ => $self->param($_)) } qw/species type molecule index_base/;
  $self->dataflow_output_id(\%output_id, 1);
  return;
}
97
# Builds the index for $file by running the configured 'program' from inside
# the target directory, then removes $file and records the index location in
# the 'index_base' parameter.
#
# Arguments:
#   $file - path of the FASTA file to index; it is deleted on success.
#
# Throws if the command cannot be started, is terminated by a signal, exits
# with a non-zero return code, or if $file cannot be removed afterwards.
sub index_file {
  my ($self, $file) = @_;
  my $molecule_arg = ($self->param('molecule') eq 'dna') ? '-n' : '-p';
  my $silence      = ($self->debug()) ? 0 : 1;
  my $target_dir   = $self->target_dir();
  my $target_file  = $self->target_file($file);
  my $db_title     = $self->db_title($file);
  my $date         = $self->db_date();
  my $program      = $self->param('program');

  # The command is run through the shell (because of the "cd ... &&"), so
  # every interpolated value is single-quoted to survive spaces and shell
  # metacharacters in paths or titles.
  my $quote = sub {
    my ($arg) = @_;
    $arg =~ s/'/'\\''/g;
    return "'$arg'";
  };

  my $cmd = sprintf(q{cd %s && %s %s -q%d -I -t %s -d %s -o %s %s },
    $quote->($target_dir), $quote->($program), $molecule_arg, $silence,
    $quote->($db_title), $quote->($date), $quote->($target_file), $quote->($file));

  $self->info('About to run "%s"', $cmd);
  my $output   = `$cmd 2>&1`;
  my $status   = $?;
  my $os_error = $!;
  $output = q{} if !defined $output;

  # Inspect the whole wait status, not just the exit code: a child killed by
  # a signal has an exit code of 0 in the high byte and must not be treated
  # as success, otherwise the source file below would be deleted.
  if ($status == -1) {
    throw("Cannot run program '$cmd'. Failed to execute: $os_error");
  }
  if (my $signal = $status & 127) {
    throw("Cannot run program '$cmd'. Terminated by signal ${signal}. Program output was $output");
  }
  if (my $rc = $status >> 8) {
    throw("Cannot run program '$cmd'. Return code was ${rc}. Program output was $output");
  }

  unlink($file) or throw("Cannot remove the file '$file' from the filesystem: $!");
  $self->param('index_base', $target_file);
  return;
}
118
# Returns the full path of the index to create for the given source file:
# the target directory joined with the target file name.
sub target_file {
  my ($self, $source_file) = @_;
  my $dir  = $self->target_dir();
  my $name = $self->target_filename($source_file);
  my $path = File::Spec->catfile($dir, $name);
  return $path;
}
125
# Returns the directory the index is written to, obtained from get_dir()
# with 'blast' and the 'type' parameter as path elements, e.g.
# /nfs/path/to/blast/genes/XXX or /nfs/path/to/blast/genomic/XXX
# NOTE(review): get_dir() is inherited; the exact layout it produces is not
# visible here - confirm against the base class.
sub target_dir {
  my ($self) = @_;
  my @path_parts = ('blast', $self->param('type'));
  return $self->get_dir(@path_parts);
}
131
# Returns the database title for the index: the file name portion of
# $source_file with the release field removed.
#
# The 'release' parameter is matched literally (regex metacharacters are
# escaped) and only as a whole dot-delimited field, so a release of 68 never
# matches the tail of an earlier field such as "Foo68.". Only the first such
# field is removed. If 'release' is undefined or empty the file name is
# returned unchanged.
sub db_title {
  my ($self, $source_file) = @_;
  my (undef, undef, $title) = File::Spec->splitpath($source_file);
  my $release = $self->param('release');

  if (defined $release && $release ne q{}) {
    # Field must start at the beginning of the name or directly after a dot.
    $title =~ s{ (?: \A | (?<=[.]) ) \Q$release\E [.] }{}x;
  }
  return $title;
}
140
# Returns the current UTC date formatted as DD-MM-YYYY; index_file() passes
# this to the indexing program as its -d argument.
sub db_date {
  my ($self) = @_;
  my @utc_now = gmtime();
  return strftime('%d-%m-%Y', @utc_now);
}
145
# Returns the file name (no directory) to use for the index of $source_file.
#
# For any type other than 'genomic' this is the source file name unchanged.
# For 'genomic' the fourth dot-separated field from the end (the release
# number) is replaced with the value of repeat_mask_date():
#
#   Source like   Homo_sapiens.GRCh37.68.dna.toplevel.fa
#   Filename like Homo_sapiens.GRCh37.20090401.dna.toplevel.fa
#
# Throws if a genomic file name has fewer than four dot-separated fields,
# since there is then no release field to replace.
sub target_filename {
  my ($self, $source_file) = @_;
  my (undef, undef, $file) = File::Spec->splitpath($source_file);
  return $file if $self->param('type') ne 'genomic';

  my @fields = split(/\./, $file);
  if (scalar(@fields) < 4) {
    throw("Cannot derive a genomic index name from '$file': expected at least 4 dot-separated fields");
  }

  my $rm_date = $self->repeat_mask_date();
  $fields[-4] = $rm_date;
  return join(q{.}, @fields);
}
159
160 1;