Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/WuBlastIndexer.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =pod | |
2 | |
3 =head1 LICENSE | |
4 | |
5 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
6 Genome Research Limited. All rights reserved. | |
7 | |
8 This software is distributed under a modified Apache license. | |
9 For license details, please see | |
10 | |
11 http://www.ensembl.org/info/about/code_licence.html | |
12 | |
13 =head1 CONTACT | |
14 | |
15 Please email comments or questions to the public Ensembl | |
16 developers list at <dev@ensembl.org>. | |
17 | |
18 Questions may also be sent to the Ensembl help desk at | |
19 <helpdesk@ensembl.org>. | |
20 | |
21 =head1 NAME | |
22 | |
23 Bio::EnsEMBL::Pipeline::FASTA::WuBlastIndexer | |
24 | |
25 =head1 DESCRIPTION | |
26 | |
27 Creates WUBlast indexes of the given GZipped file. The resulting index | |
28 is created under the parameter location I<base_path> in blast and then in a | |
29 directory defined by the type of dump. The type of dump also changes the file | |
30 name generated. Genomic dumps have their release number replaced with the | |
31 last repeat masked date. | |
32 | |
33 Allowed parameters are: | |
34 | |
35 =over 8 | |
36 | |
37 =item file - The file to index | |
38 | |
39 =item program - The location of the xdformat program | |
40 | |
41 =item molecule - The type of molecule to index. I<dna> and I<pep> are allowed | |
42 | |
43 =item type - Type of index we are creating. I<genomic> and I<genes> are allowed | |
44 | |
45 =item base_path - The base of the dumps | |
46 | |
47 =item release - Required for correct DB naming | |
48 | |
49 =back | |
50 | |
51 =cut | |
52 | |
53 package Bio::EnsEMBL::Pipeline::FASTA::WuBlastIndexer; | |
54 | |
55 use strict; | |
56 use warnings; | |
57 use base qw/Bio::EnsEMBL::Pipeline::FASTA::Indexer/; | |
58 | |
59 use Bio::EnsEMBL::Utils::Exception qw/throw/; | |
60 use File::Copy qw/copy/; | |
61 use File::Spec; | |
62 use POSIX qw/strftime/; | |
63 | |
# Default parameter values for this runnable. Only 'program' has a default;
# the 'molecule' and 'type' entries are kept commented out, so no default is
# supplied for them here.
sub param_defaults {
  my ($self) = @_;
  my %defaults = (
    program => 'xdformat',
    # molecule => 'pep',   # pep or dna
    # type     => 'genes', # genes or genomic
  );
  return \%defaults;
}
72 | |
# Validates the job parameters before any indexing work is done.
#
# Throws if 'molecule' is not 'dna' or 'pep', or if 'type' is not 'genomic'
# or 'genes'. An unset parameter is rejected with the same message; the
# defined() guards stop the string comparisons from emitting "uninitialized
# value" warnings first. Finally assert_executable() is called for the
# configured 'program' and for gunzip.
sub fetch_input {
  my ($self) = @_;

  my $mol = $self->param('molecule');
  if(! defined $mol || ($mol ne 'dna' && $mol ne 'pep')) {
    throw("param 'molecule' must be set to 'dna' or 'pep'");
  }

  my $type = $self->param('type');
  if(! defined $type || ($type ne 'genomic' && $type ne 'genes')) {
    throw("param 'type' must be set to 'genomic' or 'genes'");
  }

  $self->assert_executable($self->param('program'));
  $self->assert_executable('gunzip');
  return;
}
86 | |
# Flows one output id on branch 1 carrying the 'species', 'type', 'molecule'
# and 'index_base' params of this job.
sub write_output {
  my ($self) = @_;
  my @pairs;
  foreach my $key (qw/species type molecule index_base/) {
    push @pairs, $key => $self->param($key);
  }
  $self->dataflow_output_id({ @pairs }, 1);
  return;
}
97 | |
# Runs the configured indexing program (xdformat by default) on $file and
# removes $file once the index has been written.
#
# Arguments : $file - path of the file handed to the program as input
# Returns   : nothing; the param 'index_base' is set to the target file
# Throws    : if the command cannot be executed, is terminated by a signal or
#             exits non-zero; also if $file cannot be removed afterwards
#
# NOTE: the arguments are interpolated into a shell command line unquoted, so
# paths and titles must not contain whitespace or shell metacharacters.
sub index_file {
  my ($self, $file) = @_;
  my $molecule_arg = ($self->param('molecule') eq 'dna') ? '-n' : '-p';
  my $silence      = ($self->debug()) ? 0 : 1;
  my $target_dir   = $self->target_dir();
  my $target_file  = $self->target_file($file);
  my $db_title     = $self->db_title($file);
  my $date         = $self->db_date();

  my $cmd = sprintf(q{cd %s && %s %s -q%d -I -t %s -d %s -o %s %s },
    $target_dir, $self->param('program'), $molecule_arg, $silence, $db_title, $date, $target_file, $file);

  $self->info('About to run "%s"', $cmd);
  my $output = `$cmd 2>&1`;
  # Capture both immediately; later calls may overwrite them.
  my ($status, $os_error) = ($?, $!);
  $output = q{} if ! defined $output;

  # $? is -1 when the command could not be started at all.
  if($status == -1) {
    throw("Cannot run program '$cmd'. It could not be executed: $os_error");
  }
  # The low 7 bits hold the terminating signal. Looking only at $? >> 8 would
  # treat a killed process as success and go on to delete the source file.
  if(my $signal = $status & 127) {
    throw("Cannot run program '$cmd'. Terminated by signal ${signal}. Program output was $output");
  }
  my $rc = $status >> 8;
  throw("Cannot run program '$cmd'. Return code was ${rc}. Program output was $output") if $rc;

  unlink $file or throw("Cannot remove the file '$file' from the filesystem: $!");
  $self->param('index_base', $target_file);
  return;
}
118 | |
# Location of the index to create: target_dir() joined with the
# target_filename() derived from $file.
sub target_file {
  my ($self, $file) = @_;
  my @parts = (
    scalar $self->target_dir(),
    scalar $self->target_filename($file),
  );
  return File::Spec->catfile(@parts);
}
125 | |
# Directory the index is written to, obtained from get_dir() with 'blast' and
# the 'type' param; expected to look like /nfs/path/to/blast/genes/XXX or
# /nfs/path/to/blast/genomic/XXX
sub target_dir {
  my ($self) = @_;
  my @path_parts = ('blast', $self->param('type'));
  return $self->get_dir(@path_parts);
}
131 | |
# Derives the database title from the source file name by dropping the
# release component, e.g. with release 68
#   Homo_sapiens.GRCh37.68.dna.toplevel.fa -> Homo_sapiens.GRCh37.dna.toplevel.fa
#
# Arguments : $source_file - path of the file being indexed
# Returns   : the file name (no directory) without the "<release>." component
# Throws    : if the param 'release' is unset or empty
sub db_title {
  my ($self, $source_file) = @_;
  my ($vol, $dir, $file) = File::Spec->splitpath($source_file);
  my $release = $self->param('release');
  # Without a release the pattern would collapse to a bare dot and silently
  # strip the first '.' from the name.
  if(! defined $release || $release eq q{}) {
    throw("param 'release' must be set to derive the DB title");
  }
  my $title = $file;
  # Only remove the release when it is a whole name component (at the start or
  # right after a dot), and quote it so it is matched literally.
  $title =~ s/(?:\A|(?<=\.))\Q$release\E\.//;
  return $title;
}
140 | |
# Current date in UTC formatted as DD-MM-YYYY.
sub db_date {
  my ($self) = @_;
  my @utc_now = gmtime();
  return strftime('%d-%m-%Y', @utc_now);
}
145 | |
# Works out the file name of the index from the source file name.
#
# For type 'genomic' the fourth dot-separated component from the end (the
# release in the example below) is replaced by repeat_mask_date(); for any
# other type the source file name is returned unchanged.
#
#   Source like   Homo_sapiens.GRCh37.68.dna.toplevel.fa
#   Filename like Homo_sapiens.GRCh37.20090401.dna.toplevel.fa
#
# Arguments : $source_file - path of the file being indexed
# Returns   : the file name (no directory) to use for the index
# Throws    : if a genomic file name has fewer than four components
sub target_filename {
  my ($self, $source_file) = @_;
  my ($vol, $dir, $file) = File::Spec->splitpath($source_file);
  return $file if $self->param('type') ne 'genomic';

  my @split = split(/\./, $file);
  # Assigning to $split[-4] on a shorter list dies with an unhelpful
  # "non-creatable array value" error, so fail with a readable message.
  if(scalar(@split) < 4) {
    throw("Cannot derive a genomic target filename from '$file'; expected at least 4 dot-separated components");
  }
  $split[-4] = $self->repeat_mask_date();
  return join(q{.}, @split);
}
159 | |
160 1; |