annotate variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/SetupAlignmentPipeline.pm @ 3:d30fa12e4cc5 default tip

Merge heads 2:a5976b2dce6f and 1:09613ce8151e which were created as a result of a recently fixed bug.
author devteam <devteam@galaxyproject.org>
date Mon, 13 Jan 2014 10:38:30 -0500
parents 1f6dce3d34e0
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
1 =pod
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
2
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
3 =head1 NAME
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
4
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
5 Bio::EnsEMBL::Funcgen::RunnableDB::SetupAlignmentPipeline
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
6
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
7 =head1 DESCRIPTION
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
8
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
9 'SetupAlignmentPipeline' Does all the setup before the Alignment is run
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
10 Checks for existence of input files, etc...
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
11 This Runnable CAN be run multiple times in parallel!
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
12
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
13 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
14
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
15 package Bio::EnsEMBL::Funcgen::RunnableDB::SetupAlignmentPipeline;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
16
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
17 use warnings;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
18 use strict;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
19 use Bio::EnsEMBL::DBSQL::DBAdaptor;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
20 use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
21 use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
22 use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(is_gzipped);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
23 use Data::Dumper;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
24
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
25 use base ('Bio::EnsEMBL::Funcgen::RunnableDB::Alignment');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
26
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
27 #TODO... Maybe use and update the tracking database...
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Validates the job parameters and locates the raw input files for this set.
#
# Steps, in order:
#   1. Runs the parent class fetch_input().
#   2. Stores the fastq chunk size: the 'fastq_chunk_size' param, or 8000000
#      when that param is unset or false. The value must be a positive
#      integer because run() hands it to `split -l`.
#   3. Creates the output directory when it does not exist yet.
#   4. Scans the input directory for replicate sub-directories (entries whose
#      name is exactly one digit) and collects, from each of them, every
#      entry whose name contains '.fastq'.
#
# Results are stored through _input_files() (array ref of hashes with the
# keys 'path', 'replicate' and 'file_index') and _replicates() (array ref of
# replicate names).
#
# Returns 1. Throws when the chunk size is not a positive integer, when a
# directory is missing or unreadable, when no replicate is found, or when a
# replicate holds no fastq file.
sub fetch_input {   # fetch parameters...
  my $self = shift @_;

  $self->SUPER::fetch_input();

  # Magic default number...
  my $fastq_chunk_size = $self->param("fastq_chunk_size") || 8000000;
  if ($fastq_chunk_size !~ /^[1-9]\d*$/) {
    throw("fastq_chunk_size must be a positive integer, got '$fastq_chunk_size'");
  }
  $self->_fastq_chunk_size($fastq_chunk_size);

  # Sets up the output dir for this experiment_name
  my $output_dir = $self->_output_dir();
  if (! -d $output_dir) {
    # List-form system() does not go through the shell, so a path with
    # spaces or metacharacters is passed to mkdir untouched. `mkdir -p`
    # succeeds when the directory already exists, so parallel jobs racing
    # to create it do not fail here.
    system('mkdir', '-p', $output_dir) == 0
      or throw("Couldn't create output directory $output_dir");
  }

  my $input_dir = $self->_input_dir();
  if (! -d $input_dir) { throw(" Couldn't find input directory $input_dir"); }

  opendir(my $input_dh, $input_dir)
    or throw("Couldn't open input directory $input_dir: $!");
  my @dirs = grep { /^\d/ } readdir($input_dh);
  closedir($input_dh);

  if (scalar(@dirs) == 0) { throw("No replicates found in $input_dir"); }

  # readdir() returns entries in no particular order; sorting keeps the
  # replicate order and the file_index numbering identical between runs.
  @dirs = sort @dirs;

  my @input_files;
  my @replicates;
  foreach my $dir (@dirs) {
    # TODO: maybe use some other code for replicates? (e.g. Rep\d )
    # NOTE(review): only single-digit names are accepted, so a replicate
    # directory called e.g. "10" is skipped with a warning - confirm that
    # this limit is intended.
    my ($replicate) = $dir =~ /^(\d)$/;
    if (!defined $replicate) {
      warn "Invalid replicate $dir ignored";
      next;
    }

    my $replicate_dir = $input_dir."/".$replicate;
    opendir(my $replicate_dh, $replicate_dir)
      or throw("Couldn't open replicate directory $replicate_dir: $!");
    # The dot is escaped so that only names containing a literal '.fastq'
    # are picked up.
    my @files = grep { /\.fastq/ } readdir($replicate_dh);
    closedir($replicate_dh);

    if (scalar(@files) == 0) { throw("No files for replicate $replicate"); }

    @files = sort @files;

    # file_index numbers the files of one replicate starting at 0; run()
    # uses it as part of the chunk file names.
    my $file_count = 0;
    for my $file (@files) {
      push @input_files, {
        path       => $replicate_dir."/".$file,
        replicate  => $replicate,
        file_index => $file_count++,
      };
    }

    push @replicates, $replicate;
  }

  $self->_input_files(\@input_files);
  $self->_replicates(\@replicates);

  return 1;
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
84
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Splits every input file collected by fetch_input() into chunks.
#
# For each entry of _input_files() a shell pipeline is run that streams the
# file (through `gunzip -c` for names ending in '.fastq.gz', `bunzip2 -c` for
# names ending in '.fastq.bz2', `cat` otherwise) into
# `split -d -a 4 -l <chunk size>`. The chunks are written to the output
# directory with the prefix <set_name>_<replicate>_<file_index>_ followed by
# the 4-digit numeric suffix that split appends.
#
# Returns 1. Throws when a pipeline exits with a non-zero status.
sub run {
  my $self = shift @_;

  my $fastq_chunk_size = $self->_fastq_chunk_size();
  my $set_name         = $self->_set_name();
  my $output_dir       = $self->_output_dir();

  # Wraps a value in single quotes for the shell (embedded single quotes are
  # closed, escaped and reopened), so that paths containing spaces or shell
  # metacharacters reach the commands as one literal argument.
  my $shell_quote = sub {
    my ($arg) = @_;
    $arg =~ s/'/'\\''/g;
    return "'".$arg."'";
  };

  foreach my $file_info (@{$self->_input_files()}) {
    my $file       = $file_info->{'path'};
    my $replicate  = $file_info->{'replicate'};
    my $file_index = $file_info->{'file_index'};

    # Pick the command that writes the plain fastq text to stdout. The dots
    # are escaped so that only real '.fastq.gz' / '.fastq.bz2' suffixes
    # select a decompressor.
    my $cmd;
    if ($file =~ /\.fastq\.gz$/) {
      $cmd = "gunzip -c";
    }
    elsif ($file =~ /\.fastq\.bz2$/) {
      $cmd = "bunzip2 -c";
    }
    else {
      $cmd = "cat";
    }

    my $chunk_prefix = $output_dir.'/'.$set_name."_".$replicate.'_'.$file_index.'_';

    $cmd .= ' '.$shell_quote->($file)
          . ' | split -d -a 4 -l '.$shell_quote->($fastq_chunk_size)
          . ' - '.$shell_quote->($chunk_prefix);

    # NOTE(review): the status returned for a shell pipeline is normally that
    # of its last command (split here), so a failure of the decompressor
    # alone may go undetected - confirm whether that needs handling.
    if (system($cmd) != 0) { throw("Problems running $cmd"); }
  }

  return 1;
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
118
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
119
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Creates the downstream jobs.
#
# Branch 1 receives one output id per chunk file found in the output
# directory, i.e. per entry named <set_name>_<digits>_<digits>_<digits>. Each
# of those ids is a copy of this job's input id hash with an extra
# 'input_file' key holding the chunk file name.
# Branch 2 receives this job's input id unchanged (merge across replicates).
#
# An earlier version also built one output id per replicate, but that list
# was never passed to dataflow_output_id(), so it is no longer computed.
#
# Returns 1. Throws when the output directory cannot be read or when the
# input id does not evaluate to a hash reference.
sub write_output {   # Create the relevant job
  my $self = shift @_;

  my $set_name   = $self->_set_name;
  my $output_dir = $self->_output_dir();
  my $input_id   = $self->input_id;

  # NOTE(review): string eval of the input id executes whatever code that
  # string contains; this is only safe while input ids come from a trusted
  # source - confirm. The result is checked because a failed eval would
  # otherwise silently yield output ids without any of the job parameters.
  my $input_params = eval($input_id);
  if (ref($input_params) ne 'HASH') {
    throw("Could not evaluate input_id to a hash reference: ".($@ || 'unexpected value'));
  }

  opendir(my $output_dh, $output_dir)
    or throw("Couldn't open output directory $output_dir: $!");
  # \Q..\E makes the set name match literally even when it contains regular
  # expression metacharacters such as '.' or '+'.
  my @split_files = grep { /^\Q$set_name\E_\d+_\d+_\d+$/ } readdir($output_dh);
  closedir($output_dh);

  # readdir() order is unspecified; sort so that jobs are created in a
  # reproducible order.
  @split_files = sort @split_files;

  # files to align
  my @align_output_ids;
  foreach my $split_file (@split_files) {
    push @align_output_ids, { %{$input_params}, input_file => $split_file };
  }
  $self->dataflow_output_id(\@align_output_ids, 1);

  # merge data across replicates
  $self->dataflow_output_id($input_id, 2);   # input_id

  return 1;
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
152
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Private accessor for the fastq chunk size.
# Forwards the optional new value to _getter_setter() under the key
# 'fastq_chunk_size' and returns whatever that call returns.
sub _fastq_chunk_size {
  my ($self, $chunk_size) = @_;
  return $self->_getter_setter('fastq_chunk_size', $chunk_size);
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
157
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Private accessor for the output_ids list.
# Forwards the optional new value to _getter_setter() under the key
# 'output_ids' and returns whatever that call returns.
sub _output_ids {
  my ($self, $output_ids) = @_;
  return $self->_getter_setter('output_ids', $output_ids);
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
162
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Private accessor for the replicates list (fetch_input() stores an array
# ref of replicate names here).
# Forwards the optional new value to _getter_setter() under the key
# 'replicates' and returns whatever that call returns.
sub _replicates {
  my ($self, $replicates) = @_;
  return $self->_getter_setter('replicates', $replicates);
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
167
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Private accessor for the input files list (fetch_input() stores an array
# ref of hashes with the keys 'path', 'replicate' and 'file_index' here).
# Forwards the optional new value to _getter_setter() under the key
# 'input_files' and returns whatever that call returns.
sub _input_files {
  my ($self, $input_files) = @_;
  return $self->_getter_setter('input_files', $input_files);
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
171
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
172 1;