annotate variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/WrapUpAlignment.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
1 =pod
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
2
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
3 =head1 NAME
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
4
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
5 Bio::EnsEMBL::Funcgen::RunnableDB::WrapUpAlignment;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
6
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
7 =head1 DESCRIPTION
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
8
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
9 'WrapUpAlignment' Merges all results from alignment jobs into a single file
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
10
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
11 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
12
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
13 package Bio::EnsEMBL::Funcgen::RunnableDB::WrapUpAlignment;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
14
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
15 use warnings;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
16 use strict;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
17 use Bio::EnsEMBL::Funcgen::Utils::Helper;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
18 use Bio::EnsEMBL::DBSQL::DBAdaptor;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
19 use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
20 use Bio::EnsEMBL::Funcgen::InputSet;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
21 use Bio::EnsEMBL::Funcgen::DataSet;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
22 use Bio::EnsEMBL::Funcgen::FeatureSet;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
23 use Bio::EnsEMBL::Funcgen::AnnotatedFeature;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
24
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
25 use base ('Bio::EnsEMBL::Funcgen::RunnableDB::Alignment');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
26
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
27 use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
28 use Data::Dumper;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
29 use File::Copy;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
30
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
31 sub fetch_input {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
32 my $self = shift @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
33
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
34 $self->SUPER::fetch_input();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
35
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
36 #my $replicate = $self->param('replicate') || throw "No replicate given";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
37 #$self->_replicate($replicate);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
38
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
39 #my $nbr_subfiles = $self->param('nbr_subfiles') || throw "Number of subfiles not given";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
40 #if($nbr_subfiles<1){ throw "We need at least one subfile. Empty or invalid set given: $nbr_subfiles"; }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
41 #$self->_nbr_subfiles($nbr_subfiles);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
42
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
43 my $species=$self->_species();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
44 my $gender=$self->_cell_type->gender();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
45 $gender = $gender ? $gender : "male";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
46 my $assembly=$self->_assembly();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
47 my $sam_header = $self->_work_dir()."/sam_header/".$species."/".$species."_".$gender."_".$assembly."_unmasked.header.sam";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
48 $self->_sam_header($sam_header);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
49
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
50
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
51 my $repository = $self->_repository();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
52 if(! -d $repository){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
53 system("mkdir -p $repository") && throw("Couldn't create directory $repository");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
54 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
55
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
56 return 1;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
57 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
58
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
59 sub run {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
60 my $self = shift @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
61
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
62 # my $replicate = $self->_replicate();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
63 my $sam_header = $self->_sam_header();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
64
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
65 my $set_name = $self->_set_name();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
66 my $file_prefix = $self->_output_dir()."/".$set_name;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
67 my $input_file_pattern = "${file_prefix}_[0-9]*_[0-9]*_[0-9]*.sorted.bam";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
68 my $merge_cmd="samtools merge -f -h $sam_header ${file_prefix}.sorted.bam $input_file_pattern";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
69
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
70 #Maybe remove duplicates as default behavior ?.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
71 #my $merge_cmd="samtools merge -h $sam_header - ${file_prefix}.[0-9]*.sorted.bam | ";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
72 #$merge_cmd.=" samtools rmdup -s - ${file_prefix}.sorted.bam";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
73
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
74 if(system($merge_cmd) != 0){ throw "Error merging file: $merge_cmd"; }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
75
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
76 #keep end result as bam... it's more compact: only when passing to the peak caller we pass to sam or other format...
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
77 #merge_cmd.=" samtools view -h - | gzip -c > ${file_prefix}${align_type}.sam.gz"
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
78
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
79 my $alignment_log = $file_prefix.".alignment.log";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
80
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
81 my $log_cmd="echo \"Alignment QC - total reads as input: \" >> ${alignment_log}";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
82 $log_cmd.=";samtools flagstat ${file_prefix}.sorted.bam | head -n 1 >> ${alignment_log}";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
83 $log_cmd.=";echo \"Alignment QC - mapped reads: \" >> ${alignment_log} ";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
84 $log_cmd.=";samtools view -u -F 4 ${file_prefix}.sorted.bam | samtools flagstat - | head -n 1 >> ${alignment_log}";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
85 $log_cmd.="; echo \"Alignment QC - reliably aligned reads (mapping quality >= 1): \" >> ${alignment_log}";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
86 $log_cmd.=";samtools view -u -F 4 -q 1 ${file_prefix}.sorted.bam | samtools flagstat - | head -n 1 >> ${alignment_log}";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
87 #Maybe do some percentages?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
88
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
89 if(system($log_cmd) != 0){ warn "Error making the alignment statistics"; }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
90 print STDERR "logged\n";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
91 my $repository = $self->_repository();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
92
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
93 move("${file_prefix}.sorted.bam","${repository}/${set_name}.samse.bam");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
94
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
95 my $convert_cmd = "samtools view -h ${repository}/${set_name}.samse.bam | gzip -c - > ${repository}/${set_name}.samse.sam.gz";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
96
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
97
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
98 #print STDERR "using convert cmd: ".$convert_cmd;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
99
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
100 if(system($convert_cmd) != 0){ warn "Error converting to zipped sam"; }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
101 #print STDERR "converted\n";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
102 my $rm_cmd="rm -f $input_file_pattern";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
103 if(system($rm_cmd) != 0){ warn "Error removing temp files. Remove them manually"; }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
104 #$rm_cmd="rm -f ${file_prefix}.sorted.bam";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
105 #if(system($rm_cmd) != 0){ warn "Error removing final bam files. Remove it manually"; }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
106
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
107 return 1;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
108 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
109
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
110
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
111 sub write_output {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
112 my $self = shift @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
113
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
114 return 1;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
115
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
116 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
117
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
118 #Private getter / setter to the sam header
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
119 sub _sam_header {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
120 return $_[0]->_getter_setter('sam_header',$_[1]);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
121 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
122
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
123 #Private getter / setter to the replicate number
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
124 sub _replicate {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
125 return $_[0]->_getter_setter('replicate',$_[1]);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
126 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
127
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
128 #Private getter / setter to the replicate number
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
129 sub _nbr_subfiles {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
130 return $_[0]->_getter_setter('nbr_subfiles',$_[1]);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
131 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
132
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
133 1;