comparison bin/parse @ 6:d248caf924d3 draft

Uploaded
author morinlab
date Wed, 30 Nov 2016 13:36:59 -0500
parents b77ab858eac1
children
comparison
equal deleted inserted replaced
5:e6fa60c7e2f0 6:d248caf924d3
63 parse_config(); 63 parse_config();
64 64
65 sub validateconfiguration { 65 sub validateconfiguration {
66 } 66 }
67 67
68 #map2interval(5,$CONF{cnv}{log2});
69 #exit;
70
68 ################################################################ 71 ################################################################
69 # get files 72 # get files
70 my $sv = read_file($CONF{files}{sv} ,"sv" ); 73 my $sv = read_file($CONF{files}{sv} ,"sv" );
71 my $genes = read_file($CONF{files}{mart},"genes"); 74 my $genes = read_file($CONF{files}{mart},"genes");
72 my $cnv = read_file($CONF{files}{cnv} ,"cnv" ); 75 my $cnv = read_file($CONF{files}{cnv} ,"cnv" );
76 my $cnvlg = read_file($CONF{files}{cnvlg},"cnvlg");
77 my %affected_genes;
78 open(GENES, $CONF{files}{genes}) or die "#! $CONF{files}{genes}\n";
79 while(<GENES>){
80 chomp;
81 $affected_genes{$_}++;
82 }
83
84
73 85
74 ################################################################ 86 ################################################################
75 # traverse all genes from biomart and determine number 87 # traverse all genes from biomart and determine number
76 # of SV and CNV events across samples 88 # of SV and CNV events across samples
77 for my $chr (keys %$genes) { 89 for my $chr (keys %$genes) {
78 next if $CONF{filter}{chr} && $chr ne $CONF{filter}{chr}; 90 next if $CONF{filter}{chr} && $chr ne $CONF{filter}{chr};
79 printdebug("processing",$chr); 91 printdebug("processing",$chr);
80 for my $gene (@{$genes->{$chr}}) { 92 for my $gene (@{$genes->{$chr}}) {
81 my $id = $gene->{id}; 93 my $id = $gene->{id};
94 my $name = $gene->{name};
95 #print "GENE: $name, ID: $id\n";
82 # filter out by presence and number of SV events 96 # filter out by presence and number of SV events
83 next if $CONF{filter}{sv} && ! $sv->{$id}; 97
84 # number of samples that have SV event 98 # number of samples that have SV event
85 my @samples_sv = keys %{$sv->{$id}}; 99 my @samples_sv = keys %{$sv->{$id}};
86 next if $CONF{filter}{sv_num} && @samples_sv < $CONF{filter}{sv_num}; 100 if(defined $affected_genes{$name}){
87 101 #override normal criteria for this gene
102
103 }
104 else{
105 next if $CONF{filter}{sv} && ! $sv->{$id};
106 next if $CONF{filter}{sv_num} && @samples_sv < $CONF{filter}{sv_num};
107 }
88 $gene->{affected} = 1; 108 $gene->{affected} = 1;
89 109
90 # register SV events 110 # register SV events
91 my $pos; 111 my $pos;
92 for my $sample (@samples_sv) { 112 for my $sample (@samples_sv) {
118 next unless $cnv->{$sample}{$chr}; 138 next unless $cnv->{$sample}{$chr};
119 for my $cnv (@{$cnv->{$sample}{$chr}}) { 139 for my $cnv (@{$cnv->{$sample}{$chr}}) {
120 my $int = $cnv->{set}->intersect($gene->{set})->cardinality; 140 my $int = $cnv->{set}->intersect($gene->{set})->cardinality;
121 next unless $int; 141 next unless $int;
122 push @{$gene->{cnv}{$cnv->{category}}{$sample}}, $cnv->{avg}; 142 push @{$gene->{cnv}{$cnv->{category}}{$sample}}, $cnv->{avg};
143 }
144 }
145 for my $sample (keys %$cnvlg) {
146 my $chr = $gene->{chr};
147 next unless $cnvlg->{$chr};
148 for my $cnv (@{$cnvlg->{$chr}}) {
149 my $int = $cnv->{set}->intersect($gene->{set})->cardinality;
150 next unless $int;
151 $gene->{cnvlg}{$cnv->{idx}} = $cnv->{type};
123 } 152 }
124 } 153 }
125 } 154 }
126 } 155 }
127 156
173 scalar(min(@{$values_by_type->{$top_type}})), 202 scalar(min(@{$values_by_type->{$top_type}})),
174 average(@{$values_by_type->{$top_type}}), 203 average(@{$values_by_type->{$top_type}}),
175 median(@{$values_by_type->{$top_type}})->query, 204 median(@{$values_by_type->{$top_type}})->query,
176 scalar(max(@{$values_by_type->{$top_type}}))); 205 scalar(max(@{$values_by_type->{$top_type}})));
177 } 206 }
207 if($gene->{cnvlg}) {
208 for my $cnv_idx (sort {$a <=> $b} keys %{$gene->{cnvlg}}) {
209 push @report, sprintf("cnvlg:%d:%s",
210 $cnv_idx,
211 $gene->{cnvlg}{$cnv_idx});
212 }
213 }
178 printinfo(@report); 214 printinfo(@report);
179 } 215 }
180 } 216 }
181 217
182 exit; 218 exit;
219
220 ################################################################
221
222 sub map2value {
223 my ($field_name,$x) = @_;
224 if($CONF{map}{$field_name}) {
225 my ($var,$fn) = split(":",$CONF{map}{$field_name});
226 $fn =~ s/x/$x/g;
227 my $value = eval $fn;
228 die "Could not evaluate remapping [$fn]" if $@;
229 return ($var,$value);
230 } else {
231 return ($field_name,$x);
232 }
233 }
234
235 sub str2leaf {
236 my $str = shift;
237 my $delim = "}{";
238 $str =~ s/\//$delim/g;
239 $str = sprintf("\$CONF{$str}");
240 my $leaf = eval $str;
241 if($@) {
242 die "Could not parse [$str] as a data structure.";
243 }
244 return $leaf;
245 }
183 246
184 sub read_file { 247 sub read_file {
185 my ($file,$type) = @_; 248 my ($file,$type) = @_;
249
186 open(F,$file) || die "Could not open file [$file] for reading"; 250 open(F,$file) || die "Could not open file [$file] for reading";
187 my $data; 251 my $data;
188 252
189 my @fields = grep(/\d/,keys %{$CONF{fields}{$type}}); 253 my @fields = grep(/\d/,keys %{$CONF{fields}{$type}});
190 my @keys = split(",",$CONF{fields}{$type}{key}); 254 my @keys = split(",",$CONF{fields}{$type}{key});
191 255
192 my $i; 256 my $i = 0;
193 while(<F>) { 257 while(<F>) {
194 chomp; 258 chomp;
195 next if /^\#/; 259 next if /^\#/;
196 my @tok = split "\t"; 260 my @tok = split "\t";
197 my $entry = {class=>$type}; 261 my $entry = {class=>$type,idx=>$i};
198 for my $col (@fields) { 262 for my $col (@fields) {
199 my ($field_name,$field_transform) = split(":",$CONF{fields}{$type}{$col}); 263 my ($field_name,$field_transform) = split(":",$CONF{fields}{$type}{$col});
200 my $value = $tok[$col]; 264 my ($new_field_name,$value) = map2value($field_name,$tok[$col]);
265 if($new_field_name ne $field_name) {
266 $entry->{$field_name} = $tok[$col];
267 $field_name = $new_field_name;
268 }
201 if($field_transform) { 269 if($field_transform) {
202 $value = lc $value if $field_transform =~ /lc/; 270 if($field_transform eq "lc") {
271 $value = lc $value;
272 } elsif ($field_transform =~ /map\((.+),(.+)\)/) {
273 $entry->{$field_name} = $value;
274 $field_name = $1;
275 $value = map2interval($value,str2leaf($2));
276 }
203 } 277 }
204 $entry->{ $field_name } = $value; 278 $entry->{ $field_name } = $value;
205 } 279 }
206 # skip mutation types that are not important 280 # skip mutation types that are not important
207 next if $CONF{sv}{filter} && $type eq "sv" && exists $entry->{type} && ! $CONF{sv}{types}{$entry->{type}}; 281 next if $CONF{sv}{filter} && $type eq "sv" && exists $entry->{type} && ! $CONF{sv}{types}{$entry->{type}};
208 next if $CONF{cnv}{filter} && $type eq "cnv" && exists $entry->{category} && ! $CONF{cnv}{types}{$entry->{category}}; 282 next if $CONF{cnv}{filter} && $type eq "cnv" && exists $entry->{category} && ! $CONF{cnv}{types}{$entry->{category}};
209 if($type eq "sv") { 283 if($type eq "sv") {
210 $entry->{weight} = $CONF{sv}{types}{$entry->{type}}; 284 $entry->{weight} = $CONF{sv}{types}{$entry->{type}};
211 } 285 }
212 $entry->{chr} = "X" if $entry->{chr} eq 23; 286 $entry->{chr} = "X" if $entry->{chr} eq 23;
222 push @{$data->{$entry->{$keys[0]}}{$entry->{$keys[1]}}}, $entry; 296 push @{$data->{$entry->{$keys[0]}}{$entry->{$keys[1]}}}, $entry;
223 } 297 }
224 } 298 }
225 printdebug("got",$i,$type); 299 printdebug("got",$i,$type);
226 return $data; 300 return $data;
301 }
302
303 sub map2interval {
304 my ($x,$conf) = @_;
305 for my $int (keys %$conf) {
306 my $cat = $conf->{$int};
307 return $cat if in_interval($x,$int);
308 }
309 printdumper($conf);
310 die "Could not map $x into any of the intervals seen";
311 }
312
313 sub in_interval {
314 my ($x,$int) = @_;
315 my $open_L = "\\(";
316 my $closed_L = "\\[";
317 my $open_R = "\\)";
318 my $closed_R = "\\]";
319 my $num = "[0-9]*"; #[0-9]+(?:.[0-9]+)?";
320 my $rx = "($open_L|$closed_L)($num),($num)($open_R|$closed_R)";
321 my $bool = 1;
322 if($int =~ /$rx/) {
323 if($2 ne "") {
324 $bool &&= $1 eq "(" ? $x > $2 : $x >= $2;
325 }
326 if($3 ne "") {
327 $bool &&= $4 eq ")" ? $x < $3 : $x <= $3;
328 }
329 } else {
330 $bool = $x == $int;
331 }
332 return $bool;
227 } 333 }
228 334
229 sub list2hash { 335 sub list2hash {
230 my %h; 336 my %h;
231 map {$h{$_}=1} @_; 337 map {$h{$_}=1} @_;