Mercurial > repos > morinlab > oncocircos
comparison bin/parse @ 6:d248caf924d3 draft
Uploaded
| author | morinlab |
|---|---|
| date | Wed, 30 Nov 2016 13:36:59 -0500 |
| parents | b77ab858eac1 |
| children |
comparison
equal
deleted
inserted
replaced
| 5:e6fa60c7e2f0 | 6:d248caf924d3 |
|---|---|
| 63 parse_config(); | 63 parse_config(); |
| 64 | 64 |
| 65 sub validateconfiguration { | 65 sub validateconfiguration { |
| 66 } | 66 } |
| 67 | 67 |
| 68 #map2interval(5,$CONF{cnv}{log2}); | |
| 69 #exit; | |
| 70 | |
| 68 ################################################################ | 71 ################################################################ |
| 69 # get files | 72 # get files |
| 70 my $sv = read_file($CONF{files}{sv} ,"sv" ); | 73 my $sv = read_file($CONF{files}{sv} ,"sv" ); |
| 71 my $genes = read_file($CONF{files}{mart},"genes"); | 74 my $genes = read_file($CONF{files}{mart},"genes"); |
| 72 my $cnv = read_file($CONF{files}{cnv} ,"cnv" ); | 75 my $cnv = read_file($CONF{files}{cnv} ,"cnv" ); |
| 76 my $cnvlg = read_file($CONF{files}{cnvlg},"cnvlg"); | |
| 77 my %affected_genes; | |
| 78 open(GENES, $CONF{files}{genes}) or die "#! $CONF{files}{genes}\n"; | |
| 79 while(<GENES>){ | |
| 80 chomp; | |
| 81 $affected_genes{$_}++; | |
| 82 } | |
| 83 | |
| 84 | |
| 73 | 85 |
| 74 ################################################################ | 86 ################################################################ |
| 75 # traverse all genes from biomart and determine number | 87 # traverse all genes from biomart and determine number |
| 76 # of SV and CNV events across samples | 88 # of SV and CNV events across samples |
| 77 for my $chr (keys %$genes) { | 89 for my $chr (keys %$genes) { |
| 78 next if $CONF{filter}{chr} && $chr ne $CONF{filter}{chr}; | 90 next if $CONF{filter}{chr} && $chr ne $CONF{filter}{chr}; |
| 79 printdebug("processing",$chr); | 91 printdebug("processing",$chr); |
| 80 for my $gene (@{$genes->{$chr}}) { | 92 for my $gene (@{$genes->{$chr}}) { |
| 81 my $id = $gene->{id}; | 93 my $id = $gene->{id}; |
| 94 my $name = $gene->{name}; | |
| 95 #print "GENE: $name, ID: $id\n"; | |
| 82 # filter out by presence and number of SV events | 96 # filter out by presence and number of SV events |
| 83 next if $CONF{filter}{sv} && ! $sv->{$id}; | 97 |
| 84 # number of samples that have SV event | 98 # number of samples that have SV event |
| 85 my @samples_sv = keys %{$sv->{$id}}; | 99 my @samples_sv = keys %{$sv->{$id}}; |
| 86 next if $CONF{filter}{sv_num} && @samples_sv < $CONF{filter}{sv_num}; | 100 if(defined $affected_genes{$name}){ |
| 87 | 101 #override normal criteria for this gene |
| 102 | |
| 103 } | |
| 104 else{ | |
| 105 next if $CONF{filter}{sv} && ! $sv->{$id}; | |
| 106 next if $CONF{filter}{sv_num} && @samples_sv < $CONF{filter}{sv_num}; | |
| 107 } | |
| 88 $gene->{affected} = 1; | 108 $gene->{affected} = 1; |
| 89 | 109 |
| 90 # register SV events | 110 # register SV events |
| 91 my $pos; | 111 my $pos; |
| 92 for my $sample (@samples_sv) { | 112 for my $sample (@samples_sv) { |
| 118 next unless $cnv->{$sample}{$chr}; | 138 next unless $cnv->{$sample}{$chr}; |
| 119 for my $cnv (@{$cnv->{$sample}{$chr}}) { | 139 for my $cnv (@{$cnv->{$sample}{$chr}}) { |
| 120 my $int = $cnv->{set}->intersect($gene->{set})->cardinality; | 140 my $int = $cnv->{set}->intersect($gene->{set})->cardinality; |
| 121 next unless $int; | 141 next unless $int; |
| 122 push @{$gene->{cnv}{$cnv->{category}}{$sample}}, $cnv->{avg}; | 142 push @{$gene->{cnv}{$cnv->{category}}{$sample}}, $cnv->{avg}; |
| 143 } | |
| 144 } | |
| 145 for my $sample (keys %$cnvlg) { | |
| 146 my $chr = $gene->{chr}; | |
| 147 next unless $cnvlg->{$chr}; | |
| 148 for my $cnv (@{$cnvlg->{$chr}}) { | |
| 149 my $int = $cnv->{set}->intersect($gene->{set})->cardinality; | |
| 150 next unless $int; | |
| 151 $gene->{cnvlg}{$cnv->{idx}} = $cnv->{type}; | |
| 123 } | 152 } |
| 124 } | 153 } |
| 125 } | 154 } |
| 126 } | 155 } |
| 127 | 156 |
| 173 scalar(min(@{$values_by_type->{$top_type}})), | 202 scalar(min(@{$values_by_type->{$top_type}})), |
| 174 average(@{$values_by_type->{$top_type}}), | 203 average(@{$values_by_type->{$top_type}}), |
| 175 median(@{$values_by_type->{$top_type}})->query, | 204 median(@{$values_by_type->{$top_type}})->query, |
| 176 scalar(max(@{$values_by_type->{$top_type}}))); | 205 scalar(max(@{$values_by_type->{$top_type}}))); |
| 177 } | 206 } |
| 207 if($gene->{cnvlg}) { | |
| 208 for my $cnv_idx (sort {$a <=> $b} keys %{$gene->{cnvlg}}) { | |
| 209 push @report, sprintf("cnvlg:%d:%s", | |
| 210 $cnv_idx, | |
| 211 $gene->{cnvlg}{$cnv_idx}); | |
| 212 } | |
| 213 } | |
| 178 printinfo(@report); | 214 printinfo(@report); |
| 179 } | 215 } |
| 180 } | 216 } |
| 181 | 217 |
| 182 exit; | 218 exit; |
| 219 | |
| 220 ################################################################ | |
| 221 | |
| 222 sub map2value { | |
| 223 my ($field_name,$x) = @_; | |
| 224 if($CONF{map}{$field_name}) { | |
| 225 my ($var,$fn) = split(":",$CONF{map}{$field_name}); | |
| 226 $fn =~ s/x/$x/g; | |
| 227 my $value = eval $fn; | |
| 228 die "Could not evaluate remapping [$fn]" if $@; | |
| 229 return ($var,$value); | |
| 230 } else { | |
| 231 return ($field_name,$x); | |
| 232 } | |
| 233 } | |
| 234 | |
| 235 sub str2leaf { | |
| 236 my $str = shift; | |
| 237 my $delim = "}{"; | |
| 238 $str =~ s/\//$delim/g; | |
| 239 $str = sprintf("\$CONF{$str}"); | |
| 240 my $leaf = eval $str; | |
| 241 if($@) { | |
| 242 die "Could not parse [$str] as a data structure."; | |
| 243 } | |
| 244 return $leaf; | |
| 245 } | |
| 183 | 246 |
| 184 sub read_file { | 247 sub read_file { |
| 185 my ($file,$type) = @_; | 248 my ($file,$type) = @_; |
| 249 | |
| 186 open(F,$file) || die "Could not open file [$file] for reading"; | 250 open(F,$file) || die "Could not open file [$file] for reading"; |
| 187 my $data; | 251 my $data; |
| 188 | 252 |
| 189 my @fields = grep(/\d/,keys %{$CONF{fields}{$type}}); | 253 my @fields = grep(/\d/,keys %{$CONF{fields}{$type}}); |
| 190 my @keys = split(",",$CONF{fields}{$type}{key}); | 254 my @keys = split(",",$CONF{fields}{$type}{key}); |
| 191 | 255 |
| 192 my $i; | 256 my $i = 0; |
| 193 while(<F>) { | 257 while(<F>) { |
| 194 chomp; | 258 chomp; |
| 195 next if /^\#/; | 259 next if /^\#/; |
| 196 my @tok = split "\t"; | 260 my @tok = split "\t"; |
| 197 my $entry = {class=>$type}; | 261 my $entry = {class=>$type,idx=>$i}; |
| 198 for my $col (@fields) { | 262 for my $col (@fields) { |
| 199 my ($field_name,$field_transform) = split(":",$CONF{fields}{$type}{$col}); | 263 my ($field_name,$field_transform) = split(":",$CONF{fields}{$type}{$col}); |
| 200 my $value = $tok[$col]; | 264 my ($new_field_name,$value) = map2value($field_name,$tok[$col]); |
| 265 if($new_field_name ne $field_name) { | |
| 266 $entry->{$field_name} = $tok[$col]; | |
| 267 $field_name = $new_field_name; | |
| 268 } | |
| 201 if($field_transform) { | 269 if($field_transform) { |
| 202 $value = lc $value if $field_transform =~ /lc/; | 270 if($field_transform eq "lc") { |
| 271 $value = lc $value; | |
| 272 } elsif ($field_transform =~ /map\((.+),(.+)\)/) { | |
| 273 $entry->{$field_name} = $value; | |
| 274 $field_name = $1; | |
| 275 $value = map2interval($value,str2leaf($2)); | |
| 276 } | |
| 203 } | 277 } |
| 204 $entry->{ $field_name } = $value; | 278 $entry->{ $field_name } = $value; |
| 205 } | 279 } |
| 206 # skip mutation types that are not important | 280 # skip mutation types that are not important |
| 207 next if $CONF{sv}{filter} && $type eq "sv" && exists $entry->{type} && ! $CONF{sv}{types}{$entry->{type}}; | 281 next if $CONF{sv}{filter} && $type eq "sv" && exists $entry->{type} && ! $CONF{sv}{types}{$entry->{type}}; |
| 208 next if $CONF{cnv}{filter} && $type eq "cnv" && exists $entry->{category} && ! $CONF{cnv}{types}{$entry->{category}}; | 282 next if $CONF{cnv}{filter} && $type eq "cnv" && exists $entry->{category} && ! $CONF{cnv}{types}{$entry->{category}}; |
| 209 if($type eq "sv") { | 283 if($type eq "sv") { |
| 210 $entry->{weight} = $CONF{sv}{types}{$entry->{type}}; | 284 $entry->{weight} = $CONF{sv}{types}{$entry->{type}}; |
| 211 } | 285 } |
| 212 $entry->{chr} = "X" if $entry->{chr} eq 23; | 286 $entry->{chr} = "X" if $entry->{chr} eq 23; |
| 222 push @{$data->{$entry->{$keys[0]}}{$entry->{$keys[1]}}}, $entry; | 296 push @{$data->{$entry->{$keys[0]}}{$entry->{$keys[1]}}}, $entry; |
| 223 } | 297 } |
| 224 } | 298 } |
| 225 printdebug("got",$i,$type); | 299 printdebug("got",$i,$type); |
| 226 return $data; | 300 return $data; |
| 301 } | |
| 302 | |
| 303 sub map2interval { | |
| 304 my ($x,$conf) = @_; | |
| 305 for my $int (keys %$conf) { | |
| 306 my $cat = $conf->{$int}; | |
| 307 return $cat if in_interval($x,$int); | |
| 308 } | |
| 309 printdumper($conf); | |
| 310 die "Could not map $x into any of the intervals seen"; | |
| 311 } | |
| 312 | |
| 313 sub in_interval { | |
| 314 my ($x,$int) = @_; | |
| 315 my $open_L = "\\("; | |
| 316 my $closed_L = "\\["; | |
| 317 my $open_R = "\\)"; | |
| 318 my $closed_R = "\\]"; | |
| 319 my $num = "[0-9]*"; #[0-9]+(?:.[0-9]+)?"; | |
| 320 my $rx = "($open_L|$closed_L)($num),($num)($open_R|$closed_R)"; | |
| 321 my $bool = 1; | |
| 322 if($int =~ /$rx/) { | |
| 323 if($2 ne "") { | |
| 324 $bool &&= $1 eq "(" ? $x > $2 : $x >= $2; | |
| 325 } | |
| 326 if($3 ne "") { | |
| 327 $bool &&= $4 eq ")" ? $x < $3 : $x <= $3; | |
| 328 } | |
| 329 } else { | |
| 330 $bool = $x == $int; | |
| 331 } | |
| 332 return $bool; | |
| 227 } | 333 } |
| 228 | 334 |
| 229 sub list2hash { | 335 sub list2hash { |
| 230 my %h; | 336 my %h; |
| 231 map {$h{$_}=1} @_; | 337 map {$h{$_}=1} @_; |
