50
|
1 #!/usr/local/perl/bin/perl
|
|
2 use feature ':5.10';
|
|
3 use strict 'vars';
|
|
4 use warnings;
|
|
5 use Getopt::Long;
|
|
6 use Pod::Usage;
|
|
7 use List::Util qw/ min max /;
|
|
8 use POSIX qw/ceil floor/;
|
|
9 use File::Temp qw(tempdir);
|
|
10 use File::Basename;
|
|
11
|
|
12 =head1 NAME
|
|
13
|
|
14 filterSquashedReads.pl --frac_max FLOAT
|
|
15
|
|
16 =head1 SYNOPSIS
|
|
17
|
|
18 this script reads sites after pcr duplicate removal.
|
|
19 for all crosslinking sites sharing start and stop coordinates, the maximum number
|
|
20 of squashed reads is determined.
|
|
21 alignments having less than frac_max * max reads are discarded
|
|
22
|
|
23 assumes bed entries to be sorted chr,strand,start,stop,score with score descending
|
|
24
|
|
25 Options:
|
|
26
|
|
27 --frac_max filter out alignments supported by less reads than this fraction of the maximum number of reads per position
|
|
28 -debug enable debug output
|
|
29 -help brief help message
|
|
30 -man full documentation
|
|
31
|
|
32 =head1 DESCRIPTION
|
|
33
|
|
34 =cut
|
|
35
|
|
36 ###############################################################################
|
|
37 # parse command line options
|
|
38 ###############################################################################
|
|
39 my $frac_max = 0.1;
|
|
40 my $help;
|
|
41 my $man;
|
|
42 my $debug;
|
|
43 my $result = GetOptions ( "frac_max=f" => \$frac_max,
|
|
44 "help" => \$help,
|
|
45 "man" => \$man,
|
|
46 "debug" => \$debug);
|
|
47 pod2usage(-exitstatus => 1, -verbose => 1) if $help;
|
|
48 pod2usage(-exitstatus => 0, -verbose => 2) if $man;
|
|
49 ($result) or pod2usage(2);
|
|
50
|
|
51 ###############################################################################
|
|
52 # main
|
|
53 ###############################################################################
|
|
54 my $current_chr = '';
|
|
55 my $current_start = -1;
|
|
56 my $current_stop = -1;
|
|
57 my $current_strand = '';
|
|
58 my $current_max = -1;
|
|
59 my $current_threshold = -1;
|
|
60
|
|
61 while (<>) {
|
|
62 my ($chr, $start, $stop, $id, $count, $strand) = split("\t");
|
|
63 # my ($count, undef, $start, $chr, $strand, $stop) = split("\t");
|
|
64
|
|
65 if ($current_start != $start or
|
|
66 $current_stop != $stop or
|
|
67 $current_chr ne $chr or
|
|
68 $current_strand ne $strand) {
|
|
69 # if this is the first occourence of these coordinates
|
|
70 # this must be the new maximum
|
|
71 # save new state
|
|
72 $current_start = $start;
|
|
73 $current_stop = $stop;
|
|
74 $current_chr = $chr;
|
|
75 $current_strand = $strand;
|
|
76 # print record and record maximum
|
|
77 $current_max = $count;
|
|
78 $current_threshold = $count*$frac_max;
|
|
79 print $_;
|
|
80 $debug and say "new threshold ${current_threshold} @ $chr $start $stop $strand $count";
|
|
81 } elsif ($count >= $current_threshold) {
|
|
82 # if it is not the first occourence, evaluate threshold and print if valid
|
|
83 print $_;
|
|
84 } else {
|
|
85 $debug and say "below threshold @ $chr $start $stop $strand $count";
|
|
86 }
|
|
87 }
|