Mercurial > repos > devteam > delete_overlapping_indels
diff delete_overlapping_indels.pl @ 0:fa09b51ed576 draft default tip
Uploaded tool tarball.
author | devteam |
---|---|
date | Tue, 20 Aug 2013 09:37:51 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/delete_overlapping_indels.pl Tue Aug 20 09:37:51 2013 -0400 @@ -0,0 +1,94 @@ +#!/usr/bin/perl -w + +# This program detects overlapping indels in a chromosome and keeps all non-overlapping indels. As for overlapping indels, +# the first encountered one is kept and all others are removed. It requires three inputs: +# The first input is a TABULAR format file containing coordinates of indels in blocks extracted from multi-alignment. +# The second input is an integer number representing the number of the column where indel start coordinates are stored in the input file. +# The third input is an integer number representing the number of the column where indel end coordinates are stored in the input file. +# The output is a TABULAR format file containing all non-overlapping indels in the input file, and the first encountered indel of overlapping ones. +# Note: The number of the first column is 1. + +use strict; +use warnings; + +#varaibles to handle information related to indels +my $indel1 = ""; +my $indel2 = ""; +my @indelArray1 = (); +my @indelArray2 = (); +my $lineCounter1 = 0; +my $lineCounter2 = 0; +my $totalNumberofNonOverlappingIndels = 0; + +# check to make sure having correct files +my $usage = "usage: delete_overlapping_indels.pl [TABULAR.in] [indelStartColumn] [indelEndColumn] [TABULAR.out]\n"; +die $usage unless @ARGV == 4; + +my $inputFile = $ARGV[0]; +my $indelStartColumn = $ARGV[1] - 1; +my $indelEndColumn = $ARGV[2] - 1; +my $outputFile = $ARGV[3]; + +#verifie column numbers +if ($indelStartColumn < 0 ){ + die ("The indel start column number is invalid \n"); +} +if ($indelEndColumn < 0 ){ + die ("The indel end column number is invalid \n"); +} + +#open the input and output files +open (INPUT, "<", $inputFile) || die ("Could not open file $inputFile \n"); +open (OUTPUT, ">", $outputFile) || die ("Could not open file $outputFile \n"); + +#store the input file in the array @rawData +my @indelsRawData = <INPUT>; + +#iterated through the indels of the input file +INDEL1: +foreach $indel1 (@indelsRawData){ + chomp ($indel1); + $lineCounter1++; + + #get the first indel + @indelArray1 = split(/\t/, $indel1); + + #our purpose is to detect overlapping indels and to store one copy of them only in the output file + #all other non-overlapping indels will stored in the output file also + + $lineCounter2 = 0; + + #iterated through the indels of the input file + INDEL2: + foreach $indel2 (@indelsRawData){ + chomp ($indel2); + $lineCounter2++; + + if ($lineCounter2 > $lineCounter1){ + #get the second indel + @indelArray2 = split(/\t/, $indel2); + + #check if the two indels are overlapping + if (($indelArray2[$indelEndColumn] >= $indelArray1[$indelStartColumn] && $indelArray2[$indelEndColumn] <= $indelArray1[$indelEndColumn]) || ($indelArray2[$indelStartColumn] >= $indelArray1[$indelStartColumn] && $indelArray2[$indelStartColumn] <= $indelArray1[$indelEndColumn])){ + #print ("There is an overlap between" . "\n" . $indel1 . "\n" . $indel2 . "\n"); + #print("The two overlapping indels are located at the lines: " . $lineCounter1 . " " . $lineCounter2 . "\n\n"); + + #break out of the loop and go back to the outerloop + next INDEL1; + } + else{ + #print("The two non-overlaapping indels are located at the lines: " . $lineCounter1 . " " . $lineCounter2 . "\n"); + } + } + } + + print OUTPUT $indel1 . "\n"; + $totalNumberofNonOverlappingIndels++; +} + +#print("The total number of indels is: " . $lineCounter1 . "\n"); +#print("The total number of non-overlapping indels is: " . $totalNumberofNonOverlappingIndels . "\n"); + +#close the input and output files +close(OUTPUT); +close(INPUT); \ No newline at end of file