Mercurial > repos > devteam > multispecies_orthologous_microsats
changeset 0:d1bf24c1350f draft
Uploaded tool tarball.
author | devteam |
---|---|
date | Tue, 20 Aug 2013 11:02:09 -0400 |
parents | |
children | 4140235d590a |
files | multispecies_MicrosatDataGenerator_interrupted_GALAXY.pl multispecies_MicrosatDataGenerator_interrupted_GALAXY.xml test-data/regVariation/microsatellite/Galaxy17_masked_short.maf.gz test-data/regVariation/microsatellite/Galaxy17_short_raw.txt test-data/regVariation/microsatellite/Galaxy17_unmasked_short.maf.gz |
diffstat | 5 files changed, 5704 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/multispecies_MicrosatDataGenerator_interrupted_GALAXY.pl Tue Aug 20 11:02:09 2013 -0400 @@ -0,0 +1,5606 @@ +#!/usr/bin/perl +use strict; +use warnings; +use Term::ANSIColor; +use File::Basename; +use IO::Handle; +use Cwd; +use File::Path; +use vars qw($distance @thresholds @tags $species_set @allspecies $printer $treeSpeciesNum $focalspec $mergestarts $mergeends $mergemicros $interrtypecord $microscanned $interrcord $interr_poscord $no_of_interruptionscord $infocord $typecord $startcord $strandcord $endcord $microsatcord $motifcord $sequencepos $no_of_species $gapcord $prinkter); +use File::Path qw(make_path remove_tree); +use File::Temp qw/ tempfile tempdir /; +my $tdir = tempdir( CLEANUP => 1 ); +chdir $tdir; +my $dir = getcwd; +#print "dir = $dir\n"; + +#$ENV{'PATH'} .= ':' . dirname($0); +my $date = `date`; + +my ($mafile, $mafile_sputt, $orthfile, $threshold_array, $allspeciesin, $tree_definition_all, $separation) = @ARGV; +if (!$mafile or !$mafile_sputt or !$orthfile or !$threshold_array or !$separation or !$tree_definition_all or !$allspeciesin) { die "missing arguments\n"; } + +$tree_definition_all =~ s/\s+//g; +$threshold_array =~ s/\s+//g; +$allspeciesin =~ s/\s+//g; +#------------------------------------------------------------------------------- +# WHICH SPUTNIK USED? +my $sputnikpath = (); +$sputnikpath = "sputnik_lowthresh_MATCH_MIN_SCORE3" ; +#$sputnikpath = "/Users/ydk/work/rhesus_microsat/codes/./sputnik_Mac-PowerPC"; +#print "sputnik_Mac-PowerPC non-existant\n" if !-e $sputnikpath; +#exit if !-e $sputnikpath; +#$sputnikpath = "bx-sputnik" ; +#print "ARGV input = @ARGV\n"; +#print "ARGV input :\n mafile=$mafile\n orthfile=$orthfile\n threshold_array=$threshold_array\n species_set=$species_set\n tree_definition=$tree_definition\n separation=$separation\n"; +#------------------------------------------------------------------------------- +# RUNFILE +#------------------------------------------------------------------------------- +$distance = 1; #bp +$distance++; +my @tree_definitions=MakeTrees($tree_definition_all); +my $allspeciesset = $tree_definition_all; +$allspeciesset =~ s/[\(\) ]+//g; +@allspecies = split(/,/,$allspeciesset); + +my @outputfiles = (); +my $round = 0; +#my $tdir = tempdir( CLEANUP => 0 ); +#chdir $tdir; + +foreach my $tree_definition (@tree_definitions){ + my @commas = ($tree_definition =~ /,/g) ; + #print "commas = @commas\n"; <STDIN>; + next if scalar(@commas) <= 1; + #print "species_set = $species_set\n"; + $treeSpeciesNum = scalar(@commas) + 1; + $species_set = $tree_definition; + $species_set =~ s/[\)\( ;]+//g; + #print "species_set = $species_set\n"; <STDIN>; + + $round++; + #------------------------------------------------------------------------------- + # MICROSATELLITE THRESHOLD SETTINGS (LENGTH, BP) + $threshold_array=~ s/,/_/g; + my @thresharr = split("_",$threshold_array); + @thresholds=@thresharr; + #my $threshold_array = join("_",($mono_threshold, $di_threshold, $tri_threshold, $tetra_threshold)); + #print "current dit=$dir\n"; + #------------------------------------------------------------------------------- + # CREATE AXT FILES IN FORWARD AND REVERSE ORDERS IF NECESSARY + my @chrfiles=(); + + #my $mafile = "/Users/ydk/work/rhesus_microsat/results/galay/align.txt"; #$ARGV[0]; + my $chromt=int(rand(10000)); + my $p_chr=$chromt; + + + my @exactspeciesset_unarranged = split(/,/,$species_set); + $tree_definition=~s/[\)\(, ]/\t/g; + my @treespecies=split(/\t+/,$tree_definition); + my @exactspecies=(); + + foreach my $spec (@treespecies){ + foreach my $espec (@exactspeciesset_unarranged){ + push @exactspecies, $spec if $spec eq $espec; + } + } + #print "exactspecies=@exactspecies\n"; + $focalspec = $exactspecies[0]; + my $arranged_species_set=join(".",@exactspecies); + my $chr_name = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt"); + my $chr_name_sputt = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt_sputt"); + #print "sending to maftoAxt_multispecies: $mafile, $tree_definition, $chr_name, $species_set .. focalspec=$focalspec \n"; + maftoAxt_multispecies($mafile, $tree_definition, $chr_name, $species_set); + maftoAxt_multispecies($mafile_sputt, $tree_definition, $chr_name_sputt, $species_set); + #print "done maf to axt conversion\n"; + my $reverse_chr_name = join(".",("chr".$p_chr."r"),$arranged_species_set, "net", "axt"); + artificial_axdata_inverter ($chr_name, $reverse_chr_name); + #print "reverse_chr_name=$reverse_chr_name\n"; + #------------------------------------------------------------------------------- + # FIND THE CORRESPONDING CHIMP CHROMOSOME FROM FILE ORTp_chrS.TXT + foreach my $direct ("reverse_direction","forward_direction"){ + $p_chr=$chromt; + #print "direction = $direct\n"; + $p_chr = $p_chr."r" if $direct eq "reverse_direction"; + $p_chr = $p_chr if $direct eq "forward_direction"; + my $config = $species_set; + $config=~s/,/./g; + my @orgs = split(/\./,$arranged_species_set); + #print "ORGS= @orgs\n"; + my @tag=@orgs; + + + my $tags = join(",", @tag); + my @tags=@tag; + chomp $p_chr; + $tags = join("_", split(/,/, $tags)); + my $pchr = "chr".$p_chr; + + my $ptag = $orgs[0]."-".$pchr.".".join(".",@orgs[1 ... scalar(@orgs)-1])."-".$threshold_array; + my @sp_tags = (); + +# print "$ptag _ orthfile\n"; <STDIN>; + #print "orgs=@orgs, pchr=$pchr, hence, ptag = $ptag\n"; + foreach my $sp (@tag){ + push(@sp_tags, ($sp.".".$ptag)); + } + + my $preptag = $orgs[0]."-".$pchr.".".join(".",@orgs[1 ... scalar(@orgs)-1]); + my @presp_tags = (); + + foreach my $sp (@tag){ + push(@presp_tags, ($sp.".".$preptag)); + } + + my $resultdir = ""; + my $orthdir = ""; + my $filtereddir = ""; + my $pipedir = ""; + + my @title_queries = (); + push(@title_queries, "^[0-9]+"); + my $sep="\\s"; + for my $or (0 ... $#orgs){ + my $title = join($sep, ($orgs[$or], "[A-Za-z_]+[0-9a-zA-Z]+", "[0-9]+", "[0-9]+", "[\\-\\+]")); + #$title =~ s/chr\\+\\s+\+/chr/g; + push(@title_queries, $title); + } + my $title_query = join($sep, @title_queries); + #print "title_queries=@title_queries\n"; + #print "query = >$title_query<\n"; + #print "orgs = @orgs\n"; + #------------------------------------------------------------------------------- + # GET AXTNET FILES, EDIT THEM AND SPLIT THEM INTO HUMAN AND CHIMP INPUT FILES + my $t1input = $pchr.".".$arranged_species_set.".net.axt"; + + my @t1outputs = (); + + foreach my $sp (@presp_tags){ + push(@t1outputs, $sp."_gap_op"); + } + + + + multi_species_t1($t1input,$tags,(join(",", @t1outputs)), $title_query); + #print "t1outputs=@t1outputs\n"; + #print "done t1\n"; <STDIN>; + #------------------------------------------------------------------------------- + #START T2.PL + + my $stag = (); my $tag1 = (); my $tag2 = (); my $schrs = (); + + for my $t (0 ... scalar(@tags)-1){ + multi_species_t2($t1outputs[$t], $tag[$t]); + } + #------------------------------------------------------------------------------- + #START T2.2.PL + + my @temp_tags = @tag; + + foreach my $sp (@presp_tags){ + my $t2input = $sp."_nogap_op_unrand"; + multi_species_t2_2($t2input, shift(@temp_tags)); + } + undef (@temp_tags); + + #------------------------------------------------------------------------------- + #START SPUTNIK + + my @jobIDs = (); + @temp_tags = @tag; + my @sput_filelist = (); + + foreach my $sp (@presp_tags){ + #print "sp = $sp\n"; + my $sputnikoutput = $pipedir.$sp."_sput_op0"; + my $sputnikinput = $pipedir.$sp."_nogap_op_unrand"; + push(@sput_filelist, $sputnikinput); + my $sputnikcommand = $sputnikpath." ".$sputnikinput." > ".$sputnikoutput; + # print "$sputnikcommand\n"; + my @sputnikcommand_system = $sputnikcommand; + system(@sputnikcommand_system); + } + + #------------------------------------------------------------------------------- + #START SPUTNIK OUTPUT CORRECTOR + + foreach my $sp (@presp_tags){ + my $corroutput = $pipedir.$sp."_sput_op1"; + my $corrinput = $pipedir.$sp."_sput_op0"; + sputnikoutput_corrector($corrinput,$corroutput); + + my $t4output = $pipedir.$sp."_sput_op2"; + multi_species_t4($corroutput,$t4output); + + my $t5output = $pipedir.$sp."_sput_op3"; + multi_species_t5($t4output,$t5output); + #print "done t5.pl for $sp\n"; + + my $t6output = $pipedir.$sp."_sput_op4"; + multi_species_t6($t5output,$t6output,scalar(@orgs)); + } + #------------------------------------------------------------------------------- + #START T9.PL FOR T10.PL AND FOR INTERRUPTED HUNTING + + foreach my $sp (@presp_tags){ + my $t9output = $pipedir.$sp."_gap_op_unrand_match"; + my $t9sequence = $pipedir.$sp."_gap_op_unrand2"; + my $t9micro = $pipedir.$sp."_sput_op4"; + t9($t9micro,$t9sequence,$t9output); + + my $t9output2 = $pipedir.$sp."_nogap_op_unrand2_match"; + my $t9sequence2 = $pipedir.$sp."_nogap_op_unrand2"; + t9($t9micro,$t9sequence2,$t9output2); + } + #print "done both t9.pl for all orgs\n"; + + #------------------------------------------------------------------------------- + # FIND COMPOUND MICROSATELLITES + + @jobIDs = (); + my $species_counter = 0; + + foreach my $sp (@presp_tags){ + my $simple_microsats=$pipedir.$sp."_sput_op4_simple"; + my $compound_microsats=$pipedir.$sp."_sput_op4_compound"; + my $input_micro = $pipedir.$sp."_sput_op4"; + my $input_seq = $pipedir.$sp."_nogap_op_unrand2_match"; + multiSpecies_compound_microsat_hunter3($input_micro,$input_seq,$simple_microsats,$compound_microsats,$orgs[$species_counter], scalar(@sp_tags), $threshold_array ); + $species_counter++; + } + + #------------------------------------------------------------------------------- + # READING AND FILTERING SIMPLE MICROSATELLITES + my $spcounter2=0; + foreach my $sp (@sp_tags){ + my $presp = $presp_tags[$spcounter2]; + $spcounter2++; + my $simple_microsats=$pipedir.$presp."_sput_op4_simple"; + my $simple_filterout = $pipedir.$sp."_sput_op4_simple_filtered"; + my $simple_residue = $pipedir.$sp."_sput_op4_simple_residue"; + multiSpecies_filtering_interrupted_microsats($simple_microsats, $simple_filterout, $simple_residue,$threshold_array,$threshold_array,scalar(@sp_tags)); + } + + #------------------------------------------------------------------------------- + # ANALYZE COMPOUND MICROSATELLITES FOR BEING INTERRUPTED MICROSATS + + $species_counter = 0; + foreach my $sp (@sp_tags){ + my $presp = $presp_tags[$species_counter]; + my $compound_microsats = $pipedir.$presp."_sput_op4_compound"; + my $analyzed_simple_microsats=$pipedir.$presp."_sput_op4_compound_interrupted"; + my $analyzed_compound_microsats=$pipedir.$presp."_sput_op4_compound_pure"; + my $seq_file = $pipedir.$presp."_nogap_op_unrand2_match"; + multiSpecies_compound_microsat_analyzer($compound_microsats,$seq_file,$analyzed_simple_microsats,$analyzed_compound_microsats,$orgs[$species_counter], scalar(@sp_tags)); + $species_counter++; + } + #------------------------------------------------------------------------------- + # REANALYZE COMPOUND MICROSATELLITES FOR PRESENCE OF SIMPLE ONES WITHIN THEM.. + $species_counter = 0; + + foreach my $sp (@sp_tags){ + my $presp = $presp_tags[$species_counter]; + my $compound_microsats = $pipedir.$presp."_sput_op4_compound_pure"; + my $compound_interrupted = $pipedir.$presp."_sput_op4_compound_clarifiedInterrupted"; + my $compound_compound = $pipedir.$presp."_sput_op4_compound_compound"; + my $seq_file = $pipedir.$presp."_nogap_op_unrand2_match"; + multiSpecies_compoundClarifyer($compound_microsats,$seq_file,$compound_interrupted,$compound_compound,$orgs[$species_counter], scalar(@sp_tags), "2_4_6_8", "3_4_6_8", "2_4_6_8"); + $species_counter++; + } + #------------------------------------------------------------------------------- + # READING AND FILTERING SIMPLE AND COMPOUND MICROSATELLITES + $species_counter = 0; + + foreach my $sp (@sp_tags){ + my $presp = $presp_tags[$species_counter]; + + my $simple_microsats=$pipedir.$presp."_sput_op4_compound_clarifiedInterrupted"; + my $simple_filterout = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_filtered"; + my $simple_residue = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_residue"; + multiSpecies_filtering_interrupted_microsats($simple_microsats, $simple_filterout, $simple_residue,$threshold_array,$threshold_array,scalar(@sp_tags)); + + my $simple_microsats2 = $pipedir.$presp."_sput_op4_compound_interrupted"; + my $simple_filterout2 = $pipedir.$sp."_sput_op4_compound_interrupted_filtered"; + my $simple_residue2 = $pipedir.$sp."_sput_op4_compound_interrupted_residue"; + multiSpecies_filtering_interrupted_microsats($simple_microsats2, $simple_filterout2, $simple_residue2,$threshold_array,$threshold_array,scalar(@sp_tags)); + + my $compound_microsats=$pipedir.$presp."_sput_op4_compound_compound"; + my $compound_filterout = $pipedir.$sp."_sput_op4_compound_compound_filtered"; + my $compound_residue = $pipedir.$sp."_sput_op4_compound_compound_residue"; + multispecies_filtering_compound_microsats($compound_microsats, $compound_filterout, $compound_residue,$threshold_array,$threshold_array,scalar(@sp_tags)); + $species_counter++; + } + #print "done filtering both simple and compound microsatellites \n"; + + #------------------------------------------------------------------------------- + + my @combinedarray = (); + my @combinedarray_indicators = ("mononucleotide", "dinucleotide", "trinucleotide", "tetranucleotide"); + my @combinedarray_tags = ("mono", "di", "tri", "tetra"); + $species_counter = 0; + + foreach my $sp (@sp_tags){ + my $simple_interrupted = $pipedir.$sp."_simple_analyzed_simple"; + push @{$combinedarray[$species_counter]}, $pipedir.$sp."_simple_analyzed_simple_mono", $pipedir.$sp."_simple_analyzed_simple_di", $pipedir.$sp."_simple_analyzed_simple_tri", $pipedir.$sp."_simple_analyzed_simple_tetra"; + $species_counter++; + } + + #------------------------------------------------------------------------------- + # PUT TOGETHER THE INTERRUPTED AND SIMPLE MICROSATELLITES BASED ON THEIR MOTIF SIZE FOR FURTHER EXTENTION + my $sp_counter = 0; + foreach my $sp (@sp_tags){ + my $analyzed_simple = $pipedir.$sp."_sput_op4_compound_interrupted_filtered"; + my $clarifyed_simple = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_filtered"; + my $simple = $pipedir.$sp."_sput_op4_simple_filtered"; + my $simple_analyzed_simple = $pipedir.$sp."_simple_analyzed_simple"; + `cat $analyzed_simple $clarifyed_simple $simple > $simple_analyzed_simple`; + for my $i (0 ... 3){ + `grep "$combinedarray_indicators[$i]" $simple_analyzed_simple > $combinedarray[$sp_counter][$i]`; + } + $sp_counter++; + } + #print "\ndone grouping interrupted & simple microsats based on their motif size for further extention\n"; + + #------------------------------------------------------------------------------- + # BREAK CHROMOSOME INTO PARTS OF CERTAIN NO. CONTIGS EACH, FOR FUTURE SEARCHING OF INTERRUPTED MICROSATELLITES + # ESPECIALLY DI, TRI AND TETRANUCLEOTIDE MICROSATELLITES + @temp_tags = @sp_tags; + my $increment = 1000000; + my @splist = (); + my $targetdir = $pipedir; + $species_counter=0; + + foreach my $sp (@sp_tags){ + my $presp = $presp_tags[$species_counter]; + $species_counter++; + my $localtag = shift @temp_tags; + my $locallist = $targetdir.$localtag."_".$p_chr."_list"; + push(@splist, $locallist); + my $input = $pipedir.$presp."_nogap_op_unrand2_match"; + chromosome_unrand_breaker($input,$targetdir,$locallist,$increment, $localtag, $pchr); + } + + + my @unionarray = (); + #print "splist=@splist\n"; + #------------------------------------------------------------------------------- + # FIND INTERRUPTED MICROSATELLITES + + $species_counter = 0; + + for my $i (0 .. $#combinedarray){ + + @jobIDs = (); + open (JLIST1, "$splist[$i]") or die "Cannot open file $splist[$i]: $!"; + + while (my $sp1 = <JLIST1>){ + #print "$splist[$i]: sp1=$sp1\n"; + chomp $sp1; + + for my $j (0 ... $#combinedarray_tags){ + my $interr = $sp1."_interr_".$combinedarray_tags[$j]; + my $simple = $sp1."_simple_".$combinedarray_tags[$j]; + push @{$unionarray[$i]}, $interr, $simple; + multiSpecies_interruptedMicrosatHunter($combinedarray[$i][$j],$sp1,$interr ,$simple, $orgs[$species_counter], scalar(@sp_tags), "3_4_6_8"); + } + } + $species_counter++; + } + close JLIST1; + #------------------------------------------------------------------------------- + # REUNION AND ZIPPING BEFORE T10.PL + + my @allarray = (); + + for my $i (0 ... $#sp_tags){ + my $localfile = $pipedir.$sp_tags[$i]."_allmicrosats"; + unlink $localfile if -e $localfile; + push(@allarray, $localfile); + + my $unfiltered_localfile= $localfile."_unfiltered"; + my $residue_localfile= $localfile."_residue"; + + unlink $unfiltered_localfile; + #unlink $unfiltered_localfile; + for my $j (0 ... $#{$unionarray[$i]}){ + #print "listing files for species $i and list number $j= \n$unionarray[$i][$j] \n"; + `cat $unionarray[$i][$j] >> $unfiltered_localfile`; + unlink $unionarray[$i][$j]; + } + + multiSpecies_filtering_interrupted_microsats($unfiltered_localfile, $localfile, $residue_localfile,$threshold_array,$threshold_array,scalar(@sp_tags) ); + my $analyzed_compound = $pipedir.$sp_tags[$i]."_sput_op4_compound_compound_filtered"; + my $simple_residue = $pipedir.$sp_tags[$i]."_sput_op4_simple_residue"; + my $compound_residue = $pipedir.$sp_tags[$i]."_sput_op4_compound_residue"; + + `cat $analyzed_compound >> $localfile`; + } + #------------------------------------------------------------------------------- + # MERGING MICROSATELLITES THAT ARE VERY CLOSE TO EACH OTHER, INCLUDING THOSE FOUND BY SEARCHING IN 2 OPPOSIT DIRECTIONS + + my $toescape=0; + + + for my $i (0 ... $#sp_tags){ + my $localfile = $pipedir.$sp_tags[$i]."_allmicrosats"; + $localfile =~ /$focalspec\-(chr[0-9a-zA-Z]+)\./; + my $direction = $1; + #print "localfile = $localfile , direction = $direction\n"; + # `gzip $reverse_chr_name` if $direction =~ /chr[0-9a-zA-Z]+r/ && $switchboard{"deleting_processFiles"} != 1; + $toescape =1 if $direction =~ /chr[0-9a-zA-Z]+r/; + last if $direction =~ /chr[0-9a-zA-Z]+r/; + my $nogap_sequence = $pipedir.$presp_tags[$i]."_nogap_op_unrand2_match"; + my $gap_sequence = $pipedir.$presp_tags[$i]."_gap_op_unrand_match"; + my $reverselocal = $localfile; + $reverselocal =~ s/\-chr([0-9a-zA-Z]+)\./-chr$1r./g; + merge_interruptedMicrosats($nogap_sequence,$localfile, $reverselocal ,scalar(@sp_tags)); + #------------------------------------------------------------------------------- + my $forward_separate = $localfile."_separate"; + my $reverse_separate = $reverselocal."_separate"; + my $diff = $forward_separate."_diff"; + my $miss = $forward_separate."_miss"; + my $common = $forward_separate."_common"; + forward_reverse_sputoutput_comparer($nogap_sequence,$forward_separate, $reverse_separate, $diff, $miss, $common ,scalar(@sp_tags)); + #------------------------------------------------------------------------------- + my $symmetrical_file = $localfile."_symmetrical"; + my $merged_file = $localfile."_merged"; + #print "cating: $merged_file $common into -> $symmetrical_file \n"; + `cat $merged_file $common > $symmetrical_file`; + #------------------------------------------------------------------------------- + my $t10output = $symmetrical_file."_fin_hit_all_2"; + new_multispecies_t10($gap_sequence, $symmetrical_file, $t10output, join(".", @orgs)); + #------------------------------------------------------------------------------- + } + next if $toescape == 1; + #------------------------------------------------------------------------------------------------ + # BRINGING IT ALL TOGETHER: FINDING ORTHOLOGOUS MICROSATELLITES AMONG THE SPECIES + + + my @micros_array = (); + my $sampletag = (); + for my $i (0 ... $#sp_tags){ + my $finhitFile = $pipedir.$sp_tags[$i]."_allmicrosats_symmetrical_fin_hit_all_2"; + push(@micros_array, $finhitFile); + $sampletag = $sp_tags[$i]; + } + #$sampletag =~ s/^([A-Z]+\.)/ORTH_/; + #$sampletag = $sampletag."_monoThresh-".$mono_threshold."bp"; + my $orthfiletemp = $ptag."_orthfile"; + my $orthanswer = multiSpecies_orthFinder4($t1input, join(":",@micros_array), $orthfiletemp, join(":", @orgs), $separation); + + my $maskedorthfiletemp = $ptag."_orthfile_masked"; + qualityFilter ($orthfiletemp, $chr_name_sputt, $maskedorthfiletemp); + + push @outputfiles , $maskedorthfiletemp; + } + $date = `date`; +} + +`cat @outputfiles > $orthfile`; + +my $rootdir = $dir; +$rootdir =~ s/\/[A-Za-z0-9\-_]+$//; +chdir $rootdir; +remove_tree($dir); + +#print "date = $date\n"; +#remove_tree($tdir); +#------------------------------------------------------------------------------------------------ +#------------------------------------------------------------------------------------------------ +#------------------------------------------------------------------------------------------------ +#------------------------------------------------------------------------------------------------ + +#xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx + +sub maftoAxt_multispecies { + #print "in maftoAxt_multispecies : got @_\n"; + my $fname=$_[0]; + open(IN,"<$_[0]") or die "Cannot open $_[0]: $! \n"; + my $treedefinition = $_[1]; + open(OUT,">$_[2]") or die "Cannot open $_[2]: $! \n"; + my $counter = 0; + my $exactspeciesset = $_[3]; + my @exactspeciesset_unarranged = split(/,/,$exactspeciesset); + + $treedefinition=~s/[\)\(, ]/\t/g; + my @species=split(/\t+/,$treedefinition); + my @exactspecies=(); + + foreach my $spec (@species){ + foreach my $espec (@exactspeciesset_unarranged){ + push @exactspecies, $spec if $spec eq $espec; + } + } + #print "exactspecies=@exactspecies\n"; + + ########### + my $select = 2; + #select = 1 if all species need sequences to be present for each block otherwise, it is 0 + #select = 2 only the allowed set make up the alignment. use the removeset + # information to detect alignmenets that have other important genomes aligned. + ########### + my @allowedset = (); + @allowedset = split(/;/,allowedSetOfSpecies(join("_",@species))) if $select == 0; + @allowedset = join("_",0,@species) if $select == 1; + #print "species = @species , allowedset =",join("\n", @allowedset) ," \n"; + @allowedset = join("_",0,@exactspecies) if $select == 2; + #print "allowedset = @allowedset and exactspecies = @exactspecies\n"; + + my $start = 0; + my @sequences = (); + my @titles = (); + my $species_counter = "0"; + my $countermatch = 0; + my $outsideSpecies=0; + + while(my $line = <IN>){ +# print $line; + next if $line =~ /^#/; + next if $line =~ /^i/; + chomp $line; + my @fields = split(/\s+/,$line); + chomp $line; + if ($line =~ /^a /){ + $start = 1; + } + + if ($line =~ /^s /){ + + foreach my $sp (@allspecies){ +# print "checking species $sp\n"; + if ($fields[1] =~ /$sp/){ + $species_counter = $species_counter."_".$sp; + push(@sequences, $fields[6]); + my @sp_info = split(/\./,$fields[1]); + my $title = join(" ",@sp_info, $fields[2], ($fields[2]+$fields[3]), $fields[4]); + push(@titles, $title); +# print "species_counter = $species_counter\n"; + } + } + } + + if (($line !~ /^a/) && ($line !~ /^s/) && ($line !~ /^#/) && ($line !~ /^i/) && ($start = 1)){ +# print "species_counter = $species_counter\n"; + my $arranged = reorderSpecies($species_counter, @allspecies); + my $stopper = 1; + my $arrno = 0; + +# print "checking if ", scalar(@sequences), " match @exactspecies allowedset=@allowedset\n"; + if (scalar(@sequences) == scalar(@exactspecies)){ + foreach my $set (@allowedset){ +# print "testing $arranged against $set\n"; + if ($arranged eq $set){ + $stopper = 0; last; + } + $arrno++; + } + } + else{ + $stopper = 1; + } + + + if ($stopper == 0) { + @titles = split ";", orderInfo(join(";", @titles), $species_counter, $arranged) if $species_counter ne $arranged; + @sequences = split ";", orderInfo(join(";", @sequences), $species_counter, $arranged) if $species_counter ne $arranged; + my $filteredseq = filter_gaps(@sequences); + + if ($filteredseq ne "SHORT"){ + #print "printing"; <STDIN>; + $counter++; + print OUT join (" ",$counter, @titles), "\n"; + print OUT $filteredseq, "\n"; + print OUT "\n"; + $countermatch++; + } + } + else{ #print "nexting\n";<STDIN>; + } + + @sequences = (); @titles = (); $start = 0;$species_counter = "0"; + next; + + } + } +# print "countermatch = $countermatch\n"; +} + +sub reorderSpecies{ + my @inarr=@_; + my $currSpecies = shift (@inarr); + my $ordered_species = 0; + my @species=@inarr; + #print "species = @species\n"; + foreach my $order (@species){ + $ordered_species = $ordered_species."_".$order if $currSpecies=~ /$order/; + } + return $ordered_species; + +} + +sub filter_gaps{ + my @sequences = @_; +# print "sequences sent are @sequences\n"; + my $seq_length = length($sequences[0]); + my $seq_no = scalar(@sequences); + my $allgaps = (); + for (1 ... $seq_no){ + $allgaps = $allgaps."-"; + } + + my @seq_array = (); + my $seq_counter = 0; + foreach my $seq (@sequences){ +# my @sequence = split(/\s*/,$seq); + $seq_array[$seq_counter] = [split(/\s*/,$seq)]; +# push @seq_array, [@sequence]; + $seq_counter++; + } + my $g = 0; + while ( $g < $seq_length){ + last if (!exists $seq_array[0][$g]); + my $bases = (); + for my $u (0 ... $#seq_array){ + $bases = $bases.$seq_array[$u][$g]; + } +# print $bases, "\n"; + if ($bases eq $allgaps){ +# print "bases are $bases, position is $g \n"; + for my $seq (@seq_array){ + splice(@$seq , $g, 1); + } + } + else { + $g++; + } + } + + my @outs = (); + + foreach my $seq (@seq_array){ + push(@outs, join("",@$seq)); + } + return "SHORT" if length($outs[0]) <=100; + return (join("\n", @outs)); +} + + +sub allowedSetOfSpecies{ + my @allowed_species = split(/_/,$_[0]); + unshift @allowed_species, 0; +# print "allowed set = @allowed_species \n"; + my @output = (); + for (0 ... scalar(@allowed_species) - 4){ + push(@output, join("_",@allowed_species)); + pop @allowed_species; + } + return join(";",reverse(@output)); + +} + + +sub orderInfo{ + my @info = split(/;/,$_[0]); +# print "info = @info"; + my @old = split(/_/,$_[1]); + my @new = split(/_/,$_[2]); + shift @old; shift @new; + my @outinfo = (); + foreach my $spe (@new){ + for my $no (0 ... $#old){ + if ($spe eq $old[$no]){ + push(@outinfo, $info[$no]); + } + } + } +# print "outinfo = @outinfo \n"; + return join(";", @outinfo); +} + +#xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx + +#xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx +sub artificial_axdata_inverter{ + open(IN,"<$_[0]") or die "Cannot open file $_[0]: $!"; + open(OUT,">$_[1]") or die "Cannot open file $_[1]: $!"; + my $linecounter=0; + while (my $line = <IN>){ + $linecounter++; + #print "$linecounter\n"; + chomp $line; + my $final_line = $line; + my $trycounter = 0; + if ($line =~ /^[a-zA-Z\-]/){ + # while ($final_line eq $line){ + my @fields = split(/\s*/,$line); + + $final_line = join("",reverse(@fields)); + # print colored ['red'], "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/; + # $trycounter++; + # print "trying again....$trycounter : $final_line\n" if $final_line eq $line; + # } + } + + # print colored ['yellow'], "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/; + if ($line =~ /^[0-9]/){ + $line =~ s/chr([A-Z0-9a-b]+)/chr$1r/g; + $final_line = $line; + } + print OUT $final_line,"\n"; + #print "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/; + } + close OUT; +} +#xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx + + +#xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx + +sub multi_species_t1 { + + my $input1 = $_[0]; + #print "@_\n"; <STDIN>; + my @tags = split(/_/, $_[1]); + my @outputs = split(/,/, $_[2]); + my $title_query = $_[3]; + my @handles = (); + + open(FILEB,"<$input1")or die "Cannot open file: $input1 $!"; + my $i = 0; + foreach my $path (@outputs){ + $handles[$i] = IO::Handle->new(); + open ($handles[$i], ">$path") or die "Can't open $path : $!"; + $i++; + } + + my $curdef; + my $start = 0; + + while (my $line = <FILEB> ) { + if ($line =~ /^\d/){ + $line =~ s/ +/\t/g; + my @fields = split(/\s+/, $line); + if (($line =~ /$title_query/)){ + my $title = $line; + my $counter = 0; + foreach my $tag (@tags){ + $line = <FILEB>; + print {$handles[$counter]} ">",$tag,"\t",$title, " ",$line; + $counter++; + } + } + else{ + foreach my $tag (@tags){ + my $tine = <FILEB>; + } + } + + } + } + + foreach my $hand (@handles){ + $hand->close(); + } + + close FILEB; +} + +#xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx + +#xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx + +sub multi_species_t2{ + + my $input = $_[0]; + my $species = $_[1]; + my $output1 = $input."_unr"; + + #------------------------------------------------------------------------------------------ + open (FILEF1, "<$input") or die "Cannot open file $input :$!"; + open (FILEF2, ">$output1") or die "Cannot open file $output1 :$!"; + + my $line1 = <FILEF1>; + + while($line1){ + { + # chomp($line); + if ($line1 =~ (m/^\>$species/)){ + chomp($line1); + print FILEF2 $line1; + $line1 = <FILEF1>; + chomp($line1); + print FILEF2 "\t", $line1,"\n"; + } + } + $line1 = <FILEF1>; + } + + close FILEF1; + close FILEF2; + #------------------------------------------------------------------------------------------ + + my $output2 = $output1."and"; + my $output3 = $output1."and2"; + open(IN,"<$output1"); + open (FILEF3, ">$output2"); + open (FILEF4, ">$output3"); + + + while (<IN>){ + my $line = $_; + chomp($line); + my @fields=split (/\t/, $line); + # print $line,"\n"; <STDIN>; + if($line !~ /random/){ + print FILEF3 join ("\t",@fields[0 ... scalar(@fields)-2]), "\n", $fields[scalar(@fields)-1], "\n"; + print FILEF4 join ("\t",@fields[0 ... scalar(@fields)-2]), "\t", $fields[scalar(@fields)-1], "\n"; + } + } + + + close IN; + close FILEF3; + close FILEF4; + unlink $output1; + + #------------------------------------------------------------------------------------------ + # OLD T3.PL RUDIMENT + + my $t3output = $output2; + $t3output =~ s/gap_op_unrand/nogap_op_unrand/g; + + open(IN,"<$output2"); + open(OUTA,">$t3output"); + + + while (<IN>){ + s/-//g unless /^>/; + print OUTA; + } + + close IN; + close OUTA; + #------------------------------------------------------------------------------------------ +} +#xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx + + +#xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxxmulti_species_t2_2 xxxxxxx +sub multi_species_t2_2{ + #print "IN multi_species_t2_2 : @_\n"; + my $input = $_[0]; + my $species = $_[1]; + my $output1 = $input."2"; + + + open (FILEF1, "<$input"); + open (FILEF2, ">$output1"); + + my $line1 = <FILEF1>; + + while($line1){ + { + # chomp($line); + if ($line1 =~ (m/^\>$species/)){ + chomp($line1); + print FILEF2 $line1; + $line1 = <FILEF1>; + chomp($line1); + print FILEF2 "\t", $line1,"\n"; + } + } + $line1 = <FILEF1>; + } + + close FILEF1; + close FILEF2; +} + +#xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx + + +#xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx +sub sputnikoutput_corrector{ + my $input = $_[0]; + my $output = $_[1]; + open(IN,"<$input") or die "Cannot open file $input :$!"; + open(OUT,">$output") or die "Cannot open file $output :$!"; + my $tine; + while (my $line=<IN>){ + if($line =~/length /){ + $tine = $line; + $tine =~ s/\s+/\t/g; + my @fields = split(/\t/,$tine); + if ($fields[6] > 60){ + print OUT $line; + $line = <IN>; + + while (($line !~ /nucleotide/) && ($line !~ /^>/)){ + chomp $line; + print OUT $line; + $line = <IN>; + } + print OUT "\n"; + print OUT $line; + } + else{ + print OUT $line; + } + } + else{ + print OUT $line; + } + } + close IN; + close OUT; +} +#xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx + + +#xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx +sub multi_species_t4{ +# print "multi_species_t4 : @_\n"; + my $input = $_[0]; + my $output = $_[1]; + open (FILEA, "<$input"); + open (FILEB, ">$output"); + + my $line = <FILEA>; + + while ($line) { + # chomp $line; + if ($line =~ />/) { + chomp $line; + print FILEB $line, "\n"; + } + + + if ($line =~ /^m/ | $line =~ /^d/ | $line =~ /^t/ | $line =~ /^p/){ + chomp $line; + print FILEB $line, " " ; + $line = <FILEA>; + chomp $line; + print FILEB $line,"\n"; + } + + $line = <FILEA>; + } + + + close FILEA; + close FILEB; + +} + +#xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx + + +#xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx +sub multi_species_t5{ + + my $input = $_[0]; + my $output = $_[1]; + + open(FILEB,"<$input"); + open(FILEC,">$output"); + + my $curdef; + + while (my $line = <FILEB> ) { + + if ($line =~ /^>/){ + chomp $line; + $curdef = $line; + next; + } + + if ($line =~ /^m/ | $line =~ /^d/ | $line =~ /^t/ | $line =~ /^p/){ + print FILEC $curdef," ",$line; + } + + } + + + close FILEB; + close FILEC; + +} +#xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx + + +#xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx +sub multi_species_t6{ + my $input = $_[0]; + my $output = $_[1]; + my $focalstrand=$_[3]; +# print "inpput = @_\n"; + open (FILE, "<$input"); + open (FILE_MICRO, ">$output"); + my $linecounter=0; + while (my $line = <FILE>){ + $linecounter++; + chomp $line; + #print "line = $line\n"; + #MONO# + $line =~ /$focalspec\s[a-zA-Z]+[0-9a-zA-Z]+\s[0-9]+\s[0-9]+\s([+\-])/; + my $strand=$1; + my $no_of_species = ($line =~ s/\s+[+\-]\s+/ /g); + #print "line = $line\n"; + my $specfieldsend = 2 + ($no_of_species*4) - 1; + my @fields = split(/\s+/, $line); + my @speciesdata = @fields[0 ... $specfieldsend]; + $line =~ /([a-z]+nucleotide)\s([0-9]+)\s:\s([0-9]+)/; + my ($tide, $start, $end) = ($1, $2, $3); + #print "no_of_species=$no_of_species.. speciesdata = @speciesdata and ($tide, $start, $end)\n"; + if($line =~ /mononucleotide/){ + print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], mono($fields[$#fields]),),"\n"; + } + #DI# + elsif($line =~ /dinucleotide/){ + print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], di($fields[$#fields]),),"\n"; + } + #TRI# + elsif($line =~ /trinucleotide/ ){ + print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], tri($fields[$#fields]),),"\n"; + } + #TETRA# + elsif($line =~ /tetranucleotide/){ + print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], tetra($fields[$#fields]),),"\n"; + } + #PENTA# + elsif($line =~ /pentanucleotide/){ + #print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], penta($fields[$#fields]),),"\n"; + } + else{ + # print "not: @fields\n"; + } + } +# print "linecounter=$linecounter\n"; + close FILE; + close FILE_MICRO; +} + +sub mono { + my $st = $_[0]; + my $tp = unpack "A1"x(length($st)/1),$st; + my $var1 = substr($tp, 0, 1); + return join ("\t", $var1); +} +sub di { + my $st = $_[0]; + my $tp = unpack "A2"x(length($st)/2),$st; + my $var1 = substr($tp, 0, 2); + return join ("\t", $var1); +} +sub tri { + my $st = $_[0]; + my $tp = unpack "A3"x(length($st)/3),$st; + my $var1 = substr($tp, 0, 3); + return join ("\t", $var1); +} +sub tetra { + my $st = $_[0]; + my $tp = unpack "A4"x(length($st)/4),$st; + my $var1 = substr($tp, 0, 4); + return join ("\t", $var1); +} +sub penta { + my $st = $_[0]; + my $tp = unpack "A5"x(length($st)/5),$st; + my $var1 = substr($tp, 0, 5); + return join ("\t", $var1); +} + +#xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx + + +#xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx +sub t9{ + my $input1 = $_[0]; + my $input2 = $_[1]; + my $output = $_[2]; + + + open(IN1,"<$input1") if -e $input1; + open(IN2,"<$input2") or die "cannot open file $_[1] : $!"; + open(OUT,">$output") or die "cannot open file $_[2] : $!"; + + + my %seen = (); + my $prevkey = 0; + + if (-e $input1){ + while (my $line = <IN1>){ + chomp($line); + my @fields = split(/\t/,$line); + my $key1 = join ("_K10K1_",@fields[0,1,3,4,5]); + # print "key in t9 = $key1\n"; + $seen{$key1}++ if ($prevkey ne $key1) ; + $prevkey = $key1; + } +# print "done first hash\n"; + close IN1; + } + + while (my $line = <IN2>){ + # print $line, "**\n"; + if (-e $input1){ + chomp($line); + my @fields = split(/\t/,$line); + my $key2 = join ("_K10K1_",@fields[0,1,3,4,5]); + if (exists $seen{$key2}){ + print OUT "$line\n" ; + delete $seen{$key2}; + } + } + else { + print OUT "$line\n" ; +# print "$line\n" ; + } + } + + close IN2; + close OUT; +} +#xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx + + +#xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx + + +sub multiSpecies_compound_microsat_hunter3{ + + my $input1 = $_[0]; ###### the *_sput_op4_ii file + my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = $pipedir.$ptag."_nogap_op_unrand2" + my $output1 = $_[2]; ###### plain microsatellite file + my $output2 = $_[3]; ###### compound microsatellite file + my $org = $_[4]; ###### 1 or 2 + $no_of_species = $_[5]; + #print "IN multiSpecies_compound_microsat_hunter3: @_\n"; + #my @tags = split(/\t/,$info); + sub compoundify; + open(IN,"<$input1") or die "Cannot open file $input1 $!"; + open(SEQ,"<$input2") or die "Cannot open file $input2 $!"; + open(OUT,">$output1") or die "Cannot open file $output1 $!"; + open(OUT2,">$output2") or die "Cannot open file $output2 $!"; + $infocord = 2 + (4*$no_of_species) - 1; + $startcord = 2 + (4*$no_of_species) + 2 - 1; + $strandcord = 2 + (4*$no_of_species) + 3 - 1; + $endcord = 2 + (4*$no_of_species) + 4 - 1; + $microsatcord = 2 + (4*$no_of_species) + 5 - 1; + $motifcord = 2 + (4*$no_of_species) + 6 - 1; + my $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; + + my @thresholds = ("0"); + push(@thresholds, split(/_/,$_[6])); + sub thresholdCheck; + my %micros = (); + while (my $line = <IN>){ + # print "$org\t(chr[0-9]+)\t([0-9]+)\t([0-9])+\t \n"; + next if $line =~ /\t\t/; + if ($line =~ /^>[A-Za-z0-9_]+\s+([0-9]+)\s+([a-zA-Z0-9]+)\s([a-zA-Z]+[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { + my $key = join("\t",$1, $2, $3, $4, $5); + # print $key, "#-#-#-#-#-#-#-#\n"; + push (@{$micros{$key}},$line); + } + else{ + } + } + close IN; + my @deletedlines = (); + + my $linecount = 0; + + while(my $sine = <SEQ>){ + my %microstart=(); + my %microend=(); + + my @sields = split(/\t/,$sine); + + my $key = (); + + if ($sine =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([a-zA-Z0-9]+)\s([a-zA-Z]+[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { + $key = join("\t",$1, $2, $3, $4, $5); + # print $key, "<-<-<-<-<-<-<-<\n"; + } + else{ + } + + if (exists $micros{$key}){ + $linecount++; + my @microstring = @{$micros{$key}}; + my @tempmicrostring = @{$micros{$key}}; + + foreach my $line (@tempmicrostring){ + my @fields = split(/\t/,$line); + my $start = $fields[$startcord]; + my $end = $fields[$endcord]; + push (@{$microstart{$start}},$line); + push (@{$microend{$end}},$line); + } + my $firstflag = 'down'; + while( my $line =shift(@microstring)){ + # print "-----------\nline = $line "; + chomp $line; + my @fields = split(/\t/,$line); + my $start = $fields[$startcord]; + my $end = $fields[$endcord]; + my $startmicro = $line; + my $endmicro = $line; + + # print "fields=@fields, start = $start end=$end, startcord=$startcord, endcord=$endcord\n"; + + delete ($microstart{$start}); + delete ($microend{$end}); + my $flag = 'down'; + my $startflag = 'down'; + my $endflag = 'down'; + my $prestart = $start - $distance; + my $postend = $end + $distance; + my @compoundlines = (); + my %compoundhash = (); + push (@compoundlines, $line); + push (@{$compoundhash{$line}},$line); + my $startrank = 1; + my $endrank = 1; + + while( ($startflag eq "down") || ($endflag eq "down") ){ + if ((($prestart < 0) && $firstflag eq "up") || (($postend > length($sields[$sequencepos])) && $firstflag eq "up") ) { +# print "coming to the end of sequence,prestart = $prestart & post end = $postend and sequence length =", length($sields[$sequencepos])," so exiting\n"; + last; + } + + $firstflag = "up"; + if ($startflag eq "down"){ + for my $i ($prestart ... $start){ + + if(exists $microend{$i}){ + chomp $microend{$i}[0]; + if(exists $compoundhash{$microend{$i}[0]}) {next;} + # print "sending from microend $startmicro, $microend{$i}[0] |||\n"; + if (identityMatch_thresholdCheck($startmicro, $microend{$i}[0], $startrank) eq "proceed"){ + push(@compoundlines, $microend{$i}[0]); + # print "accepted\n"; + my @tields = split(/\t/,$microend{$i}[0]); + $startmicro = $microend{$i}[0]; + chomp $startmicro; + $start = $tields[$startcord]; + $flag = 'down'; + $startrank++; + # print "startcompund = $microend{$i}[0]\n"; + delete $microend{$i}; + delete $microstart{$start}; + $startflag = 'down'; + $prestart = $start - $distance; + last; + } + else{ + $flag = 'up'; + $startflag = 'up'; + } + } + else{ + $flag = 'up'; + $startflag = 'up'; + } + } + } + + $endrank = $startrank; + + if ($endflag eq "down"){ + for my $i ($end ... $postend){ + + if(exists $microstart{$i} ){ + chomp $microstart{$i}[0]; + if(exists $compoundhash{$microstart{$i}[0]}) {next;} + # print "sending from microstart $endmicro, $microstart{$i}[0] |||\n"; + + if(identityMatch_thresholdCheck($endmicro,$microstart{$i}[0], $endrank) eq "proceed"){ + push(@compoundlines, $microstart{$i}[0]); + # print "accepted\n"; + my @tields = split(/\t/,$microstart{$i}[0]); + $end = $tields[$endcord]-0; + $endmicro = $microstart{$i}[0]; + $endrank++; + chomp $endmicro; + $flag = 'down'; + # print "endcompund = $microstart{$i}[0]\n"; + delete $microstart{$i}; + delete $microend{$end}; + shift @microstring; + $postend = $end + $distance; + $endflag = 'down'; + last; + } + else{ + $flag = 'up'; + $endflag = 'up'; + } + } + else{ + $flag = 'up'; + $endflag = 'up'; + } + } + } + # print "for next turn, flag status: startflag = $startflag and endflag = $endflag \n"; + } #end while( $flag eq "down") + # print "compoundlines = @compoundlines \n"; + if (scalar (@compoundlines) == 1){ + print OUT $line,"\n"; + } + if (scalar (@compoundlines) > 1){ + my $compoundline = compoundify(\@compoundlines, $sields[$sequencepos]); + # print $compoundline,"\n"; + print OUT2 $compoundline,"\n"; + } + } #end foreach my $line (@microstring){ + } #if (exists $micros{$key}){ + + + } + + close OUT; + close OUT2; +} + + +#------------------------------------------------------------------------------------------------ +sub compoundify{ + my ($compoundlines, $sequence) = @_; +# print "\nfound to compound : @$compoundlines and$sequence \n"; + my $noOfComps = @$compoundlines; +# print "Number of elements in hash is $noOfComps\n"; + my @starts; + my @ends; + foreach my $line (@$compoundlines){ +# print "compoundify.. line = $line \n"; + chomp $line; + my @fields = split(/\t/,$line); + my $start = $fields[$startcord]; + my $end = $fields[$endcord]; + # print "start = $start, end = $end \n"; + push(@starts, $start); + push(@ends,$end); + } + my @temp = @$compoundlines; + my $startline=$temp[0]; + my @mields = split(/\t/,$startline); + my $startcoord = $mields[$startcord]; + my $startgapsign=$mields[$endcord]; + my @startsorted = sort { $a <=> $b } @starts; + my @endsorted = sort { $a <=> $b } @ends; + my @intervals; + for my $end (0 ... (scalar(@endsorted)-2)){ + my $interval = substr($sequence,($endsorted[$end]+1),(($startsorted[$end+1])-($endsorted[$end])-1)); + push(@intervals,$interval); + # print "interval = $interval =\n"; + # print "substr(sequence,($endsorted[$end]+1),(($startsorted[$end+1])-($endsorted[$end])-1))\n"; + } + push(@intervals,""); + my $compoundmicrosat=(); + my $multiunit=""; + foreach my $line (@$compoundlines){ + my @fields = split(/\t/,$line); + my $component="[".$fields[$microsatcord]."]".shift(@intervals); + $compoundmicrosat=$compoundmicrosat.$component; + $multiunit=$multiunit."[".$fields[$motifcord]."]"; +# print "multiunit = $multiunit\n"; + } + my $compoundcopy = $compoundmicrosat; + $compoundcopy =~ s/\[|\]//g; + my $compoundlength = $mields[$startcord] + length($compoundcopy) - 1; + + + my $compoundline = join("\t",(@mields[0 ... $infocord], "compound",@mields[$startcord ... $startcord+1],$compoundlength,$compoundmicrosat, $multiunit)); + return $compoundline; +} + +#------------------------------------------------------------------------------------------------ + +sub identityMatch_thresholdCheck{ + my $line1 = $_[0]; + my $line2 = $_[1]; + my $rank = $_[2]; + my @lields1 = split(/\t/,$line1); + my @lields2 = split(/\t/,$line2); +# print "recieved $line1 && $line2\n motif comparison: ", length($lields1[$motifcord])," : ",length($lields2[$motifcord]),"\n"; + + if (length($lields1[$motifcord]) == length($lields2[$motifcord])){ + my $probe = $lields1[$motifcord].$lields1[$motifcord]; + #print "$probe :: $lields2[$motifcord]\n"; + return "proceed" if $probe =~ /$lields2[$motifcord]/; + #print "line recieved\n"; + if ($rank ==1){ + return "proceed" if thresholdCheck($line1) eq "proceed" && thresholdCheck($line2) eq "proceed"; + } + else { + return "proceed" if thresholdCheck($line2) eq "proceed"; + return "stop"; + } + } + else{ + if ($rank ==1){ + return "proceed" if thresholdCheck($line1) eq "proceed" && thresholdCheck($line2) eq "proceed"; + } + else { + return "proceed" if thresholdCheck($line2) eq "proceed"; + return "stop"; + } + } + return "stop"; +} +#------------------------------------------------------------------------------------------------ + +sub thresholdCheck{ + my @checkthresholds=(0,@thresholds); + #print "IN thresholdCheck: @_\n"; + my $line = $_[0]; + my @lields = split(/\t/,$line); + return "proceed" if length($lields[$microsatcord]) >= $checkthresholds[length($lields[$motifcord])]; + return "stop"; +} +#xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx + + +#xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx + +sub multiSpecies_filtering_interrupted_microsats{ +# print "IN multiSpecies_filtering_interrupted_microsats: @_\n"; + my $unfiltered = $_[0]; + my $filtered = $_[1]; + my $residue = $_[2]; + my $no_of_species = $_[5]; + open(UNF,"<$unfiltered") or die "Cannot open file $unfiltered: $!"; + open(FIL,">$filtered") or die "Cannot open file $filtered: $!"; + open(RES,">$residue") or die "Cannot open file $residue: $!"; + + $infocord = 2 + (4*$no_of_species) - 1; + $startcord = 2 + (4*$no_of_species) + 2 - 1; + $strandcord = 2 + (4*$no_of_species) + 3 - 1; + $endcord = 2 + (4*$no_of_species) + 4 - 1; + $microsatcord = 2 + (4*$no_of_species) + 5 - 1; + $motifcord = 2 + (4*$no_of_species) + 6 - 1; + + + my @sub_thresholds = (0); + + push(@sub_thresholds, split(/_/,$_[3])); + my @thresholds = (0); + + push(@thresholds, split(/_/,$_[4])); + + while (my $line = <UNF>) { + next if $line !~ /[a-z]/; + #print $line; + chomp $line; + my @fields = split(/\t/,$line); + my $motif = $fields[$motifcord]; + my $realmotif = $motif; + #print "motif = $motif\n"; + if ($motif =~ /^\[/){ + $motif =~ s/^\[//g; + my @motifs = split(/\]/,$motif); + $realmotif = $motifs[0]; + } +# print "realmotif = $realmotif"; + my $motif_size = length($realmotif); + + my $microsat = $fields[$microsatcord]; +# print "microsat = $microsat\n"; + $microsat =~ s/^\[|\]$//sg; + my @microsats = split(/\][a-zA-Z|-]*\[/,$microsat); + + $microsat = join("",@microsats); + if (length($microsat) < $thresholds[$motif_size]) { + # print length($microsat)," < ",$thresholds[$motif_size],"\n"; + print RES $line,"\n"; next; + } + my @lengths = (); + foreach my $mic (@microsats){ + push(@lengths, length($mic)); + } + if (largest_microsat(@lengths) < $sub_thresholds[$motif_size]) { + # print largest_microsat(@lengths)," < ",$sub_thresholds[$motif_size],"\n"; + print RES $line,"\n"; next;} + else {print FIL $line,"\n"; next; + } + } + close FIL; + close RES; + +} + +sub largest_microsat{ + my $counter = 0; + my($max) = shift(@_); + foreach my $temp (@_) { + #print "finding largest array: $maxcounter \n"; + if($temp > $max){ + $max = $temp; + } + } + return($max); +} + +#xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx + + +#xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx +sub multiSpecies_compound_microsat_analyzer{ + ####### PARAMETER ######## + ########################## + + my $input1 = $_[0]; ###### the *_sput_op4_ii file + my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match" + my $output1 = $_[2]; ###### interrupted microsatellite file, in new .interrupted format + my $output2 = $_[3]; ###### the pure compound microsatellites + my $org = $_[4]; + my $no_of_species = $_[5]; +# print "IN multiSpecies_compound_microsat_analyzer: $input1\n $input2\n $output1\n $output2\n $org\n $no_of_species\n"; + $infocord = 2 + (4*$no_of_species) - 1; + $typecord = 2 + (4*$no_of_species) + 1 - 1; + $startcord = 2 + (4*$no_of_species) + 2 - 1; + $strandcord = 2 + (4*$no_of_species) + 3 - 1; + $endcord = 2 + (4*$no_of_species) + 4 - 1; + $microsatcord = 2 + (4*$no_of_species) + 5 - 1; + $motifcord = 2 + (4*$no_of_species) + 6 - 1; + + open(IN,"<$input1") or die "Cannot open file $input1 $!"; + open(SEQ,"<$input2") or die "Cannot open file $input2 $!"; + + open(OUT,">$output1") or die "Cannot open file $output1 $!"; + open(OUT2,">$output2") or die "Cannot open file $output2 $!"; + + +# print "opened files \n"; + my %micros = (); + my $keycounter=0; + my $linecounter=0; + while (my $line = <IN>){ + $linecounter++; + if ($line =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) { + my $key = join("\t",$1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12); + push (@{$micros{$key}},$line); + $keycounter++; + } + else{ + # print "no key\n"; + } + } + close IN; + my @deletedlines = (); +# print "done hash . linecounter=$linecounter, keycounter=$keycounter\n"; + #--------------------------------------------------------------------------------------------------- + # NOW READING THE SEQUENCE FILE + my $keyfound=0; + my $keyexists=0; + my $inter=0; + my $pure=0; + + while(my $sine = <SEQ>){ + my %microstart=(); + my %microend=(); + my @sields = split(/\t/,$sine); + my $key = 0; + if ($sine =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) { + $key = join("\t",$1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12); + # print $sine; + # print $key; + $keyfound++; + } + else{ + + } + #<STDIN> if !defined $key; + + if (exists $micros{$key}){ + $keyexists++; + my @microstring = @{$micros{$key}}; + + my @filteredmicrostring; + + foreach my $line (@microstring){ + chomp $line; + my $copy_line = $line; + my @fields = split(/\t/,$line); + my $start = $fields[$startcord]; + my $end = $fields[$endcord]; + # FOR COMPOUND MICROSATELLITES + if ($fields[$typecord] eq "compound"){ + $line = compound_microsat_analyser($line); + if ($line eq "NULL") { + print OUT2 "$copy_line\n"; + $pure++; + next; + } + else{ + print OUT "$line\n"; + $inter++; + next; + } + } + } + + } #if (exists $micros{$key}){ + } + close OUT; + close OUT2; +# print "keyfound=$keyfound, keyexists=$keyexists, pure=$pure, inter=$inter\n"; +} + +sub compound_microsat_analyser{ + my $line = $_[0]; + my @fields = split(/\t/,$line); + my $motifline = $fields[$motifcord]; + my $microsat = $fields[$microsatcord]; + $motifline =~ s/^\[|\]$//g; + $microsat =~ s/^\[|\]$//g; + $microsat =~ s/-//g; + my @interruptions = (); + my @motields = split(/\]\[/,$motifline); + my @microields = split(/\][a-zA-Z|-]*\[/,$microsat); + my @inields = split(/[.*]/,$microsat); + shift @inields; + my @motifcount = scalar(@motields); + my $prevmotif = $motields[0]; + my $prevmicro = $microields[0]; + my $prevphase = substr($microields[0],-(length($motields[0])),length($motields[0])); + my $localflag = 'down'; + my @infoarray = (); + + for my $l (1 ... (scalar(@motields)-1)){ + my $probe = $prevmotif.$prevmotif; + if (length $prevmotif != length $motields[$l]) {$localflag = "up"; last;} + + if ($probe =~ /$motields[$l]/i){ + my $curr_endphase = substr($microields[$l],-length($motields[$l]),length($motields[$l])); + my $curr_startphase = substr($microields[$l],0,length($motields[$l])); + if ($curr_startphase =~ /$prevphase/i) { + $infoarray[$l-1] = "insertion"; + } + else { + $infoarray[$l-1] = "indel/substitution"; + } + + $prevmotif = $motields[$l]; $prevmicro = $microields[$l]; $prevphase = $curr_endphase; + next; + } + else {$localflag = "up"; last;} + } + if ($localflag eq 'up') {return "NULL";} + + if (length($prevmotif) == 1) {$fields[$typecord] = "mononucleotide";} + if (length($prevmotif) == 2) {$fields[$typecord] = "dinucleotide";} + if (length($prevmotif) == 3) {$fields[$typecord] = "trinucleotide";} + if (length($prevmotif) == 4) {$fields[$typecord] = "tetranucleotide";} + if (length($prevmotif) == 5) {$fields[$typecord] = "pentanucleotide";} + + @microields = split(/[\[|\]]/,$microsat); + my @microsats = (); + my @positions = (); + my $lengthtracker = 0; + + for my $i (0 ... (scalar(@microields ) - 1)){ + if ($i%2 == 0){ + push(@microsats,$microields[$i]); + $lengthtracker = $lengthtracker + length($microields[$i]); + + } + else{ + push(@interruptions,$microields[$i]); + push(@positions, $lengthtracker+1); + $lengthtracker = $lengthtracker + length($microields[$i]); + } + + } + my $returnline = join("\t",(join("\t",@fields),join(",",(@infoarray)),join(",",(@interruptions)),join(",",(@positions)),scalar(@interruptions))); + return($returnline); +} + +#xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx + + +#xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx + +sub multiSpecies_compoundClarifyer{ +# print "IN multiSpecies_compoundClarifyer: @_\n"; + my $input1 = $_[0]; ###### the *_sput_compound + my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match" + my $output1 = $_[2]; ###### interrupted microsatellite file, in new .interrupted format + my $output2 = $_[3]; ###### compound file + my $org = $_[4]; + my $no_of_species = $_[5]; + @thresholds = "0"; + push(@thresholds, split(/_/,$_[6])); + + + $infocord = 2 + (4*$no_of_species) - 1; + $typecord = 2 + (4*$no_of_species) + 1 - 1; + $startcord = 2 + (4*$no_of_species) + 2 - 1; + $strandcord = 2 + (4*$no_of_species) + 3 - 1; + $endcord = 2 + (4*$no_of_species) + 4 - 1; + $microsatcord = 2 + (4*$no_of_species) + 5 - 1; + $motifcord = 2 + (4*$no_of_species) + 6 - 1; + $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; + + $interr_poscord = $motifcord + 3; + $no_of_interruptionscord = $motifcord + 4; + $interrcord = $motifcord + 2; + $interrtypecord = $motifcord + 1; + + + open(IN,"<$input1") or die "Cannot open file $input1 $!"; + open(SEQ,"<$input2") or die "Cannot open file $input2 $!"; + + open(INT,">$output1") or die "Cannot open file $output2 $!"; + open(COMP,">$output2") or die "Cannot open file $output2 $!"; + #open(CH,">changed") or die "Cannot open file changed $!"; + +# print "opened files \n"; + my $linecounter = 0; + my $microcounter = 0; + + my %micros = (); + while (my $line = <IN>){ + # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n"; + $linecounter++; + if ($line =~ /($focalspec)\s+([0-9a-zA-Z_\-]+)\s+([0-9]+)\s+([0-9]+)/ ) { + my $key = join("\t",$1, $2, $3, $4); + # print $key, "#-#-#-#-#-#-#-#\n"; + # print "key = $key\n"; + push (@{$micros{$key}},$line); + $microcounter++; + } + else {#print $line," key not made\n"; <STDIN>; + } + } +# print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n"; + close IN; + my @deletedlines = (); +# print "done hash \n"; + $linecounter = 0; + #--------------------------------------------------------------------------------------------------- + # NOW READING THE SEQUENCE FILE + my @microsat_types = qw(_ mononucleotide dinucleotide trinucleotide tetranucleotide); + $printer = 0; + + while(my $sine = <SEQ>){ + my %microstart=(); + my %microend=(); + my @sields = split(/\t/,$sine); + my $key = (); + +# print "sine = $sine. focalspec = $focalspec \n"; #<STDIN>; + + if ($sine =~ /($focalspec)\s+([0-9a-zA-Z_\-]+)\s+([0-9]+)\s+([0-9]+)/ ) { + +# if ($sine =~ /([a-z0-9A-Z]+)\s+([0-9a-zA-Z_]+)\s+([0-9]+)\s+([0-9]+)\s+[\+|\-]\s+([a-z0-9A-Z]+)\s+([0-9a-zA-Z_]+)\s+([0-9]+)\s+([0-9]+)\s+[\+|\-]\s+([a-z0-9A-Z]+)\s+([0-9a-zA-Z_]+)\s+([0-9]+)\s+([0-9]+)\s/ ) { + $key = join("\t",$1, $2, $3, $4); +# print "key = $key\n"; + } + else{ +# print "no key in $sine\nfor pattern ([a-z0-9A-Z]+) (chr[0-9a-zA-Z]+) ([0-9]+) ([0-9]+) [\+|\-] (a-z0-9A-Z) (chr[0-9a-zA-Z]+) ([0-9]+) ([0-9]+) [\+|\-] (a-z0-9A-Z) (chr[0-9a-zA-Z]+) ([0-9]+) ([0-9]+) / \n"; + } + + if (exists $micros{$key}){ + my @microstring = @{$micros{$key}}; + delete $micros{$key}; + + foreach my $line (@microstring){ +# print "#---------#---------#---------#---------#---------#---------#---------#---------\n" if $printer == 1; +# print "microsat = $line" if $printer == 1; + $linecounter++; + my $copy_line = $line; + my @mields = split(/\t/,$line); + my @fields = @mields; + my $start = $fields[$startcord]; + my $end = $fields[$endcord]; + my $microsat = $fields[$microsatcord]; + my $motifline = $fields[$motifcord]; + my $microsatcopy = $microsat; + my $positioner = $microsat; + $positioner =~ s/[a-zA-Z|-]/_/g; + $microsatcopy =~ s/^\[|\]$//gs; + chomp $microsatcopy; + my @microields = split(/\][a-zA-Z|-]*\[/,$microsatcopy); + my @inields = split(/\[[a-zA-Z|-]*\]/,$microsat); + my $absolutstart = 1; my $absolutend = $absolutstart + ($end-$start); +# print "absolut: start = $absolutstart, end = $absolutend\n" if $printer == 1; + shift @inields; + #print "inields =@inields<\n"; + $motifline =~ s/^\[|\]$//gs; + chomp $motifline; + #print "microsat = $microsat, its copy = $microsatcopy motifline = $motifline<\n"; + my @motields = split(/\]\[/,$motifline); + my $seq = $microsatcopy; + $seq =~ s/\[|\]//g; + my $seqlen = length($seq); + $seq = " ".$seq; + + my $longestmotif_no = longest_array_element(@motields); + my $shortestmotif_no = shortest_array_element(@motields); + #print "shortest motif = $motields[$shortestmotif_no], longest motif = $motields[$longestmotif_no] \n"; + + my $search = $motields[$longestmotif_no].$motields[$longestmotif_no]; + if ((length($motields[$longestmotif_no]) == length($motields[$shortestmotif_no])) && ($search !~ /$motields[$shortestmotif_no]/) ){ + print COMP $line; + next; + } + + my @shortestmotif_nos = (); + for my $m (0 ... $#motields){ + push(@shortestmotif_nos, $m) if (length($motields[$m]) == length($motields[$shortestmotif_no]) ); + } + ## LOOKING AT LEFT OF THE SHORTEST MOTIF------------------------------------------------ + my $newleft =(); + my $leftstopper = 0; my $rightstopper = 0; + foreach my $shortmotif_no (@shortestmotif_nos){ + next if $shortmotif_no == 0; + my $last_left = $shortmotif_no; #$#motields; + my $last_hitter = 0; + for (my $i =($shortmotif_no-1); $i>=0; $i--){ + my $search = $motields[$shortmotif_no]; + if (length($motields[$shortmotif_no]) == 1){ $search = $motields[$shortmotif_no].$motields[$shortmotif_no] ;} + if( (length($motields[$i]) > length($motields[$shortmotif_no])) && length($microields[$i]) > (2.5 * length($motields[$i])) ){ + $last_hitter = 1; + $last_left = $i+1; last; + } + my $probe = $motields[$i]; + if (length($motields[$shortmotif_no]) == length($motields[$i])) {$probe = $motields[$i].$motields[$i];} + + if ($probe !~ /$search/){ + $last_hitter = 1; + $last_left = $i+1; + # print "hit the last match: before $microields[$i]..last left = $last_left.. exiting.\n"; + last; + } + $last_left--;$last_hitter = 1; + # print "passed tests, last left = $last_left\n"; + } + # print "comparing whether $last_left < $shortmotif_no, lasthit = $last_hitter\n"; + if (($last_left) < $shortmotif_no && $last_hitter == 1) {$leftstopper=0; last;} + else {$leftstopper = 1; + # print "leftstopper = 1\n"; + } + } + + ## LOOKING AT LEFT OF THE SHORTEST MOTIF------------------------------------------------ + my $newright =(); + foreach my $shortmotif_no (@shortestmotif_nos){ + next if $shortmotif_no == $#motields; + my $last_right = $shortmotif_no;# -1; + for my $i ($shortmotif_no+1 ... $#motields){ + my $search = $motields[$shortmotif_no]; + if (length($motields[$shortmotif_no]) == 1 ){ $search = $motields[$shortmotif_no].$motields[$shortmotif_no] ;} + if ( (length($motields[$i]) > length($motields[$shortmotif_no])) && length($microields[$i]) > (2.5 * length($motields[$i])) ){ + $last_right = $i-1; last; + } + my $probe = $motields[$i]; + if (length($motields[$shortmotif_no]) == length($motields[$i])) {$probe = $motields[$i].$motields[$i];} + if ( $probe !~ /$search/){ + $last_right = $i-1; last; + } + $last_right++; + } + if (($last_right) > $shortmotif_no) {$rightstopper=0; last;# print "rightstopper = 0\n"; + } + else {$rightstopper = 1; + } + } + + + if ($rightstopper == 1 && $leftstopper == 1){ + print COMP $line; +# print "rightstopper == 1 && leftstopper == 1\n" if $printer == 1; + next; + } + +# print "pased initial testing phase \n" if $printer == 1; + my @outputs = (); + my @orig_starts = (); + my @orig_ends = (); + for my $mic (0 ... $#microields){ + my $miclen = length($microields[$mic]); + my $microleftlen = 0; + #print "\nmic = $mic\n"; + if($mic > 0){ + for my $submin (0 ... $mic-1){ + my $interval = (); + if (!exists $inields[$submin]) {$interval = "";} + else {$interval = $inields[$submin];} + #print "inield =$interval< and microield =$microields[$submin]<\n "; + $microleftlen = $microleftlen + length($microields[$submin]) + length($interval); + } + } + push(@orig_starts,($microleftlen+1)); + push(@orig_ends, ($microleftlen+1 + $miclen -1)); + } + + ############# F I N A L L Y S T U D Y I N G S E Q U E N C E S #########@@@@#########@@@@#########@@@@#########@@@@#########@@@@ + + + for my $mic (0 ... $#microields){ + my $miclen = length($microields[$mic]); + my $microleftlen = 0; + if($mic > 0){ + for my $submin (0 ... $mic-1){ + # if(!exists $inields[$submin]) {$inields[$submin] = "";} + my $interval = (); + if (!exists $inields[$submin]) {$interval = "";} + else {$interval = $inields[$submin];} + #print "inield =$interval< and microield =$microields[$submin]<\n "; + $microleftlen = $microleftlen + length($microields[$submin]) + length($interval); + } + } + $fields[$startcord] = $microleftlen+1; + $fields[$endcord] = $fields[$startcord] + $miclen -1; + $fields[$typecord] = $microsat_types[length($motields[$mic])]; + $fields[$microsatcord] = $microields[$mic]; + $fields[$motifcord] = $motields[$mic]; + my $templine = join("\t", (@fields[0 .. $motifcord]) ); + my $orig_templine = join("\t", (@fields[0 .. $motifcord]) ); + my $newline; + my $lefter = 1; my $righter = 1; + if ( $fields[$startcord] < 2){$lefter = 0;} + if ($fields[$endcord] == $seqlen){$righter = 0;} + + while($lefter == 1){ + $newline = left_extender($templine, $seq,$org); +# print "returned line from left extender= $newline \n" if $printer == 1; + if ($newline eq $templine){$templine = $newline; last;} + else {$templine = $newline;} + + if (left_extention_permission_giver($templine) eq "no") {last;} + } + while($righter == 1){ + $newline = right_extender($templine, $seq,$org); +# print "returned line from right extender= $newline \n" if $printer == 1; + if ($newline eq $templine){$templine = $newline; last;} + else {$templine = $newline;} + if (right_extention_permission_giver($templine) eq "no") {last;} + } + my @tempfields = split(/\t/,$templine); + $tempfields[$microsatcord] =~ s/\]|\[//g; + $tempfields[$motifcord] =~ s/^\[|\]$//gs; + my @tempmotields = split(/\]\[/,$tempfields[$motifcord]); + + if (scalar(@tempmotields) == 1 && $templine eq $orig_templine) { +# print "scalar ( tempmotields) = 1\n" if $printer == 1; + next; + } + my $prevmotif = shift(@tempmotields); + my $stopper = 0; + + foreach my $tempmot (@tempmotields){ + if (length($tempmot) != length($prevmotif)) {$stopper = 1; last;} + my $search = $prevmotif.$prevmotif; + if ($search !~ /$tempmot/) {$stopper = 1; last;} + $prevmotif = $tempmot; + } + if ( $stopper == 1) { +# print "length tempmot != length prevmotif\n" if $printer == 1; + next; + } + my $lastend = 0; + #---------------------------------------------------------- + my $left_captured = (); my $right_captured = (); + my $left_bp = (); my $right_bp = (); + # print "new startcord = $tempfields[$startcord] , new endcord = $tempfields[$endcord].. orig strts = @orig_starts and orig ends = @orig_ends\n"; + for my $o (0 ... $#orig_starts){ +# print "we are talking abut tempstart:$tempfields[$startcord] >= origstart:$lastend && tempstart:$tempfields[$startcord] <= origend: $orig_ends[$o] \n" if $printer == 1; +# print "we are talking abut tempend:$tempfields[$endcord] >= origstart:$lastend && tempstart:$tempfields[$endcord] >= origend: $orig_ends[$o] \n" if $printer == 1; + + if (($tempfields[$startcord] > $lastend) && ($tempfields[$startcord] <= $orig_ends[$o])){ # && ($tempfields[$startcord] != $fields[$startcord]) +# print "motif captured on left is $microields[$o] from $microsat\n" if $printer == 1; + $left_captured = $o; + $left_bp = $orig_ends[$o] - $tempfields[$startcord] + 1; + } + elsif ($tempfields[$endcord] > $lastend && $tempfields[$endcord] <= $orig_ends[$o]){ #&& $tempfields[$endcord] != $fields[$endcord]) +# print "motif captured on right is $microields[$o] from $microsat\n" if $printer == 1; + $right_captured = $o; + $right_bp = $tempfields[$endcord] - $orig_starts[$o] + 1; + } + $lastend = $orig_ends[$o] + } +# print "leftcaptured = $left_captured, right = $right_captured\n" if $printer==1; + my $leftmotif = (); my $left_trashed = (); + if ($tempfields[$startcord] != $fields[$startcord]) { + $leftmotif = $motields[$left_captured]; +# print "$left_captured in @microields: $motields[$left_captured]\n" if $printer == 1; + if ( $left_captured !~ /[0-9]+/) {#print $line,"\n", $templine,"\n"; + } + $left_trashed = length($microields[$left_captured]) - $left_bp; + } + my $rightmotif = (); my $right_trashed = (); + if ($tempfields[$endcord] != $fields[$endcord]) { +# print "$right_captured in @microields: $motields[$right_captured]\n" if $printer == 1; + $rightmotif = $motields[$right_captured]; + $right_trashed = length($microields[$right_captured]) - $right_bp; + } + + ########## P A R A M S #####################@@@@#########@@@@#########@@@@#########@@@@#########@@@@#########@@@@#########@@@@ + $stopper = 0; + my $deletioner = 0; + #if($tempfields[$startcord] != $fields[$startcord]){ +# print "enter left: tempfields,startcord : $tempfields[$startcord] != $absolutstart && left_captured: $left_captured != 0 \n" if $printer==1; + if ($left_captured != 0){ +# print "at line 370, going: 0 ... $left_captured-1 \n" if $printer == 1; + for my $e (0 ... $left_captured-1){ + if( length($motields[$e]) > 2 && length($microields[$e]) > (3* length($motields[$e]) )){ +# print "motif on left not included too big to be ignored : $microields[$e] \n" if $printer == 1; + $deletioner++; last; + } + if( length($motields[$e]) == 2 && length($microields[$e]) > (3* length($motields[$e]) )){ +# print "motif on left not included too big to be ignored : $microields[$e] \n" if $printer == 1; + $deletioner++; last; + } + if( length($motields[$e]) == 1 && length($microields[$e]) > (4* length($motields[$e]) )){ +# print "motif on left not included too big to be ignored : $microields[$e] \n" if $printer == 1; + $deletioner++; last; + } + } + } + #} +# print "after left search, deletioner = $deletioner\n" if $printer == 1; + if ($deletioner >= 1) { +# print "deletioner = $deletioner\n" if $printer == 1; + next; + } + + $deletioner = 0; + + #if($tempfields[$endcord] != $fields[$endcord]){ +# print "if tempfields endcord: $tempfields[$endcord] != absolutend: $absolutend\n and $right_captured != $#microields\n" if $printer==1; + if ($right_captured != $#microields){ +# print "at line 394, going: $right_captured+1 ... $#microields \n" if $printer == 1; + for my $e ($right_captured+1 ... $#microields){ + if( length($motields[$e]) > 2 && length($microields[$e]) > (3* length($motields[$e])) ){ +# print "motif on right not included too big to be ignored : $microields[$e] \n" if $printer == 1; + $deletioner++; last; + } + if( length($motields[$e]) == 2 && length($microields[$e]) > (3* length($motields[$e]) )){ +# print "motif on right not included too big to be ignored : $microields[$e] \n" if $printer == 1; + $deletioner++; last; + } + if( length($motields[$e]) == 1 && length($microields[$e]) > (4* length($motields[$e]) )){ +# print "motif on right not included too big to be ignored : $microields[$e] \n" if $printer == 1; + $deletioner++; last; + } + } + } + #} +# print "deletioner = $deletioner\n" if $printer == 1; + if ($deletioner >= 1) { + next; + } + my $leftMotifs_notCaptured = (); + my $rightMotifs_notCaptured = (); + + if ($tempfields[$startcord] != $fields[$startcord] ){ + #print "in left params: (length($leftmotif) == 1 && $tempfields[$startcord] != $fields[$startcord]) ... and .... $left_trashed > (1.5* length($leftmotif]) && ($tempfields[$startcord] != $fields[$startcord])\n"; + if (length($leftmotif) == 1 && $left_trashed > 3){ +# print "invaded left motif is long mononucleotide" if $printer == 1; + next; + + } + elsif ((length($leftmotif) != 1 && $left_trashed > ( thrashallow($leftmotif)) && ($tempfields[$startcord] != $fields[$startcord]) ) ){ +# print "invaded left motif too long" if $printer == 1; + next; + } + } + if ($tempfields[$endcord] != $fields[$endcord] ){ + #print "in right params: after $tempfields[$endcord] != $fields[$endcord] ..... (length($rightmotif)==1 && $tempfields[$endcord] != $fields[$endcord]) ... and ... $right_trashed > (1.5* length($rightmotif))\n"; + if (length($rightmotif)==1 && $right_trashed){ +# print "invaded right motif is long mononucleotide" if $printer == 1; + next; + + } + elsif (length($rightmotif) !=1 && ($right_trashed > ( thrashallow($rightmotif)) && $tempfields[$endcord] != $fields[$endcord])){ +# print "invaded right motif too long" if $printer == 1; + next; + + } + } + push @outputs, $templine; + } + if (scalar(@outputs) == 0){ print COMP $line; next;} + # print "outputs are:", join("\n",@outputs),"\n"; + if (scalar(@outputs) == 1){ + my @oields = split(/\t/,$outputs[0]); + my $start = $oields[$startcord]+$mields[$startcord]-1; + my $end = $start+($oields[$endcord]-$oields[$startcord]); + $oields[$startcord] = $start; $oields[$endcord] = $end; + print INT join("\t",@oields), "\n"; + # print CH $line,; + } + if (scalar(@outputs) > 1){ + my $motif_min = 10; + my $chosen_one = $outputs[0]; + foreach my $micro (@outputs){ + my @oields = split(/\t/,$micro); + my $tempmotif = $oields[$motifcord]; + $tempmotif =~ s/^\[|\]$//gs; + my @omots = split(/\]\[/, $tempmotif); + # print "motif_min = $motif_min, current motif = $tempmotif\n"; + my $start = $oields[$startcord]+$mields[$startcord]-1; + my $end = $start+($oields[$endcord]-$oields[$startcord]); + $oields[$startcord] = $start; $oields[$endcord] = $end; + if(length($omots[0]) < $motif_min) { + $chosen_one = join("\t",@oields); + $motif_min = length($omots[0]); + } + } + print INT $chosen_one, "\n"; + # print "chosen one is ".$chosen_one, "\n"; + # print CH $line; + + + } + + } + + } #if (exists $micros{$key}){ + else{ + } + } + close INT; + close COMP; +} +sub left_extender{ + #print "left extender\n"; + my ($line, $seq, $org) = @_; +# print "in left extender... line passed = $line and sequence is $seq\n"; + chomp $line; + my @fields = split(/\t/,$line); + my $rstart = $fields[$startcord]; + my $microsat = $fields[$microsatcord]; + $microsat =~ s/\[|\]//g; + my $rend = $rstart + length($microsat)-1; + $microsat =~ s/-//g; + my $motif = $fields[$motifcord]; + my $firstmotif = (); + + if ($motif =~ /^\[/){ + $motif =~ s/^\[//g; + $motif =~ /([a-zA-Z]+)\].*/; + $firstmotif = $1; + } + else {$firstmotif = $motif;} + + #print "hacked microsat = $microsat, motif = $motif, firstmotif = $firstmotif\n"; + my $leftphase = substr($microsat, 0,length($firstmotif)); + my $phaser = $leftphase.$leftphase; + my @phase = split(/\s*/,$leftphase); + my @phases; + my @copy_phases = @phases; + my $crawler=0; + for (0 ... (length($leftphase)-1)){ + push(@phases, substr($phaser, $crawler, length($leftphase))); + $crawler++; + } + + my $start = $rstart; + my $end = $rend; + + my $leftseq = substr($seq, 0, $start); +# print "left phases are @phases , start = $start left sequence = ",substr($leftseq, -10),"\n"; + my @extentions = (); + my @trappeds = (); + my @intervalposs = (); + my @trappedposs = (); + my @trappedphases = (); + my @intervals = (); + my $firstmotif_length = length($firstmotif); + foreach my $phase (@phases){ +# print "left phase\t",substr($leftseq, -10),"\t$phase\n"; +# print "search patter = (($phase)+([a-zA-Z|-]{0,$firstmotif_length})) \n"; + if ($leftseq =~ /(($phase)+([a-zA-Z|-]{0,$firstmotif_length}))$/i){ +# print "in left pattern\n"; + my $trapped = $1; + my $trappedpos = length($leftseq)-length($trapped); + my $interval = $3; + my $intervalpos = index($trapped, $interval) + 1; +# print "left trapped = $trapped, interval = $interval, intervalpos = $intervalpos\n"; + + my $extention = substr($trapped, 0, length($trapped)-length($interval)); + my $leftpeep = substr($seq, 0, ($start-length($trapped))); + my @passed_overhangs; + + for my $i (1 ... length($phase)-1){ + my $overhang = substr($phase, -length($phase)+$i); +# print "current overhang = $overhang, leftpeep = ",substr($leftpeep,-10)," whole sequence = ",substr($seq, ($end - ($end-$start) - 20), (($end-$start)+20)),"\n"; + #TEMPORARY... BETTER METHOD NEEDED + $leftpeep =~ s/-//g; + if ($leftpeep =~ /$overhang$/i){ + push(@passed_overhangs,$overhang); +# print "l overhang\n"; + } + } + + if(scalar(@passed_overhangs)>0){ + my $overhang = $passed_overhangs[longest_array_element(@passed_overhangs)]; + $extention = $overhang.$extention; + $trapped = $overhang.$trapped; + #print "trapped extended to $trapped \n"; + $trappedpos = length($leftseq)-length($trapped); + } + + push(@extentions,$extention); +# print "extentions = @extentions \n"; + + push(@trappeds,$trapped ); + push(@intervalposs,length($extention)+1); + push(@trappedposs, $trappedpos); +# print "trappeds = @trappeds\n"; + push(@trappedphases, substr($extention,0,length($phase))); + push(@intervals, $interval); + } + } + if (scalar(@trappeds == 0)) {return $line;} + + my $nikaal = shortest_array_element(@intervals); + + if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";} + $fields[$motifcord] = "[".$trappedphases[$nikaal]."]".$fields[$motifcord]; + ##print "new fields 9 = $fields[9]\n"; + $fields[$startcord] = $fields[$startcord]-length($trappeds[$nikaal]); + + if($fields[$microsatcord] !~ /^\[/i){ + $fields[$microsatcord] = "[".$fields[$microsatcord]."]"; + } + + $fields[$microsatcord] = "[".$extentions[$nikaal]."]".$intervals[$nikaal].$fields[$microsatcord]; + + if (exists ($fields[$motifcord+1])){ + $fields[$motifcord+1] = "indel/deletion,".$fields[$motifcord+1]; + } + else{$fields[$motifcord+1] = "indel/deletion";} + ##print "new fields 14 = $fields[14]\n"; + + if (exists ($fields[$motifcord+2])){ + $fields[$motifcord+2] = $intervals[$nikaal].",".$fields[$motifcord+2]; + } + else{$fields[$motifcord+2] = $intervals[$nikaal];} + my @seventeen=(); + if (exists ($fields[$motifcord+3])){ + @seventeen = split(/,/,$fields[$motifcord+3]); + # #print "scalarseventeen =@seventeen<-\n"; + for (0 ... scalar(@seventeen)-1) {$seventeen[$_] = $seventeen[$_]+length($trappeds[$nikaal]);} + $fields[$motifcord+3] = ($intervalposs[$nikaal]).",".join(",",@seventeen); + $fields[$motifcord+4] = $fields[$motifcord+4]+1; + } + + else {$fields[$motifcord+3] = $intervalposs[$nikaal]; $fields[$motifcord+4]=1} + + ##print "new fields 16 = $fields[16]\n"; + ##print "new fields 17 = $fields[17]\n"; + + + my $returnline = join("\t",@fields); + my $pastline = $returnline; + if ($fields[$microsatcord] =~ /\[/){ + $returnline = multiSpecies_compoundClarifyer_merge($returnline); + } + return $returnline; +} +sub right_extender{ + my ($line, $seq, $org) = @_; + chomp $line; + my @fields = split(/\t/,$line); + my $rstart = $fields[$startcord]; + my $microsat = $fields[$microsatcord]; + $microsat =~ s/\[|\]//g; + my $rend = $rstart + length($microsat)-1; + $microsat =~ s/-//g; + my $motif = $fields[$motifcord]; + my $temp_lastmotif = (); + + if ($motif =~ /\]$/s){ + $motif =~ s/\]$//sg; + $motif =~ /.*\[([a-zA-Z]+)/; + $temp_lastmotif = $1; + } + else {$temp_lastmotif = $motif;} + my $lastmotif = substr($microsat,-length($temp_lastmotif)); + ##print "hacked microsat = $microsat, motif = $motif, lastmotif = $lastmotif\n"; + my $rightphase = substr($microsat, -length($lastmotif)); + my $phaser = $rightphase.$rightphase; + my @phase = split(/\s*/,$rightphase); + my @phases; + my @copy_phases = @phases; + my $crawler=0; + for (0 ... (length($rightphase)-1)){ + push(@phases, substr($phaser, $crawler, length($rightphase))); + $crawler++; + } + + my $start = $rstart; + my $end = $rend; + + my $rightseq = substr($seq, $end+1); + my @extentions = (); + my @trappeds = (); + my @intervalposs = (); + my @trappedposs = (); + my @trappedphases = (); + my @intervals = (); + my $lastmotif_length = length($lastmotif); + foreach my $phase (@phases){ + if ($rightseq =~ /^(([a-zA-Z|-]{0,$lastmotif_length}?)($phase)+)/i){ + my $trapped = $1; + my $trappedpos = $end+1; + my $interval = $2; + my $intervalpos = index($trapped, $interval) + 1; + + my $extention = substr($trapped, length($interval)); + my $rightpeep = substr($seq, ($end+length($trapped))+1); + my @passed_overhangs = ""; + + #TEMPORARY... BETTER METHOD NEEDED + $rightpeep =~ s/-//g; + + for my $i (1 ... length($phase)-1){ + my $overhang = substr($phase,0, $i); +# #print "current extention = $extention, overhang = $overhang, rightpeep = ",substr($rightpeep,0,10),"\n"; + if ($rightpeep =~ /^$overhang/i){ + push(@passed_overhangs, $overhang); +# #print "r overhang\n"; + } + } + if (scalar(@passed_overhangs) > 0){ + my $overhang = @passed_overhangs[longest_array_element(@passed_overhangs)]; + $extention = $extention.$overhang; + $trapped = $trapped.$overhang; +# #print "trapped extended to $trapped \n"; + } + + push(@extentions,$extention); + ##print "extentions = @extentions \n"; + + push(@trappeds,$trapped ); + push(@intervalposs,$intervalpos); + push(@trappedposs, $trappedpos); +# #print "trappeds = @trappeds\n"; + push(@trappedphases, substr($extention,0,length($phase))); + push(@intervals, $interval); + } + } + if (scalar(@trappeds == 0)) {return $line;} + +# my $nikaal = longest_array_element(@trappeds); + my $nikaal = shortest_array_element(@intervals); + +# #print "longest element found = $nikaal \n"; + + if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";} + $fields[$motifcord] = $fields[$motifcord]."[".$trappedphases[$nikaal]."]"; + ##print "new fields 9 = $fields[9]"; + $fields[$endcord] = $fields[$endcord] + length($trappeds[$nikaal]); + + ##print "new fields 11 = $fields[11]\n"; + + if($fields[$microsatcord] !~ /^\[/i){ + $fields[$microsatcord] = "[".$fields[$microsatcord]."]"; + } + + $fields[$microsatcord] = $fields[$microsatcord].$intervals[$nikaal]."[".$extentions[$nikaal]."]"; + ##print "new fields 12 = $fields[12]\n"; + + ##print "scalar of fields = ",scalar(@fields),"\n"; + if (exists ($fields[$motifcord+1])){ +# print " print fields = @fields.. scalar=", scalar(@fields),".. motifcord+1 = $motifcord + 1 \n " if !exists $fields[$motifcord+1]; +# <STDIN> if !exists $fields[$motifcord+1]; + $fields[$motifcord+1] = $fields[$motifcord+1].",indel/deletion"; + } + else{$fields[$motifcord+1] = "indel/deletion";} + ##print "new fields 14 = $fields[14]\n"; + + if (exists ($fields[$motifcord+2])){ + $fields[$motifcord+2] = $fields[$motifcord+2].",".$intervals[$nikaal]; + } + else{$fields[$motifcord+2] = $intervals[$nikaal];} + ##print "new fields 15 = $fields[15]\n"; + + my @seventeen=(); + if (exists ($fields[$motifcord+3])){ + ##print "at 608 we are doing this:length($microsat)+$intervalposs[$nikaal]\n"; +# print " print fields = @fields\n " if !exists $fields[$motifcord+3]; + <STDIN> if !exists $fields[$motifcord+3]; + my $currpos = length($microsat)+$intervalposs[$nikaal]; + $fields[$motifcord+3] = $fields[$motifcord+3].",".$currpos; + $fields[$motifcord+4] = $fields[$motifcord+4]+1; + + } + + else {$fields[$motifcord+3] = length($microsat)+$intervalposs[$nikaal]; $fields[$motifcord+4]=1} + + ##print "new fields 16 = $fields[16]\n"; + + ##print "new fields 17 = $fields[17]\n"; + my $returnline = join("\t",@fields); + my $pastline = $returnline; + if ($fields[$microsatcord] =~ /\[/){ + $returnline = multiSpecies_compoundClarifyer_merge($returnline); + } + #print "finally right-extended line = ",$returnline,"\n"; + return $returnline; +} +sub longest_array_element{ + my $counter = 0; + my($max) = shift(@_); + my $maxcounter = 0; + foreach my $temp (@_) { + $counter++; + #print "finding largest array: $maxcounter \n" if $prinkter == 1; + if(length($temp) > length($max)){ + $max = $temp; + $maxcounter = $counter; + } + } + return($maxcounter); +} +sub shortest_array_element{ + my $counter = 0; + my($min) = shift(@_); + my $mincounter = 0; + foreach my $temp (@_) { + $counter++; + #print "finding largest array: $mincounter \n" if $prinkter == 1; + if(length($temp) < length($min)){ + $min = $temp; + $mincounter = $counter; + } + } + return($mincounter); +} + + +sub left_extention_permission_giver{ + my @fields = split(/\t/,$_[0]); + my $microsat = $fields[$microsatcord]; + $microsat =~ s/(^\[)|-//g; + my $motif = $fields[$motifcord]; + my $firstmotif = (); + my $firststretch = (); + my @stretches=(); + if ($motif =~ /^\[/){ + $motif =~ s/^\[//g; + $motif =~ /([a-zA-Z]+)\].*/; + $firstmotif = $1; + @stretches = split(/\]/,$microsat); + $firststretch = $stretches[0]; + ##print "firststretch = $firststretch\n"; + } + else {$firstmotif = $motif;$firststretch = $microsat;} + + if (length($firststretch) < $thresholds[length($firstmotif)]){ + return "no"; + } + else {return "yes";} + +} +sub right_extention_permission_giver{ + my @fields = split(/\t/,$_[0]); + my $microsat = $fields[$microsatcord]; + $microsat =~ s/-|(\]$)//sg; + my $motif = $fields[$motifcord]; + my $temp_lastmotif = (); + my $laststretch = (); + my @stretches=(); + + + if ($motif =~ /\]/){ + $motif =~ s/\]$//gs; + $motif =~ /.*\[([a-zA-Z]+)$/; + $temp_lastmotif = $1; + @stretches = split(/\[/,$microsat); + $laststretch = pop(@stretches); + ##print "last stretch = $laststretch\n"; + } + else {$temp_lastmotif = $motif; $laststretch = $microsat;} + + if (length($laststretch) < $thresholds[length($temp_lastmotif)]){ + return "no"; + } + else { return "yes";} + + +} +sub multiSpecies_compoundClarifyer_merge{ + my $line = $_[0]; + #print "sent for mering: $line \n"; + my @mields = split(/\t/,$line); + my @fields = @mields; + my $microsat = $fields[$microsatcord]; + my $motifline = $fields[$motifcord]; + my $microsatcopy = $microsat; + $microsatcopy =~ s/^\[|\]$//sg; + my @microields = split(/\][a-zA-Z|-]*\[/,$microsatcopy); + my @inields = split(/\[[a-zA-Z|-]*\]/,$microsat); + shift @inields; + #print "inields =@inields<\n"; + $motifline =~ s/^\[|\]$//sg; + my @motields = split(/\]\[/,$motifline); + my @firstmotifs = (); + my @lastmotifs = (); + for my $i (0 ... $#microields){ + $firstmotifs[$i] = substr($microields[$i],0,length($motields[$i])); + $lastmotifs[$i] = substr($microields[$i],-length($motields[$i])); + } + #print "firstmotif = @firstmotifs... lastmotif = @lastmotifs\n"; + my @mergelist = (); + my @inter_poses = split(/,/,$fields[$interr_poscord]); + my $no_of_interruptions = $fields[$no_of_interruptionscord]; + my @interruptions = split(/,/,$fields[$interrcord]); + my @interrtypes = split(/,/,$fields[$interrtypecord]); + my $stopper = 0; + for my $i (0 ... $#motields-1){ + #print "studying connection of $motields[$i] and $motields[$i+1], i = $i in $microsat\n"; + if (($lastmotifs[$i] eq $firstmotifs[$i+1]) && !exists $inields[$i]){ + $stopper = 1; + push(@mergelist, ($i)."_".($i+1)); + } + } + + return $line if scalar(@mergelist) == 0; + + foreach my $merging (@mergelist){ + my @sets = split(/_/, $merging); + my @tempmicro = (); + my @tempmot = (); + for my $i (0 ... $sets[0]-1){ + push(@tempmicro, "[".$microields[$i]."]"); + push(@tempmicro, $inields[$i]); + push(@tempmot, "[".$motields[$i]."]"); + #print "adding pre-motifs number $i\n"; + } + my $pusher = "[".$microields[$sets[0]].$microields[$sets[1]]."]"; + push (@tempmicro, $pusher); + push(@tempmot, "[".$motields[$sets[0]]."]"); + my $outcoming = -2; + for my $i ($sets[1]+1 ... $#microields-1){ + push(@tempmicro, "[".$microields[$i]."]"); + push(@tempmicro, $inields[$i]); + push(@tempmot, "[".$motields[$i]."]"); + #print "adding post-motifs number $i\n"; + $outcoming = $i; + } + if ($outcoming != -2){ + #print "outcoming = $outcoming \n"; + push(@tempmicro, "[".$microields[$outcoming+1 ]."]"); + push(@tempmot,"[". $motields[$outcoming+1]."]"); + } + $fields[$microsatcord] = join("",@tempmicro); + $fields[$motifcord] = join("",@tempmot); + + splice(@interrtypes, $sets[0], 1); + $fields[$interrtypecord] = join(",",@interrtypes); + splice(@interruptions, $sets[0], 1); + $fields[$interrcord] = join(",",@interruptions); + splice(@inter_poses, $sets[0], 1); + $fields[$interr_poscord] = join(",",@inter_poses); + $no_of_interruptions = $no_of_interruptions - 1; + } + + if ($no_of_interruptions == 0){ + $fields[$microsatcord] =~ s/^\[|\]$//sg; + $fields[$motifcord] =~ s/^\[|\]$//sg; + $line = join("\t", @fields[0 ... $motifcord]); + } + else{ + $line = join("\t", @fields); + } + return $line; +} + +sub thrashallow{ + my $motif = $_[0]; + return 4 if length($motif) == 2; + return 6 if length($motif) == 3; + return 8 if length($motif) == 4; + +} + +#xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx + + +#xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx +sub multispecies_filtering_compound_microsats{ + my $unfiltered = $_[0]; + my $filtered = $_[1]; + my $residue = $_[2]; + my $no_of_species = $_[5]; + open(UNF,"<$unfiltered") or die "Cannot open file $unfiltered: $!"; + open(FIL,">$filtered") or die "Cannot open file $filtered: $!"; + open(RES,">$residue") or die "Cannot open file $residue: $!"; + + $infocord = 2 + (4*$no_of_species) - 1; + $startcord = 2 + (4*$no_of_species) + 2 - 1; + $strandcord = 2 + (4*$no_of_species) + 3 - 1; + $endcord = 2 + (4*$no_of_species) + 4 - 1; + $microsatcord = 2 + (4*$no_of_species) + 5 - 1; + $motifcord = 2 + (4*$no_of_species) + 6 - 1; + + my @sub_thresholds = ("0"); + push(@sub_thresholds, split(/_/,$_[3])); + my @thresholds = ("0"); + push(@thresholds, split(/_/,$_[4])); + + while (my $line = <UNF>) { + if ($line !~ /compound/){ + print FIL $line,"\n"; next; + } + chomp $line; + my @fields = split(/\t/,$line); + my $motifline = $fields[$motifcord]; + $motifline =~ s/^\[|\]$//g; + my @motifs = split(/\]\[/,$motifline); + my $microsat = $fields[$microsatcord]; + $microsat =~ s/^\[|\]$|-//g; + my @microsats = split(/\][a-zA-Z|-]*\[/,$microsat); + + my $stopper = 0; + for my $i (0 ... $#motifs){ + my @common = (); + my $probe = $motifs[$i].$motifs[$i]; + my $motif_size = length($motifs[$i]); + + for my $j (0 ... $#motifs){ + next if length($motifs[$i]) != length($motifs[$j]); + push(@common, length($microsats[$j])) if $probe =~ /$motifs[$j]/i; + } + + if (largest_microsat(@common) < $sub_thresholds[$motif_size]) {$stopper = 1; last;} + else {next;} + } + + if ($stopper == 1){ + print RES $line,"\n"; + } + else { print FIL $line,"\n"; } + } + close FIL; + close RES; +} + +#xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx + + +#xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx + +sub chromosome_unrand_breaker{ +# print "IN chromosome_unrand_breaker: @_\n "; + my $input1 = $_[0]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match" + my $dir = $_[1]; ###### directory where subsets are put + my $output2 = $_[2]; ###### list of subset files + my $increment = $_[3]; + my $info = $_[4]; + my $chr = $_[5]; + open(SEQ,"<$input1") or die "Cannot open file $input1 $!"; + + open(OUT,">$output2") or die "Cannot open file $output2 $!"; + + #--------------------------------------------------------------------------------------------------- + # NOW READING THE SEQUENCE FILE + + my $seed = 0; + my $subset = $dir.$info."_".$chr."_".$seed."_".($seed+$increment); + print OUT $subset,"\n"; + open(SUB,">$subset"); + + while(my $sine = <SEQ>){ + $seed++; + print SUB $sine; + + if ($seed%$increment == 0 ){ + close SUB; + $subset = $dir.$info."_".$chr."_".$seed."_".($seed+$increment); + open(SUB,">$subset"); + print SUB $sine; + print OUT $subset,"\n"; + # print $subset,"\n"; + } + } + close OUT; + close SUB; +} +#xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx + + +#xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx +sub multiSpecies_interruptedMicrosatHunter{ +# print "IN multiSpecies_interruptedMicrosatHunter: @_\n"; + my $input1 = $_[0]; ###### the *_sput_op4_ii file + my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match" + my $output1 = $_[2]; ###### interrupted microsatellite file, in new .interrupted format + my $output2 = $_[3]; ###### uninterrupted microsatellite file + my $org = $_[4]; + my $no_of_species = $_[5]; + + my @thresholds = "0"; + push(@thresholds, split(/_/,$_[6])); + +# print "thresholds = @thresholds \n"; + $infocord = 2 + (4*$no_of_species) - 1; + $typecord = 2 + (4*$no_of_species) + 1 - 1; + $startcord = 2 + (4*$no_of_species) + 2 - 1; + $strandcord = 2 + (4*$no_of_species) + 3 - 1; + $endcord = 2 + (4*$no_of_species) + 4 - 1; + $microsatcord = 2 + (4*$no_of_species) + 5 - 1; + $motifcord = 2 + (4*$no_of_species) + 6 - 1; + $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; + + $interr_poscord = $motifcord + 3; + $no_of_interruptionscord = $motifcord + 4; + $interrcord = $motifcord + 2; + $interrtypecord = $motifcord + 1; + + + $prinkter = 0; +# print "prionkytet = $prinkter\n"; + + open(IN,"<$input1") or die "Cannot open file $input1 $!"; + open(SEQ,"<$input2") or die "Cannot open file $input2 $!"; + + open(INT,">$output1") or die "Cannot open file $output2 $!"; + open(UNINT,">$output2") or die "Cannot open file $output2 $!"; + +# print "opened files !!\n"; + my $linecounter = 0; + my $microcounter = 0; + + my %micros = (); + while (my $line = <IN>){ + # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n"; + $linecounter++; + if ($line =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s+([0-9a-zA-Z_]+)\s([0-9]+)\s+([0-9]+)\s/ ) { + my $key = join("\t",$1, $2, $3, $4, $5); + # print $key, "#-#-#-#-#-#-#-#\n" if $prinkter == 1; + push (@{$micros{$key}},$line); + $microcounter++; + } + else {#print $line if $prinkter == 1; + } + } +# print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n"; + close IN; + my @deletedlines = (); +# print "done hash \n"; + $linecounter = 0; + #--------------------------------------------------------------------------------------------------- + # NOW READING THE SEQUENCE FILE + while(my $sine = <SEQ>){ + #print $linecounter,"\n" if $linecounter % 1000 == 0; + my %microstart=(); + my %microend=(); + my @sields = split(/\t/,$sine); + my $key = (); + if ($sine =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) { + $key = join("\t",$1, $2, $3, $4, $5); + # print $key, "<-<-<-<-<-<-<-<\n"; + } + + # $prinkter = 1 if $sine =~ /^>H\t499\t/; + + if (exists $micros{$key}){ + my @microstring = @{$micros{$key}}; + delete $micros{$key}; + my @filteredmicrostring; +# print "sequence = $sields[$sequencepos]" if $prinkter == 1; + foreach my $line (@microstring){ + $linecounter++; + my $copy_line = $line; + my @fields = split(/\t/,$line); + my $start = $fields[$startcord]; + my $end = $fields[$endcord]; + +# print $line if $prinkter == 1; + #LOOKING FOR LEFTWARD EXTENTION OF MICROSATELLITE + my $newline; + while(1){ + # print "\n before left sequence = $sields[$sequencepos]\n" if $prinkter == 1; + if (multiSpecies_interruptedMicrosatHunter_left_extention_permission_giver($line) eq "no") {last;} + + $newline = multiSpecies_interruptedMicrosatHunter_left_extender($line, $sields[$sequencepos],$org); + if ($newline eq $line){$line = $newline; last;} + else {$line = $newline;} + + if (multiSpecies_interruptedMicrosatHunter_left_extention_permission_giver($line) eq "no") {last;} +# print "returned line from left extender= $line \n" if $prinkter == 1; + } + while(1){ + # print "sequence = $sields[$sequencepos]\n" if $prinkter == 1; + if (multiSpecies_interruptedMicrosatHunter_right_extention_permission_giver($line) eq "no") {last;} + + $newline = multiSpecies_interruptedMicrosatHunter_right_extender($line, $sields[$sequencepos],$org); + if ($newline eq $line){$line = $newline; last;} + else {$line = $newline;} + + if (multiSpecies_interruptedMicrosatHunter_right_extention_permission_giver($line) eq "no") {last;} +# print "returned line from right extender= $line \n" if $prinkter == 1; + } +# print "\n>>>>>>>>>>>>>>>>\n In the end, the line is: \n$line\n<<<<<<<<<<<<<<<<\n" if $prinkter == 1; + + my @tempfields = split(/\t/,$line); + if ($tempfields[$microsatcord] =~ /\[/){ + print INT $line,"\n"; + } + else{ + print UNINT $line,"\n"; + } + + if ($line =~ /NULL/){ next; } + push(@filteredmicrostring, $line); + push (@{$microstart{$start}},$line); + push (@{$microend{$end}},$line); + } + + my $firstflag = 'down'; + + } #if (exists $micros{$key}){ + } + close INT; + close UNINT; +# print "final number of lines = $linecounter\n"; +} + +sub multiSpecies_interruptedMicrosatHunter_left_extender{ + my ($line, $seq, $org) = @_; +# print "left extender, like passed = $line\n" if $prinkter == 1; +# print "in left extender... line passed = $line and sequence is $seq\n" if $prinkter == 1; + chomp $line; + my @fields = split(/\t/,$line); + my $rstart = $fields[$startcord]; + my $microsat = $fields[$microsatcord]; + $microsat =~ s/\[|\]//g; + my $rend = $rstart + length($microsat)-1; + $microsat =~ s/-//g; + my $motif = $fields[$motifcord]; + my $firstmotif = (); + + if ($motif =~ /^\[/){ + $motif =~ s/^\[//g; + $motif =~ /([a-zA-Z]+)\].*/; + $firstmotif = $1; + } + else {$firstmotif = $motif;} + +# print "hacked microsat = $microsat, motif = $motif, firstmotif = $firstmotif\n" if $prinkter == 1; + my $leftphase = substr($microsat, 0,length($firstmotif)); + my $phaser = $leftphase.$leftphase; + my @phase = split(/\s*/,$leftphase); + my @phases; + my @copy_phases = @phases; + my $crawler=0; + for (0 ... (length($leftphase)-1)){ + push(@phases, substr($phaser, $crawler, length($leftphase))); + $crawler++; + } + + my $start = $rstart; + my $end = $rend; + + my $leftseq = substr($seq, 0, $start); +# print "left phases are @phases , start = $start left sequence = ",substr($leftseq, -10),"\n" if $prinkter == 1; + my @extentions = (); + my @trappeds = (); + my @intervalposs = (); + my @trappedposs = (); + my @trappedphases = (); + my @intervals = (); + my $firstmotif_length = length($firstmotif); + foreach my $phase (@phases){ +# print "left phase\t",substr($leftseq, -10),"\t$phase\n" if $prinkter == 1; +# print "search patter = (($phase)+([a-zA-Z|-]{0,$firstmotif_length})) \n" if $prinkter == 1; + if ($leftseq =~ /(($phase)+([a-zA-Z|-]{0,$firstmotif_length}))$/i){ +# print "in left pattern\n" if $prinkter == 1; + my $trapped = $1; + my $trappedpos = length($leftseq)-length($trapped); + my $interval = $3; + my $intervalpos = index($trapped, $interval) + 1; +# print "left trapped = $trapped, interval = $interval, intervalpos = $intervalpos\n" if $prinkter == 1; + + my $extention = substr($trapped, 0, length($trapped)-length($interval)); + my $leftpeep = substr($seq, 0, ($start-length($trapped))); + my @passed_overhangs; + + for my $i (1 ... length($phase)-1){ + my $overhang = substr($phase, -length($phase)+$i); +# print "current overhang = $overhang, leftpeep = ",substr($leftpeep,-10)," whole sequence = ",substr($seq, ($end - ($end-$start) - 20), (($end-$start)+20)),"\n" if $prinkter == 1; + #TEMPORARY... BETTER METHOD NEEDED + $leftpeep =~ s/-//g; + if ($leftpeep =~ /$overhang$/i){ + push(@passed_overhangs,$overhang); +# print "l overhang\n" if $prinkter == 1; + } + } + + if(scalar(@passed_overhangs)>0){ + my $overhang = $passed_overhangs[longest_array_element(@passed_overhangs)]; + $extention = $overhang.$extention; + $trapped = $overhang.$trapped; +# print "trapped extended to $trapped \n" if $prinkter == 1; + $trappedpos = length($leftseq)-length($trapped); + } + + push(@extentions,$extention); +# print "extentions = @extentions \n" if $prinkter == 1; + + push(@trappeds,$trapped ); + push(@intervalposs,length($extention)+1); + push(@trappedposs, $trappedpos); +# print "trappeds = @trappeds\n" if $prinkter == 1; + push(@trappedphases, substr($extention,0,length($phase))); + push(@intervals, $interval); + } + } + if (scalar(@trappeds == 0)) {return $line;} + +############################ my $nikaal = longest_array_element(@trappeds); + my $nikaal = shortest_array_element(@intervals); + +# print "longest element found = $nikaal \n" if $prinkter == 1; + + if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";} + $fields[$motifcord] = "[".$trappedphases[$nikaal]."]".$fields[$motifcord]; + #print "new fields 9 = $fields[9]\n" if $prinkter == 1; + $fields[$startcord] = $fields[$startcord]-length($trappeds[$nikaal]); + + #print "new fields 9 = $fields[9]\n" if $prinkter == 1; + + if($fields[$microsatcord] !~ /^\[/i){ + $fields[$microsatcord] = "[".$fields[$microsatcord]."]"; + } + + $fields[$microsatcord] = "[".$extentions[$nikaal]."]".$intervals[$nikaal].$fields[$microsatcord]; + #print "new fields 14 = $fields[12]\n" if $prinkter == 1; + + #print "scalar of fields = ",scalar(@fields),"\n" if $prinkter == 1; + + + if (scalar(@fields) > $motifcord+1){ + $fields[$motifcord+1] = "indel/deletion,".$fields[$motifcord+1]; + } + else{$fields[$motifcord+1] = "indel/deletion";} + #print "new fields 14 = $fields[14]\n" if $prinkter == 1; + + if (scalar(@fields)>$motifcord+2){ + $fields[$motifcord+2] = $intervals[$nikaal].",".$fields[$motifcord+2]; + } + else{$fields[$motifcord+2] = $intervals[$nikaal];} + #print "new fields 15 = $fields[15]\n" if $prinkter == 1; + + my @seventeen=(); + + if (scalar(@fields)>$motifcord+3){ + @seventeen = split(/,/,$fields[$motifcord+3]); + # print "scalarseventeen =@seventeen<-\n" if $prinkter == 1; + for (0 ... scalar(@seventeen)-1) {$seventeen[$_] = $seventeen[$_]+length($trappeds[$nikaal]);} + $fields[$motifcord+3] = ($intervalposs[$nikaal]).",".join(",",@seventeen); + $fields[$motifcord+4] = $fields[$motifcord+4]+1; + } + + else {$fields[$motifcord+3] = $intervalposs[$nikaal]; $fields[$motifcord+4]=1} + + #print "new fields 16 = $fields[16]\n" if $prinkter == 1; + #print "new fields 17 = $fields[17]\n" if $prinkter == 1; + +# return join("\t",@fields); + my $returnline = join("\t",@fields); + my $pastline = $returnline; + if ($fields[$microsatcord] =~ /\[/){ + $returnline = multiSpecies_interruptedMicrosatHunter_merge($returnline); + } +# print "finally left-extended line = ",$returnline,"\n" if $prinkter == 1; + return $returnline; +} + +sub multiSpecies_interruptedMicrosatHunter_right_extender{ +# print "right extender\n" if $prinkter == 1; + my ($line, $seq, $org) = @_; +# print "in right extender... line passed = $line\n" if $prinkter == 1; +# print "line = $line, sequence = ",$seq, "\n" if $prinkter == 1; + chomp $line; + my @fields = split(/\t/,$line); + my $rstart = $fields[$startcord]; + my $microsat = $fields[$microsatcord]; + $microsat =~ s/\[|\]//g; + my $rend = $rstart + length($microsat)-1; + $microsat =~ s/-//g; + my $motif = $fields[$motifcord]; + my $temp_lastmotif = (); + + if ($motif =~ /\]$/){ + $motif =~ s/\]$//g; + $motif =~ /.*\[([a-zA-Z]+)/; + $temp_lastmotif = $1; + } + else {$temp_lastmotif = $motif;} + my $lastmotif = substr($microsat,-length($temp_lastmotif)); +# print "hacked microsat = $microsat, motif = $motif, lastmotif = $lastmotif\n" if $prinkter == 1; + my $rightphase = substr($microsat, -length($lastmotif)); + my $phaser = $rightphase.$rightphase; + my @phase = split(/\s*/,$rightphase); + my @phases; + my @copy_phases = @phases; + my $crawler=0; + for (0 ... (length($rightphase)-1)){ + push(@phases, substr($phaser, $crawler, length($rightphase))); + $crawler++; + } + + my $start = $rstart; + my $end = $rend; + + my $rightseq = substr($seq, $end+1); +# print "length of sequence = " ,length($seq), "the coordinate to start from = ", $end+1, "\n" if $prinkter == 1; +# print "right phases are @phases , end = $end right sequence = ",substr($rightseq,0,10),"\n" if $prinkter == 1; + my @extentions = (); + my @trappeds = (); + my @intervalposs = (); + my @trappedposs = (); + my @trappedphases = (); + my @intervals = (); + my $lastmotif_length = length($lastmotif); + foreach my $phase (@phases){ +# print "right phase\t$phase\t",substr($rightseq,0,10),"\n" if $prinkter == 1; +# print "search patter = (([a-zA-Z|-]{0,$lastmotif_length})($phase)+) \n" if $prinkter == 1; + if ($rightseq =~ /^(([a-zA-Z|-]{0,$lastmotif_length}?)($phase)+)/i){ +# print "in right pattern\n" if $prinkter == 1; + my $trapped = $1; + my $trappedpos = $end+1; + my $interval = $2; + my $intervalpos = index($trapped, $interval) + 1; +# print "trapped = $trapped, interval = $interval\n" if $prinkter == 1; + + my $extention = substr($trapped, length($interval)); + my $rightpeep = substr($seq, ($end+length($trapped))+1); + my @passed_overhangs = ""; + + #TEMPORARY... BETTER METHOD NEEDED + $rightpeep =~ s/-//g; + + for my $i (1 ... length($phase)-1){ + my $overhang = substr($phase,0, $i); +# print "current extention = $extention, overhang = $overhang, rightpeep = ",substr($rightpeep,0,10),"\n" if $prinkter == 1; + if ($rightpeep =~ /^$overhang/i){ + push(@passed_overhangs, $overhang); +# print "r overhang\n" if $prinkter == 1; + } + } + if (scalar(@passed_overhangs) > 0){ + my $overhang = @passed_overhangs[longest_array_element(@passed_overhangs)]; + $extention = $extention.$overhang; + $trapped = $trapped.$overhang; +# print "trapped extended to $trapped \n" if $prinkter == 1; + } + + push(@extentions,$extention); + #print "extentions = @extentions \n" if $prinkter == 1; + + push(@trappeds,$trapped ); + push(@intervalposs,$intervalpos); + push(@trappedposs, $trappedpos); +# print "trappeds = @trappeds\n" if $prinkter == 1; + push(@trappedphases, substr($extention,0,length($phase))); + push(@intervals, $interval); + } + } + if (scalar(@trappeds == 0)) {return $line;} + +################################### my $nikaal = longest_array_element(@trappeds); + my $nikaal = shortest_array_element(@intervals); + +# print "longest element found = $nikaal \n" if $prinkter == 1; + + if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";} + $fields[$motifcord] = $fields[$motifcord]."[".$trappedphases[$nikaal]."]"; + $fields[$endcord] = $fields[$endcord] + length($trappeds[$nikaal]); + + + if($fields[$microsatcord] !~ /^\[/i){ + $fields[$microsatcord] = "[".$fields[$microsatcord]."]"; + } + + $fields[$microsatcord] = $fields[$microsatcord].$intervals[$nikaal]."[".$extentions[$nikaal]."]"; + + + if (scalar(@fields) > $motifcord+1){ + $fields[$motifcord+1] = $fields[$motifcord+1].",indel/deletion"; + } + else{$fields[$motifcord+1] = "indel/deletion";} + + if (scalar(@fields)>$motifcord+2){ + $fields[$motifcord+2] = $fields[$motifcord+2].",".$intervals[$nikaal]; + } + else{$fields[$motifcord+2] = $intervals[$nikaal];} + + my @seventeen=(); + if (scalar(@fields)>$motifcord+3){ + #print "at 608 we are doing this:length($microsat)+$intervalposs[$nikaal]\n" if $prinkter == 1; + my $currpos = length($microsat)+$intervalposs[$nikaal]; + $fields[$motifcord+3] = $fields[$motifcord+3].",".$currpos; + $fields[$motifcord+4] = $fields[$motifcord+4]+1; + + } + + else {$fields[$motifcord+3] = length($microsat)+$intervalposs[$nikaal]; $fields[$motifcord+4]=1} + +# print "finally right-extended line = ",join("\t",@fields),"\n" if $prinkter == 1; +# return join("\t",@fields); + + my $returnline = join("\t",@fields); + my $pastline = $returnline; + if ($fields[$microsatcord] =~ /\[/){ + $returnline = multiSpecies_interruptedMicrosatHunter_merge($returnline); + } +# print "finally right-extended line = ",$returnline,"\n" if $prinkter == 1; + return $returnline; + +} + +sub multiSpecies_interruptedMicrosatHunter_left_extention_permission_giver{ + my @fields = split(/\t/,$_[0]); + my $microsat = $fields[$microsatcord]; + $microsat =~ s/(^\[)|-//sg; + my $motif = $fields[$motifcord]; + chomp $motif; +# print $motif, "\n" if $motif !~ /^\[/; + my $firstmotif = (); + my $firststretch = (); + my @stretches=(); + +# print "motif = $motif, microsat = $microsat\n" if $prinkter == 1; + if ($motif =~ /^\[/){ + $motif =~ s/^\[//sg; + $motif =~ /([a-zA-Z]+)\].*/; + $firstmotif = $1; + @stretches = split(/\]/,$microsat); + $firststretch = $stretches[0]; + #print "firststretch = $firststretch\n" if $prinkter == 1; + } + else {$firstmotif = $motif;$firststretch = $microsat;} +# print "if length:firststretch - length($firststretch) < threshes length :firstmotif ($firstmotif) - $thresholds[length($firstmotif)]\n" if $prinkter == 1; + if (length($firststretch) < $thresholds[length($firstmotif)]){ + return "no"; + } + else {return "yes";} + +} +sub multiSpecies_interruptedMicrosatHunter_right_extention_permission_giver{ + my @fields = split(/\t/,$_[0]); + my $microsat = $fields[$microsatcord]; + $microsat =~ s/-|(\]$)//sg; + my $motif = $fields[$motifcord]; + chomp $motif; + my $temp_lastmotif = (); + my $laststretch = (); + my @stretches=(); + + + if ($motif =~ /\]/){ + $motif =~ s/\]$//sg; + $motif =~ /.*\[([a-zA-Z]+)$/; + $temp_lastmotif = $1; + @stretches = split(/\[/,$microsat); + $laststretch = pop(@stretches); + #print "last stretch = $laststretch\n" if $prinkter == 1; + } + else {$temp_lastmotif = $motif; $laststretch = $microsat;} + + if (length($laststretch) < $thresholds[length($temp_lastmotif)]){ + return "no"; + } + else { return "yes";} + + +} +sub checking_substitutions{ + + my ($line, $seq, $startprobes, $endprobes) = @_; + #print "sequence = $seq \n" if $prinkter == 1; + #print "COMMAND = \n $line, \n $seq, \n $startprobes \n, $endprobes\n"; + # <STDIN>; + my @seqarray = split(/\s*/,$seq); + my @startsubst_probes = split(/\|/,$startprobes); + my @endsubst_probes = split(/\|/,$endprobes); + chomp $line; + my @fields = split(/\t/,$line); + my $start = $fields[11] - $fields[10]; + my $end = $fields[13] - $fields[10]; + my $motif = $fields[9]; #IN FUTURE, USE THIS AS A PROBE, LIKE MOTIF = $FIELDS[9].$FIELDS[9] + $motif =~ s/\[|\]//g; + my $microsat = $fields[14]; + $microsat =~ s/\[|\]//g; + #------------------------------------------------------------------------ + # GETTING START AND END PHASES + my $startphase = substr($microsat,0, length($motif)); + my $endphase = substr($microsat,-length($motif), length($motif)); + #print "start and end phases are - $startphase and $endphase\n"; + my $startflag = 'down'; + my $endflag = 'down'; + my $substitution_distance = length($motif); + my $prestart = $start - $substitution_distance; + my $postend = $end + $substitution_distance; + my @endadds = (); + my @startadds = (); + if (($prestart < 0) || ($postend > scalar(@seqarray))) { + last; + } + #------------------------------------------------------------------------#------------------------------------------------------------------------ + # CHECKING FOR SUBSTITUTION PROBES NOW + + if ($fields[8] ne "mononucleotide"){ + while ($startflag eq "down"){ + my $search = join("",@seqarray[$prestart...($start-1)]); + #print "search is from $prestart...($start-1) = $search\n"; + foreach my $probe (@startsubst_probes){ + #print "\t\tprobe = $probe\n"; + if ($search =~ /^$probe/){ + #print "\tfound addition to the left - $search \n"; + my $copyprobe = $probe; + my $type; + my $subspos = 0; + my $interruption = ""; + if ($search eq $startphase) { $type = "NONE";} + else{ + $copyprobe =~ s/\[a-zA-Z\]/^/g; + $subspos = index($copyprobe,"^") + 1; + $type = "substitution"; + $interruption = substr($search, $subspos,1); + } + my $addinfo = join("\t",$prestart, $start, $search, $type, $interruption, $subspos); + #print "adding information: $addinfo \n"; + push(@startadds, $addinfo); + $prestart = $prestart - $substitution_distance; + $start = $start-$substitution_distance; + $startflag = 'down'; + + last; + } + else{ + $startflag = 'up'; + } + } + } + #<STDIN>; + while ($endflag eq "down"){ + my $search = join("",@seqarray[($end+1)...$postend]); + #print "search is from ($end+1)...$postend] = $search\n"; + + foreach my $probe (@endsubst_probes){ + #print "\t\tprobe = $probe\n"; + if ($search =~ /$probe$/){ + my $copyprobe = $probe; + my $type; + my $subspos = 0; + my $interruption = ""; + if ($search eq $endphase) { $type = "NONE";} + else{ + $copyprobe =~ s/\[a-zA-Z\]/^/g; + $subspos = index($copyprobe,"^") + 1; + $type = "substitution"; + $interruption = substr($search, $subspos,1); + } + my $addinfo = join("\t",$end, $postend, $search, $type, $interruption, $subspos); + #print "adding information: $addinfo \n"; + push(@endadds, $addinfo); + $postend = $postend + $substitution_distance; + $end = $end+$substitution_distance; + push(@endadds, $search); + $endflag = 'down'; + last; + } + else{ + $endflag = 'up'; + } + } + } + #print "startadds = @startadds, endadds = @endadds \n"; + + } +} +sub microsat_packer{ + my $microsat = $_[0]; + my $addition = $_[1]; + + + +} +sub multiSpecies_interruptedMicrosatHunter_merge{ + $prinkter = 0; +# print "~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~\n"; + my $line = $_[0]; +# print "sent for mering: $line \n" if $prinkter ==1; + my @mields = split(/\t/,$line); + my @fields = @mields; + my $microsat = allCaps($fields[$microsatcord]); + my $motifline = allCaps($fields[$motifcord]); + my $microsatcopy = $microsat; +# print "microsat = $microsat|\n" if $prinkter ==1; + $microsatcopy =~ s/^\[|\]$//sg; + chomp $microsatcopy; + my @microields = split(/\][a-zA-Z|-]*\[/,$microsatcopy); + my @inields = split(/\[[a-zA-Z|-]*\]/,$microsat); + shift @inields; +# print "inields =",join("|",@inields)," microields = ",join("|",@microields)," and count of microields = ", $#microields,"\n" if $prinkter ==1; + $motifline =~ s/^\[|\]$//sg; + my @motields = split(/\]\[/,$motifline); + my @firstmotifs = (); + my @lastmotifs = (); + for my $i (0 ... $#microields){ + $firstmotifs[$i] = substr($microields[$i],0,length($motields[$i])); + $lastmotifs[$i] = substr($microields[$i],-length($motields[$i])); + } +# print "firstmotif = @firstmotifs... lastmotif = @lastmotifs\n" if $prinkter ==1; + my @mergelist = (); + my @inter_poses = split(/,/,$fields[$interr_poscord]); + my $no_of_interruptions = $fields[$no_of_interruptionscord]; + my @interruptions = split(/,/,$fields[$interrcord]); + my @interrtypes = split(/,/,$fields[$interrtypecord]); + my $stopper = 0; + for my $i (0 ... $#motields-1){ +# print "studying connection of $motields[$i] and $motields[$i+1], i = $i in $microsat\n:$lastmotifs[$i] eq $firstmotifs[$i+1]?\n" if $prinkter ==1; + if ((allCaps($lastmotifs[$i]) eq allCaps($firstmotifs[$i+1])) && (!exists $inields[$i] || $inields[$i] !~ /[a-zA-Z]/)){ + $stopper = 1; + push(@mergelist, ($i)."_".($i+1)); #<STDIN> if $prinkter ==1; + } + } + +# print "mergelist = @mergelist\n" if $prinkter ==1; + return $line if scalar(@mergelist) == 0; +# print "merging @mergelist\n" if $prinkter ==1; +# <STDIN> if $prinkter ==1; + + foreach my $merging (@mergelist){ + my @sets = split(/_/, $merging); +# print "sets = @sets\n" if $prinkter ==1; + my @tempmicro = (); + my @tempmot = (); +# print "for loop going from 0 ... ", $sets[0]-1, "\n" if $prinkter ==1; + for my $i (0 ... $sets[0]-1){ +# print " adding pre- i = $i adding: microields= $microields[$i]. motields = $motields[$i], inields = |$inields[$i]|\n" if $prinkter ==1; + push(@tempmicro, "[".$microields[$i]."]"); + push(@tempmicro, $inields[$i]); + push(@tempmot, "[".$motields[$i]."]"); +# print "adding pre-motifs number $i\n" if $prinkter ==1; +# print "tempmot = @tempmot, tempmicro = @tempmicro \n" if $prinkter ==1; + } +# print "tempmot = @tempmot, tempmicro = @tempmicro \n" if $prinkter ==1; +# print "now pushing ", "[",$microields[$sets[0]]," and ",$microields[$sets[1]],"]\n" if $prinkter ==1; + my $pusher = "[".$microields[$sets[0]].$microields[$sets[1]]."]"; +# print "middle is, from @motields - @sets, number 0 which is is\n"; +# print ": $motields[$sets[0]]\n"; + push (@tempmicro, $pusher); + push(@tempmot, "[".$motields[$sets[0]]."]"); + push (@tempmicro, $inields[$sets[1]]) if $sets[1] != $#microields && exists $sets[1] && exists $inields[$sets[1]]; + my $outcoming = -2; +# print "tempmot = @tempmot, tempmicro = @tempmicro \n" if $prinkter ==1; +# print "for loop going from ",$sets[1]+1, " ... ", $#microields, "\n" if $prinkter ==1; + for my $i ($sets[1]+1 ... $#microields){ +# print " adding post- i = $i adding: microields= $microields[$i]. motields = $motields[$i]\n" if $prinkter ==1; + push(@tempmicro, "[".$microields[$i]."]") if exists $microields[$i]; + push(@tempmicro, $inields[$i]) unless $i == $#microields || !exists $inields[$i]; + push(@tempmot, "[".$motields[$i]."]"); +# print "adding post-motifs number $i\n" if $prinkter ==1; + $outcoming = $i; + } +# print "____________________________________________________________________________\n"; + $prinkter = 0; + $fields[$microsatcord] = join("",@tempmicro); + $fields[$motifcord] = join("",@tempmot); +# print "tempmot = @tempmot, tempmicro = @tempmicro . microsat = $fields[$microsatcord] and motif = $fields[$motifcord] \n" if $prinkter ==1; + + splice(@interrtypes, $sets[0], 1); + $fields[$interrtypecord] = join(",",@interrtypes); + splice(@interruptions, $sets[0], 1); + $fields[$interrcord] = join(",",@interruptions); + splice(@inter_poses, $sets[0], 1); + $fields[$interr_poscord] = join(",",@inter_poses); + $no_of_interruptions = $no_of_interruptions - 1; + } + + if ($no_of_interruptions == 0 && $line !~ /compound/){ + $fields[$microsatcord] =~ s/^\[|\]$//sg; + $fields[$motifcord] =~ s/^\[|\]$//sg; + $line = join("\t", @fields[0 ... $motifcord]); + } + else{ + $line = join("\t", @fields); + } +# print "post merging, the line is $line\n" if $prinkter ==1; + #<STDIN> if $stopper ==1; + return $line; +} +sub interval_asseser{ + my $pre_phase = $_[0]; my $post_phase = $_[1]; my $inter = $_[3]; +} +#--------------------------------------------------------------------------------------------------- +sub allCaps{ + my $motif = $_[0]; + $motif =~ s/a/A/g; + $motif =~ s/c/C/g; + $motif =~ s/t/T/g; + $motif =~ s/g/G/g; + return $motif; +} + + +#xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx chromosome_unrand_breamultiSpecies_interruptedMicrosatHunterker xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx + + +#xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx +sub merge_interruptedMicrosats{ +# print "IN merge_interruptedMicrosats: @_\n"; + my $input0 = $_[0]; ######looks like this: my $t8humanoutput = $pipedir.$ptag."_nogap_op_unrand2" + my $input1 = $_[1]; ###### the *_sput_op4_ii file + my $input2 = $_[2]; ###### the *_sput_op4_ii file + $no_of_species = $_[3]; + + my $output1 = $_[1]."_separate"; #$_[3]; ###### plain microsatellite file forward + my $output2 = $_[2]."_separate"; ##$_[4]; ###### plain microsatellite file reverse + my $output3 = $_[1]."_merged"; ##$_[5]; ###### plain microsatellite file forward + #my $output4 = $_[2]."_merged"; ##$_[6]; ###### plain microsatellite file reverse + #my $info = $_[4]; + #my @tags = split(/\t/,$info); + + open(SEQ,"<$input0") or die "Cannot open file $input0 $!"; + open(INF,"<$input1") or die "Cannot open file $input1 $!"; + open(INR,"<$input2") or die "Cannot open file $input2 $!"; + open(OUTF,">$output1") or die "Cannot open file $output1 $!"; + open(OUTR,">$output2") or die "Cannot open file $output2 $!"; + open(MER,">$output3") or die "Cannot open file $output3 $!"; + #open(MERR,">$output4") or die "Cannot open file $output4 $!"; + + + + $printer = 0; + +# print "files opened \n"; + $infocord = 2 + (4*$no_of_species) - 1; + $startcord = 2 + (4*$no_of_species) + 2 - 1; + $strandcord = 2 + (4*$no_of_species) + 3 - 1; + $endcord = 2 + (4*$no_of_species) + 4 - 1; + $microsatcord = 2 + (4*$no_of_species) + 5 - 1; + $motifcord = 2 + (4*$no_of_species) + 6 - 1; + $typecord = $infocord + 1; + my $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; + + $interrtypecord = $motifcord + 1; + $interrcord = $motifcord + 2; + $interr_poscord = $motifcord + 3; + $no_of_interruptionscord = $motifcord + 4; + $mergestarts = $no_of_interruptionscord+ 1; + $mergeends = $no_of_interruptionscord+ 2; + $mergemicros = $no_of_interruptionscord+ 3; + + # NOW ADDING FORWARD MICROSATELLITES TO HASH + my %fmicros = (); + my $microcounter=0; + my $linecounter = 0; + while (my $line = <INF>){ + # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n"; + $linecounter++; + if ($line =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) { + my $key = join("\t",$1, $2, $4, $5); + # print $key, "#-#-#-#-#-#-#-#\n"; + push (@{$fmicros{$key}},$line); + $microcounter++; + } + else { + #print $line; + } + } +# print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n"; + close INF; + my @deletedlines = (); +# print "done forward hash \n"; + $linecounter = 0; + #--------------------------------------------------------------------------------------------------- + # NOW ADDING REVERSE MICROSATELLITES TO HASH + my %rmicros = (); + $microcounter=0; + while (my $line = <INR>){ + # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n"; + $linecounter++; + if ($line =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) { + my $key = join("\t",$1, $2, $4, $5); + # print $key, "#-#-#-#-#-#-#-#\n"; + push (@{$rmicros{$key}},$line); + $microcounter++; + } + else { + #print "cant make key\n"; + } + } +# print "number of reverse microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n"; + close INR; +# print "done reverse hash \n"; + $linecounter = 0; + + #------------------------------------------------------------------------------------------------ + + while(my $sine = <SEQ>){ + #<STDIN> if $sine =~ /16349128/; + next if $sine !~ /[a-zA-Z0-9]/; +# print "-" x 150, "\n" if $printer == 1; + my @sields = split(/\t/,$sine); + my @merged = (); + + my $key = (); + + if ($sine =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) { + $key = join("\t",$1, $2, $4, $5); + # print $key, "<-<-<-<-<-<-<-<\n"; + } + # print "key = $key\n"; + + my @sets1; + my @sets2; + chomp $sields[$sequencepos]; + my $rev_sequence = reverse($sields[$sequencepos]); + $rev_sequence =~ s/ //g; + $rev_sequence = " ".$rev_sequence; + next if (!exists $fmicros{$key} && !exists $rmicros{$key}); + + if (exists $fmicros{$key}){ + # print "line no : $linecount\n"; + my @raw_microstring = @{$fmicros{$key}}; + my %starts = (); my %ends = (); +# print colored ['yellow'],"unsorted, unfiltered microats = \n" if $printer == 1; foreach (@raw_microstring) {print colored ['blue'],$_,"\n" if $printer == 1;} + my @microstring=(); + for my $u (0 ... $#raw_microstring){ + my @tields = split(/\t/,$raw_microstring[$u]); + next if exists $starts{$tields[$startcord]} && exists $ends{$tields[$endcord]}; + push(@microstring, $raw_microstring[$u]); + $starts{$tields[$startcord]} = $tields[$startcord]; + $ends{$tields[$endcord]} = $tields[$endcord]; + } + + # print "founf microstring in forward\n: @microstring\n"; + chomp @microstring; + my $clusterresult = (find_clusters(@microstring, $sields[$sequencepos])); + @sets1 = split("\=", $clusterresult); + my @temp = split(/_X0X_/,$sets1[0]) ; $microscanned+= scalar(@temp); + # print "sets = ", join("<all\nmerged>", @sets1), "\n<<-sets1\n"; <STDIN>; + } #if (exists $micros{$key}){ + + if (exists $rmicros{$key}){ + # print "line no : $linecount\n"; + my @raw_microstring = @{$rmicros{$key}}; + my %starts = (); my %ends = (); +# print colored ['yellow'],"unsorted, unfiltered microats = \n" if $printer == 1; foreach (@raw_microstring) {print colored ['blue'],$_,"\n" if $printer == 1;} + my @microstring=(); + for my $u (0 ... $#raw_microstring){ + my @tields = split(/\t/,$raw_microstring[$u]); + next if exists $starts{$tields[$startcord]} && exists $ends{$tields[$endcord]}; + push(@microstring, $raw_microstring[$u]); + $starts{$tields[$startcord]} = $tields[$startcord]; + $ends{$tields[$endcord]} = $tields[$endcord]; + } + # print "founf microstring in reverse\n: @microstring\n"; <STDIN>; + chomp @microstring; + # print "sending reversed sequence\n"; + my $clusterresult = (find_clusters(@microstring, $rev_sequence ) ); + @sets2 = split("\=", $clusterresult); + my @temp = split(/_X0X_/,$sets2[0]) ; $microscanned+= scalar(@temp); + } #if (exists $micros{$key}){ + + my @popout1 = (); + my @popout2 = (); + my @forwardset = (); + if (exists $sets2[1] ){ + if(exists $sets1[0]) { + push (@popout1, $sets1[0],$sets2[1]); + my @forwardset = split("=", popOuter(@popout1, $rev_sequence ));# + print OUTF join("\n",split("_X0X_", $forwardset[0])), "\n"; + my @localmerged = split("_X0X_", $forwardset[1]); + my $sequence = $sields[$sequencepos]; + $sequence =~ s/ //g; +# print "\nforwardset = @forwardset\n"; + for my $j (0 ... $#localmerged){ + $localmerged[$j] = invert_justCoordinates ($localmerged[$j], length($sequence)); + } + + push (@merged, @localmerged); + + } + else{ + my @localmerged = split("_X0X_", $sets2[1]); + my $sequence = $sields[$sequencepos]; + $sequence =~ s/ //g; + for my $j (0 ... $#localmerged){ +# print "\nlocalmerged = @localmerged\n"; + $localmerged[$j] = invert_justCoordinates ($localmerged[$j], length($sequence)); + } + + push (@merged, @localmerged); + } + } + elsif (exists $sets1[0]){ + print OUTF join("\n",split("_X0X_", $sets1[0])), "\n"; + } + + my @reverseset= (); + if (exists $sets1[1]){ + if (exists $sets2[0]){ + push (@popout2, $sets2[0],$sets1[1]); + # print "popout2 = @popout2\n"; + my @reverseset = split("=", popOuter(@popout2, $sields[$sequencepos])); + #print "reverseset = $reverseset[1] < --- reverseset1\n"; + print OUTR join("\n",split("_X0X_", $reverseset[0])), "\n"; + push(@merged, (split("_X0X_", $reverseset[1]))); + } + else{ + push(@merged, (split("_X0X_", $sets1[1]))); + } + } + elsif (exists $sets2[0]){ + print OUTR join("\n",split("_X0X_", $sets2[0])), "\n"; + + } + + if (scalar @merged > 0){ + my @filtered_merged = split("__",(filterDuplicates_merged(@merged))); + print MER join("\n", @filtered_merged),"\n"; + } + # <STDIN> if $sine =~ /16349128/; + + } + close(SEQ); + close(INF); + close(INR); + close(OUTF); + close(OUTR); + close(MER); + +} +sub find_clusters{ + my @input = @_; + my $sequence = pop(@input); + $sequence =~ s/ //g; + my @microstring0 = @input; +# print "IN: find_clusters:\n"; + my %microstart=(); + my %microend=(); + my @nonmerged = (); + my @mergedSet = (); +# print "set of microsats = @microstring \n"; + my @microstring = map { $_->[0] } sort custom map { [$_, split /\t/ ] } @microstring0; +# print "microstring = ", join("\n",@microstring0) ," \n---->\n", join("\n", @microstring),"\n ,,+." if $printer == 1; + #<STDIN> if $printer == 1; + my @tempmicrostring = @microstring; + foreach my $line (@tempmicrostring){ + my @fields = split(/\t/,$line); + my $start = $fields[$startcord]; + my $end = $fields[$endcord]; + next if $start !~ /[0-9]+/ || $end !~ /[0-9]+/; + # print " starts >>> start: $start = $fields[11] - $fields[10] || $end = $fields[13] - $fields[10]\n"; + push (@{$microstart{$start}},$line); + push (@{$microend{$end}},$line); + } + my $firstflag = 'down'; + while( my $line =shift(@microstring)){ +# print "-----------\nline = $line \n" if $printer == 1; + chomp $line; + my @fields = split(/\t/,$line); + my $start = $fields[$startcord]; + my $end = $fields[$endcord]; + next if $start !~ /[0-9]+/ || $end !~ /[0-9]+/ || $distance !~ /[0-9]+/ ; + my $startmicro = $line; + my $endmicro = $line; +# print "start: $start = $fields[11] - $fields[10] || $end = $fields[13] - $fields[10]\n"; + + delete ($microstart{$start}); + delete ($microend{$end}); + my $flag = 'down'; + my $startflag = 'down'; + my $endflag = 'down'; + my $prestart = $start - $distance; + my $postend = $end + $distance; + my @compoundlines = (); + my %compoundhash = (); + push (@compoundlines, $line); + push (@{$compoundhash{$line}},$line); + my $startrank = 1; + my $endrank = 1; + + while( ($startflag eq "down") || ($endflag eq "down") ){ +# print "prestart=$prestart, post end =$postend.. seqlen =", length($sequence)," firstflag = $firstflag \n" if $printer == 1; + if ( (($prestart < 0) && $firstflag eq "up") || (($postend > length($sequence) && $firstflag eq "up")) ){ +# print "coming to the end of sequence,post end = $postend and sequence length =", length($sequence)," so exiting\n" if $printer == 1; + last; + } + + $firstflag = "up"; + if ($startflag eq "down"){ + for my $i ($prestart ... $end){ + if(exists $microend{$i}){ + chomp $microend{$i}[0]; + if(exists $compoundhash{$microend{$i}[0]}) {next;} + chomp $microend{$i}[0]; + push(@compoundlines, $microend{$i}[0]); + my @tields = split(/\t/,$microend{$i}[0]); + $startmicro = $microend{$i}[0]; + chomp $startmicro; + $flag = 'down'; + $startrank++; +# print "deleting $microend{$i}[0] and $microstart{$tields[$startcord]}[0]\n" if $printer == 1; + delete $microend{$i}; + delete $microstart{$tields[$startcord]}; + $end = $tields[$endcord]; + $startflag = 'down'; + $prestart = $tields[$startcord] - $distance; + last; + } + else{ + $flag = 'up'; + $startflag = 'up'; + } + } + } + + if ($endflag eq "down"){ + + for my $i ($start ... $postend){ +# print "$start ----> $i -----> $postend\n" if $printer == 1; + if(exists $microstart{$i} ){ + chomp $microstart{$i}[0]; + if(exists $compoundhash{$microstart{$i}[0]}) {next;} + chomp $microstart{$i}[0]; + push(@compoundlines, $microstart{$i}[0]); + my @tields = split(/\t/,$microstart{$i}[0]); + $endmicro = $microstart{$i}[0]; + $endrank++; + chomp $endmicro; + $flag = 'down'; +# print "deleting $microend{$tields[$endcord]}[0]\n" if $printer == 1; + + delete $microstart{$i} if exists $microstart{$i} ; + delete $microend{$tields[$endcord]} if exists $microend{$tields[$endcord]}; +# print "done\n" if $printer == 1; + + shift @microstring; + $end = $tields[$endcord]; + $postend = $tields[$endcord] + $distance; + $endflag = 'down'; + last; + } + else{ + $flag = 'up'; + $endflag = 'up'; + } +# print "out of the if\n" if $printer == 1; + } +# print "out of the for\n" if $printer == 1; + + } +# print "for next turn, flag status: startflag = $startflag and endflag = $endflag \n"; + } #end while( $flag eq "down") +# print "compoundlines = @compoundlines \n" if $printer == 1; + + if (scalar (@compoundlines) == 1){ + push(@nonmerged, $line); + + } + if (scalar (@compoundlines) > 1){ +# print "FROM CLUSTERER\n" if $printer == 1; + push(@mergedSet,merge_microsats(@compoundlines, $sequence) ); + } + } #end foreach my $line (@microstring){ +# print join("\n",@mergedSet),"<-----mergedSet\n" if $printer == 1; +#<STDIN> if scalar(@mergedSet) > 0; +# print "EXIT: find_clusters\n"; +return (join("_X0X_",@nonmerged). "=".join("_X0X_",@mergedSet)); +} + +sub custom { + $a->[$startcord+1] <=> $b->[$startcord+1]; +} + +sub popOuter { +# print "\nIN: popOuter @_\n"; <STDIN>; + my @all = split ("_X0X_",$_[0]); +# <STDIN> if !defined $_[0]; + my @merged = split ("_X0X_",$_[1]); + my $sequence = $_[2]; + my $seqlen = length($sequence); + my %microstart=(); + my %microend=(); + my @mergedSet = (); + my @nonmerged = (); + + foreach my $line (@all){ + my @fields = split(/\t/,$line); + my $start = $seqlen - $fields[$startcord]+ 1; + my $end = $seqlen - $fields[$endcord] + 1; + push (@{$microstart{$start}},$line); + push (@{$microend{$end}},$line); + } + my $firstflag = 'down'; + my %forPopouting = (); + + while( my $line =shift(@merged)){ + # print "\n MErgedline: $line .. startcord = $startcord ... endcord = $endcord\n" ; + chomp $line; + my @fields = split(/\t/,$line); + my $start = $fields[$startcord]; + my $end = $fields[$endcord]; + my $startmicro = $line; + my $endmicro = $line; + + + delete ($microstart{$start}); + delete ($microend{$end}); + my $flag = 'down'; + my $startflag = 'down'; + my $endflag = 'down'; + my $prestart = $start - $distance; + my $postend = $end + $distance; + my @compoundlines = (); + my %compoundhash = (); + push (@compoundlines, $line); + my $startrank = 1; + my $endrank = 1; + + # print "\nstart = $start, end = $end\n"; + # <STDIN>; + for my $i ($start ... $end){ + if(exists $microend{$i}){ + # print "\nmicrosat exists: $microend{$i}[0] microsat exists\n"; + chomp $microend{$i}[0]; + my @fields = split(/\t/,$microend{$i}[0]); + delete $microstart{$seqlen - $fields[$startcord] + 1}; + my $invertseq = $sequence; + $invertseq =~ s/ //g; + push(@compoundlines, invert_microsat($microend{$i}[0] , $invertseq )); + delete $microend{$i}; + + } + + if(exists $microstart{$i} ){ + # print "\nmicrosat exists: $microstart{$i}[0] microsat exists\n"; + + chomp $microstart{$i}[0]; + my @fields = split(/\t/,$microstart{$i}[0]); + delete $microend{$seqlen - $fields[$endcord] + 1}; + my $invertseq = $sequence; + $invertseq =~ s/ //g; + push(@compoundlines, invert_microsat($microstart{$i}[0], $invertseq) ); + delete $microstart{$i}; + } + } + + if (scalar (@compoundlines) == 1){ + push(@mergedSet,join("\t",@compoundlines) ); + } + else { +# print "FROM POPOUTER\n" if $printer == 1; + push(@mergedSet, merge_microsats(@compoundlines, $sequence) ); + } + } + + foreach my $key (sort keys %microstart) { + push(@nonmerged,$microstart{$key}[0]); + } + + return (join("_X0X_",@nonmerged). "=".join("_X0X_",@mergedSet) ); +} + + + +sub invert_justCoordinates{ + my $microsat = $_[0]; +# print "IN invert_justCoordinates ... @_\n" ; <STDIN>; + chomp $microsat; + my $seqLength = $_[1]; + my @fields = split(/\t/,$microsat); + my $start = $seqLength - $fields[$endcord] + 1; + my $end = $seqLength - $fields[$startcord] + 1; + $fields[$startcord] = $start; + $fields[$endcord] = $end; + $fields[$microsatcord] = reverse_micro($fields[$microsatcord]); +# print "RETURNIG: ", join("\t",@fields), "\n" if $printer == 1; + return join("\t",@fields); +} + +sub largest_number{ + my $counter = 0; + my($max) = shift(@_); + foreach my $temp (@_) { + #print "finding largest array: $maxcounter \n"; + if($temp > $max){ + $max = $temp; + } + } + return($max); +} +sub smallest_number{ + my $counter = 0; + my($min) = shift(@_); + foreach my $temp (@_) { + #print "finding largest array: $maxcounter \n"; + if($temp < $min){ + $min = $temp; + } + } + return($min); +} + + +sub filterDuplicates_merged{ + my @merged = @_; + my %revmerged = (); + my @fmerged = (); + foreach my $micro (@merged) { + my @fields = split(/\t/,$micro); + if ($fields[3] =~ /chr[A-Z0-9a-z]+r/){ + my $key = join("_K0K_",$fields[1], $fields[$startcord], $fields[$endcord]); + # print "adding ... \n$key\n$micro\n"; + push(@{$revmerged{$key}}, $micro); + } + else{ + # print "pushing.. $micro\n"; + push(@fmerged, $micro); + } + } +# print "\n"; + foreach my $micro (@fmerged) { + my @fields = split(/\t/,$micro); + my $key = join("_K0K_",$fields[1], $fields[$startcord], $fields[$endcord]); + # print "searching for key $key\n"; + if (exists $revmerged{$key}){ + # print "deleting $revmerged{$key}[0]\n"; + delete $revmerged{$key}; + } + } + foreach my $key (sort keys %revmerged) { + push(@fmerged,$revmerged{$key}[0]); + } +# print "returning ", join("\n", @fmerged),"\n" ; + return join("__", @fmerged); +} + +sub invert_microsat{ + my $micro = $_[0]; + chomp $micro; + if ($micro =~ /chr[A-Z0-9a-z]+r/) { $micro =~ s/chr([0-9a-b]+)r/chr$1/g ;} + else { $micro =~ s/chr([0-9a-b]+)/chr$1r/g ; } + my $sequence = $_[1]; + $sequence =~ s/ //g; + my $seqlen = length($sequence); + my @fields = split(/\t/,$micro); + my $start = $seqlen - $fields[$endcord] +1; + my $end = $seqlen - $fields[$startcord] +1; + $fields[$startcord] = $start; + $fields[$endcord] = $end; + $fields[$motifcord] = reverse_micro($fields[$motifcord]); + $fields[$microsatcord] = reverse_micro($fields[$microsatcord]); + if ($fields[$typecord] ne "compound" && exists $fields[$no_of_interruptionscord] ){ + my @intertypes = split(/,/,$fields[$interrtypecord]); + my @inters = split(/,/,$fields[$interrcord]); + my @interposes = split(/,/,$fields[$interr_poscord]); + $fields[$interrtypecord] = join(",",reverse(@intertypes)); + $fields[$no_of_interruptionscord] = scalar(@interposes); + for my $i (0 ... $fields[$no_of_interruptionscord]-1){ + if (exists $inters[$i] && $inters[$i] =~ /[a-zA-Z]/){ + $inters[$i] = reverse($inters[$i]); + $interposes[$i] = $interposes[$i] + length($inters[$i]) - 1; + } + else{ + $inters[$i] = ""; + $interposes[$i] = $interposes[$i] - 1; + } + $interposes[$i] = ($end - $start + 1) - $interposes[$i] + 1; + } + + $fields[$interrcord] = join(",",reverse(@inters)); + $fields[$interr_poscord] = join(",",reverse(@interposes)); + } + + my $finalmicrosat = join("\t", @fields); + return $finalmicrosat; + +} +sub reverse_micro{ + my $micro = reverse($_[0]); + my @strand = split(/\s*/,$micro); + for my $i (0 ... $#strand){ + if ($strand[$i] =~ /\[/i) {$strand[$i] = "]";next;} + if ($strand[$i] =~ /\]/i) {$strand[$i] = "[";next;} + } + return join("",@strand); +} + +#xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx + + +#xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx + +sub forward_reverse_sputoutput_comparer { +# print "IN forward_reverse_sputoutput_comparer: @_\n"; + my $input0 = $_[0]; ###### the *nogap_unrand_match file + my $input1 = $_[1]; ###### the real file, *sput* data + my $input2 = $_[2]; ###### the reverse file, *sput* data + my $output1 = $_[3]; ###### microsats different in real file + my $output2 = $_[4]; ###### microsats missing in real file + my $output3 = $_[5]; ###### microsats common among real and reverse file + my $no_of_species = $_[6]; + + $infocord = 2 + (4*$no_of_species) - 1; + $typecord = 2 + (4*$no_of_species) + 1 - 1; + $startcord = 2 + (4*$no_of_species) + 2 - 1; + $strandcord = 2 + (4*$no_of_species) + 3 - 1; + $endcord = 2 + (4*$no_of_species) + 4 - 1; + $microsatcord = 2 + (4*$no_of_species) + 5 - 1; + $motifcord = 2 + (4*$no_of_species) + 6 - 1; + $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; + $interrtypecord = $motifcord + 1; + $interrcord = $motifcord + 2; + $interr_poscord = $motifcord + 3; + $no_of_interruptionscord = $motifcord + 4; + $mergestarts = $no_of_interruptionscord+ 1; + $mergeends = $no_of_interruptionscord+ 2; + $mergemicros = $no_of_interruptionscord+ 3; + + + open(SEQ,"<$input0") or die "Cannot open file $input0 $!"; + open(INF,"<$input1") or die "Cannot open file $input1 $!"; + open(INR,"<$input2") or die "Cannot open file $input2 $!"; + + open(DIFF,">$output1") or die "Cannot open file $output1 $!"; + #open(MISS,">$output2") or die "Cannot open file $output2 $!"; + open(SAME,">$output3") or die "Cannot open file $output3 $!"; + + +# print "opened files \n"; + my $linecounter = 0; + my $fcounter = 0; + my $rcounter = 0; + + $printer = 0; + #--------------------------------------------------------------------------------------------------- + # NOW ADDING FORWARD MICROSATELLITES TO HASH + my %fmicros = (); + my $microcounter=0; + while (my $line = <INF>){ + $linecounter++; + if ($line =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) { + my $key = join("\t",$1, $3, $4, $5, $7, $8, $9, $11, $12); + # print $key, "#-#-#-#-#-#-#-#\n"; + push (@{$fmicros{$key}},$line); + $microcounter++; + } + else { + #print $line; + } + } +# print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n"; + close INF; + my @deletedlines = (); +# print "done forward hash \n"; + $linecounter = 0; + #--------------------------------------------------------------------------------------------------- + # NOW ADDING REVERSE MICROSATELLITES TO HASH + my %rmicros = (); + $microcounter=0; + while (my $line = <INR>){ + $linecounter++; + if ($line =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) { + my $key = join("\t",$1, $3, $4, $5, $7, $8, $9, $11, $12); + # print $key, "#-#-#-#-#-#-#-#\n"; + push (@{$rmicros{$key}},$line); + $microcounter++; + } + else {} + } +# print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n"; + close INR; +# print "done reverse hash \n"; + $linecounter = 0; + #--------------------------------------------------------------------------------------------------- + #--------------------------------------------------------------------------------------------------- + # NOW READING THE SEQUENCE FILE + while(my $sine = <SEQ>){ + my %microstart=(); + my %microend=(); + my @sields = split(/\t/,$sine); + my $key = (); + if ($sine =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) { + $key = join("\t",$1, $3, $4, $5, $7, $8, $9, $11, $12); + } + else { + next; + } + $printer = 0; + my $sequence = $sields[$sequencepos]; + chomp $sequence; + $sequence =~ s/ //g; + my @localfs = (); + my @localrs = (); + + if (exists $fmicros{$key}){ + @localfs = @{$fmicros{$key}}; + delete $fmicros{$key}; + } + + my %forwardstarts = (); + my %forwardends = (); + + foreach my $f (@localfs){ + my @fields = split(/\t/,$f); + push (@{$forwardstarts{$fields[$startcord]}},$f); + push (@{$forwardends{$fields[$endcord]}},$fields[$startcord]); + } + + if (exists $rmicros{$key}){ + @localrs = @{$rmicros{$key}}; + delete $rmicros{$key}; + } + else{ + } + + foreach my $r (@localrs){ + chomp $r; + my @rields = split(/\t/,$r); +# print "rields = @rields\n" if $printer == 1; + my $reciprocalstart = length($sequence) - $rields[$endcord] + 1; + my $reciprocalend = length($sequence) - $rields[$startcord] + 1; +# print "reciprocal start = $reciprocalstart = ",length($sequence)," - $rields[$endcord] + 1\n" if $printer == 1; + my $microsat = reverse_micro(all_caps($rields[$microsatcord])); + my @localcollection=(); + for my $i ($reciprocalstart+1 ... $reciprocalend-1){ + if (exists $forwardstarts{$i}){ + push(@localcollection, $forwardstarts{$i}[0] ); + delete $forwardstarts{$i}; + } + if (exists $forwardends{$i}){ + next if !exists $forwardstarts{$forwardends{$i}[0]}; + push(@localcollection, $forwardstarts{$forwardends{$i}[0]}[0] ); + } + } + if (exists $forwardstarts{$reciprocalstart} && exists $forwardends{$reciprocalend}) {push(@localcollection,$forwardstarts{$reciprocalstart}[0]);} + + if (scalar(@localcollection) == 0){ + print SAME invert_microsat($r,($sequence) ), "\n"; + } + + elsif (scalar(@localcollection) == 1){ +# print "f microsat = $localcollection[0]\n" if $printer == 1; + my @lields = split(/\t/,$localcollection[0]); + $lields[$microsatcord]=all_caps($lields[$microsatcord]); +# print "comparing: $microsat and $lields[$microsatcord]\n" if $printer == 1; +# print "coordinates are: $lields[$startcord]-$lields[$endcord] and $reciprocalstart-$reciprocalend\n" if $printer == 1; + if ($microsat eq $lields[$microsatcord]){ + chomp $localcollection[0]; + print SAME $localcollection[0], "\n"; + } + if ($microsat ne $lields[$microsatcord]){ + chomp $localcollection[0]; + my $newmicro = microsatChooser(join("\t",@lields), join("\t",@rields), $sequence); +# print "newmicro = $newmicro\n" if $printer == 1; + if ($newmicro =~ /[a-zA-Z]/){ + print SAME $newmicro,"\n"; + } + else{ + print DIFF join("\t",$localcollection[0],"-->",$rields[$typecord],$reciprocalstart,$reciprocalend, $rields[$microsatcord], reverse_micro($rields[$motifcord]), @rields[$motifcord+1 ... $#rields] ),"\n"; +# print join("\t",$localcollection[0],"-->",$rields[$typecord],$reciprocalstart,$reciprocalend, $rields[$microsatcord], reverse_micro($rields[$motifcord]), @rields[$motifcord+1 ... $#rields] ),"\n" if $printer == 1; +# print "@rields\n@lields\n" if $printer == 1; + } + } + } + else{ +# print "multiple found for $r --> ", join("\t",@localcollection),"\n" if $printer == 1; + } + } + } + + close(SEQ); + close(INF); + close(INR); + close(DIFF); + close(SAME); + +} +sub all_caps{ + my @strand = split(/\s*/,$_[0]); + for my $i (0 ... $#strand){ + if ($strand[$i] =~ /c/) {$strand[$i] = "C";next;} + if ($strand[$i] =~ /a/) {$strand[$i] = "A";next;} + if ($strand[$i] =~ /t/) { $strand[$i] = "T";next;} + if ($strand[$i] =~ /g/) {$strand[$i] = "G";next;} + } + return join("",@strand); +} +sub microsatChooser{ + my $forward = $_[0]; + my $reverse = $_[1]; + my $sequence = $_[2]; + my $seqLength = length($sequence); + $sequence =~ s/ //g; + my @fields = split(/\t/,$forward); + my @rields = split(/\t/,$reverse); + my $r_start = $seqLength - $rields[$endcord] + 1; + my $r_end = $seqLength - $rields[$startcord] + 1; + + + my $f_microsat = $fields[$microsatcord]; + my $r_microsat = $rields[$microsatcord]; + + if ($fields[$typecord] =~ /\./ && $rields[$typecord] =~ /\./) { + return $forward if length($f_microsat) >= length($r_microsat); + return invert_microsat($reverse, $sequence) if length($f_microsat) < length($r_microsat); + } + return $forward if all_caps($fields[$motifcord]) eq all_caps($rields[$motifcord]) && $fields[$startcord] == $rields[$startcord] && $fields[$endcord] == $rields[$endcord]; + + my $f_microsat_copy = $f_microsat; + my $r_microsat_copy = $r_microsat; + $f_microsat_copy =~ s/^\[|\]$//g; + $r_microsat_copy =~ s/^\[|\]$//g; + + my @f_microields = split(/\][a-zA-Z]*\[/,$f_microsat_copy); + my @r_microields = split(/\][a-zA-Z]*\[/,$r_microsat_copy); + my @f_intields = split(/\][a-zA-Z]*\[/,$f_microsat_copy); + my @r_intields = split(/\][a-zA-Z]*\[/,$r_microsat_copy); + + my $f_motif = $fields[$motifcord]; + my $r_motif = $rields[$motifcord]; + my $f_motif_copy = $f_motif; + my $r_motif_copy = $r_motif; + $f_motif_copy =~ s/^\[|\]$//g; + $r_motif_copy =~ s/^\[|\]$//g; + + my @f_motields = split(/\]\[/,$f_motif_copy); + my @r_motields = split(/\]\[/,$r_motif_copy); + + my $f_purestretch = join("",@f_microields); + my $r_purestretch = join("",@r_microields); + + if ($fields[$typecord]=~/nucleotide/ && $rields[$typecord]=~/nucleotide/){ +# print "now.. studying $forward\n$reverse\n" if $printer == 1; + if ($fields[$typecord] eq $rields[$typecord]){ +# print "comparing motifs::", all_caps($fields[$motifcord]) ," and ", all_caps(reverse_micro($rields[$motifcord])), "\n" if $printer == 1; + + if(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 1){ + my $subset_answer = isSubset($forward, $reverse, $seqLength); +# print "subset answer = $subset_answer\n" if $printer == 1; + return $forward if $subset_answer == 1; + return invert_microsat($reverse, $sequence) if $subset_answer == 2; + return $forward if $subset_answer == 0 && length($f_purestretch) >= length($r_purestretch); + return invert_microsat($reverse, $sequence) if $subset_answer == 0 && length($f_purestretch) < length($r_purestretch); + return $forward if $subset_answer == 3 && slided_microsat($forward, $reverse, $seqLength) == 0 && length($f_purestretch) >= length($r_purestretch); + return invert_microsat($reverse, $sequence) if $subset_answer == 3 && slided_microsat($forward, $reverse, $seqLength) == 0 && length($f_purestretch) < length($r_purestretch); + return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence) if $subset_answer == 3 ; + } + elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 0){ + return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); + } + elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 2){ + return $forward; + } + elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 3){ + return invert_microsat($reverse, $sequence); + } + } + else{ + my $fmotlen = (); + my $rmotlen = (); + $fmotlen =1 if $fields[$typecord] eq "mononucleotide"; + $fmotlen =2 if $fields[$typecord] eq "dinucleotide"; + $fmotlen =3 if $fields[$typecord] eq "trinucleotide"; + $fmotlen =4 if $fields[$typecord] eq "tetranucleotide"; + $rmotlen =1 if $rields[$typecord] eq "mononucleotide"; + $rmotlen =2 if $rields[$typecord] eq "dinucleotide"; + $rmotlen =3 if $rields[$typecord] eq "trinucleotide"; + $rmotlen =4 if $rields[$typecord] eq "tetranucleotide"; + + if ($fmotlen < $rmotlen){ + if (abs($fields[$startcord] - $r_start) <= $fmotlen || abs($fields[$endcord] - $r_end) <= $fmotlen ){ + return $forward; + } + else{ + return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); + } + } + if ($fmotlen > $rmotlen){ + if (abs($fields[$startcord] - $r_start) <= $rmotlen || abs($fields[$endcord] - $r_end) <= $rmotlen){ + return invert_microsat($reverse, $sequence); + } + else{ + return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); + } + } + } + } + if ($fields[$typecord] eq "compound" && $rields[$typecord] eq "compound"){ +# print "comparing compound motifs::", all_caps($fields[$motifcord]) ," and ", all_caps(reverse_micro($rields[$motifcord])), "\n" if $printer == 1; + if(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 1){ + my $subset_answer = isSubset($forward, $reverse, $seqLength); +# print "subset answer = $subset_answer\n" if $printer == 1; + return $forward if $subset_answer == 1; + return invert_microsat($reverse, $sequence) if $subset_answer == 2; +# print length($f_purestretch) ,">", length($r_purestretch)," \n" if $printer == 1; + return $forward if $subset_answer == 0 && length($f_purestretch) >= length($r_purestretch); + return invert_microsat($reverse, $sequence) if $subset_answer == 0 && length($f_purestretch) < length($r_purestretch); + if ($subset_answer == 3){ + if ($fields[$startcord] < $r_start || $fields[$endcord] > $r_end){ + if (abs($fields[$startcord] - $r_start) < length($f_motields[0]) || abs($fields[$endcord] - $r_end) < length($f_motields[$#f_motields]) ){ + return $forward; + } + else{ + return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); + } + } + if ($fields[$startcord] > $r_start || $fields[$endcord] < $r_end){ + if (abs($fields[$startcord] - $r_start) < length($r_motields[0]) || abs($fields[$endcord] - $r_end) < length($r_motields[$#r_motields]) ){ + return invert_microsat($reverse, $sequence); + } + else{ + return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); + } + } + } + } + elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 0){ + return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); + } + elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 2){ + return $forward; + } + elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 3){ + return invert_microsat($reverse, $sequence); + } + + } + + if ($fields[$typecord] eq "compound" && $rields[$typecord] =~ /nucleotide/){ +# print "one compound, one nucleotide\n" if $printer == 1; + return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); + } + if ($fields[$typecord] =~ /nucleotide/ && $rields[$typecord]eq "compound"){ +# print "one compound, one nucleotide\n" if $printer == 1; + return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); + } +} + +sub isSubset{ + my $forward = $_[0]; my @fields = split(/\t/,$forward); + my $reverse = $_[1]; my @rields = split(/\t/,$reverse); + my $seqLength = $_[2]; + my $r_start = $seqLength - $rields[$endcord] + 1; + my $r_end = $seqLength - $rields[$startcord] + 1; +# print "we have $fields[$startcord] -> $fields[$endcord] && $r_start -> $r_end\n" if $printer == 1; + return "0" if $fields[$startcord] == $r_start && $fields[$endcord] == $r_end; + return "1" if $fields[$startcord] <= $r_start && $fields[$endcord] >= $r_end; + return "2" if $r_start <= $fields[$startcord] && $r_end >= $fields[$endcord]; + return "3"; +} + +sub motifBYmotif_match{ + my $forward = $_[0]; + my $reverse = $_[1]; + $forward =~ s/^\[|\]$//g; + $reverse =~ s/^\[|\]$//g; + my @f_motields=split(/\]\[/, $forward); + my @r_motields=split(/\]\[/, $reverse); + my $finalresult = 0; + + if (scalar(@f_motields) != scalar(@r_motields)){ + my $subresult = 0; + my @mega = (); my @sub = (); + @mega = @f_motields if scalar(@f_motields) > scalar(@r_motields); + @sub = @f_motields if scalar(@f_motields) > scalar(@r_motields); + @mega = @r_motields if scalar(@f_motields) < scalar(@r_motields); + @sub = @r_motields if scalar(@f_motields) < scalar(@r_motields); + + for my $i (0 ... $#sub){ + my $probe = $sub[$i].$sub[$i]; +# print "probing $probe and $mega[$i]\n" if $printer == 1; + if ($probe =~ /$mega[$i]/) {$subresult = 1; } + else {$subresult = 0; last; } + } + + return 0 if $subresult == 0; + return 2 if $subresult == 1 && scalar(@f_motields) > scalar(@r_motields); # r is subset of f + return 3 if $subresult == 1 && scalar(@f_motields) < scalar(@r_motields); # ^reverse + + } + else{ + for my $i (0 ... $#f_motields){ + my $probe = $f_motields[$i].$f_motields[$i]; + if ($probe =~ /$r_motields[$i]/) {$finalresult = 1 ;} + else {$finalresult = 0 ;last;} + } + } +# print "finalresult = $finalresult\n" if $printer == 1; + return $finalresult; +} + +sub merge_microsats{ + my @input = @_; + my $sequence = pop(@input); + $sequence =~ s/ //g; + my @seq_string = @input; +# print "IN: merge_microsats\n"; +# print "recieved for merging: ", join("\n", @seq_string), "\nsequence = $sequence\n"; + my $start; + my $end; + my @micros = map { $_->[0] } sort custom map { [$_, split /\t/ ] } @seq_string; +# print "\nrearranged into @micros \n"; + my (@motifs, @microsats, @interruptiontypes, @interruptions, @interrposes, @no_of_interruptions, @types, @starts, @ends, @mergestart, @mergeend, @mergemicro) = (); + my @fields = (); + for my $i (0 ... $#micros){ + chomp $micros[$i]; + @fields = split(/\t/,$micros[$i]); + push(@types, $fields[$typecord]); + push(@motifs, $fields[$motifcord]); + + if (exists $fields[$interrtypecord]){ push(@interruptiontypes, $fields[$interrtypecord]);} + else { push(@interruptiontypes, "NA"); } + if (exists $fields[$interrcord]) {push(@interruptions, $fields[$interrcord]);} + else { push(@interruptions, "NA"); } + if (exists $fields[$interr_poscord]) { push(@interrposes, $fields[$interr_poscord]);} + else { push(@interrposes, "NA"); } + if (exists $fields[$no_of_interruptionscord]) {push(@no_of_interruptions, $fields[$no_of_interruptionscord]);} + else { push(@no_of_interruptions, "NA"); } + if(exists $fields[$mergestarts]) { @mergestart = (@mergestart, split(/\./,$fields[$mergestarts]));} + else { push(@mergestart, $fields[$startcord]); } + if(exists $fields[$mergeends]) { @mergeend = (@mergeend, split(/\./,$fields[$mergeends]));} + else { push(@mergeend, $fields[$endcord]); } + if(exists $fields[$mergemicros]) { push(@mergemicro, $fields[$mergemicros]);} + else { push(@mergemicro, $fields[$microsatcord]); } + + + } + $start = smallest_number(@mergestart); + $end = largest_number(@mergeend); + my $microsat_entry = "[".substr( $sequence, $start-1, ($end - $start + 1) )."]"; + my $microsat = join("\t", @fields[0 ... $infocord], join(".", @types), $start, $fields[$strandcord], $end, $microsat_entry , join(".", @motifs), join(".", @interruptiontypes),join(".", @interruptions),join(".", @interrposes),join(".", @no_of_interruptions), join(".", @mergestart), join(".", @mergeend) , join(".", @mergemicro)); + return $microsat; +} + +sub slided_microsat{ + my $forward = $_[0]; my @fields = split(/\t/,$forward); + my $reverse = $_[1]; my @rields = split(/\t/,$reverse); + my $seqLength = $_[2]; + my $r_start = $seqLength - $rields[$endcord] + 1; + my $r_end = $seqLength - $rields[$startcord] + 1; + my $motlen =(); + $motlen =1 if $fields[$typecord] eq "mononucleotide"; + $motlen =2 if $fields[$typecord] eq "dinucleotide"; + $motlen =3 if $fields[$typecord] eq "trinucleotide"; + $motlen =4 if $fields[$typecord] eq "tetranucleotide"; + + if (abs($fields[$startcord] - $r_start) < $motlen || abs($fields[$endcord] - $r_end) < $motlen ) { + return 0; + } + else{ + return 1; + } + +} + +#xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx + + + +#xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx +sub new_multispecies_t10{ + my $input1 = $_[0]; #gap_op_unrand_match + my $input2 = $_[1]; #sput + my $output = $_[2]; #output + my $bin = $output."_bin"; + my $orgs = join("|",split(/\./,$_[3])); + my @organisms = split(/\./,$_[3]); + my $no_of_species = scalar(@organisms); #3 + my $t10info = $output."_info"; + $prinkter = 0; + + open (MATCH, "<$input1"); + open (SPUT, "<$input2"); + open (OUT, ">$output"); + open (INFO, ">$t10info"); + + + sub microsat_bracketer; + sub custom; + my %seen = (); + $infocord = 2 + (4*$no_of_species) - 1; + $typecord = 2 + (4*$no_of_species) + 1 - 1; + $startcord = 2 + (4*$no_of_species) + 2 - 1; + $strandcord = 2 + (4*$no_of_species) + 3 - 1; + $endcord = 2 + (4*$no_of_species) + 4 - 1; + $microsatcord = 2 + (4*$no_of_species) + 5 - 1; + $motifcord = 2 + (4*$no_of_species) + 6 - 1; + $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; + #---------------------------------------------------------------------------------------------------------------# + # MAKING A HASH FROM SPUT, WITH HASH KEYS GENERATED BELOW AND SEQUENCES STORED AS VALUES # + #---------------------------------------------------------------------------------------------------------------# + my $linecounter = 0; + my $microcounter = 0; + while (my $line = <SPUT>){ + chomp $line; + # print "$org\t(chr[0-9]+)\t([0-9]+)\t([0-9])+\t \n"; + next if $line !~ /[0-9a-z]+/; + $linecounter++; + # my $key = join("\t",$1 , $2, $4, $5, $6, $8, $9, $10, $12, $13); + # print $key, "#-#-#-#-#-#-#-#\n"; + if ($line =~ /([0-9]+)\s+([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { + my $key = join("\t",$1, $2, $3, $4, $5); +# print "key = $key\n" if $prinkter == 1; + push (@{$seen{$key}},$line); + $microcounter++; + } + else { + #print "could not make ker in SPUT : \n$line \n"; + } + } +# print "done hash.. linecounter = $linecounter, microcounter = $microcounter and total keys entered = ",scalar(keys %seen),"\n"; +# print INFO "done hash.. linecounter = $linecounter, microcounter = $microcounter and total keys entered = ",scalar(keys %seen),"\n"; + close SPUT; + + #---------------------------------------------------------------------------------------------------------------- + + #-------------------------------------------------------------------------------------------------------# + # THE ENTIRE CODE BELOW IS DEVOTED TO GENERATING HASH KEYS FROM MATCH FOLLOWED BY # + # USING THESE HASH KEYS TO CORRESPOND EACH SEQUENCE IN FIRST FILE TO ITS MICROSAT REPEATS IN # + # SECOND FILE FOLLOWED BY # + # FINDING THE EXACT LOCATION OF EACH MICROSAT REPEAT WITHIN EACH SEQUENCE USING THE 'index' FUNCTION # + #-------------------------------------------------------------------------------------------------------# + my $ref = 0; + my $ref2 = 0; + my $ref3 = 0; + my $ref4 = 0; + my $deletes= 0; + my $duplicates = 0; + my $neighbors = 0; + my $tooshort = 0; + my $prevmicrol=(); + my $startnotfound = 0; + my $matchkeysformed = 0; + my $keysused = 0; + + while (my $line = <MATCH>) { +# print colored ['magenta'], $line if $prinkter == 1; + next if $line !~ /[a-zA-Z0-9]/; + chomp $line; + my @fields2 = split(/\t/,$line); + my $key2 = (); + # $key2 = join("\t",$1 , $2, $4, $5, $6, $8, $9, $10, $12, $13); + if ($line =~ /([0-9]+)\s+([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { + $matchkeysformed++; + $key2 = join("\t",$1, $2, $3, $4, $5); +# print "key = $key2 \n" if $prinkter == 1; + } + else{ +# print "could not make ker in SEQ : $line\n"; + next; + } + my $sequence = $fields2[$sequencepos]; + $sequence =~ s/\*/-/g; + my $count = 0; + if (exists $seen{$key2}){ + $keysused++; + my @unsorted_raw = @{$seen{$key2}}; + delete $seen{$key2}; + my @sequencearr = split(/\s*/, $sequence); + +# print "sequencearr = @sequencearr\n" if $prinkter == 1; + + my $counter; + + my %start_database = (); + my %end_database = (); + foreach my $uns (@unsorted_raw){ + my @uields = split(/\t/,$uns); + $start_database{$uields[$startcord]} = $uns; + $end_database{$uields[$endcord]} = $uns; + } + + my @unsorted = (); + my %starts = (); my %ends = (); +# print colored ['yellow'],"unsorted, unfiltered microats = \n" if $prinkter == 1; foreach (@unsorted_raw) {print colored ['blue'],$_,"\n" if $prinkter == 1;} + for my $u (0 ... $#unsorted_raw){ + my @tields = split(/\t/,$unsorted_raw[$u]); + next if exists $starts{$tields[$startcord]} && exists $ends{$tields[$endcord]}; + push(@unsorted, $unsorted_raw[$u]); + $starts{$tields[$startcord]} = $unsorted_raw[$u]; +# print "in starts : $tields[$startcord] -> $unsorted_raw[$u]\n" if $prinkter == 1; + } + + my $basecounter= 0; + my $gapcounter = 0; + my $poscounter = 0; + + for my $s (@sequencearr){ + + $poscounter++; + if ($s eq "-"){ + $gapcounter++; next; + } + else{ + $basecounter++; + } + + + #print "s = $s, poscounter = $poscounter, basecounter = $basecounter, gapcpunter = $gapcounter\n" if $prinkter == 1; + #print "s = $s, basecounter = $basecounter, gapcpunter = $gapcounter\n" if $prinkter == 1; + #print "s = $s, gapcpunter = $gapcounter\n" if $prinkter == 1; + + if (exists $starts{$basecounter}){ + my $locus = $starts{$basecounter}; +# print "locus identified = $locus\n" if $prinkter == 1; + my @fields3 = split(/\t/,$locus); + my $start = $fields3[$startcord]; + my $end = $fields3[$endcord]; + my $motif = $fields3[$motifcord]; + my $microsat = $fields3[$microsatcord]; + my @leftbracketpos = (); + my @rightbracketpos = (); + my $bracket_picker = 'no'; + my $leftbrackets=(); + my $rightbrackets = (); + my $micro_cpy = $microsat; +# print "microsat = $microsat\n" if $prinkter == 1; + while($microsat =~ m/\[/g) {push(@leftbracketpos, (pos($microsat))); $leftbrackets = join("__",@leftbracketpos);$bracket_picker='yes';} + while($microsat =~ m/\]/g) {push(@rightbracketpos, (pos($microsat))); $rightbrackets = join("__",@rightbracketpos);} + $microsat =~ s/[\[\]\-\*]//g; +# print "microsat = $microsat\n" if $prinkter == 1; + my $human_search = join '-*', split //, $microsat; + my $temp = substr($sequence, $poscounter-1); +# print "with poscounter = $poscounter\n" if $prinkter == 1; + my $search_result = (); + my $posnow = (); + +# print "for $line, temp $temp or human_search $human_search not defined\n" if !defined $temp || !defined $human_search; +# <STDIN> if !defined $temp || !defined $human_search; + + while ($temp =~ /($human_search)/gi){ + $search_result = $1; + # $posnow = pos($temp); + last; + } + + my @gapspos = (); + next if !defined $search_result; + + while($search_result =~ m/-/g) {push(@gapspos, (pos($search_result))); } + my $gaps = join("__",@gapspos); + + my $final_microsat = $search_result; + if ($bracket_picker eq "yes"){ + $final_microsat = microsat_bracketer($search_result, $gaps,$leftbrackets,$rightbrackets); + } + + my $outsentence = join("\t",join ("\t",@fields3[0 ... $infocord]),$fields3[$typecord],$fields3[$motifcord],$gapcounter,$poscounter,$fields3[$strandcord],$poscounter + length($search_result) -1 ,$final_microsat); + + if ($bracket_picker eq "yes") { + $outsentence = $outsentence."\t".join("\t",@fields3[($motifcord+1) ... $#fields3]); + } + print OUT $outsentence,"\n"; + } + } + } + } + my $unusedkeys = scalar(keys %seen); +# print INFO "in hash = $ref, looped = $ref4, captured = $ref3\n REMOVED: \nmicrosats with too long gaps = $deletes\n"; +# print INFO "exact duplicated removed = $duplicates \nmicrosats removed due to multiple microsats defined in +-10 bp neighboring region: $neighbors \n"; +# print INFO "microsatellites too short = $tooshort\n"; +# print INFO "keysused = $keysused...starts not found = $startnotfound ... matchkeysformed=$matchkeysformed ... unusedkeys=$unusedkeys\n"; + + #print "in hash = $ref, looped = $ref4, captured = $ref3\n REMOVED: \nmicrosats with too long gaps = $deletes\n"; + #print "exact duplicated removed = $duplicates \nmicrosats removed due to multiple microsats defined in +-10 bp neighboring region: $neighbors \n"; + #print "microsatellites too short = $tooshort\n"; + #print "keysused = $keysused...starts not found = $startnotfound ... matchkeysformed=$matchkeysformed ... unusedkeys=$unusedkeys\n"; + #print "unused keys = \n",join("\n", (keys %seen)),"\n"; + close (MATCH); + close (SPUT); + close (OUT); + close (INFO); +} + +sub microsat_bracketer{ +# print "in bracketer: @_\n"; + my ($microsat, $gapspos, $leftbracketpos, $rightbracketpos) = @_; + my @gaps = split(/__/,$gapspos); + my @lefts = split(/__/,$leftbracketpos); + my @rights = split(/__/,$rightbracketpos); + my @new=(); + my $pure = $microsat; + $pure =~ s/-//g; + my $off = 0; + my $finallength = length($microsat) + scalar(@lefts)+scalar(@rights); + push(@gaps, 0); + push(@lefts,0); + push(@rights,0); + + for my $i (1 ... $finallength){ +# print "1 current i = >$i<>, right = >$rights[0]< gap = $gaps[0] left = >$lefts[0]< and $rights[0] == $i\n"; + if($rights[0] == $i){ + # print "pushed a ]\n"; + push(@new, "]"); + shift(@rights); + push(@rights,0); + for my $j (0 ... scalar(@gaps)-1) {$gaps[$j]++;} + next; + } + if($gaps[0] == $i){ + # print "pushed a -\n"; + push(@new, "-"); + shift(@gaps); + push(@gaps, 0); + for my $j (0 ... scalar(@rights)-1) {$rights[$j]++;} + for my $j (0 ... scalar(@lefts)-1) {$lefts[$j]++;} + + next; + } + if($lefts[0] == $i){ +# print "pushed a [\n"; + push(@new, "["); + shift(@lefts); + push(@lefts,0); + for my $j (0 ... scalar(@gaps)-1) {$gaps[$j]++;} + next; + } + else{ + my $pushed = substr($pure,$off,1); + $off++; + push(@new,$pushed ); +# print "pushed an alphabet, now new = @new, pushed = $pushed\n"; + next; + } + } + my $returnmicrosat = join("",@new); +# print "final microsat = $returnmicrosat \n"; + return($returnmicrosat); +} + +#xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx + + +#xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx +sub multiSpecies_orthFinder4{ + #print "IN multiSpecies_orthFinder4: @_\n"; + my @handles = (); + #1 SEPT 30TH 2008 + #2 THIS CODE (multiSpecies_orthFinder4.pl) IS BEING MADE SO THAT IN THE REMOVAL OF MICROSATELLITES THAT ARE CLOSER TO EACH OTHER + #3 THAN 50 BP (HE 50BP RADIUS OF EXCLUSION), WE ARE LOOKING ACCROSS ALIGNMENT BLOCKS.. AND NOT JUST LOOKING WITHIN THE ALIGNMENT BLOCKS. THIS WILL + #4 POTENTIALLY REMOVE EVEN MORE MICROSATELLITES THAN BEFORE, BUT THIS WILL RESCUE THOSE MICROSATELLITES THAT WERE LOST + #5 DUE TO OUR PREVIOUS REQUIREMENT FROM VERSION 3, THAT MICROSATELLITES THAT ARE CLOSER TO THE BOUNDARY THAN 25 BP NEED TO BE REMOVED + #6 SUCH A REQUIREMENT WAS A CRUDE WAY TO IMPOSE THE ABOVE 50 BP RADIUS OF EXCLUSION ACCROSS THE ALIGNMENT BLOCKS WITHOUT ACTUALLY + #7 CHECKING COORDINATES OF THE EXCLUDED MICROSATELLITES. + #8 IN ORDER TO TAKE CARE OF THE CASES WHERE MICROSATELLITES ARE PRELIOUSLY CLOSE TO ENDS OF THE ALIGNMENT BLOCKS, WE IMPOSE HERE + #9 A NEW REQUIREMENT THAT FOR A MICROSATELLITE TO BE CONSIDERED, ALL THE SPECIES NEED TO HAVE AT LEAST 10 BP OF NON-MICROSATELLITE SEQUENCE + #10 ON EITHER SIDE OF IT.. GAPLESS. THIS INFORMATION IS STORED IN THE VARIABLE: $FLANK_SUPPORT. THIS PART, INSTEAD OF BEING INCLUDED IN + #11 THIS CODE, WILL BE INCLUDED IN A NEW CODE THAT WE WILL BE WRITING AS PART OF THE PIPELINE: multiSpecies_microsatSetSelector.pl + + #1 trial run: + #2 perl ../../../codes/multiSpecies_orthFinder4.pl /gpfs/home/ydk104/work/rhesus_microsat/axtNet/hg18.panTro2.ponAbe2.rheMac2.calJac1/chr22.hg18.panTro2.ponAbe2.rheMac2.calJac1.net.axt H.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:C.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:O.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:R.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:M.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2 orth22 hg18:panTro2:ponAbe2:rheMac2:calJac1 50 + + $prinkter=0; + + ############# + my $CLUSTER_DIST = $_[4]; + ############# + + + my $aligns = $_[0]; + my @micros = split(/:/, $_[1]); + my $orth = $_[2]; + #my $not_orth = "notorth"; + @tags = split(/:/, $_[3]); + + $no_of_species=scalar(@tags); + my $junkfile = $orth."_junk"; + #open(JUNK,">$junkfile"); + + #my $info = $output1."_info"; + #print "inputs are : \n"; foreach(@micros){print $_,"\n";} + #print "info = @_\n"; + + + open (BO, "<$aligns") or die "Cannot open alignment file: $aligns: $!"; + open (ORTH, ">$orth"); + my $output=$orth."_out"; + open (OUTP, ">$output"); + + + #open (NORTH, ">$not_orth"); + #open (INF, ">$info"); + my $i = 0; + foreach my $path (@micros){ + $handles[$i] = IO::Handle->new(); + open ($handles[$i], "<$path") or die "Can't open microsat file $path : $!"; + $i++; + } + + #print "Opened files\n"; + + + $infocord = 2 + (4*$no_of_species) - 1; + $typecord = 2 + (4*$no_of_species) + 1 - 1; + $motifcord = $typecord + 1; + $gapcord = $motifcord+1; + $startcord = $gapcord + 1; + $strandcord = $startcord + 1; + $endcord = $strandcord + 1; + $microsatcord = $endcord + 1; + $sequencepos = 2 + (4*$no_of_species) + 1 -1 ; + #$sequencepos = 17; + # GENERATING HASHES CONTAINING CHIMP AND HUMAN DATA FROM ABOVE FILES + #---------------------------------------------------------------------------------------------------------------- + my @hasharr = (); + foreach my $path (@micros){ + open(READ, "<$path") or die "Cannot open file $path :$!"; + my %single_hash = (); + my $key = (); + my $counter = 0; + while (my $line = <READ>){ + $counter++; + # print $line; + chomp $line; + my @fields1 = split(/\t/,$line); + if ($line =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) { + $key = join("\t",$1, $2, $4, $5); + +# print "key = : $key\n" if $prinkter == 1; + +# print $line if $prinkter == 1; + push (@{$single_hash{$key}},$line); + } + else{ + # print "microsat line incompatible\n"; + } + } + push @hasharr, {%single_hash}; + # print "@{$single_hash{$key}} \n"; +# print "done $path: counter = $counter\n" if $prinkter == 1; + close READ; + } +# print "Done hashes\n"; + #---------------------------------------------------------------------------------------------------------------- + my $question=(); + #---------------------------------------------------------------------------------------------------------------- + my @contigstarts = (); + my @contigends = (); + + my %contigclusters = (); + my %contigclustersFirstStartOnly=(); + my %contigclustersLastEndOnly=(); + my %contigclustersLastEndLengthOnly=(); + my %contigclustersFirstStartLengthOnly=(); + my %contigpath=(); + my $dotcounter = 0; + while (my $line = <BO>){ +# print "x" x 60, "\n" if $prinkter == 1; + $dotcounter++; +# print "." if $dotcounter % 100 ==0; +# print "\n" if $dotcounter % 5000 ==0; + next if $line !~ /^[0-9]+/; +# print $line if $prinkter == 1; + chomp $line; + my @fields2 = split(/\t/,$line); + my $key2 = (); + if ($line =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) { + $key2 = join("\t",$1, $2, $4, $5); + } + else { +# print "seq line $line incompatible\n" if $prinkter == 1; + next;} + + + + + + + my @sequences = (); + for (0 ... $#tags){ + my $seq = <BO>; + # print $seq; + chomp $seq; + push(@sequences , " ".$seq); + } + my @origsequences = @sequences; + my $seqcopy = $sequences[0]; + my @strings = (); + $seqcopy =~ s/[a-zA-Z]|-/x/g; + my @string = split(/\s*/,$seqcopy); + + for my $s (0 ... $#tags){ + $sequences[$s] =~ s/-//g; + $sequences[$s] =~ s/[a-zA-Z]/x/g; + # print "length of sequence = ",length($sequences[$s]),"\n"; + my @tempstring = split(/\s*/,$sequences[$s]); + push(@strings, [@tempstring]) + + } + + my @species_list = (); + my @micro_count = 0; + my @starthash = (); + my $stopper = 1; + my @endhash = (); + + my @currentcontigstarts=(); + my @currentcontigends=(); + my @currentcontigchrs=(); + + for my $i (0 ... $#tags){ +# print "searching for : if exists hasharr: $i : $tags[$i] : $key2 \n" if $prinkter == 1; + my @temparr = (); + + if (exists $hasharr[$i]{$key2}){ + @temparr = @{$hasharr[$i]{$key2}}; + + $line =~ /$tags[$i]\s([a-zA-Z0-9_]+)\s([0-9]+)\s([0-9]+)/; +## print "in line $line, trying to hunt for: $tags[$i]\\s([a-zA-Z0-9_])+\\s([0-9]+)\\s([0-9]+) \n" if $prinkter == 1; +# print "org = $tags[$i], and chr = $1, start = $2, end =$3 \n" if $prinkter == 1; + my $startkey = $1."_SK0SK_".$2; #print "adding start key for this alignmebt block: $startkey to species $tags[$i]\n" if $prinkter == 1; + my $endkey = $1."_EK0EK_".$3; #print "adding end key for this alignmebt block: $endkey to species $tags[$i]\n" if $prinkter == 1; + $contigstarts[$i]{$startkey}= $key2; + $contigends[$i]{$endkey}= $key2; +# print "confirming existance: \n" if $prinkter == 1; +# print "present \n" if exists $contigends[$i]{$endkey} && $prinkter == 1; +# print "absent \n" if !exists $contigends[$i]{$endkey} && $prinkter == 1; + $currentcontigchrs[$i]=$1; + $currentcontigstarts[$i]=$2; + $currentcontigends[$i]=$3; + + } # print "exists: @{$hasharr[$i]{$key2}}[0]\n"} + else { + push (@starthash, {0 => "0"}); + push (@endhash, {0 => "0"}); + $currentcontigchrs[$i] = 0; + next; + } + $stopper = 0; + # print "exists: @temparr\n" if $prinkter == 1; + push(@micro_count, scalar(@temparr)); + push(@species_list, [@temparr]); + my @tempstart = (); my @tempend = (); + my %localends = (); + my %localhash = (); + # print "---------------------------\n"; + + foreach my $templine (@temparr){ +# print "templine = $templine\n" if $prinkter == 1; + my @tields = split(/\t/,$templine); + my $start = $tields[$startcord]; # - $tields[$gapcord]; + my $end = $tields[$endcord]; #- $tields[$gapcord]; + my $realstart = $tields[$startcord]- $tields[$gapcord]; + my $gapsinmicrosat = ($tields[$microsatcord] =~ s/-/-/g); + $gapsinmicrosat = 0 if $gapsinmicrosat !~ /[0-9]+/; + # print "infocord = $infocord typecord = $typecord motifcord = $motifcord gapcord = $gapcord startcord = $startcord strandcord = $strandcord endcord = $endcord microsatcord = $microsatcord sequencepos = $sequencepos\n"; + my $realend = $tields[$endcord]- $tields[$gapcord]- $gapsinmicrosat; + # print "real start = $realstart, realend = $realend \n"; + for my $pos ($realstart ... $realend){ $strings[$i][$pos] = $strings[$i][$pos].",".$i.":".$start."-".$end;} + push(@tempstart, $start); + push(@tempend, $end); + $localhash{$start."-".$end} = $templine; + } + push @starthash, {%localhash}; + my $foundclusters =findClusters(join("!",@{$strings[$i]}), $CLUSTER_DIST); + +# print "foundclusters = $foundclusters\n"; + + my @clusters = split(/_/,$foundclusters); + + my $clustno = 0; + + foreach my $cluster (@clusters) { + my @constituenst = split(/,/,$cluster); +# print "clusters returned: @constituenst\n" if $prinkter == 1; + } + + @string = split("_S0S_",stringPainter(join("_C0C_",@string),$foundclusters)); + + + } + next if $stopper == 1; + +# print colored ['blue'],"FINAL:\n" if $prinkter == 1; + my $finalclusters =findClusters(join("!",@string), 1); +# print "finalclusters = $finalclusters\n"; +# print colored ['blue'],"----------------------\n" if $prinkter == 1; + my @clusters = split(/,/,$finalclusters); +# print "@string\n" if $prinkter == 1; +# print "@clusters\n" if $prinkter == 1; +# print "------------------------------------------------------------------\n" if $prinkter == 1; + + my $clustno = 0; + + # foreach my $cluster (@clusters) { + # my @constituenst = split(/,/,$cluster); + # print "clusters returned: @constituenst\n"; + # } + + next if (scalar @clusters == 0); + + my @contigcluster=(); + my $clusterno=0; + my @contigClusterstarts=(); + my @contigClusterends = (); + + foreach my $clust (@clusters){ + # print "cluster: $clust\n"; + $clusterno++; + my @localclust = split(/\./, $clust); + my @result = (); + my @starts = (); + my @ends = (); + + for my $i (0 ... $#localclust){ + # print "localclust[$i]: $localclust[$i]\n"; + my @pattern = split(/:/, $localclust[$i]); + my @cords = split(/-/, $pattern[1]); + push (@starts, $cords[0]); + push (@ends, $cords[1]); + } + + my $extremestart = smallest_number(@starts); + my $extremeend = largest_number(@ends); + push(@contigClusterstarts, $extremestart); + push(@contigClusterends, $extremeend); +# print "cluster starts from $extremestart and ends at $extremeend \n" if $prinkter == 1 ; + + foreach my $clustparts (@localclust){ + my @pattern = split(/:/, $clustparts); + # print "printing from pattern: $pattern[1]: $starthash[$pattern[0]]{$pattern[1]}\n"; + push (@result, $starthash[$pattern[0]]{$pattern[1]}); + } + push(@contigcluster, join("\t", @result)); +# print join("\t", @result),"<-result \n" if $prinkter == 1 ; + } + + + my $firstclusterstart = smallest_number(@contigClusterstarts); + my $lastclusterend = largest_number(@contigClusterends); + + + $contigclustersFirstStartOnly{$key2}=$firstclusterstart; + $contigclustersLastEndOnly{$key2} = $lastclusterend; + $contigclusters{$key2}=[ @contigcluster ]; +# print "currentcontigchr are @currentcontigchrs , firstclusterstart = $firstclusterstart, lastclusterend = $lastclusterend\n " if $prinkter == 1; + for my $i (0 ... $#tags){ + #1 check if there exists adjacent alignment block wrt coordinates of this species. + next if $currentcontigchrs[$i] eq "0"; #1 this means that there are no microsats in this species in this alignment block.. + #2 no need to worry about proximity of anything in adjacent block! + + #1 BELOW, the following is really to calclate the distance between the end coordinate of the + #2 cluster and the end of the gap-free sequence of each species. this is so that if an + #3 adjacent alignment block is found lateron, the exact distance between the potentially + #4 adjacent microsat clusters can be found here. the exact start coordinate will be used + #5 immediately below. + # print "full sequence = $origsequences[$i] and its length = ",length($origsequences[$i])," \n" if $prinkter == 1; + + my $species_startsubstring = substr($origsequences[$i], 0, $firstclusterstart); + my $species_endsubstring = (); + + if (length ($origsequences[$i]) <= $lastclusterend+1){ $species_endsubstring = "";} + else{ $species_endsubstring = substr($origsequences[$i], $lastclusterend+1);} + +# print "\nnot defined species_endsubstring...\n" if !defined $species_endsubstring && $prinkter == 1; +# print "for species: $tags[$i]: \n" if $prinkter == 1; + + $species_startsubstring =~ s/-| //g; + $species_endsubstring =~ s/-| //g; + $contigclustersLastEndLengthOnly{$key2}[$i]=length($species_endsubstring); + $contigclustersFirstStartLengthOnly{$key2}[$i]=length($species_startsubstring); + + + +# print "species_startsubstring = $species_startsubstring, and its length =",length($species_startsubstring)," \n" if $prinkter == 1; +# print "species_endsubstring = $species_endsubstring, and its length =",length($species_endsubstring)," \n" if $prinkter == 1; +# print "attaching to contigclustersLastEndOnly: $key2: $i\n" if $prinkter == 1; + +# print "just confirming: $contigclustersLastEndLengthOnly{$key2}[$i] \n" if $prinkter == 1; + + } + + + } +# print "\ndone the job of filling... \n"; + #/////////////////////////////////////////////////////////////////////////////////////// + #/////////////////////////////////////////////////////////////////////////////////////// + #/////////////////////////////////////////////////////////////////////////////////////// + #/////////////////////////////////////////////////////////////////////////////////////// + $prinkter=0; + open (BO, "<$aligns") or die "Cannot open alignment file: $aligns: $!"; + + my %clusteringpaths=(); + my %clustersholder=(); + my %foundkeys=(); + my %clusteringpathsRev=(); + + + my $totalcount=(); + my $founkeys_enteredcount=(); + my $transfered=0; + my $complete_transfered=0; + my $plain_transfered=0; + my $existing_removed=0; + + while (my $line = <BO>){ +# print "x" x 60, "\n" if $prinkter == 1; + next if $line !~ /^[0-9]+/; + #print $line; + chomp $line; + my @fields2 = split(/\t/,$line); + my $key2 = (); + if ($line =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)/ ) { + $key2 = join("\t",$1, $2, $4, $5); + } + + else { + # print "seq line $line incompatible\n"; + next; + } +# print "KEY = : $key2\n" if $prinkter == 1; + + + my @currentcontigstarts=(); + my @currentcontigends=(); + my @currentcontigchrs=(); + my @clusters = (); + my @clusterscopy=(); + if (exists $contigclusters{$key2}){ + @clusters = @{$contigclusters{$key2}}; + @clusterscopy=@clusters; + for my $i (0 ... $#tags){ + # print "in line $line, trying to hunt for: $tags[$i]\\s([a-zA-Z0-9])+\\s([0-9]+)\\s([0-9]+) \n" if $prinkter == 1; + if ($line =~ /$tags[$i]\s([a-zA-Z0-9_]+)\s([0-9]+)\s([0-9]+)/){ + # print "org = $tags[$i], and chr = $1, start = $2, end =$3 \n" if $prinkter == 1; + my $startkey = $1."_S0E_".$2; #print "adding start key for this alignmebt block: $startkey to species $tags[$i]\n" if $prinkter == 1; + my $endkey = $1."_S0E_".$3; #print "adding end key for this alignmebt block: $endkey to species $tags[$i]\n" if $prinkter == 1; + $currentcontigchrs[$i]=$1; + $currentcontigstarts[$i]=$2; + $currentcontigends[$i]=$3; + } + else { + $currentcontigchrs[$i] = 0; + # print "no microsat clusters for $key2\n" if $prinkter == 1; next; + } + } + } # print "exists: @{$hasharr[$i]{$key2}}[0]\n"} + + my @sequences = (); + for (0 ... $#tags){ + my $seq = <BO>; + # print $seq; + chomp $seq; + push(@sequences , " ".$seq); + } + + next if scalar @currentcontigchrs == 0; + + # print "contigchrs= @currentcontigchrs \n" if $prinkter == 1; + my %visitedcontigs=(); + + for my $i (0 ... $#tags){ + #1 check if there exists adjacent alignment block wrt coordinates of this species. + next if $currentcontigchrs[$i] eq "0"; #1 this means that there are no microsats in this species in this alignment block.. + #2 no need to worry about proximity of anything in adjacent block! + @clusters=@clusterscopy; + #1 BELOW, the following is really to calclate the distance between the end coordinate of the + #2 cluster and the end of the gap-free sequence of each species. this is so that if an + #3 adjacent alignment block is found lateron, the exact distance between the potentially + #4 adjacent microsat clusters can be found here. the exact start coordinate will be used + #5 immediately below. + my $firstclusterstart = $contigclustersFirstStartOnly{$key2}; + my $lastclusterend = $contigclustersLastEndOnly{$key2}; + + my $key3 = $currentcontigchrs[$i]."_S0E_".($currentcontigstarts[$i]); +# print "check if exists $key3 in contigends for $i\n" if $prinkter == 1; + + if (exists($contigends[$i]{$key3}) && !exists $visitedcontigs{$contigends[$i]{$key3}}){ + $visitedcontigs{$contigends[$i]{$key3}} = $contigends[$i]{$key3}; #1 this array keeps track of adjacent contigs that we have already visited, thus saving computational time and potential redundancies# + # print "just checking the hash visitedcontigs: ",$visitedcontigs{$contigends[$i]{$key3}} ,"\n" if $prinkter == 1; + + #1 extract coordinates of the last cluster of this found alignment block +# print "key of the found alignment block = ", $contigends[$i]{$key3},"\n" if $prinkter == 1; + # print "we are trying to mine: contigclustersAllLastEndLengthOnly_raw: $contigends[$i]{$key3}: $i \n" if $prinkter == 1; + # print "EXISTS\n" if exists $contigclusters{$contigends[$i]{$key3}} && $prinkter == 1; + # print "does NOT EXIST\n" if !exists $contigclusters{$contigends[$i]{$key3}} && $prinkter == 1; + my @contigclustersAllFirstStartLengthOnly_raw=@{$contigclustersFirstStartLengthOnly{$key2}}; + my @contigclustersAllLastEndLengthOnly_raw=@{$contigclustersLastEndLengthOnly{$contigends[$i]{$key3}}}; + my @contigclustersAllFirstStartLengthOnly=(); my @contigclustersAllLastEndLengthOnly=(); + + for my $val (0 ... $#contigclustersAllFirstStartLengthOnly_raw){ + # print "val = $val\n" if $prinkter == 1; + if (defined $contigclustersAllFirstStartLengthOnly_raw[$val]){ + push(@contigclustersAllFirstStartLengthOnly, $contigclustersAllFirstStartLengthOnly_raw[$val]) if $contigclustersAllFirstStartLengthOnly_raw[$val] =~ /[0-9]+/; + } + } + # print "-----\n" if $prinkter == 1; + for my $val (0 ... $#contigclustersAllLastEndLengthOnly_raw){ + # print "val = $val\n" if $prinkter == 1; + if (defined $contigclustersAllLastEndLengthOnly_raw[$val]){ + push(@contigclustersAllLastEndLengthOnly, $contigclustersAllLastEndLengthOnly_raw[$val]) if $contigclustersAllLastEndLengthOnly_raw[$val] =~ /[0-9]+/; + } + } + + + # print "our two arrays are: starts = <@contigclustersAllFirstStartLengthOnly> ......... and ends = <@contigclustersAllLastEndLengthOnly>\n" if $prinkter == 1; + # print "the last cluster's end in that one is: ",smallest_number(@contigclustersAllFirstStartLengthOnly) + smallest_number(@contigclustersAllLastEndLengthOnly)," = ", smallest_number(@contigclustersAllFirstStartLengthOnly)," + ",smallest_number(@contigclustersAllLastEndLengthOnly),"\n" if $prinkter == 1; + + # if ($contigclustersFirstStartLengthOnly{$key2}[$i] + $contigclustersLastEndLengthOnly{$contigends[$i]{$key3}}[$i] < 50){ + if (smallest_number(@contigclustersAllFirstStartLengthOnly) + smallest_number(@contigclustersAllLastEndLengthOnly) < $CLUSTER_DIST){ + my @regurgitate = @{$contigclusters{$contigends[$i]{$key3}}}; + $regurgitate[$#regurgitate]=~s/\n//g; + $regurgitate[$#regurgitate] = $regurgitate[$#regurgitate]."\t".shift(@clusters); + delete $contigclusters{$contigends[$i]{$key3}}; + $contigclusters{$contigends[$i]{$key3}}=[ @regurgitate ]; + delete $contigclusters{$key2}; + $contigclusters{$key2}= [ @clusters ] if scalar(@clusters) >0; + $contigclusters{$key2}= [ "" ] if scalar(@clusters) ==0; + + if (scalar(@clusters) < 1){ + # print "$key2-> $clusteringpaths{$key2} in the loners\n" if exists $foundkeys{$key2}; + $clusteringpaths{$key2}=$contigends[$i]{$key3}; + $clusteringpathsRev{$contigends[$i]{$key3}}=$key2; + print OUTP "$contigends[$i]{$key3} -> $clusteringpathsRev{$contigends[$i]{$key3}}\n"; + # print " clusteringpaths $key2 -> $contigends[$i]{$key3}\n"; + $founkeys_enteredcount-- if exists $foundkeys{$key2}; + $existing_removed++ if exists $foundkeys{$key2}; +# print "$key2->",@{$contigclusters{$key2}},"->>$foundkeys{$key2}\n" if exists $foundkeys{$key2} && $prinkter == 1; + delete $foundkeys{$key2} if exists $foundkeys{$key2}; + $complete_transfered++; + } + else{ + print OUTP "$key2-> 0 not so lonely\n" if !exists $clusteringpathsRev{$key2}; + $clusteringpaths{$key2}=$key2 if !exists $clusteringpaths{$key2}; + $clusteringpathsRev{$key2}=0 if !exists $clusteringpathsRev{$key2}; + + $founkeys_enteredcount++ if !exists $foundkeys{$key2}; + $foundkeys{$key2} = $key2 if !exists $foundkeys{$key2}; + # print "adding foundkeys entry $foundkeys{$key2}\n"; + $transfered++; + } + #$contigclusters{$key2}=[ @contigcluster ]; + } + } + else{ + # print "adjacent block with species $tags[$i] does not exist\n" if $prinkter == 1; + $plain_transfered++; + print OUTP "$key2-> 0 , going straight\n" if exists $contigclusters{$key2} && !exists $clusteringpathsRev{$key2}; + $clusteringpaths{$key2}=$key2 if exists $contigclusters{$key2} && !exists $clusteringpaths{$key2}; + $clusteringpathsRev{$key2}=0 if exists $contigclusters{$key2} && !exists $clusteringpathsRev{$key2}; + $founkeys_enteredcount++ if !exists $foundkeys{$key2} && exists $contigclusters{$key2}; + $foundkeys{$key2} = $key2 if !exists $foundkeys{$key2} && exists $contigclusters{$key2}; + # print "adding foundkeys entry $foundkeys{$key2}\n"; + + } + $totalcount++; + + } + + + } + close BO; + #close (NORTH); + #/////////////////////////////////////////////////////////////////////////////////////// + #/////////////////////////////////////////////////////////////////////////////////////// + #/////////////////////////////////////////////////////////////////////////////////////// + #/////////////////////////////////////////////////////////////////////////////////////// + + my $founkeys_count=(); + my $nopath_count=(); + my $pathed_count=0; + foreach my $key2 (keys %foundkeys){ + #print "x" x 60, "\n"; +# print "x" if $dotcounter % 100 ==0; +# print "\n" if $dotcounter % 5000 ==0; + $founkeys_count++; + my $key = $key2; +# print "$key2 -> $clusteringpaths{$key2}\n" if $prinkter == 1; + if ($clusteringpaths{$key} eq $key){ +# print "printing hit the alignment block immediately... no path needed\n" if $prinkter == 1; + $nopath_count++; + delete $foundkeys{$key2}; + print ORTH join ("\n",@{$contigclusters{$key2}}),"\n"; + } + else{ + my @pool=(); + my $key3=(); + $pathed_count++; +# print "going reverse... clusteringpathsRev, $key = $clusteringpathsRev{$key}\n" if exists $clusteringpathsRev{$key} && $prinkter == 1; +# print "going reverse... clusteringpathsRev $key does not exist\n" if !exists $clusteringpathsRev{$key} && $prinkter == 1; + if ($clusteringpathsRev{$key} eq "0") { + next; + } + else{ + my $yek3 = $clusteringpathsRev{$key}; + my $yek = $key; +# print "caught in the middle of a path, now goin down from $yek to $yek3, which is $clusteringpathsRev{$key} \n" if $prinkter == 1; + while ($yek3 ne "0"){ +# print "$yek->$yek3," if $prinkter == 1; + $yek = $yek3; + $yek3 = $clusteringpathsRev{$yek}; + } +# print "\nfinally reached the end of path: $yek3, and the next in line is $yek, and its up-route is $clusteringpaths{$yek}\n" if $prinkter == 1; + $key3 = $clusteringpaths{$yek}; + $key = $yek; + } + +# print "now that we are at bottom of the path, lets start climbing up again\n" if $prinkter == 1; + + while($key ne $key3){ +# print "KEEY $key->$key3\n" if $prinkter == 1; +# print "our contigcluster = @{$contigclusters{$key}}\n----------\n" if $prinkter == 1; + + if (scalar(@{$contigclusters{$key}}) > 0) {push @pool, @{$contigclusters{$key}}; + # print "now pool = @pool\n" if $prinkter == 1; + } + delete $foundkeys{$key3}; + $key=$key3; + $key3=$clusteringpaths{$key}; + } +# print "\nfinally, adding the first element of path: @{$contigclusters{$key}}\n AND printing the contents:\n" if $prinkter == 1; + my @firstcontig= @{$contigclusters{$key}}; + delete $foundkeys{$key2} if exists $foundkeys{$key2} ; + delete $foundkeys{$key} if exists $foundkeys{$key}; + + unshift @pool, pop @firstcontig; +# print join("\t",@pool),"\n" if $prinkter == 1; + print ORTH join ("\n",@firstcontig),"\n" if scalar(@firstcontig) > 0; + print ORTH join ("\t",@pool),"\n"; + # join(); + } + + } + #close (NORTH); +# print "founkeys_entered =$founkeys_enteredcount, plain_transfered=$plain_transfered,existing_removed=$existing_removed,founkeys_count =$founkeys_count, nopath_count =$nopath_count, transfered = $transfered, complete_transfered = $complete_transfered, totalcount = $totalcount, pathed=$pathed_count\n" if $prinkter == 1; + close (BO); + close (ORTH); + close (OUTP); + return 1; + +} +sub stringPainter{ + my @string = split(/_C0C_/,$_[0]); +# print $_[0], " <- in stringPainter\n"; +# print $_[1], " <- in clusters\n"; + + my @clusters = split(/,/, $_[1]); + for my $i (0 ... $#clusters){ + my $cluster = $clusters[$i]; +# print "cluster = $cluster\n"; + my @parts = split(/\./,$cluster); + my @cord = split(/:|-/,shift(@parts)); + my $minstart = $cord[1]; + my $maxend = $cord[2]; +# print "minstart = $minstart , maxend = $maxend\n"; + + for my $j (0 ... $#parts){ +# print "oing thri $parts[$j]\n"; + my @cord = split(/:|-/,$parts[$j]); + $minstart = $cord[1] if $cord[1] < $minstart; + $maxend = $cord[2] if $cord[2] > $maxend; + } +# print "minstart = $minstart , maxend = $maxend\n"; + for my $pos ($minstart ... $maxend){ $string[$pos] = $string[$pos].",".$cluster;} + + + } +# print "@string <-done from function stringPainter\n"; + return join("_S0S_",@string); +} + +sub findClusters{ + my $continue = 0; + my @mapped_clusters = (); + my $clusterdist = $_[1]; + my $previous = 'x'; + my @localcluster = (); + my $cluster_starts = (); + my $cluster_ends = (); + my $localcluster_start = (); + my $localcluster_end = (); + my @record_cluster = (); + my @string = split(/\!/, $_[0]); + my $zerolength=0; + + for my $pos_pos (1 ... $#string){ + my $pos = $string[$pos_pos]; +# print $pos, "\n"; + if ($continue == 0 && $pos eq "x") {next;} + + if ($continue == 1 && $pos eq "x" && $zerolength <= $clusterdist){ + if ($zerolength == 0) {$localcluster_end = $pos_pos-1}; + $zerolength++; + $continue = 1; + } + + if ($continue == 1 && $pos eq "x" && $zerolength > $clusterdist) { + $zerolength = 0; + $continue = 0; + my %seen; + my @uniqed = grep !$seen{$_}++, @localcluster; +# print "caught cluster : @uniqed \n"; + push(@mapped_clusters, [@uniqed]); +# print "clustered:\n@uniqed\n"; + @localcluster = (); + @record_cluster = (); + + } + + if ($pos ne "x"){ + $zerolength = 0; + $continue = 1; + $pos =~ s/x,//g; + my @entries = split(/,/,$pos); + $localcluster_end = 0; + $localcluster_start = 0; + push(@record_cluster,$pos); + + if ($continue == 0){ + @localcluster = (); + @localcluster = (@localcluster, @entries); + $localcluster_start = $pos_pos; + } + + if ($continue == 1 ) { + @localcluster = (@localcluster, @entries); + } + } + } + + if (scalar(@localcluster) > 0){ + my %seen; + my @uniqed = grep !$seen{$_}++, @localcluster; + # print "caught cluster : @uniqed \n"; + push(@mapped_clusters, [@uniqed]); + # print "clustered:\n@uniqed\n"; + @localcluster = (); + @record_cluster = (); + } + + my @returner = (); + + foreach my $clust (@mapped_clusters){ + my @localclust = @$clust; + my @result = (); + foreach my $clustparts (@localclust){ + push(@result,$clustparts); + } + push(@returner , join(".",@result)); + } +# print "returnig: ", join(",",@returner), "\n"; + return join(",",@returner); +} +#xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx + +#xxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx + +sub MakeTrees{ + my $tree = $_[0]; + my @parts=($tree); +# my @parts=(); + + while (1){ + $tree =~ s/^\(//g; + $tree =~ s/\)$//g; + my @arr = (); + + if ($tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)\)$/){ + @arr = $tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)$/; + $tree = $2; + push @parts, $tree; + } + elsif ($tree =~ /^\(([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/){ + @arr = $tree =~ /^([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/; + $tree = $1; + push @parts, $tree; + } + elsif ($tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_]+)$/){ + last; + } + } + return @parts; +} + +#xxxxxxxxxxxxxx qualityFilter xxxxxxxxxxxxxxxxxxxxxxxxxxxx qualityFilter xxxxxxxxxxxxxxxxxxxxxxxxxxxx qualityFilter xxxxxxxxxxxxxxxxxxxxxxxxxxxx + +sub qualityFilter{ + my $unmaskedorthfile = $_[0]; + my $seqfile = $_[1]; + my $maskedorthfile = $_[2]; + my $filteredout = $maskedorthfile."_residue"; + open (PMORTH, "<$unmaskedorthfile") or die "Cannot open unmaskedorthfile file: $unmaskedorthfile: $!"; + + my %keyhash = (); + + while (my $line = <PMORTH>){ + my $key = join("\t", $1,$2,$3,$4) if $line =~ /($focalspec)\s+([a-zA-Z0-9\-_]+)\s+([0-9]+)\s+([0-9]+)/; + push @{$keyhash{$key}}, $line; + } + + open (SEQ, "<$seqfile") or die "Cannot open seqfile file: $seqfile: $!"; + open (MORTH, ">$maskedorthfile") or die "Cannot open maskedorthfile file: $maskedorthfile: $!"; + open (RES, ">$filteredout") or die "Cannot open filteredout file: $filteredout: $!"; + + + + while (my $line = <SEQ>){ + chomp $line; + if ($line =~ /($focalspec)\s+([a-zA-Z0-9\-_]+)\s+([0-9]+)\s+([0-9]+)/){ + my $key = join("\t", $1,$2,$3,$4); + next if !exists $keyhash{$key}; + my @orths = @{$keyhash{$key}} if exists $keyhash{$key}; + delete $keyhash{$key}; + + my $sine = <SEQ>; + + foreach my $orth (@orths){ + #print "-----------------------------------------------------------------\n"; + #print $orth; + my $orthcopy = $orth; + $orth =~ s/^>//; + my @parts = split(/>/,$orth); + + my @starts = (); + my @ends = (); + + foreach my $part (@parts){ + my $no_of_species = adjustCoordinates($part); + my @pields = split(/\t/,$part); + + # print "pields = @pields .. no_of_species = $no_of_species .. startcord = $pields[$startcord]\n"; + + push @starts, $pields[$startcord]; + push @ends, $pields[$endcord]; + } + + #print "starts = @starts ... ends = @ends\n"; + + my $leftend = smallest_number(@starts)-10; + my $rightend = largest_number(@ends)+10; + + my $maskarea = substr($sine, $leftend, $rightend-$leftend+1); + print RES $orth if $maskarea =~ /#/; + + + next if $maskarea =~ /#/; + + print MORTH $orthcopy; + } + } + else{ + next; + } + + + } + +# print "UNDONE: ", scalar(keys %keyhash),"\n"; +# print MORTH "UNDONE: ", scalar(keys %keyhash),"\n"; + +} + +sub adjustCoordinates{ + my $line = $_[0]; + my $no_of_species = $line =~ s/(chr[0-9a-zA-Z]+)|(Contig[0-9a-zA-Z\._\-]+)|(scaffold[0-9a-zA-Z\._\-]+)|(supercontig[0-9a-zA-Z\._\-]+)/x/ig; + my @got = ($line =~ s/(chr[0-9a-zA-Z]+)|(Contig[0-9a-zA-Z\._\-]+)/x/g); +# print "line = $line\n"; + $infocord = 2 + (4*$no_of_species) - 1; + $typecord = 2 + (4*$no_of_species) + 1 - 1; + $motifcord = 2 + (4*$no_of_species) + 2 - 1; + $gapcord = $motifcord+1; + $startcord = $gapcord+1; + $strandcord = $startcord+1; + $endcord = $strandcord + 1; + $microsatcord = $endcord + 1; + $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; + $interr_poscord = $microsatcord + 3; + $no_of_interruptionscord = $microsatcord + 4; + $interrcord = $microsatcord + 2; +# print "$line\n startcord = $startcord, and endcord = $endcord and no_of_species = $no_of_species\n" if $line !~ /calJac/i; + return $no_of_species; +} + + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/multispecies_MicrosatDataGenerator_interrupted_GALAXY.xml Tue Aug 20 11:02:09 2013 -0400 @@ -0,0 +1,93 @@ +<tool id="multispecies_orthologous_microsats" name="Extract orthologous microsatellites" version="1.0.0"> + <description> for multiple (>2) species alignments</description> + <command interpreter="perl"> + multispecies_MicrosatDataGenerator_interrupted_GALAXY.pl + $input1 + $input2 + $out_file1 + $thresholds + $species + "$treedefinition" + $separation + + </command> + <inputs> + <page> + <param format="maf" name="input1" type="data" label="Select unfiltered MAF alignments" help= "NOTE: Currently users are requested to select only the alignments that contain five, four or three species' genomes. )"/> + <param format="maf" name="input2" type="data" label="Select the filtered version of above MAF alignments" help= "NOTE: Please use the tool 'Filter nucleotides' to filter nucleotides based on quality, in multiple species. When using the Filter nucleotide tool, ensure that you click 'Select All' for the option 'Mask Species')"/> + <param name="separation" size="10" type="integer" value="10" label="Minimum base pair distance between adjacent microsatellite blocks" + help="A value of 10 means: Adjacent microsatellites separated by less than 10 base pairs will be excluded from the output."/> + <param name="thresholds" size="15" type="text" value="9,10,12,12" label="Minimum Threshold for the number of repeats for microsatellites" + help="A value of 9,10,12,12 means: All monos having fewer than 9 repeats, dis having fewer than 5 repeats, tris having fewer than 4 repeats, tetras having fewer than 3 repeats will be excluded from the output."/> + <param name="species" type="select" label="Select species" display="checkboxes" multiple="true" help="NOTE: Currently users are requested to select one of these three combinations: hg18-panTro2-ponAbe2, hg18-panTro2-ponAbe2-rheMac2 or hg18-panTro2-ponAbe2-rheMac2-calJac1"> + <options> + <filter type="data_meta" ref="input1" key="species" /> + </options> + </param> + <param name="treedefinition" size="200" type="text" value = "((((hg18,panTro2),ponAbe2),rheMac2),calJac1)" label="Tree definition of all species above whether or not selected for microsatellite extraction" + help="For example: ((((hg18,panTro2),ponAbe2),rheMac2),calJac1)"/> + </page> + </inputs> + <outputs> + <data format="txt" name="out_file1" metadata_source="input1"/> + </outputs> + <requirements> + <requirement type="binary">bx-sputnik</requirement> + </requirements> + <tests> + <test> + <param name="input1" value="regVariation/microsatellite/Galaxy17_unmasked_short.maf.gz"/> + <param name="input2" value="regVariation/microsatellite/Galaxy17_masked_short.maf.gz"/> + <param name="thresholds" value="9,10,12,12"/> + <param name="species" value="hg18,panTro2,ponAbe2,rheMac2,calJac1"/> + <param name="treedefinition" value="((((hg18,panTro2),ponAbe2),rheMac2),calJac1)"/> + <param name="separation" value="10"/> + <output name="out_file1" file="regVariation/microsatellite/Galaxy17_short_raw.txt"/> + </test> + </tests> + + <help> + +.. class:: infomark + +**What it does** + +This tool finds ortholgous microsatellite blocks between aligned species + +----- + +.. class:: warningmark + +**Note** + +A non-tabular format is created in which each row contains all information pertaining to a microsatellite locus from multiple species in the alignment. +The rows read like this: + +>hg18 15 hg18 chr22 16092941 16093413 panTro2 chr22 16103944 16104421 ponAbe2 chr22 13797750 13798215 rheMac2 chr10 61890946 61891409 calJac1 Contig6986 140254 140728 mononucleotide A 0 13 + 29 aaaaa------aaaAAA >rheMac2 15 hg18 chr22 16092941 16093413 panTro2 chr22 16103944 16104421 ponAbe2 chr22 13797750 13798215 rheMac2 chr10 61890946 61891409 calJac1 Contig6986 140254 140728 mononucleotide A 0 13 + 29 aaaaaaaa---AAAAAA + +Information from each species starts with an ">" followed by the species name, for instance, ">rheMac2". Below we describe all information listed for a microsatellite sequence in each species. + +After the species tag the alignemnt number is listed. +What follows is details of the alignment block from all the species, including the chromosome number, start and end coordinates in each species. For instance: + +hg18 chr22 16092941 16093413 panTro2 chr22 16103944 16104421 ponAbe2 chr22 13797750 13798215 rheMac2 chr10 61890946 61891409 calJac1 Contig6986 140254 140728 + +suggests that the alignment block as five species: hg18, panTro2, ponAbe2, rheMac2 and calJac1. + +Then the type of microsatellite is written, for instance, "mononucleotide". + +Then the microsatellite motif. + +Then the number of gaps in the alignment, in the respective species (as noted above, rheMac2 in this case). + +Then the start coordinate, the strand, and the end coordinate WITHIN the alignment block. + +At the end is listed the microsatellite sequence. + +If the microsatellite contains interruptions (which are not important for this tool), then the interruptions' information will be written out after the microsatellite sequence. + + +</help> + + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/regVariation/microsatellite/Galaxy17_short_raw.txt Tue Aug 20 11:02:09 2013 -0400 @@ -0,0 +1,5 @@ +>rheMac2 1 hg18 chr22 16115000 16115212 panTro2 chr22 16131432 16131644 ponAbe2 chr22 13834301 13834513 rheMac2 chr10 61914657 61914870 calJac1 Contig13114 14954 15124 mononucleotide A 1 198 + 206 aaaaaaaaa +>calJac1 4 hg18 chr22 16118223 16119141 panTro2 chr22 16134653 16135572 ponAbe2 chr22 13837634 13838553 rheMac2 chr10 61918520 61919424 calJac1 Contig13114 15842 16747 tetranucleotide AAAC 1 182 + 193 AAACAAACAAAC +>hg18 4 hg18 chr22 16118223 16119141 panTro2 chr22 16134653 16135572 ponAbe2 chr22 13837634 13838553 rheMac2 chr10 61918520 61919424 calJac1 Contig13114 15842 16747 mononucleotide T 7 568 + 577 TTTTTTTTTT >panTro2 4 hg18 chr22 16118223 16119141 panTro2 chr22 16134653 16135572 ponAbe2 chr22 13837634 13838553 rheMac2 chr10 61918520 61919424 calJac1 Contig13114 15842 16747 mononucleotide T 6 568 + 577 TTTTTTTTTT >ponAbe2 4 hg18 chr22 16118223 16119141 panTro2 chr22 16134653 16135572 ponAbe2 chr22 13837634 13838553 rheMac2 chr10 61918520 61919424 calJac1 Contig13114 15842 16747 mononucleotide T 5 568 + 576 TTTTTTTTT >rheMac2 4 hg18 chr22 16118223 16119141 panTro2 chr22 16134653 16135572 ponAbe2 chr22 13837634 13838553 rheMac2 chr10 61918520 61919424 calJac1 Contig13114 15842 16747 mononucleotide T 22 568 + 579 TTTTTTTTTTTT +>ponAbe2 4 hg18 chr22 16118223 16119141 panTro2 chr22 16134653 16135572 ponAbe2 chr22 13837634 13838553 rheMac2 chr10 61918520 61919424 calJac1 Contig13114 15842 16747 tetranucleotide GAGG 8 821 + 833 GAGGGAGGGAGGG +>hg18 5 hg18 chr22 16119141 16119363 panTro2 chr22 16135572 16135794 ponAbe2 chr22 13838553 13838774 rheMac2 chr10 61919424 61919654 calJac1 Contig13114 16747 16966 dinucleotide [TC][TC] 0 21 + 34 [TCT]G[TCTCTCTCTC] indel/deletion G 4 1 >panTro2 5 hg18 chr22 16119141 16119363 panTro2 chr22 16135572 16135794 ponAbe2 chr22 13838553 13838774 rheMac2 chr10 61919424 61919654 calJac1 Contig13114 16747 16966 dinucleotide [TC][TC] 0 21 + 34 [TCT]G[TCTCTCTCTC] indel/deletion G 4 1 >ponAbe2 5 hg18 chr22 16119141 16119363 panTro2 chr22 16135572 16135794 ponAbe2 chr22 13838553 13838774 rheMac2 chr10 61919424 61919654 calJac1 Contig13114 16747 16966 dinucleotide [TC][TC] 0 21 + 34 [TCT]G[TCTCTCTCTC] indel/deletion G 4 1 >rheMac2 5 hg18 chr22 16119141 16119363 panTro2 chr22 16135572 16135794 ponAbe2 chr22 13838553 13838774 rheMac2 chr10 61919424 61919654 calJac1 Contig13114 16747 16966 dinucleotide [TC][TC] 0 21 + 34 [TCT]G[TCTCTCTCTC] indel/deletion G 4 1 >calJac1 5 hg18 chr22 16119141 16119363 panTro2 chr22 16135572 16135794 ponAbe2 chr22 13838553 13838774 rheMac2 chr10 61919424 61919654 calJac1 Contig13114 16747 16966 dinucleotide [TC][TC] 0 21 + 34 [TCT]G[TCTCTCTCTC] indel/deletion G 4 1