# HG changeset patch # User devteam # Date 1377010929 14400 # Node ID d1bf24c1350faa9870e64e1482af2ba9a41d91f9 Uploaded tool tarball. diff -r 000000000000 -r d1bf24c1350f multispecies_MicrosatDataGenerator_interrupted_GALAXY.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/multispecies_MicrosatDataGenerator_interrupted_GALAXY.pl Tue Aug 20 11:02:09 2013 -0400 @@ -0,0 +1,5606 @@ +#!/usr/bin/perl +use strict; +use warnings; +use Term::ANSIColor; +use File::Basename; +use IO::Handle; +use Cwd; +use File::Path; +use vars qw($distance @thresholds @tags $species_set @allspecies $printer $treeSpeciesNum $focalspec $mergestarts $mergeends $mergemicros $interrtypecord $microscanned $interrcord $interr_poscord $no_of_interruptionscord $infocord $typecord $startcord $strandcord $endcord $microsatcord $motifcord $sequencepos $no_of_species $gapcord $prinkter); +use File::Path qw(make_path remove_tree); +use File::Temp qw/ tempfile tempdir /; +my $tdir = tempdir( CLEANUP => 1 ); +chdir $tdir; +my $dir = getcwd; +#print "dir = $dir\n"; + +#$ENV{'PATH'} .= ':' . dirname($0); +my $date = `date`; + +my ($mafile, $mafile_sputt, $orthfile, $threshold_array, $allspeciesin, $tree_definition_all, $separation) = @ARGV; +if (!$mafile or !$mafile_sputt or !$orthfile or !$threshold_array or !$separation or !$tree_definition_all or !$allspeciesin) { die "missing arguments\n"; } + +$tree_definition_all =~ s/\s+//g; +$threshold_array =~ s/\s+//g; +$allspeciesin =~ s/\s+//g; +#------------------------------------------------------------------------------- +# WHICH SPUTNIK USED? +my $sputnikpath = (); +$sputnikpath = "sputnik_lowthresh_MATCH_MIN_SCORE3" ; +#$sputnikpath = "/Users/ydk/work/rhesus_microsat/codes/./sputnik_Mac-PowerPC"; +#print "sputnik_Mac-PowerPC non-existant\n" if !-e $sputnikpath; +#exit if !-e $sputnikpath; +#$sputnikpath = "bx-sputnik" ; +#print "ARGV input = @ARGV\n"; +#print "ARGV input :\n mafile=$mafile\n orthfile=$orthfile\n threshold_array=$threshold_array\n species_set=$species_set\n tree_definition=$tree_definition\n separation=$separation\n"; +#------------------------------------------------------------------------------- +# RUNFILE +#------------------------------------------------------------------------------- +$distance = 1; #bp +$distance++; +my @tree_definitions=MakeTrees($tree_definition_all); +my $allspeciesset = $tree_definition_all; +$allspeciesset =~ s/[\(\) ]+//g; +@allspecies = split(/,/,$allspeciesset); + +my @outputfiles = (); +my $round = 0; +#my $tdir = tempdir( CLEANUP => 0 ); +#chdir $tdir; + +foreach my $tree_definition (@tree_definitions){ + my @commas = ($tree_definition =~ /,/g) ; + #print "commas = @commas\n"; ; + next if scalar(@commas) <= 1; + #print "species_set = $species_set\n"; + $treeSpeciesNum = scalar(@commas) + 1; + $species_set = $tree_definition; + $species_set =~ s/[\)\( ;]+//g; + #print "species_set = $species_set\n"; ; + + $round++; + #------------------------------------------------------------------------------- + # MICROSATELLITE THRESHOLD SETTINGS (LENGTH, BP) + $threshold_array=~ s/,/_/g; + my @thresharr = split("_",$threshold_array); + @thresholds=@thresharr; + #my $threshold_array = join("_",($mono_threshold, $di_threshold, $tri_threshold, $tetra_threshold)); + #print "current dit=$dir\n"; + #------------------------------------------------------------------------------- + # CREATE AXT FILES IN FORWARD AND REVERSE ORDERS IF NECESSARY + my @chrfiles=(); + + #my $mafile = "/Users/ydk/work/rhesus_microsat/results/galay/align.txt"; #$ARGV[0]; + my $chromt=int(rand(10000)); + my $p_chr=$chromt; + + + my @exactspeciesset_unarranged = split(/,/,$species_set); + $tree_definition=~s/[\)\(, ]/\t/g; + my @treespecies=split(/\t+/,$tree_definition); + my @exactspecies=(); + + foreach my $spec (@treespecies){ + foreach my $espec (@exactspeciesset_unarranged){ + push @exactspecies, $spec if $spec eq $espec; + } + } + #print "exactspecies=@exactspecies\n"; + $focalspec = $exactspecies[0]; + my $arranged_species_set=join(".",@exactspecies); + my $chr_name = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt"); + my $chr_name_sputt = join(".",("chr".$p_chr),$arranged_species_set, "net", "axt_sputt"); + #print "sending to maftoAxt_multispecies: $mafile, $tree_definition, $chr_name, $species_set .. focalspec=$focalspec \n"; + maftoAxt_multispecies($mafile, $tree_definition, $chr_name, $species_set); + maftoAxt_multispecies($mafile_sputt, $tree_definition, $chr_name_sputt, $species_set); + #print "done maf to axt conversion\n"; + my $reverse_chr_name = join(".",("chr".$p_chr."r"),$arranged_species_set, "net", "axt"); + artificial_axdata_inverter ($chr_name, $reverse_chr_name); + #print "reverse_chr_name=$reverse_chr_name\n"; + #------------------------------------------------------------------------------- + # FIND THE CORRESPONDING CHIMP CHROMOSOME FROM FILE ORTp_chrS.TXT + foreach my $direct ("reverse_direction","forward_direction"){ + $p_chr=$chromt; + #print "direction = $direct\n"; + $p_chr = $p_chr."r" if $direct eq "reverse_direction"; + $p_chr = $p_chr if $direct eq "forward_direction"; + my $config = $species_set; + $config=~s/,/./g; + my @orgs = split(/\./,$arranged_species_set); + #print "ORGS= @orgs\n"; + my @tag=@orgs; + + + my $tags = join(",", @tag); + my @tags=@tag; + chomp $p_chr; + $tags = join("_", split(/,/, $tags)); + my $pchr = "chr".$p_chr; + + my $ptag = $orgs[0]."-".$pchr.".".join(".",@orgs[1 ... scalar(@orgs)-1])."-".$threshold_array; + my @sp_tags = (); + +# print "$ptag _ orthfile\n"; ; + #print "orgs=@orgs, pchr=$pchr, hence, ptag = $ptag\n"; + foreach my $sp (@tag){ + push(@sp_tags, ($sp.".".$ptag)); + } + + my $preptag = $orgs[0]."-".$pchr.".".join(".",@orgs[1 ... scalar(@orgs)-1]); + my @presp_tags = (); + + foreach my $sp (@tag){ + push(@presp_tags, ($sp.".".$preptag)); + } + + my $resultdir = ""; + my $orthdir = ""; + my $filtereddir = ""; + my $pipedir = ""; + + my @title_queries = (); + push(@title_queries, "^[0-9]+"); + my $sep="\\s"; + for my $or (0 ... $#orgs){ + my $title = join($sep, ($orgs[$or], "[A-Za-z_]+[0-9a-zA-Z]+", "[0-9]+", "[0-9]+", "[\\-\\+]")); + #$title =~ s/chr\\+\\s+\+/chr/g; + push(@title_queries, $title); + } + my $title_query = join($sep, @title_queries); + #print "title_queries=@title_queries\n"; + #print "query = >$title_query<\n"; + #print "orgs = @orgs\n"; + #------------------------------------------------------------------------------- + # GET AXTNET FILES, EDIT THEM AND SPLIT THEM INTO HUMAN AND CHIMP INPUT FILES + my $t1input = $pchr.".".$arranged_species_set.".net.axt"; + + my @t1outputs = (); + + foreach my $sp (@presp_tags){ + push(@t1outputs, $sp."_gap_op"); + } + + + + multi_species_t1($t1input,$tags,(join(",", @t1outputs)), $title_query); + #print "t1outputs=@t1outputs\n"; + #print "done t1\n"; ; + #------------------------------------------------------------------------------- + #START T2.PL + + my $stag = (); my $tag1 = (); my $tag2 = (); my $schrs = (); + + for my $t (0 ... scalar(@tags)-1){ + multi_species_t2($t1outputs[$t], $tag[$t]); + } + #------------------------------------------------------------------------------- + #START T2.2.PL + + my @temp_tags = @tag; + + foreach my $sp (@presp_tags){ + my $t2input = $sp."_nogap_op_unrand"; + multi_species_t2_2($t2input, shift(@temp_tags)); + } + undef (@temp_tags); + + #------------------------------------------------------------------------------- + #START SPUTNIK + + my @jobIDs = (); + @temp_tags = @tag; + my @sput_filelist = (); + + foreach my $sp (@presp_tags){ + #print "sp = $sp\n"; + my $sputnikoutput = $pipedir.$sp."_sput_op0"; + my $sputnikinput = $pipedir.$sp."_nogap_op_unrand"; + push(@sput_filelist, $sputnikinput); + my $sputnikcommand = $sputnikpath." ".$sputnikinput." > ".$sputnikoutput; + # print "$sputnikcommand\n"; + my @sputnikcommand_system = $sputnikcommand; + system(@sputnikcommand_system); + } + + #------------------------------------------------------------------------------- + #START SPUTNIK OUTPUT CORRECTOR + + foreach my $sp (@presp_tags){ + my $corroutput = $pipedir.$sp."_sput_op1"; + my $corrinput = $pipedir.$sp."_sput_op0"; + sputnikoutput_corrector($corrinput,$corroutput); + + my $t4output = $pipedir.$sp."_sput_op2"; + multi_species_t4($corroutput,$t4output); + + my $t5output = $pipedir.$sp."_sput_op3"; + multi_species_t5($t4output,$t5output); + #print "done t5.pl for $sp\n"; + + my $t6output = $pipedir.$sp."_sput_op4"; + multi_species_t6($t5output,$t6output,scalar(@orgs)); + } + #------------------------------------------------------------------------------- + #START T9.PL FOR T10.PL AND FOR INTERRUPTED HUNTING + + foreach my $sp (@presp_tags){ + my $t9output = $pipedir.$sp."_gap_op_unrand_match"; + my $t9sequence = $pipedir.$sp."_gap_op_unrand2"; + my $t9micro = $pipedir.$sp."_sput_op4"; + t9($t9micro,$t9sequence,$t9output); + + my $t9output2 = $pipedir.$sp."_nogap_op_unrand2_match"; + my $t9sequence2 = $pipedir.$sp."_nogap_op_unrand2"; + t9($t9micro,$t9sequence2,$t9output2); + } + #print "done both t9.pl for all orgs\n"; + + #------------------------------------------------------------------------------- + # FIND COMPOUND MICROSATELLITES + + @jobIDs = (); + my $species_counter = 0; + + foreach my $sp (@presp_tags){ + my $simple_microsats=$pipedir.$sp."_sput_op4_simple"; + my $compound_microsats=$pipedir.$sp."_sput_op4_compound"; + my $input_micro = $pipedir.$sp."_sput_op4"; + my $input_seq = $pipedir.$sp."_nogap_op_unrand2_match"; + multiSpecies_compound_microsat_hunter3($input_micro,$input_seq,$simple_microsats,$compound_microsats,$orgs[$species_counter], scalar(@sp_tags), $threshold_array ); + $species_counter++; + } + + #------------------------------------------------------------------------------- + # READING AND FILTERING SIMPLE MICROSATELLITES + my $spcounter2=0; + foreach my $sp (@sp_tags){ + my $presp = $presp_tags[$spcounter2]; + $spcounter2++; + my $simple_microsats=$pipedir.$presp."_sput_op4_simple"; + my $simple_filterout = $pipedir.$sp."_sput_op4_simple_filtered"; + my $simple_residue = $pipedir.$sp."_sput_op4_simple_residue"; + multiSpecies_filtering_interrupted_microsats($simple_microsats, $simple_filterout, $simple_residue,$threshold_array,$threshold_array,scalar(@sp_tags)); + } + + #------------------------------------------------------------------------------- + # ANALYZE COMPOUND MICROSATELLITES FOR BEING INTERRUPTED MICROSATS + + $species_counter = 0; + foreach my $sp (@sp_tags){ + my $presp = $presp_tags[$species_counter]; + my $compound_microsats = $pipedir.$presp."_sput_op4_compound"; + my $analyzed_simple_microsats=$pipedir.$presp."_sput_op4_compound_interrupted"; + my $analyzed_compound_microsats=$pipedir.$presp."_sput_op4_compound_pure"; + my $seq_file = $pipedir.$presp."_nogap_op_unrand2_match"; + multiSpecies_compound_microsat_analyzer($compound_microsats,$seq_file,$analyzed_simple_microsats,$analyzed_compound_microsats,$orgs[$species_counter], scalar(@sp_tags)); + $species_counter++; + } + #------------------------------------------------------------------------------- + # REANALYZE COMPOUND MICROSATELLITES FOR PRESENCE OF SIMPLE ONES WITHIN THEM.. + $species_counter = 0; + + foreach my $sp (@sp_tags){ + my $presp = $presp_tags[$species_counter]; + my $compound_microsats = $pipedir.$presp."_sput_op4_compound_pure"; + my $compound_interrupted = $pipedir.$presp."_sput_op4_compound_clarifiedInterrupted"; + my $compound_compound = $pipedir.$presp."_sput_op4_compound_compound"; + my $seq_file = $pipedir.$presp."_nogap_op_unrand2_match"; + multiSpecies_compoundClarifyer($compound_microsats,$seq_file,$compound_interrupted,$compound_compound,$orgs[$species_counter], scalar(@sp_tags), "2_4_6_8", "3_4_6_8", "2_4_6_8"); + $species_counter++; + } + #------------------------------------------------------------------------------- + # READING AND FILTERING SIMPLE AND COMPOUND MICROSATELLITES + $species_counter = 0; + + foreach my $sp (@sp_tags){ + my $presp = $presp_tags[$species_counter]; + + my $simple_microsats=$pipedir.$presp."_sput_op4_compound_clarifiedInterrupted"; + my $simple_filterout = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_filtered"; + my $simple_residue = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_residue"; + multiSpecies_filtering_interrupted_microsats($simple_microsats, $simple_filterout, $simple_residue,$threshold_array,$threshold_array,scalar(@sp_tags)); + + my $simple_microsats2 = $pipedir.$presp."_sput_op4_compound_interrupted"; + my $simple_filterout2 = $pipedir.$sp."_sput_op4_compound_interrupted_filtered"; + my $simple_residue2 = $pipedir.$sp."_sput_op4_compound_interrupted_residue"; + multiSpecies_filtering_interrupted_microsats($simple_microsats2, $simple_filterout2, $simple_residue2,$threshold_array,$threshold_array,scalar(@sp_tags)); + + my $compound_microsats=$pipedir.$presp."_sput_op4_compound_compound"; + my $compound_filterout = $pipedir.$sp."_sput_op4_compound_compound_filtered"; + my $compound_residue = $pipedir.$sp."_sput_op4_compound_compound_residue"; + multispecies_filtering_compound_microsats($compound_microsats, $compound_filterout, $compound_residue,$threshold_array,$threshold_array,scalar(@sp_tags)); + $species_counter++; + } + #print "done filtering both simple and compound microsatellites \n"; + + #------------------------------------------------------------------------------- + + my @combinedarray = (); + my @combinedarray_indicators = ("mononucleotide", "dinucleotide", "trinucleotide", "tetranucleotide"); + my @combinedarray_tags = ("mono", "di", "tri", "tetra"); + $species_counter = 0; + + foreach my $sp (@sp_tags){ + my $simple_interrupted = $pipedir.$sp."_simple_analyzed_simple"; + push @{$combinedarray[$species_counter]}, $pipedir.$sp."_simple_analyzed_simple_mono", $pipedir.$sp."_simple_analyzed_simple_di", $pipedir.$sp."_simple_analyzed_simple_tri", $pipedir.$sp."_simple_analyzed_simple_tetra"; + $species_counter++; + } + + #------------------------------------------------------------------------------- + # PUT TOGETHER THE INTERRUPTED AND SIMPLE MICROSATELLITES BASED ON THEIR MOTIF SIZE FOR FURTHER EXTENTION + my $sp_counter = 0; + foreach my $sp (@sp_tags){ + my $analyzed_simple = $pipedir.$sp."_sput_op4_compound_interrupted_filtered"; + my $clarifyed_simple = $pipedir.$sp."_sput_op4_compound_clarifiedInterrupted_filtered"; + my $simple = $pipedir.$sp."_sput_op4_simple_filtered"; + my $simple_analyzed_simple = $pipedir.$sp."_simple_analyzed_simple"; + `cat $analyzed_simple $clarifyed_simple $simple > $simple_analyzed_simple`; + for my $i (0 ... 3){ + `grep "$combinedarray_indicators[$i]" $simple_analyzed_simple > $combinedarray[$sp_counter][$i]`; + } + $sp_counter++; + } + #print "\ndone grouping interrupted & simple microsats based on their motif size for further extention\n"; + + #------------------------------------------------------------------------------- + # BREAK CHROMOSOME INTO PARTS OF CERTAIN NO. CONTIGS EACH, FOR FUTURE SEARCHING OF INTERRUPTED MICROSATELLITES + # ESPECIALLY DI, TRI AND TETRANUCLEOTIDE MICROSATELLITES + @temp_tags = @sp_tags; + my $increment = 1000000; + my @splist = (); + my $targetdir = $pipedir; + $species_counter=0; + + foreach my $sp (@sp_tags){ + my $presp = $presp_tags[$species_counter]; + $species_counter++; + my $localtag = shift @temp_tags; + my $locallist = $targetdir.$localtag."_".$p_chr."_list"; + push(@splist, $locallist); + my $input = $pipedir.$presp."_nogap_op_unrand2_match"; + chromosome_unrand_breaker($input,$targetdir,$locallist,$increment, $localtag, $pchr); + } + + + my @unionarray = (); + #print "splist=@splist\n"; + #------------------------------------------------------------------------------- + # FIND INTERRUPTED MICROSATELLITES + + $species_counter = 0; + + for my $i (0 .. $#combinedarray){ + + @jobIDs = (); + open (JLIST1, "$splist[$i]") or die "Cannot open file $splist[$i]: $!"; + + while (my $sp1 = ){ + #print "$splist[$i]: sp1=$sp1\n"; + chomp $sp1; + + for my $j (0 ... $#combinedarray_tags){ + my $interr = $sp1."_interr_".$combinedarray_tags[$j]; + my $simple = $sp1."_simple_".$combinedarray_tags[$j]; + push @{$unionarray[$i]}, $interr, $simple; + multiSpecies_interruptedMicrosatHunter($combinedarray[$i][$j],$sp1,$interr ,$simple, $orgs[$species_counter], scalar(@sp_tags), "3_4_6_8"); + } + } + $species_counter++; + } + close JLIST1; + #------------------------------------------------------------------------------- + # REUNION AND ZIPPING BEFORE T10.PL + + my @allarray = (); + + for my $i (0 ... $#sp_tags){ + my $localfile = $pipedir.$sp_tags[$i]."_allmicrosats"; + unlink $localfile if -e $localfile; + push(@allarray, $localfile); + + my $unfiltered_localfile= $localfile."_unfiltered"; + my $residue_localfile= $localfile."_residue"; + + unlink $unfiltered_localfile; + #unlink $unfiltered_localfile; + for my $j (0 ... $#{$unionarray[$i]}){ + #print "listing files for species $i and list number $j= \n$unionarray[$i][$j] \n"; + `cat $unionarray[$i][$j] >> $unfiltered_localfile`; + unlink $unionarray[$i][$j]; + } + + multiSpecies_filtering_interrupted_microsats($unfiltered_localfile, $localfile, $residue_localfile,$threshold_array,$threshold_array,scalar(@sp_tags) ); + my $analyzed_compound = $pipedir.$sp_tags[$i]."_sput_op4_compound_compound_filtered"; + my $simple_residue = $pipedir.$sp_tags[$i]."_sput_op4_simple_residue"; + my $compound_residue = $pipedir.$sp_tags[$i]."_sput_op4_compound_residue"; + + `cat $analyzed_compound >> $localfile`; + } + #------------------------------------------------------------------------------- + # MERGING MICROSATELLITES THAT ARE VERY CLOSE TO EACH OTHER, INCLUDING THOSE FOUND BY SEARCHING IN 2 OPPOSIT DIRECTIONS + + my $toescape=0; + + + for my $i (0 ... $#sp_tags){ + my $localfile = $pipedir.$sp_tags[$i]."_allmicrosats"; + $localfile =~ /$focalspec\-(chr[0-9a-zA-Z]+)\./; + my $direction = $1; + #print "localfile = $localfile , direction = $direction\n"; + # `gzip $reverse_chr_name` if $direction =~ /chr[0-9a-zA-Z]+r/ && $switchboard{"deleting_processFiles"} != 1; + $toescape =1 if $direction =~ /chr[0-9a-zA-Z]+r/; + last if $direction =~ /chr[0-9a-zA-Z]+r/; + my $nogap_sequence = $pipedir.$presp_tags[$i]."_nogap_op_unrand2_match"; + my $gap_sequence = $pipedir.$presp_tags[$i]."_gap_op_unrand_match"; + my $reverselocal = $localfile; + $reverselocal =~ s/\-chr([0-9a-zA-Z]+)\./-chr$1r./g; + merge_interruptedMicrosats($nogap_sequence,$localfile, $reverselocal ,scalar(@sp_tags)); + #------------------------------------------------------------------------------- + my $forward_separate = $localfile."_separate"; + my $reverse_separate = $reverselocal."_separate"; + my $diff = $forward_separate."_diff"; + my $miss = $forward_separate."_miss"; + my $common = $forward_separate."_common"; + forward_reverse_sputoutput_comparer($nogap_sequence,$forward_separate, $reverse_separate, $diff, $miss, $common ,scalar(@sp_tags)); + #------------------------------------------------------------------------------- + my $symmetrical_file = $localfile."_symmetrical"; + my $merged_file = $localfile."_merged"; + #print "cating: $merged_file $common into -> $symmetrical_file \n"; + `cat $merged_file $common > $symmetrical_file`; + #------------------------------------------------------------------------------- + my $t10output = $symmetrical_file."_fin_hit_all_2"; + new_multispecies_t10($gap_sequence, $symmetrical_file, $t10output, join(".", @orgs)); + #------------------------------------------------------------------------------- + } + next if $toescape == 1; + #------------------------------------------------------------------------------------------------ + # BRINGING IT ALL TOGETHER: FINDING ORTHOLOGOUS MICROSATELLITES AMONG THE SPECIES + + + my @micros_array = (); + my $sampletag = (); + for my $i (0 ... $#sp_tags){ + my $finhitFile = $pipedir.$sp_tags[$i]."_allmicrosats_symmetrical_fin_hit_all_2"; + push(@micros_array, $finhitFile); + $sampletag = $sp_tags[$i]; + } + #$sampletag =~ s/^([A-Z]+\.)/ORTH_/; + #$sampletag = $sampletag."_monoThresh-".$mono_threshold."bp"; + my $orthfiletemp = $ptag."_orthfile"; + my $orthanswer = multiSpecies_orthFinder4($t1input, join(":",@micros_array), $orthfiletemp, join(":", @orgs), $separation); + + my $maskedorthfiletemp = $ptag."_orthfile_masked"; + qualityFilter ($orthfiletemp, $chr_name_sputt, $maskedorthfiletemp); + + push @outputfiles , $maskedorthfiletemp; + } + $date = `date`; +} + +`cat @outputfiles > $orthfile`; + +my $rootdir = $dir; +$rootdir =~ s/\/[A-Za-z0-9\-_]+$//; +chdir $rootdir; +remove_tree($dir); + +#print "date = $date\n"; +#remove_tree($tdir); +#------------------------------------------------------------------------------------------------ +#------------------------------------------------------------------------------------------------ +#------------------------------------------------------------------------------------------------ +#------------------------------------------------------------------------------------------------ + +#xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx + +sub maftoAxt_multispecies { + #print "in maftoAxt_multispecies : got @_\n"; + my $fname=$_[0]; + open(IN,"<$_[0]") or die "Cannot open $_[0]: $! \n"; + my $treedefinition = $_[1]; + open(OUT,">$_[2]") or die "Cannot open $_[2]: $! \n"; + my $counter = 0; + my $exactspeciesset = $_[3]; + my @exactspeciesset_unarranged = split(/,/,$exactspeciesset); + + $treedefinition=~s/[\)\(, ]/\t/g; + my @species=split(/\t+/,$treedefinition); + my @exactspecies=(); + + foreach my $spec (@species){ + foreach my $espec (@exactspeciesset_unarranged){ + push @exactspecies, $spec if $spec eq $espec; + } + } + #print "exactspecies=@exactspecies\n"; + + ########### + my $select = 2; + #select = 1 if all species need sequences to be present for each block otherwise, it is 0 + #select = 2 only the allowed set make up the alignment. use the removeset + # information to detect alignmenets that have other important genomes aligned. + ########### + my @allowedset = (); + @allowedset = split(/;/,allowedSetOfSpecies(join("_",@species))) if $select == 0; + @allowedset = join("_",0,@species) if $select == 1; + #print "species = @species , allowedset =",join("\n", @allowedset) ," \n"; + @allowedset = join("_",0,@exactspecies) if $select == 2; + #print "allowedset = @allowedset and exactspecies = @exactspecies\n"; + + my $start = 0; + my @sequences = (); + my @titles = (); + my $species_counter = "0"; + my $countermatch = 0; + my $outsideSpecies=0; + + while(my $line = ){ +# print $line; + next if $line =~ /^#/; + next if $line =~ /^i/; + chomp $line; + my @fields = split(/\s+/,$line); + chomp $line; + if ($line =~ /^a /){ + $start = 1; + } + + if ($line =~ /^s /){ + + foreach my $sp (@allspecies){ +# print "checking species $sp\n"; + if ($fields[1] =~ /$sp/){ + $species_counter = $species_counter."_".$sp; + push(@sequences, $fields[6]); + my @sp_info = split(/\./,$fields[1]); + my $title = join(" ",@sp_info, $fields[2], ($fields[2]+$fields[3]), $fields[4]); + push(@titles, $title); +# print "species_counter = $species_counter\n"; + } + } + } + + if (($line !~ /^a/) && ($line !~ /^s/) && ($line !~ /^#/) && ($line !~ /^i/) && ($start = 1)){ +# print "species_counter = $species_counter\n"; + my $arranged = reorderSpecies($species_counter, @allspecies); + my $stopper = 1; + my $arrno = 0; + +# print "checking if ", scalar(@sequences), " match @exactspecies allowedset=@allowedset\n"; + if (scalar(@sequences) == scalar(@exactspecies)){ + foreach my $set (@allowedset){ +# print "testing $arranged against $set\n"; + if ($arranged eq $set){ + $stopper = 0; last; + } + $arrno++; + } + } + else{ + $stopper = 1; + } + + + if ($stopper == 0) { + @titles = split ";", orderInfo(join(";", @titles), $species_counter, $arranged) if $species_counter ne $arranged; + @sequences = split ";", orderInfo(join(";", @sequences), $species_counter, $arranged) if $species_counter ne $arranged; + my $filteredseq = filter_gaps(@sequences); + + if ($filteredseq ne "SHORT"){ + #print "printing"; ; + $counter++; + print OUT join (" ",$counter, @titles), "\n"; + print OUT $filteredseq, "\n"; + print OUT "\n"; + $countermatch++; + } + } + else{ #print "nexting\n";; + } + + @sequences = (); @titles = (); $start = 0;$species_counter = "0"; + next; + + } + } +# print "countermatch = $countermatch\n"; +} + +sub reorderSpecies{ + my @inarr=@_; + my $currSpecies = shift (@inarr); + my $ordered_species = 0; + my @species=@inarr; + #print "species = @species\n"; + foreach my $order (@species){ + $ordered_species = $ordered_species."_".$order if $currSpecies=~ /$order/; + } + return $ordered_species; + +} + +sub filter_gaps{ + my @sequences = @_; +# print "sequences sent are @sequences\n"; + my $seq_length = length($sequences[0]); + my $seq_no = scalar(@sequences); + my $allgaps = (); + for (1 ... $seq_no){ + $allgaps = $allgaps."-"; + } + + my @seq_array = (); + my $seq_counter = 0; + foreach my $seq (@sequences){ +# my @sequence = split(/\s*/,$seq); + $seq_array[$seq_counter] = [split(/\s*/,$seq)]; +# push @seq_array, [@sequence]; + $seq_counter++; + } + my $g = 0; + while ( $g < $seq_length){ + last if (!exists $seq_array[0][$g]); + my $bases = (); + for my $u (0 ... $#seq_array){ + $bases = $bases.$seq_array[$u][$g]; + } +# print $bases, "\n"; + if ($bases eq $allgaps){ +# print "bases are $bases, position is $g \n"; + for my $seq (@seq_array){ + splice(@$seq , $g, 1); + } + } + else { + $g++; + } + } + + my @outs = (); + + foreach my $seq (@seq_array){ + push(@outs, join("",@$seq)); + } + return "SHORT" if length($outs[0]) <=100; + return (join("\n", @outs)); +} + + +sub allowedSetOfSpecies{ + my @allowed_species = split(/_/,$_[0]); + unshift @allowed_species, 0; +# print "allowed set = @allowed_species \n"; + my @output = (); + for (0 ... scalar(@allowed_species) - 4){ + push(@output, join("_",@allowed_species)); + pop @allowed_species; + } + return join(";",reverse(@output)); + +} + + +sub orderInfo{ + my @info = split(/;/,$_[0]); +# print "info = @info"; + my @old = split(/_/,$_[1]); + my @new = split(/_/,$_[2]); + shift @old; shift @new; + my @outinfo = (); + foreach my $spe (@new){ + for my $no (0 ... $#old){ + if ($spe eq $old[$no]){ + push(@outinfo, $info[$no]); + } + } + } +# print "outinfo = @outinfo \n"; + return join(";", @outinfo); +} + +#xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx xxxxxxx maftoAxt_multispecies xxxxxxx + +#xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx +sub artificial_axdata_inverter{ + open(IN,"<$_[0]") or die "Cannot open file $_[0]: $!"; + open(OUT,">$_[1]") or die "Cannot open file $_[1]: $!"; + my $linecounter=0; + while (my $line = ){ + $linecounter++; + #print "$linecounter\n"; + chomp $line; + my $final_line = $line; + my $trycounter = 0; + if ($line =~ /^[a-zA-Z\-]/){ + # while ($final_line eq $line){ + my @fields = split(/\s*/,$line); + + $final_line = join("",reverse(@fields)); + # print colored ['red'], "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/; + # $trycounter++; + # print "trying again....$trycounter : $final_line\n" if $final_line eq $line; + # } + } + + # print colored ['yellow'], "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/; + if ($line =~ /^[0-9]/){ + $line =~ s/chr([A-Z0-9a-b]+)/chr$1r/g; + $final_line = $line; + } + print OUT $final_line,"\n"; + #print "$line\n$final_line\n" if $final_line eq $line && $line !~ /chr/ && $line =~ /[a-zA-Z]/; + } + close OUT; +} +#xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx xxxxxxx artificial_axdata_inverter xxxxxxx + + +#xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx + +sub multi_species_t1 { + + my $input1 = $_[0]; + #print "@_\n"; ; + my @tags = split(/_/, $_[1]); + my @outputs = split(/,/, $_[2]); + my $title_query = $_[3]; + my @handles = (); + + open(FILEB,"<$input1")or die "Cannot open file: $input1 $!"; + my $i = 0; + foreach my $path (@outputs){ + $handles[$i] = IO::Handle->new(); + open ($handles[$i], ">$path") or die "Can't open $path : $!"; + $i++; + } + + my $curdef; + my $start = 0; + + while (my $line = ) { + if ($line =~ /^\d/){ + $line =~ s/ +/\t/g; + my @fields = split(/\s+/, $line); + if (($line =~ /$title_query/)){ + my $title = $line; + my $counter = 0; + foreach my $tag (@tags){ + $line = ; + print {$handles[$counter]} ">",$tag,"\t",$title, " ",$line; + $counter++; + } + } + else{ + foreach my $tag (@tags){ + my $tine = ; + } + } + + } + } + + foreach my $hand (@handles){ + $hand->close(); + } + + close FILEB; +} + +#xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx xxxxxxx multi_species_t1 xxxxxxx + +#xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx + +sub multi_species_t2{ + + my $input = $_[0]; + my $species = $_[1]; + my $output1 = $input."_unr"; + + #------------------------------------------------------------------------------------------ + open (FILEF1, "<$input") or die "Cannot open file $input :$!"; + open (FILEF2, ">$output1") or die "Cannot open file $output1 :$!"; + + my $line1 = ; + + while($line1){ + { + # chomp($line); + if ($line1 =~ (m/^\>$species/)){ + chomp($line1); + print FILEF2 $line1; + $line1 = ; + chomp($line1); + print FILEF2 "\t", $line1,"\n"; + } + } + $line1 = ; + } + + close FILEF1; + close FILEF2; + #------------------------------------------------------------------------------------------ + + my $output2 = $output1."and"; + my $output3 = $output1."and2"; + open(IN,"<$output1"); + open (FILEF3, ">$output2"); + open (FILEF4, ">$output3"); + + + while (){ + my $line = $_; + chomp($line); + my @fields=split (/\t/, $line); + # print $line,"\n"; ; + if($line !~ /random/){ + print FILEF3 join ("\t",@fields[0 ... scalar(@fields)-2]), "\n", $fields[scalar(@fields)-1], "\n"; + print FILEF4 join ("\t",@fields[0 ... scalar(@fields)-2]), "\t", $fields[scalar(@fields)-1], "\n"; + } + } + + + close IN; + close FILEF3; + close FILEF4; + unlink $output1; + + #------------------------------------------------------------------------------------------ + # OLD T3.PL RUDIMENT + + my $t3output = $output2; + $t3output =~ s/gap_op_unrand/nogap_op_unrand/g; + + open(IN,"<$output2"); + open(OUTA,">$t3output"); + + + while (){ + s/-//g unless /^>/; + print OUTA; + } + + close IN; + close OUTA; + #------------------------------------------------------------------------------------------ +} +#xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx xxxxxxx multi_species_t2 xxxxxxx + + +#xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxxmulti_species_t2_2 xxxxxxx +sub multi_species_t2_2{ + #print "IN multi_species_t2_2 : @_\n"; + my $input = $_[0]; + my $species = $_[1]; + my $output1 = $input."2"; + + + open (FILEF1, "<$input"); + open (FILEF2, ">$output1"); + + my $line1 = ; + + while($line1){ + { + # chomp($line); + if ($line1 =~ (m/^\>$species/)){ + chomp($line1); + print FILEF2 $line1; + $line1 = ; + chomp($line1); + print FILEF2 "\t", $line1,"\n"; + } + } + $line1 = ; + } + + close FILEF1; + close FILEF2; +} + +#xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx xxxxxxx multi_species_t2_2 xxxxxxx + + +#xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx +sub sputnikoutput_corrector{ + my $input = $_[0]; + my $output = $_[1]; + open(IN,"<$input") or die "Cannot open file $input :$!"; + open(OUT,">$output") or die "Cannot open file $output :$!"; + my $tine; + while (my $line=){ + if($line =~/length /){ + $tine = $line; + $tine =~ s/\s+/\t/g; + my @fields = split(/\t/,$tine); + if ($fields[6] > 60){ + print OUT $line; + $line = ; + + while (($line !~ /nucleotide/) && ($line !~ /^>/)){ + chomp $line; + print OUT $line; + $line = ; + } + print OUT "\n"; + print OUT $line; + } + else{ + print OUT $line; + } + } + else{ + print OUT $line; + } + } + close IN; + close OUT; +} +#xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx xxxxxxx sputnikoutput_corrector xxxxxxx + + +#xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx +sub multi_species_t4{ +# print "multi_species_t4 : @_\n"; + my $input = $_[0]; + my $output = $_[1]; + open (FILEA, "<$input"); + open (FILEB, ">$output"); + + my $line = ; + + while ($line) { + # chomp $line; + if ($line =~ />/) { + chomp $line; + print FILEB $line, "\n"; + } + + + if ($line =~ /^m/ | $line =~ /^d/ | $line =~ /^t/ | $line =~ /^p/){ + chomp $line; + print FILEB $line, " " ; + $line = ; + chomp $line; + print FILEB $line,"\n"; + } + + $line = ; + } + + + close FILEA; + close FILEB; + +} + +#xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx xxxxxxx multi_species_t4 xxxxxxx + + +#xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx +sub multi_species_t5{ + + my $input = $_[0]; + my $output = $_[1]; + + open(FILEB,"<$input"); + open(FILEC,">$output"); + + my $curdef; + + while (my $line = ) { + + if ($line =~ /^>/){ + chomp $line; + $curdef = $line; + next; + } + + if ($line =~ /^m/ | $line =~ /^d/ | $line =~ /^t/ | $line =~ /^p/){ + print FILEC $curdef," ",$line; + } + + } + + + close FILEB; + close FILEC; + +} +#xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx xxxxxxx multi_species_t5 xxxxxxx + + +#xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx +sub multi_species_t6{ + my $input = $_[0]; + my $output = $_[1]; + my $focalstrand=$_[3]; +# print "inpput = @_\n"; + open (FILE, "<$input"); + open (FILE_MICRO, ">$output"); + my $linecounter=0; + while (my $line = ){ + $linecounter++; + chomp $line; + #print "line = $line\n"; + #MONO# + $line =~ /$focalspec\s[a-zA-Z]+[0-9a-zA-Z]+\s[0-9]+\s[0-9]+\s([+\-])/; + my $strand=$1; + my $no_of_species = ($line =~ s/\s+[+\-]\s+/ /g); + #print "line = $line\n"; + my $specfieldsend = 2 + ($no_of_species*4) - 1; + my @fields = split(/\s+/, $line); + my @speciesdata = @fields[0 ... $specfieldsend]; + $line =~ /([a-z]+nucleotide)\s([0-9]+)\s:\s([0-9]+)/; + my ($tide, $start, $end) = ($1, $2, $3); + #print "no_of_species=$no_of_species.. speciesdata = @speciesdata and ($tide, $start, $end)\n"; + if($line =~ /mononucleotide/){ + print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], mono($fields[$#fields]),),"\n"; + } + #DI# + elsif($line =~ /dinucleotide/){ + print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], di($fields[$#fields]),),"\n"; + } + #TRI# + elsif($line =~ /trinucleotide/ ){ + print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], tri($fields[$#fields]),),"\n"; + } + #TETRA# + elsif($line =~ /tetranucleotide/){ + print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], tetra($fields[$#fields]),),"\n"; + } + #PENTA# + elsif($line =~ /pentanucleotide/){ + #print FILE_MICRO join("\t",@speciesdata, $tide, $start, $strand,$end, $fields[$#fields], penta($fields[$#fields]),),"\n"; + } + else{ + # print "not: @fields\n"; + } + } +# print "linecounter=$linecounter\n"; + close FILE; + close FILE_MICRO; +} + +sub mono { + my $st = $_[0]; + my $tp = unpack "A1"x(length($st)/1),$st; + my $var1 = substr($tp, 0, 1); + return join ("\t", $var1); +} +sub di { + my $st = $_[0]; + my $tp = unpack "A2"x(length($st)/2),$st; + my $var1 = substr($tp, 0, 2); + return join ("\t", $var1); +} +sub tri { + my $st = $_[0]; + my $tp = unpack "A3"x(length($st)/3),$st; + my $var1 = substr($tp, 0, 3); + return join ("\t", $var1); +} +sub tetra { + my $st = $_[0]; + my $tp = unpack "A4"x(length($st)/4),$st; + my $var1 = substr($tp, 0, 4); + return join ("\t", $var1); +} +sub penta { + my $st = $_[0]; + my $tp = unpack "A5"x(length($st)/5),$st; + my $var1 = substr($tp, 0, 5); + return join ("\t", $var1); +} + +#xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx xxxxxxx multi_species_t6 xxxxxxx + + +#xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx +sub t9{ + my $input1 = $_[0]; + my $input2 = $_[1]; + my $output = $_[2]; + + + open(IN1,"<$input1") if -e $input1; + open(IN2,"<$input2") or die "cannot open file $_[1] : $!"; + open(OUT,">$output") or die "cannot open file $_[2] : $!"; + + + my %seen = (); + my $prevkey = 0; + + if (-e $input1){ + while (my $line = ){ + chomp($line); + my @fields = split(/\t/,$line); + my $key1 = join ("_K10K1_",@fields[0,1,3,4,5]); + # print "key in t9 = $key1\n"; + $seen{$key1}++ if ($prevkey ne $key1) ; + $prevkey = $key1; + } +# print "done first hash\n"; + close IN1; + } + + while (my $line = ){ + # print $line, "**\n"; + if (-e $input1){ + chomp($line); + my @fields = split(/\t/,$line); + my $key2 = join ("_K10K1_",@fields[0,1,3,4,5]); + if (exists $seen{$key2}){ + print OUT "$line\n" ; + delete $seen{$key2}; + } + } + else { + print OUT "$line\n" ; +# print "$line\n" ; + } + } + + close IN2; + close OUT; +} +#xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx xxxxxxxxxxxxxx t9 xxxxxxxxxxxxxx + + +#xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx + + +sub multiSpecies_compound_microsat_hunter3{ + + my $input1 = $_[0]; ###### the *_sput_op4_ii file + my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = $pipedir.$ptag."_nogap_op_unrand2" + my $output1 = $_[2]; ###### plain microsatellite file + my $output2 = $_[3]; ###### compound microsatellite file + my $org = $_[4]; ###### 1 or 2 + $no_of_species = $_[5]; + #print "IN multiSpecies_compound_microsat_hunter3: @_\n"; + #my @tags = split(/\t/,$info); + sub compoundify; + open(IN,"<$input1") or die "Cannot open file $input1 $!"; + open(SEQ,"<$input2") or die "Cannot open file $input2 $!"; + open(OUT,">$output1") or die "Cannot open file $output1 $!"; + open(OUT2,">$output2") or die "Cannot open file $output2 $!"; + $infocord = 2 + (4*$no_of_species) - 1; + $startcord = 2 + (4*$no_of_species) + 2 - 1; + $strandcord = 2 + (4*$no_of_species) + 3 - 1; + $endcord = 2 + (4*$no_of_species) + 4 - 1; + $microsatcord = 2 + (4*$no_of_species) + 5 - 1; + $motifcord = 2 + (4*$no_of_species) + 6 - 1; + my $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; + + my @thresholds = ("0"); + push(@thresholds, split(/_/,$_[6])); + sub thresholdCheck; + my %micros = (); + while (my $line = ){ + # print "$org\t(chr[0-9]+)\t([0-9]+)\t([0-9])+\t \n"; + next if $line =~ /\t\t/; + if ($line =~ /^>[A-Za-z0-9_]+\s+([0-9]+)\s+([a-zA-Z0-9]+)\s([a-zA-Z]+[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { + my $key = join("\t",$1, $2, $3, $4, $5); + # print $key, "#-#-#-#-#-#-#-#\n"; + push (@{$micros{$key}},$line); + } + else{ + } + } + close IN; + my @deletedlines = (); + + my $linecount = 0; + + while(my $sine = ){ + my %microstart=(); + my %microend=(); + + my @sields = split(/\t/,$sine); + + my $key = (); + + if ($sine =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([a-zA-Z0-9]+)\s([a-zA-Z]+[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { + $key = join("\t",$1, $2, $3, $4, $5); + # print $key, "<-<-<-<-<-<-<-<\n"; + } + else{ + } + + if (exists $micros{$key}){ + $linecount++; + my @microstring = @{$micros{$key}}; + my @tempmicrostring = @{$micros{$key}}; + + foreach my $line (@tempmicrostring){ + my @fields = split(/\t/,$line); + my $start = $fields[$startcord]; + my $end = $fields[$endcord]; + push (@{$microstart{$start}},$line); + push (@{$microend{$end}},$line); + } + my $firstflag = 'down'; + while( my $line =shift(@microstring)){ + # print "-----------\nline = $line "; + chomp $line; + my @fields = split(/\t/,$line); + my $start = $fields[$startcord]; + my $end = $fields[$endcord]; + my $startmicro = $line; + my $endmicro = $line; + + # print "fields=@fields, start = $start end=$end, startcord=$startcord, endcord=$endcord\n"; + + delete ($microstart{$start}); + delete ($microend{$end}); + my $flag = 'down'; + my $startflag = 'down'; + my $endflag = 'down'; + my $prestart = $start - $distance; + my $postend = $end + $distance; + my @compoundlines = (); + my %compoundhash = (); + push (@compoundlines, $line); + push (@{$compoundhash{$line}},$line); + my $startrank = 1; + my $endrank = 1; + + while( ($startflag eq "down") || ($endflag eq "down") ){ + if ((($prestart < 0) && $firstflag eq "up") || (($postend > length($sields[$sequencepos])) && $firstflag eq "up") ) { +# print "coming to the end of sequence,prestart = $prestart & post end = $postend and sequence length =", length($sields[$sequencepos])," so exiting\n"; + last; + } + + $firstflag = "up"; + if ($startflag eq "down"){ + for my $i ($prestart ... $start){ + + if(exists $microend{$i}){ + chomp $microend{$i}[0]; + if(exists $compoundhash{$microend{$i}[0]}) {next;} + # print "sending from microend $startmicro, $microend{$i}[0] |||\n"; + if (identityMatch_thresholdCheck($startmicro, $microend{$i}[0], $startrank) eq "proceed"){ + push(@compoundlines, $microend{$i}[0]); + # print "accepted\n"; + my @tields = split(/\t/,$microend{$i}[0]); + $startmicro = $microend{$i}[0]; + chomp $startmicro; + $start = $tields[$startcord]; + $flag = 'down'; + $startrank++; + # print "startcompund = $microend{$i}[0]\n"; + delete $microend{$i}; + delete $microstart{$start}; + $startflag = 'down'; + $prestart = $start - $distance; + last; + } + else{ + $flag = 'up'; + $startflag = 'up'; + } + } + else{ + $flag = 'up'; + $startflag = 'up'; + } + } + } + + $endrank = $startrank; + + if ($endflag eq "down"){ + for my $i ($end ... $postend){ + + if(exists $microstart{$i} ){ + chomp $microstart{$i}[0]; + if(exists $compoundhash{$microstart{$i}[0]}) {next;} + # print "sending from microstart $endmicro, $microstart{$i}[0] |||\n"; + + if(identityMatch_thresholdCheck($endmicro,$microstart{$i}[0], $endrank) eq "proceed"){ + push(@compoundlines, $microstart{$i}[0]); + # print "accepted\n"; + my @tields = split(/\t/,$microstart{$i}[0]); + $end = $tields[$endcord]-0; + $endmicro = $microstart{$i}[0]; + $endrank++; + chomp $endmicro; + $flag = 'down'; + # print "endcompund = $microstart{$i}[0]\n"; + delete $microstart{$i}; + delete $microend{$end}; + shift @microstring; + $postend = $end + $distance; + $endflag = 'down'; + last; + } + else{ + $flag = 'up'; + $endflag = 'up'; + } + } + else{ + $flag = 'up'; + $endflag = 'up'; + } + } + } + # print "for next turn, flag status: startflag = $startflag and endflag = $endflag \n"; + } #end while( $flag eq "down") + # print "compoundlines = @compoundlines \n"; + if (scalar (@compoundlines) == 1){ + print OUT $line,"\n"; + } + if (scalar (@compoundlines) > 1){ + my $compoundline = compoundify(\@compoundlines, $sields[$sequencepos]); + # print $compoundline,"\n"; + print OUT2 $compoundline,"\n"; + } + } #end foreach my $line (@microstring){ + } #if (exists $micros{$key}){ + + + } + + close OUT; + close OUT2; +} + + +#------------------------------------------------------------------------------------------------ +sub compoundify{ + my ($compoundlines, $sequence) = @_; +# print "\nfound to compound : @$compoundlines and$sequence \n"; + my $noOfComps = @$compoundlines; +# print "Number of elements in hash is $noOfComps\n"; + my @starts; + my @ends; + foreach my $line (@$compoundlines){ +# print "compoundify.. line = $line \n"; + chomp $line; + my @fields = split(/\t/,$line); + my $start = $fields[$startcord]; + my $end = $fields[$endcord]; + # print "start = $start, end = $end \n"; + push(@starts, $start); + push(@ends,$end); + } + my @temp = @$compoundlines; + my $startline=$temp[0]; + my @mields = split(/\t/,$startline); + my $startcoord = $mields[$startcord]; + my $startgapsign=$mields[$endcord]; + my @startsorted = sort { $a <=> $b } @starts; + my @endsorted = sort { $a <=> $b } @ends; + my @intervals; + for my $end (0 ... (scalar(@endsorted)-2)){ + my $interval = substr($sequence,($endsorted[$end]+1),(($startsorted[$end+1])-($endsorted[$end])-1)); + push(@intervals,$interval); + # print "interval = $interval =\n"; + # print "substr(sequence,($endsorted[$end]+1),(($startsorted[$end+1])-($endsorted[$end])-1))\n"; + } + push(@intervals,""); + my $compoundmicrosat=(); + my $multiunit=""; + foreach my $line (@$compoundlines){ + my @fields = split(/\t/,$line); + my $component="[".$fields[$microsatcord]."]".shift(@intervals); + $compoundmicrosat=$compoundmicrosat.$component; + $multiunit=$multiunit."[".$fields[$motifcord]."]"; +# print "multiunit = $multiunit\n"; + } + my $compoundcopy = $compoundmicrosat; + $compoundcopy =~ s/\[|\]//g; + my $compoundlength = $mields[$startcord] + length($compoundcopy) - 1; + + + my $compoundline = join("\t",(@mields[0 ... $infocord], "compound",@mields[$startcord ... $startcord+1],$compoundlength,$compoundmicrosat, $multiunit)); + return $compoundline; +} + +#------------------------------------------------------------------------------------------------ + +sub identityMatch_thresholdCheck{ + my $line1 = $_[0]; + my $line2 = $_[1]; + my $rank = $_[2]; + my @lields1 = split(/\t/,$line1); + my @lields2 = split(/\t/,$line2); +# print "recieved $line1 && $line2\n motif comparison: ", length($lields1[$motifcord])," : ",length($lields2[$motifcord]),"\n"; + + if (length($lields1[$motifcord]) == length($lields2[$motifcord])){ + my $probe = $lields1[$motifcord].$lields1[$motifcord]; + #print "$probe :: $lields2[$motifcord]\n"; + return "proceed" if $probe =~ /$lields2[$motifcord]/; + #print "line recieved\n"; + if ($rank ==1){ + return "proceed" if thresholdCheck($line1) eq "proceed" && thresholdCheck($line2) eq "proceed"; + } + else { + return "proceed" if thresholdCheck($line2) eq "proceed"; + return "stop"; + } + } + else{ + if ($rank ==1){ + return "proceed" if thresholdCheck($line1) eq "proceed" && thresholdCheck($line2) eq "proceed"; + } + else { + return "proceed" if thresholdCheck($line2) eq "proceed"; + return "stop"; + } + } + return "stop"; +} +#------------------------------------------------------------------------------------------------ + +sub thresholdCheck{ + my @checkthresholds=(0,@thresholds); + #print "IN thresholdCheck: @_\n"; + my $line = $_[0]; + my @lields = split(/\t/,$line); + return "proceed" if length($lields[$microsatcord]) >= $checkthresholds[length($lields[$motifcord])]; + return "stop"; +} +#xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx multiSpecies_compound_microsat_hunter3 xxxxxxxxxxxxxx + + +#xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx + +sub multiSpecies_filtering_interrupted_microsats{ +# print "IN multiSpecies_filtering_interrupted_microsats: @_\n"; + my $unfiltered = $_[0]; + my $filtered = $_[1]; + my $residue = $_[2]; + my $no_of_species = $_[5]; + open(UNF,"<$unfiltered") or die "Cannot open file $unfiltered: $!"; + open(FIL,">$filtered") or die "Cannot open file $filtered: $!"; + open(RES,">$residue") or die "Cannot open file $residue: $!"; + + $infocord = 2 + (4*$no_of_species) - 1; + $startcord = 2 + (4*$no_of_species) + 2 - 1; + $strandcord = 2 + (4*$no_of_species) + 3 - 1; + $endcord = 2 + (4*$no_of_species) + 4 - 1; + $microsatcord = 2 + (4*$no_of_species) + 5 - 1; + $motifcord = 2 + (4*$no_of_species) + 6 - 1; + + + my @sub_thresholds = (0); + + push(@sub_thresholds, split(/_/,$_[3])); + my @thresholds = (0); + + push(@thresholds, split(/_/,$_[4])); + + while (my $line = ) { + next if $line !~ /[a-z]/; + #print $line; + chomp $line; + my @fields = split(/\t/,$line); + my $motif = $fields[$motifcord]; + my $realmotif = $motif; + #print "motif = $motif\n"; + if ($motif =~ /^\[/){ + $motif =~ s/^\[//g; + my @motifs = split(/\]/,$motif); + $realmotif = $motifs[0]; + } +# print "realmotif = $realmotif"; + my $motif_size = length($realmotif); + + my $microsat = $fields[$microsatcord]; +# print "microsat = $microsat\n"; + $microsat =~ s/^\[|\]$//sg; + my @microsats = split(/\][a-zA-Z|-]*\[/,$microsat); + + $microsat = join("",@microsats); + if (length($microsat) < $thresholds[$motif_size]) { + # print length($microsat)," < ",$thresholds[$motif_size],"\n"; + print RES $line,"\n"; next; + } + my @lengths = (); + foreach my $mic (@microsats){ + push(@lengths, length($mic)); + } + if (largest_microsat(@lengths) < $sub_thresholds[$motif_size]) { + # print largest_microsat(@lengths)," < ",$sub_thresholds[$motif_size],"\n"; + print RES $line,"\n"; next;} + else {print FIL $line,"\n"; next; + } + } + close FIL; + close RES; + +} + +sub largest_microsat{ + my $counter = 0; + my($max) = shift(@_); + foreach my $temp (@_) { + #print "finding largest array: $maxcounter \n"; + if($temp > $max){ + $max = $temp; + } + } + return($max); +} + +#xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx multiSpecies_filtering_interrupted_microsats xxxxxxxxxxxxxx + + +#xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx +sub multiSpecies_compound_microsat_analyzer{ + ####### PARAMETER ######## + ########################## + + my $input1 = $_[0]; ###### the *_sput_op4_ii file + my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match" + my $output1 = $_[2]; ###### interrupted microsatellite file, in new .interrupted format + my $output2 = $_[3]; ###### the pure compound microsatellites + my $org = $_[4]; + my $no_of_species = $_[5]; +# print "IN multiSpecies_compound_microsat_analyzer: $input1\n $input2\n $output1\n $output2\n $org\n $no_of_species\n"; + $infocord = 2 + (4*$no_of_species) - 1; + $typecord = 2 + (4*$no_of_species) + 1 - 1; + $startcord = 2 + (4*$no_of_species) + 2 - 1; + $strandcord = 2 + (4*$no_of_species) + 3 - 1; + $endcord = 2 + (4*$no_of_species) + 4 - 1; + $microsatcord = 2 + (4*$no_of_species) + 5 - 1; + $motifcord = 2 + (4*$no_of_species) + 6 - 1; + + open(IN,"<$input1") or die "Cannot open file $input1 $!"; + open(SEQ,"<$input2") or die "Cannot open file $input2 $!"; + + open(OUT,">$output1") or die "Cannot open file $output1 $!"; + open(OUT2,">$output2") or die "Cannot open file $output2 $!"; + + +# print "opened files \n"; + my %micros = (); + my $keycounter=0; + my $linecounter=0; + while (my $line = ){ + $linecounter++; + if ($line =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) { + my $key = join("\t",$1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12); + push (@{$micros{$key}},$line); + $keycounter++; + } + else{ + # print "no key\n"; + } + } + close IN; + my @deletedlines = (); +# print "done hash . linecounter=$linecounter, keycounter=$keycounter\n"; + #--------------------------------------------------------------------------------------------------- + # NOW READING THE SEQUENCE FILE + my $keyfound=0; + my $keyexists=0; + my $inter=0; + my $pure=0; + + while(my $sine = ){ + my %microstart=(); + my %microend=(); + my @sields = split(/\t/,$sine); + my $key = 0; + if ($sine =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) { + $key = join("\t",$1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12); + # print $sine; + # print $key; + $keyfound++; + } + else{ + + } + # if !defined $key; + + if (exists $micros{$key}){ + $keyexists++; + my @microstring = @{$micros{$key}}; + + my @filteredmicrostring; + + foreach my $line (@microstring){ + chomp $line; + my $copy_line = $line; + my @fields = split(/\t/,$line); + my $start = $fields[$startcord]; + my $end = $fields[$endcord]; + # FOR COMPOUND MICROSATELLITES + if ($fields[$typecord] eq "compound"){ + $line = compound_microsat_analyser($line); + if ($line eq "NULL") { + print OUT2 "$copy_line\n"; + $pure++; + next; + } + else{ + print OUT "$line\n"; + $inter++; + next; + } + } + } + + } #if (exists $micros{$key}){ + } + close OUT; + close OUT2; +# print "keyfound=$keyfound, keyexists=$keyexists, pure=$pure, inter=$inter\n"; +} + +sub compound_microsat_analyser{ + my $line = $_[0]; + my @fields = split(/\t/,$line); + my $motifline = $fields[$motifcord]; + my $microsat = $fields[$microsatcord]; + $motifline =~ s/^\[|\]$//g; + $microsat =~ s/^\[|\]$//g; + $microsat =~ s/-//g; + my @interruptions = (); + my @motields = split(/\]\[/,$motifline); + my @microields = split(/\][a-zA-Z|-]*\[/,$microsat); + my @inields = split(/[.*]/,$microsat); + shift @inields; + my @motifcount = scalar(@motields); + my $prevmotif = $motields[0]; + my $prevmicro = $microields[0]; + my $prevphase = substr($microields[0],-(length($motields[0])),length($motields[0])); + my $localflag = 'down'; + my @infoarray = (); + + for my $l (1 ... (scalar(@motields)-1)){ + my $probe = $prevmotif.$prevmotif; + if (length $prevmotif != length $motields[$l]) {$localflag = "up"; last;} + + if ($probe =~ /$motields[$l]/i){ + my $curr_endphase = substr($microields[$l],-length($motields[$l]),length($motields[$l])); + my $curr_startphase = substr($microields[$l],0,length($motields[$l])); + if ($curr_startphase =~ /$prevphase/i) { + $infoarray[$l-1] = "insertion"; + } + else { + $infoarray[$l-1] = "indel/substitution"; + } + + $prevmotif = $motields[$l]; $prevmicro = $microields[$l]; $prevphase = $curr_endphase; + next; + } + else {$localflag = "up"; last;} + } + if ($localflag eq 'up') {return "NULL";} + + if (length($prevmotif) == 1) {$fields[$typecord] = "mononucleotide";} + if (length($prevmotif) == 2) {$fields[$typecord] = "dinucleotide";} + if (length($prevmotif) == 3) {$fields[$typecord] = "trinucleotide";} + if (length($prevmotif) == 4) {$fields[$typecord] = "tetranucleotide";} + if (length($prevmotif) == 5) {$fields[$typecord] = "pentanucleotide";} + + @microields = split(/[\[|\]]/,$microsat); + my @microsats = (); + my @positions = (); + my $lengthtracker = 0; + + for my $i (0 ... (scalar(@microields ) - 1)){ + if ($i%2 == 0){ + push(@microsats,$microields[$i]); + $lengthtracker = $lengthtracker + length($microields[$i]); + + } + else{ + push(@interruptions,$microields[$i]); + push(@positions, $lengthtracker+1); + $lengthtracker = $lengthtracker + length($microields[$i]); + } + + } + my $returnline = join("\t",(join("\t",@fields),join(",",(@infoarray)),join(",",(@interruptions)),join(",",(@positions)),scalar(@interruptions))); + return($returnline); +} + +#xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx multiSpecies_compound_microsat_analyzer xxxxxxxxxxxxxx + + +#xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx + +sub multiSpecies_compoundClarifyer{ +# print "IN multiSpecies_compoundClarifyer: @_\n"; + my $input1 = $_[0]; ###### the *_sput_compound + my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match" + my $output1 = $_[2]; ###### interrupted microsatellite file, in new .interrupted format + my $output2 = $_[3]; ###### compound file + my $org = $_[4]; + my $no_of_species = $_[5]; + @thresholds = "0"; + push(@thresholds, split(/_/,$_[6])); + + + $infocord = 2 + (4*$no_of_species) - 1; + $typecord = 2 + (4*$no_of_species) + 1 - 1; + $startcord = 2 + (4*$no_of_species) + 2 - 1; + $strandcord = 2 + (4*$no_of_species) + 3 - 1; + $endcord = 2 + (4*$no_of_species) + 4 - 1; + $microsatcord = 2 + (4*$no_of_species) + 5 - 1; + $motifcord = 2 + (4*$no_of_species) + 6 - 1; + $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; + + $interr_poscord = $motifcord + 3; + $no_of_interruptionscord = $motifcord + 4; + $interrcord = $motifcord + 2; + $interrtypecord = $motifcord + 1; + + + open(IN,"<$input1") or die "Cannot open file $input1 $!"; + open(SEQ,"<$input2") or die "Cannot open file $input2 $!"; + + open(INT,">$output1") or die "Cannot open file $output2 $!"; + open(COMP,">$output2") or die "Cannot open file $output2 $!"; + #open(CH,">changed") or die "Cannot open file changed $!"; + +# print "opened files \n"; + my $linecounter = 0; + my $microcounter = 0; + + my %micros = (); + while (my $line = ){ + # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n"; + $linecounter++; + if ($line =~ /($focalspec)\s+([0-9a-zA-Z_\-]+)\s+([0-9]+)\s+([0-9]+)/ ) { + my $key = join("\t",$1, $2, $3, $4); + # print $key, "#-#-#-#-#-#-#-#\n"; + # print "key = $key\n"; + push (@{$micros{$key}},$line); + $microcounter++; + } + else {#print $line," key not made\n"; ; + } + } +# print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n"; + close IN; + my @deletedlines = (); +# print "done hash \n"; + $linecounter = 0; + #--------------------------------------------------------------------------------------------------- + # NOW READING THE SEQUENCE FILE + my @microsat_types = qw(_ mononucleotide dinucleotide trinucleotide tetranucleotide); + $printer = 0; + + while(my $sine = ){ + my %microstart=(); + my %microend=(); + my @sields = split(/\t/,$sine); + my $key = (); + +# print "sine = $sine. focalspec = $focalspec \n"; #; + + if ($sine =~ /($focalspec)\s+([0-9a-zA-Z_\-]+)\s+([0-9]+)\s+([0-9]+)/ ) { + +# if ($sine =~ /([a-z0-9A-Z]+)\s+([0-9a-zA-Z_]+)\s+([0-9]+)\s+([0-9]+)\s+[\+|\-]\s+([a-z0-9A-Z]+)\s+([0-9a-zA-Z_]+)\s+([0-9]+)\s+([0-9]+)\s+[\+|\-]\s+([a-z0-9A-Z]+)\s+([0-9a-zA-Z_]+)\s+([0-9]+)\s+([0-9]+)\s/ ) { + $key = join("\t",$1, $2, $3, $4); +# print "key = $key\n"; + } + else{ +# print "no key in $sine\nfor pattern ([a-z0-9A-Z]+) (chr[0-9a-zA-Z]+) ([0-9]+) ([0-9]+) [\+|\-] (a-z0-9A-Z) (chr[0-9a-zA-Z]+) ([0-9]+) ([0-9]+) [\+|\-] (a-z0-9A-Z) (chr[0-9a-zA-Z]+) ([0-9]+) ([0-9]+) / \n"; + } + + if (exists $micros{$key}){ + my @microstring = @{$micros{$key}}; + delete $micros{$key}; + + foreach my $line (@microstring){ +# print "#---------#---------#---------#---------#---------#---------#---------#---------\n" if $printer == 1; +# print "microsat = $line" if $printer == 1; + $linecounter++; + my $copy_line = $line; + my @mields = split(/\t/,$line); + my @fields = @mields; + my $start = $fields[$startcord]; + my $end = $fields[$endcord]; + my $microsat = $fields[$microsatcord]; + my $motifline = $fields[$motifcord]; + my $microsatcopy = $microsat; + my $positioner = $microsat; + $positioner =~ s/[a-zA-Z|-]/_/g; + $microsatcopy =~ s/^\[|\]$//gs; + chomp $microsatcopy; + my @microields = split(/\][a-zA-Z|-]*\[/,$microsatcopy); + my @inields = split(/\[[a-zA-Z|-]*\]/,$microsat); + my $absolutstart = 1; my $absolutend = $absolutstart + ($end-$start); +# print "absolut: start = $absolutstart, end = $absolutend\n" if $printer == 1; + shift @inields; + #print "inields =@inields<\n"; + $motifline =~ s/^\[|\]$//gs; + chomp $motifline; + #print "microsat = $microsat, its copy = $microsatcopy motifline = $motifline<\n"; + my @motields = split(/\]\[/,$motifline); + my $seq = $microsatcopy; + $seq =~ s/\[|\]//g; + my $seqlen = length($seq); + $seq = " ".$seq; + + my $longestmotif_no = longest_array_element(@motields); + my $shortestmotif_no = shortest_array_element(@motields); + #print "shortest motif = $motields[$shortestmotif_no], longest motif = $motields[$longestmotif_no] \n"; + + my $search = $motields[$longestmotif_no].$motields[$longestmotif_no]; + if ((length($motields[$longestmotif_no]) == length($motields[$shortestmotif_no])) && ($search !~ /$motields[$shortestmotif_no]/) ){ + print COMP $line; + next; + } + + my @shortestmotif_nos = (); + for my $m (0 ... $#motields){ + push(@shortestmotif_nos, $m) if (length($motields[$m]) == length($motields[$shortestmotif_no]) ); + } + ## LOOKING AT LEFT OF THE SHORTEST MOTIF------------------------------------------------ + my $newleft =(); + my $leftstopper = 0; my $rightstopper = 0; + foreach my $shortmotif_no (@shortestmotif_nos){ + next if $shortmotif_no == 0; + my $last_left = $shortmotif_no; #$#motields; + my $last_hitter = 0; + for (my $i =($shortmotif_no-1); $i>=0; $i--){ + my $search = $motields[$shortmotif_no]; + if (length($motields[$shortmotif_no]) == 1){ $search = $motields[$shortmotif_no].$motields[$shortmotif_no] ;} + if( (length($motields[$i]) > length($motields[$shortmotif_no])) && length($microields[$i]) > (2.5 * length($motields[$i])) ){ + $last_hitter = 1; + $last_left = $i+1; last; + } + my $probe = $motields[$i]; + if (length($motields[$shortmotif_no]) == length($motields[$i])) {$probe = $motields[$i].$motields[$i];} + + if ($probe !~ /$search/){ + $last_hitter = 1; + $last_left = $i+1; + # print "hit the last match: before $microields[$i]..last left = $last_left.. exiting.\n"; + last; + } + $last_left--;$last_hitter = 1; + # print "passed tests, last left = $last_left\n"; + } + # print "comparing whether $last_left < $shortmotif_no, lasthit = $last_hitter\n"; + if (($last_left) < $shortmotif_no && $last_hitter == 1) {$leftstopper=0; last;} + else {$leftstopper = 1; + # print "leftstopper = 1\n"; + } + } + + ## LOOKING AT LEFT OF THE SHORTEST MOTIF------------------------------------------------ + my $newright =(); + foreach my $shortmotif_no (@shortestmotif_nos){ + next if $shortmotif_no == $#motields; + my $last_right = $shortmotif_no;# -1; + for my $i ($shortmotif_no+1 ... $#motields){ + my $search = $motields[$shortmotif_no]; + if (length($motields[$shortmotif_no]) == 1 ){ $search = $motields[$shortmotif_no].$motields[$shortmotif_no] ;} + if ( (length($motields[$i]) > length($motields[$shortmotif_no])) && length($microields[$i]) > (2.5 * length($motields[$i])) ){ + $last_right = $i-1; last; + } + my $probe = $motields[$i]; + if (length($motields[$shortmotif_no]) == length($motields[$i])) {$probe = $motields[$i].$motields[$i];} + if ( $probe !~ /$search/){ + $last_right = $i-1; last; + } + $last_right++; + } + if (($last_right) > $shortmotif_no) {$rightstopper=0; last;# print "rightstopper = 0\n"; + } + else {$rightstopper = 1; + } + } + + + if ($rightstopper == 1 && $leftstopper == 1){ + print COMP $line; +# print "rightstopper == 1 && leftstopper == 1\n" if $printer == 1; + next; + } + +# print "pased initial testing phase \n" if $printer == 1; + my @outputs = (); + my @orig_starts = (); + my @orig_ends = (); + for my $mic (0 ... $#microields){ + my $miclen = length($microields[$mic]); + my $microleftlen = 0; + #print "\nmic = $mic\n"; + if($mic > 0){ + for my $submin (0 ... $mic-1){ + my $interval = (); + if (!exists $inields[$submin]) {$interval = "";} + else {$interval = $inields[$submin];} + #print "inield =$interval< and microield =$microields[$submin]<\n "; + $microleftlen = $microleftlen + length($microields[$submin]) + length($interval); + } + } + push(@orig_starts,($microleftlen+1)); + push(@orig_ends, ($microleftlen+1 + $miclen -1)); + } + + ############# F I N A L L Y S T U D Y I N G S E Q U E N C E S #########@@@@#########@@@@#########@@@@#########@@@@#########@@@@ + + + for my $mic (0 ... $#microields){ + my $miclen = length($microields[$mic]); + my $microleftlen = 0; + if($mic > 0){ + for my $submin (0 ... $mic-1){ + # if(!exists $inields[$submin]) {$inields[$submin] = "";} + my $interval = (); + if (!exists $inields[$submin]) {$interval = "";} + else {$interval = $inields[$submin];} + #print "inield =$interval< and microield =$microields[$submin]<\n "; + $microleftlen = $microleftlen + length($microields[$submin]) + length($interval); + } + } + $fields[$startcord] = $microleftlen+1; + $fields[$endcord] = $fields[$startcord] + $miclen -1; + $fields[$typecord] = $microsat_types[length($motields[$mic])]; + $fields[$microsatcord] = $microields[$mic]; + $fields[$motifcord] = $motields[$mic]; + my $templine = join("\t", (@fields[0 .. $motifcord]) ); + my $orig_templine = join("\t", (@fields[0 .. $motifcord]) ); + my $newline; + my $lefter = 1; my $righter = 1; + if ( $fields[$startcord] < 2){$lefter = 0;} + if ($fields[$endcord] == $seqlen){$righter = 0;} + + while($lefter == 1){ + $newline = left_extender($templine, $seq,$org); +# print "returned line from left extender= $newline \n" if $printer == 1; + if ($newline eq $templine){$templine = $newline; last;} + else {$templine = $newline;} + + if (left_extention_permission_giver($templine) eq "no") {last;} + } + while($righter == 1){ + $newline = right_extender($templine, $seq,$org); +# print "returned line from right extender= $newline \n" if $printer == 1; + if ($newline eq $templine){$templine = $newline; last;} + else {$templine = $newline;} + if (right_extention_permission_giver($templine) eq "no") {last;} + } + my @tempfields = split(/\t/,$templine); + $tempfields[$microsatcord] =~ s/\]|\[//g; + $tempfields[$motifcord] =~ s/^\[|\]$//gs; + my @tempmotields = split(/\]\[/,$tempfields[$motifcord]); + + if (scalar(@tempmotields) == 1 && $templine eq $orig_templine) { +# print "scalar ( tempmotields) = 1\n" if $printer == 1; + next; + } + my $prevmotif = shift(@tempmotields); + my $stopper = 0; + + foreach my $tempmot (@tempmotields){ + if (length($tempmot) != length($prevmotif)) {$stopper = 1; last;} + my $search = $prevmotif.$prevmotif; + if ($search !~ /$tempmot/) {$stopper = 1; last;} + $prevmotif = $tempmot; + } + if ( $stopper == 1) { +# print "length tempmot != length prevmotif\n" if $printer == 1; + next; + } + my $lastend = 0; + #---------------------------------------------------------- + my $left_captured = (); my $right_captured = (); + my $left_bp = (); my $right_bp = (); + # print "new startcord = $tempfields[$startcord] , new endcord = $tempfields[$endcord].. orig strts = @orig_starts and orig ends = @orig_ends\n"; + for my $o (0 ... $#orig_starts){ +# print "we are talking abut tempstart:$tempfields[$startcord] >= origstart:$lastend && tempstart:$tempfields[$startcord] <= origend: $orig_ends[$o] \n" if $printer == 1; +# print "we are talking abut tempend:$tempfields[$endcord] >= origstart:$lastend && tempstart:$tempfields[$endcord] >= origend: $orig_ends[$o] \n" if $printer == 1; + + if (($tempfields[$startcord] > $lastend) && ($tempfields[$startcord] <= $orig_ends[$o])){ # && ($tempfields[$startcord] != $fields[$startcord]) +# print "motif captured on left is $microields[$o] from $microsat\n" if $printer == 1; + $left_captured = $o; + $left_bp = $orig_ends[$o] - $tempfields[$startcord] + 1; + } + elsif ($tempfields[$endcord] > $lastend && $tempfields[$endcord] <= $orig_ends[$o]){ #&& $tempfields[$endcord] != $fields[$endcord]) +# print "motif captured on right is $microields[$o] from $microsat\n" if $printer == 1; + $right_captured = $o; + $right_bp = $tempfields[$endcord] - $orig_starts[$o] + 1; + } + $lastend = $orig_ends[$o] + } +# print "leftcaptured = $left_captured, right = $right_captured\n" if $printer==1; + my $leftmotif = (); my $left_trashed = (); + if ($tempfields[$startcord] != $fields[$startcord]) { + $leftmotif = $motields[$left_captured]; +# print "$left_captured in @microields: $motields[$left_captured]\n" if $printer == 1; + if ( $left_captured !~ /[0-9]+/) {#print $line,"\n", $templine,"\n"; + } + $left_trashed = length($microields[$left_captured]) - $left_bp; + } + my $rightmotif = (); my $right_trashed = (); + if ($tempfields[$endcord] != $fields[$endcord]) { +# print "$right_captured in @microields: $motields[$right_captured]\n" if $printer == 1; + $rightmotif = $motields[$right_captured]; + $right_trashed = length($microields[$right_captured]) - $right_bp; + } + + ########## P A R A M S #####################@@@@#########@@@@#########@@@@#########@@@@#########@@@@#########@@@@#########@@@@ + $stopper = 0; + my $deletioner = 0; + #if($tempfields[$startcord] != $fields[$startcord]){ +# print "enter left: tempfields,startcord : $tempfields[$startcord] != $absolutstart && left_captured: $left_captured != 0 \n" if $printer==1; + if ($left_captured != 0){ +# print "at line 370, going: 0 ... $left_captured-1 \n" if $printer == 1; + for my $e (0 ... $left_captured-1){ + if( length($motields[$e]) > 2 && length($microields[$e]) > (3* length($motields[$e]) )){ +# print "motif on left not included too big to be ignored : $microields[$e] \n" if $printer == 1; + $deletioner++; last; + } + if( length($motields[$e]) == 2 && length($microields[$e]) > (3* length($motields[$e]) )){ +# print "motif on left not included too big to be ignored : $microields[$e] \n" if $printer == 1; + $deletioner++; last; + } + if( length($motields[$e]) == 1 && length($microields[$e]) > (4* length($motields[$e]) )){ +# print "motif on left not included too big to be ignored : $microields[$e] \n" if $printer == 1; + $deletioner++; last; + } + } + } + #} +# print "after left search, deletioner = $deletioner\n" if $printer == 1; + if ($deletioner >= 1) { +# print "deletioner = $deletioner\n" if $printer == 1; + next; + } + + $deletioner = 0; + + #if($tempfields[$endcord] != $fields[$endcord]){ +# print "if tempfields endcord: $tempfields[$endcord] != absolutend: $absolutend\n and $right_captured != $#microields\n" if $printer==1; + if ($right_captured != $#microields){ +# print "at line 394, going: $right_captured+1 ... $#microields \n" if $printer == 1; + for my $e ($right_captured+1 ... $#microields){ + if( length($motields[$e]) > 2 && length($microields[$e]) > (3* length($motields[$e])) ){ +# print "motif on right not included too big to be ignored : $microields[$e] \n" if $printer == 1; + $deletioner++; last; + } + if( length($motields[$e]) == 2 && length($microields[$e]) > (3* length($motields[$e]) )){ +# print "motif on right not included too big to be ignored : $microields[$e] \n" if $printer == 1; + $deletioner++; last; + } + if( length($motields[$e]) == 1 && length($microields[$e]) > (4* length($motields[$e]) )){ +# print "motif on right not included too big to be ignored : $microields[$e] \n" if $printer == 1; + $deletioner++; last; + } + } + } + #} +# print "deletioner = $deletioner\n" if $printer == 1; + if ($deletioner >= 1) { + next; + } + my $leftMotifs_notCaptured = (); + my $rightMotifs_notCaptured = (); + + if ($tempfields[$startcord] != $fields[$startcord] ){ + #print "in left params: (length($leftmotif) == 1 && $tempfields[$startcord] != $fields[$startcord]) ... and .... $left_trashed > (1.5* length($leftmotif]) && ($tempfields[$startcord] != $fields[$startcord])\n"; + if (length($leftmotif) == 1 && $left_trashed > 3){ +# print "invaded left motif is long mononucleotide" if $printer == 1; + next; + + } + elsif ((length($leftmotif) != 1 && $left_trashed > ( thrashallow($leftmotif)) && ($tempfields[$startcord] != $fields[$startcord]) ) ){ +# print "invaded left motif too long" if $printer == 1; + next; + } + } + if ($tempfields[$endcord] != $fields[$endcord] ){ + #print "in right params: after $tempfields[$endcord] != $fields[$endcord] ..... (length($rightmotif)==1 && $tempfields[$endcord] != $fields[$endcord]) ... and ... $right_trashed > (1.5* length($rightmotif))\n"; + if (length($rightmotif)==1 && $right_trashed){ +# print "invaded right motif is long mononucleotide" if $printer == 1; + next; + + } + elsif (length($rightmotif) !=1 && ($right_trashed > ( thrashallow($rightmotif)) && $tempfields[$endcord] != $fields[$endcord])){ +# print "invaded right motif too long" if $printer == 1; + next; + + } + } + push @outputs, $templine; + } + if (scalar(@outputs) == 0){ print COMP $line; next;} + # print "outputs are:", join("\n",@outputs),"\n"; + if (scalar(@outputs) == 1){ + my @oields = split(/\t/,$outputs[0]); + my $start = $oields[$startcord]+$mields[$startcord]-1; + my $end = $start+($oields[$endcord]-$oields[$startcord]); + $oields[$startcord] = $start; $oields[$endcord] = $end; + print INT join("\t",@oields), "\n"; + # print CH $line,; + } + if (scalar(@outputs) > 1){ + my $motif_min = 10; + my $chosen_one = $outputs[0]; + foreach my $micro (@outputs){ + my @oields = split(/\t/,$micro); + my $tempmotif = $oields[$motifcord]; + $tempmotif =~ s/^\[|\]$//gs; + my @omots = split(/\]\[/, $tempmotif); + # print "motif_min = $motif_min, current motif = $tempmotif\n"; + my $start = $oields[$startcord]+$mields[$startcord]-1; + my $end = $start+($oields[$endcord]-$oields[$startcord]); + $oields[$startcord] = $start; $oields[$endcord] = $end; + if(length($omots[0]) < $motif_min) { + $chosen_one = join("\t",@oields); + $motif_min = length($omots[0]); + } + } + print INT $chosen_one, "\n"; + # print "chosen one is ".$chosen_one, "\n"; + # print CH $line; + + + } + + } + + } #if (exists $micros{$key}){ + else{ + } + } + close INT; + close COMP; +} +sub left_extender{ + #print "left extender\n"; + my ($line, $seq, $org) = @_; +# print "in left extender... line passed = $line and sequence is $seq\n"; + chomp $line; + my @fields = split(/\t/,$line); + my $rstart = $fields[$startcord]; + my $microsat = $fields[$microsatcord]; + $microsat =~ s/\[|\]//g; + my $rend = $rstart + length($microsat)-1; + $microsat =~ s/-//g; + my $motif = $fields[$motifcord]; + my $firstmotif = (); + + if ($motif =~ /^\[/){ + $motif =~ s/^\[//g; + $motif =~ /([a-zA-Z]+)\].*/; + $firstmotif = $1; + } + else {$firstmotif = $motif;} + + #print "hacked microsat = $microsat, motif = $motif, firstmotif = $firstmotif\n"; + my $leftphase = substr($microsat, 0,length($firstmotif)); + my $phaser = $leftphase.$leftphase; + my @phase = split(/\s*/,$leftphase); + my @phases; + my @copy_phases = @phases; + my $crawler=0; + for (0 ... (length($leftphase)-1)){ + push(@phases, substr($phaser, $crawler, length($leftphase))); + $crawler++; + } + + my $start = $rstart; + my $end = $rend; + + my $leftseq = substr($seq, 0, $start); +# print "left phases are @phases , start = $start left sequence = ",substr($leftseq, -10),"\n"; + my @extentions = (); + my @trappeds = (); + my @intervalposs = (); + my @trappedposs = (); + my @trappedphases = (); + my @intervals = (); + my $firstmotif_length = length($firstmotif); + foreach my $phase (@phases){ +# print "left phase\t",substr($leftseq, -10),"\t$phase\n"; +# print "search patter = (($phase)+([a-zA-Z|-]{0,$firstmotif_length})) \n"; + if ($leftseq =~ /(($phase)+([a-zA-Z|-]{0,$firstmotif_length}))$/i){ +# print "in left pattern\n"; + my $trapped = $1; + my $trappedpos = length($leftseq)-length($trapped); + my $interval = $3; + my $intervalpos = index($trapped, $interval) + 1; +# print "left trapped = $trapped, interval = $interval, intervalpos = $intervalpos\n"; + + my $extention = substr($trapped, 0, length($trapped)-length($interval)); + my $leftpeep = substr($seq, 0, ($start-length($trapped))); + my @passed_overhangs; + + for my $i (1 ... length($phase)-1){ + my $overhang = substr($phase, -length($phase)+$i); +# print "current overhang = $overhang, leftpeep = ",substr($leftpeep,-10)," whole sequence = ",substr($seq, ($end - ($end-$start) - 20), (($end-$start)+20)),"\n"; + #TEMPORARY... BETTER METHOD NEEDED + $leftpeep =~ s/-//g; + if ($leftpeep =~ /$overhang$/i){ + push(@passed_overhangs,$overhang); +# print "l overhang\n"; + } + } + + if(scalar(@passed_overhangs)>0){ + my $overhang = $passed_overhangs[longest_array_element(@passed_overhangs)]; + $extention = $overhang.$extention; + $trapped = $overhang.$trapped; + #print "trapped extended to $trapped \n"; + $trappedpos = length($leftseq)-length($trapped); + } + + push(@extentions,$extention); +# print "extentions = @extentions \n"; + + push(@trappeds,$trapped ); + push(@intervalposs,length($extention)+1); + push(@trappedposs, $trappedpos); +# print "trappeds = @trappeds\n"; + push(@trappedphases, substr($extention,0,length($phase))); + push(@intervals, $interval); + } + } + if (scalar(@trappeds == 0)) {return $line;} + + my $nikaal = shortest_array_element(@intervals); + + if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";} + $fields[$motifcord] = "[".$trappedphases[$nikaal]."]".$fields[$motifcord]; + ##print "new fields 9 = $fields[9]\n"; + $fields[$startcord] = $fields[$startcord]-length($trappeds[$nikaal]); + + if($fields[$microsatcord] !~ /^\[/i){ + $fields[$microsatcord] = "[".$fields[$microsatcord]."]"; + } + + $fields[$microsatcord] = "[".$extentions[$nikaal]."]".$intervals[$nikaal].$fields[$microsatcord]; + + if (exists ($fields[$motifcord+1])){ + $fields[$motifcord+1] = "indel/deletion,".$fields[$motifcord+1]; + } + else{$fields[$motifcord+1] = "indel/deletion";} + ##print "new fields 14 = $fields[14]\n"; + + if (exists ($fields[$motifcord+2])){ + $fields[$motifcord+2] = $intervals[$nikaal].",".$fields[$motifcord+2]; + } + else{$fields[$motifcord+2] = $intervals[$nikaal];} + my @seventeen=(); + if (exists ($fields[$motifcord+3])){ + @seventeen = split(/,/,$fields[$motifcord+3]); + # #print "scalarseventeen =@seventeen<-\n"; + for (0 ... scalar(@seventeen)-1) {$seventeen[$_] = $seventeen[$_]+length($trappeds[$nikaal]);} + $fields[$motifcord+3] = ($intervalposs[$nikaal]).",".join(",",@seventeen); + $fields[$motifcord+4] = $fields[$motifcord+4]+1; + } + + else {$fields[$motifcord+3] = $intervalposs[$nikaal]; $fields[$motifcord+4]=1} + + ##print "new fields 16 = $fields[16]\n"; + ##print "new fields 17 = $fields[17]\n"; + + + my $returnline = join("\t",@fields); + my $pastline = $returnline; + if ($fields[$microsatcord] =~ /\[/){ + $returnline = multiSpecies_compoundClarifyer_merge($returnline); + } + return $returnline; +} +sub right_extender{ + my ($line, $seq, $org) = @_; + chomp $line; + my @fields = split(/\t/,$line); + my $rstart = $fields[$startcord]; + my $microsat = $fields[$microsatcord]; + $microsat =~ s/\[|\]//g; + my $rend = $rstart + length($microsat)-1; + $microsat =~ s/-//g; + my $motif = $fields[$motifcord]; + my $temp_lastmotif = (); + + if ($motif =~ /\]$/s){ + $motif =~ s/\]$//sg; + $motif =~ /.*\[([a-zA-Z]+)/; + $temp_lastmotif = $1; + } + else {$temp_lastmotif = $motif;} + my $lastmotif = substr($microsat,-length($temp_lastmotif)); + ##print "hacked microsat = $microsat, motif = $motif, lastmotif = $lastmotif\n"; + my $rightphase = substr($microsat, -length($lastmotif)); + my $phaser = $rightphase.$rightphase; + my @phase = split(/\s*/,$rightphase); + my @phases; + my @copy_phases = @phases; + my $crawler=0; + for (0 ... (length($rightphase)-1)){ + push(@phases, substr($phaser, $crawler, length($rightphase))); + $crawler++; + } + + my $start = $rstart; + my $end = $rend; + + my $rightseq = substr($seq, $end+1); + my @extentions = (); + my @trappeds = (); + my @intervalposs = (); + my @trappedposs = (); + my @trappedphases = (); + my @intervals = (); + my $lastmotif_length = length($lastmotif); + foreach my $phase (@phases){ + if ($rightseq =~ /^(([a-zA-Z|-]{0,$lastmotif_length}?)($phase)+)/i){ + my $trapped = $1; + my $trappedpos = $end+1; + my $interval = $2; + my $intervalpos = index($trapped, $interval) + 1; + + my $extention = substr($trapped, length($interval)); + my $rightpeep = substr($seq, ($end+length($trapped))+1); + my @passed_overhangs = ""; + + #TEMPORARY... BETTER METHOD NEEDED + $rightpeep =~ s/-//g; + + for my $i (1 ... length($phase)-1){ + my $overhang = substr($phase,0, $i); +# #print "current extention = $extention, overhang = $overhang, rightpeep = ",substr($rightpeep,0,10),"\n"; + if ($rightpeep =~ /^$overhang/i){ + push(@passed_overhangs, $overhang); +# #print "r overhang\n"; + } + } + if (scalar(@passed_overhangs) > 0){ + my $overhang = @passed_overhangs[longest_array_element(@passed_overhangs)]; + $extention = $extention.$overhang; + $trapped = $trapped.$overhang; +# #print "trapped extended to $trapped \n"; + } + + push(@extentions,$extention); + ##print "extentions = @extentions \n"; + + push(@trappeds,$trapped ); + push(@intervalposs,$intervalpos); + push(@trappedposs, $trappedpos); +# #print "trappeds = @trappeds\n"; + push(@trappedphases, substr($extention,0,length($phase))); + push(@intervals, $interval); + } + } + if (scalar(@trappeds == 0)) {return $line;} + +# my $nikaal = longest_array_element(@trappeds); + my $nikaal = shortest_array_element(@intervals); + +# #print "longest element found = $nikaal \n"; + + if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";} + $fields[$motifcord] = $fields[$motifcord]."[".$trappedphases[$nikaal]."]"; + ##print "new fields 9 = $fields[9]"; + $fields[$endcord] = $fields[$endcord] + length($trappeds[$nikaal]); + + ##print "new fields 11 = $fields[11]\n"; + + if($fields[$microsatcord] !~ /^\[/i){ + $fields[$microsatcord] = "[".$fields[$microsatcord]."]"; + } + + $fields[$microsatcord] = $fields[$microsatcord].$intervals[$nikaal]."[".$extentions[$nikaal]."]"; + ##print "new fields 12 = $fields[12]\n"; + + ##print "scalar of fields = ",scalar(@fields),"\n"; + if (exists ($fields[$motifcord+1])){ +# print " print fields = @fields.. scalar=", scalar(@fields),".. motifcord+1 = $motifcord + 1 \n " if !exists $fields[$motifcord+1]; +# if !exists $fields[$motifcord+1]; + $fields[$motifcord+1] = $fields[$motifcord+1].",indel/deletion"; + } + else{$fields[$motifcord+1] = "indel/deletion";} + ##print "new fields 14 = $fields[14]\n"; + + if (exists ($fields[$motifcord+2])){ + $fields[$motifcord+2] = $fields[$motifcord+2].",".$intervals[$nikaal]; + } + else{$fields[$motifcord+2] = $intervals[$nikaal];} + ##print "new fields 15 = $fields[15]\n"; + + my @seventeen=(); + if (exists ($fields[$motifcord+3])){ + ##print "at 608 we are doing this:length($microsat)+$intervalposs[$nikaal]\n"; +# print " print fields = @fields\n " if !exists $fields[$motifcord+3]; + if !exists $fields[$motifcord+3]; + my $currpos = length($microsat)+$intervalposs[$nikaal]; + $fields[$motifcord+3] = $fields[$motifcord+3].",".$currpos; + $fields[$motifcord+4] = $fields[$motifcord+4]+1; + + } + + else {$fields[$motifcord+3] = length($microsat)+$intervalposs[$nikaal]; $fields[$motifcord+4]=1} + + ##print "new fields 16 = $fields[16]\n"; + + ##print "new fields 17 = $fields[17]\n"; + my $returnline = join("\t",@fields); + my $pastline = $returnline; + if ($fields[$microsatcord] =~ /\[/){ + $returnline = multiSpecies_compoundClarifyer_merge($returnline); + } + #print "finally right-extended line = ",$returnline,"\n"; + return $returnline; +} +sub longest_array_element{ + my $counter = 0; + my($max) = shift(@_); + my $maxcounter = 0; + foreach my $temp (@_) { + $counter++; + #print "finding largest array: $maxcounter \n" if $prinkter == 1; + if(length($temp) > length($max)){ + $max = $temp; + $maxcounter = $counter; + } + } + return($maxcounter); +} +sub shortest_array_element{ + my $counter = 0; + my($min) = shift(@_); + my $mincounter = 0; + foreach my $temp (@_) { + $counter++; + #print "finding largest array: $mincounter \n" if $prinkter == 1; + if(length($temp) < length($min)){ + $min = $temp; + $mincounter = $counter; + } + } + return($mincounter); +} + + +sub left_extention_permission_giver{ + my @fields = split(/\t/,$_[0]); + my $microsat = $fields[$microsatcord]; + $microsat =~ s/(^\[)|-//g; + my $motif = $fields[$motifcord]; + my $firstmotif = (); + my $firststretch = (); + my @stretches=(); + if ($motif =~ /^\[/){ + $motif =~ s/^\[//g; + $motif =~ /([a-zA-Z]+)\].*/; + $firstmotif = $1; + @stretches = split(/\]/,$microsat); + $firststretch = $stretches[0]; + ##print "firststretch = $firststretch\n"; + } + else {$firstmotif = $motif;$firststretch = $microsat;} + + if (length($firststretch) < $thresholds[length($firstmotif)]){ + return "no"; + } + else {return "yes";} + +} +sub right_extention_permission_giver{ + my @fields = split(/\t/,$_[0]); + my $microsat = $fields[$microsatcord]; + $microsat =~ s/-|(\]$)//sg; + my $motif = $fields[$motifcord]; + my $temp_lastmotif = (); + my $laststretch = (); + my @stretches=(); + + + if ($motif =~ /\]/){ + $motif =~ s/\]$//gs; + $motif =~ /.*\[([a-zA-Z]+)$/; + $temp_lastmotif = $1; + @stretches = split(/\[/,$microsat); + $laststretch = pop(@stretches); + ##print "last stretch = $laststretch\n"; + } + else {$temp_lastmotif = $motif; $laststretch = $microsat;} + + if (length($laststretch) < $thresholds[length($temp_lastmotif)]){ + return "no"; + } + else { return "yes";} + + +} +sub multiSpecies_compoundClarifyer_merge{ + my $line = $_[0]; + #print "sent for mering: $line \n"; + my @mields = split(/\t/,$line); + my @fields = @mields; + my $microsat = $fields[$microsatcord]; + my $motifline = $fields[$motifcord]; + my $microsatcopy = $microsat; + $microsatcopy =~ s/^\[|\]$//sg; + my @microields = split(/\][a-zA-Z|-]*\[/,$microsatcopy); + my @inields = split(/\[[a-zA-Z|-]*\]/,$microsat); + shift @inields; + #print "inields =@inields<\n"; + $motifline =~ s/^\[|\]$//sg; + my @motields = split(/\]\[/,$motifline); + my @firstmotifs = (); + my @lastmotifs = (); + for my $i (0 ... $#microields){ + $firstmotifs[$i] = substr($microields[$i],0,length($motields[$i])); + $lastmotifs[$i] = substr($microields[$i],-length($motields[$i])); + } + #print "firstmotif = @firstmotifs... lastmotif = @lastmotifs\n"; + my @mergelist = (); + my @inter_poses = split(/,/,$fields[$interr_poscord]); + my $no_of_interruptions = $fields[$no_of_interruptionscord]; + my @interruptions = split(/,/,$fields[$interrcord]); + my @interrtypes = split(/,/,$fields[$interrtypecord]); + my $stopper = 0; + for my $i (0 ... $#motields-1){ + #print "studying connection of $motields[$i] and $motields[$i+1], i = $i in $microsat\n"; + if (($lastmotifs[$i] eq $firstmotifs[$i+1]) && !exists $inields[$i]){ + $stopper = 1; + push(@mergelist, ($i)."_".($i+1)); + } + } + + return $line if scalar(@mergelist) == 0; + + foreach my $merging (@mergelist){ + my @sets = split(/_/, $merging); + my @tempmicro = (); + my @tempmot = (); + for my $i (0 ... $sets[0]-1){ + push(@tempmicro, "[".$microields[$i]."]"); + push(@tempmicro, $inields[$i]); + push(@tempmot, "[".$motields[$i]."]"); + #print "adding pre-motifs number $i\n"; + } + my $pusher = "[".$microields[$sets[0]].$microields[$sets[1]]."]"; + push (@tempmicro, $pusher); + push(@tempmot, "[".$motields[$sets[0]]."]"); + my $outcoming = -2; + for my $i ($sets[1]+1 ... $#microields-1){ + push(@tempmicro, "[".$microields[$i]."]"); + push(@tempmicro, $inields[$i]); + push(@tempmot, "[".$motields[$i]."]"); + #print "adding post-motifs number $i\n"; + $outcoming = $i; + } + if ($outcoming != -2){ + #print "outcoming = $outcoming \n"; + push(@tempmicro, "[".$microields[$outcoming+1 ]."]"); + push(@tempmot,"[". $motields[$outcoming+1]."]"); + } + $fields[$microsatcord] = join("",@tempmicro); + $fields[$motifcord] = join("",@tempmot); + + splice(@interrtypes, $sets[0], 1); + $fields[$interrtypecord] = join(",",@interrtypes); + splice(@interruptions, $sets[0], 1); + $fields[$interrcord] = join(",",@interruptions); + splice(@inter_poses, $sets[0], 1); + $fields[$interr_poscord] = join(",",@inter_poses); + $no_of_interruptions = $no_of_interruptions - 1; + } + + if ($no_of_interruptions == 0){ + $fields[$microsatcord] =~ s/^\[|\]$//sg; + $fields[$motifcord] =~ s/^\[|\]$//sg; + $line = join("\t", @fields[0 ... $motifcord]); + } + else{ + $line = join("\t", @fields); + } + return $line; +} + +sub thrashallow{ + my $motif = $_[0]; + return 4 if length($motif) == 2; + return 6 if length($motif) == 3; + return 8 if length($motif) == 4; + +} + +#xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx multiSpecies_compoundClarifyer xxxxxxxxxxxxxx + + +#xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx +sub multispecies_filtering_compound_microsats{ + my $unfiltered = $_[0]; + my $filtered = $_[1]; + my $residue = $_[2]; + my $no_of_species = $_[5]; + open(UNF,"<$unfiltered") or die "Cannot open file $unfiltered: $!"; + open(FIL,">$filtered") or die "Cannot open file $filtered: $!"; + open(RES,">$residue") or die "Cannot open file $residue: $!"; + + $infocord = 2 + (4*$no_of_species) - 1; + $startcord = 2 + (4*$no_of_species) + 2 - 1; + $strandcord = 2 + (4*$no_of_species) + 3 - 1; + $endcord = 2 + (4*$no_of_species) + 4 - 1; + $microsatcord = 2 + (4*$no_of_species) + 5 - 1; + $motifcord = 2 + (4*$no_of_species) + 6 - 1; + + my @sub_thresholds = ("0"); + push(@sub_thresholds, split(/_/,$_[3])); + my @thresholds = ("0"); + push(@thresholds, split(/_/,$_[4])); + + while (my $line = ) { + if ($line !~ /compound/){ + print FIL $line,"\n"; next; + } + chomp $line; + my @fields = split(/\t/,$line); + my $motifline = $fields[$motifcord]; + $motifline =~ s/^\[|\]$//g; + my @motifs = split(/\]\[/,$motifline); + my $microsat = $fields[$microsatcord]; + $microsat =~ s/^\[|\]$|-//g; + my @microsats = split(/\][a-zA-Z|-]*\[/,$microsat); + + my $stopper = 0; + for my $i (0 ... $#motifs){ + my @common = (); + my $probe = $motifs[$i].$motifs[$i]; + my $motif_size = length($motifs[$i]); + + for my $j (0 ... $#motifs){ + next if length($motifs[$i]) != length($motifs[$j]); + push(@common, length($microsats[$j])) if $probe =~ /$motifs[$j]/i; + } + + if (largest_microsat(@common) < $sub_thresholds[$motif_size]) {$stopper = 1; last;} + else {next;} + } + + if ($stopper == 1){ + print RES $line,"\n"; + } + else { print FIL $line,"\n"; } + } + close FIL; + close RES; +} + +#xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx multispecies_filtering_compound_microsats xxxxxxxxxxxxxx + + +#xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx + +sub chromosome_unrand_breaker{ +# print "IN chromosome_unrand_breaker: @_\n "; + my $input1 = $_[0]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match" + my $dir = $_[1]; ###### directory where subsets are put + my $output2 = $_[2]; ###### list of subset files + my $increment = $_[3]; + my $info = $_[4]; + my $chr = $_[5]; + open(SEQ,"<$input1") or die "Cannot open file $input1 $!"; + + open(OUT,">$output2") or die "Cannot open file $output2 $!"; + + #--------------------------------------------------------------------------------------------------- + # NOW READING THE SEQUENCE FILE + + my $seed = 0; + my $subset = $dir.$info."_".$chr."_".$seed."_".($seed+$increment); + print OUT $subset,"\n"; + open(SUB,">$subset"); + + while(my $sine = ){ + $seed++; + print SUB $sine; + + if ($seed%$increment == 0 ){ + close SUB; + $subset = $dir.$info."_".$chr."_".$seed."_".($seed+$increment); + open(SUB,">$subset"); + print SUB $sine; + print OUT $subset,"\n"; + # print $subset,"\n"; + } + } + close OUT; + close SUB; +} +#xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx chromosome_unrand_breaker xxxxxxxxxxxxxx + + +#xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx +sub multiSpecies_interruptedMicrosatHunter{ +# print "IN multiSpecies_interruptedMicrosatHunter: @_\n"; + my $input1 = $_[0]; ###### the *_sput_op4_ii file + my $input2 = $_[1]; ###### looks like this: my $t8humanoutput = "*_nogap_op_unrand2_match" + my $output1 = $_[2]; ###### interrupted microsatellite file, in new .interrupted format + my $output2 = $_[3]; ###### uninterrupted microsatellite file + my $org = $_[4]; + my $no_of_species = $_[5]; + + my @thresholds = "0"; + push(@thresholds, split(/_/,$_[6])); + +# print "thresholds = @thresholds \n"; + $infocord = 2 + (4*$no_of_species) - 1; + $typecord = 2 + (4*$no_of_species) + 1 - 1; + $startcord = 2 + (4*$no_of_species) + 2 - 1; + $strandcord = 2 + (4*$no_of_species) + 3 - 1; + $endcord = 2 + (4*$no_of_species) + 4 - 1; + $microsatcord = 2 + (4*$no_of_species) + 5 - 1; + $motifcord = 2 + (4*$no_of_species) + 6 - 1; + $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; + + $interr_poscord = $motifcord + 3; + $no_of_interruptionscord = $motifcord + 4; + $interrcord = $motifcord + 2; + $interrtypecord = $motifcord + 1; + + + $prinkter = 0; +# print "prionkytet = $prinkter\n"; + + open(IN,"<$input1") or die "Cannot open file $input1 $!"; + open(SEQ,"<$input2") or die "Cannot open file $input2 $!"; + + open(INT,">$output1") or die "Cannot open file $output2 $!"; + open(UNINT,">$output2") or die "Cannot open file $output2 $!"; + +# print "opened files !!\n"; + my $linecounter = 0; + my $microcounter = 0; + + my %micros = (); + while (my $line = ){ + # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n"; + $linecounter++; + if ($line =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s+([0-9a-zA-Z_]+)\s([0-9]+)\s+([0-9]+)\s/ ) { + my $key = join("\t",$1, $2, $3, $4, $5); + # print $key, "#-#-#-#-#-#-#-#\n" if $prinkter == 1; + push (@{$micros{$key}},$line); + $microcounter++; + } + else {#print $line if $prinkter == 1; + } + } +# print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n"; + close IN; + my @deletedlines = (); +# print "done hash \n"; + $linecounter = 0; + #--------------------------------------------------------------------------------------------------- + # NOW READING THE SEQUENCE FILE + while(my $sine = ){ + #print $linecounter,"\n" if $linecounter % 1000 == 0; + my %microstart=(); + my %microend=(); + my @sields = split(/\t/,$sine); + my $key = (); + if ($sine =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) { + $key = join("\t",$1, $2, $3, $4, $5); + # print $key, "<-<-<-<-<-<-<-<\n"; + } + + # $prinkter = 1 if $sine =~ /^>H\t499\t/; + + if (exists $micros{$key}){ + my @microstring = @{$micros{$key}}; + delete $micros{$key}; + my @filteredmicrostring; +# print "sequence = $sields[$sequencepos]" if $prinkter == 1; + foreach my $line (@microstring){ + $linecounter++; + my $copy_line = $line; + my @fields = split(/\t/,$line); + my $start = $fields[$startcord]; + my $end = $fields[$endcord]; + +# print $line if $prinkter == 1; + #LOOKING FOR LEFTWARD EXTENTION OF MICROSATELLITE + my $newline; + while(1){ + # print "\n before left sequence = $sields[$sequencepos]\n" if $prinkter == 1; + if (multiSpecies_interruptedMicrosatHunter_left_extention_permission_giver($line) eq "no") {last;} + + $newline = multiSpecies_interruptedMicrosatHunter_left_extender($line, $sields[$sequencepos],$org); + if ($newline eq $line){$line = $newline; last;} + else {$line = $newline;} + + if (multiSpecies_interruptedMicrosatHunter_left_extention_permission_giver($line) eq "no") {last;} +# print "returned line from left extender= $line \n" if $prinkter == 1; + } + while(1){ + # print "sequence = $sields[$sequencepos]\n" if $prinkter == 1; + if (multiSpecies_interruptedMicrosatHunter_right_extention_permission_giver($line) eq "no") {last;} + + $newline = multiSpecies_interruptedMicrosatHunter_right_extender($line, $sields[$sequencepos],$org); + if ($newline eq $line){$line = $newline; last;} + else {$line = $newline;} + + if (multiSpecies_interruptedMicrosatHunter_right_extention_permission_giver($line) eq "no") {last;} +# print "returned line from right extender= $line \n" if $prinkter == 1; + } +# print "\n>>>>>>>>>>>>>>>>\n In the end, the line is: \n$line\n<<<<<<<<<<<<<<<<\n" if $prinkter == 1; + + my @tempfields = split(/\t/,$line); + if ($tempfields[$microsatcord] =~ /\[/){ + print INT $line,"\n"; + } + else{ + print UNINT $line,"\n"; + } + + if ($line =~ /NULL/){ next; } + push(@filteredmicrostring, $line); + push (@{$microstart{$start}},$line); + push (@{$microend{$end}},$line); + } + + my $firstflag = 'down'; + + } #if (exists $micros{$key}){ + } + close INT; + close UNINT; +# print "final number of lines = $linecounter\n"; +} + +sub multiSpecies_interruptedMicrosatHunter_left_extender{ + my ($line, $seq, $org) = @_; +# print "left extender, like passed = $line\n" if $prinkter == 1; +# print "in left extender... line passed = $line and sequence is $seq\n" if $prinkter == 1; + chomp $line; + my @fields = split(/\t/,$line); + my $rstart = $fields[$startcord]; + my $microsat = $fields[$microsatcord]; + $microsat =~ s/\[|\]//g; + my $rend = $rstart + length($microsat)-1; + $microsat =~ s/-//g; + my $motif = $fields[$motifcord]; + my $firstmotif = (); + + if ($motif =~ /^\[/){ + $motif =~ s/^\[//g; + $motif =~ /([a-zA-Z]+)\].*/; + $firstmotif = $1; + } + else {$firstmotif = $motif;} + +# print "hacked microsat = $microsat, motif = $motif, firstmotif = $firstmotif\n" if $prinkter == 1; + my $leftphase = substr($microsat, 0,length($firstmotif)); + my $phaser = $leftphase.$leftphase; + my @phase = split(/\s*/,$leftphase); + my @phases; + my @copy_phases = @phases; + my $crawler=0; + for (0 ... (length($leftphase)-1)){ + push(@phases, substr($phaser, $crawler, length($leftphase))); + $crawler++; + } + + my $start = $rstart; + my $end = $rend; + + my $leftseq = substr($seq, 0, $start); +# print "left phases are @phases , start = $start left sequence = ",substr($leftseq, -10),"\n" if $prinkter == 1; + my @extentions = (); + my @trappeds = (); + my @intervalposs = (); + my @trappedposs = (); + my @trappedphases = (); + my @intervals = (); + my $firstmotif_length = length($firstmotif); + foreach my $phase (@phases){ +# print "left phase\t",substr($leftseq, -10),"\t$phase\n" if $prinkter == 1; +# print "search patter = (($phase)+([a-zA-Z|-]{0,$firstmotif_length})) \n" if $prinkter == 1; + if ($leftseq =~ /(($phase)+([a-zA-Z|-]{0,$firstmotif_length}))$/i){ +# print "in left pattern\n" if $prinkter == 1; + my $trapped = $1; + my $trappedpos = length($leftseq)-length($trapped); + my $interval = $3; + my $intervalpos = index($trapped, $interval) + 1; +# print "left trapped = $trapped, interval = $interval, intervalpos = $intervalpos\n" if $prinkter == 1; + + my $extention = substr($trapped, 0, length($trapped)-length($interval)); + my $leftpeep = substr($seq, 0, ($start-length($trapped))); + my @passed_overhangs; + + for my $i (1 ... length($phase)-1){ + my $overhang = substr($phase, -length($phase)+$i); +# print "current overhang = $overhang, leftpeep = ",substr($leftpeep,-10)," whole sequence = ",substr($seq, ($end - ($end-$start) - 20), (($end-$start)+20)),"\n" if $prinkter == 1; + #TEMPORARY... BETTER METHOD NEEDED + $leftpeep =~ s/-//g; + if ($leftpeep =~ /$overhang$/i){ + push(@passed_overhangs,$overhang); +# print "l overhang\n" if $prinkter == 1; + } + } + + if(scalar(@passed_overhangs)>0){ + my $overhang = $passed_overhangs[longest_array_element(@passed_overhangs)]; + $extention = $overhang.$extention; + $trapped = $overhang.$trapped; +# print "trapped extended to $trapped \n" if $prinkter == 1; + $trappedpos = length($leftseq)-length($trapped); + } + + push(@extentions,$extention); +# print "extentions = @extentions \n" if $prinkter == 1; + + push(@trappeds,$trapped ); + push(@intervalposs,length($extention)+1); + push(@trappedposs, $trappedpos); +# print "trappeds = @trappeds\n" if $prinkter == 1; + push(@trappedphases, substr($extention,0,length($phase))); + push(@intervals, $interval); + } + } + if (scalar(@trappeds == 0)) {return $line;} + +############################ my $nikaal = longest_array_element(@trappeds); + my $nikaal = shortest_array_element(@intervals); + +# print "longest element found = $nikaal \n" if $prinkter == 1; + + if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";} + $fields[$motifcord] = "[".$trappedphases[$nikaal]."]".$fields[$motifcord]; + #print "new fields 9 = $fields[9]\n" if $prinkter == 1; + $fields[$startcord] = $fields[$startcord]-length($trappeds[$nikaal]); + + #print "new fields 9 = $fields[9]\n" if $prinkter == 1; + + if($fields[$microsatcord] !~ /^\[/i){ + $fields[$microsatcord] = "[".$fields[$microsatcord]."]"; + } + + $fields[$microsatcord] = "[".$extentions[$nikaal]."]".$intervals[$nikaal].$fields[$microsatcord]; + #print "new fields 14 = $fields[12]\n" if $prinkter == 1; + + #print "scalar of fields = ",scalar(@fields),"\n" if $prinkter == 1; + + + if (scalar(@fields) > $motifcord+1){ + $fields[$motifcord+1] = "indel/deletion,".$fields[$motifcord+1]; + } + else{$fields[$motifcord+1] = "indel/deletion";} + #print "new fields 14 = $fields[14]\n" if $prinkter == 1; + + if (scalar(@fields)>$motifcord+2){ + $fields[$motifcord+2] = $intervals[$nikaal].",".$fields[$motifcord+2]; + } + else{$fields[$motifcord+2] = $intervals[$nikaal];} + #print "new fields 15 = $fields[15]\n" if $prinkter == 1; + + my @seventeen=(); + + if (scalar(@fields)>$motifcord+3){ + @seventeen = split(/,/,$fields[$motifcord+3]); + # print "scalarseventeen =@seventeen<-\n" if $prinkter == 1; + for (0 ... scalar(@seventeen)-1) {$seventeen[$_] = $seventeen[$_]+length($trappeds[$nikaal]);} + $fields[$motifcord+3] = ($intervalposs[$nikaal]).",".join(",",@seventeen); + $fields[$motifcord+4] = $fields[$motifcord+4]+1; + } + + else {$fields[$motifcord+3] = $intervalposs[$nikaal]; $fields[$motifcord+4]=1} + + #print "new fields 16 = $fields[16]\n" if $prinkter == 1; + #print "new fields 17 = $fields[17]\n" if $prinkter == 1; + +# return join("\t",@fields); + my $returnline = join("\t",@fields); + my $pastline = $returnline; + if ($fields[$microsatcord] =~ /\[/){ + $returnline = multiSpecies_interruptedMicrosatHunter_merge($returnline); + } +# print "finally left-extended line = ",$returnline,"\n" if $prinkter == 1; + return $returnline; +} + +sub multiSpecies_interruptedMicrosatHunter_right_extender{ +# print "right extender\n" if $prinkter == 1; + my ($line, $seq, $org) = @_; +# print "in right extender... line passed = $line\n" if $prinkter == 1; +# print "line = $line, sequence = ",$seq, "\n" if $prinkter == 1; + chomp $line; + my @fields = split(/\t/,$line); + my $rstart = $fields[$startcord]; + my $microsat = $fields[$microsatcord]; + $microsat =~ s/\[|\]//g; + my $rend = $rstart + length($microsat)-1; + $microsat =~ s/-//g; + my $motif = $fields[$motifcord]; + my $temp_lastmotif = (); + + if ($motif =~ /\]$/){ + $motif =~ s/\]$//g; + $motif =~ /.*\[([a-zA-Z]+)/; + $temp_lastmotif = $1; + } + else {$temp_lastmotif = $motif;} + my $lastmotif = substr($microsat,-length($temp_lastmotif)); +# print "hacked microsat = $microsat, motif = $motif, lastmotif = $lastmotif\n" if $prinkter == 1; + my $rightphase = substr($microsat, -length($lastmotif)); + my $phaser = $rightphase.$rightphase; + my @phase = split(/\s*/,$rightphase); + my @phases; + my @copy_phases = @phases; + my $crawler=0; + for (0 ... (length($rightphase)-1)){ + push(@phases, substr($phaser, $crawler, length($rightphase))); + $crawler++; + } + + my $start = $rstart; + my $end = $rend; + + my $rightseq = substr($seq, $end+1); +# print "length of sequence = " ,length($seq), "the coordinate to start from = ", $end+1, "\n" if $prinkter == 1; +# print "right phases are @phases , end = $end right sequence = ",substr($rightseq,0,10),"\n" if $prinkter == 1; + my @extentions = (); + my @trappeds = (); + my @intervalposs = (); + my @trappedposs = (); + my @trappedphases = (); + my @intervals = (); + my $lastmotif_length = length($lastmotif); + foreach my $phase (@phases){ +# print "right phase\t$phase\t",substr($rightseq,0,10),"\n" if $prinkter == 1; +# print "search patter = (([a-zA-Z|-]{0,$lastmotif_length})($phase)+) \n" if $prinkter == 1; + if ($rightseq =~ /^(([a-zA-Z|-]{0,$lastmotif_length}?)($phase)+)/i){ +# print "in right pattern\n" if $prinkter == 1; + my $trapped = $1; + my $trappedpos = $end+1; + my $interval = $2; + my $intervalpos = index($trapped, $interval) + 1; +# print "trapped = $trapped, interval = $interval\n" if $prinkter == 1; + + my $extention = substr($trapped, length($interval)); + my $rightpeep = substr($seq, ($end+length($trapped))+1); + my @passed_overhangs = ""; + + #TEMPORARY... BETTER METHOD NEEDED + $rightpeep =~ s/-//g; + + for my $i (1 ... length($phase)-1){ + my $overhang = substr($phase,0, $i); +# print "current extention = $extention, overhang = $overhang, rightpeep = ",substr($rightpeep,0,10),"\n" if $prinkter == 1; + if ($rightpeep =~ /^$overhang/i){ + push(@passed_overhangs, $overhang); +# print "r overhang\n" if $prinkter == 1; + } + } + if (scalar(@passed_overhangs) > 0){ + my $overhang = @passed_overhangs[longest_array_element(@passed_overhangs)]; + $extention = $extention.$overhang; + $trapped = $trapped.$overhang; +# print "trapped extended to $trapped \n" if $prinkter == 1; + } + + push(@extentions,$extention); + #print "extentions = @extentions \n" if $prinkter == 1; + + push(@trappeds,$trapped ); + push(@intervalposs,$intervalpos); + push(@trappedposs, $trappedpos); +# print "trappeds = @trappeds\n" if $prinkter == 1; + push(@trappedphases, substr($extention,0,length($phase))); + push(@intervals, $interval); + } + } + if (scalar(@trappeds == 0)) {return $line;} + +################################### my $nikaal = longest_array_element(@trappeds); + my $nikaal = shortest_array_element(@intervals); + +# print "longest element found = $nikaal \n" if $prinkter == 1; + + if ($fields[$motifcord] !~ /\[/i) {$fields[$motifcord] = "[".$fields[$motifcord]."]";} + $fields[$motifcord] = $fields[$motifcord]."[".$trappedphases[$nikaal]."]"; + $fields[$endcord] = $fields[$endcord] + length($trappeds[$nikaal]); + + + if($fields[$microsatcord] !~ /^\[/i){ + $fields[$microsatcord] = "[".$fields[$microsatcord]."]"; + } + + $fields[$microsatcord] = $fields[$microsatcord].$intervals[$nikaal]."[".$extentions[$nikaal]."]"; + + + if (scalar(@fields) > $motifcord+1){ + $fields[$motifcord+1] = $fields[$motifcord+1].",indel/deletion"; + } + else{$fields[$motifcord+1] = "indel/deletion";} + + if (scalar(@fields)>$motifcord+2){ + $fields[$motifcord+2] = $fields[$motifcord+2].",".$intervals[$nikaal]; + } + else{$fields[$motifcord+2] = $intervals[$nikaal];} + + my @seventeen=(); + if (scalar(@fields)>$motifcord+3){ + #print "at 608 we are doing this:length($microsat)+$intervalposs[$nikaal]\n" if $prinkter == 1; + my $currpos = length($microsat)+$intervalposs[$nikaal]; + $fields[$motifcord+3] = $fields[$motifcord+3].",".$currpos; + $fields[$motifcord+4] = $fields[$motifcord+4]+1; + + } + + else {$fields[$motifcord+3] = length($microsat)+$intervalposs[$nikaal]; $fields[$motifcord+4]=1} + +# print "finally right-extended line = ",join("\t",@fields),"\n" if $prinkter == 1; +# return join("\t",@fields); + + my $returnline = join("\t",@fields); + my $pastline = $returnline; + if ($fields[$microsatcord] =~ /\[/){ + $returnline = multiSpecies_interruptedMicrosatHunter_merge($returnline); + } +# print "finally right-extended line = ",$returnline,"\n" if $prinkter == 1; + return $returnline; + +} + +sub multiSpecies_interruptedMicrosatHunter_left_extention_permission_giver{ + my @fields = split(/\t/,$_[0]); + my $microsat = $fields[$microsatcord]; + $microsat =~ s/(^\[)|-//sg; + my $motif = $fields[$motifcord]; + chomp $motif; +# print $motif, "\n" if $motif !~ /^\[/; + my $firstmotif = (); + my $firststretch = (); + my @stretches=(); + +# print "motif = $motif, microsat = $microsat\n" if $prinkter == 1; + if ($motif =~ /^\[/){ + $motif =~ s/^\[//sg; + $motif =~ /([a-zA-Z]+)\].*/; + $firstmotif = $1; + @stretches = split(/\]/,$microsat); + $firststretch = $stretches[0]; + #print "firststretch = $firststretch\n" if $prinkter == 1; + } + else {$firstmotif = $motif;$firststretch = $microsat;} +# print "if length:firststretch - length($firststretch) < threshes length :firstmotif ($firstmotif) - $thresholds[length($firstmotif)]\n" if $prinkter == 1; + if (length($firststretch) < $thresholds[length($firstmotif)]){ + return "no"; + } + else {return "yes";} + +} +sub multiSpecies_interruptedMicrosatHunter_right_extention_permission_giver{ + my @fields = split(/\t/,$_[0]); + my $microsat = $fields[$microsatcord]; + $microsat =~ s/-|(\]$)//sg; + my $motif = $fields[$motifcord]; + chomp $motif; + my $temp_lastmotif = (); + my $laststretch = (); + my @stretches=(); + + + if ($motif =~ /\]/){ + $motif =~ s/\]$//sg; + $motif =~ /.*\[([a-zA-Z]+)$/; + $temp_lastmotif = $1; + @stretches = split(/\[/,$microsat); + $laststretch = pop(@stretches); + #print "last stretch = $laststretch\n" if $prinkter == 1; + } + else {$temp_lastmotif = $motif; $laststretch = $microsat;} + + if (length($laststretch) < $thresholds[length($temp_lastmotif)]){ + return "no"; + } + else { return "yes";} + + +} +sub checking_substitutions{ + + my ($line, $seq, $startprobes, $endprobes) = @_; + #print "sequence = $seq \n" if $prinkter == 1; + #print "COMMAND = \n $line, \n $seq, \n $startprobes \n, $endprobes\n"; + # ; + my @seqarray = split(/\s*/,$seq); + my @startsubst_probes = split(/\|/,$startprobes); + my @endsubst_probes = split(/\|/,$endprobes); + chomp $line; + my @fields = split(/\t/,$line); + my $start = $fields[11] - $fields[10]; + my $end = $fields[13] - $fields[10]; + my $motif = $fields[9]; #IN FUTURE, USE THIS AS A PROBE, LIKE MOTIF = $FIELDS[9].$FIELDS[9] + $motif =~ s/\[|\]//g; + my $microsat = $fields[14]; + $microsat =~ s/\[|\]//g; + #------------------------------------------------------------------------ + # GETTING START AND END PHASES + my $startphase = substr($microsat,0, length($motif)); + my $endphase = substr($microsat,-length($motif), length($motif)); + #print "start and end phases are - $startphase and $endphase\n"; + my $startflag = 'down'; + my $endflag = 'down'; + my $substitution_distance = length($motif); + my $prestart = $start - $substitution_distance; + my $postend = $end + $substitution_distance; + my @endadds = (); + my @startadds = (); + if (($prestart < 0) || ($postend > scalar(@seqarray))) { + last; + } + #------------------------------------------------------------------------#------------------------------------------------------------------------ + # CHECKING FOR SUBSTITUTION PROBES NOW + + if ($fields[8] ne "mononucleotide"){ + while ($startflag eq "down"){ + my $search = join("",@seqarray[$prestart...($start-1)]); + #print "search is from $prestart...($start-1) = $search\n"; + foreach my $probe (@startsubst_probes){ + #print "\t\tprobe = $probe\n"; + if ($search =~ /^$probe/){ + #print "\tfound addition to the left - $search \n"; + my $copyprobe = $probe; + my $type; + my $subspos = 0; + my $interruption = ""; + if ($search eq $startphase) { $type = "NONE";} + else{ + $copyprobe =~ s/\[a-zA-Z\]/^/g; + $subspos = index($copyprobe,"^") + 1; + $type = "substitution"; + $interruption = substr($search, $subspos,1); + } + my $addinfo = join("\t",$prestart, $start, $search, $type, $interruption, $subspos); + #print "adding information: $addinfo \n"; + push(@startadds, $addinfo); + $prestart = $prestart - $substitution_distance; + $start = $start-$substitution_distance; + $startflag = 'down'; + + last; + } + else{ + $startflag = 'up'; + } + } + } + #; + while ($endflag eq "down"){ + my $search = join("",@seqarray[($end+1)...$postend]); + #print "search is from ($end+1)...$postend] = $search\n"; + + foreach my $probe (@endsubst_probes){ + #print "\t\tprobe = $probe\n"; + if ($search =~ /$probe$/){ + my $copyprobe = $probe; + my $type; + my $subspos = 0; + my $interruption = ""; + if ($search eq $endphase) { $type = "NONE";} + else{ + $copyprobe =~ s/\[a-zA-Z\]/^/g; + $subspos = index($copyprobe,"^") + 1; + $type = "substitution"; + $interruption = substr($search, $subspos,1); + } + my $addinfo = join("\t",$end, $postend, $search, $type, $interruption, $subspos); + #print "adding information: $addinfo \n"; + push(@endadds, $addinfo); + $postend = $postend + $substitution_distance; + $end = $end+$substitution_distance; + push(@endadds, $search); + $endflag = 'down'; + last; + } + else{ + $endflag = 'up'; + } + } + } + #print "startadds = @startadds, endadds = @endadds \n"; + + } +} +sub microsat_packer{ + my $microsat = $_[0]; + my $addition = $_[1]; + + + +} +sub multiSpecies_interruptedMicrosatHunter_merge{ + $prinkter = 0; +# print "~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~|||~~~~~~~~\n"; + my $line = $_[0]; +# print "sent for mering: $line \n" if $prinkter ==1; + my @mields = split(/\t/,$line); + my @fields = @mields; + my $microsat = allCaps($fields[$microsatcord]); + my $motifline = allCaps($fields[$motifcord]); + my $microsatcopy = $microsat; +# print "microsat = $microsat|\n" if $prinkter ==1; + $microsatcopy =~ s/^\[|\]$//sg; + chomp $microsatcopy; + my @microields = split(/\][a-zA-Z|-]*\[/,$microsatcopy); + my @inields = split(/\[[a-zA-Z|-]*\]/,$microsat); + shift @inields; +# print "inields =",join("|",@inields)," microields = ",join("|",@microields)," and count of microields = ", $#microields,"\n" if $prinkter ==1; + $motifline =~ s/^\[|\]$//sg; + my @motields = split(/\]\[/,$motifline); + my @firstmotifs = (); + my @lastmotifs = (); + for my $i (0 ... $#microields){ + $firstmotifs[$i] = substr($microields[$i],0,length($motields[$i])); + $lastmotifs[$i] = substr($microields[$i],-length($motields[$i])); + } +# print "firstmotif = @firstmotifs... lastmotif = @lastmotifs\n" if $prinkter ==1; + my @mergelist = (); + my @inter_poses = split(/,/,$fields[$interr_poscord]); + my $no_of_interruptions = $fields[$no_of_interruptionscord]; + my @interruptions = split(/,/,$fields[$interrcord]); + my @interrtypes = split(/,/,$fields[$interrtypecord]); + my $stopper = 0; + for my $i (0 ... $#motields-1){ +# print "studying connection of $motields[$i] and $motields[$i+1], i = $i in $microsat\n:$lastmotifs[$i] eq $firstmotifs[$i+1]?\n" if $prinkter ==1; + if ((allCaps($lastmotifs[$i]) eq allCaps($firstmotifs[$i+1])) && (!exists $inields[$i] || $inields[$i] !~ /[a-zA-Z]/)){ + $stopper = 1; + push(@mergelist, ($i)."_".($i+1)); # if $prinkter ==1; + } + } + +# print "mergelist = @mergelist\n" if $prinkter ==1; + return $line if scalar(@mergelist) == 0; +# print "merging @mergelist\n" if $prinkter ==1; +# if $prinkter ==1; + + foreach my $merging (@mergelist){ + my @sets = split(/_/, $merging); +# print "sets = @sets\n" if $prinkter ==1; + my @tempmicro = (); + my @tempmot = (); +# print "for loop going from 0 ... ", $sets[0]-1, "\n" if $prinkter ==1; + for my $i (0 ... $sets[0]-1){ +# print " adding pre- i = $i adding: microields= $microields[$i]. motields = $motields[$i], inields = |$inields[$i]|\n" if $prinkter ==1; + push(@tempmicro, "[".$microields[$i]."]"); + push(@tempmicro, $inields[$i]); + push(@tempmot, "[".$motields[$i]."]"); +# print "adding pre-motifs number $i\n" if $prinkter ==1; +# print "tempmot = @tempmot, tempmicro = @tempmicro \n" if $prinkter ==1; + } +# print "tempmot = @tempmot, tempmicro = @tempmicro \n" if $prinkter ==1; +# print "now pushing ", "[",$microields[$sets[0]]," and ",$microields[$sets[1]],"]\n" if $prinkter ==1; + my $pusher = "[".$microields[$sets[0]].$microields[$sets[1]]."]"; +# print "middle is, from @motields - @sets, number 0 which is is\n"; +# print ": $motields[$sets[0]]\n"; + push (@tempmicro, $pusher); + push(@tempmot, "[".$motields[$sets[0]]."]"); + push (@tempmicro, $inields[$sets[1]]) if $sets[1] != $#microields && exists $sets[1] && exists $inields[$sets[1]]; + my $outcoming = -2; +# print "tempmot = @tempmot, tempmicro = @tempmicro \n" if $prinkter ==1; +# print "for loop going from ",$sets[1]+1, " ... ", $#microields, "\n" if $prinkter ==1; + for my $i ($sets[1]+1 ... $#microields){ +# print " adding post- i = $i adding: microields= $microields[$i]. motields = $motields[$i]\n" if $prinkter ==1; + push(@tempmicro, "[".$microields[$i]."]") if exists $microields[$i]; + push(@tempmicro, $inields[$i]) unless $i == $#microields || !exists $inields[$i]; + push(@tempmot, "[".$motields[$i]."]"); +# print "adding post-motifs number $i\n" if $prinkter ==1; + $outcoming = $i; + } +# print "____________________________________________________________________________\n"; + $prinkter = 0; + $fields[$microsatcord] = join("",@tempmicro); + $fields[$motifcord] = join("",@tempmot); +# print "tempmot = @tempmot, tempmicro = @tempmicro . microsat = $fields[$microsatcord] and motif = $fields[$motifcord] \n" if $prinkter ==1; + + splice(@interrtypes, $sets[0], 1); + $fields[$interrtypecord] = join(",",@interrtypes); + splice(@interruptions, $sets[0], 1); + $fields[$interrcord] = join(",",@interruptions); + splice(@inter_poses, $sets[0], 1); + $fields[$interr_poscord] = join(",",@inter_poses); + $no_of_interruptions = $no_of_interruptions - 1; + } + + if ($no_of_interruptions == 0 && $line !~ /compound/){ + $fields[$microsatcord] =~ s/^\[|\]$//sg; + $fields[$motifcord] =~ s/^\[|\]$//sg; + $line = join("\t", @fields[0 ... $motifcord]); + } + else{ + $line = join("\t", @fields); + } +# print "post merging, the line is $line\n" if $prinkter ==1; + # if $stopper ==1; + return $line; +} +sub interval_asseser{ + my $pre_phase = $_[0]; my $post_phase = $_[1]; my $inter = $_[3]; +} +#--------------------------------------------------------------------------------------------------- +sub allCaps{ + my $motif = $_[0]; + $motif =~ s/a/A/g; + $motif =~ s/c/C/g; + $motif =~ s/t/T/g; + $motif =~ s/g/G/g; + return $motif; +} + + +#xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx chromosome_unrand_breamultiSpecies_interruptedMicrosatHunterker xxxxxxxxxxxxxx multiSpecies_interruptedMicrosatHunter xxxxxxxxxxxxxx + + +#xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx +sub merge_interruptedMicrosats{ +# print "IN merge_interruptedMicrosats: @_\n"; + my $input0 = $_[0]; ######looks like this: my $t8humanoutput = $pipedir.$ptag."_nogap_op_unrand2" + my $input1 = $_[1]; ###### the *_sput_op4_ii file + my $input2 = $_[2]; ###### the *_sput_op4_ii file + $no_of_species = $_[3]; + + my $output1 = $_[1]."_separate"; #$_[3]; ###### plain microsatellite file forward + my $output2 = $_[2]."_separate"; ##$_[4]; ###### plain microsatellite file reverse + my $output3 = $_[1]."_merged"; ##$_[5]; ###### plain microsatellite file forward + #my $output4 = $_[2]."_merged"; ##$_[6]; ###### plain microsatellite file reverse + #my $info = $_[4]; + #my @tags = split(/\t/,$info); + + open(SEQ,"<$input0") or die "Cannot open file $input0 $!"; + open(INF,"<$input1") or die "Cannot open file $input1 $!"; + open(INR,"<$input2") or die "Cannot open file $input2 $!"; + open(OUTF,">$output1") or die "Cannot open file $output1 $!"; + open(OUTR,">$output2") or die "Cannot open file $output2 $!"; + open(MER,">$output3") or die "Cannot open file $output3 $!"; + #open(MERR,">$output4") or die "Cannot open file $output4 $!"; + + + + $printer = 0; + +# print "files opened \n"; + $infocord = 2 + (4*$no_of_species) - 1; + $startcord = 2 + (4*$no_of_species) + 2 - 1; + $strandcord = 2 + (4*$no_of_species) + 3 - 1; + $endcord = 2 + (4*$no_of_species) + 4 - 1; + $microsatcord = 2 + (4*$no_of_species) + 5 - 1; + $motifcord = 2 + (4*$no_of_species) + 6 - 1; + $typecord = $infocord + 1; + my $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; + + $interrtypecord = $motifcord + 1; + $interrcord = $motifcord + 2; + $interr_poscord = $motifcord + 3; + $no_of_interruptionscord = $motifcord + 4; + $mergestarts = $no_of_interruptionscord+ 1; + $mergeends = $no_of_interruptionscord+ 2; + $mergemicros = $no_of_interruptionscord+ 3; + + # NOW ADDING FORWARD MICROSATELLITES TO HASH + my %fmicros = (); + my $microcounter=0; + my $linecounter = 0; + while (my $line = ){ + # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n"; + $linecounter++; + if ($line =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) { + my $key = join("\t",$1, $2, $4, $5); + # print $key, "#-#-#-#-#-#-#-#\n"; + push (@{$fmicros{$key}},$line); + $microcounter++; + } + else { + #print $line; + } + } +# print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n"; + close INF; + my @deletedlines = (); +# print "done forward hash \n"; + $linecounter = 0; + #--------------------------------------------------------------------------------------------------- + # NOW ADDING REVERSE MICROSATELLITES TO HASH + my %rmicros = (); + $microcounter=0; + while (my $line = ){ + # print "$org\t(chr[0-9a-zA-Z]+)\t([0-9]+)\t([0-9])+\t \n"; + $linecounter++; + if ($line =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) { + my $key = join("\t",$1, $2, $4, $5); + # print $key, "#-#-#-#-#-#-#-#\n"; + push (@{$rmicros{$key}},$line); + $microcounter++; + } + else { + #print "cant make key\n"; + } + } +# print "number of reverse microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n"; + close INR; +# print "done reverse hash \n"; + $linecounter = 0; + + #------------------------------------------------------------------------------------------------ + + while(my $sine = ){ + # if $sine =~ /16349128/; + next if $sine !~ /[a-zA-Z0-9]/; +# print "-" x 150, "\n" if $printer == 1; + my @sields = split(/\t/,$sine); + my @merged = (); + + my $key = (); + + if ($sine =~ /^>[A-Za-z0-9]+\s+([0-9]+)\s+([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) { + $key = join("\t",$1, $2, $4, $5); + # print $key, "<-<-<-<-<-<-<-<\n"; + } + # print "key = $key\n"; + + my @sets1; + my @sets2; + chomp $sields[$sequencepos]; + my $rev_sequence = reverse($sields[$sequencepos]); + $rev_sequence =~ s/ //g; + $rev_sequence = " ".$rev_sequence; + next if (!exists $fmicros{$key} && !exists $rmicros{$key}); + + if (exists $fmicros{$key}){ + # print "line no : $linecount\n"; + my @raw_microstring = @{$fmicros{$key}}; + my %starts = (); my %ends = (); +# print colored ['yellow'],"unsorted, unfiltered microats = \n" if $printer == 1; foreach (@raw_microstring) {print colored ['blue'],$_,"\n" if $printer == 1;} + my @microstring=(); + for my $u (0 ... $#raw_microstring){ + my @tields = split(/\t/,$raw_microstring[$u]); + next if exists $starts{$tields[$startcord]} && exists $ends{$tields[$endcord]}; + push(@microstring, $raw_microstring[$u]); + $starts{$tields[$startcord]} = $tields[$startcord]; + $ends{$tields[$endcord]} = $tields[$endcord]; + } + + # print "founf microstring in forward\n: @microstring\n"; + chomp @microstring; + my $clusterresult = (find_clusters(@microstring, $sields[$sequencepos])); + @sets1 = split("\=", $clusterresult); + my @temp = split(/_X0X_/,$sets1[0]) ; $microscanned+= scalar(@temp); + # print "sets = ", join("", @sets1), "\n<<-sets1\n"; ; + } #if (exists $micros{$key}){ + + if (exists $rmicros{$key}){ + # print "line no : $linecount\n"; + my @raw_microstring = @{$rmicros{$key}}; + my %starts = (); my %ends = (); +# print colored ['yellow'],"unsorted, unfiltered microats = \n" if $printer == 1; foreach (@raw_microstring) {print colored ['blue'],$_,"\n" if $printer == 1;} + my @microstring=(); + for my $u (0 ... $#raw_microstring){ + my @tields = split(/\t/,$raw_microstring[$u]); + next if exists $starts{$tields[$startcord]} && exists $ends{$tields[$endcord]}; + push(@microstring, $raw_microstring[$u]); + $starts{$tields[$startcord]} = $tields[$startcord]; + $ends{$tields[$endcord]} = $tields[$endcord]; + } + # print "founf microstring in reverse\n: @microstring\n"; ; + chomp @microstring; + # print "sending reversed sequence\n"; + my $clusterresult = (find_clusters(@microstring, $rev_sequence ) ); + @sets2 = split("\=", $clusterresult); + my @temp = split(/_X0X_/,$sets2[0]) ; $microscanned+= scalar(@temp); + } #if (exists $micros{$key}){ + + my @popout1 = (); + my @popout2 = (); + my @forwardset = (); + if (exists $sets2[1] ){ + if(exists $sets1[0]) { + push (@popout1, $sets1[0],$sets2[1]); + my @forwardset = split("=", popOuter(@popout1, $rev_sequence ));# + print OUTF join("\n",split("_X0X_", $forwardset[0])), "\n"; + my @localmerged = split("_X0X_", $forwardset[1]); + my $sequence = $sields[$sequencepos]; + $sequence =~ s/ //g; +# print "\nforwardset = @forwardset\n"; + for my $j (0 ... $#localmerged){ + $localmerged[$j] = invert_justCoordinates ($localmerged[$j], length($sequence)); + } + + push (@merged, @localmerged); + + } + else{ + my @localmerged = split("_X0X_", $sets2[1]); + my $sequence = $sields[$sequencepos]; + $sequence =~ s/ //g; + for my $j (0 ... $#localmerged){ +# print "\nlocalmerged = @localmerged\n"; + $localmerged[$j] = invert_justCoordinates ($localmerged[$j], length($sequence)); + } + + push (@merged, @localmerged); + } + } + elsif (exists $sets1[0]){ + print OUTF join("\n",split("_X0X_", $sets1[0])), "\n"; + } + + my @reverseset= (); + if (exists $sets1[1]){ + if (exists $sets2[0]){ + push (@popout2, $sets2[0],$sets1[1]); + # print "popout2 = @popout2\n"; + my @reverseset = split("=", popOuter(@popout2, $sields[$sequencepos])); + #print "reverseset = $reverseset[1] < --- reverseset1\n"; + print OUTR join("\n",split("_X0X_", $reverseset[0])), "\n"; + push(@merged, (split("_X0X_", $reverseset[1]))); + } + else{ + push(@merged, (split("_X0X_", $sets1[1]))); + } + } + elsif (exists $sets2[0]){ + print OUTR join("\n",split("_X0X_", $sets2[0])), "\n"; + + } + + if (scalar @merged > 0){ + my @filtered_merged = split("__",(filterDuplicates_merged(@merged))); + print MER join("\n", @filtered_merged),"\n"; + } + # if $sine =~ /16349128/; + + } + close(SEQ); + close(INF); + close(INR); + close(OUTF); + close(OUTR); + close(MER); + +} +sub find_clusters{ + my @input = @_; + my $sequence = pop(@input); + $sequence =~ s/ //g; + my @microstring0 = @input; +# print "IN: find_clusters:\n"; + my %microstart=(); + my %microend=(); + my @nonmerged = (); + my @mergedSet = (); +# print "set of microsats = @microstring \n"; + my @microstring = map { $_->[0] } sort custom map { [$_, split /\t/ ] } @microstring0; +# print "microstring = ", join("\n",@microstring0) ," \n---->\n", join("\n", @microstring),"\n ,,+." if $printer == 1; + # if $printer == 1; + my @tempmicrostring = @microstring; + foreach my $line (@tempmicrostring){ + my @fields = split(/\t/,$line); + my $start = $fields[$startcord]; + my $end = $fields[$endcord]; + next if $start !~ /[0-9]+/ || $end !~ /[0-9]+/; + # print " starts >>> start: $start = $fields[11] - $fields[10] || $end = $fields[13] - $fields[10]\n"; + push (@{$microstart{$start}},$line); + push (@{$microend{$end}},$line); + } + my $firstflag = 'down'; + while( my $line =shift(@microstring)){ +# print "-----------\nline = $line \n" if $printer == 1; + chomp $line; + my @fields = split(/\t/,$line); + my $start = $fields[$startcord]; + my $end = $fields[$endcord]; + next if $start !~ /[0-9]+/ || $end !~ /[0-9]+/ || $distance !~ /[0-9]+/ ; + my $startmicro = $line; + my $endmicro = $line; +# print "start: $start = $fields[11] - $fields[10] || $end = $fields[13] - $fields[10]\n"; + + delete ($microstart{$start}); + delete ($microend{$end}); + my $flag = 'down'; + my $startflag = 'down'; + my $endflag = 'down'; + my $prestart = $start - $distance; + my $postend = $end + $distance; + my @compoundlines = (); + my %compoundhash = (); + push (@compoundlines, $line); + push (@{$compoundhash{$line}},$line); + my $startrank = 1; + my $endrank = 1; + + while( ($startflag eq "down") || ($endflag eq "down") ){ +# print "prestart=$prestart, post end =$postend.. seqlen =", length($sequence)," firstflag = $firstflag \n" if $printer == 1; + if ( (($prestart < 0) && $firstflag eq "up") || (($postend > length($sequence) && $firstflag eq "up")) ){ +# print "coming to the end of sequence,post end = $postend and sequence length =", length($sequence)," so exiting\n" if $printer == 1; + last; + } + + $firstflag = "up"; + if ($startflag eq "down"){ + for my $i ($prestart ... $end){ + if(exists $microend{$i}){ + chomp $microend{$i}[0]; + if(exists $compoundhash{$microend{$i}[0]}) {next;} + chomp $microend{$i}[0]; + push(@compoundlines, $microend{$i}[0]); + my @tields = split(/\t/,$microend{$i}[0]); + $startmicro = $microend{$i}[0]; + chomp $startmicro; + $flag = 'down'; + $startrank++; +# print "deleting $microend{$i}[0] and $microstart{$tields[$startcord]}[0]\n" if $printer == 1; + delete $microend{$i}; + delete $microstart{$tields[$startcord]}; + $end = $tields[$endcord]; + $startflag = 'down'; + $prestart = $tields[$startcord] - $distance; + last; + } + else{ + $flag = 'up'; + $startflag = 'up'; + } + } + } + + if ($endflag eq "down"){ + + for my $i ($start ... $postend){ +# print "$start ----> $i -----> $postend\n" if $printer == 1; + if(exists $microstart{$i} ){ + chomp $microstart{$i}[0]; + if(exists $compoundhash{$microstart{$i}[0]}) {next;} + chomp $microstart{$i}[0]; + push(@compoundlines, $microstart{$i}[0]); + my @tields = split(/\t/,$microstart{$i}[0]); + $endmicro = $microstart{$i}[0]; + $endrank++; + chomp $endmicro; + $flag = 'down'; +# print "deleting $microend{$tields[$endcord]}[0]\n" if $printer == 1; + + delete $microstart{$i} if exists $microstart{$i} ; + delete $microend{$tields[$endcord]} if exists $microend{$tields[$endcord]}; +# print "done\n" if $printer == 1; + + shift @microstring; + $end = $tields[$endcord]; + $postend = $tields[$endcord] + $distance; + $endflag = 'down'; + last; + } + else{ + $flag = 'up'; + $endflag = 'up'; + } +# print "out of the if\n" if $printer == 1; + } +# print "out of the for\n" if $printer == 1; + + } +# print "for next turn, flag status: startflag = $startflag and endflag = $endflag \n"; + } #end while( $flag eq "down") +# print "compoundlines = @compoundlines \n" if $printer == 1; + + if (scalar (@compoundlines) == 1){ + push(@nonmerged, $line); + + } + if (scalar (@compoundlines) > 1){ +# print "FROM CLUSTERER\n" if $printer == 1; + push(@mergedSet,merge_microsats(@compoundlines, $sequence) ); + } + } #end foreach my $line (@microstring){ +# print join("\n",@mergedSet),"<-----mergedSet\n" if $printer == 1; +# if scalar(@mergedSet) > 0; +# print "EXIT: find_clusters\n"; +return (join("_X0X_",@nonmerged). "=".join("_X0X_",@mergedSet)); +} + +sub custom { + $a->[$startcord+1] <=> $b->[$startcord+1]; +} + +sub popOuter { +# print "\nIN: popOuter @_\n"; ; + my @all = split ("_X0X_",$_[0]); +# if !defined $_[0]; + my @merged = split ("_X0X_",$_[1]); + my $sequence = $_[2]; + my $seqlen = length($sequence); + my %microstart=(); + my %microend=(); + my @mergedSet = (); + my @nonmerged = (); + + foreach my $line (@all){ + my @fields = split(/\t/,$line); + my $start = $seqlen - $fields[$startcord]+ 1; + my $end = $seqlen - $fields[$endcord] + 1; + push (@{$microstart{$start}},$line); + push (@{$microend{$end}},$line); + } + my $firstflag = 'down'; + my %forPopouting = (); + + while( my $line =shift(@merged)){ + # print "\n MErgedline: $line .. startcord = $startcord ... endcord = $endcord\n" ; + chomp $line; + my @fields = split(/\t/,$line); + my $start = $fields[$startcord]; + my $end = $fields[$endcord]; + my $startmicro = $line; + my $endmicro = $line; + + + delete ($microstart{$start}); + delete ($microend{$end}); + my $flag = 'down'; + my $startflag = 'down'; + my $endflag = 'down'; + my $prestart = $start - $distance; + my $postend = $end + $distance; + my @compoundlines = (); + my %compoundhash = (); + push (@compoundlines, $line); + my $startrank = 1; + my $endrank = 1; + + # print "\nstart = $start, end = $end\n"; + # ; + for my $i ($start ... $end){ + if(exists $microend{$i}){ + # print "\nmicrosat exists: $microend{$i}[0] microsat exists\n"; + chomp $microend{$i}[0]; + my @fields = split(/\t/,$microend{$i}[0]); + delete $microstart{$seqlen - $fields[$startcord] + 1}; + my $invertseq = $sequence; + $invertseq =~ s/ //g; + push(@compoundlines, invert_microsat($microend{$i}[0] , $invertseq )); + delete $microend{$i}; + + } + + if(exists $microstart{$i} ){ + # print "\nmicrosat exists: $microstart{$i}[0] microsat exists\n"; + + chomp $microstart{$i}[0]; + my @fields = split(/\t/,$microstart{$i}[0]); + delete $microend{$seqlen - $fields[$endcord] + 1}; + my $invertseq = $sequence; + $invertseq =~ s/ //g; + push(@compoundlines, invert_microsat($microstart{$i}[0], $invertseq) ); + delete $microstart{$i}; + } + } + + if (scalar (@compoundlines) == 1){ + push(@mergedSet,join("\t",@compoundlines) ); + } + else { +# print "FROM POPOUTER\n" if $printer == 1; + push(@mergedSet, merge_microsats(@compoundlines, $sequence) ); + } + } + + foreach my $key (sort keys %microstart) { + push(@nonmerged,$microstart{$key}[0]); + } + + return (join("_X0X_",@nonmerged). "=".join("_X0X_",@mergedSet) ); +} + + + +sub invert_justCoordinates{ + my $microsat = $_[0]; +# print "IN invert_justCoordinates ... @_\n" ; ; + chomp $microsat; + my $seqLength = $_[1]; + my @fields = split(/\t/,$microsat); + my $start = $seqLength - $fields[$endcord] + 1; + my $end = $seqLength - $fields[$startcord] + 1; + $fields[$startcord] = $start; + $fields[$endcord] = $end; + $fields[$microsatcord] = reverse_micro($fields[$microsatcord]); +# print "RETURNIG: ", join("\t",@fields), "\n" if $printer == 1; + return join("\t",@fields); +} + +sub largest_number{ + my $counter = 0; + my($max) = shift(@_); + foreach my $temp (@_) { + #print "finding largest array: $maxcounter \n"; + if($temp > $max){ + $max = $temp; + } + } + return($max); +} +sub smallest_number{ + my $counter = 0; + my($min) = shift(@_); + foreach my $temp (@_) { + #print "finding largest array: $maxcounter \n"; + if($temp < $min){ + $min = $temp; + } + } + return($min); +} + + +sub filterDuplicates_merged{ + my @merged = @_; + my %revmerged = (); + my @fmerged = (); + foreach my $micro (@merged) { + my @fields = split(/\t/,$micro); + if ($fields[3] =~ /chr[A-Z0-9a-z]+r/){ + my $key = join("_K0K_",$fields[1], $fields[$startcord], $fields[$endcord]); + # print "adding ... \n$key\n$micro\n"; + push(@{$revmerged{$key}}, $micro); + } + else{ + # print "pushing.. $micro\n"; + push(@fmerged, $micro); + } + } +# print "\n"; + foreach my $micro (@fmerged) { + my @fields = split(/\t/,$micro); + my $key = join("_K0K_",$fields[1], $fields[$startcord], $fields[$endcord]); + # print "searching for key $key\n"; + if (exists $revmerged{$key}){ + # print "deleting $revmerged{$key}[0]\n"; + delete $revmerged{$key}; + } + } + foreach my $key (sort keys %revmerged) { + push(@fmerged,$revmerged{$key}[0]); + } +# print "returning ", join("\n", @fmerged),"\n" ; + return join("__", @fmerged); +} + +sub invert_microsat{ + my $micro = $_[0]; + chomp $micro; + if ($micro =~ /chr[A-Z0-9a-z]+r/) { $micro =~ s/chr([0-9a-b]+)r/chr$1/g ;} + else { $micro =~ s/chr([0-9a-b]+)/chr$1r/g ; } + my $sequence = $_[1]; + $sequence =~ s/ //g; + my $seqlen = length($sequence); + my @fields = split(/\t/,$micro); + my $start = $seqlen - $fields[$endcord] +1; + my $end = $seqlen - $fields[$startcord] +1; + $fields[$startcord] = $start; + $fields[$endcord] = $end; + $fields[$motifcord] = reverse_micro($fields[$motifcord]); + $fields[$microsatcord] = reverse_micro($fields[$microsatcord]); + if ($fields[$typecord] ne "compound" && exists $fields[$no_of_interruptionscord] ){ + my @intertypes = split(/,/,$fields[$interrtypecord]); + my @inters = split(/,/,$fields[$interrcord]); + my @interposes = split(/,/,$fields[$interr_poscord]); + $fields[$interrtypecord] = join(",",reverse(@intertypes)); + $fields[$no_of_interruptionscord] = scalar(@interposes); + for my $i (0 ... $fields[$no_of_interruptionscord]-1){ + if (exists $inters[$i] && $inters[$i] =~ /[a-zA-Z]/){ + $inters[$i] = reverse($inters[$i]); + $interposes[$i] = $interposes[$i] + length($inters[$i]) - 1; + } + else{ + $inters[$i] = ""; + $interposes[$i] = $interposes[$i] - 1; + } + $interposes[$i] = ($end - $start + 1) - $interposes[$i] + 1; + } + + $fields[$interrcord] = join(",",reverse(@inters)); + $fields[$interr_poscord] = join(",",reverse(@interposes)); + } + + my $finalmicrosat = join("\t", @fields); + return $finalmicrosat; + +} +sub reverse_micro{ + my $micro = reverse($_[0]); + my @strand = split(/\s*/,$micro); + for my $i (0 ... $#strand){ + if ($strand[$i] =~ /\[/i) {$strand[$i] = "]";next;} + if ($strand[$i] =~ /\]/i) {$strand[$i] = "[";next;} + } + return join("",@strand); +} + +#xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx merge_interruptedMicrosats xxxxxxxxxxxxxx + + +#xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx + +sub forward_reverse_sputoutput_comparer { +# print "IN forward_reverse_sputoutput_comparer: @_\n"; + my $input0 = $_[0]; ###### the *nogap_unrand_match file + my $input1 = $_[1]; ###### the real file, *sput* data + my $input2 = $_[2]; ###### the reverse file, *sput* data + my $output1 = $_[3]; ###### microsats different in real file + my $output2 = $_[4]; ###### microsats missing in real file + my $output3 = $_[5]; ###### microsats common among real and reverse file + my $no_of_species = $_[6]; + + $infocord = 2 + (4*$no_of_species) - 1; + $typecord = 2 + (4*$no_of_species) + 1 - 1; + $startcord = 2 + (4*$no_of_species) + 2 - 1; + $strandcord = 2 + (4*$no_of_species) + 3 - 1; + $endcord = 2 + (4*$no_of_species) + 4 - 1; + $microsatcord = 2 + (4*$no_of_species) + 5 - 1; + $motifcord = 2 + (4*$no_of_species) + 6 - 1; + $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; + $interrtypecord = $motifcord + 1; + $interrcord = $motifcord + 2; + $interr_poscord = $motifcord + 3; + $no_of_interruptionscord = $motifcord + 4; + $mergestarts = $no_of_interruptionscord+ 1; + $mergeends = $no_of_interruptionscord+ 2; + $mergemicros = $no_of_interruptionscord+ 3; + + + open(SEQ,"<$input0") or die "Cannot open file $input0 $!"; + open(INF,"<$input1") or die "Cannot open file $input1 $!"; + open(INR,"<$input2") or die "Cannot open file $input2 $!"; + + open(DIFF,">$output1") or die "Cannot open file $output1 $!"; + #open(MISS,">$output2") or die "Cannot open file $output2 $!"; + open(SAME,">$output3") or die "Cannot open file $output3 $!"; + + +# print "opened files \n"; + my $linecounter = 0; + my $fcounter = 0; + my $rcounter = 0; + + $printer = 0; + #--------------------------------------------------------------------------------------------------- + # NOW ADDING FORWARD MICROSATELLITES TO HASH + my %fmicros = (); + my $microcounter=0; + while (my $line = ){ + $linecounter++; + if ($line =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) { + my $key = join("\t",$1, $3, $4, $5, $7, $8, $9, $11, $12); + # print $key, "#-#-#-#-#-#-#-#\n"; + push (@{$fmicros{$key}},$line); + $microcounter++; + } + else { + #print $line; + } + } +# print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n"; + close INF; + my @deletedlines = (); +# print "done forward hash \n"; + $linecounter = 0; + #--------------------------------------------------------------------------------------------------- + # NOW ADDING REVERSE MICROSATELLITES TO HASH + my %rmicros = (); + $microcounter=0; + while (my $line = ){ + $linecounter++; + if ($line =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) { + my $key = join("\t",$1, $3, $4, $5, $7, $8, $9, $11, $12); + # print $key, "#-#-#-#-#-#-#-#\n"; + push (@{$rmicros{$key}},$line); + $microcounter++; + } + else {} + } +# print "number of microsatellites added to hash = $microcounter\nnumber of lines scanned = $linecounter\n"; + close INR; +# print "done reverse hash \n"; + $linecounter = 0; + #--------------------------------------------------------------------------------------------------- + #--------------------------------------------------------------------------------------------------- + # NOW READING THE SEQUENCE FILE + while(my $sine = ){ + my %microstart=(); + my %microend=(); + my @sields = split(/\t/,$sine); + my $key = (); + if ($sine =~ /([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s[\+|\-]\s([0-9a-zA-Z]+)\s([0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)\s/ ) { + $key = join("\t",$1, $3, $4, $5, $7, $8, $9, $11, $12); + } + else { + next; + } + $printer = 0; + my $sequence = $sields[$sequencepos]; + chomp $sequence; + $sequence =~ s/ //g; + my @localfs = (); + my @localrs = (); + + if (exists $fmicros{$key}){ + @localfs = @{$fmicros{$key}}; + delete $fmicros{$key}; + } + + my %forwardstarts = (); + my %forwardends = (); + + foreach my $f (@localfs){ + my @fields = split(/\t/,$f); + push (@{$forwardstarts{$fields[$startcord]}},$f); + push (@{$forwardends{$fields[$endcord]}},$fields[$startcord]); + } + + if (exists $rmicros{$key}){ + @localrs = @{$rmicros{$key}}; + delete $rmicros{$key}; + } + else{ + } + + foreach my $r (@localrs){ + chomp $r; + my @rields = split(/\t/,$r); +# print "rields = @rields\n" if $printer == 1; + my $reciprocalstart = length($sequence) - $rields[$endcord] + 1; + my $reciprocalend = length($sequence) - $rields[$startcord] + 1; +# print "reciprocal start = $reciprocalstart = ",length($sequence)," - $rields[$endcord] + 1\n" if $printer == 1; + my $microsat = reverse_micro(all_caps($rields[$microsatcord])); + my @localcollection=(); + for my $i ($reciprocalstart+1 ... $reciprocalend-1){ + if (exists $forwardstarts{$i}){ + push(@localcollection, $forwardstarts{$i}[0] ); + delete $forwardstarts{$i}; + } + if (exists $forwardends{$i}){ + next if !exists $forwardstarts{$forwardends{$i}[0]}; + push(@localcollection, $forwardstarts{$forwardends{$i}[0]}[0] ); + } + } + if (exists $forwardstarts{$reciprocalstart} && exists $forwardends{$reciprocalend}) {push(@localcollection,$forwardstarts{$reciprocalstart}[0]);} + + if (scalar(@localcollection) == 0){ + print SAME invert_microsat($r,($sequence) ), "\n"; + } + + elsif (scalar(@localcollection) == 1){ +# print "f microsat = $localcollection[0]\n" if $printer == 1; + my @lields = split(/\t/,$localcollection[0]); + $lields[$microsatcord]=all_caps($lields[$microsatcord]); +# print "comparing: $microsat and $lields[$microsatcord]\n" if $printer == 1; +# print "coordinates are: $lields[$startcord]-$lields[$endcord] and $reciprocalstart-$reciprocalend\n" if $printer == 1; + if ($microsat eq $lields[$microsatcord]){ + chomp $localcollection[0]; + print SAME $localcollection[0], "\n"; + } + if ($microsat ne $lields[$microsatcord]){ + chomp $localcollection[0]; + my $newmicro = microsatChooser(join("\t",@lields), join("\t",@rields), $sequence); +# print "newmicro = $newmicro\n" if $printer == 1; + if ($newmicro =~ /[a-zA-Z]/){ + print SAME $newmicro,"\n"; + } + else{ + print DIFF join("\t",$localcollection[0],"-->",$rields[$typecord],$reciprocalstart,$reciprocalend, $rields[$microsatcord], reverse_micro($rields[$motifcord]), @rields[$motifcord+1 ... $#rields] ),"\n"; +# print join("\t",$localcollection[0],"-->",$rields[$typecord],$reciprocalstart,$reciprocalend, $rields[$microsatcord], reverse_micro($rields[$motifcord]), @rields[$motifcord+1 ... $#rields] ),"\n" if $printer == 1; +# print "@rields\n@lields\n" if $printer == 1; + } + } + } + else{ +# print "multiple found for $r --> ", join("\t",@localcollection),"\n" if $printer == 1; + } + } + } + + close(SEQ); + close(INF); + close(INR); + close(DIFF); + close(SAME); + +} +sub all_caps{ + my @strand = split(/\s*/,$_[0]); + for my $i (0 ... $#strand){ + if ($strand[$i] =~ /c/) {$strand[$i] = "C";next;} + if ($strand[$i] =~ /a/) {$strand[$i] = "A";next;} + if ($strand[$i] =~ /t/) { $strand[$i] = "T";next;} + if ($strand[$i] =~ /g/) {$strand[$i] = "G";next;} + } + return join("",@strand); +} +sub microsatChooser{ + my $forward = $_[0]; + my $reverse = $_[1]; + my $sequence = $_[2]; + my $seqLength = length($sequence); + $sequence =~ s/ //g; + my @fields = split(/\t/,$forward); + my @rields = split(/\t/,$reverse); + my $r_start = $seqLength - $rields[$endcord] + 1; + my $r_end = $seqLength - $rields[$startcord] + 1; + + + my $f_microsat = $fields[$microsatcord]; + my $r_microsat = $rields[$microsatcord]; + + if ($fields[$typecord] =~ /\./ && $rields[$typecord] =~ /\./) { + return $forward if length($f_microsat) >= length($r_microsat); + return invert_microsat($reverse, $sequence) if length($f_microsat) < length($r_microsat); + } + return $forward if all_caps($fields[$motifcord]) eq all_caps($rields[$motifcord]) && $fields[$startcord] == $rields[$startcord] && $fields[$endcord] == $rields[$endcord]; + + my $f_microsat_copy = $f_microsat; + my $r_microsat_copy = $r_microsat; + $f_microsat_copy =~ s/^\[|\]$//g; + $r_microsat_copy =~ s/^\[|\]$//g; + + my @f_microields = split(/\][a-zA-Z]*\[/,$f_microsat_copy); + my @r_microields = split(/\][a-zA-Z]*\[/,$r_microsat_copy); + my @f_intields = split(/\][a-zA-Z]*\[/,$f_microsat_copy); + my @r_intields = split(/\][a-zA-Z]*\[/,$r_microsat_copy); + + my $f_motif = $fields[$motifcord]; + my $r_motif = $rields[$motifcord]; + my $f_motif_copy = $f_motif; + my $r_motif_copy = $r_motif; + $f_motif_copy =~ s/^\[|\]$//g; + $r_motif_copy =~ s/^\[|\]$//g; + + my @f_motields = split(/\]\[/,$f_motif_copy); + my @r_motields = split(/\]\[/,$r_motif_copy); + + my $f_purestretch = join("",@f_microields); + my $r_purestretch = join("",@r_microields); + + if ($fields[$typecord]=~/nucleotide/ && $rields[$typecord]=~/nucleotide/){ +# print "now.. studying $forward\n$reverse\n" if $printer == 1; + if ($fields[$typecord] eq $rields[$typecord]){ +# print "comparing motifs::", all_caps($fields[$motifcord]) ," and ", all_caps(reverse_micro($rields[$motifcord])), "\n" if $printer == 1; + + if(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 1){ + my $subset_answer = isSubset($forward, $reverse, $seqLength); +# print "subset answer = $subset_answer\n" if $printer == 1; + return $forward if $subset_answer == 1; + return invert_microsat($reverse, $sequence) if $subset_answer == 2; + return $forward if $subset_answer == 0 && length($f_purestretch) >= length($r_purestretch); + return invert_microsat($reverse, $sequence) if $subset_answer == 0 && length($f_purestretch) < length($r_purestretch); + return $forward if $subset_answer == 3 && slided_microsat($forward, $reverse, $seqLength) == 0 && length($f_purestretch) >= length($r_purestretch); + return invert_microsat($reverse, $sequence) if $subset_answer == 3 && slided_microsat($forward, $reverse, $seqLength) == 0 && length($f_purestretch) < length($r_purestretch); + return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence) if $subset_answer == 3 ; + } + elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 0){ + return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); + } + elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 2){ + return $forward; + } + elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 3){ + return invert_microsat($reverse, $sequence); + } + } + else{ + my $fmotlen = (); + my $rmotlen = (); + $fmotlen =1 if $fields[$typecord] eq "mononucleotide"; + $fmotlen =2 if $fields[$typecord] eq "dinucleotide"; + $fmotlen =3 if $fields[$typecord] eq "trinucleotide"; + $fmotlen =4 if $fields[$typecord] eq "tetranucleotide"; + $rmotlen =1 if $rields[$typecord] eq "mononucleotide"; + $rmotlen =2 if $rields[$typecord] eq "dinucleotide"; + $rmotlen =3 if $rields[$typecord] eq "trinucleotide"; + $rmotlen =4 if $rields[$typecord] eq "tetranucleotide"; + + if ($fmotlen < $rmotlen){ + if (abs($fields[$startcord] - $r_start) <= $fmotlen || abs($fields[$endcord] - $r_end) <= $fmotlen ){ + return $forward; + } + else{ + return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); + } + } + if ($fmotlen > $rmotlen){ + if (abs($fields[$startcord] - $r_start) <= $rmotlen || abs($fields[$endcord] - $r_end) <= $rmotlen){ + return invert_microsat($reverse, $sequence); + } + else{ + return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); + } + } + } + } + if ($fields[$typecord] eq "compound" && $rields[$typecord] eq "compound"){ +# print "comparing compound motifs::", all_caps($fields[$motifcord]) ," and ", all_caps(reverse_micro($rields[$motifcord])), "\n" if $printer == 1; + if(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 1){ + my $subset_answer = isSubset($forward, $reverse, $seqLength); +# print "subset answer = $subset_answer\n" if $printer == 1; + return $forward if $subset_answer == 1; + return invert_microsat($reverse, $sequence) if $subset_answer == 2; +# print length($f_purestretch) ,">", length($r_purestretch)," \n" if $printer == 1; + return $forward if $subset_answer == 0 && length($f_purestretch) >= length($r_purestretch); + return invert_microsat($reverse, $sequence) if $subset_answer == 0 && length($f_purestretch) < length($r_purestretch); + if ($subset_answer == 3){ + if ($fields[$startcord] < $r_start || $fields[$endcord] > $r_end){ + if (abs($fields[$startcord] - $r_start) < length($f_motields[0]) || abs($fields[$endcord] - $r_end) < length($f_motields[$#f_motields]) ){ + return $forward; + } + else{ + return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); + } + } + if ($fields[$startcord] > $r_start || $fields[$endcord] < $r_end){ + if (abs($fields[$startcord] - $r_start) < length($r_motields[0]) || abs($fields[$endcord] - $r_end) < length($r_motields[$#r_motields]) ){ + return invert_microsat($reverse, $sequence); + } + else{ + return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); + } + } + } + } + elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 0){ + return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); + } + elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 2){ + return $forward; + } + elsif(motifBYmotif_match(all_caps($fields[$motifcord]), all_caps(reverse_micro($rields[$motifcord]))) == 3){ + return invert_microsat($reverse, $sequence); + } + + } + + if ($fields[$typecord] eq "compound" && $rields[$typecord] =~ /nucleotide/){ +# print "one compound, one nucleotide\n" if $printer == 1; + return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); + } + if ($fields[$typecord] =~ /nucleotide/ && $rields[$typecord]eq "compound"){ +# print "one compound, one nucleotide\n" if $printer == 1; + return merge_microsats($forward, invert_microsat($reverse, $sequence), $sequence); + } +} + +sub isSubset{ + my $forward = $_[0]; my @fields = split(/\t/,$forward); + my $reverse = $_[1]; my @rields = split(/\t/,$reverse); + my $seqLength = $_[2]; + my $r_start = $seqLength - $rields[$endcord] + 1; + my $r_end = $seqLength - $rields[$startcord] + 1; +# print "we have $fields[$startcord] -> $fields[$endcord] && $r_start -> $r_end\n" if $printer == 1; + return "0" if $fields[$startcord] == $r_start && $fields[$endcord] == $r_end; + return "1" if $fields[$startcord] <= $r_start && $fields[$endcord] >= $r_end; + return "2" if $r_start <= $fields[$startcord] && $r_end >= $fields[$endcord]; + return "3"; +} + +sub motifBYmotif_match{ + my $forward = $_[0]; + my $reverse = $_[1]; + $forward =~ s/^\[|\]$//g; + $reverse =~ s/^\[|\]$//g; + my @f_motields=split(/\]\[/, $forward); + my @r_motields=split(/\]\[/, $reverse); + my $finalresult = 0; + + if (scalar(@f_motields) != scalar(@r_motields)){ + my $subresult = 0; + my @mega = (); my @sub = (); + @mega = @f_motields if scalar(@f_motields) > scalar(@r_motields); + @sub = @f_motields if scalar(@f_motields) > scalar(@r_motields); + @mega = @r_motields if scalar(@f_motields) < scalar(@r_motields); + @sub = @r_motields if scalar(@f_motields) < scalar(@r_motields); + + for my $i (0 ... $#sub){ + my $probe = $sub[$i].$sub[$i]; +# print "probing $probe and $mega[$i]\n" if $printer == 1; + if ($probe =~ /$mega[$i]/) {$subresult = 1; } + else {$subresult = 0; last; } + } + + return 0 if $subresult == 0; + return 2 if $subresult == 1 && scalar(@f_motields) > scalar(@r_motields); # r is subset of f + return 3 if $subresult == 1 && scalar(@f_motields) < scalar(@r_motields); # ^reverse + + } + else{ + for my $i (0 ... $#f_motields){ + my $probe = $f_motields[$i].$f_motields[$i]; + if ($probe =~ /$r_motields[$i]/) {$finalresult = 1 ;} + else {$finalresult = 0 ;last;} + } + } +# print "finalresult = $finalresult\n" if $printer == 1; + return $finalresult; +} + +sub merge_microsats{ + my @input = @_; + my $sequence = pop(@input); + $sequence =~ s/ //g; + my @seq_string = @input; +# print "IN: merge_microsats\n"; +# print "recieved for merging: ", join("\n", @seq_string), "\nsequence = $sequence\n"; + my $start; + my $end; + my @micros = map { $_->[0] } sort custom map { [$_, split /\t/ ] } @seq_string; +# print "\nrearranged into @micros \n"; + my (@motifs, @microsats, @interruptiontypes, @interruptions, @interrposes, @no_of_interruptions, @types, @starts, @ends, @mergestart, @mergeend, @mergemicro) = (); + my @fields = (); + for my $i (0 ... $#micros){ + chomp $micros[$i]; + @fields = split(/\t/,$micros[$i]); + push(@types, $fields[$typecord]); + push(@motifs, $fields[$motifcord]); + + if (exists $fields[$interrtypecord]){ push(@interruptiontypes, $fields[$interrtypecord]);} + else { push(@interruptiontypes, "NA"); } + if (exists $fields[$interrcord]) {push(@interruptions, $fields[$interrcord]);} + else { push(@interruptions, "NA"); } + if (exists $fields[$interr_poscord]) { push(@interrposes, $fields[$interr_poscord]);} + else { push(@interrposes, "NA"); } + if (exists $fields[$no_of_interruptionscord]) {push(@no_of_interruptions, $fields[$no_of_interruptionscord]);} + else { push(@no_of_interruptions, "NA"); } + if(exists $fields[$mergestarts]) { @mergestart = (@mergestart, split(/\./,$fields[$mergestarts]));} + else { push(@mergestart, $fields[$startcord]); } + if(exists $fields[$mergeends]) { @mergeend = (@mergeend, split(/\./,$fields[$mergeends]));} + else { push(@mergeend, $fields[$endcord]); } + if(exists $fields[$mergemicros]) { push(@mergemicro, $fields[$mergemicros]);} + else { push(@mergemicro, $fields[$microsatcord]); } + + + } + $start = smallest_number(@mergestart); + $end = largest_number(@mergeend); + my $microsat_entry = "[".substr( $sequence, $start-1, ($end - $start + 1) )."]"; + my $microsat = join("\t", @fields[0 ... $infocord], join(".", @types), $start, $fields[$strandcord], $end, $microsat_entry , join(".", @motifs), join(".", @interruptiontypes),join(".", @interruptions),join(".", @interrposes),join(".", @no_of_interruptions), join(".", @mergestart), join(".", @mergeend) , join(".", @mergemicro)); + return $microsat; +} + +sub slided_microsat{ + my $forward = $_[0]; my @fields = split(/\t/,$forward); + my $reverse = $_[1]; my @rields = split(/\t/,$reverse); + my $seqLength = $_[2]; + my $r_start = $seqLength - $rields[$endcord] + 1; + my $r_end = $seqLength - $rields[$startcord] + 1; + my $motlen =(); + $motlen =1 if $fields[$typecord] eq "mononucleotide"; + $motlen =2 if $fields[$typecord] eq "dinucleotide"; + $motlen =3 if $fields[$typecord] eq "trinucleotide"; + $motlen =4 if $fields[$typecord] eq "tetranucleotide"; + + if (abs($fields[$startcord] - $r_start) < $motlen || abs($fields[$endcord] - $r_end) < $motlen ) { + return 0; + } + else{ + return 1; + } + +} + +#xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx forward_reverse_sputoutput_comparer xxxxxxxxxxxxxx + + + +#xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx +sub new_multispecies_t10{ + my $input1 = $_[0]; #gap_op_unrand_match + my $input2 = $_[1]; #sput + my $output = $_[2]; #output + my $bin = $output."_bin"; + my $orgs = join("|",split(/\./,$_[3])); + my @organisms = split(/\./,$_[3]); + my $no_of_species = scalar(@organisms); #3 + my $t10info = $output."_info"; + $prinkter = 0; + + open (MATCH, "<$input1"); + open (SPUT, "<$input2"); + open (OUT, ">$output"); + open (INFO, ">$t10info"); + + + sub microsat_bracketer; + sub custom; + my %seen = (); + $infocord = 2 + (4*$no_of_species) - 1; + $typecord = 2 + (4*$no_of_species) + 1 - 1; + $startcord = 2 + (4*$no_of_species) + 2 - 1; + $strandcord = 2 + (4*$no_of_species) + 3 - 1; + $endcord = 2 + (4*$no_of_species) + 4 - 1; + $microsatcord = 2 + (4*$no_of_species) + 5 - 1; + $motifcord = 2 + (4*$no_of_species) + 6 - 1; + $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; + #---------------------------------------------------------------------------------------------------------------# + # MAKING A HASH FROM SPUT, WITH HASH KEYS GENERATED BELOW AND SEQUENCES STORED AS VALUES # + #---------------------------------------------------------------------------------------------------------------# + my $linecounter = 0; + my $microcounter = 0; + while (my $line = ){ + chomp $line; + # print "$org\t(chr[0-9]+)\t([0-9]+)\t([0-9])+\t \n"; + next if $line !~ /[0-9a-z]+/; + $linecounter++; + # my $key = join("\t",$1 , $2, $4, $5, $6, $8, $9, $10, $12, $13); + # print $key, "#-#-#-#-#-#-#-#\n"; + if ($line =~ /([0-9]+)\s+([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { + my $key = join("\t",$1, $2, $3, $4, $5); +# print "key = $key\n" if $prinkter == 1; + push (@{$seen{$key}},$line); + $microcounter++; + } + else { + #print "could not make ker in SPUT : \n$line \n"; + } + } +# print "done hash.. linecounter = $linecounter, microcounter = $microcounter and total keys entered = ",scalar(keys %seen),"\n"; +# print INFO "done hash.. linecounter = $linecounter, microcounter = $microcounter and total keys entered = ",scalar(keys %seen),"\n"; + close SPUT; + + #---------------------------------------------------------------------------------------------------------------- + + #-------------------------------------------------------------------------------------------------------# + # THE ENTIRE CODE BELOW IS DEVOTED TO GENERATING HASH KEYS FROM MATCH FOLLOWED BY # + # USING THESE HASH KEYS TO CORRESPOND EACH SEQUENCE IN FIRST FILE TO ITS MICROSAT REPEATS IN # + # SECOND FILE FOLLOWED BY # + # FINDING THE EXACT LOCATION OF EACH MICROSAT REPEAT WITHIN EACH SEQUENCE USING THE 'index' FUNCTION # + #-------------------------------------------------------------------------------------------------------# + my $ref = 0; + my $ref2 = 0; + my $ref3 = 0; + my $ref4 = 0; + my $deletes= 0; + my $duplicates = 0; + my $neighbors = 0; + my $tooshort = 0; + my $prevmicrol=(); + my $startnotfound = 0; + my $matchkeysformed = 0; + my $keysused = 0; + + while (my $line = ) { +# print colored ['magenta'], $line if $prinkter == 1; + next if $line !~ /[a-zA-Z0-9]/; + chomp $line; + my @fields2 = split(/\t/,$line); + my $key2 = (); + # $key2 = join("\t",$1 , $2, $4, $5, $6, $8, $9, $10, $12, $13); + if ($line =~ /([0-9]+)\s+([0-9a-zA-Z]+)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)\s/ ) { + $matchkeysformed++; + $key2 = join("\t",$1, $2, $3, $4, $5); +# print "key = $key2 \n" if $prinkter == 1; + } + else{ +# print "could not make ker in SEQ : $line\n"; + next; + } + my $sequence = $fields2[$sequencepos]; + $sequence =~ s/\*/-/g; + my $count = 0; + if (exists $seen{$key2}){ + $keysused++; + my @unsorted_raw = @{$seen{$key2}}; + delete $seen{$key2}; + my @sequencearr = split(/\s*/, $sequence); + +# print "sequencearr = @sequencearr\n" if $prinkter == 1; + + my $counter; + + my %start_database = (); + my %end_database = (); + foreach my $uns (@unsorted_raw){ + my @uields = split(/\t/,$uns); + $start_database{$uields[$startcord]} = $uns; + $end_database{$uields[$endcord]} = $uns; + } + + my @unsorted = (); + my %starts = (); my %ends = (); +# print colored ['yellow'],"unsorted, unfiltered microats = \n" if $prinkter == 1; foreach (@unsorted_raw) {print colored ['blue'],$_,"\n" if $prinkter == 1;} + for my $u (0 ... $#unsorted_raw){ + my @tields = split(/\t/,$unsorted_raw[$u]); + next if exists $starts{$tields[$startcord]} && exists $ends{$tields[$endcord]}; + push(@unsorted, $unsorted_raw[$u]); + $starts{$tields[$startcord]} = $unsorted_raw[$u]; +# print "in starts : $tields[$startcord] -> $unsorted_raw[$u]\n" if $prinkter == 1; + } + + my $basecounter= 0; + my $gapcounter = 0; + my $poscounter = 0; + + for my $s (@sequencearr){ + + $poscounter++; + if ($s eq "-"){ + $gapcounter++; next; + } + else{ + $basecounter++; + } + + + #print "s = $s, poscounter = $poscounter, basecounter = $basecounter, gapcpunter = $gapcounter\n" if $prinkter == 1; + #print "s = $s, basecounter = $basecounter, gapcpunter = $gapcounter\n" if $prinkter == 1; + #print "s = $s, gapcpunter = $gapcounter\n" if $prinkter == 1; + + if (exists $starts{$basecounter}){ + my $locus = $starts{$basecounter}; +# print "locus identified = $locus\n" if $prinkter == 1; + my @fields3 = split(/\t/,$locus); + my $start = $fields3[$startcord]; + my $end = $fields3[$endcord]; + my $motif = $fields3[$motifcord]; + my $microsat = $fields3[$microsatcord]; + my @leftbracketpos = (); + my @rightbracketpos = (); + my $bracket_picker = 'no'; + my $leftbrackets=(); + my $rightbrackets = (); + my $micro_cpy = $microsat; +# print "microsat = $microsat\n" if $prinkter == 1; + while($microsat =~ m/\[/g) {push(@leftbracketpos, (pos($microsat))); $leftbrackets = join("__",@leftbracketpos);$bracket_picker='yes';} + while($microsat =~ m/\]/g) {push(@rightbracketpos, (pos($microsat))); $rightbrackets = join("__",@rightbracketpos);} + $microsat =~ s/[\[\]\-\*]//g; +# print "microsat = $microsat\n" if $prinkter == 1; + my $human_search = join '-*', split //, $microsat; + my $temp = substr($sequence, $poscounter-1); +# print "with poscounter = $poscounter\n" if $prinkter == 1; + my $search_result = (); + my $posnow = (); + +# print "for $line, temp $temp or human_search $human_search not defined\n" if !defined $temp || !defined $human_search; +# if !defined $temp || !defined $human_search; + + while ($temp =~ /($human_search)/gi){ + $search_result = $1; + # $posnow = pos($temp); + last; + } + + my @gapspos = (); + next if !defined $search_result; + + while($search_result =~ m/-/g) {push(@gapspos, (pos($search_result))); } + my $gaps = join("__",@gapspos); + + my $final_microsat = $search_result; + if ($bracket_picker eq "yes"){ + $final_microsat = microsat_bracketer($search_result, $gaps,$leftbrackets,$rightbrackets); + } + + my $outsentence = join("\t",join ("\t",@fields3[0 ... $infocord]),$fields3[$typecord],$fields3[$motifcord],$gapcounter,$poscounter,$fields3[$strandcord],$poscounter + length($search_result) -1 ,$final_microsat); + + if ($bracket_picker eq "yes") { + $outsentence = $outsentence."\t".join("\t",@fields3[($motifcord+1) ... $#fields3]); + } + print OUT $outsentence,"\n"; + } + } + } + } + my $unusedkeys = scalar(keys %seen); +# print INFO "in hash = $ref, looped = $ref4, captured = $ref3\n REMOVED: \nmicrosats with too long gaps = $deletes\n"; +# print INFO "exact duplicated removed = $duplicates \nmicrosats removed due to multiple microsats defined in +-10 bp neighboring region: $neighbors \n"; +# print INFO "microsatellites too short = $tooshort\n"; +# print INFO "keysused = $keysused...starts not found = $startnotfound ... matchkeysformed=$matchkeysformed ... unusedkeys=$unusedkeys\n"; + + #print "in hash = $ref, looped = $ref4, captured = $ref3\n REMOVED: \nmicrosats with too long gaps = $deletes\n"; + #print "exact duplicated removed = $duplicates \nmicrosats removed due to multiple microsats defined in +-10 bp neighboring region: $neighbors \n"; + #print "microsatellites too short = $tooshort\n"; + #print "keysused = $keysused...starts not found = $startnotfound ... matchkeysformed=$matchkeysformed ... unusedkeys=$unusedkeys\n"; + #print "unused keys = \n",join("\n", (keys %seen)),"\n"; + close (MATCH); + close (SPUT); + close (OUT); + close (INFO); +} + +sub microsat_bracketer{ +# print "in bracketer: @_\n"; + my ($microsat, $gapspos, $leftbracketpos, $rightbracketpos) = @_; + my @gaps = split(/__/,$gapspos); + my @lefts = split(/__/,$leftbracketpos); + my @rights = split(/__/,$rightbracketpos); + my @new=(); + my $pure = $microsat; + $pure =~ s/-//g; + my $off = 0; + my $finallength = length($microsat) + scalar(@lefts)+scalar(@rights); + push(@gaps, 0); + push(@lefts,0); + push(@rights,0); + + for my $i (1 ... $finallength){ +# print "1 current i = >$i<>, right = >$rights[0]< gap = $gaps[0] left = >$lefts[0]< and $rights[0] == $i\n"; + if($rights[0] == $i){ + # print "pushed a ]\n"; + push(@new, "]"); + shift(@rights); + push(@rights,0); + for my $j (0 ... scalar(@gaps)-1) {$gaps[$j]++;} + next; + } + if($gaps[0] == $i){ + # print "pushed a -\n"; + push(@new, "-"); + shift(@gaps); + push(@gaps, 0); + for my $j (0 ... scalar(@rights)-1) {$rights[$j]++;} + for my $j (0 ... scalar(@lefts)-1) {$lefts[$j]++;} + + next; + } + if($lefts[0] == $i){ +# print "pushed a [\n"; + push(@new, "["); + shift(@lefts); + push(@lefts,0); + for my $j (0 ... scalar(@gaps)-1) {$gaps[$j]++;} + next; + } + else{ + my $pushed = substr($pure,$off,1); + $off++; + push(@new,$pushed ); +# print "pushed an alphabet, now new = @new, pushed = $pushed\n"; + next; + } + } + my $returnmicrosat = join("",@new); +# print "final microsat = $returnmicrosat \n"; + return($returnmicrosat); +} + +#xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx new_multispecies_t10 xxxxxxxxxxxxxx + + +#xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx +sub multiSpecies_orthFinder4{ + #print "IN multiSpecies_orthFinder4: @_\n"; + my @handles = (); + #1 SEPT 30TH 2008 + #2 THIS CODE (multiSpecies_orthFinder4.pl) IS BEING MADE SO THAT IN THE REMOVAL OF MICROSATELLITES THAT ARE CLOSER TO EACH OTHER + #3 THAN 50 BP (HE 50BP RADIUS OF EXCLUSION), WE ARE LOOKING ACCROSS ALIGNMENT BLOCKS.. AND NOT JUST LOOKING WITHIN THE ALIGNMENT BLOCKS. THIS WILL + #4 POTENTIALLY REMOVE EVEN MORE MICROSATELLITES THAN BEFORE, BUT THIS WILL RESCUE THOSE MICROSATELLITES THAT WERE LOST + #5 DUE TO OUR PREVIOUS REQUIREMENT FROM VERSION 3, THAT MICROSATELLITES THAT ARE CLOSER TO THE BOUNDARY THAN 25 BP NEED TO BE REMOVED + #6 SUCH A REQUIREMENT WAS A CRUDE WAY TO IMPOSE THE ABOVE 50 BP RADIUS OF EXCLUSION ACCROSS THE ALIGNMENT BLOCKS WITHOUT ACTUALLY + #7 CHECKING COORDINATES OF THE EXCLUDED MICROSATELLITES. + #8 IN ORDER TO TAKE CARE OF THE CASES WHERE MICROSATELLITES ARE PRELIOUSLY CLOSE TO ENDS OF THE ALIGNMENT BLOCKS, WE IMPOSE HERE + #9 A NEW REQUIREMENT THAT FOR A MICROSATELLITE TO BE CONSIDERED, ALL THE SPECIES NEED TO HAVE AT LEAST 10 BP OF NON-MICROSATELLITE SEQUENCE + #10 ON EITHER SIDE OF IT.. GAPLESS. THIS INFORMATION IS STORED IN THE VARIABLE: $FLANK_SUPPORT. THIS PART, INSTEAD OF BEING INCLUDED IN + #11 THIS CODE, WILL BE INCLUDED IN A NEW CODE THAT WE WILL BE WRITING AS PART OF THE PIPELINE: multiSpecies_microsatSetSelector.pl + + #1 trial run: + #2 perl ../../../codes/multiSpecies_orthFinder4.pl /gpfs/home/ydk104/work/rhesus_microsat/axtNet/hg18.panTro2.ponAbe2.rheMac2.calJac1/chr22.hg18.panTro2.ponAbe2.rheMac2.calJac1.net.axt H.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:C.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:O.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:R.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2:M.hg18-chr22.panTro2.ponAbe2.rheMac2.calJac1_allmicrosats_symmetrical_fin_hit_all_2 orth22 hg18:panTro2:ponAbe2:rheMac2:calJac1 50 + + $prinkter=0; + + ############# + my $CLUSTER_DIST = $_[4]; + ############# + + + my $aligns = $_[0]; + my @micros = split(/:/, $_[1]); + my $orth = $_[2]; + #my $not_orth = "notorth"; + @tags = split(/:/, $_[3]); + + $no_of_species=scalar(@tags); + my $junkfile = $orth."_junk"; + #open(JUNK,">$junkfile"); + + #my $info = $output1."_info"; + #print "inputs are : \n"; foreach(@micros){print $_,"\n";} + #print "info = @_\n"; + + + open (BO, "<$aligns") or die "Cannot open alignment file: $aligns: $!"; + open (ORTH, ">$orth"); + my $output=$orth."_out"; + open (OUTP, ">$output"); + + + #open (NORTH, ">$not_orth"); + #open (INF, ">$info"); + my $i = 0; + foreach my $path (@micros){ + $handles[$i] = IO::Handle->new(); + open ($handles[$i], "<$path") or die "Can't open microsat file $path : $!"; + $i++; + } + + #print "Opened files\n"; + + + $infocord = 2 + (4*$no_of_species) - 1; + $typecord = 2 + (4*$no_of_species) + 1 - 1; + $motifcord = $typecord + 1; + $gapcord = $motifcord+1; + $startcord = $gapcord + 1; + $strandcord = $startcord + 1; + $endcord = $strandcord + 1; + $microsatcord = $endcord + 1; + $sequencepos = 2 + (4*$no_of_species) + 1 -1 ; + #$sequencepos = 17; + # GENERATING HASHES CONTAINING CHIMP AND HUMAN DATA FROM ABOVE FILES + #---------------------------------------------------------------------------------------------------------------- + my @hasharr = (); + foreach my $path (@micros){ + open(READ, "<$path") or die "Cannot open file $path :$!"; + my %single_hash = (); + my $key = (); + my $counter = 0; + while (my $line = ){ + $counter++; + # print $line; + chomp $line; + my @fields1 = split(/\t/,$line); + if ($line =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) { + $key = join("\t",$1, $2, $4, $5); + +# print "key = : $key\n" if $prinkter == 1; + +# print $line if $prinkter == 1; + push (@{$single_hash{$key}},$line); + } + else{ + # print "microsat line incompatible\n"; + } + } + push @hasharr, {%single_hash}; + # print "@{$single_hash{$key}} \n"; +# print "done $path: counter = $counter\n" if $prinkter == 1; + close READ; + } +# print "Done hashes\n"; + #---------------------------------------------------------------------------------------------------------------- + my $question=(); + #---------------------------------------------------------------------------------------------------------------- + my @contigstarts = (); + my @contigends = (); + + my %contigclusters = (); + my %contigclustersFirstStartOnly=(); + my %contigclustersLastEndOnly=(); + my %contigclustersLastEndLengthOnly=(); + my %contigclustersFirstStartLengthOnly=(); + my %contigpath=(); + my $dotcounter = 0; + while (my $line = ){ +# print "x" x 60, "\n" if $prinkter == 1; + $dotcounter++; +# print "." if $dotcounter % 100 ==0; +# print "\n" if $dotcounter % 5000 ==0; + next if $line !~ /^[0-9]+/; +# print $line if $prinkter == 1; + chomp $line; + my @fields2 = split(/\t/,$line); + my $key2 = (); + if ($line =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z]+)\s([0-9]+)\s([0-9]+)/ ) { + $key2 = join("\t",$1, $2, $4, $5); + } + else { +# print "seq line $line incompatible\n" if $prinkter == 1; + next;} + + + + + + + my @sequences = (); + for (0 ... $#tags){ + my $seq = ; + # print $seq; + chomp $seq; + push(@sequences , " ".$seq); + } + my @origsequences = @sequences; + my $seqcopy = $sequences[0]; + my @strings = (); + $seqcopy =~ s/[a-zA-Z]|-/x/g; + my @string = split(/\s*/,$seqcopy); + + for my $s (0 ... $#tags){ + $sequences[$s] =~ s/-//g; + $sequences[$s] =~ s/[a-zA-Z]/x/g; + # print "length of sequence = ",length($sequences[$s]),"\n"; + my @tempstring = split(/\s*/,$sequences[$s]); + push(@strings, [@tempstring]) + + } + + my @species_list = (); + my @micro_count = 0; + my @starthash = (); + my $stopper = 1; + my @endhash = (); + + my @currentcontigstarts=(); + my @currentcontigends=(); + my @currentcontigchrs=(); + + for my $i (0 ... $#tags){ +# print "searching for : if exists hasharr: $i : $tags[$i] : $key2 \n" if $prinkter == 1; + my @temparr = (); + + if (exists $hasharr[$i]{$key2}){ + @temparr = @{$hasharr[$i]{$key2}}; + + $line =~ /$tags[$i]\s([a-zA-Z0-9_]+)\s([0-9]+)\s([0-9]+)/; +## print "in line $line, trying to hunt for: $tags[$i]\\s([a-zA-Z0-9_])+\\s([0-9]+)\\s([0-9]+) \n" if $prinkter == 1; +# print "org = $tags[$i], and chr = $1, start = $2, end =$3 \n" if $prinkter == 1; + my $startkey = $1."_SK0SK_".$2; #print "adding start key for this alignmebt block: $startkey to species $tags[$i]\n" if $prinkter == 1; + my $endkey = $1."_EK0EK_".$3; #print "adding end key for this alignmebt block: $endkey to species $tags[$i]\n" if $prinkter == 1; + $contigstarts[$i]{$startkey}= $key2; + $contigends[$i]{$endkey}= $key2; +# print "confirming existance: \n" if $prinkter == 1; +# print "present \n" if exists $contigends[$i]{$endkey} && $prinkter == 1; +# print "absent \n" if !exists $contigends[$i]{$endkey} && $prinkter == 1; + $currentcontigchrs[$i]=$1; + $currentcontigstarts[$i]=$2; + $currentcontigends[$i]=$3; + + } # print "exists: @{$hasharr[$i]{$key2}}[0]\n"} + else { + push (@starthash, {0 => "0"}); + push (@endhash, {0 => "0"}); + $currentcontigchrs[$i] = 0; + next; + } + $stopper = 0; + # print "exists: @temparr\n" if $prinkter == 1; + push(@micro_count, scalar(@temparr)); + push(@species_list, [@temparr]); + my @tempstart = (); my @tempend = (); + my %localends = (); + my %localhash = (); + # print "---------------------------\n"; + + foreach my $templine (@temparr){ +# print "templine = $templine\n" if $prinkter == 1; + my @tields = split(/\t/,$templine); + my $start = $tields[$startcord]; # - $tields[$gapcord]; + my $end = $tields[$endcord]; #- $tields[$gapcord]; + my $realstart = $tields[$startcord]- $tields[$gapcord]; + my $gapsinmicrosat = ($tields[$microsatcord] =~ s/-/-/g); + $gapsinmicrosat = 0 if $gapsinmicrosat !~ /[0-9]+/; + # print "infocord = $infocord typecord = $typecord motifcord = $motifcord gapcord = $gapcord startcord = $startcord strandcord = $strandcord endcord = $endcord microsatcord = $microsatcord sequencepos = $sequencepos\n"; + my $realend = $tields[$endcord]- $tields[$gapcord]- $gapsinmicrosat; + # print "real start = $realstart, realend = $realend \n"; + for my $pos ($realstart ... $realend){ $strings[$i][$pos] = $strings[$i][$pos].",".$i.":".$start."-".$end;} + push(@tempstart, $start); + push(@tempend, $end); + $localhash{$start."-".$end} = $templine; + } + push @starthash, {%localhash}; + my $foundclusters =findClusters(join("!",@{$strings[$i]}), $CLUSTER_DIST); + +# print "foundclusters = $foundclusters\n"; + + my @clusters = split(/_/,$foundclusters); + + my $clustno = 0; + + foreach my $cluster (@clusters) { + my @constituenst = split(/,/,$cluster); +# print "clusters returned: @constituenst\n" if $prinkter == 1; + } + + @string = split("_S0S_",stringPainter(join("_C0C_",@string),$foundclusters)); + + + } + next if $stopper == 1; + +# print colored ['blue'],"FINAL:\n" if $prinkter == 1; + my $finalclusters =findClusters(join("!",@string), 1); +# print "finalclusters = $finalclusters\n"; +# print colored ['blue'],"----------------------\n" if $prinkter == 1; + my @clusters = split(/,/,$finalclusters); +# print "@string\n" if $prinkter == 1; +# print "@clusters\n" if $prinkter == 1; +# print "------------------------------------------------------------------\n" if $prinkter == 1; + + my $clustno = 0; + + # foreach my $cluster (@clusters) { + # my @constituenst = split(/,/,$cluster); + # print "clusters returned: @constituenst\n"; + # } + + next if (scalar @clusters == 0); + + my @contigcluster=(); + my $clusterno=0; + my @contigClusterstarts=(); + my @contigClusterends = (); + + foreach my $clust (@clusters){ + # print "cluster: $clust\n"; + $clusterno++; + my @localclust = split(/\./, $clust); + my @result = (); + my @starts = (); + my @ends = (); + + for my $i (0 ... $#localclust){ + # print "localclust[$i]: $localclust[$i]\n"; + my @pattern = split(/:/, $localclust[$i]); + my @cords = split(/-/, $pattern[1]); + push (@starts, $cords[0]); + push (@ends, $cords[1]); + } + + my $extremestart = smallest_number(@starts); + my $extremeend = largest_number(@ends); + push(@contigClusterstarts, $extremestart); + push(@contigClusterends, $extremeend); +# print "cluster starts from $extremestart and ends at $extremeend \n" if $prinkter == 1 ; + + foreach my $clustparts (@localclust){ + my @pattern = split(/:/, $clustparts); + # print "printing from pattern: $pattern[1]: $starthash[$pattern[0]]{$pattern[1]}\n"; + push (@result, $starthash[$pattern[0]]{$pattern[1]}); + } + push(@contigcluster, join("\t", @result)); +# print join("\t", @result),"<-result \n" if $prinkter == 1 ; + } + + + my $firstclusterstart = smallest_number(@contigClusterstarts); + my $lastclusterend = largest_number(@contigClusterends); + + + $contigclustersFirstStartOnly{$key2}=$firstclusterstart; + $contigclustersLastEndOnly{$key2} = $lastclusterend; + $contigclusters{$key2}=[ @contigcluster ]; +# print "currentcontigchr are @currentcontigchrs , firstclusterstart = $firstclusterstart, lastclusterend = $lastclusterend\n " if $prinkter == 1; + for my $i (0 ... $#tags){ + #1 check if there exists adjacent alignment block wrt coordinates of this species. + next if $currentcontigchrs[$i] eq "0"; #1 this means that there are no microsats in this species in this alignment block.. + #2 no need to worry about proximity of anything in adjacent block! + + #1 BELOW, the following is really to calclate the distance between the end coordinate of the + #2 cluster and the end of the gap-free sequence of each species. this is so that if an + #3 adjacent alignment block is found lateron, the exact distance between the potentially + #4 adjacent microsat clusters can be found here. the exact start coordinate will be used + #5 immediately below. + # print "full sequence = $origsequences[$i] and its length = ",length($origsequences[$i])," \n" if $prinkter == 1; + + my $species_startsubstring = substr($origsequences[$i], 0, $firstclusterstart); + my $species_endsubstring = (); + + if (length ($origsequences[$i]) <= $lastclusterend+1){ $species_endsubstring = "";} + else{ $species_endsubstring = substr($origsequences[$i], $lastclusterend+1);} + +# print "\nnot defined species_endsubstring...\n" if !defined $species_endsubstring && $prinkter == 1; +# print "for species: $tags[$i]: \n" if $prinkter == 1; + + $species_startsubstring =~ s/-| //g; + $species_endsubstring =~ s/-| //g; + $contigclustersLastEndLengthOnly{$key2}[$i]=length($species_endsubstring); + $contigclustersFirstStartLengthOnly{$key2}[$i]=length($species_startsubstring); + + + +# print "species_startsubstring = $species_startsubstring, and its length =",length($species_startsubstring)," \n" if $prinkter == 1; +# print "species_endsubstring = $species_endsubstring, and its length =",length($species_endsubstring)," \n" if $prinkter == 1; +# print "attaching to contigclustersLastEndOnly: $key2: $i\n" if $prinkter == 1; + +# print "just confirming: $contigclustersLastEndLengthOnly{$key2}[$i] \n" if $prinkter == 1; + + } + + + } +# print "\ndone the job of filling... \n"; + #/////////////////////////////////////////////////////////////////////////////////////// + #/////////////////////////////////////////////////////////////////////////////////////// + #/////////////////////////////////////////////////////////////////////////////////////// + #/////////////////////////////////////////////////////////////////////////////////////// + $prinkter=0; + open (BO, "<$aligns") or die "Cannot open alignment file: $aligns: $!"; + + my %clusteringpaths=(); + my %clustersholder=(); + my %foundkeys=(); + my %clusteringpathsRev=(); + + + my $totalcount=(); + my $founkeys_enteredcount=(); + my $transfered=0; + my $complete_transfered=0; + my $plain_transfered=0; + my $existing_removed=0; + + while (my $line = ){ +# print "x" x 60, "\n" if $prinkter == 1; + next if $line !~ /^[0-9]+/; + #print $line; + chomp $line; + my @fields2 = split(/\t/,$line); + my $key2 = (); + if ($line =~ /([0-9]+)\s+($focalspec)\s(chr[0-9a-zA-Z_]+)\s([0-9]+)\s([0-9]+)/ ) { + $key2 = join("\t",$1, $2, $4, $5); + } + + else { + # print "seq line $line incompatible\n"; + next; + } +# print "KEY = : $key2\n" if $prinkter == 1; + + + my @currentcontigstarts=(); + my @currentcontigends=(); + my @currentcontigchrs=(); + my @clusters = (); + my @clusterscopy=(); + if (exists $contigclusters{$key2}){ + @clusters = @{$contigclusters{$key2}}; + @clusterscopy=@clusters; + for my $i (0 ... $#tags){ + # print "in line $line, trying to hunt for: $tags[$i]\\s([a-zA-Z0-9])+\\s([0-9]+)\\s([0-9]+) \n" if $prinkter == 1; + if ($line =~ /$tags[$i]\s([a-zA-Z0-9_]+)\s([0-9]+)\s([0-9]+)/){ + # print "org = $tags[$i], and chr = $1, start = $2, end =$3 \n" if $prinkter == 1; + my $startkey = $1."_S0E_".$2; #print "adding start key for this alignmebt block: $startkey to species $tags[$i]\n" if $prinkter == 1; + my $endkey = $1."_S0E_".$3; #print "adding end key for this alignmebt block: $endkey to species $tags[$i]\n" if $prinkter == 1; + $currentcontigchrs[$i]=$1; + $currentcontigstarts[$i]=$2; + $currentcontigends[$i]=$3; + } + else { + $currentcontigchrs[$i] = 0; + # print "no microsat clusters for $key2\n" if $prinkter == 1; next; + } + } + } # print "exists: @{$hasharr[$i]{$key2}}[0]\n"} + + my @sequences = (); + for (0 ... $#tags){ + my $seq = ; + # print $seq; + chomp $seq; + push(@sequences , " ".$seq); + } + + next if scalar @currentcontigchrs == 0; + + # print "contigchrs= @currentcontigchrs \n" if $prinkter == 1; + my %visitedcontigs=(); + + for my $i (0 ... $#tags){ + #1 check if there exists adjacent alignment block wrt coordinates of this species. + next if $currentcontigchrs[$i] eq "0"; #1 this means that there are no microsats in this species in this alignment block.. + #2 no need to worry about proximity of anything in adjacent block! + @clusters=@clusterscopy; + #1 BELOW, the following is really to calclate the distance between the end coordinate of the + #2 cluster and the end of the gap-free sequence of each species. this is so that if an + #3 adjacent alignment block is found lateron, the exact distance between the potentially + #4 adjacent microsat clusters can be found here. the exact start coordinate will be used + #5 immediately below. + my $firstclusterstart = $contigclustersFirstStartOnly{$key2}; + my $lastclusterend = $contigclustersLastEndOnly{$key2}; + + my $key3 = $currentcontigchrs[$i]."_S0E_".($currentcontigstarts[$i]); +# print "check if exists $key3 in contigends for $i\n" if $prinkter == 1; + + if (exists($contigends[$i]{$key3}) && !exists $visitedcontigs{$contigends[$i]{$key3}}){ + $visitedcontigs{$contigends[$i]{$key3}} = $contigends[$i]{$key3}; #1 this array keeps track of adjacent contigs that we have already visited, thus saving computational time and potential redundancies# + # print "just checking the hash visitedcontigs: ",$visitedcontigs{$contigends[$i]{$key3}} ,"\n" if $prinkter == 1; + + #1 extract coordinates of the last cluster of this found alignment block +# print "key of the found alignment block = ", $contigends[$i]{$key3},"\n" if $prinkter == 1; + # print "we are trying to mine: contigclustersAllLastEndLengthOnly_raw: $contigends[$i]{$key3}: $i \n" if $prinkter == 1; + # print "EXISTS\n" if exists $contigclusters{$contigends[$i]{$key3}} && $prinkter == 1; + # print "does NOT EXIST\n" if !exists $contigclusters{$contigends[$i]{$key3}} && $prinkter == 1; + my @contigclustersAllFirstStartLengthOnly_raw=@{$contigclustersFirstStartLengthOnly{$key2}}; + my @contigclustersAllLastEndLengthOnly_raw=@{$contigclustersLastEndLengthOnly{$contigends[$i]{$key3}}}; + my @contigclustersAllFirstStartLengthOnly=(); my @contigclustersAllLastEndLengthOnly=(); + + for my $val (0 ... $#contigclustersAllFirstStartLengthOnly_raw){ + # print "val = $val\n" if $prinkter == 1; + if (defined $contigclustersAllFirstStartLengthOnly_raw[$val]){ + push(@contigclustersAllFirstStartLengthOnly, $contigclustersAllFirstStartLengthOnly_raw[$val]) if $contigclustersAllFirstStartLengthOnly_raw[$val] =~ /[0-9]+/; + } + } + # print "-----\n" if $prinkter == 1; + for my $val (0 ... $#contigclustersAllLastEndLengthOnly_raw){ + # print "val = $val\n" if $prinkter == 1; + if (defined $contigclustersAllLastEndLengthOnly_raw[$val]){ + push(@contigclustersAllLastEndLengthOnly, $contigclustersAllLastEndLengthOnly_raw[$val]) if $contigclustersAllLastEndLengthOnly_raw[$val] =~ /[0-9]+/; + } + } + + + # print "our two arrays are: starts = <@contigclustersAllFirstStartLengthOnly> ......... and ends = <@contigclustersAllLastEndLengthOnly>\n" if $prinkter == 1; + # print "the last cluster's end in that one is: ",smallest_number(@contigclustersAllFirstStartLengthOnly) + smallest_number(@contigclustersAllLastEndLengthOnly)," = ", smallest_number(@contigclustersAllFirstStartLengthOnly)," + ",smallest_number(@contigclustersAllLastEndLengthOnly),"\n" if $prinkter == 1; + + # if ($contigclustersFirstStartLengthOnly{$key2}[$i] + $contigclustersLastEndLengthOnly{$contigends[$i]{$key3}}[$i] < 50){ + if (smallest_number(@contigclustersAllFirstStartLengthOnly) + smallest_number(@contigclustersAllLastEndLengthOnly) < $CLUSTER_DIST){ + my @regurgitate = @{$contigclusters{$contigends[$i]{$key3}}}; + $regurgitate[$#regurgitate]=~s/\n//g; + $regurgitate[$#regurgitate] = $regurgitate[$#regurgitate]."\t".shift(@clusters); + delete $contigclusters{$contigends[$i]{$key3}}; + $contigclusters{$contigends[$i]{$key3}}=[ @regurgitate ]; + delete $contigclusters{$key2}; + $contigclusters{$key2}= [ @clusters ] if scalar(@clusters) >0; + $contigclusters{$key2}= [ "" ] if scalar(@clusters) ==0; + + if (scalar(@clusters) < 1){ + # print "$key2-> $clusteringpaths{$key2} in the loners\n" if exists $foundkeys{$key2}; + $clusteringpaths{$key2}=$contigends[$i]{$key3}; + $clusteringpathsRev{$contigends[$i]{$key3}}=$key2; + print OUTP "$contigends[$i]{$key3} -> $clusteringpathsRev{$contigends[$i]{$key3}}\n"; + # print " clusteringpaths $key2 -> $contigends[$i]{$key3}\n"; + $founkeys_enteredcount-- if exists $foundkeys{$key2}; + $existing_removed++ if exists $foundkeys{$key2}; +# print "$key2->",@{$contigclusters{$key2}},"->>$foundkeys{$key2}\n" if exists $foundkeys{$key2} && $prinkter == 1; + delete $foundkeys{$key2} if exists $foundkeys{$key2}; + $complete_transfered++; + } + else{ + print OUTP "$key2-> 0 not so lonely\n" if !exists $clusteringpathsRev{$key2}; + $clusteringpaths{$key2}=$key2 if !exists $clusteringpaths{$key2}; + $clusteringpathsRev{$key2}=0 if !exists $clusteringpathsRev{$key2}; + + $founkeys_enteredcount++ if !exists $foundkeys{$key2}; + $foundkeys{$key2} = $key2 if !exists $foundkeys{$key2}; + # print "adding foundkeys entry $foundkeys{$key2}\n"; + $transfered++; + } + #$contigclusters{$key2}=[ @contigcluster ]; + } + } + else{ + # print "adjacent block with species $tags[$i] does not exist\n" if $prinkter == 1; + $plain_transfered++; + print OUTP "$key2-> 0 , going straight\n" if exists $contigclusters{$key2} && !exists $clusteringpathsRev{$key2}; + $clusteringpaths{$key2}=$key2 if exists $contigclusters{$key2} && !exists $clusteringpaths{$key2}; + $clusteringpathsRev{$key2}=0 if exists $contigclusters{$key2} && !exists $clusteringpathsRev{$key2}; + $founkeys_enteredcount++ if !exists $foundkeys{$key2} && exists $contigclusters{$key2}; + $foundkeys{$key2} = $key2 if !exists $foundkeys{$key2} && exists $contigclusters{$key2}; + # print "adding foundkeys entry $foundkeys{$key2}\n"; + + } + $totalcount++; + + } + + + } + close BO; + #close (NORTH); + #/////////////////////////////////////////////////////////////////////////////////////// + #/////////////////////////////////////////////////////////////////////////////////////// + #/////////////////////////////////////////////////////////////////////////////////////// + #/////////////////////////////////////////////////////////////////////////////////////// + + my $founkeys_count=(); + my $nopath_count=(); + my $pathed_count=0; + foreach my $key2 (keys %foundkeys){ + #print "x" x 60, "\n"; +# print "x" if $dotcounter % 100 ==0; +# print "\n" if $dotcounter % 5000 ==0; + $founkeys_count++; + my $key = $key2; +# print "$key2 -> $clusteringpaths{$key2}\n" if $prinkter == 1; + if ($clusteringpaths{$key} eq $key){ +# print "printing hit the alignment block immediately... no path needed\n" if $prinkter == 1; + $nopath_count++; + delete $foundkeys{$key2}; + print ORTH join ("\n",@{$contigclusters{$key2}}),"\n"; + } + else{ + my @pool=(); + my $key3=(); + $pathed_count++; +# print "going reverse... clusteringpathsRev, $key = $clusteringpathsRev{$key}\n" if exists $clusteringpathsRev{$key} && $prinkter == 1; +# print "going reverse... clusteringpathsRev $key does not exist\n" if !exists $clusteringpathsRev{$key} && $prinkter == 1; + if ($clusteringpathsRev{$key} eq "0") { + next; + } + else{ + my $yek3 = $clusteringpathsRev{$key}; + my $yek = $key; +# print "caught in the middle of a path, now goin down from $yek to $yek3, which is $clusteringpathsRev{$key} \n" if $prinkter == 1; + while ($yek3 ne "0"){ +# print "$yek->$yek3," if $prinkter == 1; + $yek = $yek3; + $yek3 = $clusteringpathsRev{$yek}; + } +# print "\nfinally reached the end of path: $yek3, and the next in line is $yek, and its up-route is $clusteringpaths{$yek}\n" if $prinkter == 1; + $key3 = $clusteringpaths{$yek}; + $key = $yek; + } + +# print "now that we are at bottom of the path, lets start climbing up again\n" if $prinkter == 1; + + while($key ne $key3){ +# print "KEEY $key->$key3\n" if $prinkter == 1; +# print "our contigcluster = @{$contigclusters{$key}}\n----------\n" if $prinkter == 1; + + if (scalar(@{$contigclusters{$key}}) > 0) {push @pool, @{$contigclusters{$key}}; + # print "now pool = @pool\n" if $prinkter == 1; + } + delete $foundkeys{$key3}; + $key=$key3; + $key3=$clusteringpaths{$key}; + } +# print "\nfinally, adding the first element of path: @{$contigclusters{$key}}\n AND printing the contents:\n" if $prinkter == 1; + my @firstcontig= @{$contigclusters{$key}}; + delete $foundkeys{$key2} if exists $foundkeys{$key2} ; + delete $foundkeys{$key} if exists $foundkeys{$key}; + + unshift @pool, pop @firstcontig; +# print join("\t",@pool),"\n" if $prinkter == 1; + print ORTH join ("\n",@firstcontig),"\n" if scalar(@firstcontig) > 0; + print ORTH join ("\t",@pool),"\n"; + # join(); + } + + } + #close (NORTH); +# print "founkeys_entered =$founkeys_enteredcount, plain_transfered=$plain_transfered,existing_removed=$existing_removed,founkeys_count =$founkeys_count, nopath_count =$nopath_count, transfered = $transfered, complete_transfered = $complete_transfered, totalcount = $totalcount, pathed=$pathed_count\n" if $prinkter == 1; + close (BO); + close (ORTH); + close (OUTP); + return 1; + +} +sub stringPainter{ + my @string = split(/_C0C_/,$_[0]); +# print $_[0], " <- in stringPainter\n"; +# print $_[1], " <- in clusters\n"; + + my @clusters = split(/,/, $_[1]); + for my $i (0 ... $#clusters){ + my $cluster = $clusters[$i]; +# print "cluster = $cluster\n"; + my @parts = split(/\./,$cluster); + my @cord = split(/:|-/,shift(@parts)); + my $minstart = $cord[1]; + my $maxend = $cord[2]; +# print "minstart = $minstart , maxend = $maxend\n"; + + for my $j (0 ... $#parts){ +# print "oing thri $parts[$j]\n"; + my @cord = split(/:|-/,$parts[$j]); + $minstart = $cord[1] if $cord[1] < $minstart; + $maxend = $cord[2] if $cord[2] > $maxend; + } +# print "minstart = $minstart , maxend = $maxend\n"; + for my $pos ($minstart ... $maxend){ $string[$pos] = $string[$pos].",".$cluster;} + + + } +# print "@string <-done from function stringPainter\n"; + return join("_S0S_",@string); +} + +sub findClusters{ + my $continue = 0; + my @mapped_clusters = (); + my $clusterdist = $_[1]; + my $previous = 'x'; + my @localcluster = (); + my $cluster_starts = (); + my $cluster_ends = (); + my $localcluster_start = (); + my $localcluster_end = (); + my @record_cluster = (); + my @string = split(/\!/, $_[0]); + my $zerolength=0; + + for my $pos_pos (1 ... $#string){ + my $pos = $string[$pos_pos]; +# print $pos, "\n"; + if ($continue == 0 && $pos eq "x") {next;} + + if ($continue == 1 && $pos eq "x" && $zerolength <= $clusterdist){ + if ($zerolength == 0) {$localcluster_end = $pos_pos-1}; + $zerolength++; + $continue = 1; + } + + if ($continue == 1 && $pos eq "x" && $zerolength > $clusterdist) { + $zerolength = 0; + $continue = 0; + my %seen; + my @uniqed = grep !$seen{$_}++, @localcluster; +# print "caught cluster : @uniqed \n"; + push(@mapped_clusters, [@uniqed]); +# print "clustered:\n@uniqed\n"; + @localcluster = (); + @record_cluster = (); + + } + + if ($pos ne "x"){ + $zerolength = 0; + $continue = 1; + $pos =~ s/x,//g; + my @entries = split(/,/,$pos); + $localcluster_end = 0; + $localcluster_start = 0; + push(@record_cluster,$pos); + + if ($continue == 0){ + @localcluster = (); + @localcluster = (@localcluster, @entries); + $localcluster_start = $pos_pos; + } + + if ($continue == 1 ) { + @localcluster = (@localcluster, @entries); + } + } + } + + if (scalar(@localcluster) > 0){ + my %seen; + my @uniqed = grep !$seen{$_}++, @localcluster; + # print "caught cluster : @uniqed \n"; + push(@mapped_clusters, [@uniqed]); + # print "clustered:\n@uniqed\n"; + @localcluster = (); + @record_cluster = (); + } + + my @returner = (); + + foreach my $clust (@mapped_clusters){ + my @localclust = @$clust; + my @result = (); + foreach my $clustparts (@localclust){ + push(@result,$clustparts); + } + push(@returner , join(".",@result)); + } +# print "returnig: ", join(",",@returner), "\n"; + return join(",",@returner); +} +#xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx multiSpecies_orthFinder4 xxxxxxxxxxxxxx + +#xxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx MakeTrees xxxxxxxxxxxxxxxxxxxxxxxxxxxx + +sub MakeTrees{ + my $tree = $_[0]; + my @parts=($tree); +# my @parts=(); + + while (1){ + $tree =~ s/^\(//g; + $tree =~ s/\)$//g; + my @arr = (); + + if ($tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)\)$/){ + @arr = $tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_\(\),]+)$/; + $tree = $2; + push @parts, $tree; + } + elsif ($tree =~ /^\(([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/){ + @arr = $tree =~ /^([a-zA-Z0-9_\(\),]+),([a-zA-Z0-9_]+)$/; + $tree = $1; + push @parts, $tree; + } + elsif ($tree =~ /^([a-zA-Z0-9_]+),([a-zA-Z0-9_]+)$/){ + last; + } + } + return @parts; +} + +#xxxxxxxxxxxxxx qualityFilter xxxxxxxxxxxxxxxxxxxxxxxxxxxx qualityFilter xxxxxxxxxxxxxxxxxxxxxxxxxxxx qualityFilter xxxxxxxxxxxxxxxxxxxxxxxxxxxx + +sub qualityFilter{ + my $unmaskedorthfile = $_[0]; + my $seqfile = $_[1]; + my $maskedorthfile = $_[2]; + my $filteredout = $maskedorthfile."_residue"; + open (PMORTH, "<$unmaskedorthfile") or die "Cannot open unmaskedorthfile file: $unmaskedorthfile: $!"; + + my %keyhash = (); + + while (my $line = ){ + my $key = join("\t", $1,$2,$3,$4) if $line =~ /($focalspec)\s+([a-zA-Z0-9\-_]+)\s+([0-9]+)\s+([0-9]+)/; + push @{$keyhash{$key}}, $line; + } + + open (SEQ, "<$seqfile") or die "Cannot open seqfile file: $seqfile: $!"; + open (MORTH, ">$maskedorthfile") or die "Cannot open maskedorthfile file: $maskedorthfile: $!"; + open (RES, ">$filteredout") or die "Cannot open filteredout file: $filteredout: $!"; + + + + while (my $line = ){ + chomp $line; + if ($line =~ /($focalspec)\s+([a-zA-Z0-9\-_]+)\s+([0-9]+)\s+([0-9]+)/){ + my $key = join("\t", $1,$2,$3,$4); + next if !exists $keyhash{$key}; + my @orths = @{$keyhash{$key}} if exists $keyhash{$key}; + delete $keyhash{$key}; + + my $sine = ; + + foreach my $orth (@orths){ + #print "-----------------------------------------------------------------\n"; + #print $orth; + my $orthcopy = $orth; + $orth =~ s/^>//; + my @parts = split(/>/,$orth); + + my @starts = (); + my @ends = (); + + foreach my $part (@parts){ + my $no_of_species = adjustCoordinates($part); + my @pields = split(/\t/,$part); + + # print "pields = @pields .. no_of_species = $no_of_species .. startcord = $pields[$startcord]\n"; + + push @starts, $pields[$startcord]; + push @ends, $pields[$endcord]; + } + + #print "starts = @starts ... ends = @ends\n"; + + my $leftend = smallest_number(@starts)-10; + my $rightend = largest_number(@ends)+10; + + my $maskarea = substr($sine, $leftend, $rightend-$leftend+1); + print RES $orth if $maskarea =~ /#/; + + + next if $maskarea =~ /#/; + + print MORTH $orthcopy; + } + } + else{ + next; + } + + + } + +# print "UNDONE: ", scalar(keys %keyhash),"\n"; +# print MORTH "UNDONE: ", scalar(keys %keyhash),"\n"; + +} + +sub adjustCoordinates{ + my $line = $_[0]; + my $no_of_species = $line =~ s/(chr[0-9a-zA-Z]+)|(Contig[0-9a-zA-Z\._\-]+)|(scaffold[0-9a-zA-Z\._\-]+)|(supercontig[0-9a-zA-Z\._\-]+)/x/ig; + my @got = ($line =~ s/(chr[0-9a-zA-Z]+)|(Contig[0-9a-zA-Z\._\-]+)/x/g); +# print "line = $line\n"; + $infocord = 2 + (4*$no_of_species) - 1; + $typecord = 2 + (4*$no_of_species) + 1 - 1; + $motifcord = 2 + (4*$no_of_species) + 2 - 1; + $gapcord = $motifcord+1; + $startcord = $gapcord+1; + $strandcord = $startcord+1; + $endcord = $strandcord + 1; + $microsatcord = $endcord + 1; + $sequencepos = 2 + (5*$no_of_species) + 1 -1 ; + $interr_poscord = $microsatcord + 3; + $no_of_interruptionscord = $microsatcord + 4; + $interrcord = $microsatcord + 2; +# print "$line\n startcord = $startcord, and endcord = $endcord and no_of_species = $no_of_species\n" if $line !~ /calJac/i; + return $no_of_species; +} + + + + diff -r 000000000000 -r d1bf24c1350f multispecies_MicrosatDataGenerator_interrupted_GALAXY.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/multispecies_MicrosatDataGenerator_interrupted_GALAXY.xml Tue Aug 20 11:02:09 2013 -0400 @@ -0,0 +1,93 @@ + + for multiple (>2) species alignments + + multispecies_MicrosatDataGenerator_interrupted_GALAXY.pl + $input1 + $input2 + $out_file1 + $thresholds + $species + "$treedefinition" + $separation + + + + + + + + + + + + + + + + + + + + + bx-sputnik + + + + + + + + + + + + + + + +.. class:: infomark + +**What it does** + +This tool finds ortholgous microsatellite blocks between aligned species + +----- + +.. class:: warningmark + +**Note** + +A non-tabular format is created in which each row contains all information pertaining to a microsatellite locus from multiple species in the alignment. +The rows read like this: + +>hg18 15 hg18 chr22 16092941 16093413 panTro2 chr22 16103944 16104421 ponAbe2 chr22 13797750 13798215 rheMac2 chr10 61890946 61891409 calJac1 Contig6986 140254 140728 mononucleotide A 0 13 + 29 aaaaa------aaaAAA >rheMac2 15 hg18 chr22 16092941 16093413 panTro2 chr22 16103944 16104421 ponAbe2 chr22 13797750 13798215 rheMac2 chr10 61890946 61891409 calJac1 Contig6986 140254 140728 mononucleotide A 0 13 + 29 aaaaaaaa---AAAAAA + +Information from each species starts with an ">" followed by the species name, for instance, ">rheMac2". Below we describe all information listed for a microsatellite sequence in each species. + +After the species tag the alignemnt number is listed. +What follows is details of the alignment block from all the species, including the chromosome number, start and end coordinates in each species. For instance: + +hg18 chr22 16092941 16093413 panTro2 chr22 16103944 16104421 ponAbe2 chr22 13797750 13798215 rheMac2 chr10 61890946 61891409 calJac1 Contig6986 140254 140728 + +suggests that the alignment block as five species: hg18, panTro2, ponAbe2, rheMac2 and calJac1. + +Then the type of microsatellite is written, for instance, "mononucleotide". + +Then the microsatellite motif. + +Then the number of gaps in the alignment, in the respective species (as noted above, rheMac2 in this case). + +Then the start coordinate, the strand, and the end coordinate WITHIN the alignment block. + +At the end is listed the microsatellite sequence. + +If the microsatellite contains interruptions (which are not important for this tool), then the interruptions' information will be written out after the microsatellite sequence. + + + + + + diff -r 000000000000 -r d1bf24c1350f test-data/regVariation/microsatellite/Galaxy17_masked_short.maf.gz Binary file test-data/regVariation/microsatellite/Galaxy17_masked_short.maf.gz has changed diff -r 000000000000 -r d1bf24c1350f test-data/regVariation/microsatellite/Galaxy17_short_raw.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/regVariation/microsatellite/Galaxy17_short_raw.txt Tue Aug 20 11:02:09 2013 -0400 @@ -0,0 +1,5 @@ +>rheMac2 1 hg18 chr22 16115000 16115212 panTro2 chr22 16131432 16131644 ponAbe2 chr22 13834301 13834513 rheMac2 chr10 61914657 61914870 calJac1 Contig13114 14954 15124 mononucleotide A 1 198 + 206 aaaaaaaaa +>calJac1 4 hg18 chr22 16118223 16119141 panTro2 chr22 16134653 16135572 ponAbe2 chr22 13837634 13838553 rheMac2 chr10 61918520 61919424 calJac1 Contig13114 15842 16747 tetranucleotide AAAC 1 182 + 193 AAACAAACAAAC +>hg18 4 hg18 chr22 16118223 16119141 panTro2 chr22 16134653 16135572 ponAbe2 chr22 13837634 13838553 rheMac2 chr10 61918520 61919424 calJac1 Contig13114 15842 16747 mononucleotide T 7 568 + 577 TTTTTTTTTT >panTro2 4 hg18 chr22 16118223 16119141 panTro2 chr22 16134653 16135572 ponAbe2 chr22 13837634 13838553 rheMac2 chr10 61918520 61919424 calJac1 Contig13114 15842 16747 mononucleotide T 6 568 + 577 TTTTTTTTTT >ponAbe2 4 hg18 chr22 16118223 16119141 panTro2 chr22 16134653 16135572 ponAbe2 chr22 13837634 13838553 rheMac2 chr10 61918520 61919424 calJac1 Contig13114 15842 16747 mononucleotide T 5 568 + 576 TTTTTTTTT >rheMac2 4 hg18 chr22 16118223 16119141 panTro2 chr22 16134653 16135572 ponAbe2 chr22 13837634 13838553 rheMac2 chr10 61918520 61919424 calJac1 Contig13114 15842 16747 mononucleotide T 22 568 + 579 TTTTTTTTTTTT +>ponAbe2 4 hg18 chr22 16118223 16119141 panTro2 chr22 16134653 16135572 ponAbe2 chr22 13837634 13838553 rheMac2 chr10 61918520 61919424 calJac1 Contig13114 15842 16747 tetranucleotide GAGG 8 821 + 833 GAGGGAGGGAGGG +>hg18 5 hg18 chr22 16119141 16119363 panTro2 chr22 16135572 16135794 ponAbe2 chr22 13838553 13838774 rheMac2 chr10 61919424 61919654 calJac1 Contig13114 16747 16966 dinucleotide [TC][TC] 0 21 + 34 [TCT]G[TCTCTCTCTC] indel/deletion G 4 1 >panTro2 5 hg18 chr22 16119141 16119363 panTro2 chr22 16135572 16135794 ponAbe2 chr22 13838553 13838774 rheMac2 chr10 61919424 61919654 calJac1 Contig13114 16747 16966 dinucleotide [TC][TC] 0 21 + 34 [TCT]G[TCTCTCTCTC] indel/deletion G 4 1 >ponAbe2 5 hg18 chr22 16119141 16119363 panTro2 chr22 16135572 16135794 ponAbe2 chr22 13838553 13838774 rheMac2 chr10 61919424 61919654 calJac1 Contig13114 16747 16966 dinucleotide [TC][TC] 0 21 + 34 [TCT]G[TCTCTCTCTC] indel/deletion G 4 1 >rheMac2 5 hg18 chr22 16119141 16119363 panTro2 chr22 16135572 16135794 ponAbe2 chr22 13838553 13838774 rheMac2 chr10 61919424 61919654 calJac1 Contig13114 16747 16966 dinucleotide [TC][TC] 0 21 + 34 [TCT]G[TCTCTCTCTC] indel/deletion G 4 1 >calJac1 5 hg18 chr22 16119141 16119363 panTro2 chr22 16135572 16135794 ponAbe2 chr22 13838553 13838774 rheMac2 chr10 61919424 61919654 calJac1 Contig13114 16747 16966 dinucleotide [TC][TC] 0 21 + 34 [TCT]G[TCTCTCTCTC] indel/deletion G 4 1 diff -r 000000000000 -r d1bf24c1350f test-data/regVariation/microsatellite/Galaxy17_unmasked_short.maf.gz Binary file test-data/regVariation/microsatellite/Galaxy17_unmasked_short.maf.gz has changed