#!/usr/bin/perl
use strict;
use File::Basename;
$| = 1;

# List Unique Variants for Pipeline 1.x and 2.x
# [Uses header if available, checks for position of xref field if not]
# Take one or more var or gene files
# Extract a non-redundant set of variants

# For var files:
# The fields used to define non-redundant variants are:
# chromosome begin end varType reference alleleSeq xRef
# User can nominate class(es) of varType to filter on
# Outputs varScoreEAF, varScoreVAF and varQuality as a default but user can turn them off (separately)
# Scores and qualities stored in separate fields, all values for a variant across a set of genomes.
# Values for different genomes separated by ':', for two hom entries for the same genome by '|'
# Output is accepted by testvariants to generate a variant table, all fields kept in testvariants output

# For gene files:
# The fields used to define non-redundant gene variants are:
# chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol orientation component componentIndex codingRegionKnown impact nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence
# User can nominate class(es) of varType, component or impact to filter on
# All gene entries kept ie  multiple entries if multiple transcripts

# NB Now treating xref as a separate component in var recs, as it is not consistent between X and Y vars
# Not fixed for gene recs yet

# perl List_Unique_Variants_2_0_9.pl
# --File_Type [V|G]
# --Input_File input_file_1 [set of var or gene files]
# --Input_File input_file_2
# ...
# --Input_File input_file_n
# --Output_File filename
# --Var_Type [For both file types, 'All' or any value from the varType field, multiple values allowed, separated by comma]
# --Component [Gene file specific, 'All' or any value from component field of gene file, multiple allowed; 'All' is default]
# --Impact All [Gene file specific, 'All' or any value from impact field of gene file, multiple allowed; 'All' is default]
# --Scores [1.x var file specific, yes|no, yes is default]
# --Scores_VAF [2.0 var file specific, yes|no, yes is default]
# --Scores_EAF [2.0 var file specific, yes|no, yes is default]
# --Score_Qualities [yes|no, yes is default]
# eg
# perl /Users/rtearle/Documents/Programming/Perl/Scripts/Dev/List_Unique_Variants_2_0_4 \
# --File_Type V \
# --Input_File /Yoruban_Trio_1100_37/GS19238-1100-37/GS00028-DNA_A01/ASM/gene-GS19238-1100-37-ASM.tsv.bz2 \
# --Input_File /Yoruban_Trio_1100_37/GS19239-1100-37/GS00028-DNA_B01/ASM/gene-GS19239-1100-37-ASM.tsv.bz2 \
# --Input_File /Yoruban_Trio_1100_37/GS19240-1100-37/GS00028-DNA_C01/ASM/gene-GS19240-1100-37-ASM.tsv.bz2 \
# --Output_File /Users/rtearle/Documents/TBF/YRI_Trio_Protein_Coding.tsv \
# --Var_Type All \
# --Component All \
# --Impact All \
# --Scores_VAF yes \
# --Scores_EAF yes \
# --Score_Qualities yes

# var fields
# 1.x
# locus ploidy haplotype chromosome begin end varType reference alleleSeq totalScore hapLink xRef
# 2.0
# locus ploidy allele chromosome begin end varType reference alleleSeq varScoreVAF varScoreEAF varQuality hapLink xRef

# gene fields
# 1.x index locus allele chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol orientation
#  component componentIndex codingRegionKnown impact nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence
# 2.0 index locus allele chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol orientation
# component componentIndex hasCodingRegion impact nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence pfam

# Parsing and storing input parameters
# Only input_file fields can be repeated
# input parameter names are case insensitive

print "$0 @ARGV\nProcessing input parameters\n";
my $NrParams;
my %ExpectedParams =  GetExpectedParams (); # defaults for every recognised parameter
my %EnteredParams = GetEnteredParams (); # parameters supplied on the command line

# Input Files
# File type selects var ('V') or gene ('G') processing, anything else is fatal
my $FileType = $EnteredParams{file_type};
$FileType = "" unless defined $FileType; # a missing --File_Type is reported by the check below
if ($FileType ne "V" and $FileType ne "G") {die "File Type must be 'V' or 'G', not '$FileType'\n";}
my $FilesIn = $EnteredParams{input_file}; # ptr to list of input files
unless (ref $FilesIn eq "ARRAY" and @$FilesIn) {die "No input files given, use --Input_File at least once\n";} # otherwise the dereferences below fail with an obscure error
print "File Type: $FileType\nInput Files:\n";
my $NrInputFiles = int(@$FilesIn);
foreach my $File (@$FilesIn) {print "$File\n";} # list the files before validating them
foreach my $File (@$FilesIn) {unless (defined $File and -f $File) {die "Input file $File not found\n";}} # requires existing files
my %SeenFile; # file name => nr times listed so far, finds repeats in a single pass
foreach my $File (@$FilesIn)
{
	if ($SeenFile{$File}++) {die "File $File is repeated in input file list\n";}
}

# Output Dir
#my $DirectoryOut = $EnteredParams{output_dir}; # output dir
#$DirectoryOut =~ s/\/$//; # remove trailing slash if present
#unless (-d $DirectoryOut) {mkdir $DirectoryOut or die "Cannot find/create output directory $DirectoryOut\n";} # uses existing dir or makes a new dir if it can

# Output File
# Split the --Output_File value into a directory part (kept with its trailing slash) and a file name
my $FileOut = $EnteredParams{output_file}; # output file, optionally with a leading path
unless (defined $FileOut and length $FileOut) {die "No output file given, use --Output_File\n";}
print "FileOut: $FileOut\n";
my $DirectoryOut = ""; # stays empty when the output file has no path, ie goes to the current dir
if ($FileOut =~ /^(.*\/)?([^\/]+)$/) # split in to path and filename
{
	$DirectoryOut = $1 if defined $1; # assign path
	$FileOut = $2; # assign file name
}
else
{
	die "Output file '$FileOut' must end in a file name\n"; # eg value ends in a slash
}
if ($DirectoryOut ne "" and not -d $DirectoryOut) {die "Output directory $DirectoryOut not found\n";} # fail now rather than after all input files have been processed
print "File: $FileOut\nDir: $DirectoryOut\n"; #exit;
# if (-f $DirectoryOut.$FileOut) # ouput file exists, create a new one based on the name
# {
# 	print "Output file $FileOut exists, modifying to unique file name ";
# 	$FileOut =~ /^(.+?)\./; # find name without extensions
# 	my $Stub = $1; # set stub to name without extensions
# 	$FileOut =~ /(\..+)?$/; # get extension(s)
# 	my $Ext = $1; # set ext to extensions
# 	my $n = 1; # n will increment to find a unique name
# 	my $Suff = ""; # suff tracks n
# 	while (-f $DirectoryOut.$Stub.$Suff.$Ext) {$Suff = "-$n"; $n++;} # loop till we have a new unique filename
# 	$FileOut = $Stub.$Suff.$Ext; # file out now has same name, same extensions, but also -n at the end of the name, making it unique
# 	print "$FileOut\n";
# }

#print "Files\n",join("\n",@$FilesIn),"\n\n";
#print "Ouput Dir\n$DirectoryOut\n";
#print "Ouput File\n$FileOut\n";

# Extract Header & Column Header
# Only the first file is inspected here, its column header is the basis of the output column header
my $IN = OpenFile ($$FilesIn[0]); # open the first file with correct file format
my $Header = GetHeaderAsString ($IN); # get header
unless ($Header) {close $IN; $IN = OpenFile ($$FilesIn[0]);} # if there is no header, close and reopen file, ie start file again
my $ColHeader = <$IN>; # get col header, first remaining line
unless (defined $ColHeader) {die "No column header found in file $$FilesIn[0], file appears to be empty\n";}
chomp $ColHeader;
close $IN; # nothing else is read from this handle, every file is opened again when it is processed

# Get version if filetype is var - needed because there are new fields in 2.0 and posn of xRef changed
my ($Version, $XrefField);
if ($FileType eq "V")
{
	($Version, $XrefField) = GetVersion ($Header, $ColHeader);
	#print "$Version $XrefField\n"; exit;
	unless ($Version) {die "Cannot determine format version of first file in list\nNeed either a native Complete header or a native Complete Column Header with an xRef field\n";}
}

# Shared input params
# Var types are turned into a regex alternation, eg "snp, ins" becomes "snp|ins"
my $OutputVarTypes = lc $EnteredParams{var_type} || $ExpectedParams{var_type}; # var types listed in file in lc
$OutputVarTypes =~ s/\s+//g; # remove every space, not only the first one
$OutputVarTypes =~ s/^,+|,+$//g; # remove leading/trailing commas, they would leave an empty alternative that matches every record
$OutputVarTypes =~ s/,+/|/g; # create regex string
$OutputVarTypes = $ExpectedParams{var_type} if $OutputVarTypes eq ""; # nothing usable entered, use the default

# Input Params for var file
# Each Keep flag is a real boolean: 1 only when the value is "yes" (any case), 0 for "no" or anything else
# A plain string test is not enough because the string "no" is itself true in Perl
my ($KeepScoresVAF, $KeepScoresEAF, $KeepQuals, $KeepScores, $VarExtras);
if ($FileType eq "V")
{
	if ($Version == 2)
	{
		# --Score_Qualities is the documented name, score_quality (the key used for the default) is accepted as well
		my $Quals = $EnteredParams{score_qualities} || $EnteredParams{score_quality} || $ExpectedParams{score_quality};
		$KeepQuals = (lc($Quals) eq "yes") ? 1 : 0; # keep varQuality for 2.0
		my $ScoresVAF = $EnteredParams{scores_vaf} || $ExpectedParams{scores_vaf};
		$KeepScoresVAF = (lc($ScoresVAF) eq "yes") ? 1 : 0; # keep varScoreVAF for 2.0
		my $ScoresEAF = $EnteredParams{scores_eaf} || $ExpectedParams{scores_eaf};
		$KeepScoresEAF = (lc($ScoresEAF) eq "yes") ? 1 : 0; # keep varScoreEAF for 2.0
	}
	else # Version 1
	{
		my $Scores = $EnteredParams{scores} || $ExpectedParams{scores};
		$KeepScores = (lc($Scores) eq "yes") ? 1 : 0; # keep totalScore for 1.x
	}
	# Always collect the per-record extras for var files: the xRef column is stored alongside the scores
	# and is written for every var record, whichever score columns are kept
	$VarExtras = 1;
}

# Input Params for gene file
my $OutputComponents = uc $EnteredParams{component} || $ExpectedParams{component}; # components listed in file in uc
my $OutputImpacts = uc $EnteredParams{impact} || $ExpectedParams{impact}; # impacts listed in file in uc

# Chromosome names, and one record store per chromosome
# The array fixes the order in which chromosomes are visited later, autosomes first, then X, Y and M
my @ChrNames = ((map { "chr" . $_ } 1 .. 22), 'chrX', 'chrY', 'chrM');
# Hash of hashes: chromosome name => (initially empty) hash that will hold that chromosome's records
my %Vars = map { ($_ => {}) } @ChrNames;
#print "\n"; #exit;

# Create output col header
# Rebuilds $ColHeader from the first file's column header so that it names only the fields kept in the output
if ($FileType eq "V")
{
	# 1.x locus ploidy haplotype chromosome begin end varType reference alleleSeq totalScore hapLink xRef
	# 2.0 locus ploidy allele chromosome begin end varType reference alleleSeq varScoreVAF varScoreEAF varQuality hapLink xRef
	my @Fields = split "\t", $ColHeader;
	$ColHeader = join("\t",@Fields[3..8])."\t".$Fields[$XrefField]; # chromosome .. alleleSeq, then xRef
	if ($Version == 2) # 2.0
	{
		if ($KeepScoresVAF) {$ColHeader .= "\tvarScoreVAF";}
		if ($KeepScoresEAF) {$ColHeader .= "\tvarScoreEAF";}
		if ($KeepQuals) {$ColHeader .= "\tvarQuality";}
	}
	else # 1.x
	{
		if ($KeepScores) {$ColHeader .= "\ttotalScore";}
	}
}
elsif ($FileType eq "G")
{
	# 1.x index locus allele chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol orientation
	#  component componentIndex codingRegionKnown impact nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence
	# 2.0 index locus allele chromosome begin end varType reference call xRef geneId mrnaAcc proteinAcc symbol orientation
	# component componentIndex hasCodingRegion impact nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence pfam
	my @Fields = split "\t", $ColHeader;
	my $LastField = $#Fields < 30 ? $#Fields : 30; # 30 is much more than needed, stop at the last real column so no empty padding columns are written
	$ColHeader = join("\t",@Fields[3..$LastField]);
	foreach my $Filter ($OutputComponents, $OutputImpacts) # loop var aliases the two filter strings, so they are edited in place
	{
		$Filter =~ s/\s+//g; # remove every space
		$Filter =~ s/^,+|,+$//g; # remove leading/trailing commas, they would leave an empty alternative that matches every record
		$Filter =~ s/,+/|/g; # create regex string
		$Filter = "ALL" if $Filter eq ""; # nothing usable left, do not filter on this field
	}
	#print "OutputVarTypes: $OutputVarTypes\nOutputComponents: $OutputComponents\nOutputImpacts: $OutputImpacts\n";
}
else
{
	die "FileType $FileType not understood\n"; # redundant
}

# Set up Processing Subs
# The generic names ExtractSub2Use and AddRecSub2Use are aliased to the gene subs for 'G'
# and to the var subs for any other file type
my $UseGeneSubs = ($FileType eq "G");
*ExtractSub2Use = $UseGeneSubs ? \&ExtractGeneFields : \&ExtractVarFields;
*AddRecSub2Use = $UseGeneSubs ? \&AddGeneRec : \&AddVarRec;
#print "Gene: ",\&ExtractGeneFields," ",\&AddGeneRec," Var: ",\&ExtractVarFields," ",\&AddVarRec," Using: ",\&ExtractSub2Use," ",\&AddRecSub2Use,"\n"; exit;

# Process Files
# Every input file is read once, each matching record is added to the store for its chromosome
my $RecCount = 0; # total nr recs that are coding/splicing/spanning
my $FileCount = 0; # keeps track of nr files to use to format eg scores field
foreach my $File (@$FilesIn)
{
	print "Processing file $File\n";
	$FileCount++; # counting nr files

	# Open file, process header, col header
	my $IN = OpenFile ($File); # open the file with correct file format
	my $Header = GetHeaderAsString ($IN); # get  header
	unless ($Header) {close $IN; $IN = OpenFile ($File);} # if no header, close and reopen this same file, ie start file again
	my $ColHeader = <$IN>; # get col header, first remaining line
	unless (defined $ColHeader and $ColHeader =~ /^>/) {print "Suspect column header for file $File:\n$ColHeader\n";}

	my $Count = 0; # nr recs for this file that are coding/splicing/spanning
	my $Skipped = 0; # nr recs that passed the filters but are on a chromosome with no record store
	while (my $Line = <$IN>) # loop through remainder of file ie data
	{
		my ($Rec, $Chr, $ScoreVAF, $ScoreEAF, $ScoreQual, $XRef) = ExtractSub2Use ($Line, $XrefField); # sub extracts wanted fields in rec as string, chr, other fields optionally
		next unless $Rec; # rec is empty when the record was filtered out
		unless (defined $Chr and exists $Vars{$Chr}) {$Skipped++; next;} # only listed chromosomes are written out, do not count recs that cannot be saved
		AddRecSub2Use ($Rec, $Vars{$Chr}, $ScoreVAF, $ScoreEAF, $ScoreQual, $FileCount, $XRef);
		$Count++; # increment count of coding/splicing/spanning vars
	}
	print "Nr matched records for this file: $Count\n";
	print "Nr records skipped, chromosome not recognised: $Skipped\n" if $Skipped;
	$RecCount += $Count; # add this file's count to total count
	close $IN;
}
print "Nr matched records across all files:\t $RecCount\n"; # total count

# Open file out, write col header
# Records are written chromosome by chromosome, sorted within a chromosome, each with a running number in the first column
print "Sorting and Saving to file $DirectoryOut$FileOut ...\n";
open my $OUT, ">", $DirectoryOut . $FileOut or die "could not write to $DirectoryOut$FileOut: $!\n";
if ($FileType eq "V")
{
	print $OUT "variantId\t";
} # first col header for var file
else
{
	$ColHeader =~ s/\tcall\t/\talleleSeq\t/;
	print $OUT "index\t";
} # first col header for gene file
print $OUT "$ColHeader\n"; # remainder of col header

$RecCount = 0; # reuse total count for nr of non-redundant vars
$FileCount--; # reduce by one, used below to add missing delimiters
foreach my $Chr (@ChrNames) # sort records in each chr hash and print with count
{
	foreach my $Rec (sort {SortStringsasArrays ($a, $b)} keys %{$Vars{$Chr}}) # using sub to sort on begin, end fields
	{
		next unless $Rec;
		# Stored value is an array for var recs: [0] last file nr, [1]..[3] scores/quality strings, [4] xref
		# Anything else (eg the plain count kept for gene recs) is treated as having no extras
		my $Stored = $Vars{$Chr}->{$Rec};
		my @Extras = ref $Stored eq "ARRAY" ? @$Stored : ();

		$RecCount++; # increment count of non-redundant vars
		print $OUT "$RecCount\t$Rec"; # printing rec and count
		print $OUT "\t",$Extras[4] if $FileType eq "V"; # print xref if var files

		if ($VarExtras)
		{
			# Slot [1] is filled for both 1.x and 2.0, so it is the one used to count the genome delimiters
			my $FieldDelimiterCount = () = $Extras[1] =~ /:/g;
			my $Addition = ":" x ($FileCount - $FieldDelimiterCount); # delimiters for trailing files that did not have this var
			if ($Version == 2)
			{
				print $OUT "\t",$Extras[1],$Addition if $KeepScoresVAF;
				print $OUT "\t",$Extras[2],$Addition if $KeepScoresEAF;
				print $OUT "\t",$Extras[3],$Addition if $KeepQuals;
			}
			else
			{
				print $OUT "\t",$Extras[1],$Addition if $KeepScores;
			}
		}
		print $OUT "\n";
	}
}
close $OUT or die "could not finish writing to $DirectoryOut$FileOut: $!\n"; # buffered write errors only show up at close
print "Nr saved records:\t $RecCount\n"; # count of non-redundant vars, c/f all vars above


###########################################################################
#                                   SUBS                                  #
###########################################################################

# Returns the recognised parameters with their default values as a flat key/value list
# -1 marks a parameter without a default, input_file starts as an empty list
# Side effect: stores the number of recognised parameters in $NrParams
sub GetExpectedParams
{
	my %Defaults = (file_type => -1, output_file => -1, input_file => []);
	$Defaults{var_type} = "all"; # lower case
	@Defaults{qw(component impact)} = ("ALL", "ALL"); # upper case
	@Defaults{qw(scores scores_eaf scores_vaf score_quality)} = ("yes") x 4;

	$NrParams = scalar keys %Defaults;
	return %Defaults;
}

# Parses @ARGV into a hash of parameter name => value
# Every parameter is written as --Name value; names are lower cased, values are kept as entered
# input_file may be repeated, its values are collected in an array ref; any other repeated name keeps its last value
# Anything in front of the first --Name is ignored
sub GetEnteredParams
{
	# Processing @ARGV
	my %Hash;
	# "--" only starts a parameter at the very beginning or after white space,
	# so a value that merely contains "--" (eg a file name) is left in one piece
	my @ARGVs = split /(?:^|\s+)--/, join (" ",@ARGV); # split args on --, into array
	for my $n (1..$#ARGVs) # parse each [nb arg 0 is whatever precedes the first --, so ignored]
	{
		$ARGVs[$n] =~ s/^\s+|\s+$//g; # remove any leading or trailing spaces
		my ($Key, $Val) = split /\s+/, $ARGVs[$n], 2; # put first element into key, any other elements into val
		next unless defined $Key and length $Key; # a bare "--" names no parameter
		$Key = lc $Key; # make lower case, ie case insensitive
		if ($Key eq "input_file") # multiple entries expected, setting up array
		{
			push @{$Hash{$Key}}, $Val; # add input to input hash
		}
		else
		{
			$Hash{$Key} = $Val; # make a hash entry out of key and val
		}
	}
	return %Hash; # hash now has each --entry param, with associated values
}

# Opens a file for reading as plain text and returns the file handle
# Dies with the system error when the file cannot be opened
sub OpenFile
{
	my $File = shift;
	# three-arg open: the file name is never interpreted as a mode or a command
	open my $FH, "<", $File or die "Cannot open file $File: $!\n";
	return $FH;
}

# Opens a file for reading and returns the file handle, decompressing on the fly by file extension
# .bz2 is read through bzcat, .gz through gunzip -c, anything else is opened directly as text
# (with a notice when the extension is neither .tsv nor .txt)
# Dies when the file or the decompression command cannot be opened
sub OpenFileold
{
	my $File = shift;
	my $FH;

	if ($File =~ /\.bz2$/)
	{
		# list form of pipe open: the file name is passed as a single argument, no shell is involved
		open ($FH, "-|", "bzcat", $File) or die ("$!: can't open file $File");
	}
	elsif ($File =~ /\.gz$/)
	{
		open ($FH, "-|", "gunzip", "-c", $File) or die ("$!: can't open file $File");
	}
	else
	{
		unless ($File =~ /\.tsv$/ or $File =~ /\.txt$/)
		{
			print ("Do not recognise file type for file $File.\nOpening as text file\n");
		}
		open ($FH, "<", $File) or die ("$!: can't open file $File"); # plain text needs no helper process
	}
	return $FH;
}

# Reads the header from an open file handle and returns it as one string
# The header is every line up to the first empty line, joined without separators; the empty line is consumed
# Returns "" when there is no header: the first line is empty, no empty line turns up within the
# first lines of the file, or the file ends first. In those cases lines have already been read
# from the handle, so the caller has to reopen the file to start again
sub GetHeaderAsString
{
    my $FH = shift;
    my $Header = "";
    my $Count = 0;
    while (my $Line = <$FH>) # loop until a line is empty
    {
			$Line =~ s/\r?\n$//; # remove line end, also when the file has DOS line ends
			return $Header if $Line eq ""; # empty line marks the end of the header
			$Header .= $Line;
			return "" if $Count++ > 50; # too many lines for a header, must be no header
    }
    return ""; # end of file without an empty line, no header
}

# Works out the var file format version (1 or 2) and the index of the xRef column
# Params: header as one string (may be empty), column header line
# Returns (version, xref field index); version is 0 when it cannot be determined and the
# index is -1 when no xRef column was found. Only var files ('V') are examined
# The #FORMAT_VERSION line of the header is used when present, otherwise the position of
# the xRef column in the column header decides: 11 means 1.x, 13 means 2.0
sub GetVersion
{
	my $Header = shift;
	my $ColHeader = shift;

	my $Version = 0; # need to know if it is 1.x or 2.x
	my $XrefField = -1;

	if ($FileType eq "V")
	{
		if ($Header)
		{
			my ($Major) = $Header =~ /#FORMAT_VERSION\t(\d)/; # undef when the line is missing, never a stale capture
			if (defined $Major and $Major == 1) {$Version = 1; $XrefField = 11;}
			elsif (defined $Major and $Major == 2) {$Version = 2; $XrefField = 13;}
			else {print "Warning: Format Version not found in Header\n";} # not in header
		}
		unless ($Version)
		{
			$ColHeader = "" unless defined $ColHeader;
			$ColHeader =~ s/[\r\n]+$//; # xRef is the last column, a left over line end would hide it
			my @ColHeader = split /\t/, $ColHeader;
			for my $n (0..$#ColHeader)
			{
				if ($ColHeader[$n] eq "xRef") # compare the individual column name
				{
					$XrefField = $n;
					if ($n == 11) 	 {$Version = 1;}
					elsif ($n == 13) {$Version = 2;}
					last;
				}
			}
		}
	}
	return ($Version, $XrefField);
}

# Takes one gene file line, applies the varType, component and impact filters
# Returns (rec, chr): rec is columns 3..24 joined by tabs (the file specific leading columns
# are dropped), chr is column 3. Returns ("","") when the line fails a filter
# gene fields
# >index locus allele chromosome begin end varType reference call xRef geneId mrnaAcc
# proteinAcc symbol orientation component componentIndex hasCodingRegion impact
# nucleotidePos proteinPos annotationRefSequence sampleSequence genomeRefSequence pfam
sub ExtractGeneFields
{
	my ($Line) = @_;
	chomp $Line; # remove return
	my @Cols = split "\t", $Line;

	# one entry per filter: user selection, the keyword meaning no filtering, the column value to test
	my @Filters =
	(
		[$OutputVarTypes, "all", $Cols[6]],
		[$OutputComponents, "ALL", $Cols[15]],
		[$OutputImpacts, "ALL", $Cols[18]],
	);
	foreach my $Filter (@Filters)
	{
		my ($Wanted, $Any, $Value) = @$Filter;
		next if $Wanted eq $Any; # this field is not being filtered on
		return ("","") unless $Value =~ /$Wanted/; # nominated vals not found, leave
	}

	return (join("\t",@Cols[3..24]), $Cols[3]);
}

# Takes one var file line and the index of its xRef column, applies the varType filter
# Always returns six values so that the xRef lands in the same position for every format:
# (rec, chr, score1, score2, quality, xref)
#   rec     columns 3..8 joined by tabs (chromosome begin end varType reference alleleSeq)
#   chr     column 3
#   2.0     score1 = varScoreVAF, score2 = varScoreEAF, quality = varQuality
#   1.x     score1 = totalScore, score2 and quality undefined
# Scores and quality are also undefined when no extras are being collected
# Returns ("","") when the line fails the filter
# var fields
# 1.x locus ploidy haplotype chromosome begin end varType reference alleleSeq totalScore hapLink xRef
# 2.0 locus ploidy allele chromosome begin end varType reference alleleSeq varScoreVAF varScoreEAF varQuality hapLink xRef
sub ExtractVarFields
{
	my $Rec = shift;
	my $XrefField = shift;

	chomp $Rec; # remove return
	$Rec =~ s/\r$//; # xRef is the last column, do not let a DOS line end become part of it
	my @Fields = split "\t", $Rec;
	#print "$Fields[6] $OutputVarTypes \n"; exit;
	unless ($OutputVarTypes eq "all" or $Fields[6] =~ /$OutputVarTypes/) {return ("","");} # nominated vals not found, leave
	my $Chr = $Fields[3]; # assign chr
	$Rec = join("\t",@Fields[3..8]);
	my $XRef = (defined $XrefField and $XrefField >= 0) ? $Fields[$XrefField] : undef; # no xref without a known column

	if ($VarExtras)
	{
		return ($Rec, $Chr, $Fields[9], $Fields[10], $Fields[11], $XRef) if $Version == 2;
		return ($Rec, $Chr, $Fields[9], undef, undef, $XRef); # $Version == 1, padded so xref stays in the last position
	}
	return ($Rec, $Chr, undef, undef, undef, $XRef);
}

# Adds one var record to the record store of its chromosome
# Params: rec (key), ref to the chromosome's hash, score1, score2, quality, nr of the file
# being processed (1 based), xref. For 1.x files score1 is totalScore and score2/quality are unused
# Each stored value is an array:
#   [0] nr of the last file this var was seen in
#   [1] score1 values, [2] score2 values, [3] qualities as H (VQHIGH) or L (anything else)
#   [4] xref, the longest one seen so far
# In [1]..[3] values of different files are separated by ':' (files without the var leave an
# empty value), a second value for the same file by '|'
# Nothing is stored when the hash ref is missing
sub AddVarRec
{
	my ($Rec, $RecHash, $ScoreVAF, $ScoreEAF, $ScoreQual, $FileCount, $XRef) = @_;
	return unless ref $RecHash eq "HASH"; # no store for this chromosome, adding to a temporary hash would lose the rec silently

	my $Entry = $RecHash->{$Rec};
	my $Delimiter = "";
	if (ref $Entry eq "ARRAY") # hash entry for this var already exists
	{
		if ($VarExtras)
		{
			if ($Entry->[0] == $FileCount)
			{
				$Delimiter = "|"; # same file, second entry for this var, use |
			}
			else # diff file, use :
			{
				# slot [1] is filled for 1.x and 2.0 alike, slot [3] only for 2.0
				my $FieldDelimiterCount = () = $Entry->[1] =~ /:/g; # count nr field delims
				$Delimiter = ":" x ($FileCount - $FieldDelimiterCount - 1); # delimiters for any processed files, that didnt have this var
			}
		}
		my $NewLength = defined $XRef ? length $XRef : 0;
		my $OldLength = defined $Entry->[4] ? length $Entry->[4] : 0;
		$Entry->[4] = $XRef if $NewLength > $OldLength; # replace xref if new xref is longer
	}
	else # new var
	{
		$Entry = $RecHash->{$Rec} = []; # create array to hold it
		$Delimiter = ":" x ($FileCount - 1) if $VarExtras; # delimiters for prev processed files, that didnt have this var
		$Entry->[4] = $XRef; # add xref
	}

	$Entry->[0] = $FileCount;
	return unless $VarExtras; # just the rec and its xref, no scores being collected

	# Process var
	if ($Version == 2)
	{
		$Entry->[1] .= $Delimiter.$ScoreVAF; # add delimiter, varScoreVAF
		$Entry->[2] .= $Delimiter.$ScoreEAF; # add delimiter, varScoreEAF
		$Entry->[3] .= $Delimiter.($ScoreQual eq "VQHIGH" ? "H" : "L"); # add delimiter, qual
	}
	else
	{
		$Entry->[1] .= $Delimiter.$ScoreVAF; # add delimiter, totalScore
	}
}

# Registers one gene record in the given hash
# Params: rec (used as key), ref to the hash; any further arguments are ignored
# The value stored under the key is the number of times the rec has been added
sub AddGeneRec
{
	my ($GeneRec, $Seen) = @_;
	$Seen->{$GeneRec}++;
}

# Comparison sub for sorting two tab delimited recs
# Compares the second field (begin) numerically and, when those are equal, the third field (end)
# Returns -1, 0 or 1 in the way sort expects; all other fields are ignored
sub SortStringsasArrays
{
	my ($Left, $Right) = @_;

	my (undef, $BeginLeft, $EndLeft) = split "\t", $Left; # first field is not used for ordering
	my (undef, $BeginRight, $EndRight) = split "\t", $Right;

	# the end fields only decide when the begin fields are equal
	return ($BeginLeft <=> $BeginRight) || ($EndLeft <=> $EndRight);
}
