2
|
1 #!/usr/bin/perl
|
|
2 use strict;
|
|
3 #use feature "say";
|
|
4 #use File::Basename;
|
|
5 $| = 1;
|
|
6
|
|
7 # Get_TestVariants_Variant_Frequencies
|
|
8 # Calculate the frequencies of variants in a testvariants output file
|
|
9 # Two values calculated:
|
|
10 # Frequency vs all alleles
|
|
11 # Frequency vs called alleles
|
|
12
|
|
13 # Input is a testvariants file
|
|
14 # Outputs:
|
|
15 # All data to *-Freq.tsv, including scores and quals
|
|
16 # vars and freqs to *-Freq_Short.tsv
|
|
17 # Exceptions to *-Freq_Log
|
|
18 # Stats to *-Freq_Stats
|
|
19
|
|
20 # Format:
|
|
21 # perl prog file dir
|
|
22 # ie
|
|
23 # perl Get_TestVariants_Variant_Frequencies \
|
|
24 # --Input input_file \
|
|
25 # --First_Genome_Field_Nr col_nr1 \
|
|
26 # --Last_Genome_Field_Nr col_nr2
|
|
27 # --Output1 output1 \
|
|
28 # --Output2 output2
|
|
29
|
|
30 # eg
|
|
31 # perl /perl/Get_TestVariants_Variant_Frequencies_0_0_1.pl \
|
|
32 # --Input /data/Family_Quartet_testvariants.tsv \
|
|
33 # --First_Genome_Field_Nr 9 \
|
|
34 # --Last_Genome_Field_Nr 11
|
|
35 # --Output1 output1 \
|
|
36 # --Output2 output2
|
|
37
|
|
38
|
|
39 # Rick Tearle 2010-11
|
|
40
|
|
41 my $Time -= time; # start time
|
|
42 my $Debug = 0;
|
|
43
|
|
44 # Parsing and storing input parameters
|
|
45 # Only childfields can be repeated
|
|
46 print "$0 @ARGV\nProcessing input parameters\n";
|
|
47 my %ExpectedParams = GetExpectedParams ();
|
|
48 my %EnteredParams = GetEnteredParams ();
|
|
49
|
|
50 # Setting up prog paras from input paras
|
|
51 my $FileIn = $EnteredParams{input};
|
|
52 unless (-f $FileIn) {die "Testvariants input file $FileIn not found\n";} # requires existing file
|
|
53 #my $FileOut = $EnteredParams{output}; #
|
|
54 #$DirectoryOut =~ s/\/$//; # remove trailing slash if present
|
|
55 #unless (-d $DirectoryOut) {die "Output directory $DirectoryOut not found\n";} # requires existing file
|
|
56 #print "$FileIn\n$DirectoryOut\n";
|
|
57 #$FileIn =~ /(^.+\/)(.+?)\./; # get filename without path and without extensions
|
|
58
|
|
59 # my $FileOut1 = $FileOut."-Freq.tsv";
|
|
60 # my $FileOut2 = $FileOut."-Freq_Short.tsv";
|
|
61 # my $FileOut3 = $FileOut."-Freq_Stats.tsv";
|
|
62 # my $FileOut4 = $FileOut."-Freq_Log.tsv";
|
|
63
|
|
64 print "\nOpening Input File:\n\t$FileIn\n";
|
|
65 my $IN = OpenFile ($FileIn); # open the file with correct file format
|
|
66
|
|
67 #print "\nOpening Output Files:\n\t$FileOut1\n\t$FileOut2\n\t$FileOut3\n\t$FileOut4\n"; #exit;
|
|
68 open my $OUT1, ">", $EnteredParams{output1};
|
|
69 open my $OUT2, ">", $EnteredParams{output2};
|
|
70 #open my $OUT3, ">", $EnteredParams{output3};
|
|
71 #open my $OUT4, ">", $EnteredParams{output4};
|
|
72
|
|
73 # Get col header and genomes fields
|
|
74 my $ColHeader = <$IN>; # get col header
|
|
75 chomp $ColHeader;
|
|
76 my @ColHeader = split /\t/, $ColHeader;
|
|
77 my $StartGenomes = $EnteredParams{first_genome_field_nr} - 1; # first column with testvariants data, 1 based -> 0 based
|
|
78 my $StopGenomes = $EnteredParams{last_genome_field_nr} - 1; # first column with testvariants data, 1 based -> 0 based
|
|
79 if ($StartGenomes < 0) {die "No valid entry for First_Genome_Field_Nr, must be 1 or greater\n";}
|
|
80 if ($StopGenomes < 0) {die "No valid entry for Last_Genome_Field_Nr, must be 1 or greater\n";}
|
|
81 if ($StartGenomes > $StopGenomes) {die "Last_Genome_Field_Nr must be greater than or equal to First_Genome_Field_Nr\n";}
|
|
82 if ($StartGenomes > int @ColHeader) {die "First_Genome_Field_Nr > number of fields in column header\n";}
|
|
83 if ($StopGenomes > int @ColHeader) {die "Last_Genome_Field_Nr > number of fields in column header\n";}
|
|
84 my $NrGenomes = $StopGenomes - $StartGenomes + 1;
|
|
85 #print "$StartGenomes\t$StopGenomes\n"; #exit;
|
|
86 #print "First Genome Field:\n\t$ColHeader[$StartGenomes]\n";
|
|
87 #print "Last Genome Field:\n\t$ColHeader[$StopGenomes]\n\n";
|
|
88
|
|
89 # print column headers
|
|
90 print $OUT1 join("\t",@ColHeader),"\tAllFreq\tCalledFreq\n";
|
|
91 print $OUT2 join("\t",@ColHeader[0..7]),"\tAllFreq\tCalledFreq\n";
|
|
92 print join("\t",@ColHeader),"\n";
|
|
93 print "First Genome Field: $ColHeader[$StartGenomes]\n";
|
|
94 print "Last Genome Field: $ColHeader[$StopGenomes]\n";
|
|
95 print "Nr Genomes: $NrGenomes\n\n";
|
|
96
|
|
97 print "\nProcessing Variants....\n";
|
|
98 my $VariantCount = 0; # variant locus counter, not used
|
|
99 my %AllFreqCounts; # storing histogram of all freq counts
|
|
100 my %CalledFreqCounts; # storing histogram of called freq counts
|
|
101 my $Warnings;
|
|
102 while (<$IN>)
|
|
103 {
|
|
104 # testvariants fields: variantId chromosome begin end varType reference alleleSeq xRef GS000000XX1-ASM GS000000XX2-ASM [GS000000XXN-ASM]
|
|
105 my $Line = $_; # save line for output below
|
|
106 chomp $Line;
|
|
107 my @F = split /\t/, $Line; # split in to fields
|
|
108 $VariantCount++; # increment variant counter
|
|
109 my $UseFields = join ("",@F[$StartGenomes..$StopGenomes]); # get genome fields as string, to count 0s and 1s
|
|
110 my $Count1 = () = $UseFields =~ /1/g; # count the number of 1s
|
|
111 my $Count0 = () = $UseFields =~ /0/g; # count the number of 0s
|
|
112 my $CountN = () = $UseFields =~ /N/g; # count the number of Ns
|
|
113 my $NrAlleles = $Count1 + $Count0 + $CountN; # total count
|
|
114 unless ($NrAlleles == $NrGenomes *2 or $NrAlleles == $NrGenomes) # count does not match expected for diploid/haploid locus
|
|
115 {
|
|
116 print "$NrAlleles alleles for variant ",join(" ",@F[0..7]),"\n"; # log warning
|
|
117 #print "Expected $NrGenomes or ",$NrGenomes*2," alleles depending on ploidy of locus\n";
|
|
118 #if ($Warnings++ > 10) {die "Have found $Warnings exceptions for this file, termnating processing\n";} # terminate if too many warnings
|
|
119 }
|
|
120 my $AllFreq = sprintf("%0.3f",$Count1/$NrAlleles); # calculate freq of 1s vs all alleles
|
|
121 my $CalledFreq = sprintf("%0.3f",0);
|
|
122 if ($Count1+$Count0) {$CalledFreq = sprintf("%0.3f",$Count1/($Count1+$Count0));} # calculate freq of 1s vs called alleles, if there are any
|
|
123 $AllFreqCounts{$AllFreq}++; # increment all freq histogram
|
|
124 $CalledFreqCounts{$CalledFreq}++; # increment called freq histogram
|
|
125 #print "$Line\n$AlleleCount\t$Count1\t$Count0\t$AllFreq\t$CalledFreq\n"; #exit;
|
|
126 print $OUT1 "$Line\t$AllFreq\t$CalledFreq\n"; # output full testvariants plus frequencies for this var
|
|
127 print $OUT2 join("\t",@F[0..7]),"\t$AllFreq\t$CalledFreq\n"; # output just var info plus frequencies for this var
|
|
128 #exit if $VariantCount > 20;
|
|
129 }
|
|
130 close $OUT1;
|
|
131 close $OUT2;
|
|
132
|
|
133 # Print frequency histograms
|
|
134 print "Nr Variants at each Frequency (All):\nFreq\tCount\n"; # header
|
|
135 foreach my $Freq (sort {$a <=> $b} keys %AllFreqCounts) {print "$Freq\t$AllFreqCounts{$Freq}\n";}
|
|
136
|
|
137 print "\nNr Variants at each Frequency (Called):\nFreq\tCount\n"; # header
|
|
138 foreach my $Freq (sort {$a <=> $b} keys %CalledFreqCounts) {print "$Freq\t$CalledFreqCounts{$Freq}\n";}
|
|
139
|
|
140 $Time += time;
|
|
141 print "\ntime $Time\n";
|
|
142
|
|
143 ###########################################################################
|
|
144 # SUBS #
|
|
145 ###########################################################################
|
|
146
|
|
147 sub GetExpectedParams
|
|
148 {
|
|
149 my %Hash =
|
|
150 (
|
|
151 "input" => -1,
|
|
152 "output_dir" => -1,
|
|
153 ); # store parameters and values
|
|
154 return %Hash;
|
|
155 }
|
|
156
|
|
157 sub GetEnteredParams
|
|
158 {
|
|
159 # Processing @ARGV
|
|
160 my %Hash;
|
|
161
|
|
162 my @ARGVs = split /--/, join (" ",@ARGV); # split args on --, into array
|
|
163 #print "Start\n", join ("\n",@ARGVs),"\n",int @ARGVs - 1,"\n\n" if $Debug;
|
|
164 #print "Key\tVal\n" if $Debug; #exit;
|
|
165 for my $n (1..$#ARGVs) # parse each
|
|
166 {
|
|
167 $ARGVs[$n] =~ s/\s+$//; # remove any trailing spaces
|
|
168 my ($Key, $Val) = split / /, $ARGVs[$n], 2; # put first element into key, any other elements into val
|
|
169 $Key = lc $Key;
|
|
170 $Hash{$Key} = $Val; # make a hash entry out of key and val
|
|
171 #print "$Key\t$EnteredParams{$Key}\n" if $Debug;
|
|
172 }
|
|
173 #print int(keys %Hash),"\n" if $Debug;
|
|
174 #foreach my $Arg (keys %Hash) {print "Arg: $Arg\t",$ExpectedParams{$Arg},"\n";}
|
|
175 #print "Arg string:\t",join (" ",@ARGV),"\n" if $Debug;
|
|
176 #exit if $Debug;
|
|
177 return %Hash; # hash now has each -- entry param, with associated values
|
|
178 }
|
|
179
|
|
180 sub SaveArrayAsString
|
|
181 {
|
|
182 my $FH = shift;
|
|
183 my $Fields = shift;
|
|
184 #print "$Fields\n";
|
|
185 print $FH join("\t",@$Fields),"\n";
|
|
186 }
|
|
187
|
|
188 sub ConcatenateVariants
|
|
189 {
|
|
190 my $ArrayIn = shift; # ptr to array
|
|
191 my $StateFieldNr = shift; # field to process
|
|
192 #print int(@$ArrayIn),"\n";
|
|
193 my @ArrayOut; # array to store records out
|
|
194 my $Nr = -1;
|
|
195 foreach my $Entry (@$ArrayIn)
|
|
196 {
|
|
197 }
|
|
198 return \@ArrayOut; # return ptr to array
|
|
199 }
|
|
200
|
|
201 sub LoadStateRecord
|
|
202 {
|
|
203 my $Out = shift;
|
|
204 my $In = shift;
|
|
205 my $StateFieldNr = shift;
|
|
206
|
|
207 $Out->{State} = $$In[$StateFieldNr]; # get state for new record
|
|
208 $Out->{Chr} = $$In[1]; # get chr
|
|
209 $Out->{Begin} = $$In[2]; # get begin of state range
|
|
210 $Out->{End} = $$In[3]; # get current end of state range
|
|
211 $Out->{Records}++; # record added to new count
|
|
212 }
|
|
213
|
|
214 sub OpenFile
|
|
215 {
|
|
216 my $File = shift;
|
|
217 my $FH;
|
|
218 open ($FH, "$File") or die ("$!: can't open file $File");
|
|
219 return $FH;
|
|
220 }
|
|
221
|
|
222 sub OpenFileold
|
|
223 {
|
|
224 my $File = shift;
|
|
225 my $FH;
|
|
226
|
|
227 if ($File =~ /.bz2$/)
|
|
228 {
|
|
229 open ($FH, "bzcat $File |") or die ("$!: can't open file $File");
|
|
230 }
|
|
231 elsif ($File =~ /.gz$/)
|
|
232 {
|
|
233 open ($FH, "gunzip -c $File |") or die ("$!: can't open file $File");
|
|
234 }
|
|
235 elsif ($File =~ /.tsv$/)
|
|
236 {
|
|
237 open ($FH, "cat $File |") or die ("$!: can't open file $File");
|
|
238 }
|
|
239 else
|
|
240 {
|
|
241 die ("$!: do not recognise file type $File");
|
|
242 }
|
|
243 return $FH;
|
|
244 }
|
|
245
|
|
246 sub LoadNewRecord
|
|
247 {
|
|
248 my $In = shift;
|
|
249 my $Out = shift;
|
|
250 $Out->{Chr} = $In->{Chr};
|
|
251 $Out->{State} = $In->{State};
|
|
252 $Out->{Begin} = $In->{Begin};
|
|
253 $Out->{End} = $In->{End};
|
|
254 $Out->{Records} = $In->{Records};
|
|
255 }
|
|
256
|
|
257 sub NewStateRecord
|
|
258 {
|
|
259 my $Record =
|
|
260 {
|
|
261 Chr => "",
|
|
262 Begin => -1,
|
|
263 End => -1,
|
|
264 State => "",
|
|
265 Records => 0,
|
|
266 MIEs => 0,
|
|
267 StateErrors => 0,
|
|
268 Length => -1,
|
|
269 };
|
|
270 return $Record;
|
|
271 }
|