comparison bin/AnalyzeSequenceFilesData.pl @ 0:4816e4a8ae95 draft default tip

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 09:23:18 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4816e4a8ae95
1 #!/usr/bin/perl -w
2 #
3 # $RCSfile: AnalyzeSequenceFilesData.pl,v $
4 # $Date: 2015/02/28 20:46:04 $
5 # $Revision: 1.33 $
6 #
7 # Author: Manish Sud <msud@san.rr.com>
8 #
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability of fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 use strict;
30 use FindBin; use lib "$FindBin::Bin/../lib";
31 use Getopt::Long;
32 use File::Basename;
33 use Text::ParseWords;
34 use Benchmark;
35 use FileUtil;
36 use TextUtil;
37 use SequenceFileUtil;
38 use AminoAcids;
39 use NucleicAcids;
40
41 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
42
43 # Autoflush STDOUT
44 $| = 1;
45
46 # Starting message...
47 $ScriptName = basename($0);
48 print "\n$ScriptName: Starting...\n\n";
49 $StartTime = new Benchmark;
50
51 # Setup script usage message...
52 SetupScriptUsage();
53 if ($Options{help} || @ARGV < 1) {
54 die GetUsageFromPod("$FindBin::Bin/$ScriptName");
55 }
56
57 # Expand wild card file names...
58 my(@SequenceFilesList);
59 @SequenceFilesList = ExpandFileNames(\@ARGV, "aln msf fasta fta pir");
60
61 print "Processing options...\n";
62 my(%OptionsInfo);
63 ProcessOptions();
64
65 # Set up information about input files...
66 print "Checking input sequence file(s)...\n";
67 my(%SequenceFilesInfo);
68 RetrieveSequenceFilesInfo();
69 SetupSequenceRegionsData();
70
71 # Process input files..
72 my($FileIndex);
73 if (@SequenceFilesList > 1) {
74 print "\nProcessing sequence files...\n";
75 }
76 for $FileIndex (0 .. $#SequenceFilesList) {
77 if ($SequenceFilesInfo{FilesOkay}[$FileIndex]) {
78 print "\nProcessing file $SequenceFilesList[$FileIndex]...\n";
79 AnalyzeSequenceFileData($FileIndex);
80 }
81 }
82 print "\n$ScriptName:Done...\n\n";
83
84 $EndTime = new Benchmark;
85 $TotalTime = timediff ($EndTime, $StartTime);
86 print "Total time: ", timestr($TotalTime), "\n";
87
88 ###############################################################################
89
90 # Analyze sequence file...
91 sub AnalyzeSequenceFileData {
92 my($FileIndex) = @_;
93 my($SequenceFile, $SequenceDataRef);
94
95 $SequenceFile = $SequenceFilesList[$FileIndex];
96
97 open SEQUENCEFILE, "$SequenceFile" or die "Error: Can't open $SequenceFile: $! \n";
98 $SequenceDataRef = ReadSequenceFile($SequenceFile);
99 close SEQUENCEFILE;
100
101 if ($OptionsInfo{CalculatePercentIdentityMatrix}) {
102 CalculatePercentIdentityMatrix($FileIndex, $SequenceDataRef);
103 }
104 if ($OptionsInfo{PerformResidueFrequencyAnalysis}) {
105 PerformResidueFrequencyAnalysis($FileIndex, $SequenceDataRef);
106 }
107 }
108
109 # Calculate percent identity matrix...
110 sub CalculatePercentIdentityMatrix {
111 my($FileIndex, $SequenceDataRef) = @_;
112 my($PercentIdentity, $PercentIdentityMatrixFile, $PercentIdentityMatrixRef, $RowID, $ColID, $Line, @LineWords);
113
114 $PercentIdentityMatrixFile = $SequenceFilesInfo{OutFileRoot}[$FileIndex] . 'PercentIdentityMatrix.' . $SequenceFilesInfo{OutFileExt}[$FileIndex];
115 $PercentIdentityMatrixRef = CalculatePercentSequenceIdentityMatrix($SequenceDataRef, $OptionsInfo{IgnoreGaps}, $OptionsInfo{Precision});
116
117 print "Generating percent identity matrix file $PercentIdentityMatrixFile...\n";
118 open OUTFILE, ">$PercentIdentityMatrixFile" or die "Can't open $PercentIdentityMatrixFile: $!\n";
119
120 # Write out column labels...
121 @LineWords = ();
122 push @LineWords, '';
123 for $ColID (@{$PercentIdentityMatrixRef->{IDs}}) {
124 push @LineWords, $ColID;
125 }
126 $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
127 print OUTFILE "$Line\n";
128
129 # Write out rows...
130 for $RowID (@{$PercentIdentityMatrixRef->{IDs}}) {
131 @LineWords = ();
132 push @LineWords, $RowID;
133 for $ColID (@{$PercentIdentityMatrixRef->{IDs}}) {
134 $PercentIdentity = $PercentIdentityMatrixRef->{PercentIdentity}{$RowID}{$ColID};
135 push @LineWords, $PercentIdentity;
136 }
137 $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
138 print OUTFILE "$Line\n";
139 }
140 close OUTFILE;
141 }
142
143 # Perform frequency analysis...
144 sub PerformResidueFrequencyAnalysis {
145 my($FileIndex, $SequenceDataRef) = @_;
146
147 CountResiduesInRegions($FileIndex, $SequenceDataRef);
148 CalculatePercentResidueFrequencyInRegions($FileIndex, $SequenceDataRef);
149 GeneratePercentResidueFrequencyOutFilesForRegions($FileIndex, $SequenceDataRef);
150 }
151
152 # Count residues...
153 sub CountResiduesInRegions {
154 my($FileIndex, $SequenceDataRef) = @_;
155
156 # Setup rerfernce sequence data...
157 my($RefereceSequenceID, $RefereceSequence);
158 $RefereceSequenceID = $SequenceFilesInfo{RefereceSequenceID}[$FileIndex];
159 $RefereceSequence = $SequenceFilesInfo{RefereceSequence}[$FileIndex];
160
161 # Count residues...
162 my($RegionID, $StartResNum, $EndResNum, $ResNum, $ResIndex, $ID, $Sequence, $Residue);
163 for $RegionID (@{$SequenceFilesInfo{RegionsData}[$FileIndex]{RegionIDs}}) {
164 $StartResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{StartResNum};
165 $EndResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{EndResNum};
166 RESNUM: for $ResNum ($StartResNum .. $EndResNum) {
167 $ResIndex = $ResNum - 1;
168 if ($OptionsInfo{IgnoreGaps} && $SequenceFilesInfo{RefereceSequenceResNums}[$FileIndex]{IsGap}{$ResNum}) {
169 next RESNUM;
170 }
171 # Go over residues in column $ResNum in all the sequences...
172 ID: for $ID (@{$SequenceDataRef->{IDs}}) {
173 $Sequence = $SequenceDataRef->{Sequence}{$ID};
174 $Residue = substr($Sequence, $ResIndex, 1);
175 if (IsGapResidue($Residue)) {
176 $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{Gap} += 1;
177 }
178 else {
179 if (exists $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{$Residue}) {
180 $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{$Residue} += 1;
181 }
182 else {
183 # Internal error...
184 print "Warning: Ignoring residue $Residue in sequence $ID during ResidueFrequencyAnalysis calculation: Unknown residue...\n";
185 }
186 }
187 }
188 }
189 }
190 }
191
192 # Calculate percent frequency for various residues in the sequence regions...
193 sub CalculatePercentResidueFrequencyInRegions {
194 my($FileIndex, $SequenceDataRef) = @_;
195 my($RegionID, $StartResNum, $EndResNum, $ResNum, $Residue, $Count, $PercentCount, $MaxResiduesCount, $Precision);
196
197 $MaxResiduesCount = $SequenceDataRef->{Count};
198 $Precision = $OptionsInfo{Precision};
199 for $RegionID (@{$SequenceFilesInfo{RegionsData}[$FileIndex]{RegionIDs}}) {
200 $StartResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{StartResNum};
201 $EndResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{EndResNum};
202 RESNUM: for $ResNum ($StartResNum .. $EndResNum) {
203 if ($OptionsInfo{IgnoreGaps} && $SequenceFilesInfo{RefereceSequenceResNums}[$FileIndex]{IsGap}{$ResNum}) {
204 next RESNUM;
205 }
206 for $Residue (keys %{$SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}}) {
207 $Count = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{$Residue};
208 $PercentCount = ($Count / $MaxResiduesCount) * 100;
209 $PercentCount = sprintf("%.${Precision}f", $PercentCount) + 0;
210 $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{PercentCount}{$ResNum}{$Residue} = $PercentCount;
211 }
212 }
213 }
214 }
215
216 # Generate output files...
217 sub GeneratePercentResidueFrequencyOutFilesForRegions {
218 my($FileIndex, $SequenceDataRef) = @_;
219
220 # Setup rerfernce sequence data...
221 my($RefereceSequenceID, $RefereceSequence);
222 $RefereceSequenceID = $SequenceFilesInfo{RefereceSequenceID}[$FileIndex];
223 $RefereceSequence = $SequenceFilesInfo{RefereceSequence}[$FileIndex];
224
225 my($RegionID, $StartResNum, $EndResNum, $ResNum, $Count, $PercentCount, $Residue, $RegionNum, $RegionOutFile, $PercentRegionOutFile, $OutFileRoot, $OutFileExt, $Line, @LineWords, @PercentLineWords);
226
227 $OutFileRoot = $SequenceFilesInfo{OutFileRoot}[$FileIndex];
228 $OutFileExt = $SequenceFilesInfo{OutFileExt}[$FileIndex];
229 $RegionNum = 0;
230 for $RegionID (@{$SequenceFilesInfo{RegionsData}[$FileIndex]{RegionIDs}}) {
231 $RegionNum++;
232 $StartResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{StartResNum};
233 $EndResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{EndResNum};
234
235 $RegionOutFile = "${OutFileRoot}ResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt}";
236 $PercentRegionOutFile = "${OutFileRoot}PercentResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt}";
237
238 print "Generating $RegionOutFile and $PercentRegionOutFile...\n";
239 open REGIONOUTFILE, ">$RegionOutFile" or die "Error: Can't open $RegionOutFile: $! \n";
240 open PERCENTREGIONOUTFILE, ">$PercentRegionOutFile" or die "Error: Can't open $PercentRegionOutFile: $! \n";
241
242 # Write out reference residue positions as column values....
243 @LineWords = ();
244 push @LineWords, '';
245 RESNUM: for $ResNum ($StartResNum .. $EndResNum) {
246 if ($OptionsInfo{IgnoreGaps} && $SequenceFilesInfo{RefereceSequenceResNums}[$FileIndex]{IsGap}{$ResNum}) {
247 next RESNUM;
248 }
249 push @LineWords, $ResNum;
250 }
251 $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
252 print REGIONOUTFILE "$Line\n";
253 print PERCENTREGIONOUTFILE "$Line\n";
254
255
256 # Write out row data for each residue; Gap residue is written last...
257 RESIDUE: for $Residue (sort @{$SequenceFilesInfo{ResidueCodes}[$FileIndex]}) {
258 if ($Residue =~ /^Gap$/i) {
259 next RESIDUE;
260 }
261 @LineWords = ();
262 push @LineWords, $Residue;
263 @PercentLineWords = ();
264 push @PercentLineWords, $Residue;
265
266 RESNUM: for $ResNum ($StartResNum .. $EndResNum) {
267 if ($OptionsInfo{IgnoreGaps} && $SequenceFilesInfo{RefereceSequenceResNums}[$FileIndex]{IsGap}{$ResNum}) {
268 next RESNUM;
269 }
270 $Count = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{$Residue};
271 push @LineWords, $Count;
272 $PercentCount = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{PercentCount}{$ResNum}{$Residue};
273 push @PercentLineWords, $PercentCount;
274 }
275 $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
276 print REGIONOUTFILE "$Line\n";
277
278 $Line = JoinWords(\@PercentLineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
279 print PERCENTREGIONOUTFILE "$Line\n";
280 }
281
282 # Write out data for gap...
283 $Residue = 'Gap';
284 @LineWords = ();
285 push @LineWords, $Residue;
286 @PercentLineWords = ();
287 push @PercentLineWords, $Residue;
288
289 RESNUM: for $ResNum ($StartResNum .. $EndResNum) {
290 if ($OptionsInfo{IgnoreGaps} && $SequenceFilesInfo{RefereceSequenceResNums}[$FileIndex]{IsGap}{$ResNum}) {
291 next RESNUM;
292 }
293 $Count = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{$Residue};
294 push @LineWords, $Count;
295
296 $PercentCount = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{PercentCount}{$ResNum}{$Residue};
297 push @PercentLineWords, $PercentCount;
298 }
299 $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
300 print REGIONOUTFILE "$Line\n";
301
302 $Line = JoinWords(\@PercentLineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
303 print PERCENTREGIONOUTFILE "$Line\n";
304
305 close REGIONOUTFILE;
306 close PERCENTREGIONOUTFILE;
307 }
308 }
309
310 # Process option values...
311 sub ProcessOptions {
312 %OptionsInfo = ();
313
314 # Setup analysis mode...
315 $OptionsInfo{CalculatePercentIdentityMatrix} = ($Options{mode} =~ /^(PercentIdentityMatrix|All)$/i) ? 1 : 0;
316 $OptionsInfo{PerformResidueFrequencyAnalysis} = ($Options{mode} =~ /^(ResidueFrequencyAnalysis|All)$/i) ? 1 : 0;
317
318 # Setup delimiter and quotes...
319 $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /tab/i ) ? "\t" : (($Options{outdelim} =~ /semicolon/i) ? "\;" : "\,");
320 $OptionsInfo{OutQuote} = ($Options{quote} =~ /yes/i ) ? 1 : 0;
321
322 # Setup reference sequence and regions for residue frequence analysis...
323 $OptionsInfo{SpecifiedRefereceSequence} = $Options{referencesequence};
324 $OptionsInfo{SpecifiedRegion} = $Options{region};
325 @{$OptionsInfo{SpecifiedRegions}} = ();
326
327 my(@SpecifiedRegions);
328 @SpecifiedRegions = ();
329 if ($Options{region} =~ /\,/) {
330 @SpecifiedRegions = split /\,/, $OptionsInfo{SpecifiedRegion};
331 if (@SpecifiedRegions % 2) {
332 die "Error: The value specified, $Options{region}, for option \"--region\" is not valid. Allowed values: \"StartResNum,EndResNum,[StartResNum,EndResNum...]\" or UseCompleteSequence\n";
333 }
334 # Make sure EndResNum > StartResNum...
335 my($StartResNum, $EndResNum, $Index, $RegionNum);
336 $RegionNum = 0;
337 for ($Index = 0; $Index <= $#SpecifiedRegions; $Index += 2) {
338 $StartResNum = $SpecifiedRegions[$Index];
339 $EndResNum = $SpecifiedRegions[$Index + 1];
340 $RegionNum++;
341 if (!IsPositiveInteger($StartResNum)) {
342 die "Error: The value specified, $Options{region}, for option \"--region\" is not valid: The start residue number, $StartResNum, must be a positive integer for region $RegionNum.\n";
343 }
344 if (!IsPositiveInteger($EndResNum)) {
345 die "Error: The value specified, $Options{region}, for option \"--region\" is not valid: The start residue number, $EndResNum, must be a positive integer for region $RegionNum.\n";
346 }
347 if ($StartResNum >= $EndResNum) {
348 die "Error: The value specified, $Options{region}, for option \"--region\" is not valid: The start residue number, $StartResNum, must be smaller than end residue number, $EndResNum, for region $RegionNum.\n";
349 }
350 }
351 }
352 else {
353 if ($Options{region} !~ /^UseCompleteSequence$/i) {
354 die "Error: The value specified, $Options{region}, for option \"--region\" is not valid. Allowed values: \"StartResNum,EndResNum,[StartResNum,EndResNum...]\" or UseCompleteSequence\n";
355 }
356 }
357 push @{$OptionsInfo{SpecifiedRegions}}, @SpecifiedRegions;
358
359 # Miscellaneous options...
360 $OptionsInfo{Precision} = $Options{precision};
361 $OptionsInfo{IgnoreGaps} = ($Options{ignoregaps} =~ /Yes/i) ? 1 : 0;
362 $OptionsInfo{RegionResiduesMode} = $Options{regionresiduesmode};
363
364 $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
365 $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
366 }
367
368 # Retrieve information about sequence files...
369 sub RetrieveSequenceFilesInfo {
370 my($Index, $SequenceFile, $FileSupported, $FileFormat, $SequenceCount, $RefereceSequence, $RefereceSequenceID, $RefereceSequenceLen, $RefereceSequenceWithNoGaps, $RefereceSequenceWithNoGapsLen, $RefereceSequenceRegionCount, $FileDir, $FileName, $FileExt, $OutFileRoot, $OutFileExt, $SequenceDataRef, $SpecifiedRefereceSequence, @SpecifiedRegions, @RefereceSequenceRegions);
371
372 %SequenceFilesInfo = ();
373 @{$SequenceFilesInfo{FilesOkay}} = ();
374 @{$SequenceFilesInfo{OutFileRoot}} = ();
375 @{$SequenceFilesInfo{OutFileExt}} = ();
376 @{$SequenceFilesInfo{Format}} = ();
377 @{$SequenceFilesInfo{SequenceCount}} = ();
378 @{$SequenceFilesInfo{RefereceSequenceID}} = ();
379 @{$SequenceFilesInfo{RefereceSequence}} = ();
380 @{$SequenceFilesInfo{RefereceSequenceLen}} = ();
381 @{$SequenceFilesInfo{RefereceSequenceWithNoGaps}} = ();
382 @{$SequenceFilesInfo{RefereceSequenceWithNoGapsLen}} = ();
383 @{$SequenceFilesInfo{RefereceSequenceRegions}} = ();
384 @{$SequenceFilesInfo{RefereceSequenceRegionCount}} = ();
385 @{$SequenceFilesInfo{ResidueCodes}} = ();
386
387 FILELIST: for $Index (0 .. $#SequenceFilesList) {
388 $SequenceFile = $SequenceFilesList[$Index];
389 $SequenceFilesInfo{FilesOkay}[$Index] = 0;
390 $SequenceFilesInfo{OutFileRoot}[$Index] = '';
391 $SequenceFilesInfo{OutFileExt}[$Index] = '';
392 $SequenceFilesInfo{Format}[$Index] = 'NotSupported';
393 $SequenceFilesInfo{SequenceCount}[$Index] = 0;
394 $SequenceFilesInfo{RefereceSequenceID}[$Index] = '';
395 $SequenceFilesInfo{RefereceSequence}[$Index] = '';
396 $SequenceFilesInfo{RefereceSequenceLen}[$Index] = '';
397 $SequenceFilesInfo{RefereceSequenceWithNoGaps}[$Index] = '';
398 $SequenceFilesInfo{RefereceSequenceWithNoGapsLen}[$Index] = '';
399 @{$SequenceFilesInfo{RefereceSequenceRegions}[$Index]} = ();
400 $SequenceFilesInfo{RefereceSequenceRegionCount}[$Index] = 0;
401 @{$SequenceFilesInfo{ResidueCodes}[$Index]} = ();
402
403 if (! open SEQUENCEFILE, "$SequenceFile") {
404 warn "Warning: Ignoring file $SequenceFile: Couldn't open it: $! \n";
405 next FILELIST;
406 }
407 close SEQUENCEFILE;
408
409 ($FileSupported, $FileFormat) = IsSupportedSequenceFile($SequenceFile);
410 if (!$FileSupported) {
411 warn "Warning: Ignoring file $SequenceFile: Sequence file format is not supported.\n";
412 next FILELIST;
413 }
414
415 $SequenceDataRef = ReadSequenceFile($SequenceFile);
416
417 $SequenceCount = $SequenceDataRef->{Count};
418 if (!$SequenceCount) {
419 warn "Warning: Ignoring file $SequenceFile: Sequence data is missing.\n";
420 next FILELIST;
421 }
422
423 # Make sure all sequence lengths are identical...
424 if (!AreSequenceLengthsIdentical($SequenceDataRef)) {
425 warn "Warning: Ignoring file $SequenceFile: Sequence legths are not identical.\n";
426 next FILELIST;
427 }
428 $SpecifiedRefereceSequence = $OptionsInfo{SpecifiedRefereceSequence};
429 # Make sure reference sequence ID is valid...
430 if ($SpecifiedRefereceSequence =~ /^UseFirstSequenceID$/i) {
431 $RefereceSequenceID = $SequenceDataRef->{IDs}[0];
432 }
433 else {
434 if (!exists($SequenceDataRef->{Sequence}{$SpecifiedRefereceSequence})) {
435 warn "Warning: Ignoring file $SequenceFile: Rreference sequence ID, $SpecifiedRefereceSequence, specified using option \"--referencesequence\" is missing.\n";
436 next FILELIST;
437 }
438 $RefereceSequenceID = $SpecifiedRefereceSequence;
439 }
440
441 # Make sure sequence regions corresponding to reference sequence are valid...
442 @RefereceSequenceRegions = ();
443 $RefereceSequenceRegionCount = 0;
444 $RefereceSequence = $SequenceDataRef->{Sequence}{$RefereceSequenceID};
445 $RefereceSequenceLen = length $RefereceSequence;
446
447 $RefereceSequenceWithNoGaps = RemoveSequenceGaps($RefereceSequence);
448 $RefereceSequenceWithNoGapsLen = length $RefereceSequenceWithNoGaps;
449
450 @SpecifiedRegions = ();
451 push @SpecifiedRegions, @{$OptionsInfo{SpecifiedRegions}};
452 if (@SpecifiedRegions) {
453 # Make sure specified regions are valid...
454 my($StartResNum, $EndResNum, $RegionIndex, $RegionNum);
455 $RegionNum = 0;
456 for ($RegionIndex = 0; $RegionIndex <= $#SpecifiedRegions; $RegionIndex += 2) {
457 $StartResNum = $SpecifiedRegions[$RegionIndex];
458 $EndResNum = $SpecifiedRegions[$RegionIndex + 1];
459 $RegionNum++;
460 if ($OptionsInfo{IgnoreGaps}) {
461 if ($StartResNum > $RefereceSequenceWithNoGapsLen) {
462 warn "Warning: Ignoring file $SequenceFile: The value specified, $Options{region}, for option \"--region\" is not valid: The start residue number, $StartResNum, must be smaller the sequence length, $RefereceSequenceWithNoGapsLen, of reference sequence ID, $RefereceSequenceID, in region $RegionNum. The reference sequence residue numbers correspond to the sequence with no gaps. Specify \"No\" value for \"-i, --ignoregaps\" option to use residue numbers corresponding to reference sequence including gaps.\n";
463 next FILELIST;
464 }
465 if ($EndResNum > $RefereceSequenceWithNoGapsLen) {
466 warn "Warning: Ignoring file $SequenceFile: The value specified, $Options{region}, for option \"--region\" is not valid: The end residue number, $EndResNum, must be smaller the sequence length, $RefereceSequenceWithNoGapsLen, of reference sequence ID, $RefereceSequenceID, in region $RegionNum. The reference sequence residue numbers correspond to the sequence with no gaps. Specify \"No\" value for \"-i, --ignoregaps\" option to use residue numbers corresponding to reference sequence including gaps.\n";
467 next FILELIST;
468 }
469 }
470 else {
471 if ($StartResNum > $RefereceSequenceLen) {
472 warn "Warning: Ignoring file $SequenceFile: The value specified, $Options{region}, for option \"--region\" is not valid: The start residue number, $StartResNum, must be smaller the sequence length, $RefereceSequenceLen, of reference sequence ID, $RefereceSequenceID, in region $RegionNum.\n";
473 next FILELIST;
474 }
475 if ($EndResNum > $RefereceSequenceLen) {
476 warn "Warning: Ignoring file $SequenceFile: The value specified, $Options{region}, for option \"--region\" is not valid: The end residue number, $EndResNum, must be smaller the sequence length, $RefereceSequenceLen, of reference sequence ID, $RefereceSequenceID, in region $RegionNum.\n";
477 next FILELIST;
478 }
479 }
480 push @RefereceSequenceRegions, ($StartResNum, $EndResNum);
481 }
482 $RefereceSequenceRegionCount = $RegionNum;
483 }
484 else {
485 # Use complete sequence corresponding to reference sequence ID...
486 if ($OptionsInfo{IgnoreGaps}) {
487 push @RefereceSequenceRegions, (1, $RefereceSequenceWithNoGapsLen);
488 }
489 else {
490 push @RefereceSequenceRegions, (1, $RefereceSequenceLen);
491 }
492 $RefereceSequenceRegionCount = 1;
493 }
494 # Setup output file names...
495 $FileDir = ""; $FileName = ""; $FileExt = "";
496 ($FileDir, $FileName, $FileExt) = ParseFileName($SequenceFile);
497 $FileExt = "csv";
498 if ($Options{outdelim} =~ /^tab$/i) {
499 $FileExt = "tsv";
500 }
501 $OutFileExt = $FileExt;
502 if ($OptionsInfo{OutFileRoot} && (@SequenceFilesList == 1)) {
503 my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
504 if ($RootFileName && $RootFileExt) {
505 $FileName = $RootFileName;
506 }
507 else {
508 $FileName = $OptionsInfo{OutFileRoot};
509 }
510 $OutFileRoot = $FileName;
511 }
512 else {
513 $OutFileRoot = $FileName;
514 }
515 if (!$OptionsInfo{OverwriteFiles}) {
516 if ($OptionsInfo{CalculatePercentIdentityMatrix}) {
517 if (-e "${OutFileRoot}PercentIdentityMatrix.${OutFileExt}") {
518 warn "Warning: Ignoring file $SequenceFile: The file ${OutFileRoot}PercentIdentityMatrix.${OutFileExt} already exists\n";
519 next FILELIST;
520 }
521 }
522 if ($OptionsInfo{PerformResidueFrequencyAnalysis}) {
523 my($RegionNum);
524 for $RegionNum (1 .. $RefereceSequenceRegionCount) {
525 if (-e "${OutFileRoot}ResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt}") {
526 warn "Warning: Ignoring file $SequenceFile: The file ${OutFileRoot}ResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt} already exists\n";
527 next FILELIST;
528 }
529 if (-e "${OutFileRoot}PercentResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt}") {
530 warn "Warning: Ignoring file $SequenceFile: The file ${OutFileRoot}PercentResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt} already exists\n";
531 next FILELIST;
532 }
533 }
534 }
535 }
536
537 $SequenceFilesInfo{FilesOkay}[$Index] = 1;
538 $SequenceFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
539 $SequenceFilesInfo{OutFileExt}[$Index] = $OutFileExt;
540
541 $SequenceFilesInfo{Format}[$Index] = $FileFormat;
542 $SequenceFilesInfo{SequenceCount}[$Index] = $SequenceCount;
543 $SequenceFilesInfo{RefereceSequenceID}[$Index] = $RefereceSequenceID;
544 $SequenceFilesInfo{RefereceSequence}[$Index] = $RefereceSequence;
545 $SequenceFilesInfo{RefereceSequenceLen}[$Index] = $RefereceSequenceLen;
546 $SequenceFilesInfo{RefereceSequenceWithNoGaps}[$Index] = $RefereceSequenceWithNoGaps;
547 $SequenceFilesInfo{RefereceSequenceWithNoGapsLen}[$Index] = $RefereceSequenceWithNoGapsLen;
548 push @{$SequenceFilesInfo{RefereceSequenceRegions}[$Index]}, @RefereceSequenceRegions;
549 $SequenceFilesInfo{RefereceSequenceRegionCount}[$Index] = $RefereceSequenceRegionCount;
550
551 # Setup residue codes...
552 SetupSequenceFileResidueCodes($SequenceDataRef, $Index);
553 }
554 }
555
556 sub SetupSequenceFileResidueCodes {
557 my($SequenceDataRef, $FileIndex) = @_;
558 my($Residue, @ResidueCodesList, %ResidueCodesMap);
559
560 # Initialize
561 @{$SequenceFilesInfo{ResidueCodes}[$FileIndex]} = ();
562
563 %ResidueCodesMap = ();
564 @ResidueCodesList = ();
565
566 # Setup standard amino acids and nucleic acids one letter codes...
567 if ($OptionsInfo{RegionResiduesMode} =~ /^AminoAcids$/i) {
568 @ResidueCodesList = AminoAcids::GetAminoAcids('OneLetterCode');
569 }
570 elsif ($OptionsInfo{RegionResiduesMode} =~ /^NucleicAcids$/i) {
571 push @ResidueCodesList, ('A', 'G', 'T', 'U', 'C');
572 }
573 push @ResidueCodesList, 'Gap';
574 for $Residue (@ResidueCodesList) {
575 $ResidueCodesMap{$Residue} = $Residue;
576 }
577
578 # Go over all the residues in all the sequences and add missing ones to the list...
579 my($ID, $Sequence, $ResIndex);
580 for $ID (@{$SequenceDataRef->{IDs}}) {
581 $Sequence = $SequenceDataRef->{Sequence}{$ID};
582 RES: for $ResIndex (0 .. (length($Sequence) - 1)) {
583 $Residue = substr($Sequence, $ResIndex, 1);
584 if (IsGapResidue($Residue)) {
585 next RES;
586 }
587 if (exists $ResidueCodesMap{$Residue}) {
588 next RES;
589 }
590 push @ResidueCodesList, $Residue;
591 $ResidueCodesMap{$Residue} = $Residue;
592 }
593 }
594 push @{$SequenceFilesInfo{ResidueCodes}[$FileIndex]}, @ResidueCodesList;
595 }
596
597 # Setup regions data for performing residue frequency analysis...
598 sub SetupSequenceRegionsData {
599 my($Index, $RefereceSequence, $RefereceSequenceLen, $RegionID, $StartResNum, $EndResNum, $RegionIndex, $RegionNum, $NoGapResNum, $ResNum, $ResIndex, $Residue, $ResidueCode, @RefereceSequenceRegions);
600
601
602 @{$SequenceFilesInfo{RefereceSequenceResNums}} = ();
603 @{$SequenceFilesInfo{RegionsData}} = ();
604
605 FILELIST: for $Index (0 .. $#SequenceFilesList) {
606 %{$SequenceFilesInfo{RefereceSequenceResNums}[$Index]{IsGap}} = ();
607 %{$SequenceFilesInfo{RefereceSequenceResNums}[$Index]{NoGapToGap}} = ();
608 %{$SequenceFilesInfo{RefereceSequenceResNums}[$Index]{GapToNoGap}} = ();
609 %{$SequenceFilesInfo{RegionsData}[$Index]} = ();
610 @{$SequenceFilesInfo{RegionsData}[$Index]{RegionIDs}} = ();
611
612 if (!$SequenceFilesInfo{FilesOkay}[$Index]) {
613 next FILELIST;
614 }
615 if (!$OptionsInfo{PerformResidueFrequencyAnalysis}) {
616 next FILELIST;
617 }
618
619 $RefereceSequence = $SequenceFilesInfo{RefereceSequence}[$Index];
620 $RefereceSequenceLen = $SequenceFilesInfo{RefereceSequenceLen}[$Index];
621
622 # Setup residue number mapping and gap status for refernece sequence...
623 $NoGapResNum = 0;
624 $ResNum = 0;
625 for $ResIndex (0 .. ($RefereceSequenceLen - 1)) {
626 $ResNum++;
627 $Residue = substr($RefereceSequence, $ResIndex, 1);
628 $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{IsGap}{$ResNum} = 1;
629 if (!IsGapResidue($Residue)) {
630 $NoGapResNum++;
631 $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{IsGap}{$ResNum} = 0;
632 $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{NoGapToGap}{$NoGapResNum} = $ResNum;
633 $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{GapToNoGap}{$ResNum} = $NoGapResNum;
634 }
635 }
636 # Map residue numbers for specified regions to the reference residue in input sequence/alignment files
637 $RegionNum = 0;
638 @RefereceSequenceRegions = ();
639 push @RefereceSequenceRegions, @{$SequenceFilesInfo{RefereceSequenceRegions}[$Index]};
640 for ($RegionIndex = 0; $RegionIndex <= $#RefereceSequenceRegions; $RegionIndex += 2) {
641 $StartResNum = $RefereceSequenceRegions[$RegionIndex];
642 $EndResNum = $RefereceSequenceRegions[$RegionIndex + 1];
643 $RegionNum++;
644 $RegionID = "Region${RegionNum}";
645 if ($OptionsInfo{IgnoreGaps}) {
646 # Map residue numbers to the reference sequence residue numbers to account for any ignored gaps...
647 $StartResNum = $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{NoGapToGap}{$StartResNum};
648 $EndResNum = $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{NoGapToGap}{$EndResNum};
649 }
650 push @{$SequenceFilesInfo{RegionsData}[$Index]{RegionIDs}}, $RegionID;
651 $SequenceFilesInfo{RegionsData}[$Index]{$RegionID}{StartResNum} = $StartResNum;
652 $SequenceFilesInfo{RegionsData}[$Index]{$RegionID}{EndResNum} = $EndResNum;
653
654 # Initialize data for residue codes...
655 for $ResNum ($StartResNum .. $EndResNum) {
656 for $ResidueCode (@{$SequenceFilesInfo{ResidueCodes}[$Index]}) {
657 $SequenceFilesInfo{RegionsData}[$Index]{$RegionID}{Count}{$ResNum}{$ResidueCode} = 0;
658 }
659 }
660 }
661 }
662 }
663
664 # Setup script usage and retrieve command line arguments specified using various options...
665 sub SetupScriptUsage {
666
667 # Retrieve all the options...
668 %Options = ();
669 $Options{ignoregaps} = 'yes';
670 $Options{regionresiduesmode} = 'None';
671 $Options{mode} = 'PercentIdentityMatrix';
672 $Options{outdelim} = 'comma';
673 $Options{precision} = 2;
674 $Options{quote} = 'yes';
675 $Options{referencesequence} = 'UseFirstSequenceID';
676 $Options{region} = 'UseCompleteSequence';
677
678 if (!GetOptions(\%Options, "help|h", "ignoregaps|i=s", "mode|m=s", "outdelim=s", "overwrite|o", "precision|p=i", "quote|q=s", "referencesequence=s", "region=s", "regionresiduesmode=s", "root|r=s", "workingdir|w=s")) {
679 die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
680 }
681 if ($Options{workingdir}) {
682 if (! -d $Options{workingdir}) {
683 die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
684 }
685 chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
686 }
687 if ($Options{ignoregaps} !~ /^(yes|no)$/i) {
688 die "Error: The value specified, $Options{ignoregaps}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
689 }
690 if ($Options{regionresiduesmode} !~ /^(AminoAcids|NucleicAcids|None)$/i) {
691 die "Error: The value specified, $Options{regionresiduesmode}, for option \"--regionresiduesmode\" is not valid. Allowed values: AminoAcids, NucleicAcids or None\n";
692 }
693 if ($Options{mode} !~ /^(PercentIdentityMatrix|ResidueFrequencyAnalysis|All)$/i) {
694 die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: PercentIdentityMatrix, ResidueFrequencyAnalysis or All\n";
695 }
696 if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
697 die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
698 }
699 if ($Options{quote} !~ /^(yes|no)$/i) {
700 die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
701 }
702 if (!IsPositiveInteger($Options{precision})) {
703 die "Error: The value specified, $Options{precision}, for option \"-p --precision\" is not valid. Allowed values: > 0 \n";
704 }
705 }
706
707 __END__
708
709 =head1 NAME
710
711 AnalyzeSequenceFilesData.pl - Analyze sequence and alignment files
712
713 =head1 SYNOPSIS
714
715 AnalyzeSequenceFilesData.pl SequenceFile(s) AlignmentFile(s)...
716
717 AnalyzeSequenceFilesData.pl [B<-h, --help>] [B<-i, --IgnoreGaps> yes | no]
718 [B<-m, --mode> PercentIdentityMatrix | ResidueFrequencyAnalysis | All]
719 [B<--outdelim> comma | tab | semicolon] [B<-o, --overwrite>] [B<-p, --precision> number] [B<-q, --quote> yes | no]
720 [B<--ReferenceSequence> SequenceID | UseFirstSequenceID]
721 [B<--region> "StartResNum, EndResNum, [StartResNum, EndResNum...]" | UseCompleteSequence]
722 [B<--RegionResiduesMode> AminoAcids | NucleicAcids | None]
723 [B<-w, --WorkingDir> dirname] SequenceFile(s) AlignmentFile(s)...
724
725 =head1 DESCRIPTION
726
727 Analyze I<SequenceFile(s) and AlignmentFile(s)> data: calculate pairwise percent identity matrix or
728 calculate percent occurrence of various residues in specified sequence regions. All the sequences
729 in the input file must have the same sequence lengths; otherwise, the sequence file is ignored.
730
731 The file names are separated by spaces. All the sequence files in a current directory can
732 be specified by I<*.aln>, I<*.msf>, I<*.fasta>, I<*.fta>, I<*.pir> or any other supported
733 formats; additionally, I<DirName> corresponds to all the sequence files in the current directory
734 with any of the supported file extension: I<.aln, .msf, .fasta, .fta, and .pir>.
735
736 Supported sequence formats are: I<ALN/CLustalW>, I<GCG/MSF>, I<PILEUP/MSF>, I<Pearson/FASTA>,
737 and I<NBRF/PIR>. Instead of using file extensions, file formats are detected by parsing the contents
738 of I<SequenceFile(s) and AlignmentFile(s)>.
739
740 =head1 OPTIONS
741
742 =over 4
743
744 =item B<-h, --help>
745
746 Print this help message.
747
748 =item B<-i, --IgnoreGaps> I<yes | no>
749
750 Ignore gaps during calculation of sequence lengths and specification of regions during residue
751 frequency analysis. Possible values: I<yes or no>. Default value: I<yes>.
752
753 =item B<-m, --mode> I<PercentIdentityMatrix | ResidueFrequencyAnalysis | All>
754
755 Specify how to analyze data in sequence files: calculate percent identity matrix or calculate
756 frequency of occurrence of residues in specific regions. During I<ResidueFrequencyAnalysis> value
757 of B<-m, --mode> option, output files are generated for both the residue count and percent residue
758 count. Possible values: I<PercentIdentityMatrix, ResidueFrequencyAnalysis, or All>. Default value:
759 I<PercentIdentityMatrix>.
760
761 =item B<--outdelim> I<comma | tab | semicolon>
762
763 Output text file delimiter. Possible values: I<comma, tab, or semicolon>.
764 Default value: I<comma>.
765
766 =item B<-o, --overwrite>
767
768 Overwrite existing files.
769
770 =item B<-p, --precision> I<number>
771
772 Precision of calculated values in the output file. Default: up to I<2> decimal places.
773 Valid values: positive integers.
774
775 =item B<-q, --quote> I<yes | no>
776
777 Put quotes around column values in output text file. Possible values: I<yes or
778 no>. Default value: I<yes>.
779
780 =item B<--ReferenceSequence> I<SequenceID | UseFirstSequenceID>
781
782 Specify reference sequence ID to identify regions for performing I<ResidueFrequencyAnalysis> specified
783 using B<-m, --mode> option. Default: I<UseFirstSequenceID>.
784
785 =item B<--region> I<StartResNum,EndResNum,[StartResNum,EndResNum...] | UseCompleteSequence>
786
787 Specify how to perform frequency of occurrence analysis for residues: use specific regions
788 indicated by starting and ending residue numbers in reference sequence or use the whole reference
789 sequence as one region. Default: I<UseCompleteSequence>.
790
791 Based on the value of B<-i, --IgnoreGaps> option, specified residue numbers I<StartResNum,EndResNum>
792 correspond to the positions in the reference sequence without gaps or with gaps.
793
794 For residue numbers corresponding to the reference sequence including gaps, percent occurrence
795 of various residues corresponding to gap position in reference sequence is also calculated.
796
797 =item B<--RegionResiduesMode> I<AminoAcids | NucleicAcids | None>
798
799 Specify how to process residues in the regions specified using B<--region> option during
800 I<ResidueFrequencyAnalysis> calculation: categorize residues as amino acids, nucleic acids, or simply
801 ignore residue category during the calculation. Possible values: I<AminoAcids, NucleicAcids or None>.
802 Default value: I<None>.
803
804 For I<AminoAcids> or I<NucleicAcids> values of B<--RegionResiduesMode> option, all the standard amino
805 acids or nucleic acids are listed in the output file for each region; Any gaps and other non standard residues
806 are added to the list as encountered.
807
808 For I<None> value of B<--RegionResiduesMode> option, no assumption is made about type of residues.
809 Residue and gaps are added to the list as encountered.
810
811 =item B<-r, --root> I<rootname>
812
813 New sequence file name is generated using the root: <Root><Mode>.<Ext> and
814 <Root><Mode><RegionNum>.<Ext>. Default new file
815 name: <SequenceFileName><Mode>.<Ext> for I<PercentIdentityMatrix> value B<m, --mode> option
816 and <SequenceFileName><Mode><RegionNum>.<Ext> for I<ResidueFrequencyAnalysis>.
817 The csv, and tsv <Ext> values are used for comma/semicolon, and tab delimited text
818 files respectively. This option is ignored for multiple input files.
819
820 =item B<-w --WorkingDir> I<text>
821
822 Location of working directory. Default: current directory.
823
824 =back
825
826 =head1 EXAMPLES
827
828 To calculate percent identity matrix for all sequences in Sample1.msf file and generate
829 Sample1PercentIdentityMatrix.csv, type:
830
831 % AnalyzeSequenceFilesData.pl Sample1.msf
832
833 To perform residue frequency analysis for all sequences in Sample1.aln file corresponding to
834 non-gap positions in the first sequence and generate Sample1ResidueFrequencyAnalysisRegion1.csv
835 and Sample1PercentResidueFrequencyAnalysisRegion1.csv files, type:
836
837 % AnalyzeSequenceFilesData.pl -m ResidueFrequencyAnalysis -o
838 Sample1.aln
839
840 To perform residue frequency analysis for all sequences in Sample1.aln file corresponding to
841 all positions in the first sequence and generate TestResidueFrequencyAnalysisRegion1.csv
842 and TestPercentResidueFrequencyAnalysisRegion1.csv files, type:
843
844 % AnalyzeSequenceFilesData.pl -m ResidueFrequencyAnalysis --IgnoreGaps
845 No -o -r Test Sample1.aln
846
847 To perform residue frequency analysis for all sequences in Sample1.aln file corresponding to
848 non-gap residue positions 5 to 10, and 30 to 40 in sequence ACHE_BOVIN and generate
849 Sample1ResidueFrequencyAnalysisRegion1.csv, Sample1ResidueFrequencyAnalysisRegion2.csv,
850 SamplePercentResidueFrequencyAnalysisRegion1.csv, and
851 SamplePercentResidueFrequencyAnalysisRegion2.csv files, type:
852
853 % AnalyzeSequenceFilesData.pl -m ResidueFrequencyAnalysis
854 --ReferenceSequence ACHE_BOVIN --region "5,15,30,40" -o Sample1.msf
855
856
857 =head1 AUTHOR
858
859 Manish Sud <msud@san.rr.com>
860
861 =head1 SEE ALSO
862
863 ExtractFromSequenceFiles.pl, InfoSequenceFiles.pl
864
865 =head1 COPYRIGHT
866
867 Copyright (C) 2015 Manish Sud. All rights reserved.
868
869 This file is part of MayaChemTools.
870
871 MayaChemTools is free software; you can redistribute it and/or modify it under
872 the terms of the GNU Lesser General Public License as published by the Free
873 Software Foundation; either version 3 of the License, or (at your option)
874 any later version.
875
876 =cut