Mercurial > repos > deepakjadmin > mayatool3_test2
comparison bin/AnalyzeSequenceFilesData.pl @ 0:4816e4a8ae95 draft default tip
Uploaded
author | deepakjadmin |
---|---|
date | Wed, 20 Jan 2016 09:23:18 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4816e4a8ae95 |
---|---|
1 #!/usr/bin/perl -w | |
2 # | |
3 # $RCSfile: AnalyzeSequenceFilesData.pl,v $ | |
4 # $Date: 2015/02/28 20:46:04 $ | |
5 # $Revision: 1.33 $ | |
6 # | |
7 # Author: Manish Sud <msud@san.rr.com> | |
8 # | |
9 # Copyright (C) 2015 Manish Sud. All rights reserved. | |
10 # | |
11 # This file is part of MayaChemTools. | |
12 # | |
13 # MayaChemTools is free software; you can redistribute it and/or modify it under | |
14 # the terms of the GNU Lesser General Public License as published by the Free | |
15 # Software Foundation; either version 3 of the License, or (at your option) any | |
16 # later version. | |
17 # | |
18 # MayaChemTools is distributed in the hope that it will be useful, but without | |
19 # any warranty; without even the implied warranty of merchantability of fitness | |
20 # for a particular purpose. See the GNU Lesser General Public License for more | |
21 # details. | |
22 # | |
23 # You should have received a copy of the GNU Lesser General Public License | |
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or | |
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, | |
26 # Boston, MA, 02111-1307, USA. | |
27 # | |
28 | |
29 use strict; | |
30 use FindBin; use lib "$FindBin::Bin/../lib"; | |
31 use Getopt::Long; | |
32 use File::Basename; | |
33 use Text::ParseWords; | |
34 use Benchmark; | |
35 use FileUtil; | |
36 use TextUtil; | |
37 use SequenceFileUtil; | |
38 use AminoAcids; | |
39 use NucleicAcids; | |
40 | |
41 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime); | |
42 | |
43 # Autoflush STDOUT | |
44 $| = 1; | |
45 | |
46 # Starting message... | |
47 $ScriptName = basename($0); | |
48 print "\n$ScriptName: Starting...\n\n"; | |
49 $StartTime = new Benchmark; | |
50 | |
51 # Setup script usage message... | |
52 SetupScriptUsage(); | |
53 if ($Options{help} || @ARGV < 1) { | |
54 die GetUsageFromPod("$FindBin::Bin/$ScriptName"); | |
55 } | |
56 | |
57 # Expand wild card file names... | |
58 my(@SequenceFilesList); | |
59 @SequenceFilesList = ExpandFileNames(\@ARGV, "aln msf fasta fta pir"); | |
60 | |
61 print "Processing options...\n"; | |
62 my(%OptionsInfo); | |
63 ProcessOptions(); | |
64 | |
65 # Set up information about input files... | |
66 print "Checking input sequence file(s)...\n"; | |
67 my(%SequenceFilesInfo); | |
68 RetrieveSequenceFilesInfo(); | |
69 SetupSequenceRegionsData(); | |
70 | |
71 # Process input files.. | |
72 my($FileIndex); | |
73 if (@SequenceFilesList > 1) { | |
74 print "\nProcessing sequence files...\n"; | |
75 } | |
76 for $FileIndex (0 .. $#SequenceFilesList) { | |
77 if ($SequenceFilesInfo{FilesOkay}[$FileIndex]) { | |
78 print "\nProcessing file $SequenceFilesList[$FileIndex]...\n"; | |
79 AnalyzeSequenceFileData($FileIndex); | |
80 } | |
81 } | |
82 print "\n$ScriptName:Done...\n\n"; | |
83 | |
84 $EndTime = new Benchmark; | |
85 $TotalTime = timediff ($EndTime, $StartTime); | |
86 print "Total time: ", timestr($TotalTime), "\n"; | |
87 | |
88 ############################################################################### | |
89 | |
90 # Analyze sequence file... | |
91 sub AnalyzeSequenceFileData { | |
92 my($FileIndex) = @_; | |
93 my($SequenceFile, $SequenceDataRef); | |
94 | |
95 $SequenceFile = $SequenceFilesList[$FileIndex]; | |
96 | |
97 open SEQUENCEFILE, "$SequenceFile" or die "Error: Can't open $SequenceFile: $! \n"; | |
98 $SequenceDataRef = ReadSequenceFile($SequenceFile); | |
99 close SEQUENCEFILE; | |
100 | |
101 if ($OptionsInfo{CalculatePercentIdentityMatrix}) { | |
102 CalculatePercentIdentityMatrix($FileIndex, $SequenceDataRef); | |
103 } | |
104 if ($OptionsInfo{PerformResidueFrequencyAnalysis}) { | |
105 PerformResidueFrequencyAnalysis($FileIndex, $SequenceDataRef); | |
106 } | |
107 } | |
108 | |
109 # Calculate percent identity matrix... | |
110 sub CalculatePercentIdentityMatrix { | |
111 my($FileIndex, $SequenceDataRef) = @_; | |
112 my($PercentIdentity, $PercentIdentityMatrixFile, $PercentIdentityMatrixRef, $RowID, $ColID, $Line, @LineWords); | |
113 | |
114 $PercentIdentityMatrixFile = $SequenceFilesInfo{OutFileRoot}[$FileIndex] . 'PercentIdentityMatrix.' . $SequenceFilesInfo{OutFileExt}[$FileIndex]; | |
115 $PercentIdentityMatrixRef = CalculatePercentSequenceIdentityMatrix($SequenceDataRef, $OptionsInfo{IgnoreGaps}, $OptionsInfo{Precision}); | |
116 | |
117 print "Generating percent identity matrix file $PercentIdentityMatrixFile...\n"; | |
118 open OUTFILE, ">$PercentIdentityMatrixFile" or die "Can't open $PercentIdentityMatrixFile: $!\n"; | |
119 | |
120 # Write out column labels... | |
121 @LineWords = (); | |
122 push @LineWords, ''; | |
123 for $ColID (@{$PercentIdentityMatrixRef->{IDs}}) { | |
124 push @LineWords, $ColID; | |
125 } | |
126 $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); | |
127 print OUTFILE "$Line\n"; | |
128 | |
129 # Write out rows... | |
130 for $RowID (@{$PercentIdentityMatrixRef->{IDs}}) { | |
131 @LineWords = (); | |
132 push @LineWords, $RowID; | |
133 for $ColID (@{$PercentIdentityMatrixRef->{IDs}}) { | |
134 $PercentIdentity = $PercentIdentityMatrixRef->{PercentIdentity}{$RowID}{$ColID}; | |
135 push @LineWords, $PercentIdentity; | |
136 } | |
137 $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); | |
138 print OUTFILE "$Line\n"; | |
139 } | |
140 close OUTFILE; | |
141 } | |
142 | |
143 # Perform frequency analysis... | |
144 sub PerformResidueFrequencyAnalysis { | |
145 my($FileIndex, $SequenceDataRef) = @_; | |
146 | |
147 CountResiduesInRegions($FileIndex, $SequenceDataRef); | |
148 CalculatePercentResidueFrequencyInRegions($FileIndex, $SequenceDataRef); | |
149 GeneratePercentResidueFrequencyOutFilesForRegions($FileIndex, $SequenceDataRef); | |
150 } | |
151 | |
152 # Count residues... | |
153 sub CountResiduesInRegions { | |
154 my($FileIndex, $SequenceDataRef) = @_; | |
155 | |
156 # Setup rerfernce sequence data... | |
157 my($RefereceSequenceID, $RefereceSequence); | |
158 $RefereceSequenceID = $SequenceFilesInfo{RefereceSequenceID}[$FileIndex]; | |
159 $RefereceSequence = $SequenceFilesInfo{RefereceSequence}[$FileIndex]; | |
160 | |
161 # Count residues... | |
162 my($RegionID, $StartResNum, $EndResNum, $ResNum, $ResIndex, $ID, $Sequence, $Residue); | |
163 for $RegionID (@{$SequenceFilesInfo{RegionsData}[$FileIndex]{RegionIDs}}) { | |
164 $StartResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{StartResNum}; | |
165 $EndResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{EndResNum}; | |
166 RESNUM: for $ResNum ($StartResNum .. $EndResNum) { | |
167 $ResIndex = $ResNum - 1; | |
168 if ($OptionsInfo{IgnoreGaps} && $SequenceFilesInfo{RefereceSequenceResNums}[$FileIndex]{IsGap}{$ResNum}) { | |
169 next RESNUM; | |
170 } | |
171 # Go over residues in column $ResNum in all the sequences... | |
172 ID: for $ID (@{$SequenceDataRef->{IDs}}) { | |
173 $Sequence = $SequenceDataRef->{Sequence}{$ID}; | |
174 $Residue = substr($Sequence, $ResIndex, 1); | |
175 if (IsGapResidue($Residue)) { | |
176 $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{Gap} += 1; | |
177 } | |
178 else { | |
179 if (exists $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{$Residue}) { | |
180 $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{$Residue} += 1; | |
181 } | |
182 else { | |
183 # Internal error... | |
184 print "Warning: Ignoring residue $Residue in sequence $ID during ResidueFrequencyAnalysis calculation: Unknown residue...\n"; | |
185 } | |
186 } | |
187 } | |
188 } | |
189 } | |
190 } | |
191 | |
192 # Calculate percent frequency for various residues in the sequence regions... | |
193 sub CalculatePercentResidueFrequencyInRegions { | |
194 my($FileIndex, $SequenceDataRef) = @_; | |
195 my($RegionID, $StartResNum, $EndResNum, $ResNum, $Residue, $Count, $PercentCount, $MaxResiduesCount, $Precision); | |
196 | |
197 $MaxResiduesCount = $SequenceDataRef->{Count}; | |
198 $Precision = $OptionsInfo{Precision}; | |
199 for $RegionID (@{$SequenceFilesInfo{RegionsData}[$FileIndex]{RegionIDs}}) { | |
200 $StartResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{StartResNum}; | |
201 $EndResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{EndResNum}; | |
202 RESNUM: for $ResNum ($StartResNum .. $EndResNum) { | |
203 if ($OptionsInfo{IgnoreGaps} && $SequenceFilesInfo{RefereceSequenceResNums}[$FileIndex]{IsGap}{$ResNum}) { | |
204 next RESNUM; | |
205 } | |
206 for $Residue (keys %{$SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}}) { | |
207 $Count = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{$Residue}; | |
208 $PercentCount = ($Count / $MaxResiduesCount) * 100; | |
209 $PercentCount = sprintf("%.${Precision}f", $PercentCount) + 0; | |
210 $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{PercentCount}{$ResNum}{$Residue} = $PercentCount; | |
211 } | |
212 } | |
213 } | |
214 } | |
215 | |
216 # Generate output files... | |
217 sub GeneratePercentResidueFrequencyOutFilesForRegions { | |
218 my($FileIndex, $SequenceDataRef) = @_; | |
219 | |
220 # Setup rerfernce sequence data... | |
221 my($RefereceSequenceID, $RefereceSequence); | |
222 $RefereceSequenceID = $SequenceFilesInfo{RefereceSequenceID}[$FileIndex]; | |
223 $RefereceSequence = $SequenceFilesInfo{RefereceSequence}[$FileIndex]; | |
224 | |
225 my($RegionID, $StartResNum, $EndResNum, $ResNum, $Count, $PercentCount, $Residue, $RegionNum, $RegionOutFile, $PercentRegionOutFile, $OutFileRoot, $OutFileExt, $Line, @LineWords, @PercentLineWords); | |
226 | |
227 $OutFileRoot = $SequenceFilesInfo{OutFileRoot}[$FileIndex]; | |
228 $OutFileExt = $SequenceFilesInfo{OutFileExt}[$FileIndex]; | |
229 $RegionNum = 0; | |
230 for $RegionID (@{$SequenceFilesInfo{RegionsData}[$FileIndex]{RegionIDs}}) { | |
231 $RegionNum++; | |
232 $StartResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{StartResNum}; | |
233 $EndResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{EndResNum}; | |
234 | |
235 $RegionOutFile = "${OutFileRoot}ResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt}"; | |
236 $PercentRegionOutFile = "${OutFileRoot}PercentResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt}"; | |
237 | |
238 print "Generating $RegionOutFile and $PercentRegionOutFile...\n"; | |
239 open REGIONOUTFILE, ">$RegionOutFile" or die "Error: Can't open $RegionOutFile: $! \n"; | |
240 open PERCENTREGIONOUTFILE, ">$PercentRegionOutFile" or die "Error: Can't open $PercentRegionOutFile: $! \n"; | |
241 | |
242 # Write out reference residue positions as column values.... | |
243 @LineWords = (); | |
244 push @LineWords, ''; | |
245 RESNUM: for $ResNum ($StartResNum .. $EndResNum) { | |
246 if ($OptionsInfo{IgnoreGaps} && $SequenceFilesInfo{RefereceSequenceResNums}[$FileIndex]{IsGap}{$ResNum}) { | |
247 next RESNUM; | |
248 } | |
249 push @LineWords, $ResNum; | |
250 } | |
251 $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); | |
252 print REGIONOUTFILE "$Line\n"; | |
253 print PERCENTREGIONOUTFILE "$Line\n"; | |
254 | |
255 | |
256 # Write out row data for each residue; Gap residue is written last... | |
257 RESIDUE: for $Residue (sort @{$SequenceFilesInfo{ResidueCodes}[$FileIndex]}) { | |
258 if ($Residue =~ /^Gap$/i) { | |
259 next RESIDUE; | |
260 } | |
261 @LineWords = (); | |
262 push @LineWords, $Residue; | |
263 @PercentLineWords = (); | |
264 push @PercentLineWords, $Residue; | |
265 | |
266 RESNUM: for $ResNum ($StartResNum .. $EndResNum) { | |
267 if ($OptionsInfo{IgnoreGaps} && $SequenceFilesInfo{RefereceSequenceResNums}[$FileIndex]{IsGap}{$ResNum}) { | |
268 next RESNUM; | |
269 } | |
270 $Count = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{$Residue}; | |
271 push @LineWords, $Count; | |
272 $PercentCount = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{PercentCount}{$ResNum}{$Residue}; | |
273 push @PercentLineWords, $PercentCount; | |
274 } | |
275 $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); | |
276 print REGIONOUTFILE "$Line\n"; | |
277 | |
278 $Line = JoinWords(\@PercentLineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); | |
279 print PERCENTREGIONOUTFILE "$Line\n"; | |
280 } | |
281 | |
282 # Write out data for gap... | |
283 $Residue = 'Gap'; | |
284 @LineWords = (); | |
285 push @LineWords, $Residue; | |
286 @PercentLineWords = (); | |
287 push @PercentLineWords, $Residue; | |
288 | |
289 RESNUM: for $ResNum ($StartResNum .. $EndResNum) { | |
290 if ($OptionsInfo{IgnoreGaps} && $SequenceFilesInfo{RefereceSequenceResNums}[$FileIndex]{IsGap}{$ResNum}) { | |
291 next RESNUM; | |
292 } | |
293 $Count = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{$Residue}; | |
294 push @LineWords, $Count; | |
295 | |
296 $PercentCount = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{PercentCount}{$ResNum}{$Residue}; | |
297 push @PercentLineWords, $PercentCount; | |
298 } | |
299 $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); | |
300 print REGIONOUTFILE "$Line\n"; | |
301 | |
302 $Line = JoinWords(\@PercentLineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); | |
303 print PERCENTREGIONOUTFILE "$Line\n"; | |
304 | |
305 close REGIONOUTFILE; | |
306 close PERCENTREGIONOUTFILE; | |
307 } | |
308 } | |
309 | |
310 # Process option values... | |
311 sub ProcessOptions { | |
312 %OptionsInfo = (); | |
313 | |
314 # Setup analysis mode... | |
315 $OptionsInfo{CalculatePercentIdentityMatrix} = ($Options{mode} =~ /^(PercentIdentityMatrix|All)$/i) ? 1 : 0; | |
316 $OptionsInfo{PerformResidueFrequencyAnalysis} = ($Options{mode} =~ /^(ResidueFrequencyAnalysis|All)$/i) ? 1 : 0; | |
317 | |
318 # Setup delimiter and quotes... | |
319 $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /tab/i ) ? "\t" : (($Options{outdelim} =~ /semicolon/i) ? "\;" : "\,"); | |
320 $OptionsInfo{OutQuote} = ($Options{quote} =~ /yes/i ) ? 1 : 0; | |
321 | |
322 # Setup reference sequence and regions for residue frequence analysis... | |
323 $OptionsInfo{SpecifiedRefereceSequence} = $Options{referencesequence}; | |
324 $OptionsInfo{SpecifiedRegion} = $Options{region}; | |
325 @{$OptionsInfo{SpecifiedRegions}} = (); | |
326 | |
327 my(@SpecifiedRegions); | |
328 @SpecifiedRegions = (); | |
329 if ($Options{region} =~ /\,/) { | |
330 @SpecifiedRegions = split /\,/, $OptionsInfo{SpecifiedRegion}; | |
331 if (@SpecifiedRegions % 2) { | |
332 die "Error: The value specified, $Options{region}, for option \"--region\" is not valid. Allowed values: \"StartResNum,EndResNum,[StartResNum,EndResNum...]\" or UseCompleteSequence\n"; | |
333 } | |
334 # Make sure EndResNum > StartResNum... | |
335 my($StartResNum, $EndResNum, $Index, $RegionNum); | |
336 $RegionNum = 0; | |
337 for ($Index = 0; $Index <= $#SpecifiedRegions; $Index += 2) { | |
338 $StartResNum = $SpecifiedRegions[$Index]; | |
339 $EndResNum = $SpecifiedRegions[$Index + 1]; | |
340 $RegionNum++; | |
341 if (!IsPositiveInteger($StartResNum)) { | |
342 die "Error: The value specified, $Options{region}, for option \"--region\" is not valid: The start residue number, $StartResNum, must be a positive integer for region $RegionNum.\n"; | |
343 } | |
344 if (!IsPositiveInteger($EndResNum)) { | |
345 die "Error: The value specified, $Options{region}, for option \"--region\" is not valid: The start residue number, $EndResNum, must be a positive integer for region $RegionNum.\n"; | |
346 } | |
347 if ($StartResNum >= $EndResNum) { | |
348 die "Error: The value specified, $Options{region}, for option \"--region\" is not valid: The start residue number, $StartResNum, must be smaller than end residue number, $EndResNum, for region $RegionNum.\n"; | |
349 } | |
350 } | |
351 } | |
352 else { | |
353 if ($Options{region} !~ /^UseCompleteSequence$/i) { | |
354 die "Error: The value specified, $Options{region}, for option \"--region\" is not valid. Allowed values: \"StartResNum,EndResNum,[StartResNum,EndResNum...]\" or UseCompleteSequence\n"; | |
355 } | |
356 } | |
357 push @{$OptionsInfo{SpecifiedRegions}}, @SpecifiedRegions; | |
358 | |
359 # Miscellaneous options... | |
360 $OptionsInfo{Precision} = $Options{precision}; | |
361 $OptionsInfo{IgnoreGaps} = ($Options{ignoregaps} =~ /Yes/i) ? 1 : 0; | |
362 $OptionsInfo{RegionResiduesMode} = $Options{regionresiduesmode}; | |
363 | |
364 $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0; | |
365 $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0; | |
366 } | |
367 | |
368 # Retrieve information about sequence files... | |
369 sub RetrieveSequenceFilesInfo { | |
370 my($Index, $SequenceFile, $FileSupported, $FileFormat, $SequenceCount, $RefereceSequence, $RefereceSequenceID, $RefereceSequenceLen, $RefereceSequenceWithNoGaps, $RefereceSequenceWithNoGapsLen, $RefereceSequenceRegionCount, $FileDir, $FileName, $FileExt, $OutFileRoot, $OutFileExt, $SequenceDataRef, $SpecifiedRefereceSequence, @SpecifiedRegions, @RefereceSequenceRegions); | |
371 | |
372 %SequenceFilesInfo = (); | |
373 @{$SequenceFilesInfo{FilesOkay}} = (); | |
374 @{$SequenceFilesInfo{OutFileRoot}} = (); | |
375 @{$SequenceFilesInfo{OutFileExt}} = (); | |
376 @{$SequenceFilesInfo{Format}} = (); | |
377 @{$SequenceFilesInfo{SequenceCount}} = (); | |
378 @{$SequenceFilesInfo{RefereceSequenceID}} = (); | |
379 @{$SequenceFilesInfo{RefereceSequence}} = (); | |
380 @{$SequenceFilesInfo{RefereceSequenceLen}} = (); | |
381 @{$SequenceFilesInfo{RefereceSequenceWithNoGaps}} = (); | |
382 @{$SequenceFilesInfo{RefereceSequenceWithNoGapsLen}} = (); | |
383 @{$SequenceFilesInfo{RefereceSequenceRegions}} = (); | |
384 @{$SequenceFilesInfo{RefereceSequenceRegionCount}} = (); | |
385 @{$SequenceFilesInfo{ResidueCodes}} = (); | |
386 | |
387 FILELIST: for $Index (0 .. $#SequenceFilesList) { | |
388 $SequenceFile = $SequenceFilesList[$Index]; | |
389 $SequenceFilesInfo{FilesOkay}[$Index] = 0; | |
390 $SequenceFilesInfo{OutFileRoot}[$Index] = ''; | |
391 $SequenceFilesInfo{OutFileExt}[$Index] = ''; | |
392 $SequenceFilesInfo{Format}[$Index] = 'NotSupported'; | |
393 $SequenceFilesInfo{SequenceCount}[$Index] = 0; | |
394 $SequenceFilesInfo{RefereceSequenceID}[$Index] = ''; | |
395 $SequenceFilesInfo{RefereceSequence}[$Index] = ''; | |
396 $SequenceFilesInfo{RefereceSequenceLen}[$Index] = ''; | |
397 $SequenceFilesInfo{RefereceSequenceWithNoGaps}[$Index] = ''; | |
398 $SequenceFilesInfo{RefereceSequenceWithNoGapsLen}[$Index] = ''; | |
399 @{$SequenceFilesInfo{RefereceSequenceRegions}[$Index]} = (); | |
400 $SequenceFilesInfo{RefereceSequenceRegionCount}[$Index] = 0; | |
401 @{$SequenceFilesInfo{ResidueCodes}[$Index]} = (); | |
402 | |
403 if (! open SEQUENCEFILE, "$SequenceFile") { | |
404 warn "Warning: Ignoring file $SequenceFile: Couldn't open it: $! \n"; | |
405 next FILELIST; | |
406 } | |
407 close SEQUENCEFILE; | |
408 | |
409 ($FileSupported, $FileFormat) = IsSupportedSequenceFile($SequenceFile); | |
410 if (!$FileSupported) { | |
411 warn "Warning: Ignoring file $SequenceFile: Sequence file format is not supported.\n"; | |
412 next FILELIST; | |
413 } | |
414 | |
415 $SequenceDataRef = ReadSequenceFile($SequenceFile); | |
416 | |
417 $SequenceCount = $SequenceDataRef->{Count}; | |
418 if (!$SequenceCount) { | |
419 warn "Warning: Ignoring file $SequenceFile: Sequence data is missing.\n"; | |
420 next FILELIST; | |
421 } | |
422 | |
423 # Make sure all sequence lengths are identical... | |
424 if (!AreSequenceLengthsIdentical($SequenceDataRef)) { | |
425 warn "Warning: Ignoring file $SequenceFile: Sequence legths are not identical.\n"; | |
426 next FILELIST; | |
427 } | |
428 $SpecifiedRefereceSequence = $OptionsInfo{SpecifiedRefereceSequence}; | |
429 # Make sure reference sequence ID is valid... | |
430 if ($SpecifiedRefereceSequence =~ /^UseFirstSequenceID$/i) { | |
431 $RefereceSequenceID = $SequenceDataRef->{IDs}[0]; | |
432 } | |
433 else { | |
434 if (!exists($SequenceDataRef->{Sequence}{$SpecifiedRefereceSequence})) { | |
435 warn "Warning: Ignoring file $SequenceFile: Rreference sequence ID, $SpecifiedRefereceSequence, specified using option \"--referencesequence\" is missing.\n"; | |
436 next FILELIST; | |
437 } | |
438 $RefereceSequenceID = $SpecifiedRefereceSequence; | |
439 } | |
440 | |
441 # Make sure sequence regions corresponding to reference sequence are valid... | |
442 @RefereceSequenceRegions = (); | |
443 $RefereceSequenceRegionCount = 0; | |
444 $RefereceSequence = $SequenceDataRef->{Sequence}{$RefereceSequenceID}; | |
445 $RefereceSequenceLen = length $RefereceSequence; | |
446 | |
447 $RefereceSequenceWithNoGaps = RemoveSequenceGaps($RefereceSequence); | |
448 $RefereceSequenceWithNoGapsLen = length $RefereceSequenceWithNoGaps; | |
449 | |
450 @SpecifiedRegions = (); | |
451 push @SpecifiedRegions, @{$OptionsInfo{SpecifiedRegions}}; | |
452 if (@SpecifiedRegions) { | |
453 # Make sure specified regions are valid... | |
454 my($StartResNum, $EndResNum, $RegionIndex, $RegionNum); | |
455 $RegionNum = 0; | |
456 for ($RegionIndex = 0; $RegionIndex <= $#SpecifiedRegions; $RegionIndex += 2) { | |
457 $StartResNum = $SpecifiedRegions[$RegionIndex]; | |
458 $EndResNum = $SpecifiedRegions[$RegionIndex + 1]; | |
459 $RegionNum++; | |
460 if ($OptionsInfo{IgnoreGaps}) { | |
461 if ($StartResNum > $RefereceSequenceWithNoGapsLen) { | |
462 warn "Warning: Ignoring file $SequenceFile: The value specified, $Options{region}, for option \"--region\" is not valid: The start residue number, $StartResNum, must be smaller the sequence length, $RefereceSequenceWithNoGapsLen, of reference sequence ID, $RefereceSequenceID, in region $RegionNum. The reference sequence residue numbers correspond to the sequence with no gaps. Specify \"No\" value for \"-i, --ignoregaps\" option to use residue numbers corresponding to reference sequence including gaps.\n"; | |
463 next FILELIST; | |
464 } | |
465 if ($EndResNum > $RefereceSequenceWithNoGapsLen) { | |
466 warn "Warning: Ignoring file $SequenceFile: The value specified, $Options{region}, for option \"--region\" is not valid: The end residue number, $EndResNum, must be smaller the sequence length, $RefereceSequenceWithNoGapsLen, of reference sequence ID, $RefereceSequenceID, in region $RegionNum. The reference sequence residue numbers correspond to the sequence with no gaps. Specify \"No\" value for \"-i, --ignoregaps\" option to use residue numbers corresponding to reference sequence including gaps.\n"; | |
467 next FILELIST; | |
468 } | |
469 } | |
470 else { | |
471 if ($StartResNum > $RefereceSequenceLen) { | |
472 warn "Warning: Ignoring file $SequenceFile: The value specified, $Options{region}, for option \"--region\" is not valid: The start residue number, $StartResNum, must be smaller the sequence length, $RefereceSequenceLen, of reference sequence ID, $RefereceSequenceID, in region $RegionNum.\n"; | |
473 next FILELIST; | |
474 } | |
475 if ($EndResNum > $RefereceSequenceLen) { | |
476 warn "Warning: Ignoring file $SequenceFile: The value specified, $Options{region}, for option \"--region\" is not valid: The end residue number, $EndResNum, must be smaller the sequence length, $RefereceSequenceLen, of reference sequence ID, $RefereceSequenceID, in region $RegionNum.\n"; | |
477 next FILELIST; | |
478 } | |
479 } | |
480 push @RefereceSequenceRegions, ($StartResNum, $EndResNum); | |
481 } | |
482 $RefereceSequenceRegionCount = $RegionNum; | |
483 } | |
484 else { | |
485 # Use complete sequence corresponding to reference sequence ID... | |
486 if ($OptionsInfo{IgnoreGaps}) { | |
487 push @RefereceSequenceRegions, (1, $RefereceSequenceWithNoGapsLen); | |
488 } | |
489 else { | |
490 push @RefereceSequenceRegions, (1, $RefereceSequenceLen); | |
491 } | |
492 $RefereceSequenceRegionCount = 1; | |
493 } | |
494 # Setup output file names... | |
495 $FileDir = ""; $FileName = ""; $FileExt = ""; | |
496 ($FileDir, $FileName, $FileExt) = ParseFileName($SequenceFile); | |
497 $FileExt = "csv"; | |
498 if ($Options{outdelim} =~ /^tab$/i) { | |
499 $FileExt = "tsv"; | |
500 } | |
501 $OutFileExt = $FileExt; | |
502 if ($OptionsInfo{OutFileRoot} && (@SequenceFilesList == 1)) { | |
503 my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot}); | |
504 if ($RootFileName && $RootFileExt) { | |
505 $FileName = $RootFileName; | |
506 } | |
507 else { | |
508 $FileName = $OptionsInfo{OutFileRoot}; | |
509 } | |
510 $OutFileRoot = $FileName; | |
511 } | |
512 else { | |
513 $OutFileRoot = $FileName; | |
514 } | |
515 if (!$OptionsInfo{OverwriteFiles}) { | |
516 if ($OptionsInfo{CalculatePercentIdentityMatrix}) { | |
517 if (-e "${OutFileRoot}PercentIdentityMatrix.${OutFileExt}") { | |
518 warn "Warning: Ignoring file $SequenceFile: The file ${OutFileRoot}PercentIdentityMatrix.${OutFileExt} already exists\n"; | |
519 next FILELIST; | |
520 } | |
521 } | |
522 if ($OptionsInfo{PerformResidueFrequencyAnalysis}) { | |
523 my($RegionNum); | |
524 for $RegionNum (1 .. $RefereceSequenceRegionCount) { | |
525 if (-e "${OutFileRoot}ResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt}") { | |
526 warn "Warning: Ignoring file $SequenceFile: The file ${OutFileRoot}ResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt} already exists\n"; | |
527 next FILELIST; | |
528 } | |
529 if (-e "${OutFileRoot}PercentResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt}") { | |
530 warn "Warning: Ignoring file $SequenceFile: The file ${OutFileRoot}PercentResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt} already exists\n"; | |
531 next FILELIST; | |
532 } | |
533 } | |
534 } | |
535 } | |
536 | |
537 $SequenceFilesInfo{FilesOkay}[$Index] = 1; | |
538 $SequenceFilesInfo{OutFileRoot}[$Index] = $OutFileRoot; | |
539 $SequenceFilesInfo{OutFileExt}[$Index] = $OutFileExt; | |
540 | |
541 $SequenceFilesInfo{Format}[$Index] = $FileFormat; | |
542 $SequenceFilesInfo{SequenceCount}[$Index] = $SequenceCount; | |
543 $SequenceFilesInfo{RefereceSequenceID}[$Index] = $RefereceSequenceID; | |
544 $SequenceFilesInfo{RefereceSequence}[$Index] = $RefereceSequence; | |
545 $SequenceFilesInfo{RefereceSequenceLen}[$Index] = $RefereceSequenceLen; | |
546 $SequenceFilesInfo{RefereceSequenceWithNoGaps}[$Index] = $RefereceSequenceWithNoGaps; | |
547 $SequenceFilesInfo{RefereceSequenceWithNoGapsLen}[$Index] = $RefereceSequenceWithNoGapsLen; | |
548 push @{$SequenceFilesInfo{RefereceSequenceRegions}[$Index]}, @RefereceSequenceRegions; | |
549 $SequenceFilesInfo{RefereceSequenceRegionCount}[$Index] = $RefereceSequenceRegionCount; | |
550 | |
551 # Setup residue codes... | |
552 SetupSequenceFileResidueCodes($SequenceDataRef, $Index); | |
553 } | |
554 } | |
555 | |
556 sub SetupSequenceFileResidueCodes { | |
557 my($SequenceDataRef, $FileIndex) = @_; | |
558 my($Residue, @ResidueCodesList, %ResidueCodesMap); | |
559 | |
560 # Initialize | |
561 @{$SequenceFilesInfo{ResidueCodes}[$FileIndex]} = (); | |
562 | |
563 %ResidueCodesMap = (); | |
564 @ResidueCodesList = (); | |
565 | |
566 # Setup standard amino acids and nucleic acids one letter codes... | |
567 if ($OptionsInfo{RegionResiduesMode} =~ /^AminoAcids$/i) { | |
568 @ResidueCodesList = AminoAcids::GetAminoAcids('OneLetterCode'); | |
569 } | |
570 elsif ($OptionsInfo{RegionResiduesMode} =~ /^NucleicAcids$/i) { | |
571 push @ResidueCodesList, ('A', 'G', 'T', 'U', 'C'); | |
572 } | |
573 push @ResidueCodesList, 'Gap'; | |
574 for $Residue (@ResidueCodesList) { | |
575 $ResidueCodesMap{$Residue} = $Residue; | |
576 } | |
577 | |
578 # Go over all the residues in all the sequences and add missing ones to the list... | |
579 my($ID, $Sequence, $ResIndex); | |
580 for $ID (@{$SequenceDataRef->{IDs}}) { | |
581 $Sequence = $SequenceDataRef->{Sequence}{$ID}; | |
582 RES: for $ResIndex (0 .. (length($Sequence) - 1)) { | |
583 $Residue = substr($Sequence, $ResIndex, 1); | |
584 if (IsGapResidue($Residue)) { | |
585 next RES; | |
586 } | |
587 if (exists $ResidueCodesMap{$Residue}) { | |
588 next RES; | |
589 } | |
590 push @ResidueCodesList, $Residue; | |
591 $ResidueCodesMap{$Residue} = $Residue; | |
592 } | |
593 } | |
594 push @{$SequenceFilesInfo{ResidueCodes}[$FileIndex]}, @ResidueCodesList; | |
595 } | |
596 | |
597 # Setup regions data for performing residue frequency analysis... | |
598 sub SetupSequenceRegionsData { | |
599 my($Index, $RefereceSequence, $RefereceSequenceLen, $RegionID, $StartResNum, $EndResNum, $RegionIndex, $RegionNum, $NoGapResNum, $ResNum, $ResIndex, $Residue, $ResidueCode, @RefereceSequenceRegions); | |
600 | |
601 | |
602 @{$SequenceFilesInfo{RefereceSequenceResNums}} = (); | |
603 @{$SequenceFilesInfo{RegionsData}} = (); | |
604 | |
605 FILELIST: for $Index (0 .. $#SequenceFilesList) { | |
606 %{$SequenceFilesInfo{RefereceSequenceResNums}[$Index]{IsGap}} = (); | |
607 %{$SequenceFilesInfo{RefereceSequenceResNums}[$Index]{NoGapToGap}} = (); | |
608 %{$SequenceFilesInfo{RefereceSequenceResNums}[$Index]{GapToNoGap}} = (); | |
609 %{$SequenceFilesInfo{RegionsData}[$Index]} = (); | |
610 @{$SequenceFilesInfo{RegionsData}[$Index]{RegionIDs}} = (); | |
611 | |
612 if (!$SequenceFilesInfo{FilesOkay}[$Index]) { | |
613 next FILELIST; | |
614 } | |
615 if (!$OptionsInfo{PerformResidueFrequencyAnalysis}) { | |
616 next FILELIST; | |
617 } | |
618 | |
619 $RefereceSequence = $SequenceFilesInfo{RefereceSequence}[$Index]; | |
620 $RefereceSequenceLen = $SequenceFilesInfo{RefereceSequenceLen}[$Index]; | |
621 | |
622 # Setup residue number mapping and gap status for refernece sequence... | |
623 $NoGapResNum = 0; | |
624 $ResNum = 0; | |
625 for $ResIndex (0 .. ($RefereceSequenceLen - 1)) { | |
626 $ResNum++; | |
627 $Residue = substr($RefereceSequence, $ResIndex, 1); | |
628 $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{IsGap}{$ResNum} = 1; | |
629 if (!IsGapResidue($Residue)) { | |
630 $NoGapResNum++; | |
631 $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{IsGap}{$ResNum} = 0; | |
632 $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{NoGapToGap}{$NoGapResNum} = $ResNum; | |
633 $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{GapToNoGap}{$ResNum} = $NoGapResNum; | |
634 } | |
635 } | |
636 # Map residue numbers for specified regions to the reference residue in input sequence/alignment files | |
637 $RegionNum = 0; | |
638 @RefereceSequenceRegions = (); | |
639 push @RefereceSequenceRegions, @{$SequenceFilesInfo{RefereceSequenceRegions}[$Index]}; | |
640 for ($RegionIndex = 0; $RegionIndex <= $#RefereceSequenceRegions; $RegionIndex += 2) { | |
641 $StartResNum = $RefereceSequenceRegions[$RegionIndex]; | |
642 $EndResNum = $RefereceSequenceRegions[$RegionIndex + 1]; | |
643 $RegionNum++; | |
644 $RegionID = "Region${RegionNum}"; | |
645 if ($OptionsInfo{IgnoreGaps}) { | |
646 # Map residue numbers to the reference sequence residue numbers to account for any ignored gaps... | |
647 $StartResNum = $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{NoGapToGap}{$StartResNum}; | |
648 $EndResNum = $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{NoGapToGap}{$EndResNum}; | |
649 } | |
650 push @{$SequenceFilesInfo{RegionsData}[$Index]{RegionIDs}}, $RegionID; | |
651 $SequenceFilesInfo{RegionsData}[$Index]{$RegionID}{StartResNum} = $StartResNum; | |
652 $SequenceFilesInfo{RegionsData}[$Index]{$RegionID}{EndResNum} = $EndResNum; | |
653 | |
654 # Initialize data for residue codes... | |
655 for $ResNum ($StartResNum .. $EndResNum) { | |
656 for $ResidueCode (@{$SequenceFilesInfo{ResidueCodes}[$Index]}) { | |
657 $SequenceFilesInfo{RegionsData}[$Index]{$RegionID}{Count}{$ResNum}{$ResidueCode} = 0; | |
658 } | |
659 } | |
660 } | |
661 } | |
662 } | |
663 | |
664 # Setup script usage and retrieve command line arguments specified using various options... | |
665 sub SetupScriptUsage { | |
666 | |
667 # Retrieve all the options... | |
668 %Options = (); | |
669 $Options{ignoregaps} = 'yes'; | |
670 $Options{regionresiduesmode} = 'None'; | |
671 $Options{mode} = 'PercentIdentityMatrix'; | |
672 $Options{outdelim} = 'comma'; | |
673 $Options{precision} = 2; | |
674 $Options{quote} = 'yes'; | |
675 $Options{referencesequence} = 'UseFirstSequenceID'; | |
676 $Options{region} = 'UseCompleteSequence'; | |
677 | |
678 if (!GetOptions(\%Options, "help|h", "ignoregaps|i=s", "mode|m=s", "outdelim=s", "overwrite|o", "precision|p=i", "quote|q=s", "referencesequence=s", "region=s", "regionresiduesmode=s", "root|r=s", "workingdir|w=s")) { | |
679 die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n"; | |
680 } | |
681 if ($Options{workingdir}) { | |
682 if (! -d $Options{workingdir}) { | |
683 die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n"; | |
684 } | |
685 chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n"; | |
686 } | |
687 if ($Options{ignoregaps} !~ /^(yes|no)$/i) { | |
688 die "Error: The value specified, $Options{ignoregaps}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n"; | |
689 } | |
690 if ($Options{regionresiduesmode} !~ /^(AminoAcids|NucleicAcids|None)$/i) { | |
691 die "Error: The value specified, $Options{regionresiduesmode}, for option \"--regionresiduesmode\" is not valid. Allowed values: AminoAcids, NucleicAcids or None\n"; | |
692 } | |
693 if ($Options{mode} !~ /^(PercentIdentityMatrix|ResidueFrequencyAnalysis|All)$/i) { | |
694 die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: PercentIdentityMatrix, ResidueFrequencyAnalysis or All\n"; | |
695 } | |
696 if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) { | |
697 die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n"; | |
698 } | |
699 if ($Options{quote} !~ /^(yes|no)$/i) { | |
700 die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n"; | |
701 } | |
702 if (!IsPositiveInteger($Options{precision})) { | |
703 die "Error: The value specified, $Options{precision}, for option \"-p --precision\" is not valid. Allowed values: > 0 \n"; | |
704 } | |
705 } | |
706 | |
707 __END__ | |
708 | |
709 =head1 NAME | |
710 | |
711 AnalyzeSequenceFilesData.pl - Analyze sequence and alignment files | |
712 | |
713 =head1 SYNOPSIS | |
714 | |
715 AnalyzeSequenceFilesData.pl SequenceFile(s) AlignmentFile(s)... | |
716 | |
717 AnalyzeSequenceFilesData.pl [B<-h, --help>] [B<-i, --IgnoreGaps> yes | no] | |
718 [B<-m, --mode> PercentIdentityMatrix | ResidueFrequencyAnalysis | All] | |
719 [B<--outdelim> comma | tab | semicolon] [B<-o, --overwrite>] [B<-p, --precision> number] [B<-q, --quote> yes | no] | |
720 [B<--ReferenceSequence> SequenceID | UseFirstSequenceID] | |
721 [B<--region> "StartResNum, EndResNum, [StartResNum, EndResNum...]" | UseCompleteSequence] | |
722 [B<--RegionResiduesMode> AminoAcids | NucleicAcids | None] | |
723 [B<-w, --WorkingDir> dirname] SequenceFile(s) AlignmentFile(s)... | |
724 | |
725 =head1 DESCRIPTION | |
726 | |
727 Analyze I<SequenceFile(s) and AlignmentFile(s)> data: calculate pairwise percent identity matrix or | |
728 calculate percent occurrence of various residues in specified sequence regions. All the sequences | |
729 in the input file must have the same sequence lengths; otherwise, the sequence file is ignored. | |
730 | |
731 The file names are separated by spaces. All the sequence files in a current directory can | |
732 be specified by I<*.aln>, I<*.msf>, I<*.fasta>, I<*.fta>, I<*.pir> or any other supported | |
733 formats; additionally, I<DirName> corresponds to all the sequence files in the current directory | |
734 with any of the supported file extension: I<.aln, .msf, .fasta, .fta, and .pir>. | |
735 | |
736 Supported sequence formats are: I<ALN/CLustalW>, I<GCG/MSF>, I<PILEUP/MSF>, I<Pearson/FASTA>, | |
737 and I<NBRF/PIR>. Instead of using file extensions, file formats are detected by parsing the contents | |
738 of I<SequenceFile(s) and AlignmentFile(s)>. | |
739 | |
740 =head1 OPTIONS | |
741 | |
742 =over 4 | |
743 | |
744 =item B<-h, --help> | |
745 | |
746 Print this help message. | |
747 | |
748 =item B<-i, --IgnoreGaps> I<yes | no> | |
749 | |
750 Ignore gaps during calculation of sequence lengths and specification of regions during residue | |
751 frequency analysis. Possible values: I<yes or no>. Default value: I<yes>. | |
752 | |
753 =item B<-m, --mode> I<PercentIdentityMatrix | ResidueFrequencyAnalysis | All> | |
754 | |
755 Specify how to analyze data in sequence files: calculate percent identity matrix or calculate | |
756 frequency of occurrence of residues in specific regions. During I<ResidueFrequencyAnalysis> value | |
757 of B<-m, --mode> option, output files are generated for both the residue count and percent residue | |
758 count. Possible values: I<PercentIdentityMatrix, ResidueFrequencyAnalysis, or All>. Default value: | |
759 I<PercentIdentityMatrix>. | |
760 | |
761 =item B<--outdelim> I<comma | tab | semicolon> | |
762 | |
763 Output text file delimiter. Possible values: I<comma, tab, or semicolon>. | |
764 Default value: I<comma>. | |
765 | |
766 =item B<-o, --overwrite> | |
767 | |
768 Overwrite existing files. | |
769 | |
770 =item B<-p, --precision> I<number> | |
771 | |
772 Precision of calculated values in the output file. Default: up to I<2> decimal places. | |
773 Valid values: positive integers. | |
774 | |
775 =item B<-q, --quote> I<yes | no> | |
776 | |
777 Put quotes around column values in output text file. Possible values: I<yes or | |
778 no>. Default value: I<yes>. | |
779 | |
780 =item B<--ReferenceSequence> I<SequenceID | UseFirstSequenceID> | |
781 | |
782 Specify reference sequence ID to identify regions for performing I<ResidueFrequencyAnalysis> specified | |
783 using B<-m, --mode> option. Default: I<UseFirstSequenceID>. | |
784 | |
785 =item B<--region> I<StartResNum,EndResNum,[StartResNum,EndResNum...] | UseCompleteSequence> | |
786 | |
787 Specify how to perform frequency of occurrence analysis for residues: use specific regions | |
788 indicated by starting and ending residue numbers in reference sequence or use the whole reference | |
789 sequence as one region. Default: I<UseCompleteSequence>. | |
790 | |
791 Based on the value of B<-i, --IgnoreGaps> option, specified residue numbers I<StartResNum,EndResNum> | |
792 correspond to the positions in the reference sequence without gaps or with gaps. | |
793 | |
794 For residue numbers corresponding to the reference sequence including gaps, percent occurrence | |
795 of various residues corresponding to gap position in reference sequence is also calculated. | |
796 | |
797 =item B<--RegionResiduesMode> I<AminoAcids | NucleicAcids | None> | |
798 | |
799 Specify how to process residues in the regions specified using B<--region> option during | |
800 I<ResidueFrequencyAnalysis> calculation: categorize residues as amino acids, nucleic acids, or simply | |
801 ignore residue category during the calculation. Possible values: I<AminoAcids, NucleicAcids or None>. | |
802 Default value: I<None>. | |
803 | |
804 For I<AminoAcids> or I<NucleicAcids> values of B<--RegionResiduesMode> option, all the standard amino | |
805 acids or nucleic acids are listed in the output file for each region; Any gaps and other non standard residues | |
806 are added to the list as encountered. | |
807 | |
808 For I<None> value of B<--RegionResiduesMode> option, no assumption is made about type of residues. | |
809 Residue and gaps are added to the list as encountered. | |
810 | |
811 =item B<-r, --root> I<rootname> | |
812 | |
813 New sequence file name is generated using the root: <Root><Mode>.<Ext> and | |
814 <Root><Mode><RegionNum>.<Ext>. Default new file | |
815 name: <SequenceFileName><Mode>.<Ext> for I<PercentIdentityMatrix> value B<m, --mode> option | |
816 and <SequenceFileName><Mode><RegionNum>.<Ext> for I<ResidueFrequencyAnalysis>. | |
817 The csv, and tsv <Ext> values are used for comma/semicolon, and tab delimited text | |
818 files respectively. This option is ignored for multiple input files. | |
819 | |
820 =item B<-w --WorkingDir> I<text> | |
821 | |
822 Location of working directory. Default: current directory. | |
823 | |
824 =back | |
825 | |
826 =head1 EXAMPLES | |
827 | |
828 To calculate percent identity matrix for all sequences in Sample1.msf file and generate | |
829 Sample1PercentIdentityMatrix.csv, type: | |
830 | |
831 % AnalyzeSequenceFilesData.pl Sample1.msf | |
832 | |
833 To perform residue frequency analysis for all sequences in Sample1.aln file corresponding to | |
834 non-gap positions in the first sequence and generate Sample1ResidueFrequencyAnalysisRegion1.csv | |
835 and Sample1PercentResidueFrequencyAnalysisRegion1.csv files, type: | |
836 | |
837 % AnalyzeSequenceFilesData.pl -m ResidueFrequencyAnalysis -o | |
838 Sample1.aln | |
839 | |
840 To perform residue frequency analysis for all sequences in Sample1.aln file corresponding to | |
841 all positions in the first sequence and generate TestResidueFrequencyAnalysisRegion1.csv | |
842 and TestPercentResidueFrequencyAnalysisRegion1.csv files, type: | |
843 | |
844 % AnalyzeSequenceFilesData.pl -m ResidueFrequencyAnalysis --IgnoreGaps | |
845 No -o -r Test Sample1.aln | |
846 | |
847 To perform residue frequency analysis for all sequences in Sample1.aln file corresponding to | |
848 non-gap residue positions 5 to 10, and 30 to 40 in sequence ACHE_BOVIN and generate | |
849 Sample1ResidueFrequencyAnalysisRegion1.csv, Sample1ResidueFrequencyAnalysisRegion2.csv, | |
850 SamplePercentResidueFrequencyAnalysisRegion1.csv, and | |
851 SamplePercentResidueFrequencyAnalysisRegion2.csv files, type: | |
852 | |
853 % AnalyzeSequenceFilesData.pl -m ResidueFrequencyAnalysis | |
854 --ReferenceSequence ACHE_BOVIN --region "5,15,30,40" -o Sample1.msf | |
855 | |
856 | |
857 =head1 AUTHOR | |
858 | |
859 Manish Sud <msud@san.rr.com> | |
860 | |
861 =head1 SEE ALSO | |
862 | |
863 ExtractFromSequenceFiles.pl, InfoSequenceFiles.pl | |
864 | |
865 =head1 COPYRIGHT | |
866 | |
867 Copyright (C) 2015 Manish Sud. All rights reserved. | |
868 | |
869 This file is part of MayaChemTools. | |
870 | |
871 MayaChemTools is free software; you can redistribute it and/or modify it under | |
872 the terms of the GNU Lesser General Public License as published by the Free | |
873 Software Foundation; either version 3 of the License, or (at your option) | |
874 any later version. | |
875 | |
876 =cut |