Mercurial > repos > deepakjadmin > mayatool3_test2
comparison bin/ExtractFromSequenceFiles.pl @ 0:4816e4a8ae95 draft default tip
Uploaded
author | deepakjadmin |
---|---|
date | Wed, 20 Jan 2016 09:23:18 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4816e4a8ae95 |
---|---|
1 #!/usr/bin/perl -w | |
2 # | |
3 # $RCSfile: ExtractFromSequenceFiles.pl,v $ | |
4 # $Date: 2015/02/28 20:46:19 $ | |
5 # $Revision: 1.23 $ | |
6 # | |
7 # Author: Manish Sud <msud@san.rr.com> | |
8 # | |
9 # Copyright (C) 2015 Manish Sud. All rights reserved. | |
10 # | |
11 # This file is part of MayaChemTools. | |
12 # | |
13 # MayaChemTools is free software; you can redistribute it and/or modify it under | |
14 # the terms of the GNU Lesser General Public License as published by the Free | |
15 # Software Foundation; either version 3 of the License, or (at your option) any | |
16 # later version. | |
17 # | |
18 # MayaChemTools is distributed in the hope that it will be useful, but without | |
19 # any warranty; without even the implied warranty of merchantability of fitness | |
20 # for a particular purpose. See the GNU Lesser General Public License for more | |
21 # details. | |
22 # | |
23 # You should have received a copy of the GNU Lesser General Public License | |
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or | |
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, | |
26 # Boston, MA, 02111-1307, USA. | |
27 # | |
28 | |
29 use strict; | |
30 use FindBin; use lib "$FindBin::Bin/../lib"; | |
31 use Getopt::Long; | |
32 use File::Basename; | |
33 use Text::ParseWords; | |
34 use Benchmark; | |
35 use FileUtil; | |
36 use TextUtil; | |
37 use SequenceFileUtil; | |
38 | |
# File-scope state shared with the subroutines defined later in this file.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
# Direct method call; the indirect object form "new Benchmark" is ambiguous to
# the parser and discouraged.
$StartTime = Benchmark->new;

# Setup script usage message...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# Expand wild card file names...
my(@SequenceFilesList);
@SequenceFilesList = ExpandFileNames(\@ARGV, "aln msf fasta fta pir");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Set up information about input files...
print "Checking input sequence file(s)...\n";
my(%SequenceFilesInfo);
RetrieveSequenceFilesInfo();

# Process input files; files flagged as not okay during the check above are
# skipped silently here because a warning has already been issued for them.
if (@SequenceFilesList > 1) {
  print "\nProcessing sequence files...\n";
}
for my $FileIndex (0 .. $#SequenceFilesList) {
  if ($SequenceFilesInfo{FilesOkay}[$FileIndex]) {
    print "\nProcessing file $SequenceFilesList[$FileIndex]...\n";
    ExtractFromSequenceFiles($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
85 | |
86 ############################################################################### | |
87 | |
# Extract the requested sequences from one input file and write them out as a
# FASTA file.
#
# Parameters:
#   $FileIndex - index into @SequenceFilesList and into the parallel arrays
#                stored in %SequenceFilesInfo.
#
# Dies when the input file cannot be opened for reading.
sub ExtractFromSequenceFiles {
  my($FileIndex) = @_;
  my($OutSequenceFile, $SequenceFile, $SequenceFH, $SequenceDataRef, $SpecifiedSequenceDataRef, $SpecifiedCount);

  # Read sequence file. The explicit open only verifies that the file is still
  # readable; the three-argument form keeps characters such as ">" or "|" in a
  # file name from being interpreted as an open mode.
  $SequenceFile = $SequenceFilesList[$FileIndex];
  open $SequenceFH, "<", $SequenceFile or die "Error: Can't open $SequenceFile: $! \n";
  $SequenceDataRef = ReadSequenceFile($SequenceFile);
  close $SequenceFH;

  $OutSequenceFile = $SequenceFilesInfo{OutFile}[$FileIndex];
  print "Generating sequence file $OutSequenceFile...\n";

  # Retrieve sequence data for specified sequences...
  $SpecifiedSequenceDataRef = GetSpecifiedSequenceData($SequenceDataRef);
  $SpecifiedCount = scalar @{$SpecifiedSequenceDataRef->{IDs}};

  if (!$SpecifiedCount) {
    # Nothing was selected: there is no sequence to strip gaps from, so do not
    # fall through to the single-sequence branch with an undefined ID.
    warn "Warning: No sequences in $SequenceFile matched the specified values\n";
  }
  elsif ($OptionsInfo{IgnoreGaps}) {
    if ($SpecifiedCount > 1) {
      # Gap columns can only be removed from sequences of identical length.
      if (AreSequenceLengthsIdentical($SpecifiedSequenceDataRef)) {
        $SpecifiedSequenceDataRef = RemoveSequenceAlignmentGapColumns($SpecifiedSequenceDataRef);
      }
    }
    else {
      # Exactly one sequence: remove the gaps from the sequence itself...
      my($ID, $Sequence);
      $ID = $SpecifiedSequenceDataRef->{IDs}[0];
      $Sequence = $SpecifiedSequenceDataRef->{Sequence}{$ID};
      $SpecifiedSequenceDataRef->{Sequence}{$ID} = RemoveSequenceGaps($Sequence);
    }
  }

  # Write out the file...
  WritePearsonFastaSequenceFile($OutSequenceFile, $SpecifiedSequenceDataRef, $OptionsInfo{MaxSequenceLength});
}
124 | |
# Dispatch to the extraction routine that corresponds to the processed
# "-m, --mode" value.
#
# Parameters:
#   $SequenceDataRef - reference to the sequence data read from an input file.
#
# Returns a reference to the selected sequence data, or undef when the mode is
# not one of SequenceID, SequenceNum or SequenceNumRange.
sub GetSpecifiedSequenceData {
  my($SequenceDataRef) = @_;

  # Consistently consult the processed option value instead of mixing it with
  # the raw command line value.
  if ($OptionsInfo{Mode} =~ /^SequenceID$/i) {
    return GetDataBySequenceIDs($SequenceDataRef);
  }
  elsif ($OptionsInfo{Mode} =~ /^SequenceNum$/i) {
    return GetDataBySequenceNums($SequenceDataRef);
  }
  elsif ($OptionsInfo{Mode} =~ /^SequenceNumRange$/i) {
    return GetDataBySequenceNumRange($SequenceDataRef);
  }
  else {
    return undef;
  }
}
142 | |
# Collect the sequences whose IDs match the specified sequence IDs.
#
# Parameters:
#   $SequenceDataRef - reference to a hash with keys IDs (array of IDs in file
#                      order), Description and Sequence (hashes keyed by ID).
#
# Returns a reference to a hash with the same three keys, holding only the
# matched sequences in their original order.
#
# Matching is case insensitive. In exact mode an ID must be present in
# $OptionsInfo{SpecifiedSequenceIDsMap}, and at most as many sequences as there
# are specified IDs are collected. In relaxed mode an ID matches when it
# contains any specified ID as a literal substring.
sub GetDataBySequenceIDs {
  my($SequenceDataRef) = @_;
  my($ID, $SequenceCount, $IDMatched, $SpecifiedID, %SpecifiedSequenceDataMap);

  # Go over sequences and collect sequences for writing out a new sequence file...
  %SpecifiedSequenceDataMap = (IDs => [], Description => {}, Sequence => {});

  $SequenceCount = 0;
  ID: for $ID (@{$SequenceDataRef->{IDs}}) {
    if ($OptionsInfo{MatchExactSequenceIDs}) {
      if (!exists $OptionsInfo{SpecifiedSequenceIDsMap}{lc($ID)}) {
        next ID;
      }
      if ($SequenceCount >= scalar @{$OptionsInfo{SpecifiedSequenceIDs}}) {
        last ID;
      }
      $SequenceCount++;
    }
    else {
      # Does this ID contain a specified ID as a substring? Use a literal
      # substring search: sequence IDs routinely contain characters such as
      # "|", "." or "/" that must not be treated as regular expression syntax.
      $IDMatched = 0;
      SPECIFIEDID: for $SpecifiedID (@{$OptionsInfo{SpecifiedSequenceIDs}}) {
        if (index(lc($ID), lc($SpecifiedID)) >= 0) {
          $IDMatched = 1;
          last SPECIFIEDID;
        }
      }
      if (!$IDMatched) {
        next ID;
      }
      $SequenceCount++;
    }
    # Collect sequence data...
    push @{$SpecifiedSequenceDataMap{IDs}}, $ID;
    $SpecifiedSequenceDataMap{Description}{$ID} = $SequenceDataRef->{Description}{$ID};
    $SpecifiedSequenceDataMap{Sequence}{$ID} = $SequenceDataRef->{Sequence}{$ID};
  }

  return \%SpecifiedSequenceDataMap;
}
187 | |
# Collect the sequences located at the specified sequence numbers.
#
# Parameters:
#   $SequenceDataRef - reference to a hash with keys IDs (array of IDs in file
#                      order), Description and Sequence (hashes keyed by ID).
#
# Returns a reference to a hash with the same three keys, holding only the
# sequences whose 1-based position is a key of
# $OptionsInfo{SpecifiedSequenceIDsMap}, in their original order.
sub GetDataBySequenceNums {
  my($SequenceDataRef) = @_;
  my($ID, $SequenceNum, $SequenceCount, $SpecifiedCount, %SpecifiedSequenceDataMap);

  # Go over sequences and collect sequences for writing out a new sequence file...
  %SpecifiedSequenceDataMap = (IDs => [], Description => {}, Sequence => {});

  # Number of distinct sequence numbers requested; once that many sequences
  # have been collected no later position can match, so scanning can stop.
  $SpecifiedCount = scalar keys %{$OptionsInfo{SpecifiedSequenceIDsMap}};

  $SequenceNum = 0;
  $SequenceCount = 0;
  ID: for $ID (@{$SequenceDataRef->{IDs}}) {
    if ($SequenceCount >= $SpecifiedCount) {
      last ID;
    }
    $SequenceNum++;
    if (!exists $OptionsInfo{SpecifiedSequenceIDsMap}{$SequenceNum}) {
      next ID;
    }
    $SequenceCount++;

    # Collect sequence data...
    push @{$SpecifiedSequenceDataMap{IDs}}, $ID;
    $SpecifiedSequenceDataMap{Description}{$ID} = $SequenceDataRef->{Description}{$ID};
    $SpecifiedSequenceDataMap{Sequence}{$ID} = $SequenceDataRef->{Sequence}{$ID};
  }

  return \%SpecifiedSequenceDataMap;
}
219 | |
# Collect the sequences located within the specified sequence number range.
#
# Parameters:
#   $SequenceDataRef - reference to a hash with keys IDs (array of IDs in file
#                      order), Description and Sequence (hashes keyed by ID).
#
# Returns a reference to a hash with the same three keys, holding only the
# sequences whose 1-based position lies between
# $OptionsInfo{SpecifiedSequenceIDs}[0] and $OptionsInfo{SpecifiedSequenceIDs}[1],
# both inclusive, in their original order.
sub GetDataBySequenceNumRange {
  my($SequenceDataRef) = @_;
  my($ID, $SequenceNum, $StartNum, $EndNum, %SpecifiedSequenceDataMap);

  # Go over sequences and collect sequences for writing out a new sequence file...
  %SpecifiedSequenceDataMap = (IDs => [], Description => {}, Sequence => {});

  ($StartNum, $EndNum) = @{$OptionsInfo{SpecifiedSequenceIDs}}[0, 1];

  $SequenceNum = 0;
  ID: for $ID (@{$SequenceDataRef->{IDs}}) {
    $SequenceNum++;

    # Test the upper bound first so that the loop actually stops once the end
    # of the range has been passed.
    if ($SequenceNum > $EndNum) {
      last ID;
    }
    if ($SequenceNum < $StartNum) {
      next ID;
    }
    # Collect sequence data...
    push @{$SpecifiedSequenceDataMap{IDs}}, $ID;
    $SpecifiedSequenceDataMap{Description}{$ID} = $SequenceDataRef->{Description}{$ID};
    $SpecifiedSequenceDataMap{Sequence}{$ID} = $SequenceDataRef->{Sequence}{$ID};
  }

  return \%SpecifiedSequenceDataMap;
}
251 | |
252 | |
# Validate command line option values and store the processed values in
# %OptionsInfo.
#
# Reads %Options and populates these %OptionsInfo keys: IgnoreGaps, Mode,
# MatchExactSequenceIDs, SpecifiedSequences, SpecifiedSequenceIDs (array),
# SpecifiedSequenceIDsMap (hash; keys are lower cased in SequenceID mode),
# MaxSequenceLength, OverwriteFiles and OutFileRoot.
#
# Dies when the "-s, --Sequences" value is missing or is not valid for the
# specified "-m, --mode".
sub ProcessOptions {
  %OptionsInfo = ();

  # Miscellaneous options...
  $OptionsInfo{IgnoreGaps} = ($Options{ignoregaps} =~ /Yes/i) ? 1 : 0;

  $OptionsInfo{Mode} = $Options{mode};
  $OptionsInfo{MatchExactSequenceIDs} = $Options{sequenceidmatch} =~ /Exact/i ? 1 :0;

  # Check specified sequences value...
  $OptionsInfo{SpecifiedSequences} = $Options{sequences};
  @{$OptionsInfo{SpecifiedSequenceIDs}} = ();
  %{$OptionsInfo{SpecifiedSequenceIDsMap}} = ();

  my($SequenceID, $SequenceNum, @SpecifiedSequenceIDs);

  @SpecifiedSequenceIDs = ();
  if ($Options{sequences}) {
    @SpecifiedSequenceIDs = split /\,/, $Options{sequences};
    # The documented value format allows a space after each comma; strip
    # surrounding white space so that such values match and validate.
    for $SequenceID (@SpecifiedSequenceIDs) {
      $SequenceID =~ s/^\s+|\s+$//g;
    }
  }

  if ($Options{mode} =~ /^SequenceID$/i) {
    if (!$Options{sequences}) {
      die "Error: No value specified for option \"-s, --Sequences\" during \"SequenceID\" of \"-m, --mode\" option\n";
    }
  }
  elsif ($Options{mode} =~ /^SequenceNum$/i) {
    if ($Options{sequences}) {
      for $SequenceNum (@SpecifiedSequenceIDs) {
        if (!IsPositiveInteger($SequenceNum)) {
          die "Error: The value specified, $SequenceNum, in \"$Options{sequences}\" for option \"-s, --Sequences\" is not valid: Valid values: > 0\n";
        }
      }
    }
    else {
      # Default: extract the first sequence.
      push @SpecifiedSequenceIDs, "1";
    }
  }
  elsif ($Options{mode} =~ /^SequenceNumRange$/i) {
    if (!$Options{sequences}) {
      die "Error: No value specified for option \"-s, --Sequences\" during \"SequenceNumRange\" of \"-m, --mode\" option\n";
    }
    if (@SpecifiedSequenceIDs != 2) {
      die "Error: The number of values, ", scalar(@SpecifiedSequenceIDs), ", specified, $Options{sequences}, for option \"-s, --Sequences\" are not valid. Number of values must be 2 to indicate starting and ending sequence number.\n";
    }
    for $SequenceNum (@SpecifiedSequenceIDs) {
      if (!IsPositiveInteger($SequenceNum)) {
        die "Error: The value specified, $SequenceNum, in \"$Options{sequences}\" for option \"-s, --Sequences\" is not valid: Valid values: > 0\n";
      }
    }
    if ($SpecifiedSequenceIDs[0] > $SpecifiedSequenceIDs[1]) {
      die "Error: The value specified \"$Options{sequences}\" for option \"-s, --Sequences\" are not valid: Starting sequence number $SpecifiedSequenceIDs[0] must be smaller than ending sequence number $SpecifiedSequenceIDs[1]\n";
    }
  }

  push @{$OptionsInfo{SpecifiedSequenceIDs}}, @SpecifiedSequenceIDs;
  for $SequenceID (@SpecifiedSequenceIDs) {
    if ($Options{mode} =~ /^SequenceID$/i) {
      # Lower case keys support case insensitive exact ID lookups.
      $OptionsInfo{SpecifiedSequenceIDsMap}{lc($SequenceID)} = $SequenceID;
    }
    else {
      $OptionsInfo{SpecifiedSequenceIDsMap}{$SequenceID} = $SequenceID;
    }
  }

  $OptionsInfo{MaxSequenceLength} = $Options{sequencelength};
  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
}
322 | |
# Check each input sequence file and set up its output file information.
#
# Populates %SequenceFilesInfo with arrays indexed like @SequenceFilesList:
# FilesOkay, OutFileRoot, OutFileExt, OutFile, Format and SequenceCount. A file
# is marked okay only when it can be opened, its format is supported, it
# contains sequence data, and its output file either does not exist or may be
# overwritten; otherwise a warning is issued and the file is left marked as not
# okay.
sub RetrieveSequenceFilesInfo {
  my($Index, $SequenceFile, $SequenceFH, $FileSupported, $FileFormat, $SequenceCount, $FileDir, $FileName, $FileExt, $OutFileRoot, $OutFileExt, $OutFileMode, $OutFile, $SequenceDataRef);

  %SequenceFilesInfo = ();
  @{$SequenceFilesInfo{FilesOkay}} = ();
  @{$SequenceFilesInfo{OutFileRoot}} = ();
  @{$SequenceFilesInfo{OutFileExt}} = ();
  @{$SequenceFilesInfo{OutFile}} = ();
  @{$SequenceFilesInfo{Format}} = ();
  @{$SequenceFilesInfo{SequenceCount}} = ();

  FILELIST: for $Index (0 .. $#SequenceFilesList) {
    $SequenceFile = $SequenceFilesList[$Index];

    # Start out pessimistic; the entries are overwritten at the end of the loop
    # body once every check has passed.
    $SequenceFilesInfo{FilesOkay}[$Index] = 0;
    $SequenceFilesInfo{OutFileRoot}[$Index] = '';
    $SequenceFilesInfo{OutFileExt}[$Index] = '';
    $SequenceFilesInfo{OutFile}[$Index] = '';
    $SequenceFilesInfo{Format}[$Index] = 'NotSupported';
    $SequenceFilesInfo{SequenceCount}[$Index] = 0;

    # Three-argument open: a file name supplied on the command line must never
    # be interpreted as an open mode or a command to run.
    if (!open($SequenceFH, "<", $SequenceFile)) {
      warn "Warning: Ignoring file $SequenceFile: Couldn't open it: $! \n";
      next FILELIST;
    }
    close $SequenceFH;

    ($FileSupported, $FileFormat) = IsSupportedSequenceFile($SequenceFile);
    if (!$FileSupported) {
      warn "Warning: Ignoring file $SequenceFile: Sequence file format is not supported.\n";
      next FILELIST;
    }
    $SequenceDataRef = ReadSequenceFile($SequenceFile);

    $SequenceCount = $SequenceDataRef->{Count};
    if (!$SequenceCount) {
      warn "Warning: Ignoring file $SequenceFile: Sequence data is missing.\n";
      next FILELIST;
    }

    # Setup output file names...
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($SequenceFile);
    $OutFileExt = 'fasta';
    if ($OptionsInfo{OutFileRoot} && (@SequenceFilesList == 1)) {
      # The specified root is honored only for a single input file.
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $OptionsInfo{OutFileRoot};
      }
    }
    $OutFileRoot = $FileName;

    # Use the canonical spelling of the mode in the output file name,
    # irrespective of the case used on the command line.
    if ($OptionsInfo{Mode} =~ /^SequenceID$/i) {
      $OutFileMode = 'SequenceID';
    }
    elsif ($OptionsInfo{Mode} =~ /^SequenceNum$/i) {
      $OutFileMode = 'SequenceNum';
    }
    elsif ($OptionsInfo{Mode} =~ /^SequenceNumRange$/i) {
      $OutFileMode = 'SequenceNumRange';
    }
    else {
      $OutFileMode = '';
    }

    $OutFile = "${OutFileRoot}${OutFileMode}.${OutFileExt}";
    if (!$OptionsInfo{OverwriteFiles}) {
      if (-e $OutFile) {
        warn "Warning: Ignoring file $SequenceFile: The file $OutFile already exists\n";
        next FILELIST;
      }
    }

    $SequenceFilesInfo{FilesOkay}[$Index] = 1;
    $SequenceFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
    $SequenceFilesInfo{OutFileExt}[$Index] = $OutFileExt;
    $SequenceFilesInfo{OutFile}[$Index] = $OutFile;

    $SequenceFilesInfo{Format}[$Index] = $FileFormat;
    $SequenceFilesInfo{SequenceCount}[$Index] = $SequenceCount;
  }
}
402 | |
# Set default option values, retrieve command line options into %Options and
# validate the values that have a fixed set of choices.
#
# Changes the current directory when "-w, --workingdir" is specified.
#
# Dies when option parsing fails, when the working directory is not a
# directory or cannot be entered, or when a value is not valid for its option.
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();
  $Options{ignoregaps} = 'Yes';
  $Options{mode} = 'SequenceNum';
  $Options{sequenceidmatch} = 'Relaxed';
  $Options{sequencelength} = 80;

  if (!GetOptions(\%Options, "help|h", "ignoregaps|i=s", "mode|m=s", "overwrite|o", "root|r=s", "sequences|s=s", "sequenceidmatch=s", "sequencelength=i", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{ignoregaps} !~ /^(yes|no)$/i) {
    # Name the option that was actually checked in the error message.
    die "Error: The value specified, $Options{ignoregaps}, for option \"-i --IgnoreGaps\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{mode} !~ /^(SequenceID|SequenceNum|SequenceNumRange)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: SequenceID, SequenceNum, or SequenceNumRange\n";
  }
  if ($Options{sequenceidmatch} !~ /^(Exact|Relaxed)$/i) {
    die "Error: The value specified, $Options{sequenceidmatch}, for option \"--SequenceIDMatch\" is not valid. Allowed values: Exact or Relaxed\n";
  }
  if (!IsPositiveInteger($Options{sequencelength})) {
    die "Error: The value specified, $Options{sequencelength}, for option \"--SequenceLength\" is not valid. Allowed values: >0\n";
  }
}
435 | |
436 __END__ | |
437 | |
438 =head1 NAME | |
439 | |
440 ExtractFromSequenceFiles.pl - Extract data from sequence and alignment files | |
441 | |
442 =head1 SYNOPSIS | |
443 | |
444 ExtractFromSequenceFiles.pl SequenceFile(s) AlignmentFile(s)... | |
445 | |
446 ExtractFromSequenceFiles.pl [B<-h, --help>] [B<-i, --IgnoreGaps> yes | no] | |
447 [B<-m, --mode> SequenceID | SequenceNum | SequenceNumRange] [B<-o, --overwrite>] | |
448 [B<-r, --root> rootname] [B<-s, --Sequences> "SequenceID, [SequenceID,...]" | "SequenceNum, [SequenceNum,...]" | | |
449 "StartingSeqNum, EndingSeqNum"] [B<--SequenceIDMatch> Exact | Relaxed] | |
450 [B<--SequenceLength> number] [B<-w, --WorkingDir> dirname] SequenceFile(s) AlignmentFile(s)... | |
451 | |
452 =head1 DESCRIPTION | |
453 | |
454 Extract specific data from I<SequenceFile(s) and AlignmentFile(s)> and generate | |
455 FASTA files. You can extract sequences using sequence IDs or sequence numbers. | |
456 | |
457 The file names are separated by spaces. All the sequence files in a current directory can | |
458 be specified by I<*.aln>, I<*.msf>, I<*.fasta>, I<*.fta>, I<*.pir> or any other supported | |
459 formats; additionally, I<DirName> corresponds to all the sequence files in the current directory | |
460 with any of the supported file extension: I<.aln, .msf, .fasta, .fta, and .pir>. | |
461 | |
462 Supported sequence formats are: I<ALN/ClustalW>, I<GCG/MSF>, I<PILEUP/MSF>, I<Pearson/FASTA>, | |
463 and I<NBRF/PIR>. Instead of using file extensions, file formats are detected by parsing the contents | |
464 of I<SequenceFile(s) and AlignmentFile(s)>. | |
465 | |
466 =head1 OPTIONS | |
467 | |
468 =over 4 | |
469 | |
470 =item B<-h, --help> | |
471 | |
472 Print this help message. | |
473 | |
474 =item B<-i, --IgnoreGaps> I<yes | no> | |
475 | |
476 Ignore gaps or gap columns during generation of new sequence or alignment file(s). | |
477 Possible values: I<yes or no>. Default value: I<yes>. | |
478 | |
479 In order to remove gap columns, the length of all the sequences must be the same; otherwise, | |
480 this option is ignored. | |
481 | |
482 =item B<-m, --mode> I<SequenceID | SequenceNum | SequenceNumRange> | |
483 | |
484 Specify how to extract data from sequence files: extract sequences using sequence | |
485 IDs or sequence numbers. Possible values: I<SequenceID | SequenceNum | |
486 | SequenceNumRange>. Default: I<SequenceNum> with value of 1. | |
487 | |
488 The sequence numbers correspond to position of sequences starting from 1 for first sequence | |
489 in I<SequenceFile(s) and AlignmentFile(s)>. | |
490 | |
491 =item B<-o, --overwrite> | |
492 | |
493 Overwrite existing files. | |
494 | |
495 =item B<-r, --root> I<rootname> | |
496 | |
497 New sequence file name is generated using the root: <Root><Mode>.<Ext>. Default new file: | |
498 <SequenceFileName><Mode>.<Ext>. This option is ignored for multiple input files. | |
499 | |
500 =item B<-s, --Sequences> I<"SequenceID,[SequenceID,...]" | "SequenceNum,[SequenceNum,...]" | "StartingSeqNum,EndingSeqNum"> | |
501 | |
502 This value is B<-m, --mode> specific. In general, it's a comma delimited list of sequence IDs or sequence | |
503 numbers. | |
504 | |
505 For I<SequenceID> value of B<-m, --mode> option, input value format is: I<SequenceID,...>. Examples: | |
506 | |
507 ACHE_BOVIN | |
508 ACHE_BOVIN,ACHE_HUMAN | |
509 | |
510 For I<SequenceNum> value of B<-m, --mode> option, input value format is: I<SequenceNum,...>. Examples: | |
511 | |
512 2 | |
513 1,5 | |
514 | |
515 For I<SequenceNumRange> value of B<-m, --mode> option, input value format is: I<StartingSeqNum,EndingSeqNum>. Examples: | |
516 | |
517 2,4 | |
518 | |
519 =item B<--SequenceIDMatch> I<Exact | Relaxed> | |
520 | |
521 Sequence IDs matching criterion during I<SequenceID> value of B<-m, --mode> option: match | |
522 specified sequence ID exactly or as sub string against sequence IDs in the files. Possible | |
523 values: I<Exact | Relaxed>. Default: I<Relaxed>. Sequence ID match is case insensitive | |
524 during both options. | |
525 | |
526 =item B<--SequenceLength> I<number> | |
527 | |
528 Maximum sequence length per line in sequence file(s). Default: I<80>. | |
529 | |
530 =item B<-w --WorkingDir> I<text> | |
531 | |
532 Location of working directory. Default: current directory. | |
533 | |
534 =back | |
535 | |
536 =head1 EXAMPLES | |
537 | |
538 To extract first sequence from Sample1.fasta sequence file and generate Sample1SequenceNum.fasta | |
539 sequence file, type: | |
540 | |
541 % ExtractFromSequenceFiles.pl -o Sample1.fasta | |
542 | |
543 To extract first sequence from Sample1.aln alignment file and generate Sample1SequenceNum.fasta | |
544 sequence file without any column gaps, type: | |
545 | |
546 % ExtractFromSequenceFiles.pl -o Sample1.aln | |
547 | |
548 To extract first sequence from Sample1.aln alignment file and generate Sample1SequenceNum.fasta | |
549 sequence file with column gaps, type: | |
550 | |
551 % ExtractFromSequenceFiles.pl --IgnoreGaps No -o Sample1.aln | |
552 | |
553 To extract sequence number 1 and 4 from Sample1.fasta sequence file and generate | |
554 Sample1SequenceNum.fasta sequence file, type: | |
555 | |
556 % ExtractFromSequenceFiles.pl -o -m SequenceNum --Sequences 1,4 | |
557 -o Sample1.fasta | |
558 | |
559 To extract sequences from sequence number 1 to 4 from Sample1.fasta sequence file and generate | |
560 Sample1SequenceNumRange.fasta sequence file, type: | |
561 | |
562 % ExtractFromSequenceFiles.pl -o -m SequenceNumRange --Sequences | |
563 1,4 -o Sample1.fasta | |
564 | |
565 To extract sequence ID "Q9P993/104-387" from sequence from Sample1.fasta sequence file and generate | |
566 Sample1SequenceID.fasta sequence file, type: | |
567 | |
568 % ExtractFromSequenceFiles.pl -o -m SequenceID --Sequences | |
569 "Q9P993/104-387" --SequenceIDMatch Exact -o Sample1.fasta | |
570 | |
571 =head1 AUTHOR | |
572 | |
573 Manish Sud <msud@san.rr.com> | |
574 | |
575 =head1 SEE ALSO | |
576 | |
577 AnalyzeSequenceFilesData.pl, InfoSequenceFiles.pl | |
578 | |
579 =head1 COPYRIGHT | |
580 | |
581 Copyright (C) 2015 Manish Sud. All rights reserved. | |
582 | |
583 This file is part of MayaChemTools. | |
584 | |
585 MayaChemTools is free software; you can redistribute it and/or modify it under | |
586 the terms of the GNU Lesser General Public License as published by the Free | |
587 Software Foundation; either version 3 of the License, or (at your option) | |
588 any later version. | |
589 | |
590 =cut |