comparison bin/ExtractFromSequenceFiles.pl @ 0:4816e4a8ae95 draft default tip

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 09:23:18 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4816e4a8ae95
1 #!/usr/bin/perl -w
2 #
3 # $RCSfile: ExtractFromSequenceFiles.pl,v $
4 # $Date: 2015/02/28 20:46:19 $
5 # $Revision: 1.23 $
6 #
7 # Author: Manish Sud <msud@san.rr.com>
8 #
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability of fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 use strict;
30 use FindBin; use lib "$FindBin::Bin/../lib";
31 use Getopt::Long;
32 use File::Basename;
33 use Text::ParseWords;
34 use Benchmark;
35 use FileUtil;
36 use TextUtil;
37 use SequenceFileUtil;
38
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT so that progress messages show up as soon as they are printed...
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
# Direct method call instead of indirect object syntax ("new Benchmark"), which
# the parser can misread when a sub with a clashing name is in scope...
$StartTime = Benchmark->new;

# Retrieve command line options; print usage and stop when help is requested or
# no input file is specified...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
    die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# Expand wild card file names...
my(@SequenceFilesList);
@SequenceFilesList = ExpandFileNames(\@ARGV, "aln msf fasta fta pir");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Set up information about input files...
print "Checking input sequence file(s)...\n";
my(%SequenceFilesInfo);
RetrieveSequenceFilesInfo();

# Process input files; files not flagged as okay have already been reported
# with a warning by RetrieveSequenceFilesInfo and are skipped here...
if (@SequenceFilesList > 1) {
    print "\nProcessing sequence files...\n";
}
for my $FileIndex (0 .. $#SequenceFilesList) {
    if ($SequenceFilesInfo{FilesOkay}[$FileIndex]) {
        print "\nProcessing file $SequenceFilesList[$FileIndex]...\n";
        ExtractFromSequenceFiles($FileIndex);
    }
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
85
86 ###############################################################################
87
# Extract the specified sequences from one input file and write them out as a
# new FASTA sequence file.
#
# Parameters:
#   $FileIndex - Index into @SequenceFilesList and the per-file arrays in
#                %SequenceFilesInfo.
#
# Dies when the input file can't be opened for reading.
sub ExtractFromSequenceFiles {
    my($FileIndex) = @_;
    my($OutSequenceFile, $SequenceFile, $SequenceDataRef, $SpecifiedSequenceDataRef, $SpecifiedSequenceCount);

    # Make sure the file is readable before handing its name over to the reader.
    # Three-argument open with a lexical handle is used so that characters such
    # as ">" or "|" in the file name are never interpreted as an open mode...
    $SequenceFile = $SequenceFilesList[$FileIndex];
    open(my $SequenceFH, '<', $SequenceFile) or die "Error: Can't open $SequenceFile: $! \n";
    close $SequenceFH;
    $SequenceDataRef = ReadSequenceFile($SequenceFile);

    $OutSequenceFile = $SequenceFilesInfo{OutFile}[$FileIndex];
    print "Generating sequence file $OutSequenceFile...\n";

    # Retrieve sequence data for specified sequences...
    $SpecifiedSequenceDataRef = GetSpecifiedSequenceData($SequenceDataRef);

    $SpecifiedSequenceCount = scalar @{$SpecifiedSequenceDataRef->{IDs}};
    if (!$SpecifiedSequenceCount) {
        warn "Warning: No sequences matching the specified criteria were found in $SequenceFile\n";
    }

    # Handle gaps...
    if ($OptionsInfo{IgnoreGaps}) {
        if ($SpecifiedSequenceCount > 1) {
            # Gap columns are removed only when the sequence lengths are identical;
            # otherwise the sequences are written out unchanged...
            if (AreSequenceLengthsIdentical($SpecifiedSequenceDataRef)) {
                $SpecifiedSequenceDataRef = RemoveSequenceAlignmentGapColumns($SpecifiedSequenceDataRef);
            }
        }
        elsif ($SpecifiedSequenceCount == 1) {
            # Remove the gaps from the sequence...
            my($ID, $Sequence);
            $ID = $SpecifiedSequenceDataRef->{IDs}[0];
            $Sequence = $SpecifiedSequenceDataRef->{Sequence}{$ID};
            $SpecifiedSequenceDataRef->{Sequence}{$ID} = RemoveSequenceGaps($Sequence);
        }
        # With no selected sequences there is nothing to remove gaps from; indexing
        # the first ID here would add a bogus entry to the sequence map...
    }

    # Write out the file...
    WritePearsonFastaSequenceFile($OutSequenceFile, $SpecifiedSequenceDataRef, $OptionsInfo{MaxSequenceLength});
}
124
# Dispatch to the extraction routine corresponding to the processed mode value.
#
# Parameters:
#   $SequenceDataRef - Reference to the sequence data returned by ReadSequenceFile.
#
# Returns a reference to a hash with IDs, Description and Sequence keys holding
# the selected sequences, or undef for an unrecognized mode.
sub GetSpecifiedSequenceData {
    my($SequenceDataRef) = @_;

    # All branches test the processed mode in %OptionsInfo; it's set from
    # $Options{mode} in ProcessOptions...
    if ($OptionsInfo{Mode} =~ /^SequenceID$/i) {
        return GetDataBySequenceIDs($SequenceDataRef);
    }
    elsif ($OptionsInfo{Mode} =~ /^SequenceNum$/i) {
        return GetDataBySequenceNums($SequenceDataRef);
    }
    elsif ($OptionsInfo{Mode} =~ /^SequenceNumRange$/i) {
        return GetDataBySequenceNumRange($SequenceDataRef);
    }
    else {
        return undef;
    }
}
142
# Collect sequences whose IDs match the specified sequence IDs.
#
# During exact matching, the lower cased sequence ID must be present in the
# specified IDs map and collection stops once as many sequences as specified
# IDs have been collected. During relaxed matching, a sequence is collected
# when any specified ID occurs in its ID as a case insensitive sub string.
#
# Parameters:
#   $SequenceDataRef - Reference to a hash with IDs, Description and Sequence keys.
#
# Returns a reference to a hash with the same three keys containing only the
# matched sequences in their original order.
sub GetDataBySequenceIDs {
    my($SequenceDataRef) = @_;
    my($ID, $LowerCaseID, $SequenceCount, $IDMatched, $SpecifiedID, %SpecifiedSequenceDataMap);

    # Go over sequences and collect sequences for writing out a new sequence file...
    %SpecifiedSequenceDataMap = ();
    @{$SpecifiedSequenceDataMap{IDs}} = ();
    %{$SpecifiedSequenceDataMap{Description}} = ();
    %{$SpecifiedSequenceDataMap{Sequence}} = ();

    $SequenceCount = 0;
    ID: for $ID (@{$SequenceDataRef->{IDs}}) {
        $LowerCaseID = lc($ID);
        if ($OptionsInfo{MatchExactSequenceIDs}) {
            if (!exists $OptionsInfo{SpecifiedSequenceIDsMap}{$LowerCaseID}) {
                next ID;
            }
            if ($SequenceCount >= scalar @{$OptionsInfo{SpecifiedSequenceIDs}}) {
                last ID;
            }
            $SequenceCount++;
        }
        else {
            # Does this ID contain a specified ID as a sub string? A literal sub string
            # test is used instead of a regex so that characters such as "|", "(" or
            # "[" in the specified IDs are not treated as pattern syntax...
            $IDMatched = 0;
            SPECIFIEDID: for $SpecifiedID (@{$OptionsInfo{SpecifiedSequenceIDs}}) {
                # An empty entry, e.g. from "A,,B", would match every ID; skip it...
                if (!length $SpecifiedID) {
                    next SPECIFIEDID;
                }
                if (index($LowerCaseID, lc($SpecifiedID)) >= 0) {
                    $IDMatched = 1;
                    last SPECIFIEDID;
                }
            }
            if (!$IDMatched) {
                next ID;
            }
        }
        # Collect sequence data...
        push @{$SpecifiedSequenceDataMap{IDs}}, $ID;
        $SpecifiedSequenceDataMap{Description}{$ID} = $SequenceDataRef->{Description}{$ID};
        $SpecifiedSequenceDataMap{Sequence}{$ID} = $SequenceDataRef->{Sequence}{$ID};
    }

    return \%SpecifiedSequenceDataMap;
}
187
# Collect sequences by their position in the input file; the first sequence
# is number 1.
#
# Parameters:
#   $SequenceDataRef - Reference to a hash with IDs, Description and Sequence keys.
#
# Returns a reference to a hash with the same three keys containing only the
# sequences whose numbers are present in the specified sequence numbers map.
sub GetDataBySequenceNums {
    my($SequenceDataRef) = @_;

    my(%Collected) = (
        IDs         => [],
        Description => {},
        Sequence    => {},
    );

    my($Position) = 0;
    my($CollectedCount) = 0;

    for my $SeqID (@{$SequenceDataRef->{IDs}}) {
        $Position++;

        # Skip sequences whose position wasn't asked for...
        next unless exists $OptionsInfo{SpecifiedSequenceIDsMap}{$Position};

        # Stop once as many sequences as specified numbers have been collected...
        last if $CollectedCount >= scalar @{$OptionsInfo{SpecifiedSequenceIDs}};
        $CollectedCount++;

        push @{$Collected{IDs}}, $SeqID;
        $Collected{Description}{$SeqID} = $SequenceDataRef->{Description}{$SeqID};
        $Collected{Sequence}{$SeqID} = $SequenceDataRef->{Sequence}{$SeqID};
    }

    return \%Collected;
}
219
# Collect sequences whose position in the input file falls within the specified
# starting and ending sequence numbers, both inclusive; the first sequence is
# number 1.
#
# Parameters:
#   $SequenceDataRef - Reference to a hash with IDs, Description and Sequence keys.
#
# Returns a reference to a hash with the same three keys containing only the
# sequences in the specified range.
sub GetDataBySequenceNumRange {
    my($SequenceDataRef) = @_;
    my($ID, $SequenceNum, $StartSequenceNum, $EndSequenceNum, %SpecifiedSequenceDataMap);

    # Go over sequences and collect sequences for writing out a new sequence file...
    %SpecifiedSequenceDataMap = ();
    @{$SpecifiedSequenceDataMap{IDs}} = ();
    %{$SpecifiedSequenceDataMap{Description}} = ();
    %{$SpecifiedSequenceDataMap{Sequence}} = ();

    $StartSequenceNum = $OptionsInfo{SpecifiedSequenceIDs}[0];
    $EndSequenceNum = $OptionsInfo{SpecifiedSequenceIDs}[1];

    $SequenceNum = 0;
    ID: for $ID (@{$SequenceDataRef->{IDs}}) {
        $SequenceNum++;

        # Check the end of the range first: nothing past it can be selected, so
        # there is no need to go over the remaining sequences...
        if ($SequenceNum > $EndSequenceNum) {
            last ID;
        }
        if ($SequenceNum < $StartSequenceNum) {
            next ID;
        }

        # Collect sequence data...
        push @{$SpecifiedSequenceDataMap{IDs}}, $ID;
        $SpecifiedSequenceDataMap{Description}{$ID} = $SequenceDataRef->{Description}{$ID};
        $SpecifiedSequenceDataMap{Sequence}{$ID} = $SequenceDataRef->{Sequence}{$ID};
    }

    return \%SpecifiedSequenceDataMap;
}
251
252
# Process option values in %Options and set up %OptionsInfo used by the rest of
# the script.
#
# The value of "-s, --Sequences" is split on commas and validated according to
# the mode: SequenceID requires a value; SequenceNum takes positive integers and
# defaults to "1"; SequenceNumRange requires exactly two positive integers with
# the first one not larger than the second.
#
# Dies with an error message on any invalid "-s, --Sequences" value.
sub ProcessOptions {
    %OptionsInfo = ();

    # Miscellaneous options...
    $OptionsInfo{IgnoreGaps} = ($Options{ignoregaps} =~ /Yes/i) ? 1 : 0;

    $OptionsInfo{Mode} = $Options{mode};
    $OptionsInfo{MatchExactSequenceIDs} = ($Options{sequenceidmatch} =~ /Exact/i) ? 1 : 0;

    # Check specified sequences value...
    $OptionsInfo{SpecifiedSequences} = $Options{sequences};
    @{$OptionsInfo{SpecifiedSequenceIDs}} = ();
    %{$OptionsInfo{SpecifiedSequenceIDsMap}} = ();

    my(@SpecifiedSequenceIDs) = ();
    if ($Options{mode} =~ /^SequenceID$/i) {
        if (!$Options{sequences}) {
            die "Error: No value specified for option \"-s, --Sequences\" during \"SequenceID\" of \"-m, --mode\" option\n";
        }
        # Sequence IDs are used exactly as typed...
        @SpecifiedSequenceIDs = split /\,/, $Options{sequences};
    }
    elsif ($Options{mode} =~ /^SequenceNum$/i) {
        if ($Options{sequences}) {
            @SpecifiedSequenceIDs = split /\,/, $Options{sequences};
            my($SequenceNum);
            for $SequenceNum (@SpecifiedSequenceIDs) {
                # The loop variable aliases the array element: trim white space in
                # place so that values such as "1, 5" are accepted and later match
                # the numeric keys used during lookup...
                $SequenceNum =~ s/^\s+//;
                $SequenceNum =~ s/\s+$//;
                if (!IsPositiveInteger($SequenceNum)) {
                    die "Error: The value specified, $SequenceNum, in \"$Options{sequences}\" for option \"-s, --Sequences\" is not valid: Valid values: > 0\n";
                }
            }
        }
        else {
            # Default is to extract the first sequence...
            push @SpecifiedSequenceIDs, "1";
        }
    }
    elsif ($Options{mode} =~ /^SequenceNumRange$/i) {
        if (!$Options{sequences}) {
            die "Error: No value specified for option \"-s, --Sequences\" during \"SequenceNumRange\" of \"-m, --mode\" option\n";
        }
        @SpecifiedSequenceIDs = split /\,/, $Options{sequences};
        if (@SpecifiedSequenceIDs != 2) {
            die "Error: The number of values ", scalar @SpecifiedSequenceIDs, " specified, $Options{sequences}, for option \"-s, --Sequences\" are not valid. Number of values must be 2 to indicate starting and ending sequence number.\n";
        }
        my($SequenceNum);
        for $SequenceNum (@SpecifiedSequenceIDs) {
            # Trim white space in place; see the note for SequenceNum mode...
            $SequenceNum =~ s/^\s+//;
            $SequenceNum =~ s/\s+$//;
            if (!IsPositiveInteger($SequenceNum)) {
                die "Error: The value specified, $SequenceNum, in \"$Options{sequences}\" for option \"-s, --Sequences\" is not valid: Valid values: > 0\n";
            }
        }
        if ($SpecifiedSequenceIDs[0] > $SpecifiedSequenceIDs[1]) {
            die "Error: The value specified \"$Options{sequences}\" for option \"-s, --Sequences\" are not valid: Starting sequence number $SpecifiedSequenceIDs[0] must be smaller than ending sequence number $SpecifiedSequenceIDs[1]\n";
        }
    }
    push @{$OptionsInfo{SpecifiedSequenceIDs}}, @SpecifiedSequenceIDs;

    # Set up a lookup map: sequence IDs are keyed by their lower cased value for
    # case insensitive exact matching; sequence numbers are keyed as is...
    my($SequenceID);
    for $SequenceID (@SpecifiedSequenceIDs) {
        if ($Options{mode} =~ /^SequenceID$/i) {
            $OptionsInfo{SpecifiedSequenceIDsMap}{lc($SequenceID)} = $SequenceID;
        }
        else {
            $OptionsInfo{SpecifiedSequenceIDsMap}{$SequenceID} = $SequenceID;
        }
    }

    $OptionsInfo{MaxSequenceLength} = $Options{sequencelength};
    $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
    $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
}
322
# Retrieve information about sequence files and set up output file names.
#
# For each file in @SequenceFilesList, the following per-file entries are set
# in %SequenceFilesInfo: FilesOkay, OutFileRoot, OutFileExt, OutFile, Format and
# SequenceCount. A file is flagged as okay only when it can be opened, its
# format is supported, it contains sequence data, and its output file doesn't
# already exist unless overwriting is requested; otherwise a warning is printed
# and the file keeps its "not okay" defaults.
#
# The output file name is <Root><Mode>.fasta where the root comes from the
# "-r, --root" option for a single input file and from the input file name
# otherwise.
sub RetrieveSequenceFilesInfo {
    my($Index, $SequenceFile, $FileSupported, $FileFormat, $SequenceCount, $FileDir, $FileName, $FileExt, $OutFileRoot, $OutFileExt, $OutFileMode, $OutFile, $SequenceDataRef);

    %SequenceFilesInfo = ();
    @{$SequenceFilesInfo{FilesOkay}} = ();
    @{$SequenceFilesInfo{OutFileRoot}} = ();
    @{$SequenceFilesInfo{OutFileExt}} = ();
    @{$SequenceFilesInfo{OutFile}} = ();
    @{$SequenceFilesInfo{Format}} = ();
    @{$SequenceFilesInfo{SequenceCount}} = ();

    FILELIST: for $Index (0 .. $#SequenceFilesList) {
        $SequenceFile = $SequenceFilesList[$Index];
        $SequenceFilesInfo{FilesOkay}[$Index] = 0;
        $SequenceFilesInfo{OutFileRoot}[$Index] = '';
        $SequenceFilesInfo{OutFileExt}[$Index] = '';
        $SequenceFilesInfo{OutFile}[$Index] = '';
        $SequenceFilesInfo{Format}[$Index] = 'NotSupported';
        $SequenceFilesInfo{SequenceCount}[$Index] = 0;

        # Check that the file can be read. Three-argument open with a lexical
        # handle is used so that characters such as ">" or "|" in a file name are
        # never interpreted as an open mode...
        my($SequenceFH);
        if (!open($SequenceFH, '<', $SequenceFile)) {
            warn "Warning: Ignoring file $SequenceFile: Couldn't open it: $! \n";
            next FILELIST;
        }
        close $SequenceFH;

        ($FileSupported, $FileFormat) = IsSupportedSequenceFile($SequenceFile);
        if (!$FileSupported) {
            warn "Warning: Ignoring file $SequenceFile: Sequence file format is not supported.\n";
            next FILELIST;
        }
        $SequenceDataRef = ReadSequenceFile($SequenceFile);

        $SequenceCount = $SequenceDataRef->{Count};
        if (!$SequenceCount) {
            warn "Warning: Ignoring file $SequenceFile: Sequence data is missing.\n";
            next FILELIST;
        }

        # Setup output file names...
        $FileDir = ""; $FileName = ""; $FileExt = "";
        ($FileDir, $FileName, $FileExt) = ParseFileName($SequenceFile);
        $OutFileExt = 'fasta';
        if ($OptionsInfo{OutFileRoot} && (@SequenceFilesList == 1)) {
            # Use the specified root without its extension when it has both a name
            # and an extension; otherwise, use it as specified...
            my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
            if ($RootFileName && $RootFileExt) {
                $FileName = $RootFileName;
            }
            else {
                $FileName = $OptionsInfo{OutFileRoot};
            }
            $OutFileRoot = $FileName;
        }
        else {
            $OutFileRoot = $FileName;
        }

        # Mode name with canonical capitalization is appended to the root...
        MODE: {
            if ($OptionsInfo{Mode} =~ /^SequenceID$/i) { $OutFileMode = 'SequenceID'; last MODE;}
            if ($OptionsInfo{Mode} =~ /^SequenceNum$/i) { $OutFileMode = 'SequenceNum'; last MODE;}
            if ($OptionsInfo{Mode} =~ /^SequenceNumRange$/i) { $OutFileMode = 'SequenceNumRange'; last MODE;}
            $OutFileMode = '';
        }
        $OutFile = "${OutFileRoot}${OutFileMode}.${OutFileExt}";

        if (!$OptionsInfo{OverwriteFiles}) {
            if (-e $OutFile) {
                warn "Warning: Ignoring file $SequenceFile: The file $OutFile already exists\n";
                next FILELIST;
            }
        }

        $SequenceFilesInfo{FilesOkay}[$Index] = 1;
        $SequenceFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
        $SequenceFilesInfo{OutFileExt}[$Index] = $OutFileExt;
        $SequenceFilesInfo{OutFile}[$Index] = $OutFile;

        $SequenceFilesInfo{Format}[$Index] = $FileFormat;
        $SequenceFilesInfo{SequenceCount}[$Index] = $SequenceCount;
    }
}
402
# Setup script usage and retrieve command line arguments specified using various options.
#
# Default values are assigned to %Options before the command line is parsed by
# GetOptions. The working directory is changed when "-w, --workingdir" is
# specified, and the values of ignoregaps, mode, sequenceidmatch and
# sequencelength options are validated.
#
# Dies with an error message on an unknown option or an invalid option value.
sub SetupScriptUsage {

    # Retrieve all the options...
    %Options = ();
    $Options{ignoregaps} = 'Yes';
    $Options{mode} = 'SequenceNum';
    $Options{sequenceidmatch} = 'Relaxed';
    $Options{sequencelength} = 80;

    if (!GetOptions(\%Options, "help|h", "ignoregaps|i=s", "mode|m=s", "overwrite|o", "root|r=s", "sequences|s=s", "sequenceidmatch=s", "sequencelength=i", "workingdir|w=s")) {
        die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
    }
    if ($Options{workingdir}) {
        if (! -d $Options{workingdir}) {
            die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
        }
        chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
    }
    if ($Options{ignoregaps} !~ /^(yes|no)$/i) {
        # Name the option being validated; this message used to refer to "-q --quote"...
        die "Error: The value specified, $Options{ignoregaps}, for option \"-i, --IgnoreGaps\" is not valid. Allowed values: yes or no\n";
    }
    if ($Options{mode} !~ /^(SequenceID|SequenceNum|SequenceNumRange)$/i) {
        die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: SequenceID, SequenceNum, or SequenceNumRange\n";
    }
    if ($Options{sequenceidmatch} !~ /^(Exact|Relaxed)$/i) {
        die "Error: The value specified, $Options{sequenceidmatch}, for option \"--SequenceIDMatch\" is not valid. Allowed values: Exact or Relaxed\n";
    }
    if (!IsPositiveInteger($Options{sequencelength})) {
        die "Error: The value specified, $Options{sequencelength}, for option \"--SequenceLength\" is not valid. Allowed values: >0\n";
    }
}
435
436 __END__
437
438 =head1 NAME
439
440 ExtractFromSequenceFiles.pl - Extract data from sequence and alignment files
441
442 =head1 SYNOPSIS
443
444 ExtractFromSequenceFiles.pl SequenceFile(s) AlignmentFile(s)...
445
446 ExtractFromSequenceFiles.pl [B<-h, --help>] [B<-i, --IgnoreGaps> yes | no]
447 [B<-m, --mode> SequenceID | SequenceNum | SequenceNumRange] [B<-o, --overwrite>]
448 [B<-r, --root> rootname] [B<-s, --Sequences> "SequenceID, [SequenceID,...]" | "SequenceNum, [SequenceNum,...]" |
449 "StartingSeqNum, EndingSeqNum"] [B<--SequenceIDMatch> Exact | Relaxed]
450 [B<-w, --WorkingDir> dirname] SequenceFile(s) AlignmentFile(s)...
451
452 =head1 DESCRIPTION
453
454 Extract specific data from I<SequenceFile(s) and AlignmentFile(s)> and generate
455 FASTA files. You can extract sequences using sequence IDs or sequence numbers.
456
457 The file names are separated by spaces. All the sequence files in a current directory can
458 be specified by I<*.aln>, I<*.msf>, I<*.fasta>, I<*.fta>, I<*.pir> or any other supported
459 formats; additionally, I<DirName> corresponds to all the sequence files in the current directory
460 with any of the supported file extension: I<.aln, .msf, .fasta, .fta, and .pir>.
461
462 Supported sequence formats are: I<ALN/ClustalW>, I<GCG/MSF>, I<PILEUP/MSF>, I<Pearson/FASTA>,
463 and I<NBRF/PIR>. Instead of using file extensions, file formats are detected by parsing the contents
464 of I<SequenceFile(s) and AlignmentFile(s)>.
465
466 =head1 OPTIONS
467
468 =over 4
469
470 =item B<-h, --help>
471
472 Print this help message.
473
474 =item B<-i, --IgnoreGaps> I<yes | no>
475
476 Ignore gaps or gap columns during generation of new sequence or alignment file(s).
477 Possible values: I<yes or no>. Default value: I<yes>.
478
479 In order to remove gap columns, the lengths of all the sequences must be the same; otherwise,
480 this option is ignored.
481
482 =item B<-m, --mode> I<SequenceID | SequenceNum | SequenceNumRange>
483
484 Specify how to extract data from sequence files: extract sequences using sequence
485 IDs or sequence numbers. Possible values: I<SequenceID | SequenceNum
486 | SequenceNumRange>. Default: I<SequenceNum> with value of 1.
487
488 The sequence numbers correspond to position of sequences starting from 1 for first sequence
489 in I<SequenceFile(s) and AlignmentFile(s)>.
490
491 =item B<-o, --overwrite>
492
493 Overwrite existing files.
494
495 =item B<-r, --root> I<rootname>
496
497 New sequence file name is generated using the root: <Root><Mode>.<Ext>. Default new file:
498 <SequenceFileName><Mode>.<Ext>. This option is ignored for multiple input files.
499
500 =item B<-s, --Sequences> I<"SequenceID,[SequenceID,...]" | "SequenceNum,[SequenceNum,...]" | "StartingSeqNum,EndingSeqNum">
501
502 This value is B<-m, --mode> specific. In general, it's a comma delimited list of sequence IDs or sequence
503 numbers.
504
505 For I<SequenceID> value of B<-m, --mode> option, input value format is: I<SequenceID,...>. Examples:
506
507 ACHE_BOVIN
508 ACHE_BOVIN,ACHE_HUMAN
509
510 For I<SequenceNum> value of B<-m, --mode> option, input value format is: I<SequenceNum,...>. Examples:
511
512 2
513 1,5
514
515 For I<SequenceNumRange> value of B<-m, --mode> option, input value format is: I<StartingSeqNum,EndingSeqNum>. Examples:
516
517 2,4
518
519 =item B<--SequenceIDMatch> I<Exact | Relaxed>
520
521 Sequence IDs matching criterion during I<SequenceID> value of B<-m, --mode> option: match
522 specified sequence ID exactly or as sub string against sequence IDs in the files. Possible
523 values: I<Exact | Relaxed>. Default: I<Relaxed>. Sequence ID match is case insensitive
524 during both options.
525
526 =item B<--SequenceLength> I<number>
527
528 Maximum sequence length per line in sequence file(s). Default: I<80>.
529
530 =item B<-w --WorkingDir> I<text>
531
532 Location of working directory. Default: current directory.
533
534 =back
535
536 =head1 EXAMPLES
537
538 To extract first sequence from Sample1.fasta sequence file and generate Sample1SequenceNum.fasta
539 sequence file, type:
540
541 % ExtractFromSequenceFiles.pl -o Sample1.fasta
542
543 To extract first sequence from Sample1.aln alignment file and generate Sample1SequenceNum.fasta
544 sequence file without any column gaps, type:
545
546 % ExtractFromSequenceFiles.pl -o Sample1.aln
547
548 To extract first sequence from Sample1.aln alignment file and generate Sample1SequenceNum.fasta
549 sequence file with column gaps, type:
550
551 % ExtractFromSequenceFiles.pl --IgnoreGaps No -o Sample1.aln
552
553 To extract sequence number 1 and 4 from Sample1.fasta sequence file and generate
554 Sample1SequenceNum.fasta sequence file, type:
555
556 % ExtractFromSequenceFiles.pl -o -m SequenceNum --Sequences 1,4
557 -o Sample1.fasta
558
559 To extract sequences from sequence number 1 to 4 from Sample1.fasta sequence file and generate
560 Sample1SequenceNumRange.fasta sequence file, type:
561
562 % ExtractFromSequenceFiles.pl -o -m SequenceNumRange --Sequences
563 1,4 -o Sample1.fasta
564
565 To extract sequence ID "Q9P993/104-387" from sequence from Sample1.fasta sequence file and generate
566 Sample1SequenceID.fasta sequence file, type:
567
568 % ExtractFromSequenceFiles.pl -o -m SequenceID --Sequences
569 "Q9P993/104-387" --SequenceIDMatch Exact -o Sample1.fasta
570
571 =head1 AUTHOR
572
573 Manish Sud <msud@san.rr.com>
574
575 =head1 SEE ALSO
576
577 AnalyzeSequenceFilesData.pl, InfoSequenceFiles.pl
578
579 =head1 COPYRIGHT
580
581 Copyright (C) 2015 Manish Sud. All rights reserved.
582
583 This file is part of MayaChemTools.
584
585 MayaChemTools is free software; you can redistribute it and/or modify it under
586 the terms of the GNU Lesser General Public License as published by the Free
587 Software Foundation; either version 3 of the License, or (at your option)
588 any later version.
589
590 =cut