0
|
1 # $Id: Lucy.pm,v 1.6 2002/10/22 07:38:46 lapp Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::Tools::Lucy
|
|
4 #
|
|
5 # Copyright Her Majesty the Queen of England
|
|
6 # written by Andrew Walsh (paeruginosa@hotmail.com) during employment with
|
|
7 # Agriculture and Agri-food Canada, Cereal Research Centre, Winnipeg, MB
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10 # POD documentation - main docs before the code
|
|
11
|
|
12 =head1 NAME
|
|
13
|
|
14 Bio::Tools::Lucy - Object for analyzing the output from Lucy,
|
|
15 a vector and quality trimming program from TIGR
|
|
16
|
|
17 =head1 SYNOPSIS
|
|
18
|
|
19 # Create the Lucy object from an existing Lucy output file
|
|
20 @params = ('seqfile' => 'lucy.seq', 'lucy_verbose' => 1);
|
|
21 $lucyObj = Bio::Tools::Lucy->new(@params);
|
|
22
|
|
23 # Get names of all sequences
|
|
24 $names = $lucyObj->get_sequence_names();
|
|
25
|
|
26 # Print seq and qual values for sequences >400 bp in order to run CAP3
|
|
27 foreach $name (@$names) {
|
|
28 next unless $lucyObj->length_clear($name) > 400;
|
|
29 print SEQ ">$name\n", $lucyObj->sequence($name), "\n";
|
|
30 print QUAL ">$name\n", $lucyObj->quality($name), "\n";
|
|
31 }
|
|
32
|
|
33 # Get an array of Bio::PrimarySeq objects
|
|
34 @seqObjs = $lucyObj->get_Seq_Objs();
|
|
35
|
|
36
|
|
37 =head1 DESCRIPTION
|
|
38
|
|
39 Bio::Tools::Lucy.pm provides methods for analyzing the sequence and
|
|
40 quality values generated by Lucy program from TIGR.
|
|
41
|
|
42 Lucy will identify vector, poly-A/T tails, and poor quality regions in
|
|
43 a sequence. (www.genomics.purdue.edu/gcg/other/lucy.pdf)
|
|
44
|
|
45 The input to Lucy can be the Phred sequence and quality files
|
|
46 generated from running Phred on a set of chromatograms.
|
|
47
|
|
48 Lucy can be obtained (free of charge to academic users) from
|
|
49 www.tigr.org/softlab
|
|
50
|
|
51 There are a few methods that will only be available if you make some
|
|
52 minor changes to the source for Lucy and then recompile. The changes
|
|
53 are in the 'lucy.c' file and there is a diff between the original and
|
|
54 the modified file in the Appendix
|
|
55
|
|
56 Please contact the author of this module if you have any problems
|
|
57 making these modifications.
|
|
58
|
|
59 You do not have to make these modifications to use this module.
|
|
60
|
|
61 =head2 Creating a Lucy object
|
|
62
|
|
63 @params = ('seqfile' => 'lucy.seq', 'adv_stderr' => 1,
|
|
64 'fwd_desig' => '_F', 'rev_desig' => '_R');
|
|
65 $lucyObj = Bio::Tools::Lucy->new(@params);
|
|
66
|
|
67 =head2 Using a Lucy object
|
|
68
|
|
69 You should get an array with the sequence names in order to use
|
|
70 accessor methods. Note: The Lucy binary program will fail unless
|
|
71 the sequence names provided as input are unique.
|
|
72
|
|
73 $names_ref = $lucyObj->get_sequence_names();
|
|
74
|
|
75 This code snippet will produce a Fasta format file with sequence
|
|
76 lengths and %GC in the description line.
|
|
77
|
|
78 foreach $name (@$names) {
|
|
79 print FILE ">$name\t",
|
|
80 $lucyObj->length_clear($name), "\t",
|
|
81 $lucyObj->per_GC($name), "\n",
|
|
82 $lucyObj->sequence($name), "\n";
|
|
83 }
|
|
84
|
|
85
|
|
86 Print seq and qual values for sequences >400 bp in order to assemble
|
|
87 them with CAP3 (or other assembler).
|
|
88
|
|
89 foreach $name (@$names) {
|
|
90 next unless $lucyObj->length_clear($name) > 400;
|
|
91 print SEQ ">$name\n", $lucyObj->sequence($name), "\n";
|
|
92 print QUAL ">$name\n", $lucyObj->quality($name), "\n";
|
|
93 }
|
|
94
|
|
95 Get all the sequences as Bio::PrimarySeq objects (eg., for use with
|
|
96 Bio::Tools::Blast to perform BLAST).
|
|
97
|
|
98 @seqObjs = $lucyObj->get_Seq_Objs();
|
|
99
|
|
100 Or use only those sequences that are full length and have a Poly-A
|
|
101 tail.
|
|
102
|
|
103 foreach $name (@$names) {
|
|
104 next unless ($lucyObj->full_length($name) and $lucy->polyA($name));
|
|
105 push @seqObjs, $lucyObj->get_Seq_Obj($name);
|
|
106 }
|
|
107
|
|
108
|
|
109 Get the names of those sequences that were rejected by Lucy.
|
|
110
|
|
111 $rejects_ref = $lucyObj->get_rejects();
|
|
112
|
|
113 Print the names of the rejects and 1 letter code for reason they
|
|
114 were rejected.
|
|
115
|
|
116 foreach $key (sort keys %$rejects_ref) {
|
|
117 print "$key: ", $rejects_ref->{$key};
|
|
118 }
|
|
119
|
|
120 There is a lot of other information available about the sequences
|
|
121 analyzed by Lucy (see APPENDIX). This module can be used with the
|
|
122 DBI module to store this sequence information in a database.
|
|
123
|
|
124 =head1 FEEDBACK
|
|
125
|
|
126 =head2 Mailing Lists
|
|
127
|
|
128 User feedback is an integral part of the evolution of this and other
|
|
129 Bioperl modules. Send your comments and suggestions preferably to one
|
|
130 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
131
|
|
132 bioperl-l@bioperl.org - General discussion
|
|
133 http://bio.perl.org/MailList.html - About the mailing lists
|
|
134
|
|
135 =head2 Reporting Bugs
|
|
136
|
|
137 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
138 the bugs and their resolution. Bug reports can be submitted via email
|
|
139 or the web:
|
|
140
|
|
141 bioperl-bugs@bio.perl.org
|
|
142 http://bugzilla.bioperl.org/
|
|
143
|
|
144 =head1 AUTHOR
|
|
145
|
|
146 Andrew G. Walsh paeruginosa@hotmail.com
|
|
147
|
|
148 =head1 APPENDIX
|
|
149
|
|
150 Methods available to Lucy objects are described below. Please note
|
|
151 that any method beginning with an underscore is considered internal
|
|
152 and should not be called directly.
|
|
153
|
|
154 =cut
|
|
155
|
|
156
|
|
157 package Bio::Tools::Lucy;
|
|
158
|
|
159 use vars qw($VERSION $AUTOLOAD @ISA @ATTR %OK_FIELD);
|
|
160 use strict;
|
|
161 use Bio::PrimarySeq;
|
|
162 use Bio::Root::Root;
|
|
163 use Bio::Root::IO;
|
|
164
|
|
165 @ISA = qw(Bio::Root::Root Bio::Root::IO);
|
|
166 @ATTR = qw(seqfile qualfile stderrfile infofile lucy_verbose fwd_desig rev_desig adv_stderr);
|
|
167 foreach my $attr (@ATTR) {
|
|
168 $OK_FIELD{$attr}++
|
|
169 }
|
|
170 $VERSION = "0.01";
|
|
171
|
|
172 sub AUTOLOAD {
|
|
173 my $self = shift;
|
|
174 my $attr = $AUTOLOAD;
|
|
175 $attr =~ s/.*:://;
|
|
176 $attr = lc $attr;
|
|
177 $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr};
|
|
178 $self->{$attr} = shift if @_;
|
|
179 return $self->{$attr};
|
|
180 }
|
|
181
|
|
182 =head2 new
|
|
183
|
|
184 Title : new
|
|
185 Usage : $lucyObj = Bio::Tools::Lucy->new(seqfile => lucy.seq, rev_desig => '_R',
|
|
186 fwd_desig => '_F')
|
|
187 Function: creates a Lucy object from Lucy analysis files
|
|
188 Returns : reference to Bio::Tools::Lucy object
|
|
189 Args : seqfile Fasta sequence file generated by Lucy
|
|
190 qualfile Quality values file generated by Lucy
|
|
191 infofile Info file created when Lucy is run with -debug 'infofile' option
|
|
192 stderrfile Standard error captured from Lucy when Lucy is run
|
|
193 with -info option and STDERR is directed to stderrfile
|
|
194 (ie. lucy ... 2> stderrfile).
|
|
195 Info in this file will include sequences dropped for low
|
|
196 quality. If you've modified Lucy source (see adv_stderr below),
|
|
197 it will also include info on which sequences were dropped because
|
|
198 they were vector, too short, had no insert, and whether a poly-A
|
|
199 tail was found (if Lucy was run with -cdna option).
|
|
200 lucy_verbose verbosity level (0-1).
|
|
201 fwd_desig The string used to determine whether sequence is a forward read.
|
|
202 The parser will assume that this match will occus at the
|
|
203 end of the sequence name string.
|
|
204 rev_desig As above, for reverse reads.
|
|
205 adv_stderr Can be set to a true value (1). Will only work if you have modified
|
|
206 the Lucy source code as outlined in DESCRIPTION and capture
|
|
207 the standard error from Lucy.
|
|
208
|
|
209 If you don't provide filenames for qualfile, infofile or stderrfile,
|
|
210 the module will assume that .qual, .info, and .stderr are the file
|
|
211 extensions and search in the same directory as the .seq file for these
|
|
212 files.
|
|
213
|
|
214 For example, if you create a Lucy object with $lucyObj =
|
|
215 Bio::Tools::Lucy-E<gt>new(seqfile =E<gt>lucy.seq), the module will
|
|
216 find lucy.qual, lucy.info and lucy.stderr.
|
|
217
|
|
218 You can omit any or all of the quality, info or stderr files, but you
|
|
219 will not be able to use all of the object methods (see method
|
|
220 documentation below).
|
|
221
|
|
222 =cut
|
|
223
|
|
224 sub new {
|
|
225 my ($class,@args) = @_;
|
|
226 my $self = $class->SUPER::new(@args);
|
|
227 my ($attr, $value);
|
|
228 while (@args) {
|
|
229 $attr = shift @args;
|
|
230 $attr = lc $attr;
|
|
231 $value = shift @args;
|
|
232 $self->{$attr} = $value;
|
|
233 }
|
|
234 &_parse($self);
|
|
235 return $self;
|
|
236 }
|
|
237
|
|
238 =head2 _parse
|
|
239
|
|
240 Title : _parse
|
|
241 Usage : n/a (internal function)
|
|
242 Function: called by new() to parse Lucy output files
|
|
243 Returns : nothing
|
|
244 Args : none
|
|
245
|
|
246 =cut
|
|
247
|
|
248 sub _parse {
|
|
249 my $self = shift;
|
|
250 $self->{seqfile} =~ /^(\S+)\.\S+$/;
|
|
251 my $file = $1;
|
|
252
|
|
253 print "Opening $self->{seqfile} for parsing...\n" if $self->{lucy_verbose};
|
|
254 open SEQ, "$self->{seqfile}" or $self->throw("Could not open sequence file: $self->{seqfile}");
|
|
255 my ($name, $line);
|
|
256 my $seq = "";
|
|
257 my @lines = <SEQ>;
|
|
258 while ($line = pop @lines) {
|
|
259 chomp $line;
|
|
260 if ($line =~ /^>(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/) {
|
|
261 $name = $1;
|
|
262 if ($self->{fwd_desig}) {
|
|
263 $self->{sequences}{$name}{direction} = "F" if $name =~ /^(\S+)($self->{fwd_desig})$/;
|
|
264 }
|
|
265 if ($self->{rev_desig}) {
|
|
266 $self->{sequences}{$name}{direction} = "R" if $name =~ /^(\S+)($self->{rev_desig})$/;
|
|
267 }
|
|
268 $self->{sequences}{$name}{min_clone_len} = $2; # this is used for TIGR Assembler, as are $3 and $4
|
|
269 $self->{sequences}{$name}{max_clone_len} = $3;
|
|
270 $self->{sequences}{$name}{med_clone_len} = $4;
|
|
271 $self->{sequences}{$name}{beg_clear} = $5;
|
|
272 $self->{sequences}{$name}{end_clear} = $6;
|
|
273 $self->{sequences}{$name}{length_raw} = $seq =~ tr/[AGCTN]//; # from what I've seen, these are the bases Phred calls. Please let me know if I'm wrong.
|
|
274 my $beg = $5-1; # substr function begins with index 0
|
|
275 $seq = $self->{sequences}{$name}{sequence} = substr ($seq, $beg, $6-$beg);
|
|
276 my $count = $self->{sequences}{$name}{length_clear} = $seq =~ tr/[AGCTN]//;
|
|
277 my $countGC = $seq =~ tr/[GC]//;
|
|
278 $self->{sequences}{$name}{per_GC} = $countGC/$count * 100;
|
|
279 $seq = "";
|
|
280 }
|
|
281 else {
|
|
282 $seq = $line.$seq;
|
|
283 }
|
|
284 }
|
|
285
|
|
286
|
|
287 # now parse quality values (check for presence of quality file first)
|
|
288 if ($self->{qualfile}) {
|
|
289 open QUAL, "$self->{qualfile}" or $self->throw("Could not open quality file: $self->{qualfile}");
|
|
290 @lines = <QUAL>;
|
|
291 }
|
|
292 elsif (-e "$file.qual") {
|
|
293 print "You did not set qualfile, but I'm opening $file.qual\n" if $self->{lucy_verbose};
|
|
294 $self->qualfile("$file.qual");
|
|
295 open QUAL, "$file.qual" or $self->throw("Could not open quality file: $file.qual");
|
|
296 @lines = <QUAL>;
|
|
297 }
|
|
298 else {
|
|
299 print "I did not find a quality file. You will not be able to use all of the accessor methods.\n" if $self->{lucy_verbose};
|
|
300 @lines = ();
|
|
301 }
|
|
302
|
|
303 my (@vals, @slice, $num, $tot, $vals);
|
|
304 my $qual = "";
|
|
305 while ($line = pop @lines) {
|
|
306 chomp $line;
|
|
307 if ($line =~ /^>(\S+)/) {
|
|
308 $name = $1;
|
|
309 @vals = split /\s/ , $qual;
|
|
310 @slice = @vals[$self->{sequences}{$name}{beg_clear} .. $self->{sequences}{$name}{end_clear}];
|
|
311 $vals = join "\t", @slice;
|
|
312 $self->{sequences}{$name}{quality} = $vals;
|
|
313 $qual = "";
|
|
314 foreach $num (@slice) {
|
|
315 $tot += $num;
|
|
316 }
|
|
317 $num = @slice;
|
|
318 $self->{sequences}{$name}{avg_quality} = $tot/$num;
|
|
319 $tot = 0;
|
|
320 }
|
|
321 else {
|
|
322 $qual = $line.$qual;
|
|
323 }
|
|
324 }
|
|
325
|
|
326 # determine whether reads are full length
|
|
327
|
|
328 if ($self->{infofile}) {
|
|
329 open INFO, "$self->{infofile}" or $self->throw("Could not open info file: $self->{infofile}");
|
|
330 @lines = <INFO>;
|
|
331 }
|
|
332 elsif (-e "$file.info") {
|
|
333 print "You did not set infofile, but I'm opening $file.info\n" if $self->{lucy_verbose};
|
|
334 $self->infofile("$file.info");
|
|
335 open INFO, "$file.info" or $self->throw("Could not open info file: $file.info");
|
|
336 @lines = <INFO>;
|
|
337 }
|
|
338 else {
|
|
339 print "I did not find an info file. You will not be able to use all of the accessor methods.\n" if $self->{lucy_verbose};
|
|
340 @lines = ();
|
|
341 }
|
|
342
|
|
343 foreach (@lines) {
|
|
344 /^(\S+).+CLV\s+(\d+)\s+(\d+)$/;
|
|
345 if ($2>0 && $3>0) {
|
|
346 $self->{sequences}{$1}{full_length} = 1 if $self->{sequences}{$1}; # will show cleavage info for rejected sequences too
|
|
347 }
|
|
348 }
|
|
349
|
|
350
|
|
351 # parse rejects (and presence of poly-A if Lucy has been modified)
|
|
352
|
|
353 if ($self->{stderrfile}) {
|
|
354 open STDERR_LUCY, "$self->{stderrfile}" or $self->throw("Could not open quality file: $self->{stderrfile}");
|
|
355 @lines = <STDERR_LUCY>;
|
|
356
|
|
357 }
|
|
358 elsif (-e "$file.stderr") {
|
|
359 print "You did not set stderrfile, but I'm opening $file.stderr\n" if $self->{lucy_verbose};
|
|
360 $self->stderrfile("$file.stderr");
|
|
361 open STDERR_LUCY, "$file.stderr" or $self->throw("Could not open quality file: $file.stderr");
|
|
362 @lines = <STDERR_LUCY>;
|
|
363 }
|
|
364 else {
|
|
365 print "I did not find a standard error file. You will not be able to use all of the accessor methods.\n" if $self->{lucy_verbose};
|
|
366 @lines = ();
|
|
367 }
|
|
368
|
|
369 if ($self->{adv_stderr}) {
|
|
370 foreach (@lines) {
|
|
371 $self->{reject}{$1} = "Q" if /dropping\s+(\S+)/;
|
|
372 $self->{reject}{$1} = "V" if /Vector: (\S+)/;
|
|
373 $self->{reject}{$1} = "E" if /Empty: (\S+)/;
|
|
374 $self->{reject}{$1} = "S" if /Short: (\S+)/;
|
|
375 $self->{sequences}{$1}{polyA} = 1 if /(\S+) has PolyA/;
|
|
376 if (/Dropped PolyA: (\S+)/) {
|
|
377 $self->{reject}{$1} = "P";
|
|
378 delete $self->{sequences}{$1};
|
|
379 }
|
|
380 }
|
|
381 }
|
|
382 else {
|
|
383 foreach (@lines) {
|
|
384 $self->{reject}{$1} = "R" if /dropping\s+(\S+)/;
|
|
385 }
|
|
386 }
|
|
387
|
|
388 }
|
|
389
|
|
390 =head2 get_Seq_Objs
|
|
391
|
|
392 Title : get_Seq_Objs
|
|
393 Usage : $lucyObj->get_Seq_Objs()
|
|
394 Function: returns an array of references to Bio::PrimarySeq objects
|
|
395 where -id = 'sequence name' and -seq = 'sequence'
|
|
396
|
|
397 Returns : array of Bio::PrimarySeq objects
|
|
398 Args : none
|
|
399
|
|
400 =cut
|
|
401
|
|
402 sub get_Seq_Objs {
|
|
403 my $self = shift;
|
|
404 my($seqobj, @seqobjs);
|
|
405 foreach my $key (sort keys %{$self->{sequences}}) {
|
|
406 $seqobj = Bio::PrimarySeq->new( -seq => "$self->{sequences}{$key}{sequence}",
|
|
407 -id => "$key");
|
|
408 push @seqobjs, $seqobj;
|
|
409 }
|
|
410 return \@seqobjs;
|
|
411 }
|
|
412
|
|
413 =head2 get_Seq_Obj
|
|
414
|
|
415 Title : get_Seq_Obj
|
|
416 Usage : $lucyObj->get_Seq_Obj($seqname)
|
|
417 Function: returns reference to a Bio::PrimarySeq object where -id = 'sequence name'
|
|
418 and -seq = 'sequence'
|
|
419 Returns : reference to Bio::PrimarySeq object
|
|
420 Args : name of a sequence
|
|
421
|
|
422 =cut
|
|
423
|
|
424 sub get_Seq_Obj {
|
|
425 my ($self, $key) = @_;
|
|
426 my $seqobj = Bio::PrimarySeq->new( -seq => "$self->{sequences}{$key}{sequence}",
|
|
427 -id => "$key");
|
|
428 return $seqobj;
|
|
429 }
|
|
430
|
|
431 =head2 get_sequence_names
|
|
432
|
|
433 Title : get_sequence_names
|
|
434 Usage : $lucyObj->get_sequence_names
|
|
435 Function: returns reference to an array of names of the sequences analyzed by Lucy.
|
|
436 These names are required for most of the accessor methods.
|
|
437 Note: The Lucy binary will fail unless sequence names are unique.
|
|
438 Returns : array reference
|
|
439 Args : none
|
|
440
|
|
441 =cut
|
|
442
|
|
443 sub get_sequence_names {
|
|
444 my $self = shift;
|
|
445 my @keys = sort keys %{$self->{sequences}};
|
|
446 return \@keys;
|
|
447 }
|
|
448
|
|
449 =head2 sequence
|
|
450
|
|
451 Title : sequence
|
|
452 Usage : $lucyObj->sequence($seqname)
|
|
453 Function: returns the DNA sequence of one of the sequences analyzed by Lucy.
|
|
454 Returns : string
|
|
455 Args : name of a sequence
|
|
456
|
|
457 =cut
|
|
458
|
|
459 sub sequence {
|
|
460 my ($self, $key) = @_;
|
|
461 return $self->{sequences}{$key}{sequence};
|
|
462 }
|
|
463
|
|
464 =head2 quality
|
|
465
|
|
466 Title : quality
|
|
467 Usage : $lucyObj->quality($seqname)
|
|
468 Function: returns the quality values of one of the sequences analyzed by Lucy.
|
|
469 This method depends on the user having provided a quality file.
|
|
470 Returns : string
|
|
471 Args : name of a sequence
|
|
472
|
|
473 =cut
|
|
474
|
|
475 sub quality {
|
|
476 my($self, $key) = @_;
|
|
477 return $self->{sequences}{$key}{quality};
|
|
478 }
|
|
479
|
|
480 =head2 avg_quality
|
|
481
|
|
482 Title : avg_quality
|
|
483 Usage : $lucyObj->avg_quality($seqname)
|
|
484 Function: returns the average quality value for one of the sequences analyzed by Lucy.
|
|
485 Returns : float
|
|
486 Args : name of a sequence
|
|
487
|
|
488 =cut
|
|
489
|
|
490 sub avg_quality {
|
|
491 my($self, $key) = @_;
|
|
492 return $self->{sequences}{$key}{avg_quality};
|
|
493 }
|
|
494
|
|
495 =head2 direction
|
|
496
|
|
497 Title : direction
|
|
498 Usage : $lucyObj->direction($seqname)
|
|
499 Function: returns the direction for one of the sequences analyzed by Lucy
|
|
500 providing that 'fwd_desig' or 'rev_desig' were set when the
|
|
501 Lucy object was created.
|
|
502 Strings returned are: 'F' for forward, 'R' for reverse.
|
|
503 Returns : string
|
|
504 Args : name of a sequence
|
|
505
|
|
506 =cut
|
|
507
|
|
508 sub direction {
|
|
509 my($self, $key) = @_;
|
|
510 return $self->{sequences}{$key}{direction} if $self->{sequences}{$key}{direction};
|
|
511 return "";
|
|
512 }
|
|
513
|
|
514 =head2 length_raw
|
|
515
|
|
516 Title : length_raw
|
|
517 Usage : $lucyObj->length_raw($seqname)
|
|
518 Function: returns the length of a DNA sequence prior to quality/ vector
|
|
519 trimming by Lucy.
|
|
520 Returns : integer
|
|
521 Args : name of a sequence
|
|
522
|
|
523 =cut
|
|
524
|
|
525 sub length_raw {
|
|
526 my($self, $key) = @_;
|
|
527 return $self->{sequences}{$key}{length_raw};
|
|
528 }
|
|
529
|
|
530 =head2 length_clear
|
|
531
|
|
532 Title : length_clear
|
|
533 Usage : $lucyObj->length_clear($seqname)
|
|
534 Function: returns the length of a DNA sequence following quality/ vector
|
|
535 trimming by Lucy.
|
|
536 Returns : integer
|
|
537 Args : name of a sequence
|
|
538
|
|
539 =cut
|
|
540
|
|
541 sub length_clear {
|
|
542 my($self, $key) = @_;
|
|
543 return $self->{sequences}{$key}{length_clear};
|
|
544 }
|
|
545
|
|
546 =head2 start_clear
|
|
547
|
|
548 Title : start_clear
|
|
549 Usage : $lucyObj->start_clear($seqname)
|
|
550 Function: returns the beginning position of good quality, vector free DNA sequence
|
|
551 determined by Lucy.
|
|
552 Returns : integer
|
|
553 Args : name of a sequence
|
|
554
|
|
555 =cut
|
|
556
|
|
557 sub start_clear {
|
|
558 my($self, $key) = @_;
|
|
559 return $self->{sequences}{$key}{beg_clear};
|
|
560 }
|
|
561
|
|
562
|
|
563 =head2 end_clear
|
|
564
|
|
565 Title : end_clear
|
|
566 Usage : $lucyObj->end_clear($seqname)
|
|
567 Function: returns the ending position of good quality, vector free DNA sequence
|
|
568 determined by Lucy.
|
|
569 Returns : integer
|
|
570 Args : name of a sequence
|
|
571
|
|
572 =cut
|
|
573
|
|
574 sub end_clear {
|
|
575 my($self, $key) = @_;
|
|
576 return $self->{sequences}{$key}{end_clear};
|
|
577 }
|
|
578
|
|
579 =head2 per_GC
|
|
580
|
|
581 Title : per_GC
|
|
582 Usage : $lucyObj->per_GC($seqname)
|
|
583 Function: returns the percente of the good quality, vector free DNA sequence
|
|
584 determined by Lucy.
|
|
585 Returns : float
|
|
586 Args : name of a sequence
|
|
587
|
|
588 =cut
|
|
589
|
|
590 sub per_GC {
|
|
591 my($self, $key) = @_;
|
|
592 return $self->{sequences}{$key}{per_GC};
|
|
593 }
|
|
594
|
|
595 =head2 full_length
|
|
596
|
|
597 Title : full_length
|
|
598 Usage : $lucyObj->full_length($seqname)
|
|
599 Function: returns the truth value for whether or not the sequence read was
|
|
600 full length (ie. vector present on both ends of read). This method
|
|
601 depends on the user having provided the 'info' file (Lucy must be
|
|
602 run with the -debug 'info_filename' option to get this file).
|
|
603 Returns : boolean
|
|
604 Args : name of a sequence
|
|
605
|
|
606 =cut
|
|
607
|
|
608 sub full_length {
|
|
609 my($self, $key) = @_;
|
|
610 return 1 if $self->{sequences}{$key}{full_length};
|
|
611 return 0;
|
|
612 }
|
|
613
|
|
614 =head2 polyA
|
|
615
|
|
616 Title : polyA
|
|
617 Usage : $lucyObj->polyA($seqname)
|
|
618 Function: returns the truth value for whether or not a poly-A tail was detected
|
|
619 and clipped by Lucy. This method depends on the user having modified
|
|
620 the source for Lucy as outlined in DESCRIPTION and invoking Lucy with
|
|
621 the -cdna option and saving the standard error.
|
|
622 Note, the final sequence will not show the poly-A/T region.
|
|
623 Returns : boolean
|
|
624 Args : name of a sequence
|
|
625
|
|
626 =cut
|
|
627
|
|
628 sub polyA {
|
|
629 my($self, $key) = @_;
|
|
630 return 1 if $self->{sequences}{$key}{polyA};
|
|
631 return 0;
|
|
632 }
|
|
633
|
|
634 =head2 get_rejects
|
|
635
|
|
636 Title : get_rejects
|
|
637 Usage : $lucyObj->get_rejects()
|
|
638 Function: returns a hash containing names of rejects and a 1 letter code for the
|
|
639 reason Lucy rejected the sequence.
|
|
640 Q- rejected because of low quality values
|
|
641 S- sequence was short
|
|
642 V- sequence was vector
|
|
643 E- sequence was empty
|
|
644 P- poly-A/T trimming caused sequence to be too short
|
|
645 In order to get the rejects, you must provide a file with the standard
|
|
646 error from Lucy. You will only get the quality category rejects unless
|
|
647 you have modified the source and recompiled Lucy as outlined in DESCRIPTION.
|
|
648 Returns : hash reference
|
|
649 Args : none
|
|
650
|
|
651 =cut
|
|
652
|
|
653 sub get_rejects {
|
|
654 my $self = shift;
|
|
655 return $self->{reject};
|
|
656 }
|
|
657
|
|
658 =head2 Diff for Lucy source code
|
|
659
|
|
660 352a353,354
|
|
661 > /* AGW added next line */
|
|
662 > fprintf(stderr, "Empty: %s\n", seqs[i].name);
|
|
663 639a642,643
|
|
664 > /* AGW added next line */
|
|
665 > fprintf(stderr, "Short/ no insert: %s\n", seqs[i].name);
|
|
666 678c682,686
|
|
667 < if (left) seqs[i].left+=left;
|
|
668 ---
|
|
669 > if (left) {
|
|
670 > seqs[i].left+=left;
|
|
671 > /* AGW added next line */
|
|
672 > fprintf(stderr, "%s has PolyA (left).\n", seqs[i].name);
|
|
673 > }
|
|
674 681c689,693
|
|
675 < if (right) seqs[i].right-=right;
|
|
676 ---
|
|
677 > if (right) {
|
|
678 > seqs[i].right-=right;
|
|
679 > /* AGW added next line */
|
|
680 > fprintf(stderr, "%s has PolyA (right).\n", seqs[i].name);
|
|
681 > }
|
|
682 682a695,696
|
|
683 > /* AGW added next line */
|
|
684 > fprintf(stderr, "Dropped PolyA: %s\n", seqs[i].name);
|
|
685 734a749,750
|
|
686 > /* AGW added next line */
|
|
687 > fprintf(stderr, "Vector: %s\n", seqs[i].name);
|
|
688
|
|
689 =cut
|
|
690
|
|
691 1;
|