Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Tools/Lucy.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # $Id: Lucy.pm,v 1.6 2002/10/22 07:38:46 lapp Exp $ | |
2 # | |
3 # BioPerl module for Bio::Tools::Lucy | |
4 # | |
5 # Copyright Her Majesty the Queen of England | |
6 # written by Andrew Walsh (paeruginosa@hotmail.com) during employment with | |
7 # Agriculture and Agri-food Canada, Cereal Research Centre, Winnipeg, MB | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 # POD documentation - main docs before the code | |
11 | |
12 =head1 NAME | |
13 | |
14 Bio::Tools::Lucy - Object for analyzing the output from Lucy, | |
15 a vector and quality trimming program from TIGR | |
16 | |
17 =head1 SYNOPSIS | |
18 | |
19 # Create the Lucy object from an existing Lucy output file | |
20 @params = ('seqfile' => 'lucy.seq', 'lucy_verbose' => 1); | |
21 $lucyObj = Bio::Tools::Lucy->new(@params); | |
22 | |
23 # Get names of all sequences | |
24 $names = $lucyObj->get_sequence_names(); | |
25 | |
26 # Print seq and qual values for sequences >400 bp in order to run CAP3 | |
27 foreach $name (@$names) { | |
28 next unless $lucyObj->length_clear($name) > 400; | |
29 print SEQ ">$name\n", $lucyObj->sequence($name), "\n"; | |
30 print QUAL ">$name\n", $lucyObj->quality($name), "\n"; | |
31 } | |
32 | |
33 # Get an array of Bio::PrimarySeq objects | |
34 @seqObjs = $lucyObj->get_Seq_Objs(); | |
35 | |
36 | |
37 =head1 DESCRIPTION | |
38 | |
39 Bio::Tools::Lucy.pm provides methods for analyzing the sequence and | |
40 quality values generated by Lucy program from TIGR. | |
41 | |
42 Lucy will identify vector, poly-A/T tails, and poor quality regions in | |
43 a sequence. (www.genomics.purdue.edu/gcg/other/lucy.pdf) | |
44 | |
45 The input to Lucy can be the Phred sequence and quality files | |
46 generated from running Phred on a set of chromatograms. | |
47 | |
48 Lucy can be obtained (free of charge to academic users) from | |
49 www.tigr.org/softlab | |
50 | |
51 There are a few methods that will only be available if you make some | |
52 minor changes to the source for Lucy and then recompile. The changes | |
53 are in the 'lucy.c' file and there is a diff between the original and | |
54 the modified file in the Appendix | |
55 | |
56 Please contact the author of this module if you have any problems | |
57 making these modifications. | |
58 | |
59 You do not have to make these modifications to use this module. | |
60 | |
61 =head2 Creating a Lucy object | |
62 | |
63 @params = ('seqfile' => 'lucy.seq', 'adv_stderr' => 1, | |
64 'fwd_desig' => '_F', 'rev_desig' => '_R'); | |
65 $lucyObj = Bio::Tools::Lucy->new(@params); | |
66 | |
67 =head2 Using a Lucy object | |
68 | |
69 You should get an array with the sequence names in order to use | |
70 accessor methods. Note: The Lucy binary program will fail unless | |
71 the sequence names provided as input are unique. | |
72 | |
73 $names_ref = $lucyObj->get_sequence_names(); | |
74 | |
75 This code snippet will produce a Fasta format file with sequence | |
76 lengths and %GC in the description line. | |
77 | |
78 foreach $name (@$names) { | |
79 print FILE ">$name\t", | |
80 $lucyObj->length_clear($name), "\t", | |
81 $lucyObj->per_GC($name), "\n", | |
82 $lucyObj->sequence($name), "\n"; | |
83 } | |
84 | |
85 | |
86 Print seq and qual values for sequences >400 bp in order to assemble | |
87 them with CAP3 (or other assembler). | |
88 | |
89 foreach $name (@$names) { | |
90 next unless $lucyObj->length_clear($name) > 400; | |
91 print SEQ ">$name\n", $lucyObj->sequence($name), "\n"; | |
92 print QUAL ">$name\n", $lucyObj->quality($name), "\n"; | |
93 } | |
94 | |
95 Get all the sequences as Bio::PrimarySeq objects (eg., for use with | |
96 Bio::Tools::Blast to perform BLAST). | |
97 | |
98 @seqObjs = $lucyObj->get_Seq_Objs(); | |
99 | |
100 Or use only those sequences that are full length and have a Poly-A | |
101 tail. | |
102 | |
103 foreach $name (@$names) { | |
104 next unless ($lucyObj->full_length($name) and $lucy->polyA($name)); | |
105 push @seqObjs, $lucyObj->get_Seq_Obj($name); | |
106 } | |
107 | |
108 | |
109 Get the names of those sequences that were rejected by Lucy. | |
110 | |
111 $rejects_ref = $lucyObj->get_rejects(); | |
112 | |
113 Print the names of the rejects and 1 letter code for reason they | |
114 were rejected. | |
115 | |
116 foreach $key (sort keys %$rejects_ref) { | |
117 print "$key: ", $rejects_ref->{$key}; | |
118 } | |
119 | |
120 There is a lot of other information available about the sequences | |
121 analyzed by Lucy (see APPENDIX). This module can be used with the | |
122 DBI module to store this sequence information in a database. | |
123 | |
124 =head1 FEEDBACK | |
125 | |
126 =head2 Mailing Lists | |
127 | |
128 User feedback is an integral part of the evolution of this and other | |
129 Bioperl modules. Send your comments and suggestions preferably to one | |
130 of the Bioperl mailing lists. Your participation is much appreciated. | |
131 | |
132 bioperl-l@bioperl.org - General discussion | |
133 http://bio.perl.org/MailList.html - About the mailing lists | |
134 | |
135 =head2 Reporting Bugs | |
136 | |
137 Report bugs to the Bioperl bug tracking system to help us keep track | |
138 the bugs and their resolution. Bug reports can be submitted via email | |
139 or the web: | |
140 | |
141 bioperl-bugs@bio.perl.org | |
142 http://bugzilla.bioperl.org/ | |
143 | |
144 =head1 AUTHOR | |
145 | |
146 Andrew G. Walsh paeruginosa@hotmail.com | |
147 | |
148 =head1 APPENDIX | |
149 | |
150 Methods available to Lucy objects are described below. Please note | |
151 that any method beginning with an underscore is considered internal | |
152 and should not be called directly. | |
153 | |
154 =cut | |
155 | |
156 | |
157 package Bio::Tools::Lucy; | |
158 | |
159 use vars qw($VERSION $AUTOLOAD @ISA @ATTR %OK_FIELD); | |
160 use strict; | |
161 use Bio::PrimarySeq; | |
162 use Bio::Root::Root; | |
163 use Bio::Root::IO; | |
164 | |
165 @ISA = qw(Bio::Root::Root Bio::Root::IO); | |
166 @ATTR = qw(seqfile qualfile stderrfile infofile lucy_verbose fwd_desig rev_desig adv_stderr); | |
167 foreach my $attr (@ATTR) { | |
168 $OK_FIELD{$attr}++ | |
169 } | |
170 $VERSION = "0.01"; | |
171 | |
172 sub AUTOLOAD { | |
173 my $self = shift; | |
174 my $attr = $AUTOLOAD; | |
175 $attr =~ s/.*:://; | |
176 $attr = lc $attr; | |
177 $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; | |
178 $self->{$attr} = shift if @_; | |
179 return $self->{$attr}; | |
180 } | |
181 | |
182 =head2 new | |
183 | |
184 Title : new | |
185 Usage : $lucyObj = Bio::Tools::Lucy->new(seqfile => lucy.seq, rev_desig => '_R', | |
186 fwd_desig => '_F') | |
187 Function: creates a Lucy object from Lucy analysis files | |
188 Returns : reference to Bio::Tools::Lucy object | |
189 Args : seqfile Fasta sequence file generated by Lucy | |
190 qualfile Quality values file generated by Lucy | |
191 infofile Info file created when Lucy is run with -debug 'infofile' option | |
192 stderrfile Standard error captured from Lucy when Lucy is run | |
193 with -info option and STDERR is directed to stderrfile | |
194 (ie. lucy ... 2> stderrfile). | |
195 Info in this file will include sequences dropped for low | |
196 quality. If you've modified Lucy source (see adv_stderr below), | |
197 it will also include info on which sequences were dropped because | |
198 they were vector, too short, had no insert, and whether a poly-A | |
199 tail was found (if Lucy was run with -cdna option). | |
200 lucy_verbose verbosity level (0-1). | |
201 fwd_desig The string used to determine whether sequence is a forward read. | |
202 The parser will assume that this match will occus at the | |
203 end of the sequence name string. | |
204 rev_desig As above, for reverse reads. | |
205 adv_stderr Can be set to a true value (1). Will only work if you have modified | |
206 the Lucy source code as outlined in DESCRIPTION and capture | |
207 the standard error from Lucy. | |
208 | |
209 If you don't provide filenames for qualfile, infofile or stderrfile, | |
210 the module will assume that .qual, .info, and .stderr are the file | |
211 extensions and search in the same directory as the .seq file for these | |
212 files. | |
213 | |
214 For example, if you create a Lucy object with $lucyObj = | |
215 Bio::Tools::Lucy-E<gt>new(seqfile =E<gt>lucy.seq), the module will | |
216 find lucy.qual, lucy.info and lucy.stderr. | |
217 | |
218 You can omit any or all of the quality, info or stderr files, but you | |
219 will not be able to use all of the object methods (see method | |
220 documentation below). | |
221 | |
222 =cut | |
223 | |
224 sub new { | |
225 my ($class,@args) = @_; | |
226 my $self = $class->SUPER::new(@args); | |
227 my ($attr, $value); | |
228 while (@args) { | |
229 $attr = shift @args; | |
230 $attr = lc $attr; | |
231 $value = shift @args; | |
232 $self->{$attr} = $value; | |
233 } | |
234 &_parse($self); | |
235 return $self; | |
236 } | |
237 | |
238 =head2 _parse | |
239 | |
240 Title : _parse | |
241 Usage : n/a (internal function) | |
242 Function: called by new() to parse Lucy output files | |
243 Returns : nothing | |
244 Args : none | |
245 | |
246 =cut | |
247 | |
248 sub _parse { | |
249 my $self = shift; | |
250 $self->{seqfile} =~ /^(\S+)\.\S+$/; | |
251 my $file = $1; | |
252 | |
253 print "Opening $self->{seqfile} for parsing...\n" if $self->{lucy_verbose}; | |
254 open SEQ, "$self->{seqfile}" or $self->throw("Could not open sequence file: $self->{seqfile}"); | |
255 my ($name, $line); | |
256 my $seq = ""; | |
257 my @lines = <SEQ>; | |
258 while ($line = pop @lines) { | |
259 chomp $line; | |
260 if ($line =~ /^>(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/) { | |
261 $name = $1; | |
262 if ($self->{fwd_desig}) { | |
263 $self->{sequences}{$name}{direction} = "F" if $name =~ /^(\S+)($self->{fwd_desig})$/; | |
264 } | |
265 if ($self->{rev_desig}) { | |
266 $self->{sequences}{$name}{direction} = "R" if $name =~ /^(\S+)($self->{rev_desig})$/; | |
267 } | |
268 $self->{sequences}{$name}{min_clone_len} = $2; # this is used for TIGR Assembler, as are $3 and $4 | |
269 $self->{sequences}{$name}{max_clone_len} = $3; | |
270 $self->{sequences}{$name}{med_clone_len} = $4; | |
271 $self->{sequences}{$name}{beg_clear} = $5; | |
272 $self->{sequences}{$name}{end_clear} = $6; | |
273 $self->{sequences}{$name}{length_raw} = $seq =~ tr/[AGCTN]//; # from what I've seen, these are the bases Phred calls. Please let me know if I'm wrong. | |
274 my $beg = $5-1; # substr function begins with index 0 | |
275 $seq = $self->{sequences}{$name}{sequence} = substr ($seq, $beg, $6-$beg); | |
276 my $count = $self->{sequences}{$name}{length_clear} = $seq =~ tr/[AGCTN]//; | |
277 my $countGC = $seq =~ tr/[GC]//; | |
278 $self->{sequences}{$name}{per_GC} = $countGC/$count * 100; | |
279 $seq = ""; | |
280 } | |
281 else { | |
282 $seq = $line.$seq; | |
283 } | |
284 } | |
285 | |
286 | |
287 # now parse quality values (check for presence of quality file first) | |
288 if ($self->{qualfile}) { | |
289 open QUAL, "$self->{qualfile}" or $self->throw("Could not open quality file: $self->{qualfile}"); | |
290 @lines = <QUAL>; | |
291 } | |
292 elsif (-e "$file.qual") { | |
293 print "You did not set qualfile, but I'm opening $file.qual\n" if $self->{lucy_verbose}; | |
294 $self->qualfile("$file.qual"); | |
295 open QUAL, "$file.qual" or $self->throw("Could not open quality file: $file.qual"); | |
296 @lines = <QUAL>; | |
297 } | |
298 else { | |
299 print "I did not find a quality file. You will not be able to use all of the accessor methods.\n" if $self->{lucy_verbose}; | |
300 @lines = (); | |
301 } | |
302 | |
303 my (@vals, @slice, $num, $tot, $vals); | |
304 my $qual = ""; | |
305 while ($line = pop @lines) { | |
306 chomp $line; | |
307 if ($line =~ /^>(\S+)/) { | |
308 $name = $1; | |
309 @vals = split /\s/ , $qual; | |
310 @slice = @vals[$self->{sequences}{$name}{beg_clear} .. $self->{sequences}{$name}{end_clear}]; | |
311 $vals = join "\t", @slice; | |
312 $self->{sequences}{$name}{quality} = $vals; | |
313 $qual = ""; | |
314 foreach $num (@slice) { | |
315 $tot += $num; | |
316 } | |
317 $num = @slice; | |
318 $self->{sequences}{$name}{avg_quality} = $tot/$num; | |
319 $tot = 0; | |
320 } | |
321 else { | |
322 $qual = $line.$qual; | |
323 } | |
324 } | |
325 | |
326 # determine whether reads are full length | |
327 | |
328 if ($self->{infofile}) { | |
329 open INFO, "$self->{infofile}" or $self->throw("Could not open info file: $self->{infofile}"); | |
330 @lines = <INFO>; | |
331 } | |
332 elsif (-e "$file.info") { | |
333 print "You did not set infofile, but I'm opening $file.info\n" if $self->{lucy_verbose}; | |
334 $self->infofile("$file.info"); | |
335 open INFO, "$file.info" or $self->throw("Could not open info file: $file.info"); | |
336 @lines = <INFO>; | |
337 } | |
338 else { | |
339 print "I did not find an info file. You will not be able to use all of the accessor methods.\n" if $self->{lucy_verbose}; | |
340 @lines = (); | |
341 } | |
342 | |
343 foreach (@lines) { | |
344 /^(\S+).+CLV\s+(\d+)\s+(\d+)$/; | |
345 if ($2>0 && $3>0) { | |
346 $self->{sequences}{$1}{full_length} = 1 if $self->{sequences}{$1}; # will show cleavage info for rejected sequences too | |
347 } | |
348 } | |
349 | |
350 | |
351 # parse rejects (and presence of poly-A if Lucy has been modified) | |
352 | |
353 if ($self->{stderrfile}) { | |
354 open STDERR_LUCY, "$self->{stderrfile}" or $self->throw("Could not open quality file: $self->{stderrfile}"); | |
355 @lines = <STDERR_LUCY>; | |
356 | |
357 } | |
358 elsif (-e "$file.stderr") { | |
359 print "You did not set stderrfile, but I'm opening $file.stderr\n" if $self->{lucy_verbose}; | |
360 $self->stderrfile("$file.stderr"); | |
361 open STDERR_LUCY, "$file.stderr" or $self->throw("Could not open quality file: $file.stderr"); | |
362 @lines = <STDERR_LUCY>; | |
363 } | |
364 else { | |
365 print "I did not find a standard error file. You will not be able to use all of the accessor methods.\n" if $self->{lucy_verbose}; | |
366 @lines = (); | |
367 } | |
368 | |
369 if ($self->{adv_stderr}) { | |
370 foreach (@lines) { | |
371 $self->{reject}{$1} = "Q" if /dropping\s+(\S+)/; | |
372 $self->{reject}{$1} = "V" if /Vector: (\S+)/; | |
373 $self->{reject}{$1} = "E" if /Empty: (\S+)/; | |
374 $self->{reject}{$1} = "S" if /Short: (\S+)/; | |
375 $self->{sequences}{$1}{polyA} = 1 if /(\S+) has PolyA/; | |
376 if (/Dropped PolyA: (\S+)/) { | |
377 $self->{reject}{$1} = "P"; | |
378 delete $self->{sequences}{$1}; | |
379 } | |
380 } | |
381 } | |
382 else { | |
383 foreach (@lines) { | |
384 $self->{reject}{$1} = "R" if /dropping\s+(\S+)/; | |
385 } | |
386 } | |
387 | |
388 } | |
389 | |
390 =head2 get_Seq_Objs | |
391 | |
392 Title : get_Seq_Objs | |
393 Usage : $lucyObj->get_Seq_Objs() | |
394 Function: returns an array of references to Bio::PrimarySeq objects | |
395 where -id = 'sequence name' and -seq = 'sequence' | |
396 | |
397 Returns : array of Bio::PrimarySeq objects | |
398 Args : none | |
399 | |
400 =cut | |
401 | |
402 sub get_Seq_Objs { | |
403 my $self = shift; | |
404 my($seqobj, @seqobjs); | |
405 foreach my $key (sort keys %{$self->{sequences}}) { | |
406 $seqobj = Bio::PrimarySeq->new( -seq => "$self->{sequences}{$key}{sequence}", | |
407 -id => "$key"); | |
408 push @seqobjs, $seqobj; | |
409 } | |
410 return \@seqobjs; | |
411 } | |
412 | |
413 =head2 get_Seq_Obj | |
414 | |
415 Title : get_Seq_Obj | |
416 Usage : $lucyObj->get_Seq_Obj($seqname) | |
417 Function: returns reference to a Bio::PrimarySeq object where -id = 'sequence name' | |
418 and -seq = 'sequence' | |
419 Returns : reference to Bio::PrimarySeq object | |
420 Args : name of a sequence | |
421 | |
422 =cut | |
423 | |
424 sub get_Seq_Obj { | |
425 my ($self, $key) = @_; | |
426 my $seqobj = Bio::PrimarySeq->new( -seq => "$self->{sequences}{$key}{sequence}", | |
427 -id => "$key"); | |
428 return $seqobj; | |
429 } | |
430 | |
431 =head2 get_sequence_names | |
432 | |
433 Title : get_sequence_names | |
434 Usage : $lucyObj->get_sequence_names | |
435 Function: returns reference to an array of names of the sequences analyzed by Lucy. | |
436 These names are required for most of the accessor methods. | |
437 Note: The Lucy binary will fail unless sequence names are unique. | |
438 Returns : array reference | |
439 Args : none | |
440 | |
441 =cut | |
442 | |
443 sub get_sequence_names { | |
444 my $self = shift; | |
445 my @keys = sort keys %{$self->{sequences}}; | |
446 return \@keys; | |
447 } | |
448 | |
449 =head2 sequence | |
450 | |
451 Title : sequence | |
452 Usage : $lucyObj->sequence($seqname) | |
453 Function: returns the DNA sequence of one of the sequences analyzed by Lucy. | |
454 Returns : string | |
455 Args : name of a sequence | |
456 | |
457 =cut | |
458 | |
459 sub sequence { | |
460 my ($self, $key) = @_; | |
461 return $self->{sequences}{$key}{sequence}; | |
462 } | |
463 | |
464 =head2 quality | |
465 | |
466 Title : quality | |
467 Usage : $lucyObj->quality($seqname) | |
468 Function: returns the quality values of one of the sequences analyzed by Lucy. | |
469 This method depends on the user having provided a quality file. | |
470 Returns : string | |
471 Args : name of a sequence | |
472 | |
473 =cut | |
474 | |
475 sub quality { | |
476 my($self, $key) = @_; | |
477 return $self->{sequences}{$key}{quality}; | |
478 } | |
479 | |
480 =head2 avg_quality | |
481 | |
482 Title : avg_quality | |
483 Usage : $lucyObj->avg_quality($seqname) | |
484 Function: returns the average quality value for one of the sequences analyzed by Lucy. | |
485 Returns : float | |
486 Args : name of a sequence | |
487 | |
488 =cut | |
489 | |
490 sub avg_quality { | |
491 my($self, $key) = @_; | |
492 return $self->{sequences}{$key}{avg_quality}; | |
493 } | |
494 | |
495 =head2 direction | |
496 | |
497 Title : direction | |
498 Usage : $lucyObj->direction($seqname) | |
499 Function: returns the direction for one of the sequences analyzed by Lucy | |
500 providing that 'fwd_desig' or 'rev_desig' were set when the | |
501 Lucy object was created. | |
502 Strings returned are: 'F' for forward, 'R' for reverse. | |
503 Returns : string | |
504 Args : name of a sequence | |
505 | |
506 =cut | |
507 | |
508 sub direction { | |
509 my($self, $key) = @_; | |
510 return $self->{sequences}{$key}{direction} if $self->{sequences}{$key}{direction}; | |
511 return ""; | |
512 } | |
513 | |
514 =head2 length_raw | |
515 | |
516 Title : length_raw | |
517 Usage : $lucyObj->length_raw($seqname) | |
518 Function: returns the length of a DNA sequence prior to quality/ vector | |
519 trimming by Lucy. | |
520 Returns : integer | |
521 Args : name of a sequence | |
522 | |
523 =cut | |
524 | |
525 sub length_raw { | |
526 my($self, $key) = @_; | |
527 return $self->{sequences}{$key}{length_raw}; | |
528 } | |
529 | |
530 =head2 length_clear | |
531 | |
532 Title : length_clear | |
533 Usage : $lucyObj->length_clear($seqname) | |
534 Function: returns the length of a DNA sequence following quality/ vector | |
535 trimming by Lucy. | |
536 Returns : integer | |
537 Args : name of a sequence | |
538 | |
539 =cut | |
540 | |
541 sub length_clear { | |
542 my($self, $key) = @_; | |
543 return $self->{sequences}{$key}{length_clear}; | |
544 } | |
545 | |
546 =head2 start_clear | |
547 | |
548 Title : start_clear | |
549 Usage : $lucyObj->start_clear($seqname) | |
550 Function: returns the beginning position of good quality, vector free DNA sequence | |
551 determined by Lucy. | |
552 Returns : integer | |
553 Args : name of a sequence | |
554 | |
555 =cut | |
556 | |
557 sub start_clear { | |
558 my($self, $key) = @_; | |
559 return $self->{sequences}{$key}{beg_clear}; | |
560 } | |
561 | |
562 | |
563 =head2 end_clear | |
564 | |
565 Title : end_clear | |
566 Usage : $lucyObj->end_clear($seqname) | |
567 Function: returns the ending position of good quality, vector free DNA sequence | |
568 determined by Lucy. | |
569 Returns : integer | |
570 Args : name of a sequence | |
571 | |
572 =cut | |
573 | |
574 sub end_clear { | |
575 my($self, $key) = @_; | |
576 return $self->{sequences}{$key}{end_clear}; | |
577 } | |
578 | |
579 =head2 per_GC | |
580 | |
581 Title : per_GC | |
582 Usage : $lucyObj->per_GC($seqname) | |
583 Function: returns the percente of the good quality, vector free DNA sequence | |
584 determined by Lucy. | |
585 Returns : float | |
586 Args : name of a sequence | |
587 | |
588 =cut | |
589 | |
590 sub per_GC { | |
591 my($self, $key) = @_; | |
592 return $self->{sequences}{$key}{per_GC}; | |
593 } | |
594 | |
595 =head2 full_length | |
596 | |
597 Title : full_length | |
598 Usage : $lucyObj->full_length($seqname) | |
599 Function: returns the truth value for whether or not the sequence read was | |
600 full length (ie. vector present on both ends of read). This method | |
601 depends on the user having provided the 'info' file (Lucy must be | |
602 run with the -debug 'info_filename' option to get this file). | |
603 Returns : boolean | |
604 Args : name of a sequence | |
605 | |
606 =cut | |
607 | |
608 sub full_length { | |
609 my($self, $key) = @_; | |
610 return 1 if $self->{sequences}{$key}{full_length}; | |
611 return 0; | |
612 } | |
613 | |
614 =head2 polyA | |
615 | |
616 Title : polyA | |
617 Usage : $lucyObj->polyA($seqname) | |
618 Function: returns the truth value for whether or not a poly-A tail was detected | |
619 and clipped by Lucy. This method depends on the user having modified | |
620 the source for Lucy as outlined in DESCRIPTION and invoking Lucy with | |
621 the -cdna option and saving the standard error. | |
622 Note, the final sequence will not show the poly-A/T region. | |
623 Returns : boolean | |
624 Args : name of a sequence | |
625 | |
626 =cut | |
627 | |
628 sub polyA { | |
629 my($self, $key) = @_; | |
630 return 1 if $self->{sequences}{$key}{polyA}; | |
631 return 0; | |
632 } | |
633 | |
634 =head2 get_rejects | |
635 | |
636 Title : get_rejects | |
637 Usage : $lucyObj->get_rejects() | |
638 Function: returns a hash containing names of rejects and a 1 letter code for the | |
639 reason Lucy rejected the sequence. | |
640 Q- rejected because of low quality values | |
641 S- sequence was short | |
642 V- sequence was vector | |
643 E- sequence was empty | |
644 P- poly-A/T trimming caused sequence to be too short | |
645 In order to get the rejects, you must provide a file with the standard | |
646 error from Lucy. You will only get the quality category rejects unless | |
647 you have modified the source and recompiled Lucy as outlined in DESCRIPTION. | |
648 Returns : hash reference | |
649 Args : none | |
650 | |
651 =cut | |
652 | |
653 sub get_rejects { | |
654 my $self = shift; | |
655 return $self->{reject}; | |
656 } | |
657 | |
658 =head2 Diff for Lucy source code | |
659 | |
660 352a353,354 | |
661 > /* AGW added next line */ | |
662 > fprintf(stderr, "Empty: %s\n", seqs[i].name); | |
663 639a642,643 | |
664 > /* AGW added next line */ | |
665 > fprintf(stderr, "Short/ no insert: %s\n", seqs[i].name); | |
666 678c682,686 | |
667 < if (left) seqs[i].left+=left; | |
668 --- | |
669 > if (left) { | |
670 > seqs[i].left+=left; | |
671 > /* AGW added next line */ | |
672 > fprintf(stderr, "%s has PolyA (left).\n", seqs[i].name); | |
673 > } | |
674 681c689,693 | |
675 < if (right) seqs[i].right-=right; | |
676 --- | |
677 > if (right) { | |
678 > seqs[i].right-=right; | |
679 > /* AGW added next line */ | |
680 > fprintf(stderr, "%s has PolyA (right).\n", seqs[i].name); | |
681 > } | |
682 682a695,696 | |
683 > /* AGW added next line */ | |
684 > fprintf(stderr, "Dropped PolyA: %s\n", seqs[i].name); | |
685 734a749,750 | |
686 > /* AGW added next line */ | |
687 > fprintf(stderr, "Vector: %s\n", seqs[i].name); | |
688 | |
689 =cut | |
690 | |
691 1; |