comparison fasta2shrep_gspan.pl @ 0:b01beb170290 draft default tip

Uploaded
author bgruening
date Tue, 29 Oct 2013 11:10:19 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:b01beb170290
1 #!/usr/bin/perl
2 #use feature ':5.10';
3 use File::Basename;
4 use lib dirname($0); # search skript directory for modules
5 use strict 'vars';
6 use warnings;
7 use Getopt::Long;
8 use Pod::Usage;
9 #use List::Util qw/ min max /;
10
11 #use StructureLibrary::RNAtoolsConfig;
12 #use StructureLibrary::Sequence;
13
14 use Cwd qw(getcwd abs_path);
15 use File::Temp qw(tempdir);
16 use File::Copy;
17 use POSIX qw(ceil);
18
19 use FindBin;
20 use lib "$FindBin::Bin";
21 #use GraphClust;
22
23 #use vars qw(%CONFIG);
24 #*CONFIG = \%GraphClust::CONFIG;
25
26 =head1 NAME
27
28
29 fasta2shrep_gspan.pl -fasta mysequences.fasta -wins "50,100,150,200" -shift 5 -M 8
30
31 =head1 SYNOPSIS
32
33 Options:
34
35 HELP
36 -help brief help message
37 -man full documentation
38
39 COMPULSORY
40 -fasta <STRING> e.g. "sequence.fasta"
41 All sequences in fasta format.
42
43 OPTIONS
44 -wins [INTEGER] e.g. "50,100,200"
45 A list of window sizes to use.
46 If none are given (empty string ''), then the entire sequence is
47 taken with no windows. Each window > 1 required!
48 -shift <INTEGER> e.g. 20
49 The shift of the window, relative to the window size given in
50 percent. So you give which percent of the window size shall be
51 used for the shift. Of course the shift is rounded down to the
52 nearest whole number.
53 Example 20 % of a window 150 would result in a step size of 30 nt.
54 It is a relative parameter, as you can give different window sizes.
55 If you do not give this parameter there is a default shift of 1 nt.
56 -cue Crop unpaired ends.
57 If you give this flag, then the unpaired ends of each
58 single structure are ignored. E.g. the structure
59 ...(((...))).. becomes just (((...)))
60 -stack Adds stacking information to graphs. This adds an additional
61 vertex (type P) for each pair of stacked base-pairs and four edges
62 (type p) from each of the involved bases to the new vertex.
63 -e <FLOAT> e.g. 5.0
64 Energy range in kcal/mol (RNAshapes)
65 Use only one of -e and -c!
66 -c <INTEGER> e.g. 10
67 Relative energy range, i.e. percentage (%) of MFE energy (RNAshapes)
68 Use only one of -e and -c!
69 -t <INTEGER> [1-5] e.g. 3 OR "3=0,4=100,5=200"
70 The shape type (RNAshapes). Default is 3.
71 With the list format, the shape level can be changed for different window length
72 "4=100" means that shape level 4 is used from length 100nt (window length)
73 The first given length has to be 0! Not continuous given levels are allowed!
74 -M <INTEGER> e.g. 10
75 Max number of shreps that should be taken per window.
76 -u Ignore unstable structures (RNAshapes).
77 This option filters out closed structures with positive free energy.
78 -r Calculate structure probabilities for shreps (RNAshapes)
79 -i <INT> e.g. 10
80 Turn on structure sampling and gives number of sampling iterations.
81 Default no sampling (i=0)
82 -sample-len <INT> e.g. 100
83 Only in sampling mode: Sampling is only used for seqs/windows >= given length,
84 Default: sample all lengths (0), if -i > 0
85 -q Turn on shape probabilities for RNAshapes, no sampling mode allowed
86 -Tp <FLOAT> e.g 0.001
87 Filter cutoff for shape probabilities, applied before -M filter!
88 -seq-graph-win add for each window a graph which contains no structure
89 -seq-graph-t add for each 't #' a graph which contains no structure
90 -seq-graph-alph change the alphabet of unstructured graphs
91 -annotate <STRING> annotation.tab
92 A file with annotations to be added as abstract graphs
93 on the sequence leven (if given) and on the structure
94 (SHREP) level. The format is has the following TAB-delimited
95 columns: SEQID, START, END, NAMESPACE#LABEL.
96 Labels with the same name-space and SEQID form connected
97 components, which is a sequence of label vertices ordered
98 by the START position in the sequence.
99 -abstr Add abstract structure graphs to the single shrep graph
100 instances.
101 -nostr Calculate no structures, only add sequence information,
102 if this is given, then -seq-graph-win AND/OR -seq-graph-t
103 are required.
104 -match-shape <SHAPE>
105 all seqs/windows will be constraint folded into that shape via
106 RNAshapes (if structure is given in another way this struct will be kept),
107 if this shape is not possible within given energy range, produce a
108 specific t graph with only one vertex 'X'. By this the instance
109 becomes very unsimilar to all other graphs (for knn)
110 -vp enable graph computation with viewpoints:
111 svmsgdnspdk will center on those nucleotides that are given
112 via capital letters and ignore those given as lowercase letters
113 -tmp <STRING> e.g. "/scratch/1/sita/tmp"
114 A directory for writing temporary files
115 -o <STRING> e.g. "ProjectX/MySequences/GSPAN/"
116 Output directory for gspan files containing graphs.
117 -group <INTEGER> e.g. 5
118 Combine/group that number of input seqs into 1 gspan file
119 output name is then '<INT>.group.gspan.bz2'
120 -sge Use SGE cluster for each sequence separately
121 -sge-logdir stdout directory for SGE call
122 -sge-errdir sdterr directory for SGE call
123 -stdout send graphs to stdout instead of writing to files
124 -ignore-header don't write fasta id part after first space to gspan
125 -debug additional debug output
126
127
128 DEFAULT VALUES
129 -wins ""
130 -shift 1 nt
131 -c 10
132 -t 3
133 -M 0 # selects all shreps
134 -tmp "/var/tmp/fasta2shrep"
135 -o "CURRENT_DIR/GSPAN/"
136
137 SGE mode
138 -sge-logdir "CURRENT_DIR/GSPAN/SGE_log"
139 -sge-errdir "CURRENT_DIR/GSPAN/SGE_log"
140 -task-id <NUM>
141
142 =head1 DESCRIPTION
143
144 =cut
145
146 ###############################################################################
147 # end handler for temporary directory
148 # adds an error handler that deletes the directory in case of error
149 # SIGUSR{1/2} are sent by the sge prior to the uncatchable SIGKILL if the
150 # option -notify was set
151 ###############################################################################
152 $SIG{'INT'} = 'end_handler';
153 $SIG{'TERM'} = 'end_handler';
154 $SIG{'ABRT'} = 'end_handler';
155 $SIG{'USR1'} = 'end_handler';
156 $SIG{'USR2'} = 'end_handler';
157
158 sub end_handler {
159 print STDERR "signal ", $_[0], " caught, cleaning up temporary files\n";
160
161 # change into home directory. deletion of the temporary directory will
162 # fail if it is the current working directory
163 chdir();
164 File::Temp::cleanup();
165 die();
166 }
167
168 ###############################################################################
169 # PARSE COMMAND LINE OPTIONS
170 ###############################################################################
171
172 # command line options
173 my ( $i_help, $i_man, $i_debug, $i_fas, $i_wins, $i_shift, $i_crop_unpaired_ends,
174 $i_r, $i_e, $i_c, $i_t, $i_u, $i_M, $i_o, $i_sge, $i_jobid, $i_tmp,
175 $i_ignore_seq_header, $i_stacks, $i_stdout, $i_q, $i_T, $i_i,
176 $i_sample_min_length, $i_sge_logDir, $i_sge_errDir, $i_groupsize, $i_annotate,
177 $i_abstr, $i_no_structure, $i_vp, $i_matchShape, $i_rnashapes_binpath );
178
179 my ( $i_add_seq_graph_win, $i_add_seq_graph_t, $i_change_seq_graph_alph );
180
181 my $options = GetOptions(
182 "help" => \$i_help,
183 "man" => \$i_man,
184 "debug" => \$i_debug,
185 "fasta=s" => \$i_fas,
186 "wins=s" => \$i_wins,
187 "shift=f" => \$i_shift,
188 "cue" => \$i_crop_unpaired_ends,
189 "stack" => \$i_stacks,
190 "r" => \$i_r,
191 "e=f" => \$i_e,
192 "c=i" => \$i_c,
193 "t=s" => \$i_t,
194 "u" => \$i_u,
195 "M=i" => \$i_M,
196 "tmp=s" => \$i_tmp,
197 "o=s" => \$i_o,
198 "i=i" => \$i_i,
199 "sample-len=i" => \$i_sample_min_length,
200 "q" => \$i_q,
201 "Tp=f" => \$i_T,
202 "seq-graph-win" => \$i_add_seq_graph_win,
203 "seq-graph-t" => \$i_add_seq_graph_t,
204 "-seq-graph-alph" => \$i_change_seq_graph_alph,
205 "sge" => \$i_sge,
206 "task-id=i" => \$i_jobid,
207 "ignore-header" => \$i_ignore_seq_header,
208 "stdout" => \$i_stdout,
209 "sge-logdir=s" => \$i_sge_logDir,
210 "sge-errdir=s" => \$i_sge_errDir,
211 "group=i" => \$i_groupsize,
212 "annotate=s" => \$i_annotate,
213 "abstr" => \$i_abstr,
214 "nostr" => \$i_no_structure,
215 "vp" => \$i_vp,
216 "match-shape=s" => \$i_matchShape,
217 "rnashapes-bin=s" => \$i_rnashapes_binpath,
218 );
219
220 pod2usage( -exitstatus => 1, -verbose => 1 ) if $i_help;
221 pod2usage( -exitstatus => 0, -verbose => 2 ) if $i_man;
222 ($options) or pod2usage(2);
223
224 # check compulsory options
225 ($i_fas) or pod2usage("Error: the option -fasta is compulsory!\n");
226 ( -e $i_fas ) or pod2usage("Error: no such file - $i_fas!\n");
227 $i_fas = abs_path($i_fas);
228
229 # check other options and set default values
230 pod2usage("Error: either -e OR -c can be given, but NOT both!\n") if ( $i_e && $i_c );
231 ( $i_e or $i_c ) or $i_c = 10; # set -c 10, if neither -e or -c are given
232 ($i_M) or $i_M = 0; # max number of shreps is 0 (=means take all computed)
233 ($i_i) or $i_i = 0; # default no sampling else sampling iterations
234 ($i_sample_min_length) or $i_sample_min_length = 0;
235 pod2usage("\nError: use --sample-len <INT> only with -i <1..INT> !\n") if ( not $i_i and $i_sample_min_length );
236 ($i_T) or $i_T = 0;
237 ($i_q) or $i_q = 0;
238 pod2usage("\nError: Sampling (-i) not possible with shape probabilities (-q)!\n") if ( $i_i and $i_q );
239 ($i_add_seq_graph_win) or $i_add_seq_graph_win = 0;
240 ($i_add_seq_graph_t) or $i_add_seq_graph_t = 0;
241 ($i_change_seq_graph_alph) or $i_change_seq_graph_alph = 0;
242 ($i_no_structure) or $i_no_structure = 0;
243
244 if ($i_change_seq_graph_alph) {
245 ($i_add_seq_graph_t) or ($i_add_seq_graph_win) or pod2usage( "Error: " .
246 "When giving the parameter -seq-graph-alph, then either -seq-graph-t or -seq-graph-win" .
247 " must also be given!\n" );
248 }
249 ( -e $i_annotate ) or pod2usage("Error: no such file - $i_annotate!\n")
250 if ($i_annotate);
251
252 $i_add_seq_graph_t = 1 if ($i_no_structure);
253
254 ($i_t) or $i_t = 3; # default abstraction type is 3
255 my $change_shape_level = 0;
256 my @level_lens = ( -1, -1, -1, -1, -1 ); ## array_idx-1=shape level, value=start length of this level
257
258 if ( $i_t !~ /^\d+$/ ) {
259
260 my @t_minlens = split( ",", $i_t ); ## -t "3=100,4=200"
261
262 foreach my $idx ( 1 .. @t_minlens ) {
263
264 my $level = $t_minlens[ $idx - 1 ]; ## $level = "3=100"
265
266 die "$level Wrong -t format! Example: -t 3=0,4=100,5=200\n" if ( $level !~ /^\d+\=\d+$/ );
267 my @lev_len = split( "=", $level ); ## $level = "3=100"
268
269 die "Wrong -t format! First level given needs to be length 0! Example: -t 3=0,4=100,5=200\n" if ( $idx == 1 && $lev_len[1] != 0 );
270 die "Wrong -t format! Only level 1-5 allowed! Example: -t 3=0,4=100,5=200\n" if ( $lev_len[0] < 1 or $lev_len[0] > 5 );
271 die "Wrong -t format! Length >= 0 expected! Example: -t 3=0,4=100,5=200\n" if ( $lev_len[1] < 0 );
272 $change_shape_level = 1;
273 $level_lens[ $lev_len[0] - 1 ] = $lev_len[1];
274 }
275
276 ($i_debug) and print STDERR "i_t = $i_t - change $change_shape_level - shape level lengths#" . join( ":", @level_lens ) . "#" . join( ":", @t_minlens ) . "#\n";
277 }
278
279 ## checks for match-shape
280 if ($i_matchShape){
281 die "Please provide correct match shape string like '[]'! Exit...\n\n"
282 if ($i_matchShape !~ /^[\[\]_]+$/);
283 ## RNAshapes prodices anyway only 1 structure, no suboptimal structs in match-shape folding
284 $i_M = 1;
285 }
286
287 my $CURRDIR = getcwd;
288
289 # set up tmp directory
290 # default tmp is /var/tmp, usually not NFS mounted!
291 ( defined $i_tmp ) or $i_tmp = '/var/tmp/';
292 my $tmp_template = 'fasta2shrep-XXXXXX';
293
294 # CLEANUP => 1 : automatically delete at exit
295 $i_tmp = tempdir( $tmp_template, DIR => $i_tmp, CLEANUP => 1 );
296
297 # create GSPAN directory when not printing to stdout
298 if ( not $i_stdout ) {
299 if ($i_o) {
300 ( -e $i_o ) or system("mkdir -p $i_o");
301 } else {
302 system("mkdir -p GSPAN");
303 $i_o = $CURRDIR . "/GSPAN/";
304 }
305 }
306
307 ###############################################################################
308 # GLOBAL VARIABLES
309 ###############################################################################
310 $i_rnashapes_binpath or $i_rnashapes_binpath = "RNAshapes";
311 die "Please provide full RNAshapes path (e.g. /usr/bin/RNAshapes) !\n" if (!-f "$i_rnashapes_binpath");
312 my $rnashapes_loc = $i_rnashapes_binpath;
313 if ( !$rnashapes_loc || !-e $rnashapes_loc ) {
314 my $loc = `which RNAshapes`;
315 chomp($loc);
316 die "\nCannot find RNAshapes binary! Exit...\n\n" if ( !$loc );
317 die "\nCannot find RNAshapes binary! Exit...\n\n" if ( !-e $loc );
318 $rnashapes_loc = $loc;
319 }
320
321 my @WINDOWS = ();
322 @WINDOWS = split( ",", $i_wins ) if ($i_wins);
323 my $globalFolding;
324 $globalFolding = 1 unless @WINDOWS;
325 my $CURRUSER = getlogin;
326 my $SEQNO = 0; # used to id the sequences
327 my $GSPANNO = 0; # used for gspan filenames
328 # minimum length of sequence that will work with RNAshapes
329 # sequences with one or two nucleotides will be restricted to sequence-only graphs
330 # this ensures that no sequences are skipped and external info kept synchronized
331 my $GSPAN_SEQ_MINLEN = 3;
332
333 # name spaces
334 my $ABSTRUCT = "AS";
335 ###############################################################################
336 # EXECUTION CODE
337 ###############################################################################
338
339 # read fasta file into hash
340 my ( $headers_aref, $sequences_aref, $metainfo_aref ) = read_fasta_with_nonunique_headers_meta($i_fas);
341
342 # call script again on the sge cluster in a batch job
343 if ($i_sge) {
344 my $SCRIPTNAME = $0;
345 my $CLUSTERSUBMIT = $FindBin::Bin."/fasta2shrep_gspan.sge";
346
347 die "Cannot find SGE submit script $CLUSTERSUBMIT! Exit...\n\n" if ( !-e $CLUSTERSUBMIT );
348
349 my $SGE_jobs = @{$sequences_aref};
350 $SGE_jobs = ceil( @{$sequences_aref} / $i_groupsize ) if ($i_groupsize);
351
352 my $params = "-fasta $i_fas ";
353 $params .= "-wins $i_wins " if ($i_wins);
354 $params .= "-shift $i_shift " if ($i_shift);
355 $params .= "-e $i_e " if ($i_e);
356 $params .= "-c $i_c " if ($i_c);
357 $params .= "-t $i_t " if ($i_t);
358 $params .= "-u " if ($i_u);
359 $params .= "-r " if ($i_r);
360 $params .= "-M $i_M " if ($i_M);
361 $params .= "-o $i_o " if ($i_o);
362 $params .= "-i $i_i " if ($i_i);
363 $params .= "--sample-len $i_sample_min_length" if ($i_sample_min_length);
364 $params .= "-q " if ($i_q);
365 $params .= "-Tp $i_T " if ($i_T);
366 $params .= "--seq-grap-win " if ($i_add_seq_graph_win);
367 $params .= "--seq-graph-t " if ($i_add_seq_graph_t);
368 $params .= "--seq-graph-alph " if ($i_change_seq_graph_alph);
369 $params .= "-ignore-header " if ($i_ignore_seq_header);
370 $params .= "-cue " if ($i_crop_unpaired_ends);
371 $params .= "-stack " if ($i_stacks);
372 $params .= "--group $i_groupsize " if ($i_groupsize);
373 $params .= "-annotate $i_annotate" if ($i_annotate);
374 $params .= "-abstr" if ($i_abstr);
375 $params .= "-nostr " if ($i_no_structure);
376 $params .= "--debug " if ($i_debug);
377 $params .= "-vp " if ($i_vp);
378 $params .= "--match-shape $i_matchShape " if ($i_matchShape);
379
380 print "used script:" . $SCRIPTNAME . "\n";
381 print "used submit script:" . $CLUSTERSUBMIT . "\n";
382
383 $i_sge_logDir = "$i_o/SGE_log" if ( !$i_sge_logDir );
384 mkdir($i_sge_logDir);
385
386 $i_sge_errDir = $i_sge_logDir if ( !$i_sge_errDir );
387 mkdir($i_sge_errDir);
388
389 my $ssh = 1; ## can be used for debug shell script call
390 if ($ssh) {
391 system( "ssh $CURRUSER\@biui.informatik.uni-freiburg.de "
392 . "'export SGE_ROOT=/opt/sge-6.0/; cd $CURRDIR; "
393 . "/opt/sge-6.0/bin/lx24-amd64/qsub -t 1-$SGE_jobs -o $i_sge_errDir/ -e $i_sge_errDir/ "
394 . "$CLUSTERSUBMIT $CURRDIR $SCRIPTNAME \"$params\" ' " );
395 } else {
396 system("$CLUSTERSUBMIT $CURRDIR $SCRIPTNAME '$params' ");
397 }
398 exit;
399 }
400
401 ## compute shreps and gspan, either local or after SGE submission
402
403 # TODO read and process annotations
404
405 my @used_seq_headers;
406 my @used_seqs;
407 my @used_meta;
408 my $group_idx;
409
410 if ($i_jobid) {
411 ## just process the one sequence as given by the jobid number
412 my $used_grouping = 1; ## if no group is given, make 1 seq per job = group=1
413 $used_grouping = $i_groupsize if ($i_groupsize);
414 my $st = ( $i_jobid - 1 ) * $used_grouping + 1;
415 my $end = $st + $used_grouping - 1;
416 $end = @{$sequences_aref} if ( $end > @{$sequences_aref} );
417
418 foreach my $idx ( $st .. $end ) {
419 push( @used_seq_headers, $headers_aref->[ $idx - 1 ] );
420 push( @used_seqs, $sequences_aref->[ $idx - 1 ] );
421 push( @used_meta, $metainfo_aref->[ $idx - 1 ] );
422 }
423
424 $group_idx = $i_jobid;
425
426 #($i_debug) and print STDERR "st $st end $end gr $group_idx job $i_jobid gs $i_groupsize\n";
427
428 $GSPANNO = $i_jobid-1 if (!$i_groupsize);
429
430 } else {
431 ## process all sequences at once
432 @used_seq_headers = @{$headers_aref};
433 @used_seqs = @{$sequences_aref};
434 @used_meta = @{$metainfo_aref};
435 }
436
437 my $out;
438 my $gspanfile;
439 my $out_no_match_shape;
440
441 if ($i_matchShape && !$i_stdout && !$i_groupsize){
442 open($out_no_match_shape,">$i_o/fasta2shrep.no_match");
443 }
444
445 # for each sequence in the fasta file
446 while ( my $seq = shift @used_seqs ) {
447 my $tmp_header = shift @used_seq_headers;
448 my $tmp_meta = shift @used_meta;
449
450 my ( $seq_id, $seq_header ) = ( $tmp_header =~ /(\S+)\s*([\S*\s*]*)/ );
451 $i_ignore_seq_header and $seq_header = '';
452
453 my $seq_fasta = generate_single_fasta_from_sequence_X( $seq_id, $seq );
454 my $seq_len = length($seq);
455
456 # only print sequence graphs for sequences below this threshold
457 my $no_structure_override = ($seq_len < $GSPAN_SEQ_MINLEN) ? 1 : 0;
458
459 $GSPANNO++;
460
461 # set outstream for gspan output to correct file/STDOUT
462 if ($i_stdout) {
463 $out = \*STDOUT;
464 } elsif ( !$i_groupsize ) {
465 $gspanfile = $i_tmp . '/' . $GSPANNO . '.gspan';
466 open( $out, "| bzip2 -f > $gspanfile.bz2" );
467 } elsif ( ( $GSPANNO - 1 ) % $i_groupsize == 0 ) {
468
469 if ( $GSPANNO > 1 ) {
470 close($out);
471 system("mv $gspanfile.bz2 $i_o/$group_idx.group.gspan.bz2");
472 if ($out_no_match_shape){
473 close($out_no_match_shape);
474 system("mv $gspanfile.no_match $i_o/$group_idx.group.gspan.no_match");
475 }
476 }
477
478 if ( !$i_jobid ) {
479 $group_idx = int( ( $GSPANNO - 1 ) / $i_groupsize ) + 1;
480 }
481
482 $gspanfile = "$i_tmp/$group_idx.group.gspan";
483 open( $out, "| bzip2 -f > $gspanfile.bz2" );
484 open( $out_no_match_shape, ">$gspanfile.no_match" ) if ($out_no_match_shape);
485 }
486
487 ## do not use folding windows in special cases
488 if ( $globalFolding || $i_no_structure || $no_structure_override) {
489 @WINDOWS = ();
490 push( @WINDOWS, $seq_len );
491 }
492
493 ##check win sizes for global folding
494 my @WINDOWS_used = sort { $a <=> $b } @WINDOWS;
495 foreach my $w_idx ( 0 .. ( @WINDOWS_used - 1 ) ) {
496 if ( $WINDOWS_used[$w_idx] >= $seq_len && $WINDOWS_used[$w_idx] > 1 ) {
497 if ( $w_idx < $#WINDOWS_used ) {
498 @WINDOWS_used = @WINDOWS_used[ 0 .. $w_idx ];
499 last;
500 }
501 }
502 }
503
504 ## use seq graph only if no shape folding wanted
505 $i_add_seq_graph_t = 1 if ($i_no_structure || $no_structure_override);
506
507 ## no shape info in graphheader if we have fixed structure (according LocaRNA handling)
508 ## use tags #FS or #S for provided structure
509 my $graph_header;
510 my @struct_meta = grep { $_ =~ /#FS/ || $_ =~ /#S/ } keys %{$tmp_meta};
511
512 if (@struct_meta) {
513 $graph_header = getGraphHeader( $seq_id, $seq_header, \@WINDOWS, $i_shift, $i_e, $i_c, $i_t, $i_u, $i_r, $i_M, $i_crop_unpaired_ends, $i_i, $i_sample_min_length, $i_q, $i_T, $seq_len, 1 );
514 } else {
515 $graph_header = getGraphHeader( $seq_id, $seq_header, \@WINDOWS, $i_shift, $i_e, $i_c, $i_t, $i_u, $i_r, $i_M, $i_crop_unpaired_ends, $i_i, $i_sample_min_length, $i_q, $i_T, $seq_len, $i_no_structure || $no_structure_override );
516 }
517
518 print $out $graph_header;
519
520 my $gi = 0; # current graph index
521
522 ## add graph with no structure at all depending on $i_add_seq_graph_t
523 if ($i_add_seq_graph_t) {
524
525 ($gi) = convertSeqWindow( $seq, $seq_len, 1, $gi, $graph_header, $out, $i_annotate, $seq );
526
527 }
528
529 ## encode fixed structure only if provided and structures wanted in general
530 if ( @struct_meta && !$i_no_structure && !$no_structure_override) {
531
532 my $struct_meta = "#FS";
533 $struct_meta = "#S" if ( !exists $tmp_meta->{$struct_meta} );
534
535 my $seq_shrep = [ $tmp_meta->{$struct_meta}, "ENERGY", "0.00", "SHAPE", $struct_meta ];
536 $gi = convertShapeWindow( [$seq_shrep], $seq, $seq_len, 1, $gi, $out,
537 $graph_header, $i_annotate, $i_abstr, $i_crop_unpaired_ends, $i_stacks, $seq );
538 @WINDOWS_used = (); ## no shape folding if we have a fixed structure
539
540 }
541
542 ## ignore RNAshapes folding if wanted, but do correct file move afterwards
543 ## (just "next" does not work due to output)
544 @WINDOWS_used = () if ($i_no_structure or $no_structure_override);
545
546 #for each window size in list
547 foreach my $win_size (@WINDOWS_used) {
548
549 # calculate shift size from percentage
550 my $curr_shift = 1;
551 if ($i_shift) {
552 $curr_shift = ( $i_shift / 100 ) * $win_size;
553 $curr_shift = int($curr_shift); #round down
554 $curr_shift = 1 unless ($curr_shift); # just in case it is 0
555 }
556 ($i_debug) and print STDERR "winsize: $win_size curr_shift: $curr_shift\n";
557 ($i_debug) and print STDERR "\nNext: $seq_id\t winsize:$win_size \n";
558
559 # choose current shape level, depending on $i_t
560 my $curr_t = 0;
561 if ($change_shape_level) {
562 for ( my $i = 0 ; $i < @level_lens ; $i++ ) {
563 $curr_t = $i + 1 if ( $level_lens[$i] != -1 && ( $level_lens[$i] <= $win_size ) );
564 }
565 ($i_debug) and print STDERR "$win_size curr type $curr_t\n";
566 } else {
567 $curr_t = $i_t;
568 }
569
570 my $rnashapesoutput_fh;
571
572 # call RNAshapes and write to $rnashapesoutput_fh
573 $rnashapesoutput_fh = call_RNAshapes( $seq_fasta, $rnashapes_loc, $win_size,
574 $curr_shift, $i_e, $i_c, $curr_t, $i_u, $i_r, $i_q, $i_T, $i_i, $i_sample_min_length, $seq_len, $i_matchShape );
575
576 # read RNAshapes output from $rnashapesoutput_fh and write subgraph
577 # to gspan file
578 my $gi_old = $gi;
579 $gi = convert_RNAshapes_output( $rnashapesoutput_fh, $gi, $i_M, $out, $graph_header,
580 $win_size, $seq_len, $curr_t, $i_annotate, $i_abstr, $i_crop_unpaired_ends, $i_stacks, $seq );
581
582 ## no (match) shape found at all for this seq
583 if ($gi == $gi_old+1){
584 $gi = convertSeqWindow( "X", 1, 1, $gi, $graph_header, $out, $i_annotate, $seq );
585 print $out_no_match_shape $seq_id."\n" if (!$i_stdout && $out_no_match_shape);
586 }
587 } ## foreach WINDOW_used
588
589 if ( !$i_stdout && !$i_groupsize ) {
590 close($out);
591 move "$gspanfile.bz2", "$i_o/$GSPANNO.gspan.bz2";
592 }
593
594 system("rm $seq_fasta");
595
596 } ## while @used_seqs
597
598 if ($i_groupsize) {
599 close($out);
600 system("mv $gspanfile.bz2 $i_o/$group_idx.group.gspan.bz2");
601 close($out_no_match_shape) if ($out_no_match_shape);
602 move "$gspanfile.no_match", "$i_o/$group_idx.group.gspan.no_match" if ($out_no_match_shape);
603 } elsif ( $out_no_match_shape && !$i_stdout && !$i_groupsize ) {
604 close($out_no_match_shape);
605 }
606
607 ###############################################################################
608 # METHODS
609 ###############################################################################
610
611 ############################################################################
612 # Generates fasta file for a single sequence (one-lined-fasta). This
613 # fasta is stored in the temp directory and should be deleted at the end.
614 # Input:
615 # seq_id : the sequence ID
616 # seq : the sequence
617 #
618 # Output:
619 # The fasta file name
620 ############################################################################
621 sub generate_single_fasta_from_sequence_X {
622 my ( $seq_id, $seq ) = @_;
623
624 $seq = uc($seq);
625 $seq =~ tr/T/U/;
626 $seq =~ s/[^AUCGN]/N/g;
627
628 my $outfas = $i_tmp . "/seq_" . $SEQNO++ . ".fasta";
629 my $host = readpipe("hostname");
630 open( FAS, ">$outfas" ) or die "$host Cannot open file $outfas! Exit...\n\n";
631 print FAS ">$seq_id\n$seq";
632 close(FAS);
633
634 return $outfas;
635 }
636
637 ############################################################################
638 # RNAshapes is called with the given input or default parameters.
639 # Input:
640 # seq_fasta : the sequence fasta file
641 # rnashapes_location : the location of the installation files for RNAshapes
642 # win_size : the current window size
643 # shift : the input parameter -shift
644 # e : the input parameter -e
645 # c : the input parameter -c
646 # t : the input parameter -t
647 # u : the input parameter -u
648 # r : the input parameter -r
649 #
650 # Output: none
651 ############################################################################
652 sub call_RNAshapes {
653 my ( $seq_fasta, $rnashapes_location, $win_size, $shift, $e, $c, $t, $u, $r, $q, $T, $i, $sample_length, $seqLen, $matchShape ) = @_;
654 my $FUNCTION = "call_RNAshapes in fasta2shrep_gspan.pl";
655
656 ($seq_fasta) or die("INPUT ERROR in $FUNCTION: the fasta file is compulsory!\n");
657 ($rnashapes_location) or die( "INPUT ERROR in $FUNCTION: the RNAshapes location" . " is compulsory!\n" );
658 die "$rnashapes_location does not exist! Exit...\n\n" if ( !-e $rnashapes_location );
659 my $call = $rnashapes_location . " -o 1 "; # the output format is of type 1
660 $call .= "-q " if ($q);
661 $call .= "-T $T " if ( $q and $T );
662 $call .= "-w $win_size ";
663 $call .= "-W $shift " if ($i_shift);
664 die("ERROR in $FUNCTION: Give only one of the options -c or -e (RNAshapes)!\n")
665
666 if ( $e && $c );
667 $call .= "-e $e " if ($e);
668 $call .= "-c $c " if ($c);
669 $call .= "-t $t " if ($t);
670 $call .= "-u " if ( $u and not $i ); ## not possible in sampling mode
671 $call .= "-r " if ($r);
672 $call .= "-m $matchShape " if ($matchShape);
673
674 ## check is a bit long but : we want to sample if the window is larger than $sample_length and full seq is larger than window or sample_len
675 ## necessary to do sampling a large window is given, sample_length is shorter than window, but seq is longer than sample_len
676 $call .= "-i $i -A " if ( not $q and $i and ( $win_size >= $sample_length and ( $win_size <= $seqLen or $seqLen >= $sample_length ) ) ); ## -A is to omit samples and print only combined shape probs
677 $call .= " < $seq_fasta";
678
679 ($i_debug) and print STDERR "$seqLen $sample_length $win_size $call\n";
680
681 open my $rnashapesoutput, "$call |" or die( "ERROR in $FUNCTION: The following call " . "could not be carried out!\n$call\n" );
682
683 return $rnashapesoutput;
684 }
685
686 ############################################################################
687 # The output of RNAshapes for one sequence and one window size is read
688 # and converted into graph format.
689 # Input:
690 # rnashapeoutput : filehandle for RNAshapes output in format -o 1
691 # curr_gi : current graph index (for vertices)
692 # maxShreps : max number of shreps to convert to graphs
693 # graph_file_hdl : the output handler for the graph file
694 # graphHead : the header line for the complete graph (for sequence)
695 # winSize : current window size in input for RNAshapes
696 # seqLen : full input seq length
697 # used_t :
698 # annotate:
699 # abstr :
700 # cue :
701 # stacks :
702 # orig_seq : the nucleotide sequence as read from fasta
703 #
704 # Output:
705 # The current graph index
706 ############################################################################
707 sub convert_RNAshapes_output {
708 my ( $rnashapesoutput, $curr_gi, $maxShreps, $graph_file_hdl, $graphHead, $winSize, $seqLen, $used_t, $annotate, $abstr, $cue, $stacks, $orig_seq ) = @_;
709
710 ## omit first line in output, contains fasta header line
711 my $line = <$rnashapesoutput>;
712 my $win_shreps = [];
713 my $win_start;
714 my $win_end;
715 my $win_seq;
716 my $win_shrep_count = 0;
717 my $win_sample = 0;
718 my $winHead;
719 my $win_globalFolding;
720 my $win_size_real;
721
722 # reading RNAshapes output
723 while ( $line = <$rnashapesoutput> ) {
724
725 if ( $line =~ /^(\d+)\s+(\d+)$/ ) {
726 ## line: "<start> <end>"
727
728 if ( @{$win_shreps} > 0 ) {
729
730 print $graph_file_hdl $winHead;
731
732 ## remove SHAPE="_" shrep if it exists in $win_shreps
733 ## do this only when have a sequence graph already
734 if ($i_add_seq_graph_t) {
735 my @new_win_shreps = ();
736 map { push( @new_win_shreps, $_ ) if ( $_->[ $#{$_} ] ne "_" ) } @{$win_shreps};
737 $win_shreps = \@new_win_shreps;
738 }
739
740 ## add graph with no structure depending on $i_add_seq_graph_win to win
741 if ($i_add_seq_graph_win ) {
742 ( $curr_gi ) = convertSeqWindow( $win_seq, $win_size_real,
743 $win_start, $curr_gi, $winHead, $graph_file_hdl, $annotate, $orig_seq );
744 }
745
746 $curr_gi = convertShapeWindow( $win_shreps, $win_seq, $win_size_real,
747 $win_start, $curr_gi, $graph_file_hdl, $winHead, $annotate, $abstr,
748 $cue, $stacks, $orig_seq );
749 }
750
751 ## set new window params
752 $win_shreps = [];
753 $win_start = $1;
754 $win_end = $2;
755 $win_shrep_count = 0;
756 $win_size_real = $win_end - $win_start + 1;
757
758 if ( ($win_size_real) >= $seqLen ) {
759 $win_globalFolding = 1;
760 } else {
761 $win_globalFolding = 0;
762 }
763 my $win_center = $win_start + ( ( $win_size_real + 1 ) / 2 );
764
765 $winHead = getWindowHeader( $graphHead, $winSize, $win_start, $win_end, $win_center, $win_globalFolding, $win_sample, $used_t );
766
767 } elsif ( $line =~ /^(\S+)$/ ) {
768 ## line: "CUUAUGAGUAAGGAAAAUAACGAUUCGGGGUGACGCCCGAAUCCUCACUG"
769 $win_seq = uc($1); ## to be 101% sure that we have by default uppercase chars
770
771 } elsif ( $line =~ /^Results for (\d+) iterations:$/ ) {
772 ## line: "Results for 10 iterations:"
773 $win_sample = $1;
774
775 } elsif ( $line =~ /^Shape\s+\S+\s+not found within energy range.*$/ ) {
776 ## line: "Shape [] not found within energy range (-24.75 to -27.50). Try -c or -e to increase range."
777 $win_shreps = [];
778
779 } elsif ( $line =~ /^([\(\)\.]+)\s+\((\S+)\)\s+(\S+)$/ ) {
780 ## line:"...((((..(((....)))))))...........(((((......))))) (-10.10) [[]][]"
781 ## take only $maxShreps shreps per window if set
782 next if ( $maxShreps && $win_shrep_count >= $maxShreps );
783 push( @{$win_shreps}, [ $1, "ENERGY", $2, "SHAPE", $3 ] );
784 $win_shrep_count++;
785
786 } elsif ( $line =~ /^([\(\)\.]+)\s+\((\S+)\)\s+\((\S+)\)\s+(\S+)$/ ) {
787 ## line:"((((..((((...((.((.((.....)).)).))...))))..))))... (-10.60) (0.7795360) [[[[[]]]]]"
788 ## take only $maxShreps shreps per window if set
789 next if ( $maxShreps && $win_shrep_count >= $maxShreps );
790 push( @{$win_shreps}, [ $1, "ENERGY", $2, "PROB", $3, "SHAPE", $4 ] );
791 $win_shrep_count++;
792
793 } elsif ( $line =~ /^([\(\)\.]+)\s+\((\S+)\)\s+(\S+)\s+(\S+)$/ ) {
794 ## line:"((((..((((...((.((.((.....)).)).))...))))..))))... (-10.60) 0.3000000 [[[[[]]]]]"
795 ## take only $maxShreps shreps per window if set
796 next if ( $maxShreps && $win_shrep_count >= $maxShreps );
797 push( @{$win_shreps}, [ $1, "ENERGY", $2, "SHAPEPROB", $3, "SHAPE", $4 ] );
798 $win_shrep_count++;
799
800 } elsif ( $line =~ /^([\(\)\.]+)\s+\((\S+)\)\s+\((\S+)\)\s+(\S+)\s+(\S+)$/ ) {
801 ## line:"((((..((((...((.((.((.....)).)).))...))))..))))... (-10.60) (0.7795360) 0.3000000 [[[[[]]]]]"
802 ## take only $maxShreps shreps per window if set
803 next if ( $maxShreps && $win_shrep_count >= $maxShreps );
804 push( @{$win_shreps}, [ $1, "ENERGY", $2, "PROB", $3, "SHAPEPROB", $4, "SHAPE", $5 ] );
805 $win_shrep_count++;
806
807 } else {
808 next if ( $line =~ /^$/ );
809 die "Unexpected shape output format!\nline=$line\n\nExit...\n\n";
810 }
811 }
812
813 ## convert last windows
814 if ( @{$win_shreps} > 0 ) {
815
816 print $graph_file_hdl $winHead;
817
818 ## remove SHAPE="_" shrep if it exists in $win_shreps
819 ## do this only when have a sequence graph already
820 if ($i_add_seq_graph_t) {
821 my @new_win_shreps = ();
822 map { push( @new_win_shreps, $_ ) if ( $_->[ $#{$_} ] ne "_" ) } @{$win_shreps};
823 $win_shreps = \@new_win_shreps;
824 }
825
826 ## add graph with no structure depending on $i_add_seq_graph_win to win
827 if ($i_add_seq_graph_win) {
828 ($curr_gi) = convertSeqWindow( $win_seq, $win_size_real, $win_start,
829 $curr_gi, $winHead, $graph_file_hdl, $annotate, $orig_seq );
830 }
831
832 $curr_gi = convertShapeWindow( $win_shreps, $win_seq, $win_size_real,
833 $win_start, $curr_gi, $graph_file_hdl, $winHead, $annotate, $abstr, $cue,
834 $stacks, $orig_seq );
835 }
836
837 close($rnashapesoutput);
838
839 return $curr_gi + 1; # return the gi (graph index) for the next subgraph
840 }
841
842 ## Sub to create graph for a complete unstructured sequence
843 # TODO: document function
844 sub convertSeqWindow {
845 my ( $win_seq, $win_size_real, $win_start, $curr_gi, $winHead, $graph_file_hdl, $annotate, $orig_seq ) = @_;
846
847 my $seq_shrep;
848
849 # use a different alphabet (lowercase) if option seq-graph-alph set
850 my $seq_graph_sequence = $i_change_seq_graph_alph ? lc($win_seq) : uc($win_seq);
851 $seq_shrep = [ "." x $win_size_real, "ENERGY", "0.00", "SHAPE", "_", "STRUCT", "." x $win_size_real, "SEQ", $seq_graph_sequence ];
852 my $backboneGraph_ref = getBackboneGraph( $seq_graph_sequence, $curr_gi, $win_start, 0, ( $win_size_real - 1 ), $orig_seq );
853
854 print $graph_file_hdl getSeqHeader( $winHead, $seq_shrep );
855 print $graph_file_hdl join( "\n", @{$backboneGraph_ref} ) . "\n";
856
857 $curr_gi += $win_size_real;
858
859 if ($annotate) {
860 ## TODO add annotation graphs
861 }
862
863 return ($curr_gi);
864 }
865
866 ############################################################################
867 # The results for one window are converted in this method to GSPAN graphs.
868 # Input:
869 # win_shreps_aref : the array ref of shreps (dot-bracket structures) for the
870 # current window
871 # win_seq : the nucleotide sequence for the current window
872 # win_size_real : the current (true) window size
873 # win_start : the starting position of win_seq in the original seq (1-n)
874 # curr_gi : the current graph index
875 # graph_file_hdl : the graph file handler
876 # winHead : the header line for the current sequence window
877 # TODO annotate
878 # abstr : input parameter -abstr
879 # cue : input parameter -cue
880 # stacks : input parameter -stacks
881 # orig_seq : the nucleotide sequence as read from fasta
882 #
883 # Output:
884 # The current graph index
885 ############################################################################
886 sub convertShapeWindow {
887 my ( $win_shreps_aref, $win_seq, $win_size_real, $win_start, $curr_gi,
888 $graph_file_hdl, $winHead, $annotate, $abstr, $cue, $stacks, $orig_seq ) = @_;
889
890 ## generate for each shrep a connected component in gspan file
891 foreach my $shrep ( @{$win_shreps_aref} ) {
892
893 # get the current gi as it was at the beginning of this shrep
894 my $shrep_gi = $curr_gi;
895
896 # cut off unpaired ends, if option is given
897 my $shrep_struct = $shrep->[0];
898 my $crop_index_left = 0;
899 my $crop_index_right = length($shrep_struct) - 1;
900
901 # find croping indices with numbering from 0 to n-1
902 if ($cue) {
903 $crop_index_left = index( $shrep_struct, "(" ); # find 1st occ of "("
904 $crop_index_right = rindex( $shrep_struct, ")" ); # find last occ of ")"
905
906 # if the complete window is unpaired, then don't crop
907 if ( $crop_index_left == -1 ) {
908 $crop_index_left = 0;
909 $crop_index_right = length($shrep_struct) - 1;
910 }
911 }
912
913 # create structure graph
914 my $backboneGraph_ref = getBackboneGraph( $win_seq, $curr_gi, $win_start, $crop_index_left, $crop_index_right, $orig_seq );
915 my $structGraph_ref;
916
917 # add both basic edges, stacks and abstract graph
918 if ($abstr) {
919 ( $structGraph_ref, $curr_gi ) = getStructPlusAbstractStructGraph( $shrep, $curr_gi, $win_size_real, $cue, $stacks );
920
921 # just add basic structure graph edges, and stacks
922 } else {
923 ( $structGraph_ref, $curr_gi ) = getStructGraph( $shrep, $curr_gi, $win_size_real, $stacks );
924 }
925
926 # additional information for shrep header: sequence and dot-bracket
927 my $crop_length = $crop_index_right - $crop_index_left + 1;
928 my $shrepheader_struct = substr( $shrep_struct, $crop_index_left, $crop_length );
929 my $shrepheader_seq = substr( $win_seq, $crop_index_left, $crop_length );
930 push( @{$shrep}, 'STRUCT', $shrepheader_struct, 'SEQ', $shrepheader_seq );
931
932 # print structure graph to file
933 print $graph_file_hdl getShapeHeader( $winHead, $shrep );
934 print $graph_file_hdl join( "\n", @{$backboneGraph_ref} ) . "\n";
935
936 # don't print empty array; sgdnspdk breaks with empty lines
937 if ( @{$structGraph_ref} > 0 ) {
938 print $graph_file_hdl join( "\n", @{$structGraph_ref} ) . "\n";
939
940 if ($annotate) {
941
942 # TODO add annotations to shrep structure
943 # $win_start, $win_end (1-n)
944 }
945
946 }
947 }
948
949 return $curr_gi;
950 }
951
952 ###########################################################################
953 # Here the vertices for the nucleotides are generated and also the backbone
954 # edges, which connect the nucleotides in their correct order (i.e order
955 # of the given sequence)
956 # Input:
957 # win_seq : the sequence for the current window
958 # win_start : the starting position for the current window
959 # curr_crop_i_left : the cropping index for the left end of sequence (-cue)
960 # curr_crop_i_right : the cropping index for the right end of sequence (-cue)
961 # orig_seq : the nucleotide sequence as read from fasta
962 #
963 # Output:
964 # The lines of the graph in an array reference to be printed to a file with
965 # one element per line.
966 ############################################################################
967 sub getBackboneGraph {
968 my ( $win_seq, $curr_gi, $win_start, $curr_crop_i_left, $curr_crop_i_right, $orig_seq ) = @_;
969
970 # RNAshapes substitutes T -> U, sequence only does not
971 # thus, just transform all t/T -> u/U to remain consistent
972 $orig_seq =~ tr /tT/uU/;
973 $win_seq =~ tr /tT/uU/;
974
975 my @seq;
976 @seq = split( "", $win_seq );
977
978 # if the vp option is set we need to obtain the original capitalization from
979 # $orig_seq and extract the windowed sequence manually
980 my @seq_vp;
981 if ( defined $i_vp ) {
982 my $win_len = length($win_seq);
983 my $capitalized_win_seq = substr( $orig_seq, $win_start - 1, $win_len );
984 ( uc($capitalized_win_seq) eq uc($win_seq) ) or
985 die( "error: windowed sequence generated due to vp option not equal " .
986 "to sequence reported by RNASHAPES.\n" .
987 "'${capitalized_win_seq}' != '${win_seq}'" );
988 @seq_vp = split( "", $capitalized_win_seq );
989 }
990
991 my @vert = ();
992 my @edg = ();
993
994 my $curr_abs_pos = $win_start + $curr_crop_i_left;
995 $curr_gi += $curr_crop_i_left;
996
997 # set vertice labeled with 'v' or 'V' according to sequence and vp option
998 # when vp set,
999 # uppercase nucleotides are annotated with 'v'
1000 # lowercase nucleotides are annotated with 'V'
1001 my $vertice_label = ( defined $i_vp and ( $seq_vp[$curr_crop_i_left] =~ /[a-z]/ ) ) ? 'V' : 'v';
1002
1003 # create backbone vertice of first nucleotide
1004 push( @vert, join( ' ', $vertice_label, $curr_gi, $seq[$curr_crop_i_left], $curr_abs_pos ) );
1005
1006 foreach my $idx ( ( $curr_crop_i_left + 1 ) .. $curr_crop_i_right ) {
1007 $curr_abs_pos++;
1008 $curr_gi++;
1009
1010 # set vertice label as described above
1011 $vertice_label = ( defined $i_vp and ( $seq_vp[$idx] =~ /[a-z]/ ) ) ? 'V' : 'v';
1012 push( @vert, join( ' ', $vertice_label, $curr_gi, $seq[$idx], $curr_abs_pos ) );
1013 push( @edg, join( ' ', "e", $curr_gi - 1, $curr_gi, '> 1' ) );
1014 }
1015
1016 my @ret = ( @vert, @edg );
1017
1018 return \@ret;
1019 }
1020
1021 ###########################################################################
1022 # This method does the same as getStructGraph, but is extended to identify
1023 # the abstract parts of the structure and also add this to the graph
1024 # via abstract relations, see information below.
1025 # We already have the backbone graph, now the base-pair edges need to be
1026 # added to this graph and the abstract graph is added after this. Finally
1027 # the abstract relations, pointing from the abstract level to the basic
1028 # structure level is added. All the while, we keep track of the current
1029 # index graph. Furthermore, in the basic structure graph, we have vertices
1030 # stacks that connects the four stacked base-pairs involved. (symbols P
1031 # for vertex and p for edges). The abstract vertices are labelled according
1032 # to the given name-space, then #, then the abstract structure type, e.g.
1033 # HL for hairpin-loop. The edges between the structure types are denoted
1034 # with n, and the relation vertex is ^R, with ^r going from the abstract
1035 # structure to the relation and from the relation to the basic structure
1036 # is @r.
1037 #
1038 # Input:
1039 # curr_shrep : the current shrep as dot-bracket structure
1040 # curr_gi : the current graph index
1041 # win_size : the window size
1042 # cue : the input parameter -cue (whether to crop unpaired ends)
1043 # stacks : the input parameter -stacks (whether to add stack information)
1044 #
1045 # Output:
1046 # The structure graph information line-by-line as an array reference and
1047 # the current graph index.
1048 #
1049 ############################################################################
1050 sub getStructPlusAbstractStructGraph {
1051 my ( $curr_shrep, $curr_gi, $win_size, $cue, $stacks ) = @_;
1052
1053 # $win_size = length($curr_shrep->[0]);
1054
1055 my @struct = split( "", $curr_shrep->[0] );
1056
1057 # print "=================testing: new shrep : gi=$curr_gi ===============================\n" if ($i_debug);
1058
1059 # OBJECTS and VARIABLES
1060 # all indices are saved according to the current graph index, $curr_gi,
1061 # which is at the beginning of the current window, so that the nucleotide
1062 # index can be inferred using $idx + $curr_gi
1063 # opening brackets
1064 my @open_blks = (); # open blocks of consecutive opening brackets (array of arrays)
1065 my @p_open = (); # currently open block of the last identified complete block
1066 my @c_open = (); # current "incomplete" block of consecutive opening brackets
1067 # base-pair objects
1068 my %bps = (); # hash of all base-pairs with stem for a value, if it is at
1069 # the BP is opening or closing a stem object
1070 my @c_bp = (); # current BP, just being closed at current index pos (i,j)
1071 my @p_bp = (); # previously closed BP at position before current index
1072 # stem objects
1073 my %stems = (); # all stems, key="i:j,k:l", value=stem array, (i,j) outer BP
1074 my @c_stem = (); # current incomplete stem object
1075 my @stmbrks = (); # all stem-breaks in the form of (i,k,l), where i is the remaining
1076 # open base and (k,l) is the subsequent closing base-pair
1077
1078 # my @p_stmbrk = (); # previous stem-break in the form of (i,k,l), where i is the remaining
1079 # # open base and (k,l) is the subsequent closing base-pair
1080 # loop objects
1081 my @up = (); # all unpaired regions (i,j) that are of unkown type
1082 my @c_up = (); # current incomplete unpaired object
1083 my @hls = (); # all hairpin loop objects
1084 my @bls = (); # all bulge-loop objects
1085 my @ils = (); # all internal loops
1086 my @mls = (); # all multi-loops
1087 my @c_mls = (); # current incomplete multi-loops! (there can be more than one)
1088 my @els = (); # all external loops
1089 my @c_el = (); # current incomplete external loop
1090
1091 # iterate through shrep structure an identify abstract parts
1092 foreach my $idx ( 0 .. @struct - 1 ) {
1093
1094 #===================================================
1095 # current char is the opening bracket of a base-pair
1096 if ( $struct[$idx] eq "(" ) {
1097
1098 #===================================================
1099
1100 # update current window index to current graph index
1101 $idx += $curr_gi;
1102
1103 # case: '((' currently there is an open block, extend it
1104 if (@c_open) {
1105 push( @c_open, $idx );
1106
1107 # case: '.(' or ')(' or BOS'(' there is no open block, create a new one ".(" or ")("
1108 } else {
1109
1110 # begin a new current open block
1111 push( @c_open, $idx );
1112
1113 # case: ')('
1114 if (@c_stem) {
1115
1116 # CLOSE STEM
1117 print STDERR "TEST - closed stem:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@c_stem ) ) if ($i_debug);
1118 close_stem( \@c_stem, \%stems, \@p_bp, \@c_bp, \@p_open, \@stmbrks, \@open_blks, \%bps );
1119
1120 # case (x(x)(
1121 if (@open_blks) {
1122
1123 # EXTEND_ML, if one exists
1124 if (@c_mls) {
1125 extend_or_open_nested_ML( $curr_shrep, \@c_mls, \@p_bp, \@up, \@stmbrks, \@open_blks, $idx, $curr_gi );
1126
1127 # OPEN_ML
1128 } else {
1129
1130 # case 'x((x)(' the opening of the ML was a stem-break
1131 if ( @stmbrks
1132 && $stmbrks[-1]->[1] == $p_bp[0]
1133 && $stmbrks[-1]->[2] == $p_bp[1] ) {
1134 my @c_stmbrk = @{ pop(@stmbrks) };
1135 my @a = ( $c_stmbrk[0], -1 );
1136 my @ml = ();
1137 push( @ml, \@a ); # 1st base-pair is in-waiting
1138 my @a2 = ( $c_stmbrk[1], $c_stmbrk[2] );
1139 push( @ml, \@a2 ); #2nd BP
1140 my @a3 = ($idx);
1141 push( @ml, \@a3 ); #first of current BP
1142 push( @c_mls, \@ml );
1143 print STDERR "TEST - opened ML with SB:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) ) if ($i_debug);
1144
1145 # case 'x(.(x)('
1146 } elsif (@up) {
1147 my @tmp_up = @{ pop(@up) };
1148
1149 # compare adjacent BP to UP to see if they fit
1150 if ( $tmp_up[1] + 1 == $p_bp[0]
1151 && $open_blks[-1]->[-1] == $tmp_up[0] - 1 ) {
1152 my @a = ( $open_blks[-1]->[-1], -1 );
1153 my @ml = ();
1154 push( @ml, \@a ); # 1st BP awaits closing
1155 my @newbp = @p_bp;
1156 push( @ml, \@newbp ); # 2nd BP
1157 my @a2 = ($idx);
1158 push( @ml, \@a2 ); # current open BP, awaits closing
1159 push( @c_mls, \@ml );
1160 print STDERR "TEST - opened ML with prev UP1:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) )
1161
1162 if ($i_debug);
1163 } else {
1164 die "ERROR: the base-pairs to not match the " . "previous unpaired region?!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
1165 }
1166 } else {
1167 die "ERROR: in case ML, but there is no initial" . "part of the ML to be identified\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
1168 }
1169 }
1170
1171 # case: 'x(x)('
1172 } else {
1173
1174 # case: '.(x)(' => CLOSE_EL
1175 if (@c_el) {
1176 if ( $c_el[-1] == $p_bp[0] ) {
1177 push( @c_el, $p_bp[1] );
1178 my @newel = @c_el;
1179 push( @els, \@newel );
1180 @c_el = ();
1181 print STDERR "TEST - closed EL1:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newel ) ) if ($i_debug);
1182 } else {
1183 die "ERROR: the previous BP must match the current EL\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
1184 }
1185 }
1186
1187 # in both cases OPEN_EL
1188 push( @c_el, @p_bp );
1189 push( @c_el, $idx );
1190 print STDERR "TEST - opened EL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@c_el ) ) if ($i_debug);
1191 }
1192
1193 # case '.('
1194 } elsif (@c_up) {
1195
1196 # case (x(x).( OR 'x(.('
1197 if (@open_blks) {
1198
1199 # case 'x(x).(' ML
1200 if ( @p_bp && ( $p_bp[1] == $c_up[0] - 1 ) ) {
1201
1202 # case '((x)x(x).(' EXTEND_ML, if one exists
1203 if (@c_mls) {
1204
1205 extend_or_open_nested_ML( $curr_shrep, \@c_mls, \@p_bp, \@up, \@stmbrks, \@open_blks, $idx, $curr_gi );
1206
1207 # case 'x((x).(' or 'x(.(x).(' OPEN_ML
1208 } else {
1209
1210 # case 'x((x).(' the opening of the ML was a stem-break
1211 if ( @stmbrks
1212 && $stmbrks[-1]->[1] == $p_bp[0]
1213 && $stmbrks[-1]->[2] == $p_bp[1] ) {
1214 my @c_stmbrk = @{ pop(@stmbrks) };
1215 my @a = ( $c_stmbrk[0], -1 );
1216 my @ml = ();
1217 push( @ml, \@a ); # 1st base-pair is in-waiting
1218 my @a2 = ( $c_stmbrk[1], $c_stmbrk[2] );
1219 push( @ml, \@a2 ); #2nd BP
1220 my @a3 = ($idx);
1221 push( @ml, \@a3 ); #first of current BP
1222 push( @c_mls, \@ml );
1223 print STDERR "TEST - open ML with SB:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) ) if ($i_debug);
1224
1225 # case 'x(.(x).(' opening the ML with initial UP region
1226 } elsif (@up) {
1227
1228 # get previous unpaired region
1229 my $tmp = pop(@up);
1230 my @p_up = @{$tmp};
1231
1232 # case: '(.(x).(' compare adjacent BP to UP to see if they create a ML
1233 if ( $p_up[1] + 1 == $p_bp[0]
1234 && $open_blks[-1]->[-1] == $p_up[0] - 1 ) {
1235 my @a = ( $open_blks[-1]->[-1], -1 );
1236 my @ml = ();
1237 push( @ml, \@a ); # 1st BP awaits closing
1238 my @newbp = @p_bp;
1239 push( @ml, \@newbp ); # 2nd BP
1240 my @a2 = ($idx);
1241 push( @ml, \@a2 ); # current open BP, awaits closing
1242 push( @c_mls, \@ml );
1243 print STDERR "TEST - opened ML with prev UP2:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) )
1244
1245 if ($i_debug);
1246 }
1247
1248 # has to be an ML, but can't find first part
1249 } else {
1250 die "ERROR: in case ML, but there is no initial " . "part of the ML to be identified\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
1251 }
1252 }
1253
1254 # case '(.(
1255 } else {
1256 my @newup = @c_up;
1257 push( @up, \@newup );
1258 print STDERR "TEST - closed UP:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newup ) ) if ($i_debug);
1259 }
1260
1261 # case: 'x(x).(' OR 'EOS.('
1262 } else {
1263
1264 # case: '.(x)(' => CLOSE_EL
1265 if (@c_el) {
1266 if ( $c_el[-1] == $c_bp[0] ) {
1267 push( @c_el, $p_bp[1] );
1268 my @newel = @c_el;
1269 push( @els, \@newel );
1270 @c_el = ();
1271 print STDERR "TEST - closed EL2:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newel ) ) if ($i_debug);
1272 } else {
1273 die "ERROR: the previous BP must match the current EL\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
1274 }
1275 }
1276
1277 # in both cases OPEN_EL
1278 if (@p_bp) {
1279 push( @c_el, @p_bp );
1280 push( @c_el, $idx );
1281 } else {
1282
1283 # ignore this if -cue is given
1284 # TODO check the gi index
1285 if ($cue) {
1286 @c_el = ();
1287 } else {
1288 push( @c_el, ( $curr_gi, $curr_gi ) );
1289 push( @c_el, $idx );
1290 print STDERR "TEST - opened EL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@c_el ) ) if ($i_debug);
1291 }
1292 }
1293 }
1294 @c_up = ();
1295
1296 } else {
1297
1298 # no more cases except opening bracket at beginning of sequence
1299 # in this case do nothing, as the index has already been added
1300 die "ERROR: '((', ')(', '.(' have all been covered\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) )
1301 unless ( $idx == $curr_gi );
1302
1303 }
1304 }
1305
1306 #===================================================
1307 # current char is the unpaired base
1308 } elsif ( $struct[$idx] eq "." ) {
1309
1310 #===================================================
1311
1312 # update current window index to current graph index
1313 $idx += $curr_gi;
1314
1315 # case ').' => CLOSE_STEM
1316 if (@c_stem) {
1317 print STDERR "TEST - closed stem:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@c_stem ) ) if ($i_debug);
1318 close_stem( \@c_stem, \%stems, \@p_bp, \@c_bp, \@p_open, \@stmbrks, \@open_blks, \%bps );
1319 }
1320
1321 # case '..' extend UP, @c_up
1322 if (@c_up) {
1323 $c_up[1] = $idx;
1324
1325 # case 'x.' open new UP
1326 } else {
1327
1328 # OPEN_UP, @c_up
1329 @c_up = ( $idx, $idx );
1330
1331 # case '(.' just come to the end of an open block, push it onto stack
1332 if (@c_open) {
1333 my @newopen = @c_open;
1334 push( @open_blks, \@newopen );
1335 @c_open = ();
1336
1337 # case ').' or '.'
1338 # (have already closed stem and created stem-break and emptied p_open)
1339 } else {
1340
1341 # case 'x(x).' or '.' => CLOSE_EL
1342 if ( !@open_blks ) {
1343
1344 # case 'x().().'
1345 if (@c_el) {
1346
1347 # double check
1348 if ( $c_el[-1] == $p_bp[0] ) {
1349 push( @c_el, $p_bp[1] );
1350 my @newel = @c_el;
1351 push( @els, \@newel );
1352 @c_el = ();
1353 print STDERR "TEST - closed EL3:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newel ) ) if ($i_debug);
1354 } else {
1355 die "This case should not occur!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
1356 }
1357 }
1358 }
1359 }
1360 }
1361
1362 #===================================================
1363 # current char is the closing bracket of a base-pair
1364 } elsif ( $struct[$idx] eq ")" ) {
1365
1366 #===================================================
1367
1368 # update current window index to current graph index
1369 $idx += $curr_gi;
1370
1371 # save previous base-pair
1372 if (@c_bp) {
1373 @p_bp = @c_bp;
1374 }
1375
1376 # case: '((x))', extend stem
1377 if (@p_open) {
1378
1379 # get current base-pair
1380 @c_bp = ( pop(@p_open), $idx );
1381
1382 # add it to the base-pair hash
1383 $bps{"$c_bp[0]:$c_bp[1]"} = "";
1384
1385 # EXTEND_STEM
1386 if (@c_stem) {
1387 $c_stem[0] = $c_bp[0];
1388 $c_stem[1] = $c_bp[1];
1389 } else {
1390 die "ERROR: in case '))' there has to be an open stem object!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
1391 }
1392
1393 # case: '(x(x))' or '(x.)', '()' not allowed
1394 } else {
1395
1396 # case: '(x(x))' CLOSE_STEM close previous stem
1397 if (@c_stem) {
1398 print STDERR "TEST - closed stem:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@c_stem ) ) if ($i_debug);
1399 close_stem( \@c_stem, \%stems, \@p_bp, \@c_bp, \@p_open, \@stmbrks, \@open_blks, \%bps );
1400
1401 # case '(x(x))' OPEN_STEM
1402 if (@open_blks) {
1403 my $tmp = pop(@open_blks);
1404 @p_open = @{$tmp};
1405
1406 # get current base-pair
1407 @c_bp = ( pop(@p_open), $idx );
1408
1409 # add it to the base-pair hash
1410 # as the stem is not finished yet, cannot add stem information
1411 $bps{"$c_bp[0]:$c_bp[1]"} = "";
1412
1413 # open stem
1414 $c_stem[0] = $c_bp[0];
1415 $c_stem[1] = $c_bp[1];
1416 $c_stem[2] = $c_bp[0];
1417 $c_stem[3] = $c_bp[1];
1418 } else {
1419 die "ERROR: there have to be open blocks to match " . "current closing bracket!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
1420 }
1421
1422 # case: '(.(x))' => CLOSE_BL
1423 if ( @up && $up[-1]->[0] - 1 == $c_bp[0] && $up[-1]->[1] + 1 == $p_bp[0] ) {
1424 my @newBL = ( @c_bp, @p_bp );
1425 push( @bls, \@newBL );
1426 pop(@up);
1427 print STDERR "TEST - closed BL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newBL ) ) if ($i_debug);
1428
1429 # case: '(x(x).(x))' => CLOSE_ML
1430 } elsif ( @c_mls && $c_mls[-1]->[-1]->[-1] == $p_bp[0] ) {
1431 my @newml = @{ pop(@c_mls) };
1432 push( @{ $newml[-1] }, $p_bp[1] ); # close last BP
1433 $newml[0]->[1] = $c_bp[1]; # close 1st BP
1434 push( @mls, \@newml );
1435 print STDERR "TEST - closed ML1:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newml, 1 ) ) if ($i_debug);
1436 } else {
1437 die "ERROR: what is this case?\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
1438 }
1439
1440 # case: (x.)
1441 } else {
1442
1443 # OPEN_STEM open new stem
1444 if (@open_blks) {
1445 @p_open = @{ pop(@open_blks) };
1446
1447 # get current base-pair
1448 @c_bp = ( pop(@p_open), $idx );
1449
1450 # add it to the base-pair hash
1451 # as the stem is not finished yet, cannot add stem information
1452 $bps{"$c_bp[0]:$c_bp[1]"} = "";
1453
1454 # open stem
1455 $c_stem[0] = $c_bp[0];
1456 $c_stem[1] = $c_bp[1];
1457 $c_stem[2] = $c_bp[0];
1458 $c_stem[3] = $c_bp[1];
1459 } else {
1460 die "ERROR: there have to be open blocks to match " . "current closing bracket!$curr_shrep->[0]\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
1461 }
1462
1463 # CLOSE_C_UP
1464 if (@c_up) {
1465
1466 # case: (.) => close_HL
1467 if ( $c_up[0] - 1 == $c_bp[0] ) {
1468 my @newhl = @c_bp;
1469 push( @hls, \@newhl );
1470 @c_up = ();
1471 print STDERR "TEST - closed HL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newhl ) ) if ($i_debug);
1472
1473 # case: (x(x).)
1474 } elsif ( @p_bp && $c_up[0] - 1 == $p_bp[1] ) {
1475
1476 # case: ((x).) => CLOSE_BL
1477 if ( @stmbrks
1478 && $stmbrks[-1]->[0] == $c_bp[0]
1479 && $stmbrks[-1]->[1] == $p_bp[0]
1480 && $stmbrks[-1]->[2] == $p_bp[1] ) {
1481 my @newbl = ( @c_bp, @p_bp );
1482 push( @bls, \@newbl );
1483 pop(@stmbrks);
1484 @c_up = ();
1485 print STDERR "TEST - closed BL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newbl ) ) if ($i_debug);
1486
1487 # case: (x(x).), either CLOSE_ML or CLOSE_IL
1488 } else {
1489
1490 # case '(.(x).)' => CLOSE_IL
1491 if ( @up
1492 && ( $up[-1]->[0] - 1 == $c_bp[0] )
1493 && ( $up[-1]->[1] + 1 == $p_bp[0] ) ) {
1494 my @p_up = @{ pop(@up) };
1495 my @newil = ( @c_bp, @p_bp );
1496 push( @ils, \@newil );
1497 @c_up = ();
1498 print STDERR "TEST - closed IL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newil ) ) if ($i_debug);
1499
1500 # case (x(x)x(x).) => CLOSE_ML remember, array of tuples
1501 } elsif ( @c_mls && $c_mls[-1]->[-1]->[0] == $p_bp[0] ) {
1502 my @newml = @{ pop(@c_mls) };
1503 push( @{ $newml[-1] }, $p_bp[1] ); # close final BP
1504 $newml[0]->[1] = $c_bp[1]; # close 1st BP
1505 push( @mls, \@newml );
1506 @c_up = ();
1507 print STDERR "TEST - closed ML2:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newml, 1 ) ) if ($i_debug);
1508 } else {
1509 die "There should be no such case!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
1510 }
1511 }
1512
1513 # there should be no more unknown unpaired regions left!
1514 } else {
1515 die "There shouldn't be any unknown unpaired " . "regions left\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
1516 }
1517
1518 } else {
1519 die "ERROR: '()' is not allowed in $curr_shrep->[0]\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
1520 }
1521 }
1522 }
1523 }
1524 }
1525
1526 # EOS
1527
1528 # CLOSE_STEM
1529 print STDERR "TEST - closed stem EOS:\n" . ( test( $curr_shrep->[0], ( $win_size + $curr_gi ), $curr_gi, \@c_stem ) )
1530 if ( @c_stem && $i_debug );
1531 close_stem( \@c_stem, \%stems, \@p_bp, \@c_bp, \@p_open, \@stmbrks, \@open_blks, \%bps ) if (@c_stem);
1532 die "ERROR: there should not be an open ML!\n" . $curr_shrep . "\n"
1533 if (@c_mls);
1534
1535 # case ().() CLOSE_EL
1536 if (@c_el) {
1537 if ( $c_el[-1] == $p_bp[0] ) {
1538 push( @c_el, $p_bp[1] );
1539 my @newel = @c_el;
1540 push( @els, \@newel );
1541 @c_el = ();
1542
1543 print STDERR "TEST - closed EL4 EOS:\n" . ( test( $curr_shrep->[0], ( $win_size + $curr_gi ), $curr_gi, \@newel ) )
1544 if ($i_debug);
1545 } else {
1546 die "ERROR: the base-pairs in the EL don't match" . $curr_shrep . "\n";
1547 }
1548 }
1549
1550 # case (). CLOSE_EL
1551 if (@c_up) {
1552 unless ($cue) {
1553 if (@p_bp) {
1554 push( @c_el, @p_bp );
1555 } else {
1556 push( @c_el, ( $curr_gi, $curr_gi ) );
1557 }
1558
1559 push( @c_el, ( ( $curr_gi + $win_size - 1 ), ( $curr_gi + $win_size - 1 ) ) );
1560 my @newel = @c_el;
1561 push( @els, \@newel );
1562 @c_el = ();
1563 print STDERR "TEST - closed EL5 EOS:\n" . ( test( $curr_shrep->[0], ( $win_size + $curr_gi ), $curr_gi, \@newel ) )
1564 if ($i_debug);
1565 ## check case where cue is given, but window is completely unstructured
1566 } else {
1567 unless (@p_bp) {
1568 my @newel = ( $curr_gi, $curr_gi, ( $curr_gi + $win_size - 1 ), ( $curr_gi + $win_size - 1 ) );
1569 push( @els, \@newel );
1570 }
1571 }
1572 }
1573
1574 # # update current graph index (since all shrep indices have been read)
1575 $curr_gi += $win_size;
1576
1577 # ===================================================
1578 # build graph lines
1579 my @graph_lines = ();
1580
1581 # ===================================================
1582
1583 # graph line objects
1584 my @edg = ();
1585 my @stackgraph = ();
1586 my @abstrgraph = ();
1587
1588 #-------------------> stems,stacks
1589 # build basic graph edges, stacks, and vertices for stems in the abstract graph
1590 my @tmp = keys %stems;
1591 foreach my $sk (@tmp) {
1592 my @stem = @{ $stems{$sk} }; # stem (i,j,k,l)=> (i,j)= (0,1) and (k,l)=(2,3)
1593 die "ERROR: There are not enough elements in this stem: ", ( join( ",", @stem ) ), "\n" unless ( scalar(@stem) == 4 );
1594
1595 # infer base-pairs from stem
1596 @p_bp = ();
1597 @c_bp = ();
1598 my @stem_relation_idxs = ();
1599 my @stack_relation_idxs = ();
1600 my $stacksize = $stem[2] - $stem[0] + 1;
1601 for ( my $x = 0 ; $x < $stacksize ; $x++ ) {
1602
1603 # for (my $i = $stem[0]; $i <= $stem[2] ; $i++){
1604 # for(my $j = $stem[1]; $j >= $stem[3]; $j--){
1605 my $i = $stem[0] + $x;
1606 my $j = $stem[1] - $x;
1607 @p_bp = @c_bp;
1608 @c_bp = ( $i, $j );
1609 if ( defined $bps{"$i:$j"} ) {
1610
1611 # add indices to stem relation object
1612 push( @stem_relation_idxs, $i );
1613 push( @stem_relation_idxs, $j );
1614
1615 # add edge to graph edges
1616 push( @edg, "e $i $j s" );
1617
1618 # add stack if option given
1619 if ($stacks) {
1620 if (@p_bp) {
1621 push( @stackgraph, "v $curr_gi P" );
1622 push( @stackgraph, "e $curr_gi $p_bp[0] p" );
1623 push( @stackgraph, "e $curr_gi $p_bp[1] p" );
1624 push( @stackgraph, "e $c_bp[0] $curr_gi p" );
1625 push( @stackgraph, "e $c_bp[1] $curr_gi p" );
1626 push( @stack_relation_idxs, $curr_gi );
1627
1628 # update graph index
1629 ++$curr_gi;
1630 }
1631 }
1632 } else {
1633 die "ERROR: There is no such base-pair in the base-pair hash:" . " ($i, $j)\n";
1634 }
1635 }
1636
1637 # }
1638 # }
1639 # add stem to abstract graph
1640 # (graph indices become a bit jumbled, but saves time)
1641 push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#S" );
1642
1643 # stem object now saves the curr_gi index for that stem
1644 $stems{$sk} = $curr_gi;
1645 ++$curr_gi;
1646
1647 # add relations
1648 push( @abstrgraph, "v $curr_gi ^R" );
1649 push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" );
1650 foreach my $r (@stem_relation_idxs) {
1651 push( @abstrgraph, "e $curr_gi $r \@r" );
1652 }
1653 if ($stacks) {
1654 foreach my $r (@stack_relation_idxs) {
1655 push( @abstrgraph, "e $curr_gi $r \@r" );
1656 }
1657 }
1658 ++$curr_gi;
1659 }
1660
1661 #-------------------> add HLs
1662 foreach my $hl (@hls) {
1663
1664 # check size
1665 die "ERROR: the HL object is the incorrect size\n" unless ( @{$hl} == 2 );
1666
1667 # add vertex
1668 push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#HL" );
1669
1670 # add edge from stem to hl
1671 if ( defined( $stems{ $bps{"$hl->[0]:$hl->[1]"} } ) ) {
1672 push( @abstrgraph, "e " . $stems{ $bps{"$hl->[0]:$hl->[1]"} } . " $curr_gi n" );
1673 } else {
1674 die "ERROR: stem not defined in base-pair hash for $hl->[0]:$hl->[1]\n";
1675 }
1676 ++$curr_gi;
1677
1678 # add relations
1679 push( @abstrgraph, "v $curr_gi ^R" );
1680 push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" );
1681
1682 # add all nodes between i to j, inclusively
1683 for ( my $r = $hl->[0] ; $r <= $hl->[1] ; $r++ ) {
1684 push( @abstrgraph, "e $curr_gi $r \@r" );
1685 }
1686 ++$curr_gi;
1687 }
1688
1689 #------------------> add BLs
1690 foreach my $bl (@bls) {
1691
1692 # check size
1693 die "ERROR: the BL object is the incorrect size\n" unless ( scalar( @{$bl} ) == 4 );
1694
1695 # add vertex
1696 push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#BL" );
1697
1698 # add edges from bl to both adjacent stems
1699 if ( defined( $stems{ $bps{"$bl->[0]:$bl->[1]"} } )
1700 && defined( $stems{ $bps{"$bl->[2]:$bl->[3]"} } ) ) {
1701
1702 # add outer stem: S->BL
1703 push( @abstrgraph, "e " . $stems{ $bps{"$bl->[0]:$bl->[1]"} } . " $curr_gi n" );
1704
1705 # add inner stem: BL->S
1706 push( @abstrgraph, "e $curr_gi " . $stems{ $bps{"$bl->[2]:$bl->[3]"} } . " n" );
1707 } else {
1708 die "ERROR: stems not defined for BL\n";
1709 }
1710 ++$curr_gi;
1711
1712 # add relations
1713 push( @abstrgraph, "v $curr_gi ^R" );
1714 push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" );
1715
1716 # add all nodes between i to k and l to j, inclusively
1717 for ( my $r = $bl->[0] ; $r <= $bl->[2] ; $r++ ) {
1718 push( @abstrgraph, "e $curr_gi $r \@r" );
1719 }
1720 for ( my $r = $bl->[3] ; $r <= $bl->[1] ; $r++ ) {
1721 push( @abstrgraph, "e $curr_gi $r \@r" );
1722 }
1723 ++$curr_gi;
1724 }
1725
1726 #------------------> add ILs
1727 foreach my $il (@ils) {
1728
1729 # check size
1730 die "ERROR: the IL object is the incorrect size\n" unless ( scalar( @{$il} ) == 4 );
1731
1732 # add vertex
1733 push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#IL" );
1734
1735 # add edges from il to both adjacent stems
1736 if ( defined( $stems{ $bps{"$il->[0]:$il->[1]"} } )
1737 && defined( $stems{ $bps{"$il->[2]:$il->[3]"} } ) ) {
1738
1739 # add outer stem: S->IL
1740 push( @abstrgraph, "e " . $stems{ $bps{"$il->[0]:$il->[1]"} } . " $curr_gi n" );
1741
1742 # add inner stem: IL->S
1743 push( @abstrgraph, "e $curr_gi " . $stems{ $bps{"$il->[2]:$il->[3]"} } . " n" );
1744 } else {
1745 die "ERROR: stems not defined for IL\n";
1746 }
1747 ++$curr_gi;
1748
1749 # add relations
1750 push( @abstrgraph, "v $curr_gi ^R" );
1751 push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" );
1752
1753 # add all nodes between i to k and l to j, inclusively
1754 for ( my $r = $il->[0] ; $r <= $il->[2] ; $r++ ) {
1755 push( @abstrgraph, "e $curr_gi $r \@r" );
1756 }
1757 for ( my $r = $il->[3] ; $r <= $il->[1] ; $r++ ) {
1758 push( @abstrgraph, "e $curr_gi $r \@r" );
1759 }
1760 ++$curr_gi;
1761 }
1762
1763 #------------------> add ELs
1764 foreach my $el (@els) {
1765
1766 #check size
1767 die "ERROR: the EL object is the incorrect size\n" unless ( scalar( @{$el} ) == 4 );
1768
1769 # add vertex
1770 push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#EL" );
1771
1772 # add edges from el to both adjacent stems, if available
1773 unless ( $el->[0] == $el->[1] ) {
1774 if ( defined( $stems{ $bps{"$el->[0]:$el->[1]"} } ) ) {
1775 push( @abstrgraph, "e " . $stems{ $bps{"$el->[0]:$el->[1]"} } . " $curr_gi n" );
1776 } else {
1777 die "ERROR: stems not defined for left BP of EL\n";
1778 }
1779 }
1780 unless ( $el->[2] == $el->[3] ) {
1781 if ( defined( $stems{ $bps{"$el->[2]:$el->[3]"} } ) ) {
1782 push( @abstrgraph, "e $curr_gi " . $stems{ $bps{"$el->[2]:$el->[3]"} } . " n" );
1783 } else {
1784 die "ERROR: stems not defined for right BP of EL\n";
1785 }
1786 }
1787 ++$curr_gi;
1788
1789 # add relations
1790 push( @abstrgraph, "v $curr_gi ^R" );
1791 push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" );
1792
1793 # @r relation edges
1794 for ( my $r = $el->[1] ; $r <= $el->[2] ; $r++ ) {
1795 push( @abstrgraph, "e $curr_gi $r \@r" );
1796 }
1797
1798 # if BP, not EOS
1799 unless ( $el->[0] == $el->[1] ) {
1800 push( @abstrgraph, "e $curr_gi $el->[0] \@r" );
1801 }
1802
1803 # if BP, not EOS
1804 unless ( $el->[2] == $el->[3] ) {
1805 push( @abstrgraph, "e $curr_gi $el->[3] \@r" );
1806 }
1807 ++$curr_gi;
1808 }
1809
1810 #------------------> add MLs
1811 foreach my $ml (@mls) {
1812
1813 #check size
1814 die "ERROR: the ML object is not big enough!\n" unless ( scalar( @{$ml} ) >= 3 );
1815
1816 # add vertex
1817 push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#ML" );
1818
1819 # add edge from outer stem to first BP
1820 my $closing_bp = shift( @{$ml} );
1821 if ( defined( $stems{ $bps{"$closing_bp->[0]:$closing_bp->[1]"} } ) ) {
1822 push( @abstrgraph, "e " . $stems{ $bps{"$closing_bp->[0]:$closing_bp->[1]"} } . " $curr_gi n" );
1823 } else {
1824 die "ERROR: ML is not defined for BP $closing_bp->[0]:$closing_bp->[1]\n";
1825 }
1826
1827 # add remaining stems
1828 foreach my $bp ( @{$ml} ) {
1829 if ( defined( $stems{ $bps{"$bp->[0]:$bp->[1]"} } ) ) {
1830 push( @abstrgraph, "e $curr_gi " . $stems{ $bps{"$bp->[0]:$bp->[1]"} } . " n" );
1831 } else {
1832 die "ERROR: ML is not defined for BP $bp->[0]:$bp->[1]\n";
1833 }
1834 }
1835 ++$curr_gi;
1836
1837 # add relations
1838 push( @abstrgraph, "v $curr_gi ^R" );
1839 push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" );
1840
1841 # add @r relation edges between i and k, where (i,j) is the first BP and (k,l)
1842 # the second BP
1843 my $pbp_aref = $closing_bp;
1844 my $cbp_aref = shift( @{$ml} );
1845 for ( my $r = $pbp_aref->[0] ; $r <= $cbp_aref->[0] ; $r++ ) {
1846 push( @abstrgraph, "e $curr_gi $r \@r" );
1847 }
1848
1849 # rest of @r relation edges
1850 foreach my $bp ( @{$ml} ) {
1851 $pbp_aref = $cbp_aref;
1852 $cbp_aref = $bp;
1853 for ( my $r = $pbp_aref->[1] ; $r <= $cbp_aref->[0] ; $r++ ) {
1854 push( @abstrgraph, "e $curr_gi $r \@r" );
1855 }
1856 }
1857
1858 # add final stretch between last BP (m,n) and first closing base-pair (i,j)
1859 for ( my $r = $cbp_aref->[1] ; $r <= $closing_bp->[1] ; $r++ ) {
1860 push( @abstrgraph, "e $curr_gi $r \@r" );
1861 }
1862 ++$curr_gi;
1863 }
1864
1865 @graph_lines = ( @edg, @stackgraph, @abstrgraph );
1866 return ( \@graph_lines, $curr_gi );
1867 }
1868
1869 # just to test the current shrep dot-bracket structure parsing
1870 sub test {
1871 my ( $shrep, $idx, $gi, $mark_aref, $isML ) = @_;
1872
1873 my $result = "";
1874
1875 $result .= "$shrep\n";
1876 my @a = split( "", $shrep );
1877 for ( my $i = 0 ; $i <= @a ; $i++ ) {
1878 my $mark = 0;
1879 foreach my $j ( @{$mark_aref} ) {
1880 if ($isML) {
1881 $mark = 1 if ( $i == $j->[0] - $gi );
1882 $mark = 1 if ( scalar( @{$j} == 2 && $i == $j->[1] - $gi ) );
1883 } else {
1884 $mark = 1 if ( $i == $j - $gi );
1885 }
1886
1887 }
1888 if ( $i == $idx - $gi ) {
1889 $result .= "^";
1890 } elsif ($mark) {
1891 $result .= "'";
1892 } else {
1893 $result .= " ";
1894 }
1895 }
1896 $result .= "\n";
1897
1898 return $result;
1899 }
1900
1901 ###########################################################################
1902 # Extends or closes a multi-loop. This means a ML already exists.
1903 # We can have the case ((x)( or ((x).(
1904 #
1905 # Input:
1906 # curr_shrep The current shrep object, with shrep string at pos 0
1907 # c_ml_aref The current open MLs array reference
1908 # p_bp_aref The previous base-pair array reference
1909 # stmbrks_aref The stem-breaks array reference
1910 # open_blks_aref The open bracket blocks array reference
1911 # idx The current index
1912 # #curr_gi The current graph index
1913 # Output:
1914 # None, it should just modify the ML accordingly
1915 ############################################################################
1916 sub extend_or_open_nested_ML {
1917 my ( $curr_shrep, $c_mls_aref, $p_bp_aref, $up_aref, $stmbrks_aref, $open_blks_aref, $idx, $curr_gi ) = @_;
1918
1919 # case '(x(x)x(x)x(' extend current ML
1920 if ( $c_mls_aref->[-1]->[-1]->[-1] == $p_bp_aref->[0] ) {
1921 push( @{ $c_mls_aref->[-1]->[-1] }, $p_bp_aref->[1] ); #close previous BP
1922 my @a = ($idx);
1923 push( @{ $c_mls_aref->[-1] }, \@a ); # add new open base-pair
1924 print STDERR "TEST - extended ML:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, $c_mls_aref->[-1], 1 ) ) if ($i_debug);
1925
1926 # case '(x(x)x((x)(' nested ML, => OPEN_ML with stem-break
1927 } elsif ( @$stmbrks_aref
1928 && $stmbrks_aref->[-1]->[1] == $p_bp_aref->[0]
1929 && $stmbrks_aref->[-1]->[2] == $p_bp_aref->[1] ) {
1930
1931 my @c_stmbrk = @{ pop( @{$stmbrks_aref} ) };
1932 my @a = ( $c_stmbrk[0], -1 );
1933 my @ml = ();
1934 push( @ml, \@a ); # 1st base-pair is in-waiting
1935 my @a2 = ( $c_stmbrk[1], $c_stmbrk[2] );
1936 push( @ml, \@a2 ); #2nd BP
1937 my @a3 = ($idx);
1938 push( @ml, \@a3 ); #first of current BP
1939 push( @{$c_mls_aref}, \@ml );
1940 print STDERR "TEST - opened nested ML with SB:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) ) if ($i_debug);
1941
1942 # case '(x(x)x(.(x)(' nested ML, => OPEN_ML with UP
1943 } elsif ( @$up_aref
1944 && $up_aref->[-1]->[0] - 1 == $open_blks_aref->[-1]->[-1]
1945 && $up_aref->[-1]->[1] + 1 == $p_bp_aref->[0] ) {
1946 my @tmp_up = @{ pop( @{$up_aref} ) };
1947 my @a = ( $open_blks_aref->[-1]->[-1], -1 );
1948 my @ml = ();
1949 push( @ml, \@a ); # 1st BP awaits closing
1950 my @newbp = @{$p_bp_aref};
1951 push( @ml, \@newbp ); #2nd BP
1952 my @a2 = ($idx);
1953 push( @ml, \@a2 ); # current open BP, awaits closing
1954 push( @{$c_mls_aref}, \@ml );
1955 print STDERR "TEST - opened nested ML with prev UP\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) ) if ($i_debug);
1956 } else {
1957 die "ERROR: in the case extend or open nested ML, " . "but the base-pairs don't fit for either!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ) if ($i_debug);
1958 }
1959 }
1960
1961 ###########################################################################
1962 # Closes a stem-object for the abstract graph. If necessary, it also
1963 # creates a stem-break because the current open block is still open,
1964 # which can only mean that there is a stem-break between two consecutive
1965 # opening brackets.
1966 #
1967 # Input:
1968 # c_stem_aref : current stem array reference
1969 # stems_href : all stems hash reference
1970 # p_bp_aref : previous base-pair array reference
1971 # c_cp_aref : current base-pair array reference
1972 # p_open_aref : previously opened open-bracket block
1973 # p_stmbrk_aref : the previous stem-break
1974 # open_blks_aref : all old open blocks array reference
1975 # bps_href : all base-pairs hash reference
1976 #
1977 # Output:
1978 # None, it should just modify the input variables accordingly
1979 ############################################################################
1980 sub close_stem {
1981 my ( $c_stem_aref, $stems_href, $p_bp_aref, $c_bp_aref, $p_open_aref, $stmbrks_aref, $open_blks_aref, $bps_href ) = @_;
1982
1983 my @newstem = @$c_stem_aref;
1984 my $stemkey = "$newstem[0]:$newstem[1],$newstem[2]:$newstem[3]";
1985 $stems_href->{$stemkey} = \@newstem;
1986
1987 # reset and clean running variables
1988 @$c_stem_aref = ();
1989 @$p_bp_aref = @$c_bp_aref;
1990 @$c_bp_aref = ();
1991
1992 # add stem to base-pair hash
1993 $bps_href->{"$newstem[0]:$newstem[1]"} = $stemkey;
1994 $bps_href->{"$newstem[2]:$newstem[3]"} = $stemkey;
1995
1996 # there are still bases open in the previous open block, '[[]x',
1997 # where [] is a closed stem and [ is an open block,
1998 # must create STEM-BREAK
1999 # this cannot be the case ((x)), because there p_open is empty
2000 if (@$p_open_aref) {
2001
2002 # check: the next open base is adjacent to the previous BP
2003 if ( $p_open_aref->[-1] == $p_bp_aref->[0] - 1 ) {
2004 my @stmbrk = ( $p_open_aref->[-1], @$p_bp_aref );
2005 push( @$stmbrks_aref, \@stmbrk );
2006
2007 # push the previous open block back onto stack
2008 my @newopen = @$p_open_aref;
2009 push( @$open_blks_aref, \@newopen );
2010 @$p_open_aref = ();
2011 } else {
2012
2013 # error, this should not be possible
2014 die("ERROR in close_stem(), [.[]x should not occur here\n");
2015 }
2016 }
2017 }
2018
2019 ###########################################################################
2020 # Here the information about the structure is added to the graph. In a
2021 # first step each base-pair is added by adding edges between the vertices
2022 # of the respective nucleotides. Then, if two base-pairs are stacked,
2023 # we add an extra vertex and connect the four nucleotide vertices that
2024 # are involved with extra edges (symbols P used for vertex and p for edges).
2025 # While adding new vertices we keep track of the current graph index.
2026 #
2027 # Input:
2028 # curr_shrep : the current shrep as dot-bracket structure
2029 # curr_gi : the current graph index
2030 # win_size : the window size
2031 # stacks : the input parameter -stacks (whether to add stacking information)
2032 #
2033 # Output:
2034 # The structure graph information line-by-line as an array reference and
2035 # the current graph index.
2036 ############################################################################
2037 sub getStructGraph {
2038 my ( $curr_shrep, $curr_gi, $win_size, $stacks ) = @_;
2039
2040 my @struct = split( "", $curr_shrep->[0] );
2041
2042 my @edg = ();
2043 my @starts = ();
2044 my @pairs = ();
2045
2046 foreach my $idx ( 0 .. @struct - 1 ) {
2047
2048 push( @starts, $idx ) if ( $struct[$idx] eq "(" );
2049
2050 if ( $struct[$idx] eq ")" ) {
2051 my $start = $curr_gi + pop(@starts);
2052 my $end = $curr_gi + $idx;
2053 push( @edg, "e " . $start . " " . $end . " s " );
2054 my @pair = ( $start, $end );
2055 push( @pairs, \@pair );
2056 }
2057 }
2058
2059 # update current graph index
2060 $curr_gi = $curr_gi + $win_size;
2061
2062 # initiate structure lines array
2063 my @stacking_info = ();
2064
2065 # add stacking information to graph unless input option tells us not to
2066 if ($stacks) {
2067
2068 my $stacked_pairs = 0;
2069
2070 # get stacked base-pairs (they are ordered according to
2071 # position of closing nucleotide)
2072 for ( my $i = 1 ; $i < @pairs ; $i++ ) {
2073
2074 # add stacked base-pairs (vertices+edges)
2075 my $curr_bp_aref = $pairs[$i];
2076 my $prev_bp_aref = $pairs[ $i - 1 ];
2077
2078 # if the current base-pair is stacked on the previous base-pair,
2079 # when curr_start = prev_start - 1 AND curr_end = prev_end + 1
2080 if ( $curr_bp_aref->[0] == $prev_bp_aref->[0] - 1
2081 && $curr_bp_aref->[1] == $prev_bp_aref->[1] + 1 ) {
2082
2083 # add stacking vertex P
2084 push( @stacking_info, "v $curr_gi P" );
2085
2086 # add four edges from involved nucleotids
2087 push( @stacking_info, "e $curr_gi $prev_bp_aref->[0] p" );
2088 push( @stacking_info, "e $curr_gi $prev_bp_aref->[1] p" );
2089 push( @stacking_info, "e $curr_bp_aref->[0] $curr_gi p" );
2090 push( @stacking_info, "e $curr_bp_aref->[1] $curr_gi p" );
2091 ++$curr_gi; # add one to the index, ready for next vertex
2092
2093 }
2094 }
2095 }
2096
2097 my @str_graphlines = ( @edg, @stacking_info );
2098 return ( \@str_graphlines, $curr_gi );
2099 }
2100
2101 ###########################################################################
2102 # This method gives us the overall graph header for this sequence. This
2103 # graph includes all window calculations and all shreps for all windows.
2104 # Input:
2105 # seq_id : the sequence ID
2106 # seq_head : the header information for the sequence
2107 # wins_aref : the windows sizes for folding
2108 # h_shift : input parameter -shift
2109 # h_e : input parameter -e
2110 # h_c : input parameter -c
2111 # h_t : input parameter -t
2112 # h_u : input parameter -u
2113 # h_r : input parameter -r
2114 # h_M : input parameter -M
2115 # h_crop_unpaired_ends : input parameter -cue
2116 #
2117 # Output:
2118 # The header line as a string
2119 ############################################################################
2120 sub getGraphHeader {
2121 my ( $seq_id, $seq_head, $wins_aref, $h_shift, $h_e, $h_c, $h_t, $h_u, $h_r, $h_M, $h_crop_unpaired_ends, $h_i, $h_sample_len, $h_q, $h_Tp, $h_seqlen, $h_noStr ) = @_;
2122
2123 my $ret;
2124
2125 $ret = "t # SEQID $seq_id ";
2126 $ret .= "$seq_head " if ( defined $seq_head );
2127 $ret .= "SEQLEN $h_seqlen ";
2128
2129 return $ret . "\n" if ($h_noStr);
2130
2131 $ret .= "MAXWINSHREPS $h_M ";
2132 $ret .= "CUE 1 " if ($h_crop_unpaired_ends);
2133 $ret .= "RNASHAPES -w " . ( join( ",", @{$wins_aref} ) ) . " ";
2134 $ret .= "-SHIFT%w $h_shift " if ( not $globalFolding and $h_shift );
2135 $ret .= "-SHIFT 1 " if ( not $globalFolding and not $h_shift );
2136 $ret .= "-e $h_e " if ( defined $h_e );
2137 $ret .= "-c $h_c " if ( defined $h_c );
2138 $ret .= "-t $h_t " if ( defined $h_t );
2139 $ret .= "-u 1 " if ($h_u);
2140 $ret .= "-r 1 " if ($h_r);
2141 $ret .= "-i $h_i SAMPLE_LEN $h_sample_len" if ( not $h_q and $h_i );
2142 $ret .= "-q 1 " if ($h_q);
2143 $ret .= "-T $h_Tp" if ( $h_q and defined $h_Tp );
2144 $ret .= "\n";
2145
2146 return $ret;
2147 }
2148
2149 ###########################################################################
2150 # This method gives us the window header for the subgraph that includes
2151 # all shreps for the current window.
2152 # Input:
2153 # graphHead : the header line for the entire sequence graph
2154 # win_size : the size of the current window
2155 # win_start : the starting position of the current window
2156 # win_end : the end position of the current window
2157 # win_center : the centre position of the current window
2158 # win_global_fold : (boolean) whether the actual number of nucleotides
2159 # is smaller than the given window size (end of sequence?)
2160 #
2161 # Output:
2162 # The header line as a string
2163 ############################################################################
2164 sub getWindowHeader {
2165 my ( $graphHead, $win_size, $win_start, $win_end, $win_center, $win_global_fold, $win_sample, $used_t ) = @_;
2166
2167 chomp $graphHead;
2168
2169 $graphHead =~ s/t # //;
2170
2171 my $ret = "w # $graphHead ";
2172 $ret .= "SHAPE_TYPE $used_t ";
2173 $ret .= "GLOBALFOLD $win_global_fold ";
2174 $ret .= "WSIZE $win_size ";
2175 $ret .= "WSTART $win_start ";
2176 $ret .= "WEND $win_end ";
2177 $ret .= "WCENT $win_center ";
2178 $ret .= "WSAMPLE $win_sample ";
2179 $ret .= "\n";
2180
2181 return $ret;
2182 }
2183
2184 ###########################################################################
2185 # This method gives us the shape header that includes the single shrep
2186 # connected components.
2187 # Input:
2188 # winHead : the header for the current window
2189 # shrep : information about the current shrep
2190 #
2191 # Output:
2192 # The header line as a string
2193 ############################################################################
2194 sub getShapeHeader {
2195 my ( $winHead, $shrep ) = @_;
2196
2197 chomp $winHead;
2198
2199 $winHead =~ s/w # //;
2200 $winHead =~ s/t # //;
2201
2202 my $ret = "s # $winHead ";
2203
2204 my @info = @{$shrep};
2205 $ret .= join( " ", @info[ 1 .. $#info ] );
2206 $ret .= "\n";
2207
2208 return $ret;
2209 }
2210
2211 ###########################################################################
2212 # This method gives us the sequence header that includes the unstructured
2213 # sequence connected components.
2214 # Input:
2215 # winHead : the header for the current window
2216 # shrep : information about the current shrep
2217 #
2218 # Output:
2219 # The header line as a string
2220 ############################################################################
2221 sub getSeqHeader {
2222 my ( $winHead, $shrep ) = @_;
2223
2224 chomp $winHead;
2225
2226 $winHead =~ s/w # //;
2227 $winHead =~ s/t # //;
2228
2229 my $ret = "u # $winHead ";
2230
2231 my @info = @{$shrep};
2232 $ret .= join( " ", @info[ 1 .. $#info ] );
2233 $ret .= "\n";
2234
2235 return $ret;
2236 }
2237
2238
2239 ##################################################################################
2240 # This method parses a fasta file and is useful if the header ID is not-unique!!
2241 # It returns the header lines in an array and the sequence lines in the same
2242 # order. It is then up to the user to extract the parts from the header that is
2243 # necessary for the script.
2244 # Furthermore, the method deals with multiple lines, and returns a single sequence
2245 # without line breaks.
2246 # Input:
2247 # file The name of the fasta file
2248 # Output:
2249 # (1) An array reference for each header line
2250 # (2) An array reference for each sequence in same order as the headers
2251 ##################################################################################
2252 sub read_fasta_with_nonunique_headers_meta {
2253 my ($file) = @_;
2254 my $FUNCTION = "read_fasta_file in Sequences.pm";
2255
2256 my $header = "";
2257 my $seqstring = "";
2258 my @headers = ();
2259 my @sequences = ();
2260 my @meta = ();
2261 my %seq_meta = ();
2262 open( IN_HANDLE, "<$file" ) || die "ERROR in $FUNCTION:\n" . "Couldn't open the following file in package Tool," . " sub read_fasta_file: $file\n";
2263
2264 while ( my $line = <IN_HANDLE> ) {
2265 chomp($line);
2266
2267 # header (can contain one space after > symbol)
2268 if ( $line =~ /^\>(.*)/ ) {
2269 if ($header) {
2270 $seqstring =~ s/\s*//g; ## do not allow spaces in sequence
2271 push( @headers, $header );
2272 push( @sequences, $seqstring );
2273 push( @meta, +{%seq_meta} ); ## anonymous hash reference
2274 #print keys %seq_meta;
2275 $seqstring = "";
2276 undef(%seq_meta);
2277 }
2278 $header = $1;
2279
2280 } elsif ( $line =~ /(.+)\s+(#\S+)$/ && $header ) {
2281
2282 if ( exists $seq_meta{$2} ) {
2283 $seq_meta{$2} .= $1;
2284 } else {
2285 $seq_meta{$2} = $1;
2286 }
2287
2288 } elsif ($header) {
2289 $seqstring .= $line
2290 }
2291 }
2292
2293 if ($header) {
2294 $seqstring =~ s/\s*//g; ## do not allow spaces in sequence
2295 push( @headers, $header );
2296 push( @sequences, $seqstring );
2297 push( @meta, +{%seq_meta} );
2298 $seqstring = "";
2299 %seq_meta = ();
2300 }
2301 return ( \@headers, \@sequences, \@meta );
2302 }
2303
2304
2305 1;
2306
2307 #############################################################################
2308 # Programming description
2309 #############################################################################
2310 # Substructure graphs for machine learning with Fabrizio
2311 # -------------------------------------------------------
2312 #
2313 # (1) Parameters (RNAshapes parameter):
2314 # - Window sizes [] (-w)
2315 # - window shift size (-W)
2316 # - calculate structure probabilities (-r)
2317 # - energy range kcal/mol (-e) OR energy relative percentage (%) to MFE (-c)
2318 # - shape type 1 to 5 (-t)
2319 # - ignore unstable substructures (-u)
2320 # - max shreps
2321 #
2322 #
2323 # (2) For each sequence, generate one graph/file that consists of all windows. The general format for one graph is as follows:
2324 #
2325 # t # seq_id parameters
2326 # v graph_index nt_type window_size window_centre abs_seq_pos
2327 # ...
2328 # e m m+1 > 1 (backbone)
2329 # ...
2330 # e base_i_graph_index base_j_graph_index s shrep_e shrep_p ...
2331 #
2332 # For each window (subgraph) we create a subgraph (of the subgraph) for each substructure.
2333 # We have a running index (gi) for each subgraph. All vertex and edge indices of the subgraph add
2334 # the running graph index to the actual window position. For example
2335 #
2336 #
2337 # Sequence: AAACC CUUUG GG
2338 # 01234 56789 01
2339 #
2340 # Window=10 substructure1 = (((...))). centre 5.5
2341 #
2342 # v 0 A 10 5.5 0
2343 # v 1 A 10 5.5 1
2344 # v 2 A 10 5.5 2
2345 # v 3 C 10 5.5 3
2346 # v 4 C 10 5.5 4
2347 # v 5 C 10 5.5 5
2348 # v 6 U 10 5.5 6
2349 # v 7 U 10 5.5 7
2350 # v 8 U 10 5.5 8
2351 # v 9 G 10 5.5 9
2352 # e 0 1 > 1
2353 # e 1 2 > 1
2354 # e 2 3 > 1
2355 # e 3 4 > 1
2356 # e 4 5 > 1
2357 # e 5 6 > 1
2358 # e 6 7 > 1
2359 # e 7 8 > 1
2360 # e 8 9 > 1
2361 # e 0 8 s -15.0 0.1223
2362 # e 1 7 s -15.0 0.1223
2363 # e 2 6 s -15.0 0.1223
2364 #
2365 # gi = 9+1 = 10
2366 #
2367 # Window = 10 substructure2 = .(((...))) centre 6.5
2368 # v 10 A 10 6.5 2
2369 # v 11 C 10 6.5 3
2370 # v 12 C 10 6.5 4
2371 # v 13 C 10 6.5 5
2372 # v 14 U 10 6.5 6
2373 # v 15 U 10 6.5 7
2374 # v 16 U 10 6.5 8
2375 # v 17 G 10 6.5 9
2376 # v 18 G 10 6.5 10
2377 # v 19 G 10 6.5 11
2378 # e 10 11 > 1
2379 # e 11 12 > 1
2380 # e 12 13 > 1
2381 # e 13 14 > 1
2382 # e 14 15 > 1
2383 # e 15 16 > 1
2384 # e 16 17 > 1
2385 # e 17 18 > 1
2386 # e 18 19 > 1
2387 # e 11 19 s -17.0 0.156
2388 # e 12 18 s -17.0 0.156
2389 # e 13 17 s -17.0 0.156
2390 #
2391 # gi = 19+1 = 20
2392 #
2393 #
2394 #
2395 # Write one perl script to create graphs for a set of sequences, fasta2shrep_gspan.pl.
2396 #
2397 # INPUT:
2398 # -f fasta file with all sequences to compute
2399 # parameters as above
2400 #
2401 # OUTPUT:
2402 # one file per sequence that contains graph called seq_id.gspan
2403 #
2404 # (1) for each window size call RNAshapes and write to a tmp file
2405 # (2) parse result of RNAshapes (catch RNAshapes error - sequence too long?) check for max shreps.
2406 # (3) convert RNAshapes result to subgraph -> write to file (readpipe) look at efficiency and errors
2407 # (4) repeat (1) to (3) for each sequence
2408
2409 # ABSTRACT STRUCTURE GRAPH
2410 # To each shrep graph (s #) add an abstract structure graph with special
2411 # abstract relations. For example, to add the abstract structure graph to
2412 # the previous shrep structure with gi numbers from 10 to 19, we first
2413 # identify the abstract shape, i.e. EL-S-HL and then create nodes (labelled
2414 # with a given name-space followed the actual node name and separated by a
2415 # hash) and edges for this graph as follows:
2416 # NOTE: at the moment we add the adjacent base-pairs to the loop definitions
2417 # but these can be removed if necessary in the future
2418 # v 20 abstruct#EL
2419 # v 21 abstruct#S
2420 # v 22 abstruct#HL
2421 # e 20 21 n
2422 # e 21 22 n
2423 # v 23 ^R
2424 # e 20 23 ^r
2425 # e 23 10 @r
2426 # e 23 11 @r
2427 # e 23 19 @r
2428 # v 24 ^R
2429 # e 21 24 ^r
2430 # e 24 11 @r
2431 # e 24 12 @r
2432 # e 24 13 @r
2433 # e 24 17 @r
2434 # e 24 18 @r
2435 # e 24 19 @r
2436 # v 25 ^R
2437 # e 22 25 ^r
2438 # e 25 13 @r
2439 # e 25 14 @r
2440 # e 25 15 @r
2441 # e 25 16 @r
2442 # e 25 17 @r
2443
2444 # gi = 25+1 = 26
2445
2446 # ANOTATION FILE
2447 # This is a file that labels a sequence region with a given annotation. In one
2448 # file we can have annotations within different name-spaces, for example target
2449 # sites predicted with different tools.
2450 # File format is a tab-delimited file with the following columns:
2451 # SEQID (same ID as in the fasta file header)
2452 # i (left positon of region)
2453 # j (right position of region, if one position i=j)
2454 # NAMESPACE#LABEL (give each annotation type one name-space and choose a label depending on the task)
2455
2456 # E.g we have 2 different miRNAs and we predict the target-sites with (1) IntaRNA and (2) TargetSearch,
2457 # then this could be IntaRNA#miR1, IntaRNA#miR2, TargetSearch#miR1, and TargetSearch#miR2.
2458 # All labels can be used more than once for one or more sequences.
2459 # all labels with the same namespace and the same sequence ID are grouped into
2460 # one abstract graph, according to the order of i.
2461 # Example
2462 # SEQID i j NAMESPACE#LABEL
2463 # s1 10 20 IntaRNA#miR1
2464 # s1 54 60 IntaRNA#miR2
2465 # s1 15 25 TargetSearch#miR1
2466 # s1 54 60 TargetSearch#miR2
2467 # s2 ...
2468
2469 # We create connected component per sequence ID per name-space,
2470 # so that all labels with
2471 # the same namespace are grouped together (for one sequence) as follows:
2472 # v 26 IntaRNA#miR1
2473 # v 27 IntaRNA#miR2
2474 # e 26 27 n
2475 # then create R nodes and ^r and @r relations as before (from i to j)
2476
2477 # If a sequence graph option is given, then these graphs (u #) have to
2478 # be labelled with this annotation
2479