0
|
1 #!/usr/bin/perl
|
|
2 #use feature ':5.10';
|
|
3 use File::Basename;
|
|
4 use lib dirname($0); # search skript directory for modules
|
|
5 use strict 'vars';
|
|
6 use warnings;
|
|
7 use Getopt::Long;
|
|
8 use Pod::Usage;
|
|
9 #use List::Util qw/ min max /;
|
|
10
|
|
11 #use StructureLibrary::RNAtoolsConfig;
|
|
12 #use StructureLibrary::Sequence;
|
|
13
|
|
14 use Cwd qw(getcwd abs_path);
|
|
15 use File::Temp qw(tempdir);
|
|
16 use File::Copy;
|
|
17 use POSIX qw(ceil);
|
|
18
|
|
19 use FindBin;
|
|
20 use lib "$FindBin::Bin";
|
|
21 #use GraphClust;
|
|
22
|
|
23 #use vars qw(%CONFIG);
|
|
24 #*CONFIG = \%GraphClust::CONFIG;
|
|
25
|
|
26 =head1 NAME
|
|
27
|
|
28
|
|
29 fasta2shrep_gspan.pl -fasta mysequences.fasta -wins "50,100,150,200" -shift 5 -M 8
|
|
30
|
|
31 =head1 SYNOPSIS
|
|
32
|
|
33 Options:
|
|
34
|
|
35 HELP
|
|
36 -help brief help message
|
|
37 -man full documentation
|
|
38
|
|
39 COMPULSORY
|
|
40 -fasta <STRING> e.g. "sequence.fasta"
|
|
41 All sequences in fasta format.
|
|
42
|
|
43 OPTIONS
|
|
44 -wins [INTEGER] e.g. "50,100,200"
|
|
45 A list of window sizes to use.
|
|
46 If none are given (empty string ''), then the entire sequence is
|
|
47 taken with no windows. Each window > 1 required!
|
|
48 -shift <INTEGER> e.g. 20
|
|
49 The shift of the window, relative to the window size given in
|
|
50 percent. So you give which percent of the window size shall be
|
|
51 used for the shift. Of course the shift is rounded down to the
|
|
52 nearest whole number.
|
|
53 Example 20 % of a window 150 would result in a step size of 30 nt.
|
|
54 It is a relative parameter, as you can give different window sizes.
|
|
55 If you do not give this parameter there is a default shift of 1 nt.
|
|
56 -cue Crop unpaired ends.
|
|
57 If you give this flag, then the unpaired ends of each
|
|
58 single structure are ignored. E.g. the structure
|
|
59 ...(((...))).. becomes just (((...)))
|
|
60 -stack Adds stacking information to graphs. This adds an additional
|
|
61 vertex (type P) for each pair of stacked base-pairs and four edges
|
|
62 (type p) from each of the involved bases to the new vertex.
|
|
63 -e <FLOAT> e.g. 5.0
|
|
64 Energy range in kcal/mol (RNAshapes)
|
|
65 Use only one of -e and -c!
|
|
66 -c <INTEGER> e.g. 10
|
|
67 Relative energy range, i.e. percentage (%) of MFE energy (RNAshapes)
|
|
68 Use only one of -e and -c!
|
|
69 -t <INTEGER> [1-5] e.g. 3 OR "3=0,4=100,5=200"
|
|
70 The shape type (RNAshapes). Default is 3.
|
|
71 With the list format, the shape level can be changed for different window length
|
|
72 "4=100" means that shape level 4 is used from length 100nt (window length)
|
|
73 The first given length has to be 0! Not continuous given levels are allowed!
|
|
74 -M <INTEGER> e.g. 10
|
|
75 Max number of shreps that should be taken per window.
|
|
76 -u Ignore unstable structures (RNAshapes).
|
|
77 This option filters out closed structures with positive free energy.
|
|
78 -r Calculate structure probabilities for shreps (RNAshapes)
|
|
79 -i <INT> e.g. 10
|
|
80 Turn on structure sampling and gives number of sampling iterations.
|
|
81 Default no sampling (i=0)
|
|
82 -sample-len <INT> e.g. 100
|
|
83 Only in sampling mode: Sampling is only used for seqs/windows >= given length,
|
|
84 Default: sample all lengths (0), if -i > 0
|
|
85 -q Turn on shape probabilities for RNAshapes, no sampling mode allowed
|
|
86 -Tp <FLOAT> e.g 0.001
|
|
87 Filter cutoff for shape probabilities, applied before -M filter!
|
|
88 -seq-graph-win add for each window a graph which contains no structure
|
|
89 -seq-graph-t add for each 't #' a graph which contains no structure
|
|
90 -seq-graph-alph change the alphabet of unstructured graphs
|
|
91 -annotate <STRING> annotation.tab
|
|
92 A file with annotations to be added as abstract graphs
|
|
93 on the sequence leven (if given) and on the structure
|
|
94 (SHREP) level. The format is has the following TAB-delimited
|
|
95 columns: SEQID, START, END, NAMESPACE#LABEL.
|
|
96 Labels with the same name-space and SEQID form connected
|
|
97 components, which is a sequence of label vertices ordered
|
|
98 by the START position in the sequence.
|
|
99 -abstr Add abstract structure graphs to the single shrep graph
|
|
100 instances.
|
|
101 -nostr Calculate no structures, only add sequence information,
|
|
102 if this is given, then -seq-graph-win AND/OR -seq-graph-t
|
|
103 are required.
|
|
104 -match-shape <SHAPE>
|
|
105 all seqs/windows will be constraint folded into that shape via
|
|
106 RNAshapes (if structure is given in another way this struct will be kept),
|
|
107 if this shape is not possible within given energy range, produce a
|
|
108 specific t graph with only one vertex 'X'. By this the instance
|
|
109 becomes very unsimilar to all other graphs (for knn)
|
|
110 -vp enable graph computation with viewpoints:
|
|
111 svmsgdnspdk will center on those nucleotides that are given
|
|
112 via capital letters and ignore those given as lowercase letters
|
|
113 -tmp <STRING> e.g. "/scratch/1/sita/tmp"
|
|
114 A directory for writing temporary files
|
|
115 -o <STRING> e.g. "ProjectX/MySequences/GSPAN/"
|
|
116 Output directory for gspan files containing graphs.
|
|
117 -group <INTEGER> e.g. 5
|
|
118 Combine/group that number of input seqs into 1 gspan file
|
|
119 output name is then '<INT>.group.gspan.bz2'
|
|
120 -sge Use SGE cluster for each sequence separately
|
|
121 -sge-logdir stdout directory for SGE call
|
|
122 -sge-errdir sdterr directory for SGE call
|
|
123 -stdout send graphs to stdout instead of writing to files
|
|
124 -ignore-header don't write fasta id part after first space to gspan
|
|
125 -debug additional debug output
|
|
126
|
|
127
|
|
128 DEFAULT VALUES
|
|
129 -wins ""
|
|
130 -shift 1 nt
|
|
131 -c 10
|
|
132 -t 3
|
|
133 -M 0 # selects all shreps
|
|
134 -tmp "/var/tmp/fasta2shrep"
|
|
135 -o "CURRENT_DIR/GSPAN/"
|
|
136
|
|
137 SGE mode
|
|
138 -sge-logdir "CURRENT_DIR/GSPAN/SGE_log"
|
|
139 -sge-errdir "CURRENT_DIR/GSPAN/SGE_log"
|
|
140 -task-id <NUM>
|
|
141
|
|
142 =head1 DESCRIPTION
|
|
143
|
|
144 =cut
|
|
145
|
|
146 ###############################################################################
|
|
147 # end handler for temporary directory
|
|
148 # adds an error handler that deletes the directory in case of error
|
|
149 # SIGUSR{1/2} are sent by the sge prior to the uncatchable SIGKILL if the
|
|
150 # option -notify was set
|
|
151 ###############################################################################
|
|
152 $SIG{'INT'} = 'end_handler';
|
|
153 $SIG{'TERM'} = 'end_handler';
|
|
154 $SIG{'ABRT'} = 'end_handler';
|
|
155 $SIG{'USR1'} = 'end_handler';
|
|
156 $SIG{'USR2'} = 'end_handler';
|
|
157
|
|
158 sub end_handler {
|
|
159 print STDERR "signal ", $_[0], " caught, cleaning up temporary files\n";
|
|
160
|
|
161 # change into home directory. deletion of the temporary directory will
|
|
162 # fail if it is the current working directory
|
|
163 chdir();
|
|
164 File::Temp::cleanup();
|
|
165 die();
|
|
166 }
|
|
167
|
|
168 ###############################################################################
|
|
169 # PARSE COMMAND LINE OPTIONS
|
|
170 ###############################################################################
|
|
171
|
|
172 # command line options
|
|
173 my ( $i_help, $i_man, $i_debug, $i_fas, $i_wins, $i_shift, $i_crop_unpaired_ends,
|
|
174 $i_r, $i_e, $i_c, $i_t, $i_u, $i_M, $i_o, $i_sge, $i_jobid, $i_tmp,
|
|
175 $i_ignore_seq_header, $i_stacks, $i_stdout, $i_q, $i_T, $i_i,
|
|
176 $i_sample_min_length, $i_sge_logDir, $i_sge_errDir, $i_groupsize, $i_annotate,
|
|
177 $i_abstr, $i_no_structure, $i_vp, $i_matchShape, $i_rnashapes_binpath );
|
|
178
|
|
179 my ( $i_add_seq_graph_win, $i_add_seq_graph_t, $i_change_seq_graph_alph );
|
|
180
|
|
181 my $options = GetOptions(
|
|
182 "help" => \$i_help,
|
|
183 "man" => \$i_man,
|
|
184 "debug" => \$i_debug,
|
|
185 "fasta=s" => \$i_fas,
|
|
186 "wins=s" => \$i_wins,
|
|
187 "shift=f" => \$i_shift,
|
|
188 "cue" => \$i_crop_unpaired_ends,
|
|
189 "stack" => \$i_stacks,
|
|
190 "r" => \$i_r,
|
|
191 "e=f" => \$i_e,
|
|
192 "c=i" => \$i_c,
|
|
193 "t=s" => \$i_t,
|
|
194 "u" => \$i_u,
|
|
195 "M=i" => \$i_M,
|
|
196 "tmp=s" => \$i_tmp,
|
|
197 "o=s" => \$i_o,
|
|
198 "i=i" => \$i_i,
|
|
199 "sample-len=i" => \$i_sample_min_length,
|
|
200 "q" => \$i_q,
|
|
201 "Tp=f" => \$i_T,
|
|
202 "seq-graph-win" => \$i_add_seq_graph_win,
|
|
203 "seq-graph-t" => \$i_add_seq_graph_t,
|
|
204 "-seq-graph-alph" => \$i_change_seq_graph_alph,
|
|
205 "sge" => \$i_sge,
|
|
206 "task-id=i" => \$i_jobid,
|
|
207 "ignore-header" => \$i_ignore_seq_header,
|
|
208 "stdout" => \$i_stdout,
|
|
209 "sge-logdir=s" => \$i_sge_logDir,
|
|
210 "sge-errdir=s" => \$i_sge_errDir,
|
|
211 "group=i" => \$i_groupsize,
|
|
212 "annotate=s" => \$i_annotate,
|
|
213 "abstr" => \$i_abstr,
|
|
214 "nostr" => \$i_no_structure,
|
|
215 "vp" => \$i_vp,
|
|
216 "match-shape=s" => \$i_matchShape,
|
|
217 "rnashapes-bin=s" => \$i_rnashapes_binpath,
|
|
218 );
|
|
219
|
|
220 pod2usage( -exitstatus => 1, -verbose => 1 ) if $i_help;
|
|
221 pod2usage( -exitstatus => 0, -verbose => 2 ) if $i_man;
|
|
222 ($options) or pod2usage(2);
|
|
223
|
|
224 # check compulsory options
|
|
225 ($i_fas) or pod2usage("Error: the option -fasta is compulsory!\n");
|
|
226 ( -e $i_fas ) or pod2usage("Error: no such file - $i_fas!\n");
|
|
227 $i_fas = abs_path($i_fas);
|
|
228
|
|
229 # check other options and set default values
|
|
230 pod2usage("Error: either -e OR -c can be given, but NOT both!\n") if ( $i_e && $i_c );
|
|
231 ( $i_e or $i_c ) or $i_c = 10; # set -c 10, if neither -e or -c are given
|
|
232 ($i_M) or $i_M = 0; # max number of shreps is 0 (=means take all computed)
|
|
233 ($i_i) or $i_i = 0; # default no sampling else sampling iterations
|
|
234 ($i_sample_min_length) or $i_sample_min_length = 0;
|
|
235 pod2usage("\nError: use --sample-len <INT> only with -i <1..INT> !\n") if ( not $i_i and $i_sample_min_length );
|
|
236 ($i_T) or $i_T = 0;
|
|
237 ($i_q) or $i_q = 0;
|
|
238 pod2usage("\nError: Sampling (-i) not possible with shape probabilities (-q)!\n") if ( $i_i and $i_q );
|
|
239 ($i_add_seq_graph_win) or $i_add_seq_graph_win = 0;
|
|
240 ($i_add_seq_graph_t) or $i_add_seq_graph_t = 0;
|
|
241 ($i_change_seq_graph_alph) or $i_change_seq_graph_alph = 0;
|
|
242 ($i_no_structure) or $i_no_structure = 0;
|
|
243
|
|
244 if ($i_change_seq_graph_alph) {
|
|
245 ($i_add_seq_graph_t) or ($i_add_seq_graph_win) or pod2usage( "Error: " .
|
|
246 "When giving the parameter -seq-graph-alph, then either -seq-graph-t or -seq-graph-win" .
|
|
247 " must also be given!\n" );
|
|
248 }
|
|
249 ( -e $i_annotate ) or pod2usage("Error: no such file - $i_annotate!\n")
|
|
250 if ($i_annotate);
|
|
251
|
|
252 $i_add_seq_graph_t = 1 if ($i_no_structure);
|
|
253
|
|
254 ($i_t) or $i_t = 3; # default abstraction type is 3
|
|
255 my $change_shape_level = 0;
|
|
256 my @level_lens = ( -1, -1, -1, -1, -1 ); ## array_idx-1=shape level, value=start length of this level
|
|
257
|
|
258 if ( $i_t !~ /^\d+$/ ) {
|
|
259
|
|
260 my @t_minlens = split( ",", $i_t ); ## -t "3=100,4=200"
|
|
261
|
|
262 foreach my $idx ( 1 .. @t_minlens ) {
|
|
263
|
|
264 my $level = $t_minlens[ $idx - 1 ]; ## $level = "3=100"
|
|
265
|
|
266 die "$level Wrong -t format! Example: -t 3=0,4=100,5=200\n" if ( $level !~ /^\d+\=\d+$/ );
|
|
267 my @lev_len = split( "=", $level ); ## $level = "3=100"
|
|
268
|
|
269 die "Wrong -t format! First level given needs to be length 0! Example: -t 3=0,4=100,5=200\n" if ( $idx == 1 && $lev_len[1] != 0 );
|
|
270 die "Wrong -t format! Only level 1-5 allowed! Example: -t 3=0,4=100,5=200\n" if ( $lev_len[0] < 1 or $lev_len[0] > 5 );
|
|
271 die "Wrong -t format! Length >= 0 expected! Example: -t 3=0,4=100,5=200\n" if ( $lev_len[1] < 0 );
|
|
272 $change_shape_level = 1;
|
|
273 $level_lens[ $lev_len[0] - 1 ] = $lev_len[1];
|
|
274 }
|
|
275
|
|
276 ($i_debug) and print STDERR "i_t = $i_t - change $change_shape_level - shape level lengths#" . join( ":", @level_lens ) . "#" . join( ":", @t_minlens ) . "#\n";
|
|
277 }
|
|
278
|
|
279 ## checks for match-shape
|
|
280 if ($i_matchShape){
|
|
281 die "Please provide correct match shape string like '[]'! Exit...\n\n"
|
|
282 if ($i_matchShape !~ /^[\[\]_]+$/);
|
|
283 ## RNAshapes prodices anyway only 1 structure, no suboptimal structs in match-shape folding
|
|
284 $i_M = 1;
|
|
285 }
|
|
286
|
|
287 my $CURRDIR = getcwd;
|
|
288
|
|
289 # set up tmp directory
|
|
290 # default tmp is /var/tmp, usually not NFS mounted!
|
|
291 ( defined $i_tmp ) or $i_tmp = '/var/tmp/';
|
|
292 my $tmp_template = 'fasta2shrep-XXXXXX';
|
|
293
|
|
294 # CLEANUP => 1 : automatically delete at exit
|
|
295 $i_tmp = tempdir( $tmp_template, DIR => $i_tmp, CLEANUP => 1 );
|
|
296
|
|
297 # create GSPAN directory when not printing to stdout
|
|
298 if ( not $i_stdout ) {
|
|
299 if ($i_o) {
|
|
300 ( -e $i_o ) or system("mkdir -p $i_o");
|
|
301 } else {
|
|
302 system("mkdir -p GSPAN");
|
|
303 $i_o = $CURRDIR . "/GSPAN/";
|
|
304 }
|
|
305 }
|
|
306
|
|
307 ###############################################################################
|
|
308 # GLOBAL VARIABLES
|
|
309 ###############################################################################
|
|
310 $i_rnashapes_binpath or $i_rnashapes_binpath = "RNAshapes";
|
|
311 die "Please provide full RNAshapes path (e.g. /usr/bin/RNAshapes) !\n" if (!-f "$i_rnashapes_binpath");
|
|
312 my $rnashapes_loc = $i_rnashapes_binpath;
|
|
313 if ( !$rnashapes_loc || !-e $rnashapes_loc ) {
|
|
314 my $loc = `which RNAshapes`;
|
|
315 chomp($loc);
|
|
316 die "\nCannot find RNAshapes binary! Exit...\n\n" if ( !$loc );
|
|
317 die "\nCannot find RNAshapes binary! Exit...\n\n" if ( !-e $loc );
|
|
318 $rnashapes_loc = $loc;
|
|
319 }
|
|
320
|
|
321 my @WINDOWS = ();
|
|
322 @WINDOWS = split( ",", $i_wins ) if ($i_wins);
|
|
323 my $globalFolding;
|
|
324 $globalFolding = 1 unless @WINDOWS;
|
|
325 my $CURRUSER = getlogin;
|
|
326 my $SEQNO = 0; # used to id the sequences
|
|
327 my $GSPANNO = 0; # used for gspan filenames
|
|
328 # minimum length of sequence that will work with RNAshapes
|
|
329 # sequences with one or two nucleotides will be restricted to sequence-only graphs
|
|
330 # this ensures that no sequences are skipped and external info kept synchronized
|
|
331 my $GSPAN_SEQ_MINLEN = 3;
|
|
332
|
|
333 # name spaces
|
|
334 my $ABSTRUCT = "AS";
|
|
335 ###############################################################################
|
|
336 # EXECUTION CODE
|
|
337 ###############################################################################
|
|
338
|
|
339 # read fasta file into hash
|
|
340 my ( $headers_aref, $sequences_aref, $metainfo_aref ) = read_fasta_with_nonunique_headers_meta($i_fas);
|
|
341
|
|
342 # call script again on the sge cluster in a batch job
|
|
343 if ($i_sge) {
|
|
344 my $SCRIPTNAME = $0;
|
|
345 my $CLUSTERSUBMIT = $FindBin::Bin."/fasta2shrep_gspan.sge";
|
|
346
|
|
347 die "Cannot find SGE submit script $CLUSTERSUBMIT! Exit...\n\n" if ( !-e $CLUSTERSUBMIT );
|
|
348
|
|
349 my $SGE_jobs = @{$sequences_aref};
|
|
350 $SGE_jobs = ceil( @{$sequences_aref} / $i_groupsize ) if ($i_groupsize);
|
|
351
|
|
352 my $params = "-fasta $i_fas ";
|
|
353 $params .= "-wins $i_wins " if ($i_wins);
|
|
354 $params .= "-shift $i_shift " if ($i_shift);
|
|
355 $params .= "-e $i_e " if ($i_e);
|
|
356 $params .= "-c $i_c " if ($i_c);
|
|
357 $params .= "-t $i_t " if ($i_t);
|
|
358 $params .= "-u " if ($i_u);
|
|
359 $params .= "-r " if ($i_r);
|
|
360 $params .= "-M $i_M " if ($i_M);
|
|
361 $params .= "-o $i_o " if ($i_o);
|
|
362 $params .= "-i $i_i " if ($i_i);
|
|
363 $params .= "--sample-len $i_sample_min_length" if ($i_sample_min_length);
|
|
364 $params .= "-q " if ($i_q);
|
|
365 $params .= "-Tp $i_T " if ($i_T);
|
|
366 $params .= "--seq-grap-win " if ($i_add_seq_graph_win);
|
|
367 $params .= "--seq-graph-t " if ($i_add_seq_graph_t);
|
|
368 $params .= "--seq-graph-alph " if ($i_change_seq_graph_alph);
|
|
369 $params .= "-ignore-header " if ($i_ignore_seq_header);
|
|
370 $params .= "-cue " if ($i_crop_unpaired_ends);
|
|
371 $params .= "-stack " if ($i_stacks);
|
|
372 $params .= "--group $i_groupsize " if ($i_groupsize);
|
|
373 $params .= "-annotate $i_annotate" if ($i_annotate);
|
|
374 $params .= "-abstr" if ($i_abstr);
|
|
375 $params .= "-nostr " if ($i_no_structure);
|
|
376 $params .= "--debug " if ($i_debug);
|
|
377 $params .= "-vp " if ($i_vp);
|
|
378 $params .= "--match-shape $i_matchShape " if ($i_matchShape);
|
|
379
|
|
380 print "used script:" . $SCRIPTNAME . "\n";
|
|
381 print "used submit script:" . $CLUSTERSUBMIT . "\n";
|
|
382
|
|
383 $i_sge_logDir = "$i_o/SGE_log" if ( !$i_sge_logDir );
|
|
384 mkdir($i_sge_logDir);
|
|
385
|
|
386 $i_sge_errDir = $i_sge_logDir if ( !$i_sge_errDir );
|
|
387 mkdir($i_sge_errDir);
|
|
388
|
|
389 my $ssh = 1; ## can be used for debug shell script call
|
|
390 if ($ssh) {
|
|
391 system( "ssh $CURRUSER\@biui.informatik.uni-freiburg.de "
|
|
392 . "'export SGE_ROOT=/opt/sge-6.0/; cd $CURRDIR; "
|
|
393 . "/opt/sge-6.0/bin/lx24-amd64/qsub -t 1-$SGE_jobs -o $i_sge_errDir/ -e $i_sge_errDir/ "
|
|
394 . "$CLUSTERSUBMIT $CURRDIR $SCRIPTNAME \"$params\" ' " );
|
|
395 } else {
|
|
396 system("$CLUSTERSUBMIT $CURRDIR $SCRIPTNAME '$params' ");
|
|
397 }
|
|
398 exit;
|
|
399 }
|
|
400
|
|
401 ## compute shreps and gspan, either local or after SGE submission
|
|
402
|
|
403 # TODO read and process annotations
|
|
404
|
|
405 my @used_seq_headers;
|
|
406 my @used_seqs;
|
|
407 my @used_meta;
|
|
408 my $group_idx;
|
|
409
|
|
410 if ($i_jobid) {
|
|
411 ## just process the one sequence as given by the jobid number
|
|
412 my $used_grouping = 1; ## if no group is given, make 1 seq per job = group=1
|
|
413 $used_grouping = $i_groupsize if ($i_groupsize);
|
|
414 my $st = ( $i_jobid - 1 ) * $used_grouping + 1;
|
|
415 my $end = $st + $used_grouping - 1;
|
|
416 $end = @{$sequences_aref} if ( $end > @{$sequences_aref} );
|
|
417
|
|
418 foreach my $idx ( $st .. $end ) {
|
|
419 push( @used_seq_headers, $headers_aref->[ $idx - 1 ] );
|
|
420 push( @used_seqs, $sequences_aref->[ $idx - 1 ] );
|
|
421 push( @used_meta, $metainfo_aref->[ $idx - 1 ] );
|
|
422 }
|
|
423
|
|
424 $group_idx = $i_jobid;
|
|
425
|
|
426 #($i_debug) and print STDERR "st $st end $end gr $group_idx job $i_jobid gs $i_groupsize\n";
|
|
427
|
|
428 $GSPANNO = $i_jobid-1 if (!$i_groupsize);
|
|
429
|
|
430 } else {
|
|
431 ## process all sequences at once
|
|
432 @used_seq_headers = @{$headers_aref};
|
|
433 @used_seqs = @{$sequences_aref};
|
|
434 @used_meta = @{$metainfo_aref};
|
|
435 }
|
|
436
|
|
437 my $out;
|
|
438 my $gspanfile;
|
|
439 my $out_no_match_shape;
|
|
440
|
|
441 if ($i_matchShape && !$i_stdout && !$i_groupsize){
|
|
442 open($out_no_match_shape,">$i_o/fasta2shrep.no_match");
|
|
443 }
|
|
444
|
|
445 # for each sequence in the fasta file
|
|
446 while ( my $seq = shift @used_seqs ) {
|
|
447 my $tmp_header = shift @used_seq_headers;
|
|
448 my $tmp_meta = shift @used_meta;
|
|
449
|
|
450 my ( $seq_id, $seq_header ) = ( $tmp_header =~ /(\S+)\s*([\S*\s*]*)/ );
|
|
451 $i_ignore_seq_header and $seq_header = '';
|
|
452
|
|
453 my $seq_fasta = generate_single_fasta_from_sequence_X( $seq_id, $seq );
|
|
454 my $seq_len = length($seq);
|
|
455
|
|
456 # only print sequence graphs for sequences below this threshold
|
|
457 my $no_structure_override = ($seq_len < $GSPAN_SEQ_MINLEN) ? 1 : 0;
|
|
458
|
|
459 $GSPANNO++;
|
|
460
|
|
461 # set outstream for gspan output to correct file/STDOUT
|
|
462 if ($i_stdout) {
|
|
463 $out = \*STDOUT;
|
|
464 } elsif ( !$i_groupsize ) {
|
|
465 $gspanfile = $i_tmp . '/' . $GSPANNO . '.gspan';
|
|
466 open( $out, "| bzip2 -f > $gspanfile.bz2" );
|
|
467 } elsif ( ( $GSPANNO - 1 ) % $i_groupsize == 0 ) {
|
|
468
|
|
469 if ( $GSPANNO > 1 ) {
|
|
470 close($out);
|
|
471 system("mv $gspanfile.bz2 $i_o/$group_idx.group.gspan.bz2");
|
|
472 if ($out_no_match_shape){
|
|
473 close($out_no_match_shape);
|
|
474 system("mv $gspanfile.no_match $i_o/$group_idx.group.gspan.no_match");
|
|
475 }
|
|
476 }
|
|
477
|
|
478 if ( !$i_jobid ) {
|
|
479 $group_idx = int( ( $GSPANNO - 1 ) / $i_groupsize ) + 1;
|
|
480 }
|
|
481
|
|
482 $gspanfile = "$i_tmp/$group_idx.group.gspan";
|
|
483 open( $out, "| bzip2 -f > $gspanfile.bz2" );
|
|
484 open( $out_no_match_shape, ">$gspanfile.no_match" ) if ($out_no_match_shape);
|
|
485 }
|
|
486
|
|
487 ## do not use folding windows in special cases
|
|
488 if ( $globalFolding || $i_no_structure || $no_structure_override) {
|
|
489 @WINDOWS = ();
|
|
490 push( @WINDOWS, $seq_len );
|
|
491 }
|
|
492
|
|
493 ##check win sizes for global folding
|
|
494 my @WINDOWS_used = sort { $a <=> $b } @WINDOWS;
|
|
495 foreach my $w_idx ( 0 .. ( @WINDOWS_used - 1 ) ) {
|
|
496 if ( $WINDOWS_used[$w_idx] >= $seq_len && $WINDOWS_used[$w_idx] > 1 ) {
|
|
497 if ( $w_idx < $#WINDOWS_used ) {
|
|
498 @WINDOWS_used = @WINDOWS_used[ 0 .. $w_idx ];
|
|
499 last;
|
|
500 }
|
|
501 }
|
|
502 }
|
|
503
|
|
504 ## use seq graph only if no shape folding wanted
|
|
505 $i_add_seq_graph_t = 1 if ($i_no_structure || $no_structure_override);
|
|
506
|
|
507 ## no shape info in graphheader if we have fixed structure (according LocaRNA handling)
|
|
508 ## use tags #FS or #S for provided structure
|
|
509 my $graph_header;
|
|
510 my @struct_meta = grep { $_ =~ /#FS/ || $_ =~ /#S/ } keys %{$tmp_meta};
|
|
511
|
|
512 if (@struct_meta) {
|
|
513 $graph_header = getGraphHeader( $seq_id, $seq_header, \@WINDOWS, $i_shift, $i_e, $i_c, $i_t, $i_u, $i_r, $i_M, $i_crop_unpaired_ends, $i_i, $i_sample_min_length, $i_q, $i_T, $seq_len, 1 );
|
|
514 } else {
|
|
515 $graph_header = getGraphHeader( $seq_id, $seq_header, \@WINDOWS, $i_shift, $i_e, $i_c, $i_t, $i_u, $i_r, $i_M, $i_crop_unpaired_ends, $i_i, $i_sample_min_length, $i_q, $i_T, $seq_len, $i_no_structure || $no_structure_override );
|
|
516 }
|
|
517
|
|
518 print $out $graph_header;
|
|
519
|
|
520 my $gi = 0; # current graph index
|
|
521
|
|
522 ## add graph with no structure at all depending on $i_add_seq_graph_t
|
|
523 if ($i_add_seq_graph_t) {
|
|
524
|
|
525 ($gi) = convertSeqWindow( $seq, $seq_len, 1, $gi, $graph_header, $out, $i_annotate, $seq );
|
|
526
|
|
527 }
|
|
528
|
|
529 ## encode fixed structure only if provided and structures wanted in general
|
|
530 if ( @struct_meta && !$i_no_structure && !$no_structure_override) {
|
|
531
|
|
532 my $struct_meta = "#FS";
|
|
533 $struct_meta = "#S" if ( !exists $tmp_meta->{$struct_meta} );
|
|
534
|
|
535 my $seq_shrep = [ $tmp_meta->{$struct_meta}, "ENERGY", "0.00", "SHAPE", $struct_meta ];
|
|
536 $gi = convertShapeWindow( [$seq_shrep], $seq, $seq_len, 1, $gi, $out,
|
|
537 $graph_header, $i_annotate, $i_abstr, $i_crop_unpaired_ends, $i_stacks, $seq );
|
|
538 @WINDOWS_used = (); ## no shape folding if we have a fixed structure
|
|
539
|
|
540 }
|
|
541
|
|
542 ## ignore RNAshapes folding if wanted, but do correct file move afterwards
|
|
543 ## (just "next" does not work due to output)
|
|
544 @WINDOWS_used = () if ($i_no_structure or $no_structure_override);
|
|
545
|
|
546 #for each window size in list
|
|
547 foreach my $win_size (@WINDOWS_used) {
|
|
548
|
|
549 # calculate shift size from percentage
|
|
550 my $curr_shift = 1;
|
|
551 if ($i_shift) {
|
|
552 $curr_shift = ( $i_shift / 100 ) * $win_size;
|
|
553 $curr_shift = int($curr_shift); #round down
|
|
554 $curr_shift = 1 unless ($curr_shift); # just in case it is 0
|
|
555 }
|
|
556 ($i_debug) and print STDERR "winsize: $win_size curr_shift: $curr_shift\n";
|
|
557 ($i_debug) and print STDERR "\nNext: $seq_id\t winsize:$win_size \n";
|
|
558
|
|
559 # choose current shape level, depending on $i_t
|
|
560 my $curr_t = 0;
|
|
561 if ($change_shape_level) {
|
|
562 for ( my $i = 0 ; $i < @level_lens ; $i++ ) {
|
|
563 $curr_t = $i + 1 if ( $level_lens[$i] != -1 && ( $level_lens[$i] <= $win_size ) );
|
|
564 }
|
|
565 ($i_debug) and print STDERR "$win_size curr type $curr_t\n";
|
|
566 } else {
|
|
567 $curr_t = $i_t;
|
|
568 }
|
|
569
|
|
570 my $rnashapesoutput_fh;
|
|
571
|
|
572 # call RNAshapes and write to $rnashapesoutput_fh
|
|
573 $rnashapesoutput_fh = call_RNAshapes( $seq_fasta, $rnashapes_loc, $win_size,
|
|
574 $curr_shift, $i_e, $i_c, $curr_t, $i_u, $i_r, $i_q, $i_T, $i_i, $i_sample_min_length, $seq_len, $i_matchShape );
|
|
575
|
|
576 # read RNAshapes output from $rnashapesoutput_fh and write subgraph
|
|
577 # to gspan file
|
|
578 my $gi_old = $gi;
|
|
579 $gi = convert_RNAshapes_output( $rnashapesoutput_fh, $gi, $i_M, $out, $graph_header,
|
|
580 $win_size, $seq_len, $curr_t, $i_annotate, $i_abstr, $i_crop_unpaired_ends, $i_stacks, $seq );
|
|
581
|
|
582 ## no (match) shape found at all for this seq
|
|
583 if ($gi == $gi_old+1){
|
|
584 $gi = convertSeqWindow( "X", 1, 1, $gi, $graph_header, $out, $i_annotate, $seq );
|
|
585 print $out_no_match_shape $seq_id."\n" if (!$i_stdout && $out_no_match_shape);
|
|
586 }
|
|
587 } ## foreach WINDOW_used
|
|
588
|
|
589 if ( !$i_stdout && !$i_groupsize ) {
|
|
590 close($out);
|
|
591 move "$gspanfile.bz2", "$i_o/$GSPANNO.gspan.bz2";
|
|
592 }
|
|
593
|
|
594 system("rm $seq_fasta");
|
|
595
|
|
596 } ## while @used_seqs
|
|
597
|
|
598 if ($i_groupsize) {
|
|
599 close($out);
|
|
600 system("mv $gspanfile.bz2 $i_o/$group_idx.group.gspan.bz2");
|
|
601 close($out_no_match_shape) if ($out_no_match_shape);
|
|
602 move "$gspanfile.no_match", "$i_o/$group_idx.group.gspan.no_match" if ($out_no_match_shape);
|
|
603 } elsif ( $out_no_match_shape && !$i_stdout && !$i_groupsize ) {
|
|
604 close($out_no_match_shape);
|
|
605 }
|
|
606
|
|
607 ###############################################################################
|
|
608 # METHODS
|
|
609 ###############################################################################
|
|
610
|
|
611 ############################################################################
|
|
612 # Generates fasta file for a single sequence (one-lined-fasta). This
|
|
613 # fasta is stored in the temp directory and should be deleted at the end.
|
|
614 # Input:
|
|
615 # seq_id : the sequence ID
|
|
616 # seq : the sequence
|
|
617 #
|
|
618 # Output:
|
|
619 # The fasta file name
|
|
620 ############################################################################
|
|
621 sub generate_single_fasta_from_sequence_X {
|
|
622 my ( $seq_id, $seq ) = @_;
|
|
623
|
|
624 $seq = uc($seq);
|
|
625 $seq =~ tr/T/U/;
|
|
626 $seq =~ s/[^AUCGN]/N/g;
|
|
627
|
|
628 my $outfas = $i_tmp . "/seq_" . $SEQNO++ . ".fasta";
|
|
629 my $host = readpipe("hostname");
|
|
630 open( FAS, ">$outfas" ) or die "$host Cannot open file $outfas! Exit...\n\n";
|
|
631 print FAS ">$seq_id\n$seq";
|
|
632 close(FAS);
|
|
633
|
|
634 return $outfas;
|
|
635 }
|
|
636
|
|
637 ############################################################################
|
|
638 # RNAshapes is called with the given input or default parameters.
|
|
639 # Input:
|
|
640 # seq_fasta : the sequence fasta file
|
|
641 # rnashapes_location : the location of the installation files for RNAshapes
|
|
642 # win_size : the current window size
|
|
643 # shift : the input parameter -shift
|
|
644 # e : the input parameter -e
|
|
645 # c : the input parameter -c
|
|
646 # t : the input parameter -t
|
|
647 # u : the input parameter -u
|
|
648 # r : the input parameter -r
|
|
649 #
|
|
650 # Output: none
|
|
651 ############################################################################
|
|
652 sub call_RNAshapes {
|
|
653 my ( $seq_fasta, $rnashapes_location, $win_size, $shift, $e, $c, $t, $u, $r, $q, $T, $i, $sample_length, $seqLen, $matchShape ) = @_;
|
|
654 my $FUNCTION = "call_RNAshapes in fasta2shrep_gspan.pl";
|
|
655
|
|
656 ($seq_fasta) or die("INPUT ERROR in $FUNCTION: the fasta file is compulsory!\n");
|
|
657 ($rnashapes_location) or die( "INPUT ERROR in $FUNCTION: the RNAshapes location" . " is compulsory!\n" );
|
|
658 die "$rnashapes_location does not exist! Exit...\n\n" if ( !-e $rnashapes_location );
|
|
659 my $call = $rnashapes_location . " -o 1 "; # the output format is of type 1
|
|
660 $call .= "-q " if ($q);
|
|
661 $call .= "-T $T " if ( $q and $T );
|
|
662 $call .= "-w $win_size ";
|
|
663 $call .= "-W $shift " if ($i_shift);
|
|
664 die("ERROR in $FUNCTION: Give only one of the options -c or -e (RNAshapes)!\n")
|
|
665
|
|
666 if ( $e && $c );
|
|
667 $call .= "-e $e " if ($e);
|
|
668 $call .= "-c $c " if ($c);
|
|
669 $call .= "-t $t " if ($t);
|
|
670 $call .= "-u " if ( $u and not $i ); ## not possible in sampling mode
|
|
671 $call .= "-r " if ($r);
|
|
672 $call .= "-m $matchShape " if ($matchShape);
|
|
673
|
|
674 ## check is a bit long but : we want to sample if the window is larger than $sample_length and full seq is larger than window or sample_len
|
|
675 ## necessary to do sampling a large window is given, sample_length is shorter than window, but seq is longer than sample_len
|
|
676 $call .= "-i $i -A " if ( not $q and $i and ( $win_size >= $sample_length and ( $win_size <= $seqLen or $seqLen >= $sample_length ) ) ); ## -A is to omit samples and print only combined shape probs
|
|
677 $call .= " < $seq_fasta";
|
|
678
|
|
679 ($i_debug) and print STDERR "$seqLen $sample_length $win_size $call\n";
|
|
680
|
|
681 open my $rnashapesoutput, "$call |" or die( "ERROR in $FUNCTION: The following call " . "could not be carried out!\n$call\n" );
|
|
682
|
|
683 return $rnashapesoutput;
|
|
684 }
|
|
685
|
|
686 ############################################################################
|
|
687 # The output of RNAshapes for one sequence and one window size is read
|
|
688 # and converted into graph format.
|
|
689 # Input:
|
|
690 # rnashapeoutput : filehandle for RNAshapes output in format -o 1
|
|
691 # curr_gi : current graph index (for vertices)
|
|
692 # maxShreps : max number of shreps to convert to graphs
|
|
693 # graph_file_hdl : the output handler for the graph file
|
|
694 # graphHead : the header line for the complete graph (for sequence)
|
|
695 # winSize : current window size in input for RNAshapes
|
|
696 # seqLen : full input seq length
|
|
697 # used_t :
|
|
698 # annotate:
|
|
699 # abstr :
|
|
700 # cue :
|
|
701 # stacks :
|
|
702 # orig_seq : the nucleotide sequence as read from fasta
|
|
703 #
|
|
704 # Output:
|
|
705 # The current graph index
|
|
706 ############################################################################
|
|
707 sub convert_RNAshapes_output {
|
|
708 my ( $rnashapesoutput, $curr_gi, $maxShreps, $graph_file_hdl, $graphHead, $winSize, $seqLen, $used_t, $annotate, $abstr, $cue, $stacks, $orig_seq ) = @_;
|
|
709
|
|
710 ## omit first line in output, contains fasta header line
|
|
711 my $line = <$rnashapesoutput>;
|
|
712 my $win_shreps = [];
|
|
713 my $win_start;
|
|
714 my $win_end;
|
|
715 my $win_seq;
|
|
716 my $win_shrep_count = 0;
|
|
717 my $win_sample = 0;
|
|
718 my $winHead;
|
|
719 my $win_globalFolding;
|
|
720 my $win_size_real;
|
|
721
|
|
722 # reading RNAshapes output
|
|
723 while ( $line = <$rnashapesoutput> ) {
|
|
724
|
|
725 if ( $line =~ /^(\d+)\s+(\d+)$/ ) {
|
|
726 ## line: "<start> <end>"
|
|
727
|
|
728 if ( @{$win_shreps} > 0 ) {
|
|
729
|
|
730 print $graph_file_hdl $winHead;
|
|
731
|
|
732 ## remove SHAPE="_" shrep if it exists in $win_shreps
|
|
733 ## do this only when have a sequence graph already
|
|
734 if ($i_add_seq_graph_t) {
|
|
735 my @new_win_shreps = ();
|
|
736 map { push( @new_win_shreps, $_ ) if ( $_->[ $#{$_} ] ne "_" ) } @{$win_shreps};
|
|
737 $win_shreps = \@new_win_shreps;
|
|
738 }
|
|
739
|
|
740 ## add graph with no structure depending on $i_add_seq_graph_win to win
|
|
741 if ($i_add_seq_graph_win ) {
|
|
742 ( $curr_gi ) = convertSeqWindow( $win_seq, $win_size_real,
|
|
743 $win_start, $curr_gi, $winHead, $graph_file_hdl, $annotate, $orig_seq );
|
|
744 }
|
|
745
|
|
746 $curr_gi = convertShapeWindow( $win_shreps, $win_seq, $win_size_real,
|
|
747 $win_start, $curr_gi, $graph_file_hdl, $winHead, $annotate, $abstr,
|
|
748 $cue, $stacks, $orig_seq );
|
|
749 }
|
|
750
|
|
751 ## set new window params
|
|
752 $win_shreps = [];
|
|
753 $win_start = $1;
|
|
754 $win_end = $2;
|
|
755 $win_shrep_count = 0;
|
|
756 $win_size_real = $win_end - $win_start + 1;
|
|
757
|
|
758 if ( ($win_size_real) >= $seqLen ) {
|
|
759 $win_globalFolding = 1;
|
|
760 } else {
|
|
761 $win_globalFolding = 0;
|
|
762 }
|
|
763 my $win_center = $win_start + ( ( $win_size_real + 1 ) / 2 );
|
|
764
|
|
765 $winHead = getWindowHeader( $graphHead, $winSize, $win_start, $win_end, $win_center, $win_globalFolding, $win_sample, $used_t );
|
|
766
|
|
767 } elsif ( $line =~ /^(\S+)$/ ) {
|
|
768 ## line: "CUUAUGAGUAAGGAAAAUAACGAUUCGGGGUGACGCCCGAAUCCUCACUG"
|
|
769 $win_seq = uc($1); ## to be 101% sure that we have by default uppercase chars
|
|
770
|
|
771 } elsif ( $line =~ /^Results for (\d+) iterations:$/ ) {
|
|
772 ## line: "Results for 10 iterations:"
|
|
773 $win_sample = $1;
|
|
774
|
|
775 } elsif ( $line =~ /^Shape\s+\S+\s+not found within energy range.*$/ ) {
|
|
776 ## line: "Shape [] not found within energy range (-24.75 to -27.50). Try -c or -e to increase range."
|
|
777 $win_shreps = [];
|
|
778
|
|
779 } elsif ( $line =~ /^([\(\)\.]+)\s+\((\S+)\)\s+(\S+)$/ ) {
|
|
780 ## line:"...((((..(((....)))))))...........(((((......))))) (-10.10) [[]][]"
|
|
781 ## take only $maxShreps shreps per window if set
|
|
782 next if ( $maxShreps && $win_shrep_count >= $maxShreps );
|
|
783 push( @{$win_shreps}, [ $1, "ENERGY", $2, "SHAPE", $3 ] );
|
|
784 $win_shrep_count++;
|
|
785
|
|
786 } elsif ( $line =~ /^([\(\)\.]+)\s+\((\S+)\)\s+\((\S+)\)\s+(\S+)$/ ) {
|
|
787 ## line:"((((..((((...((.((.((.....)).)).))...))))..))))... (-10.60) (0.7795360) [[[[[]]]]]"
|
|
788 ## take only $maxShreps shreps per window if set
|
|
789 next if ( $maxShreps && $win_shrep_count >= $maxShreps );
|
|
790 push( @{$win_shreps}, [ $1, "ENERGY", $2, "PROB", $3, "SHAPE", $4 ] );
|
|
791 $win_shrep_count++;
|
|
792
|
|
793 } elsif ( $line =~ /^([\(\)\.]+)\s+\((\S+)\)\s+(\S+)\s+(\S+)$/ ) {
|
|
794 ## line:"((((..((((...((.((.((.....)).)).))...))))..))))... (-10.60) 0.3000000 [[[[[]]]]]"
|
|
795 ## take only $maxShreps shreps per window if set
|
|
796 next if ( $maxShreps && $win_shrep_count >= $maxShreps );
|
|
797 push( @{$win_shreps}, [ $1, "ENERGY", $2, "SHAPEPROB", $3, "SHAPE", $4 ] );
|
|
798 $win_shrep_count++;
|
|
799
|
|
800 } elsif ( $line =~ /^([\(\)\.]+)\s+\((\S+)\)\s+\((\S+)\)\s+(\S+)\s+(\S+)$/ ) {
|
|
801 ## line:"((((..((((...((.((.((.....)).)).))...))))..))))... (-10.60) (0.7795360) 0.3000000 [[[[[]]]]]"
|
|
802 ## take only $maxShreps shreps per window if set
|
|
803 next if ( $maxShreps && $win_shrep_count >= $maxShreps );
|
|
804 push( @{$win_shreps}, [ $1, "ENERGY", $2, "PROB", $3, "SHAPEPROB", $4, "SHAPE", $5 ] );
|
|
805 $win_shrep_count++;
|
|
806
|
|
807 } else {
|
|
808 next if ( $line =~ /^$/ );
|
|
809 die "Unexpected shape output format!\nline=$line\n\nExit...\n\n";
|
|
810 }
|
|
811 }
|
|
812
|
|
813 ## convert last windows
|
|
814 if ( @{$win_shreps} > 0 ) {
|
|
815
|
|
816 print $graph_file_hdl $winHead;
|
|
817
|
|
818 ## remove SHAPE="_" shrep if it exists in $win_shreps
|
|
819 ## do this only when have a sequence graph already
|
|
820 if ($i_add_seq_graph_t) {
|
|
821 my @new_win_shreps = ();
|
|
822 map { push( @new_win_shreps, $_ ) if ( $_->[ $#{$_} ] ne "_" ) } @{$win_shreps};
|
|
823 $win_shreps = \@new_win_shreps;
|
|
824 }
|
|
825
|
|
826 ## add graph with no structure depending on $i_add_seq_graph_win to win
|
|
827 if ($i_add_seq_graph_win) {
|
|
828 ($curr_gi) = convertSeqWindow( $win_seq, $win_size_real, $win_start,
|
|
829 $curr_gi, $winHead, $graph_file_hdl, $annotate, $orig_seq );
|
|
830 }
|
|
831
|
|
832 $curr_gi = convertShapeWindow( $win_shreps, $win_seq, $win_size_real,
|
|
833 $win_start, $curr_gi, $graph_file_hdl, $winHead, $annotate, $abstr, $cue,
|
|
834 $stacks, $orig_seq );
|
|
835 }
|
|
836
|
|
837 close($rnashapesoutput);
|
|
838
|
|
839 return $curr_gi + 1; # return the gi (graph index) for the next subgraph
|
|
840 }
|
|
841
|
|
842 ## Sub to create graph for a complete unstructured sequence
|
|
843 # TODO: document function
|
|
844 sub convertSeqWindow {
|
|
845 my ( $win_seq, $win_size_real, $win_start, $curr_gi, $winHead, $graph_file_hdl, $annotate, $orig_seq ) = @_;
|
|
846
|
|
847 my $seq_shrep;
|
|
848
|
|
849 # use a different alphabet (lowercase) if option seq-graph-alph set
|
|
850 my $seq_graph_sequence = $i_change_seq_graph_alph ? lc($win_seq) : uc($win_seq);
|
|
851 $seq_shrep = [ "." x $win_size_real, "ENERGY", "0.00", "SHAPE", "_", "STRUCT", "." x $win_size_real, "SEQ", $seq_graph_sequence ];
|
|
852 my $backboneGraph_ref = getBackboneGraph( $seq_graph_sequence, $curr_gi, $win_start, 0, ( $win_size_real - 1 ), $orig_seq );
|
|
853
|
|
854 print $graph_file_hdl getSeqHeader( $winHead, $seq_shrep );
|
|
855 print $graph_file_hdl join( "\n", @{$backboneGraph_ref} ) . "\n";
|
|
856
|
|
857 $curr_gi += $win_size_real;
|
|
858
|
|
859 if ($annotate) {
|
|
860 ## TODO add annotation graphs
|
|
861 }
|
|
862
|
|
863 return ($curr_gi);
|
|
864 }
|
|
865
|
|
866 ############################################################################
|
|
867 # The results for one window are converted in this method to GSPAN graphs.
|
|
868 # Input:
|
|
869 # win_shreps_aref : the array ref of shreps (dot-bracket structures) for the
|
|
870 # current window
|
|
871 # win_seq : the nucleotide sequence for the current window
|
|
872 # win_size_real : the current (true) window size
|
|
873 # win_start : the starting position of win_seq in the original seq (1-n)
|
|
874 # curr_gi : the current graph index
|
|
875 # graph_file_hdl : the graph file handler
|
|
876 # winHead : the header line for the current sequence window
|
|
877 # TODO annotate
|
|
878 # abstr : input parameter -abstr
|
|
879 # cue : input parameter -cue
|
|
880 # stacks : input parameter -stacks
|
|
881 # orig_seq : the nucleotide sequence as read from fasta
|
|
882 #
|
|
883 # Output:
|
|
884 # The current graph index
|
|
885 ############################################################################
|
|
886 sub convertShapeWindow {
|
|
887 my ( $win_shreps_aref, $win_seq, $win_size_real, $win_start, $curr_gi,
|
|
888 $graph_file_hdl, $winHead, $annotate, $abstr, $cue, $stacks, $orig_seq ) = @_;
|
|
889
|
|
890 ## generate for each shrep a connected component in gspan file
|
|
891 foreach my $shrep ( @{$win_shreps_aref} ) {
|
|
892
|
|
893 # get the current gi as it was at the beginning of this shrep
|
|
894 my $shrep_gi = $curr_gi;
|
|
895
|
|
896 # cut off unpaired ends, if option is given
|
|
897 my $shrep_struct = $shrep->[0];
|
|
898 my $crop_index_left = 0;
|
|
899 my $crop_index_right = length($shrep_struct) - 1;
|
|
900
|
|
901 # find croping indices with numbering from 0 to n-1
|
|
902 if ($cue) {
|
|
903 $crop_index_left = index( $shrep_struct, "(" ); # find 1st occ of "("
|
|
904 $crop_index_right = rindex( $shrep_struct, ")" ); # find last occ of ")"
|
|
905
|
|
906 # if the complete window is unpaired, then don't crop
|
|
907 if ( $crop_index_left == -1 ) {
|
|
908 $crop_index_left = 0;
|
|
909 $crop_index_right = length($shrep_struct) - 1;
|
|
910 }
|
|
911 }
|
|
912
|
|
913 # create structure graph
|
|
914 my $backboneGraph_ref = getBackboneGraph( $win_seq, $curr_gi, $win_start, $crop_index_left, $crop_index_right, $orig_seq );
|
|
915 my $structGraph_ref;
|
|
916
|
|
917 # add both basic edges, stacks and abstract graph
|
|
918 if ($abstr) {
|
|
919 ( $structGraph_ref, $curr_gi ) = getStructPlusAbstractStructGraph( $shrep, $curr_gi, $win_size_real, $cue, $stacks );
|
|
920
|
|
921 # just add basic structure graph edges, and stacks
|
|
922 } else {
|
|
923 ( $structGraph_ref, $curr_gi ) = getStructGraph( $shrep, $curr_gi, $win_size_real, $stacks );
|
|
924 }
|
|
925
|
|
926 # additional information for shrep header: sequence and dot-bracket
|
|
927 my $crop_length = $crop_index_right - $crop_index_left + 1;
|
|
928 my $shrepheader_struct = substr( $shrep_struct, $crop_index_left, $crop_length );
|
|
929 my $shrepheader_seq = substr( $win_seq, $crop_index_left, $crop_length );
|
|
930 push( @{$shrep}, 'STRUCT', $shrepheader_struct, 'SEQ', $shrepheader_seq );
|
|
931
|
|
932 # print structure graph to file
|
|
933 print $graph_file_hdl getShapeHeader( $winHead, $shrep );
|
|
934 print $graph_file_hdl join( "\n", @{$backboneGraph_ref} ) . "\n";
|
|
935
|
|
936 # don't print empty array; sgdnspdk breaks with empty lines
|
|
937 if ( @{$structGraph_ref} > 0 ) {
|
|
938 print $graph_file_hdl join( "\n", @{$structGraph_ref} ) . "\n";
|
|
939
|
|
940 if ($annotate) {
|
|
941
|
|
942 # TODO add annotations to shrep structure
|
|
943 # $win_start, $win_end (1-n)
|
|
944 }
|
|
945
|
|
946 }
|
|
947 }
|
|
948
|
|
949 return $curr_gi;
|
|
950 }
|
|
951
|
|
952 ###########################################################################
|
|
953 # Here the vertices for the nucleotides are generated and also the backbone
|
|
954 # edges, which connect the nucleotides in their correct order (i.e order
|
|
955 # of the given sequence)
|
|
956 # Input:
|
|
957 # win_seq : the sequence for the current window
|
|
958 # win_start : the starting position for the current window
|
|
959 # curr_crop_i_left : the cropping index for the left end of sequence (-cue)
|
|
960 # curr_crop_i_right : the cropping index for the right end of sequence (-cue)
|
|
961 # orig_seq : the nucleotide sequence as read from fasta
|
|
962 #
|
|
963 # Output:
|
|
964 # The lines of the graph in an array reference to be printed to a file with
|
|
965 # one element per line.
|
|
966 ############################################################################
|
|
967 sub getBackboneGraph {
|
|
968 my ( $win_seq, $curr_gi, $win_start, $curr_crop_i_left, $curr_crop_i_right, $orig_seq ) = @_;
|
|
969
|
|
970 # RNAshapes substitutes T -> U, sequence only does not
|
|
971 # thus, just transform all t/T -> u/U to remain consistent
|
|
972 $orig_seq =~ tr /tT/uU/;
|
|
973 $win_seq =~ tr /tT/uU/;
|
|
974
|
|
975 my @seq;
|
|
976 @seq = split( "", $win_seq );
|
|
977
|
|
978 # if the vp option is set we need to obtain the original capitalization from
|
|
979 # $orig_seq and extract the windowed sequence manually
|
|
980 my @seq_vp;
|
|
981 if ( defined $i_vp ) {
|
|
982 my $win_len = length($win_seq);
|
|
983 my $capitalized_win_seq = substr( $orig_seq, $win_start - 1, $win_len );
|
|
984 ( uc($capitalized_win_seq) eq uc($win_seq) ) or
|
|
985 die( "error: windowed sequence generated due to vp option not equal " .
|
|
986 "to sequence reported by RNASHAPES.\n" .
|
|
987 "'${capitalized_win_seq}' != '${win_seq}'" );
|
|
988 @seq_vp = split( "", $capitalized_win_seq );
|
|
989 }
|
|
990
|
|
991 my @vert = ();
|
|
992 my @edg = ();
|
|
993
|
|
994 my $curr_abs_pos = $win_start + $curr_crop_i_left;
|
|
995 $curr_gi += $curr_crop_i_left;
|
|
996
|
|
997 # set vertice labeled with 'v' or 'V' according to sequence and vp option
|
|
998 # when vp set,
|
|
999 # uppercase nucleotides are annotated with 'v'
|
|
1000 # lowercase nucleotides are annotated with 'V'
|
|
1001 my $vertice_label = ( defined $i_vp and ( $seq_vp[$curr_crop_i_left] =~ /[a-z]/ ) ) ? 'V' : 'v';
|
|
1002
|
|
1003 # create backbone vertice of first nucleotide
|
|
1004 push( @vert, join( ' ', $vertice_label, $curr_gi, $seq[$curr_crop_i_left], $curr_abs_pos ) );
|
|
1005
|
|
1006 foreach my $idx ( ( $curr_crop_i_left + 1 ) .. $curr_crop_i_right ) {
|
|
1007 $curr_abs_pos++;
|
|
1008 $curr_gi++;
|
|
1009
|
|
1010 # set vertice label as described above
|
|
1011 $vertice_label = ( defined $i_vp and ( $seq_vp[$idx] =~ /[a-z]/ ) ) ? 'V' : 'v';
|
|
1012 push( @vert, join( ' ', $vertice_label, $curr_gi, $seq[$idx], $curr_abs_pos ) );
|
|
1013 push( @edg, join( ' ', "e", $curr_gi - 1, $curr_gi, '> 1' ) );
|
|
1014 }
|
|
1015
|
|
1016 my @ret = ( @vert, @edg );
|
|
1017
|
|
1018 return \@ret;
|
|
1019 }
|
|
1020
|
|
1021 ###########################################################################
|
|
1022 # This method does the same as getStructGraph, but is extended to identify
|
|
1023 # the abstract parts of the structure and also add this to the graph
|
|
1024 # via abstract relations, see information below.
|
|
1025 # We already have the backbone graph, now the base-pair edges need to be
|
|
1026 # added to this graph and the abstract graph is added after this. Finally
|
|
1027 # the abstract relations, pointing from the abstract level to the basic
|
|
1028 # structure level is added. All the while, we keep track of the current
|
|
1029 # index graph. Furthermore, in the basic structure graph, we have vertices
|
|
1030 # stacks that connects the four stacked base-pairs involved. (symbols P
|
|
1031 # for vertex and p for edges). The abstract vertices are labelled according
|
|
1032 # to the given name-space, then #, then the abstract structure type, e.g.
|
|
1033 # HL for hairpin-loop. The edges between the structure types are denoted
|
|
1034 # with n, and the relation vertex is ^R, with ^r going from the abstract
|
|
1035 # structure to the relation and from the relation to the basic structure
|
|
1036 # is @r.
|
|
1037 #
|
|
1038 # Input:
|
|
1039 # curr_shrep : the current shrep as dot-bracket structure
|
|
1040 # curr_gi : the current graph index
|
|
1041 # win_size : the window size
|
|
1042 # cue : the input parameter -cue (whether to crop unpaired ends)
|
|
1043 # stacks : the input parameter -stacks (whether to add stack information)
|
|
1044 #
|
|
1045 # Output:
|
|
1046 # The structure graph information line-by-line as an array reference and
|
|
1047 # the current graph index.
|
|
1048 #
|
|
1049 ############################################################################
|
|
1050 sub getStructPlusAbstractStructGraph {
|
|
1051 my ( $curr_shrep, $curr_gi, $win_size, $cue, $stacks ) = @_;
|
|
1052
|
|
1053 # $win_size = length($curr_shrep->[0]);
|
|
1054
|
|
1055 my @struct = split( "", $curr_shrep->[0] );
|
|
1056
|
|
1057 # print "=================testing: new shrep : gi=$curr_gi ===============================\n" if ($i_debug);
|
|
1058
|
|
1059 # OBJECTS and VARIABLES
|
|
1060 # all indices are saved according to the current graph index, $curr_gi,
|
|
1061 # which is at the beginning of the current window, so that the nucleotide
|
|
1062 # index can be inferred using $idx + $curr_gi
|
|
1063 # opening brackets
|
|
1064 my @open_blks = (); # open blocks of consecutive opening brackets (array of arrays)
|
|
1065 my @p_open = (); # currently open block of the last identified complete block
|
|
1066 my @c_open = (); # current "incomplete" block of consecutive opening brackets
|
|
1067 # base-pair objects
|
|
1068 my %bps = (); # hash of all base-pairs with stem for a value, if it is at
|
|
1069 # the BP is opening or closing a stem object
|
|
1070 my @c_bp = (); # current BP, just being closed at current index pos (i,j)
|
|
1071 my @p_bp = (); # previously closed BP at position before current index
|
|
1072 # stem objects
|
|
1073 my %stems = (); # all stems, key="i:j,k:l", value=stem array, (i,j) outer BP
|
|
1074 my @c_stem = (); # current incomplete stem object
|
|
1075 my @stmbrks = (); # all stem-breaks in the form of (i,k,l), where i is the remaining
|
|
1076 # open base and (k,l) is the subsequent closing base-pair
|
|
1077
|
|
1078 # my @p_stmbrk = (); # previous stem-break in the form of (i,k,l), where i is the remaining
|
|
1079 # # open base and (k,l) is the subsequent closing base-pair
|
|
1080 # loop objects
|
|
1081 my @up = (); # all unpaired regions (i,j) that are of unkown type
|
|
1082 my @c_up = (); # current incomplete unpaired object
|
|
1083 my @hls = (); # all hairpin loop objects
|
|
1084 my @bls = (); # all bulge-loop objects
|
|
1085 my @ils = (); # all internal loops
|
|
1086 my @mls = (); # all multi-loops
|
|
1087 my @c_mls = (); # current incomplete multi-loops! (there can be more than one)
|
|
1088 my @els = (); # all external loops
|
|
1089 my @c_el = (); # current incomplete external loop
|
|
1090
|
|
1091 # iterate through shrep structure an identify abstract parts
|
|
1092 foreach my $idx ( 0 .. @struct - 1 ) {
|
|
1093
|
|
1094 #===================================================
|
|
1095 # current char is the opening bracket of a base-pair
|
|
1096 if ( $struct[$idx] eq "(" ) {
|
|
1097
|
|
1098 #===================================================
|
|
1099
|
|
1100 # update current window index to current graph index
|
|
1101 $idx += $curr_gi;
|
|
1102
|
|
1103 # case: '((' currently there is an open block, extend it
|
|
1104 if (@c_open) {
|
|
1105 push( @c_open, $idx );
|
|
1106
|
|
1107 # case: '.(' or ')(' or BOS'(' there is no open block, create a new one ".(" or ")("
|
|
1108 } else {
|
|
1109
|
|
1110 # begin a new current open block
|
|
1111 push( @c_open, $idx );
|
|
1112
|
|
1113 # case: ')('
|
|
1114 if (@c_stem) {
|
|
1115
|
|
1116 # CLOSE STEM
|
|
1117 print STDERR "TEST - closed stem:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@c_stem ) ) if ($i_debug);
|
|
1118 close_stem( \@c_stem, \%stems, \@p_bp, \@c_bp, \@p_open, \@stmbrks, \@open_blks, \%bps );
|
|
1119
|
|
1120 # case (x(x)(
|
|
1121 if (@open_blks) {
|
|
1122
|
|
1123 # EXTEND_ML, if one exists
|
|
1124 if (@c_mls) {
|
|
1125 extend_or_open_nested_ML( $curr_shrep, \@c_mls, \@p_bp, \@up, \@stmbrks, \@open_blks, $idx, $curr_gi );
|
|
1126
|
|
1127 # OPEN_ML
|
|
1128 } else {
|
|
1129
|
|
1130 # case 'x((x)(' the opening of the ML was a stem-break
|
|
1131 if ( @stmbrks
|
|
1132 && $stmbrks[-1]->[1] == $p_bp[0]
|
|
1133 && $stmbrks[-1]->[2] == $p_bp[1] ) {
|
|
1134 my @c_stmbrk = @{ pop(@stmbrks) };
|
|
1135 my @a = ( $c_stmbrk[0], -1 );
|
|
1136 my @ml = ();
|
|
1137 push( @ml, \@a ); # 1st base-pair is in-waiting
|
|
1138 my @a2 = ( $c_stmbrk[1], $c_stmbrk[2] );
|
|
1139 push( @ml, \@a2 ); #2nd BP
|
|
1140 my @a3 = ($idx);
|
|
1141 push( @ml, \@a3 ); #first of current BP
|
|
1142 push( @c_mls, \@ml );
|
|
1143 print STDERR "TEST - opened ML with SB:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) ) if ($i_debug);
|
|
1144
|
|
1145 # case 'x(.(x)('
|
|
1146 } elsif (@up) {
|
|
1147 my @tmp_up = @{ pop(@up) };
|
|
1148
|
|
1149 # compare adjacent BP to UP to see if they fit
|
|
1150 if ( $tmp_up[1] + 1 == $p_bp[0]
|
|
1151 && $open_blks[-1]->[-1] == $tmp_up[0] - 1 ) {
|
|
1152 my @a = ( $open_blks[-1]->[-1], -1 );
|
|
1153 my @ml = ();
|
|
1154 push( @ml, \@a ); # 1st BP awaits closing
|
|
1155 my @newbp = @p_bp;
|
|
1156 push( @ml, \@newbp ); # 2nd BP
|
|
1157 my @a2 = ($idx);
|
|
1158 push( @ml, \@a2 ); # current open BP, awaits closing
|
|
1159 push( @c_mls, \@ml );
|
|
1160 print STDERR "TEST - opened ML with prev UP1:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) )
|
|
1161
|
|
1162 if ($i_debug);
|
|
1163 } else {
|
|
1164 die "ERROR: the base-pairs to not match the " . "previous unpaired region?!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
|
|
1165 }
|
|
1166 } else {
|
|
1167 die "ERROR: in case ML, but there is no initial" . "part of the ML to be identified\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
|
|
1168 }
|
|
1169 }
|
|
1170
|
|
1171 # case: 'x(x)('
|
|
1172 } else {
|
|
1173
|
|
1174 # case: '.(x)(' => CLOSE_EL
|
|
1175 if (@c_el) {
|
|
1176 if ( $c_el[-1] == $p_bp[0] ) {
|
|
1177 push( @c_el, $p_bp[1] );
|
|
1178 my @newel = @c_el;
|
|
1179 push( @els, \@newel );
|
|
1180 @c_el = ();
|
|
1181 print STDERR "TEST - closed EL1:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newel ) ) if ($i_debug);
|
|
1182 } else {
|
|
1183 die "ERROR: the previous BP must match the current EL\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
|
|
1184 }
|
|
1185 }
|
|
1186
|
|
1187 # in both cases OPEN_EL
|
|
1188 push( @c_el, @p_bp );
|
|
1189 push( @c_el, $idx );
|
|
1190 print STDERR "TEST - opened EL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@c_el ) ) if ($i_debug);
|
|
1191 }
|
|
1192
|
|
1193 # case '.('
|
|
1194 } elsif (@c_up) {
|
|
1195
|
|
1196 # case (x(x).( OR 'x(.('
|
|
1197 if (@open_blks) {
|
|
1198
|
|
1199 # case 'x(x).(' ML
|
|
1200 if ( @p_bp && ( $p_bp[1] == $c_up[0] - 1 ) ) {
|
|
1201
|
|
1202 # case '((x)x(x).(' EXTEND_ML, if one exists
|
|
1203 if (@c_mls) {
|
|
1204
|
|
1205 extend_or_open_nested_ML( $curr_shrep, \@c_mls, \@p_bp, \@up, \@stmbrks, \@open_blks, $idx, $curr_gi );
|
|
1206
|
|
1207 # case 'x((x).(' or 'x(.(x).(' OPEN_ML
|
|
1208 } else {
|
|
1209
|
|
1210 # case 'x((x).(' the opening of the ML was a stem-break
|
|
1211 if ( @stmbrks
|
|
1212 && $stmbrks[-1]->[1] == $p_bp[0]
|
|
1213 && $stmbrks[-1]->[2] == $p_bp[1] ) {
|
|
1214 my @c_stmbrk = @{ pop(@stmbrks) };
|
|
1215 my @a = ( $c_stmbrk[0], -1 );
|
|
1216 my @ml = ();
|
|
1217 push( @ml, \@a ); # 1st base-pair is in-waiting
|
|
1218 my @a2 = ( $c_stmbrk[1], $c_stmbrk[2] );
|
|
1219 push( @ml, \@a2 ); #2nd BP
|
|
1220 my @a3 = ($idx);
|
|
1221 push( @ml, \@a3 ); #first of current BP
|
|
1222 push( @c_mls, \@ml );
|
|
1223 print STDERR "TEST - open ML with SB:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) ) if ($i_debug);
|
|
1224
|
|
1225 # case 'x(.(x).(' opening the ML with initial UP region
|
|
1226 } elsif (@up) {
|
|
1227
|
|
1228 # get previous unpaired region
|
|
1229 my $tmp = pop(@up);
|
|
1230 my @p_up = @{$tmp};
|
|
1231
|
|
1232 # case: '(.(x).(' compare adjacent BP to UP to see if they create a ML
|
|
1233 if ( $p_up[1] + 1 == $p_bp[0]
|
|
1234 && $open_blks[-1]->[-1] == $p_up[0] - 1 ) {
|
|
1235 my @a = ( $open_blks[-1]->[-1], -1 );
|
|
1236 my @ml = ();
|
|
1237 push( @ml, \@a ); # 1st BP awaits closing
|
|
1238 my @newbp = @p_bp;
|
|
1239 push( @ml, \@newbp ); # 2nd BP
|
|
1240 my @a2 = ($idx);
|
|
1241 push( @ml, \@a2 ); # current open BP, awaits closing
|
|
1242 push( @c_mls, \@ml );
|
|
1243 print STDERR "TEST - opened ML with prev UP2:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) )
|
|
1244
|
|
1245 if ($i_debug);
|
|
1246 }
|
|
1247
|
|
1248 # has to be an ML, but can't find first part
|
|
1249 } else {
|
|
1250 die "ERROR: in case ML, but there is no initial " . "part of the ML to be identified\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
|
|
1251 }
|
|
1252 }
|
|
1253
|
|
1254 # case '(.(
|
|
1255 } else {
|
|
1256 my @newup = @c_up;
|
|
1257 push( @up, \@newup );
|
|
1258 print STDERR "TEST - closed UP:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newup ) ) if ($i_debug);
|
|
1259 }
|
|
1260
|
|
1261 # case: 'x(x).(' OR 'EOS.('
|
|
1262 } else {
|
|
1263
|
|
1264 # case: '.(x)(' => CLOSE_EL
|
|
1265 if (@c_el) {
|
|
1266 if ( $c_el[-1] == $c_bp[0] ) {
|
|
1267 push( @c_el, $p_bp[1] );
|
|
1268 my @newel = @c_el;
|
|
1269 push( @els, \@newel );
|
|
1270 @c_el = ();
|
|
1271 print STDERR "TEST - closed EL2:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newel ) ) if ($i_debug);
|
|
1272 } else {
|
|
1273 die "ERROR: the previous BP must match the current EL\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
|
|
1274 }
|
|
1275 }
|
|
1276
|
|
1277 # in both cases OPEN_EL
|
|
1278 if (@p_bp) {
|
|
1279 push( @c_el, @p_bp );
|
|
1280 push( @c_el, $idx );
|
|
1281 } else {
|
|
1282
|
|
1283 # ignore this if -cue is given
|
|
1284 # TODO check the gi index
|
|
1285 if ($cue) {
|
|
1286 @c_el = ();
|
|
1287 } else {
|
|
1288 push( @c_el, ( $curr_gi, $curr_gi ) );
|
|
1289 push( @c_el, $idx );
|
|
1290 print STDERR "TEST - opened EL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@c_el ) ) if ($i_debug);
|
|
1291 }
|
|
1292 }
|
|
1293 }
|
|
1294 @c_up = ();
|
|
1295
|
|
1296 } else {
|
|
1297
|
|
1298 # no more cases except opening bracket at beginning of sequence
|
|
1299 # in this case do nothing, as the index has already been added
|
|
1300 die "ERROR: '((', ')(', '.(' have all been covered\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) )
|
|
1301 unless ( $idx == $curr_gi );
|
|
1302
|
|
1303 }
|
|
1304 }
|
|
1305
|
|
1306 #===================================================
|
|
1307 # current char is the unpaired base
|
|
1308 } elsif ( $struct[$idx] eq "." ) {
|
|
1309
|
|
1310 #===================================================
|
|
1311
|
|
1312 # update current window index to current graph index
|
|
1313 $idx += $curr_gi;
|
|
1314
|
|
1315 # case ').' => CLOSE_STEM
|
|
1316 if (@c_stem) {
|
|
1317 print STDERR "TEST - closed stem:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@c_stem ) ) if ($i_debug);
|
|
1318 close_stem( \@c_stem, \%stems, \@p_bp, \@c_bp, \@p_open, \@stmbrks, \@open_blks, \%bps );
|
|
1319 }
|
|
1320
|
|
1321 # case '..' extend UP, @c_up
|
|
1322 if (@c_up) {
|
|
1323 $c_up[1] = $idx;
|
|
1324
|
|
1325 # case 'x.' open new UP
|
|
1326 } else {
|
|
1327
|
|
1328 # OPEN_UP, @c_up
|
|
1329 @c_up = ( $idx, $idx );
|
|
1330
|
|
1331 # case '(.' just come to the end of an open block, push it onto stack
|
|
1332 if (@c_open) {
|
|
1333 my @newopen = @c_open;
|
|
1334 push( @open_blks, \@newopen );
|
|
1335 @c_open = ();
|
|
1336
|
|
1337 # case ').' or '.'
|
|
1338 # (have already closed stem and created stem-break and emptied p_open)
|
|
1339 } else {
|
|
1340
|
|
1341 # case 'x(x).' or '.' => CLOSE_EL
|
|
1342 if ( !@open_blks ) {
|
|
1343
|
|
1344 # case 'x().().'
|
|
1345 if (@c_el) {
|
|
1346
|
|
1347 # double check
|
|
1348 if ( $c_el[-1] == $p_bp[0] ) {
|
|
1349 push( @c_el, $p_bp[1] );
|
|
1350 my @newel = @c_el;
|
|
1351 push( @els, \@newel );
|
|
1352 @c_el = ();
|
|
1353 print STDERR "TEST - closed EL3:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newel ) ) if ($i_debug);
|
|
1354 } else {
|
|
1355 die "This case should not occur!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
|
|
1356 }
|
|
1357 }
|
|
1358 }
|
|
1359 }
|
|
1360 }
|
|
1361
|
|
1362 #===================================================
|
|
1363 # current char is the closing bracket of a base-pair
|
|
1364 } elsif ( $struct[$idx] eq ")" ) {
|
|
1365
|
|
1366 #===================================================
|
|
1367
|
|
1368 # update current window index to current graph index
|
|
1369 $idx += $curr_gi;
|
|
1370
|
|
1371 # save previous base-pair
|
|
1372 if (@c_bp) {
|
|
1373 @p_bp = @c_bp;
|
|
1374 }
|
|
1375
|
|
1376 # case: '((x))', extend stem
|
|
1377 if (@p_open) {
|
|
1378
|
|
1379 # get current base-pair
|
|
1380 @c_bp = ( pop(@p_open), $idx );
|
|
1381
|
|
1382 # add it to the base-pair hash
|
|
1383 $bps{"$c_bp[0]:$c_bp[1]"} = "";
|
|
1384
|
|
1385 # EXTEND_STEM
|
|
1386 if (@c_stem) {
|
|
1387 $c_stem[0] = $c_bp[0];
|
|
1388 $c_stem[1] = $c_bp[1];
|
|
1389 } else {
|
|
1390 die "ERROR: in case '))' there has to be an open stem object!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
|
|
1391 }
|
|
1392
|
|
1393 # case: '(x(x))' or '(x.)', '()' not allowed
|
|
1394 } else {
|
|
1395
|
|
1396 # case: '(x(x))' CLOSE_STEM close previous stem
|
|
1397 if (@c_stem) {
|
|
1398 print STDERR "TEST - closed stem:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@c_stem ) ) if ($i_debug);
|
|
1399 close_stem( \@c_stem, \%stems, \@p_bp, \@c_bp, \@p_open, \@stmbrks, \@open_blks, \%bps );
|
|
1400
|
|
1401 # case '(x(x))' OPEN_STEM
|
|
1402 if (@open_blks) {
|
|
1403 my $tmp = pop(@open_blks);
|
|
1404 @p_open = @{$tmp};
|
|
1405
|
|
1406 # get current base-pair
|
|
1407 @c_bp = ( pop(@p_open), $idx );
|
|
1408
|
|
1409 # add it to the base-pair hash
|
|
1410 # as the stem is not finished yet, cannot add stem information
|
|
1411 $bps{"$c_bp[0]:$c_bp[1]"} = "";
|
|
1412
|
|
1413 # open stem
|
|
1414 $c_stem[0] = $c_bp[0];
|
|
1415 $c_stem[1] = $c_bp[1];
|
|
1416 $c_stem[2] = $c_bp[0];
|
|
1417 $c_stem[3] = $c_bp[1];
|
|
1418 } else {
|
|
1419 die "ERROR: there have to be open blocks to match " . "current closing bracket!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
|
|
1420 }
|
|
1421
|
|
1422 # case: '(.(x))' => CLOSE_BL
|
|
1423 if ( @up && $up[-1]->[0] - 1 == $c_bp[0] && $up[-1]->[1] + 1 == $p_bp[0] ) {
|
|
1424 my @newBL = ( @c_bp, @p_bp );
|
|
1425 push( @bls, \@newBL );
|
|
1426 pop(@up);
|
|
1427 print STDERR "TEST - closed BL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newBL ) ) if ($i_debug);
|
|
1428
|
|
1429 # case: '(x(x).(x))' => CLOSE_ML
|
|
1430 } elsif ( @c_mls && $c_mls[-1]->[-1]->[-1] == $p_bp[0] ) {
|
|
1431 my @newml = @{ pop(@c_mls) };
|
|
1432 push( @{ $newml[-1] }, $p_bp[1] ); # close last BP
|
|
1433 $newml[0]->[1] = $c_bp[1]; # close 1st BP
|
|
1434 push( @mls, \@newml );
|
|
1435 print STDERR "TEST - closed ML1:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newml, 1 ) ) if ($i_debug);
|
|
1436 } else {
|
|
1437 die "ERROR: what is this case?\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
|
|
1438 }
|
|
1439
|
|
1440 # case: (x.)
|
|
1441 } else {
|
|
1442
|
|
1443 # OPEN_STEM open new stem
|
|
1444 if (@open_blks) {
|
|
1445 @p_open = @{ pop(@open_blks) };
|
|
1446
|
|
1447 # get current base-pair
|
|
1448 @c_bp = ( pop(@p_open), $idx );
|
|
1449
|
|
1450 # add it to the base-pair hash
|
|
1451 # as the stem is not finished yet, cannot add stem information
|
|
1452 $bps{"$c_bp[0]:$c_bp[1]"} = "";
|
|
1453
|
|
1454 # open stem
|
|
1455 $c_stem[0] = $c_bp[0];
|
|
1456 $c_stem[1] = $c_bp[1];
|
|
1457 $c_stem[2] = $c_bp[0];
|
|
1458 $c_stem[3] = $c_bp[1];
|
|
1459 } else {
|
|
1460 die "ERROR: there have to be open blocks to match " . "current closing bracket!$curr_shrep->[0]\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
|
|
1461 }
|
|
1462
|
|
1463 # CLOSE_C_UP
|
|
1464 if (@c_up) {
|
|
1465
|
|
1466 # case: (.) => close_HL
|
|
1467 if ( $c_up[0] - 1 == $c_bp[0] ) {
|
|
1468 my @newhl = @c_bp;
|
|
1469 push( @hls, \@newhl );
|
|
1470 @c_up = ();
|
|
1471 print STDERR "TEST - closed HL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newhl ) ) if ($i_debug);
|
|
1472
|
|
1473 # case: (x(x).)
|
|
1474 } elsif ( @p_bp && $c_up[0] - 1 == $p_bp[1] ) {
|
|
1475
|
|
1476 # case: ((x).) => CLOSE_BL
|
|
1477 if ( @stmbrks
|
|
1478 && $stmbrks[-1]->[0] == $c_bp[0]
|
|
1479 && $stmbrks[-1]->[1] == $p_bp[0]
|
|
1480 && $stmbrks[-1]->[2] == $p_bp[1] ) {
|
|
1481 my @newbl = ( @c_bp, @p_bp );
|
|
1482 push( @bls, \@newbl );
|
|
1483 pop(@stmbrks);
|
|
1484 @c_up = ();
|
|
1485 print STDERR "TEST - closed BL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newbl ) ) if ($i_debug);
|
|
1486
|
|
1487 # case: (x(x).), either CLOSE_ML or CLOSE_IL
|
|
1488 } else {
|
|
1489
|
|
1490 # case '(.(x).)' => CLOSE_IL
|
|
1491 if ( @up
|
|
1492 && ( $up[-1]->[0] - 1 == $c_bp[0] )
|
|
1493 && ( $up[-1]->[1] + 1 == $p_bp[0] ) ) {
|
|
1494 my @p_up = @{ pop(@up) };
|
|
1495 my @newil = ( @c_bp, @p_bp );
|
|
1496 push( @ils, \@newil );
|
|
1497 @c_up = ();
|
|
1498 print STDERR "TEST - closed IL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newil ) ) if ($i_debug);
|
|
1499
|
|
1500 # case (x(x)x(x).) => CLOSE_ML remember, array of tuples
|
|
1501 } elsif ( @c_mls && $c_mls[-1]->[-1]->[0] == $p_bp[0] ) {
|
|
1502 my @newml = @{ pop(@c_mls) };
|
|
1503 push( @{ $newml[-1] }, $p_bp[1] ); # close final BP
|
|
1504 $newml[0]->[1] = $c_bp[1]; # close 1st BP
|
|
1505 push( @mls, \@newml );
|
|
1506 @c_up = ();
|
|
1507 print STDERR "TEST - closed ML2:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newml, 1 ) ) if ($i_debug);
|
|
1508 } else {
|
|
1509 die "There should be no such case!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
|
|
1510 }
|
|
1511 }
|
|
1512
|
|
1513 # there should be no more unknown unpaired regions left!
|
|
1514 } else {
|
|
1515 die "There shouldn't be any unknown unpaired " . "regions left\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
|
|
1516 }
|
|
1517
|
|
1518 } else {
|
|
1519 die "ERROR: '()' is not allowed in $curr_shrep->[0]\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
|
|
1520 }
|
|
1521 }
|
|
1522 }
|
|
1523 }
|
|
1524 }
|
|
1525
|
|
1526 # EOS
|
|
1527
|
|
1528 # CLOSE_STEM
|
|
1529 print STDERR "TEST - closed stem EOS:\n" . ( test( $curr_shrep->[0], ( $win_size + $curr_gi ), $curr_gi, \@c_stem ) )
|
|
1530 if ( @c_stem && $i_debug );
|
|
1531 close_stem( \@c_stem, \%stems, \@p_bp, \@c_bp, \@p_open, \@stmbrks, \@open_blks, \%bps ) if (@c_stem);
|
|
1532 die "ERROR: there should not be an open ML!\n" . $curr_shrep . "\n"
|
|
1533 if (@c_mls);
|
|
1534
|
|
1535 # case ().() CLOSE_EL
|
|
1536 if (@c_el) {
|
|
1537 if ( $c_el[-1] == $p_bp[0] ) {
|
|
1538 push( @c_el, $p_bp[1] );
|
|
1539 my @newel = @c_el;
|
|
1540 push( @els, \@newel );
|
|
1541 @c_el = ();
|
|
1542
|
|
1543 print STDERR "TEST - closed EL4 EOS:\n" . ( test( $curr_shrep->[0], ( $win_size + $curr_gi ), $curr_gi, \@newel ) )
|
|
1544 if ($i_debug);
|
|
1545 } else {
|
|
1546 die "ERROR: the base-pairs in the EL don't match" . $curr_shrep . "\n";
|
|
1547 }
|
|
1548 }
|
|
1549
|
|
1550 # case (). CLOSE_EL
|
|
1551 if (@c_up) {
|
|
1552 unless ($cue) {
|
|
1553 if (@p_bp) {
|
|
1554 push( @c_el, @p_bp );
|
|
1555 } else {
|
|
1556 push( @c_el, ( $curr_gi, $curr_gi ) );
|
|
1557 }
|
|
1558
|
|
1559 push( @c_el, ( ( $curr_gi + $win_size - 1 ), ( $curr_gi + $win_size - 1 ) ) );
|
|
1560 my @newel = @c_el;
|
|
1561 push( @els, \@newel );
|
|
1562 @c_el = ();
|
|
1563 print STDERR "TEST - closed EL5 EOS:\n" . ( test( $curr_shrep->[0], ( $win_size + $curr_gi ), $curr_gi, \@newel ) )
|
|
1564 if ($i_debug);
|
|
1565 ## check case where cue is given, but window is completely unstructured
|
|
1566 } else {
|
|
1567 unless (@p_bp) {
|
|
1568 my @newel = ( $curr_gi, $curr_gi, ( $curr_gi + $win_size - 1 ), ( $curr_gi + $win_size - 1 ) );
|
|
1569 push( @els, \@newel );
|
|
1570 }
|
|
1571 }
|
|
1572 }
|
|
1573
|
|
1574 # # update current graph index (since all shrep indices have been read)
|
|
1575 $curr_gi += $win_size;
|
|
1576
|
|
1577 # ===================================================
|
|
1578 # build graph lines
|
|
1579 my @graph_lines = ();
|
|
1580
|
|
1581 # ===================================================
|
|
1582
|
|
1583 # graph line objects
|
|
1584 my @edg = ();
|
|
1585 my @stackgraph = ();
|
|
1586 my @abstrgraph = ();
|
|
1587
|
|
1588 #-------------------> stems,stacks
|
|
1589 # build basic graph edges, stacks, and vertices for stems in the abstract graph
|
|
1590 my @tmp = keys %stems;
|
|
1591 foreach my $sk (@tmp) {
|
|
1592 my @stem = @{ $stems{$sk} }; # stem (i,j,k,l)=> (i,j)= (0,1) and (k,l)=(2,3)
|
|
1593 die "ERROR: There are not enough elements in this stem: ", ( join( ",", @stem ) ), "\n" unless ( scalar(@stem) == 4 );
|
|
1594
|
|
1595 # infer base-pairs from stem
|
|
1596 @p_bp = ();
|
|
1597 @c_bp = ();
|
|
1598 my @stem_relation_idxs = ();
|
|
1599 my @stack_relation_idxs = ();
|
|
1600 my $stacksize = $stem[2] - $stem[0] + 1;
|
|
1601 for ( my $x = 0 ; $x < $stacksize ; $x++ ) {
|
|
1602
|
|
1603 # for (my $i = $stem[0]; $i <= $stem[2] ; $i++){
|
|
1604 # for(my $j = $stem[1]; $j >= $stem[3]; $j--){
|
|
1605 my $i = $stem[0] + $x;
|
|
1606 my $j = $stem[1] - $x;
|
|
1607 @p_bp = @c_bp;
|
|
1608 @c_bp = ( $i, $j );
|
|
1609 if ( defined $bps{"$i:$j"} ) {
|
|
1610
|
|
1611 # add indices to stem relation object
|
|
1612 push( @stem_relation_idxs, $i );
|
|
1613 push( @stem_relation_idxs, $j );
|
|
1614
|
|
1615 # add edge to graph edges
|
|
1616 push( @edg, "e $i $j s" );
|
|
1617
|
|
1618 # add stack if option given
|
|
1619 if ($stacks) {
|
|
1620 if (@p_bp) {
|
|
1621 push( @stackgraph, "v $curr_gi P" );
|
|
1622 push( @stackgraph, "e $curr_gi $p_bp[0] p" );
|
|
1623 push( @stackgraph, "e $curr_gi $p_bp[1] p" );
|
|
1624 push( @stackgraph, "e $c_bp[0] $curr_gi p" );
|
|
1625 push( @stackgraph, "e $c_bp[1] $curr_gi p" );
|
|
1626 push( @stack_relation_idxs, $curr_gi );
|
|
1627
|
|
1628 # update graph index
|
|
1629 ++$curr_gi;
|
|
1630 }
|
|
1631 }
|
|
1632 } else {
|
|
1633 die "ERROR: There is no such base-pair in the base-pair hash:" . " ($i, $j)\n";
|
|
1634 }
|
|
1635 }
|
|
1636
|
|
1637 # }
|
|
1638 # }
|
|
1639 # add stem to abstract graph
|
|
1640 # (graph indices become a bit jumbled, but saves time)
|
|
1641 push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#S" );
|
|
1642
|
|
1643 # stem object now saves the curr_gi index for that stem
|
|
1644 $stems{$sk} = $curr_gi;
|
|
1645 ++$curr_gi;
|
|
1646
|
|
1647 # add relations
|
|
1648 push( @abstrgraph, "v $curr_gi ^R" );
|
|
1649 push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" );
|
|
1650 foreach my $r (@stem_relation_idxs) {
|
|
1651 push( @abstrgraph, "e $curr_gi $r \@r" );
|
|
1652 }
|
|
1653 if ($stacks) {
|
|
1654 foreach my $r (@stack_relation_idxs) {
|
|
1655 push( @abstrgraph, "e $curr_gi $r \@r" );
|
|
1656 }
|
|
1657 }
|
|
1658 ++$curr_gi;
|
|
1659 }
|
|
1660
|
|
1661 #-------------------> add HLs
|
|
1662 foreach my $hl (@hls) {
|
|
1663
|
|
1664 # check size
|
|
1665 die "ERROR: the HL object is the incorrect size\n" unless ( @{$hl} == 2 );
|
|
1666
|
|
1667 # add vertex
|
|
1668 push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#HL" );
|
|
1669
|
|
1670 # add edge from stem to hl
|
|
1671 if ( defined( $stems{ $bps{"$hl->[0]:$hl->[1]"} } ) ) {
|
|
1672 push( @abstrgraph, "e " . $stems{ $bps{"$hl->[0]:$hl->[1]"} } . " $curr_gi n" );
|
|
1673 } else {
|
|
1674 die "ERROR: stem not defined in base-pair hash for $hl->[0]:$hl->[1]\n";
|
|
1675 }
|
|
1676 ++$curr_gi;
|
|
1677
|
|
1678 # add relations
|
|
1679 push( @abstrgraph, "v $curr_gi ^R" );
|
|
1680 push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" );
|
|
1681
|
|
1682 # add all nodes between i to j, inclusively
|
|
1683 for ( my $r = $hl->[0] ; $r <= $hl->[1] ; $r++ ) {
|
|
1684 push( @abstrgraph, "e $curr_gi $r \@r" );
|
|
1685 }
|
|
1686 ++$curr_gi;
|
|
1687 }
|
|
1688
|
|
1689 #------------------> add BLs
|
|
1690 foreach my $bl (@bls) {
|
|
1691
|
|
1692 # check size
|
|
1693 die "ERROR: the BL object is the incorrect size\n" unless ( scalar( @{$bl} ) == 4 );
|
|
1694
|
|
1695 # add vertex
|
|
1696 push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#BL" );
|
|
1697
|
|
1698 # add edges from bl to both adjacent stems
|
|
1699 if ( defined( $stems{ $bps{"$bl->[0]:$bl->[1]"} } )
|
|
1700 && defined( $stems{ $bps{"$bl->[2]:$bl->[3]"} } ) ) {
|
|
1701
|
|
1702 # add outer stem: S->BL
|
|
1703 push( @abstrgraph, "e " . $stems{ $bps{"$bl->[0]:$bl->[1]"} } . " $curr_gi n" );
|
|
1704
|
|
1705 # add inner stem: BL->S
|
|
1706 push( @abstrgraph, "e $curr_gi " . $stems{ $bps{"$bl->[2]:$bl->[3]"} } . " n" );
|
|
1707 } else {
|
|
1708 die "ERROR: stems not defined for BL\n";
|
|
1709 }
|
|
1710 ++$curr_gi;
|
|
1711
|
|
1712 # add relations
|
|
1713 push( @abstrgraph, "v $curr_gi ^R" );
|
|
1714 push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" );
|
|
1715
|
|
1716 # add all nodes between i to k and l to j, inclusively
|
|
1717 for ( my $r = $bl->[0] ; $r <= $bl->[2] ; $r++ ) {
|
|
1718 push( @abstrgraph, "e $curr_gi $r \@r" );
|
|
1719 }
|
|
1720 for ( my $r = $bl->[3] ; $r <= $bl->[1] ; $r++ ) {
|
|
1721 push( @abstrgraph, "e $curr_gi $r \@r" );
|
|
1722 }
|
|
1723 ++$curr_gi;
|
|
1724 }
|
|
1725
|
|
1726 #------------------> add ILs
|
|
1727 foreach my $il (@ils) {
|
|
1728
|
|
1729 # check size
|
|
1730 die "ERROR: the IL object is the incorrect size\n" unless ( scalar( @{$il} ) == 4 );
|
|
1731
|
|
1732 # add vertex
|
|
1733 push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#IL" );
|
|
1734
|
|
1735 # add edges from il to both adjacent stems
|
|
1736 if ( defined( $stems{ $bps{"$il->[0]:$il->[1]"} } )
|
|
1737 && defined( $stems{ $bps{"$il->[2]:$il->[3]"} } ) ) {
|
|
1738
|
|
1739 # add outer stem: S->IL
|
|
1740 push( @abstrgraph, "e " . $stems{ $bps{"$il->[0]:$il->[1]"} } . " $curr_gi n" );
|
|
1741
|
|
1742 # add inner stem: IL->S
|
|
1743 push( @abstrgraph, "e $curr_gi " . $stems{ $bps{"$il->[2]:$il->[3]"} } . " n" );
|
|
1744 } else {
|
|
1745 die "ERROR: stems not defined for IL\n";
|
|
1746 }
|
|
1747 ++$curr_gi;
|
|
1748
|
|
1749 # add relations
|
|
1750 push( @abstrgraph, "v $curr_gi ^R" );
|
|
1751 push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" );
|
|
1752
|
|
1753 # add all nodes between i to k and l to j, inclusively
|
|
1754 for ( my $r = $il->[0] ; $r <= $il->[2] ; $r++ ) {
|
|
1755 push( @abstrgraph, "e $curr_gi $r \@r" );
|
|
1756 }
|
|
1757 for ( my $r = $il->[3] ; $r <= $il->[1] ; $r++ ) {
|
|
1758 push( @abstrgraph, "e $curr_gi $r \@r" );
|
|
1759 }
|
|
1760 ++$curr_gi;
|
|
1761 }
|
|
1762
|
|
1763 #------------------> add ELs
|
|
1764 foreach my $el (@els) {
|
|
1765
|
|
1766 #check size
|
|
1767 die "ERROR: the EL object is the incorrect size\n" unless ( scalar( @{$el} ) == 4 );
|
|
1768
|
|
1769 # add vertex
|
|
1770 push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#EL" );
|
|
1771
|
|
1772 # add edges from el to both adjacent stems, if available
|
|
1773 unless ( $el->[0] == $el->[1] ) {
|
|
1774 if ( defined( $stems{ $bps{"$el->[0]:$el->[1]"} } ) ) {
|
|
1775 push( @abstrgraph, "e " . $stems{ $bps{"$el->[0]:$el->[1]"} } . " $curr_gi n" );
|
|
1776 } else {
|
|
1777 die "ERROR: stems not defined for left BP of EL\n";
|
|
1778 }
|
|
1779 }
|
|
1780 unless ( $el->[2] == $el->[3] ) {
|
|
1781 if ( defined( $stems{ $bps{"$el->[2]:$el->[3]"} } ) ) {
|
|
1782 push( @abstrgraph, "e $curr_gi " . $stems{ $bps{"$el->[2]:$el->[3]"} } . " n" );
|
|
1783 } else {
|
|
1784 die "ERROR: stems not defined for right BP of EL\n";
|
|
1785 }
|
|
1786 }
|
|
1787 ++$curr_gi;
|
|
1788
|
|
1789 # add relations
|
|
1790 push( @abstrgraph, "v $curr_gi ^R" );
|
|
1791 push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" );
|
|
1792
|
|
1793 # @r relation edges
|
|
1794 for ( my $r = $el->[1] ; $r <= $el->[2] ; $r++ ) {
|
|
1795 push( @abstrgraph, "e $curr_gi $r \@r" );
|
|
1796 }
|
|
1797
|
|
1798 # if BP, not EOS
|
|
1799 unless ( $el->[0] == $el->[1] ) {
|
|
1800 push( @abstrgraph, "e $curr_gi $el->[0] \@r" );
|
|
1801 }
|
|
1802
|
|
1803 # if BP, not EOS
|
|
1804 unless ( $el->[2] == $el->[3] ) {
|
|
1805 push( @abstrgraph, "e $curr_gi $el->[3] \@r" );
|
|
1806 }
|
|
1807 ++$curr_gi;
|
|
1808 }
|
|
1809
|
|
1810 #------------------> add MLs
|
|
1811 foreach my $ml (@mls) {
|
|
1812
|
|
1813 #check size
|
|
1814 die "ERROR: the ML object is not big enough!\n" unless ( scalar( @{$ml} ) >= 3 );
|
|
1815
|
|
1816 # add vertex
|
|
1817 push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#ML" );
|
|
1818
|
|
1819 # add edge from outer stem to first BP
|
|
1820 my $closing_bp = shift( @{$ml} );
|
|
1821 if ( defined( $stems{ $bps{"$closing_bp->[0]:$closing_bp->[1]"} } ) ) {
|
|
1822 push( @abstrgraph, "e " . $stems{ $bps{"$closing_bp->[0]:$closing_bp->[1]"} } . " $curr_gi n" );
|
|
1823 } else {
|
|
1824 die "ERROR: ML is not defined for BP $closing_bp->[0]:$closing_bp->[1]\n";
|
|
1825 }
|
|
1826
|
|
1827 # add remaining stems
|
|
1828 foreach my $bp ( @{$ml} ) {
|
|
1829 if ( defined( $stems{ $bps{"$bp->[0]:$bp->[1]"} } ) ) {
|
|
1830 push( @abstrgraph, "e $curr_gi " . $stems{ $bps{"$bp->[0]:$bp->[1]"} } . " n" );
|
|
1831 } else {
|
|
1832 die "ERROR: ML is not defined for BP $bp->[0]:$bp->[1]\n";
|
|
1833 }
|
|
1834 }
|
|
1835 ++$curr_gi;
|
|
1836
|
|
1837 # add relations
|
|
1838 push( @abstrgraph, "v $curr_gi ^R" );
|
|
1839 push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" );
|
|
1840
|
|
1841 # add @r relation edges between i and k, where (i,j) is the first BP and (k,l)
|
|
1842 # the second BP
|
|
1843 my $pbp_aref = $closing_bp;
|
|
1844 my $cbp_aref = shift( @{$ml} );
|
|
1845 for ( my $r = $pbp_aref->[0] ; $r <= $cbp_aref->[0] ; $r++ ) {
|
|
1846 push( @abstrgraph, "e $curr_gi $r \@r" );
|
|
1847 }
|
|
1848
|
|
1849 # rest of @r relation edges
|
|
1850 foreach my $bp ( @{$ml} ) {
|
|
1851 $pbp_aref = $cbp_aref;
|
|
1852 $cbp_aref = $bp;
|
|
1853 for ( my $r = $pbp_aref->[1] ; $r <= $cbp_aref->[0] ; $r++ ) {
|
|
1854 push( @abstrgraph, "e $curr_gi $r \@r" );
|
|
1855 }
|
|
1856 }
|
|
1857
|
|
1858 # add final stretch between last BP (m,n) and first closing base-pair (i,j)
|
|
1859 for ( my $r = $cbp_aref->[1] ; $r <= $closing_bp->[1] ; $r++ ) {
|
|
1860 push( @abstrgraph, "e $curr_gi $r \@r" );
|
|
1861 }
|
|
1862 ++$curr_gi;
|
|
1863 }
|
|
1864
|
|
1865 @graph_lines = ( @edg, @stackgraph, @abstrgraph );
|
|
1866 return ( \@graph_lines, $curr_gi );
|
|
1867 }
|
|
1868
|
|
1869 # just to test the current shrep dot-bracket structure parsing
|
|
1870 sub test {
|
|
1871 my ( $shrep, $idx, $gi, $mark_aref, $isML ) = @_;
|
|
1872
|
|
1873 my $result = "";
|
|
1874
|
|
1875 $result .= "$shrep\n";
|
|
1876 my @a = split( "", $shrep );
|
|
1877 for ( my $i = 0 ; $i <= @a ; $i++ ) {
|
|
1878 my $mark = 0;
|
|
1879 foreach my $j ( @{$mark_aref} ) {
|
|
1880 if ($isML) {
|
|
1881 $mark = 1 if ( $i == $j->[0] - $gi );
|
|
1882 $mark = 1 if ( scalar( @{$j} == 2 && $i == $j->[1] - $gi ) );
|
|
1883 } else {
|
|
1884 $mark = 1 if ( $i == $j - $gi );
|
|
1885 }
|
|
1886
|
|
1887 }
|
|
1888 if ( $i == $idx - $gi ) {
|
|
1889 $result .= "^";
|
|
1890 } elsif ($mark) {
|
|
1891 $result .= "'";
|
|
1892 } else {
|
|
1893 $result .= " ";
|
|
1894 }
|
|
1895 }
|
|
1896 $result .= "\n";
|
|
1897
|
|
1898 return $result;
|
|
1899 }
|
|
1900
|
|
1901 ###########################################################################
|
|
1902 # Extends or closes a multi-loop. This means a ML already exists.
|
|
1903 # We can have the case ((x)( or ((x).(
|
|
1904 #
|
|
1905 # Input:
|
|
1906 # curr_shrep The current shrep object, with shrep string at pos 0
|
|
1907 # c_ml_aref The current open MLs array reference
|
|
1908 # p_bp_aref The previous base-pair array reference
|
|
1909 # stmbrks_aref The stem-breaks array reference
|
|
1910 # open_blks_aref The open bracket blocks array reference
|
|
1911 # idx The current index
|
|
1912 # #curr_gi The current graph index
|
|
1913 # Output:
|
|
1914 # None, it should just modify the ML accordingly
|
|
1915 ############################################################################
|
|
1916 sub extend_or_open_nested_ML {
|
|
1917 my ( $curr_shrep, $c_mls_aref, $p_bp_aref, $up_aref, $stmbrks_aref, $open_blks_aref, $idx, $curr_gi ) = @_;
|
|
1918
|
|
1919 # case '(x(x)x(x)x(' extend current ML
|
|
1920 if ( $c_mls_aref->[-1]->[-1]->[-1] == $p_bp_aref->[0] ) {
|
|
1921 push( @{ $c_mls_aref->[-1]->[-1] }, $p_bp_aref->[1] ); #close previous BP
|
|
1922 my @a = ($idx);
|
|
1923 push( @{ $c_mls_aref->[-1] }, \@a ); # add new open base-pair
|
|
1924 print STDERR "TEST - extended ML:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, $c_mls_aref->[-1], 1 ) ) if ($i_debug);
|
|
1925
|
|
1926 # case '(x(x)x((x)(' nested ML, => OPEN_ML with stem-break
|
|
1927 } elsif ( @$stmbrks_aref
|
|
1928 && $stmbrks_aref->[-1]->[1] == $p_bp_aref->[0]
|
|
1929 && $stmbrks_aref->[-1]->[2] == $p_bp_aref->[1] ) {
|
|
1930
|
|
1931 my @c_stmbrk = @{ pop( @{$stmbrks_aref} ) };
|
|
1932 my @a = ( $c_stmbrk[0], -1 );
|
|
1933 my @ml = ();
|
|
1934 push( @ml, \@a ); # 1st base-pair is in-waiting
|
|
1935 my @a2 = ( $c_stmbrk[1], $c_stmbrk[2] );
|
|
1936 push( @ml, \@a2 ); #2nd BP
|
|
1937 my @a3 = ($idx);
|
|
1938 push( @ml, \@a3 ); #first of current BP
|
|
1939 push( @{$c_mls_aref}, \@ml );
|
|
1940 print STDERR "TEST - opened nested ML with SB:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) ) if ($i_debug);
|
|
1941
|
|
1942 # case '(x(x)x(.(x)(' nested ML, => OPEN_ML with UP
|
|
1943 } elsif ( @$up_aref
|
|
1944 && $up_aref->[-1]->[0] - 1 == $open_blks_aref->[-1]->[-1]
|
|
1945 && $up_aref->[-1]->[1] + 1 == $p_bp_aref->[0] ) {
|
|
1946 my @tmp_up = @{ pop( @{$up_aref} ) };
|
|
1947 my @a = ( $open_blks_aref->[-1]->[-1], -1 );
|
|
1948 my @ml = ();
|
|
1949 push( @ml, \@a ); # 1st BP awaits closing
|
|
1950 my @newbp = @{$p_bp_aref};
|
|
1951 push( @ml, \@newbp ); #2nd BP
|
|
1952 my @a2 = ($idx);
|
|
1953 push( @ml, \@a2 ); # current open BP, awaits closing
|
|
1954 push( @{$c_mls_aref}, \@ml );
|
|
1955 print STDERR "TEST - opened nested ML with prev UP\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) ) if ($i_debug);
|
|
1956 } else {
|
|
1957 die "ERROR: in the case extend or open nested ML, " . "but the base-pairs don't fit for either!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ) if ($i_debug);
|
|
1958 }
|
|
1959 }
|
|
1960
|
|
1961 ###########################################################################
|
|
1962 # Closes a stem-object for the abstract graph. If necessary, it also
|
|
1963 # creates a stem-break because the current open block is still open,
|
|
1964 # which can only mean that there is a stem-break between two consecutive
|
|
1965 # opening brackets.
|
|
1966 #
|
|
1967 # Input:
|
|
1968 # c_stem_aref : current stem array reference
|
|
1969 # stems_href : all stems hash reference
|
|
1970 # p_bp_aref : previous base-pair array reference
|
|
1971 # c_cp_aref : current base-pair array reference
|
|
1972 # p_open_aref : previously opened open-bracket block
|
|
1973 # p_stmbrk_aref : the previous stem-break
|
|
1974 # open_blks_aref : all old open blocks array reference
|
|
1975 # bps_href : all base-pairs hash reference
|
|
1976 #
|
|
1977 # Output:
|
|
1978 # None, it should just modify the input variables accordingly
|
|
1979 ############################################################################
|
|
1980 sub close_stem {
|
|
1981 my ( $c_stem_aref, $stems_href, $p_bp_aref, $c_bp_aref, $p_open_aref, $stmbrks_aref, $open_blks_aref, $bps_href ) = @_;
|
|
1982
|
|
1983 my @newstem = @$c_stem_aref;
|
|
1984 my $stemkey = "$newstem[0]:$newstem[1],$newstem[2]:$newstem[3]";
|
|
1985 $stems_href->{$stemkey} = \@newstem;
|
|
1986
|
|
1987 # reset and clean running variables
|
|
1988 @$c_stem_aref = ();
|
|
1989 @$p_bp_aref = @$c_bp_aref;
|
|
1990 @$c_bp_aref = ();
|
|
1991
|
|
1992 # add stem to base-pair hash
|
|
1993 $bps_href->{"$newstem[0]:$newstem[1]"} = $stemkey;
|
|
1994 $bps_href->{"$newstem[2]:$newstem[3]"} = $stemkey;
|
|
1995
|
|
1996 # there are still bases open in the previous open block, '[[]x',
|
|
1997 # where [] is a closed stem and [ is an open block,
|
|
1998 # must create STEM-BREAK
|
|
1999 # this cannot be the case ((x)), because there p_open is empty
|
|
2000 if (@$p_open_aref) {
|
|
2001
|
|
2002 # check: the next open base is adjacent to the previous BP
|
|
2003 if ( $p_open_aref->[-1] == $p_bp_aref->[0] - 1 ) {
|
|
2004 my @stmbrk = ( $p_open_aref->[-1], @$p_bp_aref );
|
|
2005 push( @$stmbrks_aref, \@stmbrk );
|
|
2006
|
|
2007 # push the previous open block back onto stack
|
|
2008 my @newopen = @$p_open_aref;
|
|
2009 push( @$open_blks_aref, \@newopen );
|
|
2010 @$p_open_aref = ();
|
|
2011 } else {
|
|
2012
|
|
2013 # error, this should not be possible
|
|
2014 die("ERROR in close_stem(), [.[]x should not occur here\n");
|
|
2015 }
|
|
2016 }
|
|
2017 }
|
|
2018
|
|
2019 ###########################################################################
|
|
2020 # Here the information about the structure is added to the graph. In a
|
|
2021 # first step each base-pair is added by adding edges between the vertices
|
|
2022 # of the respective nucleotides. Then, if two base-pairs are stacked,
|
|
2023 # we add an extra vertex and connect the four nucleotide vertices that
|
|
2024 # are involved with extra edges (symbols P used for vertex and p for edges).
|
|
2025 # While adding new vertices we keep track of the current graph index.
|
|
2026 #
|
|
2027 # Input:
|
|
2028 # curr_shrep : the current shrep as dot-bracket structure
|
|
2029 # curr_gi : the current graph index
|
|
2030 # win_size : the window size
|
|
2031 # stacks : the input parameter -stacks (whether to add stacking information)
|
|
2032 #
|
|
2033 # Output:
|
|
2034 # The structure graph information line-by-line as an array reference and
|
|
2035 # the current graph index.
|
|
2036 ############################################################################
|
|
2037 sub getStructGraph {
|
|
2038 my ( $curr_shrep, $curr_gi, $win_size, $stacks ) = @_;
|
|
2039
|
|
2040 my @struct = split( "", $curr_shrep->[0] );
|
|
2041
|
|
2042 my @edg = ();
|
|
2043 my @starts = ();
|
|
2044 my @pairs = ();
|
|
2045
|
|
2046 foreach my $idx ( 0 .. @struct - 1 ) {
|
|
2047
|
|
2048 push( @starts, $idx ) if ( $struct[$idx] eq "(" );
|
|
2049
|
|
2050 if ( $struct[$idx] eq ")" ) {
|
|
2051 my $start = $curr_gi + pop(@starts);
|
|
2052 my $end = $curr_gi + $idx;
|
|
2053 push( @edg, "e " . $start . " " . $end . " s " );
|
|
2054 my @pair = ( $start, $end );
|
|
2055 push( @pairs, \@pair );
|
|
2056 }
|
|
2057 }
|
|
2058
|
|
2059 # update current graph index
|
|
2060 $curr_gi = $curr_gi + $win_size;
|
|
2061
|
|
2062 # initiate structure lines array
|
|
2063 my @stacking_info = ();
|
|
2064
|
|
2065 # add stacking information to graph unless input option tells us not to
|
|
2066 if ($stacks) {
|
|
2067
|
|
2068 my $stacked_pairs = 0;
|
|
2069
|
|
2070 # get stacked base-pairs (they are ordered according to
|
|
2071 # position of closing nucleotide)
|
|
2072 for ( my $i = 1 ; $i < @pairs ; $i++ ) {
|
|
2073
|
|
2074 # add stacked base-pairs (vertices+edges)
|
|
2075 my $curr_bp_aref = $pairs[$i];
|
|
2076 my $prev_bp_aref = $pairs[ $i - 1 ];
|
|
2077
|
|
2078 # if the current base-pair is stacked on the previous base-pair,
|
|
2079 # when curr_start = prev_start - 1 AND curr_end = prev_end + 1
|
|
2080 if ( $curr_bp_aref->[0] == $prev_bp_aref->[0] - 1
|
|
2081 && $curr_bp_aref->[1] == $prev_bp_aref->[1] + 1 ) {
|
|
2082
|
|
2083 # add stacking vertex P
|
|
2084 push( @stacking_info, "v $curr_gi P" );
|
|
2085
|
|
2086 # add four edges from involved nucleotids
|
|
2087 push( @stacking_info, "e $curr_gi $prev_bp_aref->[0] p" );
|
|
2088 push( @stacking_info, "e $curr_gi $prev_bp_aref->[1] p" );
|
|
2089 push( @stacking_info, "e $curr_bp_aref->[0] $curr_gi p" );
|
|
2090 push( @stacking_info, "e $curr_bp_aref->[1] $curr_gi p" );
|
|
2091 ++$curr_gi; # add one to the index, ready for next vertex
|
|
2092
|
|
2093 }
|
|
2094 }
|
|
2095 }
|
|
2096
|
|
2097 my @str_graphlines = ( @edg, @stacking_info );
|
|
2098 return ( \@str_graphlines, $curr_gi );
|
|
2099 }
|
|
2100
|
|
2101 ###########################################################################
|
|
2102 # This method gives us the overall graph header for this sequence. This
|
|
2103 # graph includes all window calculations and all shreps for all windows.
|
|
2104 # Input:
|
|
2105 # seq_id : the sequence ID
|
|
2106 # seq_head : the header information for the sequence
|
|
2107 # wins_aref : the windows sizes for folding
|
|
2108 # h_shift : input parameter -shift
|
|
2109 # h_e : input parameter -e
|
|
2110 # h_c : input parameter -c
|
|
2111 # h_t : input parameter -t
|
|
2112 # h_u : input parameter -u
|
|
2113 # h_r : input parameter -r
|
|
2114 # h_M : input parameter -M
|
|
2115 # h_crop_unpaired_ends : input parameter -cue
|
|
2116 #
|
|
2117 # Output:
|
|
2118 # The header line as a string
|
|
2119 ############################################################################
|
|
2120 sub getGraphHeader {
|
|
2121 my ( $seq_id, $seq_head, $wins_aref, $h_shift, $h_e, $h_c, $h_t, $h_u, $h_r, $h_M, $h_crop_unpaired_ends, $h_i, $h_sample_len, $h_q, $h_Tp, $h_seqlen, $h_noStr ) = @_;
|
|
2122
|
|
2123 my $ret;
|
|
2124
|
|
2125 $ret = "t # SEQID $seq_id ";
|
|
2126 $ret .= "$seq_head " if ( defined $seq_head );
|
|
2127 $ret .= "SEQLEN $h_seqlen ";
|
|
2128
|
|
2129 return $ret . "\n" if ($h_noStr);
|
|
2130
|
|
2131 $ret .= "MAXWINSHREPS $h_M ";
|
|
2132 $ret .= "CUE 1 " if ($h_crop_unpaired_ends);
|
|
2133 $ret .= "RNASHAPES -w " . ( join( ",", @{$wins_aref} ) ) . " ";
|
|
2134 $ret .= "-SHIFT%w $h_shift " if ( not $globalFolding and $h_shift );
|
|
2135 $ret .= "-SHIFT 1 " if ( not $globalFolding and not $h_shift );
|
|
2136 $ret .= "-e $h_e " if ( defined $h_e );
|
|
2137 $ret .= "-c $h_c " if ( defined $h_c );
|
|
2138 $ret .= "-t $h_t " if ( defined $h_t );
|
|
2139 $ret .= "-u 1 " if ($h_u);
|
|
2140 $ret .= "-r 1 " if ($h_r);
|
|
2141 $ret .= "-i $h_i SAMPLE_LEN $h_sample_len" if ( not $h_q and $h_i );
|
|
2142 $ret .= "-q 1 " if ($h_q);
|
|
2143 $ret .= "-T $h_Tp" if ( $h_q and defined $h_Tp );
|
|
2144 $ret .= "\n";
|
|
2145
|
|
2146 return $ret;
|
|
2147 }
|
|
2148
|
|
2149 ###########################################################################
|
|
2150 # This method gives us the window header for the subgraph that includes
|
|
2151 # all shreps for the current window.
|
|
2152 # Input:
|
|
2153 # graphHead : the header line for the entire sequence graph
|
|
2154 # win_size : the size of the current window
|
|
2155 # win_start : the starting position of the current window
|
|
2156 # win_end : the end position of the current window
|
|
2157 # win_center : the centre position of the current window
|
|
2158 # win_global_fold : (boolean) whether the actual number of nucleotides
|
|
2159 # is smaller than the given window size (end of sequence?)
|
|
2160 #
|
|
2161 # Output:
|
|
2162 # The header line as a string
|
|
2163 ############################################################################
|
|
2164 sub getWindowHeader {
|
|
2165 my ( $graphHead, $win_size, $win_start, $win_end, $win_center, $win_global_fold, $win_sample, $used_t ) = @_;
|
|
2166
|
|
2167 chomp $graphHead;
|
|
2168
|
|
2169 $graphHead =~ s/t # //;
|
|
2170
|
|
2171 my $ret = "w # $graphHead ";
|
|
2172 $ret .= "SHAPE_TYPE $used_t ";
|
|
2173 $ret .= "GLOBALFOLD $win_global_fold ";
|
|
2174 $ret .= "WSIZE $win_size ";
|
|
2175 $ret .= "WSTART $win_start ";
|
|
2176 $ret .= "WEND $win_end ";
|
|
2177 $ret .= "WCENT $win_center ";
|
|
2178 $ret .= "WSAMPLE $win_sample ";
|
|
2179 $ret .= "\n";
|
|
2180
|
|
2181 return $ret;
|
|
2182 }
|
|
2183
|
|
2184 ###########################################################################
|
|
2185 # This method gives us the shape header that includes the single shrep
|
|
2186 # connected components.
|
|
2187 # Input:
|
|
2188 # winHead : the header for the current window
|
|
2189 # shrep : information about the current shrep
|
|
2190 #
|
|
2191 # Output:
|
|
2192 # The header line as a string
|
|
2193 ############################################################################
|
|
2194 sub getShapeHeader {
|
|
2195 my ( $winHead, $shrep ) = @_;
|
|
2196
|
|
2197 chomp $winHead;
|
|
2198
|
|
2199 $winHead =~ s/w # //;
|
|
2200 $winHead =~ s/t # //;
|
|
2201
|
|
2202 my $ret = "s # $winHead ";
|
|
2203
|
|
2204 my @info = @{$shrep};
|
|
2205 $ret .= join( " ", @info[ 1 .. $#info ] );
|
|
2206 $ret .= "\n";
|
|
2207
|
|
2208 return $ret;
|
|
2209 }
|
|
2210
|
|
2211 ###########################################################################
|
|
2212 # This method gives us the sequence header that includes the unstructured
|
|
2213 # sequence connected components.
|
|
2214 # Input:
|
|
2215 # winHead : the header for the current window
|
|
2216 # shrep : information about the current shrep
|
|
2217 #
|
|
2218 # Output:
|
|
2219 # The header line as a string
|
|
2220 ############################################################################
|
|
2221 sub getSeqHeader {
|
|
2222 my ( $winHead, $shrep ) = @_;
|
|
2223
|
|
2224 chomp $winHead;
|
|
2225
|
|
2226 $winHead =~ s/w # //;
|
|
2227 $winHead =~ s/t # //;
|
|
2228
|
|
2229 my $ret = "u # $winHead ";
|
|
2230
|
|
2231 my @info = @{$shrep};
|
|
2232 $ret .= join( " ", @info[ 1 .. $#info ] );
|
|
2233 $ret .= "\n";
|
|
2234
|
|
2235 return $ret;
|
|
2236 }
|
|
2237
|
|
2238
|
|
2239 ##################################################################################
|
|
2240 # This method parses a fasta file and is useful if the header ID is not-unique!!
|
|
2241 # It returns the header lines in an array and the sequence lines in the same
|
|
2242 # order. It is then up to the user to extract the parts from the header that is
|
|
2243 # necessary for the script.
|
|
2244 # Furthermore, the method deals with multiple lines, and returns a single sequence
|
|
2245 # without line breaks.
|
|
2246 # Input:
|
|
2247 # file The name of the fasta file
|
|
2248 # Output:
|
|
2249 # (1) An array reference for each header line
|
|
2250 # (2) An array reference for each sequence in same order as the headers
|
|
2251 ##################################################################################
|
|
2252 sub read_fasta_with_nonunique_headers_meta {
|
|
2253 my ($file) = @_;
|
|
2254 my $FUNCTION = "read_fasta_file in Sequences.pm";
|
|
2255
|
|
2256 my $header = "";
|
|
2257 my $seqstring = "";
|
|
2258 my @headers = ();
|
|
2259 my @sequences = ();
|
|
2260 my @meta = ();
|
|
2261 my %seq_meta = ();
|
|
2262 open( IN_HANDLE, "<$file" ) || die "ERROR in $FUNCTION:\n" . "Couldn't open the following file in package Tool," . " sub read_fasta_file: $file\n";
|
|
2263
|
|
2264 while ( my $line = <IN_HANDLE> ) {
|
|
2265 chomp($line);
|
|
2266
|
|
2267 # header (can contain one space after > symbol)
|
|
2268 if ( $line =~ /^\>(.*)/ ) {
|
|
2269 if ($header) {
|
|
2270 $seqstring =~ s/\s*//g; ## do not allow spaces in sequence
|
|
2271 push( @headers, $header );
|
|
2272 push( @sequences, $seqstring );
|
|
2273 push( @meta, +{%seq_meta} ); ## anonymous hash reference
|
|
2274 #print keys %seq_meta;
|
|
2275 $seqstring = "";
|
|
2276 undef(%seq_meta);
|
|
2277 }
|
|
2278 $header = $1;
|
|
2279
|
|
2280 } elsif ( $line =~ /(.+)\s+(#\S+)$/ && $header ) {
|
|
2281
|
|
2282 if ( exists $seq_meta{$2} ) {
|
|
2283 $seq_meta{$2} .= $1;
|
|
2284 } else {
|
|
2285 $seq_meta{$2} = $1;
|
|
2286 }
|
|
2287
|
|
2288 } elsif ($header) {
|
|
2289 $seqstring .= $line
|
|
2290 }
|
|
2291 }
|
|
2292
|
|
2293 if ($header) {
|
|
2294 $seqstring =~ s/\s*//g; ## do not allow spaces in sequence
|
|
2295 push( @headers, $header );
|
|
2296 push( @sequences, $seqstring );
|
|
2297 push( @meta, +{%seq_meta} );
|
|
2298 $seqstring = "";
|
|
2299 %seq_meta = ();
|
|
2300 }
|
|
2301 return ( \@headers, \@sequences, \@meta );
|
|
2302 }
|
|
2303
|
|
2304
|
|
2305 1;
|
|
2306
|
|
2307 #############################################################################
|
|
2308 # Programming description
|
|
2309 #############################################################################
|
|
2310 # Substructure graphs for machine learning with Fabrizio
|
|
2311 # -------------------------------------------------------
|
|
2312 #
|
|
2313 # (1) Parameters (RNAshapes parameter):
|
|
2314 # - Window sizes [] (-w)
|
|
2315 # - window shift size (-W)
|
|
2316 # - calculate structure probabilities (-r)
|
|
2317 # - energy range kcal/mol (-e) OR energy relative percentage (%) to MFE (-c)
|
|
2318 # - shape type 1 to 5 (-t)
|
|
2319 # - ignore unstable substructures (-u)
|
|
2320 # - max shreps
|
|
2321 #
|
|
2322 #
|
|
2323 # (2) For each sequence, generate one graph/file that consists of all windows. The general format for one graph is as follows:
|
|
2324 #
|
|
2325 # t # seq_id parameters
|
|
2326 # v graph_index nt_type window_size window_centre abs_seq_pos
|
|
2327 # ...
|
|
2328 # e m m+1 > 1 (backbone)
|
|
2329 # ...
|
|
2330 # e base_i_graph_index base_j_graph_index s shrep_e shrep_p ...
|
|
2331 #
|
|
2332 # For each window (subgraph) we create a subgraph (of the subgraph) for each substructure.
|
|
2333 # We have a running index (gi) for each subgraph. All vertex and edge indices of the subgraph add
|
|
2334 # the running graph index to the actual window position. For example
|
|
2335 #
|
|
2336 #
|
|
2337 # Sequence: AAACC CUUUG GG
|
|
2338 # 01234 56789 01
|
|
2339 #
|
|
2340 # Window=10 substructure1 = (((...))). centre 5.5
|
|
2341 #
|
|
2342 # v 0 A 10 5.5 0
|
|
2343 # v 1 A 10 5.5 1
|
|
2344 # v 2 A 10 5.5 2
|
|
2345 # v 3 C 10 5.5 3
|
|
2346 # v 4 C 10 5.5 4
|
|
2347 # v 5 C 10 5.5 5
|
|
2348 # v 6 U 10 5.5 6
|
|
2349 # v 7 U 10 5.5 7
|
|
2350 # v 8 U 10 5.5 8
|
|
2351 # v 9 G 10 5.5 9
|
|
2352 # e 0 1 > 1
|
|
2353 # e 1 2 > 1
|
|
2354 # e 2 3 > 1
|
|
2355 # e 3 4 > 1
|
|
2356 # e 4 5 > 1
|
|
2357 # e 5 6 > 1
|
|
2358 # e 6 7 > 1
|
|
2359 # e 7 8 > 1
|
|
2360 # e 8 9 > 1
|
|
2361 # e 0 8 s -15.0 0.1223
|
|
2362 # e 1 7 s -15.0 0.1223
|
|
2363 # e 2 6 s -15.0 0.1223
|
|
2364 #
|
|
2365 # gi = 9+1 = 10
|
|
2366 #
|
|
2367 # Window = 10 substructure2 = .(((...))) centre 6.5
|
|
2368 # v 10 A 10 6.5 2
|
|
2369 # v 11 C 10 6.5 3
|
|
2370 # v 12 C 10 6.5 4
|
|
2371 # v 13 C 10 6.5 5
|
|
2372 # v 14 U 10 6.5 6
|
|
2373 # v 15 U 10 6.5 7
|
|
2374 # v 16 U 10 6.5 8
|
|
2375 # v 17 G 10 6.5 9
|
|
2376 # v 18 G 10 6.5 10
|
|
2377 # v 19 G 10 6.5 11
|
|
2378 # e 10 11 > 1
|
|
2379 # e 11 12 > 1
|
|
2380 # e 12 13 > 1
|
|
2381 # e 13 14 > 1
|
|
2382 # e 14 15 > 1
|
|
2383 # e 15 16 > 1
|
|
2384 # e 16 17 > 1
|
|
2385 # e 17 18 > 1
|
|
2386 # e 18 19 > 1
|
|
2387 # e 11 19 s -17.0 0.156
|
|
2388 # e 12 18 s -17.0 0.156
|
|
2389 # e 13 17 s -17.0 0.156
|
|
2390 #
|
|
2391 # gi = 19+1 = 20
|
|
2392 #
|
|
2393 #
|
|
2394 #
|
|
2395 # Write one perl script to create graphs for a set of sequences, fasta2shrep_gspan.pl.
|
|
2396 #
|
|
2397 # INPUT:
|
|
2398 # -f fasta file with all sequences to compute
|
|
2399 # parameters as above
|
|
2400 #
|
|
2401 # OUTPUT:
|
|
2402 # one file per sequence that contains graph called seq_id.gspan
|
|
2403 #
|
|
2404 # (1) for each window size call RNAshapes and write to a tmp file
|
|
2405 # (2) parse result of RNAshapes (catch RNAshapes error - sequence too long?) check for max shreps.
|
|
2406 # (3) convert RNAshapes result to subgraph -> write to file (readpipe) look at efficiency and errors
|
|
2407 # (4) repeat (1) to (3) for each sequence
|
|
2408
|
|
2409 # ABSTRACT STRUCTURE GRAPH
|
|
2410 # To each shrep graph (s #) add an abstract structure graph with special
|
|
2411 # abstract relations. For example, to add the abstract structure graph to
|
|
2412 # the previous shrep structure with gi numbers from 10 to 19, we first
|
|
2413 # identify the abstract shape, i.e. EL-S-HL and then create nodes (labelled
|
|
2414 # with a given name-space followed the actual node name and separated by a
|
|
2415 # hash) and edges for this graph as follows:
|
|
2416 # NOTE: at the moment we add the adjacent base-pairs to the loop definitions
|
|
2417 # but these can be removed if necessary in the future
|
|
2418 # v 20 abstruct#EL
|
|
2419 # v 21 abstruct#S
|
|
2420 # v 22 abstruct#HL
|
|
2421 # e 20 21 n
|
|
2422 # e 21 22 n
|
|
2423 # v 23 ^R
|
|
2424 # e 20 23 ^r
|
|
2425 # e 23 10 @r
|
|
2426 # e 23 11 @r
|
|
2427 # e 23 19 @r
|
|
2428 # v 24 ^R
|
|
2429 # e 21 24 ^r
|
|
2430 # e 24 11 @r
|
|
2431 # e 24 12 @r
|
|
2432 # e 24 13 @r
|
|
2433 # e 24 17 @r
|
|
2434 # e 24 18 @r
|
|
2435 # e 24 19 @r
|
|
2436 # v 25 ^R
|
|
2437 # e 22 25 ^r
|
|
2438 # e 25 13 @r
|
|
2439 # e 25 14 @r
|
|
2440 # e 25 15 @r
|
|
2441 # e 25 16 @r
|
|
2442 # e 25 17 @r
|
|
2443
|
|
2444 # gi = 25+1 = 26
|
|
2445
|
|
2446 # ANOTATION FILE
|
|
2447 # This is a file that labels a sequence region with a given annotation. In one
|
|
2448 # file we can have annotations within different name-spaces, for example target
|
|
2449 # sites predicted with different tools.
|
|
2450 # File format is a tab-delimited file with the following columns:
|
|
2451 # SEQID (same ID as in the fasta file header)
|
|
2452 # i (left positon of region)
|
|
2453 # j (right position of region, if one position i=j)
|
|
2454 # NAMESPACE#LABEL (give each annotation type one name-space and choose a label depending on the task)
|
|
2455
|
|
2456 # E.g we have 2 different miRNAs and we predict the target-sites with (1) IntaRNA and (2) TargetSearch,
|
|
2457 # then this could be IntaRNA#miR1, IntaRNA#miR2, TargetSearch#miR1, and TargetSearch#miR2.
|
|
2458 # All labels can be used more than once for one or more sequences.
|
|
2459 # all labels with the same namespace and the same sequence ID are grouped into
|
|
2460 # one abstract graph, according to the order of i.
|
|
2461 # Example
|
|
2462 # SEQID i j NAMESPACE#LABEL
|
|
2463 # s1 10 20 IntaRNA#miR1
|
|
2464 # s1 54 60 IntaRNA#miR2
|
|
2465 # s1 15 25 TargetSearch#miR1
|
|
2466 # s1 54 60 TargetSearch#miR2
|
|
2467 # s2 ...
|
|
2468
|
|
2469 # We create connected component per sequence ID per name-space,
|
|
2470 # so that all labels with
|
|
2471 # the same namespace are grouped together (for one sequence) as follows:
|
|
2472 # v 26 IntaRNA#miR1
|
|
2473 # v 27 IntaRNA#miR2
|
|
2474 # e 26 27 n
|
|
2475 # then create R nodes and ^r and @r relations as before (from i to j)
|
|
2476
|
|
2477 # If a sequence graph option is given, then these graphs (u #) have to
|
|
2478 # be labelled with this annotation
|
|
2479
|