Mercurial > repos > bgruening > sequence2gspan
comparison fasta2shrep_gspan.pl @ 0:b01beb170290 draft default tip
Uploaded
| author | bgruening |
|---|---|
| date | Tue, 29 Oct 2013 11:10:19 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:b01beb170290 |
|---|---|
| 1 #!/usr/bin/perl | |
| 2 #use feature ':5.10'; | |
| 3 use File::Basename; | |
| 4 use lib dirname($0); # search skript directory for modules | |
| 5 use strict 'vars'; | |
| 6 use warnings; | |
| 7 use Getopt::Long; | |
| 8 use Pod::Usage; | |
| 9 #use List::Util qw/ min max /; | |
| 10 | |
| 11 #use StructureLibrary::RNAtoolsConfig; | |
| 12 #use StructureLibrary::Sequence; | |
| 13 | |
| 14 use Cwd qw(getcwd abs_path); | |
| 15 use File::Temp qw(tempdir); | |
| 16 use File::Copy; | |
| 17 use POSIX qw(ceil); | |
| 18 | |
| 19 use FindBin; | |
| 20 use lib "$FindBin::Bin"; | |
| 21 #use GraphClust; | |
| 22 | |
| 23 #use vars qw(%CONFIG); | |
| 24 #*CONFIG = \%GraphClust::CONFIG; | |
| 25 | |
| 26 =head1 NAME | |
| 27 | |
| 28 | |
| 29 fasta2shrep_gspan.pl -fasta mysequences.fasta -wins "50,100,150,200" -shift 5 -M 8 | |
| 30 | |
| 31 =head1 SYNOPSIS | |
| 32 | |
| 33 Options: | |
| 34 | |
| 35 HELP | |
| 36 -help brief help message | |
| 37 -man full documentation | |
| 38 | |
| 39 COMPULSORY | |
| 40 -fasta <STRING> e.g. "sequence.fasta" | |
| 41 All sequences in fasta format. | |
| 42 | |
| 43 OPTIONS | |
| 44 -wins [INTEGER] e.g. "50,100,200" | |
| 45 A list of window sizes to use. | |
| 46 If none are given (empty string ''), then the entire sequence is | |
| 47 taken with no windows. Each window > 1 required! | |
| 48 -shift <INTEGER> e.g. 20 | |
| 49 The shift of the window, relative to the window size given in | |
| 50 percent. So you give which percent of the window size shall be | |
| 51 used for the shift. Of course the shift is rounded down to the | |
| 52 nearest whole number. | |
| 53 Example 20 % of a window 150 would result in a step size of 30 nt. | |
| 54 It is a relative parameter, as you can give different window sizes. | |
| 55 If you do not give this parameter there is a default shift of 1 nt. | |
| 56 -cue Crop unpaired ends. | |
| 57 If you give this flag, then the unpaired ends of each | |
| 58 single structure are ignored. E.g. the structure | |
| 59 ...(((...))).. becomes just (((...))) | |
| 60 -stack Adds stacking information to graphs. This adds an additional | |
| 61 vertex (type P) for each pair of stacked base-pairs and four edges | |
| 62 (type p) from each of the involved bases to the new vertex. | |
| 63 -e <FLOAT> e.g. 5.0 | |
| 64 Energy range in kcal/mol (RNAshapes) | |
| 65 Use only one of -e and -c! | |
| 66 -c <INTEGER> e.g. 10 | |
| 67 Relative energy range, i.e. percentage (%) of MFE energy (RNAshapes) | |
| 68 Use only one of -e and -c! | |
| 69 -t <INTEGER> [1-5] e.g. 3 OR "3=0,4=100,5=200" | |
| 70 The shape type (RNAshapes). Default is 3. | |
| 71 With the list format, the shape level can be changed for different window length | |
| 72 "4=100" means that shape level 4 is used from length 100nt (window length) | |
| 73 The first given length has to be 0! Not continuous given levels are allowed! | |
| 74 -M <INTEGER> e.g. 10 | |
| 75 Max number of shreps that should be taken per window. | |
| 76 -u Ignore unstable structures (RNAshapes). | |
| 77 This option filters out closed structures with positive free energy. | |
| 78 -r Calculate structure probabilities for shreps (RNAshapes) | |
| 79 -i <INT> e.g. 10 | |
| 80 Turn on structure sampling and gives number of sampling iterations. | |
| 81 Default no sampling (i=0) | |
| 82 -sample-len <INT> e.g. 100 | |
| 83 Only in sampling mode: Sampling is only used for seqs/windows >= given length, | |
| 84 Default: sample all lengths (0), if -i > 0 | |
| 85 -q Turn on shape probabilities for RNAshapes, no sampling mode allowed | |
| 86 -Tp <FLOAT> e.g 0.001 | |
| 87 Filter cutoff for shape probabilities, applied before -M filter! | |
| 88 -seq-graph-win add for each window a graph which contains no structure | |
| 89 -seq-graph-t add for each 't #' a graph which contains no structure | |
| 90 -seq-graph-alph change the alphabet of unstructured graphs | |
| 91 -annotate <STRING> annotation.tab | |
| 92 A file with annotations to be added as abstract graphs | |
| 93 on the sequence leven (if given) and on the structure | |
| 94 (SHREP) level. The format is has the following TAB-delimited | |
| 95 columns: SEQID, START, END, NAMESPACE#LABEL. | |
| 96 Labels with the same name-space and SEQID form connected | |
| 97 components, which is a sequence of label vertices ordered | |
| 98 by the START position in the sequence. | |
| 99 -abstr Add abstract structure graphs to the single shrep graph | |
| 100 instances. | |
| 101 -nostr Calculate no structures, only add sequence information, | |
| 102 if this is given, then -seq-graph-win AND/OR -seq-graph-t | |
| 103 are required. | |
| 104 -match-shape <SHAPE> | |
| 105 all seqs/windows will be constraint folded into that shape via | |
| 106 RNAshapes (if structure is given in another way this struct will be kept), | |
| 107 if this shape is not possible within given energy range, produce a | |
| 108 specific t graph with only one vertex 'X'. By this the instance | |
| 109 becomes very unsimilar to all other graphs (for knn) | |
| 110 -vp enable graph computation with viewpoints: | |
| 111 svmsgdnspdk will center on those nucleotides that are given | |
| 112 via capital letters and ignore those given as lowercase letters | |
| 113 -tmp <STRING> e.g. "/scratch/1/sita/tmp" | |
| 114 A directory for writing temporary files | |
| 115 -o <STRING> e.g. "ProjectX/MySequences/GSPAN/" | |
| 116 Output directory for gspan files containing graphs. | |
| 117 -group <INTEGER> e.g. 5 | |
| 118 Combine/group that number of input seqs into 1 gspan file | |
| 119 output name is then '<INT>.group.gspan.bz2' | |
| 120 -sge Use SGE cluster for each sequence separately | |
| 121 -sge-logdir stdout directory for SGE call | |
| 122 -sge-errdir sdterr directory for SGE call | |
| 123 -stdout send graphs to stdout instead of writing to files | |
| 124 -ignore-header don't write fasta id part after first space to gspan | |
| 125 -debug additional debug output | |
| 126 | |
| 127 | |
| 128 DEFAULT VALUES | |
| 129 -wins "" | |
| 130 -shift 1 nt | |
| 131 -c 10 | |
| 132 -t 3 | |
| 133 -M 0 # selects all shreps | |
| 134 -tmp "/var/tmp/fasta2shrep" | |
| 135 -o "CURRENT_DIR/GSPAN/" | |
| 136 | |
| 137 SGE mode | |
| 138 -sge-logdir "CURRENT_DIR/GSPAN/SGE_log" | |
| 139 -sge-errdir "CURRENT_DIR/GSPAN/SGE_log" | |
| 140 -task-id <NUM> | |
| 141 | |
| 142 =head1 DESCRIPTION | |
| 143 | |
| 144 =cut | |
| 145 | |
| 146 ############################################################################### | |
| 147 # end handler for temporary directory | |
| 148 # adds an error handler that deletes the directory in case of error | |
| 149 # SIGUSR{1/2} are sent by the sge prior to the uncatchable SIGKILL if the | |
| 150 # option -notify was set | |
| 151 ############################################################################### | |
| 152 $SIG{'INT'} = 'end_handler'; | |
| 153 $SIG{'TERM'} = 'end_handler'; | |
| 154 $SIG{'ABRT'} = 'end_handler'; | |
| 155 $SIG{'USR1'} = 'end_handler'; | |
| 156 $SIG{'USR2'} = 'end_handler'; | |
| 157 | |
| 158 sub end_handler { | |
| 159 print STDERR "signal ", $_[0], " caught, cleaning up temporary files\n"; | |
| 160 | |
| 161 # change into home directory. deletion of the temporary directory will | |
| 162 # fail if it is the current working directory | |
| 163 chdir(); | |
| 164 File::Temp::cleanup(); | |
| 165 die(); | |
| 166 } | |
| 167 | |
| 168 ############################################################################### | |
| 169 # PARSE COMMAND LINE OPTIONS | |
| 170 ############################################################################### | |
| 171 | |
| 172 # command line options | |
| 173 my ( $i_help, $i_man, $i_debug, $i_fas, $i_wins, $i_shift, $i_crop_unpaired_ends, | |
| 174 $i_r, $i_e, $i_c, $i_t, $i_u, $i_M, $i_o, $i_sge, $i_jobid, $i_tmp, | |
| 175 $i_ignore_seq_header, $i_stacks, $i_stdout, $i_q, $i_T, $i_i, | |
| 176 $i_sample_min_length, $i_sge_logDir, $i_sge_errDir, $i_groupsize, $i_annotate, | |
| 177 $i_abstr, $i_no_structure, $i_vp, $i_matchShape, $i_rnashapes_binpath ); | |
| 178 | |
| 179 my ( $i_add_seq_graph_win, $i_add_seq_graph_t, $i_change_seq_graph_alph ); | |
| 180 | |
| 181 my $options = GetOptions( | |
| 182 "help" => \$i_help, | |
| 183 "man" => \$i_man, | |
| 184 "debug" => \$i_debug, | |
| 185 "fasta=s" => \$i_fas, | |
| 186 "wins=s" => \$i_wins, | |
| 187 "shift=f" => \$i_shift, | |
| 188 "cue" => \$i_crop_unpaired_ends, | |
| 189 "stack" => \$i_stacks, | |
| 190 "r" => \$i_r, | |
| 191 "e=f" => \$i_e, | |
| 192 "c=i" => \$i_c, | |
| 193 "t=s" => \$i_t, | |
| 194 "u" => \$i_u, | |
| 195 "M=i" => \$i_M, | |
| 196 "tmp=s" => \$i_tmp, | |
| 197 "o=s" => \$i_o, | |
| 198 "i=i" => \$i_i, | |
| 199 "sample-len=i" => \$i_sample_min_length, | |
| 200 "q" => \$i_q, | |
| 201 "Tp=f" => \$i_T, | |
| 202 "seq-graph-win" => \$i_add_seq_graph_win, | |
| 203 "seq-graph-t" => \$i_add_seq_graph_t, | |
| 204 "-seq-graph-alph" => \$i_change_seq_graph_alph, | |
| 205 "sge" => \$i_sge, | |
| 206 "task-id=i" => \$i_jobid, | |
| 207 "ignore-header" => \$i_ignore_seq_header, | |
| 208 "stdout" => \$i_stdout, | |
| 209 "sge-logdir=s" => \$i_sge_logDir, | |
| 210 "sge-errdir=s" => \$i_sge_errDir, | |
| 211 "group=i" => \$i_groupsize, | |
| 212 "annotate=s" => \$i_annotate, | |
| 213 "abstr" => \$i_abstr, | |
| 214 "nostr" => \$i_no_structure, | |
| 215 "vp" => \$i_vp, | |
| 216 "match-shape=s" => \$i_matchShape, | |
| 217 "rnashapes-bin=s" => \$i_rnashapes_binpath, | |
| 218 ); | |
| 219 | |
| 220 pod2usage( -exitstatus => 1, -verbose => 1 ) if $i_help; | |
| 221 pod2usage( -exitstatus => 0, -verbose => 2 ) if $i_man; | |
| 222 ($options) or pod2usage(2); | |
| 223 | |
| 224 # check compulsory options | |
| 225 ($i_fas) or pod2usage("Error: the option -fasta is compulsory!\n"); | |
| 226 ( -e $i_fas ) or pod2usage("Error: no such file - $i_fas!\n"); | |
| 227 $i_fas = abs_path($i_fas); | |
| 228 | |
| 229 # check other options and set default values | |
| 230 pod2usage("Error: either -e OR -c can be given, but NOT both!\n") if ( $i_e && $i_c ); | |
| 231 ( $i_e or $i_c ) or $i_c = 10; # set -c 10, if neither -e or -c are given | |
| 232 ($i_M) or $i_M = 0; # max number of shreps is 0 (=means take all computed) | |
| 233 ($i_i) or $i_i = 0; # default no sampling else sampling iterations | |
| 234 ($i_sample_min_length) or $i_sample_min_length = 0; | |
| 235 pod2usage("\nError: use --sample-len <INT> only with -i <1..INT> !\n") if ( not $i_i and $i_sample_min_length ); | |
| 236 ($i_T) or $i_T = 0; | |
| 237 ($i_q) or $i_q = 0; | |
| 238 pod2usage("\nError: Sampling (-i) not possible with shape probabilities (-q)!\n") if ( $i_i and $i_q ); | |
| 239 ($i_add_seq_graph_win) or $i_add_seq_graph_win = 0; | |
| 240 ($i_add_seq_graph_t) or $i_add_seq_graph_t = 0; | |
| 241 ($i_change_seq_graph_alph) or $i_change_seq_graph_alph = 0; | |
| 242 ($i_no_structure) or $i_no_structure = 0; | |
| 243 | |
| 244 if ($i_change_seq_graph_alph) { | |
| 245 ($i_add_seq_graph_t) or ($i_add_seq_graph_win) or pod2usage( "Error: " . | |
| 246 "When giving the parameter -seq-graph-alph, then either -seq-graph-t or -seq-graph-win" . | |
| 247 " must also be given!\n" ); | |
| 248 } | |
| 249 ( -e $i_annotate ) or pod2usage("Error: no such file - $i_annotate!\n") | |
| 250 if ($i_annotate); | |
| 251 | |
| 252 $i_add_seq_graph_t = 1 if ($i_no_structure); | |
| 253 | |
| 254 ($i_t) or $i_t = 3; # default abstraction type is 3 | |
| 255 my $change_shape_level = 0; | |
| 256 my @level_lens = ( -1, -1, -1, -1, -1 ); ## array_idx-1=shape level, value=start length of this level | |
| 257 | |
| 258 if ( $i_t !~ /^\d+$/ ) { | |
| 259 | |
| 260 my @t_minlens = split( ",", $i_t ); ## -t "3=100,4=200" | |
| 261 | |
| 262 foreach my $idx ( 1 .. @t_minlens ) { | |
| 263 | |
| 264 my $level = $t_minlens[ $idx - 1 ]; ## $level = "3=100" | |
| 265 | |
| 266 die "$level Wrong -t format! Example: -t 3=0,4=100,5=200\n" if ( $level !~ /^\d+\=\d+$/ ); | |
| 267 my @lev_len = split( "=", $level ); ## $level = "3=100" | |
| 268 | |
| 269 die "Wrong -t format! First level given needs to be length 0! Example: -t 3=0,4=100,5=200\n" if ( $idx == 1 && $lev_len[1] != 0 ); | |
| 270 die "Wrong -t format! Only level 1-5 allowed! Example: -t 3=0,4=100,5=200\n" if ( $lev_len[0] < 1 or $lev_len[0] > 5 ); | |
| 271 die "Wrong -t format! Length >= 0 expected! Example: -t 3=0,4=100,5=200\n" if ( $lev_len[1] < 0 ); | |
| 272 $change_shape_level = 1; | |
| 273 $level_lens[ $lev_len[0] - 1 ] = $lev_len[1]; | |
| 274 } | |
| 275 | |
| 276 ($i_debug) and print STDERR "i_t = $i_t - change $change_shape_level - shape level lengths#" . join( ":", @level_lens ) . "#" . join( ":", @t_minlens ) . "#\n"; | |
| 277 } | |
| 278 | |
| 279 ## checks for match-shape | |
| 280 if ($i_matchShape){ | |
| 281 die "Please provide correct match shape string like '[]'! Exit...\n\n" | |
| 282 if ($i_matchShape !~ /^[\[\]_]+$/); | |
| 283 ## RNAshapes prodices anyway only 1 structure, no suboptimal structs in match-shape folding | |
| 284 $i_M = 1; | |
| 285 } | |
| 286 | |
| 287 my $CURRDIR = getcwd; | |
| 288 | |
| 289 # set up tmp directory | |
| 290 # default tmp is /var/tmp, usually not NFS mounted! | |
| 291 ( defined $i_tmp ) or $i_tmp = '/var/tmp/'; | |
| 292 my $tmp_template = 'fasta2shrep-XXXXXX'; | |
| 293 | |
| 294 # CLEANUP => 1 : automatically delete at exit | |
| 295 $i_tmp = tempdir( $tmp_template, DIR => $i_tmp, CLEANUP => 1 ); | |
| 296 | |
| 297 # create GSPAN directory when not printing to stdout | |
| 298 if ( not $i_stdout ) { | |
| 299 if ($i_o) { | |
| 300 ( -e $i_o ) or system("mkdir -p $i_o"); | |
| 301 } else { | |
| 302 system("mkdir -p GSPAN"); | |
| 303 $i_o = $CURRDIR . "/GSPAN/"; | |
| 304 } | |
| 305 } | |
| 306 | |
| 307 ############################################################################### | |
| 308 # GLOBAL VARIABLES | |
| 309 ############################################################################### | |
| 310 $i_rnashapes_binpath or $i_rnashapes_binpath = "RNAshapes"; | |
| 311 die "Please provide full RNAshapes path (e.g. /usr/bin/RNAshapes) !\n" if (!-f "$i_rnashapes_binpath"); | |
| 312 my $rnashapes_loc = $i_rnashapes_binpath; | |
| 313 if ( !$rnashapes_loc || !-e $rnashapes_loc ) { | |
| 314 my $loc = `which RNAshapes`; | |
| 315 chomp($loc); | |
| 316 die "\nCannot find RNAshapes binary! Exit...\n\n" if ( !$loc ); | |
| 317 die "\nCannot find RNAshapes binary! Exit...\n\n" if ( !-e $loc ); | |
| 318 $rnashapes_loc = $loc; | |
| 319 } | |
| 320 | |
| 321 my @WINDOWS = (); | |
| 322 @WINDOWS = split( ",", $i_wins ) if ($i_wins); | |
| 323 my $globalFolding; | |
| 324 $globalFolding = 1 unless @WINDOWS; | |
| 325 my $CURRUSER = getlogin; | |
| 326 my $SEQNO = 0; # used to id the sequences | |
| 327 my $GSPANNO = 0; # used for gspan filenames | |
| 328 # minimum length of sequence that will work with RNAshapes | |
| 329 # sequences with one or two nucleotides will be restricted to sequence-only graphs | |
| 330 # this ensures that no sequences are skipped and external info kept synchronized | |
| 331 my $GSPAN_SEQ_MINLEN = 3; | |
| 332 | |
| 333 # name spaces | |
| 334 my $ABSTRUCT = "AS"; | |
| 335 ############################################################################### | |
| 336 # EXECUTION CODE | |
| 337 ############################################################################### | |
| 338 | |
| 339 # read fasta file into hash | |
| 340 my ( $headers_aref, $sequences_aref, $metainfo_aref ) = read_fasta_with_nonunique_headers_meta($i_fas); | |
| 341 | |
| 342 # call script again on the sge cluster in a batch job | |
| 343 if ($i_sge) { | |
| 344 my $SCRIPTNAME = $0; | |
| 345 my $CLUSTERSUBMIT = $FindBin::Bin."/fasta2shrep_gspan.sge"; | |
| 346 | |
| 347 die "Cannot find SGE submit script $CLUSTERSUBMIT! Exit...\n\n" if ( !-e $CLUSTERSUBMIT ); | |
| 348 | |
| 349 my $SGE_jobs = @{$sequences_aref}; | |
| 350 $SGE_jobs = ceil( @{$sequences_aref} / $i_groupsize ) if ($i_groupsize); | |
| 351 | |
| 352 my $params = "-fasta $i_fas "; | |
| 353 $params .= "-wins $i_wins " if ($i_wins); | |
| 354 $params .= "-shift $i_shift " if ($i_shift); | |
| 355 $params .= "-e $i_e " if ($i_e); | |
| 356 $params .= "-c $i_c " if ($i_c); | |
| 357 $params .= "-t $i_t " if ($i_t); | |
| 358 $params .= "-u " if ($i_u); | |
| 359 $params .= "-r " if ($i_r); | |
| 360 $params .= "-M $i_M " if ($i_M); | |
| 361 $params .= "-o $i_o " if ($i_o); | |
| 362 $params .= "-i $i_i " if ($i_i); | |
| 363 $params .= "--sample-len $i_sample_min_length" if ($i_sample_min_length); | |
| 364 $params .= "-q " if ($i_q); | |
| 365 $params .= "-Tp $i_T " if ($i_T); | |
| 366 $params .= "--seq-grap-win " if ($i_add_seq_graph_win); | |
| 367 $params .= "--seq-graph-t " if ($i_add_seq_graph_t); | |
| 368 $params .= "--seq-graph-alph " if ($i_change_seq_graph_alph); | |
| 369 $params .= "-ignore-header " if ($i_ignore_seq_header); | |
| 370 $params .= "-cue " if ($i_crop_unpaired_ends); | |
| 371 $params .= "-stack " if ($i_stacks); | |
| 372 $params .= "--group $i_groupsize " if ($i_groupsize); | |
| 373 $params .= "-annotate $i_annotate" if ($i_annotate); | |
| 374 $params .= "-abstr" if ($i_abstr); | |
| 375 $params .= "-nostr " if ($i_no_structure); | |
| 376 $params .= "--debug " if ($i_debug); | |
| 377 $params .= "-vp " if ($i_vp); | |
| 378 $params .= "--match-shape $i_matchShape " if ($i_matchShape); | |
| 379 | |
| 380 print "used script:" . $SCRIPTNAME . "\n"; | |
| 381 print "used submit script:" . $CLUSTERSUBMIT . "\n"; | |
| 382 | |
| 383 $i_sge_logDir = "$i_o/SGE_log" if ( !$i_sge_logDir ); | |
| 384 mkdir($i_sge_logDir); | |
| 385 | |
| 386 $i_sge_errDir = $i_sge_logDir if ( !$i_sge_errDir ); | |
| 387 mkdir($i_sge_errDir); | |
| 388 | |
| 389 my $ssh = 1; ## can be used for debug shell script call | |
| 390 if ($ssh) { | |
| 391 system( "ssh $CURRUSER\@biui.informatik.uni-freiburg.de " | |
| 392 . "'export SGE_ROOT=/opt/sge-6.0/; cd $CURRDIR; " | |
| 393 . "/opt/sge-6.0/bin/lx24-amd64/qsub -t 1-$SGE_jobs -o $i_sge_errDir/ -e $i_sge_errDir/ " | |
| 394 . "$CLUSTERSUBMIT $CURRDIR $SCRIPTNAME \"$params\" ' " ); | |
| 395 } else { | |
| 396 system("$CLUSTERSUBMIT $CURRDIR $SCRIPTNAME '$params' "); | |
| 397 } | |
| 398 exit; | |
| 399 } | |
| 400 | |
| 401 ## compute shreps and gspan, either local or after SGE submission | |
| 402 | |
| 403 # TODO read and process annotations | |
| 404 | |
| 405 my @used_seq_headers; | |
| 406 my @used_seqs; | |
| 407 my @used_meta; | |
| 408 my $group_idx; | |
| 409 | |
| 410 if ($i_jobid) { | |
| 411 ## just process the one sequence as given by the jobid number | |
| 412 my $used_grouping = 1; ## if no group is given, make 1 seq per job = group=1 | |
| 413 $used_grouping = $i_groupsize if ($i_groupsize); | |
| 414 my $st = ( $i_jobid - 1 ) * $used_grouping + 1; | |
| 415 my $end = $st + $used_grouping - 1; | |
| 416 $end = @{$sequences_aref} if ( $end > @{$sequences_aref} ); | |
| 417 | |
| 418 foreach my $idx ( $st .. $end ) { | |
| 419 push( @used_seq_headers, $headers_aref->[ $idx - 1 ] ); | |
| 420 push( @used_seqs, $sequences_aref->[ $idx - 1 ] ); | |
| 421 push( @used_meta, $metainfo_aref->[ $idx - 1 ] ); | |
| 422 } | |
| 423 | |
| 424 $group_idx = $i_jobid; | |
| 425 | |
| 426 #($i_debug) and print STDERR "st $st end $end gr $group_idx job $i_jobid gs $i_groupsize\n"; | |
| 427 | |
| 428 $GSPANNO = $i_jobid-1 if (!$i_groupsize); | |
| 429 | |
| 430 } else { | |
| 431 ## process all sequences at once | |
| 432 @used_seq_headers = @{$headers_aref}; | |
| 433 @used_seqs = @{$sequences_aref}; | |
| 434 @used_meta = @{$metainfo_aref}; | |
| 435 } | |
| 436 | |
| 437 my $out; | |
| 438 my $gspanfile; | |
| 439 my $out_no_match_shape; | |
| 440 | |
| 441 if ($i_matchShape && !$i_stdout && !$i_groupsize){ | |
| 442 open($out_no_match_shape,">$i_o/fasta2shrep.no_match"); | |
| 443 } | |
| 444 | |
| 445 # for each sequence in the fasta file | |
| 446 while ( my $seq = shift @used_seqs ) { | |
| 447 my $tmp_header = shift @used_seq_headers; | |
| 448 my $tmp_meta = shift @used_meta; | |
| 449 | |
| 450 my ( $seq_id, $seq_header ) = ( $tmp_header =~ /(\S+)\s*([\S*\s*]*)/ ); | |
| 451 $i_ignore_seq_header and $seq_header = ''; | |
| 452 | |
| 453 my $seq_fasta = generate_single_fasta_from_sequence_X( $seq_id, $seq ); | |
| 454 my $seq_len = length($seq); | |
| 455 | |
| 456 # only print sequence graphs for sequences below this threshold | |
| 457 my $no_structure_override = ($seq_len < $GSPAN_SEQ_MINLEN) ? 1 : 0; | |
| 458 | |
| 459 $GSPANNO++; | |
| 460 | |
| 461 # set outstream for gspan output to correct file/STDOUT | |
| 462 if ($i_stdout) { | |
| 463 $out = \*STDOUT; | |
| 464 } elsif ( !$i_groupsize ) { | |
| 465 $gspanfile = $i_tmp . '/' . $GSPANNO . '.gspan'; | |
| 466 open( $out, "| bzip2 -f > $gspanfile.bz2" ); | |
| 467 } elsif ( ( $GSPANNO - 1 ) % $i_groupsize == 0 ) { | |
| 468 | |
| 469 if ( $GSPANNO > 1 ) { | |
| 470 close($out); | |
| 471 system("mv $gspanfile.bz2 $i_o/$group_idx.group.gspan.bz2"); | |
| 472 if ($out_no_match_shape){ | |
| 473 close($out_no_match_shape); | |
| 474 system("mv $gspanfile.no_match $i_o/$group_idx.group.gspan.no_match"); | |
| 475 } | |
| 476 } | |
| 477 | |
| 478 if ( !$i_jobid ) { | |
| 479 $group_idx = int( ( $GSPANNO - 1 ) / $i_groupsize ) + 1; | |
| 480 } | |
| 481 | |
| 482 $gspanfile = "$i_tmp/$group_idx.group.gspan"; | |
| 483 open( $out, "| bzip2 -f > $gspanfile.bz2" ); | |
| 484 open( $out_no_match_shape, ">$gspanfile.no_match" ) if ($out_no_match_shape); | |
| 485 } | |
| 486 | |
| 487 ## do not use folding windows in special cases | |
| 488 if ( $globalFolding || $i_no_structure || $no_structure_override) { | |
| 489 @WINDOWS = (); | |
| 490 push( @WINDOWS, $seq_len ); | |
| 491 } | |
| 492 | |
| 493 ##check win sizes for global folding | |
| 494 my @WINDOWS_used = sort { $a <=> $b } @WINDOWS; | |
| 495 foreach my $w_idx ( 0 .. ( @WINDOWS_used - 1 ) ) { | |
| 496 if ( $WINDOWS_used[$w_idx] >= $seq_len && $WINDOWS_used[$w_idx] > 1 ) { | |
| 497 if ( $w_idx < $#WINDOWS_used ) { | |
| 498 @WINDOWS_used = @WINDOWS_used[ 0 .. $w_idx ]; | |
| 499 last; | |
| 500 } | |
| 501 } | |
| 502 } | |
| 503 | |
| 504 ## use seq graph only if no shape folding wanted | |
| 505 $i_add_seq_graph_t = 1 if ($i_no_structure || $no_structure_override); | |
| 506 | |
| 507 ## no shape info in graphheader if we have fixed structure (according LocaRNA handling) | |
| 508 ## use tags #FS or #S for provided structure | |
| 509 my $graph_header; | |
| 510 my @struct_meta = grep { $_ =~ /#FS/ || $_ =~ /#S/ } keys %{$tmp_meta}; | |
| 511 | |
| 512 if (@struct_meta) { | |
| 513 $graph_header = getGraphHeader( $seq_id, $seq_header, \@WINDOWS, $i_shift, $i_e, $i_c, $i_t, $i_u, $i_r, $i_M, $i_crop_unpaired_ends, $i_i, $i_sample_min_length, $i_q, $i_T, $seq_len, 1 ); | |
| 514 } else { | |
| 515 $graph_header = getGraphHeader( $seq_id, $seq_header, \@WINDOWS, $i_shift, $i_e, $i_c, $i_t, $i_u, $i_r, $i_M, $i_crop_unpaired_ends, $i_i, $i_sample_min_length, $i_q, $i_T, $seq_len, $i_no_structure || $no_structure_override ); | |
| 516 } | |
| 517 | |
| 518 print $out $graph_header; | |
| 519 | |
| 520 my $gi = 0; # current graph index | |
| 521 | |
| 522 ## add graph with no structure at all depending on $i_add_seq_graph_t | |
| 523 if ($i_add_seq_graph_t) { | |
| 524 | |
| 525 ($gi) = convertSeqWindow( $seq, $seq_len, 1, $gi, $graph_header, $out, $i_annotate, $seq ); | |
| 526 | |
| 527 } | |
| 528 | |
| 529 ## encode fixed structure only if provided and structures wanted in general | |
| 530 if ( @struct_meta && !$i_no_structure && !$no_structure_override) { | |
| 531 | |
| 532 my $struct_meta = "#FS"; | |
| 533 $struct_meta = "#S" if ( !exists $tmp_meta->{$struct_meta} ); | |
| 534 | |
| 535 my $seq_shrep = [ $tmp_meta->{$struct_meta}, "ENERGY", "0.00", "SHAPE", $struct_meta ]; | |
| 536 $gi = convertShapeWindow( [$seq_shrep], $seq, $seq_len, 1, $gi, $out, | |
| 537 $graph_header, $i_annotate, $i_abstr, $i_crop_unpaired_ends, $i_stacks, $seq ); | |
| 538 @WINDOWS_used = (); ## no shape folding if we have a fixed structure | |
| 539 | |
| 540 } | |
| 541 | |
| 542 ## ignore RNAshapes folding if wanted, but do correct file move afterwards | |
| 543 ## (just "next" does not work due to output) | |
| 544 @WINDOWS_used = () if ($i_no_structure or $no_structure_override); | |
| 545 | |
| 546 #for each window size in list | |
| 547 foreach my $win_size (@WINDOWS_used) { | |
| 548 | |
| 549 # calculate shift size from percentage | |
| 550 my $curr_shift = 1; | |
| 551 if ($i_shift) { | |
| 552 $curr_shift = ( $i_shift / 100 ) * $win_size; | |
| 553 $curr_shift = int($curr_shift); #round down | |
| 554 $curr_shift = 1 unless ($curr_shift); # just in case it is 0 | |
| 555 } | |
| 556 ($i_debug) and print STDERR "winsize: $win_size curr_shift: $curr_shift\n"; | |
| 557 ($i_debug) and print STDERR "\nNext: $seq_id\t winsize:$win_size \n"; | |
| 558 | |
| 559 # choose current shape level, depending on $i_t | |
| 560 my $curr_t = 0; | |
| 561 if ($change_shape_level) { | |
| 562 for ( my $i = 0 ; $i < @level_lens ; $i++ ) { | |
| 563 $curr_t = $i + 1 if ( $level_lens[$i] != -1 && ( $level_lens[$i] <= $win_size ) ); | |
| 564 } | |
| 565 ($i_debug) and print STDERR "$win_size curr type $curr_t\n"; | |
| 566 } else { | |
| 567 $curr_t = $i_t; | |
| 568 } | |
| 569 | |
| 570 my $rnashapesoutput_fh; | |
| 571 | |
| 572 # call RNAshapes and write to $rnashapesoutput_fh | |
| 573 $rnashapesoutput_fh = call_RNAshapes( $seq_fasta, $rnashapes_loc, $win_size, | |
| 574 $curr_shift, $i_e, $i_c, $curr_t, $i_u, $i_r, $i_q, $i_T, $i_i, $i_sample_min_length, $seq_len, $i_matchShape ); | |
| 575 | |
| 576 # read RNAshapes output from $rnashapesoutput_fh and write subgraph | |
| 577 # to gspan file | |
| 578 my $gi_old = $gi; | |
| 579 $gi = convert_RNAshapes_output( $rnashapesoutput_fh, $gi, $i_M, $out, $graph_header, | |
| 580 $win_size, $seq_len, $curr_t, $i_annotate, $i_abstr, $i_crop_unpaired_ends, $i_stacks, $seq ); | |
| 581 | |
| 582 ## no (match) shape found at all for this seq | |
| 583 if ($gi == $gi_old+1){ | |
| 584 $gi = convertSeqWindow( "X", 1, 1, $gi, $graph_header, $out, $i_annotate, $seq ); | |
| 585 print $out_no_match_shape $seq_id."\n" if (!$i_stdout && $out_no_match_shape); | |
| 586 } | |
| 587 } ## foreach WINDOW_used | |
| 588 | |
| 589 if ( !$i_stdout && !$i_groupsize ) { | |
| 590 close($out); | |
| 591 move "$gspanfile.bz2", "$i_o/$GSPANNO.gspan.bz2"; | |
| 592 } | |
| 593 | |
| 594 system("rm $seq_fasta"); | |
| 595 | |
| 596 } ## while @used_seqs | |
| 597 | |
| 598 if ($i_groupsize) { | |
| 599 close($out); | |
| 600 system("mv $gspanfile.bz2 $i_o/$group_idx.group.gspan.bz2"); | |
| 601 close($out_no_match_shape) if ($out_no_match_shape); | |
| 602 move "$gspanfile.no_match", "$i_o/$group_idx.group.gspan.no_match" if ($out_no_match_shape); | |
| 603 } elsif ( $out_no_match_shape && !$i_stdout && !$i_groupsize ) { | |
| 604 close($out_no_match_shape); | |
| 605 } | |
| 606 | |
| 607 ############################################################################### | |
| 608 # METHODS | |
| 609 ############################################################################### | |
| 610 | |
| 611 ############################################################################ | |
| 612 # Generates fasta file for a single sequence (one-lined-fasta). This | |
| 613 # fasta is stored in the temp directory and should be deleted at the end. | |
| 614 # Input: | |
| 615 # seq_id : the sequence ID | |
| 616 # seq : the sequence | |
| 617 # | |
| 618 # Output: | |
| 619 # The fasta file name | |
| 620 ############################################################################ | |
| 621 sub generate_single_fasta_from_sequence_X { | |
| 622 my ( $seq_id, $seq ) = @_; | |
| 623 | |
| 624 $seq = uc($seq); | |
| 625 $seq =~ tr/T/U/; | |
| 626 $seq =~ s/[^AUCGN]/N/g; | |
| 627 | |
| 628 my $outfas = $i_tmp . "/seq_" . $SEQNO++ . ".fasta"; | |
| 629 my $host = readpipe("hostname"); | |
| 630 open( FAS, ">$outfas" ) or die "$host Cannot open file $outfas! Exit...\n\n"; | |
| 631 print FAS ">$seq_id\n$seq"; | |
| 632 close(FAS); | |
| 633 | |
| 634 return $outfas; | |
| 635 } | |
| 636 | |
| 637 ############################################################################ | |
| 638 # RNAshapes is called with the given input or default parameters. | |
| 639 # Input: | |
| 640 # seq_fasta : the sequence fasta file | |
| 641 # rnashapes_location : the location of the installation files for RNAshapes | |
| 642 # win_size : the current window size | |
| 643 # shift : the input parameter -shift | |
| 644 # e : the input parameter -e | |
| 645 # c : the input parameter -c | |
| 646 # t : the input parameter -t | |
| 647 # u : the input parameter -u | |
| 648 # r : the input parameter -r | |
| 649 # | |
| 650 # Output: none | |
| 651 ############################################################################ | |
| 652 sub call_RNAshapes { | |
| 653 my ( $seq_fasta, $rnashapes_location, $win_size, $shift, $e, $c, $t, $u, $r, $q, $T, $i, $sample_length, $seqLen, $matchShape ) = @_; | |
| 654 my $FUNCTION = "call_RNAshapes in fasta2shrep_gspan.pl"; | |
| 655 | |
| 656 ($seq_fasta) or die("INPUT ERROR in $FUNCTION: the fasta file is compulsory!\n"); | |
| 657 ($rnashapes_location) or die( "INPUT ERROR in $FUNCTION: the RNAshapes location" . " is compulsory!\n" ); | |
| 658 die "$rnashapes_location does not exist! Exit...\n\n" if ( !-e $rnashapes_location ); | |
| 659 my $call = $rnashapes_location . " -o 1 "; # the output format is of type 1 | |
| 660 $call .= "-q " if ($q); | |
| 661 $call .= "-T $T " if ( $q and $T ); | |
| 662 $call .= "-w $win_size "; | |
| 663 $call .= "-W $shift " if ($i_shift); | |
| 664 die("ERROR in $FUNCTION: Give only one of the options -c or -e (RNAshapes)!\n") | |
| 665 | |
| 666 if ( $e && $c ); | |
| 667 $call .= "-e $e " if ($e); | |
| 668 $call .= "-c $c " if ($c); | |
| 669 $call .= "-t $t " if ($t); | |
| 670 $call .= "-u " if ( $u and not $i ); ## not possible in sampling mode | |
| 671 $call .= "-r " if ($r); | |
| 672 $call .= "-m $matchShape " if ($matchShape); | |
| 673 | |
| 674 ## check is a bit long but : we want to sample if the window is larger than $sample_length and full seq is larger than window or sample_len | |
| 675 ## necessary to do sampling a large window is given, sample_length is shorter than window, but seq is longer than sample_len | |
| 676 $call .= "-i $i -A " if ( not $q and $i and ( $win_size >= $sample_length and ( $win_size <= $seqLen or $seqLen >= $sample_length ) ) ); ## -A is to omit samples and print only combined shape probs | |
| 677 $call .= " < $seq_fasta"; | |
| 678 | |
| 679 ($i_debug) and print STDERR "$seqLen $sample_length $win_size $call\n"; | |
| 680 | |
| 681 open my $rnashapesoutput, "$call |" or die( "ERROR in $FUNCTION: The following call " . "could not be carried out!\n$call\n" ); | |
| 682 | |
| 683 return $rnashapesoutput; | |
| 684 } | |
| 685 | |
| 686 ############################################################################ | |
| 687 # The output of RNAshapes for one sequence and one window size is read | |
| 688 # and converted into graph format. | |
| 689 # Input: | |
| 690 # rnashapeoutput : filehandle for RNAshapes output in format -o 1 | |
| 691 # curr_gi : current graph index (for vertices) | |
| 692 # maxShreps : max number of shreps to convert to graphs | |
| 693 # graph_file_hdl : the output handler for the graph file | |
| 694 # graphHead : the header line for the complete graph (for sequence) | |
| 695 # winSize : current window size in input for RNAshapes | |
| 696 # seqLen : full input seq length | |
| 697 # used_t : | |
| 698 # annotate: | |
| 699 # abstr : | |
| 700 # cue : | |
| 701 # stacks : | |
| 702 # orig_seq : the nucleotide sequence as read from fasta | |
| 703 # | |
| 704 # Output: | |
| 705 # The current graph index | |
| 706 ############################################################################ | |
| 707 sub convert_RNAshapes_output { | |
| 708 my ( $rnashapesoutput, $curr_gi, $maxShreps, $graph_file_hdl, $graphHead, $winSize, $seqLen, $used_t, $annotate, $abstr, $cue, $stacks, $orig_seq ) = @_; | |
| 709 | |
| 710 ## omit first line in output, contains fasta header line | |
| 711 my $line = <$rnashapesoutput>; | |
| 712 my $win_shreps = []; | |
| 713 my $win_start; | |
| 714 my $win_end; | |
| 715 my $win_seq; | |
| 716 my $win_shrep_count = 0; | |
| 717 my $win_sample = 0; | |
| 718 my $winHead; | |
| 719 my $win_globalFolding; | |
| 720 my $win_size_real; | |
| 721 | |
| 722 # reading RNAshapes output | |
| 723 while ( $line = <$rnashapesoutput> ) { | |
| 724 | |
| 725 if ( $line =~ /^(\d+)\s+(\d+)$/ ) { | |
| 726 ## line: "<start> <end>" | |
| 727 | |
| 728 if ( @{$win_shreps} > 0 ) { | |
| 729 | |
| 730 print $graph_file_hdl $winHead; | |
| 731 | |
| 732 ## remove SHAPE="_" shrep if it exists in $win_shreps | |
| 733 ## do this only when have a sequence graph already | |
| 734 if ($i_add_seq_graph_t) { | |
| 735 my @new_win_shreps = (); | |
| 736 map { push( @new_win_shreps, $_ ) if ( $_->[ $#{$_} ] ne "_" ) } @{$win_shreps}; | |
| 737 $win_shreps = \@new_win_shreps; | |
| 738 } | |
| 739 | |
| 740 ## add graph with no structure depending on $i_add_seq_graph_win to win | |
| 741 if ($i_add_seq_graph_win ) { | |
| 742 ( $curr_gi ) = convertSeqWindow( $win_seq, $win_size_real, | |
| 743 $win_start, $curr_gi, $winHead, $graph_file_hdl, $annotate, $orig_seq ); | |
| 744 } | |
| 745 | |
| 746 $curr_gi = convertShapeWindow( $win_shreps, $win_seq, $win_size_real, | |
| 747 $win_start, $curr_gi, $graph_file_hdl, $winHead, $annotate, $abstr, | |
| 748 $cue, $stacks, $orig_seq ); | |
| 749 } | |
| 750 | |
| 751 ## set new window params | |
| 752 $win_shreps = []; | |
| 753 $win_start = $1; | |
| 754 $win_end = $2; | |
| 755 $win_shrep_count = 0; | |
| 756 $win_size_real = $win_end - $win_start + 1; | |
| 757 | |
| 758 if ( ($win_size_real) >= $seqLen ) { | |
| 759 $win_globalFolding = 1; | |
| 760 } else { | |
| 761 $win_globalFolding = 0; | |
| 762 } | |
| 763 my $win_center = $win_start + ( ( $win_size_real + 1 ) / 2 ); | |
| 764 | |
| 765 $winHead = getWindowHeader( $graphHead, $winSize, $win_start, $win_end, $win_center, $win_globalFolding, $win_sample, $used_t ); | |
| 766 | |
| 767 } elsif ( $line =~ /^(\S+)$/ ) { | |
| 768 ## line: "CUUAUGAGUAAGGAAAAUAACGAUUCGGGGUGACGCCCGAAUCCUCACUG" | |
| 769 $win_seq = uc($1); ## to be 101% sure that we have by default uppercase chars | |
| 770 | |
| 771 } elsif ( $line =~ /^Results for (\d+) iterations:$/ ) { | |
| 772 ## line: "Results for 10 iterations:" | |
| 773 $win_sample = $1; | |
| 774 | |
| 775 } elsif ( $line =~ /^Shape\s+\S+\s+not found within energy range.*$/ ) { | |
| 776 ## line: "Shape [] not found within energy range (-24.75 to -27.50). Try -c or -e to increase range." | |
| 777 $win_shreps = []; | |
| 778 | |
| 779 } elsif ( $line =~ /^([\(\)\.]+)\s+\((\S+)\)\s+(\S+)$/ ) { | |
| 780 ## line:"...((((..(((....)))))))...........(((((......))))) (-10.10) [[]][]" | |
| 781 ## take only $maxShreps shreps per window if set | |
| 782 next if ( $maxShreps && $win_shrep_count >= $maxShreps ); | |
| 783 push( @{$win_shreps}, [ $1, "ENERGY", $2, "SHAPE", $3 ] ); | |
| 784 $win_shrep_count++; | |
| 785 | |
| 786 } elsif ( $line =~ /^([\(\)\.]+)\s+\((\S+)\)\s+\((\S+)\)\s+(\S+)$/ ) { | |
| 787 ## line:"((((..((((...((.((.((.....)).)).))...))))..))))... (-10.60) (0.7795360) [[[[[]]]]]" | |
| 788 ## take only $maxShreps shreps per window if set | |
| 789 next if ( $maxShreps && $win_shrep_count >= $maxShreps ); | |
| 790 push( @{$win_shreps}, [ $1, "ENERGY", $2, "PROB", $3, "SHAPE", $4 ] ); | |
| 791 $win_shrep_count++; | |
| 792 | |
| 793 } elsif ( $line =~ /^([\(\)\.]+)\s+\((\S+)\)\s+(\S+)\s+(\S+)$/ ) { | |
| 794 ## line:"((((..((((...((.((.((.....)).)).))...))))..))))... (-10.60) 0.3000000 [[[[[]]]]]" | |
| 795 ## take only $maxShreps shreps per window if set | |
| 796 next if ( $maxShreps && $win_shrep_count >= $maxShreps ); | |
| 797 push( @{$win_shreps}, [ $1, "ENERGY", $2, "SHAPEPROB", $3, "SHAPE", $4 ] ); | |
| 798 $win_shrep_count++; | |
| 799 | |
| 800 } elsif ( $line =~ /^([\(\)\.]+)\s+\((\S+)\)\s+\((\S+)\)\s+(\S+)\s+(\S+)$/ ) { | |
| 801 ## line:"((((..((((...((.((.((.....)).)).))...))))..))))... (-10.60) (0.7795360) 0.3000000 [[[[[]]]]]" | |
| 802 ## take only $maxShreps shreps per window if set | |
| 803 next if ( $maxShreps && $win_shrep_count >= $maxShreps ); | |
| 804 push( @{$win_shreps}, [ $1, "ENERGY", $2, "PROB", $3, "SHAPEPROB", $4, "SHAPE", $5 ] ); | |
| 805 $win_shrep_count++; | |
| 806 | |
| 807 } else { | |
| 808 next if ( $line =~ /^$/ ); | |
| 809 die "Unexpected shape output format!\nline=$line\n\nExit...\n\n"; | |
| 810 } | |
| 811 } | |
| 812 | |
| 813 ## convert last windows | |
| 814 if ( @{$win_shreps} > 0 ) { | |
| 815 | |
| 816 print $graph_file_hdl $winHead; | |
| 817 | |
| 818 ## remove SHAPE="_" shrep if it exists in $win_shreps | |
| 819 ## do this only when have a sequence graph already | |
| 820 if ($i_add_seq_graph_t) { | |
| 821 my @new_win_shreps = (); | |
| 822 map { push( @new_win_shreps, $_ ) if ( $_->[ $#{$_} ] ne "_" ) } @{$win_shreps}; | |
| 823 $win_shreps = \@new_win_shreps; | |
| 824 } | |
| 825 | |
| 826 ## add graph with no structure depending on $i_add_seq_graph_win to win | |
| 827 if ($i_add_seq_graph_win) { | |
| 828 ($curr_gi) = convertSeqWindow( $win_seq, $win_size_real, $win_start, | |
| 829 $curr_gi, $winHead, $graph_file_hdl, $annotate, $orig_seq ); | |
| 830 } | |
| 831 | |
| 832 $curr_gi = convertShapeWindow( $win_shreps, $win_seq, $win_size_real, | |
| 833 $win_start, $curr_gi, $graph_file_hdl, $winHead, $annotate, $abstr, $cue, | |
| 834 $stacks, $orig_seq ); | |
| 835 } | |
| 836 | |
| 837 close($rnashapesoutput); | |
| 838 | |
| 839 return $curr_gi + 1; # return the gi (graph index) for the next subgraph | |
| 840 } | |
| 841 | |
| 842 ## Sub to create graph for a complete unstructured sequence | |
| 843 # TODO: document function | |
| 844 sub convertSeqWindow { | |
| 845 my ( $win_seq, $win_size_real, $win_start, $curr_gi, $winHead, $graph_file_hdl, $annotate, $orig_seq ) = @_; | |
| 846 | |
| 847 my $seq_shrep; | |
| 848 | |
| 849 # use a different alphabet (lowercase) if option seq-graph-alph set | |
| 850 my $seq_graph_sequence = $i_change_seq_graph_alph ? lc($win_seq) : uc($win_seq); | |
| 851 $seq_shrep = [ "." x $win_size_real, "ENERGY", "0.00", "SHAPE", "_", "STRUCT", "." x $win_size_real, "SEQ", $seq_graph_sequence ]; | |
| 852 my $backboneGraph_ref = getBackboneGraph( $seq_graph_sequence, $curr_gi, $win_start, 0, ( $win_size_real - 1 ), $orig_seq ); | |
| 853 | |
| 854 print $graph_file_hdl getSeqHeader( $winHead, $seq_shrep ); | |
| 855 print $graph_file_hdl join( "\n", @{$backboneGraph_ref} ) . "\n"; | |
| 856 | |
| 857 $curr_gi += $win_size_real; | |
| 858 | |
| 859 if ($annotate) { | |
| 860 ## TODO add annotation graphs | |
| 861 } | |
| 862 | |
| 863 return ($curr_gi); | |
| 864 } | |
| 865 | |
| 866 ############################################################################ | |
| 867 # The results for one window are converted in this method to GSPAN graphs. | |
| 868 # Input: | |
| 869 # win_shreps_aref : the array ref of shreps (dot-bracket structures) for the | |
| 870 # current window | |
| 871 # win_seq : the nucleotide sequence for the current window | |
| 872 # win_size_real : the current (true) window size | |
| 873 # win_start : the starting position of win_seq in the original seq (1-n) | |
| 874 # curr_gi : the current graph index | |
| 875 # graph_file_hdl : the graph file handler | |
| 876 # winHead : the header line for the current sequence window | |
| 877 # TODO annotate | |
| 878 # abstr : input parameter -abstr | |
| 879 # cue : input parameter -cue | |
| 880 # stacks : input parameter -stacks | |
| 881 # orig_seq : the nucleotide sequence as read from fasta | |
| 882 # | |
| 883 # Output: | |
| 884 # The current graph index | |
| 885 ############################################################################ | |
| 886 sub convertShapeWindow { | |
| 887 my ( $win_shreps_aref, $win_seq, $win_size_real, $win_start, $curr_gi, | |
| 888 $graph_file_hdl, $winHead, $annotate, $abstr, $cue, $stacks, $orig_seq ) = @_; | |
| 889 | |
| 890 ## generate for each shrep a connected component in gspan file | |
| 891 foreach my $shrep ( @{$win_shreps_aref} ) { | |
| 892 | |
| 893 # get the current gi as it was at the beginning of this shrep | |
| 894 my $shrep_gi = $curr_gi; | |
| 895 | |
| 896 # cut off unpaired ends, if option is given | |
| 897 my $shrep_struct = $shrep->[0]; | |
| 898 my $crop_index_left = 0; | |
| 899 my $crop_index_right = length($shrep_struct) - 1; | |
| 900 | |
| 901 # find croping indices with numbering from 0 to n-1 | |
| 902 if ($cue) { | |
| 903 $crop_index_left = index( $shrep_struct, "(" ); # find 1st occ of "(" | |
| 904 $crop_index_right = rindex( $shrep_struct, ")" ); # find last occ of ")" | |
| 905 | |
| 906 # if the complete window is unpaired, then don't crop | |
| 907 if ( $crop_index_left == -1 ) { | |
| 908 $crop_index_left = 0; | |
| 909 $crop_index_right = length($shrep_struct) - 1; | |
| 910 } | |
| 911 } | |
| 912 | |
| 913 # create structure graph | |
| 914 my $backboneGraph_ref = getBackboneGraph( $win_seq, $curr_gi, $win_start, $crop_index_left, $crop_index_right, $orig_seq ); | |
| 915 my $structGraph_ref; | |
| 916 | |
| 917 # add both basic edges, stacks and abstract graph | |
| 918 if ($abstr) { | |
| 919 ( $structGraph_ref, $curr_gi ) = getStructPlusAbstractStructGraph( $shrep, $curr_gi, $win_size_real, $cue, $stacks ); | |
| 920 | |
| 921 # just add basic structure graph edges, and stacks | |
| 922 } else { | |
| 923 ( $structGraph_ref, $curr_gi ) = getStructGraph( $shrep, $curr_gi, $win_size_real, $stacks ); | |
| 924 } | |
| 925 | |
| 926 # additional information for shrep header: sequence and dot-bracket | |
| 927 my $crop_length = $crop_index_right - $crop_index_left + 1; | |
| 928 my $shrepheader_struct = substr( $shrep_struct, $crop_index_left, $crop_length ); | |
| 929 my $shrepheader_seq = substr( $win_seq, $crop_index_left, $crop_length ); | |
| 930 push( @{$shrep}, 'STRUCT', $shrepheader_struct, 'SEQ', $shrepheader_seq ); | |
| 931 | |
| 932 # print structure graph to file | |
| 933 print $graph_file_hdl getShapeHeader( $winHead, $shrep ); | |
| 934 print $graph_file_hdl join( "\n", @{$backboneGraph_ref} ) . "\n"; | |
| 935 | |
| 936 # don't print empty array; sgdnspdk breaks with empty lines | |
| 937 if ( @{$structGraph_ref} > 0 ) { | |
| 938 print $graph_file_hdl join( "\n", @{$structGraph_ref} ) . "\n"; | |
| 939 | |
| 940 if ($annotate) { | |
| 941 | |
| 942 # TODO add annotations to shrep structure | |
| 943 # $win_start, $win_end (1-n) | |
| 944 } | |
| 945 | |
| 946 } | |
| 947 } | |
| 948 | |
| 949 return $curr_gi; | |
| 950 } | |
| 951 | |
| 952 ########################################################################### | |
| 953 # Here the vertices for the nucleotides are generated and also the backbone | |
| 954 # edges, which connect the nucleotides in their correct order (i.e order | |
| 955 # of the given sequence) | |
| 956 # Input: | |
| 957 # win_seq : the sequence for the current window | |
| 958 # win_start : the starting position for the current window | |
| 959 # curr_crop_i_left : the cropping index for the left end of sequence (-cue) | |
| 960 # curr_crop_i_right : the cropping index for the right end of sequence (-cue) | |
| 961 # orig_seq : the nucleotide sequence as read from fasta | |
| 962 # | |
| 963 # Output: | |
| 964 # The lines of the graph in an array reference to be printed to a file with | |
| 965 # one element per line. | |
| 966 ############################################################################ | |
| 967 sub getBackboneGraph { | |
| 968 my ( $win_seq, $curr_gi, $win_start, $curr_crop_i_left, $curr_crop_i_right, $orig_seq ) = @_; | |
| 969 | |
| 970 # RNAshapes substitutes T -> U, sequence only does not | |
| 971 # thus, just transform all t/T -> u/U to remain consistent | |
| 972 $orig_seq =~ tr /tT/uU/; | |
| 973 $win_seq =~ tr /tT/uU/; | |
| 974 | |
| 975 my @seq; | |
| 976 @seq = split( "", $win_seq ); | |
| 977 | |
| 978 # if the vp option is set we need to obtain the original capitalization from | |
| 979 # $orig_seq and extract the windowed sequence manually | |
| 980 my @seq_vp; | |
| 981 if ( defined $i_vp ) { | |
| 982 my $win_len = length($win_seq); | |
| 983 my $capitalized_win_seq = substr( $orig_seq, $win_start - 1, $win_len ); | |
| 984 ( uc($capitalized_win_seq) eq uc($win_seq) ) or | |
| 985 die( "error: windowed sequence generated due to vp option not equal " . | |
| 986 "to sequence reported by RNASHAPES.\n" . | |
| 987 "'${capitalized_win_seq}' != '${win_seq}'" ); | |
| 988 @seq_vp = split( "", $capitalized_win_seq ); | |
| 989 } | |
| 990 | |
| 991 my @vert = (); | |
| 992 my @edg = (); | |
| 993 | |
| 994 my $curr_abs_pos = $win_start + $curr_crop_i_left; | |
| 995 $curr_gi += $curr_crop_i_left; | |
| 996 | |
| 997 # set vertice labeled with 'v' or 'V' according to sequence and vp option | |
| 998 # when vp set, | |
| 999 # uppercase nucleotides are annotated with 'v' | |
| 1000 # lowercase nucleotides are annotated with 'V' | |
| 1001 my $vertice_label = ( defined $i_vp and ( $seq_vp[$curr_crop_i_left] =~ /[a-z]/ ) ) ? 'V' : 'v'; | |
| 1002 | |
| 1003 # create backbone vertice of first nucleotide | |
| 1004 push( @vert, join( ' ', $vertice_label, $curr_gi, $seq[$curr_crop_i_left], $curr_abs_pos ) ); | |
| 1005 | |
| 1006 foreach my $idx ( ( $curr_crop_i_left + 1 ) .. $curr_crop_i_right ) { | |
| 1007 $curr_abs_pos++; | |
| 1008 $curr_gi++; | |
| 1009 | |
| 1010 # set vertice label as described above | |
| 1011 $vertice_label = ( defined $i_vp and ( $seq_vp[$idx] =~ /[a-z]/ ) ) ? 'V' : 'v'; | |
| 1012 push( @vert, join( ' ', $vertice_label, $curr_gi, $seq[$idx], $curr_abs_pos ) ); | |
| 1013 push( @edg, join( ' ', "e", $curr_gi - 1, $curr_gi, '> 1' ) ); | |
| 1014 } | |
| 1015 | |
| 1016 my @ret = ( @vert, @edg ); | |
| 1017 | |
| 1018 return \@ret; | |
| 1019 } | |
| 1020 | |
| 1021 ########################################################################### | |
| 1022 # This method does the same as getStructGraph, but is extended to identify | |
| 1023 # the abstract parts of the structure and also add this to the graph | |
| 1024 # via abstract relations, see information below. | |
| 1025 # We already have the backbone graph, now the base-pair edges need to be | |
| 1026 # added to this graph and the abstract graph is added after this. Finally | |
| 1027 # the abstract relations, pointing from the abstract level to the basic | |
| 1028 # structure level is added. All the while, we keep track of the current | |
| 1029 # index graph. Furthermore, in the basic structure graph, we have vertices | |
| 1030 # stacks that connects the four stacked base-pairs involved. (symbols P | |
| 1031 # for vertex and p for edges). The abstract vertices are labelled according | |
| 1032 # to the given name-space, then #, then the abstract structure type, e.g. | |
| 1033 # HL for hairpin-loop. The edges between the structure types are denoted | |
| 1034 # with n, and the relation vertex is ^R, with ^r going from the abstract | |
| 1035 # structure to the relation and from the relation to the basic structure | |
| 1036 # is @r. | |
| 1037 # | |
| 1038 # Input: | |
| 1039 # curr_shrep : the current shrep as dot-bracket structure | |
| 1040 # curr_gi : the current graph index | |
| 1041 # win_size : the window size | |
| 1042 # cue : the input parameter -cue (whether to crop unpaired ends) | |
| 1043 # stacks : the input parameter -stacks (whether to add stack information) | |
| 1044 # | |
| 1045 # Output: | |
| 1046 # The structure graph information line-by-line as an array reference and | |
| 1047 # the current graph index. | |
| 1048 # | |
| 1049 ############################################################################ | |
| 1050 sub getStructPlusAbstractStructGraph { | |
| 1051 my ( $curr_shrep, $curr_gi, $win_size, $cue, $stacks ) = @_; | |
| 1052 | |
| 1053 # $win_size = length($curr_shrep->[0]); | |
| 1054 | |
| 1055 my @struct = split( "", $curr_shrep->[0] ); | |
| 1056 | |
| 1057 # print "=================testing: new shrep : gi=$curr_gi ===============================\n" if ($i_debug); | |
| 1058 | |
| 1059 # OBJECTS and VARIABLES | |
| 1060 # all indices are saved according to the current graph index, $curr_gi, | |
| 1061 # which is at the beginning of the current window, so that the nucleotide | |
| 1062 # index can be inferred using $idx + $curr_gi | |
| 1063 # opening brackets | |
| 1064 my @open_blks = (); # open blocks of consecutive opening brackets (array of arrays) | |
| 1065 my @p_open = (); # currently open block of the last identified complete block | |
| 1066 my @c_open = (); # current "incomplete" block of consecutive opening brackets | |
| 1067 # base-pair objects | |
| 1068 my %bps = (); # hash of all base-pairs with stem for a value, if it is at | |
| 1069 # the BP is opening or closing a stem object | |
| 1070 my @c_bp = (); # current BP, just being closed at current index pos (i,j) | |
| 1071 my @p_bp = (); # previously closed BP at position before current index | |
| 1072 # stem objects | |
| 1073 my %stems = (); # all stems, key="i:j,k:l", value=stem array, (i,j) outer BP | |
| 1074 my @c_stem = (); # current incomplete stem object | |
| 1075 my @stmbrks = (); # all stem-breaks in the form of (i,k,l), where i is the remaining | |
| 1076 # open base and (k,l) is the subsequent closing base-pair | |
| 1077 | |
| 1078 # my @p_stmbrk = (); # previous stem-break in the form of (i,k,l), where i is the remaining | |
| 1079 # # open base and (k,l) is the subsequent closing base-pair | |
| 1080 # loop objects | |
| 1081 my @up = (); # all unpaired regions (i,j) that are of unkown type | |
| 1082 my @c_up = (); # current incomplete unpaired object | |
| 1083 my @hls = (); # all hairpin loop objects | |
| 1084 my @bls = (); # all bulge-loop objects | |
| 1085 my @ils = (); # all internal loops | |
| 1086 my @mls = (); # all multi-loops | |
| 1087 my @c_mls = (); # current incomplete multi-loops! (there can be more than one) | |
| 1088 my @els = (); # all external loops | |
| 1089 my @c_el = (); # current incomplete external loop | |
| 1090 | |
| 1091 # iterate through shrep structure an identify abstract parts | |
| 1092 foreach my $idx ( 0 .. @struct - 1 ) { | |
| 1093 | |
| 1094 #=================================================== | |
| 1095 # current char is the opening bracket of a base-pair | |
| 1096 if ( $struct[$idx] eq "(" ) { | |
| 1097 | |
| 1098 #=================================================== | |
| 1099 | |
| 1100 # update current window index to current graph index | |
| 1101 $idx += $curr_gi; | |
| 1102 | |
| 1103 # case: '((' currently there is an open block, extend it | |
| 1104 if (@c_open) { | |
| 1105 push( @c_open, $idx ); | |
| 1106 | |
| 1107 # case: '.(' or ')(' or BOS'(' there is no open block, create a new one ".(" or ")(" | |
| 1108 } else { | |
| 1109 | |
| 1110 # begin a new current open block | |
| 1111 push( @c_open, $idx ); | |
| 1112 | |
| 1113 # case: ')(' | |
| 1114 if (@c_stem) { | |
| 1115 | |
| 1116 # CLOSE STEM | |
| 1117 print STDERR "TEST - closed stem:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@c_stem ) ) if ($i_debug); | |
| 1118 close_stem( \@c_stem, \%stems, \@p_bp, \@c_bp, \@p_open, \@stmbrks, \@open_blks, \%bps ); | |
| 1119 | |
| 1120 # case (x(x)( | |
| 1121 if (@open_blks) { | |
| 1122 | |
| 1123 # EXTEND_ML, if one exists | |
| 1124 if (@c_mls) { | |
| 1125 extend_or_open_nested_ML( $curr_shrep, \@c_mls, \@p_bp, \@up, \@stmbrks, \@open_blks, $idx, $curr_gi ); | |
| 1126 | |
| 1127 # OPEN_ML | |
| 1128 } else { | |
| 1129 | |
| 1130 # case 'x((x)(' the opening of the ML was a stem-break | |
| 1131 if ( @stmbrks | |
| 1132 && $stmbrks[-1]->[1] == $p_bp[0] | |
| 1133 && $stmbrks[-1]->[2] == $p_bp[1] ) { | |
| 1134 my @c_stmbrk = @{ pop(@stmbrks) }; | |
| 1135 my @a = ( $c_stmbrk[0], -1 ); | |
| 1136 my @ml = (); | |
| 1137 push( @ml, \@a ); # 1st base-pair is in-waiting | |
| 1138 my @a2 = ( $c_stmbrk[1], $c_stmbrk[2] ); | |
| 1139 push( @ml, \@a2 ); #2nd BP | |
| 1140 my @a3 = ($idx); | |
| 1141 push( @ml, \@a3 ); #first of current BP | |
| 1142 push( @c_mls, \@ml ); | |
| 1143 print STDERR "TEST - opened ML with SB:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) ) if ($i_debug); | |
| 1144 | |
| 1145 # case 'x(.(x)(' | |
| 1146 } elsif (@up) { | |
| 1147 my @tmp_up = @{ pop(@up) }; | |
| 1148 | |
| 1149 # compare adjacent BP to UP to see if they fit | |
| 1150 if ( $tmp_up[1] + 1 == $p_bp[0] | |
| 1151 && $open_blks[-1]->[-1] == $tmp_up[0] - 1 ) { | |
| 1152 my @a = ( $open_blks[-1]->[-1], -1 ); | |
| 1153 my @ml = (); | |
| 1154 push( @ml, \@a ); # 1st BP awaits closing | |
| 1155 my @newbp = @p_bp; | |
| 1156 push( @ml, \@newbp ); # 2nd BP | |
| 1157 my @a2 = ($idx); | |
| 1158 push( @ml, \@a2 ); # current open BP, awaits closing | |
| 1159 push( @c_mls, \@ml ); | |
| 1160 print STDERR "TEST - opened ML with prev UP1:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) ) | |
| 1161 | |
| 1162 if ($i_debug); | |
| 1163 } else { | |
| 1164 die "ERROR: the base-pairs to not match the " . "previous unpaired region?!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ); | |
| 1165 } | |
| 1166 } else { | |
| 1167 die "ERROR: in case ML, but there is no initial" . "part of the ML to be identified\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ); | |
| 1168 } | |
| 1169 } | |
| 1170 | |
| 1171 # case: 'x(x)(' | |
| 1172 } else { | |
| 1173 | |
| 1174 # case: '.(x)(' => CLOSE_EL | |
| 1175 if (@c_el) { | |
| 1176 if ( $c_el[-1] == $p_bp[0] ) { | |
| 1177 push( @c_el, $p_bp[1] ); | |
| 1178 my @newel = @c_el; | |
| 1179 push( @els, \@newel ); | |
| 1180 @c_el = (); | |
| 1181 print STDERR "TEST - closed EL1:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newel ) ) if ($i_debug); | |
| 1182 } else { | |
| 1183 die "ERROR: the previous BP must match the current EL\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ); | |
| 1184 } | |
| 1185 } | |
| 1186 | |
| 1187 # in both cases OPEN_EL | |
| 1188 push( @c_el, @p_bp ); | |
| 1189 push( @c_el, $idx ); | |
| 1190 print STDERR "TEST - opened EL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@c_el ) ) if ($i_debug); | |
| 1191 } | |
| 1192 | |
| 1193 # case '.(' | |
| 1194 } elsif (@c_up) { | |
| 1195 | |
| 1196 # case (x(x).( OR 'x(.(' | |
| 1197 if (@open_blks) { | |
| 1198 | |
| 1199 # case 'x(x).(' ML | |
| 1200 if ( @p_bp && ( $p_bp[1] == $c_up[0] - 1 ) ) { | |
| 1201 | |
| 1202 # case '((x)x(x).(' EXTEND_ML, if one exists | |
| 1203 if (@c_mls) { | |
| 1204 | |
| 1205 extend_or_open_nested_ML( $curr_shrep, \@c_mls, \@p_bp, \@up, \@stmbrks, \@open_blks, $idx, $curr_gi ); | |
| 1206 | |
| 1207 # case 'x((x).(' or 'x(.(x).(' OPEN_ML | |
| 1208 } else { | |
| 1209 | |
| 1210 # case 'x((x).(' the opening of the ML was a stem-break | |
| 1211 if ( @stmbrks | |
| 1212 && $stmbrks[-1]->[1] == $p_bp[0] | |
| 1213 && $stmbrks[-1]->[2] == $p_bp[1] ) { | |
| 1214 my @c_stmbrk = @{ pop(@stmbrks) }; | |
| 1215 my @a = ( $c_stmbrk[0], -1 ); | |
| 1216 my @ml = (); | |
| 1217 push( @ml, \@a ); # 1st base-pair is in-waiting | |
| 1218 my @a2 = ( $c_stmbrk[1], $c_stmbrk[2] ); | |
| 1219 push( @ml, \@a2 ); #2nd BP | |
| 1220 my @a3 = ($idx); | |
| 1221 push( @ml, \@a3 ); #first of current BP | |
| 1222 push( @c_mls, \@ml ); | |
| 1223 print STDERR "TEST - open ML with SB:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) ) if ($i_debug); | |
| 1224 | |
| 1225 # case 'x(.(x).(' opening the ML with initial UP region | |
| 1226 } elsif (@up) { | |
| 1227 | |
| 1228 # get previous unpaired region | |
| 1229 my $tmp = pop(@up); | |
| 1230 my @p_up = @{$tmp}; | |
| 1231 | |
| 1232 # case: '(.(x).(' compare adjacent BP to UP to see if they create a ML | |
| 1233 if ( $p_up[1] + 1 == $p_bp[0] | |
| 1234 && $open_blks[-1]->[-1] == $p_up[0] - 1 ) { | |
| 1235 my @a = ( $open_blks[-1]->[-1], -1 ); | |
| 1236 my @ml = (); | |
| 1237 push( @ml, \@a ); # 1st BP awaits closing | |
| 1238 my @newbp = @p_bp; | |
| 1239 push( @ml, \@newbp ); # 2nd BP | |
| 1240 my @a2 = ($idx); | |
| 1241 push( @ml, \@a2 ); # current open BP, awaits closing | |
| 1242 push( @c_mls, \@ml ); | |
| 1243 print STDERR "TEST - opened ML with prev UP2:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) ) | |
| 1244 | |
| 1245 if ($i_debug); | |
| 1246 } | |
| 1247 | |
| 1248 # has to be an ML, but can't find first part | |
| 1249 } else { | |
| 1250 die "ERROR: in case ML, but there is no initial " . "part of the ML to be identified\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ); | |
| 1251 } | |
| 1252 } | |
| 1253 | |
| 1254 # case '(.( | |
| 1255 } else { | |
| 1256 my @newup = @c_up; | |
| 1257 push( @up, \@newup ); | |
| 1258 print STDERR "TEST - closed UP:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newup ) ) if ($i_debug); | |
| 1259 } | |
| 1260 | |
| 1261 # case: 'x(x).(' OR 'EOS.(' | |
| 1262 } else { | |
| 1263 | |
| 1264 # case: '.(x)(' => CLOSE_EL | |
| 1265 if (@c_el) { | |
| 1266 if ( $c_el[-1] == $c_bp[0] ) { | |
| 1267 push( @c_el, $p_bp[1] ); | |
| 1268 my @newel = @c_el; | |
| 1269 push( @els, \@newel ); | |
| 1270 @c_el = (); | |
| 1271 print STDERR "TEST - closed EL2:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newel ) ) if ($i_debug); | |
| 1272 } else { | |
| 1273 die "ERROR: the previous BP must match the current EL\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ); | |
| 1274 } | |
| 1275 } | |
| 1276 | |
| 1277 # in both cases OPEN_EL | |
| 1278 if (@p_bp) { | |
| 1279 push( @c_el, @p_bp ); | |
| 1280 push( @c_el, $idx ); | |
| 1281 } else { | |
| 1282 | |
| 1283 # ignore this if -cue is given | |
| 1284 # TODO check the gi index | |
| 1285 if ($cue) { | |
| 1286 @c_el = (); | |
| 1287 } else { | |
| 1288 push( @c_el, ( $curr_gi, $curr_gi ) ); | |
| 1289 push( @c_el, $idx ); | |
| 1290 print STDERR "TEST - opened EL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@c_el ) ) if ($i_debug); | |
| 1291 } | |
| 1292 } | |
| 1293 } | |
| 1294 @c_up = (); | |
| 1295 | |
| 1296 } else { | |
| 1297 | |
| 1298 # no more cases except opening bracket at beginning of sequence | |
| 1299 # in this case do nothing, as the index has already been added | |
| 1300 die "ERROR: '((', ')(', '.(' have all been covered\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ) | |
| 1301 unless ( $idx == $curr_gi ); | |
| 1302 | |
| 1303 } | |
| 1304 } | |
| 1305 | |
| 1306 #=================================================== | |
| 1307 # current char is the unpaired base | |
| 1308 } elsif ( $struct[$idx] eq "." ) { | |
| 1309 | |
| 1310 #=================================================== | |
| 1311 | |
| 1312 # update current window index to current graph index | |
| 1313 $idx += $curr_gi; | |
| 1314 | |
| 1315 # case ').' => CLOSE_STEM | |
| 1316 if (@c_stem) { | |
| 1317 print STDERR "TEST - closed stem:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@c_stem ) ) if ($i_debug); | |
| 1318 close_stem( \@c_stem, \%stems, \@p_bp, \@c_bp, \@p_open, \@stmbrks, \@open_blks, \%bps ); | |
| 1319 } | |
| 1320 | |
| 1321 # case '..' extend UP, @c_up | |
| 1322 if (@c_up) { | |
| 1323 $c_up[1] = $idx; | |
| 1324 | |
| 1325 # case 'x.' open new UP | |
| 1326 } else { | |
| 1327 | |
| 1328 # OPEN_UP, @c_up | |
| 1329 @c_up = ( $idx, $idx ); | |
| 1330 | |
| 1331 # case '(.' just come to the end of an open block, push it onto stack | |
| 1332 if (@c_open) { | |
| 1333 my @newopen = @c_open; | |
| 1334 push( @open_blks, \@newopen ); | |
| 1335 @c_open = (); | |
| 1336 | |
| 1337 # case ').' or '.' | |
| 1338 # (have already closed stem and created stem-break and emptied p_open) | |
| 1339 } else { | |
| 1340 | |
| 1341 # case 'x(x).' or '.' => CLOSE_EL | |
| 1342 if ( !@open_blks ) { | |
| 1343 | |
| 1344 # case 'x().().' | |
| 1345 if (@c_el) { | |
| 1346 | |
| 1347 # double check | |
| 1348 if ( $c_el[-1] == $p_bp[0] ) { | |
| 1349 push( @c_el, $p_bp[1] ); | |
| 1350 my @newel = @c_el; | |
| 1351 push( @els, \@newel ); | |
| 1352 @c_el = (); | |
| 1353 print STDERR "TEST - closed EL3:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newel ) ) if ($i_debug); | |
| 1354 } else { | |
| 1355 die "This case should not occur!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ); | |
| 1356 } | |
| 1357 } | |
| 1358 } | |
| 1359 } | |
| 1360 } | |
| 1361 | |
| 1362 #=================================================== | |
| 1363 # current char is the closing bracket of a base-pair | |
| 1364 } elsif ( $struct[$idx] eq ")" ) { | |
| 1365 | |
| 1366 #=================================================== | |
| 1367 | |
| 1368 # update current window index to current graph index | |
| 1369 $idx += $curr_gi; | |
| 1370 | |
| 1371 # save previous base-pair | |
| 1372 if (@c_bp) { | |
| 1373 @p_bp = @c_bp; | |
| 1374 } | |
| 1375 | |
| 1376 # case: '((x))', extend stem | |
| 1377 if (@p_open) { | |
| 1378 | |
| 1379 # get current base-pair | |
| 1380 @c_bp = ( pop(@p_open), $idx ); | |
| 1381 | |
| 1382 # add it to the base-pair hash | |
| 1383 $bps{"$c_bp[0]:$c_bp[1]"} = ""; | |
| 1384 | |
| 1385 # EXTEND_STEM | |
| 1386 if (@c_stem) { | |
| 1387 $c_stem[0] = $c_bp[0]; | |
| 1388 $c_stem[1] = $c_bp[1]; | |
| 1389 } else { | |
| 1390 die "ERROR: in case '))' there has to be an open stem object!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ); | |
| 1391 } | |
| 1392 | |
| 1393 # case: '(x(x))' or '(x.)', '()' not allowed | |
| 1394 } else { | |
| 1395 | |
| 1396 # case: '(x(x))' CLOSE_STEM close previous stem | |
| 1397 if (@c_stem) { | |
| 1398 print STDERR "TEST - closed stem:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@c_stem ) ) if ($i_debug); | |
| 1399 close_stem( \@c_stem, \%stems, \@p_bp, \@c_bp, \@p_open, \@stmbrks, \@open_blks, \%bps ); | |
| 1400 | |
| 1401 # case '(x(x))' OPEN_STEM | |
| 1402 if (@open_blks) { | |
| 1403 my $tmp = pop(@open_blks); | |
| 1404 @p_open = @{$tmp}; | |
| 1405 | |
| 1406 # get current base-pair | |
| 1407 @c_bp = ( pop(@p_open), $idx ); | |
| 1408 | |
| 1409 # add it to the base-pair hash | |
| 1410 # as the stem is not finished yet, cannot add stem information | |
| 1411 $bps{"$c_bp[0]:$c_bp[1]"} = ""; | |
| 1412 | |
| 1413 # open stem | |
| 1414 $c_stem[0] = $c_bp[0]; | |
| 1415 $c_stem[1] = $c_bp[1]; | |
| 1416 $c_stem[2] = $c_bp[0]; | |
| 1417 $c_stem[3] = $c_bp[1]; | |
| 1418 } else { | |
| 1419 die "ERROR: there have to be open blocks to match " . "current closing bracket!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ); | |
| 1420 } | |
| 1421 | |
| 1422 # case: '(.(x))' => CLOSE_BL | |
| 1423 if ( @up && $up[-1]->[0] - 1 == $c_bp[0] && $up[-1]->[1] + 1 == $p_bp[0] ) { | |
| 1424 my @newBL = ( @c_bp, @p_bp ); | |
| 1425 push( @bls, \@newBL ); | |
| 1426 pop(@up); | |
| 1427 print STDERR "TEST - closed BL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newBL ) ) if ($i_debug); | |
| 1428 | |
| 1429 # case: '(x(x).(x))' => CLOSE_ML | |
| 1430 } elsif ( @c_mls && $c_mls[-1]->[-1]->[-1] == $p_bp[0] ) { | |
| 1431 my @newml = @{ pop(@c_mls) }; | |
| 1432 push( @{ $newml[-1] }, $p_bp[1] ); # close last BP | |
| 1433 $newml[0]->[1] = $c_bp[1]; # close 1st BP | |
| 1434 push( @mls, \@newml ); | |
| 1435 print STDERR "TEST - closed ML1:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newml, 1 ) ) if ($i_debug); | |
| 1436 } else { | |
| 1437 die "ERROR: what is this case?\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ); | |
| 1438 } | |
| 1439 | |
| 1440 # case: (x.) | |
| 1441 } else { | |
| 1442 | |
| 1443 # OPEN_STEM open new stem | |
| 1444 if (@open_blks) { | |
| 1445 @p_open = @{ pop(@open_blks) }; | |
| 1446 | |
| 1447 # get current base-pair | |
| 1448 @c_bp = ( pop(@p_open), $idx ); | |
| 1449 | |
| 1450 # add it to the base-pair hash | |
| 1451 # as the stem is not finished yet, cannot add stem information | |
| 1452 $bps{"$c_bp[0]:$c_bp[1]"} = ""; | |
| 1453 | |
| 1454 # open stem | |
| 1455 $c_stem[0] = $c_bp[0]; | |
| 1456 $c_stem[1] = $c_bp[1]; | |
| 1457 $c_stem[2] = $c_bp[0]; | |
| 1458 $c_stem[3] = $c_bp[1]; | |
| 1459 } else { | |
| 1460 die "ERROR: there have to be open blocks to match " . "current closing bracket!$curr_shrep->[0]\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ); | |
| 1461 } | |
| 1462 | |
| 1463 # CLOSE_C_UP | |
| 1464 if (@c_up) { | |
| 1465 | |
| 1466 # case: (.) => close_HL | |
| 1467 if ( $c_up[0] - 1 == $c_bp[0] ) { | |
| 1468 my @newhl = @c_bp; | |
| 1469 push( @hls, \@newhl ); | |
| 1470 @c_up = (); | |
| 1471 print STDERR "TEST - closed HL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newhl ) ) if ($i_debug); | |
| 1472 | |
| 1473 # case: (x(x).) | |
| 1474 } elsif ( @p_bp && $c_up[0] - 1 == $p_bp[1] ) { | |
| 1475 | |
| 1476 # case: ((x).) => CLOSE_BL | |
| 1477 if ( @stmbrks | |
| 1478 && $stmbrks[-1]->[0] == $c_bp[0] | |
| 1479 && $stmbrks[-1]->[1] == $p_bp[0] | |
| 1480 && $stmbrks[-1]->[2] == $p_bp[1] ) { | |
| 1481 my @newbl = ( @c_bp, @p_bp ); | |
| 1482 push( @bls, \@newbl ); | |
| 1483 pop(@stmbrks); | |
| 1484 @c_up = (); | |
| 1485 print STDERR "TEST - closed BL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newbl ) ) if ($i_debug); | |
| 1486 | |
| 1487 # case: (x(x).), either CLOSE_ML or CLOSE_IL | |
| 1488 } else { | |
| 1489 | |
| 1490 # case '(.(x).)' => CLOSE_IL | |
| 1491 if ( @up | |
| 1492 && ( $up[-1]->[0] - 1 == $c_bp[0] ) | |
| 1493 && ( $up[-1]->[1] + 1 == $p_bp[0] ) ) { | |
| 1494 my @p_up = @{ pop(@up) }; | |
| 1495 my @newil = ( @c_bp, @p_bp ); | |
| 1496 push( @ils, \@newil ); | |
| 1497 @c_up = (); | |
| 1498 print STDERR "TEST - closed IL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newil ) ) if ($i_debug); | |
| 1499 | |
| 1500 # case (x(x)x(x).) => CLOSE_ML remember, array of tuples | |
| 1501 } elsif ( @c_mls && $c_mls[-1]->[-1]->[0] == $p_bp[0] ) { | |
| 1502 my @newml = @{ pop(@c_mls) }; | |
| 1503 push( @{ $newml[-1] }, $p_bp[1] ); # close final BP | |
| 1504 $newml[0]->[1] = $c_bp[1]; # close 1st BP | |
| 1505 push( @mls, \@newml ); | |
| 1506 @c_up = (); | |
| 1507 print STDERR "TEST - closed ML2:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newml, 1 ) ) if ($i_debug); | |
| 1508 } else { | |
| 1509 die "There should be no such case!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ); | |
| 1510 } | |
| 1511 } | |
| 1512 | |
| 1513 # there should be no more unknown unpaired regions left! | |
| 1514 } else { | |
| 1515 die "There shouldn't be any unknown unpaired " . "regions left\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ); | |
| 1516 } | |
| 1517 | |
| 1518 } else { | |
| 1519 die "ERROR: '()' is not allowed in $curr_shrep->[0]\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ); | |
| 1520 } | |
| 1521 } | |
| 1522 } | |
| 1523 } | |
| 1524 } | |
| 1525 | |
| 1526 # EOS | |
| 1527 | |
| 1528 # CLOSE_STEM | |
| 1529 print STDERR "TEST - closed stem EOS:\n" . ( test( $curr_shrep->[0], ( $win_size + $curr_gi ), $curr_gi, \@c_stem ) ) | |
| 1530 if ( @c_stem && $i_debug ); | |
| 1531 close_stem( \@c_stem, \%stems, \@p_bp, \@c_bp, \@p_open, \@stmbrks, \@open_blks, \%bps ) if (@c_stem); | |
| 1532 die "ERROR: there should not be an open ML!\n" . $curr_shrep . "\n" | |
| 1533 if (@c_mls); | |
| 1534 | |
| 1535 # case ().() CLOSE_EL | |
| 1536 if (@c_el) { | |
| 1537 if ( $c_el[-1] == $p_bp[0] ) { | |
| 1538 push( @c_el, $p_bp[1] ); | |
| 1539 my @newel = @c_el; | |
| 1540 push( @els, \@newel ); | |
| 1541 @c_el = (); | |
| 1542 | |
| 1543 print STDERR "TEST - closed EL4 EOS:\n" . ( test( $curr_shrep->[0], ( $win_size + $curr_gi ), $curr_gi, \@newel ) ) | |
| 1544 if ($i_debug); | |
| 1545 } else { | |
| 1546 die "ERROR: the base-pairs in the EL don't match" . $curr_shrep . "\n"; | |
| 1547 } | |
| 1548 } | |
| 1549 | |
| 1550 # case (). CLOSE_EL | |
| 1551 if (@c_up) { | |
| 1552 unless ($cue) { | |
| 1553 if (@p_bp) { | |
| 1554 push( @c_el, @p_bp ); | |
| 1555 } else { | |
| 1556 push( @c_el, ( $curr_gi, $curr_gi ) ); | |
| 1557 } | |
| 1558 | |
| 1559 push( @c_el, ( ( $curr_gi + $win_size - 1 ), ( $curr_gi + $win_size - 1 ) ) ); | |
| 1560 my @newel = @c_el; | |
| 1561 push( @els, \@newel ); | |
| 1562 @c_el = (); | |
| 1563 print STDERR "TEST - closed EL5 EOS:\n" . ( test( $curr_shrep->[0], ( $win_size + $curr_gi ), $curr_gi, \@newel ) ) | |
| 1564 if ($i_debug); | |
| 1565 ## check case where cue is given, but window is completely unstructured | |
| 1566 } else { | |
| 1567 unless (@p_bp) { | |
| 1568 my @newel = ( $curr_gi, $curr_gi, ( $curr_gi + $win_size - 1 ), ( $curr_gi + $win_size - 1 ) ); | |
| 1569 push( @els, \@newel ); | |
| 1570 } | |
| 1571 } | |
| 1572 } | |
| 1573 | |
| 1574 # # update current graph index (since all shrep indices have been read) | |
| 1575 $curr_gi += $win_size; | |
| 1576 | |
| 1577 # =================================================== | |
| 1578 # build graph lines | |
| 1579 my @graph_lines = (); | |
| 1580 | |
| 1581 # =================================================== | |
| 1582 | |
| 1583 # graph line objects | |
| 1584 my @edg = (); | |
| 1585 my @stackgraph = (); | |
| 1586 my @abstrgraph = (); | |
| 1587 | |
| 1588 #-------------------> stems,stacks | |
| 1589 # build basic graph edges, stacks, and vertices for stems in the abstract graph | |
| 1590 my @tmp = keys %stems; | |
| 1591 foreach my $sk (@tmp) { | |
| 1592 my @stem = @{ $stems{$sk} }; # stem (i,j,k,l)=> (i,j)= (0,1) and (k,l)=(2,3) | |
| 1593 die "ERROR: There are not enough elements in this stem: ", ( join( ",", @stem ) ), "\n" unless ( scalar(@stem) == 4 ); | |
| 1594 | |
| 1595 # infer base-pairs from stem | |
| 1596 @p_bp = (); | |
| 1597 @c_bp = (); | |
| 1598 my @stem_relation_idxs = (); | |
| 1599 my @stack_relation_idxs = (); | |
| 1600 my $stacksize = $stem[2] - $stem[0] + 1; | |
| 1601 for ( my $x = 0 ; $x < $stacksize ; $x++ ) { | |
| 1602 | |
| 1603 # for (my $i = $stem[0]; $i <= $stem[2] ; $i++){ | |
| 1604 # for(my $j = $stem[1]; $j >= $stem[3]; $j--){ | |
| 1605 my $i = $stem[0] + $x; | |
| 1606 my $j = $stem[1] - $x; | |
| 1607 @p_bp = @c_bp; | |
| 1608 @c_bp = ( $i, $j ); | |
| 1609 if ( defined $bps{"$i:$j"} ) { | |
| 1610 | |
| 1611 # add indices to stem relation object | |
| 1612 push( @stem_relation_idxs, $i ); | |
| 1613 push( @stem_relation_idxs, $j ); | |
| 1614 | |
| 1615 # add edge to graph edges | |
| 1616 push( @edg, "e $i $j s" ); | |
| 1617 | |
| 1618 # add stack if option given | |
| 1619 if ($stacks) { | |
| 1620 if (@p_bp) { | |
| 1621 push( @stackgraph, "v $curr_gi P" ); | |
| 1622 push( @stackgraph, "e $curr_gi $p_bp[0] p" ); | |
| 1623 push( @stackgraph, "e $curr_gi $p_bp[1] p" ); | |
| 1624 push( @stackgraph, "e $c_bp[0] $curr_gi p" ); | |
| 1625 push( @stackgraph, "e $c_bp[1] $curr_gi p" ); | |
| 1626 push( @stack_relation_idxs, $curr_gi ); | |
| 1627 | |
| 1628 # update graph index | |
| 1629 ++$curr_gi; | |
| 1630 } | |
| 1631 } | |
| 1632 } else { | |
| 1633 die "ERROR: There is no such base-pair in the base-pair hash:" . " ($i, $j)\n"; | |
| 1634 } | |
| 1635 } | |
| 1636 | |
| 1637 # } | |
| 1638 # } | |
| 1639 # add stem to abstract graph | |
| 1640 # (graph indices become a bit jumbled, but saves time) | |
| 1641 push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#S" ); | |
| 1642 | |
| 1643 # stem object now saves the curr_gi index for that stem | |
| 1644 $stems{$sk} = $curr_gi; | |
| 1645 ++$curr_gi; | |
| 1646 | |
| 1647 # add relations | |
| 1648 push( @abstrgraph, "v $curr_gi ^R" ); | |
| 1649 push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" ); | |
| 1650 foreach my $r (@stem_relation_idxs) { | |
| 1651 push( @abstrgraph, "e $curr_gi $r \@r" ); | |
| 1652 } | |
| 1653 if ($stacks) { | |
| 1654 foreach my $r (@stack_relation_idxs) { | |
| 1655 push( @abstrgraph, "e $curr_gi $r \@r" ); | |
| 1656 } | |
| 1657 } | |
| 1658 ++$curr_gi; | |
| 1659 } | |
| 1660 | |
| 1661 #-------------------> add HLs | |
| 1662 foreach my $hl (@hls) { | |
| 1663 | |
| 1664 # check size | |
| 1665 die "ERROR: the HL object is the incorrect size\n" unless ( @{$hl} == 2 ); | |
| 1666 | |
| 1667 # add vertex | |
| 1668 push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#HL" ); | |
| 1669 | |
| 1670 # add edge from stem to hl | |
| 1671 if ( defined( $stems{ $bps{"$hl->[0]:$hl->[1]"} } ) ) { | |
| 1672 push( @abstrgraph, "e " . $stems{ $bps{"$hl->[0]:$hl->[1]"} } . " $curr_gi n" ); | |
| 1673 } else { | |
| 1674 die "ERROR: stem not defined in base-pair hash for $hl->[0]:$hl->[1]\n"; | |
| 1675 } | |
| 1676 ++$curr_gi; | |
| 1677 | |
| 1678 # add relations | |
| 1679 push( @abstrgraph, "v $curr_gi ^R" ); | |
| 1680 push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" ); | |
| 1681 | |
| 1682 # add all nodes between i to j, inclusively | |
| 1683 for ( my $r = $hl->[0] ; $r <= $hl->[1] ; $r++ ) { | |
| 1684 push( @abstrgraph, "e $curr_gi $r \@r" ); | |
| 1685 } | |
| 1686 ++$curr_gi; | |
| 1687 } | |
| 1688 | |
| 1689 #------------------> add BLs | |
| 1690 foreach my $bl (@bls) { | |
| 1691 | |
| 1692 # check size | |
| 1693 die "ERROR: the BL object is the incorrect size\n" unless ( scalar( @{$bl} ) == 4 ); | |
| 1694 | |
| 1695 # add vertex | |
| 1696 push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#BL" ); | |
| 1697 | |
| 1698 # add edges from bl to both adjacent stems | |
| 1699 if ( defined( $stems{ $bps{"$bl->[0]:$bl->[1]"} } ) | |
| 1700 && defined( $stems{ $bps{"$bl->[2]:$bl->[3]"} } ) ) { | |
| 1701 | |
| 1702 # add outer stem: S->BL | |
| 1703 push( @abstrgraph, "e " . $stems{ $bps{"$bl->[0]:$bl->[1]"} } . " $curr_gi n" ); | |
| 1704 | |
| 1705 # add inner stem: BL->S | |
| 1706 push( @abstrgraph, "e $curr_gi " . $stems{ $bps{"$bl->[2]:$bl->[3]"} } . " n" ); | |
| 1707 } else { | |
| 1708 die "ERROR: stems not defined for BL\n"; | |
| 1709 } | |
| 1710 ++$curr_gi; | |
| 1711 | |
| 1712 # add relations | |
| 1713 push( @abstrgraph, "v $curr_gi ^R" ); | |
| 1714 push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" ); | |
| 1715 | |
| 1716 # add all nodes between i to k and l to j, inclusively | |
| 1717 for ( my $r = $bl->[0] ; $r <= $bl->[2] ; $r++ ) { | |
| 1718 push( @abstrgraph, "e $curr_gi $r \@r" ); | |
| 1719 } | |
| 1720 for ( my $r = $bl->[3] ; $r <= $bl->[1] ; $r++ ) { | |
| 1721 push( @abstrgraph, "e $curr_gi $r \@r" ); | |
| 1722 } | |
| 1723 ++$curr_gi; | |
| 1724 } | |
| 1725 | |
| 1726 #------------------> add ILs | |
| 1727 foreach my $il (@ils) { | |
| 1728 | |
| 1729 # check size | |
| 1730 die "ERROR: the IL object is the incorrect size\n" unless ( scalar( @{$il} ) == 4 ); | |
| 1731 | |
| 1732 # add vertex | |
| 1733 push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#IL" ); | |
| 1734 | |
| 1735 # add edges from il to both adjacent stems | |
| 1736 if ( defined( $stems{ $bps{"$il->[0]:$il->[1]"} } ) | |
| 1737 && defined( $stems{ $bps{"$il->[2]:$il->[3]"} } ) ) { | |
| 1738 | |
| 1739 # add outer stem: S->IL | |
| 1740 push( @abstrgraph, "e " . $stems{ $bps{"$il->[0]:$il->[1]"} } . " $curr_gi n" ); | |
| 1741 | |
| 1742 # add inner stem: IL->S | |
| 1743 push( @abstrgraph, "e $curr_gi " . $stems{ $bps{"$il->[2]:$il->[3]"} } . " n" ); | |
| 1744 } else { | |
| 1745 die "ERROR: stems not defined for IL\n"; | |
| 1746 } | |
| 1747 ++$curr_gi; | |
| 1748 | |
| 1749 # add relations | |
| 1750 push( @abstrgraph, "v $curr_gi ^R" ); | |
| 1751 push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" ); | |
| 1752 | |
| 1753 # add all nodes between i to k and l to j, inclusively | |
| 1754 for ( my $r = $il->[0] ; $r <= $il->[2] ; $r++ ) { | |
| 1755 push( @abstrgraph, "e $curr_gi $r \@r" ); | |
| 1756 } | |
| 1757 for ( my $r = $il->[3] ; $r <= $il->[1] ; $r++ ) { | |
| 1758 push( @abstrgraph, "e $curr_gi $r \@r" ); | |
| 1759 } | |
| 1760 ++$curr_gi; | |
| 1761 } | |
| 1762 | |
| 1763 #------------------> add ELs | |
| 1764 foreach my $el (@els) { | |
| 1765 | |
| 1766 #check size | |
| 1767 die "ERROR: the EL object is the incorrect size\n" unless ( scalar( @{$el} ) == 4 ); | |
| 1768 | |
| 1769 # add vertex | |
| 1770 push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#EL" ); | |
| 1771 | |
| 1772 # add edges from el to both adjacent stems, if available | |
| 1773 unless ( $el->[0] == $el->[1] ) { | |
| 1774 if ( defined( $stems{ $bps{"$el->[0]:$el->[1]"} } ) ) { | |
| 1775 push( @abstrgraph, "e " . $stems{ $bps{"$el->[0]:$el->[1]"} } . " $curr_gi n" ); | |
| 1776 } else { | |
| 1777 die "ERROR: stems not defined for left BP of EL\n"; | |
| 1778 } | |
| 1779 } | |
| 1780 unless ( $el->[2] == $el->[3] ) { | |
| 1781 if ( defined( $stems{ $bps{"$el->[2]:$el->[3]"} } ) ) { | |
| 1782 push( @abstrgraph, "e $curr_gi " . $stems{ $bps{"$el->[2]:$el->[3]"} } . " n" ); | |
| 1783 } else { | |
| 1784 die "ERROR: stems not defined for right BP of EL\n"; | |
| 1785 } | |
| 1786 } | |
| 1787 ++$curr_gi; | |
| 1788 | |
| 1789 # add relations | |
| 1790 push( @abstrgraph, "v $curr_gi ^R" ); | |
| 1791 push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" ); | |
| 1792 | |
| 1793 # @r relation edges | |
| 1794 for ( my $r = $el->[1] ; $r <= $el->[2] ; $r++ ) { | |
| 1795 push( @abstrgraph, "e $curr_gi $r \@r" ); | |
| 1796 } | |
| 1797 | |
| 1798 # if BP, not EOS | |
| 1799 unless ( $el->[0] == $el->[1] ) { | |
| 1800 push( @abstrgraph, "e $curr_gi $el->[0] \@r" ); | |
| 1801 } | |
| 1802 | |
| 1803 # if BP, not EOS | |
| 1804 unless ( $el->[2] == $el->[3] ) { | |
| 1805 push( @abstrgraph, "e $curr_gi $el->[3] \@r" ); | |
| 1806 } | |
| 1807 ++$curr_gi; | |
| 1808 } | |
| 1809 | |
| 1810 #------------------> add MLs | |
| 1811 foreach my $ml (@mls) { | |
| 1812 | |
| 1813 #check size | |
| 1814 die "ERROR: the ML object is not big enough!\n" unless ( scalar( @{$ml} ) >= 3 ); | |
| 1815 | |
| 1816 # add vertex | |
| 1817 push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#ML" ); | |
| 1818 | |
| 1819 # add edge from outer stem to first BP | |
| 1820 my $closing_bp = shift( @{$ml} ); | |
| 1821 if ( defined( $stems{ $bps{"$closing_bp->[0]:$closing_bp->[1]"} } ) ) { | |
| 1822 push( @abstrgraph, "e " . $stems{ $bps{"$closing_bp->[0]:$closing_bp->[1]"} } . " $curr_gi n" ); | |
| 1823 } else { | |
| 1824 die "ERROR: ML is not defined for BP $closing_bp->[0]:$closing_bp->[1]\n"; | |
| 1825 } | |
| 1826 | |
| 1827 # add remaining stems | |
| 1828 foreach my $bp ( @{$ml} ) { | |
| 1829 if ( defined( $stems{ $bps{"$bp->[0]:$bp->[1]"} } ) ) { | |
| 1830 push( @abstrgraph, "e $curr_gi " . $stems{ $bps{"$bp->[0]:$bp->[1]"} } . " n" ); | |
| 1831 } else { | |
| 1832 die "ERROR: ML is not defined for BP $bp->[0]:$bp->[1]\n"; | |
| 1833 } | |
| 1834 } | |
| 1835 ++$curr_gi; | |
| 1836 | |
| 1837 # add relations | |
| 1838 push( @abstrgraph, "v $curr_gi ^R" ); | |
| 1839 push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" ); | |
| 1840 | |
| 1841 # add @r relation edges between i and k, where (i,j) is the first BP and (k,l) | |
| 1842 # the second BP | |
| 1843 my $pbp_aref = $closing_bp; | |
| 1844 my $cbp_aref = shift( @{$ml} ); | |
| 1845 for ( my $r = $pbp_aref->[0] ; $r <= $cbp_aref->[0] ; $r++ ) { | |
| 1846 push( @abstrgraph, "e $curr_gi $r \@r" ); | |
| 1847 } | |
| 1848 | |
| 1849 # rest of @r relation edges | |
| 1850 foreach my $bp ( @{$ml} ) { | |
| 1851 $pbp_aref = $cbp_aref; | |
| 1852 $cbp_aref = $bp; | |
| 1853 for ( my $r = $pbp_aref->[1] ; $r <= $cbp_aref->[0] ; $r++ ) { | |
| 1854 push( @abstrgraph, "e $curr_gi $r \@r" ); | |
| 1855 } | |
| 1856 } | |
| 1857 | |
| 1858 # add final stretch between last BP (m,n) and first closing base-pair (i,j) | |
| 1859 for ( my $r = $cbp_aref->[1] ; $r <= $closing_bp->[1] ; $r++ ) { | |
| 1860 push( @abstrgraph, "e $curr_gi $r \@r" ); | |
| 1861 } | |
| 1862 ++$curr_gi; | |
| 1863 } | |
| 1864 | |
| 1865 @graph_lines = ( @edg, @stackgraph, @abstrgraph ); | |
| 1866 return ( \@graph_lines, $curr_gi ); | |
| 1867 } | |
| 1868 | |
| 1869 # just to test the current shrep dot-bracket structure parsing | |
| 1870 sub test { | |
| 1871 my ( $shrep, $idx, $gi, $mark_aref, $isML ) = @_; | |
| 1872 | |
| 1873 my $result = ""; | |
| 1874 | |
| 1875 $result .= "$shrep\n"; | |
| 1876 my @a = split( "", $shrep ); | |
| 1877 for ( my $i = 0 ; $i <= @a ; $i++ ) { | |
| 1878 my $mark = 0; | |
| 1879 foreach my $j ( @{$mark_aref} ) { | |
| 1880 if ($isML) { | |
| 1881 $mark = 1 if ( $i == $j->[0] - $gi ); | |
| 1882 $mark = 1 if ( scalar( @{$j} == 2 && $i == $j->[1] - $gi ) ); | |
| 1883 } else { | |
| 1884 $mark = 1 if ( $i == $j - $gi ); | |
| 1885 } | |
| 1886 | |
| 1887 } | |
| 1888 if ( $i == $idx - $gi ) { | |
| 1889 $result .= "^"; | |
| 1890 } elsif ($mark) { | |
| 1891 $result .= "'"; | |
| 1892 } else { | |
| 1893 $result .= " "; | |
| 1894 } | |
| 1895 } | |
| 1896 $result .= "\n"; | |
| 1897 | |
| 1898 return $result; | |
| 1899 } | |
| 1900 | |
| 1901 ########################################################################### | |
| 1902 # Extends or closes a multi-loop. This means a ML already exists. | |
| 1903 # We can have the case ((x)( or ((x).( | |
| 1904 # | |
| 1905 # Input: | |
| 1906 # curr_shrep The current shrep object, with shrep string at pos 0 | |
| 1907 # c_ml_aref The current open MLs array reference | |
| 1908 # p_bp_aref The previous base-pair array reference | |
| 1909 # stmbrks_aref The stem-breaks array reference | |
| 1910 # open_blks_aref The open bracket blocks array reference | |
| 1911 # idx The current index | |
| 1912 # #curr_gi The current graph index | |
| 1913 # Output: | |
| 1914 # None, it should just modify the ML accordingly | |
| 1915 ############################################################################ | |
| 1916 sub extend_or_open_nested_ML { | |
| 1917 my ( $curr_shrep, $c_mls_aref, $p_bp_aref, $up_aref, $stmbrks_aref, $open_blks_aref, $idx, $curr_gi ) = @_; | |
| 1918 | |
| 1919 # case '(x(x)x(x)x(' extend current ML | |
| 1920 if ( $c_mls_aref->[-1]->[-1]->[-1] == $p_bp_aref->[0] ) { | |
| 1921 push( @{ $c_mls_aref->[-1]->[-1] }, $p_bp_aref->[1] ); #close previous BP | |
| 1922 my @a = ($idx); | |
| 1923 push( @{ $c_mls_aref->[-1] }, \@a ); # add new open base-pair | |
| 1924 print STDERR "TEST - extended ML:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, $c_mls_aref->[-1], 1 ) ) if ($i_debug); | |
| 1925 | |
| 1926 # case '(x(x)x((x)(' nested ML, => OPEN_ML with stem-break | |
| 1927 } elsif ( @$stmbrks_aref | |
| 1928 && $stmbrks_aref->[-1]->[1] == $p_bp_aref->[0] | |
| 1929 && $stmbrks_aref->[-1]->[2] == $p_bp_aref->[1] ) { | |
| 1930 | |
| 1931 my @c_stmbrk = @{ pop( @{$stmbrks_aref} ) }; | |
| 1932 my @a = ( $c_stmbrk[0], -1 ); | |
| 1933 my @ml = (); | |
| 1934 push( @ml, \@a ); # 1st base-pair is in-waiting | |
| 1935 my @a2 = ( $c_stmbrk[1], $c_stmbrk[2] ); | |
| 1936 push( @ml, \@a2 ); #2nd BP | |
| 1937 my @a3 = ($idx); | |
| 1938 push( @ml, \@a3 ); #first of current BP | |
| 1939 push( @{$c_mls_aref}, \@ml ); | |
| 1940 print STDERR "TEST - opened nested ML with SB:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) ) if ($i_debug); | |
| 1941 | |
| 1942 # case '(x(x)x(.(x)(' nested ML, => OPEN_ML with UP | |
| 1943 } elsif ( @$up_aref | |
| 1944 && $up_aref->[-1]->[0] - 1 == $open_blks_aref->[-1]->[-1] | |
| 1945 && $up_aref->[-1]->[1] + 1 == $p_bp_aref->[0] ) { | |
| 1946 my @tmp_up = @{ pop( @{$up_aref} ) }; | |
| 1947 my @a = ( $open_blks_aref->[-1]->[-1], -1 ); | |
| 1948 my @ml = (); | |
| 1949 push( @ml, \@a ); # 1st BP awaits closing | |
| 1950 my @newbp = @{$p_bp_aref}; | |
| 1951 push( @ml, \@newbp ); #2nd BP | |
| 1952 my @a2 = ($idx); | |
| 1953 push( @ml, \@a2 ); # current open BP, awaits closing | |
| 1954 push( @{$c_mls_aref}, \@ml ); | |
| 1955 print STDERR "TEST - opened nested ML with prev UP\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) ) if ($i_debug); | |
| 1956 } else { | |
| 1957 die "ERROR: in the case extend or open nested ML, " . "but the base-pairs don't fit for either!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ) if ($i_debug); | |
| 1958 } | |
| 1959 } | |
| 1960 | |
| 1961 ########################################################################### | |
| 1962 # Closes a stem-object for the abstract graph. If necessary, it also | |
| 1963 # creates a stem-break because the current open block is still open, | |
| 1964 # which can only mean that there is a stem-break between two consecutive | |
| 1965 # opening brackets. | |
| 1966 # | |
| 1967 # Input: | |
| 1968 # c_stem_aref : current stem array reference | |
| 1969 # stems_href : all stems hash reference | |
| 1970 # p_bp_aref : previous base-pair array reference | |
| 1971 # c_cp_aref : current base-pair array reference | |
| 1972 # p_open_aref : previously opened open-bracket block | |
| 1973 # p_stmbrk_aref : the previous stem-break | |
| 1974 # open_blks_aref : all old open blocks array reference | |
| 1975 # bps_href : all base-pairs hash reference | |
| 1976 # | |
| 1977 # Output: | |
| 1978 # None, it should just modify the input variables accordingly | |
| 1979 ############################################################################ | |
| 1980 sub close_stem { | |
| 1981 my ( $c_stem_aref, $stems_href, $p_bp_aref, $c_bp_aref, $p_open_aref, $stmbrks_aref, $open_blks_aref, $bps_href ) = @_; | |
| 1982 | |
| 1983 my @newstem = @$c_stem_aref; | |
| 1984 my $stemkey = "$newstem[0]:$newstem[1],$newstem[2]:$newstem[3]"; | |
| 1985 $stems_href->{$stemkey} = \@newstem; | |
| 1986 | |
| 1987 # reset and clean running variables | |
| 1988 @$c_stem_aref = (); | |
| 1989 @$p_bp_aref = @$c_bp_aref; | |
| 1990 @$c_bp_aref = (); | |
| 1991 | |
| 1992 # add stem to base-pair hash | |
| 1993 $bps_href->{"$newstem[0]:$newstem[1]"} = $stemkey; | |
| 1994 $bps_href->{"$newstem[2]:$newstem[3]"} = $stemkey; | |
| 1995 | |
| 1996 # there are still bases open in the previous open block, '[[]x', | |
| 1997 # where [] is a closed stem and [ is an open block, | |
| 1998 # must create STEM-BREAK | |
| 1999 # this cannot be the case ((x)), because there p_open is empty | |
| 2000 if (@$p_open_aref) { | |
| 2001 | |
| 2002 # check: the next open base is adjacent to the previous BP | |
| 2003 if ( $p_open_aref->[-1] == $p_bp_aref->[0] - 1 ) { | |
| 2004 my @stmbrk = ( $p_open_aref->[-1], @$p_bp_aref ); | |
| 2005 push( @$stmbrks_aref, \@stmbrk ); | |
| 2006 | |
| 2007 # push the previous open block back onto stack | |
| 2008 my @newopen = @$p_open_aref; | |
| 2009 push( @$open_blks_aref, \@newopen ); | |
| 2010 @$p_open_aref = (); | |
| 2011 } else { | |
| 2012 | |
| 2013 # error, this should not be possible | |
| 2014 die("ERROR in close_stem(), [.[]x should not occur here\n"); | |
| 2015 } | |
| 2016 } | |
| 2017 } | |
| 2018 | |
| 2019 ########################################################################### | |
| 2020 # Here the information about the structure is added to the graph. In a | |
| 2021 # first step each base-pair is added by adding edges between the vertices | |
| 2022 # of the respective nucleotides. Then, if two base-pairs are stacked, | |
| 2023 # we add an extra vertex and connect the four nucleotide vertices that | |
| 2024 # are involved with extra edges (symbols P used for vertex and p for edges). | |
| 2025 # While adding new vertices we keep track of the current graph index. | |
| 2026 # | |
| 2027 # Input: | |
| 2028 # curr_shrep : the current shrep as dot-bracket structure | |
| 2029 # curr_gi : the current graph index | |
| 2030 # win_size : the window size | |
| 2031 # stacks : the input parameter -stacks (whether to add stacking information) | |
| 2032 # | |
| 2033 # Output: | |
| 2034 # The structure graph information line-by-line as an array reference and | |
| 2035 # the current graph index. | |
| 2036 ############################################################################ | |
| 2037 sub getStructGraph { | |
| 2038 my ( $curr_shrep, $curr_gi, $win_size, $stacks ) = @_; | |
| 2039 | |
| 2040 my @struct = split( "", $curr_shrep->[0] ); | |
| 2041 | |
| 2042 my @edg = (); | |
| 2043 my @starts = (); | |
| 2044 my @pairs = (); | |
| 2045 | |
| 2046 foreach my $idx ( 0 .. @struct - 1 ) { | |
| 2047 | |
| 2048 push( @starts, $idx ) if ( $struct[$idx] eq "(" ); | |
| 2049 | |
| 2050 if ( $struct[$idx] eq ")" ) { | |
| 2051 my $start = $curr_gi + pop(@starts); | |
| 2052 my $end = $curr_gi + $idx; | |
| 2053 push( @edg, "e " . $start . " " . $end . " s " ); | |
| 2054 my @pair = ( $start, $end ); | |
| 2055 push( @pairs, \@pair ); | |
| 2056 } | |
| 2057 } | |
| 2058 | |
| 2059 # update current graph index | |
| 2060 $curr_gi = $curr_gi + $win_size; | |
| 2061 | |
| 2062 # initiate structure lines array | |
| 2063 my @stacking_info = (); | |
| 2064 | |
| 2065 # add stacking information to graph unless input option tells us not to | |
| 2066 if ($stacks) { | |
| 2067 | |
| 2068 my $stacked_pairs = 0; | |
| 2069 | |
| 2070 # get stacked base-pairs (they are ordered according to | |
| 2071 # position of closing nucleotide) | |
| 2072 for ( my $i = 1 ; $i < @pairs ; $i++ ) { | |
| 2073 | |
| 2074 # add stacked base-pairs (vertices+edges) | |
| 2075 my $curr_bp_aref = $pairs[$i]; | |
| 2076 my $prev_bp_aref = $pairs[ $i - 1 ]; | |
| 2077 | |
| 2078 # if the current base-pair is stacked on the previous base-pair, | |
| 2079 # when curr_start = prev_start - 1 AND curr_end = prev_end + 1 | |
| 2080 if ( $curr_bp_aref->[0] == $prev_bp_aref->[0] - 1 | |
| 2081 && $curr_bp_aref->[1] == $prev_bp_aref->[1] + 1 ) { | |
| 2082 | |
| 2083 # add stacking vertex P | |
| 2084 push( @stacking_info, "v $curr_gi P" ); | |
| 2085 | |
| 2086 # add four edges from involved nucleotids | |
| 2087 push( @stacking_info, "e $curr_gi $prev_bp_aref->[0] p" ); | |
| 2088 push( @stacking_info, "e $curr_gi $prev_bp_aref->[1] p" ); | |
| 2089 push( @stacking_info, "e $curr_bp_aref->[0] $curr_gi p" ); | |
| 2090 push( @stacking_info, "e $curr_bp_aref->[1] $curr_gi p" ); | |
| 2091 ++$curr_gi; # add one to the index, ready for next vertex | |
| 2092 | |
| 2093 } | |
| 2094 } | |
| 2095 } | |
| 2096 | |
| 2097 my @str_graphlines = ( @edg, @stacking_info ); | |
| 2098 return ( \@str_graphlines, $curr_gi ); | |
| 2099 } | |
| 2100 | |
| 2101 ########################################################################### | |
| 2102 # This method gives us the overall graph header for this sequence. This | |
| 2103 # graph includes all window calculations and all shreps for all windows. | |
| 2104 # Input: | |
| 2105 # seq_id : the sequence ID | |
| 2106 # seq_head : the header information for the sequence | |
| 2107 # wins_aref : the windows sizes for folding | |
| 2108 # h_shift : input parameter -shift | |
| 2109 # h_e : input parameter -e | |
| 2110 # h_c : input parameter -c | |
| 2111 # h_t : input parameter -t | |
| 2112 # h_u : input parameter -u | |
| 2113 # h_r : input parameter -r | |
| 2114 # h_M : input parameter -M | |
| 2115 # h_crop_unpaired_ends : input parameter -cue | |
| 2116 # | |
| 2117 # Output: | |
| 2118 # The header line as a string | |
| 2119 ############################################################################ | |
| 2120 sub getGraphHeader { | |
| 2121 my ( $seq_id, $seq_head, $wins_aref, $h_shift, $h_e, $h_c, $h_t, $h_u, $h_r, $h_M, $h_crop_unpaired_ends, $h_i, $h_sample_len, $h_q, $h_Tp, $h_seqlen, $h_noStr ) = @_; | |
| 2122 | |
| 2123 my $ret; | |
| 2124 | |
| 2125 $ret = "t # SEQID $seq_id "; | |
| 2126 $ret .= "$seq_head " if ( defined $seq_head ); | |
| 2127 $ret .= "SEQLEN $h_seqlen "; | |
| 2128 | |
| 2129 return $ret . "\n" if ($h_noStr); | |
| 2130 | |
| 2131 $ret .= "MAXWINSHREPS $h_M "; | |
| 2132 $ret .= "CUE 1 " if ($h_crop_unpaired_ends); | |
| 2133 $ret .= "RNASHAPES -w " . ( join( ",", @{$wins_aref} ) ) . " "; | |
| 2134 $ret .= "-SHIFT%w $h_shift " if ( not $globalFolding and $h_shift ); | |
| 2135 $ret .= "-SHIFT 1 " if ( not $globalFolding and not $h_shift ); | |
| 2136 $ret .= "-e $h_e " if ( defined $h_e ); | |
| 2137 $ret .= "-c $h_c " if ( defined $h_c ); | |
| 2138 $ret .= "-t $h_t " if ( defined $h_t ); | |
| 2139 $ret .= "-u 1 " if ($h_u); | |
| 2140 $ret .= "-r 1 " if ($h_r); | |
| 2141 $ret .= "-i $h_i SAMPLE_LEN $h_sample_len" if ( not $h_q and $h_i ); | |
| 2142 $ret .= "-q 1 " if ($h_q); | |
| 2143 $ret .= "-T $h_Tp" if ( $h_q and defined $h_Tp ); | |
| 2144 $ret .= "\n"; | |
| 2145 | |
| 2146 return $ret; | |
| 2147 } | |
| 2148 | |
| 2149 ########################################################################### | |
| 2150 # This method gives us the window header for the subgraph that includes | |
| 2151 # all shreps for the current window. | |
| 2152 # Input: | |
| 2153 # graphHead : the header line for the entire sequence graph | |
| 2154 # win_size : the size of the current window | |
| 2155 # win_start : the starting position of the current window | |
| 2156 # win_end : the end position of the current window | |
| 2157 # win_center : the centre position of the current window | |
| 2158 # win_global_fold : (boolean) whether the actual number of nucleotides | |
| 2159 # is smaller than the given window size (end of sequence?) | |
| 2160 # | |
| 2161 # Output: | |
| 2162 # The header line as a string | |
| 2163 ############################################################################ | |
| 2164 sub getWindowHeader { | |
| 2165 my ( $graphHead, $win_size, $win_start, $win_end, $win_center, $win_global_fold, $win_sample, $used_t ) = @_; | |
| 2166 | |
| 2167 chomp $graphHead; | |
| 2168 | |
| 2169 $graphHead =~ s/t # //; | |
| 2170 | |
| 2171 my $ret = "w # $graphHead "; | |
| 2172 $ret .= "SHAPE_TYPE $used_t "; | |
| 2173 $ret .= "GLOBALFOLD $win_global_fold "; | |
| 2174 $ret .= "WSIZE $win_size "; | |
| 2175 $ret .= "WSTART $win_start "; | |
| 2176 $ret .= "WEND $win_end "; | |
| 2177 $ret .= "WCENT $win_center "; | |
| 2178 $ret .= "WSAMPLE $win_sample "; | |
| 2179 $ret .= "\n"; | |
| 2180 | |
| 2181 return $ret; | |
| 2182 } | |
| 2183 | |
| 2184 ########################################################################### | |
| 2185 # This method gives us the shape header that includes the single shrep | |
| 2186 # connected components. | |
| 2187 # Input: | |
| 2188 # winHead : the header for the current window | |
| 2189 # shrep : information about the current shrep | |
| 2190 # | |
| 2191 # Output: | |
| 2192 # The header line as a string | |
| 2193 ############################################################################ | |
| 2194 sub getShapeHeader { | |
| 2195 my ( $winHead, $shrep ) = @_; | |
| 2196 | |
| 2197 chomp $winHead; | |
| 2198 | |
| 2199 $winHead =~ s/w # //; | |
| 2200 $winHead =~ s/t # //; | |
| 2201 | |
| 2202 my $ret = "s # $winHead "; | |
| 2203 | |
| 2204 my @info = @{$shrep}; | |
| 2205 $ret .= join( " ", @info[ 1 .. $#info ] ); | |
| 2206 $ret .= "\n"; | |
| 2207 | |
| 2208 return $ret; | |
| 2209 } | |
| 2210 | |
| 2211 ########################################################################### | |
| 2212 # This method gives us the sequence header that includes the unstructured | |
| 2213 # sequence connected components. | |
| 2214 # Input: | |
| 2215 # winHead : the header for the current window | |
| 2216 # shrep : information about the current shrep | |
| 2217 # | |
| 2218 # Output: | |
| 2219 # The header line as a string | |
| 2220 ############################################################################ | |
| 2221 sub getSeqHeader { | |
| 2222 my ( $winHead, $shrep ) = @_; | |
| 2223 | |
| 2224 chomp $winHead; | |
| 2225 | |
| 2226 $winHead =~ s/w # //; | |
| 2227 $winHead =~ s/t # //; | |
| 2228 | |
| 2229 my $ret = "u # $winHead "; | |
| 2230 | |
| 2231 my @info = @{$shrep}; | |
| 2232 $ret .= join( " ", @info[ 1 .. $#info ] ); | |
| 2233 $ret .= "\n"; | |
| 2234 | |
| 2235 return $ret; | |
| 2236 } | |
| 2237 | |
| 2238 | |
| 2239 ################################################################################## | |
| 2240 # This method parses a fasta file and is useful if the header ID is not-unique!! | |
| 2241 # It returns the header lines in an array and the sequence lines in the same | |
| 2242 # order. It is then up to the user to extract the parts from the header that is | |
| 2243 # necessary for the script. | |
| 2244 # Furthermore, the method deals with multiple lines, and returns a single sequence | |
| 2245 # without line breaks. | |
| 2246 # Input: | |
| 2247 # file The name of the fasta file | |
| 2248 # Output: | |
| 2249 # (1) An array reference for each header line | |
| 2250 # (2) An array reference for each sequence in same order as the headers | |
| 2251 ################################################################################## | |
| 2252 sub read_fasta_with_nonunique_headers_meta { | |
| 2253 my ($file) = @_; | |
| 2254 my $FUNCTION = "read_fasta_file in Sequences.pm"; | |
| 2255 | |
| 2256 my $header = ""; | |
| 2257 my $seqstring = ""; | |
| 2258 my @headers = (); | |
| 2259 my @sequences = (); | |
| 2260 my @meta = (); | |
| 2261 my %seq_meta = (); | |
| 2262 open( IN_HANDLE, "<$file" ) || die "ERROR in $FUNCTION:\n" . "Couldn't open the following file in package Tool," . " sub read_fasta_file: $file\n"; | |
| 2263 | |
| 2264 while ( my $line = <IN_HANDLE> ) { | |
| 2265 chomp($line); | |
| 2266 | |
| 2267 # header (can contain one space after > symbol) | |
| 2268 if ( $line =~ /^\>(.*)/ ) { | |
| 2269 if ($header) { | |
| 2270 $seqstring =~ s/\s*//g; ## do not allow spaces in sequence | |
| 2271 push( @headers, $header ); | |
| 2272 push( @sequences, $seqstring ); | |
| 2273 push( @meta, +{%seq_meta} ); ## anonymous hash reference | |
| 2274 #print keys %seq_meta; | |
| 2275 $seqstring = ""; | |
| 2276 undef(%seq_meta); | |
| 2277 } | |
| 2278 $header = $1; | |
| 2279 | |
| 2280 } elsif ( $line =~ /(.+)\s+(#\S+)$/ && $header ) { | |
| 2281 | |
| 2282 if ( exists $seq_meta{$2} ) { | |
| 2283 $seq_meta{$2} .= $1; | |
| 2284 } else { | |
| 2285 $seq_meta{$2} = $1; | |
| 2286 } | |
| 2287 | |
| 2288 } elsif ($header) { | |
| 2289 $seqstring .= $line | |
| 2290 } | |
| 2291 } | |
| 2292 | |
| 2293 if ($header) { | |
| 2294 $seqstring =~ s/\s*//g; ## do not allow spaces in sequence | |
| 2295 push( @headers, $header ); | |
| 2296 push( @sequences, $seqstring ); | |
| 2297 push( @meta, +{%seq_meta} ); | |
| 2298 $seqstring = ""; | |
| 2299 %seq_meta = (); | |
| 2300 } | |
| 2301 return ( \@headers, \@sequences, \@meta ); | |
| 2302 } | |
| 2303 | |
| 2304 | |
| 2305 1; | |
| 2306 | |
| 2307 ############################################################################# | |
| 2308 # Programming description | |
| 2309 ############################################################################# | |
| 2310 # Substructure graphs for machine learning with Fabrizio | |
| 2311 # ------------------------------------------------------- | |
| 2312 # | |
| 2313 # (1) Parameters (RNAshapes parameter): | |
| 2314 # - Window sizes [] (-w) | |
| 2315 # - window shift size (-W) | |
| 2316 # - calculate structure probabilities (-r) | |
| 2317 # - energy range kcal/mol (-e) OR energy relative percentage (%) to MFE (-c) | |
| 2318 # - shape type 1 to 5 (-t) | |
| 2319 # - ignore unstable substructures (-u) | |
| 2320 # - max shreps | |
| 2321 # | |
| 2322 # | |
| 2323 # (2) For each sequence, generate one graph/file that consists of all windows. The general format for one graph is as follows: | |
| 2324 # | |
| 2325 # t # seq_id parameters | |
| 2326 # v graph_index nt_type window_size window_centre abs_seq_pos | |
| 2327 # ... | |
| 2328 # e m m+1 > 1 (backbone) | |
| 2329 # ... | |
| 2330 # e base_i_graph_index base_j_graph_index s shrep_e shrep_p ... | |
| 2331 # | |
| 2332 # For each window (subgraph) we create a subgraph (of the subgraph) for each substructure. | |
| 2333 # We have a running index (gi) for each subgraph. All vertex and edge indices of the subgraph add | |
| 2334 # the running graph index to the actual window position. For example | |
| 2335 # | |
| 2336 # | |
| 2337 # Sequence: AAACC CUUUG GG | |
| 2338 # 01234 56789 01 | |
| 2339 # | |
| 2340 # Window=10 substructure1 = (((...))). centre 5.5 | |
| 2341 # | |
| 2342 # v 0 A 10 5.5 0 | |
| 2343 # v 1 A 10 5.5 1 | |
| 2344 # v 2 A 10 5.5 2 | |
| 2345 # v 3 C 10 5.5 3 | |
| 2346 # v 4 C 10 5.5 4 | |
| 2347 # v 5 C 10 5.5 5 | |
| 2348 # v 6 U 10 5.5 6 | |
| 2349 # v 7 U 10 5.5 7 | |
| 2350 # v 8 U 10 5.5 8 | |
| 2351 # v 9 G 10 5.5 9 | |
| 2352 # e 0 1 > 1 | |
| 2353 # e 1 2 > 1 | |
| 2354 # e 2 3 > 1 | |
| 2355 # e 3 4 > 1 | |
| 2356 # e 4 5 > 1 | |
| 2357 # e 5 6 > 1 | |
| 2358 # e 6 7 > 1 | |
| 2359 # e 7 8 > 1 | |
| 2360 # e 8 9 > 1 | |
| 2361 # e 0 8 s -15.0 0.1223 | |
| 2362 # e 1 7 s -15.0 0.1223 | |
| 2363 # e 2 6 s -15.0 0.1223 | |
| 2364 # | |
| 2365 # gi = 9+1 = 10 | |
| 2366 # | |
| 2367 # Window = 10 substructure2 = .(((...))) centre 6.5 | |
| 2368 # v 10 A 10 6.5 2 | |
| 2369 # v 11 C 10 6.5 3 | |
| 2370 # v 12 C 10 6.5 4 | |
| 2371 # v 13 C 10 6.5 5 | |
| 2372 # v 14 U 10 6.5 6 | |
| 2373 # v 15 U 10 6.5 7 | |
| 2374 # v 16 U 10 6.5 8 | |
| 2375 # v 17 G 10 6.5 9 | |
| 2376 # v 18 G 10 6.5 10 | |
| 2377 # v 19 G 10 6.5 11 | |
| 2378 # e 10 11 > 1 | |
| 2379 # e 11 12 > 1 | |
| 2380 # e 12 13 > 1 | |
| 2381 # e 13 14 > 1 | |
| 2382 # e 14 15 > 1 | |
| 2383 # e 15 16 > 1 | |
| 2384 # e 16 17 > 1 | |
| 2385 # e 17 18 > 1 | |
| 2386 # e 18 19 > 1 | |
| 2387 # e 11 19 s -17.0 0.156 | |
| 2388 # e 12 18 s -17.0 0.156 | |
| 2389 # e 13 17 s -17.0 0.156 | |
| 2390 # | |
| 2391 # gi = 19+1 = 20 | |
| 2392 # | |
| 2393 # | |
| 2394 # | |
| 2395 # Write one perl script to create graphs for a set of sequences, fasta2shrep_gspan.pl. | |
| 2396 # | |
| 2397 # INPUT: | |
| 2398 # -f fasta file with all sequences to compute | |
| 2399 # parameters as above | |
| 2400 # | |
| 2401 # OUTPUT: | |
| 2402 # one file per sequence that contains graph called seq_id.gspan | |
| 2403 # | |
| 2404 # (1) for each window size call RNAshapes and write to a tmp file | |
| 2405 # (2) parse result of RNAshapes (catch RNAshapes error - sequence too long?) check for max shreps. | |
| 2406 # (3) convert RNAshapes result to subgraph -> write to file (readpipe) look at efficiency and errors | |
| 2407 # (4) repeat (1) to (3) for each sequence | |
| 2408 | |
| 2409 # ABSTRACT STRUCTURE GRAPH | |
| 2410 # To each shrep graph (s #) add an abstract structure graph with special | |
| 2411 # abstract relations. For example, to add the abstract structure graph to | |
| 2412 # the previous shrep structure with gi numbers from 10 to 19, we first | |
| 2413 # identify the abstract shape, i.e. EL-S-HL and then create nodes (labelled | |
| 2414 # with a given name-space followed the actual node name and separated by a | |
| 2415 # hash) and edges for this graph as follows: | |
| 2416 # NOTE: at the moment we add the adjacent base-pairs to the loop definitions | |
| 2417 # but these can be removed if necessary in the future | |
| 2418 # v 20 abstruct#EL | |
| 2419 # v 21 abstruct#S | |
| 2420 # v 22 abstruct#HL | |
| 2421 # e 20 21 n | |
| 2422 # e 21 22 n | |
| 2423 # v 23 ^R | |
| 2424 # e 20 23 ^r | |
| 2425 # e 23 10 @r | |
| 2426 # e 23 11 @r | |
| 2427 # e 23 19 @r | |
| 2428 # v 24 ^R | |
| 2429 # e 21 24 ^r | |
| 2430 # e 24 11 @r | |
| 2431 # e 24 12 @r | |
| 2432 # e 24 13 @r | |
| 2433 # e 24 17 @r | |
| 2434 # e 24 18 @r | |
| 2435 # e 24 19 @r | |
| 2436 # v 25 ^R | |
| 2437 # e 22 25 ^r | |
| 2438 # e 25 13 @r | |
| 2439 # e 25 14 @r | |
| 2440 # e 25 15 @r | |
| 2441 # e 25 16 @r | |
| 2442 # e 25 17 @r | |
| 2443 | |
| 2444 # gi = 25+1 = 26 | |
| 2445 | |
| 2446 # ANOTATION FILE | |
| 2447 # This is a file that labels a sequence region with a given annotation. In one | |
| 2448 # file we can have annotations within different name-spaces, for example target | |
| 2449 # sites predicted with different tools. | |
| 2450 # File format is a tab-delimited file with the following columns: | |
| 2451 # SEQID (same ID as in the fasta file header) | |
| 2452 # i (left positon of region) | |
| 2453 # j (right position of region, if one position i=j) | |
| 2454 # NAMESPACE#LABEL (give each annotation type one name-space and choose a label depending on the task) | |
| 2455 | |
| 2456 # E.g we have 2 different miRNAs and we predict the target-sites with (1) IntaRNA and (2) TargetSearch, | |
| 2457 # then this could be IntaRNA#miR1, IntaRNA#miR2, TargetSearch#miR1, and TargetSearch#miR2. | |
| 2458 # All labels can be used more than once for one or more sequences. | |
| 2459 # all labels with the same namespace and the same sequence ID are grouped into | |
| 2460 # one abstract graph, according to the order of i. | |
| 2461 # Example | |
| 2462 # SEQID i j NAMESPACE#LABEL | |
| 2463 # s1 10 20 IntaRNA#miR1 | |
| 2464 # s1 54 60 IntaRNA#miR2 | |
| 2465 # s1 15 25 TargetSearch#miR1 | |
| 2466 # s1 54 60 TargetSearch#miR2 | |
| 2467 # s2 ... | |
| 2468 | |
| 2469 # We create connected component per sequence ID per name-space, | |
| 2470 # so that all labels with | |
| 2471 # the same namespace are grouped together (for one sequence) as follows: | |
| 2472 # v 26 IntaRNA#miR1 | |
| 2473 # v 27 IntaRNA#miR2 | |
| 2474 # e 26 27 n | |
| 2475 # then create R nodes and ^r and @r relations as before (from i to j) | |
| 2476 | |
| 2477 # If a sequence graph option is given, then these graphs (u #) have to | |
| 2478 # be labelled with this annotation | |
| 2479 |
