sequence2gspan: fasta2shrep_gspan.pl comparison

comparison fasta2shrep_gspan.pl @ 0:b01beb170290 draft default tip

Uploaded

author	bgruening
date	Tue, 29 Oct 2013 11:10:19 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:b01beb170290
+#!/usr/bin/perl
+#use feature ':5.10';
+use File::Basename;
+use lib dirname($0);    # search skript directory for modules
+use strict 'vars';
+use warnings;
+use Getopt::Long;
+use Pod::Usage;
+#use List::Util qw/ min max /;
+#use StructureLibrary::RNAtoolsConfig;
+#use StructureLibrary::Sequence;
+use Cwd qw(getcwd abs_path);
+use File::Temp qw(tempdir);
+use File::Copy;
+use POSIX qw(ceil);
+use FindBin;
+use lib "$FindBin::Bin";
+#use GraphClust;
+#use vars qw(%CONFIG);
+#*CONFIG = \%GraphClust::CONFIG;
+=head1 NAME
+fasta2shrep_gspan.pl -fasta mysequences.fasta -wins "50,100,150,200" -shift 5 -M 8
+=head1 SYNOPSIS
+Options:
+		HELP
+-help   brief help message
+-man    full documentation
+COMPULSORY
+-fasta	<STRING> e.g. "sequence.fasta"
+		All sequences in fasta format.
+	OPTIONS
+-wins		[INTEGER] e.g. "50,100,200"
+		A list of window sizes to use.
+		If none are given (empty string ''), then the entire sequence is
+		taken with no windows. Each window > 1 required!
+-shift		<INTEGER> e.g. 20
+		The shift of the window, relative to the window size given in
+		percent. So you give which percent of the window size shall be
+		used for the shift. Of course the shift is rounded down to the
+		nearest whole number.
+		Example 20 % of a window 150 would result in a step size of 30 nt.
+		It is a relative parameter, as you can give different window sizes.
+		If you do not give this parameter there is a default shift of 1 nt.
+-cue		Crop unpaired ends.
+		If you give this flag, then the unpaired ends of each
+		single structure are ignored. E.g. the structure
+		...(((...))).. becomes just (((...)))
+-stack		Adds stacking information to graphs. This adds an additional
+		vertex (type P) for each pair of stacked base-pairs and four edges
+		(type p) from each of the involved bases to the new vertex.
+-e		<FLOAT> e.g. 5.0
+		Energy range in kcal/mol (RNAshapes)
+		Use only one of -e and -c!
+-c		<INTEGER> e.g. 10
+		Relative energy range, i.e. percentage (%) of MFE energy (RNAshapes)
+		Use only one of -e and -c!
+-t		<INTEGER> [1-5] e.g. 3 OR "3=0,4=100,5=200"
+		The shape type (RNAshapes). Default is 3.
+		With the list format, the shape level can be changed for different window length
+		"4=100" means that shape level 4 is used from length 100nt (window length)
+		The first given length has to be 0! Levels do not have to be contiguous!
+-M		<INTEGER> e.g. 10
+		Max number of shreps that should be taken per window.
+-u 		Ignore unstable structures (RNAshapes).
+		This option filters out closed structures with positive free energy.
+-r		Calculate structure probabilities for shreps (RNAshapes)
+-i		<INT> e.g. 10
+		Turn on structure sampling and gives number of sampling iterations.
+		Default no sampling (i=0)
+-sample-len	<INT> e.g. 100
+		Only in sampling mode: Sampling is only used for seqs/windows >= given length,
+		Default: sample all lengths (0), if -i > 0
+-q		Turn on shape probabilities for RNAshapes, no sampling mode allowed
+-Tp		<FLOAT> e.g 0.001
+		Filter cutoff for shape probabilities, applied before -M filter!
+-seq-graph-win	add for each window a graph which contains no structure
+-seq-graph-t	add for each 't #' a graph which contains no structure
+-seq-graph-alph change the alphabet of unstructured graphs
+-annotate		<STRING> annotation.tab
+				A file with annotations to be added as abstract graphs
+				on the sequence level (if given) and on the structure
+				(SHREP) level. The format has the following TAB-delimited
+				columns: SEQID, START, END, NAMESPACE#LABEL.
+				Labels with the same name-space and SEQID form connected
+				components, which is a sequence of label vertices ordered
+				by the START position in the sequence.
+-abstr			Add abstract structure graphs to the single shrep graph
+				instances.
+-nostr			Calculate no structures, only add sequence information,
+				if this is given, then -seq-graph-win AND/OR -seq-graph-t
+				are required.
+-match-shape    <SHAPE>
+all seqs/windows will be constraint folded into that shape via
+RNAshapes (if structure is given in another way this struct will be kept),
+if this shape is not possible within given energy range, produce a
+specific t graph with only one vertex 'X'. By this the instance
+becomes very dissimilar to all other graphs (for knn)
+-vp     enable graph computation with viewpoints:
+svmsgdnspdk will center on those nucleotides that are given
+via capital letters and ignore those given as lowercase letters
+-tmp		<STRING> e.g. "/scratch/1/sita/tmp"
+		A directory for writing temporary files
+-o		<STRING> e.g. "ProjectX/MySequences/GSPAN/"
+		Output directory for gspan files containing graphs.
+-group		<INTEGER> e.g. 5
+Combine/group that number of input seqs into 1 gspan file
+output name is then '<INT>.group.gspan.bz2'
+-sge            Use SGE cluster for each sequence separately
+-sge-logdir     stdout directory for SGE call
+-sge-errdir     stderr directory for SGE call
+-stdout         send graphs to stdout instead of writing to files
+-ignore-header  don't write fasta id part after first space to gspan
+-debug          additional debug output
+DEFAULT VALUES
+-wins	""
+-shift	1 nt
+-c		10
+-t		3
+-M		0 # selects all shreps
+-tmp    "/var/tmp/fasta2shrep"
+-o		"CURRENT_DIR/GSPAN/"
+SGE mode
+-sge-logdir "CURRENT_DIR/GSPAN/SGE_log"
+-sge-errdir "CURRENT_DIR/GSPAN/SGE_log"
+-task-id <NUM>
+=head1 DESCRIPTION
+=cut
+###############################################################################
+# end handler for temporary directory
+# adds an error handler that deletes the directory in case of error
+# SIGUSR{1/2} are sent by the sge prior to the uncatchable SIGKILL if the
+# option -notify was set
+###############################################################################
+# route every catchable termination signal to the same cleanup routine
+foreach my $signame (qw(INT TERM ABRT USR1 USR2)) {
+    $SIG{$signame} = 'end_handler';
+}
+# Signal handler: reports the caught signal on STDERR, removes the temporary
+# files registered with File::Temp and terminates the script via die().
+# Input:
+# the name of the caught signal (passed in by perl)
+sub end_handler {
+    my ($caught) = @_;
+    print STDERR "signal ", $caught, " caught, cleaning up temporary files\n";
+    # leave the current working directory first (chdir without argument
+    # changes into the home directory); deleting the temporary directory
+    # fails if it is the current working directory
+    chdir();
+    File::Temp::cleanup();
+    die();
+}
+###############################################################################
+# PARSE COMMAND LINE OPTIONS
+###############################################################################
+# command line options
+# One lexical per command line option (prefix "i_" = input). All of them stay
+# undefined unless the option is given; defaults are applied further below.
+my ( $i_help, $i_man, $i_debug, $i_fas, $i_wins, $i_shift, $i_crop_unpaired_ends,
+$i_r, $i_e, $i_c, $i_t, $i_u, $i_M, $i_o, $i_sge, $i_jobid, $i_tmp,
+$i_ignore_seq_header, $i_stacks, $i_stdout, $i_q, $i_T, $i_i,
+$i_sample_min_length, $i_sge_logDir, $i_sge_errDir, $i_groupsize, $i_annotate,
+$i_abstr, $i_no_structure, $i_vp, $i_matchShape, $i_rnashapes_binpath );
+my ( $i_add_seq_graph_win, $i_add_seq_graph_t, $i_change_seq_graph_alph );
+# Option table: "name" = boolean flag, "=s" string, "=i" integer, "=f" float.
+# $options receives the return value of GetOptions and is checked right after.
+my $options = GetOptions(
+"help"            => \$i_help,
+"man"             => \$i_man,
+"debug"           => \$i_debug,
+"fasta=s"         => \$i_fas,
+"wins=s"          => \$i_wins,
+"shift=f"         => \$i_shift,
+"cue"             => \$i_crop_unpaired_ends,
+"stack"           => \$i_stacks,
+"r"               => \$i_r,
+"e=f"             => \$i_e,
+"c=i"             => \$i_c,
+"t=s"             => \$i_t,
+"u"               => \$i_u,
+"M=i"             => \$i_M,
+"tmp=s"           => \$i_tmp,
+"o=s"             => \$i_o,
+"i=i"             => \$i_i,
+"sample-len=i"    => \$i_sample_min_length,
+"q"               => \$i_q,
+"Tp=f"            => \$i_T,
+"seq-graph-win"   => \$i_add_seq_graph_win,
+"seq-graph-t"     => \$i_add_seq_graph_t,
+# NOTE(review): this is the only spec written with a leading '-'; Getopt::Long
+# presumably strips such a prefix from option specs - confirm, or drop the '-'
+"-seq-graph-alph" => \$i_change_seq_graph_alph,
+"sge"             => \$i_sge,
+"task-id=i"       => \$i_jobid,
+"ignore-header"   => \$i_ignore_seq_header,
+"stdout"          => \$i_stdout,
+"sge-logdir=s"    => \$i_sge_logDir,
+"sge-errdir=s"    => \$i_sge_errDir,
+"group=i"         => \$i_groupsize,
+"annotate=s"      => \$i_annotate,
+"abstr"           => \$i_abstr,
+"nostr"           => \$i_no_structure,
+"vp"              => \$i_vp,
+"match-shape=s"   => \$i_matchShape,
+"rnashapes-bin=s" => \$i_rnashapes_binpath,
+);
+# -help and -man are answered first, even if option parsing reported errors
+pod2usage( -exitstatus => 1, -verbose => 1 ) if $i_help;
+pod2usage( -exitstatus => 0, -verbose => 2 ) if $i_man;
+# a false return value of GetOptions means the command line was not valid
+($options) or pod2usage(2);
+# check compulsory options
+($i_fas) or pod2usage("Error: the option -fasta is compulsory!\n");
+( -e $i_fas ) or pod2usage("Error: no such file - $i_fas!\n");
+# from here on the fasta file is referenced by its absolute path
+$i_fas = abs_path($i_fas);
+# check other options and set default values
+pod2usage("Error: either -e OR -c can be given, but NOT both!\n") if ( $i_e && $i_c );
+( $i_e or $i_c ) or $i_c = 10;    # set -c 10, if neither -e or -c are given
+($i_M) or $i_M = 0;    # max number of shreps is 0 (=means take all computed)
+($i_i) or $i_i = 0;    # default no sampling else sampling iterations
+($i_sample_min_length) or $i_sample_min_length = 0;
+# -sample-len only makes sense together with sampling iterations (-i)
+pod2usage("\nError: use --sample-len <INT> only with -i <1..INT> !\n") if ( not $i_i and $i_sample_min_length );
+($i_T) or $i_T = 0;
+($i_q) or $i_q = 0;
+pod2usage("\nError: Sampling (-i) not possible with shape probabilities (-q)!\n") if ( $i_i and $i_q );
+# turn the undefined flags into 0 so that they can be used without warnings
+($i_add_seq_graph_win)     or $i_add_seq_graph_win     = 0;
+($i_add_seq_graph_t)       or $i_add_seq_graph_t       = 0;
+($i_change_seq_graph_alph) or $i_change_seq_graph_alph = 0;
+($i_no_structure)          or $i_no_structure          = 0;
+# a changed alphabet is only meaningful if sequence graphs are written at all
+if ($i_change_seq_graph_alph) {
+($i_add_seq_graph_t) or ($i_add_seq_graph_win) or pod2usage( "Error: " .
+"When giving the parameter -seq-graph-alph, then either -seq-graph-t or -seq-graph-win" .
+" must also be given!\n" );
+}
+# the annotation file is optional, but if it is given it has to exist
+( -e $i_annotate ) or pod2usage("Error: no such file - $i_annotate!\n")
+if ($i_annotate);
+# -nostr switches on the sequence graph per 't #' (-seq-graph-t)
+$i_add_seq_graph_t = 1 if ($i_no_structure);
+($i_t) or $i_t = 3;    # default abstraction type is 3
+# Parse the list form of -t ("3=0,4=100,5=200"). A plain integer leaves
+# $change_shape_level at 0 and $i_t is used as the shape level directly.
+my $change_shape_level = 0;
+my @level_lens = ( -1, -1, -1, -1, -1 ); ## array_idx-1=shape level, value=start length of this level
+if ( $i_t !~ /^\d+$/ ) {
+my @t_minlens = split( ",", $i_t );    ## -t "3=100,4=200"
+foreach my $idx ( 1 .. @t_minlens ) {
+my $level = $t_minlens[ $idx - 1 ];    ##  $level = "3=100"
+die "$level Wrong -t format! Example: -t 3=0,4=100,5=200\n" if ( $level !~ /^\d+\=\d+$/ );
+my @lev_len = split( "=", $level );    ##  $level = "3=100"
+die "Wrong -t format! First level given needs to be length 0! Example: -t 3=0,4=100,5=200\n" if ( $idx == 1 && $lev_len[1] != 0 );
+die "Wrong -t format! Only level 1-5 allowed! Example: -t 3=0,4=100,5=200\n" if ( $lev_len[0] < 1 or $lev_len[0] > 5 );
+die "Wrong -t format! Length >= 0 expected! Example: -t 3=0,4=100,5=200\n" if ( $lev_len[1] < 0 );
+$change_shape_level = 1;
+## remember the minimum window length from which this level is used;
+## levels that are not given keep the marker -1
+$level_lens[ $lev_len[0] - 1 ] = $lev_len[1];
+}
+($i_debug) and print STDERR "i_t = $i_t - change $change_shape_level -  shape level lengths#" . join( ":", @level_lens ) . "#" . join( ":", @t_minlens ) . "#\n";
+}
+## checks for match-shape
+if ($i_matchShape){
+## only the characters '[', ']' and '_' are accepted in the shape string
+die "Please provide correct match shape string like '[]'! Exit...\n\n"
+if ($i_matchShape !~ /^[\[\]_]+$/);
+## RNAshapes produces anyway only 1 structure, no suboptimal structs in match-shape folding
+$i_M = 1;
+}
+my $CURRDIR = getcwd;
+# set up tmp directory
+# default tmp is /var/tmp, usually not NFS mounted!
+( defined $i_tmp ) or $i_tmp = '/var/tmp/';
+my $tmp_template = 'fasta2shrep-XXXXXX';
+# CLEANUP => 1 : automatically delete at exit
+$i_tmp = tempdir( $tmp_template, DIR => $i_tmp, CLEANUP => 1 );
+# create GSPAN directory when not printing to stdout
+if ( not $i_stdout ) {
+    # without -o the graphs are written to GSPAN/ below the current directory
+    $i_o = $CURRDIR . "/GSPAN/" if ( !$i_o );
+    if ( !-d $i_o ) {
+        # list form of system(): the path is never interpreted by a shell,
+        # so blanks or shell meta characters in the directory name are safe
+        system( "mkdir", "-p", $i_o ) == 0
+            or die "Cannot create output directory $i_o! Exit...\n\n";
+    }
+}
+###############################################################################
+# GLOBAL VARIABLES
+###############################################################################
+# Locate the RNAshapes binary:
+#  - a path given with -rnashapes-bin has to point to an existing file
+#  - without that option the binary is searched in $PATH via 'which'
+# (before, the file test was also applied to the bare default name
+# "RNAshapes", so the 'which' fallback could never be reached)
+my $rnashapes_loc;
+if ($i_rnashapes_binpath) {
+    die "Please provide full RNAshapes path (e.g. /usr/bin/RNAshapes) !\n" if ( !-f "$i_rnashapes_binpath" );
+    $rnashapes_loc = $i_rnashapes_binpath;
+} else {
+    my $loc = `which RNAshapes`;
+    $loc = '' if ( !defined $loc );
+    chomp($loc);
+    die "\nCannot find RNAshapes binary! Exit...\n\n" if ( !$loc );
+    die "\nCannot find RNAshapes binary! Exit...\n\n" if ( !-e $loc );
+    $rnashapes_loc = $loc;
+    # keep the option variable defined, as it was with the former default
+    $i_rnashapes_binpath = $loc;
+}
+# window sizes as given by -wins; an empty list means global folding
+my @WINDOWS = ();
+@WINDOWS = split( ",", $i_wins ) if ($i_wins);
+my $globalFolding;
+$globalFolding = 1 unless @WINDOWS;
+# getlogin returns nothing without a controlling terminal (e.g. in batch
+# jobs); fall back to the name that belongs to the real user id
+my $CURRUSER = getlogin || getpwuid($<);
+my $SEQNO    = 0;          # used to id the sequences
+my $GSPANNO  = 0;          # used for gspan filenames
+# minimum length of sequence that will work with RNAshapes
+# sequences with one or two nucleotides will be restricted to sequence-only graphs
+# this ensures that no sequences are skipped and external info kept synchronized
+my $GSPAN_SEQ_MINLEN = 3;
+# name spaces
+my $ABSTRUCT = "AS";
+###############################################################################
+# EXECUTION CODE
+###############################################################################
+# read fasta file into hash
+# read the fasta file: array refs with headers, sequences and meta information
+my ( $headers_aref, $sequences_aref, $metainfo_aref ) = read_fasta_with_nonunique_headers_meta($i_fas);
+# call script again on the sge cluster in a batch job
+if ($i_sge) {
+    my $SCRIPTNAME    = $0;
+    my $CLUSTERSUBMIT = $FindBin::Bin."/fasta2shrep_gspan.sge";
+    die "Cannot find SGE submit script $CLUSTERSUBMIT! Exit...\n\n" if ( !-e $CLUSTERSUBMIT );
+    # one array task per sequence, or per group of sequences if -group is given
+    my $SGE_jobs = @{$sequences_aref};
+    $SGE_jobs = ceil( @{$sequences_aref} / $i_groupsize ) if ($i_groupsize);
+    # Rebuild the command line for the batch jobs. Every fragment has to end
+    # with a blank, otherwise it is fused with the following option.
+    my $params = "-fasta $i_fas ";
+    $params .= "-wins $i_wins "                         if ($i_wins);
+    $params .= "-shift $i_shift "                       if ($i_shift);
+    $params .= "-e $i_e "                               if ($i_e);
+    $params .= "-c $i_c "                               if ($i_c);
+    $params .= "-t $i_t "                               if ($i_t);
+    $params .= "-u "                                    if ($i_u);
+    $params .= "-r "                                    if ($i_r);
+    $params .= "-M $i_M "                               if ($i_M);
+    $params .= "-o $i_o "                               if ($i_o);
+    $params .= "-i $i_i "                               if ($i_i);
+    $params .= "--sample-len $i_sample_min_length "     if ($i_sample_min_length);
+    $params .= "-q "                                    if ($i_q);
+    $params .= "-Tp $i_T "                              if ($i_T);
+    $params .= "--seq-graph-win "                       if ($i_add_seq_graph_win);
+    $params .= "--seq-graph-t "                         if ($i_add_seq_graph_t);
+    $params .= "--seq-graph-alph "                      if ($i_change_seq_graph_alph);
+    $params .= "-ignore-header "                        if ($i_ignore_seq_header);
+    $params .= "-cue "                                  if ($i_crop_unpaired_ends);
+    $params .= "-stack "                                if ($i_stacks);
+    $params .= "--group $i_groupsize "                  if ($i_groupsize);
+    $params .= "-annotate $i_annotate "                 if ($i_annotate);
+    $params .= "-abstr "                                if ($i_abstr);
+    $params .= "-nostr "                                if ($i_no_structure);
+    $params .= "--debug "                               if ($i_debug);
+    $params .= "-vp "                                   if ($i_vp);
+    $params .= "--match-shape $i_matchShape "           if ($i_matchShape);
+    # the jobs need the same RNAshapes binary as this process
+    $params .= "--rnashapes-bin $i_rnashapes_binpath "  if ($i_rnashapes_binpath);
+    print "used script:" . $SCRIPTNAME . "\n";
+    print "used submit script:" . $CLUSTERSUBMIT . "\n";
+    $i_sge_logDir = "$i_o/SGE_log" if ( !$i_sge_logDir );
+    ( -d $i_sge_logDir ) or mkdir($i_sge_logDir)
+        or warn "Cannot create SGE log directory $i_sge_logDir: $!\n";
+    $i_sge_errDir = $i_sge_logDir if ( !$i_sge_errDir );
+    ( -d $i_sge_errDir ) or mkdir($i_sge_errDir)
+        or warn "Cannot create SGE error directory $i_sge_errDir: $!\n";
+    my $ssh = 1;    ## can be used for debug shell script call
+    if ($ssh) {
+        # stdout of the jobs goes to the log directory, stderr to the error
+        # directory (before, both were sent to the error directory)
+        system( "ssh $CURRUSER\@biui.informatik.uni-freiburg.de "
+            . "'export SGE_ROOT=/opt/sge-6.0/; cd $CURRDIR; "
+            . "/opt/sge-6.0/bin/lx24-amd64/qsub -t 1-$SGE_jobs -o $i_sge_logDir/ -e $i_sge_errDir/ "
+            . "$CLUSTERSUBMIT $CURRDIR $SCRIPTNAME \"$params\" ' " );
+    } else {
+        system("$CLUSTERSUBMIT $CURRDIR $SCRIPTNAME '$params' ");
+    }
+    exit;
+}
+## compute shreps and gspan, either local or after SGE submission
+# TODO read and process annotations
+# headers, sequences and meta information that are processed by this call
+my @used_seq_headers;
+my @used_seqs;
+my @used_meta;
+my $group_idx;
+if ( !$i_jobid ) {
+    ## no task id: process all sequences at once
+    @used_seq_headers = @{$headers_aref};
+    @used_seqs        = @{$sequences_aref};
+    @used_meta        = @{$metainfo_aref};
+} else {
+    ## task id given: process only the slice of sequences that belongs to it;
+    ## if no group is given, make 1 seq per job = group=1
+    my $used_grouping = $i_groupsize ? $i_groupsize : 1;
+    ## 0-based index of the first and the last sequence of this job
+    my $first = ( $i_jobid - 1 ) * $used_grouping;
+    my $last  = $first + $used_grouping - 1;
+    $last = $#{$sequences_aref} if ( $last > $#{$sequences_aref} );
+    @used_seq_headers = @{$headers_aref}[ $first .. $last ];
+    @used_seqs        = @{$sequences_aref}[ $first .. $last ];
+    @used_meta        = @{$metainfo_aref}[ $first .. $last ];
+    $group_idx        = $i_jobid;
+    #($i_debug) and print STDERR "st $st end $end gr $group_idx job $i_jobid gs $i_groupsize\n";
+    ## without grouping the gspan file number follows the task id
+    $GSPANNO = $i_jobid - 1 if ( !$i_groupsize );
+}
+my $out;                   # handle the gspan graphs are written to
+my $gspanfile;             # name of the current gspan file (without .bz2)
+my $out_no_match_shape;    # handle for the ids of sequences without match shape
+if ( $i_matchShape && !$i_stdout && !$i_groupsize ) {
+    my $no_match_file = "$i_o/fasta2shrep.no_match";
+    # three-argument open: the file name is never parsed for a mode; a file
+    # that cannot be written is reported instead of being silently ignored
+    open( $out_no_match_shape, '>', $no_match_file )
+        or die "Cannot open file $no_match_file for writing: $!\n";
+}
+# Main loop: for each selected sequence write the graph header, the optional
+# sequence-only graph and the structure graphs of all windows.
+# The loop condition tests the truth of the sequence, so it also ends at the
+# first sequence that is false (e.g. an empty string).
+while ( my $seq = shift @used_seqs ) {
+my $tmp_header = shift @used_seq_headers;
+my $tmp_meta   = shift @used_meta;
+# split the header into the id (up to the first whitespace) and the rest
+my ( $seq_id, $seq_header ) = ( $tmp_header =~ /(\S+)\s*([\S*\s*]*)/ );
+$i_ignore_seq_header and $seq_header = '';
+my $seq_fasta = generate_single_fasta_from_sequence_X( $seq_id, $seq );
+my $seq_len = length($seq);
+# only print sequence graphs for sequences below this threshold
+my $no_structure_override = ($seq_len < $GSPAN_SEQ_MINLEN) ? 1 : 0;
+$GSPANNO++;
+# set outstream for gspan output to correct file/STDOUT
+if ($i_stdout) {
+$out = \*STDOUT;
+} elsif ( !$i_groupsize ) {
+# one bzip2 compressed file per sequence, written in the tmp directory first
+$gspanfile = $i_tmp . '/' . $GSPANNO . '.gspan';
+open( $out, "| bzip2 -f > $gspanfile.bz2" );
+} elsif ( ( $GSPANNO - 1 ) % $i_groupsize == 0 ) {
+# first sequence of a new group: finish the file of the previous group
+if ( $GSPANNO > 1 ) {
+close($out);
+system("mv $gspanfile.bz2 $i_o/$group_idx.group.gspan.bz2");
+if ($out_no_match_shape){
+close($out_no_match_shape);
+system("mv $gspanfile.no_match $i_o/$group_idx.group.gspan.no_match");
+}
+}
+if ( !$i_jobid ) {
+$group_idx = int( ( $GSPANNO - 1 ) / $i_groupsize ) + 1;
+}
+$gspanfile = "$i_tmp/$group_idx.group.gspan";
+open( $out, "| bzip2 -f > $gspanfile.bz2" );
+# NOTE(review): $out_no_match_shape is only opened above if -group is NOT
+# given, so in group mode this open looks unreachable - confirm
+open( $out_no_match_shape, ">$gspanfile.no_match" ) if ($out_no_match_shape);
+}
+## do not use folding windows in special cases
+## NOTE(review): this replaces the file-level @WINDOWS, so the replacement
+## also stays in effect for all following sequences - confirm this is intended
+if ( $globalFolding || $i_no_structure || $no_structure_override) {
+@WINDOWS = ();
+push( @WINDOWS, $seq_len );
+}
+##check win sizes for global folding
+## windows are sorted ascending; the list is cut after the first window that
+## already covers the whole sequence
+my @WINDOWS_used = sort { $a <=> $b } @WINDOWS;
+foreach my $w_idx ( 0 .. ( @WINDOWS_used - 1 ) ) {
+if ( $WINDOWS_used[$w_idx] >= $seq_len && $WINDOWS_used[$w_idx] > 1 ) {
+if ( $w_idx < $#WINDOWS_used ) {
+@WINDOWS_used = @WINDOWS_used[ 0 .. $w_idx ];
+last;
+}
+}
+}
+## use seq graph only if no shape folding wanted
+## NOTE(review): the flag is file-level and is never reset, so one short
+## sequence switches it on for all following sequences - confirm
+$i_add_seq_graph_t = 1 if ($i_no_structure || $no_structure_override);
+## no shape info in graphheader if we have fixed structure (according LocaRNA handling)
+## use tags #FS or #S for provided structure
+my $graph_header;
+my @struct_meta = grep { $_ =~ /#FS/ || $_ =~ /#S/ } keys %{$tmp_meta};
+if (@struct_meta) {
+$graph_header = getGraphHeader( $seq_id, $seq_header, \@WINDOWS, $i_shift, $i_e, $i_c, $i_t, $i_u, $i_r, $i_M, $i_crop_unpaired_ends, $i_i, $i_sample_min_length, $i_q, $i_T, $seq_len, 1 );
+} else {
+$graph_header = getGraphHeader( $seq_id, $seq_header, \@WINDOWS, $i_shift, $i_e, $i_c, $i_t, $i_u, $i_r, $i_M, $i_crop_unpaired_ends, $i_i, $i_sample_min_length, $i_q, $i_T, $seq_len, $i_no_structure || $no_structure_override );
+}
+print $out $graph_header;
+my $gi = 0;    # current graph index
+## add graph with no structure at all depending on $i_add_seq_graph_t
+if ($i_add_seq_graph_t) {
+($gi) = convertSeqWindow( $seq, $seq_len, 1, $gi, $graph_header, $out, $i_annotate, $seq );
+}
+## encode fixed structure only if provided and structures wanted in general
+if ( @struct_meta && !$i_no_structure && !$no_structure_override) {
+## the tag #FS is preferred, #S is used if #FS does not exist
+my $struct_meta = "#FS";
+$struct_meta = "#S" if ( !exists $tmp_meta->{$struct_meta} );
+my $seq_shrep = [ $tmp_meta->{$struct_meta}, "ENERGY", "0.00", "SHAPE", $struct_meta ];
+$gi = convertShapeWindow( [$seq_shrep], $seq, $seq_len, 1, $gi, $out,
+$graph_header, $i_annotate, $i_abstr, $i_crop_unpaired_ends, $i_stacks, $seq );
+@WINDOWS_used = ();    ## no shape folding if we have a fixed structure
+}
+## ignore RNAshapes folding if wanted, but do correct file move afterwards
+## (just "next" does not work due to output)
+@WINDOWS_used = () if ($i_no_structure or $no_structure_override);
+#for each window size in list
+foreach my $win_size (@WINDOWS_used) {
+# calculate shift size from percentage
+my $curr_shift = 1;
+if ($i_shift) {
+$curr_shift = ( $i_shift / 100 ) * $win_size;
+$curr_shift = int($curr_shift);                 #round down
+$curr_shift = 1 unless ($curr_shift);           # just in case it is 0
+}
+($i_debug) and print STDERR "winsize: $win_size curr_shift: $curr_shift\n";
+($i_debug) and print STDERR "\nNext: $seq_id\t winsize:$win_size \n";
+# choose current shape level, depending on $i_t
+# (list form: the highest level whose start length is <= window size wins)
+my $curr_t = 0;
+if ($change_shape_level) {
+for ( my $i = 0 ; $i < @level_lens ; $i++ ) {
+$curr_t = $i + 1 if ( $level_lens[$i] != -1 && ( $level_lens[$i] <= $win_size ) );
+}
+($i_debug) and print STDERR "$win_size curr type $curr_t\n";
+} else {
+$curr_t = $i_t;
+}
+my $rnashapesoutput_fh;
+# call RNAshapes and write to $rnashapesoutput_fh
+$rnashapesoutput_fh = call_RNAshapes( $seq_fasta, $rnashapes_loc, $win_size,
+$curr_shift, $i_e, $i_c, $curr_t, $i_u, $i_r, $i_q, $i_T, $i_i, $i_sample_min_length, $seq_len, $i_matchShape );
+# read RNAshapes output from $rnashapesoutput_fh and write subgraph
+# to gspan file
+my $gi_old = $gi;
+$gi = convert_RNAshapes_output( $rnashapesoutput_fh, $gi, $i_M, $out, $graph_header,
+$win_size, $seq_len, $curr_t, $i_annotate, $i_abstr, $i_crop_unpaired_ends, $i_stacks, $seq );
+## no (match) shape found at all for this seq
+## (convert_RNAshapes_output returns its graph index + 1, so an increase by
+## exactly 1 means that no graph was written); a one-vertex graph 'X' is
+## written instead and the sequence id is logged
+if ($gi == $gi_old+1){
+$gi = convertSeqWindow( "X", 1, 1, $gi, $graph_header, $out, $i_annotate, $seq );
+print $out_no_match_shape $seq_id."\n" if (!$i_stdout && $out_no_match_shape);
+}
+}    ## foreach WINDOW_used
+# single file mode: close the compressed file and move it to the output dir
+if ( !$i_stdout && !$i_groupsize ) {
+close($out);
+move "$gspanfile.bz2", "$i_o/$GSPANNO.gspan.bz2";
+}
+# the temporary single sequence fasta file is not needed any longer
+system("rm $seq_fasta");
+}    ## while @used_seqs
+# Finish the output after the last sequence.
+# Group mode: close the file of the last group and move it to the output
+# directory. This is skipped when graphs go to stdout or when no sequence was
+# processed, because then no group file exists ($gspanfile is undefined).
+if ( $i_groupsize && !$i_stdout && defined $gspanfile ) {
+    close($out);
+    move( "$gspanfile.bz2", "$i_o/$group_idx.group.gspan.bz2" )
+        or die "Cannot move $gspanfile.bz2 to $i_o/$group_idx.group.gspan.bz2: $!\n";
+    if ($out_no_match_shape) {
+        close($out_no_match_shape);
+        move "$gspanfile.no_match", "$i_o/$group_idx.group.gspan.no_match";
+    }
+} elsif ( $out_no_match_shape && !$i_stdout && !$i_groupsize ) {
+    # single file mode: only the list of sequences without match shape is open
+    close($out_no_match_shape);
+}
+###############################################################################
+# METHODS
+###############################################################################
+############################################################################
+# Generates fasta file for a single sequence (one-lined-fasta). This
+# fasta is stored in the temp directory and should be deleted at the end.
+# Input:
+# seq_id : the sequence ID
+# seq : the sequence
+#
+# Output:
+# The fasta file name
+############################################################################
+sub generate_single_fasta_from_sequence_X {
+    my ( $seq_id, $seq ) = @_;
+    # normalize to the RNA alphabet: uppercase, T -> U, everything else -> N
+    $seq = uc($seq);
+    $seq =~ tr/T/U/;
+    $seq =~ s/[^AUCGN]/N/g;
+    # file-level $SEQNO makes the file name unique within the tmp directory
+    my $outfas = $i_tmp . "/seq_" . $SEQNO++ . ".fasta";
+    # lexical handle and three-argument open; the host name is only needed
+    # for the error message, so it is looked up only in case of an error
+    open( my $fas_fh, '>', $outfas ) or do {
+        my $host = readpipe("hostname");
+        $host = '' if ( !defined $host );
+        chomp($host);
+        die "$host Cannot open file $outfas ($!)! Exit...\n\n";
+    };
+    print {$fas_fh} ">$seq_id\n$seq";
+    # write errors of buffered output only show up at close
+    close($fas_fh) or die "Cannot write file $outfas ($!)! Exit...\n\n";
+    return $outfas;
+}
+############################################################################
+# Builds the RNAshapes command line for one sequence and one window size
+# and starts RNAshapes with it.
+# Input:
+# seq_fasta : the sequence fasta file (sent to RNAshapes via stdin)
+# rnashapes_location : the RNAshapes binary
+# win_size : the current window size (-w)
+# shift : the window shift (-W), only used if the option -shift was given
+# e : the input parameter -e
+# c : the input parameter -c
+# t : the shape level (-t)
+# u : the input parameter -u
+# r : the input parameter -r
+# q : the input parameter -q
+# T : the input parameter -Tp (only used together with -q)
+# i : the input parameter -i (sampling iterations)
+# sample_length : the input parameter -sample-len
+# seqLen : the length of the full sequence
+# matchShape : the input parameter -match-shape (-m)
+#
+# Output:
+# A filehandle from which the output of RNAshapes can be read
+############################################################################
+sub call_RNAshapes {
+    my ( $seq_fasta, $rnashapes_location, $win_size, $shift, $e, $c, $t, $u, $r, $q, $T, $i, $sample_length, $seqLen, $matchShape ) = @_;
+    my $FUNCTION = "call_RNAshapes in fasta2shrep_gspan.pl";
+    die("INPUT ERROR in $FUNCTION: the fasta file is compulsory!\n") unless ($seq_fasta);
+    die( "INPUT ERROR in $FUNCTION: the RNAshapes location" . " is compulsory!\n" ) unless ($rnashapes_location);
+    die "$rnashapes_location does not exist! Exit...\n\n" unless ( -e $rnashapes_location );
+    # collect the fragments of the call, each one ends with a blank
+    my @call_parts = ( $rnashapes_location . " -o 1 " );    # the output format is of type 1
+    push( @call_parts, "-q " )           if ($q);
+    push( @call_parts, "-T $T " )        if ( $q and $T );
+    push( @call_parts, "-w $win_size " );
+    push( @call_parts, "-W $shift " )    if ($i_shift);
+    if ( $e && $c ) {
+        die("ERROR in $FUNCTION: Give only one of the options -c or -e (RNAshapes)!\n");
+    }
+    push( @call_parts, "-e $e " )        if ($e);
+    push( @call_parts, "-c $c " )        if ($c);
+    push( @call_parts, "-t $t " )        if ($t);
+    push( @call_parts, "-u " )           if ( $u and not $i );    ## not possible in sampling mode
+    push( @call_parts, "-r " )           if ($r);
+    push( @call_parts, "-m $matchShape " ) if ($matchShape);
+    ## sampling is used if the window is at least $sample_length long and
+    ## either the window fits into the sequence or the sequence itself is at
+    ## least $sample_length long
+    if ( !$q && $i ) {
+        if ( $win_size >= $sample_length and ( $win_size <= $seqLen or $seqLen >= $sample_length ) ) {
+            ## -A is to omit samples and print only combined shape probs
+            push( @call_parts, "-i $i -A " );
+        }
+    }
+    push( @call_parts, " < $seq_fasta" );
+    my $call = join( "", @call_parts );
+    ($i_debug) and print STDERR "$seqLen $sample_length $win_size $call\n";
+    # read the output of the call through a pipe
+    open( my $rnashapesoutput, "$call |" )
+        or die( "ERROR in $FUNCTION: The following call " . "could not be carried out!\n$call\n" );
+    return $rnashapesoutput;
+}
+############################################################################
+# The output of RNAshapes for one sequence and one window size is read
+# and converted into graph format.
+# Input:
+# rnashapeoutput : filehandle for RNAshapes output in format -o 1
+# curr_gi : current graph index (for vertices)
+# maxShreps : max number of shreps to convert to graphs
+# graph_file_hdl : the output handler for the graph file
+# graphHead : the header line for the complete graph (for sequence)
+# winSize : current window size in input for RNAshapes
+# seqLen  : full input seq length
+# used_t  :
+# annotate:
+# abstr   :
+# cue     :
+# stacks  :
+# orig_seq : the nucleotide sequence as read from fasta
+#
+# Output:
+# The current graph index
+############################################################################
+sub convert_RNAshapes_output {
+    my ( $rnashapesoutput, $curr_gi, $maxShreps, $graph_file_hdl, $graphHead, $winSize, $seqLen, $used_t, $annotate, $abstr, $cue, $stacks, $orig_seq ) = @_;
+    ## omit first line in output, contains fasta header line
+    my $line = <$rnashapesoutput>;
+    ## state of the window that is collected at the moment
+    my $win_shreps = [];
+    my ( $win_start, $win_end, $win_seq, $winHead, $win_globalFolding, $win_size_real );
+    my $win_shrep_count = 0;
+    my $win_sample      = 0;
+    ## Writes the graphs of the collected window (nothing without shreps).
+    ## Used whenever a new window starts and once more after the last line.
+    my $write_window = sub {
+        return if ( @{$win_shreps} == 0 );
+        print $graph_file_hdl $winHead;
+        ## remove SHAPE="_" shrep if it exists in $win_shreps
+        ## do this only when have a sequence graph already
+        if ($i_add_seq_graph_t) {
+            $win_shreps = [ grep { $_->[ $#{$_} ] ne "_" } @{$win_shreps} ];
+        }
+        ## add graph with no structure depending on $i_add_seq_graph_win to win
+        if ($i_add_seq_graph_win) {
+            ($curr_gi) = convertSeqWindow( $win_seq, $win_size_real, $win_start,
+                $curr_gi, $winHead, $graph_file_hdl, $annotate, $orig_seq );
+        }
+        $curr_gi = convertShapeWindow( $win_shreps, $win_seq, $win_size_real,
+            $win_start, $curr_gi, $graph_file_hdl, $winHead, $annotate, $abstr,
+            $cue, $stacks, $orig_seq );
+        return;
+    };
+    # reading RNAshapes output
+    while ( $line = <$rnashapesoutput> ) {
+        my $shrep;    # set if the line describes a structure (shrep)
+        if ( $line =~ /^(\d+)\s+(\d+)$/ ) {
+            ## line: "<start>    <end>"
+            my ( $start, $end ) = ( $1, $2 );
+            ## a new window begins, so the previous one is complete
+            $write_window->();
+            ## set new window params
+            $win_shreps        = [];
+            $win_start         = $start;
+            $win_end           = $end;
+            $win_shrep_count   = 0;
+            $win_size_real     = $win_end - $win_start + 1;
+            $win_globalFolding = ( $win_size_real >= $seqLen ) ? 1 : 0;
+            my $win_center = $win_start + ( ( $win_size_real + 1 ) / 2 );
+            $winHead = getWindowHeader( $graphHead, $winSize, $win_start, $win_end, $win_center, $win_globalFolding, $win_sample, $used_t );
+        } elsif ( $line =~ /^(\S+)$/ ) {
+            ## line: "CUUAUGAGUAAGGAAAAUAACGAUUCGGGGUGACGCCCGAAUCCUCACUG"
+            $win_seq = uc($1); ## to be 101% sure that we have by default uppercase chars
+        } elsif ( $line =~ /^Results for (\d+) iterations:$/ ) {
+            ## line: "Results for 10 iterations:"
+            $win_sample = $1;
+        } elsif ( $line =~ /^Shape\s+\S+\s+not found within energy range.*$/ ) {
+            ## line: "Shape [] not found within energy range (-24.75 to -27.50). Try -c or -e to increase range."
+            $win_shreps = [];
+        } elsif ( $line =~ /^([\(\)\.]+)\s+\((\S+)\)\s+(\S+)$/ ) {
+            ## line:"...((((..(((....)))))))...........(((((......)))))  (-10.10)  [[]][]"
+            $shrep = [ $1, "ENERGY", $2, "SHAPE", $3 ];
+        } elsif ( $line =~ /^([\(\)\.]+)\s+\((\S+)\)\s+\((\S+)\)\s+(\S+)$/ ) {
+            ## line:"((((..((((...((.((.((.....)).)).))...))))..))))...  (-10.60)  (0.7795360)  [[[[[]]]]]"
+            $shrep = [ $1, "ENERGY", $2, "PROB", $3, "SHAPE", $4 ];
+        } elsif ( $line =~ /^([\(\)\.]+)\s+\((\S+)\)\s+(\S+)\s+(\S+)$/ ) {
+            ## line:"((((..((((...((.((.((.....)).)).))...))))..))))...  (-10.60) 0.3000000 [[[[[]]]]]"
+            $shrep = [ $1, "ENERGY", $2, "SHAPEPROB", $3, "SHAPE", $4 ];
+        } elsif ( $line =~ /^([\(\)\.]+)\s+\((\S+)\)\s+\((\S+)\)\s+(\S+)\s+(\S+)$/ ) {
+            ## line:"((((..((((...((.((.((.....)).)).))...))))..))))...  (-10.60)  (0.7795360) 0.3000000 [[[[[]]]]]"
+            $shrep = [ $1, "ENERGY", $2, "PROB", $3, "SHAPEPROB", $4, "SHAPE", $5 ];
+        } else {
+            next if ( $line =~ /^$/ );
+            die "Unexpected shape output format!\nline=$line\n\nExit...\n\n";
+        }
+        ## take only $maxShreps shreps per window if set
+        if ($shrep) {
+            unless ( $maxShreps && $win_shrep_count >= $maxShreps ) {
+                push( @{$win_shreps}, $shrep );
+                $win_shrep_count++;
+            }
+        }
+    }
+    ## convert last windows
+    $write_window->();
+    close($rnashapesoutput);
+    return $curr_gi + 1;    # return the gi (graph index) for the next subgraph
+}
+############################################################################
+# Writes the graph of one window without any structure (sequence only).
+# Input:
+# win_seq : the nucleotide sequence for the current window
+# win_size_real : the current (true) window size
+# win_start : the starting position of win_seq in the original seq
+# curr_gi : the current graph index
+# winHead : the header line for the current sequence window
+# graph_file_hdl : the graph file handler
+# annotate : input parameter -annotate (not used yet, see TODO)
+# orig_seq : the nucleotide sequence as read from fasta
+#
+# Output:
+# The graph index increased by the window size (as one-element list)
+############################################################################
+sub convertSeqWindow {
+    my ( $win_seq, $win_size_real, $win_start, $curr_gi, $winHead, $graph_file_hdl, $annotate, $orig_seq ) = @_;
+    # use a different alphabet (lowercase) if option seq-graph-alph set
+    my $seq_graph_sequence = uc($win_seq);
+    $seq_graph_sequence = lc($win_seq) if ($i_change_seq_graph_alph);
+    # the "structure" of a sequence graph: one dot per window position
+    my $unpaired  = "." x $win_size_real;
+    my $seq_shrep = [ $unpaired, "ENERGY", "0.00", "SHAPE", "_", "STRUCT", $unpaired, "SEQ", $seq_graph_sequence ];
+    my $last_idx  = $win_size_real - 1;
+    my $backboneGraph_ref = getBackboneGraph( $seq_graph_sequence, $curr_gi, $win_start, 0, $last_idx, $orig_seq );
+    print $graph_file_hdl getSeqHeader( $winHead, $seq_shrep );
+    print $graph_file_hdl join( "\n", @{$backboneGraph_ref} ) . "\n";
+    # the graph index moves on by one per window position
+    $curr_gi += $win_size_real;
+    ## TODO add annotation graphs (if $annotate is given)
+    return ($curr_gi);
+}
+############################################################################
+# Converts all shreps of one window into GSPAN graphs and prints them.
+# For every shrep a header line, the backbone graph and (if not empty) the
+# structure graph are written to the graph file.
+# Input:
+# win_shreps_aref : array ref of shreps for the current window; each shrep
+#                   is an array ref with the dot-bracket string at index 0.
+#                   'STRUCT' and 'SEQ' entries are appended to every shrep.
+# win_seq : the nucleotide sequence for the current window
+# win_size_real : the current (true) window size
+# win_start : the starting position of win_seq in the original seq (1-n)
+# curr_gi : the current graph index
+# graph_file_hdl : the graph file handle
+# winHead : the header line for the current sequence window
+# annotate : currently unused (annotations are still a TODO)
+# abstr : input parameter -abstr (add the abstract structure graph)
+# cue : input parameter -cue (crop unpaired ends)
+# stacks : input parameter -stacks
+# orig_seq : the nucleotide sequence as read from fasta
+#
+# Output:
+# The current graph index after the last shrep
+############################################################################
+sub convertShapeWindow {
+    my ( $win_shreps_aref, $win_seq, $win_size_real, $win_start, $curr_gi,
+        $graph_file_hdl, $winHead, $annotate, $abstr, $cue, $stacks, $orig_seq ) = @_;
+    ## every shrep becomes its own connected component in the gspan file
+    for my $shrep ( @{$win_shreps_aref} ) {
+        my $shrep_struct = $shrep->[0];
+        # cropping indices (0 .. n-1); default is the uncropped window
+        my ( $crop_index_left, $crop_index_right ) = ( 0, length($shrep_struct) - 1 );
+        if ($cue) {
+            my $first_open = index( $shrep_struct, "(" );
+            # a completely unpaired window (no "(" found) is not cropped
+            if ( $first_open != -1 ) {
+                $crop_index_left  = $first_open;
+                $crop_index_right = rindex( $shrep_struct, ")" );
+            }
+        }
+        # backbone uses the graph index as it is at the start of this shrep
+        my $backboneGraph_ref = getBackboneGraph( $win_seq, $curr_gi, $win_start, $crop_index_left, $crop_index_right, $orig_seq );
+        # structure edges (+ stacks); with -abstr the abstract graph as well.
+        # Both helpers return the graph lines and the advanced graph index.
+        my ( $structGraph_ref, $next_gi ) =
+            $abstr
+            ? getStructPlusAbstractStructGraph( $shrep, $curr_gi, $win_size_real, $cue, $stacks )
+            : getStructGraph( $shrep, $curr_gi, $win_size_real, $stacks );
+        $curr_gi = $next_gi;
+        # extend the shrep with the (cropped) dot-bracket string and sequence
+        # so that they show up in the header
+        my $crop_length = $crop_index_right - $crop_index_left + 1;
+        push(
+            @{$shrep},
+            'STRUCT', substr( $shrep_struct, $crop_index_left, $crop_length ),
+            'SEQ',    substr( $win_seq,      $crop_index_left, $crop_length )
+        );
+        print $graph_file_hdl getShapeHeader( $winHead, $shrep );
+        print $graph_file_hdl join( "\n", @{$backboneGraph_ref} ) . "\n";
+        # don't print empty array; sgdnspdk breaks with empty lines
+        if ( @{$structGraph_ref} > 0 ) {
+            print $graph_file_hdl join( "\n", @{$structGraph_ref} ) . "\n";
+            # TODO add annotations to shrep structure when $annotate is set
+            # $win_start, $win_end (1-n)
+        }
+    }
+    return $curr_gi;
+}
+###########################################################################
+# Generates the vertices for the nucleotides of a window and the backbone
+# edges that connect neighbouring nucleotides in sequence order.
+# Input:
+# win_seq : the sequence for the current window
+# curr_gi : the graph index of the first nucleotide of the window
+# win_start : the starting position for the current window
+# curr_crop_i_left : the cropping index for the left end of sequence (-cue)
+# curr_crop_i_right : the cropping index for the right end of sequence (-cue)
+# orig_seq : the nucleotide sequence as read from fasta
+#
+# Output:
+# Array reference with one graph line per element: first all vertex lines
+# "<v|V> <graph index> <nucleotide> <absolute position>", then all edge
+# lines "e <graph index - 1> <graph index> > 1".
+# Dies if the vp option is set and the window cut from orig_seq differs
+# (ignoring case) from win_seq.
+############################################################################
+sub getBackboneGraph {
+    my ( $win_seq, $curr_gi, $win_start, $curr_crop_i_left, $curr_crop_i_right, $orig_seq ) = @_;
+    # RNAshapes substitutes T -> U, sequence only does not
+    # thus, just transform all t/T -> u/U to remain consistent
+    tr/tT/uU/ for ( $orig_seq, $win_seq );
+    my @nucleotides = split( "", $win_seq );
+    # with the vp option the original capitalization is needed; it is taken
+    # from $orig_seq by cutting out the window manually
+    my @cased_nucleotides;
+    if ( defined $i_vp ) {
+        my $cased_window = substr( $orig_seq, $win_start - 1, length($win_seq) );
+        if ( uc($cased_window) ne uc($win_seq) ) {
+            die( "error: windowed sequence generated due to vp option not equal " .
+                "to sequence reported by RNASHAPES.\n" .
+                "'${cased_window}' != '${win_seq}'" );
+        }
+        @cased_nucleotides = split( "", $cased_window );
+    }
+    # window indices to emit: the left crop index is always emitted,
+    # followed by everything up to the right crop index
+    my @window_indices = ( $curr_crop_i_left, ( $curr_crop_i_left + 1 ) .. $curr_crop_i_right );
+    my @vertex_lines;
+    my @edge_lines;
+    foreach my $step ( 0 .. $#window_indices ) {
+        my $idx     = $window_indices[$step];
+        my $node_gi = $curr_gi + $curr_crop_i_left + $step;
+        my $abs_pos = $win_start + $curr_crop_i_left + $step;
+        # vertex label: 'v' by default; when vp is set, lowercase
+        # nucleotides of the original sequence are labelled 'V'
+        my $vertice_label = 'v';
+        if ( defined $i_vp and ( $cased_nucleotides[$idx] =~ /[a-z]/ ) ) {
+            $vertice_label = 'V';
+        }
+        push( @vertex_lines, join( ' ', $vertice_label, $node_gi, $nucleotides[$idx], $abs_pos ) );
+        # every vertex but the first is linked to its predecessor
+        if ( $step > 0 ) {
+            push( @edge_lines, join( ' ', "e", $node_gi - 1, $node_gi, '> 1' ) );
+        }
+    }
+    return [ @vertex_lines, @edge_lines ];
+}
+###########################################################################
+# Builds the structure part of the graph for one shrep, including the
+# abstract structure graph.
+#
+# The backbone graph already exists (see getBackboneGraph); this method
+# parses the dot-bracket string once from left to right and emits
+#  - the base-pair edges                        "e i j s"
+#  - optionally (-stack) one vertex per pair of stacked base-pairs
+#                                               "v gi P" with four "p" edges
+#  - one abstract vertex per structure element  "v gi ${ABSTRUCT}#<type>"
+#    with type S (stem), HL (hairpin loop), BL (bulge loop), IL (internal
+#    loop), EL (external loop) or ML (multi-loop)
+#  - "n" edges between neighbouring abstract elements
+#  - for each abstract vertex a relation vertex "v gi ^R", connected to the
+#    abstract vertex by a "^r" edge and to every nucleotide (and stack
+#    vertex) it covers by an "@r" edge.
+# All the while the current graph index is kept track of.
+#
+# The method relies on the subs close_stem, extend_or_open_nested_ML and
+# test and on the file-level variables $i_debug and $ABSTRUCT.
+#
+# Input:
+# curr_shrep : the current shrep (array ref, dot-bracket string at index 0)
+# curr_gi : the current graph index (graph index of the first nucleotide)
+# win_size : the window size
+# cue : the input parameter -cue (whether to crop unpaired ends)
+# stacks : the input parameter -stacks (whether to add stack information)
+#
+# Output:
+# The structure graph information line-by-line as an array reference and
+# the current graph index.
+# Dies with a diagnostic if the dot-bracket string runs into a state the
+# parser does not expect.
+#
+############################################################################
+sub getStructPlusAbstractStructGraph {
+    my ( $curr_shrep, $curr_gi, $win_size, $cue, $stacks ) = @_;
+    #  $win_size =  length($curr_shrep->[0]);
+    my @struct = split( "", $curr_shrep->[0] );
+    #  print "=================testing: new shrep : gi=$curr_gi ===============================\n" if ($i_debug);
+    # OBJECTS and VARIABLES
+    # all indices are saved according to the current graph index, $curr_gi,
+    # which is at the beginning of the current window, so that the nucleotide
+    # index can be inferred using $idx + $curr_gi
+    # opening brackets
+    my @open_blks = (); # open blocks of consecutive opening brackets (array of arrays)
+    my @p_open = ();  # currently open block of the last identified complete block
+    my @c_open = ();  # current "incomplete" block of consecutive opening brackets
+    # base-pair objects
+    my %bps    = ();  # all base-pairs, key="i:j"; set to "" here. The lookups
+                      # $stems{ $bps{"i:j"} } further down need the stem key
+                      # as value - presumably stored by close_stem (TODO confirm)
+    my @c_bp   = ();  # current BP, just being closed at current index pos (i,j)
+    my @p_bp   = ();  # previously closed BP at position before current index
+    # stem objects
+    my %stems  = ();  # all stems, key="i:j,k:l", value=stem array, (i,j) outer BP
+    my @c_stem = ();  # current incomplete stem object
+    my @stmbrks = (); # all stem-breaks in the form of (i,k,l), where i is the remaining
+                      # open base and (k,l) is the subsequent closing base-pair
+    #  my @p_stmbrk	= (); # previous stem-break in the form of (i,k,l), where i is the remaining
+    #  					  # open base and (k,l) is the subsequent closing base-pair
+    # loop objects
+    my @up    = (); # all unpaired regions (i,j) that are of unknown type
+    my @c_up  = (); # current incomplete unpaired object
+    my @hls   = (); # all hairpin loop objects
+    my @bls   = (); # all bulge-loop objects
+    my @ils   = (); # all internal loops
+    my @mls   = (); # all multi-loops
+    my @c_mls = (); # current incomplete multi-loops! (there can be more than one)
+    my @els   = (); # all external loops
+    my @c_el  = (); # current incomplete external loop
+    # iterate through shrep structure and identify abstract parts
+    # NOTE: inside each branch $idx is shifted by $curr_gi, i.e. from then on
+    # it is a graph index and no longer an index into @struct
+    foreach my $idx ( 0 .. @struct - 1 ) {
+        #===================================================
+        # current char is the opening bracket of a base-pair
+        if ( $struct[$idx] eq "(" ) {
+            #===================================================
+            # update current window index to current graph index
+            $idx += $curr_gi;
+            # case: '((' currently there is an open block, extend it
+            if (@c_open) {
+                push( @c_open, $idx );
+                # case: '.(' or ')('  or BOS'(' there is no open block, create a new one ".(" or ")("
+            } else {
+                # begin a new current open block
+                push( @c_open, $idx );
+                # case: ')('
+                if (@c_stem) {
+                    # CLOSE STEM
+                    print STDERR "TEST - closed stem:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@c_stem ) ) if ($i_debug);
+                    close_stem( \@c_stem, \%stems, \@p_bp, \@c_bp, \@p_open, \@stmbrks, \@open_blks, \%bps );
+                    # case (x(x)(
+                    if (@open_blks) {
+                        # EXTEND_ML, if one exists
+                        if (@c_mls) {
+                            extend_or_open_nested_ML( $curr_shrep, \@c_mls, \@p_bp, \@up, \@stmbrks, \@open_blks, $idx, $curr_gi );
+                            # OPEN_ML
+                        } else {
+                            # case 'x((x)(' the opening of the ML was a stem-break
+                            if ( @stmbrks
+                                && $stmbrks[-1]->[1] == $p_bp[0]
+                                && $stmbrks[-1]->[2] == $p_bp[1] ) {
+                                my @c_stmbrk = @{ pop(@stmbrks) };
+                                my @a        = ( $c_stmbrk[0], -1 );
+                                my @ml       = ();
+                                push( @ml, \@a );    # 1st base-pair is in-waiting
+                                my @a2 = ( $c_stmbrk[1], $c_stmbrk[2] );
+                                push( @ml, \@a2 );    #2nd BP
+                                my @a3 = ($idx);
+                                push( @ml,    \@a3 );    #first of current BP
+                                push( @c_mls, \@ml );
+                                print STDERR "TEST - opened ML with SB:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) ) if ($i_debug);
+                                # case 'x(.(x)('
+                            } elsif (@up) {
+                                my @tmp_up = @{ pop(@up) };
+                                # compare adjacent BP to UP to see if they fit
+                                if ( $tmp_up[1] + 1 == $p_bp[0]
+                                    && $open_blks[-1]->[-1] == $tmp_up[0] - 1 ) {
+                                    my @a = ( $open_blks[-1]->[-1], -1 );
+                                    my @ml = ();
+                                    push( @ml, \@a );    # 1st BP awaits closing
+                                    my @newbp = @p_bp;
+                                    push( @ml, \@newbp );    # 2nd BP
+                                    my @a2 = ($idx);
+                                    push( @ml,    \@a2 );    # current open BP, awaits closing
+                                    push( @c_mls, \@ml );
+                                    print STDERR "TEST - opened ML with prev UP1:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) )
+                                        if ($i_debug);
+                                } else {
+                                    die "ERROR: the base-pairs to not match the " . "previous unpaired region?!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
+                                }
+                            } else {
+                                die "ERROR: in case ML, but there is no initial " . "part of the ML to be identified\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
+                            }
+                        }
+                        # case: 'x(x)('
+                    } else {
+                        # case: '.(x)(' => CLOSE_EL
+                        if (@c_el) {
+                            if ( $c_el[-1] == $p_bp[0] ) {
+                                push( @c_el, $p_bp[1] );
+                                my @newel = @c_el;
+                                push( @els, \@newel );
+                                @c_el = ();
+                                print STDERR "TEST - closed EL1:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newel ) ) if ($i_debug);
+                            } else {
+                                die "ERROR: the previous BP must match the current EL\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
+                            }
+                        }
+                        # in both cases OPEN_EL
+                        push( @c_el, @p_bp );
+                        push( @c_el, $idx );
+                        print STDERR "TEST - opened EL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@c_el ) ) if ($i_debug);
+                    }
+                    # case '.('
+                } elsif (@c_up) {
+                    # case (x(x).( OR 'x(.('
+                    if (@open_blks) {
+                        # case 'x(x).(' ML
+                        if ( @p_bp && ( $p_bp[1] == $c_up[0] - 1 ) ) {
+                            # case '((x)x(x).(' EXTEND_ML, if one exists
+                            if (@c_mls) {
+                                extend_or_open_nested_ML( $curr_shrep, \@c_mls, \@p_bp, \@up, \@stmbrks, \@open_blks, $idx, $curr_gi );
+                                # case 'x((x).(' or 'x(.(x).(' OPEN_ML
+                            } else {
+                                # case 'x((x).(' the opening of the ML was a stem-break
+                                if ( @stmbrks
+                                    && $stmbrks[-1]->[1] == $p_bp[0]
+                                    && $stmbrks[-1]->[2] == $p_bp[1] ) {
+                                    my @c_stmbrk = @{ pop(@stmbrks) };
+                                    my @a        = ( $c_stmbrk[0], -1 );
+                                    my @ml       = ();
+                                    push( @ml, \@a );    # 1st base-pair is in-waiting
+                                    my @a2 = ( $c_stmbrk[1], $c_stmbrk[2] );
+                                    push( @ml, \@a2 );    #2nd BP
+                                    my @a3 = ($idx);
+                                    push( @ml,    \@a3 );    #first of current BP
+                                    push( @c_mls, \@ml );
+                                    print STDERR "TEST - open ML with SB:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) ) if ($i_debug);
+                                    # case 'x(.(x).(' opening the ML with initial UP region
+                                } elsif (@up) {
+                                    # get previous unpaired region
+                                    my $tmp  = pop(@up);
+                                    my @p_up = @{$tmp};
+                                    # case: '(.(x).(' compare adjacent BP to UP to see if they create a ML
+                                    # NOTE(review): unlike the ')(' branch above, a
+                                    # non-matching unpaired region is dropped here
+                                    # without an error - confirm this is intended
+                                    if ( $p_up[1] + 1 == $p_bp[0]
+                                        && $open_blks[-1]->[-1] == $p_up[0] - 1 ) {
+                                        my @a = ( $open_blks[-1]->[-1], -1 );
+                                        my @ml = ();
+                                        push( @ml, \@a );    # 1st BP awaits closing
+                                        my @newbp = @p_bp;
+                                        push( @ml, \@newbp );    # 2nd BP
+                                        my @a2 = ($idx);
+                                        push( @ml,    \@a2 );    # current open BP, awaits closing
+                                        push( @c_mls, \@ml );
+                                        print STDERR "TEST - opened ML with prev UP2:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) )
+                                            if ($i_debug);
+                                    }
+                                    # has to be an ML, but can't find first part
+                                } else {
+                                    die "ERROR: in case ML, but there is no initial " . "part of the ML to be identified\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
+                                }
+                            }
+                            # case '(.(
+                        } else {
+                            my @newup = @c_up;
+                            push( @up, \@newup );
+                            print STDERR "TEST - closed UP:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newup ) ) if ($i_debug);
+                        }
+                        # case: 'x(x).(' OR 'EOS.('
+                    } else {
+                        # case: '.(x)(' => CLOSE_EL
+                        if (@c_el) {
+                            # NOTE(review): the other three CLOSE_EL checks in this
+                            # sub compare against $p_bp[0]; this one uses
+                            # $c_bp[0] - confirm whether that is intended
+                            if ( $c_el[-1] == $c_bp[0] ) {
+                                push( @c_el, $p_bp[1] );
+                                my @newel = @c_el;
+                                push( @els, \@newel );
+                                @c_el = ();
+                                print STDERR "TEST - closed EL2:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newel ) ) if ($i_debug);
+                            } else {
+                                die "ERROR: the previous BP must match the current EL\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
+                            }
+                        }
+                        # in both cases OPEN_EL
+                        if (@p_bp) {
+                            push( @c_el, @p_bp );
+                            push( @c_el, $idx );
+                        } else {
+                            # ignore this if -cue is given
+                            # TODO check the gi index
+                            if ($cue) {
+                                @c_el = ();
+                            } else {
+                                # no BP on the left: the window start stands in
+                                # as a pseudo base-pair (gi,gi)
+                                push( @c_el, ( $curr_gi, $curr_gi ) );
+                                push( @c_el, $idx );
+                                print STDERR "TEST - opened EL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@c_el ) ) if ($i_debug);
+                            }
+                        }
+                    }
+                    @c_up = ();
+                } else {
+                    # no more cases except opening bracket at beginning of sequence
+                    # in this case do nothing, as the index has already been added
+                    die "ERROR: '((', ')(', '.(' have all been covered\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) )
+                        unless ( $idx == $curr_gi );
+                }
+            }
+            #===================================================
+            # current char is the unpaired base
+        } elsif ( $struct[$idx] eq "." ) {
+            #===================================================
+            # update current window index to current graph index
+            $idx += $curr_gi;
+            # case ').' => CLOSE_STEM
+            if (@c_stem) {
+                print STDERR "TEST - closed stem:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@c_stem ) ) if ($i_debug);
+                close_stem( \@c_stem, \%stems, \@p_bp, \@c_bp, \@p_open, \@stmbrks, \@open_blks, \%bps );
+            }
+            # case '..' extend UP, @c_up
+            if (@c_up) {
+                $c_up[1] = $idx;
+                # case 'x.' open new UP
+            } else {
+                # OPEN_UP, @c_up
+                @c_up = ( $idx, $idx );
+                # case '(.' just come to the end of an open block, push it onto stack
+                if (@c_open) {
+                    my @newopen = @c_open;
+                    push( @open_blks, \@newopen );
+                    @c_open = ();
+                    # case ').' or '.'
+                    # (have already closed stem and created stem-break and emptied p_open)
+                } else {
+                    # case 'x(x).' or '.' => CLOSE_EL
+                    if ( !@open_blks ) {
+                        # case 'x().().'
+                        if (@c_el) {
+                            # double check
+                            if ( $c_el[-1] == $p_bp[0] ) {
+                                push( @c_el, $p_bp[1] );
+                                my @newel = @c_el;
+                                push( @els, \@newel );
+                                @c_el = ();
+                                print STDERR "TEST - closed EL3:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newel ) ) if ($i_debug);
+                            } else {
+                                die "This case should not occur!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
+                            }
+                        }
+                    }
+                }
+            }
+            #===================================================
+            # current char is the closing bracket of a base-pair
+        } elsif ( $struct[$idx] eq ")" ) {
+            #===================================================
+            # update current window index to current graph index
+            $idx += $curr_gi;
+            # save previous base-pair
+            if (@c_bp) {
+                @p_bp = @c_bp;
+            }
+            # case: '((x))', extend stem
+            if (@p_open) {
+                # get current base-pair
+                @c_bp = ( pop(@p_open), $idx );
+                # add it to the base-pair hash
+                $bps{"$c_bp[0]:$c_bp[1]"} = "";
+                # EXTEND_STEM
+                if (@c_stem) {
+                    $c_stem[0] = $c_bp[0];
+                    $c_stem[1] = $c_bp[1];
+                } else {
+                    die "ERROR: in case '))' there has to be an open stem object!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
+                }
+                # case: '(x(x))' or '(x.)', '()' not allowed
+            } else {
+                # case: '(x(x))'  CLOSE_STEM close previous stem
+                if (@c_stem) {
+                    print STDERR "TEST - closed stem:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@c_stem ) ) if ($i_debug);
+                    close_stem( \@c_stem, \%stems, \@p_bp, \@c_bp, \@p_open, \@stmbrks, \@open_blks, \%bps );
+                    # case '(x(x))' OPEN_STEM
+                    if (@open_blks) {
+                        my $tmp = pop(@open_blks);
+                        @p_open = @{$tmp};
+                        # get current base-pair
+                        @c_bp = ( pop(@p_open), $idx );
+                        # add it to the base-pair hash
+                        # as the stem is not finished yet, cannot add stem information
+                        $bps{"$c_bp[0]:$c_bp[1]"} = "";
+                        # open stem: outer (0,1) and inner (2,3) BP start out identical
+                        $c_stem[0] = $c_bp[0];
+                        $c_stem[1] = $c_bp[1];
+                        $c_stem[2] = $c_bp[0];
+                        $c_stem[3] = $c_bp[1];
+                    } else {
+                        die "ERROR: there have to be open blocks to match " . "current closing bracket!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
+                    }
+                    # case: '(.(x))' => CLOSE_BL
+                    if ( @up && $up[-1]->[0] - 1 == $c_bp[0] && $up[-1]->[1] + 1 == $p_bp[0] ) {
+                        my @newBL = ( @c_bp, @p_bp );
+                        push( @bls, \@newBL );
+                        pop(@up);
+                        print STDERR "TEST - closed BL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newBL ) ) if ($i_debug);
+                        # case: '(x(x).(x))' => CLOSE_ML
+                    } elsif ( @c_mls && $c_mls[-1]->[-1]->[-1] == $p_bp[0] ) {
+                        my @newml = @{ pop(@c_mls) };
+                        push( @{ $newml[-1] }, $p_bp[1] );    # close last BP
+                        $newml[0]->[1] = $c_bp[1];            # close 1st BP
+                        push( @mls, \@newml );
+                        print STDERR "TEST - closed ML1:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newml, 1 ) ) if ($i_debug);
+                    } else {
+                        die "ERROR: what is this case?\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
+                    }
+                    # case: (x.)
+                } else {
+                    # OPEN_STEM open new stem
+                    if (@open_blks) {
+                        @p_open = @{ pop(@open_blks) };
+                        # get current base-pair
+                        @c_bp = ( pop(@p_open), $idx );
+                        # add it to the base-pair hash
+                        # as the stem is not finished yet, cannot add stem information
+                        $bps{"$c_bp[0]:$c_bp[1]"} = "";
+                        # open stem: outer (0,1) and inner (2,3) BP start out identical
+                        $c_stem[0] = $c_bp[0];
+                        $c_stem[1] = $c_bp[1];
+                        $c_stem[2] = $c_bp[0];
+                        $c_stem[3] = $c_bp[1];
+                    } else {
+                        die "ERROR: there have to be open blocks to match " . "current closing bracket!$curr_shrep->[0]\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
+                    }
+                    # CLOSE_C_UP
+                    if (@c_up) {
+                        # case: (.) => close_HL
+                        if ( $c_up[0] - 1 == $c_bp[0] ) {
+                            my @newhl = @c_bp;
+                            push( @hls, \@newhl );
+                            @c_up = ();
+                            print STDERR "TEST - closed HL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newhl ) ) if ($i_debug);
+                            # case: (x(x).)
+                        } elsif ( @p_bp && $c_up[0] - 1 == $p_bp[1] ) {
+                            # case: ((x).) => CLOSE_BL
+                            if ( @stmbrks
+                                && $stmbrks[-1]->[0] == $c_bp[0]
+                                && $stmbrks[-1]->[1] == $p_bp[0]
+                                && $stmbrks[-1]->[2] == $p_bp[1] ) {
+                                my @newbl = ( @c_bp, @p_bp );
+                                push( @bls, \@newbl );
+                                pop(@stmbrks);
+                                @c_up = ();
+                                print STDERR "TEST - closed BL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newbl ) ) if ($i_debug);
+                                # case: (x(x).), either CLOSE_ML or CLOSE_IL
+                            } else {
+                                # case '(.(x).)' => CLOSE_IL
+                                if ( @up
+                                    && ( $up[-1]->[0] - 1 == $c_bp[0] )
+                                    && ( $up[-1]->[1] + 1 == $p_bp[0] ) ) {
+                                    my @p_up = @{ pop(@up) };
+                                    my @newil = ( @c_bp, @p_bp );
+                                    push( @ils, \@newil );
+                                    @c_up = ();
+                                    print STDERR "TEST - closed IL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newil ) ) if ($i_debug);
+                                    # case (x(x)x(x).) => CLOSE_ML remember, array of tuples
+                                } elsif ( @c_mls && $c_mls[-1]->[-1]->[0] == $p_bp[0] ) {
+                                    my @newml = @{ pop(@c_mls) };
+                                    push( @{ $newml[-1] }, $p_bp[1] );    # close final BP
+                                    $newml[0]->[1] = $c_bp[1];            # close 1st BP
+                                    push( @mls, \@newml );
+                                    @c_up = ();
+                                    print STDERR "TEST - closed ML2:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newml, 1 ) ) if ($i_debug);
+                                } else {
+                                    die "There should be no such case!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
+                                }
+                            }
+                            # there should be no more unknown unpaired regions left!
+                        } else {
+                            die "There shouldn't be any unknown unpaired " . "regions left\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
+                        }
+                    } else {
+                        die "ERROR: '()' is not allowed in $curr_shrep->[0]\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
+                    }
+                }
+            }
+        }
+    }
+    # EOS
+    # CLOSE_STEM
+    print STDERR "TEST - closed stem EOS:\n" . ( test( $curr_shrep->[0], ( $win_size + $curr_gi ), $curr_gi, \@c_stem ) )
+        if ( @c_stem && $i_debug );
+    close_stem( \@c_stem, \%stems, \@p_bp, \@c_bp, \@p_open, \@stmbrks, \@open_blks, \%bps ) if (@c_stem);
+    # $curr_shrep is an array ref; report the dot-bracket string itself
+    die "ERROR: there should not be an open ML!\n" . $curr_shrep->[0] . "\n"
+        if (@c_mls);
+    # case ().() CLOSE_EL
+    if (@c_el) {
+        if ( $c_el[-1] == $p_bp[0] ) {
+            push( @c_el, $p_bp[1] );
+            my @newel = @c_el;
+            push( @els, \@newel );
+            @c_el = ();
+            print STDERR "TEST - closed EL4 EOS:\n" . ( test( $curr_shrep->[0], ( $win_size + $curr_gi ), $curr_gi, \@newel ) )
+                if ($i_debug);
+        } else {
+            # $curr_shrep is an array ref; report the dot-bracket string itself
+            die "ERROR: the base-pairs in the EL don't match: " . $curr_shrep->[0] . "\n";
+        }
+    }
+    # case (). CLOSE_EL
+    if (@c_up) {
+        unless ($cue) {
+            if (@p_bp) {
+                push( @c_el, @p_bp );
+            } else {
+                push( @c_el, ( $curr_gi, $curr_gi ) );
+            }
+            # the window end stands in as a pseudo base-pair on the right
+            push( @c_el, ( ( $curr_gi + $win_size - 1 ), ( $curr_gi + $win_size - 1 ) ) );
+            my @newel = @c_el;
+            push( @els, \@newel );
+            @c_el = ();
+            print STDERR "TEST - closed EL5 EOS:\n" . ( test( $curr_shrep->[0], ( $win_size + $curr_gi ), $curr_gi, \@newel ) )
+                if ($i_debug);
+            ## check case where cue is given, but window is completely unstructured
+        } else {
+            unless (@p_bp) {
+                my @newel = ( $curr_gi, $curr_gi, ( $curr_gi + $win_size - 1 ), ( $curr_gi + $win_size - 1 ) );
+                push( @els, \@newel );
+            }
+        }
+    }
+    #  # update current graph index (since all shrep indices have been read)
+    $curr_gi += $win_size;
+    # ===================================================
+    # build graph lines
+    my @graph_lines = ();
+    # ===================================================
+    # graph line objects
+    my @edg        = ();
+    my @stackgraph = ();
+    my @abstrgraph = ();
+    #-------------------> stems,stacks
+    # build basic graph edges, stacks, and vertices for stems in the abstract graph
+    # The keys are sorted so that the graph indices handed out below are the
+    # same on every run (plain hash order is not stable between runs).
+    my @tmp = sort keys %stems;
+    foreach my $sk (@tmp) {
+        my @stem = @{ $stems{$sk} }; # stem (i,j,k,l)=> (i,j)= (0,1) and (k,l)=(2,3)
+        die "ERROR: There are not enough elements in this stem: ", ( join( ",", @stem ) ), "\n" unless ( scalar(@stem) == 4 );
+        # infer base-pairs from stem
+        @p_bp = ();
+        @c_bp = ();
+        my @stem_relation_idxs  = ();
+        my @stack_relation_idxs = ();
+        my $stacksize           = $stem[2] - $stem[0] + 1;
+        # walk from the outer BP (i,j) inwards: (i+x, j-x)
+        for ( my $x = 0 ; $x < $stacksize ; $x++ ) {
+            my $i = $stem[0] + $x;
+            my $j = $stem[1] - $x;
+            @p_bp = @c_bp;
+            @c_bp = ( $i, $j );
+            if ( defined $bps{"$i:$j"} ) {
+                # add indices to stem relation object
+                push( @stem_relation_idxs, $i );
+                push( @stem_relation_idxs, $j );
+                # add edge to graph edges
+                push( @edg, "e $i $j s" );
+                # add stack if option given
+                if ($stacks) {
+                    # a stack vertex links this BP with the previous (outer) one
+                    if (@p_bp) {
+                        push( @stackgraph,          "v $curr_gi P" );
+                        push( @stackgraph,          "e $curr_gi $p_bp[0] p" );
+                        push( @stackgraph,          "e $curr_gi $p_bp[1] p" );
+                        push( @stackgraph,          "e $c_bp[0] $curr_gi p" );
+                        push( @stackgraph,          "e $c_bp[1] $curr_gi p" );
+                        push( @stack_relation_idxs, $curr_gi );
+                        # update graph index
+                        ++$curr_gi;
+                    }
+                }
+            } else {
+                die "ERROR: There is no such base-pair in the base-pair hash:" . " ($i, $j)\n";
+            }
+        }
+        # add stem to abstract graph
+        # (graph indices become a bit jumbled, but saves time)
+        push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#S" );
+        # stem object now saves the curr_gi index for that stem
+        $stems{$sk} = $curr_gi;
+        ++$curr_gi;
+        # add relations
+        push( @abstrgraph, "v $curr_gi ^R" );
+        push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" );
+        foreach my $r (@stem_relation_idxs) {
+            push( @abstrgraph, "e $curr_gi $r \@r" );
+        }
+        if ($stacks) {
+            foreach my $r (@stack_relation_idxs) {
+                push( @abstrgraph, "e $curr_gi $r \@r" );
+            }
+        }
+        ++$curr_gi;
+    }
+    #-------------------> add HLs
+    foreach my $hl (@hls) {
+        # check size
+        die "ERROR: the HL object is the incorrect size\n" unless ( @{$hl} == 2 );
+        # add vertex
+        push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#HL" );
+        # add edge from stem to hl
+        if ( defined( $stems{ $bps{"$hl->[0]:$hl->[1]"} } ) ) {
+            push( @abstrgraph, "e " . $stems{ $bps{"$hl->[0]:$hl->[1]"} } . " $curr_gi n" );
+        } else {
+            die "ERROR: stem not defined in base-pair hash for $hl->[0]:$hl->[1]\n";
+        }
+        ++$curr_gi;
+        # add relations
+        push( @abstrgraph, "v $curr_gi ^R" );
+        push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" );
+        # add all nodes between i to j, inclusively
+        for ( my $r = $hl->[0] ; $r <= $hl->[1] ; $r++ ) {
+            push( @abstrgraph, "e $curr_gi $r \@r" );
+        }
+        ++$curr_gi;
+    }
+    #------------------> add BLs
+    foreach my $bl (@bls) {
+        # check size
+        die "ERROR: the BL object is the incorrect size\n" unless ( scalar( @{$bl} ) == 4 );
+        # add vertex
+        push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#BL" );
+        # add edges from bl to both adjacent stems
+        if ( defined( $stems{ $bps{"$bl->[0]:$bl->[1]"} } )
+            && defined( $stems{ $bps{"$bl->[2]:$bl->[3]"} } ) ) {
+            # add outer stem: S->BL
+            push( @abstrgraph, "e " . $stems{ $bps{"$bl->[0]:$bl->[1]"} } . " $curr_gi n" );
+            # add inner stem: BL->S
+            push( @abstrgraph, "e $curr_gi " . $stems{ $bps{"$bl->[2]:$bl->[3]"} } . " n" );
+        } else {
+            die "ERROR: stems not defined for BL\n";
+        }
+        ++$curr_gi;
+        # add relations
+        push( @abstrgraph, "v $curr_gi ^R" );
+        push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" );
+        # add all nodes between i to k and l to j, inclusively
+        for ( my $r = $bl->[0] ; $r <= $bl->[2] ; $r++ ) {
+            push( @abstrgraph, "e $curr_gi $r \@r" );
+        }
+        for ( my $r = $bl->[3] ; $r <= $bl->[1] ; $r++ ) {
+            push( @abstrgraph, "e $curr_gi $r \@r" );
+        }
+        ++$curr_gi;
+    }
+    #------------------> add ILs
+    foreach my $il (@ils) {
+        # check size
+        die "ERROR: the IL object is the incorrect size\n" unless ( scalar( @{$il} ) == 4 );
+        # add vertex
+        push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#IL" );
+        # add edges from il to both adjacent stems
+        if ( defined( $stems{ $bps{"$il->[0]:$il->[1]"} } )
+            && defined( $stems{ $bps{"$il->[2]:$il->[3]"} } ) ) {
+            # add outer stem: S->IL
+            push( @abstrgraph, "e " . $stems{ $bps{"$il->[0]:$il->[1]"} } . " $curr_gi n" );
+            # add inner stem: IL->S
+            push( @abstrgraph, "e $curr_gi " . $stems{ $bps{"$il->[2]:$il->[3]"} } . " n" );
+        } else {
+            die "ERROR: stems not defined for IL\n";
+        }
+        ++$curr_gi;
+        # add relations
+        push( @abstrgraph, "v $curr_gi ^R" );
+        push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" );
+        # add all nodes between i to k and l to j, inclusively
+        for ( my $r = $il->[0] ; $r <= $il->[2] ; $r++ ) {
+            push( @abstrgraph, "e $curr_gi $r \@r" );
+        }
+        for ( my $r = $il->[3] ; $r <= $il->[1] ; $r++ ) {
+            push( @abstrgraph, "e $curr_gi $r \@r" );
+        }
+        ++$curr_gi;
+    }
+    #------------------> add ELs
+    # an EL is (i,j,k,l); i == j resp. k == l marks a window end that stands
+    # in for a missing base-pair on that side
+    foreach my $el (@els) {
+        #check size
+        die "ERROR: the EL object is the incorrect size\n" unless ( scalar( @{$el} ) == 4 );
+        # add vertex
+        push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#EL" );
+        # add edges from el to both adjacent stems, if available
+        unless ( $el->[0] == $el->[1] ) {
+            if ( defined( $stems{ $bps{"$el->[0]:$el->[1]"} } ) ) {
+                push( @abstrgraph, "e " . $stems{ $bps{"$el->[0]:$el->[1]"} } . " $curr_gi n" );
+            } else {
+                die "ERROR: stems not defined for left BP of EL\n";
+            }
+        }
+        unless ( $el->[2] == $el->[3] ) {
+            if ( defined( $stems{ $bps{"$el->[2]:$el->[3]"} } ) ) {
+                push( @abstrgraph, "e $curr_gi " . $stems{ $bps{"$el->[2]:$el->[3]"} } . " n" );
+            } else {
+                die "ERROR: stems not defined for right BP of EL\n";
+            }
+        }
+        ++$curr_gi;
+        # add relations
+        push( @abstrgraph, "v $curr_gi ^R" );
+        push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" );
+        # @r relation edges
+        for ( my $r = $el->[1] ; $r <= $el->[2] ; $r++ ) {
+            push( @abstrgraph, "e $curr_gi $r \@r" );
+        }
+        # if BP, not EOS
+        unless ( $el->[0] == $el->[1] ) {
+            push( @abstrgraph, "e $curr_gi $el->[0] \@r" );
+        }
+        # if BP, not EOS
+        unless ( $el->[2] == $el->[3] ) {
+            push( @abstrgraph, "e $curr_gi $el->[3] \@r" );
+        }
+        ++$curr_gi;
+    }
+    #------------------> add MLs
+    # an ML is an array of base-pairs; the first one is the closing BP
+    # NOTE: the ML arrays are consumed (shift) while they are processed
+    foreach my $ml (@mls) {
+        #check size
+        die "ERROR: the ML object is not big enough!\n" unless ( scalar( @{$ml} ) >= 3 );
+        # add vertex
+        push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#ML" );
+        # add edge from outer stem to first BP
+        my $closing_bp = shift( @{$ml} );
+        if ( defined( $stems{ $bps{"$closing_bp->[0]:$closing_bp->[1]"} } ) ) {
+            push( @abstrgraph, "e " . $stems{ $bps{"$closing_bp->[0]:$closing_bp->[1]"} } . " $curr_gi n" );
+        } else {
+            die "ERROR: ML is not defined for BP $closing_bp->[0]:$closing_bp->[1]\n";
+        }
+        # add remaining stems
+        foreach my $bp ( @{$ml} ) {
+            if ( defined( $stems{ $bps{"$bp->[0]:$bp->[1]"} } ) ) {
+                push( @abstrgraph, "e $curr_gi " . $stems{ $bps{"$bp->[0]:$bp->[1]"} } . " n" );
+            } else {
+                die "ERROR: ML is not defined for BP $bp->[0]:$bp->[1]\n";
+            }
+        }
+        ++$curr_gi;
+        # add relations
+        push( @abstrgraph, "v $curr_gi ^R" );
+        push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" );
+        # add @r relation edges between i and k, where (i,j) is the first BP and (k,l)
+        # the second BP
+        my $pbp_aref = $closing_bp;
+        my $cbp_aref = shift( @{$ml} );
+        for ( my $r = $pbp_aref->[0] ; $r <= $cbp_aref->[0] ; $r++ ) {
+            push( @abstrgraph, "e $curr_gi $r \@r" );
+        }
+        # rest of @r relation edges
+        foreach my $bp ( @{$ml} ) {
+            $pbp_aref = $cbp_aref;
+            $cbp_aref = $bp;
+            for ( my $r = $pbp_aref->[1] ; $r <= $cbp_aref->[0] ; $r++ ) {
+                push( @abstrgraph, "e $curr_gi $r \@r" );
+            }
+        }
+        # add final stretch between last BP (m,n) and first closing base-pair (i,j)
+        for ( my $r = $cbp_aref->[1] ; $r <= $closing_bp->[1] ; $r++ ) {
+            push( @abstrgraph, "e $curr_gi $r \@r" );
+        }
+        ++$curr_gi;
+    }
+    # base-pair edges first, then stack vertices/edges, then the abstract graph
+    @graph_lines = ( @edg, @stackgraph, @abstrgraph );
+    return ( \@graph_lines, $curr_gi );
+}
+###########################################################################
+# Debugging helper for the dot-bracket parsing: renders the structure
+# followed by a marker line.
+# Input:
+# shrep : the dot-bracket string
+# idx : graph index of the current position (marked with ^)
+# gi : graph index of the first character of shrep
+# mark_aref : optional array ref of graph indices to mark with '
+# isML : if true, mark_aref holds array refs (base-pairs); the first
+#        element of each is marked and, for two-element entries, the
+#        second one as well
+#
+# Output:
+# "<shrep>\n<marker line>\n"; the marker line has one more column than
+# shrep so that a position just behind the end can be marked.
+############################################################################
+sub test {
+    my ( $shrep, $idx, $gi, $mark_aref, $isML ) = @_;
+    # collect the columns (window indices) that are to be marked
+    my %is_marked;
+    foreach my $entry ( @{ $mark_aref || [] } ) {
+        my @graph_positions;
+        if ($isML) {
+            push( @graph_positions, $entry->[0] );
+            push( @graph_positions, $entry->[1] ) if ( @{$entry} == 2 );
+        } else {
+            push( @graph_positions, $entry );
+        }
+        $is_marked{ $_ - $gi } = 1 foreach (@graph_positions);
+    }
+    # the caret wins over a mark on the same column
+    my $caret_col   = $idx - $gi;
+    my $marker_line = join(
+        "",
+        map { ( $_ == $caret_col ) ? "^" : ( $is_marked{$_} ? "'" : " " ) } ( 0 .. length($shrep) )
+    );
+    return "$shrep\n" . $marker_line . "\n";
+}
+###########################################################################
+# Extends the innermost open multi-loop (ML) or opens a new nested ML when
+# a further opening bracket is read. This means a ML already exists.
+# We can have the case ((x)( or ((x).(
+#
+# Input:
+# curr_shrep	The current shrep object, with shrep string at pos 0
+# c_mls_aref	The current open MLs array reference; each ML is an array
+#			of base-pair array references
+# p_bp_aref		The previous base-pair array reference
+# up_aref		Array reference of [first,last] regions (presumably the
+#			unpaired regions - confirm against the caller)
+# stmbrks_aref	The stem-breaks array reference
+# open_blks_aref	The open bracket blocks array reference
+# idx			The current index
+# curr_gi		The current graph index (only used for debug output)
+# Output:
+# None, it should just modify the ML accordingly
+############################################################################
+sub extend_or_open_nested_ML {
+    my ( $curr_shrep, $c_mls_aref, $p_bp_aref, $up_aref, $stmbrks_aref, $open_blks_aref, $idx, $curr_gi ) = @_;
+    if ( $c_mls_aref->[-1]->[-1]->[-1] == $p_bp_aref->[0] ) {
+        # case '(x(x)x(x)x(' extend current ML:
+        # the last base-pair of the innermost ML is the previous BP
+        my $open_ml = $c_mls_aref->[-1];
+        push( @{ $open_ml->[-1] }, $p_bp_aref->[1] );    # close previous BP
+        push( @{$open_ml}, [$idx] );                     # add new open base-pair
+        print STDERR "TEST - extended ML:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, $open_ml, 1 ) ) if ($i_debug);
+    } elsif ( @$stmbrks_aref
+        && $stmbrks_aref->[-1]->[1] == $p_bp_aref->[0]
+        && $stmbrks_aref->[-1]->[2] == $p_bp_aref->[1] ) {
+        # case '(x(x)x((x)(' nested ML, => OPEN_ML with stem-break:
+        # the most recent stem-break ends with the previous BP; it is consumed
+        my ( $sb_open, $sb_left, $sb_right ) = @{ pop( @{$stmbrks_aref} ) };
+        my @nested_ml = (
+            [ $sb_open, -1 ],           # 1st base-pair is in-waiting
+            [ $sb_left, $sb_right ],    # 2nd BP
+            [$idx],                     # first of current BP
+        );
+        push( @{$c_mls_aref}, \@nested_ml );
+        print STDERR "TEST - opened nested ML with SB:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@nested_ml, 1 ) ) if ($i_debug);
+    } elsif ( @$up_aref
+        && $up_aref->[-1]->[0] - 1 == $open_blks_aref->[-1]->[-1]
+        && $up_aref->[-1]->[1] + 1 == $p_bp_aref->[0] ) {
+        # case '(x(x)x(.(x)(' nested ML, => OPEN_ML with UP:
+        # the last region lies exactly between the last open base and the
+        # previous BP; the region is consumed
+        pop( @{$up_aref} );
+        my @nested_ml = (
+            [ $open_blks_aref->[-1]->[-1], -1 ],    # 1st BP awaits closing
+            [ @{$p_bp_aref} ],                      # 2nd BP (copy)
+            [$idx],                                 # current open BP, awaits closing
+        );
+        push( @{$c_mls_aref}, \@nested_ml );
+        print STDERR "TEST - opened nested ML with prev UP\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@nested_ml, 1 ) ) if ($i_debug);
+    } else {
+        # NOTE(review): this only dies when $i_debug is set; without it an
+        # unexpected constellation passes silently - confirm this is intended
+        die "ERROR: in the case extend or open nested ML, " . "but the base-pairs don't fit for either!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ) if ($i_debug);
+    }
+}
+###########################################################################
+# Closes a stem-object for the abstract graph: the stem is stored under its
+# key "a:b,c:d" and both of its base-pairs ("a:b" and "c:d") are mapped to
+# that key. If the previous open block still holds bases, a stem-break is
+# created as well, because this can only mean that there is a stem-break
+# between two consecutive opening brackets.
+#
+# Input:
+# c_stem_aref : current stem array reference (four positions), emptied
+# stems_href : all stems hash reference
+# p_bp_aref : previous base-pair array reference, set to the current BP
+# c_bp_aref : current base-pair array reference, emptied
+# p_open_aref : previously opened open-bracket block, emptied if a
+#               stem-break is created
+# stmbrks_aref : all stem-breaks array reference
+# open_blks_aref : all old open blocks array reference
+# bps_href : all base-pairs hash reference
+#
+# Output:
+# None, it should just modify the input variables accordingly.
+# Dies if the previous open block is not adjacent to the closed base-pair.
+############################################################################
+sub close_stem {
+    my ( $c_stem_aref, $stems_href, $p_bp_aref, $c_bp_aref, $p_open_aref, $stmbrks_aref, $open_blks_aref, $bps_href ) = @_;
+    # store a copy of the stem, the caller's array is recycled below
+    my @stem = @{$c_stem_aref};
+    my ( $first_bp, $second_bp ) = ( "$stem[0]:$stem[1]", "$stem[2]:$stem[3]" );
+    my $stemkey = "$first_bp,$second_bp";
+    $stems_href->{$stemkey} = \@stem;
+    # reset and clean running variables
+    @{$c_stem_aref} = ();
+    @{$p_bp_aref}   = @{$c_bp_aref};
+    @{$c_bp_aref}   = ();
+    # add stem to base-pair hash
+    $bps_href->{$_} = $stemkey foreach ( $first_bp, $second_bp );
+    # there are still bases open in the previous open block, '[[]x',
+    # where [] is a closed stem and [ is an open block,
+    # must create STEM-BREAK
+    # this cannot be the case ((x)), because there p_open is empty
+    if ( @{$p_open_aref} ) {
+        my $last_open = $p_open_aref->[-1];
+        # the next open base has to be adjacent to the previous BP,
+        # anything else should not be possible
+        die("ERROR in close_stem(), [.[]x should not occur here\n") unless ( $last_open == $p_bp_aref->[0] - 1 );
+        push( @{$stmbrks_aref}, [ $last_open, @{$p_bp_aref} ] );
+        # push the previous open block back onto stack
+        push( @{$open_blks_aref}, [ @{$p_open_aref} ] );
+        @{$p_open_aref} = ();
+    }
+}
+###########################################################################
+# Here the information about the structure is added to the graph. First,
+# every base-pair of the dot-bracket string becomes an edge (label s)
+# between the vertices of its two nucleotides. Then, if requested, every
+# two stacked base-pairs get an extra vertex (label P) that is connected
+# to the four involved nucleotide vertices (edge label p).
+# While adding new vertices we keep track of the current graph index.
+#
+# Input:
+# curr_shrep : the current shrep, with the dot-bracket structure at pos 0
+# curr_gi : the current graph index (index of the first window position)
+# win_size : the window size, added to curr_gi before stacking vertices
+#            are numbered
+# stacks : (boolean) whether to add stacking information
+#
+# Output:
+# The structure graph information line-by-line as an array reference and
+# the current graph index.
+############################################################################
+sub getStructGraph {
+    my ( $curr_shrep, $curr_gi, $win_size, $stacks ) = @_;
+    my @brackets = split( "", $curr_shrep->[0] );
+    my @open_pos = ();    # stack of unmatched "(" positions
+    my @bp_edges = ();
+    my @bps      = ();    # base-pairs, ordered by closing position
+    for my $pos ( 0 .. $#brackets ) {
+        if ( $brackets[$pos] eq "(" ) {
+            push( @open_pos, $pos );
+        } elsif ( $brackets[$pos] eq ")" ) {
+            my $left  = $curr_gi + pop(@open_pos);
+            my $right = $curr_gi + $pos;
+            push( @bp_edges, "e $left $right s " );
+            push( @bps, [ $left, $right ] );
+        }
+    }
+    # update current graph index
+    $curr_gi += $win_size;
+    my @stack_lines = ();
+    # add stacking information to graph unless input option tells us not to
+    if ($stacks) {
+        for my $i ( 1 .. $#bps ) {
+            my ( $prev_left, $prev_right ) = @{ $bps[ $i - 1 ] };
+            my ( $curr_left, $curr_right ) = @{ $bps[$i] };
+            # stacked means: the current base-pair directly encloses the
+            # previous one
+            next unless ( $curr_left == $prev_left - 1 && $curr_right == $prev_right + 1 );
+            # stacking vertex P plus four edges to the involved nucleotides
+            push( @stack_lines,
+                "v $curr_gi P",
+                "e $curr_gi $prev_left p",
+                "e $curr_gi $prev_right p",
+                "e $curr_left $curr_gi p",
+                "e $curr_right $curr_gi p" );
+            ++$curr_gi;    # ready for next vertex
+        }
+    }
+    return ( [ @bp_edges, @stack_lines ], $curr_gi );
+}
+###########################################################################
+# This method gives us the overall graph header (t # line) for this
+# sequence. This graph includes all window calculations and all shreps for
+# all windows.
+# Input:
+# seq_id : the sequence ID
+# seq_head : the header information for the sequence (may be undefined)
+# wins_aref : the windows sizes for folding
+# h_shift : input parameter -shift
+# h_e : input parameter -e
+# h_c : input parameter -c
+# h_t : input parameter -t
+# h_u : input parameter -u
+# h_r : input parameter -r
+# h_M : input parameter -M
+# h_crop_unpaired_ends : input parameter -cue
+# h_i : written as -i together with h_sample_len, only if h_q is not set
+# h_sample_len : written as SAMPLE_LEN next to -i
+# h_q : written as -q 1 if set
+# h_Tp : written as -T, only if h_q is set
+# h_seqlen : written as SEQLEN
+# h_noStr : if set, the header ends right after SEQLEN
+#
+# The shift information is only written if the file-level $globalFolding
+# is not set.
+#
+# Output:
+# The header line as a string
+############################################################################
+sub getGraphHeader {
+    my ( $seq_id, $seq_head, $wins_aref, $h_shift, $h_e, $h_c, $h_t, $h_u, $h_r, $h_M, $h_crop_unpaired_ends, $h_i, $h_sample_len, $h_q, $h_Tp, $h_seqlen, $h_noStr ) = @_;
+    # every part brings its own trailing blank, the parts are glued as is
+    my @parts = ("t # SEQID $seq_id ");
+    push( @parts, "$seq_head " ) if ( defined $seq_head );
+    push( @parts, "SEQLEN $h_seqlen " );
+    # short header, no folding parameters
+    return join( "", @parts ) . "\n" if ($h_noStr);
+    push( @parts, "MAXWINSHREPS $h_M " );
+    push( @parts, "CUE 1 " ) if ($h_crop_unpaired_ends);
+    push( @parts, "RNASHAPES -w " . ( join( ",", @{$wins_aref} ) ) . " " );
+    unless ($globalFolding) {
+        # relative shift if given, otherwise a shift of 1
+        push( @parts, $h_shift ? "-SHIFT%w $h_shift " : "-SHIFT 1 " );
+    }
+    push( @parts, "-e $h_e " ) if ( defined $h_e );
+    push( @parts, "-c $h_c " ) if ( defined $h_c );
+    push( @parts, "-t $h_t " ) if ( defined $h_t );
+    push( @parts, "-u 1 " )    if ($h_u);
+    push( @parts, "-r 1 " )    if ($h_r);
+    if ($h_q) {
+        push( @parts, "-q 1 " );
+        push( @parts, "-T $h_Tp" ) if ( defined $h_Tp );
+    } elsif ($h_i) {
+        push( @parts, "-i $h_i SAMPLE_LEN $h_sample_len" );
+    }
+    return join( "", @parts, "\n" );
+}
+###########################################################################
+# This method gives us the window header (w # line) for the subgraph that
+# includes all shreps for the current window. The sequence graph header is
+# reused without its line end and without its first "t # ".
+# Input:
+# graphHead : the header line for the entire sequence graph
+# win_size : the size of the current window (WSIZE)
+# win_start : the starting position of the current window (WSTART)
+# win_end : the end position of the current window (WEND)
+# win_center : the centre position of the current window (WCENT)
+# win_global_fold : (boolean) whether the actual number of nucleotides
+# is smaller than the given window size (end of sequence?) (GLOBALFOLD)
+# win_sample : written as WSAMPLE
+# used_t : written as SHAPE_TYPE
+#
+# Output:
+# The header line as a string
+############################################################################
+sub getWindowHeader {
+    my ( $graphHead, $win_size, $win_start, $win_end, $win_center, $win_global_fold, $win_sample, $used_t ) = @_;
+    chomp $graphHead;
+    $graphHead =~ s/t # //;
+    # all fields are separated by one blank; the final "\n" element also
+    # yields the blank in front of the line end
+    return join( " ",
+        "w # $graphHead",
+        "SHAPE_TYPE $used_t",
+        "GLOBALFOLD $win_global_fold",
+        "WSIZE $win_size",
+        "WSTART $win_start",
+        "WEND $win_end",
+        "WCENT $win_center",
+        "WSAMPLE $win_sample",
+        "\n" );
+}
+###########################################################################
+# This method gives us the shape header (s # line) that includes the
+# single shrep connected components. The window header is reused without
+# its line end and without its first "w # " and "t # ".
+# Input:
+# winHead : the header for the current window
+# shrep : information about the current shrep as array reference; the
+#         first element is skipped, all others are appended
+#
+# Output:
+# The header line as a string
+############################################################################
+sub getShapeHeader {
+    my ( $winHead, $shrep ) = @_;
+    chomp $winHead;
+    $winHead =~ s/w # //;
+    $winHead =~ s/t # //;
+    # drop the first shrep element, keep the remaining information
+    my ( undef, @shrep_info ) = @{$shrep};
+    return "s # $winHead " . join( " ", @shrep_info ) . "\n";
+}
+###########################################################################
+# This method gives us the sequence header (u # line) that includes the
+# unstructured sequence connected components. The window header is reused
+# without its line end and without its first "w # " and "t # ".
+# Input:
+# winHead : the header for the current window
+# shrep : information about the current shrep as array reference; the
+#         first element is skipped, all others are appended
+#
+# Output:
+# The header line as a string
+############################################################################
+sub getSeqHeader {
+    my ( $winHead, $shrep ) = @_;
+    chomp $winHead;
+    # strip the line type markers of the window and of the graph header
+    foreach my $marker ( "w # ", "t # " ) {
+        $winHead =~ s/$marker//;
+    }
+    my @seq_info = @{$shrep};
+    shift(@seq_info);    # first element is not part of the header
+    return sprintf( "u # %s %s\n", $winHead, join( " ", @seq_info ) );
+}
+##################################################################################
+# This method parses a fasta file and is useful if the header ID is not-unique!!
+# It returns the header lines in an array and the sequence lines in the same
+# order. It is then up to the user to extract the parts from the header that is
+# necessary for the script.
+# Furthermore, the method deals with multiple lines, and returns a single sequence
+# without line breaks or any other whitespace.
+# Lines of the form "<text> #<TAG>" that follow a header are not added to the
+# sequence but collected as meta information under the key "#<TAG>"; several
+# lines with the same tag are concatenated.
+# Records with an empty header (a lone ">") and lines in front of the first
+# header are ignored.
+# Input:
+#		file		The name of the fasta file
+# Output:
+#	(1)	An array reference for each header line (without the > symbol)
+#	(2) An array reference for each sequence in same order as the headers
+#	(3) An array reference with one meta hash reference per sequence
+# Dies if the file cannot be opened.
+##################################################################################
+sub read_fasta_with_nonunique_headers_meta {
+    my ($file) = @_;
+    my $FUNCTION  = "read_fasta_with_nonunique_headers_meta";
+    my $header    = "";
+    my $seqstring = "";
+    my @headers   = ();
+    my @sequences = ();
+    my @meta      = ();
+    my %seq_meta  = ();
+    # three-argument open with a lexical handle: the file name is taken
+    # literally and the handle cannot clash with other code
+    open( my $in_fh, "<", $file )
+      or die "ERROR in $FUNCTION:\n" . "Couldn't open the following file: $file ($!)\n";
+    # stores the record collected so far and resets the running variables
+    my $store_record = sub {
+        $seqstring =~ s/\s+//g;    ## do not allow spaces in sequence
+        push( @headers,   $header );
+        push( @sequences, $seqstring );
+        push( @meta,      +{%seq_meta} );    ## anonymous hash reference (copy)
+        $seqstring = "";
+        %seq_meta  = ();
+    };
+    while ( my $line = <$in_fh> ) {
+        chomp($line);
+        # length() instead of truth: a header that is exactly "0" is valid
+        if ( $line =~ /^\>(.*)/ ) {
+            # header (can contain one space after > symbol)
+            $store_record->() if ( length($header) );
+            $header = $1;
+        } elsif ( length($header) && $line =~ /(.+)\s+(#\S+)$/ ) {
+            # meta line, appended to earlier lines with the same tag
+            $seq_meta{$2} .= $1;
+        } elsif ( length($header) ) {
+            $seqstring .= $line;
+        }
+    }
+    close($in_fh);
+    # last record of the file
+    $store_record->() if ( length($header) );
+    return ( \@headers, \@sequences, \@meta );
+}
+1;
+#############################################################################
+# Programming description
+#############################################################################
+#	Substructure graphs for machine learning with Fabrizio
+#	-------------------------------------------------------
+#
+#	(1) Parameters (RNAshapes parameter):
+#		- Window sizes [] (-w)
+#		- window shift size (-W)
+#	- calculate structure probabilities (-r)
+#		- energy range kcal/mol (-e) OR energy relative percentage (%) to MFE (-c)
+#		- shape type 1 to 5 (-t)
+#		- ignore unstable substructures (-u)
+#		- max shreps
+#
+#
+#	(2) For each sequence, generate one graph/file that consists of all windows. The general format for one graph is as follows:
+#
+#	t # seq_id parameters
+#	v graph_index nt_type window_size window_centre abs_seq_pos
+#	...
+#	e m m+1 > 1 (backbone)
+#	...
+#	e base_i_graph_index base_j_graph_index s shrep_e shrep_p ...
+#
+#	For each window (subgraph) we create a subgraph (of the subgraph) for each substructure.
+#	We have a running index (gi) for each subgraph. All vertex and edge indices of the subgraph add
+#	the running graph index to the actual window position. For example
+#
+#
+#	Sequence: AAACC CUUUG GG
+#		  01234 56789 01
+#
+#	Window=10 substructure1 = (((...))). centre 5.5
+#
+#	v 0 A 10 5.5 0
+#	v 1 A 10 5.5 1
+#	v 2 A 10 5.5 2
+#	v 3 C 10 5.5 3
+#	v 4 C 10 5.5 4
+#	v 5 C 10 5.5 5
+#	v 6 U 10 5.5 6
+#	v 7 U 10 5.5 7
+#	v 8 U 10 5.5 8
+#	v 9 G 10 5.5 9
+#	e 0 1 > 1
+#	e 1 2 > 1
+#	e 2 3 > 1
+#	e 3 4 > 1
+#	e 4 5 > 1
+#	e 5 6 > 1
+#	e 6 7 > 1
+#	e 7 8 > 1
+#	e 8 9 > 1
+#	e 0 8 s -15.0 0.1223
+#	e 1 7 s -15.0 0.1223
+#	e 2 6 s -15.0 0.1223
+#
+#	gi = 9+1 = 10
+#
+#	Window = 10 substructure2 = .(((...))) centre 6.5
+#	v 10 A 10 6.5 2
+#	v 11 C 10 6.5 3
+#	v 12 C 10 6.5 4
+#	v 13 C 10 6.5 5
+#	v 14 U 10 6.5 6
+#	v 15 U 10 6.5 7
+#	v 16 U 10 6.5 8
+#	v 17 G 10 6.5 9
+#	v 18 G 10 6.5 10
+#	v 19 G 10 6.5 11
+#	e 10 11 > 1
+#	e 11 12 > 1
+#	e 12 13 > 1
+#	e 13 14 > 1
+#	e 14 15 > 1
+#	e 15 16 > 1
+#	e 16 17 > 1
+#	e 17 18 > 1
+#	e 18 19 > 1
+#	e 11 19 s -17.0 0.156
+#	e 12 18 s -17.0 0.156
+#	e 13 17 s -17.0 0.156
+#
+#	gi = 19+1 = 20
+#
+#
+#
+#	Write one perl script to create graphs for a set of sequences, fasta2shrep_gspan.pl.
+#
+#	INPUT:
+#		-f fasta file with all sequences to compute
+#		parameters as above
+#
+#	OUTPUT:
+#		one file per sequence that contains graph called seq_id.gspan
+#
+#	(1) for each window size call RNAshapes and write to a tmp file
+#	(2) parse result of RNAshapes (catch RNAshapes error - sequence too long?) check for max shreps.
+#	(3) convert RNAshapes result to subgraph -> write to file (readpipe) look at efficiency and errors
+#	(4) repeat (1) to (3) for each sequence
+# ABSTRACT STRUCTURE GRAPH
+# To each shrep graph (s #) add an abstract structure graph with special
+# abstract relations. For example, to add the abstract structure graph to
+# the previous shrep structure with gi numbers from 10 to 19, we first
+# identify the abstract shape, i.e. EL-S-HL and then create nodes (labelled
+# with a given name-space followed by the actual node name and separated by a
+# hash) and edges for this graph as follows:
+# NOTE: at the moment we add the adjacent base-pairs to the loop definitions
+# but these can be removed if necessary in the future
+# v 20 abstruct#EL
+# v 21 abstruct#S
+# v 22 abstruct#HL
+# e 20 21 n
+# e 21 22 n
+# v 23 ^R
+# e 20 23 ^r
+# e 23 10 @r
+# e 23 11 @r
+# e 23 19 @r
+# v 24 ^R
+# e 21 24 ^r
+# e 24 11 @r
+# e 24 12 @r
+# e 24 13 @r
+# e 24 17 @r
+# e 24 18 @r
+# e 24 19 @r
+# v 25 ^R
+# e 22 25 ^r
+# e 25 13 @r
+# e 25 14 @r
+# e 25 15 @r
+# e 25 16 @r
+# e 25 17 @r
+# gi = 25+1 = 26
+# ANNOTATION FILE
+# This is a file that labels a sequence region with a given annotation. In one
+# file we can have annotations within different name-spaces, for example target
+# sites predicted with different tools.
+# File format is a tab-delimited file with the following columns:
+# SEQID (same ID as in the fasta file header)
+# i (left position of region)
+# j (right position of region, if one position i=j)
+# NAMESPACE#LABEL (give each annotation type one name-space and choose a label depending on the task)
+# E.g we have 2 different miRNAs and we predict the target-sites with (1) IntaRNA and (2) TargetSearch,
+# then this could be IntaRNA#miR1, IntaRNA#miR2, TargetSearch#miR1, and TargetSearch#miR2.
+# All labels can be used more than once for one or more sequences.
+# all labels with the same namespace and the same sequence ID are grouped into
+# one abstract graph, according to the order of i.
+# Example
+# SEQID 	i 	j 	NAMESPACE#LABEL
+# s1	10 	20	IntaRNA#miR1
+# s1	54	60	IntaRNA#miR2
+# s1	15	25	TargetSearch#miR1
+# s1	54	60 	TargetSearch#miR2
+# s2 ...
+# We create one connected component per sequence ID per name-space,
+# so that all labels with
+# the same namespace are grouped together (for one sequence) as follows:
+# v 26 IntaRNA#miR1
+# v 27 IntaRNA#miR2
+# e 26 27 n
+# then create R nodes and ^r and @r relations as before (from i to j)
+# If a sequence graph option is given, then these graphs (u #) have to
+# be labelled with this annotation

Mercurial > repos > bgruening > sequence2gspan

comparison fasta2shrep_gspan.pl @ 0:b01beb170290 draft default tip