Mercurial > repos > bgruening > sequence2gspan

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fasta2shrep_gspan.pl	Tue Oct 29 11:10:19 2013 -0400
@@ -0,0 +1,2479 @@
+#!/usr/bin/perl
+#use feature ':5.10';
+use File::Basename;
+use lib dirname($0);    # search skript directory for modules
+use strict 'vars';
+use warnings;
+use Getopt::Long;
+use Pod::Usage;
+#use List::Util qw/ min max /;
+
+#use StructureLibrary::RNAtoolsConfig;
+#use StructureLibrary::Sequence;
+
+use Cwd qw(getcwd abs_path);
+use File::Temp qw(tempdir);
+use File::Copy;
+use POSIX qw(ceil);
+
+use FindBin;
+use lib "$FindBin::Bin";
+#use GraphClust;
+
+#use vars qw(%CONFIG);
+#*CONFIG = \%GraphClust::CONFIG;
+
+=head1 NAME
+
+
+fasta2shrep_gspan.pl -fasta mysequences.fasta -wins "50,100,150,200" -shift 5 -M 8
+
+=head1 SYNOPSIS
+
+Options:
+
+		HELP
+        -help   brief help message
+        -man    full documentation
+
+        COMPULSORY
+        -fasta	<STRING> e.g. "sequence.fasta"
+        		All sequences in fasta format.
+
+       	OPTIONS
+        -wins		[INTEGER] e.g. "50,100,200"
+        		A list of window sizes to use.
+        		If none are given (empty string ''), then the entire sequence is
+        		taken with no windows. Each window > 1 required!
+        -shift		<INTEGER> e.g. 20
+        		The shift of the window, relative to the window size given in
+        		percent. So you give which percent of the window size shall be
+        		used for the shift. Of course the shift is rounded down to the
+        		nearest whole number.
+        		Example 20 % of a window 150 would result in a step size of 30 nt.
+        		It is a relative parameter, as you can give different window sizes.
+        		If you do not give this parameter there is a default shift of 1 nt.
+        -cue		Crop unpaired ends.
+        		If you give this flag, then the unpaired ends of each
+        		single structure are ignored. E.g. the structure
+        		...(((...))).. becomes just (((...)))
+        -stack		Adds stacking information to graphs. This adds an additional
+        		vertex (type P) for each pair of stacked base-pairs and four edges
+        		(type p) from each of the involved bases to the new vertex.
+        -e		<FLOAT> e.g. 5.0
+        		Energy range in kcal/mol (RNAshapes)
+        		Use only one of -e and -c!
+        -c		<INTEGER> e.g. 10
+        		Relative energy range, i.e. percentage (%) of MFE energy (RNAshapes)
+        		Use only one of -e and -c!
+        -t		<INTEGER> [1-5] e.g. 3 OR "3=0,4=100,5=200"
+        		The shape type (RNAshapes). Default is 3.
+        		With the list format, the shape level can be changed for different window length
+        		"4=100" means that shape level 4 is used from length 100nt (window length)
+        		The first given length has to be 0! Not continuous given levels are allowed!
+        -M		<INTEGER> e.g. 10
+        		Max number of shreps that should be taken per window.
+        -u 		Ignore unstable structures (RNAshapes).
+        		This option filters out closed structures with positive free energy.
+        -r		Calculate structure probabilities for shreps (RNAshapes)
+        -i		<INT> e.g. 10
+        		Turn on structure sampling and gives number of sampling iterations.
+        		Default no sampling (i=0)
+        -sample-len	<INT> e.g. 100
+        		Only in sampling mode: Sampling is only used for seqs/windows >= given length,
+        		Default: sample all lengths (0), if -i > 0
+        -q		Turn on shape probabilities for RNAshapes, no sampling mode allowed
+        -Tp		<FLOAT> e.g 0.001
+        		Filter cutoff for shape probabilities, applied before -M filter!
+        -seq-graph-win	add for each window a graph which contains no structure
+        -seq-graph-t	add for each 't #' a graph which contains no structure
+        -seq-graph-alph change the alphabet of unstructured graphs
+        -annotate		<STRING> annotation.tab
+        				A file with annotations to be added as abstract graphs
+        				on the sequence leven (if given) and on the structure
+        				(SHREP) level. The format is has the following TAB-delimited
+        				columns: SEQID, START, END, NAMESPACE#LABEL.
+        				Labels with the same name-space and SEQID form connected
+        				components, which is a sequence of label vertices ordered
+        				by the START position in the sequence.
+        -abstr			Add abstract structure graphs to the single shrep graph
+        				instances.
+        -nostr			Calculate no structures, only add sequence information,
+        				if this is given, then -seq-graph-win AND/OR -seq-graph-t
+        				are required.
+        -match-shape    <SHAPE>
+                all seqs/windows will be constraint folded into that shape via
+                RNAshapes (if structure is given in another way this struct will be kept),
+                if this shape is not possible within given energy range, produce a
+                specific t graph with only one vertex 'X'. By this the instance
+                becomes very unsimilar to all other graphs (for knn)
+        -vp     enable graph computation with viewpoints:
+                svmsgdnspdk will center on those nucleotides that are given
+                via capital letters and ignore those given as lowercase letters
+        -tmp		<STRING> e.g. "/scratch/1/sita/tmp"
+        		A directory for writing temporary files
+        -o		<STRING> e.g. "ProjectX/MySequences/GSPAN/"
+        		Output directory for gspan files containing graphs.
+        -group		<INTEGER> e.g. 5
+                        Combine/group that number of input seqs into 1 gspan file
+                        output name is then '<INT>.group.gspan.bz2'
+        -sge            Use SGE cluster for each sequence separately
+        -sge-logdir     stdout directory for SGE call
+        -sge-errdir     sdterr directory for SGE call
+        -stdout         send graphs to stdout instead of writing to files
+        -ignore-header  don't write fasta id part after first space to gspan
+        -debug          additional debug output
+
+
+        DEFAULT VALUES
+        -wins	""
+        -shift	1 nt
+        -c		10
+        -t		3
+        -M		0 # selects all shreps
+        -tmp    "/var/tmp/fasta2shrep"
+        -o		"CURRENT_DIR/GSPAN/"
+
+        SGE mode
+        -sge-logdir "CURRENT_DIR/GSPAN/SGE_log"
+        -sge-errdir "CURRENT_DIR/GSPAN/SGE_log"
+        -task-id <NUM>
+
+=head1 DESCRIPTION
+
+=cut
+
+###############################################################################
+# end handler for temporary directory
+# adds an error handler that deletes the directory in case of error
+# SIGUSR{1/2} are sent by the sge prior to the uncatchable SIGKILL if the
+# option -notify was set
+###############################################################################
+$SIG{'INT'}  = 'end_handler';
+$SIG{'TERM'} = 'end_handler';
+$SIG{'ABRT'} = 'end_handler';
+$SIG{'USR1'} = 'end_handler';
+$SIG{'USR2'} = 'end_handler';
+
+sub end_handler {
+  print STDERR "signal ", $_[0], " caught, cleaning up temporary files\n";
+
+  # change into home directory. deletion of the temporary directory will
+  # fail if it is the current working directory
+  chdir();
+  File::Temp::cleanup();
+  die();
+}
+
+###############################################################################
+# PARSE COMMAND LINE OPTIONS
+###############################################################################
+
+# command line options
+my ( $i_help, $i_man, $i_debug, $i_fas, $i_wins, $i_shift, $i_crop_unpaired_ends,
+  $i_r, $i_e, $i_c, $i_t, $i_u, $i_M, $i_o, $i_sge, $i_jobid, $i_tmp,
+  $i_ignore_seq_header, $i_stacks, $i_stdout, $i_q, $i_T, $i_i,
+  $i_sample_min_length, $i_sge_logDir, $i_sge_errDir, $i_groupsize, $i_annotate,
+  $i_abstr, $i_no_structure, $i_vp, $i_matchShape, $i_rnashapes_binpath );
+
+my ( $i_add_seq_graph_win, $i_add_seq_graph_t, $i_change_seq_graph_alph );
+
+my $options = GetOptions(
+  "help"            => \$i_help,
+  "man"             => \$i_man,
+  "debug"           => \$i_debug,
+  "fasta=s"         => \$i_fas,
+  "wins=s"          => \$i_wins,
+  "shift=f"         => \$i_shift,
+  "cue"             => \$i_crop_unpaired_ends,
+  "stack"           => \$i_stacks,
+  "r"               => \$i_r,
+  "e=f"             => \$i_e,
+  "c=i"             => \$i_c,
+  "t=s"             => \$i_t,
+  "u"               => \$i_u,
+  "M=i"             => \$i_M,
+  "tmp=s"           => \$i_tmp,
+  "o=s"             => \$i_o,
+  "i=i"             => \$i_i,
+  "sample-len=i"    => \$i_sample_min_length,
+  "q"               => \$i_q,
+  "Tp=f"            => \$i_T,
+  "seq-graph-win"   => \$i_add_seq_graph_win,
+  "seq-graph-t"     => \$i_add_seq_graph_t,
+  "-seq-graph-alph" => \$i_change_seq_graph_alph,
+  "sge"             => \$i_sge,
+  "task-id=i"       => \$i_jobid,
+  "ignore-header"   => \$i_ignore_seq_header,
+  "stdout"          => \$i_stdout,
+  "sge-logdir=s"    => \$i_sge_logDir,
+  "sge-errdir=s"    => \$i_sge_errDir,
+  "group=i"         => \$i_groupsize,
+  "annotate=s"      => \$i_annotate,
+  "abstr"           => \$i_abstr,
+  "nostr"           => \$i_no_structure,
+  "vp"              => \$i_vp,
+  "match-shape=s"   => \$i_matchShape,
+  "rnashapes-bin=s" => \$i_rnashapes_binpath,
+);
+
+pod2usage( -exitstatus => 1, -verbose => 1 ) if $i_help;
+pod2usage( -exitstatus => 0, -verbose => 2 ) if $i_man;
+($options) or pod2usage(2);
+
+# check compulsory options
+($i_fas) or pod2usage("Error: the option -fasta is compulsory!\n");
+( -e $i_fas ) or pod2usage("Error: no such file - $i_fas!\n");
+$i_fas = abs_path($i_fas);
+
+# check other options and set default values
+pod2usage("Error: either -e OR -c can be given, but NOT both!\n") if ( $i_e && $i_c );
+( $i_e or $i_c ) or $i_c = 10;    # set -c 10, if neither -e or -c are given
+($i_M) or $i_M = 0;    # max number of shreps is 0 (=means take all computed)
+($i_i) or $i_i = 0;    # default no sampling else sampling iterations
+($i_sample_min_length) or $i_sample_min_length = 0;
+pod2usage("\nError: use --sample-len <INT> only with -i <1..INT> !\n") if ( not $i_i and $i_sample_min_length );
+($i_T) or $i_T = 0;
+($i_q) or $i_q = 0;
+pod2usage("\nError: Sampling (-i) not possible with shape probabilities (-q)!\n") if ( $i_i and $i_q );
+($i_add_seq_graph_win)     or $i_add_seq_graph_win     = 0;
+($i_add_seq_graph_t)       or $i_add_seq_graph_t       = 0;
+($i_change_seq_graph_alph) or $i_change_seq_graph_alph = 0;
+($i_no_structure)          or $i_no_structure          = 0;
+
+if ($i_change_seq_graph_alph) {
+  ($i_add_seq_graph_t) or ($i_add_seq_graph_win) or pod2usage( "Error: " .
+"When giving the parameter -seq-graph-alph, then either -seq-graph-t or -seq-graph-win" .
+      " must also be given!\n" );
+}
+( -e $i_annotate ) or pod2usage("Error: no such file - $i_annotate!\n")
+  if ($i_annotate);
+
+$i_add_seq_graph_t = 1 if ($i_no_structure);
+
+($i_t) or $i_t = 3;    # default abstraction type is 3
+my $change_shape_level = 0;
+my @level_lens = ( -1, -1, -1, -1, -1 ); ## array_idx-1=shape level, value=start length of this level
+
+if ( $i_t !~ /^\d+$/ ) {
+
+  my @t_minlens = split( ",", $i_t );    ## -t "3=100,4=200"
+
+  foreach my $idx ( 1 .. @t_minlens ) {
+
+    my $level = $t_minlens[ $idx - 1 ];    ##  $level = "3=100"
+
+    die "$level Wrong -t format! Example: -t 3=0,4=100,5=200\n" if ( $level !~ /^\d+\=\d+$/ );
+    my @lev_len = split( "=", $level );    ##  $level = "3=100"
+
+    die "Wrong -t format! First level given needs to be length 0! Example: -t 3=0,4=100,5=200\n" if ( $idx == 1 && $lev_len[1] != 0 );
+    die "Wrong -t format! Only level 1-5 allowed! Example: -t 3=0,4=100,5=200\n" if ( $lev_len[0] < 1 or $lev_len[0] > 5 );
+    die "Wrong -t format! Length >= 0 expected! Example: -t 3=0,4=100,5=200\n" if ( $lev_len[1] < 0 );
+    $change_shape_level = 1;
+    $level_lens[ $lev_len[0] - 1 ] = $lev_len[1];
+  }
+
+  ($i_debug) and print STDERR "i_t = $i_t - change $change_shape_level -  shape level lengths#" . join( ":", @level_lens ) . "#" . join( ":", @t_minlens ) . "#\n";
+}
+
+## checks for match-shape
+if ($i_matchShape){
+  die "Please provide correct match shape string like '[]'! Exit...\n\n"
+    if ($i_matchShape !~ /^[\[\]_]+$/);
+  ## RNAshapes prodices anyway only 1 structure, no suboptimal structs in match-shape folding
+  $i_M = 1;
+}
+
+my $CURRDIR = getcwd;
+
+# set up tmp directory
+# default tmp is /var/tmp, usually not NFS mounted!
+( defined $i_tmp ) or $i_tmp = '/var/tmp/';
+my $tmp_template = 'fasta2shrep-XXXXXX';
+
+# CLEANUP => 1 : automatically delete at exit
+$i_tmp = tempdir( $tmp_template, DIR => $i_tmp, CLEANUP => 1 );
+
+# create GSPAN directory when not printing to stdout
+if ( not $i_stdout ) {
+  if ($i_o) {
+    ( -e $i_o ) or system("mkdir -p $i_o");
+  } else {
+    system("mkdir -p GSPAN");
+    $i_o = $CURRDIR . "/GSPAN/";
+  }
+}
+
+###############################################################################
+# GLOBAL VARIABLES
+###############################################################################
+$i_rnashapes_binpath or $i_rnashapes_binpath = "RNAshapes";
+die "Please provide full RNAshapes path (e.g. /usr/bin/RNAshapes) !\n" if (!-f "$i_rnashapes_binpath");
+my $rnashapes_loc = $i_rnashapes_binpath;
+if ( !$rnashapes_loc || !-e $rnashapes_loc ) {
+  my $loc = `which RNAshapes`;
+  chomp($loc);
+  die "\nCannot find RNAshapes binary! Exit...\n\n" if ( !$loc );
+  die "\nCannot find RNAshapes binary! Exit...\n\n" if ( !-e $loc );
+  $rnashapes_loc = $loc;
+}
+
+my @WINDOWS = ();
+@WINDOWS = split( ",", $i_wins ) if ($i_wins);
+my $globalFolding;
+$globalFolding = 1 unless @WINDOWS;
+my $CURRUSER = getlogin;
+my $SEQNO    = 0;          # used to id the sequences
+my $GSPANNO  = 0;          # used for gspan filenames
+# minimum length of sequence that will work with RNAshapes
+# sequences with one or two nucleotides will be restricted to sequence-only graphs
+# this ensures that no sequences are skipped and external info kept synchronized
+my $GSPAN_SEQ_MINLEN = 3;
+
+# name spaces
+my $ABSTRUCT = "AS";
+###############################################################################
+# EXECUTION CODE
+###############################################################################
+
+# read fasta file into hash
+my ( $headers_aref, $sequences_aref, $metainfo_aref ) = read_fasta_with_nonunique_headers_meta($i_fas);
+
+# call script again on the sge cluster in a batch job
+if ($i_sge) {
+  my $SCRIPTNAME    = $0;
+  my $CLUSTERSUBMIT = $FindBin::Bin."/fasta2shrep_gspan.sge";
+
+  die "Cannot find SGE submit script $CLUSTERSUBMIT! Exit...\n\n" if ( !-e $CLUSTERSUBMIT );
+
+  my $SGE_jobs = @{$sequences_aref};
+  $SGE_jobs = ceil( @{$sequences_aref} / $i_groupsize ) if ($i_groupsize);
+
+  my $params = "-fasta $i_fas ";
+  $params .= "-wins $i_wins "                    if ($i_wins);
+  $params .= "-shift $i_shift "                  if ($i_shift);
+  $params .= "-e $i_e "                          if ($i_e);
+  $params .= "-c $i_c "                          if ($i_c);
+  $params .= "-t $i_t "                          if ($i_t);
+  $params .= "-u "                               if ($i_u);
+  $params .= "-r "                               if ($i_r);
+  $params .= "-M $i_M "                          if ($i_M);
+  $params .= "-o $i_o "                          if ($i_o);
+  $params .= "-i $i_i "                          if ($i_i);
+  $params .= "--sample-len $i_sample_min_length" if ($i_sample_min_length);
+  $params .= "-q "                               if ($i_q);
+  $params .= "-Tp $i_T "                         if ($i_T);
+  $params .= "--seq-grap-win "                   if ($i_add_seq_graph_win);
+  $params .= "--seq-graph-t "                    if ($i_add_seq_graph_t);
+  $params .= "--seq-graph-alph "                 if ($i_change_seq_graph_alph);
+  $params .= "-ignore-header "                   if ($i_ignore_seq_header);
+  $params .= "-cue "                             if ($i_crop_unpaired_ends);
+  $params .= "-stack "                           if ($i_stacks);
+  $params .= "--group $i_groupsize "             if ($i_groupsize);
+  $params .= "-annotate $i_annotate"             if ($i_annotate);
+  $params .= "-abstr"                            if ($i_abstr);
+  $params .= "-nostr "                           if ($i_no_structure);
+  $params .= "--debug "                          if ($i_debug);
+  $params .= "-vp "                              if ($i_vp);
+  $params .= "--match-shape $i_matchShape "      if ($i_matchShape);
+
+  print "used script:" . $SCRIPTNAME . "\n";
+  print "used submit script:" . $CLUSTERSUBMIT . "\n";
+
+  $i_sge_logDir = "$i_o/SGE_log" if ( !$i_sge_logDir );
+  mkdir($i_sge_logDir);
+
+  $i_sge_errDir = $i_sge_logDir if ( !$i_sge_errDir );
+  mkdir($i_sge_errDir);
+
+  my $ssh = 1;    ## can be used for debug shell script call
+  if ($ssh) {
+    system( "ssh $CURRUSER\@biui.informatik.uni-freiburg.de "
+        . "'export SGE_ROOT=/opt/sge-6.0/; cd $CURRDIR; "
+        . "/opt/sge-6.0/bin/lx24-amd64/qsub -t 1-$SGE_jobs -o $i_sge_errDir/ -e $i_sge_errDir/ "
+        . "$CLUSTERSUBMIT $CURRDIR $SCRIPTNAME \"$params\" ' " );
+  } else {
+    system("$CLUSTERSUBMIT $CURRDIR $SCRIPTNAME '$params' ");
+  }
+  exit;
+}
+
+## compute shreps and gspan, either local or after SGE submission
+
+# TODO read and process annotations
+
+my @used_seq_headers;
+my @used_seqs;
+my @used_meta;
+my $group_idx;
+
+if ($i_jobid) {
+  ## just process the one sequence as given by the jobid number
+  my $used_grouping = 1;   ## if no group is given, make 1 seq per job = group=1
+  $used_grouping = $i_groupsize if ($i_groupsize);
+  my $st  = ( $i_jobid - 1 ) * $used_grouping + 1;
+  my $end = $st + $used_grouping - 1;
+  $end = @{$sequences_aref} if ( $end > @{$sequences_aref} );
+
+  foreach my $idx ( $st .. $end ) {
+    push( @used_seq_headers, $headers_aref->[ $idx - 1 ] );
+    push( @used_seqs,        $sequences_aref->[ $idx - 1 ] );
+    push( @used_meta,        $metainfo_aref->[ $idx - 1 ] );
+  }
+
+  $group_idx = $i_jobid;
+
+  #($i_debug) and print STDERR "st $st end $end gr $group_idx job $i_jobid gs $i_groupsize\n";
+
+  $GSPANNO = $i_jobid-1 if (!$i_groupsize);
+
+} else {
+  ## process all sequences at once
+  @used_seq_headers = @{$headers_aref};
+  @used_seqs        = @{$sequences_aref};
+  @used_meta        = @{$metainfo_aref};
+}
+
+my $out;
+my $gspanfile;
+my $out_no_match_shape;
+
+if ($i_matchShape && !$i_stdout && !$i_groupsize){
+  open($out_no_match_shape,">$i_o/fasta2shrep.no_match");
+}
+
+# for each sequence in the fasta file
+while ( my $seq = shift @used_seqs ) {
+  my $tmp_header = shift @used_seq_headers;
+  my $tmp_meta   = shift @used_meta;
+
+  my ( $seq_id, $seq_header ) = ( $tmp_header =~ /(\S+)\s*([\S*\s*]*)/ );
+  $i_ignore_seq_header and $seq_header = '';
+
+  my $seq_fasta = generate_single_fasta_from_sequence_X( $seq_id, $seq );
+  my $seq_len = length($seq);
+
+  # only print sequence graphs for sequences below this threshold
+  my $no_structure_override = ($seq_len < $GSPAN_SEQ_MINLEN) ? 1 : 0;
+
+  $GSPANNO++;
+
+  # set outstream for gspan output to correct file/STDOUT
+  if ($i_stdout) {
+    $out = \*STDOUT;
+  } elsif ( !$i_groupsize ) {
+    $gspanfile = $i_tmp . '/' . $GSPANNO . '.gspan';
+    open( $out, "| bzip2 -f > $gspanfile.bz2" );
+  } elsif ( ( $GSPANNO - 1 ) % $i_groupsize == 0 ) {
+
+    if ( $GSPANNO > 1 ) {
+      close($out);
+      system("mv $gspanfile.bz2 $i_o/$group_idx.group.gspan.bz2");
+      if ($out_no_match_shape){
+        close($out_no_match_shape);
+        system("mv $gspanfile.no_match $i_o/$group_idx.group.gspan.no_match");
+      }
+    }
+
+    if ( !$i_jobid ) {
+      $group_idx = int( ( $GSPANNO - 1 ) / $i_groupsize ) + 1;
+    }
+
+    $gspanfile = "$i_tmp/$group_idx.group.gspan";
+    open( $out, "| bzip2 -f > $gspanfile.bz2" );
+    open( $out_no_match_shape, ">$gspanfile.no_match" ) if ($out_no_match_shape);
+  }
+
+  ## do not use folding windows in special cases
+  if ( $globalFolding || $i_no_structure || $no_structure_override) {
+    @WINDOWS = ();
+    push( @WINDOWS, $seq_len );
+  }
+
+  ##check win sizes for global folding
+  my @WINDOWS_used = sort { $a <=> $b } @WINDOWS;
+  foreach my $w_idx ( 0 .. ( @WINDOWS_used - 1 ) ) {
+    if ( $WINDOWS_used[$w_idx] >= $seq_len && $WINDOWS_used[$w_idx] > 1 ) {
+      if ( $w_idx < $#WINDOWS_used ) {
+        @WINDOWS_used = @WINDOWS_used[ 0 .. $w_idx ];
+        last;
+      }
+    }
+  }
+
+  ## use seq graph only if no shape folding wanted
+  $i_add_seq_graph_t = 1 if ($i_no_structure || $no_structure_override);
+
+  ## no shape info in graphheader if we have fixed structure (according LocaRNA handling)
+  ## use tags #FS or #S for provided structure
+  my $graph_header;
+  my @struct_meta = grep { $_ =~ /#FS/ || $_ =~ /#S/ } keys %{$tmp_meta};
+
+  if (@struct_meta) {
+    $graph_header = getGraphHeader( $seq_id, $seq_header, \@WINDOWS, $i_shift, $i_e, $i_c, $i_t, $i_u, $i_r, $i_M, $i_crop_unpaired_ends, $i_i, $i_sample_min_length, $i_q, $i_T, $seq_len, 1 );
+  } else {
+    $graph_header = getGraphHeader( $seq_id, $seq_header, \@WINDOWS, $i_shift, $i_e, $i_c, $i_t, $i_u, $i_r, $i_M, $i_crop_unpaired_ends, $i_i, $i_sample_min_length, $i_q, $i_T, $seq_len, $i_no_structure || $no_structure_override );
+  }
+
+  print $out $graph_header;
+
+  my $gi = 0;    # current graph index
+
+  ## add graph with no structure at all depending on $i_add_seq_graph_t
+  if ($i_add_seq_graph_t) {
+
+    ($gi) = convertSeqWindow( $seq, $seq_len, 1, $gi, $graph_header, $out, $i_annotate, $seq );
+
+  }
+
+  ## encode fixed structure only if provided and structures wanted in general
+  if ( @struct_meta && !$i_no_structure && !$no_structure_override) {
+
+    my $struct_meta = "#FS";
+    $struct_meta = "#S" if ( !exists $tmp_meta->{$struct_meta} );
+
+    my $seq_shrep = [ $tmp_meta->{$struct_meta}, "ENERGY", "0.00", "SHAPE", $struct_meta ];
+    $gi = convertShapeWindow( [$seq_shrep], $seq, $seq_len, 1, $gi, $out,
+      $graph_header, $i_annotate, $i_abstr, $i_crop_unpaired_ends, $i_stacks, $seq );
+    @WINDOWS_used = ();    ## no shape folding if we have a fixed structure
+
+  }
+
+  ## ignore RNAshapes folding if wanted, but do correct file move afterwards
+  ## (just "next" does not work due to output)
+  @WINDOWS_used = () if ($i_no_structure or $no_structure_override);
+
+  #for each window size in list
+  foreach my $win_size (@WINDOWS_used) {
+
+    # calculate shift size from percentage
+    my $curr_shift = 1;
+    if ($i_shift) {
+      $curr_shift = ( $i_shift / 100 ) * $win_size;
+      $curr_shift = int($curr_shift);                 #round down
+      $curr_shift = 1 unless ($curr_shift);           # just in case it is 0
+    }
+    ($i_debug) and print STDERR "winsize: $win_size curr_shift: $curr_shift\n";
+    ($i_debug) and print STDERR "\nNext: $seq_id\t winsize:$win_size \n";
+
+    # choose current shape level, depending on $i_t
+    my $curr_t = 0;
+    if ($change_shape_level) {
+      for ( my $i = 0 ; $i < @level_lens ; $i++ ) {
+        $curr_t = $i + 1 if ( $level_lens[$i] != -1 && ( $level_lens[$i] <= $win_size ) );
+      }
+      ($i_debug) and print STDERR "$win_size curr type $curr_t\n";
+    } else {
+      $curr_t = $i_t;
+    }
+
+    my $rnashapesoutput_fh;
+
+    # call RNAshapes and write to $rnashapesoutput_fh
+    $rnashapesoutput_fh = call_RNAshapes( $seq_fasta, $rnashapes_loc, $win_size,
+      $curr_shift, $i_e, $i_c, $curr_t, $i_u, $i_r, $i_q, $i_T, $i_i, $i_sample_min_length, $seq_len, $i_matchShape );
+
+    # read RNAshapes output from $rnashapesoutput_fh and write subgraph
+    # to gspan file
+    my $gi_old = $gi;
+    $gi = convert_RNAshapes_output( $rnashapesoutput_fh, $gi, $i_M, $out, $graph_header,
+      $win_size, $seq_len, $curr_t, $i_annotate, $i_abstr, $i_crop_unpaired_ends, $i_stacks, $seq );
+
+    ## no (match) shape found at all for this seq
+    if ($gi == $gi_old+1){
+      $gi = convertSeqWindow( "X", 1, 1, $gi, $graph_header, $out, $i_annotate, $seq );
+      print $out_no_match_shape $seq_id."\n" if (!$i_stdout && $out_no_match_shape);
+    }
+  }    ## foreach WINDOW_used
+
+  if ( !$i_stdout && !$i_groupsize ) {
+    close($out);
+    move "$gspanfile.bz2", "$i_o/$GSPANNO.gspan.bz2";
+  }
+
+  system("rm $seq_fasta");
+
+}    ## while @used_seqs
+
+if ($i_groupsize) {
+  close($out);
+  system("mv $gspanfile.bz2 $i_o/$group_idx.group.gspan.bz2");
+  close($out_no_match_shape) if ($out_no_match_shape);
+  move "$gspanfile.no_match", "$i_o/$group_idx.group.gspan.no_match" if ($out_no_match_shape);
+} elsif ( $out_no_match_shape && !$i_stdout && !$i_groupsize ) {
+  close($out_no_match_shape);
+}
+
+###############################################################################
+# METHODS
+###############################################################################
+
+############################################################################
+# Generates fasta file for a single sequence (one-lined-fasta). This
+# fasta is stored in the temp directory and should be deleted at the end.
+# Input:
+# seq_id : the sequence ID
+# seq : the sequence
+#
+# Output:
+# The fasta file name
+############################################################################
+sub generate_single_fasta_from_sequence_X {
+  my ( $seq_id, $seq ) = @_;
+
+  $seq = uc($seq);
+  $seq =~ tr/T/U/;
+  $seq =~ s/[^AUCGN]/N/g;
+
+  my $outfas = $i_tmp . "/seq_" . $SEQNO++ . ".fasta";
+  my $host   = readpipe("hostname");
+  open( FAS, ">$outfas" ) or die "$host Cannot open file $outfas! Exit...\n\n";
+  print FAS ">$seq_id\n$seq";
+  close(FAS);
+
+  return $outfas;
+}
+
+############################################################################
+# RNAshapes is called with the given input or default parameters.
+# Input:
+# seq_fasta : the sequence fasta file
+# rnashapes_location : the location of the installation files for RNAshapes
+# win_size : the current window size
+# shift : the input parameter -shift
+# e : the input parameter -e
+# c : the input parameter -c
+# t : the input parameter -t
+# u : the input parameter -u
+# r : the input parameter -r
+#
+# Output: none
+############################################################################
+sub call_RNAshapes {
+  my ( $seq_fasta, $rnashapes_location, $win_size, $shift, $e, $c, $t, $u, $r, $q, $T, $i, $sample_length, $seqLen, $matchShape ) = @_;
+  my $FUNCTION = "call_RNAshapes in fasta2shrep_gspan.pl";
+
+  ($seq_fasta) or die("INPUT ERROR in $FUNCTION: the fasta file is compulsory!\n");
+  ($rnashapes_location) or die( "INPUT ERROR in $FUNCTION: the RNAshapes location" . " is compulsory!\n" );
+  die "$rnashapes_location does not exist! Exit...\n\n" if ( !-e $rnashapes_location );
+  my $call = $rnashapes_location . " -o 1 ";    # the output format is of type 1
+  $call .= "-q " if ($q);
+  $call .= "-T $T " if ( $q and $T );
+  $call .= "-w $win_size ";
+  $call .= "-W $shift " if ($i_shift);
+  die("ERROR in $FUNCTION: Give only one of the options -c or -e (RNAshapes)!\n")
+
+    if ( $e && $c );
+  $call .= "-e $e " if ($e);
+  $call .= "-c $c " if ($c);
+  $call .= "-t $t " if ($t);
+  $call .= "-u "    if ( $u and not $i );       ## not possible in sampling mode
+  $call .= "-r "    if ($r);
+  $call .= "-m $matchShape " if ($matchShape);
+
+  ## check is a bit long but : we want to sample if the window is larger than $sample_length and full seq is larger than window or sample_len
+  ## necessary to do sampling a large window is given, sample_length is shorter than window, but seq is longer than sample_len
+  $call .= "-i $i -A " if ( not $q and $i and ( $win_size >= $sample_length and ( $win_size <= $seqLen or $seqLen >= $sample_length ) ) ); ## -A is to omit samples and print only combined shape probs
+  $call .= " < $seq_fasta";
+
+  ($i_debug) and print STDERR "$seqLen $sample_length $win_size $call\n";
+
+  open my $rnashapesoutput, "$call |" or die( "ERROR in $FUNCTION: The following call " . "could not be carried out!\n$call\n" );
+
+  return $rnashapesoutput;
+}
+
+############################################################################
+# The output of RNAshapes for one sequence and one window size is read
+# and converted into graph format.
+# Input:
+# rnashapeoutput : filehandle for RNAshapes output in format -o 1
+# curr_gi : current graph index (for vertices)
+# maxShreps : max number of shreps to convert to graphs
+# graph_file_hdl : the output handler for the graph file
+# graphHead : the header line for the complete graph (for sequence)
+# winSize : current window size in input for RNAshapes
+# seqLen  : full input seq length
+# used_t  :
+# annotate:
+# abstr   :
+# cue     :
+# stacks  :
+# orig_seq : the nucleotide sequence as read from fasta
+#
+# Output:
+# The current graph index
+############################################################################
+sub convert_RNAshapes_output {
+  my ( $rnashapesoutput, $curr_gi, $maxShreps, $graph_file_hdl, $graphHead, $winSize, $seqLen, $used_t, $annotate, $abstr, $cue, $stacks, $orig_seq ) = @_;
+
+  ## omit first line in output, contains fasta header line
+  my $line       = <$rnashapesoutput>;
+  my $win_shreps = [];
+  my $win_start;
+  my $win_end;
+  my $win_seq;
+  my $win_shrep_count = 0;
+  my $win_sample      = 0;
+  my $winHead;
+  my $win_globalFolding;
+  my $win_size_real;
+
+  # reading RNAshapes output
+  while ( $line = <$rnashapesoutput> ) {
+
+    if ( $line =~ /^(\d+)\s+(\d+)$/ ) {
+      ## line: "<start>    <end>"
+
+      if ( @{$win_shreps} > 0 ) {
+
+        print $graph_file_hdl $winHead;
+
+        ## remove SHAPE="_" shrep if it exists in $win_shreps
+        ## do this only when have a sequence graph already
+        if ($i_add_seq_graph_t) {
+          my @new_win_shreps = ();
+          map { push( @new_win_shreps, $_ ) if ( $_->[ $#{$_} ] ne "_" ) } @{$win_shreps};
+          $win_shreps = \@new_win_shreps;
+        }
+
+        ## add graph with no structure depending on $i_add_seq_graph_win to win
+        if ($i_add_seq_graph_win ) {
+          ( $curr_gi ) = convertSeqWindow( $win_seq, $win_size_real,
+            $win_start, $curr_gi, $winHead, $graph_file_hdl, $annotate, $orig_seq );
+        }
+
+        $curr_gi = convertShapeWindow( $win_shreps, $win_seq, $win_size_real,
+          $win_start, $curr_gi, $graph_file_hdl, $winHead, $annotate, $abstr,
+          $cue, $stacks, $orig_seq );
+      }
+
+      ## set new window params
+      $win_shreps      = [];
+      $win_start       = $1;
+      $win_end         = $2;
+      $win_shrep_count = 0;
+      $win_size_real   = $win_end - $win_start + 1;
+
+      if ( ($win_size_real) >= $seqLen ) {
+        $win_globalFolding = 1;
+      } else {
+        $win_globalFolding = 0;
+      }
+      my $win_center = $win_start + ( ( $win_size_real + 1 ) / 2 );
+
+      $winHead = getWindowHeader( $graphHead, $winSize, $win_start, $win_end, $win_center, $win_globalFolding, $win_sample, $used_t );
+
+    } elsif ( $line =~ /^(\S+)$/ ) {
+      ## line: "CUUAUGAGUAAGGAAAAUAACGAUUCGGGGUGACGCCCGAAUCCUCACUG"
+      $win_seq = uc($1); ## to be 101% sure that we have by default uppercase chars
+
+    } elsif ( $line =~ /^Results for (\d+) iterations:$/ ) {
+      ## line: "Results for 10 iterations:"
+      $win_sample = $1;
+
+    } elsif ( $line =~ /^Shape\s+\S+\s+not found within energy range.*$/ ) {
+      ## line: "Shape [] not found within energy range (-24.75 to -27.50). Try -c or -e to increase range."
+      $win_shreps      = [];
+
+    } elsif ( $line =~ /^([\(\)\.]+)\s+\((\S+)\)\s+(\S+)$/ ) {
+      ## line:"...((((..(((....)))))))...........(((((......)))))  (-10.10)  [[]][]"
+      ## take only $maxShreps shreps per window if set
+      next if ( $maxShreps && $win_shrep_count >= $maxShreps );
+      push( @{$win_shreps}, [ $1, "ENERGY", $2, "SHAPE", $3 ] );
+      $win_shrep_count++;
+
+    } elsif ( $line =~ /^([\(\)\.]+)\s+\((\S+)\)\s+\((\S+)\)\s+(\S+)$/ ) {
+      ## line:"((((..((((...((.((.((.....)).)).))...))))..))))...  (-10.60)  (0.7795360)  [[[[[]]]]]"
+      ## take only $maxShreps shreps per window if set
+      next if ( $maxShreps && $win_shrep_count >= $maxShreps );
+      push( @{$win_shreps}, [ $1, "ENERGY", $2, "PROB", $3, "SHAPE", $4 ] );
+      $win_shrep_count++;
+
+    } elsif ( $line =~ /^([\(\)\.]+)\s+\((\S+)\)\s+(\S+)\s+(\S+)$/ ) {
+      ## line:"((((..((((...((.((.((.....)).)).))...))))..))))...  (-10.60) 0.3000000 [[[[[]]]]]"
+      ## take only $maxShreps shreps per window if set
+      next if ( $maxShreps && $win_shrep_count >= $maxShreps );
+      push( @{$win_shreps}, [ $1, "ENERGY", $2, "SHAPEPROB", $3, "SHAPE", $4 ] );
+      $win_shrep_count++;
+
+    } elsif ( $line =~ /^([\(\)\.]+)\s+\((\S+)\)\s+\((\S+)\)\s+(\S+)\s+(\S+)$/ ) {
+      ## line:"((((..((((...((.((.((.....)).)).))...))))..))))...  (-10.60)  (0.7795360) 0.3000000 [[[[[]]]]]"
+      ## take only $maxShreps shreps per window if set
+      next if ( $maxShreps && $win_shrep_count >= $maxShreps );
+      push( @{$win_shreps}, [ $1, "ENERGY", $2, "PROB", $3, "SHAPEPROB", $4, "SHAPE", $5 ] );
+      $win_shrep_count++;
+
+    } else {
+      next if ( $line =~ /^$/ );
+      die "Unexpected shape output format!\nline=$line\n\nExit...\n\n";
+    }
+  }
+
+  ## convert last windows
+  if ( @{$win_shreps} > 0 ) {
+
+    print $graph_file_hdl $winHead;
+
+    ## remove SHAPE="_" shrep if it exists in $win_shreps
+    ## do this only when have a sequence graph already
+    if ($i_add_seq_graph_t) {
+      my @new_win_shreps = ();
+      map { push( @new_win_shreps, $_ ) if ( $_->[ $#{$_} ] ne "_" ) } @{$win_shreps};
+      $win_shreps = \@new_win_shreps;
+    }
+
+    ## add graph with no structure depending on $i_add_seq_graph_win to win
+    if ($i_add_seq_graph_win) {
+      ($curr_gi) = convertSeqWindow( $win_seq, $win_size_real, $win_start,
+        $curr_gi, $winHead, $graph_file_hdl, $annotate, $orig_seq );
+    }
+
+    $curr_gi = convertShapeWindow( $win_shreps, $win_seq, $win_size_real,
+      $win_start, $curr_gi, $graph_file_hdl, $winHead, $annotate, $abstr, $cue,
+      $stacks, $orig_seq );
+  }
+
+  close($rnashapesoutput);
+
+  return $curr_gi + 1;    # return the gi (graph index) for the next subgraph
+}
+
+## Sub to create graph for a complete unstructured sequence
+# TODO: document function
+sub convertSeqWindow {
+  my ( $win_seq, $win_size_real, $win_start, $curr_gi, $winHead, $graph_file_hdl, $annotate, $orig_seq ) = @_;
+
+  my $seq_shrep;
+
+  # use a different alphabet (lowercase) if option seq-graph-alph set
+  my $seq_graph_sequence = $i_change_seq_graph_alph ? lc($win_seq) : uc($win_seq);
+  $seq_shrep = [ "." x $win_size_real, "ENERGY", "0.00", "SHAPE", "_", "STRUCT", "." x $win_size_real, "SEQ", $seq_graph_sequence ];
+  my $backboneGraph_ref = getBackboneGraph( $seq_graph_sequence, $curr_gi, $win_start, 0, ( $win_size_real - 1 ), $orig_seq );
+
+  print $graph_file_hdl getSeqHeader( $winHead, $seq_shrep );
+  print $graph_file_hdl join( "\n", @{$backboneGraph_ref} ) . "\n";
+
+  $curr_gi += $win_size_real;
+
+  if ($annotate) {
+    ## TODO add annotation graphs
+  }
+
+  return ($curr_gi);
+}
+
+############################################################################
+# The results for one window are converted in this method to GSPAN graphs.
+# Input:
+# win_shreps_aref : the array ref of shreps (dot-bracket structures) for the
+# 					current window
+# win_seq : the nucleotide sequence for the current window
+# win_size_real : the current (true) window size
+# win_start : the starting position of win_seq in the original seq (1-n)
+# curr_gi : the current graph index
+# graph_file_hdl : the graph file handler
+# winHead   : the header line for the current sequence window
+# TODO annotate
+# abstr : input parameter -abstr
+# cue   : input parameter -cue
+# stacks : input parameter -stacks
+# orig_seq : the nucleotide sequence as read from fasta
+#
+# Output:
+# The current graph index
+############################################################################
+sub convertShapeWindow {
+  my ( $win_shreps_aref, $win_seq, $win_size_real, $win_start, $curr_gi,
+    $graph_file_hdl, $winHead, $annotate, $abstr, $cue, $stacks, $orig_seq ) = @_;
+
+  ## generate for each shrep a connected component in gspan file
+  foreach my $shrep ( @{$win_shreps_aref} ) {
+
+    # get the current gi as it was at the beginning of this shrep
+    my $shrep_gi = $curr_gi;
+
+    # cut off unpaired ends, if option is given
+    my $shrep_struct     = $shrep->[0];
+    my $crop_index_left  = 0;
+    my $crop_index_right = length($shrep_struct) - 1;
+
+    # find croping indices with numbering from 0 to n-1
+    if ($cue) {
+      $crop_index_left = index( $shrep_struct, "(" );    # find 1st occ of "("
+      $crop_index_right = rindex( $shrep_struct, ")" );  # find last occ of ")"
+
+      # if the complete window is unpaired, then don't crop
+      if ( $crop_index_left == -1 ) {
+        $crop_index_left  = 0;
+        $crop_index_right = length($shrep_struct) - 1;
+      }
+    }
+
+    # create structure graph
+    my $backboneGraph_ref = getBackboneGraph( $win_seq, $curr_gi, $win_start, $crop_index_left, $crop_index_right, $orig_seq );
+    my $structGraph_ref;
+
+    # add both basic edges, stacks and abstract graph
+    if ($abstr) {
+      ( $structGraph_ref, $curr_gi ) = getStructPlusAbstractStructGraph( $shrep, $curr_gi, $win_size_real, $cue, $stacks );
+
+      # just add basic structure graph edges, and stacks
+    } else {
+      ( $structGraph_ref, $curr_gi ) = getStructGraph( $shrep, $curr_gi, $win_size_real, $stacks );
+    }
+
+    # additional information for shrep header: sequence and dot-bracket
+    my $crop_length = $crop_index_right - $crop_index_left + 1;
+    my $shrepheader_struct = substr( $shrep_struct, $crop_index_left, $crop_length );
+    my $shrepheader_seq = substr( $win_seq, $crop_index_left, $crop_length );
+    push( @{$shrep}, 'STRUCT', $shrepheader_struct, 'SEQ', $shrepheader_seq );
+
+    # print structure graph to file
+    print $graph_file_hdl getShapeHeader( $winHead, $shrep );
+    print $graph_file_hdl join( "\n", @{$backboneGraph_ref} ) . "\n";
+
+    # don't print empty array; sgdnspdk breaks with empty lines
+    if ( @{$structGraph_ref} > 0 ) {
+      print $graph_file_hdl join( "\n", @{$structGraph_ref} ) . "\n";
+
+      if ($annotate) {
+
+        # TODO add annotations to shrep structure
+        # $win_start, $win_end (1-n)
+      }
+
+    }
+  }
+
+  return $curr_gi;
+}
+
+###########################################################################
+# Here the vertices for the nucleotides are generated and also the backbone
+# edges, which connect the nucleotides in their correct order (i.e order
+# of the given sequence)
+# Input:
+# win_seq : the sequence for the current window
+# win_start : the starting position for the current window
+# curr_crop_i_left : the cropping index for the left end of sequence (-cue)
+# curr_crop_i_right : the cropping index for the right end of sequence (-cue)
+# orig_seq : the nucleotide sequence as read from fasta
+#
+# Output:
+# The lines of the graph in an array reference to be printed to a file with
+# one element per line.
+############################################################################
+sub getBackboneGraph {
+  my ( $win_seq, $curr_gi, $win_start, $curr_crop_i_left, $curr_crop_i_right, $orig_seq ) = @_;
+
+  # RNAshapes substitutes T -> U, sequence only does not
+  # thus, just transform all t/T -> u/U to remain consistent
+  $orig_seq =~ tr /tT/uU/;
+  $win_seq  =~ tr /tT/uU/;
+
+  my @seq;
+  @seq = split( "", $win_seq );
+
+  # if the vp option is set we need to obtain the original capitalization from
+  # $orig_seq and extract the windowed sequence manually
+  my @seq_vp;
+  if ( defined $i_vp ) {
+    my $win_len = length($win_seq);
+    my $capitalized_win_seq = substr( $orig_seq, $win_start - 1, $win_len );
+    ( uc($capitalized_win_seq) eq uc($win_seq) ) or
+      die( "error: windowed sequence generated due to vp option not equal " .
+        "to sequence reported by RNASHAPES.\n" .
+        "'${capitalized_win_seq}' != '${win_seq}'" );
+    @seq_vp = split( "", $capitalized_win_seq );
+  }
+
+  my @vert = ();
+  my @edg  = ();
+
+  my $curr_abs_pos = $win_start + $curr_crop_i_left;
+  $curr_gi += $curr_crop_i_left;
+
+  # set vertice labeled with 'v' or 'V' according to sequence and vp option
+  # when vp set,
+  # uppercase nucleotides are annotated with 'v'
+  # lowercase nucleotides are annotated with 'V'
+  my $vertice_label = ( defined $i_vp and ( $seq_vp[$curr_crop_i_left] =~ /[a-z]/ ) ) ? 'V' : 'v';
+
+  # create backbone vertice of first nucleotide
+  push( @vert, join( ' ', $vertice_label, $curr_gi, $seq[$curr_crop_i_left], $curr_abs_pos ) );
+
+  foreach my $idx ( ( $curr_crop_i_left + 1 ) .. $curr_crop_i_right ) {
+    $curr_abs_pos++;
+    $curr_gi++;
+
+    # set vertice label as described above
+    $vertice_label = ( defined $i_vp and ( $seq_vp[$idx] =~ /[a-z]/ ) ) ? 'V' : 'v';
+    push( @vert, join( ' ', $vertice_label, $curr_gi, $seq[$idx], $curr_abs_pos ) );
+    push( @edg, join( ' ', "e", $curr_gi - 1, $curr_gi, '> 1' ) );
+  }
+
+  my @ret = ( @vert, @edg );
+
+  return \@ret;
+}
+
+###########################################################################
+# This method does the same as getStructGraph, but is extended to identify
+# the abstract parts of the structure and also add this to the graph
+# via abstract relations, see information below.
+# We already have the backbone graph, now the base-pair edges need to be
+# added to this graph and the abstract graph is added after this. Finally
+# the abstract relations, pointing from the abstract level to the basic
+# structure level is added. All the while, we keep track of the current
+# index graph. Furthermore, in the basic structure graph, we have vertices
+# stacks that connects the four stacked base-pairs involved. (symbols P
+# for vertex and p for edges). The abstract vertices are labelled according
+# to the given name-space, then #, then the abstract structure type, e.g.
+# HL for hairpin-loop. The edges between the structure types are denoted
+# with n, and the relation vertex is ^R, with ^r going from the abstract
+# structure to the relation and from the relation to the basic structure
+# is @r.
+#
+# Input:
+# curr_shrep : the current shrep as dot-bracket structure
+# curr_gi : the current graph index
+# win_size : the window size
+# cue : the input parameter -cue (whether to crop unpaired ends)
+# stacks : the input parameter -stacks (whether to add stack information)
+#
+# Output:
+# The structure graph information line-by-line as an array reference and
+# the current graph index.
+#
+############################################################################
+sub getStructPlusAbstractStructGraph {
+  my ( $curr_shrep, $curr_gi, $win_size, $cue, $stacks ) = @_;
+
+  #  $win_size =  length($curr_shrep->[0]);
+
+  my @struct = split( "", $curr_shrep->[0] );
+
+#  print "=================testing: new shrep : gi=$curr_gi ===============================\n" if ($i_debug);
+
+  # OBJECTS and VARIABLES
+  # all indices are saved according to the current graph index, $curr_gi,
+  # which is at the beginning of the current window, so that the nucleotide
+  # index can be inferred using $idx + $curr_gi
+  # opening brackets
+  my @open_blks = (); # open blocks of consecutive opening brackets (array of arrays)
+  my @p_open = ();  # currently open block of the last identified complete block
+  my @c_open = ();  # current "incomplete" block of consecutive opening brackets
+                    # base-pair objects
+  my %bps    = ();  # hash of all base-pairs with stem for a value, if it is at
+                    # the BP is opening or closing a stem object
+  my @c_bp   = ();  # current BP, just being closed at current index pos (i,j)
+  my @p_bp   = ();  # previously closed BP at position before current index
+                    # stem objects
+  my %stems  = ();  # all stems, key="i:j,k:l", value=stem array, (i,j) outer BP
+  my @c_stem = ();  # current incomplete stem object
+  my @stmbrks = (); # all stem-breaks in the form of (i,k,l), where i is the remaining
+                    # open base and (k,l) is the subsequent closing base-pair
+
+#  my @p_stmbrk	= (); # previous stem-break in the form of (i,k,l), where i is the remaining
+#  					  # open base and (k,l) is the subsequent closing base-pair
+# loop objects
+  my @up    = (); # all unpaired regions (i,j) that are of unkown type
+  my @c_up  = (); # current incomplete unpaired object
+  my @hls   = (); # all hairpin loop objects
+  my @bls   = (); # all bulge-loop objects
+  my @ils   = (); # all internal loops
+  my @mls   = (); # all multi-loops
+  my @c_mls = (); # current incomplete multi-loops! (there can be more than one)
+  my @els   = (); # all external loops
+  my @c_el  = (); # current incomplete external loop
+
+  # iterate through shrep structure an identify abstract parts
+  foreach my $idx ( 0 .. @struct - 1 ) {
+
+    #===================================================
+    # current char is the opening bracket of a base-pair
+    if ( $struct[$idx] eq "(" ) {
+
+      #===================================================
+
+      # update current window index to current graph index
+      $idx += $curr_gi;
+
+      # case: '((' currently there is an open block, extend it
+      if (@c_open) {
+        push( @c_open, $idx );
+
+# case: '.(' or ')('  or BOS'(' there is no open block, create a new one ".(" or ")("
+      } else {
+
+        # begin a new current open block
+        push( @c_open, $idx );
+
+        # case: ')('
+        if (@c_stem) {
+
+          # CLOSE STEM
+          print STDERR "TEST - closed stem:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@c_stem ) ) if ($i_debug);
+          close_stem( \@c_stem, \%stems, \@p_bp, \@c_bp, \@p_open, \@stmbrks, \@open_blks, \%bps );
+
+          # case (x(x)(
+          if (@open_blks) {
+
+            # EXTEND_ML, if one exists
+            if (@c_mls) {
+              extend_or_open_nested_ML( $curr_shrep, \@c_mls, \@p_bp, \@up, \@stmbrks, \@open_blks, $idx, $curr_gi );
+
+              # OPEN_ML
+            } else {
+
+              # case 'x((x)(' the opening of the ML was a stem-break
+              if ( @stmbrks
+                && $stmbrks[-1]->[1] == $p_bp[0]
+                && $stmbrks[-1]->[2] == $p_bp[1] ) {
+                my @c_stmbrk = @{ pop(@stmbrks) };
+                my @a        = ( $c_stmbrk[0], -1 );
+                my @ml       = ();
+                push( @ml, \@a );    # 1st base-pair is in-waiting
+                my @a2 = ( $c_stmbrk[1], $c_stmbrk[2] );
+                push( @ml, \@a2 );    #2nd BP
+                my @a3 = ($idx);
+                push( @ml,    \@a3 );    #first of current BP
+                push( @c_mls, \@ml );
+                print STDERR "TEST - opened ML with SB:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) ) if ($i_debug);
+
+                # case 'x(.(x)('
+              } elsif (@up) {
+                my @tmp_up = @{ pop(@up) };
+
+                # compare adjacent BP to UP to see if they fit
+                if ( $tmp_up[1] + 1 == $p_bp[0]
+                  && $open_blks[-1]->[-1] == $tmp_up[0] - 1 ) {
+                  my @a = ( $open_blks[-1]->[-1], -1 );
+                  my @ml = ();
+                  push( @ml, \@a );    # 1st BP awaits closing
+                  my @newbp = @p_bp;
+                  push( @ml, \@newbp );    # 2nd BP
+                  my @a2 = ($idx);
+                  push( @ml,    \@a2 );    # current open BP, awaits closing
+                  push( @c_mls, \@ml );
+                  print STDERR "TEST - opened ML with prev UP1:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) )
+
+                    if ($i_debug);
+                } else {
+                  die "ERROR: the base-pairs to not match the " . "previous unpaired region?!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
+                }
+              } else {
+                die "ERROR: in case ML, but there is no initial" . "part of the ML to be identified\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
+              }
+            }
+
+            # case: 'x(x)('
+          } else {
+
+            # case: '.(x)(' => CLOSE_EL
+            if (@c_el) {
+              if ( $c_el[-1] == $p_bp[0] ) {
+                push( @c_el, $p_bp[1] );
+                my @newel = @c_el;
+                push( @els, \@newel );
+                @c_el = ();
+                print STDERR "TEST - closed EL1:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newel ) ) if ($i_debug);
+              } else {
+                die "ERROR: the previous BP must match the current EL\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
+              }
+            }
+
+            # in both cases OPEN_EL
+            push( @c_el, @p_bp );
+            push( @c_el, $idx );
+            print STDERR "TEST - opened EL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@c_el ) ) if ($i_debug);
+          }
+
+          # case '.('
+        } elsif (@c_up) {
+
+          # case (x(x).( OR 'x(.('
+          if (@open_blks) {
+
+            # case 'x(x).(' ML
+            if ( @p_bp && ( $p_bp[1] == $c_up[0] - 1 ) ) {
+
+              # case '((x)x(x).(' EXTEND_ML, if one exists
+              if (@c_mls) {
+
+                extend_or_open_nested_ML( $curr_shrep, \@c_mls, \@p_bp, \@up, \@stmbrks, \@open_blks, $idx, $curr_gi );
+
+                # case 'x((x).(' or 'x(.(x).(' OPEN_ML
+              } else {
+
+                # case 'x((x).(' the opening of the ML was a stem-break
+                if ( @stmbrks
+                  && $stmbrks[-1]->[1] == $p_bp[0]
+                  && $stmbrks[-1]->[2] == $p_bp[1] ) {
+                  my @c_stmbrk = @{ pop(@stmbrks) };
+                  my @a        = ( $c_stmbrk[0], -1 );
+                  my @ml       = ();
+                  push( @ml, \@a );    # 1st base-pair is in-waiting
+                  my @a2 = ( $c_stmbrk[1], $c_stmbrk[2] );
+                  push( @ml, \@a2 );    #2nd BP
+                  my @a3 = ($idx);
+                  push( @ml,    \@a3 );    #first of current BP
+                  push( @c_mls, \@ml );
+                  print STDERR "TEST - open ML with SB:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) ) if ($i_debug);
+
+                  # case 'x(.(x).(' opening the ML with initial UP region
+                } elsif (@up) {
+
+                  # get previous unpaired region
+                  my $tmp  = pop(@up);
+                  my @p_up = @{$tmp};
+
+          # case: '(.(x).(' compare adjacent BP to UP to see if they create a ML
+                  if ( $p_up[1] + 1 == $p_bp[0]
+                    && $open_blks[-1]->[-1] == $p_up[0] - 1 ) {
+                    my @a = ( $open_blks[-1]->[-1], -1 );
+                    my @ml = ();
+                    push( @ml, \@a );    # 1st BP awaits closing
+                    my @newbp = @p_bp;
+                    push( @ml, \@newbp );    # 2nd BP
+                    my @a2 = ($idx);
+                    push( @ml,    \@a2 );    # current open BP, awaits closing
+                    push( @c_mls, \@ml );
+                    print STDERR "TEST - opened ML with prev UP2:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) )
+
+                      if ($i_debug);
+                  }
+
+                  # has to be an ML, but can't find first part
+                } else {
+                  die "ERROR: in case ML, but there is no initial " . "part of the ML to be identified\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
+                }
+              }
+
+              # case '(.(
+            } else {
+              my @newup = @c_up;
+              push( @up, \@newup );
+              print STDERR "TEST - closed UP:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newup ) ) if ($i_debug);
+            }
+
+            # case: 'x(x).(' OR 'EOS.('
+          } else {
+
+            # case: '.(x)(' => CLOSE_EL
+            if (@c_el) {
+              if ( $c_el[-1] == $c_bp[0] ) {
+                push( @c_el, $p_bp[1] );
+                my @newel = @c_el;
+                push( @els, \@newel );
+                @c_el = ();
+                print STDERR "TEST - closed EL2:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newel ) ) if ($i_debug);
+              } else {
+                die "ERROR: the previous BP must match the current EL\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
+              }
+            }
+
+            # in both cases OPEN_EL
+            if (@p_bp) {
+              push( @c_el, @p_bp );
+              push( @c_el, $idx );
+            } else {
+
+              # ignore this if -cue is given
+              # TODO check the gi index
+              if ($cue) {
+                @c_el = ();
+              } else {
+                push( @c_el, ( $curr_gi, $curr_gi ) );
+                push( @c_el, $idx );
+                print STDERR "TEST - opened EL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@c_el ) ) if ($i_debug);
+              }
+            }
+          }
+          @c_up = ();
+
+        } else {
+
+          # no more cases except opening bracket at beginning of sequence
+          # in this case do nothing, as the index has already been added
+          die "ERROR: '((', ')(', '.(' have all been covered\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) )
+            unless ( $idx == $curr_gi );
+
+        }
+      }
+
+      #===================================================
+      # current char is the unpaired base
+    } elsif ( $struct[$idx] eq "." ) {
+
+      #===================================================
+
+      # update current window index to current graph index
+      $idx += $curr_gi;
+
+      # case ').' => CLOSE_STEM
+      if (@c_stem) {
+        print STDERR "TEST - closed stem:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@c_stem ) ) if ($i_debug);
+        close_stem( \@c_stem, \%stems, \@p_bp, \@c_bp, \@p_open, \@stmbrks, \@open_blks, \%bps );
+      }
+
+      # case '..' extend UP, @c_up
+      if (@c_up) {
+        $c_up[1] = $idx;
+
+        # case 'x.' open new UP
+      } else {
+
+        # OPEN_UP, @c_up
+        @c_up = ( $idx, $idx );
+
+        # case '(.' just come to the end of an open block, push it onto stack
+        if (@c_open) {
+          my @newopen = @c_open;
+          push( @open_blks, \@newopen );
+          @c_open = ();
+
+          # case ').' or '.'
+          # (have already closed stem and created stem-break and emptied p_open)
+        } else {
+
+          # case 'x(x).' or '.' => CLOSE_EL
+          if ( !@open_blks ) {
+
+            # case 'x().().'
+            if (@c_el) {
+
+              # double check
+              if ( $c_el[-1] == $p_bp[0] ) {
+                push( @c_el, $p_bp[1] );
+                my @newel = @c_el;
+                push( @els, \@newel );
+                @c_el = ();
+                print STDERR "TEST - closed EL3:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newel ) ) if ($i_debug);
+              } else {
+                die "This case should not occur!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
+              }
+            }
+          }
+        }
+      }
+
+      #===================================================
+      # current char is the closing bracket of a base-pair
+    } elsif ( $struct[$idx] eq ")" ) {
+
+      #===================================================
+
+      # update current window index to current graph index
+      $idx += $curr_gi;
+
+      # save previous base-pair
+      if (@c_bp) {
+        @p_bp = @c_bp;
+      }
+
+      # case: '((x))', extend stem
+      if (@p_open) {
+
+        # get current base-pair
+        @c_bp = ( pop(@p_open), $idx );
+
+        # add it to the base-pair hash
+        $bps{"$c_bp[0]:$c_bp[1]"} = "";
+
+        # EXTEND_STEM
+        if (@c_stem) {
+          $c_stem[0] = $c_bp[0];
+          $c_stem[1] = $c_bp[1];
+        } else {
+          die "ERROR: in case '))' there has to be an open stem object!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
+        }
+
+        # case: '(x(x))' or '(x.)', '()' not allowed
+      } else {
+
+        # case: '(x(x))'  CLOSE_STEM close previous stem
+        if (@c_stem) {
+          print STDERR "TEST - closed stem:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@c_stem ) ) if ($i_debug);
+          close_stem( \@c_stem, \%stems, \@p_bp, \@c_bp, \@p_open, \@stmbrks, \@open_blks, \%bps );
+
+          # case '(x(x))' OPEN_STEM
+          if (@open_blks) {
+            my $tmp = pop(@open_blks);
+            @p_open = @{$tmp};
+
+            # get current base-pair
+            @c_bp = ( pop(@p_open), $idx );
+
+            # add it to the base-pair hash
+            # as the stem is not finished yet, cannot add stem information
+            $bps{"$c_bp[0]:$c_bp[1]"} = "";
+
+            # open stem
+            $c_stem[0] = $c_bp[0];
+            $c_stem[1] = $c_bp[1];
+            $c_stem[2] = $c_bp[0];
+            $c_stem[3] = $c_bp[1];
+          } else {
+            die "ERROR: there have to be open blocks to match " . "current closing bracket!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
+          }
+
+          # case: '(.(x))' => CLOSE_BL
+          if ( @up && $up[-1]->[0] - 1 == $c_bp[0] && $up[-1]->[1] + 1 == $p_bp[0] ) {
+            my @newBL = ( @c_bp, @p_bp );
+            push( @bls, \@newBL );
+            pop(@up);
+            print STDERR "TEST - closed BL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newBL ) ) if ($i_debug);
+
+            # case: '(x(x).(x))' => CLOSE_ML
+          } elsif ( @c_mls && $c_mls[-1]->[-1]->[-1] == $p_bp[0] ) {
+            my @newml = @{ pop(@c_mls) };
+            push( @{ $newml[-1] }, $p_bp[1] );    # close last BP
+            $newml[0]->[1] = $c_bp[1];            # close 1st BP
+            push( @mls, \@newml );
+            print STDERR "TEST - closed ML1:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newml, 1 ) ) if ($i_debug);
+          } else {
+            die "ERROR: what is this case?\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
+          }
+
+          # case: (x.)
+        } else {
+
+          # OPEN_STEM open new stem
+          if (@open_blks) {
+            @p_open = @{ pop(@open_blks) };
+
+            # get current base-pair
+            @c_bp = ( pop(@p_open), $idx );
+
+            # add it to the base-pair hash
+            # as the stem is not finished yet, cannot add stem information
+            $bps{"$c_bp[0]:$c_bp[1]"} = "";
+
+            # open stem
+            $c_stem[0] = $c_bp[0];
+            $c_stem[1] = $c_bp[1];
+            $c_stem[2] = $c_bp[0];
+            $c_stem[3] = $c_bp[1];
+          } else {
+            die "ERROR: there have to be open blocks to match " . "current closing bracket!$curr_shrep->[0]\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
+          }
+
+          # CLOSE_C_UP
+          if (@c_up) {
+
+            # case: (.) => close_HL
+            if ( $c_up[0] - 1 == $c_bp[0] ) {
+              my @newhl = @c_bp;
+              push( @hls, \@newhl );
+              @c_up = ();
+              print STDERR "TEST - closed HL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newhl ) ) if ($i_debug);
+
+              # case: (x(x).)
+            } elsif ( @p_bp && $c_up[0] - 1 == $p_bp[1] ) {
+
+              # case: ((x).) => CLOSE_BL
+              if ( @stmbrks
+                && $stmbrks[-1]->[0] == $c_bp[0]
+                && $stmbrks[-1]->[1] == $p_bp[0]
+                && $stmbrks[-1]->[2] == $p_bp[1] ) {
+                my @newbl = ( @c_bp, @p_bp );
+                push( @bls, \@newbl );
+                pop(@stmbrks);
+                @c_up = ();
+                print STDERR "TEST - closed BL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newbl ) ) if ($i_debug);
+
+                # case: (x(x).), either CLOSE_ML or CLOSE_IL
+              } else {
+
+                # case '(.(x).)' => CLOSE_IL
+                if ( @up
+                  && ( $up[-1]->[0] - 1 == $c_bp[0] )
+                  && ( $up[-1]->[1] + 1 == $p_bp[0] ) ) {
+                  my @p_up = @{ pop(@up) };
+                  my @newil = ( @c_bp, @p_bp );
+                  push( @ils, \@newil );
+                  @c_up = ();
+                  print STDERR "TEST - closed IL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newil ) ) if ($i_debug);
+
+                  # case (x(x)x(x).) => CLOSE_ML remember, array of tuples
+                } elsif ( @c_mls && $c_mls[-1]->[-1]->[0] == $p_bp[0] ) {
+                  my @newml = @{ pop(@c_mls) };
+                  push( @{ $newml[-1] }, $p_bp[1] );    # close final BP
+                  $newml[0]->[1] = $c_bp[1];            # close 1st BP
+                  push( @mls, \@newml );
+                  @c_up = ();
+                  print STDERR "TEST - closed ML2:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newml, 1 ) ) if ($i_debug);
+                } else {
+                  die "There should be no such case!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
+                }
+              }
+
+              # there should be no more unknown unpaired regions left!
+            } else {
+              die "There shouldn't be any unknown unpaired " . "regions left\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
+            }
+
+          } else {
+            die "ERROR: '()' is not allowed in $curr_shrep->[0]\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) );
+          }
+        }
+      }
+    }
+  }
+
+  # EOS
+
+  # CLOSE_STEM
+  print STDERR "TEST - closed stem EOS:\n" . ( test( $curr_shrep->[0], ( $win_size + $curr_gi ), $curr_gi, \@c_stem ) )
+    if ( @c_stem && $i_debug );
+  close_stem( \@c_stem, \%stems, \@p_bp, \@c_bp, \@p_open, \@stmbrks, \@open_blks, \%bps ) if (@c_stem);
+  die "ERROR: there should not be an open ML!\n" . $curr_shrep . "\n"
+    if (@c_mls);
+
+  # case ().() CLOSE_EL
+  if (@c_el) {
+    if ( $c_el[-1] == $p_bp[0] ) {
+      push( @c_el, $p_bp[1] );
+      my @newel = @c_el;
+      push( @els, \@newel );
+      @c_el = ();
+
+      print STDERR "TEST - closed EL4 EOS:\n" . ( test( $curr_shrep->[0], ( $win_size + $curr_gi ), $curr_gi, \@newel ) )
+        if ($i_debug);
+    } else {
+      die "ERROR: the base-pairs in the EL don't match" . $curr_shrep . "\n";
+    }
+  }
+
+  # case (). CLOSE_EL
+  if (@c_up) {
+    unless ($cue) {
+      if (@p_bp) {
+        push( @c_el, @p_bp );
+      } else {
+        push( @c_el, ( $curr_gi, $curr_gi ) );
+      }
+
+      push( @c_el, ( ( $curr_gi + $win_size - 1 ), ( $curr_gi + $win_size - 1 ) ) );
+      my @newel = @c_el;
+      push( @els, \@newel );
+      @c_el = ();
+      print STDERR "TEST - closed EL5 EOS:\n" . ( test( $curr_shrep->[0], ( $win_size + $curr_gi ), $curr_gi, \@newel ) )
+        if ($i_debug);
+      ## check case where cue is given, but window is completely unstructured
+    } else {
+      unless (@p_bp) {
+        my @newel = ( $curr_gi, $curr_gi, ( $curr_gi + $win_size - 1 ), ( $curr_gi + $win_size - 1 ) );
+        push( @els, \@newel );
+      }
+    }
+  }
+
+  #  # update current graph index (since all shrep indices have been read)
+  $curr_gi += $win_size;
+
+  # ===================================================
+  # build graph lines
+  my @graph_lines = ();
+
+  # ===================================================
+
+  # graph line objects
+  my @edg        = ();
+  my @stackgraph = ();
+  my @abstrgraph = ();
+
+ #-------------------> stems,stacks
+ # build basic graph edges, stacks, and vertices for stems in the abstract graph
+  my @tmp = keys %stems;
+  foreach my $sk (@tmp) {
+    my @stem = @{ $stems{$sk} }; # stem (i,j,k,l)=> (i,j)= (0,1) and (k,l)=(2,3)
+    die "ERROR: There are not enough elements in this stem: ", ( join( ",", @stem ) ), "\n" unless ( scalar(@stem) == 4 );
+
+    # infer base-pairs from stem
+    @p_bp = ();
+    @c_bp = ();
+    my @stem_relation_idxs  = ();
+    my @stack_relation_idxs = ();
+    my $stacksize           = $stem[2] - $stem[0] + 1;
+    for ( my $x = 0 ; $x < $stacksize ; $x++ ) {
+
+      #    for (my $i = $stem[0]; $i <= $stem[2] ; $i++){
+      #      for(my $j = $stem[1]; $j >= $stem[3]; $j--){
+      my $i = $stem[0] + $x;
+      my $j = $stem[1] - $x;
+      @p_bp = @c_bp;
+      @c_bp = ( $i, $j );
+      if ( defined $bps{"$i:$j"} ) {
+
+        # add indices to stem relation object
+        push( @stem_relation_idxs, $i );
+        push( @stem_relation_idxs, $j );
+
+        # add edge to graph edges
+        push( @edg, "e $i $j s" );
+
+        # add stack if option given
+        if ($stacks) {
+          if (@p_bp) {
+            push( @stackgraph,          "v $curr_gi P" );
+            push( @stackgraph,          "e $curr_gi $p_bp[0] p" );
+            push( @stackgraph,          "e $curr_gi $p_bp[1] p" );
+            push( @stackgraph,          "e $c_bp[0] $curr_gi p" );
+            push( @stackgraph,          "e $c_bp[1] $curr_gi p" );
+            push( @stack_relation_idxs, $curr_gi );
+
+            # update graph index
+            ++$curr_gi;
+          }
+        }
+      } else {
+        die "ERROR: There is no such base-pair in the base-pair hash:" . " ($i, $j)\n";
+      }
+    }
+
+    #  	  }
+    #    }
+    # add stem to abstract graph
+    # (graph indices become a bit jumbled, but saves time)
+    push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#S" );
+
+    # stem object now saves the curr_gi index for that stem
+    $stems{$sk} = $curr_gi;
+    ++$curr_gi;
+
+    # add relations
+    push( @abstrgraph, "v $curr_gi ^R" );
+    push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" );
+    foreach my $r (@stem_relation_idxs) {
+      push( @abstrgraph, "e $curr_gi $r \@r" );
+    }
+    if ($stacks) {
+      foreach my $r (@stack_relation_idxs) {
+        push( @abstrgraph, "e $curr_gi $r \@r" );
+      }
+    }
+    ++$curr_gi;
+  }
+
+  #-------------------> add HLs
+  foreach my $hl (@hls) {
+
+    # check size
+    die "ERROR: the HL object is the incorrect size\n" unless ( @{$hl} == 2 );
+
+    # add vertex
+    push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#HL" );
+
+    # add edge from stem to hl
+    if ( defined( $stems{ $bps{"$hl->[0]:$hl->[1]"} } ) ) {
+      push( @abstrgraph, "e " . $stems{ $bps{"$hl->[0]:$hl->[1]"} } . " $curr_gi n" );
+    } else {
+      die "ERROR: stem not defined in base-pair hash for $hl->[0]:$hl->[1]\n";
+    }
+    ++$curr_gi;
+
+    # add relations
+    push( @abstrgraph, "v $curr_gi ^R" );
+    push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" );
+
+    # add all nodes between i to j, inclusively
+    for ( my $r = $hl->[0] ; $r <= $hl->[1] ; $r++ ) {
+      push( @abstrgraph, "e $curr_gi $r \@r" );
+    }
+    ++$curr_gi;
+  }
+
+  #------------------> add BLs
+  foreach my $bl (@bls) {
+
+    # check size
+    die "ERROR: the BL object is the incorrect size\n" unless ( scalar( @{$bl} ) == 4 );
+
+    # add vertex
+    push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#BL" );
+
+    # add edges from bl to both adjacent stems
+    if ( defined( $stems{ $bps{"$bl->[0]:$bl->[1]"} } )
+      && defined( $stems{ $bps{"$bl->[2]:$bl->[3]"} } ) ) {
+
+      # add outer stem: S->BL
+      push( @abstrgraph, "e " . $stems{ $bps{"$bl->[0]:$bl->[1]"} } . " $curr_gi n" );
+
+      # add inner stem: BL->S
+      push( @abstrgraph, "e $curr_gi " . $stems{ $bps{"$bl->[2]:$bl->[3]"} } . " n" );
+    } else {
+      die "ERROR: stems not defined for BL\n";
+    }
+    ++$curr_gi;
+
+    # add relations
+    push( @abstrgraph, "v $curr_gi ^R" );
+    push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" );
+
+    # add all nodes between i to k and l to j, inclusively
+    for ( my $r = $bl->[0] ; $r <= $bl->[2] ; $r++ ) {
+      push( @abstrgraph, "e $curr_gi $r \@r" );
+    }
+    for ( my $r = $bl->[3] ; $r <= $bl->[1] ; $r++ ) {
+      push( @abstrgraph, "e $curr_gi $r \@r" );
+    }
+    ++$curr_gi;
+  }
+
+  #------------------> add ILs
+  foreach my $il (@ils) {
+
+    # check size
+    die "ERROR: the IL object is the incorrect size\n" unless ( scalar( @{$il} ) == 4 );
+
+    # add vertex
+    push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#IL" );
+
+    # add edges from il to both adjacent stems
+    if ( defined( $stems{ $bps{"$il->[0]:$il->[1]"} } )
+      && defined( $stems{ $bps{"$il->[2]:$il->[3]"} } ) ) {
+
+      # add outer stem: S->IL
+      push( @abstrgraph, "e " . $stems{ $bps{"$il->[0]:$il->[1]"} } . " $curr_gi n" );
+
+      # add inner stem: IL->S
+      push( @abstrgraph, "e $curr_gi " . $stems{ $bps{"$il->[2]:$il->[3]"} } . " n" );
+    } else {
+      die "ERROR: stems not defined for IL\n";
+    }
+    ++$curr_gi;
+
+    # add relations
+    push( @abstrgraph, "v $curr_gi ^R" );
+    push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" );
+
+    # add all nodes between i to k and l to j, inclusively
+    for ( my $r = $il->[0] ; $r <= $il->[2] ; $r++ ) {
+      push( @abstrgraph, "e $curr_gi $r \@r" );
+    }
+    for ( my $r = $il->[3] ; $r <= $il->[1] ; $r++ ) {
+      push( @abstrgraph, "e $curr_gi $r \@r" );
+    }
+    ++$curr_gi;
+  }
+
+  #------------------> add ELs
+  foreach my $el (@els) {
+
+    #check size
+    die "ERROR: the EL object is the incorrect size\n" unless ( scalar( @{$el} ) == 4 );
+
+    # add vertex
+    push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#EL" );
+
+    # add edges from el to both adjacent stems, if available
+    unless ( $el->[0] == $el->[1] ) {
+      if ( defined( $stems{ $bps{"$el->[0]:$el->[1]"} } ) ) {
+        push( @abstrgraph, "e " . $stems{ $bps{"$el->[0]:$el->[1]"} } . " $curr_gi n" );
+      } else {
+        die "ERROR: stems not defined for left BP of EL\n";
+      }
+    }
+    unless ( $el->[2] == $el->[3] ) {
+      if ( defined( $stems{ $bps{"$el->[2]:$el->[3]"} } ) ) {
+        push( @abstrgraph, "e $curr_gi " . $stems{ $bps{"$el->[2]:$el->[3]"} } . " n" );
+      } else {
+        die "ERROR: stems not defined for right BP of EL\n";
+      }
+    }
+    ++$curr_gi;
+
+    # add relations
+    push( @abstrgraph, "v $curr_gi ^R" );
+    push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" );
+
+    # @r relation edges
+    for ( my $r = $el->[1] ; $r <= $el->[2] ; $r++ ) {
+      push( @abstrgraph, "e $curr_gi $r \@r" );
+    }
+
+    # if BP, not EOS
+    unless ( $el->[0] == $el->[1] ) {
+      push( @abstrgraph, "e $curr_gi $el->[0] \@r" );
+    }
+
+    # if BP, not EOS
+    unless ( $el->[2] == $el->[3] ) {
+      push( @abstrgraph, "e $curr_gi $el->[3] \@r" );
+    }
+    ++$curr_gi;
+  }
+
+  #------------------> add MLs
+  foreach my $ml (@mls) {
+
+    #check size
+    die "ERROR: the ML object is not big enough!\n" unless ( scalar( @{$ml} ) >= 3 );
+
+    # add vertex
+    push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#ML" );
+
+    # add edge from outer stem to first BP
+    my $closing_bp = shift( @{$ml} );
+    if ( defined( $stems{ $bps{"$closing_bp->[0]:$closing_bp->[1]"} } ) ) {
+      push( @abstrgraph, "e " . $stems{ $bps{"$closing_bp->[0]:$closing_bp->[1]"} } . " $curr_gi n" );
+    } else {
+      die "ERROR: ML is not defined for BP $closing_bp->[0]:$closing_bp->[1]\n";
+    }
+
+    # add remaining stems
+    foreach my $bp ( @{$ml} ) {
+      if ( defined( $stems{ $bps{"$bp->[0]:$bp->[1]"} } ) ) {
+        push( @abstrgraph, "e $curr_gi " . $stems{ $bps{"$bp->[0]:$bp->[1]"} } . " n" );
+      } else {
+        die "ERROR: ML is not defined for BP $bp->[0]:$bp->[1]\n";
+      }
+    }
+    ++$curr_gi;
+
+    # add relations
+    push( @abstrgraph, "v $curr_gi ^R" );
+    push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" );
+
+  # add @r relation edges between i and k, where (i,j) is the first BP and (k,l)
+  # the second BP
+    my $pbp_aref = $closing_bp;
+    my $cbp_aref = shift( @{$ml} );
+    for ( my $r = $pbp_aref->[0] ; $r <= $cbp_aref->[0] ; $r++ ) {
+      push( @abstrgraph, "e $curr_gi $r \@r" );
+    }
+
+    # rest of @r relation edges
+    foreach my $bp ( @{$ml} ) {
+      $pbp_aref = $cbp_aref;
+      $cbp_aref = $bp;
+      for ( my $r = $pbp_aref->[1] ; $r <= $cbp_aref->[0] ; $r++ ) {
+        push( @abstrgraph, "e $curr_gi $r \@r" );
+      }
+    }
+
+    # add final stretch between last BP (m,n) and first closing base-pair (i,j)
+    for ( my $r = $cbp_aref->[1] ; $r <= $closing_bp->[1] ; $r++ ) {
+      push( @abstrgraph, "e $curr_gi $r \@r" );
+    }
+    ++$curr_gi;
+  }
+
+  @graph_lines = ( @edg, @stackgraph, @abstrgraph );
+  return ( \@graph_lines, $curr_gi );
+}
+
+# just to test the current shrep dot-bracket structure parsing
+sub test {
+  my ( $shrep, $idx, $gi, $mark_aref, $isML ) = @_;
+
+  my $result = "";
+
+  $result .= "$shrep\n";
+  my @a = split( "", $shrep );
+  for ( my $i = 0 ; $i <= @a ; $i++ ) {
+    my $mark = 0;
+    foreach my $j ( @{$mark_aref} ) {
+      if ($isML) {
+        $mark = 1 if ( $i == $j->[0] - $gi );
+        $mark = 1 if ( scalar( @{$j} == 2 && $i == $j->[1] - $gi ) );
+      } else {
+        $mark = 1 if ( $i == $j - $gi );
+      }
+
+    }
+    if ( $i == $idx - $gi ) {
+      $result .= "^";
+    } elsif ($mark) {
+      $result .= "'";
+    } else {
+      $result .= " ";
+    }
+  }
+  $result .= "\n";
+
+  return $result;
+}
+
+###########################################################################
+# Extends or closes a multi-loop. This means a ML already exists.
+# We can have the case ((x)( or ((x).(
+#
+# Input:
+# curr_shrep	The current shrep object, with shrep string at pos 0
+# c_ml_aref		The current open MLs array reference
+# p_bp_aref		The previous base-pair array reference
+# stmbrks_aref	The stem-breaks array reference
+# open_blks_aref	The open bracket blocks array reference
+# idx			The current index
+# #curr_gi		The current graph index
+# Output:
+# None, it should just modify the ML accordingly
+############################################################################
+sub extend_or_open_nested_ML {
+  my ( $curr_shrep, $c_mls_aref, $p_bp_aref, $up_aref, $stmbrks_aref, $open_blks_aref, $idx, $curr_gi ) = @_;
+
+  # case '(x(x)x(x)x(' extend current ML
+  if ( $c_mls_aref->[-1]->[-1]->[-1] == $p_bp_aref->[0] ) {
+    push( @{ $c_mls_aref->[-1]->[-1] }, $p_bp_aref->[1] );    #close previous BP
+    my @a = ($idx);
+    push( @{ $c_mls_aref->[-1] }, \@a );    # add new open base-pair
+    print STDERR "TEST - extended ML:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, $c_mls_aref->[-1], 1 ) ) if ($i_debug);
+
+    # case '(x(x)x((x)(' nested ML, => OPEN_ML with stem-break
+  } elsif ( @$stmbrks_aref
+    && $stmbrks_aref->[-1]->[1] == $p_bp_aref->[0]
+    && $stmbrks_aref->[-1]->[2] == $p_bp_aref->[1] ) {
+
+    my @c_stmbrk = @{ pop( @{$stmbrks_aref} ) };
+    my @a        = ( $c_stmbrk[0], -1 );
+    my @ml       = ();
+    push( @ml, \@a );                       # 1st base-pair is in-waiting
+    my @a2 = ( $c_stmbrk[1], $c_stmbrk[2] );
+    push( @ml, \@a2 );                      #2nd BP
+    my @a3 = ($idx);
+    push( @ml,            \@a3 );           #first of current BP
+    push( @{$c_mls_aref}, \@ml );
+    print STDERR "TEST - opened nested ML with SB:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) ) if ($i_debug);
+
+    # case '(x(x)x(.(x)(' nested ML, => OPEN_ML with UP
+  } elsif ( @$up_aref
+    && $up_aref->[-1]->[0] - 1 == $open_blks_aref->[-1]->[-1]
+    && $up_aref->[-1]->[1] + 1 == $p_bp_aref->[0] ) {
+    my @tmp_up = @{ pop( @{$up_aref} ) };
+    my @a      = ( $open_blks_aref->[-1]->[-1], -1 );
+    my @ml     = ();
+    push( @ml, \@a );                       # 1st BP awaits closing
+    my @newbp = @{$p_bp_aref};
+    push( @ml, \@newbp );                   #2nd BP
+    my @a2 = ($idx);
+    push( @ml,            \@a2 );           # current open BP, awaits closing
+    push( @{$c_mls_aref}, \@ml );
+    print STDERR "TEST - opened nested ML with prev UP\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) ) if ($i_debug);
+  } else {
+    die "ERROR: in the case extend or open nested ML, " . "but the base-pairs don't fit for either!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ) if ($i_debug);
+  }
+}
+
+###########################################################################
+# Closes a stem-object for the abstract graph. If necessary, it also
+# creates a stem-break because the current open block is still open,
+# which can only mean that there is a stem-break between two consecutive
+# opening brackets.
+#
+# Input:
+# c_stem_aref : current stem array reference
+# stems_href : all stems hash reference
+# p_bp_aref : previous base-pair array reference
+# c_cp_aref : current base-pair array reference
+# p_open_aref : previously opened open-bracket block
+# p_stmbrk_aref : the previous stem-break
+# open_blks_aref : all old open blocks array reference
+# bps_href : all base-pairs hash reference
+#
+# Output:
+# None, it should just modify the input variables accordingly
+############################################################################
+sub close_stem {
+  my ( $c_stem_aref, $stems_href, $p_bp_aref, $c_bp_aref, $p_open_aref, $stmbrks_aref, $open_blks_aref, $bps_href ) = @_;
+
+  my @newstem = @$c_stem_aref;
+  my $stemkey = "$newstem[0]:$newstem[1],$newstem[2]:$newstem[3]";
+  $stems_href->{$stemkey} = \@newstem;
+
+  # reset and clean running variables
+  @$c_stem_aref = ();
+  @$p_bp_aref   = @$c_bp_aref;
+  @$c_bp_aref   = ();
+
+  # add stem to base-pair hash
+  $bps_href->{"$newstem[0]:$newstem[1]"} = $stemkey;
+  $bps_href->{"$newstem[2]:$newstem[3]"} = $stemkey;
+
+  # there are still bases open in the previous open block, '[[]x',
+  # where [] is a closed stem and [ is an open block,
+  # must create STEM-BREAK
+  # this cannot be the case ((x)), because there p_open is empty
+  if (@$p_open_aref) {
+
+    # check: the next open base is adjacent to the previous BP
+    if ( $p_open_aref->[-1] == $p_bp_aref->[0] - 1 ) {
+      my @stmbrk = ( $p_open_aref->[-1], @$p_bp_aref );
+      push( @$stmbrks_aref, \@stmbrk );
+
+      # push the previous open block back onto stack
+      my @newopen = @$p_open_aref;
+      push( @$open_blks_aref, \@newopen );
+      @$p_open_aref = ();
+    } else {
+
+      # error, this should not be possible
+      die("ERROR in close_stem(), [.[]x should not occur here\n");
+    }
+  }
+}
+
+###########################################################################
+# Here the information about the structure is added to the graph. In a
+# first step each base-pair is added by adding edges between the vertices
+# of the respective nucleotides. Then, if two base-pairs are stacked,
+# we add an extra vertex and connect the four nucleotide vertices that
+# are involved with extra edges (symbols P used for vertex and p for edges).
+# While adding new vertices we keep track of the current graph index.
+#
+# Input:
+# curr_shrep : the current shrep as dot-bracket structure
+# curr_gi : the current graph index
+# win_size : the window size
+# stacks : the input parameter -stacks (whether to add stacking information)
+#
+# Output:
+# The structure graph information line-by-line as an array reference and
+# the current graph index.
+############################################################################
+sub getStructGraph {
+  my ( $curr_shrep, $curr_gi, $win_size, $stacks ) = @_;
+
+  my @struct = split( "", $curr_shrep->[0] );
+
+  my @edg    = ();
+  my @starts = ();
+  my @pairs  = ();
+
+  foreach my $idx ( 0 .. @struct - 1 ) {
+
+    push( @starts, $idx ) if ( $struct[$idx] eq "(" );
+
+    if ( $struct[$idx] eq ")" ) {
+      my $start = $curr_gi + pop(@starts);
+      my $end   = $curr_gi + $idx;
+      push( @edg, "e " . $start . " " . $end . " s " );
+      my @pair = ( $start, $end );
+      push( @pairs, \@pair );
+    }
+  }
+
+  # update current graph index
+  $curr_gi = $curr_gi + $win_size;
+
+  # initiate structure lines array
+  my @stacking_info = ();
+
+  # add stacking information to graph unless input option tells us not to
+  if ($stacks) {
+
+    my $stacked_pairs = 0;
+
+    # get stacked base-pairs (they are ordered according to
+    # position of closing nucleotide)
+    for ( my $i = 1 ; $i < @pairs ; $i++ ) {
+
+      # add stacked base-pairs (vertices+edges)
+      my $curr_bp_aref = $pairs[$i];
+      my $prev_bp_aref = $pairs[ $i - 1 ];
+
+      # if the current base-pair is stacked on the previous base-pair,
+      # when curr_start = prev_start - 1 AND curr_end = prev_end + 1
+      if ( $curr_bp_aref->[0] == $prev_bp_aref->[0] - 1
+        && $curr_bp_aref->[1] == $prev_bp_aref->[1] + 1 ) {
+
+        # add stacking vertex P
+        push( @stacking_info, "v $curr_gi P" );
+
+        # add four edges from involved nucleotids
+        push( @stacking_info, "e $curr_gi $prev_bp_aref->[0] p" );
+        push( @stacking_info, "e $curr_gi $prev_bp_aref->[1] p" );
+        push( @stacking_info, "e $curr_bp_aref->[0] $curr_gi p" );
+        push( @stacking_info, "e $curr_bp_aref->[1] $curr_gi p" );
+        ++$curr_gi;    # add one to the index, ready for next vertex
+
+      }
+    }
+  }
+
+  my @str_graphlines = ( @edg, @stacking_info );
+  return ( \@str_graphlines, $curr_gi );
+}
+
+###########################################################################
+# This method gives us the overall graph header for this sequence. This
+# graph includes all window calculations and all shreps for all windows.
+# Input:
+# seq_id : the sequence ID
+# seq_head : the header information for the sequence
+# wins_aref : the windows sizes for folding
+# h_shift : input parameter -shift
+# h_e : input parameter -e
+# h_c : input parameter -c
+# h_t : input parameter -t
+# h_u : input parameter -u
+# h_r : input parameter -r
+# h_M : input parameter -M
+# h_crop_unpaired_ends : input parameter -cue
+#
+# Output:
+# The header line as a string
+############################################################################
+sub getGraphHeader {
+  my ( $seq_id, $seq_head, $wins_aref, $h_shift, $h_e, $h_c, $h_t, $h_u, $h_r, $h_M, $h_crop_unpaired_ends, $h_i, $h_sample_len, $h_q, $h_Tp, $h_seqlen, $h_noStr ) = @_;
+
+  my $ret;
+
+  $ret = "t # SEQID $seq_id ";
+  $ret .= "$seq_head " if ( defined $seq_head );
+  $ret .= "SEQLEN $h_seqlen ";
+
+  return $ret . "\n" if ($h_noStr);
+
+  $ret .= "MAXWINSHREPS $h_M ";
+  $ret .= "CUE 1 " if ($h_crop_unpaired_ends);
+  $ret .= "RNASHAPES -w " . ( join( ",", @{$wins_aref} ) ) . " ";
+  $ret .= "-SHIFT%w $h_shift " if ( not $globalFolding and $h_shift );
+  $ret .= "-SHIFT 1 " if ( not $globalFolding and not $h_shift );
+  $ret .= "-e $h_e "                         if ( defined $h_e );
+  $ret .= "-c $h_c "                         if ( defined $h_c );
+  $ret .= "-t $h_t "                         if ( defined $h_t );
+  $ret .= "-u 1 "                            if ($h_u);
+  $ret .= "-r 1 "                            if ($h_r);
+  $ret .= "-i $h_i SAMPLE_LEN $h_sample_len" if ( not $h_q and $h_i );
+  $ret .= "-q 1 "                            if ($h_q);
+  $ret .= "-T $h_Tp"                         if ( $h_q and defined $h_Tp );
+  $ret .= "\n";
+
+  return $ret;
+}
+
+###########################################################################
+# This method gives us the window header for the subgraph that includes
+# all shreps for the current window.
+# Input:
+# graphHead : the header line for the entire sequence graph
+# win_size : the size of the current window
+# win_start : the starting position of the current window
+# win_end : the end position of the current window
+# win_center : the centre position of the current window
+# win_global_fold : (boolean) whether the actual number of nucleotides
+# is smaller than the given window size (end of sequence?)
+#
+# Output:
+# The header line as a string
+############################################################################
+sub getWindowHeader {
+  my ( $graphHead, $win_size, $win_start, $win_end, $win_center, $win_global_fold, $win_sample, $used_t ) = @_;
+
+  chomp $graphHead;
+
+  $graphHead =~ s/t # //;
+
+  my $ret = "w # $graphHead ";
+  $ret .= "SHAPE_TYPE $used_t ";
+  $ret .= "GLOBALFOLD $win_global_fold ";
+  $ret .= "WSIZE $win_size ";
+  $ret .= "WSTART $win_start ";
+  $ret .= "WEND $win_end ";
+  $ret .= "WCENT $win_center ";
+  $ret .= "WSAMPLE $win_sample ";
+  $ret .= "\n";
+
+  return $ret;
+}
+
+###########################################################################
+# This method gives us the shape header that includes the single shrep
+# connected components.
+# Input:
+# winHead : the header for the current window
+# shrep : information about the current shrep
+#
+# Output:
+# The header line as a string
+############################################################################
+sub getShapeHeader {
+  my ( $winHead, $shrep ) = @_;
+
+  chomp $winHead;
+
+  $winHead =~ s/w # //;
+  $winHead =~ s/t # //;
+
+  my $ret = "s # $winHead ";
+
+  my @info = @{$shrep};
+  $ret .= join( " ", @info[ 1 .. $#info ] );
+  $ret .= "\n";
+
+  return $ret;
+}
+
+###########################################################################
+# This method gives us the sequence header that includes the unstructured
+# sequence connected components.
+# Input:
+# winHead : the header for the current window
+# shrep : information about the current shrep
+#
+# Output:
+# The header line as a string
+############################################################################
+sub getSeqHeader {
+  my ( $winHead, $shrep ) = @_;
+
+  chomp $winHead;
+
+  $winHead =~ s/w # //;
+  $winHead =~ s/t # //;
+
+  my $ret = "u # $winHead ";
+
+  my @info = @{$shrep};
+  $ret .= join( " ", @info[ 1 .. $#info ] );
+  $ret .= "\n";
+
+  return $ret;
+}
+
+
+##################################################################################
+# This method parses a fasta file and is useful if the header ID is not-unique!!
+# It returns the header lines in an array and the sequence lines in the same
+# order. It is then up to the user to extract the parts from the header that is
+# necessary for the script.
+# Furthermore, the method deals with multiple lines, and returns a single sequence
+# without line breaks.
+# Input:
+#		file		The name of the fasta file
+# Output:
+#	(1)	An array reference for each header line
+#	(2) An array reference for each sequence in same order as the headers
+##################################################################################
+sub read_fasta_with_nonunique_headers_meta {
+  my ($file) = @_;
+  my $FUNCTION = "read_fasta_file in Sequences.pm";
+
+  my $header    = "";
+  my $seqstring = "";
+  my @headers   = ();
+  my @sequences = ();
+  my @meta      = ();
+  my %seq_meta  = ();
+  open( IN_HANDLE, "<$file" ) || die "ERROR in $FUNCTION:\n" . "Couldn't open the following file in package Tool," . " sub read_fasta_file: $file\n";
+
+  while ( my $line = <IN_HANDLE> ) {
+    chomp($line);
+
+    # header (can contain one space after > symbol)
+    if ( $line =~ /^\>(.*)/ ) {
+      if ($header) {
+        $seqstring =~ s/\s*//g;    ## do not allow spaces in sequence
+        push( @headers,   $header );
+        push( @sequences, $seqstring );
+        push( @meta,      +{%seq_meta} );    ## anonymous hash reference
+                                             #print keys %seq_meta;
+        $seqstring = "";
+        undef(%seq_meta);
+      }
+      $header = $1;
+
+    } elsif ( $line =~ /(.+)\s+(#\S+)$/ && $header ) {
+
+      if ( exists $seq_meta{$2} ) {
+        $seq_meta{$2} .= $1;
+      } else {
+        $seq_meta{$2} = $1;
+      }
+
+    } elsif ($header) {
+      $seqstring .= $line
+    }
+  }
+
+  if ($header) {
+    $seqstring =~ s/\s*//g;    ## do not allow spaces in sequence
+    push( @headers,   $header );
+    push( @sequences, $seqstring );
+    push( @meta,      +{%seq_meta} );
+    $seqstring = "";
+    %seq_meta  = ();
+  }
+  return ( \@headers, \@sequences, \@meta );
+}
+
+
+1;
+
+#############################################################################
+# Programming description
+#############################################################################
+#	Substructure graphs for machine learning with Fabrizio
+#	-------------------------------------------------------
+#
+#	(1) Parameters (RNAshapes parameter):
+#		- Window sizes [] (-w)
+#		- window shift size (-W)
+#	- calculate structure probabilities (-r)
+#		- energy range kcal/mol (-e) OR energy relative percentage (%) to MFE (-c)
+#		- shape type 1 to 5 (-t)
+#		- ignore unstable substructures (-u)
+#		- max shreps
+#
+#
+#	(2) For each sequence, generate one graph/file that consists of all windows. The general format for one graph is as follows:
+#
+#	t # seq_id parameters
+#	v graph_index nt_type window_size window_centre abs_seq_pos
+#	...
+#	e m m+1 > 1 (backbone)
+#	...
+#	e base_i_graph_index base_j_graph_index s shrep_e shrep_p ...
+#
+#	For each window (subgraph) we create a subgraph (of the subgraph) for each substructure.
+#	We have a running index (gi) for each subgraph. All vertex and edge indices of the subgraph add
+#	the running graph index to the actual window position. For example
+#
+#
+#	Sequence: AAACC CUUUG GG
+#		  01234 56789 01
+#
+#	Window=10 substructure1 = (((...))). centre 5.5
+#
+#	v 0 A 10 5.5 0
+#	v 1 A 10 5.5 1
+#	v 2 A 10 5.5 2
+#	v 3 C 10 5.5 3
+#	v 4 C 10 5.5 4
+#	v 5 C 10 5.5 5
+#	v 6 U 10 5.5 6
+#	v 7 U 10 5.5 7
+#	v 8 U 10 5.5 8
+#	v 9 G 10 5.5 9
+#	e 0 1 > 1
+#	e 1 2 > 1
+#	e 2 3 > 1
+#	e 3 4 > 1
+#	e 4 5 > 1
+#	e 5 6 > 1
+#	e 6 7 > 1
+#	e 7 8 > 1
+#	e 8 9 > 1
+#	e 0 8 s -15.0 0.1223
+#	e 1 7 s -15.0 0.1223
+#	e 2 6 s -15.0 0.1223
+#
+#	gi = 9+1 = 10
+#
+#	Window = 10 substructure2 = .(((...))) centre 6.5
+#	v 10 A 10 6.5 2
+#	v 11 C 10 6.5 3
+#	v 12 C 10 6.5 4
+#	v 13 C 10 6.5 5
+#	v 14 U 10 6.5 6
+#	v 15 U 10 6.5 7
+#	v 16 U 10 6.5 8
+#	v 17 G 10 6.5 9
+#	v 18 G 10 6.5 10
+#	v 19 G 10 6.5 11
+#	e 10 11 > 1
+#	e 11 12 > 1
+#	e 12 13 > 1
+#	e 13 14 > 1
+#	e 14 15 > 1
+#	e 15 16 > 1
+#	e 16 17 > 1
+#	e 17 18 > 1
+#	e 18 19 > 1
+#	e 11 19 s -17.0 0.156
+#	e 12 18 s -17.0 0.156
+#	e 13 17 s -17.0 0.156
+#
+#	gi = 19+1 = 20
+#
+#
+#
+#	Write one perl script to create graphs for a set of sequences, fasta2shrep_gspan.pl.
+#
+#	INPUT:
+#		-f fasta file with all sequences to compute
+#		parameters as above
+#
+#	OUTPUT:
+#		one file per sequence that contains graph called seq_id.gspan
+#
+#	(1) for each window size call RNAshapes and write to a tmp file
+#	(2) parse result of RNAshapes (catch RNAshapes error - sequence too long?) check for max shreps.
+#	(3) convert RNAshapes result to subgraph -> write to file (readpipe) look at efficiency and errors
+#	(4) repeat (1) to (3) for each sequence
+
+# ABSTRACT STRUCTURE GRAPH
+# To each shrep graph (s #) add an abstract structure graph with special
+# abstract relations. For example, to add the abstract structure graph to
+# the previous shrep structure with gi numbers from 10 to 19, we first
+# identify the abstract shape, i.e. EL-S-HL and then create nodes (labelled
+# with a given name-space followed the actual node name and separated by a
+# hash) and edges for this graph as follows:
+# NOTE: at the moment we add the adjacent base-pairs to the loop definitions
+# but these can be removed if necessary in the future
+# v 20 abstruct#EL
+# v 21 abstruct#S
+# v 22 abstruct#HL
+# e 20 21 n
+# e 21 22 n
+# v 23 ^R
+# e 20 23 ^r
+# e 23 10 @r
+# e 23 11 @r
+# e 23 19 @r
+# v 24 ^R
+# e 21 24 ^r
+# e 24 11 @r
+# e 24 12 @r
+# e 24 13 @r
+# e 24 17 @r
+# e 24 18 @r
+# e 24 19 @r
+# v 25 ^R
+# e 22 25 ^r
+# e 25 13 @r
+# e 25 14 @r
+# e 25 15 @r
+# e 25 16 @r
+# e 25 17 @r
+
+# gi = 25+1 = 26
+
+# ANOTATION FILE
+# This is a file that labels a sequence region with a given annotation. In one
+# file we can have annotations within different name-spaces, for example target
+# sites predicted with different tools.
+# File format is a tab-delimited file with the following columns:
+# SEQID (same ID as in the fasta file header)
+# i (left positon of region)
+# j (right position of region, if one position i=j)
+# NAMESPACE#LABEL (give each annotation type one name-space and choose a label depending on the task)
+
+# E.g we have 2 different miRNAs and we predict the target-sites with (1) IntaRNA and (2) TargetSearch,
+# then this could be IntaRNA#miR1, IntaRNA#miR2, TargetSearch#miR1, and TargetSearch#miR2.
+# All labels can be used more than once for one or more sequences.
+# all labels with the same namespace and the same sequence ID are grouped into
+# one abstract graph, according to the order of i.
+# Example
+# SEQID 	i 	j 	NAMESPACE#LABEL
+# s1	10 	20	IntaRNA#miR1
+# s1	54	60	IntaRNA#miR2
+# s1	15	25	TargetSearch#miR1
+# s1	54	60 	TargetSearch#miR2
+# s2 ...
+
+# We create connected component per sequence ID per name-space,
+# so that all labels with
+# the same namespace are grouped together (for one sequence) as follows:
+# v 26 IntaRNA#miR1
+# v 27 IntaRNA#miR2
+# e 26 27 n
+# then create R nodes and ^r and @r relations as before (from i to j)
+
+# If a sequence graph option is given, then these graphs (u #) have to
+# be labelled with this annotation
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/readme.rst	Tue Oct 29 11:10:19 2013 -0400
@@ -0,0 +1,48 @@
+Galaxy wrapper for fasta2shrep_gspan
+====================================
+
+This wrapper is copyright 2013 by Björn Grüning.
+
+This is a wrapper for the command line tool of fasta2shrep_gspan.pl.
+
+
+
+
+Installation
+============
+
+The recommended installation is by means of the toolshed_.
+If you need to install it manually here is a short introduction.
+
+.. _toolshed: http://toolshed.g2.bx.psu.edu/view/bgruening/sequence2gspan
+
+
+
+History
+=======
+
+- v0.1: Initial public release
+
+
+=============
+Licence (MIT)
+=============
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/repository_dependencies.xml	Tue Oct 29 11:10:19 2013 -0400
@@ -0,0 +1,4 @@
+<?xml version="1.0"?>
+<repositories description="This requires the EDeN datatypes.">
+    <repository changeset_revision="59b3b6ce10bb" name="eden_toolbox" owner="bgruening" toolshed="http://testtoolshed.g2.bx.psu.edu" />
+</repositories>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sequence2gspan.xml	Tue Oct 29 11:10:19 2013 -0400
@@ -0,0 +1,167 @@
+<tool id="sequence2gpsan_rnashapes" name="Sequence to gspan" version="0.3">
+    <description>converter using RNAshapes</description>
+    <parallelism method="multi" split_inputs="infile" split_mode="to_size" split_size="10" shared_inputs="" merge_outputs="outfile"></parallelism>
+    <requirements>
+        <requirement type="package" version="2.1.6">rnashapes</requirement>
+    </requirements>
+    <command interpreter="perl">
+
+        fasta2shrep_gspan.pl
+            --fasta $infile
+
+            #if $annotate_infile:
+                -annotate $annotate_infile
+            #end if
+
+            -shift $shift
+            $cue
+            $stack
+            #if $energy_opts.energy_opts_selector == 'energy':
+                -e $energy_opts.e
+            #elif $energy_opts.energy_opts_selector == 'rel_energy':
+                -c $energy_opts.c
+            #end if
+
+            #if $structure_sampling_opts.structure_sampling_opts_selector == 'on':
+                -i $structure_sampling_opts.i
+                -sample_len $structure_sampling_opts.sample_len
+            #end if
+
+            $q
+            -Tp $Tp
+            $seq_graph_win
+            $seq_graph_t
+            $seq_graph_alph
+            $abstr
+            $nostr
+            $vp
+            $ignore_header
+            -M $M
+            $u
+            $r
+
+            -stdout
+
+            > $output
+
+    </command>
+    <inputs>
+        <param name="ifile" type="data" format="fasta" label="Nucleotide Sequence in FASTA format"/>
+
+        <!-- todo repeat tag, normally only 3 windows are specified-->
+        <param name="wins" type="text" value="" label="A list of window sizes to use"
+            help="if none are given, then the entire sequence is taken with no windows. Each window > 1 required! Example: 50,100,200. (-wins)" />
+        <param name="t" type="text" value="3=0" label="The shape type (RNAshapes types from 1-5)"
+            help="With the list format, the shape level can be changed for different window length '4=100' means that shape level 4 is used from length 100nt (window length) The first given length has to be 0! Not continuous given levels are allowed! (-t)" />
+
+
+        <param name="shift" type="integer" value="1" label="The shift of the window, relative to the window size given in percent"
+            help="Example 20 % of a window 150 would result in a step size of 30 nt. It is a relative parameter, as you can give different window sizes. Default shift of 1 nt. (-shift)">
+            <validator type="in_range" min="0" max="100" />
+        </param>
+        <param name="cue" type="boolean" label="Crop unpaired ends" truevalue="-cue" falsevalue="" checked="false"
+            help="If you give this flag, then the unpaired ends of each single structure are ignored. E.g. the structure ...(((...))).. becomes just (((...))). (-cue)" />
+
+        <param name="stack" type="boolean" label="Adds stacking information to graphs" truevalue="-stack" falsevalue="" checked="false"
+            help="This adds an additional vertex (type P) for each pair of stacked base-pairs and four edges (type p) from each of the involved bases to the new vertex. (-stack)" />
+
+        <conditional name="energy_opts">
+            <param name="energy_opts_selector" type="select" label="Energy calculation mode">
+                <option value="rel_energy">Relative energy range, i.e. percentage (%) of MFE energy (RNAshapes)</option>
+                <option value="energy">Energy range in kcal/mol (RNAshapes)</option>
+                <option value="">None</option>
+            </param>
+            <when value="" />
+            <when value="energy">
+                <param name="e" type="float" value="5.0" label="Energy range in kcal/mol (RNAshapes)" help="-e">
+                    <validator type="in_range" min="0.0" />
+                </param>
+            </when>
+            <when value="rel_energy">
+                <param name="c" type="float" value="10" label="Relative energy range, i.e. percentage (%) of MFE energy (RNAshapes)" help="(-c)">
+                    <validator type="in_range" min="0.0" />
+                </param>
+            </when>
+        </conditional>
+        <param name="M" type="integer" value="0" label="Max number of shreps that should be taken per window" help="0 means selects all shreps. (-M)">
+            <validator type="in_range" min="0" />
+        </param>
+        <param name="u" type="boolean" label="Ignore unstable RNAshapes" truevalue="-u" falsevalue="" checked="false"
+            help="This option filters out closed structures with positive free energy. (-u)" />
+
+        <param name="r" type="boolean" label="Calculate structure probabilities for RNAshapes" help="(-r)" truevalue="-r" falsevalue="" checked="false" />
+
+
+        <conditional name="structure_sampling_opts">
+            <param name="structure_sampling_opts_selector" type="select" label="Structure Sampling">
+                <option value="on">On</option>
+                <option value="off">Off</option>
+            </param>
+            <when value="on">
+                <param name="i" type="integer" value="1" label="Number of sampling iterations" help="(-i)">
+                    <validator type="in_range" min="1" />
+                </param>
+                <param name="smaple_len" type="integer" value="0" label="Sampling is only used for seqs/windows >= given length" help="Default 0 (sample all length). (-sample-len)">
+                    <validator type="in_range" min="0" />
+                </param>
+            </when>
+            <when value="off" />
+        </conditional>
+
+        <param name="q" type="boolean" label="Turn on shape probabilities for RNAshapes, no sampling mode allowed" help="(-q)" truevalue="-q" falsevalue="" checked="false" />
+        <param name="Tp" type="float" value="0.001" label="Filter cutoff for shape probabilities, applied before -M filter!" help="(-Tp)">
+            <validator type="in_range" min="0.0" />
+        </param>
+
+        <param name="seq_graph_win" type="boolean" label="Add for each window a graph which contains no structure"
+            help="(-seq-graph-win)"
+            truevalue="-seq-graph-win" falsevalue="" checked="false" />
+        <param name="seq_graph_t" type="boolean" label="add for each 't #' a graph which contains no structure"
+            help="(-seq-graph-t)"
+            truevalue="-seq-graph-t" falsevalue="" checked="false" />
+        <param name="seq_graph_alph" type="boolean" label="Change the alphabet of unstructured graphs"
+            help="(-seq-graph-alph)"
+            truevalue="-seq-graph-alph" falsevalue="" checked="false" />
+
+        <param format="tabular" name="annotate_infile" type="data" optional="True" label="A file with annotations to be added as abstract graphs"
+            help="The format is has the following TAB-delimited columns: SEQID, START, END, NAMESPACE#LABEL. Labels with the same name-space and SEQID form connected components, which is a sequence of label vertices ordered by the START position in the sequence."/>
+
+        <param name="abstr" type="boolean" label="Add abstract structure graphs to the single shrep graph instances"
+            help="(-abstr)"
+            truevalue="-abstr" falsevalue="" checked="false" />
+        <param name="nostr" type="boolean" label="Calculate no structures, only add sequence information"
+            help="-seq-graph-win AND/OR -seq-graph-t are required (-nostr)"
+            truevalue="-nostr" falsevalue="" checked="false" />
+
+        <!--
+            -match-shape    SHAPE
+            all seqs/windows will be constraint folded into that shape via
+            RNAshapes (if structure is given in another way this struct will be kept),
+            if this shape is not possible within given energy range, produce a
+            specific t graph with only one vertex 'X'. By this the instance
+            becomes very unsimilar to all other graphs (for knn)
+        -->
+        <param name="vp" type="boolean" label="Enable graph computation with viewpoints"
+            help="svmsgdnspdk will center on those nucleotides that are given via capital letters and ignore those given as lowercase letters (-vp)"
+            truevalue="-vp" falsevalue="" checked="false" />
+
+        <param name="ignore_header" type="boolean" label="Don't write fasta id part after first space to gspan" help="(-ignore-header)" truevalue="-ignore-header" falsevalue="" checked="false" />
+
+    </inputs>
+    <outputs>
+        <data format="gspan" name="output" label="${tool.name} on ${on_string}" />
+    </outputs>
+    <tests>
+    </tests>
+    <help>
+
+**What it does**
+
+
+
+**References**
+
+Sita et al...
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Tue Oct 29 11:10:19 2013 -0400
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="rnashapes" version="2.1.6">
+        <repository changeset_revision="e9a43f76591a" name="package_rnashapes_2_1_6" owner="rnateam" toolshed="http://testtoolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>