Mercurial > repos > bgruening > sequence2gspan
changeset 0:b01beb170290 draft default tip
Uploaded
author | bgruening |
---|---|
date | Tue, 29 Oct 2013 11:10:19 -0400 |
parents | |
children | |
files | fasta2shrep_gspan.pl readme.rst repository_dependencies.xml sequence2gspan.xml tool_dependencies.xml |
diffstat | 5 files changed, 2704 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fasta2shrep_gspan.pl Tue Oct 29 11:10:19 2013 -0400 @@ -0,0 +1,2479 @@ +#!/usr/bin/perl +#use feature ':5.10'; +use File::Basename; +use lib dirname($0); # search skript directory for modules +use strict 'vars'; +use warnings; +use Getopt::Long; +use Pod::Usage; +#use List::Util qw/ min max /; + +#use StructureLibrary::RNAtoolsConfig; +#use StructureLibrary::Sequence; + +use Cwd qw(getcwd abs_path); +use File::Temp qw(tempdir); +use File::Copy; +use POSIX qw(ceil); + +use FindBin; +use lib "$FindBin::Bin"; +#use GraphClust; + +#use vars qw(%CONFIG); +#*CONFIG = \%GraphClust::CONFIG; + +=head1 NAME + + +fasta2shrep_gspan.pl -fasta mysequences.fasta -wins "50,100,150,200" -shift 5 -M 8 + +=head1 SYNOPSIS + +Options: + + HELP + -help brief help message + -man full documentation + + COMPULSORY + -fasta <STRING> e.g. "sequence.fasta" + All sequences in fasta format. + + OPTIONS + -wins [INTEGER] e.g. "50,100,200" + A list of window sizes to use. + If none are given (empty string ''), then the entire sequence is + taken with no windows. Each window > 1 required! + -shift <INTEGER> e.g. 20 + The shift of the window, relative to the window size given in + percent. So you give which percent of the window size shall be + used for the shift. Of course the shift is rounded down to the + nearest whole number. + Example 20 % of a window 150 would result in a step size of 30 nt. + It is a relative parameter, as you can give different window sizes. + If you do not give this parameter there is a default shift of 1 nt. + -cue Crop unpaired ends. + If you give this flag, then the unpaired ends of each + single structure are ignored. E.g. the structure + ...(((...))).. becomes just (((...))) + -stack Adds stacking information to graphs. This adds an additional + vertex (type P) for each pair of stacked base-pairs and four edges + (type p) from each of the involved bases to the new vertex. + -e <FLOAT> e.g. 5.0 + Energy range in kcal/mol (RNAshapes) + Use only one of -e and -c! + -c <INTEGER> e.g. 10 + Relative energy range, i.e. percentage (%) of MFE energy (RNAshapes) + Use only one of -e and -c! + -t <INTEGER> [1-5] e.g. 3 OR "3=0,4=100,5=200" + The shape type (RNAshapes). Default is 3. + With the list format, the shape level can be changed for different window length + "4=100" means that shape level 4 is used from length 100nt (window length) + The first given length has to be 0! Not continuous given levels are allowed! + -M <INTEGER> e.g. 10 + Max number of shreps that should be taken per window. + -u Ignore unstable structures (RNAshapes). + This option filters out closed structures with positive free energy. + -r Calculate structure probabilities for shreps (RNAshapes) + -i <INT> e.g. 10 + Turn on structure sampling and gives number of sampling iterations. + Default no sampling (i=0) + -sample-len <INT> e.g. 100 + Only in sampling mode: Sampling is only used for seqs/windows >= given length, + Default: sample all lengths (0), if -i > 0 + -q Turn on shape probabilities for RNAshapes, no sampling mode allowed + -Tp <FLOAT> e.g 0.001 + Filter cutoff for shape probabilities, applied before -M filter! + -seq-graph-win add for each window a graph which contains no structure + -seq-graph-t add for each 't #' a graph which contains no structure + -seq-graph-alph change the alphabet of unstructured graphs + -annotate <STRING> annotation.tab + A file with annotations to be added as abstract graphs + on the sequence leven (if given) and on the structure + (SHREP) level. The format is has the following TAB-delimited + columns: SEQID, START, END, NAMESPACE#LABEL. + Labels with the same name-space and SEQID form connected + components, which is a sequence of label vertices ordered + by the START position in the sequence. + -abstr Add abstract structure graphs to the single shrep graph + instances. + -nostr Calculate no structures, only add sequence information, + if this is given, then -seq-graph-win AND/OR -seq-graph-t + are required. + -match-shape <SHAPE> + all seqs/windows will be constraint folded into that shape via + RNAshapes (if structure is given in another way this struct will be kept), + if this shape is not possible within given energy range, produce a + specific t graph with only one vertex 'X'. By this the instance + becomes very unsimilar to all other graphs (for knn) + -vp enable graph computation with viewpoints: + svmsgdnspdk will center on those nucleotides that are given + via capital letters and ignore those given as lowercase letters + -tmp <STRING> e.g. "/scratch/1/sita/tmp" + A directory for writing temporary files + -o <STRING> e.g. "ProjectX/MySequences/GSPAN/" + Output directory for gspan files containing graphs. + -group <INTEGER> e.g. 5 + Combine/group that number of input seqs into 1 gspan file + output name is then '<INT>.group.gspan.bz2' + -sge Use SGE cluster for each sequence separately + -sge-logdir stdout directory for SGE call + -sge-errdir sdterr directory for SGE call + -stdout send graphs to stdout instead of writing to files + -ignore-header don't write fasta id part after first space to gspan + -debug additional debug output + + + DEFAULT VALUES + -wins "" + -shift 1 nt + -c 10 + -t 3 + -M 0 # selects all shreps + -tmp "/var/tmp/fasta2shrep" + -o "CURRENT_DIR/GSPAN/" + + SGE mode + -sge-logdir "CURRENT_DIR/GSPAN/SGE_log" + -sge-errdir "CURRENT_DIR/GSPAN/SGE_log" + -task-id <NUM> + +=head1 DESCRIPTION + +=cut + +############################################################################### +# end handler for temporary directory +# adds an error handler that deletes the directory in case of error +# SIGUSR{1/2} are sent by the sge prior to the uncatchable SIGKILL if the +# option -notify was set +############################################################################### +$SIG{'INT'} = 'end_handler'; +$SIG{'TERM'} = 'end_handler'; +$SIG{'ABRT'} = 'end_handler'; +$SIG{'USR1'} = 'end_handler'; +$SIG{'USR2'} = 'end_handler'; + +sub end_handler { + print STDERR "signal ", $_[0], " caught, cleaning up temporary files\n"; + + # change into home directory. deletion of the temporary directory will + # fail if it is the current working directory + chdir(); + File::Temp::cleanup(); + die(); +} + +############################################################################### +# PARSE COMMAND LINE OPTIONS +############################################################################### + +# command line options +my ( $i_help, $i_man, $i_debug, $i_fas, $i_wins, $i_shift, $i_crop_unpaired_ends, + $i_r, $i_e, $i_c, $i_t, $i_u, $i_M, $i_o, $i_sge, $i_jobid, $i_tmp, + $i_ignore_seq_header, $i_stacks, $i_stdout, $i_q, $i_T, $i_i, + $i_sample_min_length, $i_sge_logDir, $i_sge_errDir, $i_groupsize, $i_annotate, + $i_abstr, $i_no_structure, $i_vp, $i_matchShape, $i_rnashapes_binpath ); + +my ( $i_add_seq_graph_win, $i_add_seq_graph_t, $i_change_seq_graph_alph ); + +my $options = GetOptions( + "help" => \$i_help, + "man" => \$i_man, + "debug" => \$i_debug, + "fasta=s" => \$i_fas, + "wins=s" => \$i_wins, + "shift=f" => \$i_shift, + "cue" => \$i_crop_unpaired_ends, + "stack" => \$i_stacks, + "r" => \$i_r, + "e=f" => \$i_e, + "c=i" => \$i_c, + "t=s" => \$i_t, + "u" => \$i_u, + "M=i" => \$i_M, + "tmp=s" => \$i_tmp, + "o=s" => \$i_o, + "i=i" => \$i_i, + "sample-len=i" => \$i_sample_min_length, + "q" => \$i_q, + "Tp=f" => \$i_T, + "seq-graph-win" => \$i_add_seq_graph_win, + "seq-graph-t" => \$i_add_seq_graph_t, + "-seq-graph-alph" => \$i_change_seq_graph_alph, + "sge" => \$i_sge, + "task-id=i" => \$i_jobid, + "ignore-header" => \$i_ignore_seq_header, + "stdout" => \$i_stdout, + "sge-logdir=s" => \$i_sge_logDir, + "sge-errdir=s" => \$i_sge_errDir, + "group=i" => \$i_groupsize, + "annotate=s" => \$i_annotate, + "abstr" => \$i_abstr, + "nostr" => \$i_no_structure, + "vp" => \$i_vp, + "match-shape=s" => \$i_matchShape, + "rnashapes-bin=s" => \$i_rnashapes_binpath, +); + +pod2usage( -exitstatus => 1, -verbose => 1 ) if $i_help; +pod2usage( -exitstatus => 0, -verbose => 2 ) if $i_man; +($options) or pod2usage(2); + +# check compulsory options +($i_fas) or pod2usage("Error: the option -fasta is compulsory!\n"); +( -e $i_fas ) or pod2usage("Error: no such file - $i_fas!\n"); +$i_fas = abs_path($i_fas); + +# check other options and set default values +pod2usage("Error: either -e OR -c can be given, but NOT both!\n") if ( $i_e && $i_c ); +( $i_e or $i_c ) or $i_c = 10; # set -c 10, if neither -e or -c are given +($i_M) or $i_M = 0; # max number of shreps is 0 (=means take all computed) +($i_i) or $i_i = 0; # default no sampling else sampling iterations +($i_sample_min_length) or $i_sample_min_length = 0; +pod2usage("\nError: use --sample-len <INT> only with -i <1..INT> !\n") if ( not $i_i and $i_sample_min_length ); +($i_T) or $i_T = 0; +($i_q) or $i_q = 0; +pod2usage("\nError: Sampling (-i) not possible with shape probabilities (-q)!\n") if ( $i_i and $i_q ); +($i_add_seq_graph_win) or $i_add_seq_graph_win = 0; +($i_add_seq_graph_t) or $i_add_seq_graph_t = 0; +($i_change_seq_graph_alph) or $i_change_seq_graph_alph = 0; +($i_no_structure) or $i_no_structure = 0; + +if ($i_change_seq_graph_alph) { + ($i_add_seq_graph_t) or ($i_add_seq_graph_win) or pod2usage( "Error: " . +"When giving the parameter -seq-graph-alph, then either -seq-graph-t or -seq-graph-win" . + " must also be given!\n" ); +} +( -e $i_annotate ) or pod2usage("Error: no such file - $i_annotate!\n") + if ($i_annotate); + +$i_add_seq_graph_t = 1 if ($i_no_structure); + +($i_t) or $i_t = 3; # default abstraction type is 3 +my $change_shape_level = 0; +my @level_lens = ( -1, -1, -1, -1, -1 ); ## array_idx-1=shape level, value=start length of this level + +if ( $i_t !~ /^\d+$/ ) { + + my @t_minlens = split( ",", $i_t ); ## -t "3=100,4=200" + + foreach my $idx ( 1 .. @t_minlens ) { + + my $level = $t_minlens[ $idx - 1 ]; ## $level = "3=100" + + die "$level Wrong -t format! Example: -t 3=0,4=100,5=200\n" if ( $level !~ /^\d+\=\d+$/ ); + my @lev_len = split( "=", $level ); ## $level = "3=100" + + die "Wrong -t format! First level given needs to be length 0! Example: -t 3=0,4=100,5=200\n" if ( $idx == 1 && $lev_len[1] != 0 ); + die "Wrong -t format! Only level 1-5 allowed! Example: -t 3=0,4=100,5=200\n" if ( $lev_len[0] < 1 or $lev_len[0] > 5 ); + die "Wrong -t format! Length >= 0 expected! Example: -t 3=0,4=100,5=200\n" if ( $lev_len[1] < 0 ); + $change_shape_level = 1; + $level_lens[ $lev_len[0] - 1 ] = $lev_len[1]; + } + + ($i_debug) and print STDERR "i_t = $i_t - change $change_shape_level - shape level lengths#" . join( ":", @level_lens ) . "#" . join( ":", @t_minlens ) . "#\n"; +} + +## checks for match-shape +if ($i_matchShape){ + die "Please provide correct match shape string like '[]'! Exit...\n\n" + if ($i_matchShape !~ /^[\[\]_]+$/); + ## RNAshapes prodices anyway only 1 structure, no suboptimal structs in match-shape folding + $i_M = 1; +} + +my $CURRDIR = getcwd; + +# set up tmp directory +# default tmp is /var/tmp, usually not NFS mounted! +( defined $i_tmp ) or $i_tmp = '/var/tmp/'; +my $tmp_template = 'fasta2shrep-XXXXXX'; + +# CLEANUP => 1 : automatically delete at exit +$i_tmp = tempdir( $tmp_template, DIR => $i_tmp, CLEANUP => 1 ); + +# create GSPAN directory when not printing to stdout +if ( not $i_stdout ) { + if ($i_o) { + ( -e $i_o ) or system("mkdir -p $i_o"); + } else { + system("mkdir -p GSPAN"); + $i_o = $CURRDIR . "/GSPAN/"; + } +} + +############################################################################### +# GLOBAL VARIABLES +############################################################################### +$i_rnashapes_binpath or $i_rnashapes_binpath = "RNAshapes"; +die "Please provide full RNAshapes path (e.g. /usr/bin/RNAshapes) !\n" if (!-f "$i_rnashapes_binpath"); +my $rnashapes_loc = $i_rnashapes_binpath; +if ( !$rnashapes_loc || !-e $rnashapes_loc ) { + my $loc = `which RNAshapes`; + chomp($loc); + die "\nCannot find RNAshapes binary! Exit...\n\n" if ( !$loc ); + die "\nCannot find RNAshapes binary! Exit...\n\n" if ( !-e $loc ); + $rnashapes_loc = $loc; +} + +my @WINDOWS = (); +@WINDOWS = split( ",", $i_wins ) if ($i_wins); +my $globalFolding; +$globalFolding = 1 unless @WINDOWS; +my $CURRUSER = getlogin; +my $SEQNO = 0; # used to id the sequences +my $GSPANNO = 0; # used for gspan filenames +# minimum length of sequence that will work with RNAshapes +# sequences with one or two nucleotides will be restricted to sequence-only graphs +# this ensures that no sequences are skipped and external info kept synchronized +my $GSPAN_SEQ_MINLEN = 3; + +# name spaces +my $ABSTRUCT = "AS"; +############################################################################### +# EXECUTION CODE +############################################################################### + +# read fasta file into hash +my ( $headers_aref, $sequences_aref, $metainfo_aref ) = read_fasta_with_nonunique_headers_meta($i_fas); + +# call script again on the sge cluster in a batch job +if ($i_sge) { + my $SCRIPTNAME = $0; + my $CLUSTERSUBMIT = $FindBin::Bin."/fasta2shrep_gspan.sge"; + + die "Cannot find SGE submit script $CLUSTERSUBMIT! Exit...\n\n" if ( !-e $CLUSTERSUBMIT ); + + my $SGE_jobs = @{$sequences_aref}; + $SGE_jobs = ceil( @{$sequences_aref} / $i_groupsize ) if ($i_groupsize); + + my $params = "-fasta $i_fas "; + $params .= "-wins $i_wins " if ($i_wins); + $params .= "-shift $i_shift " if ($i_shift); + $params .= "-e $i_e " if ($i_e); + $params .= "-c $i_c " if ($i_c); + $params .= "-t $i_t " if ($i_t); + $params .= "-u " if ($i_u); + $params .= "-r " if ($i_r); + $params .= "-M $i_M " if ($i_M); + $params .= "-o $i_o " if ($i_o); + $params .= "-i $i_i " if ($i_i); + $params .= "--sample-len $i_sample_min_length" if ($i_sample_min_length); + $params .= "-q " if ($i_q); + $params .= "-Tp $i_T " if ($i_T); + $params .= "--seq-grap-win " if ($i_add_seq_graph_win); + $params .= "--seq-graph-t " if ($i_add_seq_graph_t); + $params .= "--seq-graph-alph " if ($i_change_seq_graph_alph); + $params .= "-ignore-header " if ($i_ignore_seq_header); + $params .= "-cue " if ($i_crop_unpaired_ends); + $params .= "-stack " if ($i_stacks); + $params .= "--group $i_groupsize " if ($i_groupsize); + $params .= "-annotate $i_annotate" if ($i_annotate); + $params .= "-abstr" if ($i_abstr); + $params .= "-nostr " if ($i_no_structure); + $params .= "--debug " if ($i_debug); + $params .= "-vp " if ($i_vp); + $params .= "--match-shape $i_matchShape " if ($i_matchShape); + + print "used script:" . $SCRIPTNAME . "\n"; + print "used submit script:" . $CLUSTERSUBMIT . "\n"; + + $i_sge_logDir = "$i_o/SGE_log" if ( !$i_sge_logDir ); + mkdir($i_sge_logDir); + + $i_sge_errDir = $i_sge_logDir if ( !$i_sge_errDir ); + mkdir($i_sge_errDir); + + my $ssh = 1; ## can be used for debug shell script call + if ($ssh) { + system( "ssh $CURRUSER\@biui.informatik.uni-freiburg.de " + . "'export SGE_ROOT=/opt/sge-6.0/; cd $CURRDIR; " + . "/opt/sge-6.0/bin/lx24-amd64/qsub -t 1-$SGE_jobs -o $i_sge_errDir/ -e $i_sge_errDir/ " + . "$CLUSTERSUBMIT $CURRDIR $SCRIPTNAME \"$params\" ' " ); + } else { + system("$CLUSTERSUBMIT $CURRDIR $SCRIPTNAME '$params' "); + } + exit; +} + +## compute shreps and gspan, either local or after SGE submission + +# TODO read and process annotations + +my @used_seq_headers; +my @used_seqs; +my @used_meta; +my $group_idx; + +if ($i_jobid) { + ## just process the one sequence as given by the jobid number + my $used_grouping = 1; ## if no group is given, make 1 seq per job = group=1 + $used_grouping = $i_groupsize if ($i_groupsize); + my $st = ( $i_jobid - 1 ) * $used_grouping + 1; + my $end = $st + $used_grouping - 1; + $end = @{$sequences_aref} if ( $end > @{$sequences_aref} ); + + foreach my $idx ( $st .. $end ) { + push( @used_seq_headers, $headers_aref->[ $idx - 1 ] ); + push( @used_seqs, $sequences_aref->[ $idx - 1 ] ); + push( @used_meta, $metainfo_aref->[ $idx - 1 ] ); + } + + $group_idx = $i_jobid; + + #($i_debug) and print STDERR "st $st end $end gr $group_idx job $i_jobid gs $i_groupsize\n"; + + $GSPANNO = $i_jobid-1 if (!$i_groupsize); + +} else { + ## process all sequences at once + @used_seq_headers = @{$headers_aref}; + @used_seqs = @{$sequences_aref}; + @used_meta = @{$metainfo_aref}; +} + +my $out; +my $gspanfile; +my $out_no_match_shape; + +if ($i_matchShape && !$i_stdout && !$i_groupsize){ + open($out_no_match_shape,">$i_o/fasta2shrep.no_match"); +} + +# for each sequence in the fasta file +while ( my $seq = shift @used_seqs ) { + my $tmp_header = shift @used_seq_headers; + my $tmp_meta = shift @used_meta; + + my ( $seq_id, $seq_header ) = ( $tmp_header =~ /(\S+)\s*([\S*\s*]*)/ ); + $i_ignore_seq_header and $seq_header = ''; + + my $seq_fasta = generate_single_fasta_from_sequence_X( $seq_id, $seq ); + my $seq_len = length($seq); + + # only print sequence graphs for sequences below this threshold + my $no_structure_override = ($seq_len < $GSPAN_SEQ_MINLEN) ? 1 : 0; + + $GSPANNO++; + + # set outstream for gspan output to correct file/STDOUT + if ($i_stdout) { + $out = \*STDOUT; + } elsif ( !$i_groupsize ) { + $gspanfile = $i_tmp . '/' . $GSPANNO . '.gspan'; + open( $out, "| bzip2 -f > $gspanfile.bz2" ); + } elsif ( ( $GSPANNO - 1 ) % $i_groupsize == 0 ) { + + if ( $GSPANNO > 1 ) { + close($out); + system("mv $gspanfile.bz2 $i_o/$group_idx.group.gspan.bz2"); + if ($out_no_match_shape){ + close($out_no_match_shape); + system("mv $gspanfile.no_match $i_o/$group_idx.group.gspan.no_match"); + } + } + + if ( !$i_jobid ) { + $group_idx = int( ( $GSPANNO - 1 ) / $i_groupsize ) + 1; + } + + $gspanfile = "$i_tmp/$group_idx.group.gspan"; + open( $out, "| bzip2 -f > $gspanfile.bz2" ); + open( $out_no_match_shape, ">$gspanfile.no_match" ) if ($out_no_match_shape); + } + + ## do not use folding windows in special cases + if ( $globalFolding || $i_no_structure || $no_structure_override) { + @WINDOWS = (); + push( @WINDOWS, $seq_len ); + } + + ##check win sizes for global folding + my @WINDOWS_used = sort { $a <=> $b } @WINDOWS; + foreach my $w_idx ( 0 .. ( @WINDOWS_used - 1 ) ) { + if ( $WINDOWS_used[$w_idx] >= $seq_len && $WINDOWS_used[$w_idx] > 1 ) { + if ( $w_idx < $#WINDOWS_used ) { + @WINDOWS_used = @WINDOWS_used[ 0 .. $w_idx ]; + last; + } + } + } + + ## use seq graph only if no shape folding wanted + $i_add_seq_graph_t = 1 if ($i_no_structure || $no_structure_override); + + ## no shape info in graphheader if we have fixed structure (according LocaRNA handling) + ## use tags #FS or #S for provided structure + my $graph_header; + my @struct_meta = grep { $_ =~ /#FS/ || $_ =~ /#S/ } keys %{$tmp_meta}; + + if (@struct_meta) { + $graph_header = getGraphHeader( $seq_id, $seq_header, \@WINDOWS, $i_shift, $i_e, $i_c, $i_t, $i_u, $i_r, $i_M, $i_crop_unpaired_ends, $i_i, $i_sample_min_length, $i_q, $i_T, $seq_len, 1 ); + } else { + $graph_header = getGraphHeader( $seq_id, $seq_header, \@WINDOWS, $i_shift, $i_e, $i_c, $i_t, $i_u, $i_r, $i_M, $i_crop_unpaired_ends, $i_i, $i_sample_min_length, $i_q, $i_T, $seq_len, $i_no_structure || $no_structure_override ); + } + + print $out $graph_header; + + my $gi = 0; # current graph index + + ## add graph with no structure at all depending on $i_add_seq_graph_t + if ($i_add_seq_graph_t) { + + ($gi) = convertSeqWindow( $seq, $seq_len, 1, $gi, $graph_header, $out, $i_annotate, $seq ); + + } + + ## encode fixed structure only if provided and structures wanted in general + if ( @struct_meta && !$i_no_structure && !$no_structure_override) { + + my $struct_meta = "#FS"; + $struct_meta = "#S" if ( !exists $tmp_meta->{$struct_meta} ); + + my $seq_shrep = [ $tmp_meta->{$struct_meta}, "ENERGY", "0.00", "SHAPE", $struct_meta ]; + $gi = convertShapeWindow( [$seq_shrep], $seq, $seq_len, 1, $gi, $out, + $graph_header, $i_annotate, $i_abstr, $i_crop_unpaired_ends, $i_stacks, $seq ); + @WINDOWS_used = (); ## no shape folding if we have a fixed structure + + } + + ## ignore RNAshapes folding if wanted, but do correct file move afterwards + ## (just "next" does not work due to output) + @WINDOWS_used = () if ($i_no_structure or $no_structure_override); + + #for each window size in list + foreach my $win_size (@WINDOWS_used) { + + # calculate shift size from percentage + my $curr_shift = 1; + if ($i_shift) { + $curr_shift = ( $i_shift / 100 ) * $win_size; + $curr_shift = int($curr_shift); #round down + $curr_shift = 1 unless ($curr_shift); # just in case it is 0 + } + ($i_debug) and print STDERR "winsize: $win_size curr_shift: $curr_shift\n"; + ($i_debug) and print STDERR "\nNext: $seq_id\t winsize:$win_size \n"; + + # choose current shape level, depending on $i_t + my $curr_t = 0; + if ($change_shape_level) { + for ( my $i = 0 ; $i < @level_lens ; $i++ ) { + $curr_t = $i + 1 if ( $level_lens[$i] != -1 && ( $level_lens[$i] <= $win_size ) ); + } + ($i_debug) and print STDERR "$win_size curr type $curr_t\n"; + } else { + $curr_t = $i_t; + } + + my $rnashapesoutput_fh; + + # call RNAshapes and write to $rnashapesoutput_fh + $rnashapesoutput_fh = call_RNAshapes( $seq_fasta, $rnashapes_loc, $win_size, + $curr_shift, $i_e, $i_c, $curr_t, $i_u, $i_r, $i_q, $i_T, $i_i, $i_sample_min_length, $seq_len, $i_matchShape ); + + # read RNAshapes output from $rnashapesoutput_fh and write subgraph + # to gspan file + my $gi_old = $gi; + $gi = convert_RNAshapes_output( $rnashapesoutput_fh, $gi, $i_M, $out, $graph_header, + $win_size, $seq_len, $curr_t, $i_annotate, $i_abstr, $i_crop_unpaired_ends, $i_stacks, $seq ); + + ## no (match) shape found at all for this seq + if ($gi == $gi_old+1){ + $gi = convertSeqWindow( "X", 1, 1, $gi, $graph_header, $out, $i_annotate, $seq ); + print $out_no_match_shape $seq_id."\n" if (!$i_stdout && $out_no_match_shape); + } + } ## foreach WINDOW_used + + if ( !$i_stdout && !$i_groupsize ) { + close($out); + move "$gspanfile.bz2", "$i_o/$GSPANNO.gspan.bz2"; + } + + system("rm $seq_fasta"); + +} ## while @used_seqs + +if ($i_groupsize) { + close($out); + system("mv $gspanfile.bz2 $i_o/$group_idx.group.gspan.bz2"); + close($out_no_match_shape) if ($out_no_match_shape); + move "$gspanfile.no_match", "$i_o/$group_idx.group.gspan.no_match" if ($out_no_match_shape); +} elsif ( $out_no_match_shape && !$i_stdout && !$i_groupsize ) { + close($out_no_match_shape); +} + +############################################################################### +# METHODS +############################################################################### + +############################################################################ +# Generates fasta file for a single sequence (one-lined-fasta). This +# fasta is stored in the temp directory and should be deleted at the end. +# Input: +# seq_id : the sequence ID +# seq : the sequence +# +# Output: +# The fasta file name +############################################################################ +sub generate_single_fasta_from_sequence_X { + my ( $seq_id, $seq ) = @_; + + $seq = uc($seq); + $seq =~ tr/T/U/; + $seq =~ s/[^AUCGN]/N/g; + + my $outfas = $i_tmp . "/seq_" . $SEQNO++ . ".fasta"; + my $host = readpipe("hostname"); + open( FAS, ">$outfas" ) or die "$host Cannot open file $outfas! Exit...\n\n"; + print FAS ">$seq_id\n$seq"; + close(FAS); + + return $outfas; +} + +############################################################################ +# RNAshapes is called with the given input or default parameters. +# Input: +# seq_fasta : the sequence fasta file +# rnashapes_location : the location of the installation files for RNAshapes +# win_size : the current window size +# shift : the input parameter -shift +# e : the input parameter -e +# c : the input parameter -c +# t : the input parameter -t +# u : the input parameter -u +# r : the input parameter -r +# +# Output: none +############################################################################ +sub call_RNAshapes { + my ( $seq_fasta, $rnashapes_location, $win_size, $shift, $e, $c, $t, $u, $r, $q, $T, $i, $sample_length, $seqLen, $matchShape ) = @_; + my $FUNCTION = "call_RNAshapes in fasta2shrep_gspan.pl"; + + ($seq_fasta) or die("INPUT ERROR in $FUNCTION: the fasta file is compulsory!\n"); + ($rnashapes_location) or die( "INPUT ERROR in $FUNCTION: the RNAshapes location" . " is compulsory!\n" ); + die "$rnashapes_location does not exist! Exit...\n\n" if ( !-e $rnashapes_location ); + my $call = $rnashapes_location . " -o 1 "; # the output format is of type 1 + $call .= "-q " if ($q); + $call .= "-T $T " if ( $q and $T ); + $call .= "-w $win_size "; + $call .= "-W $shift " if ($i_shift); + die("ERROR in $FUNCTION: Give only one of the options -c or -e (RNAshapes)!\n") + + if ( $e && $c ); + $call .= "-e $e " if ($e); + $call .= "-c $c " if ($c); + $call .= "-t $t " if ($t); + $call .= "-u " if ( $u and not $i ); ## not possible in sampling mode + $call .= "-r " if ($r); + $call .= "-m $matchShape " if ($matchShape); + + ## check is a bit long but : we want to sample if the window is larger than $sample_length and full seq is larger than window or sample_len + ## necessary to do sampling a large window is given, sample_length is shorter than window, but seq is longer than sample_len + $call .= "-i $i -A " if ( not $q and $i and ( $win_size >= $sample_length and ( $win_size <= $seqLen or $seqLen >= $sample_length ) ) ); ## -A is to omit samples and print only combined shape probs + $call .= " < $seq_fasta"; + + ($i_debug) and print STDERR "$seqLen $sample_length $win_size $call\n"; + + open my $rnashapesoutput, "$call |" or die( "ERROR in $FUNCTION: The following call " . "could not be carried out!\n$call\n" ); + + return $rnashapesoutput; +} + +############################################################################ +# The output of RNAshapes for one sequence and one window size is read +# and converted into graph format. +# Input: +# rnashapeoutput : filehandle for RNAshapes output in format -o 1 +# curr_gi : current graph index (for vertices) +# maxShreps : max number of shreps to convert to graphs +# graph_file_hdl : the output handler for the graph file +# graphHead : the header line for the complete graph (for sequence) +# winSize : current window size in input for RNAshapes +# seqLen : full input seq length +# used_t : +# annotate: +# abstr : +# cue : +# stacks : +# orig_seq : the nucleotide sequence as read from fasta +# +# Output: +# The current graph index +############################################################################ +sub convert_RNAshapes_output { + my ( $rnashapesoutput, $curr_gi, $maxShreps, $graph_file_hdl, $graphHead, $winSize, $seqLen, $used_t, $annotate, $abstr, $cue, $stacks, $orig_seq ) = @_; + + ## omit first line in output, contains fasta header line + my $line = <$rnashapesoutput>; + my $win_shreps = []; + my $win_start; + my $win_end; + my $win_seq; + my $win_shrep_count = 0; + my $win_sample = 0; + my $winHead; + my $win_globalFolding; + my $win_size_real; + + # reading RNAshapes output + while ( $line = <$rnashapesoutput> ) { + + if ( $line =~ /^(\d+)\s+(\d+)$/ ) { + ## line: "<start> <end>" + + if ( @{$win_shreps} > 0 ) { + + print $graph_file_hdl $winHead; + + ## remove SHAPE="_" shrep if it exists in $win_shreps + ## do this only when have a sequence graph already + if ($i_add_seq_graph_t) { + my @new_win_shreps = (); + map { push( @new_win_shreps, $_ ) if ( $_->[ $#{$_} ] ne "_" ) } @{$win_shreps}; + $win_shreps = \@new_win_shreps; + } + + ## add graph with no structure depending on $i_add_seq_graph_win to win + if ($i_add_seq_graph_win ) { + ( $curr_gi ) = convertSeqWindow( $win_seq, $win_size_real, + $win_start, $curr_gi, $winHead, $graph_file_hdl, $annotate, $orig_seq ); + } + + $curr_gi = convertShapeWindow( $win_shreps, $win_seq, $win_size_real, + $win_start, $curr_gi, $graph_file_hdl, $winHead, $annotate, $abstr, + $cue, $stacks, $orig_seq ); + } + + ## set new window params + $win_shreps = []; + $win_start = $1; + $win_end = $2; + $win_shrep_count = 0; + $win_size_real = $win_end - $win_start + 1; + + if ( ($win_size_real) >= $seqLen ) { + $win_globalFolding = 1; + } else { + $win_globalFolding = 0; + } + my $win_center = $win_start + ( ( $win_size_real + 1 ) / 2 ); + + $winHead = getWindowHeader( $graphHead, $winSize, $win_start, $win_end, $win_center, $win_globalFolding, $win_sample, $used_t ); + + } elsif ( $line =~ /^(\S+)$/ ) { + ## line: "CUUAUGAGUAAGGAAAAUAACGAUUCGGGGUGACGCCCGAAUCCUCACUG" + $win_seq = uc($1); ## to be 101% sure that we have by default uppercase chars + + } elsif ( $line =~ /^Results for (\d+) iterations:$/ ) { + ## line: "Results for 10 iterations:" + $win_sample = $1; + + } elsif ( $line =~ /^Shape\s+\S+\s+not found within energy range.*$/ ) { + ## line: "Shape [] not found within energy range (-24.75 to -27.50). Try -c or -e to increase range." + $win_shreps = []; + + } elsif ( $line =~ /^([\(\)\.]+)\s+\((\S+)\)\s+(\S+)$/ ) { + ## line:"...((((..(((....)))))))...........(((((......))))) (-10.10) [[]][]" + ## take only $maxShreps shreps per window if set + next if ( $maxShreps && $win_shrep_count >= $maxShreps ); + push( @{$win_shreps}, [ $1, "ENERGY", $2, "SHAPE", $3 ] ); + $win_shrep_count++; + + } elsif ( $line =~ /^([\(\)\.]+)\s+\((\S+)\)\s+\((\S+)\)\s+(\S+)$/ ) { + ## line:"((((..((((...((.((.((.....)).)).))...))))..))))... (-10.60) (0.7795360) [[[[[]]]]]" + ## take only $maxShreps shreps per window if set + next if ( $maxShreps && $win_shrep_count >= $maxShreps ); + push( @{$win_shreps}, [ $1, "ENERGY", $2, "PROB", $3, "SHAPE", $4 ] ); + $win_shrep_count++; + + } elsif ( $line =~ /^([\(\)\.]+)\s+\((\S+)\)\s+(\S+)\s+(\S+)$/ ) { + ## line:"((((..((((...((.((.((.....)).)).))...))))..))))... (-10.60) 0.3000000 [[[[[]]]]]" + ## take only $maxShreps shreps per window if set + next if ( $maxShreps && $win_shrep_count >= $maxShreps ); + push( @{$win_shreps}, [ $1, "ENERGY", $2, "SHAPEPROB", $3, "SHAPE", $4 ] ); + $win_shrep_count++; + + } elsif ( $line =~ /^([\(\)\.]+)\s+\((\S+)\)\s+\((\S+)\)\s+(\S+)\s+(\S+)$/ ) { + ## line:"((((..((((...((.((.((.....)).)).))...))))..))))... (-10.60) (0.7795360) 0.3000000 [[[[[]]]]]" + ## take only $maxShreps shreps per window if set + next if ( $maxShreps && $win_shrep_count >= $maxShreps ); + push( @{$win_shreps}, [ $1, "ENERGY", $2, "PROB", $3, "SHAPEPROB", $4, "SHAPE", $5 ] ); + $win_shrep_count++; + + } else { + next if ( $line =~ /^$/ ); + die "Unexpected shape output format!\nline=$line\n\nExit...\n\n"; + } + } + + ## convert last windows + if ( @{$win_shreps} > 0 ) { + + print $graph_file_hdl $winHead; + + ## remove SHAPE="_" shrep if it exists in $win_shreps + ## do this only when have a sequence graph already + if ($i_add_seq_graph_t) { + my @new_win_shreps = (); + map { push( @new_win_shreps, $_ ) if ( $_->[ $#{$_} ] ne "_" ) } @{$win_shreps}; + $win_shreps = \@new_win_shreps; + } + + ## add graph with no structure depending on $i_add_seq_graph_win to win + if ($i_add_seq_graph_win) { + ($curr_gi) = convertSeqWindow( $win_seq, $win_size_real, $win_start, + $curr_gi, $winHead, $graph_file_hdl, $annotate, $orig_seq ); + } + + $curr_gi = convertShapeWindow( $win_shreps, $win_seq, $win_size_real, + $win_start, $curr_gi, $graph_file_hdl, $winHead, $annotate, $abstr, $cue, + $stacks, $orig_seq ); + } + + close($rnashapesoutput); + + return $curr_gi + 1; # return the gi (graph index) for the next subgraph +} + +## Sub to create graph for a complete unstructured sequence +# TODO: document function +sub convertSeqWindow { + my ( $win_seq, $win_size_real, $win_start, $curr_gi, $winHead, $graph_file_hdl, $annotate, $orig_seq ) = @_; + + my $seq_shrep; + + # use a different alphabet (lowercase) if option seq-graph-alph set + my $seq_graph_sequence = $i_change_seq_graph_alph ? lc($win_seq) : uc($win_seq); + $seq_shrep = [ "." x $win_size_real, "ENERGY", "0.00", "SHAPE", "_", "STRUCT", "." x $win_size_real, "SEQ", $seq_graph_sequence ]; + my $backboneGraph_ref = getBackboneGraph( $seq_graph_sequence, $curr_gi, $win_start, 0, ( $win_size_real - 1 ), $orig_seq ); + + print $graph_file_hdl getSeqHeader( $winHead, $seq_shrep ); + print $graph_file_hdl join( "\n", @{$backboneGraph_ref} ) . "\n"; + + $curr_gi += $win_size_real; + + if ($annotate) { + ## TODO add annotation graphs + } + + return ($curr_gi); +} + +############################################################################ +# The results for one window are converted in this method to GSPAN graphs. +# Input: +# win_shreps_aref : the array ref of shreps (dot-bracket structures) for the +# current window +# win_seq : the nucleotide sequence for the current window +# win_size_real : the current (true) window size +# win_start : the starting position of win_seq in the original seq (1-n) +# curr_gi : the current graph index +# graph_file_hdl : the graph file handler +# winHead : the header line for the current sequence window +# TODO annotate +# abstr : input parameter -abstr +# cue : input parameter -cue +# stacks : input parameter -stacks +# orig_seq : the nucleotide sequence as read from fasta +# +# Output: +# The current graph index +############################################################################ +sub convertShapeWindow { + my ( $win_shreps_aref, $win_seq, $win_size_real, $win_start, $curr_gi, + $graph_file_hdl, $winHead, $annotate, $abstr, $cue, $stacks, $orig_seq ) = @_; + + ## generate for each shrep a connected component in gspan file + foreach my $shrep ( @{$win_shreps_aref} ) { + + # get the current gi as it was at the beginning of this shrep + my $shrep_gi = $curr_gi; + + # cut off unpaired ends, if option is given + my $shrep_struct = $shrep->[0]; + my $crop_index_left = 0; + my $crop_index_right = length($shrep_struct) - 1; + + # find croping indices with numbering from 0 to n-1 + if ($cue) { + $crop_index_left = index( $shrep_struct, "(" ); # find 1st occ of "(" + $crop_index_right = rindex( $shrep_struct, ")" ); # find last occ of ")" + + # if the complete window is unpaired, then don't crop + if ( $crop_index_left == -1 ) { + $crop_index_left = 0; + $crop_index_right = length($shrep_struct) - 1; + } + } + + # create structure graph + my $backboneGraph_ref = getBackboneGraph( $win_seq, $curr_gi, $win_start, $crop_index_left, $crop_index_right, $orig_seq ); + my $structGraph_ref; + + # add both basic edges, stacks and abstract graph + if ($abstr) { + ( $structGraph_ref, $curr_gi ) = getStructPlusAbstractStructGraph( $shrep, $curr_gi, $win_size_real, $cue, $stacks ); + + # just add basic structure graph edges, and stacks + } else { + ( $structGraph_ref, $curr_gi ) = getStructGraph( $shrep, $curr_gi, $win_size_real, $stacks ); + } + + # additional information for shrep header: sequence and dot-bracket + my $crop_length = $crop_index_right - $crop_index_left + 1; + my $shrepheader_struct = substr( $shrep_struct, $crop_index_left, $crop_length ); + my $shrepheader_seq = substr( $win_seq, $crop_index_left, $crop_length ); + push( @{$shrep}, 'STRUCT', $shrepheader_struct, 'SEQ', $shrepheader_seq ); + + # print structure graph to file + print $graph_file_hdl getShapeHeader( $winHead, $shrep ); + print $graph_file_hdl join( "\n", @{$backboneGraph_ref} ) . "\n"; + + # don't print empty array; sgdnspdk breaks with empty lines + if ( @{$structGraph_ref} > 0 ) { + print $graph_file_hdl join( "\n", @{$structGraph_ref} ) . "\n"; + + if ($annotate) { + + # TODO add annotations to shrep structure + # $win_start, $win_end (1-n) + } + + } + } + + return $curr_gi; +} + +########################################################################### +# Here the vertices for the nucleotides are generated and also the backbone +# edges, which connect the nucleotides in their correct order (i.e order +# of the given sequence) +# Input: +# win_seq : the sequence for the current window +# win_start : the starting position for the current window +# curr_crop_i_left : the cropping index for the left end of sequence (-cue) +# curr_crop_i_right : the cropping index for the right end of sequence (-cue) +# orig_seq : the nucleotide sequence as read from fasta +# +# Output: +# The lines of the graph in an array reference to be printed to a file with +# one element per line. +############################################################################ +sub getBackboneGraph { + my ( $win_seq, $curr_gi, $win_start, $curr_crop_i_left, $curr_crop_i_right, $orig_seq ) = @_; + + # RNAshapes substitutes T -> U, sequence only does not + # thus, just transform all t/T -> u/U to remain consistent + $orig_seq =~ tr /tT/uU/; + $win_seq =~ tr /tT/uU/; + + my @seq; + @seq = split( "", $win_seq ); + + # if the vp option is set we need to obtain the original capitalization from + # $orig_seq and extract the windowed sequence manually + my @seq_vp; + if ( defined $i_vp ) { + my $win_len = length($win_seq); + my $capitalized_win_seq = substr( $orig_seq, $win_start - 1, $win_len ); + ( uc($capitalized_win_seq) eq uc($win_seq) ) or + die( "error: windowed sequence generated due to vp option not equal " . + "to sequence reported by RNASHAPES.\n" . + "'${capitalized_win_seq}' != '${win_seq}'" ); + @seq_vp = split( "", $capitalized_win_seq ); + } + + my @vert = (); + my @edg = (); + + my $curr_abs_pos = $win_start + $curr_crop_i_left; + $curr_gi += $curr_crop_i_left; + + # set vertice labeled with 'v' or 'V' according to sequence and vp option + # when vp set, + # uppercase nucleotides are annotated with 'v' + # lowercase nucleotides are annotated with 'V' + my $vertice_label = ( defined $i_vp and ( $seq_vp[$curr_crop_i_left] =~ /[a-z]/ ) ) ? 'V' : 'v'; + + # create backbone vertice of first nucleotide + push( @vert, join( ' ', $vertice_label, $curr_gi, $seq[$curr_crop_i_left], $curr_abs_pos ) ); + + foreach my $idx ( ( $curr_crop_i_left + 1 ) .. $curr_crop_i_right ) { + $curr_abs_pos++; + $curr_gi++; + + # set vertice label as described above + $vertice_label = ( defined $i_vp and ( $seq_vp[$idx] =~ /[a-z]/ ) ) ? 'V' : 'v'; + push( @vert, join( ' ', $vertice_label, $curr_gi, $seq[$idx], $curr_abs_pos ) ); + push( @edg, join( ' ', "e", $curr_gi - 1, $curr_gi, '> 1' ) ); + } + + my @ret = ( @vert, @edg ); + + return \@ret; +} + +########################################################################### +# This method does the same as getStructGraph, but is extended to identify +# the abstract parts of the structure and also add this to the graph +# via abstract relations, see information below. +# We already have the backbone graph, now the base-pair edges need to be +# added to this graph and the abstract graph is added after this. Finally +# the abstract relations, pointing from the abstract level to the basic +# structure level is added. All the while, we keep track of the current +# index graph. Furthermore, in the basic structure graph, we have vertices +# stacks that connects the four stacked base-pairs involved. (symbols P +# for vertex and p for edges). The abstract vertices are labelled according +# to the given name-space, then #, then the abstract structure type, e.g. +# HL for hairpin-loop. The edges between the structure types are denoted +# with n, and the relation vertex is ^R, with ^r going from the abstract +# structure to the relation and from the relation to the basic structure +# is @r. +# +# Input: +# curr_shrep : the current shrep as dot-bracket structure +# curr_gi : the current graph index +# win_size : the window size +# cue : the input parameter -cue (whether to crop unpaired ends) +# stacks : the input parameter -stacks (whether to add stack information) +# +# Output: +# The structure graph information line-by-line as an array reference and +# the current graph index. +# +############################################################################ +sub getStructPlusAbstractStructGraph { + my ( $curr_shrep, $curr_gi, $win_size, $cue, $stacks ) = @_; + + # $win_size = length($curr_shrep->[0]); + + my @struct = split( "", $curr_shrep->[0] ); + +# print "=================testing: new shrep : gi=$curr_gi ===============================\n" if ($i_debug); + + # OBJECTS and VARIABLES + # all indices are saved according to the current graph index, $curr_gi, + # which is at the beginning of the current window, so that the nucleotide + # index can be inferred using $idx + $curr_gi + # opening brackets + my @open_blks = (); # open blocks of consecutive opening brackets (array of arrays) + my @p_open = (); # currently open block of the last identified complete block + my @c_open = (); # current "incomplete" block of consecutive opening brackets + # base-pair objects + my %bps = (); # hash of all base-pairs with stem for a value, if it is at + # the BP is opening or closing a stem object + my @c_bp = (); # current BP, just being closed at current index pos (i,j) + my @p_bp = (); # previously closed BP at position before current index + # stem objects + my %stems = (); # all stems, key="i:j,k:l", value=stem array, (i,j) outer BP + my @c_stem = (); # current incomplete stem object + my @stmbrks = (); # all stem-breaks in the form of (i,k,l), where i is the remaining + # open base and (k,l) is the subsequent closing base-pair + +# my @p_stmbrk = (); # previous stem-break in the form of (i,k,l), where i is the remaining +# # open base and (k,l) is the subsequent closing base-pair +# loop objects + my @up = (); # all unpaired regions (i,j) that are of unkown type + my @c_up = (); # current incomplete unpaired object + my @hls = (); # all hairpin loop objects + my @bls = (); # all bulge-loop objects + my @ils = (); # all internal loops + my @mls = (); # all multi-loops + my @c_mls = (); # current incomplete multi-loops! (there can be more than one) + my @els = (); # all external loops + my @c_el = (); # current incomplete external loop + + # iterate through shrep structure an identify abstract parts + foreach my $idx ( 0 .. @struct - 1 ) { + + #=================================================== + # current char is the opening bracket of a base-pair + if ( $struct[$idx] eq "(" ) { + + #=================================================== + + # update current window index to current graph index + $idx += $curr_gi; + + # case: '((' currently there is an open block, extend it + if (@c_open) { + push( @c_open, $idx ); + +# case: '.(' or ')(' or BOS'(' there is no open block, create a new one ".(" or ")(" + } else { + + # begin a new current open block + push( @c_open, $idx ); + + # case: ')(' + if (@c_stem) { + + # CLOSE STEM + print STDERR "TEST - closed stem:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@c_stem ) ) if ($i_debug); + close_stem( \@c_stem, \%stems, \@p_bp, \@c_bp, \@p_open, \@stmbrks, \@open_blks, \%bps ); + + # case (x(x)( + if (@open_blks) { + + # EXTEND_ML, if one exists + if (@c_mls) { + extend_or_open_nested_ML( $curr_shrep, \@c_mls, \@p_bp, \@up, \@stmbrks, \@open_blks, $idx, $curr_gi ); + + # OPEN_ML + } else { + + # case 'x((x)(' the opening of the ML was a stem-break + if ( @stmbrks + && $stmbrks[-1]->[1] == $p_bp[0] + && $stmbrks[-1]->[2] == $p_bp[1] ) { + my @c_stmbrk = @{ pop(@stmbrks) }; + my @a = ( $c_stmbrk[0], -1 ); + my @ml = (); + push( @ml, \@a ); # 1st base-pair is in-waiting + my @a2 = ( $c_stmbrk[1], $c_stmbrk[2] ); + push( @ml, \@a2 ); #2nd BP + my @a3 = ($idx); + push( @ml, \@a3 ); #first of current BP + push( @c_mls, \@ml ); + print STDERR "TEST - opened ML with SB:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) ) if ($i_debug); + + # case 'x(.(x)(' + } elsif (@up) { + my @tmp_up = @{ pop(@up) }; + + # compare adjacent BP to UP to see if they fit + if ( $tmp_up[1] + 1 == $p_bp[0] + && $open_blks[-1]->[-1] == $tmp_up[0] - 1 ) { + my @a = ( $open_blks[-1]->[-1], -1 ); + my @ml = (); + push( @ml, \@a ); # 1st BP awaits closing + my @newbp = @p_bp; + push( @ml, \@newbp ); # 2nd BP + my @a2 = ($idx); + push( @ml, \@a2 ); # current open BP, awaits closing + push( @c_mls, \@ml ); + print STDERR "TEST - opened ML with prev UP1:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) ) + + if ($i_debug); + } else { + die "ERROR: the base-pairs to not match the " . "previous unpaired region?!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ); + } + } else { + die "ERROR: in case ML, but there is no initial" . "part of the ML to be identified\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ); + } + } + + # case: 'x(x)(' + } else { + + # case: '.(x)(' => CLOSE_EL + if (@c_el) { + if ( $c_el[-1] == $p_bp[0] ) { + push( @c_el, $p_bp[1] ); + my @newel = @c_el; + push( @els, \@newel ); + @c_el = (); + print STDERR "TEST - closed EL1:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newel ) ) if ($i_debug); + } else { + die "ERROR: the previous BP must match the current EL\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ); + } + } + + # in both cases OPEN_EL + push( @c_el, @p_bp ); + push( @c_el, $idx ); + print STDERR "TEST - opened EL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@c_el ) ) if ($i_debug); + } + + # case '.(' + } elsif (@c_up) { + + # case (x(x).( OR 'x(.(' + if (@open_blks) { + + # case 'x(x).(' ML + if ( @p_bp && ( $p_bp[1] == $c_up[0] - 1 ) ) { + + # case '((x)x(x).(' EXTEND_ML, if one exists + if (@c_mls) { + + extend_or_open_nested_ML( $curr_shrep, \@c_mls, \@p_bp, \@up, \@stmbrks, \@open_blks, $idx, $curr_gi ); + + # case 'x((x).(' or 'x(.(x).(' OPEN_ML + } else { + + # case 'x((x).(' the opening of the ML was a stem-break + if ( @stmbrks + && $stmbrks[-1]->[1] == $p_bp[0] + && $stmbrks[-1]->[2] == $p_bp[1] ) { + my @c_stmbrk = @{ pop(@stmbrks) }; + my @a = ( $c_stmbrk[0], -1 ); + my @ml = (); + push( @ml, \@a ); # 1st base-pair is in-waiting + my @a2 = ( $c_stmbrk[1], $c_stmbrk[2] ); + push( @ml, \@a2 ); #2nd BP + my @a3 = ($idx); + push( @ml, \@a3 ); #first of current BP + push( @c_mls, \@ml ); + print STDERR "TEST - open ML with SB:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) ) if ($i_debug); + + # case 'x(.(x).(' opening the ML with initial UP region + } elsif (@up) { + + # get previous unpaired region + my $tmp = pop(@up); + my @p_up = @{$tmp}; + + # case: '(.(x).(' compare adjacent BP to UP to see if they create a ML + if ( $p_up[1] + 1 == $p_bp[0] + && $open_blks[-1]->[-1] == $p_up[0] - 1 ) { + my @a = ( $open_blks[-1]->[-1], -1 ); + my @ml = (); + push( @ml, \@a ); # 1st BP awaits closing + my @newbp = @p_bp; + push( @ml, \@newbp ); # 2nd BP + my @a2 = ($idx); + push( @ml, \@a2 ); # current open BP, awaits closing + push( @c_mls, \@ml ); + print STDERR "TEST - opened ML with prev UP2:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) ) + + if ($i_debug); + } + + # has to be an ML, but can't find first part + } else { + die "ERROR: in case ML, but there is no initial " . "part of the ML to be identified\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ); + } + } + + # case '(.( + } else { + my @newup = @c_up; + push( @up, \@newup ); + print STDERR "TEST - closed UP:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newup ) ) if ($i_debug); + } + + # case: 'x(x).(' OR 'EOS.(' + } else { + + # case: '.(x)(' => CLOSE_EL + if (@c_el) { + if ( $c_el[-1] == $c_bp[0] ) { + push( @c_el, $p_bp[1] ); + my @newel = @c_el; + push( @els, \@newel ); + @c_el = (); + print STDERR "TEST - closed EL2:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newel ) ) if ($i_debug); + } else { + die "ERROR: the previous BP must match the current EL\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ); + } + } + + # in both cases OPEN_EL + if (@p_bp) { + push( @c_el, @p_bp ); + push( @c_el, $idx ); + } else { + + # ignore this if -cue is given + # TODO check the gi index + if ($cue) { + @c_el = (); + } else { + push( @c_el, ( $curr_gi, $curr_gi ) ); + push( @c_el, $idx ); + print STDERR "TEST - opened EL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@c_el ) ) if ($i_debug); + } + } + } + @c_up = (); + + } else { + + # no more cases except opening bracket at beginning of sequence + # in this case do nothing, as the index has already been added + die "ERROR: '((', ')(', '.(' have all been covered\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ) + unless ( $idx == $curr_gi ); + + } + } + + #=================================================== + # current char is the unpaired base + } elsif ( $struct[$idx] eq "." ) { + + #=================================================== + + # update current window index to current graph index + $idx += $curr_gi; + + # case ').' => CLOSE_STEM + if (@c_stem) { + print STDERR "TEST - closed stem:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@c_stem ) ) if ($i_debug); + close_stem( \@c_stem, \%stems, \@p_bp, \@c_bp, \@p_open, \@stmbrks, \@open_blks, \%bps ); + } + + # case '..' extend UP, @c_up + if (@c_up) { + $c_up[1] = $idx; + + # case 'x.' open new UP + } else { + + # OPEN_UP, @c_up + @c_up = ( $idx, $idx ); + + # case '(.' just come to the end of an open block, push it onto stack + if (@c_open) { + my @newopen = @c_open; + push( @open_blks, \@newopen ); + @c_open = (); + + # case ').' or '.' + # (have already closed stem and created stem-break and emptied p_open) + } else { + + # case 'x(x).' or '.' => CLOSE_EL + if ( !@open_blks ) { + + # case 'x().().' + if (@c_el) { + + # double check + if ( $c_el[-1] == $p_bp[0] ) { + push( @c_el, $p_bp[1] ); + my @newel = @c_el; + push( @els, \@newel ); + @c_el = (); + print STDERR "TEST - closed EL3:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newel ) ) if ($i_debug); + } else { + die "This case should not occur!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ); + } + } + } + } + } + + #=================================================== + # current char is the closing bracket of a base-pair + } elsif ( $struct[$idx] eq ")" ) { + + #=================================================== + + # update current window index to current graph index + $idx += $curr_gi; + + # save previous base-pair + if (@c_bp) { + @p_bp = @c_bp; + } + + # case: '((x))', extend stem + if (@p_open) { + + # get current base-pair + @c_bp = ( pop(@p_open), $idx ); + + # add it to the base-pair hash + $bps{"$c_bp[0]:$c_bp[1]"} = ""; + + # EXTEND_STEM + if (@c_stem) { + $c_stem[0] = $c_bp[0]; + $c_stem[1] = $c_bp[1]; + } else { + die "ERROR: in case '))' there has to be an open stem object!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ); + } + + # case: '(x(x))' or '(x.)', '()' not allowed + } else { + + # case: '(x(x))' CLOSE_STEM close previous stem + if (@c_stem) { + print STDERR "TEST - closed stem:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@c_stem ) ) if ($i_debug); + close_stem( \@c_stem, \%stems, \@p_bp, \@c_bp, \@p_open, \@stmbrks, \@open_blks, \%bps ); + + # case '(x(x))' OPEN_STEM + if (@open_blks) { + my $tmp = pop(@open_blks); + @p_open = @{$tmp}; + + # get current base-pair + @c_bp = ( pop(@p_open), $idx ); + + # add it to the base-pair hash + # as the stem is not finished yet, cannot add stem information + $bps{"$c_bp[0]:$c_bp[1]"} = ""; + + # open stem + $c_stem[0] = $c_bp[0]; + $c_stem[1] = $c_bp[1]; + $c_stem[2] = $c_bp[0]; + $c_stem[3] = $c_bp[1]; + } else { + die "ERROR: there have to be open blocks to match " . "current closing bracket!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ); + } + + # case: '(.(x))' => CLOSE_BL + if ( @up && $up[-1]->[0] - 1 == $c_bp[0] && $up[-1]->[1] + 1 == $p_bp[0] ) { + my @newBL = ( @c_bp, @p_bp ); + push( @bls, \@newBL ); + pop(@up); + print STDERR "TEST - closed BL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newBL ) ) if ($i_debug); + + # case: '(x(x).(x))' => CLOSE_ML + } elsif ( @c_mls && $c_mls[-1]->[-1]->[-1] == $p_bp[0] ) { + my @newml = @{ pop(@c_mls) }; + push( @{ $newml[-1] }, $p_bp[1] ); # close last BP + $newml[0]->[1] = $c_bp[1]; # close 1st BP + push( @mls, \@newml ); + print STDERR "TEST - closed ML1:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newml, 1 ) ) if ($i_debug); + } else { + die "ERROR: what is this case?\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ); + } + + # case: (x.) + } else { + + # OPEN_STEM open new stem + if (@open_blks) { + @p_open = @{ pop(@open_blks) }; + + # get current base-pair + @c_bp = ( pop(@p_open), $idx ); + + # add it to the base-pair hash + # as the stem is not finished yet, cannot add stem information + $bps{"$c_bp[0]:$c_bp[1]"} = ""; + + # open stem + $c_stem[0] = $c_bp[0]; + $c_stem[1] = $c_bp[1]; + $c_stem[2] = $c_bp[0]; + $c_stem[3] = $c_bp[1]; + } else { + die "ERROR: there have to be open blocks to match " . "current closing bracket!$curr_shrep->[0]\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ); + } + + # CLOSE_C_UP + if (@c_up) { + + # case: (.) => close_HL + if ( $c_up[0] - 1 == $c_bp[0] ) { + my @newhl = @c_bp; + push( @hls, \@newhl ); + @c_up = (); + print STDERR "TEST - closed HL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newhl ) ) if ($i_debug); + + # case: (x(x).) + } elsif ( @p_bp && $c_up[0] - 1 == $p_bp[1] ) { + + # case: ((x).) => CLOSE_BL + if ( @stmbrks + && $stmbrks[-1]->[0] == $c_bp[0] + && $stmbrks[-1]->[1] == $p_bp[0] + && $stmbrks[-1]->[2] == $p_bp[1] ) { + my @newbl = ( @c_bp, @p_bp ); + push( @bls, \@newbl ); + pop(@stmbrks); + @c_up = (); + print STDERR "TEST - closed BL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newbl ) ) if ($i_debug); + + # case: (x(x).), either CLOSE_ML or CLOSE_IL + } else { + + # case '(.(x).)' => CLOSE_IL + if ( @up + && ( $up[-1]->[0] - 1 == $c_bp[0] ) + && ( $up[-1]->[1] + 1 == $p_bp[0] ) ) { + my @p_up = @{ pop(@up) }; + my @newil = ( @c_bp, @p_bp ); + push( @ils, \@newil ); + @c_up = (); + print STDERR "TEST - closed IL:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newil ) ) if ($i_debug); + + # case (x(x)x(x).) => CLOSE_ML remember, array of tuples + } elsif ( @c_mls && $c_mls[-1]->[-1]->[0] == $p_bp[0] ) { + my @newml = @{ pop(@c_mls) }; + push( @{ $newml[-1] }, $p_bp[1] ); # close final BP + $newml[0]->[1] = $c_bp[1]; # close 1st BP + push( @mls, \@newml ); + @c_up = (); + print STDERR "TEST - closed ML2:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@newml, 1 ) ) if ($i_debug); + } else { + die "There should be no such case!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ); + } + } + + # there should be no more unknown unpaired regions left! + } else { + die "There shouldn't be any unknown unpaired " . "regions left\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ); + } + + } else { + die "ERROR: '()' is not allowed in $curr_shrep->[0]\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ); + } + } + } + } + } + + # EOS + + # CLOSE_STEM + print STDERR "TEST - closed stem EOS:\n" . ( test( $curr_shrep->[0], ( $win_size + $curr_gi ), $curr_gi, \@c_stem ) ) + if ( @c_stem && $i_debug ); + close_stem( \@c_stem, \%stems, \@p_bp, \@c_bp, \@p_open, \@stmbrks, \@open_blks, \%bps ) if (@c_stem); + die "ERROR: there should not be an open ML!\n" . $curr_shrep . "\n" + if (@c_mls); + + # case ().() CLOSE_EL + if (@c_el) { + if ( $c_el[-1] == $p_bp[0] ) { + push( @c_el, $p_bp[1] ); + my @newel = @c_el; + push( @els, \@newel ); + @c_el = (); + + print STDERR "TEST - closed EL4 EOS:\n" . ( test( $curr_shrep->[0], ( $win_size + $curr_gi ), $curr_gi, \@newel ) ) + if ($i_debug); + } else { + die "ERROR: the base-pairs in the EL don't match" . $curr_shrep . "\n"; + } + } + + # case (). CLOSE_EL + if (@c_up) { + unless ($cue) { + if (@p_bp) { + push( @c_el, @p_bp ); + } else { + push( @c_el, ( $curr_gi, $curr_gi ) ); + } + + push( @c_el, ( ( $curr_gi + $win_size - 1 ), ( $curr_gi + $win_size - 1 ) ) ); + my @newel = @c_el; + push( @els, \@newel ); + @c_el = (); + print STDERR "TEST - closed EL5 EOS:\n" . ( test( $curr_shrep->[0], ( $win_size + $curr_gi ), $curr_gi, \@newel ) ) + if ($i_debug); + ## check case where cue is given, but window is completely unstructured + } else { + unless (@p_bp) { + my @newel = ( $curr_gi, $curr_gi, ( $curr_gi + $win_size - 1 ), ( $curr_gi + $win_size - 1 ) ); + push( @els, \@newel ); + } + } + } + + # # update current graph index (since all shrep indices have been read) + $curr_gi += $win_size; + + # =================================================== + # build graph lines + my @graph_lines = (); + + # =================================================== + + # graph line objects + my @edg = (); + my @stackgraph = (); + my @abstrgraph = (); + + #-------------------> stems,stacks + # build basic graph edges, stacks, and vertices for stems in the abstract graph + my @tmp = keys %stems; + foreach my $sk (@tmp) { + my @stem = @{ $stems{$sk} }; # stem (i,j,k,l)=> (i,j)= (0,1) and (k,l)=(2,3) + die "ERROR: There are not enough elements in this stem: ", ( join( ",", @stem ) ), "\n" unless ( scalar(@stem) == 4 ); + + # infer base-pairs from stem + @p_bp = (); + @c_bp = (); + my @stem_relation_idxs = (); + my @stack_relation_idxs = (); + my $stacksize = $stem[2] - $stem[0] + 1; + for ( my $x = 0 ; $x < $stacksize ; $x++ ) { + + # for (my $i = $stem[0]; $i <= $stem[2] ; $i++){ + # for(my $j = $stem[1]; $j >= $stem[3]; $j--){ + my $i = $stem[0] + $x; + my $j = $stem[1] - $x; + @p_bp = @c_bp; + @c_bp = ( $i, $j ); + if ( defined $bps{"$i:$j"} ) { + + # add indices to stem relation object + push( @stem_relation_idxs, $i ); + push( @stem_relation_idxs, $j ); + + # add edge to graph edges + push( @edg, "e $i $j s" ); + + # add stack if option given + if ($stacks) { + if (@p_bp) { + push( @stackgraph, "v $curr_gi P" ); + push( @stackgraph, "e $curr_gi $p_bp[0] p" ); + push( @stackgraph, "e $curr_gi $p_bp[1] p" ); + push( @stackgraph, "e $c_bp[0] $curr_gi p" ); + push( @stackgraph, "e $c_bp[1] $curr_gi p" ); + push( @stack_relation_idxs, $curr_gi ); + + # update graph index + ++$curr_gi; + } + } + } else { + die "ERROR: There is no such base-pair in the base-pair hash:" . " ($i, $j)\n"; + } + } + + # } + # } + # add stem to abstract graph + # (graph indices become a bit jumbled, but saves time) + push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#S" ); + + # stem object now saves the curr_gi index for that stem + $stems{$sk} = $curr_gi; + ++$curr_gi; + + # add relations + push( @abstrgraph, "v $curr_gi ^R" ); + push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" ); + foreach my $r (@stem_relation_idxs) { + push( @abstrgraph, "e $curr_gi $r \@r" ); + } + if ($stacks) { + foreach my $r (@stack_relation_idxs) { + push( @abstrgraph, "e $curr_gi $r \@r" ); + } + } + ++$curr_gi; + } + + #-------------------> add HLs + foreach my $hl (@hls) { + + # check size + die "ERROR: the HL object is the incorrect size\n" unless ( @{$hl} == 2 ); + + # add vertex + push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#HL" ); + + # add edge from stem to hl + if ( defined( $stems{ $bps{"$hl->[0]:$hl->[1]"} } ) ) { + push( @abstrgraph, "e " . $stems{ $bps{"$hl->[0]:$hl->[1]"} } . " $curr_gi n" ); + } else { + die "ERROR: stem not defined in base-pair hash for $hl->[0]:$hl->[1]\n"; + } + ++$curr_gi; + + # add relations + push( @abstrgraph, "v $curr_gi ^R" ); + push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" ); + + # add all nodes between i to j, inclusively + for ( my $r = $hl->[0] ; $r <= $hl->[1] ; $r++ ) { + push( @abstrgraph, "e $curr_gi $r \@r" ); + } + ++$curr_gi; + } + + #------------------> add BLs + foreach my $bl (@bls) { + + # check size + die "ERROR: the BL object is the incorrect size\n" unless ( scalar( @{$bl} ) == 4 ); + + # add vertex + push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#BL" ); + + # add edges from bl to both adjacent stems + if ( defined( $stems{ $bps{"$bl->[0]:$bl->[1]"} } ) + && defined( $stems{ $bps{"$bl->[2]:$bl->[3]"} } ) ) { + + # add outer stem: S->BL + push( @abstrgraph, "e " . $stems{ $bps{"$bl->[0]:$bl->[1]"} } . " $curr_gi n" ); + + # add inner stem: BL->S + push( @abstrgraph, "e $curr_gi " . $stems{ $bps{"$bl->[2]:$bl->[3]"} } . " n" ); + } else { + die "ERROR: stems not defined for BL\n"; + } + ++$curr_gi; + + # add relations + push( @abstrgraph, "v $curr_gi ^R" ); + push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" ); + + # add all nodes between i to k and l to j, inclusively + for ( my $r = $bl->[0] ; $r <= $bl->[2] ; $r++ ) { + push( @abstrgraph, "e $curr_gi $r \@r" ); + } + for ( my $r = $bl->[3] ; $r <= $bl->[1] ; $r++ ) { + push( @abstrgraph, "e $curr_gi $r \@r" ); + } + ++$curr_gi; + } + + #------------------> add ILs + foreach my $il (@ils) { + + # check size + die "ERROR: the IL object is the incorrect size\n" unless ( scalar( @{$il} ) == 4 ); + + # add vertex + push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#IL" ); + + # add edges from il to both adjacent stems + if ( defined( $stems{ $bps{"$il->[0]:$il->[1]"} } ) + && defined( $stems{ $bps{"$il->[2]:$il->[3]"} } ) ) { + + # add outer stem: S->IL + push( @abstrgraph, "e " . $stems{ $bps{"$il->[0]:$il->[1]"} } . " $curr_gi n" ); + + # add inner stem: IL->S + push( @abstrgraph, "e $curr_gi " . $stems{ $bps{"$il->[2]:$il->[3]"} } . " n" ); + } else { + die "ERROR: stems not defined for IL\n"; + } + ++$curr_gi; + + # add relations + push( @abstrgraph, "v $curr_gi ^R" ); + push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" ); + + # add all nodes between i to k and l to j, inclusively + for ( my $r = $il->[0] ; $r <= $il->[2] ; $r++ ) { + push( @abstrgraph, "e $curr_gi $r \@r" ); + } + for ( my $r = $il->[3] ; $r <= $il->[1] ; $r++ ) { + push( @abstrgraph, "e $curr_gi $r \@r" ); + } + ++$curr_gi; + } + + #------------------> add ELs + foreach my $el (@els) { + + #check size + die "ERROR: the EL object is the incorrect size\n" unless ( scalar( @{$el} ) == 4 ); + + # add vertex + push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#EL" ); + + # add edges from el to both adjacent stems, if available + unless ( $el->[0] == $el->[1] ) { + if ( defined( $stems{ $bps{"$el->[0]:$el->[1]"} } ) ) { + push( @abstrgraph, "e " . $stems{ $bps{"$el->[0]:$el->[1]"} } . " $curr_gi n" ); + } else { + die "ERROR: stems not defined for left BP of EL\n"; + } + } + unless ( $el->[2] == $el->[3] ) { + if ( defined( $stems{ $bps{"$el->[2]:$el->[3]"} } ) ) { + push( @abstrgraph, "e $curr_gi " . $stems{ $bps{"$el->[2]:$el->[3]"} } . " n" ); + } else { + die "ERROR: stems not defined for right BP of EL\n"; + } + } + ++$curr_gi; + + # add relations + push( @abstrgraph, "v $curr_gi ^R" ); + push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" ); + + # @r relation edges + for ( my $r = $el->[1] ; $r <= $el->[2] ; $r++ ) { + push( @abstrgraph, "e $curr_gi $r \@r" ); + } + + # if BP, not EOS + unless ( $el->[0] == $el->[1] ) { + push( @abstrgraph, "e $curr_gi $el->[0] \@r" ); + } + + # if BP, not EOS + unless ( $el->[2] == $el->[3] ) { + push( @abstrgraph, "e $curr_gi $el->[3] \@r" ); + } + ++$curr_gi; + } + + #------------------> add MLs + foreach my $ml (@mls) { + + #check size + die "ERROR: the ML object is not big enough!\n" unless ( scalar( @{$ml} ) >= 3 ); + + # add vertex + push( @abstrgraph, "v $curr_gi ${ABSTRUCT}#ML" ); + + # add edge from outer stem to first BP + my $closing_bp = shift( @{$ml} ); + if ( defined( $stems{ $bps{"$closing_bp->[0]:$closing_bp->[1]"} } ) ) { + push( @abstrgraph, "e " . $stems{ $bps{"$closing_bp->[0]:$closing_bp->[1]"} } . " $curr_gi n" ); + } else { + die "ERROR: ML is not defined for BP $closing_bp->[0]:$closing_bp->[1]\n"; + } + + # add remaining stems + foreach my $bp ( @{$ml} ) { + if ( defined( $stems{ $bps{"$bp->[0]:$bp->[1]"} } ) ) { + push( @abstrgraph, "e $curr_gi " . $stems{ $bps{"$bp->[0]:$bp->[1]"} } . " n" ); + } else { + die "ERROR: ML is not defined for BP $bp->[0]:$bp->[1]\n"; + } + } + ++$curr_gi; + + # add relations + push( @abstrgraph, "v $curr_gi ^R" ); + push( @abstrgraph, "e " . ( $curr_gi - 1 ) . " $curr_gi ^r" ); + + # add @r relation edges between i and k, where (i,j) is the first BP and (k,l) + # the second BP + my $pbp_aref = $closing_bp; + my $cbp_aref = shift( @{$ml} ); + for ( my $r = $pbp_aref->[0] ; $r <= $cbp_aref->[0] ; $r++ ) { + push( @abstrgraph, "e $curr_gi $r \@r" ); + } + + # rest of @r relation edges + foreach my $bp ( @{$ml} ) { + $pbp_aref = $cbp_aref; + $cbp_aref = $bp; + for ( my $r = $pbp_aref->[1] ; $r <= $cbp_aref->[0] ; $r++ ) { + push( @abstrgraph, "e $curr_gi $r \@r" ); + } + } + + # add final stretch between last BP (m,n) and first closing base-pair (i,j) + for ( my $r = $cbp_aref->[1] ; $r <= $closing_bp->[1] ; $r++ ) { + push( @abstrgraph, "e $curr_gi $r \@r" ); + } + ++$curr_gi; + } + + @graph_lines = ( @edg, @stackgraph, @abstrgraph ); + return ( \@graph_lines, $curr_gi ); +} + +# just to test the current shrep dot-bracket structure parsing +sub test { + my ( $shrep, $idx, $gi, $mark_aref, $isML ) = @_; + + my $result = ""; + + $result .= "$shrep\n"; + my @a = split( "", $shrep ); + for ( my $i = 0 ; $i <= @a ; $i++ ) { + my $mark = 0; + foreach my $j ( @{$mark_aref} ) { + if ($isML) { + $mark = 1 if ( $i == $j->[0] - $gi ); + $mark = 1 if ( scalar( @{$j} == 2 && $i == $j->[1] - $gi ) ); + } else { + $mark = 1 if ( $i == $j - $gi ); + } + + } + if ( $i == $idx - $gi ) { + $result .= "^"; + } elsif ($mark) { + $result .= "'"; + } else { + $result .= " "; + } + } + $result .= "\n"; + + return $result; +} + +########################################################################### +# Extends or closes a multi-loop. This means a ML already exists. +# We can have the case ((x)( or ((x).( +# +# Input: +# curr_shrep The current shrep object, with shrep string at pos 0 +# c_ml_aref The current open MLs array reference +# p_bp_aref The previous base-pair array reference +# stmbrks_aref The stem-breaks array reference +# open_blks_aref The open bracket blocks array reference +# idx The current index +# #curr_gi The current graph index +# Output: +# None, it should just modify the ML accordingly +############################################################################ +sub extend_or_open_nested_ML { + my ( $curr_shrep, $c_mls_aref, $p_bp_aref, $up_aref, $stmbrks_aref, $open_blks_aref, $idx, $curr_gi ) = @_; + + # case '(x(x)x(x)x(' extend current ML + if ( $c_mls_aref->[-1]->[-1]->[-1] == $p_bp_aref->[0] ) { + push( @{ $c_mls_aref->[-1]->[-1] }, $p_bp_aref->[1] ); #close previous BP + my @a = ($idx); + push( @{ $c_mls_aref->[-1] }, \@a ); # add new open base-pair + print STDERR "TEST - extended ML:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, $c_mls_aref->[-1], 1 ) ) if ($i_debug); + + # case '(x(x)x((x)(' nested ML, => OPEN_ML with stem-break + } elsif ( @$stmbrks_aref + && $stmbrks_aref->[-1]->[1] == $p_bp_aref->[0] + && $stmbrks_aref->[-1]->[2] == $p_bp_aref->[1] ) { + + my @c_stmbrk = @{ pop( @{$stmbrks_aref} ) }; + my @a = ( $c_stmbrk[0], -1 ); + my @ml = (); + push( @ml, \@a ); # 1st base-pair is in-waiting + my @a2 = ( $c_stmbrk[1], $c_stmbrk[2] ); + push( @ml, \@a2 ); #2nd BP + my @a3 = ($idx); + push( @ml, \@a3 ); #first of current BP + push( @{$c_mls_aref}, \@ml ); + print STDERR "TEST - opened nested ML with SB:\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) ) if ($i_debug); + + # case '(x(x)x(.(x)(' nested ML, => OPEN_ML with UP + } elsif ( @$up_aref + && $up_aref->[-1]->[0] - 1 == $open_blks_aref->[-1]->[-1] + && $up_aref->[-1]->[1] + 1 == $p_bp_aref->[0] ) { + my @tmp_up = @{ pop( @{$up_aref} ) }; + my @a = ( $open_blks_aref->[-1]->[-1], -1 ); + my @ml = (); + push( @ml, \@a ); # 1st BP awaits closing + my @newbp = @{$p_bp_aref}; + push( @ml, \@newbp ); #2nd BP + my @a2 = ($idx); + push( @ml, \@a2 ); # current open BP, awaits closing + push( @{$c_mls_aref}, \@ml ); + print STDERR "TEST - opened nested ML with prev UP\n" . ( test( $curr_shrep->[0], $idx, $curr_gi, \@ml, 1 ) ) if ($i_debug); + } else { + die "ERROR: in the case extend or open nested ML, " . "but the base-pairs don't fit for either!\n" . ( test( $curr_shrep->[0], $idx, $curr_gi ) ) if ($i_debug); + } +} + +########################################################################### +# Closes a stem-object for the abstract graph. If necessary, it also +# creates a stem-break because the current open block is still open, +# which can only mean that there is a stem-break between two consecutive +# opening brackets. +# +# Input: +# c_stem_aref : current stem array reference +# stems_href : all stems hash reference +# p_bp_aref : previous base-pair array reference +# c_cp_aref : current base-pair array reference +# p_open_aref : previously opened open-bracket block +# p_stmbrk_aref : the previous stem-break +# open_blks_aref : all old open blocks array reference +# bps_href : all base-pairs hash reference +# +# Output: +# None, it should just modify the input variables accordingly +############################################################################ +sub close_stem { + my ( $c_stem_aref, $stems_href, $p_bp_aref, $c_bp_aref, $p_open_aref, $stmbrks_aref, $open_blks_aref, $bps_href ) = @_; + + my @newstem = @$c_stem_aref; + my $stemkey = "$newstem[0]:$newstem[1],$newstem[2]:$newstem[3]"; + $stems_href->{$stemkey} = \@newstem; + + # reset and clean running variables + @$c_stem_aref = (); + @$p_bp_aref = @$c_bp_aref; + @$c_bp_aref = (); + + # add stem to base-pair hash + $bps_href->{"$newstem[0]:$newstem[1]"} = $stemkey; + $bps_href->{"$newstem[2]:$newstem[3]"} = $stemkey; + + # there are still bases open in the previous open block, '[[]x', + # where [] is a closed stem and [ is an open block, + # must create STEM-BREAK + # this cannot be the case ((x)), because there p_open is empty + if (@$p_open_aref) { + + # check: the next open base is adjacent to the previous BP + if ( $p_open_aref->[-1] == $p_bp_aref->[0] - 1 ) { + my @stmbrk = ( $p_open_aref->[-1], @$p_bp_aref ); + push( @$stmbrks_aref, \@stmbrk ); + + # push the previous open block back onto stack + my @newopen = @$p_open_aref; + push( @$open_blks_aref, \@newopen ); + @$p_open_aref = (); + } else { + + # error, this should not be possible + die("ERROR in close_stem(), [.[]x should not occur here\n"); + } + } +} + +########################################################################### +# Here the information about the structure is added to the graph. In a +# first step each base-pair is added by adding edges between the vertices +# of the respective nucleotides. Then, if two base-pairs are stacked, +# we add an extra vertex and connect the four nucleotide vertices that +# are involved with extra edges (symbols P used for vertex and p for edges). +# While adding new vertices we keep track of the current graph index. +# +# Input: +# curr_shrep : the current shrep as dot-bracket structure +# curr_gi : the current graph index +# win_size : the window size +# stacks : the input parameter -stacks (whether to add stacking information) +# +# Output: +# The structure graph information line-by-line as an array reference and +# the current graph index. +############################################################################ +sub getStructGraph { + my ( $curr_shrep, $curr_gi, $win_size, $stacks ) = @_; + + my @struct = split( "", $curr_shrep->[0] ); + + my @edg = (); + my @starts = (); + my @pairs = (); + + foreach my $idx ( 0 .. @struct - 1 ) { + + push( @starts, $idx ) if ( $struct[$idx] eq "(" ); + + if ( $struct[$idx] eq ")" ) { + my $start = $curr_gi + pop(@starts); + my $end = $curr_gi + $idx; + push( @edg, "e " . $start . " " . $end . " s " ); + my @pair = ( $start, $end ); + push( @pairs, \@pair ); + } + } + + # update current graph index + $curr_gi = $curr_gi + $win_size; + + # initiate structure lines array + my @stacking_info = (); + + # add stacking information to graph unless input option tells us not to + if ($stacks) { + + my $stacked_pairs = 0; + + # get stacked base-pairs (they are ordered according to + # position of closing nucleotide) + for ( my $i = 1 ; $i < @pairs ; $i++ ) { + + # add stacked base-pairs (vertices+edges) + my $curr_bp_aref = $pairs[$i]; + my $prev_bp_aref = $pairs[ $i - 1 ]; + + # if the current base-pair is stacked on the previous base-pair, + # when curr_start = prev_start - 1 AND curr_end = prev_end + 1 + if ( $curr_bp_aref->[0] == $prev_bp_aref->[0] - 1 + && $curr_bp_aref->[1] == $prev_bp_aref->[1] + 1 ) { + + # add stacking vertex P + push( @stacking_info, "v $curr_gi P" ); + + # add four edges from involved nucleotids + push( @stacking_info, "e $curr_gi $prev_bp_aref->[0] p" ); + push( @stacking_info, "e $curr_gi $prev_bp_aref->[1] p" ); + push( @stacking_info, "e $curr_bp_aref->[0] $curr_gi p" ); + push( @stacking_info, "e $curr_bp_aref->[1] $curr_gi p" ); + ++$curr_gi; # add one to the index, ready for next vertex + + } + } + } + + my @str_graphlines = ( @edg, @stacking_info ); + return ( \@str_graphlines, $curr_gi ); +} + +########################################################################### +# This method gives us the overall graph header for this sequence. This +# graph includes all window calculations and all shreps for all windows. +# Input: +# seq_id : the sequence ID +# seq_head : the header information for the sequence +# wins_aref : the windows sizes for folding +# h_shift : input parameter -shift +# h_e : input parameter -e +# h_c : input parameter -c +# h_t : input parameter -t +# h_u : input parameter -u +# h_r : input parameter -r +# h_M : input parameter -M +# h_crop_unpaired_ends : input parameter -cue +# +# Output: +# The header line as a string +############################################################################ +sub getGraphHeader { + my ( $seq_id, $seq_head, $wins_aref, $h_shift, $h_e, $h_c, $h_t, $h_u, $h_r, $h_M, $h_crop_unpaired_ends, $h_i, $h_sample_len, $h_q, $h_Tp, $h_seqlen, $h_noStr ) = @_; + + my $ret; + + $ret = "t # SEQID $seq_id "; + $ret .= "$seq_head " if ( defined $seq_head ); + $ret .= "SEQLEN $h_seqlen "; + + return $ret . "\n" if ($h_noStr); + + $ret .= "MAXWINSHREPS $h_M "; + $ret .= "CUE 1 " if ($h_crop_unpaired_ends); + $ret .= "RNASHAPES -w " . ( join( ",", @{$wins_aref} ) ) . " "; + $ret .= "-SHIFT%w $h_shift " if ( not $globalFolding and $h_shift ); + $ret .= "-SHIFT 1 " if ( not $globalFolding and not $h_shift ); + $ret .= "-e $h_e " if ( defined $h_e ); + $ret .= "-c $h_c " if ( defined $h_c ); + $ret .= "-t $h_t " if ( defined $h_t ); + $ret .= "-u 1 " if ($h_u); + $ret .= "-r 1 " if ($h_r); + $ret .= "-i $h_i SAMPLE_LEN $h_sample_len" if ( not $h_q and $h_i ); + $ret .= "-q 1 " if ($h_q); + $ret .= "-T $h_Tp" if ( $h_q and defined $h_Tp ); + $ret .= "\n"; + + return $ret; +} + +########################################################################### +# This method gives us the window header for the subgraph that includes +# all shreps for the current window. +# Input: +# graphHead : the header line for the entire sequence graph +# win_size : the size of the current window +# win_start : the starting position of the current window +# win_end : the end position of the current window +# win_center : the centre position of the current window +# win_global_fold : (boolean) whether the actual number of nucleotides +# is smaller than the given window size (end of sequence?) +# +# Output: +# The header line as a string +############################################################################ +sub getWindowHeader { + my ( $graphHead, $win_size, $win_start, $win_end, $win_center, $win_global_fold, $win_sample, $used_t ) = @_; + + chomp $graphHead; + + $graphHead =~ s/t # //; + + my $ret = "w # $graphHead "; + $ret .= "SHAPE_TYPE $used_t "; + $ret .= "GLOBALFOLD $win_global_fold "; + $ret .= "WSIZE $win_size "; + $ret .= "WSTART $win_start "; + $ret .= "WEND $win_end "; + $ret .= "WCENT $win_center "; + $ret .= "WSAMPLE $win_sample "; + $ret .= "\n"; + + return $ret; +} + +########################################################################### +# This method gives us the shape header that includes the single shrep +# connected components. +# Input: +# winHead : the header for the current window +# shrep : information about the current shrep +# +# Output: +# The header line as a string +############################################################################ +sub getShapeHeader { + my ( $winHead, $shrep ) = @_; + + chomp $winHead; + + $winHead =~ s/w # //; + $winHead =~ s/t # //; + + my $ret = "s # $winHead "; + + my @info = @{$shrep}; + $ret .= join( " ", @info[ 1 .. $#info ] ); + $ret .= "\n"; + + return $ret; +} + +########################################################################### +# This method gives us the sequence header that includes the unstructured +# sequence connected components. +# Input: +# winHead : the header for the current window +# shrep : information about the current shrep +# +# Output: +# The header line as a string +############################################################################ +sub getSeqHeader { + my ( $winHead, $shrep ) = @_; + + chomp $winHead; + + $winHead =~ s/w # //; + $winHead =~ s/t # //; + + my $ret = "u # $winHead "; + + my @info = @{$shrep}; + $ret .= join( " ", @info[ 1 .. $#info ] ); + $ret .= "\n"; + + return $ret; +} + + +################################################################################## +# This method parses a fasta file and is useful if the header ID is not-unique!! +# It returns the header lines in an array and the sequence lines in the same +# order. It is then up to the user to extract the parts from the header that is +# necessary for the script. +# Furthermore, the method deals with multiple lines, and returns a single sequence +# without line breaks. +# Input: +# file The name of the fasta file +# Output: +# (1) An array reference for each header line +# (2) An array reference for each sequence in same order as the headers +################################################################################## +sub read_fasta_with_nonunique_headers_meta { + my ($file) = @_; + my $FUNCTION = "read_fasta_file in Sequences.pm"; + + my $header = ""; + my $seqstring = ""; + my @headers = (); + my @sequences = (); + my @meta = (); + my %seq_meta = (); + open( IN_HANDLE, "<$file" ) || die "ERROR in $FUNCTION:\n" . "Couldn't open the following file in package Tool," . " sub read_fasta_file: $file\n"; + + while ( my $line = <IN_HANDLE> ) { + chomp($line); + + # header (can contain one space after > symbol) + if ( $line =~ /^\>(.*)/ ) { + if ($header) { + $seqstring =~ s/\s*//g; ## do not allow spaces in sequence + push( @headers, $header ); + push( @sequences, $seqstring ); + push( @meta, +{%seq_meta} ); ## anonymous hash reference + #print keys %seq_meta; + $seqstring = ""; + undef(%seq_meta); + } + $header = $1; + + } elsif ( $line =~ /(.+)\s+(#\S+)$/ && $header ) { + + if ( exists $seq_meta{$2} ) { + $seq_meta{$2} .= $1; + } else { + $seq_meta{$2} = $1; + } + + } elsif ($header) { + $seqstring .= $line + } + } + + if ($header) { + $seqstring =~ s/\s*//g; ## do not allow spaces in sequence + push( @headers, $header ); + push( @sequences, $seqstring ); + push( @meta, +{%seq_meta} ); + $seqstring = ""; + %seq_meta = (); + } + return ( \@headers, \@sequences, \@meta ); +} + + +1; + +############################################################################# +# Programming description +############################################################################# +# Substructure graphs for machine learning with Fabrizio +# ------------------------------------------------------- +# +# (1) Parameters (RNAshapes parameter): +# - Window sizes [] (-w) +# - window shift size (-W) +# - calculate structure probabilities (-r) +# - energy range kcal/mol (-e) OR energy relative percentage (%) to MFE (-c) +# - shape type 1 to 5 (-t) +# - ignore unstable substructures (-u) +# - max shreps +# +# +# (2) For each sequence, generate one graph/file that consists of all windows. The general format for one graph is as follows: +# +# t # seq_id parameters +# v graph_index nt_type window_size window_centre abs_seq_pos +# ... +# e m m+1 > 1 (backbone) +# ... +# e base_i_graph_index base_j_graph_index s shrep_e shrep_p ... +# +# For each window (subgraph) we create a subgraph (of the subgraph) for each substructure. +# We have a running index (gi) for each subgraph. All vertex and edge indices of the subgraph add +# the running graph index to the actual window position. For example +# +# +# Sequence: AAACC CUUUG GG +# 01234 56789 01 +# +# Window=10 substructure1 = (((...))). centre 5.5 +# +# v 0 A 10 5.5 0 +# v 1 A 10 5.5 1 +# v 2 A 10 5.5 2 +# v 3 C 10 5.5 3 +# v 4 C 10 5.5 4 +# v 5 C 10 5.5 5 +# v 6 U 10 5.5 6 +# v 7 U 10 5.5 7 +# v 8 U 10 5.5 8 +# v 9 G 10 5.5 9 +# e 0 1 > 1 +# e 1 2 > 1 +# e 2 3 > 1 +# e 3 4 > 1 +# e 4 5 > 1 +# e 5 6 > 1 +# e 6 7 > 1 +# e 7 8 > 1 +# e 8 9 > 1 +# e 0 8 s -15.0 0.1223 +# e 1 7 s -15.0 0.1223 +# e 2 6 s -15.0 0.1223 +# +# gi = 9+1 = 10 +# +# Window = 10 substructure2 = .(((...))) centre 6.5 +# v 10 A 10 6.5 2 +# v 11 C 10 6.5 3 +# v 12 C 10 6.5 4 +# v 13 C 10 6.5 5 +# v 14 U 10 6.5 6 +# v 15 U 10 6.5 7 +# v 16 U 10 6.5 8 +# v 17 G 10 6.5 9 +# v 18 G 10 6.5 10 +# v 19 G 10 6.5 11 +# e 10 11 > 1 +# e 11 12 > 1 +# e 12 13 > 1 +# e 13 14 > 1 +# e 14 15 > 1 +# e 15 16 > 1 +# e 16 17 > 1 +# e 17 18 > 1 +# e 18 19 > 1 +# e 11 19 s -17.0 0.156 +# e 12 18 s -17.0 0.156 +# e 13 17 s -17.0 0.156 +# +# gi = 19+1 = 20 +# +# +# +# Write one perl script to create graphs for a set of sequences, fasta2shrep_gspan.pl. +# +# INPUT: +# -f fasta file with all sequences to compute +# parameters as above +# +# OUTPUT: +# one file per sequence that contains graph called seq_id.gspan +# +# (1) for each window size call RNAshapes and write to a tmp file +# (2) parse result of RNAshapes (catch RNAshapes error - sequence too long?) check for max shreps. +# (3) convert RNAshapes result to subgraph -> write to file (readpipe) look at efficiency and errors +# (4) repeat (1) to (3) for each sequence + +# ABSTRACT STRUCTURE GRAPH +# To each shrep graph (s #) add an abstract structure graph with special +# abstract relations. For example, to add the abstract structure graph to +# the previous shrep structure with gi numbers from 10 to 19, we first +# identify the abstract shape, i.e. EL-S-HL and then create nodes (labelled +# with a given name-space followed the actual node name and separated by a +# hash) and edges for this graph as follows: +# NOTE: at the moment we add the adjacent base-pairs to the loop definitions +# but these can be removed if necessary in the future +# v 20 abstruct#EL +# v 21 abstruct#S +# v 22 abstruct#HL +# e 20 21 n +# e 21 22 n +# v 23 ^R +# e 20 23 ^r +# e 23 10 @r +# e 23 11 @r +# e 23 19 @r +# v 24 ^R +# e 21 24 ^r +# e 24 11 @r +# e 24 12 @r +# e 24 13 @r +# e 24 17 @r +# e 24 18 @r +# e 24 19 @r +# v 25 ^R +# e 22 25 ^r +# e 25 13 @r +# e 25 14 @r +# e 25 15 @r +# e 25 16 @r +# e 25 17 @r + +# gi = 25+1 = 26 + +# ANOTATION FILE +# This is a file that labels a sequence region with a given annotation. In one +# file we can have annotations within different name-spaces, for example target +# sites predicted with different tools. +# File format is a tab-delimited file with the following columns: +# SEQID (same ID as in the fasta file header) +# i (left positon of region) +# j (right position of region, if one position i=j) +# NAMESPACE#LABEL (give each annotation type one name-space and choose a label depending on the task) + +# E.g we have 2 different miRNAs and we predict the target-sites with (1) IntaRNA and (2) TargetSearch, +# then this could be IntaRNA#miR1, IntaRNA#miR2, TargetSearch#miR1, and TargetSearch#miR2. +# All labels can be used more than once for one or more sequences. +# all labels with the same namespace and the same sequence ID are grouped into +# one abstract graph, according to the order of i. +# Example +# SEQID i j NAMESPACE#LABEL +# s1 10 20 IntaRNA#miR1 +# s1 54 60 IntaRNA#miR2 +# s1 15 25 TargetSearch#miR1 +# s1 54 60 TargetSearch#miR2 +# s2 ... + +# We create connected component per sequence ID per name-space, +# so that all labels with +# the same namespace are grouped together (for one sequence) as follows: +# v 26 IntaRNA#miR1 +# v 27 IntaRNA#miR2 +# e 26 27 n +# then create R nodes and ^r and @r relations as before (from i to j) + +# If a sequence graph option is given, then these graphs (u #) have to +# be labelled with this annotation +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/readme.rst Tue Oct 29 11:10:19 2013 -0400 @@ -0,0 +1,48 @@ +Galaxy wrapper for fasta2shrep_gspan +==================================== + +This wrapper is copyright 2013 by Björn Grüning. + +This is a wrapper for the command line tool of fasta2shrep_gspan.pl. + + + + +Installation +============ + +The recommended installation is by means of the toolshed_. +If you need to install it manually here is a short introduction. + +.. _toolshed: http://toolshed.g2.bx.psu.edu/view/bgruening/sequence2gspan + + + +History +======= + +- v0.1: Initial public release + + +============= +Licence (MIT) +============= + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/repository_dependencies.xml Tue Oct 29 11:10:19 2013 -0400 @@ -0,0 +1,4 @@ +<?xml version="1.0"?> +<repositories description="This requires the EDeN datatypes."> + <repository changeset_revision="59b3b6ce10bb" name="eden_toolbox" owner="bgruening" toolshed="http://testtoolshed.g2.bx.psu.edu" /> +</repositories>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sequence2gspan.xml Tue Oct 29 11:10:19 2013 -0400 @@ -0,0 +1,167 @@ +<tool id="sequence2gpsan_rnashapes" name="Sequence to gspan" version="0.3"> + <description>converter using RNAshapes</description> + <parallelism method="multi" split_inputs="infile" split_mode="to_size" split_size="10" shared_inputs="" merge_outputs="outfile"></parallelism> + <requirements> + <requirement type="package" version="2.1.6">rnashapes</requirement> + </requirements> + <command interpreter="perl"> + + fasta2shrep_gspan.pl + --fasta $infile + + #if $annotate_infile: + -annotate $annotate_infile + #end if + + -shift $shift + $cue + $stack + #if $energy_opts.energy_opts_selector == 'energy': + -e $energy_opts.e + #elif $energy_opts.energy_opts_selector == 'rel_energy': + -c $energy_opts.c + #end if + + #if $structure_sampling_opts.structure_sampling_opts_selector == 'on': + -i $structure_sampling_opts.i + -sample_len $structure_sampling_opts.sample_len + #end if + + $q + -Tp $Tp + $seq_graph_win + $seq_graph_t + $seq_graph_alph + $abstr + $nostr + $vp + $ignore_header + -M $M + $u + $r + + -stdout + + > $output + + </command> + <inputs> + <param name="ifile" type="data" format="fasta" label="Nucleotide Sequence in FASTA format"/> + + <!-- todo repeat tag, normally only 3 windows are specified--> + <param name="wins" type="text" value="" label="A list of window sizes to use" + help="if none are given, then the entire sequence is taken with no windows. Each window > 1 required! Example: 50,100,200. (-wins)" /> + <param name="t" type="text" value="3=0" label="The shape type (RNAshapes types from 1-5)" + help="With the list format, the shape level can be changed for different window length '4=100' means that shape level 4 is used from length 100nt (window length) The first given length has to be 0! Not continuous given levels are allowed! (-t)" /> + + + <param name="shift" type="integer" value="1" label="The shift of the window, relative to the window size given in percent" + help="Example 20 % of a window 150 would result in a step size of 30 nt. It is a relative parameter, as you can give different window sizes. Default shift of 1 nt. (-shift)"> + <validator type="in_range" min="0" max="100" /> + </param> + <param name="cue" type="boolean" label="Crop unpaired ends" truevalue="-cue" falsevalue="" checked="false" + help="If you give this flag, then the unpaired ends of each single structure are ignored. E.g. the structure ...(((...))).. becomes just (((...))). (-cue)" /> + + <param name="stack" type="boolean" label="Adds stacking information to graphs" truevalue="-stack" falsevalue="" checked="false" + help="This adds an additional vertex (type P) for each pair of stacked base-pairs and four edges (type p) from each of the involved bases to the new vertex. (-stack)" /> + + <conditional name="energy_opts"> + <param name="energy_opts_selector" type="select" label="Energy calculation mode"> + <option value="rel_energy">Relative energy range, i.e. percentage (%) of MFE energy (RNAshapes)</option> + <option value="energy">Energy range in kcal/mol (RNAshapes)</option> + <option value="">None</option> + </param> + <when value="" /> + <when value="energy"> + <param name="e" type="float" value="5.0" label="Energy range in kcal/mol (RNAshapes)" help="-e"> + <validator type="in_range" min="0.0" /> + </param> + </when> + <when value="rel_energy"> + <param name="c" type="float" value="10" label="Relative energy range, i.e. percentage (%) of MFE energy (RNAshapes)" help="(-c)"> + <validator type="in_range" min="0.0" /> + </param> + </when> + </conditional> + <param name="M" type="integer" value="0" label="Max number of shreps that should be taken per window" help="0 means selects all shreps. (-M)"> + <validator type="in_range" min="0" /> + </param> + <param name="u" type="boolean" label="Ignore unstable RNAshapes" truevalue="-u" falsevalue="" checked="false" + help="This option filters out closed structures with positive free energy. (-u)" /> + + <param name="r" type="boolean" label="Calculate structure probabilities for RNAshapes" help="(-r)" truevalue="-r" falsevalue="" checked="false" /> + + + <conditional name="structure_sampling_opts"> + <param name="structure_sampling_opts_selector" type="select" label="Structure Sampling"> + <option value="on">On</option> + <option value="off">Off</option> + </param> + <when value="on"> + <param name="i" type="integer" value="1" label="Number of sampling iterations" help="(-i)"> + <validator type="in_range" min="1" /> + </param> + <param name="smaple_len" type="integer" value="0" label="Sampling is only used for seqs/windows >= given length" help="Default 0 (sample all length). (-sample-len)"> + <validator type="in_range" min="0" /> + </param> + </when> + <when value="off" /> + </conditional> + + <param name="q" type="boolean" label="Turn on shape probabilities for RNAshapes, no sampling mode allowed" help="(-q)" truevalue="-q" falsevalue="" checked="false" /> + <param name="Tp" type="float" value="0.001" label="Filter cutoff for shape probabilities, applied before -M filter!" help="(-Tp)"> + <validator type="in_range" min="0.0" /> + </param> + + <param name="seq_graph_win" type="boolean" label="Add for each window a graph which contains no structure" + help="(-seq-graph-win)" + truevalue="-seq-graph-win" falsevalue="" checked="false" /> + <param name="seq_graph_t" type="boolean" label="add for each 't #' a graph which contains no structure" + help="(-seq-graph-t)" + truevalue="-seq-graph-t" falsevalue="" checked="false" /> + <param name="seq_graph_alph" type="boolean" label="Change the alphabet of unstructured graphs" + help="(-seq-graph-alph)" + truevalue="-seq-graph-alph" falsevalue="" checked="false" /> + + <param format="tabular" name="annotate_infile" type="data" optional="True" label="A file with annotations to be added as abstract graphs" + help="The format is has the following TAB-delimited columns: SEQID, START, END, NAMESPACE#LABEL. Labels with the same name-space and SEQID form connected components, which is a sequence of label vertices ordered by the START position in the sequence."/> + + <param name="abstr" type="boolean" label="Add abstract structure graphs to the single shrep graph instances" + help="(-abstr)" + truevalue="-abstr" falsevalue="" checked="false" /> + <param name="nostr" type="boolean" label="Calculate no structures, only add sequence information" + help="-seq-graph-win AND/OR -seq-graph-t are required (-nostr)" + truevalue="-nostr" falsevalue="" checked="false" /> + + <!-- + -match-shape SHAPE + all seqs/windows will be constraint folded into that shape via + RNAshapes (if structure is given in another way this struct will be kept), + if this shape is not possible within given energy range, produce a + specific t graph with only one vertex 'X'. By this the instance + becomes very unsimilar to all other graphs (for knn) + --> + <param name="vp" type="boolean" label="Enable graph computation with viewpoints" + help="svmsgdnspdk will center on those nucleotides that are given via capital letters and ignore those given as lowercase letters (-vp)" + truevalue="-vp" falsevalue="" checked="false" /> + + <param name="ignore_header" type="boolean" label="Don't write fasta id part after first space to gspan" help="(-ignore-header)" truevalue="-ignore-header" falsevalue="" checked="false" /> + + </inputs> + <outputs> + <data format="gspan" name="output" label="${tool.name} on ${on_string}" /> + </outputs> + <tests> + </tests> + <help> + +**What it does** + + + +**References** + +Sita et al... + + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Tue Oct 29 11:10:19 2013 -0400 @@ -0,0 +1,6 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="rnashapes" version="2.1.6"> + <repository changeset_revision="e9a43f76591a" name="package_rnashapes_2_1_6" owner="rnateam" toolshed="http://testtoolshed.g2.bx.psu.edu" /> + </package> +</tool_dependency>