changeset 2:8a2d16bfdae2 draft

Uploaded
author fubar
date Fri, 03 Oct 2014 21:59:39 -0400
parents ebadd2c92958
children 2468ef56337a
files README danfix.patch data_manager/rnastar_index_builder.py data_manager/rnastar_index_builder.py.orig data_manager/rnastar_index_builder.py.rej data_manager/rnastar_index_builder.xml
diffstat 6 files changed, 241 insertions(+), 17 deletions(-) [+]
line wrap: on
line diff
--- a/README	Mon Sep 29 20:47:45 2014 -0400
+++ b/README	Fri Oct 03 21:59:39 2014 -0400
@@ -1,1 +1,40 @@
-TODO
\ No newline at end of file
+*What it does*
+
+This is a Galaxy datamanager for the rna STAR gap-aware RNA aligner. It's a hack of Dan Blankenberg's BWA data manager
+and works on any fasta file you have already downloaded with the all fasta data manager - start there!
+
+Warning - this is not well tested and there are some complexities to do with splice junction annotation in rna star
+indexes - feedback welcomed. Send code.
+
+Note, currently you'll need a small patch to prevent an error when you try to generate splice junction indexes described at
+https://bitbucket.org/galaxy/galaxy-central/pull-request/510/fix-for-data-manager-failure-to-update-a#comment-3265356
+
+Please read the fine manual - that and the google group are the places to learn about the options above.
+
+*Note on sjdbOverhang*
+
+From https://groups.google.com/forum/#!topic/rna-star/h9oh10UlvhI::
+
+  James is right, using large enough --sjdbOverhang is safer and should not generally cause any problems with reads of varying length. If your reads are very short, <50b, then I would strongly recommend using optimum --sjdbOverhang=mateLength-1
+  By mate length I mean the length of one of the ends of the read, i.e. it's 100 for 2x100b PE or 1x100b SE. For longer reads you can simply use generic --sjdbOverhang 100.
+  It is a bit confusing because of the way I named this parameter. --sjdbOverhang Noverhang is only used at the genome generation step  for constructing the reference sequence out of the annotations.
+  Basically, the Noverhang exonic bases from the donor site and Noverhang exonic bases from the acceptor site are spliced together for each of the junctions, and these spliced sequences are added to the genome sequence.
+
+  At the mapping stage, the reads are aligned to both genomic and splice sequences simultaneously. If a read maps to one of spliced sequences and crosses the "junction" in the middle of it, the coordinates of two pspliced pieces are translated back to genomic space and added to the collection of mapped pieces, which are then all "stitched" together to form the final alignment. Since in the process of "maximal mapped length" search the read is split into pieces of no longer than --seedSearchStartLmax (=50 by default) bases, even if the read (mate) is longer than --sjdbOverhang, it can still be mapped to the spliced reference, as long as --sjdbOverhang > --seedSearchStartLmax.
+
+  Cheers
+  Alex
+
+*Note on gene model requirements for splice junctions*
+
+From https://groups.google.com/forum/#!msg/rna-star/3Y_aaTuzBrE/lUylTB8h5vMJ::
+
+    When you generate a genome with annotations, you need to specify --sjdbOverhang value, which ideally should be equal to (oneMateLength-1), or you could use a generic value of ~100.
+
+    Your gtf lines look fine to me. STAR needs 3 features from a GTF file:
+    1. Chromosome names in col.1 that agree with chromosome names in genome .fasta files. If you have "chr2L" names in the genome .fasta files, and "2L" in the .gtf file, then you need to use --sjdbGTFchrPrefix chr option.
+    2. 'exon' in col.3 for the exons of all transcripts (this name can be changed with --sjdbGTFfeatureExon)
+    3. 'transcript_id' attribute that assigns each exon to a transcript (--this name can be changed with --sjdbGTFtagExonParentTranscript)
+
+    Cheers
+    Alex
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/danfix.patch	Fri Oct 03 21:59:39 2014 -0400
@@ -0,0 +1,72 @@
+
+diff -r ebadd2c92958 data_manager/rnastar_index_builder.py
+--- a/data_manager/rnastar_index_builder.py Mon Sep 29 20:47:45 2014 -0400
++++ b/data_manager/rnastar_index_builder.py Fri Oct 03 13:35:35 2014 -0400
+@@ -9,7 +9,7 @@
+ import optparse
+ import subprocess
+
+-from galaxy.util.json import from_json_string, to_json_string
++from json import loads, dumps
+
+ DEFAULT_DATA_TABLE_NAME = "rnastar_indexes"
+
+@@ -44,20 +44,11 @@
+    --sjdbGTFfile %(sjdbGTFfile)s --sjdbGTFtagExonParentTranscript %(sjdbGTFtagExonParentTranscript)s''' %  pdict
+     elif sjdbFileChrStartEnd:
+         cl += '--sjdbFileChrStartEnd %(sjdbFileChrStartEnd)s --sjdbOverhang %(sjdbOverhangs)s' % pdict
+-    tmp_stderr = tempfile.NamedTemporaryFile( prefix = "tmp-data-manager-rnastar-index-builder-stderr" )
+     args = cl.split(' ')
+-    proc = subprocess.Popen( args=args, shell=False, cwd=target_directory, stderr=tmp_stderr.fileno() )
++    proc = subprocess.Popen( args=args, shell=False, cwd=target_directory )
+     return_code = proc.wait()
+     if return_code:
+-        tmp_stderr.flush()
+-        tmp_stderr.seek(0)
+-        print >> sys.stderr, "Error building index: retcode=",retcode
+-        while True:
+-            chunk = tmp_stderr.read( CHUNK_SIZE )
+-            if not chunk:
+-                break
+-            sys.stderr.write( chunk )
+-    tmp_stderr.close()
++        sys.exit( return_code )
+     data_table_entry = dict( value=sequence_id, dbkey=dbkey, name=sequence_name, path=fasta_base_name )
+     _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry )
+
+@@ -86,7 +77,7 @@
+     (options, args) = parser.parse_args()
+
+     filename = options.out_file
+-    params = from_json_string( open( filename ).read() )
++    params = loads( open( filename ).read() )
+     target_directory = options.out_index_path
+     os.mkdir( target_directory )
+     data_manager_dict = {}
+@@ -107,6 +98,7 @@
+
+
+     #save info to json file
+-    open( filename, 'wb' ).write( to_json_string( data_manager_dict ) )
++    open( filename, 'wb' ).write( dumps( data_manager_dict ) )
+
+ if __name__ == "__main__": main()
++
+diff -r ebadd2c92958 data_manager/rnastar_index_builder.xml
+--- a/data_manager/rnastar_index_builder.xml    Mon Sep 29 20:47:45 2014 -0400
++++ b/data_manager/rnastar_index_builder.xml    Fri Oct 03 13:35:35 2014 -0400
+@@ -22,7 +22,8 @@
+ #end if
+ </command>
+     <stdio>
+-        <regex match=".*" source="both" level="warning" description="stdout/err chatter:"/>
++        <exit_code range=":-1" err_level="fatal" />
++        <exit_code range="1:" err_level="fatal" />
+     </stdio>
+
+     <inputs>
+@@ -124,3 +125,4 @@
+
+     </help>
+ </tool>
+
--- a/data_manager/rnastar_index_builder.py	Mon Sep 29 20:47:45 2014 -0400
+++ b/data_manager/rnastar_index_builder.py	Fri Oct 03 21:59:39 2014 -0400
@@ -9,7 +9,7 @@
 import optparse
 import subprocess
 
-from galaxy.util.json import from_json_string, to_json_string
+from json import loads, dumps
 
 DEFAULT_DATA_TABLE_NAME = "rnastar_indexes"
 
@@ -44,20 +44,11 @@
    --sjdbGTFfile %(sjdbGTFfile)s --sjdbGTFtagExonParentTranscript %(sjdbGTFtagExonParentTranscript)s''' %  pdict
     elif sjdbFileChrStartEnd:
         cl += '--sjdbFileChrStartEnd %(sjdbFileChrStartEnd)s --sjdbOverhang %(sjdbOverhangs)s' % pdict
-    tmp_stderr = tempfile.NamedTemporaryFile( prefix = "tmp-data-manager-rnastar-index-builder-stderr" )
     args = cl.split(' ')
-    proc = subprocess.Popen( args=args, shell=False, cwd=target_directory, stderr=tmp_stderr.fileno() )
+    proc = subprocess.Popen( args=args, shell=False, cwd=target_directory )
     return_code = proc.wait()
     if return_code:
-        tmp_stderr.flush()
-        tmp_stderr.seek(0)
-        print >> sys.stderr, "Error building index: retcode=",retcode
-        while True:
-            chunk = tmp_stderr.read( CHUNK_SIZE )
-            if not chunk:
-                break
-            sys.stderr.write( chunk )
-    tmp_stderr.close()
+        sys.exit( return_code )
     data_table_entry = dict( value=sequence_id, dbkey=dbkey, name=sequence_name, path=fasta_base_name )
     _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry )
 
@@ -86,9 +77,9 @@
     (options, args) = parser.parse_args()
     
     filename = options.out_file
-    params = from_json_string( open( filename ).read() )
+    params = loads( open( filename ).read() )
     target_directory = options.out_index_path
-    os.mkdir( target_directory )
+    os.mkdirs( target_directory )
     data_manager_dict = {}
     
     dbkey = options.fasta_dbkey
@@ -107,6 +98,6 @@
 
     
     #save info to json file
-    open( filename, 'wb' ).write( to_json_string( data_manager_dict ) )
+    open( filename, 'wb' ).write( dumps( data_manager_dict ) )
         
 if __name__ == "__main__": main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/rnastar_index_builder.py.orig	Fri Oct 03 21:59:39 2014 -0400
@@ -0,0 +1,112 @@
+#!/usr/bin/env python
+#Dan Blankenberg
+# adapted from Dan's BWA one for rna star
+# ross lazarus sept 2014
+#
+import sys
+import os
+import tempfile
+import optparse
+import subprocess
+
+from galaxy.util.json import from_json_string, to_json_string
+
+DEFAULT_DATA_TABLE_NAME = "rnastar_indexes"
+
+def get_id_name( params, dbkey, fasta_description=None):
+    #TODO: ensure sequence_id is unique and does not already appear in location file
+    sequence_id = params['param_dict']['sequence_id']
+    if not sequence_id:
+        sequence_id = dbkey
+    
+    sequence_name = params['param_dict']['sequence_name']
+    if not sequence_name:
+        sequence_name = fasta_description
+        if not sequence_name:
+            sequence_name = dbkey
+    return sequence_id, sequence_name
+
+def build_rnastar_index( data_manager_dict, fasta_filename, target_directory, dbkey, sequence_id, sequence_name, data_table_name,
+    sjdbOverhang,sjdbGTFfile, sjdbFileChrStartEnd,sjdbGTFtagExonParentTranscript,sjdbGTFfeatureExon,sjdbGTFchrPrefix,n_threads):
+    #TODO: allow multiple FASTA input files
+    #tmp_dir = tempfile.mkdtemp( prefix='tmp-data-manager-bwa-index-builder-' )
+    fasta_base_name = os.path.split( fasta_filename )[-1]
+    sym_linked_fasta_filename = os.path.join( target_directory, fasta_base_name )
+    os.symlink( fasta_filename, sym_linked_fasta_filename )
+    pdict={'target_directory':target_directory,'n_threads':n_threads, 'sjdbFileChrStartEnd':sjdbFileChrStartEnd,
+           'sjdbGTFtagExonParentTranscript':sjdbGTFtagExonParentTranscript, 'sjdbGTFfeatureExon':sjdbGTFfeatureExon,
+           'sjdbGTFchrPrefix':sjdbGTFchrPrefix,'sjdbOverhang':sjdbOverhang, 'sjdbGTFfile':sjdbGTFfile,
+           'sym_linked_fasta_filename':sym_linked_fasta_filename}
+    
+    cl = 'STAR --runMode genomeGenerate --genomeFastaFiles %(sym_linked_fasta_filename)s --genomeDir %(target_directory)s --runThreadN %(n_threads)s' % pdict
+    if sjdbGTFfile:
+         cl += '''--sjdbGTFchrPrefix %(sjdbGTFchrPrefix)s --sjdbGTFfeatureExon %(sjdbGTFfeatureExon)s --sjdbOverhang %(sjdbOverhang)s
+   --sjdbGTFfile %(sjdbGTFfile)s --sjdbGTFtagExonParentTranscript %(sjdbGTFtagExonParentTranscript)s''' %  pdict
+    elif sjdbFileChrStartEnd:
+        cl += '--sjdbFileChrStartEnd %(sjdbFileChrStartEnd)s --sjdbOverhang %(sjdbOverhangs)s' % pdict
+    tmp_stderr = tempfile.NamedTemporaryFile( prefix = "tmp-data-manager-rnastar-index-builder-stderr" )
+    args = cl.split(' ')
+    proc = subprocess.Popen( args=args, shell=False, cwd=target_directory, stderr=tmp_stderr.fileno() )
+    return_code = proc.wait()
+    if return_code:
+        tmp_stderr.flush()
+        tmp_stderr.seek(0)
+        print >> sys.stderr, "Error building index: retcode=",retcode
+        while True:
+            chunk = tmp_stderr.read( CHUNK_SIZE )
+            if not chunk:
+                break
+            sys.stderr.write( chunk )
+    tmp_stderr.close()
+    data_table_entry = dict( value=sequence_id, dbkey=dbkey, name=sequence_name, path=fasta_base_name )
+    _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry )
+
+def _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry ):
+    data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} )
+    data_manager_dict['data_tables'][ data_table_name ] = data_manager_dict['data_tables'].get( data_table_name, [] )
+    data_manager_dict['data_tables'][ data_table_name ].append( data_table_entry )
+    return data_manager_dict
+
+def main():
+    #Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option( '-f', '--fasta_filename', dest='fasta_filename', action='store', type="string", default=None, help='fasta_filename' )
+    parser.add_option( '-d', '--fasta_dbkey', dest='fasta_dbkey', action='store', type="string", default=None, help='fasta_dbkey' )
+    parser.add_option( '-t', '--fasta_description', dest='fasta_description', action='store', type="string", default=None, help='fasta_description' )
+    parser.add_option( '-n', '--data_table_name', dest='data_table_name', action='store', type="string", default=None, help='data_table_name' )
+    parser.add_option( '--out_file', default=None)
+    parser.add_option( '--out_index_path', default=None)
+    parser.add_option( '--sjdbGTFfile', type="string", default=None )
+    parser.add_option( '--sjdbGTFchrPrefix', type="string", default=None )
+    parser.add_option( '--sjdbGTFfeatureExon', type="string", default=None )
+    parser.add_option( '--sjdbGTFtagExonParentTranscript', type="string", default=None )
+    parser.add_option( '--sjdbFileChrStartEnd', type="string", default=None )
+    parser.add_option( '--sjdbOverhang', type="int", default=100 )
+    parser.add_option( '--runThreadN', type="int", default=4 )
+    (options, args) = parser.parse_args()
+    
+    filename = options.out_file
+    params = from_json_string( open( filename ).read() )
+    target_directory = options.out_index_path
+    os.mkdirs( target_directory )
+    data_manager_dict = {}
+    
+    dbkey = options.fasta_dbkey
+    
+    if dbkey in [ None, '', '?' ]:
+        raise Exception( '"%s" is not a valid dbkey. You must specify a valid dbkey.' % ( dbkey ) )
+    
+    sequence_id, sequence_name = get_id_name( params, dbkey=dbkey, fasta_description=options.fasta_description )
+    
+    #build the index
+    build_rnastar_index( data_manager_dict, options.fasta_filename, target_directory, dbkey, sequence_id, sequence_name, data_table_name=options.data_table_name,
+      sjdbOverhang=options.sjdbOverhang,sjdbGTFfile=options.sjdbGTFfile,
+      sjdbFileChrStartEnd=options.sjdbFileChrStartEnd,sjdbGTFtagExonParentTranscript=options.sjdbGTFtagExonParentTranscript,
+      sjdbGTFfeatureExon=options.sjdbGTFfeatureExon,sjdbGTFchrPrefix=options.sjdbGTFchrPrefix,
+      n_threads=options.runThreadN )
+
+    
+    #save info to json file
+    open( filename, 'wb' ).write( to_json_string( data_manager_dict ) )
+        
+if __name__ == "__main__": main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/rnastar_index_builder.py.rej	Fri Oct 03 21:59:39 2014 -0400
@@ -0,0 +1,11 @@
+--- rnastar_index_builder.py Mon Sep 29 20:47:45 2014 -0400
++++ rnastar_index_builder.py Fri Oct 03 13:35:35 2014 -0400
+@@ -98,6 +89,7 @@
+ 
+ 
+     #save info to json file
+-    open( filename, 'wb' ).write( to_json_string( data_manager_dict ) )
++    open( filename, 'wb' ).write( dumps( data_manager_dict ) )
+ 
+ if __name__ == "__main__": main()
++
--- a/data_manager/rnastar_index_builder.xml	Mon Sep 29 20:47:45 2014 -0400
+++ b/data_manager/rnastar_index_builder.xml	Fri Oct 03 21:59:39 2014 -0400
@@ -24,7 +24,6 @@
     <stdio>
         <regex match=".*" source="both" level="warning" description="stdout/err chatter:"/>
     </stdio>
-
     <inputs>
         <param name="all_fasta_source" type="select" label="Source FASTA Sequence">
             <options from_data_table="all_fasta"/>