snpsift_dbnsfp_datatypes
changeset 0:8a04c5e6bfe3

Imported from capsule None

author:    jjohnson
date:      Tue, 14 Oct 2014 14:27:38 -0400
parents:   (none)
children:  df236b5e2985
files:     datatypes_conf.xml
           lib/galaxy/datatypes/converters/tabular_to_dbnsfp.py
           lib/galaxy/datatypes/converters/tabular_to_dbnsfp.xml
           lib/galaxy/datatypes/snpsift_dbnsfp.py
diffstat:  4 files changed, 146 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes_conf.xml	Tue Oct 14 14:27:38 2014 -0400
@@ -0,0 +1,13 @@
+<?xml version="1.0"?>
+<datatypes>
+    <datatype_files>
+        <datatype_file name="snpsift_dbnsfp.py"/>
+    </datatype_files>
+    <registration>
+        <datatype extension="snpsiftdbnsfp" type="galaxy.datatypes.snpsift_dbnsfp:SnpSiftDbNSFP" display_in_upload="True"/>
+        <datatype extension="dbnsfp.tabular" type="galaxy.datatypes.tabular:Tabular" subclass="True" display_in_upload="True">
+            <converter file="tabular_to_dbnsfp.xml" target_datatype="snpsiftdbnsfp"/>
+        </datatype>
+    </registration>
+</datatypes>
+
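For reference, the type attribute in the registration above follows Galaxy's "module:Class" convention. The snippet below is an illustrative sketch (not Galaxy's actual registry code) of how such a string resolves to a datatype class:

    # Illustrative sketch of how a type="module:Class" attribute maps to a class.
    # Not Galaxy's real loader; shown only to clarify the registration entry above.
    import importlib

    def resolve_datatype(type_string):
        module_name, class_name = type_string.split(':')
        module = importlib.import_module(module_name)
        return getattr(module, class_name)

    # resolve_datatype('galaxy.datatypes.snpsift_dbnsfp:SnpSiftDbNSFP')
    # would return the SnpSiftDbNSFP class defined in snpsift_dbnsfp.py below.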
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/galaxy/datatypes/converters/tabular_to_dbnsfp.py	Tue Oct 14 14:27:38 2014 -0400
@@ -0,0 +1,34 @@
+#!/usr/bin/env python
+
+"""
+Uses pysam to bgzip a file
+
+usage: %prog in_file out_file
+"""
+
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "pysam" )
+import ctabix, subprocess, tempfile, sys, optparse, os.path
+
+def main():
+    # Read options, args.
+    usage = "Usage: %prog [options] tabular_input_file bgzip_output_file"
+    parser = optparse.OptionParser(usage = usage)
+    parser.add_option( '-c', '--chr-col', type='int', default=0, dest='chrom_col' )
+    parser.add_option( '-s', '--start-col', type='int', default=1, dest='start_col' )
+    parser.add_option( '-e', '--end-col', type='int', default=1, dest='end_col' )
+    (options, args) = parser.parse_args()
+    if len(args) != 2:
+        parser.print_usage()
+        exit(1)
+    input_fname, output_fname = args
+    output_dir = os.path.dirname(output_fname)
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+    ctabix.tabix_compress(input_fname, output_fname, force=True)
+    # Column indices are 0-based.
+    ctabix.tabix_index(output_fname, seq_col=options.chrom_col, start_col=options.start_col, end_col=options.end_col)
+
+if __name__ == "__main__":
+    main()
+
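The converter relies on the legacy galaxy.eggs / ctabix imports that shipped with Galaxy at the time. A rough equivalent using the public pysam API (assuming pysam is installed; file names are illustrative) would look like this:

    # Rough modern equivalent of the converter above, using the public pysam API
    # instead of galaxy.eggs / ctabix. Paths are hypothetical.
    import os
    import pysam

    def bgzip_and_index(input_fname, output_fname, chrom_col=0, start_col=1, end_col=1):
        out_dir = os.path.dirname(output_fname)
        if out_dir and not os.path.exists(out_dir):
            os.makedirs(out_dir)
        # Block-gzip the tabular file so tabix can index it.
        pysam.tabix_compress(input_fname, output_fname, force=True)
        # Column indices are 0-based, matching the original script's defaults.
        pysam.tabix_index(output_fname, seq_col=chrom_col, start_col=start_col, end_col=end_col, force=True)

    # Example (hypothetical paths):
    # bgzip_and_index('dbNSFP2.3.tabular', 'extra_files/dbNSFP.gz')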
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/galaxy/datatypes/converters/tabular_to_dbnsfp.xml	Tue Oct 14 14:27:38 2014 -0400
@@ -0,0 +1,12 @@
+<tool id="tabular_to_dbnsfp" name="Convert tabular to dbnsfp" version="1.0.0">
+    <description></description>
+    <command interpreter="python">tabular_to_dbnsfp.py $input $dbnsfp.dataset.extra_files_path/dbNSFP.gz</command>
+    <inputs>
+        <param format="tabular" name="input" type="data" label="Choose a dbnsfp tabular file"/>
+    </inputs>
+    <outputs>
+        <data format="snpsiftdbnsfp" name="dbnsfp"/>
+    </outputs>
+    <help>
+    </help>
+</tool>
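For orientation, here is a hypothetical expansion of the command template above; at runtime Galaxy substitutes the Cheetah variables $input and $dbnsfp.dataset.extra_files_path with real dataset paths, so the bgzipped database lands inside the composite dataset's extra files directory:

    # Sketch of how the <command> template expands; the paths are hypothetical.
    input_path = '/galaxy/datasets/dataset_1.dat'            # $input (the tabular dbNSFP file)
    extra_files_path = '/galaxy/datasets/dataset_2_files'    # $dbnsfp.dataset.extra_files_path
    command = 'python tabular_to_dbnsfp.py %s %s/dbNSFP.gz' % (input_path, extra_files_path)
    # -> python tabular_to_dbnsfp.py /galaxy/datasets/dataset_1.dat /galaxy/datasets/dataset_2_files/dbNSFP.gz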
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/galaxy/datatypes/snpsift_dbnsfp.py	Tue Oct 14 14:27:38 2014 -0400
@@ -0,0 +1,87 @@
+"""
+SnpSift dbNSFP datatypes
+"""
+import os,os.path,re,sys,gzip,logging
+import traceback
+import galaxy.datatypes.data
+from galaxy.datatypes.data import Text
+from galaxy.datatypes.metadata import MetadataElement
+
+log = logging.getLogger(__name__)
+
+class SnpSiftDbNSFP( Text ):
+    """Class describing a dbNSFP database prepared for use by SnpSift dbnsfp """
+    MetadataElement( name='reference_name', default='dbSNFP', desc='Reference Name', readonly=False, visible=True, set_in_upload=True, no_value='dbSNFP' )
+    MetadataElement( name="bgzip", default=None, desc="dbNSFP bgzip", readonly=True, visible=True, no_value=None )
+    MetadataElement( name="index", default=None, desc="Tabix Index File", readonly=True, visible=True, no_value=None )
+    MetadataElement( name="annotation", default=[], desc="Annotation Names", readonly=True, visible=True, no_value=[] )
+    file_ext = "snpsiftdbnsfp"
+    composite_type = 'auto_primary_file'
+    allow_datatype_change = False
+    """
+    ## The dbNSFP file is a tabular file with 1 header line
+    ## The first 4 columns are required to be: chrom pos ref alt
+    ## These match columns 1,2,4,5 of the VCF file
+    ## SnpSift requires the file to be block-gzipped and then indexed with samtools tabix
+    ## Example:
+    ## Compress using block-gzip algorithm
+    bgzip dbNSFP2.3.txt
+    ## Create tabix index
+    tabix -s 1 -b 2 -e 2 dbNSFP2.3.txt.gz
+    """
+    def __init__( self, **kwd ):
+        Text.__init__( self, **kwd )
+        self.add_composite_file( '%s.grp', description = 'Group File', substitute_name_with_metadata = 'reference_name', is_binary = False )
+        self.add_composite_file( '%s.ti', description = '', substitute_name_with_metadata = 'reference_name', is_binary = False )
+    def init_meta( self, dataset, copy_from=None ):
+        Text.init_meta( self, dataset, copy_from=copy_from )
+    def generate_primary_file( self, dataset = None ):
+        """
+        This is called only at upload to write the html file.
+        Cannot rename the datasets here - they come with the default unfortunately.
+        """
+        self.regenerate_primary_file( dataset )
+    def regenerate_primary_file( self, dataset ):
+        """
+        Cannot do this until we are setting metadata.
+        """
+        annotations = "dbNSFP Annotations: %s\n" % ','.join( dataset.metadata.annotation )
+        f = open( dataset.file_name, 'a' )
+        if dataset.metadata.bgzip:
+            bn = dataset.metadata.bgzip
+            f.write( bn )
+            f.write( '\n' )
+        f.write( annotations )
+        f.close()
+    def set_meta( self, dataset, overwrite=True, **kwd ):
+        try:
+            # Text.set_meta( self, dataset, **kwd )
+            efp = dataset.extra_files_path
+            if os.path.exists( efp ):
+                flist = os.listdir( efp )
+                for i, fname in enumerate( flist ):
+                    # log.info( "set_meta fname: %d %s" % ( i, fname ) )
+                    if fname.endswith( '.gz' ):
+                        dataset.metadata.bgzip = fname
+                        try:
+                            fh = gzip.open( os.path.join( efp, fname ), 'r' )
+                            buf = fh.read( 5000 )
+                            lines = buf.splitlines()
+                            headers = lines[0].split( '\t' )
+                            dataset.metadata.annotation = headers[4:]
+                        except Exception, e:
+                            log.warn( "set_meta fname: %s %s" % ( fname, str( e ) ) )
+                            traceback.print_stack( file=sys.stderr )
+                        finally:
+                            fh.close()
+                    if fname.endswith( '.tbi' ):
+                        dataset.metadata.index = fname
+                self.regenerate_primary_file( dataset )
+        except Exception, e:
+            log.warn( "set_meta fname: %s %s" % ( dataset.file_name if dataset and dataset.file_name else 'Unknown', str( e ) ) )
+            traceback.print_stack( file=sys.stderr )
+
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod( sys.modules[__name__] )
+
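As a standalone illustration of what set_meta() extracts, the sketch below reads the header line of a bgzipped dbNSFP file and treats everything after the first four required columns (chrom, pos, ref, alt) as annotation names, mirroring headers[4:] above; the file name and example column names are hypothetical and depend on the dbNSFP release.

    # Sketch of the annotation discovery performed by set_meta(); bgzip output is
    # gzip-compatible, so the standard gzip module can read the header line.
    import gzip

    def read_annotation_names(dbnsfp_gz):
        with gzip.open(dbnsfp_gz, 'rt') as fh:
            header = fh.readline().rstrip('\n').split('\t')
        # Columns 1-4 are chrom, pos, ref, alt; the rest are annotation names.
        return header[4:]

    # e.g. read_annotation_names('dbNSFP.gz') might return
    # ['aaref', 'aaalt', 'SIFT_score', ...] depending on the dbNSFP release.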