comparison snpeff.py @ 3:9efd0d32fe8a

Add snpeff_version to snpeffdb metadata, this is available from snpeff v4.1
author Jim Johnson <jj@umn.edu>
date Tue, 13 Jan 2015 12:30:20 -0600
parents lib/galaxy/datatypes/snpeff.py@cd8f8c54bf9a
children 5d6f3622b99d
comparison
equal deleted inserted replaced
2:b33911fdbac4 3:9efd0d32fe8a
1 """
2 SnpEff datatypes
3 """
4 import os,os.path,re,sys,gzip,logging
5 import galaxy.datatypes.data
6 from galaxy.datatypes.data import Text
7 from galaxy.datatypes.metadata import MetadataElement
8
9 log = logging.getLogger(__name__)
10
class SnpEffDb(Text):
    """Datatype describing a SnpEff genome build.

    The interesting content lives in the dataset's extra-files directory;
    ``set_meta`` scans that directory to fill in the metadata elements
    declared below and writes a short plain-text summary into the primary
    dataset file.
    """
    file_ext = "snpeffdb"
    MetadataElement( name="genome_version", default=None, desc="Genome Version", readonly=True, visible=True, no_value=None )
    MetadataElement( name="snpeff_version", default=None, desc="SnpEff Version", readonly=True, visible=True, no_value=None )
    MetadataElement( name="regulation", default=[], desc="Regulation Names", readonly=True, visible=True, no_value=[], optional=True)
    MetadataElement( name="annotation", default=[], desc="Annotation Names", readonly=True, visible=True, no_value=[], optional=True)

    def __init__(self, **kwd):
        """Forward all keyword arguments to ``Text.__init__``."""
        Text.__init__(self, **kwd)

    def getSnpeffVersionFromFile(self, path):
        """Return the SnpEff version recorded at the start of a gzipped file.

        The first 100 decompressed bytes of ``path`` are read and the first
        line is matched against ``SnpEff <major>.<minor>``.

        :param path: path of a gzip-compressed file.
        :returns: ``"SnpEff"`` immediately followed by the version number
            (e.g. ``"SnpEff4.1"``), or ``None`` when the file cannot be read
            or its first line does not match.
        """
        snpeff_version = None
        try:
            fh = gzip.open(path, 'rb')
            try:
                buf = fh.read(100)
            finally:
                # Always release the handle, even if the read fails.
                fh.close()
            lines = buf.splitlines()
            if lines:
                # The file is opened in binary mode, so match with a bytes
                # pattern; a str pattern fails against bytes on Python 3.
                m = re.match(br'^(SnpEff)\s+(\d+\.\d+).*$', lines[0].strip())
                if m:
                    version = m.group(1) + m.group(2)
                    if not isinstance(version, str):
                        # Python 3: convert the matched bytes to native str.
                        version = version.decode('ascii')
                    snpeff_version = version
        except Exception as e:
            # Best effort: an unreadable or non-gzip file just means the
            # version is unknown.
            log.debug("Unable to read SnpEff version from %s: %s", path, e)
        return snpeff_version

    def set_meta(self, dataset, **kwd):
        """Populate metadata from the dataset's extra-files directory.

        Walks ``dataset.extra_files_path`` and, for each file found:

        * a name starting with ``snpEffectPredictor`` sets ``genome_version``
          to the name of the containing directory and, when it can be read
          from the file, ``snpeff_version``;
        * a name matching ``regulation_<name>.bin`` adds ``<name>`` to the
          ``regulation`` metadata;
        * ``nextProt.bin`` / ``motif.bin`` add ``nextprot`` / ``motif`` to
          the ``annotation`` metadata.

        A summary of what was found is then written to ``dataset.file_name``.
        Nothing beyond ``Text.set_meta`` happens when the extra-files
        directory does not exist.
        """
        Text.set_meta(self, dataset, **kwd)
        data_dir = dataset.extra_files_path
        if not (data_dir and os.path.isdir(data_dir)):
            return

        ## search data_dir/genome_version for files
        regulation_pattern = r'regulation_(.+)\.bin'
        # annotation files that are included in snpEff by a flag
        annotations_dict = {'nextProt.bin': '-nextprot', 'motif.bin': '-motif'}
        regulations = []
        annotations = []
        genome_version = None
        snpeff_version = None

        for root, dirs, files in os.walk(data_dir):
            for fname in files:
                if fname.startswith('snpEffectPredictor'):
                    # if snpEffectPredictor.bin download succeeded
                    genome_version = os.path.basename(root)
                    dataset.metadata.genome_version = genome_version
                    # read the first line of the gzipped snpEffectPredictor.bin
                    # file to get the SnpEff version
                    snpeff_version = self.getSnpeffVersionFromFile(os.path.join(root, fname))
                    if snpeff_version:
                        dataset.metadata.snpeff_version = snpeff_version
                    continue
                m = re.match(regulation_pattern, fname)
                if m:
                    regulations.append(m.group(1))
                elif fname in annotations_dict:
                    # The metadata stores the flag name without its dash.
                    annotations.append(annotations_dict[fname].lstrip('-'))

        dataset.metadata.regulation = regulations
        dataset.metadata.annotation = annotations

        # Parenthesise the fallback so that the newline is appended in both
        # cases; "%s\n" % x if x else y would drop it for the fallback text.
        summary = [
            "%s\n" % (genome_version if genome_version else 'Genome unknown'),
            "%s\n" % (snpeff_version if snpeff_version else 'SnpEff version unknown'),
        ]
        if annotations:
            summary.append("annotations: %s\n" % ','.join(annotations))
        if regulations:
            summary.append("regulations: %s\n" % ','.join(regulations))
        try:
            with open(dataset.file_name, 'w') as fh:
                fh.writelines(summary)
        except Exception:
            # Writing the summary is best effort; record the failure
            # instead of silently discarding it.
            log.exception("Unable to write SnpEff summary to %s", dataset.file_name)
80