0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import sys
|
|
4 import os
|
|
5 import re
|
|
6 import optparse
|
|
7 import urllib
|
|
8 import tarfile
|
|
9 import gzip
|
|
10 import json
|
|
11 import pysam
|
|
12 from pysam import ctabix
|
|
13 import zipfile
|
|
14 import os.path
|
|
15 import shutil
|
|
16
|
|
17 """
|
|
18 # Install dbNSFP databases
|
|
19 # from DbNsfp site
|
|
20 # Download dbNSFP database
|
|
21 $ wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbNSFPv2.4.zip
|
|
22 # Uncompress
|
|
23 $ unzip dbNSFPv2.4.zip
|
|
24 # Create a single file version
|
|
25 $ (head -n 1 dbNSFP2.4_variant.chr1 ; cat dbNSFP2.4_variant.chr* | grep -v "^#") > dbNSFP2.4.txt
|
|
26 # Compress using block-gzip algorithm
|
|
27 $ bgzip dbNSFP2.4.txt
|
|
28 # Create tabix index
|
|
29 $ tabix -s 1 -b 2 -e 2 dbNSFP2.4.txt.gz
|
|
30
|
|
31 data_table:
|
|
32
|
|
33 <table name="snpsift_dbnsfps" comment_char="#">
|
|
34 <columns>key, build, name, value, annotations</columns>
|
|
35 <file path="tool-data/snpsift_dbnsfps.loc" />
|
|
36 </table>
|
|
37
|
|
38 #id build description path annotations
|
|
39 #GRCh37_dbNSFP2.4 GRCh37 GRCh37 dbNSFP2.4 /depot/snpeff/dbNSFP2.4.gz SIFT_pred,Uniprot_acc
|
|
40 #GRCh38_dbNSFP2.7 GRCh38 GRCh38 dbNSFP2.7 /depot/snpeff/dbNSFP2.7.gz SIFT_pred,Uniprot_acc
|
|
41
|
|
42 """
|
|
43
|
|
44
|
|
# Name of the data table that receives the entries built by this script.
data_table = 'snpsift_dbnsfps'
# Base FTP location (with embedded login) that archive names are appended to.
softgenetics_url = 'ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/'
# Matches the per-chromosome members of a downloaded archive,
# e.g. "dbNSFP2.4_variant.chr1" or "dbscSNV1.1.chr1".
dbNSFP_file_pat = r'(dbNSFP(.*)_variant|dbscSNV(.*)).chr(.*)'
# Splits a string into (digits, non_digits) pairs; exactly one member of
# each pair is non-empty.  Used to build natural sort keys.
tokenize = re.compile(r'(\d+)|(\D+)').findall
# Matches a dbNSFP archive name and captures its flavour and leading version
# digits.  Raw string so that "\d" is not an invalid string escape.
dbNSFP_name_pat = r'dbNSFP(v|_light)?(\d*).*?'
|
|
50
|
|
51
|
|
def stop_err(msg):
    """Write ``msg`` to standard error and terminate with exit status 1.

    The message is written verbatim; no newline is appended.
    """
    error_stream = sys.stderr
    error_stream.write(msg)
    raise SystemExit(1)
|
|
55
|
|
def get_nsfp_genome_version(name):
    """Guess the genome build from a dbNSFP / dbscSNV archive name.

    Args:
        name: Archive or database name, e.g. ``dbNSFPv2.4.zip``.

    Returns:
        ``'hg19'`` for dbscSNV archives and for names that do not match the
        expected pattern.  For dbNSFP archives the build follows the major
        version: 3 and later -> ``'hg38'``, 2 -> ``'hg19'``, anything else
        (including a missing version number) -> ``'hg18'``.
    """
    match = re.match(r'(dbscSNV|dbNSFP(v|_light)?)(\d*)', name)
    if not match:
        return 'hg19'
    base, _flavour, ver = match.groups()
    if base == 'dbscSNV':
        return 'hg19'
    if not ver:
        return 'hg18'
    # Compare numerically so that releases after v3 (v4, v5, ...) are not
    # misreported as hg18, as an exact comparison against '3' would do.
    major = int(ver)
    if major >= 3:
        return 'hg38'
    if major == 2:
        return 'hg19'
    return 'hg18'
|
|
67
|
|
def get_annotations(gzip_path):
    """Return the annotation column names of a gzip-compressed table.

    The first line of the file is treated as a tab-separated header; every
    column after the first four is reported as an annotation.

    Args:
        gzip_path: Path to the gzip-compressed tabular file.

    Returns:
        The annotation names joined with commas, each stripped of
        surrounding whitespace.

    On any read or parse error, including an empty file, ``stop_err`` is
    called, which terminates the process.
    """
    annotations = None
    try:
        fh = gzip.open(gzip_path, 'rb')
        try:
            # Read the complete header line.  A fixed-size read would cut off
            # headers longer than the buffer and silently drop annotations.
            header = fh.readline()
        finally:
            fh.close()
        if not header:
            raise ValueError('file is empty')
        if not isinstance(header, str):
            # Python 3 yields bytes from a binary gzip stream.
            header = header.decode('utf-8')
        columns = header.split('\t')
        annotations = ','.join([column.strip() for column in columns[4:]])
    except Exception as e:
        stop_err('Error Reading annotations %s : %s' % (gzip_path, e))
    return annotations
|
|
83
|
|
84
|
|
def tabix_file(input_fname, output_fname):
    """Compress ``input_fname`` to ``output_fname`` and build its tabix index.

    A one-line progress message is written to standard output first.
    """
    message = "tabix_file: %s -> %s" % (input_fname, output_fname)
    sys.stdout.write(message + "\n")
    ctabix.tabix_compress(input_fname, output_fname, force=True)
    # Column indices are 0-based: column 0 is the sequence name and column 1
    # serves as both start and end position.
    ctabix.tabix_index(output_fname, seq_col=0, start_col=1, end_col=1)
|
|
90
|
|
91
|
|
def natural_sortkey(string):
    """Build a sort key that orders embedded numbers by numeric value.

    The string is split with the module-level ``tokenize``; digit runs
    become ints and every other run stays a string, so "chr2" sorts before
    "chr10".

    Returns:
        A tuple of ints and strings in the order they appear in ``string``.
    """
    parts = []
    for digits, text in tokenize(string):
        parts.append(int(digits) if digits else text)
    return tuple(parts)
|
|
94
|
|
95
|
|
def download_dbnsfp_database(url, output_file):
    """Download a dbNSFP zip archive and merge its tables into one file.

    The archive is saved as ``downloaded_file`` in the current directory.
    Members whose names match ``dbNSFP_file_pat`` are concatenated in
    natural sort order; the first line (header) of every member except the
    first is skipped.  Line endings are written as LF.

    Args:
        url: Location of the archive to download.
        output_file: Path of the merged file; ``'dbnsfp_tsv'`` is used when
            this is empty or None.

    Returns:
        The path of the merged file, or None when the download is not a zip
        archive.
    """
    # urlretrieve lives in urllib.request on Python 3 and urllib on Python 2.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    file_path = 'downloaded_file'
    urlretrieve(url, file_path)
    if not zipfile.is_zipfile(file_path):
        return None

    dbnsfp_tsv = output_file if output_file else 'dbnsfp_tsv'
    # Context managers make sure the archive and the merged file are closed
    # (and flushed) before the caller compresses and indexes the result.
    with zipfile.ZipFile(file_path, 'r') as archive:
        members = [m for m in archive.namelist() if re.match(dbNSFP_file_pat, m)]
        members.sort(key=natural_sortkey)
        with open(dbnsfp_tsv, 'wb') as writer:
            for file_index, member in enumerate(members):
                fh = archive.open(member, 'r')
                try:
                    for line_index, line in enumerate(fh):
                        if file_index > 0 and line_index == 0:
                            # Keep only the header of the first member.
                            continue
                        # Normalise CRLF to LF and make sure the last line of
                        # a member ends with a newline before the next one.
                        writer.write(line.rstrip(b'\r\n') + b'\n')
                finally:
                    fh.close()
    return dbnsfp_tsv
|
|
114
|
|
115
|
|
def main():
    """Build a SnpSift dbNSFP data table entry from the command line.

    The single positional argument is a JSON parameter file.  It is read to
    find the target directory (``output_data[0].extra_files_path``) and is
    overwritten at the end with the resulting data table entry.

    Exactly one source is used, checked in this order:

    * ``--softgenetics``: download the named archive, merge it, then
      compress and index the result.
    * ``--dbnsfp_tabular``: compress and index an existing tabular file.
    * ``--snpsiftdbnsfp``: copy an already compressed file and its ``.tbi``
      index into the target directory.

    Errors are reported through ``stop_err``, which terminates the process.
    """
    # Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option('-g', '--dbkey', dest='dbkey', action='store', type="string", default=None, help='dbkey genome version')
    parser.add_option('-n', '--db_name', dest='db_name', action='store', type="string", default=None, help='A name for a history snpsiftdbnsfp dataset')
    parser.add_option('-s', '--softgenetics', dest='softgenetics', action='store', type="string", default=None, help='A name for softgenetics dbNSFP file')
    parser.add_option('-H', '--snpsiftdbnsfp', dest='snpsiftdbnsfp', action='store', type="string", default=None, help='A history snpsiftdbnsfp dataset')
    parser.add_option('-T', '--dbnsfp_tabular', dest='dbnsfp_tabular', action='store', type="string", default=None, help='A history dbnsfp_tabular dataset')
    (options, args) = parser.parse_args()

    if not args:
        stop_err('Missing required argument: path to the JSON parameter file\n')
    filename = args[0]
    with open(filename) as params_fh:
        params = json.load(params_fh)
    target_directory = params['output_data'][0]['extra_files_path']
    if not os.path.exists(target_directory):
        os.makedirs(target_directory)

    genome_version = options.dbkey if options.dbkey else 'unknown'
    dbnsfp_tsv = None
    db_name = None
    bgzip_name = None
    bgzip_path = None
    if options.softgenetics:
        dbnsfp_url = softgenetics_url + options.softgenetics
        db_name = options.db_name if options.db_name else re.sub(r'\.zip$', '', options.softgenetics)
        # The archive name determines the build; --dbkey is not used here.
        genome_version = get_nsfp_genome_version(options.softgenetics)
        dbnsfp_tsv = download_dbnsfp_database(dbnsfp_url, db_name + '.tsv')
        if not dbnsfp_tsv:
            stop_err('Downloaded file %s is not a zip archive\n' % options.softgenetics)
    elif options.dbnsfp_tabular:
        dbnsfp_tsv = options.dbnsfp_tabular
        db_name = options.db_name
        if not db_name:
            # Without --db_name, fall back to the input file name rather
            # than producing an entry literally named "None".
            db_name = os.path.splitext(os.path.basename(dbnsfp_tsv))[0]
    elif options.snpsiftdbnsfp:
        bgzip_name = os.path.basename(options.snpsiftdbnsfp)
        shutil.copy(options.snpsiftdbnsfp, target_directory)
        shutil.copy(options.snpsiftdbnsfp + '.tbi', target_directory)
        bgzip_path = os.path.join(target_directory, bgzip_name)
        db_name = re.sub(r'(\.txt)?\.gz$', '', bgzip_name)
    else:
        stop_err('One of --softgenetics, --dbnsfp_tabular or --snpsiftdbnsfp is required\n')

    if dbnsfp_tsv:
        bgzip_name = '%s.txt.gz' % db_name
        bgzip_path = os.path.join(target_directory, bgzip_name)
        tabix_file(dbnsfp_tsv, bgzip_path)
    annotations = get_annotations(bgzip_path)

    # Create the SnpSift dbNSFP Reference Data
    data_table_entry = dict(
        key='%s_%s' % (genome_version, db_name),
        build=genome_version,
        name='%s %s' % (genome_version, db_name),
        value=bgzip_name,
        annotations=annotations,
    )
    data_manager_dict = {'data_tables': {data_table: [data_table_entry]}}

    # save info to json file
    with open(filename, 'w') as out_fh:
        out_fh.write(json.dumps(data_manager_dict))
|
|
168
|
|
# Run the data manager only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|