Mercurial > repos > iuc > data_manager_snpsift_dbnsfp
comparison data_manager/data_manager_snpsift_dbnsfp.py @ 0:5f429f5faba5 draft
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/data_managers/data_manager_snpsift_dbnsfp commit 5316af00b4a71a7b526cbc9540d5158749cc38e4
| author | iuc |
|---|---|
| date | Tue, 07 Jun 2016 10:22:35 -0400 |
| parents | |
| children | 1adeb79c05f8 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:5f429f5faba5 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 import gzip | |
| 4 import json | |
| 5 import optparse | |
| 6 import os | |
| 7 import os.path | |
| 8 import re | |
| 9 import shutil | |
| 10 import sys | |
| 11 import urllib | |
| 12 import zipfile | |
| 13 | |
| 14 from pysam import ctabix | |
| 15 | |
| 16 """ | |
| 17 # Install dbNSFP databases | |
| 18 # from DbNsfp site | |
| 19 # Download dbNSFP database | |
| 20 $ wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbNSFPv2.4.zip | |
| 21 # Uncompress | |
| 22 $ unzip dbNSFP2.4.zip | |
| 23 # Create a single file version | |
| 24 $ (head -n 1 dbNSFP2.4_variant.chr1 ; cat dbNSFP2.4_variant.chr* | grep -v "^#") > dbNSFP2.4.txt | |
| 25 # Compress using block-gzip algorithm | |
| 26 bgzip dbNSFP2.4.txt | |
| 27 # Create tabix index | |
| 28 tabix -s 1 -b 2 -e 2 dbNSFP2.4.txt.gz | |
| 29 | |
| 30 data_table: | |
| 31 | |
| 32 <table name="snpsift_dbnsfps" comment_char="#"> | |
| 33 <columns>key, build, name, value, annotations</columns> | |
| 34 <file path="tool-data/snpsift_dbnsfps.loc" /> | |
| 35 </table> | |
| 36 | |
| 37 #id build description path annotations | |
| 38 #GRCh37_dbNSFP2.4 GRCh37 GRCh37 dbNSFP2.4 /depot/snpeff/dbNSFP2.4.gz SIFT_pred,Uniprot_acc | |
| 39 #GRCh38_dbNSFP2.7 GRCh38 GRCh38 dbNSFP2.7 /depot/snpeff/dbNSFP2.7.gz SIFT_pred,Uniprot_acc | |
| 40 """ | |
| 41 | |
| 42 data_table = 'snpsift_dbnsfps' | |
| 43 softgenetics_url = 'ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/' | |
| 44 dbNSFP_file_pat = '(dbNSFP(.*)_variant|dbscSNV(.*)).chr(.*)' | |
| 45 tokenize = re.compile(r'(\d+)|(\D+)').findall | |
| 46 dbNSFP_name_pat = 'dbNSFP(v|_light)?(\d*).*?' | |
| 47 | |
| 48 | |
| 49 def stop_err(msg): | |
| 50 sys.stderr.write(msg) | |
| 51 sys.exit(1) | |
| 52 | |
| 53 | |
| 54 def get_nsfp_genome_version(name): | |
| 55 genome_version = 'hg19' | |
| 56 dbNSFP_name_pat = '(dbscSNV|dbNSFP(v|_light)?)(\d*).*?' | |
| 57 m = re.match(dbNSFP_name_pat, name) | |
| 58 if m: | |
| 59 (base, mid, ver) = m.groups() | |
| 60 if base == 'dbscSNV': | |
| 61 genome_version = 'hg19' | |
| 62 else: | |
| 63 genome_version = 'hg38' if ver == '3' else 'hg19' if ver == '2' else 'hg18' | |
| 64 return genome_version | |
| 65 | |
| 66 | |
| 67 def get_annotations(gzip_path): | |
| 68 annotations = None | |
| 69 fh = None | |
| 70 try: | |
| 71 fh = gzip.open(gzip_path, 'r') | |
| 72 buf = fh.read(10000) | |
| 73 lines = buf.splitlines() | |
| 74 headers = lines[0].split('\t') | |
| 75 annotations = ','.join([x.strip() for x in headers[4:]]) | |
| 76 except Exception as e: | |
| 77 stop_err('Error Reading annotations %s : %s' % (gzip_path, e)) | |
| 78 finally: | |
| 79 if fh: | |
| 80 fh.close() | |
| 81 return annotations | |
| 82 | |
| 83 | |
| 84 def tabix_file(input_fname, output_fname): | |
| 85 print >> sys.stdout, "tabix_file: %s -> %s" % (input_fname, output_fname) | |
| 86 ctabix.tabix_compress(input_fname, output_fname, force=True) | |
| 87 # Column indices are 0-based. | |
| 88 ctabix.tabix_index(output_fname, seq_col=0, start_col=1, end_col=1) | |
| 89 | |
| 90 | |
| 91 def natural_sortkey(string): | |
| 92 return tuple(int(num) if num else alpha for num, alpha in tokenize(string)) | |
| 93 | |
| 94 | |
| 95 def download_dbnsfp_database(url, output_file): | |
| 96 dbnsfp_tsv = None | |
| 97 file_path = 'downloaded_file' | |
| 98 urllib.urlretrieve(url, file_path) | |
| 99 with zipfile.ZipFile(file_path, 'r') as my_zip: | |
| 100 dbnsfp_tsv = output_file if output_file else 'dbnsfp_tsv' | |
| 101 wtr = open(dbnsfp_tsv, 'w') | |
| 102 allfiles = [info.filename for info in my_zip.infolist()] | |
| 103 files = [f for f in allfiles if re.match(dbNSFP_file_pat, f)] | |
| 104 files = sorted(files, key=natural_sortkey) | |
| 105 for j, file in enumerate(files): | |
| 106 fh = my_zip.open(file, 'rU') | |
| 107 for i, line in enumerate(fh): | |
| 108 if j > 0 and i == 0: | |
| 109 continue | |
| 110 wtr.write(line) | |
| 111 return dbnsfp_tsv | |
| 112 | |
| 113 | |
| 114 def main(): | |
| 115 # Parse Command Line | |
| 116 parser = optparse.OptionParser() | |
| 117 parser.add_option('-g', '--dbkey', dest='dbkey', action='store', type="string", default=None, help='dbkey genome version') | |
| 118 parser.add_option('-n', '--db_name', dest='db_name', action='store', type="string", default=None, help='A name for a history snpsiftdbnsfp dataset') | |
| 119 parser.add_option('-s', '--softgenetics', dest='softgenetics', action='store', type="string", default=None, help='A name for softgenetics dbNSFP file') | |
| 120 parser.add_option('-H', '--snpsiftdbnsfp', dest='snpsiftdbnsfp', action='store', type="string", default=None, help='A history snpsiftdbnsfp dataset') | |
| 121 parser.add_option('-T', '--dbnsfp_tabular', dest='dbnsfp_tabular', action='store', type="string", default=None, help='A history dbnsfp_tabular dataset') | |
| 122 (options, args) = parser.parse_args() | |
| 123 | |
| 124 filename = args[0] | |
| 125 params = json.loads(open(filename).read()) | |
| 126 target_directory = params['output_data'][0]['extra_files_path'] | |
| 127 if not os.path.exists(target_directory): | |
| 128 os.mkdir(target_directory) | |
| 129 data_manager_dict = {} | |
| 130 genome_version = options.dbkey if options.dbkey else 'unknown' | |
| 131 dbnsfp_tsv = None | |
| 132 db_name = None | |
| 133 bzip_path = None | |
| 134 if options.softgenetics: | |
| 135 dbnsfp_url = softgenetics_url + options.softgenetics | |
| 136 db_name = options.db_name if options.db_name else re.sub('\.zip$', '', options.softgenetics) | |
| 137 genome_version = get_nsfp_genome_version(options.softgenetics) | |
| 138 tsv = db_name + '.tsv' | |
| 139 dbnsfp_tsv = download_dbnsfp_database(dbnsfp_url, tsv) | |
| 140 elif options.dbnsfp_tabular: | |
| 141 db_name = options.db_name | |
| 142 dbnsfp_tsv = options.dbnsfp_tabular | |
| 143 elif options.snpsiftdbnsfp: | |
| 144 (dirpath, bgzip_name) = os.path.split(options.snpsiftdbnsfp) | |
| 145 idxpath = options.snpsiftdbnsfp + '.tbi' | |
| 146 shutil.copy(options.snpsiftdbnsfp, target_directory) | |
| 147 shutil.copy(idxpath, target_directory) | |
| 148 bzip_path = os.path.join(target_directory, bgzip_name) | |
| 149 db_name = re.sub('(.txt)?.gz$', '', bgzip_name) | |
| 150 else: | |
| 151 stop_err('Either --softgenetics or --dbnsfp_tabular required') | |
| 152 if dbnsfp_tsv: | |
| 153 bgzip_name = '%s.txt.gz' % db_name | |
| 154 bzip_path = os.path.join(target_directory, bgzip_name) | |
| 155 tabix_file(dbnsfp_tsv, bzip_path) | |
| 156 annotations = get_annotations(bzip_path) | |
| 157 # Create the SnpSift dbNSFP Reference Data | |
| 158 data_table_entry = dict(key='%s_%s' % (genome_version, db_name), build=genome_version, name='%s %s' % (genome_version, db_name), value=bgzip_name, annotations=annotations) | |
| 159 data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {}) | |
| 160 data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, []) | |
| 161 data_manager_dict['data_tables'][data_table].append(data_table_entry) | |
| 162 | |
| 163 # save info to json file | |
| 164 open(filename, 'wb').write(json.dumps(data_manager_dict)) | |
| 165 | |
| 166 if __name__ == "__main__": | |
| 167 main() |
