comparison data_manager/data_manager_snpEff_download.py @ 5:78bcf4ac437c

Use tool_data_table with key and version columns added to allow for multiple versions in a .loc file
author Jim Johnson <jj@umn.edu>
date Tue, 13 Jan 2015 12:54:20 -0600
parents c6769a700e55
children
comparison
equal deleted inserted replaced
4:6a378d0f4856 5:78bcf4ac437c
7 import subprocess 7 import subprocess
8 import fileinput 8 import fileinput
9 import shutil 9 import shutil
10 import optparse 10 import optparse
11 import urllib2 11 import urllib2
12 import gzip
12 from ftplib import FTP 13 from ftplib import FTP
13 import tarfile 14 import tarfile
14 15
15 from galaxy.util.json import from_json_string, to_json_string 16 from galaxy.util.json import from_json_string, to_json_string
16 17
62 for genome in snpDBdict: 63 for genome in snpDBdict:
63 descriptions.append(snpDBdict[genome] if genome in snpDBdict else genome) 64 descriptions.append(snpDBdict[genome] if genome in snpDBdict else genome)
64 return ','.join(descriptions) 65 return ','.join(descriptions)
65 return organisms 66 return organisms
66 67
def getSnpeffVersion(jar_path):
    """Return the SnpEff version reported by the jar at *jar_path*.

    Runs ``java -jar <jar> -h`` with the jar's directory as the working
    directory and stderr redirected to the scratch file ``snpeff.err``
    (created in the caller's current working directory).  The captured
    output is scanned for a line of the form
    ``SnpEff version SnpEff <major>.<minor> ...``.

    Returns the string ``'SnpEff' + '<major>.<minor>'`` (no separating
    space, e.g. ``'SnpEff4.1'``), or the placeholder ``'SnpEff ?.?'`` when
    no such line is found.

    Any exit status other than 255 terminates the process via
    ``sys.exit`` with that status; this code treats 255 as the expected
    status of the ``-h`` invocation.
    """
    snpeff_version = 'SnpEff ?.?'
    (snpEff_dir, snpEff_jar) = os.path.split(jar_path)
    stderr_path = 'snpeff.err'
    args = ['java', '-jar', snpEff_jar, '-h']
    # The context manager closes the scratch file on every path, including
    # the sys.exit() below and a failure to launch java.
    with open(stderr_path, 'w') as stderr_fh:
        # os.path.split() yields '' for a bare filename; Popen cannot chdir
        # to '', so fall back to the current directory (cwd=None).
        proc = subprocess.Popen(args=args, shell=False,
                                cwd=snpEff_dir or None,
                                stderr=stderr_fh.fileno())
        return_code = proc.wait()
    if return_code != 255:
        sys.exit(return_code)
    version_pattern = re.compile(
        r'^[Ss]npEff version (SnpEff)\s*(\d+\.\d+).*$')
    with open(stderr_path, 'r') as fh:
        for line in fh:
            m = version_pattern.match(line)
            if m:
                snpeff_version = m.group(1) + m.group(2)
                break
    return snpeff_version
89
90 # Starting with SnpEff 4.1 the .bin files contain the SnpEff version:
91 # Example - the first 3 lines of GRCh37.75/snpEffectPredictor.bin (uncompressed):
92 """
93 SnpEff 4.1
94 CHROMOSOME 2 1 0 179197 GL000219.1 false
95 CHROMOSOME 3 1 0 81347269 HSCHR17_1 false
96 """
def getSnpeffVersionFromFile(path):
    """Return the SnpEff version recorded in a gzip-compressed .bin file.

    Reads the first 100 bytes of the decompressed content of *path* and
    matches its first line against ``SnpEff <major>.<minor>``.

    Returns ``'SnpEff' + '<major>.<minor>'`` (no separating space, e.g.
    ``'SnpEff4.1'``) when the first line matches, otherwise ``None`` --
    including when the file decompresses to nothing.

    Any exception raised while opening, reading or parsing is reported
    through ``stop_err`` together with the path.
    """
    snpeff_version = None
    try:
        fh = gzip.open(path, 'rb')
        try:
            buf = fh.read(100)
        finally:
            # Close the handle even when the read fails.
            fh.close()
        # Binary reads return bytes on Python 3; the str pattern below
        # needs text.  On Python 2 the data is already str and is left
        # untouched.
        if not isinstance(buf, str):
            buf = buf.decode('ascii', 'replace')
        lines = buf.splitlines()
        # An empty file has no first line; treat it as "no version found".
        if lines:
            m = re.match(r'^(SnpEff)\s+(\d+\.\d+).*$', lines[0].strip())
            if m:
                snpeff_version = m.group(1) + m.group(2)
    except Exception as e:
        stop_err('Error parsing SnpEff version from: %s %s\n' % (path, str(e)))
    return snpeff_version
110
67 """ 111 """
68 # Download human database 'hg19' 112 # Download human database 'hg19'
69 java -jar snpEff.jar download -v hg19 113 java -jar snpEff.jar download -v hg19
70 114
71 <command>java -jar \$SNPEFF_JAR_PATH/snpEff.jar download -c \$JAVA_JAR_PATH/snpEff.config $genomeVersion > $logfile </command> 115 <command>java -jar \$SNPEFF_JAR_PATH/snpEff.jar download -c \$JAVA_JAR_PATH/snpEff.config $genomeVersion > $logfile </command>
72 116
73 snpEffectPredictor.bin 117 snpEffectPredictor.bin
74 regulation_HeLa-S3.bin 118 regulation_HeLa-S3.bin
75 regulation_pattern = 'regulation_(.+).bin' 119 regulation_pattern = 'regulation_(.+).bin'
76 """ 120 """
77 def download_database(data_manager_dict, target_directory, jar_path,config,genome_version,organism): 121 def download_database(data_manager_dict, target_directory, jar_path, config, genome_version, organism):
78 ## get data_dir from config 122 ## get data_dir from config
79 ##--- 123 ##---
80 ## Databases are stored here 124 ## Databases are stored here
81 ## E.g.: Information for 'hg19' is stored in data_dir/hg19/ 125 ## E.g.: Information for 'hg19' is stored in data_dir/hg19/
82 ## 126 ##
101 ## search data_dir/genome_version for files 145 ## search data_dir/genome_version for files
102 regulation_pattern = 'regulation_(.+).bin' 146 regulation_pattern = 'regulation_(.+).bin'
103 # annotation files that are included in snpEff by a flag 147 # annotation files that are included in snpEff by a flag
104 annotations_dict = {'nextProt.bin' : '-nextprot','motif.bin': '-motif'} 148 annotations_dict = {'nextProt.bin' : '-nextprot','motif.bin': '-motif'}
105 genome_path = os.path.join(data_dir,genome_version) 149 genome_path = os.path.join(data_dir,genome_version)
150 snpeff_version = getSnpeffVersion(jar_path)
151 key = snpeff_version + '_' + genome_version
106 if os.path.isdir(genome_path): 152 if os.path.isdir(genome_path):
107 for root, dirs, files in os.walk(genome_path): 153 for root, dirs, files in os.walk(genome_path):
108 for fname in files: 154 for fname in files:
109 if fname.startswith('snpEffectPredictor'): 155 if fname.startswith('snpEffectPredictor'):
110 # if snpEffectPredictor.bin download succeeded 156 # if snpEffectPredictor.bin download succeeded
111 name = genome_version + (' : ' + organism if organism else '') 157 name = genome_version + (' : ' + organism if organism else '')
112 data_table_entry = dict(value=genome_version, name=name, path=data_dir) 158 # version = getSnpeffVersionFromFile(os.path.join(root,fname))
113 _add_data_table_entry( data_manager_dict, 'snpeff4_genomedb', data_table_entry ) 159 data_table_entry = dict(key=key,version=snpeff_version,value=genome_version, name=name, path=data_dir)
160 _add_data_table_entry( data_manager_dict, 'snpeffv_genomedb', data_table_entry )
114 else: 161 else:
115 m = re.match(regulation_pattern,fname) 162 m = re.match(regulation_pattern,fname)
116 if m: 163 if m:
117 name = m.groups()[0] 164 name = m.groups()[0]
118 data_table_entry = dict(genome=genome_version,value=name, name=name) 165 data_table_entry = dict(key=key,version=snpeff_version,genome=genome_version,value=name, name=name)
119 _add_data_table_entry( data_manager_dict, 'snpeff4_regulationdb', data_table_entry ) 166 _add_data_table_entry( data_manager_dict, 'snpeffv_regulationdb', data_table_entry )
120 elif fname in annotations_dict: 167 elif fname in annotations_dict:
121 value = annotations_dict[fname] 168 value = annotations_dict[fname]
122 name = value.lstrip('-') 169 name = value.lstrip('-')
123 data_table_entry = dict(genome=genome_version,value=value, name=name) 170 data_table_entry = dict(key=key,version=snpeff_version,genome=genome_version,value=value, name=name)
124 _add_data_table_entry( data_manager_dict, 'snpeff4_annotations', data_table_entry ) 171 _add_data_table_entry( data_manager_dict, 'snpeffv_annotations', data_table_entry )
125 return data_manager_dict 172 return data_manager_dict
126 173
127 def _add_data_table_entry( data_manager_dict, data_table, data_table_entry ): 174 def _add_data_table_entry( data_manager_dict, data_table, data_table_entry ):
128 data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} ) 175 data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} )
129 data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get( data_table, [] ) 176 data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get( data_table, [] )