annotate data_manager/fetch_mothur_reference_data.py @ 0:b90e0f2bf4b1 draft default tip

Initial version.
author pjbriggs
date Tue, 18 Nov 2014 09:57:33 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
1 #!/usr/bin/env python
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
2 #
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
3 # Data manager for reference data for the 'mothur_toolsuite' Galaxy tools
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
4 import sys
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
5 import os
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
6 import optparse
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
7 import tempfile
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
8 import shutil
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
9 import urllib2
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
10 import zipfile
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
11 import tarfile
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
12
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
13 from galaxy.util.json import from_json_string, to_json_string
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
14
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
# Archive members whose names begin with any of these
# prefixes are skipped when unpacking
IGNORE_PATHS = (
    '.',
    '__MACOSX/',
    '__',
)
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
18
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
# Lookup from file extension to the name of the data table
# that files of that type belong to
MOTHUR_FILE_TYPES = {
    ".map": "map",
    ".fasta": "aligndb",
    ".pat": "lookup",
    ".tax": "taxonomy",
}
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
24
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
# Reference data URLs
#
# Two-level mapping: data set identifier -> descriptive name ->
# list of URLs to download for that name.
MOTHUR_REFERENCE_DATA = {
    # Look up data
    # http://www.mothur.org/wiki/Lookup_files
    "lookup_titanium": {
        "GS FLX Titanium": [
            "http://www.mothur.org/w/images/9/96/LookUp_Titanium.zip",
        ],
    },
    "lookup_gsflx": {
        "GSFLX": [
            "http://www.mothur.org/w/images/8/84/LookUp_GSFLX.zip",
        ],
    },
    "lookup_gs20": {
        "GS20": [
            "http://www.mothur.org/w/images/7/7b/LookUp_GS20.zip",
        ],
    },
    # RDP reference files
    # http://www.mothur.org/wiki/RDP_reference_files
    "RDP_v10": {
        "16S rRNA RDP training set 10": [
            "http://www.mothur.org/w/images/b/b5/Trainset10_082014.rdp.tgz",
        ],
        "16S rRNA PDS training set 10": [
            "http://www.mothur.org/w/images/2/24/Trainset10_082014.pds.tgz",
        ],
    },
    "RDP_v9": {
        "16S rRNA RDP training set 9": [
            "http://www.mothur.org/w/images/7/72/Trainset9_032012.rdp.zip",
        ],
        "16S rRNA PDS training set 9": [
            "http://www.mothur.org/w/images/5/59/Trainset9_032012.pds.zip",
        ],
    },
    "RDP_v7": {
        "16S rRNA RDP training set 7": [
            "http://www.mothur.org/w/images/2/29/Trainset7_112011.rdp.zip",
        ],
        "16S rRNA PDS training set 7": [
            "http://www.mothur.org/w/images/4/4a/Trainset7_112011.pds.zip",
        ],
        "8S rRNA Fungi training set 7": [
            "http://www.mothur.org/w/images/3/36/FungiLSU_train_v7.zip",
        ],
    },
    "RDP_v6": {
        "RDP training set 6": [
            "http://www.mothur.org/w/images/4/49/RDPTrainingSet.zip",
        ],
    },
    # Silva reference files
    # http://www.mothur.org/wiki/Silva_reference_files
    "silva_release_119": {
        "SILVA release 119": [
            "http://www.mothur.org/w/images/2/27/Silva.nr_v119.tgz",
            "http://www.mothur.org/w/images/5/56/Silva.seed_v119.tgz",
        ],
    },
    "silva_release_102": {
        "SILVA release 102": [
            "http://www.mothur.org/w/images/9/98/Silva.bacteria.zip",
            "http://www.mothur.org/w/images/3/3c/Silva.archaea.zip",
            "http://www.mothur.org/w/images/1/1a/Silva.eukarya.zip",
        ],
    },
    "silva_gold_bacteria": {
        "SILVA gold": [
            "http://www.mothur.org/w/images/f/f1/Silva.gold.bacteria.zip",
        ],
    },
    # Greengenes
    # http://www.mothur.org/wiki/Greengenes-formatted_databases
    "greengenes_August2013": {
        "Greengenes August 2013": [
            "http://www.mothur.org/w/images/1/19/Gg_13_8_99.refalign.tgz",
            "http://www.mothur.org/w/images/6/68/Gg_13_8_99.taxonomy.tgz",
        ],
    },
    "greengenes_May2013": {
        "Greengenes May 2013": [
            "http://www.mothur.org/w/images/c/cd/Gg_13_5_99.refalign.tgz",
            "http://www.mothur.org/w/images/9/9d/Gg_13_5_99.taxonomy.tgz",
        ],
    },
    "greengenes_old": {
        "Greengenes pre-May 2013": [
            "http://www.mothur.org/w/images/7/72/Greengenes.alignment.zip",
            "http://www.mothur.org/w/images/1/16/Greengenes.tax.tgz",
        ],
    },
    "greengenes_gold_alignment": {
        "Greengenes gold alignment": [
            "http://www.mothur.org/w/images/2/21/Greengenes.gold.alignment.zip",
        ],
    },
    # Secondary structure maps
    # http://www.mothur.org/wiki/Secondary_structure_map
    "secondary_structure_maps_silva": {
        "SILVA": [
            "http://www.mothur.org/w/images/6/6d/Silva_ss_map.zip",
        ],
    },
    "secondary_structure_maps_greengenes": {
        "Greengenes": [
            "http://www.mothur.org/w/images/4/4b/Gg_ss_map.zip",
        ],
    },
    # Lane masks: not used here?
    "lane_masks": {
        "Greengenes-compatible": [
            "http://www.mothur.org/w/images/2/2a/Lane1241.gg.filter",
            "http://www.mothur.org/w/images/a/a0/Lane1287.gg.filter",
            "http://www.mothur.org/w/images/3/3d/Lane1349.gg.filter",
        ],
        "SILVA-compatible": [
            "http://www.mothur.org/w/images/6/6d/Lane1349.silva.filter",
        ],
    },
}
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
122
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
123 # Utility functions for interacting with Galaxy JSON
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
124
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
def read_input_json(jsonfile):
    """Read the JSON supplied from the data manager tool

    Returns a tuple (param_dict,extra_files_path)

    'param_dict' is an arbitrary dictionary of parameters
    input into the tool; 'extra_files_path' is the path
    to a directory where output files must be put for the
    receiving data manager to pick them up.

    NB the directory pointed to by 'extra_files_path'
    doesn't exist initially, it is the job of the script
    to create it if necessary.

    Raises KeyError or IndexError if the JSON lacks the
    'param_dict' or 'output_data' items that are read here.

    """
    # Use a context manager so the file handle is closed
    # promptly, even if parsing fails
    with open(jsonfile) as fp:
        params = from_json_string(fp.read())
    return (params['param_dict'],
            params['output_data'][0]['extra_files_path'])
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
143
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
144 # Utility functions for creating data table dictionaries
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
145 #
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
146 # Example usage:
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
147 # >>> d = create_data_tables_dict()
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
148 # >>> add_data_table(d,'my_data')
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
149 # >>> add_data_table_entry(d,'my_data',dict(dbkey='hg19',value='human'))
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
150 # >>> add_data_table_entry(d,'my_data',dict(dbkey='mm9',value='mouse'))
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
151 # >>> print str(to_json_string(d))
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
152
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
def create_data_tables_dict():
    """Build an empty container for data table information

    The returned dictionary has a single 'data_tables' key
    holding an (initially empty) dictionary. Populate it
    with 'add_data_table' and 'add_data_table_entry'; it
    can then be converted to JSON to be sent back to the
    data manager.

    """
    return {'data_tables': {}}
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
165
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
def add_data_table(d,table):
    """Register a data table in the data tables dictionary

    Stores an empty list under the name 'table', ready to
    receive entries. Any list already stored under that
    name is replaced.

    """
    tables = d['data_tables']
    tables[table] = []
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
173
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
def add_data_table_entry(d,table,entry):
    """Append an entry to a named data table

    'entry' is added to the end of the data table 'table'.
    It should be a dictionary keyed by the names of the
    columns in the data table.

    An Exception is raised when the named data table has
    not been created.

    """
    try:
        rows = d['data_tables'][table]
    except KeyError:
        raise Exception("add_data_table_entry: no table '%s'" % table)
    rows.append(entry)
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
189
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
190 # Utility functions for downloading and unpacking archive files
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
191
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
def download_file(url,target=None,wd=None):
    """Download a file from a URL

    Fetches a file from the specified URL.

    If 'target' is specified then the file is saved to this
    name; otherwise it's saved as the basename of the URL.

    If 'wd' is specified then it is used as the 'working
    directory' where the file will be saved on the local
    system.

    Returns the name that the file is saved with.

    """
    print("Downloading %s" % url)
    if not target:
        target = os.path.basename(url)
    if wd:
        target = os.path.join(wd,target)
    print("Saving to %s" % target)
    # Open the connection before creating the output file so a
    # failed request doesn't leave an empty file behind
    response = urllib2.urlopen(url)
    try:
        # Stream in chunks rather than holding the whole download
        # in memory (some reference archives are large)
        with open(target,'wb') as fp:
            shutil.copyfileobj(response,fp)
    finally:
        response.close()
    return target
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
215
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
def unpack_zip_archive(filen,wd=None):
    """Extract files from a ZIP archive

    Given a ZIP archive, extract the files it contains
    and return a list of the resulting file names and
    paths.

    'wd' specifies the working directory to extract
    the files to, otherwise they are extracted to the
    current working directory.

    Members whose names start with one of IGNORE_PATHS,
    or whose path would fall outside the extraction
    directory, are skipped.

    Once all the files are extracted the ZIP archive
    file is deleted from the file system.

    If 'filen' is not a ZIP file then nothing is extracted
    or deleted, and a list holding just 'filen' is returned.

    """
    if not zipfile.is_zipfile(filen):
        print("%s: not ZIP formatted file" % filen)
        return [filen]
    # Extraction root with a trailing separator, used to check
    # that member paths stay inside it
    base = os.path.join(os.path.abspath(wd if wd else os.curdir),'')
    file_list = []
    z = zipfile.ZipFile(filen)
    try:
        for name in z.namelist():
            if any(name.startswith(prefix) for prefix in IGNORE_PATHS):
                print("Ignoring %s" % name)
                continue
            if wd:
                target = os.path.join(wd,name)
            else:
                target = name
            # Refuse names (e.g. containing '..' or absolute paths)
            # that would be written outside the extraction root
            if not os.path.join(os.path.abspath(target),'').startswith(base):
                print("Ignoring %s (outside target directory)" % name)
                continue
            if name.endswith('/'):
                # Make directory
                print("Creating dir %s" % target)
                if not os.path.isdir(target):
                    os.makedirs(target)
            else:
                # Extract file
                print("Extracting %s" % name)
                parent = os.path.dirname(target)
                # 'parent' is empty for a top-level member extracted
                # into the current directory
                if parent and not os.path.isdir(parent):
                    os.makedirs(parent)
                # Stream the member to disk instead of reading it
                # into memory in one go
                member = z.open(name)
                try:
                    with open(target,'wb') as fp:
                        shutil.copyfileobj(member,fp)
                finally:
                    member.close()
                file_list.append(target)
    finally:
        # Release the archive handle before deleting the file
        z.close()
    print("Removing %s" % filen)
    os.remove(filen)
    return file_list
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
263
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
def unpack_tar_archive(filen,wd=None):
    """Extract files from a TAR archive

    Given a TAR archive (which optionally can be
    compressed with either gzip or bz2), extract the
    files it contains and return a list of the
    resulting file names and paths.

    'wd' specifies the working directory to extract
    the files to, otherwise they are extracted to the
    current working directory.

    Members whose names start with one of IGNORE_PATHS,
    or whose path would fall outside the extraction
    directory, are skipped.

    Once all the files are extracted the TAR archive
    file is deleted from the file system.

    If 'filen' is not a TAR file then nothing is extracted
    or deleted, and a list holding just 'filen' is returned.

    """
    file_list = []
    if wd:
        path = wd
    else:
        path = '.'
    if not tarfile.is_tarfile(filen):
        print("%s: not TAR file" % filen)
        return [filen]
    # Extraction root with a trailing separator, used to check
    # that member paths stay inside it
    base = os.path.join(os.path.abspath(path),'')
    t = tarfile.open(filen)
    try:
        for name in t.getnames():
            # Check for unwanted files
            if any(name.startswith(prefix) for prefix in IGNORE_PATHS):
                print("Ignoring %s" % name)
                continue
            # Refuse names (e.g. containing '..' or absolute paths)
            # that would be written outside the extraction root.
            # NB this is a check on the member name only.
            dest = os.path.abspath(os.path.join(path,name))
            if not os.path.join(dest,'').startswith(base):
                print("Ignoring %s (outside target directory)" % name)
                continue
            # Extract file
            print("Extracting %s" % name)
            # Pass 'path' rather than 'wd': 'wd' is None when no
            # working directory was given
            t.extract(name,path)
            if wd:
                target = os.path.join(wd,name)
            else:
                target = name
            file_list.append(target)
    finally:
        # Release the archive handle before deleting the file
        t.close()
    print("Removing %s" % filen)
    os.remove(filen)
    return file_list
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
305
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
def unpack_archive(filen,wd=None):
    """Extract files from an archive

    Wrapper function that calls the appropriate
    unpacking function depending on the archive
    type, and returns a list of files that have
    been extracted.

    The archive type is taken from the file extension:
    '.zip' is handled by 'unpack_zip_archive' and '.tgz'
    by 'unpack_tar_archive'. A file with any other
    extension is left untouched and returned as the only
    item in the list.

    'wd' specifies the working directory to extract
    the files to, otherwise they are extracted to the
    current working directory.

    """
    # Single-argument print(...) calls give the same output on
    # Python 2 and are also valid syntax on Python 3
    print("Unpack %s" % filen)
    ext = os.path.splitext(filen)[1]
    print("Extension: %s" % ext)
    if ext == ".zip":
        return unpack_zip_archive(filen,wd=wd)
    elif ext == ".tgz":
        return unpack_tar_archive(filen,wd=wd)
    else:
        return [filen]
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
328
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
def fetch_files(urls,wd=None,files=None):
    """Fetch and unpack the files behind a list of URLs

    Each URL in 'urls' is downloaded with 'download_file'
    and the result passed to 'unpack_archive'; the names
    of all the files obtained are returned as a list.

    'wd' is the working directory for both downloading
    and extracting; if not given then the current working
    directory is used.

    When 'files' is supplied the new file names are added
    to the end of that list (which is modified in place)
    and the same list is returned.

    """
    collected = [] if files is None else files
    for url in urls:
        collected.extend(unpack_archive(download_file(url,wd=wd),wd=wd))
    return collected
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
350
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
351 # Utility functions specific to the Mothur reference data
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
352
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
def identify_type(filen):
    """Look up the data table type for a file from its extension

    The extension (as split off by os.path.splitext, so including
    the leading '.') is used as a key into MOTHUR_FILE_TYPES.

    Returns the matching MOTHUR_FILE_TYPES value, or None when the
    extension is not a key in that mapping.

    """
    extension = os.path.splitext(filen)[1]
    return MOTHUR_FILE_TYPES.get(extension,None)
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
362
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
def get_name(filen):
    """Generate a descriptive name based on the file name

    Takes the base name of 'filen', drops the final extension and
    replaces every '.' and '_' that remains with a space.

    Arguments:
      filen: file name or path

    Returns:
      The descriptive name as a string, e.g. the path
      '/data/silva.gold_v1.align' gives 'silva gold v1'.

    """
    name = os.path.splitext(os.path.basename(filen))[0]
    for delim in ('.','_'):
        name = name.replace(delim,' ')
    return name
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
371
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
def fetch_from_mothur_website(data_tables,target_dir,datasets):
    """Fetch reference data from the Mothur website

    For each dataset in the list 'datasets', download (and if
    necessary unpack) the related files from the Mothur website,
    move them to the data manager's target directory, and add
    references to the files to the appropriate data table.

    Downloads are staged in a temporary working directory created
    under the current directory; it is removed again before the
    function returns, including when an error is raised part-way
    through.

    The 'data_tables' dictionary should have been created using
    the 'create_data_tables_dict' and 'add_data_table' functions.

    Arguments:
      data_tables: a dictionary containing the data table info
      target_dir: directory to put the downloaded files
      datasets: a list of dataset names corresponding to keys in
        the MOTHUR_REFERENCE_DATA dictionary

    Raises:
      KeyError: if a dataset name is not a key in
        MOTHUR_REFERENCE_DATA

    """
    # Make working dir
    wd = tempfile.mkdtemp(suffix=".mothur",dir=os.getcwd())
    print("Working dir %s" % wd)
    try:
        # Iterate over all requested reference data URLs
        for dataset in datasets:
            print("Handling dataset '%s'" % dataset)
            for name in MOTHUR_REFERENCE_DATA[dataset]:
                for f in fetch_files(MOTHUR_REFERENCE_DATA[dataset][name],wd=wd):
                    type_ = identify_type(f)
                    ref_data_file = os.path.basename(f)
                    entry_name = "%s (%s)" % (os.path.splitext(ref_data_file)[0],name)
                    print("%s\t\'%s'\t.../%s" % (type_,
                                                 entry_name,
                                                 ref_data_file))
                    if type_ is None:
                        # Extension not recognised: leave the file in the
                        # working dir, where it is discarded on cleanup
                        continue
                    # Move to target dir; shutil.move (unlike os.rename)
                    # also works when the working dir and the target dir
                    # are on different filesystems
                    f1 = os.path.join(target_dir,ref_data_file)
                    print("Moving %s to %s" % (f,f1))
                    shutil.move(f,f1)
                    # Add entry to data table
                    table_name = "mothur_%s" % type_
                    add_data_table_entry(data_tables,table_name,dict(name=entry_name,
                                                                     value=ref_data_file))
    finally:
        # Remove working dir, even if a download or unpack step failed
        print("Removing %s" % wd)
        shutil.rmtree(wd)
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
416
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
def files_from_filesystem_paths(paths):
    """Expand a collection of filesystem paths into a flat file list

    Each input path is made absolute. A path that is a file is added
    to the result directly; a path that is a directory is searched
    recursively and every file found beneath it is added. Anything
    else (for example a path that does not exist) is reported and
    ignored.

    Arguments:
      paths: iterable of file and/or directory paths

    Returns:
      List of absolute paths of the files found.

    """
    found = []
    for entry in paths:
        full_path = os.path.abspath(entry)
        print("Examining '%s'..." % full_path)
        if os.path.isfile(full_path):
            # Plain file: keep its absolute path
            found.append(full_path)
            continue
        if not os.path.isdir(full_path):
            print("Not a file or directory, ignored")
            continue
        # Directory: recurse over its contents in listing order
        children = [os.path.join(full_path,child)
                    for child in os.listdir(full_path)]
        found.extend(files_from_filesystem_paths(children))
    return found
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
440
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
def import_from_server(data_tables,target_dir,paths,description,
                       link_to_data=False):
    """Import reference data from paths on the server's filesystem

    The input paths are expanded to a list of files using
    'files_from_filesystem_paths'. Each file whose type can be
    determined by 'identify_type' is copied (or symlinked) into
    'target_dir' under its own base name, and an entry for it is
    added to the data table named 'mothur_<type>'. Files of
    unrecognised type are reported and skipped.

    The 'data_tables' dictionary should have been created using
    the 'create_data_tables_dict' and 'add_data_table' functions.

    Arguments:
      data_tables: a dictionary containing the data table info
      target_dir: directory to copy the data files into, or to
        create the links in
      paths: list of file and/or directory paths to import
      description: text to associate with the files; if non-empty
        it is appended in brackets to each entry name
      link_to_data: boolean, if False then copy the data file
        into Galaxy (default); if True then make a symlink to
        the data file

    """
    # Pick the transfer operation once: link to or copy the data
    transfer = os.symlink if link_to_data else shutil.copyfile
    for source in files_from_filesystem_paths(paths):
        table_type = identify_type(source)
        if table_type is None:
            print("%s: unrecognised type, skipped" % source)
            continue
        base_name = os.path.basename(source)
        destination = os.path.join(target_dir,base_name)
        # Entry name is the file name minus its extension, plus the
        # optional description
        label = "%s" % os.path.splitext(base_name)[0]
        if description:
            label = label + " (%s)" % description
        print("%s\t\'%s'\t.../%s" % (table_type,
                                     label,
                                     base_name))
        transfer(source,destination)
        # Add entry to data table
        add_data_table_entry(data_tables,
                             "mothur_%s" % table_type,
                             dict(name=label,value=base_name))
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
487
b90e0f2bf4b1 Initial version.
pjbriggs
parents:
diff changeset
if __name__ == "__main__":
    # Script entry point: parse the command line, read the JSON
    # parameter file, populate the data tables from the requested
    # source and write the result back to the same JSON file
    print("Starting...")

    # Read command line
    parser = optparse.OptionParser()
    parser.add_option('--source',action='store',dest='data_source')
    parser.add_option('--datasets',action='store',dest='datasets',default='')
    # Default is an empty string (not a list) because the value is
    # processed with string methods below
    parser.add_option('--paths',action='store',dest='paths',default='')
    parser.add_option('--description',action='store',dest='description',default='')
    parser.add_option('--link',action='store_true',dest='link_to_data')
    options,args = parser.parse_args()
    print("options: %s" % options)
    print("args   : %s" % args)

    # Check for JSON file (exactly one positional argument)
    if len(args) != 1:
        parser.error("Need to supply JSON file name")
    jsonfile = args[0]

    # Read the input JSON
    params,target_dir = read_input_json(jsonfile)

    # Make the target directory
    print("Making %s" % target_dir)
    os.mkdir(target_dir)

    # Set up data tables dictionary
    data_tables = create_data_tables_dict()
    add_data_table(data_tables,'mothur_lookup')
    add_data_table(data_tables,'mothur_aligndb')
    add_data_table(data_tables,'mothur_map')
    add_data_table(data_tables,'mothur_taxonomy')

    # Fetch data from specified data sources
    if options.data_source == 'mothur_website':
        # Dataset names are supplied as a comma-separated list
        datasets = options.datasets.split(',')
        fetch_from_mothur_website(data_tables,target_dir,datasets)
    elif options.data_source == 'filesystem_paths':
        # Check description text
        description = options.description.strip()
        # Get list of paths (need to remove any escapes for '\n' and '\r'
        # that might have been inserted by Galaxy)
        paths = options.paths.replace('__cn__','\n').replace('__cr__','\r').split()
        import_from_server(data_tables,target_dir,paths,description,
                           link_to_data=options.link_to_data)
    # Write output JSON (overwrites the input JSON file)
    print("Outputting JSON")
    json_text = to_json_string(data_tables)
    print(str(json_text))
    # Context manager ensures the file is flushed and closed
    with open(jsonfile,'wb') as fp:
        fp.write(json_text)
    print("Done.")