6
|
1 #!/usr/bin/python
|
|
2 # -*- coding: utf8 -*-
|
|
3
|
|
4 """
|
|
5 This Galaxy tool permits to prepare your files to be ready for
|
|
6 Assembly Hub visualization.
|
|
7 Program test arguments:
|
|
8 hubArchiveCreator.py -g test-data/augustusDbia3.gff3 -f test-data/dbia3.fa -d . -u ./tools -o output.html
|
|
9 """
|
|
10
|
|
11 import argparse
|
|
12 import collections
|
|
13 import json
|
|
14 import logging
|
|
15 import os
|
|
16 import sys
|
|
17
|
|
18 # Internal dependencies
|
|
19 from Bam import Bam
|
|
20 from BedSimpleRepeats import BedSimpleRepeats
|
|
21 from BedSpliceJunctions import BedSpliceJunctions
|
|
22 from Bed import Bed
|
|
23 from BigWig import BigWig
|
|
24 from util.Fasta import Fasta
|
|
25 from util.Filters import TraceBackFormatter
|
|
26 from Gff3 import Gff3
|
|
27 from Gtf import Gtf
|
|
28 from Psl import Psl
|
|
29 from TrackHub import TrackHub
|
|
30
|
|
# TODO: Verify each subprocessed dependency is accessible [gff3ToGenePred, genePredToBed, twoBitInfo, faToTwoBit, bedToBigBed, sort]
|
|
32
|
|
33
|
|
def main(argv):
    """
    Entry point of the Hub Archive Creator.

    Parses the command line, configures logging, wraps every input file into
    its Datatype object and delegates the Track Hub construction to TrackHub.

    :param argv: full process argument vector (argv[0] is the program name)
    :type argv: list[string]
    """
    # Command Line parsing init
    parser = argparse.ArgumentParser(description='Create a foo.txt inside the given folder.')

    # Reference genome mandatory
    parser.add_argument('-f', '--fasta', help='Fasta file of the reference genome')

    # GFF3 Management
    parser.add_argument('--gff3', action='append', help='GFF3 format')

    # GTF Management
    parser.add_argument('--gtf', action='append', help='GTF format')

    # Bed4+12 (TrfBig)
    parser.add_argument('--bedSimpleRepeats', action='append', help='Bed4+12 format, using simpleRepeats.as')

    # Bed12+1 (regtools)
    parser.add_argument('--bedSpliceJunctions', action='append', help='Bed12+1 format, using spliceJunctions.as')

    # Generic Bed (Blastx transformed to bed)
    parser.add_argument('--bed', action='append', help='Bed generic format')

    # BigWig Management
    parser.add_argument('--bigwig', action='append', help='BigWig format')

    # Bam Management
    parser.add_argument('--bam', action='append', help='Bam format')

    # Psl Management
    parser.add_argument('--psl', action='append', help='Psl format')

    # TODO: Check if the running directory can have issues if we run the tool outside
    parser.add_argument('-d', '--directory',
                        help='Running tool directory, where to find the templates. Default is running directory')
    parser.add_argument('-u', '--ucsc_tools_path',
                        help='Directory where to find the executables needed to run this tool')
    parser.add_argument('-e', '--extra_files_path',
                        help='Name, in galaxy, of the output folder. Where you would want to build the Track Hub Archive')
    parser.add_argument('-o', '--output', help='Name of the HTML summarizing the content of the Track Hub Archive')

    parser.add_argument('-j', '--data_json', help='Json containing the metadata of the inputs')

    parser.add_argument('--user_email', help='Email of the user who launched the Hub Archive Creation')

    parser.add_argument('--genome_name', help='UCSC Genome Browser assembly ID')

    parser.add_argument('--debug_mode', action='store_true', help='Allow more details about the errors')

    # Get the args passed in parameter. BUG FIX: parse the argv received by
    # main() (minus the program name) instead of silently re-reading
    # sys.argv, so main() also behaves correctly when called programmatically.
    args = parser.parse_args(argv[1:])

    extra_files_path = args.extra_files_path
    # BUG FIX: the previous code initialized toolDirectory to '.' and then
    # unconditionally overwrote it with args.directory (None when -d is not
    # given); keep the documented "running directory" default instead.
    toolDirectory = args.directory if args.directory else '.'

    #### Logging management ####
    # If we are in Debug mode, also print in stdout the debug dump
    configure_logger(extra_files_path=extra_files_path, debug=args.debug_mode)
    #### END Logging management ####

    array_inputs_reference_genome = json.loads(args.fasta)

    # TODO: Replace these with the object Fasta
    input_fasta_file = array_inputs_reference_genome["false_path"]
    input_fasta_file_name = sanitize_name_input(array_inputs_reference_genome["name"])
    genome_name = sanitize_name_input(args.genome_name)

    reference_genome = Fasta(input_fasta_file,
                             input_fasta_file_name, genome_name)

    user_email = args.user_email

    # TODO: Use a class to have a better management of the structure of these inputs
    # These inputs are populated in the Galaxy Wrapper xml and are in this format:
    # ARRAY[DICT{FILE_PATH: DICT{NAME: NAME_VALUE, EXTRA_DATA: EXTRA_DATA_VALUE}}]
    # EXTRA_DATA could be anything, for example the index of a BAM => {"index", FILE_PATH}
    array_inputs_bam = args.bam
    array_inputs_bed_generic = args.bed
    array_inputs_bed_simple_repeats = args.bedSimpleRepeats
    array_inputs_bed_splice_junctions = args.bedSpliceJunctions
    array_inputs_bigwig = args.bigwig
    array_inputs_gff3 = args.gff3
    array_inputs_gtf = args.gtf
    array_inputs_psl = args.psl

    outputFile = args.output

    json_inputs_data = args.data_json

    # TODO: Instead use a class to properly store the objects, with object_hook
    inputs_data = json.loads(json_inputs_data)
    # We remove the spaces in ["name"] of inputs_data
    sanitize_name_inputs(inputs_data)

    # TODO: Check here all the binaries / tools we need. Exception if missing

    # Create the Track Hub folder
    trackHub = TrackHub(reference_genome, user_email, outputFile, extra_files_path, toolDirectory)

    all_datatype_dictionary = {}

    # One (inputs, Datatype class) pair per supported format; this list
    # mirrors the order of the tool form.
    for (inputs, datatype_class) in [
            (array_inputs_bam, Bam),
            (array_inputs_bed_generic, Bed),
            (array_inputs_bigwig, BigWig),
            (array_inputs_bed_simple_repeats, BedSimpleRepeats),
            (array_inputs_bed_splice_junctions, BedSpliceJunctions),
            (array_inputs_gff3, Gff3),
            (array_inputs_gtf, Gtf),
            (array_inputs_psl, Psl)]:
        if inputs:
            all_datatype_dictionary.update(create_ordered_datatype_objects(datatype_class, inputs, inputs_data))

    # Create Ordered Dictionary to add the tracks in the tool form order.
    # BUG FIX: building an OrderedDict straight from a plain dict only
    # freezes that dict's arbitrary order; sort by the "order_index" keys to
    # really get the tool form order (assumes the indexes are mutually
    # comparable, e.g. all ints - TODO confirm against the Galaxy wrapper).
    all_datatype_ordered_dictionary = collections.OrderedDict(sorted(all_datatype_dictionary.items()))

    logging.debug("----- End of all_datatype_dictionary processing -----")
    # BUG FIX: the message talks about keys, so dump keys() and not values()
    logging.debug("all_datatype_ordered_dictionary keys are: {0}".format(all_datatype_ordered_dictionary.keys()))

    logging.debug("----- Beginning of Track adding processing -----")
    # values() instead of the Python 2-only iteritems(): the index was unused
    for datatypeObject in all_datatype_ordered_dictionary.values():
        trackHub.addTrack(datatypeObject.track.trackDb)
    logging.debug("----- End of Track adding processing -----")

    # We process all the modifications to create the zip file
    #trackHub.createZip()

    # We terminate the process and so create a HTML file summarizing all the files
    trackHub.terminate()

    logging.debug('#### End of HubArchiveCreator Debug Mode: Bye! ####')

    sys.exit(0)
def sanitize_name_input(string_to_sanitize):
    """
    Return a copy of the given string where every '/' and ' ' has been
    replaced by '_'.

    :param string_to_sanitize: raw name coming from Galaxy or the user
    :return: the sanitized string

    :Example:

    >>> sanitize_name_input('this/is an//example')
    'this_is_an__example'
    """
    # Single pass over the string, mapping each offending character to '_'.
    return ''.join('_' if character in '/ ' else character
                   for character in string_to_sanitize)
def sanitize_name_inputs(inputs_data):
    """
    Sanitize, in place, the value of the "name" key of every entry of the
    metadata dictionary passed in parameter.

    Because sometimes output from Galaxy, or even just file name, from user
    inputs, have spaces. Also, it can contain '/' character and could break
    the use of os.path function.

    :param inputs_data: dict[string, dict[string, string]]
    """
    # Only the per-entry metadata is touched, so iterate over the values.
    for metadata in inputs_data.values():
        metadata["name"] = sanitize_name_input(metadata["name"])
def create_ordered_datatype_objects(ExtensionClass, array_inputs, inputs_data):
    """
    Function which executes the creation all the necessary files / folders for
    a special Datatype, for TrackHub, and update the dictionary of datatype.

    Each created object is keyed by its metadata "order_index" so the caller
    can later restore the tool form ordering.

    :param ExtensionClass: Datatype subclass used to wrap each input
    :param array_inputs: "false paths" of the inputs to process
    :param inputs_data: metadata dictionary keyed by false path
    :type ExtensionClass: Datatype
    :type array_inputs: list[string]
    :type inputs_data: dict
    :rtype: dict
    """
    datatype_dictionary = {}

    # FIXED the "optimize this double loop" TODO: inputs_data is keyed by
    # the false path, so a direct O(1) lookup replaces the former inner scan
    # over inputs_data.items(); dict keys are unique, so behavior is the same.
    for input_false_path in array_inputs:
        if input_false_path in inputs_data:
            data_value = inputs_data[input_false_path]
            logging.debug("input_false_path: " + input_false_path)
            logging.debug("data_value: " + str(data_value))
            extensionObject = ExtensionClass(input_false_path, data_value)
            datatype_dictionary.update({data_value["order_index"]: extensionObject})
    return datatype_dictionary
def configure_logger(extra_files_path=None, debug=False):
    """
    Set up the root logger: a full DEBUG dump into a .log file inside
    extra_files_path, a stdout handler (user- or dev-flavoured depending on
    debug) and an ERROR-level stderr handler.

    :param extra_files_path: folder receiving the .log file; mandatory
    :param debug: when True, stdout also receives the debug dump
    :raises Exception: when extra_files_path is missing or empty
    """
    if not extra_files_path:
        raise Exception("Extra files path is not set. Stopping the application")

    # Whatever the mode, everything is logged at DEBUG level into a .log file.
    log_file_path = os.path.join(extra_files_path, __name__ + '.log')
    logging.basicConfig(filename=log_file_path, level=logging.DEBUG)

    stdout_handler = logging.StreamHandler(sys.stdout)
    if debug:
        configure_logger_dev(stdout_handler)
    else:
        configure_logger_user(stdout_handler)

    # stderr configuration
    configure_logger_stderr()

    logging.debug('#### Welcome in HubArchiveCreator Debug Mode ####\n')
def configure_logger_user(log_stdout=None):
    """
    Attach the user-facing stdout handler to the root logger.

    User Logger is defined as following:
        - User needs to have WARN, ERROR and CRITICAL but well formatted /
          without traceback in STDOUT
        - Still access to full, brute and traceback for errors in STDERR
        - And further access to debug if needed in .log

    :param log_stdout: handler bound to sys.stdout; mandatory
    :raises Exception: when no handler is given
    """
    if not log_stdout:
        raise Exception("No log_stdout given. Stopping the application")

    # INFO and above reach the user; TraceBackFormatter strips the tracebacks.
    log_stdout.setLevel(logging.INFO)
    log_stdout.setFormatter(TraceBackFormatter('%(message)s'))

    logging.getLogger().addHandler(log_stdout)
def configure_logger_dev(log_stdout=None):
    """
    Attach the developer-facing stdout handler to the root logger.

    Dev Logger is defined as following:
        - Dev needs to have WARN, ERROR and CRITICAL but well formatted /
          without traceback, in stdout
        - Still access to full, brute and traceback in stderr for errors
        - And further access to debug if needed

    :param log_stdout: handler bound to sys.stdout; mandatory
    :raises Exception: when no handler is given
    """
    if not log_stdout:
        raise Exception("No log_stdout given. Stopping the application")

    # In dev mode stdout mirrors the .log file: everything from DEBUG up.
    log_stdout.setLevel(logging.DEBUG)
    log_stdout.setFormatter(logging.Formatter('%(message)s'))

    logging.getLogger().addHandler(log_stdout)
def configure_logger_stderr():
    """
    Configure what should be logged in stderr.

    Adds an ERROR-level handler on sys.stderr to the root logger, so raw
    error messages always reach stderr.
    """
    stderr_handler = logging.StreamHandler(sys.stderr)
    stderr_handler.setLevel(logging.ERROR)
    stderr_handler.setFormatter(logging.Formatter('%(message)s'))
    logging.getLogger().addHandler(stderr_handler)
if __name__ == "__main__":
    # NOTE(review): the result of getLogger is discarded; the call only
    # creates/caches the module-level logger before main() configures logging.
    logging.getLogger(__name__)
    # Pass the full argv vector; main() handles argument parsing and exits.
    main(sys.argv)