comparison hubArchiveCreator.py @ 11:3f7c40fb51e8 draft

planemo upload for repository https://github.com/goeckslab/hub-archive-creator commit c778f8ee85e9acc924c5c0a30042ac90c8e7a70d-dirty
author yating-l
date Wed, 28 Dec 2016 17:50:06 -0500
parents
children 364b8db8de17
#!/usr/bin/python
# -*- coding: utf8 -*-

4 """
5 This Galaxy tool permits to prepare your files to be ready for
6 Assembly Hub visualization.
7 Program test arguments:
8 hubArchiveCreator.py -g test-data/augustusDbia3.gff3 -f test-data/dbia3.fa -d . -u ./tools -o output.html
9 """

import argparse
import collections
import json
import logging
import os
import sys

# Internal dependencies
from Bam import Bam
from BedSimpleRepeats import BedSimpleRepeats
from BedSpliceJunctions import BedSpliceJunctions
from Bed import Bed
from BigWig import BigWig
from util.Fasta import Fasta
from util.Filters import TraceBackFormatter
from Gff3 import Gff3
from Gtf import Gtf
from Psl import Psl
from TrackHub import TrackHub
from bigPsl import bigPsl

# TODO: Verify each subprocessed dependency is accessible [gff3ToGenePred, genePredToBed, twoBitInfo, faToTwoBit, bedToBigBed, sort]

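# A minimal sketch of the dependency check the TODO above asks for. The helper
# name check_ucsc_tools and its use of distutils.spawn.find_executable are
# illustrative assumptions; the function is not called anywhere in this tool yet.
def check_ucsc_tools(ucsc_tools_path=None,
                     binaries=('gff3ToGenePred', 'genePredToBed', 'twoBitInfo',
                               'faToTwoBit', 'bedToBigBed', 'sort')):
    """Raise an exception listing every required executable that cannot be found."""
    from distutils.spawn import find_executable

    # A binary counts as missing only if it is neither in the given tools
    # directory nor on the PATH.
    missing = [binary for binary in binaries
               if not find_executable(binary, path=ucsc_tools_path)
               and not find_executable(binary)]
    if missing:
        raise Exception("Missing binaries needed by HubArchiveCreator: {0}".format(", ".join(missing)))

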
def main(argv):
    # Command line parsing init
    parser = argparse.ArgumentParser(description='Create a Track Hub Archive from the given genome and annotation files.')

    # Reference genome mandatory
    parser.add_argument('-f', '--fasta', help='Fasta file of the reference genome')

    # GFF3 Management
    parser.add_argument('--gff3', action='append', help='GFF3 format')

    # GTF Management
    parser.add_argument('--gtf', action='append', help='GTF format')

    # Bed4+12 (TrfBig)
    parser.add_argument('--bedSimpleRepeats', action='append', help='Bed4+12 format, using simpleRepeats.as')

    # Bed12+1 (regtools)
    parser.add_argument('--bedSpliceJunctions', action='append', help='Bed12+1 format, using spliceJunctions.as')

    # Generic Bed (Blastx transformed to bed)
    parser.add_argument('--bed', action='append', help='Bed generic format')

    # Bed12+12 (tblastn)
    parser.add_argument('--bigpsl', action='append', help='bigPsl format')

    # BigWig Management
    parser.add_argument('--bigwig', action='append', help='BigWig format')

    # Bam Management
    parser.add_argument('--bam', action='append', help='Bam format')

    # Psl Management
    parser.add_argument('--psl', action='append', help='Psl format')

    # TODO: Check whether the running directory causes issues when the tool is run from elsewhere
    parser.add_argument('-d', '--directory',
                        help='Running tool directory, where to find the templates. Default is the running directory')
    parser.add_argument('-u', '--ucsc_tools_path',
                        help='Directory where to find the executables needed to run this tool')
    parser.add_argument('-e', '--extra_files_path',
                        help='Name, in Galaxy, of the output folder where the Track Hub Archive will be built')
    parser.add_argument('-o', '--output', help='Name of the HTML file summarizing the content of the Track Hub Archive')

    parser.add_argument('-j', '--data_json', help='Json containing the metadata of the inputs')

    parser.add_argument('--user_email', help='Email of the user who launched the Hub Archive Creation')

    parser.add_argument('--genome_name', help='UCSC Genome Browser assembly ID')

    parser.add_argument('--debug_mode', action='store_true', help='Allow more details about the errors')

    # Begin init variables

    toolDirectory = '.'
    extra_files_path = '.'

    # Get the args passed as parameters
    args = parser.parse_args()

    extra_files_path = args.extra_files_path
    toolDirectory = args.directory

    #### Logging management ####
    # If we are in debug mode, also print the debug dump to stdout

    configure_logger(extra_files_path=extra_files_path, debug=args.debug_mode)

    #### END Logging management ####

    array_inputs_reference_genome = json.loads(args.fasta)

    # TODO: Replace these with the object Fasta
    input_fasta_file = array_inputs_reference_genome["false_path"]
    input_fasta_file_name = sanitize_name_input(array_inputs_reference_genome["name"])
    genome_name = sanitize_name_input(args.genome_name)

    reference_genome = Fasta(input_fasta_file,
                             input_fasta_file_name, genome_name)

    user_email = args.user_email

    # TODO: Use a class to better manage the structure of these inputs.
    # These inputs are populated in the Galaxy wrapper XML and are in this format:
    # ARRAY[DICT{FILE_PATH: DICT{NAME: NAME_VALUE, EXTRA_DATA: EXTRA_DATA_VALUE}}]
    # EXTRA_DATA could be anything, for example the index of a BAM => {"index": FILE_PATH}
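    # A hypothetical illustration of that shape (paths and names are invented,
    # not taken from a real Galaxy run):
    #   args.bam    -> ["/galaxy/files/dataset_12.dat"]
    #   inputs_data -> {"/galaxy/files/dataset_12.dat": {"name": "sample_reads.bam",
    #                                                    "order_index": 1,
    #                                                    "index": "/galaxy/files/dataset_12.bai"}}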
    array_inputs_bam = args.bam
    array_inputs_bed_generic = args.bed
    array_inputs_bed_simple_repeats = args.bedSimpleRepeats
    array_inputs_bed_splice_junctions = args.bedSpliceJunctions
    array_inputs_bigwig = args.bigwig
    array_inputs_gff3 = args.gff3
    array_inputs_gtf = args.gtf
    array_inputs_psl = args.psl
    array_inputs_bigpsl = args.bigpsl

    outputFile = args.output

    json_inputs_data = args.data_json

    # TODO: Instead, use a class to properly store the objects, with object_hook
    inputs_data = json.loads(json_inputs_data)
    # We remove the spaces in ["name"] of inputs_data
    sanitize_name_inputs(inputs_data)

    # TODO: Check here all the binaries / tools we need. Raise an exception if any is missing

    # Create the Track Hub folder
    trackHub = TrackHub(reference_genome, user_email, outputFile, extra_files_path, toolDirectory)

    all_datatype_dictionary = {}

    for (inputs, datatype_class) in [
            (array_inputs_bam, Bam),
            (array_inputs_bed_generic, Bed),
            (array_inputs_bigwig, BigWig),
            (array_inputs_bed_simple_repeats, BedSimpleRepeats),
            (array_inputs_bed_splice_junctions, BedSpliceJunctions),
            (array_inputs_gff3, Gff3),
            (array_inputs_gtf, Gtf),
            (array_inputs_psl, Psl),
            (array_inputs_bigpsl, bigPsl)]:
        if inputs:
            all_datatype_dictionary.update(create_ordered_datatype_objects(datatype_class, inputs, inputs_data))

    # Create an ordered dictionary to add the tracks in the tool form order
    all_datatype_ordered_dictionary = collections.OrderedDict(all_datatype_dictionary)

    logging.debug("----- End of all_datatype_dictionary processing -----")
    logging.debug("all_datatype_ordered_dictionary keys are: {0}".format(all_datatype_ordered_dictionary.keys()))

    logging.debug("----- Beginning of Track adding processing -----")
    for index, datatypeObject in all_datatype_ordered_dictionary.iteritems():
        trackHub.addTrack(datatypeObject.track.trackDb)
    logging.debug("----- End of Track adding processing -----")

    # We process all the modifications to create the zip file
    # trackHub.createZip()

    # We terminate the process and create an HTML file summarizing all the files
    trackHub.terminate()

    logging.debug('#### End of HubArchiveCreator Debug Mode: Bye! ####')

    sys.exit(0)


def sanitize_name_input(string_to_sanitize):
    """
    Sanitize the given string by replacing '/' and ' ' with '_'.

    :param string_to_sanitize: string to sanitize
    :return: the sanitized string

    :Example:

    >>> sanitize_name_input('this/is an//example')
    'this_is_an__example'
    """
    return string_to_sanitize \
        .replace("/", "_") \
        .replace(" ", "_")


def sanitize_name_inputs(inputs_data):
    """
    Sanitize the value of the "name" key of each dictionary passed in parameter.

    Galaxy outputs, or even plain file names coming from user inputs, sometimes contain spaces.
    They can also contain the '/' character, which would break the use of os.path functions.

    :param inputs_data: dict[string, dict[string, string]]
    """
    for key in inputs_data:
        inputs_data[key]["name"] = sanitize_name_input(inputs_data[key]["name"])


def create_ordered_datatype_objects(ExtensionClass, array_inputs, inputs_data):
    """
    Create all the files / folders needed by a given Datatype for the TrackHub,
    and update the dictionary of datatypes.

    :param ExtensionClass: Datatype class to instantiate for each input
    :param array_inputs: false paths of the inputs for this datatype
    :param inputs_data: metadata of all the inputs
    :type ExtensionClass: Datatype
    :type array_inputs: list[string]
    :type inputs_data: dict
    :rtype: dict
    """

    datatype_dictionary = {}

    # TODO: Optimize this double loop
    for input_false_path in array_inputs:
        for key, data_value in inputs_data.items():
            if key == input_false_path:
                logging.debug("input_false_path: " + input_false_path)
                logging.debug("data_value: " + str(data_value))
                extensionObject = ExtensionClass(input_false_path, data_value)
                datatype_dictionary.update({data_value["order_index"]: extensionObject})
    return datatype_dictionary

def configure_logger(extra_files_path=None, debug=False):
    if not extra_files_path:
        raise Exception("Extra files path is not set. Stopping the application")

    # All-cases log: log everything in a .log file
    logger_file_name = ''.join([__name__, '.log'])
    logging_file_path = os.path.join(extra_files_path, logger_file_name)

    logging.basicConfig(filename=logging_file_path, level=logging.DEBUG)

    log_stdout = logging.StreamHandler(sys.stdout)
    if not debug:
        configure_logger_user(log_stdout)
    else:
        configure_logger_dev(log_stdout)

    # stderr configuration
    configure_logger_stderr()

    logging.debug('#### Welcome to HubArchiveCreator Debug Mode ####\n')

def configure_logger_user(log_stdout=None):
    """
    The user logger is defined as follows:
    - The user needs WARN, ERROR and CRITICAL messages, well formatted / without traceback,
      in STDOUT
    - Full, raw output with tracebacks for errors remains available
      in STDERR
    - And further access to debug output if needed,
      in the .log file
    """

    if not log_stdout:
        raise Exception("No log_stdout given. Stopping the application")

    # stdout for INFO / WARN / ERROR / CRITICAL
    log_stdout.setLevel(logging.INFO)

    formatter = TraceBackFormatter('%(message)s')

    log_stdout.setFormatter(formatter)

    logging.getLogger().addHandler(log_stdout)

def configure_logger_dev(log_stdout=None):
    """
    The dev logger is defined as follows:
    - The dev needs WARN, ERROR and CRITICAL messages, well formatted / without traceback, in stdout
    - Full, raw output with tracebacks remains available in stderr for errors
    - And further access to debug output if needed
    """
    if not log_stdout:
        raise Exception("No log_stdout given. Stopping the application")
    log_format = '%(message)s'

    # stdout and stderr are both identical for INFO / WARN / ERROR / CRITICAL
    log_stdout.setLevel(logging.DEBUG)

    formatter = logging.Formatter(log_format)

    log_stdout.setFormatter(formatter)

    logging.getLogger().addHandler(log_stdout)

def configure_logger_stderr():
    """
    Configure what should be logged to stderr.
    """
    log_error = logging.StreamHandler(sys.stderr)
    log_error.setLevel(logging.ERROR)
    log_error_format = '%(message)s'

    formatter_error = logging.Formatter(log_error_format)

    log_error.setFormatter(formatter_error)

    logging.getLogger().addHandler(log_error)


if __name__ == "__main__":
    logging.getLogger(__name__)
    main(sys.argv)