comparison hubArchiveCreator.py @ 0:abcfd662b679 draft default tip

planemo upload for repository https://github.com/Yating-L/hubarchivecreator-test.git commit 199ae2b10f3b3e58cb4d4a3b9fb4b35db415c538-dirty
author yating-l
date Thu, 22 Dec 2016 17:53:00 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:abcfd662b679
1 #!/usr/bin/python
2 # -*- coding: utf8 -*-
3
4 """
5 This Galaxy tool permits to prepare your files to be ready for
6 Assembly Hub visualization.
7 Program test arguments:
8 hubArchiveCreator.py -g test-data/augustusDbia3.gff3 -f test-data/dbia3.fa -d . -u ./tools -o output.html
9 """
10
11 import argparse
12 import collections
13 import json
14 import logging
15 import os
16 import sys
17
18 # Internal dependencies
19 from Bam import Bam
20 from BedSimpleRepeats import BedSimpleRepeats
21 from Bed import Bed
22 from BigWig import BigWig
23 from util.Fasta import Fasta
24 from util.Filters import TraceBackFormatter
25 from Gff3 import Gff3
26 from Gtf import Gtf
27 from Psl import Psl
28 from TrackHub import TrackHub
29
30 # TODO: Verify each subprocessed dependency is accessible [gff3ToGenePred, genePredToBed, twoBitInfo, faToTwoBit, bedToBigBed, sort
31
32
def main(argv):
    """
    Entry point of HubArchiveCreator.

    Parses the command line, wraps every input dataset into its Datatype
    object, then builds the Track Hub folder and writes the HTML summary.

    :param argv: raw command-line arguments; kept for the caller's
        convenience — argparse itself reads sys.argv
    """
    # Command Line parsing init
    parser = argparse.ArgumentParser(description='Create a foo.txt inside the given folder.')

    # Reference genome mandatory
    parser.add_argument('-f', '--fasta', help='Fasta file of the reference genome')

    # GFF3 Management
    parser.add_argument('--gff3', action='append', help='GFF3 format')

    # GTF Management
    parser.add_argument('--gtf', action='append', help='GTF format')

    # Bed4+12 (TrfBig)
    parser.add_argument('--bedSimpleRepeats', action='append', help='Bed4+12 format, using simpleRepeats.as')

    # Generic Bed (Blastx transformed to bed)
    parser.add_argument('--bed', action='append', help='Bed generic format')

    # BigWig Management
    parser.add_argument('--bigwig', action='append', help='BigWig format')

    # Bam Management
    parser.add_argument('--bam', action='append', help='Bam format')

    # Psl Management
    parser.add_argument('--psl', action='append', help='Psl format')

    # TODO: Check if the running directory can have issues if we run the tool outside
    parser.add_argument('-d', '--directory',
                        help='Running tool directory, where to find the templates. Default is running directory')
    parser.add_argument('-u', '--ucsc_tools_path',
                        help='Directory where to find the executables needed to run this tool')
    parser.add_argument('-e', '--extra_files_path',
                        help='Name, in galaxy, of the output folder. Where you would want to build the Track Hub Archive')
    parser.add_argument('-o', '--output', help='Name of the HTML summarizing the content of the Track Hub Archive')

    parser.add_argument('-j', '--data_json', help='Json containing the metadata of the inputs')

    parser.add_argument('--user_email', help='Email of the user who launched the Hub Archive Creation')

    parser.add_argument('--genome_name', help='UCSC Genome Browser assembly ID')

    parser.add_argument('--debug_mode', action='store_true', help='Allow more details about the errors')

    # Get the args passed in parameter.
    # NOTE: the old '.' defaults for toolDirectory / extra_files_path were
    # dead code (immediately overwritten below), so they were removed.
    args = parser.parse_args()

    extra_files_path = args.extra_files_path
    toolDirectory = args.directory

    #### Logging management ####
    # If we are in Debug mode, also print in stdout the debug dump
    configure_logger(extra_files_path=extra_files_path, debug=args.debug_mode)
    #### END Logging management ####

    # args.fasta is a JSON dict: {"false_path": ..., "name": ...}
    array_inputs_reference_genome = json.loads(args.fasta)

    # TODO: Replace these with the object Fasta
    input_fasta_file = array_inputs_reference_genome["false_path"]
    input_fasta_file_name = sanitize_name_input(array_inputs_reference_genome["name"])
    genome_name = sanitize_name_input(args.genome_name)

    reference_genome = Fasta(input_fasta_file,
                             input_fasta_file_name, genome_name)

    user_email = args.user_email

    # TODO: Use a class to have a better management of the structure of these inputs
    # These inputs are populated in the Galaxy Wrapper xml and are in this format:
    # ARRAY[DICT{FILE_PATH: DICT{NAME: NAME_VALUE, EXTRA_DATA: EXTRA_DATA_VALUE}}]
    # EXTRA_DATA could be anything, for example the index of a BAM => {"index", FILE_PATH}
    array_inputs_bam = args.bam
    array_inputs_bed_generic = args.bed
    array_inputs_bed_simple_repeats = args.bedSimpleRepeats
    array_inputs_bigwig = args.bigwig
    array_inputs_gff3 = args.gff3
    array_inputs_gtf = args.gtf
    array_inputs_psl = args.psl

    outputFile = args.output

    json_inputs_data = args.data_json

    # TODO: Instead use a class to properly store the objects, with object_hook
    inputs_data = json.loads(json_inputs_data)
    # We remove the spaces in ["name"] of inputs_data
    sanitize_name_inputs(inputs_data)

    # TODO: Check here all the binaries / tools we need. Exception if missing

    # Create the Track Hub folder
    trackHub = TrackHub(reference_genome, user_email, outputFile, extra_files_path, toolDirectory)

    all_datatype_dictionary = {}

    for (inputs, datatype_class) in [
            (array_inputs_bam, Bam),
            (array_inputs_bed_generic, Bed),
            (array_inputs_bigwig, BigWig),
            (array_inputs_bed_simple_repeats, BedSimpleRepeats),
            (array_inputs_gff3, Gff3),
            (array_inputs_gtf, Gtf),
            (array_inputs_psl, Psl)]:
        if inputs:
            all_datatype_dictionary.update(create_ordered_datatype_objects(datatype_class, inputs, inputs_data))

    # Create Ordered Dictionary to add the tracks in the tool form order.
    # BUGFIX: OrderedDict(plain_dict) preserves the dict's (arbitrary) order,
    # it does NOT sort — the items must be sorted by their "order_index" key
    # explicitly. Assumes order_index values are mutually comparable
    # (they come from the Galaxy wrapper) — TODO confirm they are ints.
    all_datatype_ordered_dictionary = collections.OrderedDict(sorted(all_datatype_dictionary.items()))

    logging.debug("----- End of all_datatype_dictionary processing -----")
    # BUGFIX: the message said "keys" but .values() was being printed
    logging.debug("all_datatype_ordered_dictionary keys are: {0}".format(all_datatype_ordered_dictionary.keys()))

    logging.debug("----- Beginning of Track adding processing -----")
    # .items() instead of the Python2-only .iteritems(), consistent with
    # create_ordered_datatype_objects below
    for index, datatypeObject in all_datatype_ordered_dictionary.items():
        trackHub.addTrack(datatypeObject.track.trackDb)
    logging.debug("----- End of Track adding processing -----")

    # We process all the modifications to create the zip file
    #trackHub.createZip()

    # We terminate le process and so create a HTML file summarizing all the files
    trackHub.terminate()

    logging.debug('#### End of HubArchiveCreator Debug Mode: Bye! ####')

    sys.exit(0)
168
169
def sanitize_name_input(string_to_sanitize):
    """
    Return a copy of *string_to_sanitize* where every '/' and every
    space has been replaced by an underscore.

    :param string_to_sanitize: raw name coming from Galaxy or the user
    :return: the sanitized name

    :Example:

    >>> sanitize_name_input('this/is an//example')
    'this_is_an__example'
    """
    without_slashes = string_to_sanitize.replace("/", "_")
    return without_slashes.replace(" ", "_")
185
186
def sanitize_name_inputs(inputs_data):
    """
    Sanitize, in place, the "name" value of every entry of *inputs_data*:
    '/' and ' ' become '_'.

    Galaxy outputs (and plain user file names) may contain spaces, and a
    '/' would break the os.path-based handling further down the line.

    :param inputs_data: dict[string, dict[string, string]]
    """
    for entry in inputs_data.values():
        # Same replacement rule as sanitize_name_input, inlined here.
        entry["name"] = entry["name"].replace("/", "_").replace(" ", "_")
198
199
def create_ordered_datatype_objects(ExtensionClass, array_inputs, inputs_data):
    """
    Instantiate a Datatype object (which creates all the necessary files /
    folders for the TrackHub) for each input, and return them keyed by the
    input's "order_index" metadata.

    :param ExtensionClass: Datatype subclass to instantiate
    :param array_inputs: "false paths" of the inputs of this datatype
    :param inputs_data: metadata dict, keyed by false path
    :type ExtensionClass: Datatype
    :type array_inputs: list[string]
    :type inputs_data: dict
    :rtype: dict
    """

    datatype_dictionary = {}

    # inputs_data is keyed by the false path itself, so the former
    # double loop (key == input_false_path over .items()) is a plain
    # O(1) lookup; inputs with no metadata entry are skipped, exactly
    # as they silently produced no match before.
    for input_false_path in array_inputs:
        data_value = inputs_data.get(input_false_path)
        if data_value is None:
            continue
        logging.debug("input_false_path: " + input_false_path)
        logging.debug("data_value: " + str(data_value))
        extensionObject = ExtensionClass(input_false_path, data_value)
        datatype_dictionary[data_value["order_index"]] = extensionObject
    return datatype_dictionary
225
def configure_logger(extra_files_path=None, debug=False):
    """
    Configure application-wide logging.

    Everything down to DEBUG goes into a <module>.log file inside
    *extra_files_path*; stdout gets a user- or dev-flavoured handler
    depending on *debug*; stderr always receives ERROR and above.

    :param extra_files_path: directory where the .log file is written
    :param debug: when True, dump DEBUG records on stdout as well
    :raises Exception: when extra_files_path is not provided
    """
    if not extra_files_path:
        raise Exception("Extra files path is not set. Stopping the application")

    # All case log: everything lands in a .log file next to the outputs
    logging_file_path = os.path.join(extra_files_path, '{0}.log'.format(__name__))
    logging.basicConfig(filename=logging_file_path, level=logging.DEBUG)

    # stdout handler: verbosity depends on the debug flag
    log_stdout = logging.StreamHandler(sys.stdout)
    if debug:
        configure_logger_dev(log_stdout)
    else:
        configure_logger_user(log_stdout)

    # stderr configuration
    configure_logger_stderr()

    logging.debug('#### Welcome in HubArchiveCreator Debug Mode ####\n')
247
def configure_logger_user(log_stdout=None):
    """
    Attach *log_stdout* to the root logger, configured for end users:

    - INFO / WARN / ERROR / CRITICAL on stdout, message-only and with
      tracebacks stripped (TraceBackFormatter)
    - full traceback output remains available on stderr
    - DEBUG detail remains available in the .log file

    :param log_stdout: stream handler bound to sys.stdout
    :raises Exception: when no handler is given
    """
    if not log_stdout:
        raise Exception("No log_stdout given. Stopping the application")

    # Users see INFO and above, without raw tracebacks
    log_stdout.setLevel(logging.INFO)
    log_stdout.setFormatter(TraceBackFormatter('%(message)s'))

    logging.getLogger().addHandler(log_stdout)
271
def configure_logger_dev(log_stdout=None):
    """
    Attach *log_stdout* to the root logger, configured for developers:
    DEBUG and above on stdout, message-only format, while stderr keeps
    the full traceback output for errors.

    :param log_stdout: stream handler bound to sys.stdout
    :raises Exception: when no handler is given
    """
    if not log_stdout:
        raise Exception("No log_stdout given. Stopping the application")

    # Devs get the full firehose on stdout
    log_stdout.setLevel(logging.DEBUG)
    log_stdout.setFormatter(logging.Formatter('%(message)s'))

    logging.getLogger().addHandler(log_stdout)
292
def configure_logger_stderr():
    """
    Attach a stderr handler to the root logger so that ERROR and
    CRITICAL records are echoed there, message-only.
    """
    stderr_handler = logging.StreamHandler(sys.stderr)
    stderr_handler.setLevel(logging.ERROR)
    stderr_handler.setFormatter(logging.Formatter('%(message)s'))
    logging.getLogger().addHandler(stderr_handler)
306
if __name__ == "__main__":
    # NOTE(review): the getLogger result is discarded — presumably meant to
    # pre-create the module logger before main() attaches handlers; confirm
    # whether this line is needed at all.
    logging.getLogger(__name__)
    # main() ignores argv and lets argparse read sys.argv directly.
    main(sys.argv)