Mercurial > repos > yating-l > hubarchivecreator
comparison hubArchiveCreator.py @ 11:3f7c40fb51e8 draft
planemo upload for repository https://github.com/goeckslab/hub-archive-creator commit c778f8ee85e9acc924c5c0a30042ac90c8e7a70d-dirty
| author | yating-l |
|---|---|
| date | Wed, 28 Dec 2016 17:50:06 -0500 |
| parents | |
| children | 364b8db8de17 |
comparison
equal
deleted
inserted
replaced
| 10:cf1abb96a43d | 11:3f7c40fb51e8 |
|---|---|
| 1 #!/usr/bin/python | |
| 2 # -*- coding: utf8 -*- | |
| 3 | |
| 4 """ | |
| 5 This Galaxy tool permits to prepare your files to be ready for | |
| 6 Assembly Hub visualization. | |
| 7 Program test arguments: | |
| 8 hubArchiveCreator.py -g test-data/augustusDbia3.gff3 -f test-data/dbia3.fa -d . -u ./tools -o output.html | |
| 9 """ | |
| 10 | |
| 11 import argparse | |
| 12 import collections | |
| 13 import json | |
| 14 import logging | |
| 15 import os | |
| 16 import sys | |
| 17 | |
| 18 # Internal dependencies | |
| 19 from Bam import Bam | |
| 20 from BedSimpleRepeats import BedSimpleRepeats | |
| 21 from BedSpliceJunctions import BedSpliceJunctions | |
| 22 from Bed import Bed | |
| 23 from BigWig import BigWig | |
| 24 from util.Fasta import Fasta | |
| 25 from util.Filters import TraceBackFormatter | |
| 26 from Gff3 import Gff3 | |
| 27 from Gtf import Gtf | |
| 28 from Psl import Psl | |
| 29 from TrackHub import TrackHub | |
| 30 from bigPsl import bigPsl | |
| 31 | |
| 32 # TODO: Verify each subprocessed dependency is accessible [gff3ToGenePred, genePredToBed, twoBitInfo, faToTwoBit, bedToBigBed, sort | |
| 33 | |
| 34 | |
def main(argv):
    """
    Entry point of the Hub Archive Creator tool.

    Parses the command line, configures logging, loads the reference genome
    and every optional track input (bam, bed, bigwig, gff3, gtf, psl, ...),
    then builds the Track Hub folder and the HTML summary.

    :param argv: command-line arguments (unused directly; argparse reads sys.argv)
    """
    # Command Line parsing init
    parser = argparse.ArgumentParser(description='Create a foo.txt inside the given folder.')

    # Reference genome mandatory: enforce it so a missing -f yields a clean
    # argparse error instead of a TypeError inside json.loads(None) below.
    parser.add_argument('-f', '--fasta', required=True,
                        help='Fasta file of the reference genome')

    # GFF3 Management
    parser.add_argument('--gff3', action='append', help='GFF3 format')

    # GTF Management
    parser.add_argument('--gtf', action='append', help='GTF format')

    # Bed4+12 (TrfBig)
    parser.add_argument('--bedSimpleRepeats', action='append', help='Bed4+12 format, using simpleRepeats.as')

    # Bed12+1 (regtools)
    parser.add_argument('--bedSpliceJunctions', action='append', help='Bed12+1 format, using spliceJunctions.as')

    # Generic Bed (Blastx transformed to bed)
    parser.add_argument('--bed', action='append', help='Bed generic format')

    # Bed12+12 (tblastn)
    parser.add_argument('--bigpsl', action='append', help='bigPsl format')

    # BigWig Management
    parser.add_argument('--bigwig', action='append', help='BigWig format')

    # Bam Management
    parser.add_argument('--bam', action='append', help='Bam format')

    # Psl Management
    parser.add_argument('--psl', action='append', help='Psl format')

    # TODO: Check if the running directory can have issues if we run the tool outside
    parser.add_argument('-d', '--directory',
                        help='Running tool directory, where to find the templates. Default is running directory')
    parser.add_argument('-u', '--ucsc_tools_path',
                        help='Directory where to find the executables needed to run this tool')
    parser.add_argument('-e', '--extra_files_path',
                        help='Name, in galaxy, of the output folder. Where you would want to build the Track Hub Archive')
    parser.add_argument('-o', '--output', help='Name of the HTML summarizing the content of the Track Hub Archive')

    parser.add_argument('-j', '--data_json', help='Json containing the metadata of the inputs')

    parser.add_argument('--user_email', help='Email of the user who launched the Hub Archive Creation')

    parser.add_argument('--genome_name', help='UCSC Genome Browser assembly ID')

    parser.add_argument('--debug_mode', action='store_true', help='Allow more details about the errors')

    # Get the args passed in parameter
    args = parser.parse_args()

    extra_files_path = args.extra_files_path
    toolDirectory = args.directory

    #### Logging management ####
    # If we are in Debug mode, also print in stdout the debug dump

    configure_logger(extra_files_path=extra_files_path, debug=args.debug_mode)

    #### END Logging management ####

    array_inputs_reference_genome = json.loads(args.fasta)

    # TODO: Replace these with the object Fasta
    input_fasta_file = array_inputs_reference_genome["false_path"]
    input_fasta_file_name = sanitize_name_input(array_inputs_reference_genome["name"])
    genome_name = sanitize_name_input(args.genome_name)

    reference_genome = Fasta(input_fasta_file,
                             input_fasta_file_name, genome_name)

    user_email = args.user_email

    # TODO: Use a class to have a better management of the structure of these inputs
    # These inputs are populated in the Galaxy Wrapper xml and are in this format:
    # ARRAY[DICT{FILE_PATH: DICT{NAME: NAME_VALUE, EXTRA_DATA: EXTRA_DATA_VALUE}}]
    # EXTRA_DATA could be anything, for example the index of a BAM => {"index", FILE_PATH}
    array_inputs_bam = args.bam
    array_inputs_bed_generic = args.bed
    array_inputs_bed_simple_repeats = args.bedSimpleRepeats
    array_inputs_bed_splice_junctions = args.bedSpliceJunctions
    array_inputs_bigwig = args.bigwig
    array_inputs_gff3 = args.gff3
    array_inputs_gtf = args.gtf
    array_inputs_psl = args.psl
    array_inputs_bigpsl = args.bigpsl

    outputFile = args.output

    json_inputs_data = args.data_json

    # TODO: Instead use a class to properly store the objects, with object_hook
    inputs_data = json.loads(json_inputs_data)
    # We remove the spaces in ["name"] of inputs_data
    sanitize_name_inputs(inputs_data)

    # TODO: Check here all the binaries / tools we need. Exception if missing

    # Create the Track Hub folder
    trackHub = TrackHub(reference_genome, user_email, outputFile, extra_files_path, toolDirectory)

    all_datatype_dictionary = {}

    for (inputs, datatype_class) in [
            (array_inputs_bam, Bam),
            (array_inputs_bed_generic, Bed),
            (array_inputs_bigwig, BigWig),
            (array_inputs_bed_simple_repeats, BedSimpleRepeats),
            (array_inputs_bed_splice_junctions, BedSpliceJunctions),
            (array_inputs_gff3, Gff3),
            (array_inputs_gtf, Gtf),
            (array_inputs_psl, Psl),
            (array_inputs_bigpsl, bigPsl)]:
        if inputs:
            all_datatype_dictionary.update(create_ordered_datatype_objects(datatype_class, inputs, inputs_data))

    # Create an Ordered Dictionary keyed by order_index so tracks are added in
    # the tool form order. NOTE: the previous OrderedDict(all_datatype_dictionary)
    # only froze the plain dict's arbitrary ordering; sorting by key is required.
    all_datatype_ordered_dictionary = collections.OrderedDict(sorted(all_datatype_dictionary.items()))

    logging.debug("----- End of all_datatype_dictionary processing -----")
    logging.debug("all_datatype_ordered_dictionary keys are: {0}".format(all_datatype_ordered_dictionary.keys()))

    logging.debug("----- Beginning of Track adding processing -----")
    # .items() instead of Py2-only .iteritems() keeps Python 3 compatibility.
    for index, datatypeObject in all_datatype_ordered_dictionary.items():
        trackHub.addTrack(datatypeObject.track.trackDb)
    logging.debug("----- End of Track adding processing -----")

    # We process all the modifications to create the zip file
    #trackHub.createZip()

    # We terminate the process and so create a HTML file summarizing all the files
    trackHub.terminate()

    logging.debug('#### End of HubArchiveCreator Debug Mode: Bye! ####')

    sys.exit(0)
| 180 | |
| 181 | |
def sanitize_name_input(string_to_sanitize):
    """
    Sanitize the string passed in parameter by replacing '/' and ' ' by '_'.

    :param string_to_sanitize: raw name, possibly containing spaces or slashes
    :return: sanitized copy, safe to use in file system paths

    :Example:

    >>> sanitize_name_input('this/is an//example')
    'this_is_an__example'
    """
    return string_to_sanitize \
        .replace("/", "_") \
        .replace(" ", "_")
| 197 | |
| 198 | |
def sanitize_name_inputs(inputs_data):
    """
    Sanitize, in place, the value under the "name" key of every entry.

    Galaxy outputs (and plain user file names) may contain spaces, and '/'
    characters would break os.path based handling, so both are mapped to '_'.

    :param inputs_data: dict[string, dict[string, string]]
    """
    for metadata in inputs_data.values():
        # Same substitution as sanitize_name_input, applied to each entry.
        metadata["name"] = metadata["name"].replace("/", "_").replace(" ", "_")
| 210 | |
| 211 | |
def create_ordered_datatype_objects(ExtensionClass, array_inputs, inputs_data):
    """
    Create all the necessary files / folders for one Datatype, for TrackHub,
    and return the dictionary of created datatype objects keyed by their
    tool-form order index.

    :param ExtensionClass: datatype class to instantiate for each input
    :param array_inputs: "false paths" of the inputs for this datatype
    :param inputs_data: metadata dict keyed by false path
    :type ExtensionClass: Datatype
    :type array_inputs: list[string]
    :type inputs_data: dict
    :rtype: dict
    """

    datatype_dictionary = {}

    # Direct dict lookup replaces the former O(len(array_inputs) *
    # len(inputs_data)) double loop; dict keys are unique so behavior is identical.
    for input_false_path in array_inputs:
        if input_false_path in inputs_data:
            data_value = inputs_data[input_false_path]
            logging.debug("input_false_path: " + input_false_path)
            logging.debug("data_value: " + str(data_value))
            extensionObject = ExtensionClass(input_false_path, data_value)
            datatype_dictionary.update({data_value["order_index"]: extensionObject})
    return datatype_dictionary
| 237 | |
def configure_logger(extra_files_path=None, debug=False):
    """
    Configure the application-wide logging.

    Everything (DEBUG and above) goes to a .log file inside extra_files_path;
    stdout gets either the user-level or the dev-level handler depending on
    *debug*; stderr always gets the error handler.

    :param extra_files_path: directory receiving the .log file (mandatory)
    :param debug: when True, dump full DEBUG output on stdout as well
    :raises Exception: when extra_files_path is not provided
    """
    if not extra_files_path:
        raise Exception("Extra files path is not set. Stopping the application")

    # All case log: everything ends up in a <module_name>.log file
    logging_file_path = os.path.join(extra_files_path, __name__ + '.log')
    logging.basicConfig(filename=logging_file_path, level=logging.DEBUG)

    stdout_handler = logging.StreamHandler(sys.stdout)
    if debug:
        configure_logger_dev(stdout_handler)
    else:
        configure_logger_user(stdout_handler)

    # stderr configuration
    configure_logger_stderr()

    logging.debug('#### Welcome in HubArchiveCreator Debug Mode ####\n')
| 259 | |
def configure_logger_user(log_stdout=None):
    """
    Attach the user-facing stdout handler.

    User Logger contract:
      - WARN, ERROR and CRITICAL reach stdout well formatted, without traceback
      - full, brute traceback output remains available on stderr
      - further DEBUG detail remains available in the .log file

    :param log_stdout: stream handler bound to sys.stdout
    :raises Exception: when no handler is given
    """
    if log_stdout is None:
        raise Exception("No log_stdout given. Stopping the application")

    # stdout for INFO / WARN / ERROR / CRITICAL, tracebacks stripped
    log_stdout.setLevel(logging.INFO)
    log_stdout.setFormatter(TraceBackFormatter('%(message)s'))

    logging.getLogger().addHandler(log_stdout)
| 283 | |
def configure_logger_dev(log_stdout=None):
    """
    Attach the developer-facing stdout handler.

    Dev Logger contract: stdout carries the full DEBUG stream, message-only
    formatted; brute tracebacks remain available on stderr.

    :param log_stdout: stream handler bound to sys.stdout
    :raises Exception: when no handler is given
    """
    if log_stdout is None:
        raise Exception("No log_stdout given. Stopping the application")

    # stdout mirrors everything from DEBUG upward
    log_stdout.setLevel(logging.DEBUG)
    log_stdout.setFormatter(logging.Formatter('%(message)s'))

    logging.getLogger().addHandler(log_stdout)
| 304 | |
def configure_logger_stderr():
    """
    Attach the stderr handler: ERROR and CRITICAL records, message-only format.
    """
    handler = logging.StreamHandler(sys.stderr)
    handler.setLevel(logging.ERROR)
    handler.setFormatter(logging.Formatter('%(message)s'))

    logging.getLogger().addHandler(handler)
| 318 | |
if __name__ == "__main__":
    # Dropped the former bare `logging.getLogger(__name__)` call: its return
    # value was discarded and logging is configured inside main() anyway.
    main(sys.argv)
