changeset 25:31a41ce128cc draft
planemo upload for repository https://github.com/Yating-L/jbrowse-archive-creator.git commit 691e5366893905d30943a3cb8cdfb6341f0f5362-dirty
--- a/README.md Wed Jul 12 12:55:27 2017 -0400
+++ b/README.md Fri Oct 13 12:44:31 2017 -0400
@@ -4,11 +4,10 @@
 ## Features
 1. Similar interface to Hub Archive Creator.
 2. Convert tracks to GFF3 datatypes (e.g Blastxml => GFF3) in order to import feature data from the flat files
-3. Generate a zip file including all the tracks and configuration for JBrowse visualization
-4. Group the tracks
-5. Set the color for each track
-6. Set the label for each track
-7. Create workflows within Galaxy to automatize pipeline analysis and get them ready to visualization inside JBrowse...in a few clicks!
+3. Group the tracks
+4. Set the color for each track
+5. Set the label for each track
+6. Create workflows within Galaxy to automatize pipeline analysis and get them ready to visualization inside JBrowse...in a few clicks!
 
 At the moment, Supported datatypes are:
 - Bam
@@ -19,6 +18,7 @@
 - Gff3
 - Gtf
 - Blastxml
+- BigPsl
 
 ## Installation:
 1. You would need to add this tool into your Galaxy.
--- a/TrackHub.py Wed Jul 12 12:55:27 2017 -0400
+++ b/TrackHub.py Fri Oct 13 12:44:31 2017 -0400
@@ -5,169 +5,180 @@
 import shutil
 import zipfile
 import json
-import utils
+import tempfile
+import logging
+
+from datatypes.Datatype import Datatype
+from apollo.ApolloInstance import ApolloInstance
+from tracks.TrackStyles import TrackStyles
+from util import subtools
+from util import santitizer
 
 class TrackHub:
-    def __init__(self, inputFiles, reference, outputDirect, tool_dir, genome, extra_files_path, metaData, jbrowse_host):
-        self.input_files = inputFiles.tracks
-        self.outfile = outputDirect
-        self.outfolder = extra_files_path
-        self.out_path = os.path.join(extra_files_path, 'myHub')
-        self.reference = reference
-        self.tool_dir = tool_dir
-        self.metaData = metaData
-        self.raw = os.path.join(self.out_path, 'raw')
-        self.json = os.path.join(self.out_path, 'json')
-        self.jbrowse_host = jbrowse_host
-        try:
-            if os.path.exists(self.json):
-                shutil.rmtree(self.json)
-            os.makedirs(self.json)
-        except OSError as e:
-            print "Cannot create json folder error({0}): {1}".format(e.errno, e.strerror)
-        else:
-            print "Create jbrowse folder {}".format(self.out_path)
+    def __init__(self, inputFastaFile, apollo_user, outputFile, extra_files_path, tool_directory, trackType, apollo_host):
+
+        self.rootAssemblyHub = None
+
+        self.mySpecieFolderPath = None
+
+        # Store intermediate files, will be removed if not in debug mode
+        self.myTracksFolderPath = None
+
+        # Store binary files: Bam, BigWig
+        self.myBinaryFolderPath = None
+
+        self.tool_directory = tool_directory
+        self.trackType = trackType
+        self.reference_genome = inputFastaFile
+        self.genome_name = inputFastaFile.assembly_id
+        self.extra_files_path = extra_files_path
+        self.outputFile = outputFile
+        self.chromSizesFile = None
+
+        # Set up apollo
+        self.apollo = ApolloInstance(apollo_host)
+        self.apollo_user = apollo_user
+
+        # Set all the missing variables of this class, and create physically the folders/files
+        self.rootAssemblyHub = self.__createAssemblyHub__(extra_files_path=extra_files_path)
+        # Init the Datatype
+        Datatype.pre_init(self.reference_genome, self.chromSizesFile,
+                          self.extra_files_path, self.tool_directory,
+                          self.mySpecieFolderPath, self.myTracksFolderPath, self.myBinaryFolderPath, self.trackType)
+
+        self._prepareRefseq()
+        self.trackList = os.path.join(self.mySpecieFolderPath, "trackList.json")
+        self._createTrackList()
+
+        self.myTrackStyle = TrackStyles(self.tool_directory, self.mySpecieFolderPath, self.trackList)
+        #self.cssFolderPath = os.path.join(self.mySpecieFolderPath, 'css')
+        #self.cssFilePath = os.path.join(self.cssFolderPath, 'custom_track_styles.css')
+        self.logger = logging.getLogger(__name__)
+
 
-    def createHub(self):
-        self.prepareRefseq()
-        for input_file in self.input_files:
-            self.addTrack(input_file)
-        self.indexName()
-        slink = self.makeArchive()
-        self.outHtml(slink)
+
+    def addTrack(self, trackDbObject):
+        if trackDbObject['dataType'].lower() == 'bam':
+            #new_track = subprocess.Popen(['echo', trackDbObject['options']], stdout=subprocess.PIPE)
+            #subprocess.call(['add-track-json.pl', json_file], stdin=new_track.stdout)
+            subtools.add_track_json(self.trackList, trackDbObject['options'])
+            #subtools.add_track_json(self.trackList, trackDbObject['track_json'])
+        elif trackDbObject['dataType'].lower() == 'bigwig':
+            subtools.add_track_json(self.trackList, trackDbObject['options'])
+        else:
+            if trackDbObject['trackType'] == 'HTMLFeatures':
+                self._customizeHTMLFeature(trackDbObject)
+            subtools.flatfile_to_json(trackDbObject['trackDataURL'], trackDbObject['dataType'], trackDbObject['trackType'], trackDbObject['trackLabel'], self.mySpecieFolderPath, trackDbObject['options'])
+
+
+    def terminate(self, debug=False):
+        """ Write html file """
+        self._indexName()
+        if not debug:
+            self._removeRaw()
+        self._makeArchive()
         print "Success!\n"
-
-    def prepareRefseq(self):
-        try:
+
+
+    def _customizeHTMLFeature(self, trackDbObject):
+        if trackDbObject['options']:
+            subfeatures = trackDbObject['options'].get('subfeatureClasses')
+            feature_color = trackDbObject['options']['feature_color']
+            if subfeatures:
+                for key, value in subfeatures.items():
+                    self.myTrackStyle.addCustomColor(value, feature_color)
+            else:
+                customizedFeature = santitizer.sanitize_name(trackDbObject['trackLabel'])
+                clientConfig = json.loads(trackDbObject['options']['clientConfig'])
+                clientConfig['renderClassName'] = customizedFeature
+                trackDbObject['options']['clientConfig'] = json.dumps(clientConfig)
+                self.myTrackStyle.addCustomColor(customizedFeature, feature_color)
+
+    def _removeRaw(self):
+        if os.path.exists(self.myTracksFolderPath):
+            shutil.rmtree(self.myTracksFolderPath)
+
+    def _createTrackList(self):
+        if not os.path.exists(self.trackList):
+            os.mknod(self.trackList)
+
+    def _prepareRefseq(self):
+        subtools.prepare_refseqs(self.reference_genome.false_path, self.mySpecieFolderPath)
+        #try:
             #print os.path.join(self.tool_dir, 'prepare-refseqs.pl') + ", '--fasta', " + self.reference +", '--out', self.json])"
-            subprocess.call(['prepare-refseqs.pl', '--fasta', self.reference, '--out', self.json])
-        except OSError as e:
-            print "Cannot prepare reference error({0}): {1}".format(e.errno, e.strerror)
-    #TODO: hard coded the bam and bigwig tracks. Need to allow users to customize the settings
-    def addTrack(self, track):
-        #print "false_path" , track['false_path']
-        if track['false_path'] in self.metaData.keys():
-            metadata = self.metaData[track['false_path']]
-        else:
-            metadata = {}
-        self.SetMetadata(track, metadata)
-        if track['dataType'] == 'bam':
-            self.Bam(track, metadata)
-            # print "add bam track\n"
-        elif track['dataType'] == 'bigwig':
-            #create trackList.json if not exist
-            self.createTrackList()
-            json_file = os.path.join(self.json, "trackList.json")
-            bigwig_file = os.path.join(self.raw, track['fileName'])
-            subprocess.call(['add-bw-track.pl', '--label', metadata['label'], '--bw_url', bigwig_file, '--pos_color', metadata['style']['pos_color'], '--neg_color', metadata['style']['neg_color'], '--plot', 'JBrowse/View/Track/Wiggle/XYPlot', '--out', json_file, '--in', json_file])
-        else:
-            flat_file = os.path.join(self.raw, track['fileName'])
-            if track['dataType'] == 'bed':
-                subprocess.call(['flatfile-to-json.pl', '--bed', flat_file, '--trackType', metadata['type'], '--trackLabel', metadata['label'], '--Config', '{"category" : "%s"}' % metadata['category'], '--clientConfig', '{"color" : "%s"}' % metadata['color'], '--out', self.json])
-            elif track['dataType'] == 'bedSpliceJunctions' or track['dataType'] == 'gtf' or track['dataType'] == 'blastxml':
-                subprocess.call(['flatfile-to-json.pl', '--gff', flat_file, '--trackType', metadata['type'], '--trackLabel', metadata['label'], '--Config', '{"glyph": "JBrowse/View/FeatureGlyph/Segments", "category" : "%s"}' % metadata['category'], '--clientConfig', '{"color" : "%s"}' % metadata['color'], '--out', self.json])
-            elif track['dataType'] == 'gff3_transcript':
-                subprocess.call(['flatfile-to-json.pl', '--gff', flat_file, '--trackType', metadata['type'], '--trackLabel', metadata['label'], '--Config', '{"transcriptType": "transcript", "category" : "%s"}' % metadata['category'], '--clientConfig', '{"color" : "%s"}' % metadata['color'], '--out', self.json])
-            else:
-                subprocess.call(['flatfile-to-json.pl', '--gff', flat_file, '--trackType', metadata['type'], '--trackLabel', metadata['label'], '--Config', '{"category" : "%s"}' % metadata['category'], '--clientConfig', '{"color" : "%s"}' % metadata['color'], '--out', self.json])
-
-    def indexName(self):
-        subprocess.call(['generate-names.pl', '-v', '--out', self.json])
+        #subprocess.call(['prepare-refseqs.pl', '--fasta', self.reference_genome.false_path, '--out', self.mySpecieFolderPath])
+        #except OSError as e:
+            #print "Cannot prepare reference error({0}): {1}".format(e.errno, e.strerror)
+
+    def _indexName(self):
+        #subprocess.call(['generate-names.pl', '-v', '--out', self.mySpecieFolderPath])
+        subtools.generate_names(self.mySpecieFolderPath)
         print "finished name index \n"
 
-    def makeArchive(self):
-        file_dir = os.path.abspath(self.outfile)
-        source_dir = os.path.dirname(file_dir)
-        folder_name = os.path.basename(self.outfolder)
-        source_name = os.path.basename(self.out_path)
-        source = os.path.join(source_dir, folder_name, source_name)
-        slink = source.replace('/', '_')
-        slink = os.path.join('/var/www/html/JBrowse-1.12.1/data', slink)
-        try:
-            if os.path.islink(slink):
-                os.unlink(slink)
-        except OSError as oserror:
-            print "Cannot create symlink to the data({0}): {1}".format(oserror.errno, oserror.strerror)
-        os.symlink(source, slink)
-        return slink
-
-    def outHtml(self, slink):
-        with open(self.outfile, 'w') as htmlfile:
-            htmlstr = 'The JBrowse Hub is created: <br>'
-            url = self.jbrowse_host + "/JBrowse-1.12.1/index.html?data=%s"
-            jbrowse_hub = '<li><a href = "%s" target="_blank">View JBrowse Hub</a></li>' % url
-            link_name = os.path.basename(slink)
-            relative_path = os.path.join('data', link_name + '/json')
-            htmlstr += jbrowse_hub % relative_path
-            htmlfile.write(htmlstr)
+    def _outHtml(self, host_name):
+        with open(self.outputFile, 'w') as htmlfile:
+            htmlstr = 'The new Organism "%s" is created on Apollo: <br>' % self.genome_name
+            jbrowse_hub = '<li><a href = "%s" target="_blank">View JBrowse Hub on Apollo</a></li>' % host_name
+            htmlstr += jbrowse_hub
+            htmlfile.write(htmlstr)
+
+    def _makeArchive(self):
+        self.apollo.loadHubToApollo(self.apollo_user, self.genome_name, self.mySpecieFolderPath, admin=True)
+        apollo_host = self.apollo.getHost()
+        self._outHtml(apollo_host)
+
+
+    def __createAssemblyHub__(self, extra_files_path):
+        # Get all necessaries infos first
+        # 2bit file creation from input fasta
+
+        # baseNameFasta = os.path.basename(fasta_file_name)
+        # suffixTwoBit, extensionTwoBit = os.path.splitext(baseNameFasta)
+        # nameTwoBit = suffixTwoBit + '.2bit'
+        twoBitFile = tempfile.NamedTemporaryFile(bufsize=0)
+        subtools.faToTwoBit(self.reference_genome.false_path, twoBitFile.name)
+
+        # Generate the twoBitInfo
+        twoBitInfoFile = tempfile.NamedTemporaryFile(bufsize=0)
+        subtools.twoBitInfo(twoBitFile.name, twoBitInfoFile.name)
+
+        # Then we get the output to generate the chromSizes
+        self.chromSizesFile = tempfile.NamedTemporaryFile(bufsize=0, suffix=".chrom.sizes")
+        subtools.sortChromSizes(twoBitInfoFile.name, self.chromSizesFile.name)
 
-    def createTrackList(self):
-        trackList = os.path.join(self.json, "trackList.json")
-        if not os.path.exists(trackList):
-            os.mknod(trackList)
-
-    def Bam(self, track, metadata):
-        #create trackList.json if not exist
-        self.createTrackList()
-        json_file = os.path.join(self.json, "trackList.json")
-        bam_track = dict()
-        bam_track['type'] = 'JBrowse/View/Track/Alignments2'
-        bam_track['storeClass'] = 'JBrowse/Store/SeqFeature/BAM'
-        bam_track['urlTemplate'] = os.path.join('../raw', track['fileName'])
-        bam_track['baiUrlTemplate'] = os.path.join('../raw', track['index'])
-        bam_track['label'] = metadata['label']
-        bam_track['category'] = metadata['category']
-        bam_track = json.dumps(bam_track)
-        #Use add-track-json.pl to add bam track to json file
-        new_track = subprocess.Popen(['echo', bam_track], stdout=subprocess.PIPE)
-        subprocess.call(['add-track-json.pl', json_file], stdin=new_track.stdout)
-    '''
-    def BigWig(self, track, metadata):
-        #create trackList.json if not exist
-        self.createTrackList()
-        json_file = os.path.join(self.json, "trackList.json")
-        bigwig_track = dict()
-        bigwig_track['urlTemplate'] = os.path.join('../raw', track['fileName'])
-        bigwig_track['type'] = 'JBrowse/View/Track/Wiggle/XYPlot'
-        bigwig_track['storeClass'] = 'JBrowse/Store/SeqFeature/BigWig'
-        bigwig_track['label'] = metadata['label']
-        bigwig_track['style'] = metadata['style']
-        bigwig_track['category'] = metadata['category']
-        bigwig_track = json.dumps(bigwig_track)
-        #Use add-track-json.pl to add bigwig track to json file
-        new_track = subprocess.Popen(['echo', bigwig_track], stdout=subprocess.PIPE)
-        #output = new_track.communicate()[0]
-        subprocess.call(['add-track-json.pl', json_file], stdin=new_track.stdout)
-    '''
-    def BigWig
-    #If the metadata is not set, use the default value
-    def SetMetadata(self, track, metadata):
-        if 'label' not in metadata.keys() or metadata['label'] == '':
-            metadata['label'] = track['fileName']
-        if 'color' not in metadata.keys() or metadata['color'] == '':
-            metadata['color'] = "#daa520"
-        if track['dataType'] == 'bigwig':
-            if 'style' not in metadata.keys():
-                metadata['style'] = {}
-            if 'pos_color' not in metadata['style'] or metadata['style']['pos_color'] == '':
-                metadata['style']['pos_color'] = "#FFA600"
-            if 'neg_color' not in metadata['style'] or metadata['style']['neg_color'] == '':
-                metadata['style']['neg_color'] = "#005EFF"
-        if 'category' not in metadata.keys() or metadata['category'] == '':
-            metadata['category'] = "Default group"
-        if track['dataType'] == 'blastxml':
-            metadata['type'] = "G-OnRamp_plugin/BlastAlignment"
-        elif track['dataType'] == 'bigpsl':
-            metadata['type'] = "G-OnRamp_plugin/BlatAlignment"
-        elif track['dataType'] == 'gff3_transcript' or track['dataType'] == 'gff3_mrna':
-            metadata['type'] = "G-OnRamp_plugin/GenePred"
-        else:
-            metadata['type'] = "CanvasFeatures"
+        # We can get the biggest scaffold here, with chromSizesFile
+        with open(self.chromSizesFile.name, 'r') as chrom_sizes:
+            # TODO: Check if exists
+            self.default_pos = chrom_sizes.readline().split()[0]
+
+        # TODO: Manage to put every fill Function in a file dedicated for reading reasons
+        # Create the root directory
+        myHubPath = os.path.join(extra_files_path, "myHub")
+        if not os.path.exists(myHubPath):
+            os.makedirs(myHubPath)
+
+        # Create the specie folder
+        # TODO: Generate the name depending on the specie
+        mySpecieFolderPath = os.path.join(myHubPath, self.genome_name)
+        if not os.path.exists(mySpecieFolderPath):
+            os.makedirs(mySpecieFolderPath)
+        self.mySpecieFolderPath = mySpecieFolderPath
 
-
+        # We create the 2bit file while we just created the specie folder
+        #self.twoBitName = self.genome_name + ".2bit"
+        #self.two_bit_final_path = os.path.join(self.mySpecieFolderPath, self.twoBitName)
+        #shutil.copyfile(twoBitFile.name, self.two_bit_final_path)
-
+        # Create the folder tracks into the specie folder
+        tracksFolderPath = os.path.join(mySpecieFolderPath, "raw")
+        if not os.path.exists(tracksFolderPath):
+            os.makedirs(tracksFolderPath)
+        self.myTracksFolderPath = tracksFolderPath
+        myBinaryFolderPath = os.path.join(mySpecieFolderPath, 'bbi')
+        if not os.path.exists(myBinaryFolderPath):
+            os.makedirs(myBinaryFolderPath)
+        self.myBinaryFolderPath = myBinaryFolderPath
-
+        return myHubPath
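For orientation, the reworked TrackHub above is driven roughly as follows. This is a minimal sketch, not the tool's actual entry point: the FastaInput wrapper, file names, host, and credentials are assumptions, while the constructor signature, the trackDbObject keys read by addTrack(), and terminate(debug=...) come straight from the diff. Building the hub runs faToTwoBit/twoBitInfo under the hood, so the UCSC binaries must be on PATH.

    from TrackHub import TrackHub
    from apollo.ApolloUser import ApolloUser

    class FastaInput(object):
        # Stand-in for the Galaxy dataset wrapper; TrackHub only reads
        # .false_path (path on disk) and .assembly_id (genome name).
        def __init__(self, path, assembly_id):
            self.false_path = path
            self.assembly_id = assembly_id

    fasta = FastaInput('dbia3.fasta', 'dbia3')   # hypothetical assembly
    user = ApolloUser('student@example.org', 'First', 'Last', 'secret')
    hub = TrackHub(fasta, user, 'output.html', 'extra_files',
                   '.', 'HTMLFeatures',
                   'http://apollo.example.org:8080/apollo')   # hypothetical host

    # addTrack() dispatches on the dataType/trackType keys of a trackDb-style dict.
    hub.addTrack({'dataType': 'gff', 'trackType': 'HTMLFeatures',
                  'trackLabel': 'genes', 'trackDataURL': 'raw/genes.gff3',
                  'options': None})
    hub.terminate(debug=False)   # index names, drop raw/, load the hub into Apollo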
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/apollo/ApolloInstance.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,39 @@
+#!/usr/bin/env python
+import json
+import logging
+from util import subtools
+
+class ApolloInstance(object):
+    def __init__(self, apollo_host):
+        self.apollo_host = apollo_host
+        self.logger = logging.getLogger(__name__)
+
+    def getHost(self):
+        return self.apollo_host
+
+    def createApolloUser(self, apollo_user, admin=None):
+        p = subtools.arrow_create_user(apollo_user.user_email, apollo_user.firstname, apollo_user.lastname, apollo_user.password, admin)
+        user_info = json.loads(p)
+        user_id = user_info.get('userId')
+        if not user_id:
+            self.logger.debug("Cannot create new user: %s; The user may already exist", apollo_user.user_email)
+            user_id = subtools.arrow_get_users(apollo_user.user_email)
+        self.logger.debug("Got user_id for new or existing user: user_id = %s", str(user_id))
+        return user_id
+
+    def grantPermission(self, user_id, organism_id, **user_permissions):
+        subtools.arrow_update_organism_permissions(user_id, organism_id, **user_permissions)
+        self.logger.debug("Grant user %s permissions to organism %s, permissions = %s", str(user_id), str(organism_id), ','.join(user_permissions))
+
+    def addOrganism(self, organism_name, organism_dir):
+        p = subtools.arrow_add_organism(organism_name, organism_dir)
+        organism = json.loads(p)
+        organism_id = organism['id']
+        self.logger.debug("Added new organism to Apollo instance, %s", p)
+        return organism_id
+
+    def loadHubToApollo(self, apollo_user, organism_name, organism_dir, admin_user=False, **user_permissions):
+        user_id = self.createApolloUser(apollo_user, admin_user)
+        organism_id = self.addOrganism(organism_name, organism_dir)
+        self.grantPermission(user_id, organism_id, **user_permissions)
+        self.logger.debug("Successfully load the hub to Apollo")
\ No newline at end of file
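loadHubToApollo() above chains the three arrow-backed calls; used standalone the flow looks roughly like this (host, organism name, and directory are made up). One thing worth noting: TrackHub._makeArchive() invokes it with admin=True, which the signature above collects into **user_permissions rather than into admin_user, since the keyword names differ.

    from apollo.ApolloInstance import ApolloInstance
    from apollo.ApolloUser import ApolloUser

    apollo = ApolloInstance('http://apollo.example.org:8080/apollo')  # hypothetical host
    user = ApolloUser('student@example.org', 'First', 'Last', 'secret')

    # Create (or look up) the Apollo user, register the JBrowse directory
    # as an organism, then grant the user permissions on that organism.
    apollo.loadHubToApollo(user, 'dbia3', 'myHub/dbia3', admin_user=True)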
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/apollo/ApolloUser.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,10 @@
+#!/usr/bin/python
+
+import os
+
+class ApolloUser(object):
+    def __init__(self, user_email, firstname, lastname, password):
+        self.user_email = user_email
+        self.firstname = firstname
+        self.lastname = lastname
+        self.password = password
--- a/bedToGff3.py Wed Jul 12 12:55:27 2017 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,139 +0,0 @@
-#!/usr/bin/env python
-
-'''
-Convert BED format to gff3
-reference for gff3: https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
-'''
-import os
-from collections import OrderedDict
-import utils
-
-class bedToGff3():
-    def __init__(self, inputBedFile, chrom_sizes, bed_type, output):
-        self.input = inputBedFile
-        #file_dir = os.path.basename(inputBedFile)
-        #print file_dir + "\n\n"
-        self.output = output
-        self.chrom_sizes = chrom_sizes
-        self.type = bed_type
-        if self.type == "trfbig":
-            self.trfbig_to_gff3()
-        if self.type == "regtools":
-            self.splicejunctions_to_gff3()
-        if self.type == "blat":
-            self.bigpsl_to_gff3()
-
-    def trfbig_to_gff3(self):
-        gff3 = open(self.output, 'w')
-        gff3.write("##gff-version 3\n")
-        sizes_dict = utils.sequence_region(self.chrom_sizes)
-        seq_regions = dict()
-        with open(self.input, 'r') as bed:
-            for line in bed:
-                field = OrderedDict()
-                attribute = OrderedDict()
-                li = line.rstrip().split("\t")
-                field['seqid'] = li[0]
-                if field['seqid'] not in seq_regions:
-                    end_region = sizes_dict[field['seqid']]
-                    gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n')
-                    seq_regions[field['seqid']] = end_region
-                field['source'] = li[3]
-                field['type'] = 'tandem_repeat'
-                # The first base in a chromosome is numbered 0 in BED format
-                field['start'] = str(int(li[1]) + 1)
-                field['end'] = li[2]
-                field['score'] = li[9]
-                field['strand'] = '+'
-                field['phase'] = '.'
-                attribute['length of repeat unit'] = li[4]
-                attribute['mean number of copies of repeat'] = li[5]
-                attribute['length of consensus sequence'] = li[6]
-                attribute['percentage match'] = li[7]
-                attribute['percentage indel'] = li[8]
-                attribute['percent of a\'s in repeat unit'] = li[10]
-                attribute['percent of c\'s in repeat unit'] = li[11]
-                attribute['percent of g\'s in repeat unit'] = li[12]
-                attribute['percent of t\'s in repeat unit'] = li[13]
-                attribute['entropy'] = li[14]
-                attribute['sequence of repeat unit element'] = li[15]
-                utils.write_features(field, attribute, gff3)
-        gff3.close()
-
-
-    def splicejunctions_to_gff3(self):
-        gff3 = open(self.output, 'w')
-        gff3.write("##gff-version 3\n")
-        sizes_dict = utils.sequence_region(self.chrom_sizes)
-        seq_regions = dict()
-        with open(self.input, 'r') as bed:
-            for line in bed:
-                field = OrderedDict()
-                attribute = OrderedDict()
-                li = line.rstrip().split("\t")
-                field['seqid'] = li[0]
-                if field['seqid'] not in seq_regions:
-                    end_region = sizes_dict[field['seqid']]
-                    gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n')
-                    seq_regions[field['seqid']] = end_region
-                field['source'] = li[3]
-                field['type'] = 'junction'
-                # The first base in a chromosome is numbered 0 in BED format
-                field['start'] = int(li[1]) + 1
-                field['end'] = li[2]
-                field['score'] = li[12]
-                field['strand'] = li[5]
-                field['phase'] = '.'
-                attribute['ID'] = li[0] + '_' + li[3]
-                attribute['Name'] = li[3]
-                attribute['blockcount'] = li[9]
-                attribute['blocksizes'] = li[10]
-                attribute['chromstarts'] = li[11]
-                utils.write_features(field, attribute, gff3)
-                utils.child_blocks(field, attribute, gff3, 'exon_junction')
-        gff3.close()
-
-    def bigpsl_to_gff3(self):
-        gff3 = open(self.output, 'w')
-        gff3.write("##gff-version 3\n")
-        sizes_dict = utils.sequence_region(self.chrom_sizes)
-        seq_regions = dict()
-        with open(self.input, 'r') as bed:
-            for line in bed:
-                field = OrderedDict()
-                attribute = OrderedDict()
-                li = line.rstrip().split("\t")
-                field['seqid'] = li[0]
-                if field['seqid'] not in seq_regions:
-                    end_region = sizes_dict[field['seqid']]
-                    gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n')
-                    seq_regions[field['seqid']] = end_region
-                field['source'] = 'UCSC BLAT alignment tool'
-                field['type'] = 'match'
-                # The first base in a chromosome is numbered 0 in BED format
-                field['start'] = str(int(li[1]) + 1)
-                field['end'] = li[2]
-                field['score'] = li[4]
-                field['strand'] = li[5]
-                field['phase'] = '.'
-                attribute['ID'] = li[0] + '_' + li[3]
-                attribute['Name'] = li[3]
-                attribute['blockcount'] = li[9]
-                attribute['blocksizes'] = li[10]
-                attribute['chromstarts'] = li[11]
-                attribute['ochrom_start'] = li[12]
-                attribute['ochrom_end'] = li[13]
-                attribute['ochrom_strand'] = li[14]
-                attribute['ochrom_size'] = li[15]
-                attribute['ochrom_starts'] = li[16]
-                attribute['sequence on other chromosome'] = li[17]
-                attribute['cds in ncbi format'] = li[18]
-                attribute['size of target chromosome'] = li[19]
-                attribute['number of bases matched'] = li[20]
-                attribute['number of bases that don\'t match'] = li[21]
-                attribute['number of bases that match but are part of repeats'] = li[22]
-                attribute['number of \'N\' bases'] = li[23]
-                utils.write_features(field, attribute, gff3)
-                utils.child_blocks(field, attribute, gff3, 'match_part')
-        gff3.close()
-
\ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bigGenePred.as Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,25 @@
+table bigGenePred
+"bigGenePred gene models"
+    (
+    string chrom;        "Reference sequence chromosome or scaffold"
+    uint   chromStart;   "Start position in chromosome"
+    uint   chromEnd;     "End position in chromosome"
+    string name;         "Name or ID of item, ideally both human readable and unique"
+    uint score;          "Score (0-1000)"
+    char[1] strand;      "+ or - for strand"
+    uint thickStart;     "Start of where display should be thick (start codon)"
+    uint thickEnd;       "End of where display should be thick (stop codon)"
+    uint reserved;       "RGB value (use R,G,B string in input file)"
+    int blockCount;      "Number of blocks"
+    int[blockCount] blockSizes;  "Comma separated list of block sizes"
+    int[blockCount] chromStarts; "Start positions relative to chromStart"
+    string name2;        "Alternative/human readable name"
+    string cdsStartStat; "Status of CDS start annotation (none, unknown, incomplete, or complete)"
+    string cdsEndStat;   "Status of CDS end annotation (none, unknown, incomplete, or complete)"
+    int[blockCount] exonFrames; "Exon frame {0,1,2}, or -1 if no frame for exon"
+    string type;         "Transcript type"
+    string geneName;     "Primary identifier for gene"
+    string geneName2;    "Alternative/human readable gene name"
+    string geneType;     "Gene type"
+    )
+
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bigPsl.as Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,33 @@
+table bigPsl
+"bigPsl pairwise alignment"
+    (
+    string chrom;        "Reference sequence chromosome or scaffold"
+    uint   chromStart;   "Start position in chromosome"
+    uint   chromEnd;     "End position in chromosome"
+    string name;         "Name or ID of item, ideally both human readable and unique"
+    uint score;          "Score (0-1000)"
+    char[1] strand;      "+ or - for strand"
+    uint thickStart;     "Start of where display should be thick (start codon)"
+    uint thickEnd;       "End of where display should be thick (stop codon)"
+    uint reserved;       "RGB value (use R,G,B string in input file)"
+    int blockCount;      "Number of blocks"
+    int[blockCount] blockSizes;  "Comma separated list of block sizes"
+    int[blockCount] chromStarts; "Start positions relative to chromStart"
+
+    uint oChromStart;    "Start position in other chromosome"
+    uint oChromEnd;      "End position in other chromosome"
+    char[1] oStrand;     "+ or - for other strand"
+    uint oChromSize;     "Size of other chromosome."
+    int[blockCount] oChromStarts; "Start positions relative to oChromStart"
+
+    lstring oSequence;   "Sequence on other chrom (or edit list, or empty)"
+    string oCDS;         "CDS in NCBI format"
+
+    uint chromSize;      "Size of target chromosome"
+
+    uint match;          "Number of bases matched."
+    uint misMatch;       "Number of bases that don't match"
+    uint repMatch;       "Number of bases that match but are part of repeats"
+    uint nCount;         "Number of 'N' bases"
+    )
+
--- a/blastxmlToGff3.py Wed Jul 12 12:55:27 2017 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,159 +0,0 @@
-#!/usr/bin/env python
-
-
-from Bio.Blast import NCBIXML
-from collections import OrderedDict
-import utils
-
-
-def align2cigar(hsp_query, hsp_reference):
-    """
-    Build CIGAR representation from an hsp_query
-    input:
-        hsp_query
-        hsp_sbjct
-    output:
-        CIGAR string
-    """
-    query = hsp_query
-    ref = hsp_reference
-    # preType, curType:
-    #  'M' represents match,
-    #  'I' represents insert a gap into the reference sequence,
-    #  'D' represents insert a gap into the target (delete from reference)
-    # some ideas of this algin2cigar function are coming from
-    # https://gist.github.com/ozagordi/099bdb796507da8d9426
-    prevType = 'M'
-    curType = 'M'
-    count = 0
-    cigar = []
-    num = len(query)
-    for i in range(num):
-        if query[i] == '-':
-            curType = 'D'
-        elif ref[i] == '-':
-            curType = 'I'
-        else:
-            curType = 'M'
-        if curType == prevType:
-            count += 1
-        else:
-            cigar.append('%s%d' % (prevType, count))
-            prevType = curType
-            count = 1
-    cigar.append('%s%d' % (curType, count))
-    return ' '.join(cigar)
-
-def gff3_writer(blast_records, gff3_file):
-    gff3 = open(gff3_file, 'a')
-    gff3.write("##gff-version 3\n")
-    seq_regions = dict()
-    for blast_record in blast_records:
-        query_name = blast_record.query.split(" ")[0]
-        source = blast_record.application
-        method = blast_record.matrix
-        for alignment in blast_record.alignments:
-            group = {
-                "parent_field" : OrderedDict(),
-                "parent_attribute" : OrderedDict(),
-                "alignments" : []
-            }
-            title = alignment.title.split(" ")
-            contig_name = title[len(title) - 1]
-            length = alignment.length
-            group['parent_field']['seqid'] = contig_name
-            group['parent_field']['source'] = source
-            group['parent_field']['type'] = 'match'
-            group['parent_attribute']['ID'] = contig_name + '_' + query_name
-            group['parent_attribute']['method'] = method
-            group['parent_attribute']['length'] = length
-            if contig_name not in seq_regions:
-                gff3.write("##sequence-region " + contig_name + ' 1 ' + str(length) + '\n')
-                seq_regions[contig_name] = length
-            match_num = 0
-            coords = [length, 0]
-            for hsp in alignment.hsps:
-                hsp_align = {}
-                field = OrderedDict()
-                attribute = OrderedDict()
-                ref = hsp.sbjct
-                query = hsp.query
-                field['seqid'] = contig_name
-                field['source'] = source
-                field['type'] = 'match_part'
-
-                field['start'] = hsp.sbjct_start
-                if field['start'] < coords[0]:
-                    coords[0] = field['start']
-                ref_length = len(ref.replace('-', ''))
-                # if run tblastn, the actual length of reference should be multiplied by 3
-                if source.lower() == "tblastn":
-                    ref_length *= 3
-                field['end'] = field['start'] + ref_length - 1
-                if field['end'] > coords[1]:
-                    coords[1] = field['end']
-                field['score'] = hsp.score
-                #decide if the alignment in the same strand or reverse strand
-                #reading frame
-                # (+, +), (0, 0), (-, -) => +
-                # (+, -), (-, +) => -
-                if hsp.frame[1] * hsp.frame[0] > 0:
-                    field['strand'] = '+'
-                elif hsp.frame[1] * hsp.frame[0] < 0:
-                    field['strand'] = '-'
-                else:
-                    if hsp.frame[0] + hsp.frame[1] >= 0:
-                        field['strand'] = '+'
-                    else:
-                        field['strand'] = '-'
-                field['phase'] = '.'
-
-                target_start = hsp.query_start
-                target_len = len(query.replace('-', ''))
-                # if run blastx, the actual length of query should be multiplied by 3
-                if source.lower() == "blastx":
-                    target_len *= 3
-                target_end = target_start + target_len -1
-                attribute['ID'] = group['parent_attribute']['ID'] + '_match_' + str(match_num)
-                attribute['Parent'] = group['parent_attribute']['ID']
-                attribute['Target'] = query_name + " " + str(target_start) + " " + str(target_end)
-                attribute['Gap'] = align2cigar(query, ref)
-                #store the query sequence and match string in the file in order to display alignment with BlastAlignment plugin
-                attribute['subject'] = hsp.sbjct
-                attribute['query'] = hsp.query
-                attribute['match'] = hsp.match
-                attribute['gaps'] = attribute['match'].count(' ')
-                similar = attribute['match'].count('+')
-                attribute['identities'] = len(attribute['match']) - similar - attribute['gaps']
-                attribute['positives'] = attribute['identities'] + similar
-                attribute['expect'] = hsp.expect
-                # show reading frame attribute only if the frame is not (0, 0)
-                attribute['frame'] = hsp.frame[1]
-                match_num += 1
-                hsp_align['field'] = field
-                hsp_align['attribute'] = attribute
-                group['alignments'].append(hsp_align)
-            group['parent_field']['start'] = coords[0]
-            group['parent_field']['end'] = coords[1]
-            group['parent_field']['score'] = group['parent_field']['strand'] = group['parent_field']['phase'] = '.'
-            group['parent_attribute']['match_num'] = match_num
-            group['alignments'].sort(key=lambda x: (x['field']['start'], x['field']['end']))
-            utils.write_features(group['parent_field'], group['parent_attribute'], gff3)
-            prev_end = -1
-            for align in group['alignments']:
-                overlap = ''
-                if align['field']['start'] <= prev_end:
-                    overlap += str(align['field']['start']) + ',' + str(prev_end)
-                prev_end = align['field']['end']
-                align['attribute']['overlap'] = overlap
-                utils.write_features(align['field'], align['attribute'], gff3)
-    gff3.close()
-
-def blastxml2gff3(xml_file, gff3_file):
-    result_handle = open(xml_file)
-    blast_records = NCBIXML.parse(result_handle)
-    gff3_writer(blast_records, gff3_file)
-
-if __name__ == "__main__":
-    blastxml2gff3("../dbia3/raw/tblastn_dmel-hits-translation-r6.11.fa_vs_nucleotide_BLAST_database_from_data_3.blastxml", "gff3.txt")
-
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/Datatype.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,122 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
+"""
+Super Class of the managed datatype
+"""
+
+import os
+import tempfile
+import collections
+from util import subtools
+import logging
+import abc
+from abc import ABCMeta
+from tracks.HTMLFeatures import HTMLFeatures
+from tracks.CanvasFeatures import CanvasFeatures
+from tracks.BamFeatures import BamFeatures
+from tracks.BigwigFeatures import BigwigFeatures
+from datatypes.validators.DataValidation import DataValidation
+
+
+class Datatype(object):
+    __metaclass__ = ABCMeta
+
+    chromSizesFile = None
+    input_fasta_file = None
+    extra_files_path = None
+    tool_directory = None
+
+    mySpecieFolderPath = None
+    myTrackFolderPath = None
+    myBinaryFolderPath = None
+
+    trackType = None
+
+    def __init__(self):
+        not_init_message = "The {0} is not initialized." \
+                           "Did you use pre_init static method first?"
+        if Datatype.input_fasta_file is None:
+            raise TypeError(not_init_message.format('reference genome'))
+        if Datatype.extra_files_path is None:
+            raise TypeError(not_init_message.format('track Hub path'))
+        if Datatype.tool_directory is None:
+            raise TypeError(not_init_message.format('tool directory'))
+        self.inputFile = None
+        self.trackType = None
+        self.dataType = None
+        self.trackFileType = None
+        self.track = None
+        self.trackSettings = dict()
+        self.extraSettings = collections.OrderedDict()
+
+
+    @staticmethod
+    def pre_init(reference_genome, chrom_sizes_file,
+                 extra_files_path, tool_directory, specie_folder, tracks_folder, binary_folder, track_type):
+        Datatype.extra_files_path = extra_files_path
+        Datatype.tool_directory = tool_directory
+
+        # TODO: All this should be in TrackHub and not in Datatype
+        Datatype.mySpecieFolderPath = specie_folder
+        Datatype.myTrackFolderPath = tracks_folder
+        Datatype.myBinaryFolderPath = binary_folder
+
+        Datatype.input_fasta_file = reference_genome
+
+        # 2bit file creation from input fasta
+        #Datatype.twoBitFile = two_bit_path
+        Datatype.chromSizesFile = chrom_sizes_file
+        Datatype.trackType = track_type
+
+
+    def generateCustomTrack(self):
+        self.validateData()
+        self.initSettings()
+        #Create the track file
+        self.createTrack()
+        # Create the TrackDb Object
+        self.createTrackDb()
+        logging.debug("- %s %s created", self.dataType, self.trackName)
+
+
+    @abc.abstractmethod
+    def validateData(self):
+        """validate the input data with DataValidation"""
+
+    def initSettings(self):
+        #Initialize required fields: trackName, longLabel, shortLable
+        self.trackName = self.trackSettings["name"]
+        self.trackDataURL = os.path.join(self.myTrackFolderPath, self.trackName)
+        if self.trackSettings["long_label"]:
+            self.trackLabel = self.trackSettings["long_label"]
+        else:
+            self.trackLabel = self.trackName
+        if "trackType" in self.trackSettings and self.trackSettings["trackType"]:
+            self.trackType = self.trackSettings["trackType"]
+        if self.trackSettings["group_name"]:
+            self.extraSettings["category"] = self.trackSettings["group_name"]
+        if "track_color" in self.trackSettings and self.trackSettings["track_color"]:
+            self.extraSettings["color"] = self.trackSettings["track_color"]
+
+
+    @abc.abstractmethod
+    def createTrack(self):
+        """Create the final track file"""
+
+    def createTrackDb(self):
+        if self.trackType == 'HTMLFeatures':
+            self.track = HTMLFeatures(self.trackName, self.trackLabel, self.trackDataURL, self.trackType, self.dataType, self.extraSettings)
+        elif self.trackType == "CanvasFeatures":
+            self.track = CanvasFeatures(self.trackName, self.trackLabel, self.trackDataURL, self.trackType, self.dataType, self.extraSettings)
+        elif self.trackType == "bam":
+            self.track = BamFeatures(self.trackName, self.trackLabel, self.trackDataURL, self.trackType, self.dataType, self.extraSettings)
+        elif self.trackType == "bigwig":
+            self.track = BigwigFeatures(self.trackName, self.trackLabel, self.trackDataURL, self.trackType, self.dataType, self.extraSettings)
+        else:
+            logging.error("Cannot createTrackDb, because trackType is not defined or invalid! trackType = %s", self.trackType)
+        self.track.createTrackDb()
+
+        #self.track = TrackDb(self.trackName, self.trackLabel, self.trackDataURL, self.trackType, self.dataType, self.extraSettings)
+
+
\ No newline at end of file
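Because Datatype keeps the run-wide context in class attributes, pre_init() must run once before any subclass is instantiated; __init__ raises TypeError otherwise. A sketch of the expected call order, with placeholder paths (in the real run TrackHub.__init__ supplies all of these):

    from datatypes.Datatype import Datatype
    from datatypes.binary.Bam import Bam

    # One-time setup. reference_genome and chrom_sizes_file stand for the
    # objects TrackHub prepares (chrom_sizes_file needs a .name attribute,
    # e.g. a tempfile.NamedTemporaryFile).
    Datatype.pre_init(reference_genome, chrom_sizes_file,
                      'extra_files', '.',
                      'myHub/dbia3', 'myHub/dbia3/raw', 'myHub/dbia3/bbi',
                      'HTMLFeatures')

    # Only now is constructing a concrete datatype safe.
    track = Bam('input.bam', {'name': 'alignments', 'long_label': 'RNA-seq alignments',
                              'group_name': 'Default group'})
    track.generateCustomTrack()   # validateData -> initSettings -> createTrack -> createTrackDb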
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/binary/Bam.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,51 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
+"""
+Class to handle Bam files to UCSC TrackHub
+"""
+
+import logging
+import os
+import shutil
+
+from Binary import Binary
+from datatypes.validators.DataValidation import DataValidation
+from util import subtools
+
+
+
+
+class Bam(Binary):
+    def __init__(self, input_bam_false_path, data_bam):
+        super(Bam, self).__init__()
+        self.inputFile = input_bam_false_path
+        self.trackSettings = data_bam
+        self.dataType = "bam"
+        self.trackType = "bam"
+
+
+    def validateData(self):
+        self.validator = DataValidation(self.inputFile, self.dataType, self.chromSizesFile.name)
+        self.validator.validate()
+
+    def createTrack(self):
+        #shutil.copy(self.inputFile, self.trackDataURL)
+        extension = os.path.splitext(self.trackName)[1]
+        if extension != '.bam':
+            self.trackName = self.trackName + '.bam'
+            self.trackDataURL = os.path.join(self.myBinaryFolderPath, self.trackName)
+        #self.trackDataURL = os.path.join(self.myTrackFolderPath, self.trackName)
+        shutil.copyfile(self.inputFile, self.trackDataURL)
+        bam_index = subtools.createBamIndex(self.inputFile)
+        indexName = os.path.basename(bam_index)
+        trackIndexURL = os.path.join(self.myBinaryFolderPath, indexName)
+        #trackIndexURL = os.path.join(self.myTrackFolderPath, indexName)
+        shutil.copyfile(bam_index, trackIndexURL)
+        self.extraSettings['index'] = indexName
+
+
+
+
+
+
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/binary/BigWig.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,31 @@
+#!/usr/bin/python
+
+import os
+import shutil
+from subprocess import Popen, PIPE
+import re
+
+# Internal dependencies
+from Binary import Binary
+from datatypes.validators.DataValidation import DataValidation
+
+
+
+class BigWig(Binary):
+    def __init__(self, input_bigwig_path, data_bigwig):
+        super(BigWig, self).__init__()
+        self.inputFile = input_bigwig_path
+        self.trackSettings = data_bigwig
+        self.dataType = "bigWig"
+        self.trackType = "bigwig"
+
+    def initSettings(self):
+        super(BigWig, self).initSettings()
+        if 'style' in self.trackSettings:
+            self.extraSettings['style'] = self.trackSettings['style']
+
+    def validateData(self):
+        self.validator = DataValidation(self.inputFile, self.dataType, self.chromSizesFile.name)
+        self.validator.validate()
+
+
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/binary/Binary.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,38 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
+"""
+Super Class of the managed datatype
+"""
+
+import os
+import tempfile
+import collections
+import shutil
+import util
+from TrackDb import TrackDb
+from datatypes.Datatype import Datatype
+
+
+class Binary(Datatype):
+
+    def __init__(self):
+        super(Binary, self).__init__()
+
+
+    def initSettings(self):
+        super(Binary, self).initSettings()
+        self.trackDataURL = os.path.join(self.myBinaryFolderPath, self.trackName)
+
+
+    def createTrack(self):
+        shutil.copy(self.inputFile, self.trackDataURL)
+
+
+
+
+
+
+
+
+
\ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/converters/BedConversion.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,154 @@
+#!/usr/bin/env python
+
+'''
+Convert BED format to gff3
+reference for gff3: https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
+'''
+import os
+import tempfile
+from collections import OrderedDict
+
+from util import subtools
+from DataConversion import DataConversion
+
+class BedConversion(DataConversion):
+    def __init__(self, inputBedFile, outputFile, chromSizesFile, bedType, trackType, options=None):
+        super(BedConversion, self).__init__(inputBedFile, outputFile, chromSizesFile, bedType, options)
+
+
+    def convertFormats(self):
+        self.dataToJson()
+
+
+    def dataToJson(self):
+        if self.dataType != 'bed':
+            self.convertToGff3()
+            self.inputFile = self.gff3_file
+            self.dataType == 'gff'
+        subtools.flatfile_to_json(self.inputFile, self.dataType, self.trackType, self.trackLabel, self.outputFile, self.options)
+
+    def convertToGff3(self):
+        self.gff3_file = tempfile.NamedTemporaryFile(suffix=".gff3")
+        if self.dataType == "trfbig":
+            self.trfbig_to_gff3()
+        elif self.dataType == "regtools":
+            self.splicejunctions_to_gff3()
+        elif self.dataType == "blat":
+            self.bigpsl_to_gff3()
+        else:
+            raise ValueError("dataType %s is not support for converting to GFF3", self.dataType)
+
+    def trfbig_to_gff3(self):
+        gff3 = open(self.gff3_file.name, 'w')
+        gff3.write("##gff-version 3\n")
+        sizes_dict = subtools.sequence_region(self.chromSizesFile)
+        seq_regions = dict()
+        with open(self.inputFile, 'r') as bed:
+            for line in bed:
+                field = OrderedDict()
+                attribute = OrderedDict()
+                li = line.rstrip().split("\t")
+                field['seqid'] = li[0]
+                if field['seqid'] not in seq_regions:
+                    end_region = sizes_dict[field['seqid']]
+                    gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n')
+                    seq_regions[field['seqid']] = end_region
+                field['source'] = li[3]
+                field['type'] = 'tandem_repeat'
+                # The first base in a chromosome is numbered 0 in BED format
+                field['start'] = str(int(li[1]) + 1)
+                field['end'] = li[2]
+                field['score'] = li[9]
+                field['strand'] = '+'
+                field['phase'] = '.'
+                attribute['length of repeat unit'] = li[4]
+                attribute['mean number of copies of repeat'] = li[5]
+                attribute['length of consensus sequence'] = li[6]
+                attribute['percentage match'] = li[7]
+                attribute['percentage indel'] = li[8]
+                attribute['percent of a\'s in repeat unit'] = li[10]
+                attribute['percent of c\'s in repeat unit'] = li[11]
+                attribute['percent of g\'s in repeat unit'] = li[12]
+                attribute['percent of t\'s in repeat unit'] = li[13]
+                attribute['entropy'] = li[14]
+                attribute['sequence of repeat unit element'] = li[15]
+                subtools.write_features(field, attribute, gff3)
+        gff3.close()
+
+
+    def splicejunctions_to_gff3(self):
+        gff3 = open(self.gff3_file.name, 'w')
+        gff3.write("##gff-version 3\n")
+        sizes_dict = subtools.sequence_region(self.chromSizesFile)
+        seq_regions = dict()
+        with open(self.inputFile, 'r') as bed:
+            for line in bed:
+                field = OrderedDict()
+                attribute = OrderedDict()
+                li = line.rstrip().split("\t")
+                field['seqid'] = li[0]
+                if field['seqid'] not in seq_regions:
+                    end_region = sizes_dict[field['seqid']]
+                    gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n')
+                    seq_regions[field['seqid']] = end_region
+                field['source'] = li[3]
+                field['type'] = 'junction'
+                # The first base in a chromosome is numbered 0 in BED format
+                field['start'] = int(li[1]) + 1
+                field['end'] = li[2]
+                field['score'] = li[12]
+                field['strand'] = li[5]
+                field['phase'] = '.'
+                attribute['ID'] = li[0] + '_' + li[3]
+                attribute['Name'] = li[3]
+                attribute['blockcount'] = li[9]
+                attribute['blocksizes'] = li[10]
+                attribute['chromstarts'] = li[11]
+                subtools.write_features(field, attribute, gff3)
+                subtools.child_blocks(field, attribute, gff3, 'exon_junction')
+        gff3.close()
+
+    def bigpsl_to_gff3(self):
+        gff3 = open(self.gff3_file.name, 'w')
+        gff3.write("##gff-version 3\n")
+        sizes_dict = subtools.sequence_region(self.chromSizesFile)
+        seq_regions = dict()
+        with open(self.inputFile, 'r') as bed:
+            for line in bed:
+                field = OrderedDict()
+                attribute = OrderedDict()
+                li = line.rstrip().split("\t")
+                field['seqid'] = li[0]
+                if field['seqid'] not in seq_regions:
+                    end_region = sizes_dict[field['seqid']]
+                    gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n')
+                    seq_regions[field['seqid']] = end_region
+                field['source'] = 'UCSC BLAT alignment tool'
+                field['type'] = 'match'
+                # The first base in a chromosome is numbered 0 in BED format
+                field['start'] = str(int(li[1]) + 1)
+                field['end'] = li[2]
+                field['score'] = li[4]
+                field['strand'] = li[5]
+                field['phase'] = '.'
+                attribute['ID'] = li[0] + '_' + li[3]
+                attribute['Name'] = li[3]
+                attribute['blockcount'] = li[9]
+                attribute['blocksizes'] = li[10]
+                attribute['chromstarts'] = li[11]
+                attribute['ochrom_start'] = li[12]
+                attribute['ochrom_end'] = li[13]
+                attribute['ochrom_strand'] = li[14]
+                attribute['ochrom_size'] = li[15]
+                attribute['ochrom_starts'] = li[16]
+                attribute['sequence on other chromosome'] = li[17]
+                attribute['cds in ncbi format'] = li[18]
+                attribute['size of target chromosome'] = li[19]
+                attribute['number of bases matched'] = li[20]
+                attribute['number of bases that don\'t match'] = li[21]
+                attribute['number of bases that match but are part of repeats'] = li[22]
+                attribute['number of \'N\' bases'] = li[23]
+                subtools.write_features(field, attribute, gff3)
+                subtools.child_blocks(field, attribute, gff3, 'match_part')
+        gff3.close()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/converters/DataConversion.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,51 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
+"""
+This class handles the subprocess calls of the different tools used
+in HubArchiveCreator
+"""
+
+import logging
+import os
+import subprocess
+import sys
+import string
+import tempfile
+
+from bedToGff3 import bedToGff3
+from blastxmlToGff3 import blastxmlToGff3
+from gtfToGff3 import gtfToGff3
+
+
+
+
+class DataConversion(object):
+    CONVERT_OPERATIONS = {("bed", "gff"): "bedtogff3",
+                          ("blastxml", "gff"): "blastxmltogff3",
+                          ("gtf", "gff"): "gtftogff3"}
+
+    def __init__(self, inputFile, outputFile, chromSizesFile, operateType, options=None):
+        if not operateType:
+            return
+        if not inputFile:
+            raise TypeError("the input file is not specified!\n")
+        self.inputFile = inputFile
+        self.chromSizesFile = chromSizesFile
+        self.outputFile = outputFile
+        self.operateType = operateType
+        self.options = options
+
+
+
+    def convertFormats(self):
+        """ Convert data into JBrowse track """
+        convertMethod = self.CONVERT_OPERATIONS[self.operateType]
+        if convertMethod == "bedtogff3":
+            bedToGff3(self.inputFile, self.chromSizesFile, self.outputFile, self.options)
+        elif convertMethod == "blastxmltogff3":
+            blastxmlToGff3(self.inputFile, self.outputFile)
+        elif convertMethod == "gtftogff3":
+            gtfToGff3(self.inputFile, self.outputFile, self.chromSizesFile)
+        else:
+            raise ValueError("the operation %s is not defined!\n", self.operateType)
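Conversions are dispatched on the (source, target) tuple keys of CONVERT_OPERATIONS above. A short usage sketch with hypothetical file names:

    from datatypes.converters.DataConversion import DataConversion

    converter = DataConversion('hits.blastxml', 'hits.gff3',
                               'dbia3.chrom.sizes', ('blastxml', 'gff'))
    converter.convertFormats()   # resolves to blastxmlToGff3('hits.blastxml', 'hits.gff3')

For BED inputs, the options argument carries the BED flavor ('trfbig', 'regtools', or 'blat'), which is how BedSimpleRepeats.createTrack() below invokes it.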
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/converters/bedToGff3.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,133 @@
+#!/usr/bin/env python
+
+'''
+Convert BED format to gff3
+reference for gff3: https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
+'''
+import os
+from collections import OrderedDict
+from util import subtools
+
+def bedToGff3(inputBedFile, chrom_sizes, output, bed_type):
+    if bed_type == "trfbig":
+        trfbig_to_gff3(inputBedFile, chrom_sizes, output)
+    if bed_type == "regtools":
+        splicejunctions_to_gff3(inputBedFile, chrom_sizes, output)
+    if bed_type == "blat":
+        bigpsl_to_gff3(inputBedFile, chrom_sizes, output)
+
+def trfbig_to_gff3(inputBedFile, chrom_sizes, output):
+    gff3 = open(output, 'w')
+    gff3.write("##gff-version 3\n")
+    sizes_dict = subtools.sequence_region(chrom_sizes)
+    seq_regions = dict()
+    with open(inputBedFile, 'r') as bed:
+        for line in bed:
+            field = OrderedDict()
+            attribute = OrderedDict()
+            li = line.rstrip().split("\t")
+            field['seqid'] = li[0]
+            if field['seqid'] not in seq_regions:
+                end_region = sizes_dict[field['seqid']]
+                gff3.write("##sequence-region " +
+                           field['seqid'] + ' 1 ' + str(end_region) + '\n')
+                seq_regions[field['seqid']] = end_region
+            field['source'] = li[3]
+            field['type'] = 'tandem_repeat'
+            # The first base in a chromosome is numbered 0 in BED format
+            field['start'] = str(int(li[1]) + 1)
+            field['end'] = li[2]
+            field['score'] = li[9]
+            field['strand'] = '+'
+            field['phase'] = '.'
+            attribute['length of repeat unit'] = li[4]
+            attribute['mean number of copies of repeat'] = li[5]
+            attribute['length of consensus sequence'] = li[6]
+            attribute['percentage match'] = li[7]
+            attribute['percentage indel'] = li[8]
+            attribute['percent of a\'s in repeat unit'] = li[10]
+            attribute['percent of c\'s in repeat unit'] = li[11]
+            attribute['percent of g\'s in repeat unit'] = li[12]
+            attribute['percent of t\'s in repeat unit'] = li[13]
+            attribute['entropy'] = li[14]
+            attribute['sequence of repeat unit element'] = li[15]
+            subtools.write_features(field, attribute, gff3)
+    gff3.close()
+
+def splicejunctions_to_gff3(inputBedFile, chrom_sizes, output):
+    gff3 = open(output, 'w')
+    gff3.write("##gff-version 3\n")
+    sizes_dict = subtools.sequence_region(chrom_sizes)
+    seq_regions = dict()
+    with open(inputBedFile, 'r') as bed:
+        for line in bed:
+            field = OrderedDict()
+            attribute = OrderedDict()
+            li = line.rstrip().split("\t")
+            field['seqid'] = li[0]
+            if field['seqid'] not in seq_regions:
+                end_region = sizes_dict[field['seqid']]
+                gff3.write("##sequence-region " +
+                           field['seqid'] + ' 1 ' + str(end_region) + '\n')
+                seq_regions[field['seqid']] = end_region
+            field['source'] = li[3]
+            field['type'] = 'junction'
+            # The first base in a chromosome is numbered 0 in BED format
+            field['start'] = int(li[1]) + 1
+            field['end'] = li[2]
+            field['score'] = li[12]
+            field['strand'] = li[5]
+            field['phase'] = '.'
+            attribute['ID'] = li[0] + '_' + li[3]
+            attribute['Name'] = li[3]
+            attribute['blockcount'] = li[9]
+            attribute['blocksizes'] = li[10]
+            attribute['chromstarts'] = li[11]
+            subtools.write_features(field, attribute, gff3)
+            subtools.child_blocks(field, attribute, gff3, 'exon_junction')
+    gff3.close()
+
+def bigpsl_to_gff3(inputBedFile, chrom_sizes, output):
+    gff3 = open(output, 'w')
+    gff3.write("##gff-version 3\n")
+    sizes_dict = subtools.sequence_region(chrom_sizes)
+    seq_regions = dict()
+    with open(inputBedFile, 'r') as bed:
+        for line in bed:
+            field = OrderedDict()
+            attribute = OrderedDict()
+            li = line.rstrip().split("\t")
+            field['seqid'] = li[0]
+            if field['seqid'] not in seq_regions:
+                end_region = sizes_dict[field['seqid']]
+                gff3.write("##sequence-region " +
+                           field['seqid'] + ' 1 ' + str(end_region) + '\n')
+                seq_regions[field['seqid']] = end_region
+            field['source'] = 'UCSC BLAT alignment tool'
+            field['type'] = 'match'
+            # The first base in a chromosome is numbered 0 in BED format
+            field['start'] = str(int(li[1]) + 1)
+            field['end'] = li[2]
+            field['score'] = li[4]
+            field['strand'] = li[5]
+            field['phase'] = '.'
+            attribute['ID'] = li[0] + '_' + li[3]
+            attribute['Name'] = li[3]
+            attribute['blockcount'] = li[9]
+            attribute['blocksizes'] = li[10]
+            attribute['chromstarts'] = li[11]
+            attribute['ochrom_start'] = li[12]
+            attribute['ochrom_end'] = li[13]
+            attribute['ochrom_strand'] = li[14]
+            attribute['ochrom_size'] = li[15]
+            attribute['ochrom_starts'] = li[16]
+            attribute['sequence on other chromosome'] = li[17]
+            attribute['cds in ncbi format'] = li[18]
+            attribute['size of target chromosome'] = li[19]
+            attribute['number of bases matched'] = li[20]
+            attribute['number of bases that don\'t match'] = li[21]
+            attribute['number of bases that match but are part of repeats'] = li[22]
+            attribute['number of \'N\' bases'] = li[23]
+            subtools.write_features(field, attribute, gff3)
+            subtools.child_blocks(field, attribute, gff3, 'match_part')
+    gff3.close()
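All three converters above apply the same coordinate shift: BED intervals are 0-based and half-open, while GFF3 features are 1-based and inclusive, so only the start moves. A worked example with a made-up record:

    # BED interval (chr2L, 11, 25) covers the 12th through 25th bases.
    bed_start, bed_end = 11, 25
    gff3_start = bed_start + 1    # mirrors field['start'] = str(int(li[1]) + 1)
    gff3_end = bed_end            # the end coordinate is shared by both conventions
    assert (gff3_start, gff3_end) == (12, 25)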
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/converters/blastxmlToGff3.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,158 @@
+#!/usr/bin/env python
+
+
+from Bio.Blast import NCBIXML
+from collections import OrderedDict
+import utils
+
+
+def align2cigar(hsp_query, hsp_reference):
+    """
+    Build CIGAR representation from an hsp_query
+    input:
+        hsp_query
+        hsp_sbjct
+    output:
+        CIGAR string
+    """
+    query = hsp_query
+    ref = hsp_reference
+    # preType, curType:
+    #  'M' represents match,
+    #  'I' represents insert a gap into the reference sequence,
+    #  'D' represents insert a gap into the target (delete from reference)
+    # some ideas of this algin2cigar function are coming from
+    # https://gist.github.com/ozagordi/099bdb796507da8d9426
+    prevType = 'M'
+    curType = 'M'
+    count = 0
+    cigar = []
+    num = len(query)
+    for i in range(num):
+        if query[i] == '-':
+            curType = 'D'
+        elif ref[i] == '-':
+            curType = 'I'
+        else:
+            curType = 'M'
+        if curType == prevType:
+            count += 1
+        else:
+            cigar.append('%s%d' % (prevType, count))
+            prevType = curType
+            count = 1
+    cigar.append('%s%d' % (curType, count))
+    return ' '.join(cigar)
+
+def gff3_writer(blast_records, gff3_file):
+    gff3 = open(gff3_file, 'a')
+    gff3.write("##gff-version 3\n")
+    seq_regions = dict()
+    for blast_record in blast_records:
+        query_name = blast_record.query.split(" ")[0]
+        source = blast_record.application
+        method = blast_record.matrix
+        for alignment in blast_record.alignments:
+            group = {
+                "parent_field" : OrderedDict(),
+                "parent_attribute" : OrderedDict(),
+                "alignments" : []
+            }
+            title = alignment.title.split(" ")
+            contig_name = title[len(title) - 1]
+            length = alignment.length
+            group['parent_field']['seqid'] = contig_name
+            group['parent_field']['source'] = source
+            group['parent_field']['type'] = 'match'
+            group['parent_attribute']['ID'] = contig_name + '_' + query_name
+            group['parent_attribute']['method'] = method
+            group['parent_attribute']['length'] = length
+            if contig_name not in seq_regions:
+                gff3.write("##sequence-region " + contig_name + ' 1 ' + str(length) + '\n')
+                seq_regions[contig_name] = length
+            match_num = 0
+            coords = [length, 0]
+            for hsp in alignment.hsps:
+                hsp_align = {}
+                field = OrderedDict()
+                attribute = OrderedDict()
+                ref = hsp.sbjct
+                query = hsp.query
+                field['seqid'] = contig_name
+                field['source'] = source
+                field['type'] = 'match_part'
+
+                field['start'] = hsp.sbjct_start
+                if field['start'] < coords[0]:
+                    coords[0] = field['start']
+                ref_length = len(ref.replace('-', ''))
+                # if run tblastn, the actual length of reference should be multiplied by 3
+                if source.lower() == "tblastn":
+                    ref_length *= 3
+                field['end'] = field['start'] + ref_length - 1
+                if field['end'] > coords[1]:
+                    coords[1] = field['end']
+                field['score'] = hsp.score
+                #decide if the alignment in the same strand or reverse strand
+                #reading frame
+                # (+, +), (0, 0), (-, -) => +
+                # (+, -), (-, +) => -
+                if hsp.frame[1] * hsp.frame[0] > 0:
+                    field['strand'] = '+'
+                elif hsp.frame[1] * hsp.frame[0] < 0:
+                    field['strand'] = '-'
+                else:
+                    if hsp.frame[0] + hsp.frame[1] >= 0:
+                        field['strand'] = '+'
+                    else:
+                        field['strand'] = '-'
+                field['phase'] = '.'
+
+                target_start = hsp.query_start
+                target_len = len(query.replace('-', ''))
+                # if run blastx, the actual length of query should be multiplied by 3
+                if source.lower() == "blastx":
+                    target_len *= 3
+                target_end = target_start + target_len -1
+                attribute['ID'] = group['parent_attribute']['ID'] + '_match_' + str(match_num)
+                attribute['Parent'] = group['parent_attribute']['ID']
+                attribute['Target'] = query_name + " " + str(target_start) + " " + str(target_end)
+                attribute['Gap'] = align2cigar(query, ref)
+                #store the query sequence and match string in the file in order to display alignment with BlastAlignment plugin
+                attribute['subject'] = hsp.sbjct
+                attribute['query'] = hsp.query
+                attribute['match'] = hsp.match
+                attribute['gaps'] = attribute['match'].count(' ')
+                similar = attribute['match'].count('+')
+                attribute['identities'] = len(attribute['match']) - similar - attribute['gaps']
+                attribute['positives'] = attribute['identities'] + similar
+                attribute['expect'] = hsp.expect
+                # show reading frame attribute only if the frame is not (0, 0)
+                attribute['frame'] = hsp.frame[1]
+                match_num += 1
+                hsp_align['field'] = field
+                hsp_align['attribute'] = attribute
+                group['alignments'].append(hsp_align)
+            group['parent_field']['start'] = coords[0]
+            group['parent_field']['end'] = coords[1]
+            group['parent_field']['score'] = group['parent_field']['strand'] = group['parent_field']['phase'] = '.'
+            group['parent_attribute']['match_num'] = match_num
+            group['alignments'].sort(key=lambda x: (x['field']['start'], x['field']['end']))
+            utils.write_features(group['parent_field'], group['parent_attribute'], gff3)
+            prev_end = -1
+            for align in group['alignments']:
+                overlap = ''
+                if align['field']['start'] <= prev_end:
+                    overlap += str(align['field']['start']) + ',' + str(prev_end)
+                prev_end = align['field']['end']
+                align['attribute']['overlap'] = overlap
+                utils.write_features(align['field'], align['attribute'], gff3)
+    gff3.close()
+
+def blastxmlToGff3(xml_file, gff3_file):
+    result_handle = open(xml_file)
+    blast_records = NCBIXML.parse(result_handle)
+    gff3_writer(blast_records, gff3_file)
+
+if __name__ == "__main__":
+    blastxmlToGff3("../dbia3/raw/tblastn_dmel-hits-translation-r6.11.fa_vs_nucleotide_BLAST_database_from_data_3.blastxml", "gff3.txt")
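A worked example of align2cigar() on a five-column alignment (hypothetical sequences): a gap in the reference column emits I, a gap in the query column emits D, and runs of equal states are collapsed with their length.

    # Assumes the module's own imports (Bio, utils) resolve in this environment.
    from datatypes.converters.blastxmlToGff3 import align2cigar

    query = 'ATG-A'   # hsp.query
    ref   = 'AT-GA'   # hsp.sbjct
    # columns: A/A -> M, T/T -> M, G/- -> I, -/G -> D, A/A -> M
    print(align2cigar(query, ref))   # M2 I1 D1 M1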
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/converters/gtfToGff3.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+
+'''
+Convert GTF format to GFF3
+reference for gff3: https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
+'''
+import os
+from collections import OrderedDict
+from util import subtools
+
+
+
+
+def gtfToGff3(gtf_file, gff3_file, chrom_sizes):
+    """
+    Covert gtf file output from StringTie to gff3 format
+    """
+    gff3 = open(gff3_file, 'w')
+    gff3.write("##gff-version 3\n")
+    sizes_dict = subtools.sequence_region(chrom_sizes)
+    seq_regions = dict()
+    parents = dict()
+    with open(gtf_file, 'r') as gtf:
+        for line in gtf:
+            if line.startswith('#') or not line.strip():
+                continue
+            field = OrderedDict()
+            attribute = OrderedDict()
+            li = line.rstrip().split("\t")
+            #print li
+            field['seqid'] = li[0]
+            #print field['seqid']
+            if field['seqid'] not in seq_regions:
+                end_region = sizes_dict[field['seqid']]
+                gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n')
+                seq_regions[field['seqid']] = end_region
+            field['source'] = li[1]
+            field['type'] = li[2]
+            # The first base in a chromosome is numbered 0 in BED format
+            field['start'] = li[3]
+            field['end'] = li[4]
+            field['score'] = li[5]
+            field['strand'] = li[6]
+            field['phase'] = li[7]
+            attr_li = li[8].split(';')
+            gene_id = attr_li[0].split()[1].strip('"')
+            attribute['ID'] = gene_id + '_' + field['type'] + '_' + str(field['start']) + '_' + str(field['end'])
+            if field['type'] == 'transcript':
+                parents[gene_id] = attribute['ID']
+                attribute['transcript_id'] = attr_li[1].split()[1].strip('"')
+                attribute['coverage'] = attr_li[2].split()[1].strip('"')
+                attribute['fpkm'] = attr_li[3].split()[1].strip('"')
+                attribute['tpm'] = attr_li[4].split()[1].strip('"')
+            elif field['type'] == 'exon':
+                attribute['Parent'] = parents[gene_id]
+                attribute['transcript_id'] = attr_li[1].split()[1].strip('"')
+                attribute['coverage'] = attr_li[3].split()[1].strip('"')
+            subtools.write_features(field, attribute, gff3)
+    gff3.close()
+
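The attribute handling above assumes StringTie's fixed attribute order; for a hypothetical transcript line the indexing works out as:

    attrs = 'gene_id "MSTRG.1"; transcript_id "MSTRG.1.1"; cov "7.3"; FPKM "5.1"; TPM "6.0";'
    attr_li = attrs.split(';')
    gene_id = attr_li[0].split()[1].strip('"')        # 'MSTRG.1'
    transcript_id = attr_li[1].split()[1].strip('"')  # 'MSTRG.1.1'
    # cov/FPKM/TPM sit at fixed indexes 2-4, which is why a reordered or
    # non-StringTie GTF would break this parser.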
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/interval/Bed.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,48 @@
+#!/usr/bin/python
+
+import os
+import tempfile
+import logging
+import shutil
+
+# Internal dependencies
+from Interval import Interval
+from datatypes.validators.DataValidation import DataValidation
+from datatypes.converters.DataConversion import DataConversion
+
+class Bed(Interval):
+    def __init__(self, inputBedGeneric, data_bed_generic):
+        super(Bed, self).__init__()
+        self.inputFile = inputBedGeneric
+        self.trackSettings = data_bed_generic
+        self.bedFields = None
+        self.extFields = None
+        self.dataType = "bed"
+
+    def createTrack(self):
+        shutil.copyfile(self.inputFile, self.trackDataURL)
+
+    def validateData(self):
+        self.validator = DataValidation(self.inputFile, self.getValidateType(), self.chromSizesFile.name)
+        self.validator.validate()
+
+    def _getBedFields(self):
+        """count the number of fields of a generic bed file from its first line"""
+        with open(self.inputFile, 'r') as bed:
+            first_line = bed.readline().split()
+        return len(first_line)
+
+    def getValidateType(self):
+        if not self.bedFields:
+            self.bedFields = self._getBedFields()
+            logging.debug("bedFields was not defined; treating the file as generic Bed format, datatype = bed%s", str(self.bedFields))
+            return self.dataType + str(self.bedFields)
+        elif not self.extFields:
+            return self.dataType + str(self.bedFields)
+        else:
+            return self.dataType + str(self.bedFields) + "+" + str(self.extFields)
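For reference, `getValidateType()` composes the format string handed to `DataValidation`: a plain n-column BED becomes `bedN`, while subclasses that preset `bedFields`/`extFields` produce the `bedN+M` form. A small sketch of the expected strings, with field counts taken from the subclasses in this changeset:

```python
# Sketch of the validate-type strings produced by getValidateType():
#   Bed (6-column input, bedFields detected)   -> "bed6"
#   BedSimpleRepeats (bedFields=4, ext=12)     -> "bed4+12"
#   BedSpliceJunctions (bedFields=12, ext=1)   -> "bed12+1"
#   BigPsl (bedFields=12, ext=12)              -> "bed12+12"
def validate_type(dataType, bedFields, extFields=None):
    if extFields:
        return dataType + str(bedFields) + "+" + str(extFields)
    return dataType + str(bedFields)

assert validate_type("bed", 4, 12) == "bed4+12"
assert validate_type("bed", 6) == "bed6"
```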
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datatypes/interval/BedBlastAlignments.py Fri Oct 13 12:44:31 2017 -0400 @@ -0,0 +1,25 @@ +#!/usr/bin/python + +import os +import tempfile +import string + +from BigPsl import BigPsl +from datatypes.converters.DataConversion import DataConversion +from util import subtools + + +class BedBlastAlignments( BigPsl ): + def __init__(self, input_bed_blast_alignments_false_path, data_bed_blast_alignments): + + super(BedBlastAlignments, self).__init__(input_bed_blast_alignments_false_path, data_bed_blast_alignments) + #self.seqType = 1 + self.trackType = "G-OnRamp_plugin/BlastAlignment" + + def initSettings(self): + super(BedBlastAlignments, self).initSettings() + self.extraSettings["subfeatureClasses"] = "match_part" + + + + \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/interval/BedBlatAlignments.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,23 @@
+#!/usr/bin/python
+
+import os
+import tempfile
+import string
+
+from BigPsl import BigPsl
+from datatypes.converters.DataConversion import DataConversion
+from util import subtools
+
+
+class BedBlatAlignments( BigPsl ):
+    def __init__(self, input_bed_blat_alignments_false_path, data_bed_blat_alignments):
+
+        super(BedBlatAlignments, self).__init__(input_bed_blat_alignments_false_path, data_bed_blat_alignments)
+        #self.seqType = 1
+        #self.trackType = "G-OnRamp_plugin/BlatAlignment"
+
+    def initSettings(self):
+        super(BedBlatAlignments, self).initSettings()
+        self.extraSettings["subfeatureClasses"] = "match_part"
+
\ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datatypes/interval/BedSimpleRepeats.py Fri Oct 13 12:44:31 2017 -0400 @@ -0,0 +1,33 @@ +#!/usr/bin/python + +import os +import tempfile + +from Bed import Bed +from datatypes.validators.DataValidation import DataValidation +from datatypes.converters.DataConversion import DataConversion + + +class BedSimpleRepeats( Bed ): + def __init__(self, input_bed_simple_repeats_false_path, data_bed_simple_repeats): + + super(BedSimpleRepeats, self).__init__(input_bed_simple_repeats_false_path, data_bed_simple_repeats) + self.bedFields = 4 + self.extFields = 12 + self.autoSql = os.path.join(self.tool_directory, 'trf_simpleRepeat.as') + self.trackFileType = "gff" + + + + def validateData(self): + self.validateOptions = self.getValidateOptions(tab="True", autoSql=self.autoSql) + self.validator = DataValidation(self.inputFile, self.getValidateType(), self.chromSizesFile.name, self.validateOptions) + self.validator.validate() + + + def createTrack(self): + self.convertType = self.getConvertType() + self.converter = DataConversion(self.inputFile, self.trackDataURL, self.chromSizesFile.name, self.convertType, 'trfbig') + self.converter.convertFormats() + self.dataType = self.trackFileType +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datatypes/interval/BedSpliceJunctions.py Fri Oct 13 12:44:31 2017 -0400 @@ -0,0 +1,36 @@ +#!/usr/bin/python + +import os +import tempfile + +from Bed import Bed +from datatypes.validators.DataValidation import DataValidation +from datatypes.converters.DataConversion import DataConversion + + + +class BedSpliceJunctions( Bed ): + def __init__(self, input_bed_splice_junctions_false_path, data_bed_splice_junctions): + + super(BedSpliceJunctions, self).__init__(input_bed_splice_junctions_false_path, data_bed_splice_junctions) + self.bedFields = 12 + self.extFields = 1 + self.autoSql = os.path.join(self.tool_directory, 'spliceJunctions.as') + self.trackFileType = "gff" + + def initSettings(self): + super(BedSpliceJunctions, self).initSettings() + self.extraSettings["glyph"] = "JBrowse/View/FeatureGlyph/Segments" + self.extraSettings["subfeatureClasses"] = "exon_junction" + + def validateData(self): + self.validateOptions = self.getValidateOptions(tab="True", autoSql=self.autoSql) + self.validator = DataValidation(self.inputFile, self.getValidateType(), self.chromSizesFile.name, self.validateOptions) + self.validator.validate() + + def createTrack(self): + self.convertType = self.getConvertType() + self.converter = DataConversion(self.inputFile, self.trackDataURL, self.chromSizesFile.name, self.convertType, 'regtools') + self.converter.convertFormats() + self.dataType = self.trackFileType +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datatypes/interval/BigPsl.py Fri Oct 13 12:44:31 2017 -0400 @@ -0,0 +1,53 @@ +#!/usr/bin/python + +import os +import tempfile +import string + +from Interval import Interval +from util.index.DatabaseIndex import DatabaseIndex +from util.index.TrixIndex import TrixIndex +from datatypes.validators.DataValidation import DataValidation +from datatypes.converters.DataConversion import DataConversion + + +class BigPsl(Interval): + def __init__(self, input_bigpsl_false_path, data_bigpsl): + + super(BigPsl, self).__init__() + self.inputFile = input_bigpsl_false_path + self.trackSettings = data_bigpsl + self.dataType = "bed" + self.bedFields = 12 + self.extFields = 12 + #self.seqType = None + self.autoSql = os.path.join(self.tool_directory, 'bigPsl.as') + + def initSettings(self): + super(BigPsl, self).initSettings() + self.extraSettings["glyph"] = "JBrowse/View/FeatureGlyph/Segments" + #self.extraSettings["subfeatureClasses"] = "match_part" + + def validateData(self): + self.validateOptions = self.getValidateOptions(tab="True", autoSql=self.autoSql) + self.validator = DataValidation(self.inputFile, self.getValidateType(), self.chromSizesFile.name, self.validateOptions) + self.validator.validate() + + def createTrack(self): + self.convertType = self.getConvertType() + self.converter = DataConversion(self.inputFile, self.trackDataURL, self.chromSizesFile.name, self.convertType, 'blat') + self.converter.convertFormats() + self.dataType = self.trackFileType + + def getValidateType(self): + if not self.bedFields or not self.extFields: + raise Exception("Invalid bigPsl format, no {0} or {1}".format("bedFields", "extFields")) + return self.dataType + str(self.bedFields) + "+" + str(self.extFields) + + def _getSeqType(self): + with open(self.inputFile, "r") as bigpsl: + sampleSeq = bigpsl.readline().split() + if len(sampleSeq) == 25: + return sampleSeq[-1] + else: + return None
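`_getSeqType()` relies on the bigPsl convention that a full row is BED12 plus 12 extra fields, with an optional 25th column holding the query sequence itself. A sketch of that check on a hypothetical input path:

```python
# Sketch of the _getSeqType() convention above: bigPsl rows are bed12+12,
# and an optional 25th column carries the aligned sequence itself.
with open("alignments.bigpsl.bed") as bigpsl:    # hypothetical path
    first = bigpsl.readline().split()
seq = first[-1] if len(first) == 25 else None    # None => no embedded sequence
```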
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/interval/BlastXml.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,34 @@
+#!/usr/bin/python
+
+import os
+import tempfile
+import string
+
+from Interval import Interval
+from datatypes.converters.DataConversion import DataConversion
+from util import subtools
+
+
+class BlastXml( Interval ):
+    def __init__(self, input_blast_alignments_false_path, data_blast_alignments):
+
+        super(BlastXml, self).__init__()
+        self.inputFile = input_blast_alignments_false_path
+        self.trackSettings = data_blast_alignments
+        self.dataType = "blastxml"
+        #self.trackType = "G-OnRamp_plugin/BlastAlignment"
+
+    def initSettings(self):
+        super(BlastXml, self).initSettings()
+        self.extraSettings["glyph"] = "JBrowse/View/FeatureGlyph/Segments"
+        self.extraSettings["subfeatureClasses"] = "match_part"
+
+    def validateData(self):
+        # No validator is available for blastxml input; the file is converted
+        # straight to gff in createTrack()
+        return
+
+    def createTrack(self):
+        self.convertType = self.getConvertType()
+        self.converter = DataConversion(self.inputFile, self.trackDataURL, self.chromSizesFile.name, self.convertType)
+        self.converter.convertFormats()
+        self.dataType = self.trackFileType
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datatypes/interval/Gff.py Fri Oct 13 12:44:31 2017 -0400 @@ -0,0 +1,21 @@ +#!/usr/bin/python + +import os +import tempfile +import abc +import shutil + +# Internal dependencies +from Interval import Interval +from datatypes.validators.DataValidation import DataValidation +from datatypes.converters.DataConversion import DataConversion + +class Gff(Interval): + def __init__(self): + super(Gff, self).__init__() + self.autoSql = os.path.join(self.tool_directory, 'bigGenePred.as') + + + def createTrack(self): + shutil.copyfile(self.inputFile, self.trackDataURL) + \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datatypes/interval/Gff3.py Fri Oct 13 12:44:31 2017 -0400 @@ -0,0 +1,22 @@ +#!/usr/bin/python + +import os +import tempfile + +# Internal dependencies +from Gff import Gff +from datatypes.validators.Gff3Validation import Gff3Validation + + +class Gff3( Gff ): + def __init__(self, input_Gff3_false_path, data_gff3): + super( Gff3, self ).__init__() + self.inputFile = input_Gff3_false_path + self.trackSettings = data_gff3 + self.dataType = "gff" + + + def validateData(self): + self.validator = Gff3Validation(self.inputFile, self.dataType, self.chromSizesFile.name) + self.inputFile = self.validator.validate() +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datatypes/interval/Gff3_mrna.py Fri Oct 13 12:44:31 2017 -0400 @@ -0,0 +1,27 @@ +#!/usr/bin/python + +import os +import tempfile + +# Internal dependencies +from Gff import Gff +from datatypes.validators.Gff3Validation import Gff3Validation + + +class Gff3_mrna( Gff ): + def __init__(self, input_Gff3_false_path, data_gff3): + super( Gff3_mrna, self ).__init__() + self.inputFile = input_Gff3_false_path + self.trackSettings = data_gff3 + self.dataType = "gff" + #self.trackType = "G-OnRamp_plugin/GenePred" + + def initSettings(self): + super(Gff3_mrna, self).initSettings() + self.extraSettings["type"] = "mRNA" + self.extraSettings["subfeatureClasses"] = "CDS" + + def validateData(self): + self.validator = Gff3Validation(self.inputFile, self.dataType, self.chromSizesFile.name) + self.inputFile = self.validator.validate() +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datatypes/interval/Gff3_transcript.py Fri Oct 13 12:44:31 2017 -0400 @@ -0,0 +1,28 @@ +#!/usr/bin/python + +import os +import tempfile + +# Internal dependencies +from Gff import Gff +from datatypes.validators.Gff3Validation import Gff3Validation + + +class Gff3_transcript( Gff ): + def __init__(self, input_Gff3_false_path, data_gff3): + super( Gff3_transcript, self ).__init__() + self.inputFile = input_Gff3_false_path + self.trackSettings = data_gff3 + self.dataType = "gff" + #self.trackType = "G-OnRamp_plugin/GenePred" + + def initSettings(self): + super(Gff3_transcript, self).initSettings() + self.extraSettings["transcriptType"] = "transcript" + self.extraSettings["type"] = "transcript" + self.extraSettings["subfeatureClasses"] = "CDS" + + def validateData(self): + self.validator = Gff3Validation(self.inputFile, self.dataType, self.chromSizesFile.name) + self.inputFile = self.validator.validate() +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datatypes/interval/Gtf.py Fri Oct 13 12:44:31 2017 -0400 @@ -0,0 +1,33 @@ +#!/usr/bin/python + +import os +import tempfile + +# Internal dependencies +from Gff import Gff +from datatypes.validators.GtfValidation import GtfValidation +from datatypes.converters.DataConversion import DataConversion + + +class Gtf(Gff): + def __init__( self, input_gtf_false_path, data_gtf): + + super(Gtf, self).__init__() + self.inputFile = input_gtf_false_path + self.trackSettings = data_gtf + self.dataType = "gtf" + + def initSettings(self): + super(Gtf, self).initSettings() + self.extraSettings["glyph"] = "JBrowse/View/FeatureGlyph/Segments" + + def createTrack(self): + self.convertType = self.getConvertType() + self.converter = DataConversion(self.inputFile, self.trackDataURL, self.chromSizesFile.name, self.convertType) + self.converter.convertFormats() + self.dataType = self.trackFileType + + def validateData(self): + self.validator = GtfValidation(self.inputFile, self.dataType, self.chromSizesFile.name) + self.inputFile = self.validator.validate() +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datatypes/interval/GtfStringTie.py Fri Oct 13 12:44:31 2017 -0400 @@ -0,0 +1,23 @@ +#!/usr/bin/python + +import os +import tempfile + +# Internal dependencies +from Gtf import Gtf +from datatypes.validators.GtfValidation import GtfValidation +from datatypes.converters.DataConversion import DataConversion + + +class GtfStringTie(Gtf): + def __init__( self, input_gtf_false_path, data_gtf): + + super(GtfStringTie, self).__init__(input_gtf_false_path, data_gtf) + + + def initSettings(self): + super(GtfStringTie, self).initSettings() + self.extraSettings["glyph"] = "JBrowse/View/FeatureGlyph/Segments" + self.extraSettings["subfeatureClasses"] = "UTR" + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/interval/Interval.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,42 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
+"""
+Superclass of the managed interval datatypes
+"""
+
+import logging
+from datatypes.Datatype import Datatype
+
+
+class Interval(Datatype):
+
+    def __init__(self):
+        super(Interval, self).__init__()
+        if not Datatype.trackType:
+            self.trackType = "HTMLFeatures"
+        else:
+            self.trackType = Datatype.trackType
+        logging.debug("Using trackType = %s for feature tracks", self.trackType)
+        self.trackFileType = "gff"
+
+    def getValidateOptions(self, tab=None, autoSql=None):
+        options = dict()
+        if tab:
+            options["tab"] = tab
+        if autoSql:
+            options["autoSql"] = autoSql
+        return options
+
+    def getConvertType(self):
+        # (source format, target format) tuple used to pick a converter
+        if not self.trackFileType or not self.dataType:
+            raise ValueError("dataType or trackFileType has not been set!")
+        return (self.dataType.lower(), self.trackFileType.lower())
\ No newline at end of file
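`getConvertType()` returns a `(source, target)` tuple that `DataConversion` presumably uses to select a converter. A hedged sketch of the mapping implied by the converters in this changeset; the actual lookup lives in `datatypes/converters/DataConversion.py` (not shown here), so the table below is an assumption, not its real contents:

```python
# Assumed (source, target) -> converter mapping, inferred from the datatype
# classes in this changeset; illustrative only.
CONVERTERS = {
    ("gtf", "gff"): "gtfToGff3",
    ("blastxml", "gff"): "blastxmlToGff3",
    ("psl", "bigpsl"): "pslToBigPsl + bedToBigBed",  # per Psl.createTrack()
}
print(CONVERTERS[("gtf", "gff")])  # gtfToGff3
```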
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datatypes/interval/Psl.py Fri Oct 13 12:44:31 2017 -0400 @@ -0,0 +1,42 @@ +import logging +import os +import tempfile + +# Internal dependencies +from Interval import Interval +from datatypes.validators.PslValidation import PslValidation +from datatypes.converters.DataConversion import DataConversion + + +class Psl(Interval): + def __init__(self, input_psl_path, data_psl): + super(Psl, self).__init__() + self.inputFile = input_psl_path + self.trackSettings = data_psl + self.dataType = "psl" + self.trackType = "bigPsl" + self.autoSql = os.path.join(self.tool_directory, 'bigPsl.as') + + def initSettings(self): + super(Psl, self).initSettings() + self.trackName = "".join( ( self.trackName, ".bb") ) + self.trackDataURL = os.path.join(self.myTrackFolderPath, self.trackName) + if "track_color" in self.trackSettings: + self.extraSettings["color"] = self.trackSettings["track_color"] + if "group_name" in self.trackSettings: + self.extraSettings["group"] = self.trackSettings["group_name"] + self.extraSettings["visibility"] = "dense" + self.extraSettings["priority"] = self.trackSettings["order_index"] + + def validateData(self): + self.validator = PslValidation(self.inputFile, self.dataType, self.chromSizesFile) + self.validator.validate() + + def createTrack(self): + self.convertType = self.getConvertType() + self.options = self.getConvertOptions("bed12+12", tab="True", autoSql=self.autoSql, extraIndex="name") + self.converter = DataConversion(self.inputFile, self.trackDataURL, self.chromSizesFile.name, self.convertType, self.options) + self.converter.convertFormats() + + def getConvertType(self): + return (self.dataType.lower(), self.trackType.lower())
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/sequence/Fasta.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,16 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
+"""
+Class describing the Fasta format
+(As of 07/20/2016, only used for the reference genome)
+"""
+
+class Fasta(object):
+    def __init__(self, false_path, name, assembly_id):
+        self.false_path = false_path
+        self.name = name
+
+        if not assembly_id:
+            assembly_id = "unknown"
+        self.assembly_id = assembly_id
\ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/validators/DataValidation.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,43 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
+"""
+This class validates input files for JBrowseArchiveCreator by calling the
+UCSC validateFiles tool through util.subtools
+"""
+
+import logging
+import os
+import subprocess
+import sys
+import string
+import tempfile
+import re
+
+from util import subtools
+
+
+class DataValidation(object):
+    BED_TYPE = re.compile(r'bed([1-9][0-9]?)\+?([1-9][0-9]?)?$')
+    BIGBED_TYPE = re.compile(r'bigBed([1-9][0-9]?)\+?([1-9][0-9]?)?$')
+    FILE_TYPE = ["fasta", "fastq", "bam", "bigwig", "bed", "bigbed", "bedgraph"]
+
+    def __init__(self, inputFile, fileType, chromSizesFile, options=None):
+        self.inputFile = inputFile
+        self.fileType = fileType
+        self.chromSizesFile = chromSizesFile
+        self.options = options
+
+    def validate(self):
+        """validate input file format"""
+        if self._checkDatatype():
+            subtools.validateFiles(self.inputFile, self.chromSizesFile, self.fileType, self.options)
+        else:
+            raise TypeError("validateFiles cannot validate format {0}. Only the following formats can be validated by this tool: \n{1}\n".format(self.fileType, self.FILE_TYPE))
+
+    def _checkDatatype(self):
+        if re.match(self.BED_TYPE, self.fileType) or re.match(self.BIGBED_TYPE, self.fileType):
+            return True
+        elif self.fileType.lower() in self.FILE_TYPE:
+            return True
+        return False
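The two regular expressions above accept the `bedN`, `bedN+` and `bedN+M` spellings (and their `bigBed` counterparts) that UCSC validateFiles understands, alongside the fixed `FILE_TYPE` list. A quick standalone check of which type strings pass `_checkDatatype`:

```python
# Standalone sketch of _checkDatatype(): which type strings are accepted.
import re

BED_TYPE = re.compile(r'bed([1-9][0-9]?)\+?([1-9][0-9]?)?$')
BIGBED_TYPE = re.compile(r'bigBed([1-9][0-9]?)\+?([1-9][0-9]?)?$')
FILE_TYPE = ["fasta", "fastq", "bam", "bigwig", "bed", "bigbed", "bedgraph"]

for fileType in ("bed6", "bed4+12", "bigBed12+1", "bam", "vcf"):
    accepted = bool(re.match(BED_TYPE, fileType) or re.match(BIGBED_TYPE, fileType)) \
               or fileType.lower() in FILE_TYPE
    print(fileType + " -> " + str(accepted))  # vcf -> False, the rest -> True
```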
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/validators/Gff3Validation.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,48 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
+"""
+This class validates Gff3 files for JBrowseArchiveCreator
+"""
+
+import logging
+import os
+import subprocess
+import sys
+import string
+import tempfile
+import re
+
+from DataValidation import DataValidation
+
+
+class Gff3Validation(DataValidation):
+
+    def __init__(self, inputFile, fileType, chromSizesFile, options=None):
+        super(Gff3Validation, self).__init__(inputFile, fileType, chromSizesFile, options)
+
+    def validate(self):
+        """validate input file format"""
+        if self._removeExtraHeader() > 1:
+            print("- Warning: Gff3 created with a modified version of your Gff3 by removing extra '##gff-version 3' headers.")
+        return self.inputFile
+
+    def _removeExtraHeader(self):
+        """
+        Remove extra meta lines: only the first '##gff-version 3' is kept.
+        Returns the number of '##gff-version 3' lines seen in the input.
+        """
+        valid_gff3_file = tempfile.NamedTemporaryFile(bufsize=0, suffix=".gff3", delete=False)
+        num = 0
+        with open(valid_gff3_file.name, 'w') as valid:
+            with open(self.inputFile, 'r') as f:
+                for line in f:
+                    if '##gff-version 3' in line:
+                        num += 1
+                        # skip every occurrence after the first one
+                        if num > 1:
+                            continue
+                    valid.write(line)
+        self.inputFile = valid_gff3_file.name
+        return num
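The header cleanup matters because concatenated GFF3 datasets often repeat the version pragma, which downstream parsers reject. A before/after sketch with hypothetical features:

```python
# Sketch: input with a repeated '##gff-version 3' pragma; after
# _removeExtraHeader() only the first pragma survives.
dirty = ("##gff-version 3\n"
         "ctg1\t.\tgene\t1\t100\t.\t+\t.\tID=g1\n"
         "##gff-version 3\n"                        # duplicate, removed
         "ctg1\t.\tgene\t200\t300\t.\t+\t.\tID=g2\n")
clean = []
seen = 0
for line in dirty.splitlines(True):
    if '##gff-version 3' in line:
        seen += 1
        if seen > 1:
            continue
    clean.append(line)
print(''.join(clean))
```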
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/validators/GtfValidation.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,108 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
+"""
+This class validates Gtf files for JBrowseArchiveCreator
+"""
+
+import logging
+import os
+import subprocess
+import sys
+import string
+import tempfile
+import re
+
+from DataValidation import DataValidation
+
+
+class GtfValidation(DataValidation):
+
+    def __init__(self, inputFile, fileType, chromSizesFile, options=None):
+        super(GtfValidation, self).__init__(inputFile, fileType, chromSizesFile, options)
+
+    def validate(self):
+        """validate input file format"""
+        self._checkAndFixGtf()
+        if self.is_modified:
+            print("- Warning: Gtf created with a modified version of your Gtf because of start/end coordinate issues.")
+            print("Here are the lines removed: " + self._get_str_modified_lines())
+        return self.inputFile
+
+    def _checkAndFixGtf(self):
+        """
+        Check the integrity of the gtf file: if coordinates exceed the chromosome size,
+        either remove the whole line(s) or truncate to the end of the scaffold,
+        depending on the user choice
+        default: remove the whole line(s)
+        """
+        # Set the boolean telling if we had to modify the file
+        self.is_modified = False
+        self.array_modified_lines = []
+        # Create a temp gtf just in case we have issues
+        temp_gtf = tempfile.NamedTemporaryFile(bufsize=0, suffix=".gtf", delete=False)
+
+        # TODO: Get the user choice and use it
+        # Get the chrom.sizes into a dictionary to have a faster access
+        # TODO: Think about doing this in Datatype.py, so everywhere we have access to this read-only dictionary
+        dict_chrom_sizes = {}
+        with open(self.chromSizesFile, 'r') as chromSizes:
+            for line in chromSizes:
+                fields = line.split()
+                # fields[0] is the name of the scaffold, fields[1] is its size
+                dict_chrom_sizes[fields[0]] = int(fields[1])
+
+        # Parse the GTF and check each line using the chrom sizes dictionary
+        with open(temp_gtf.name, 'a+') as tmp:
+            with open(self.inputFile, 'r') as gtf:
+                for index, line in enumerate(gtf):
+                    # If this is not a comment, we check the fields
+                    if not line.startswith('#'):
+                        fields = line.split()
+                        # fields[0] => Seqname (scaffold)
+                        # fields[3] => Start position on the scaffold
+                        # fields[4] => End position on the scaffold
+                        scaffold_size = dict_chrom_sizes[fields[0]]
+                        start_position = int(fields[3])
+                        end_position = int(fields[4])
+
+                        if start_position > 0 and end_position <= scaffold_size:
+                            # We are good, so we copy this line
+                            tmp.write(line)
+                        # The line is out of bounds; we process it following the user choice
+                        # TODO: Process the user choice
+                        # By default, we assume the user choice is to remove the line: we don't copy it
+                        else:
+                            # We save the line number for the feedback to the user
+                            self.array_modified_lines.append(index + 1)
+                            self.is_modified = True
+                    else:
+                        tmp.write(line)
+
+        # Once the process is completed, we just replace the path of the gtf
+        self.inputFile = temp_gtf.name
+
+        # TODO: Manage the issue with the fact the dataset is going to still exist on the disk because of delete=False
+
+    def _get_str_modified_lines(self):
+        return ','.join(map(str, self.array_modified_lines))
\ No newline at end of file
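The bounds check above only works once the coordinates and scaffold sizes are compared as integers (comparing the raw strings would be lexicographic). A sketch of the check on a hypothetical chrom.sizes entry:

```python
# Sketch of the bounds check performed by _checkAndFixGtf(): lines whose end
# coordinate exceeds the scaffold size are dropped. Values are illustrative.
dict_chrom_sizes = {"contig1": 5000}
fields = "contig1\tStringTie\texon\t4900\t5200\t.\t+\t.\t...".split("\t")
keep = int(fields[3]) > 0 and int(fields[4]) <= dict_chrom_sizes[fields[0]]
print(keep)  # False -> the line would be removed and reported to the user
```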
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/validators/PslValidation.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,31 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
+"""
+This class validates Psl files for JBrowseArchiveCreator by calling UCSC
+pslCheck through util.subtools
+"""
+
+import logging
+import os
+import subprocess
+import sys
+import string
+import tempfile
+import re
+
+from util import subtools
+from datatypes.validators.DataValidation import DataValidation
+
+
+class PslValidation(DataValidation):
+
+    def __init__(self, inputFile, fileType, chromSizesFile, options=None):
+        super(PslValidation, self).__init__(inputFile, fileType, chromSizesFile, options)
+
+    def validate(self):
+        """validate input file format"""
+        self.pslCheck()
+
+    def pslCheck(self):
+        subtools.pslCheck(self.inputFile)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/jbrowseArchiveCreator.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+# -*- coding: utf8 -*-
+
+"""
+This Galaxy tool prepares your files for JBrowse visualization.
+"""
+
+import sys
+import argparse
+import json
+import logging
+import collections
+
+
+# Internal dependencies
+from util.Reader import Reader
+from util.Logger import Logger
+from TrackHub import TrackHub
+
+
+def main(argv):
+    parser = argparse.ArgumentParser(description='Create a hub to display in JBrowse.')
+    parser.add_argument('-j', '--data_json', help='JSON file containing the metadata of the inputs')
+    parser.add_argument('-o', '--output', help='Name of the HTML file summarizing the content of the JBrowse Hub Archive')
+
+    # Get the args passed in parameter
+    args = parser.parse_args()
+    json_inputs_data = args.data_json
+    outputFile = args.output
+
+    ## Parse the JSON file with Reader
+    reader = Reader(json_inputs_data)
+
+    # Begin init variables
+    extra_files_path = reader.getExtFilesPath()
+    toolDirectory = reader.getToolDir()
+    #outputFile = reader.getOutputDir()
+    user_email = reader.getUserEmail()
+    reference_genome = reader.getRefGenome()
+    debug_mode = reader.getDebugMode()
+    track_type = reader.getTrackType()
+    #jbrowse_path = reader.getJBrowsePath()
+    apollo_host = reader.getApolloHost()
+    apollo_user = reader.getApolloUser()
+
+    #### Logging management ####
+    # If we are in Debug mode, also print the debug dump to stdout
+    log = Logger(tool_directory=toolDirectory, debug=debug_mode, extra_files_path=extra_files_path)
+    log.setup_logging()
+    logging.info('#### JBrowseArchiveCreator: Start ####\n')
+    logging.debug('---- Welcome to JBrowseArchiveCreator Debug Mode ----\n')
+    logging.debug('JSON parameters: %s\n\n', json.dumps(reader.args))
+    #### END Logging management ####
+
+    # Create the Track Hub folder
+    logging.info('#### JBrowseArchiveCreator: Creating the Track Hub folder ####\n')
+    trackHub = TrackHub(reference_genome, apollo_user, outputFile, extra_files_path, toolDirectory, track_type, apollo_host)
+
+    # Create an ordered dictionary to add the tracks in the tool form order
+    logging.info('#### JBrowseArchiveCreator: Preparing track data ####\n')
+    all_datatype_dictionary = reader.getTracksData()
+    all_datatype_ordered_dictionary = collections.OrderedDict(all_datatype_dictionary)
+
+    logging.debug("----- End of all_datatype_dictionary processing -----")
+    #logging.debug("all_datatype_ordered_dictionary are: %s", json.dumps(all_datatype_ordered_dictionary))
+
+    logging.info('#### JBrowseArchiveCreator: Adding tracks to Track Hub ####\n')
+    logging.debug("----- Beginning of Track adding processing -----")
+
+    for index, datatypeObject in all_datatype_ordered_dictionary.iteritems():
+        trackHub.addTrack(datatypeObject.track.track_db)
+
+    logging.debug("----- End of Track adding processing -----")
+
+    # We terminate the process and create an HTML file summarizing all the files
+    logging.info('#### JBrowseArchiveCreator: Creating the HTML file ####\n')
+    trackHub.terminate(debug_mode)
+
+    logging.debug('---- End of JBrowseArchiveCreator Debug Mode: Bye! ----\n')
+    logging.info('#### JBrowseArchiveCreator: Congratulations! The Assembly Hub was created! ####\n')
+
+    sys.exit(0)
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
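The keys that `Reader` consumes here mirror what the tool XML's configfile (next diff) dumps into `parameters.json`. A minimal sketch of that file, written as a Python dict; all paths, the order index, and the track values are illustrative, and the per-datatype track lists are keyed by the datatype class names:

```python
# Hypothetical minimal parameters.json content (keys per the configfile in
# jbrowseArchiveCreator.xml; values are illustrative only).
parameters = {
    "genome_name": "unknown",
    "apollo_host": "http://localhost:8080/apollo",
    "fasta": {"false_path": "/galaxy/files/dataset_1.dat", "name": "dbia3.fa"},
    "user_email": "user@example.org",
    "tool_directory": "/galaxy/tools/jbrowsearchivecreator",
    "extra_files_path": "/galaxy/files/dataset_2_files",
    "debug_mode": "false",
    "Bam": [{"false_path": "/galaxy/files/dataset_3.dat", "name": "HISAT.bam",
             "track_color": "#000000", "group_name": "Default group",
             "long_label": "Sequence Alignment", "order_index": "10",
             "index": "/galaxy/files/dataset_3.bai"}],
}
```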
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/jbrowseArchiveCreator.xml Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,446 @@
+<tool id="jbrowse_hub" name="JBrowse Archive Creator" version="1.0.1">
+    <description>
+        This Galaxy tool prepares your files for display in JBrowse with the Apollo plugin
+    </description>
+
+    <requirements>
+        <requirement type="package" version="1.2">samtools</requirement>
+        <requirement type="package" version="1.9">numpy</requirement>
+        <requirement type="package" version="1.68">biopython</requirement>
+        <requirement type="package" version="1.0">ucsc_tools_340</requirement>
+        <requirement type="package" version="1.12.1">jbrowse_tools</requirement>
+    </requirements>
+
+    <stdio>
+    </stdio>
+
+    <command detect_errors="exit_code"><![CDATA[
+        mkdir -p $output.extra_files_path;
+
+        ## Dump the tool parameters into a JSON file
+        python $json_file parameters.json;
+
+        python $__tool_directory__/jbrowseArchiveCreator.py --data_json parameters.json -o $output
+    ]]></command>
+    <configfiles>
+        <configfile name="json_file">
+import json
+import sys
+
+file_path = sys.argv[1]
+#set global data_parameter_dict = {}
+
+## Store the genome name and the Apollo connection settings
+#silent $data_parameter_dict.update({"genome_name": str($genome_name)})
+#silent $data_parameter_dict.update({"apollo_host": str($apollo_host)})
+#if $apollo_users_settings.apollo_users_selector == "yes"
+    #set apollo_user = {"firstname": str($apollo_users_settings.firstname), "lastname": str($apollo_users_settings.lastname), "password": str($apollo_users_settings.password), "user_email": str($apollo_users_settings.user_email)}
+    #silent $data_parameter_dict.update({"apollo_user": $apollo_user})
+#end if
+
+## Function to retrieve the data of the inputs
+#def prepare_json($datatype, $input_to_prepare, $order_index, $extra_data_dict={})
+    #set false_path = str($input_to_prepare)
+    #set $data_dict = {"false_path": $false_path}
+
+    #set name = str($input_to_prepare.name)
+    #silent $data_dict.update({"name": $name})
+    #silent $data_dict.update($extra_data_dict)
+    ## Add the ordering by taking the tool form indexes
+    #silent $data_dict.update({"order_index": $order_index})
+
+    #if $datatype in $data_parameter_dict
+        #silent $data_parameter_dict[$datatype].append($data_dict)
+    #else
+        #set array_inputs = []
+        #silent $array_inputs.append($data_dict)
+        #silent $data_parameter_dict.update({$datatype: $array_inputs})
+    #end if
+#end def
+
+## Get the maximum number of tracks in a group, to build a unique integer from the group index and the track index
+
+#set temp_max_digit = 0
+
+#for $g in $group
+    #if len($g.format) > $temp_max_digit
+        #silent temp_max_digit = len($g.format)
+    #end if
+#end for
+
+#set nb_digits_max_track = len(str($temp_max_digit))
+
+## END Get the number of digits
+
+#for $i_g, $g in enumerate( $group )
+    #for $i, $f in enumerate( $g.format )
+        ## Create the order index using index_group+1 concatenated with index_track
+        #set index_group_final = str($i_g + 1)
+        #set index_track_final = str($index_group_final) + str($i).zfill($nb_digits_max_track)
+
+        ## For each format, we have a few mandatory fields we store in a dict
+        #set track_color = str($f.formatChoice.track_color)
+        #set group_name = str($g.group_name)
+        #set longLabel = str($f.formatChoice.longLabel)
+        #set extra_data_dict = {"track_color": $track_color,
+                                "group_name": $group_name,
+                                "long_label": $longLabel}
+        #if $f.formatChoice.format_select == "bam"
+            #set bam_index = $f.formatChoice.BAM.metadata.bam_index
+
+            ## Add Bam format specific fields
+            #silent $extra_data_dict.update({"index": $bam_index})
+
+            #silent $prepare_json("Bam", $f.formatChoice.BAM, $index_track_final, $extra_data_dict)
+        #end if
+        #if $f.formatChoice.format_select == "bed"
+            #if $f.formatChoice.bedChoice.bed_select == "bed_generic"
+                #silent $prepare_json("Bed", $f.formatChoice.bedChoice.BED_generic, $index_track_final,
+                $extra_data_dict)
+            #end if
+            #if $f.formatChoice.bedChoice.bed_select == "bed_simple_repeats_option"
+                #silent $prepare_json("BedSimpleRepeats", $f.formatChoice.bedChoice.BED_simple_repeats, $index_track_final,
+                $extra_data_dict)
+            #end if
+            #if $f.formatChoice.bedChoice.bed_select == "bed_splice_junctions_option"
+                #silent $prepare_json("BedSpliceJunctions", $f.formatChoice.bedChoice.BED_splice_junctions, $index_track_final,
+                $extra_data_dict)
+            #end if
+            #if $f.formatChoice.bedChoice.bed_select == "bed_blast_alignment_option"
+                ##set database = str($f.formatChoice.bedChoice.database)
+                ##silent $extra_data_dict.update({"database": $database})
+                #silent $prepare_json("BedBlastAlignments", $f.formatChoice.bedChoice.BED_blast_alignment, $index_track_final,
+                $extra_data_dict)
+            #end if
+            #if $f.formatChoice.bedChoice.bed_select == "bed_blat_alignment_option"
+                ##set database = str($f.formatChoice.bedChoice.database)
+                ##silent $extra_data_dict.update({"database": $database})
+                #silent $prepare_json("BedBlatAlignments", $f.formatChoice.bedChoice.BED_blat_alignment, $index_track_final,
+                $extra_data_dict)
+            #end if
+        #end if
+        #if $f.formatChoice.format_select == "blastxml"
+            #silent $prepare_json("BlastXml", $f.formatChoice.BlastXML, $index_track_final, $extra_data_dict)
+        #end if
+        #if $f.formatChoice.format_select == "bigwig"
+            #set pos_color = str($f.formatChoice.pos_color)
+            #set neg_color = str($f.formatChoice.neg_color)
+            #silent $extra_data_dict.update({"style" : {"pos_color" : $pos_color, "neg_color" : $neg_color}})
+            #silent $prepare_json("BigWig", $f.formatChoice.BIGWIG, $index_track_final,
+            $extra_data_dict)
+        #end if
+        #if $f.formatChoice.format_select == 'gff3'
+            #if $f.formatChoice.gff3Choice.gff3_select == 'gff3_generic'
+                #silent $prepare_json("Gff3", $f.formatChoice.gff3Choice.GFF3_generic, $index_track_final,
+                $extra_data_dict)
+            #elif $f.formatChoice.gff3Choice.gff3_select == 'gff3_transcript'
+                #silent $prepare_json("Gff3_transcript", $f.formatChoice.gff3Choice.GFF3_transcript, $index_track_final,
+                $extra_data_dict)
+            #elif $f.formatChoice.gff3Choice.gff3_select == 'gff3_mrna'
+                #silent $prepare_json("Gff3_mrna", $f.formatChoice.gff3Choice.GFF3_mrna, $index_track_final,
+                $extra_data_dict)
+            #end if
+        #end if
+        #if $f.formatChoice.format_select == "gtf"
+            ## Also add GTF from Augustus? See https://github.com/ENCODE-DCC/kentUtils/issues/8
+            #silent $prepare_json("Gtf", $f.formatChoice.GTF, $index_track_final,
+            $extra_data_dict)
+        #end if
+    #end for
+#end for
+
+## We combine the fasta file dataset name with its false path in a JSON object
+#set fasta_json = {"false_path": str($fasta_file), "name": str($fasta_file.name)}
+#silent $data_parameter_dict.update({"fasta": $fasta_json})
+
+## Retrieve the user email
+#silent $data_parameter_dict.update({"user_email": str($__user_email__)})
+
+#silent $data_parameter_dict.update({"tool_directory": str($__tool_directory__)})
+
+#silent $data_parameter_dict.update({"extra_files_path": str($output.extra_files_path)})
+
+#silent $data_parameter_dict.update({"debug_mode": str($advanced_options.debug_mode)})
+
+with open(file_path, 'w') as f:
+    json.dump($data_parameter_dict, f)
+        </configfile>
+    </configfiles>
+
+    <inputs>
+        <param
+            name="genome_name"
+            type="text"
+            size="30"
+            value="unknown"
+            label="JBrowse Hub Name"
+        />
+        <param
+            format="fasta"
+            name="fasta_file"
+            type="data"
+            label="Reference genome"
+        />
+        <param
+            name="apollo_host"
+            type="text"
+            label="Apollo host"
+        />
+        <conditional name="apollo_users_settings">
+            <param name="apollo_users_selector" type="select" label="Create or specify your Apollo account">
+                <option value="no" selected="true">Use existing demo user account (your Galaxy email address will be used for Apollo; password: gonramp)</option>
+                <option value="yes">Create or use your own Apollo account</option>
+            </param>
+            <!-- TODO: Avoid redundancy here -->
+            <when value="yes">
+                <param
+                    name="firstname"
+                    type="text"
+                    label="First Name"
+                />
+                <param
+                    name="lastname"
+                    type="text"
+                    label="Last Name"
+                />
+                <param
+                    name="user_email"
+                    type="text"
+                    label="Email Address"
+                />
+                <param
+                    name="password"
+                    type="text"
+                    label="Password"
+                />
+            </when>
+            <when value="no">
+                <param name="default_user" type="hidden"
+                       value="false">
+                </param>
+            </when>
+        </conditional>
+
+        <repeat name="group" title="New group">
+            <param type="text" name="group_name" label="Group name" value="Default group"/>
+            <repeat name="format" title="New track">
+                <conditional name="formatChoice">
+                    <param name="format_select" type="select" label="Format">
+                        <option value="bam" selected="true">BAM</option>
+                        <option value="bed">BED</option>
+                        <option value="blastxml">BlastXML</option>
+                        <option value="bigwig">BigWig</option>
+                        <option value="gff3">GFF3</option>
+                        <option value="gtf">GTF</option>
+                    </param>
+
+                    <when value="bam">
+                        <param
+                            format="bam"
+                            name="BAM"
+                            type="data"
+                            label="BAM File"
+                        />
+                        <param name="longLabel" type="text" size="30" value="Sequence Alignment" label="Track label" />
+                        <param name="track_color" type="color" label="Track color" value="#000000">
+                            <sanitizer>
+                                <valid initial="string.letters,string.digits">
+                                    <add value="#"/>
+                                </valid>
+                            </sanitizer>
+                        </param>
+                    </when>
+                    <when value="bed">
+                        <conditional name="bedChoice">
+                            <param name="bed_select" type="select" label="Bed Choice">
+                                <option value="bed_generic">BED format</option>
+                                <option value="bed_simple_repeats_option">BED Simple repeat (bed4+12 / simpleRepeat.as)</option>
+                                <option value="bed_splice_junctions_option">BED Splice junctions (bed12+1 / spliceJunctions.as)</option>
+                                <option value="bed_blast_alignment_option">Blast alignments (bed12+12 / bigPsl.as)</option>
+                                <option value="bed_blat_alignment_option">BLAT alignments (bigPsl / bigPsl.as)</option>
+                            </param>
+                            <when value="bed_generic">
+                                <param
+                                    format="bed"
+                                    name="BED_generic"
+                                    type="data"
+                                    label="Bed File"
+                                />
+                            </when>
+                            <when value="bed_simple_repeats_option">
+                                <param
+                                    format="bed"
+                                    name="BED_simple_repeats"
+                                    type="data"
+                                    label="Bed Simple Repeats (Bed4+12) File"
+                                />
+                            </when>
+                            <when value="bed_splice_junctions_option">
+                                <param
+                                    format="bed"
+                                    name="BED_splice_junctions"
+                                    type="data"
+                                    label="Bed Splice Junctions (Bed12+1) File"
+                                />
+                            </when>
+                            <when value="bed_blast_alignment_option">
+                                <param
+                                    format="bed"
+                                    name="BED_blast_alignment"
+                                    type="data"
+                                    label="Bed Blast Alignments (Bed12+12) File"
+                                />
+                            </when>
+                            <when value="bed_blat_alignment_option">
+                                <param
+                                    format="bed"
+                                    name="BED_blat_alignment"
+                                    type="data"
+                                    label="Bed BLAT Alignments (bigPsl) File"
+                                />
+                            </when>
+                        </conditional>
+                        <param name="longLabel" type="text" size="30" label="Track label" />
+                        <param name="track_color" type="color" label="Track color" value="#000000">
+                            <sanitizer>
+                                <valid initial="string.letters,string.digits">
+                                    <add value="#"/>
+                                </valid>
+                            </sanitizer>
+                        </param>
+                    </when>
+                    <when value="blastxml">
+                        <param
+                            format="blastxml"
+                            name="BlastXML"
+                            type="data"
+                            label="Blast Alignments File"
+                        />
+                        <param name="longLabel" type="text" size="30" value="Blast Alignment" label="Track label" />
+                        <param name="track_color" type="color" label="Track color" value="#000000">
+                            <sanitizer>
+                                <valid initial="string.letters,string.digits">
+                                    <add value="#"/>
+                                </valid>
+                            </sanitizer>
+                        </param>
+                    </when>
+                    <when value="bigwig">
+                        <param
+                            format="bigwig"
+                            name="BIGWIG"
+                            type="data"
+                            label="BIGWIG File"
+                        />
+                        <param name="longLabel" type="text" size="30" value="Sequence Coverage" label="Track label" />
+                        <param name="track_color" type="color" label="Track color" value="#000000">
+                            <sanitizer>
+                                <valid initial="string.letters,string.digits">
+                                    <add value="#"/>
+                                </valid>
+                            </sanitizer>
+                        </param>
+                        <param name="pos_color" type="color" label="Positive Coverage Color" value="#FFA600">
+                            <sanitizer>
+                                <valid initial="string.letters,string.digits">
+                                    <add value="#"/>
+                                </valid>
+                            </sanitizer>
+                        </param>
+                        <param name="neg_color" type="color" label="Negative Coverage Color" value="#005EFF">
+                            <sanitizer>
+                                <valid initial="string.letters,string.digits">
+                                    <add value="#"/>
+                                </valid>
+                            </sanitizer>
+                        </param>
+                    </when>
+                    <when value="gff3">
+                        <conditional name="gff3Choice">
+                            <param name="gff3_select" type="select" label="gff3 type">
+                                <option value="gff3_generic">GFF3 format</option>
+                                <option value="gff3_transcript">GFF3 format output from gene prediction tools (e.g. Augustus), structure: gene->transcript->CDS</option>
+                                <option value="gff3_mrna">GFF3 format output from gene prediction tools (e.g. SNAP), structure: gene->mRNA->CDS</option>
+                            </param>
+                            <when value="gff3_generic">
+                                <param
+                                    format="gff3"
+                                    name="GFF3_generic"
+                                    type="data"
+                                    label="GFF3 File"
+                                />
+                            </when>
+                            <when value="gff3_transcript">
+                                <param
+                                    format="gff3"
+                                    name="GFF3_transcript"
+                                    type="data"
+                                    label="GFF3 File from gene prediction"
+                                />
+                            </when>
+                            <when value="gff3_mrna">
+                                <param
+                                    format="gff3"
+                                    name="GFF3_mrna"
+                                    type="data"
+                                    label="GFF3 File from gene prediction"
+                                />
+                            </when>
+                        </conditional>
+                        <param name="longLabel" type="text" size="30" value="Gene Prediction" label="Track label" />
+                        <param name="track_color" type="color" label="Track color" value="#daa520">
+                            <sanitizer>
+                                <valid initial="string.letters,string.digits">
+                                    <add value="#"/>
+                                </valid>
+                            </sanitizer>
+                        </param>
+                    </when>
+                    <when value="gtf">
+                        <param
+                            format="gtf"
+                            name="GTF"
+                            type="data"
+                            label="GTF File"
+                        />
+                        <param name="longLabel" type="text" size="30" value="Assembled Transcripts" label="Track label" />
+                        <param name="track_color" type="color" label="Track color" value="#000000">
+                            <sanitizer>
+                                <valid initial="string.letters,string.digits">
+                                    <add value="#"/>
+                                </valid>
+                            </sanitizer>
+                        </param>
+                    </when>
+                </conditional>
+            </repeat>
+        </repeat>
+        <conditional name="advanced_options">
+            <param name="advanced_options_selector" type="select" label="Advanced options">
+                <option value="off" selected="true">Hide advanced options</option>
+                <option value="on">Display advanced options</option>
+            </param>
+            <!-- TODO: Avoid redundancy here -->
+            <when value="on">
+                <param name="debug_mode" type="select" label="Activate debug mode">
+                    <option value="false" selected="true">No</option>
+                    <option value="true">Yes</option>
+                    <help>
+                        Use this option if you are a G-OnRamp developer
+                    </help>
+                </param>
+            </when>
+            <when value="off">
+                <param name="debug_mode" type="hidden"
+                       value="false">
+                </param>
+            </when>
+        </conditional>
+    </inputs>
+
+    <outputs>
+        <data format="jbrowsehub" name="output" label="${tool.name}" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="fasta_file" value="dbia3/raw/dbia3.fa" />
+            <param name="genome_name" value="unknown" />
+            <param name="group_name" value="Default group"/>
+            <param name="format_select" value="bam" />
+            <param name="BAM" value="dbia3/raw/HISAT.bam" />
+            <param name="longLabel" value="" />
+            <output name="output" file="JBrowse_Archive_Creator_html.html" />
+        </test>
+    </tests>
+    <help>
+        This Galaxy tool creates a JBrowse hub that includes the binary and JSON datasets needed for
+        JBrowse visualization.
+    </help>
+    <citations>
+    </citations>
+</tool>
\ No newline at end of file
--- a/jbrowse_hub.py Wed Jul 12 12:55:27 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,176 +0,0 @@ -#!/usr/bin/env python - -import sys -import argparse -import json -import utils -import trackObject -import TrackHub - - - -def main(argv): - parser = argparse.ArgumentParser(description='Create a hub to display in jbrowse.') - - # Reference genome mandatory - parser.add_argument('-f', '--fasta', help='Fasta file of the reference genome (Required)') - - # Genome name - parser.add_argument('-g', '--genome_name', help='Name of reference genome') - - # Output folder - parser.add_argument('-o', '--out', help='output html') - - # Output folder - parser.add_argument('-e', '--extra_files_path', help='Directory of JBrowse Hub folder') - - #Tool Directory - parser.add_argument('-d', '--tool_directory', help='The directory of JBrowse file convertion scripts and UCSC tools') - - #GFF3 - parser.add_argument('--gff3', action='append', help='GFF3 format') - - # GFF3 structure: gene->transcription->CDS - parser.add_argument('--gff3_transcript', action='append', help='GFF3 format for gene prediction, structure: gene->transcription->CDS') - - # GFF3 structure: gene->mRNA->CDS - parser.add_argument('--gff3_mrna', action='append', help='GFF3 format for gene prediction, structure: gene->mRNA->CDS') - - # generic BED - parser.add_argument('--bed', action='append', help='BED format') - - # trfBig simple repeats (BED 4+12) - parser.add_argument('--bedSimpleRepeats', action='append', help='BED 4+12 format, using simpleRepeats.as') - - # regtools (BED 12+1) - parser.add_argument('--bedSpliceJunctions', action='append', help='BED 12+1 format, using spliceJunctions.as') - - # tblastn alignment (blastxml) - parser.add_argument('--blastxml', action='append', help='blastxml format from tblastn') - - # blat alignment (bigpsl 12+12) - parser.add_argument('--bigpsl', action='append', help='bigpsl format from blat alignment') - - # BAM format - parser.add_argument('--bam', action='append', help='BAM format from HISAT') - - # BIGWIG format - parser.add_argument('--bigwig', action='append', help='BIGWIG format to show rnaseq coverage') - - # GTF format - parser.add_argument('--gtf', action='append', help='GTF format from StringTie') - - # Metadata json format - parser.add_argument('-j', '--data_json', help='Json containing the metadata of the inputs') - - #JBrowse host - parser.add_argument('--jbrowse_host', help="JBrowse Host") - - args = parser.parse_args() - all_datatype_dictionary = dict() - - - if not args.fasta: - parser.print_help() - raise RuntimeError("No reference genome\n") - reference = args.fasta - genome = 'unknown' - out_path = 'unknown.html' - extra_files_path = '.' - tool_directory = '.' 
- jbrowse_host = '' - if args.jbrowse_host: - jbrowse_host = args.jbrowse_host - if args.genome_name: - genome = args.genome_name - if args.out: - out_path = args.out - if args.extra_files_path: - extra_files_path = args.extra_files_path - - #tool_directory not work for Galaxy tool, all tools need to exist in the current PATH, deal with it with tool dependencies - if args.tool_directory: - tool_directory = args.tool_directory - - #Calculate chromsome sizes using genome reference and uscs tools - chrom_size = utils.getChromSizes(reference, tool_directory) - - #get metadata from json file - json_inputs_data = args.data_json - if json_inputs_data: - inputs_data = json.loads(json_inputs_data) - else: - inputs_data = {} - - #print inputs_data - - #Initate trackObject - all_tracks = trackObject.trackObject(chrom_size.name, genome, extra_files_path) - - array_inputs_bam = args.bam - array_inputs_bed = args.bed - array_inputs_bed_simple_repeats = args.bedSimpleRepeats - array_inputs_bed_splice_junctions = args.bedSpliceJunctions - array_inputs_bigwig = args.bigwig - array_inputs_gff3 = args.gff3 - array_inputs_gff3_transcript = args.gff3_transcript - array_inputs_gff3_mrna = args.gff3_mrna - array_inputs_gtf = args.gtf - array_inputs_blastxml = args.blastxml - array_inputs_bigpsl = args.bigpsl - - if array_inputs_bam: - all_datatype_dictionary['bam'] = array_inputs_bam - if array_inputs_bed: - all_datatype_dictionary['bed'] = array_inputs_bed - if array_inputs_bed_simple_repeats: - all_datatype_dictionary['bedSimpleRepeats'] = array_inputs_bed_simple_repeats - if array_inputs_bed_splice_junctions: - all_datatype_dictionary['bedSpliceJunctions'] = array_inputs_bed_splice_junctions - if array_inputs_bigwig: - all_datatype_dictionary['bigwig'] = array_inputs_bigwig - if array_inputs_gff3: - all_datatype_dictionary['gff3'] = array_inputs_gff3 - if array_inputs_gff3_transcript: - all_datatype_dictionary['gff3_transcript'] = array_inputs_gff3_transcript - if array_inputs_gff3_mrna: - all_datatype_dictionary['gff3_mrna'] = array_inputs_gff3_mrna - if array_inputs_gtf: - all_datatype_dictionary['gtf'] = array_inputs_gtf - if array_inputs_blastxml: - all_datatype_dictionary['blastxml'] = array_inputs_blastxml - if array_inputs_bigpsl: - all_datatype_dictionary['bigpsl'] = array_inputs_bigpsl - print "input tracks: \n", all_datatype_dictionary - - for datatype, inputfiles in all_datatype_dictionary.items(): - try: - if not inputfiles: - raise ValueError('empty input, must provide track files!\n') - except IOError: - print 'Cannot open', datatype - else: - for f in inputfiles: - #metadata = {} - #print f - #if f in inputs_data.keys(): - # metadata = inputs_data[f] - #print metadata - #Convert tracks into gff3 format - all_tracks.addToRaw(f, datatype) - - jbrowseHub = TrackHub.TrackHub(all_tracks, reference, out_path, tool_directory, genome, extra_files_path, inputs_data, jbrowse_host) - jbrowseHub.createHub() - -""" -def extractMetadata(array_inputs, inputs_data): - metadata_dict = {} - for input_false_path in array_inputs: - for key, data_value in inputs_data.items(): - if key == input_false_path: - metadata_dict[input_false_path] -""" - -if __name__ == "__main__": - main(sys.argv) -
--- a/jbrowse_hub.xml Wed Jul 12 12:55:27 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,306 +0,0 @@ -<tool id="jbrowse_hub" name="JBrowse Archive Creator" version="1.0.0"> - <description> - This Galaxy tool is used to prepare your files to be ready for displaying on JBrowse - </description> - - <requirements> - <requirement type="package" version="1.2">samtools</requirement> - <requirement type="package" version="1.9">numpy</requirement> - <requirement type="package" version="1.68">biopython</requirement> - <requirement type="package" version="1.0">ucsc_tools_340</requirement> - <requirement type="package" version="1.12.1">jbrowse_tools</requirement> - </requirements> - - <stdio> - </stdio> - - <command detect_errors="exit_code"><![CDATA[ - python $__tool_directory__/jbrowse_hub.py - --fasta '$reference' - --genome_name '$genome_name' - - #set galaxy_url = str($GALAXY_URL) - #set $jbrowse_url = galaxy_url.replace("8080", "80") - --jbrowse_host '$jbrowse_url' - - ## json metadata recording from Remi's hub-archive-creator.xml - #import json - #set global data_parameter_dict = {} - - ## Function to retrieve the data of the inputs - #def prepare_json($input_to_prepare, $extra_data_dict={}) - #set false_path = str($input_to_prepare) - #set name = $input_to_prepare.name - - #set data_dict = {"name": $name} - #silent data_dict.update($extra_data_dict) - - #silent $data_parameter_dict.update({$false_path: $data_dict}) - - #end def - - #for $g in $group - #for $f in $g.format - #set track_label = str($f.formatChoice.label) - #set group_name = str($g.group_name) - #set extra_data_dict = {"label" : $track_label, "category" : $group_name} - #if $f.formatChoice.format_select == 'bed' - #set track_color = str($f.formatChoice.track_color) - #silent extra_data_dict.update({"color" : $track_color}) - #if $f.formatChoice.bedChoice.bed_select == 'bed_generic_option' - --bed $f.formatChoice.bedChoice.BED_generic - #silent $prepare_json($f.formatChoice.bedChoice.BED_generic, extra_data_dict) - #elif $f.formatChoice.bedChoice.bed_select == 'bed_simple_repeats_option' - --bedSimpleRepeats $f.formatChoice.bedChoice.BED_simple_repeats - #silent $prepare_json($f.formatChoice.bedChoice.BED_simple_repeats, extra_data_dict) - #elif $f.formatChoice.bedChoice.bed_select == 'bed_splice_junctions_option' - --bedSpliceJunctions $f.formatChoice.bedChoice.BED_splice_junctions - #silent $prepare_json($f.formatChoice.bedChoice.BED_splice_junctions, extra_data_dict) - #elif $f.formatChoice.bedChoice.bed_select == 'bigpsl' - --bigpsl $f.formatChoice.bedChoice.BigPsl - #silent $prepare_json($f.formatChoice.bedChoice.BigPsl, extra_data_dict) - #end if - #end if - #if $f.formatChoice.format_select == 'bam' - --bam $f.formatChoice.BAM - #silent $prepare_json($f.formatChoice.BAM, extra_data_dict) - #end if - #if $f.formatChoice.format_select == 'gff3' - #set track_color = str($f.formatChoice.track_color) - #silent extra_data_dict.update({"color" : $track_color}) - #if $f.formatChoice.gff3Choice.gff3_select == 'gff3_generic' - --gff3 $f.formatChoice.gff3Choice.GFF3_generic - #silent $prepare_json($f.formatChoice.gff3Choice.GFF3_generic, extra_data_dict) - #elif $f.formatChoice.gff3Choice.gff3_select == 'gff3_transcript' - --gff3_transcript $f.formatChoice.gff3Choice.GFF3_transcript - #silent $prepare_json($f.formatChoice.gff3Choice.GFF3_transcript, extra_data_dict) - #elif $f.formatChoice.gff3Choice.gff3_select == 'gff3_mrna' - --gff3_mrna $f.formatChoice.gff3Choice.GFF3_mrna - #silent 
$prepare_json($f.formatChoice.gff3Choice.GFF3_mrna, extra_data_dict) - #end if - #end if - #if $f.formatChoice.format_select == 'blastxml' - --blastxml $f.formatChoice.BlastXML - #silent $prepare_json($f.formatChoice.BlastXML, extra_data_dict) - #end if - #if $f.formatChoice.format_select == 'gtf' - --gtf $f.formatChoice.GTF - #set track_color = str($f.formatChoice.track_color) - #silent extra_data_dict.update({"color" : $track_color}) - #silent $prepare_json($f.formatChoice.GTF, extra_data_dict) - #end if - #if $f.formatChoice.format_select == 'bigwig' - --bigwig $f.formatChoice.BIGWIG - #set pos_color = str($f.formatChoice.pos_color) - #set neg_color = str($f.formatChoice.neg_color) - #silent $extra_data_dict.update({"style" : {"pos_color" : $pos_color, "neg_color" : $neg_color}}) - #silent $prepare_json($f.formatChoice.BIGWIG, extra_data_dict) - #end if - #end for - #end for - - #set all_data_json = json.dumps($data_parameter_dict) - -j '$all_data_json' - -e '$output.extra_files_path' - -o '$output' - - ]]></command> - - <inputs> - <param name="GALAXY_URL" type="baseurl" value="" /> - <param name="reference" type="data" format="fasta" label="Reference Genome" /> - <param name="genome_name" type="text" size="30" value="unknown" label="Genome name" /> - <repeat name="group" title="New group"> - <param type="text" name="group_name" label="Group name" value="Default group"/> - <repeat name="format" title="New track"> - <conditional name="formatChoice"> - <param name="format_select" type="select" label="Format"> - <option value="bam" selected="true">BAM</option> - <option value="bed">BED</option> - <option value="blastxml">BlastXML</option> - <option value="bigwig">BigWig</option> - <option value="gff3">GFF3</option> - <option value="gtf">GTF</option> - </param> - - <when value="bam"> - <param - format="bam" - name="BAM" - type="data" - label="BAM File" - /> - <param name="label" type="text" size="30" value = "Sequence Alignment" label="Track name" /> - </when> - <when value="bed"> - <conditional name="bedChoice"> - <param name="bed_select" type="select" label="Bed Choice"> - <option value="bed_generic_option">BED format</option> - <option value="bed_simple_repeats_option">BED Simple repeat (bed4+12 / simpleRepeat.as)</option> - <option value="bed_splice_junctions_option">BED Splice junctions (bed12+1 / spliceJunctions.as)</option> - <option value="bigpsl">Blat Alignment (bed12+12 / bigPsl.as)</option> - </param> - <when value="bed_generic_option"> - <param - format="bed" - name="BED_generic" - type="data" - label="Bed File" - /> - </when> - <when value="bed_simple_repeats_option"> - <param - format="bed" - name="BED_simple_repeats" - type="data" - label="Bed Simple Repeats (Bed4+12) File" - /> - </when> - <when value="bed_splice_junctions_option"> - <param - format="bed" - name="BED_splice_junctions" - type="data" - label="Bed Splice Junctions (Bed12+1) File" - /> - </when> - <when value="bigpsl"> - <param - format="bed" - name="BigPsl" - type="data" - label="Blat Alignments File" - /> - </when> - </conditional> - <param name="label" type="text" size="30" value="BED file" label="Track name" /> - <param name="track_color" type="color" label="Track color" value="#daa520"> - <sanitizer> - <valid initial="string.letters,string.digits"> - <add value="#"/> - </valid> - </sanitizer> - </param> - </when> - <when value="blastxml"> - <param - format="blastxml" - name="BlastXML" - type="data" - label="Blast Alignments File" - /> - <param name="label" type="text" size="30" value="Blast Alignment" 
label="Track name" /> - <param name="track_color" type="color" label="Track color" value="#daa520"> - <sanitizer> - <valid initial="string.letters,string.digits"> - <add value="#"/> - </valid> - </sanitizer> - </param> - </when> - <when value="bigwig"> - <param - format="bigwig" - name="BIGWIG" - type="data" - label="BIGWIG File" - /> - <param name="label" type="text" size="30" value="Sequence Coverage" label="Track name" /> - <param name="pos_color" type="color" label="Positive Coverage Color" value="#FFA600"> - <sanitizer> - <valid initial="string.letters,string.digits"> - <add value="#"/> - </valid> - </sanitizer> - </param> - <param name="neg_color" type="color" label="Negative Coverage Color" value="#005EFF"> - <sanitizer> - <valid initial="string.letters,string.digits"> - <add value="#"/> - </valid> - </sanitizer> - </param> - </when> - <when value="gff3"> - <conditional name="gff3Choice"> - <param name="gff3_select" type="select" label="gff3 type"> - <option value="gff3_generic">GFF3 format</option> - <option value="gff3_transcript">GFF3 format output from gene prediction tools (e.g. Augustus), structure: gene->transcription->CDS</option> - <option value="gff3_mrna">GFF3 format output from gene prediction tools (e.g. SNAP), structure: gene->mRNA->CDS</option> - </param> - <when value="gff3_generic"> - <param - format="gff3" - name="GFF3_generic" - type="data" - label="GFF3 File" - /> - </when> - <when value="gff3_transcript"> - <param - format="gff3" - name="GFF3_transcript" - type="data" - label="GFF3 File from gene prediction" - /> - </when> - <when value="gff3_mrna"> - <param - format="gff3" - name="GFF3_mrna" - type="data" - label="GFF3 File from gene prediction" - /> - </when> - </conditional> - <param name="label" type="text" size="30" value="Gene Prediction" label="Track name" /> - <param name="track_color" type="color" label="Track color" value="#daa520"> - <sanitizer> - <valid initial="string.letters,string.digits"> - <add value="#"/> - </valid> - </sanitizer> - </param> - </when> - <when value="gtf"> - <param - format="gtf" - name="GTF" - type="data" - label="GTF File" - /> - <param name="label" type="text" size="30" value="Assembled Transcripts" label="Track name" /> - <param name="track_color" type="color" label="Track color" value="#daa520"> - <sanitizer> - <valid initial="string.letters,string.digits"> - <add value="#"/> - </valid> - </sanitizer> - </param> - </when> - </conditional> - </repeat> - </repeat> - </inputs> - - <outputs> - <data format="jbrowsehub" name="output" label="${tool.name}" /> - </outputs> - <tests> - <test> - <param name="reference" value="dbia3/raw/dbia3.fa" /> - <param name="genome_name" value="unknown" /> - <param name="group_name" value="Default group"/> - <param name="format_select" value="bam" /> - <param name="BAM" value="dbia3/raw/HISAT.bam" /> - <param name="label" value="" /> - <output name="output" file="JBrowse_Archive_Creator_html.html" /> - </test> - </tests> - <help> - This Galaxy tool will create a tar file which including raw datasets and json datasets that can be used for - JBrowse visualization. - </help> - <citations> - </citations> -</tool> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/logging.json Fri Oct 13 12:44:31 2017 -0400 @@ -0,0 +1,48 @@ +{ + "version": 1, + "disable_existing_loggers": false, + "formatters": { + "simple": { + "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + } + }, + + "handlers": { + "console": { + "class": "logging.StreamHandler", + "level": "INFO", + "formatter": "simple", + "stream": "ext://sys.stdout" + }, + + "console_stderr": { + "class": "logging.StreamHandler", + "level": "ERROR", + "formatter": "simple", + "stream": "ext://sys.stderr" + }, + + "debug_file_handler": { + "class": "logging.handlers.RotatingFileHandler", + "level": "DEBUG", + "formatter": "simple", + "filename": "__main__.log", + "maxBytes": 10485760, + "backupCount": 20, + "encoding": "utf8" + } + }, + + "loggers": { + "Reader": { + "level": "INFO", + "handlers": ["console"], + "propagate": "yes" + } + }, + + "root": { + "level": "DEBUG", + "handlers": ["console", "console_stderr", "debug_file_handler"] + } +} \ No newline at end of file
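For reference, this configuration is consumed with logging.config.dictConfig; util/Logger.py (added later in this changeset) wires it up with the debug flag and output-folder handling. A minimal sketch of the mechanism, assuming logging.json sits in the working directory:

    import json
    import logging
    import logging.config

    # Minimal sketch: apply logging.json, then log through the configured "Reader" logger.
    with open('logging.json') as f:
        logging.config.dictConfig(json.load(f))
    logging.getLogger('Reader').info("logging configured from logging.json")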
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spliceJunctions.as Fri Oct 13 12:44:31 2017 -0400 @@ -0,0 +1,17 @@ +table spliceJunctions +"Predicted splice junctions" + ( + string chrom; "Reference sequence chromosome or scaffold" + uint chromStart; "Start position in chromosome" + uint chromEnd; "End position in chromosome" + string name; "Name of item" + uint score; "Score from 0-1000" + char[1] strand; "+ or -" + uint thickStart; "Start of where display should be thick (start codon)" + uint thickEnd; "End of where display should be thick (stop codon)" + uint reserved; "Used as itemRgb as of 2004-11-22" + int blockCount; "Number of blocks" + int[blockCount] blockSizes; "Comma separated list of block sizes" + int[blockCount] chromStarts; "Start positions relative to chromStart" + uint junctionScore; "Number of reads supporting the splice junction" + )
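This autoSql schema describes the bed12+1 splice-junction files (e.g. regtools output) so they can be checked before conversion. A hedged sketch of validating such a file through the validateFiles wrapper added in util/subtools.py below; the file names are hypothetical and the UCSC validateFiles binary must be on PATH:

    from util import subtools

    # Hypothetical inputs: a tab-separated bed12+1 file and a chrom.sizes file.
    subtools.validateFiles('junctions.bed', 'dbia3.chrom.sizes', 'bed12+1',
                           options={'tab': True, 'autoSql': 'spliceJunctions.as'})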
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/templates/custom_track_styles.css Fri Oct 13 12:44:31 2017 -0400 @@ -0,0 +1,9 @@ +.${label}, +.plus-${label}, +.minus-${label} +{ + background-color: ${color}; + height: 90%; + top: 5%; +} +
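The template above is a Mako template with two placeholders, ${label} and ${color}; tracks/TrackStyles.py below renders and appends it once per customized track. A standalone sketch of the same rendering, with illustrative values:

    from mako.template import Template

    # Render one CSS rule for a hypothetical feature class.
    css = Template(filename='templates/custom_track_styles.css',
                   output_encoding='utf-8').render(label='gonramp_exon', color='#daa520')
    print(css)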
--- a/trackObject.py Wed Jul 12 12:55:27 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,71 +0,0 @@ -#!/usr/bin/env python - -import os -import shutil -import utils -import bedToGff3 -import blastxmlToGff3 - - -class trackObject: - def __init__(self, chrom_size, genome, extra_files_path): - self.chrom_size = chrom_size - outputDirect = os.path.join(extra_files_path, 'myHub') - self.raw_folder = os.path.join(outputDirect, 'raw') - #Store metadata of the tracks - self.tracks = [] - try: - if os.path.exists(self.raw_folder): - if os.path.isdir(self.raw_folder): - shutil.rmtree(self.raw_folder) - else: - os.remove(self.raw_folder) - os.makedirs(self.raw_folder) - except OSError as oserror: - print "Cannot create raw folder error({0}): {1}".format(oserror.errno, oserror.strerror) - - def addToRaw(self, dataFile, dataType): - """ - Convert gff3, BED, blastxml and gtf files into gff3 files - and store converted files in folder 'raw' - """ - false_path = os.path.abspath(dataFile) - fileName = os.path.basename(dataFile) - des_path = os.path.join(self.raw_folder, fileName) - track = {} - if dataType == 'bed' or dataType == 'gff3' or dataType == 'gff3_mrna' or dataType == 'gff3_transcript' or dataType == 'fasta' or dataType == 'bam' or dataType == 'bigwig': - if dataType == 'bam': - # JBrowse will raise error: not a BAM file if the filename hasn't .bam extension - extension = os.path.splitext(fileName)[1] - if extension != '.bam': - fileName = fileName + '.bam' - des_path = os.path.join(self.raw_folder, fileName) - bam_index = utils.createBamIndex(dataFile) - indexname = os.path.basename(bam_index) - des_path_for_index = os.path.join(self.raw_folder, indexname) - shutil.copyfile(bam_index, des_path_for_index) - track['index'] = indexname - - try: - shutil.copyfile(dataFile, des_path) - except shutil.Error as err1: - print "Cannot move file, error({0}: {1})".format(err1.errno, err1.strerror) - except IOError as err2: - print "Cannot move file, error({0}: {1})".format(err2.errno, err2.strerror) - elif dataType == 'bedSimpleRepeats': - bedToGff3.bedToGff3(dataFile, self.chrom_size, 'trfbig', des_path) - elif dataType == 'bedSpliceJunctions': - bedToGff3.bedToGff3(dataFile, self.chrom_size, 'regtools', des_path) - elif dataType == 'bigpsl': - bedToGff3.bedToGff3(dataFile, self.chrom_size, 'blat', des_path) - elif dataType == 'blastxml': - blastxmlToGff3.blastxml2gff3(dataFile, des_path) - elif dataType == 'gtf': - utils.gtfToGff3(dataFile, des_path, self.chrom_size) - track['fileName'] = fileName - track['dataType'] = dataType - track['false_path'] = false_path - #self.SetMetadata(track, metaData) - self.tracks.append(track) - - \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tracks/BamFeatures.py Fri Oct 13 12:44:31 2017 -0400 @@ -0,0 +1,29 @@ +#!/usr/bin/env python +import os +import json +import logging + +from TrackDb import TrackDb +from util import subtools +from util import santitizer + + +class BamFeatures(TrackDb): + def __init__(self, trackName, trackLabel, trackDataURL, trackType, dataType, extraSettings=None): + super(BamFeatures, self).__init__(trackName, trackLabel, trackDataURL, trackType, dataType, extraSettings) + + def prepareExtraSetting(self): + if 'category' not in self.extraSettings or not self.extraSettings['category']: + self.extraSettings['category'] = "Default group" + bam_track = dict() + bam_track['type'] = 'JBrowse/View/Track/Alignments2' + bam_track['storeClass'] = 'JBrowse/Store/SeqFeature/BAM' + bam_track['urlTemplate'] = os.path.join('bbi', self.trackName) + bam_track['baiUrlTemplate'] = os.path.join('bbi', self.extraSettings['index']) + bam_track['label'] = self.trackLabel + bam_track['category'] = self.extraSettings['category'] + #extraConfigs = json.dumps(bam_track) + extraConfigs = bam_track + return extraConfigs + + \ No newline at end of file
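A sketch of how a BamFeatures object might be driven (createTrackDb comes from the TrackDb base class below); the file names and settings are hypothetical:

    from tracks.BamFeatures import BamFeatures

    # Hypothetical BAM track whose data and .bai index live under the hub's bbi/ folder.
    track = BamFeatures(trackName='HISAT.bam', trackLabel='Sequence Alignment',
                        trackDataURL='bbi/HISAT.bam',
                        trackType='JBrowse/View/Track/Alignments2', dataType='bam',
                        extraSettings={'index': 'HISAT.bam.bai', 'category': 'Default group'})
    track.createTrackDb()
    # track.track_db['options'] now carries the storeClass/urlTemplate settings shown above.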
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tracks/BigwigFeatures.py Fri Oct 13 12:44:31 2017 -0400 @@ -0,0 +1,45 @@ +#!/usr/bin/env python +import os +import json +import logging + +from TrackDb import TrackDb +from util import subtools +from util import santitizer + + +class BigwigFeatures(TrackDb): + def __init__(self, trackName, trackLabel, trackDataURL, trackType, dataType, extraSettings=None): + super(BigwigFeatures, self).__init__(trackName, trackLabel, trackDataURL, trackType, dataType, extraSettings) + + def prepareExtraSetting(self): + if 'category' not in self.extraSettings or not self.extraSettings['category']: + self.extraSettings['category'] = "Default group" + if 'color' not in self.extraSettings or not self.extraSettings['color']: + self.extraSettings['style'] = {} + self.extraSettings['style']['pos_color'] = "#FFA600" + else: + self.extraSettings['style'] = {} + self.extraSettings['style']['pos_color'] = self.extraSettings['color'] + + + ''' + if 'style' not in self.extraSettings: + self.extraSettings['style'] = {} + if 'pos_color' not in self.extraSettings['style'] or self.extraSettings['style']['pos_color'] == '': + self.extraSettings['style']['pos_color'] = "#FFA600" + if 'neg_color' not in self.extraSettings['style'] or self.extraSettings['style']['neg_color'] == '': + self.extraSettings['style']['neg_color'] = "#005EFF" + ''' + bigwig_track = dict() + bigwig_track['urlTemplate'] = os.path.join('bbi', self.trackName) + bigwig_track['type'] = 'JBrowse/View/Track/Wiggle/XYPlot' + bigwig_track['storeClass'] = 'JBrowse/Store/SeqFeature/BigWig' + bigwig_track['label'] = self.trackLabel + bigwig_track['style'] = self.extraSettings['style'] + bigwig_track['category'] = self.extraSettings['category'] + #extraConfigs = json.dumps(bigwig_track) + extraConfigs = bigwig_track + return extraConfigs + + \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tracks/CanvasFeatures.py Fri Oct 13 12:44:31 2017 -0400 @@ -0,0 +1,32 @@ +#!/usr/bin/env python +import json +import logging + +from TrackDb import TrackDb +from util import subtools + + +class CanvasFeatures(TrackDb): + def __init__(self, trackName, trackLabel, trackDataURL, trackType, dataType, extraSettings=None): + super(CanvasFeatures, self).__init__(trackName, trackLabel, trackDataURL, trackType, dataType, extraSettings) + + def prepareExtraSetting(self): + """ set CanvasFeatures configuration options """ + extraConfigs = dict() + self.extraSettings["clientConfig"] = dict() + self.extraSettings["config"] = dict() + if 'color' not in self.extraSettings or not self.extraSettings['color']: + self.extraSettings["clientConfig"]['color'] = "#daa520" + else: + self.extraSettings["clientConfig"]['color'] = self.extraSettings['color'] + if 'category' not in self.extraSettings or not self.extraSettings['category']: + self.extraSettings["config"]['category'] = "Default group" + else: + self.extraSettings["config"]['category'] = self.extraSettings['category'] + if 'glyph' in self.extraSettings: + self.extraSettings["config"]['glyph'] = self.extraSettings['glyph'] + if 'transcriptType' in self.extraSettings: + self.extraSettings['config']['transcriptType'] = self.extraSettings['transcriptType'] + extraConfigs["config"] = json.dumps(self.extraSettings["config"]) + extraConfigs["clientConfig"] = json.dumps(self.extraSettings["clientConfig"]) + return extraConfigs \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tracks/HTMLFeatures.py Fri Oct 13 12:44:31 2017 -0400 @@ -0,0 +1,39 @@ +#!/usr/bin/env python +import json +import logging + +from TrackDb import TrackDb +from util import subtools +from util import santitizer + + +class HTMLFeatures(TrackDb): + def __init__(self, trackName, trackLabel, trackDataURL, trackType, dataType, extraSettings=None): + super(HTMLFeatures, self).__init__(trackName, trackLabel, trackDataURL, trackType, dataType, extraSettings) + + def prepareExtraSetting(self): + """ set HTMLFeatures configuration options """ + extraConfigs = dict() + self.extraSettings["clientConfig"] = dict() + self.extraSettings["config"] = dict() + if 'type' in self.extraSettings: + extraConfigs["type"] = self.extraSettings['type'] + if 'color' in self.extraSettings and self.extraSettings['color']: + extraConfigs['feature_color'] = self.extraSettings['color'] + else: + extraConfigs['feature_color'] = "#000000" + #self.extraSettings['clientConfig']['color'] = self.extraSettings['color'] + if 'subfeatureClasses' in self.extraSettings: + subfeature_css_class = santitizer.sanitize_name(self.trackLabel + "_" + self.extraSettings['subfeatureClasses']) + extraConfigs['subfeatureClasses'] = {self.extraSettings['subfeatureClasses']: subfeature_css_class} + + if 'category' not in self.extraSettings or not self.extraSettings['category']: + self.extraSettings['config']['category'] = "Default group" + else: + self.extraSettings['config']['category'] = self.extraSettings['category'] + + extraConfigs['config'] = json.dumps(self.extraSettings["config"]) + extraConfigs['clientConfig'] = json.dumps(self.extraSettings["clientConfig"]) + return extraConfigs + + \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tracks/TrackDb.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,53 @@
+#!/usr/bin/python
+"""
+Superclass of the track configuration objects
+"""
+import os
+import abc
+from abc import ABCMeta
+import collections
+import json
+import logging
+from util import santitizer
+
+class TrackDb(object):
+    """Base class holding the common metadata of a JBrowse track"""
+    __metaclass__ = ABCMeta
+
+    def __init__(self, trackName, trackLabel, trackDataURL, trackType, dataType, extraSettings=None):
+        #super(TrackDb, self).__init__()
+
+        not_init_message = "The {0} is not initialized."
+        if trackName is None:
+            raise TypeError(not_init_message.format('trackName'))
+        if trackLabel is None:
+            raise TypeError(not_init_message.format('trackLabel'))
+        if trackType is None:
+            raise TypeError(not_init_message.format('trackType'))
+        self.trackName = trackName
+        self.trackLabel = trackLabel
+        self.trackDataURL = trackDataURL
+        self.trackType = trackType
+        self.dataType = dataType
+        self.extraSettings = extraSettings
+        self.logger = logging.getLogger(__name__)
+        #self.createTrackDb()
+
+    def createTrackDb(self):
+        self.track_db = collections.OrderedDict([("track", self.trackName),
+                                                 ("trackLabel", self.trackLabel),
+                                                 ("trackDataURL", self.trackDataURL),
+                                                 ("dataType", self.dataType),
+                                                 ("trackType", self.trackType)]
+                                                )
+
+        extraConfigs = self.prepareExtraSetting()
+        self.logger.debug("Generate extraConfigs = %s", json.dumps(extraConfigs))
+        self.track_db["options"] = extraConfigs
+        #print self.track_db
+        self.logger.debug("TrackDb object is created track_db = %s ", json.dumps(self.track_db))
+
+    @abc.abstractmethod
+    def prepareExtraSetting(self):
+        """ set optional configurations for the track """
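Concrete tracks only have to implement prepareExtraSetting; everything else (argument validation, the ordered track_db dict, logging) is inherited. A minimal, hypothetical subclass for illustration:

    from tracks.TrackDb import TrackDb

    class MinimalFeatures(TrackDb):
        """Hypothetical subclass: the only required hook is prepareExtraSetting."""
        def prepareExtraSetting(self):
            # Fall back to the same default category used by the shipped subclasses.
            return {'category': (self.extraSettings or {}).get('category', 'Default group')}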
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tracks/TrackStyles.py Fri Oct 13 12:44:31 2017 -0400 @@ -0,0 +1,58 @@ +#!/usr/bin/env python +import os +import json +import logging +from mako.lookup import TemplateLookup + +class TrackStyles(object): + def __init__(self, tool_directory, species_folder, trackListFile, cssFolderName="css", cssFileName="custom_track_styles.css"): + self.logger = logging.getLogger(__name__) + self.tool_directory = tool_directory + self.species_folder = species_folder + self.trackList = trackListFile + self.cssFolderName = cssFolderName + self.cssFileName = cssFileName + self.cssFilePath = self._createCssFile() + self.cssTemplate = self._getCssTemplate() + self._addCssToTrackList() + + + def addCustomColor(self, feature_class_name, feature_color): + with open(self.cssFilePath, 'a+') as css: + htmlMakoRendered = self.cssTemplate.render( + label = feature_class_name, + color = feature_color + ) + css.write(htmlMakoRendered) + self.logger.debug("create customized track css class: cssFilePath= %s", self.cssFilePath) + + + def _createCssFile(self): + cssFolderPath = os.path.join(self.species_folder, self.cssFolderName) + cssFilePath = os.path.join(cssFolderPath, self.cssFileName) + if not os.path.exists(cssFilePath): + if not os.path.exists(cssFolderPath): + os.mkdir(cssFolderPath) + os.mknod(cssFilePath) + return cssFilePath + + def _getCssTemplate(self): + mylookup = TemplateLookup(directories=[os.path.join(self.tool_directory, 'templates')], + output_encoding='utf-8', encoding_errors='replace') + cssTemplate = mylookup.get_template("custom_track_styles.css") + return cssTemplate + + + def _addCssToTrackList(self): + with open(self.trackList, 'r+') as track: + data = json.load(track) + css_path = os.path.join('data', self.cssFolderName, self.cssFileName) + data['css'] = {'url': css_path} + json_string = json.dumps(data, indent=4, separators=(',', ': ')) + track.seek(0) + track.write(json_string) + track.truncate() + self.logger.debug("added customized css url to trackList.json") + + + \ No newline at end of file
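Usage sketch, assuming a species folder that already contains a trackList.json; note the constructor creates the css/ folder and file itself via os.mknod, so a Linux host is assumed, and all paths here are hypothetical:

    from tracks.TrackStyles import TrackStyles

    styles = TrackStyles('/path/to/tool', '/path/to/myHub/dbia3',
                         '/path/to/myHub/dbia3/trackList.json')
    styles.addCustomColor('gonramp_exon', '#daa520')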
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/trf_simpleRepeat.as Fri Oct 13 12:44:31 2017 -0400 @@ -0,0 +1,20 @@ +table simpleRepeat +"Describes the Simple Tandem Repeats" + ( + string chrom; "Reference sequence chromosome or scaffold" + uint chromStart; "Start position in chromosome" + uint chromEnd; "End position in chromosome" + string name; "Simple Repeats tag name" + uint period; "Length of repeat unit" + float copyNum; "Mean number of copies of repeat" + uint consensusSize; "Length of consensus sequence" + uint perMatch; "Percentage Match" + uint perIndel; "Percentage Indel" + uint score; "Alignment Score = 2*match-7*mismatch-7*indel; minscore=50" + uint A; "Percent of A's in repeat unit" + uint C; "Percent of C's in repeat unit" + uint G; "Percent of G's in repeat unit" + uint T; "Percent of T's in repeat unit" + float entropy; "Entropy" + lstring sequence; "Sequence of repeat unit element" + )
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/util/Logger.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,38 @@
+import os
+import sys
+import json
+import logging
+import logging.config
+
+#from util.Filters import TraceBackFormatter
+
+class Logger(object):
+    def __init__(self, tool_directory, debug="False", extra_files_path=None):
+        self.tool_directory = tool_directory
+        self.default_level = logging.INFO
+        self.debug = debug
+        self.extra_files_path = extra_files_path
+
+    def setup_logging(self):
+        """Set up the logging configuration
+        reference: https://fangpenlin.com/posts/2012/08/26/good-logging-practice-in-python/
+        """
+        config_path = os.path.join(self.tool_directory, 'logging.json')
+        default_level = logging.INFO
+        if self.debug.lower() == "true":
+            default_level = logging.DEBUG
+        if os.path.exists(config_path):
+            with open(config_path, 'rt') as f:
+                config = json.load(f)
+            config["handlers"]["console"]["level"] = default_level
+            if self.extra_files_path:
+                for i in config["handlers"]:
+                    if "filename" in config["handlers"][i]:
+                        config["handlers"][i]["filename"] = os.path.join(self.extra_files_path, config["handlers"][i]["filename"])
+            else:
+                logging.warning("The extra files path is not set. Log files will be written to the current working directory instead of the final output folder")
+            logging.config.dictConfig(config)
+        else:
+            logging.basicConfig(level=default_level)
+            logging.warning("Cannot find the logging configuration file!\n")
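A short usage sketch; the wrapper passes debug as a string, and the paths are hypothetical:

    from util.Logger import Logger

    Logger('/path/to/tool', debug="True",
           extra_files_path='/path/to/output').setup_logging()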
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/util/Reader.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,146 @@
+import json
+import logging
+import codecs
+
+
+# Internal dependencies
+from datatypes.binary.Bam import Bam
+from datatypes.binary.BigWig import BigWig
+from datatypes.interval.Bed import Bed
+from datatypes.interval.BedSimpleRepeats import BedSimpleRepeats
+from datatypes.interval.BedSpliceJunctions import BedSpliceJunctions
+from datatypes.interval.BlastXml import BlastXml
+from datatypes.interval.Gff3 import Gff3
+from datatypes.interval.Gff3_mrna import Gff3_mrna
+from datatypes.interval.Gff3_transcript import Gff3_transcript
+from datatypes.interval.Gtf import Gtf
+from datatypes.interval.GtfStringTie import GtfStringTie
+from datatypes.interval.BigPsl import BigPsl
+from datatypes.interval.BedBlatAlignments import BedBlatAlignments
+from datatypes.interval.BedBlastAlignments import BedBlastAlignments
+from datatypes.interval.Psl import Psl
+from datatypes.sequence.Fasta import Fasta
+from apollo.ApolloUser import ApolloUser
+from util import santitizer
+
+class Reader(object):
+
+    DATATYPE_CLASS = [Bam, BigWig, Bed, BedSimpleRepeats,
+                      BedSpliceJunctions, BigPsl, BedBlatAlignments, BedBlastAlignments,
+                      BlastXml, Gff3, Gff3_mrna, Gff3_transcript, Gtf, GtfStringTie, Psl, Fasta]
+
+    def __init__(self, input_json_file):
+        self.inputFile = input_json_file
+        self.args = self.loadJson()
+        self.logger = logging.getLogger(__name__)
+
+
+    def loadJson(self):
+        try:
+            data_file = codecs.open(self.inputFile, 'r', 'utf-8')
+            return json.load(data_file)
+        except IOError:
+            print "Cannot find JSON file\n"
+            exit(1)
+
+    def getToolDir(self):
+        try:
+            return self.args["tool_directory"]
+        except KeyError:
+            print ("tool_directory is not defined in the input file!")
+            exit(1)
+
+    def getExtFilesPath(self):
+        try:
+            return self.args["extra_files_path"]
+        except KeyError:
+            print ("extra_files_path is not defined in the input file!")
+            exit(1)
+
+    def getUserEmail(self):
+        try:
+            return self.args["user_email"]
+        except KeyError:
+            print ("user_email is not defined in the input file!")
+            exit(1)
+
+    def getDebugMode(self):
+        try:
+            return self.args["debug_mode"]
+        except KeyError:
+            print ("debug_mode is not defined in the input file!")
+            exit(1)
+
+    def getTrackType(self):
+        track_type = self.args.get("track_type")
+        return track_type
+
+    def getApolloHost(self):
+        apollo_host = self.args.get("apollo_host")
+        return apollo_host
+
+
+    def getRefGenome(self):
+        array_inputs_reference_genome = self.args["fasta"]
+        # TODO: Replace these with the object Fasta
+        input_fasta_file = array_inputs_reference_genome["false_path"]
+        input_fasta_file_name = santitizer.sanitize_name_input(array_inputs_reference_genome["name"])
+        genome_name = santitizer.sanitize_name_input(self.args["genome_name"])
+        reference_genome = Fasta(input_fasta_file,
+                                 input_fasta_file_name, genome_name)
+        return reference_genome
+
+    def getApolloUser(self):
+        user_info = self.args.get("apollo_user")
+        if not user_info:
+            firstname = "demo"
+            lastname = "user"
+            password = "gonramp"
+            user_email = self.getUserEmail()
+        else:
+            firstname = user_info['firstname']
+            lastname = user_info['lastname']
+            user_email = user_info['user_email']
+            password = user_info['password']
+        apollo_user = ApolloUser(user_email, firstname, lastname, password)
+        return apollo_user
+
+    def getTracksData(self):
+        all_datatype_dictionary = dict()
+        for datatype in self.DATATYPE_CLASS:
+            class_name = datatype.__name__
+            array_inputs = self.args.get(str(class_name))
+            if array_inputs:
+                self.logger.debug("Creating %s objects\n", class_name)
+                self.logger.debug("array_inputs: %s", array_inputs)
+                all_datatype_dictionary.update(self.create_ordered_datatype_objects(datatype, array_inputs))
+
+        return all_datatype_dictionary
+
+    def create_ordered_datatype_objects(self, ExtensionClass, array_inputs):
+        """
+        Create all the necessary files / folders for the given Datatype, for TrackHub,
+        and update the dictionary of datatypes
+
+        :param ExtensionClass:
+        :param array_inputs:
+        :type ExtensionClass: Datatype
+        :type array_inputs: list[string]
+        """
+
+        datatype_dictionary = {}
+
+        # TODO: Optimize this double loop
+        for input_data in array_inputs:
+            input_false_path = input_data["false_path"]
+            input_data["name"] = santitizer.sanitize_name_input(input_data["name"])
+            extensionObject = ExtensionClass(input_false_path, input_data)
+            extensionObject.generateCustomTrack()
+            datatype_dictionary.update({input_data["order_index"]: extensionObject})
+            self.logger.debug("%s object: %s has been created", ExtensionClass, input_data["name"])
+        return datatype_dictionary
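A sketch of the intended call sequence, assuming input.json is the payload the Galaxy wrapper writes for its -j option (path hypothetical; note getTracksData also triggers generateCustomTrack on every datatype object):

    from util.Reader import Reader

    reader = Reader('input.json')
    reference = reader.getRefGenome()
    apollo_user = reader.getApolloUser()
    tracks = reader.getTracksData()   # {order_index: datatype object}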
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/util/index/DatabaseIndex.py Fri Oct 13 12:44:31 2017 -0400 @@ -0,0 +1,44 @@ +#!/usr/bin/python + +import collections +from ExternIndex import ExternIndex + +class DatabaseIndex(ExternIndex): + def __init__(self, database, **args): + self.database = database + self.seqType=args.get("seqType") + self.useIframe=args.get("useIframe") + self.iframeHeight=args.get("iframeHeight") + self.iframeWidth=args.get("iframeWidth") + + def setExtLink(self): + return self.setDatabaseLink(self.database, self.seqType, self.useIframe, self.iframeHeight, self.iframeWidth) + + + def setDatabaseLink(self, database, seqType=None, useIframe=None, iframeHeight=None, iframeWidth=None): + database_settings = collections.OrderedDict() + if "NCBI" in database: + if not seqType: + database_settings["url"] = "https://www.ncbi.nlm.nih.gov/gquery/?term=$$" + elif seqType == 2: + database_settings["url"] = "https://www.ncbi.nlm.nih.gov/protein/$$" + elif seqType == 1: + database_settings["url"] = "https://www.ncbi.nlm.nih.gov/nuccore/$$" + else: + raise Exception("Sequence Type {0} is not valid, should be either protein (seqType==2) or nucleotide (seqType==1). Stopping the application".format(seqType)) + elif "UniProt" in database: + database_settings["url"] = "http://www.uniprot.org/uniprot/$$" + elif "FlyBase" in database: + database_settings["url"] = "http://flybase.org/reports/$$" + else: + database_settings["url"] = "https://www.ncbi.nlm.nih.gov/gquery/?term=$$" + database_settings["urlLabel"] = database + " Details:" + if useIframe or useIframe is None: + database_settings["iframeUrl"] = database_settings["url"] + if not iframeHeight: + iframeHeight = "600" + if not iframeWidth: + iframeWidth = "800" + database_settings["iframeOptions"] = "height= %s width= %s" % (iframeHeight, iframeWidth) + return database_settings +
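For example (values illustrative), a nucleotide NCBI link resolves as follows; $$ is the placeholder JBrowse substitutes with the feature name:

    from util.index.DatabaseIndex import DatabaseIndex

    settings = DatabaseIndex('NCBI', seqType=1).setExtLink()
    # settings['url'] == 'https://www.ncbi.nlm.nih.gov/nuccore/$$'
    # settings['iframeOptions'] == 'height= 600 width= 800'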
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/util/index/ExternIndex.py Fri Oct 13 12:44:31 2017 -0400 @@ -0,0 +1,16 @@ +#!/usr/bin/python +import collections +import abc +from abc import ABCMeta + +class ExternIndex(object): + __metaclass__ = ABCMeta + + @abc.abstractmethod + def __init__(self): + """init""" + + @abc.abstractmethod + def setExtLink(self): + """set external link""" + \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/util/index/TrixIndex.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,55 @@
+#!/usr/bin/python
+
+import os
+import collections
+import shutil
+import logging
+from ExternIndex import ExternIndex
+
+class TrixIndex(ExternIndex):
+    def __init__(self, indexIx, indexIxx, trackName, mySpecieFolderPath, trixId, **args):
+        self.logger = logging.getLogger(__name__)
+        self.indexIx = indexIx
+        self.indexIxx = indexIxx
+        self.trackName = trackName
+        self.mySpecieFolderPath = mySpecieFolderPath
+        self.trixId = trixId.strip()
+        if not self.trixId:
+            self.logger.error("No Trix identifier was specified. To use a TRIX index, you must provide the identifier")
+            exit(1)
+        if "default_index" in args:
+            self.default_index = args["default_index"]
+        else:
+            self.default_index = None
+        self.index_settings = collections.OrderedDict()
+
+    def setExtLink(self):
+        self.setSearchIndex()
+        self.moveIndexFile()
+        self.index_settings["searchTrix"] = "trix/%s" % self.indexIxName
+        return self.index_settings
+
+    def moveIndexFile(self):
+        indexFolder = os.path.join(self.mySpecieFolderPath, 'trix')
+        self.indexIxName = "".join((self.trackName, ".ix"))
+        self.indexIxxName = "".join((self.trackName, ".ixx"))
+        if not os.path.exists(indexFolder):
+            os.makedirs(indexFolder)
+
+        # Copy the index files to the index folder
+        self.indexIxPath = os.path.join(indexFolder, self.indexIxName)
+        shutil.copyfile(self.indexIx, self.indexIxPath)
+        self.indexIxxPath = os.path.join(indexFolder, self.indexIxxName)
+        shutil.copyfile(self.indexIxx, self.indexIxxPath)
+
+    def setSearchIndex(self):
+        if self.default_index:
+            set_index = set()
+            set_index.add(self.trixId)
+            set_index.add(self.default_index)
+            search_index = ",".join(set_index)
+        else:
+            search_index = self.trixId
+        self.logger.debug("trixId= %s, searchIndex= %s", self.trixId, search_index)
+        self.index_settings["searchIndex"] = search_index
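Usage sketch with hypothetical .ix/.ixx files (as produced by the UCSC ixIxx tool); the files are copied under <species folder>/trix/ and the returned settings are merged into the track configuration:

    from util.index.TrixIndex import TrixIndex

    index = TrixIndex('myTrack.ix', 'myTrack.ixx', 'gonramp_myTrack',
                      '/path/to/myHub/dbia3', 'name', default_index='name')
    settings = index.setExtLink()
    # {'searchIndex': 'name', 'searchTrix': 'trix/gonramp_myTrack.ix'}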
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/util/santitizer.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,70 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
+"""
+This module gathers the helper functions used to sanitize
+track, file and group names
+"""
+
+import logging
+import os
+import subprocess
+import sys
+import string
+import tempfile
+
+
+def prefixTrackName(filename):
+    """
+    Sanitize the track name. A track name must begin with a letter and
+    contain only the following chars: [a-zA-Z0-9_].
+    See the "track" Common settings at:
+    https://genome.ucsc.edu/goldenpath/help/trackDb/trackDbHub.html#bigPsl_-_Pairwise_Alignments
+    The sanitization is skipped for the cytoBandIdeo track
+    """
+    if filename == 'cytoBandIdeo':
+        return filename
+    valid_chars = "_%s%s" % (string.ascii_letters, string.digits)
+    sanitized_name = ''.join([c if c in valid_chars else '_' for c in filename])
+    sanitized_name = "gonramp_" + sanitized_name
+    return sanitized_name
+
+def sanitize_name_input(string_to_sanitize):
+    """
+    Sanitize the string passed in parameter by replacing '/' and ' ' with '_'
+
+    :param string_to_sanitize:
+    :return :
+
+    :Example:
+
+    >>> sanitize_name_input('this/is an//example')
+    'this_is_an__example'
+    """
+    return string_to_sanitize \
+        .replace("/", "_") \
+        .replace(" ", "_")
+
+def sanitize_name_inputs(inputs_data):
+    """
+    Sanitize the value of the "name" key for every entry of the dictionary passed in parameter.
+
+    This is needed because outputs from Galaxy, or even plain file names from user inputs, can contain spaces.
+    They can also contain the '/' character, which would break the use of os.path functions.
+
+    :param inputs_data: dict[string, dict[string, string]]
+    """
+    for key in inputs_data:
+        inputs_data[key]["name"] = sanitize_name_input(inputs_data[key]["name"])
+
+def sanitize_group_name(group_name):
+    return group_name.lower().replace(' ', '_')
+
+def sanitize_name(input_name):
+    """
+    Galaxy names all the files and dirs as *.dat;
+    this function replaces '.' with '_' so the names can be used as dirs
+    """
+    validChars = "_-%s%s" % (string.ascii_letters, string.digits)
+    sanitized_name = ''.join([c if c in validChars else '_' for c in input_name])
+    return "gonramp_" + sanitized_name
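A few concrete outcomes of these helpers, worked out from the rules above:

    from util import santitizer

    santitizer.prefixTrackName('HISAT.bam')                 # 'gonramp_HISAT_bam'
    santitizer.sanitize_name_input('this/is an//example')   # 'this_is_an__example'
    santitizer.sanitize_group_name('Default group')         # 'default_group'
    santitizer.sanitize_name('dataset_1.dat')               # 'gonramp_dataset_1_dat'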
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/util/subtools.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,372 @@
+#!/usr/bin/env python
+
+"""
+This file includes commonly used helper functions: thin wrappers around
+external tool calls, plus utilities for converting file formats to gff3
+"""
+from collections import OrderedDict
+import json
+import subprocess
+import os
+import sys
+import tempfile
+import string
+import logging
+
+class PopenError(Exception):
+    def __init__(self, cmd, error, return_code):
+        self.cmd = cmd
+        self.error = error
+        self.return_code = return_code
+
+    def __str__(self):
+        message = "The subprocess {0} has returned the error: {1}.".format(
+            self.cmd, self.return_code)
+        message = ','.join(
+            (message, "Its error message is: {0}".format(self.error)))
+        return repr(message)
+
+
+def _handleExceptionAndCheckCall(array_call, **kwargs):
+    """
+    This function handles exceptions and calls the tool.
+    It mirrors the signature of subprocess.check_call:
+    See https://docs.python.org/2/library/subprocess.html#subprocess.check_call
+    """
+    stdout = kwargs.get('stdout', subprocess.PIPE)
+    stderr = kwargs.get('stderr', subprocess.PIPE)
+    shell = kwargs.get('shell', False)
+    stdin = kwargs.get('stdin', None)
+
+    cmd = array_call[0]
+
+    output = None
+    error = None
+
+    # TODO: Check the value of array_call and <=[0]
+    logging.debug("Calling {0}:".format(cmd))
+    logging.debug("%s", array_call)
+    logging.debug("---------")
+
+    # TODO: Use universal_newlines option from Popen?
+    try:
+        p = subprocess.Popen(array_call, stdout=stdout,
+                             stderr=stderr, shell=shell, stdin=stdin)
+
+        # TODO: Change this because of possible memory issues => https://docs.python.org/2/library/subprocess.html#subprocess.Popen.communicate
+
+        output, error = p.communicate()
+
+        if stdout == subprocess.PIPE:
+            logging.debug("\t{0}".format(output))
+        else:
+            logging.debug("\tOutput in file {0}".format(stdout.name))
+        # If we detect an error from the subprocess, then we raise an exception
+        # TODO: Manage if we raise an exception for everything, or use CRITICAL etc... but not stop process
+        # TODO: The responsibility of returning a sys.exit() should not be there, but up in the app.
+        if p.returncode:
+            if stderr == subprocess.PIPE:
+                raise PopenError(cmd, error, p.returncode)
+            else:
+                # TODO: Handle properly, with a design behind it, the case where we receive an option as a file for the error
+                raise Exception("Error when calling {0}. Error has been logged in your file {1}. Error code: {2}"
+                                .format(cmd, stderr.name, p.returncode))
+
+    except OSError as e:
+        message = "The subprocess {0} has encountered an OSError: {1}".format(
+            cmd, e.strerror)
+        if e.filename:
+            message = '\n'.join(
+                (message, ", against this file: {0}".format(e.filename)))
+        logging.error(message)
+        sys.exit(-1)
+    except PopenError as p:
+        message = "The subprocess {0} has returned the error: {1}.".format(
+            p.cmd, p.return_code)
+        message = '\n'.join(
+            (message, "Its error message is: {0}".format(p.error)))
+
+        logging.exception(message)
+
+        sys.exit(p.return_code)
+    except Exception as e:
+        message = "The subprocess {0} has encountered an unknown error: {1}".format(
+            cmd, e)
+        logging.exception(message)
+
+        sys.exit(-1)
+    return p
+
+
+def write_features(field, attribute, gff3):
+    """
+    This function writes the features in gff3 format (defined in https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md)
+    field and attribute are ordered dictionaries
+    gff3 is the file handler
+    """
+    attr = []
+    for v in field.values():
+        gff3.write(str(v) + '\t')
+    for k, v in attribute.items():
+        s = str(k) + '=' + str(v)
+        attr.append(s)
+    gff3.write(';'.join(attr))
+    gff3.write('\n')
+
+def twoBitInfo(two_bit_file_name, two_bit_info_file):
+    """
+    Call twoBitInfo and write the result into twoBit_info_file
+    :param two_bit_file_name:
+    :param two_bit_info_file:
+    :return the subprocess.check_call return object:
+    """
+    array_call = ['twoBitInfo', two_bit_file_name, two_bit_info_file]
+    p = _handleExceptionAndCheckCall(array_call)
+    return p
+
+
+def faToTwoBit(fasta_file_name, twoBitFile):
+    """
+    This function calls the faToTwoBit UCSC tool and returns the twoBitFile
+    :param fasta_file_name:
+    :param twoBitFile:
+    :return:
+    """
+
+    array_call = ['faToTwoBit', fasta_file_name, twoBitFile]
+    _handleExceptionAndCheckCall(array_call)
+
+    return twoBitFile
+
+def sortChromSizes(two_bit_info_file_name, chrom_sizes_file_name):
+    """
+    Call sort with -k2rn on two_bit_info_file_name and write the result into chrom_sizes_file_name
+    :param two_bit_info_file_name:
+    :param chrom_sizes_file_name:
+    :return:
+    """
+    array_call = ['sort', '-k2rn', two_bit_info_file_name,
+                  '-o', chrom_sizes_file_name]
+    p = _handleExceptionAndCheckCall(array_call)
+    return p
+
+def getChromSizes(reference, tool_dir):
+    #TODO: find a better way instead of shipping the two exec files with the tool
+    fa_to_two_bit = os.path.join(tool_dir, 'faToTwoBit')
+    two_bit_info = os.path.join(tool_dir, 'twoBitInfo')
+    try:
+        twoBitFile = tempfile.NamedTemporaryFile(bufsize=0)
+        chrom_sizes = tempfile.NamedTemporaryFile(bufsize=0, suffix='.chrom.sizes', delete=False)
+    except IOError as err:
+        print "Cannot create tempfile err({0}): {1}".format(err.errno, err.strerror)
+    try:
+        subprocess.call([fa_to_two_bit, reference, twoBitFile.name])
+    except OSError as err:
+        print "Cannot generate twoBitFile from faToTwoBit err({0}): {1}".format(err.errno, err.strerror)
+    try:
+        subprocess.call([two_bit_info, twoBitFile.name, chrom_sizes.name])
+    except OSError as err:
+        print "Cannot generate chrom_sizes from twoBitInfo err({0}): {1}".format(err.errno, err.strerror)
+    return chrom_sizes
+
+def sequence_region(chrom_sizes):
+    """
+    This function reads a chrom.sizes file generated by twoBitInfo
+    and returns the information as a dict mapping chromosome name to size
+    """
+    f = open(chrom_sizes, 'r')
+    sizes = f.readlines()
+    sizes_dict = {}
+    for line in sizes:
+        chrom_info = line.rstrip().split('\t')
+        sizes_dict[chrom_info[0]] = chrom_info[1]
+    return sizes_dict
+
+def child_blocks(parent_field, parent_attr, gff3, child_type):
+    num = 0
+    blockcount = int(parent_attr['blockcount'])
+    chromstart = parent_attr['chromstarts'].split(',')
+    blocksize = parent_attr['blocksizes'].split(',')
+    parent_start = parent_field['start']
+    while num < blockcount:
+        child_attr = OrderedDict()
+        child_field = parent_field
+        child_field['type'] = child_type
+        child_field['start'] = int(chromstart[num]) + int(parent_start)
+        child_field['end'] = int(child_field['start']) + int(blocksize[num]) - 1
+        child_attr['ID'] = parent_attr['ID'] + '_part_' + str(num+1)
+        child_attr['Parent'] = parent_attr['ID']
+        write_features(child_field, child_attr, gff3)
+        num = num + 1
+
+def add_tracks_to_json(trackList_json, new_tracks, modify_type):
+    """
+    Update the track configuration file (trackList.json)
+    # modify_type = 'add_tracks': add a new track, e.g. bam or bigwig; new_tracks = dict()
+    # modify_type = 'add_attr': add configuration to an existing track; new_tracks = dict(track_name: dict())
+    """
+    with open(trackList_json, 'r+') as f:
+        data = json.load(f)
+        if modify_type == 'add_tracks':
+            data['tracks'].append(new_tracks)
+        elif modify_type == 'add_attr':
+            for name in new_tracks:
+                for track in data['tracks']:
+                    if name.lower() in track['urlTemplate'].lower():
+                        attr = new_tracks[name]
+                        for k, v in attr.items():
+                            track[k] = v
+        f.seek(0, 0)
+        f.write(json.dumps(data, separators=(',' , ':'), indent=4))
+        f.truncate()
+
+
+def createBamIndex(bamfile):
+    subprocess.call(['samtools', 'index', bamfile])
+    filename = bamfile + '.bai'
+    if os.path.exists(filename):
+        return filename
+    else:
+        raise ValueError('Did not find bai file')
+
+def flatfile_to_json(inputFile, dataType, trackType, trackLabel, outputFolder, options=None, compress=False):
+    if "bed" in dataType:
+        fileType = "--bed"
+    elif "gff" in dataType:
+        fileType = "--gff"
+    else:
+        raise ValueError("%s is not a valid filetype for flatfile_to_json" % dataType)
+
+
+    array_call = ['flatfile-to-json.pl',
+                  fileType, inputFile,
+                  '--trackType', trackType,
+                  '--trackLabel', trackLabel,
+                  '--out', outputFolder]
+    if compress:
+        array_call.append('--compress')
+    if options:
+        config = options.get("config")
+        clientConfig = options.get("clientConfig")
+        renderClassName = options.get('renderClassName')
+        subfeatureClasses = options.get('subfeatureClasses')
+        load_type = options.get("type")
+        if clientConfig:
+            array_call.append('--clientConfig')
+            array_call.append(clientConfig)
+        if config:
+            array_call.append('--config')
+            array_call.append(config)
+        if load_type:
+            array_call.append('--type')
+            array_call.append(load_type)
+        if renderClassName:
+            array_call.append('--renderClassName')
+            array_call.append(renderClassName)
+        if subfeatureClasses:
+            array_call.append('--subfeatureClasses')
+            array_call.append(json.dumps(subfeatureClasses))
+
+    p = _handleExceptionAndCheckCall(array_call)
+    return p
+
+def bam_to_json(inputFile, trackLabel, outputFolder, options=None, compress=False):
+
+    array_call = ['bam-to-json.pl',
+                  '--bam', inputFile,
+                  '--trackLabel', trackLabel,
+                  '--out', outputFolder]
+    if compress:
+        array_call.append('--compress')
+    if options:
+        config = options.get('config')
+        clientConfig = options.get('clientConfig')
+        if clientConfig:
+            array_call.append('--clientConfig')
+            array_call.append(clientConfig)
+        if config:
+            array_call.append('--config')
+            array_call.append(config)
+
+    p = _handleExceptionAndCheckCall(array_call)
+    return p
+
+def add_track_json(trackList, track_json):
+    track_json = 
json.dumps(track_json) + new_track = subprocess.Popen(['echo', track_json], stdout=subprocess.PIPE) + p = subprocess.call(['add-track-json.pl', trackList], stdin=new_track.stdout) + return p + +def prepare_refseqs(fasta_file_name, outputFolder): + array_call = ['prepare-refseqs.pl', '--fasta', fasta_file_name, '--out', outputFolder] + p = _handleExceptionAndCheckCall(array_call) + return p + +def generate_names(outputFolder): + array_call = ['generate-names.pl', '-v', '--out', outputFolder] + p = _handleExceptionAndCheckCall(array_call) + return p + +def validateFiles(input_file, chrom_sizes_file_name, file_type, options=None): + """ + Call validateFiles on input_file, using chrom_sizes_file_name and file_type + :param input_file: + :param chrom_sizes_file_name: + :param file_type: + :return: + """ + + array_call = ['validateFiles', '-chromInfo=' + chrom_sizes_file_name, '-type='+ file_type, input_file] + if options: + tab = options.get("tab") + autoSql = options.get("autoSql") + logging.debug("tab: {0}".format(tab)) + logging.debug("autoSql: {0}".format(autoSql)) + if autoSql: + autoSql = ''.join(['-as=', autoSql]) + array_call.append(autoSql) + if tab: + array_call.append('-tab') + p = _handleExceptionAndCheckCall(array_call) + return p + +def arrow_add_organism(organism_name, organism_dir, public=False): + array_call = ['arrow', 'organisms', 'add_organism', organism_name, organism_dir] + if public: + array_call.append('--public') + p = subprocess.check_output(array_call) + return p + +def arrow_create_user(user_email, firstname, lastname, password, admin=False): + """ Create a new user of Apollo, the default user_role is "user" """ + array_call = ['arrow', 'users', 'create_user', user_email, firstname, lastname, password] + if admin: + array_call += ['--role', 'admin'] + p = subprocess.check_output(array_call) + return p + +def arrow_update_organism_permissions(user_id, organism, **user_permissions): + array_call = ['arrow', 'users', 'update_organism_permissions', str(user_id), str(organism)] + admin = user_permissions.get("admin", False) + write = user_permissions.get("write", False) + read = user_permissions.get("read", False) + export = user_permissions.get("export", False) + if admin: + array_call.append('--administrate') + if write: + array_call.append('--write') + if read: + array_call.append('--read') + if export: + array_call.append('--export') + p = subprocess.check_output(array_call) + return p + +def arrow_get_users(user_email): + array_call = ['arrow', 'users', 'get_users'] + p = subprocess.check_output(array_call) + all_users = json.loads(p) + for d in all_users: + if d['username'] == user_email: + return d['userId'] + logging.error("Cannot find user %s", user_email)
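Taken together, these wrappers cover the usual JBrowse setup sequence. A hedged end-to-end sketch, assuming the JBrowse perl scripts are on PATH and using hypothetical paths:

    from util import subtools

    subtools.prepare_refseqs('dbia3.fa', 'myHub/dbia3')             # prepare-refseqs.pl
    subtools.flatfile_to_json('genes.gff3', 'gff3', 'HTMLFeatures',
                              'gonramp_genes', 'myHub/dbia3')       # flatfile-to-json.pl
    subtools.generate_names('myHub/dbia3')                          # generate-names.pl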
--- a/utils.py Wed Jul 12 12:55:27 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,162 +0,0 @@ -#!/usr/bin/env python - -""" -This file include common used functions for converting file format to gff3 -""" -from collections import OrderedDict -import json -import subprocess -import os -import tempfile -import string - -def write_features(field, attribute, gff3): - """ - The function write the features to gff3 format (defined in https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md) - field, attribute are ordered dictionary - gff3 is the file handler - """ - attr = [] - for v in field.values(): - gff3.write(str(v) + '\t') - for k, v in attribute.items(): - s = str(k) + '=' + str(v) - attr.append(s) - gff3.write(';'.join(attr)) - gff3.write('\n') - -def getChromSizes(reference, tool_dir): - #TODO: find a better way instead of shipping the two exec files with the tool - faToTwoBit = os.path.join(tool_dir, 'faToTwoBit') - twoBitInfo = os.path.join(tool_dir, 'twoBitInfo') - try: - twoBitFile = tempfile.NamedTemporaryFile(bufsize=0) - chrom_sizes = tempfile.NamedTemporaryFile(bufsize=0, suffix='.chrom.sizes', delete=False) - except IOError as err: - print "Cannot create tempfile err({0}): {1}".format(err.errno, err.strerror) - try: - subprocess.call(['faToTwoBit', reference, twoBitFile.name]) - except OSError as err: - print "Cannot generate twoBitFile from faToTwoBit err({0}): {1}".format(err.errno, err.strerror) - try: - subprocess.call(['twoBitInfo', twoBitFile.name, chrom_sizes.name]) - except OSError as err: - print "Cannot generate chrom_sizes from twoBitInfo err({0}): {1}".format(err.errno, err.strerror) - return chrom_sizes - -def sequence_region(chrom_sizes): - """ - This function read from a chromatin size file generated by twoBitInfo and write the information to dict - return a dict - """ - f = open(chrom_sizes, 'r') - sizes = f.readlines() - sizes_dict = {} - for line in sizes: - chrom_info = line.rstrip().split('\t') - sizes_dict[chrom_info[0]] = chrom_info[1] - return sizes_dict - -def child_blocks(parent_field, parent_attr, gff3, child_type): - num = 0 - blockcount = int(parent_attr['blockcount']) - chromstart = parent_attr['chromstarts'].split(',') - blocksize = parent_attr['blocksizes'].split(',') - parent_start = parent_field['start'] - while num < blockcount: - child_attr = OrderedDict() - child_field = parent_field - child_field['type'] = child_type - child_field['start'] = int(chromstart[num]) + int(parent_start) - child_field['end'] = int(child_field['start']) + int(blocksize[num]) - 1 - child_attr['ID'] = parent_attr['ID'] + '_part_' + str(num+1) - child_attr['Parent'] = parent_attr['ID'] - write_features(child_field, child_attr, gff3) - num = num + 1 - -def add_tracks_to_json(trackList_json, new_tracks, modify_type): - """ - Add to track configuration (trackList.json) - # modify_type = 'add_tracks': add a new track like bam or bigwig, new_track = dict() - # modify_type = 'add_attr': add configuration to the existing track, new_track = dict(track_name: dict()) - """ - with open(trackList_json, 'r+') as f: - data = json.load(f) - if modify_type == 'add_tracks': - data['tracks'].append(new_tracks) - elif modify_type == 'add_attr': - for k in new_tracks: - for track in data['tracks']: - if k.lower() in track['urlTemplate'].lower(): - attr = new_tracks[k] - for k, v in attr.items(): - track[k] = v - f.seek(0, 0) - f.write(json.dumps(data, separators=(',' , ':'), indent=4)) - f.truncate() - f.close() - -def gtfToGff3(gtf_file, gff3_file, 
chrom_sizes): - """ - Covert gtf file output from StringTie to gff3 format - """ - gff3 = open(gff3_file, 'w') - gff3.write("##gff-version 3\n") - sizes_dict = sequence_region(chrom_sizes) - seq_regions = dict() - parents = dict() - with open(gtf_file, 'r') as gtf: - for line in gtf: - if line.startswith('#'): - continue - field = OrderedDict() - attribute = OrderedDict() - li = line.rstrip().split("\t") - #print li - field['seqid'] = li[0] - #print field['seqid'] - if field['seqid'] not in seq_regions: - end_region = sizes_dict[field['seqid']] - gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n') - seq_regions[field['seqid']] = end_region - field['source'] = li[1] - field['type'] = li[2] - # The first base in a chromosome is numbered 0 in BED format - field['start'] = li[3] - field['end'] = li[4] - field['score'] = li[5] - field['strand'] = li[6] - field['phase'] = li[7] - attr_li = li[8].split(';') - gene_id = attr_li[0].split()[1].strip('"') - attribute['ID'] = gene_id + '_' + field['type'] + '_' + str(field['start']) + '_' + str(field['end']) - if field['type'] == 'transcript': - parents[gene_id] = attribute['ID'] - attribute['transcript_id'] = attr_li[1].split()[1].strip('"') - attribute['coverage'] = attr_li[2].split()[1].strip('"') - attribute['fpkm'] = attr_li[3].split()[1].strip('"') - attribute['tpm'] = attr_li[4].split()[1].strip('"') - elif field['type'] == 'exon': - attribute['Parent'] = parents[gene_id] - attribute['transcript_id'] = attr_li[1].split()[1].strip('"') - attribute['coverage'] = attr_li[3].split()[1].strip('"') - write_features(field, attribute, gff3) - gff3.close() - - -def sanitize_name(input_name): - """ - Galaxy will name all the files and dirs as *.dat, - the function can replace '.' to '_' for the dirs - """ - validChars = "_-%s%s" % (string.ascii_letters, string.digits) - sanitized_name = ''.join([c if c in validChars else '_' for c in input_name]) - return "gonramp_" + sanitized_name - -def createBamIndex(bamfile): - subprocess.call(['samtools', 'index', bamfile]) - filename = bamfile + '.bai' - if os.path.exists(filename): - return filename - else: - raise ValueError('Did not find bai file')