# HG changeset patch
# User yating-l
# Date 1507913071 14400
# Node ID 31a41ce128cc30451fba1eef9c8d639b3f8ec7a1
# Parent bb6fdccef4744c86c1b9e5c6982ffa57c6b34c11
planemo upload for repository https://github.com/Yating-L/jbrowse-archive-creator.git commit 691e5366893905d30943a3cb8cdfb6341f0f5362-dirty
diff -r bb6fdccef474 -r 31a41ce128cc ApolloUser.pyc
Binary file ApolloUser.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc README.md
--- a/README.md Wed Jul 12 12:55:27 2017 -0400
+++ b/README.md Fri Oct 13 12:44:31 2017 -0400
@@ -4,11 +4,10 @@
## Features
1. Similar interface to Hub Archive Creator.
2. Convert tracks to GFF3 datatypes (e.g. Blastxml => GFF3) in order to import feature data from the flat files
-3. Generate a zip file including all the tracks and configuration for JBrowse visualization
-4. Group the tracks
-5. Set the color for each track
-6. Set the label for each track
-7. Create workflows within Galaxy to automatize pipeline analysis and get them ready to visualization inside JBrowse...in a few clicks!
+3. Group the tracks
+4. Set the color for each track
+5. Set the label for each track
+6. Create workflows within Galaxy to automate pipeline analysis and get the results ready for visualization inside JBrowse...in a few clicks!
At the moment, supported datatypes are:
- Bam
@@ -19,6 +18,7 @@
- Gff3
- Gtf
- Blastxml
+- BigPsl
## Installation:
1. You will need to add this tool to your Galaxy instance.
diff -r bb6fdccef474 -r 31a41ce128cc TrackDb.pyc
Binary file TrackDb.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc TrackHub.py
--- a/TrackHub.py Wed Jul 12 12:55:27 2017 -0400
+++ b/TrackHub.py Fri Oct 13 12:44:31 2017 -0400
@@ -5,169 +5,180 @@
import shutil
import zipfile
import json
-import utils
+import tempfile
+import logging
+
+from datatypes.Datatype import Datatype
+from apollo.ApolloInstance import ApolloInstance
+from tracks.TrackStyles import TrackStyles
+from util import subtools
+from util import santitizer
class TrackHub:
- def __init__(self, inputFiles, reference, outputDirect, tool_dir, genome, extra_files_path, metaData, jbrowse_host):
- self.input_files = inputFiles.tracks
- self.outfile = outputDirect
- self.outfolder = extra_files_path
- self.out_path = os.path.join(extra_files_path, 'myHub')
- self.reference = reference
- self.tool_dir = tool_dir
- self.metaData = metaData
- self.raw = os.path.join(self.out_path, 'raw')
- self.json = os.path.join(self.out_path, 'json')
- self.jbrowse_host = jbrowse_host
- try:
- if os.path.exists(self.json):
- shutil.rmtree(self.json)
- os.makedirs(self.json)
- except OSError as e:
- print "Cannot create json folder error({0}): {1}".format(e.errno, e.strerror)
- else:
- print "Create jbrowse folder {}".format(self.out_path)
+ def __init__(self, inputFastaFile, apollo_user, outputFile, extra_files_path, tool_directory, trackType, apollo_host):
+
+ self.rootAssemblyHub = None
+
+ self.mySpecieFolderPath = None
+
+        # Stores intermediate files; removed at the end of the run unless in debug mode
+ self.myTracksFolderPath = None
+
+ # Store binary files: Bam, BigWig
+ self.myBinaryFolderPath = None
+
+ self.tool_directory = tool_directory
+ self.trackType = trackType
+ self.reference_genome = inputFastaFile
+ self.genome_name = inputFastaFile.assembly_id
+ self.extra_files_path = extra_files_path
+ self.outputFile = outputFile
+ self.chromSizesFile = None
+
+ # Set up apollo
+ self.apollo = ApolloInstance(apollo_host)
+ self.apollo_user = apollo_user
+
+        # Set the remaining attributes of this class and physically create the folders/files
+ self.rootAssemblyHub = self.__createAssemblyHub__(extra_files_path=extra_files_path)
+ # Init the Datatype
+ Datatype.pre_init(self.reference_genome, self.chromSizesFile,
+ self.extra_files_path, self.tool_directory,
+ self.mySpecieFolderPath, self.myTracksFolderPath, self.myBinaryFolderPath, self.trackType)
+
+ self._prepareRefseq()
+ self.trackList = os.path.join(self.mySpecieFolderPath, "trackList.json")
+ self._createTrackList()
+
+ self.myTrackStyle = TrackStyles(self.tool_directory, self.mySpecieFolderPath, self.trackList)
+ #self.cssFolderPath = os.path.join(self.mySpecieFolderPath, 'css')
+ #self.cssFilePath = os.path.join(self.cssFolderPath, 'custom_track_styles.css')
+ self.logger = logging.getLogger(__name__)
+
- def createHub(self):
- self.prepareRefseq()
- for input_file in self.input_files:
- self.addTrack(input_file)
- self.indexName()
- slink = self.makeArchive()
- self.outHtml(slink)
+
+ def addTrack(self, trackDbObject):
+ if trackDbObject['dataType'].lower() == 'bam':
+ #new_track = subprocess.Popen(['echo', trackDbObject['options']], stdout=subprocess.PIPE)
+ #subprocess.call(['add-track-json.pl', json_file], stdin=new_track.stdout)
+ subtools.add_track_json(self.trackList, trackDbObject['options'])
+ #subtools.add_track_json(self.trackList, trackDbObject['track_json'])
+ elif trackDbObject['dataType'].lower() == 'bigwig':
+ subtools.add_track_json(self.trackList, trackDbObject['options'])
+ else:
+ if trackDbObject['trackType'] == 'HTMLFeatures':
+ self._customizeHTMLFeature(trackDbObject)
+ subtools.flatfile_to_json(trackDbObject['trackDataURL'], trackDbObject['dataType'], trackDbObject['trackType'], trackDbObject['trackLabel'], self.mySpecieFolderPath, trackDbObject['options'])
+
+
+ def terminate(self, debug=False):
+ """ Write html file """
+ self._indexName()
+ if not debug:
+ self._removeRaw()
+ self._makeArchive()
print "Success!\n"
-
- def prepareRefseq(self):
- try:
+
+
+ def _customizeHTMLFeature(self, trackDbObject):
+ if trackDbObject['options']:
+ subfeatures = trackDbObject['options'].get('subfeatureClasses')
+ feature_color = trackDbObject['options']['feature_color']
+ if subfeatures:
+ for key, value in subfeatures.items():
+ self.myTrackStyle.addCustomColor(value, feature_color)
+ else:
+ customizedFeature = santitizer.sanitize_name(trackDbObject['trackLabel'])
+ clientConfig = json.loads(trackDbObject['options']['clientConfig'])
+ clientConfig['renderClassName'] = customizedFeature
+ trackDbObject['options']['clientConfig'] = json.dumps(clientConfig)
+ self.myTrackStyle.addCustomColor(customizedFeature, feature_color)
+
+ def _removeRaw(self):
+ if os.path.exists(self.myTracksFolderPath):
+ shutil.rmtree(self.myTracksFolderPath)
+
+ def _createTrackList(self):
+ if not os.path.exists(self.trackList):
+ os.mknod(self.trackList)
+
+ def _prepareRefseq(self):
+ subtools.prepare_refseqs(self.reference_genome.false_path, self.mySpecieFolderPath)
+ #try:
#print os.path.join(self.tool_dir, 'prepare-refseqs.pl') + ", '--fasta', " + self.reference +", '--out', self.json])"
- subprocess.call(['prepare-refseqs.pl', '--fasta', self.reference, '--out', self.json])
- except OSError as e:
- print "Cannot prepare reference error({0}): {1}".format(e.errno, e.strerror)
- #TODO: hard coded the bam and bigwig tracks. Need to allow users to customize the settings
- def addTrack(self, track):
- #print "false_path" , track['false_path']
- if track['false_path'] in self.metaData.keys():
- metadata = self.metaData[track['false_path']]
- else:
- metadata = {}
- self.SetMetadata(track, metadata)
- if track['dataType'] == 'bam':
- self.Bam(track, metadata)
- # print "add bam track\n"
- elif track['dataType'] == 'bigwig':
- #create trackList.json if not exist
- self.createTrackList()
- json_file = os.path.join(self.json, "trackList.json")
- bigwig_file = os.path.join(self.raw, track['fileName'])
- subprocess.call(['add-bw-track.pl', '--label', metadata['label'], '--bw_url', bigwig_file, '--pos_color', metadata['style']['pos_color'], '--neg_color', metadata['style']['neg_color'], '--plot', 'JBrowse/View/Track/Wiggle/XYPlot', '--out', json_file, '--in', json_file])
- else:
- flat_file = os.path.join(self.raw, track['fileName'])
- if track['dataType'] == 'bed':
- subprocess.call(['flatfile-to-json.pl', '--bed', flat_file, '--trackType', metadata['type'], '--trackLabel', metadata['label'], '--Config', '{"category" : "%s"}' % metadata['category'], '--clientConfig', '{"color" : "%s"}' % metadata['color'], '--out', self.json])
- elif track['dataType'] == 'bedSpliceJunctions' or track['dataType'] == 'gtf' or track['dataType'] == 'blastxml':
- subprocess.call(['flatfile-to-json.pl', '--gff', flat_file, '--trackType', metadata['type'], '--trackLabel', metadata['label'], '--Config', '{"glyph": "JBrowse/View/FeatureGlyph/Segments", "category" : "%s"}' % metadata['category'], '--clientConfig', '{"color" : "%s"}' % metadata['color'], '--out', self.json])
- elif track['dataType'] == 'gff3_transcript':
- subprocess.call(['flatfile-to-json.pl', '--gff', flat_file, '--trackType', metadata['type'], '--trackLabel', metadata['label'], '--Config', '{"transcriptType": "transcript", "category" : "%s"}' % metadata['category'], '--clientConfig', '{"color" : "%s"}' % metadata['color'], '--out', self.json])
- else:
- subprocess.call(['flatfile-to-json.pl', '--gff', flat_file, '--trackType', metadata['type'], '--trackLabel', metadata['label'], '--Config', '{"category" : "%s"}' % metadata['category'], '--clientConfig', '{"color" : "%s"}' % metadata['color'], '--out', self.json])
-
- def indexName(self):
- subprocess.call(['generate-names.pl', '-v', '--out', self.json])
+ #subprocess.call(['prepare-refseqs.pl', '--fasta', self.reference_genome.false_path, '--out', self.mySpecieFolderPath])
+ #except OSError as e:
+ #print "Cannot prepare reference error({0}): {1}".format(e.errno, e.strerror)
+
+ def _indexName(self):
+ #subprocess.call(['generate-names.pl', '-v', '--out', self.mySpecieFolderPath])
+ subtools.generate_names(self.mySpecieFolderPath)
print "finished name index \n"
- def makeArchive(self):
- file_dir = os.path.abspath(self.outfile)
- source_dir = os.path.dirname(file_dir)
- folder_name = os.path.basename(self.outfolder)
- source_name = os.path.basename(self.out_path)
- source = os.path.join(source_dir, folder_name, source_name)
- slink = source.replace('/', '_')
- slink = os.path.join('/var/www/html/JBrowse-1.12.1/data', slink)
- try:
- if os.path.islink(slink):
- os.unlink(slink)
- except OSError as oserror:
- print "Cannot create symlink to the data({0}): {1}".format(oserror.errno, oserror.strerror)
- os.symlink(source, slink)
- return slink
-
- def outHtml(self, slink):
- with open(self.outfile, 'w') as htmlfile:
-            htmlstr = 'The JBrowse Hub is created: <br>'
-            url = self.jbrowse_host + "/JBrowse-1.12.1/index.html?data=%s"
-            jbrowse_hub = '<li><a href="%s" target="_blank">View JBrowse Hub</a></li>' % url
- link_name = os.path.basename(slink)
- relative_path = os.path.join('data', link_name + '/json')
- htmlstr += jbrowse_hub % relative_path
- htmlfile.write(htmlstr)
+ def _outHtml(self, host_name):
+ with open(self.outputFile, 'w') as htmlfile:
+            htmlstr = 'The new Organism "%s" is created on Apollo: <br>' % self.genome_name
+            jbrowse_hub = '<li><a href="%s" target="_blank">View JBrowse Hub on Apollo</a></li>' % host_name
+ htmlstr += jbrowse_hub
+ htmlfile.write(htmlstr)
+
+ def _makeArchive(self):
+ self.apollo.loadHubToApollo(self.apollo_user, self.genome_name, self.mySpecieFolderPath, admin=True)
+ apollo_host = self.apollo.getHost()
+ self._outHtml(apollo_host)
+
+
+ def __createAssemblyHub__(self, extra_files_path):
+        # Get all the necessary info first
+ # 2bit file creation from input fasta
+
+ # baseNameFasta = os.path.basename(fasta_file_name)
+ # suffixTwoBit, extensionTwoBit = os.path.splitext(baseNameFasta)
+ # nameTwoBit = suffixTwoBit + '.2bit'
+ twoBitFile = tempfile.NamedTemporaryFile(bufsize=0)
+ subtools.faToTwoBit(self.reference_genome.false_path, twoBitFile.name)
+
+ # Generate the twoBitInfo
+ twoBitInfoFile = tempfile.NamedTemporaryFile(bufsize=0)
+ subtools.twoBitInfo(twoBitFile.name, twoBitInfoFile.name)
+
+ # Then we get the output to generate the chromSizes
+ self.chromSizesFile = tempfile.NamedTemporaryFile(bufsize=0, suffix=".chrom.sizes")
+ subtools.sortChromSizes(twoBitInfoFile.name, self.chromSizesFile.name)
- def createTrackList(self):
- trackList = os.path.join(self.json, "trackList.json")
- if not os.path.exists(trackList):
- os.mknod(trackList)
-
- def Bam(self, track, metadata):
- #create trackList.json if not exist
- self.createTrackList()
- json_file = os.path.join(self.json, "trackList.json")
- bam_track = dict()
- bam_track['type'] = 'JBrowse/View/Track/Alignments2'
- bam_track['storeClass'] = 'JBrowse/Store/SeqFeature/BAM'
- bam_track['urlTemplate'] = os.path.join('../raw', track['fileName'])
- bam_track['baiUrlTemplate'] = os.path.join('../raw', track['index'])
- bam_track['label'] = metadata['label']
- bam_track['category'] = metadata['category']
- bam_track = json.dumps(bam_track)
- #Use add-track-json.pl to add bam track to json file
- new_track = subprocess.Popen(['echo', bam_track], stdout=subprocess.PIPE)
- subprocess.call(['add-track-json.pl', json_file], stdin=new_track.stdout)
- '''
- def BigWig(self, track, metadata):
- #create trackList.json if not exist
- self.createTrackList()
- json_file = os.path.join(self.json, "trackList.json")
- bigwig_track = dict()
- bigwig_track['urlTemplate'] = os.path.join('../raw', track['fileName'])
- bigwig_track['type'] = 'JBrowse/View/Track/Wiggle/XYPlot'
- bigwig_track['storeClass'] = 'JBrowse/Store/SeqFeature/BigWig'
- bigwig_track['label'] = metadata['label']
- bigwig_track['style'] = metadata['style']
- bigwig_track['category'] = metadata['category']
- bigwig_track = json.dumps(bigwig_track)
- #Use add-track-json.pl to add bigwig track to json file
- new_track = subprocess.Popen(['echo', bigwig_track], stdout=subprocess.PIPE)
- #output = new_track.communicate()[0]
- subprocess.call(['add-track-json.pl', json_file], stdin=new_track.stdout)
- '''
- def BigWig
- #If the metadata is not set, use the default value
- def SetMetadata(self, track, metadata):
- if 'label' not in metadata.keys() or metadata['label'] == '':
- metadata['label'] = track['fileName']
- if 'color' not in metadata.keys() or metadata['color'] == '':
- metadata['color'] = "#daa520"
- if track['dataType'] == 'bigwig':
- if 'style' not in metadata.keys():
- metadata['style'] = {}
- if 'pos_color' not in metadata['style'] or metadata['style']['pos_color'] == '':
- metadata['style']['pos_color'] = "#FFA600"
- if 'neg_color' not in metadata['style'] or metadata['style']['neg_color'] == '':
- metadata['style']['neg_color'] = "#005EFF"
- if 'category' not in metadata.keys() or metadata['category'] == '':
- metadata['category'] = "Default group"
- if track['dataType'] == 'blastxml':
- metadata['type'] = "G-OnRamp_plugin/BlastAlignment"
- elif track['dataType'] == 'bigpsl':
- metadata['type'] = "G-OnRamp_plugin/BlatAlignment"
- elif track['dataType'] == 'gff3_transcript' or track['dataType'] == 'gff3_mrna':
- metadata['type'] = "G-OnRamp_plugin/GenePred"
- else:
- metadata['type'] = "CanvasFeatures"
+ # We can get the biggest scaffold here, with chromSizesFile
+ with open(self.chromSizesFile.name, 'r') as chrom_sizes:
+ # TODO: Check if exists
+ self.default_pos = chrom_sizes.readline().split()[0]
+
+ # TODO: Manage to put every fill Function in a file dedicated for reading reasons
+ # Create the root directory
+ myHubPath = os.path.join(extra_files_path, "myHub")
+ if not os.path.exists(myHubPath):
+ os.makedirs(myHubPath)
+
+        # Create the species folder
+        # TODO: Generate the name depending on the species
+ mySpecieFolderPath = os.path.join(myHubPath, self.genome_name)
+ if not os.path.exists(mySpecieFolderPath):
+ os.makedirs(mySpecieFolderPath)
+ self.mySpecieFolderPath = mySpecieFolderPath
-
+        # Copy the 2bit file into the newly created species folder
+ #self.twoBitName = self.genome_name + ".2bit"
+ #self.two_bit_final_path = os.path.join(self.mySpecieFolderPath, self.twoBitName)
+ #shutil.copyfile(twoBitFile.name, self.two_bit_final_path)
-
+        # Create the tracks folder inside the species folder
+ tracksFolderPath = os.path.join(mySpecieFolderPath, "raw")
+ if not os.path.exists(tracksFolderPath):
+ os.makedirs(tracksFolderPath)
+ self.myTracksFolderPath = tracksFolderPath
+ myBinaryFolderPath = os.path.join(mySpecieFolderPath, 'bbi')
+ if not os.path.exists(myBinaryFolderPath):
+ os.makedirs(myBinaryFolderPath)
+ self.myBinaryFolderPath = myBinaryFolderPath
-
+ return myHubPath
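
For orientation, a minimal driver of the refactored TrackHub API could look like the sketch below. It is a hedged illustration, not part of this changeset: it assumes the tool's entry script has already parsed the Galaxy parameters, that the fasta wrapper exposes false_path and assembly_id, and that each trackDbObject is a dict with the keys addTrack reads (dataType, trackType, trackLabel, trackDataURL, options). All names and paths are placeholders.

    from TrackHub import TrackHub
    from apollo.ApolloUser import ApolloUser

    class FastaInput(object):             # stand-in for the Galaxy-parsed fasta wrapper
        false_path = 'dbia3.fa'           # path to the fasta on disk
        assembly_id = 'dbia3'             # genome/organism name

    apollo_user = ApolloUser('user@example.org', 'First', 'Last', 'secret')
    trackHub = TrackHub(FastaInput(), apollo_user, 'output.html', 'extra_files',
                        '/path/to/tool', 'HTMLFeatures', 'http://localhost:8080/apollo')
    trackDbObjects = []                   # dicts produced by the Datatype pipeline
    for trackDbObject in trackDbObjects:
        trackHub.addTrack(trackDbObject)  # routes on dataType/trackType
    trackHub.terminate(debug=False)       # index names, drop raw files, load hub into Apollo
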
diff -r bb6fdccef474 -r 31a41ce128cc TrackHub.pyc
Binary file TrackHub.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc apollo/ApolloInstance.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/apollo/ApolloInstance.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,39 @@
+#!/usr/bin/env python
+import json
+import logging
+from util import subtools
+
+class ApolloInstance(object):
+ def __init__(self, apollo_host):
+ self.apollo_host = apollo_host
+ self.logger = logging.getLogger(__name__)
+
+ def getHost(self):
+ return self.apollo_host
+
+ def createApolloUser(self, apollo_user, admin=None):
+ p = subtools.arrow_create_user(apollo_user.user_email, apollo_user.firstname, apollo_user.lastname, apollo_user.password, admin)
+ user_info = json.loads(p)
+ user_id = user_info.get('userId')
+ if not user_id:
+ self.logger.debug("Cannot create new user: %s; The user may already exist", apollo_user.user_email)
+ user_id = subtools.arrow_get_users(apollo_user.user_email)
+ self.logger.debug("Got user_id for new or existing user: user_id = %s", str(user_id))
+ return user_id
+
+ def grantPermission(self, user_id, organism_id, **user_permissions):
+ subtools.arrow_update_organism_permissions(user_id, organism_id, **user_permissions)
+ self.logger.debug("Grant user %s permissions to organism %s, permissions = %s", str(user_id), str(organism_id), ','.join(user_permissions))
+
+ def addOrganism(self, organism_name, organism_dir):
+ p = subtools.arrow_add_organism(organism_name, organism_dir)
+ organism = json.loads(p)
+ organism_id = organism['id']
+ self.logger.debug("Added new organism to Apollo instance, %s", p)
+ return organism_id
+
+ def loadHubToApollo(self, apollo_user, organism_name, organism_dir, admin_user=False, **user_permissions):
+ user_id = self.createApolloUser(apollo_user, admin_user)
+ organism_id = self.addOrganism(organism_name, organism_dir)
+ self.grantPermission(user_id, organism_id, **user_permissions)
+ self.logger.debug("Successfully load the hub to Apollo")
\ No newline at end of file
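
A hedged usage sketch for ApolloInstance follows: loadHubToApollo chains user creation, organism registration, and permission granting, with extra keyword arguments forwarded as organism permissions. The host, credentials, and directory below are placeholders.

    from apollo.ApolloInstance import ApolloInstance
    from apollo.ApolloUser import ApolloUser

    apollo = ApolloInstance('http://localhost:8080/apollo')   # placeholder host
    user = ApolloUser('user@example.org', 'First', 'Last', 'secret')
    # Register the organism directory with Apollo and grant the user access to it
    apollo.loadHubToApollo(user, 'dbia3', '/path/to/myHub/dbia3', admin_user=True)
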
diff -r bb6fdccef474 -r 31a41ce128cc apollo/ApolloInstance.pyc
Binary file apollo/ApolloInstance.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc apollo/ApolloUser.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/apollo/ApolloUser.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,10 @@
+#!/usr/bin/python
+
+import os
+
+class ApolloUser(object):
+ def __init__(self, user_email, firstname, lastname, password):
+ self.user_email = user_email
+ self.firstname = firstname
+ self.lastname = lastname
+ self.password = password
diff -r bb6fdccef474 -r 31a41ce128cc apollo/ApolloUser.pyc
Binary file apollo/ApolloUser.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc apollo/__init__.py
diff -r bb6fdccef474 -r 31a41ce128cc apollo/__init__.pyc
Binary file apollo/__init__.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc bedToGff3.py
--- a/bedToGff3.py Wed Jul 12 12:55:27 2017 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,139 +0,0 @@
-#!/usr/bin/env python
-
-'''
-Convert BED format to gff3
-reference for gff3: https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
-'''
-import os
-from collections import OrderedDict
-import utils
-
-class bedToGff3():
- def __init__(self, inputBedFile, chrom_sizes, bed_type, output):
- self.input = inputBedFile
- #file_dir = os.path.basename(inputBedFile)
- #print file_dir + "\n\n"
- self.output = output
- self.chrom_sizes = chrom_sizes
- self.type = bed_type
- if self.type == "trfbig":
- self.trfbig_to_gff3()
- if self.type == "regtools":
- self.splicejunctions_to_gff3()
- if self.type == "blat":
- self.bigpsl_to_gff3()
-
- def trfbig_to_gff3(self):
- gff3 = open(self.output, 'w')
- gff3.write("##gff-version 3\n")
- sizes_dict = utils.sequence_region(self.chrom_sizes)
- seq_regions = dict()
- with open(self.input, 'r') as bed:
- for line in bed:
- field = OrderedDict()
- attribute = OrderedDict()
- li = line.rstrip().split("\t")
- field['seqid'] = li[0]
- if field['seqid'] not in seq_regions:
- end_region = sizes_dict[field['seqid']]
- gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n')
- seq_regions[field['seqid']] = end_region
- field['source'] = li[3]
- field['type'] = 'tandem_repeat'
- # The first base in a chromosome is numbered 0 in BED format
- field['start'] = str(int(li[1]) + 1)
- field['end'] = li[2]
- field['score'] = li[9]
- field['strand'] = '+'
- field['phase'] = '.'
- attribute['length of repeat unit'] = li[4]
- attribute['mean number of copies of repeat'] = li[5]
- attribute['length of consensus sequence'] = li[6]
- attribute['percentage match'] = li[7]
- attribute['percentage indel'] = li[8]
- attribute['percent of a\'s in repeat unit'] = li[10]
- attribute['percent of c\'s in repeat unit'] = li[11]
- attribute['percent of g\'s in repeat unit'] = li[12]
- attribute['percent of t\'s in repeat unit'] = li[13]
- attribute['entropy'] = li[14]
- attribute['sequence of repeat unit element'] = li[15]
- utils.write_features(field, attribute, gff3)
- gff3.close()
-
-
- def splicejunctions_to_gff3(self):
- gff3 = open(self.output, 'w')
- gff3.write("##gff-version 3\n")
- sizes_dict = utils.sequence_region(self.chrom_sizes)
- seq_regions = dict()
- with open(self.input, 'r') as bed:
- for line in bed:
- field = OrderedDict()
- attribute = OrderedDict()
- li = line.rstrip().split("\t")
- field['seqid'] = li[0]
- if field['seqid'] not in seq_regions:
- end_region = sizes_dict[field['seqid']]
- gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n')
- seq_regions[field['seqid']] = end_region
- field['source'] = li[3]
- field['type'] = 'junction'
- # The first base in a chromosome is numbered 0 in BED format
- field['start'] = int(li[1]) + 1
- field['end'] = li[2]
- field['score'] = li[12]
- field['strand'] = li[5]
- field['phase'] = '.'
- attribute['ID'] = li[0] + '_' + li[3]
- attribute['Name'] = li[3]
- attribute['blockcount'] = li[9]
- attribute['blocksizes'] = li[10]
- attribute['chromstarts'] = li[11]
- utils.write_features(field, attribute, gff3)
- utils.child_blocks(field, attribute, gff3, 'exon_junction')
- gff3.close()
-
- def bigpsl_to_gff3(self):
- gff3 = open(self.output, 'w')
- gff3.write("##gff-version 3\n")
- sizes_dict = utils.sequence_region(self.chrom_sizes)
- seq_regions = dict()
- with open(self.input, 'r') as bed:
- for line in bed:
- field = OrderedDict()
- attribute = OrderedDict()
- li = line.rstrip().split("\t")
- field['seqid'] = li[0]
- if field['seqid'] not in seq_regions:
- end_region = sizes_dict[field['seqid']]
- gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n')
- seq_regions[field['seqid']] = end_region
- field['source'] = 'UCSC BLAT alignment tool'
- field['type'] = 'match'
- # The first base in a chromosome is numbered 0 in BED format
- field['start'] = str(int(li[1]) + 1)
- field['end'] = li[2]
- field['score'] = li[4]
- field['strand'] = li[5]
- field['phase'] = '.'
- attribute['ID'] = li[0] + '_' + li[3]
- attribute['Name'] = li[3]
- attribute['blockcount'] = li[9]
- attribute['blocksizes'] = li[10]
- attribute['chromstarts'] = li[11]
- attribute['ochrom_start'] = li[12]
- attribute['ochrom_end'] = li[13]
- attribute['ochrom_strand'] = li[14]
- attribute['ochrom_size'] = li[15]
- attribute['ochrom_starts'] = li[16]
- attribute['sequence on other chromosome'] = li[17]
- attribute['cds in ncbi format'] = li[18]
- attribute['size of target chromosome'] = li[19]
- attribute['number of bases matched'] = li[20]
- attribute['number of bases that don\'t match'] = li[21]
- attribute['number of bases that match but are part of repeats'] = li[22]
- attribute['number of \'N\' bases'] = li[23]
- utils.write_features(field, attribute, gff3)
- utils.child_blocks(field, attribute, gff3, 'match_part')
- gff3.close()
-
\ No newline at end of file
diff -r bb6fdccef474 -r 31a41ce128cc bigGenePred.as
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bigGenePred.as Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,25 @@
+table bigGenePred
+"bigGenePred gene models"
+ (
+ string chrom; "Reference sequence chromosome or scaffold"
+ uint chromStart; "Start position in chromosome"
+ uint chromEnd; "End position in chromosome"
+ string name; "Name or ID of item, ideally both human readable and unique"
+ uint score; "Score (0-1000)"
+ char[1] strand; "+ or - for strand"
+ uint thickStart; "Start of where display should be thick (start codon)"
+ uint thickEnd; "End of where display should be thick (stop codon)"
+ uint reserved; "RGB value (use R,G,B string in input file)"
+ int blockCount; "Number of blocks"
+ int[blockCount] blockSizes; "Comma separated list of block sizes"
+ int[blockCount] chromStarts; "Start positions relative to chromStart"
+ string name2; "Alternative/human readable name"
+ string cdsStartStat; "Status of CDS start annotation (none, unknown, incomplete, or complete)"
+ string cdsEndStat; "Status of CDS end annotation (none, unknown, incomplete, or complete)"
+ int[blockCount] exonFrames; "Exon frame {0,1,2}, or -1 if no frame for exon"
+ string type; "Transcript type"
+ string geneName; "Primary identifier for gene"
+ string geneName2; "Alternative/human readable gene name"
+ string geneType; "Gene type"
+ )
+
diff -r bb6fdccef474 -r 31a41ce128cc bigPsl.as
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bigPsl.as Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,33 @@
+table bigPsl
+"bigPsl pairwise alignment"
+ (
+ string chrom; "Reference sequence chromosome or scaffold"
+ uint chromStart; "Start position in chromosome"
+ uint chromEnd; "End position in chromosome"
+ string name; "Name or ID of item, ideally both human readable and unique"
+ uint score; "Score (0-1000)"
+ char[1] strand; "+ or - for strand"
+ uint thickStart; "Start of where display should be thick (start codon)"
+ uint thickEnd; "End of where display should be thick (stop codon)"
+ uint reserved; "RGB value (use R,G,B string in input file)"
+ int blockCount; "Number of blocks"
+ int[blockCount] blockSizes; "Comma separated list of block sizes"
+ int[blockCount] chromStarts; "Start positions relative to chromStart"
+
+ uint oChromStart;"Start position in other chromosome"
+ uint oChromEnd; "End position in other chromosome"
+ char[1] oStrand; "+ or - for other strand"
+ uint oChromSize; "Size of other chromosome."
+ int[blockCount] oChromStarts; "Start positions relative to oChromStart"
+
+ lstring oSequence; "Sequence on other chrom (or edit list, or empty)"
+ string oCDS; "CDS in NCBI format"
+
+ uint chromSize;"Size of target chromosome"
+
+ uint match; "Number of bases matched."
+ uint misMatch; " Number of bases that don't match "
+ uint repMatch; " Number of bases that match but are part of repeats "
+ uint nCount; " Number of 'N' bases "
+ )
+
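
These autoSql definitions mirror UCSC's bigPsl schema: the first 12 fields are standard BED12 and the remaining 13 carry the alignment details, hence -type=bed12+13 when compiling with UCSC's bedToBigBed. A sketch of that step, assuming bedToBigBed is on the PATH and with placeholder file names:

    import subprocess

    # Compile a tab-separated bed12+13 bigPsl input into a bigBed (paths are placeholders)
    subprocess.check_call(['bedToBigBed', '-as=bigPsl.as', '-type=bed12+13', '-tab',
                           'alignments.bigPslInput', 'chrom.sizes', 'alignments.bb'])
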
diff -r bb6fdccef474 -r 31a41ce128cc blastxmlToGff3.py
--- a/blastxmlToGff3.py Wed Jul 12 12:55:27 2017 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,159 +0,0 @@
-#!/usr/bin/env python
-
-
-from Bio.Blast import NCBIXML
-from collections import OrderedDict
-import utils
-
-
-def align2cigar(hsp_query, hsp_reference):
- """
- Build CIGAR representation from an hsp_query
- input:
- hsp_query
- hsp_sbjct
- output:
- CIGAR string
- """
- query = hsp_query
- ref = hsp_reference
- # preType, curType:
- # 'M' represents match,
- # 'I' represents insert a gap into the reference sequence,
- # 'D' represents insert a gap into the target (delete from reference)
- # some ideas of this algin2cigar function are coming from
- # https://gist.github.com/ozagordi/099bdb796507da8d9426
- prevType = 'M'
- curType = 'M'
- count = 0
- cigar = []
- num = len(query)
- for i in range(num):
- if query[i] == '-':
- curType = 'D'
- elif ref[i] == '-':
- curType = 'I'
- else:
- curType = 'M'
- if curType == prevType:
- count += 1
- else:
- cigar.append('%s%d' % (prevType, count))
- prevType = curType
- count = 1
- cigar.append('%s%d' % (curType, count))
- return ' '.join(cigar)
-
-def gff3_writer(blast_records, gff3_file):
- gff3 = open(gff3_file, 'a')
- gff3.write("##gff-version 3\n")
- seq_regions = dict()
- for blast_record in blast_records:
- query_name = blast_record.query.split(" ")[0]
- source = blast_record.application
- method = blast_record.matrix
- for alignment in blast_record.alignments:
- group = {
- "parent_field" : OrderedDict(),
- "parent_attribute" : OrderedDict(),
- "alignments" : []
- }
- title = alignment.title.split(" ")
- contig_name = title[len(title) - 1]
- length = alignment.length
- group['parent_field']['seqid'] = contig_name
- group['parent_field']['source'] = source
- group['parent_field']['type'] = 'match'
- group['parent_attribute']['ID'] = contig_name + '_' + query_name
- group['parent_attribute']['method'] = method
- group['parent_attribute']['length'] = length
- if contig_name not in seq_regions:
- gff3.write("##sequence-region " + contig_name + ' 1 ' + str(length) + '\n')
- seq_regions[contig_name] = length
- match_num = 0
- coords = [length, 0]
- for hsp in alignment.hsps:
- hsp_align = {}
- field = OrderedDict()
- attribute = OrderedDict()
- ref = hsp.sbjct
- query = hsp.query
- field['seqid'] = contig_name
- field['source'] = source
- field['type'] = 'match_part'
-
- field['start'] = hsp.sbjct_start
- if field['start'] < coords[0]:
- coords[0] = field['start']
- ref_length = len(ref.replace('-', ''))
- # if run tblastn, the actual length of reference should be multiplied by 3
- if source.lower() == "tblastn":
- ref_length *= 3
- field['end'] = field['start'] + ref_length - 1
- if field['end'] > coords[1]:
- coords[1] = field['end']
- field['score'] = hsp.score
- #decide if the alignment in the same strand or reverse strand
- #reading frame
- # (+, +), (0, 0), (-, -) => +
- # (+, -), (-, +) => -
- if hsp.frame[1] * hsp.frame[0] > 0:
- field['strand'] = '+'
- elif hsp.frame[1] * hsp.frame[0] < 0:
- field['strand'] = '-'
- else:
- if hsp.frame[0] + hsp.frame[1] >= 0:
- field['strand'] = '+'
- else:
- field['strand'] = '-'
- field['phase'] = '.'
-
- target_start = hsp.query_start
- target_len = len(query.replace('-', ''))
- # if run blastx, the actual length of query should be multiplied by 3
- if source.lower() == "blastx":
- target_len *= 3
- target_end = target_start + target_len -1
- attribute['ID'] = group['parent_attribute']['ID'] + '_match_' + str(match_num)
- attribute['Parent'] = group['parent_attribute']['ID']
- attribute['Target'] = query_name + " " + str(target_start) + " " + str(target_end)
- attribute['Gap'] = align2cigar(query, ref)
- #store the query sequence and match string in the file in order to display alignment with BlastAlignment plugin
- attribute['subject'] = hsp.sbjct
- attribute['query'] = hsp.query
- attribute['match'] = hsp.match
- attribute['gaps'] = attribute['match'].count(' ')
- similar = attribute['match'].count('+')
- attribute['identities'] = len(attribute['match']) - similar - attribute['gaps']
- attribute['positives'] = attribute['identities'] + similar
- attribute['expect'] = hsp.expect
- # show reading frame attribute only if the frame is not (0, 0)
- attribute['frame'] = hsp.frame[1]
- match_num += 1
- hsp_align['field'] = field
- hsp_align['attribute'] = attribute
- group['alignments'].append(hsp_align)
- group['parent_field']['start'] = coords[0]
- group['parent_field']['end'] = coords[1]
- group['parent_field']['score'] = group['parent_field']['strand'] = group['parent_field']['phase'] = '.'
- group['parent_attribute']['match_num'] = match_num
- group['alignments'].sort(key=lambda x: (x['field']['start'], x['field']['end']))
- utils.write_features(group['parent_field'], group['parent_attribute'], gff3)
- prev_end = -1
- for align in group['alignments']:
- overlap = ''
- if align['field']['start'] <= prev_end:
- overlap += str(align['field']['start']) + ',' + str(prev_end)
- prev_end = align['field']['end']
- align['attribute']['overlap'] = overlap
- utils.write_features(align['field'], align['attribute'], gff3)
- gff3.close()
-
-def blastxml2gff3(xml_file, gff3_file):
- result_handle = open(xml_file)
- blast_records = NCBIXML.parse(result_handle)
- gff3_writer(blast_records, gff3_file)
-
-if __name__ == "__main__":
- blastxml2gff3("../dbia3/raw/tblastn_dmel-hits-translation-r6.11.fa_vs_nucleotide_BLAST_database_from_data_3.blastxml", "gff3.txt")
-
diff -r bb6fdccef474 -r 31a41ce128cc blastxmlToGff3.pyc
Binary file blastxmlToGff3.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/Datatype.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/Datatype.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,122 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
+"""
+Super Class of the managed datatype
+"""
+
+import os
+import tempfile
+import collections
+from util import subtools
+import logging
+import abc
+from abc import ABCMeta
+from tracks.HTMLFeatures import HTMLFeatures
+from tracks.CanvasFeatures import CanvasFeatures
+from tracks.BamFeatures import BamFeatures
+from tracks.BigwigFeatures import BigwigFeatures
+from datatypes.validators.DataValidation import DataValidation
+
+
+class Datatype(object):
+ __metaclass__ = ABCMeta
+
+ chromSizesFile = None
+ input_fasta_file = None
+ extra_files_path = None
+ tool_directory = None
+
+ mySpecieFolderPath = None
+ myTrackFolderPath = None
+ myBinaryFolderPath = None
+
+ trackType = None
+
+ def __init__(self):
+        not_init_message = "The {0} is not initialized. " \
+                           "Did you use the pre_init static method first?"
+ if Datatype.input_fasta_file is None:
+ raise TypeError(not_init_message.format('reference genome'))
+ if Datatype.extra_files_path is None:
+ raise TypeError(not_init_message.format('track Hub path'))
+ if Datatype.tool_directory is None:
+ raise TypeError(not_init_message.format('tool directory'))
+ self.inputFile = None
+ self.trackType = None
+ self.dataType = None
+ self.trackFileType = None
+ self.track = None
+ self.trackSettings = dict()
+ self.extraSettings = collections.OrderedDict()
+
+
+ @staticmethod
+ def pre_init(reference_genome, chrom_sizes_file,
+ extra_files_path, tool_directory, specie_folder, tracks_folder, binary_folder, track_type):
+ Datatype.extra_files_path = extra_files_path
+ Datatype.tool_directory = tool_directory
+
+ # TODO: All this should be in TrackHub and not in Datatype
+ Datatype.mySpecieFolderPath = specie_folder
+ Datatype.myTrackFolderPath = tracks_folder
+ Datatype.myBinaryFolderPath = binary_folder
+
+ Datatype.input_fasta_file = reference_genome
+
+ # 2bit file creation from input fasta
+ #Datatype.twoBitFile = two_bit_path
+ Datatype.chromSizesFile = chrom_sizes_file
+ Datatype.trackType = track_type
+
+
+ def generateCustomTrack(self):
+ self.validateData()
+ self.initSettings()
+ #Create the track file
+ self.createTrack()
+ # Create the TrackDb Object
+ self.createTrackDb()
+ logging.debug("- %s %s created", self.dataType, self.trackName)
+
+
+ @abc.abstractmethod
+ def validateData(self):
+ """validate the input data with DataValidation"""
+
+ def initSettings(self):
+        #Initialize required fields: trackName, longLabel, shortLabel
+ self.trackName = self.trackSettings["name"]
+ self.trackDataURL = os.path.join(self.myTrackFolderPath, self.trackName)
+ if self.trackSettings["long_label"]:
+ self.trackLabel = self.trackSettings["long_label"]
+ else:
+ self.trackLabel = self.trackName
+ if "trackType" in self.trackSettings and self.trackSettings["trackType"]:
+ self.trackType = self.trackSettings["trackType"]
+ if self.trackSettings["group_name"]:
+ self.extraSettings["category"] = self.trackSettings["group_name"]
+ if "track_color" in self.trackSettings and self.trackSettings["track_color"]:
+ self.extraSettings["color"] = self.trackSettings["track_color"]
+
+
+ @abc.abstractmethod
+ def createTrack(self):
+ """Create the final track file"""
+
+ def createTrackDb(self):
+ if self.trackType == 'HTMLFeatures':
+ self.track = HTMLFeatures(self.trackName, self.trackLabel, self.trackDataURL, self.trackType, self.dataType, self.extraSettings)
+ elif self.trackType == "CanvasFeatures":
+ self.track = CanvasFeatures(self.trackName, self.trackLabel, self.trackDataURL, self.trackType, self.dataType, self.extraSettings)
+ elif self.trackType == "bam":
+ self.track = BamFeatures(self.trackName, self.trackLabel, self.trackDataURL, self.trackType, self.dataType, self.extraSettings)
+ elif self.trackType == "bigwig":
+ self.track = BigwigFeatures(self.trackName, self.trackLabel, self.trackDataURL, self.trackType, self.dataType, self.extraSettings)
+ else:
+ logging.error("Cannot createTrackDb, because trackType is not defined or invalid! trackType = %s", self.trackType)
+ self.track.createTrackDb()
+
+ #self.track = TrackDb(self.trackName, self.trackLabel, self.trackDataURL, self.trackType, self.dataType, self.extraSettings)
+
+
\ No newline at end of file
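
For reference, a concrete Datatype subclass only has to supply the two abstract methods. The sketch below follows the same pattern as the binary datatypes in this changeset; the Gff3 class name is illustrative and assumes the shared DataValidation validator handles the format.

    import shutil
    from datatypes.Datatype import Datatype
    from datatypes.validators.DataValidation import DataValidation

    class Gff3(Datatype):                 # illustrative subclass
        def __init__(self, input_gff3_path, data_gff3):
            super(Gff3, self).__init__()
            self.inputFile = input_gff3_path
            self.trackSettings = data_gff3
            self.dataType = 'gff3'

        def validateData(self):
            # Delegate format checking to the shared validator
            self.validator = DataValidation(self.inputFile, self.dataType,
                                            self.chromSizesFile.name)
            self.validator.validate()

        def createTrack(self):
            # Stage the flat file into the track folder for later JBrowse conversion
            shutil.copy(self.inputFile, self.trackDataURL)
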
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/Datatype.pyc
Binary file datatypes/Datatype.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/__init__.py
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/__init__.pyc
Binary file datatypes/__init__.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/binary/Bam.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/binary/Bam.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,51 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
+"""
+Class to handle Bam files to UCSC TrackHub
+"""
+
+import logging
+import os
+import shutil
+
+from Binary import Binary
+from datatypes.validators.DataValidation import DataValidation
+from util import subtools
+
+
+
+
+class Bam(Binary):
+ def __init__(self, input_bam_false_path, data_bam):
+ super(Bam, self).__init__()
+ self.inputFile = input_bam_false_path
+ self.trackSettings = data_bam
+ self.dataType = "bam"
+ self.trackType = "bam"
+
+
+ def validateData(self):
+ self.validator = DataValidation(self.inputFile, self.dataType, self.chromSizesFile.name)
+ self.validator.validate()
+
+ def createTrack(self):
+ #shutil.copy(self.inputFile, self.trackDataURL)
+ extension = os.path.splitext(self.trackName)[1]
+ if extension != '.bam':
+ self.trackName = self.trackName + '.bam'
+ self.trackDataURL = os.path.join(self.myBinaryFolderPath, self.trackName)
+ #self.trackDataURL = os.path.join(self.myTrackFolderPath, self.trackName)
+ shutil.copyfile(self.inputFile, self.trackDataURL)
+ bam_index = subtools.createBamIndex(self.inputFile)
+ indexName = os.path.basename(bam_index)
+ trackIndexURL = os.path.join(self.myBinaryFolderPath, indexName)
+ #trackIndexURL = os.path.join(self.myTrackFolderPath, indexName)
+ shutil.copyfile(bam_index, trackIndexURL)
+ self.extraSettings['index'] = indexName
+
+
+
+
+
+
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/binary/Bam.pyc
Binary file datatypes/binary/Bam.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/binary/BigBed.pyc
Binary file datatypes/binary/BigBed.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/binary/BigWig.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/binary/BigWig.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,31 @@
+#!/usr/bin/python
+
+import os
+import shutil
+from subprocess import Popen, PIPE
+import re
+
+# Internal dependencies
+from Binary import Binary
+from datatypes.validators.DataValidation import DataValidation
+
+
+
+class BigWig(Binary):
+ def __init__(self, input_bigwig_path, data_bigwig):
+ super(BigWig, self).__init__()
+ self.inputFile = input_bigwig_path
+ self.trackSettings = data_bigwig
+ self.dataType = "bigWig"
+ self.trackType= "bigwig"
+
+ def initSettings(self):
+ super(BigWig, self).initSettings()
+ if 'style' in self.trackSettings:
+ self.extraSettings['style'] = self.trackSettings['style']
+
+ def validateData(self):
+ self.validator = DataValidation(self.inputFile, self.dataType, self.chromSizesFile.name)
+ self.validator.validate()
+
+
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/binary/BigWig.pyc
Binary file datatypes/binary/BigWig.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/binary/Binary.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/binary/Binary.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,38 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
+"""
+Super Class of the managed datatype
+"""
+
+import os
+import tempfile
+import collections
+import shutil
+import util
+from TrackDb import TrackDb
+from datatypes.Datatype import Datatype
+
+
+class Binary(Datatype):
+
+ def __init__(self):
+ super(Binary, self).__init__()
+
+
+ def initSettings(self):
+ super(Binary, self).initSettings()
+ self.trackDataURL = os.path.join(self.myBinaryFolderPath, self.trackName)
+
+
+ def createTrack(self):
+ shutil.copy(self.inputFile, self.trackDataURL)
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/binary/Binary.pyc
Binary file datatypes/binary/Binary.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/binary/__init__.py
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/binary/__init__.pyc
Binary file datatypes/binary/__init__.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/converters/BedConversion.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/converters/BedConversion.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,154 @@
+#!/usr/bin/env python
+
+'''
+Convert BED format to gff3
+reference for gff3: https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
+'''
+import os
+import tempfile
+from collections import OrderedDict
+
+from util import subtools
+from DataConversion import DataConversion
+
+class BedConversion(DataConversion):
+    def __init__(self, inputBedFile, outputFile, chromSizesFile, bedType, trackType, options=None):
+        super(BedConversion, self).__init__(inputBedFile, outputFile, chromSizesFile, bedType, options)
+        self.trackType = trackType    # JBrowse track type used by flatfile_to_json
+
+
+ def convertFormats(self):
+ self.dataToJson()
+
+
+ def dataToJson(self):
+ if self.dataType != 'bed':
+ self.convertToGff3()
+            self.inputFile = self.gff3_file.name
+            self.dataType = 'gff'
+ subtools.flatfile_to_json(self.inputFile, self.dataType, self.trackType, self.trackLabel, self.outputFile, self.options)
+
+ def convertToGff3(self):
+ self.gff3_file = tempfile.NamedTemporaryFile(suffix=".gff3")
+ if self.dataType == "trfbig":
+ self.trfbig_to_gff3()
+ elif self.dataType == "regtools":
+ self.splicejunctions_to_gff3()
+ elif self.dataType == "blat":
+ self.bigpsl_to_gff3()
+ else:
+ raise ValueError("dataType %s is not support for converting to GFF3", self.dataType)
+
+ def trfbig_to_gff3(self):
+ gff3 = open(self.gff3_file.name, 'w')
+ gff3.write("##gff-version 3\n")
+ sizes_dict = subtools.sequence_region(self.chromSizesFile)
+ seq_regions = dict()
+ with open(self.inputFile, 'r') as bed:
+ for line in bed:
+ field = OrderedDict()
+ attribute = OrderedDict()
+ li = line.rstrip().split("\t")
+ field['seqid'] = li[0]
+ if field['seqid'] not in seq_regions:
+ end_region = sizes_dict[field['seqid']]
+ gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n')
+ seq_regions[field['seqid']] = end_region
+ field['source'] = li[3]
+ field['type'] = 'tandem_repeat'
+ # The first base in a chromosome is numbered 0 in BED format
+ field['start'] = str(int(li[1]) + 1)
+ field['end'] = li[2]
+ field['score'] = li[9]
+ field['strand'] = '+'
+ field['phase'] = '.'
+ attribute['length of repeat unit'] = li[4]
+ attribute['mean number of copies of repeat'] = li[5]
+ attribute['length of consensus sequence'] = li[6]
+ attribute['percentage match'] = li[7]
+ attribute['percentage indel'] = li[8]
+ attribute['percent of a\'s in repeat unit'] = li[10]
+ attribute['percent of c\'s in repeat unit'] = li[11]
+ attribute['percent of g\'s in repeat unit'] = li[12]
+ attribute['percent of t\'s in repeat unit'] = li[13]
+ attribute['entropy'] = li[14]
+ attribute['sequence of repeat unit element'] = li[15]
+ subtools.write_features(field, attribute, gff3)
+ gff3.close()
+
+
+ def splicejunctions_to_gff3(self):
+ gff3 = open(self.gff3_file.name, 'w')
+ gff3.write("##gff-version 3\n")
+ sizes_dict = subtools.sequence_region(self.chromSizesFile)
+ seq_regions = dict()
+ with open(self.inputFile, 'r') as bed:
+ for line in bed:
+ field = OrderedDict()
+ attribute = OrderedDict()
+ li = line.rstrip().split("\t")
+ field['seqid'] = li[0]
+ if field['seqid'] not in seq_regions:
+ end_region = sizes_dict[field['seqid']]
+ gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n')
+ seq_regions[field['seqid']] = end_region
+ field['source'] = li[3]
+ field['type'] = 'junction'
+ # The first base in a chromosome is numbered 0 in BED format
+ field['start'] = int(li[1]) + 1
+ field['end'] = li[2]
+ field['score'] = li[12]
+ field['strand'] = li[5]
+ field['phase'] = '.'
+ attribute['ID'] = li[0] + '_' + li[3]
+ attribute['Name'] = li[3]
+ attribute['blockcount'] = li[9]
+ attribute['blocksizes'] = li[10]
+ attribute['chromstarts'] = li[11]
+ subtools.write_features(field, attribute, gff3)
+ subtools.child_blocks(field, attribute, gff3, 'exon_junction')
+ gff3.close()
+
+ def bigpsl_to_gff3(self):
+ gff3 = open(self.gff3_file.name, 'w')
+ gff3.write("##gff-version 3\n")
+ sizes_dict = subtools.sequence_region(self.chromSizesFile)
+ seq_regions = dict()
+ with open(self.inputFile, 'r') as bed:
+ for line in bed:
+ field = OrderedDict()
+ attribute = OrderedDict()
+ li = line.rstrip().split("\t")
+ field['seqid'] = li[0]
+ if field['seqid'] not in seq_regions:
+ end_region = sizes_dict[field['seqid']]
+ gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n')
+ seq_regions[field['seqid']] = end_region
+ field['source'] = 'UCSC BLAT alignment tool'
+ field['type'] = 'match'
+ # The first base in a chromosome is numbered 0 in BED format
+ field['start'] = str(int(li[1]) + 1)
+ field['end'] = li[2]
+ field['score'] = li[4]
+ field['strand'] = li[5]
+ field['phase'] = '.'
+ attribute['ID'] = li[0] + '_' + li[3]
+ attribute['Name'] = li[3]
+ attribute['blockcount'] = li[9]
+ attribute['blocksizes'] = li[10]
+ attribute['chromstarts'] = li[11]
+ attribute['ochrom_start'] = li[12]
+ attribute['ochrom_end'] = li[13]
+ attribute['ochrom_strand'] = li[14]
+ attribute['ochrom_size'] = li[15]
+ attribute['ochrom_starts'] = li[16]
+ attribute['sequence on other chromosome'] = li[17]
+ attribute['cds in ncbi format'] = li[18]
+ attribute['size of target chromosome'] = li[19]
+ attribute['number of bases matched'] = li[20]
+ attribute['number of bases that don\'t match'] = li[21]
+ attribute['number of bases that match but are part of repeats'] = li[22]
+ attribute['number of \'N\' bases'] = li[23]
+ subtools.write_features(field, attribute, gff3)
+ subtools.child_blocks(field, attribute, gff3, 'match_part')
+ gff3.close()
+
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/converters/BedConversion.pyc
Binary file datatypes/converters/BedConversion.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/converters/DataConversion.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/converters/DataConversion.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,51 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
+"""
+This class handles the subprocess calls of the different tools used
+in HubArchiveCreator
+"""
+
+import logging
+import os
+import subprocess
+import sys
+import string
+import tempfile
+
+from bedToGff3 import bedToGff3
+from blastxmlToGff3 import blastxmlToGff3
+from gtfToGff3 import gtfToGff3
+
+
+
+
+class DataConversion(object):
+ CONVERT_OPERATIONS = {("bed", "gff"): "bedtogff3",
+ ("blastxml", "gff"): "blastxmltogff3",
+ ("gtf", "gff"): "gtftogff3"}
+
+ def __init__(self, inputFile, outputFile, chromSizesFile, operateType, options=None):
+ if not operateType:
+ return
+ if not inputFile:
+ raise TypeError("the input file is not specified!\n")
+ self.inputFile = inputFile
+ self.chromSizesFile = chromSizesFile
+ self.outputFile = outputFile
+ self.operateType = operateType
+ self.options = options
+
+
+
+ def convertFormats(self):
+ """ Convert data into JBrowse track """
+ convertMethod = self.CONVERT_OPERATIONS[self.operateType]
+ if convertMethod == "bedtogff3":
+ bedToGff3(self.inputFile, self.chromSizesFile, self.outputFile, self.options)
+ elif convertMethod == "blastxmltogff3":
+ blastxmlToGff3(self.inputFile, self.outputFile)
+ elif convertMethod == "gtftogff3":
+ gtfToGff3(self.inputFile, self.outputFile, self.chromSizesFile)
+ else:
+ raise ValueError("the operation %s is not defined!\n", self.operateType)
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/converters/DataConversion.pyc
Binary file datatypes/converters/DataConversion.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/converters/__init__.py
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/converters/__init__.pyc
Binary file datatypes/converters/__init__.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/converters/bedToGff3.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/converters/bedToGff3.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,133 @@
+#!/usr/bin/env python
+
+'''
+Convert BED format to gff3
+reference for gff3: https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
+'''
+import os
+from collections import OrderedDict
+from util import subtools
+
+def bedToGff3(inputBedFile, chrom_sizes, output, bed_type):
+ if bed_type == "trfbig":
+ trfbig_to_gff3(inputBedFile, chrom_sizes, output)
+ if bed_type == "regtools":
+ splicejunctions_to_gff3(inputBedFile, chrom_sizes, output)
+ if bed_type == "blat":
+ bigpsl_to_gff3(inputBedFile, chrom_sizes, output)
+
+def trfbig_to_gff3(inputBedFile, chrom_sizes, output):
+ gff3 = open(output, 'w')
+ gff3.write("##gff-version 3\n")
+ sizes_dict = subtools.sequence_region(chrom_sizes)
+ seq_regions = dict()
+ with open(inputBedFile, 'r') as bed:
+ for line in bed:
+ field = OrderedDict()
+ attribute = OrderedDict()
+ li = line.rstrip().split("\t")
+ field['seqid'] = li[0]
+ if field['seqid'] not in seq_regions:
+ end_region = sizes_dict[field['seqid']]
+ gff3.write("##sequence-region " +
+ field['seqid'] + ' 1 ' + str(end_region) + '\n')
+ seq_regions[field['seqid']] = end_region
+ field['source'] = li[3]
+ field['type'] = 'tandem_repeat'
+ # The first base in a chromosome is numbered 0 in BED format
+ field['start'] = str(int(li[1]) + 1)
+ field['end'] = li[2]
+ field['score'] = li[9]
+ field['strand'] = '+'
+ field['phase'] = '.'
+ attribute['length of repeat unit'] = li[4]
+ attribute['mean number of copies of repeat'] = li[5]
+ attribute['length of consensus sequence'] = li[6]
+ attribute['percentage match'] = li[7]
+ attribute['percentage indel'] = li[8]
+ attribute['percent of a\'s in repeat unit'] = li[10]
+ attribute['percent of c\'s in repeat unit'] = li[11]
+ attribute['percent of g\'s in repeat unit'] = li[12]
+ attribute['percent of t\'s in repeat unit'] = li[13]
+ attribute['entropy'] = li[14]
+ attribute['sequence of repeat unit element'] = li[15]
+ subtools.write_features(field, attribute, gff3)
+ gff3.close()
+
+def splicejunctions_to_gff3(inputBedFile, chrom_sizes, output):
+ gff3 = open(output, 'w')
+ gff3.write("##gff-version 3\n")
+ sizes_dict = subtools.sequence_region(chrom_sizes)
+ seq_regions = dict()
+ with open(inputBedFile, 'r') as bed:
+ for line in bed:
+ field = OrderedDict()
+ attribute = OrderedDict()
+ li = line.rstrip().split("\t")
+ field['seqid'] = li[0]
+ if field['seqid'] not in seq_regions:
+ end_region = sizes_dict[field['seqid']]
+ gff3.write("##sequence-region " +
+ field['seqid'] + ' 1 ' + str(end_region) + '\n')
+ seq_regions[field['seqid']] = end_region
+ field['source'] = li[3]
+ field['type'] = 'junction'
+ # The first base in a chromosome is numbered 0 in BED format
+ field['start'] = int(li[1]) + 1
+ field['end'] = li[2]
+ field['score'] = li[12]
+ field['strand'] = li[5]
+ field['phase'] = '.'
+ attribute['ID'] = li[0] + '_' + li[3]
+ attribute['Name'] = li[3]
+ attribute['blockcount'] = li[9]
+ attribute['blocksizes'] = li[10]
+ attribute['chromstarts'] = li[11]
+ subtools.write_features(field, attribute, gff3)
+ subtools.child_blocks(field, attribute, gff3, 'exon_junction')
+ gff3.close()
+
+def bigpsl_to_gff3(inputBedFile, chrom_sizes, output):
+ gff3 = open(output, 'w')
+ gff3.write("##gff-version 3\n")
+ sizes_dict = subtools.sequence_region(chrom_sizes)
+ seq_regions = dict()
+ with open(inputBedFile, 'r') as bed:
+ for line in bed:
+ field = OrderedDict()
+ attribute = OrderedDict()
+ li = line.rstrip().split("\t")
+ field['seqid'] = li[0]
+ if field['seqid'] not in seq_regions:
+ end_region = sizes_dict[field['seqid']]
+ gff3.write("##sequence-region " +
+ field['seqid'] + ' 1 ' + str(end_region) + '\n')
+ seq_regions[field['seqid']] = end_region
+ field['source'] = 'UCSC BLAT alignment tool'
+ field['type'] = 'match'
+ # The first base in a chromosome is numbered 0 in BED format
+ field['start'] = str(int(li[1]) + 1)
+ field['end'] = li[2]
+ field['score'] = li[4]
+ field['strand'] = li[5]
+ field['phase'] = '.'
+ attribute['ID'] = li[0] + '_' + li[3]
+ attribute['Name'] = li[3]
+ attribute['blockcount'] = li[9]
+ attribute['blocksizes'] = li[10]
+ attribute['chromstarts'] = li[11]
+ attribute['ochrom_start'] = li[12]
+ attribute['ochrom_end'] = li[13]
+ attribute['ochrom_strand'] = li[14]
+ attribute['ochrom_size'] = li[15]
+ attribute['ochrom_starts'] = li[16]
+ attribute['sequence on other chromosome'] = li[17]
+ attribute['cds in ncbi format'] = li[18]
+ attribute['size of target chromosome'] = li[19]
+ attribute['number of bases matched'] = li[20]
+ attribute['number of bases that don\'t match'] = li[21]
+ attribute['number of bases that match but are part of repeats'] = li[22]
+ attribute['number of \'N\' bases'] = li[23]
+ subtools.write_features(field, attribute, gff3)
+ subtools.child_blocks(field, attribute, gff3, 'match_part')
+ gff3.close()
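
Because BED starts are 0-based while GFF3 is 1-based, each converter above shifts the start by +1. A hedged call into this module, with placeholder paths:

    from datatypes.converters.bedToGff3 import bedToGff3

    # regtools splice junctions -> GFF3 'junction' features (paths are placeholders)
    bedToGff3('junctions.bed', 'dbia3.chrom.sizes', 'junctions.gff3', 'regtools')
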
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/converters/bedToGff3.pyc
Binary file datatypes/converters/bedToGff3.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/converters/blastxmlToGff3.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/converters/blastxmlToGff3.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,158 @@
+#!/usr/bin/env python
+
+
+from Bio.Blast import NCBIXML
+from collections import OrderedDict
+from util import subtools
+
+
+def align2cigar(hsp_query, hsp_reference):
+ """
+ Build CIGAR representation from an hsp_query
+ input:
+ hsp_query
+ hsp_sbjct
+ output:
+ CIGAR string
+ """
+ query = hsp_query
+ ref = hsp_reference
+ # preType, curType:
+ # 'M' represents match,
+ # 'I' represents insert a gap into the reference sequence,
+ # 'D' represents insert a gap into the target (delete from reference)
+ # some ideas of this algin2cigar function are coming from
+ # https://gist.github.com/ozagordi/099bdb796507da8d9426
+ prevType = 'M'
+ curType = 'M'
+ count = 0
+ cigar = []
+ num = len(query)
+ for i in range(num):
+ if query[i] == '-':
+ curType = 'D'
+ elif ref[i] == '-':
+ curType = 'I'
+ else:
+ curType = 'M'
+ if curType == prevType:
+ count += 1
+ else:
+ cigar.append('%s%d' % (prevType, count))
+ prevType = curType
+ count = 1
+ cigar.append('%s%d' % (curType, count))
+ return ' '.join(cigar)
+
+def gff3_writer(blast_records, gff3_file):
+ gff3 = open(gff3_file, 'a')
+ gff3.write("##gff-version 3\n")
+ seq_regions = dict()
+ for blast_record in blast_records:
+ query_name = blast_record.query.split(" ")[0]
+ source = blast_record.application
+ method = blast_record.matrix
+ for alignment in blast_record.alignments:
+ group = {
+ "parent_field" : OrderedDict(),
+ "parent_attribute" : OrderedDict(),
+ "alignments" : []
+ }
+ title = alignment.title.split(" ")
+ contig_name = title[len(title) - 1]
+ length = alignment.length
+ group['parent_field']['seqid'] = contig_name
+ group['parent_field']['source'] = source
+ group['parent_field']['type'] = 'match'
+ group['parent_attribute']['ID'] = contig_name + '_' + query_name
+ group['parent_attribute']['method'] = method
+ group['parent_attribute']['length'] = length
+ if contig_name not in seq_regions:
+ gff3.write("##sequence-region " + contig_name + ' 1 ' + str(length) + '\n')
+ seq_regions[contig_name] = length
+ match_num = 0
+ coords = [length, 0]
+ for hsp in alignment.hsps:
+ hsp_align = {}
+ field = OrderedDict()
+ attribute = OrderedDict()
+ ref = hsp.sbjct
+ query = hsp.query
+ field['seqid'] = contig_name
+ field['source'] = source
+ field['type'] = 'match_part'
+
+ field['start'] = hsp.sbjct_start
+ if field['start'] < coords[0]:
+ coords[0] = field['start']
+ ref_length = len(ref.replace('-', ''))
+ # if run tblastn, the actual length of reference should be multiplied by 3
+ if source.lower() == "tblastn":
+ ref_length *= 3
+ field['end'] = field['start'] + ref_length - 1
+ if field['end'] > coords[1]:
+ coords[1] = field['end']
+ field['score'] = hsp.score
+ # decide whether the alignment is on the same strand or the reverse strand,
+ # from the pair of reading frames:
+ # (+, +), (0, 0), (-, -) => +
+ # (+, -), (-, +) => -
+ if hsp.frame[1] * hsp.frame[0] > 0:
+ field['strand'] = '+'
+ elif hsp.frame[1] * hsp.frame[0] < 0:
+ field['strand'] = '-'
+ else:
+ if hsp.frame[0] + hsp.frame[1] >= 0:
+ field['strand'] = '+'
+ else:
+ field['strand'] = '-'
+ field['phase'] = '.'
+
+ target_start = hsp.query_start
+ target_len = len(query.replace('-', ''))
+ # for blastx the aligned query is in amino acids, so its nucleotide length on the query is 3x
+ if source.lower() == "blastx":
+ target_len *= 3
+ target_end = target_start + target_len -1
+ attribute['ID'] = group['parent_attribute']['ID'] + '_match_' + str(match_num)
+ attribute['Parent'] = group['parent_attribute']['ID']
+ attribute['Target'] = query_name + " " + str(target_start) + " " + str(target_end)
+ attribute['Gap'] = align2cigar(query, ref)
+ # store the query sequence and midline in the file so the BlastAlignment plugin can display the alignment
+ attribute['subject'] = hsp.sbjct
+ attribute['query'] = hsp.query
+ attribute['match'] = hsp.match
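+ # in a BLAST midline, a letter marks an identity, '+' a conservative substitution, and a space a mismatch or gap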
+ attribute['gaps'] = attribute['match'].count(' ')
+ similar = attribute['match'].count('+')
+ attribute['identities'] = len(attribute['match']) - similar - attribute['gaps']
+ attribute['positives'] = attribute['identities'] + similar
+ attribute['expect'] = hsp.expect
+ # show the reading frame attribute only if the frame is not (0, 0)
+ if hsp.frame != (0, 0):
+ attribute['frame'] = hsp.frame[1]
+ match_num += 1
+ hsp_align['field'] = field
+ hsp_align['attribute'] = attribute
+ group['alignments'].append(hsp_align)
+ group['parent_field']['start'] = coords[0]
+ group['parent_field']['end'] = coords[1]
+ group['parent_field']['score'] = group['parent_field']['strand'] = group['parent_field']['phase'] = '.'
+ group['parent_attribute']['match_num'] = match_num
+ group['alignments'].sort(key=lambda x: (x['field']['start'], x['field']['end']))
+ subtools.write_features(group['parent_field'], group['parent_attribute'], gff3)
+ prev_end = -1
+ for align in group['alignments']:
+ overlap = ''
+ if align['field']['start'] <= prev_end:
+ overlap += str(align['field']['start']) + ',' + str(prev_end)
+ prev_end = align['field']['end']
+ align['attribute']['overlap'] = overlap
+ subtools.write_features(align['field'], align['attribute'], gff3)
+ gff3.close()
+
+def blastxmlToGff3(xml_file, gff3_file):
+ result_handle = open(xml_file)
+ blast_records = NCBIXML.parse(result_handle)
+ gff3_writer(blast_records, gff3_file)
+
+if __name__ == "__main__":
+ blastxmlToGff3("../dbia3/raw/tblastn_dmel-hits-translation-r6.11.fa_vs_nucleotide_BLAST_database_from_data_3.blastxml", "gff3.txt")
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/converters/blastxmlToGff3.pyc
Binary file datatypes/converters/blastxmlToGff3.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/converters/gtfToGff3.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/converters/gtfToGff3.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+
+'''
+Convert GTF format to GFF3
+reference for gff3: https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
+'''
+import os
+from collections import OrderedDict
+from util import subtools
+
+
+
+
+def gtfToGff3(gtf_file, gff3_file, chrom_sizes):
+ """
+ Convert GTF output from StringTie to GFF3 format
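+ A transcript line's attribute column looks like (hypothetical values):
+ gene_id "STRG.1"; transcript_id "STRG.1.1"; cov "9.3"; FPKM "3.4"; TPM "5.1";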
+ """
+ gff3 = open(gff3_file, 'w')
+ gff3.write("##gff-version 3\n")
+ sizes_dict = subtools.sequence_region(chrom_sizes)
+ seq_regions = dict()
+ parents = dict()
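+ # maps gene_id -> ID of its most recent transcript feature, so exon lines can reference their parent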
+ with open(gtf_file, 'r') as gtf:
+ for line in gtf:
+ if line.startswith('#') or not line.strip():
+ continue
+ field = OrderedDict()
+ attribute = OrderedDict()
+ li = line.rstrip().split("\t")
+ #print li
+ field['seqid'] = li[0]
+ #print field['seqid']
+ if field['seqid'] not in seq_regions:
+ end_region = sizes_dict[field['seqid']]
+ gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n')
+ seq_regions[field['seqid']] = end_region
+ field['source'] = li[1]
+ field['type'] = li[2]
+ # GTF, like GFF3, uses 1-based fully-closed coordinates, so start/end pass through unchanged
+ field['start'] = li[3]
+ field['end'] = li[4]
+ field['score'] = li[5]
+ field['strand'] = li[6]
+ field['phase'] = li[7]
+ attr_li = li[8].split(';')
+ gene_id = attr_li[0].split()[1].strip('"')
+ attribute['ID'] = gene_id + '_' + field['type'] + '_' + str(field['start']) + '_' + str(field['end'])
+ if field['type'] == 'transcript':
+ parents[gene_id] = attribute['ID']
+ attribute['transcript_id'] = attr_li[1].split()[1].strip('"')
+ attribute['coverage'] = attr_li[2].split()[1].strip('"')
+ attribute['fpkm'] = attr_li[3].split()[1].strip('"')
+ attribute['tpm'] = attr_li[4].split()[1].strip('"')
+ elif field['type'] == 'exon':
+ attribute['Parent'] = parents[gene_id]
+ attribute['transcript_id'] = attr_li[1].split()[1].strip('"')
+ attribute['coverage'] = attr_li[3].split()[1].strip('"')
+ subtools.write_features(field, attribute, gff3)
+ gff3.close()
+
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/converters/gtfToGff3.pyc
Binary file datatypes/converters/gtfToGff3.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/Bed.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/interval/Bed.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,48 @@
+#!/usr/bin/python
+
+import os
+import tempfile
+import logging
+import shutil
+
+# Internal dependencies
+from Interval import Interval
+from datatypes.validators.DataValidation import DataValidation
+from datatypes.converters.DataConversion import DataConversion
+
+class Bed(Interval):
+ def __init__(self, inputBedGeneric, data_bed_generic):
+ super(Bed, self).__init__()
+ self.inputFile = inputBedGeneric
+ self.trackSettings = data_bed_generic
+ self.bedFields = None
+ self.extFields = None
+ self.dataType = "bed"
+
+ def createTrack(self):
+ shutil.copyfile(self.inputFile, self.trackDataURL)
+
+ def validateData(self):
+ self.validator = DataValidation(self.inputFile, self.getValidateType(), self.chromSizesFile.name)
+ self.validator.validate()
+
+ def _getBedFields(self):
+ """count number of bed fields for generic bed format"""
+ with open(self.inputFile, 'r') as bed:
+ l = bed.readline().split()
+ return len(l)
+
+ def getValidateType(self):
+ if not self.bedFields:
+ self.bedFields = self._getBedFields()
+ logging.debug("bedFields was not defined; treating the file as generic Bed, datatype = bed%s", str(self.bedFields))
+ return self.dataType + str(self.bedFields)
+ elif not self.extFields:
+ return self.dataType + str(self.bedFields)
+ else:
+ return self.dataType + str(self.bedFields) + "+" + str(self.extFields)
+
+
+
+
+
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/Bed.pyc
Binary file datatypes/interval/Bed.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/BedBlastAlignments.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/interval/BedBlastAlignments.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,25 @@
+#!/usr/bin/python
+
+import os
+import tempfile
+import string
+
+from BigPsl import BigPsl
+from datatypes.converters.DataConversion import DataConversion
+from util import subtools
+
+
+class BedBlastAlignments( BigPsl ):
+ def __init__(self, input_bed_blast_alignments_false_path, data_bed_blast_alignments):
+
+ super(BedBlastAlignments, self).__init__(input_bed_blast_alignments_false_path, data_bed_blast_alignments)
+ #self.seqType = 1
+ self.trackType = "G-OnRamp_plugin/BlastAlignment"
+
+ def initSettings(self):
+ super(BedBlastAlignments, self).initSettings()
+ self.extraSettings["subfeatureClasses"] = "match_part"
+
+
+
+
\ No newline at end of file
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/BedBlastAlignments.pyc
Binary file datatypes/interval/BedBlastAlignments.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/BedBlatAlignments.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/interval/BedBlatAlignments.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,23 @@
+#!/usr/bin/python
+
+import os
+import tempfile
+import string
+
+from BigPsl import BigPsl
+from datatypes.converters.DataConversion import DataConversion
+from util import subtools
+
+
+class BedBlatAlignments( BigPsl ):
+ def __init__(self, input_bed_blat_alignments_false_path, data_bed_blat_alignments):
+
+ super(BedBlatAlignments, self).__init__(input_bed_blat_alignments_false_path, data_bed_blat_alignments)
+ #self.seqType = 1
+ #self.trackType = "G-OnRamp_plugin/BlatAlignment"
+
+ def initSettings(self):
+ super(BedBlatAlignments, self).initSettings()
+ self.extraSettings["subfeatureClasses"] = "match_part"
+
+
\ No newline at end of file
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/BedBlatAlignments.pyc
Binary file datatypes/interval/BedBlatAlignments.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/BedSimpleRepeats.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/interval/BedSimpleRepeats.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,33 @@
+#!/usr/bin/python
+
+import os
+import tempfile
+
+from Bed import Bed
+from datatypes.validators.DataValidation import DataValidation
+from datatypes.converters.DataConversion import DataConversion
+
+
+class BedSimpleRepeats( Bed ):
+ def __init__(self, input_bed_simple_repeats_false_path, data_bed_simple_repeats):
+
+ super(BedSimpleRepeats, self).__init__(input_bed_simple_repeats_false_path, data_bed_simple_repeats)
+ self.bedFields = 4
+ self.extFields = 12
+ self.autoSql = os.path.join(self.tool_directory, 'trf_simpleRepeat.as')
+ self.trackFileType = "gff"
+
+
+
+ def validateData(self):
+ self.validateOptions = self.getValidateOptions(tab="True", autoSql=self.autoSql)
+ self.validator = DataValidation(self.inputFile, self.getValidateType(), self.chromSizesFile.name, self.validateOptions)
+ self.validator.validate()
+
+
+ def createTrack(self):
+ self.convertType = self.getConvertType()
+ self.converter = DataConversion(self.inputFile, self.trackDataURL, self.chromSizesFile.name, self.convertType, 'trfbig')
+ self.converter.convertFormats()
+ self.dataType = self.trackFileType
+
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/BedSimpleRepeats.pyc
Binary file datatypes/interval/BedSimpleRepeats.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/BedSpliceJunctions.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/interval/BedSpliceJunctions.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,36 @@
+#!/usr/bin/python
+
+import os
+import tempfile
+
+from Bed import Bed
+from datatypes.validators.DataValidation import DataValidation
+from datatypes.converters.DataConversion import DataConversion
+
+
+
+class BedSpliceJunctions( Bed ):
+ def __init__(self, input_bed_splice_junctions_false_path, data_bed_splice_junctions):
+
+ super(BedSpliceJunctions, self).__init__(input_bed_splice_junctions_false_path, data_bed_splice_junctions)
+ self.bedFields = 12
+ self.extFields = 1
+ self.autoSql = os.path.join(self.tool_directory, 'spliceJunctions.as')
+ self.trackFileType = "gff"
+
+ def initSettings(self):
+ super(BedSpliceJunctions, self).initSettings()
+ self.extraSettings["glyph"] = "JBrowse/View/FeatureGlyph/Segments"
+ self.extraSettings["subfeatureClasses"] = "exon_junction"
+
+ def validateData(self):
+ self.validateOptions = self.getValidateOptions(tab="True", autoSql=self.autoSql)
+ self.validator = DataValidation(self.inputFile, self.getValidateType(), self.chromSizesFile.name, self.validateOptions)
+ self.validator.validate()
+
+ def createTrack(self):
+ self.convertType = self.getConvertType()
+ self.converter = DataConversion(self.inputFile, self.trackDataURL, self.chromSizesFile.name, self.convertType, 'regtools')
+ self.converter.convertFormats()
+ self.dataType = self.trackFileType
+
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/BedSpliceJunctions.pyc
Binary file datatypes/interval/BedSpliceJunctions.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/BigPsl.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/interval/BigPsl.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,53 @@
+#!/usr/bin/python
+
+import os
+import tempfile
+import string
+
+from Interval import Interval
+from util.index.DatabaseIndex import DatabaseIndex
+from util.index.TrixIndex import TrixIndex
+from datatypes.validators.DataValidation import DataValidation
+from datatypes.converters.DataConversion import DataConversion
+
+
+class BigPsl(Interval):
+ def __init__(self, input_bigpsl_false_path, data_bigpsl):
+
+ super(BigPsl, self).__init__()
+ self.inputFile = input_bigpsl_false_path
+ self.trackSettings = data_bigpsl
+ self.dataType = "bed"
+ self.bedFields = 12
+ self.extFields = 12
+ #self.seqType = None
+ self.autoSql = os.path.join(self.tool_directory, 'bigPsl.as')
+
+ def initSettings(self):
+ super(BigPsl, self).initSettings()
+ self.extraSettings["glyph"] = "JBrowse/View/FeatureGlyph/Segments"
+ #self.extraSettings["subfeatureClasses"] = "match_part"
+
+ def validateData(self):
+ self.validateOptions = self.getValidateOptions(tab="True", autoSql=self.autoSql)
+ self.validator = DataValidation(self.inputFile, self.getValidateType(), self.chromSizesFile.name, self.validateOptions)
+ self.validator.validate()
+
+ def createTrack(self):
+ self.convertType = self.getConvertType()
+ self.converter = DataConversion(self.inputFile, self.trackDataURL, self.chromSizesFile.name, self.convertType, 'blat')
+ self.converter.convertFormats()
+ self.dataType = self.trackFileType
+
+ def getValidateType(self):
+ if not self.bedFields or not self.extFields:
+ raise Exception("Invalid bigPsl format, no {0} or {1}".format("bedFields", "extFields"))
+ return self.dataType + str(self.bedFields) + "+" + str(self.extFields)
+
+ def _getSeqType(self):
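+ # a full bigPsl line has 25 columns; the last one (seqType) flags the query
+ # sequence type: 0 = empty, 1 = nucleotide, 2 = amino acid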
+ with open(self.inputFile, "r") as bigpsl:
+ sampleSeq = bigpsl.readline().split()
+ if len(sampleSeq) == 25:
+ return sampleSeq[-1]
+ else:
+ return None
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/BigPsl.pyc
Binary file datatypes/interval/BigPsl.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/BlastXml.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/interval/BlastXml.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,34 @@
+#!/usr/bin/python
+
+import os
+import tempfile
+import string
+
+from Interval import Interval
+from datatypes.converters.DataConversion import DataConversion
+from util import subtools
+
+
+class BlastXml( Interval ):
+ def __init__(self, input_blast_alignments_false_path, data_blast_alignments):
+
+ super(BlastXml, self).__init__()
+ self.inputFile = input_blast_alignments_false_path
+ self.trackSettings = data_blast_alignments
+ self.dataType = "blastxml"
+ #self.trackType = "G-OnRamp_plugin/BlatAlignment"
+
+ def initSettings(self):
+ super(BlastXml, self).initSettings()
+ self.extraSettings["glyph"] = "JBrowse/View/FeatureGlyph/Segments"
+ self.extraSettings["subfeatureClasses"] = "match_part"
+
+ def validateData(self):
+ return
+
+ def createTrack(self):
+ self.convertType = self.getConvertType()
+ self.converter = DataConversion(self.inputFile, self.trackDataURL, self.chromSizesFile.name, self.convertType)
+ self.converter.convertFormats()
+ self.dataType = self.trackFileType
+
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/BlastXml.pyc
Binary file datatypes/interval/BlastXml.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/CytoBand.pyc
Binary file datatypes/interval/CytoBand.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/Gff.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/interval/Gff.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,21 @@
+#!/usr/bin/python
+
+import os
+import tempfile
+import abc
+import shutil
+
+# Internal dependencies
+from Interval import Interval
+from datatypes.validators.DataValidation import DataValidation
+from datatypes.converters.DataConversion import DataConversion
+
+class Gff(Interval):
+ def __init__(self):
+ super(Gff, self).__init__()
+ self.autoSql = os.path.join(self.tool_directory, 'bigGenePred.as')
+
+
+ def createTrack(self):
+ shutil.copyfile(self.inputFile, self.trackDataURL)
+
\ No newline at end of file
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/Gff.pyc
Binary file datatypes/interval/Gff.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/Gff3.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/interval/Gff3.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,22 @@
+#!/usr/bin/python
+
+import os
+import tempfile
+
+# Internal dependencies
+from Gff import Gff
+from datatypes.validators.Gff3Validation import Gff3Validation
+
+
+class Gff3( Gff ):
+ def __init__(self, input_Gff3_false_path, data_gff3):
+ super( Gff3, self ).__init__()
+ self.inputFile = input_Gff3_false_path
+ self.trackSettings = data_gff3
+ self.dataType = "gff"
+
+
+ def validateData(self):
+ self.validator = Gff3Validation(self.inputFile, self.dataType, self.chromSizesFile.name)
+ self.inputFile = self.validator.validate()
+
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/Gff3.pyc
Binary file datatypes/interval/Gff3.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/Gff3_mrna.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/interval/Gff3_mrna.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,27 @@
+#!/usr/bin/python
+
+import os
+import tempfile
+
+# Internal dependencies
+from Gff import Gff
+from datatypes.validators.Gff3Validation import Gff3Validation
+
+
+class Gff3_mrna( Gff ):
+ def __init__(self, input_Gff3_false_path, data_gff3):
+ super( Gff3_mrna, self ).__init__()
+ self.inputFile = input_Gff3_false_path
+ self.trackSettings = data_gff3
+ self.dataType = "gff"
+ #self.trackType = "G-OnRamp_plugin/GenePred"
+
+ def initSettings(self):
+ super(Gff3_mrna, self).initSettings()
+ self.extraSettings["type"] = "mRNA"
+ self.extraSettings["subfeatureClasses"] = "CDS"
+
+ def validateData(self):
+ self.validator = Gff3Validation(self.inputFile, self.dataType, self.chromSizesFile.name)
+ self.inputFile = self.validator.validate()
+
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/Gff3_mrna.pyc
Binary file datatypes/interval/Gff3_mrna.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/Gff3_transcript.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/interval/Gff3_transcript.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,28 @@
+#!/usr/bin/python
+
+import os
+import tempfile
+
+# Internal dependencies
+from Gff import Gff
+from datatypes.validators.Gff3Validation import Gff3Validation
+
+
+class Gff3_transcript( Gff ):
+ def __init__(self, input_Gff3_false_path, data_gff3):
+ super( Gff3_transcript, self ).__init__()
+ self.inputFile = input_Gff3_false_path
+ self.trackSettings = data_gff3
+ self.dataType = "gff"
+ #self.trackType = "G-OnRamp_plugin/GenePred"
+
+ def initSettings(self):
+ super(Gff3_transcript, self).initSettings()
+ self.extraSettings["transcriptType"] = "transcript"
+ self.extraSettings["type"] = "transcript"
+ self.extraSettings["subfeatureClasses"] = "CDS"
+
+ def validateData(self):
+ self.validator = Gff3Validation(self.inputFile, self.dataType, self.chromSizesFile.name)
+ self.inputFile = self.validator.validate()
+
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/Gff3_transcript.pyc
Binary file datatypes/interval/Gff3_transcript.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/Gtf.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/interval/Gtf.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,33 @@
+#!/usr/bin/python
+
+import os
+import tempfile
+
+# Internal dependencies
+from Gff import Gff
+from datatypes.validators.GtfValidation import GtfValidation
+from datatypes.converters.DataConversion import DataConversion
+
+
+class Gtf(Gff):
+ def __init__( self, input_gtf_false_path, data_gtf):
+
+ super(Gtf, self).__init__()
+ self.inputFile = input_gtf_false_path
+ self.trackSettings = data_gtf
+ self.dataType = "gtf"
+
+ def initSettings(self):
+ super(Gtf, self).initSettings()
+ self.extraSettings["glyph"] = "JBrowse/View/FeatureGlyph/Segments"
+
+ def createTrack(self):
+ self.convertType = self.getConvertType()
+ self.converter = DataConversion(self.inputFile, self.trackDataURL, self.chromSizesFile.name, self.convertType)
+ self.converter.convertFormats()
+ self.dataType = self.trackFileType
+
+ def validateData(self):
+ self.validator = GtfValidation(self.inputFile, self.dataType, self.chromSizesFile.name)
+ self.inputFile = self.validator.validate()
+
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/Gtf.pyc
Binary file datatypes/interval/Gtf.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/GtfStringTie.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/interval/GtfStringTie.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,23 @@
+#!/usr/bin/python
+
+import os
+import tempfile
+
+# Internal dependencies
+from Gtf import Gtf
+from datatypes.validators.GtfValidation import GtfValidation
+from datatypes.converters.DataConversion import DataConversion
+
+
+class GtfStringTie(Gtf):
+ def __init__( self, input_gtf_false_path, data_gtf):
+
+ super(GtfStringTie, self).__init__(input_gtf_false_path, data_gtf)
+
+
+ def initSettings(self):
+ super(GtfStringTie, self).initSettings()
+ self.extraSettings["glyph"] = "JBrowse/View/FeatureGlyph/Segments"
+ self.extraSettings["subfeatureClasses"] = "UTR"
+
+
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/GtfStringTie.pyc
Binary file datatypes/interval/GtfStringTie.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/Interval.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/interval/Interval.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,42 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
+"""
+Super Class of the managed datatype
+"""
+
+import logging
+from datatypes.Datatype import Datatype
+
+
+class Interval(Datatype):
+
+ def __init__(self):
+ super(Interval, self).__init__()
+ if not Datatype.trackType:
+ self.trackType = "HTMLFeatures"
+ else:
+ self.trackType = Datatype.trackType
+ logging.debug("Using trackType = %s for feature tracks", self.trackType)
+ self.trackFileType = "gff"
+
+
+ def getValidateOptions(self, tab=None, autoSql=None):
+ options = dict()
+ if tab:
+ options["tab"] = tab
+ if autoSql:
+ options["autoSql"] = autoSql
+ return options
+
+ def getConvertType(self):
+ if not self.trackFileType or not self.dataType:
+ raise ValueError("dataType or trackFileType has not been set!")
+ return (self.dataType.lower(), self.trackFileType.lower())
+
+
+
+
+
+
+
\ No newline at end of file
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/Interval.pyc
Binary file datatypes/interval/Interval.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/Psl.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/interval/Psl.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,42 @@
+import logging
+import os
+import tempfile
+
+# Internal dependencies
+from Interval import Interval
+from datatypes.validators.PslValidation import PslValidation
+from datatypes.converters.DataConversion import DataConversion
+
+
+class Psl(Interval):
+ def __init__(self, input_psl_path, data_psl):
+ super(Psl, self).__init__()
+ self.inputFile = input_psl_path
+ self.trackSettings = data_psl
+ self.dataType = "psl"
+ self.trackType = "bigPsl"
+ self.autoSql = os.path.join(self.tool_directory, 'bigPsl.as')
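+ # bigPsl is encoded as BED12 plus 12 extra PSL-derived fields, described by bigPsl.as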
+
+ def initSettings(self):
+ super(Psl, self).initSettings()
+ self.trackName = "".join( ( self.trackName, ".bb") )
+ self.trackDataURL = os.path.join(self.myTrackFolderPath, self.trackName)
+ if "track_color" in self.trackSettings:
+ self.extraSettings["color"] = self.trackSettings["track_color"]
+ if "group_name" in self.trackSettings:
+ self.extraSettings["group"] = self.trackSettings["group_name"]
+ self.extraSettings["visibility"] = "dense"
+ self.extraSettings["priority"] = self.trackSettings["order_index"]
+
+ def validateData(self):
+ self.validator = PslValidation(self.inputFile, self.dataType, self.chromSizesFile)
+ self.validator.validate()
+
+ def createTrack(self):
+ self.convertType = self.getConvertType()
+ self.options = self.getConvertOptions("bed12+12", tab="True", autoSql=self.autoSql, extraIndex="name")
+ self.converter = DataConversion(self.inputFile, self.trackDataURL, self.chromSizesFile.name, self.convertType, self.options)
+ self.converter.convertFormats()
+
+ def getConvertType(self):
+ return (self.dataType.lower(), self.trackType.lower())
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/Psl.pyc
Binary file datatypes/interval/Psl.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/__init__.py
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/interval/__init__.pyc
Binary file datatypes/interval/__init__.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/sequence/Fasta.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/sequence/Fasta.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,16 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
+"""
+Class describing the Fasta format
+(As of the 07/20/2016, only used with the reference genome)
+"""
+
+class Fasta(object):
+ def __init__(self, false_path, name, assembly_id):
+ self.false_path = false_path
+ self.name = name
+
+ if not assembly_id:
+ assembly_id = "unknown"
+ self.assembly_id = assembly_id
\ No newline at end of file
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/sequence/Fasta.pyc
Binary file datatypes/sequence/Fasta.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/sequence/__init__.py
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/sequence/__init__.pyc
Binary file datatypes/sequence/__init__.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/validators/DataValidation.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/validators/DataValidation.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,43 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
+"""
+This class handles the subprocess calls of the different tools used
+in HubArchiveCreator
+"""
+
+import logging
+import os
+import subprocess
+import sys
+import string
+import tempfile
+import re
+
+from util import subtools
+
+
+class DataValidation(object):
+ BED_TYPE = re.compile(r'bed([1-9][0-9]?)\+?([1-9][0-9]?)?$')
+ BIGBED_TYPE = re.compile(r'bigBed([1-9][0-9]?)\+?([1-9][0-9]?)?$')
+ FILE_TYPE = ["fasta", "fastq", "bam", "bigwig", "bed", "bigbed", "bedgraph"]
+
+ def __init__(self, inputFile, fileType, chromSizesFile, options=None):
+ self.inputFile = inputFile
+ self.fileType = fileType
+ self.chromSizesFile = chromSizesFile
+ self.options = options
+
+ def validate(self):
+ """validate input file format"""
+ if self._checkDatatype():
+ subtools.validateFiles(self.inputFile, self.chromSizesFile, self.fileType, self.options)
+ else:
+ raise TypeError("validateFiles cannot validate format {0}. Only the following formats can be validated by this tool: \n{1}\n".format(self.fileType, self.FILE_TYPE))
+
+ def _checkDatatype(self):
+ if re.match(self.BED_TYPE, self.fileType) or re.match(self.BIGBED_TYPE, self.fileType):
+ return True
+ elif self.fileType.lower() in self.FILE_TYPE:
+ return True
+ return False
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/validators/DataValidation.pyc
Binary file datatypes/validators/DataValidation.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/validators/Gff3Validation.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/validators/Gff3Validation.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,48 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
+"""
+This class handles the subprocess calls of the different tools used
+in HubArchiveCreator
+"""
+
+import logging
+import os
+import subprocess
+import sys
+import string
+import tempfile
+import re
+
+from DataValidation import DataValidation
+
+
+
+class Gff3Validation(DataValidation):
+
+ def __init__(self, inputFile, fileType, chromSizesFile, options=None):
+ super(Gff3Validation, self).__init__(inputFile, fileType, chromSizesFile, options)
+
+ def validate(self):
+ """validate input file format"""
+ if self._removeExtraHeader() > 1:
+ print("- Warning: Gff3 created with a modified version of your Gff3 by removing extra headers '##gff-version 3'.")
+ return self.inputFile
+
+ def _removeExtraHeader(self):
+ """
+ Remove duplicated '##gff-version 3' meta lines, keeping only the first;
+ return the number of such lines seen so the caller can warn
+ """
+ valid_gff3_file = tempfile.NamedTemporaryFile(bufsize=0, suffix=".gff3", delete=False)
+ num = 0
+ with open(self.inputFile, 'r') as f, open(valid_gff3_file.name, 'w') as valid:
+ for line in f:
+ if '##gff-version 3' in line:
+ num += 1
+ # skip every occurrence after the first
+ if num > 1:
+ continue
+ valid.write(line)
+ self.inputFile = valid_gff3_file.name
+ return num
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/validators/Gff3Validation.pyc
Binary file datatypes/validators/Gff3Validation.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/validators/GtfValidation.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/validators/GtfValidation.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,108 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
+"""
+This class handles the subprocess calls of the different tools used
+in HubArchiveCreator
+"""
+
+import logging
+import os
+import subprocess
+import sys
+import string
+import tempfile
+import re
+
+from DataValidation import DataValidation
+
+
+class GtfValidation(DataValidation):
+
+ def __init__(self, inputFile, fileType, chromSizesFile, options=None):
+ super(GtfValidation, self).__init__(inputFile, fileType, chromSizesFile, options)
+
+ def validate(self):
+ """validate input file format"""
+ self._checkAndFixGtf()
+ if self.is_modified:
+ print("- Warning: Gtf created with a modified version of your Gtf because of start/end coordinates issues.")
+ print("Here are the lines removed: " + self._get_str_modified_lines())
+ return self.inputFile
+
+
+
+ def _checkAndFixGtf(self):
+ """
+ Check the integrity of the gtf file:
+ if coordinates exceed the chromosome size, either remove the whole line(s)
+ or truncate them to the end of the scaffold, depending on the user choice
+ default: remove the whole line(s)
+ """
+ # Set the boolean telling if we had to modify the file
+ self.is_modified = False
+ self.array_modified_lines = []
+ # Create a temp gtf just in case we have issues
+ temp_gtf = tempfile.NamedTemporaryFile(bufsize=0, suffix=".gtf", delete=False)
+
+ # TODO: Get the user choice and use it
+ # TODO: Check if the start > 0 and the end <= chromosome size
+ # Get the chrom.sizes into a dictionary to have a faster access
+ # TODO: Think about doing this in Datatype.py, so everywhere we have access to this read-only dictionary
+ dict_chrom_sizes = {}
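+ # chrom.sizes has two tab-separated columns per line: <scaffold name> <size>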
+ with open(self.chromSizesFile, 'r') as chromSizes:
+ lines = chromSizes.readlines()
+ for line in lines:
+ fields = line.split()
+ # fields[0] should be the name of the scaffold
+ # fields[1] should be the size of the scaffold
+ # TODO: Ensure this is true for all lines
+ # store sizes as integers so the coordinate comparisons below are numeric
+ dict_chrom_sizes[fields[0]] = int(fields[1])
+
+ # Parse the GTF and check each line using the chrom sizes dictionary
+ with open(temp_gtf.name, 'a+') as tmp:
+ with open(self.inputFile, 'r') as gtf:
+ lines = gtf.readlines()
+ for index, line in enumerate(lines):
+ # If this is not a comment, we check the fields
+ if not line.startswith('#'):
+ fields = line.split()
+ # We are interested in fields[0] => Seqname (scaffold)
+ # We are interested in fields[3] => Start of the scaffold
+ # We are interested in fields[4] => End of the scaffold
+ scaffold_size = dict_chrom_sizes[fields[0]]
+ # cast to int: comparing the raw strings would be lexicographic, not numeric
+ start_position = int(fields[3])
+ end_position = int(fields[4])
+
+ if start_position > 0 and end_position <= scaffold_size:
+ # We are good, so we copy this line
+ # (the line already ends with its own newline from readlines())
+ tmp.write(line)
+
+
+ # The sequence is not good, we are going to process it regarding the user choice
+ # TODO: Process the user choice
+ # By default, we are assuming the user choice is to remove the lines: We don't copy it
+
+ # If we are here, it means the gtf has been modified
+ else:
+ # We save the line for the feedback to the user
+ self.array_modified_lines.append(index + 1)
+
+ self.is_modified = True
+ else:
+ tmp.write(line)
+
+ # Once the process it completed, we just replace the path of the gtf
+ self.inputFile = temp_gtf.name
+
+ # TODO: Manage the issue with the fact the dataset is going to still exist on the disk because of delete=False
+ #return modified_gtf
+
+ def _get_str_modified_lines(self):
+ return ','.join(map(str, self.array_modified_lines))
+
\ No newline at end of file
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/validators/GtfValidation.pyc
Binary file datatypes/validators/GtfValidation.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/validators/PslValidation.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes/validators/PslValidation.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,31 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
+"""
+This class handles the subprocess calls of the different tools used
+in HubArchiveCreator
+"""
+
+import logging
+import os
+import subprocess
+import sys
+import string
+import tempfile
+import re
+
+from util import subtools
+from datatypes.validators.DataValidation import DataValidation
+
+
+class PslValidation(DataValidation):
+
+ def __init__(self, inputFile, fileType, chromSizesFile, options=None):
+ super(PslValidation, self).__init__(inputFile, fileType, chromSizesFile, options)
+
+ def validate(self):
+ """validate input file format"""
+ self.pslCheck()
+
+ def pslCheck(self):
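+ # subtools.pslCheck is assumed to wrap the UCSC pslCheck command-line validator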
+ subtools.pslCheck(self.inputFile)
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/validators/PslValidation.pyc
Binary file datatypes/validators/PslValidation.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/validators/__init__.py
diff -r bb6fdccef474 -r 31a41ce128cc datatypes/validators/__init__.pyc
Binary file datatypes/validators/__init__.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc jbrowseArchiveCreator.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/jbrowseArchiveCreator.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+# -*- coding: utf8 -*-
+
+"""
+This Galaxy tool prepares your files for JBrowse visualization.
+"""
+
+import sys
+import argparse
+import json
+import logging
+import collections
+
+
+# Internal dependencies
+from util.Reader import Reader
+from util.Logger import Logger
+from TrackHub import TrackHub
+
+
+def main(argv):
+ parser = argparse.ArgumentParser(description='Create a hub to display in jbrowse.')
+ parser.add_argument('-j', '--data_json', help='JSON file containing the metadata of the inputs')
+ parser.add_argument('-o', '--output', help='Name of the HTML summarizing the content of the JBrowse Hub Archive')
+
+ # Get the args passed in parameter
+ args = parser.parse_args()
+ json_inputs_data = args.data_json
+ outputFile = args.output
+
+ # Parse the JSON file with Reader
+ reader = Reader(json_inputs_data)
+
+ # Begin init variables
+ extra_files_path = reader.getExtFilesPath()
+ toolDirectory = reader.getToolDir()
+ #outputFile = reader.getOutputDir()
+ user_email = reader.getUserEmail()
+ reference_genome = reader.getRefGenome()
+ debug_mode = reader.getDebugMode()
+ track_type = reader.getTrackType()
+ #jbrowse_path = reader.getJBrowsePath()
+ apollo_host = reader.getApolloHost()
+ apollo_user = reader.getApolloUser()
+
+ #### Logging management ####
+ # If we are in Debug mode, also print in stdout the debug dump
+ log = Logger(tool_directory=toolDirectory, debug=debug_mode, extra_files_path=extra_files_path)
+ log.setup_logging()
+ logging.info('#### JBrowseArchiveCreator: Start ####\n')
+ logging.debug('---- Welcome in JBrowseArchiveCreator Debug Mode ----\n')
+ logging.debug('JSON parameters: %s\n\n', json.dumps(reader.args))
+ #### END Logging management ####
+
+ # Create the Track Hub folder
+ logging.info('#### JBrowseArchiveCreator: Creating the Track Hub folder ####\n')
+ trackHub = TrackHub(reference_genome, apollo_user, outputFile, extra_files_path, toolDirectory, track_type, apollo_host)
+
+ # Create Ordered Dictionary to add the tracks in the tool form order
+ logging.info('#### JBrowseArchiveCreator: Preparing track data ####\n')
+ all_datatype_dictionary = reader.getTracksData()
+ all_datatype_ordered_dictionary = collections.OrderedDict(all_datatype_dictionary)
+
+ logging.debug("----- End of all_datatype_dictionary processing -----")
+ #logging.debug("all_datatype_ordered_dictionary are: %s", json.dumps(all_datatype_ordered_dictionary))
+
+ logging.info('#### JBrowseArchiveCreator: Adding tracks to Track Hub ####\n')
+ logging.debug("----- Beginning of Track adding processing -----")
+
+ for index, datatypeObject in all_datatype_ordered_dictionary.iteritems():
+ trackHub.addTrack(datatypeObject.track.track_db)
+
+ logging.debug("----- End of Track adding processing -----")
+
+ # We terminate the process and so create a HTML file summarizing all the files
+ logging.info('#### JBrowseArchiveCreator: Creating the HTML file ####\n')
+ trackHub.terminate(debug_mode)
+
+ logging.debug('---- End of JBrowseArchiveCreator Debug Mode: Bye! ----\n')
+ logging.info('#### JBrowseArchiveCreator: Congratulations! The JBrowse Hub has been created! ####\n')
+
+ sys.exit(0)
+
+if __name__ == "__main__":
+ main(sys.argv)
\ No newline at end of file
diff -r bb6fdccef474 -r 31a41ce128cc jbrowseArchiveCreator.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/jbrowseArchiveCreator.xml Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,446 @@
+
+
+ This Galaxy tool prepares your files for display in JBrowse with the Apollo plugin
+
+
+
+ samtools
+ numpy
+ biopython
+ ucsc_tools_340
+ jbrowse_tools
+
+import json
+import sys
+
+file_path = sys.argv[1]
+#set global data_parameter_dict = {}
+
+## Record the genome name and Apollo host entered by the user
+#silent $data_parameter_dict.update({"genome_name": str($genome_name)})
+#silent $data_parameter_dict.update({"apollo_host": str($apollo_host)})
+#if $apollo_users_settings.apollo_users_selector == "yes"
+ #set apollo_user = {"firstname": str($apollo_users_settings.firstname), "lastname": str($apollo_users_settings.lastname), "password": str($apollo_users_settings.password), "user_email": str($apollo_users_settings.user_email)}
+ #silent $data_parameter_dict.update({"apollo_user": $apollo_user})
+#end if
+
+## Function to retrieve the data of the inputs
+#def prepare_json($datatype, $input_to_prepare, $order_index, $extra_data_dict={})
+ #set false_path = str($input_to_prepare)
+ #set $data_dict = {"false_path": $false_path}
+
+ #set name = str($input_to_prepare.name)
+ #silent $data_dict.update({"name": $name})
+ #silent $data_dict.update($extra_data_dict)
+ ## Add the ordering by taking the tool form indexes
+ #silent $data_dict.update({"order_index": $order_index})
+
+ #if $datatype in $data_parameter_dict
+ #silent $data_parameter_dict[$datatype].append($data_dict)
+ #else
+ #set array_inputs = []
+ #silent $array_inputs.append($data_dict)
+ #silent $data_parameter_dict.update({$datatype: $array_inputs})
+ #end if
+#end def
+
+## Get the number of digits from tracks, to have a unique integer from group index and track index
+
+#set temp_max_digit = 0
+
+#for $g in $group
+ #if len($g.format) > $temp_max_digit
+ #silent temp_max_digit = len($g.format)
+ #end if
+#end for
+
+#set nb_digits_max_track = len(str($temp_max_digit))
+
+## END Get the number of digits
+
+#for $i_g, $g in enumerate( $group )
+ #for $i, $f in enumerate( $g.format )
+ ## Create the order index using index_group+1 concatenated with index_track
+ #set index_group_final = str($i_g + 1)
+ #set index_track_final = str($index_group_final) + str($i).zfill($nb_digits_max_track)
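+ ## e.g. group 2 (i_g = 1), track 0, with at most 99 tracks per group -> order_index "200"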
+
+ ## For each format, we have a few mandatory fields we store in a dict
+ #set track_color = str($f.formatChoice.track_color)
+ #set group_name = str($g.group_name)
+ #set longLabel = str($f.formatChoice.longLabel)
+ #set extra_data_dict = {"track_color": $track_color,
+ "group_name": $group_name,
+ "long_label": $longLabel}
+ #if $f.formatChoice.format_select == "bam"
+ #set bam_index = $f.formatChoice.BAM.metadata.bam_index
+
+ ## Add Bam format specific fields
+ #silent $extra_data_dict.update({"index": $bam_index})
+
+ #silent $prepare_json("Bam", $f.formatChoice.BAM, $index_track_final, $extra_data_dict)
+ #end if
+ #if $f.formatChoice.format_select == "bed"
+ #if $f.formatChoice.bedChoice.bed_select == "bed_generic"
+ #silent $prepare_json("Bed", $f.formatChoice.bedChoice.BED, $index_track_final,
+ $extra_data_dict)
+ #end if
+ #if $f.formatChoice.bedChoice.bed_select == "bed_simple_repeats_option"
+ #silent $prepare_json("BedSimpleRepeats", $f.formatChoice.bedChoice.BED_simple_repeats, $index_track_final,
+ $extra_data_dict)
+ #end if
+ #if $f.formatChoice.bedChoice.bed_select == "bed_splice_junctions_option"
+ #silent $prepare_json("BedSpliceJunctions", $f.formatChoice.bedChoice.BED_splice_junctions, $index_track_final,
+ $extra_data_dict)
+ #end if
+ #if $f.formatChoice.bedChoice.bed_select == "bed_blast_alignment_option"
+ ##set database = str($f.formatChoice.bedChoice.database)
+ ##silent $extra_data_dict.update({"database": $database})
+ #silent $prepare_json("BedBlastAlignments", $f.formatChoice.bedChoice.BED_blast_alignment, $index_track_final,
+ $extra_data_dict)
+ #end if
+ #if $f.formatChoice.bedChoice.bed_select == "bed_blat_alignment_option"
+ ##set database = str($f.formatChoice.bedChoice.database)
+ ##silent $extra_data_dict.update({"database": $database})
+ #silent $prepare_json("BedBlatAlignments", $f.formatChoice.bedChoice.BED_blat_alignment, $index_track_final,
+ $extra_data_dict)
+ #end if
+ #end if
+ #if $f.formatChoice.format_select == "blastxml"
+ #silent $prepare_json("BlastXml", $f.formatChoice.BlastXML, $index_track_final, $extra_data_dict)
+ #end if
+ #if $f.formatChoice.format_select == "bigwig"
+ #set pos_color = str($f.formatChoice.pos_color)
+ #set neg_color = str($f.formatChoice.neg_color)
+ #silent $extra_data_dict.update({"style" : {"pos_color" : $pos_color, "neg_color" : $neg_color}})
+ #silent $prepare_json("BigWig", $f.formatChoice.BIGWIG, $index_track_final,
+ $extra_data_dict)
+ #end if
+ #if $f.formatChoice.format_select == 'gff3'
+ #if $f.formatChoice.gff3Choice.gff3_select == 'gff3_generic'
+ #silent $prepare_json("Gff3", $f.formatChoice.GFF3, $index_track_final,
+ $extra_data_dict)
+ #elif $f.formatChoice.gff3Choice.gff3_select == 'gff3_transcript'
+ #silent $prepare_json("Gff3_transcript", $f.formatChoice.GFF3, $index_track_final,
+ $extra_data_dict)
+ #elif $f.formatChoice.gff3Choice.gff3_select == 'gff3_mrna'
+ #silent $prepare_json("Gff3_mrna", $f.formatChoice.GFF3, $index_track_final,
+ $extra_data_dict)
+ #end if
+ #end if
+ #if $f.formatChoice.format_select == "gtf"
+ ## Add also GTF from Agustus? See https://github.com/ENCODE-DCC/kentUtils/issues/8
+ #silent $prepare_json("Gtf", $f.formatChoice.GTF, $index_track_final,
+ $extra_data_dict)
+ #end if
+ #end for
+#end for
+
+## We combine the fasta file dataset name with its false path in a JSON object
+#set fasta_json = {"false_path": str($fasta_file), "name": str($fasta_file.name)}
+#silent $data_parameter_dict.update({"fasta": $fasta_json})
+
+## Retrieve the user email
+#silent $data_parameter_dict.update({"user_email": str($__user_email__)})
+
+#silent $data_parameter_dict.update({"tool_directory": str($__tool_directory__)})
+
+#silent $data_parameter_dict.update({"extra_files_path": str($output.extra_files_path)})
+
+#silent $data_parameter_dict.update({"debug_mode": str($advanced_options.debug_mode)})
+
+with open(file_path, 'w') as f:
+ json.dump($data_parameter_dict, f)
+
+ <!-- tool form inputs/outputs XML not recoverable -->
+ Use this option if you are a G-OnRamp developer
+
+ This Galaxy tool will create a JBrowse hub that includes the binary datasets and JSON datasets needed for
+ JBrowse visualization.
+
+
+
+
\ No newline at end of file
diff -r bb6fdccef474 -r 31a41ce128cc jbrowse_hub.py
--- a/jbrowse_hub.py Wed Jul 12 12:55:27 2017 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,176 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-import argparse
-import json
-import utils
-import trackObject
-import TrackHub
-
-
-
-def main(argv):
- parser = argparse.ArgumentParser(description='Create a hub to display in jbrowse.')
-
- # Reference genome mandatory
- parser.add_argument('-f', '--fasta', help='Fasta file of the reference genome (Required)')
-
- # Genome name
- parser.add_argument('-g', '--genome_name', help='Name of reference genome')
-
- # Output folder
- parser.add_argument('-o', '--out', help='output html')
-
- # Output folder
- parser.add_argument('-e', '--extra_files_path', help='Directory of JBrowse Hub folder')
-
- #Tool Directory
- parser.add_argument('-d', '--tool_directory', help='The directory of JBrowse file convertion scripts and UCSC tools')
-
- #GFF3
- parser.add_argument('--gff3', action='append', help='GFF3 format')
-
- # GFF3 structure: gene->transcription->CDS
- parser.add_argument('--gff3_transcript', action='append', help='GFF3 format for gene prediction, structure: gene->transcription->CDS')
-
- # GFF3 structure: gene->mRNA->CDS
- parser.add_argument('--gff3_mrna', action='append', help='GFF3 format for gene prediction, structure: gene->mRNA->CDS')
-
- # generic BED
- parser.add_argument('--bed', action='append', help='BED format')
-
- # trfBig simple repeats (BED 4+12)
- parser.add_argument('--bedSimpleRepeats', action='append', help='BED 4+12 format, using simpleRepeats.as')
-
- # regtools (BED 12+1)
- parser.add_argument('--bedSpliceJunctions', action='append', help='BED 12+1 format, using spliceJunctions.as')
-
- # tblastn alignment (blastxml)
- parser.add_argument('--blastxml', action='append', help='blastxml format from tblastn')
-
- # blat alignment (bigpsl 12+12)
- parser.add_argument('--bigpsl', action='append', help='bigpsl format from blat alignment')
-
- # BAM format
- parser.add_argument('--bam', action='append', help='BAM format from HISAT')
-
- # BIGWIG format
- parser.add_argument('--bigwig', action='append', help='BIGWIG format to show rnaseq coverage')
-
- # GTF format
- parser.add_argument('--gtf', action='append', help='GTF format from StringTie')
-
- # Metadata json format
- parser.add_argument('-j', '--data_json', help='Json containing the metadata of the inputs')
-
- #JBrowse host
- parser.add_argument('--jbrowse_host', help="JBrowse Host")
-
- args = parser.parse_args()
- all_datatype_dictionary = dict()
-
-
- if not args.fasta:
- parser.print_help()
- raise RuntimeError("No reference genome\n")
- reference = args.fasta
- genome = 'unknown'
- out_path = 'unknown.html'
- extra_files_path = '.'
- tool_directory = '.'
- jbrowse_host = ''
- if args.jbrowse_host:
- jbrowse_host = args.jbrowse_host
- if args.genome_name:
- genome = args.genome_name
- if args.out:
- out_path = args.out
- if args.extra_files_path:
- extra_files_path = args.extra_files_path
-
- #tool_directory not work for Galaxy tool, all tools need to exist in the current PATH, deal with it with tool dependencies
- if args.tool_directory:
- tool_directory = args.tool_directory
-
- #Calculate chromsome sizes using genome reference and uscs tools
- chrom_size = utils.getChromSizes(reference, tool_directory)
-
- #get metadata from json file
- json_inputs_data = args.data_json
- if json_inputs_data:
- inputs_data = json.loads(json_inputs_data)
- else:
- inputs_data = {}
-
- #print inputs_data
-
- #Initate trackObject
- all_tracks = trackObject.trackObject(chrom_size.name, genome, extra_files_path)
-
- array_inputs_bam = args.bam
- array_inputs_bed = args.bed
- array_inputs_bed_simple_repeats = args.bedSimpleRepeats
- array_inputs_bed_splice_junctions = args.bedSpliceJunctions
- array_inputs_bigwig = args.bigwig
- array_inputs_gff3 = args.gff3
- array_inputs_gff3_transcript = args.gff3_transcript
- array_inputs_gff3_mrna = args.gff3_mrna
- array_inputs_gtf = args.gtf
- array_inputs_blastxml = args.blastxml
- array_inputs_bigpsl = args.bigpsl
-
- if array_inputs_bam:
- all_datatype_dictionary['bam'] = array_inputs_bam
- if array_inputs_bed:
- all_datatype_dictionary['bed'] = array_inputs_bed
- if array_inputs_bed_simple_repeats:
- all_datatype_dictionary['bedSimpleRepeats'] = array_inputs_bed_simple_repeats
- if array_inputs_bed_splice_junctions:
- all_datatype_dictionary['bedSpliceJunctions'] = array_inputs_bed_splice_junctions
- if array_inputs_bigwig:
- all_datatype_dictionary['bigwig'] = array_inputs_bigwig
- if array_inputs_gff3:
- all_datatype_dictionary['gff3'] = array_inputs_gff3
- if array_inputs_gff3_transcript:
- all_datatype_dictionary['gff3_transcript'] = array_inputs_gff3_transcript
- if array_inputs_gff3_mrna:
- all_datatype_dictionary['gff3_mrna'] = array_inputs_gff3_mrna
- if array_inputs_gtf:
- all_datatype_dictionary['gtf'] = array_inputs_gtf
- if array_inputs_blastxml:
- all_datatype_dictionary['blastxml'] = array_inputs_blastxml
- if array_inputs_bigpsl:
- all_datatype_dictionary['bigpsl'] = array_inputs_bigpsl
- print "input tracks: \n", all_datatype_dictionary
-
- for datatype, inputfiles in all_datatype_dictionary.items():
- try:
- if not inputfiles:
- raise ValueError('empty input, must provide track files!\n')
- except IOError:
- print 'Cannot open', datatype
- else:
- for f in inputfiles:
- #metadata = {}
- #print f
- #if f in inputs_data.keys():
- # metadata = inputs_data[f]
- #print metadata
- #Convert tracks into gff3 format
- all_tracks.addToRaw(f, datatype)
-
- jbrowseHub = TrackHub.TrackHub(all_tracks, reference, out_path, tool_directory, genome, extra_files_path, inputs_data, jbrowse_host)
- jbrowseHub.createHub()
-
-"""
-def extractMetadata(array_inputs, inputs_data):
- metadata_dict = {}
- for input_false_path in array_inputs:
- for key, data_value in inputs_data.items():
- if key == input_false_path:
- metadata_dict[input_false_path]
-"""
-
-if __name__ == "__main__":
- main(sys.argv)
-
diff -r bb6fdccef474 -r 31a41ce128cc jbrowse_hub.xml
--- a/jbrowse_hub.xml Wed Jul 12 12:55:27 2017 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,306 +0,0 @@
-
-
- This Galaxy tool is used to prepare your files to be ready for displaying on JBrowse
-
-
-
- samtools
- numpy
- biopython
- ucsc_tools_340
- jbrowse_tools
-
-
-
- <!-- removed tool form XML not recoverable -->
- This Galaxy tool will create a tar file which including raw datasets and json datasets that can be used for
- JBrowse visualization.
-
-
-
-
\ No newline at end of file
diff -r bb6fdccef474 -r 31a41ce128cc logging.json
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/logging.json Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,48 @@
+{
+ "version": 1,
+ "disable_existing_loggers": false,
+ "formatters": {
+ "simple": {
+ "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+ }
+ },
+
+ "handlers": {
+ "console": {
+ "class": "logging.StreamHandler",
+ "level": "INFO",
+ "formatter": "simple",
+ "stream": "ext://sys.stdout"
+ },
+
+ "console_stderr": {
+ "class": "logging.StreamHandler",
+ "level": "ERROR",
+ "formatter": "simple",
+ "stream": "ext://sys.stderr"
+ },
+
+ "debug_file_handler": {
+ "class": "logging.handlers.RotatingFileHandler",
+ "level": "DEBUG",
+ "formatter": "simple",
+ "filename": "__main__.log",
+ "maxBytes": 10485760,
+ "backupCount": 20,
+ "encoding": "utf8"
+ }
+ },
+
+ "loggers": {
+ "Reader": {
+ "level": "INFO",
+ "handlers": ["console"],
+ "propagate": "yes"
+ }
+ },
+
+ "root": {
+ "level": "DEBUG",
+ "handlers": ["console", "console_stderr", "debug_file_handler"]
+ }
+}
\ No newline at end of file
diff -r bb6fdccef474 -r 31a41ce128cc spliceJunctions.as
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/spliceJunctions.as Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,17 @@
+table spliceJunctions
+"Predicted splice junctions"
+ (
+ string chrom; "Reference sequence chromosome or scaffold"
+ uint chromStart; "Start position in chromosome"
+ uint chromEnd; "End position in chromosome"
+ string name; "Name of item"
+ uint score; "Score from 0-1000"
+ char[1] strand; "+ or -"
+ uint thickStart; "Start of where display should be thick (start codon)"
+ uint thickEnd; "End of where display should be thick (stop codon)"
+ uint reserved; "Used as itemRgb as of 2004-11-22"
+ int blockCount; "Number of blocks"
+ int[blockCount] blockSizes; "Comma separated list of block sizes"
+ int[blockCount] chromStarts; "Start positions relative to chromStart"
+ uint junctionScore; "Number of reads supporting the splice junction"
+ )
diff -r bb6fdccef474 -r 31a41ce128cc templates/__init__.py
diff -r bb6fdccef474 -r 31a41ce128cc templates/custom_track_styles.css
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/templates/custom_track_styles.css Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,9 @@
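+/* ${label} and ${color} are per-track placeholders filled in by the track style generator (e.g. via Python's string.Template) */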
+.${label},
+.plus-${label},
+.minus-${label}
+{
+ background-color: ${color};
+ height: 90%;
+ top: 5%;
+}
+
diff -r bb6fdccef474 -r 31a41ce128cc trackObject.py
--- a/trackObject.py Wed Jul 12 12:55:27 2017 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,71 +0,0 @@
-#!/usr/bin/env python
-
-import os
-import shutil
-import utils
-import bedToGff3
-import blastxmlToGff3
-
-
-class trackObject:
- def __init__(self, chrom_size, genome, extra_files_path):
- self.chrom_size = chrom_size
- outputDirect = os.path.join(extra_files_path, 'myHub')
- self.raw_folder = os.path.join(outputDirect, 'raw')
- #Store metadata of the tracks
- self.tracks = []
- try:
- if os.path.exists(self.raw_folder):
- if os.path.isdir(self.raw_folder):
- shutil.rmtree(self.raw_folder)
- else:
- os.remove(self.raw_folder)
- os.makedirs(self.raw_folder)
- except OSError as oserror:
- print "Cannot create raw folder error({0}): {1}".format(oserror.errno, oserror.strerror)
-
- def addToRaw(self, dataFile, dataType):
- """
- Convert gff3, BED, blastxml and gtf files into gff3 files
- and store converted files in folder 'raw'
- """
- false_path = os.path.abspath(dataFile)
- fileName = os.path.basename(dataFile)
- des_path = os.path.join(self.raw_folder, fileName)
- track = {}
- if dataType == 'bed' or dataType == 'gff3' or dataType == 'gff3_mrna' or dataType == 'gff3_transcript' or dataType == 'fasta' or dataType == 'bam' or dataType == 'bigwig':
- if dataType == 'bam':
- # JBrowse will raise error: not a BAM file if the filename hasn't .bam extension
- extension = os.path.splitext(fileName)[1]
- if extension != '.bam':
- fileName = fileName + '.bam'
- des_path = os.path.join(self.raw_folder, fileName)
- bam_index = utils.createBamIndex(dataFile)
- indexname = os.path.basename(bam_index)
- des_path_for_index = os.path.join(self.raw_folder, indexname)
- shutil.copyfile(bam_index, des_path_for_index)
- track['index'] = indexname
-
- try:
- shutil.copyfile(dataFile, des_path)
- except shutil.Error as err1:
- print "Cannot move file, error({0}: {1})".format(err1.errno, err1.strerror)
- except IOError as err2:
- print "Cannot move file, error({0}: {1})".format(err2.errno, err2.strerror)
- elif dataType == 'bedSimpleRepeats':
- bedToGff3.bedToGff3(dataFile, self.chrom_size, 'trfbig', des_path)
- elif dataType == 'bedSpliceJunctions':
- bedToGff3.bedToGff3(dataFile, self.chrom_size, 'regtools', des_path)
- elif dataType == 'bigpsl':
- bedToGff3.bedToGff3(dataFile, self.chrom_size, 'blat', des_path)
- elif dataType == 'blastxml':
- blastxmlToGff3.blastxml2gff3(dataFile, des_path)
- elif dataType == 'gtf':
- utils.gtfToGff3(dataFile, des_path, self.chrom_size)
- track['fileName'] = fileName
- track['dataType'] = dataType
- track['false_path'] = false_path
- #self.SetMetadata(track, metaData)
- self.tracks.append(track)
-
-
\ No newline at end of file
diff -r bb6fdccef474 -r 31a41ce128cc trackObject.pyc
Binary file trackObject.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc tracks/BamFeatures.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tracks/BamFeatures.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,29 @@
+#!/usr/bin/env python
+import os
+import json
+import logging
+
+from TrackDb import TrackDb
+from util import subtools
+from util import santitizer
+
+
+class BamFeatures(TrackDb):
+ def __init__(self, trackName, trackLabel, trackDataURL, trackType, dataType, extraSettings=None):
+ super(BamFeatures, self).__init__(trackName, trackLabel, trackDataURL, trackType, dataType, extraSettings)
+
+ def prepareExtraSetting(self):
+ if 'category' not in self.extraSettings or not self.extraSettings['category']:
+ self.extraSettings['category'] = "Default group"
+ bam_track = dict()
+ bam_track['type'] = 'JBrowse/View/Track/Alignments2'
+ bam_track['storeClass'] = 'JBrowse/Store/SeqFeature/BAM'
+ bam_track['urlTemplate'] = os.path.join('bbi', self.trackName)
+ bam_track['baiUrlTemplate'] = os.path.join('bbi', self.extraSettings['index'])
+ bam_track['label'] = self.trackLabel
+ bam_track['category'] = self.extraSettings['category']
+        extraConfigs = bam_track
+ return extraConfigs
+
+
\ No newline at end of file
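A minimal usage sketch for BamFeatures, with hypothetical file names; createTrackDb (defined in tracks/TrackDb.py below) calls prepareExtraSetting and stores the returned dict under the track's options:

from tracks.BamFeatures import BamFeatures

# Hypothetical values; 'index' names the .bai file copied next to the .bam in 'bbi'
track = BamFeatures('mydata.bam', 'My alignments', 'bbi/mydata.bam',
                    'bam', 'bam',
                    extraSettings={'index': 'mydata.bam.bai', 'category': 'RNA-Seq'})
track.createTrackDb()
# track.track_db['options'] now holds the Alignments2/BAM store configuration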
diff -r bb6fdccef474 -r 31a41ce128cc tracks/BamFeatures.pyc
Binary file tracks/BamFeatures.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc tracks/BigwigFeatures.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tracks/BigwigFeatures.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+import os
+import json
+import logging
+
+from TrackDb import TrackDb
+from util import subtools
+from util import santitizer
+
+
+class BigwigFeatures(TrackDb):
+ def __init__(self, trackName, trackLabel, trackDataURL, trackType, dataType, extraSettings=None):
+ super(BigwigFeatures, self).__init__(trackName, trackLabel, trackDataURL, trackType, dataType, extraSettings)
+
+ def prepareExtraSetting(self):
+ if 'category' not in self.extraSettings or not self.extraSettings['category']:
+ self.extraSettings['category'] = "Default group"
+        self.extraSettings['style'] = {}
+        if 'color' not in self.extraSettings or not self.extraSettings['color']:
+            self.extraSettings['style']['pos_color'] = "#FFA600"
+        else:
+            self.extraSettings['style']['pos_color'] = self.extraSettings['color']
+
+        # Alternative style handling kept for reference:
+        # if 'style' not in self.extraSettings:
+        #     self.extraSettings['style'] = {}
+        # if 'pos_color' not in self.extraSettings['style'] or self.extraSettings['style']['pos_color'] == '':
+        #     self.extraSettings['style']['pos_color'] = "#FFA600"
+        # if 'neg_color' not in self.extraSettings['style'] or self.extraSettings['style']['neg_color'] == '':
+        #     self.extraSettings['style']['neg_color'] = "#005EFF"
+ bigwig_track = dict()
+ bigwig_track['urlTemplate'] = os.path.join('bbi', self.trackName)
+ bigwig_track['type'] = 'JBrowse/View/Track/Wiggle/XYPlot'
+ bigwig_track['storeClass'] = 'JBrowse/Store/SeqFeature/BigWig'
+ bigwig_track['label'] = self.trackLabel
+ bigwig_track['style'] = self.extraSettings['style']
+ bigwig_track['category'] = self.extraSettings['category']
+        extraConfigs = bigwig_track
+ return extraConfigs
+
+
\ No newline at end of file
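The color fallback above defaults the positive-value color to #FFA600 when none is supplied; a quick sketch, with hypothetical inputs:

from tracks.BigwigFeatures import BigwigFeatures

track = BigwigFeatures('coverage.bigwig', 'Coverage', 'bbi/coverage.bigwig',
                       'bigwig', 'bigwig', extraSettings={'category': 'Coverage'})
configs = track.prepareExtraSetting()
print configs['style']['pos_color']   # '#FFA600', since no color was given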
diff -r bb6fdccef474 -r 31a41ce128cc tracks/BigwigFeatures.pyc
Binary file tracks/BigwigFeatures.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc tracks/CanvasFeatures.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tracks/CanvasFeatures.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,32 @@
+#!/usr/bin/env python
+import json
+import logging
+
+from TrackDb import TrackDb
+from util import subtools
+
+
+class CanvasFeatures(TrackDb):
+ def __init__(self, trackName, trackLabel, trackDataURL, trackType, dataType, extraSettings=None):
+ super(CanvasFeatures, self).__init__(trackName, trackLabel, trackDataURL, trackType, dataType, extraSettings)
+
+ def prepareExtraSetting(self):
+ """ set CanvasFeatures configuration options """
+ extraConfigs = dict()
+ self.extraSettings["clientConfig"] = dict()
+ self.extraSettings["config"] = dict()
+ if 'color' not in self.extraSettings or not self.extraSettings['color']:
+ self.extraSettings["clientConfig"]['color'] = "#daa520"
+ else:
+ self.extraSettings["clientConfig"]['color'] = self.extraSettings['color']
+ if 'category' not in self.extraSettings or not self.extraSettings['category']:
+ self.extraSettings["config"]['category'] = "Default group"
+ else:
+ self.extraSettings["config"]['category'] = self.extraSettings['category']
+ if 'glyph' in self.extraSettings:
+ self.extraSettings["config"]['glyph'] = self.extraSettings['glyph']
+ if 'transcriptType' in self.extraSettings:
+ self.extraSettings['config']['transcriptType'] = self.extraSettings['transcriptType']
+ extraConfigs["config"] = json.dumps(self.extraSettings["config"])
+ extraConfigs["clientConfig"] = json.dumps(self.extraSettings["clientConfig"])
+ return extraConfigs
\ No newline at end of file
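Unlike the binary tracks, CanvasFeatures serializes its settings with json.dumps because the values are passed verbatim to flatfile-to-json.pl as --config/--clientConfig arguments (see flatfile_to_json in util/subtools.py below). A sketch with hypothetical values:

from tracks.CanvasFeatures import CanvasFeatures

track = CanvasFeatures('genes.gff3', 'Genes', 'raw/genes.gff3',
                       'CanvasFeatures', 'gff3',
                       extraSettings={'color': '#005EFF', 'category': 'Annotation'})
configs = track.prepareExtraSetting()
# configs['clientConfig'] is the JSON string '{"color": "#005EFF"}' and
# configs['config'] is '{"category": "Annotation"}', both ready for the command line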
diff -r bb6fdccef474 -r 31a41ce128cc tracks/CanvasFeatures.pyc
Binary file tracks/CanvasFeatures.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc tracks/HTMLFeatures.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tracks/HTMLFeatures.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,39 @@
+#!/usr/bin/env python
+import json
+import logging
+
+from TrackDb import TrackDb
+from util import subtools
+from util import santitizer
+
+
+class HTMLFeatures(TrackDb):
+ def __init__(self, trackName, trackLabel, trackDataURL, trackType, dataType, extraSettings=None):
+ super(HTMLFeatures, self).__init__(trackName, trackLabel, trackDataURL, trackType, dataType, extraSettings)
+
+ def prepareExtraSetting(self):
+ """ set HTMLFeatures configuration options """
+ extraConfigs = dict()
+ self.extraSettings["clientConfig"] = dict()
+ self.extraSettings["config"] = dict()
+ if 'type' in self.extraSettings:
+ extraConfigs["type"] = self.extraSettings['type']
+ if 'color' in self.extraSettings and self.extraSettings['color']:
+ extraConfigs['feature_color'] = self.extraSettings['color']
+ else:
+            extraConfigs['feature_color'] = "#000000"
+ if 'subfeatureClasses' in self.extraSettings:
+ subfeature_css_class = santitizer.sanitize_name(self.trackLabel + "_" + self.extraSettings['subfeatureClasses'])
+ extraConfigs['subfeatureClasses'] = {self.extraSettings['subfeatureClasses']: subfeature_css_class}
+
+ if 'category' not in self.extraSettings or not self.extraSettings['category']:
+ self.extraSettings['config']['category'] = "Default group"
+ else:
+ self.extraSettings['config']['category'] = self.extraSettings['category']
+
+ extraConfigs['config'] = json.dumps(self.extraSettings["config"])
+ extraConfigs['clientConfig'] = json.dumps(self.extraSettings["clientConfig"])
+ return extraConfigs
+
+
\ No newline at end of file
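The subfeatureClasses branch ties into the CSS machinery: the sanitized class name computed here is the same label that TrackStyles.addCustomColor writes into custom_track_styles.css. A sketch with hypothetical values:

from tracks.HTMLFeatures import HTMLFeatures

track = HTMLFeatures('hits.gff3', 'BlastHits', 'raw/hits.gff3',
                     'HTMLFeatures', 'gff3',
                     extraSettings={'color': '#daa520', 'subfeatureClasses': 'match_part'})
configs = track.prepareExtraSetting()
# configs['subfeatureClasses'] maps 'match_part' to the sanitized CSS class
# 'gonramp_BlastHits_match_part', which addCustomColor can then color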
diff -r bb6fdccef474 -r 31a41ce128cc tracks/HTMLFeatures.pyc
Binary file tracks/HTMLFeatures.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc tracks/TrackDb.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tracks/TrackDb.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,53 @@
+#!/usr/bin/python
+"""
+Base class for all track types
+"""
+import os
+import abc
+from abc import ABCMeta
+import collections
+import json
+import logging
+from util import santitizer
+
+class TrackDb(object):
+ """docstring for TrackDb"""
+ __metaclass__ = ABCMeta
+
+ def __init__(self, trackName, trackLabel, trackDataURL, trackType, dataType, extraSettings=None):
+ #super(TrackDb, self).__init__()
+
+ not_init_message = "The {0} is not initialized."
+ if trackName is None:
+ raise TypeError(not_init_message.format('trackName'))
+ if trackLabel is None:
+ raise TypeError(not_init_message.format('trackLabel'))
+ if trackType is None:
+ raise TypeError(not_init_message.format('trackType'))
+ self.trackName = trackName
+ self.trackLabel = trackLabel
+ self.trackDataURL = trackDataURL
+ self.trackType = trackType
+ self.dataType = dataType
+ self.extraSettings = extraSettings
+ self.logger = logging.getLogger(__name__)
+ #self.createTrackDb()
+
+ def createTrackDb(self):
+ self.track_db = collections.OrderedDict([("track",self.trackName),
+ ("trackLabel",self.trackLabel),
+ ("trackDataURL",self.trackDataURL),
+ ("dataType", self.dataType),
+ ("trackType", self.trackType)]
+ )
+
+
+ extraConfigs = self.prepareExtraSetting()
+ self.logger.debug("Generate extraConfigs = %s", json.dumps(extraConfigs))
+ self.track_db["options"] = extraConfigs
+ #print self.track_db
+ self.logger.debug("TrackDb object is created track_db = %s ", json.dumps(self.track_db))
+
+ @abc.abstractmethod
+ def prepareExtraSetting(self):
+ """ set optional configurations for the track """
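TrackDb is abstract: each concrete track type only supplies prepareExtraSetting, and createTrackDb assembles the final ordered dict. A minimal sketch of a hypothetical subclass:

class MinimalFeatures(TrackDb):
    """Hypothetical track type illustrating the abstract hook."""
    def prepareExtraSetting(self):
        settings = self.extraSettings or {}
        return {'category': settings.get('category', 'Default group')}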
diff -r bb6fdccef474 -r 31a41ce128cc tracks/TrackDb.pyc
Binary file tracks/TrackDb.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc tracks/TrackStyles.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tracks/TrackStyles.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,58 @@
+#!/usr/bin/env python
+import os
+import json
+import logging
+from mako.lookup import TemplateLookup
+
+class TrackStyles(object):
+ def __init__(self, tool_directory, species_folder, trackListFile, cssFolderName="css", cssFileName="custom_track_styles.css"):
+ self.logger = logging.getLogger(__name__)
+ self.tool_directory = tool_directory
+ self.species_folder = species_folder
+ self.trackList = trackListFile
+ self.cssFolderName = cssFolderName
+ self.cssFileName = cssFileName
+ self.cssFilePath = self._createCssFile()
+ self.cssTemplate = self._getCssTemplate()
+ self._addCssToTrackList()
+
+
+ def addCustomColor(self, feature_class_name, feature_color):
+ with open(self.cssFilePath, 'a+') as css:
+ htmlMakoRendered = self.cssTemplate.render(
+ label = feature_class_name,
+ color = feature_color
+ )
+ css.write(htmlMakoRendered)
+ self.logger.debug("create customized track css class: cssFilePath= %s", self.cssFilePath)
+
+
+ def _createCssFile(self):
+ cssFolderPath = os.path.join(self.species_folder, self.cssFolderName)
+ cssFilePath = os.path.join(cssFolderPath, self.cssFileName)
+ if not os.path.exists(cssFilePath):
+ if not os.path.exists(cssFolderPath):
+ os.mkdir(cssFolderPath)
+ os.mknod(cssFilePath)
+ return cssFilePath
+
+ def _getCssTemplate(self):
+ mylookup = TemplateLookup(directories=[os.path.join(self.tool_directory, 'templates')],
+ output_encoding='utf-8', encoding_errors='replace')
+ cssTemplate = mylookup.get_template("custom_track_styles.css")
+ return cssTemplate
+
+
+ def _addCssToTrackList(self):
+ with open(self.trackList, 'r+') as track:
+ data = json.load(track)
+ css_path = os.path.join('data', self.cssFolderName, self.cssFileName)
+ data['css'] = {'url': css_path}
+ json_string = json.dumps(data, indent=4, separators=(',', ': '))
+ track.seek(0)
+ track.write(json_string)
+ track.truncate()
+ self.logger.debug("added customized css url to trackList.json")
+
+
+
\ No newline at end of file
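A usage sketch for TrackStyles, assuming a species folder that already contains a JBrowse trackList.json; the paths are hypothetical:

from tracks.TrackStyles import TrackStyles

styles = TrackStyles('/path/to/tool', '/path/to/myGenome',
                     '/path/to/myGenome/trackList.json')
styles.addCustomColor('gonramp_BlastHits_match_part', '#daa520')
# css/custom_track_styles.css now holds the new rule, and trackList.json
# points at it via its 'css' entry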
diff -r bb6fdccef474 -r 31a41ce128cc tracks/TrackStyles.pyc
Binary file tracks/TrackStyles.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc tracks/__init__.py
diff -r bb6fdccef474 -r 31a41ce128cc tracks/__init__.pyc
Binary file tracks/__init__.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc trf_simpleRepeat.as
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/trf_simpleRepeat.as Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,20 @@
+table simpleRepeat
+"Describes the Simple Tandem Repeats"
+ (
+ string chrom; "Reference sequence chromosome or scaffold"
+ uint chromStart; "Start position in chromosome"
+ uint chromEnd; "End position in chromosome"
+ string name; "Simple Repeats tag name"
+ uint period; "Length of repeat unit"
+ float copyNum; "Mean number of copies of repeat"
+ uint consensusSize; "Length of consensus sequence"
+ uint perMatch; "Percentage Match"
+ uint perIndel; "Percentage Indel"
+ uint score; "Alignment Score = 2*match-7*mismatch-7*indel; minscore=50"
+ uint A; "Percent of A's in repeat unit"
+ uint C; "Percent of C's in repeat unit"
+ uint G; "Percent of G's in repeat unit"
+ uint T; "Percent of T's in repeat unit"
+ float entropy; "Entropy"
+ lstring sequence; "Sequence of repeat unit element"
+ )
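This autoSql definition (bed4 plus 12 extra fields) can be handed to the validateFiles wrapper in util/subtools.py below through its autoSql option. A hedged sketch, with hypothetical file names; the bigBed type string is an assumption:

from util import subtools

subtools.validateFiles('repeats.bed', 'myGenome.chrom.sizes', 'bigBed4+12',
                       options={'tab': True, 'autoSql': 'trf_simpleRepeat.as'})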
diff -r bb6fdccef474 -r 31a41ce128cc util/Logger.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/util/Logger.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,38 @@
+import os
+import sys
+import json
+import logging
+import logging.config
+
+#from util.Filters import TraceBackFormatter
+
+class Logger(object):
+ def __init__(self, tool_directory, debug="False", extra_files_path=None):
+ self.tool_directory = tool_directory
+ self.default_level = logging.INFO
+ self.debug = debug
+ self.extra_files_path = extra_files_path
+
+ def setup_logging(self):
+        """Set up the logging configuration.
+ reference: https://fangpenlin.com/posts/2012/08/26/good-logging-practice-in-python/
+ """
+ config_path = os.path.join(self.tool_directory, 'logging.json')
+ default_level=logging.INFO
+ if self.debug.lower() == "true":
+ default_level=logging.DEBUG
+ if os.path.exists(config_path):
+ with open(config_path, 'rt') as f:
+ config = json.load(f)
+ config["handlers"]["console"]["level"] = default_level
+ if self.extra_files_path:
+ for i in config["handlers"]:
+ if "filename" in config["handlers"][i]:
+ config["handlers"][i]["filename"] = os.path.join(self.extra_files_path, config["handlers"][i]["filename"])
+ logging.config.dictConfig(config)
+ else:
+                    logging.warning("Extra files path is not set. The log files will be written to the current working directory instead of the final output folder")
+ else:
+ logging.basicConfig(level=default_level)
+            logging.warning("Cannot find the logging configuration file!")
+
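setup_logging looks for a logging.json next to the tool; a minimal sketch of a compatible configuration, shown here as the equivalent Python dict (the file handler name and filename are hypothetical):

config = {
    "version": 1,
    "handlers": {
        "console": {"class": "logging.StreamHandler", "level": "INFO"},
        "file": {"class": "logging.FileHandler", "filename": "jbrowse_hub.log"}
    },
    "root": {"level": "DEBUG", "handlers": ["console", "file"]}
}
# setup_logging overrides handlers['console']['level'] and, when
# extra_files_path is set, prefixes every handler 'filename' with it
# before calling logging.config.dictConfig(config)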
diff -r bb6fdccef474 -r 31a41ce128cc util/Logger.pyc
Binary file util/Logger.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc util/Reader.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/util/Reader.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,146 @@
+import json
+import logging
+import codecs
+
+
+# Internal dependencies
+from datatypes.binary.Bam import Bam
+from datatypes.binary.BigWig import BigWig
+from datatypes.interval.Bed import Bed
+from datatypes.interval.BedSimpleRepeats import BedSimpleRepeats
+from datatypes.interval.BedSpliceJunctions import BedSpliceJunctions
+from datatypes.interval.BlastXml import BlastXml
+from datatypes.interval.Gff3 import Gff3
+from datatypes.interval.Gff3_mrna import Gff3_mrna
+from datatypes.interval.Gff3_transcript import Gff3_transcript
+from datatypes.interval.Gtf import Gtf
+from datatypes.interval.GtfStringTie import GtfStringTie
+from datatypes.interval.BigPsl import BigPsl
+from datatypes.interval.BedBlatAlignments import BedBlatAlignments
+from datatypes.interval.BedBlastAlignments import BedBlastAlignments
+from datatypes.interval.Psl import Psl
+from datatypes.sequence.Fasta import Fasta
+from apollo.ApolloUser import ApolloUser
+from util import santitizer
+
+class Reader(object):
+
+    DATATYPE_CLASS = [Bam, BigWig, Bed, BedSimpleRepeats,
+                      BedSpliceJunctions, BigPsl, BedBlatAlignments, BedBlastAlignments,
+                      BlastXml, Gff3, Gff3_mrna, Gff3_transcript, Gtf, GtfStringTie, Psl, Fasta]
+
+ def __init__(self, input_json_file):
+ self.inputFile = input_json_file
+ self.args = self.loadJson()
+
+
+ def loadJson(self):
+ try:
+ data_file = codecs.open(self.inputFile, 'r', 'utf-8')
+ return json.load(data_file)
+ except IOError:
+ print "Cannot find JSON file\n"
+ exit(1)
+
+ def getToolDir(self):
+ try:
+ return self.args["tool_directory"]
+ except KeyError:
+ print ("tool_directory is not defined in the input file!")
+ exit(1)
+
+ def getExtFilesPath(self):
+ try:
+ return self.args["extra_files_path"]
+ except KeyError:
+ print ("extra_files_path is not defined in the input file!")
+ exit(1)
+
+ def getUserEmail(self):
+ try:
+ return self.args["user_email"]
+ except KeyError:
+ print ("user_email is not defined in the input file!")
+ exit(1)
+
+ def getDebugMode(self):
+ try:
+ return self.args["debug_mode"]
+ except KeyError:
+ print ("debug_mode is not defined in the input file!")
+ exit(1)
+
+ def getTrackType(self):
+ track_type = self.args.get("track_type")
+ return track_type
+
+ def getApolloHost(self):
+ apollo_host = self.args.get("apollo_host")
+ return apollo_host
+
+
+ def getRefGenome(self):
+ array_inputs_reference_genome = self.args["fasta"]
+ # TODO: Replace these with the object Fasta
+ input_fasta_file = array_inputs_reference_genome["false_path"]
+ input_fasta_file_name = santitizer.sanitize_name_input(array_inputs_reference_genome["name"])
+ genome_name = santitizer.sanitize_name_input(self.args["genome_name"])
+ reference_genome = Fasta(input_fasta_file,
+ input_fasta_file_name, genome_name)
+ return reference_genome
+
+ def getApolloUser(self):
+ user_info = self.args.get("apollo_user")
+ if not user_info:
+ firstname = "demo"
+ lastname = "user"
+ password = "gonramp"
+ user_email = self.getUserEmail()
+ else:
+ firstname = user_info['firstname']
+ lastname = user_info['lastname']
+ user_email = user_info['user_email']
+ password = user_info['password']
+ apollo_user = ApolloUser(user_email, firstname, lastname, password)
+ return apollo_user
+
+ def getTracksData(self):
+ self.logger = logging.getLogger(__name__)
+ all_datatype_dictionary = dict()
+ for datatype in self.DATATYPE_CLASS:
+ class_name = datatype.__name__
+ array_inputs = self.args.get(str(class_name))
+ if array_inputs:
+ self.logger.debug("Creating %s objects\n", class_name)
+ self.logger.debug("array_inputs: %s", array_inputs)
+ all_datatype_dictionary.update(self.create_ordered_datatype_objects(datatype, array_inputs))
+
+ return all_datatype_dictionary
+
+ def create_ordered_datatype_objects(self, ExtensionClass, array_inputs):
+        """
+        Create all the necessary files / folders for a given Datatype for the TrackHub,
+        and update the dictionary of datatypes.
+
+        :param ExtensionClass:
+        :param array_inputs:
+        :type ExtensionClass: Datatype
+        :type array_inputs: list[string]
+        """
+
+ datatype_dictionary = {}
+
+ # TODO: Optimize this double loop
+ for input_data in array_inputs:
+ input_false_path = input_data["false_path"]
+ input_data["name"] = santitizer.sanitize_name_input(input_data["name"])
+ extensionObject = ExtensionClass(input_false_path, input_data)
+ extensionObject.generateCustomTrack()
+ datatype_dictionary.update({input_data["order_index"]: extensionObject})
+ self.logger.debug("%s object: %s has been created", ExtensionClass, input_data["name"])
+ return datatype_dictionary
+
+
+
+
+
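Reader expects the Galaxy wrapper to serialize its parameters as a JSON file. A hedged sketch of the minimal structure implied by the getters above, with all values hypothetical:

example_args = {
    "tool_directory": "/galaxy/tools/jbrowse-archive-creator",
    "extra_files_path": "/galaxy/files/000/dataset_1_files",
    "user_email": "user@example.org",
    "debug_mode": "false",
    "genome_name": "dbia3",
    "fasta": {"false_path": "/galaxy/files/dataset_2.dat", "name": "dbia3.fa"},
    "Bam": [{"false_path": "/galaxy/files/dataset_3.dat",
             "name": "alignments.bam", "order_index": 1}]
}
# Each datatype key (here "Bam") matches a class name in DATATYPE_CLASS;
# getTracksData builds one datatype object per entry, keyed by order_index.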
diff -r bb6fdccef474 -r 31a41ce128cc util/Reader.pyc
Binary file util/Reader.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc util/__init__.py
diff -r bb6fdccef474 -r 31a41ce128cc util/__init__.pyc
Binary file util/__init__.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc util/index/DatabaseIndex.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/util/index/DatabaseIndex.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,44 @@
+#!/usr/bin/python
+
+import collections
+from ExternIndex import ExternIndex
+
+class DatabaseIndex(ExternIndex):
+ def __init__(self, database, **args):
+ self.database = database
+ self.seqType=args.get("seqType")
+ self.useIframe=args.get("useIframe")
+ self.iframeHeight=args.get("iframeHeight")
+ self.iframeWidth=args.get("iframeWidth")
+
+ def setExtLink(self):
+ return self.setDatabaseLink(self.database, self.seqType, self.useIframe, self.iframeHeight, self.iframeWidth)
+
+
+ def setDatabaseLink(self, database, seqType=None, useIframe=None, iframeHeight=None, iframeWidth=None):
+ database_settings = collections.OrderedDict()
+ if "NCBI" in database:
+ if not seqType:
+ database_settings["url"] = "https://www.ncbi.nlm.nih.gov/gquery/?term=$$"
+ elif seqType == 2:
+ database_settings["url"] = "https://www.ncbi.nlm.nih.gov/protein/$$"
+ elif seqType == 1:
+ database_settings["url"] = "https://www.ncbi.nlm.nih.gov/nuccore/$$"
+ else:
+                raise Exception("Sequence type {0} is not valid; it should be either protein (seqType==2) or nucleotide (seqType==1). Stopping the application".format(seqType))
+ elif "UniProt" in database:
+ database_settings["url"] = "http://www.uniprot.org/uniprot/$$"
+ elif "FlyBase" in database:
+ database_settings["url"] = "http://flybase.org/reports/$$"
+ else:
+ database_settings["url"] = "https://www.ncbi.nlm.nih.gov/gquery/?term=$$"
+ database_settings["urlLabel"] = database + " Details:"
+ if useIframe or useIframe is None:
+ database_settings["iframeUrl"] = database_settings["url"]
+ if not iframeHeight:
+ iframeHeight = "600"
+ if not iframeWidth:
+ iframeWidth = "800"
+ database_settings["iframeOptions"] = "height= %s width= %s" % (iframeHeight, iframeWidth)
+ return database_settings
+
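A usage sketch for DatabaseIndex; setExtLink returns the external-link settings for the chosen database, following the branches above:

from util.index.DatabaseIndex import DatabaseIndex

index = DatabaseIndex("NCBI", seqType=1, useIframe=True)
settings = index.setExtLink()
# settings["url"]           -> "https://www.ncbi.nlm.nih.gov/nuccore/$$"
# settings["urlLabel"]      -> "NCBI Details:"
# settings["iframeOptions"] -> "height= 600 width= 800" (defaults applied)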
diff -r bb6fdccef474 -r 31a41ce128cc util/index/DatabaseIndex.pyc
Binary file util/index/DatabaseIndex.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc util/index/ExternIndex.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/util/index/ExternIndex.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,16 @@
+#!/usr/bin/python
+import collections
+import abc
+from abc import ABCMeta
+
+class ExternIndex(object):
+ __metaclass__ = ABCMeta
+
+ @abc.abstractmethod
+ def __init__(self):
+ """init"""
+
+ @abc.abstractmethod
+ def setExtLink(self):
+ """set external link"""
+
\ No newline at end of file
diff -r bb6fdccef474 -r 31a41ce128cc util/index/ExternIndex.pyc
Binary file util/index/ExternIndex.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc util/index/TrixIndex.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/util/index/TrixIndex.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,55 @@
+#!/usr/bin/python
+
+import os
+import collections
+import shutil
+import logging
+from ExternIndex import ExternIndex
+
+class TrixIndex(ExternIndex):
+ def __init__(self, indexIx, indexIxx, trackName, mySpecieFolderPath, trixId, **args):
+ self.logger = logging.getLogger(__name__)
+ self.indexIx = indexIx
+ self.indexIxx = indexIxx
+ self.trackName = trackName
+ self.mySpecieFolderPath = mySpecieFolderPath
+ self.trixId = trixId.strip()
+ if not self.trixId:
+            self.logger.error("No Trix identifier was specified. To use a Trix index, you must provide one.")
+ exit(1)
+ if "default_index" in args:
+ self.default_index = args["default_index"]
+ else:
+ self.default_index = None
+ self.index_settings = collections.OrderedDict()
+
+ def setExtLink(self):
+ self.setSearchIndex()
+ self.moveIndexFile()
+ self.index_settings["searchTrix"] = "trix/%s" % self.indexIxName
+ return self.index_settings
+
+ def moveIndexFile(self):
+ indexFolder = os.path.join(self.mySpecieFolderPath, 'trix')
+ self.indexIxName = "".join( ( self.trackName, ".ix") )
+ self.indexIxxName = "".join( ( self.trackName, ".ixx") )
+ if not os.path.exists(indexFolder):
+ os.makedirs(indexFolder)
+
+ # Move index files to the index folder
+ self.indexIxPath = os.path.join(indexFolder, self.indexIxName)
+ shutil.copyfile(self.indexIx, self.indexIxPath)
+ self.indexIxxPath = os.path.join(indexFolder, self.indexIxxName)
+ shutil.copyfile(self.indexIxx, self.indexIxxPath)
+
+    def setSearchIndex(self):
+        if self.default_index:
+            # Keep a deterministic order while removing duplicates
+            indexes = [self.trixId]
+            if self.default_index != self.trixId:
+                indexes.append(self.default_index)
+            search_index = ",".join(indexes)
+        else:
+            search_index = self.trixId
+        self.logger.debug("trixId= %s, searchIndex= %s", self.trixId, search_index)
+        self.index_settings["searchIndex"] = search_index
+
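A usage sketch for TrixIndex with hypothetical paths; setExtLink copies the .ix/.ixx files into <species folder>/trix and returns the search settings:

from util.index.TrixIndex import TrixIndex

index = TrixIndex('myTrack.ix', 'myTrack.ixx', 'gonramp_myTrack',
                  '/path/to/myGenome', 'name', default_index='id')
settings = index.setExtLink()
# settings -> {'searchIndex': 'name,id', 'searchTrix': 'trix/gonramp_myTrack.ix'}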
diff -r bb6fdccef474 -r 31a41ce128cc util/index/TrixIndex.pyc
Binary file util/index/TrixIndex.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc util/index/__init__.py
diff -r bb6fdccef474 -r 31a41ce128cc util/index/__init__.pyc
Binary file util/index/__init__.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc util/santitizer.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/util/santitizer.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,70 @@
+#!/usr/bin/python
+# -*- coding: utf8 -*-
+
+"""
+This module contains the sanitization helpers used by the JBrowse Archive
+Creator to build valid track, file and group names.
+"""
+
+import string
+
+
+def prefixTrackName(filename):
+    """
+    Sanitize the track name. A track name must begin with a letter and may
+    contain only the following characters: [a-zA-Z0-9_].
+    See the "track" common settings at:
+    https://genome.ucsc.edu/goldenpath/help/trackDb/trackDbHub.html#bigPsl_-_Pairwise_Alignments
+    Sanitization is skipped for the cytoBandIdeo track.
+    """
+ if filename == 'cytoBandIdeo':
+ return filename
+    valid_chars = "_%s%s" % (string.ascii_letters, string.digits)
+    sanitized = ''.join([c if c in valid_chars else '_' for c in filename])
+    return "gonramp_" + sanitized
+
+def sanitize_name_input(string_to_sanitize):
+ """
+ Sanitize the string passed in parameter by replacing '/' and ' ' by '_'
+
+ :param string_to_sanitize:
+ :return :
+
+ :Example:
+
+ >>> sanitize_name_input('this/is an//example')
+    'this_is_an__example'
+ """
+ return string_to_sanitize \
+ .replace("/", "_") \
+ .replace(" ", "_")
+
+def sanitize_name_inputs(inputs_data):
+ """
+ Sanitize value of the keys "name" of the dictionary passed in parameter.
+
+ Because sometimes output from Galaxy, or even just file name, from user inputs, have spaces.
+ Also, it can contain '/' character and could break the use of os.path function.
+
+ :param inputs_data: dict[string, dict[string, string]]
+ """
+ for key in inputs_data:
+ inputs_data[key]["name"] = sanitize_name_input(inputs_data[key]["name"])
+
+def sanitize_group_name(group_name):
+ return group_name.lower().replace(' ', '_')
+
+def sanitize_name(input_name):
+ """
+ Galaxy will name all the files and dirs as *.dat,
+ the function can replace '.' to '_' for the dirs
+ """
+ validChars = "_-%s%s" % (string.ascii_letters, string.digits)
+ sanitized_name = ''.join([c if c in validChars else '_' for c in input_name])
+ return "gonramp_" + sanitized_name
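For illustration, the two track-name sanitizers differ only in the characters they keep ('-' survives sanitize_name but not prefixTrackName); inputs are hypothetical:

>>> prefixTrackName('my track.bam')
'gonramp_my_track_bam'
>>> sanitize_name('Dataset 1.dat')
'gonramp_Dataset_1_dat'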
diff -r bb6fdccef474 -r 31a41ce128cc util/santitizer.pyc
Binary file util/santitizer.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc util/subtools.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/util/subtools.py Fri Oct 13 12:44:31 2017 -0400
@@ -0,0 +1,372 @@
+#!/usr/bin/env python
+
+"""
+This file include common used functions for converting file format to gff3
+"""
+from collections import OrderedDict
+import json
+import subprocess
+import os
+import sys
+import tempfile
+import string
+import logging
+
+class PopenError(Exception):
+ def __init__(self, cmd, error, return_code):
+ self.cmd = cmd
+ self.error = error
+ self.return_code = return_code
+
+ def __str__(self):
+ message = "The subprocess {0} has returned the error: {1}.".format(
+ self.cmd, self.return_code)
+ message = ','.join(
+ (message, "Its error message is: {0}".format(self.error)))
+ return repr(message)
+
+
+def _handleExceptionAndCheckCall(array_call, **kwargs):
+ """
+    This function handles exceptions and calls the tool.
+    It mirrors the signature of subprocess.check_call:
+ See https://docs.python.org/2/library/subprocess.html#subprocess.check_call
+ """
+ stdout = kwargs.get('stdout', subprocess.PIPE)
+ stderr = kwargs.get('stderr', subprocess.PIPE)
+ shell = kwargs.get('shell', False)
+ stdin = kwargs.get('stdin', None)
+
+ cmd = array_call[0]
+
+ output = None
+ error = None
+
+ # TODO: Check the value of array_call and <=[0]
+ logging.debug("Calling {0}:".format(cmd))
+ logging.debug("%s", array_call)
+ logging.debug("---------")
+
+ # TODO: Use universal_newlines option from Popen?
+ try:
+ p = subprocess.Popen(array_call, stdout=stdout,
+ stderr=stderr, shell=shell, stdin=stdin)
+
+ # TODO: Change this because of possible memory issues => https://docs.python.org/2/library/subprocess.html#subprocess.Popen.communicate
+
+ output, error = p.communicate()
+
+ if stdout == subprocess.PIPE:
+ logging.debug("\t{0}".format(output))
+ else:
+ logging.debug("\tOutput in file {0}".format(stdout.name))
+ # If we detect an error from the subprocess, then we raise an exception
+ # TODO: Manage if we raise an exception for everything, or use CRITICAL etc... but not stop process
+ # TODO: The responsability of returning a sys.exit() should not be there, but up in the app.
+ if p.returncode:
+ if stderr == subprocess.PIPE:
+ raise PopenError(cmd, error, p.returncode)
+ else:
+ # TODO: To Handle properly with a design behind, if we received a option as a file for the error
+                    raise Exception("Error when calling {0}. The error has been logged in your file {1}. Error code: {2}"
+                                    .format(cmd, stderr.name, p.returncode))
+
+ except OSError as e:
+ message = "The subprocess {0} has encountered an OSError: {1}".format(
+ cmd, e.strerror)
+ if e.filename:
+ message = '\n'.join(
+ (message, ", against this file: {0}".format(e.filename)))
+ logging.error(message)
+ sys.exit(-1)
+ except PopenError as p:
+ message = "The subprocess {0} has returned the error: {1}.".format(
+ p.cmd, p.return_code)
+ message = '\n'.join(
+ (message, "Its error message is: {0}".format(p.error)))
+
+ logging.exception(message)
+
+ sys.exit(p.return_code)
+ except Exception as e:
+ message = "The subprocess {0} has encountered an unknown error: {1}".format(
+ cmd, e)
+ logging.exception(message)
+
+ sys.exit(-1)
+ return p
+
+
+def write_features(field, attribute, gff3):
+ """
+    This function writes the features in GFF3 format (defined in https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md).
+    field and attribute are ordered dictionaries; gff3 is the file handle.
+ """
+ attr = []
+ for v in field.values():
+ gff3.write(str(v) + '\t')
+ for k, v in attribute.items():
+ s = str(k) + '=' + str(v)
+ attr.append(s)
+ gff3.write(';'.join(attr))
+ gff3.write('\n')
+
+def twoBitInfo(two_bit_file_name, two_bit_info_file):
+ """
+ Call twoBitInfo and write the result into twoBit_info_file
+ :param two_bit_file_name:
+ :param two_bit_info_file:
+ :return the subprocess.check_call return object:
+ """
+ array_call = ['twoBitInfo', two_bit_file_name, two_bit_info_file]
+ p = _handleExceptionAndCheckCall(array_call)
+ return p
+
+
+def faToTwoBit(fasta_file_name, twoBitFile):
+ """
+    This function calls the faToTwoBit UCSC tool and returns the twoBitFile
+    :param fasta_file_name:
+    :param twoBitFile:
+ :return:
+ """
+
+ array_call = ['faToTwoBit', fasta_file_name, twoBitFile]
+ _handleExceptionAndCheckCall(array_call)
+
+ return twoBitFile
+
+def sortChromSizes(two_bit_info_file_name, chrom_sizes_file_name):
+ """
+ Call sort with -k2rn on two_bit_info_file_name and write the result into chrom_sizes_file_name
+ :param two_bit_info_file_name:
+ :param chrom_sizes_file_name:
+ :return:
+ """
+ array_call = ['sort', '-k2rn', two_bit_info_file_name,
+ '-o', chrom_sizes_file_name]
+ p = _handleExceptionAndCheckCall(array_call)
+ return p
+
+def getChromSizes(reference, tool_dir):
+    #TODO: find a better way instead of shipping the two exec files with the tool
+    faToTwoBit = os.path.join(tool_dir, 'faToTwoBit')
+    twoBitInfo = os.path.join(tool_dir, 'twoBitInfo')
+    try:
+        twoBitFile = tempfile.NamedTemporaryFile(bufsize=0)
+        chrom_sizes = tempfile.NamedTemporaryFile(bufsize=0, suffix='.chrom.sizes', delete=False)
+    except IOError as err:
+        print "Cannot create tempfile err({0}): {1}".format(err.errno, err.strerror)
+    try:
+        # Use the executables shipped in tool_dir instead of relying on PATH
+        subprocess.call([faToTwoBit, reference, twoBitFile.name])
+    except OSError as err:
+        print "Cannot generate twoBitFile from faToTwoBit err({0}): {1}".format(err.errno, err.strerror)
+    try:
+        subprocess.call([twoBitInfo, twoBitFile.name, chrom_sizes.name])
+    except OSError as err:
+        print "Cannot generate chrom_sizes from twoBitInfo err({0}): {1}".format(err.errno, err.strerror)
+ return chrom_sizes
+
+def sequence_region(chrom_sizes):
+    """
+    This function reads a chromosome sizes file generated by twoBitInfo
+    and returns a dict mapping each sequence name to its size.
+    """
+    sizes_dict = {}
+    with open(chrom_sizes, 'r') as f:
+        for line in f:
+            chrom_info = line.rstrip().split('\t')
+            sizes_dict[chrom_info[0]] = chrom_info[1]
+    return sizes_dict
+
+def child_blocks(parent_field, parent_attr, gff3, child_type):
+ num = 0
+ blockcount = int(parent_attr['blockcount'])
+ chromstart = parent_attr['chromstarts'].split(',')
+ blocksize = parent_attr['blocksizes'].split(',')
+ parent_start = parent_field['start']
+ while num < blockcount:
+ child_attr = OrderedDict()
+ child_field = parent_field
+ child_field['type'] = child_type
+ child_field['start'] = int(chromstart[num]) + int(parent_start)
+ child_field['end'] = int(child_field['start']) + int(blocksize[num]) - 1
+ child_attr['ID'] = parent_attr['ID'] + '_part_' + str(num+1)
+ child_attr['Parent'] = parent_attr['ID']
+ write_features(child_field, child_attr, gff3)
+ num = num + 1
+
+def add_tracks_to_json(trackList_json, new_tracks, modify_type):
+    """
+    Update the track configuration file (trackList.json).
+    modify_type = 'add_tracks': append a new track (e.g. bam or bigwig); new_tracks is a dict
+    modify_type = 'add_attr': add settings to an existing track; new_tracks is a dict of dicts keyed by track name
+    """
+    with open(trackList_json, 'r+') as f:
+        data = json.load(f)
+        if modify_type == 'add_tracks':
+            data['tracks'].append(new_tracks)
+        elif modify_type == 'add_attr':
+            for name in new_tracks:
+                for track in data['tracks']:
+                    if name.lower() in track['urlTemplate'].lower():
+                        for attr_key, attr_value in new_tracks[name].items():
+                            track[attr_key] = attr_value
+        f.seek(0, 0)
+        f.write(json.dumps(data, separators=(',', ':'), indent=4))
+        f.truncate()
+
+
+def createBamIndex(bamfile):
+ subprocess.call(['samtools', 'index', bamfile])
+ filename = bamfile + '.bai'
+ if os.path.exists(filename):
+ return filename
+ else:
+ raise ValueError('Did not find bai file')
+
+def flatfile_to_json(inputFile, dataType, trackType, trackLabel, outputFolder, options=None, compress=False):
+ if "bed" in dataType:
+ fileType = "--bed"
+ elif "gff" in dataType:
+ fileType = "--gff"
+ else:
+ raise ValueError("%s is not a valid filetype for flatfile_to_json" % dataType)
+
+
+ array_call = ['flatfile-to-json.pl',
+ fileType, inputFile,
+ '--trackType', trackType,
+ '--trackLabel', trackLabel,
+ '--out', outputFolder]
+ if compress:
+ array_call.append('--compress')
+ if options:
+ config = options.get("config")
+ clientConfig = options.get("clientConfig")
+ renderClassName = options.get('renderClassName')
+ subfeatureClasses = options.get('subfeatureClasses')
+ load_type = options.get("type")
+ if clientConfig:
+ array_call.append('--clientConfig')
+ array_call.append(clientConfig)
+ if config:
+ array_call.append('--config')
+ array_call.append(config)
+ if load_type:
+ array_call.append('--type')
+ array_call.append(load_type)
+ if renderClassName:
+ array_call.append('--renderClassName')
+ array_call.append(renderClassName)
+ if subfeatureClasses:
+ array_call.append('--subfeatureClasses')
+ array_call.append(json.dumps(subfeatureClasses))
+
+ p = _handleExceptionAndCheckCall(array_call)
+ return p
+
+def bam_to_json(inputFile, trackLabel, outputFolder, options=None, compress=False):
+
+ array_call = ['bam-to-json.pl',
+ '--bam', inputFile,
+ '--trackLabel', trackLabel,
+ '--out', outputFolder]
+ if compress:
+ array_call.append('--compress')
+ if options:
+ config = options.get('config')
+ clientConfig = options.get('clientConfig')
+ if clientConfig:
+ array_call.append('--clientConfig')
+ array_call.append(clientConfig)
+ if config:
+ array_call.append('--config')
+ array_call.append(config)
+
+ p = _handleExceptionAndCheckCall(array_call)
+ return p
+
+def add_track_json(trackList, track_json):
+ track_json = json.dumps(track_json)
+ new_track = subprocess.Popen(['echo', track_json], stdout=subprocess.PIPE)
+ p = subprocess.call(['add-track-json.pl', trackList], stdin=new_track.stdout)
+ return p
+
+def prepare_refseqs(fasta_file_name, outputFolder):
+ array_call = ['prepare-refseqs.pl', '--fasta', fasta_file_name, '--out', outputFolder]
+ p = _handleExceptionAndCheckCall(array_call)
+ return p
+
+def generate_names(outputFolder):
+ array_call = ['generate-names.pl', '-v', '--out', outputFolder]
+ p = _handleExceptionAndCheckCall(array_call)
+ return p
+
+def validateFiles(input_file, chrom_sizes_file_name, file_type, options=None):
+ """
+ Call validateFiles on input_file, using chrom_sizes_file_name and file_type
+ :param input_file:
+ :param chrom_sizes_file_name:
+ :param file_type:
+ :return:
+ """
+
+ array_call = ['validateFiles', '-chromInfo=' + chrom_sizes_file_name, '-type='+ file_type, input_file]
+ if options:
+ tab = options.get("tab")
+ autoSql = options.get("autoSql")
+ logging.debug("tab: {0}".format(tab))
+ logging.debug("autoSql: {0}".format(autoSql))
+ if autoSql:
+ autoSql = ''.join(['-as=', autoSql])
+ array_call.append(autoSql)
+ if tab:
+ array_call.append('-tab')
+ p = _handleExceptionAndCheckCall(array_call)
+ return p
+
+def arrow_add_organism(organism_name, organism_dir, public=False):
+ array_call = ['arrow', 'organisms', 'add_organism', organism_name, organism_dir]
+ if public:
+ array_call.append('--public')
+ p = subprocess.check_output(array_call)
+ return p
+
+def arrow_create_user(user_email, firstname, lastname, password, admin=False):
+ """ Create a new user of Apollo, the default user_role is "user" """
+ array_call = ['arrow', 'users', 'create_user', user_email, firstname, lastname, password]
+ if admin:
+ array_call += ['--role', 'admin']
+ p = subprocess.check_output(array_call)
+ return p
+
+def arrow_update_organism_permissions(user_id, organism, **user_permissions):
+ array_call = ['arrow', 'users', 'update_organism_permissions', str(user_id), str(organism)]
+ admin = user_permissions.get("admin", False)
+ write = user_permissions.get("write", False)
+ read = user_permissions.get("read", False)
+ export = user_permissions.get("export", False)
+ if admin:
+ array_call.append('--administrate')
+ if write:
+ array_call.append('--write')
+ if read:
+ array_call.append('--read')
+ if export:
+ array_call.append('--export')
+ p = subprocess.check_output(array_call)
+ return p
+
+def arrow_get_users(user_email):
+ array_call = ['arrow', 'users', 'get_users']
+ p = subprocess.check_output(array_call)
+ all_users = json.loads(p)
+ for d in all_users:
+ if d['username'] == user_email:
+ return d['userId']
+ logging.error("Cannot find user %s", user_email)
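The arrow_* wrappers above are meant to be composed when provisioning Apollo; a hedged sketch of the flow, with hypothetical credentials and organism name:

from util import subtools

# Create an Apollo user, look up its id, then grant organism permissions
subtools.arrow_create_user('user@example.org', 'demo', 'user', 'gonramp')
user_id = subtools.arrow_get_users('user@example.org')
subtools.arrow_update_organism_permissions(user_id, 'myOrganism',
                                           write=True, read=True, export=True)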
diff -r bb6fdccef474 -r 31a41ce128cc util/subtools.pyc
Binary file util/subtools.pyc has changed
diff -r bb6fdccef474 -r 31a41ce128cc utils.py
--- a/utils.py Wed Jul 12 12:55:27 2017 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,162 +0,0 @@
-#!/usr/bin/env python
-
-"""
-This file include common used functions for converting file format to gff3
-"""
-from collections import OrderedDict
-import json
-import subprocess
-import os
-import tempfile
-import string
-
-def write_features(field, attribute, gff3):
- """
- The function write the features to gff3 format (defined in https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md)
- field, attribute are ordered dictionary
- gff3 is the file handler
- """
- attr = []
- for v in field.values():
- gff3.write(str(v) + '\t')
- for k, v in attribute.items():
- s = str(k) + '=' + str(v)
- attr.append(s)
- gff3.write(';'.join(attr))
- gff3.write('\n')
-
-def getChromSizes(reference, tool_dir):
- #TODO: find a better way instead of shipping the two exec files with the tool
- faToTwoBit = os.path.join(tool_dir, 'faToTwoBit')
- twoBitInfo = os.path.join(tool_dir, 'twoBitInfo')
- try:
- twoBitFile = tempfile.NamedTemporaryFile(bufsize=0)
- chrom_sizes = tempfile.NamedTemporaryFile(bufsize=0, suffix='.chrom.sizes', delete=False)
- except IOError as err:
- print "Cannot create tempfile err({0}): {1}".format(err.errno, err.strerror)
- try:
- subprocess.call(['faToTwoBit', reference, twoBitFile.name])
- except OSError as err:
- print "Cannot generate twoBitFile from faToTwoBit err({0}): {1}".format(err.errno, err.strerror)
- try:
- subprocess.call(['twoBitInfo', twoBitFile.name, chrom_sizes.name])
- except OSError as err:
- print "Cannot generate chrom_sizes from twoBitInfo err({0}): {1}".format(err.errno, err.strerror)
- return chrom_sizes
-
-def sequence_region(chrom_sizes):
- """
- This function read from a chromatin size file generated by twoBitInfo and write the information to dict
- return a dict
- """
- f = open(chrom_sizes, 'r')
- sizes = f.readlines()
- sizes_dict = {}
- for line in sizes:
- chrom_info = line.rstrip().split('\t')
- sizes_dict[chrom_info[0]] = chrom_info[1]
- return sizes_dict
-
-def child_blocks(parent_field, parent_attr, gff3, child_type):
- num = 0
- blockcount = int(parent_attr['blockcount'])
- chromstart = parent_attr['chromstarts'].split(',')
- blocksize = parent_attr['blocksizes'].split(',')
- parent_start = parent_field['start']
- while num < blockcount:
- child_attr = OrderedDict()
- child_field = parent_field
- child_field['type'] = child_type
- child_field['start'] = int(chromstart[num]) + int(parent_start)
- child_field['end'] = int(child_field['start']) + int(blocksize[num]) - 1
- child_attr['ID'] = parent_attr['ID'] + '_part_' + str(num+1)
- child_attr['Parent'] = parent_attr['ID']
- write_features(child_field, child_attr, gff3)
- num = num + 1
-
-def add_tracks_to_json(trackList_json, new_tracks, modify_type):
- """
- Add to track configuration (trackList.json)
- # modify_type = 'add_tracks': add a new track like bam or bigwig, new_track = dict()
- # modify_type = 'add_attr': add configuration to the existing track, new_track = dict(track_name: dict())
- """
- with open(trackList_json, 'r+') as f:
- data = json.load(f)
- if modify_type == 'add_tracks':
- data['tracks'].append(new_tracks)
- elif modify_type == 'add_attr':
- for k in new_tracks:
- for track in data['tracks']:
- if k.lower() in track['urlTemplate'].lower():
- attr = new_tracks[k]
- for k, v in attr.items():
- track[k] = v
- f.seek(0, 0)
- f.write(json.dumps(data, separators=(',' , ':'), indent=4))
- f.truncate()
- f.close()
-
-def gtfToGff3(gtf_file, gff3_file, chrom_sizes):
- """
- Covert gtf file output from StringTie to gff3 format
- """
- gff3 = open(gff3_file, 'w')
- gff3.write("##gff-version 3\n")
- sizes_dict = sequence_region(chrom_sizes)
- seq_regions = dict()
- parents = dict()
- with open(gtf_file, 'r') as gtf:
- for line in gtf:
- if line.startswith('#'):
- continue
- field = OrderedDict()
- attribute = OrderedDict()
- li = line.rstrip().split("\t")
- #print li
- field['seqid'] = li[0]
- #print field['seqid']
- if field['seqid'] not in seq_regions:
- end_region = sizes_dict[field['seqid']]
- gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(end_region) + '\n')
- seq_regions[field['seqid']] = end_region
- field['source'] = li[1]
- field['type'] = li[2]
- # The first base in a chromosome is numbered 0 in BED format
- field['start'] = li[3]
- field['end'] = li[4]
- field['score'] = li[5]
- field['strand'] = li[6]
- field['phase'] = li[7]
- attr_li = li[8].split(';')
- gene_id = attr_li[0].split()[1].strip('"')
- attribute['ID'] = gene_id + '_' + field['type'] + '_' + str(field['start']) + '_' + str(field['end'])
- if field['type'] == 'transcript':
- parents[gene_id] = attribute['ID']
- attribute['transcript_id'] = attr_li[1].split()[1].strip('"')
- attribute['coverage'] = attr_li[2].split()[1].strip('"')
- attribute['fpkm'] = attr_li[3].split()[1].strip('"')
- attribute['tpm'] = attr_li[4].split()[1].strip('"')
- elif field['type'] == 'exon':
- attribute['Parent'] = parents[gene_id]
- attribute['transcript_id'] = attr_li[1].split()[1].strip('"')
- attribute['coverage'] = attr_li[3].split()[1].strip('"')
- write_features(field, attribute, gff3)
- gff3.close()
-
-
-def sanitize_name(input_name):
- """
- Galaxy will name all the files and dirs as *.dat,
- the function can replace '.' to '_' for the dirs
- """
- validChars = "_-%s%s" % (string.ascii_letters, string.digits)
- sanitized_name = ''.join([c if c in validChars else '_' for c in input_name])
- return "gonramp_" + sanitized_name
-
-def createBamIndex(bamfile):
- subprocess.call(['samtools', 'index', bamfile])
- filename = bamfile + '.bai'
- if os.path.exists(filename):
- return filename
- else:
- raise ValueError('Did not find bai file')
diff -r bb6fdccef474 -r 31a41ce128cc utils.pyc
Binary file utils.pyc has changed