Mercurial > repos > yating-l > jbrowse_hub

--- a/TrackHub.py	Wed Mar 15 11:46:38 2017 -0400
+++ b/TrackHub.py	Fri Mar 17 12:28:32 2017 -0400
@@ -1,14 +1,10 @@
 #!/usr/bin/env python

 import os
-import trackObject
+import subprocess
+import shutil
 import utils
-import subprocess
-import string
-import shutil
-import tempfile

-#TODO: package JBrowse file conversion .pl files

 class TrackHub:
     def __init__(self, inputFiles, reference, outputDirect, tool_dir, genome, extra_files_path):
@@ -21,10 +17,6 @@
         self.raw = os.path.join(self.out_path, 'raw')
         self.json = os.path.join(self.out_path, 'json')
         try:
-            if not self.out_path:
-                raise ValueError('empty output path\n')
-            if not os.path.exists(self.out_path):
-                raise ValueError('the output folder has not been created')
             if os.path.exists(self.json):
                 shutil.rmtree(self.json)
             os.makedirs(self.json)
@@ -39,7 +31,6 @@
             self.addTrack(input_file)
         self.indexName()
         self.makeArchive()
-        #shutil.rmtree(self.out_path)
         self.outHtml()
         print "Success!\n"

@@ -59,7 +50,7 @@
             bam_track = dict()
             bam_track['type'] = 'JBrowse/View/Track/Alignments2'
             bam_track['storeClass'] = 'JBrowse/Store/SeqFeature/BAM'
-            bam_track['label'] = track['fileName']
+            bam_track['label'] = track['label']
             bam_track['urlTemplate'] = os.path.join('../raw', track['fileName'])
             bam_track['baiUrlTemplate'] = os.path.join('../raw', track['index'])
             utils.add_tracks_to_json(json_file, bam_track, 'add_tracks')
@@ -68,14 +59,14 @@
             self.createTrackList()
             json_file = os.path.join(self.json, "trackList.json")
             bigwig_track = dict()
-            bigwig_track['label'] = track['fileName']
+            bigwig_track['label'] = track['label']
             bigwig_track['urlTemplate'] = os.path.join('../raw', track['fileName'])
             bigwig_track['type'] = 'JBrowse/View/Track/Wiggle/XYPlot'
             bigwig_track['storeClass'] = 'JBrowse/Store/SeqFeature/BigWig'
             utils.add_tracks_to_json(json_file, bigwig_track, 'add_tracks')
         else:
             gff3_file = os.path.join(self.raw, track['fileName'])
-            label = track['fileName']
+            label = track['label']
             if track['dataType'] == 'bedSpliceJunctions' or track['dataType'] == 'gtf':
                 p = subprocess.Popen(['flatfile-to-json.pl', '--gff', gff3_file, '--trackType', 'CanvasFeatures', '--trackLabel', label, '--config', '{"glyph": "JBrowse/View/FeatureGlyph/Segments"}', '--out', self.json])
             elif track['dataType'] == 'gff3_transcript':
@@ -98,7 +89,6 @@
     #TODO: this will list all zip files in the filedir and sub-dirs. worked in Galaxy but all list zip files in test-data when
     #run it locally. May need modify
     def outHtml(self):
-        #htmloutput = tempfile.NamedTemporaryFile(self.outfile, suffix = '.html', bufsize=0, delete=False)
         with open(self.outfile, 'w') as htmlfile:
             htmlstr = 'The JBrowse Hub is created: <br>'
             zipfiles = '<li><a href = "%s">Download</a></li>'
@@ -112,15 +102,13 @@
                         relative_file_path = os.path.join(relative_directory, file)
                         htmlstr += zipfiles % relative_file_path

-            #htmlstr = htmlstr % zipfile
             htmlfile.write(htmlstr)

     def createTrackList(self):
         trackList = os.path.join(self.json, "trackList.json")
         if not os.path.exists(trackList):
             os.mknod(trackList)
-            #open(trackList,'w').close()
-
+
--- a/blastxmlToGff3.py	Wed Mar 15 11:46:38 2017 -0400
+++ b/blastxmlToGff3.py	Fri Mar 17 12:28:32 2017 -0400
@@ -7,14 +7,14 @@


 def align2cigar(hsp_query, hsp_reference):
-    '''
+    """
         Build CIGAR representation from an hsp_query
         input:
             hsp_query
             hsp_sbjct
         output:
             CIGAR string
-    '''
+    """
     query = hsp_query
     ref = hsp_reference
     # preType, curType:
@@ -98,6 +98,8 @@
                 attribute['ID'] = field['seqid'] + '_' + str(field['start']) + '_' + str(field['end']) + '_' + query_name + '_' + str(target_start) + '_' + str(target_end)
                 attribute['Target'] = query_name + " " + str(target_start) + " " + str(target_end)
                 attribute['Gap'] = align2cigar(query, ref)
+                #store the query sequence in the file in order to display alignment with BlastAlignment plugin
+                attribute['query'] = hsp.query
                 # show reading frame attribute only if the frame is not (0, 0)
                 if hsp.frame[0] != 0 or hsp.frame[1] != 0:
                     attribute['reading_frame'] = str(hsp.frame[0]) + ", " + str(hsp.frame[1])
--- a/jbrowse_hub.py	Wed Mar 15 11:46:38 2017 -0400
+++ b/jbrowse_hub.py	Fri Mar 17 12:28:32 2017 -0400
@@ -1,22 +1,19 @@
 #!/usr/bin/env python

-import os
 import sys
 import argparse
-import subprocess
-from bedToGff3 import bedToGff3
-import blastxmlToGff3
+import json
 import utils
-import tempfile
 import trackObject
 import TrackHub
-import shutil
+
+

 def main(argv):
     parser = argparse.ArgumentParser(description='Create a hub to display in jbrowse.')

     # Reference genome mandatory
-    parser.add_argument('-f', '--fasta', help='Fasta file of the reference genome')
+    parser.add_argument('-f', '--fasta', help='Fasta file of the reference genome (Required)')

     # Genome name
     parser.add_argument('-g', '--genome_name', help='Name of reference genome')
@@ -25,7 +22,10 @@
     parser.add_argument('-o', '--out', help='output html')

     # Output folder
-    parser.add_argument('-e', '--extra_files_path', help="Directory of JBrowse Hub folder")
+    parser.add_argument('-e', '--extra_files_path', help='Directory of JBrowse Hub folder')
+
+    #Tool Directory
+    parser.add_argument('-d', '--tool_directory', help='The directory of JBrowse file convertion scripts and UCSC tools')

     # GFF3 structure: gene->transcription->CDS
     parser.add_argument('--gff3_transcript', action='append', help='GFF3 format, structure: gene->transcription->CDS')
@@ -51,25 +51,45 @@
     # GTF format
     parser.add_argument('--gtf', action='append', help='GTF format from StringTie')

+    # Metadata json format
+    parser.add_argument('-j', '--data_json', help='Json containing the metadata of the inputs')
+
     args = parser.parse_args()
     all_datatype_dictionary = dict()


+    if not args.fasta:
+        parser.print_help()
+        raise RuntimeError("No reference genome\n")
     reference = args.fasta
     genome = 'unknown'
-    out_path = '.'
+    out_path = 'unknown.html'
     extra_files_path = '.'
+    tool_directory = '.'
     if args.genome_name:
-        genome = utils.sanitize_name_path(args.genome_name)
+        genome = utils.sanitize_name(args.genome_name)
     if args.out:
         out_path = args.out
     if args.extra_files_path:
-        extra_files_path = utils.sanitize_name_path(args.extra_files_path)
-    cwd = os.getcwd()
+        extra_files_path = utils.sanitize_name(args.extra_files_path)
+
     #tool_directory not work for Galaxy tool, all tools need to exist in the current PATH, deal with it with tool dependencies
-    tool_directory = os.path.join(cwd, 'JBrowse-1.12.1/bin')
+    if args.tool_directory:
+        tool_directory = args.tool_directory
+
+    #Calculate chromosome sizes using genome reference and UCSC tools
     chrom_size = utils.getChromSizes(reference, tool_directory)
-    all_tracks = trackObject.trackObject(chrom_size.name, genome, extra_files_path) #store converted files in the array: all_tracks.tracks
+
+    #get metadata from json file
+    json_inputs_data = args.data_json
+    if json_inputs_data:
+        inputs_data = json.loads(json_inputs_data)
+    else:
+        inputs_data = {}
+
+    #Initiate trackObject
+    all_tracks = trackObject.trackObject(chrom_size.name, genome, extra_files_path)
+
     array_inputs_bam = args.bam
     array_inputs_bed_simple_repeats = args.bedSimpleRepeats
     array_inputs_bed_splice_junctions = args.bedSpliceJunctions
@@ -78,6 +98,7 @@
     array_inputs_gff3_mrna = args.gff3_mrna
     array_inputs_gtf = args.gtf
     array_inputs_blastxml = args.blastxml
+
     if array_inputs_bam:
         all_datatype_dictionary['bam'] = array_inputs_bam
     if array_inputs_bed_simple_repeats:
@@ -95,7 +116,7 @@
     if array_inputs_blastxml:
         all_datatype_dictionary['blastxml'] = array_inputs_blastxml

-    print all_datatype_dictionary
+    print "input tracks: \n", all_datatype_dictionary

     for datatype, inputfiles in all_datatype_dictionary.items():
         try:
@@ -105,11 +126,23 @@
             print 'Cannot open', datatype
         else:
             for f in inputfiles:
-                all_tracks.addToRaw(f, datatype)
+                metadata = {}
+                if f in inputs_data.keys():
+                    metadata = inputs_data[f]
+                #Convert tracks into gff3 format
+                all_tracks.addToRaw(f, datatype, metadata)

     jbrowseHub = TrackHub.TrackHub(all_tracks, reference, out_path, tool_directory, genome, extra_files_path)
     jbrowseHub.createHub()
-
+
+"""
+def extractMetadata(array_inputs, inputs_data):
+    metadata_dict = {}
+    for input_false_path in array_inputs:
+        for key, data_value in inputs_data.items():
+            if key == input_false_path:
+                metadata_dict[input_false_path]
+"""

 if __name__ == "__main__":
     main(sys.argv)
--- a/jbrowse_hub.xml	Wed Mar 15 11:46:38 2017 -0400
+++ b/jbrowse_hub.xml	Fri Mar 17 12:28:32 2017 -0400
@@ -17,36 +17,70 @@
     <command detect_errors="exit_code"><![CDATA[
         python $__tool_directory__/jbrowse_hub.py
         --fasta '$reference'
+        --genome_name '$genome_name'
+
+        ## json metadata recording from Remi's hub-archive-creator.xml
+        #import json
+        #set global data_parameter_dict = {}
+
+        ## Function to retrieve the data of the inputs
+        #def prepare_json($input_to_prepare, $extra_data_dict={})
+            #set false_path = str($input_to_prepare)
+            #set name = $input_to_prepare.name
+
+            #set data_dict = {"name": $name}
+            #silent data_dict.update($extra_data_dict)
+
+            #silent $data_parameter_dict.update({$false_path: $data_dict})
+
+        #end def
+
+
         #for $f in $format
+            #set track_label =  $f.formatChoice.label
+            #set extra_data_dict = {'label' : $track_label}
             #if $f.formatChoice.format_select == 'bed'
                 #if $f.formatChoice.bedChoice.bed_select == 'bed_simple_repeats_option'
                     --bedSimpleRepeats $f.formatChoice.bedChoice.BED_simple_repeats
+                    #silent $prepare_json($f.formatChoice.bedChoice.BED_simple_repeats, extra_data_dict)
                 #elif $f.formatChoice.bedChoice.bed_select == 'bed_splice_junctions_option'
                     --bedSpliceJunctions $f.formatChoice.bedChoice.BED_splice_junctions
+                    #silent $prepare_json($f.formatChoice.bedChoice.BED_splice_junctions, extra_data_dict)
                 #end if
             #end if
             #if $f.formatChoice.format_select == 'bam'
                 --bam $f.formatChoice.BAM
+                #silent $prepare_json($f.formatChoice.BAM, extra_data_dict)
             #end if
-            #if $f.formatChoice.format_select == 'gff3_transcript'
-                --gff3_transcript $f.formatChoice.GFF3_transcript
-            #end if
-            #if $f.formatChoice.format_select == 'gff3_mrna'
-                --gff3_mrna $f.formatChoice.GFF3_mrna
+            #if $f.formatChoice.format_select == 'gff3'
+                ## gff3Choice selects exactly one GFF3 flavour: keep both branches in one if/elif chain with a single closing end-if
+                #if $f.formatChoice.gff3Choice.gff3_select == 'gff3_transcript'
+                    --gff3_transcript $f.formatChoice.gff3Choice.GFF3_transcript
+                    #silent $prepare_json($f.formatChoice.gff3Choice.GFF3_transcript, extra_data_dict)
+                #elif $f.formatChoice.gff3Choice.gff3_select == 'gff3_mrna'
+                    --gff3_mrna $f.formatChoice.gff3Choice.GFF3_mrna
+                    #silent $prepare_json($f.formatChoice.gff3Choice.GFF3_mrna, extra_data_dict)
+                #end if
             #end if
             #if $f.formatChoice.format_select == 'blastxml'
                 --blastxml $f.formatChoice.BlastXML
+                #silent $prepare_json($f.formatChoice.BlastXML, extra_data_dict)
             #end if
             #if $f.formatChoice.format_select == 'gtf'
                 --gtf $f.formatChoice.GTF
+                #silent $prepare_json($f.formatChoice.GTF, extra_data_dict)
             #end if
             #if $f.formatChoice.format_select == 'bigwig'
                 --bigwig $f.formatChoice.BIGWIG
+                #silent $prepare_json($f.formatChoice.BIGWIG, extra_data_dict)
             #end if
        #end for
-        --genome_name '$genome_name'
+
+       #set all_data_json = json.dumps($data_parameter_dict)
+        -j '$all_data_json'
         -e '$output.extra_files_path'
         -o '$output'
+
     ]]></command>

     <inputs>
@@ -71,6 +105,7 @@
                                 type="data"
                                 label="BAM File"
                         />
+                        <param name="label" type="text" size="30" value="${on_string}" label="Track name" />
                     </when>
                     <when value="bed">
                         <conditional name="bedChoice">
@@ -95,6 +130,7 @@
                                 />
                             </when>
                         </conditional>
+                        <param name="label" type="text" size="30" value="${on_string}" label="Track name" />
                     </when>
                     <when value="blastxml">
                         <param
@@ -103,6 +139,7 @@
                                 type="data"
                                 label="Blast Alignments File"
                         />
+                        <param name="label" type="text" size="30" value="${on_string}" label="Track name" />
                     </when>
                     <when value="bigwig">
                         <param
@@ -111,22 +148,32 @@
                                 type="data"
                                 label="BIGWIG File"
                         />
+                        <param name="label" type="text" size="30" value="${on_string}" label="Track name" />
                     </when>
-                    <when value="gff3_transcript">
-                        <param
-                                format="gff3"
-                                name="GFF3_transcript"
-                                type="data"
-                                label="GFF3 File"
-                        />
-                    </when>
-                    <when value="gff3_mrna">
-                        <param
-                                format="gff3"
-                                name="GFF3_mrna"
-                                type="data"
-                                label="GFF3 File"
-                        />
+                    <when value="gff3">
+                        <conditional name="gff3Choice">
+                            <param name="gff3_select" type="select" label="gff3 type">
+                                <option value="gff3_transcript">GFF3 format, structure: gene->transcription->CDS</option>
+                                <option value="gff3_mrna">GFF3 format, structure: gene->mRNA->CDS</option>
+                            </param>
+                            <when value="gff3_transcript">
+                                <param
+                                    format="gff3"
+                                    name="GFF3_transcript"
+                                    type="data"
+                                    label="GFF3 File"
+                                />
+                            </when>
+                            <when value="gff3_mrna">
+                                <param
+                                    format="gff3"
+                                    name="GFF3_mrna"
+                                    type="data"
+                                    label="GFF3 File"
+                                />
+                            </when>
+                        </conditional>
+                        <param name="label" type="text" size="30" value="${on_string}" label="Track name" />
                     </when>
                     <when value="gtf">
                         <param
@@ -135,6 +182,7 @@
                                 type="data"
                                 label="GTF File"
                         />
+                        <param name="label" type="text" size="30" value="${on_string}" label="Track name" />
                     </when>
             </conditional>
         </repeat>
--- a/tool_dependencies.xml	Wed Mar 15 11:46:38 2017 -0400
+++ b/tool_dependencies.xml	Fri Mar 17 12:28:32 2017 -0400
@@ -8,6 +8,9 @@
     </package>
     <package name="biopython" version="1.68">
         <readme>
+This package is based on package_biopython_1_67 owned by biopython.
+https://toolshed.g2.bx.psu.edu/repository?user_id=fd5c6d0f82f315d8
+
 This Galaxy Tool Shed package installs Biopython from source, having
 first installed NumPy which is a build time depencency. This requires
 and assumes a standard C compiler is already installed, along with
--- a/trackObject.py	Wed Mar 15 11:46:38 2017 -0400
+++ b/trackObject.py	Fri Mar 17 12:28:32 2017 -0400
@@ -5,15 +5,14 @@
 import utils
 import bedToGff3
 import blastxmlToGff3
-import tempfile
-import subprocess
+

 class trackObject:
     def __init__(self, chrom_size, genome, extra_files_path):
         self.chrom_size = chrom_size
         outputDirect = os.path.join(extra_files_path, genome)
         self.raw_folder = os.path.join(outputDirect, 'raw')
-        print self.raw_folder
+        #Store metadata of the tracks
         self.tracks = []
         try:
             if os.path.exists(self.raw_folder):
@@ -25,11 +24,11 @@
         except OSError as oserror:
             print "Cannot create raw folder error({0}): {1}".format(oserror.errno, oserror.strerror)

-    def addToRaw(self, dataFile, dataType):
-        '''
+    def addToRaw(self, dataFile, dataType, metaData):
+        """
         Convert gff3, BED, blastxml and gtf files into gff3 files
         and store converted files in folder 'raw'
-        '''
+        """

         fileName = os.path.basename(dataFile)
         des_path = os.path.join(self.raw_folder, fileName)
@@ -61,18 +60,7 @@
             utils.gtfToGff3(dataFile, des_path, self.chrom_size)
         track['fileName'] = fileName
         track['dataType'] = dataType
+        track.update(metaData)
+        if 'label' not in metaData.keys():
+            track['label'] = fileName
         self.tracks.append(track)
-
-
-
-'''
-    def checkGff3(self, dataFile, dataType):
-        with open(dataFile, 'r') as f:
-            for line in f:
-                if not line.startswith('#'):
-                    seq_type = line.rstrip().split('\t')[2]
-                    if seq_type == 'transcript':
-                        return 'gff3-transcript'
-                    if seq_type == 'mRNA':
-                        return 'gff3'
-'''
\ No newline at end of file
--- a/utils.py	Wed Mar 15 11:46:38 2017 -0400
+++ b/utils.py	Fri Mar 17 12:28:32 2017 -0400
@@ -1,21 +1,21 @@
 #!/usr/bin/env python

-'''
+"""
 This file include common used functions for converting file format to gff3
-'''
+"""
 from collections import OrderedDict
 import json
 import subprocess
 import os
 import tempfile
-
+import string

 def write_features(field, attribute, gff3):
-    '''
+    """
     The function write the features to gff3 format (defined in https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md)
     field, attribute are ordered dictionary
     gff3 is the file handler
-    '''
+    """
     attr = []
     for v in field.values():
         gff3.write(str(v) + '\t')
@@ -47,10 +47,10 @@
     return chrom_sizes

 def sequence_region(chrom_sizes):
-    '''
+    """
     This function read from a chromatin size file generated by twoBitInfo and write the information to dict
     return a dict
-    '''
+    """
     f = open(chrom_sizes, 'r')
     sizes = f.readlines()
     sizes_dict = {}
@@ -76,11 +76,11 @@
         num = num + 1

 def add_tracks_to_json(trackList_json, new_tracks, modify_type):
-    '''
+    """
     Add to track configuration (trackList.json)
     # modify_type =  'add_tracks': add a new track like bam or bigwig, new_track = dict()
     # modify_type = 'add_attr': add configuration to the existing track, new_track = dict(track_name: dict())
-    '''
+    """
     with open(trackList_json, 'r+') as f:
         data = json.load(f)
         if modify_type == 'add_tracks':
@@ -98,9 +98,9 @@
         f.close()

 def gtfToGff3(gtf_file, gff3_file, chrom_sizes):
-    '''
+    """
     Covert gtf file output from StringTie to gff3 format
-    '''
+    """
     gff3 = open(gff3_file, 'w')
     gff3.write("##gff-version 3\n")
     sizes_dict = sequence_region(chrom_sizes)
@@ -144,12 +144,15 @@
             write_features(field, attribute, gff3)
     gff3.close()

-def sanitize_name_path(input_path):
-    '''
+
+def sanitize_name(input_name):
+    """
     Galaxy will name all the files and dirs as *.dat,
-    the function is simply replacing '.' to '_' for the dirs
-    '''
-    return input_path.replace('.', '_')
+    the function replaces every character other than letters, digits, '_' and '-' with '_' and prepends "gonramp_"
+    """
+    validChars = "_-%s%s" % (string.ascii_letters, string.digits)
+    sanitized_name = ''.join([c if c in validChars else '_' for c in input_name])
+    return "gonramp_" + sanitized_name

 def createBamIndex(bamfile):
     p = subprocess.Popen(['samtools', 'index', bamfile])