Mercurial > repos > yating-l > jbrowse_hub
changeset 31:d8049deb0c97 draft
planemo upload for repository https://github.com/Yating-L/jbrowse_hub commit faeedda55e23f1197bc454d3db2d52af29d786e8-dirty
| author | yating-l | 
|---|---|
| date | Fri, 17 Mar 2017 12:28:32 -0400 | 
| parents | daf6a1122200 | 
| children | f45a253f7c6a | 
| files | TrackHub.py blastxmlToGff3.py jbrowse_hub.py jbrowse_hub.xml tool_dependencies.xml trackObject.py utils.py | 
| diffstat | 7 files changed, 159 insertions(+), 94 deletions(-) [+] | 
line wrap: on
 line diff
--- a/TrackHub.py Wed Mar 15 11:46:38 2017 -0400 +++ b/TrackHub.py Fri Mar 17 12:28:32 2017 -0400 @@ -1,14 +1,10 @@ #!/usr/bin/env python import os -import trackObject +import subprocess +import shutil import utils -import subprocess -import string -import shutil -import tempfile -#TODO: package JBrowse file conversion .pl files class TrackHub: def __init__(self, inputFiles, reference, outputDirect, tool_dir, genome, extra_files_path): @@ -21,10 +17,6 @@ self.raw = os.path.join(self.out_path, 'raw') self.json = os.path.join(self.out_path, 'json') try: - if not self.out_path: - raise ValueError('empty output path\n') - if not os.path.exists(self.out_path): - raise ValueError('the output folder has not been created') if os.path.exists(self.json): shutil.rmtree(self.json) os.makedirs(self.json) @@ -39,7 +31,6 @@ self.addTrack(input_file) self.indexName() self.makeArchive() - #shutil.rmtree(self.out_path) self.outHtml() print "Success!\n" @@ -59,7 +50,7 @@ bam_track = dict() bam_track['type'] = 'JBrowse/View/Track/Alignments2' bam_track['storeClass'] = 'JBrowse/Store/SeqFeature/BAM' - bam_track['label'] = track['fileName'] + bam_track['label'] = track['label'] bam_track['urlTemplate'] = os.path.join('../raw', track['fileName']) bam_track['baiUrlTemplate'] = os.path.join('../raw', track['index']) utils.add_tracks_to_json(json_file, bam_track, 'add_tracks') @@ -68,14 +59,14 @@ self.createTrackList() json_file = os.path.join(self.json, "trackList.json") bigwig_track = dict() - bigwig_track['label'] = track['fileName'] + bigwig_track['label'] = track['label'] bigwig_track['urlTemplate'] = os.path.join('../raw', track['fileName']) bigwig_track['type'] = 'JBrowse/View/Track/Wiggle/XYPlot' bigwig_track['storeClass'] = 'JBrowse/Store/SeqFeature/BigWig' utils.add_tracks_to_json(json_file, bigwig_track, 'add_tracks') else: gff3_file = os.path.join(self.raw, track['fileName']) - label = track['fileName'] + label = track['label'] if track['dataType'] == 'bedSpliceJunctions' or track['dataType'] == 'gtf': p = subprocess.Popen(['flatfile-to-json.pl', '--gff', gff3_file, '--trackType', 'CanvasFeatures', '--trackLabel', label, '--config', '{"glyph": "JBrowse/View/FeatureGlyph/Segments"}', '--out', self.json]) elif track['dataType'] == 'gff3_transcript': @@ -98,7 +89,6 @@ #TODO: this will list all zip files in the filedir and sub-dirs. worked in Galaxy but all list zip files in test-data when #run it locally. May need modify def outHtml(self): - #htmloutput = tempfile.NamedTemporaryFile(self.outfile, suffix = '.html', bufsize=0, delete=False) with open(self.outfile, 'w') as htmlfile: htmlstr = 'The JBrowse Hub is created: <br>' zipfiles = '<li><a href = "%s">Download</a></li>' @@ -112,15 +102,13 @@ relative_file_path = os.path.join(relative_directory, file) htmlstr += zipfiles % relative_file_path - #htmlstr = htmlstr % zipfile htmlfile.write(htmlstr) def createTrackList(self): trackList = os.path.join(self.json, "trackList.json") if not os.path.exists(trackList): os.mknod(trackList) - #open(trackList,'w').close() - +
--- a/blastxmlToGff3.py Wed Mar 15 11:46:38 2017 -0400 +++ b/blastxmlToGff3.py Fri Mar 17 12:28:32 2017 -0400 @@ -7,14 +7,14 @@ def align2cigar(hsp_query, hsp_reference): - ''' + """ Build CIGAR representation from an hsp_query input: hsp_query hsp_sbjct output: CIGAR string - ''' + """ query = hsp_query ref = hsp_reference # preType, curType: @@ -98,6 +98,8 @@ attribute['ID'] = field['seqid'] + '_' + str(field['start']) + '_' + str(field['end']) + '_' + query_name + '_' + str(target_start) + '_' + str(target_end) attribute['Target'] = query_name + " " + str(target_start) + " " + str(target_end) attribute['Gap'] = align2cigar(query, ref) + #store the query sequence in the file in order to display alignment with BlastAlignment plugin + attribute['query'] = hsp.query # show reading frame attribute only if the frame is not (0, 0) if hsp.frame[0] != 0 or hsp.frame[1] != 0: attribute['reading_frame'] = str(hsp.frame[0]) + ", " + str(hsp.frame[1])
--- a/jbrowse_hub.py Wed Mar 15 11:46:38 2017 -0400 +++ b/jbrowse_hub.py Fri Mar 17 12:28:32 2017 -0400 @@ -1,22 +1,19 @@ #!/usr/bin/env python -import os import sys import argparse -import subprocess -from bedToGff3 import bedToGff3 -import blastxmlToGff3 +import json import utils -import tempfile import trackObject import TrackHub -import shutil + + def main(argv): parser = argparse.ArgumentParser(description='Create a hub to display in jbrowse.') # Reference genome mandatory - parser.add_argument('-f', '--fasta', help='Fasta file of the reference genome') + parser.add_argument('-f', '--fasta', help='Fasta file of the reference genome (Required)') # Genome name parser.add_argument('-g', '--genome_name', help='Name of reference genome') @@ -25,7 +22,10 @@ parser.add_argument('-o', '--out', help='output html') # Output folder - parser.add_argument('-e', '--extra_files_path', help="Directory of JBrowse Hub folder") + parser.add_argument('-e', '--extra_files_path', help='Directory of JBrowse Hub folder') + + #Tool Directory + parser.add_argument('-d', '--tool_directory', help='The directory of JBrowse file convertion scripts and UCSC tools') # GFF3 structure: gene->transcription->CDS parser.add_argument('--gff3_transcript', action='append', help='GFF3 format, structure: gene->transcription->CDS') @@ -51,25 +51,45 @@ # GTF format parser.add_argument('--gtf', action='append', help='GTF format from StringTie') + # Metadata json format + parser.add_argument('-j', '--data_json', help='Json containing the metadata of the inputs') + args = parser.parse_args() all_datatype_dictionary = dict() + if not args.fasta: + parser.print_help() + raise RuntimeError("No reference genome\n") reference = args.fasta genome = 'unknown' - out_path = '.' + out_path = 'unknown.html' extra_files_path = '.' + tool_directory = '.' if args.genome_name: - genome = utils.sanitize_name_path(args.genome_name) + genome = utils.sanitize_name(args.genome_name) if args.out: out_path = args.out if args.extra_files_path: - extra_files_path = utils.sanitize_name_path(args.extra_files_path) - cwd = os.getcwd() + extra_files_path = utils.sanitize_name(args.extra_files_path) + #tool_directory not work for Galaxy tool, all tools need to exist in the current PATH, deal with it with tool dependencies - tool_directory = os.path.join(cwd, 'JBrowse-1.12.1/bin') + if args.tool_directory: + tool_directory = args.tool_directory + + #Calculate chromsome sizes using genome reference and uscs tools chrom_size = utils.getChromSizes(reference, tool_directory) - all_tracks = trackObject.trackObject(chrom_size.name, genome, extra_files_path) #store converted files in the array: all_tracks.tracks + + #get metadata from json file + json_inputs_data = args.data_json + if json_inputs_data: + inputs_data = json.loads(json_inputs_data) + else: + inputs_data = {} + + #Initate trackObject + all_tracks = trackObject.trackObject(chrom_size.name, genome, extra_files_path) + array_inputs_bam = args.bam array_inputs_bed_simple_repeats = args.bedSimpleRepeats array_inputs_bed_splice_junctions = args.bedSpliceJunctions @@ -78,6 +98,7 @@ array_inputs_gff3_mrna = args.gff3_mrna array_inputs_gtf = args.gtf array_inputs_blastxml = args.blastxml + if array_inputs_bam: all_datatype_dictionary['bam'] = array_inputs_bam if array_inputs_bed_simple_repeats: @@ -95,7 +116,7 @@ if array_inputs_blastxml: all_datatype_dictionary['blastxml'] = array_inputs_blastxml - print all_datatype_dictionary + print "input tracks: \n", all_datatype_dictionary for datatype, inputfiles in all_datatype_dictionary.items(): try: @@ -105,11 +126,23 @@ print 'Cannot open', datatype else: for f in inputfiles: - all_tracks.addToRaw(f, datatype) + metadata = {} + if f in inputs_data.keys(): + metadata = inputs_data[f] + #Convert tracks into gff3 format + all_tracks.addToRaw(f, datatype, metadata) jbrowseHub = TrackHub.TrackHub(all_tracks, reference, out_path, tool_directory, genome, extra_files_path) jbrowseHub.createHub() - + +""" +def extractMetadata(array_inputs, inputs_data): + metadata_dict = {} + for input_false_path in array_inputs: + for key, data_value in inputs_data.items(): + if key == input_false_path: + metadata_dict[input_false_path] +""" if __name__ == "__main__": main(sys.argv)
--- a/jbrowse_hub.xml Wed Mar 15 11:46:38 2017 -0400 +++ b/jbrowse_hub.xml Fri Mar 17 12:28:32 2017 -0400 @@ -17,36 +17,70 @@ <command detect_errors="exit_code"><![CDATA[ python $__tool_directory__/jbrowse_hub.py --fasta '$reference' + --genome_name '$genome_name' + + ## json metadata recording from Remi's hub-archive-creator.xml + #import json + #set global data_parameter_dict = {} + + ## Function to retrieve the data of the inputs + #def prepare_json($input_to_prepare, $extra_data_dict={}) + #set false_path = str($input_to_prepare) + #set name = $input_to_prepare.name + + #set data_dict = {"name": $name} + #silent data_dict.update($extra_data_dict) + + #silent $data_parameter_dict.update({$false_path: $data_dict}) + + #end def + + #for $f in $format + #set track_label = $f.formatChoice.label + #set extra_data_dict = {'label' : $track_label} #if $f.formatChoice.format_select == 'bed' #if $f.formatChoice.bedChoice.bed_select == 'bed_simple_repeats_option' --bedSimpleRepeats $f.formatChoice.bedChoice.BED_simple_repeats + #silent $prepare_json($f.formatChoice.bedChoice.BED_simple_repeats, extra_data_dict) #elif $f.formatChoice.bedChoice.bed_select == 'bed_splice_junctions_option' --bedSpliceJunctions $f.formatChoice.bedChoice.BED_splice_junctions + #silent $prepare_json($f.formatChoice.bedChoice.BED_splice_junctions, extra_data_dict) #end if #end if #if $f.formatChoice.format_select == 'bam' --bam $f.formatChoice.BAM + #silent $prepare_json($f.formatChoice.BAM, extra_data_dict) #end if - #if $f.formatChoice.format_select == 'gff3_transcript' - --gff3_transcript $f.formatChoice.GFF3_transcript - #end if - #if $f.formatChoice.format_select == 'gff3_mrna' - --gff3_mrna $f.formatChoice.GFF3_mrna + #if $f.formatChoice.format_select == 'gff3' + #if $f.formatChoice.gff3Choice.gff3_select == 'gff3_transcript' + --gff3_transcript $f.formatChoice.gff3Choice.GFF3_transcript + #silent $prepare_json($f.formatChoice.gff3Choice.GFF3_transcript, extra_data_dict) + #end if + #elif $f.formatChoice.gff3Choice.gff3_select == 'gff3_mrna' + --gff3_mrna $f.formatChoice.gff3Choice.GFF3_mrna + #silent $prepare_json($f.formatChoice.gff3Choice.GFF3_mrna, extra_data_dict) + #end if #end if #if $f.formatChoice.format_select == 'blastxml' --blastxml $f.formatChoice.BlastXML + #silent $prepare_json($f.formatChoice.BlastXML, extra_data_dict) #end if #if $f.formatChoice.format_select == 'gtf' --gtf $f.formatChoice.GTF + #silent $prepare_json($f.formatChoice.GTF, extra_data_dict) #end if #if $f.formatChoice.format_select == 'bigwig' --bigwig $f.formatChoice.BIGWIG + #silent $prepare_json($f.formatChoice.BIGWIG, extra_data_dict) #end if #end for - --genome_name '$genome_name' + + #set all_data_json = json.dumps($data_parameter_dict) + -j '$all_data_json' -e '$output.extra_files_path' -o '$output' + ]]></command> <inputs> @@ -71,6 +105,7 @@ type="data" label="BAM File" /> + <param name="label" type="text" size="30" value="${on_string}" label="Track name" /> </when> <when value="bed"> <conditional name="bedChoice"> @@ -95,6 +130,7 @@ /> </when> </conditional> + <param name="label" type="text" size="30" value="${on_string}" label="Track name" /> </when> <when value="blastxml"> <param @@ -103,6 +139,7 @@ type="data" label="Blast Alignments File" /> + <param name="label" type="text" size="30" value="${on_string}" label="Track name" /> </when> <when value="bigwig"> <param @@ -111,22 +148,32 @@ type="data" label="BIGWIG File" /> + <param name="label" type="text" size="30" value="${on_string}" label="Track name" /> </when> - <when value="gff3_transcript"> - <param - format="gff3" - name="GFF3_transcript" - type="data" - label="GFF3 File" - /> - </when> - <when value="gff3_mrna"> - <param - format="gff3" - name="GFF3_mrna" - type="data" - label="GFF3 File" - /> + <when value="gff3"> + <conditional name="gff3Choice"> + <param name="gff3_select" type="select" label="gff3 type"> + <option value="gff3_transcript">GFF3 format, structure: gene->transcription->CDS</option> + <option value="gff3_mrna">GFF3 format, structure: gene->mRNA->CDS</option> + </param> + <when value="gff3_transcript"> + <param + format="gff3" + name="GFF3_transcript" + type="data" + label="GFF3 File" + /> + </when> + <when value="gff3_mrna"> + <param + format="gff3" + name="GFF3_mrna" + type="data" + label="GFF3 File" + /> + </when> + </conditional> + <param name="label" type="text" size="30" value="${on_string}" label="Track name" /> </when> <when value="gtf"> <param @@ -135,6 +182,7 @@ type="data" label="GTF File" /> + <param name="label" type="text" size="30" value="${on_string}" label="Track name" /> </when> </conditional> </repeat>
--- a/tool_dependencies.xml Wed Mar 15 11:46:38 2017 -0400 +++ b/tool_dependencies.xml Fri Mar 17 12:28:32 2017 -0400 @@ -8,6 +8,9 @@ </package> <package name="biopython" version="1.68"> <readme> +This package is based on package_biopython_1_67 owned by biopython. +https://toolshed.g2.bx.psu.edu/repository?user_id=fd5c6d0f82f315d8 + This Galaxy Tool Shed package installs Biopython from source, having first installed NumPy which is a build time depencency. This requires and assumes a standard C compiler is already installed, along with
--- a/trackObject.py Wed Mar 15 11:46:38 2017 -0400 +++ b/trackObject.py Fri Mar 17 12:28:32 2017 -0400 @@ -5,15 +5,14 @@ import utils import bedToGff3 import blastxmlToGff3 -import tempfile -import subprocess + class trackObject: def __init__(self, chrom_size, genome, extra_files_path): self.chrom_size = chrom_size outputDirect = os.path.join(extra_files_path, genome) self.raw_folder = os.path.join(outputDirect, 'raw') - print self.raw_folder + #Store metadata of the tracks self.tracks = [] try: if os.path.exists(self.raw_folder): @@ -25,11 +24,11 @@ except OSError as oserror: print "Cannot create raw folder error({0}): {1}".format(oserror.errno, oserror.strerror) - def addToRaw(self, dataFile, dataType): - ''' + def addToRaw(self, dataFile, dataType, metaData): + """ Convert gff3, BED, blastxml and gtf files into gff3 files and store converted files in folder 'raw' - ''' + """ fileName = os.path.basename(dataFile) des_path = os.path.join(self.raw_folder, fileName) @@ -61,18 +60,7 @@ utils.gtfToGff3(dataFile, des_path, self.chrom_size) track['fileName'] = fileName track['dataType'] = dataType + track.update(metaData) + if 'label' not in metaData.keys(): + track['label'] = fileName self.tracks.append(track) - - - -''' - def checkGff3(self, dataFile, dataType): - with open(dataFile, 'r') as f: - for line in f: - if not line.startswith('#'): - seq_type = line.rstrip().split('\t')[2] - if seq_type == 'transcript': - return 'gff3-transcript' - if seq_type == 'mRNA': - return 'gff3' -''' \ No newline at end of file
--- a/utils.py Wed Mar 15 11:46:38 2017 -0400 +++ b/utils.py Fri Mar 17 12:28:32 2017 -0400 @@ -1,21 +1,21 @@ #!/usr/bin/env python -''' +""" This file include common used functions for converting file format to gff3 -''' +""" from collections import OrderedDict import json import subprocess import os import tempfile - +import string def write_features(field, attribute, gff3): - ''' + """ The function write the features to gff3 format (defined in https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md) field, attribute are ordered dictionary gff3 is the file handler - ''' + """ attr = [] for v in field.values(): gff3.write(str(v) + '\t') @@ -47,10 +47,10 @@ return chrom_sizes def sequence_region(chrom_sizes): - ''' + """ This function read from a chromatin size file generated by twoBitInfo and write the information to dict return a dict - ''' + """ f = open(chrom_sizes, 'r') sizes = f.readlines() sizes_dict = {} @@ -76,11 +76,11 @@ num = num + 1 def add_tracks_to_json(trackList_json, new_tracks, modify_type): - ''' + """ Add to track configuration (trackList.json) # modify_type = 'add_tracks': add a new track like bam or bigwig, new_track = dict() # modify_type = 'add_attr': add configuration to the existing track, new_track = dict(track_name: dict()) - ''' + """ with open(trackList_json, 'r+') as f: data = json.load(f) if modify_type == 'add_tracks': @@ -98,9 +98,9 @@ f.close() def gtfToGff3(gtf_file, gff3_file, chrom_sizes): - ''' + """ Covert gtf file output from StringTie to gff3 format - ''' + """ gff3 = open(gff3_file, 'w') gff3.write("##gff-version 3\n") sizes_dict = sequence_region(chrom_sizes) @@ -144,12 +144,15 @@ write_features(field, attribute, gff3) gff3.close() -def sanitize_name_path(input_path): - ''' + +def sanitize_name(input_name): + """ Galaxy will name all the files and dirs as *.dat, - the function is simply replacing '.' to '_' for the dirs - ''' - return input_path.replace('.', '_') + the function can replace '.' to '_' for the dirs + """ + validChars = "_-%s%s" % (string.ascii_letters, string.digits) + sanitized_name = ''.join([c if c in validChars else '_' for c in input_name]) + return "gonramp_" + sanitized_name def createBamIndex(bamfile): p = subprocess.Popen(['samtools', 'index', bamfile])
