# HG changeset patch # User bgruening # Date 1402587321 14400 # Node ID 95a776023fbc4ae3a815ab53d7c7184c3af7ce11 Uploaded diff -r 000000000000 -r 95a776023fbc eden_cluster_motif_finder.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/eden_cluster_motif_finder.py Thu Jun 12 11:35:21 2014 -0400 @@ -0,0 +1,50 @@ +#!/usr/bin/env python +import argparse +import sys +import os +import subprocess +from Bio import SeqIO +import random +import tempfile +import shlex +import shutil +import logging +from eden_wrapper import EDeNWrapper +import eden_iterative_motif_finder +from eden_utilities import log +from eden_utilities import create_params +import eden_cluster_splitter + +def main(args): + #run cluster split + eden_cluster_splitter.main(args) + #work in the output dir + print 'start motif finder', args.output_dir_path + motif_set = set() + for root, dirs, files in os.walk( args.output_dir_path ): + for filename in files: + if filename.startswith('cluster_'): + filepath = os.path.join(root, filename) + args.fasta_file = filepath + out_filepath = os.path.join(root, '%s.motif' % filename) + args.output_file_path = open(out_filepath, 'w+') + eden_iterative_motif_finder.main(args) + args.output_file_path.close() + for line in open(out_filepath): + motif_set.add(line.split()[0]) + + with open(args.fasta_file, 'r') as f: + for record in SeqIO.parse(f, 'fasta'): + for motif_id, motif in enumerate(motif_set): + seq = eden_iterative_motif_finder.NormaliseSequenceToDNA(record.seq) + motif_count = seq.count(motif) + if motif_count > 0: + res = '%s\t%s\t%s\t%s\n' % (record.id, motif_id, motif, motif_count) + print res, + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Extract motifs patterns with EDeN.') + parser = create_params(parser, 'eden_cluster_motif') + args = parser.parse_args() + main(args) diff -r 000000000000 -r 95a776023fbc eden_cluster_splitter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/eden_cluster_splitter.py Thu Jun 12 11:35:21 2014 -0400 @@ -0,0 +1,64 @@ +#!/usr/bin/env python +import argparse +import sys +import os +import subprocess +from Bio import SeqIO +import random +import tempfile +import shlex +import shutil +import logging +from eden_wrapper import EDeNWrapper +import eden_iterative_motif_finder +from eden_utilities import log +from eden_utilities import create_params + +def main(args): + #setup directory for output + if not os.path.exists(args.output_dir_path) : + os.mkdir(args.output_dir_path) + else: + sys.exit("Output directory %s already exists. Bailing out." % args.output_dir_path) + + #make seq file + tmp_output_dir=tempfile.mkdtemp() + seq_pos_file_path=eden_iterative_motif_finder.MakePositiveSequenceFile(args.fasta_file, tmp_output_dir) + + #cluster sequences + param = vars(args) + param.update({'dat_file_path':seq_pos_file_path,'output_dir':tmp_output_dir}) + EDeN=EDeNWrapper(param) + EDeN.Cluster() + + #build the inverse index seqid->clusterid + #and create a dict of file_handles + files_handle_list=dict() + map_seqid2clusterid = dict() + with open(os.path.join( tmp_output_dir, 'cluster' )) as f: + for clusterid, line in enumerate(f): + str_vec = line.strip().split() + int_vec = map(int, str_vec) + for seqid in int_vec: + map_seqid2clusterid[seqid]=clusterid + filename=os.path.join( args.output_dir_path, 'cluster_%s.fa' % clusterid ) + files_handle_list[clusterid]=open(filename, 'w+') + + #write each fasta sequence in the corresponding cluster files + log.info( "Writing %s cluster files in directory %s " % (clusterid,args.output_dir_path) ) + with open(args.fasta_file, 'r') as f: + for seqid, record in enumerate(SeqIO.parse(f, 'fasta')): + if seqid in map_seqid2clusterid: #note: not all sequences have a cluster + clusterid=map_seqid2clusterid[seqid]; + SeqIO.write(record, files_handle_list[clusterid], "fasta") + + #cleanup + shutil.rmtree(tmp_output_dir) + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Extract motifs patterns with EDeN.') + parser = create_params(parser, 'eden_cluster_splitter') + args = parser.parse_args() + main(args) diff -r 000000000000 -r 95a776023fbc eden_iterative_motif_finder.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/eden_iterative_motif_finder.py Thu Jun 12 11:35:21 2014 -0400 @@ -0,0 +1,215 @@ +#!/usr/bin/env python +import logging +import argparse +import sys +import os +import subprocess +from Bio import SeqIO +import random +import tempfile +import shlex +import shutil +import logging +from eden_wrapper import EDeNWrapper +from eden_utilities import log +from eden_utilities import create_params + + +SEQ_POS_FILE="pos.seq" +SEQ_NEG_FILE="neg.seq" +SEQ_FILE="dat.seq" +TARGET_FILE="dat.target" +SCORE_FILE="pos.score" + +def SolveMaximumSubarrayProblem(vec): + begin_temp = begin = end = 0 + start_val=vec[0] + max_ending_here = max_so_far = start_val + for pos,x in enumerate(vec[1:],1): + if max_ending_here < 0: + max_ending_here=x + begin_temp=pos + else: + max_ending_here=max_ending_here+x + if max_ending_here > max_so_far: + max_so_far=max_ending_here + begin=begin_temp + end=pos + return begin,end + +def ComputeIteratedMaximumSubarray(seq_list, score_list, min_motif_size): + motif_list=[] + assert len(seq_list) == len(score_list) + for seq,score in zip(seq_list, score_list): + while 1: + #find (begin,end) of motif in each element + begin,end=SolveMaximumSubarrayProblem(score) + if end-begin < min_motif_size: + break + else: + #extract subsequence + motif=seq[begin:end+1] + #print motif, begin, end, seq, score + motif_item=[motif, begin, end, seq] + motif_list.append(motif_item) + #remove motif by zeoring importance values + motif_size=end+1-begin + assert motif_size == len(motif) + score[begin:end+1]=[0.0]*motif_size + #iterate + return motif_list + +def ComputeAndOutputIteratedMaximumSubarray(seq_file, score_file, min_subarray_size, opath): + """ + Description and unit test + """ + #import the sequence file + seq_list=[] + with open(seq_file, 'r') as f: + for line in f: + seq_list.append(line) + #import the score file + score_list=[] + with open(score_file, 'r') as f: + for line in f: + str_vec=line.split() + int_vec = map(float, str_vec) + score_list.append(int_vec) + + #extract for each sequence the motif + motif_list=ComputeIteratedMaximumSubarray(seq_list, score_list, min_subarray_size) + + #output to file + for motif, begin, end, seq in motif_list: + opath.write('{}\t{}\t{}\t{}'.format( motif, begin, end, seq)) + +def IsRNA(seq): + if 'U' in seq.upper(): + return True + else: + return False + +def ShuffleSequence(seq, order =1): + nucleotides = [str(seq)[nucl:nucl+order] for nucl in range(0,len(seq)-order+1,order)] + if order != 1: + start = len(seq)%order + if start != 0 : + nucleotides.append(str(seq)[-start:]) + random.shuffle(nucleotides) + shuffled_seq=''.join(nucleotides) + return shuffled_seq + +def NormaliseSequenceToDNA(seq): + dna=seq + if IsRNA(seq): + dna=seq.back_transcribe() + normdna=dna.upper() + return normdna + +def MakePositiveSequenceFile(fasta_file, output_dir): + seq_pos_file_path = os.path.join(output_dir, SEQ_POS_FILE) + seq_pos_handle=open(seq_pos_file_path, 'w+') + pos_counter=0 + with open(fasta_file, 'r') as f: + for record in SeqIO.parse(f, 'fasta'): + dna=NormaliseSequenceToDNA(record.seq) + seq_pos_handle.write("%s\n" % dna) + pos_counter+=1 + seq_pos_handle.close() + log.info( "Created %s positive sequences in file: %s" % (pos_counter, seq_pos_file_path) ) + return seq_pos_file_path + +def MakeNegativeSequenceFile(sequence_file, output_dir, negative_repeat_factor, max_shuffle_order): + seq_neg_file_path = os.path.join(output_dir, SEQ_NEG_FILE) + seq_neg_handle=open(seq_neg_file_path, 'w+') + neg_counter=0 + with open(sequence_file, 'r') as file_handle: + for seq in file_handle: + seq = seq.strip() + for rep in range(negative_repeat_factor): + for order in range(1,max_shuffle_order): + sdna=ShuffleSequence(seq,order) + seq_neg_handle.write("%s\n" % sdna) + neg_counter+=1 + seq_neg_handle.close() + log.info( "Created %s negative sequences in file: %s" % (neg_counter, seq_neg_file_path) ) + return seq_neg_file_path + +def MakeDataFile(fasta_file, negative_repeat_factor, max_shuffle_order): + output_dir=tempfile.mkdtemp() + + seq_pos_file_path=MakePositiveSequenceFile(fasta_file, output_dir) + seq_neg_file_path=MakeNegativeSequenceFile(seq_pos_file_path, output_dir, negative_repeat_factor, max_shuffle_order) + + dat_file_path = os.path.join(output_dir, SEQ_FILE) + target_file_path = os.path.join(output_dir, TARGET_FILE) + + merged_seq_file_path = open(dat_file_path, 'wb') + shutil.copyfileobj(open(seq_pos_file_path,'rb'), merged_seq_file_path) + shutil.copyfileobj(open(seq_neg_file_path,'rb'), merged_seq_file_path) + merged_seq_file_path.close() + + with open(target_file_path, 'w+') as target_handle: + for line in open(seq_pos_file_path): + target_handle.write('1\n') + for line in open(seq_neg_file_path): + target_handle.write('-1\n') + + return {'output_dir': output_dir, + 'seq_pos_file_path': seq_pos_file_path, + 'seq_neg_file_path': seq_neg_file_path, + 'dat_file_path':dat_file_path, + 'target_file_path':target_file_path} + +def MakeScoreFile(output_dir): + score_file_path = os.path.join(output_dir, SCORE_FILE) + score_file = open( score_file_path, 'w+') + counter_old = -1 + values = list() + for line in open(os.path.join( output_dir, 'prediction_part' )): + counter, vertex_id, value = line.strip().split() + counter = int(counter) + if counter != counter_old : + if values : + score_file.write(' '.join(values) + '\n') + values = list() + values.append(value) + counter_old = counter + if values : + score_file.write(' '.join(values) + '\n') + score_file.close() + log.info( "Score file in: %s" % score_file_path ) + return score_file_path + +def main(args): + #create data file, i.e. positive normalised sequences and negative shuffled sequences + param_dict = MakeDataFile(args.fasta_file, + args.neg_rep, + args.max_order) + param_dict.update({'radius':args.radius}) + param_dict.update({'distance':args.distance}) + #train a model + eden=EDeNWrapper(param_dict) + eden.Train() + #test + eden.param.update({'dat_file_path':param_dict['seq_pos_file_path']}) + eden.TestPart() + + #extract scores + score_file_path=MakeScoreFile(param_dict['output_dir']) + + ComputeAndOutputIteratedMaximumSubarray( param_dict['seq_pos_file_path'], + score_file_path, + args.min_motif_size, + args.output_file_path ) + #cleanup + shutil.rmtree(param_dict['output_dir']) + + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Extract motifs patterns with EDeN.') + parser = create_params(parser, 'eden_iterative_motif_finder') + args = parser.parse_args() + main(args) diff -r 000000000000 -r 95a776023fbc eden_utilities.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/eden_utilities.py Thu Jun 12 11:35:21 2014 -0400 @@ -0,0 +1,86 @@ +#!/usr/bin/env python + +__author__ = "Fabrizio Costa, Bjoern Gruening" +__copyright__ = "Copyright 2014, Fabrizio Costa" +__credits__ = ["Fabrizio Costa", "Bjoern Gruening"] +__license__ = "GPL" +__version__ = "1.0.1" +__maintainer__ = "Fabrizio Costa" +__email__ = "costa@informatik.uni-freiburg.de" +__status__ = "Production" + +import logging +import sys +import argparse + +log = logging.getLogger() +log.setLevel(logging.DEBUG) + +std = logging.StreamHandler(sys.stdout) +std.setLevel(logging.DEBUG) +formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') +std.setFormatter(formatter) +log.addHandler(std) + +def create_params(parser, mode): + #common parameters + parser.add_argument('--input-file', '-i', + dest="fasta_file", + type=str, + help='fasta file name with sequences.', + required=True) + parser.add_argument('--radius', '-r', + dest="radius", + type=int, + help='Size of the largest radius used in EDeN.', + default=2) + parser.add_argument('--distance', '-d', + dest="distance", + type=int, + help='Size of the largest distance used in EDeN.', + default=5) + parser.add_argument('--version', action="version", version=__version__) + + if mode == 'eden_cluster_splitter' or mode == 'eden_cluster_motif' : + parser.add_argument("-o", "--output-dir", + dest="output_dir_path", + help="Path to output directory.") + parser.add_argument('-n', '--num-nearest-neighbors', + dest="num_nearest_neighbors", + type=int, + help='Number of nearest neighbors used in the density estimation.', + default=10) + parser.add_argument('-z', '--neighborhood-intersection-size', + dest="neighborhood_intersection_size", + type=int, + help='Number of nearest neighbors that must be shared in neighborhood in order to trigger neighborhood joining.', + default=5) + parser.add_argument('-c','--fraction-center-scan', + dest="fraction_center_scan", + type=int, + help='Fraction of total number of instances which is used to compute densities.', + default=1.0) + + if mode == 'eden_iterative_motif_finder' or mode == 'eden_cluster_motif' : + parser.add_argument("-f", "--output-file", + dest="output_file_path", + type=argparse.FileType('w'), + default=sys.stdout, + help="Path to output file.") + parser.add_argument('-k', '--max-order', + dest="max_order", + type=int, + help='Size of the order for negative shuffling.', + default=3) + parser.add_argument('-s', '--min-motif-size', + dest="min_motif_size", + type=int, + help='Size of the smallest returned motif.', + default=4) + parser.add_argument('-e', '--num-negative-repetitions', + dest="neg_rep", + type=int, + help='Number of negative instances per order per positive instance.', + default=3) + + return parser \ No newline at end of file diff -r 000000000000 -r 95a776023fbc eden_wrapper.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/eden_wrapper.py Thu Jun 12 11:35:21 2014 -0400 @@ -0,0 +1,75 @@ +#!/usr/bin/env python +import argparse +import sys +import os +import subprocess +import random +import tempfile +import shlex +import shutil +from eden_utilities import log + +class EDeNWrapper: + + def __init__(self, args): + self._param={} + #default values + self._param['path_to_EDeN']="EDeN" + self._param['model_file_path']="model" + self._param['radius']=2 + self._param['distance']=5 + self._param['output_dir']=tempfile.mkdtemp() + self._param['kernel_type']="STRING" + self._param['file_type']="STRINGSEQ" + self._param['cluster_type']="DENSE_CONNECTED_CENTERS" + self._param['sample_size']=1000 + self._param['num_nearest_neighbors']=10 + self._param['neighborhood_intersection_size']=int(self._param['num_nearest_neighbors']/2) + self._param['fraction_center_scan']=1.0 + #update + self._param.update(args) + + @property + def param(self): + return self._param + + def Train(self): + assert self._param.has_key('dat_file_path') + assert self._param.has_key('target_file_path') + assert self._param.has_key('model_file_path') + + cmd = "%(path_to_EDeN)s -a TRAIN -i %(dat_file_path)s -t %(target_file_path)s --kernel_type %(kernel_type)s --file_type %(file_type)s --radius %(radius)s --distance %(distance)s -y %(output_dir)s -m %(model_file_path)s" + cmd = cmd % self._param + + log.info( "Executing call: %s" % cmd ) + subprocess.call( shlex.split(cmd) ) + + def TestPart(self): + assert self._param.has_key('dat_file_path') + assert self._param.has_key('model_file_path') + + cmd = "%(path_to_EDeN)s -a TEST_PART -i %(dat_file_path)s --kernel_type %(kernel_type)s --file_type %(file_type)s --radius %(radius)s --distance %(distance)s -y %(output_dir)s -m %(output_dir)s/%(model_file_path)s" + cmd = cmd % self._param + + log.info( "Executing call: %s" % cmd ) + subprocess.call( shlex.split(cmd) ) + + def Test(self): + assert self._param.has_key('dat_file_path') + assert self._param.has_key('model_file_path') + + cmd = "%(path_to_EDeN)s -a TEST -i %(dat_file_path)s --kernel_type %(kernel_type)s --file_type %(file_type)s --radius %(radius)s --distance %(distance)s -y %(output_dir)s -m %(output_dir)s/%(model_file_path)s" + cmd = cmd % self._param + + log.info( "Executing call: %s" % cmd ) + subprocess.call( shlex.split(cmd) ) + + def Cluster(self): + assert self._param.has_key('dat_file_path') + + cmd = "%(path_to_EDeN)s -a CLUSTER -i %(dat_file_path)s --kernel_type %(kernel_type)s --file_type %(file_type)s --radius %(radius)s --distance %(distance)s -y %(output_dir)s " + cmd = cmd + "--cluster_type %(cluster_type)s --sample_size %(sample_size)s --num_nearest_neighbors %(num_nearest_neighbors)s --neighborhood_intersection_size %(neighborhood_intersection_size)s --fraction_center_scan %(fraction_center_scan)s" + cmd = cmd % self._param + + log.info( "Executing call: %s" % cmd ) + subprocess.call( shlex.split(cmd) ) \ No newline at end of file diff -r 000000000000 -r 95a776023fbc motif_finder.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/motif_finder.xml Thu Jun 12 11:35:21 2014 -0400 @@ -0,0 +1,57 @@ + + based on EDeN + + biopython + eden + + + eden_cluster_motif_finder.py + + --input-file $infile + --radius $radius + --distance $distance + --output-dir ./ + --num-nearest-neighbors $num_nearest_neighbors + --neighborhood-intersection-size $neighborhood_intersection_size + --fraction-center-scan $fraction_center_scan + --output-file foo.txt + --max-order $max_order + --min-motif-size $min_motif_size + --num-negative-repetitions $num_negative_repetitions + > $output + + + + + + + + + + + + + + + + + + + +**What it does** + +EDeN Motif finder + +----- + +**Parameters** + + + +**References** + +Fabrizio Costa + + + diff -r 000000000000 -r 95a776023fbc readme.rst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/readme.rst Thu Jun 12 11:35:21 2014 -0400 @@ -0,0 +1,84 @@ +Galaxy wrapper for Augustus +=========================== + +This wrapper is copyright 2012-2013 by Björn Grüning. + +This is a wrapper for the command line tool of Augustus_. + +.. _augustus: http://bioinf.uni-greifswald.de/augustus/ + +AUGUSTUS is a program that predicts genes in eukaryotic genomic sequences. + +Oliver Keller, Martin Kollmar, Mario Stanke, Stephan Waack (2011) +A novel hybrid gene prediction method employing protein multiple sequence alignments +Bioinformatics, doi: 10.1093/bioinformatics/btr010 + +Mario Stanke, Mark Diekhans, Robert Baertsch, David Haussler (2008) +Using native and syntenically mapped cDNA alignments to improve de novo gene finding +Bioinformatics, doi: 10.1093/bioinformatics/btn013 + +Mario Stanke and Stephan Waack (2003) +Gene Prediction with a Hidden-Markov Model and a new Intron Submodel. +Bioinformatics, Vol. 19, Suppl. 2, pages ii215-ii225 + + +Installation +============ + +The recommended installation is by means of the toolshed_. +If you need to install it manually here is a short introduction. + +.. _toolshed: http://toolshed.g2.bx.psu.edu/view/bgruening/augustus + + +Install or downlaod augustus from:: + + http://bioinf.uni-greifswald.de/augustus/binaries/ + +and follow the installation instructions or copy the binaries into your $PATH. To install the wrapper copy the augustus folder in the galaxy tools folder and modify the tools_conf.xml file to make the tool available to Galaxy. + +For example:: + +
+ +
+ + +Set the *AUGUSTUS_CONFIG_PATH* to /path_to_augustus/augustus/config with:: + + export AUGUSTUS_CONFIG_PATH=/path_to_augustus/augustus/config + +or modify the wrapper and use the following additional commandline switch:: + + --AUGUSTUS_CONFIG_PATH=/path_to_augustus/augustus/config + + +History +======= + +- v0.1: Initial public release +- v0.2: Added tool_dependencies.xml file and update the augustus version (thanks to James Johnson) +- v0.3: upgrade to augustus 2.7, added new organisms and new parameters, output additional sequence files +- v0.3.1: added parallelism and changed the output parameters from boolean to a select box + +Licence (MIT) +============= + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + diff -r 000000000000 -r 95a776023fbc tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Thu Jun 12 11:35:21 2014 -0400 @@ -0,0 +1,9 @@ + + + + + + + + +