Mercurial > repos > bgruening > eden_cluster_motif_finder
diff eden_cluster_splitter.py @ 0:95a776023fbc draft
Uploaded
author | bgruening |
---|---|
date | Thu, 12 Jun 2014 11:35:21 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/eden_cluster_splitter.py Thu Jun 12 11:35:21 2014 -0400 @@ -0,0 +1,64 @@ +#!/usr/bin/env python +import argparse +import sys +import os +import subprocess +from Bio import SeqIO +import random +import tempfile +import shlex +import shutil +import logging +from eden_wrapper import EDeNWrapper +import eden_iterative_motif_finder +from eden_utilities import log +from eden_utilities import create_params + +def main(args): + #setup directory for output + if not os.path.exists(args.output_dir_path) : + os.mkdir(args.output_dir_path) + else: + sys.exit("Output directory %s already exists. Bailing out." % args.output_dir_path) + + #make seq file + tmp_output_dir=tempfile.mkdtemp() + seq_pos_file_path=eden_iterative_motif_finder.MakePositiveSequenceFile(args.fasta_file, tmp_output_dir) + + #cluster sequences + param = vars(args) + param.update({'dat_file_path':seq_pos_file_path,'output_dir':tmp_output_dir}) + EDeN=EDeNWrapper(param) + EDeN.Cluster() + + #build the inverse index seqid->clusterid + #and create a dict of file_handles + files_handle_list=dict() + map_seqid2clusterid = dict() + with open(os.path.join( tmp_output_dir, 'cluster' )) as f: + for clusterid, line in enumerate(f): + str_vec = line.strip().split() + int_vec = map(int, str_vec) + for seqid in int_vec: + map_seqid2clusterid[seqid]=clusterid + filename=os.path.join( args.output_dir_path, 'cluster_%s.fa' % clusterid ) + files_handle_list[clusterid]=open(filename, 'w+') + + #write each fasta sequence in the corresponding cluster files + log.info( "Writing %s cluster files in directory %s " % (clusterid,args.output_dir_path) ) + with open(args.fasta_file, 'r') as f: + for seqid, record in enumerate(SeqIO.parse(f, 'fasta')): + if seqid in map_seqid2clusterid: #note: not all sequences have a cluster + clusterid=map_seqid2clusterid[seqid]; + SeqIO.write(record, files_handle_list[clusterid], "fasta") + + #cleanup + shutil.rmtree(tmp_output_dir) + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Extract motifs patterns with EDeN.') + parser = create_params(parser, 'eden_cluster_splitter') + args = parser.parse_args() + main(args)