Mercurial > repos > bgruening > eden_cluster_motif_finder
view eden_cluster_splitter.py @ 0:95a776023fbc draft
Uploaded
author | bgruening |
---|---|
date | Thu, 12 Jun 2014 11:35:21 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python import argparse import sys import os import subprocess from Bio import SeqIO import random import tempfile import shlex import shutil import logging from eden_wrapper import EDeNWrapper import eden_iterative_motif_finder from eden_utilities import log from eden_utilities import create_params def main(args): #setup directory for output if not os.path.exists(args.output_dir_path) : os.mkdir(args.output_dir_path) else: sys.exit("Output directory %s already exists. Bailing out." % args.output_dir_path) #make seq file tmp_output_dir=tempfile.mkdtemp() seq_pos_file_path=eden_iterative_motif_finder.MakePositiveSequenceFile(args.fasta_file, tmp_output_dir) #cluster sequences param = vars(args) param.update({'dat_file_path':seq_pos_file_path,'output_dir':tmp_output_dir}) EDeN=EDeNWrapper(param) EDeN.Cluster() #build the inverse index seqid->clusterid #and create a dict of file_handles files_handle_list=dict() map_seqid2clusterid = dict() with open(os.path.join( tmp_output_dir, 'cluster' )) as f: for clusterid, line in enumerate(f): str_vec = line.strip().split() int_vec = map(int, str_vec) for seqid in int_vec: map_seqid2clusterid[seqid]=clusterid filename=os.path.join( args.output_dir_path, 'cluster_%s.fa' % clusterid ) files_handle_list[clusterid]=open(filename, 'w+') #write each fasta sequence in the corresponding cluster files log.info( "Writing %s cluster files in directory %s " % (clusterid,args.output_dir_path) ) with open(args.fasta_file, 'r') as f: for seqid, record in enumerate(SeqIO.parse(f, 'fasta')): if seqid in map_seqid2clusterid: #note: not all sequences have a cluster clusterid=map_seqid2clusterid[seqid]; SeqIO.write(record, files_handle_list[clusterid], "fasta") #cleanup shutil.rmtree(tmp_output_dir) if __name__ == '__main__': parser = argparse.ArgumentParser(description='Extract motifs patterns with EDeN.') parser = create_params(parser, 'eden_cluster_splitter') args = parser.parse_args() main(args)