view eden_cluster_splitter.py @ 0:95a776023fbc draft

Uploaded
author bgruening
date Thu, 12 Jun 2014 11:35:21 -0400
parents
children
line wrap: on
line source

#!/usr/bin/env python
import argparse
import sys
import os
import subprocess
from Bio import SeqIO
import random
import tempfile
import shlex
import shutil
import logging
from eden_wrapper import EDeNWrapper
import eden_iterative_motif_finder
from eden_utilities import log
from eden_utilities import create_params
    
def main(args):
    #setup directory for output
    if not os.path.exists(args.output_dir_path) :
        os.mkdir(args.output_dir_path)
    else:
        sys.exit("Output directory %s already exists. Bailing out." % args.output_dir_path) 

    #make seq file
    tmp_output_dir=tempfile.mkdtemp()
    seq_pos_file_path=eden_iterative_motif_finder.MakePositiveSequenceFile(args.fasta_file, tmp_output_dir)
        
    #cluster sequences 
    param = vars(args)
    param.update({'dat_file_path':seq_pos_file_path,'output_dir':tmp_output_dir})   
    EDeN=EDeNWrapper(param)
    EDeN.Cluster()
    
    #build the inverse index seqid->clusterid
    #and create a dict of file_handles
    files_handle_list=dict()
    map_seqid2clusterid = dict()
    with open(os.path.join( tmp_output_dir, 'cluster' )) as f:
        for clusterid, line in enumerate(f):
            str_vec = line.strip().split()
            int_vec = map(int, str_vec)
            for seqid in int_vec:
                map_seqid2clusterid[seqid]=clusterid
            filename=os.path.join( args.output_dir_path, 'cluster_%s.fa' % clusterid )
            files_handle_list[clusterid]=open(filename, 'w+')
    
    #write each fasta sequence in the corresponding cluster files
    log.info( "Writing %s cluster files in directory %s " % (clusterid,args.output_dir_path) )
    with open(args.fasta_file, 'r') as f:
        for seqid, record in enumerate(SeqIO.parse(f, 'fasta')):
            if seqid in map_seqid2clusterid: #note: not all sequences have a cluster
                clusterid=map_seqid2clusterid[seqid];
                SeqIO.write(record, files_handle_list[clusterid], "fasta")
    
    #cleanup        
    shutil.rmtree(tmp_output_dir)



if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Extract motifs patterns with EDeN.')
    parser = create_params(parser, 'eden_cluster_splitter')
    args = parser.parse_args()
    main(args)