diff eden_cluster_splitter.py @ 0:95a776023fbc draft

Uploaded
author bgruening
date Thu, 12 Jun 2014 11:35:21 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/eden_cluster_splitter.py	Thu Jun 12 11:35:21 2014 -0400
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+import argparse
+import sys
+import os
+import subprocess
+from Bio import SeqIO
+import random
+import tempfile
+import shlex
+import shutil
+import logging
+from eden_wrapper import EDeNWrapper
+import eden_iterative_motif_finder
+from eden_utilities import log
+from eden_utilities import create_params
+    
+def main(args):
+    #setup directory for output
+    if not os.path.exists(args.output_dir_path) :
+        os.mkdir(args.output_dir_path)
+    else:
+        sys.exit("Output directory %s already exists. Bailing out." % args.output_dir_path) 
+
+    #make seq file
+    tmp_output_dir=tempfile.mkdtemp()
+    seq_pos_file_path=eden_iterative_motif_finder.MakePositiveSequenceFile(args.fasta_file, tmp_output_dir)
+        
+    #cluster sequences 
+    param = vars(args)
+    param.update({'dat_file_path':seq_pos_file_path,'output_dir':tmp_output_dir})   
+    EDeN=EDeNWrapper(param)
+    EDeN.Cluster()
+    
+    #build the inverse index seqid->clusterid
+    #and create a dict of file_handles
+    files_handle_list=dict()
+    map_seqid2clusterid = dict()
+    with open(os.path.join( tmp_output_dir, 'cluster' )) as f:
+        for clusterid, line in enumerate(f):
+            str_vec = line.strip().split()
+            int_vec = map(int, str_vec)
+            for seqid in int_vec:
+                map_seqid2clusterid[seqid]=clusterid
+            filename=os.path.join( args.output_dir_path, 'cluster_%s.fa' % clusterid )
+            files_handle_list[clusterid]=open(filename, 'w+')
+    
+    #write each fasta sequence in the corresponding cluster files
+    log.info( "Writing %s cluster files in directory %s " % (clusterid,args.output_dir_path) )
+    with open(args.fasta_file, 'r') as f:
+        for seqid, record in enumerate(SeqIO.parse(f, 'fasta')):
+            if seqid in map_seqid2clusterid: #note: not all sequences have a cluster
+                clusterid=map_seqid2clusterid[seqid];
+                SeqIO.write(record, files_handle_list[clusterid], "fasta")
+    
+    #cleanup        
+    shutil.rmtree(tmp_output_dir)
+
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Extract motifs patterns with EDeN.')
+    parser = create_params(parser, 'eden_cluster_splitter')
+    args = parser.parse_args()
+    main(args)