0
|
1 #!/usr/bin/env python
|
|
2 import argparse
|
|
3 import sys
|
|
4 import os
|
|
5 import subprocess
|
|
6 from Bio import SeqIO
|
|
7 import random
|
|
8 import tempfile
|
|
9 import shlex
|
|
10 import shutil
|
|
11 import logging
|
|
12 from eden_wrapper import EDeNWrapper
|
|
13 import eden_iterative_motif_finder
|
|
14 from eden_utilities import log
|
|
15 from eden_utilities import create_params
|
|
16
|
|
17 def main(args):
|
|
18 #setup directory for output
|
|
19 if not os.path.exists(args.output_dir_path) :
|
|
20 os.mkdir(args.output_dir_path)
|
|
21 else:
|
|
22 sys.exit("Output directory %s already exists. Bailing out." % args.output_dir_path)
|
|
23
|
|
24 #make seq file
|
|
25 tmp_output_dir=tempfile.mkdtemp()
|
|
26 seq_pos_file_path=eden_iterative_motif_finder.MakePositiveSequenceFile(args.fasta_file, tmp_output_dir)
|
|
27
|
|
28 #cluster sequences
|
|
29 param = vars(args)
|
|
30 param.update({'dat_file_path':seq_pos_file_path,'output_dir':tmp_output_dir})
|
|
31 EDeN=EDeNWrapper(param)
|
|
32 EDeN.Cluster()
|
|
33
|
|
34 #build the inverse index seqid->clusterid
|
|
35 #and create a dict of file_handles
|
|
36 files_handle_list=dict()
|
|
37 map_seqid2clusterid = dict()
|
|
38 with open(os.path.join( tmp_output_dir, 'cluster' )) as f:
|
|
39 for clusterid, line in enumerate(f):
|
|
40 str_vec = line.strip().split()
|
|
41 int_vec = map(int, str_vec)
|
|
42 for seqid in int_vec:
|
|
43 map_seqid2clusterid[seqid]=clusterid
|
|
44 filename=os.path.join( args.output_dir_path, 'cluster_%s.fa' % clusterid )
|
|
45 files_handle_list[clusterid]=open(filename, 'w+')
|
|
46
|
|
47 #write each fasta sequence in the corresponding cluster files
|
|
48 log.info( "Writing %s cluster files in directory %s " % (clusterid,args.output_dir_path) )
|
|
49 with open(args.fasta_file, 'r') as f:
|
|
50 for seqid, record in enumerate(SeqIO.parse(f, 'fasta')):
|
|
51 if seqid in map_seqid2clusterid: #note: not all sequences have a cluster
|
|
52 clusterid=map_seqid2clusterid[seqid];
|
|
53 SeqIO.write(record, files_handle_list[clusterid], "fasta")
|
|
54
|
|
55 #cleanup
|
|
56 shutil.rmtree(tmp_output_dir)
|
|
57
|
|
58
|
|
59
|
|
60 if __name__ == '__main__':
|
|
61 parser = argparse.ArgumentParser(description='Extract motifs patterns with EDeN.')
|
|
62 parser = create_params(parser, 'eden_cluster_splitter')
|
|
63 args = parser.parse_args()
|
|
64 main(args)
|