annotate eden_cluster_splitter.py @ 0:95a776023fbc draft

Uploaded
author bgruening
date Thu, 12 Jun 2014 11:35:21 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
95a776023fbc Uploaded
bgruening
parents:
diff changeset
1 #!/usr/bin/env python
95a776023fbc Uploaded
bgruening
parents:
diff changeset
2 import argparse
95a776023fbc Uploaded
bgruening
parents:
diff changeset
3 import sys
95a776023fbc Uploaded
bgruening
parents:
diff changeset
4 import os
95a776023fbc Uploaded
bgruening
parents:
diff changeset
5 import subprocess
95a776023fbc Uploaded
bgruening
parents:
diff changeset
6 from Bio import SeqIO
95a776023fbc Uploaded
bgruening
parents:
diff changeset
7 import random
95a776023fbc Uploaded
bgruening
parents:
diff changeset
8 import tempfile
95a776023fbc Uploaded
bgruening
parents:
diff changeset
9 import shlex
95a776023fbc Uploaded
bgruening
parents:
diff changeset
10 import shutil
95a776023fbc Uploaded
bgruening
parents:
diff changeset
11 import logging
95a776023fbc Uploaded
bgruening
parents:
diff changeset
12 from eden_wrapper import EDeNWrapper
95a776023fbc Uploaded
bgruening
parents:
diff changeset
13 import eden_iterative_motif_finder
95a776023fbc Uploaded
bgruening
parents:
diff changeset
14 from eden_utilities import log
95a776023fbc Uploaded
bgruening
parents:
diff changeset
15 from eden_utilities import create_params
95a776023fbc Uploaded
bgruening
parents:
diff changeset
16
95a776023fbc Uploaded
bgruening
parents:
diff changeset
def main(args):
    """Cluster the sequences of a FASTA file and write one FASTA file per cluster.

    Steps:
      1. Create ``args.output_dir_path`` (the run is aborted if it already
         exists, so earlier results are never overwritten).
      2. Build the sequence file in a temporary directory and run
         ``EDeNWrapper.Cluster()`` on it.
      3. Read the ``cluster`` file from the temporary directory (expected to
         have been produced by the clustering step). Line *i* of that file
         holds the whitespace-separated integer ids of the sequences that
         belong to cluster *i*.
      4. Write every record of ``args.fasta_file`` whose 0-based position in
         the file appears in a cluster to ``cluster_<i>.fa`` in the output
         directory.

    Args:
        args: parsed command line namespace; ``output_dir_path`` and
            ``fasta_file`` are read here, and all attributes are forwarded
            to ``EDeNWrapper`` as a parameter dictionary.

    Raises:
        SystemExit: if the output directory already exists.
    """
    # setup directory for output
    if os.path.exists(args.output_dir_path):
        sys.exit("Output directory %s already exists. Bailing out." % args.output_dir_path)
    os.mkdir(args.output_dir_path)

    tmp_output_dir = tempfile.mkdtemp()
    cluster_files = {}  # clusterid -> open output file handle
    try:
        # make seq file
        seq_pos_file_path = eden_iterative_motif_finder.MakePositiveSequenceFile(
            args.fasta_file, tmp_output_dir)

        # cluster sequences
        # Work on a copy: vars(args) is the namespace's own __dict__, so
        # updating it directly would silently alter the caller's args.
        param = dict(vars(args))
        param.update({'dat_file_path': seq_pos_file_path,
                      'output_dir': tmp_output_dir})
        eden = EDeNWrapper(param)
        eden.Cluster()

        # build the inverse index seqid -> clusterid
        # and open one output file per cluster
        seqid_to_clusterid = {}
        with open(os.path.join(tmp_output_dir, 'cluster')) as cluster_index:
            for clusterid, line in enumerate(cluster_index):
                for seqid in map(int, line.split()):
                    seqid_to_clusterid[seqid] = clusterid
                filename = os.path.join(args.output_dir_path,
                                        'cluster_%s.fa' % clusterid)
                cluster_files[clusterid] = open(filename, 'w')

        # write each fasta sequence in the corresponding cluster file
        # Report the real number of files (not the last 0-based index); this
        # is also well defined when the cluster file is empty.
        log.info("Writing %s cluster files in directory %s "
                 % (len(cluster_files), args.output_dir_path))
        with open(args.fasta_file, 'r') as fasta:
            for seqid, record in enumerate(SeqIO.parse(fasta, 'fasta')):
                # note: not all sequences have a cluster
                clusterid = seqid_to_clusterid.get(seqid)
                if clusterid is not None:
                    SeqIO.write(record, cluster_files[clusterid], "fasta")
    finally:
        # cleanup: flush/close every cluster file and remove the scratch
        # directory even when one of the steps above failed
        for handle in cluster_files.values():
            handle.close()
        shutil.rmtree(tmp_output_dir)
95a776023fbc Uploaded
bgruening
parents:
diff changeset
57
95a776023fbc Uploaded
bgruening
parents:
diff changeset
58
95a776023fbc Uploaded
bgruening
parents:
diff changeset
59
95a776023fbc Uploaded
bgruening
parents:
diff changeset
if __name__ == '__main__':
    # Command line entry point. The tool-specific options are registered by
    # create_params() under the 'eden_cluster_splitter' key; the description
    # now states what this script does (the previous text described motif
    # extraction, which this script does not perform).
    parser = argparse.ArgumentParser(
        description='Cluster the sequences of a FASTA file with EDeN and '
                    'write one FASTA file per cluster.')
    parser = create_params(parser, 'eden_cluster_splitter')
    args = parser.parse_args()
    main(args)