annotate data_manager/data_manager_fetch_motifs.py @ 11:24a2c2783fb8 draft

Uploaded
author jeremyjliu
date Tue, 07 Apr 2015 23:19:04 -0400
parents a5421f83f972
children 377723319b45
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
1 #!/usr/bin/env python
8
b4c2836d2e4e Uploaded
jeremyjliu
parents: 6
diff changeset
2 # Jeremy Liu
b4c2836d2e4e Uploaded
jeremyjliu
parents: 6
diff changeset
3 # February 2015
b4c2836d2e4e Uploaded
jeremyjliu
parents: 6
diff changeset
4 # Adapted from Dan Blankenberg's sample data manager
0
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
5
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
6 import sys
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
7 import os
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
8 import tempfile
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
9 import shutil
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
10 import optparse
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
11 import urllib2
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
12 #import uuid
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
13 from ftplib import FTP
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
14 import tarfile
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
15 import zipfile
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
16 import gzip
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
17 import bz2
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
18
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
19 from galaxy.util.json import from_json_string, to_json_string
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
20
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
# Number of bytes requested per read() while streaming a download to disk (1 MiB).
CHUNK_SIZE = 2 ** 20
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
22
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
# Download sources keyed by the motif_db selector.  Each selector maps to three
# (url, file name, data table value, display name) records, listed in the order
# they are fetched: the .bed.bgz file, its .bgz.tbi companion, and the PWM file.
# NOTE(review): the PWM URLs point at *.meme.txt files but are stored under
# *.RData file names.  The names are kept unchanged here because downstream
# consumers may rely on them -- confirm which name is intended.
_MOTIF_SOURCES = {
    "encode": (
        ('http://compbio.med.harvard.edu/motif-enrichment/pouya_motifs.bed.bgz',
         "pouya_motifs.bed.bgz", "encode_bgz", "Encode Motifs (hg19) BGZ"),
        ('http://compbio.med.harvard.edu/motif-enrichment/pouya_motifs.bed.bgz.tbi',
         "pouya_motifs.bed.bgz.tbi", "encode_tbi", "Encode Motifs (hg19) TBI"),
        ('http://compbio.med.harvard.edu/motif-enrichment/pwms/pouya.pwms.from.seq.meme.txt',
         "pouya.pwms.from.seq.RData", "encode_pwm", "Encode Motifs (hg19) PWM MEME"),
    ),
    "jaspar": (
        ('http://compbio.med.harvard.edu/motif-enrichment/jaspar_jolma_motifs.bed.bgz',
         "jaspar_jolma_motifs.bed.bgz", "jaspar_bgz", "Jaspar and Jolma Motifs (hg19) BGZ"),
        ('http://compbio.med.harvard.edu/motif-enrichment/jaspar_jolma_motifs.bed.bgz.tbi',
         "jaspar_jolma_motifs.bed.bgz.tbi", "jaspar_tbi", "Jaspar and Jolma Motifs (hg19) TBI"),
        ('http://compbio.med.harvard.edu/motif-enrichment/pwms/jaspar.jolma.pwms.from.seq.meme.txt',
         "jaspar.jolma.pwms.from.seq.RData", "jaspar_pwm", "Jaspar and Jolma Motifs (hg19) PWM MEME"),
    ),
    "mouse": (
        ('http://compbio.med.harvard.edu/motif-enrichment/mm9_motifs_split.bed.bgz',
         "mm9_motifs_split.bed.bgz", "mouse_bgz", "Mouse Motifs (mm9) BGZ"),
        ('http://compbio.med.harvard.edu/motif-enrichment/mm9_motifs_split.bed.bgz.tbi',
         "mm9_motifs_split.bed.bgz.tbi", "mouse_tbi", "Mouse Motifs (mm9) TBI"),
        ('http://compbio.med.harvard.edu/motif-enrichment/pwms/mm9.pwms.from.seq.meme.txt',
         "mm9.pwms.from.seq.RData", "mouse_pwm", "Mouse Motifs (mm9) PWM MEME"),
    ),
    "test": (
        ('http://compbio.med.harvard.edu/motif-enrichment/pouya_test_motifs.bed.bgz',
         "pouya_test_motifs.bed.bgz", "test_bgz", "Test Encode Motifs (hg19) BGZ"),
        ('http://compbio.med.harvard.edu/motif-enrichment/pouya_test_motifs.bed.bgz.tbi',
         "pouya_test_motifs.bed.bgz.tbi", "test_tbi", "Test Encode Motifs (hg19) TBI"),
        ('http://compbio.med.harvard.edu/motif-enrichment/pwms/pouya.pwms.from.seq.meme.txt',
         "pouya.pwms.from.seq.RData", "test_pwm", "Test Encode Motifs (hg19) PWM MEME"),
    ),
}


def download_motif_databases(data_manager_dict, params, target_directory, motif_db):
    """Download the files of one motif database and register them.

    For the selected database, each of its three files is fetched with
    urllib2, written into ``target_directory`` and appended as an entry to the
    'motif_databases' data table inside ``data_manager_dict``.

    data_manager_dict -- dict that collects data table entries (mutated in place)
    params            -- passed through unchanged to _stream_fasta_to_file
    target_directory  -- directory the downloaded files are written to
    motif_db          -- selector: "encode", "jaspar" or "mouse"; any other
                         value (including None) selects the small test set
    """
    # Unknown selectors deliberately fall back to the test database.
    sources = _MOTIF_SOURCES.get(motif_db, _MOTIF_SOURCES["test"])

    for url, file_name, value, display_name in sources:
        reader = urllib2.urlopen(url)
        # _stream_fasta_to_file closes the reader once the copy has finished.
        entry = _stream_fasta_to_file(reader, target_directory, params,
                                      file_name, value, display_name)
        _add_data_table_entry(data_manager_dict, 'motif_databases', entry)
a5421f83f972 Uploaded
jeremyjliu
parents: 9
diff changeset
73
2
d5faf2b51b07 Uploaded
jeremyjliu
parents: 1
diff changeset
74 def _add_data_table_entry( data_manager_dict, data_table, data_table_entry ):
0
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
75 data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} )
2
d5faf2b51b07 Uploaded
jeremyjliu
parents: 1
diff changeset
76 data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get( data_table, [] )
d5faf2b51b07 Uploaded
jeremyjliu
parents: 1
diff changeset
77 data_manager_dict['data_tables'][data_table].append( data_table_entry )
0
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
78 return data_manager_dict
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
79
5
6621a6ac8bb4 Uploaded
jeremyjliu
parents: 4
diff changeset
80 def _stream_fasta_to_file( fasta_stream, target_directory, params,
6621a6ac8bb4 Uploaded
jeremyjliu
parents: 4
diff changeset
81 fasta_base_filename, value, name, close_stream=True ):
0
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
82 fasta_filename = os.path.join( target_directory, fasta_base_filename )
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
83 fasta_writer = open( fasta_filename, 'wb+' )
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
84
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
85 while True:
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
86 buffer = fasta_stream.read(CHUNK_SIZE)
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
87 if not buffer:
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
88 break
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
89
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
90 fasta_writer.write(buffer)
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
91
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
92 fasta_stream.close()
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
93 fasta_writer.close()
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
94
4
75d825e1b00d Uploaded
jeremyjliu
parents: 2
diff changeset
95 return dict( value=value, name=name, path=fasta_base_filename )
0
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
96
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
def main():
    """Command-line entry point: fetch a motif database for Galaxy.

    Expects one positional argument, the path of a JSON parameter file, plus
    the optional -m/--motif_db selector.  The target directory is read from
    params['output_data'][0]['extra_files_path'], the selected motif files are
    downloaded into it, and the resulting data table entries are written back
    to the same JSON file, replacing its previous content.
    """
    # Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option('-m', '--motif_db', dest='motif_db', action='store', type="string", default=None, help='motif_db')
    (options, args) = parser.parse_args()

    if not args:
        # Exit with a usage message instead of an IndexError traceback.
        parser.error('missing path to the JSON parameter file')
    filename = args[0]

    with open(filename) as params_file:
        params = from_json_string(params_file.read())
    target_directory = params['output_data'][0]['extra_files_path']
    # Tolerate a directory that already exists and create missing parents.
    if not os.path.isdir(target_directory):
        os.makedirs(target_directory)
    data_manager_dict = {}

    # Fetch the Motif Database
    download_motif_databases(data_manager_dict, params, target_directory, options.motif_db)

    # save info to json file
    with open(filename, 'wb') as result_file:
        result_file.write(to_json_string(data_manager_dict))
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
115
ba049ce65693 Initial upload
jeremyjliu
parents:
diff changeset
# Run the data manager only when executed as a script, not when imported.
if __name__ == "__main__":
    main()