annotate mpagenomics_normalize-7dc6ce39fb89/preprocess.py @ 0:84b13b0e2b85

Uploaded
author sblanck
date Thu, 07 May 2015 08:22:36 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
1 import os
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
2 import re
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
3 import shutil
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
4 import sys
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
5 import subprocess
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
6 import zipfile
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
7 import optparse
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
8
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
def main():
    """Stage the inputs, run the R preprocessing script and write the outputs.

    Command-line driven; takes no arguments and never returns normally: the
    process exits with the return code of the R script.

    Steps:
      1. Parse the command-line options.
      2. (Re)create a clean working directory ``<new_file_path>/<user_id>``
         and make sure ``<new_file_path>/mpagenomics/<user_id>`` exists.
      3. Link (or copy) every input file and the four annotation files into
         the working directory under their given names.
      4. Derive the chip type and optional tag from the CDF file name and run
         ``_preprocess``.
      5. On success, optionally zip the figures, write the tab-separated
         summary report and remove the working directory.
    """
    parser = optparse.OptionParser()

    # Single-valued options as (flag, destination attribute) pairs.
    single_valued = (
        ('-s', 'summary'),            # path of the summary report to write
        ('-e', 'dataSetName'),        # data set name handed to the R script
        ('-p', 'new_file_path'),      # base directory for all working files
        ('-c', 'inputcdffull_name'),  # file names used in the working dir ...
        ('-f', 'inputufl_name'),
        ('-g', 'inputugp_name'),
        ('-a', 'inputacs_name'),
        ('-d', 'inputcdffull'),       # ... and the matching source paths
        ('-v', 'inputufl'),
        ('-w', 'inputugp'),
        ('-b', 'inputacs'),
        ('-t', 'tumorcsv'),           # forwarded to the R script
        ('-y', 'settingsType'),       # forwarded to the R script
        ('-o', 'outputgraph'),        # "TRUE" to zip the generated figures
        ('-z', 'zipfigures'),         # path of the figures archive
        ('-k', 'outputlog'),          # "TRUE" to send R output to the -l file
        ('-l', 'log'),                # log file path
        ('-u', 'user_id'),            # per-user sub-directory name
    )
    for flag, dest in single_valued:
        parser.add_option(flag, action="store", dest=dest)

    # Repeatable options: the i-th -i path is staged under the i-th -n name.
    parser.add_option('-i', action="append", dest='inputFile', default=[])
    parser.add_option('-n', action='append', dest='inputFileName', default=[])

    options, args = parser.parse_args()

    dataSetName = options.dataSetName
    mpagenomics_dir = os.path.join(options.new_file_path, "mpagenomics", options.user_id)
    data_dir = os.path.join(options.new_file_path, options.user_id)

    # Start from an empty working directory: if creation fails (typically
    # because it is left over from a previous run), wipe it and recreate it.
    try:
        os.makedirs(data_dir)
    except OSError:
        shutil.rmtree(data_dir)
        os.makedirs(data_dir)

    if not os.path.isdir(mpagenomics_dir):
        os.makedirs(mpagenomics_dir)

    for input_path, input_name in zip(options.inputFile, options.inputFileName):
        _copy(input_path, os.path.join(data_dir, input_name))

    # The CDF file name is either "<chipType>,<tag>.<ext>" or "<chipType>.<ext>".
    cdffull_name = options.inputcdffull_name
    if "," in cdffull_name:
        chipType, _, tag_and_ext = cdffull_name.partition(",")
        tag = tag_and_ext.split(".", 1)[0]
    else:
        chipType = cdffull_name.split(".", 1)[0]
        tag = ""

    _copy(options.inputcdffull, os.path.join(data_dir, options.inputcdffull_name))
    _copy(options.inputugp, os.path.join(data_dir, options.inputugp_name))
    _copy(options.inputufl, os.path.join(data_dir, options.inputufl_name))
    _copy(options.inputacs, os.path.join(data_dir, options.inputacs_name))

    fig_dir = os.path.join("mpagenomics", options.user_id, "figures", dataSetName, "signal")
    abs_fig_dir = os.path.join(options.new_file_path, fig_dir)

    retcode = _preprocess(chipType, dataSetName, mpagenomics_dir, data_dir,
                          options.new_file_path, options.tumorcsv,
                          options.settingsType, options.outputgraph,
                          options.outputlog, options.log, tag)

    if retcode == 0:
        if os.path.isdir(abs_fig_dir) and options.outputgraph == "TRUE":
            # List the figures BEFORE creating the archive so that the
            # archive never tries to include itself.
            figure_names = os.listdir(abs_fig_dir)
            zip_path = os.path.join(abs_fig_dir, options.zipfigures)
            # The context manager closes the archive, which is what writes
            # its central directory; without it the zip may be unreadable.
            with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipbuf:
                for figure_name in figure_names:
                    # Store each figure under its bare file name.
                    zipbuf.write(os.path.join(abs_fig_dir, figure_name), figure_name)

        # Create report: one "<file name>\t<data set>\t<chip type>" line per
        # input file. The working directory is removed whether or not the
        # report could be written.
        try:
            with open(options.summary, "w") as report:
                for input_name in options.inputFileName:
                    report.write("%s\t%s\t%s\n" % (input_name, dataSetName, chipType))
        finally:
            shutil.rmtree(data_dir)

    # On failure the working directory is left in place.
    sys.exit(retcode)
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
100
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
101
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
102 def _copy(source, destination):
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
103 try:
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
104 os.symlink(source, destination)
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
105 except:
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
106 shutil.copy(source, destination)
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
107
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
def _preprocess(chipType, dataset, mpagenomics_dir, data_dir, tmp_dir, tumor, settingType, outputgraph, outputlog, log, tag):
    """Run ``preprocess.R`` (located next to this script) through ``Rscript``.

    Args:
        chipType, dataset, mpagenomics_dir, data_dir, tumor, settingType,
        outputgraph, tag: forwarded, in that order, as command-line arguments
            to the R script.
        tmp_dir: directory that receives ``errfile.log`` when no log file was
            requested.
        outputlog: the string "TRUE" sends the R output to ``log``; any other
            value sends it to ``<tmp_dir>/errfile.log``.
        log: path of the requested log file.

    Returns:
        The exit status of the ``Rscript`` process.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))

    if outputlog == "TRUE":
        log_path = log
    else:
        log_path = os.path.join(tmp_dir, "errfile.log")

    command = ["Rscript", os.path.join(script_dir, "preprocess.R"),
               chipType, dataset, mpagenomics_dir, data_dir,
               tumor, settingType, outputgraph, tag]

    # stdout and stderr share one file; the with-block guarantees it is
    # flushed and closed once the R process has finished.
    with open(log_path, 'w') as errfile:
        retcode = subprocess.call(command, stdout=errfile, stderr=errfile)
    return retcode
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
118
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
119
84b13b0e2b85 Uploaded
sblanck
parents:
diff changeset
# Script entry point: run the tool only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()