annotate preprocess.py @ 1:4d25dec9707e

correction
author blanck
date Tue, 28 Apr 2015 11:23:47 +0200
parents a89bae08bf2d
children 54d549210759
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
1 import os
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
2 import re
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
3 import shutil
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
4 import sys
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
5 import subprocess
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
6 import zipfile
1
4d25dec9707e correction
blanck
parents: 0
diff changeset
7
0
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
8
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
def main():
    """Stage the input files, run the R preprocessing and write the report.

    Every input is read positionally from ``sys.argv``; the positions that
    are not read here are ignored. The function:

    1. derives the chip type and optional tag from the CDF file name,
    2. recreates an empty per-user/per-dataset working directory,
    3. links (or copies) the uploaded ``<n>_task_<name>.dat`` files and the
       four annotation files into it,
    4. runs ``preprocess.R`` through ``_preprocess``,
    5. on success, optionally zips the generated figures, writes one
       tab-separated report line per staged file and removes the working
       directory.

    The process always terminates through ``sys.exit`` with the return code
    of the R script.
    """
    extra_files_directory = sys.argv[1]
    report = sys.argv[4]
    new_files_directory = sys.argv[6]
    dataset = sys.argv[7]
    cdffull_name = sys.argv[9]
    ufl_name = sys.argv[10]
    ugp_name = sys.argv[11]
    acs_name = sys.argv[12]
    cdffull = sys.argv[14]
    ufl = sys.argv[15]
    ugp = sys.argv[16]
    acs = sys.argv[17]
    tumor = sys.argv[18]
    settingType = sys.argv[19]
    outputgraph = sys.argv[20]
    zipfigures = sys.argv[21]
    outputlog = sys.argv[22]
    log = sys.argv[23]
    user = sys.argv[24]

    extra_file_names = sorted(os.listdir(extra_files_directory))

    # "<chipType>,<tag>.<ext>" carries a tag; "<chipType>.<ext>" does not.
    if "," in cdffull_name:
        chipType, tagExt = cdffull_name.split(",", 1)
        tag = tagExt.split(".", 1)[0]
    else:
        chipType = cdffull_name.split(".", 1)[0]
        tag = ""

    data_dir = os.path.join(new_files_directory, user, dataset)
    mpagenomics_dir = os.path.join(new_files_directory, "mpagenomics", user)

    # Always start from an empty working directory: if it cannot be created
    # (typically because it is left over from a previous run), wipe it and
    # create it again.
    try:
        os.makedirs(data_dir)
    except OSError:
        shutil.rmtree(data_dir)
        os.makedirs(data_dir)

    if not os.path.isdir(mpagenomics_dir):
        os.makedirs(mpagenomics_dir)

    # NOTE(review): the "." before "dat" is unescaped and therefore matches
    # any character; the pattern is kept as-is to preserve which files are
    # accepted - confirm whether r"\.dat$" was intended.
    task_file = re.compile(r"^\d+_task_(.*).dat$")

    # Stage the uploaded files under their bare names. Entries that do not
    # follow the "<n>_task_<name>.dat" scheme (indices, composite paths, ...)
    # are skipped, and are consequently left out of the report as well.
    staged_names = []
    for entry in extra_file_names:
        name_match = task_file.match(entry)
        if not name_match:
            continue
        bare_name = name_match.group(1)
        _copy(os.path.join(extra_files_directory, entry),
              os.path.join(data_dir, bare_name))
        staged_names.append(bare_name)

    _copy(cdffull, os.path.join(data_dir, cdffull_name))
    _copy(ugp, os.path.join(data_dir, ugp_name))
    _copy(ufl, os.path.join(data_dir, ufl_name))
    _copy(acs, os.path.join(data_dir, acs_name))

    fig_dir = os.path.join("mpagenomics", user, "figures", dataset, "signal")
    abs_fig_dir = os.path.join(new_files_directory, fig_dir)

    retcode = _preprocess(chipType, dataset, mpagenomics_dir, data_dir,
                          new_files_directory, tumor, settingType,
                          outputgraph, outputlog, log, tag)

    # NOTE(review): when the R script fails, neither the report nor the
    # cleanup of data_dir happens - this mirrors the original control flow.
    if retcode == 0:
        if os.path.isdir(abs_fig_dir) and outputgraph == "TRUE":
            # List the figures before the archive is created so that the
            # archive does not end up containing itself.
            figure_names = os.listdir(abs_fig_dir)
            archive_path = os.path.join(abs_fig_dir, zipfigures)
            # The context manager guarantees the archive is finalized
            # (central directory written) even if adding a file fails.
            with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zipbuf:
                for figure_name in figure_names:
                    # Store each figure under its bare file name.
                    zipbuf.write(os.path.join(abs_fig_dir, figure_name),
                                 figure_name)

        # Create report: one "<file>\t<dataset>\t<chipType>" line per file.
        try:
            with open(report, "w") as report_file:
                for bare_name in staged_names:
                    report_file.write(
                        "%s\t%s\t%s\n" % (bare_name, dataset, chipType))
        finally:
            shutil.rmtree(data_dir)

    sys.exit(retcode)
1
4d25dec9707e correction
blanck
parents: 0
diff changeset
102
0
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
103
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
104 def _copy(source, destination):
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
105 try:
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
106 os.link(source, destination)
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
107 except:
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
108 shutil.copy(source, destination)
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
109
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
def _preprocess(chipType, dataset, mpagenomics_dir, data_dir, tmp_dir, tumor, settingType, outputgraph, outputlog, log, tag):
    """Run ``preprocess.R`` with ``Rscript`` and return its exit status.

    The R script is looked up next to this Python file and receives, in
    order: ``chipType``, ``dataset``, ``mpagenomics_dir``, ``data_dir``,
    ``tumor``, ``settingType``, ``outputgraph`` and ``tag``.

    Both stdout and stderr of the R process are redirected to a single file:
    ``log`` when ``outputlog`` is the string ``"TRUE"``, otherwise
    ``errfile.log`` inside ``tmp_dir``.

    Returns the integer return code reported by ``subprocess.call``.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))

    if outputlog == "TRUE":
        log_path = log
    else:
        log_path = os.path.join(tmp_dir, "errfile.log")

    command = ["Rscript", os.path.join(script_dir, "preprocess.R"),
               chipType, dataset, mpagenomics_dir, data_dir,
               tumor, settingType, outputgraph, tag]

    # The context manager closes the log even if Rscript cannot be started.
    with open(log_path, 'w') as errfile:
        retcode = subprocess.call(command, stdout=errfile, stderr=errfile)
    return retcode
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
120
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
121
a89bae08bf2d Uploaded
sblanck
parents:
diff changeset
# Script entry point: run the tool only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()