Mercurial > repos > jeremie > pindel_test
comparison pindel.py @ 8:de8a643a41e3 draft
Uploaded
author | jeremie |
---|---|
date | Mon, 07 Jul 2014 05:05:50 -0400 |
parents | 544c5266fc0f |
children | 844ad5ce74cc |
comparison
equal
deleted
inserted
replaced
7:74009865b3ad | 8:de8a643a41e3 |
---|---|
1 | 1 |
2 import argparse, optparse, os, shutil, subprocess, sys, tempfile, time, shlex | 2 import argparse, optparse, os, shutil, subprocess, sys, tempfile, time, shlex |
3 | 3 |
4 parser = argparse.ArgumentParser(description='') | 4 parser = argparse.ArgumentParser(description='') |
5 parser.add_argument('-r', dest='input_reference', required=True, help='the reference file') | 5 parser.add_argument('-r', dest='inputFastaFile', required=True, help='the reference file') |
6 parser.add_argument('-b', dest='input_bam', required=True, help='the bam file') | 6 parser.add_argument('-b', dest='inputBamFile', required=True, help='the bam file') |
7 parser.add_argument('-s', dest='insert_size', type=int, default='500', required=True, help='the insert size') | 7 parser.add_argument('-s', dest='insert_size', type=int, default='500', required=True, help='the insert size') |
8 parser.add_argument('-n', dest='sampleTag', default='sample', required=False, help='the sample tag') | 8 parser.add_argument('-n', dest='sampleTag', default='sample', required=False, help='the sample tag') |
9 parser.add_argument('-o', dest='output_vcf', help='the output vcf', required=True) | 9 # parser.add_argument('-o1', dest='outputRaw', help='the output raw', required=True) |
10 parser.add_argument('--name', dest='name', default='defaults') | 10 parser.add_argument('-o2', dest='outputVcfFile', help='the output vcf', required=True) |
11 parser.add_argument('--name', dest='name', default='name') | |
11 parser.add_argument('--number_of_threads', dest='number_of_threads', type=int, default='1') | 12 parser.add_argument('--number_of_threads', dest='number_of_threads', type=int, default='1') |
12 parser.add_argument('--window_size', dest='window_size', type=int, default='5') | 13 parser.add_argument('--window_size', dest='window_size', type=int, default='5') |
13 parser.add_argument('--sequencing_error_rate', dest='sequencing_error_rate', type=float, default='0.01') | 14 parser.add_argument('--sequencing_error_rate', dest='sequencing_error_rate', type=float, default='0.01') |
14 parser.add_argument('--sensitivity', dest='sensitivity', default='0.95', type=float) | 15 parser.add_argument('--sensitivity', dest='sensitivity', default='0.95', type=float) |
15 parser.add_argument('--report_long_insertions', dest='report_long_insertions', action='store_true', default=False) | 16 parser.add_argument('--report_long_insertions', dest='report_long_insertions', action='store_true', default=False) |
16 parser.add_argument('--report_duplications', dest='report_duplications', action='store_true', default=False) | 17 parser.add_argument('--report_duplications', dest='report_duplications', action='store_true', default=False) |
17 parser.add_argument('--report_inversions', dest='report_inversions', action='store_true', default=False) | 18 parser.add_argument('--report_inversions', dest='report_inversions', action='store_true', default=False) |
18 parser.add_argument('--report_breakpoints', dest='report_breakpoints', action='store_true', default=False) | 19 parser.add_argument('--report_breakpoints', dest='report_breakpoints', action='store_true', default=False) |
19 | 20 |
20 tmp_dir = tempfile.mkdtemp() | 21 parser.add_argument('--maximum_allowed_mismatch_rate', dest='maximum_allowed_mismatch_rate', type=float, default='0.02') |
21 pindel_dir=tmp_dir+"/pindel" | 22 parser.add_argument('--report_close_mapped_reads', dest='report_close_mapped_reads', action='store_true', default=False) |
22 errorFile = tmp_dir+"/errorLog" | 23 parser.add_argument('--report_only_close_mapped_reads', dest='report_only_close_mapped_reads', action='store_true', default=False) |
23 | 24 parser.add_argument('--report_interchromosomal_events', dest='report_interchromosomal_events', action='store_true', default=False) |
24 | 25 parser.add_argument('--IndelCorrection', dest='IndelCorrection', action='store_true', default=False) |
25 def execute( cmd, output="" ): | 26 parser.add_argument('--NormalSamples', dest='NormalSamples', action='store_true', default=False) |
27 parser.add_argument('--additional_mismatch', dest='additional_mismatch', type=int, default='1') | |
28 parser.add_argument('--min_perfect_match_around_BP', dest='min_perfect_match_around_BP', type=int, default='3') | |
29 parser.add_argument('--min_inversion_size', dest='min_inversion_size', type=int, default='50') | |
30 parser.add_argument('--min_num_matched_bases', dest='min_num_matched_bases', type=int, default='30') | |
31 parser.add_argument('--balance_cutoff', dest='balance_cutoff', type=int, default='0') | |
32 parser.add_argument('--anchor_quality', dest='anchor_quality', type=int, default='0') | |
33 parser.add_argument('--minimum_support_for_event', dest='minimum_support_for_event', type=int, default='3') | |
34 parser.add_argument('--detect_DD', dest='detect_DD', action='store_true', default=False) | |
35 parser.add_argument('--MAX_DD_BREAKPOINT_DISTANCE', dest='MAX_DD_BREAKPOINT_DISTANCE', type=int, default='350') | |
36 parser.add_argument('--MAX_DISTANCE_CLUSTER_READS', dest='MAX_DISTANCE_CLUSTER_READS', type=int, default='100') | |
37 parser.add_argument('--MIN_DD_CLUSTER_SIZE', dest='MIN_DD_CLUSTER_SIZE', type=int, default='3') | |
38 parser.add_argument('--MIN_DD_BREAKPOINT_SUPPORT', dest='MIN_DD_BREAKPOINT_SUPPORT', type=int, default='3') | |
39 parser.add_argument('--MIN_DD_MAP_DISTANCE', dest='MIN_DD_MAP_DISTANCE', type=int, default='8000') | |
40 parser.add_argument('--DD_REPORT_DUPLICATION_READS', dest='DD_REPORT_DUPLICATION_READS', action='store_true', default=False) | |
41 | |
42 | |
43 # def execute(cmd, output=""): | |
44 # try: | |
45 # err = open(tempDir+"/errorLog", 'a') | |
46 # if output != "": | |
47 # out = open(output, 'w') | |
48 # else: | |
49 # out = subprocess.PIPE | |
50 # process = subprocess.Popen(args=shlex.split(cmd), stdout=out, stderr=err) | |
51 # process.wait() | |
52 # err.close() | |
53 # if out != subprocess.PIPE: | |
54 # out.close() | |
55 # except Exception, e: | |
56 # sys.stderr.write("problem doing : %s\n" %(cmd)) | |
57 # sys.stderr.write('%s\n\n' % str(e)) | |
58 | |
59 def execute(cmd, output=None): | |
60 import subprocess, sys, shlex | |
61 # function to execute a cmd and report if an error occur | |
62 print(cmd) | |
26 try: | 63 try: |
27 err = open(tmp_dir+"/errorLog", 'a') | 64 process = subprocess.Popen(args=shlex.split(cmd), stdout=subprocess.PIPE, stderr=subprocess.PIPE) |
28 if output != "": | |
29 out = open(output, 'w') | |
30 else: | |
31 out = subprocess.PIPE | |
32 process = subprocess.Popen( args=shlex.split(cmd), stdout=out, stderr=err ) | |
33 process.wait() | 65 process.wait() |
34 err.close() | 66 stdout,stderr = process.communicate() |
35 if out != subprocess.PIPE: | 67 except Exception, e: # une erreur de ma commande : stderr |
36 out.close() | 68 sys.stderr.write("problem doing : %s\n%s\n" %(cmd, e)) |
37 except Exception, e: | 69 return |
38 sys.stderr.write("problem doing : %s\n" %(cmd)) | 70 if output: |
39 sys.stderr.write( '%s\n\n' % str(e) ) | 71 output = open(output, 'w') |
40 | 72 output.write(stdout) |
41 | 73 output.close() |
42 def index (args): | 74 if stderr != '': # une erreur interne au programme : stdout (sinon, souvent des warning arrete les programmes) |
43 cmd = "samtools index %s" % (args.input_bam) | 75 sys.stdout.write("warning or error while doing : %s\n-----\n%s-----\n\n" %(cmd, stderr)) |
44 # print( "commande = "+cmd ) | 76 |
45 # os.system(cmd) | 77 |
46 execute(cmd) | 78 def indexBam(inputFastaFile, inputBamFile): |
47 | 79 cmd = "samtools indexBam %s" %(inputBamFile) |
48 cmd = "samtools faidx %s" % (args.input_reference) | 80 execute(cmd) |
49 # print( "commande = "+cmd ) | 81 cmd = "samtools faidx %s" %(inputFastaFile) |
50 # os.system(cmd) | 82 execute(cmd) |
51 execute(cmd) | 83 |
52 | 84 |
53 | 85 def config(inputBamFile, meanInsertSize, name, tempDir): |
54 def config( args ): | 86 configFile = tempDir+"/pindel_configFile" |
55 # print("config") | 87 # cmd = 'echo %s\t%s\t%s > %s' %(bam, size, name, configFile) |
56 config = tmp_dir+"/pindel_config" | |
57 # cmd = 'echo %s\t%s\t%s > %s' % (bam, size, name, config) | |
58 # print("commande = "+cmd) | 88 # print("commande = "+cmd) |
59 # os.system(cmd) | 89 # os.system(cmd) |
60 fil = open(config, 'w') | 90 fil = open(configFile, 'w') |
61 fil.write("%s\t%s\t%s" % (args.input_bam, args.insert_size, args.name) ) | 91 fil.write("%s\t%s\t%s" %(inputBamFile, meanInsertSize, name)) |
62 fil.close() | 92 fil.close() |
63 return config | 93 return configFile |
64 | 94 |
65 | 95 |
66 # def pindel( reference, config, output ): | 96 # def pindel(reference, configFile, output): |
67 # print("pindel") | 97 # print("pindel") |
68 # cmd="pindel -f %s -i %s -o %s" % (reference, config, output) | 98 # cmd="pindel -f %s -i %s -o %s" %(reference, configFile, output) |
69 # print("commande = "+cmd) | 99 # print("commande = "+cmd) |
70 # os.system(cmd) | 100 # os.system(cmd) |
71 | 101 |
72 # os.system("rm %s" % (output)) | 102 # os.system("rm %s" %(output)) |
73 # if not os.path.exists(output): | 103 # if not os.path.exists(output): |
74 # os.makedirs(output) | 104 # os.makedirs(output) |
75 # os.system("mv %s_* %s" %(output, output) ) | 105 # os.system("mv %s_* %s" %(output, output)) |
76 | 106 |
77 | 107 |
78 def pindel( reference, config, temp, args ): | 108 def pindel(reference, configFile, args, tempDir): |
79 cmd = "pindel -f %s -i %s -o %s " % (reference, config, pindel_dir) | 109 pindelTempDir=tempDir+"/pindel" |
80 cmd += "--number_of_threads %d --window_size %d --sequencing_error_rate %f --sensitivity %f" % (args.number_of_threads, args.window_size, args.sequencing_error_rate, args.sensitivity) | 110 cmd = "pindel -f %s -i %s -o %s " %(reference, configFile, pindelTempDir) |
111 cmd += " --number_of_threads %d --window_size %d --sequencing_error_rate %f --sensitivity %f" %(args.number_of_threads, args.window_size, args.sequencing_error_rate, args.sensitivity) | |
112 cmd += " -u %f -n %d -a %d -m %d -v %d -d %d -B %d -A %d -M %d " %(args.maximum_allowed_mismatch_rate, args.NM, args.additional_mismatch, args.min_perfect_match_around_BP, args.min_inversion_size, args.min_num_matched_bases, args.balance_cutoff, args.anchor_quality, args.minimum_support_for_event) | |
113 | |
81 if args.report_long_insertions: | 114 if args.report_long_insertions: |
82 cmd += ' --report_long_insertions ' | 115 cmd += ' --report_long_insertions ' |
83 if args.report_duplications: | 116 if args.report_duplications: |
84 cmd += ' --report_duplications ' | 117 cmd += ' --report_duplications ' |
85 if args.report_inversions: | 118 if args.report_inversions: |
86 cmd += ' --report_inversions ' | 119 cmd += ' --report_inversions ' |
87 if args.report_breakpoints: | 120 if args.report_breakpoints: |
88 cmd += ' --report_breakpoints ' | 121 cmd += ' --report_breakpoints ' |
89 print("commande = "+cmd) | 122 |
90 execute( cmd ) | 123 if args.report_close_mapped_reads: |
91 | 124 cmd += ' --report_close_mapped_reads ' |
92 | 125 if args.report_only_close_mapped_reads: |
93 | 126 cmd += ' --report_only_close_mapped_reads ' |
94 | 127 if args.report_interchromosomal_events: |
95 | 128 cmd += ' --report_interchromosomal_events ' |
96 def move( avant, apres ): | 129 if args.IndelCorrection: |
130 cmd += ' --IndelCorrection ' | |
131 if args.NormalSamples: | |
132 cmd += ' --NormalSamples ' | |
133 if args.input_SV_Calls_for_assembly: | |
134 cmd += ' --input_SV_Calls_for_assembly %s ' %(args.input_SV_Calls_for_assembly) | |
135 | |
136 if args.detect_DD: | |
137 cmd += ' -q ' | |
138 cmd += ' --MAX_DD_BREAKPOINT_DISTANCE '+str(args.MAX_DD_BREAKPOINT_DISTANCE) | |
139 cmd += ' --MAX_DISTANCE_CLUSTER_READS '+str(args.MAX_DISTANCE_CLUSTER_READS) | |
140 cmd += ' --MIN_DD_CLUSTER_SIZE '+str(args.MIN_DD_CLUSTER_SIZE) | |
141 cmd += ' --MIN_DD_BREAKPOINT_SUPPORT '+str(args.MIN_DD_BREAKPOINT_SUPPORT) | |
142 cmd += ' --MIN_DD_MAP_DISTANCE '+str(args.MIN_DD_MAP_DISTANCE) | |
143 if args.DD_REPORT_DUPLICATION_READS: | |
144 cmd += ' --DD_REPORT_DUPLICATION_READS ' | |
145 | |
146 execute(cmd) | |
147 return pindelTempDir | |
148 | |
149 | |
150 def move(avant, apres): | |
97 if os.path.exists(avant): | 151 if os.path.exists(avant): |
98 execute("mv %s %s" %(avant, apres)) | 152 execute("mv %s %s" %(avant, apres)) |
99 | 153 |
100 | 154 |
101 def pindel2vcf( args ): | 155 def pindel2vcf(outputVcf, inputFastaFile, name, pindelTempDir): |
102 date = str(time.strftime('%d/%m/%y',time.localtime())) | 156 date = str(time.strftime('%d/%m/%y',time.localtime())) |
103 name = "test" | 157 cmd = "pindel2vcf -P %s -r %s -R %s -d %s" %(pindelTempDir, inputFastaFile, name, date) |
104 cmd = "pindel2vcf -P %s -r %s -R %s -d %s" % (pindel_dir, args.input_reference, name, date ) | 158 print("commande = " + cmd) |
105 print ("commande = " + cmd) | 159 execute(cmd) |
106 execute(cmd) | 160 if os.path.exists(pindelTempDir+".vcf"): |
107 if os.path.exists(pindel_dir+".vcf"): | 161 execute("cp %s %s" %(pindelTempDir+".vcf", outputVcf)) |
108 execute("cp %s %s" %(pindel_dir+".vcf", args.output_vcf)) | 162 |
163 | |
164 def getMeanInsertSize(bamFile, tempDir): | |
165 samFile = tempDir+"/sam" | |
166 histogramFile = tempDir+"/histogram" | |
167 outputFile = tempDir+"/output" | |
168 cmd = "samtools view -h %s %s" % (bamFile, samFile) | |
169 execute(cmd) | |
170 cmd = "picard-tools CollectInsertSizeMetrics H=%s O=%s I=%s" % (histogramFile, outputFile, samFile) | |
171 execute(cmd) | |
172 try: | |
173 f = open(outputFile, 'r') | |
174 lignes = f.readlines() | |
175 f.close() | |
176 meanInsertSize = lignes[7].split('\t')[0] | |
177 except: | |
178 sys.stderr.write("problem while opening file %s " %(outputFile)) | |
179 return 0 | |
180 return meanInsertSize | |
109 | 181 |
110 | 182 |
111 def __main__(): | 183 def __main__(): |
112 | 184 |
113 args = parser.parse_args() | 185 args = parser.parse_args() |
186 tempDir = tempfile.mkdtemp() | |
114 | 187 |
115 # try: | 188 # try: |
116 # cmd = 'pindel' | 189 # cmd = 'pindel' |
117 # cmd += ' -i1 %s -i2 %s -o %s ' % (args.input1, args.input2, args.output1) | 190 # cmd += ' -i1 %s -i2 %s -o %s ' %(args.input1, args.input2, args.output1) |
118 # except: | 191 # except: |
119 # sys.stdout.write( 'problem args : %s' % (cmd) ) | 192 # sys.stdout.write('problem args : %s' %(cmd)) |
120 # try: | 193 # try: |
121 # os.system(cmd) | 194 # os.system(cmd) |
122 # except: | 195 # except: |
123 # sys.stdout.write( 'problem cmd' ) | 196 # sys.stdout.write('problem cmd') |
124 # pipeline(args.reference, args.bam, args.output_raw, args.output_vcf, size, name, args) | 197 # pipeline(args.reference, args.bam, args.output_raw, args.outputVcfFile, size, name, args) |
125 | |
126 reference = args.input_reference | |
127 bam = args.input_bam | |
128 temp = args.output_vcf+"_temp/temp" | |
129 output_vcf = args.output_vcf | |
130 | 198 |
131 try: | 199 try: |
132 index( args ) | 200 indexBam(args.inputFastaFile, args.inputBamFile) |
133 conf = config( args ) | 201 meanInsertSize = int(getMeanInsertSize(args.inputBamFile)) |
134 | 202 configFile = config(args.inputBamFile, meanInsertSize, args.name, tempDir) |
135 pindel( reference, conf, temp, args ) | 203 pindelTempDir = pindel(args.inputFastaFile, configFile, args, tempDir) |
136 pindel2vcf(args) | 204 pindel2vcf(args.outputVcfFile, args.inputFastaFile, args.name, pindelTempDir) |
137 | 205 |
138 | |
139 finally: | 206 finally: |
140 if os.path.exists( tmp_dir ): | 207 if os.path.exists(tempDir): |
141 shutil.rmtree( tmp_dir ) | 208 shutil.rmtree(tempDir) |
142 | 209 |
143 if __name__=="__main__": | 210 if __name__=="__main__": |
144 __main__() | 211 __main__() |