comparison pindel.py @ 8:de8a643a41e3 draft

Uploaded
author jeremie
date Mon, 07 Jul 2014 05:05:50 -0400
parents 544c5266fc0f
children 844ad5ce74cc
comparison
equal deleted inserted replaced
7:74009865b3ad 8:de8a643a41e3
1 1
2 import argparse, optparse, os, shutil, subprocess, sys, tempfile, time, shlex 2 import argparse, optparse, os, shutil, subprocess, sys, tempfile, time, shlex
3 3
4 parser = argparse.ArgumentParser(description='') 4 parser = argparse.ArgumentParser(description='')
5 parser.add_argument('-r', dest='input_reference', required=True, help='the reference file') 5 parser.add_argument('-r', dest='inputFastaFile', required=True, help='the reference file')
6 parser.add_argument('-b', dest='input_bam', required=True, help='the bam file') 6 parser.add_argument('-b', dest='inputBamFile', required=True, help='the bam file')
7 parser.add_argument('-s', dest='insert_size', type=int, default='500', required=True, help='the insert size') 7 parser.add_argument('-s', dest='insert_size', type=int, default='500', required=True, help='the insert size')
8 parser.add_argument('-n', dest='sampleTag', default='sample', required=False, help='the sample tag') 8 parser.add_argument('-n', dest='sampleTag', default='sample', required=False, help='the sample tag')
9 parser.add_argument('-o', dest='output_vcf', help='the output vcf', required=True) 9 # parser.add_argument('-o1', dest='outputRaw', help='the output raw', required=True)
10 parser.add_argument('--name', dest='name', default='defaults') 10 parser.add_argument('-o2', dest='outputVcfFile', help='the output vcf', required=True)
11 parser.add_argument('--name', dest='name', default='name')
11 parser.add_argument('--number_of_threads', dest='number_of_threads', type=int, default='1') 12 parser.add_argument('--number_of_threads', dest='number_of_threads', type=int, default='1')
12 parser.add_argument('--window_size', dest='window_size', type=int, default='5') 13 parser.add_argument('--window_size', dest='window_size', type=int, default='5')
13 parser.add_argument('--sequencing_error_rate', dest='sequencing_error_rate', type=float, default='0.01') 14 parser.add_argument('--sequencing_error_rate', dest='sequencing_error_rate', type=float, default='0.01')
14 parser.add_argument('--sensitivity', dest='sensitivity', default='0.95', type=float) 15 parser.add_argument('--sensitivity', dest='sensitivity', default='0.95', type=float)
15 parser.add_argument('--report_long_insertions', dest='report_long_insertions', action='store_true', default=False) 16 parser.add_argument('--report_long_insertions', dest='report_long_insertions', action='store_true', default=False)
16 parser.add_argument('--report_duplications', dest='report_duplications', action='store_true', default=False) 17 parser.add_argument('--report_duplications', dest='report_duplications', action='store_true', default=False)
17 parser.add_argument('--report_inversions', dest='report_inversions', action='store_true', default=False) 18 parser.add_argument('--report_inversions', dest='report_inversions', action='store_true', default=False)
18 parser.add_argument('--report_breakpoints', dest='report_breakpoints', action='store_true', default=False) 19 parser.add_argument('--report_breakpoints', dest='report_breakpoints', action='store_true', default=False)
19 20
20 tmp_dir = tempfile.mkdtemp() 21 parser.add_argument('--maximum_allowed_mismatch_rate', dest='maximum_allowed_mismatch_rate', type=float, default='0.02')
21 pindel_dir=tmp_dir+"/pindel" 22 parser.add_argument('--report_close_mapped_reads', dest='report_close_mapped_reads', action='store_true', default=False)
22 errorFile = tmp_dir+"/errorLog" 23 parser.add_argument('--report_only_close_mapped_reads', dest='report_only_close_mapped_reads', action='store_true', default=False)
23 24 parser.add_argument('--report_interchromosomal_events', dest='report_interchromosomal_events', action='store_true', default=False)
24 25 parser.add_argument('--IndelCorrection', dest='IndelCorrection', action='store_true', default=False)
25 def execute( cmd, output="" ): 26 parser.add_argument('--NormalSamples', dest='NormalSamples', action='store_true', default=False)
27 parser.add_argument('--additional_mismatch', dest='additional_mismatch', type=int, default='1')
28 parser.add_argument('--min_perfect_match_around_BP', dest='min_perfect_match_around_BP', type=int, default='3')
29 parser.add_argument('--min_inversion_size', dest='min_inversion_size', type=int, default='50')
30 parser.add_argument('--min_num_matched_bases', dest='min_num_matched_bases', type=int, default='30')
31 parser.add_argument('--balance_cutoff', dest='balance_cutoff', type=int, default='0')
32 parser.add_argument('--anchor_quality', dest='anchor_quality', type=int, default='0')
33 parser.add_argument('--minimum_support_for_event', dest='minimum_support_for_event', type=int, default='3')
34 parser.add_argument('--detect_DD', dest='detect_DD', action='store_true', default=False)
35 parser.add_argument('--MAX_DD_BREAKPOINT_DISTANCE', dest='MAX_DD_BREAKPOINT_DISTANCE', type=int, default='350')
36 parser.add_argument('--MAX_DISTANCE_CLUSTER_READS', dest='MAX_DISTANCE_CLUSTER_READS', type=int, default='100')
37 parser.add_argument('--MIN_DD_CLUSTER_SIZE', dest='MIN_DD_CLUSTER_SIZE', type=int, default='3')
38 parser.add_argument('--MIN_DD_BREAKPOINT_SUPPORT', dest='MIN_DD_BREAKPOINT_SUPPORT', type=int, default='3')
39 parser.add_argument('--MIN_DD_MAP_DISTANCE', dest='MIN_DD_MAP_DISTANCE', type=int, default='8000')
40 parser.add_argument('--DD_REPORT_DUPLICATION_READS', dest='DD_REPORT_DUPLICATION_READS', action='store_true', default=False)
41
42
43 # def execute(cmd, output=""):
44 # try:
45 # err = open(tempDir+"/errorLog", 'a')
46 # if output != "":
47 # out = open(output, 'w')
48 # else:
49 # out = subprocess.PIPE
50 # process = subprocess.Popen(args=shlex.split(cmd), stdout=out, stderr=err)
51 # process.wait()
52 # err.close()
53 # if out != subprocess.PIPE:
54 # out.close()
55 # except Exception, e:
56 # sys.stderr.write("problem doing : %s\n" %(cmd))
57 # sys.stderr.write('%s\n\n' % str(e))
58
59 def execute(cmd, output=None):
60 import subprocess, sys, shlex
61 # function to execute a cmd and report if an error occur
62 print(cmd)
26 try: 63 try:
27 err = open(tmp_dir+"/errorLog", 'a') 64 process = subprocess.Popen(args=shlex.split(cmd), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
28 if output != "":
29 out = open(output, 'w')
30 else:
31 out = subprocess.PIPE
32 process = subprocess.Popen( args=shlex.split(cmd), stdout=out, stderr=err )
33 process.wait() 65 process.wait()
34 err.close() 66 stdout,stderr = process.communicate()
35 if out != subprocess.PIPE: 67 except Exception, e: # une erreur de ma commande : stderr
36 out.close() 68 sys.stderr.write("problem doing : %s\n%s\n" %(cmd, e))
37 except Exception, e: 69 return
38 sys.stderr.write("problem doing : %s\n" %(cmd)) 70 if output:
39 sys.stderr.write( '%s\n\n' % str(e) ) 71 output = open(output, 'w')
40 72 output.write(stdout)
41 73 output.close()
42 def index (args): 74 if stderr != '': # une erreur interne au programme : stdout (sinon, souvent des warning arrete les programmes)
43 cmd = "samtools index %s" % (args.input_bam) 75 sys.stdout.write("warning or error while doing : %s\n-----\n%s-----\n\n" %(cmd, stderr))
44 # print( "commande = "+cmd ) 76
45 # os.system(cmd) 77
46 execute(cmd) 78 def indexBam(inputFastaFile, inputBamFile):
47 79 cmd = "samtools indexBam %s" %(inputBamFile)
48 cmd = "samtools faidx %s" % (args.input_reference) 80 execute(cmd)
49 # print( "commande = "+cmd ) 81 cmd = "samtools faidx %s" %(inputFastaFile)
50 # os.system(cmd) 82 execute(cmd)
51 execute(cmd) 83
52 84
53 85 def config(inputBamFile, meanInsertSize, name, tempDir):
54 def config( args ): 86 configFile = tempDir+"/pindel_configFile"
55 # print("config") 87 # cmd = 'echo %s\t%s\t%s > %s' %(bam, size, name, configFile)
56 config = tmp_dir+"/pindel_config"
57 # cmd = 'echo %s\t%s\t%s > %s' % (bam, size, name, config)
58 # print("commande = "+cmd) 88 # print("commande = "+cmd)
59 # os.system(cmd) 89 # os.system(cmd)
60 fil = open(config, 'w') 90 fil = open(configFile, 'w')
61 fil.write("%s\t%s\t%s" % (args.input_bam, args.insert_size, args.name) ) 91 fil.write("%s\t%s\t%s" %(inputBamFile, meanInsertSize, name))
62 fil.close() 92 fil.close()
63 return config 93 return configFile
64 94
65 95
66 # def pindel( reference, config, output ): 96 # def pindel(reference, configFile, output):
67 # print("pindel") 97 # print("pindel")
68 # cmd="pindel -f %s -i %s -o %s" % (reference, config, output) 98 # cmd="pindel -f %s -i %s -o %s" %(reference, configFile, output)
69 # print("commande = "+cmd) 99 # print("commande = "+cmd)
70 # os.system(cmd) 100 # os.system(cmd)
71 101
72 # os.system("rm %s" % (output)) 102 # os.system("rm %s" %(output))
73 # if not os.path.exists(output): 103 # if not os.path.exists(output):
74 # os.makedirs(output) 104 # os.makedirs(output)
75 # os.system("mv %s_* %s" %(output, output) ) 105 # os.system("mv %s_* %s" %(output, output))
76 106
77 107
78 def pindel( reference, config, temp, args ): 108 def pindel(reference, configFile, args, tempDir):
79 cmd = "pindel -f %s -i %s -o %s " % (reference, config, pindel_dir) 109 pindelTempDir=tempDir+"/pindel"
80 cmd += "--number_of_threads %d --window_size %d --sequencing_error_rate %f --sensitivity %f" % (args.number_of_threads, args.window_size, args.sequencing_error_rate, args.sensitivity) 110 cmd = "pindel -f %s -i %s -o %s " %(reference, configFile, pindelTempDir)
111 cmd += " --number_of_threads %d --window_size %d --sequencing_error_rate %f --sensitivity %f" %(args.number_of_threads, args.window_size, args.sequencing_error_rate, args.sensitivity)
112 cmd += " -u %f -n %d -a %d -m %d -v %d -d %d -B %d -A %d -M %d " %(args.maximum_allowed_mismatch_rate, args.NM, args.additional_mismatch, args.min_perfect_match_around_BP, args.min_inversion_size, args.min_num_matched_bases, args.balance_cutoff, args.anchor_quality, args.minimum_support_for_event)
113
81 if args.report_long_insertions: 114 if args.report_long_insertions:
82 cmd += ' --report_long_insertions ' 115 cmd += ' --report_long_insertions '
83 if args.report_duplications: 116 if args.report_duplications:
84 cmd += ' --report_duplications ' 117 cmd += ' --report_duplications '
85 if args.report_inversions: 118 if args.report_inversions:
86 cmd += ' --report_inversions ' 119 cmd += ' --report_inversions '
87 if args.report_breakpoints: 120 if args.report_breakpoints:
88 cmd += ' --report_breakpoints ' 121 cmd += ' --report_breakpoints '
89 print("commande = "+cmd) 122
90 execute( cmd ) 123 if args.report_close_mapped_reads:
91 124 cmd += ' --report_close_mapped_reads '
92 125 if args.report_only_close_mapped_reads:
93 126 cmd += ' --report_only_close_mapped_reads '
94 127 if args.report_interchromosomal_events:
95 128 cmd += ' --report_interchromosomal_events '
96 def move( avant, apres ): 129 if args.IndelCorrection:
130 cmd += ' --IndelCorrection '
131 if args.NormalSamples:
132 cmd += ' --NormalSamples '
133 if args.input_SV_Calls_for_assembly:
134 cmd += ' --input_SV_Calls_for_assembly %s ' %(args.input_SV_Calls_for_assembly)
135
136 if args.detect_DD:
137 cmd += ' -q '
138 cmd += ' --MAX_DD_BREAKPOINT_DISTANCE '+str(args.MAX_DD_BREAKPOINT_DISTANCE)
139 cmd += ' --MAX_DISTANCE_CLUSTER_READS '+str(args.MAX_DISTANCE_CLUSTER_READS)
140 cmd += ' --MIN_DD_CLUSTER_SIZE '+str(args.MIN_DD_CLUSTER_SIZE)
141 cmd += ' --MIN_DD_BREAKPOINT_SUPPORT '+str(args.MIN_DD_BREAKPOINT_SUPPORT)
142 cmd += ' --MIN_DD_MAP_DISTANCE '+str(args.MIN_DD_MAP_DISTANCE)
143 if args.DD_REPORT_DUPLICATION_READS:
144 cmd += ' --DD_REPORT_DUPLICATION_READS '
145
146 execute(cmd)
147 return pindelTempDir
148
149
150 def move(avant, apres):
97 if os.path.exists(avant): 151 if os.path.exists(avant):
98 execute("mv %s %s" %(avant, apres)) 152 execute("mv %s %s" %(avant, apres))
99 153
100 154
101 def pindel2vcf( args ): 155 def pindel2vcf(outputVcf, inputFastaFile, name, pindelTempDir):
102 date = str(time.strftime('%d/%m/%y',time.localtime())) 156 date = str(time.strftime('%d/%m/%y',time.localtime()))
103 name = "test" 157 cmd = "pindel2vcf -P %s -r %s -R %s -d %s" %(pindelTempDir, inputFastaFile, name, date)
104 cmd = "pindel2vcf -P %s -r %s -R %s -d %s" % (pindel_dir, args.input_reference, name, date ) 158 print("commande = " + cmd)
105 print ("commande = " + cmd) 159 execute(cmd)
106 execute(cmd) 160 if os.path.exists(pindelTempDir+".vcf"):
107 if os.path.exists(pindel_dir+".vcf"): 161 execute("cp %s %s" %(pindelTempDir+".vcf", outputVcf))
108 execute("cp %s %s" %(pindel_dir+".vcf", args.output_vcf)) 162
163
164 def getMeanInsertSize(bamFile, tempDir):
165 samFile = tempDir+"/sam"
166 histogramFile = tempDir+"/histogram"
167 outputFile = tempDir+"/output"
168 cmd = "samtools view -h %s %s" % (bamFile, samFile)
169 execute(cmd)
170 cmd = "picard-tools CollectInsertSizeMetrics H=%s O=%s I=%s" % (histogramFile, outputFile, samFile)
171 execute(cmd)
172 try:
173 f = open(outputFile, 'r')
174 lignes = f.readlines()
175 f.close()
176 meanInsertSize = lignes[7].split('\t')[0]
177 except:
178 sys.stderr.write("problem while opening file %s " %(outputFile))
179 return 0
180 return meanInsertSize
109 181
110 182
111 def __main__(): 183 def __main__():
112 184
113 args = parser.parse_args() 185 args = parser.parse_args()
186 tempDir = tempfile.mkdtemp()
114 187
115 # try: 188 # try:
116 # cmd = 'pindel' 189 # cmd = 'pindel'
117 # cmd += ' -i1 %s -i2 %s -o %s ' % (args.input1, args.input2, args.output1) 190 # cmd += ' -i1 %s -i2 %s -o %s ' %(args.input1, args.input2, args.output1)
118 # except: 191 # except:
119 # sys.stdout.write( 'problem args : %s' % (cmd) ) 192 # sys.stdout.write('problem args : %s' %(cmd))
120 # try: 193 # try:
121 # os.system(cmd) 194 # os.system(cmd)
122 # except: 195 # except:
123 # sys.stdout.write( 'problem cmd' ) 196 # sys.stdout.write('problem cmd')
124 # pipeline(args.reference, args.bam, args.output_raw, args.output_vcf, size, name, args) 197 # pipeline(args.reference, args.bam, args.output_raw, args.outputVcfFile, size, name, args)
125
126 reference = args.input_reference
127 bam = args.input_bam
128 temp = args.output_vcf+"_temp/temp"
129 output_vcf = args.output_vcf
130 198
131 try: 199 try:
132 index( args ) 200 indexBam(args.inputFastaFile, args.inputBamFile)
133 conf = config( args ) 201 meanInsertSize = int(getMeanInsertSize(args.inputBamFile))
134 202 configFile = config(args.inputBamFile, meanInsertSize, args.name, tempDir)
135 pindel( reference, conf, temp, args ) 203 pindelTempDir = pindel(args.inputFastaFile, configFile, args, tempDir)
136 pindel2vcf(args) 204 pindel2vcf(args.outputVcfFile, args.inputFastaFile, args.name, pindelTempDir)
137 205
138
139 finally: 206 finally:
140 if os.path.exists( tmp_dir ): 207 if os.path.exists(tempDir):
141 shutil.rmtree( tmp_dir ) 208 shutil.rmtree(tempDir)
142 209
143 if __name__=="__main__": 210 if __name__=="__main__":
144 __main__() 211 __main__()