0
|
1
|
|
2 import argparse, optparse, os, shutil, subprocess, sys, tempfile, time, shlex
|
|
3
|
|
4 parser = argparse.ArgumentParser(description='')
|
8
|
5 parser.add_argument('-r', dest='inputFastaFile', required=True, help='the reference file')
|
|
6 parser.add_argument('-b', dest='inputBamFile', required=True, help='the bam file')
|
0
|
7 parser.add_argument('-s', dest='insert_size', type=int, default='500', required=True, help='the insert size')
|
|
8 parser.add_argument('-n', dest='sampleTag', default='sample', required=False, help='the sample tag')
|
8
|
9 # parser.add_argument('-o1', dest='outputRaw', help='the output raw', required=True)
|
|
10 parser.add_argument('-o2', dest='outputVcfFile', help='the output vcf', required=True)
|
|
11 parser.add_argument('--name', dest='name', default='name')
|
0
|
12 parser.add_argument('--number_of_threads', dest='number_of_threads', type=int, default='1')
|
|
13 parser.add_argument('--window_size', dest='window_size', type=int, default='5')
|
|
14 parser.add_argument('--sequencing_error_rate', dest='sequencing_error_rate', type=float, default='0.01')
|
|
15 parser.add_argument('--sensitivity', dest='sensitivity', default='0.95', type=float)
|
|
16 parser.add_argument('--report_long_insertions', dest='report_long_insertions', action='store_true', default=False)
|
|
17 parser.add_argument('--report_duplications', dest='report_duplications', action='store_true', default=False)
|
|
18 parser.add_argument('--report_inversions', dest='report_inversions', action='store_true', default=False)
|
|
19 parser.add_argument('--report_breakpoints', dest='report_breakpoints', action='store_true', default=False)
|
|
20
|
9
|
21 parser.add_argument('-u', '--maximum_allowed_mismatch_rate', dest='maximum_allowed_mismatch_rate', type=float, default='0.02')
|
8
|
22 parser.add_argument('--report_close_mapped_reads', dest='report_close_mapped_reads', action='store_true', default=False)
|
|
23 parser.add_argument('--report_only_close_mapped_reads', dest='report_only_close_mapped_reads', action='store_true', default=False)
|
|
24 parser.add_argument('--report_interchromosomal_events', dest='report_interchromosomal_events', action='store_true', default=False)
|
|
25 parser.add_argument('--IndelCorrection', dest='IndelCorrection', action='store_true', default=False)
|
|
26 parser.add_argument('--NormalSamples', dest='NormalSamples', action='store_true', default=False)
|
9
|
27 parser.add_argument('-a', '--additional_mismatch', dest='additional_mismatch', type=int, default='1')
|
|
28 parser.add_argument('-m', '--min_perfect_match_around_BP', dest='min_perfect_match_around_BP', type=int, default='3')
|
|
29 parser.add_argument('-v', '--min_inversion_size', dest='min_inversion_size', type=int, default='50')
|
|
30 parser.add_argument('-d', '--min_num_matched_bases', dest='min_num_matched_bases', type=int, default='30')
|
|
31 parser.add_argument('-B', '--balance_cutoff', dest='balance_cutoff', type=int, default='0')
|
|
32 parser.add_argument('-A', '--anchor_quality', dest='anchor_quality', type=int, default='0')
|
|
33 parser.add_argument('-M', '--minimum_support_for_event', dest='minimum_support_for_event', type=int, default='3')
|
8
|
34 parser.add_argument('--detect_DD', dest='detect_DD', action='store_true', default=False)
|
|
35 parser.add_argument('--MAX_DD_BREAKPOINT_DISTANCE', dest='MAX_DD_BREAKPOINT_DISTANCE', type=int, default='350')
|
|
36 parser.add_argument('--MAX_DISTANCE_CLUSTER_READS', dest='MAX_DISTANCE_CLUSTER_READS', type=int, default='100')
|
|
37 parser.add_argument('--MIN_DD_CLUSTER_SIZE', dest='MIN_DD_CLUSTER_SIZE', type=int, default='3')
|
|
38 parser.add_argument('--MIN_DD_BREAKPOINT_SUPPORT', dest='MIN_DD_BREAKPOINT_SUPPORT', type=int, default='3')
|
|
39 parser.add_argument('--MIN_DD_MAP_DISTANCE', dest='MIN_DD_MAP_DISTANCE', type=int, default='8000')
|
|
40 parser.add_argument('--DD_REPORT_DUPLICATION_READS', dest='DD_REPORT_DUPLICATION_READS', action='store_true', default=False)
|
0
|
41
|
|
42
|
8
|
43 # def execute(cmd, output=""):
|
|
44 # try:
|
|
45 # err = open(tempDir+"/errorLog", 'a')
|
|
46 # if output != "":
|
|
47 # out = open(output, 'w')
|
|
48 # else:
|
|
49 # out = subprocess.PIPE
|
|
50 # process = subprocess.Popen(args=shlex.split(cmd), stdout=out, stderr=err)
|
|
51 # process.wait()
|
|
52 # err.close()
|
|
53 # if out != subprocess.PIPE:
|
|
54 # out.close()
|
|
55 # except Exception, e:
|
|
56 # sys.stderr.write("problem doing : %s\n" %(cmd))
|
|
57 # sys.stderr.write('%s\n\n' % str(e))
|
|
58
|
|
59 def execute(cmd, output=None):
|
|
60 import subprocess, sys, shlex
|
|
61 # function to execute a cmd and report if an error occur
|
|
62 print(cmd)
|
0
|
63 try:
|
8
|
64 process = subprocess.Popen(args=shlex.split(cmd), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
0
|
65 process.wait()
|
8
|
66 stdout,stderr = process.communicate()
|
|
67 except Exception, e: # une erreur de ma commande : stderr
|
|
68 sys.stderr.write("problem doing : %s\n%s\n" %(cmd, e))
|
|
69 return
|
|
70 if output:
|
|
71 output = open(output, 'w')
|
|
72 output.write(stdout)
|
|
73 output.close()
|
|
74 if stderr != '': # une erreur interne au programme : stdout (sinon, souvent des warning arrete les programmes)
|
|
75 sys.stdout.write("warning or error while doing : %s\n-----\n%s-----\n\n" %(cmd, stderr))
|
0
|
76
|
|
77
|
8
|
78 def indexBam(inputFastaFile, inputBamFile):
|
10
|
79 cmd = "samtools index %s" %(inputBamFile)
|
0
|
80 execute(cmd)
|
8
|
81 cmd = "samtools faidx %s" %(inputFastaFile)
|
0
|
82 execute(cmd)
|
|
83
|
|
84
|
8
|
85 def config(inputBamFile, meanInsertSize, name, tempDir):
|
|
86 configFile = tempDir+"/pindel_configFile"
|
|
87 # cmd = 'echo %s\t%s\t%s > %s' %(bam, size, name, configFile)
|
0
|
88 # print("commande = "+cmd)
|
|
89 # os.system(cmd)
|
8
|
90 fil = open(configFile, 'w')
|
|
91 fil.write("%s\t%s\t%s" %(inputBamFile, meanInsertSize, name))
|
0
|
92 fil.close()
|
8
|
93 return configFile
|
0
|
94
|
|
95
|
8
|
96 # def pindel(reference, configFile, output):
|
0
|
97 # print("pindel")
|
8
|
98 # cmd="pindel -f %s -i %s -o %s" %(reference, configFile, output)
|
0
|
99 # print("commande = "+cmd)
|
|
100 # os.system(cmd)
|
|
101
|
8
|
102 # os.system("rm %s" %(output))
|
0
|
103 # if not os.path.exists(output):
|
|
104 # os.makedirs(output)
|
8
|
105 # os.system("mv %s_* %s" %(output, output))
|
0
|
106
|
|
107
|
8
|
108 def pindel(reference, configFile, args, tempDir):
|
|
109 pindelTempDir=tempDir+"/pindel"
|
|
110 cmd = "pindel -f %s -i %s -o %s " %(reference, configFile, pindelTempDir)
|
|
111 cmd += " --number_of_threads %d --window_size %d --sequencing_error_rate %f --sensitivity %f" %(args.number_of_threads, args.window_size, args.sequencing_error_rate, args.sensitivity)
|
|
112 cmd += " -u %f -n %d -a %d -m %d -v %d -d %d -B %d -A %d -M %d " %(args.maximum_allowed_mismatch_rate, args.NM, args.additional_mismatch, args.min_perfect_match_around_BP, args.min_inversion_size, args.min_num_matched_bases, args.balance_cutoff, args.anchor_quality, args.minimum_support_for_event)
|
|
113
|
0
|
114 if args.report_long_insertions:
|
|
115 cmd += ' --report_long_insertions '
|
|
116 if args.report_duplications:
|
|
117 cmd += ' --report_duplications '
|
|
118 if args.report_inversions:
|
|
119 cmd += ' --report_inversions '
|
|
120 if args.report_breakpoints:
|
|
121 cmd += ' --report_breakpoints '
|
8
|
122
|
|
123 if args.report_close_mapped_reads:
|
|
124 cmd += ' --report_close_mapped_reads '
|
|
125 if args.report_only_close_mapped_reads:
|
|
126 cmd += ' --report_only_close_mapped_reads '
|
|
127 if args.report_interchromosomal_events:
|
|
128 cmd += ' --report_interchromosomal_events '
|
|
129 if args.IndelCorrection:
|
|
130 cmd += ' --IndelCorrection '
|
|
131 if args.NormalSamples:
|
|
132 cmd += ' --NormalSamples '
|
|
133 if args.input_SV_Calls_for_assembly:
|
|
134 cmd += ' --input_SV_Calls_for_assembly %s ' %(args.input_SV_Calls_for_assembly)
|
|
135
|
|
136 if args.detect_DD:
|
|
137 cmd += ' -q '
|
|
138 cmd += ' --MAX_DD_BREAKPOINT_DISTANCE '+str(args.MAX_DD_BREAKPOINT_DISTANCE)
|
|
139 cmd += ' --MAX_DISTANCE_CLUSTER_READS '+str(args.MAX_DISTANCE_CLUSTER_READS)
|
|
140 cmd += ' --MIN_DD_CLUSTER_SIZE '+str(args.MIN_DD_CLUSTER_SIZE)
|
|
141 cmd += ' --MIN_DD_BREAKPOINT_SUPPORT '+str(args.MIN_DD_BREAKPOINT_SUPPORT)
|
|
142 cmd += ' --MIN_DD_MAP_DISTANCE '+str(args.MIN_DD_MAP_DISTANCE)
|
|
143 if args.DD_REPORT_DUPLICATION_READS:
|
|
144 cmd += ' --DD_REPORT_DUPLICATION_READS '
|
|
145
|
|
146 execute(cmd)
|
|
147 return pindelTempDir
|
0
|
148
|
|
149
|
8
|
150 def move(avant, apres):
|
0
|
151 if os.path.exists(avant):
|
|
152 execute("mv %s %s" %(avant, apres))
|
|
153
|
|
154
|
8
|
155 def pindel2vcf(outputVcf, inputFastaFile, name, pindelTempDir):
|
0
|
156 date = str(time.strftime('%d/%m/%y',time.localtime()))
|
8
|
157 cmd = "pindel2vcf -P %s -r %s -R %s -d %s" %(pindelTempDir, inputFastaFile, name, date)
|
|
158 print("commande = " + cmd)
|
0
|
159 execute(cmd)
|
8
|
160 if os.path.exists(pindelTempDir+".vcf"):
|
|
161 execute("cp %s %s" %(pindelTempDir+".vcf", outputVcf))
|
|
162
|
|
163
|
|
164 def getMeanInsertSize(bamFile, tempDir):
|
|
165 samFile = tempDir+"/sam"
|
|
166 histogramFile = tempDir+"/histogram"
|
|
167 outputFile = tempDir+"/output"
|
11
|
168 cmd = "samtools view -h -o %s %s" % (bamFile, samFile)
|
|
169 execute(cmd, )
|
8
|
170 cmd = "picard-tools CollectInsertSizeMetrics H=%s O=%s I=%s" % (histogramFile, outputFile, samFile)
|
|
171 execute(cmd)
|
|
172 try:
|
|
173 f = open(outputFile, 'r')
|
|
174 lignes = f.readlines()
|
|
175 f.close()
|
|
176 meanInsertSize = lignes[7].split('\t')[0]
|
|
177 except:
|
|
178 sys.stderr.write("problem while opening file %s " %(outputFile))
|
|
179 return 0
|
|
180 return meanInsertSize
|
0
|
181
|
|
182
|
|
183 def __main__():
|
|
184
|
|
185 args = parser.parse_args()
|
8
|
186 tempDir = tempfile.mkdtemp()
|
0
|
187
|
|
188 # try:
|
|
189 # cmd = 'pindel'
|
8
|
190 # cmd += ' -i1 %s -i2 %s -o %s ' %(args.input1, args.input2, args.output1)
|
0
|
191 # except:
|
8
|
192 # sys.stdout.write('problem args : %s' %(cmd))
|
0
|
193 # try:
|
|
194 # os.system(cmd)
|
|
195 # except:
|
8
|
196 # sys.stdout.write('problem cmd')
|
|
197 # pipeline(args.reference, args.bam, args.output_raw, args.outputVcfFile, size, name, args)
|
0
|
198
|
|
199 try:
|
8
|
200 indexBam(args.inputFastaFile, args.inputBamFile)
|
10
|
201 meanInsertSize = int(getMeanInsertSize(args.inputBamFile, tempDir))
|
8
|
202 configFile = config(args.inputBamFile, meanInsertSize, args.name, tempDir)
|
|
203 pindelTempDir = pindel(args.inputFastaFile, configFile, args, tempDir)
|
|
204 pindel2vcf(args.outputVcfFile, args.inputFastaFile, args.name, pindelTempDir)
|
0
|
205
|
|
206 finally:
|
8
|
207 if os.path.exists(tempDir):
|
|
208 shutil.rmtree(tempDir)
|
0
|
209
|
|
210 if __name__=="__main__":
|
|
211 __main__()
|