changeset 8:de8a643a41e3 draft

Uploaded
author jeremie
date Mon, 07 Jul 2014 05:05:50 -0400
parents 74009865b3ad
children 844ad5ce74cc
files pindel.py
diffstat 1 files changed, 138 insertions(+), 71 deletions(-) [+]
line wrap: on
line diff
--- a/pindel.py	Mon Jul 07 05:03:23 2014 -0400
+++ b/pindel.py	Mon Jul 07 05:05:50 2014 -0400
@@ -2,12 +2,13 @@
 import argparse, optparse, os, shutil, subprocess, sys, tempfile, time, shlex
 
 parser = argparse.ArgumentParser(description='')
-parser.add_argument('-r', dest='input_reference', required=True, help='the reference file')
-parser.add_argument('-b', dest='input_bam', required=True, help='the bam file')
+parser.add_argument('-r', dest='inputFastaFile', required=True, help='the reference file')
+parser.add_argument('-b', dest='inputBamFile', required=True, help='the bam file')
 parser.add_argument('-s', dest='insert_size', type=int, default='500', required=True, help='the insert size')
 parser.add_argument('-n', dest='sampleTag', default='sample', required=False, help='the sample tag')
-parser.add_argument('-o', dest='output_vcf', help='the output vcf', required=True)
-parser.add_argument('--name', dest='name', default='defaults')
+# parser.add_argument('-o1', dest='outputRaw', help='the output raw', required=True)
+parser.add_argument('-o2', dest='outputVcfFile', help='the output vcf', required=True)
+parser.add_argument('--name', dest='name', default='name')
 parser.add_argument('--number_of_threads', dest='number_of_threads', type=int, default='1')
 parser.add_argument('--window_size', dest='window_size', type=int, default='5')
 parser.add_argument('--sequencing_error_rate', dest='sequencing_error_rate', type=float, default='0.01')
@@ -17,67 +18,99 @@
 parser.add_argument('--report_inversions', dest='report_inversions', action='store_true', default=False)
 parser.add_argument('--report_breakpoints', dest='report_breakpoints', action='store_true', default=False)
 
-tmp_dir = tempfile.mkdtemp()
-pindel_dir=tmp_dir+"/pindel"
-errorFile = tmp_dir+"/errorLog"
+parser.add_argument('--maximum_allowed_mismatch_rate', dest='maximum_allowed_mismatch_rate', type=float, default='0.02')
+parser.add_argument('--report_close_mapped_reads', dest='report_close_mapped_reads', action='store_true', default=False)
+parser.add_argument('--report_only_close_mapped_reads', dest='report_only_close_mapped_reads', action='store_true', default=False)
+parser.add_argument('--report_interchromosomal_events', dest='report_interchromosomal_events', action='store_true', default=False)
+parser.add_argument('--IndelCorrection', dest='IndelCorrection', action='store_true', default=False)
+parser.add_argument('--NormalSamples', dest='NormalSamples', action='store_true', default=False)
+parser.add_argument('--additional_mismatch', dest='additional_mismatch', type=int, default='1')
+parser.add_argument('--min_perfect_match_around_BP', dest='min_perfect_match_around_BP', type=int, default='3')
+parser.add_argument('--min_inversion_size', dest='min_inversion_size', type=int, default='50')
+parser.add_argument('--min_num_matched_bases', dest='min_num_matched_bases', type=int, default='30')
+parser.add_argument('--balance_cutoff', dest='balance_cutoff', type=int, default='0')
+parser.add_argument('--anchor_quality', dest='anchor_quality', type=int, default='0')
+parser.add_argument('--minimum_support_for_event', dest='minimum_support_for_event', type=int, default='3')
+parser.add_argument('--detect_DD', dest='detect_DD', action='store_true', default=False)
+parser.add_argument('--MAX_DD_BREAKPOINT_DISTANCE', dest='MAX_DD_BREAKPOINT_DISTANCE', type=int, default='350')
+parser.add_argument('--MAX_DISTANCE_CLUSTER_READS', dest='MAX_DISTANCE_CLUSTER_READS', type=int, default='100')
+parser.add_argument('--MIN_DD_CLUSTER_SIZE', dest='MIN_DD_CLUSTER_SIZE', type=int, default='3')
+parser.add_argument('--MIN_DD_BREAKPOINT_SUPPORT', dest='MIN_DD_BREAKPOINT_SUPPORT', type=int, default='3')
+parser.add_argument('--MIN_DD_MAP_DISTANCE', dest='MIN_DD_MAP_DISTANCE', type=int, default='8000')
+parser.add_argument('--DD_REPORT_DUPLICATION_READS', dest='DD_REPORT_DUPLICATION_READS', action='store_true', default=False)
 
 
-def execute( cmd, output="" ):
+# def execute(cmd, output=""):
+# 	try: 
+# 		err = open(tempDir+"/errorLog", 'a')
+# 		if output != "":
+# 			out = open(output, 'w')
+# 		else:
+# 			out = subprocess.PIPE
+# 		process = subprocess.Popen(args=shlex.split(cmd), stdout=out, stderr=err)
+# 		process.wait()
+# 		err.close()
+# 		if out != subprocess.PIPE:
+# 			out.close()
+# 	except Exception, e:
+# 		sys.stderr.write("problem doing : %s\n" %(cmd))
+# 		sys.stderr.write('%s\n\n' % str(e))
+
+def execute(cmd, output=None):
+	import subprocess, sys, shlex
+	# function to execute a cmd and report if an error occur
+	print(cmd)
 	try: 
-		err = open(tmp_dir+"/errorLog", 'a')
-		if output != "":
-			out = open(output, 'w')
-		else:
-			out = subprocess.PIPE
-		process = subprocess.Popen( args=shlex.split(cmd), stdout=out, stderr=err )
+		process = subprocess.Popen(args=shlex.split(cmd), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 		process.wait()
-		err.close()
-		if out != subprocess.PIPE:
-			out.close()
-	except Exception, e:
-		sys.stderr.write("problem doing : %s\n" %(cmd))
-		sys.stderr.write( '%s\n\n' % str(e) )
+		stdout,stderr = process.communicate()
+	except Exception, e: # une erreur de ma commande : stderr
+		sys.stderr.write("problem doing : %s\n%s\n" %(cmd, e))
+		return
+	if output:
+		output = open(output, 'w')
+		output.write(stdout)
+		output.close()
+	if stderr != '': # une erreur interne au programme : stdout (sinon, souvent des warning arrete les programmes)
+		sys.stdout.write("warning or error while doing : %s\n-----\n%s-----\n\n" %(cmd, stderr))
 
 
-def index (args):
-	cmd = "samtools index %s" % (args.input_bam)
-	# print( "commande = "+cmd )
-	# os.system(cmd)
+def indexBam(inputFastaFile, inputBamFile):
+	cmd = "samtools indexBam %s" %(inputBamFile)
 	execute(cmd)
-
-	cmd = "samtools faidx %s" % (args.input_reference)
-	# print( "commande = "+cmd )
-	# os.system(cmd)
+	cmd = "samtools faidx %s" %(inputFastaFile)
 	execute(cmd)
 
 
-def config( args ):
-	# print("config")
-	config = tmp_dir+"/pindel_config"
-	# cmd = 'echo %s\t%s\t%s > %s' % (bam, size, name, config)
+def config(inputBamFile, meanInsertSize, name, tempDir):
+	configFile = tempDir+"/pindel_configFile"
+	# cmd = 'echo %s\t%s\t%s > %s' %(bam, size, name, configFile)
 	# print("commande = "+cmd)
 	# os.system(cmd)
-	fil = open(config, 'w')
-	fil.write("%s\t%s\t%s" % (args.input_bam, args.insert_size, args.name) )
+	fil = open(configFile, 'w')
+	fil.write("%s\t%s\t%s" %(inputBamFile, meanInsertSize, name))
 	fil.close()
-	return config
+	return configFile
 
 
-# def pindel( reference, config, output ):
+# def pindel(reference, configFile, output):
 # 	print("pindel")
-# 	cmd="pindel -f %s -i %s -o %s" % (reference, config, output)
+# 	cmd="pindel -f %s -i %s -o %s" %(reference, configFile, output)
 # 	print("commande = "+cmd)
 # 	os.system(cmd)
 
-# 	os.system("rm %s" % (output))
+# 	os.system("rm %s" %(output))
 # 	if not os.path.exists(output):
 # 		os.makedirs(output)
-# 	os.system("mv %s_* %s" %(output, output) )
+# 	os.system("mv %s_* %s" %(output, output))
 
 
-def pindel( reference, config, temp, args ):
-	cmd = "pindel -f %s -i %s -o %s " % (reference, config, pindel_dir)
-	cmd += "--number_of_threads %d --window_size %d --sequencing_error_rate %f --sensitivity %f" % (args.number_of_threads, args.window_size, args.sequencing_error_rate, args.sensitivity)
+def pindel(reference, configFile, args, tempDir):
+	pindelTempDir=tempDir+"/pindel" 
+	cmd = "pindel -f %s -i %s -o %s " %(reference, configFile, pindelTempDir)
+	cmd += " --number_of_threads %d --window_size %d --sequencing_error_rate %f --sensitivity %f" %(args.number_of_threads, args.window_size, args.sequencing_error_rate, args.sensitivity)
+	cmd += " -u %f -n %d -a %d -m %d -v %d -d %d -B %d -A %d -M %d " %(args.maximum_allowed_mismatch_rate, args.NM, args.additional_mismatch, args.min_perfect_match_around_BP, args.min_inversion_size, args.min_num_matched_bases, args.balance_cutoff, args.anchor_quality, args.minimum_support_for_event)
+
 	if args.report_long_insertions:
 		cmd += ' --report_long_insertions '
 	if args.report_duplications:
@@ -86,59 +119,93 @@
 		cmd += ' --report_inversions '
 	if args.report_breakpoints:
 		cmd += ' --report_breakpoints '
-	print("commande = "+cmd)
-	execute( cmd )
+
+	if args.report_close_mapped_reads:
+		cmd += ' --report_close_mapped_reads '
+	if args.report_only_close_mapped_reads:
+		cmd += ' --report_only_close_mapped_reads '
+	if args.report_interchromosomal_events:
+		cmd += ' --report_interchromosomal_events '
+	if args.IndelCorrection:
+		cmd += ' --IndelCorrection '
+	if args.NormalSamples:
+		cmd += ' --NormalSamples '
+	if args.input_SV_Calls_for_assembly:
+		cmd += ' --input_SV_Calls_for_assembly %s ' %(args.input_SV_Calls_for_assembly)
+
+	if args.detect_DD:
+		cmd += ' -q '
+		cmd += ' --MAX_DD_BREAKPOINT_DISTANCE '+str(args.MAX_DD_BREAKPOINT_DISTANCE)
+		cmd += ' --MAX_DISTANCE_CLUSTER_READS '+str(args.MAX_DISTANCE_CLUSTER_READS)
+		cmd += ' --MIN_DD_CLUSTER_SIZE '+str(args.MIN_DD_CLUSTER_SIZE)
+		cmd += ' --MIN_DD_BREAKPOINT_SUPPORT '+str(args.MIN_DD_BREAKPOINT_SUPPORT)
+		cmd += ' --MIN_DD_MAP_DISTANCE '+str(args.MIN_DD_MAP_DISTANCE)
+		if args.DD_REPORT_DUPLICATION_READS:
+			cmd += ' --DD_REPORT_DUPLICATION_READS '
+
+	execute(cmd)
+	return pindelTempDir
 
 
-
-
-
-def move( avant, apres ):
+def move(avant, apres):
 	if os.path.exists(avant):
 		execute("mv %s %s" %(avant, apres))
 
 
-def pindel2vcf( args ):
+def pindel2vcf(outputVcf, inputFastaFile, name, pindelTempDir):
 	date = str(time.strftime('%d/%m/%y',time.localtime()))
-	name = "test"
-	cmd = "pindel2vcf -P %s -r %s -R %s -d %s" % (pindel_dir, args.input_reference, name, date )
-	print ("commande = " + cmd)
+	cmd = "pindel2vcf -P %s -r %s -R %s -d %s" %(pindelTempDir, inputFastaFile, name, date)
+	print("commande = " + cmd)
 	execute(cmd)
-	if os.path.exists(pindel_dir+".vcf"):
-		execute("cp %s %s" %(pindel_dir+".vcf", args.output_vcf))
+	if os.path.exists(pindelTempDir+".vcf"):
+		execute("cp %s %s" %(pindelTempDir+".vcf", outputVcf))
+
+
+def getMeanInsertSize(bamFile, tempDir):
+	samFile = tempDir+"/sam"
+	histogramFile = tempDir+"/histogram"
+	outputFile = tempDir+"/output"
+	cmd = "samtools view -h %s %s" % (bamFile, samFile)
+	execute(cmd)
+	cmd = "picard-tools CollectInsertSizeMetrics H=%s O=%s I=%s" % (histogramFile, outputFile, samFile)
+	execute(cmd)
+	try:
+		f = open(outputFile, 'r')
+		lignes = f.readlines()
+		f.close()
+		meanInsertSize = lignes[7].split('\t')[0]
+	except:
+		sys.stderr.write("problem while opening file %s " %(outputFile))
+		return 0
+	return meanInsertSize
 
 
 def __main__():
 	
 	args = parser.parse_args()
+	tempDir = tempfile.mkdtemp()
 
 	# try:
 	# 	cmd = 'pindel'
-	# 	cmd += ' -i1 %s -i2 %s -o %s ' % (args.input1, args.input2, args.output1)
+	# 	cmd += ' -i1 %s -i2 %s -o %s ' %(args.input1, args.input2, args.output1)
 	# except:
-	# 	sys.stdout.write( 'problem args : %s' % (cmd) )
+	# 	sys.stdout.write('problem args : %s' %(cmd))
 	# try:
 	# 	os.system(cmd)
 	# except:
-	# 	sys.stdout.write( 'problem cmd' )
-	# pipeline(args.reference, args.bam, args.output_raw, args.output_vcf, size, name, args)
-
-	reference = args.input_reference
-	bam = args.input_bam
-	temp = args.output_vcf+"_temp/temp"
-	output_vcf = args.output_vcf
+	# 	sys.stdout.write('problem cmd')
+	# pipeline(args.reference, args.bam, args.output_raw, args.outputVcfFile, size, name, args)
 	
 	try:
-		index( args )
-		conf = config( args )
-
-		pindel( reference, conf, temp, args )
-		pindel2vcf(args)
+		indexBam(args.inputFastaFile, args.inputBamFile)
+		meanInsertSize = int(getMeanInsertSize(args.inputBamFile))
+		configFile = config(args.inputBamFile, meanInsertSize, args.name, tempDir)
+		pindelTempDir = pindel(args.inputFastaFile, configFile, args, tempDir)
+		pindel2vcf(args.outputVcfFile, args.inputFastaFile, args.name, pindelTempDir)
 		
-
 	finally:
-		if os.path.exists( tmp_dir ):
-			shutil.rmtree( tmp_dir )
+		if os.path.exists(tempDir):
+			shutil.rmtree(tempDir)
 
 if __name__=="__main__":
 	__main__()