Mercurial > repos > jeremie > pindel_test
changeset 8:de8a643a41e3 draft
Uploaded
author | jeremie |
---|---|
date | Mon, 07 Jul 2014 05:05:50 -0400 |
parents | 74009865b3ad |
children | 844ad5ce74cc |
files | pindel.py |
diffstat | 1 files changed, 138 insertions(+), 71 deletions(-) [+] |
line wrap: on
line diff
--- a/pindel.py Mon Jul 07 05:03:23 2014 -0400 +++ b/pindel.py Mon Jul 07 05:05:50 2014 -0400 @@ -2,12 +2,13 @@ import argparse, optparse, os, shutil, subprocess, sys, tempfile, time, shlex parser = argparse.ArgumentParser(description='') -parser.add_argument('-r', dest='input_reference', required=True, help='the reference file') -parser.add_argument('-b', dest='input_bam', required=True, help='the bam file') +parser.add_argument('-r', dest='inputFastaFile', required=True, help='the reference file') +parser.add_argument('-b', dest='inputBamFile', required=True, help='the bam file') parser.add_argument('-s', dest='insert_size', type=int, default='500', required=True, help='the insert size') parser.add_argument('-n', dest='sampleTag', default='sample', required=False, help='the sample tag') -parser.add_argument('-o', dest='output_vcf', help='the output vcf', required=True) -parser.add_argument('--name', dest='name', default='defaults') +# parser.add_argument('-o1', dest='outputRaw', help='the output raw', required=True) +parser.add_argument('-o2', dest='outputVcfFile', help='the output vcf', required=True) +parser.add_argument('--name', dest='name', default='name') parser.add_argument('--number_of_threads', dest='number_of_threads', type=int, default='1') parser.add_argument('--window_size', dest='window_size', type=int, default='5') parser.add_argument('--sequencing_error_rate', dest='sequencing_error_rate', type=float, default='0.01') @@ -17,67 +18,99 @@ parser.add_argument('--report_inversions', dest='report_inversions', action='store_true', default=False) parser.add_argument('--report_breakpoints', dest='report_breakpoints', action='store_true', default=False) -tmp_dir = tempfile.mkdtemp() -pindel_dir=tmp_dir+"/pindel" -errorFile = tmp_dir+"/errorLog" +parser.add_argument('--maximum_allowed_mismatch_rate', dest='maximum_allowed_mismatch_rate', type=float, default='0.02') +parser.add_argument('--report_close_mapped_reads', dest='report_close_mapped_reads', action='store_true', default=False) +parser.add_argument('--report_only_close_mapped_reads', dest='report_only_close_mapped_reads', action='store_true', default=False) +parser.add_argument('--report_interchromosomal_events', dest='report_interchromosomal_events', action='store_true', default=False) +parser.add_argument('--IndelCorrection', dest='IndelCorrection', action='store_true', default=False) +parser.add_argument('--NormalSamples', dest='NormalSamples', action='store_true', default=False) +parser.add_argument('--additional_mismatch', dest='additional_mismatch', type=int, default='1') +parser.add_argument('--min_perfect_match_around_BP', dest='min_perfect_match_around_BP', type=int, default='3') +parser.add_argument('--min_inversion_size', dest='min_inversion_size', type=int, default='50') +parser.add_argument('--min_num_matched_bases', dest='min_num_matched_bases', type=int, default='30') +parser.add_argument('--balance_cutoff', dest='balance_cutoff', type=int, default='0') +parser.add_argument('--anchor_quality', dest='anchor_quality', type=int, default='0') +parser.add_argument('--minimum_support_for_event', dest='minimum_support_for_event', type=int, default='3') +parser.add_argument('--detect_DD', dest='detect_DD', action='store_true', default=False) +parser.add_argument('--MAX_DD_BREAKPOINT_DISTANCE', dest='MAX_DD_BREAKPOINT_DISTANCE', type=int, default='350') +parser.add_argument('--MAX_DISTANCE_CLUSTER_READS', dest='MAX_DISTANCE_CLUSTER_READS', type=int, default='100') +parser.add_argument('--MIN_DD_CLUSTER_SIZE', dest='MIN_DD_CLUSTER_SIZE', type=int, default='3') +parser.add_argument('--MIN_DD_BREAKPOINT_SUPPORT', dest='MIN_DD_BREAKPOINT_SUPPORT', type=int, default='3') +parser.add_argument('--MIN_DD_MAP_DISTANCE', dest='MIN_DD_MAP_DISTANCE', type=int, default='8000') +parser.add_argument('--DD_REPORT_DUPLICATION_READS', dest='DD_REPORT_DUPLICATION_READS', action='store_true', default=False) -def execute( cmd, output="" ): +# def execute(cmd, output=""): +# try: +# err = open(tempDir+"/errorLog", 'a') +# if output != "": +# out = open(output, 'w') +# else: +# out = subprocess.PIPE +# process = subprocess.Popen(args=shlex.split(cmd), stdout=out, stderr=err) +# process.wait() +# err.close() +# if out != subprocess.PIPE: +# out.close() +# except Exception, e: +# sys.stderr.write("problem doing : %s\n" %(cmd)) +# sys.stderr.write('%s\n\n' % str(e)) + +def execute(cmd, output=None): + import subprocess, sys, shlex + # function to execute a cmd and report if an error occur + print(cmd) try: - err = open(tmp_dir+"/errorLog", 'a') - if output != "": - out = open(output, 'w') - else: - out = subprocess.PIPE - process = subprocess.Popen( args=shlex.split(cmd), stdout=out, stderr=err ) + process = subprocess.Popen(args=shlex.split(cmd), stdout=subprocess.PIPE, stderr=subprocess.PIPE) process.wait() - err.close() - if out != subprocess.PIPE: - out.close() - except Exception, e: - sys.stderr.write("problem doing : %s\n" %(cmd)) - sys.stderr.write( '%s\n\n' % str(e) ) + stdout,stderr = process.communicate() + except Exception, e: # une erreur de ma commande : stderr + sys.stderr.write("problem doing : %s\n%s\n" %(cmd, e)) + return + if output: + output = open(output, 'w') + output.write(stdout) + output.close() + if stderr != '': # une erreur interne au programme : stdout (sinon, souvent des warning arrete les programmes) + sys.stdout.write("warning or error while doing : %s\n-----\n%s-----\n\n" %(cmd, stderr)) -def index (args): - cmd = "samtools index %s" % (args.input_bam) - # print( "commande = "+cmd ) - # os.system(cmd) +def indexBam(inputFastaFile, inputBamFile): + cmd = "samtools indexBam %s" %(inputBamFile) execute(cmd) - - cmd = "samtools faidx %s" % (args.input_reference) - # print( "commande = "+cmd ) - # os.system(cmd) + cmd = "samtools faidx %s" %(inputFastaFile) execute(cmd) -def config( args ): - # print("config") - config = tmp_dir+"/pindel_config" - # cmd = 'echo %s\t%s\t%s > %s' % (bam, size, name, config) +def config(inputBamFile, meanInsertSize, name, tempDir): + configFile = tempDir+"/pindel_configFile" + # cmd = 'echo %s\t%s\t%s > %s' %(bam, size, name, configFile) # print("commande = "+cmd) # os.system(cmd) - fil = open(config, 'w') - fil.write("%s\t%s\t%s" % (args.input_bam, args.insert_size, args.name) ) + fil = open(configFile, 'w') + fil.write("%s\t%s\t%s" %(inputBamFile, meanInsertSize, name)) fil.close() - return config + return configFile -# def pindel( reference, config, output ): +# def pindel(reference, configFile, output): # print("pindel") -# cmd="pindel -f %s -i %s -o %s" % (reference, config, output) +# cmd="pindel -f %s -i %s -o %s" %(reference, configFile, output) # print("commande = "+cmd) # os.system(cmd) -# os.system("rm %s" % (output)) +# os.system("rm %s" %(output)) # if not os.path.exists(output): # os.makedirs(output) -# os.system("mv %s_* %s" %(output, output) ) +# os.system("mv %s_* %s" %(output, output)) -def pindel( reference, config, temp, args ): - cmd = "pindel -f %s -i %s -o %s " % (reference, config, pindel_dir) - cmd += "--number_of_threads %d --window_size %d --sequencing_error_rate %f --sensitivity %f" % (args.number_of_threads, args.window_size, args.sequencing_error_rate, args.sensitivity) +def pindel(reference, configFile, args, tempDir): + pindelTempDir=tempDir+"/pindel" + cmd = "pindel -f %s -i %s -o %s " %(reference, configFile, pindelTempDir) + cmd += " --number_of_threads %d --window_size %d --sequencing_error_rate %f --sensitivity %f" %(args.number_of_threads, args.window_size, args.sequencing_error_rate, args.sensitivity) + cmd += " -u %f -n %d -a %d -m %d -v %d -d %d -B %d -A %d -M %d " %(args.maximum_allowed_mismatch_rate, args.NM, args.additional_mismatch, args.min_perfect_match_around_BP, args.min_inversion_size, args.min_num_matched_bases, args.balance_cutoff, args.anchor_quality, args.minimum_support_for_event) + if args.report_long_insertions: cmd += ' --report_long_insertions ' if args.report_duplications: @@ -86,59 +119,93 @@ cmd += ' --report_inversions ' if args.report_breakpoints: cmd += ' --report_breakpoints ' - print("commande = "+cmd) - execute( cmd ) + + if args.report_close_mapped_reads: + cmd += ' --report_close_mapped_reads ' + if args.report_only_close_mapped_reads: + cmd += ' --report_only_close_mapped_reads ' + if args.report_interchromosomal_events: + cmd += ' --report_interchromosomal_events ' + if args.IndelCorrection: + cmd += ' --IndelCorrection ' + if args.NormalSamples: + cmd += ' --NormalSamples ' + if args.input_SV_Calls_for_assembly: + cmd += ' --input_SV_Calls_for_assembly %s ' %(args.input_SV_Calls_for_assembly) + + if args.detect_DD: + cmd += ' -q ' + cmd += ' --MAX_DD_BREAKPOINT_DISTANCE '+str(args.MAX_DD_BREAKPOINT_DISTANCE) + cmd += ' --MAX_DISTANCE_CLUSTER_READS '+str(args.MAX_DISTANCE_CLUSTER_READS) + cmd += ' --MIN_DD_CLUSTER_SIZE '+str(args.MIN_DD_CLUSTER_SIZE) + cmd += ' --MIN_DD_BREAKPOINT_SUPPORT '+str(args.MIN_DD_BREAKPOINT_SUPPORT) + cmd += ' --MIN_DD_MAP_DISTANCE '+str(args.MIN_DD_MAP_DISTANCE) + if args.DD_REPORT_DUPLICATION_READS: + cmd += ' --DD_REPORT_DUPLICATION_READS ' + + execute(cmd) + return pindelTempDir - - - -def move( avant, apres ): +def move(avant, apres): if os.path.exists(avant): execute("mv %s %s" %(avant, apres)) -def pindel2vcf( args ): +def pindel2vcf(outputVcf, inputFastaFile, name, pindelTempDir): date = str(time.strftime('%d/%m/%y',time.localtime())) - name = "test" - cmd = "pindel2vcf -P %s -r %s -R %s -d %s" % (pindel_dir, args.input_reference, name, date ) - print ("commande = " + cmd) + cmd = "pindel2vcf -P %s -r %s -R %s -d %s" %(pindelTempDir, inputFastaFile, name, date) + print("commande = " + cmd) execute(cmd) - if os.path.exists(pindel_dir+".vcf"): - execute("cp %s %s" %(pindel_dir+".vcf", args.output_vcf)) + if os.path.exists(pindelTempDir+".vcf"): + execute("cp %s %s" %(pindelTempDir+".vcf", outputVcf)) + + +def getMeanInsertSize(bamFile, tempDir): + samFile = tempDir+"/sam" + histogramFile = tempDir+"/histogram" + outputFile = tempDir+"/output" + cmd = "samtools view -h %s %s" % (bamFile, samFile) + execute(cmd) + cmd = "picard-tools CollectInsertSizeMetrics H=%s O=%s I=%s" % (histogramFile, outputFile, samFile) + execute(cmd) + try: + f = open(outputFile, 'r') + lignes = f.readlines() + f.close() + meanInsertSize = lignes[7].split('\t')[0] + except: + sys.stderr.write("problem while opening file %s " %(outputFile)) + return 0 + return meanInsertSize def __main__(): args = parser.parse_args() + tempDir = tempfile.mkdtemp() # try: # cmd = 'pindel' - # cmd += ' -i1 %s -i2 %s -o %s ' % (args.input1, args.input2, args.output1) + # cmd += ' -i1 %s -i2 %s -o %s ' %(args.input1, args.input2, args.output1) # except: - # sys.stdout.write( 'problem args : %s' % (cmd) ) + # sys.stdout.write('problem args : %s' %(cmd)) # try: # os.system(cmd) # except: - # sys.stdout.write( 'problem cmd' ) - # pipeline(args.reference, args.bam, args.output_raw, args.output_vcf, size, name, args) - - reference = args.input_reference - bam = args.input_bam - temp = args.output_vcf+"_temp/temp" - output_vcf = args.output_vcf + # sys.stdout.write('problem cmd') + # pipeline(args.reference, args.bam, args.output_raw, args.outputVcfFile, size, name, args) try: - index( args ) - conf = config( args ) - - pindel( reference, conf, temp, args ) - pindel2vcf(args) + indexBam(args.inputFastaFile, args.inputBamFile) + meanInsertSize = int(getMeanInsertSize(args.inputBamFile)) + configFile = config(args.inputBamFile, meanInsertSize, args.name, tempDir) + pindelTempDir = pindel(args.inputFastaFile, configFile, args, tempDir) + pindel2vcf(args.outputVcfFile, args.inputFastaFile, args.name, pindelTempDir) - finally: - if os.path.exists( tmp_dir ): - shutil.rmtree( tmp_dir ) + if os.path.exists(tempDir): + shutil.rmtree(tempDir) if __name__=="__main__": __main__()