comparison kmer_analysis.py @ 13:7c944fd9907e draft

release 2
author rlegendre
date Thu, 09 Apr 2015 11:35:48 -0400
parents 707807fee542
children a121cce43f90
comparison
equal deleted inserted replaced
12:ee3a3435ce43 13:7c944fd9907e
78 ''' 78 '''
79 write footprint coverage file for each sam file in tmpdir and get kmer distribution 79 write footprint coverage file for each sam file in tmpdir and get kmer distribution
80 ''' 80 '''
81 global total_mapped_read 81 global total_mapped_read
82 82
83 ## tags by default
84 multi_tag = "XS:i:"
85 tag = "IH:i:1"
86
83 ## init kmer dict 87 ## init kmer dict
84 KMER = OrderedDict({}) 88 KMER = OrderedDict({})
85 89
86 try: 90 try:
87 file_array = (commands.getoutput('ls '+tmpdir)).split('\n') 91 file_array = (commands.getoutput('ls '+tmpdir)).split('\n')
101 elif '@PG\tID' in line : 105 elif '@PG\tID' in line :
102 if 'bowtie' in line: 106 if 'bowtie' in line:
103 multi_tag = "XS:i:" 107 multi_tag = "XS:i:"
104 elif 'bwa' in line: 108 elif 'bwa' in line:
105 multi_tag = "XT:A:R" 109 multi_tag = "XT:A:R"
106 #elif 'TopHat' in line: 110 elif 'TopHat' in line:
107 # multi_tag = "NH:i:1" 111 tag = "NH:i:1"
108 else : 112 else :
109 stop_err("No PG tag find in "+samfile+". Please use bowtie or bwa for mapping") 113 stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping")
110 114
111 # get footprint 115 # get footprint
112 elif re.search('^[^@].+', line) : 116 elif re.search('^[^@].+', line) :
113 len_read = len(line.split('\t')[9]) 117 len_read = len(line.split('\t')[9])
114 ##full kmer dict 118 ##full kmer dict
119 123
120 #print line.rstrip() 124 #print line.rstrip()
121 read_pos = int(line.split('\t')[3]) 125 read_pos = int(line.split('\t')[3])
122 read_sens = int(line.split('\t')[1]) 126 read_sens = int(line.split('\t')[1])
123 #len_read = len(line.split('\t')[9]) 127 #len_read = len(line.split('\t')[9])
124 if len_read == kmer and multi_tag not in line: 128 if len_read == kmer and (tag in line or multi_tag not in line):
125 ###if line.split('\t')[5] == '28M' : 129 ###if line.split('\t')[5] == '28M' :
126 total_mapped_read +=1 130 total_mapped_read +=1
127 #if it's a forward read 131 #if it's a forward read
128 if read_sens == 0 : 132 if read_sens == 0 :
129 #get 5' base 133 #get 5' base
397 stop_err( 'Please check your annotation file is in correct format, GFF or GTF' ) 401 stop_err( 'Please check your annotation file is in correct format, GFF or GTF' )
398 #GFF = store_gff(options.gff) 402 #GFF = store_gff(options.gff)
399 #GFF = ribo_functions.store_gtf(options.gff) 403 #GFF = ribo_functions.store_gtf(options.gff)
400 ## check gff reading 404 ## check gff reading
401 if not GFF['order'] : 405 if not GFF['order'] :
402 stop_err( 'Incorrect GFF file' + str( e ) ) 406 stop_err( 'Incorrect GFF file' )
403 407
404 ## split bam 408 ## split bam
405 split_bam(options.bamfile,tmpdir) 409 split_bam(options.bamfile,tmpdir)
406 ################################### 410 ###################################
407 ## First analysis with 28mer : 411 ## First analysis with 28mer :
414 results = {} 418 results = {}
415 results[28] = whole_phasing 419 results[28] = whole_phasing
416 ## compute analysis with other kmer 420 ## compute analysis with other kmer
417 for keys in kmer.iterkeys() : 421 for keys in kmer.iterkeys() :
418 if keys != 28 : 422 if keys != 28 :
419 ## remove all txt files in tmp directory 423
420 if os.system("rm "+tmpdir+"/*.txt") != 0 :
421 stop_err( 'Error during tmp directory cleaning : ' + str( e ) )
422
423 ## If not enought reads in this kmer : 424 ## If not enought reads in this kmer :
424 if kmer[keys] > 100 : 425 if kmer[keys] > 100 :
426 ## remove all txt files in tmp directory
427 if os.system("rm "+tmpdir+"/*.txt") != 0 :
428 stop_err( 'Error during tmp directory cleaning')
425 ## compute coverage and distribution kmer 429 ## compute coverage and distribution kmer
426 tmp = get_first_base(tmpdir, keys) 430 tmp = get_first_base(tmpdir, keys)
427 ## compute phasing 431 ## compute phasing
428 whole_phasing = frame_analysis(tmpdir,GFF) 432 whole_phasing = frame_analysis(tmpdir,GFF)
429 433