Mercurial > repos > rlegendre > ribo_tools
comparison kmer_analysis.py @ 13:7c944fd9907e draft
release 2
author | rlegendre |
---|---|
date | Thu, 09 Apr 2015 11:35:48 -0400 |
parents | 707807fee542 |
children | a121cce43f90 |
comparison
equal
deleted
inserted
replaced
12:ee3a3435ce43 | 13:7c944fd9907e |
---|---|
78 ''' | 78 ''' |
79 write footprint coverage file for each sam file in tmpdir and get kmer distribution | 79 write footprint coverage file for each sam file in tmpdir and get kmer distribution |
80 ''' | 80 ''' |
81 global total_mapped_read | 81 global total_mapped_read |
82 | 82 |
83 ## tags by default | |
84 multi_tag = "XS:i:" | |
85 tag = "IH:i:1" | |
86 | |
83 ## init kmer dict | 87 ## init kmer dict |
84 KMER = OrderedDict({}) | 88 KMER = OrderedDict({}) |
85 | 89 |
86 try: | 90 try: |
87 file_array = (commands.getoutput('ls '+tmpdir)).split('\n') | 91 file_array = (commands.getoutput('ls '+tmpdir)).split('\n') |
101 elif '@PG\tID' in line : | 105 elif '@PG\tID' in line : |
102 if 'bowtie' in line: | 106 if 'bowtie' in line: |
103 multi_tag = "XS:i:" | 107 multi_tag = "XS:i:" |
104 elif 'bwa' in line: | 108 elif 'bwa' in line: |
105 multi_tag = "XT:A:R" | 109 multi_tag = "XT:A:R" |
106 #elif 'TopHat' in line: | 110 elif 'TopHat' in line: |
107 # multi_tag = "NH:i:1" | 111 tag = "NH:i:1" |
108 else : | 112 else : |
109 stop_err("No PG tag find in "+samfile+". Please use bowtie or bwa for mapping") | 113 stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping") |
110 | 114 |
111 # get footprint | 115 # get footprint |
112 elif re.search('^[^@].+', line) : | 116 elif re.search('^[^@].+', line) : |
113 len_read = len(line.split('\t')[9]) | 117 len_read = len(line.split('\t')[9]) |
114 ##full kmer dict | 118 ##full kmer dict |
119 | 123 |
120 #print line.rstrip() | 124 #print line.rstrip() |
121 read_pos = int(line.split('\t')[3]) | 125 read_pos = int(line.split('\t')[3]) |
122 read_sens = int(line.split('\t')[1]) | 126 read_sens = int(line.split('\t')[1]) |
123 #len_read = len(line.split('\t')[9]) | 127 #len_read = len(line.split('\t')[9]) |
124 if len_read == kmer and multi_tag not in line: | 128 if len_read == kmer and (tag in line or multi_tag not in line): |
125 ###if line.split('\t')[5] == '28M' : | 129 ###if line.split('\t')[5] == '28M' : |
126 total_mapped_read +=1 | 130 total_mapped_read +=1 |
127 #if it's a forward read | 131 #if it's a forward read |
128 if read_sens == 0 : | 132 if read_sens == 0 : |
129 #get 5' base | 133 #get 5' base |
397 stop_err( 'Please check your annotation file is in correct format, GFF or GTF' ) | 401 stop_err( 'Please check your annotation file is in correct format, GFF or GTF' ) |
398 #GFF = store_gff(options.gff) | 402 #GFF = store_gff(options.gff) |
399 #GFF = ribo_functions.store_gtf(options.gff) | 403 #GFF = ribo_functions.store_gtf(options.gff) |
400 ## check gff reading | 404 ## check gff reading |
401 if not GFF['order'] : | 405 if not GFF['order'] : |
402 stop_err( 'Incorrect GFF file' + str( e ) ) | 406 stop_err( 'Incorrect GFF file' ) |
403 | 407 |
404 ## split bam | 408 ## split bam |
405 split_bam(options.bamfile,tmpdir) | 409 split_bam(options.bamfile,tmpdir) |
406 ################################### | 410 ################################### |
407 ## First analysis with 28mer : | 411 ## First analysis with 28mer : |
414 results = {} | 418 results = {} |
415 results[28] = whole_phasing | 419 results[28] = whole_phasing |
416 ## compute analysis with other kmer | 420 ## compute analysis with other kmer |
417 for keys in kmer.iterkeys() : | 421 for keys in kmer.iterkeys() : |
418 if keys != 28 : | 422 if keys != 28 : |
419 ## remove all txt files in tmp directory | 423 |
420 if os.system("rm "+tmpdir+"/*.txt") != 0 : | |
421 stop_err( 'Error during tmp directory cleaning : ' + str( e ) ) | |
422 | |
423 ## If not enought reads in this kmer : | 424 ## If not enought reads in this kmer : |
424 if kmer[keys] > 100 : | 425 if kmer[keys] > 100 : |
426 ## remove all txt files in tmp directory | |
427 if os.system("rm "+tmpdir+"/*.txt") != 0 : | |
428 stop_err( 'Error during tmp directory cleaning') | |
425 ## compute coverage and distribution kmer | 429 ## compute coverage and distribution kmer |
426 tmp = get_first_base(tmpdir, keys) | 430 tmp = get_first_base(tmpdir, keys) |
427 ## compute phasing | 431 ## compute phasing |
428 whole_phasing = frame_analysis(tmpdir,GFF) | 432 whole_phasing = frame_analysis(tmpdir,GFF) |
429 | 433 |