Mercurial > repos > rlegendre > ribo_tools
diff ribo_functions.py @ 17:c87c40e642af draft
Uploaded
author | rlegendre |
---|---|
date | Fri, 29 May 2015 09:17:29 -0400 |
parents | 7c944fd9907e |
children | a121cce43f90 |
line wrap: on
line diff
--- a/ribo_functions.py Wed May 13 04:34:59 2015 -0400 +++ b/ribo_functions.py Fri May 29 09:17:29 2015 -0400 @@ -59,53 +59,65 @@ -def get_first_base(tmpdir,kmer): +def get_first_base(tmpdir, kmer, frame): ''' write footprint coverage file for each sam file in tmpdir ''' global total_mapped_read + ## tags by default + multi_tag = "XS:i:" + tag = "IH:i:1" + try: + ### compute position of P-site according to frame (P-site -> +15 starting from 5' end of footprint) + p_site_pos = 16-frame + file_array = (commands.getoutput('ls '+tmpdir)).split('\n') ##write coverage for each sam file in tmpdir for samfile in file_array: with open(tmpdir+'/'+samfile, 'r') as sam : ##get chromosome name - chrom = samfile.split(".")[0] + chrom = samfile.split(".sam")[0] for line in sam: #initialize dictionnary - if re.search('@SQ', line) : + if '@SQ' in line : size = int(line.split('LN:')[1]) genomeF = [0]*size genomeR = [0]*size # define multiple reads keys from mapper - elif re.search('@PG', line) : - if re.search('bowtie', line): + elif '@PG\tID' in line : + if 'bowtie' in line: multi_tag = "XS:i:" - elif re.search('bwa', line): + elif 'bwa' in line: multi_tag = "XT:A:R" + elif 'TopHat' in line: + tag = "NH:i:1" else : - stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping") + stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping") # get footprint elif re.search('^[^@].+', line) : #print line.rstrip() read_pos = int(line.split('\t')[3]) read_sens = int(line.split('\t')[1]) - #len_read = len(line.split('\t')[9]) - if line.split('\t')[5] == kmer+'M' and multi_tag not in line: + len_read = len(line.split('\t')[9]) + read_qual = int(line.split('\t')[4]) + if len_read == kmer and (tag in line or multi_tag not in line) and read_qual > 20 : ###if line.split('\t')[5] == '28M' : # print line.rstrip() total_mapped_read +=1 #if it's a forward read if read_sens == 0 : #get P-site : start read + 12 nt - read_pos += 12 + #read_pos += 15 + read_pos += p_site_pos genomeF[read_pos] += 1 #if it's a reverse read elif read_sens == 16 : #get P-site - read_pos += 15 + #read_pos += 12 + read_pos += (len_read-1) - p_site_pos genomeR[read_pos] += 1 #try: @@ -115,10 +127,13 @@ cov.write(str(genomeF[i])+'\t-'+str(genomeR[i])+'\n') #except Exception,e: # stop_err( 'Error during coverage file writting : ' + str( e ) ) - + del genomeF + del genomeR sys.stdout.write("%d reads are used for frame analysis\n" % total_mapped_read) except Exception, e: stop_err( 'Error during footprinting : ' + str( e ) ) + + def store_gff(gff): '''