ribo_tools: ribo_functions.py comparison

comparison ribo_functions.py @ 13:7c944fd9907e draft

release 2

author	rlegendre
date	Thu, 09 Apr 2015 11:35:48 -0400
parents	707807fee542
children	c87c40e642af

comparison

equal deleted inserted replaced

-:ee3a3435ce43
+:7c944fd9907e
 @author: rachel legendre
 @copyright:  rachel.legendre@igmors.u-psud.fr
 @license: GPL v3
 '''
-import sys, subprocess, re, commands, time, urllib
+import sys, subprocess, re, commands, time, urllib,  tempfile
 from copy import copy
 def stop_err( msg ):
 sys.stderr.write( "%s\n" % msg )
 parse and store gff file in a dictionnary
 '''
 try:
 GFF = {}
 with open(gff, 'r') as f_gff :
 GFF['order'] = []
+#for line in itertools.islice( f_gff, 25 ):
 for line in f_gff:
 ## switch commented lines
 line = line.split("#")[0]
 if line != "" :
 feature = (line.split('\t')[8]).split(';')
 # first line is already gene line :
 if line.split('\t')[2] == 'gene' :
 gene = feature[0].replace("ID=","")
-if re.search('gene',feature[2]) :
+if 'Name' in line :
-Name = feature[2].replace("gene=","")
+regex = re.compile('(Name=)([^;]*);')
+res = regex.search(line.split('\t')[8])
+Name = res.group(2)
+Name = Name.rstrip()
 else :
 Name = "Unknown"
 ##get annotation
-note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line)
+if 'Note' in line :
-note = urllib.unquote(str(note)).replace("\n","")
+regex = re.compile('(Note=)([^;]*);')
+res = regex.search(line.split('\t')[8])
+note = res.group(2)
+note = urllib.unquote(str(note)).replace("\n","")
+else :
+note = ""
 ## store gene information
 GFF['order'].append(gene)
 GFF[gene] = {}
 GFF[gene]['chrom'] = line.split('\t')[0]
 GFF[gene]['start'] = int(line.split('\t')[3])
 GFF[gene]['note'] = note
 GFF[gene]['exon'] = {}
 GFF[gene]['exon_number'] = 0
 #print Name
 elif line.split('\t')[2] == 'CDS' :
-gene = re.sub(r".?Parent\=(.+)\_mRNA?", r"\1", feature[0])
+regex = re.compile('(Parent=)([^;]*);')
+res = regex.search(line.split('\t')[8])
+gene = res.group(2)
+if 'mRNA' in gene:
+gene = re.sub(r"(.*)(\_mRNA)", r"\1", gene)
 if GFF.has_key(gene) :
 GFF[gene]['exon_number'] += 1
 exon_number = GFF[gene]['exon_number']
 GFF[gene]['exon'][exon_number] = {}
 GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
 ## if there is a five prim UTR intron, we change start of gene
-elif   line.split('\t')[2] == 'five_prime_UTR_intron' :
+elif line.split('\t')[2] == 'five_prime_UTR_intron' :
 if GFF[gene]['strand'] == "+" :
 GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
 else :
 GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
 return GFF
 line = line.split("#")[0]
 if line != "" :
 # first line is already gene line :
 if 'protein_coding' in line :
 ##get name
-gene = re.sub(r".+transcript_id \"([\w|-]+)\";.*", r"\1", line).rstrip()
+gene = re.sub(r".+transcript_id \"([\w|-|\.]+)\";.*", r"\1", line).rstrip()
 Name = re.sub(r".+gene_name \"([\w|\-|\:|\.|\(|\)]+)\";.*", r"\1", line).rstrip()
 if line.split('\t')[2] == 'CDS' :
 ##if its first time we get this gene
 if gene not in GFF.keys() :
 ## store gene information
 else :
 GFF[gene]['stop'] = int(line.split('\t')[4])
 return __reverse_coordinates__(GFF)
 except Exception,e:
-stop_err( 'Error during gff storage : ' + str( e ) )
+stop_err( 'Error during gtf storage : ' + str( e ) )
 ##IV      protein_coding  exon    307766  307789  .       -       .       gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B";
 ## exon_id "YDL083C.1";
 ##IV      protein_coding  CDS     307766  307789  .       -       0       gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B";
 Remove reads unmapped, non uniquely mapped and reads with length lower than 25 and upper than 32, and mapping quality upper than 12
 '''
 try :
 header = subprocess.check_output(['samtools', 'view', '-H', bam], stderr= subprocess.PIPE)
 #header = results.split('\n')
+## tags by default
+multi_tag = "XS:i:"
+tag = "IH:i:1"
 # check mapper for define multiple tag
-if re.search('bowtie', header):
+if 'bowtie' in header:
 multi_tag = "XS:i:"
-elif re.search('bwa', header):
+elif 'bwa' in  header:
 multi_tag = "XT:A:R"
+elif 'TopHat' in  header:
+tag = "NH:i:1"
 else :
-stop_err("No PG tag find in"+bam+". Please use bowtie or bwa for mapping")
+stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping")
 tmp_sam = tempfile.mktemp()
 cmd = "samtools view %s > %s" % (bam, tmp_sam)
 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE)
 returncode = proc.wait()
 with open(tempfile.mktemp(),'w') as out :
 out.write(header)
 with open(tmp_sam,'r') as sam :
 for line in sam :
-if multi_tag not in line and line.split('\t')[1] != '4' and int(line.split('\t')[4]) > 12 :
+if (multi_tag not in line or tag in line) and line.split('\t')[1] != '4' and int(line.split('\t')[4]) > 12 :
 if len(line.split('\t')[9]) < 32 and len(line.split('\t')[9]) > 25 :
 out.write(line)
 bamfile = tempfile.mktemp()+'.bam'
 cmd = "samtools view -hSb %s > %s" % (out.name,bamfile)
 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE)

Mercurial > repos > rlegendre > ribo_tools

comparison ribo_functions.py @ 13:7c944fd9907e draft