comparison ribo_functions.py @ 13:7c944fd9907e draft

release 2
author rlegendre
date Thu, 09 Apr 2015 11:35:48 -0400
parents 707807fee542
children c87c40e642af
comparison
equal deleted inserted replaced
12:ee3a3435ce43 13:7c944fd9907e
5 @author: rachel legendre 5 @author: rachel legendre
6 @copyright: rachel.legendre@igmors.u-psud.fr 6 @copyright: rachel.legendre@igmors.u-psud.fr
7 @license: GPL v3 7 @license: GPL v3
8 ''' 8 '''
9 9
10 import sys, subprocess, re, commands, time, urllib 10 import sys, subprocess, re, commands, time, urllib, tempfile
11 from copy import copy 11 from copy import copy
12 12
13 13
14 def stop_err( msg ): 14 def stop_err( msg ):
15 sys.stderr.write( "%s\n" % msg ) 15 sys.stderr.write( "%s\n" % msg )
125 parse and store gff file in a dictionnary 125 parse and store gff file in a dictionnary
126 ''' 126 '''
127 try: 127 try:
128 GFF = {} 128 GFF = {}
129 with open(gff, 'r') as f_gff : 129 with open(gff, 'r') as f_gff :
130
130 GFF['order'] = [] 131 GFF['order'] = []
132 #for line in itertools.islice( f_gff, 25 ):
131 for line in f_gff: 133 for line in f_gff:
132 ## switch commented lines 134 ## switch commented lines
133 line = line.split("#")[0] 135 line = line.split("#")[0]
134 if line != "" : 136 if line != "" :
135 feature = (line.split('\t')[8]).split(';') 137 feature = (line.split('\t')[8]).split(';')
136 # first line is already gene line : 138 # first line is already gene line :
137 if line.split('\t')[2] == 'gene' : 139 if line.split('\t')[2] == 'gene' :
138 gene = feature[0].replace("ID=","") 140 gene = feature[0].replace("ID=","")
139 if re.search('gene',feature[2]) : 141 if 'Name' in line :
140 Name = feature[2].replace("gene=","") 142 regex = re.compile('(Name=)([^;]*);')
143 res = regex.search(line.split('\t')[8])
144 Name = res.group(2)
145 Name = Name.rstrip()
141 else : 146 else :
142 Name = "Unknown" 147 Name = "Unknown"
143 ##get annotation 148 ##get annotation
144 note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line) 149 if 'Note' in line :
145 note = urllib.unquote(str(note)).replace("\n","") 150 regex = re.compile('(Note=)([^;]*);')
151 res = regex.search(line.split('\t')[8])
152 note = res.group(2)
153 note = urllib.unquote(str(note)).replace("\n","")
154 else :
155 note = ""
146 ## store gene information 156 ## store gene information
147 GFF['order'].append(gene) 157 GFF['order'].append(gene)
148 GFF[gene] = {} 158 GFF[gene] = {}
149 GFF[gene]['chrom'] = line.split('\t')[0] 159 GFF[gene]['chrom'] = line.split('\t')[0]
150 GFF[gene]['start'] = int(line.split('\t')[3]) 160 GFF[gene]['start'] = int(line.split('\t')[3])
154 GFF[gene]['note'] = note 164 GFF[gene]['note'] = note
155 GFF[gene]['exon'] = {} 165 GFF[gene]['exon'] = {}
156 GFF[gene]['exon_number'] = 0 166 GFF[gene]['exon_number'] = 0
157 #print Name 167 #print Name
158 elif line.split('\t')[2] == 'CDS' : 168 elif line.split('\t')[2] == 'CDS' :
159 gene = re.sub(r".?Parent\=(.+)\_mRNA?", r"\1", feature[0]) 169 regex = re.compile('(Parent=)([^;]*);')
170 res = regex.search(line.split('\t')[8])
171 gene = res.group(2)
172 if 'mRNA' in gene:
173 gene = re.sub(r"(.*)(\_mRNA)", r"\1", gene)
160 if GFF.has_key(gene) : 174 if GFF.has_key(gene) :
161 GFF[gene]['exon_number'] += 1 175 GFF[gene]['exon_number'] += 1
162 exon_number = GFF[gene]['exon_number'] 176 exon_number = GFF[gene]['exon_number']
163 GFF[gene]['exon'][exon_number] = {} 177 GFF[gene]['exon'][exon_number] = {}
164 GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7] 178 GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
165 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) 179 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
166 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) 180 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
167 181
168 ## if there is a five prim UTR intron, we change start of gene 182 ## if there is a five prim UTR intron, we change start of gene
169 elif line.split('\t')[2] == 'five_prime_UTR_intron' : 183 elif line.split('\t')[2] == 'five_prime_UTR_intron' :
170 if GFF[gene]['strand'] == "+" : 184 if GFF[gene]['strand'] == "+" :
171 GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] 185 GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
172 else : 186 else :
173 GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop'] 187 GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
174 return GFF 188 return GFF
197 line = line.split("#")[0] 211 line = line.split("#")[0]
198 if line != "" : 212 if line != "" :
199 # first line is already gene line : 213 # first line is already gene line :
200 if 'protein_coding' in line : 214 if 'protein_coding' in line :
201 ##get name 215 ##get name
202 gene = re.sub(r".+transcript_id \"([\w|-]+)\";.*", r"\1", line).rstrip() 216 gene = re.sub(r".+transcript_id \"([\w|-|\.]+)\";.*", r"\1", line).rstrip()
203 Name = re.sub(r".+gene_name \"([\w|\-|\:|\.|\(|\)]+)\";.*", r"\1", line).rstrip() 217 Name = re.sub(r".+gene_name \"([\w|\-|\:|\.|\(|\)]+)\";.*", r"\1", line).rstrip()
204 if line.split('\t')[2] == 'CDS' : 218 if line.split('\t')[2] == 'CDS' :
205 ##if its first time we get this gene 219 ##if its first time we get this gene
206 if gene not in GFF.keys() : 220 if gene not in GFF.keys() :
207 ## store gene information 221 ## store gene information
243 else : 257 else :
244 GFF[gene]['stop'] = int(line.split('\t')[4]) 258 GFF[gene]['stop'] = int(line.split('\t')[4])
245 259
246 return __reverse_coordinates__(GFF) 260 return __reverse_coordinates__(GFF)
247 except Exception,e: 261 except Exception,e:
248 stop_err( 'Error during gff storage : ' + str( e ) ) 262 stop_err( 'Error during gtf storage : ' + str( e ) )
249 263
250 264
251 ##IV protein_coding exon 307766 307789 . - . gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B"; 265 ##IV protein_coding exon 307766 307789 . - . gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B";
252 ## exon_id "YDL083C.1"; 266 ## exon_id "YDL083C.1";
253 ##IV protein_coding CDS 307766 307789 . - 0 gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B"; 267 ##IV protein_coding CDS 307766 307789 . - 0 gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B";
286 Remove reads unmapped, non uniquely mapped and reads with length lower than 25 and upper than 32, and mapping quality upper than 12 300 Remove reads unmapped, non uniquely mapped and reads with length lower than 25 and upper than 32, and mapping quality upper than 12
287 ''' 301 '''
288 try : 302 try :
289 header = subprocess.check_output(['samtools', 'view', '-H', bam], stderr= subprocess.PIPE) 303 header = subprocess.check_output(['samtools', 'view', '-H', bam], stderr= subprocess.PIPE)
290 #header = results.split('\n') 304 #header = results.split('\n')
291 305 ## tags by default
306 multi_tag = "XS:i:"
307 tag = "IH:i:1"
292 # check mapper for define multiple tag 308 # check mapper for define multiple tag
293 if re.search('bowtie', header): 309 if 'bowtie' in header:
294 multi_tag = "XS:i:" 310 multi_tag = "XS:i:"
295 elif re.search('bwa', header): 311 elif 'bwa' in header:
296 multi_tag = "XT:A:R" 312 multi_tag = "XT:A:R"
313 elif 'TopHat' in header:
314 tag = "NH:i:1"
297 else : 315 else :
298 stop_err("No PG tag find in"+bam+". Please use bowtie or bwa for mapping") 316 stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping")
299 317
300 tmp_sam = tempfile.mktemp() 318 tmp_sam = tempfile.mktemp()
301 cmd = "samtools view %s > %s" % (bam, tmp_sam) 319 cmd = "samtools view %s > %s" % (bam, tmp_sam)
302 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE) 320 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE)
303 returncode = proc.wait() 321 returncode = proc.wait()
304 322
305 323
306 with open(tempfile.mktemp(),'w') as out : 324 with open(tempfile.mktemp(),'w') as out :
307 out.write(header) 325 out.write(header)
308 with open(tmp_sam,'r') as sam : 326 with open(tmp_sam,'r') as sam :
309 for line in sam : 327 for line in sam :
310 if multi_tag not in line and line.split('\t')[1] != '4' and int(line.split('\t')[4]) > 12 : 328 if (multi_tag not in line or tag in line) and line.split('\t')[1] != '4' and int(line.split('\t')[4]) > 12 :
311 if len(line.split('\t')[9]) < 32 and len(line.split('\t')[9]) > 25 : 329 if len(line.split('\t')[9]) < 32 and len(line.split('\t')[9]) > 25 :
312 out.write(line) 330 out.write(line)
313 bamfile = tempfile.mktemp()+'.bam' 331 bamfile = tempfile.mktemp()+'.bam'
314 cmd = "samtools view -hSb %s > %s" % (out.name,bamfile) 332 cmd = "samtools view -hSb %s > %s" % (out.name,bamfile)
315 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE) 333 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE)