comparison ribo_functions.py @ 10:707807fee542

(none)
author rlegendre
date Thu, 22 Jan 2015 14:34:38 +0100
parents d7739f797a26
children 7c944fd9907e
comparison
equal deleted inserted replaced
9:d7739f797a26 10:707807fee542
6 @copyright: rachel.legendre@igmors.u-psud.fr 6 @copyright: rachel.legendre@igmors.u-psud.fr
7 @license: GPL v3 7 @license: GPL v3
8 ''' 8 '''
9 9
10 import sys, subprocess, re, commands, time, urllib 10 import sys, subprocess, re, commands, time, urllib
11 from copy import copy
12
11 13
12 def stop_err( msg ): 14 def stop_err( msg ):
13 sys.stderr.write( "%s\n" % msg ) 15 sys.stderr.write( "%s\n" % msg )
14 sys.stderr.write( "Programme aborted at %s\n" % time.asctime(time.localtime(time.time()))) 16 sys.stderr.write( "Programme aborted at %s\n" % time.asctime(time.localtime(time.time())))
15 sys.exit() 17 sys.exit()
152 GFF[gene]['note'] = note 154 GFF[gene]['note'] = note
153 GFF[gene]['exon'] = {} 155 GFF[gene]['exon'] = {}
154 GFF[gene]['exon_number'] = 0 156 GFF[gene]['exon_number'] = 0
155 #print Name 157 #print Name
156 elif line.split('\t')[2] == 'CDS' : 158 elif line.split('\t')[2] == 'CDS' :
157 gene = re.sub(r".?Parent\=(.+)(_mRNA)+", r"\1", feature[0]) 159 gene = re.sub(r".?Parent\=(.+)\_mRNA?", r"\1", feature[0])
158 if GFF.has_key(gene) : 160 if GFF.has_key(gene) :
159 GFF[gene]['exon_number'] += 1 161 GFF[gene]['exon_number'] += 1
160 exon_number = GFF[gene]['exon_number'] 162 exon_number = GFF[gene]['exon_number']
161 GFF[gene]['exon'][exon_number] = {} 163 GFF[gene]['exon'][exon_number] = {}
162 GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7] 164 GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
182 #chrI SGD CDS 87501 87752 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified 184 #chrI SGD CDS 87501 87752 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
183 #chrI SGD intron 87388 87500 . + . Parent=YAL030W_mRNA;Name=YAL030W_intron;orf_classification=Verified 185 #chrI SGD intron 87388 87500 . + . Parent=YAL030W_mRNA;Name=YAL030W_intron;orf_classification=Verified
184 186
185 def store_gtf(gff): 187 def store_gtf(gff):
186 ''' 188 '''
187 parse and store gtf file in a dictionnary (DEPRECATED) 189 parse and store gtf file in a dictionnary
188 ''' 190 '''
189 try: 191 try:
190 GFF = {} 192 GFF = {}
191 with open(gff, 'r') as f_gff : 193 with open(gff, 'r') as f_gff :
192 GFF['order'] = [] 194 GFF['order'] = []
193 for line in f_gff: 195 for line in f_gff:
194 ## switch commented lines 196 ## switch commented lines
195 line = line.split("#")[0] 197 line = line.split("#")[0]
196 if line != "" : 198 if line != "" :
197 # first line is already gene line : 199 # first line is already gene line :
198 if line.split('\t')[1] == 'protein_coding' : 200 if 'protein_coding' in line :
199 ##get name 201 ##get name
200 gene = re.sub(r".+ transcript_id \"([\w|-]+)\";.*", r"\1", line).rstrip() 202 gene = re.sub(r".+transcript_id \"([\w|-]+)\";.*", r"\1", line).rstrip()
201 Name = re.sub(r".+ transcript_name \"([\w|-]+)\";.*", r"\1", line).rstrip() 203 Name = re.sub(r".+gene_name \"([\w|\-|\:|\.|\(|\)]+)\";.*", r"\1", line).rstrip()
202 if line.split('\t')[2] == 'exon' : 204 if line.split('\t')[2] == 'CDS' :
203 ##if its first time we get this gene 205 ##if its first time we get this gene
204 if gene not in GFF.keys() : 206 if gene not in GFF.keys() :
205 ## store gene information 207 ## store gene information
206 GFF['order'].append(gene) 208 GFF['order'].append(gene)
207 GFF[gene] = {} 209 GFF[gene] = {}
208 GFF[gene]['chrom'] = line.split('\t')[0] 210 GFF[gene]['chrom'] = line.split('\t')[0]
209 GFF[gene]['strand'] = line.split('\t')[6] 211 GFF[gene]['strand'] = line.split('\t')[6]
212 GFF[gene]['start'] = int(line.split('\t')[3])
213 GFF[gene]['stop'] = int(line.split('\t')[4])
210 GFF[gene]['name'] = Name 214 GFF[gene]['name'] = Name
215 GFF[gene]['note'] = ""
211 GFF[gene]['exon_number'] = 1 216 GFF[gene]['exon_number'] = 1
212 GFF[gene]['exon'] = {} 217 GFF[gene]['exon'] = {}
213 exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip()) 218 #exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip())
219 ## some exons are non codant
220 exon_number = 1
214 GFF[gene]['exon'][exon_number] = {} 221 GFF[gene]['exon'][exon_number] = {}
215 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) 222 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
216 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) 223 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
217 else : 224 else :
218 ## we add exon 225 ## we add exon
219 exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip()) 226 #exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip())
227 exon_number += 1
220 GFF[gene]['exon_number'] = exon_number 228 GFF[gene]['exon_number'] = exon_number
221 GFF[gene]['exon'][exon_number] = {} 229 GFF[gene]['exon'][exon_number] = {}
222 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) 230 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
223 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) 231 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
224 elif line.split('\t')[2] == 'CDS' : 232 #elif line.split('\t')[2] == 'CDS' :
225 exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip()) 233 #exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip())
226 GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7] 234 GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
227 elif line.split('\t')[2] == 'start_codon' : 235 elif line.split('\t')[2] == 'start_codon' :
228 if GFF[gene]['strand'] == '-' : 236 if GFF[gene]['strand'] == '-' :
229 GFF[gene]['start'] = int(line.split('\t')[4]) 237 GFF[gene]['stop'] = int(line.split('\t')[4])
230 else : 238 else :
231 GFF[gene]['start'] = int(line.split('\t')[3]) 239 GFF[gene]['start'] = int(line.split('\t')[3])
232 elif line.split('\t')[2] == 'stop_codon' : 240 elif line.split('\t')[2] == 'stop_codon' :
233 if GFF[gene]['strand'] == '-' : 241 if GFF[gene]['strand'] == '-' :
234 GFF[gene]['stop'] = int(line.split('\t')[3]) 242 GFF[gene]['start'] = int(line.split('\t')[3])
235 else : 243 else :
236 GFF[gene]['stop'] = int(line.split('\t')[4]) 244 GFF[gene]['stop'] = int(line.split('\t')[4])
237 245
238 return GFF 246 return __reverse_coordinates__(GFF)
239 except Exception,e: 247 except Exception,e:
240 stop_err( 'Error during gff storage : ' + str( e ) ) 248 stop_err( 'Error during gff storage : ' + str( e ) )
241 249
242 250
243 ##IV protein_coding exon 307766 307789 . - . gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B"; 251 ##IV protein_coding exon 307766 307789 . - . gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B";
250 ## exon_id "YDL083C.2"; 258 ## exon_id "YDL083C.2";
251 ##IV protein_coding CDS 306929 307333 . - 0 gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "2"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B"; 259 ##IV protein_coding CDS 306929 307333 . - 0 gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "2"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B";
252 ## protein_id "YDL083C"; 260 ## protein_id "YDL083C";
253 ##IV protein_coding stop_codon 306926 306928 . - 0 gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "2"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name " 261 ##IV protein_coding stop_codon 306926 306928 . - 0 gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "2"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "
254 ##RPS16B"; 262 ##RPS16B";
255 263 def __reverse_coordinates__(GFF):
256 264
265 for gene in GFF['order']:
266 ## for reverse gene
267 if GFF[gene]['strand'] == "-":
268 ## if this gene have many exon and the stop of gene is the stop of first (and not last) exon, we reverse exon coordinates
269 if GFF[gene]['stop'] == GFF[gene]['exon'][1]['stop'] and GFF[gene]['exon_number'] > 1 :
270 tmp = copy(GFF[gene]['exon'])
271 exon_number = GFF[gene]['exon_number']
272 rev_index = exon_number+1
273 for z in range(1,exon_number+1):
274 rev_index -= 1
275 GFF[gene]['exon'][z] = tmp[rev_index]
276
277 ## check start
278 if GFF[gene]['start'] != GFF[gene]['exon'][1]['start'] and GFF[gene]['start']:
279 GFF[gene]['exon'][1]['start'] = GFF[gene]['start']
280
281 return GFF
282
257 283
258 def cleaning_bam(bam): 284 def cleaning_bam(bam):
259 ''' 285 '''
260 Remove reads unmapped, non uniquely mapped and reads with length lower than 25 and upper than 32, and mapping quality upper than 12 286 Remove reads unmapped, non uniquely mapped and reads with length lower than 25 and upper than 32, and mapping quality upper than 12
261 ''' 287 '''