Mercurial > repos > rlegendre > ribo_tools
comparison ribo_functions.py @ 10:707807fee542
(none)
author | rlegendre |
---|---|
date | Thu, 22 Jan 2015 14:34:38 +0100 |
parents | d7739f797a26 |
children | 7c944fd9907e |
comparison
equal
deleted
inserted
replaced
9:d7739f797a26 | 10:707807fee542 |
---|---|
6 @copyright: rachel.legendre@igmors.u-psud.fr | 6 @copyright: rachel.legendre@igmors.u-psud.fr |
7 @license: GPL v3 | 7 @license: GPL v3 |
8 ''' | 8 ''' |
9 | 9 |
10 import sys, subprocess, re, commands, time, urllib | 10 import sys, subprocess, re, commands, time, urllib |
11 from copy import copy | |
12 | |
11 | 13 |
12 def stop_err( msg ): | 14 def stop_err( msg ): |
13 sys.stderr.write( "%s\n" % msg ) | 15 sys.stderr.write( "%s\n" % msg ) |
14 sys.stderr.write( "Programme aborted at %s\n" % time.asctime(time.localtime(time.time()))) | 16 sys.stderr.write( "Programme aborted at %s\n" % time.asctime(time.localtime(time.time()))) |
15 sys.exit() | 17 sys.exit() |
152 GFF[gene]['note'] = note | 154 GFF[gene]['note'] = note |
153 GFF[gene]['exon'] = {} | 155 GFF[gene]['exon'] = {} |
154 GFF[gene]['exon_number'] = 0 | 156 GFF[gene]['exon_number'] = 0 |
155 #print Name | 157 #print Name |
156 elif line.split('\t')[2] == 'CDS' : | 158 elif line.split('\t')[2] == 'CDS' : |
157 gene = re.sub(r".?Parent\=(.+)(_mRNA)+", r"\1", feature[0]) | 159 gene = re.sub(r".?Parent\=(.+)\_mRNA?", r"\1", feature[0]) |
158 if GFF.has_key(gene) : | 160 if GFF.has_key(gene) : |
159 GFF[gene]['exon_number'] += 1 | 161 GFF[gene]['exon_number'] += 1 |
160 exon_number = GFF[gene]['exon_number'] | 162 exon_number = GFF[gene]['exon_number'] |
161 GFF[gene]['exon'][exon_number] = {} | 163 GFF[gene]['exon'][exon_number] = {} |
162 GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7] | 164 GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7] |
182 #chrI SGD CDS 87501 87752 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified | 184 #chrI SGD CDS 87501 87752 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified |
183 #chrI SGD intron 87388 87500 . + . Parent=YAL030W_mRNA;Name=YAL030W_intron;orf_classification=Verified | 185 #chrI SGD intron 87388 87500 . + . Parent=YAL030W_mRNA;Name=YAL030W_intron;orf_classification=Verified |
184 | 186 |
185 def store_gtf(gff): | 187 def store_gtf(gff): |
186 ''' | 188 ''' |
187 parse and store gtf file in a dictionnary (DEPRECATED) | 189 parse and store gtf file in a dictionnary |
188 ''' | 190 ''' |
189 try: | 191 try: |
190 GFF = {} | 192 GFF = {} |
191 with open(gff, 'r') as f_gff : | 193 with open(gff, 'r') as f_gff : |
192 GFF['order'] = [] | 194 GFF['order'] = [] |
193 for line in f_gff: | 195 for line in f_gff: |
194 ## switch commented lines | 196 ## switch commented lines |
195 line = line.split("#")[0] | 197 line = line.split("#")[0] |
196 if line != "" : | 198 if line != "" : |
197 # first line is already gene line : | 199 # first line is already gene line : |
198 if line.split('\t')[1] == 'protein_coding' : | 200 if 'protein_coding' in line : |
199 ##get name | 201 ##get name |
200 gene = re.sub(r".+ transcript_id \"([\w|-]+)\";.*", r"\1", line).rstrip() | 202 gene = re.sub(r".+transcript_id \"([\w|-]+)\";.*", r"\1", line).rstrip() |
201 Name = re.sub(r".+ transcript_name \"([\w|-]+)\";.*", r"\1", line).rstrip() | 203 Name = re.sub(r".+gene_name \"([\w|\-|\:|\.|\(|\)]+)\";.*", r"\1", line).rstrip() |
202 if line.split('\t')[2] == 'exon' : | 204 if line.split('\t')[2] == 'CDS' : |
203 ##if its first time we get this gene | 205 ##if its first time we get this gene |
204 if gene not in GFF.keys() : | 206 if gene not in GFF.keys() : |
205 ## store gene information | 207 ## store gene information |
206 GFF['order'].append(gene) | 208 GFF['order'].append(gene) |
207 GFF[gene] = {} | 209 GFF[gene] = {} |
208 GFF[gene]['chrom'] = line.split('\t')[0] | 210 GFF[gene]['chrom'] = line.split('\t')[0] |
209 GFF[gene]['strand'] = line.split('\t')[6] | 211 GFF[gene]['strand'] = line.split('\t')[6] |
212 GFF[gene]['start'] = int(line.split('\t')[3]) | |
213 GFF[gene]['stop'] = int(line.split('\t')[4]) | |
210 GFF[gene]['name'] = Name | 214 GFF[gene]['name'] = Name |
215 GFF[gene]['note'] = "" | |
211 GFF[gene]['exon_number'] = 1 | 216 GFF[gene]['exon_number'] = 1 |
212 GFF[gene]['exon'] = {} | 217 GFF[gene]['exon'] = {} |
213 exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip()) | 218 #exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip()) |
219 ## some exons are non codant | |
220 exon_number = 1 | |
214 GFF[gene]['exon'][exon_number] = {} | 221 GFF[gene]['exon'][exon_number] = {} |
215 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) | 222 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) |
216 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) | 223 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) |
217 else : | 224 else : |
218 ## we add exon | 225 ## we add exon |
219 exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip()) | 226 #exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip()) |
227 exon_number += 1 | |
220 GFF[gene]['exon_number'] = exon_number | 228 GFF[gene]['exon_number'] = exon_number |
221 GFF[gene]['exon'][exon_number] = {} | 229 GFF[gene]['exon'][exon_number] = {} |
222 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) | 230 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) |
223 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) | 231 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) |
224 elif line.split('\t')[2] == 'CDS' : | 232 #elif line.split('\t')[2] == 'CDS' : |
225 exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip()) | 233 #exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip()) |
226 GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7] | 234 GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7] |
227 elif line.split('\t')[2] == 'start_codon' : | 235 elif line.split('\t')[2] == 'start_codon' : |
228 if GFF[gene]['strand'] == '-' : | 236 if GFF[gene]['strand'] == '-' : |
229 GFF[gene]['start'] = int(line.split('\t')[4]) | 237 GFF[gene]['stop'] = int(line.split('\t')[4]) |
230 else : | 238 else : |
231 GFF[gene]['start'] = int(line.split('\t')[3]) | 239 GFF[gene]['start'] = int(line.split('\t')[3]) |
232 elif line.split('\t')[2] == 'stop_codon' : | 240 elif line.split('\t')[2] == 'stop_codon' : |
233 if GFF[gene]['strand'] == '-' : | 241 if GFF[gene]['strand'] == '-' : |
234 GFF[gene]['stop'] = int(line.split('\t')[3]) | 242 GFF[gene]['start'] = int(line.split('\t')[3]) |
235 else : | 243 else : |
236 GFF[gene]['stop'] = int(line.split('\t')[4]) | 244 GFF[gene]['stop'] = int(line.split('\t')[4]) |
237 | 245 |
238 return GFF | 246 return __reverse_coordinates__(GFF) |
239 except Exception,e: | 247 except Exception,e: |
240 stop_err( 'Error during gff storage : ' + str( e ) ) | 248 stop_err( 'Error during gff storage : ' + str( e ) ) |
241 | 249 |
242 | 250 |
243 ##IV protein_coding exon 307766 307789 . - . gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B"; | 251 ##IV protein_coding exon 307766 307789 . - . gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B"; |
250 ## exon_id "YDL083C.2"; | 258 ## exon_id "YDL083C.2"; |
251 ##IV protein_coding CDS 306929 307333 . - 0 gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "2"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B"; | 259 ##IV protein_coding CDS 306929 307333 . - 0 gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "2"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B"; |
252 ## protein_id "YDL083C"; | 260 ## protein_id "YDL083C"; |
253 ##IV protein_coding stop_codon 306926 306928 . - 0 gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "2"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name " | 261 ##IV protein_coding stop_codon 306926 306928 . - 0 gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "2"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name " |
254 ##RPS16B"; | 262 ##RPS16B"; |
255 | 263 def __reverse_coordinates__(GFF): |
256 | 264 |
265 for gene in GFF['order']: | |
266 ## for reverse gene | |
267 if GFF[gene]['strand'] == "-": | |
268 ## if this gene have many exon and the stop of gene is the stop of first (and not last) exon, we reverse exon coordinates | |
269 if GFF[gene]['stop'] == GFF[gene]['exon'][1]['stop'] and GFF[gene]['exon_number'] > 1 : | |
270 tmp = copy(GFF[gene]['exon']) | |
271 exon_number = GFF[gene]['exon_number'] | |
272 rev_index = exon_number+1 | |
273 for z in range(1,exon_number+1): | |
274 rev_index -= 1 | |
275 GFF[gene]['exon'][z] = tmp[rev_index] | |
276 | |
277 ## check start | |
278 if GFF[gene]['start'] != GFF[gene]['exon'][1]['start'] and GFF[gene]['start']: | |
279 GFF[gene]['exon'][1]['start'] = GFF[gene]['start'] | |
280 | |
281 return GFF | |
282 | |
257 | 283 |
258 def cleaning_bam(bam): | 284 def cleaning_bam(bam): |
259 ''' | 285 ''' |
260 Remove reads unmapped, non uniquely mapped and reads with length lower than 25 and upper than 32, and mapping quality upper than 12 | 286 Remove reads unmapped, non uniquely mapped and reads with length lower than 25 and upper than 32, and mapping quality upper than 12 |
261 ''' | 287 ''' |