Mercurial > repos > rlegendre > ribo_tools
comparison ribo_functions.py @ 13:7c944fd9907e draft
release 2
author | rlegendre |
---|---|
date | Thu, 09 Apr 2015 11:35:48 -0400 |
parents | 707807fee542 |
children | c87c40e642af |
comparison
equal
deleted
inserted
replaced
12:ee3a3435ce43 | 13:7c944fd9907e |
---|---|
5 @author: rachel legendre | 5 @author: rachel legendre |
6 @copyright: rachel.legendre@igmors.u-psud.fr | 6 @copyright: rachel.legendre@igmors.u-psud.fr |
7 @license: GPL v3 | 7 @license: GPL v3 |
8 ''' | 8 ''' |
9 | 9 |
10 import sys, subprocess, re, commands, time, urllib | 10 import sys, subprocess, re, commands, time, urllib, tempfile |
11 from copy import copy | 11 from copy import copy |
12 | 12 |
13 | 13 |
14 def stop_err( msg ): | 14 def stop_err( msg ): |
15 sys.stderr.write( "%s\n" % msg ) | 15 sys.stderr.write( "%s\n" % msg ) |
125 parse and store gff file in a dictionnary | 125 parse and store gff file in a dictionnary |
126 ''' | 126 ''' |
127 try: | 127 try: |
128 GFF = {} | 128 GFF = {} |
129 with open(gff, 'r') as f_gff : | 129 with open(gff, 'r') as f_gff : |
130 | |
130 GFF['order'] = [] | 131 GFF['order'] = [] |
132 #for line in itertools.islice( f_gff, 25 ): | |
131 for line in f_gff: | 133 for line in f_gff: |
132 ## switch commented lines | 134 ## switch commented lines |
133 line = line.split("#")[0] | 135 line = line.split("#")[0] |
134 if line != "" : | 136 if line != "" : |
135 feature = (line.split('\t')[8]).split(';') | 137 feature = (line.split('\t')[8]).split(';') |
136 # first line is already gene line : | 138 # first line is already gene line : |
137 if line.split('\t')[2] == 'gene' : | 139 if line.split('\t')[2] == 'gene' : |
138 gene = feature[0].replace("ID=","") | 140 gene = feature[0].replace("ID=","") |
139 if re.search('gene',feature[2]) : | 141 if 'Name' in line : |
140 Name = feature[2].replace("gene=","") | 142 regex = re.compile('(Name=)([^;]*);') |
143 res = regex.search(line.split('\t')[8]) | |
144 Name = res.group(2) | |
145 Name = Name.rstrip() | |
141 else : | 146 else : |
142 Name = "Unknown" | 147 Name = "Unknown" |
143 ##get annotation | 148 ##get annotation |
144 note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line) | 149 if 'Note' in line : |
145 note = urllib.unquote(str(note)).replace("\n","") | 150 regex = re.compile('(Note=)([^;]*);') |
151 res = regex.search(line.split('\t')[8]) | |
152 note = res.group(2) | |
153 note = urllib.unquote(str(note)).replace("\n","") | |
154 else : | |
155 note = "" | |
146 ## store gene information | 156 ## store gene information |
147 GFF['order'].append(gene) | 157 GFF['order'].append(gene) |
148 GFF[gene] = {} | 158 GFF[gene] = {} |
149 GFF[gene]['chrom'] = line.split('\t')[0] | 159 GFF[gene]['chrom'] = line.split('\t')[0] |
150 GFF[gene]['start'] = int(line.split('\t')[3]) | 160 GFF[gene]['start'] = int(line.split('\t')[3]) |
154 GFF[gene]['note'] = note | 164 GFF[gene]['note'] = note |
155 GFF[gene]['exon'] = {} | 165 GFF[gene]['exon'] = {} |
156 GFF[gene]['exon_number'] = 0 | 166 GFF[gene]['exon_number'] = 0 |
157 #print Name | 167 #print Name |
158 elif line.split('\t')[2] == 'CDS' : | 168 elif line.split('\t')[2] == 'CDS' : |
159 gene = re.sub(r".?Parent\=(.+)\_mRNA?", r"\1", feature[0]) | 169 regex = re.compile('(Parent=)([^;]*);') |
170 res = regex.search(line.split('\t')[8]) | |
171 gene = res.group(2) | |
172 if 'mRNA' in gene: | |
173 gene = re.sub(r"(.*)(\_mRNA)", r"\1", gene) | |
160 if GFF.has_key(gene) : | 174 if GFF.has_key(gene) : |
161 GFF[gene]['exon_number'] += 1 | 175 GFF[gene]['exon_number'] += 1 |
162 exon_number = GFF[gene]['exon_number'] | 176 exon_number = GFF[gene]['exon_number'] |
163 GFF[gene]['exon'][exon_number] = {} | 177 GFF[gene]['exon'][exon_number] = {} |
164 GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7] | 178 GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7] |
165 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) | 179 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) |
166 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) | 180 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) |
167 | 181 |
168 ## if there is a five prim UTR intron, we change start of gene | 182 ## if there is a five prim UTR intron, we change start of gene |
169 elif line.split('\t')[2] == 'five_prime_UTR_intron' : | 183 elif line.split('\t')[2] == 'five_prime_UTR_intron' : |
170 if GFF[gene]['strand'] == "+" : | 184 if GFF[gene]['strand'] == "+" : |
171 GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] | 185 GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] |
172 else : | 186 else : |
173 GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop'] | 187 GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop'] |
174 return GFF | 188 return GFF |
197 line = line.split("#")[0] | 211 line = line.split("#")[0] |
198 if line != "" : | 212 if line != "" : |
199 # first line is already gene line : | 213 # first line is already gene line : |
200 if 'protein_coding' in line : | 214 if 'protein_coding' in line : |
201 ##get name | 215 ##get name |
202 gene = re.sub(r".+transcript_id \"([\w|-]+)\";.*", r"\1", line).rstrip() | 216 gene = re.sub(r".+transcript_id \"([\w|-|\.]+)\";.*", r"\1", line).rstrip() |
203 Name = re.sub(r".+gene_name \"([\w|\-|\:|\.|\(|\)]+)\";.*", r"\1", line).rstrip() | 217 Name = re.sub(r".+gene_name \"([\w|\-|\:|\.|\(|\)]+)\";.*", r"\1", line).rstrip() |
204 if line.split('\t')[2] == 'CDS' : | 218 if line.split('\t')[2] == 'CDS' : |
205 ##if its first time we get this gene | 219 ##if its first time we get this gene |
206 if gene not in GFF.keys() : | 220 if gene not in GFF.keys() : |
207 ## store gene information | 221 ## store gene information |
243 else : | 257 else : |
244 GFF[gene]['stop'] = int(line.split('\t')[4]) | 258 GFF[gene]['stop'] = int(line.split('\t')[4]) |
245 | 259 |
246 return __reverse_coordinates__(GFF) | 260 return __reverse_coordinates__(GFF) |
247 except Exception,e: | 261 except Exception,e: |
248 stop_err( 'Error during gff storage : ' + str( e ) ) | 262 stop_err( 'Error during gtf storage : ' + str( e ) ) |
249 | 263 |
250 | 264 |
251 ##IV protein_coding exon 307766 307789 . - . gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B"; | 265 ##IV protein_coding exon 307766 307789 . - . gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B"; |
252 ## exon_id "YDL083C.1"; | 266 ## exon_id "YDL083C.1"; |
253 ##IV protein_coding CDS 307766 307789 . - 0 gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B"; | 267 ##IV protein_coding CDS 307766 307789 . - 0 gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B"; |
286 Remove reads unmapped, non uniquely mapped and reads with length lower than 25 and upper than 32, and mapping quality upper than 12 | 300 Remove reads unmapped, non uniquely mapped and reads with length lower than 25 and upper than 32, and mapping quality upper than 12 |
287 ''' | 301 ''' |
288 try : | 302 try : |
289 header = subprocess.check_output(['samtools', 'view', '-H', bam], stderr= subprocess.PIPE) | 303 header = subprocess.check_output(['samtools', 'view', '-H', bam], stderr= subprocess.PIPE) |
290 #header = results.split('\n') | 304 #header = results.split('\n') |
291 | 305 ## tags by default |
306 multi_tag = "XS:i:" | |
307 tag = "IH:i:1" | |
292 # check mapper for define multiple tag | 308 # check mapper for define multiple tag |
293 if re.search('bowtie', header): | 309 if 'bowtie' in header: |
294 multi_tag = "XS:i:" | 310 multi_tag = "XS:i:" |
295 elif re.search('bwa', header): | 311 elif 'bwa' in header: |
296 multi_tag = "XT:A:R" | 312 multi_tag = "XT:A:R" |
313 elif 'TopHat' in header: | |
314 tag = "NH:i:1" | |
297 else : | 315 else : |
298 stop_err("No PG tag find in"+bam+". Please use bowtie or bwa for mapping") | 316 stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping") |
299 | 317 |
300 tmp_sam = tempfile.mktemp() | 318 tmp_sam = tempfile.mktemp() |
301 cmd = "samtools view %s > %s" % (bam, tmp_sam) | 319 cmd = "samtools view %s > %s" % (bam, tmp_sam) |
302 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE) | 320 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE) |
303 returncode = proc.wait() | 321 returncode = proc.wait() |
304 | 322 |
305 | 323 |
306 with open(tempfile.mktemp(),'w') as out : | 324 with open(tempfile.mktemp(),'w') as out : |
307 out.write(header) | 325 out.write(header) |
308 with open(tmp_sam,'r') as sam : | 326 with open(tmp_sam,'r') as sam : |
309 for line in sam : | 327 for line in sam : |
310 if multi_tag not in line and line.split('\t')[1] != '4' and int(line.split('\t')[4]) > 12 : | 328 if (multi_tag not in line or tag in line) and line.split('\t')[1] != '4' and int(line.split('\t')[4]) > 12 : |
311 if len(line.split('\t')[9]) < 32 and len(line.split('\t')[9]) > 25 : | 329 if len(line.split('\t')[9]) < 32 and len(line.split('\t')[9]) > 25 : |
312 out.write(line) | 330 out.write(line) |
313 bamfile = tempfile.mktemp()+'.bam' | 331 bamfile = tempfile.mktemp()+'.bam' |
314 cmd = "samtools view -hSb %s > %s" % (out.name,bamfile) | 332 cmd = "samtools view -hSb %s > %s" % (out.name,bamfile) |
315 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE) | 333 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE) |