Mercurial > repos > rlegendre > ribo_tools
comparison ribo_functions.py @ 9:d7739f797a26
Uploaded
author | rlegendre |
---|---|
date | Tue, 09 Dec 2014 09:43:07 -0500 |
parents | |
children | 707807fee542 |
comparison
equal
deleted
inserted
replaced
8:adc01e560eae | 9:d7739f797a26 |
---|---|
1 #!/usr/bin/env python2.7 | |
2 | |
3 ''' | |
4 Created on Jan. 2014 | |
5 @author: rachel legendre | |
6 @copyright: rachel.legendre@igmors.u-psud.fr | |
7 @license: GPL v3 | |
8 ''' | |
9 | |
10 import sys, subprocess, re, commands, time, urllib | |
11 | |
12 def stop_err( msg ): | |
13 sys.stderr.write( "%s\n" % msg ) | |
14 sys.stderr.write( "Programme aborted at %s\n" % time.asctime(time.localtime(time.time()))) | |
15 sys.exit() | |
16 | |
17 def split_bam(bamfile,tmpdir): | |
18 ''' | |
19 split bam by chromosome and write sam file in tmpdir | |
20 ''' | |
21 try: | |
22 #get header | |
23 results = subprocess.check_output(['samtools', 'view', '-H',bamfile]) | |
24 header = results.split('\n') | |
25 | |
26 #define genome size | |
27 genome = [] | |
28 for line in header: | |
29 result = re.search('SN', line) | |
30 if result : | |
31 #print line | |
32 feat = line.split('\t') | |
33 chrom = re.split(":", feat[1]) | |
34 #print feat[1] | |
35 genome.append(chrom[1]) | |
36 | |
37 #split sam by chrom | |
38 n = 0 | |
39 for chrm in genome: | |
40 with open(tmpdir+'/'+chrm+'.sam', 'w') as f : | |
41 #write header correctly for each chromosome | |
42 f.write(header[0]+'\n') | |
43 expr = re.compile(chrm+'\t') | |
44 el =[elem for elem in header if expr.search(elem)][0] | |
45 f.write(el+'\n') | |
46 f.write(header[-2]+'\n') | |
47 #write all reads for each chromosome | |
48 reads = subprocess.check_output(["samtools", "view", bamfile, chrm]) | |
49 f.write(reads) | |
50 # calculate number of reads | |
51 n += reads.count(chrm) | |
52 | |
53 sys.stdout.write("%d reads are presents in your bam file\n" % n) | |
54 | |
55 except Exception, e: | |
56 stop_err( 'Error during bam file splitting : ' + str( e ) ) | |
57 | |
58 | |
59 | |
60 def get_first_base(tmpdir,kmer): | |
61 ''' | |
62 write footprint coverage file for each sam file in tmpdir | |
63 ''' | |
64 global total_mapped_read | |
65 try: | |
66 file_array = (commands.getoutput('ls '+tmpdir)).split('\n') | |
67 ##write coverage for each sam file in tmpdir | |
68 for samfile in file_array: | |
69 with open(tmpdir+'/'+samfile, 'r') as sam : | |
70 ##get chromosome name | |
71 chrom = samfile.split(".")[0] | |
72 | |
73 for line in sam: | |
74 #initialize dictionnary | |
75 if re.search('@SQ', line) : | |
76 size = int(line.split('LN:')[1]) | |
77 genomeF = [0]*size | |
78 genomeR = [0]*size | |
79 # define multiple reads keys from mapper | |
80 elif re.search('@PG', line) : | |
81 if re.search('bowtie', line): | |
82 multi_tag = "XS:i:" | |
83 elif re.search('bwa', line): | |
84 multi_tag = "XT:A:R" | |
85 else : | |
86 stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping") | |
87 | |
88 # get footprint | |
89 elif re.search('^[^@].+', line) : | |
90 #print line.rstrip() | |
91 read_pos = int(line.split('\t')[3]) | |
92 read_sens = int(line.split('\t')[1]) | |
93 #len_read = len(line.split('\t')[9]) | |
94 if line.split('\t')[5] == kmer+'M' and multi_tag not in line: | |
95 ###if line.split('\t')[5] == '28M' : | |
96 # print line.rstrip() | |
97 total_mapped_read +=1 | |
98 #if it's a forward read | |
99 if read_sens == 0 : | |
100 #get P-site : start read + 12 nt | |
101 read_pos += 12 | |
102 genomeF[read_pos] += 1 | |
103 #if it's a reverse read | |
104 elif read_sens == 16 : | |
105 #get P-site | |
106 read_pos += 15 | |
107 genomeR[read_pos] += 1 | |
108 | |
109 #try: | |
110 #write coverage in files | |
111 with open(tmpdir+'/assoCov_'+chrom+'.txt', 'w') as cov : | |
112 for i in range(0,size): | |
113 cov.write(str(genomeF[i])+'\t-'+str(genomeR[i])+'\n') | |
114 #except Exception,e: | |
115 # stop_err( 'Error during coverage file writting : ' + str( e ) ) | |
116 | |
117 sys.stdout.write("%d reads are used for frame analysis\n" % total_mapped_read) | |
118 except Exception, e: | |
119 stop_err( 'Error during footprinting : ' + str( e ) ) | |
120 | |
121 def store_gff(gff): | |
122 ''' | |
123 parse and store gff file in a dictionnary | |
124 ''' | |
125 try: | |
126 GFF = {} | |
127 with open(gff, 'r') as f_gff : | |
128 GFF['order'] = [] | |
129 for line in f_gff: | |
130 ## switch commented lines | |
131 line = line.split("#")[0] | |
132 if line != "" : | |
133 feature = (line.split('\t')[8]).split(';') | |
134 # first line is already gene line : | |
135 if line.split('\t')[2] == 'gene' : | |
136 gene = feature[0].replace("ID=","") | |
137 if re.search('gene',feature[2]) : | |
138 Name = feature[2].replace("gene=","") | |
139 else : | |
140 Name = "Unknown" | |
141 ##get annotation | |
142 note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line) | |
143 note = urllib.unquote(str(note)).replace("\n","") | |
144 ## store gene information | |
145 GFF['order'].append(gene) | |
146 GFF[gene] = {} | |
147 GFF[gene]['chrom'] = line.split('\t')[0] | |
148 GFF[gene]['start'] = int(line.split('\t')[3]) | |
149 GFF[gene]['stop'] = int(line.split('\t')[4]) | |
150 GFF[gene]['strand'] = line.split('\t')[6] | |
151 GFF[gene]['name'] = Name | |
152 GFF[gene]['note'] = note | |
153 GFF[gene]['exon'] = {} | |
154 GFF[gene]['exon_number'] = 0 | |
155 #print Name | |
156 elif line.split('\t')[2] == 'CDS' : | |
157 gene = re.sub(r".?Parent\=(.+)(_mRNA)+", r"\1", feature[0]) | |
158 if GFF.has_key(gene) : | |
159 GFF[gene]['exon_number'] += 1 | |
160 exon_number = GFF[gene]['exon_number'] | |
161 GFF[gene]['exon'][exon_number] = {} | |
162 GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7] | |
163 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) | |
164 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) | |
165 | |
166 ## if there is a five prim UTR intron, we change start of gene | |
167 elif line.split('\t')[2] == 'five_prime_UTR_intron' : | |
168 if GFF[gene]['strand'] == "+" : | |
169 GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] | |
170 else : | |
171 GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop'] | |
172 return GFF | |
173 except Exception,e: | |
174 stop_err( 'Error during gff storage : ' + str( e ) ) | |
175 | |
176 | |
177 #chrI SGD gene 87286 87752 . + . ID=YAL030W;Name=YAL030W;gene=SNC1;Alias=SNC1;Ontology_term=GO:0005484,GO:0005768,GO:0005802,GO:0005886,GO:0005935,GO:0006887,GO:0006893,GO:000689 | |
178 #7,GO:0006906,GO:0030658,GO:0031201;Note=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29%3B%20involved%20in%20the%20fusion%20between%20Golgi-derived%20secretory%20vesicles%20with%20the%20plasma%20membra | |
179 #ne%3B%20proposed%20to%20be%20involved%20in%20endocytosis%3B%20member%20of%20the%20synaptobrevin%2FVAMP%20family%20of%20R-type%20v-SNARE%20proteins%3B%20SNC1%20has%20a%20paralog%2C%20SNC2%2C%20that%20arose%20fr | |
180 #om%20the%20whole%20genome%20duplication;display=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29;dbxref=SGD:S000000028;orf_classification=Verified | |
181 #chrI SGD CDS 87286 87387 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified | |
182 #chrI SGD CDS 87501 87752 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified | |
183 #chrI SGD intron 87388 87500 . + . Parent=YAL030W_mRNA;Name=YAL030W_intron;orf_classification=Verified | |
184 | |
185 def store_gtf(gff): | |
186 ''' | |
187 parse and store gtf file in a dictionnary (DEPRECATED) | |
188 ''' | |
189 try: | |
190 GFF = {} | |
191 with open(gff, 'r') as f_gff : | |
192 GFF['order'] = [] | |
193 for line in f_gff: | |
194 ## switch commented lines | |
195 line = line.split("#")[0] | |
196 if line != "" : | |
197 # first line is already gene line : | |
198 if line.split('\t')[1] == 'protein_coding' : | |
199 ##get name | |
200 gene = re.sub(r".+ transcript_id \"([\w|-]+)\";.*", r"\1", line).rstrip() | |
201 Name = re.sub(r".+ transcript_name \"([\w|-]+)\";.*", r"\1", line).rstrip() | |
202 if line.split('\t')[2] == 'exon' : | |
203 ##if its first time we get this gene | |
204 if gene not in GFF.keys() : | |
205 ## store gene information | |
206 GFF['order'].append(gene) | |
207 GFF[gene] = {} | |
208 GFF[gene]['chrom'] = line.split('\t')[0] | |
209 GFF[gene]['strand'] = line.split('\t')[6] | |
210 GFF[gene]['name'] = Name | |
211 GFF[gene]['exon_number'] = 1 | |
212 GFF[gene]['exon'] = {} | |
213 exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip()) | |
214 GFF[gene]['exon'][exon_number] = {} | |
215 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) | |
216 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) | |
217 else : | |
218 ## we add exon | |
219 exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip()) | |
220 GFF[gene]['exon_number'] = exon_number | |
221 GFF[gene]['exon'][exon_number] = {} | |
222 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) | |
223 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) | |
224 elif line.split('\t')[2] == 'CDS' : | |
225 exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip()) | |
226 GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7] | |
227 elif line.split('\t')[2] == 'start_codon' : | |
228 if GFF[gene]['strand'] == '-' : | |
229 GFF[gene]['start'] = int(line.split('\t')[4]) | |
230 else : | |
231 GFF[gene]['start'] = int(line.split('\t')[3]) | |
232 elif line.split('\t')[2] == 'stop_codon' : | |
233 if GFF[gene]['strand'] == '-' : | |
234 GFF[gene]['stop'] = int(line.split('\t')[3]) | |
235 else : | |
236 GFF[gene]['stop'] = int(line.split('\t')[4]) | |
237 | |
238 return GFF | |
239 except Exception,e: | |
240 stop_err( 'Error during gff storage : ' + str( e ) ) | |
241 | |
242 | |
243 ##IV protein_coding exon 307766 307789 . - . gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B"; | |
244 ## exon_id "YDL083C.1"; | |
245 ##IV protein_coding CDS 307766 307789 . - 0 gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B"; | |
246 ## protein_id "YDL083C"; | |
247 ##IV protein_coding start_codon 307787 307789 . - 0 gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name " | |
248 ##RPS16B"; | |
249 ##IV protein_coding exon 306926 307333 . - . gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "2"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B"; | |
250 ## exon_id "YDL083C.2"; | |
251 ##IV protein_coding CDS 306929 307333 . - 0 gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "2"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B"; | |
252 ## protein_id "YDL083C"; | |
253 ##IV protein_coding stop_codon 306926 306928 . - 0 gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "2"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name " | |
254 ##RPS16B"; | |
255 | |
256 | |
257 | |
258 def cleaning_bam(bam): | |
259 ''' | |
260 Remove reads unmapped, non uniquely mapped and reads with length lower than 25 and upper than 32, and mapping quality upper than 12 | |
261 ''' | |
262 try : | |
263 header = subprocess.check_output(['samtools', 'view', '-H', bam], stderr= subprocess.PIPE) | |
264 #header = results.split('\n') | |
265 | |
266 # check mapper for define multiple tag | |
267 if re.search('bowtie', header): | |
268 multi_tag = "XS:i:" | |
269 elif re.search('bwa', header): | |
270 multi_tag = "XT:A:R" | |
271 else : | |
272 stop_err("No PG tag find in"+bam+". Please use bowtie or bwa for mapping") | |
273 | |
274 tmp_sam = tempfile.mktemp() | |
275 cmd = "samtools view %s > %s" % (bam, tmp_sam) | |
276 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE) | |
277 returncode = proc.wait() | |
278 | |
279 | |
280 with open(tempfile.mktemp(),'w') as out : | |
281 out.write(header) | |
282 with open(tmp_sam,'r') as sam : | |
283 for line in sam : | |
284 if multi_tag not in line and line.split('\t')[1] != '4' and int(line.split('\t')[4]) > 12 : | |
285 if len(line.split('\t')[9]) < 32 and len(line.split('\t')[9]) > 25 : | |
286 out.write(line) | |
287 bamfile = tempfile.mktemp()+'.bam' | |
288 cmd = "samtools view -hSb %s > %s" % (out.name,bamfile) | |
289 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE) | |
290 returncode = proc.wait() | |
291 | |
292 return bamfile | |
293 | |
294 except Exception,e: | |
295 stop_err( 'Error during cleaning bam : ' + str( e ) ) | |
296 |