9
|
1 #!/usr/bin/env python2.7
|
|
2
|
|
3 '''
|
|
4 Created on Jan. 2014
|
|
5 @author: rachel legendre
|
|
6 @copyright: rachel.legendre@igmors.u-psud.fr
|
|
7 @license: GPL v3
|
|
8 '''
|
|
9
|
13
|
10 import sys, subprocess, re, commands, time, urllib, tempfile
|
10
|
11 from copy import copy
|
|
12
|
9
|
13
|
|
14 def stop_err( msg ):
|
|
15 sys.stderr.write( "%s\n" % msg )
|
|
16 sys.stderr.write( "Programme aborted at %s\n" % time.asctime(time.localtime(time.time())))
|
|
17 sys.exit()
|
|
18
|
|
19 def split_bam(bamfile,tmpdir):
|
|
20 '''
|
|
21 split bam by chromosome and write sam file in tmpdir
|
|
22 '''
|
|
23 try:
|
|
24 #get header
|
|
25 results = subprocess.check_output(['samtools', 'view', '-H',bamfile])
|
|
26 header = results.split('\n')
|
|
27
|
|
28 #define genome size
|
|
29 genome = []
|
|
30 for line in header:
|
|
31 result = re.search('SN', line)
|
|
32 if result :
|
|
33 #print line
|
|
34 feat = line.split('\t')
|
|
35 chrom = re.split(":", feat[1])
|
|
36 #print feat[1]
|
|
37 genome.append(chrom[1])
|
|
38
|
|
39 #split sam by chrom
|
|
40 n = 0
|
|
41 for chrm in genome:
|
|
42 with open(tmpdir+'/'+chrm+'.sam', 'w') as f :
|
|
43 #write header correctly for each chromosome
|
|
44 f.write(header[0]+'\n')
|
|
45 expr = re.compile(chrm+'\t')
|
|
46 el =[elem for elem in header if expr.search(elem)][0]
|
|
47 f.write(el+'\n')
|
|
48 f.write(header[-2]+'\n')
|
|
49 #write all reads for each chromosome
|
|
50 reads = subprocess.check_output(["samtools", "view", bamfile, chrm])
|
|
51 f.write(reads)
|
|
52 # calculate number of reads
|
|
53 n += reads.count(chrm)
|
|
54
|
|
55 sys.stdout.write("%d reads are presents in your bam file\n" % n)
|
|
56
|
|
57 except Exception, e:
|
|
58 stop_err( 'Error during bam file splitting : ' + str( e ) )
|
|
59
|
|
60
|
|
61
|
17
|
62 def get_first_base(tmpdir, kmer, frame):
|
9
|
63 '''
|
|
64 write footprint coverage file for each sam file in tmpdir
|
|
65 '''
|
|
66 global total_mapped_read
|
17
|
67 ## tags by default
|
|
68 multi_tag = "XS:i:"
|
|
69 tag = "IH:i:1"
|
|
70
|
9
|
71 try:
|
17
|
72 ### compute position of P-site according to frame (P-site -> +15 starting from 5' end of footprint)
|
|
73 p_site_pos = 16-frame
|
|
74
|
9
|
75 file_array = (commands.getoutput('ls '+tmpdir)).split('\n')
|
|
76 ##write coverage for each sam file in tmpdir
|
|
77 for samfile in file_array:
|
|
78 with open(tmpdir+'/'+samfile, 'r') as sam :
|
|
79 ##get chromosome name
|
17
|
80 chrom = samfile.split(".sam")[0]
|
9
|
81
|
|
82 for line in sam:
|
|
83 #initialize dictionnary
|
17
|
84 if '@SQ' in line :
|
9
|
85 size = int(line.split('LN:')[1])
|
|
86 genomeF = [0]*size
|
|
87 genomeR = [0]*size
|
|
88 # define multiple reads keys from mapper
|
17
|
89 elif '@PG\tID' in line :
|
|
90 if 'bowtie' in line:
|
9
|
91 multi_tag = "XS:i:"
|
17
|
92 elif 'bwa' in line:
|
9
|
93 multi_tag = "XT:A:R"
|
17
|
94 elif 'TopHat' in line:
|
|
95 tag = "NH:i:1"
|
9
|
96 else :
|
17
|
97 stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping")
|
9
|
98
|
|
99 # get footprint
|
|
100 elif re.search('^[^@].+', line) :
|
|
101 #print line.rstrip()
|
|
102 read_pos = int(line.split('\t')[3])
|
|
103 read_sens = int(line.split('\t')[1])
|
17
|
104 len_read = len(line.split('\t')[9])
|
|
105 read_qual = int(line.split('\t')[4])
|
|
106 if len_read == kmer and (tag in line or multi_tag not in line) and read_qual > 20 :
|
9
|
107 ###if line.split('\t')[5] == '28M' :
|
|
108 # print line.rstrip()
|
|
109 total_mapped_read +=1
|
|
110 #if it's a forward read
|
|
111 if read_sens == 0 :
|
|
112 #get P-site : start read + 12 nt
|
17
|
113 #read_pos += 15
|
|
114 read_pos += p_site_pos
|
9
|
115 genomeF[read_pos] += 1
|
|
116 #if it's a reverse read
|
|
117 elif read_sens == 16 :
|
|
118 #get P-site
|
17
|
119 #read_pos += 12
|
|
120 read_pos += (len_read-1) - p_site_pos
|
9
|
121 genomeR[read_pos] += 1
|
|
122
|
|
123 #try:
|
|
124 #write coverage in files
|
|
125 with open(tmpdir+'/assoCov_'+chrom+'.txt', 'w') as cov :
|
|
126 for i in range(0,size):
|
|
127 cov.write(str(genomeF[i])+'\t-'+str(genomeR[i])+'\n')
|
|
128 #except Exception,e:
|
|
129 # stop_err( 'Error during coverage file writting : ' + str( e ) )
|
17
|
130 del genomeF
|
|
131 del genomeR
|
9
|
132 sys.stdout.write("%d reads are used for frame analysis\n" % total_mapped_read)
|
|
133 except Exception, e:
|
|
134 stop_err( 'Error during footprinting : ' + str( e ) )
|
17
|
135
|
|
136
|
9
|
137
|
|
138 def store_gff(gff):
|
|
139 '''
|
|
140 parse and store gff file in a dictionnary
|
|
141 '''
|
|
142 try:
|
|
143 GFF = {}
|
|
144 with open(gff, 'r') as f_gff :
|
13
|
145
|
9
|
146 GFF['order'] = []
|
13
|
147 #for line in itertools.islice( f_gff, 25 ):
|
9
|
148 for line in f_gff:
|
|
149 ## switch commented lines
|
|
150 line = line.split("#")[0]
|
|
151 if line != "" :
|
|
152 feature = (line.split('\t')[8]).split(';')
|
|
153 # first line is already gene line :
|
|
154 if line.split('\t')[2] == 'gene' :
|
|
155 gene = feature[0].replace("ID=","")
|
13
|
156 if 'Name' in line :
|
|
157 regex = re.compile('(Name=)([^;]*);')
|
|
158 res = regex.search(line.split('\t')[8])
|
|
159 Name = res.group(2)
|
|
160 Name = Name.rstrip()
|
9
|
161 else :
|
|
162 Name = "Unknown"
|
|
163 ##get annotation
|
13
|
164 if 'Note' in line :
|
|
165 regex = re.compile('(Note=)([^;]*);')
|
|
166 res = regex.search(line.split('\t')[8])
|
|
167 note = res.group(2)
|
|
168 note = urllib.unquote(str(note)).replace("\n","")
|
|
169 else :
|
|
170 note = ""
|
9
|
171 ## store gene information
|
|
172 GFF['order'].append(gene)
|
|
173 GFF[gene] = {}
|
|
174 GFF[gene]['chrom'] = line.split('\t')[0]
|
|
175 GFF[gene]['start'] = int(line.split('\t')[3])
|
|
176 GFF[gene]['stop'] = int(line.split('\t')[4])
|
|
177 GFF[gene]['strand'] = line.split('\t')[6]
|
|
178 GFF[gene]['name'] = Name
|
|
179 GFF[gene]['note'] = note
|
|
180 GFF[gene]['exon'] = {}
|
|
181 GFF[gene]['exon_number'] = 0
|
|
182 #print Name
|
|
183 elif line.split('\t')[2] == 'CDS' :
|
13
|
184 regex = re.compile('(Parent=)([^;]*);')
|
|
185 res = regex.search(line.split('\t')[8])
|
|
186 gene = res.group(2)
|
|
187 if 'mRNA' in gene:
|
|
188 gene = re.sub(r"(.*)(\_mRNA)", r"\1", gene)
|
9
|
189 if GFF.has_key(gene) :
|
|
190 GFF[gene]['exon_number'] += 1
|
|
191 exon_number = GFF[gene]['exon_number']
|
|
192 GFF[gene]['exon'][exon_number] = {}
|
|
193 GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
|
|
194 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
|
|
195 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
|
|
196
|
|
197 ## if there is a five prim UTR intron, we change start of gene
|
13
|
198 elif line.split('\t')[2] == 'five_prime_UTR_intron' :
|
9
|
199 if GFF[gene]['strand'] == "+" :
|
|
200 GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
|
|
201 else :
|
|
202 GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
|
|
203 return GFF
|
|
204 except Exception,e:
|
|
205 stop_err( 'Error during gff storage : ' + str( e ) )
|
|
206
|
|
207
|
|
208 #chrI SGD gene 87286 87752 . + . ID=YAL030W;Name=YAL030W;gene=SNC1;Alias=SNC1;Ontology_term=GO:0005484,GO:0005768,GO:0005802,GO:0005886,GO:0005935,GO:0006887,GO:0006893,GO:000689
|
|
209 #7,GO:0006906,GO:0030658,GO:0031201;Note=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29%3B%20involved%20in%20the%20fusion%20between%20Golgi-derived%20secretory%20vesicles%20with%20the%20plasma%20membra
|
|
210 #ne%3B%20proposed%20to%20be%20involved%20in%20endocytosis%3B%20member%20of%20the%20synaptobrevin%2FVAMP%20family%20of%20R-type%20v-SNARE%20proteins%3B%20SNC1%20has%20a%20paralog%2C%20SNC2%2C%20that%20arose%20fr
|
|
211 #om%20the%20whole%20genome%20duplication;display=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29;dbxref=SGD:S000000028;orf_classification=Verified
|
|
212 #chrI SGD CDS 87286 87387 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
|
|
213 #chrI SGD CDS 87501 87752 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
|
|
214 #chrI SGD intron 87388 87500 . + . Parent=YAL030W_mRNA;Name=YAL030W_intron;orf_classification=Verified
|
|
215
|
|
216 def store_gtf(gff):
|
|
217 '''
|
10
|
218 parse and store gtf file in a dictionnary
|
9
|
219 '''
|
|
220 try:
|
|
221 GFF = {}
|
|
222 with open(gff, 'r') as f_gff :
|
|
223 GFF['order'] = []
|
|
224 for line in f_gff:
|
|
225 ## switch commented lines
|
|
226 line = line.split("#")[0]
|
|
227 if line != "" :
|
|
228 # first line is already gene line :
|
10
|
229 if 'protein_coding' in line :
|
9
|
230 ##get name
|
13
|
231 gene = re.sub(r".+transcript_id \"([\w|-|\.]+)\";.*", r"\1", line).rstrip()
|
10
|
232 Name = re.sub(r".+gene_name \"([\w|\-|\:|\.|\(|\)]+)\";.*", r"\1", line).rstrip()
|
|
233 if line.split('\t')[2] == 'CDS' :
|
9
|
234 ##if its first time we get this gene
|
|
235 if gene not in GFF.keys() :
|
|
236 ## store gene information
|
|
237 GFF['order'].append(gene)
|
|
238 GFF[gene] = {}
|
|
239 GFF[gene]['chrom'] = line.split('\t')[0]
|
|
240 GFF[gene]['strand'] = line.split('\t')[6]
|
10
|
241 GFF[gene]['start'] = int(line.split('\t')[3])
|
|
242 GFF[gene]['stop'] = int(line.split('\t')[4])
|
9
|
243 GFF[gene]['name'] = Name
|
10
|
244 GFF[gene]['note'] = ""
|
9
|
245 GFF[gene]['exon_number'] = 1
|
|
246 GFF[gene]['exon'] = {}
|
10
|
247 #exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip())
|
|
248 ## some exons are non codant
|
|
249 exon_number = 1
|
9
|
250 GFF[gene]['exon'][exon_number] = {}
|
|
251 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
|
|
252 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
|
|
253 else :
|
|
254 ## we add exon
|
10
|
255 #exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip())
|
|
256 exon_number += 1
|
9
|
257 GFF[gene]['exon_number'] = exon_number
|
|
258 GFF[gene]['exon'][exon_number] = {}
|
|
259 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
|
|
260 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
|
10
|
261 #elif line.split('\t')[2] == 'CDS' :
|
|
262 #exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip())
|
9
|
263 GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
|
|
264 elif line.split('\t')[2] == 'start_codon' :
|
|
265 if GFF[gene]['strand'] == '-' :
|
10
|
266 GFF[gene]['stop'] = int(line.split('\t')[4])
|
9
|
267 else :
|
|
268 GFF[gene]['start'] = int(line.split('\t')[3])
|
|
269 elif line.split('\t')[2] == 'stop_codon' :
|
|
270 if GFF[gene]['strand'] == '-' :
|
10
|
271 GFF[gene]['start'] = int(line.split('\t')[3])
|
9
|
272 else :
|
|
273 GFF[gene]['stop'] = int(line.split('\t')[4])
|
|
274
|
10
|
275 return __reverse_coordinates__(GFF)
|
9
|
276 except Exception,e:
|
13
|
277 stop_err( 'Error during gtf storage : ' + str( e ) )
|
9
|
278
|
|
279
|
|
280 ##IV protein_coding exon 307766 307789 . - . gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B";
|
|
281 ## exon_id "YDL083C.1";
|
|
282 ##IV protein_coding CDS 307766 307789 . - 0 gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B";
|
|
283 ## protein_id "YDL083C";
|
|
284 ##IV protein_coding start_codon 307787 307789 . - 0 gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "
|
|
285 ##RPS16B";
|
|
286 ##IV protein_coding exon 306926 307333 . - . gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "2"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B";
|
|
287 ## exon_id "YDL083C.2";
|
|
288 ##IV protein_coding CDS 306929 307333 . - 0 gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "2"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B";
|
|
289 ## protein_id "YDL083C";
|
|
290 ##IV protein_coding stop_codon 306926 306928 . - 0 gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "2"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "
|
|
291 ##RPS16B";
|
10
|
292 def __reverse_coordinates__(GFF):
|
|
293
|
|
294 for gene in GFF['order']:
|
|
295 ## for reverse gene
|
|
296 if GFF[gene]['strand'] == "-":
|
|
297 ## if this gene have many exon and the stop of gene is the stop of first (and not last) exon, we reverse exon coordinates
|
|
298 if GFF[gene]['stop'] == GFF[gene]['exon'][1]['stop'] and GFF[gene]['exon_number'] > 1 :
|
|
299 tmp = copy(GFF[gene]['exon'])
|
|
300 exon_number = GFF[gene]['exon_number']
|
|
301 rev_index = exon_number+1
|
|
302 for z in range(1,exon_number+1):
|
|
303 rev_index -= 1
|
|
304 GFF[gene]['exon'][z] = tmp[rev_index]
|
9
|
305
|
10
|
306 ## check start
|
|
307 if GFF[gene]['start'] != GFF[gene]['exon'][1]['start'] and GFF[gene]['start']:
|
|
308 GFF[gene]['exon'][1]['start'] = GFF[gene]['start']
|
9
|
309
|
10
|
310 return GFF
|
|
311
|
9
|
312
|
|
313 def cleaning_bam(bam):
|
|
314 '''
|
|
315 Remove reads unmapped, non uniquely mapped and reads with length lower than 25 and upper than 32, and mapping quality upper than 12
|
|
316 '''
|
|
317 try :
|
|
318 header = subprocess.check_output(['samtools', 'view', '-H', bam], stderr= subprocess.PIPE)
|
|
319 #header = results.split('\n')
|
13
|
320 ## tags by default
|
|
321 multi_tag = "XS:i:"
|
|
322 tag = "IH:i:1"
|
9
|
323 # check mapper for define multiple tag
|
13
|
324 if 'bowtie' in header:
|
9
|
325 multi_tag = "XS:i:"
|
13
|
326 elif 'bwa' in header:
|
9
|
327 multi_tag = "XT:A:R"
|
13
|
328 elif 'TopHat' in header:
|
|
329 tag = "NH:i:1"
|
9
|
330 else :
|
13
|
331 stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping")
|
|
332
|
9
|
333 tmp_sam = tempfile.mktemp()
|
|
334 cmd = "samtools view %s > %s" % (bam, tmp_sam)
|
|
335 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE)
|
|
336 returncode = proc.wait()
|
|
337
|
|
338
|
|
339 with open(tempfile.mktemp(),'w') as out :
|
|
340 out.write(header)
|
|
341 with open(tmp_sam,'r') as sam :
|
|
342 for line in sam :
|
13
|
343 if (multi_tag not in line or tag in line) and line.split('\t')[1] != '4' and int(line.split('\t')[4]) > 12 :
|
9
|
344 if len(line.split('\t')[9]) < 32 and len(line.split('\t')[9]) > 25 :
|
|
345 out.write(line)
|
|
346 bamfile = tempfile.mktemp()+'.bam'
|
|
347 cmd = "samtools view -hSb %s > %s" % (out.name,bamfile)
|
|
348 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE)
|
|
349 returncode = proc.wait()
|
|
350
|
|
351 return bamfile
|
|
352
|
|
353 except Exception,e:
|
|
354 stop_err( 'Error during cleaning bam : ' + str( e ) )
|
|
355 |