annotate ribo_functions.py @ 19:385fc64fa988 draft default tip

Uploaded
author rlegendre
date Fri, 12 Jun 2015 11:32:59 -0400
parents a121cce43f90
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
1 #!/usr/bin/env python2.7
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
2
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
3 '''
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
4 Created on Jan. 2014
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
5 @author: rachel legendre
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
6 @copyright: rachel.legendre@igmors.u-psud.fr
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
7 @license: GPL v3
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
8 '''
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
9
13
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
10 import sys, subprocess, re, commands, time, urllib, tempfile
10
rlegendre
parents: 9
diff changeset
11 from copy import copy
rlegendre
parents: 9
diff changeset
12
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
13
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
14 def stop_err( msg ):
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
15 sys.stderr.write( "%s\n" % msg )
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
16 sys.stderr.write( "Programme aborted at %s\n" % time.asctime(time.localtime(time.time())))
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
17 sys.exit()
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
18
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
19 def split_bam(bamfile,tmpdir):
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
20 '''
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
21 split bam by chromosome and write sam file in tmpdir
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
22 '''
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
23 try:
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
24 #get header
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
25 results = subprocess.check_output(['samtools', 'view', '-H',bamfile])
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
26 header = results.split('\n')
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
27
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
28 #define genome size
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
29 genome = []
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
30 for line in header:
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
31 result = re.search('SN', line)
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
32 if result :
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
33 #print line
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
34 feat = line.split('\t')
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
35 chrom = re.split(":", feat[1])
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
36 #print feat[1]
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
37 genome.append(chrom[1])
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
38
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
39 #split sam by chrom
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
40 n = 0
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
41 for chrm in genome:
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
42 with open(tmpdir+'/'+chrm+'.sam', 'w') as f :
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
43 #write header correctly for each chromosome
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
44 f.write(header[0]+'\n')
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
45 expr = re.compile(chrm+'\t')
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
46 el =[elem for elem in header if expr.search(elem)][0]
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
47 f.write(el+'\n')
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
48 f.write(header[-2]+'\n')
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
49 #write all reads for each chromosome
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
50 reads = subprocess.check_output(["samtools", "view", bamfile, chrm])
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
51 f.write(reads)
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
52 # calculate number of reads
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
53 n += reads.count(chrm)
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
54
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
55 sys.stdout.write("%d reads are presents in your bam file\n" % n)
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
56
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
57 except Exception, e:
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
58 stop_err( 'Error during bam file splitting : ' + str( e ) )
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
59
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
60
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
61
17
c87c40e642af Uploaded
rlegendre
parents: 13
diff changeset
62 def get_first_base(tmpdir, kmer, frame):
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
63 '''
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
64 write footprint coverage file for each sam file in tmpdir
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
65 '''
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
66 global total_mapped_read
17
c87c40e642af Uploaded
rlegendre
parents: 13
diff changeset
67 ## tags by default
c87c40e642af Uploaded
rlegendre
parents: 13
diff changeset
68 multi_tag = "XS:i:"
c87c40e642af Uploaded
rlegendre
parents: 13
diff changeset
69 tag = "IH:i:1"
c87c40e642af Uploaded
rlegendre
parents: 13
diff changeset
70
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
71 try:
17
c87c40e642af Uploaded
rlegendre
parents: 13
diff changeset
72 ### compute position of P-site according to frame (P-site -> +15 starting from 5' end of footprint)
c87c40e642af Uploaded
rlegendre
parents: 13
diff changeset
73 p_site_pos = 16-frame
c87c40e642af Uploaded
rlegendre
parents: 13
diff changeset
74
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
75 file_array = (commands.getoutput('ls '+tmpdir)).split('\n')
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
76 ##write coverage for each sam file in tmpdir
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
77 for samfile in file_array:
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
78 with open(tmpdir+'/'+samfile, 'r') as sam :
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
79 ##get chromosome name
17
c87c40e642af Uploaded
rlegendre
parents: 13
diff changeset
80 chrom = samfile.split(".sam")[0]
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
81
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
82 for line in sam:
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
83 #initialize dictionnary
17
c87c40e642af Uploaded
rlegendre
parents: 13
diff changeset
84 if '@SQ' in line :
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
85 size = int(line.split('LN:')[1])
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
86 genomeF = [0]*size
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
87 genomeR = [0]*size
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
88 # define multiple reads keys from mapper
17
c87c40e642af Uploaded
rlegendre
parents: 13
diff changeset
89 elif '@PG\tID' in line :
c87c40e642af Uploaded
rlegendre
parents: 13
diff changeset
90 if 'bowtie' in line:
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
91 multi_tag = "XS:i:"
17
c87c40e642af Uploaded
rlegendre
parents: 13
diff changeset
92 elif 'bwa' in line:
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
93 multi_tag = "XT:A:R"
17
c87c40e642af Uploaded
rlegendre
parents: 13
diff changeset
94 elif 'TopHat' in line:
c87c40e642af Uploaded
rlegendre
parents: 13
diff changeset
95 tag = "NH:i:1"
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
96 else :
17
c87c40e642af Uploaded
rlegendre
parents: 13
diff changeset
97 stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping")
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
98
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
99 # get footprint
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
100 elif re.search('^[^@].+', line) :
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
101 #print line.rstrip()
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
102 read_pos = int(line.split('\t')[3])
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
103 read_sens = int(line.split('\t')[1])
17
c87c40e642af Uploaded
rlegendre
parents: 13
diff changeset
104 len_read = len(line.split('\t')[9])
c87c40e642af Uploaded
rlegendre
parents: 13
diff changeset
105 read_qual = int(line.split('\t')[4])
c87c40e642af Uploaded
rlegendre
parents: 13
diff changeset
106 if len_read == kmer and (tag in line or multi_tag not in line) and read_qual > 20 :
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
107 ###if line.split('\t')[5] == '28M' :
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
108 # print line.rstrip()
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
109 total_mapped_read +=1
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
110 #if it's a forward read
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
111 if read_sens == 0 :
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
112 #get P-site : start read + 12 nt
17
c87c40e642af Uploaded
rlegendre
parents: 13
diff changeset
113 #read_pos += 15
c87c40e642af Uploaded
rlegendre
parents: 13
diff changeset
114 read_pos += p_site_pos
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
115 genomeF[read_pos] += 1
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
116 #if it's a reverse read
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
117 elif read_sens == 16 :
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
118 #get P-site
17
c87c40e642af Uploaded
rlegendre
parents: 13
diff changeset
119 #read_pos += 12
c87c40e642af Uploaded
rlegendre
parents: 13
diff changeset
120 read_pos += (len_read-1) - p_site_pos
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
121 genomeR[read_pos] += 1
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
122
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
123 #try:
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
124 #write coverage in files
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
125 with open(tmpdir+'/assoCov_'+chrom+'.txt', 'w') as cov :
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
126 for i in range(0,size):
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
127 cov.write(str(genomeF[i])+'\t-'+str(genomeR[i])+'\n')
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
128 #except Exception,e:
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
129 # stop_err( 'Error during coverage file writting : ' + str( e ) )
17
c87c40e642af Uploaded
rlegendre
parents: 13
diff changeset
130 del genomeF
c87c40e642af Uploaded
rlegendre
parents: 13
diff changeset
131 del genomeR
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
132 sys.stdout.write("%d reads are used for frame analysis\n" % total_mapped_read)
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
133 except Exception, e:
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
134 stop_err( 'Error during footprinting : ' + str( e ) )
17
c87c40e642af Uploaded
rlegendre
parents: 13
diff changeset
135
c87c40e642af Uploaded
rlegendre
parents: 13
diff changeset
136
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
137
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
138 def store_gff(gff):
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
139 '''
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
140 parse and store gff file in a dictionnary
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
141 '''
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
142 try:
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
143 GFF = {}
18
a121cce43f90 Uploaded
rlegendre
parents: 17
diff changeset
144 mRNA = {}
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
145 with open(gff, 'r') as f_gff :
13
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
146
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
147 GFF['order'] = []
13
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
148 #for line in itertools.islice( f_gff, 25 ):
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
149 for line in f_gff:
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
150 ## switch commented lines
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
151 line = line.split("#")[0]
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
152 if line != "" :
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
153 feature = (line.split('\t')[8]).split(';')
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
154 # first line is already gene line :
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
155 if line.split('\t')[2] == 'gene' :
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
156 gene = feature[0].replace("ID=","")
18
a121cce43f90 Uploaded
rlegendre
parents: 17
diff changeset
157 curent_gene = gene
13
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
158 if 'Name' in line :
18
a121cce43f90 Uploaded
rlegendre
parents: 17
diff changeset
159 regex = re.compile('(Name=)([^;]*);?')
13
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
160 res = regex.search(line.split('\t')[8])
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
161 Name = res.group(2)
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
162 Name = Name.rstrip()
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
163 else :
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
164 Name = "Unknown"
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
165 ##get annotation
13
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
166 if 'Note' in line :
18
a121cce43f90 Uploaded
rlegendre
parents: 17
diff changeset
167 regex = re.compile('(Note=)([^;]*);?')
13
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
168 res = regex.search(line.split('\t')[8])
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
169 note = res.group(2)
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
170 note = urllib.unquote(str(note)).replace("\n","")
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
171 else :
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
172 note = ""
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
173 ## store gene information
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
174 GFF['order'].append(gene)
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
175 GFF[gene] = {}
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
176 GFF[gene]['chrom'] = line.split('\t')[0]
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
177 GFF[gene]['start'] = int(line.split('\t')[3])
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
178 GFF[gene]['stop'] = int(line.split('\t')[4])
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
179 GFF[gene]['strand'] = line.split('\t')[6]
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
180 GFF[gene]['name'] = Name
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
181 GFF[gene]['note'] = note
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
182 GFF[gene]['exon'] = {}
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
183 GFF[gene]['exon_number'] = 0
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
184 #print Name
18
a121cce43f90 Uploaded
rlegendre
parents: 17
diff changeset
185 elif line.split('\t')[2] == 'mRNA' :
a121cce43f90 Uploaded
rlegendre
parents: 17
diff changeset
186 regex = re.compile('(Parent=)([^;]*);?')
a121cce43f90 Uploaded
rlegendre
parents: 17
diff changeset
187 res = regex.search(line.split('\t')[8])
a121cce43f90 Uploaded
rlegendre
parents: 17
diff changeset
188 gene_name = res.group(2)
a121cce43f90 Uploaded
rlegendre
parents: 17
diff changeset
189 regex = re.compile('(ID=)([^;]*);?')
a121cce43f90 Uploaded
rlegendre
parents: 17
diff changeset
190 res = regex.search(line.split('\t')[8])
a121cce43f90 Uploaded
rlegendre
parents: 17
diff changeset
191 mRNA_name = res.group(2)
a121cce43f90 Uploaded
rlegendre
parents: 17
diff changeset
192 if gene not in mRNA.viewvalues() and gene_name == curent_gene :
a121cce43f90 Uploaded
rlegendre
parents: 17
diff changeset
193 mRNA[mRNA_name] = gene_name
a121cce43f90 Uploaded
rlegendre
parents: 17
diff changeset
194
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
195 elif line.split('\t')[2] == 'CDS' :
18
a121cce43f90 Uploaded
rlegendre
parents: 17
diff changeset
196 regex = re.compile('(Parent=)([^;]*);?')
13
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
197 res = regex.search(line.split('\t')[8])
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
198 gene = res.group(2)
18
a121cce43f90 Uploaded
rlegendre
parents: 17
diff changeset
199
13
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
200 if 'mRNA' in gene:
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
201 gene = re.sub(r"(.*)(\_mRNA)", r"\1", gene)
18
a121cce43f90 Uploaded
rlegendre
parents: 17
diff changeset
202 if mRNA.has_key(gene) and GFF.has_key(mRNA[gene]):
a121cce43f90 Uploaded
rlegendre
parents: 17
diff changeset
203
a121cce43f90 Uploaded
rlegendre
parents: 17
diff changeset
204 gene = gene_name
a121cce43f90 Uploaded
rlegendre
parents: 17
diff changeset
205
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
206 if GFF.has_key(gene) :
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
207 GFF[gene]['exon_number'] += 1
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
208 exon_number = GFF[gene]['exon_number']
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
209 GFF[gene]['exon'][exon_number] = {}
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
210 GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
211 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
212 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
18
a121cce43f90 Uploaded
rlegendre
parents: 17
diff changeset
213
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
214 ## if there is a five prim UTR intron, we change start of gene
13
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
215 elif line.split('\t')[2] == 'five_prime_UTR_intron' :
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
216 if GFF[gene]['strand'] == "+" :
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
217 GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
218 else :
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
219 GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
220 return GFF
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
221 except Exception,e:
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
222 stop_err( 'Error during gff storage : ' + str( e ) )
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
223
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
224
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
225 #chrI SGD gene 87286 87752 . + . ID=YAL030W;Name=YAL030W;gene=SNC1;Alias=SNC1;Ontology_term=GO:0005484,GO:0005768,GO:0005802,GO:0005886,GO:0005935,GO:0006887,GO:0006893,GO:000689
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
226 #7,GO:0006906,GO:0030658,GO:0031201;Note=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29%3B%20involved%20in%20the%20fusion%20between%20Golgi-derived%20secretory%20vesicles%20with%20the%20plasma%20membra
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
227 #ne%3B%20proposed%20to%20be%20involved%20in%20endocytosis%3B%20member%20of%20the%20synaptobrevin%2FVAMP%20family%20of%20R-type%20v-SNARE%20proteins%3B%20SNC1%20has%20a%20paralog%2C%20SNC2%2C%20that%20arose%20fr
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
228 #om%20the%20whole%20genome%20duplication;display=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29;dbxref=SGD:S000000028;orf_classification=Verified
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
229 #chrI SGD CDS 87286 87387 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
230 #chrI SGD CDS 87501 87752 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
231 #chrI SGD intron 87388 87500 . + . Parent=YAL030W_mRNA;Name=YAL030W_intron;orf_classification=Verified
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
232
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
233 def store_gtf(gff):
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
234 '''
10
rlegendre
parents: 9
diff changeset
235 parse and store gtf file in a dictionnary
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
236 '''
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
237 try:
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
238 GFF = {}
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
239 with open(gff, 'r') as f_gff :
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
240 GFF['order'] = []
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
241 for line in f_gff:
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
242 ## switch commented lines
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
243 line = line.split("#")[0]
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
244 if line != "" :
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
245 # first line is already gene line :
10
rlegendre
parents: 9
diff changeset
246 if 'protein_coding' in line :
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
247 ##get name
13
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
248 gene = re.sub(r".+transcript_id \"([\w|-|\.]+)\";.*", r"\1", line).rstrip()
10
rlegendre
parents: 9
diff changeset
249 Name = re.sub(r".+gene_name \"([\w|\-|\:|\.|\(|\)]+)\";.*", r"\1", line).rstrip()
rlegendre
parents: 9
diff changeset
250 if line.split('\t')[2] == 'CDS' :
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
251 ##if its first time we get this gene
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
252 if gene not in GFF.keys() :
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
253 ## store gene information
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
254 GFF['order'].append(gene)
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
255 GFF[gene] = {}
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
256 GFF[gene]['chrom'] = line.split('\t')[0]
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
257 GFF[gene]['strand'] = line.split('\t')[6]
10
rlegendre
parents: 9
diff changeset
258 GFF[gene]['start'] = int(line.split('\t')[3])
rlegendre
parents: 9
diff changeset
259 GFF[gene]['stop'] = int(line.split('\t')[4])
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
260 GFF[gene]['name'] = Name
10
rlegendre
parents: 9
diff changeset
261 GFF[gene]['note'] = ""
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
262 GFF[gene]['exon_number'] = 1
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
263 GFF[gene]['exon'] = {}
10
rlegendre
parents: 9
diff changeset
264 #exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip())
rlegendre
parents: 9
diff changeset
265 ## some exons are non codant
rlegendre
parents: 9
diff changeset
266 exon_number = 1
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
267 GFF[gene]['exon'][exon_number] = {}
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
268 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
269 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
270 else :
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
271 ## we add exon
10
rlegendre
parents: 9
diff changeset
272 #exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip())
rlegendre
parents: 9
diff changeset
273 exon_number += 1
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
274 GFF[gene]['exon_number'] = exon_number
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
275 GFF[gene]['exon'][exon_number] = {}
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
276 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
277 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
10
rlegendre
parents: 9
diff changeset
278 #elif line.split('\t')[2] == 'CDS' :
rlegendre
parents: 9
diff changeset
279 #exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip())
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
280 GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
281 elif line.split('\t')[2] == 'start_codon' :
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
282 if GFF[gene]['strand'] == '-' :
10
rlegendre
parents: 9
diff changeset
283 GFF[gene]['stop'] = int(line.split('\t')[4])
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
284 else :
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
285 GFF[gene]['start'] = int(line.split('\t')[3])
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
286 elif line.split('\t')[2] == 'stop_codon' :
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
287 if GFF[gene]['strand'] == '-' :
10
rlegendre
parents: 9
diff changeset
288 GFF[gene]['start'] = int(line.split('\t')[3])
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
289 else :
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
290 GFF[gene]['stop'] = int(line.split('\t')[4])
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
291
10
rlegendre
parents: 9
diff changeset
292 return __reverse_coordinates__(GFF)
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
293 except Exception,e:
13
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
294 stop_err( 'Error during gtf storage : ' + str( e ) )
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
295
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
296
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
297 ##IV protein_coding exon 307766 307789 . - . gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B";
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
298 ## exon_id "YDL083C.1";
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
299 ##IV protein_coding CDS 307766 307789 . - 0 gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B";
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
300 ## protein_id "YDL083C";
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
301 ##IV protein_coding start_codon 307787 307789 . - 0 gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
302 ##RPS16B";
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
303 ##IV protein_coding exon 306926 307333 . - . gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "2"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B";
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
304 ## exon_id "YDL083C.2";
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
305 ##IV protein_coding CDS 306929 307333 . - 0 gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "2"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B";
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
306 ## protein_id "YDL083C";
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
307 ##IV protein_coding stop_codon 306926 306928 . - 0 gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "2"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
308 ##RPS16B";
10
rlegendre
parents: 9
diff changeset
309 def __reverse_coordinates__(GFF):
rlegendre
parents: 9
diff changeset
310
rlegendre
parents: 9
diff changeset
311 for gene in GFF['order']:
rlegendre
parents: 9
diff changeset
312 ## for reverse gene
rlegendre
parents: 9
diff changeset
313 if GFF[gene]['strand'] == "-":
rlegendre
parents: 9
diff changeset
314 ## if this gene have many exon and the stop of gene is the stop of first (and not last) exon, we reverse exon coordinates
rlegendre
parents: 9
diff changeset
315 if GFF[gene]['stop'] == GFF[gene]['exon'][1]['stop'] and GFF[gene]['exon_number'] > 1 :
rlegendre
parents: 9
diff changeset
316 tmp = copy(GFF[gene]['exon'])
rlegendre
parents: 9
diff changeset
317 exon_number = GFF[gene]['exon_number']
rlegendre
parents: 9
diff changeset
318 rev_index = exon_number+1
rlegendre
parents: 9
diff changeset
319 for z in range(1,exon_number+1):
rlegendre
parents: 9
diff changeset
320 rev_index -= 1
rlegendre
parents: 9
diff changeset
321 GFF[gene]['exon'][z] = tmp[rev_index]
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
322
10
rlegendre
parents: 9
diff changeset
323 ## check start
rlegendre
parents: 9
diff changeset
324 if GFF[gene]['start'] != GFF[gene]['exon'][1]['start'] and GFF[gene]['start']:
rlegendre
parents: 9
diff changeset
325 GFF[gene]['exon'][1]['start'] = GFF[gene]['start']
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
326
10
rlegendre
parents: 9
diff changeset
327 return GFF
rlegendre
parents: 9
diff changeset
328
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
329
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
330 def cleaning_bam(bam):
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
331 '''
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
332 Remove reads unmapped, non uniquely mapped and reads with length lower than 25 and upper than 32, and mapping quality upper than 12
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
333 '''
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
334 try :
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
335 header = subprocess.check_output(['samtools', 'view', '-H', bam], stderr= subprocess.PIPE)
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
336 #header = results.split('\n')
13
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
337 ## tags by default
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
338 multi_tag = "XS:i:"
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
339 tag = "IH:i:1"
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
340 # check mapper for define multiple tag
13
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
341 if 'bowtie' in header:
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
342 multi_tag = "XS:i:"
13
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
343 elif 'bwa' in header:
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
344 multi_tag = "XT:A:R"
13
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
345 elif 'TopHat' in header:
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
346 tag = "NH:i:1"
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
347 else :
13
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
348 stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping")
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
349
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
350 tmp_sam = tempfile.mktemp()
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
351 cmd = "samtools view %s > %s" % (bam, tmp_sam)
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
352 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE)
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
353 returncode = proc.wait()
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
354
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
355
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
356 with open(tempfile.mktemp(),'w') as out :
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
357 out.write(header)
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
358 with open(tmp_sam,'r') as sam :
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
359 for line in sam :
13
7c944fd9907e release 2
rlegendre
parents: 10
diff changeset
360 if (multi_tag not in line or tag in line) and line.split('\t')[1] != '4' and int(line.split('\t')[4]) > 12 :
9
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
361 if len(line.split('\t')[9]) < 32 and len(line.split('\t')[9]) > 25 :
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
362 out.write(line)
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
363 bamfile = tempfile.mktemp()+'.bam'
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
364 cmd = "samtools view -hSb %s > %s" % (out.name,bamfile)
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
365 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE)
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
366 returncode = proc.wait()
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
367
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
368 return bamfile
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
369
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
370 except Exception,e:
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
371 stop_err( 'Error during cleaning bam : ' + str( e ) )
d7739f797a26 Uploaded
rlegendre
parents:
diff changeset
372