comparison ribo_functions.py @ 18:a121cce43f90 draft

Uploaded
author rlegendre
date Tue, 09 Jun 2015 09:06:17 -0400
parents c87c40e642af
children
comparison
equal deleted inserted replaced
17:c87c40e642af 18:a121cce43f90
139 ''' 139 '''
140 parse and store gff file in a dictionnary 140 parse and store gff file in a dictionnary
141 ''' 141 '''
142 try: 142 try:
143 GFF = {} 143 GFF = {}
144 mRNA = {}
144 with open(gff, 'r') as f_gff : 145 with open(gff, 'r') as f_gff :
145 146
146 GFF['order'] = [] 147 GFF['order'] = []
147 #for line in itertools.islice( f_gff, 25 ): 148 #for line in itertools.islice( f_gff, 25 ):
148 for line in f_gff: 149 for line in f_gff:
151 if line != "" : 152 if line != "" :
152 feature = (line.split('\t')[8]).split(';') 153 feature = (line.split('\t')[8]).split(';')
153 # first line is already gene line : 154 # first line is already gene line :
154 if line.split('\t')[2] == 'gene' : 155 if line.split('\t')[2] == 'gene' :
155 gene = feature[0].replace("ID=","") 156 gene = feature[0].replace("ID=","")
157 curent_gene = gene
156 if 'Name' in line : 158 if 'Name' in line :
157 regex = re.compile('(Name=)([^;]*);') 159 regex = re.compile('(Name=)([^;]*);?')
158 res = regex.search(line.split('\t')[8]) 160 res = regex.search(line.split('\t')[8])
159 Name = res.group(2) 161 Name = res.group(2)
160 Name = Name.rstrip() 162 Name = Name.rstrip()
161 else : 163 else :
162 Name = "Unknown" 164 Name = "Unknown"
163 ##get annotation 165 ##get annotation
164 if 'Note' in line : 166 if 'Note' in line :
165 regex = re.compile('(Note=)([^;]*);') 167 regex = re.compile('(Note=)([^;]*);?')
166 res = regex.search(line.split('\t')[8]) 168 res = regex.search(line.split('\t')[8])
167 note = res.group(2) 169 note = res.group(2)
168 note = urllib.unquote(str(note)).replace("\n","") 170 note = urllib.unquote(str(note)).replace("\n","")
169 else : 171 else :
170 note = "" 172 note = ""
178 GFF[gene]['name'] = Name 180 GFF[gene]['name'] = Name
179 GFF[gene]['note'] = note 181 GFF[gene]['note'] = note
180 GFF[gene]['exon'] = {} 182 GFF[gene]['exon'] = {}
181 GFF[gene]['exon_number'] = 0 183 GFF[gene]['exon_number'] = 0
182 #print Name 184 #print Name
185 elif line.split('\t')[2] == 'mRNA' :
186 regex = re.compile('(Parent=)([^;]*);?')
187 res = regex.search(line.split('\t')[8])
188 gene_name = res.group(2)
189 regex = re.compile('(ID=)([^;]*);?')
190 res = regex.search(line.split('\t')[8])
191 mRNA_name = res.group(2)
192 if gene not in mRNA.viewvalues() and gene_name == curent_gene :
193 mRNA[mRNA_name] = gene_name
194
183 elif line.split('\t')[2] == 'CDS' : 195 elif line.split('\t')[2] == 'CDS' :
184 regex = re.compile('(Parent=)([^;]*);') 196 regex = re.compile('(Parent=)([^;]*);?')
185 res = regex.search(line.split('\t')[8]) 197 res = regex.search(line.split('\t')[8])
186 gene = res.group(2) 198 gene = res.group(2)
199
187 if 'mRNA' in gene: 200 if 'mRNA' in gene:
188 gene = re.sub(r"(.*)(\_mRNA)", r"\1", gene) 201 gene = re.sub(r"(.*)(\_mRNA)", r"\1", gene)
202 if mRNA.has_key(gene) and GFF.has_key(mRNA[gene]):
203
204 gene = gene_name
205
189 if GFF.has_key(gene) : 206 if GFF.has_key(gene) :
190 GFF[gene]['exon_number'] += 1 207 GFF[gene]['exon_number'] += 1
191 exon_number = GFF[gene]['exon_number'] 208 exon_number = GFF[gene]['exon_number']
192 GFF[gene]['exon'][exon_number] = {} 209 GFF[gene]['exon'][exon_number] = {}
193 GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7] 210 GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
194 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) 211 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
195 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) 212 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
196 213
197 ## if there is a five prim UTR intron, we change start of gene 214 ## if there is a five prim UTR intron, we change start of gene
198 elif line.split('\t')[2] == 'five_prime_UTR_intron' : 215 elif line.split('\t')[2] == 'five_prime_UTR_intron' :
199 if GFF[gene]['strand'] == "+" : 216 if GFF[gene]['strand'] == "+" :
200 GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] 217 GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
201 else : 218 else :