Mercurial > repos > rlegendre > ribo_tools
comparison ribo_functions.py @ 18:a121cce43f90 draft
Uploaded
author | rlegendre |
---|---|
date | Tue, 09 Jun 2015 09:06:17 -0400 |
parents | c87c40e642af |
children |
comparison
equal
deleted
inserted
replaced
17:c87c40e642af | 18:a121cce43f90 |
---|---|
139 ''' | 139 ''' |
140 parse and store gff file in a dictionnary | 140 parse and store gff file in a dictionnary |
141 ''' | 141 ''' |
142 try: | 142 try: |
143 GFF = {} | 143 GFF = {} |
144 mRNA = {} | |
144 with open(gff, 'r') as f_gff : | 145 with open(gff, 'r') as f_gff : |
145 | 146 |
146 GFF['order'] = [] | 147 GFF['order'] = [] |
147 #for line in itertools.islice( f_gff, 25 ): | 148 #for line in itertools.islice( f_gff, 25 ): |
148 for line in f_gff: | 149 for line in f_gff: |
151 if line != "" : | 152 if line != "" : |
152 feature = (line.split('\t')[8]).split(';') | 153 feature = (line.split('\t')[8]).split(';') |
153 # first line is already gene line : | 154 # first line is already gene line : |
154 if line.split('\t')[2] == 'gene' : | 155 if line.split('\t')[2] == 'gene' : |
155 gene = feature[0].replace("ID=","") | 156 gene = feature[0].replace("ID=","") |
157 curent_gene = gene | |
156 if 'Name' in line : | 158 if 'Name' in line : |
157 regex = re.compile('(Name=)([^;]*);') | 159 regex = re.compile('(Name=)([^;]*);?') |
158 res = regex.search(line.split('\t')[8]) | 160 res = regex.search(line.split('\t')[8]) |
159 Name = res.group(2) | 161 Name = res.group(2) |
160 Name = Name.rstrip() | 162 Name = Name.rstrip() |
161 else : | 163 else : |
162 Name = "Unknown" | 164 Name = "Unknown" |
163 ##get annotation | 165 ##get annotation |
164 if 'Note' in line : | 166 if 'Note' in line : |
165 regex = re.compile('(Note=)([^;]*);') | 167 regex = re.compile('(Note=)([^;]*);?') |
166 res = regex.search(line.split('\t')[8]) | 168 res = regex.search(line.split('\t')[8]) |
167 note = res.group(2) | 169 note = res.group(2) |
168 note = urllib.unquote(str(note)).replace("\n","") | 170 note = urllib.unquote(str(note)).replace("\n","") |
169 else : | 171 else : |
170 note = "" | 172 note = "" |
178 GFF[gene]['name'] = Name | 180 GFF[gene]['name'] = Name |
179 GFF[gene]['note'] = note | 181 GFF[gene]['note'] = note |
180 GFF[gene]['exon'] = {} | 182 GFF[gene]['exon'] = {} |
181 GFF[gene]['exon_number'] = 0 | 183 GFF[gene]['exon_number'] = 0 |
182 #print Name | 184 #print Name |
185 elif line.split('\t')[2] == 'mRNA' : | |
186 regex = re.compile('(Parent=)([^;]*);?') | |
187 res = regex.search(line.split('\t')[8]) | |
188 gene_name = res.group(2) | |
189 regex = re.compile('(ID=)([^;]*);?') | |
190 res = regex.search(line.split('\t')[8]) | |
191 mRNA_name = res.group(2) | |
192 if gene not in mRNA.viewvalues() and gene_name == curent_gene : | |
193 mRNA[mRNA_name] = gene_name | |
194 | |
183 elif line.split('\t')[2] == 'CDS' : | 195 elif line.split('\t')[2] == 'CDS' : |
184 regex = re.compile('(Parent=)([^;]*);') | 196 regex = re.compile('(Parent=)([^;]*);?') |
185 res = regex.search(line.split('\t')[8]) | 197 res = regex.search(line.split('\t')[8]) |
186 gene = res.group(2) | 198 gene = res.group(2) |
199 | |
187 if 'mRNA' in gene: | 200 if 'mRNA' in gene: |
188 gene = re.sub(r"(.*)(\_mRNA)", r"\1", gene) | 201 gene = re.sub(r"(.*)(\_mRNA)", r"\1", gene) |
202 if mRNA.has_key(gene) and GFF.has_key(mRNA[gene]): | |
203 | |
204 gene = gene_name | |
205 | |
189 if GFF.has_key(gene) : | 206 if GFF.has_key(gene) : |
190 GFF[gene]['exon_number'] += 1 | 207 GFF[gene]['exon_number'] += 1 |
191 exon_number = GFF[gene]['exon_number'] | 208 exon_number = GFF[gene]['exon_number'] |
192 GFF[gene]['exon'][exon_number] = {} | 209 GFF[gene]['exon'][exon_number] = {} |
193 GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7] | 210 GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7] |
194 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) | 211 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) |
195 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) | 212 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) |
196 | 213 |
197 ## if there is a five prim UTR intron, we change start of gene | 214 ## if there is a five prim UTR intron, we change start of gene |
198 elif line.split('\t')[2] == 'five_prime_UTR_intron' : | 215 elif line.split('\t')[2] == 'five_prime_UTR_intron' : |
199 if GFF[gene]['strand'] == "+" : | 216 if GFF[gene]['strand'] == "+" : |
200 GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] | 217 GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] |
201 else : | 218 else : |