Mercurial > repos > rlegendre > ribo_tools
comparison ribo_functions.py @ 18:a121cce43f90 draft
Uploaded
| author | rlegendre |
|---|---|
| date | Tue, 09 Jun 2015 09:06:17 -0400 |
| parents | c87c40e642af |
| children |
comparison
equal
deleted
inserted
replaced
| 17:c87c40e642af | 18:a121cce43f90 |
|---|---|
| 139 ''' | 139 ''' |
| 140 parse and store gff file in a dictionnary | 140 parse and store gff file in a dictionnary |
| 141 ''' | 141 ''' |
| 142 try: | 142 try: |
| 143 GFF = {} | 143 GFF = {} |
| 144 mRNA = {} | |
| 144 with open(gff, 'r') as f_gff : | 145 with open(gff, 'r') as f_gff : |
| 145 | 146 |
| 146 GFF['order'] = [] | 147 GFF['order'] = [] |
| 147 #for line in itertools.islice( f_gff, 25 ): | 148 #for line in itertools.islice( f_gff, 25 ): |
| 148 for line in f_gff: | 149 for line in f_gff: |
| 151 if line != "" : | 152 if line != "" : |
| 152 feature = (line.split('\t')[8]).split(';') | 153 feature = (line.split('\t')[8]).split(';') |
| 153 # first line is already gene line : | 154 # first line is already gene line : |
| 154 if line.split('\t')[2] == 'gene' : | 155 if line.split('\t')[2] == 'gene' : |
| 155 gene = feature[0].replace("ID=","") | 156 gene = feature[0].replace("ID=","") |
| 157 curent_gene = gene | |
| 156 if 'Name' in line : | 158 if 'Name' in line : |
| 157 regex = re.compile('(Name=)([^;]*);') | 159 regex = re.compile('(Name=)([^;]*);?') |
| 158 res = regex.search(line.split('\t')[8]) | 160 res = regex.search(line.split('\t')[8]) |
| 159 Name = res.group(2) | 161 Name = res.group(2) |
| 160 Name = Name.rstrip() | 162 Name = Name.rstrip() |
| 161 else : | 163 else : |
| 162 Name = "Unknown" | 164 Name = "Unknown" |
| 163 ##get annotation | 165 ##get annotation |
| 164 if 'Note' in line : | 166 if 'Note' in line : |
| 165 regex = re.compile('(Note=)([^;]*);') | 167 regex = re.compile('(Note=)([^;]*);?') |
| 166 res = regex.search(line.split('\t')[8]) | 168 res = regex.search(line.split('\t')[8]) |
| 167 note = res.group(2) | 169 note = res.group(2) |
| 168 note = urllib.unquote(str(note)).replace("\n","") | 170 note = urllib.unquote(str(note)).replace("\n","") |
| 169 else : | 171 else : |
| 170 note = "" | 172 note = "" |
| 178 GFF[gene]['name'] = Name | 180 GFF[gene]['name'] = Name |
| 179 GFF[gene]['note'] = note | 181 GFF[gene]['note'] = note |
| 180 GFF[gene]['exon'] = {} | 182 GFF[gene]['exon'] = {} |
| 181 GFF[gene]['exon_number'] = 0 | 183 GFF[gene]['exon_number'] = 0 |
| 182 #print Name | 184 #print Name |
| 185 elif line.split('\t')[2] == 'mRNA' : | |
| 186 regex = re.compile('(Parent=)([^;]*);?') | |
| 187 res = regex.search(line.split('\t')[8]) | |
| 188 gene_name = res.group(2) | |
| 189 regex = re.compile('(ID=)([^;]*);?') | |
| 190 res = regex.search(line.split('\t')[8]) | |
| 191 mRNA_name = res.group(2) | |
| 192 if gene not in mRNA.viewvalues() and gene_name == curent_gene : | |
| 193 mRNA[mRNA_name] = gene_name | |
| 194 | |
| 183 elif line.split('\t')[2] == 'CDS' : | 195 elif line.split('\t')[2] == 'CDS' : |
| 184 regex = re.compile('(Parent=)([^;]*);') | 196 regex = re.compile('(Parent=)([^;]*);?') |
| 185 res = regex.search(line.split('\t')[8]) | 197 res = regex.search(line.split('\t')[8]) |
| 186 gene = res.group(2) | 198 gene = res.group(2) |
| 199 | |
| 187 if 'mRNA' in gene: | 200 if 'mRNA' in gene: |
| 188 gene = re.sub(r"(.*)(\_mRNA)", r"\1", gene) | 201 gene = re.sub(r"(.*)(\_mRNA)", r"\1", gene) |
| 202 if mRNA.has_key(gene) and GFF.has_key(mRNA[gene]): | |
| 203 | |
| 204 gene = gene_name | |
| 205 | |
| 189 if GFF.has_key(gene) : | 206 if GFF.has_key(gene) : |
| 190 GFF[gene]['exon_number'] += 1 | 207 GFF[gene]['exon_number'] += 1 |
| 191 exon_number = GFF[gene]['exon_number'] | 208 exon_number = GFF[gene]['exon_number'] |
| 192 GFF[gene]['exon'][exon_number] = {} | 209 GFF[gene]['exon'][exon_number] = {} |
| 193 GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7] | 210 GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7] |
| 194 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) | 211 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) |
| 195 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) | 212 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) |
| 196 | 213 |
| 197 ## if there is a five prim UTR intron, we change start of gene | 214 ## if there is a five prim UTR intron, we change start of gene |
| 198 elif line.split('\t')[2] == 'five_prime_UTR_intron' : | 215 elif line.split('\t')[2] == 'five_prime_UTR_intron' : |
| 199 if GFF[gene]['strand'] == "+" : | 216 if GFF[gene]['strand'] == "+" : |
| 200 GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] | 217 GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] |
| 201 else : | 218 else : |
