Mercurial > repos > rlegendre > ribo_tools
comparison ribo_functions.py @ 13:7c944fd9907e draft
release 2
| author | rlegendre |
|---|---|
| date | Thu, 09 Apr 2015 11:35:48 -0400 |
| parents | 707807fee542 |
| children | c87c40e642af |
comparison
equal
deleted
inserted
replaced
| 12:ee3a3435ce43 | 13:7c944fd9907e |
|---|---|
| 5 @author: rachel legendre | 5 @author: rachel legendre |
| 6 @copyright: rachel.legendre@igmors.u-psud.fr | 6 @copyright: rachel.legendre@igmors.u-psud.fr |
| 7 @license: GPL v3 | 7 @license: GPL v3 |
| 8 ''' | 8 ''' |
| 9 | 9 |
| 10 import sys, subprocess, re, commands, time, urllib | 10 import sys, subprocess, re, commands, time, urllib, tempfile |
| 11 from copy import copy | 11 from copy import copy |
| 12 | 12 |
| 13 | 13 |
| 14 def stop_err( msg ): | 14 def stop_err( msg ): |
| 15 sys.stderr.write( "%s\n" % msg ) | 15 sys.stderr.write( "%s\n" % msg ) |
| 125 parse and store gff file in a dictionnary | 125 parse and store gff file in a dictionnary |
| 126 ''' | 126 ''' |
| 127 try: | 127 try: |
| 128 GFF = {} | 128 GFF = {} |
| 129 with open(gff, 'r') as f_gff : | 129 with open(gff, 'r') as f_gff : |
| 130 | |
| 130 GFF['order'] = [] | 131 GFF['order'] = [] |
| 132 #for line in itertools.islice( f_gff, 25 ): | |
| 131 for line in f_gff: | 133 for line in f_gff: |
| 132 ## switch commented lines | 134 ## switch commented lines |
| 133 line = line.split("#")[0] | 135 line = line.split("#")[0] |
| 134 if line != "" : | 136 if line != "" : |
| 135 feature = (line.split('\t')[8]).split(';') | 137 feature = (line.split('\t')[8]).split(';') |
| 136 # first line is already gene line : | 138 # first line is already gene line : |
| 137 if line.split('\t')[2] == 'gene' : | 139 if line.split('\t')[2] == 'gene' : |
| 138 gene = feature[0].replace("ID=","") | 140 gene = feature[0].replace("ID=","") |
| 139 if re.search('gene',feature[2]) : | 141 if 'Name' in line : |
| 140 Name = feature[2].replace("gene=","") | 142 regex = re.compile('(Name=)([^;]*);') |
| 143 res = regex.search(line.split('\t')[8]) | |
| 144 Name = res.group(2) | |
| 145 Name = Name.rstrip() | |
| 141 else : | 146 else : |
| 142 Name = "Unknown" | 147 Name = "Unknown" |
| 143 ##get annotation | 148 ##get annotation |
| 144 note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line) | 149 if 'Note' in line : |
| 145 note = urllib.unquote(str(note)).replace("\n","") | 150 regex = re.compile('(Note=)([^;]*);') |
| 151 res = regex.search(line.split('\t')[8]) | |
| 152 note = res.group(2) | |
| 153 note = urllib.unquote(str(note)).replace("\n","") | |
| 154 else : | |
| 155 note = "" | |
| 146 ## store gene information | 156 ## store gene information |
| 147 GFF['order'].append(gene) | 157 GFF['order'].append(gene) |
| 148 GFF[gene] = {} | 158 GFF[gene] = {} |
| 149 GFF[gene]['chrom'] = line.split('\t')[0] | 159 GFF[gene]['chrom'] = line.split('\t')[0] |
| 150 GFF[gene]['start'] = int(line.split('\t')[3]) | 160 GFF[gene]['start'] = int(line.split('\t')[3]) |
| 154 GFF[gene]['note'] = note | 164 GFF[gene]['note'] = note |
| 155 GFF[gene]['exon'] = {} | 165 GFF[gene]['exon'] = {} |
| 156 GFF[gene]['exon_number'] = 0 | 166 GFF[gene]['exon_number'] = 0 |
| 157 #print Name | 167 #print Name |
| 158 elif line.split('\t')[2] == 'CDS' : | 168 elif line.split('\t')[2] == 'CDS' : |
| 159 gene = re.sub(r".?Parent\=(.+)\_mRNA?", r"\1", feature[0]) | 169 regex = re.compile('(Parent=)([^;]*);') |
| 170 res = regex.search(line.split('\t')[8]) | |
| 171 gene = res.group(2) | |
| 172 if 'mRNA' in gene: | |
| 173 gene = re.sub(r"(.*)(\_mRNA)", r"\1", gene) | |
| 160 if GFF.has_key(gene) : | 174 if GFF.has_key(gene) : |
| 161 GFF[gene]['exon_number'] += 1 | 175 GFF[gene]['exon_number'] += 1 |
| 162 exon_number = GFF[gene]['exon_number'] | 176 exon_number = GFF[gene]['exon_number'] |
| 163 GFF[gene]['exon'][exon_number] = {} | 177 GFF[gene]['exon'][exon_number] = {} |
| 164 GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7] | 178 GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7] |
| 165 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) | 179 GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) |
| 166 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) | 180 GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) |
| 167 | 181 |
| 168 ## if there is a five prim UTR intron, we change start of gene | 182 ## if there is a five prim UTR intron, we change start of gene |
| 169 elif line.split('\t')[2] == 'five_prime_UTR_intron' : | 183 elif line.split('\t')[2] == 'five_prime_UTR_intron' : |
| 170 if GFF[gene]['strand'] == "+" : | 184 if GFF[gene]['strand'] == "+" : |
| 171 GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] | 185 GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] |
| 172 else : | 186 else : |
| 173 GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop'] | 187 GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop'] |
| 174 return GFF | 188 return GFF |
| 197 line = line.split("#")[0] | 211 line = line.split("#")[0] |
| 198 if line != "" : | 212 if line != "" : |
| 199 # first line is already gene line : | 213 # first line is already gene line : |
| 200 if 'protein_coding' in line : | 214 if 'protein_coding' in line : |
| 201 ##get name | 215 ##get name |
| 202 gene = re.sub(r".+transcript_id \"([\w|-]+)\";.*", r"\1", line).rstrip() | 216 gene = re.sub(r".+transcript_id \"([\w|-|\.]+)\";.*", r"\1", line).rstrip() |
| 203 Name = re.sub(r".+gene_name \"([\w|\-|\:|\.|\(|\)]+)\";.*", r"\1", line).rstrip() | 217 Name = re.sub(r".+gene_name \"([\w|\-|\:|\.|\(|\)]+)\";.*", r"\1", line).rstrip() |
| 204 if line.split('\t')[2] == 'CDS' : | 218 if line.split('\t')[2] == 'CDS' : |
| 205 ##if its first time we get this gene | 219 ##if its first time we get this gene |
| 206 if gene not in GFF.keys() : | 220 if gene not in GFF.keys() : |
| 207 ## store gene information | 221 ## store gene information |
| 243 else : | 257 else : |
| 244 GFF[gene]['stop'] = int(line.split('\t')[4]) | 258 GFF[gene]['stop'] = int(line.split('\t')[4]) |
| 245 | 259 |
| 246 return __reverse_coordinates__(GFF) | 260 return __reverse_coordinates__(GFF) |
| 247 except Exception,e: | 261 except Exception,e: |
| 248 stop_err( 'Error during gff storage : ' + str( e ) ) | 262 stop_err( 'Error during gtf storage : ' + str( e ) ) |
| 249 | 263 |
| 250 | 264 |
| 251 ##IV protein_coding exon 307766 307789 . - . gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B"; | 265 ##IV protein_coding exon 307766 307789 . - . gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B"; |
| 252 ## exon_id "YDL083C.1"; | 266 ## exon_id "YDL083C.1"; |
| 253 ##IV protein_coding CDS 307766 307789 . - 0 gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B"; | 267 ##IV protein_coding CDS 307766 307789 . - 0 gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B"; |
| 286 Remove reads unmapped, non uniquely mapped and reads with length lower than 25 and upper than 32, and mapping quality upper than 12 | 300 Remove reads unmapped, non uniquely mapped and reads with length lower than 25 and upper than 32, and mapping quality upper than 12 |
| 287 ''' | 301 ''' |
| 288 try : | 302 try : |
| 289 header = subprocess.check_output(['samtools', 'view', '-H', bam], stderr= subprocess.PIPE) | 303 header = subprocess.check_output(['samtools', 'view', '-H', bam], stderr= subprocess.PIPE) |
| 290 #header = results.split('\n') | 304 #header = results.split('\n') |
| 291 | 305 ## tags by default |
| 306 multi_tag = "XS:i:" | |
| 307 tag = "IH:i:1" | |
| 292 # check mapper for define multiple tag | 308 # check mapper for define multiple tag |
| 293 if re.search('bowtie', header): | 309 if 'bowtie' in header: |
| 294 multi_tag = "XS:i:" | 310 multi_tag = "XS:i:" |
| 295 elif re.search('bwa', header): | 311 elif 'bwa' in header: |
| 296 multi_tag = "XT:A:R" | 312 multi_tag = "XT:A:R" |
| 313 elif 'TopHat' in header: | |
| 314 tag = "NH:i:1" | |
| 297 else : | 315 else : |
| 298 stop_err("No PG tag find in"+bam+". Please use bowtie or bwa for mapping") | 316 stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping") |
| 299 | 317 |
| 300 tmp_sam = tempfile.mktemp() | 318 tmp_sam = tempfile.mktemp() |
| 301 cmd = "samtools view %s > %s" % (bam, tmp_sam) | 319 cmd = "samtools view %s > %s" % (bam, tmp_sam) |
| 302 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE) | 320 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE) |
| 303 returncode = proc.wait() | 321 returncode = proc.wait() |
| 304 | 322 |
| 305 | 323 |
| 306 with open(tempfile.mktemp(),'w') as out : | 324 with open(tempfile.mktemp(),'w') as out : |
| 307 out.write(header) | 325 out.write(header) |
| 308 with open(tmp_sam,'r') as sam : | 326 with open(tmp_sam,'r') as sam : |
| 309 for line in sam : | 327 for line in sam : |
| 310 if multi_tag not in line and line.split('\t')[1] != '4' and int(line.split('\t')[4]) > 12 : | 328 if (multi_tag not in line or tag in line) and line.split('\t')[1] != '4' and int(line.split('\t')[4]) > 12 : |
| 311 if len(line.split('\t')[9]) < 32 and len(line.split('\t')[9]) > 25 : | 329 if len(line.split('\t')[9]) < 32 and len(line.split('\t')[9]) > 25 : |
| 312 out.write(line) | 330 out.write(line) |
| 313 bamfile = tempfile.mktemp()+'.bam' | 331 bamfile = tempfile.mktemp()+'.bam' |
| 314 cmd = "samtools view -hSb %s > %s" % (out.name,bamfile) | 332 cmd = "samtools view -hSb %s > %s" % (out.name,bamfile) |
| 315 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE) | 333 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE) |
