# HG changeset patch # User galaxyp # Date 1496933694 14400 # Node ID 63c48a6eddb6fb2f152dc3b6cbc9956f05a01e56 # Parent a0a5aa56d29c064f42e77413d02df135ccac6771 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/bumbershoot/custom_pro_db commit 2174137cf8a15deefed5910ffa152c4ce9c81af6 diff -r a0a5aa56d29c -r 63c48a6eddb6 data_manager/customProDB_annotation.R --- a/data_manager/customProDB_annotation.R Tue Mar 14 14:11:41 2017 -0400 +++ b/data_manager/customProDB_annotation.R Thu Jun 08 10:54:54 2017 -0400 @@ -12,10 +12,13 @@ suppressPackageStartupMessages(library("optparse")) suppressPackageStartupMessages(library("RGalaxy")) +suppressPackageStartupMessages(library("GetoptLong")) option_list <- list() option_list$dbkey <- make_option('--dbkey', type='character') +option_list$ensembl_host <- make_option('--ensembl_host', type='character') +option_list$ensembl_dataset <- make_option('--ensembl_dataset', type='character') option_list$dbsnp <- make_option('--dbsnp', type='character') option_list$cosmic <- make_option('--cosmic', type='logical') option_list$outputFile <- make_option('--outputFile', type='character') @@ -25,20 +28,29 @@ customProDB_annotation <- function( - dbkey = GalaxyCharacterParam(required=TRUE), + dbkey = GalaxyCharacterParam(required=FALSE), + ensembl_host = GalaxyCharacterParam(required=FALSE), + ensembl_dataset = GalaxyCharacterParam(required=FALSE), dbsnp_str = GalaxyCharacterParam(required=FALSE), cosmic = GalaxyLogicalParam(required=FALSE), dbkey_description = GalaxyCharacterParam(required=FALSE), outputFile = GalaxyOutput("output","json")) { + options(stringsAsFactors = FALSE, gsubfn.engine = "R") + if (!file.exists(outputFile)) { gstop("json params file does not exist") } - if (length(dbkey_description) < 1) + if (length(dbkey)+length(ensembl_dataset)+length(ensembl_host) == 0) { - dbkey_description = dbkey + gstop("one of the genome annotation sources must be specified; either dbkey or host and dataset") 
+ } + else if (length(dbkey) > 0 && + (length(ensembl_dataset) > 0 || length(ensembl_host) > 0)) + { + gstop("only one genome annotation source can be specified; either dbkey or host and dataset") } if (length(dbsnp_str) > 0) @@ -53,7 +65,8 @@ use_cosmic = FALSE if (length(cosmic) > 0) { - if (grepl("^hg", dbkey)) + if (length(dbkey) > 0 && grepl("^hg", dbkey) || + length(ensembl_dataset) > 0 && grepl("^hsapiens", ensembl_dataset)) { use_cosmic = TRUE } @@ -76,26 +89,96 @@ gstop("failed to remove json params file after reading") }) - ucscTableCodingFastaURL = paste("http://genome.ucsc.edu/cgi-bin/hgTables?db=", dbkey, "&hgSeq.cdsExon=on&hgSeq.granularity=gene&hgSeq.casing=exon&hgSeq.repMasking=lower&hgta_doGenomicDna=get+sequence&hgta_group=genes&hgta_track=refGene&hgta_table=refGene&hgta_regionType=genome", sep="") - ucscTableProteinFastaURL = paste("http://genome.ucsc.edu/cgi-bin/hgTables?db=", dbkey, "&hgta_geneSeqType=protein&hgta_doGenePredSequence=submit&hgta_track=refGene&hgta_table=refGene", sep="") - codingFastaFilepath = paste(target_directory, "/", dbkey, ".cds.fa", sep="") - proteinFastaFilepath = paste(target_directory, "/", dbkey, ".protein.fa", sep="") + # load customProDB from GitHub (NOTE: downloading the zip is faster than cloning the repo with git2r or devtools::install_github) + download.file("https://github.com/chambm/customProDB/archive/c57e5498392197bc598a18c26acb70d7530a921cc57e5498.zip", "customProDB.zip", quiet=TRUE) + unzip("customProDB.zip") + devtools::load_all("customProDB-c57e5498392197bc598a18c26acb70d7530a921c") - suppressPackageStartupMessages(library(customProDB)) + #suppressPackageStartupMessages(library(customProDB)) options(timeout=3600) - cat(paste("Downloading coding FASTA from:", ucscTableCodingFastaURL, "\n")) - download.file(ucscTableCodingFastaURL, codingFastaFilepath, quiet=T, mode='wb') + # download protein and coding sequences for UCSC annotation + if (length(dbkey) > 0) + { + proteinFastaFilepath = paste(dbkey, 
".protein.fa", sep="") + + cat(paste("Downloading protein FASTA from:", getProteinFastaUrlFromUCSC(dbkey), "\n")) + download.file(getProteinFastaUrlFromUCSC(dbkey), proteinFastaFilepath, quiet=T, mode='wb') + + local_cache_path = paste0("customProDB_annotation_", dbkey, "-", tools::md5sum(proteinFastaFilepath)[[1]]) + codingFastaFilepath = paste0(local_cache_path, "/", dbkey, ".cds.fa") + dir.create(local_cache_path, showWarnings=FALSE) + + if (!file.exists(codingFastaFilepath)) { + cat(paste("Downloading coding FASTA from:", getCodingFastaUrlFromUCSC(dbkey), "\n")) + download.file(getCodingFastaUrlFromUCSC(dbkey), codingFastaFilepath, quiet=T, mode='wb') + } + + cat(paste("Preparing Refseq annotation files\n")) + PrepareAnnotationRefseq(genome=dbkey, CDSfasta=codingFastaFilepath, pepfasta=proteinFastaFilepath, annotation_path=target_directory, dbsnp=dbsnp, COSMIC=use_cosmic, local_cache_path=local_cache_path) - cat(paste("Downloading protein FASTA from:", ucscTableProteinFastaURL, "\n")) - download.file(ucscTableProteinFastaURL, proteinFastaFilepath, quiet=T, mode='wb') + if (length(dbkey_description) < 1) + { + dbkey_description = dbkey + } + } + else + { + local_cache_path = paste0("customProDB_annotation_", ensembl_dataset, "_", ensembl_host) + + suppressPackageStartupMessages(library(biomaRt)) + cat(paste("Preparing Ensembl annotation files\n")) + ensembl_mart = useMart("ENSEMBL_MART_ENSEMBL", dataset=ensembl_dataset, host=ensembl_host) + PrepareAnnotationEnsembl(mart=ensembl_mart, annotation_path=target_directory, dbsnp=dbsnp, COSMIC=use_cosmic, local_cache_path=local_cache_path) + + metadata = sqldf::sqldf("SELECT value FROM metadata WHERE name='BioMart database version' OR name='BioMart dataset description' OR name='BioMart dataset version'", + dbname=file.path(target_directory, "txdb.sqlite")) + version = metadata$value[1] # Ensembl Genes 87 + assembly = metadata$value[3] + dbkey = paste0(ensembl_dataset, "_", sub(".*?(\\d+)", "\\1", version, perl=TRUE)) - 
cat(paste("Preparing Refseq annotation files\n")) - customProDB::PrepareAnnotationRefseq(genome=dbkey, CDSfasta=codingFastaFilepath, pepfasta=proteinFastaFilepath, annotation_path=target_directory, dbsnp=dbsnp, COSMIC=use_cosmic) - - outputPath = paste(dbkey, "/customProDB", sep="") + # convert Ensembl chromosome names to UCSC for Galaxy compatibility + chromosomeMappingsBaseUrl = "https://raw.githubusercontent.com/dpryan79/ChromosomeMappings/master" + assemblyNoGrcPatch = sub("(\\S+?)(\\.p\\S+)?$", "\\1", assembly, perl=TRUE) + chromosomeMappingsUrl = qq("@{chromosomeMappingsBaseUrl}/@{assemblyNoGrcPatch}_ensembl2UCSC.txt") + if (RCurl::url.exists(chromosomeMappingsUrl)) + { + cat(qq("Converting Ensembl chromosome names from: @{chromosomeMappingsUrl}\n")) + e2u = read.delim(chromosomeMappingsUrl, header=FALSE, col.names=c("ensembl", "ucsc")) + e2u = setNames(as.list(e2u$ucsc), e2u$ensembl) + load(file.path(target_directory, "exon_anno.RData")) + exon$chromosome_name = sapply(exon$chromosome_name, function(x) e2u[[as.character(x)]]) + exon = exon[nzchar(exon$chromosome_name), ] # omit genome patches with no mapping + save(exon, file=file.path(target_directory, "exon_anno.RData")) + } + else + { + gwarning(qq("unable to convert Ensembl chromosome names to UCSC; mapping file @{assemblyNoGrcPatch}_ensembl2UCSC.txt does not exist")) + } + + if (length(dbkey_description) < 1) + { + dbkey_description = qq("@{ensembl_dataset} (@{version}) (@{assembly})") + } + } + + qualified_dbkey = dbkey + + if (length(dbsnp_str) > 0 && nzchar(dbsnp_str)) + { + qualified_dbkey = qq("@{qualified_dbkey}_db@{dbsnp_str}") + dbkey_description = qq("@{dbkey_description} (db@{dbsnp_str})") + } + + if (length(cosmic) > 0) + { + qualified_dbkey = qq("@{qualified_dbkey}_cosmic") + dbkey_description = qq("@{dbkey_description} (COSMIC)") + } + + outputPath = paste0(qualified_dbkey, "/customProDB") output = list(data_tables = list()) - output[["data_tables"]][["customProDB"]]=c(path=outputPath, 
name=dbkey_description, dbkey=dbkey, value=dbkey) + output[["data_tables"]][["customProDB"]]=c(path=outputPath, name=dbkey_description, dbkey=qualified_dbkey, value=qualified_dbkey) write(toJSON(output), file=outputFile) } diff -r a0a5aa56d29c -r 63c48a6eddb6 data_manager/customProDB_annotation.xml --- a/data_manager/customProDB_annotation.xml Tue Mar 14 14:11:41 2017 -0400 +++ b/data_manager/customProDB_annotation.xml Thu Jun 08 10:54:54 2017 -0400 @@ -1,7 +1,25 @@ - + builder - bioconductor-customprodb + r-base + + bioconductor-rgalaxy + bioconductor-biocinstaller + bioconductor-variantannotation + bioconductor-genomicfeatures + r-devtools + r-xml + r-rmysql + r-testthat + r-getoptlong + r-stringi + r-stringr + r-data.table + r-sqldf + r-gsubfn + r-chron + r-proto + r-plyr @@ -9,17 +27,43 @@ 1 + --outputFile '${out_file}' + + #if str($transcriptome_annotation.source) == 'refseq': + --dbkey '${transcriptome_annotation.dbkey}' + --dbkey_description '${ transcriptome_annotation.dbkey.get_display_text().strip("\"'") }' + #else: + --ensembl_dataset '${transcriptome_annotation.ensembl_dataset.fields.dataset}' + --ensembl_host '${transcriptome_annotation.ensembl_dataset.fields.host}' + --dbkey_description '${transcriptome_annotation.ensembl_dataset.fields.name}' + #end if + + --dbsnp '${dbsnp}' + $cosmic + 2>&1 ]]> - - + + + + + + + + + + + + + + + + + + + + @@ -29,8 +73,12 @@ .. class:: infomark -**Notice:** If you leave name, description, or id blank, it will be generated automatically. +This data manager creates the transcriptome annotation in the RData format needed by customProDB. +Two annotation sources are supported: UCSC and Ensembl. +Note that because UCSC's table browser only provides current gene annotations for a given genome assembly, +only the Ensembl annotation is entirely reproducible, i.e. running again with the same settings next month will create the same annotation. +Ensembl chromosome names (1,2, ...) are converted to UCSC format (chr1,chr2, ...) 
to ease integration with other Galaxy tools. 10.1093/bioinformatics/btt543 diff -r a0a5aa56d29c -r 63c48a6eddb6 ensembl_datasets.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ensembl_datasets.loc.sample Thu Jun 08 10:54:54 2017 -0400 @@ -0,0 +1,113 @@ +aaegypti_gene_ensembl may2009.archive.ensembl.org A.aegypti genes (Ensembl 54 aaegypti) (AaegL1) +amelanoleuca_gene_ensembl may2017.archive.ensembl.org Panda genes (Ensembl 89 amelanoleuca) (ailMel1) +aplatyrhynchos_gene_ensembl may2017.archive.ensembl.org Duck genes (Ensembl 89 aplatyrhynchos) (BGI_duck_1.0) +acarolinensis_gene_ensembl may2017.archive.ensembl.org Anole lizard genes (Ensembl 89 acarolinensis) (AnoCar2.0) +acarolinensis_gene_ensembl may2009.archive.ensembl.org Anole lizard genes (Ensembl 54 acarolinensis) (AnoCar1.0) +agambiae_gene_ensembl may2009.archive.ensembl.org Mosquito genes (Ensembl 54 agambiae) (AgamP3) +amexicanus_gene_ensembl may2017.archive.ensembl.org Cave fish genes (Ensembl 89 amexicanus) (AstMex102) +btaurus_gene_ensembl may2017.archive.ensembl.org Cow genes (Ensembl 89 btaurus) (UMD3.1) +btaurus_gene_ensembl may2009.archive.ensembl.org Cow genes (Ensembl 54 btaurus) (Btau_4.0) +celegans_gene_ensembl may2017.archive.ensembl.org C.elegans genes (Ensembl 89 celegans) (WBcel235) +celegans_gene_ensembl may2012.archive.ensembl.org C.elegans genes (Ensembl 67 celegans) (WBcel215) +celegans_gene_ensembl may2009.archive.ensembl.org C.elegans genes (Ensembl 54 celegans) (WS190) +cjacchus_gene_ensembl may2017.archive.ensembl.org Marmoset genes (Ensembl 89 cjacchus) (C_jacchus3.2.1) +cfamiliaris_gene_ensembl may2017.archive.ensembl.org Dog genes (Ensembl 89 cfamiliaris) (CanFam3.1) +cfamiliaris_gene_ensembl may2012.archive.ensembl.org Dog genes (Ensembl 67 cfamiliaris) (CanFam 2.0) +tsyrichta_gene_ensembl may2017.archive.ensembl.org Tarsier genes (Ensembl 89 tsyrichta) (tarSyr1) +cporcellus_gene_ensembl may2017.archive.ensembl.org Guinea Pig genes (Ensembl 89 cporcellus) (cavPor3) 
+csabaeus_gene_ensembl may2017.archive.ensembl.org Vervet-AGM genes (Ensembl 89 csabaeus) (ChlSab1.1) +choffmanni_gene_ensembl may2017.archive.ensembl.org Sloth genes (Ensembl 89 choffmanni) (choHof1) +cintestinalis_gene_ensembl may2017.archive.ensembl.org C.intestinalis genes (Ensembl 89 cintestinalis) (KH) +cintestinalis_gene_ensembl may2009.archive.ensembl.org C.intestinalis genes (Ensembl 54 cintestinalis) (JGI 2) +csavignyi_gene_ensembl may2017.archive.ensembl.org C.savignyi genes (Ensembl 89 csavignyi) (CSAV 2.0) +drerio_gene_ensembl may2017.archive.ensembl.org Zebrafish genes (Ensembl 89 drerio) (GRCz10) +drerio_gene_ensembl mar2015.archive.ensembl.org Zebrafish genes (Ensembl 79 drerio) (Zv9) +drerio_gene_ensembl may2009.archive.ensembl.org Zebrafish genes (Ensembl 54 drerio) (Zv8) +dnovemcinctus_gene_ensembl may2017.archive.ensembl.org Armadillo genes (Ensembl 89 dnovemcinctus) (Dasnov3.0) +dnovemcinctus_gene_ensembl may2012.archive.ensembl.org Armadillo genes (Ensembl 67 dnovemcinctus) (dasNov2) +dnovemcinctus_gene_ensembl may2009.archive.ensembl.org Armadillo genes (Ensembl 54 dnovemcinctus) (ARMA) +dordii_gene_ensembl may2017.archive.ensembl.org Kangaroo rat genes (Ensembl 89 dordii) (dipOrd1) +dmelanogaster_gene_ensembl may2017.archive.ensembl.org Fly genes (Ensembl 89 dmelanogaster) (BDGP6) +dmelanogaster_gene_ensembl dec2014.archive.ensembl.org Fly genes (Ensembl 78 dmelanogaster) (BDGP 5) +dmelanogaster_gene_ensembl may2009.archive.ensembl.org Fly genes (Ensembl 54 dmelanogaster) (BDGP 5.4) +etelfairi_gene_ensembl may2017.archive.ensembl.org Tenrec genes (Ensembl 89 etelfairi) (TENREC) +ecaballus_gene_ensembl may2017.archive.ensembl.org Horse genes (Ensembl 89 ecaballus) (Equ Cab 2) +eeuropaeus_gene_ensembl may2017.archive.ensembl.org Hedgehog genes (Ensembl 89 eeuropaeus) (eriEur1) +fcatus_gene_ensembl may2017.archive.ensembl.org Cat genes (Ensembl 89 fcatus) (Felis_catus_6.2) +fcatus_gene_ensembl may2012.archive.ensembl.org Cat genes (Ensembl 67 
fcatus) (CAT) +falbicollis_gene_ensembl may2017.archive.ensembl.org Collared flycatcher genes (Ensembl 89 falbicollis) (FicAlb_1.4) +gmorhua_gene_ensembl may2017.archive.ensembl.org Cod genes (Ensembl 89 gmorhua) (gadMor1) +ggallus_gene_ensembl may2017.archive.ensembl.org Chicken genes (Ensembl 89 ggallus) (Gallus_gallus-5.0) +ggallus_gene_ensembl jul2016.archive.ensembl.org Chicken genes (Ensembl 85 ggallus) (Galgal4) +ggallus_gene_ensembl may2012.archive.ensembl.org Chicken genes (Ensembl 67 ggallus) (WASHUC2) +gaculeatus_gene_ensembl may2017.archive.ensembl.org Stickleback genes (Ensembl 89 gaculeatus) (BROAD S1) +ggorilla_gene_ensembl may2017.archive.ensembl.org Gorilla genes (Ensembl 89 ggorilla) (gorGor3.1) +ggorilla_gene_ensembl may2009.archive.ensembl.org Gorilla genes (Ensembl 54 ggorilla) (gorGor1) +hsapiens_gene_ensembl may2017.archive.ensembl.org Human genes (Ensembl 89 hsapiens) (GRCh38.p10) +hsapiens_gene_ensembl mar2017.archive.ensembl.org Human genes (Ensembl 88 hsapiens) (GRCh38.p7) +hsapiens_gene_ensembl mar2016.archive.ensembl.org Human genes (Ensembl 84 hsapiens) (GRCh38.p5) +hsapiens_gene_ensembl sep2015.archive.ensembl.org Human genes (Ensembl 82 hsapiens) (GRCh38.p3) +hsapiens_gene_ensembl may2015.archive.ensembl.org Human genes (Ensembl 80 hsapiens) (GRCh38.p2) +hsapiens_gene_ensembl dec2014.archive.ensembl.org Human genes (Ensembl 78 hsapiens) (GRCh38) +hsapiens_gene_ensembl feb2014.archive.ensembl.org Human genes (Ensembl 75 hsapiens) (GRCh37.p13) +hsapiens_gene_ensembl may2012.archive.ensembl.org Human genes (Ensembl 67 hsapiens) (GRCh37.p7) +hsapiens_gene_ensembl may2009.archive.ensembl.org Human genes (Ensembl 54 hsapiens) (NCBI 36) +itridecemlineatus_gene_ensembl may2017.archive.ensembl.org Squirrel genes (Ensembl 89 itridecemlineatus) (spetri2) +itridecemlineatus_gene_ensembl may2009.archive.ensembl.org Squirrel genes (Ensembl 54 itridecemlineatus) (speTri1) +lchalumnae_gene_ensembl may2017.archive.ensembl.org Coelacanth genes 
(Ensembl 89 lchalumnae) (LatCha1) +loculatus_gene_ensembl may2017.archive.ensembl.org Spotted gar genes (Ensembl 89 loculatus) (LepOcu1) +lafricana_gene_ensembl may2017.archive.ensembl.org Elephant genes (Ensembl 89 lafricana) (Loxafr3.0) +lafricana_gene_ensembl may2009.archive.ensembl.org Elephant genes (Ensembl 54 lafricana) (BROAD E1) +mmulatta_gene_ensembl may2017.archive.ensembl.org Macaque genes (Ensembl 89 mmulatta) (Mmul_8.0.1) +mmulatta_gene_ensembl jul2016.archive.ensembl.org Macaque genes (Ensembl 85 mmulatta) (MMUL 1.0) +mgallopavo_gene_ensembl may2017.archive.ensembl.org Turkey genes (Ensembl 89 mgallopavo) (Turkey_2.01) +mmurinus_gene_ensembl may2017.archive.ensembl.org Mouse lemur genes (Ensembl 89 mmurinus) (Mmur_2.0) +mmurinus_gene_ensembl jul2016.archive.ensembl.org Mouse lemur genes (Ensembl 85 mmurinus) (micMur1) +mdomestica_gene_ensembl may2017.archive.ensembl.org Opossum genes (Ensembl 89 mdomestica) (monDom5) +mmusculus_gene_ensembl may2017.archive.ensembl.org Mouse genes (Ensembl 89 mmusculus) (GRCm38.p5) +mmusculus_gene_ensembl oct2016.archive.ensembl.org Mouse genes (Ensembl 86 mmusculus) (GRCm38.p4) +mmusculus_gene_ensembl may2015.archive.ensembl.org Mouse genes (Ensembl 80 mmusculus) (GRCm38.p3) +mmusculus_gene_ensembl oct2014.archive.ensembl.org Mouse genes (Ensembl 77 mmusculus) (GRCm38.p2) +mmusculus_gene_ensembl may2012.archive.ensembl.org Mouse genes (Ensembl 67 mmusculus) (NCBI m37) +mfuro_gene_ensembl may2017.archive.ensembl.org Domestic ferret genes (Ensembl 89 mfuro) (MusPutFur1.0) +mlucifugus_gene_ensembl may2017.archive.ensembl.org Microbat genes (Ensembl 89 mlucifugus) (Myoluc2.0) +mlucifugus_gene_ensembl may2009.archive.ensembl.org Microbat genes (Ensembl 54 mlucifugus) (myoLuc1) +nleucogenys_gene_ensembl may2017.archive.ensembl.org Gibbon genes (Ensembl 89 nleucogenys) (Nleu1.0) +meugenii_gene_ensembl may2017.archive.ensembl.org Wallaby genes (Ensembl 89 meugenii) (Meug_1.0) +oprinceps_gene_ensembl 
may2017.archive.ensembl.org Pika genes (Ensembl 89 oprinceps) (OchPri2.0-Ens) +oprinceps_gene_ensembl mar2017.archive.ensembl.org Pika genes (Ensembl 88 oprinceps) (OchPri2.0) +oniloticus_gene_ensembl may2017.archive.ensembl.org Tilapia genes (Ensembl 89 oniloticus) (Orenil1.0) +oanatinus_gene_ensembl may2017.archive.ensembl.org Platypus genes (Ensembl 89 oanatinus) (OANA5) +ocuniculus_gene_ensembl may2017.archive.ensembl.org Rabbit genes (Ensembl 89 ocuniculus) (OryCun2.0) +ocuniculus_gene_ensembl may2009.archive.ensembl.org Rabbit genes (Ensembl 54 ocuniculus) (RABBIT) +olatipes_gene_ensembl may2017.archive.ensembl.org Medaka genes (Ensembl 89 olatipes) (HdrR) +ogarnettii_gene_ensembl may2017.archive.ensembl.org Bushbaby genes (Ensembl 89 ogarnettii) (OtoGar3) +ogarnettii_gene_ensembl may2009.archive.ensembl.org Bushbaby genes (Ensembl 54 ogarnettii) (otoGar1) +oaries_gene_ensembl may2017.archive.ensembl.org Sheep genes (Ensembl 89 oaries) (Oar_v3.1) +ptroglodytes_gene_ensembl may2017.archive.ensembl.org Chimp genes (Ensembl 89 ptroglodytes) (CHIMP2.1.4) +ptroglodytes_gene_ensembl may2009.archive.ensembl.org Chimp genes (Ensembl 54 ptroglodytes) (CHIMP2.1) +panubis_gene_ensembl may2017.archive.ensembl.org Olive Baboon genes (Ensembl 89 panubis) (PapAnu2.0) +psinensis_gene_ensembl may2017.archive.ensembl.org Chinese softshell turtle genes (Ensembl 89 psinensis) (PelSin_1.0) +pmarinus_gene_ensembl may2017.archive.ensembl.org Lamprey genes (Ensembl 89 pmarinus) (Pmarinus_7.0) +pformosa_gene_ensembl may2017.archive.ensembl.org Amazon molly genes (Ensembl 89 pformosa) (Poecilia_formosa-5.1.2) +pabelii_gene_ensembl may2017.archive.ensembl.org Orangutan genes (Ensembl 89 pabelii) (PPYG2) +pcapensis_gene_ensembl may2017.archive.ensembl.org Rock hyrax genes (Ensembl 89 pcapensis) (proCap1) +pvampyrus_gene_ensembl may2017.archive.ensembl.org Megabat genes (Ensembl 89 pvampyrus) (pteVam1) +rnorvegicus_gene_ensembl may2017.archive.ensembl.org Rat genes (Ensembl 89 
rnorvegicus) (Rnor_6.0) +rnorvegicus_gene_ensembl mar2015.archive.ensembl.org Rat genes (Ensembl 79 rnorvegicus) (Rnor_5.0) +rnorvegicus_gene_ensembl may2012.archive.ensembl.org Rat genes (Ensembl 67 rnorvegicus) (RGSC 3.4) +scerevisiae_gene_ensembl may2017.archive.ensembl.org S.cerevisiae genes (Ensembl 89 scerevisiae) (R64-1-1) +scerevisiae_gene_ensembl dec2013.archive.ensembl.org S.cerevisiae genes (Ensembl 74 scerevisiae) (EF 4) +scerevisiae_gene_ensembl may2009.archive.ensembl.org S.cerevisiae genes (Ensembl 54 scerevisiae) (SGD1.01) +sharrisii_gene_ensembl may2017.archive.ensembl.org Tasmanian Devil genes (Ensembl 89 sharrisii) (Devil_ref v7.0) +saraneus_gene_ensembl may2017.archive.ensembl.org Shrew genes (Ensembl 89 saraneus) (sorAra1) +sscrofa_gene_ensembl may2017.archive.ensembl.org Pig genes (Ensembl 89 sscrofa) (Sscrofa10.2) +tguttata_gene_ensembl may2017.archive.ensembl.org Zebra finch genes (Ensembl 89 tguttata) (taeGut3.2.4) +trubripes_gene_ensembl may2017.archive.ensembl.org Fugu genes (Ensembl 89 trubripes) (FUGU 4.0) +tnigroviridis_gene_ensembl may2017.archive.ensembl.org Tetraodon genes (Ensembl 89 tnigroviridis) (TETRAODON 8.0) +tbelangeri_gene_ensembl may2017.archive.ensembl.org Tree Shrew genes (Ensembl 89 tbelangeri) (tupBel1) +ttruncatus_gene_ensembl may2017.archive.ensembl.org Dolphin genes (Ensembl 89 ttruncatus) (turTru1) +vpacos_gene_ensembl may2017.archive.ensembl.org Alpaca genes (Ensembl 89 vpacos) (vicPac1) +xtropicalis_gene_ensembl may2017.archive.ensembl.org Xenopus genes (Ensembl 89 xtropicalis) (JGI 4.2) +xtropicalis_gene_ensembl may2009.archive.ensembl.org Xenopus genes (Ensembl 54 xtropicalis) (JGI 4.1) +xmaculatus_gene_ensembl may2017.archive.ensembl.org Platyfish genes (Ensembl 89 xmaculatus) (Xipmac4.4.2) diff -r a0a5aa56d29c -r 63c48a6eddb6 tool-data/update_ensembl_datasets.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/update_ensembl_datasets.R Thu Jun 08 10:54:54 2017 -0400 @@ -0,0 +1,58 @@ +## +## Run 
this script to update the table of Ensembl assemblies available in the customProDB annotation data manager (ensembl_datasets.loc)
+##
+
+library(RMySQL)
+library(httr)
+library(biomaRt)
+library(stringdist)
+
+con = dbConnect(MySQL(), host="ensembldb.ensembl.org", user="anonymous")
+archives = dbGetQuery(con, "SHOW DATABASES LIKE 'ensembl_archive_%'")
+dbDisconnect(con)
+
+latestArchive = tail(archives[,1], 1)
+con = dbConnect(MySQL(), host="ensembldb.ensembl.org", user="anonymous", dbname=latestArchive)
+assemblies = dbGetQuery(con, "SELECT s.name, s.common_name, rs.assembly_name, MAX(rs.release_id) AS latest_release, r.date
+                              FROM species as s, release_species as rs, ens_release as r
+                              WHERE s.species_id = rs.species_id AND r.release_id = rs.release_id AND r.online = 'Y'
+                              AND r.release_id < 10000 -- ignore 10075 (the special GRCh37 site)
+                              GROUP BY rs.assembly_name
+                              ORDER BY s.common_name, rs.release_id")
+dbDisconnect(con) # release the archive connection once the query result is in memory
+allReleases = assemblies$latest_release
+
+# Resolve each release alias (e<release>.ensembl.org) to the host it redirects to; the result is named by alias
+urlRedirectMap = vapply(paste0("e", unique(allReleases), ".ensembl.org"), function(url) XML::parseURI(HEAD(url)$url)$server, character(1))
+
+## NOTE ## Keep "may2017" below in sync with the latest Ensembl archive; it replaces the "www." prefix of redirected host names
+assemblies$url = sub("www.", "may2017.archive.", urlRedirectMap[paste0("e", allReleases, ".ensembl.org")], fixed=TRUE)
+
+# Get all datasets from the archives
+datasets = c()
+for (archive in unique(assemblies$url)) {
+  datasets = unique(c(datasets, listDatasets(useMart("ensembl", host=archive))$dataset))
+}
+datasets = sub("_gene_ensembl", "", datasets, fixed=TRUE)
+
+# Match the assembly species names to the datasets (using amatch() because of cases like Mustela_putorius_furo -> mfuro)
+assemblies$dataset_id = datasets[amatch(tolower(assemblies$name), datasets, maxDist=3, method="osa", weight=c(0.1, 1, 1, 1))]
+
+# Remove mouse strains (would need to add these from ENSEMBL_MOUSE_MART)
+assemblies = assemblies[!grepl("Mus_musculus_\\S+", assemblies$name, perl=TRUE),] # logical mask: no match keeps every row
+
+# Remove unmatched assemblies (e.g. Mus spretus)
+assemblies = assemblies[!is.na(assemblies$dataset_id),] # logical mask: no NA keeps every row
+
+# Replace underscores in scientific name
+assemblies$name = gsub("_", " ", assemblies$name, fixed=TRUE)
+
+# Sort assemblies first by scientific name, then descending by latest release for that assembly
+assemblies = assemblies[order(assemblies$name, -assemblies$latest_release),]
+
+# Write dataset table (3 tab-separated columns: dataset_id, host, description; no header row, no quoting)
+dataset_id = paste0(assemblies$dataset_id, "_gene_ensembl")
+host = paste0(assemblies$url)
+description = paste0(assemblies$common_name, " genes (Ensembl ", assemblies$latest_release, " ", assemblies$dataset_id,
+                     ") (", assemblies$assembly_name, ")")
+writeLines(paste(dataset_id, host, description, sep="\t"), "ensembl_datasets.loc.sample")