library(data.table)
library(reshape2)

#setwd("D:/wd/martijn/")

# Input and output paths are supplied on the command line, in this order:
#   1. all.sources.file               annotation table (header row)
#   2. phenotypic.groups.file         key: HPO code -> phenotypic group (header row)
#   3. hpo.to.omt.file                key: HPO identifier -> OMT codes (header row)
#   4. inheritance.from.group.0.file  HPO ids kept from group 0 (no header)
#   5. omt.coding.file                key: OMT code -> name (header row)
#   6. phenotypic.names.file          key: group code -> group name (no header)
#   7. final.file                     output path of the report
#
# The hard-coded default file names that used to precede this section were
# removed: they were unconditionally overwritten by args[i] and never used.
args <- commandArgs(trailingOnly = TRUE)

expected.args <- c(
  "all.sources.file",
  "phenotypic.groups.file",
  "hpo.to.omt.file",
  "inheritance.from.group.0.file",
  "omt.coding.file",
  "phenotypic.names.file",
  "final.file"
)

# Fail early with a usage message. Without this check a missing argument
# becomes NA and only surfaces later as an obscure file error (for a missing
# output path, only after all the processing has been done).
if (length(args) < length(expected.args)) {
  stop(
    "Expected ", length(expected.args), " arguments (",
    paste(expected.args, collapse = ", "),
    ") but received ", length(args), ".",
    call. = FALSE
  )
}

all.sources.file <- args[1]
phenotypic.groups.file <- args[2]
hpo.to.omt.file <- args[3]
inheritance.from.group.0.file <- args[4]
omt.coding.file <- args[5]
phenotypic.names.file <- args[6]
final.file <- args[7]

# Read the annotation table (ALL_SOURCES_ALL_FREQUENCIES.TXT): tab-separated
# with a header row, "#" starts a comment, quote characters are plain text.
all.sources <- read.table(
  all.sources.file,
  header = TRUE,
  sep = "\t",
  comment.char = "#",
  quote = ""
)

# Read the key that assigns HPO codes to phenotypic groups (same format).
phenotypic.groups <- read.table(
  phenotypic.groups.file,
  header = TRUE,
  sep = "\t",
  comment.char = "#",
  quote = ""
)

# Disabled: restrict the key itself to group 1.
#phenotypic.groups.1 = phenotypic.groups[phenotypic.groups$GROUP.CODE == 1,]

# Attach GROUP.CODE to the annotations. merge() defaults to an inner join, so
# annotations whose HPO.ID does not occur in the key are dropped here.
source.phenotype <- merge(
  x = all.sources,
  y = phenotypic.groups[, c("HPO.CODE", "GROUP.CODE")],
  by.x = "HPO.ID",
  by.y = "HPO.CODE"
)

# Annotations belonging to group 1 and to group 0 respectively.
source.phenotype.filter.1 <- source.phenotype[source.phenotype[["GROUP.CODE"]] == 1, ]
source.phenotype.filter.0 <- source.phenotype[source.phenotype[["GROUP.CODE"]] == 0, ]

# SUBSET HAND: every annotation of a disease that has at least one group-1
# annotation, plus its group-1 and group-0 slices.
subset.hand <- source.phenotype[
  is.element(source.phenotype[["diseaseId"]], source.phenotype.filter.1[["diseaseId"]]),
]
subset.hand.filter.1 <- subset.hand[subset.hand[["GROUP.CODE"]] == 1, ]
subset.hand.filter.0 <- subset.hand[subset.hand[["GROUP.CODE"]] == 0, ]

# List of HPO ids (first column, no header row) that are kept from group 0.
inheritance.from.group.0 <- read.table(
  inheritance.from.group.0.file,
  header = FALSE,
  sep = "\t",
  comment.char = "#",
  quote = ""
)

# Keep only the listed HPO ids in the group-0 slice and add a
# "diseaseId gene.symbol" helper column.
subset.hand.filter.0 <- subset.hand.filter.0[
  is.element(subset.hand.filter.0[["HPO.ID"]], inheritance.from.group.0[["V1"]]),
]
subset.hand.filter.0[["paste"]] <- paste(
  subset.hand.filter.0[["diseaseId"]],
  subset.hand.filter.0[["gene.symbol"]]
)
# Disabled: keep a single group-0 row per disease/gene pair.
#subset.hand.filter.0 = subset.hand.filter.0[!duplicated(subset.hand.filter.0$paste),]

# Disabled: stack the group-0 and group-1 slices.
#subset.hand.filter = rbind(subset.hand.filter.0, subset.hand.filter.1)

# Read the HPO -> OMT key. Cells holding "N/C" are read as NA and then
# replaced by 0 together with every other missing value.
hpo.to.omt <- read.table(
  hpo.to.omt.file,
  header = TRUE,
  sep = "\t",
  comment.char = "#",
  quote = "",
  na.strings = "N/C"
)
hpo.to.omt[is.na(hpo.to.omt)] <- 0

# Read the OMT code -> name table; strings stay character vectors.
omt.coding <- read.table(
  omt.coding.file,
  header = TRUE,
  sep = "\t",
  comment.char = "#",
  quote = "",
  stringsAsFactors = FALSE
)

# One lookup table per OMT level: level k holds the codes that are exactly k
# characters long. Levels 2-5 additionally keep code "0" (level 1 already
# matches it, being a single character).
omt.coding.1 <- setNames(
  omt.coding[grepl("^.$", omt.coding[["ID"]]), ],
  c("ID", "OMT.1.name")
)
omt.coding.2 <- setNames(
  omt.coding[grepl("^..$", omt.coding[["ID"]]) | omt.coding[["ID"]] == "0", ],
  c("ID", "OMT.2.name")
)
omt.coding.3 <- setNames(
  omt.coding[grepl("^...$", omt.coding[["ID"]]) | omt.coding[["ID"]] == "0", ],
  c("ID", "OMT.3.name")
)
omt.coding.4 <- setNames(
  omt.coding[grepl("^....$", omt.coding[["ID"]]) | omt.coding[["ID"]] == "0", ],
  c("ID", "OMT.4.name")
)
omt.coding.5 <- setNames(
  omt.coding[grepl("^.....$", omt.coding[["ID"]]) | omt.coding[["ID"]] == "0", ],
  c("ID", "OMT.5.name")
)

# Attach the name of each level's code, one level at a time. These are inner
# joins: a key row whose code is absent from the level's lookup is dropped.
hpo.to.omt <- merge(x = hpo.to.omt, y = omt.coding.1, by.x = "OMT.1", by.y = "ID")
hpo.to.omt <- merge(x = hpo.to.omt, y = omt.coding.2, by.x = "OMT.2", by.y = "ID")
hpo.to.omt <- merge(x = hpo.to.omt, y = omt.coding.3, by.x = "OMT.3", by.y = "ID")
hpo.to.omt <- merge(x = hpo.to.omt, y = omt.coding.4, by.x = "OMT.4", by.y = "ID")
hpo.to.omt <- merge(x = hpo.to.omt, y = omt.coding.5, by.x = "OMT.5", by.y = "ID")

# Join the OMT codes and names onto the group-1 annotations. Annotations
# without an entry in the key are kept (all.x) and their missing values,
# codes and names alike, are set to 0.
filter.subset.hand.omt.1 <- merge(
  x = subset.hand.filter.1,
  y = hpo.to.omt[, c(
    "UNIQUE.HPO.IDENTIFIER",
    "OMT.1", "OMT.1.name",
    "OMT.2", "OMT.2.name",
    "OMT.3", "OMT.3.name",
    "OMT.4", "OMT.4.name",
    "OMT.5", "OMT.5.name"
  )],
  by.x = "HPO.ID",
  by.y = "UNIQUE.HPO.IDENTIFIER",
  all.x = TRUE
)
filter.subset.hand.omt.1[is.na(filter.subset.hand.omt.1)] <- 0


# For every gene/disease pair, compute the share of its annotations that
# falls into each phenotypic group 1..12. Group 0 is counted but is neither
# part of the denominator nor reported.
#
# Result: long table with columns gene.symbol, diseaseId, variable (group
# code, a factor with levels "1".."12") and value (the ratio).
group.columns <- as.character(1:12)

# Number of annotations per gene, disease and group.
count.phenotype.in.disease <- data.frame(
  data.table(subset.hand)[
    ,
    list(count = .N),
    by = c("gene.symbol", "diseaseId", "GROUP.CODE")
  ]
)

# One column per group code. reshape2 is named explicitly because data.table
# exports dcast/melt as well and the choice would otherwise depend on the
# order of the library() calls.
count.phenotype.in.disease <- reshape2::dcast(
  count.phenotype.in.disease,
  gene.symbol + diseaseId ~ GROUP.CODE,
  value.var = "count"
)
count.phenotype.in.disease[is.na(count.phenotype.in.disease)] <- 0

# A group with no annotation at all has no column after dcast; add it as
# zeros so the sum and the ratios below never hit an undefined column.
for (group in setdiff(group.columns, names(count.phenotype.in.disease))) {
  count.phenotype.in.disease[[group]] <- rep(0, nrow(count.phenotype.in.disease))
}

count.phenotype.in.disease$sum <- rowSums(count.phenotype.in.disease[group.columns])
count.phenotype.in.disease[group.columns] <-
  count.phenotype.in.disease[group.columns] / count.phenotype.in.disease$sum

# Columns are selected by name: selecting by position (4:15) is only right
# when a group-0 column exists, and otherwise drops group 1 and melts "sum".
# Pairs whose sum is 0 produce NaN ratios, which na.rm removes.
count.phenotype.in.disease <- reshape2::melt(
  count.phenotype.in.disease,
  id.vars = c("gene.symbol", "diseaseId"),
  measure.vars = group.columns,
  na.rm = TRUE
)

# Assemble the report: group ratios per gene/disease pair, joined to the
# group-1 annotations with their OMT codes.
final <- merge(
  count.phenotype.in.disease,
  filter.subset.hand.omt.1,
  by = c("gene.symbol", "diseaseId")
)

# Add the retained group-0 HPO term of each pair as its inheritance pattern.
# Both inputs carry HPO.term.name, so merge() suffixes them .x (annotation)
# and .y (inheritance). A pair with several retained group-0 terms yields one
# output row per term; a pair with none gets the placeholder text.
final <- merge(
  final,
  subset.hand.filter.0[, c("gene.symbol", "diseaseId", "HPO.term.name")],
  by = c("gene.symbol", "diseaseId"),
  all.x = TRUE,
  all.y = FALSE
)
final$HPO.term.name.y <- as.character(final$HPO.term.name.y)
final$HPO.term.name.y[is.na(final$HPO.term.name.y)] <- "No inheritance pattern available"

# Translate group codes into group names (two-column file without header).
phenotypic.names <- read.table(
  phenotypic.names.file,
  header = FALSE,
  sep = "\t",
  comment.char = "#",
  quote = ""
)
names(phenotypic.names) <- c("GROUP.CODE", "GROUP.NAME")
final <- merge(final, phenotypic.names, by.x = "variable", by.y = "GROUP.CODE")

# Build a link per row from diseaseId ("<database>:<number>"): ORPHANET ids
# link to orpha.net, everything else to OMIM.
disease.db <- gsub(":.*", "", final$diseaseId)
disease.number <- gsub(".*:", "", final$diseaseId)
is.orphanet <- disease.db %in% "ORPHANET"

# Fill a pre-sized vector through logical indices. paste() turns a
# zero-length input into a single string, so assigning its result to a
# zero-row selection of the data frame fails ("replacement has 1 row, data
# has 0") whenever there is no ORPHANET row or no row at all.
disease.url <- character(nrow(final))
disease.url[!is.orphanet] <- paste0(
  "http://www.omim.org/entry/",
  disease.number[!is.orphanet]
)
disease.url[is.orphanet] <- paste0(
  "http://www.orpha.net/consor/cgi-bin/Disease_Search_Simple.php?lng=EN&lng=EN&Disease_Disease_Search_diseaseGroup=",
  disease.number[is.orphanet],
  "&Disease_Disease_Search_diseaseType=ORPHA"
)
final$url <- disease.url

# Output columns in order, written as <output name> = <column in final>.
output.columns <- c(
  diseaseId = "diseaseId",
  gene.symbol = "gene.symbol",
  gene.id.entrez. = "gene.id.entrez.",
  HPO.term.name = "HPO.term.name.x",
  GROUP.CODE = "variable",
  GROUP.NAME = "GROUP.NAME",
  ratio = "value",
  OMT.1 = "OMT.1",
  OMT.1.name = "OMT.1.name",
  OMT.2 = "OMT.2",
  OMT.2.name = "OMT.2.name",
  OMT.3 = "OMT.3",
  OMT.3.name = "OMT.3.name",
  OMT.4 = "OMT.4",
  OMT.4.name = "OMT.4.name",
  OMT.5 = "OMT.5",
  OMT.5.name = "OMT.5.name",
  inheritance = "HPO.term.name.y",
  url = "url"
)
final <- final[, unname(output.columns)]
names(final) <- names(output.columns)

write.table(
  final,
  file = final.file,
  quote = FALSE,
  sep = "\t",
  row.names = FALSE,
  col.names = TRUE
)














