Mercurial > repos > davidvanzessen > argalaxy_tools
view merge_and_filter.r @ 57:16c7fc1c4bf8 draft
Uploaded
author | davidvanzessen |
---|---|
date | Fri, 18 Mar 2016 07:50:34 -0400 |
parents | |
children |
line wrap: on
line source
args <- commandArgs(trailingOnly = TRUE) summaryfile = args[1] sequencesfile = args[2] mutationanalysisfile = args[3] mutationstatsfile = args[4] hotspotsfile = args[5] gene_identification_file= args[6] output = args[7] unmatchedfile = args[8] method=args[9] functionality=args[10] unique_type=args[11] filter_unique=args[12] summ = read.table(summaryfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") sequences = read.table(sequencesfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") mutationanalysis = read.table(mutationanalysisfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") mutationstats = read.table(mutationstatsfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") hotspots = read.table(hotspotsfile, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") gene_identification = read.table(gene_identification_file, header=T, sep="\t", fill=T, stringsAsFactors=F, quote="") if(method == "blastn"){ "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore" gene_identification = gene_identification[!duplicated(gene_identification$qseqid),] ref_length = data.frame(sseqid=c("ca1", "ca2", "cg1", "cg2", "cg3", "cg4", "cm"), ref.length=c(81,81,141,141,141,141,52)) gene_identification = merge(gene_identification, ref_length, by="sseqid", all.x=T) gene_identification$chunk_hit_percentage = (gene_identification$length / gene_identification$ref.length) * 100 gene_identification = gene_identification[,c("qseqid", "chunk_hit_percentage", "pident", "qstart", "sseqid")] colnames(gene_identification) = c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match") } summ = merge(summ, gene_identification, by="Sequence.ID") summ = summ[summ$Functionality != "No results",] if(functionality == "productive"){ summ = summ[summ$Functionality == "productive (see comment)" | summ$Functionality == "productive",] } else if (functionality == "unproductive"){ summ = summ[summ$Functionality == "unproductive (see comment)" | summ$Functionality == "unproductive",] } else if (functionality == "remove_unknown"){ summ = summ[summ$Functionality != "No results" & summ$Functionality != "unknown (see comment)" & summ$Functionality != "unknown",] } higher_than=(summ$chunk_hit_percentage >= 70 & summ$nt_hit_percentage >= 70) unmatched = summ[!higher_than,] unmatched = unmatched[,c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match")] unmatched$best_match = paste("unmatched,", unmatched$best_match) summ = summ[higher_than,] if(length(summ$Sequence.ID) == 0){ stop("No data remaining after filter") } result = merge(summ, mutationanalysis[,!(names(mutationanalysis) %in% names(summ)[-1])], by="Sequence.ID") result = merge(result, mutationstats[,!(names(mutationstats) %in% names(result)[-1])], by="Sequence.ID") result = merge(result, hotspots[,!(names(hotspots) %in% names(result)[-1])], by="Sequence.ID") cleanup_columns = c("FR1.IMGT.Nb.of.mutations", "CDR1.IMGT.Nb.of.mutations", "FR2.IMGT.Nb.of.mutations", "CDR2.IMGT.Nb.of.mutations", "FR3.IMGT.Nb.of.mutations") for(col in cleanup_columns){ result[,col] = gsub("\\(.*\\)", "", result[,col]) result[,col] = as.numeric(result[,col]) result[is.na(result[,col]),] = 0 } result$VGene = gsub("^Homsap ", "", result$V.GENE.and.allele) result$VGene = gsub("[*].*", "", result$VGene) result$JGene = gsub("^Homsap ", "", result$J.GENE.and.allele) result$JGene = gsub("[*].*", "", result$JGene) #result$past = paste(result$AA.JUNCTION, result$VGene, result$JGene, (result$FR1.IMGT.Nb.of.mutations + result$CDR1.IMGT.Nb.of.mutations + result$FR2.IMGT.Nb.of.mutations + result$CDR2.IMGT.Nb.of.mutations + result$FR3.IMGT.Nb.of.mutations), result$best_match) if(unique_type == "AA.JUNCTION_V_subclass"){ result$past = paste(result$AA.JUNCTION, result$VGene, result$best_match) } else if (unique_type == "AA.JUNCTION_subclass"){ result$past = paste(result$AA.JUNCTION, result$best_match) } else if (unique_type == "V_subclass"){ result$past = paste(result$VGene, result$best_match) } else if (unique_type == "AA.JUNCTION_V"){ result$past = paste(result$AA.JUNCTION, result$VGene) } else if (unique_type == "AA.JUNCTION"){ result$past = paste(result$AA.JUNCTION) } else { result$past = 1:nrow(result) } print(paste("filter uniques: ", filter_unique)) if(filter_unique != "no"){ clmns = names(result) result$unique.def = paste(result$CDR1.Seq, result$CDR2.Seq, result$CDR3.Seq, result$FR1.IMGT, result$FR2.IMGT, result$FR3.IMGT) result.filtered = result[duplicated(result$unique.def),] fltr = result$unique.def %in% result.filtered$unique.def result.removed = result[!fltr,] result = result[fltr,] if(filter_unique == "keep"){ result = result.removed } result = result[,clmns] #write.table(inputdata.removed, "unique_removed.csv", sep=",",quote=F,row.names=F,col.names=T) } result = result[!duplicated(result$past), ] result = result[,!(names(result) %in% c("past"))] print(paste("Number of rows in result:", nrow(result))) print(paste("Number of rows in unmatched:", nrow(unmatched))) #remove the sequences that have an 'n' (or 'N') in the FR2, FR3, CDR1 and CDR2 regions. sequences = sequences[grepl("n|N", sequences$FR2.IMGT) | grepl("n|N", sequences$FR3.IMGT) | grepl("n|N", sequences$CDR1.IMGT) | grepl("n|N", sequences$CDR2.IMGT),] result = result[!(result$Sequence.ID %in% sequences$Sequence.ID),] write.table(x=result, file=output, sep="\t",quote=F,row.names=F,col.names=T) write.table(x=unmatched, file=unmatchedfile, sep="\t",quote=F,row.names=F,col.names=T)