# HG changeset patch # User davidvanzessen # Date 1408349077 14400 # Node ID 74d2bc479bee329479fc1c32a4f8329c1f9e3a14 Uploaded diff -r 000000000000 -r 74d2bc479bee filter.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter.r Mon Aug 18 04:04:37 2014 -0400 @@ -0,0 +1,13 @@ +args <- commandArgs(trailingOnly = TRUE) + +input = args[1] +filterf = args[2] +output = args[3] + + +dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F) +filterdat = read.table(filterf, header=T, sep="\t", fill=T, stringsAsFactors=F) + +dat = dat[dat$Sequence.ID %in% filterdat$ID,] + +write.table(x=dat, file=output, sep="\t",quote=F,row.names=F,col.names=T) diff -r 000000000000 -r 74d2bc479bee gene_identification.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gene_identification.py Mon Aug 18 04:04:37 2014 -0400 @@ -0,0 +1,265 @@ +import re +import argparse + + +parser = argparse.ArgumentParser() +parser.add_argument("--input", help="The 1_Summary file from an IMGT zip file") +parser.add_argument("--outdir", help="Output directory, 7 output files will be written here") + +args = parser.parse_args() + +infile = args.input +#infile = "test_VH-Ca_Cg_25nt/1_Summary_test_VH-Ca_Cg_25nt_241013.txt" +outdir = args.outdir +#outfile = "identified.txt" + +dic = dict() +total = 0 + +first = True +with open(infile, 'r') as f: #read all sequences into a dictionary as key = ID, value = sequence + for line in f: + total += 1 + if first: + first = False + continue + linesplt = line.split("\t") + if linesplt[2] == "No results": + continue + ID = linesplt[1] + seq = linesplt[28] + dic[ID] = seq + +#lambda/kappa reference sequence +searchstrings = {"ca": "catccccgaccagccccaaggtcttcccgctgagcctctgcagcacccagccagatgggaacgtggtcatcgcctgcctggtccagggcttcttcccccaggagccactcagtgtgacctggagcgaaag", + "cg": "ctccaccaagggcccatcggtcttccccctggcaccctcctccaagagcacctctgggggcacagcggccctgggctgcctggtcaaggactacttccccgaaccggtgacggtgtcgtggaactcaggcgccctgaccagcggcgtgcacaccttcc", + "cm": "gggagtgcatccgccccaacccttttccccctcgtctcctgtgagaattccc"} + +compiledregex = {"ca": [], + "cg": [], + "cm": []} + +#lambda/kappa reference sequence variable nucleotides +ca1 = {38: 't', 39: 'g', 48: 'a', 49: 'g', 51: 'c', 68: 'a', 73: 'c'} +ca2 = {38: 'g', 39: 'a', 48: 'c', 49: 'c', 51: 'a', 68: 'g', 73: 'a'} +cg1 = {0: 'c', 33: 'a', 38: 'c', 44: 'a', 54: 't', 56: 'g', 58: 'g', 66: 'g', 132: 'c'} +cg2 = {0: 'c', 33: 'g', 38: 'g', 44: 'g', 54: 'c', 56: 'a', 58: 'a', 66: 'g', 132: 't'} +cg3 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 't', 56: 'g', 58: 'g', 66: 'g', 132: 'c'} +cg4 = {0: 't', 33: 'g', 38: 'g', 44: 'g', 54: 'c', 56: 'a', 58: 'a', 66: 'c', 132: 'c'} + +#reference sequences are cut into smaller parts of 'chunklength' length, and with 'chunklength' / 2 overlap +chunklength = 8 + +#create the chunks of the reference sequence with regular expressions for the variable nucleotides +for i in range(0, len(searchstrings["ca"]) - chunklength, chunklength / 2): + pos = i + chunk = searchstrings["ca"][i:i+chunklength] + result = "" + varsInResult = 0 + for c in chunk: + if pos in ca1.keys(): + varsInResult += 1 + result += "[" + ca1[pos] + ca2[pos] + "]" + else: + result += c + pos += 1 + compiledregex["ca"].append((re.compile(result), varsInResult)) + +for i in range(0, len(searchstrings["cg"]) - chunklength, chunklength / 2): + pos = i + chunk = searchstrings["cg"][i:i+chunklength] + result = "" + varsInResult = 0 + for c in chunk: + if pos in cg1.keys(): + varsInResult += 1 + result += "[" + "".join(set([cg1[pos], cg2[pos], cg3[pos], cg4[pos]])) + "]" 
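+			#the four cg allele dictionaries (cg1-cg4) may disagree at this position, so the chunk
+			#pattern uses a character class built from their nucleotides (set() drops duplicates)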
+ else: + result += c + pos += 1 + compiledregex["cg"].append((re.compile(result), varsInResult)) + +for i in range(0, len(searchstrings["cm"]) - chunklength, chunklength / 2): + compiledregex["cm"].append((re.compile(searchstrings["cm"][i:i+chunklength]), False)) + + + +def removeAndReturnMaxIndex(x): #simplifies a list comprehension + m = max(x) + index = x.index(m) + x[index] = 0 + return index + + +start_location = dict() +hits = dict() +alltotal = 0 +for key in compiledregex.keys(): #for ca/cg/cm + regularexpressions = compiledregex[key] #get the compiled regular expressions + for ID in dic.keys()[0:]: #for every ID + if ID not in hits.keys(): #ensure that the dictionairy that keeps track of the hits for every gene exists + hits[ID] = {"ca_hits": 0, "cg_hits": 0, "cm_hits": 0, "ca1": 0, "ca2": 0, "cg1": 0, "cg2": 0, "cg3": 0, "cg4": 0} + currentIDHits = hits[ID] + seq = dic[ID] + lastindex = 0 + start = [0] * len(seq) + for i, regexp in enumerate(regularexpressions): #for every regular expression + regex, hasVar = regexp + matches = regex.finditer(seq[lastindex:]) + for match in matches: #for every match with the current regex, only uses the first hit + lastindex += match.start() + start[lastindex - chunklength / 2 * i] += 1 + if hasVar: #if the regex has a variable nt in it + chunkstart = chunklength / 2 * i #where in the reference does this chunk start + chunkend = chunklength / 2 * i + chunklength #where in the reference does this chunk end + if key == "ca": #just calculate the variable nt score for 'ca', cheaper + currentIDHits["ca1"] += len([1 for x in ca1 if chunkstart <= x < chunkend and ca1[x] == seq[lastindex + x - chunkstart]]) + currentIDHits["ca2"] += len([1 for x in ca2 if chunkstart <= x < chunkend and ca2[x] == seq[lastindex + x - chunkstart]]) + elif key == "cg": #just calculate the variable nt score for 'cg', cheaper + currentIDHits["cg1"] += len([1 for x in cg1 if chunkstart <= x < chunkend and cg1[x] == seq[lastindex + x - chunkstart]]) + currentIDHits["cg2"] += len([1 for x in cg2 if chunkstart <= x < chunkend and cg2[x] == seq[lastindex + x - chunkstart]]) + currentIDHits["cg3"] += len([1 for x in cg3 if chunkstart <= x < chunkend and cg3[x] == seq[lastindex + x - chunkstart]]) + currentIDHits["cg4"] += len([1 for x in cg4 if chunkstart <= x < chunkend and cg4[x] == seq[lastindex + x - chunkstart]]) + else: #key == "cm" #no variable regions in 'cm' + pass + break #this only breaks when there was a match with the regex, breaking means the 'else:' clause is skipped + else: #only runs if there were no hits + continue + #print "found ", regex.pattern , "at", lastindex, "adding one to", (lastindex - chunklength / 2 * i), "to the start array of", ID, "gene", key, "it's now:", start[lastindex - chunklength / 2 * i] + currentIDHits[key + "_hits"] += 1 + start_location[ID + "_" + key] = str([(removeAndReturnMaxIndex(start) + 1) for x in range(5) if max(start) > 1]) + #start_location[ID + "_" + key] = str(start.index(max(start))) + + +chunksInCA = len(compiledregex["ca"]) +chunksInCG = len(compiledregex["cg"]) +chunksInCM = len(compiledregex["cm"]) +requiredChunkPercentage = 0.7 +varsInCA = float(len(ca1.keys()) * 2) +varsInCG = float(len(cg1.keys()) * 2) + 1 +varsInCM = 0 +requiredVarPercentage = 0.7 + +ca = 0 +ca1 = 0 +ca2 = 0 +cg = 0 +cg1 = 0 +cg2 = 0 +cg3 = 0 +cg4 = 0 +cm = 0 +try: + cafile = open(outdir + "/ca.txt", 'w') + ca1file = open(outdir + "/ca1.txt", 'w') + ca2file = open(outdir + "/ca2.txt", 'w') + cgfile = open(outdir + "/cg.txt", 'w') + cg1file = open(outdir 
+ "/cg1.txt", 'w')
+	cg2file = open(outdir + "/cg2.txt", 'w')
+	cg3file = open(outdir + "/cg3.txt", 'w')
+	cg4file = open(outdir + "/cg4.txt", 'w')
+	cmfile = open(outdir + "/cm.txt", 'w')
+	unmatchedfile = open(outdir + "/unmatched.txt", 'w')
+	cafile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
+	ca1file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
+	ca2file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
+	cgfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
+	cg1file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
+	cg2file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
+	cg3file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
+	cg4file.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
+	cmfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\n")
+	unmatchedfile.write("ID\tnt_hit_percentage\tchunk_hit_percentage\tstart_locations\tbest_match\n")
+	for ID in hits.keys():
+		currentIDHits = hits[ID]
+		possibleca = float(len(compiledregex["ca"]))
+		possiblecg = float(len(compiledregex["cg"]))
+		possiblecm = float(len(compiledregex["cm"]))
+		cahits = currentIDHits["ca_hits"]
+		cghits = currentIDHits["cg_hits"]
+		cmhits = currentIDHits["cm_hits"]
+		if cahits > cghits and cahits > cmhits: #it's a ca gene
+			if cahits <= int(chunksInCA * requiredChunkPercentage):
+				unmatchedfile.write(ID + "\tNA\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca\n")
+				continue
+			ca += 1
+			ca1hits = currentIDHits["ca1"]
+			ca2hits = currentIDHits["ca2"]
+			cafile.write(ID + "\tNA\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
+			if ca1hits > ca2hits:
+				#print ID, "is ca1 with", (ca1hits / 2), "hits for ca1 and", (ca2hits / 2), "hits for ca2", (int((ca1hits / varsInCA) * 100)), "percent hit"
+				if ca1hits <= int(varsInCA * requiredVarPercentage):
+					unmatchedfile.write(ID + "\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca1\n")
+					continue
+				ca1 += 1
+				ca1file.write(ID + "\t" + str(int(ca1hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
+			else:
+				#print ID, "is ca2 with", (ca1hits / 2), "hits for ca1 and", (ca2hits / 2), "hits for ca2", (int((ca2hits / varsInCA) * 100)), "percent hit"
+				if ca2hits <= int(varsInCA * requiredVarPercentage):
+					unmatchedfile.write(ID + "\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\tca2\n")
+					continue
+				ca2 += 1
+				ca2file.write(ID + "\t" + str(int(ca2hits / varsInCA * 100)) + "\t" + str(int(cahits / possibleca * 100)) + "\t" + start_location[ID + "_ca"] + "\n")
+		elif cghits > cahits and cghits > cmhits: #it's a cg gene
+			if cghits <= int(chunksInCG * requiredChunkPercentage):
+				unmatchedfile.write(ID + "\tNA\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg\n")
+				continue
+			cg += 1
+			cg1hits = currentIDHits["cg1"]
+			cg2hits = currentIDHits["cg2"]
+			cg3hits = currentIDHits["cg3"]
+			cg4hits = currentIDHits["cg4"]
+			cgfile.write(ID + "\tNA\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
+			if cg1hits > cg2hits and cg1hits > cg3hits and cg1hits > cg4hits: #cg1 gene
+				if cg1hits <= int(varsInCG * requiredVarPercentage):
+					unmatchedfile.write(ID + "\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg1\n")
+					continue
+				cg1 += 1
+				cg1file.write(ID + "\t" + str(int(cg1hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
+			elif cg2hits > cg1hits and cg2hits > cg3hits and cg2hits > cg4hits: #cg2 gene
+				if cg2hits <= int(varsInCG * requiredVarPercentage):
+					unmatchedfile.write(ID + "\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg2\n")
+					continue
+				cg2 += 1
+				cg2file.write(ID + "\t" + str(int(cg2hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
+			elif cg3hits > cg1hits and cg3hits > cg2hits and cg3hits > cg4hits: #cg3 gene
+				if cg3hits <= int(varsInCG * requiredVarPercentage):
+					unmatchedfile.write(ID + "\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg3\n")
+					continue
+				cg3 += 1
+				cg3file.write(ID + "\t" + str(int(cg3hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
+			else: #cg4 gene
+				if cg4hits <= int(varsInCG * requiredVarPercentage):
+					unmatchedfile.write(ID + "\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\tcg4\n")
+					continue
+				cg4 += 1
+				cg4file.write(ID + "\t" + str(int(cg4hits / varsInCG * 100)) + "\t" + str(int(cghits / possiblecg * 100)) + "\t" + start_location[ID + "_cg"] + "\n")
+		else: #it's a cm gene
+			if cmhits <= int(chunksInCM * requiredChunkPercentage):
+				unmatchedfile.write(ID + "\tNA\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cm"] + "\tcm\n")
+				continue
+			cm += 1
+			cmfile.write(ID + "\tNA\t" + str(int(cmhits / possiblecm * 100)) + "\t" + start_location[ID + "_cm"] + "\n")
+finally:
+	cafile.close()
+	ca1file.close()
+	ca2file.close()
+	cgfile.close()
+	cg1file.close()
+	cg2file.close()
+	cg3file.close()
+	cg4file.close()
+	cmfile.close()
+	unmatchedfile.close()
+
+
+#print ca,cg,cm,(ca+cg+cm)
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r 74d2bc479bee mutation_analysis.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mutation_analysis.py	Mon Aug 18 04:04:37 2014 -0400
@@ -0,0 +1,80 @@
+import re
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--mutationfile", help="The '7_V-REGION-mutation-and-AA-change-table' file from the IMGT output")
+parser.add_argument("--hotspotfile", help="The '10_V-REGION-mutation-hotspots' file from the IMGT output")
+parser.add_argument("--output", help="Output file")
+
+args = parser.parse_args()
+
+mutationfile = args.mutationfile #"test_VH-Ca_Cg_25nt/7_V-REGION-mutation-and-AA-change-table_test_VH-Ca_Cg_25nt_241013.txt"
+hotspotsfile = args.hotspotfile #"test_VH-Ca_Cg_25nt/10_V-REGION-mutation-hotspots_test_VH-Ca_Cg_25nt_241013.txt"
+outfile = args.output #"out.txt"
+
+mutationdic = dict()
+mutationMatcher = re.compile("^(.)(\d+).(.),?(.)?(\d+)?.?(.)?(.?.?.?.?.?)?")
+linecount = 0
+
+with open(mutationfile, 'r') as i:
+	for line in i.readlines()[1:]:
+		linecount += 1
+		linesplt = line.split("\t")
+		if linesplt[2] != "productive":
+			continue
+		ID = linesplt[1]
+		mutationdic[ID + "_FR1"] = [mutationMatcher.match(x).groups() for x in linesplt[5].split("|") if x]
+		mutationdic[ID + "_CDR1"] = 
[mutationMatcher.match(x).groups() for x in linesplt[6].split("|") if x] + mutationdic[ID + "_FR2"] = [mutationMatcher.match(x).groups() for x in linesplt[7].split("|") if x] + mutationdic[ID + "_CDR2"] = [mutationMatcher.match(x).groups() for x in linesplt[8].split("|") if x] + mutationdic[ID + "_FR2-CDR2"] = mutationdic[ID + "_FR2"] + mutationdic[ID + "_CDR2"] + mutationdic[ID + "_FR3"] = [mutationMatcher.match(x).groups() for x in linesplt[9].split("|") if x] + +if linecount == 0: + print "No data, exiting" + with open(outfile, 'w') as o: + o.write("RGYW (%),0,0,NA\n") + o.write("WRCY (%),0,0,NA\n") + o.write("WA (%),0,0,NA\n") + o.write("TW (%),0,0,NA\n") + import sys + sys.exit() + +hotspotMatcher = re.compile("[actg]+,(\d+)-(\d+)\((.*)\)") +RGYWCount = 0 +WRCYCount = 0 +WACount = 0 +TWCount = 0 + +with open(hotspotsfile, 'r') as i: + for line in i.readlines()[1:]: + linesplt = line.split("\t") + if linesplt[2] != "productive": + continue + ID = linesplt[1] + RGYW = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[6].split("|") if x]] + WRCY = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[7].split("|") if x]] + WA = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[4].split("|") if x]] + TW = [(int(x),int(y),z) for (x,y,z) in [hotspotMatcher.match(x).groups() for x in linesplt[5].split("|") if x]] + RGYWCount += sum([1 for (x,y,z) in RGYW if z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) + WRCYCount += sum([1 for (x,y,z) in WRCY if z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) + WACount += sum([1 for (x,y,z) in WA if z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) + TWCount += sum([1 for (x,y,z) in TW if z != "CDR3" and any([(x <= int(where) <= y) for (frm, where, to, a,b,c,d) in mutationdic[ID + "_" + z]])]) + +path = outfile[:outfile.rfind("/") + 1] + "mutations.txt" +value = 0; +with open(path, 'r') as f: + value = f.readlines()[0].split(",")[1] +with open(outfile, 'w') as o: + if value != "0": + o.write("RGYW (%)," + str(RGYWCount) + "," + value + "," + str(round(RGYWCount / float(value) * 100, 1)) + "\n") + o.write("WRCY (%)," + str(WRCYCount) + "," + value + "," + str(round(WRCYCount / float(value) * 100, 1)) + "\n") + o.write("WA (%)," + str(WACount) + "," + value + "," + str(round(WACount / float(value) * 100, 1)) + "\n") + o.write("TW (%)," + str(TWCount) + "," + value + "," + str(round(TWCount / float(value) * 100, 1)) + "\n") + else: + o.write("RGYW (%)," + str(RGYWCount) + ",0,NA\n") + o.write("WRCY (%)," + str(WRCYCount) + ",0,NA\n") + o.write("WA (%)," + str(WACount) + ",0,NA\n") + o.write("TW (%)," + str(TWCount) + ",0,NA\n") + + diff -r 000000000000 -r 74d2bc479bee mutation_analysis.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutation_analysis.r Mon Aug 18 04:04:37 2014 -0400 @@ -0,0 +1,328 @@ +args <- commandArgs(trailingOnly = TRUE) + +input = args[1] +summaryinput = args[2] +outputdir = args[3] +setwd(outputdir) + +#dat = read.table("NWK276_MID6_25NT/8_V-REGION-nt-mutation-statistics_NWK276_MID6_25NT_051113.txt", header=T, sep="\t", fill=T, stringsAsFactors=F) +dat = read.table(input, header=T, sep="\t", fill=T, stringsAsFactors=F) + +datSum = read.table(summaryinput, header=T, sep="\t", fill=T, stringsAsFactors=F) +datSum = datSum[,c("Sequence.ID", "AA.JUNCTION")] + 
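+# datSum was reduced to Sequence.ID and AA.JUNCTION above, so the merge below only adds
+# the junction sequence to each mutation record; AA.JUNCTION is later pasted with the V gene
+# into the 'past' column (see the commented-out deduplication step further down)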
+dat = merge(dat, datSum, by="Sequence.ID", all.x=T) + +#dat = dat[dat$Functionality == "productive",] + +dat$VGene = gsub("^Homsap ", "", dat$V.GENE.and.allele) +dat$VGene = gsub("[*].*", "", dat$VGene) + +dat$past = paste(dat$AA.JUNCTION, dat$VGene) + +#dat = dat[!duplicated(dat$past), ] + +if(length(dat$Sequence.ID) == 0){ + setwd(outputdir) + result = data.frame(x = rep(0, 5), y = rep(0, 5), z = rep(NA, 5)) + row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)") + write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F) + transitionTable = data.frame(A=rep(0, 4),C=rep(0, 4),G=rep(0, 4),T=rep(0, 4)) + row.names(transitionTable) = c("A", "C", "G", "T") + transitionTable["A","A"] = NA + transitionTable["C","C"] = NA + transitionTable["G","G"] = NA + transitionTable["T","T"] = NA + write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA) + cat("0", file="n.txt") + stop("No data") +} + +#print(paste("After filtering on 'productive' and deduplicating based on V and AA JUNCTION there are", length(dat$Sequence.ID), "sequences left")) + +result = data.frame(x = 1:5, y = 1:5, z = 1:5) +row.names(result) = c("Number of Mutations (%)", "Transition (%)", "Transversions (%)", "Transitions at G C (%)", "Targeting of C G (%)") + +cleanup_columns = c("FR1.IMGT.c.a", + "FR2.IMGT.g.t", + "CDR1.IMGT.Nb.of.nucleotides", + "CDR2.IMGT.t.a", + "FR1.IMGT.c.g", + "CDR1.IMGT.c.t", + "FR2.IMGT.a.c", + "FR2.IMGT.Nb.of.mutations", + "FR2.IMGT.g.c", + "FR2.IMGT.a.g", + "FR3.IMGT.t.a", + "FR3.IMGT.t.c", + "FR2.IMGT.g.a", + "FR3.IMGT.c.g", + "FR1.IMGT.Nb.of.mutations", + "CDR1.IMGT.g.a", + "CDR1.IMGT.t.g", + "CDR1.IMGT.g.c", + "CDR2.IMGT.Nb.of.nucleotides", + "FR2.IMGT.a.t", + "CDR1.IMGT.Nb.of.mutations", + "CDR1.IMGT.a.g", + "FR3.IMGT.a.c", + "FR1.IMGT.g.a", + "FR3.IMGT.a.g", + "FR1.IMGT.a.t", + "CDR2.IMGT.a.g", + "CDR2.IMGT.Nb.of.mutations", + "CDR2.IMGT.g.t", + "CDR2.IMGT.a.c", + "CDR1.IMGT.t.c", + "FR3.IMGT.g.c", + "FR1.IMGT.g.t", + "FR3.IMGT.g.t", + "CDR1.IMGT.a.t", + "FR1.IMGT.a.g", + "FR3.IMGT.a.t", + "FR3.IMGT.Nb.of.nucleotides", + "FR2.IMGT.t.c", + "CDR2.IMGT.g.a", + "FR2.IMGT.t.a", + "CDR1.IMGT.t.a", + "FR2.IMGT.t.g", + "FR3.IMGT.t.g", + "FR2.IMGT.Nb.of.nucleotides", + "FR1.IMGT.t.a", + "FR1.IMGT.t.g", + "FR3.IMGT.c.t", + "FR1.IMGT.t.c", + "CDR2.IMGT.a.t", + "FR2.IMGT.c.t", + "CDR1.IMGT.g.t", + "CDR2.IMGT.t.g", + "FR1.IMGT.Nb.of.nucleotides", + "CDR1.IMGT.c.g", + "CDR2.IMGT.t.c", + "FR3.IMGT.g.a", + "CDR1.IMGT.a.c", + "FR2.IMGT.c.a", + "FR3.IMGT.Nb.of.mutations", + "FR2.IMGT.c.g", + "CDR2.IMGT.g.c", + "FR1.IMGT.g.c", + "CDR2.IMGT.c.t", + "FR3.IMGT.c.a", + "CDR1.IMGT.c.a", + "CDR2.IMGT.c.g", + "CDR2.IMGT.c.a", + "FR1.IMGT.c.t") + +for(col in cleanup_columns){ + dat[,col] = gsub("\\(.*\\)", "", dat[,col]) + #dat[dat[,col] == "",] = "0" + dat[,col] = as.numeric(dat[,col]) + dat[is.na(dat[,col]),] = 0 +} + +VRegionMutations = sum(dat$FR1.IMGT.Nb.of.mutations + + dat$CDR1.IMGT.Nb.of.mutations + + dat$FR2.IMGT.Nb.of.mutations + + dat$CDR2.IMGT.Nb.of.mutations + + dat$FR3.IMGT.Nb.of.mutations) + +VRegionNucleotides = sum( dat$FR1.IMGT.Nb.of.nucleotides + + dat$CDR1.IMGT.Nb.of.nucleotides + + dat$FR2.IMGT.Nb.of.nucleotides + + dat$CDR2.IMGT.Nb.of.nucleotides + + dat$FR3.IMGT.Nb.of.nucleotides) + +result[1,"x"] = VRegionMutations +result[1,"y"] = VRegionNucleotides +result[1,"z"] = round(result[1,"x"] / result[1,"y"] * 100, digits=1) + +transitionMutations = 
sum(dat$FR1.IMGT.a.g + + dat$FR1.IMGT.g.a + + dat$FR1.IMGT.c.t + + dat$FR1.IMGT.t.c + + dat$CDR1.IMGT.a.g + + dat$CDR1.IMGT.g.a + + dat$CDR1.IMGT.c.t + + dat$CDR1.IMGT.t.c + + dat$FR2.IMGT.a.g + + dat$FR2.IMGT.g.a + + dat$FR2.IMGT.c.t + + dat$FR2.IMGT.t.c + + dat$CDR2.IMGT.a.g + + dat$CDR2.IMGT.g.a + + dat$CDR2.IMGT.c.t + + dat$CDR2.IMGT.t.c + + dat$FR3.IMGT.a.g + + dat$FR3.IMGT.g.a + + dat$FR3.IMGT.c.t + + dat$FR3.IMGT.t.c) + +result[2,"x"] = transitionMutations +result[2,"y"] = VRegionMutations +result[2,"z"] = round(result[2,"x"] / result[2,"y"] * 100, digits=1) + +transversionMutations = sum( dat$FR1.IMGT.a.c + + dat$FR1.IMGT.c.a + + dat$FR1.IMGT.a.t + + dat$FR1.IMGT.t.a + + dat$FR1.IMGT.g.c + + dat$FR1.IMGT.c.g + + dat$FR1.IMGT.g.t + + dat$FR1.IMGT.t.g + + dat$CDR1.IMGT.a.c + + dat$CDR1.IMGT.c.a + + dat$CDR1.IMGT.a.t + + dat$CDR1.IMGT.t.a + + dat$CDR1.IMGT.g.c + + dat$CDR1.IMGT.c.g + + dat$CDR1.IMGT.g.t + + dat$CDR1.IMGT.t.g + + dat$FR2.IMGT.a.c + + dat$FR2.IMGT.c.a + + dat$FR2.IMGT.a.t + + dat$FR2.IMGT.t.a + + dat$FR2.IMGT.g.c + + dat$FR2.IMGT.c.g + + dat$FR2.IMGT.g.t + + dat$FR2.IMGT.t.g + + dat$CDR2.IMGT.a.c + + dat$CDR2.IMGT.c.a + + dat$CDR2.IMGT.a.t + + dat$CDR2.IMGT.t.a + + dat$CDR2.IMGT.g.c + + dat$CDR2.IMGT.c.g + + dat$CDR2.IMGT.g.t + + dat$CDR2.IMGT.t.g + + dat$FR3.IMGT.a.c + + dat$FR3.IMGT.c.a + + dat$FR3.IMGT.a.t + + dat$FR3.IMGT.t.a + + dat$FR3.IMGT.g.c + + dat$FR3.IMGT.c.g + + dat$FR3.IMGT.g.t + + dat$FR3.IMGT.t.g) + +result[3,"x"] = transversionMutations +result[3,"y"] = VRegionMutations +result[3,"z"] = round(result[3,"x"] / result[3,"y"] * 100, digits=1) + +transitionMutationsAtGC = sum(dat$FR1.IMGT.g.a + + dat$FR1.IMGT.c.t + + dat$CDR1.IMGT.g.a + + dat$CDR1.IMGT.c.t + + dat$FR2.IMGT.g.a + + dat$FR2.IMGT.c.t + + dat$CDR2.IMGT.g.a + + dat$CDR2.IMGT.c.t + + dat$FR3.IMGT.g.a + + dat$FR3.IMGT.c.t) + +totalMutationsAtGC = sum(dat$FR1.IMGT.g.a + + dat$FR1.IMGT.c.t + + dat$FR1.IMGT.c.a + + dat$FR1.IMGT.g.c + + dat$FR1.IMGT.c.g + + dat$FR1.IMGT.g.t + + dat$CDR1.IMGT.g.a + + dat$CDR1.IMGT.c.t + + dat$CDR1.IMGT.c.a + + dat$CDR1.IMGT.g.c + + dat$CDR1.IMGT.c.g + + dat$CDR1.IMGT.g.t + + dat$FR2.IMGT.g.a + + dat$FR2.IMGT.c.t + + dat$FR2.IMGT.c.a + + dat$FR2.IMGT.g.c + + dat$FR2.IMGT.c.g + + dat$FR2.IMGT.g.t + + dat$CDR2.IMGT.g.a + + dat$CDR2.IMGT.c.t + + dat$CDR2.IMGT.c.a + + dat$CDR2.IMGT.g.c + + dat$CDR2.IMGT.c.g + + dat$CDR2.IMGT.g.t + + dat$FR3.IMGT.g.a + + dat$FR3.IMGT.c.t + + dat$FR3.IMGT.c.a + + dat$FR3.IMGT.g.c + + dat$FR3.IMGT.c.g + + dat$FR3.IMGT.g.t) + +result[4,"x"] = transitionMutationsAtGC +result[4,"y"] = totalMutationsAtGC +result[4,"z"] = round(result[4,"x"] / result[4,"y"] * 100, digits=1) + +result[5,"x"] = totalMutationsAtGC +result[5,"y"] = VRegionMutations +result[5,"z"] = round(result[5,"x"] / result[5,"y"] * 100, digits=1) + + +#transitiontable + +transitionTable = data.frame(A=1:4,C=1:4,G=1:4,T=1:4) +row.names(transitionTable) = c("A", "C", "G", "T") +transitionTable["A","A"] = NA +transitionTable["C","C"] = NA +transitionTable["G","G"] = NA +transitionTable["T","T"] = NA +nts = c("a", "c", "g", "t") + + +for(nt1 in nts){ + for(nt2 in nts){ + if(nt1 == nt2){ + next + } + NT1 = LETTERS[letters == nt1] + NT2 = LETTERS[letters == nt2] + FR1 = paste("FR1.IMGT.", nt1, ".", nt2, sep="") + CDR1 = paste("CDR1.IMGT.", nt1, ".", nt2, sep="") + FR2 = paste("FR2.IMGT.", nt1, ".", nt2, sep="") + CDR2 = paste("CDR2.IMGT.", nt1, ".", nt2, sep="") + FR3 = paste("FR3.IMGT.", nt1, ".", nt2, sep="") + transitionTable[NT1,NT2] = sum( dat[,FR1] + + dat[,CDR1] + + dat[,FR2] + + 
dat[,CDR2] + + dat[,FR3]) + } +} + +setwd(outputdir) +write.table(x=result, file="mutations.txt", sep=",",quote=F,row.names=T,col.names=F) +write.table(x=transitionTable, file="transitions.txt", sep=",",quote=F,row.names=T,col.names=NA) +cat(length(dat$Sequence.ID), file="n.txt") + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r 000000000000 -r 74d2bc479bee mutation_analysis.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mutation_analysis.xml Mon Aug 18 04:04:37 2014 -0400 @@ -0,0 +1,15 @@ + + + + wrapper.sh $in_file $out_file $out_file.files_path + + + + + + + + + there is no help + + diff -r 000000000000 -r 74d2bc479bee piechart.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/piechart.r Mon Aug 18 04:04:37 2014 -0400 @@ -0,0 +1,24 @@ +if (!("ggplot2" %in% rownames(installed.packages()))) { + install.packages("ggplot2", repos="http://cran.xl-mirror.nl/") +} +library(ggplot2) + +args <- commandArgs(trailingOnly = TRUE) + +vls = args[1] +lbls = args[2] +name = args[3] +output = args[4] + +vls = as.numeric(unlist(strsplit(vls, ","))) +lbls = unlist(strsplit(lbls, ",")) + +pc = ggplot(data.frame(Type=lbls, vls=vls), aes(x = factor(1), y=vls, fill=Type)) +pc = pc + geom_bar(width = 1, stat = "identity") +pc = pc + coord_polar(theta="y") +pc = pc + xlab(" ") + ylab(" ") + ggtitle(name) + +png(filename=output) +#pie(vls, labels=lbls, col=rainbow(length(vls)), main=name) +pc +dev.off() diff -r 000000000000 -r 74d2bc479bee wrapper.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/wrapper.sh Mon Aug 18 04:04:37 2014 -0400 @@ -0,0 +1,121 @@ +#!/bin/bash +dir="$(cd "$(dirname "$0")" && pwd)" + +input=$1 +output=$2 +outdir=$3 +mkdir $outdir + +unzip $input -d $PWD/files/ > $PWD/unziplog.log +cat $PWD/files/*/1_* > $PWD/summary.txt +cat $PWD/files/*/7_* > $PWD/mutationanalysis.txt +cat $PWD/files/*/8_* > $PWD/mutationstats.txt +cat $PWD/files/*/10_* > $PWD/hotspots.txt + +mkdir $outdir/identification/ +python $dir/gene_identification.py --input $PWD/summary.txt --outdir $outdir/identification/ +genes=(ca ca1 ca2 cg cg1 cg2 cg3 cg4 cm) +tmp=$PWD/tmp +tmp2=$PWD/tmp2 +hotspottmp=$PWD/hotspottmp +mutationtmp=$PWD/mutationtmp +touch $outdir/mutationandhotspot.txt +for gene in ${genes[@]} +do + echo "Running $gene
" >> $output + mkdir $outdir/$gene + echo "Filtering input..." >> $output + Rscript $dir/filter.r $PWD/summary.txt $outdir/identification/${gene}.txt $outdir/$gene/summary.txt + Rscript $dir/filter.r $PWD/mutationanalysis.txt $outdir/identification/${gene}.txt $outdir/$gene/mutationanalysis.txt + Rscript $dir/filter.r $PWD/mutationstats.txt $outdir/identification/${gene}.txt $outdir/$gene/mutationstats.txt + Rscript $dir/filter.r $PWD/hotspots.txt $outdir/identification/${gene}.txt $outdir/$gene/hotspots.txt + echo "done
" >> $output + + echo "Running R script on $gene..." >> $output + Rscript --verbose $dir/mutation_analysis.r $outdir/$gene/mutationstats.txt $outdir/$gene/summary.txt $outdir/$gene/ 2>&1 + echo "done
" >> $output + + echo "Running Python script..." >> $output + python $dir/mutation_analysis.py --mutationfile $outdir/$gene/mutationanalysis.txt --hotspotfile $outdir/$gene/hotspots.txt --output $outdir/$gene/hotspot_analysis.txt + echo "done
" >> $output + echo "Done with $gene
" >> $output + cut $outdir/$gene/mutations.txt -d, -f2,3,4 > $mutationtmp + cut $outdir/$gene/hotspot_analysis.txt -d, -f2,3,4 > $hotspottmp + cat $mutationtmp $hotspottmp > $tmp + paste $outdir/mutationandhotspot.txt -d, $tmp > $tmp2 + cat $tmp2 > $outdir/mutationandhotspot.txt +done + +Rscript --verbose $dir/mutation_analysis.r $PWD/mutationstats.txt $PWD/summary.txt $outdir/ 2>&1 +python $dir/mutation_analysis.py --mutationfile $PWD/mutationanalysis.txt --hotspotfile $PWD/hotspots.txt --output $outdir/hotspot_analysis.txt + +cut $outdir/mutations.txt -d, -f2,3,4 > $mutationtmp +cut $outdir/hotspot_analysis.txt -d, -f2,3,4 > $hotspottmp +cat $mutationtmp $hotspottmp > $tmp +paste $outdir/mutationandhotspot.txt -d, $tmp > $tmp2 +cat $tmp2 > $outdir/mutationandhotspot.txt + +cut $outdir/ca1/mutations.txt -d, -f1 > $mutationtmp +cut $outdir/ca1/hotspot_analysis.txt -d, -f1 > $hotspottmp +cat $mutationtmp $hotspottmp > $tmp +paste $tmp $outdir/mutationandhotspot.txt -d, > $tmp2 +cat $tmp2 | tr -s "," > $outdir/mutationandhotspot.txt + +ca_n=`cat $outdir/ca/n.txt` +ca1_n=`cat $outdir/ca1/n.txt` +ca2_n=`cat $outdir/ca2/n.txt` +cg_n=`cat $outdir/cg/n.txt` +cg1_n=`cat $outdir/cg1/n.txt` +cg2_n=`cat $outdir/cg2/n.txt` +cg3_n=`cat $outdir/cg3/n.txt` +cg4_n=`cat $outdir/cg4/n.txt` +cm_n=`cat $outdir/cm/n.txt` +#all_n=$((ca_n + cg_n + cm_n)) +all_n=`cat $outdir/n.txt` + + +echo "" > $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +echo "" >> $output +while IFS=, read name cax cay caz ca1x ca1y ca1z ca2x ca2y ca2z cgx cgy cgz cg1x cg1y cg1z cg2x cg2y cg2z cg3x cg3y cg3z cg4x cg4y cg4z cmx cmy cmz allx ally allz + do + echo "" >> $output +done < $outdir/mutationandhotspot.txt +echo "
infoca (N = $ca_n)ca1 (N = $ca1_n)ca2 (N = $ca2_n)cg (N = $cg_n)cg1 (N = $cg1_n)cg2 (N = $cg2_n)cg3 (N = $cg3_n)cg4 (N = $cg4_n)cm (N = $cm_n)all (N = $all_n)
$name${cax}/${cay} (${caz}%)${ca1x}/${ca1y} (${ca1z}%)${ca2x}/${ca2y} (${ca2z}%)${cgx}/${cgy} (${cgz}%)${cg1x}/${cg1y} (${cg1z}%)${cg2x}/${cg2y} (${cg2z}%)${cg3x}/${cg3y} (${cg3z}%)${cg4x}/${cg4y} (${cg4z}%)${cmx}/${cmy} (${cmz}%)${allx}/${ally} (${allz}%)
" >> $output +echo "unmatched
" >> $output + +Rscript $dir/piechart.r "${ca_n},${cg_n},${cm_n}" "IgA - ${ca_n},IgG - ${cg_n},IgM? - ${cm_n}" "Ig* (N = $all_n)" $outdir/all.png 2>&1 +Rscript $dir/piechart.r "${ca1_n},${ca2_n}" "IgA1 - ${ca1_n},IgA2 - ${ca2_n}" "IgA (N = $ca_n)" $outdir/ca.png 2>&1 +Rscript $dir/piechart.r "${cg1_n},${cg2_n},${cg3_n},${cg4_n}" "IgG1 - ${cg1_n},IgG2 - ${cg2_n},IgG3 - ${cg3_n},IgG4 - ${cg4_n}" "IgG (N = $cg_n)" $outdir/cg.png 2>&1 + +echo "" >> $output +echo "" >> $output +echo "" >> $output + +for gene in ${genes[@]} +do + echo "" >> $output + while IFS=, read from a c g t + do + echo "" >> $output + done < $outdir/$gene/transitions.txt + echo "
$gene transition table
$from$a$c$g$t
" >> $output +done + +echo "" >> $output +while IFS=, read from a c g t + do + echo "" >> $output +done < $outdir/transitions.txt +echo "
All transition table
$from$a$c$g$t
" >> $output + +echo "" >> $output