graphclust_postprocessing: addCdhitseqs.py comparison

comparison addCdhitseqs.py @ 4:dbcea781900e draft

planemo upload for repository https://github.com/eteriSokhoyan/galaxytools/tree/branchForIterations/tools/GraphClust/CollectResults commit f971832d2b34a182314e5201ea6895dd207c5923

author	rnateam
date	Mon, 13 Mar 2017 17:54:32 -0400
parents	6c88ad83de28
children	(none)

comparison

Legend: equal (no marker), deleted (-), inserted (+), replaced (shown as a deleted line paired with an inserted line)

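-:a8fde40f00fc (old side of the comparison; its removed lines are prefixed with "-")
+:dbcea781900e (new side of the comparison, revision 4; its added lines are prefixed with "+")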
 import re
 import glob
 import sys
 cdhitcluster = sys.argv[1]
-#clusters = sys.argv[2]
 cluster_seqs_stats_path = "RESULTS/*.cluster.all"
 cluster_seqs_stats_files = glob.glob(cluster_seqs_stats_path)
-#clusterFiles = clusters.split(',')
 repSeqRedSeqdict = {}
 repLine = ""
 count = 0
 first = False
+add_FullId = ""
+k = 0
 with open(cdhitcluster, 'r+') as f:
-lines = f.readlines()
+content = f.read()
+reps = re.compile("^.*\*$", re.MULTILINE).findall(content)
+lines = content.split('\n')
 for i in range(0, len(lines)):
 line = lines[i]
 if ">Cluster" in line:
 first = True
 count = 0
-if i+1 < len(lines):
+repLine = reps[k]
-repLine = lines[i+1]
+k = k+1
 continue
 elif not first:
 count += 1
 first = False
 else:
 first = False
 lineArr = []
 if count > 0:
 repLine = repLine.strip()
 rep_FullId = repLine.split()[2]
-rep_FullId = rep_FullId.replace(">", "")
+rep_FullId = rep_FullId.replace(">","")
-#rep_short_id = re.findall("_".join(["[^_]+"] * 2), rep_FullId)[0]
+rep_FullId = rep_FullId.replace("...","")
-rep_FullId = rep_FullId.replace("...", "")
+if "*" in line or not line.strip():
+continue
 line = line.strip()
 add_FullId = line.split()[2]
-add_FullId = add_FullId.replace(">", "")
+add_FullId = add_FullId.replace(">","")
-add_FullId = add_FullId.replace("...", "")
+add_FullId = add_FullId.replace("...","")
-#add_short_id = re.findall("_".join(["[^_]+"] * 2), add_FullId)[0]
 lineArr.append(add_FullId)
 repSeqRedSeqdict[rep_FullId] = lineArr
-#lineArr.append(add_short_id)
-#repSeqRedSeqdict[rep_short_id] = lineArr
 toWrite = ""
 for singleFile in sorted(cluster_seqs_stats_files):
-with open(singleFile, "a+") as clFile:
+toWrite = ""
-file_content = clFile.read()
+with open(singleFile, "r+") as clFile:
-first_line = file_content.split('\n')[0]
+file_lines = clFile.readlines()
+for line in file_lines:
+line = '\t'.join(line.split())
+toWrite += line + '\n'
+clFile.seek(0)
+clFile.write(toWrite)
+clFile.truncate()
+first_line = file_lines[0]
+toWrite = ""
+cols = first_line.split()
+file_content =  '\n'.join(file_lines)
 for key, val in repSeqRedSeqdict.items():
 if key in file_content:
 for i in val:
-toWrite += first_line.split()[0] + "  " + first_line.split()[1] + "  " + first_line.split()[2] + "  " + " - " + "   " + "CD-Hit" + "    " + first_line.split()[5] + "  " + "ORIGID" + "  "  + str(i) + "\n"
+cols[3] = "---"
+cols[4] = "CD-Hit"
+cols[7] = str(i)
+if len(first_line.split()) > 9:
+cols[9] = str(i.rsplit("_",1)[0])
+toWrite += '\t'.join(cols)
+toWrite +="\n"
 clFile.write(toWrite)

Mercurial > repos > rnateam > graphclust_postprocessing

comparison addCdhitseqs.py @ 4:dbcea781900e draft