Mercurial > repos > rnateam > graphclust_postprocessing
comparison addCdhitseqs.py @ 4:dbcea781900e draft
planemo upload for repository https://github.com/eteriSokhoyan/galaxytools/tree/branchForIterations/tools/GraphClust/CollectResults commit f971832d2b34a182314e5201ea6895dd207c5923
| author | rnateam |
|---|---|
| date | Mon, 13 Mar 2017 17:54:32 -0400 |
| parents | 6c88ad83de28 |
| children | |
comparison
equal
deleted
inserted
replaced
| 3:a8fde40f00fc | 4:dbcea781900e |
|---|---|
| 1 import re | 1 import re |
| 2 import glob | 2 import glob |
| 3 import sys | 3 import sys |
| 4 | 4 |
| 5 cdhitcluster = sys.argv[1] | 5 cdhitcluster = sys.argv[1] |
| 6 #clusters = sys.argv[2] | |
| 7 | 6 |
| 8 cluster_seqs_stats_path = "RESULTS/*.cluster.all" | 7 cluster_seqs_stats_path = "RESULTS/*.cluster.all" |
| 9 cluster_seqs_stats_files = glob.glob(cluster_seqs_stats_path) | 8 cluster_seqs_stats_files = glob.glob(cluster_seqs_stats_path) |
| 10 | 9 |
| 11 #clusterFiles = clusters.split(',') | |
| 12 repSeqRedSeqdict = {} | 10 repSeqRedSeqdict = {} |
| 13 repLine = "" | 11 repLine = "" |
| 14 count = 0 | 12 count = 0 |
| 15 first = False | 13 first = False |
| | 14 add_FullId = "" |
| | 15 k = 0 |
| 16 | 16 |
| 17 with open(cdhitcluster, 'r+') as f: | 17 with open(cdhitcluster, 'r+') as f: |
| 18 lines = f.readlines() | 18 content = f.read() |
| | 19 reps = re.compile("^.*\*$", re.MULTILINE).findall(content) |
| | 20 lines = content.split('\n') |
| | 21 |
| 19 for i in range(0, len(lines)): | 22 for i in range(0, len(lines)): |
| 20 line = lines[i] | 23 line = lines[i] |
| 21 if ">Cluster" in line: | 24 if ">Cluster" in line: |
| 22 first = True | 25 first = True |
| 23 count = 0 | 26 count = 0 |
| 24 if i+1 < len(lines): | 27 repLine = reps[k] |
| 25 repLine = lines[i+1] | 28 k = k+1 |
| 26 continue | 29 continue |
| 27 elif not first: | 30 elif not first: |
| 28 count += 1 | 31 count += 1 |
| 29 first = False | 32 first = False |
| 30 else: | 33 else: |
| 31 first = False | 34 first = False |
| 32 lineArr = [] | 35 lineArr = [] |
| 33 if count > 0: | 36 if count > 0: |
| 34 repLine = repLine.strip() | 37 repLine = repLine.strip() |
| 35 rep_FullId = repLine.split()[2] | 38 rep_FullId = repLine.split()[2] |
| 36 rep_FullId = rep_FullId.replace(">", "") | 39 rep_FullId = rep_FullId.replace(">","") |
| 37 #rep_short_id = re.findall("_".join(["[^_]+"] * 2), rep_FullId)[0] | 40 rep_FullId = rep_FullId.replace("...","") |
| 38 rep_FullId = rep_FullId.replace("...", "") | 41 if "*" in line or not line.strip(): |
| | 42 continue |
| 39 line = line.strip() | 43 line = line.strip() |
| 40 add_FullId = line.split()[2] | 44 add_FullId = line.split()[2] |
| 41 add_FullId = add_FullId.replace(">", "") | 45 add_FullId = add_FullId.replace(">","") |
| 42 add_FullId = add_FullId.replace("...", "") | 46 add_FullId = add_FullId.replace("...","") |
| 43 #add_short_id = re.findall("_".join(["[^_]+"] * 2), add_FullId)[0] | |
| 44 lineArr.append(add_FullId) | 47 lineArr.append(add_FullId) |
| 45 repSeqRedSeqdict[rep_FullId] = lineArr | 48 repSeqRedSeqdict[rep_FullId] = lineArr |
| 46 #lineArr.append(add_short_id) | |
| 47 #repSeqRedSeqdict[rep_short_id] = lineArr | |
| 48 | 49 |
| 49 toWrite = "" | 50 toWrite = "" |
| 50 | |
| 51 for singleFile in sorted(cluster_seqs_stats_files): | 51 for singleFile in sorted(cluster_seqs_stats_files): |
| 52 with open(singleFile, "a+") as clFile: | 52 toWrite = "" |
| 53 file_content = clFile.read() | 53 with open(singleFile, "r+") as clFile: |
| 54 first_line = file_content.split('\n')[0] | 54 file_lines = clFile.readlines() |
| | 55 for line in file_lines: |
| | 56 line = '\t'.join(line.split()) |
| | 57 toWrite += line + '\n' |
| | 58 clFile.seek(0) |
| | 59 clFile.write(toWrite) |
| | 60 clFile.truncate() |
| | 61 first_line = file_lines[0] |
| | 62 toWrite = "" |
| | 63 cols = first_line.split() |
| | 64 file_content = '\n'.join(file_lines) |
| 55 for key, val in repSeqRedSeqdict.items(): | 65 for key, val in repSeqRedSeqdict.items(): |
| 56 if key in file_content: | 66 if key in file_content: |
| | 67 |
| 57 for i in val: | 68 for i in val: |
| 58 toWrite += first_line.split()[0] + " " + first_line.split()[1] + " " + first_line.split()[2] + " " + " - " + " " + "CD-Hit" + " " + first_line.split()[5] + " " + "ORIGID" + " " + str(i) + "\n" | 69 cols[3] = "---" |
| | 70 cols[4] = "CD-Hit" |
| | 71 cols[7] = str(i) |
| | 72 if len(first_line.split()) > 9: |
| | 73 cols[9] = str(i.rsplit("_",1)[0]) |
| | 74 toWrite += '\t'.join(cols) |
| | 75 toWrite +="\n" |
| 59 clFile.write(toWrite) | 76 clFile.write(toWrite) |
