Mercurial > repos > mvdbeek > tepid_merge_insertions
comparison merge_insertions.py @ 0:6e4b5319cb89 draft default tip
planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
| author | mvdbeek |
|---|---|
| date | Mon, 23 Jan 2017 10:05:12 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:6e4b5319cb89 |
|---|---|
| 1 from argparse import ArgumentParser | |
| 2 import os | |
| 3 import tempfile | |
| 4 import pandas as pd | |
| 5 import pybedtools | |
| 6 | |
| 7 COLUMNS = ['ins_chrom', | |
| 8 'ins_start', | |
| 9 'ins_end', | |
| 10 'ref_chrom', | |
| 11 'ref_start', | |
| 12 'ref_end', | |
| 13 'agi', | |
| 14 'accession', | |
| 15 'cluster'] | |
| 16 | |
| 17 def overlap(start1, stop1, start2, stop2, d=50): | |
| 18 """returns True if sets of coordinates overlap. Assumes coordinates are on same chromosome""" | |
| 19 return start1 <= stop2+d and stop1 >= start2-d | |
| 20 | |
| 21 | |
| 22 def merge(i, insertion, result): | |
| 23 if len(result) == 0: | |
| 24 result[i] = insertion | |
| 25 else: | |
| 26 if not can_merge(insertion, result): | |
| 27 result[i] = insertion | |
| 28 | |
| 29 | |
| 30 def can_merge(insertion, result): | |
| 31 """ | |
| 32 Merges insertions and returns True if all requirements are met | |
| 33 """ | |
| 34 for j, master_insertion in result.items(): | |
| 35 if insertion['agi'] & master_insertion['agi']: | |
| 36 if overlap(master_insertion['ins_start'], master_insertion['ins_end'], insertion['ins_start'],insertion['ins_end']): | |
| 37 # Adjusting the insertion start (doesn't really do anything?!) | |
| 38 if len(insertion['agi']) < len(master_insertion['agi']): | |
| 39 ref_start = master_insertion['ref_start'] | |
| 40 else: | |
| 41 ref_start = insertion['ref_start'] | |
| 42 if master_insertion['ins_chrom'] == insertion['ins_chrom'] and insertion['ref_chrom'] == master_insertion['ref_chrom'] and ref_start == master_insertion['ref_start']: | |
| 43 result[j]['accession'] = result[j]['accession'] | (insertion['accession']) | |
| 44 result[j]['agi'] = result[j]['agi'] | (insertion['agi']) | |
| 45 return True | |
| 46 return False | |
| 47 | |
| 48 | |
| 49 def inner_merge(s): | |
| 50 result = {} | |
| 51 for i, insertion in s.items(): | |
| 52 merge(i, insertion, result) | |
| 53 return result.values() | |
| 54 | |
| 55 | |
| 56 def reduce_and_cluster(inputfiles): | |
| 57 """ | |
| 58 Read in inputfiles using pandas, write additional column with sample identifier, | |
| 59 sort and cluster using pybedtools and return dataframe. | |
| 60 """ | |
| 61 usecols = [0,1,2,3,4,5,6] # skip col 7, which contains the read support id | |
| 62 tables = [pd.read_table(f, header=None) for f in inputfiles] | |
| 63 sample_ids = [os.path.basename(f).rsplit('.')[0] for f in inputfiles] | |
| 64 for sample_id, df in zip(sample_ids, tables): | |
| 65 df[7] = sample_id | |
| 66 merged_table = pd.concat(tables) | |
| 67 tfile = tempfile.NamedTemporaryFile() | |
| 68 merged_table.to_csv(tfile, sep="\t", header=None, index=False) | |
| 69 tfile.flush() | |
| 70 bedfile = pybedtools.BedTool(tfile.name).sort().cluster(d=50) | |
| 71 df = bedfile.to_dataframe() | |
| 72 df.columns = COLUMNS | |
| 73 # Split comma separated agi values and make set | |
| 74 df['agi'] = [set(v.split(',')) for v in df['agi'].values] | |
| 75 df['accession'] = [set(str(v).split(',')) for v in df['accession'].values] | |
| 76 return df | |
| 77 | |
| 78 | |
| 79 def split_clusters(df): | |
| 80 """ | |
| 81 clusters as defined by bedtools allow for 50 nt distance. This means that | |
| 82 clusters can be many kb large, so we check each individual insertion in | |
| 83 the cluster against the other insertions. We split the clusters based on | |
| 84 whether the overlap and TE identity criteria are fulfilled (so a | |
| 85 different TE would lead to a split in the clusters) | |
| 86 """ | |
| 87 groups = df.groupby('cluster') | |
| 88 nested_list = [inner_merge(group.transpose().to_dict()) for _, group in groups] | |
| 89 return pd.DataFrame([i for n in nested_list for i in n])[COLUMNS] | |
| 90 | |
| 91 | |
| 92 def write_output(df, output): | |
| 93 # Turn sets back to comma-separated values | |
| 94 df['agi'] = [",".join(agi) for agi in df['agi']] | |
| 95 df['accession'] = [",".join(acc) for acc in df['accession']] | |
| 96 # write out result without last column | |
| 97 df.to_csv(output, sep="\t",header=None, index=None, columns=COLUMNS[:-1]) | |
| 98 | |
| 99 | |
| 100 def main(inputfiles, output): | |
| 101 df = reduce_and_cluster(inputfiles) | |
| 102 df = split_clusters(df) | |
| 103 write_output(df, output) | |
| 104 | |
| 105 | |
| 106 if __name__ == "__main__": | |
| 107 parser = ArgumentParser(description='Merge TE insertions calls') | |
| 108 parser.add_argument('-o', '--output', help='output file', required=True) | |
| 109 parser.add_argument('-i', '--input', help='Insertion files to merge', nargs="+", required=True) | |
| 110 options = parser.parse_args() | |
| 111 | |
| 112 main(inputfiles=options.input, output=options.output) |
