Mercurial > repos > artbio > rsem
annotate purge_gtf_from_multichrom_genes.py @ 11:c86ed39b72eb draft
"planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 10719ebe96c608be124360a4be62ec4d164412b3"
author | artbio |
---|---|
date | Sun, 05 Jan 2020 22:07:36 +0000 |
parents | |
children |
rev | line source |
---|---|
11
c86ed39b72eb
"planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 10719ebe96c608be124360a4be62ec4d164412b3"
artbio
parents:
diff
changeset
|
1 #!/usr/bin/env python |
c86ed39b72eb
"planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 10719ebe96c608be124360a4be62ec4d164412b3"
artbio
parents:
diff
changeset
|
2 |
c86ed39b72eb
"planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 10719ebe96c608be124360a4be62ec4d164412b3"
artbio
parents:
diff
changeset
|
3 import argparse |
c86ed39b72eb
"planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 10719ebe96c608be124360a4be62ec4d164412b3"
artbio
parents:
diff
changeset
|
4 from collections import defaultdict |
c86ed39b72eb
"planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 10719ebe96c608be124360a4be62ec4d164412b3"
artbio
parents:
diff
changeset
|
5 |
c86ed39b72eb
"planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 10719ebe96c608be124360a4be62ec4d164412b3"
artbio
parents:
diff
changeset
|
6 |
c86ed39b72eb
"planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 10719ebe96c608be124360a4be62ec4d164412b3"
artbio
parents:
diff
changeset
|
def command_parse(argv=None):
    '''
    Build the command-line parser of the script and parse the arguments.

    argv: optional list of argument strings. When left to None, argparse
    falls back to the arguments of the running process (sys.argv[1:]).

    Return the argparse namespace with three attributes:
    - input: path of the input GTF file (required)
    - output: name of the purged GTF file (default 'output.gtf')
    - log: name of the log of purged genes (default 'purged_genes.log')
    '''
    # Adjacent string literals are used instead of a backslash inside the
    # literal, so that source indentation never leaks into the help texts.
    parser = argparse.ArgumentParser(
        description='Purge GTF file from genes that are on several '
                    'chromosomes and list them in a log file')
    parser.add_argument(
        '-i', '--input', dest='input', help='input GTF file', required=True)
    parser.add_argument(
        '-o', '--output', dest='output', help='output file name',
        default='output.gtf')
    parser.add_argument(
        '-l', '--log', dest='log', help='log of purged genes',
        default='purged_genes.log')
    return parser.parse_args(argv)
c86ed39b72eb
"planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 10719ebe96c608be124360a4be62ec4d164412b3"
artbio
parents:
diff
changeset
|
18 |
c86ed39b72eb
"planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 10719ebe96c608be124360a4be62ec4d164412b3"
artbio
parents:
diff
changeset
|
19 |
c86ed39b72eb
"planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 10719ebe96c608be124360a4be62ec4d164412b3"
artbio
parents:
diff
changeset
|
def get_genes(gtf_file):
    '''
    Read a GTF file and collect, for each gene_id, the chromosomes of the
    records that carry it.

    gtf_file: path of the GTF file to read.

    Return a defaultdict(list) mapping each gene_id to the list of the
    chromosome names (first column) of its records, one entry per record,
    in file order (duplicates are kept).

    Lines starting with '#' and lines without a gene_id attribute (blank
    lines included) are ignored.
    '''
    genes = defaultdict(list)
    with open(gtf_file, 'r') as fh:
        for line in fh:
            if line.startswith('#'):
                continue
            # Strip only the newline: the last line of a file may have none,
            # and slicing the last character off would then eat real data.
            fields = line.rstrip('\n').split('\t')
            # The gene_id value is whatever sits between 'gene_id "' and the
            # next double quote, whichever attribute comes after it.
            _, found, tail = fields[-1].partition('gene_id "')
            if not found:
                continue
            genes[tail.partition('"')[0]].append(fields[0])
    return genes
c86ed39b72eb
"planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 10719ebe96c608be124360a4be62ec4d164412b3"
artbio
parents:
diff
changeset
|
31 |
c86ed39b72eb
"planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 10719ebe96c608be124360a4be62ec4d164412b3"
artbio
parents:
diff
changeset
|
32 |
c86ed39b72eb
"planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 10719ebe96c608be124360a4be62ec4d164412b3"
artbio
parents:
diff
changeset
|
def generate_output(genes, log_file):
    '''
    Search for all genes that are present on several chromosomes.

    genes: mapping of gene name to an iterable of chromosome names
    (duplicates allowed). It is only read, never modified.
    log_file: path of the log file to write (overwritten if it exists).

    Return the list of the genes found on more than one distinct chromosome,
    in the iteration order of genes. The log file is tab delimited, with one
    such gene per line followed by its comma delimited list of chromosomes,
    sorted so that the log is identical from one run to the next.
    '''
    target_genes = []
    # The context manager closes the log even if a write fails.
    with open(log_file, 'w') as log:
        for name_gene, chroms in genes.items():
            unique_chroms = sorted(set(chroms))
            if len(unique_chroms) > 1:
                target_genes.append(name_gene)
                log.write('%s\t%s\n' % (name_gene, ','.join(unique_chroms)))
    return target_genes
c86ed39b72eb
"planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 10719ebe96c608be124360a4be62ec4d164412b3"
artbio
parents:
diff
changeset
|
51 |
c86ed39b72eb
"planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 10719ebe96c608be124360a4be62ec4d164412b3"
artbio
parents:
diff
changeset
|
52 |
c86ed39b72eb
"planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 10719ebe96c608be124360a4be62ec4d164412b3"
artbio
parents:
diff
changeset
|
def purge_gtf(target_genes, gtf_file, output_file):
    '''
    Copy a GTF file, leaving out every record whose gene_id is one of
    target_genes.

    target_genes: iterable of the gene_id values to remove.
    gtf_file: path of the GTF file to read.
    output_file: path of the purged GTF file to write (overwritten if it
    exists).

    Lines starting with '#' and lines without a gene_id attribute are copied
    unchanged. Nothing is returned.
    '''
    # Build the set once: membership is tested for every line of the file.
    purged = set(target_genes)
    with open(gtf_file, 'r') as gtf_handler, \
            open(output_file, 'w') as output_gtf:
        for line in gtf_handler:
            if not line.startswith('#'):
                attributes = line.rstrip('\n').split('\t')[-1]
                # The gene_id value sits between 'gene_id "' and the next
                # double quote, whichever attribute comes after it.
                _, found, tail = attributes.partition('gene_id "')
                if found and tail.partition('"')[0] in purged:
                    continue
            output_gtf.write(line)
c86ed39b72eb
"planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 10719ebe96c608be124360a4be62ec4d164412b3"
artbio
parents:
diff
changeset
|
67 |
c86ed39b72eb
"planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 10719ebe96c608be124360a4be62ec4d164412b3"
artbio
parents:
diff
changeset
|
68 |
c86ed39b72eb
"planemo upload for repository https://github.com/artbio/tools-artbio/tree/master/tools/rsem commit 10719ebe96c608be124360a4be62ec4d164412b3"
artbio
parents:
diff
changeset
|
def __main__():
    '''
    Entry point of the script: parse the command line, list in the log file
    the genes found on several chromosomes, then write the GTF file purged
    from these genes.
    '''
    options = command_parse()
    chromosomes_by_gene = get_genes(options.input)
    multichrom_genes = generate_output(chromosomes_by_gene, options.log)
    purge_gtf(multichrom_genes, options.input, options.output)


if __name__ == "__main__":
    __main__()