Mercurial > repos > vipints > fml_mergeloci
comparison fml_gff_groomer/scripts/gff_available_limits.py @ 0:a35d6c641115 default tip
Migrated tool version 1.0.0 from old tool shed archive to new tool shed repository
| author | vipints |
|---|---|
| date | Tue, 07 Jun 2011 16:47:44 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:a35d6c641115 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 # | |
| 3 # This program is free software; you can redistribute it and/or modify | |
| 4 # it under the terms of the GNU General Public License as published by | |
| 5 # the Free Software Foundation; either version 3 of the License, or | |
| 6 # (at your option) any later version. | |
| 7 # | |
| 8 # Written (W) 2010 Vipin T Sreedharan, Friedrich Miescher Laboratory of the Max Planck Society | |
| 9 # Copyright (C) 2010 Max Planck Society | |
| 10 # | |
| 11 # Description : Report the sequence identifiers, sources and feature types available in a GFF file | |
| 12 | |
| 13 import re, sys | |
| 14 import time | |
| 15 import collections | |
| 16 | |
def available_limits(gff_handle):
    """Count the identifiers, sources and feature types found in a GFF file.

    Args:
        gff_handle: Iterable yielding the lines of a GFF file, e.g. an
            open file handle.

    Returns:
        A dict with the keys 'gff_id', 'gff_source', 'gff_type' and
        'gff_source_type'. Each value is a plain dict mapping a tuple of
        column values (first column, second column, third column, and
        second + third column respectively) to the number of feature
        lines carrying those values.

    Raises:
        AssertionError: If a feature line does not consist of exactly
            nine tab-separated columns; the offending line is the message.
    """
    # Zero-based column indexes that form the key of each summary.
    filter_info = dict(gff_id=[0], gff_source_type=[1, 2],
                       gff_source=[1], gff_type=[2])
    cur_limits = dict((filter_key, collections.defaultdict(int))
                      for filter_key in filter_info)
    for line in gff_handle:
        record = line.strip('\n\r')
        # Blank or whitespace-only lines carry no feature; indexing into
        # an empty string here used to raise IndexError.
        if not record.strip() or record[0] == '#':
            continue
        parts = [p.strip() for p in record.split('\t')]
        # GFF files may carry FASTA sequence as well; those lines have no
        # tab separators and are skipped.
        if len(parts) == 1 and re.search(r'\w+', parts[0]):
            continue
        if len(parts) != 9:
            # Raised explicitly (same exception type as the former assert)
            # so the check is not stripped when running with python -O.
            raise AssertionError(line)
        for filter_key, cur_indexes in filter_info.items():
            cur_id = tuple(parts[i] for i in cur_indexes)
            cur_limits[filter_key][cur_id] += 1
    # Get rid of the default dicts. The summary names are used as keys
    # unchanged: the former "len(key) == 1" unwrapping could never match
    # these multi-character names, so it was dead code.
    return dict((key, dict(counts)) for key, counts in cur_limits.items())
| 41 | |
if __name__ == '__main__':
    # Command-line entry point: summarise the GFF file named by the first
    # argument and print the counts to stdout.
    # Single-argument print(...) calls produce identical output under
    # Python 2 and Python 3.

    stime = time.asctime(time.localtime(time.time()))
    print('-------------------------------------------------------')
    print('FeatureScan started on ' + stime)
    print('-------------------------------------------------------')

    # Python 2 needs the 'U' flag for universal newlines; Python 3 text
    # mode handles them by default and rejects the flag from 3.11 on.
    read_mode = 'rU' if sys.version_info[0] < 3 else 'r'
    try:
        gff_handle = open(sys.argv[1], read_mode)
    except (IndexError, IOError):
        # IndexError: no file argument given; IOError: file not readable.
        sys.stderr.write("Can't open the GFF3 file, terminating...\n")
        sys.stderr.write("USAGE: gff_available_limits.py <gff file>\n")
        sys.exit(-1)
    try:
        final_dict = available_limits(gff_handle)
    finally:
        # Release the handle even when the file turns out to be malformed.
        gff_handle.close()

    print('')
    print("==Overview of available source(s) and feature type(s) from GFF file==")
    print('')
    print("Chromosome identifier(s) and corresponding count:")
    for contig, cnt in sorted(final_dict['gff_id'].items()):
        print('\t' + str(contig[0]) + '\t' + str(cnt))
    print('')
    print("Source(s) of feature and corresponding count:")
    for source, cnt in sorted(final_dict['gff_source'].items()):
        print('\t' + str(source[0]) + '\t' + str(cnt))
    print('')
    print("Feature type(s) and corresponding count:")
    for ftype, cnt in sorted(final_dict['gff_type'].items()):
        print('\t' + str(cnt) + '\t' + str(ftype[0]))
    print('')
    print("Unique combination of Feature type(s), Source(s) and corresponding count:")
    for sftype, cnt in sorted(final_dict['gff_source_type'].items()):
        print('\t' + str(cnt) + '\t' + str(sftype[0]) + ', ' + str(sftype[1]))
    print('')

    stime = time.asctime(time.localtime(time.time()))
    print('-------------------------------------------------------')
    print('FeatureScan finished at ' + stime)
    print('-------------------------------------------------------')
