comparison find_in_reference.py @ 0:fe0327a3ba81

Uploaded
author jjohnson
date Sat, 04 Jan 2014 09:03:57 -0500
parents
children 856033fb26e8
comparison
equal deleted inserted replaced
-1:000000000000 0:fe0327a3ba81
1 #!/usr/bin/env python
2 """
3 #
4 #------------------------------------------------------------------------------
5 # University of Minnesota
6 # Copyright 2013, Regents of the University of Minnesota
7 #------------------------------------------------------------------------------
8 # Author:
9 #
10 # James E Johnson
11 #
12 #------------------------------------------------------------------------------
13 """
14
15 """
16 Takes 2 tabular files as input:
17 1. The file to be filtered
18 2. The reference file
19
20 The string value of the selected column of each input line is searched for
21 as a substring in the values of the selected column of the reference file.
22
23 The intended purpose is to filter a peptide fasta file in tabular format
24 by whether those peptide sequences are found in a reference fasta file.
25
26 """
27 import sys,re,os.path
28 import tempfile
29 import optparse
30 from optparse import OptionParser
31 import logging
32
33
def _fail(message, status):
    """Report *message* on stderr and terminate with exit code *status*."""
    sys.stderr.write("failed: %s\n" % message)
    sys.exit(status)


def _column_index(value):
    """Convert a 1-based column option into a 0-based list index.

    Returns -1 (i.e. the last column) when *value* is None or is not a
    positive integer.  Raises ValueError when *value* is not an integer.
    """
    if value is None:
        return -1
    number = int(value)
    if number > 0:
        return number - 1
    return -1


def _read_reference_values(path, column):
    """Return the *column* value of every tab-separated line in *path*.

    Lines that do not have the requested column are skipped: they cannot
    contain a search string.  Trailing line terminators are removed from
    the returned values.
    """
    values = []
    with open(path, 'r') as handle:
        for row in handle:
            fields = row.split('\t')
            try:
                values.append(fields[column].rstrip('\r\n'))
            except IndexError:
                continue
    return values


def __main__():
    """Filter a tabular input file by substring lookup in a reference file.

    Command-line options are read from ``sys.argv``.  For each input line
    the value of the selected column is searched for, as a substring, in the
    selected column of every reference line.  Without ``--keep`` the lines
    that were NOT found go to ``--output`` and the found lines go to
    ``--filtered``; ``--keep`` swaps the two destinations.  When neither
    ``--output`` nor ``--filtered`` is given, the output goes to stdout.

    Exit codes: 2 for a bad/missing input, reference or column option,
    3 when an output file cannot be opened.
    """
    # Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option('-i', '--input', dest='input', help='The input file to filter. (Otherwise read from stdin)')
    parser.add_option('-r', '--reference', dest='reference', help='The reference file to filter against')
    parser.add_option('-o', '--output', dest='output', help='The output file for input lines filtered by reference')
    parser.add_option('-f', '--filtered', dest='filtered', help='The output file for input lines not in the output')
    parser.add_option('-c', '--input_column', dest='input_column', default=None, help='The column for the value in the input file. (first column = 1, default to last column)')
    parser.add_option('-C', '--reference_column', dest='reference_column', default=None, help='The column for the value in the reference file. (first column = 1, default to last column)')
    parser.add_option('-k', '--keep', dest='keep', action="store_true", default=False, help='Write input lines found in the reference to the output (default: write the lines NOT found in the reference)')
    parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout')
    (options, _args) = parser.parse_args()

    input_file = sys.stdin
    out_file = None
    filtered_file = None
    try:
        # Input file
        if options.input is not None:
            try:
                input_file = open(os.path.abspath(options.input), 'r')
            except (IOError, OSError) as e:
                _fail(e, 2)
        # Reference
        if options.reference is None:
            _fail("reference file is required", 2)
        # Columns: the options arrive as strings, so convert before comparing.
        try:
            incol = _column_index(options.input_column)
            refcol = _column_index(options.reference_column)
        except ValueError:
            parser.error("column options must be integers")
        # Read the reference once, before any output file is truncated,
        # instead of re-opening it for every input line.
        try:
            reference_values = _read_reference_values(options.reference, refcol)
        except (IOError, OSError) as e:
            _fail("Error reading %s - %s" % (options.reference, e), 2)
        # Output files
        if options.filtered is None and options.output is None:
            # write to stdout
            out_file = sys.stdout
        else:
            try:
                if options.output is not None:
                    out_file = open(os.path.abspath(options.output), 'w')
                if options.filtered is not None:
                    filtered_file = open(os.path.abspath(options.filtered), 'w')
            except (IOError, OSError) as e:
                _fail(e, 3)

        keep = bool(options.keep)
        for line_number, line in enumerate(input_file, 1):
            try:
                search_string = line.split('\t')[incol].rstrip('\r\n')
            except IndexError:
                # A row without the requested column cannot be searched.
                sys.stderr.write("failed: input line %d has no column %d\n" % (line_number, incol + 1))
                continue
            if options.debug:
                sys.stderr.write("search: %s\n" % (search_string))
            found = False
            for target_string in reference_values:
                if options.debug:
                    sys.stderr.write("in: %s %s %s\n" % (search_string, search_string in target_string, target_string))
                if search_string in target_string:
                    found = True
                    break
            # Found lines go to the output only with --keep; otherwise the
            # output receives the lines that were not found.
            destination = out_file if found == keep else filtered_file
            if destination is not None:
                destination.write(line)
    finally:
        # Close only the handles this function opened itself.
        for handle in (input_file, out_file, filtered_file):
            if handle is None or handle is sys.stdin or handle is sys.stdout:
                continue
            handle.close()
121
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    __main__()
123