Mercurial > repos > cropgeeks > flapjack
comparison FlapjackProject.py @ 65:d7e91a614582 draft
Uploaded
| author | cropgeeks |
|---|---|
| date | Wed, 21 Feb 2018 11:54:51 -0500 |
| parents | |
| children | 84ce7c332dc4 |
comparison
equal
deleted
inserted
replaced
| 64:3b4e505bdad3 | 65:d7e91a614582 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 # encoding: utf-8 | |
| 3 ''' | |
| 4 DNASampleSplitter -- shortdesc | |
| 5 | |
| 6 DNASampleSplitter is a description | |
| 7 | |
| 8 It defines classes_and_methods | |
| 9 | |
| 10 @author: John Carlos Ignacia, Milcah, Yaw Nti-Addae | |
| 11 | |
| 12 @copyright: 2017 Cornell University. All rights reserved. | |
| 13 | |
| 14 @license: MIT License | |
| 15 | |
| 16 @contact: yn259@cornell.edu | |
| 17 @deffield updated: Updated | |
| 18 ''' | |
| 19 | |
| 20 import sys | |
| 21 import os | |
| 22 import math | |
| 23 import pandas as pd | |
| 24 | |
| 25 from optparse import OptionParser | |
| 26 from __builtin__ import str | |
| 27 from subprocess import call | |
| 28 | |
| 29 __all__ = [] | |
| 30 __version__ = 0.1 | |
| 31 __date__ = '2017-06-20' | |
| 32 __updated__ = '2017-06-20' | |
| 33 | |
| 34 DEBUG = 1 | |
| 35 TESTRUN = 0 | |
| 36 PROFILE = 0 | |
| 37 | |
| 38 parents = {} | |
| 39 | |
| 40 def splitfile(my_file, sample_data): | |
| 41 header = '' | |
| 42 fj_header = '' | |
| 43 with open(my_file) as infile: | |
| 44 for line in infile: | |
| 45 if line[:2] == '# ': | |
| 46 fj_header += line | |
| 47 elif header == '': | |
| 48 if fj_header == '': | |
| 49 fj_header = '# fjFile = PHENOTYPE\n' | |
| 50 header_list = line.split('\t') | |
| 51 if header_list[0] != '': | |
| 52 header_list[0] = '' | |
| 53 line = "\t".join(header_list) | |
| 54 header = fj_header+line | |
| 55 else: | |
| 56 lst = line.split('\t') | |
| 57 dnarun = lst[0] | |
| 58 dnarun_data = sample_data[sample_data.dnarun_name == dnarun] | |
| 59 group = list(dnarun_data.dnasample_sample_group)[0] | |
| 60 cycle = list(dnarun_data.dnasample_sample_group_cycle)[0] | |
| 61 | |
| 62 filename = my_file | |
| 63 isParent = False | |
| 64 for key in parents: | |
| 65 value = parents[key] | |
| 66 if dnarun in value: | |
| 67 filename = my_file+'_'+key+'.txt' | |
| 68 if not os.path.isfile(filename) : | |
| 69 f = open(filename, "w") | |
| 70 f.write('%s' % header) | |
| 71 else: | |
| 72 f=open(filename, "a+") | |
| 73 f.write('%s' % line) | |
| 74 isParent = True | |
| 75 | |
| 76 if isParent: | |
| 77 continue | |
| 78 | |
| 79 if isinstance(group, float) and math.isnan(group): | |
| 80 continue | |
| 81 else: | |
| 82 # get parent data # | |
| 83 | |
| 84 # get file name for genotype data | |
| 85 if isinstance(cycle, float) and math.isnan(cycle): | |
| 86 filename += '_'+group+'.txt' | |
| 87 else: | |
| 88 filename += '_'+group+'_'+cycle+'.txt' | |
| 89 | |
| 90 # save genotype data to file | |
| 91 if not os.path.isfile(filename) : | |
| 92 f = open(filename, "w") | |
| 93 f.write('%s' % header) | |
| 94 else : | |
| 95 f=open(filename, "a+") | |
| 96 f.write('%s' % line) | |
| 97 | |
| 98 def splitData(samplefile, genofile): | |
| 99 # Split sample file # | |
| 100 sample_data = pd.read_table(samplefile, dtype='str') | |
| 101 group_list = sample_data.dnasample_sample_group.drop_duplicates() | |
| 102 for index, item in group_list.iteritems(): | |
| 103 if isinstance(item, float): | |
| 104 if math.isnan(item): | |
| 105 continue | |
| 106 elif isinstance(item, str): | |
| 107 if not item: | |
| 108 continue | |
| 109 df = sample_data[sample_data.dnasample_sample_group == item] | |
| 110 | |
| 111 # store dnaruns of parents in a dictionary | |
| 112 par1 = list(set(filter(lambda x: str(x) != 'nan', df.germplasm_par1))) | |
| 113 par2 = list(set(filter(lambda x: str(x) != 'nan', df.germplasm_par2))) | |
| 114 lst1 = list(sample_data.loc[sample_data.germplasm_name.isin(par1), 'dnarun_name']) | |
| 115 lst2 = list(sample_data.loc[sample_data.germplasm_name.isin(par2), 'dnarun_name']) | |
| 116 mergedlst = lst1 + lst2 | |
| 117 | |
| 118 subgroup_list = df.dnasample_sample_group_cycle.drop_duplicates() | |
| 119 for idx, sub in subgroup_list.iteritems(): | |
| 120 if isinstance(sub, float): | |
| 121 if math.isnan(sub): | |
| 122 # df.to_csv(samplefile+"_"+item+".txt", index=None, na_rep='', sep="\t", mode="w", line_terminator="\n") | |
| 123 if not item in parents and mergedlst: | |
| 124 parents.update({item : mergedlst}) | |
| 125 continue | |
| 126 elif isinstance(sub, str): | |
| 127 if not sub: | |
| 128 # df.to_csv(samplefile+"_"+item+".txt", index=None, na_rep='', sep="\t", mode="w", line_terminator="\n") | |
| 129 continue | |
| 130 | |
| 131 subkey = item+'_'+sub | |
| 132 if not subkey in parents and mergedlst: | |
| 133 parents.update({subkey : lst1+lst2}) | |
| 134 # df_sub = df[df.dnasample_sample_group_cycle == sub] | |
| 135 # df_sub.to_csv(samplefile+"_"+item+"_"+sub+".txt", index=None, na_rep='', sep="\t", mode="w", line_terminator="\n") | |
| 136 | |
| 137 # Split genotype file based on sample information # | |
| 138 splitfile(samplefile, sample_data) | |
| 139 splitfile(genofile, sample_data) | |
| 140 | |
| 141 def createProjectFile(groups, samplefile, genofile, jarfile, separator, missing, qtlfile, mapfile): | |
| 142 for key in groups: | |
| 143 sfile = samplefile+'_'+key+'.txt' | |
| 144 gfile = genofile+'_'+key+'.txt.tmp' | |
| 145 cmd = ['java', '-cp',jarfile,'jhi.flapjack.io.cmd.CreateProject','-A','-g',gfile,'-t',sfile,'-p',genofile+'.flapjack','-S',separator,'-M',missing,'-n',key,'-q',qtlfile,'-m',mapfile] | |
| 146 call(cmd) | |
| 147 | |
| 148 def createHeader(groups, samplefile, genofile, headerjar): | |
| 149 for key in groups: | |
| 150 sfile = samplefile+'_'+key+'.txt' | |
| 151 gfile = genofile+'_'+key+'.txt' | |
| 152 cmd = ['java','-jar',headerjar,sfile,gfile,gfile+'.tmp'] | |
| 153 call(cmd) | |
| 154 | |
| 155 def main(argv=None): | |
| 156 '''Command line options.''' | |
| 157 | |
| 158 program_name = os.path.basename(sys.argv[0]) | |
| 159 program_version = "v0.1" | |
| 160 program_build_date = "%s" % __updated__ | |
| 161 | |
| 162 program_version_string = '%%prog %s (%s)' % (program_version, program_build_date) | |
| 163 #program_usage = '''usage: spam two eggs''' # optional - will be autogenerated by optparse | |
| 164 program_longdesc = '''''' # optional - give further explanation about what the program does | |
| 165 program_license = "Copyright 2017 user_name (organization_name) \ | |
| 166 Licensed under the Apache License 2.0\nhttp://www.apache.org/licenses/LICENSE-2.0" | |
| 167 | |
| 168 if argv is None: | |
| 169 argv = sys.argv[1:] | |
| 170 try: | |
| 171 # setup option parser | |
| 172 parser = OptionParser(version=program_version_string, epilog=program_longdesc, description=program_license) | |
| 173 parser.add_option("-g", "--geno", dest="genofile", help="set input genotype file path [default: %default]", metavar="FILE") | |
| 174 parser.add_option("-s", "--sample", dest="samplefile", help="set input sample file path [default: %default]", metavar="FILE") | |
| 175 parser.add_option("-m", "--mapfile", dest="mapfile", help="set input map file path [default: %default]", metavar="FILE") | |
| 176 parser.add_option("-q", "--qtlfile", dest="qtlfile", help="set input QTL file path [default: %default]", metavar="FILE") | |
| 177 parser.add_option("-j", "--jar", dest="jarfile", help="set Flapjack project creator jar file path [default: %default]", metavar="FILE") | |
| 178 parser.add_option("-J", "--headerjar", dest="headerjar", help="set Flapjack header creator jar file path [default: %default]", metavar="FILE") | |
| 179 parser.add_option("-S", "--separator", dest="separator", help="declare separator for genotypes, '' for no separator [default: %default]") | |
| 180 parser.add_option("-M", "--missingGenotype", dest="missing", help="set missing genotype string [default: %default]") | |
| 181 parser.add_option("-v", "--verbose", dest="verbose", action="count", help="set verbosity level [default: %default]") | |
| 182 | |
| 183 # process options | |
| 184 (opts, args) = parser.parse_args(argv) | |
| 185 | |
| 186 if opts.verbose > 0: | |
| 187 print("verbosity level = %d" % opts.verbose) | |
| 188 if opts.genofile: | |
| 189 print("genofile = %s" % opts.genofile) | |
| 190 else: | |
| 191 sys.stderr.write("no genotype file detected!") | |
| 192 if opts.samplefile: | |
| 193 print("samplefile = %s" % opts.samplefile) | |
| 194 else: | |
| 195 sys.stderr.write("no sample file detected!") | |
| 196 if opts.mapfile: | |
| 197 print("mapfile = %s" % opts.mapfile) | |
| 198 else: | |
| 199 sys.stderr.write("no map file detected!") | |
| 200 if opts.qtlfile: | |
| 201 print("qtlfile = %s" % opts.qtlfile) | |
| 202 else: | |
| 203 sys.stderr.write("no QTL file detected!") | |
| 204 if opts.jarfile: | |
| 205 print("jarfile = %s" % opts.jarfile) | |
| 206 else: | |
| 207 sys.stderr.write("no Flapjack project creator jar file detected!") | |
| 208 if opts.headerjar: | |
| 209 print("headerjar = %s" % opts.headerjar) | |
| 210 else: | |
| 211 sys.stderr.write("no Flapjack header creator jar file detected!") | |
| 212 | |
| 213 # MAIN BODY # | |
| 214 splitData(samplefile=opts.samplefile, genofile=opts.genofile) | |
| 215 createHeader(groups=parents, samplefile=opts.samplefile, genofile=opts.genofile, headerjar=opts.headerjar) | |
| 216 createProjectFile(groups=parents, samplefile=opts.samplefile, genofile=opts.genofile, jarfile=opts.jarfile, separator=opts.separator, missing=opts.missing,qtlfile=opts.qtlfile,mapfile=opts.mapfile) | |
| 217 | |
| 218 | |
| 219 except Exception, e: | |
| 220 indent = len(program_name) * " " | |
| 221 sys.stderr.write(program_name + ": " + repr(e) + "\n") | |
| 222 sys.stderr.write(indent + " for help use --help") | |
| 223 return 2 | |
| 224 | |
| 225 | |
| 226 if __name__ == "__main__": | |
| 227 # if DEBUG: | |
| 228 # sys.argv.append("-h") | |
| 229 if TESTRUN: | |
| 230 import doctest | |
| 231 doctest.testmod() | |
| 232 if PROFILE: | |
| 233 import cProfile | |
| 234 import pstats | |
| 235 profile_filename = 'DNASampleSplitter_profile.txt' | |
| 236 cProfile.run('main()', profile_filename) | |
| 237 statsfile = open("profile_stats.txt", "wb") | |
| 238 p = pstats.Stats(profile_filename, stream=statsfile) | |
| 239 stats = p.strip_dirs().sort_stats('cumulative') | |
| 240 stats.print_stats() | |
| 241 statsfile.close() | |
| 242 sys.exit(0) | |
| 243 sys.exit(main()) |
