Mercurial > repos > tomnl > dma_filelist_generation
diff msp_split.py @ 8:8de790489301 draft
planemo upload for repository https://github.com/computational-metabolomics/dma-tools-galaxy commit af689d3f20c86f69aa824545e668280bcd5e0cca
author | tomnl |
---|---|
date | Mon, 14 May 2018 08:15:07 -0400 |
parents | |
children |
line wrap: on
line diff
# msp_split.py: split one MSP spectral library file into n smaller MSP files.
#
# Recovered from a Mercurial changeset view of a newly added file. Original
# diff header, kept for reference:
#   --- /dev/null Thu Jan 01 00:00:00 1970 +0000
#   +++ b/msp_split.py Mon May 14 08:15:07 2018 -0400
#   @@ -0,0 +1,73 @@
# The leading "+" diff markers have been removed so this is plain Python.
from __future__ import print_function

import argparse
import csv
import math
import os
import re
import textwrap

# A spectrum record starts at a line beginning with "NAME:" (any letter case).
# The same pattern is used both for counting and for splitting so the two
# passes over the input can never disagree about what a spectrum is.
_NAME_RE = re.compile(r'^NAME:', re.IGNORECASE)


def _count_spectra(fname):
    """Return the number of lines in *fname* that start a spectrum record."""
    with open(fname, 'r') as fin:
        return sum(1 for line in fin if _NAME_RE.match(line))


def msp_split(i, o, n):
    """Split the MSP file *i* into *n* files inside directory *o*.

    Output files are named ``file<k>.msp`` where ``k`` runs from 1 to *n* and
    is zero-padded to the number of digits in *n*. Each file receives at most
    ``ceil(total_spectra / n)`` spectra, filled in input order; when there are
    fewer spectra than files the trailing files are created empty. Lines that
    precede the first ``NAME:`` line are written to the first file.

    Args:
        i: path of the input MSP file.
        o: existing directory in which the output files are created.
        n: number of output files; must be a positive integer.

    Returns:
        The list of output file paths, in creation order.

    Raises:
        ValueError: if *n* is smaller than 1.
    """
    if n < 1:
        raise ValueError('n must be a positive integer, got {!r}'.format(n))

    spec_total = _count_spectra(i)
    spec_lim = math.ceil(spec_total / float(n))
    print('spec_lim', spec_lim)

    width = len(str(n))
    filelist = []
    # First line of the spectrum that triggered a roll-over; it belongs to
    # the next output file and is written there before reading continues.
    header = ''
    with open(i, 'r') as msp_in:
        for idx in range(1, n + 1):
            path = os.path.join(o, 'file{}.msp'.format(str(idx).zfill(width)))
            filelist.append(path)
            with open(path, 'w+') as msp_out:
                spec_c = 0
                if header:
                    msp_out.write(header)
                    header = ''
                    spec_c = 1
                # Iteration resumes where the previous output file stopped.
                for line in msp_in:
                    if _NAME_RE.match(line):
                        # Never roll over from the last file: anything left
                        # in the input must end up somewhere.
                        if spec_c >= spec_lim and idx < n:
                            header = line
                            break
                        spec_c += 1
                    msp_out.write(line)

    return filelist


def lcount(keyword, fname):
    """Return how many lines of the file *fname* contain *keyword*."""
    with open(fname, 'r') as fin:
        return sum(1 for line in fin if keyword in line)


def main():
    """Parse the command line and split the given MSP file."""
    p = argparse.ArgumentParser(
        prog='PROG',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description='''Split an MSP file into n smaller MSP files''',
        epilog=textwrap.dedent('''
        -------------------------------------------------------------------------

        Example Usage

        python msp_split.py -i library.msp -o out_dir -n 4

        '''))

    p.add_argument('-i', dest='i', help='input MSP file', required=True)
    p.add_argument('-o', dest='o', help='out dir', required=True)
    # Required and typed so a missing or non-numeric value is reported by
    # argparse instead of failing later inside int().
    p.add_argument('-n', dest='n', type=int, required=True,
                   help='number of output files')

    args = p.parse_args()

    if not os.path.exists(args.o):
        os.makedirs(args.o)
    print('in file', args.i)
    print('out dir', args.o)
    print('nm files', args.n)

    msp_split(args.i, args.o, args.n)


if __name__ == '__main__':
    main()