Mercurial > repos > tomnl > deconrank
view dma-filelist-generation.py @ 19:aa7b5aac31d8 draft default tip
planemo upload for repository https://github.com/computational-metabolomics/dma-tools-galaxy commit 14435cfc042911bf1ee409f5a0d5ef908f5feec0-dirty
author | tomnl |
---|---|
date | Thu, 21 Jun 2018 08:46:19 -0400 |
parents | defa57c7775e |
children |
line wrap: on
line source
# Create and validate file lists for the DMA DIMS nearline workflow.
#
# Every well must be represented by exactly one "blank" file and one "sample"
# file of the requested type (mzML or raw).  The code below runs unchanged on
# Python 2 and Python 3.
import argparse
import textwrap
import os
import re
import sys
import collections
import csv
from operator import itemgetter


def check_folder(in_dir, reg_string):
    """Collect the files in ``in_dir`` whose names match ``reg_string``.

    Args:
        in_dir: Directory to scan (not recursive).
        reg_string: Regular expression matched, case-insensitively, against
            each file name.

    Returns:
        On success, a list with one entry per matching file:
        ``[well, sample_type, polarity, filename, fullpath]`` where

        * ``well`` is the part of the file stem before the first underscore,
        * ``sample_type`` is ``"blank"`` if the name contains "blank"
          (any case), otherwise ``"sample"``,
        * ``polarity`` is ``"pos"`` or ``"neg"``,
        * ``filename`` is the file name without its extension,
        * ``fullpath`` is ``in_dir`` joined with the file name.

        If a matching file has neither "pos" nor "neg" in its name, the
        message is printed and the tuple ``(1, error_message, "", "")`` is
        returned instead of a list.
    """
    filelist = []
    for f in os.listdir(in_dir):
        if not re.match(reg_string, f, re.IGNORECASE):
            continue

        filename = os.path.splitext(f)[0]
        wells = filename.split("_")[0]
        fullpth = os.path.join(in_dir, f)

        # "pos" is tested first, so a name containing both counts as "pos".
        if re.match("^.*pos.*", f, re.IGNORECASE):
            polarity = "pos"
        elif re.match("^.*neg.*", f, re.IGNORECASE):
            polarity = "neg"
        else:
            error_message = "Files need to have either 'pos' or 'neg' in the file name"
            print(error_message)
            return 1, error_message, "", ""

        if re.match("^.*blank.*", f, re.IGNORECASE):
            sample_type = "blank"
        else:
            sample_type = "sample"

        filelist.append([wells, sample_type, polarity, filename, fullpth])

    return filelist


def get_filelist(filesin, o, file_type='mzML', create_filelist=True):
    """Validate the input folders and optionally write a file list.

    Args:
        filesin: A directory path, or a list of directory paths, to scan.
        o: Output directory for the file list (only used when
            ``create_filelist`` is true).
        file_type: ``"mzML"`` selects names ending in "mzML"; any other value
            selects names ending in "raw".
        create_filelist: When true, write the file list via
            ``write_filedict``.

    Returns:
        ``(0, "files OK", filedict, filelist)`` on success, where ``filedict``
        is an ``OrderedDict`` mapping each well to a list of
        ``[sample_type, polarity, filename, fullpath]`` entries and
        ``filelist`` is the flat list of entries sorted by well and sample
        type.  ``(1, error_message, "", "")`` if a file lacks a polarity or a
        well does not have exactly one blank and one sample.
    """
    if file_type == "mzML":
        reg_string = "^.*mzML$"
    else:
        reg_string = "^.*raw$"

    folders = filesin if isinstance(filesin, list) else [filesin]

    filelist = []
    for folder in folders:
        found = check_folder(folder, reg_string)
        # check_folder signals failure with an error tuple rather than a
        # list; pass it straight back instead of treating it as file entries.
        if isinstance(found, tuple):
            return found
        filelist.extend(found)

    # Group the entries by well, keeping first-seen well order and the
    # original order of the files within each well.
    filedict = collections.OrderedDict()
    for entry in filelist:
        filedict.setdefault(entry[0], []).append(entry[1:])

    filelist = sorted(filelist, key=itemgetter(0, 1))

    for well_files in filedict.values():
        classes = sorted(i[0] for i in well_files)
        if classes != ['blank', 'sample']:
            error_message = "!!!!ERROR!!!! Blank and sample required for each well, file type {}".format(file_type)
            print(error_message)
            return 1, error_message, "", ""

    print('files of type {} checked, files OK'.format(file_type))

    if create_filelist:
        outname = write_filedict(filedict, o, file_type)
        print('filelist created in folder {}, using file type {}, full path {}'.format(o, file_type, outname))

    return 0, "files OK", filedict, filelist


def write_filedict(filedict, out_dir, file_type, file_spacing='tsv'):
    """Write ``filedict`` as a delimited file list and return its path.

    Args:
        filedict: Mapping of well -> list of
            ``[sample_type, polarity, filename, fullpath]`` entries.
        out_dir: Directory in which the file list is created.
        file_type: Used in the output name ``filelist_<file_type>.<ext>``.
        file_spacing: ``"tsv"`` gives a tab delimiter; any other value gives
            a comma.  The value is also used as the file extension.

    Returns:
        The path of the file that was written.
    """
    outname = os.path.join(out_dir, 'filelist_{}.{}'.format(file_type, file_spacing))

    delim = '\t' if file_spacing == 'tsv' else ','

    # The csv module wants a binary file on Python 2 and a text file opened
    # with newline='' on Python 3; both yield the same bytes on disk.
    if sys.version_info[0] < 3:
        csvfile = open(outname, 'wb')
    else:
        csvfile = open(outname, 'w', newline='')

    with csvfile:
        w = csv.writer(csvfile, delimiter=delim)
        w.writerow(['filename', 'classLabel', 'multilist', 'multilistLabel'])
        # "multilist" is a 1-based index shared by all files of one well.
        for c, (k, v) in enumerate(filedict.items(), 1):
            for i in v:
                w.writerow([os.path.basename(i[3]), i[0], c, k])

    return outname


def main():
    """Command-line entry point.

    Returns:
        0 when every requested check passed, 1 when at least one failed.
        A missing ``--check_mzml``/``--check_raw`` flag is reported through
        ``argparse`` (usage message on stderr, exit status 2).
    """
    p = argparse.ArgumentParser(
        prog='PROG',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description='''Create filelist for DMA DIMS nearline workflow''',
        epilog=textwrap.dedent('''
        -------------------------------------------------------------------------

        Example Usage

        python dma-filelist-generation.py -i [dir with sample files], [dir with blank files] -o .

        '''))

    p.add_argument('-i', dest='i', help='dir with sample files', nargs='*', required=True)
    p.add_argument('-o', dest='o', help='out dir', required=True)
    p.add_argument('--check_mzml', dest='check_mzml', action='store_true')
    p.add_argument('--check_raw', dest='check_raw', action='store_true')
    p.add_argument('--create_filelist_mzml', dest='create_filelist_mzml', action='store_true')
    p.add_argument('--create_filelist_raw', dest='create_filelist_raw', action='store_true')

    args = p.parse_args()

    if not os.path.exists(args.o):
        os.makedirs(args.o)
    print(args.o)

    if not args.check_mzml and not args.check_raw:
        # p.error prints the usage plus this message and exits non-zero.
        p.error('--check_mzml or --check_raw (or both) are required as inputs')

    status = 0
    if args.check_mzml:
        status |= get_filelist(args.i, args.o, file_type='mzML',
                               create_filelist=args.create_filelist_mzml)[0]
    if args.check_raw:
        status |= get_filelist(args.i, args.o, file_type='raw',
                               create_filelist=args.create_filelist_raw)[0]
    return status


if __name__ == '__main__':
    sys.exit(main())