Mercurial > repos > tomnl > dma_filelist_generation
comparison dma-filelist-generation.py @ 0:6a2bb42acfe4 draft
planemo upload for repository https://github.com/computational-metabolomics/dma-tools-galaxy commit 6c48bd51987a28401de6cf5e49b1b30e5e73fe16-dirty
| author | tomnl |
|---|---|
| date | Tue, 27 Mar 2018 06:53:36 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:6a2bb42acfe4 |
|---|---|
| 1 import argparse | |
| 2 import textwrap | |
| 3 import os | |
| 4 import re | |
| 5 import collections | |
| 6 import csv | |
| 7 from operator import itemgetter | |
| 8 | |
def check_folder(in_dir, reg_string):
    """Describe every file in ``in_dir`` whose name matches ``reg_string``.

    The file name (not the directory part) is inspected case-insensitively:

    * the text before the first underscore of the extension-less name is
      taken as the well identifier;
    * the name must contain ``pos`` or ``neg``, which sets the polarity;
    * a name containing ``blank`` is classed as ``"blank"``, anything else
      as ``"sample"``.

    Args:
        in_dir: Directory to scan (not recursive).
        reg_string: Regular expression a file name must match (anchored at
            the start, as with ``re.match``) to be considered.

    Returns:
        On success, a list with one
        ``[well, sample_type, polarity, filename, fullpath]`` entry per
        matching file, ordered by file name. ``filename`` has no extension;
        ``fullpath`` is ``in_dir`` joined with the original file name.

        If a matching file has neither ``pos`` nor ``neg`` in its name, the
        message is printed and the tuple ``(1, error_message, "", "")`` is
        returned instead of a list. Callers must check for this.
    """
    filelist = []

    # os.listdir() yields names in arbitrary order; sorting makes the result
    # (and any file list generated from it) reproducible between runs.
    for f in sorted(os.listdir(in_dir)):
        if not re.match(reg_string, f, re.IGNORECASE):
            continue

        filename = os.path.splitext(f)[0]
        wells = filename.split("_")[0]
        fullpth = os.path.join(in_dir, f)

        if re.match("^.*pos.*", f, re.IGNORECASE):
            polarity = "pos"
        elif re.match("^.*neg.*", f, re.IGNORECASE):
            polarity = "neg"
        else:
            error_message = "Files need to have either 'pos' or 'neg' in the file name"
            # Single-argument parenthesised form runs on Python 2 and 3.
            print(error_message)
            return 1, error_message, "", ""

        if re.match("^.*blank.*", f, re.IGNORECASE):
            sample_type = "blank"
        else:
            sample_type = "sample"

        filelist.append([wells, sample_type, polarity, filename, fullpth])

    return filelist
| 40 | |
def get_filelist(filesin, o, file_type='mzML', create_filelist=True):
    """Check that every well has one blank and one sample file; optionally write a file list.

    Args:
        filesin: A directory path, or a list of directory paths, to scan
            with ``check_folder``.
        o: Output directory handed to ``write_filedict``.
        file_type: ``'mzML'`` selects files ending in ``mzML``; any other
            value selects files ending in ``raw``.
        create_filelist: When true and the check passes, write the file
            list via ``write_filedict``.

    Returns:
        A 4-tuple ``(status, message, filedict, filelist)``.

        On success ``status`` is ``0`` and ``message`` is ``"files OK"``.
        ``filedict`` is an ``OrderedDict`` mapping each well to a list of
        ``[sample_type, polarity, filename, fullpath]`` entries, with wells
        in order of first appearance. ``filelist`` is the flat list of
        ``[well, sample_type, polarity, filename, fullpath]`` entries
        sorted by well and then sample type.

        On failure ``status`` is ``1``, ``message`` describes the problem,
        and the last two items are empty strings.
    """
    if file_type == "mzML":
        reg_string = "^.*mzML$"
    else:
        reg_string = "^.*raw$"

    folders = filesin if isinstance(filesin, list) else [filesin]

    filelist = []
    for folder in folders:
        found = check_folder(folder, reg_string)
        # check_folder reports a badly named file with a
        # (1, message, "", "") tuple rather than a list. It has the same
        # shape as this function's own error result, so hand it straight
        # back instead of mixing its items into the file list.
        if isinstance(found, tuple):
            return found
        filelist.extend(found)

    # Group entries by well in one pass; setdefault keeps wells in order of
    # first appearance and entries in scan order.
    filedict = collections.OrderedDict()
    for entry in filelist:
        filedict.setdefault(entry[0], []).append(entry[1:])

    filelist = sorted(filelist, key=itemgetter(0, 1))

    # Each well must hold exactly one blank and one sample.
    for well_files in filedict.values():
        classes = sorted(i[0] for i in well_files)
        if classes != ['blank', 'sample']:
            error_message = "!!!!ERROR!!!! Blank and sample required for each well, file type {}".format(file_type)
            print(error_message)
            return 1, error_message, "", ""

    print('files of type {} checked, files OK'.format(file_type))

    if create_filelist:
        outname = write_filedict(filedict, o, file_type)
        print('filelist created in folder {}, using file type {}, full path {}'.format(o, file_type, outname))

    return 0, "files OK", filedict, filelist
| 86 | |
def write_filedict(filedict, out_dir, file_type, file_spacing='tsv'):
    """Write the per-well file list as a delimited text file.

    The file is named ``filelist_<file_type>.<file_spacing>`` inside
    ``out_dir``. It has the header
    ``filename, classLabel, multilist, multilistLabel`` followed by one row
    per file: the base name of the file's full path, its sample type, the
    1-based position of its well in ``filedict``, and the well itself.

    Args:
        filedict: Mapping of well to a list of entries whose first item is
            the sample type and whose fourth item is the file's full path.
            Iteration order determines the ``multilist`` numbering.
        out_dir: Existing directory to write into.
        file_type: Used only to build the output file name.
        file_spacing: ``'tsv'`` writes tab-separated values; any other
            value writes comma-separated values. Also used as the file
            extension.

    Returns:
        The path of the file that was written.
    """
    # Local import: sys is not among the module-level imports.
    import sys

    outname = os.path.join(out_dir, 'filelist_{}.{}'.format(file_type, file_spacing))
    delim = '\t' if file_spacing == 'tsv' else ','

    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3; both produce the same bytes on disk.
    if sys.version_info[0] < 3:
        csvfile = open(outname, 'wb')
    else:
        csvfile = open(outname, 'w', newline='')

    with csvfile:
        w = csv.writer(csvfile, delimiter=delim)
        w.writerow(['filename', 'classLabel', 'multilist', 'multilistLabel'])
        for c, (k, v) in enumerate(filedict.items(), 1):
            for i in v:
                w.writerow([os.path.basename(i[3]), i[0], c, k])

    return outname
| 110 | |
| 111 | |
def main():
    """Command-line entry point: check the input folders and optionally write file lists.

    Parses the command line, creates the output directory if it does not
    exist, then runs ``get_filelist`` for each requested file type
    (``--check_mzml`` and/or ``--check_raw``).

    Returns:
        ``0`` when every requested check passed, otherwise the non-zero
        status of the first check that failed.

    Raises:
        SystemExit: Raised by argparse (status 2) when the arguments are
            invalid, including when neither ``--check_mzml`` nor
            ``--check_raw`` is given.
    """
    p = argparse.ArgumentParser(
        prog='PROG',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description='''Create filelist for DMA DIMS nearline workflow''',
        epilog=textwrap.dedent('''
            -------------------------------------------------------------------------

            Example Usage

            python dma-filelist-generation.py -i [dir with sample files], [dir with blank files] -o .

            '''))

    p.add_argument('-i', dest='i', help='dir with sample files', nargs='*', required=True)
    p.add_argument('-o', dest='o', help='out dir', required=True)
    p.add_argument('--check_mzml', dest='check_mzml', action='store_true')
    p.add_argument('--check_raw', dest='check_raw', action='store_true')
    p.add_argument('--create_filelist_mzml', dest='create_filelist_mzml', action='store_true')
    p.add_argument('--create_filelist_raw', dest='create_filelist_raw', action='store_true')

    args = p.parse_args()

    # Reject an unusable command line before touching the file system, and
    # report it as a usage error (stderr, non-zero exit) so calling tools
    # can detect the failure.
    if not args.check_mzml and not args.check_raw:
        p.error('--check_mzml or --check_raw (or both) are required as inputs')

    if not os.path.exists(args.o):
        os.makedirs(args.o)
    print(args.o)

    # get_filelist returns (status, message, filedict, filelist); keep the
    # first non-zero status so a failed check is not silently dropped.
    status = 0

    if args.check_mzml:
        result = get_filelist(args.i, args.o, file_type='mzML',
                              create_filelist=args.create_filelist_mzml)
        status = status or result[0]

    if args.check_raw:
        result = get_filelist(args.i, args.o, file_type='raw',
                              create_filelist=args.create_filelist_raw)
        status = status or result[0]

    return status


if __name__ == '__main__':
    # Local import: sys is not among the module-level imports.
    import sys

    # Propagate main()'s status as the process exit code.
    sys.exit(main())
| 154 |
