Mercurial > repos > bgruening > flexynesis
comparison flexynesis_utils.py @ 6:33816f44fc7d draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit 6b520305ec30e6dc37eba92c67a5368cea0fc5ad
author | bgruening |
---|---|
date | Wed, 23 Jul 2025 07:49:41 +0000 |
parents | 466b593fd87e |
children |
comparison
equal
deleted
inserted
replaced
5:466b593fd87e | 6:33816f44fc7d |
---|---|
161 except Exception as e: | 161 except Exception as e: |
162 print(f"Error saving {key}: {e}") | 162 print(f"Error saving {key}: {e}") |
163 continue | 163 continue |
164 | 164 |
165 | 165 |
166 def validate_numeric_column(df, column_names, require_integer=False): | |
167 """ Validate that a column(s) in the DataFrame contains numeric values. """ | |
168 if isinstance(column_names, str): | |
169 # Handle comma-separated string: "col1,col2,col3" | |
170 if ',' in column_names: | |
171 column_names = [col.strip() for col in column_names.split(',')] | |
172 else: | |
173 # Single column name | |
174 column_names = [column_names] | |
175 | |
176 # Validate each column | |
177 for column_name in column_names: | |
178 if column_name not in df.columns: | |
179 raise ValueError(f"Column '{column_name}' not found in DataFrame.") | |
180 | |
181 try: | |
182 numeric_col = pd.to_numeric(df[column_name], errors='raise') | |
183 except Exception as e: | |
184 raise ValueError(f"Non-numeric values found in column '{column_name}': {e}") | |
185 | |
186 if require_integer: | |
187 # Check if all non-null values are equivalent to integers | |
188 non_null_values = numeric_col.dropna() | |
189 if not (non_null_values == non_null_values.round()).all(): | |
190 raise ValueError(f"Column '{column_name}' contains non-integer numeric values.") | |
191 print(f"Column '{column_name}': All values are integers or integer-equivalent floats.") | |
192 else: | |
193 print(f"Column '{column_name}': All values are numeric (integers and floats accepted).") | |
194 | |
195 | |
196 def validate_survival(df, column_names): | |
197 """Validate survival column(s) (integer).""" | |
198 validate_numeric_column(df, column_names, require_integer=True) | |
199 | |
200 | |
201 def validate_covariate(df, column_names): | |
202 """Validate covariate column(s) (numeric).""" | |
203 validate_numeric_column(df, column_names, require_integer=False) | |
204 | |
205 | |
166 def main(): | 206 def main(): |
167 parser = argparse.ArgumentParser(description='Flexynesis extra utilities') | 207 parser = argparse.ArgumentParser(description='Flexynesis extra utilities') |
168 | 208 |
169 parser.add_argument("--util", type=str, required=True, | 209 parser.add_argument("--util", type=str, required=True, |
170 choices=['split', 'binarize'], | 210 choices=['split', 'binarize', 'validate_survival', 'validate_covariate'], |
171 help="Utility function: 'split' for spiting data to train and test, 'binarize' for creating a binarized matrix from a mutation data") | 211 help="Utility function: 'split' for spiting data to train and test, 'binarize' for creating a binarized matrix from a mutation data, 'validate_survival' for validating survival data.") |
172 | 212 |
173 # Arguments for split | 213 # Arguments for split (clin also for validate_survival and validate_covariate) |
174 parser.add_argument('--clin', required=False, | 214 parser.add_argument('--clin', required=False, |
175 help='Path to clinical data CSV file (samples in rows)') | 215 help='Path to clinical data CSV file (samples in rows)') |
176 parser.add_argument('--omics', required=False, | 216 parser.add_argument('--omics', required=False, |
177 help='Comma-separated list of omics CSV files (samples in columns)') | 217 help='Comma-separated list of omics CSV files (samples in columns)') |
178 parser.add_argument('--split', type=float, default=0.7, | 218 parser.add_argument('--split', type=float, default=0.7, |
184 parser.add_argument('--gene_idx', type=int, default=0, | 224 parser.add_argument('--gene_idx', type=int, default=0, |
185 help='Column index for genes in mutation data (default: 0)') | 225 help='Column index for genes in mutation data (default: 0)') |
186 parser.add_argument('--sample_idx', type=int, default=1, | 226 parser.add_argument('--sample_idx', type=int, default=1, |
187 help='Column index for samples in mutation data (default: 1)') | 227 help='Column index for samples in mutation data (default: 1)') |
188 | 228 |
189 # common arguments | 229 # Arguments for validate_survival and validate_covariate |
230 parser.add_argument('--clin_variable', type=str, required=False, | |
231 help='Column name for clinical variable (e.g., death, SEX, ...)') | |
232 | |
233 # common arguments (binarize and split) | |
190 parser.add_argument('--out', default='.', | 234 parser.add_argument('--out', default='.', |
191 help='Output directory (default: current directory)') | 235 help='Output directory (default: current directory)') |
192 | 236 |
193 args = parser.parse_args() | 237 args = parser.parse_args() |
194 | 238 |
195 try: | 239 try: |
196 # validate utility function | 240 # validate utility function |
197 if not args.util: | 241 if not args.util: |
198 raise ValueError("Utility function must be specified") | 242 raise ValueError("Utility function must be specified") |
199 if args.util not in ['split', 'binarize']: | 243 if args.util not in ['split', 'binarize', 'validate_survival', 'validate_covariate']: |
200 raise ValueError(f"Invalid utility function: {args.util}") | 244 raise ValueError(f"Invalid utility function: {args.util}") |
201 | 245 |
202 if args.util == 'split': | 246 if args.util == 'split': |
203 # Validate inputs | 247 # Validate inputs |
204 if not args.clin: | 248 if not args.clin: |
219 raise FileNotFoundError(f"Mutation data file not found: {args.mutations}") | 263 raise FileNotFoundError(f"Mutation data file not found: {args.mutations}") |
220 # Validate gene and sample indices | 264 # Validate gene and sample indices |
221 if args.gene_idx < 0 or args.sample_idx < 0: | 265 if args.gene_idx < 0 or args.sample_idx < 0: |
222 raise ValueError("Gene and sample indices must be non-negative integers") | 266 raise ValueError("Gene and sample indices must be non-negative integers") |
223 | 267 |
268 elif args.util == 'validate_survival' or args.util == 'validate_covariate': | |
269 # Validate clinical data file | |
270 if not args.clin: | |
271 raise ValueError("Clinical data file must be provided") | |
272 if not os.path.isfile(args.clin): | |
273 raise FileNotFoundError(f"Clinical file not found: {args.clin}") | |
274 # Validate survival event variable | |
275 if not args.clin_variable: | |
276 raise ValueError("Survival event variable must be specified") | |
277 | |
224 # Create output directory if it doesn't exist | 278 # Create output directory if it doesn't exist |
225 if not os.path.exists(args.out): | 279 if not os.path.exists(args.out): |
226 os.makedirs(args.out) | 280 os.makedirs(args.out) |
227 | 281 |
228 if args.util == 'split': | 282 if args.util == 'split': |
246 # Save binarized matrix | 300 # Save binarized matrix |
247 output_file = os.path.join(args.out, 'binarized_mutations.tabular') | 301 output_file = os.path.join(args.out, 'binarized_mutations.tabular') |
248 binarized_matrix.to_csv(output_file, sep='\t') | 302 binarized_matrix.to_csv(output_file, sep='\t') |
249 print(f"Binarized mutation matrix saved to {output_file}") | 303 print(f"Binarized mutation matrix saved to {output_file}") |
250 | 304 |
305 elif args.util == 'validate_survival': | |
306 clin_df = read_data(args.clin, index=False) | |
307 if clin_df.empty: | |
308 raise ValueError("Clinical data file is empty") | |
309 | |
310 # Validate survival event variable | |
311 validate_survival(clin_df, args.clin_variable) | |
312 | |
313 elif args.util == 'validate_covariate': | |
314 clin_df = read_data(args.clin, index=False) | |
315 if clin_df.empty: | |
316 raise ValueError("Clinical data file is empty") | |
317 | |
318 # Validate clinical variable | |
319 validate_covariate(clin_df, args.clin_variable) | |
320 | |
251 except Exception as e: | 321 except Exception as e: |
252 print(f"Error: {e}", file=sys.stderr) | 322 print(f"Error: {e}", file=sys.stderr) |
253 sys.exit(1) | 323 sys.exit(1) |
254 | 324 |
255 | 325 |