comparison flexynesis_utils.py @ 6:33816f44fc7d draft default tip

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit 6b520305ec30e6dc37eba92c67a5368cea0fc5ad
author bgruening
date Wed, 23 Jul 2025 07:49:41 +0000
parents 466b593fd87e
children
comparison
equal deleted inserted replaced
5:466b593fd87e 6:33816f44fc7d
161 except Exception as e: 161 except Exception as e:
162 print(f"Error saving {key}: {e}") 162 print(f"Error saving {key}: {e}")
163 continue 163 continue
164 164
165 165
def validate_numeric_column(df, column_names, require_integer=False):
    """Validate that one or more DataFrame columns contain only numeric values.

    Args:
        df: pandas DataFrame holding the columns to check.
        column_names: A single column name, a comma-separated string of
            column names ("col1,col2,col3"), or an iterable of column names.
            In a comma-separated string, whitespace around each name is
            ignored and empty entries (e.g. from a trailing comma) are
            dropped. A single name is used verbatim when it matches a column
            and is stripped of surrounding whitespace otherwise.
        require_integer: When True, every non-null value must additionally be
            a finite whole number (integer-equivalent floats such as 2.0 are
            accepted).

    Raises:
        ValueError: If a comma-separated string contains no column name, a
            column is missing from ``df``, a column holds values that cannot
            be converted to numbers, or ``require_integer`` is set and a
            value is not a finite whole number.
    """
    if isinstance(column_names, str):
        if ',' in column_names:
            # Handle comma-separated string: "col1,col2,col3"
            stripped = (part.strip() for part in column_names.split(','))
            column_names = [name for name in stripped if name]
            if not column_names:
                raise ValueError("No column names provided for validation.")
        elif column_names in df.columns:
            # Exact match wins so headers that really contain padding still work
            column_names = [column_names]
        else:
            # Tolerate accidental padding, as the comma-separated form does
            column_names = [column_names.strip()]

    # Validate each column
    for column_name in column_names:
        if column_name not in df.columns:
            raise ValueError(f"Column '{column_name}' not found in DataFrame.")

        try:
            numeric_col = pd.to_numeric(df[column_name], errors='raise')
        except Exception as e:
            # Broad on purpose: every conversion failure is reported to the
            # caller as a ValueError; the cause is chained for debugging.
            raise ValueError(f"Non-numeric values found in column '{column_name}': {e}") from e

        if not require_integer:
            print(f"Column '{column_name}': All values are numeric (integers and floats accepted).")
            continue

        # Check if all non-null values are equivalent to integers
        non_null_values = numeric_col.dropna()
        is_whole = non_null_values == non_null_values.round()
        # inf == round(inf), so infinities must be rejected explicitly
        is_finite = non_null_values.abs() != float('inf')
        if not (is_whole & is_finite).all():
            raise ValueError(f"Column '{column_name}' contains non-integer numeric values.")
        print(f"Column '{column_name}': All values are integers or integer-equivalent floats.")
194
195
def validate_survival(df, column_names):
    """Check that the survival column(s) of ``df`` hold integer-valued numbers.

    Thin wrapper that delegates to ``validate_numeric_column`` with the
    integer requirement switched on; any ``ValueError`` raised there
    propagates unchanged.
    """
    validate_numeric_column(
        df=df,
        column_names=column_names,
        require_integer=True,
    )
199
200
def validate_covariate(df, column_names):
    """Check that the covariate column(s) of ``df`` hold numeric values.

    Thin wrapper that delegates to ``validate_numeric_column`` without the
    integer requirement, so both integers and floats are accepted; any
    ``ValueError`` raised there propagates unchanged.
    """
    validate_numeric_column(
        df=df,
        column_names=column_names,
        require_integer=False,
    )
204
205
166 def main(): 206 def main():
167 parser = argparse.ArgumentParser(description='Flexynesis extra utilities') 207 parser = argparse.ArgumentParser(description='Flexynesis extra utilities')
168 208
169 parser.add_argument("--util", type=str, required=True, 209 parser.add_argument("--util", type=str, required=True,
170 choices=['split', 'binarize'], 210 choices=['split', 'binarize', 'validate_survival', 'validate_covariate'],
171 help="Utility function: 'split' for spiting data to train and test, 'binarize' for creating a binarized matrix from a mutation data") 211 help="Utility function: 'split' for spiting data to train and test, 'binarize' for creating a binarized matrix from a mutation data, 'validate_survival' for validating survival data.")
172 212
173 # Arguments for split 213 # Arguments for split (clin also for validate_survival and validate_covariate)
174 parser.add_argument('--clin', required=False, 214 parser.add_argument('--clin', required=False,
175 help='Path to clinical data CSV file (samples in rows)') 215 help='Path to clinical data CSV file (samples in rows)')
176 parser.add_argument('--omics', required=False, 216 parser.add_argument('--omics', required=False,
177 help='Comma-separated list of omics CSV files (samples in columns)') 217 help='Comma-separated list of omics CSV files (samples in columns)')
178 parser.add_argument('--split', type=float, default=0.7, 218 parser.add_argument('--split', type=float, default=0.7,
184 parser.add_argument('--gene_idx', type=int, default=0, 224 parser.add_argument('--gene_idx', type=int, default=0,
185 help='Column index for genes in mutation data (default: 0)') 225 help='Column index for genes in mutation data (default: 0)')
186 parser.add_argument('--sample_idx', type=int, default=1, 226 parser.add_argument('--sample_idx', type=int, default=1,
187 help='Column index for samples in mutation data (default: 1)') 227 help='Column index for samples in mutation data (default: 1)')
188 228
189 # common arguments 229 # Arguments for validate_survival and validate_covariate
230 parser.add_argument('--clin_variable', type=str, required=False,
231 help='Column name for clinical variable (e.g., death, SEX, ...)')
232
233 # common arguments (binarize and split)
190 parser.add_argument('--out', default='.', 234 parser.add_argument('--out', default='.',
191 help='Output directory (default: current directory)') 235 help='Output directory (default: current directory)')
192 236
193 args = parser.parse_args() 237 args = parser.parse_args()
194 238
195 try: 239 try:
196 # validate utility function 240 # validate utility function
197 if not args.util: 241 if not args.util:
198 raise ValueError("Utility function must be specified") 242 raise ValueError("Utility function must be specified")
199 if args.util not in ['split', 'binarize']: 243 if args.util not in ['split', 'binarize', 'validate_survival', 'validate_covariate']:
200 raise ValueError(f"Invalid utility function: {args.util}") 244 raise ValueError(f"Invalid utility function: {args.util}")
201 245
202 if args.util == 'split': 246 if args.util == 'split':
203 # Validate inputs 247 # Validate inputs
204 if not args.clin: 248 if not args.clin:
219 raise FileNotFoundError(f"Mutation data file not found: {args.mutations}") 263 raise FileNotFoundError(f"Mutation data file not found: {args.mutations}")
220 # Validate gene and sample indices 264 # Validate gene and sample indices
221 if args.gene_idx < 0 or args.sample_idx < 0: 265 if args.gene_idx < 0 or args.sample_idx < 0:
222 raise ValueError("Gene and sample indices must be non-negative integers") 266 raise ValueError("Gene and sample indices must be non-negative integers")
223 267
268 elif args.util == 'validate_survival' or args.util == 'validate_covariate':
269 # Validate clinical data file
270 if not args.clin:
271 raise ValueError("Clinical data file must be provided")
272 if not os.path.isfile(args.clin):
273 raise FileNotFoundError(f"Clinical file not found: {args.clin}")
274 # Validate survival event variable
275 if not args.clin_variable:
276 raise ValueError("Survival event variable must be specified")
277
224 # Create output directory if it doesn't exist 278 # Create output directory if it doesn't exist
225 if not os.path.exists(args.out): 279 if not os.path.exists(args.out):
226 os.makedirs(args.out) 280 os.makedirs(args.out)
227 281
228 if args.util == 'split': 282 if args.util == 'split':
246 # Save binarized matrix 300 # Save binarized matrix
247 output_file = os.path.join(args.out, 'binarized_mutations.tabular') 301 output_file = os.path.join(args.out, 'binarized_mutations.tabular')
248 binarized_matrix.to_csv(output_file, sep='\t') 302 binarized_matrix.to_csv(output_file, sep='\t')
249 print(f"Binarized mutation matrix saved to {output_file}") 303 print(f"Binarized mutation matrix saved to {output_file}")
250 304
305 elif args.util == 'validate_survival':
306 clin_df = read_data(args.clin, index=False)
307 if clin_df.empty:
308 raise ValueError("Clinical data file is empty")
309
310 # Validate survival event variable
311 validate_survival(clin_df, args.clin_variable)
312
313 elif args.util == 'validate_covariate':
314 clin_df = read_data(args.clin, index=False)
315 if clin_df.empty:
316 raise ValueError("Clinical data file is empty")
317
318 # Validate clinical variable
319 validate_covariate(clin_df, args.clin_variable)
320
251 except Exception as e: 321 except Exception as e:
252 print(f"Error: {e}", file=sys.stderr) 322 print(f"Error: {e}", file=sys.stderr)
253 sys.exit(1) 323 sys.exit(1)
254 324
255 325