bgruening / statistical_hypothesis_testing: statistical_hypothesis_testing.py @ 0:22ed769665b6 (draft, default, tip)
Uploaded
author: bgruening
date: Sun, 01 Feb 2015 18:35:40 -0500
parents: (none)
children: (none)
#!/usr/bin/env python
"""
Galaxy tool: apply a statistical test or transformation from scipy.stats,
selected via --test_id, to column sets of a tabular file, and append the
results as new columns to every input line.
"""
import argparse

from scipy import stats


def columns_to_values(column_lists, line):
    # Resolve every group of 1-based column indices to its (integer) values on this line.
    samples = []
    cols = line.split('\t')
    for column_list in column_lists:
        sample_list = []
        for column in column_list:
            sample_list.append(cols[column - 1])
        samples.append(map(int, sample_list))
    return samples


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--infile', required=True, help='Tabular file.')
    parser.add_argument('-o', '--outfile', required=True, help='Path to the output file.')
    parser.add_argument("--sample_one_cols", help="Comma-separated column indices of the first sample.")
    parser.add_argument("--sample_two_cols", help="Comma-separated column indices of the second sample.")
    parser.add_argument("--sample_cols", help="Column indices of several samples; separate groups with ';'.")
    parser.add_argument("--test_id", help="Statistical test method.")
    parser.add_argument("--mwu_use_continuity", action="store_true", default=False,
                        help="Whether a continuity correction (1/2.) should be taken into account.")
    parser.add_argument("--equal_var", action="store_true", default=False,
                        help="If set, perform a standard independent two-sample test that assumes equal "
                             "population variances. If not set, perform Welch's t-test, which does not "
                             "assume equal population variance.")
    parser.add_argument("--reta", action="store_true", default=False,
                        help="Whether or not to return the internally computed a values.")
    parser.add_argument("--fisher", action="store_true", default=False,
                        help="If set, Fisher's definition of kurtosis is used.")
    parser.add_argument("--bias", action="store_true", default=False,
                        help="If not set, the calculations are corrected for statistical bias.")
    parser.add_argument("--inclusive1", action="store_true", default=False,
                        help="If not set, lower_limit will be ignored.")
    parser.add_argument("--inclusive2", action="store_true", default=False,
                        help="If not set, higher_limit will be ignored.")
    parser.add_argument("--inclusive", action="store_true", default=False,
                        help="If not set, limit will be ignored.")
    parser.add_argument("--printextras", action="store_true", default=False,
                        help="If set, a warning is raised when there are extra points, saying how many there are.")
    parser.add_argument("--initial_lexsort", action="store_true", default=False,
                        help="Whether to use lexsort or quicksort as the sorting method for the initial sort of the inputs.")
    parser.add_argument("--correction", action="store_true", default=False, help="Continuity correction.")
    parser.add_argument("--axis", type=int, default=0,
                        help="Axis can equal None (ravel array first), or an integer (the axis over which to operate on a and b).")
    parser.add_argument("--n", type=int, default=0,
                        help="The number of trials. This is ignored if x gives both the number of successes and failures.")
    parser.add_argument("--b", type=int, default=0, help="The number of bins to use for the histogram.")
    parser.add_argument("--N", type=int, default=0, help="Sample size if cdf is a string (kstest).")
    parser.add_argument("--ddof", type=int, default=0, help="Degrees of freedom correction.")
    parser.add_argument("--score", type=int, default=0, help="Score that is compared to the elements in a.")
    parser.add_argument("--m", type=float, default=0.0, help="Lower bound factor of sigma clipping.")
    parser.add_argument("--mf", type=float, default=2.0, help="Lower limit.")
    parser.add_argument("--nf", type=float, default=99.9, help="Higher limit.")
    parser.add_argument("--p", type=float, default=0.5,
                        help="The hypothesized probability of success, 0 <= p <= 1. The default value is p = 0.5.")
    parser.add_argument("--alpha", type=float, default=0.9, help="Probability.")
    parser.add_argument("--new", type=float, default=0.0,
                        help="Value to put in place of values in a outside of bounds.")
    parser.add_argument("--proportiontocut", type=float, default=0.0,
                        help="Proportion (in range 0-1) of total data set to trim off each end.")
    parser.add_argument("--lambda_", type=float, default=1.0,
                        help="lambda_ gives the power in the Cressie-Read power divergence statistic.")
    parser.add_argument("--imbda", type=float, default=0,
                        help="If lmbda is not None, do the transformation for that value. If lmbda is None, "
                             "find the lambda that maximizes the log-likelihood function and return it as the "
                             "second output argument.")
    parser.add_argument("--base", type=float, default=1.6, help="The logarithmic base to use.")
    parser.add_argument("--dtype", help="dtype")
    parser.add_argument("--med", help="med")
    parser.add_argument("--cdf", help="cdf")
    parser.add_argument("--zero_method", help="zero_method options")
    parser.add_argument("--dist", help="dist options")
    parser.add_argument("--ties", help="ties options")
    parser.add_argument("--alternative", help="alternative options")
    parser.add_argument("--mode", help="mode options")
    parser.add_argument("--method", help="method options")
    parser.add_argument("--md", help="md options")
    parser.add_argument("--center", help="center options")
    parser.add_argument("--kind", help="kind options")
    parser.add_argument("--tail", help="tail options")
    parser.add_argument("--interpolation", help="interpolation options")
    parser.add_argument("--statistic", help="statistic options")

    args = parser.parse_args()
    infile = args.infile
    outfile = open(args.outfile, 'w+')
    test_id = args.test_id
    nf = args.nf
    mf = args.mf
    imbda = args.imbda
    inclusive1 = args.inclusive1
    inclusive2 = args.inclusive2
    sample0 = 0
    sample1 = 0
    sample2 = 0
    if args.sample_cols is not None:
        sample0 = 1
        barlett_samples = []
        for sample in args.sample_cols.split(';'):
            barlett_samples.append(map(int, sample.split(',')))
    if args.sample_one_cols is not None:
        sample1 = 1
        sample_one_cols = args.sample_one_cols.split(',')
    if args.sample_two_cols is not None:
        sample_two_cols = args.sample_two_cols.split(',')
        sample2 = 1

    for line in open(infile):
        sample_one = []
        sample_two = []
        cols = line.strip().split('\t')
        if sample0 == 1:
            b_samples = columns_to_values(barlett_samples, line)
        if sample1 == 1:
            for index in sample_one_cols:
                sample_one.append(cols[int(index) - 1])
        if sample2 == 1:
            for index in sample_two_cols:
                sample_two.append(cols[int(index) - 1])

        # One-sample tests and descriptive statistics.
        if test_id.strip() == 'describe':
            size, min_max, mean, uv, bs, bk = stats.describe(map(float, sample_one))
            cols.append(size)
            cols.append(min_max)
            cols.append(mean)
            cols.append(uv)
            cols.append(bs)
            cols.append(bk)
        elif test_id.strip() == 'mode':
            vals, counts = stats.mode(map(float, sample_one))
            cols.append(vals)
            cols.append(counts)
        elif test_id.strip() == 'nanmean':
            m = stats.nanmean(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == 'nanmedian':
            m = stats.nanmedian(map(float, sample_one))
            cols.append(m)
        elif test_id.strip() == 'kurtosistest':
            z_value, p_value = stats.kurtosistest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == 'variation':
            ra = stats.variation(map(float, sample_one))
            cols.append(ra)
        elif test_id.strip() == 'itemfreq':
            freq = stats.itemfreq(map(float, sample_one))
            for row in freq:
                elements = ','.join(map(str, row))
                cols.append(elements)
        elif test_id.strip() == 'boxcox_llf':
            llf = stats.boxcox_llf(imbda, map(float, sample_one))
            cols.append(llf)
        elif test_id.strip() == 'tiecorrect':
            fa = stats.tiecorrect(map(float, sample_one))
            cols.append(fa)
        elif test_id.strip() == 'rankdata':
            r = stats.rankdata(map(float, sample_one), method=args.md)
            cols.append(r)
        elif test_id.strip() == 'nanstd':
            s = stats.nanstd(map(float, sample_one), bias=args.bias)
            cols.append(s)
        elif test_id.strip() == 'anderson':
            A2, critical, sig = stats.anderson(map(float, sample_one), dist=args.dist)
            cols.append(A2)
            for value in critical:
                cols.append(value)
            cols.append(',')
            for value in sig:
                cols.append(value)
        elif test_id.strip() == 'binom_test':
            p_value = stats.binom_test(map(float, sample_one), n=args.n, p=args.p)
            cols.append(p_value)
        elif test_id.strip() == 'gmean':
            gm = stats.gmean(map(float, sample_one), dtype=args.dtype)
            cols.append(gm)
        elif test_id.strip() == 'hmean':
            hm = stats.hmean(map(float, sample_one), dtype=args.dtype)
            cols.append(hm)
        elif test_id.strip() == 'kurtosis':
            k = stats.kurtosis(map(float, sample_one), axis=args.axis,
                               fisher=args.fisher, bias=args.bias)
            cols.append(k)
        elif test_id.strip() == 'moment':
            n_moment = stats.moment(map(float, sample_one), n=args.n)
            cols.append(n_moment)
        elif test_id.strip() == 'normaltest':
            k2, p_value = stats.normaltest(map(float, sample_one))
            cols.append(k2)
            cols.append(p_value)
        elif test_id.strip() == 'skew':
            skewness = stats.skew(map(float, sample_one), bias=args.bias)
            cols.append(skewness)
        elif test_id.strip() == 'skewtest':
            z_value, p_value = stats.skewtest(map(float, sample_one))
            cols.append(z_value)
            cols.append(p_value)
        elif test_id.strip() == 'sem':
            s = stats.sem(map(float, sample_one), ddof=args.ddof)
            cols.append(s)
        elif test_id.strip() == 'zscore':
            z = stats.zscore(map(float, sample_one), ddof=args.ddof)
            for value in z:
                cols.append(value)
        elif test_id.strip() == 'signaltonoise':
            s2n = stats.signaltonoise(map(float, sample_one), ddof=args.ddof)
            cols.append(s2n)
        elif test_id.strip() == 'percentileofscore':
            p = stats.percentileofscore(map(float, sample_one), score=args.score, kind=args.kind)
            cols.append(p)
        elif test_id.strip() == 'bayes_mvs':
            c_mean, c_var, c_std = stats.bayes_mvs(map(float, sample_one), alpha=args.alpha)
            cols.append(c_mean)
            cols.append(c_var)
            cols.append(c_std)
        elif test_id.strip() == 'sigmaclip':
            c, c_low, c_up = stats.sigmaclip(map(float, sample_one), low=args.m, high=args.n)
            cols.append(c)
            cols.append(c_low)
            cols.append(c_up)
        elif test_id.strip() == 'kstest':
            d, p_value = stats.kstest(map(float, sample_one), cdf=args.cdf, N=args.N,
                                      alternative=args.alternative, mode=args.mode)
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == 'chi2_contingency':
            chi2, p, dof, ex = stats.chi2_contingency(map(float, sample_one),
                                                      correction=args.correction,
                                                      lambda_=args.lambda_)
            cols.append(chi2)
            cols.append(p)
            cols.append(dof)
            cols.append(ex)

        # Trimmed statistics: the (mf, nf) limits are only applied when set.
        elif test_id.strip() == 'tmean':
            if nf == 0 and mf == 0:
                mean = stats.tmean(map(float, sample_one))
            else:
                mean = stats.tmean(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(mean)
        elif test_id.strip() == 'tmin':
            if mf == 0:
                min_value = stats.tmin(map(float, sample_one))
            else:
                min_value = stats.tmin(map(float, sample_one), lowerlimit=mf, inclusive=args.inclusive)
            cols.append(min_value)
        elif test_id.strip() == 'tmax':
            if nf == 0:
                max_value = stats.tmax(map(float, sample_one))
            else:
                max_value = stats.tmax(map(float, sample_one), upperlimit=nf, inclusive=args.inclusive)
            cols.append(max_value)
        elif test_id.strip() == 'tvar':
            if nf == 0 and mf == 0:
                var = stats.tvar(map(float, sample_one))
            else:
                var = stats.tvar(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(var)
        elif test_id.strip() == 'tstd':
            if nf == 0 and mf == 0:
                std = stats.tstd(map(float, sample_one))
            else:
                std = stats.tstd(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(std)
        elif test_id.strip() == 'tsem':
            if nf == 0 and mf == 0:
                s = stats.tsem(map(float, sample_one))
            else:
                s = stats.tsem(map(float, sample_one), (mf, nf), (inclusive1, inclusive2))
            cols.append(s)
        elif test_id.strip() == 'scoreatpercentile':
            if nf == 0 and mf == 0:
                s = stats.scoreatpercentile(map(float, sample_one), map(float, sample_two),
                                            interpolation_method=args.interpolation)
            else:
                s = stats.scoreatpercentile(map(float, sample_one), map(float, sample_two),
                                            (mf, nf), interpolation_method=args.interpolation)
            for value in s:
                cols.append(value)
        elif test_id.strip() == 'relfreq':
            if nf == 0 and mf == 0:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b)
            else:
                rel, low_range, binsize, ex = stats.relfreq(map(float, sample_one), args.b, (mf, nf))
            for value in rel:
                cols.append(value)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == 'binned_statistic':
            if nf == 0 and mf == 0:
                st, b_edge, b_n = stats.binned_statistic(map(float, sample_one), map(float, sample_two),
                                                         statistic=args.statistic, bins=args.b)
            else:
                st, b_edge, b_n = stats.binned_statistic(map(float, sample_one), map(float, sample_two),
                                                         statistic=args.statistic, bins=args.b,
                                                         range=(mf, nf))
            cols.append(st)
            cols.append(b_edge)
            cols.append(b_n)
        elif test_id.strip() == 'threshold':
            if nf == 0 and mf == 0:
                o = stats.threshold(map(float, sample_one), newval=args.new)
            else:
                o = stats.threshold(map(float, sample_one), mf, nf, newval=args.new)
            for value in o:
                cols.append(value)
        elif test_id.strip() == 'trimboth':
            o = stats.trimboth(map(float, sample_one), proportiontocut=args.proportiontocut)
            for value in o:
                cols.append(value)
        elif test_id.strip() == 'trim1':
            t1 = stats.trim1(map(float, sample_one), proportiontocut=args.proportiontocut, tail=args.tail)
            for value in t1:
                cols.append(value)
        elif test_id.strip() == 'histogram':
            if nf == 0 and mf == 0:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b)
            else:
                hi, low_range, binsize, ex = stats.histogram(map(float, sample_one), args.b, (mf, nf))
            cols.append(hi)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == 'cumfreq':
            if nf == 0 and mf == 0:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b)
            else:
                cum, low_range, binsize, ex = stats.cumfreq(map(float, sample_one), args.b, (mf, nf))
            cols.append(cum)
            cols.append(low_range)
            cols.append(binsize)
            cols.append(ex)
        elif test_id.strip() == 'boxcox_normmax':
            if nf == 0 and mf == 0:
                ma = stats.boxcox_normmax(map(float, sample_one))
            else:
                ma = stats.boxcox_normmax(map(float, sample_one), (mf, nf), method=args.method)
            cols.append(ma)
        elif test_id.strip() == 'boxcox':
            if imbda == 0:
                box, ma, ci = stats.boxcox(map(float, sample_one), alpha=args.alpha)
                cols.append(box)
                cols.append(ma)
                cols.append(ci)
            else:
                box = stats.boxcox(map(float, sample_one), imbda, alpha=args.alpha)
                cols.append(box)

        # Two-sample tests.
        elif test_id.strip() == 'histogram2':
            h2 = stats.histogram2(map(float, sample_one), map(float, sample_two))
            for value in h2:
                cols.append(value)
        elif test_id.strip() == 'ranksums':
            z_statistic, p_value = stats.ranksums(map(float, sample_one), map(float, sample_two))
            cols.append(z_statistic)
            cols.append(p_value)
        elif test_id.strip() == 'ttest_1samp':
            t, prob = stats.ttest_1samp(map(float, sample_one), map(float, sample_two))
            for value in t:
                cols.append(value)
            for value in prob:
                cols.append(value)
        elif test_id.strip() == 'ansari':
            AB, p_value = stats.ansari(map(float, sample_one), map(float, sample_two))
            cols.append(AB)
            cols.append(p_value)
        elif test_id.strip() == 'linregress':
            slope, intercept, r_value, p_value, stderr = stats.linregress(map(float, sample_one),
                                                                          map(float, sample_two))
            cols.append(slope)
            cols.append(intercept)
            cols.append(r_value)
            cols.append(p_value)
            cols.append(stderr)
        elif test_id.strip() == 'pearsonr':
            cor, p_value = stats.pearsonr(map(float, sample_one), map(float, sample_two))
            cols.append(cor)
            cols.append(p_value)
        elif test_id.strip() == 'pointbiserialr':
            r, p_value = stats.pointbiserialr(map(float, sample_one), map(float, sample_two))
            cols.append(r)
            cols.append(p_value)
        elif test_id.strip() == 'ks_2samp':
            d, p_value = stats.ks_2samp(map(float, sample_one), map(float, sample_two))
            cols.append(d)
            cols.append(p_value)
        elif test_id.strip() == 'mannwhitneyu':
            mw_stats_u, p_value = stats.mannwhitneyu(map(float, sample_one), map(float, sample_two),
                                                     use_continuity=args.mwu_use_continuity)
            cols.append(mw_stats_u)
            cols.append(p_value)
        elif test_id.strip() == 'zmap':
            z = stats.zmap(map(float, sample_one), map(float, sample_two), ddof=args.ddof)
            for value in z:
                cols.append(value)
        elif test_id.strip() == 'ttest_ind':
            t, p_value = stats.ttest_ind(map(float, sample_one), map(float, sample_two),
                                         equal_var=args.equal_var)
            cols.append(t)
            cols.append(p_value)
        elif test_id.strip() == 'ttest_rel':
            t, prob = stats.ttest_rel(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(t)
            cols.append(prob)
        elif test_id.strip() == 'mood':
            z, p_value = stats.mood(map(float, sample_one), map(float, sample_two), axis=args.axis)
            cols.append(z)
            cols.append(p_value)
        elif test_id.strip() == 'shapiro':
            # Note: unpacking three values requires --reta, so that shapiro also returns the a values.
            W, p_value, a = stats.shapiro(map(float, sample_one), map(float, sample_two), args.reta)
            cols.append(W)
            cols.append(p_value)
            for value in a:
                cols.append(value)
        elif test_id.strip() == 'kendalltau':
            k, p_value = stats.kendalltau(map(float, sample_one), map(float, sample_two),
                                          initial_lexsort=args.initial_lexsort)
            cols.append(k)
            cols.append(p_value)
        elif test_id.strip() == 'entropy':
            s = stats.entropy(map(float, sample_one), map(float, sample_two), base=args.base)
            cols.append(s)

        # Tests that accept either one or two samples.
        elif test_id.strip() == 'spearmanr':
            if sample2 == 1:
                rho, p_value = stats.spearmanr(map(float, sample_one), map(float, sample_two))
            else:
                rho, p_value = stats.spearmanr(map(float, sample_one))
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == 'wilcoxon':
            if sample2 == 1:
                T, p_value = stats.wilcoxon(map(float, sample_one), map(float, sample_two),
                                            zero_method=args.zero_method, correction=args.correction)
            else:
                T, p_value = stats.wilcoxon(map(float, sample_one),
                                            zero_method=args.zero_method, correction=args.correction)
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == 'chisquare':
            if sample2 == 1:
                rho, p_value = stats.chisquare(map(float, sample_one), map(float, sample_two),
                                               ddof=args.ddof)
            else:
                rho, p_value = stats.chisquare(map(float, sample_one), ddof=args.ddof)
            cols.append(rho)
            cols.append(p_value)
        elif test_id.strip() == 'power_divergence':
            if sample2 == 1:
                stat, p_value = stats.power_divergence(map(float, sample_one), map(float, sample_two),
                                                       ddof=args.ddof, lambda_=args.lambda_)
            else:
                stat, p_value = stats.power_divergence(map(float, sample_one),
                                                       ddof=args.ddof, lambda_=args.lambda_)
            cols.append(stat)
            cols.append(p_value)
        elif test_id.strip() == 'theilslopes':
            if sample2 == 1:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), map(float, sample_two),
                                                     alpha=args.alpha)
            else:
                mpe, met, lo, up = stats.theilslopes(map(float, sample_one), alpha=args.alpha)
            cols.append(mpe)
            cols.append(met)
            cols.append(lo)
            cols.append(up)
        elif test_id.strip() == 'combine_pvalues':
            if sample2 == 1:
                stat, p_value = stats.combine_pvalues(map(float, sample_one), method=args.med,
                                                      weights=map(float, sample_two))
            else:
                stat, p_value = stats.combine_pvalues(map(float, sample_one), method=args.med)
            cols.append(stat)
            cols.append(p_value)

        # k-sample tests operating on the ';'-separated groups of --sample_cols.
        elif test_id.strip() == 'obrientransform':
            ob = stats.obrientransform(*b_samples)
            for row in ob:
                elements = ','.join(map(str, row))
                cols.append(elements)
        elif test_id.strip() == 'f_oneway':
            f_value, p_value = stats.f_oneway(*b_samples)
            cols.append(f_value)
            cols.append(p_value)
        elif test_id.strip() == 'kruskal':
            h, p_value = stats.kruskal(*b_samples)
            cols.append(h)
            cols.append(p_value)
        elif test_id.strip() == 'friedmanchisquare':
            fr, p_value = stats.friedmanchisquare(*b_samples)
            cols.append(fr)
            cols.append(p_value)
        elif test_id.strip() == 'fligner':
            xsq, p_value = stats.fligner(center=args.center, proportiontocut=args.proportiontocut,
                                         *b_samples)
            cols.append(xsq)
            cols.append(p_value)
        elif test_id.strip() == 'bartlett':
            T, p_value = stats.bartlett(*b_samples)
            cols.append(T)
            cols.append(p_value)
        elif test_id.strip() == 'levene':
            w, p_value = stats.levene(center=args.center, proportiontocut=args.proportiontocut,
                                      *b_samples)
            cols.append(w)
            cols.append(p_value)
        elif test_id.strip() == 'median_test':
            stat, p_value, m, table = stats.median_test(ties=args.ties, correction=args.correction,
                                                        lambda_=args.lambda_, *b_samples)
            cols.append(stat)
            cols.append(p_value)
            cols.append(m)
            for row in table:
                elements = ','.join(map(str, row))
                cols.append(elements)

        outfile.write('%s\n' % '\t'.join(map(str, cols)))
    outfile.close()


if __name__ == '__main__':
    main()
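Example invocation (file names are placeholders; the flags are the argparse options defined above). A Welch's t-test comparing columns 1-3 against columns 4-6 of a tab-separated file, appending the t statistic and the p-value to every line:

    python statistical_hypothesis_testing.py --infile input.tsv --outfile output.tsv --test_id ttest_ind --sample_one_cols 1,2,3 --sample_two_cols 4,5,6

Leaving out --equal_var selects Welch's t-test; setting it selects the standard two-sample test that assumes equal population variances.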