"""
Power Analysis (Python) tool
Developed by Dr. Goncalo Correia and Dr Jianliang Gao
Imperial College London
2016
simple usage: python pa.py TutorialData.csv 8
"TutorialData.csv" is input test data set, can be replaced by actual data set name.
"8" means the first 8 variables, which can be a range, e.g., 8-16
full usage: python pa.py TutorialData.csv 2-9 0:100:500 0.05:0.05:0.7 20 0 4
"0:100:500" (default value) means the range of sample sizes from 0 to 500 (not inclusive) with interval of 100.
"0.05:0.05:0.7" (default value) means the range of effect sizes from 0.05 to 0.7 (not inclusive) with interval of 0.05.
"20" is an integer number of repeats. Default value is 10.
"0" is the default input for working for classification only. Please choose 1 to work on regression only or 2 to work on both.
"4" is an integer number as number of CPU cores to use. By default is to use all available cores.
"""
import os, sys, csv, inspect, dis, os.path, random, multiprocessing, getopt
import numpy as np
import scipy.stats as scistats
import statsmodels.formula.api as sm # for linear regression
import shutil # for creating zip files
from math import fabs, floor, ceil, log, exp
from datetime import datetime
[docs]def iSurfacePlot(output, svfilename, variable, metric, correction, samplsizes,sizeeff, nreps):
"""
This is for plotting interactive 3D surface plot for mean of all variables.
:param output: array for plotting.2D numpy array
:type output: array
:param svfilename: filename for saving the corresponding plot.
:type svfilename: String
:param variable: variable index nubmer. for plotting mean of all variables, as the size is 1, therefore use 1 as input parameter.
:type variable: int
:param metric: metric (confusion matrix, 'TP', 'TF' etc) index nubmer. for plotting mean of all variables, as the size is 1, therefore use 1 as input parameter.
:type metric: int
:param correction: correction methods (no correction, Bonferroni, Benjamini-Hochberg, Benjamini-Yekutieli index nubmer. for plotting mean of all variables, as the size is 1, therefore use 1 as input parameter.
:type correction: int
:param samplsizes: sample size matrix, numpy array 1 x n
:type samplsizes: array
:param sizeeff: effect size matrix, numpy array 1 x n
:type sizeeff: array
:param nreps: number of repeats
:type nreps: int
:return:
"""
import plotly as py
import plotly.graph_objs as go
MUtot = output[variable - 1][correction - 1][metric - 1]
NS, NSE = MUtot.shape
##SIGMAtot = output[variable-1][correction-1][metric+5-1]
##SIGMAlow=MUtot-1.96*SIGMAtot/np.sqrt(nreps)
##SIGMAlow = np.array([[0 if x<0 else x for x in y] for y in SIGMAlow])
##plot
##generate a 2D grid
X, Y = np.meshgrid(samplsizes, sizeeff)
zaxis_title = 'True Positive Rate'
if metric == 1:
if not 'mean' in svfilename:
zaxis_title = 'True Positive Rate'
else:
if 'tp' in svfilename:
zaxis_title = 'True Positive Rate'
if 'fp' in svfilename:
zaxis_title = 'False Positive Rate'
if 'tn' in svfilename:
zaxis_title = 'True Negative Rate'
if 'fn' in svfilename:
zaxis_title = 'False Negative Rate'
elif metric == 2:
zaxis_title = 'False Positive Rate'
elif metric == 3:
zaxis_title = 'True Negative Rate'
elif metric == 4:
zaxis_title = 'False Negative Rate'
camera = dict(
up=dict(x=0, y=0, z=1),
center=dict(x=0, y=0, z=0),
eye=dict(x=2, y=2, z=0.1)
)
layout = go.Layout(
title='Statistical Power Analysis Resutls',
autosize=True,
width=1024,
height=768,
margin=go.Margin(
l=80,
r=40,
b=100,
t=60
),
scene=go.Scene(
xaxis=dict(
title='Sample Sizes',
range=[0, np.max(X) + 0.1]
),
yaxis=dict(
title='Effect Sizes',
tickmode='linear',
tick0=0,
dtick=0.1,
range=[0, np.max(Y)]
),
zaxis=dict(
title=zaxis_title,
tickmode='linear',
tick0=0,
dtick=0.1,
range=[0, 1.0]
)
)
)
data = [go.Surface(x=X, y=Y, z=MUtot)]
fig = go.Figure(data=data, layout=layout)
fig['layout'].update(scene=dict(camera=camera))
py.offline.plot(fig, filename=svfilename, auto_open=False)
##=======End of interactive SurfacePlot============
##====== Beginning of scatter plot for slices of surface plots===============
[docs]def iSlicesPlot(X, Y, Error_y, svfilename, plot_title, x_caption, y_caption, trace_label, trace_num):
"""
For plotting slices from surface plots. Interactive plots with error bars.
:param X: matrix for x-axis. either sample size or effect size. Use 1 x n numpy array.
:type X: array
:param Y: matrix of mean proportion of variables reach power>threshold.
:type Y: array
:param Error_y: a matrix contains standard deviation of proportion of variables reach power>threshold within the repeats.
:type Error_y: array
:param svfilename: filename for saving plots in the running folder.
:type svfilename: string
:param plot_title: Title for the plot
:type plot_title: String
:param x_caption: Caption for x-axis
:type x_caption: String
:param y_caption: Caption for y-axis
:type y_caption: String
:param trace_label: for each trace to show relevant txt.
:type trace_label: String
:param trace_num: for trace label when mouse cursor moving along plot to show which line is which
:type trace_num: array
:return:
"""
import plotly as py
import plotly.graph_objs as go
traces = []
for ii in range(0, len(Y)):
trace_tmp = go.Scatter(x=X, y=Y[ii], error_y=dict(
type='data',
array=Error_y[ii],
visible=True
),
name=trace_label + str(trace_num[0][ii])
)
traces.append(trace_tmp)
data = go.Data(traces)
##define other features of plots
##dictionary of y_caption
if y_caption == 'tpn':
y_caption = 'Proportion'
plot_title = plot_title + '-(no correction)'
if y_caption == 'tpb':
y_caption = 'Proportion'
plot_title = plot_title + '-(Bonferroni correction)'
if y_caption == 'tpbh':
y_caption = 'Proportion'
plot_title = plot_title + '-(Benjamini-Hochberg correction)'
if y_caption == 'tpby':
y_caption = 'Proportion'
plot_title = plot_title + '-(Benjamini-Yekutieli correction)'
if y_caption == 'fpn':
y_caption = 'False Positive Rate'
plot_title = plot_title + '-(no correction)'
if y_caption == 'fpb':
y_caption = 'False Positive Rate'
plot_title = plot_title + '-(Bonferroni correction)'
if y_caption == 'fpbh':
y_caption = 'False Positive Rate'
plot_title = plot_title + '-(Benjamini-Hochberg correction)'
if y_caption == 'fpby':
y_caption = 'False Positive Rate'
plot_title = plot_title + '-(Benjamini-Yekutieli correction)'
if y_caption == 'tnn':
y_caption = 'True Negative Rate'
plot_title = plot_title + '-(no correction)'
if y_caption == 'tnb':
y_caption = 'True Negative Rate'
plot_title = plot_title + '-(Bonferroni correction)'
if y_caption == 'tnbh':
y_caption = 'True Negative Rate'
plot_title = plot_title + '-(Benjamini-Hochberg correction)'
if y_caption == 'tnby':
y_caption = 'True Negative Rate'
plot_title = plot_title + '-(Benjamini-Yekutieli correction)'
if y_caption == 'fnn':
y_caption = 'False Negative Rate'
plot_title = plot_title + '-(no correction)'
if y_caption == 'fnb':
y_caption = 'False Negative Rate'
plot_title = plot_title + '-(Bonferroni correction)'
if y_caption == 'fnbh':
y_caption = 'False Negative Rate'
plot_title = plot_title + '-(Benjamini-Hochberg correction)'
if y_caption == 'fnby':
y_caption = 'False Negative Rate'
plot_title = plot_title + '-(Benjamini-Yekutieli correction)'
layout = go.Layout(
title=plot_title,
xaxis=dict(
title=x_caption,
titlefont=dict(
family='Courier New, monospace',
size=18,
color='#7f7f7f'
)
),
yaxis=dict(
title=y_caption,
titlefont=dict(
family='Courier New, monospace',
size=18,
color='#7f7f7f'
)
)
)
fig = go.Figure(data=data, layout=layout)
py.offline.plot(fig, filename=svfilename, auto_open=False)
##====== End of scatter plot for slices of surface plots===============
##====== Beginning of surface plot for power rate only===============
[docs]def iSurfacePlotTPR(output, svfilename, correction, samplsizes, sizeeff, nreps):
"""
This is for plotting interactive 3D surface plot for mean of proportion of all variables with power>0.8 accross all repeats
:param output: array for plotting.2D numpy array.
:type output: array
:param svfilename: filename for saving the corresponding plot.
:type svfilename: String
:param correction: correction methods (no correction, Bonferroni, Benjamini-Hochberg, Benjamini-Yekutieli index nubmer. for plotting mean of all variables, as the size is 1, therefore use 1 as input parameter.
:type correction: int
:param samplsizes: sample size matrix, numpy array 1 x n.
:type samplsizes: array
:param sizeeff: effect size matrix, numpy array 1 x n
:type sizeeff: array
:param nreps: number of repeats
:type nreps: int
:return:
"""
import plotly as py
import plotly.graph_objs as go
MUtot = output
NS, NSE = MUtot.shape
##plot
##generate a 2D grid
X, Y = np.meshgrid(samplsizes, sizeeff)
##define z axis title
zaxis_title = 'Proportion'
camera = dict(
up=dict(x=0, y=0, z=1),
center=dict(x=0, y=0, z=0),
eye=dict(x=2, y=2, z=0.1)
)
layout = go.Layout(
title='Proportion of Variables with Power (True Positive)> 0.8 -%s, %d Repeats' % (correction, nreps),
autosize=True,
width=1024,
height=768,
margin=go.Margin(
l=80,
r=40,
b=100,
t=60
),
scene=go.Scene(
xaxis=dict(
title='Sample Sizes',
range=[0, np.max(X) + 0.1]
),
yaxis=dict(
title='Effect Sizes',
tickmode='linear',
tick0=0,
dtick=0.1,
range=[0, np.max(Y)]
),
zaxis=dict(
title=zaxis_title,
tickmode='linear',
tick0=0,
dtick=0.1,
range=[0, 1.0]
)
)
)
data = [go.Surface(x=X, y=Y, z=MUtot)]
fig = go.Figure(data=data, layout=layout)
fig['layout'].update(scene=dict(camera=camera))
py.offline.plot(fig, filename=svfilename, auto_open=False)
##====== End of surface plot for power rate only===============
##=======Beginning of simulateLogNormal===================
[docs]def simulateLogNormal(data, covType, nSamples):
"""
Using log(10) of input matrix to generate normally distributed multivariate matrix
:param data: data matrix input
:type data: array
:param covType: Estimate a covariance matrix or Extract/construct a diagonal array from given variance.
:type covType: String
:param nSamples: number of simulated samples.
:type nSamples: int
:return: Return simulation data and correlation matrix
:rtype: array array
"""
## find offset and offset the data. Why to do this?
offset = fabs(np.amin(data)) + 1
offData = data + offset
##log on the data array
logData = np.log(offData)
meansLog = np.mean(logData, axis=0)
if (covType == 'Estimate'):
covLog = np.cov(logData, rowvar=0)
elif (covType == 'Diagonal'):
varlogData = np.var(logData, axis=0) # get variance of log data by each column
covLog = np.diag(varlogData) # generate a matrix with diagonal of variance of log Data
else:
print('Unknown Covariance type')
simData = np.random.multivariate_normal(np.transpose(meansLog), covLog, nSamples)
simData = np.exp(simData)
simData = simData - offset
##Set to 0 negative values
simData = [[0 if x < 0 else x for x in y] for y in simData]
if (type(simData).__name__ != 'ndarray'):
simData = np.array(simData)
corrMatrix = np.corrcoef(simData,rowvar=0) # work out the correlation of matrix by columns, each column is a variable
return simData, corrMatrix
##=======End of simulateLogNormal===================
##=======Beginning of PCalc_Continuous====================
[docs]def PCalc_Continuous(data, EffectSizes, SampSizes, SignThreshold, nSimSamp, nRepeat, cores):
"""
Calculate the confusion matrix with/without correction methods by regression.
:param data: Test Matrix. numpy array.
:type data: array
:param EffectSizes: effect size matrix, numpy array 1 x n
:type EffectSizes: array
:param SampSizes: Sample size matrix, numpy array 1 x n
:type SampSizes: array
:param SignThreshold: Threshold of significance level.This tool uses 0.05.
:type SignThreshold: float
:param nSimSamp: number of simulated samples. This tool uses 5000.
:type nSimSamp: int
:param nRepeat: number of repeats in the calculation of confusion matrix using simulated data.By default it is set to 10.
:type nRepeat: int
:param cores: number of CPU cores to be used for parallel.
:type cores: int
:return:
"""
##If sample size bigger than number of simulated samples adjust it
sampSizes = SampSizes
signThreshold = SignThreshold
effectSizes = EffectSizes
try:
if max(sampSizes) >= nSimSamp:
print('Number of simulated samples smaller than maximum of samplesizes to check - increased')
nSimSamp = max(sampSizes) + 500
except ValueError:
if max(max(sampSizes)) >= nSimSamp:
print('Number of simulated samples smaller than maximum of samplesizes to check - increased')
nSimSamp = max(max(sampSizes)) + 500
## convert matrix to numpy array type if needed
if (type(data).__name__ != 'ndarray'):
data = np.array(data)
##get data array size
size = data.shape
rows = size[0]
if (data.ndim > 1):
cols = size[1]
else:
cols = 1
##Number of variables
numVars = cols
nRepeats = nRepeat
##Number of sample and effect sizes
if (sampSizes.ndim > 1):
nSampSizes = sampSizes.shape[1]
elif (sampSizes.ndim == 1):
nSampSizes = sampSizes.shape[0]
if (effectSizes.ndim > 1):
nEffSizes = effectSizes.shape[1]
elif (effectSizes.ndim == 1):
nEffSizes = effectSizes.shape[0]
##Simulation of a new data set based on multivariate normal distribution
Samples, correlationMat = simulateLogNormal(data, 'Estimate', nSimSamp)
Samples_seg = Samples
correlationMat_seg = correlationMat
## Initialize the output structures
output2 = []
##define an array for storing the results in each step of repeat for all variables
##with all effect sizes and sample sizes;
output_allsteps_tmp = []
pool = multiprocessing.Pool(processes=cores)
output2 = [pool.apply_async(f_multiproc_cont, args=( \
sampSizes, signThreshold, effectSizes, numVars, nRepeats, nSampSizes, nEffSizes, Samples_seg, correlationMat_seg, \
cols, cores, wk)) for wk in range(cores)]
output2 = [p.get(None) for p in output2]
output3 = list()
# Unpack results
for instanceOutput in output2:
for item in instanceOutput:
output3.append(item)
##pass the results to output
output = []
##work out number of overall results and number of power rate results
num_overall_results = int(round(numVars / cores))
for novr in range(num_overall_results):
output.append(output3[novr])
##pass Power TPR results to output variables
output_uncTP = []
output_bonfTP = []
output_bhTP = []
output_byTP = []
output_uncTP = np.array(output3[num_overall_results])
output_bonfTP = np.array(output3[num_overall_results + 1])
output_bhTP = np.array(output3[num_overall_results + 2])
output_byTP = np.array(output3[num_overall_results + 3])
if cores > 1:
for ii in range(1, cores - 1):
for novr in range(num_overall_results):
output.append(output3[ii * (num_overall_results + 4) + novr])
output_uncTP = np.append(output_uncTP, output3[ii * (num_overall_results + 4) + num_overall_results],
axis=2)
output_bonfTP = np.append(output_bonfTP, output3[ii * (num_overall_results + 4) + num_overall_results + 1],
axis=2)
output_bhTP = np.append(output_bhTP, output3[ii * (num_overall_results + 4) + num_overall_results + 2],
axis=2)
output_byTP = np.append(output_byTP, output3[ii * (num_overall_results + 4) + num_overall_results + 3],
axis=2)
rest_num_overall_results = numVars - num_overall_results * (cores - 1)
for novr in range(rest_num_overall_results):
output.append(output3[(cores - 1) * (num_overall_results + 4) + novr])
output_uncTP = np.append(output_uncTP,
output3[(cores - 1) * (num_overall_results + 4) + rest_num_overall_results], axis=2)
output_bonfTP = np.append(output_bonfTP,
output3[(cores - 1) * (num_overall_results + 4) + rest_num_overall_results + 1],
axis=2)
output_bhTP = np.append(output_bhTP,
output3[(cores - 1) * (num_overall_results + 4) + rest_num_overall_results + 2], axis=2)
output_byTP = np.append(output_byTP,
output3[(cores - 1) * (num_overall_results + 4) + rest_num_overall_results + 3], axis=2)
output = np.array(output)
##for the mean proportion of number of variables achieve the power; and the std
output_uncTP_ratio_median = np.zeros((nEffSizes, nSampSizes))
output_bonfTP_ratio_median = np.zeros((nEffSizes, nSampSizes))
output_bhTP_ratio_median = np.zeros((nEffSizes, nSampSizes))
output_byTP_ratio_median = np.zeros((nEffSizes, nSampSizes))
output_uncTP_ratio_iqr = np.zeros((nEffSizes, nSampSizes))
output_bonfTP_ratio_iqr = np.zeros((nEffSizes, nSampSizes))
output_bhTP_ratio_iqr = np.zeros((nEffSizes, nSampSizes))
output_byTP_ratio_iqr = np.zeros((nEffSizes, nSampSizes))
for currEff in range(0, nEffSizes):
for currSamp in range(0, nSampSizes):
tmp_median_array = np.zeros(nRepeats)
for currStep in range(0, nRepeats):
tmp_median_array[currStep] = (sum(
1 for x in output_uncTP[currEff][currSamp][:, currStep] if x > 0.8)) / float(numVars)
output_uncTP_ratio_median[currEff][currSamp] = np.median(tmp_median_array)
try:
output_uncTP_ratio_iqr[currEff][currSamp] = scistats.iqr(tmp_median_array, axis=0,
interpolation='midpoint')
except AttributeError:
q75, q25 = np.percentile(tmp_median_array, [75, 25], axis=0)
output_uncTP_ratio_iqr[currEff][currSamp] = q75 - q25
tmp_median_array = np.zeros(nRepeats)
for currStep in range(0, nRepeats):
tmp_median_array[currStep] = (sum(
1 for x in output_bonfTP[currEff][currSamp][:, currStep] if x > 0.8)) / float(numVars)
output_bonfTP_ratio_median[currEff][currSamp] = np.median(tmp_median_array)
try:
output_bonfTP_ratio_iqr[currEff][currSamp] = scistats.iqr(tmp_median_array, axis=0,
interpolation='midpoint')
except AttributeError:
q75, q25 = np.percentile(tmp_median_array, [75, 25], axis=0)
output_bonfTP_ratio_iqr[currEff][currSamp] = q75 - q25
tmp_median_array = np.zeros(nRepeats)
for currStep in range(0, nRepeats):
tmp_median_array[currStep] = (sum(
1 for x in output_bhTP[currEff][currSamp][:, currStep] if x > 0.8)) / float(numVars)
output_bhTP_ratio_median[currEff][currSamp] = np.median(tmp_median_array)
try:
output_bhTP_ratio_iqr[currEff][currSamp] = scistats.iqr(tmp_median_array, axis=0,
interpolation='midpoint')
except AttributeError:
q75, q25 = np.percentile(tmp_median_array, [75, 25], axis=0)
output_bhTP_ratio_iqr[currEff][currSamp] = q75 - q25
tmp_median_array = np.zeros(nRepeats)
for currStep in range(0, nRepeats):
tmp_median_array[currStep] = (sum(
1 for x in output_byTP[currEff][currSamp][:, currStep] if x > 0.8)) / float(numVars)
output_byTP_ratio_median[currEff][currSamp] = np.median(tmp_median_array)
try:
output_byTP_ratio_iqr[currEff][currSamp] = scistats.iqr(tmp_median_array, axis=0,
interpolation='midpoint')
except AttributeError:
q75, q25 = np.percentile(tmp_median_array, [75, 25], axis=0)
output_byTP_ratio_iqr[currEff][currSamp] = q75 - q25
try:
return output, output_uncTP_ratio_median, output_bonfTP_ratio_median, output_bhTP_ratio_median, output_byTP_ratio_median, \
output_uncTP_ratio_iqr, output_bonfTP_ratio_iqr, output_bhTP_ratio_iqr, output_byTP_ratio_iqr, \
output_uncTP, output_bonfTP, output_bhTP, output_byTP
except:
print('error occurs when returning output')
[docs]def f_multiproc_cont(sampSizes, signThreshold, effectSizes, numVars, nRepeats, nSampSizes, nEffSizes, Samples_seg,
correlationMat_seg, cols, cores, currCore):
"""
Lauch parallel computing for PCalc_Continuous.
:param sampSizes: Sample size matrix, numpy array 1 x n.
:type sampSizes: array
:param signThreshold: Threshold of significance level.This tool uses 0.05.
:type signThreshold: float
:param effectSizes: Effect size matrix, numpy array 1 x n.
:type effectSizes: array
:param numVars: number of variables in current CPU core.
:type numVars: int
:param nRepeats: number of repeats in the calculation of confusion matrix using simulated data.By default it is set to 10.
:type nRepeats: int
:param nSampSizes: length of Sample size matrix.
:type nSampSizes: int
:param nEffSizes: length of effect size matrix.
:type nEffSizes: int
:param Samples_seg: Segment of simulated sample data. Numpy array.
:type Samples_seg: array
:param correlationMat_seg: Segment of correlation matrix on the simulated sample data. Numpy array.
:type correlationMat_seg: array
:param cols: number of variables to be used for calculation.
:type cols: int
:param cores: number of CPU cores to be used.
:type cores: int
:param currCore: current CPU core in use.
:type currCore: int
:return:
"""
##re-check numVars
offSet = currCore * int(round(cols / cores))
if (currCore < (cores - 1)):
numVars = int(round(cols / cores))
else:
numVars = cols - int(round(cols / cores)) * (cores - 1)
##for storing all results in all repeated steps with all effect sizes and sample
##sizes for Power (TP) in current samples_seg
output_all_uncTP_tmp = np.zeros((nEffSizes, nSampSizes, numVars, nRepeats))
output_all_bonfTP_tmp = np.zeros((nEffSizes, nSampSizes, numVars, nRepeats))
output_all_bhTP_tmp = np.zeros((nEffSizes, nSampSizes, numVars, nRepeats))
output_all_byTP_tmp = np.zeros((nEffSizes, nSampSizes, numVars, nRepeats))
##for storing results of all metric and correction options under the combination
##of effect size and sample size for all variables
output = []
if (nEffSizes == 1 and nSampSizes == 1):
storeVar = np.zeros((4, 10))
elif (nEffSizes > 1 or nSampSizes > 1):
storeVar = np.zeros((4, 10, nEffSizes, nSampSizes))
##define uncStruct -- dictionary data
uncStruct = {'TP': np.zeros((nEffSizes, nSampSizes)), 'FP': np.zeros((nEffSizes, nSampSizes)),
'TN': np.zeros((nEffSizes, nSampSizes)), \
'FN': np.zeros((nEffSizes, nSampSizes)), 'FD': np.zeros((nEffSizes, nSampSizes)),
'STP': np.zeros((nEffSizes, nSampSizes)), \
'SFP': np.zeros((nEffSizes, nSampSizes)), 'STN': np.zeros((nEffSizes, nSampSizes)),
'SFN': np.zeros((nEffSizes, nSampSizes)), \
'SFD': np.zeros((nEffSizes, nSampSizes))}
bonfStruct = {'TP': np.zeros((nEffSizes, nSampSizes)), 'FP': np.zeros((nEffSizes, nSampSizes)),
'TN': np.zeros((nEffSizes, nSampSizes)), \
'FN': np.zeros((nEffSizes, nSampSizes)), 'FD': np.zeros((nEffSizes, nSampSizes)),
'STP': np.zeros((nEffSizes, nSampSizes)), \
'SFP': np.zeros((nEffSizes, nSampSizes)), 'STN': np.zeros((nEffSizes, nSampSizes)),
'SFN': np.zeros((nEffSizes, nSampSizes)), \
'SFD': np.zeros((nEffSizes, nSampSizes))}
bhStruct = {'TP': np.zeros((nEffSizes, nSampSizes)), 'FP': np.zeros((nEffSizes, nSampSizes)),
'TN': np.zeros((nEffSizes, nSampSizes)), \
'FN': np.zeros((nEffSizes, nSampSizes)), 'FD': np.zeros((nEffSizes, nSampSizes)),
'STP': np.zeros((nEffSizes, nSampSizes)), \
'SFP': np.zeros((nEffSizes, nSampSizes)), 'STN': np.zeros((nEffSizes, nSampSizes)),
'SFN': np.zeros((nEffSizes, nSampSizes)), \
'SFD': np.zeros((nEffSizes, nSampSizes))}
byStruct = {'TP': np.zeros((nEffSizes, nSampSizes)), 'FP': np.zeros((nEffSizes, nSampSizes)),
'TN': np.zeros((nEffSizes, nSampSizes)), \
'FN': np.zeros((nEffSizes, nSampSizes)), 'FD': np.zeros((nEffSizes, nSampSizes)),
'STP': np.zeros((nEffSizes, nSampSizes)), \
'SFP': np.zeros((nEffSizes, nSampSizes)), 'STN': np.zeros((nEffSizes, nSampSizes)),
'SFN': np.zeros((nEffSizes, nSampSizes)), \
'SFD': np.zeros((nEffSizes, nSampSizes))}
for currVar in range(0, numVars):
if (nEffSizes == 1 and nSampSizes == 1):
storeVar = np.zeros((4, 10))
elif (nEffSizes > 1 or nSampSizes > 1):
storeVar = np.zeros((4, 10, nEffSizes, nSampSizes))
for currEff in range(0, nEffSizes):
b1 = np.zeros((cols, 1))
b1[currVar][0] = effectSizes[0][currEff]
for currSampSize in range(0, nSampSizes):
## define the structural data multiplerepeats
class MUltiplerepeats(object):
def __init__(self, Results, Bonferroni, BenjHoch, BenjYek, noCorrection):
self.Results = Results
self.Bonferroni = Bonferroni
self.BenjHoch = BenjHoch
self.BenjYek = BenjYek
self.noCorrection = noCorrection
multiplerepeats = MUltiplerepeats(
{'TP': np.zeros(nRepeats), 'FP': np.zeros(nRepeats), 'TN': np.zeros(nRepeats),
'FN': np.zeros(nRepeats), 'FD': np.zeros(nRepeats)}, \
{'TP': np.zeros(nRepeats), 'FP': np.zeros(nRepeats), 'TN': np.zeros(nRepeats),
'FN': np.zeros(nRepeats), 'FD': np.zeros(nRepeats)}, \
{'TP': np.zeros(nRepeats), 'FP': np.zeros(nRepeats), 'TN': np.zeros(nRepeats),
'FN': np.zeros(nRepeats), 'FD': np.zeros(nRepeats)}, \
{'TP': np.zeros(nRepeats), 'FP': np.zeros(nRepeats), 'TN': np.zeros(nRepeats),
'FN': np.zeros(nRepeats), 'FD': np.zeros(nRepeats)}, \
{'TP': np.zeros(nRepeats), 'FP': np.zeros(nRepeats), 'TN': np.zeros(nRepeats),
'FN': np.zeros(nRepeats), 'FD': np.zeros(nRepeats)})
for currRepeat in range(0, nRepeats):
## Select a subset of the simulated spectra
selectIndex = randperm1(sampSizes[0][currSampSize])
if (type(selectIndex).__name__ != 'ndarray'):
selectIndex = np.array(selectIndex)
SelSamples = Samples_seg[selectIndex] # matrix slicing by rows
## UVScaling the data
stDev = np.std(SelSamples,
axis=0) # without argument ddof=1 means using default ddof=0 to work out std on population
SelSamples = SelSamples - np.mean(SelSamples, axis=0)
SelSamples = SelSamples / stDev
noiseLevel = 1
noise = noiseLevel * np.random.randn(sampSizes[0][currSampSize], 1)
#noise = noiseLevel * np.random.normal(0, 1, sampSizes[0][currSampSize])
#noise = np.reshape(noise, (len(noise),1))
Y = SelSamples[:, np.array([currVar])] * b1[currVar][0]
Y = Y + noise
p = np.zeros((1, cols))
##Using regress for multivariate regression test
for i in range(0, cols):
B = np.append(np.ones((Y.shape[0], 1)), SelSamples[:, [i]], axis=1)
stats_result = sm.OLS(Y, B).fit() # ordinary least square linear regression
#stats_result1 = stats_result.predict(B)
# OLS. The result of OLS has attributes such as
# .rsquared as R^2, .fvalue as F-statistics
# .f_pvalue as p-value of F-stats, .scale as error variance
p[0][i] = stats_result.f_pvalue
pUnc = p ##pUnc and p have 1xnumVars elements
pBonf = p * numVars
h1, crit_p, adj_ci_cvrg, pBY = fdr_bh(p, 0.05, 'dep')
h1, crit_p, adj_ci_cvrg, pBH = fdr_bh(p, 0.05, 'pdep')
##
#corrVector = np.array([])
corrVector = correlationMat_seg[:, currVar + offSet]
uncTNTot, uncTPTot, uncFPTot, uncFNTot, uncFDTot = calcConfMatrixUniv(pUnc, corrVector,
signThreshold, 0.8)
bonfTNTot, bonfTPTot, bonfFPTot, bonfFNTot, bonfFDTot = calcConfMatrixUniv(pBonf, corrVector,
signThreshold, 0.8)
byTNTot, byTPTot, byFPTot, byFNTot, byFDTot = calcConfMatrixUniv(pBY, corrVector, signThreshold,
0.8)
bhTNTot, bhTPTot, bhFPTot, bhFNTot, bhFDTot = calcConfMatrixUniv(pBH, corrVector, signThreshold,
0.8)
try:
multiplerepeats.noCorrection['TP'][currRepeat] = uncTPTot
except IndexError:
multiplerepeats.noCorrection['TP'] = np.append(multiplerepeats.noCorrection['TP'],
uncTPTot) # if array index exceeds upper bound, extend the array
try:
multiplerepeats.noCorrection['FP'][currRepeat] = uncFPTot
except IndexError:
multiplerepeats.noCorrection['FP'] = np.append(multiplerepeats.noCorrection['FP'], uncFPTot)
try:
multiplerepeats.noCorrection['TN'][currRepeat] = uncTNTot
except IndexError:
multiplerepeats.noCorrection['TN'] = np.append(multiplerepeats.noCorrection['TN'], uncTNTot)
try:
multiplerepeats.noCorrection['FN'][currRepeat] = uncFNTot
except IndexError:
multiplerepeats.noCorrection['FN'] = np.append(multiplerepeats.noCorrection['FN'], uncFNTot)
try:
multiplerepeats.noCorrection['FD'][currRepeat] = uncFDTot
except IndexError:
multiplerepeats.noCorrection['FD'] = np.append(multiplerepeats.noCorrection['FD'], uncFDTot)
try:
multiplerepeats.Bonferroni['TP'][currRepeat] = bonfTPTot
except IndexError:
multiplerepeats.Bonferroni['TP'] = np.append(multiplerepeats.Bonferroni['TP'], bonfTPTot)
try:
multiplerepeats.Bonferroni['FP'][currRepeat] = bonfFPTot
except IndexError:
multiplerepeats.Bonferroni['FP'] = np.append(multiplerepeats.Bonferroni['FP'], bonfFPTot)
try:
multiplerepeats.Bonferroni['TN'][currRepeat] = bonfTNTot
except IndexError:
multiplerepeats.Bonferroni['TN'] = np.append(multiplerepeats.Bonferroni['TN'], bonfTNTot)
try:
multiplerepeats.Bonferroni['FN'][currRepeat] = bonfFNTot
except IndexError:
multiplerepeats.Bonferroni['FN'] = np.append(multiplerepeats.Bonferroni['FN'], bonfFNTot)
try:
multiplerepeats.Bonferroni['FD'][currRepeat] = bonfFDTot
except IndexError:
multiplerepeats.Bonferroni['FD'] = np.append(multiplerepeats.Bonferroni['FD'], bonfFDTot)
try:
multiplerepeats.BenjHoch['TP'][currRepeat] = bhTPTot
except IndexError:
multiplerepeats.BenjHoch['TP'] = np.append(multiplerepeats.BenjHoch['TP'], bhTPTot)
try:
multiplerepeats.BenjHoch['FP'][currRepeat] = bhFPTot
except IndexError:
multiplerepeats.BenjHoch['FP'] = np.append(multiplerepeats.BenjHoch['FP'], bhFPTot)
try:
multiplerepeats.BenjHoch['TN'][currRepeat] = bhTNTot
except IndexError:
multiplerepeats.BenjHoch['TN'] = np.append(multiplerepeats.BenjHoch['TN'], bhTNTot)
try:
multiplerepeats.BenjHoch['FN'][currRepeat] = bhFNTot
except IndexError:
multiplerepeats.BenjHoch['FN'] = np.append(multiplerepeats.BenjHoch['FN'], bhFNTot)
try:
multiplerepeats.BenjHoch['FD'][currRepeat] = bhFDTot
except IndexError:
multiplerepeats.BenjHoch['FD'] = np.append(multiplerepeats.BenjHoch['FD'], bhFDTot)
try:
multiplerepeats.BenjYek['TP'][currRepeat] = byTPTot
except IndexError:
multiplerepeats.BenjYek['TP'] = np.append(multiplerepeats.BenjYek['TP'], byTPTot)
try:
multiplerepeats.BenjYek['FP'][currRepeat] = byFPTot
except IndexError:
multiplerepeats.BenjYek['FP'] = np.append(multiplerepeats.BenjYek['FP'], byFPTot)
try:
multiplerepeats.BenjYek['TN'][currRepeat] = byTNTot
except IndexError:
multiplerepeats.BenjYek['TN'] = np.append(multiplerepeats.BenjYek['TN'], byTNTot)
try:
multiplerepeats.BenjYek['FN'][currRepeat] = byFNTot
except IndexError:
multiplerepeats.BenjYek['FN'] = np.append(multiplerepeats.BenjYek['FN'], byFNTot)
try:
multiplerepeats.BenjYek['FD'][currRepeat] = byFDTot
except IndexError:
multiplerepeats.BenjYek['FD'] = np.append(multiplerepeats.BenjYek['FD'], byFDTot)
##storing each step of repeats
output_all_uncTP_tmp[currEff][currSampSize][currVar] = multiplerepeats.noCorrection['TP']
output_all_bonfTP_tmp[currEff][currSampSize][currVar] = multiplerepeats.Bonferroni['TP']
output_all_bhTP_tmp[currEff][currSampSize][currVar] = multiplerepeats.BenjYek['TP']
output_all_byTP_tmp[currEff][currSampSize][currVar] = multiplerepeats.BenjYek['TP']
##end of for currRepeat in range(0, nRepeats):
##get multiplerepeats.Bonferroni keys/fields
stats = []
for key, value in multiplerepeats.Bonferroni.iteritems():
stats.append(key)
for currstat in stats:
uncStruct[currstat][currEff][currSampSize] = np.mean(multiplerepeats.noCorrection[currstat])
uncStruct['S' + currstat][currEff][currSampSize] = np.std(multiplerepeats.noCorrection[currstat])
bonfStruct[currstat][currEff][currSampSize] = np.mean(multiplerepeats.Bonferroni[currstat])
bonfStruct['S' + currstat][currEff][currSampSize] = np.std(multiplerepeats.Bonferroni[currstat])
byStruct[currstat][currEff][currSampSize] = np.mean(multiplerepeats.BenjYek[currstat])
byStruct['S' + currstat][currEff][currSampSize] = np.std(multiplerepeats.BenjYek[currstat])
bhStruct[currstat][currEff][currSampSize] = np.mean(multiplerepeats.BenjHoch[currstat])
bhStruct['S' + currstat][currEff][currSampSize] = np.std(multiplerepeats.BenjHoch[currstat])
## end of for currEff in range(1,nEffSizes+1):
stats = []
for key, value in uncStruct.iteritems():
stats.append(key)
for i in range(0, len(stats)):
try:
storeVar[0][i] = uncStruct[stats[i]]
except IndexError:
if (nEffSizes == 1 and nSampSizes == 1):
storeVar = np.append(storeVar, np.zeros((4, 1)), axis=1)
elif (nEffSizes > 1 or nSampSizes > 1):
storeVar = np.append(storeVar, np.zeros((4, 1, nEffSizes, nSampSizes)), axis=1)
storeVar[0][i] = uncStruct[stats[i]]
try:
storeVar[1][i] = bonfStruct[stats[i]]
except IndexError:
if (nEffSizes == 1 and nSampSizes == 1):
storeVar = np.append(storeVar, np.zeros((4, 1)), axis=1)
elif (nEffSizes > 1 or nSampSizes > 1):
storeVar = np.append(storeVar, np.zeros((4, 1, nEffSizes, nSampSizes)), axis=1)
storeVar[1][i] = bonfStruct[stats[i]]
try:
storeVar[2][i] = bhStruct[stats[i]]
except IndexError:
if (nEffSizes == 1 and nSampSizes == 1):
storeVar = np.append(storeVar, np.zeros((4, 1)), axis=1)
elif (nEffSizes > 1 or nSampSizes > 1):
storeVar = np.append(storeVar, np.zeros((4, 1, nEffSizes, nSampSizes)), axis=1)
storeVar[2][i] = bhStruct[stats[i]]
try:
storeVar[3][i] = byStruct[stats[i]]
except IndexError:
if (nEffSizes == 1 and nSampSizes == 1):
storeVar = np.append(storeVar, np.zeros((4, 1)), axis=1)
elif (nEffSizes > 1 or nSampSizes > 1):
storeVar = np.append(storeVar, np.zeros((4, 1, nEffSizes, nSampSizes)), axis=1)
storeVar[3][i] = byStruct[stats[i]]
output.append(storeVar)
print('|| \n')
output.append(output_all_uncTP_tmp)
output.append(output_all_bonfTP_tmp)
output.append(output_all_bhTP_tmp)
output.append(output_all_byTP_tmp)
try:
return output
except:
print('error occurs when returning output in parallel')
def randperm1(totalLen):
##function of random permuation and pick up the sub array according to the specified size
tempList = np.random.permutation(totalLen) ##generate a random permutation array
return tempList
##=======End of PCalc_Continuous====================
##=======Beginning of PCalc_2Group====================
[docs]def PCalc_2Group(data, EffectSizes, SampSizes, SignThreshold, nSimSamp, nRepeat, cores):
"""
Calculate the confusion matrix with/without correction methods by classification.
:param data: Test Matrix. numpy array.
:type data: array
:param EffectSizes: effect size matrix, numpy array 1 x n
:type EffectSizes: array
:param SampSizes: Sample size matrix, numpy array 1 x n
:type SampSizes: array
:param SignThreshold: Threshold of significance level.This tool uses 0.05.
:type SignThreshold: float
:param nSimSamp: number of simulated samples. This tool uses 5000.
:type nSimSamp: int
:param nRepeat: number of repeats in the calculation of confusion matrix using simulated data.By default it is set to 10.
:type nRepeat: int
:param cores: number of CPU cores to be used for parallel.
:type cores: int
:return:
"""
##If sample size bigger than number of simulated samples adjust it
## global sampSizes, signThreshold, effectSizes, numVars, nRepeats, nSampSizes, nEffSizes, Samples_seg, correlationMat_seg, output2
## global output2
sampSizes = SampSizes
signThreshold = SignThreshold
effectSizes = EffectSizes
try:
if 2 * max(sampSizes) >= nSimSamp:
print('Number of simulated samples smaller than maximum of samplesizes to check - increased')
nSimSamp = 2 * max(sampSizes) + 500
except ValueError:
if 2 * max(max(sampSizes)) >= nSimSamp:
print('Number of simulated samples smaller than maximum of samplesizes to check - increased')
nSimSamp = 2 * max(max(sampSizes)) + 500
## convert matrix to numpy array type if needed
if (type(data).__name__ != 'ndarray'):
data = np.array(data)
##get data array size
size = data.shape
rows = size[0]
if (data.ndim > 1):
cols = size[1]
##Number of variables
numVars = cols
nRepeats = nRepeat
##Number of sample and effect sizes
if (sampSizes.ndim > 1):
nSampSizes = sampSizes.shape[1]
elif (sampSizes.ndim == 1):
nSampSizes = sampSizes.shape[0]
if (effectSizes.ndim > 1):
nEffSizes = effectSizes.shape[1]
elif (effectSizes.ndim == 1):
nEffSizes = effectSizes.shape[0]
##Simulation of a new data set based on multivariate normal distribution
Samples, correlationMat = simulateLogNormal(data, 'Estimate', nSimSamp)
Samples_seg = Samples
correlationMat_seg = correlationMat
output2 = []
##define an array for storing the results in each step of repeat for all variables
##with all effect sizes and sample sizes;
output_allsteps_tmp = []
## #debug - using another multiporcessing method to run f_multiproc
pool = multiprocessing.Pool(processes=cores)
output2 = [pool.apply_async(f_multiproc, args=(
sampSizes, signThreshold, effectSizes, numVars, nRepeats, nSampSizes, nEffSizes, Samples_seg, correlationMat_seg,
cols, cores, wk)) for wk in range(cores)]
output2 = [p.get(None) for p in output2]
output3 = list()
# Unpack results
for instanceOutput in output2:
for item in instanceOutput:
output3.append(item)
## output2 = Parallel(n_jobs=cores)(delayed(f_multiproc)(sampSizes, signThreshold, effectSizes, numVars, nRepeats, nSampSizes, nEffSizes, Samples_seg, correlationMat_seg, cols, cores, ii) for ii in range(cores)) #n_jobs=-1 means using all CPUs or any other number>1 to specify the number of CPUs to use
## for ii in range(numVars): ##for non-parallel running
## f_multiproc(ii)
##pass the results to output
output = []
##work out number of overall results and number of power rate results
num_overall_results = int(round(numVars / cores))
for novr in range(num_overall_results):
output.append(output3[novr])
##pass Power TPR results to output variables
output_uncTP = []
output_bonfTP = []
output_bhTP = []
output_byTP = []
output_uncTP = np.array(output3[num_overall_results])
output_bonfTP = np.array(output3[num_overall_results + 1])
output_bhTP = np.array(output3[num_overall_results + 2])
output_byTP = np.array(output3[num_overall_results + 3])
if cores > 1:
for ii in range(1, cores - 1):
for novr in range(num_overall_results):
output.append(output3[ii * (num_overall_results + 4) + novr])
output_uncTP = np.append(output_uncTP, output3[ii * (num_overall_results + 4) + num_overall_results],
axis=2)
output_bonfTP = np.append(output_bonfTP, output3[ii * (num_overall_results + 4) + num_overall_results + 1],
axis=2)
output_bhTP = np.append(output_bhTP, output3[ii * (num_overall_results + 4) + num_overall_results + 2],
axis=2)
output_byTP = np.append(output_byTP, output3[ii * (num_overall_results + 4) + num_overall_results + 3],
axis=2)
rest_num_overall_results = numVars - num_overall_results * (cores - 1)
for novr in range(rest_num_overall_results):
output.append(output3[(cores - 1) * (num_overall_results + 4) + novr])
output_uncTP = np.append(output_uncTP,
output3[(cores - 1) * (num_overall_results + 4) + rest_num_overall_results], axis=2)
output_bonfTP = np.append(output_bonfTP,
output3[(cores - 1) * (num_overall_results + 4) + rest_num_overall_results + 1],
axis=2)
output_bhTP = np.append(output_bhTP,
output3[(cores - 1) * (num_overall_results + 4) + rest_num_overall_results + 2], axis=2)
output_byTP = np.append(output_byTP,
output3[(cores - 1) * (num_overall_results + 4) + rest_num_overall_results + 3], axis=2)
output = np.array(output)
##for the mean proportion of number of variables achieve the power; and the std
output_uncTP_ratio_median = np.zeros((nEffSizes, nSampSizes))
output_bonfTP_ratio_median = np.zeros((nEffSizes, nSampSizes))
output_bhTP_ratio_median = np.zeros((nEffSizes, nSampSizes))
output_byTP_ratio_median = np.zeros((nEffSizes, nSampSizes))
output_uncTP_ratio_iqr = np.zeros((nEffSizes, nSampSizes))
output_bonfTP_ratio_iqr = np.zeros((nEffSizes, nSampSizes))
output_bhTP_ratio_iqr = np.zeros((nEffSizes, nSampSizes))
output_byTP_ratio_iqr = np.zeros((nEffSizes, nSampSizes))
for currEff in range(0, nEffSizes):
for currSamp in range(0, nSampSizes):
tmp_median_array = np.zeros(nRepeats)
for currStep in range(0, nRepeats):
tmp_median_array[currStep] = (sum(
1 for x in output_uncTP[currEff][currSamp][:, currStep] if x > 0.8)) / float(numVars)
output_uncTP_ratio_median[currEff][currSamp] = np.median(tmp_median_array)
try:
output_uncTP_ratio_iqr[currEff][currSamp] = scistats.iqr(tmp_median_array, axis=0,
interpolation='midpoint')
except AttributeError:
q75, q25 = np.percentile(tmp_median_array, [75, 25], axis=0)
output_uncTP_ratio_iqr[currEff][currSamp] = q75 - q25
tmp_median_array = np.zeros(nRepeats)
for currStep in range(0, nRepeats):
tmp_median_array[currStep] = (sum(
1 for x in output_bonfTP[currEff][currSamp][:, currStep] if x > 0.8)) / float(numVars)
output_bonfTP_ratio_median[currEff][currSamp] = np.median(tmp_median_array)
try:
output_bonfTP_ratio_iqr[currEff][currSamp] = scistats.iqr(tmp_median_array, axis=0,
interpolation='midpoint')
except AttributeError:
q75, q25 = np.percentile(tmp_median_array, [75, 25], axis=0)
output_bonfTP_ratio_iqr[currEff][currSamp] = q75 - q25
tmp_median_array = np.zeros(nRepeats)
for currStep in range(0, nRepeats):
tmp_median_array[currStep] = (sum(
1 for x in output_bhTP[currEff][currSamp][:, currStep] if x > 0.8)) / float(numVars)
output_bhTP_ratio_median[currEff][currSamp] = np.median(tmp_median_array)
try:
output_bhTP_ratio_iqr[currEff][currSamp] = scistats.iqr(tmp_median_array, axis=0,
interpolation='midpoint')
except AttributeError:
q75, q25 = np.percentile(tmp_median_array, [75, 25], axis=0)
output_bhTP_ratio_iqr[currEff][currSamp] = q75 - q25
tmp_median_array = np.zeros(nRepeats)
for currStep in range(0, nRepeats):
tmp_median_array[currStep] = (sum(
1 for x in output_byTP[currEff][currSamp][:, currStep] if x > 0.8)) / float(numVars)
output_byTP_ratio_median[currEff][currSamp] = np.median(tmp_median_array)
try:
output_byTP_ratio_iqr[currEff][currSamp] = scistats.iqr(tmp_median_array, axis=0,
interpolation='midpoint')
except AttributeError:
q75, q25 = np.percentile(tmp_median_array, [75, 25], axis=0)
output_byTP_ratio_iqr[currEff][currSamp] = q75 - q25
try:
return output, output_uncTP_ratio_median, output_bonfTP_ratio_median, output_bhTP_ratio_median, output_byTP_ratio_median, \
output_uncTP_ratio_iqr, output_bonfTP_ratio_iqr, output_bhTP_ratio_iqr, output_byTP_ratio_iqr, \
output_uncTP, output_bonfTP, output_bhTP, output_byTP
except:
print('error occurs when returning output')
[docs]def f_multiproc(sampSizes, signThreshold, effectSizes, numVars, nRepeats, nSampSizes, nEffSizes, Samples_seg,
correlationMat_seg, cols, cores, currCore):
"""
Lauch parallel computing for PCalc_2Group.
:param sampSizes: Sample size matrix, numpy array 1 x n.
:type sampSizes: array
:param signThreshold: Threshold of significance level.This tool uses 0.05.
:type signThreshold: float
:param effectSizes: Effect size matrix, numpy array 1 x n.
:type effectSizes: array
:param numVars: number of variables in current core.
:type numVars: int
:param nRepeats: number of repeats in the calculation of confusion matrix using simulated data.By default it is set to 10.
:type nRepeats: int
:param nSampSizes: length of Sample size matrix.
:type nSampSizes: int
:param nEffSizes: length of effect size matrix.
:type nEffSizes: int
:param Samples_seg: Segment of simulated sample data. Numpy array.
:type Samples_seg: array
:param correlationMat_seg: Segment of correlation matrix on the simulated sample data. Numpy array.
:type correlationMat_seg: array
:param cols: number of variables to be used for calculation.
:type cols: int
:param cores: number of CPU cores to be used.
:type cores: int
:param currCore: current CPU core in use.
:type currCore: int
:return:
"""
##re-check numVars
offSet = currCore * int(round(cols / cores))
if (currCore < (cores - 1)):
numVars = int(round(cols / cores))
else:
numVars = cols - int(round(cols / cores)) * (cores - 1)
# debug
print("numVars=%d; current core=%d" % (numVars, currCore))
##for storing all results in all repeated steps with all effect sizes and sample
##sizes for Power (TP) in current samples_seg
output_all_uncTP_tmp = np.zeros((nEffSizes, nSampSizes, numVars, nRepeats))
output_all_bonfTP_tmp = np.zeros((nEffSizes, nSampSizes, numVars, nRepeats))
output_all_bhTP_tmp = np.zeros((nEffSizes, nSampSizes, numVars, nRepeats))
output_all_byTP_tmp = np.zeros((nEffSizes, nSampSizes, numVars, nRepeats))
output = []
if (nEffSizes == 1 and nSampSizes == 1):
storeVar = np.zeros((4, 10))
elif (nEffSizes > 1 or nSampSizes > 1):
storeVar = np.zeros((4, 10, nEffSizes, nSampSizes))
##define uncStruct, bonfStruct, bhStruct, byStruct -- dictionary data
## the key's order, if retrieving by index, is FP,TN,FD,FN,SFD,STP,STN,SFN,SFP,TP
##STP-- Standard deviation for True Positive prediction; SFP -- Standard deviation for False Positive prediction
uncStruct = {'TP': np.zeros((nEffSizes, nSampSizes)), 'FP': np.zeros((nEffSizes, nSampSizes)),
'TN': np.zeros((nEffSizes, nSampSizes)), \
'FN': np.zeros((nEffSizes, nSampSizes)), 'FD': np.zeros((nEffSizes, nSampSizes)),
'STP': np.zeros((nEffSizes, nSampSizes)), \
'SFP': np.zeros((nEffSizes, nSampSizes)), 'STN': np.zeros((nEffSizes, nSampSizes)),
'SFN': np.zeros((nEffSizes, nSampSizes)), \
'SFD': np.zeros((nEffSizes, nSampSizes))}
bonfStruct = {'TP': np.zeros((nEffSizes, nSampSizes)), 'FP': np.zeros((nEffSizes, nSampSizes)),
'TN': np.zeros((nEffSizes, nSampSizes)), \
'FN': np.zeros((nEffSizes, nSampSizes)), 'FD': np.zeros((nEffSizes, nSampSizes)),
'STP': np.zeros((nEffSizes, nSampSizes)), \
'SFP': np.zeros((nEffSizes, nSampSizes)), 'STN': np.zeros((nEffSizes, nSampSizes)),
'SFN': np.zeros((nEffSizes, nSampSizes)), \
'SFD': np.zeros((nEffSizes, nSampSizes))}
bhStruct = {'TP': np.zeros((nEffSizes, nSampSizes)), 'FP': np.zeros((nEffSizes, nSampSizes)),
'TN': np.zeros((nEffSizes, nSampSizes)), \
'FN': np.zeros((nEffSizes, nSampSizes)), 'FD': np.zeros((nEffSizes, nSampSizes)),
'STP': np.zeros((nEffSizes, nSampSizes)), \
'SFP': np.zeros((nEffSizes, nSampSizes)), 'STN': np.zeros((nEffSizes, nSampSizes)),
'SFN': np.zeros((nEffSizes, nSampSizes)), \
'SFD': np.zeros((nEffSizes, nSampSizes))}
byStruct = {'TP': np.zeros((nEffSizes, nSampSizes)), 'FP': np.zeros((nEffSizes, nSampSizes)),
'TN': np.zeros((nEffSizes, nSampSizes)), \
'FN': np.zeros((nEffSizes, nSampSizes)), 'FD': np.zeros((nEffSizes, nSampSizes)),
'STP': np.zeros((nEffSizes, nSampSizes)), \
'SFP': np.zeros((nEffSizes, nSampSizes)), 'STN': np.zeros((nEffSizes, nSampSizes)),
'SFN': np.zeros((nEffSizes, nSampSizes)), \
'SFD': np.zeros((nEffSizes, nSampSizes))}
for currVar in range(0, numVars):
if (nEffSizes == 1 and nSampSizes == 1):
storeVar = np.zeros((4, 10))
elif (nEffSizes > 1 or nSampSizes > 1):
storeVar = np.zeros((4, 10, nEffSizes, nSampSizes))
for currEff in range(0, nEffSizes):
for currSampSize in range(0, nSampSizes):
## define the structural data multiplerepeats
class MUltiplerepeats(object):
def __init__(self, Results, Bonferroni, BenjHoch, BenjYek, noCorrection):
self.Results = Results
self.Bonferroni = Bonferroni
self.BenjHoch = BenjHoch
self.BenjYek = BenjYek
self.noCorrection = noCorrection
multiplerepeats = MUltiplerepeats(
{'TP': np.zeros(nRepeats), 'FP': np.zeros(nRepeats), 'TN': np.zeros(nRepeats),
'FN': np.zeros(nRepeats), 'FD': np.zeros(nRepeats)}, \
{'TP': np.zeros(nRepeats), 'FP': np.zeros(nRepeats), 'TN': np.zeros(nRepeats),
'FN': np.zeros(nRepeats), 'FD': np.zeros(nRepeats)}, \
{'TP': np.zeros(nRepeats), 'FP': np.zeros(nRepeats), 'TN': np.zeros(nRepeats),
'FN': np.zeros(nRepeats), 'FD': np.zeros(nRepeats)}, \
{'TP': np.zeros(nRepeats), 'FP': np.zeros(nRepeats), 'TN': np.zeros(nRepeats),
'FN': np.zeros(nRepeats), 'FD': np.zeros(nRepeats)}, \
{'TP': np.zeros(nRepeats), 'FP': np.zeros(nRepeats), 'TN': np.zeros(nRepeats),
'FN': np.zeros(nRepeats), 'FD': np.zeros(nRepeats)})
for currRepeat in range(0, nRepeats):
## Select a subset of the simulated spectra
selectIndex = randperm(len(Samples_seg), 2 * sampSizes[0][currSampSize]) ##sampSizes is a 1xn array
## check selectIndex is a numpy array, if not, then convert to numpy array.
if (type(selectIndex).__name__ != 'ndarray'):
selectIndex = np.array(selectIndex)
SelSamples = Samples_seg[selectIndex] # matrix slicing by rows
##Assume class balanced, modify proportion of group here
GroupId = np.ones((len(SelSamples), 1))
for i in range(int(floor(len(SelSamples) / 2)), len(SelSamples)):
GroupId[i][0] = 2
##Introduce change
corrVector = np.array([])
# corrVector = correlationMat_seg[currCore][:,currVar] ##this line caused error of calculation on 2nd or other cores
corrVector = correlationMat_seg[:, currVar + offSet]
stdSelSamples = np.std(SelSamples, axis=0, ddof=1)
for k in range(0, cols):
if (corrVector[k] > 0.8):
for j in range(0, len(GroupId)):
if (GroupId[j][0] == 2):
## stdSelSamples = np.std(SelSamples, axis=0, ddof=1)
SelSamples[j][k] = SelSamples[j][k] + effectSizes[0][currEff] * stdSelSamples[k]
##Initialize p value vector for this round
p = np.zeros((1, cols))
for var2check in range(0, cols):
p[0][var2check] = scistats.f_oneway(SelSamples[0:int(len(SelSamples) / 2), var2check],
SelSamples[int(len(SelSamples) / 2):len(SelSamples),
var2check])[1]
## tempSamples1 = []
## tempSamples2 = []
## for i in range(0, len(SelSamples)):
## if (GroupId[i][0]==1):
## tempSamples1.append(SelSamples[i][var2check])
## if (GroupId[i][0]==2):
## tempSamples2.append(SelSamples[i][var2check])
## p[0][var2check] = scistats.f_oneway(tempSamples1,tempSamples2)[1]
pUnc = p ##pUnc and p have 1xnumVars elements
pBonf = p * cols ##pBonf has 1xnumVars elements
h1, crit_p, adj_ci_cvrg, pBY = fdr_bh(p, 0.05, 'dep')
h1, crit_p, adj_ci_cvrg, pBH = fdr_bh(p, 0.05, 'pdep')
uncTNTot, uncTPTot, uncFPTot, uncFNTot, uncFDTot = calcConfMatrixUniv(pUnc, corrVector,
signThreshold, 0.8)
bonfTNTot, bonfTPTot, bonfFPTot, bonfFNTot, bonfFDTot = calcConfMatrixUniv(pBonf, corrVector,
signThreshold, 0.8)
byTNTot, byTPTot, byFPTot, byFNTot, byFDTot = calcConfMatrixUniv(pBY, corrVector, signThreshold,
0.8)
bhTNTot, bhTPTot, bhFPTot, bhFNTot, bhFDTot = calcConfMatrixUniv(pBH, corrVector, signThreshold,
0.8)
try:
multiplerepeats.noCorrection['TP'][currRepeat] = uncTPTot
except IndexError:
multiplerepeats.noCorrection['TP'] = np.append(multiplerepeats.noCorrection['TP'],
uncTPTot) # if array index exceeds upper bound, extend the array
try:
multiplerepeats.noCorrection['FP'][currRepeat] = uncFPTot
except IndexError:
multiplerepeats.noCorrection['FP'] = np.append(multiplerepeats.noCorrection['FP'], uncFPTot)
try:
multiplerepeats.noCorrection['TN'][currRepeat] = uncTNTot
except IndexError:
multiplerepeats.noCorrection['TN'] = np.append(multiplerepeats.noCorrection['TN'], uncTNTot)
try:
multiplerepeats.noCorrection['FN'][currRepeat] = uncFNTot
except IndexError:
multiplerepeats.noCorrection['FN'] = np.append(multiplerepeats.noCorrection['FN'], uncFNTot)
try:
multiplerepeats.noCorrection['FD'][currRepeat] = uncFDTot
except IndexError:
multiplerepeats.noCorrection['FD'] = np.append(multiplerepeats.noCorrection['FD'], uncFDTot)
try:
multiplerepeats.Bonferroni['TP'][currRepeat] = bonfTPTot
except IndexError:
multiplerepeats.Bonferroni['TP'] = np.append(multiplerepeats.Bonferroni['TP'], bonfTPTot)
try:
multiplerepeats.Bonferroni['FP'][currRepeat] = bonfFPTot
except IndexError:
multiplerepeats.Bonferroni['FP'] = np.append(multiplerepeats.Bonferroni['FP'], bonfFPTot)
try:
multiplerepeats.Bonferroni['TN'][currRepeat] = bonfTNTot
except IndexError:
multiplerepeats.Bonferroni['TN'] = np.append(multiplerepeats.Bonferroni['TN'], bonfTNTot)
try:
multiplerepeats.Bonferroni['FN'][currRepeat] = bonfFNTot
except IndexError:
multiplerepeats.Bonferroni['FN'] = np.append(multiplerepeats.Bonferroni['FN'], bonfFNTot)
try:
multiplerepeats.Bonferroni['FD'][currRepeat] = bonfFDTot
except IndexError:
multiplerepeats.Bonferroni['FD'] = np.append(multiplerepeats.Bonferroni['FD'], bonfFDTot)
try:
multiplerepeats.BenjHoch['TP'][currRepeat] = bhTPTot
except IndexError:
multiplerepeats.BenjHoch['TP'] = np.append(multiplerepeats.BenjHoch['TP'], bhTPTot)
try:
multiplerepeats.BenjHoch['FP'][currRepeat] = bhFPTot
except IndexError:
multiplerepeats.BenjHoch['FP'] = np.append(multiplerepeats.BenjHoch['FP'], bhFPTot)
try:
multiplerepeats.BenjHoch['TN'][currRepeat] = bhTNTot
except IndexError:
multiplerepeats.BenjHoch['TN'] = np.append(multiplerepeats.BenjHoch['TN'], bhTNTot)
try:
multiplerepeats.BenjHoch['FN'][currRepeat] = bhFNTot
except IndexError:
multiplerepeats.BenjHoch['FN'] = np.append(multiplerepeats.BenjHoch['FN'], bhFNTot)
try:
multiplerepeats.BenjHoch['FD'][currRepeat] = bhFDTot
except IndexError:
multiplerepeats.BenjHoch['FD'] = np.append(multiplerepeats.BenjHoch['FD'], bhFDTot)
try:
multiplerepeats.BenjYek['TP'][currRepeat] = byTPTot
except IndexError:
multiplerepeats.BenjYek['TP'] = np.append(multiplerepeats.BenjYek['TP'], byTPTot)
try:
multiplerepeats.BenjYek['FP'][currRepeat] = byFPTot
except IndexError:
multiplerepeats.BenjYek['FP'] = np.append(multiplerepeats.BenjYek['FP'], byFPTot)
try:
multiplerepeats.BenjYek['TN'][currRepeat] = byTNTot
except IndexError:
multiplerepeats.BenjYek['TN'] = np.append(multiplerepeats.BenjYek['TN'], byTNTot)
try:
multiplerepeats.BenjYek['FN'][currRepeat] = byFNTot
except IndexError:
multiplerepeats.BenjYek['FN'] = np.append(multiplerepeats.BenjYek['FN'], byFNTot)
try:
multiplerepeats.BenjYek['FD'][currRepeat] = byFDTot
except IndexError:
multiplerepeats.BenjYek['FD'] = np.append(multiplerepeats.BenjYek['FD'], byFDTot)
output_all_uncTP_tmp[currEff][currSampSize][currVar] = multiplerepeats.noCorrection['TP']
output_all_bonfTP_tmp[currEff][currSampSize][currVar] = multiplerepeats.Bonferroni['TP']
output_all_bhTP_tmp[currEff][currSampSize][currVar] = multiplerepeats.BenjYek['TP']
output_all_byTP_tmp[currEff][currSampSize][currVar] = multiplerepeats.BenjYek['TP']
##get multiplerepeats.Bonferroni keys/fields
stats = []
for key, value in multiplerepeats.Bonferroni.iteritems():
stats.append(key)
for currstat in stats:
uncStruct[currstat][currEff][currSampSize] = np.mean(multiplerepeats.noCorrection[currstat])
uncStruct['S' + currstat][currEff][currSampSize] = np.std(multiplerepeats.noCorrection[currstat])
bonfStruct[currstat][currEff][currSampSize] = np.mean(multiplerepeats.Bonferroni[currstat])
bonfStruct['S' + currstat][currEff][currSampSize] = np.std(multiplerepeats.Bonferroni[currstat])
byStruct[currstat][currEff][currSampSize] = np.mean(multiplerepeats.BenjYek[currstat])
byStruct['S' + currstat][currEff][currSampSize] = np.std(multiplerepeats.BenjYek[currstat])
bhStruct[currstat][currEff][currSampSize] = np.mean(multiplerepeats.BenjHoch[currstat])
bhStruct['S' + currstat][currEff][currSampSize] = np.std(multiplerepeats.BenjHoch[currstat])
## end of for currEff in range(0,nEffSizes):
stats = []
for key, value in uncStruct.iteritems():
stats.append(key)
for i in range(0, len(stats)):
try:
storeVar[0][i] = uncStruct[stats[i]]
except IndexError:
if (nEffSizes == 1 and nSampSizes == 1):
storeVar = np.append(storeVar, np.zeros((4, 1)), axis=1)
elif (nEffSizes > 1 or nSampSizes > 1):
storeVar = np.append(storeVar, np.zeros((4, 1, nEffSizes, nSampSizes)), axis=1)
storeVar[0][i] = uncStruct[stats[i]]
try:
storeVar[1][i] = bonfStruct[stats[i]]
except IndexError:
if (nEffSizes == 1 and nSampSizes == 1):
storeVar = np.append(storeVar, np.zeros((4, 1)), axis=1)
elif (nEffSizes > 1 or nSampSizes > 1):
storeVar = np.append(storeVar, np.zeros((4, 1, nEffSizes, nSampSizes)), axis=1)
storeVar[1][i] = bonfStruct[stats[i]]
try:
storeVar[2][i] = bhStruct[stats[i]]
except IndexError:
if (nEffSizes == 1 and nSampSizes == 1):
storeVar = np.append(storeVar, np.zeros((4, 1)), axis=1)
elif (nEffSizes > 1 or nSampSizes > 1):
storeVar = np.append(storeVar, np.zeros((4, 1, nEffSizes, nSampSizes)), axis=1)
storeVar[2][i] = bhStruct[stats[i]]
try:
storeVar[3][i] = byStruct[stats[i]]
except IndexError:
if (nEffSizes == 1 and nSampSizes == 1):
storeVar = np.append(storeVar, np.zeros((4, 1)), axis=1)
elif (nEffSizes > 1 or nSampSizes > 1):
storeVar = np.append(storeVar, np.zeros((4, 1, nEffSizes, nSampSizes)), axis=1)
storeVar[3][i] = byStruct[stats[i]]
output.append(storeVar)
print('| \n')
output.append(output_all_uncTP_tmp)
output.append(output_all_bonfTP_tmp)
output.append(output_all_bhTP_tmp)
output.append(output_all_byTP_tmp)
try:
return output
except:
print('error occurs when returning output in parallel')
[docs]def randperm(totalLen, subLen):
"""
This function is for random permuation and pick up the sub array according to the specified size
:param totalLen: Total length.
:type totalLen: int
:param subLen: Sub length.
:type subLen: int
:return:
"""
##function of random permuation and pick up the sub array according to the specified size
##np.random.seed(10) ##add random seed for testing purpose
tempList = np.random.permutation(totalLen) ##generate a random permutation array
##random.seed(10) ##add random seed for testing purpose
try:
tempList1 = random.sample(tempList, subLen)
except TypeError:
tempList1 = random.sample(list(tempList), subLen)
return tempList1
##=======End of PCalc_2Group====================
[docs]def expecting():
"""
function expecting is for detecting the number of outputs; written by Sami Hangaslammi
Return how many values the caller is expecting
:return:
"""
f = inspect.currentframe()
f = f.f_back.f_back
c = f.f_code
i = f.f_lasti
bytecode = c.co_code
instruction = ord(bytecode[i + 3])
if instruction == dis.opmap['UNPACK_SEQUENCE']:
howmany = ord(bytecode[i + 4])
return howmany
elif instruction == dis.opmap['POP_TOP']:
return 0
return 1
[docs]def fdr_bh(*args):
"""
Function false discovery rate by Benjamini-Hochberg correction
:param args:
:return:
"""
try:
pvals = args[0]
##convert to numpy array type if not the ndarray type
if (type(pvals).__name__ != 'ndarray'):
pvals = np.array(pvals)
except IndexError:
print("Usage: fdr_bh(<arg1>,<arg2>,<arg3>,<arg4>)")
print("arg1 as p-value matrix (mandatory must be provided")
print("arg2 as false discovery rate(optional)")
print("arg3 as method:'pdep' or 'dep', 'pdep' is given as default(optional)")
print("arg4 as report:'yes' or 'no', 'no' is given as default(optional)")
sys.exit(1)
if len(args) < 2:
q = 0.05
else:
q = args[1]
if len(args) < 3:
method = 'pdep' #'pdep' is for Benjamini-Hochberg correction method
else:
method = args[2]
if len(args) < 4:
report = 'no'
else:
report = args[3]
s = pvals.shape
if (pvals.ndim > 1): # if pvals has more than 1 rows, reshape into 1 row array
reshaped_pvals = np.reshape(pvals, (1, np.prod(s)))
p_sorted = np.sort(reshaped_pvals)
sort_ids = np.argsort(reshaped_pvals)
else: # pvals is already 1xn array
p_sorted = np.sort(pvals)
sort_ids = np.argsort(pvals)
dummy = np.sort(sort_ids)
unsort_ids = np.argsort(sort_ids)
if (type(p_sorted[0]).__name__ == 'ndarray'):
m = len(p_sorted[0])
else:
m = len(p_sorted)
if (method == 'pdep'):
##BH procedure for independence or positive dependence
thresh = np.arange(1, m + 1) * q / m
wtd_p = m * p_sorted / np.arange(1, m + 1)
elif (method == 'dep'):
##BH procedure for any dependency structure
denom = m * sum(1.0 / np.arange(1, m + 1))
thresh = np.arange(1, m + 1) * q / denom
wtd_p = denom * p_sorted / np.arange(1, m + 1)
'''
Note, it can produce adjusted p-values greater than 1!
compute adjusted p-values
'''
else:
print('Argument \'method\' needs to be \'pdep\' or \'dep\'.')
nargout = expecting() # get the number of expecting outputs from caller
if (nargout > 3):
##compute adjusted p-values
adj_p = np.zeros(m) * float('NaN')
wtd_p_sorted = np.sort(wtd_p)
wtd_p_sindex = np.argsort(wtd_p)
nextfill = 0
for k in range(0, m):
if (wtd_p_sindex[0][k] >= nextfill):
adj_p[nextfill:(wtd_p_sindex[0][k] + 1)] = wtd_p_sorted[0][k]
nextfill = wtd_p_sindex[0][k] + 1
if (nextfill > m):
break
adj_p = np.reshape(adj_p[unsort_ids], s)
rej = p_sorted <= thresh
try:
max_id = max(max(np.where(rej[0] == True))) # find greatest significant pvalue
except ValueError:
max_id = max(np.where(rej[0] == True))
if not max_id:
## if the max_id is empty
crit_p = 0
h1 = pvals * 0
adj_ci_cvrg = float('NaN')
else:
crit_p = p_sorted[0][max_id]
h1 = pvals <= crit_p
adj_ci_cvrg = 1 - thresh[max_id]
if (report == 'yes'):
n_sig = sum(p_sorted <= crit_p)
if (n_sig == 1):
print('Out of %d tests, %d is significant using a false discovery rate of %f.\n' % (m, n_sig, q))
else:
print('Out of %d tests, %d are significant using a false discovery rate of %f.\n' % (m, n_sig, q))
if (method == 'pdep'):
print('FDR/FCR procedure used is guaranteed valid for independent or positively dependent tests.\n')
else:
print('FDR/FCR procedure used is guaranteed valid for independent or dependent tests.\n')
## return the results
try:
return h1, crit_p, adj_ci_cvrg, adj_p
except:
print("Errors occur when returning h1, crit_p, adj_ci_cvrg and adj_p")
[docs]def calcConfMatrixUniv(p, corrVector, signThreshold, corrThresh):
"""
Function of calculating confusion matrix
:param p: p_value matrix. numpy array
:type p: array
:param corrVector: correlation vector.
:type corrVector: array
:param signThreshold: Significance threshold.
:type signThreshold: float
:param corrThresh: power level
:type corrThresh: float
:return:
"""
TP = 0.0
TN = 0.0
FP = 0.0
FN = 0.0
if (type(p).__name__ != 'ndarray'):
p = np.array(p)
Pf = p < signThreshold
try:
nVars = p.shape[1]
except IndexError:
nVars = p.shape[0]
for i in range(0, nVars):
# debugging
# print "fabsfabs(corrVector[%d])=%f; Pf[0][%d]=%r"%(i,fabs(corrVector[i]),i, Pf[0][i])
if ((fabs(corrVector[i]) < corrThresh) and (Pf[0][i] == False)):
TN = TN + 1
# print "TN=%d"%(TN)
elif ((fabs(corrVector[i]) > corrThresh) and (Pf[0][i] == True)):
TP = TP + 1
# print "TP=%d"%(TP)
elif ((fabs(corrVector[i]) > corrThresh) and (Pf[0][i] == False)):
FN = FN + 1
# print "FN=%d"%(FN)
elif ((fabs(corrVector[i]) < corrThresh) and (Pf[0][i] == True)):
FP = FP + 1
# print "FP=%d"%(FP)
try:
TNtot = TN / (FP + TN)
except ZeroDivisionError:
TNtot = float('NaN')
try: # TPR - power
TPtot = TP / (TP + FN)
except ZeroDivisionError:
TPtot = float('NaN')
# TPtot = 0.0
try:
FPtot = FP / (FP + TN)
except ZeroDivisionError:
FPtot = float('NaN')
try:
FNtot = FN / (TP + FN)
except ZeroDivisionError:
FNtot = float('NaN')
try:
FDtot = FP / (TP + FP)
except ZeroDivisionError:
FDtot = float('NaN')
## return the results
try:
return TNtot, TPtot, FPtot, FNtot, FDtot
except:
print("Errors occur when returning uncTNTot, uncTPTot, uncFPTot, uncFNTot, uncFDTot")
def read2array(filename):
dataArray = []
try:
with open(filename) as infile:
for line in infile:
dataArray.append(line.strip().split(','))
dataArray = [[float(x) for x in y] for y in
dataArray] # The array was created with all elements as strings. Convert into floats.
dataArray = np.array(dataArray) # convert to numpy array type
except IOError:
print(filename + " does not exist!")
return dataArray
def main(argv1, argv2, argv3, argv4, argv5, argv6, argv7):
## read the data into an array;
XSRV = read2array(argv1)
if (type(XSRV).__name__ != 'ndarray'):
XSRV = np.array(XSRV)
##print array size
if (XSRV.ndim > 1):
rows = XSRV.shape[0]
cols = XSRV.shape[1]
elif (XSRV.ndim == 1):
rows = 1
cols = XSRV.shape[0]
print('Input data matrix size is :' + str(rows) + ',' + str(cols))
tmpStr = argv2.split('-')
if len(tmpStr) > 1:
argv2 = [int(tmpStr[0]), int(tmpStr[1]) + 1]
else:
argv2 = [0, int(argv2)]
# debugging
print(argv2[0], argv2[1])
tmpStr = argv3.split(':')
argv3 = range(int(tmpStr[0]), int(tmpStr[2]), int(tmpStr[1]))
if argv3[0] < 1:
argv3[0] = 1
argv3 = np.array(argv3)
argv3 = np.reshape(argv3, (1, len(argv3)))
tmpStr = argv4.split(':')
argv4 = np.arange(float(tmpStr[0]), float(tmpStr[2]), float(tmpStr[1]))
if argv4[0] == 0:
argv4[0] = 0.05
argv4 = np.array(argv4)
argv4 = np.reshape(argv4, (1, len(argv4)))
sampleSizes = argv3 # np.array([[1, 50, 100, 200, 250, 350, 500, 750, 1000]])
effectSizes = argv4 # np.array([[0.05, 0.1, 0.15,0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8]])
##define output metric options
metric_opt = np.array([1, 2, 3, 4]) # see options description below
correction_opt = np.array([1, 2, 3, 4]) # see correction options description below
numberreps = int(argv5)
outcome_type = int(argv6)
cores = int(argv7)
## ## Calculat for a subset of 4 variables (less than 20 seconds on 4-core desktop for each analysis)
diffgroups = np.array([])
linearregression = np.array([])
t_start = datetime.now()
num_cols = int(argv2[1]) - int(argv2[0])
##if the number of variables is less than the request CPU cores, use number of variables as cores.
if (num_cols < cores):
cores = num_cols
if (num_cols > 0):
if (outcome_type == 0 or outcome_type == 2):
diffgroups, output_uncTP_ratio_median, output_bonfTP_ratio_median, output_bhTP_ratio_median, output_byTP_ratio_median, \
output_uncTP_ratio_iqr, output_bonfTP_ratio_iqr, output_bhTP_ratio_iqr, output_byTP_ratio_iqr, \
output_uncTP, output_bonfTP, output_bhTP, output_byTP \
= PCalc_2Group(XSRV[:, np.arange(int(argv2[0]), int(argv2[1]))], effectSizes, sampleSizes, 0.05, 5000,
numberreps, cores)
if (outcome_type == 1 or outcome_type == 2):
linearregression, output_uncTP_ratio_median_ln, output_bonfTP_ratio_median_ln, output_bhTP_ratio_median_ln, output_byTP_ratio_median_ln, \
output_uncTP_ratio_iqr_ln, output_bonfTP_ratio_iqr_ln, output_bhTP_ratio_iqr_ln, output_byTP_ratio_iqr_ln, \
output_uncTP, output_bonfTP, output_bhTP, output_byTP \
= PCalc_Continuous(XSRV[:, np.arange(int(argv2[0]), int(argv2[1]))], effectSizes, sampleSizes, 0.05,
5000, numberreps, cores)
t_end = datetime.now()
print('Time elapsed: ' + str(t_end - t_start))
else:
t_start = datetime.now()
if (outcome_type == 0 or outcome_type == 2):
diffgroups, output_uncTP_ratio_median, output_bonfTP_ratio_median, output_bhTP_ratio_median, output_byTP_ratio_median, \
output_uncTP_ratio_iqr, output_bonfTP_ratio_iqr, output_bhTP_ratio_iqr, output_byTP_ratio_iqr, \
output_uncTP, output_bonfTP, output_bhTP, output_byTP \
= PCalc_2Group(XSRV, effectSizes, sampleSizes, 0.05, 5000, numberreps, cores)
if (outcome_type == 1 or outcome_type == 2):
linearregression, output_uncTP_ratio_median_ln, output_bonfTP_ratio_median_ln, output_bhTP_ratio_median_ln, output_byTP_ratio_median_ln, \
output_uncTP_ratio_iqr_ln, output_bonfTP_ratio_iqr_ln, output_bhTP_ratio_iqr_ln, output_byTP_ratio_iqr_ln, \
output_uncTP, output_bonfTP, output_bhTP, output_byTP \
= PCalc_Continuous(XSRV, effectSizes, sampleSizes, 0.05, 5000, numberreps, cores)
t_end = datetime.now()
print('Time elapsed: ' + str(t_end - t_start))
##diffgroups has dimension of (number of variables, 4, 10, effectsize, samplesize);
##number of variables is the input number of columns from the input dataset.
##4-- 4 correction options
##10--10 metric as "TP","FP","TN","FN","FD","STP","SFP","STN","SFN","SFD"
'''
Using the SurfacePlot function to visualize results
SurfacePlot(output, variable,metric,correction, sizeeff,samplsizes,nreps)
Output is the structure returned from the simulator, variable is the index of variable to plot
metric is the to display and correction the type of multiple testing correction to
visualize.
Metric options:
1 - True positive Rate
2 - False Positive Rate
3 - True Negative Rate
4 - False Negative Rate
Correction:
1 - No correction
2 - Bonferroni
3 - Benjamini-Hochberg
4 - Benjamini-Yekutieli
The example line below will open the False Negative Rate surface for
variable number 2 without multiple testing correction
'''
##write diffgroups and linearregression into file for testing purpose
##np.savetxt('diffgroups.csv',diffgroups[1][3][1], delimiter=",")
##np.savetxt('linearregression.csv',linearregression[1][3][1], delimiter=",")
if not os.path.exists('papy_output'):
os.makedirs('papy_output')
##file names matrix
sv_filenames = np.array([['fpn', 'fpb', 'fpbh', 'fpby'], ['tnn', 'tnb', 'tnbh', 'tnby'], \
['fdn', 'fdb', 'fdbh', 'fdby'], ['fnn', 'fnb', 'fnbh', 'fnby'], \
['sfdn', 'sfdb', 'sfdbh', 'sfdby'], ['stpn', 'stpb', 'stpbh', 'stpby'], \
['stnn', 'stnb', 'stnbh', 'stnby'], ['sfnn', 'sfnb', 'sfnbh', 'sfnby'], \
['sfpn', 'sfpb', 'sfpbh', 'sfpby'], ['tpn', 'tpb', 'tpbh', 'tpby']])
##save the effect sizes and sample sizes
file_handle = file('papy_output/effect_n_sample_sizes.txt', 'a')
np.savetxt(file_handle, np.array(['effect sizes']), fmt='%s')
np.savetxt(file_handle, effectSizes, delimiter=",", fmt='%.3f')
np.savetxt(file_handle, np.array(['sample sizes']), fmt='%s')
np.savetxt(file_handle, sampleSizes, delimiter=",", fmt='%.3f')
file_handle.close()
##save files. jj- Metric options; kk- Correction options; ii- Variable number; for example: jj=1, kk=1 mean tpn-- true positive no correction.
for jj in range(0, sv_filenames.shape[0]):
for kk in range(0, sv_filenames.shape[1]):
# if (jj==2 and kk > 0):
# break;
# if (jj==4 and kk > 0):
# break;
if (outcome_type == 0 or outcome_type == 2):
file_handle = file('papy_output/diffgroups-%s.csv' % (sv_filenames[jj][kk]), 'a')
##write the title line with columns "variables, Sample Sizes (Effect Sizes as columns), and Effect Sizes"
title_str = np.append(np.array([['Variables', 'Effect Sizes (Sample Sizes in Columns)']]),
sampleSizes.astype('str'), axis=1)
np.savetxt(file_handle, title_str, delimiter=',', fmt='%s')
##write rest of output matrix
for ii in range(0, num_cols): ##num_cols is from the test dataset, means number of variables
np.savetxt(file_handle, np.insert(diffgroups[ii][kk][jj], [0], np.insert(effectSizes.T, [0],
np.ones(
[effectSizes.shape[1],
1]) * (ii + 1),
axis=1), axis=1),
delimiter=",", fmt='%.5f')
file_handle.close()
if (outcome_type == 1 or outcome_type == 2):
file_handle = file('papy_output/linearregression-%s.csv' % (sv_filenames[jj][kk]), 'a')
##write the title line with columns "variables, Sample Sizes (Effect Sizes as columns), and Effect Sizes"
title_str = np.append(np.array([['Variables', 'Effect Sizes (Sample Sizes in Columns)']]),
sampleSizes.astype('str'), axis=1)
np.savetxt(file_handle, title_str, delimiter=',', fmt='%s')
##write rest of matrix
for ii in range(0, num_cols): ##num_cols is from the test dataset, means number of variables
np.savetxt(file_handle, np.insert(linearregression[ii][kk][jj], [0], np.insert(effectSizes.T, [0],
np.ones([
effectSizes.shape[
1],
1]) * (
ii + 1), axis=1),
axis=1), delimiter=",", fmt='%.5f')
file_handle.close()
if (outcome_type == 0 or outcome_type == 2):
##plot the surfaces of power rate acrossing the combination of effectSize and SampleSize (classfied)
iSurfacePlotTPR(output_uncTP_ratio_median, 'papy_output/plot-power-rate-noCorrection-diffgroups.html',
'no correction', sampleSizes, effectSizes, numberreps)
iSurfacePlotTPR(output_bonfTP_ratio_median, 'papy_output/plot-power-rate-bonfCorrection-diffgroups.html',
'Bonferroni correction', sampleSizes, effectSizes, numberreps)
iSurfacePlotTPR(output_bhTP_ratio_median, 'papy_output/plot-power-rate-bhCorrection-diffgroups.html',
'Benjamini-Hochberg correction', sampleSizes, effectSizes, numberreps)
iSurfacePlotTPR(output_byTP_ratio_median, 'papy_output/plot-power-rate-byCorrection-diffgroups.html',
'Benjamini-Yekutieli correction', sampleSizes, effectSizes, numberreps)
##plot the slice of surfaces power rate; x-axis is based on sample size (columns)
## 2nd row, mid row, and the 2nd last row
slice_rows = np.array([1, int(floor(effectSizes.shape[1] / 2)), effectSizes.shape[1] - 2])
Y_temp = []
Y_std_temp = []
for ll in slice_rows:
Y_temp.append(output_uncTP_ratio_median[ll, :])
Y_std_temp.append(output_uncTP_ratio_iqr[ll, :])
iSlicesPlot(sampleSizes[0], Y_temp, Y_std_temp, \
'papy_output/plot-slice-power-rate-noCorrection-diffgroups.html', \
'Proportion of Variables with Power (True Positive)> 0.8', \
'Sample Size', 'tpn', 'Effect Size=', effectSizes[:, slice_rows])
Y_temp = []
Y_std_temp = []
for ll in slice_rows:
Y_temp.append(output_bonfTP_ratio_median[ll, :])
Y_std_temp.append(output_bonfTP_ratio_iqr[ll, :])
iSlicesPlot(sampleSizes[0], Y_temp, Y_std_temp, \
'papy_output/plot-slice-power-rate-bonfCorrection-diffgroups.html', \
'Proportion of Variables with Power (True Positive)> 0.8', \
'Sample Size', 'tpb', 'Effect Size=', effectSizes[:, slice_rows])
Y_temp = []
Y_std_temp = []
for ll in slice_rows:
Y_temp.append(output_bhTP_ratio_median[ll, :])
Y_std_temp.append(output_bhTP_ratio_iqr[ll, :])
iSlicesPlot(sampleSizes[0], Y_temp, Y_std_temp, \
'papy_output/plot-slice-power-rate-bhCorrection-diffgroups.html', \
'Proportion of Variables with Power (True Positive)> 0.8', \
'Sample Size', 'tpbh', 'Effect Size=', effectSizes[:, slice_rows])
Y_temp = []
Y_std_temp = []
for ll in slice_rows:
Y_temp.append(output_byTP_ratio_median[ll, :])
Y_std_temp.append(output_byTP_ratio_iqr[ll, :])
iSlicesPlot(sampleSizes[0], Y_temp, Y_std_temp, \
'papy_output/plot-slice-power-rate-byCorrection-diffgroups.html', \
'Proportion of Variables with Power (True Positive)> 0.8', \
'Sample Size', 'tpby', 'Effect Size=', effectSizes[:, slice_rows])
##plot the slice of surfaces power rate; x-axis is based on effect size (rows)
## 2nd col, mid col, and the 2nd last col
slice_cols = np.array([1, int(floor(sampleSizes.shape[1] / 2)), sampleSizes.shape[1] - 2])
Y_temp = []
Y_std_temp = []
for ll in slice_cols:
Y_temp.append(output_uncTP_ratio_median[:, ll])
Y_std_temp.append(output_uncTP_ratio_iqr[:, ll])
iSlicesPlot(effectSizes[0], Y_temp, Y_std_temp, \
'papy_output/plot-slice-power-rate-noCorrection-diffgroups-eff.html', \
'Proportion of Variables with Power (True Positive)> 0.8', \
'Effect Size', 'tpn', 'Sample Size=', sampleSizes[:, slice_cols])
Y_temp = []
Y_std_temp = []
for ll in slice_cols:
Y_temp.append(output_bonfTP_ratio_median[:, ll])
Y_std_temp.append(output_bonfTP_ratio_iqr[:, ll])
iSlicesPlot(effectSizes[0], Y_temp, Y_std_temp, \
'papy_output/plot-slice-power-rate-bonfCorrection-diffgroups-eff.html', \
'Proportion of Variables with Power (True Positive)> 0.8', \
'Effect Size', 'tpb', 'Sample Size=', sampleSizes[:, slice_cols])
Y_temp = []
Y_std_temp = []
for ll in slice_cols:
Y_temp.append(output_bhTP_ratio_median[:, ll])
Y_std_temp.append(output_bhTP_ratio_iqr[:, ll])
iSlicesPlot(effectSizes[0], Y_temp, Y_std_temp, \
'papy_output/plot-slice-power-rate-bhCorrection-diffgroups-eff.html', \
'Proportion of Variables with Power (True Positive)> 0.8', \
'Effect Size', 'tpbh', 'Sample Size=', sampleSizes[:, slice_cols])
Y_temp = []
Y_std_temp = []
for ll in slice_cols:
Y_temp.append(output_byTP_ratio_median[:, ll])
Y_std_temp.append(output_byTP_ratio_iqr[:, ll])
iSlicesPlot(effectSizes[0], Y_temp, Y_std_temp, \
'papy_output/plot-slice-power-rate-byCorrection-diffgroups-eff.html', \
'Proportion of Variables with Power (True Positive)> 0.8', \
'Effect Size', 'tpby', 'Sample Size=', sampleSizes[:, slice_cols])
if (outcome_type == 1 or outcome_type == 2):
##plot the surfaces of power rate acrossing the combination of effectSize and SampleSize (linear regression)
iSurfacePlotTPR(output_uncTP_ratio_median_ln, 'papy_output/plot-power-rate-noCorrection-linearregression.html',
'no correction', sampleSizes, effectSizes, numberreps)
iSurfacePlotTPR(output_bonfTP_ratio_median_ln,
'papy_output/plot-power-rate-bonfCorrection-linearregression.html', 'Bonferroni correction',
sampleSizes, effectSizes, numberreps)
iSurfacePlotTPR(output_bhTP_ratio_median_ln, 'papy_output/plot-power-rate-bhCorrection-linearregression.html',
'Benjamini-Hochberg correction', sampleSizes, effectSizes, numberreps)
iSurfacePlotTPR(output_byTP_ratio_median_ln, 'papy_output/plot-power-rate-byCorrection-linearregression.html',
'Benjamini-Yekutieli correction', sampleSizes, effectSizes, numberreps)
## (linear regression)
##plot the slice of surfaces power rate; x-axis is based on sample size (columns)
## 2nd row, mid row, and the 2nd last row
slice_rows = np.array([1, int(floor(effectSizes.shape[1] / 2)), effectSizes.shape[1] - 2])
Y_temp = []
Y_std_temp = []
for ll in slice_rows:
Y_temp.append(output_uncTP_ratio_median_ln[ll, :])
Y_std_temp.append(output_uncTP_ratio_iqr_ln[ll, :])
iSlicesPlot(sampleSizes[0], Y_temp, Y_std_temp, \
'papy_output/plot-slice-power-rate-noCorrection-ln.html', \
'Proportion of Variables with Power (True Positive)> 0.8 (linear-regression)', \
'Sample Size', 'tpn', 'Effect Size=', effectSizes[:, slice_rows])
Y_temp = []
Y_std_temp = []
for ll in slice_rows:
Y_temp.append(output_bonfTP_ratio_median_ln[ll, :])
Y_std_temp.append(output_bonfTP_ratio_iqr_ln[ll, :])
iSlicesPlot(sampleSizes[0], Y_temp, Y_std_temp, \
'papy_output/plot-slice-power-rate-bonfCorrection-ln.html', \
'Proportion of Variables with Power (True Positive)> 0.8 (linear-regression)', \
'Sample Size', 'tpb', 'Effect Size=', effectSizes[:, slice_rows])
Y_temp = []
Y_std_temp = []
for ll in slice_rows:
Y_temp.append(output_bhTP_ratio_median_ln[ll, :])
Y_std_temp.append(output_bhTP_ratio_iqr_ln[ll, :])
iSlicesPlot(sampleSizes[0], Y_temp, Y_std_temp, \
'papy_output/plot-slice-power-rate-bhCorrection-ln.html', \
'Proportion of Variables with Power (True Positive)> 0.8 (linear-regression)', \
'Sample Size', 'tpbh', 'Effect Size=', effectSizes[:, slice_rows])
Y_temp = []
Y_std_temp = []
for ll in slice_rows:
Y_temp.append(output_byTP_ratio_median_ln[ll, :])
Y_std_temp.append(output_byTP_ratio_iqr_ln[ll, :])
iSlicesPlot(sampleSizes[0], Y_temp, Y_std_temp, \
'papy_output/plot-slice-power-rate-byCorrection-ln.html', \
'Proportion of Variables with Power (True Positive)> 0.8 (linear-regression)', \
'Sample Size', 'tpby', 'Effect Size=', effectSizes[:, slice_rows])
##plot the slice of surfaces power rate; x-axis is based on effect size (rows)
## 2nd col, mid col, and the 2nd last col
slice_cols = np.array([1, int(floor(sampleSizes.shape[1] / 2)), sampleSizes.shape[1] - 2])
Y_temp = []
Y_std_temp = []
for ll in slice_cols:
Y_temp.append(output_uncTP_ratio_median_ln[:, ll])
Y_std_temp.append(output_uncTP_ratio_iqr_ln[:, ll])
iSlicesPlot(effectSizes[0], Y_temp, Y_std_temp, \
'papy_output/plot-slice-power-rate-noCorrection-ln-eff.html', \
'Proportion of Variables with Power (True Positive)> 0.8 (linear-regression)', \
'Effect Size', 'tpn', 'Sample Size=', sampleSizes[:, slice_cols])
Y_temp = []
Y_std_temp = []
for ll in slice_cols:
Y_temp.append(output_bonfTP_ratio_median_ln[:, ll])
Y_std_temp.append(output_bonfTP_ratio_iqr_ln[:, ll])
iSlicesPlot(effectSizes[0], Y_temp, Y_std_temp, \
'papy_output/plot-slice-power-rate-bonfCorrection-ln-eff.html', \
'Proportion of Variables with Power (True Positive)> 0.8 (linear-regression)', \
'Effect Size', 'tpb', 'Sample Size=', sampleSizes[:, slice_cols])
Y_temp = []
Y_std_temp = []
for ll in slice_cols:
Y_temp.append(output_bhTP_ratio_median_ln[:, ll])
Y_std_temp.append(output_bhTP_ratio_iqr_ln[:, ll])
iSlicesPlot(effectSizes[0], Y_temp, Y_std_temp, \
'papy_output/plot-slice-power-rate-bhCorrection-ln-eff.html', \
'Proportion of Variables with Power (True Positive)> 0.8 (linear-regression)', \
'Effect Size', 'tpbh', 'Sample Size=', sampleSizes[:, slice_cols])
Y_temp = []
Y_std_temp = []
for ll in slice_cols:
Y_temp.append(output_byTP_ratio_median_ln[:, ll])
Y_std_temp.append(output_byTP_ratio_iqr_ln[:, ll])
iSlicesPlot(effectSizes[0], Y_temp, Y_std_temp, \
'papy_output/plot-slice-power-rate-byCorrection-ln-eff.html', \
'Proportion of Variables with Power (True Positive)> 0.8 (linear-regression)', \
'Effect Size', 'tpby', 'Sample Size=', sampleSizes[:, slice_cols])
##plot all surfac of True Positive
## sv_name1=np.array([['tpn', 'tpb', 'tpbh', 'tpby']])
## for ii in range(0, num_cols):
## for jj in range(0, sv_name1.shape[1]):
## ##print ii, jj, kk
## iSurfacePlot(diffgroups, 'papy_output/plot-variable%d-diffgroups-%s.html'%(ii+1,sv_name1[0][jj]), ii+1, 10, jj+1,sampleSizes, effectSizes,numberreps)
##save and plot surface of mean of each variable;
##sv_filenames.shape[0] is the dimension of metric options;
##sv_filenames.shape[1] is the dimension of correction options
for jj in range(0, sv_filenames.shape[0]):
for kk in range(0, sv_filenames.shape[1]):
# if (jj==2 and kk > 0):
# break;
# if (jj==4 and kk > 0):
# break;
if (outcome_type == 0 or outcome_type == 2):
temp_diffgroups_array = []
mean_diffgroups_array = []
if (outcome_type == 1 or outcome_type == 2):
temp_linearregression_array = []
mean_linearregression_array = []
for ii in range(0, num_cols):
if (outcome_type == 0 or outcome_type == 2):
temp_diffgroups_array.append(diffgroups[ii][kk][jj])
if (outcome_type == 1 or outcome_type == 2):
temp_linearregression_array.append(linearregression[ii][kk][jj])
if (outcome_type == 0 or outcome_type == 2):
temp_diffgroups_array = np.array(temp_diffgroups_array)
mean_diffgroups_array = np.mean(temp_diffgroups_array, axis=0)
# for calculating standard deviation
std_diffgroups_array = np.std(temp_diffgroups_array, axis=0)
file_handle = file('papy_output/mean-diffgroups-%s.csv' % (sv_filenames[jj][kk]), 'a')
np.savetxt(file_handle, mean_diffgroups_array, delimiter=",", fmt='%.10f')
file_handle.close()
if (outcome_type == 1 or outcome_type == 2):
temp_linearregression_array = np.array(temp_linearregression_array)
mean_linearregression_array = np.mean(temp_linearregression_array, axis=0)
# for calculating standard deviation
std_linearregression_array = np.std(temp_linearregression_array, axis=0)
file_handle = file('papy_output/mean-linearregression-%s.csv' % (sv_filenames[jj][kk]), 'a')
np.savetxt(file_handle, mean_linearregression_array, delimiter=",", fmt='%.10f')
file_handle.close()
##plotting surface plots
if (outcome_type == 0 or outcome_type == 2):
for ii in range(0, 3):
mean_diffgroups_array = np.expand_dims(mean_diffgroups_array, axis=0)
if (outcome_type == 1 or outcome_type == 2):
for ii in range(0, 3):
mean_linearregression_array = np.expand_dims(mean_linearregression_array, axis=0)
if (outcome_type == 0 or outcome_type == 2):
iSurfacePlot(mean_diffgroups_array, 'papy_output/plot-mean-diffgroups-%s.html' % (sv_filenames[jj][kk]),
1, 1, 1, sampleSizes, effectSizes, numberreps)
if (outcome_type == 1 or outcome_type == 2):
iSurfacePlot(mean_linearregression_array,
'papy_output/plot-mean-linearregression-%s.html' % (sv_filenames[jj][kk]), 1, 1, 1,
sampleSizes, effectSizes, numberreps)
##copy plotSurface.py to papy_output folder
##for plotting interactive surface plots for the variables separately
# shutil.copy2('plotSurface.py','papy_output')
##create a zip file on the output folder
shutil.make_archive('papy_output_zip', 'zip', 'papy_output')
##copy some files for user viewing in results folder
if not os.path.exists('results'):
os.makedirs('results')
if (outcome_type == 0 or outcome_type == 2):
shutil.copy2('papy_output/plot-power-rate-byCorrection-diffgroups.html', 'results')
shutil.copy2('papy_output/plot-slice-power-rate-byCorrection-diffgroups.html', 'results')
shutil.copy2('papy_output/plot-slice-power-rate-byCorrection-diffgroups-eff.html', 'results')
shutil.copy2('papy_output/plot-power-rate-noCorrection-diffgroups.html', 'results')
shutil.copy2('papy_output/plot-slice-power-rate-noCorrection-diffgroups.html', 'results')
shutil.copy2('papy_output/plot-slice-power-rate-noCorrection-diffgroups-eff.html', 'results')
if (outcome_type == 1 or outcome_type == 2):
shutil.copy2('papy_output/plot-power-rate-byCorrection-linearregression.html', 'results')
shutil.copy2('papy_output/plot-slice-power-rate-byCorrection-ln.html', 'results')
shutil.copy2('papy_output/plot-slice-power-rate-byCorrection-ln-eff.html', 'results')
shutil.copy2('papy_output/plot-power-rate-noCorrection-linearregression.html', 'results')
shutil.copy2('papy_output/plot-slice-power-rate-noCorrection-ln.html', 'results')
shutil.copy2('papy_output/plot-slice-power-rate-noCorrection-ln-eff.html', 'results')
##delete the papy_output folder
shutil.rmtree('papy_output')
##display user information
print('The output files are in the papy_output_zip.zip in the running directory')
print('Please move the papy_output_zip.zip file to your work directory and unzipped it to view the output files.')
print('a Python script, plotSurface.py, is included for plotting interactive surface plots for variables')
print('for more details, please have a look the .zip file.')
if __name__ == "__main__":
##detect python version#
ver = sys.version
if not ('2.7' in ver):
print('This tool currently only runs in Python 2.7. Please install Python 2.7')
exit(0)
##start to parse input arguments
args = sys.argv
## for i in range(1, len(args)):
## print(args[i],type(args[i]),len(args[i]))
if (len(args) < 3):
print('too few arguments')
print('simple usage: python pa.py TutorialData.csv 8, TutorialData.csv is input test data set, can be replaced by \n \n \
actual data set name, 8 means the first 8 variables, which can be a range, e.g., 8-16 \n \n \n \
full usage: python pa.py TutorialData.csv 2-9 0:100:500 0.05:0.05:0.7 20 0 4 \n \n \
0:100:500 (default value) means the range of sample sizes from 0 to 500 (not inclusive) with interval of 100 \n \n \
0.05:0.05:0.7 (default value) means the range of effect sizes from 0.05 to 0.7 (not inclusive) with interval of 0.05 \n \n \
20 is an integer number of repeats. Default value is 10. \n \n \
0 is the default input for working for classification only. Please choose 1 to work on regression only or 2 to work on both. \n \n \
4 is an integer number as number of CPU cores to use. By default is to use all available cores. ')
exit(0)
if (len(args) > 3):
tmpStr = args[3].split(':')
if len(tmpStr) < 3:
print('The 3rd parameter is for defining the range of sample size with interval\n \
for example, python pa.py TutorialData.csv 2-9 0:50:500')
exit(0)
else:
args.append('0:100:501')
if (len(args) > 4):
tmpStr = args[4].split(':')
if len(tmpStr) < 3:
print('The 4th parameter is for defining the range of effect size with interval\n \
for example, python pa.py TutorialData.csv 2-9 10:50:500 0.05:0.05:0.8')
exit(0)
else:
args.append('0.05:0.05:0.8')
if (len(args) > 5):
print('')
else:
args.append('10')
# for calculating diffgroups, or linear regression or both
if (len(args) > 6):
tmpInt = int(args[6])
if type(tmpInt).__name__ == 'int':
if (tmpInt == 0):
print('Only work on classification (discrete)!')
if (tmpInt == 1):
print('Only work on regression (continuous)!')
if (tmpInt == 2):
print('Work on both classification and regression!')
if (tmpInt < 0 or tmpInt > 2):
print('The 6th parameter is for dealing with outcome variables\n \
please choose a number among 0, 1 and 2 \n \
0 - for work on classification outcome only \n \
1 - for work on regression outcome only \n \
2 - for work on both')
else:
print('The 6th parameter is for dealing with outcome variables\n \
please choose a number among 0, 1 and 2 \n \
0 - for work on classification outcome only \n \
1 - for work on regression outcome only \n \
2 - for work on both')
exit(0)
else:
args.append('0')
# for using number of cores of CPU
if (len(args) > 7):
tmpInt = int(args[7])
if type(tmpInt).__name__ == 'int':
if multiprocessing.cpu_count() - 1 <= 0:
cores = 1
else:
cores = multiprocessing.cpu_count()
if tmpInt > cores:
args[7] = str(cores)
print('Your machine does not have enough cores as you request, \n \
the maximum number of cores - %i - will be used instead' % (cores))
else:
print('The 7th parameter is for defining the number of CPU cores to use\n \
for example, python pa.py TutorialData.csv 2-9 10:50:500 0.05:0.05:0.8 10 4')
exit(0)
else:
if multiprocessing.cpu_count() - 1 <= 0:
cores = 1
else:
cores = multiprocessing.cpu_count()
args.append(str(cores))
## print('len of args is %i'%(len(args[1:])))
main(args[1], args[2], args[3], args[4], args[5], args[6], args[7])