def crossvalidate_formula(formula, data, response_column, k):
    """Return the k-fold cross-validated Q2 of an OLS model given by ``formula``.

    The rows of ``data`` are split, in their existing order, into ``k``
    contiguous folds of ``len(data) // k`` rows; the last fold also takes
    any remainder.  For each fold the model is fitted on the other rows and
    the squared prediction errors on the held-out rows are added to PRESS.

    Parameters
    ----------
    formula : str
        Formula handed to ``smf.ols``.
    data : pandas.DataFrame
        Data containing every variable used in ``formula``.  Rows are
        selected by index label (``drop`` / ``loc``), so this assumes the
        index labels are unique -- TODO confirm with callers.
    response_column : str
        Name of the column in ``data`` holding the observed response.
    k : int
        Number of folds; must be at least 1.

    Returns
    -------
    float
        ``1 - PRESS / TSS``, where TSS is the total sum of squares of the
        response around its mean over all of ``data``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got {!r}".format(k))

    n_rows = len(data)
    fold_size = n_rows // k

    # The accumulator must exist before the first ``+=``.
    press = 0.0
    for i in range(k):
        start = i * fold_size
        # The final fold absorbs the rows left over by the integer division.
        end = (i + 1) * fold_size if i < k - 1 else n_rows
        to_drop = data.index[start:end]
        train = data.drop(to_drop)
        test = data.loc[to_drop]
        model = smf.ols(formula, train).fit()
        pred = model.predict(test)
        residuals = test[response_column] - pred
        press += (residuals ** 2).sum()

    response = data[response_column]
    total_ss = ((response - response.mean()) ** 2).sum()
    return 1 - press / total_ss
# NOTE(review): headless fragment -- the enclosing function's `def` line and
# the surrounding selection loop are not visible here, and the original
# indentation has been lost.
# Split the chosen term on ":"; a two-part result is handled as a pair of
# variable names (presumably a formula interaction term "a:b" -- confirm).
candidate_split = best_candidate.split(sep=":")
if len(candidate_split) == 2:
# Each component that is not yet in `selected` but is still in `remaining`
# raises `maxvars` by one.
if candidate_split[0] not in selected and candidate_split[0] in remaining:
maxvars += 1
if candidate_split[1] not in selected and candidate_split[1] in remaining:
maxvars += 1
current_score = best_new_score
# Build "<response> ~ <selected terms joined by ' + '> + 1" and fit it by OLS
# on `data`; the fitted results object is what gets returned.
formula = "{} ~ {} + 1".format(response,
' + '.join(selected))
model = smf.ols(formula, data).fit()
return model
# NOTE(review): this definition is incomplete in this view -- the signature is
# cut off after `lower_is_better,` (the code below reads `alpha`, presumably
# the missing last parameter), the two prose lines after the signature are
# docstring text whose quotes are missing, and the final `to_dict(` call is
# never closed.  Indentation has also been lost.
def _get_best_configs(self, parameters, results, configs, lower_is_better,
Implements the testing procedure itself and returns the reduced set
of parameter configurations.
df = self._prep_df_for_linreg(parameters, results,
configs, lower_is_better)
# Search for a rank cutoff `p` between `l` (starts at 1) and `h` (starts at the
# largest value in df.Rank); the first cutoff tried is the largest rank.
l = 1
h = df.Rank.max()
p = h
while l != h:
# Fit Objective against the categorical Rank using only rows with Rank <= p,
# then read the "PR(>F)" entry of the C(Rank) row from the type-2 ANOVA table.
lm = ols('Objective ~ C(Rank)', data=df.loc[df.Rank <= p, :]).fit()
# NOTE(review): `.ix` was removed in pandas 1.0; `.loc["C(Rank)"]` is the
# label-based replacement -- confirm which pandas version this must run on.
p_value = sm.stats.anova_lm(lm, typ=2).loc[:, "PR(>F)"].ix["C(Rank)"]
reject = p_value < alpha
# On rejection the upper bound moves below p.  As visible here `l = p` follows
# with no `else:` -- presumably that line is missing from this view; confirm.
if reject:
h = p - 1
l = p
# Next cutoff is the midpoint of the current bounds, rounded up.
p = math.ceil((l + h) / 2)
# Keep rows with Rank <= p, project onto the columns named by each
# parameter's `.name`, and drop duplicate rows before converting to a dict.
return df.loc[df.Rank <= p, :].loc[:,
[p.name for p in parameters]].drop_duplicates().to_dict(
# NOTE(review): headless fragment with lost indentation.  The first line is a
# keyword argument whose enclosing call is not visible; the rest reads like
# the body of a per-lag Granger-causality test -- confirm against the full file.
estimator=lambda y,x: OLS(y,x).fit().params,
# create lagmat of both time series
dta = lagmat2ds(x, mxlg, trim='both')
dta = np.delete(dta, -1, axis = 1) # removal of the not lagged xs
#add constant
# NOTE(review): an `else:` before the `raise` below is presumably missing from
# this view; as shown, the raise would follow the addconst branch directly.
if addconst:
# "own" design: columns 1..mxlg of dta plus an appended constant column;
# "joint" design: every column after the first plus an appended constant.
dtaown = add_constant(dta[:, 1:(mxlg + 1)], prepend=False)
dtajoint = add_constant(dta[:, 1:], prepend=False)
raise NotImplementedError('Not Implemented')
#dtaown = dta[:, 1:mxlg]
#dtajoint = dta[:, 1:]
# Run ols on both models without and with lags of second variable
# Column 0 of dta is the dependent variable in both regressions.
res2down = OLS(dta[:, 0], dtaown).fit()
res2djoint = OLS(dta[:, 0], dtajoint).fit()
#print results
#for ssr based tests see:
#the other tests are made-up
# Granger Causality test using ssr (F statistic)
# fgc1 = (ssr_own - ssr_joint) / ssr_joint / mxlg * df_resid_joint
fgc1 = ((res2down.ssr - res2djoint.ssr) /
res2djoint.ssr / mxlg * res2djoint.df_resid)
# NOTE(review): the format string has four placeholders, but the visible
# argument tuple is unbalanced and passes four positional values to
# stats.f.sf -- a line appears to be missing here; confirm against the file.
if verbose:
print('ssr based F test: F=%-8.4f, p=%-8.4f, df_denom=%d,'
' df_num=%d' % (fgc1,
stats.f.sf(fgc1, mxlg,
res2djoint.df_resid, mxlg))
# NOTE(review): headless fragment with lost indentation.  The `if` that starts
# this chain is not visible, no 'BH' branch is visible although the error
# message lists it, and the `else:` that presumably guards the stderr write
# is missing from this view.
# Map the user-facing opts.multiTest name to a statsmodels method string.
method = 'bonferroni'
elif opts.multiTest == 'Holm':
method = 'holm'
elif opts.multiTest == 'Hochberg':
method = 'simes-hochberg'
elif opts.multiTest == 'Hommel':
method = 'hommel'
elif opts.multiTest == 'BY':
method = 'fdr_by'
elif opts.multiTest == 'TSBH':
# NOTE(review): statsmodels documents the two-stage BH method as 'fdr_tsbh';
# verify that the bare 'tsbh' spelling is accepted by multipletests.
method = 'tsbh'
sys.stderr.write('ERROR: The methods for multiple test correction can only accept \'Bonferroni\', \'Holm\', \'Hochberg\', \'Hommel\', \'BH\', \'BY\' or \'TSBH\' as its input.\n')
# Correct only the entries of pval selected by idx; element 1 of the result
# (the corrected p-values, per the statsmodels documentation) is written back
# into a copy of pval, so entries outside idx keep their original values.
mtc = sms.stats.multicomp.multipletests(pval[idx], alpha=0.1, method=method, returnsorted=False)
padj = pval.copy()
padj[idx] = mtc[1]
data.padj = padj
return data
# NOTE(review): headless fragment with lost indentation, written in Python 2
# (print statement).  The enclosing loop over genes, the bodies of the two NaN
# guards, the `try:` matching the `except` below and at least one `else:` are
# not visible in this view.
print '\r%i genes finished.' % num
# Guards on a NaN dispersion estimate for gene i (bodies not visible here).
if opts.dispDiff and np.isnan(data.dispAdjRibo[i]):
if not opts.dispDiff and np.isnan(data.dispAdj[i]):
# Response vector: the gene's countRibo row followed by its countRna row.
response = np.hstack([data.countRibo[i, :], data.countRna[i, :]])
# With dispDiff the dispersion is a per-sample vector (the Ribo value repeated
# lenSampleRibo times, then the Rna value repeated lenSampleRna times).  The
# scalar assignment after it presumably sits under a missing `else:` -- confirm.
if opts.dispDiff:
disp = np.hstack([np.repeat(data.dispAdjRibo[i], lenSampleRibo), np.repeat(data.dispAdjRna[i], lenSampleRna)])
disp = data.dispAdj[i]
# Two negative-binomial GLMs on the same response and offset
# (log of librarySizes), differing only in the design matrix.
modNB0 = sm.GLM(response, explanatory0, family=sm.families.NegativeBinomial(alpha=disp), offset=np.log(librarySizes))
modNB1 = sm.GLM(response, explanatory1, family=sm.families.NegativeBinomial(alpha=disp), offset=np.log(librarySizes))
result0 = modNB0.fit()
result1 = modNB1.fit()
except sm.tools.sm_exceptions.PerfectSeparationError:
errorCnt += 1
# p-value is the upper chi-square tail of the deviance difference, with
# degrees of freedom equal to the difference in design-matrix column counts.
if not opts.dispDiff:
pval[i] = 1 - chi2.cdf(result0.deviance - result1.deviance, explanatory1.shape[1] - explanatory0.shape[1])
elif opts.dispDiff:
# NOTE(review): the degrees of freedom are divided by 2.5 in this branch; the
# reason is not evident from the code shown -- confirm with the authors.
pval[i] = 1 - chi2.cdf(result0.deviance - result1.deviance, (explanatory1.shape[1] - explanatory0.shape[1]) / 2.5)
data.pval = pval
sys.stdout.write('Warning: Failed to do test: %i genes. P value set to \'nan\'.\n' % errorCnt)
# NOTE(review): headless fragment with lost indentation, written in Python 2
# (print statements).  The enclosing function and the origin of `train`,
# `target`, `dataset` and `family` are not visible in this view.
# `train` is unpacked as a two-dimensional shape: (rows, columns).
n_samples, n_features = train.shape
print "n_samples:", n_samples, "n_features:", n_features
print "histogram of target"
print sp.histogram(target,3)
print "len(train):", len(train)
print "len(target):", len(target)
print "dataset shape:", dataset.shape
# Only the 'gaussian' family is supported; anything else raises.
if family!='gaussian':
raise Exception("Only have gaussian logistic for scipy")
# train the classifier
# GLM with a Gaussian family and the `links.log` link object passed to it.
gauss_log = sm_api.GLM(target, train, family=sm_api.families.Gaussian(sm_api.families.links.log))
# Wall-clock timing of the fit only.
start = time.time()
gauss_log_results = gauss_log.fit()
print "sm_api.GLM took", time.time() - start, "seconds"
print gauss_log_results.summary()
def __init__(self, y, x, z, data, alpha):
    """Fit a robust linear model of ``y`` on ``x + z`` and store the ``x`` effect.

    Parameters
    ----------
    y : column selector
        Selects the response from ``data`` via ``data[y]``.
    x, z : list
        Column names; ``x + z`` is used as the regressor selection, so both
        are presumably lists of names (TODO confirm with callers).  Only the
        first entry of ``x`` is reported.
    data : pandas.DataFrame
        Source of the response and regressor columns.
    alpha : float
        Level from which the confidence interval is derived (see note below).

    Attributes set
    --------------
    regression : the unfitted ``sm.RLM`` model
    result : the fitted results object
    coefficient : estimated coefficient of the first ``x`` column
    upper, lower : confidence-interval bounds for that coefficient
    """
    self.regression = sm.RLM(data[y], data[x + z])
    self.result = self.regression.fit()

    # ``params[x]`` is a label-indexed Series.  ``.iloc[0]`` takes its first
    # entry by position explicitly, rather than relying on the deprecated
    # integer-key fallback of ``Series.__getitem__``.
    self.coefficient = self.result.params[x].iloc[0]

    # NOTE(review): ``conf_int(alpha=a)`` is documented as returning a
    # (1 - a) interval, so passing ``alpha / 2`` yields a (1 - alpha/2)
    # interval -- confirm this halving is intended.
    confidence_interval = self.result.conf_int(alpha=alpha / 2.)
    # Column 0 holds the lower bounds and column 1 the upper bounds.
    self.upper = confidence_interval[1][x].iloc[0]
    self.lower = confidence_interval[0][x].iloc[0]
# NOTE(review): headless fragment with lost indentation.  The `try:` matching
# the `except` below and the body of that `except` are not visible here.
# Append any rows already stored in `csvfile` to the current frame.
df = pd.concat([df, pd.read_csv(csvfile)])
except FileNotFoundError:
# Summaries and plots are only produced when there is at least one row.
if len(df['pivot']) > 0:
# Mean/std of pivot and length, and mean coverage, for each of the two
# column families ("selective" = unprefixed columns, "naive" = naive_*).
print("selective:", np.mean(df['pivot']), np.std(df['pivot']), np.mean(df['length']), np.std(df['length']), np.mean(df['coverage']))
print("naive:", np.mean(df['naive_pivot']), np.std(df['naive_pivot']), np.mean(df['naive_length']), np.std(df['naive_length']), np.mean(df['naive_coverage']))
print("len ratio selective divided by naive:", np.mean(np.array(df['length']) / np.array(df['naive_length'])))
# Empirical CDFs of both pivot columns evaluated on a 101-point grid over
# [0, 1], drawn together with the dashed diagonal from (0, 0) to (1, 1).
U = np.linspace(0, 1, 101)
plt.plot(U, sm.distributions.ECDF(df['pivot'])(U), 'r', label='Selective', linewidth=3)
plt.plot(U, sm.distributions.ECDF(df['naive_pivot'])(U), 'b', label='Naive', linewidth=3)
plt.plot([0,1], [0,1], 'k--', linewidth=2)
# Output names drop the last four characters of csvfile (assumes a
# four-character suffix such as ".csv").
plt.savefig(csvfile[:-4] + '.pdf')
# NOTE(review): no new figure is created between the two savefig calls in the
# visible code, so the scatter is drawn onto the same axes -- confirm intent.
plt.scatter(df['naive_length'], df['length'])
plt.savefig(csvfile[:-4] + '_lengths.pdf')
# Write the combined frame back to the same CSV path, without the index.
df.to_csv(csvfile, index=False)