######################################
# Statistical methods for data science: R
#
# http://didawiki.di.unipi.it/doku.php/mds/smd/
######################################
# [CTRL-Enter] to execute a line

###### Script R19: Test of hypothesis ##########

# z-Test of the mean
# Normal data, known variance
#
# Prints (via cat, returns nothing useful):
#  - the two-sided (1-alpha) confidence interval for the mean,
#  - the right-sided (one-sided) confidence interval,
#  - the z statistic and its one- and two-sided p-values.
# Arguments:
#   data       numeric sample, assumed normal
#   mu         hypothesized mean under H0
#   sigma      known population standard deviation
#   conf.level confidence level of the intervals (default 0.95)
test.z <- function(data, mu, sigma, conf.level = 0.95) {
  n <- length(data)
  xn <- mean(data)
  alpha <- 1 - conf.level
  # P(Z > cv) = alpha/2 iff P(Z <= cv) = 1-alpha/2
  cv <- qnorm(1 - alpha / 2)           # right-critical value
  delta <- cv * sigma / sqrt(n)
  cat("\nmean ", xn, "delta ", delta)
  cat(paste0("\n", conf.level * 100, "% CI ="), c(xn - delta, xn + delta))
  # right-sided interval
  # P(Z > cv1) = alpha iff P(Z <= cv1) = 1-alpha
  cv1 <- qnorm(1 - alpha)
  delta1 <- cv1 * sigma / sqrt(n)
  cat(paste0("\n", conf.level * 100, "% one-sided CI ="), xn - delta1, "infy")
  # |z| so the same value serves the two-sided and the right-sided test
  z.value <- abs((xn - mu) / sigma * sqrt(n))
  p <- 1 - pnorm(z.value)              # P(Z > z.value)
  cat("\nz = ", z.value, "\ntwo-sided p-value =", 2 * p, "\none-sided p-value =", p)
}

# test mean = 1 - cannot be rejected for s = 20, it can be for smaller s
n <- 100; mu <- 3; s <- 20
data <- rnorm(n, mu, s)
test.z(data, mu = 1, sigma = s)

# library with z test and Confidence Intervals built-in
library(BSDA)
z.test(data, mu = 1, sigma.x = s)
z.test(data, mu = 1, sigma.x = s, alternative = "greater")

# t-Test
# Normal data, unknown variance
#
# Same report as test.z but with sd estimated from the sample and
# Student's t quantiles on n-1 degrees of freedom.
# Arguments:
#   mu         hypothesized mean under H0
#   data       numeric sample, assumed normal
#   conf.level confidence level of the intervals (default 0.95)
# NOTE: parameter order (mu first) differs from test.z; kept for
# compatibility with the positional calls below.
test.t <- function(mu, data, conf.level = 0.95) {
  n <- length(data)
  xn <- mean(data)
  alpha <- 1 - conf.level
  # P(T > cv) = alpha/2 iff P(T <= cv) = 1-alpha/2
  cv <- qt(1 - alpha / 2, n - 1)       # right-critical value
  delta <- cv * sd(data) / sqrt(n)
  cat("\nmean ", xn, "delta ", delta)
  cat(paste0("\n", conf.level * 100, "% CI ="), c(xn - delta, xn + delta))
  # right-sided interval
  # P(T > cv1) = alpha iff P(T <= cv1) = 1-alpha
  cv1 <- qt(1 - alpha, n - 1)          # right-critical value
  delta1 <- cv1 * sd(data) / sqrt(n)
  cat(paste0("\n", conf.level * 100, "% one-sided CI ="), xn - delta1, "infy")
  t.value <- abs((xn - mu) / sd(data) * sqrt(n))
  p <- 1 - pt(t.value, n - 1)          # P(T > t.value)
  cat("\nt = ", t.value, "\ntwo-sided p-value =", 2 * p, "\none-sided p-value =", p)
}

# test mean = 0 - cannot be rejected
n <- 100
data <- rnorm(n)
test.t(mu = 0, data = data)
t.test(data, mu = 0)                   # r built-in
# test mean = -0.5 - can be rejected
test.t(-0.5, data)
t.test(data, mu = -0.5)                # r built-in
t.test(data, mu = -0.5, alternative = "greater")  # spell out mu: no partial matching

### but, how to check whether data is normal?
qqnorm(data)                           # visually
qqline(data)
# Shapiro test of normality
shapiro.test(data)
# Kolmogorov-Smirnov test (see later in the course)
ks.test(data, pnorm, mean(data), sd(data))

# Non-normal data, unknown variance, large samples
n <- 1000
data <- rexp(n, rate = 2)              # mean = 1/rate = 0.5
qqnorm(data); qqline(data)             # visually
shapiro.test(data)                     # normality check fails
ks.test(data, pnorm, mean(data), sd(data))  # normality check fails
# use z.test with sigma = sd(data)
z.test(data, mu = 0.45, sigma.x = sd(data))
# compare to t-test
t.test(data, mu = 0.45)  # since t -> z for large samples, actually it works

# Non-normal data, unknown variance, *symmetric* distribution
# Wilcoxon signed-rank test [R book] Chpt. 5.2,
# test statistics: sum( sign(x-mu)*rank(|x-mu|) )
wilcox.test(data, mu = 0.4, conf.int = TRUE)  # fails because exp is not symmetric!
# Wilcoxon on a symmetric (binomial, p = 0.5) distribution
n <- 10
data <- rbinom(n, size = 100, prob = 0.5)   # mean = size*prob = 50
wilcox.test(data, mu = 50, conf.int = TRUE)
wilcox.test(data, mu = 46, conf.int = TRUE)

# Non-normal data, unknown variance, bootstrap t-test
library(boot)
goldenratio <- (sqrt(5) - 1) / 2
goldenratio
shoshoni <- c(0.693, 0.749, 0.654, 0.670, 0.662, 0.672, 0.615, 0.606, 0.690,
              0.628, 0.668, 0.611, 0.606, 0.609, 0.601, 0.553, 0.570, 0.844,
              0.576, 0.933)
n <- length(shoshoni)
mean(shoshoni)
# H0: mu=goldenratio
# from script R14: compute both mean and variance of a dataset subset
# (statistic for boot(): d = data, pos = bootstrap resample indices)
mean.var.pos <- function(d, pos) c(mean(d[pos]), var(d[pos]) / length(pos))
b <- boot(shoshoni, mean.var.pos, R = 10000)
bci <- boot.ci(b, conf = 0.95, type = "stud")
bci
# reject H0: mu=goldenratio if mu not in confidence interval
bci$student[4:5]
plot(density(b$t[, 1]))
# studentized bootstrap replicates of the t statistic
tboot <- (b$t[, 1] - mean(shoshoni)) / sqrt(b$t[, 2])
plot(density(tboot), xlim = c(-6, 4))       # [T] Fig. 27.2
tvalue <- (mean(shoshoni) - goldenratio) / sd(shoshoni) * sqrt(n)
tvalue
pvalue <- mean(tboot > abs(tvalue))         # P(T > t.value)
pvalue
# reject H0 in favor of H1: mu > goldenratio

# t-test for linear regression (with Gaussian noise)
library(SemiPar)
data(janka)
# scatter plot (explicit janka$ access instead of attach())
plot(janka$dens, janka$hardness)
# simple linear model
fit <- lm(hardness ~ dens, data = janka)
abline(fit, col = "red", lwd = 2)           # regression line (lwd, not lw)
s <- summary(fit)                           # t value wrt the H0: par=0
s
# explicit calculation
n <- nrow(janka)
se <- s$coef[, 2]                           # std error
coef(fit) / se                              # t value
qt(0.975, n - 2)                            # critical value at 0.05 (2-tailed)
# P(|T| > |t|) = 2(1-P(T<=|t|))
2 * (1 - pt(abs(coef(fit) / se), n - 2))    # p-value (2-tailed)