# 首页 > > 详细
#
# 辅导留学生R编程、R语言辅导留学生、辅导Logistic Regression and Classification

#----------------------------------------#
# Logistic Regression and Classification
#----------------------------------------#

# Simulate data for a one-variable logistic regression.
# First visualize the inverse-logit (sigmoid) link function itself.
curve(1 / (1 + exp(-x)), -4, 4)
abline(h = 0.5, v = 0)

n <- 20
x1 <- rnorm(n)             # one continuous predictor
z <- 1 + 2 * x1            # linear predictor: intercept 1, slope 2
pr <- 1 / (1 + exp(-z))    # inverse-logit gives the success probability
y <- rbinom(n, 1, pr)      # Bernoulli (0/1) response drawn from pr
plot(x1, y, main = "simulated case/control data")

# Now feed the simulated data to glm (logistic regression, family = "binomial").
lrsim.df <- data.frame(binary_outcome = y, x1 = x1)
lr.fit <- glm(binary_outcome ~ x1, data = lrsim.df, family = "binomial")
summary(lr.fit)
plot(lrsim.df$x1, lrsim.df$binary_outcome)

library(ggplot2)

# Helper that returns fitted probabilities for a vector of x1 values.
# type = "response" is needed so predictions are probabilities, not log-odds.
my.LR.fn <- function(x) {
  # Use the predict() generic (it dispatches to the glm method) rather than
  # calling predict.glm() directly.
  predict(lr.fit, newdata = data.frame(x1 = x), type = "response")
}
my.LR.fn(1)

# Scatter of the 0/1 outcomes with the fitted logistic curve overlaid.
ggplot(data = lrsim.df, aes(x = x1, y = binary_outcome)) +
  geom_point() +
  stat_function(fun = my.LR.fn) +
  xlab("x1") +
  ylab("Case/Control Status") +
  ggtitle("Logistic Regression Model")

# t-test viewpoint: does x1 differ between the two outcome groups?
t.test(x1 ~ binary_outcome, data = lrsim.df)

# Coerce binary_outcome to a factor so ggplot draws side-by-side boxplots.
# NOTE: each `+` must stay at the END of a line -- a leading `+` on the next
# line is parsed as a new (broken) expression and the layers are never added.
p <- ggplot(lrsim.df,
            aes(x = as.factor(binary_outcome), y = x1,
                fill = as.factor(binary_outcome))) +
  stat_boxplot(geom = "errorbar") +
  geom_boxplot()
p <- p + xlab("case/control") + ylab("expression")
p

library(rpart)

# Decision-tree model.  binary_outcome is numeric 0/1, so by default rpart
# would fit a REGRESSION tree (method = "anova"); request method = "class"
# so we get the classification tree the plot titles below advertise.
tree.fit <- rpart(binary_outcome ~ x1, data = lrsim.df, method = "class")
plot(tree.fit)
text(tree.fit, use.n = TRUE)

plot(tree.fit, uniform = TRUE,
     main = "Classification tree for 1 simulated variable")
text(tree.fit, use.n = TRUE, all = TRUE, cex = .8)

#------ Simulate two predictor variables --------------------------------
n <- 50
x1 <- rnorm(n)              # first continuous predictor
x2 <- rnorm(n)              # second continuous predictor
z <- 1 + 2 * x1 + 4 * x2    # multivariate linear predictor with an intercept
pr <- 1 / (1 + exp(-z))     # inverse-logit -> success probabilities
y2 <- rbinom(n, 1, pr)      # Bernoulli (0/1) response

# Now feed both predictors to glm.
lrsim2.df <- data.frame(binary_class = as.factor(y2), x1 = x1, x2 = x2)
lr2.fit <- glm(binary_class ~ x1 + x2, data = lrsim2.df, family = "binomial")
summary(lr2.fit)

# with() evaluates each plot call inside lrsim2.df, so column names can be
# used directly without the df$ prefix.
with(lrsim2.df, plot(x1, binary_class))  # class with respect to x1
with(lrsim2.df, plot(x2, binary_class))  # class with respect to x2

# t-test view: compare each predictor across the two classes.
t.test(x1 ~ binary_class, data = lrsim2.df)
t.test(x2 ~ binary_class, data = lrsim2.df)

# Boxplots of x1 split by class (binary_class is already a factor).
p <- ggplot(lrsim2.df,
            aes(x = as.factor(binary_class), y = x1, fill = binary_class)) +
  stat_boxplot(geom = "errorbar") +
  geom_boxplot() +
  xlab("Binary Class") +
  ylab("Amount of x1") +
  ggtitle("Comparison of groups based on x1")
p

# Classification tree on both predictors (binary_class is a factor, so
# rpart automatically uses method = "class").
tree2.fit <- rpart(binary_class ~ x1 + x2, data = lrsim2.df)
plot(tree2.fit)
text(tree2.fit, use.n = TRUE)

# Check classification accuracy.
(tree2.predicted <- predict(tree2.fit, type = "class"))
# tree2.predicted is a plain factor vector, NOT a matrix with an index
# column -- do not drop anything with [-1]; that would discard the first
# prediction and misalign predictions against the true classes.
cbind(tree2.predicted, lrsim2.df$binary_class)
table(predicted = tree2.predicted, actual = lrsim2.df$binary_class)
mean(tree2.predicted == lrsim2.df$binary_class)  # overall accuracy

# Manually verify the tree's single-split prediction rule.
# NOTE: the split value (0.1266 on x2) is read off the fitted tree, so it
# must be updated by hand whenever the tree is refit on new data.
sum(((x2 < .1266) + 0) == y2)   # how many cases the manual rule gets right
cbind((x2 < .1266) + 0, y2)     # manual prediction next to the truth

plot(tree2.fit, uniform = TRUE,
     main = "Classification tree for 2 simulated variable")
text(tree2.fit, use.n = TRUE, all = TRUE, cex = .8)

# Install glmnet only when it is missing -- re-installing on every run is
# slow and needs network access.
if (!requireNamespace("glmnet", quietly = TRUE)) install.packages("glmnet")
library(glmnet)

# Cross-validated elastic-net logistic regression.
# alpha = 0 means ridge, alpha = 1 means lasso; alpha = 0.1 here is an
# elastic-net mixture close to ridge (the original comment implied only
# the two extremes).
# NOTE: the type.measure string must stay on one line -- breaking the
# quoted literal across lines embeds a newline ("class\n") and cv.glmnet
# rejects it as an unknown measure.
glmnet.model <- cv.glmnet(cbind(x1, x2), as.factor(y2),
                          alpha = 0.1, family = "binomial",
                          type.measure = "class")
glmnet.coeffs <- predict(glmnet.model, type = "coefficients")
plot(glmnet.model)
# Names of the coefficients the penalty left non-zero; rownames() is the
# supported accessor for the sparse-matrix dimnames (avoids @ slot poking).
glmnet.nonzero.coeffs <- rownames(glmnet.coeffs)[which(glmnet.coeffs != 0)]

# Simulation of multiple testing and false discoveries.
# Both samples come from the SAME distribution, so every rejection below
# is a false positive by construction.
t.test(rnorm(100), rnorm(100))

n.tests <- 1000
# Named `pvals` rather than `ts`: `ts` shadows the base time-series
# constructor stats::ts().
pvals <- replicate(n.tests, t.test(rnorm(100), rnorm(100))$p.value)
pvals
100 * sum(pvals < .05) / n.tests  # % of true nulls falsely REJECTED (~5%)
sum(pvals <= (.05) / n.tests)     # Bonferroni-corrected count (conservative)


#---------- Lab 6

# 1. Try rpart on the iris data to predict the species setosa or virginica.

# Optional: try the same thing with glmnet.
 

# 联系我们
#   • QQ:99515681
#   • 邮箱:99515681@qq.com
#   • 工作时间:8:00-21:00
#   • 微信:codinghelp
# 热点标签
#
# 联系我们 - QQ: 99515681 微信:codinghelp
# 程序辅导网!