Questions

Codes
# RStudio Server URL: http://202.195.187.9:8787/
# Log in with the same account and password as on the Linux host.
# Switch the R library search path to the R 4.3.1 conda environment.
# NOTE(review): packages usually live under ".../lib/R/library" -- confirm
# that this directory is the intended one.
.libPaths("/home/biotools/anaconda3/envs/R4.3.1/lib/R/")
########################################################################
# Load the practice data set.
# Columns: sex, age, eth (ethnicity), BMI (body mass index),
# act (physical activity), Pb/Cd/Hg (concentrations of three heavy metals).
# Analyses to perform:
# Q1: Do the three heavy-metal concentrations differ between the sexes?
# DescTools provides HotellingsT2Test().
library(DescTools)
data <- read.csv("02/Practice-2.csv", header = TRUE, sep = ",")
# The three heavy-metal columns compared between the two groups.
metal_cols <- c("b_Pb", "b_Cd", "b_Hg")
# Heavy-metal measurements for the rows with sex == 1 ...
data_sex1_b <- data[data$sex == 1, metal_cols]
# ... and for the rows with sex == 2.
data_sex2_b <- data[data$sex == 2, metal_cols]
# Two independent groups, several response variables:
# two-sample Hotelling T^2 test.
HotellingsT2Test(data_sex1_b, data_sex2_b)
########################################################################
# Q2: Using the first 400 samples, build a regression model of BMI on the
# other variables by stepwise selection, and report the final model's
# R-squared, adjusted R-squared, residual standard error and AIC.
# First 400 rows, keeping only rows without missing values.
data_400 <- data[1:400, ]
data_400 <- data_400[complete.cases(data_400), ]
# Fit the full model (BMI on every other column) as the starting point.
full.m <- lm(BMI ~ ., data = data_400)
# step() returns the selected model and leaves full.m untouched, so the
# result has to be captured; otherwise every statistic below would describe
# the full model instead of the stepwise-selected one.
final.m <- step(full.m, direction = "both", scope = formula(full.m))
# Print the selected model (call and coefficients).
final.m
# Summarise the selected model once and read the statistics from it.
final.summary <- summary(final.m)
# R-squared
final.summary$r.squared
# Adjusted R-squared
final.summary$adj.r.squared
# Residual standard error
final.summary$sigma
# AIC of the selected model. AIC() is on a different scale from the AIC
# column printed by step() (which uses extractAIC()); for a fixed sample the
# two differ by a constant, so only compare values from the same function.
AIC(final.m)
########################################################################
# Q3: Use the model above to predict BMI for the remaining 100 samples and
# compute the correlation between predicted and observed values.
# Rows 401-500 form the hold-out set.
data_100 <- data[401:500, ]
# Drop incomplete rows so predictions and observed values stay aligned.
data_100 <- data_100[complete.cases(data_100), ]
# Predict with the stepwise-selected model, not the full model.
predicted <- predict(final.m, newdata = data_100, se.fit = TRUE)
# With se.fit = TRUE the result is a list; the point predictions are in $fit.
predicted_bmi <- predicted$fit
# Observed BMI for the same rows.
real_bmi <- data_100$BMI
# Pearson correlation between predicted and observed BMI.
cor(predicted_bmi, real_bmi)
Data
Hotelling's two sample T2-test
data: data_sex1_b and data_sex2_b
T.2 = 28.519, df1 = 3, df2 = 453, p-value < 2.2e-16
alternative hypothesis: true location difference is not equal to c(0,0,0)
##########################################################################
Start: AIC=1333.57
BMI ~ sex + age + eth + act + b_Pb + b_Cd + b_Hg
Df Sum of Sq RSS AIC
- age 1 4.91 13790 1331.7
- act 1 9.91 13795 1331.8
- sex 1 14.06 13799 1331.9
<none> 13785 1333.6
- b_Pb 1 95.88 13881 1334.1
- eth 1 115.83 13901 1334.6
- b_Hg 1 210.59 13996 1337.1
- b_Cd 1 565.07 14350 1346.1
Step: AIC=1331.7
BMI ~ sex + eth + act + b_Pb + b_Cd + b_Hg
Df Sum of Sq RSS AIC
- act 1 7.67 13798 1329.9
- sex 1 17.15 13807 1330.2
<none> 13790 1331.7
- b_Pb 1 95.65 13886 1332.2
- eth 1 119.96 13910 1332.8
+ age 1 4.91 13785 1333.6
- b_Hg 1 207.25 13997 1335.1
- b_Cd 1 579.82 14370 1344.6
Step: AIC=1329.9
BMI ~ sex + eth + b_Pb + b_Cd + b_Hg
Df Sum of Sq RSS AIC
- sex 1 13.56 13811 1328.3
<none> 13798 1329.9
- b_Pb 1 98.30 13896 1330.5
- eth 1 122.49 13920 1331.1
+ act 1 7.67 13790 1331.7
+ age 1 2.66 13795 1331.8
- b_Hg 1 209.57 14007 1333.4
- b_Cd 1 573.32 14371 1342.6
Step: AIC=1328.26
BMI ~ eth + b_Pb + b_Cd + b_Hg
Df Sum of Sq RSS AIC
<none> 13811 1328.3
- eth 1 118.98 13930 1329.4
+ sex 1 13.56 13798 1329.9
- b_Pb 1 147.58 13959 1330.1
+ age 1 5.31 13806 1330.1
+ act 1 4.07 13807 1330.2
- b_Hg 1 211.31 14023 1331.8
- b_Cd 1 562.88 14374 1340.7
Call:
lm(formula = BMI ~ eth + b_Pb + b_Cd + b_Hg, data = data_400)
Coefficients:
(Intercept) eth b_Pb b_Cd b_Hg
26.427 0.562 -1.065 -1.678 -0.781
##########################################################################
> # 提取R²
[1] 0.09002262
> # 调整R²
[1] 0.07202872
> # 剩余标准差
[1] 6.240287
> # AIC
[1] 2362.885
##########################################################################
> # 计算相关系数
[1] 0.1265127
Result
A1
- 因为p-value < 2.2e-16,表明我们有足够的证据拒绝原假设,即不同性别人群在三种重金属浓度上存在显著差异。
- 答: 不同性别人群在三种重金属浓度上存在显著差异。
A2
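| 最终模型的R² | 调整R² | 剩余标准差 | AIC |
| --- | --- | --- | --- |
| 0.09002262 | 0.07202872 | 6.240287 | 2362.885 |

注: 以上数值由 summary(full.m) 与 AIC(full.m) 得到,对应的是包含全部7个自变量的初始模型;逐步回归选出的最终模型为 BMI ~ eth + b_Pb + b_Cd + b_Hg,其各项指标应在 step() 返回的模型对象上重新计算。另外,step() 输出中的 AIC(1328.26)由 extractAIC() 计算,与 AIC() 的结果相差一个只依赖样本量的常数,二者不能直接比较。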
A3