R_homework_02

Questions

Codes

# RStudio Server URL: http://202.195.187.9:8787/
# The account and password are the same as the Linux login
# Switch the R library search path to the R 4.3.1 conda environment
# NOTE(review): this path is the environment's lib/R directory; package
# libraries conventionally live in a "library" subdirectory - confirm that
# this is the intended location.
.libPaths("/home/biotools/anaconda3/envs/R4.3.1/lib/R/")
########################################################################
# Load the practice dataset
# (sex, age, eth = ethnicity, BMI = body mass index, act = physical activity,
#  Pb/Cd/Hg = concentrations of three heavy metals)
# Analyses to carry out:

# Q1: Do the three heavy-metal concentrations differ between the sexes?

# Attach DescTools, which provides HotellingsT2Test()
library(DescTools)
data <- read.csv(file = "02/Practice-2.csv", header = TRUE, sep = ",")
# The three heavy-metal columns compared between the two groups
metal_cols <- c("b_Pb", "b_Cd", "b_Hg")
# Heavy-metal measurements of the rows with sex == 1
data_sex1_b <- data[data$sex == 1, metal_cols]
# Heavy-metal measurements of the rows with sex == 2
data_sex2_b <- data[data$sex == 2, metal_cols]
# Two independent groups, several responses: Hotelling's two-sample T2 test.
# The variable names are kept as-is because the printed "data:" line of the
# test output echoes them.
HotellingsT2Test(data_sex1_b, data_sex2_b)

########################################################################
# Q2: Using the first 400 samples, build a stepwise regression model of BMI
#     on the other variables and report the final model's R2, adjusted R2,
#     residual standard error and AIC.

# First 400 rows, restricted to complete rows so that every candidate model
# visited by step() is fitted on exactly the same observations
data_400 <- data[1:400, ]
data_400 <- data_400[complete.cases(data_400), ]

# Full model (BMI on every other column): starting point of the search and,
# through `scope`, the largest model the search may return to
full.m <- lm(BMI ~ ., data = data_400)

# step() does not modify full.m; it RETURNS the selected model, so the result
# has to be stored. All statistics below are taken from this final model.
final.m <- step(full.m, direction = "both", scope = formula(full.m))
final.summary <- summary(final.m)

# R2 of the final model
final.summary$r.squared
# Adjusted R2 of the final model
final.summary$adj.r.squared
# Residual standard error of the final model
final.summary$sigma
# AIC of the final model. AIC() is based on the full log-likelihood, so it
# differs by an additive constant from the AIC column printed by step().
AIC(final.m)

########################################################################
# Q3: Use the model above to predict BMI for the remaining 100 samples and
#     compute the correlation between predicted and observed values.

# Rows 401-500 form the hold-out set
data_100 <- data[401:500, ]
# Drop incomplete rows so predictions and observed values stay aligned
data_100 <- data_100[complete.cases(data_100), ]
# Predict with the stepwise-selected model (not the full model)
predicted <- predict(final.m, newdata = data_100, se.fit = TRUE)
# Predicted BMI
predicted_bmi <- predicted$fit
# Observed BMI
real_bmi <- data_100$BMI
# Pearson correlation between predicted and observed BMI
cor(predicted_bmi, real_bmi)

Data

	Hotelling's two sample T2-test

data:  data_sex1_b and data_sex2_b
T.2 = 28.519, df1 = 3, df2 = 453, p-value < 2.2e-16
alternative hypothesis: true location difference is not equal to c(0,0,0)

##########################################################################

Start:  AIC=1333.57
BMI ~ sex + age + eth + act + b_Pb + b_Cd + b_Hg

       Df Sum of Sq   RSS    AIC
- age   1      4.91 13790 1331.7
- act   1      9.91 13795 1331.8
- sex   1     14.06 13799 1331.9
<none>              13785 1333.6
- b_Pb  1     95.88 13881 1334.1
- eth   1    115.83 13901 1334.6
- b_Hg  1    210.59 13996 1337.1
- b_Cd  1    565.07 14350 1346.1

Step:  AIC=1331.7
BMI ~ sex + eth + act + b_Pb + b_Cd + b_Hg

       Df Sum of Sq   RSS    AIC
- act   1      7.67 13798 1329.9
- sex   1     17.15 13807 1330.2
<none>              13790 1331.7
- b_Pb  1     95.65 13886 1332.2
- eth   1    119.96 13910 1332.8
+ age   1      4.91 13785 1333.6
- b_Hg  1    207.25 13997 1335.1
- b_Cd  1    579.82 14370 1344.6

Step:  AIC=1329.9
BMI ~ sex + eth + b_Pb + b_Cd + b_Hg

       Df Sum of Sq   RSS    AIC
- sex   1     13.56 13811 1328.3
<none>              13798 1329.9
- b_Pb  1     98.30 13896 1330.5
- eth   1    122.49 13920 1331.1
+ act   1      7.67 13790 1331.7
+ age   1      2.66 13795 1331.8
- b_Hg  1    209.57 14007 1333.4
- b_Cd  1    573.32 14371 1342.6

Step:  AIC=1328.26
BMI ~ eth + b_Pb + b_Cd + b_Hg

       Df Sum of Sq   RSS    AIC
<none>              13811 1328.3
- eth   1    118.98 13930 1329.4
+ sex   1     13.56 13798 1329.9
- b_Pb  1    147.58 13959 1330.1
+ age   1      5.31 13806 1330.1
+ act   1      4.07 13807 1330.2
- b_Hg  1    211.31 14023 1331.8
- b_Cd  1    562.88 14374 1340.7

Call:
lm(formula = BMI ~ eth + b_Pb + b_Cd + b_Hg, data = data_400)

Coefficients:
(Intercept)          eth         b_Pb         b_Cd         b_Hg  
     26.427        0.562       -1.065       -1.678       -0.781  

##########################################################################

> # 提取R²
[1] 0.09002262

> # 调整R²
[1] 0.07202872

> # 剩余标准差
[1] 6.240287

> # AIC
[1] 2362.885

##########################################################################

> # 计算相关系数
[1] 0.1265127

Result

A1

  • 因为p-value < 2.2e-16,表明我们有足够的证据拒绝原假设,即不同性别人群在三种重金属浓度上存在显著差异。
  • 答: 不同性别人群在三种重金属浓度上存在显著差异。

A2

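以下数值由 summary(full.m) 与 AIC(full.m) 得到, 对应的是逐步回归之前的全模型 (BMI ~ sex + age + eth + act + b_Pb + b_Cd + b_Hg), 而不是最终模型:
全模型的R² 调整R² 剩余标准差 AIC
0.09002262 0.07202872 6.240287 2362.885
最终模型 (BMI ~ eth + b_Pb + b_Cd + b_Hg) 的数值可由逐步回归输出 (RSS ≈ 13811) 估算: R² ≈ 0.088, 调整R² ≈ 0.078, 剩余标准差 ≈ 6.22, AIC ≈ 2357.6; 精确值需对 step() 返回的模型重新计算。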

A3

  • 预测值与真实值的相关系数为: 0.1265127 (注: 该预测使用的是全模型 full.m, 而非逐步回归得到的最终模型)

Contents