This post tries to use Bayesian linear regression to explore
causes of heart disease.
Bayesian linear regression
library ( "corrplot" )
library ( ncvreg)
library ( ipred)
library ( dplyr)
library ( ggplot2)
library ( RColorBrewer)
#data(dystrophy)
data( heart)
dim ( heart)
## [1] 462 10
str( heart)
## 'data.frame': 462 obs. of 10 variables:
## $ sbp : int 160 144 118 170 134 132 142 114 114 132 ...
## $ tobacco : num 12 0.01 0.08 7.5 13.6 6.2 4.05 4.08 0 0 ...
## $ ldl : num 5.73 4.41 3.48 6.41 3.5 6.47 3.38 4.59 3.83 5.8 ...
## $ adiposity: num 23.1 28.6 32.3 38 27.8 ...
## $ famhist : num 1 0 1 1 1 1 0 1 1 1 ...
## $ typea : int 49 55 52 51 60 62 59 62 49 69 ...
## $ obesity : num 25.3 28.9 29.1 32 26 ...
## $ alcohol : num 97.2 2.06 3.81 24.26 57.34 ...
## $ age : int 52 63 46 58 49 45 38 58 29 53 ...
## $ chd : int 1 1 0 1 1 0 0 1 0 1 ...
##Omit NA’s and plot variables to see if there’s correlations
dat = heart
Cor <- cor( dat)
corrplot( Cor, type= "upper" , method= "ellipse" , tl.pos= "d" )
corrplot( Cor, type= "lower" , method= "number" , col= "black" ,
add= TRUE , diag= FALSE , tl.pos= "n" , cl.pos= "n" )
dat %>%
as.data.frame () %>%
select( - c ( famhist)) %>%
melt() %>%
ggplot( aes( x= value, fill= variable)) +
geom_histogram( colour= "black" , size= 0.1 ) +
facet_wrap( ~ variable, ncol= 3 , scale= "free" ) +
theme_classic() +
scale_fill_brewer( palette= "Set1" ) +
theme( legend.position= "none" )
#Divide the data into training and test set
chdYes = heart[ heart$ chd== 1 ,]
chdNo = heart[ heart$ chd== 0 ,]
sampleYes = sample ( rownames ( chdYes), nrow ( chdYes) * 0.8 )
sampleNo = sample ( rownames ( chdNo), nrow ( chdNo) * 0.8 )
trainYes = chdYes[ sampleYes,]
trainNo = chdNo[ sampleNo,]
testYes = chdYes[ ! rownames ( chdYes) %in% sampleYes,]
testNo = chdNo[ ! rownames ( chdNo) %in% sampleNo,]
dat = rbind ( trainYes, trainNo)
test = rbind ( testYes, testNo)
###Model the data with JAGS
As we have centered the data around 0, the distribution of Beta will follow a double exponential distriubtion.
X = scale ( dat[, -10 ], center= TRUE , scale= TRUE )
X %>%
as.data.frame () %>%
select( - c ( famhist)) %>%
melt() %>%
ggplot( aes( x= value, fill= variable)) +
geom_histogram( colour= "black" , size= 0.1 ) +
facet_wrap( ~ variable, ncol= 3 , scale= "free" ) +
theme_classic() +
scale_fill_brewer( palette= "Set1" ) +
theme( legend.position= "none" )
mod_glm = summary ( glm( chd ~ . , data= dat))
#JAGS model
library ( "rjags" )
mod1_string = " model {
for (i in 1:length(y)) {
y[i] ~ dbern(p[i])
#logit(p[i]) = int + b[1]*AGE[i] + b[2]*CK[i] + b[3]*H[i] + b[4]*PK[i] + b[5]*LD[i]
logit(p[i]) = int + b[1]*sbp[i] + b[2]+tobacco[i] + b[3]*ldl[i] +
b[4]*adiposity[i] + b[5]*famhist[i] +b[6]*typea[i] +b[7]*obesity[i] + b[8]*alcohol[i] + b[9]*age[i]
}
int ~ dnorm(0.0, 1.0/25.0)
for (j in 1:9) {
b[j] ~ ddexp(0.0, sqrt(2.0)) # has variance 1.0
#b[j] ~ dnorm(0.0, 2) # noninformative for logistic regression
#b[j] ~ dnorm(0.0, 1.0/4.0^2)
}
} "
data_jags = list ( y= dat[, 10 ], sbp= X[, "sbp" ], tobacco= X[, "tobacco" ], ldl= X[, "ldl" ],
adiposity= X[, "adiposity" ], famhist= X[, "famhist" ],
typea= X[, "typea" ], obesity= X[, "obesity" ], alcohol= X[, "alcohol" ], age= X[, "age" ])
params = c ( "int" , "b" )
##Run JAGS
## Error: <text>:3:1: unexpected ','
## 2: suppressMessages(update(mod1, 5e3))
## 3: ,
## ^
convergence diagnostics
1) No pattern in traceplot
2) After 5e3 updates (e.g. burn ins I still observe autocorrelation in b[2] and in int)
Increasing it to 5e4
3) DIC
Mean deviance: 503.9
penalty 8.646
Penalized deviance: 512.5
plot( mod1_sim)
gelman.diag( mod1_sim)
## Potential scale reduction factors:
##
## Point est. Upper C.I.
## b[1] 1.00 1.00
## b[2] 1.05 1.11
## b[3] 1.00 1.00
## b[4] 1.00 1.01
## b[5] 1.00 1.00
## b[6] 1.00 1.00
## b[7] 1.00 1.00
## b[8] 1.00 1.00
## b[9] 1.00 1.00
## int 1.04 1.10
##
## Multivariate psrf
##
## 1.02
autocorr.diag( mod1_sim)
## b[1] b[2] b[3] b[4] b[5]
## Lag 0 1.000000000 1.0000000 1.0000000000 1.000000000 1.0000000000
## Lag 1 0.287920926 0.9851922 0.3462278072 0.722909298 0.2658015636
## Lag 5 0.010262570 0.9378278 0.0008810413 0.205100190 0.0024193182
## Lag 10 0.007303032 0.8806068 0.0132742196 0.030675749 -0.0005156971
## Lag 50 0.005479209 0.5317571 -0.0033351594 0.005065785 -0.0052446834
## b[6] b[7] b[8] b[9] int
## Lag 0 1.000000000 1.000000000 1.000000000 1.000000000 1.0000000
## Lag 1 0.301678048 0.630132049 0.239460718 0.534791017 0.9852943
## Lag 5 0.004675731 0.172722918 -0.006375527 0.092841859 0.9369034
## Lag 10 0.004933605 0.036995541 -0.002855998 0.005109218 0.8799680
## Lag 50 -0.002915316 0.007974022 0.004067879 -0.006795221 0.5316811
autocorr.plot( mod1_sim)
effectiveSize( mod1_sim)
## b[1] b[2] b[3] b[4] b[5] b[6]
## 8172.85337 96.52570 6961.24000 2350.24891 8535.10192 7542.87353
## b[7] b[8] b[9] int
## 2810.94261 8762.84597 3950.24210 98.74925
##Summary statistics
I observe that tobacco, adiposity and alcohol have
posterior probabilities centered around 0.
This means that they do not contribute much to heart disease
I remove these and compare between models
dic1
## Mean deviance: 394.9
## penalty 8.171
## Penalized deviance: 403.1
posterior <- mod1_csim[, 1 : 9 ]
colnames ( posterior) <- colnames ( X)
posterior%>%
as.data.frame () %>%
select( - c ( famhist)) %>%
melt() %>%
ggplot( aes( x= value, fill= variable)) +
geom_histogram( colour= "black" , size= 0.1 , binwidth= 0.05 ) +
facet_wrap( ~ variable, ncol= 3 ) +
theme_classic() +
scale_fill_brewer( palette= "Set1" ) +
theme( legend.position= "none" ) +
xlim( -3 , 3 )
##Ajust model to remove terms centered around 0
Here the autocorrelation disappers and the effective smaple size is larger for all variables
Comparing the models
DIC for model 1 is larger than for model 2. Therefore, model2 is better and I will use this
dic1
## Mean deviance: 394.9
## penalty 8.171
## Penalized deviance: 403.1
dic2
## Mean deviance: 388.3
## penalty 6.957
## Penalized deviance: 395.2
dic1 - dic2
## Difference: 7.843772
## Sample standard error: 26.6649
summary ( mod2_sim)
##
## Iterations = 6001:11000
## Thinning interval = 1
## Number of chains = 3
## Sample size per chain = 5000
##
## 1. Empirical mean and standard deviation for each variable,
## plus standard error of the mean:
##
## Mean SD Naive SE Time-series SE
## b[1] 0.14946 0.1264 0.001032 0.001493
## b[2] 0.47494 0.1349 0.001101 0.001532
## b[3] 0.40884 0.1226 0.001001 0.001293
## b[4] 0.34692 0.1351 0.001103 0.001542
## b[5] -0.06677 0.1289 0.001052 0.001496
## b[6] 0.80272 0.1645 0.001343 0.002163
## int -0.87060 0.1362 0.001112 0.001631
##
## 2. Quantiles for each variable:
##
## 2.5% 25% 50% 75% 97.5%
## b[1] -0.08968 0.0631 0.1470 0.2336 0.4073
## b[2] 0.21582 0.3816 0.4739 0.5666 0.7386
## b[3] 0.16948 0.3266 0.4088 0.4910 0.6501
## b[4] 0.08586 0.2571 0.3455 0.4353 0.6193
## b[5] -0.32615 -0.1513 -0.0634 0.0181 0.1826
## b[6] 0.48890 0.6896 0.8020 0.9113 1.1381
## int -1.14168 -0.9611 -0.8686 -0.7798 -0.6092
posterior <- mod2_csim[, 1 : 6 ]
colnames ( posterior) <- colnames ( X)[ - c ( 2 , 4 , 8 )]
posterior%>%
as.data.frame () %>%
select( - c ( famhist)) %>%
melt() %>%
ggplot( aes( x= value, fill= variable)) +
geom_histogram( colour= "black" , size= 0.1 , binwidth= 0.05 ) +
facet_wrap( ~ variable, ncol= 3 ) +
theme_classic() +
scale_fill_brewer( palette= "Set1" ) +
theme( legend.position= "none" ) +
xlim( -2 , 2 )
```
#Predict
Using our trained model we have a 0.72 accuracy
( pm_coef = colMeans ( mod2_csim))
## b[1] b[2] b[3] b[4] b[5] b[6]
## 0.14946415 0.47494319 0.40884479 0.34692490 -0.06676623 0.80272040
## int
## -0.87059519
pm_Xb = pm_coef[ "int" ] + X[, c ( 1 , 3 , 5 , 6 , 7 , 9 )] %*% pm_coef[ 1 : 6 ]
phat = 1.0 / ( 1.0 + exp ( - pm_Xb))
head ( phat)
## [,1]
## 217 0.3155766
## 353 0.7529948
## 413 0.8381924
## 20 0.6011533
## 4 0.7105916
## 335 0.5084351
plot( phat, jitter ( dat[, 10 ]))
( tab0.5 = table ( phat > 0.5 , data_jags2$ y))
##
## 0 1
## FALSE 202 63
## TRUE 39 65
sum ( diag ( tab0.5)) / sum ( tab0.5)
## [1] 0.7235772
#0.72
X.test = scale ( test[, -10 ], center= TRUE , scale= TRUE )
pm_Xb = pm_coef[ "int" ] + X.test[, c ( 1 , 3 , 5 , 6 , 7 , 9 )] %*% pm_coef[ 1 : 6 ]
phat = 1.0 / ( 1.0 + exp ( - pm_Xb))
head ( phat)
## [,1]
## 8 0.6404793
## 19 0.8607690
## 30 0.6674311
## 31 0.1959541
## 36 0.1227429
## 54 0.1375159
plot( phat, jitter ( test[, 10 ]))
( tab0.5 = table ( phat > 0.5 , test[, 10 ]))
##
## 0 1
## FALSE 49 16
## TRUE 12 16
sum ( diag ( tab0.5)) / sum ( tab0.5)
## [1] 0.6989247
#0.73