Bayesian logistic regression of heart data

This post uses Bayesian logistic regression (a Bernoulli likelihood with a logit link, fit with JAGS) to explore risk factors for heart disease.

## Bayesian logistic regression
library("corrplot")
library(ncvreg)
library(ipred)
library(dplyr)
library(ggplot2)
library(RColorBrewer)
#data(dystrophy)
data(heart)
dim(heart)
## [1] 462 10
str(heart)
## 'data.frame': 462 obs. of 10 variables:
## $ sbp : int 160 144 118 170 134 132 142 114 114 132 ...
## $ tobacco : num 12 0.01 0.08 7.5 13.6 6.2 4.05 4.08 0 0 ...
## $ ldl : num 5.73 4.41 3.48 6.41 3.5 6.47 3.38 4.59 3.83 5.8 ...
## $ adiposity: num 23.1 28.6 32.3 38 27.8 ...
## $ famhist : num 1 0 1 1 1 1 0 1 1 1 ...
## $ typea : int 49 55 52 51 60 62 59 62 49 69 ...
## $ obesity : num 25.3 28.9 29.1 32 26 ...
## $ alcohol : num 97.2 2.06 3.81 24.26 57.34 ...
## $ age : int 52 63 46 58 49 45 38 58 29 53 ...
## $ chd : int 1 1 0 1 1 0 0 1 0 1 ...
## Omit NAs and plot the variables to check for correlations
The binary response is chd (coronary heart disease).
dat = heart
Cor <- cor(dat)
corrplot(Cor, type="upper", method="ellipse", tl.pos="d")
corrplot(Cor, type="lower", method="number", col="black",
add=TRUE, diag=FALSE, tl.pos="n", cl.pos="n")
dat %>%
  as.data.frame() %>%
  select(-famhist) %>%
  melt() %>%
  ggplot(aes(x=value, fill=variable)) +
  geom_histogram(colour="black", size=0.1) +
  facet_wrap(~variable, ncol=3, scales="free") +
  theme_classic() +
  scale_fill_brewer(palette="Set1") +
  theme(legend.position="none")
# Divide the data into training and test sets, stratified by chd
set.seed(1)  # for a reproducible split
chdYes = heart[heart$chd==1,]
chdNo = heart[heart$chd==0,]
sampleYes = sample(rownames(chdYes), round(nrow(chdYes)*0.8))
sampleNo = sample(rownames(chdNo), round(nrow(chdNo)*0.8))
trainYes = chdYes[sampleYes,]
trainNo = chdNo[sampleNo,]
testYes = chdYes[!rownames(chdYes) %in% sampleYes,]
testNo = chdNo[!rownames(chdNo) %in% sampleNo,]
dat = rbind(trainYes, trainNo)
test = rbind(testYes, testNo)
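As a quick sanity check, the stratified split should keep the proportion of chd cases similar in both sets:
prop.table(table(dat$chd))
prop.table(table(test$chd))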
### Model the data with JAGS
Because the predictors are centered and scaled, each coefficient beta is given a double-exponential (Laplace) prior, which shrinks weak effects toward 0, similar to the LASSO penalty.
X = scale(dat[,-10], center=TRUE, scale=TRUE)
X %>%
  as.data.frame() %>%
  select(-famhist) %>%
  melt() %>%
  ggplot(aes(x=value, fill=variable)) +
  geom_histogram(colour="black", size=0.1) +
  facet_wrap(~variable, ncol=3, scales="free") +
  theme_classic() +
  scale_fill_brewer(palette="Set1") +
  theme(legend.position="none")
mod_glm = summary(glm(chd ~ ., data=dat, family=binomial))  # frequentist reference fit
# JAGS model
library(rjags)
mod1_string = " model {
    for (i in 1:length(y)) {
        y[i] ~ dbern(p[i])
        logit(p[i]) = int + b[1]*sbp[i] + b[2]*tobacco[i] + b[3]*ldl[i] +
            b[4]*adiposity[i] + b[5]*famhist[i] + b[6]*typea[i] +
            b[7]*obesity[i] + b[8]*alcohol[i] + b[9]*age[i]
    }
    int ~ dnorm(0.0, 1.0/25.0)
    for (j in 1:9) {
        b[j] ~ ddexp(0.0, sqrt(2.0)) # Laplace prior with variance 1
        #b[j] ~ dnorm(0.0, 1.0/4.0^2) # alternative: vague normal prior
    }
} "
data_jags = list(y=dat[,10], sbp=X[,"sbp"], tobacco=X[,"tobacco"], ldl=X[,"ldl"],
                 adiposity=X[,"adiposity"], famhist=X[,"famhist"], typea=X[,"typea"],
                 obesity=X[,"obesity"], alcohol=X[,"alcohol"], age=X[,"age"])
params = c("int", "b")
## Run JAGS
The fitting chunk did not survive rendering (a stray comma broke it), so this is the standard rjags workflow it followed: compile, burn in, sample, and collect DIC.
mod1 = jags.model(textConnection(mod1_string), data=data_jags, n.chains=3)
suppressMessages(update(mod1, 5e3))  # burn-in
mod1_sim = coda.samples(model=mod1, variable.names=params, n.iter=5e3)
mod1_csim = as.mcmc(do.call(rbind, mod1_sim))  # stack the chains
dic1 = dic.samples(mod1, n.iter=1e3)
## Convergence diagnostics
1) The traceplots show no obvious pattern.
2) After 5e3 burn-in updates I still observe autocorrelation in b[2] and in int, so I increase the burn-in to 5e4 (see the sketch after this list).
3) DIC: mean deviance 503.9, penalty 8.646, penalized deviance 512.5.
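A sketch of the longer burn-in, reusing the sampling call from above:
update(mod1, 5e4)  # extended burn-in to reduce autocorrelation in b[2] and int
mod1_sim = coda.samples(model=mod1, variable.names=params, n.iter=5e3)
mod1_csim = as.mcmc(do.call(rbind, mod1_sim))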
plot(mod1_sim)
gelman.diag(mod1_sim)
## Potential scale reduction factors:
##
## Point est. Upper C.I.
## b[1] 1.00 1.00
## b[2] 1.05 1.11
## b[3] 1.00 1.00
## b[4] 1.00 1.01
## b[5] 1.00 1.00
## b[6] 1.00 1.00
## b[7] 1.00 1.00
## b[8] 1.00 1.00
## b[9] 1.00 1.00
## int 1.04 1.10
##
## Multivariate psrf
##
## 1.02
autocorr.diag(mod1_sim)
## b[1] b[2] b[3] b[4] b[5]
## Lag 0 1.000000000 1.0000000 1.0000000000 1.000000000 1.0000000000
## Lag 1 0.287920926 0.9851922 0.3462278072 0.722909298 0.2658015636
## Lag 5 0.010262570 0.9378278 0.0008810413 0.205100190 0.0024193182
## Lag 10 0.007303032 0.8806068 0.0132742196 0.030675749 -0.0005156971
## Lag 50 0.005479209 0.5317571 -0.0033351594 0.005065785 -0.0052446834
## b[6] b[7] b[8] b[9] int
## Lag 0 1.000000000 1.000000000 1.000000000 1.000000000 1.0000000
## Lag 1 0.301678048 0.630132049 0.239460718 0.534791017 0.9852943
## Lag 5 0.004675731 0.172722918 -0.006375527 0.092841859 0.9369034
## Lag 10 0.004933605 0.036995541 -0.002855998 0.005109218 0.8799680
## Lag 50 -0.002915316 0.007974022 0.004067879 -0.006795221 0.5316811
autocorr.plot(mod1_sim)
effectiveSize(mod1_sim)
## b[1] b[2] b[3] b[4] b[5] b[6]
## 8172.85337 96.52570 6961.24000 2350.24891 8535.10192 7542.87353
## b[7] b[8] b[9] int
## 2810.94261 8762.84597 3950.24210 98.74925
## Summary statistics
The posterior distributions for tobacco, adiposity and alcohol are centered around 0, which suggests these variables contribute little to heart disease risk in this model. I remove them and compare the two models.
dic1
## Mean deviance: 394.9
## penalty 8.171
## Penalized deviance: 403.1
posterior <- mod1_csim[,1:9]
colnames(posterior) <- colnames(X)
posterior %>%
  as.data.frame() %>%
  select(-famhist) %>%
  melt() %>%
  ggplot(aes(x=value, fill=variable)) +
  geom_histogram(colour="black", size=0.1, binwidth=0.05) +
  facet_wrap(~variable, ncol=3) +
  theme_classic() +
  scale_fill_brewer(palette="Set1") +
  theme(legend.position="none") +
  xlim(-3, 3)
## Adjust the model to remove terms centered around 0
Here the autocorrelation disappears and the effective sample size is larger for all variables; a sketch of the adjusted fit follows.
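The chunk for model 2 is not shown in the original, so this sketch assumes the same workflow as model 1, with tobacco, adiposity and alcohol dropped:
mod2_string = " model {
    for (i in 1:length(y)) {
        y[i] ~ dbern(p[i])
        logit(p[i]) = int + b[1]*sbp[i] + b[2]*ldl[i] + b[3]*famhist[i] +
            b[4]*typea[i] + b[5]*obesity[i] + b[6]*age[i]
    }
    int ~ dnorm(0.0, 1.0/25.0)
    for (j in 1:6) {
        b[j] ~ ddexp(0.0, sqrt(2.0))
    }
} "
data_jags2 = list(y=dat[,10], sbp=X[,"sbp"], ldl=X[,"ldl"], famhist=X[,"famhist"],
                  typea=X[,"typea"], obesity=X[,"obesity"], age=X[,"age"])
mod2 = jags.model(textConnection(mod2_string), data=data_jags2, n.chains=3)
suppressMessages(update(mod2, 5e3))
mod2_sim = coda.samples(model=mod2, variable.names=params, n.iter=5e3)
mod2_csim = as.mcmc(do.call(rbind, mod2_sim))
dic2 = dic.samples(mod2, n.iter=1e3)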
## Comparing the models
DIC for model 1 is larger than for model 2, so model 2 is preferred and I will use it for prediction. Note, though, that the difference (7.8) is small relative to its sample standard error (26.7), so the evidence favoring model 2 is modest.
dic1
## Mean deviance: 394.9
## penalty 8.171
## Penalized deviance: 403.1
dic2
## Mean deviance: 388.3
## penalty 6.957
## Penalized deviance: 395.2
dic1 - dic2
## Difference: 7.843772
## Sample standard error: 26.6649
summary(mod2_sim)
##
## Iterations = 6001:11000
## Thinning interval = 1
## Number of chains = 3
## Sample size per chain = 5000
##
## 1. Empirical mean and standard deviation for each variable,
## plus standard error of the mean:
##
## Mean SD Naive SE Time-series SE
## b[1] 0.14946 0.1264 0.001032 0.001493
## b[2] 0.47494 0.1349 0.001101 0.001532
## b[3] 0.40884 0.1226 0.001001 0.001293
## b[4] 0.34692 0.1351 0.001103 0.001542
## b[5] -0.06677 0.1289 0.001052 0.001496
## b[6] 0.80272 0.1645 0.001343 0.002163
## int -0.87060 0.1362 0.001112 0.001631
##
## 2. Quantiles for each variable:
##
## 2.5% 25% 50% 75% 97.5%
## b[1] -0.08968 0.0631 0.1470 0.2336 0.4073
## b[2] 0.21582 0.3816 0.4739 0.5666 0.7386
## b[3] 0.16948 0.3266 0.4088 0.4910 0.6501
## b[4] 0.08586 0.2571 0.3455 0.4353 0.6193
## b[5] -0.32615 -0.1513 -0.0634 0.0181 0.1826
## b[6] 0.48890 0.6896 0.8020 0.9113 1.1381
## int -1.14168 -0.9611 -0.8686 -0.7798 -0.6092
posterior <- mod2_csim[,1:6]
colnames(posterior) <- colnames(X)[-c(2,4,8)]  # drop tobacco, adiposity, alcohol
posterior %>%
  as.data.frame() %>%
  select(-famhist) %>%
  melt() %>%
  ggplot(aes(x=value, fill=variable)) +
  geom_histogram(colour="black", size=0.1, binwidth=0.05) +
  facet_wrap(~variable, ncol=3) +
  theme_classic() +
  scale_fill_brewer(palette="Set1") +
  theme(legend.position="none") +
  xlim(-2, 2)
## Predict
Using the posterior-mean coefficients from model 2, accuracy on the training data is 0.72.
(pm_coef = colMeans(mod2_csim))
## b[1] b[2] b[3] b[4] b[5] b[6]
## 0.14946415 0.47494319 0.40884479 0.34692490 -0.06676623 0.80272040
## int
## -0.87059519
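As a cross-check (a sketch of mine; mod_glm2 is a name introduced here), the posterior means should be close to the maximum-likelihood estimates from a frequentist logistic fit on the same scaled predictors, since the Laplace prior is fairly weak:
mod_glm2 = glm(dat$chd ~ X[, c(1,3,5,6,7,9)], family=binomial)
round(coef(mod_glm2), 2)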
pm_Xb = pm_coef["int"] + X[, c(1,3,5,6,7,9)] %*% pm_coef[1:6]  # columns: sbp, ldl, famhist, typea, obesity, age
phat = 1.0 / (1.0 + exp(-pm_Xb))
head(phat)
## [,1]
## 217 0.3155766
## 353 0.7529948
## 413 0.8381924
## 20 0.6011533
## 4 0.7105916
## 335 0.5084351
plot(phat, jitter(dat[,10]))
(tab0.5 = table(phat > 0.5, data_jags2$y))
##
## 0 1
## FALSE 202 63
## TRUE 39 65
sum(diag(tab0.5)) / sum(tab0.5)
## [1] 0.7235772
#0.72
# Caveat: strictly, the test predictors should be standardized with the training-set
# centers and scales, e.g. scale(test[,-10], attr(X,"scaled:center"), attr(X,"scaled:scale"));
# rescaling the test set on its own is a simplification.
X.test = scale(test[,-10], center=TRUE, scale=TRUE)
pm_Xb = pm_coef["int"] + X.test[, c(1,3,5,6,7,9)] %*% pm_coef[1:6]
phat = 1.0 / (1.0 + exp(-pm_Xb))
head(phat)
## [,1]
## 8 0.6404793
## 19 0.8607690
## 30 0.6674311
## 31 0.1959541
## 36 0.1227429
## 54 0.1375159
plot(phat, jitter(test[,10]))
(tab0.5 = table(phat > 0.5, test[,10]))
##
## 0 1
## FALSE 49 16
## TRUE 12 16
sum(diag(tab0.5)) / sum(tab0.5)
## [1] 0.6989247
# 0.70
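Since only about a third of the subjects have CHD, the 0.5 cutoff trades sensitivity for specificity. A quick sketch of the test-set confusion table at a lower cutoff (0.3 is an arbitrary illustrative choice, not from the original analysis):
(tab0.3 = table(phat > 0.3, test[,10]))
sum(diag(tab0.3)) / sum(tab0.3)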