library(tidyverse)
library(Matrix)
library(jsonlite)
library(xgboost)
library(broom)
library(GGally)

md <- src_sqlite("MelbDatathon2017.sqlite", create=FALSE)

# Only use patients in the kaggle training set
max_patient_id <- 279200
patient_ids <- 1:max_patient_id

patients <- tbl(md,"Patients") %>% collect(n=Inf) %>%
    mutate(
        Patient_ID = as.integer(Patient_ID),
        age=ifelse(year_of_birth==1900,NA,2016-year_of_birth)) %>%
    filter(Patient_ID <= max_patient_id)

drug_illness <- tbl(md,"ChronicIllness") %>%
    transmute(illness=ChronicIllness, Drug_ID=as.integer(MasterProductID)) %>%
    collect(n=Inf) %>%
    mutate(illness=ifelse(illness=="Chronic Obstructive Pulmonary Disease (COPD)","COPD",illness))

drugs <- tbl(md,"Drugs") %>% collect(n=Inf) %>%
    rename(Drug_ID=MasterProductID) %>%
    mutate(Drug_ID=as.integer(Drug_ID)) %>%
    left_join(drug_illness, by="Drug_ID")

patient_drug_pre2016 <- tbl(md,"Transactions") %>%
    filter(
        Patient_ID <= max_patient_id,
        DISPENSE_WEEK < "2016-01-01") %>%
    distinct(Patient_ID, Drug_ID) %>%
    collect(n=Inf) %>% ungroup()

patient_illness_pre2016 <- inner_join(patient_drug_pre2016, drug_illness, by="Drug_ID") %>%
    distinct(Patient_ID, illness)


patient_drug_2016 <- tbl(md,"Transactions") %>%
    filter(
        Patient_ID <= max_patient_id,
        DISPENSE_WEEK >= "2016-01-01") %>%
    distinct(Patient_ID, Drug_ID) %>%
    collect(n=Inf) %>% ungroup()

patient_illness_2016 <- inner_join(patient_drug_2016, drug_illness, by="Drug_ID") %>%
    distinct(Patient_ID, illness)

Predicting diabetes in 2016 based on past chronic illness

Logistic regression, the predictor variables are indicator variables for whether a patient has taken a drug associated with a chronic illness in the past.

y <- patient_ids %in% filter(patient_illness_2016, illness=="Diabetes")$Patient_ID
table(y)
y
 FALSE   TRUE 
224854  54346 
illnesses <- sort(unique(drug_illness$illness))

get_illness_col <- function(illness_wanted) {
    result <- cbind(as.numeric(patient_ids %in% filter(patient_illness_pre2016, illness==illness_wanted)$Patient_ID))
    colnames(result) <- illness_wanted
    result
}
x <- data.frame(lapply(illnesses, get_illness_col))

fit <- glm(y ~ ., data=x, family="binomial")
summary(fit)

Call:
glm(formula = y ~ ., family = "binomial", data = x)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9654  -0.2030  -0.1711  -0.1483   3.1787  

Coefficients:
               Estimate Std. Error  z value Pr(>|z|)    
(Intercept)    -4.21676    0.02456 -171.677  < 2e-16 ***
Anti.Coagulant -0.09212    0.03524   -2.614  0.00894 ** 
COPD           -0.12032    0.02937   -4.097 4.18e-05 ***
Depression     -0.17879    0.01965   -9.100  < 2e-16 ***
Diabetes        5.55266    0.02037  272.576  < 2e-16 ***
Epilepsy       -0.12389    0.02612   -4.744 2.10e-06 ***
Heart.Failure  -0.07967    0.01924   -4.141 3.46e-05 ***
Hypertension    0.01431    0.01817    0.788  0.43092    
Immunology     -0.15469    0.12524   -1.235  0.21679    
Lipids          0.42454    0.02219   19.134  < 2e-16 ***
Osteoporosis   -0.36857    0.03929   -9.382  < 2e-16 ***
Urology        -0.07445    0.04452   -1.672  0.09445 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 275232  on 279199  degrees of freedom
Residual deviance:  93115  on 279188  degrees of freedom
AIC: 93139

Number of Fisher Scoring iterations: 7
df <- tidy(fit) %>%
    mutate(conf95_low=estimate-std.error*1.96, conf95_high=estimate+std.error*1.96) %>%
    arrange(term != "(Intercept)", -abs(estimate)) %>%
    select(term, estimate, conf95_low, conf95_high) 
df %>%
    mutate(estimate=round(estimate,3), conf95_low=round(conf95_low,3), conf95_high=round(conf95_high,3))

Viewing these coefficients in a plot, with confidence intervals:

#Confidence interval calculation is very slow
#ggcoef(fit)
ggplot(df,aes(y=factor(term,rev(term)),x=estimate,xmin=conf95_low,xmax=conf95_high)) + 
    geom_point() + geom_errorbarh(height=0) + geom_vline(xintercept=0) + labs(y="",x="contribution to log odds")

Drugs predictive of diabetes in 2016

Using L1 regularization to produce a sparse logistic regression model. glmnet was very slow for this size of model with L1 regularization, so I’ve used XGBoost’s linear mode. The predictor variables are indicator variables for whether a patient has taken a drug in the past.

drug_ids <- sort(unique(patient_drug_pre2016$Drug_ID))

x_drug <- sparseMatrix(
    i=patient_drug_pre2016$Patient_ID,
    j=match(patient_drug_pre2016$Drug_ID, drug_ids),
    x=rep(1, nrow(patient_drug_pre2016)),
    dims=c(max_patient_id, length(drug_ids)))
colnames(x_drug) <- as.character(drug_ids)

# Alpha chosen simply to produce a manageable list of nontrivial coefficients
param <- list(booster="gblinear", eta=0.1, alpha=100)
xfit_drug <- xgboost(data=x_drug, label=y, objective="binary:logistic", param=param, 
    nrounds=300, print_every_n=50)
[1] train-error:0.133983 
[51]    train-error:0.076100 
[101]   train-error:0.076411 
[151]   train-error:0.076468 
[201]   train-error:0.076468 
[251]   train-error:0.076472 
[300]   train-error:0.076479 
dump <- xgb.dump(xfit_drug, dump_format="json") %>% fromJSON
df <- data_frame(
        Drug_ID = as.integer(colnames(x_drug)),
        coef = dump$weight[[1]]) %>%
    left_join(drugs, by="Drug_ID") %>%
    arrange(-abs(coef)) %>%
    filter(abs(coef) >= 0.01) %>%
    select(coef, illness, MasterProductFullName, GenericIngredientName)

df %>% arrange(-coef) %>% mutate(coef=round(coef,digits=3))

Unsurprisingly, taking a diabetes drug prior to 2016 predicts continuing to take a diabetes drug in 2016.

Let us examine the non-diabetes drugs in this table:

df %>% filter(is.na(illness) | illness != "Diabetes") %>% arrange(-coef) %>% mutate(coef=round(coef,digits=3))

Proton-pump inhibitors may be interesting (the “-prazole”s):

https://www.ncbi.nlm.nih.gov/pubmed/22886351 etc

LS0tCnRpdGxlOiAiUHJlZGljdGlvbiBvZiBkaWFiZXRlcywgaW4gYW4gaW50ZXJwcmV0YWJsZSB3YXkiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCmBgYHtyIHdhcm5pbmc9RkFMU0UsIG1lc3NhZ2U9RkFMU0V9CmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KE1hdHJpeCkKbGlicmFyeShqc29ubGl0ZSkKbGlicmFyeSh4Z2Jvb3N0KQpsaWJyYXJ5KGJyb29tKQpsaWJyYXJ5KEdHYWxseSkKCm1kIDwtIHNyY19zcWxpdGUoIk1lbGJEYXRhdGhvbjIwMTcuc3FsaXRlIiwgY3JlYXRlPUZBTFNFKQoKIyBPbmx5IHVzZSBwYXRpZW50cyBpbiB0aGUga2FnZ2xlIHRyYWluaW5nIHNldAptYXhfcGF0aWVudF9pZCA8LSAyNzkyMDAKcGF0aWVudF9pZHMgPC0gMTptYXhfcGF0aWVudF9pZAoKcGF0aWVudHMgPC0gdGJsKG1kLCJQYXRpZW50cyIpICU+JSBjb2xsZWN0KG49SW5mKSAlPiUKICAgIG11dGF0ZSgKICAgICAgICBQYXRpZW50X0lEID0gYXMuaW50ZWdlcihQYXRpZW50X0lEKSwKICAgICAgICBhZ2U9aWZlbHNlKHllYXJfb2ZfYmlydGg9PTE5MDAsTkEsMjAxNi15ZWFyX29mX2JpcnRoKSkgJT4lCiAgICBmaWx0ZXIoUGF0aWVudF9JRCA8PSBtYXhfcGF0aWVudF9pZCkKCmRydWdfaWxsbmVzcyA8LSB0YmwobWQsIkNocm9uaWNJbGxuZXNzIikgJT4lCiAgICB0cmFuc211dGUoaWxsbmVzcz1DaHJvbmljSWxsbmVzcywgRHJ1Z19JRD1hcy5pbnRlZ2VyKE1hc3RlclByb2R1Y3RJRCkpICU+JQogICAgY29sbGVjdChuPUluZikgJT4lCiAgICBtdXRhdGUoaWxsbmVzcz1pZmVsc2UoaWxsbmVzcz09IkNocm9uaWMgT2JzdHJ1Y3RpdmUgUHVsbW9uYXJ5IERpc2Vhc2UgKENPUEQpIiwiQ09QRCIsaWxsbmVzcykpCgpkcnVncyA8LSB0YmwobWQsIkRydWdzIikgJT4lIGNvbGxlY3Qobj1JbmYpICU+JQogICAgcmVuYW1lKERydWdfSUQ9TWFzdGVyUHJvZHVjdElEKSAlPiUKICAgIG11dGF0ZShEcnVnX0lEPWFzLmludGVnZXIoRHJ1Z19JRCkpICU+JQogICAgbGVmdF9qb2luKGRydWdfaWxsbmVzcywgYnk9IkRydWdfSUQiKQoKcGF0aWVudF9kcnVnX3ByZTIwMTYgPC0gdGJsKG1kLCJUcmFuc2FjdGlvbnMiKSAlPiUKICAgIGZpbHRlcigKICAgICAgICBQYXRpZW50X0lEIDw9IG1heF9wYXRpZW50X2lkLAogICAgICAgIERJU1BFTlNFX1dFRUsgPCAiMjAxNi0wMS0wMSIpICU+JQogICAgZGlzdGluY3QoUGF0aWVudF9JRCwgRHJ1Z19JRCkgJT4lCiAgICBjb2xsZWN0KG49SW5mKSAlPiUgdW5ncm91cCgpCgpwYXRpZW50X2lsbG5lc3NfcHJlMjAxNiA8LSBpbm5lcl9qb2luKHBhdGllbnRfZHJ1Z19wcmUyMDE2LCBkcnVnX2lsbG5lc3MsIGJ5PSJEcnVnX0lEIikgJT4lCiAgICBkaXN0aW5jdChQYXRpZW50X0lELCBpbGxuZXNzKQoKCnBhdGllbnRfZHJ1Z18yMDE2IDwtIHRibChtZCwiVHJhbnNhY3Rpb25zIikgJT4lCiAgICBmaWx0ZXIoCiAgICAgICAgUGF0aWVudF9JRCA8PSBtYXhfcGF0aWVudF9pZCwKICAgICAgICBESVNQRU5TRV9XRUVLID49ICIyMDE2LTAxLTAxIikgJT4lCiAgICBkaXN0aW5jdChQYXRpZW50X0lELCBEcnVnX0lEKSAlPiUKICAgIGNvbGxlY3Qobj1JbmYpICU+JSB1bmdyb3VwKCkKCnBhdGllbnRfaWxsbmVzc18yMDE2IDwtIGlubmVyX2pvaW4ocGF0aWVudF9kcnVnXzIwMTYsIGRydWdfaWxsbmVzcywgYnk9IkRydWdfSUQiKSAlPiUKICAgIGRpc3RpbmN0KFBhdGllbnRfSUQsIGlsbG5lc3MpCgpgYGAKCgojIyBQcmVkaWN0aW5nIGRpYWJldGVzIGluIDIwMTYgYmFzZWQgb24gcGFzdCBjaHJvbmljIGlsbG5lc3MKCkxvZ2lzdGljIHJlZ3Jlc3Npb24sIHRoZSBwcmVkaWN0b3IgdmFyaWFibGVzIGFyZSBpbmRpY2F0b3IgdmFyaWFibGVzIGZvciB3aGV0aGVyIGEgcGF0aWVudCBoYXMgdGFrZW4gYSBkcnVnIGFzc29jaWF0ZWQgd2l0aCBhIGNocm9uaWMgaWxsbmVzcyBpbiB0aGUgcGFzdC4KCmBgYHtyIHJvd3MucHJpbnQ9MjB9CnkgPC0gcGF0aWVudF9pZHMgJWluJSBmaWx0ZXIocGF0aWVudF9pbGxuZXNzXzIwMTYsIGlsbG5lc3M9PSJEaWFiZXRlcyIpJFBhdGllbnRfSUQKdGFibGUoeSkKCmlsbG5lc3NlcyA8LSBzb3J0KHVuaXF1ZShkcnVnX2lsbG5lc3MkaWxsbmVzcykpCgpnZXRfaWxsbmVzc19jb2wgPC0gZnVuY3Rpb24oaWxsbmVzc193YW50ZWQpIHsKICAgIHJlc3VsdCA8LSBjYmluZChhcy5udW1lcmljKHBhdGllbnRfaWRzICVpbiUgZmlsdGVyKHBhdGllbnRfaWxsbmVzc19wcmUyMDE2LCBpbGxuZXNzPT1pbGxuZXNzX3dhbnRlZCkkUGF0aWVudF9JRCkpCiAgICBjb2xuYW1lcyhyZXN1bHQpIDwtIGlsbG5lc3Nfd2FudGVkCiAgICByZXN1bHQKfQp4IDwtIGRhdGEuZnJhbWUobGFwcGx5KGlsbG5lc3NlcywgZ2V0X2lsbG5lc3NfY29sKSkKCmZpdCA8LSBnbG0oeSB+IC4sIGRhdGE9eCwgZmFtaWx5PSJiaW5vbWlhbCIpCnN1bW1hcnkoZml0KQoKZGYgPC0gdGlkeShmaXQpICU+JQogICAgbXV0YXRlKGNvbmY5NV9sb3c9ZXN0aW1hdGUtc3RkLmVycm9yKjEuOTYsIGNvbmY5NV9oaWdoPWVzdGltYXRlK3N0ZC5lcnJvcioxLjk2KSAlPiUKICAgIGFycmFuZ2UodGVybSAhPSAiKEludGVyY2VwdCkiLCAtYWJzKGVzdGltYXRlKSkgJT4lCiAgICBzZWxlY3QodGVybSwgZXN0aW1hdGUsIGNvbmY5NV9sb3csIGNvbmY5NV9oaWdoKSAKZGYgJT4lCiAgICBtdXRhdGUoZXN0aW1hdGU9cm91bmQoZXN0aW1hdGUsMyksIGNvbmY5NV9sb3c9cm91bmQoY29uZjk1X2xvdywzKSwgY29uZjk1X2hpZ2g9cm91bmQoY29uZjk1X2hpZ2gsMykpCmBgYAoKVmlld2luZyB0aGVzZSBjb2VmZmljaWVudHMgaW4gYSBwbG90LCB3aXRoIGNvbmZpZGVuY2UgaW50ZXJ2YWxzOgoKYGBge3J9CiNDb25maWRlbmNlIGludGVydmFsIGNhbGN1bGF0aW9uIGlzIHZlcnkgc2xvdwojZ2djb2VmKGZpdCkKZ2dwbG90KGRmLGFlcyh5PWZhY3Rvcih0ZXJtLHJldih0ZXJtKSkseD1lc3RpbWF0ZSx4bWluPWNvbmY5NV9sb3cseG1heD1jb25mOTVfaGlnaCkpICsgCiAgICBnZW9tX3BvaW50KCkgKyBnZW9tX2Vycm9yYmFyaChoZWlnaHQ9MCkgKyBnZW9tX3ZsaW5lKHhpbnRlcmNlcHQ9MCkgKyBsYWJzKHk9IiIseD0iY29udHJpYnV0aW9uIHRvIGxvZyBvZGRzIikKYGBgCgoKIyMgRHJ1Z3MgcHJlZGljdGl2ZSBvZiBkaWFiZXRlcyBpbiAyMDE2CgpVc2luZyBMMSByZWd1bGFyaXphdGlvbiB0byBwcm9kdWNlIGEgc3BhcnNlIGxvZ2lzdGljIHJlZ3Jlc3Npb24gbW9kZWwuIGdsbW5ldCB3YXMgdmVyeSBzbG93IGZvciB0aGlzIHNpemUgb2YgbW9kZWwgd2l0aCBMMSByZWd1bGFyaXphdGlvbiwgc28gSSd2ZSB1c2VkIFhHQm9vc3QncyBsaW5lYXIgbW9kZS4gVGhlIHByZWRpY3RvciB2YXJpYWJsZXMgYXJlIGluZGljYXRvciB2YXJpYWJsZXMgZm9yIHdoZXRoZXIgYSBwYXRpZW50IGhhcyB0YWtlbiBhIGRydWcgaW4gdGhlIHBhc3QuCgpgYGB7ciByb3dzLnByaW50PTEwfQpkcnVnX2lkcyA8LSBzb3J0KHVuaXF1ZShwYXRpZW50X2RydWdfcHJlMjAxNiREcnVnX0lEKSkKCnhfZHJ1ZyA8LSBzcGFyc2VNYXRyaXgoCiAgICBpPXBhdGllbnRfZHJ1Z19wcmUyMDE2JFBhdGllbnRfSUQsCiAgICBqPW1hdGNoKHBhdGllbnRfZHJ1Z19wcmUyMDE2JERydWdfSUQsIGRydWdfaWRzKSwKICAgIHg9cmVwKDEsIG5yb3cocGF0aWVudF9kcnVnX3ByZTIwMTYpKSwKICAgIGRpbXM9YyhtYXhfcGF0aWVudF9pZCwgbGVuZ3RoKGRydWdfaWRzKSkpCmNvbG5hbWVzKHhfZHJ1ZykgPC0gYXMuY2hhcmFjdGVyKGRydWdfaWRzKQoKIyBBbHBoYSBjaG9zZW4gc2ltcGx5IHRvIHByb2R1Y2UgYSBtYW5hZ2VhYmxlIGxpc3Qgb2Ygbm9udHJpdmlhbCBjb2VmZmljaWVudHMKcGFyYW0gPC0gbGlzdChib29zdGVyPSJnYmxpbmVhciIsIGV0YT0wLjEsIGFscGhhPTEwMCkKeGZpdF9kcnVnIDwtIHhnYm9vc3QoZGF0YT14X2RydWcsIGxhYmVsPXksIG9iamVjdGl2ZT0iYmluYXJ5OmxvZ2lzdGljIiwgcGFyYW09cGFyYW0sIAogICAgbnJvdW5kcz0zMDAsIHByaW50X2V2ZXJ5X249NTApCgpkdW1wIDwtIHhnYi5kdW1wKHhmaXRfZHJ1ZywgZHVtcF9mb3JtYXQ9Impzb24iKSAlPiUgZnJvbUpTT04KZGYgPC0gZGF0YV9mcmFtZSgKICAgICAgICBEcnVnX0lEID0gYXMuaW50ZWdlcihjb2xuYW1lcyh4X2RydWcpKSwKICAgICAgICBjb2VmID0gZHVtcCR3ZWlnaHRbWzFdXSkgJT4lCiAgICBsZWZ0X2pvaW4oZHJ1Z3MsIGJ5PSJEcnVnX0lEIikgJT4lCiAgICBhcnJhbmdlKC1hYnMoY29lZikpICU+JQogICAgZmlsdGVyKGFicyhjb2VmKSA+PSAwLjAxKSAlPiUKICAgIHNlbGVjdChjb2VmLCBpbGxuZXNzLCBNYXN0ZXJQcm9kdWN0RnVsbE5hbWUsIEdlbmVyaWNJbmdyZWRpZW50TmFtZSkKCmRmICU+JSBhcnJhbmdlKC1jb2VmKSAlPiUgbXV0YXRlKGNvZWY9cm91bmQoY29lZixkaWdpdHM9MykpCmBgYAoKVW5zdXJwcmlzaW5nbHksIHRha2luZyBhIGRpYWJldGVzIGRydWcgcHJpb3IgdG8gMjAxNiBwcmVkaWN0cyBjb250aW51aW5nIHRvIHRha2UgYSBkaWFiZXRlcyBkcnVnIGluIDIwMTYuCgpMZXQgdXMgZXhhbWluZSB0aGUgbm9uLWRpYWJldGVzIGRydWdzIGluIHRoaXMgdGFibGU6CgpgYGB7ciByb3dzLnByaW50PTEwMH0KZGYgJT4lIGZpbHRlcihpcy5uYShpbGxuZXNzKSB8IGlsbG5lc3MgIT0gIkRpYWJldGVzIikgJT4lIGFycmFuZ2UoLWNvZWYpICU+JSBtdXRhdGUoY29lZj1yb3VuZChjb2VmLGRpZ2l0cz0zKSkKYGBgCgpQcm90b24tcHVtcCBpbmhpYml0b3JzIG1heSBiZSBpbnRlcmVzdGluZyAodGhlICItcHJhem9sZSJzKToKCmh0dHBzOi8vd3d3Lm5jYmkubmxtLm5paC5nb3YvcHVibWVkLzIyODg2MzUxCmV0YwoKCg==