# This performs all of the prediction, except for the Stan model.

library(tidyverse)
library(Matrix)
library(irlba)
library(glmnet)
library(xgboost)

# Connect to the SQLite database
src <- src_sqlite("datathon_plus_missing.sqlite", create=FALSE)

# Some rows of the data were duplicated:
#   trans has 64,025,564 rows, 61,332,803 rows after deduplication
trans <- tbl(src, "trans") %>%
    distinct(Patient_ID, Drug_ID, Store_ID, Prescriber_ID, Dispense_Week)

# Pull every other table (patients, illness, ...) fully into memory
src_tbls(src)
env <- environment()
for(name in src_tbls(src))
    if (name != "trans")
        env[[name]] <- tbl(src, name) %>% collect(n=Inf)

# Collect data
n_patients <- 558352
n_train <- 279200
patient_ids <- 1:n_patients
train_ids <- 1:n_train
test_ids <- (n_train+1):n_patients

diabetes_drugs <- filter(illness, ChronicIllness == "Diabetes")$MasterProductID

# Training labels: training patients who bought a diabetes drug after the cutoff week
diabetic <- (trans %>%
    filter(Patient_ID < 279201,
           Dispense_Week > "2015-12-27",
           Drug_ID %in% diabetes_drugs) %>%
    group_by(Patient_ID) %>%
    summarize() %>%
    collect(n=Inf))$Patient_ID

y <- as.numeric(train_ids %in% diabetic)

# Patients who had already bought a diabetes drug before the cutoff
already_diabetic_ids <- (trans %>%
    filter(Dispense_Week <= "2015-12-27",
           Drug_ID %in% diabetes_drugs) %>%
    group_by(Patient_ID) %>%
    summarize() %>%
    collect(n=Inf))$Patient_ID

already_diabetic <- patient_ids %in% already_diabetic_ids

# Number of each drug purchased by each patient
drug_count <- trans %>%
    filter(Dispense_Week <= "2015-12-27") %>%
    group_by(Patient_ID, Drug_ID) %>%
    count() %>%
    collect(n=Inf)

drug_ids <- sort(unique(drug_count$Drug_ID))

# A bit sparse -- store in a sparse matrix
drug_count_mat <- sparseMatrix(
    i = match(drug_count$Patient_ID, patient_ids),
    j = match(drug_count$Drug_ID, drug_ids),
    x = drug_count$n,
    dims = c(length(patient_ids), length(drug_ids)))
colnames(drug_count_mat) <- paste0("drug", drug_ids)

# Get some further counts over limited times
get_mat <- function(from, to="2015-12-27", prefix="") {
    counts <- trans %>%
        filter(Dispense_Week >= from, Dispense_Week <= to) %>%
        group_by(Patient_ID, Drug_ID) %>%
        count() %>%
        collect(n=Inf)
    mat <- sparseMatrix(
        i = match(counts$Patient_ID, patient_ids),
        j = match(counts$Drug_ID, drug_ids),
        x = counts$n,
        dims = c(length(patient_ids), length(drug_ids)))
    colnames(mat) <- paste0(prefix, "drug", drug_ids)
    mat
}

drug_count_2015_mat <- get_mat(from="2015-01-01", prefix="from2015")
drug_count_2015_2_mat <- get_mat(from="2015-06-01", prefix="from2_2015")
drug_count_2014_mat <- get_mat(from="2014-01-01", prefix="from2014")
drug_count_2013_mat <- get_mat(from="2013-01-01", prefix="from2013")
drug_count_2012_mat <- get_mat(from="2012-01-01", prefix="from2012")

# Collate drugs by the chronic illness table
by_illness <- function(name, mat) {
    relevant <- filter(illness, ChronicIllness == name)$MasterProductID
    rowSums(mat[, drug_ids %in% relevant, drop=FALSE])
}

illnesses <- unique(illness$ChronicIllness)

illness_count_mat <- Matrix(do.call(cbind, lapply(illnesses, by_illness, drug_count_mat)))
colnames(illness_count_mat) <- paste0("illness", illnesses)

illness_count_2015_mat <- Matrix(do.call(cbind, lapply(illnesses, by_illness, drug_count_2015_mat)))
colnames(illness_count_2015_mat) <- paste0("from2015illness", illnesses)

# Count how often each patient bought from each prescriber
prescriber_count <- trans %>%
    filter(Dispense_Week <= "2015-12-27") %>%
    group_by(Patient_ID, Prescriber_ID) %>%
    count() %>%
    collect(n=Inf)

prescriber_ids <- sort(unique(prescriber_count$Prescriber_ID))

prescriber_count_mat <- sparseMatrix(
    i = match(prescriber_count$Patient_ID, patient_ids),
    j = match(prescriber_count$Prescriber_ID, prescriber_ids),
    x = prescriber_count$n,
    dims = c(length(patient_ids), length(prescriber_ids)))
colnames(prescriber_count_mat) <- paste0("prescriber", prescriber_ids)
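# All of the count matrices here use the same triplet-form sparseMatrix()
# construction: row = patient, column = drug/prescriber/store, value = number
# of purchases, with absent pairs stored implicitly as zero. A toy
# illustration (made-up numbers, not part of the pipeline):
#
#   sparseMatrix(i = c(1, 1, 2), j = c(1, 2, 2), x = c(3, 1, 2), dims = c(2, 2))
#   ## 2 x 2 sparse Matrix of class "dgCMatrix"
#   ## [1,] 3 1
#   ## [2,] . 2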
# Count how often each patient bought from each store
store_count <- trans %>%
    filter(Dispense_Week <= "2015-12-27") %>%
    group_by(Patient_ID, Store_ID) %>%
    count() %>%
    collect(n=Inf)

store_ids <- sort(unique(store_count$Store_ID))

store_count_mat <- sparseMatrix(
    i = match(store_count$Patient_ID, patient_ids),
    j = match(store_count$Store_ID, store_ids),
    x = store_count$n,
    dims = c(length(patient_ids), length(store_ids)))
colnames(store_count_mat) <- paste0("store", store_ids)

# Drop columns supported by fewer than `cutoff` patients
refine <- function(mat, cutoff) {
    mat[, colSums(mat > 0) >= cutoff, drop=FALSE]
}

# Normalize rows (by default, to sum to 1)
norm_rows <- function(mat, nmat=mat) mat / rowSums(nmat)

# Scale each column to unit standard deviation
scale_cols <- function(mat) {
    col_sd <- sqrt(colMeans(mat^2) - colMeans(mat)^2)
    mat %*% Diagonal(x=1.0/col_sd)
}

# Construct predictor matrices
gender <- factor(patients$gender, c("U","M","F"))
male <- gender == "M"
female <- gender == "F"

# A year_of_birth of 1900 appears to mark an unknown year; bracket unknown
# years with values just outside the observed range
year <- patients$year_of_birth
has_year <- year != 1900
year_min <- ifelse(has_year, year, min(year[has_year])-1)
year_max <- ifelse(has_year, year, max(year[has_year])+1)

postcode <- patients$postcode

model_base <- cbind(
    already_diabetic=already_diabetic,
    male=male,
    female=female,
    year_min=year_min,
    year_max=year_max)

# This is a large model matrix for use with tree-based prediction
model <- scale_cols(cbind(
    model_base,
    log1p(illness_count_mat),
    log1p(illness_count_2015_mat),
    refine(sparse.model.matrix(~0+postcode), 10),
    refine(log1p(drug_count_2015_2_mat), 10),
    refine(log1p(drug_count_2015_mat), 10),
    refine(log1p(drug_count_2014_mat), 10),
    refine(log1p(drug_count_2013_mat), 10),
    refine(log1p(drug_count_2012_mat), 10),
    refine(log1p(drug_count_mat), 10),
    refine(prescriber_count_mat, 300) > 0,
    norm_rows(store_count_mat)
))

# Help out xgboost with some principal components
decomp <- irlba(model, nv=10, center=colMeans(model))
aug <- Matrix(scale_cols(decomp$u), sparse=TRUE)
model <- cbind(aug, model)

# This is a smaller model matrix for logistic regression
lin_model <- scale_cols(cbind(
    model_base,
    log1p(illness_count_mat),
    log1p(illness_count_2015_mat),
    refine(sparse.model.matrix(~0+postcode), 10),
    refine(log1p(drug_count_2015_mat), 10),
    refine(log1p(drug_count_mat), 10),
    refine(prescriber_count_mat, 300) > 0,
    norm_rows(store_count_mat)
))

decomp <- irlba(lin_model, nv=20, center=colMeans(lin_model))
aug <- Matrix(scale_cols(decomp$u), sparse=TRUE)
lin_model <- cbind(aug, lin_model)

train_model <- model[train_ids,]
train_lin_model <- lin_model[train_ids,]

# Balanced bootstrap routine. n must be >= 2.
# - always produces a probability for every patient
# - each patient has the same total weight
# - approximates bootstrap aggregation better and better as n increases
# Note: auc() is not defined in this file; it is assumed to be a weighted-AUC
# helper with signature auc(y, prob, w), such as the one included in glmnet.
balboot <- function(n, predictor, ...) {
    # Deterministic sample of Poisson(1) bootstrap resampling counts,
    # permuted randomly across the n boots for each patient
    dist <- qpois((seq_len(n)-0.5)/n, lambda=1)
    print(dist)
    boots <- replicate(n_train, sample(dist))
    print(rowSums(boots))
    total <- rep(0, n_patients)
    total_n <- rep(0, n_patients)
    for(i in seq_len(n)) {
        cat("Boot",i,"of",n,"\n")
        p <- predictor(boots[i,], ...)
        # Average only out-of-bag predictions for training patients;
        # test patients are always included
        mask <- c(boots[i,] == 0, rep(TRUE, length(test_ids)))
        total <- total + mask * p
        total_n <- total_n + mask
    }
    p <- total / total_n
    cat("Result:", auc(y, p[train_ids]), "\n")
    p
}
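# A sketch of what the balanced bootstrap weights look like (illustrative
# only; assumes nothing beyond base R). For n = 60, dist holds these
# resampling counts, and each patient receives a random permutation of them
# across the 60 boots, so every patient's total weight is exactly sum(dist):
#
#   table(qpois((seq_len(60) - 0.5) / 60, lambda = 1))
#   ##  0  1  2  3  4
#   ## 22 22 11  4  1
#
# Each patient therefore sits out (weight 0) of 22 of the 60 boots, and those
# boots supply its out-of-bag prediction in the final average.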
# xgboost evaluation metric: AUC over the zero-weight (out-of-bag) rows
eval_auc <- function(preds, dtrain) {
    w <- which(getinfo(dtrain, "weight") == 0)
    err <- auc(y[w], preds[w])
    return(list(metric = "auc", value = err))
}

tree_predictor <- function(weights, lambda, alpha, initial_p=NULL, initial_p_weight=1.0) {
    xtrain <- xgb.DMatrix(train_model, label=y)
    setinfo(xtrain, "weight", weights)
    # Optionally start from another model's predictions via the base margin
    if (!is.null(initial_p))
        setinfo(xtrain, "base_margin", qlogis(initial_p[train_ids]) * initial_p_weight)

    xfull <- xgb.DMatrix(model)
    if (!is.null(initial_p))
        setinfo(xfull, "base_margin", qlogis(initial_p) * initial_p_weight)

    nrounds <- 10000
    param <- list(eta=0.1, lambda=lambda, alpha=alpha, max_depth=12)
    fit <- xgboost(
        data=xtrain, objective="binary:logistic", feval=eval_auc,
        nrounds=nrounds, maximize=TRUE, early_stopping_rounds=20,
        params=param)
    # Early stopping monitors out-of-bag AUC (see eval_auc); use the best round
    n_trees <- which.max(fit$evaluation_log$train_auc)
    cat(n_trees, fit$evaluation_log$train_auc[n_trees], "\n")
    predict(fit, xfull, ntreelimit=n_trees)
}

set.seed(1001)
p_tree <- balboot(60, tree_predictor, lambda=50, alpha=10)

set.seed(1009)
p_tree_2 <- balboot(60, tree_predictor, lambda=0, alpha=40)

# Elastic-net logistic regression, with lambda tuned on the out-of-bag
# (zero-weight) part of `relevant`
glmnet_predictor <- function(weights, alpha=0.01, max_lambda=1.0, step_lambda=0.01, relevant=train_ids) {
    gfit <- glmnet(train_lin_model, y, weights=weights, family="binomial", alpha=alpha)
    tune <- relevant[which(weights[relevant] == 0)]
    f <- function(s) {
        cat(s, "")
        p <- predict(gfit, lin_model[tune,,drop=FALSE], s=s, type="response") %>% as.vector
        auc(y[tune], p)
    }
    lambdas <- seq(0, max_lambda, by=step_lambda)
    scores <- map_dbl(lambdas, f)
    best <- which.max(scores)
    lambda <- lambdas[best]
    cat("\nlambda", lambda, scores[best], "\n")
    predict(gfit, lin_model, s=lambda, type="response") %>% as.vector
}

set.seed(1003)
p_glmnet <- balboot(60, glmnet_predictor, alpha=0.01)

set.seed(1005)
p_glmnet_1 <- balboot(60, glmnet_predictor, alpha=0.0)

# Fit a separate model for each level of fac (e.g. already diabetic or not),
# stitching the per-level predictions back together
make_split_predictor <- function(fac, func, ...) {
    lev <- unique(fac)
    fac <- match(fac, lev)
    n <- length(lev)
    function(weights) {
        p <- rep(NA, length(fac))
        for(i in seq_len(n)) {
            cat(lev[i], "")
            this_weights <- weights
            this_weights[fac[train_ids] != i] <- 0
            this_p <- func(this_weights, relevant=which(fac[train_ids] == i), ...)
            p[fac == i] <- this_p[fac == i]
        }
        p
    }
}

set.seed(1006)
p_split_glmnet <- balboot(60,
    make_split_predictor(already_diabetic, glmnet_predictor,
        max_lambda=0.5, step_lambda=0.01))

set.seed(1007)
p_split_glmnet_1 <- balboot(60,
    make_split_predictor(already_diabetic, glmnet_predictor,
        alpha=0.0, max_lambda=5.0, step_lambda=0.1))

# Run XGBoost, but initialized with predictions from glmnet
set.seed(1010)
p_tree_3 <- balboot(60, tree_predictor, lambda=0, alpha=50,
    initial_p=p_glmnet, initial_p_weight=0.5)

# Blend the pooled predictions using convex weights chosen to maximize
# weighted AUC. optim() (Nelder-Mead by default) searches over the first
# n-1 weights; the last weight is 1 minus their sum, and points outside
# the simplex score zero.
pool_blend_predictor <- function(weights) {
    n <- ncol(pool)
    blend <- function(x) {
        x <- c(x, 1-sum(x))
        colSums(t(pool)*x)
    }
    f <- function(x) {
        if (any(x < 0) || sum(x) > 1) return(0.0)
        -auc(y, blend(x)[train_ids], weights)
    }
    result <- optim(rep(1/n, n-1), f)
    cat(-result$value, "/", result$par, 1-sum(result$par), "\n")
    blend(result$par)
}

set.seed(1004)
pool <- cbind(p_glmnet, p_glmnet_1, p_split_glmnet, p_split_glmnet_1,
              p_tree_3, p_tree_2, p_tree)
p_pool <- balboot(60, pool_blend_predictor)

data_frame(Patient_ID = patient_ids[test_ids], Diabetes = p_pool[test_ids]) %>%
    write_csv("prediction_without_stan.csv")

save.image("checkpoint.RData")
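# Optional sanity checks on the final output (an illustrative sketch, not part
# of the original pipeline; run after the script completes):
#
#   stopifnot(!any(is.na(p_pool)))              # balboot covers every patient
#   stopifnot(all(p_pool >= 0 & p_pool <= 1))   # a convex blend of probabilities
#   length(test_ids)                            # the CSV should have 279152 rows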