# This performs all of the prediction, except for the Stan model.

library(tidyverse)
library(Matrix)
library(irlba)
library(glmnet)
library(xgboost)

# Connect to the SQLite database
src <- src_sqlite("datathon_plus_missing.sqlite", create=FALSE)

# Some rows of the data were duplicated:
#   trans has 64,025,564 rows, 61,332,803 rows after deduplication
trans <- tbl(src, "trans") %>%
    distinct(Patient_ID, Drug_ID, Store_ID, Prescriber_ID, Dispense_Week)

# Pull every other table (patients, illness, ...) fully into memory
src_tbls(src)
env <- environment()
for(name in src_tbls(src))
    if (name != "trans")
        env[[name]] <- tbl(src, name) %>% collect(n=Inf)

# Collect data
n_patients <- 558352
n_train <- 279200
patient_ids <- 1:n_patients
train_ids <- 1:n_train
test_ids <- (n_train+1):n_patients

diabetes_drugs <- filter(illness, ChronicIllness == "Diabetes")$MasterProductID

# Training labels: training patients who bought a diabetes drug after the cutoff week
diabetic <- (trans %>%
    filter(Patient_ID < 279201,
           Dispense_Week > "2015-12-27",
           Drug_ID %in% diabetes_drugs) %>%
    group_by(Patient_ID) %>%
    summarize() %>%
    collect(n=Inf))$Patient_ID

y <- as.numeric(train_ids %in% diabetic)

# Patients who had already bought a diabetes drug before the cutoff
already_diabetic_ids <- (trans %>%
    filter(Dispense_Week <= "2015-12-27",
           Drug_ID %in% diabetes_drugs) %>%
    group_by(Patient_ID) %>%
    summarize() %>%
    collect(n=Inf))$Patient_ID

already_diabetic <- patient_ids %in% already_diabetic_ids

# Number of each drug purchased by each patient
drug_count <- trans %>%
    filter(Dispense_Week <= "2015-12-27") %>%
    group_by(Patient_ID, Drug_ID) %>%
    count() %>%
    collect(n=Inf)

drug_ids <- sort(unique(drug_count$Drug_ID))

# A bit sparse -- store in a sparse matrix
drug_count_mat <- sparseMatrix(
    i = match(drug_count$Patient_ID, patient_ids),
    j = match(drug_count$Drug_ID, drug_ids),
    x = drug_count$n,
    dims = c(length(patient_ids), length(drug_ids)))
colnames(drug_count_mat) <- paste0("drug", drug_ids)

# Get some further counts over limited times
get_mat <- function(from, to="2015-12-27", prefix="") {
    counts <- trans %>%
        filter(Dispense_Week >= from, Dispense_Week <= to) %>%
        group_by(Patient_ID, Drug_ID) %>%
        count() %>%
        collect(n=Inf)
    mat <- sparseMatrix(
        i = match(counts$Patient_ID, patient_ids),
        j = match(counts$Drug_ID, drug_ids),
        x = counts$n,
        dims = c(length(patient_ids), length(drug_ids)))
    colnames(mat) <- paste0(prefix, "drug", drug_ids)
    mat
}

drug_count_2015_mat <- get_mat(from="2015-01-01", prefix="from2015")
drug_count_2015_2_mat <- get_mat(from="2015-06-01", prefix="from2_2015")
drug_count_2014_mat <- get_mat(from="2014-01-01", prefix="from2014")
drug_count_2013_mat <- get_mat(from="2013-01-01", prefix="from2013")
drug_count_2012_mat <- get_mat(from="2012-01-01", prefix="from2012")

# Collate drugs by the chronic illness table
by_illness <- function(name, mat) {
    relevant <- filter(illness, ChronicIllness == name)$MasterProductID
    rowSums(mat[, drug_ids %in% relevant, drop=FALSE])
}

illnesses <- unique(illness$ChronicIllness)

illness_count_mat <- Matrix(do.call(cbind, lapply(illnesses, by_illness, drug_count_mat)))
colnames(illness_count_mat) <- paste0("illness", illnesses)

illness_count_2015_mat <- Matrix(do.call(cbind, lapply(illnesses, by_illness, drug_count_2015_mat)))
colnames(illness_count_2015_mat) <- paste0("from2015illness", illnesses)

# Count how often each patient bought from each prescriber
prescriber_count <- trans %>%
    filter(Dispense_Week <= "2015-12-27") %>%
    group_by(Patient_ID, Prescriber_ID) %>%
    count() %>%
    collect(n=Inf)

prescriber_ids <- sort(unique(prescriber_count$Prescriber_ID))

prescriber_count_mat <- sparseMatrix(
    i = match(prescriber_count$Patient_ID, patient_ids),
    j = match(prescriber_count$Prescriber_ID, prescriber_ids),
    x = prescriber_count$n,
    dims = c(length(patient_ids), length(prescriber_ids)))
colnames(prescriber_count_mat) <- paste0("prescriber", prescriber_ids)
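# All of the count matrices here use the same triplet-form sparseMatrix()
# construction: row = patient, column = drug/prescriber/store, value = number
# of purchases, with absent pairs stored implicitly as zero. A toy
# illustration (made-up numbers, not part of the pipeline):
#
#   sparseMatrix(i = c(1, 1, 2), j = c(1, 2, 2), x = c(3, 1, 2), dims = c(2, 2))
#   ## 2 x 2 sparse Matrix of class "dgCMatrix"
#   ## [1,] 3 1
#   ## [2,] . 2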
# Count how often each patient bought from each store
store_count <- trans %>%
    filter(Dispense_Week <= "2015-12-27") %>%
    group_by(Patient_ID, Store_ID) %>%
    count() %>%
    collect(n=Inf)

store_ids <- sort(unique(store_count$Store_ID))

store_count_mat <- sparseMatrix(
    i = match(store_count$Patient_ID, patient_ids),
    j = match(store_count$Store_ID, store_ids),
    x = store_count$n,
    dims = c(length(patient_ids), length(store_ids)))
colnames(store_count_mat) <- paste0("store", store_ids)

# Drop columns supported by fewer than `cutoff` patients
refine <- function(mat, cutoff) {
    mat[, colSums(mat > 0) >= cutoff, drop=FALSE]
}

# Normalize rows (by default, to sum to 1)
norm_rows <- function(mat, nmat=mat) mat / rowSums(nmat)

# Scale each column to unit standard deviation
scale_cols <- function(mat) {
    col_sd <- sqrt(colMeans(mat^2) - colMeans(mat)^2)
    mat %*% Diagonal(x=1.0/col_sd)
}

# Construct predictor matrices
gender <- factor(patients$gender, c("U","M","F"))
male <- gender == "M"
female <- gender == "F"

# A year_of_birth of 1900 appears to mark an unknown year; bracket unknown
# years with values just outside the observed range
year <- patients$year_of_birth
has_year <- year != 1900
year_min <- ifelse(has_year, year, min(year[has_year])-1)
year_max <- ifelse(has_year, year, max(year[has_year])+1)

postcode <- patients$postcode

model_base <- cbind(
    already_diabetic=already_diabetic,
    male=male,
    female=female,
    year_min=year_min,
    year_max=year_max)

# This is a large model matrix for use with tree-based prediction
model <- scale_cols(cbind(
    model_base,
    log1p(illness_count_mat),
    log1p(illness_count_2015_mat),
    refine(sparse.model.matrix(~0+postcode), 10),
    refine(log1p(drug_count_2015_2_mat), 10),
    refine(log1p(drug_count_2015_mat), 10),
    refine(log1p(drug_count_2014_mat), 10),
    refine(log1p(drug_count_2013_mat), 10),
    refine(log1p(drug_count_2012_mat), 10),
    refine(log1p(drug_count_mat), 10),
    refine(prescriber_count_mat, 300) > 0,
    norm_rows(store_count_mat)
))

# Help out xgboost with some principal components
decomp <- irlba(model, nv=10, center=colMeans(model))
aug <- Matrix(scale_cols(decomp$u), sparse=TRUE)
model <- cbind(aug, model)

# This is a smaller model matrix for logistic regression
lin_model <- scale_cols(cbind(
    model_base,
    log1p(illness_count_mat),
    log1p(illness_count_2015_mat),
    refine(sparse.model.matrix(~0+postcode), 10),
    refine(log1p(drug_count_2015_mat), 10),
    refine(log1p(drug_count_mat), 10),
    refine(prescriber_count_mat, 300) > 0,
    norm_rows(store_count_mat)
))

decomp <- irlba(lin_model, nv=20, center=colMeans(lin_model))
aug <- Matrix(scale_cols(decomp$u), sparse=TRUE)
lin_model <- cbind(aug, lin_model)

train_model <- model[train_ids,]
train_lin_model <- lin_model[train_ids,]

# Balanced bootstrap routine. n must be >= 2.
# - always produces a probability for every patient
# - each patient has the same total weight
# - approximates bootstrap aggregation better and better as n increases
# Note: auc() is not defined in this file; it is assumed to be a weighted-AUC
# helper with signature auc(y, prob, w), such as the one included in glmnet.
balboot <- function(n, predictor, ...) {
    # Deterministic sample of Poisson(1) bootstrap resampling counts,
    # permuted randomly across the n boots for each patient
    dist <- qpois((seq_len(n)-0.5)/n, lambda=1)
    print(dist)
    boots <- replicate(n_train, sample(dist))
    print(rowSums(boots))
    total <- rep(0, n_patients)
    total_n <- rep(0, n_patients)
    for(i in seq_len(n)) {
        cat("Boot",i,"of",n,"\n")
        p <- predictor(boots[i,], ...)
        # Average only out-of-bag predictions for training patients;
        # test patients are always included
        mask <- c(boots[i,] == 0, rep(TRUE, length(test_ids)))
        total <- total + mask * p
        total_n <- total_n + mask
    }
    p <- total / total_n
    cat("Result:", auc(y, p[train_ids]), "\n")
    p
}
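# A sketch of what the balanced bootstrap weights look like (illustrative
# only; assumes nothing beyond base R). For n = 60, dist holds these
# resampling counts, and each patient receives a random permutation of them
# across the 60 boots, so every patient's total weight is exactly sum(dist):
#
#   table(qpois((seq_len(60) - 0.5) / 60, lambda = 1))
#   ##  0  1  2  3  4
#   ## 22 22 11  4  1
#
# Each patient therefore sits out (weight 0) of 22 of the 60 boots, and those
# boots supply its out-of-bag prediction in the final average.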
# xgboost evaluation metric: AUC over the zero-weight (out-of-bag) rows
eval_auc <- function(preds, dtrain) {
    w <- which(getinfo(dtrain, "weight") == 0)
    err <- auc(y[w], preds[w])
    return(list(metric = "auc", value = err))
}

tree_predictor <- function(weights, lambda, alpha, initial_p=NULL, initial_p_weight=1.0) {
    xtrain <- xgb.DMatrix(train_model, label=y)
    setinfo(xtrain, "weight", weights)
    # Optionally start from another model's predictions via the base margin
    if (!is.null(initial_p))
        setinfo(xtrain, "base_margin", qlogis(initial_p[train_ids]) * initial_p_weight)

    xfull <- xgb.DMatrix(model)
    if (!is.null(initial_p))
        setinfo(xfull, "base_margin", qlogis(initial_p) * initial_p_weight)

    nrounds <- 10000
    param <- list(eta=0.1, lambda=lambda, alpha=alpha, max_depth=12)
    fit <- xgboost(
        data=xtrain, objective="binary:logistic", feval=eval_auc,
        nrounds=nrounds, maximize=TRUE, early_stopping_rounds=20,
        params=param)
    # Early stopping monitors out-of-bag AUC (see eval_auc); use the best round
    n_trees <- which.max(fit$evaluation_log$train_auc)
    cat(n_trees, fit$evaluation_log$train_auc[n_trees], "\n")
    predict(fit, xfull, ntreelimit=n_trees)
}

set.seed(1001)
p_tree <- balboot(60, tree_predictor, lambda=50, alpha=10)

set.seed(1009)
p_tree_2 <- balboot(60, tree_predictor, lambda=0, alpha=40)

# Elastic-net logistic regression, with lambda tuned on the out-of-bag
# (zero-weight) part of `relevant`
glmnet_predictor <- function(weights, alpha=0.01, max_lambda=1.0, step_lambda=0.01, relevant=train_ids) {
    gfit <- glmnet(train_lin_model, y, weights=weights, family="binomial", alpha=alpha)
    tune <- relevant[which(weights[relevant] == 0)]
    f <- function(s) {
        cat(s, "")
        p <- predict(gfit, lin_model[tune,,drop=FALSE], s=s, type="response") %>% as.vector
        auc(y[tune], p)
    }
    lambdas <- seq(0, max_lambda, by=step_lambda)
    scores <- map_dbl(lambdas, f)
    best <- which.max(scores)
    lambda <- lambdas[best]
    cat("\nlambda", lambda, scores[best], "\n")
    predict(gfit, lin_model, s=lambda, type="response") %>% as.vector
}

set.seed(1003)
p_glmnet <- balboot(60, glmnet_predictor, alpha=0.01)

set.seed(1005)
p_glmnet_1 <- balboot(60, glmnet_predictor, alpha=0.0)

# Fit a separate model for each level of fac (e.g. already diabetic or not),
# stitching the per-level predictions back together
make_split_predictor <- function(fac, func, ...) {
    lev <- unique(fac)
    fac <- match(fac, lev)
    n <- length(lev)
    function(weights) {
        p <- rep(NA, length(fac))
        for(i in seq_len(n)) {
            cat(lev[i], "")
            this_weights <- weights
            this_weights[fac[train_ids] != i] <- 0
            this_p <- func(this_weights, relevant=which(fac[train_ids] == i), ...)
            p[fac == i] <- this_p[fac == i]
        }
        p
    }
}

set.seed(1006)
p_split_glmnet <- balboot(60,
    make_split_predictor(already_diabetic, glmnet_predictor,
        max_lambda=0.5, step_lambda=0.01))

set.seed(1007)
p_split_glmnet_1 <- balboot(60,
    make_split_predictor(already_diabetic, glmnet_predictor,
        alpha=0.0, max_lambda=5.0, step_lambda=0.1))

# Run XGBoost, but initialized with predictions from glmnet
set.seed(1010)
p_tree_3 <- balboot(60, tree_predictor, lambda=0, alpha=50,
    initial_p=p_glmnet, initial_p_weight=0.5)

# Blend the pooled predictions using convex weights chosen to maximize
# weighted AUC. optim() (Nelder-Mead by default) searches over the first
# n-1 weights; the last weight is 1 minus their sum, and points outside
# the simplex score zero.
pool_blend_predictor <- function(weights) {
    n <- ncol(pool)
    blend <- function(x) {
        x <- c(x, 1-sum(x))
        colSums(t(pool)*x)
    }
    f <- function(x) {
        if (any(x < 0) || sum(x) > 1) return(0.0)
        -auc(y, blend(x)[train_ids], weights)
    }
    result <- optim(rep(1/n, n-1), f)
    cat(-result$value, "/", result$par, 1-sum(result$par), "\n")
    blend(result$par)
}

set.seed(1004)
pool <- cbind(p_glmnet, p_glmnet_1, p_split_glmnet, p_split_glmnet_1,
              p_tree_3, p_tree_2, p_tree)
p_pool <- balboot(60, pool_blend_predictor)

data_frame(Patient_ID = patient_ids[test_ids], Diabetes = p_pool[test_ids]) %>%
    write_csv("prediction_without_stan.csv")

save.image("checkpoint.RData")
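# Optional sanity checks on the final output (an illustrative sketch, not part
# of the original pipeline; run after the script completes):
#
#   stopifnot(!any(is.na(p_pool)))              # balboot covers every patient
#   stopifnot(all(p_pool >= 0 & p_pool <= 1))   # a convex blend of probabilities
#   length(test_ids)                            # the CSV should have 279152 rows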