## ----setup, echo=FALSE, include=TRUE, eval=TRUE------------------------------- knitr::opts_chunk$set( echo = TRUE, eval = TRUE, cache = TRUE, tidy = FALSE, warning = FALSE, error = FALSE ) library(rlist) library(data.table) library(foreach) ## ----echo=FALSE--------------------------------------------------------------- # The good old original R code to run SIM that was used before cppSim ### this file contains the functions used to run the gravity model cost_function <- function(d, beta, type = "exp") { # d is the od distance matrix, taking the exponential # means doing this operation for each individual element # with an exponent beta # the type parameter allows to set it to either exp or pow. # pow means we use a power function as cost, rather than exponential if (type == "exp") { exp(-beta * d) } else if (type == "pow") { d^(-beta) } else { print("provide a type of functino to compute") } } r_2 <- function(d, f) { cor( d |> as.numeric(), f |> as.numeric() )^2 } r <- function(d, f) { cor( d |> as.numeric(), f |> as.numeric() ) } rmse <- function(d, f) sum((d - f)^2) calibration <- function(cost_fun, O, D, delta = 0.05) { B <- rep_len(1, nrow(cost_fun)) eps <- abs(sum(B)) e <- NULL i <- 0 while ((eps > delta) & (i < 50)) { A_new <- 1 / (apply(cost_fun, function(x) sum(B * D * x), MARGIN = 1)) B_new <- 1 / (apply(cost_fun, function(x) sum(A_new * O * x), MARGIN = 2)) eps <- abs(sum(B_new - B)) e <- append(e, eps) A <- A_new B <- B_new i <- i + 1 } list( "A" = A, "B" = B, "e" = e ) } run_model <- function( flows, distance, beta = 0.25, type = "exp" # ,cores = 3 ) { F_c <- cost_function(d = {{ distance }}, beta = {{ beta }}, type = type) print("cost function computed") O <- apply(flows, sum, MARGIN = 1) |> as.integer() D <- apply(flows, sum, MARGIN = 2) |> as.integer() A_B <- calibration( cost_fun = F_c, O = O, D = D, delta = .001 ) print("calibration: over") A <- A_B$A B <- A_B$B flows_model <- foreach( j = c(1:nrow(F_c)), .combine = rbind ) %do% { round(A[j] * B * O[j] * D * F_c[j, ]) } print("model run: over") e_sor <- e_sorensen(flows, flows_model) |> as.numeric() print(paste0("E_sor = ", e_sor)) r2 <- r_2(flows_model, flows) |> as.numeric() print(paste0("r2 = ", r2)) RMSE <- rmse(flows_model, flows) |> as.numeric() print(paste0("RMSE = ", RMSE)) list( "values" = flows_model, "r2" = r2, "rmse" = RMSE, "calib" = A_B$e, "e_sor" = e_sor ) } ### Validation e_sorensen <- function(data, fit) { 2 * sum(apply(cbind( data |> c(), fit |> c() ), MARGIN = 1, FUN = min)) / (sum(data) + sum(fit)) } ## ----load_data, echo=FALSE---------------------------------------------------- library(cppSim) data("distance_test") data("flows_test") distance_test <- (distance_test / 1000 / 14) |> round() ## ----------------------------------------------------------------------------- # creating the O, D vectors. O <- apply(flows_test, sum, MARGIN = 2) |> c() D <- apply(flows_test, sum, MARGIN = 1) |> c() ## ----------------------------------------------------------------------------- F_c <- cost_function(distance_test,1,type = "exp") ## ----------------------------------------------------------------------------- beta_calib <- foreach::foreach(i = 28:33 ,.combine = rbind) %do% { beta <- 0.1*(i - 1) print(paste0("RUNNING MODEL FOR beta = ",beta)) run <- run_model(flows = flows_test ,distance = distance_test ,beta = beta ,type = "exp" ) cbind(beta, run$r2,run$rmse) } ## ----------------------------------------------------------------------------- plot(beta_calib[,1] ,beta_calib[,2] ,xlab = "beta value" ,ylab = "quality of fit, r" ,main = "influence of beta on the goodness of fit" ,pch = 19 ,cex = 0.5 ,type = "b") ## ----------------------------------------------------------------------------- beta_best_fit <- beta_calib[which(beta_calib[,2] == max(beta_calib[,2])),1] x <- seq_len(100)/20 plot(x ,exp(-beta_best_fit*x) ,main = "cost function" ,xlab = "distance, km" ,ylab = "decay factor" ,pch = 19 ,cex = 0.5 ,type = "l") ## ----------------------------------------------------------------------------- run_best_fit <- run_model(flows = flows_test ,distance = distance_test ,beta = beta_best_fit ,type = "exp" ) ## ----------------------------------------------------------------------------- plot(seq_along(run_best_fit$calib) ,run_best_fit$calib ,xlab = "iteration" ,ylab = "error" ,main = "calibration of balancing factors" ,pch = 19 ,cex = 0.5 ,type = "b" ) ## ----------------------------------------------------------------------------- plot(flows_test ,run_best_fit$values ,ylab = "flows model" ,xlab = "flows" ,log = "xy" ,pch = 19 ,cex = 0.5) lines(seq_len(max(run_best_fit$values)) ,seq_len(max(run_best_fit$values)) ,col = "darkred" ,lwd = 2) ## ----------------------------------------------------------------------------- ## MODEL USING THE GLM And POISSON DISTRIBUTION # flows_london <- rlist::list.load("flows_london.rds") data(flows_london) flows_london <- flows_london sample_od <- sample(unique(flows_london$workplace),100) flows_grav <- flows_london[(workplace %in% sample_od) & (residence %in% sample_od),] flows_grav[,O := sum(bike),by = from_id] flows_grav[,D := sum(bike), by = to_id] # model <- glm(bike ~ workplace+residence+distance -1 ,data = flows_grav ,family = poisson(link = "log") ) ((model$fitted.values - flows_grav$bike)) |> hist(breaks = 100) r2 <- r_2(flows_grav$bike,model$fitted.values) r2 ## ----------------------------------------------------------------------------- print("cost function:") cost_function print("calibration function:") calibration print("model run") run_model