###
# code 6 - modellwahl
# 240514
###

if (!require("pacman")) install.packages("pacman"); library("pacman")
p_load(data.table)
p_load(ggplot2)
p_load(magrittr)

# 1 - Load data ----
# Read the input CSV (path is relative to the working directory).
daten <- fread("input/data6/Donner.csv")

# Inspect the first rows of the data.
head(daten)

# Scatter plot of `ueberlebt` against `alter`, coloured by `geschlecht`.
ggplot(
  data = daten,
  mapping = aes(x = alter, y = ueberlebt, color = geschlecht)
) +
  geom_point() +
  theme_minimal()

# Create additional variables: `familiengroesse` is the number of rows that
# share the same `familie` value (added by reference).
daten[, familiengroesse := .N, by = "familie"]

# Run the regression: binomial GLM with main effects, all pairwise
# interactions and squared terms for the two numeric-looking predictors.
regression <- glm(
  ueberlebt ~ alter + geschlecht + familiengroesse +
    alter:geschlecht + alter:familiengroesse + geschlecht:familiengroesse +
    I(alter^2) + I(familiengroesse^2),
  family = binomial,
  data = daten
)
summary(regression)

# Information criteria (AIC and BIC) of the fitted model.
AIC(regression)
BIC(regression)

# 2 - Model selection ----

# Fit several candidate models of increasing complexity to the same data:
#   mod1 - main effects only
#   mod2 - main effects + all pairwise interactions
#   mod3 - mod2 + squared terms for `alter` and `familiengroesse`
models <- list(
  mod1 = glm(
    ueberlebt ~ alter + geschlecht + familiengroesse,
    family = binomial,
    data = daten
  ),
  mod2 = glm(
    ueberlebt ~ alter + geschlecht + familiengroesse +
      alter:geschlecht + alter:familiengroesse + geschlecht:familiengroesse,
    family = binomial,
    data = daten
  ),
  mod3 = glm(
    ueberlebt ~ alter + geschlecht + familiengroesse +
      alter:geschlecht + alter:familiengroesse + geschlecht:familiengroesse +
      I(alter^2) + I(familiengroesse^2),
    family = binomial,
    data = daten
  )
)

# AIC and BIC of every candidate, as named numeric vectors.
AICs <- vapply(models, AIC, numeric(1))
BICs <- vapply(models, BIC, numeric(1))

# Plot: reshape to long format (one row per model/criterion pair) first.
data <- melt(
  data.table(Model = names(models), AIC = AICs, BIC = BICs),
  id.vars = "Model",
  variable.name = "Criterion",
  value.name = "Value"
)

# Side-by-side bars: one bar per criterion within each model.
ggplot(data, aes(x = Model, y = Value, fill = Criterion)) +
  geom_col(position = "dodge") +
  labs(title = "AIC and BIC for Different Models", x = "Model", y = "Value") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))


# 3 - Simulation ----

# Funktionen
# True regression function of the simulation: f(w) = 1 + 2w - 2w^2.
#
# w: numeric vector of evaluation points.
# Returns a numeric vector of the same length as `w`.
f <- function(w) {
  1 + 2 * w - 2 * w^2
}

# Integrated squared error between the true function `f` and a fitted curve,
# i.e. the integral of (f(z) - fmod(z))^2 over [0, 1].
#
# mod:  fitted model object; not used in the computation.
# fmod: vectorised function z -> fitted value at z.
# x:    design points; not used in the computation.
# Returns the numeric value of the integral (the `value` element of
# `integrate()`'s result).
calculate_diffs <- function(mod, fmod, x) {
  squared_gap <- function(z) {
    gap <- f(z) - fmod(z)
    gap^2
  }
  ise <- integrate(squared_gap, lower = 0, upper = 1)
  ise$value
}


# Set the seed so the simulation is reproducible.
set.seed(240514)

# Number of simulation runs. This has to be defined before `results`, because
# the results table below is sized with it.
simulations <- 5000

# Initialise the results table (one row per run, one ISE column per model)
# and an empty plot canvas. The canvas is only used by the optional,
# commented-out curve overlay at the end of the loop body.
results <- data.table(simulation = seq_len(simulations),
                      diffs1 = NA_real_,
                      diffs2 = NA_real_,
                      diffs3 = NA_real_,
                      diffs4 = NA_real_)
plot <- ggplot() +
  theme_minimal()

# Draw the design points once; every simulation run reuses the same x.
x <- runif(100)

# Turn a fitted polynomial `lm` (intercept, x, x^2, ...) into a vectorised
# function z -> fitted value. The coefficients are copied when the closure is
# created, so it does not change if the model variable is overwritten later.
fitted_polynomial <- function(model) {
  beta <- as.numeric(coef(model))
  function(z) {
    value <- beta[1]
    for (k in seq_along(beta)[-1]) {
      value <- value + beta[k] * z^(k - 1)
    }
    value
  }
}

for (sim in seq_len(simulations)) {
  # New noisy response around the true curve 1 + 2x - 2x^2.
  y <- 1 + 2 * x - 2 * x^2 + rnorm(100)

  # Polynomial fits of degree 1 to 4.
  mod1 <- lm(y ~ x)
  mod2 <- lm(y ~ x + I(x^2))
  mod3 <- lm(y ~ x + I(x^2) + I(x^3))
  mod4 <- lm(y ~ x + I(x^2) + I(x^3) + I(x^4))

  fmod1 <- fitted_polynomial(mod1)
  fmod2 <- fitted_polynomial(mod2)
  fmod3 <- fitted_polynomial(mod3)
  fmod4 <- fitted_polynomial(mod4)

  # Store the integrated squared error of each fit. `set()` assigns by
  # reference without the per-call overhead of `[.data.table` inside a loop.
  set(results, i = sim, j = "diffs1", value = calculate_diffs(mod1, fmod1, x))
  set(results, i = sim, j = "diffs2", value = calculate_diffs(mod2, fmod2, x))
  set(results, i = sim, j = "diffs3", value = calculate_diffs(mod3, fmod3, x))
  set(results, i = sim, j = "diffs4", value = calculate_diffs(mod4, fmod4, x))

  # Optional: overlay the fitted curves of every run on `plot`.
  # data <- data.table(x = rep(seq(-0.1, 1.1, length.out = 100), 5),
  #                    y = c(1 + 2 * seq(-0.1, 1.1, length.out = 100) - 1.9 * seq(-0.1, 1.1, length.out = 100)^2,
  #                          predict(mod1, newdata = data.table(x = seq(-0.1, 1.1, length.out = 100))),
  #                          predict(mod2, newdata = data.table(x = seq(-0.1, 1.1, length.out = 100))),
  #                          predict(mod3, newdata = data.table(x = seq(-0.1, 1.1, length.out = 100))),
  #                          predict(mod4, newdata = data.table(x = seq(-0.1, 1.1, length.out = 100)))),
  #                    model = rep(c("True Model", "Model 1", "Model 2", "Model 3", "Model 4"), each = 100))
  #
  # plot = plot +
  #   geom_line(data = data, aes(x, y, color = model))
}

# Histogram of the integrated squared errors (ISE).
# Reshape to long format: one row per simulation run and model.
histogram_data <- melt(results, id.vars = "simulation", variable.name = "Model", value.name = "ISE")
histogram_data[, Mean_ISE := mean(ISE), by = Model]
# Readable facet labels: "diffs1" -> "Model 1", ..., "diffs4" -> "Model 4".
histogram_data[, Model_Name := sub("^diffs", "Model ", as.character(Model))]

# One row per model for the mean marker, so each facet draws a single
# vertical line instead of one overplotted line per simulation run.
mean_ise <- unique(histogram_data[, .(Model_Name, Mean_ISE)])

ise_plot <- ggplot(histogram_data, aes(x = ISE)) +
  geom_histogram(binwidth = 0.01, fill = "blue", color = "black", alpha = 0.7) +
  facet_wrap(~ Model_Name, scales = "free_y") +
  theme_minimal() +
  # `linewidth` is the line-thickness argument since ggplot2 3.4.0; `size`
  # is deprecated for lines there.
  geom_vline(data = mean_ise, aes(xintercept = Mean_ISE),
             color = "red", linetype = "dashed", linewidth = 1) +
  labs(title = "ISEs for Different Models", x = "ISE", y = "Frequency") +
  theme(plot.title = element_text(hjust = 0.5))
ise_plot

# Make sure the target directory exists, then save exactly this plot rather
# than whatever `last_plot()` happens to return.
dir.create("output", showWarnings = FALSE, recursive = TRUE)
ggsave("output/plot_simulation.png", plot = ise_plot, width = 20, height = 15, dpi = 300, units = "cm")


# Example: plot the fitted curves for one additional simulated sample.
y <- 1 + 2 * x - 2 * x^2 + rnorm(100)
mod1 <- lm(y ~ x)
mod2 <- lm(y ~ x + I(x^2))
mod3 <- lm(y ~ x + I(x^2) + I(x^3))
mod4 <- lm(y ~ x + I(x^2) + I(x^3) + I(x^4))

# Evaluation grid, extending slightly beyond [0, 1]; built once and reused
# for the true curve and for every model's predictions.
x_grid <- seq(-0.1, 1.1, length.out = 100)
new_x <- data.table(x = x_grid)

# Long table: the true curve followed by the four fitted curves.
# The true curve comes from `f`, so it matches the data-generating process
# above (coefficient -2 on the squared term).
data <- data.table(
  x = rep(x_grid, 5),
  y = c(f(x_grid),
        predict(mod1, newdata = new_x),
        predict(mod2, newdata = new_x),
        predict(mod3, newdata = new_x),
        predict(mod4, newdata = new_x)),
  model = rep(c("True Model", "Model 1", "Model 2", "Model 3", "Model 4"), each = 100)
)

ggplot(data, aes(x, y, color = model)) +
  geom_line() +
  theme_minimal()