library(dplyr)
library(ggplot2)

# ---- Simulated data ----
# Fix the RNG state so every run draws exactly the same sample.
set.seed(123)

# Sample size.
n <- 1000

# Data-generating process. The three rnorm() draws must stay in this order,
# otherwise the seeded sample changes.
#   effort    = 5 + e1                 e1 ~ N(0, sd = 2)  drawn independently
#   education = 2 * effort + e2        e2 ~ N(0, sd = 2)  depends on effort
#   wage      = 10 + 2 * effort + e3   e3 ~ N(0, sd = 5)  depends on effort only
# Effort feeds into both education and wage, so effort is the confounder of
# the education-wage relationship. Education has no direct effect on wage.
effort <- rnorm(n, 0, 2) + 5                  # Effort in some units
education <- effort * 2 + rnorm(n, 0, 2)      # Education in years
wage <- (10 + effort * 2) + rnorm(n, 0, 5)    # Wage in dollars/hour

# Collect the three series as columns of one data frame.
data <- data.frame(education = education, effort = effort, wage = wage)
# Numeric summary of every column in the dataset. The explicit print() keeps
# the table visible when the script is run through source(), which does not
# auto-print top-level values by default.
print(summary(data))
# Pairwise scatterplots of all columns in the dataset.
pairs(data, main = "Scatterplot Matrix of Simulated Data")
# Naive model: regress wage on education alone. Effort, which drives both
# education and wage in the simulation, is left out, so the education
# coefficient picks up effort's influence (omitted-variable bias).
model_no_confounder <- lm(wage ~ education, data = data)
# Explicit print() so the summary also shows when the script is source()d.
print(summary(model_no_confounder))
# Adjusted model: add effort (the confounder) next to education. Wage is
# generated without an education term, so once effort is controlled for the
# education coefficient should be close to zero.
model_with_confounder <- lm(wage ~ education + effort, data = data)
print(summary(model_with_confounder))
# Create a data frame from this table