# replicate_xom.R --- R replication of the ExxonMobil event-study results
# published in Goodwin (April 2026), "Read the Fine Print: What ExxonMobil's
# Proxy Actually Says About Texas Redomiciliation."
#
# Mirrors the Python version (replicate_xom.py). Five specifications:
#   1. Synthetic control (21-donor energy peer pool, non-negative simplex
#      weights summing to 1; SLSQP via quadprog or constrOptim)
#   2. Market model (XOM ~ SPY, 240-day pre-window)
#   3. Oil-augmented (XOM ~ SPY + BNO or XLE)
#   4. Matched-pair vs CVX (market-model-adjusted)
#   5. Raw differential vs CVX
#
# Inputs:  daily_closes.csv
# Output:  output_R.json
#
# Run:
#   install.packages(c("readr", "dplyr", "jsonlite", "lubridate", "quadprog"))
#   Rscript replicate_xom.R
#
# Compare output_R.json against expected_results.json --- same +/- 0.5pp tolerance
# rules as the Python version (see README.md).

suppressPackageStartupMessages({
  library(readr)
  library(dplyr)
  library(jsonlite)
  library(lubridate)
  library(quadprog)
})

# --- Configuration ---
EVENT_DATE             <- as.Date("2026-03-10")
ESTIMATION_WINDOW_DAYS <- 240
DONORS <- c("CVX","COP","EOG","SLB","OXY","MPC","VLO","PSX","DVN","FANG",
            "HAL","BKR","CTRA","OVV","APA","EQT","WMB","OKE","TRGP","LNG","ET")

# --- 1. Load prices, compute simple returns ---
closes <- read_csv("daily_closes.csv", show_col_types = FALSE)
closes$Date <- as.Date(closes$Date)
closes <- closes[order(closes$Date), ]
# Drop columns that are entirely NA
keep_cols <- c("Date", names(closes)[-1][sapply(closes[, -1], function(x) sum(!is.na(x)) > 50)])
closes <- closes[, keep_cols]

# Simple returns
returns <- closes
for (col in setdiff(names(closes), "Date")) {
  returns[[col]] <- c(NA, diff(closes[[col]]) / head(closes[[col]], -1))
}
returns <- returns[!is.na(returns$XOM), ]
rownames(returns) <- NULL

# Locate event index (or nearest trading day)
event_idx <- which(returns$Date >= EVENT_DATE)[1]
if (is.na(event_idx)) {
  stop("Event date past end of price file")
}
cat(sprintf("Event date: %s (row %d)\n", returns$Date[event_idx], event_idx))
cat(sprintf("Estimation window: %d trading days ending T-1\n\n", ESTIMATION_WINDOW_DAYS))

donors_avail <- intersect(DONORS, names(returns))
cat(sprintf("Donor pool: %d/%d firms available\n", length(donors_avail), length(DONORS)))

# Pre-event window
est <- returns[(event_idx - ESTIMATION_WINDOW_DAYS):(event_idx - 1), , drop = FALSE]

# --- 2. Synthetic control (quadratic programming with simplex constraint) ---
# Min  || y - Xw ||^2  s.t.  w >= 0, sum(w) = 1
# Solve via solve.QP, which expects:  min -d'b + 1/2 b'Db   s.t. A'b >= b0
fit_synth <- function(y_pre, X_pre) {
  n     <- ncol(X_pre)
  Dmat  <- 2 * crossprod(X_pre)           # 2 X'X
  dvec  <- 2 * as.numeric(crossprod(X_pre, y_pre))  # 2 X'y
  # equality: sum(w) = 1   (first constraint, meq = 1)
  # inequalities: w >= 0
  Amat  <- cbind(rep(1, n), diag(n))
  bvec  <- c(1, rep(0, n))
  # Regularize Dmat slightly for numerical stability
  Dmat  <- Dmat + diag(1e-8, n)
  sol   <- solve.QP(Dmat, dvec, Amat, bvec, meq = 1)
  w     <- sol$solution
  w[w < 0] <- 0
  w / sum(w)
}

y_pre <- est$XOM
X_pre <- as.matrix(est[, donors_avail, drop = FALSE])
w     <- fit_synth(y_pre, X_pre)
day0_gap <- returns$XOM[event_idx] -
            sum(as.numeric(returns[event_idx, donors_avail]) * w)
pre_rmspe <- sqrt(mean((y_pre - X_pre %*% w)^2))
top_w <- sort(setNames(w, donors_avail), decreasing = TRUE)[1:5]

cat("\nSynthetic control (21 donors):\n")
cat(sprintf("  Day-0 gap = %+.4f%%  pre-RMSPE = %.4f%%\n",
            day0_gap * 100, pre_rmspe * 100))
cat("  Top weights:", paste(names(top_w), sprintf("%.1f%%", top_w * 100),
                            sep = "=", collapse = " "), "\n")

# --- 3. Market model (OLS) ---
mm <- lm(XOM ~ SPY, data = est)
alpha <- coef(mm)[1]; beta <- coef(mm)[2]
sigma <- summary(mm)$sigma
r2    <- summary(mm)$r.squared
day0_ar_mm <- returns$XOM[event_idx] -
              (alpha + beta * returns$SPY[event_idx])
# Patell-z
spy_event <- returns$SPY[event_idx]
spy_mean  <- mean(est$SPY)
spy_var   <- var(est$SPY)
n_est     <- nrow(est)
s_ar      <- sigma * sqrt(1 + 1 / n_est + (spy_event - spy_mean)^2 / (n_est * spy_var))
z_mm      <- day0_ar_mm / s_ar
p_mm      <- 2 * (1 - pnorm(abs(z_mm)))
cat(sprintf("\nMarket model (SPY):\n"))
cat(sprintf("  alpha=%.4f%%/day  beta=%.4f  R^2=%.3f\n",
            alpha * 100, beta, r2))
cat(sprintf("  Day-0 AR = %+.4f%%  Patell p = %.4f\n",
            day0_ar_mm * 100, p_mm))

# --- 4. Oil-augmented (SPY + BNO or XLE) ---
oil_proxy <- if ("BNO" %in% names(returns)) "BNO" else "XLE"
oa <- lm(as.formula(sprintf("XOM ~ SPY + %s", oil_proxy)), data = est)
oa_event_x <- as.numeric(c(1, returns$SPY[event_idx], returns[event_idx, oil_proxy]))
day0_ar_oa <- returns$XOM[event_idx] - sum(coef(oa) * oa_event_x)
sigma_oa <- summary(oa)$sigma
z_oa <- day0_ar_oa / sigma_oa
p_oa <- 2 * (1 - pnorm(abs(z_oa)))
cat(sprintf("\nOil-augmented market model (SPY + %s):\n", oil_proxy))
cat(sprintf("  alpha=%.4f%%  beta_mkt=%.3f  beta_oil=%.3f  R^2=%.3f\n",
            coef(oa)[1] * 100, coef(oa)[2], coef(oa)[3], summary(oa)$r.squared))
cat(sprintf("  Day-0 AR = %+.4f%%  Patell p = %.4f\n",
            day0_ar_oa * 100, p_oa))

# --- 5. Matched-pair vs CVX (market-model-adjusted) ---
mm_xom <- lm(XOM ~ SPY, data = est)
mm_cvx <- lm(CVX ~ SPY, data = est)
ar_xom <- returns$XOM[event_idx] - (coef(mm_xom)[1] + coef(mm_xom)[2] * returns$SPY[event_idx])
ar_cvx <- returns$CVX[event_idx] - (coef(mm_cvx)[1] + coef(mm_cvx)[2] * returns$SPY[event_idx])
delta  <- ar_xom - ar_cvx
se_delta <- sqrt(summary(mm_xom)$sigma^2 + summary(mm_cvx)$sigma^2)
t_mp <- delta / se_delta
p_mp <- 2 * (1 - pnorm(abs(t_mp)))
cat("\nMatched pair vs CVX (market-model-adjusted):\n")
cat(sprintf("  AR(XOM)=%+.4f%%  AR(CVX)=%+.4f%%  Delta=%+.4f%%  t=%.3f  p=%.4f\n",
            ar_xom * 100, ar_cvx * 100, delta * 100, t_mp, p_mp))

# --- 6. Raw differential ---
raw_diff <- returns$XOM[event_idx] - returns$CVX[event_idx]
cat(sprintf("\nRaw differential vs CVX:\n  raw_diff = %+.4f%%\n", raw_diff * 100))

# --- Output JSON ---
out <- list(
  event_date            = as.character(returns$Date[event_idx]),
  estimation_window_days = ESTIMATION_WINDOW_DAYS,
  n_donors              = length(donors_avail),
  synthetic_control = list(
    day0_ar_pct    = round(day0_gap * 100, 4),
    pre_rmspe_pct  = round(pre_rmspe * 100, 4),
    top_weights    = as.list(round(top_w, 4))
  ),
  market_model = list(
    day0_ar_pct = round(day0_ar_mm * 100, 4),
    patell_p    = round(p_mm, 4),
    alpha       = round(alpha, 6),
    beta        = round(beta, 4),
    r2          = round(r2, 4)
  ),
  oil_augmented = list(
    day0_ar_pct     = round(day0_ar_oa * 100, 4),
    patell_p        = round(p_oa, 4),
    beta_mkt        = round(coef(oa)[2], 4),
    beta_oil        = round(coef(oa)[3], 4),
    oil_proxy_used  = oil_proxy
  ),
  matched_pair = list(
    delta_day0_pct = round(delta * 100, 4),
    p_two_sided    = round(p_mp, 4)
  ),
  raw_differential = list(
    raw_diff_pct = round(raw_diff * 100, 4)
  )
)
writeLines(toJSON(out, pretty = TRUE, auto_unbox = TRUE), "output_R.json")
cat("\nWrote: output_R.json\n")
cat("Next: diff output_R.json against expected_results.json\n")
cat("PASS criteria: every point estimate within +/- 0.5pp, every p-value within +/- 0.05\n")
