"""
replicate_xom.py - Self-contained replication of the ExxonMobil event-study
results published in Goodwin (April 2026), "Read the Fine Print: What
ExxonMobil's Proxy Actually Says About Texas Redomiciliation."

Inputs:
  daily_closes.csv         - 713 daily closes for XOM + 21 energy donors + SPY
                             (columns: Date, XOM, CVX, COP, EOG, SLB, OXY, MPC,
                              VLO, PSX, DVN, FANG, HAL, BKR, CTRA, OVV, APA,
                              EQT, WMB, OKE, TRGP, LNG, ET, SPY, ...)

Output:
  output.json              - reviewer's replicated estimates

Compare output.json against expected_results.json. PASS if every point estimate
is within +/- 0.5 percentage points and every p-value is within +/- 0.05.

Methodology (formal version in methodology.md, Section 7):
  - Synthetic control: 21-donor energy peer pool, SLSQP optimizer, w_j >= 0,
    sum(w_j) = 1; minimize pre-event RMSPE on 240-day window ending T-1
  - Market model (SPY benchmark): OLS regression on 240-day pre-window,
    Patell-z standardized abnormal-return test
  - Oil-augmented market model: SPY + WTI two-factor; in this kit, since the
    daily_closes.csv lacks WTI, we use BNO (Brent ETF) as a proxy
  - Matched-pair vs CVX: simple difference-of-returns test
  - Raw differential vs CVX: just the difference (no inference)

Run:
    pip install pandas numpy scipy
    python replicate_xom.py
"""
import json
import numpy as np
import pandas as pd
from scipy import stats
from scipy.optimize import minimize


# Configuration
EVENT_DATE = "2026-03-10"
ESTIMATION_WINDOW_DAYS = 240
DONORS = ["CVX", "COP", "EOG", "SLB", "OXY", "MPC", "VLO", "PSX",
          "DVN", "FANG", "HAL", "BKR", "CTRA", "OVV", "APA",
          "EQT", "WMB", "OKE", "TRGP", "LNG", "ET"]


def fit_synth(y_pre, X_pre):
    """SLSQP-fit donor weights to minimize pre-event RMSPE.
    Constraints: w_j >= 0, sum(w_j) = 1."""
    n = X_pre.shape[1]
    obj = lambda w: float(np.sum((y_pre - X_pre @ w) ** 2))
    cons = ({"type": "eq", "fun": lambda w: w.sum() - 1.0},)
    bnds = [(0.0, 1.0)] * n
    w0 = np.ones(n) / n
    res = minimize(obj, w0, method="SLSQP", bounds=bnds, constraints=cons,
                   options={"maxiter": 200, "ftol": 1e-9})
    return res.x


def synth_day0(returns, treated, donors, event_idx, win=ESTIMATION_WINDOW_DAYS):
    """Synthetic-control estimate of day-0 abnormal return."""
    est = returns.iloc[event_idx - win:event_idx]
    y_pre = est[treated].values
    X_pre = est[donors].values
    w = fit_synth(y_pre, X_pre)
    day0 = returns.iloc[event_idx][treated] - returns.iloc[event_idx][donors].values @ w
    pre_rmspe = float(np.sqrt(np.mean((y_pre - X_pre @ w) ** 2)))
    return float(day0), w, pre_rmspe


def market_model(returns, treated, benchmark, event_idx, win=ESTIMATION_WINDOW_DAYS):
    """Plain market model: AR_t = R_treated_t - (alpha + beta * R_bench_t)."""
    est = returns.iloc[event_idx - win:event_idx]
    y = est[treated].values
    x = est[benchmark].values
    # OLS with intercept
    X = np.column_stack([np.ones(len(x)), x])
    beta_hat, *_ = np.linalg.lstsq(X, y, rcond=None)
    alpha, beta = beta_hat
    r2 = 1 - np.sum((y - X @ beta_hat) ** 2) / np.sum((y - y.mean()) ** 2)
    sigma = float(np.std(y - X @ beta_hat, ddof=2))
    bench_event = returns.iloc[event_idx][benchmark]
    treated_event = returns.iloc[event_idx][treated]
    expected = alpha + beta * bench_event
    ar = float(treated_event - expected)
    # Patell-z (standardized AR)
    n = len(y)
    bench_mean = est[benchmark].mean()
    bench_var = est[benchmark].var(ddof=1)
    s_ar = sigma * np.sqrt(1 + 1.0 / n + (bench_event - bench_mean) ** 2 / (n * bench_var))
    z = ar / float(s_ar)
    p = 2 * (1 - stats.norm.cdf(abs(z)))
    return {"alpha": float(alpha), "beta": float(beta), "r2": float(r2),
            "sigma": sigma, "day0_ar": ar, "patell_z": float(z),
            "patell_p": float(p)}


def oil_augmented(returns, treated, mkt, oil, event_idx, win=ESTIMATION_WINDOW_DAYS):
    """Two-factor market model: SPY + oil benchmark."""
    est = returns.iloc[event_idx - win:event_idx]
    y = est[treated].values
    X = np.column_stack([np.ones(len(y)), est[mkt].values, est[oil].values])
    beta_hat, *_ = np.linalg.lstsq(X, y, rcond=None)
    alpha, b_mkt, b_oil = beta_hat
    sigma = float(np.std(y - X @ beta_hat, ddof=3))
    r2 = 1 - np.sum((y - X @ beta_hat) ** 2) / np.sum((y - y.mean()) ** 2)
    expected = alpha + b_mkt * returns.iloc[event_idx][mkt] + b_oil * returns.iloc[event_idx][oil]
    ar = float(returns.iloc[event_idx][treated] - expected)
    z = ar / sigma
    p = 2 * (1 - stats.norm.cdf(abs(z)))
    return {"alpha": float(alpha), "beta_mkt": float(b_mkt), "beta_oil": float(b_oil),
            "r2": float(r2), "sigma": sigma, "day0_ar": ar, "patell_z": float(z),
            "patell_p": float(p)}


def matched_pair_mm(returns, treated, peer, benchmark, event_idx, win=ESTIMATION_WINDOW_DAYS):
    """Market-model-adjusted matched-pair test.

    Fit market model separately for treated firm and peer firm, compute each
    firm's day-0 abnormal return, then test whether the DIFFERENCE in ARs is
    statistically distinguishable from zero. This is the published
    'matched_pair_mm' specification (Goodwin paper Section 7.4).

    NOT a raw-return difference (which is what 'raw_differential' computes).
    """
    est = returns.iloc[event_idx - win:event_idx]
    # Fit market model for treated firm
    X = np.column_stack([np.ones(len(est)), est[benchmark].values])
    beta_t, *_ = np.linalg.lstsq(X, est[treated].values, rcond=None)
    sigma_t = float(np.std(est[treated].values - X @ beta_t, ddof=2))
    ar_t = float(returns.iloc[event_idx][treated]
                 - (beta_t[0] + beta_t[1] * returns.iloc[event_idx][benchmark]))
    # Fit market model for peer
    beta_p, *_ = np.linalg.lstsq(X, est[peer].values, rcond=None)
    sigma_p = float(np.std(est[peer].values - X @ beta_p, ddof=2))
    ar_p = float(returns.iloc[event_idx][peer]
                 - (beta_p[0] + beta_p[1] * returns.iloc[event_idx][benchmark]))
    # Difference of abnormal returns
    delta_event = ar_t - ar_p
    # SE of delta: independent-firm approximation (sqrt of sum of variances)
    se_delta = float(np.sqrt(sigma_t**2 + sigma_p**2))
    t = delta_event / se_delta if se_delta > 0 else 0.0
    p = 2 * (1 - stats.norm.cdf(abs(t)))
    return {"ar_treated_day0": ar_t, "ar_peer_day0": ar_p,
            "delta_day0": delta_event, "se_delta": se_delta,
            "t": float(t), "p": float(p)}


def raw_differential(returns, treated, peer, event_idx):
    """Just the difference, no model."""
    return {"raw_diff": float(returns.iloc[event_idx][treated] -
                              returns.iloc[event_idx][peer])}


def main():
    print("=" * 70)
    print("XOM event-study replication")
    print("=" * 70)

    # Load daily closes, compute log/simple returns
    closes = pd.read_csv("daily_closes.csv", parse_dates=["Date"]).set_index("Date").sort_index()
    closes = closes.dropna(axis=1, how="all")
    returns = closes.pct_change().dropna(how="all")
    event = pd.Timestamp(EVENT_DATE)
    if event not in returns.index:
        # Find the closest trading day
        event = returns.index[returns.index.get_indexer([event], method="nearest")[0]]
    event_idx = returns.index.get_loc(event)
    print(f"Event date: {event.date()}  (index = {event_idx})")
    print(f"Estimation window: {ESTIMATION_WINDOW_DAYS} days ending T-1")
    print()

    # Filter donors that exist in the data
    donors = [d for d in DONORS if d in returns.columns]
    print(f"Donor pool: {len(donors)}/{len(DONORS)} firms available")

    # 1. Synthetic control
    sc_day0, sc_w, sc_rmspe = synth_day0(returns, "XOM", donors, event_idx)
    top_w = sorted(zip(donors, sc_w), key=lambda x: -x[1])[:5]
    print(f"\nSynthetic control (21 donors):")
    print(f"  Day-0 gap = {sc_day0*100:+.4f}%  pre-RMSPE = {sc_rmspe*100:.4f}%")
    print(f"  Top weights: {[(t, f'{w*100:.1f}%') for t, w in top_w]}")

    # 2. Market model (SPY)
    mm = market_model(returns, "XOM", "SPY", event_idx)
    print(f"\nMarket model (SPY):")
    print(f"  alpha={mm['alpha']*100:.4f}%/day  beta={mm['beta']:.4f}  R^2={mm['r2']:.3f}")
    print(f"  Day-0 AR = {mm['day0_ar']*100:+.4f}%  Patell p = {mm['patell_p']:.4f}")

    # 3. Oil-augmented (SPY + BNO as oil proxy)
    oil_proxy = "BNO" if "BNO" in returns.columns else "XLE"
    oa = oil_augmented(returns, "XOM", "SPY", oil_proxy, event_idx)
    print(f"\nOil-augmented market model (SPY + {oil_proxy}):")
    print(f"  alpha={oa['alpha']*100:.4f}  beta_mkt={oa['beta_mkt']:.3f}  beta_oil={oa['beta_oil']:.3f}  R^2={oa['r2']:.3f}")
    print(f"  Day-0 AR = {oa['day0_ar']*100:+.4f}%  Patell p = {oa['patell_p']:.4f}")

    # 4. Matched pair vs CVX (market-model-adjusted)
    mp = matched_pair_mm(returns, "XOM", "CVX", "SPY", event_idx)
    print(f"\nMatched pair vs CVX (market-model-adjusted, mm):")
    print(f"  AR(XOM day-0) = {mp['ar_treated_day0']*100:+.4f}%  AR(CVX day-0) = {mp['ar_peer_day0']*100:+.4f}%")
    print(f"  Delta = AR(XOM) - AR(CVX) = {mp['delta_day0']*100:+.4f}%  t = {mp['t']:.3f}  p = {mp['p']:.4f}")

    # 5. Raw differential
    rd = raw_differential(returns, "XOM", "CVX", event_idx)
    print(f"\nRaw differential vs CVX:")
    print(f"  raw_diff = {rd['raw_diff']*100:+.4f}%")

    # Output JSON
    output = {
        "event_date": str(event.date()),
        "estimation_window_days": ESTIMATION_WINDOW_DAYS,
        "n_donors": len(donors),
        "synthetic_control": {
            "day0_ar_pct":   round(sc_day0 * 100, 4),
            "pre_rmspe_pct": round(sc_rmspe * 100, 4),
            "top_weights":   {t: round(float(w), 4) for t, w in top_w},
        },
        "market_model": {
            "day0_ar_pct": round(mm["day0_ar"] * 100, 4),
            "patell_p":    round(mm["patell_p"], 4),
            "alpha":       round(mm["alpha"], 6),
            "beta":        round(mm["beta"], 4),
            "r2":          round(mm["r2"], 4),
        },
        "oil_augmented": {
            "day0_ar_pct": round(oa["day0_ar"] * 100, 4),
            "patell_p":    round(oa["patell_p"], 4),
            "beta_mkt":    round(oa["beta_mkt"], 4),
            "beta_oil":    round(oa["beta_oil"], 4),
            "oil_proxy_used": oil_proxy,
        },
        "matched_pair": {
            "delta_day0_pct": round(mp["delta_day0"] * 100, 4),
            "p_two_sided":    round(mp["p"], 4),
        },
        "raw_differential": {
            "raw_diff_pct": round(rd["raw_diff"] * 100, 4),
        },
    }
    with open("output.json", "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)
    print(f"\nWrote: output.json")
    print(f"\nNext: diff output.json against expected_results.json")
    print(f"PASS criteria: every point estimate within +/- 0.5pp, every p-value within +/- 0.05")


if __name__ == "__main__":
    main()
