"""Cross-reference fresh findings against April baseline (master_worklist.csv)."""
import csv
import re
from collections import defaultdict, Counter
from urllib.parse import urlparse


def load_csv(path):
    """Read the CSV file at *path* and return its rows as a list of dicts.

    The first line of the file is used as the header; every following row
    becomes one dict keyed by those header names (``csv.DictReader``).

    The file is decoded as ``utf-8-sig`` so that a leading byte-order mark
    (written by some spreadsheet exports) is dropped instead of being glued
    onto the first column name; files without a BOM decode exactly as plain
    UTF-8.  ``newline=""`` hands line-ending handling to the csv module, as
    its documentation requires, so quoted fields containing line breaks are
    read back unchanged.

    Raises:
        OSError: if the file cannot be opened.
    """
    with open(path, encoding="utf-8-sig", newline="") as f:
        return list(csv.DictReader(f))


def page_path(url):
    """Return only the path of *url*, without a trailing slash.

    Accepts either an absolute URL (``http://``, ``https://`` or
    protocol-relative ``//host/...``; the scheme is matched
    case-insensitively) or a value that is already path-only.  In both cases
    the query string and fragment are discarded, so the same page yields the
    same key regardless of which form it was recorded in.

    An empty path (bare host, ``"/"`` or the empty string) is returned as
    ``"/"``.
    """
    if url.lower().startswith(("http://", "https://", "//")):
        path = urlparse(url).path
    else:
        # Path-only input: strip query/fragment by hand so it normalises the
        # same way the absolute-URL branch does.
        path = url.split("?", 1)[0].split("#", 1)[0]
    return path.rstrip("/") or "/"


def norm_href(h):
    """Normalise a link target for matching.

    Everything from the first ``?`` onward is dropped, then everything from
    the first ``#`` onward, then any trailing slashes.  If nothing is left
    the result is ``"/"``.
    """
    without_query, _, _ = h.partition("?")
    without_fragment, _, _ = without_query.partition("#")
    trimmed = without_fragment.rstrip("/")
    if trimmed:
        return trimmed
    return "/"


def _join_unique(values):
    """Return the distinct items of *values* as one comma-separated string.

    Items are emitted in sorted order so the result is identical from run to
    run; joining a bare ``set`` would follow hash order, which varies between
    interpreter invocations for strings.
    """
    return ",".join(sorted(set(values)))


def _save_rows(rows, path):
    """Write *rows* (a list of dicts) to *path* as CSV.

    The column order is taken from the keys of the first row.  When *rows* is
    empty the file is still created, containing the single placeholder line
    ``(empty)``.
    """
    if not rows:
        with open(path, "w", encoding="utf-8") as f:
            f.write("(empty)\n")
        return
    with open(path, "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
        writer.writeheader()
        writer.writerows(rows)


def main():
    """Compare the May 8 findings with the April baseline and report.

    Reads ``../master_worklist.csv`` (April baseline) plus the T016 and T063
    findings CSVs, all relative to the current working directory.  Each fresh
    finding is labelled "carry-over" when its normalised (page path, href)
    pair also occurs in the baseline, otherwise "new".  Summary counts and a
    final scorecard are printed to stdout, and seven annotated CSV files are
    written to the current working directory.
    """
    master = load_csv("../master_worklist.csv")
    t016 = load_csv("t016_findings_2026_05_08.csv")
    t063 = load_csv("t063_findings_2026_05_08.csv")

    # Index April baseline by (page_path, link_href). Page_url in master may be path-only.
    april_index = defaultdict(list)
    for r in master:
        april_index[(page_path(r["page_url"]), norm_href(r["current_href"]))].append(r)

    print(f"April master_worklist rows: {len(master)}")
    print(f"April unique (page,href) keys: {len(april_index)}")
    print()

    # April status by task
    print("April status breakdown:")
    by_status = Counter((r["task"], r["execution_status"]) for r in master)
    for k, v in sorted(by_status.items()):
        print(f"  {k}: {v}")
    print()

    def classify(page, href):
        """Return (verdict, April statuses, April tasks) for one finding."""
        # .get() never inserts into the defaultdict, so misses do not
        # inflate the index.
        matches = april_index.get((page_path(page), norm_href(href)))
        if not matches:
            return "new", [], []
        statuses = [m["execution_status"] for m in matches]
        tasks = [m["task"] for m in matches]
        return "carry-over", statuses, tasks

    # T016 classification
    t016_main = [r for r in t016 if r["in_main"] == "1" and r["in_chrome"] == "0"]
    t016_chrome = [r for r in t016 if r["in_chrome"] == "1"]

    t016_carry_main = []
    t016_new_main = []
    for r in t016_main:
        verdict, statuses, tasks = classify(r["page_url"], r["link_href"])
        if verdict == "carry-over":
            t016_carry_main.append(
                {
                    **r,
                    "april_status": _join_unique(statuses),
                    "april_task": _join_unique(tasks),
                }
            )
        else:
            # New T016 rows are kept unannotated.
            t016_new_main.append(r)

    print("=== T016 (link-via-redirect) ===")
    print(f"  Total findings (in_main): {len(t016_main)}")
    print(f"    Carried over from April baseline: {len(t016_carry_main)}")
    print(f"    New since April: {len(t016_new_main)}")
    print(f"  Total findings (in_chrome / theme-level): {len(t016_chrome)}")
    print()

    # Status of carried-over
    if t016_carry_main:
        carry_status = Counter(r["april_status"] for r in t016_carry_main)
        print(f"  Carried-over T016 by April status: {dict(carry_status)}")
    print()

    # T063 classification
    t063_main = [r for r in t063 if r["in_main"] == "1"]
    t063_main_fixable = [r for r in t063_main if r["fixable"] == "1"]
    t063_main_broken = [r for r in t063_main if r["fixable"] == "0"]

    # Unlike T016, both carry-over and new T063 rows get an april_status
    # column (empty for new rows).
    t063_carry_fixable = []
    t063_new_fixable = []
    for r in t063_main_fixable:
        verdict, statuses, _tasks = classify(r["page_url"], r["link_href"])
        annotated = {**r, "april_status": _join_unique(statuses)}
        if verdict == "carry-over":
            t063_carry_fixable.append(annotated)
        else:
            t063_new_fixable.append(annotated)

    t063_carry_broken = []
    t063_new_broken = []
    for r in t063_main_broken:
        verdict, statuses, _tasks = classify(r["page_url"], r["link_href"])
        annotated = {**r, "april_status": _join_unique(statuses)}
        if verdict == "carry-over":
            t063_carry_broken.append(annotated)
        else:
            t063_new_broken.append(annotated)

    print("=== T063 (legacy /node/N) ===")
    print("  Fixable (has redirect target):")
    print(f"    Total: {len(t063_main_fixable)}")
    print(f"    Carried over: {len(t063_carry_fixable)}")
    print(f"    New: {len(t063_new_fixable)}")
    print("  Broken (no redirect, deleted target):")
    print(f"    Total: {len(t063_main_broken)}")
    print(f"    Carried over: {len(t063_carry_broken)}")
    print(f"    New: {len(t063_new_broken)}")
    print()

    if t063_carry_fixable:
        cs = Counter(r["april_status"] for r in t063_carry_fixable)
        print(f"  Carried-over T063 fixable by April status: {dict(cs)}")
    if t063_carry_broken:
        cs = Counter(r["april_status"] for r in t063_carry_broken)
        print(f"  Carried-over T063 broken by April status: {dict(cs)}")
    print()

    # Save annotated CSVs
    _save_rows(t016_carry_main, "t016_carry_over_main.csv")
    _save_rows(t016_new_main, "t016_new_since_april_main.csv")
    _save_rows(t016_chrome, "t016_theme_level.csv")
    _save_rows(t063_carry_fixable, "t063_carry_over_fixable.csv")
    _save_rows(t063_new_fixable, "t063_new_since_april_fixable.csv")
    _save_rows(t063_carry_broken, "t063_carry_over_broken.csv")
    _save_rows(t063_new_broken, "t063_new_since_april_broken.csv")

    # Final scorecard
    # NOTE(review): the April totals printed below (15, 183, 175 + 39) and the
    # whole T015 section are hard-coded literals, not derived from `master` —
    # confirm they still match the worklist before relying on them.
    print("=" * 60)
    print("SCORECARD: April baseline vs May 8 reaudit")
    print("=" * 60)

    # T015
    print("\nT015 (redirect chains over 2 hops)")
    print("  April: 15 in scope (12 to fix + 3 SKIPPED lang variants)")
    print("  May 8: 3 chains remain — all 3 are the SKIPPED IT/ES/FR variants")
    print("  Net: 12/12 fixable items resolved (100%). 0 new chains.")

    # T016
    apr_t016 = [r for r in master if r["task"] == "T016"]
    apr_t016_done = [r for r in apr_t016 if r["execution_status"] == "DONE"]
    apr_t016_pending = [
        r
        for r in apr_t016
        if r["execution_status"] in ("", "MANUAL-REQUIRED", "FAILED", "NOT-FOUND")
    ]
    print("\nT016 (links via redirect)")
    print("  April: 183 in scope")
    print(f"    DONE in April: {len(apr_t016_done)}")
    print(f"    Pending/Failed/NotFound: {len(apr_t016_pending)}")
    print(f"  May 8: {len(t016_main)} body-content findings ({len(t016_carry_main)} carry-over + {len(t016_new_main)} new)")
    print(f"  Plus: {len(t016_chrome)} theme-level (header/footer menu) — single fix in theme config")

    # T063
    apr_t063 = [r for r in master if r["task"] == "T063"]
    apr_t063_done = [r for r in apr_t063 if r["execution_status"] == "DONE"]
    apr_brokenlink = [r for r in master if r["task"] == "BROKEN-LINK"]
    apr_brokenlink_done = [r for r in apr_brokenlink if r["execution_status"] == "DONE"]
    print("\nT063 (legacy /node/N links)")
    print("  April: 175 fixable + 39 broken (master_worklist sum incl ones discovered during exec)")
    print(f"    DONE in April (T063): {len(apr_t063_done)}")
    print(f"    DONE in April (BROKEN-LINK): {len(apr_brokenlink_done)}")
    print(f"  May 8: {len(t063_main_fixable)} fixable + {len(t063_main_broken)} broken = {len(t063_main)} total")
    print(f"    Carry-over: {len(t063_carry_fixable)} fixable + {len(t063_carry_broken)} broken")
    print(f"    New since April: {len(t063_new_fixable)} fixable + {len(t063_new_broken)} broken")


# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
