"""Re-fetch pages with carry-over findings and apply precise widget-vs-editable classifier
to get a real count of remaining work (not extrapolated)."""
import csv, json, sys
from collections import Counter, defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

# One shared HTTP session; the custom User-Agent identifies this re-audit run.
session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0 (FortraReaudit/2026-05-08-PreciseCount)"

def load_csv(path):
    """Read the CSV file at *path* and return its rows as a list of dicts.

    The first line of the file supplies the column names (csv.DictReader).
    The file is opened with ``newline=""`` as the csv module requires, so
    line breaks embedded inside quoted fields are passed through unchanged
    instead of being rewritten by universal-newline translation.
    """
    with open(path, encoding="utf-8", newline="") as f:
        return list(csv.DictReader(f))

def pp(u):
    """Return the page path of *u* without trailing slashes.

    Strings starting with "http" are parsed as URLs and reduced to their path
    component; anything else is used as given. An empty result becomes "/".
    """
    if u.startswith("http"):
        path = urlparse(u).path
    else:
        path = u
    trimmed = path.rstrip("/")
    return trimmed if trimmed else "/"
def nh(h):
    """Normalize an href: drop the query string, the fragment and trailing slashes.

    An href that normalizes to the empty string is returned as "/".
    """
    without_query = h.partition("?")[0]
    without_fragment = without_query.partition("#")[0]
    stripped = without_fragment.rstrip("/")
    return stripped or "/"

# Load fresh findings. T016 keeps rows flagged in_main and not in_chrome;
# T063 keeps every row flagged in_main.
t016 = [
    row
    for row in load_csv("t016_findings_2026_05_08.csv")
    if row["in_main"] == "1" and row["in_chrome"] == "0"
]
t063 = [
    row
    for row in load_csv("t063_findings_2026_05_08.csv")
    if row["in_main"] == "1"
]

# Tag every finding with its task and source CSV (T016 rows first, then T063).
# No de-duplication happens here; the list holds one entry per CSV row.
all_findings = [{**row, "task": "T016", "src_csv": "t016"} for row in t016]
all_findings.extend({**row, "task": "T063", "src_csv": "t063"} for row in t063)
print(f"Total in-main findings to classify: {len(all_findings)}", file=sys.stderr)

# Group findings by page so we fetch each page once
by_page = defaultdict(list)
for finding in all_findings:
    by_page[finding["page_url"]].append(finding)
print(f"Unique pages to fetch: {len(by_page)}", file=sys.stderr)

# Region classifier (based on the class/id chain of the nearest ancestors)
# Substrings that mark auto-generated link lists.
WIDGET_HINTS = (
    "related-content",
    "related-articles",
    "recent-posts",
    "sidebar",
    "block-views",
    "recommended",
    "featured-content",
    "node--title",
    "field--name-title",
    "you-may-also-like",
    "more-from",
    "auto-list",
)
# Substrings that mark hand-edited body or CTA content.
EDITABLE_HINTS = (
    "paragraph--type--text",
    "paragraph--type--cta",
    "paragraph--type--callout",
    "paragraph--type--accordion",
    "paragraph--type--rich-text",
    "paragraph--type--quote",
    "paragraph--type--media",
    "paragraph--type--columns",
    "paragraph--type--button",
    "field--name-body",
)


def classify(a):
    """Classify the page region an anchor sits in from its nearest ancestors.

    Only the 12 closest ancestors of *a* are inspected. Their class names and
    ids are lower-cased and joined into one string that is searched for the
    hint substrings.

    Returns one of:
      "widget"        - any WIDGET_HINTS substring is present (checked first)
      "editable_cta"  - an EDITABLE_HINTS substring is present and the chain
                        contains "paragraph--type--cta"
      "editable_body" - any other EDITABLE_HINTS match
      "chrome"        - header/menu/footer class or id text, or a <nav> ancestor
      "unclear"       - none of the above
    """
    parent_descs = []
    tag_names = set()
    for p in list(a.parents)[:12]:
        cls = " ".join(p.get("class") or []).lower()
        pid = (p.get("id") or "").lower()
        parent_descs.append(cls + " " + pid)
        # Tag names are tracked separately: the class/id text never contains
        # markup, so a substring test for "<nav" on it could not match.
        tag_names.add((getattr(p, "name", None) or "").lower())
    flat = " ".join(parent_descs)
    # Widget? (auto-generated lists)
    if any(w in flat for w in WIDGET_HINTS):
        return "widget"
    # Editable body or CTA?
    if any(e in flat for e in EDITABLE_HINTS):
        # CTA paragraph specifically
        if "paragraph--type--cta" in flat:
            return "editable_cta"
        return "editable_body"
    # Header/footer/nav (shouldn't reach here for in_main, but safety)
    if (
        "site-header" in flat
        or "main-menu" in flat
        or "footer" in flat
        or "nav" in tag_names
    ):
        return "chrome"
    return "unclear"

# Lower rank = more actionable. When a link occurs several times on a page,
# the occurrence with the lowest rank decides the region reported for the row.
_REGION_PRIORITY = {
    "editable_body": 0,
    "editable_cta": 1,
    "unclear": 2,
    "widget": 3,
    "chrome": 4,
    "gone": 5,
    "page_error": 6,
}


def fetch_and_classify(page_url, findings_for_page):
    """Fetch *page_url* once and classify every finding recorded for it.

    Args:
        page_url: URL of the page to download with the shared session.
        findings_for_page: finding rows (dicts) for this page; each must carry
            a "link_href" key.

    Returns:
        A list with one dict per finding: the original row plus "live_status",
        "region" and "match_count". Rows whose link was found also get
        "all_regions"; rows from a failed request also get "error".
        "region" is "fetch_error" when the request raised, "page_error" for a
        non-200 response, "gone" when no anchor on the page has the same
        normalized href, otherwise the best-ranked classify() result.
    """
    try:
        resp = session.get(page_url, timeout=25)
    except Exception as e:
        # Best-effort run: record the failure on each row instead of aborting.
        return [
            {**f, "live_status": 0, "region": "fetch_error", "match_count": 0, "error": str(e)}
            for f in findings_for_page
        ]
    if resp.status_code != 200:
        return [
            {**f, "live_status": resp.status_code, "region": "page_error", "match_count": 0}
            for f in findings_for_page
        ]

    soup = BeautifulSoup(resp.text, "html.parser")
    # Index the page's anchors by normalized href once, rather than rescanning
    # and renormalizing every anchor for every finding.
    anchors_by_href = defaultdict(list)
    for a in soup.find_all("a", href=True):
        anchors_by_href[nh(a["href"])].append(a)

    out = []
    for f in findings_for_page:
        # .get() so that a miss does not insert an empty list into the index
        matches = anchors_by_href.get(nh(f["link_href"]))
        if not matches:
            out.append({**f, "live_status": 200, "region": "gone", "match_count": 0})
            continue
        # Classify each occurrence; the worst-case (most-real) wins for the row
        regions = [classify(a) for a in matches]
        best_region = min(regions, key=lambda region: _REGION_PRIORITY.get(region, 99))
        out.append({
            **f,
            "live_status": 200,
            "region": best_region,
            "match_count": len(matches),
            "all_regions": ",".join(regions),
        })
    return out

# Fetch and classify all pages concurrently, collecting rows as pages finish.
results = []
total_pages = len(by_page)
with ThreadPoolExecutor(max_workers=12) as pool:
    pending = [
        pool.submit(fetch_and_classify, url, rows)
        for url, rows in by_page.items()
    ]
    for done, future in enumerate(as_completed(pending), start=1):
        results.extend(future.result())
        # Progress line every 25 completed pages
        if done % 25 == 0:
            print(f"  {done}/{total_pages} pages done", file=sys.stderr)

# Categorize: tally classified rows per (task, region) pair
counter = Counter((r["task"], r["region"]) for r in results)
# The heading reports the actual number of classified rows, not a fixed total.
print(f"\n=== Region breakdown of all {len(results)} in-main findings ===", file=sys.stderr)
for k, v in sorted(counter.items()):
    print(f"  {k}: {v}", file=sys.stderr)

# Count REAL remaining work = editable_body + editable_cta + unclear
real_t016 = [r for r in results if r["task"]=="T016" and r["region"] in ("editable_body","editable_cta","unclear")]
real_t063 = [r for r in results if r["task"]=="T063" and r["region"] in ("editable_body","editable_cta","unclear")]

# Unique link instances (page+href+link_text)
def uniq_key(r):
    """Return the de-duplication key for a row: (page path, normalized href, link text)."""
    page_path = pp(r["page_url"])
    href = nh(r["link_href"])
    return page_path, href, r["link_text"]

# Collapse the remaining-work rows of each task into unique link keys
uniq_t016 = set(map(uniq_key, real_t016))
uniq_t063 = set(map(uniq_key, real_t063))

print("\n=== REAL REMAINING WORK (editable body + CTA + unclear) ===", file=sys.stderr)
print(
    f"  T016 (link via redirect):  {len(real_t016)} rows / {len(uniq_t016)} unique links",
    file=sys.stderr,
)
print(
    f"  T063 (legacy /node/N):     {len(real_t063)} rows / {len(uniq_t063)} unique links",
    file=sys.stderr,
)
print(f"  Total unique links to fix: {len(uniq_t016) + len(uniq_t063)}", file=sys.stderr)

# Pages needing edits (union over both tasks)
pages_t016 = {row["page_url"] for row in real_t016}
pages_t063 = {row["page_url"] for row in real_t063}
all_pages = pages_t016.union(pages_t063)
print(f"  Across {len(all_pages)} unique pages", file=sys.stderr)

# CTA vs body breakdown; a Counter returns 0 for regions that never occur
region_tally = Counter(row["region"] for row in real_t016 + real_t063)
cta_count = region_tally["editable_cta"]
body_count = region_tally["editable_body"]
unclear_count = region_tally["unclear"]
print(f"  By type: {body_count} body links, {cta_count} CTA buttons, {unclear_count} unclear", file=sys.stderr)

# Save the precise remaining work as a CSV, ordered by page URL then link href
with open("REAL_remaining_work_2026_05_08.csv", "w", encoding="utf-8", newline="") as f:
    w = csv.writer(f)
    w.writerow(["task","page_url","link_href","link_text","region","is_cta","final_target_or_proposed","match_count_on_page"])
    w.writerows(
        [
            row["task"],
            row["page_url"],
            row["link_href"],
            row["link_text"],
            row["region"],
            row.get("is_cta", ""),
            # Prefer final_target; fall back to proposed_target, then blank
            row.get("final_target") or row.get("proposed_target") or "",
            row.get("match_count", ""),
        ]
        for row in sorted(
            real_t016 + real_t063,
            key=lambda item: (item["page_url"], item["link_href"]),
        )
    )

# Save full JSON for inspection: every classified row, including page/fetch
# errors and links that are gone. The encoding is stated explicitly, like the
# other file opens in this script, so it does not depend on the locale default.
with open("precise_classification.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)
