"""Run T016 (link-via-redirect) + T063 (legacy /node/N) analysis on fresh crawl."""
import csv
import json
import re
from collections import Counter, defaultdict
from urllib.parse import urlparse


def load_csv(path, **k):
    """Read the CSV file at *path* and return its rows as a list of dicts.

    Extra keyword arguments are forwarded unchanged to ``csv.DictReader``
    (e.g. ``delimiter`` or ``fieldnames``).

    The file is decoded as ``utf-8-sig`` so that a leading byte-order mark
    (common in spreadsheet exports) does not end up glued to the first
    header name; files without a BOM decode exactly as plain UTF-8.  It is
    opened with ``newline=""`` as the ``csv`` module requires, so line
    breaks embedded in quoted fields are preserved as written.
    """
    with open(path, encoding="utf-8-sig", newline="") as f:
        return list(csv.DictReader(f, **k))


def is_terminal(target):
    """Return True when *target* cannot be followed any further.

    A target counts as terminal when it is empty/falsy, is the literal
    ``"<front>"`` token, or is an absolute ``http://`` / ``https://`` URL.
    Anything else is treated as a path that may itself be redirected.
    """
    if target and target != "<front>":
        # Only absolute web URLs stop the chain among non-empty targets.
        return target.startswith(("http://", "https://"))
    return True


def resolve_chain(start, src2dst, max_hops=20):
    """Follow redirects from *start* through *src2dst* and report the path taken.

    Args:
        start: The path to begin resolving from.
        src2dst: Mapping of redirect source to its destination.
        max_hops: Maximum number of redirects to follow before giving up.

    Returns:
        A ``(chain, status)`` tuple.  ``chain`` is the list of visited values,
        beginning with *start*.  ``status`` is one of:

        * ``"ok"`` - the chain ended on a terminal target (see
          ``is_terminal``) or on a value that is not a redirect source;
        * ``"cycle"`` - a value was reached a second time; that repeated
          value is the last element of ``chain``;
        * ``"too-long"`` - ``max_hops`` redirects were followed and the
          chain still had not ended.
    """
    chain = [start]
    visited = {start}
    cur = start
    while not is_terminal(cur) and cur in src2dst:
        # ``chain`` holds the start plus one entry per hop, so this is true
        # once ``max_hops`` hops have been taken.  The limit is checked only
        # when another hop is actually required, so a chain that finishes in
        # exactly ``max_hops`` hops is reported as "ok" rather than "too-long".
        if len(chain) > max_hops:
            return chain, "too-long"
        nxt = src2dst[cur]
        chain.append(nxt)
        if nxt in visited:
            return chain, "cycle"
        visited.add(nxt)
        cur = nxt
    return chain, "ok"


# Matches a bare legacy node path such as ``/node/123``.
_NODE_PATH_RE = re.compile(r"^/node/(\d+)$")

# Columns copied from the crawl CSV into both findings files.
_LINK_COLUMNS = [
    "page_url", "page_nid", "link_href", "link_text",
    "is_cta", "in_main", "in_chrome",
]


def _strip_query_and_fragment(href):
    """Return *href* cut at the first ``?`` and then at the first ``#``."""
    return href.split("?", 1)[0].split("#", 1)[0]


def _primary_destination(candidates):
    """Pick one destination from ``(destination, language)`` pairs.

    The first pair whose language is ``"und"`` or ``"en"`` wins; if there is
    none, the first pair's destination is used.
    """
    return next(
        (dest for dest, lang in candidates if lang in ("und", "en")),
        candidates[0][0],
    )


def _write_findings(path, fieldnames, rows):
    """Write *rows* (dicts) to *path* as CSV, restricted to *fieldnames*.

    Keys not listed in *fieldnames* are dropped and missing keys are written
    as empty cells.
    """
    with open(path, "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(
            f, fieldnames=fieldnames, restval="", extrasaction="ignore"
        )
        writer.writeheader()
        writer.writerows(rows)


def main():
    """Run the T016 and T063 link analyses and write the findings CSVs.

    Reads ``redirects_raw_2026_05_08.csv`` (columns ``source``,
    ``destination``, ``language``) and ``all_internal_links_2026_05_08.csv``
    from the current directory, prints summary counts, and writes
    ``t016_findings_2026_05_08.csv`` and ``t063_findings_2026_05_08.csv``.
    """
    # Load fresh redirect table, grouping every destination by its source.
    destinations_by_source = defaultdict(list)
    for row in load_csv("redirects_raw_2026_05_08.csv"):
        destinations_by_source[row["source"]].append(
            (row["destination"], row["language"])
        )
    src2dst = {
        source: _primary_destination(candidates)
        for source, candidates in destinations_by_source.items()
    }

    # Load fresh crawl
    links = load_csv("all_internal_links_2026_05_08.csv")
    print(f"Total internal links from crawl: {len(links)}")

    # T063: links to /node/N (query string and fragment are ignored).
    t063_findings = [
        link
        for link in links
        if _NODE_PATH_RE.match(_strip_query_and_fragment(link["link_href"]))
    ]

    # T016: links whose href is itself a redirect source (and resolves to a
    # different final URL).
    t016_findings = []
    for link in links:
        path = _strip_query_and_fragment(link["link_href"])
        if path not in src2dst:
            continue
        chain, status = resolve_chain(path, src2dst)
        final = chain[-1]
        if final == path:
            # The chain ends where it started (self-redirect or a cycle back
            # to the start), so there is nothing better to link to.
            continue
        t016_findings.append({
            **link,
            "chain": " -> ".join(chain),
            "final_target": final,
            "hops": len(chain) - 1,
            "chain_status": status,
        })

    print(f"\nT063 raw matches (any /node/N link): {len(t063_findings)}")
    print(f"T016 raw matches (any link via redirect): {len(t016_findings)}")

    # T063 broken vs fixable
    # Broken: target node has been deleted (no canonical) - we'd need to test
    # each /node/NNN URL.  For now, classify as: redirected (fixable -> swap to
    # alias) vs not-in-redirect (probably 404).
    # NOTE(review): a /node/N that redirects to itself or into a cycle is
    # still counted as fixable here, with the last chain element as proposal.
    t063_fixable = []
    t063_probably_broken = []
    for link in t063_findings:
        path = _strip_query_and_fragment(link["link_href"])
        if path in src2dst:
            chain, _ = resolve_chain(path, src2dst)
            t063_fixable.append({
                **link,
                "fixable": 1,
                "proposed_target": chain[-1],
                "chain": " -> ".join(chain),
            })
        else:
            t063_probably_broken.append({
                **link,
                "fixable": 0,
                "proposed_target": "",
                "chain": "",
            })

    print(f"  /node/N with redirect target (fixable): {len(t063_fixable)}")
    print(f"  /node/N with no redirect (likely broken): {len(t063_probably_broken)}")

    # Categorize T016 by hop count
    hop_counter = Counter(finding["hops"] for finding in t016_findings)
    print(f"\nT016 by hops: {dict(sorted(hop_counter.items()))}")

    # Filter to in-scope: same scope as April was "any link via redirect", not
    # just over-2-hop chains.  April delivered 183 link-via-redirect findings.
    # Those were probably body+CTA in <main>, excluding header/footer, so
    # bucket the findings by page region.
    main_only_t016 = [
        finding
        for finding in t016_findings
        if finding["in_main"] == "1" and finding["in_chrome"] == "0"
    ]
    chrome_t016 = [
        finding for finding in t016_findings if finding["in_chrome"] == "1"
    ]
    print(f"\nT016 in main content: {len(main_only_t016)}")
    print(f"T016 in header/footer/nav (chrome): {len(chrome_t016)}")

    # Save findings (fixable T063 rows first, then the probably-broken ones).
    _write_findings(
        "t016_findings_2026_05_08.csv",
        _LINK_COLUMNS + ["final_target", "hops", "chain", "chain_status"],
        t016_findings,
    )
    _write_findings(
        "t063_findings_2026_05_08.csv",
        _LINK_COLUMNS + ["fixable", "proposed_target", "chain"],
        t063_fixable + t063_probably_broken,
    )


# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
