"""Crawl all public URLs from sitemap.xml and extract internal links."""
import csv
import json
import re
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse, urljoin

import requests
from bs4 import BeautifulSoup

# Sitemap that seeds the crawl; every <loc> entry in it is fetched once.
SITEMAP = "https://automate.fortra.com/sitemap.xml"
# A link counts as internal only when its network location equals this host.
HOST = "automate.fortra.com"
# Timeout in seconds, passed to every session.get() call in this file.
TIMEOUT = 25

# One module-level session so every request carries the same User-Agent.
session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0 (FortraReaudit/2026-05-08)"


def get_sitemap_urls():
    """Fetch SITEMAP and return the URLs listed in its <loc> elements.

    Returns:
        list[str]: One entry per non-empty <loc> element, in document order,
        with surrounding whitespace removed and XML entities decoded.

    Raises:
        requests.HTTPError: Propagated from ``raise_for_status`` when the
            sitemap request returns an error status.
    """
    # Function-scope import so the file's top-level import block is untouched.
    from xml.sax.saxutils import unescape

    r = session.get(SITEMAP, timeout=TIMEOUT)
    r.raise_for_status()

    urls = []
    # DOTALL lets a <loc> whose URL sits on its own line still match.
    for raw in re.findall(r"<loc>(.*?)</loc>", r.text, flags=re.DOTALL):
        loc = raw.strip()
        if loc.startswith("<![CDATA[") and loc.endswith("]]>"):
            # CDATA content is literal text: unwrap it, do not decode entities.
            loc = loc[len("<![CDATA["):-len("]]>")].strip()
        else:
            # Sitemap URLs must be entity-escaped (&amp; etc.); decode them so
            # the URL we request is the real one, not the XML spelling of it.
            loc = unescape(loc, {"&apos;": "'", "&quot;": '"'})
        if loc:
            urls.append(loc)
    return urls


def normalize_internal(href, base_url):
    """Resolve *href* against *base_url* and classify it as internal or not.

    Args:
        href: Raw value of an ``href`` attribute (may be empty or None).
        base_url: URL of the page the link was found on; relative hrefs are
            resolved against it.

    Returns:
        tuple: ``(path, True)`` for an internal link, where *path* is the
        resolved URL path plus ``?query`` when a query string is present
        (no scheme, no host, no fragment). ``(None, False)`` for anything
        that is not a crawlable internal link: empty or hash-only hrefs,
        non-HTTP schemes, other hosts, and hrefs that cannot be parsed.
    """
    if not href:
        return None, False
    href = href.strip()
    if not href:
        # Whitespace-only href carries no target; treat it like an empty one.
        return None, False
    # Skip mailto, tel, javascript, hash-only, data: (schemes are
    # case-insensitive, so compare on a lowered copy).
    if href.lower().startswith(("mailto:", "tel:", "javascript:", "#", "data:")):
        return None, False
    # Resolve to absolute. Both calls can raise ValueError on malformed input
    # (e.g. an unbalanced "[" in the host); one bad href must not abort the
    # extraction of every other link on the page.
    try:
        parsed = urlparse(urljoin(base_url, href))
    except ValueError:
        return None, False
    # Any other scheme (sms:, skype:, blob:, ftp:, ...) is not a page on this
    # site. An empty scheme only occurs when base_url itself is relative.
    if parsed.scheme not in ("", "http", "https"):
        return None, False
    # Host names are case-insensitive.
    netloc = parsed.netloc.lower()
    if netloc and netloc != HOST.lower():
        return None, False
    path = parsed.path
    if parsed.query:
        path += "?" + parsed.query
    # The fragment is intentionally dropped: it is irrelevant for redirect
    # matching. The caller keeps the raw href if it needs the full context.
    return path, True


def extract_links(html, page_url):
    """Collect every internal ``<a href>`` in *html* with placement flags.

    Links that normalize_internal() does not classify as internal are skipped.

    Returns a list of dicts, one per internal link, in document order:
        href       path (plus query) returned by normalize_internal()
        raw_href   the href attribute exactly as it appears in the markup
        link_text  anchor text from get_text(strip=True), cut to 200 chars
        is_cta     the anchor or one of its 6 nearest ancestors has a class
                   string containing a CTA/button marker
        in_main    some ancestor has a main-content id or class marker
        in_chrome  some ancestor is a header/footer/nav tag or has a
                   header/footer/menu class marker
    """
    # Class-name fragments the site's theme is presumed to use (the original
    # author describes them as common Drupal regions / button classes).
    cta_markers = ("cta", "btn", "button-cta", "callout-button")
    main_ids = ("main-content", "block-mainpagecontent", "block-fortra-page-title")
    main_markers = ("layout-content", "main-content", "node__content", "field--name-body", "paragraph--type-")
    chrome_tags = ("header", "footer", "nav")
    chrome_markers = ("footer", "site-header", "main-menu")

    def has_marker(node, markers):
        # Substring match against the node's lowercased, space-joined classes.
        class_text = " ".join(node.get("class") or []).lower()
        return any(marker in class_text for marker in markers)

    records = []
    document = BeautifulSoup(html, "html.parser")
    for anchor in document.find_all("a", href=True):
        raw = anchor["href"]
        path, internal = normalize_internal(raw, page_url)
        if not internal:
            continue

        ancestors = list(anchor.parents)
        # CTA styling is looked for close to the link only: the anchor itself
        # plus its six nearest ancestors.
        nearby = [anchor, *ancestors[:6]]

        records.append({
            "href": path,
            "raw_href": raw,
            "link_text": anchor.get_text(strip=True)[:200],
            "is_cta": any(has_marker(node, cta_markers) for node in nearby),
            # Body-content detection, so nav/footer noise can be filtered later.
            "in_main": any(
                (node.get("id") or "") in main_ids or has_marker(node, main_markers)
                for node in ancestors
            ),
            # Header / footer / navigation detection.
            "in_chrome": any(
                (node.name or "").lower() in chrome_tags or has_marker(node, chrome_markers)
                for node in ancestors
            ),
        })
    return records


def fetch_one(url):
    """Fetch *url* without following redirects and summarize the response.

    Never raises: any exception is captured in the returned dict so one bad
    page cannot abort the whole crawl.

    Returns:
        dict: Always contains ``url``, ``status`` and ``links``. Depending on
        the outcome it also contains:

        * redirect (301/302/303/307/308): ``redirect_to`` with the raw
          ``Location`` header (None when the header is absent);
        * 200 with a non-HTML content type: ``ctype``;
        * 200 HTML: ``nid`` (``/node/NNN`` from the shortlink tag, or ``""``),
          ``canonical`` (href of the canonical tag, or ``""``) and the
          extracted ``links``;
        * exception: ``status`` 0 and ``error`` with the exception text.
    """
    try:
        # Redirects are not followed, so the response always describes `url`
        # itself and redirecting sitemap entries can be reported as such.
        r = session.get(url, timeout=TIMEOUT, allow_redirects=False)
        status = r.status_code
        # 303/307/308 carry a Location target just like 301/302; record it
        # for all of them instead of silently dropping it.
        if status in (301, 302, 303, 307, 308):
            return {"url": url, "status": status, "redirect_to": r.headers.get("Location"), "links": []}
        if status != 200:
            return {"url": url, "status": status, "links": []}
        ctype = r.headers.get("content-type", "")
        # Media types are case-insensitive; report the header as received.
        if "html" not in ctype.lower():
            return {"url": url, "status": status, "ctype": ctype, "links": []}
        body = r.text
        links = extract_links(body, url)
        # Node path from <link rel="shortlink" href="/node/NNNNN">.
        # NOTE(review): both patterns expect double quotes and `rel` before
        # `href` - confirm the site always emits the tags in that form.
        m = re.search(r'<link[^>]+rel="shortlink"[^>]+href="(/node/\d+)"', body)
        nid = m.group(1) if m else ""
        m2 = re.search(r'<link[^>]+rel="canonical"[^>]+href="([^"]+)"', body)
        canonical = m2.group(1) if m2 else ""
        return {"url": url, "status": 200, "nid": nid, "canonical": canonical, "links": links}
    except Exception as e:
        # Deliberately broad: this runs in a worker thread, and the caller
        # expects a result dict for every URL rather than an exception.
        return {"url": url, "status": 0, "error": str(e), "links": []}


def main():
    """Crawl every sitemap URL and write the JSON and CSV reports.

    Outputs, all relative to the current working directory:
      * crawl_2026_05_08.json - the fetch_one() result for every URL, keyed
        by URL;
      * all_internal_links_2026_05_08.csv - one row per internal link found
        on pages that returned status 200.
    Progress and summary counts are printed to stderr.
    """
    sitemap_urls = get_sitemap_urls()
    total = len(sitemap_urls)
    print(f"Sitemap URLs: {total}", file=sys.stderr)

    # Fetch concurrently; results are stored in completion order.
    results = {}
    with ThreadPoolExecutor(max_workers=12) as pool:
        pending = {pool.submit(fetch_one, page): page for page in sitemap_urls}
        completed = 0
        for future in as_completed(pending):
            completed += 1
            page_result = future.result()
            results[page_result["url"]] = page_result
            if completed % 50 == 0:
                print(f"  {completed}/{total} fetched", file=sys.stderr)

    # Raw dump for downstream analysis.
    with open("crawl_2026_05_08.json", "w", encoding="utf-8") as json_file:
        json.dump(results, json_file)

    # Flat CSV with one row per internal link; only status-200 pages
    # contribute rows.
    link_count = 0
    pages_with_links = 0
    not_found_pages = 0
    with open("all_internal_links_2026_05_08.csv", "w", encoding="utf-8", newline="") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["page_url", "page_status", "page_nid", "link_href", "link_text", "is_cta", "in_main", "in_chrome"])
        for page_url, page in results.items():
            status = page.get("status")
            if status == 404:
                not_found_pages += 1
            if status != 200:
                continue
            page_links = page.get("links", [])
            if page_links:
                pages_with_links += 1
            for link in page_links:
                link_count += 1
                writer.writerow([
                    page_url,
                    page["status"],
                    page.get("nid", ""),
                    link["href"],
                    link["link_text"],
                    int(link["is_cta"]),
                    int(link["in_main"]),
                    int(link["in_chrome"]),
                ])

    # Tally of pages per status code (0 means the fetch raised an exception).
    statuses = {}
    for page in results.values():
        code = page.get("status", 0)
        statuses[code] = statuses.get(code, 0) + 1

    print(f"\nStatus summary: {statuses}", file=sys.stderr)
    print(f"Total internal links: {link_count}", file=sys.stderr)
    print(f"Pages with at least 1 internal link: {pages_with_links}", file=sys.stderr)
    print(f"404 pages: {not_found_pages}", file=sys.stderr)


# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
