"""Playwright crawl runner.

Crawls a client site (homepage + N internal pages), saves per-page artifacts:
  pages/<idx>__<slug>/
    screenshot.png   full-page screenshot
    dom.html         rendered HTML
    interactive.json structured list of interactive elements (forms, CTAs, etc.)
    meta.json        url, title, status, duration, errors

Synchronous; intended to be invoked from a background thread.
"""

from __future__ import annotations

import json
import re
import time
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Optional
from urllib.parse import urljoin, urlparse, urlunparse, urldefrag

from playwright.sync_api import sync_playwright, Page, TimeoutError as PWTimeoutError

# User-Agent string set on the browser context so site owners can identify
# the crawler in their logs.
USER_AGENT = (
    "Mozilla/5.0 (compatible; TrackingSetupBot/1.0; "
    "+https://tracking-setup.demoing.info/about-bot)"
)

# URL path prefixes that are never queued. `_normalize` compares them against
# the lower-cased path with `startswith`, so each entry is a plain prefix
# match (e.g. "/admin" also matches "/administrator").
EXCLUDE_PATHS = (
    "/wp-admin", "/wp-login", "/admin", "/login", "/signin", "/sign-in",
    "/logout", "/account/", "/checkout", "/cart", "/my-account", "/feed",
)
# File extensions that are never queued. `_normalize` compares them against
# the end of the lower-cased URL path.
EXCLUDE_EXTENSIONS = (".pdf", ".zip", ".dmg", ".exe", ".jpg", ".jpeg", ".png",
                     ".gif", ".webp", ".mp4", ".mp3", ".woff", ".woff2", ".ttf",
                     ".css", ".js", ".xml", ".rss")

# JavaScript arrow function evaluated inside each crawled page via
# `page.evaluate`. It returns a JSON-serialisable object with five lists:
#   forms   - visible <form> elements with their fields and submit label
#   anchors - only mailto:/tel:/sms: links and links carrying a `download`
#             attribute (ordinary navigation links are not recorded)
#   buttons - visible <button> / [role=button] elements with non-empty text
#   videos  - <video> elements and YouTube / Vimeo / Wistia iframes
#   other   - visible anchors that look like CTAs (button-like classes, or
#             located inside header / hero / cta containers)
# Text content is trimmed and capped at 200 characters. "Visible" means a
# non-zero bounding box and not hidden via visibility, display or opacity 0.
INTERACTIVE_JS = """
() => {
  const out = { forms: [], anchors: [], buttons: [], videos: [], other: [] };
  const txt = (el) => (el && el.textContent || '').trim().slice(0, 200);
  const attrs = (el, names) => {
    const o = {};
    for (const n of names) { const v = el.getAttribute(n); if (v) o[n] = v; }
    return o;
  };
  const visible = (el) => {
    const r = el.getBoundingClientRect();
    if (!r.width || !r.height) return false;
    const s = getComputedStyle(el);
    if (s.visibility === 'hidden' || s.display === 'none' || parseFloat(s.opacity) === 0) return false;
    return true;
  };

  for (const f of document.forms) {
    if (!visible(f)) continue;
    const fields = Array.from(f.elements).map(e => ({
      tag: e.tagName.toLowerCase(),
      type: e.type || null,
      name: e.name || null,
      id: e.id || null,
      placeholder: e.placeholder || null,
      required: !!e.required,
      label: (e.labels && e.labels[0]) ? txt(e.labels[0]) : null,
    }));
    out.forms.push({
      id: f.id || null,
      name: f.name || null,
      action: f.getAttribute('action') || null,
      method: (f.method || 'get').toLowerCase(),
      class: f.className || null,
      fields,
      submit_label: (() => {
        const s = f.querySelector('[type=submit], button:not([type])');
        return s ? txt(s) || s.value || null : null;
      })(),
    });
  }

  for (const a of document.querySelectorAll('a[href]')) {
    const href = a.getAttribute('href') || '';
    if (!href || href.startsWith('javascript:')) continue;
    if (href.startsWith('mailto:') || href.startsWith('tel:') || href.startsWith('sms:')) {
      out.anchors.push({ kind: href.split(':')[0], href, text: txt(a) });
      continue;
    }
    if (a.hasAttribute('download')) {
      out.anchors.push({ kind: 'download', href, text: txt(a), filename: a.getAttribute('download') });
    }
  }

  for (const b of document.querySelectorAll('button, [role=button]')) {
    if (!visible(b)) continue;
    const t = txt(b);
    if (!t) continue;
    out.buttons.push({
      tag: b.tagName.toLowerCase(),
      text: t,
      ...attrs(b, ['id', 'class', 'data-event', 'data-action', 'aria-label']),
    });
  }

  for (const v of document.querySelectorAll('video, iframe[src*="youtube"], iframe[src*="vimeo"], iframe[src*="wistia"]')) {
    out.videos.push({
      tag: v.tagName.toLowerCase(),
      ...attrs(v, ['src', 'id', 'class', 'data-video-id', 'title']),
    });
  }

  // Big primary CTA candidates: anchors styled as buttons in nav/hero
  for (const a of document.querySelectorAll('a.btn, a.button, a[role=button], header a, [class*="hero"] a, [class*="cta"] a')) {
    if (!visible(a)) continue;
    const t = txt(a);
    if (!t) continue;
    out.other.push({
      kind: 'styled_cta',
      tag: 'a',
      text: t,
      href: a.getAttribute('href') || null,
      class: a.className || null,
    });
  }

  return out;
}
"""


@dataclass
class PageArtifact:
    """Metadata for one crawled page; serialised to that page's meta.json."""

    # Zero-based position of the page in crawl order.
    index: int
    # URL that was requested for this page.
    url: str
    # Document title, or None if the page failed before the title was read.
    title: Optional[str]
    # HTTP status of the navigation response; 0 when there was no response
    # or navigation failed.
    status: int
    # Elapsed time spent on the page, in milliseconds.
    duration_ms: int
    # "timeout: ..." or "<ExceptionType>: <message>" on failure, else None.
    error: Optional[str]
    # Name of the per-page directory under pages/ (e.g. "000__home").
    dir_name: str
    # Element counts as filled in by crawl(): n_forms, n_anchors, n_buttons,
    # n_videos, n_styled_ctas.
    interactive_summary: dict


@dataclass
class CrawlResult:
    """Outcome of a whole crawl run as returned by crawl()."""

    # Wall-clock start time, seconds since the epoch (time.time()).
    started_at: float
    # Wall-clock end time, seconds since the epoch (time.time()).
    finished_at: float
    # One entry per attempted page, in crawl order (failed pages included).
    pages: list[PageArtifact]
    # "<ExceptionType>: <message>" if the run aborted, otherwise None.
    error: Optional[str]


def _slugify(url: str) -> str:
    """Build a short, filesystem-safe slug from the path component of *url*.

    Path separators become underscores, every run of characters outside
    ``[a-zA-Z0-9_-]`` collapses to a single hyphen, and the result is capped
    at 60 characters. A root or empty path yields ``"home"``.
    """
    segments = urlparse(url).path.strip("/")
    if segments:
        slug = "_".join(segments.split("/"))
    else:
        slug = "home"
    slug = re.sub(r"[^a-zA-Z0-9_\-]+", "-", slug)
    truncated = slug[:60]
    return truncated if truncated else "page"


def _normalize(href: str, base: str) -> Optional[str]:
    """Resolve *href* against *base* and return a canonical crawlable URL.

    The fragment and query string are dropped so that variants of the same
    page collapse to a single URL.

    Args:
        href: Raw ``href`` attribute value taken from the page.
        base: Absolute URL that relative links are resolved against.

    Returns:
        The absolute http(s) URL without query or fragment, or ``None`` when
        the link must not be crawled: empty or whitespace-only href,
        malformed URL, non-http(s) scheme, a path ending in one of
        ``EXCLUDE_EXTENSIONS``, or a path starting with one of
        ``EXCLUDE_PATHS``.
    """
    # Attribute values may carry surrounding whitespace, which browsers ignore.
    href = (href or "").strip()
    if not href:
        return None
    try:
        href, _ = urldefrag(href)
        parts = urlparse(urljoin(base, href))
    except ValueError:
        # urllib rejects malformed URLs (e.g. "http://[bad" -> "Invalid IPv6
        # URL"). One broken link must not fail the page that contains it.
        return None
    if parts.scheme not in ("http", "https"):
        return None
    path = parts.path.lower()
    # str.endswith / str.startswith accept a tuple of candidates.
    if path.endswith(EXCLUDE_EXTENSIONS) or path.startswith(EXCLUDE_PATHS):
        return None
    return urlunparse(parts._replace(query="", fragment=""))


def _visit_key(url: str) -> str:
    """Return the de-duplication key for *url*: no query/fragment, path never empty."""
    parts = urlparse(url)
    return urlunparse(
        parts._replace(path=parts.path or "/", query="", fragment="")
    )


def crawl(target_url: str, output_dir: Path, max_pages: int = 25,
          per_page_timeout_ms: int = 30000) -> CrawlResult:
    """Crawl *target_url* breadth-first and write per-page artifacts.

    Each attempted page gets a directory ``pages/<idx>__<slug>/`` under
    *output_dir* containing ``screenshot.png``, ``dom.html``,
    ``interactive.json`` and ``meta.json`` (the first three only as far as
    the page got before any error). Only URLs with the same scheme and host
    as *target_url* are followed, and new links are collected only from the
    first five pages.

    Args:
        target_url: Seed URL; also defines the origin the crawl is limited to.
        output_dir: Directory that receives the ``pages/`` tree (created if
            missing).
        max_pages: Maximum number of pages to attempt, failures included.
        per_page_timeout_ms: Navigation timeout for each page in milliseconds.

    Returns:
        A ``CrawlResult`` listing every attempted page. Errors on a single
        page are stored in that page's ``error`` field; an error that aborts
        the run is stored in ``CrawlResult.error`` and the pages gathered so
        far are still returned.

    Raises:
        OSError: If *output_dir* or its ``pages`` sub-directory cannot be
            created (this happens before the guarded crawl section).
        ValueError: If *target_url* cannot be parsed.
    """
    # Local import keeps this function self-contained.
    from collections import deque

    output_dir.mkdir(parents=True, exist_ok=True)
    pages_dir = output_dir / "pages"
    pages_dir.mkdir(exist_ok=True)

    start = time.time()
    seed = urlparse(target_url)
    target_origin = (seed.scheme, seed.netloc)

    # `seen` holds the key of every URL ever queued, so a page is fetched at
    # most once even when it is linked as "https://site" and "https://site/",
    # or with differing query strings.
    queue: deque[str] = deque([target_url])
    seen: set[str] = {_visit_key(target_url)}
    artifacts: list[PageArtifact] = []
    fatal_error: Optional[str] = None

    try:
        with sync_playwright() as pw:
            browser = pw.chromium.launch(
                args=["--disable-dev-shm-usage", "--no-sandbox"],
            )
            context = browser.new_context(
                user_agent=USER_AGENT,
                viewport={"width": 1366, "height": 900},
                ignore_https_errors=False,
            )

            try:
                while queue and len(artifacts) < max_pages:
                    url = queue.popleft()

                    page = context.new_page()
                    idx = len(artifacts)
                    page_dir = pages_dir / f"{idx:03d}__{_slugify(url)}"
                    page_dir.mkdir(exist_ok=True)
                    # Monotonic clock: the duration must not be skewed by
                    # wall-clock adjustments.
                    p_start = time.perf_counter()
                    err: Optional[str] = None
                    status = 0
                    title: Optional[str] = None
                    interactive: dict = {}

                    try:
                        resp = page.goto(url, wait_until="networkidle",
                                         timeout=per_page_timeout_ms)
                        status = resp.status if resp else 0
                        # Short extra settle period; giving up on it is fine.
                        try:
                            page.wait_for_load_state("networkidle", timeout=5000)
                        except PWTimeoutError:
                            pass

                        # After a same-origin redirect, relative links must be
                        # resolved against the final URL, and that URL must
                        # not be crawled again later.
                        final = urlparse(page.url)
                        if (final.scheme, final.netloc) == target_origin:
                            link_base = page.url
                            seen.add(_visit_key(page.url))
                        else:
                            link_base = url

                        title = page.title()
                        # Screenshot
                        page.screenshot(path=str(page_dir / "screenshot.png"),
                                        full_page=True, animations="disabled")
                        # DOM
                        html = page.content()
                        (page_dir / "dom.html").write_text(html, encoding="utf-8")
                        # Interactive elements
                        interactive = page.evaluate(INTERACTIVE_JS)
                        (page_dir / "interactive.json").write_text(
                            json.dumps(interactive, indent=2), encoding="utf-8"
                        )
                        # Discover next pages on the homepage + first few pages
                        if idx < 5:
                            # One round-trip for all hrefs instead of one
                            # locator call per anchor, which could also stall
                            # on elements that detach in the meantime.
                            hrefs = page.locator("a[href]").evaluate_all(
                                "els => els.map(e => e.getAttribute('href'))"
                            )
                            for href in hrefs:
                                try:
                                    norm = _normalize(href or "", link_base)
                                except ValueError:
                                    # Malformed href; skip it rather than
                                    # flagging the whole page as failed.
                                    continue
                                if not norm:
                                    continue
                                link = urlparse(norm)
                                if (link.scheme, link.netloc) != target_origin:
                                    continue
                                key = _visit_key(norm)
                                if key not in seen:
                                    seen.add(key)
                                    queue.append(norm)
                    except PWTimeoutError as e:
                        err = f"timeout: {e}"
                    except Exception as e:
                        # Per-page boundary: record the failure and move on.
                        err = f"{type(e).__name__}: {e}"
                    finally:
                        # Best-effort close; a failure here must not hide the
                        # page result.
                        try:
                            page.close()
                        except Exception:
                            pass

                    artifact = PageArtifact(
                        index=idx,
                        url=url,
                        title=title,
                        status=status,
                        duration_ms=int((time.perf_counter() - p_start) * 1000),
                        error=err,
                        dir_name=page_dir.name,
                        interactive_summary={
                            "n_forms": len(interactive.get("forms", [])),
                            "n_anchors": len(interactive.get("anchors", [])),
                            "n_buttons": len(interactive.get("buttons", [])),
                            "n_videos": len(interactive.get("videos", [])),
                            "n_styled_ctas": len(interactive.get("other", [])),
                        },
                    )
                    artifacts.append(artifact)
                    (page_dir / "meta.json").write_text(
                        json.dumps(asdict(artifact), indent=2), encoding="utf-8"
                    )
            finally:
                context.close()
                browser.close()

    except Exception as e:
        # Run-level boundary: report the failure in the result instead of
        # raising, so the pages collected so far are still returned.
        fatal_error = f"{type(e).__name__}: {e}"

    return CrawlResult(
        started_at=start,
        finished_at=time.time(),
        pages=artifacts,
        error=fatal_error,
    )
