"""Playwright crawl runner. Crawls a client site (homepage + N internal pages), saves per-page artifacts: pages/__/ screenshot.png full-page screenshot dom.html rendered HTML interactive.json structured list of interactive elements (forms, CTAs, etc.) meta.json url, title, status, duration, errors Synchronous; intended to be invoked from a background thread. """ from __future__ import annotations import json import re import time from dataclasses import dataclass, asdict from pathlib import Path from typing import Optional from urllib.parse import urljoin, urlparse, urlunparse, urldefrag from playwright.sync_api import sync_playwright, Page, TimeoutError as PWTimeoutError USER_AGENT = ( "Mozilla/5.0 (compatible; TrackingSetupBot/1.0; " "+https://tracking-setup.demoing.info/about-bot)" ) EXCLUDE_PATHS = ( "/wp-admin", "/wp-login", "/admin", "/login", "/signin", "/sign-in", "/logout", "/account/", "/checkout", "/cart", "/my-account", "/feed", ) EXCLUDE_EXTENSIONS = (".pdf", ".zip", ".dmg", ".exe", ".jpg", ".jpeg", ".png", ".gif", ".webp", ".mp4", ".mp3", ".woff", ".woff2", ".ttf", ".css", ".js", ".xml", ".rss") INTERACTIVE_JS = """ () => { const out = { forms: [], anchors: [], buttons: [], videos: [], other: [] }; const txt = (el) => (el && el.textContent || '').trim().slice(0, 200); const attrs = (el, names) => { const o = {}; for (const n of names) { const v = el.getAttribute(n); if (v) o[n] = v; } return o; }; const visible = (el) => { const r = el.getBoundingClientRect(); if (!r.width || !r.height) return false; const s = getComputedStyle(el); if (s.visibility === 'hidden' || s.display === 'none' || parseFloat(s.opacity) === 0) return false; return true; }; for (const f of document.forms) { if (!visible(f)) continue; const fields = Array.from(f.elements).map(e => ({ tag: e.tagName.toLowerCase(), type: e.type || null, name: e.name || null, id: e.id || null, placeholder: e.placeholder || null, required: !!e.required, label: (e.labels && e.labels[0]) ? txt(e.labels[0]) : null, })); out.forms.push({ id: f.id || null, name: f.name || null, action: f.getAttribute('action') || null, method: (f.method || 'get').toLowerCase(), class: f.className || null, fields, submit_label: (() => { const s = f.querySelector('[type=submit], button:not([type])'); return s ? txt(s) || s.value || null : null; })(), }); } for (const a of document.querySelectorAll('a[href]')) { const href = a.getAttribute('href') || ''; if (!href || href.startsWith('javascript:')) continue; if (href.startsWith('mailto:') || href.startsWith('tel:') || href.startsWith('sms:')) { out.anchors.push({ kind: href.split(':')[0], href, text: txt(a) }); continue; } if (a.hasAttribute('download')) { out.anchors.push({ kind: 'download', href, text: txt(a), filename: a.getAttribute('download') }); } } for (const b of document.querySelectorAll('button, [role=button]')) { if (!visible(b)) continue; const t = txt(b); if (!t) continue; out.buttons.push({ tag: b.tagName.toLowerCase(), text: t, ...attrs(b, ['id', 'class', 'data-event', 'data-action', 'aria-label']), }); } for (const v of document.querySelectorAll('video, iframe[src*="youtube"], iframe[src*="vimeo"], iframe[src*="wistia"]')) { out.videos.push({ tag: v.tagName.toLowerCase(), ...attrs(v, ['src', 'id', 'class', 'data-video-id', 'title']), }); } // Big primary CTA candidates: anchors styled as buttons in nav/hero for (const a of document.querySelectorAll('a.btn, a.button, a[role=button], header a, [class*="hero"] a, [class*="cta"] a')) { if (!visible(a)) continue; const t = txt(a); if (!t) continue; out.other.push({ kind: 'styled_cta', tag: 'a', text: t, href: a.getAttribute('href') || null, class: a.className || null, }); } return out; } """ @dataclass class PageArtifact: index: int url: str title: Optional[str] status: int duration_ms: int error: Optional[str] dir_name: str interactive_summary: dict # counts + small samples @dataclass class CrawlResult: started_at: float finished_at: float pages: list[PageArtifact] error: Optional[str] def _slugify(url: str) -> str: p = urlparse(url) s = (p.path or "/").strip("/").replace("/", "_") or "home" s = re.sub(r"[^a-zA-Z0-9_\-]+", "-", s) return s[:60] or "page" def _normalize(href: str, base: str) -> Optional[str]: if not href: return None href, _ = urldefrag(href) abs_url = urljoin(base, href) p = urlparse(abs_url) if p.scheme not in ("http", "https"): return None if any(p.path.lower().endswith(ext) for ext in EXCLUDE_EXTENSIONS): return None if any(p.path.lower().startswith(ex) for ex in EXCLUDE_PATHS): return None p = p._replace(query="", fragment="") return urlunparse(p) def crawl(target_url: str, output_dir: Path, max_pages: int = 25, per_page_timeout_ms: int = 30000) -> CrawlResult: output_dir.mkdir(parents=True, exist_ok=True) pages_dir = output_dir / "pages" pages_dir.mkdir(exist_ok=True) start = time.time() target_origin = urlparse(target_url) target_origin = (target_origin.scheme, target_origin.netloc) visited: set[str] = set() queue: list[str] = [target_url] artifacts: list[PageArtifact] = [] fatal_error: Optional[str] = None try: with sync_playwright() as pw: browser = pw.chromium.launch( args=["--disable-dev-shm-usage", "--no-sandbox"], ) context = browser.new_context( user_agent=USER_AGENT, viewport={"width": 1366, "height": 900}, ignore_https_errors=False, ) try: while queue and len(artifacts) < max_pages: url = queue.pop(0) if url in visited: continue visited.add(url) p_origin = urlparse(url) if (p_origin.scheme, p_origin.netloc) != target_origin: continue page = context.new_page() idx = len(artifacts) page_dir = pages_dir / f"{idx:03d}__{_slugify(url)}" page_dir.mkdir(exist_ok=True) p_start = time.time() err: Optional[str] = None status = 0 title: Optional[str] = None interactive: dict = {} try: resp = page.goto(url, wait_until="networkidle", timeout=per_page_timeout_ms) status = resp.status if resp else 0 try: page.wait_for_load_state("networkidle", timeout=5000) except PWTimeoutError: pass title = page.title() # Screenshot page.screenshot(path=str(page_dir / "screenshot.png"), full_page=True, animations="disabled") # DOM html = page.content() (page_dir / "dom.html").write_text(html, encoding="utf-8") # Interactive elements interactive = page.evaluate(INTERACTIVE_JS) (page_dir / "interactive.json").write_text( json.dumps(interactive, indent=2), encoding="utf-8" ) # Discover next pages on the homepage + first few pages if idx < 5: for a in page.locator("a[href]").all(): try: href = a.get_attribute("href") except Exception: continue norm = _normalize(href or "", url) if norm and norm not in visited and norm not in queue: p2 = urlparse(norm) if (p2.scheme, p2.netloc) == target_origin: queue.append(norm) except PWTimeoutError as e: err = f"timeout: {e}" except Exception as e: err = f"{type(e).__name__}: {e}" finally: try: page.close() except Exception: pass artifact = PageArtifact( index=idx, url=url, title=title, status=status, duration_ms=int((time.time() - p_start) * 1000), error=err, dir_name=page_dir.name, interactive_summary={ "n_forms": len(interactive.get("forms", [])), "n_anchors": len(interactive.get("anchors", [])), "n_buttons": len(interactive.get("buttons", [])), "n_videos": len(interactive.get("videos", [])), "n_styled_ctas": len(interactive.get("other", [])), }, ) artifacts.append(artifact) (page_dir / "meta.json").write_text( json.dumps(asdict(artifact), indent=2), encoding="utf-8" ) finally: context.close() browser.close() except Exception as e: fatal_error = f"{type(e).__name__}: {e}" return CrawlResult( started_at=start, finished_at=time.time(), pages=artifacts, error=fatal_error, )