"""Tracking-spec builder: turns crawl artifacts into a Report + Actions via Claude.

v1 strategy (cost-efficient):
- Single Claude Opus call.
- System prompt is cached (it's static across runs — high cache hit rate).
- User content: structured textual summary of all crawled pages' interactive elements
  + ONE screenshot (homepage) for visual brand/CTA context. We don't send 25 screenshots
  because cost balloons fast and the textual interactive.json captures most of the
  signal an analyst needs.
"""

from __future__ import annotations

import base64
import json
import os
from pathlib import Path
from typing import Any

import anthropic

from db import execute, query_one

# Runtime configuration, read once from the environment at import time.
# An empty API key is tolerated here; analyze_and_build_report() rejects it
# when it is actually needed.
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
ANALYZER_MODEL = os.getenv("ANALYZER_MODEL", "claude-opus-4-7")
# Per-crawl spend ceiling in USD. A non-numeric value makes float() raise
# ValueError while this module is being imported.
CRAWL_MAX_SPEND_USD = float(os.getenv("CRAWL_MAX_SPEND_USD", "2.00"))

# Approximate Anthropic list prices in USD per million tokens, intended for
# claude-opus-4-7. They feed the per-CrawlRun spend cap and the spend figure
# shown on the crawl detail page; update them if pricing shifts.
# NOTE(review): these rates are applied even when ANALYZER_MODEL is overridden
# to a different model — confirm they still match in that case.
PRICE_INPUT_USD_PER_MTOK = 15.0
PRICE_OUTPUT_USD_PER_MTOK = 75.0
PRICE_CACHE_WRITE_USD_PER_MTOK = 18.75
PRICE_CACHE_READ_USD_PER_MTOK = 1.50


# Static system prompt for the analyzer request. analyze_and_build_report()
# sends it as the single `system` text block, marked with ephemeral
# cache_control. The JSON schema described in the prompt is the contract that
# the parsing code relies on: it reads the top-level "title", "summary_md" and
# "actions" keys, and for each action its "kind", "title", "notes_md" and
# "spec". Keep the prompt and that parsing code in sync when editing either.
# The `//` lines inside the schema are guidance for the model, not real JSON.
SYSTEM_PROMPT = """You are an expert digital analytics consultant specializing in Google Analytics 4 (GA4), Google Tag Manager (GTM), and Google Ads conversion tracking.

You are helping set up tracking for a website. You will be given a structured summary of a recent crawl of the site (homepage + several internal pages), including:
- Page metadata (URL, title, HTTP status)
- Interactive element inventory per page (forms, mailto/tel links, primary CTAs, buttons, videos)
- A screenshot of the homepage

Your job: propose a complete, opinionated GA4 + GTM tracking plan for this site, output as strict JSON conforming to the schema below.

Be specific and concrete: every Action must reference real elements you observed. Don't invent forms that aren't there. Prefer fewer, well-justified events over a long list of low-value events. Use the GA4 recommended event names where applicable (purchase, generate_lead, sign_up, contact, view_item, add_to_cart, etc.) and snake_case for any custom event names.

For form submissions: prefer one event per *purpose* (lead form, contact form, newsletter signup) rather than one per form element — group similar forms under the same event with a `form_id` parameter.

For phone/email link clicks: propose `contact` event with method=phone or method=email, and a parameter for the actual value (last 4 digits of phone for privacy is fine, full email is fine since user clicked it).

For primary CTAs (e.g. "Book a Consultation", "Get a Quote"): propose meaningful event names tied to the action's intent.

For video plays (YouTube/Vimeo embeds): propose video_start and video_complete using GTM's built-in video triggers.

OUTPUT FORMAT — return ONLY a single JSON object (no prose, no markdown fence) matching this schema:

{
  "title": "Tracking Plan for <Client Name>",
  "summary_md": "<2-4 paragraph markdown executive summary: what we're recommending and why, in plain English a non-technical client can understand>",
  "actions": [
    {
      "kind": "event" | "custom_dimension" | "custom_metric" | "conversion_marking" | "ads_import" | "search_console" | "settings",
      "title": "<short imperative title for the action — what we're doing>",
      "notes_md": "<markdown explanation: what this captures, why it matters for the business, how the client should think about the data it produces>",
      "spec": {
        // For kind=event:
        "event_name": "generate_lead",
        "trigger": {
          "type": "form_submit" | "click" | "page_view" | "video" | "scroll" | "custom",
          "selector": "<CSS selector or descriptor>",
          "page_filter": "<URL pattern or 'any'>",
          "details": "<plain-language description of when this fires>"
        },
        "parameters": [
          {"name": "form_id", "value": "<source>", "description": "..."}
        ],
        "should_be_conversion": true | false,
        "should_import_to_google_ads": true | false
      }
      // For kind=custom_dimension/custom_metric:
      // spec: { "scope": "event"|"user", "name": "...", "param_name": "...", "rationale": "..." }
      // For kind=conversion_marking:
      // spec: { "event_name": "...", "rationale": "..." }
      // For kind=ads_import:
      // spec: { "ga4_event_name": "...", "ads_conversion_action_name": "...", "category": "lead|purchase|signup|contact|other", "rationale": "..." }
      // For kind=search_console:
      // spec: { "verification_method": "dns_txt"|"meta_tag"|"google_analytics", "rationale": "..." }
      // For kind=settings:
      // spec: { "service": "ga4"|"gtm"|"google_ads", "setting": "...", "value": "...", "rationale": "..." }
    }
  ]
}

Every Action in `actions` must be self-explanatory enough for a non-technical client to read and approve. Order actions: events first (most important to least), then custom dimensions, then conversion markings, then ads imports, then search console, then misc settings."""


def _b64_image(path: Path) -> str:
    """Return the bytes of the file at *path* as a base64 text string.

    The standard base64 alphabet is used and the encoded bytes are decoded
    to ``str`` so the result can be embedded directly in a JSON request body.
    Errors from reading the file propagate to the caller.
    """
    raw = path.read_bytes()
    return base64.b64encode(raw).decode()


def _load_interactive(page_dir: Path) -> dict:
    """Load ``interactive.json`` from *page_dir*, or ``{}`` when it is unusable.

    Best-effort by design: a missing or unreadable file, undecodable bytes,
    invalid JSON, or a top-level value that is not a JSON object all yield an
    empty inventory, so one bad page cannot abort the whole summary.
    """
    try:
        # Decode as UTF-8 explicitly instead of relying on the platform's
        # default text encoding.
        data = json.loads((page_dir / "interactive.json").read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return {}
    # The code below calls .get() on the result, so anything that is not a
    # JSON object is treated as "no inventory".
    return data if isinstance(data, dict) else {}


def _build_summary(pages: list, target_url: str, crawl_dir: Path) -> str:
    """Build a compact Markdown-style summary of the crawl's interactive inventory.

    For every page a header block (index, title, URL, HTTP status, duration,
    optional error) is emitted, followed by one section per non-empty
    inventory category read from ``<crawl_dir>/pages/<dir_name>/interactive.json``.
    Section headers report the full element count, while the listings are
    capped (20 forms, 30 anchors, 25 CTAs, 25 buttons, 10 videos) and each
    form's field list is cut to 300 characters. Those caps are what keep the
    summary small (the stated target is roughly 30k tokens for a 25-page site).

    Args:
        pages: Page records exposing ``dir_name``, ``index``, ``title``,
            ``url``, ``status``, ``duration_ms`` and ``error`` attributes.
        target_url: URL that was crawled; used in the top-level heading.
        crawl_dir: Root directory of the crawl artifacts.

    Returns:
        The summary as a single newline-joined string.
    """
    lines: list[str] = [f"# Crawl summary for {target_url}", ""]
    for p in pages:
        interactive = _load_interactive(crawl_dir / "pages" / p.dir_name)
        lines.append(f"## Page #{p.index} — {p.title or '(no title)'}")
        lines.append(f"URL: {p.url}")
        lines.append(f"HTTP {p.status}, {p.duration_ms} ms")
        if p.error:
            lines.append(f"ERROR: {p.error}")

        # `or []` also covers keys that are present but null in the JSON.
        forms = interactive.get("forms") or []
        if forms:
            lines.append(f"### Forms ({len(forms)})")
            for form in forms[:20]:
                fid = form.get("id") or form.get("name") or "(unnamed)"
                action = form.get("action") or "(default)"
                # Describe each visible field by the first identifying
                # attribute it has; submit/hidden/button inputs are skipped.
                fields = ", ".join(
                    (fld.get("name") or fld.get("placeholder") or fld.get("label") or fld.get("type") or "?")
                    for fld in (form.get("fields") or [])
                    if fld.get("type") not in ("submit", "hidden", "button")
                )[:300]
                submit = form.get("submit_label") or "(submit)"
                lines.append(f"- form `{fid}` action={action} → fields: {fields} | submit: {submit}")

        anchors = interactive.get("anchors") or []
        if anchors:
            lines.append(f"### Special anchors ({len(anchors)})")
            for a in anchors[:30]:
                lines.append(f"- {a.get('kind')}: {a.get('href')} — {a.get('text', '')}")

        # NOTE(review): styled CTAs are read from the "other" key — confirm
        # this matches what the crawler writes.
        ctas = interactive.get("other") or []
        if ctas:
            lines.append(f"### Styled CTAs ({len(ctas)})")
            for c in ctas[:25]:
                lines.append(f"- '{c.get('text')}' → {c.get('href')}")

        buttons = interactive.get("buttons") or []
        if buttons:
            lines.append(f"### Buttons ({len(buttons)})")
            for b in buttons[:25]:
                aria = b.get("aria-label")
                # Formatting via f-string tolerates a non-string aria value.
                suffix = f" [aria={aria}]" if aria else ""
                lines.append(f"- '{b.get('text')}'{suffix}")

        videos = interactive.get("videos") or []
        if videos:
            lines.append(f"### Videos / embeds ({len(videos)})")
            for v in videos[:10]:
                lines.append(f"- {v.get('tag')} src={v.get('src')}")
        lines.append("")
    return "\n".join(lines)


def _calc_spend_cents(usage: Any) -> int:
    """Estimate the cost of one API response in whole US cents.

    Each of the four token counters is read from *usage* (a missing or falsy
    counter counts as zero), priced with the matching module-level
    ``PRICE_*_USD_PER_MTOK`` rate, and the total is rounded to the nearest cent.
    """
    rate_table = (
        ("input_tokens", PRICE_INPUT_USD_PER_MTOK),
        ("output_tokens", PRICE_OUTPUT_USD_PER_MTOK),
        ("cache_creation_input_tokens", PRICE_CACHE_WRITE_USD_PER_MTOK),
        ("cache_read_input_tokens", PRICE_CACHE_READ_USD_PER_MTOK),
    )
    usd = 0.0
    # Plain left-to-right accumulation keeps the float result identical to
    # summing the four terms in one expression.
    for counter, usd_per_mtok in rate_table:
        tokens = getattr(usage, counter, 0) or 0
        usd += tokens / 1_000_000 * usd_per_mtok
    return int(round(usd * 100))


def _strip_json(text: str) -> str:
    """Remove a surrounding Markdown code fence from a model reply, if present.

    Text that does not start with a fence is returned unchanged apart from
    whitespace stripping. For fenced text:

    * when the opening fence is followed by a newline, that whole first line
      (fence plus any info string such as ``json``) is dropped;
    * when everything is on one line (e.g. "```{...}```"), the opening
      backticks and an optional ``json`` tag are dropped, so the opening fence
      is no longer left behind while the closing one is removed;
    * a trailing closing fence is removed in both cases.

    Args:
        text: Raw text returned by the model.

    Returns:
        The payload with the fence removed and surrounding whitespace stripped.
    """
    fence = "```"
    t = text.strip()
    if not t.startswith(fence):
        return t

    first_line, newline, rest = t.partition("\n")
    if newline:
        body = rest
    else:
        # Single-line fenced reply: there is no separate info-string line.
        body = first_line[len(fence):]
        if body[:4].lower() == "json":
            body = body[4:]
    if body.endswith(fence):
        body = body[:-len(fence)]
    return body.strip()


def _parse_analyzer_output(raw_text: str, stop_reason: Any = None) -> dict:
    """Decode and validate the analyzer's reply before anything is persisted.

    Args:
        raw_text: Concatenated text blocks of the model response.
        stop_reason: The response's stop reason, used only to enrich the error
            message when the output looks truncated.

    Returns:
        The decoded JSON object; ``actions`` (when truthy) is guaranteed to be
        a list of objects.

    Raises:
        RuntimeError: If the reply is not valid JSON, is not a JSON object, or
            carries an ``actions`` value that is not a list of objects.
    """
    try:
        spec = json.loads(_strip_json(raw_text))
    except json.JSONDecodeError as e:
        hint = (
            " (response hit max_tokens; output was likely truncated)"
            if stop_reason == "max_tokens"
            else ""
        )
        raise RuntimeError(f"analyzer returned non-JSON: {e}{hint}") from e

    if not isinstance(spec, dict):
        raise RuntimeError(
            f"analyzer returned JSON {type(spec).__name__}, expected an object"
        )
    actions = spec.get("actions") or []
    if not isinstance(actions, list) or not all(isinstance(a, dict) for a in actions):
        raise RuntimeError(
            "analyzer returned malformed 'actions': expected a list of objects"
        )
    return spec


def analyze_and_build_report(
    crawl_id: int, client_id: int, crawl_dir: Path,
    pages: list, target_url: str,
) -> tuple[int, int, str]:
    """Run the analyzer, persist a Report plus its Actions, and report the spend.

    Builds one request from the textual crawl summary and, when
    ``pages[0]``'s ``screenshot.png`` exists, that screenshot; sends it to the
    configured model; writes the raw reply to ``<crawl_dir>/analyzer_raw.json``;
    then inserts one ``report`` row and one ``action`` row per proposed action.

    Args:
        crawl_id: Id stored in the report's ``crawl_run_id`` column.
        client_id: Id stored in the report's ``client_id`` column.
        crawl_dir: Root directory of the crawl artifacts.
        pages: Crawled page records; the first one is treated as the homepage.
        target_url: Crawled URL, used in the summary and the fallback title.

    Returns:
        ``(report_id, spend_cents, model)``.

    Raises:
        RuntimeError: If the API key is missing, the computed spend exceeds
            the cap, or the model output is not a usable JSON object. Output
            problems are detected before any database write.
    """
    if not ANTHROPIC_API_KEY:
        raise RuntimeError("ANTHROPIC_API_KEY is not set")

    summary = _build_summary(pages, target_url, crawl_dir)
    homepage_shot = None
    if pages:
        candidate = crawl_dir / "pages" / pages[0].dir_name / "screenshot.png"
        if candidate.exists():
            homepage_shot = candidate

    user_content: list[dict[str, Any]] = []
    if homepage_shot is not None:
        user_content.append({
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": "image/png",
                "data": _b64_image(homepage_shot),
            },
        })
        user_content.append({"type": "text", "text": "↑ Homepage screenshot for visual reference.\n\n"})
    user_content.append({"type": "text", "text": summary})
    user_content.append({
        "type": "text",
        "text": (
            "\n\nNow produce the tracking plan. Output ONLY the JSON object — no prose, "
            "no markdown fence."
        ),
    })

    client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)
    resp = client.messages.create(
        model=ANALYZER_MODEL,
        max_tokens=8000,
        system=[
            {
                "type": "text",
                "text": SYSTEM_PROMPT,
                # Static prompt: marked cacheable so repeat runs can reuse it.
                "cache_control": {"type": "ephemeral"},
            }
        ],
        messages=[{"role": "user", "content": user_content}],
    )
    spend_cents = _calc_spend_cents(resp.usage)
    # round() rather than int(): float products such as 0.29 * 100 come out
    # just below the intended value and would truncate to a lower cap.
    cap_cents = round(CRAWL_MAX_SPEND_USD * 100)
    # NOTE: this runs after the request has completed, so it cannot prevent
    # the spend; it only stops an over-budget result from being persisted.
    if spend_cents > cap_cents:
        raise RuntimeError(
            f"spend cap exceeded: ${spend_cents / 100:.2f} > ${CRAWL_MAX_SPEND_USD:.2f}"
        )

    raw_text = "".join(
        block.text for block in resp.content if getattr(block, "type", None) == "text"
    )
    # Keep the raw reply on disk before parsing so failures can be inspected.
    (crawl_dir / "analyzer_raw.json").write_text(raw_text, encoding="utf-8")

    spec = _parse_analyzer_output(raw_text, getattr(resp, "stop_reason", None))

    title = spec.get("title") or f"Tracking plan for {target_url}"
    summary_md = spec.get("summary_md") or ""
    actions = spec.get("actions") or []

    report_id = execute(
        """INSERT INTO report (client_id, crawl_run_id, title, summary_md, status)
           VALUES (?, ?, ?, ?, 'draft')""",
        (client_id, crawl_id, title, summary_md),
    )

    # sort_order preserves the order in which the model listed the actions.
    for i, a in enumerate(actions):
        execute(
            """INSERT INTO action (report_id, kind, title, proposed_spec_json,
                                   notes_md, sort_order, status)
               VALUES (?, ?, ?, ?, ?, ?, 'proposed')""",
            (
                report_id,
                a.get("kind") or "event",
                a.get("title") or "(untitled action)",
                json.dumps(a.get("spec") or {}),
                a.get("notes_md") or "",
                i,
            ),
        )

    return report_id, spend_cents, ANALYZER_MODEL