"""Audit Regent Grand 301 redirects + SEO on live site.

Reads:
  /srv/apps/regent/Regent Grand 301 Redirects.xlsx
  /srv/apps/regent/Regent Grand SEO stats.xlsx

Writes:
  /srv/apps/regent/Regent Grand Audit.xlsx  (two sheets: Redirects, SEO)
  /srv/apps/regent/Regent Grand Audit Redirects.csv
  /srv/apps/regent/Regent Grand Audit SEO.csv
"""
from __future__ import annotations

import csv
import html
import re
import sys
import time
from pathlib import Path
from urllib.parse import urlparse

import openpyxl
import requests
from bs4 import BeautifulSoup
from openpyxl.styles import Font, PatternFill

ROOT = Path("/srv/apps/regent")
REDIRECTS_XLSX = ROOT / "Regent Grand 301 Redirects.xlsx"
SEO_XLSX = ROOT / "Regent Grand SEO stats.xlsx"
OUT_XLSX = ROOT / "Regent Grand Audit.xlsx"
OUT_REDIRECTS_CSV = ROOT / "Regent Grand Audit Redirects.csv"
OUT_SEO_CSV = ROOT / "Regent Grand Audit SEO.csv"

UA = "Mozilla/5.0 (RegentGrandAudit/1.0)"
TIMEOUT = 20


def normalize(u: str | None) -> str:
    if not u:
        return ""
    return u.strip().rstrip("/").lower()


def fetch(url: str) -> tuple[int, str, str]:
    """Return (status_code, final_url, html_text). Empty html on failure."""
    try:
        r = requests.get(
            url,
            headers={"User-Agent": UA},
            timeout=TIMEOUT,
            allow_redirects=True,
        )
        return r.status_code, r.url, r.text
    except requests.RequestException as exc:
        return 0, "", f"ERROR: {exc}"


def extract_seo(html_text: str) -> dict:
    if not html_text or html_text.startswith("ERROR"):
        return {"title": "", "description": "", "h1": ""}
    soup = BeautifulSoup(html_text, "html.parser")
    title = (soup.title.string or "").strip() if soup.title else ""
    desc = ""
    md = soup.find("meta", attrs={"name": re.compile(r"^description$", re.I)})
    if md and md.get("content"):
        desc = md["content"].strip()
    h1 = ""
    h1_tag = soup.find("h1")
    if h1_tag:
        h1 = h1_tag.get_text(" ", strip=True)
    return {
        "title": html.unescape(title),
        "description": html.unescape(desc),
        "h1": html.unescape(h1),
    }


def classify_redirect(old_url: str, status: int, final_url: str) -> str:
    if status == 0:
        return "error"
    if status >= 400:
        return f"broken ({status})"
    if normalize(old_url) == normalize(final_url):
        return "same url"
    return "redirected"


def main() -> int:
    # --- Load redirects sheet ----------------------------------------
    wb_r = openpyxl.load_workbook(REDIRECTS_XLSX, data_only=True)
    ws_r = wb_r.active
    rows_r = list(ws_r.iter_rows(values_only=True))
    header_r = rows_r[0]
    redirect_rows = rows_r[1:]
    print(f"Loaded {len(redirect_rows)} redirect rows")

    # --- Load SEO sheet ----------------------------------------------
    wb_s = openpyxl.load_workbook(SEO_XLSX, data_only=True)
    ws_s = wb_s.active
    rows_s = list(ws_s.iter_rows(values_only=True))
    seo_rows = rows_s[1:]
    print(f"Loaded {len(seo_rows)} SEO rows")

    # --- Audit each URL ---------------------------------------------
    # Cache fetch results so we only hit each old URL once
    cache: dict[str, tuple[int, str, str]] = {}

    def get(url: str):
        if url not in cache:
            print(f"  GET {url}")
            cache[url] = fetch(url)
            time.sleep(0.2)
        return cache[url]

    # Redirect audit results
    redirect_results = []
    for page_name, old_url, _ in redirect_rows:
        if not old_url:
            continue
        status, final, _ = get(old_url)
        cls = classify_redirect(old_url, status, final)
        redirect_results.append(
            {
                "page": page_name,
                "old_url": old_url,
                "final_url": final or "",
                "http_status": status,
                "result": cls,
                "redirected_correctly": "yes" if cls in ("same url", "redirected") else "no",
            }
        )

    # SEO audit results - fetch the *live* URL (= final URL after any redirect)
    # Heuristic ranges based on common SEO guidance:
    TITLE_OK = (50, 60)   # px-budget proxy in chars
    DESC_OK = (140, 160)

    def length_verdict(text: str, lo: int, hi: int) -> str:
        n = len(text or "")
        if n == 0:
            return "missing"
        if (text or "").strip().lower() in ("n/a", "na"):
            return "missing"
        if n < lo:
            return f"short ({n})"
        if n > hi:
            return f"long ({n})"
        return f"good ({n})"

    def pick_better(old_v: str, new_v: str) -> str:
        """Verdict based purely on length-band; ties favour 'new' (current live)."""
        old_in = "good" in old_v
        new_in = "good" in new_v
        if old_in and not new_in:
            return "old"
        if new_in and not old_in:
            return "new"
        if old_v.startswith("missing") and not new_v.startswith("missing"):
            return "new"
        if new_v.startswith("missing") and not old_v.startswith("missing"):
            return "old"
        return "tie"

    seo_results = []
    for page_name, old_title, old_desc, old_h1, page_url in seo_rows:
        if not page_url:
            continue
        status, final, body = get(page_url)
        live = extract_seo(body)
        # Compare (text equality)
        title_changed = (old_title or "").strip() != live["title"].strip()
        desc_changed = (old_desc or "").strip() != live["description"].strip()
        h1_changed = (old_h1 or "").strip() != live["h1"].strip()
        # Length verdicts
        old_t_v = length_verdict(old_title or "", *TITLE_OK)
        new_t_v = length_verdict(live["title"], *TITLE_OK)
        old_d_v = length_verdict(old_desc or "", *DESC_OK)
        new_d_v = length_verdict(live["description"], *DESC_OK)
        seo_results.append(
            {
                "page": page_name,
                "old_url": page_url,
                "final_url": final or "",
                "http_status": status,
                "old_title": old_title or "",
                "live_title": live["title"],
                "title_changed": "yes" if title_changed else "no",
                "old_title_len": len(old_title or ""),
                "live_title_len": len(live["title"]),
                "old_title_check": old_t_v,
                "live_title_check": new_t_v,
                "title_better": pick_better(old_t_v, new_t_v) if title_changed else "n/a",
                "old_description": old_desc or "",
                "live_description": live["description"],
                "description_changed": "yes" if desc_changed else "no",
                "old_desc_len": len(old_desc or ""),
                "live_desc_len": len(live["description"]),
                "old_desc_check": old_d_v,
                "live_desc_check": new_d_v,
                "description_better": pick_better(old_d_v, new_d_v) if desc_changed else "n/a",
                "old_h1": old_h1 or "",
                "live_h1": live["h1"],
                "h1_changed": "yes" if h1_changed else "no",
            }
        )

    # --- Write CSV outputs ------------------------------------------
    with OUT_REDIRECTS_CSV.open("w", newline="") as f:
        w = csv.DictWriter(f, fieldnames=list(redirect_results[0].keys()))
        w.writeheader()
        w.writerows(redirect_results)
    print(f"Wrote {OUT_REDIRECTS_CSV}")

    with OUT_SEO_CSV.open("w", newline="") as f:
        w = csv.DictWriter(f, fieldnames=list(seo_results[0].keys()))
        w.writeheader()
        w.writerows(seo_results)
    print(f"Wrote {OUT_SEO_CSV}")

    # --- Write a single combined xlsx -------------------------------
    out = openpyxl.Workbook()
    # Redirects sheet
    sh1 = out.active
    sh1.title = "Redirects"
    headers = list(redirect_results[0].keys())
    sh1.append(headers)
    for c in sh1[1]:
        c.font = Font(bold=True)
    for row in redirect_results:
        sh1.append([row[h] for h in headers])
    # Color coding for "result"
    res_col = headers.index("result") + 1
    fills = {
        "same url": PatternFill("solid", fgColor="D9EAD3"),
        "redirected": PatternFill("solid", fgColor="CFE2F3"),
    }
    for r in range(2, sh1.max_row + 1):
        v = sh1.cell(r, res_col).value
        if v in fills:
            sh1.cell(r, res_col).fill = fills[v]
        elif v and "broken" in v:
            sh1.cell(r, res_col).fill = PatternFill("solid", fgColor="F4CCCC")
    # Auto column widths (rough)
    widths = {"old_url": 60, "final_url": 60, "page": 25, "result": 18, "redirected_correctly": 22, "http_status": 12}
    for i, h in enumerate(headers, 1):
        sh1.column_dimensions[openpyxl.utils.get_column_letter(i)].width = widths.get(h, 18)

    # SEO sheet
    sh2 = out.create_sheet("SEO")
    headers2 = list(seo_results[0].keys())
    sh2.append(headers2)
    for c in sh2[1]:
        c.font = Font(bold=True)
    for row in seo_results:
        sh2.append([row[h] for h in headers2])
    # Highlight changed columns
    yellow = PatternFill("solid", fgColor="FFF2CC")
    for col_name in ("title_changed", "description_changed", "h1_changed"):
        idx = headers2.index(col_name) + 1
        for r in range(2, sh2.max_row + 1):
            if sh2.cell(r, idx).value == "yes":
                sh2.cell(r, idx).fill = yellow
    width_map = {
        "page": 22,
        "old_url": 50,
        "final_url": 50,
        "http_status": 10,
        "old_title": 50,
        "live_title": 50,
        "old_description": 60,
        "live_description": 60,
        "old_h1": 25,
        "live_h1": 25,
        "title_changed": 14,
        "description_changed": 18,
        "h1_changed": 12,
        "old_title_len": 12,
        "live_title_len": 12,
        "old_title_check": 14,
        "live_title_check": 14,
        "title_better": 14,
        "old_desc_len": 12,
        "live_desc_len": 12,
        "old_desc_check": 14,
        "live_desc_check": 14,
        "description_better": 18,
    }
    # Highlight winner column
    green = PatternFill("solid", fgColor="D9EAD3")
    red = PatternFill("solid", fgColor="F4CCCC")
    for col_name, winner in (("title_better", None), ("description_better", None)):
        idx = headers2.index(col_name) + 1
        for r in range(2, sh2.max_row + 1):
            v = sh2.cell(r, idx).value
            if v == "new":
                sh2.cell(r, idx).fill = green
            elif v == "old":
                sh2.cell(r, idx).fill = red
    for i, h in enumerate(headers2, 1):
        sh2.column_dimensions[openpyxl.utils.get_column_letter(i)].width = width_map.get(h, 18)

    out.save(OUT_XLSX)
    print(f"Wrote {OUT_XLSX}")

    # --- Quick console summary --------------------------------------
    same = sum(1 for r in redirect_results if r["result"] == "same url")
    red = sum(1 for r in redirect_results if r["result"] == "redirected")
    broken = sum(1 for r in redirect_results if r["result"].startswith("broken") or r["result"] == "error")
    print(f"\nRedirect summary: same={same} redirected={red} broken={broken}")
    t_changed = sum(1 for r in seo_results if r["title_changed"] == "yes")
    d_changed = sum(1 for r in seo_results if r["description_changed"] == "yes")
    h_changed = sum(1 for r in seo_results if r["h1_changed"] == "yes")
    print(f"SEO summary: title_changed={t_changed} desc_changed={d_changed} h1_changed={h_changed}")
    return 0


if __name__ == "__main__":
    sys.exit(main())