"""Detect black-outlined rectangles in flyer pages and crop each one to a JPG."""
import cv2
import numpy as np
from pathlib import Path

OUT_DIR = Path("extracted_screenshots")
OUT_DIR.mkdir(exist_ok=True)

# Detection strategy: threshold each page for near-black pixels, then keep the
# contours whose bounding boxes look like rectangles enclosing a screenshot.
MIN_AREA_PX = 200 * 200  # ignore tiny artefacts at 400 DPI
DARK_THRESHOLD = 80      # pixel intensity below which we consider "black-ish"


def _page_sort_key(path):
    """Order ``page-N`` files by their numeric index rather than as text.

    Plain lexicographic sorting puts ``page-10`` before ``page-2`` when the
    numbers are not zero-padded, which would mislabel the page number used in
    the output file names. Files whose suffix is not a plain number sort after
    the numbered ones, in ordinary path order.
    """
    suffix = path.stem[len("page-"):]
    if suffix.isdecimal():
        return (0, int(suffix), path)
    return (1, 0, path)


count = 0   # number of crops successfully written so far (across all pages)
crops = []  # (output path, box width, box height) for every written crop

# Loop-invariant settings, created once rather than per page / per crop.
kernel = np.ones((3, 3), np.uint8)  # 3x3 structuring element for the dilate
pad = 4  # inset (px) applied on every side to drop the black border from the crop

page_paths = sorted(Path("pages_hires").glob("page-*.jpg"), key=_page_sort_key)
for page_idx, page_path in enumerate(page_paths):
    img = cv2.imread(str(page_path))
    if img is None:
        # imread signals an unreadable/corrupt file by returning None instead
        # of raising; skip the page rather than crash inside cvtColor.
        print(f"Page {page_idx + 1}: could not read {page_path.name}, skipping")
        continue
    page_h, page_w = img.shape[:2]
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Threshold: black outlines become white on black background
    _, mask = cv2.threshold(gray, DARK_THRESHOLD, 255, cv2.THRESH_BINARY_INV)

    # Slight dilate so broken outlines connect
    mask = cv2.dilate(mask, kernel, iterations=1)

    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    page_boxes = []
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        if w * h < MIN_AREA_PX:
            continue
        # Skip boxes spanning more than 85% of the page in BOTH dimensions
        # (i.e. a contour that is effectively the whole page).
        if w > page_w * 0.85 and h > page_h * 0.85:
            continue
        # Aspect filter: screenshots range tall/wide but exclude weird sliver shapes
        ar = w / h
        if ar > 6 or ar < 0.15:
            continue
        page_boxes.append((x, y, w, h))

    # Drop boxes contained (within a 5 px tolerance) in an already-kept box,
    # e.g. inner UI elements. Largest boxes are considered first so that the
    # outermost rectangle always wins.
    page_boxes.sort(key=lambda b: b[2] * b[3], reverse=True)
    kept = []
    for x, y, w, h in page_boxes:
        contained = any(
            x >= kx - 5
            and y >= ky - 5
            and x + w <= kx + kw + 5
            and y + h <= ky + kh + 5
            for kx, ky, kw, kh in kept
        )
        if not contained:
            kept.append((x, y, w, h))

    # Sort visually: top-to-bottom in 100 px bands (so boxes on roughly the
    # same row group together), then left-to-right within a band.
    kept.sort(key=lambda b: (b[1] // 100, b[0]))

    print(f"Page {page_idx + 1}: detected {len(kept)} candidate rectangles")
    for (x, y, w, h) in kept:
        # Boxes that pass the area and aspect filters are at least ~75 px on
        # each side, so insetting by `pad` can never produce an empty crop,
        # and boundingRect already lies inside the image (no clamping needed).
        x0, y0 = x + pad, y + pad
        x1, y1 = x + w - pad, y + h - pad
        crop = img[y0:y1, x0:x1]
        out = OUT_DIR / f"screenshot_p{page_idx + 1}_{count + 1:02d}.jpg"
        # imwrite reports failure through its return value; do not count or
        # record a crop that was never written to disk.
        if not cv2.imwrite(str(out), crop, [cv2.IMWRITE_JPEG_QUALITY, 92]):
            print(f"  !! failed to write {out.name}")
            continue
        count += 1
        crops.append((out, w, h))
        print(f"  -> {out.name}  ({w}x{h})")

print(f"\nTotal extracted: {count}")
