#!/usr/bin/env python3
"""Harvest Green-Acres detail pages for photos + sizes via local Playwright.

Green-Acres serves photos on lb1.green-acres.com with predictable URLs and the
detail page has explicit "Surface area : X m²" / "Land : X m²" labels — cleaner
than Leggett. The store has ~461 GA records with metadata but no photos, leaving
the character scorer blind to them. This fills in photo_urls + building_size_m2 +
land_size_m2 so they become first-class candidates.

Usage:
    python3 greenacres_photo_harvest.py --top 30        # top N by current proxy score
    python3 greenacres_photo_harvest.py --refs URL1 URL2
    python3 greenacres_photo_harvest.py --all-without-photos
"""
from __future__ import annotations

import argparse
import random
import re
import sys
import time
from pathlib import Path

from playwright.sync_api import sync_playwright

# Put this file's directory on sys.path (if it is not already there) so that
# sibling modules such as `store`, imported just below, can be found.
SCRIPT_DIR = Path(__file__).resolve().parent
if str(SCRIPT_DIR) not in sys.path:
    sys.path.insert(0, str(SCRIPT_DIR))

from store import CHROME_UA, get_score, is_active, load, persist, upsert  # noqa: E402

PHOTO_RE = re.compile(r'https://lb1\.green-acres\.com/[^"\' ]+\.(?:jpg|jpeg|webp|png)', re.I)
SURFACE_RE = re.compile(r'Surface area\s*:\s*([\d][\d ,. ]{0,6})\s*m', re.I)
LAND_RE = re.compile(r'Land\s*:\s*([\d][\d ,. ]{0,9})\s*m', re.I)
BEDS_RE = re.compile(r'(\d+)\s*bedroom', re.I)


def _to_int(s: str) -> int | None:
    digits = re.sub(r'[^\d]', '', s)
    return int(digits) if digits else None


def _dedupe_photos(urls: set[str]) -> list[str]:
    """One per underlying image. GA URLs end in _N.jpg per gallery position."""
    seen: dict[str, str] = {}
    for u in urls:
        if '/no-photo' in u or '/logo' in u or '/badges' in u:
            continue
        # Group by the trailing filename — same image, possibly different sizes
        key = u.rsplit('/', 1)[-1]
        if key not in seen:
            seen[key] = u
    return list(seen.values())


def _extract(html: str) -> dict:
    """Pull photo URLs, sizes and bedroom count out of a detail page's HTML.

    Only fields that were actually found (and pass their sanity floor) are
    included in the returned dict; possible keys are photo_urls, photo_count,
    building_size_m2, land_size_m2 and bedrooms.
    """
    found: dict = {}

    gallery = _dedupe_photos(set(PHOTO_RE.findall(html)))
    if gallery:
        found['photo_urls'] = gallery
        found['photo_count'] = len(gallery)

    # (pattern, output key, minimum accepted value) for each size label.
    size_fields = (
        (SURFACE_RE, 'building_size_m2', 20),
        (LAND_RE, 'land_size_m2', 50),
    )
    for pattern, key, floor in size_fields:
        hit = pattern.search(html)
        if hit is None:
            continue
        value = _to_int(hit.group(1))
        # Values below the floor are discarded rather than stored.
        if value is not None and value >= floor:
            found[key] = value

    beds = BEDS_RE.search(html)
    if beds is not None:
        found['bedrooms'] = int(beds.group(1))

    return found


def harvest_one(context, url: str) -> dict:
    """Load one detail page in a new tab and return the extracted fields.

    The returned dict always carries a '_status' entry: 200 together with
    whatever _extract() found on success, otherwise just the non-200 HTTP
    status, or 'no-response' when navigation yielded no response object.
    The tab is closed in all cases.
    """
    tab = context.new_page()
    try:
        response = tab.goto(url, wait_until='domcontentloaded', timeout=60000)
        if not response:
            return {'_status': 'no-response'}
        if response.status != 200:
            return {'_status': response.status}
        # Fixed pause after DOMContentLoaded before snapshotting the markup
        # (presumably to let late-loading page content appear — confirm).
        tab.wait_for_timeout(3500)
        fields = _extract(tab.content())
        fields['_status'] = 200
        return fields
    finally:
        tab.close()


def _harvest_and_store(browser, store, url: str) -> bool:
    """Harvest one URL in a fresh browser context and upsert the result.

    Prints a one-line report for the URL. Returns True only when the page
    loaded with status 200 and at least one photo was extracted.
    """
    ctx = browser.new_context(user_agent=CHROME_UA, locale='en-GB')
    try:
        res = harvest_one(ctx, url)
    except Exception as e:  # noqa: BLE001
        print(f'  ERROR {type(e).__name__}: {e} — {url[-50:]}')
        return False
    finally:
        # Sole close site: runs on success and on error alike, so the context
        # is closed exactly once.
        ctx.close()

    if res.get('_status') != 200:
        print(f'  status={res["_status"]}  {url[-50:]}')
        return False

    res.pop('_status', None)
    upsert(store, url, res)
    n = res.get('photo_count', 0)
    bs = res.get('building_size_m2')
    ls = res.get('land_size_m2')
    # `or '?'` also covers a record whose city is present but None/empty,
    # which would otherwise fail on slicing.
    city = str(store[url].get('city') or '?')[:24]
    print(f'  {city:24} {n:>2} photos  bldg {bs or "?"}m2  land {ls or "?":>5}m2')
    return bool(n)


def main():
    """CLI entry point: select Green-Acres URLs, harvest them, persist the store.

    URL selection, in priority order: explicit --refs, then
    --all-without-photos (active greenacres records lacking photo_urls),
    otherwise the top N active greenacres records by get_score() (N from
    --top, default 30).

    The store is persisted after every 10th URL and once more at the end.
    A randomised 4-8 s pause separates consecutive requests, including after
    a failed one.

    Returns:
        0, which the caller passes to sys.exit().
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument('--top', type=int, help='top N by current cp_score')
    ap.add_argument('--refs', nargs='*', help='specific URLs')
    ap.add_argument('--all-without-photos', action='store_true')
    args = ap.parse_args()

    store = load()
    ga = [(u, p) for u, p in store.items()
          if p.get('source') == 'greenacres' and is_active(p)]
    if args.refs:
        urls = args.refs
    elif args.all_without_photos:
        urls = [u for u, p in ga if not p.get('photo_urls')]
    else:
        ga.sort(key=lambda kv: -float(get_score(kv[1]) or 0))
        urls = [u for u, _ in ga[: args.top or 30]]

    total = len(urls)
    print(f'Harvesting {total} Green-Acres pages')
    ok = 0
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        for i, url in enumerate(urls, 1):
            if _harvest_and_store(browser, store, url):
                ok += 1
            # Checkpoint and pacing happen for every URL, including failed
            # ones, so an error or non-200 response never causes the next
            # request to fire immediately or a checkpoint to be skipped.
            if i % 10 == 0:
                persist(store)
            if i < total:  # no point sleeping after the final page
                time.sleep(random.uniform(4.0, 8.0))
        browser.close()
    persist(store)
    print(f'\nHarvested photos for {ok}/{total} GA pages')
    return 0


# Script entry: run the CLI and use main()'s return value as the exit status.
if __name__ == '__main__':
    sys.exit(main())