#!/usr/bin/env python3
"""Harvest Leggett (frenchestateagents.com) photos + basics past Cloudflare via local Playwright.

The www site is Cloudflare-protected (HTTP 403 to plain fetch), but photos live on
the public image.hestia.immo CDN and the URLs are embedded in the page HTML. A local
headless Chromium solves the JS challenge — no third-party proxy, no API key.

Harvests for each favourite REF: hestia photo URLs (deduped to the largest variant
per image), and best-effort price / title / bedrooms / location / cover-photo order
from the listing's JSON-LD or the page <h1>.
Writes photo_urls + photo_count (+ basics for missing records) into properties.json.

Usage:
    python3 leggett_photo_harvest.py --refs /tmp/leggett_favs.txt
    python3 leggett_photo_harvest.py --ref A26151JRD22
"""
from __future__ import annotations

import argparse
import json
import random
import re
import sys
import time
from pathlib import Path

from playwright.sync_api import sync_playwright

SCRIPT_DIR = Path(__file__).resolve().parent
if str(SCRIPT_DIR) not in sys.path:
    sys.path.insert(0, str(SCRIPT_DIR))

from store import CHROME_UA, load, persist, upsert  # noqa: E402

# Listing page URL template; `{ref}` is filled with the Leggett property reference.
VIEW_URL = 'https://www.frenchestateagents.com/french-property-for-sale/view/{ref}/'
# Matches image.hestia.immo URLs in raw HTML; a URL ends at the first quote, space or ')'.
HESTIA_RE = re.compile(r'https://image\.hestia\.immo/[^"\' )]+')


def _dedupe_photos(urls: set[str]) -> list[str]:
    """One URL per underlying image, preferring the largest width variant.

    Hestia URLs encode the source path (base64) in the final path segment; the
    transform params (w:400 vs w:800) differ. Group by that trailing segment.
    """
    best: dict[str, tuple[int, str]] = {}
    for u in urls:
        key = u.rsplit('/', 1)[-1]  # base64 source id + .jpg — stable per image
        m = re.search(r'/w:(\d+)/', u)
        w = int(m.group(1)) if m else 0
        if key not in best or w > best[key][0]:
            best[key] = (w, u)
    return [v[1] for v in best.values()]


def _extract_basics(html: str) -> dict:
    """Price / bedrooms / location from the page's RealEstateListing JSON-LD node.

    The page carries several JSON-LD blocks (agency, the listing, sometimes related
    items). Only the RealEstateListing node describes THIS property — take price from
    it alone, never a loose regex over the whole page (that grabs related listings).
    Location comes from the <h1>/<title>, which always name this property's commune.
    """
    out: dict = {}
    for blob in re.findall(r'<script[^>]+ld\+json[^>]*>(.*?)</script>', html, re.S):
        try:
            data = json.loads(blob)
        except json.JSONDecodeError:
            continue
        nodes = data if isinstance(data, list) else [data]
        for n in nodes:
            if not isinstance(n, dict) or n.get('@type') != 'RealEstateListing':
                continue
            offers = n.get('offers') or {}
            if isinstance(offers, dict) and offers.get('price'):
                try:
                    out['price'] = int(float(offers['price']))
                except (TypeError, ValueError):
                    pass
            if n.get('name'):
                out['title'] = n['name'].strip('"')
            nb = n.get('numberOfBedrooms') or n.get('numberOfRooms')
            if nb:
                try:
                    out['bedrooms'] = int(nb)
                except (TypeError, ValueError):
                    pass
            # Capture the agent's canonical photo ORDER. Leggett's image.hestia.immo
            # URLs (in photo_urls) encode source paths as base64 with no positional
            # info, so we can't sort photo_urls themselves. The JSON-LD `image` field
            # is a list in the agent's chosen order — [0] is the cover. Stored as
            # cover_photos so the shortlist can prefer it as the lead.
            img = n.get('image')
            if isinstance(img, list):
                out['cover_photos'] = [u for u in img if isinstance(u, str)]
            elif isinstance(img, str):
                out['cover_photos'] = [img]

    mh = re.search(r'<h1[^>]*>(.*?)</h1>', html, re.S)
    if mh:
        h1 = re.sub(r'<[^>]+>', '', mh.group(1)).strip()
        m = re.search(r'\bin\s+([^,]+),\s*([^,]+)', h1)
        if m:
            out['city'] = m.group(1).strip()
            out['department'] = m.group(2).strip()
    if 'bedrooms' not in out:
        mb = re.search(r'(\d+)\s*-?\s*bed', html, re.I)
        if mb:
            out['bedrooms'] = int(mb.group(1))
    return out


def harvest_one(context, ref: str) -> dict:
    """Load one listing in its own page and pull out photos plus basics.

    The page is opened from ``context`` and always closed before returning.
    After navigation there is a fixed 4 s settle pause; when the response was a
    403 or the title still looks like a Cloudflare interstitial, a further 9 s
    pause gives the JS challenge time to finish before the HTML is read.

    Args:
        context: Browser context used to open the page.
        ref: Listing reference substituted into VIEW_URL.

    Returns:
        ``{'_status': 403}`` when no photos were found and the title is still an
        interstitial; otherwise a dict with ``photo_urls``, ``photo_count``, the
        extracted basics, the final ``url`` and ``'_status': 200``.
    """
    challenge_markers = ('just a moment', 'attention required')
    tab = context.new_page()

    def _lowered_title() -> str:
        return (tab.title() or '').lower()

    def _looks_challenged(text: str) -> bool:
        return any(marker in text for marker in challenge_markers)

    try:
        response = tab.goto(
            VIEW_URL.format(ref=ref),
            wait_until='domcontentloaded',
            timeout=60000,
        )
        http_status = None if not response else response.status

        # First pause: Cloudflare JS and lazily loaded images need time to settle.
        tab.wait_for_timeout(4000)
        page_title = _lowered_title()
        if http_status == 403 or _looks_challenged(page_title):
            # Second, longer pause: let the interstitial finish solving.
            tab.wait_for_timeout(9000)
            page_title = _lowered_title()

        markup = tab.content()
        photos = _dedupe_photos(set(HESTIA_RE.findall(markup)))
        if not photos and _looks_challenged(page_title):
            return {'_status': 403}

        record = {'photo_urls': photos, 'photo_count': len(photos)}
        record.update(_extract_basics(markup))
        record['url'] = tab.url
        record['_status'] = 200
        return record
    finally:
        tab.close()


def find_existing_url(store: dict, ref: str) -> str | None:
    """Return the first store key that refers to ``ref``, or None.

    A key matches when it contains ``/view/<ref>/`` or when its last path
    segment (ignoring trailing slashes) equals ``ref``. Keys are checked in the
    store's iteration order.
    """
    view_fragment = f'/view/{ref}/'
    candidates = (
        key for key in store
        if view_fragment in key or key.rstrip('/').rpartition('/')[2] == ref
    )
    return next(candidates, None)


def main() -> int:
    """Harvest every requested REF and write the results into the store.

    REFs come from ``--ref`` (single) or ``--refs`` (whitespace-separated file).
    Each listing is harvested in its own browser context; results with status
    200 are upserted, the store is checkpointed every 10 refs and persisted once
    more at the end. Refs that raised or came back with a non-200 status are
    written to /tmp/leggett_failed.txt.

    The browser runs headless unless ``--headed`` is given.

    Returns:
        0, used as the process exit code.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument('--refs', help='file with whitespace-separated REFs')
    ap.add_argument('--ref', help='single REF')
    # --headless is kept for compatibility (it is already the default); --headed
    # is the switch that actually turns it off.
    ap.add_argument('--headless', action='store_true', default=True,
                    help='run Chromium headless (default)')
    ap.add_argument('--headed', dest='headless', action='store_false',
                    help='show the browser window')
    args = ap.parse_args()

    if args.ref:
        refs = [args.ref]
    elif args.refs:
        refs = Path(args.refs).read_text().split()
    else:
        ap.error('need --refs or --ref')

    store = load()
    ok = miss = 0
    failed_refs: list[str] = []
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=args.headless)
        for i, ref in enumerate(refs, 1):
            # Fresh context per listing: own cookie jar, avoids the rate-limit
            # signature that builds up across reused sessions.
            context = browser.new_context(user_agent=CHROME_UA, locale='en-GB')
            res = None
            try:
                res = harvest_one(context, ref)
            except Exception as e:  # noqa: BLE001
                print(f'  {ref:14} ERROR {type(e).__name__}: {e}')
            finally:
                context.close()  # the single close for both success and error

            if res is None:
                miss += 1
                failed_refs.append(ref)
            elif res.get('_status') != 200:
                print(f'  {ref:14} status={res.get("_status")}')
                miss += 1
                failed_refs.append(ref)
            else:
                url = find_existing_url(store, ref) or res['url']
                res.pop('_status', None)
                res.pop('url', None)
                upsert(store, url, {**res, 'source': 'leggett', 'leggett_ref': ref})
                n = res.get('photo_count', 0)
                print(f'  {ref:14} {n:>2} photos  €{res.get("price","?"):>9}  {res.get("city","?")[:24]}')
                ok += 1 if n else 0  # a listing with zero photos is not a harvest

            # No `continue` above, so a failed ref still gets the checkpoint and
            # the pacing delay — failures are when rate limiting is most likely.
            if i % 10 == 0:
                persist(store)  # checkpoint
            if i < len(refs):
                time.sleep(random.uniform(6.0, 11.0))  # human-paced, avoids rate trip
        browser.close()
    persist(store)
    if failed_refs:
        Path('/tmp/leggett_failed.txt').write_text(' '.join(failed_refs))
        print(f'failed refs written to /tmp/leggett_failed.txt ({len(failed_refs)})')
    print(f'\nHarvested photos for {ok}/{len(refs)} refs ({miss} failed)')
    return 0


if __name__ == '__main__':
    sys.exit(main())
