#!/usr/bin/env python3
"""Harvest Immonot (notary network) detail pages for photos + sizes via Playwright.

Immonot pages carry photos via cdn-immonot.notariat.services in a numbered scheme:
{hash}/{slug}-{NNN}.jpg, where {hash} is a 40-char SHA + timestamp and {NNN} runs
001..N. The page HTML lists all photo filenames in the gallery markup (even when
lazy-loaded), so we can extract the full set without scrolling.

Sizes come from the page's "Caractéristiques" panel:
  - Surface habitable X m²  → building_size_m2
  - Surface terrain Y m²    → land_size_m2
  - Chambres Z              → bedrooms
  - Pièces Z                → number_of_rooms

Usage:
    python3 immonot_photo_harvest.py --top 30
    python3 immonot_photo_harvest.py --refs URL1 URL2
    python3 immonot_photo_harvest.py --all-without-photos
"""
from __future__ import annotations

import argparse
import json
import random
import re
import sys
import time
from pathlib import Path

from playwright.sync_api import sync_playwright

SCRIPT_DIR = Path(__file__).resolve().parent
if str(SCRIPT_DIR) not in sys.path:
    sys.path.insert(0, str(SCRIPT_DIR))

from store import CHROME_UA, get_score, is_active, load, persist, upsert  # noqa: E402

# CDN URL shape: cdn-immonot.notariat.services/photo/jpg/{HASH}_{TS}/{SLUG}-NNN.jpg
# JSON-LD canonical image is the vignette-375 variant; full size is photo/jpg.
CDN_HOST = 'cdn-immonot.notariat.services'
# Matches the path portion of a numbered photo URL (case-insensitive):
#   /(vignette-<digits> | photo)/(jpg | webp | png)/<hash>/<slug>-<num>.<ext>
# Named groups:
#   hash - 40 hex characters, an underscore, then 14 digits
#   slug - letters/digits/hyphens, matched non-greedily so the final "-NNN"
#          is left for `num`
#   num  - exactly three digits
PHOTO_PATH_RE = re.compile(
    r'/(?:vignette-\d+|photo)/(?:jpg|webp|png)/'
    r'(?P<hash>[0-9a-f]{40}_\d{14})/'
    r'(?P<slug>[a-z0-9-]+?)-(?P<num>\d{3})\.(?:jpe?g|webp|png)',
    re.I,
)
# Captures (group 1) the JSON object inside an ld+json <script> element whose
# body contains `"@type": "House"`. The body is matched with [^<], so a JSON
# payload containing a literal '<' will not match.
JSONLD_HOUSE_RE = re.compile(
    r'<script[^>]+ld\+json[^>]*>(\{[^<]*?"@type":\s*"House"[^<]*?\})</script>',
    re.S,
)
# Patterns for the size rows of the "Caractéristiques" panel. Group 1 is the raw
# number text (digits plus separators); feed it to `_to_int`.
#   * Separators are matched with \s instead of invisible literal space /
#     non-breaking-space characters, so regular, non-breaking and narrow
#     non-breaking spaces are all accepted and the pattern survives editors
#     and formatters that normalise whitespace.
#   * The number must start with a digit, so a label followed by a word that
#     begins with "m" cannot produce an empty match.
#   * An optional colon after the label is accepted, like ROOMS_RE / BEDS_RE.
SIZE_PATTERNS = {
    # 'Surface habitable 145 m²' or 'Surface habitable : 145 m²'
    'building_size_m2': re.compile(
        r'Surface\s+habitable[\s:]+(\d[\d\s.,]*)\s*m', re.I),
    # 'Surface terrain 5 734 m²' or 'Surface terrain : 5 734 m²'
    'land_size_m2': re.compile(
        r'Surface\s+terrain[\s:]+(\d[\d\s.,]*)\s*m', re.I),
}
# 'Pièces 5' / 'Pièce : 1' -> group 1 is the room count (accent optional).
ROOMS_RE = re.compile(r'Pi[èe]ces?\s*[:\s]\s*(\d+)', re.I)
# 'Chambres 3' / 'Chambre : 1' -> group 1 is the bedroom count.
BEDS_RE = re.compile(r'Chambres?\s*[:\s]\s*(\d+)', re.I)


def _to_int(s: str) -> int | None:
    """Parse a French-formatted number string into a whole number.

    Whitespace of any kind (regular, non-breaking, narrow non-breaking) and
    '.'/',' followed by three digits are treated as thousands separators.
    A trailing ',' or '.' followed by only one or two digits is a decimal
    fraction; it is dropped (truncation) instead of being glued onto the
    integer part, so '145,5' yields 145 rather than 1455.

    Args:
        s: Raw number text, e.g. '5 734', '1.250' or '98,50'.

    Returns:
        The integer value, or None when `s` contains no digits.
    """
    compact = re.sub(r'\s+', '', s)
    # A 1-2 digit tail after the last separator can only be a decimal part;
    # thousands groups always have exactly three digits.
    compact = re.sub(r'[.,]\d{1,2}$', '', compact)
    digits = re.sub(r'\D', '', compact)
    return int(digits) if digits else None


def _extract_property_hash(html: str, listing_ref: str | None) -> str | None:
    """Find the CDN photo hash that belongs to THIS property.

    Approach 1 (preferred): the JSON-LD `"@type": "House"` block carries the
    canonical lead-photo URL; its hash is returned when the URL matches the
    CDN photo path shape.

    Approach 2 (fallback): pick the hash with the most DISTINCT photo numbers
    in the HTML. Distinct numbers are counted rather than raw URL occurrences
    because a single related-listing thumbnail may be repeated several times
    (srcset / <picture> variants) and would otherwise look like a gallery.

    Args:
        html: Full page HTML.
        listing_ref: Listing reference taken from the URL. Currently unused;
            kept so callers do not need to change.

    Returns:
        The hash string, or None when no hash has at least three distinct
        numbered photos and the JSON-LD lookup did not succeed.
    """
    # Approach 1: JSON-LD House block (when matchable)
    m = JSONLD_HOUSE_RE.search(html)
    if m:
        img_m = re.search(r'"image":\s*"([^"]+)"', m.group(1))
        if img_m:
            # JSON allows '/' to be written as '\/'; undo that escaping so the
            # path regex (which expects plain slashes) can match.
            image_url = img_m.group(1).replace('\\/', '/')
            path_m = PHOTO_PATH_RE.search(image_url)
            if path_m:
                return path_m.group('hash')
    # Approach 2: hash with the most distinct photo numbers = main gallery
    nums_by_hash: dict[str, set[str]] = {}
    for pm in PHOTO_PATH_RE.finditer(html):
        nums_by_hash.setdefault(pm.group('hash'), set()).add(pm.group('num'))
    if not nums_by_hash:
        return None
    best_hash = max(nums_by_hash, key=lambda h: len(nums_by_hash[h]))
    # Require at least 3 different photos to confidently identify "the main listing"
    return best_hash if len(nums_by_hash[best_hash]) >= 3 else None


def _full_size_url(hash_: str, slug: str, num: str) -> str:
    """Build the full-size JPEG URL on the CDN for one numbered photo.

    Args:
        hash_: Property hash directory component.
        slug: Photo slug (file name without the number suffix).
        num: Photo number as it appears in the file name (e.g. '001').

    Returns:
        The absolute https URL under the CDN's photo/jpg tree.
    """
    filename = f'{slug}-{num}.jpg'
    return f'https://{CDN_HOST}/photo/jpg/{hash_}/{filename}'


def _extract(html: str, listing_ref: str | None = None) -> dict:
    """Extract photo URLs, sizes and room counts from a detail page's HTML.

    Args:
        html: Full page HTML (typically browser-serialized).
        listing_ref: Listing reference, forwarded to the hash lookup.

    Returns:
        A dict containing only the fields that were found, among:
        `photo_urls`, `photo_count`, `building_size_m2`, `land_size_m2`,
        `land_size_verified`, `number_of_rooms`, `bedrooms`.
    """
    # Local import: the `html` parameter shadows the stdlib module name here.
    from html import unescape

    out: dict = {}
    # 1) Identify THIS property's hash (filters out related-listing noise)
    own_hash = _extract_property_hash(html, listing_ref)
    if own_hash:
        # Group the photo numbers found for this hash by slug
        slug_to_nums: dict[str, set[str]] = {}
        for m in PHOTO_PATH_RE.finditer(html):
            if m.group('hash') == own_hash:
                slug_to_nums.setdefault(m.group('slug'), set()).add(m.group('num'))
        # The property has one slug — pick the one with most numbers (defensive)
        if slug_to_nums:
            best_slug = max(slug_to_nums, key=lambda k: len(slug_to_nums[k]))
            # Numbers are zero-padded to three digits, so string order is numeric order
            nums = sorted(slug_to_nums[best_slug])
            out['photo_urls'] = [_full_size_url(own_hash, best_slug, n) for n in nums]
            out['photo_count'] = len(out['photo_urls'])

    # 2) Sizes from "Caractéristiques" panel.
    # Strip tags first, THEN decode entities (so an escaped '&lt;' cannot be
    # mistaken for a tag). Decoding matters because serialized HTML writes a
    # non-breaking space as '&nbsp;', which would otherwise sit between the
    # digits of '5 734' and defeat the size patterns.
    page_text = unescape(re.sub(r'<[^>]+>', ' ', html)).replace('\xa0', ' ')
    page_text = re.sub(r'\s+', ' ', page_text)
    for field, pat in SIZE_PATTERNS.items():
        m = pat.search(page_text)
        if not m:
            continue
        v = _to_int(m.group(1))
        if v:  # skip None (no digits) and 0
            out[field] = v
            if field == 'land_size_m2':
                out['land_size_verified'] = True  # gate trusts this now

    # 3) Rooms / bedrooms
    m = ROOMS_RE.search(page_text)
    if m:
        out['number_of_rooms'] = int(m.group(1))
    m = BEDS_RE.search(page_text)
    if m:
        out['bedrooms'] = int(m.group(1))

    return out


def harvest_one(context, url: str) -> dict:
    """Load one listing page in a new tab and extract its data.

    Args:
        context: Browser context used to open the page.
        url: Listing detail URL.

    Returns:
        The extracted fields plus `_status` = 200 on success. When the
        navigation yields no response or a non-200 status, a dict holding
        only `_status` ('no-response' or the HTTP status).
    """
    tab = context.new_page()
    try:
        response = tab.goto(url, wait_until='domcontentloaded', timeout=60000)
        if not response:
            return {'_status': 'no-response'}
        if response.status != 200:
            return {'_status': response.status}

        # Give the page time to settle after DOMContentLoaded
        tab.wait_for_timeout(3500)
        # Best-effort scroll to mid-page to trigger lazy-loading; a failure
        # here is ignored and extraction proceeds on whatever was rendered.
        try:
            tab.evaluate('window.scrollTo(0, document.body.scrollHeight/2);')
            tab.wait_for_timeout(1500)
        except Exception:
            pass

        markup = tab.content()
        # Listing ref comes from the URL: /annonce-immobiliere/{REF}/...
        ref_match = re.search(r'/annonce-immobiliere/([^/]+)/', url)
        listing_ref = ref_match.group(1) if ref_match else None

        result = _extract(markup, listing_ref)
        result['_status'] = 200
        return result
    finally:
        # Always release the tab, including on early returns and errors
        tab.close()


def _harvest_and_store(browser, store, url: str) -> bool:
    """Harvest one URL in a fresh browser context and upsert the result.

    Prints one progress line per URL. Errors raised while harvesting are
    reported and swallowed so a single bad page does not abort the run.

    Returns:
        True when the page was harvested and at least one photo was found.
    """
    ctx = browser.new_context(user_agent=CHROME_UA, locale='fr-FR')
    try:
        res = harvest_one(ctx, url)
    except Exception as e:  # noqa: BLE001
        print(f'  ERROR {type(e).__name__}: {e} — {url[-50:]}')
        return False
    finally:
        # Single close point: runs on success, on error, and on early return
        ctx.close()
    if res.get('_status') != 200:
        print(f'  status={res["_status"]}  {url[-50:]}')
        return False
    res.pop('_status', None)
    upsert(store, url, res)
    n = res.get('photo_count', 0)
    bs = res.get('building_size_m2')
    ls = res.get('land_size_m2')
    city = (store[url].get('city') or '?')[:24]
    print(f'  {city:24} {n:>2} photos  bldg {bs or "?"}m2  land {ls or "?":>5}m2')
    return bool(n)


def main():
    """Command-line entry point: select Immonot URLs, harvest them, persist.

    URL selection, in priority order:
      * ``--refs``: exactly the URLs given on the command line;
      * ``--all-without-photos``: every active Immonot entry lacking
        ``photo_urls``;
      * otherwise the ``--top`` N (default 30) active Immonot entries,
        highest score first.

    A random 4-8 second pause is inserted between consecutive requests,
    including after failed ones. The store is persisted every 10 URLs and
    once more when the run ends, even if it ends with an exception.

    Returns:
        0, used as the process exit status.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument('--top', type=int, help='top N by current cp_score')
    ap.add_argument('--refs', nargs='*', help='specific URLs')
    ap.add_argument('--all-without-photos', action='store_true')
    args = ap.parse_args()

    store = load()
    immonot = [(u, p) for u, p in store.items()
               if p.get('source') == 'immonot' and is_active(p)]
    if args.refs:
        urls = args.refs
    elif args.all_without_photos:
        urls = [u for u, p in immonot if not p.get('photo_urls')]
    else:
        immonot.sort(key=lambda kv: -float(get_score(kv[1]) or 0))
        urls = [u for u, _ in immonot[: args.top or 30]]

    print(f'Harvesting {len(urls)} Immonot pages')
    ok = 0
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                for i, url in enumerate(urls, 1):
                    if i > 1:
                        # Throttle BEFORE each request after the first, so the
                        # pause also applies after errors / non-200 responses
                        # and no time is wasted sleeping after the last URL.
                        time.sleep(random.uniform(4.0, 8.0))
                    if _harvest_and_store(browser, store, url):
                        ok += 1
                    if i % 10 == 0:
                        # Periodic checkpoint, regardless of this URL's outcome
                        persist(store)
            finally:
                browser.close()
    finally:
        # Save whatever was harvested, including on Ctrl-C or a crash
        persist(store)
    print(f'\nHarvested photos for {ok}/{len(urls)} Immonot pages')
    return 0


if __name__ == '__main__':
    # Run the CLI when executed as a script; main()'s return value becomes
    # the process exit status.
    raise SystemExit(main())