#!/usr/bin/env python3
"""Enrich Leggett listings with building_size_m2, land_size_m2, description, full bed-count.

The first harvester pulled price + photos but skipped m² and the long description.
Leggett's JSON-LD carries floorSize (building) as structured data and the page DOM
carries the building/land pair as adjacent m² values in the header. This re-fetches
only what we ask for (no full photo re-harvest) and merges into the store.

Usage:
    python3 leggett_size_enrich.py --refs refA refB refC
    python3 leggett_size_enrich.py --refs-file /tmp/refs.txt
"""
from __future__ import annotations

import argparse
import json
import random
import re
import sys
import time
from pathlib import Path

from playwright.sync_api import sync_playwright

SCRIPT_DIR = Path(__file__).resolve().parent
if str(SCRIPT_DIR) not in sys.path:
    sys.path.insert(0, str(SCRIPT_DIR))

from store import CHROME_UA, load, persist, upsert  # noqa: E402

# Listing detail page; `{ref}` is filled in with the Leggett listing reference.
VIEW_URL = 'https://www.frenchestateagents.com/french-property-for-sale/view/{ref}/'
# Land size as quoted in description prose.
# Examples seen: "land of 3,830m²", "plot of 8,000m2", "grounds of 1.2 hectares",
# French: "terrain de 5 000 m²", "sur un terrain de 1,2 hectare".
# Group 1 captures the number (digits with space/comma/dot separators),
# group 2 captures the unit (m², m2, hectare(s) or ha).
# NOTE(review): nothing in the code visible here references LAND_PHRASE_RE;
# `_extract` reads the header "Ext" value and a hectare fallback instead.
# Confirm whether this pattern is still needed.
LAND_PHRASE_RE = re.compile(
    r'(?:land|plot|grounds|terrain(?:\s+de)?)\s*(?:of|d\'?un\s+|de\s+)?\s*'
    r'([\d][\d ,. ]{0,9})\s*(m²|m2|hectares?|ha)\b',
    re.I,
)


def _to_int(s: str) -> int | None:
    digits = re.sub(r'[^\d]', '', s)
    return int(digits) if digits else None


def _extract(html: str) -> dict:
    out: dict = {}
    # JSON-LD: building size + room count + full description
    for blob in re.findall(r'<script[^>]+ld\+json[^>]*>(.*?)</script>', html, re.S):
        try:
            data = json.loads(blob)
        except json.JSONDecodeError:
            continue
        nodes = data if isinstance(data, list) else [data]
        for n in nodes:
            if not isinstance(n, dict) or n.get('@type') != 'RealEstateListing':
                continue
            fs = n.get('floorSize') or {}
            if isinstance(fs, dict) and fs.get('value'):
                out['building_size_m2'] = int(float(fs['value']))
            nr = n.get('numberOfRooms')
            if isinstance(nr, dict):
                nr = nr.get('value')
            if nr:
                try:
                    out['number_of_rooms'] = int(nr)
                except (TypeError, ValueError):
                    pass
            if n.get('description'):
                out['description'] = n['description']
    # Land: every Leggett page header carries "Floor Xm2 | Ext Ym2" — Ext is the
    # canonical parcel size. Validated 2026-05-30 on 3 listings. Far more reliable
    # than chasing prose ("land of", "plot of", etc) which only some listings use.
    page_text = re.sub(r'<[^>]+>', ' ', html)
    m_ext = re.search(r'Ext\s*([\d][\d ,. ]{0,9})\s*m', page_text, re.I)
    if m_ext:
        v = re.sub(r'[^\d]', '', m_ext.group(1))
        if v and int(v) >= 200:  # reject obvious garbage (village houses w/o Ext)
            out['land_size_m2'] = int(v)
    # Backup: hectare phrase in description ("1,1 ha", "2.5 hectares") for the
    # rare listing missing the Ext header.
    if 'land_size_m2' not in out:
        for m in re.finditer(r'([\d]+(?:[,.][\d]+)?)\s*ha\b', page_text, re.I):
            try:
                ha = float(m.group(1).replace(',', '.'))
                if 0.05 <= ha <= 100:
                    out['land_size_m2'] = int(ha * 10000)
                    break
            except ValueError:
                pass
    # Bedroom count from description (more reliable than the title's "2/4 bed")
    if out.get('description'):
        mb = re.search(r'(\d+)\s*bedroom', out['description'], re.I)
        if mb:
            out['bedrooms'] = int(mb.group(1))
    return out


def main():
    """CLI entry point: re-fetch the requested listing pages and merge size data.

    Refs come from ``--refs`` and/or whitespace-separated tokens in
    ``--refs-file``. Each ref is mapped to its store URL through the
    ``leggett_ref`` field; refs not present in the store are reported and
    skipped before any network request is made. For every remaining ref the
    view page is loaded in a fresh browser context, parsed with ``_extract``
    and merged with ``upsert``. The store is persisted after every fifth ref
    and once more when the run ends, including when it ends with an exception.

    Returns:
        0, used as the process exit status.
    """
    # Function-scope import keeps this block self-contained; playwright is
    # already a dependency of this module.
    from playwright.sync_api import Error as PlaywrightError

    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument('--refs', nargs='*')
    ap.add_argument('--refs-file')
    args = ap.parse_args()

    refs: list[str] = list(args.refs or [])
    if args.refs_file:
        refs += Path(args.refs_file).read_text().split()
    if not refs:
        ap.error('need --refs or --refs-file')

    store = load()
    byref = {p.get('leggett_ref'): u for u, p in store.items() if p.get('leggett_ref')}
    ok = 0
    total = len(refs)
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                for i, ref in enumerate(refs, 1):
                    # Check the store first: an unknown ref can never be
                    # merged, so do not spend a page load on it.
                    url = byref.get(ref)
                    if not url:
                        print(f'  {ref:13} NOT IN STORE — skipping')
                        continue
                    html = None
                    # Fresh context per listing, closed even if the fetch fails.
                    ctx = browser.new_context(user_agent=CHROME_UA, locale='en-GB')
                    try:
                        page = ctx.new_page()
                        r = page.goto(VIEW_URL.format(ref=ref), wait_until='domcontentloaded', timeout=60000)
                        page.wait_for_timeout(4500)
                        if not r or r.status != 200:
                            # No response or non-200: wait longer before
                            # reading whatever the page has rendered.
                            page.wait_for_timeout(8000)
                        html = page.content()
                    except PlaywrightError as exc:
                        # One bad page (timeout, navigation error) must not
                        # abort the remaining refs.
                        reason = (str(exc).splitlines() or ['unknown error'])[0]
                        print(f'  {ref:13} FETCH FAILED — {reason}')
                    finally:
                        ctx.close()
                    if html is not None:
                        data = _extract(html)
                        upsert(store, url, data)
                        bs = data.get('building_size_m2')
                        ls = data.get('land_size_m2')
                        bd = data.get('bedrooms')
                        print(f'  {ref:13} bldg={bs or "?"}m2 land={ls or "?"}m2 beds={bd or "?"}  desc={len(data.get("description",""))}c')
                        ok += 1 if (bs or ls) else 0
                        if i % 5 == 0:
                            persist(store)
                    # Randomised pause between requests; pointless after the last ref.
                    if i < total:
                        time.sleep(random.uniform(6.0, 11.0))
            finally:
                browser.close()
    finally:
        # Save whatever was merged, even if the run was interrupted.
        persist(store)
    print(f'\nEnriched {ok}/{total} with size data')
    return 0


# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == '__main__':
    sys.exit(main())
