#!/usr/bin/env python3 """Enrich Leggett listings with building_size_m2, land_size_m2, description, full bed-count. The first harvester pulled price + photos but skipped m² and the long description. Leggett's JSON-LD carries floorSize (building) as structured data and the page DOM carries the building/land pair as adjacent m² values in the header. This re-fetches only what we ask for (no full photo re-harvest) and merges into the store. Usage: python3 leggett_size_enrich.py --refs refA refB refC python3 leggett_size_enrich.py --refs-file /tmp/refs.txt """ from __future__ import annotations import argparse import json import random import re import sys import time from pathlib import Path from playwright.sync_api import sync_playwright SCRIPT_DIR = Path(__file__).resolve().parent if str(SCRIPT_DIR) not in sys.path: sys.path.insert(0, str(SCRIPT_DIR)) from store import CHROME_UA, load, persist, upsert # noqa: E402 VIEW_URL = 'https://www.frenchestateagents.com/french-property-for-sale/view/{ref}/' # Land in the description text — most reliable signal. # Examples seen: "land of 3,830m²", "plot of 8,000m2", "grounds of 1.2 hectares", # French: "terrain de 5 000 m²", "sur un terrain de 1,2 hectare". LAND_PHRASE_RE = re.compile( r'(?:land|plot|grounds|terrain(?:\s+de)?)\s*(?:of|d\'?un\s+|de\s+)?\s*' r'([\d][\d ,. ]{0,9})\s*(m²|m2|hectares?|ha)\b', re.I, ) def _to_int(s: str) -> int | None: digits = re.sub(r'[^\d]', '', s) return int(digits) if digits else None def _extract(html: str) -> dict: out: dict = {} # JSON-LD: building size + room count + full description for blob in re.findall(r']+ld\+json[^>]*>(.*?)', html, re.S): try: data = json.loads(blob) except json.JSONDecodeError: continue nodes = data if isinstance(data, list) else [data] for n in nodes: if not isinstance(n, dict) or n.get('@type') != 'RealEstateListing': continue fs = n.get('floorSize') or {} if isinstance(fs, dict) and fs.get('value'): out['building_size_m2'] = int(float(fs['value'])) nr = n.get('numberOfRooms') if isinstance(nr, dict): nr = nr.get('value') if nr: try: out['number_of_rooms'] = int(nr) except (TypeError, ValueError): pass if n.get('description'): out['description'] = n['description'] # Land: every Leggett page header carries "Floor Xm2 | Ext Ym2" — Ext is the # canonical parcel size. Validated 2026-05-30 on 3 listings. Far more reliable # than chasing prose ("land of", "plot of", etc) which only some listings use. page_text = re.sub(r'<[^>]+>', ' ', html) m_ext = re.search(r'Ext\s*([\d][\d ,. ]{0,9})\s*m', page_text, re.I) if m_ext: v = re.sub(r'[^\d]', '', m_ext.group(1)) if v and int(v) >= 200: # reject obvious garbage (village houses w/o Ext) out['land_size_m2'] = int(v) # Backup: hectare phrase in description ("1,1 ha", "2.5 hectares") for the # rare listing missing the Ext header. if 'land_size_m2' not in out: for m in re.finditer(r'([\d]+(?:[,.][\d]+)?)\s*ha\b', page_text, re.I): try: ha = float(m.group(1).replace(',', '.')) if 0.05 <= ha <= 100: out['land_size_m2'] = int(ha * 10000) break except ValueError: pass # Bedroom count from description (more reliable than the title's "2/4 bed") if out.get('description'): mb = re.search(r'(\d+)\s*bedroom', out['description'], re.I) if mb: out['bedrooms'] = int(mb.group(1)) return out def main(): ap = argparse.ArgumentParser(description=__doc__) ap.add_argument('--refs', nargs='*') ap.add_argument('--refs-file') args = ap.parse_args() refs: list[str] = list(args.refs or []) if args.refs_file: refs += Path(args.refs_file).read_text().split() if not refs: ap.error('need --refs or --refs-file') store = load() byref = {p.get('leggett_ref'): u for u, p in store.items() if p.get('leggett_ref')} ok = 0 with sync_playwright() as p: browser = p.chromium.launch(headless=True) for i, ref in enumerate(refs, 1): ctx = browser.new_context(user_agent=CHROME_UA, locale='en-GB') try: page = ctx.new_page() r = page.goto(VIEW_URL.format(ref=ref), wait_until='domcontentloaded', timeout=60000) page.wait_for_timeout(4500) if not r or r.status != 200: page.wait_for_timeout(8000) html = page.content() finally: ctx.close() data = _extract(html) url = byref.get(ref) if not url: print(f' {ref:13} NOT IN STORE — skipping') continue upsert(store, url, data) bs = data.get('building_size_m2') ls = data.get('land_size_m2') bd = data.get('bedrooms') print(f' {ref:13} bldg={bs or "?"}m2 land={ls or "?"}m2 beds={bd or "?"} desc={len(data.get("description",""))}c') ok += 1 if (bs or ls) else 0 if i % 5 == 0: persist(store) time.sleep(random.uniform(6.0, 11.0)) browser.close() persist(store) print(f'\nEnriched {ok}/{len(refs)} with size data') return 0 if __name__ == '__main__': sys.exit(main())