#!/usr/bin/env python3
"""Harvest Leggett (frenchestateagents.com) photos + basics past Cloudflare via local Playwright.

The www site is Cloudflare-protected (HTTP 403 to plain fetch), but photos live
on the public image.hestia.immo CDN and the URLs are embedded in the page HTML.
A local headless Chromium solves the JS challenge — no third-party proxy, no
API key.

Harvests for each favourite REF: hestia photo URLs (deduped to the largest
variant per image), and best-effort price / title / bedrooms / location from
JSON-LD or DOM. Writes photo_urls + photo_count (+ basics for missing records)
into properties.json.

Usage:
    python3 leggett_photo_harvest.py --refs /tmp/leggett_favs.txt
    python3 leggett_photo_harvest.py --ref A26151JRD22
    python3 leggett_photo_harvest.py --ref A26151JRD22 --headed
"""
from __future__ import annotations

import argparse
import json
import random
import re
import sys
import time
from collections.abc import Iterable
from html import unescape
from pathlib import Path

from playwright.sync_api import sync_playwright

SCRIPT_DIR = Path(__file__).resolve().parent
if str(SCRIPT_DIR) not in sys.path:
    sys.path.insert(0, str(SCRIPT_DIR))
from store import CHROME_UA, load, persist, upsert  # noqa: E402

VIEW_URL = 'https://www.frenchestateagents.com/french-property-for-sale/view/{ref}/'
HESTIA_RE = re.compile(r'https://image\.hestia\.immo/[^"\' )]+')

# Patterns used by the helpers below, compiled once at import time.
_WIDTH_RE = re.compile(r'/w:(\d+)/')
_JSONLD_RE = re.compile(r'<script[^>]+ld\+json[^>]*>(.*?)</script>', re.S)
_H1_RE = re.compile(r'<h1[^>]*>(.*?)</h1>', re.S)
_TAG_RE = re.compile(r'<[^>]+>')
_LOCATION_RE = re.compile(r'\bin\s+([^,]+),\s*([^,]+)')
_BEDS_RE = re.compile(r'(\d+)\s*-?\s*bed', re.I)

# Lower-cased page-title fragments treated as a Cloudflare interstitial.
_CHALLENGE_TITLES = ('just a moment', 'attention required')


def _dedupe_photos(urls: Iterable[str]) -> list[str]:
    """Return one URL per underlying image, preferring the largest width variant.

    Hestia URLs encode the source path (base64) in the final path segment; the
    transform params (w:400 vs w:800) differ. URLs are grouped by that trailing
    segment and the one with the greatest ``/w:<n>/`` value is kept (a URL with
    no width param counts as width 0; on a tie the first one seen wins).

    The result follows the first-seen order of each image in ``urls``, so an
    ordered input (e.g. ``re.findall`` over the page) yields a stable,
    page-ordered list. A ``set`` is still accepted, in which case the order is
    whatever the set iterates in.
    """
    best: dict[str, tuple[int, str]] = {}
    for u in urls:
        key = u.rsplit('/', 1)[-1]  # base64 source id + .jpg — stable per image
        m = _WIDTH_RE.search(u)
        w = int(m.group(1)) if m else 0
        if key not in best or w > best[key][0]:
            # Re-assigning an existing key keeps its original dict position.
            best[key] = (w, u)
    return [u for _, u in best.values()]


def _is_listing_node(node: object) -> bool:
    """True when a JSON-LD node is a dict typed (solely or partly) RealEstateListing."""
    if not isinstance(node, dict):
        return False
    node_type = node.get('@type')
    if isinstance(node_type, list):  # JSON-LD allows a list of types
        return 'RealEstateListing' in node_type
    return node_type == 'RealEstateListing'


def _extract_basics(html: str) -> dict:
    """Price / title / bedrooms / location / cover photos for one listing page.

    The page carries several JSON-LD blocks (agency, the listing, sometimes
    related items). Only the RealEstateListing node describes THIS property —
    take price from it alone, never a loose regex over the whole page (that
    grabs related listings). Location comes from the ``<h1>`` heading, parsed
    as "... in <city>, <department>".

    Returns a dict containing only the keys that could be extracted, from:
    ``price`` (int), ``title`` (str), ``bedrooms`` (int), ``cover_photos``
    (list of str), ``city`` (str), ``department`` (str). Unparseable JSON-LD
    blocks and non-numeric values are skipped silently (best-effort).
    """
    out: dict = {}
    for blob in _JSONLD_RE.findall(html):
        try:
            data = json.loads(blob)
        except json.JSONDecodeError:
            continue
        nodes = data if isinstance(data, list) else [data]
        for n in nodes:
            if not _is_listing_node(n):
                continue
            offers = n.get('offers') or {}
            if isinstance(offers, dict) and offers.get('price'):
                try:
                    out['price'] = int(float(offers['price']))
                except (TypeError, ValueError):
                    pass
            name = n.get('name')
            if name and isinstance(name, str):
                out['title'] = name.strip('"')
            nb = n.get('numberOfBedrooms') or n.get('numberOfRooms')
            if nb:
                try:
                    out['bedrooms'] = int(nb)
                except (TypeError, ValueError):
                    pass
            # Capture the agent's canonical photo ORDER. Leggett's image.hestia.immo
            # URLs (in photo_urls) encode source paths as base64 with no positional
            # info, so we can't sort photo_urls themselves. The JSON-LD `image` field
            # is a list in the agent's chosen order — [0] is the cover. Stored as
            # cover_photos so the shortlist can prefer it as the lead.
            img = n.get('image')
            if isinstance(img, list):
                out['cover_photos'] = [u for u in img if isinstance(u, str)]
            elif isinstance(img, str):
                out['cover_photos'] = [img]

    mh = _H1_RE.search(html)
    if mh:
        # Drop nested markup, then decode entities (&amp;, &nbsp;, ...) so the
        # stored names are plain text.
        h1 = unescape(_TAG_RE.sub('', mh.group(1))).strip()
        m = _LOCATION_RE.search(h1)
        if m:
            out['city'] = m.group(1).strip()
            out['department'] = m.group(2).strip()

    if 'bedrooms' not in out:
        # Last resort: first "<n> bed" anywhere in the page. This is a loose
        # whole-page match, so it is only used when JSON-LD gave nothing.
        mb = _BEDS_RE.search(html)
        if mb:
            out['bedrooms'] = int(mb.group(1))
    return out


def _is_challenge_title(title: str) -> bool:
    """True when a lower-cased page title looks like a Cloudflare interstitial."""
    return any(marker in title for marker in _CHALLENGE_TITLES)


def harvest_one(context, ref: str) -> dict:
    """Fetch one listing in a fresh page. Waits out the Cloudflare JS challenge.

    Cloudflare rate-limits rapid sequential hits even from a real browser, so
    each call uses a fresh page and tolerates an initial 403/challenge by
    waiting and re-reading content once the JS solve completes.

    Args:
        context: a Playwright browser context to open the page in.
        ref: the Leggett listing reference substituted into ``VIEW_URL``.

    Returns:
        ``{'_status': 403}`` when no photos were found and the page title still
        looks like a challenge page; otherwise a dict with ``photo_urls``,
        ``photo_count``, whatever ``_extract_basics`` found, the final page
        ``url`` and ``'_status': 200``.

    The page is always closed, including when navigation raises.
    """
    page = context.new_page()
    try:
        r = page.goto(VIEW_URL.format(ref=ref), wait_until='domcontentloaded', timeout=60000)
        status = r.status if r else None
        page.wait_for_timeout(4000)  # let Cloudflare JS + lazy images settle
        title = (page.title() or '').lower()
        if status == 403 or _is_challenge_title(title):
            page.wait_for_timeout(9000)  # wait out the interstitial JS solve
            title = (page.title() or '').lower()
        html = page.content()
        # findall keeps page order; _dedupe_photos preserves first-seen order.
        photos = _dedupe_photos(HESTIA_RE.findall(html))
        if not photos and _is_challenge_title(title):
            return {'_status': 403}
        basics = _extract_basics(html)
        return {
            'photo_urls': photos,
            'photo_count': len(photos),
            **basics,
            'url': page.url,
            '_status': 200,
        }
    finally:
        page.close()


def find_existing_url(store: dict, ref: str) -> str | None:
    """Return the first key of ``store`` that refers to listing ``ref``, else None.

    A key matches when it contains ``/view/<ref>/`` or when its last path
    segment (ignoring a trailing slash) equals ``ref``.
    """
    for u in store:
        if f'/view/{ref}/' in u or u.rstrip('/').split('/')[-1] == ref:
            return u
    return None


def main() -> int:
    """CLI entry point: harvest every requested REF and persist the results.

    Reads refs from ``--ref`` or ``--refs``, harvests each in its own browser
    context, upserts successful results into the store, checkpoints every ten
    refs, and writes the refs that failed to /tmp/leggett_failed.txt.
    Returns 0 (the process exit code).
    """
    ap = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,  # keep Usage layout
    )
    ap.add_argument('--refs', help='file with whitespace-separated REFs')
    ap.add_argument('--ref', help='single REF')
    ap.add_argument('--headless', action='store_true', default=True,
                    help='run Chromium headless (default)')
    # --headless alone could never be switched off (store_true + default=True);
    # --headed writes False to the same destination.
    ap.add_argument('--headed', dest='headless', action='store_false',
                    help='show the browser window instead of running headless')
    args = ap.parse_args()

    if args.ref:
        refs = [args.ref]
    elif args.refs:
        refs = Path(args.refs).read_text().split()
    else:
        ap.error('need --refs or --ref')

    store = load()
    ok = miss = 0
    failed_refs: list[str] = []
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=args.headless)
            for i, ref in enumerate(refs, 1):
                # Fresh context per listing: own cookie jar, avoids the rate-limit
                # signature that builds up across reused sessions.
                context = browser.new_context(user_agent=CHROME_UA, locale='en-GB')
                res = None
                try:
                    res = harvest_one(context, ref)
                except Exception as e:  # noqa: BLE001
                    print(f' {ref:14} ERROR {type(e).__name__}: {e}')
                finally:
                    context.close()  # closed exactly once, on every path

                if res is None or res.get('_status') != 200:
                    if res is not None:
                        print(f' {ref:14} status={res.get("_status")}')
                    miss += 1
                    failed_refs.append(ref)
                else:
                    url = find_existing_url(store, ref) or res['url']
                    res.pop('_status', None)
                    res.pop('url', None)
                    upsert(store, url, {**res, 'source': 'leggett', 'leggett_ref': ref})
                    n = res.get('photo_count', 0)
                    print(f' {ref:14} {n:>2} photos €{res.get("price","?"):>9} {res.get("city","?")[:24]}')
                    ok += 1 if n else 0

                # Checkpoint and pacing run after failures too, so an error
                # never leads to an immediate back-to-back request.
                if i % 10 == 0:
                    persist(store)  # checkpoint
                if i < len(refs):
                    time.sleep(random.uniform(6.0, 11.0))  # human-paced, avoids rate trip
            browser.close()
    finally:
        # Save whatever was harvested even if the run is interrupted.
        persist(store)

    if failed_refs:
        Path('/tmp/leggett_failed.txt').write_text(' '.join(failed_refs))
        print(f'failed refs written to /tmp/leggett_failed.txt ({len(failed_refs)})')
    print(f'\nHarvested photos for {ok}/{len(refs)} refs ({miss} failed)')
    return 0


if __name__ == '__main__':
    sys.exit(main())