#!/usr/bin/env python3
"""
Search property platforms for new listings matching Cyber Prairie criteria.

Discovers new properties and adds them to the store for pipeline processing.
Supports: Immobiliare.it, Green-Acres, Leggett, Properstar, Idealista.

Usage:
    python3 search_properties.py                           # Search all platforms
    python3 search_properties.py --platform immobiliare    # Single platform
    python3 search_properties.py --dry-run                 # Show URLs without saving
    python3 search_properties.py --limit 20                # Max new properties per platform
    python3 search_properties.py --pages 3                 # Max pages per region search
"""
import argparse
import asyncio
import base64
import json
import math
import re
import time
from datetime import datetime
from html import unescape
from pathlib import Path

import requests as req

from store import load, persist, upsert, browser_page, wait_for_cloudflare, CHROME_UA

# ─── Search Criteria (from cyber-prairie-property-spec.md) ───

# Filters applied by the platform searches below (prices are printed as EUR).
CRITERIA = dict(
    price_min=100_000,   # raised from 50k — eliminates ruins/barns
    price_max=450_000,
    bedrooms_min=3,
)

# Property types to SKIP (not suitable for farm retreat)
SKIP_TYPES = (
    # units inside a larger building, or not yet built
    {'apartment', 'flat', 'penthouse', 'studio', 'project'}
    # non-residential
    | {'garage', 'parking', 'storage', 'office', 'shop',
       'business-premises', 'commercial'}
    # must be detached
    | {'terraced', 'townhouse', 'semi-detached'}
    # must have building
    | {'land', 'plot', 'terrain'}
)

# ─── Region definitions per platform ───

# Immobiliare.it: region slug → display name
# Written as a mapping for readability, then materialised as the ordered
# list of (slug, display name) pairs that the search loop unpacks.
REGIONS_IMMOBILIARE = list({
    'marche': 'Le Marche',
    'umbria': 'Umbria',
    'toscana': 'Tuscany',
    'liguria': 'Liguria',
    'piemonte': 'Piedmont',
    'abruzzo': 'Abruzzo',
    'puglia': 'Puglia',
    'calabria': 'Calabria',
}.items())

# Green-Acres: (country_tld, region_slug, display_name)
# The country TLD is factored out of the individual rows; the resulting list
# still holds (country_tld, region_slug, display_name) triples in the same order.
REGIONS_GREENACRES = [
    *(('fr', slug, name) for slug, name in (
        # France — high-vibe regions (★★★★★)
        ('drome', 'Drôme'),
        ('ardeche', 'Ardèche'),
        ('herault', 'Hérault'),
        ('cotes-d-armor', "Côtes-d'Armor"),
        ('morbihan', 'Morbihan'),
        ('finistere', 'Finistère'),
        # France — strong vibe (★★★★)
        ('charente', 'Charente'),
        ('charente-maritime', 'Charente-Maritime'),
        # France — good vibe (★★★)
        ('dordogne', 'Dordogne'),
        ('gard', 'Gard'),
        ('lot', 'Lot'),
        ('correze', 'Corrèze'),
        ('creuse', 'Creuse'),
        # France — favourite regions (★★, but user-validated)
        ('gers', 'Gers'),
        ('lot-et-garonne', 'Lot-et-Garonne'),
        ('mayenne', 'Mayenne'),
        # France — adjacent potential
        ('orne', 'Orne'),
        ('indre', 'Indre'),
        ('deux-sevres', 'Deux-Sèvres'),
        ('sarthe', 'Sarthe'),
    )),
    *(('it', slug, name) for slug, name in (
        # Italy
        ('marche', 'Le Marche'),
        ('umbria', 'Umbria'),
        ('toscana', 'Tuscany'),
        ('liguria', 'Liguria'),
    )),
]

# Leggett: region slug → display name, materialised as an ordered list of
# (slug, display name) pairs.
REGIONS_LEGGETT = list({
    # High-vibe (★★★★★)
    'finistere': 'Finistère',
    'herault': 'Hérault',
    'cotes-d-armor': "Côtes-d'Armor",
    'morbihan': 'Morbihan',
    'drome': 'Drôme',
    # Strong vibe (★★★★)
    'charente': 'Charente',
    'charente-maritime': 'Charente-Maritime',
    'ardeche': 'Ardèche',
    # Good vibe (★★★)
    'gard': 'Gard',
    'lot': 'Lot',
    'dordogne': 'Dordogne',
    'correze': 'Corrèze',
    'creuse': 'Creuse',
    # Favourite regions (user-validated)
    'gers': 'Gers',
    'lot-et-garonne': 'Lot-et-Garonne',
    'mayenne': 'Mayenne',
    # Adjacent potential
    'orne': 'Orne',
    'indre': 'Indre',
    'deux-sevres': 'Deux-Sèvres',
    'sarthe': 'Sarthe',
}.items())

# Properstar: ('<country>/<region>' URL slug, display name), grouped by country.
REGIONS_PROPERSTAR = [
    (f'{country}/{slug}', name)
    for country, regions in (
        ('france', (
            ('finistere', 'Finistère'),
            ('herault', 'Hérault'),
            ('charente', 'Charente'),
            ('cotes-d-armor', "Côtes-d'Armor"),
            ('drome', 'Drôme'),
            ('ardeche', 'Ardèche'),
            ('morbihan', 'Morbihan'),
            ('dordogne', 'Dordogne'),
        )),
        ('italy', (
            ('marche', 'Le Marche'),
            ('umbria', 'Umbria'),
            ('tuscany', 'Tuscany'),
        )),
        ('portugal', (
            ('alentejo', 'Alentejo'),
        )),
        ('spain', (
            ('asturias', 'Asturias'),
            ('galicia', 'Galicia'),
        )),
    )
    for slug, name in regions
]

# Idealista: (domain, url_path, region_name, country_code)
# Each domain has its own URL structure for property type + price filters
REGIONS_IDEALISTA = [
    (domain, f'en/geo/{listing_path}/{slug}/{type_suffix}', name, country_code)
    for domain, country_code, listing_path, type_suffix, regions in (
        # Italy — rustici (farmhouses/country houses)
        ('it', 'IT', 'vendita-case', 'con-rustici', (
            ('toscana', 'Tuscany'),
            ('umbria', 'Umbria'),
            ('marche', 'Le Marche'),
            ('liguria', 'Liguria'),
            ('piemonte', 'Piedmont'),
            ('abruzzo', 'Abruzzo'),
            ('puglia', 'Puglia'),
        )),
        # Spain — casas de pueblo (country houses)
        ('com', 'ES', 'venta-viviendas', 'con-casas-de-pueblo', (
            ('asturias', 'Asturias'),
            ('galicia', 'Galicia'),
            ('aragon', 'Aragon'),
            ('castilla-y-leon', 'Castilla y Leon'),
            ('cataluna', 'Catalonia'),
        )),
        # Portugal — casas rústicas (rural houses / quintas)
        ('pt', 'PT', 'comprar-casas', 'com-casa-rusticas', (
            ('alentejo', 'Alentejo'),
            ('centro', 'Centro'),
            ('norte', 'Norte'),
        )),
    )
    for slug, name in regions
]


# ─── Immobiliare.it (API-based, no browser needed) ───

# Results-per-page figure used by the pagination check in search_immobiliare.
# NOTE(review): 25 is carried over from the original check — confirm against the site.
_IMMOBILIARE_PAGE_SIZE = 25


def _immobiliare_int(value):
    """Return *value* converted to int, or None when it is empty or not numeric."""
    if not value:
        return None
    try:
        return int(value)
    except (ValueError, TypeError):
        return None


def _immobiliare_listing(result, prop_url, region_name):
    """Build the discovery record for one immobiliare.it search result.

    Every nested lookup tolerates a missing or null value, so one incomplete
    listing (empty ``properties`` list, null ``nation``/``photo``/``typology``)
    does not raise and abort the rest of the region.

    Returns:
        The record dict, or None when the typology name is in SKIP_TYPES.
    """
    re_data = result.get('realEstate') or {}

    # Skip apartment-like property types
    typology = (re_data.get('typology') or {}).get('name')
    if (typology or '').lower() in SKIP_TYPES:
        return None

    # Only the first entry of 'properties' is read.
    props = (re_data.get('properties') or [{}])[0] or {}
    loc = props.get('location') or {}
    price_data = re_data.get('price') or {}

    # Parse surface string like "103 m²"
    building_m2 = None
    surface = props.get('surface')
    if surface:
        m = re.search(r'(\d+)', str(surface).replace(',', ''))
        if m:
            building_m2 = int(m.group(1))

    photo_urls = (props.get('photo') or {}).get('urls') or {}

    return {
        'url': prop_url,
        'source': 'immobiliare',
        'title': re_data.get('title', ''),
        'price': price_data.get('value'),
        'city': loc.get('city'),
        'country': (loc.get('nation') or {}).get('id', 'IT'),
        'lat': loc.get('latitude'),
        'lon': loc.get('longitude'),
        'building_size': building_m2,
        'bedrooms': _immobiliare_int(props.get('bedRoomsNumber')),
        'rooms': _immobiliare_int(props.get('rooms')),
        'property_type': typology,
        'thumbnail': photo_urls.get('small', ''),
        'search_region': region_name,
    }


def search_immobiliare(store, limit, max_pages):
    """Search immobiliare.it via __NEXT_DATA__ JSON extraction.

    Args:
        store: Mapping whose keys are the URLs of already-known properties.
        limit: Maximum number of new listings to collect; a falsy value
            disables the cap.
        max_pages: Maximum number of result pages fetched per region.

    Returns:
        A list of record dicts (see _immobiliare_listing), one per listing
        whose URL was not already in ``store``.
    """
    new_urls = []
    existing = set(store.keys())

    # idTipologia: 7=villa, 23=single-family villa, 24=two-family villa,
    # 28=country house, 29=farmhouse, 31=rustico/casale, 33=cascina
    base_params = (f'prezzoMinimo={CRITERIA["price_min"]}&prezzoMassimo={CRITERIA["price_max"]}'
                   f'&localiMinimo={CRITERIA["bedrooms_min"]}'
                   f'&idTipologia=7,23,24,28,29,31,33')

    # The context manager closes the session (and its pooled connections) on exit.
    with req.Session() as session:
        session.headers.update({'User-Agent': CHROME_UA, 'Accept-Language': 'en'})

        for region_slug, region_name in REGIONS_IMMOBILIARE:
            if limit and len(new_urls) >= limit:
                break

            print(f"  Searching Immobiliare.it: {region_name}...")

            for page_num in range(1, max_pages + 1):
                params = base_params if page_num == 1 else f'{base_params}&pag={page_num}'
                url = f'https://www.immobiliare.it/en/vendita-case/{region_slug}/?{params}'

                try:
                    resp = session.get(url, timeout=20)
                    if resp.status_code != 200:
                        print(f"    Page {page_num}: HTTP {resp.status_code}")
                        break

                    # Extract __NEXT_DATA__ JSON
                    match = re.search(r'<script id="__NEXT_DATA__"[^>]*>(.*?)</script>', resp.text, re.DOTALL)
                    if not match:
                        print(f"    Page {page_num}: no __NEXT_DATA__ found")
                        break

                    next_data = json.loads(match.group(1))
                    queries = next_data.get('props', {}).get('pageProps', {}).get('dehydratedState', {}).get('queries', [])
                    if not queries:
                        break

                    # Only the first dehydrated query is read for the result list.
                    data = (queries[0].get('state') or {}).get('data') or {}
                    results = data.get('results') or []
                    if not results:
                        print(f"    Page {page_num}: no results")
                        break

                    batch_new = 0
                    for r in results:
                        prop_url = (r.get('seo') or {}).get('url', '')
                        if not prop_url or prop_url in existing:
                            continue

                        record = _immobiliare_listing(r, prop_url, region_name)
                        if record is None:
                            continue

                        new_urls.append(record)
                        existing.add(prop_url)
                        batch_new += 1

                        if limit and len(new_urls) >= limit:
                            break

                    print(f"    Page {page_num}: {len(results)} listings, {batch_new} new")

                    if limit and len(new_urls) >= limit:
                        break

                    # Check if there are more pages
                    total = data.get('count') or 0
                    if page_num * _IMMOBILIARE_PAGE_SIZE >= total:
                        break

                    time.sleep(2)

                except Exception as e:
                    # Best-effort scraper: report and move on to the next region.
                    print(f"    Error on page {page_num}: {str(e)[:80]}")
                    break

            time.sleep(3)  # Delay between regions to avoid rate limiting

    return new_urls


# ─── Green-Acres (HTML scraping, no browser needed) ───

def search_greenacres(store, limit, max_pages):
    """Search green-acres.fr/it via server-rendered HTML.

    For every entry in REGIONS_GREENACRES the region's listing page is
    downloaded once and its listing cards are parsed with regular expressions.

    Args:
        store: Mapping whose keys are the URLs of already-known properties.
        limit: Maximum number of new listings to collect; a falsy value
            disables the cap.
        max_pages: Accepted for signature parity with the other platform
            searches; not consulted here — only the first results page of
            each region is fetched.

    Returns:
        A list of record dicts, one per listing whose URL was not already in
        ``store`` and whose price (when one could be parsed) lies inside
        CRITERIA's price range.
    """
    new_urls = []
    existing = set(store.keys())

    # Cards carry data-advertid and data-o (base64 URL); compiled once for all regions.
    card_pattern = re.compile(
        r'<div[^>]*class="announce-card[^"]*"[^>]*data-advertid="([^"]*)"[^>]*data-o="([^"]*)"',
        re.DOTALL
    )

    # The context manager closes the session (and its pooled connections) on exit.
    with req.Session() as session:
        session.headers.update({'User-Agent': CHROME_UA, 'Accept-Language': 'en'})

        for country_tld, region_slug, region_name in REGIONS_GREENACRES:
            if limit and len(new_urls) >= limit:
                break

            print(f"  Searching Green-Acres.{country_tld}: {region_name}...")

            url = f'https://www.green-acres.{country_tld}/property-for-sale/{region_slug}'
            country_code = 'FR' if country_tld == 'fr' else 'IT'

            try:
                resp = session.get(url, timeout=20)
                if resp.status_code != 200:
                    print(f"    HTTP {resp.status_code}")
                    continue

                html = resp.text

                cards = card_pattern.findall(html)
                if not cards:
                    print(f"    No listing cards found")
                    continue

                batch_new = 0
                for advert_id, data_o in cards:
                    try:
                        prop_url = base64.b64decode(data_o).decode('utf-8')
                    except ValueError:
                        # binascii.Error and UnicodeDecodeError are both ValueError
                        # subclasses: the attribute was not a base64-encoded URL.
                        continue

                    if not prop_url or prop_url in existing:
                        continue

                    # Skip non-residential property types
                    if any(skip in prop_url.lower() for skip in ['business-premises', 'commercial', 'office', 'garage']):
                        continue

                    # Extract city from URL (always reliable)
                    # Pattern: /properties/{type}/{city}/{id}.htm
                    city = None
                    url_match = re.search(r'/properties/[^/]+/([^/]+)/[^/]+\.htm', prop_url)
                    if url_match:
                        city = url_match.group(1).replace('-', ' ').title()

                    # Find the announce-info section (skip photo carousel)
                    card_start = html.find(f'data-advertid="{advert_id}"')
                    if card_start < 0:
                        continue
                    info_start = html.find('announce-info', card_start)
                    if info_start < 0 or info_start > card_start + 15000:
                        # No info section within 15000 characters of the card start:
                        # keep the listing, but without any parsed details.
                        info_html = ''
                    else:
                        info_html = unescape(html[info_start:info_start + 1500])

                    # Price
                    price = None
                    if info_html:
                        price_match = re.search(r'info-price[^>]*>([\d,. ]+)', info_html)
                        if price_match:
                            digits = re.sub(r'[^\d]', '', price_match.group(1))
                            if digits:
                                price = int(digits)

                    # Apply price filter (listings without a parsed price are kept)
                    if price and (price < CRITERIA['price_min'] or price > CRITERIA['price_max']):
                        continue

                    # Location from HTML (more specific than URL)
                    if info_html:
                        loc_match = re.search(r'announce-localisation[^>]*>\s*([^<]+)', info_html)
                        if loc_match:
                            loc_text = loc_match.group(1).strip()
                            if loc_text:
                                city = loc_text

                    # Features from info section
                    building_m2 = None
                    land_m2 = None
                    rooms = None
                    bedrooms = None

                    if info_html:
                        area_match = re.search(r'title="Living area"[^>]*>\s*(\d[\d,. ]*)', info_html)
                        if area_match:
                            building_m2 = int(re.sub(r'[^\d]', '', area_match.group(1)))

                        land_match = re.search(r'title="Land"[^>]*>\s*([\d,. ]+)\s*(m|hectare)', info_html)
                        if land_match:
                            # The pattern admits spaces inside the number ("1 200"),
                            # which float() rejects — strip them along with commas.
                            val_str = re.sub(r'[,\s]', '', land_match.group(1))
                            try:
                                val = float(val_str)
                                land_m2 = int(val * 10000) if 'hectare' in land_match.group(2) else int(val)
                            except ValueError:
                                pass  # not a usable number; leave land size unknown

                        rooms_match = re.search(r'title="Rooms"[^>]*>\s*(\d+)', info_html)
                        if rooms_match:
                            rooms = int(rooms_match.group(1))

                        beds_match = re.search(r'title="Bedrooms"[^>]*>\s*(\d+)', info_html)
                        if beds_match:
                            bedrooms = int(beds_match.group(1))

                    # Thumbnail from photo section
                    thumb = None
                    card_html = html[card_start:card_start + 3000]
                    img_match = re.search(r'announce-card-img[^>]*src="([^"]+)"', card_html)
                    if img_match:
                        thumb = img_match.group(1)

                    new_urls.append({
                        'url': prop_url,
                        'source': 'greenacres',
                        'title': city or region_name,
                        'price': price,
                        'city': city,
                        'country': country_code,
                        'building_size': building_m2,
                        'land_size': land_m2,
                        'rooms': rooms,
                        'bedrooms': bedrooms,
                        'thumbnail': thumb,
                        'search_region': region_name,
                    })
                    existing.add(prop_url)
                    batch_new += 1

                    if limit and len(new_urls) >= limit:
                        break

                print(f"    Found {len(cards)} listings, {batch_new} new")

            except Exception as e:
                # Best-effort scraper: report and carry on with the next region.
                print(f"    Error: {str(e)[:80]}")

            time.sleep(1)

    return new_urls


# ─── Leggett (Playwright, Cloudflare-protected) ───

async def search_leggett(page, store, limit, max_pages):
    """Collect new Leggett (frenchestateagents.com) listings, region by region.

    Each entry of REGIONS_LEGGETT is opened in the supplied browser ``page``
    with the CRITERIA filters in the query string. Listing links are scraped
    together with price and thumbnail, and a record is returned for every URL
    that is not already a key of ``store``. Collection stops once ``limit``
    (when truthy) records have been gathered. ``max_pages`` is accepted but
    not consulted here.
    """
    found = []
    known = set(store.keys())
    lowest, highest = CRITERIA['price_min'], CRITERIA['price_max']
    query = '&'.join([
        f"min_price={lowest}",
        f"max_price={highest}",
        f"min_bedrooms={CRITERIA['bedrooms_min']}",
        "type=house,farm",
    ])

    def quota_reached():
        return limit and len(found) >= limit

    for region_slug, region_name in REGIONS_LEGGETT:
        if quota_reached():
            break

        url = f"https://www.frenchestateagents.com/french-property-for-sale/in/{region_slug}/?{query}"

        print(f"  Searching Leggett: {region_name}...")

        try:
            await page.goto(url, wait_until='domcontentloaded', timeout=30000)
            await page.wait_for_timeout(5000)

            passed = await wait_for_cloudflare(page, timeout_ms=10000)
            if not passed:
                print(f"    Cloudflare blocked. Skip {region_name}.")
                continue

            items = await page.evaluate("""() => {
                const results = [];
                // Try multiple selector patterns for robustness
                const anchors = document.querySelectorAll('a[href*="/french-property-for-sale/view/"]');
                const seen = new Set();
                for (const a of anchors) {
                    const url = a.href;
                    if (seen.has(url)) continue;
                    seen.add(url);

                    // Walk up to find the card container for price/thumb
                    let card = a.closest('article, .card, .property-card, .listing-card, li');
                    let price = null;
                    let thumb = null;

                    if (card) {
                        const priceEl = card.querySelector('.price .new-price, .price, [class*="price"]');
                        if (priceEl) price = parseInt(priceEl.textContent.replace(/[^0-9]/g, '')) || null;
                        const img = card.querySelector('img[src*="http"]');
                        if (img) thumb = img.src;
                    }

                    results.push({ url, price, thumb });
                }
                return results;
            }""")

            fresh = 0
            for entry in items:
                link = entry.get('url', '')
                if not link or link in known:
                    continue

                # Listings without a scraped price are kept; priced ones must be in range.
                price = entry.get('price')
                if price and not (lowest <= price <= highest):
                    continue

                found.append({
                    'url': link,
                    'source': 'leggett',
                    'price': price,
                    'thumbnail': entry.get('thumb'),
                    'search_region': region_name,
                    'country': 'FR',
                })
                known.add(link)
                fresh += 1

                if quota_reached():
                    break

            print(f"    Found {len(items)} listings, {fresh} new")

        except Exception as e:
            print(f"    Error: {str(e)[:80]}")

        await page.wait_for_timeout(2000)

    return found


# ─── Properstar (Playwright, Cloudflare-protected) ───

# Country segment of a REGIONS_PROPERSTAR slug → country code stored on the record.
_PROPERSTAR_COUNTRY_CODES = {
    'france': 'FR',
    'italy': 'IT',
    'portugal': 'PT',
    'spain': 'ES',
}


async def search_properstar(page, store, limit, max_pages):
    """Search properstar.nl for matching properties.

    Each entry of REGIONS_PROPERSTAR is opened in the supplied browser
    ``page`` with the CRITERIA budget and bedroom filters in the query string,
    and every ``/listing/`` link on the page is collected.

    Args:
        page: Browser page used for navigation and in-page evaluation.
        store: Mapping whose keys are the URLs of already-known properties.
        limit: Maximum number of new listings to collect; a falsy value
            disables the cap.
        max_pages: Accepted for signature parity with the other platform
            searches; not consulted here — one results page per region.

    Returns:
        A list of record dicts, one per listing whose URL was not already in
        ``store``. Like the other platforms' records, each one carries a
        ``country`` code; it is derived from the region slug's country segment
        and is None for a segment missing from _PROPERSTAR_COUNTRY_CODES.
    """
    new_urls = []
    existing = set(store.keys())

    for region_slug, region_name in REGIONS_PROPERSTAR:
        if limit and len(new_urls) >= limit:
            break

        url = (f"https://www.properstar.nl/{region_slug}/"
               f"kopen/huis?budget={CRITERIA['price_min']}-{CRITERIA['price_max']}"
               f"&bedrooms={CRITERIA['bedrooms_min']}-")

        # Slugs look like 'france/finistere'; the part before '/' is the country.
        country_code = _PROPERSTAR_COUNTRY_CODES.get(region_slug.partition('/')[0])

        print(f"  Searching Properstar: {region_name}...")

        try:
            await page.goto(url, wait_until='domcontentloaded', timeout=30000)
            await page.wait_for_timeout(3000)

            items = await page.evaluate("""() => {
                const results = [];
                const seen = new Set();
                // Properstar listing links
                const anchors = document.querySelectorAll('a[href*="/listing/"]');
                for (const a of anchors) {
                    const url = a.href;
                    if (seen.has(url) || !url.includes('/listing/')) continue;
                    seen.add(url);

                    let card = a.closest('article, [class*="card"], [class*="listing"]');
                    let price = null;

                    if (card) {
                        const priceEl = card.querySelector('[class*="price"], [itemprop="price"]');
                        if (priceEl) price = parseInt(priceEl.textContent.replace(/[^0-9]/g, '')) || null;
                    }

                    results.push({ url, price });
                }
                return results;
            }""")

            batch_new = 0
            for item in items:
                prop_url = item.get('url', '')
                if not prop_url or prop_url in existing:
                    continue

                batch_new += 1
                new_urls.append({
                    'url': prop_url,
                    'source': 'properstar',
                    'price': item.get('price'),
                    'search_region': region_name,
                    'country': country_code,
                })
                existing.add(prop_url)

                if limit and len(new_urls) >= limit:
                    break

            print(f"    Found {len(items)} listings, {batch_new} new")

        except Exception as e:
            # Best-effort scraper: report and carry on with the next region.
            print(f"    Error: {str(e)[:80]}")

        await page.wait_for_timeout(2000)

    return new_urls


# ─── Idealista (Playwright, per-domain auth) ───

# Directory of this script; per-domain auth files and browser profiles are created in it.
IDEALISTA_AUTH_DIR = Path(__file__).parent

# Price filter URL fragment per domain. All three share the
# "<word>-min_<min>,<word>-max_<max>" shape and differ only in the word used.
IDEALISTA_PRICE_FILTER = {
    domain: f'{word}-min_{CRITERIA["price_min"]},{word}-max_{CRITERIA["price_max"]}'
    for domain, word in (
        ('it', 'prezzo'),
        ('com', 'precio'),
        ('pt', 'preco'),
    )
}

# Listing URL patterns per domain (for detecting property links)
IDEALISTA_LISTING_PATTERNS = dict(
    it='/immobile/',
    com='/inmueble/',
    pt='/imovel/',
)


async def _idealista_check_captcha(page, context, domain, auth_file):
    """Detect DataDome CAPTCHA and wait for manual solving.

    Args:
        page: Browser page showing the idealista result.
        context: Browser context whose storage state is saved once the
            CAPTCHA has been solved.
        domain: Idealista domain suffix, used in progress messages only.
        auth_file: Path the storage state is written to.

    Returns:
        True if the page is usable (no CAPTCHA, or it was solved in time);
        False if the page could not be inspected or the wait timed out.
    """
    # Both probes are guarded: a page that cannot be inspected is reported as
    # unusable instead of raising out of the helper.
    try:
        html = await page.evaluate('() => document.documentElement.outerHTML.substring(0, 3000)')
        article_count = await page.evaluate('() => document.querySelectorAll("article").length')
    except Exception:
        return False

    has_captcha = 'captcha-delivery.com' in html or 'datadome' in html.lower()
    # A page that already shows <article> elements is treated as real content
    # even when the first 3000 characters mention DataDome.
    has_content = (article_count or 0) > 0

    if not has_captcha or has_content:
        return True  # No CAPTCHA, page is fine

    print(f"      DataDome CAPTCHA detected for idealista.{domain}")
    print(f"      Please solve the CAPTCHA in the browser window...")
    # Wait for CAPTCHA to be solved (page will reload with real content)
    for attempt in range(40):  # ~2 minutes
        await page.wait_for_timeout(3000)
        try:
            new_html = await page.evaluate('() => document.documentElement.outerHTML.substring(0, 3000)')
        except Exception:
            # Evaluation failed on this attempt; just try again.
            continue
        if 'captcha-delivery.com' not in new_html:
            print(f"      CAPTCHA solved!")
            # Save cookies immediately
            try:
                await context.storage_state(path=str(auth_file))
                print(f"      Cookies saved to {auth_file.name}")
            except Exception as e:
                # Saving is best-effort, but say so rather than fail silently.
                print(f"      Could not save cookies: {str(e)[:80]}")
            await page.wait_for_timeout(2000)
            return True
        if attempt % 5 == 4:
            print(f"      Still waiting for CAPTCHA solve...")

    print(f"      CAPTCHA timeout, skipping idealista.{domain}")
    return False


async def _idealista_ensure_auth(context, page, domain, auth_file):
    """Check if logged in, prompt for manual login if needed, save auth.

    Does nothing unless the page's current URL contains one of the login
    markers. Otherwise polls the URL every 3 seconds, up to 60 times, until it
    no longer contains a marker, then writes the context's storage state to
    ``auth_file``. If the page is still on a login URL after the last poll,
    nothing is saved.

    Args:
        context: Browser context whose storage state is saved after login.
        page: Browser page whose URL is inspected.
        domain: Idealista domain suffix, used in progress messages only.
        auth_file: Path the storage state is written to.

    Returns:
        None.
    """
    login_markers = ('login', 'accedi', 'entrar')

    def is_login_url(url):
        lowered = url.lower()
        return any(marker in lowered for marker in login_markers)

    # If redirected to login or not on expected page, need auth
    if not is_login_url(page.url):
        return

    print(f"    Login required for idealista.{domain}")
    print(f"    Please log in in the browser window...")
    # Wait for navigation away from login page
    for _ in range(60):
        await page.wait_for_timeout(3000)
        current_url = page.url
        if not is_login_url(current_url):
            break
        print(f"    Still waiting for login... ({current_url[:50]})")
    else:
        # Never left the login page: saving now would store a logged-out
        # session and report it as saved auth.
        print(f"    Login timeout for idealista.{domain}, auth not saved")
        return

    # Save auth
    try:
        await context.storage_state(path=str(auth_file))
        print(f"    Auth saved to {auth_file.name}")
    except Exception as e:
        # Saving is best-effort, but say so rather than fail silently.
        print(f"    Could not save auth: {str(e)[:80]}")


async def search_idealista(page, store, limit, max_pages):
    """Search idealista.it/.com/.pt for matching rural properties.

    Uses patchright (undetected Playwright fork) with one persistent browser
    context per domain to bypass DataDome bot detection. The shared `page`
    argument is ignored; a separate patchright browser is launched here.

    Args:
        page: Unused (kept for signature parity with the other browser searches).
        store: Mapping whose keys are the URLs of already-known properties.
        limit: Maximum number of new listings to collect; a falsy value
            disables the cap.
        max_pages: Maximum number of result pages visited per region.

    Returns:
        A list of record dicts, one per listing whose URL was not already in
        ``store`` and whose price (when one was scraped) lies inside
        CRITERIA's price range.
    """
    from patchright.async_api import async_playwright as patchright_playwright

    new_urls = []
    existing = set(store.keys())

    # URL substrings that indicate a login page; the same markers that
    # _idealista_ensure_auth waits on.
    login_markers = ('login', 'accedi', 'entrar')

    by_domain = {}
    for domain, url_path, region_name, country_code in REGIONS_IDEALISTA:
        by_domain.setdefault(domain, []).append((url_path, region_name, country_code))

    async with patchright_playwright() as pw:
        for domain, regions in by_domain.items():
            if limit and len(new_urls) >= limit:
                break

            auth_file = IDEALISTA_AUTH_DIR / f'idealista_auth_{domain}.json'
            listing_pattern = IDEALISTA_LISTING_PATTERNS[domain]
            price_fragment = IDEALISTA_PRICE_FILTER[domain]
            user_data_dir = IDEALISTA_AUTH_DIR / f'.idealista_profile_{domain}'
            user_data_dir.mkdir(exist_ok=True)

            print(f"\n  idealista.{domain} ({len(regions)} regions)")

            context = await pw.chromium.launch_persistent_context(
                user_data_dir=str(user_data_dir),
                channel="chrome",
                headless=False,
                no_viewport=True,
            )
            domain_page = context.pages[0] if context.pages else await context.new_page()

            try:
                # Set when a CAPTCHA or IP block is hit, so that the remaining
                # regions of this domain really are skipped (as the messages say)
                # instead of only the remaining pages of the current region.
                domain_blocked = False

                for url_path, region_name, country_code in regions:
                    if limit and len(new_urls) >= limit:
                        break

                    base_url = f'https://www.idealista.{domain}/{url_path}'
                    search_url = f'{base_url},{price_fragment}/'
                    print(f"    Searching: {region_name}...")

                    for page_num in range(1, max_pages + 1):
                        page_url = search_url if page_num == 1 else f'{search_url}pagina-{page_num}.htm'

                        try:
                            await domain_page.goto(page_url, wait_until='domcontentloaded', timeout=30000)
                            await domain_page.wait_for_timeout(4000)

                            if not await _idealista_check_captcha(domain_page, context, domain, auth_file):
                                print(f"      Blocked by CAPTCHA, skipping remaining {domain} regions")
                                domain_blocked = True
                                break

                            try:
                                body_text = await domain_page.evaluate('() => document.body ? document.body.innerText.substring(0, 200) : ""')
                            except Exception:
                                body_text = ''
                            if 'uso improprio' in body_text.lower() or 'blocked' in body_text.lower() or 'bloccato' in body_text.lower():
                                print(f"      IP blocked by idealista.{domain}, skipping")
                                domain_blocked = True
                                break

                            current_url = domain_page.url.lower()
                            if any(marker in current_url for marker in login_markers):
                                await _idealista_ensure_auth(context, domain_page, domain, auth_file)
                                await domain_page.goto(page_url, wait_until='domcontentloaded', timeout=30000)
                                await domain_page.wait_for_timeout(3000)

                            escaped_pattern = listing_pattern.replace('/', '\\/')
                            items = await domain_page.evaluate("""(pattern) => {
                                const results = [];
                                const seen = new Set();
                                const re = new RegExp(pattern);
                                const articles = document.querySelectorAll('article.item, article[data-adid], .item-info-container');
                                for (const art of articles) {
                                    const links = art.querySelectorAll('a[href]');
                                    for (const a of links) {
                                        if (!re.test(a.href) || seen.has(a.href)) continue;
                                        seen.add(a.href);
                                        const priceEl = art.querySelector('.item-price, .price-row, [class*="price"] h3, [class*="price"]');
                                        const price = priceEl ? parseInt(priceEl.textContent.replace(/[^0-9]/g, '')) || null : null;
                                        const locEl = art.querySelector('.item-detail-char .item-detail, .item-description');
                                        const location = locEl ? locEl.textContent.trim().substring(0, 80) : null;
                                        const titleEl = art.querySelector('.item-link, a.item-link');
                                        const title = titleEl ? titleEl.textContent.trim() : null;
                                        const img = art.querySelector('img[src*="http"], picture img');
                                        const thumb = img ? img.src : null;
                                        const detailEls = art.querySelectorAll('.item-detail');
                                        let sqm = null, rooms = null;
                                        for (const d of detailEls) {
                                            const t = d.textContent.trim();
                                            const sqmMatch = t.match(/(\\d[\\d.,]*)\\s*m/);
                                            if (sqmMatch) sqm = parseInt(sqmMatch[1].replace(/[.,]/g, ''));
                                            const roomMatch = t.match(/(\\d+)\\s*(room|habitaci|local|vano)/i);
                                            if (roomMatch) rooms = parseInt(roomMatch[1]);
                                        }
                                        results.push({ url: a.href, price, location, title, thumb, sqm, rooms });
                                        break;
                                    }
                                }
                                if (results.length === 0) {
                                    const allLinks = document.querySelectorAll('a[href]');
                                    for (const a of allLinks) {
                                        if (!re.test(a.href) || seen.has(a.href)) continue;
                                        seen.add(a.href);
                                        results.push({ url: a.href, price: null, location: null, title: a.textContent.trim().substring(0, 80), thumb: null, sqm: null, rooms: null });
                                    }
                                }
                                return results;
                            }""", escaped_pattern)

                            batch_new = 0
                            for item in items:
                                prop_url = item.get('url', '')
                                if not prop_url or prop_url in existing:
                                    continue
                                # Listings without a scraped price are kept; priced ones must be in range.
                                price = item.get('price')
                                if price and (price < CRITERIA['price_min'] or price > CRITERIA['price_max']):
                                    continue
                                batch_new += 1
                                new_urls.append({
                                    'url': prop_url,
                                    'source': 'idealista',
                                    'price': price,
                                    'city': item.get('location'),
                                    'title': item.get('title'),
                                    'thumbnail': item.get('thumb'),
                                    'building_size': item.get('sqm'),
                                    'rooms': item.get('rooms'),
                                    'search_region': region_name,
                                    'country': country_code,
                                })
                                existing.add(prop_url)
                                if limit and len(new_urls) >= limit:
                                    break

                            print(f"      Page {page_num}: {len(items)} listings, {batch_new} new")
                            # Stop paging this region once a page yields nothing new.
                            if not items or batch_new == 0:
                                break
                            if limit and len(new_urls) >= limit:
                                break

                        except Exception as e:
                            # Best-effort scraper: give up on this region's remaining pages.
                            print(f"      Error on page {page_num}: {str(e)[:80]}")
                            break

                        await domain_page.wait_for_timeout(3000)

                    if domain_blocked:
                        break

                    await domain_page.wait_for_timeout(2000)

            finally:
                # Always release the per-domain browser context.
                await context.close()

    return new_urls


# ─── Platform registry ───

# API-based platforms (fast, no browser)
API_PLATFORMS = dict(
    immobiliare=search_immobiliare,
    greenacres=search_greenacres,
)

# Platforms whose search coroutine takes a browser page (need Playwright)
BROWSER_PLATFORMS = dict(
    leggett=search_leggett,
    properstar=search_properstar,
    idealista=search_idealista,
)

# Every platform name, request-based ones first; also the --platform choices.
ALL_PLATFORMS = [*API_PLATFORMS, *BROWSER_PLATFORMS]


# ─── Main ───

def _print_banner(title):
    """Print *title* framed above and below by a 50-character rule."""
    rule = '=' * 50
    print(rule)
    print(f"  {title}")
    print(rule)


def _persist_batch(st, batch, dry):
    """Upsert a batch of discovered listings into the store and persist it.

    Called once per platform so that results gathered so far survive if a
    later platform fails (e.g. an IP block).

    Args:
        st: The loaded property store, passed to ``upsert`` and ``persist``.
        batch: Listing dicts returned by a platform search. Each must carry
            ``url`` and ``source``; the other known keys are copied only
            when they are present and not ``None``.
        dry: When true, nothing is written.
    """
    if dry or not batch:
        return
    now = datetime.now().isoformat()
    optional_keys = ('title', 'price', 'city', 'country', 'lat', 'lon',
                     'building_size', 'land_size', 'bedrooms', 'rooms',
                     'property_type', 'thumbnail')
    for item in batch:
        fields = {'source': item['source'], 'discovered_at': now,
                  'search_region': item.get('search_region')}
        fields.update((key, item[key]) for key in optional_keys
                      if item.get(key) is not None)
        upsert(st, item['url'], fields)
    persist(st)
    print(f"  (saved {len(batch)} to store, {len(st)} total)")


def _print_tally(labels):
    """Count *labels* and print one ``label: count`` line each, highest first."""
    counts = {}
    for label in labels:
        counts[label] = counts.get(label, 0) + 1
    # sorted() is stable, so ties keep their first-seen order.
    for label, count in sorted(counts.items(), key=lambda pair: -pair[1]):
        print(f"    {label}: {count}")


def _format_dry_run_row(item):
    """Return the one-line dry-run preview for a discovered listing.

    Tolerates a missing/``None`` location and a non-numeric price so that a
    single odd listing cannot crash the preview.
    """
    price = item.get('price')
    if not price:
        price_str = '?'
    elif isinstance(price, (int, float)):
        price_str = f"EUR {price:,}"
    else:
        # The ',' format spec is only valid for numbers; show other values raw.
        price_str = f"EUR {price}"
    # str() guards the '25s' format spec against None / non-string values.
    city = str(item.get('city') or item.get('search_region') or '')
    return (f"  {item['source']:13s} {city:25s} {price_str:>15s}  "
            f"{item['url'][:60]}")


async def main():
    """Command-line entry point: search the selected platforms for listings.

    Parses the CLI arguments, loads the store, then runs the API-based
    searches, the browser-based searches that share one ``browser_page``,
    and finally the standalone-browser searches. Each platform's results
    are saved right after that platform finishes (unless ``--dry-run``),
    and a per-platform / per-region summary is printed at the end. With
    ``--dry-run`` the first 30 discovered listings are printed instead of
    being saved.
    """
    parser = argparse.ArgumentParser(description='Search platforms for new properties')
    parser.add_argument('--platform', choices=ALL_PLATFORMS,
                        help='Search single platform')
    parser.add_argument('--limit', type=int, default=50,
                        help='Max new properties per platform (default: 50)')
    parser.add_argument('--pages', type=int, default=3,
                        help='Max result pages per region (default: 3)')
    parser.add_argument('--dry-run', action='store_true',
                        help='Show found URLs without saving to store')
    parser.add_argument('--headless', action='store_true',
                        help='Run browser headless (may fail on Cloudflare)')
    args = parser.parse_args()

    store = load()
    platforms = [args.platform] if args.platform else ALL_PLATFORMS

    print("SEARCH — Discovering new properties")
    print(f"  Store: {len(store)} existing properties")
    print(f"  Platforms: {', '.join(platforms)}")
    print(f"  Criteria: EUR {CRITERIA['price_min']:,}-{CRITERIA['price_max']:,}, "
          f"{CRITERIA['bedrooms_min']}+ beds")
    print(f"  Max pages per region: {args.pages}")
    print()

    all_new = []

    def record(platform, found):
        """Collect one platform's results, save them, and report the count."""
        all_new.extend(found)
        _persist_batch(store, found, args.dry_run)
        print(f"  -> {len(found)} new from {platform}\n")

    # Run API-based searches first (fast, no browser needed)
    for platform in (p for p in platforms if p in API_PLATFORMS):
        _print_banner(f"{platform.upper()} (API)")
        record(platform, API_PLATFORMS[platform](store, args.limit, args.pages))

    # Run browser-based searches (need Playwright)
    browser_to_run = [p for p in platforms if p in BROWSER_PLATFORMS]
    # Idealista manages its own patchright browser, separate from shared page
    standalone_browsers = {'idealista'}
    shared_browser_platforms = [p for p in browser_to_run
                                if p not in standalone_browsers]
    standalone_platform_list = [p for p in browser_to_run
                                if p in standalone_browsers]

    if shared_browser_platforms:
        async with browser_page(headless=args.headless) as page:
            for platform in shared_browser_platforms:
                _print_banner(f"{platform.upper()} (Browser)")
                search_fn = BROWSER_PLATFORMS[platform]
                record(platform,
                       await search_fn(page, store, args.limit, args.pages))

    for platform in standalone_platform_list:
        _print_banner(f"{platform.upper()} (Standalone browser)")
        search_fn = BROWSER_PLATFORMS[platform]
        # No shared page is passed: the platform opens its own browser.
        record(platform, await search_fn(None, store, args.limit, args.pages))

    # Summary
    _print_banner("SEARCH COMPLETE")

    print(f"  Total new: {len(all_new)}")
    _print_tally(item['source'] for item in all_new)
    print()
    # A missing or None region is reported as '?'.
    _print_tally(item.get('search_region') or '?' for item in all_new)

    if args.dry_run:
        print("\nDRY RUN — showing first 30:")
        for item in all_new[:30]:
            print(_format_dry_run_row(item))
        return

    if all_new:
        print(f"\nAdded {len(all_new)} new properties to store ({len(store)} total)")
    else:
        print("\nNo new properties found")


# Script entry point: run the async main() coroutine when executed directly.
if __name__ == '__main__':
    asyncio.run(main())
