#!/usr/bin/env python3
"""
Cyber Prairie Scoring — Harmonized property evaluation for Paradiso sessions.

Reads enriched_data.json and applies the unified spec from
system/memory/life/cyber-prairie-property-spec.md on top of existing GPT + custom scores.

Usage:
    python3 cyber_prairie_score.py              # Top 10 shortlist
    python3 cyber_prairie_score.py --top 20     # Top 20
    python3 cyber_prairie_score.py --removed    # Show gate removals
    python3 cyber_prairie_score.py --flags      # Tier 2 research summary
    python3 cyber_prairie_score.py --json       # JSON to stdout
"""
import argparse
import json
import math
import re
import sys
from datetime import datetime
from pathlib import Path

# ─── CRITERIA WEIGHTS — loaded from cyber-prairie-criteria.yaml ───
# Single source of truth: system/memory/life/cyber-prairie-criteria.yaml.
# criteria_loader validates intent assertions at import (e.g. character is heaviest,
# views ≥3.0, three Tier-1 gates present) — startup error on drift.
# Tune by editing the YAML, not by editing Python constants.
from criteria_loader import CRITERIA as _CP  # noqa: E402
from vetting import vetted_blockers  # noqa: E402  — single source of truth for "vetted"

# YAML-backed gate thresholds (replace ad-hoc Python constants below).
# Gate thresholds read once at import from the loaded criteria object.
# The subscripted lookups fail at import time if the expected key is absent
# (assuming `_CP.gates` behaves like a nested dict — TODO confirm in criteria_loader).
_PRICE_MAX = _CP.gates['price']['max_purchase_eur']
_PRICE_MIN = _CP.gates['price']['min_eur']
# Looked up with .get(), so this one may be None when the YAML omits it.
_PRICE_TOTAL_MAX = _CP.gates['price'].get('max_total_cost_eur')
_LAND_FLOOR = _CP.gates['land']['min_m2']

# Build legacy CRITERIA dict from YAML so the rest of the file's references keep
# working (weight + default + source are all the legacy code reads). Keeping the
# dict shape stable means the bounded refactor avoids touching the weighted-sum loop.
# Each per-criterion config is copied with dict(cfg), so edits to CRITERIA entries
# do not write through to the loader's own mapping (shallow copy only).
CRITERIA = {name: dict(cfg) for name, cfg in _CP.weighted_criteria.items()}
# (Weights live in cyber-prairie-criteria.yaml — the single source. Don't reintroduce a
# hardcoded copy here; it drifts. land_size is 3.0 there as of the 2026-06-08 rebalance.)

# ─── GWENDA VIBE REGIONS (from Paradiso sessions) ───

# Each rule is (regex, score, display name). get_vibe() lowercases the text, tests
# EVERY rule and keeps the highest-scoring match, so list order only breaks ties
# between rules with equal scores (the earlier rule wins). A low score therefore
# "demotes" a region only when no higher-scoring rule also matches the same text.
# The section headings below are historical groupings; the number in each tuple is
# the score actually applied.
VIBE_RULES = [
    # Revised 2026-06-13: joint J+G revealed-preference session (10 loved/passed verdicts).
    # Brittany (Morbihan, Côtes-d'Armor, Finistère) demoted from 5→1 (6/6 rejections).
    # Dordogne + Tarn-et-Garonne promoted to 4 (loved: Bourniquel, Montaigu-de-Quercy).
    # Top tier (5): Confirmed strong preference, no rejections
    (r'\bhérault\b|\bherault\b', 5, 'Hérault'),
    (r'\bdrôme\b|\bdrome\b', 5, 'Drôme'),
    (r'\bcévennes\b|\bcevennes\b', 5, 'Cévennes'),
    (r'\bpic saint.loup\b', 5, 'Pic Saint-Loup'),
    # Strong (4): Loved or adjacent to loved corridor (SW France / Charente-Dordogne axis)
    (r'\bcharente.maritime\b', 4, 'Charente-Maritime'),  # 7x 👍👍 2026-06-13; text also matches \bcharente\b below, the higher score (4) wins
    (r'\bcharente\b', 2, 'Charente (inland)'),  # 1x 👎 Montrollet 2026-06-13; inland ≠ Charente-Maritime (scored 2 despite the section heading)
    (r'\bdordogne\b', 4, 'Dordogne'),
    (r'\btarn.et.garonne\b', 4, 'Tarn-et-Garonne'),
    (r'\bardèche\b|\bardeche\b', 4, 'Ardèche'),
    (r'\bgers\b', 4, 'Gers'),
    # Good (3)
    (r'\bgard\b', 3, 'Gard'),
    (r'\bcorrèze\b|\bcorreze\b', 3, 'Corrèze'),
    (r'\blanguedoc\b', 3, 'Languedoc'),
    (r'\blimousin\b', 3, 'Limousin'),
    (r'\bcreuse\b', 3, 'Creuse'),
    (r'\blot\b(?!.et.garonne)', 2, 'Lot'),  # 2x 👎👎 (Assier + Lavergne) 2026-06-13; scored 2; lookahead excludes "Lot-et-Garonne"
    (r'\bvienne\b', 3, 'Vienne'),
    (r'\bdeux.sèvres\b|\bdeux.sevres\b', 3, 'Deux-Sèvres'),
    # Moderate (2)
    (r'\bnormand', 2, 'Normandy'),
    (r'\bmanche\b', 2, 'Manche'),
    (r'\borne\b', 2, 'Orne'),
    (r'\bmayenne\b', 2, 'Mayenne'),
    (r'\baquitaine\b|\bgironde\b', 2, 'Aquitaine'),
    (r'\bmidi', 2, 'Midi-Pyrénées'),
    (r'\bindre\b', 2, 'Indre'),
    (r'\bpyrénées.atlantiques\b|\bpyrenees.atlantiques\b', 2, 'Pyrénées-Atlantiques'),
    (r'\baveyron\b', 2, 'Aveyron'),  # 1x 👎👎 2026-06-13
    # Low (1): Consistently rejected by both Jonathan + Gwenda
    (r'\bcôtes.d.armor\b|\bcotes.d.armor\b', 1, "Côtes-d'Armor"),
    (r'\bmorbihan\b', 1, 'Morbihan'),
    (r'\bfinistère\b|\bfinistere\b', 1, 'Finistère'),
    (r'\bbrittany\b|\bbretagne\b', 1, 'Brittany'),
    (r'\bille.et.vilaine\b', 1, 'Ille-et-Vilaine'),
    # Italy (scores 2-4; 4 is the highest tier used here)
    (r'\bliguria\b|\bliguri', 4, 'Liguria'),
    (r'\btoscane?\b|\btuscany\b', 4, 'Tuscany'),
    (r'\bumbria\b|\bumbri', 4, 'Umbria'),
    (r'\bmarche\b|\bmacerata\b', 3, 'Le Marche'),
    (r'\bpuglia\b|\bapulia\b|\bmanduria\b', 3, 'Puglia'),
    (r'\babruzzo\b', 3, 'Abruzzo'),
    (r'\bpiemonte?\b|\bpiedmont\b', 3, 'Piedmont'),
    (r'\bcalabria\b', 2, 'Calabria'),
    (r'\bsicil', 2, 'Sicily'),
    (r'\bsardegna\b|\bsardinia\b', 2, 'Sardinia'),
    (r'\bcampania\b', 2, 'Campania'),
    (r'\bmolise\b', 2, 'Molise'),
    # Portugal
    (r'\balentejo\b', 4, 'Alentejo'),
    (r'\bcentro\b.*portugal|\bcastelo branco\b|\balcains\b', 3, 'Central Portugal'),
    (r'\bminho\b|\btrás.os.montes\b', 3, 'Northern Portugal'),
    (r'\balgarve\b', 2, 'Algarve'),
    # Spain
    (r'\basturias\b', 4, 'Asturias'),
    (r'\bgalicia\b|\bgalici', 4, 'Galicia'),
    (r'\bnavarra\b', 3, 'Navarra'),
    (r'\baragón\b|\baragon\b', 3, 'Aragón'),
    (r'\bcastilla\b|\bcastile\b', 2, 'Castilla'),
    (r'\bextremadura\b', 2, 'Extremadura'),
    # Croatia / Greece / Ireland
    (r'\bistria\b', 3, 'Istria'),
    (r'\bdalmatia\b', 2, 'Dalmatia'),
    (r'\bcork\b|\bkerry\b', 3, 'SW Ireland'),
    (r'\bgalway\b|\bclare\b', 3, 'W Ireland'),
    (r'\bcrete\b|\bkreta\b', 3, 'Crete'),
    (r'\bpeloponne', 3, 'Peloponnese'),
]

# ─── RED FLAGS ───

# Red-flag catalogue consumed by scan_red_flags(): flag id → dict with
#   'keywords' — phrases tested as plain substrings of the LOWERCASED listing text
#                (at most one hit is recorded per flag),
#   'severity' — 'DISQUALIFY' or 'FLAG' (copied verbatim into the scan result),
#   'reason'   — human-readable explanation (copied verbatim into the scan result).
# NOTE(review): because the text is lowercased before the substring test, an
# upper-case keyword such as 'DRAC' only matches if the scanner also lowercases
# keywords. Some entries carry a deliberate trailing space ('ruin ').
RED_FLAGS = {
    'heritage': {
        'keywords': ['monument historique', 'classé', 'inscrit au', 'listed building',
                     'bâtiment classé', 'patrimoine protégé', 'historisch monument', 'DRAC'],
        'severity': 'DISQUALIFY',
        'reason': 'Heritage/listed building (2-3x renovation cost)',
    },
    'ruin': {
        'keywords': ['ruine', 'ruin ', 'bouwval', 'to demolish', 'à démolir',
                     'not habitable', 'niet bewoonbaar', 'inhabitable', 'onbewoonbaar'],
        'severity': 'DISQUALIFY',
        'reason': 'Ruin/not habitable',
    },
    'semi_detached': {
        'keywords': ['semi-detached', 'semi detached', 'mitoyenne', 'mitoyen',
                     'half-vrijstaand', 'halfvrijstaand', 'terraced', 'townhouse',
                     'maison de village', 'village house'],
        'severity': 'FLAG',
        'reason': 'Semi-detached/attached (limits guest business privacy)',
    },
    'land_only': {
        'keywords': ['terrain à bâtir', 'building plot', 'bouwgrond', 'land for sale',
                     'terrain constructible', 'plot of land'],
        'severity': 'FLAG',
        'reason': 'Land only, no building',
    },
    'isolation': {
        'keywords': ['very remote', 'très isolé', 'zeer afgelegen', 'extremely isolated',
                     'no neighbors within', 'aucun voisin'],
        'severity': 'FLAG',
        'reason': 'Potentially isolated (>30 min to village)',
    },
}

# Reference point in decimal degrees (latitude, longitude).
# NOTE(review): presumably the origin for haversine() distance calculations —
# no use is visible in this part of the file; confirm against callers.
DH_LAT, DH_LON = 52.0705, 4.3007  # The Hague


# ─── SCORING FUNCTIONS ───

def is_likely_land(m2):
    """Return True when *m2* is plausibly a land area (1000 m² or more).

    Anything smaller is almost certainly a building footprint that landed in
    the land field; a missing value (None) is never treated as land.
    """
    if m2 is None:
        return False
    return m2 >= 1000


def score_land_size(land_m2):
    """Rate land area on a 1-5 scale for a homestead + market-garden + retreat.

    Calibrated (2026-06-08, 4-pillar rebalance) to reward the USABLE homestead
    band rather than raw estate size: roughly 0.8ha and up is ample and earns
    the top score, while the 3,000m² gate minimum is merely adequate.

    Returns None when the value does not look like a land area at all
    (see is_likely_land), otherwise an int from 1 to 5.
    """
    if not is_likely_land(land_m2):
        return None  # value is probably a building size — do not score it

    # (minimum m², score), checked from the largest band downwards.
    bands = (
        (8000, 5),  # ~0.8ha+ — ample for garden + privacy + retreat
        (5000, 4),
        (3000, 3),  # meets the gate, workable
        (1500, 2),
    )
    for floor_m2, band_score in bands:
        if land_m2 >= floor_m2:
            return band_score
    return 1


def compute_environmental_risk_score(p):
    """Environmental-risk criterion for a property dict (5 = very safe, 1 = high risk).

    Preference order:
      1. p['risk_score'] when present — rounded to the nearest int with round().
      2. p['risk_profile'] — only the Dutch labels 'laag' / 'gemiddeld' / 'hoog'
         are recognised (after strip + lowercase). Any other label, including
         the GPT-authored "high" that really means "unknown utilities", is
         ignored.

    Returns an int, or None when neither source yields a score.
    """
    numeric = p.get('risk_score')
    if numeric is not None:
        return round(numeric)

    dutch_labels = {'laag': 5, 'gemiddeld': 3, 'hoog': 1}
    raw_label = p.get('risk_profile')
    label = raw_label.strip().lower() if raw_label else ''
    return dutch_labels.get(label)


def get_vibe(text):
    """Return (score, region name) for the best-scoring VIBE_RULES match in *text*.

    The text is lowercased and every rule is considered; the highest score
    wins and, on equal scores, the rule listed first is kept. Returns (0, '')
    when no rule matches.
    """
    haystack = text.lower()
    top = (0, '')
    for regex, points, label in VIBE_RULES:
        if points <= top[0]:
            continue  # cannot beat the current best, so the match is irrelevant
        if re.search(regex, haystack):
            top = (points, label)
    return top


def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two points.

    All four arguments are decimal degrees. Uses the haversine formula on a
    sphere of radius 6371 km.
    """
    R = 6371  # mean Earth radius in km
    dlat, dlon = math.radians(lat2 - lat1), math.radians(lon2 - lon1)
    a = (math.sin(dlat / 2) ** 2
         + math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) * math.sin(dlon / 2) ** 2)
    # Floating-point rounding can push `a` a hair outside [0, 1] for
    # (near-)antipodal points, which would make asin() raise ValueError.
    a = min(1.0, max(0.0, a))
    return R * 2 * math.asin(math.sqrt(a))


def scan_red_flags(text):
    """Scan listing text for RED_FLAGS keywords.

    Matching is a case-insensitive substring test: both the text and each
    keyword are lowercased, so catalogue entries written in upper case (e.g.
    'DRAC') are matched too. At most one hit is reported per flag — the first
    keyword, in catalogue order, that occurs in the text.

    Returns a list of dicts with keys 'id', 'severity', 'reason' and 'matched'
    (the keyword exactly as written in the catalogue). A None/empty text
    yields an empty list.
    """
    text_lower = (text or '').lower()
    found = []
    for flag_id, flag in RED_FLAGS.items():
        hit = next((kw for kw in flag['keywords'] if kw.lower() in text_lower), None)
        if hit is not None:
            found.append({'id': flag_id, 'severity': flag['severity'],
                          'reason': flag['reason'], 'matched': hit})
    return found


def compute_livability_from_amenities(amenities):
    """Livability (1-5) = ACCESSIBILITY, not proximity.

    Reframed 2026-05-28: the old version gave more points the CLOSER an amenity
    was, which scored city-centre properties highest — backwards for a rural
    homestead. Now it's a saturation model: an amenity reachable within the
    'accessible' band scores full points, with NO bonus for being on top of it.
    A homestead 15km from a bakery should score the same livability as one 2km
    away; being IN the town (everything <2km) is handled separately as an urban
    penalty (compute_urban_penalty), not rewarded here.

    *amenities* maps an amenity name to a dict carrying a 'km' distance.
    An amenity that is absent, or whose 'km' is missing or None, earns no
    points. Returns None when *amenities* is empty/None, else an int 1-5.
    """
    if not amenities:
        return None

    # (amenity, full-points radius km, half-points radius km, points at stake)
    bands = (
        ('bakery', 20, 35, 2),
        ('supermarket', 20, 35, 2),
        ('hospital', 40, 60, 2),
        ('train_station', 40, 70, 1),
        ('town', 30, 50, 1),
    )

    points = 0
    checks = 0
    for name, full_km, partial_km, full_pts in bands:
        checks += full_pts
        entry = amenities.get(name)
        if not entry:
            continue
        km = entry.get('km')
        if km is None:
            # Distance unknown (key missing or stored as null): treat as out of
            # reach instead of crashing on a None comparison.
            continue
        if km <= full_km:
            points += full_pts
        elif km <= partial_km:
            points += full_pts * 0.5

    ratio = points / checks if checks > 0 else 0
    return max(1, min(5, round(ratio * 5)))


# Major French cities / préfectures where commune = urban core (poor homestead
# fit). Used as a coarse urban signal until parcel-precise building-density
# (Tier 2) supersedes it. Focused on the search regions.
# Entries are lowercase because compute_urban_penalty() compares them against
# lowercased city/location/title text; accented and unaccented spellings are
# listed separately. Multi-word names ('la rochelle') are matched as substrings,
# single tokens as whole words.
URBAN_COMMUNES = {
    'tarbes', 'pau', 'bayonne', 'lourdes', 'angoulême', 'angouleme', 'périgueux',
    'perigueux', 'montpellier', 'béziers', 'beziers', 'carcassonne', 'narbonne',
    'perpignan', 'rodez', 'albi', 'castres', 'montauban', 'cahors', 'aurillac',
    'mende', 'privas', 'valence', 'montélimar', 'montelimar', 'nîmes', 'nimes',
    'quimper', 'brest', 'lorient', 'vannes', 'saint-brieuc', 'lannion',
    'la rochelle', 'saintes', 'rochefort', 'niort', 'limoges', 'tulle',
    'guéret', 'gueret', 'agen', 'lannemezan',
}


def compute_urban_penalty(p):
    """Tier 1 urban detection (deterministic, no LLM). Returns (delta, reasons).

    *delta* is a non-positive score adjustment, never below -0.9; *reasons* is a
    list of human-readable strings, one per signal that fired.

    When p['building_density'] is present it is authoritative and the function
    returns straight away. Otherwise three coarse fallback signals are summed
    (amenity cluster, small plot + building, known city name) and capped.
    Penalises city-centre / village-house properties that are poor homestead
    fits.
    """
    delta = 0.0
    reasons = []
    amenities = p.get('amenities') or {}

    # Tier 2 DEFINITIVE signal: building density at the parcel (urban_density.py).
    # When present it supersedes the coarser proxies below — it directly measures
    # how built-up the surroundings are. <=40 buildings/250m draws no penalty.
    density = p.get('building_density')
    if density is not None:
        if density > 150:
            delta -= 0.9
            reasons.append(f'town/city core ({density} buildings/250m)')
        elif density > 80:
            delta -= 0.5
            reasons.append(f'dense built-up area ({density} buildings/250m)')
        elif density > 40:
            delta -= 0.2
            reasons.append(f'edge of town ({density} buildings/250m)')
        # <=40: rural/hamlet/village-edge → no penalty
        return max(delta, -0.9), reasons  # density is authoritative; skip proxies

    # ── Fallback proxies (no building-density yet) ──
    # Signal 1: amenity cluster — 3+ amenities all within 2km = town/city core
    close = 0
    for name in ('bakery', 'supermarket', 'hospital', 'train_station', 'town'):
        a = amenities.get(name)
        # A missing or null 'km' means "distance unknown": never counts as close
        # (and must not crash the comparison).
        km = a.get('km') if a else None
        if km is not None and km <= 2.0:
            close += 1
    if close >= 3:
        delta -= 0.6
        reasons.append(f'urban core ({close} amenities <2km)')

    # Signal 2: confirmed small land + building present = village/town house
    land = p.get('land_size_m2') or p.get('land_size') or 0
    building = p.get('building_size_m2') or p.get('building_size') or 0
    if 0 < land < 1500 and building:
        delta -= 0.4
        reasons.append(f'small plot ({land:.0f}m2) + building = village/town house')

    # Signal 3: commune name matches a known city/préfecture.
    # City fields often carry a region suffix ("Tarbes (Hautes-Pyrénées)"), so
    # match on whole words, not exact-equality.
    text = ((p.get('city') or '') + ' ' + (p.get('location') or '') + ' '
            + (p.get('title') or '')).lower()
    words = set(re.findall(r"[a-zà-ÿ'\-]+", text))
    # Iterate in sorted order: URBAN_COMMUNES is a set, and when several names
    # match, the one reported must not depend on hash-seed-dependent ordering.
    matched = next((c for c in sorted(URBAN_COMMUNES)
                    if (' ' in c and c in text) or (c in words)), None)
    if matched:
        delta -= 0.5
        reasons.append(f'commune is a city/préfecture ({matched})')

    # Cap total urban penalty (avoid triple-counting the same urban property)
    delta = max(delta, -0.9)
    return delta, reasons


def compute_price_per_m2(p):
    """Price per m² of building, rounded to an int, or None when not computable.

    The building area is read from 'building_size_m2', falling back to
    'building_size' — the same lookup the other scoring functions in this file
    use. Areas of 20 m² or less are treated as unreliable and yield None, as
    does a missing/zero price.
    """
    price = p.get('price')
    building = p.get('building_size_m2') or p.get('building_size')
    if price and building and building > 20:
        return round(price / building)
    return None


# ─── DETERMINISTIC GROUND-TRUTH LAYER (no LLM / no OpenAI) ───
# Added 2026-05-28. Replaces the gpt-4o-mini analyze step for the unattended
# pipeline. Derives the subjective criteria from listing keywords + attributes
# when no GPT criteria exist, and applies hazard rules learned from the
# in-session scoring of the W22 top-16 (nuclear/industrial misses, land-substrate
# undervaluation, commune-level flood over-reporting, soil-pollution blindness).

# Keyword banks for proxy criteria (English + French + Dutch listing vocab).
# Entries are lowercase and are tested as plain substrings of the lowercased
# _text_blob(); _kw_hits() counts one hit per bank ENTRY that occurs, so a
# singular/plural pair listed separately ('barn' + 'barns') counts twice when the
# plural appears in the text.
KW_OUTBUILDINGS = ['barn', 'barns', 'grange', 'outbuilding', 'outbuildings', 'dependance',
                   'dépendance', 'dépendances', 'atelier', 'workshop', 'hangar', 'gite',
                   'gîte', 'chai', 'écurie', 'stable', 'stables', 'bijgebouw', 'schuur']
# Character / design-story cues (feeds design_story in derive_proxy_criteria).
KW_CHARACTER = ['stone', 'pierre', 'character', 'caractère', 'historic', 'historique',
                'longère', 'longere', 'farmhouse', 'ferme', 'charentaise', 'maison de maître',
                'manoir', 'château', 'chateau', 'period', '18th', '19th', 'ancien', 'authentic',
                'beam', 'poutre', 'fireplace', 'cheminée']
# Kitchen / dining cues (feeds food_experience in derive_proxy_criteria).
KW_FOOD = ['kitchen', 'cuisine', 'dining', 'salle à manger', 'dinatoire', 'pizza',
           'bread oven', 'four à pain', 'summer kitchen', 'cuisine d\'été']
# Guest-hosting cues (feeds guest_accommodation in derive_proxy_criteria).
KW_GUEST = ['pool', 'piscine', 'gite', 'gîte', 'guest', 'chambre d\'hôte', 'b&b',
            'bed and breakfast', 'separate', 'independent', 'annexe']


def _text_blob(p):
    """Return one lowercased string of all listing-authored text on property *p*.

    Joins title, summary, description, analysis, the keyword_signals list and
    extra['description'] (whitespace-collapsed) with single spaces, skipping
    any part that is missing, None or empty.
    """
    # NB: deliberately does NOT include gpt_summary — LLM-authored prose must not feed
    # the hard gates that read this blob (derive_condition_score → KW_RUIN, score floor).
    # Round-2 review caught a real gate flip when it was briefly included.
    keyword_signals = p.get('keyword_signals') or []
    # `or ''` (not a .get default) so an explicit null description is tolerated
    # instead of crashing on None.split().
    extra_description = (p.get('extra') or {}).get('description') or ''
    parts = [
        p.get('title', ''), p.get('summary', ''),
        p.get('description', ''), p.get('analysis', ''),
        ' '.join(keyword_signals),
        ' '.join(extra_description.split()),
    ]
    return ' '.join(filter(None, parts)).lower()


def _kw_hits(blob, bank):
    """Count how many entries of *bank* occur as substrings of *blob*.

    Each bank entry counts at most once, however often it appears; entries that
    overlap (e.g. 'barn' and 'barns') are counted separately.
    """
    hits = 0
    for keyword in bank:
        if keyword in blob:
            hits += 1
    return hits


def derive_proxy_criteria(p):
    """Derive a criteria-shaped dict from listing keywords + attributes — NO LLM.

    Stand-in for p['criteria'] when no prior GPT analysis exists. Coarser than
    an LLM read, but dependency-free; the in-session precision pass refines the
    top N afterwards. The returned dict carries '_proxy': True and leaves
    'livability' as None (filled from amenities downstream).
    """
    blob = _text_blob(p)
    building = p.get('building_size_m2') or p.get('building_size') or 0
    land = p.get('land_size_m2') or p.get('land_size') or 0
    beds = p.get('bedrooms') or 0

    # workshop (heavy weight): outbuilding cues first, building size as fallback
    outbuilding_hits = _kw_hits(blob, KW_OUTBUILDINGS)
    if outbuilding_hits >= 2:
        workshop = 5
    elif outbuilding_hits == 1:
        workshop = 4
    elif building >= 200:
        workshop = 3
    else:
        workshop = 2

    # food_experience (heavy weight): kitchen/dining cues + sizable house
    food_hits = _kw_hits(blob, KW_FOOD)
    if (food_hits >= 1 and building >= 150) or food_hits >= 2:
        food_experience = 4
    else:
        food_experience = 3

    # guest_accommodation: bedroom count is primary, guest cues add one step
    guest_hits = _kw_hits(blob, KW_GUEST)
    if beds >= 5:
        guest = 5
    elif beds == 4:
        guest = 4
    elif beds == 3:
        guest = 3
    elif beds:
        guest = 2
    elif guest_hits:
        guest = 3
    else:
        guest = 2
    if guest_hits >= 1 and guest < 5:
        guest += 1

    # design_story: character keywords
    character_hits = _kw_hits(blob, KW_CHARACTER)
    if character_hits >= 3:
        design_story = 5
    elif character_hits >= 1:
        design_story = 4
    else:
        design_story = 3

    # market_garden: land substrate, capped when soil pollution is flagged (rule 2 + 5)
    market_garden = 1
    for floor_m2, band_score in ((30000, 5), (10000, 4), (5000, 3), (3000, 2)):
        if land >= floor_m2:
            market_garden = band_score
            break
    if 'Pollution des sols' in (p.get('risk_labels') or []):
        market_garden = min(market_garden, 2)  # rule 5: cap on contamination flag

    # location: region vibe from listing text + search region; 3 when no region matches
    region_text = blob + ' ' + (p.get('search_region', '') or '')
    vibe, _ = get_vibe(region_text)
    location = vibe if vibe else 3

    return {
        'workshop': workshop,
        'food_experience': food_experience,
        'guest_accommodation': guest,
        'design_story': design_story,
        'market_garden': market_garden,
        'location': location,
        'local_market': 3,
        'livability': None,  # filled from amenities downstream
        '_proxy': True,
    }


def hazard_adjustment(p):
    """Rules 1, 3 + 5: deterministic penalties for hazards the LLM can't see.

    Reads p['risk_labels'] (list of georisques label strings), the optional
    counters 'nuclear_count' / 'seveso_any_count', and the commune-level
    'has_flood_risk' / 'clay_risk' fields. Counters stored as None are treated
    as 0.

    Returns (score_delta, reasons): a non-positive float and the list of
    reasons for each penalty applied. Applied to the final CP score.
    """
    labels = p.get('risk_labels') or []
    delta = 0.0
    reasons = []

    # `or 0` rather than a .get default: the key may exist with a null value,
    # and None > 0 would raise TypeError.
    nuclear_count = p.get('nuclear_count') or 0
    seveso_count = p.get('seveso_any_count') or 0

    if 'Nucléaire' in labels or nuclear_count > 0:
        delta -= 1.5
        reasons.append('nuclear installation (commune)')
    if 'Installations industrielles classées (ICPE)' in labels or seveso_count > 0:
        delta -= 0.6
        reasons.append('ICPE industrial site')
    if 'Canalisations de transport de matières dangereuses' in labels:
        delta -= 0.5
        reasons.append('hazmat transport pipeline')
    if 'Pollution des sols' in labels:
        delta -= 0.4
        reasons.append('soil pollution flag (verify for farming)')

    # Rule 3: flood + clay are COMMUNE-level (geocoded to commune centre, not
    # parcel) so they over-report. Treat as a graded penalty + verify-flag,
    # NOT a hard gate. Only the severe combo (flood + clay=fort) bites meaningfully.
    flood = p.get('has_flood_risk')
    clay = p.get('clay_risk')
    if flood and clay == 'fort':
        delta -= 0.4
        reasons.append('flood + strong clay (foundation survey — commune-level, verify parcel)')
    elif clay == 'fort':
        delta -= 0.2
        reasons.append('strong clay shrink-swell (verify parcel)')
    return delta, reasons


LAND_FLOOR_M2 = _LAND_FLOOR   # YAML-driven (gates.land.min_m2). Below this = removed.

# Mature-landscape signals — a definite plus for a retreat (charm + instant
# maturity + self-sufficiency). Trees/orchards take decades; you can't buy them later.
# Tested as plain substrings of the lowercased listing text (via _kw_hits in
# land_substrate_bonus), so truncated stems such as 'centenni' and 'châtaigni'
# cover several word endings on purpose.
KW_MATURE_LAND = ['mature tree', 'mature garden', 'centenni', 'century-old', 'old trees',
                  'ancient tree', 'orchard', 'verger', 'fruit tree', 'olive grove',
                  'oliveraie', 'vines', 'vineyard', 'vigne', 'walnut', 'noyer', 'chestnut',
                  'châtaigni', 'oak', 'chêne', 'parkland', 'parc arboré', 'arbres centenaires',
                  'specimen tree', 'wooded', 'boisé', 'woodland']


def land_substrate_bonus(p):
    """Rule 2 (RETREAT-recalibrated): small, capped bonus for land + mature planting.

    Land is buildings-first: nothing below 1ha, a mild bonus from 1ha, a
    slightly larger one from 2ha, then FLAT — extra hectares stop winning (a
    characterful longère + barns on 1ha beats a cottage on 5ha for a retreat).
    The land part is skipped entirely when soil pollution is flagged. On top of
    that, mature trees / orchards / groves mentioned in the listing earn their
    own bonus: they take decades to grow and cannot be bought later.

    Returns (bonus rounded to 2 decimals, list of reasons).
    """
    land = p.get('land_size_m2') or p.get('land_size') or 0
    soil_flagged = 'Pollution des sols' in (p.get('risk_labels') or [])
    reasons = []

    # Land component: mild and capped. 0.3–1ha meets the floor but earns nothing.
    land_part = 0.0
    if is_likely_land(land) and not soil_flagged:
        hectares = f'{land/10000:.1f}ha'
        if land >= 20000:
            land_part = 0.20
            reasons.append(hectares + ' (ample, capped)')
        elif land >= 10000:
            land_part = 0.15
            reasons.append(hectares)

    # Mature-landscape component: distinct keyword hits in the listing text.
    planting_part = 0.0
    planting_hits = _kw_hits(_text_blob(p), KW_MATURE_LAND)
    if planting_hits >= 2:
        planting_part = 0.25
        reasons.append('mature trees/orchard/grove')
    elif planting_hits == 1:
        planting_part = 0.12
        reasons.append('some mature planting')

    return round(land_part + planting_part, 2), reasons


# Water/bathing features — year-round revenue (extended season) + on-thesis charm.
# Natural swimming pool / pond weighted highest: ecological, beautiful, exactly the
# retreat aesthetic. Conventional pool = season-extender. Pond/lake = water + charm.
# All three banks are lowercase and are matched against the lowercased listing
# text by retreat_feature_bonus(). NOTE(review): several KW_WATER entries are very
# short ('lac', 'well', 'pond', 'source') and occur inside unrelated words when
# matched as raw substrings ('place', 'dwelling', 'correspond', 'ressource').
KW_NATURAL_POOL = ['natural swimming', 'natural pool', 'piscine naturelle',
                   'baignade naturelle', 'swimming pond', 'étang de baignade',
                   'bio pool', 'eco pool', 'biotope']
KW_POOL = ['swimming pool', 'piscine', 'heated pool', 'plunge pool', 'pool']
KW_WATER = ['pond', 'étang', 'lake', 'lac', 'stream', 'ruisseau', 'spring', 'source',
            'river', 'rivière', 'well', 'puits']


def retreat_feature_bonus(p):
    """Pool / swimming pond / natural pool + water features. Returns (bonus, reasons).

    Pool tiers are mutually exclusive: a natural pool/pond (+0.25) takes
    precedence over a conventional pool (+0.15). A separate water
    feature/source adds +0.10. The total is capped at 0.35 and rounded to two
    decimals.

    Pool phrases are matched as substrings of the lowercased listing text.
    Water keywords are matched as WHOLE WORDS (optionally followed by a plural
    's' or 'x'), because several are short enough to occur inside unrelated
    words — 'lac' in "place", 'pond' in "correspond", 'river' in "driver" — and
    would otherwise award the bonus to almost every listing.
    """
    blob = _text_blob(p)
    bonus = 0.0
    reasons = []
    if any(kw in blob for kw in KW_NATURAL_POOL):
        bonus += 0.25
        reasons.append('natural swimming pool/pond (on-thesis)')
    elif any(kw in blob for kw in KW_POOL):
        bonus += 0.15
        reasons.append('swimming pool (season-extender)')
    # Water source/feature (separate from a pool — irrigation + charm)
    if KW_WATER:  # an empty alternation would match everything
        # \w is Unicode-aware for str patterns, so accented letters count as
        # word characters and 'étang' is bounded correctly.
        water_pattern = (r'(?<!\w)(?:' + '|'.join(re.escape(kw) for kw in KW_WATER)
                         + r')[sx]?(?!\w)')
        if re.search(water_pattern, blob):
            bonus += 0.10
            reasons.append('water feature/source')
    return round(min(bonus, 0.35), 2), reasons


# Property typology — two preferred forms for a retreat:
# 1. FORMER FARM COMPLEX — multiple buildings round a courtyard = the buildings-first
#    ideal (letting units + studio + workshop + privacy, all at once). The research's
#    #1 physical enabler (Amassa, Villa Magnan, Maison de Lunel).
# 2. VINEYARD DOMAIN — charm + terroir + a ready product line. NOTE: the soil-pollution
#    penalty in hazard_adjustment() stays independent — old vineyards carry copper/
#    pesticide legacy, so a vineyard can earn this bonus AND the pollution warning.
# Both banks are lowercase and are matched as plain substrings of the lowercased
# listing text. ' mas ' and ' chai' carry explicit spaces — presumably so these
# very short words do not match inside longer ones (e.g. 'chai' in 'prochain');
# the leading space also means they cannot match at the very start of the text.
KW_FARM_COMPLEX = ['corps de ferme', 'former farm', 'farm complex', 'ancienne ferme',
                   'old farm', 'fermette', 'hameau', 'hamlet', ' mas ', 'bastide',
                   'domaine', 'longère', 'longere', 'agricultural complex',
                   'multiple buildings', 'several outbuildings', 'courtyard', 'cour fermée',
                   'group of buildings', 'ensemble de bâtiments']
KW_VINEYARD = ['vineyard', 'vignoble', 'wine estate', 'domaine viticole', 'winery',
               'wine domain', ' chai', 'cognac', 'vines', 'vigne', 'viticole']


# Condition keyword banks (read from description). Ruin/heavy = budget killer +
# 12-18mo no-revenue; renovated/turnkey = open sooner. Deterministic, no LLM.
# Needs-work phrases. Lowercase; matched as plain substrings of the lowercased text.
# NOTE(review): 'to be renovated' and 'a renover' contain the KW_RENOVATED entries
# 'renovated' and 'renove' as substrings, so a consumer that tests KW_RENOVATED
# first will read those needs-work phrases as "renovated" unless it guards for it.
KW_RUIN = ['à restaurer', 'a restaurer', 'to restore', 'to renovate', 'to be renovated',
           'à rénover', 'a renover', 'ruin', 'ruine', 'gros oeuvre', 'gros œuvre', 'shell',
           'renovation required', 'needs renovation', 'for renovation', 'rénovation complète',
           'entièrement à rénover', 'restauration', 'habitable après travaux']
# Work-already-done / turnkey phrases (same matching rules as KW_RUIN).
KW_RENOVATED = ['renovated', 'rénové', 'renove', 'restored', 'restauré', 'move-in', 'turnkey',
                'clé en main', 'cle en main', 'refait à neuf', 'refait a neuf', 'tastefully restored',
                'beautifully renovated', 'fully renovated', 'recently renovated', 'habitable immédiatement']

# Track 1 — text-detectable FACILITATION signals (cyber-prairie-criteria.yaml § facilitation).
# Practical aspects the operation needs that listings often state literally. We only score
# what the text discloses; capacity/fitness that needs a floor-plan or site visit stays in
# the viewing checklist (Track 2), not here. Grouped by category — a property earns the bonus
# per DISTINCT category present, so breadth of operational readiness is rewarded, not repetition.
# Maps a facilitation category label → the lowercase phrases that signal it.
# NOTE(review): no consumer of this dict is visible in this part of the file; the
# per-distinct-category bonus described above is presumably implemented further down.
KW_FACILITATION = {
    '3-phase power': ['triphasé', 'tri-phasé', 'triphase', '3-phase', 'three-phase', 'three phase',
                      'force motrice', 'corrente trifase'],
    'level/workable land': ['terrain plat', 'terrain plain-pied', 'level ground', 'flat land',
                            'level land', 'gently sloping', 'terreno pianeggiante', 'plat et'],
    'south-facing': ['exposition sud', 'plein sud', 'orienté sud', 'oriente sud', 'south-facing',
                     'south facing', 'southern exposure', 'esposizione sud', 'sud-ouest', 'south-west'],
    # Irrigation-specific drilled-supply only — pond/stream/well/source already scored by
    # retreat_feature_bonus (KW_WATER); these terms don't overlap, so no double-count.
    'water source': ['forage', 'borehole', 'captage', 'irrigation'],
    'independent guest unit': ['entrée indépendante', 'entree independante', 'independent entrance',
                               'separate entrance', 'gîte indépendant', 'gite independant',
                               'accès indépendant', 'logement indépendant', 'appartement indépendant',
                               'self-contained', 'independent apartment', 'guest house'],
    'convertible attic': ['combles aménageables', 'combles amenageables', 'grenier aménageable',
                          'combles à aménager', 'convertible attic', 'attic to convert',
                          'sottotetto', 'attic conversion'],
    'mains drainage': ['tout à l\'égout', 'tout a l\'egout', 'mains drainage', 'mains sewer',
                       'assainissement collectif', 'tout-à-l\'égout'],
}


def derive_condition_score(p):
    """Renovation condition 1-5 from explicit field or description keywords.

    5 = turnkey, 3 = neutral/habitable, 1 = ruin. Returns int.

    An explicit p['renovation_score'] wins and is returned as int(). Otherwise
    the listing text decides: a "renovated" keyword yields 5, else a
    needs-work keyword yields 2, else 3.

    Needs-work phrases are masked out of the text before the "renovated" test,
    because some of them contain a renovated keyword as a substring
    ('to be renovated' ⊃ 'renovated', 'a renover' ⊃ 'renove'). Without the
    mask those listings would be scored as turnkey.
    """
    rs = p.get('renovation_score')
    if rs is not None:
        return int(rs)
    blob = _text_blob(p)
    needs_work = any(kw in blob for kw in KW_RUIN)

    # Blank out needs-work phrases (longest first, so a short phrase cannot
    # break up a longer one) and look for renovated cues in what remains.
    masked = blob
    for phrase in sorted(KW_RUIN, key=len, reverse=True):
        masked = masked.replace(phrase, ' ')
    if any(kw in masked for kw in KW_RENOVATED):
        return 5
    if needs_work:
        return 2
    return 3  # neutral / unknown


def condition_adjustment(p):
    """Translate the renovation condition score into a score delta.

    Turnkey is rewarded and ruins are penalised — the spec's 'renovation
    reality' (€900-1500/m² + 12-18mo) made concrete. A neutral or unrecognised
    condition score contributes nothing.

    Returns (delta, reason) where reason is None when no adjustment applies.
    """
    condition = derive_condition_score(p)

    if condition == 5:
        delta, label = 0.30, 'turnkey/renovated'
    elif condition == 4:
        delta, label = 0.15, 'light work'
    elif condition == 2:
        delta, label = -0.30, 'heavy renovation / ruin'
    elif condition == 1:
        delta, label = -0.45, 'ruin (full restoration)'
    else:
        # 3 (neutral) and anything outside 1..5
        delta, label = 0.0, None

    if not label:
        return delta, None
    return delta, f'condition {condition}/5 ({label})'


def _bedrooms(p):
    """Bedroom count for property *p*, or None when it cannot be trusted.

    Uses p['bedrooms'] when it is a number in 1..15; otherwise falls back to
    the first 'N bedroom / chambre / slaapkamer' mention in the listing text.
    Values outside 1..15 (e.g. a parse bug yielding 325) are treated as
    unknown, so garbage cannot feed the capacity/revenue deltas or the display.
    """
    declared = p.get('bedrooms')
    if isinstance(declared, (int, float)) and 1 <= declared <= 15:
        return int(declared)

    found = re.search(r'(\d+)\s*(?:bedroom|chambre|slaapkamer)', _text_blob(p), re.I)
    if found is None:
        return None
    count = int(found.group(1))
    return count if 1 <= count <= 15 else None


def capacity_gate(p):
    """Soft B&B-viability gate on bedroom count. Returns (delta, reason).

    A retreat needs guest rooms; one bedroom cannot host. Low counts are
    penalised rather than removed, and the penalty is halved when the listing
    shows a farm-complex form or a counted outbuilding/barn — those can be
    converted into letting units. Unknown bedroom counts are not gated here.
    (Introduced after a 1-bed property ranked #1 on charm alone.)
    """
    beds = _bedrooms(p)
    # Unknown → no gate (flagged elsewhere); three or more bedrooms → viable.
    if beds is None or beds >= 3:
        return 0.0, None

    # Expansion potential: farm-complex vocabulary, or "<N> dépendance/outbuilding/barn/grange".
    blob = _text_blob(p)
    farm_form = any(kw in blob for kw in KW_FARM_COMPLEX)
    counted_outbuilding = re.search(r'(\d+)\s*(?:dépendance|outbuilding|barn|grange)', blob, re.I)
    convertible = farm_form or counted_outbuilding

    if beds == 2:
        if convertible:
            return -0.2, 'only 2 bed (convertible outbuildings)'
        return -0.4, 'only 2 bedrooms'

    # Remaining case: a single bedroom.
    if convertible:
        return -0.4, '1 bed but convertible outbuildings'
    return -0.8, 'only 1 bedroom — not B&B-viable'


def _is_convertible(p):
    """True when the listing shows buildings that could become letting units.

    That is: farm-complex vocabulary, any outbuilding keyword, or a counted
    mention such as '3 barns' / '2 dépendances'.
    """
    blob = _text_blob(p)
    for bank in (KW_FARM_COMPLEX, KW_OUTBUILDINGS):
        for keyword in bank:
            if keyword in blob:
                return True
    counted = re.search(r'(\d+)\s*(?:dépendance|outbuilding|barn|grange)', blob, re.I)
    return counted is not None


def expandability_bonus(p):
    """Bonus for room to grow: convertible outbuildings and/or land headroom.

    Research + revealed preference: successful retreats START small and GROW,
    and the user's 54-favourite pass (2026-05-30) showed outbuildings /
    dépendances / gîte potential to be the most consistent attribute of
    hand-picked favourites (7/8 of the top-ranked). Hence the near-character
    weight: a characterful 2-bed with barns should out-rank a charmless 3-bed
    with nowhere to go.

    Returns (bonus, reason); (0.0, None) when nothing applies.
    """
    land = p.get('land_size_m2') or p.get('land_size') or 0
    convertible = _is_convertible(p)
    plausible_land = is_likely_land(land)

    if convertible:
        if plausible_land and land >= 5000:
            return 0.35, 'expandable (outbuildings + land headroom)'
        if plausible_land and land >= 3000:
            return 0.25, 'expandable (outbuildings + adequate land)'
        return 0.18, 'outbuildings to convert (room to grow)'

    # No buildings to convert: only a large plot still counts as headroom.
    if plausible_land and land >= 10000:
        return 0.10, 'land headroom to expand'
    return 0.0, None


def revenue_viability_bonus(p):
    """Bonus for properties that can carry several revenue streams at once.

    Research's #1 success factor is the DIVERSIFIED model (B&B + products +
    workshops/events), so the reward is for stacking streams, not just rooms.

    Streams counted:
      B&B             - 3+ bedrooms, or convertible outbuildings
      products        - plausible land of 5000 m2+ with no soil-pollution label
      workshop/events - convertible outbuildings, or a building of 200 m2+

    Returns:
        (bonus, reason): 0.45 for three streams, 0.25 for two, else (0.0, None).
    """
    bedrooms = _bedrooms(p) or 0
    land_m2 = p.get('land_size_m2') or p.get('land_size') or 0
    building_m2 = p.get('building_size_m2') or p.get('building_size') or 0
    can_convert = _is_convertible(p)
    soil_polluted = 'Pollution des sols' in (p.get('risk_labels') or [])

    candidates = (
        ('B&B', bedrooms >= 3 or can_convert),
        ('products', is_likely_land(land_m2) and land_m2 >= 5000 and not soil_polluted),
        ('workshop/events', can_convert or building_m2 >= 200),
    )
    streams = [name for name, viable in candidates if viable]
    joined = '+'.join(streams)

    # 4-pillar rebalance 2026-06-08: revenue now rivals character (was 0.25/0.12).
    # The YAML range is [-0.15, +0.45]; this function itself only ever returns
    # 0.0, 0.25 or 0.45.
    if len(streams) >= 3:
        return 0.45, f'3 revenue streams ({joined})'
    if len(streams) == 2:
        return 0.25, f'2 revenue streams ({joined})'
    return 0.0, None


def critical_data_penalty(p):
    """Penalise missing keystone data instead of scoring it as neutral.

    User directive 2026-06-08: an unknown keystone characteristic must rank BELOW
    a confirmed one. Land is the keystone of a homestead, so an unknown land size
    carries the heaviest penalty; an unknown building size carries a light one.
    This removes the null-bias that let land-less town/village houses ride
    character + condition up the list (Callac, Vers-Pont-du-Gard). It is a
    penalty, not a gate: the property stays visible, just ranked lower.

    Returns:
        (delta, reasons): delta is <= 0 and rounded to 2 decimals.
    """
    # Land reads only ``land_size_m2``; building also falls back to the legacy
    # ``building_size`` key. A value counts as known only when it is numeric.
    checks = (
        (p.get('land_size_m2'), 0.40, 'land size unverified (keystone unknown)'),
        (p.get('building_size_m2') or p.get('building_size'), 0.10, 'building size unknown'),
    )
    total = 0.0
    notes = []
    for value, cost, note in checks:
        if isinstance(value, (int, float)):
            continue
        total -= cost
        notes.append(note)
    return round(total, 2), notes


def facilitation_bonus(p):
    """Track 1: small bonus for operational facilitations stated in the listing text.

    Categories come from KW_FACILITATION (3-phase power, level/workable land,
    south aspect, water source, independent guest unit, convertible attic, mains
    drainage). Each distinct category found is worth +0.04, capped at +0.20.
    Deliberately small: these confirm the property can host the operation rather
    than being headline signals; anything needing a floor plan or a visit belongs
    in the viewing checklist (Track 2).

    Returns:
        (bonus, reason); (0.0, None) when no category matches.
    """
    text = _text_blob(p).lower()
    matched = []
    for category, keywords in KW_FACILITATION.items():
        if any(keyword in text for keyword in keywords):
            matched.append(category)
    if not matched:
        return 0.0, None
    capped = min(0.20, round(0.04 * len(matched), 2))
    return capped, 'facilitation: ' + ', '.join(matched)


def character_adjustment(p):
    """Turn the vision-derived character score (1-5) into a ranking delta.

    Character is the product a retreat sells, so it sits near the top of the
    stack: 0.25 per point away from 3, i.e. +/-0.5 across the 1-5 range
    (comparable to hazard penalties). Only photo-assessed properties carry the
    score; when it is absent there is no adjustment and no penalty.

    Returns:
        (delta, label); (0.0, None) when ``character_score`` is missing.
    """
    score = p.get('character_score')
    if score is None:
        return 0.0, None
    # 5 -> +0.5, 4 -> +0.25, 3 -> 0, 2 -> -0.25, 1 -> -0.5
    delta = round((score - 3) * 0.25, 2)
    tags = p.get('character_tags') or ''
    suffix = f' ({tags})' if tags else ''
    return delta, f'character {score}/5' + suffix


def amenity_distance_adjustment(p):
    """Soft signal for amenity proximity.

    Replaces the hard 'no hospital' / 'no bakery+supermarket' gates, which were
    too brittle against OSM/Overpass coverage gaps (see comments in
    check_tier1_gates).

    Scoring (summed, then clamped to [-0.25, +0.10]):
      hospital     <=20km: +0.05 | 20-50km: 0 | >50km or unknown: -0.05
      supermarket  <=10km: +0.05 | 10-25km: 0 | >25km or unknown: -0.05
      bakery       <=10km: +0.02 | bakery AND supermarket both unknown: -0.05

    'Unknown' is penalised mildly because rural access cannot be verified, but
    never enough to bury a strong property. A property with no ``amenities``
    data at all gets no adjustment.

    Returns:
        (delta, reason); reason joins the individual notes with ' / '.
    """
    amenities = p.get('amenities') or {}
    if not amenities:
        return 0.0, None

    def km_of(key):
        # Each amenity entry is expected to be a dict carrying a 'km' distance.
        entry = amenities.get(key)
        return entry.get('km') if isinstance(entry, dict) else None

    def grade(label, km, near, far):
        # Returns (delta, note); note is None inside the neutral band.
        if km is None:
            return -0.05, f'{label} unknown'
        if km <= near:
            return 0.05, f'{label} {km}km'
        if km > far:
            return -0.05, f'{label} {km}km (>{far})'
        return 0.0, None

    total = 0.0
    notes = []
    supermarket_km = km_of('supermarket')
    for label, km, near, far in (('hospital', km_of('hospital'), 20, 50),
                                 ('supermarket', supermarket_km, 10, 25)):
        step, note = grade(label, km, near, far)
        if note is not None:
            total += step
            notes.append(note)

    bakery_km = km_of('bakery')
    if bakery_km is None and supermarket_km is None:
        total -= 0.05
        notes.append('no bakery/supermarket data')
    elif bakery_km is not None and bakery_km <= 10:
        total += 0.02
        notes.append(f'bakery {bakery_km}km')

    total = max(-0.25, min(0.10, round(total, 2)))
    reason = ' / '.join(notes) if notes else None
    return total, reason


def regional_preference_adjustment(p):
    """Regional preference delta keyed on the vibe score of the matched region.

    Derived from J+G revealed preferences (2026-06-13): Brittany (vibe 1) was
    rejected 6/6, Charente/Dordogne (vibe 4) loved 3/3. Applying it as a delta
    keeps the preference in cp_score even when GPT rates the location highly.

      vibe 5 -> +0.15  (Hérault, Drôme, Cévennes - desired but untested)
      vibe 4 -> +0.10  (Charente-Maritime, Dordogne - confirmed loved)
      vibe 3 ->  0.00  (neutral)
      vibe 2 -> -0.20  (mild preference against)
      vibe 1 -> -0.60  (Brittany - strongly rejected, consistent signal)

    Returns:
        (delta, reason); (0.0, None) when no region matches or the delta is zero.
    """
    haystack = ' '.join((
        _text_blob(p),
        p.get('search_region', '') or '',
        p.get('department', '') or '',
    ))
    vibe, region = get_vibe(haystack)
    if not vibe:
        return 0.0, None

    delta_by_vibe = {5: 0.15, 4: 0.10, 3: 0.0, 2: -0.20, 1: -0.60}
    delta = delta_by_vibe.get(vibe, 0.0)
    if delta == 0.0:
        # Neutral or unmapped vibe: nothing to report.
        return 0.0, None
    if delta > 0:
        direction = 'preferred'
    else:
        direction = 'avoided'
    return delta, f'region {region} ({direction}, vibe {vibe}/5)'


def distance_from_nl_adjustment(p):
    """Score the straight-line distance from The Hague (NL): nearer = easier visits.

    For a Dutch buyer, drive time is the binding constraint on visit frequency,
    Gwenda's involvement during build-out, and ferry-runs for furniture. The
    scorer already computed dist_km via haversine but never weighted it.

    Tiers (one-way drive estimate: haversine x 1.25 road-factor / 80 km/h):
      <500km    (~6h):   +0.15  weekend-doable, ferry day-trip possible
      500-700   (~7h):   +0.05
      700-900   (~8.5h):  0
      900-1100  (~10h):  -0.15
      >=1100    (~12h+): -0.30  overnight needed for any visit

    Returns:
        (delta, reason). Properties without numeric ``lat``/``lon`` get
        (0.0, None): no signal, no penalty.
    """
    lat, lon = p.get('lat'), p.get('lon')
    if not isinstance(lat, (int, float)) or not isinstance(lon, (int, float)):
        return 0.0, None

    km = haversine(DH_LAT, DH_LON, lat, lon)
    # (exclusive upper bound in km, delta, reason template), nearest tier first.
    tiers = (
        (500, 0.15, '≈{km:.0f}km NL — weekend-doable'),
        (700, 0.05, '≈{km:.0f}km NL — easy day-and-half'),
        (900, 0.0, '≈{km:.0f}km NL — long day drive'),
        (1100, -0.15, '≈{km:.0f}km NL — overnight needed for visits'),
    )
    for limit, delta, template in tiers:
        if km < limit:
            return delta, template.format(km=km)
    return -0.30, '≈{km:.0f}km NL — multi-day commitment per visit'.format(km=km)


def community_vitality_adjustment(p):
    """Adjust for how alive the commune is, judged by its population.

    Population comes from community_vitality.py (INSEE via geo.api.gouv.fr).
    Research: deep-rural projects relocate for lack of footfall (Limousin); a
    healthy village means services, guest access and life. The middle band is
    rewarded and both extremes (too remote / too urban) are penalised.

    Returns:
        (delta, reason); (0.0, None) when ``commune_population`` is unknown.
    """
    pop = p.get('commune_population')
    if pop is None:
        return 0.0, None

    # (band matches?, delta, reason template), smallest communes first.
    bands = (
        (pop < 150, -0.25, 'very remote commune ({pop} pop — footfall risk)'),
        (pop < 500, -0.10, 'small commune ({pop} pop)'),
        (pop <= 10000, 0.10, 'healthy village/town ({pop} pop — alive)'),
        (pop <= 25000, 0.0, 'market town ({pop} pop)'),
    )
    for matches, delta, template in bands:
        if matches:
            return delta, template.format(pop=pop)
    return -0.15, 'urban commune ({pop} pop)'.format(pop=pop)


def proximity_penalty(p):
    """Graded penalties for close neighbours, a busy road, and flagged-absent internet.

    The first two come from real review feedback: adjacent neighbours killed #2
    and a busy road killed #4. Both use parcel-precise OSM data (urban_density.py).

    Returns:
        (delta, reasons): delta is <= 0 and rounded to 2 decimals.
    """
    penalty = 0.0
    notes = []

    # Privacy: buildings within 50m. The property's own house/barn count too, so
    # 1-2 is roughly private; penalties start at 4.
    nearby = p.get('privacy_buildings_50m')
    if nearby is not None and nearby >= 4:
        if nearby >= 6:
            penalty -= 0.5
            notes.append(f'hemmed in ({nearby} buildings <50m)')
        else:
            penalty -= 0.3
            notes.append(f'close neighbours ({nearby} buildings <50m)')

    # Road noise: distance to the nearest motorway/trunk/primary/secondary road.
    road_m = p.get('major_road_m')
    if road_m is not None and road_m < 150:
        if road_m < 75:
            penalty -= 0.5
            notes.append(f'busy road {road_m}m away (noise)')
        else:
            penalty -= 0.25
            notes.append(f'major road {road_m}m away')

    # Internet (Tier-1 must-have). Starlink covers ~all rural France, so only an
    # EXPLICIT no-internet statement in the listing text is penalised; silence
    # is not treated as unavailability.
    # NOTE(review): 'no mains' also matches "no mains drainage" / "no mains gas",
    # which says nothing about internet — confirm this keyword is intended.
    absent_markers = ('no internet', "pas d'internet", 'no fibre', 'no broadband',
                      'sans internet', 'no phone line', 'no mains')
    text = _text_blob(p)
    if any(marker in text for marker in absent_markers):
        penalty -= 0.3
        notes.append('internet flagged absent (verify Starlink)')

    return round(penalty, 2), notes


def typology_bonus(p):
    """Bonus for the preferred property types.

    Former farm complexes (the buildings-first ideal) earn +0.25 and vineyard /
    wine domains (charm + terroir + product) earn +0.20; together they are
    capped at +0.4.

    Returns:
        (bonus, reasons).
    """
    text = _text_blob(p)
    total = 0.0
    notes = []

    if any(keyword in text for keyword in KW_FARM_COMPLEX):
        total += 0.25
        notes.append('former farm complex (buildings-first)')

    # 'vineyeard' is accepted alongside the correct spelling of the sub_type value.
    sub_type = (p.get('sub_type') or '').lower()
    is_vineyard = (any(keyword in text for keyword in KW_VINEYARD)
                   or sub_type in ('vineyard', 'vineyeard'))
    if is_vineyard:
        total += 0.20
        notes.append('vineyard / wine domain (terroir + product)')

    # Cap the combined typology bonus.
    return round(min(total, 0.4), 2), notes


def compute_data_confidence(p):
    """Rule 4: count how many real data points back the score (0-5).

    One point each for: a building size, a land size, a price, a hazard check
    (risk_score present or georisques_enriched set), and non-proxy criteria.
    The value is surfaced alongside the score; it does not gate anything.
    """
    criteria = p.get('criteria')
    evidence = (
        p.get('building_size_m2') or p.get('building_size'),
        p.get('land_size_m2') or p.get('land_size'),
        p.get('price'),
        p.get('risk_score') is not None or p.get('georisques_enriched'),
        criteria and not criteria.get('_proxy'),
    )
    return sum(1 for item in evidence if item)


def renovation_scope_score(p):
    """Renovation condition on a 1 (ruin) .. 5 (move-in) scale, or None.

    Prefers ``renovation_score`` from the detail-page read. Otherwise maps the
    search-card ``renovation_estimate`` (minor/moderate/major, case-insensitive)
    to 4/3/1. Returns None when neither is usable, in which case the CRITERIA
    default of 3 applies.
    """
    detail_score = p.get('renovation_score')
    if detail_score is not None:
        return detail_score

    estimate_to_score = {'minor': 4, 'moderate': 3, 'major': 1}
    estimate = (p.get('renovation_estimate') or '').lower()
    return estimate_to_score.get(estimate)


def compute_cp_score(p):
    """Compute the Cyber Prairie score for one property dict.

    The weighted mean of the CRITERIA scores (``raw``) is adjusted by a stack of
    deterministic deltas/bonuses, then floored at 0 and capped at 5 (``final``).

    Returns:
        (raw, final, scores, missing):
          raw     - weighted mean of the criteria, rounded to 2 decimals
          final   - raw plus all adjustments, clamped to [0, 5], rounded
          scores  - criterion name -> value used, plus ``_``-prefixed breakdown
                    entries for EVERY adjustment folded into ``final``
          missing - criteria with neither a value nor a default
        When no criterion carries weight the result is (0, 0, scores, missing).
    """
    criteria = p.get('criteria', {})
    # No GPT criteria → derive deterministic proxies (no LLM, no OpenAI)
    if not criteria:
        criteria = derive_proxy_criteria(p)

    # Use computed livability from amenities if available, else GPT estimate
    amenity_livability = compute_livability_from_amenities(p.get('amenities'))
    livability = amenity_livability if amenity_livability is not None else criteria.get('livability')

    scores = {
        'workshop': criteria.get('workshop'),
        'location_view': criteria.get('location'),  # proxy
        'food_experience': criteria.get('food_experience'),
        'guest_accommodation': criteria.get('guest_accommodation'),
        'livability': livability,
        'environmental_risk': compute_environmental_risk_score(p),
        'design_story': criteria.get('design_story'),
        'market_garden': criteria.get('market_garden'),
        'land_size': score_land_size(p.get('land_size_m2')),
        'renovation_scope': renovation_scope_score(p),  # from detail-page condition read
        'local_market': criteria.get('local_market'),
    }

    weighted_sum = 0.0
    weight_sum = 0.0
    missing = []

    # A criterion without a value falls back to its configured default; one with
    # neither is left out of the mean and reported in ``missing``.
    for name, config in CRITERIA.items():
        val = scores[name]
        if val is None:
            val = config['default']
        if val is not None:
            weighted_sum += val * config['weight']
            weight_sum += config['weight']
            scores[name] = val
        else:
            missing.append(name)

    if weight_sum == 0:
        return 0, 0, scores, missing

    raw = weighted_sum / weight_sum

    # Deterministic ground-truth adjustments — post-weighting
    hz_delta, hz_reasons = hazard_adjustment(p)
    sub_bonus, sub_reasons = land_substrate_bonus(p)        # land (capped) + mature trees/orchard
    feat_bonus, feat_reasons = retreat_feature_bonus(p)     # pool / natural pond / water
    typ_bonus, typ_reasons = typology_bonus(p)              # farm complex / vineyard domain
    prox_delta, prox_reasons = proximity_penalty(p)         # privacy (neighbours) + road noise
    char_delta, char_reason = character_adjustment(p)       # vision-derived charm (near-top weight)
    cap_delta, cap_reason = capacity_gate(p)                # bedrooms / B&B viability
    cond_delta, cond_reason = condition_adjustment(p)       # renovation condition (ruin↓ turnkey↑)
    exp_bonus, exp_reason = expandability_bonus(p)          # room to grow (research: start-small-grow)
    rev_bonus, rev_reason = revenue_viability_bonus(p)      # diversified-revenue capacity
    fac_bonus, fac_reason = facilitation_bonus(p)           # Track 1: text-stated operational facilitations
    crit_delta, crit_reasons = critical_data_penalty(p)     # penalize missing keystone data (land)
    vit_delta, vit_reason = community_vitality_adjustment(p)  # commune alive vs too-remote/urban
    nl_delta, nl_reason = distance_from_nl_adjustment(p)    # haversine from The Hague (NL)
    reg_delta, reg_reason = regional_preference_adjustment(p)  # J+G revealed regional prefs 2026-06-13
    am_delta, am_reason = amenity_distance_adjustment(p)    # soft: hospital/supermarket/bakery proximity
    urban_delta, urban_reasons = compute_urban_penalty(p)

    # Rule 4 ACTIVE: discount properties we cannot hazard-check. A property with
    # neither a risk_score NOR a building_density has escaped the nuclear/industrial/
    # urban screens entirely (typically Leggett-email listings — région only, no
    # address to geocode). Without that, a charming-sounding listing could sit next
    # to a Seveso site and we'd never know. Modest discount so unvetted listings
    # don't outrank fully-screened ones purely by dodging the penalties.
    unvetted = (p.get('risk_score') is None and p.get('building_density') is None)
    conf_delta = -0.35 if unvetted else 0.0

    final = max(0.0, raw + hz_delta + sub_bonus + feat_bonus + typ_bonus
                + prox_delta + char_delta + cap_delta + cond_delta
                + exp_bonus + rev_bonus + fac_bonus + vit_delta + nl_delta
                + am_delta + urban_delta + conf_delta + crit_delta + reg_delta)
    final = round(min(final, 5.0), 2)

    # Breakdown: one entry per term in the sum above, so ``final`` can be
    # reconciled from ``scores`` alone.
    scores['_hazard_delta'] = round(hz_delta, 2)
    scores['_hazard_reasons'] = hz_reasons
    scores['_substrate_bonus'] = round(sub_bonus, 2)
    scores['_substrate_reasons'] = sub_reasons
    scores['_feature_bonus'] = round(feat_bonus, 2)
    scores['_feature_reasons'] = feat_reasons
    scores['_typology_bonus'] = round(typ_bonus, 2)
    scores['_typology_reasons'] = typ_reasons
    scores['_proximity_delta'] = round(prox_delta, 2)
    scores['_proximity_reasons'] = prox_reasons
    scores['_character_delta'] = round(char_delta, 2)
    scores['_character_reason'] = char_reason
    scores['_capacity_delta'] = round(cap_delta, 2)
    scores['_capacity_reason'] = cap_reason
    scores['_condition_delta'] = round(cond_delta, 2)
    scores['_condition_reason'] = cond_reason
    scores['_expandability_bonus'] = round(exp_bonus, 2)
    scores['_expandability_reason'] = exp_reason
    scores['_revenue_bonus'] = round(rev_bonus, 2)
    scores['_revenue_reason'] = rev_reason
    scores['_facilitation_bonus'] = round(fac_bonus, 2)
    scores['_facilitation_reason'] = fac_reason
    scores['_critical_data_delta'] = round(crit_delta, 2)
    scores['_critical_data_reasons'] = crit_reasons
    scores['_vitality_delta'] = round(vit_delta, 2)
    scores['_vitality_reason'] = vit_reason
    scores['_urban_delta'] = round(urban_delta, 2)
    scores['_urban_reasons'] = urban_reasons
    # These four were already part of ``final`` but previously left out of the
    # breakdown (their reasons were computed and then dropped).
    scores['_nl_delta'] = round(nl_delta, 2)
    scores['_nl_reason'] = nl_reason
    scores['_regional_delta'] = round(reg_delta, 2)
    scores['_regional_reason'] = reg_reason
    scores['_amenity_delta'] = round(am_delta, 2)
    scores['_amenity_reason'] = am_reason
    scores['_confidence_delta'] = conf_delta
    scores['_data_confidence'] = compute_data_confidence(p)
    scores['_unvetted'] = unvetted
    if criteria.get('_proxy'):
        scores['_proxy_criteria'] = True

    return round(raw, 2), final, scores, missing


def check_tier1_gates(p, scores, cp_score):
    """Check Tier 1 auto-removal gates and return the list of triggered gates.

    Args:
        p: property dict.
        scores: per-criterion score dict (only ``guest_accommodation`` is read).
        cp_score: final Cyber Prairie score for the property.

    Returns:
        A list of human-readable gate messages; an empty list means the property
        passes. A truthy ``user_ungate`` on the property bypasses every gate.

    Fields that are present but null (``user_verdict``, ``user_flags``, ``url``,
    ``seveso_high_count``) are treated like absent ones instead of raising.
    """
    triggered = []

    # User explicit override — bypasses all auto-gates (set when borderline gates
    # are manually reviewed and accepted by J+G).
    if p.get('user_ungate'):
        return []

    # User manual review verdict. `or` rather than a .get() default: the default
    # only covers a MISSING key, not a stored null.
    verdict = (p.get('user_verdict') or '').lower()
    if verdict == 'no':
        triggered.append(f"User reviewed: rejected ({', '.join(p.get('user_flags') or [])})")

    # Guest gate: only hard-gate when guest=1 AND building is confirmed tiny/absent.
    # GPT scores 1 for land listings without building descriptions, which is data quality, not reality.
    guest = scores.get('guest_accommodation')
    if guest is not None and guest <= 1:
        building = p.get('building_size_m2')
        bedrooms = p.get('bedrooms')
        if building is not None and building < 50:
            triggered.append(f'Guest capacity = 1 + building only {building:.0f}m2')
        elif building is not None or bedrooms is not None:
            triggered.append('Guest capacity = 1 (cannot do B&B)')

    # Bedroom-capacity gate (softened 2026-05-28): only HARD-gate 1-bed without
    # convertible outbuildings (truly not a B&B — user: "1-bed = out"). 2-bed is
    # MARGINAL not out — handled as a graded penalty in capacity_gate() so it stays
    # visible + flagged for the joint review (user: "2-bed marginal, not out").
    # bedrooms=None = unverified (no gate; data-confidence handles unknowns).
    beds = p.get('bedrooms')
    if beds is not None and beds <= 1:
        blob = ' '.join(filter(None, [p.get('title', ''), p.get('description', ''),
                                      (p.get('extra', {}) or {}).get('description', '')])).lower()
        convertible = (any(kw in blob for kw in KW_FARM_COMPLEX)
                       or any(kw in blob for kw in KW_OUTBUILDINGS)
                       or 'dépendance' in blob or 'outbuilding' in blob)
        if not convertible:
            triggered.append(f'{beds} bedroom — not B&B-viable, no convertible outbuildings')

    # Risk gate: use numeric risk_score from georisques, not GPT's risk_profile text.
    # GPT labels "unknown utilities" as "high" which is NOT environmental risk.
    risk_score_val = p.get('risk_score')
    if risk_score_val is not None and risk_score_val <= 2.0:
        triggered.append(f'High environmental risk (score {risk_score_val}/5)')

    if cp_score > 0 and cp_score < 1.5:
        triggered.append(f'Score {cp_score} below 1.5 threshold')

    # Land-floor gate. is_likely_land() guards LEGACY data ambiguity (some old
    # sources stuffed building-area into the land field). But once our enrichers
    # (leggett_size_enrich Ext-header, greenacres_photo_harvest 'Land:' label,
    # community_vitality, manual entry) verify the value, we should trust it
    # below 1000m² too — otherwise tiny verified parcels skate through the gate.
    # Verified = land_size_verified flag from the enricher, OR a structured source
    # marker (source in {leggett,greenacres} with photo_count>0 implies the
    # detail-page extractor ran and the value came from a labelled field).
    land = p.get('land_size_m2') or p.get('land_size')
    if isinstance(land, (int, float)) and land > 0:
        verified = (
            p.get('land_size_verified') is True
            or (p.get('source') in ('leggett', 'greenacres')
                and (p.get('photo_count') or 0) > 0)
        )
        if (verified or is_likely_land(land)) and land < LAND_FLOOR_M2:
            triggered.append(f'Land {land:,.0f}m2 below {LAND_FLOOR_M2:,}m2 hard floor (market garden non-negotiable)')

    # Utilities gate — only triggers when GPT explicitly confirmed absence
    if p.get('has_electricity') is False:
        triggered.append('No electricity (confirmed)')
    if p.get('has_mains_water') is False:
        triggered.append('No running water (confirmed)')

    price = p.get('price')
    if isinstance(price, (int, float)) and price > 0:
        if price > _PRICE_MAX:
            triggered.append(f'Price EUR {price:,.0f} exceeds {_PRICE_MAX/1000:.0f}k cap')
        if price < _PRICE_MIN:
            triggered.append(f'Price EUR {price:,.0f} below {_PRICE_MIN/1000:.0f}k floor (likely ruin/barn)')
        # Total-cost gate (Strategy A guard): purchase + estimated renovation
        # gate-when-data-PRESENT-AND-FAILS: only triggers when est_total_cost has
        # actually been computed (not all properties carry it yet).
        est_total = p.get('est_total_cost_eur')
        if _PRICE_TOTAL_MAX and isinstance(est_total, (int, float)) and est_total > _PRICE_TOTAL_MAX:
            triggered.append(f'Total cost EUR {est_total:,.0f} exceeds {_PRICE_TOTAL_MAX/1000:.0f}k cap (purchase+reno)')

    # Livable gate — must be habitable now (not a ruin/shell). Condition derived
    # from description keywords ('à restaurer' etc.) or verified renovation_score.
    # Only confirmed ruins/heavy-reno (≤2) are gated; unknown (3) passes.
    if derive_condition_score(p) <= 2:
        triggered.append('Not livable (ruin / heavy renovation required)')

    # Amenity gates — DEACTIVATED 2026-05-30 as hard gates.
    # Reason: Overpass/OSM coverage is unreliable for rural FR. We saw the same
    # commune return a hospital for one listing and None for another (Duravel).
    # The penalty for missing amenities now lives in the soft
    # amenity_distance_adjustment() / livability scoring instead — properties are
    # penalised but stay visible. A coarse data source shouldn't silently kill
    # candidates.
    # If a property is truly far from services, character/vitality already drag
    # it down; we don't need a binary OSM-derived gate on top.
    # (Kept here as a comment so the rationale survives future readings.)
    # OLD:
    #   amenities = p.get('amenities', {})
    #   if amenities:
    #       if amenities.get('hospital') is None and country == 'FR':
    #           triggered.append('No hospital within 50km')
    #       if amenities.get('bakery') is None and amenities.get('supermarket') is None and country == 'FR':
    #           triggered.append('No bakery or supermarket within 20km')

    # Environmental risk gates (from georisques.gouv.fr)
    seismic = p.get('seismic_zone')
    if seismic is not None and seismic >= 4:
        triggered.append(f'Seismic zone {seismic}/5 (high earthquake risk)')

    # A stored null count means "not known", which must not raise on `> 0`.
    seveso_high = p.get('seveso_high_count') or 0
    if seveso_high > 0:
        triggered.append(f"Seveso high-threshold site nearby ({seveso_high})")

    # NOTE: nuclear / ICPE / hazmat / flood / clay are now handled as graded
    # score penalties in hazard_adjustment() rather than hard gates, because
    # commune-centre geocoding over-reports them (rule 3). A nuclear-adjacent
    # property gets a -1.5 hit (usually enough to drop it well out of contention)
    # without being silently removed — it stays visible with the reason attached.

    # Land-only listings (no building)
    url = p.get('url') or ''
    if '/land-for-sale-' in url or '/terrain-' in url:
        triggered.append('Land-only listing (no building)')

    # Semi-detached detection from keywords
    text = ' '.join(filter(None, [
        p.get('title', ''), p.get('summary', ''), p.get('analysis', ''),
    ])).lower()
    semi_keywords = ['semi-detached', 'semi detached', 'mitoyenne', 'mitoyen',
                     'half-vrijstaand', 'terraced', 'townhouse']
    for kw in semi_keywords:
        if kw in text:
            triggered.append(f'Semi-detached/attached ({kw})')
            break

    # Property-TYPE gate (2026-06-08): a homestead is a house/farm with land. Exclude
    # types that aren't candidates at all and only slip through because they have no
    # land/size data to fail the other gates — an apartment, or a sub-35m² unit with no
    # land AND no convertible outbuildings. (Detect type from the source URL + an explicit
    # listing-type phrase, NOT the bare word "apartment" — that appears as a POSITIVE
    # "independent apartment / gîte" facilitation signal.)
    url_l = url.lower()
    building_m2 = p.get('building_size_m2') or p.get('building_size')
    land_m2 = p.get('land_size_m2') or p.get('land_size') or 0
    if '/apartment/' in url_l or 'appartement à vendre' in text or 'apartment for sale' in text:
        triggered.append('Apartment — not a homestead candidate')
    elif (isinstance(building_m2, (int, float)) and building_m2 < 35
          and not is_likely_land(land_m2) and not _is_convertible(p)):
        triggered.append(f'Building only {building_m2:.0f}m2, no land, no outbuildings — not a homestead')

    # ─── NEW YAML-driven Tier-1 gates (audit 2026-05-30) ───
    # All gate-when-data-PRESENT-AND-FAILS: missing data = pass (data_confidence
    # discount applies in the score). This honours the audit findings without
    # nuking the existing store before enrichment catches up.
    triggered.extend(_check_new_tier1_gates(p))

    return triggered


def _check_new_tier1_gates(p):
    """Tier-1 gates added 2026-05-30 from the research audit.

    Covers privacy, road noise, internet, septic compliance, listed-heritage
    exclusion and amenity time-distances, each switched on by its entry in the
    YAML-backed ``_CP.gates``.

    Every gate fires only when the data is PRESENT AND FAILS; missing data
    passes silently and is left to the data_confidence delta downstream.

    Returns:
        List of gate messages (empty when nothing fails).
    """
    failures = []
    gates = _CP.gates

    # Privacy gate — killed property #2 (Saint-Malo). Two independent signals:
    # the per-property `nearest_neighbour_m` field and the Overpass-derived
    # privacy_buildings_50m count. Either one confirming failure gates.
    privacy_cfg = gates.get('privacy', {})
    floor_m = privacy_cfg.get('min_neighbour_distance_m')
    neighbour_m = p.get('nearest_neighbour_m')
    if floor_m and isinstance(neighbour_m, (int, float)) and neighbour_m < floor_m:
        failures.append(f'Nearest building {neighbour_m:.0f}m < {floor_m}m privacy floor')
    if privacy_cfg.get('no_adjacent_building'):
        nearby = p.get('privacy_buildings_50m')
        if isinstance(nearby, int) and nearby >= 3:
            failures.append(f'{nearby} buildings within 50m (no privacy)')

    # Road-noise gate — killed property #4 (Clérac).
    if gates.get('road_noise', {}).get('exclude_busy_road'):
        road_m = p.get('major_road_m')
        if isinstance(road_m, (int, float)) and road_m < 50:
            failures.append(f'Major road within {road_m:.0f}m (road noise)')

    # Internet gate — remote work essential. Fails only on an explicit False in
    # `internet_viable` or `has_internet`; unknown passes (most listings don't say).
    if gates.get('internet', {}).get('require_fibre_or_starlink_viable'):
        explicit_no = p.get('internet_viable') is False or p.get('has_internet') is False
        if explicit_no:
            failures.append('Internet not viable (confirmed)')

    # Septic compliance — gate only when explicitly non-compliant.
    if gates.get('environmental', {}).get('septic_compliant') == 'required':
        if p.get('septic_compliant') is False:
            failures.append('Septic non-compliant (confirmed)')

    # Listed-heritage exclusion (DRAC oversight = 2-3x reno cost).
    if gates.get('environmental', {}).get('listed_heritage') == 'exclude':
        if p.get('is_listed_heritage') is True:
            failures.append('Listed heritage building (DRAC oversight)')
        else:
            # Keyword fallback on the description; the first phrase found is reported.
            description = (p.get('description', '') or '').lower()
            phrases = ['classé monument historique', 'inscrit aux monuments',
                       'listed building', 'monument historique']
            found = next((phrase for phrase in phrases if phrase in description), None)
            if found is not None:
                failures.append(f'Listed heritage indicated ("{found}")')

    # Amenity time-distances (criteria M deal-breakers). Gate when verified
    # numeric minutes exceed the configured cap. The km-based OSM amenity gates
    # in check_tier1_gates are commented out, so there is no hard km fallback.
    amenity_cfg = gates.get('amenities', {})
    grocery_cap = amenity_cfg.get('grocery_max_minutes')
    hospital_cap = amenity_cfg.get('hospital_max_minutes')
    if grocery_cap:
        grocery_min = p.get('grocery_minutes')
        if isinstance(grocery_min, (int, float)) and grocery_min > grocery_cap:
            failures.append(f'Grocery {grocery_min:.0f}min > {grocery_cap}min cap')
    if hospital_cap:
        hospital_min = p.get('hospital_minutes')
        if isinstance(hospital_min, (int, float)) and hospital_min > hospital_cap:
            failures.append(f'Hospital {hospital_min:.0f}min > {hospital_cap}min cap')

    return failures


def get_tier2_flags(p, scores):
    """Return the Tier 2 research flags (things to verify) for a shortlisted property.

    Flags are appended in a fixed order: price, land size, coordinates,
    renovation, electricity, water, internet (always), environmental risk.
    """
    todo = []

    # Core facts that must exist before a property can be taken further.
    for field, flag in (('price', 'verify_price'), ('land_size_m2', 'verify_land_size')):
        if p.get(field) is None:
            todo.append(flag)
    if any(p.get(axis) is None for axis in ('lat', 'lon')):
        todo.append('verify_coordinates')

    # A renovation score of exactly 3 is only suspect when the criterion has no
    # real data source behind it.
    if scores.get('renovation_scope') == 3:
        if CRITERIA['renovation_scope']['source'] == 'unavailable':
            todo.append('verify_renovation')

    # Utility verification: flag unless the field is truthy or the same signal
    # name appears in keyword_signals.
    signals = p.get('keyword_signals') or []
    for field, flag in (('has_electricity', 'verify_electricity'),
                        ('has_mains_water', 'verify_water')):
        if not p.get(field) and field not in signals:
            todo.append(flag)

    # Internet is always flagged for verification.
    todo.append('verify_internet')

    if p.get('risk_score') is None:
        todo.append('verify_environmental_risk')
    return todo


# ─── MAIN ───

def main():
    """Score all active properties and emit the Cyber Prairie shortlist.

    Parses the command-line flags, loads the property file (a JSON list, or a
    JSON object whose values are the properties), skips records whose status
    is 'Removed', and builds one result entry per remaining property. Entries
    with any Tier 1 gate reason or a DISQUALIFY red flag go to the gated list;
    all others are shortlisted with vetting info and Tier 2 research flags.

    With ``--json`` the result is printed to stdout and nothing is written to
    disk. Otherwise the result is saved as ``cyber_prairie_shortlist.json``
    next to the input file and a terminal report is printed (top N entries,
    optional gate removals and flag summary, region overview, source counts).

    Exits with status 1 when the input file does not exist.
    """
    parser = argparse.ArgumentParser(description='Cyber Prairie Harmonized Property Scoring')
    parser.add_argument('--top', type=int, default=10, help='Number of top results to show')
    parser.add_argument('--removed', action='store_true', help='Show Tier 1 gate removals')
    parser.add_argument('--flags', action='store_true', help='Show Tier 2 research flag summary')
    parser.add_argument('--json', action='store_true', help='Output JSON to stdout')
    parser.add_argument('--input', default='properties.json', help='Input file (SOT = properties.json; enriched_data.json kept for legacy compatibility but not canonical)')
    args = parser.parse_args()

    data_path = Path(args.input)
    if not data_path.exists():
        print(f"Error: {data_path} not found", file=sys.stderr)
        sys.exit(1)

    with open(data_path, encoding='utf-8') as f:
        loaded = json.load(f)
    properties = list(loaded.values()) if isinstance(loaded, dict) else loaded

    def join_text(*fields):
        # Drop empty fields; str() keeps a numeric field (e.g. a department
        # code stored as a number) from breaking the join.
        return ' '.join(str(v) for v in fields if v)

    now = datetime.now().strftime('%Y-%m-%d %H:%M')
    shortlisted = []
    gated = []
    flagged_counts = {}

    for p in properties:
        if p.get('status') == 'Removed':
            continue

        cp_raw, final, scores, missing = compute_cp_score(p)

        # Vibe
        search_text = join_text(
            p.get('location'), p.get('title'),
            p.get('summary'), p.get('url'),
            p.get('department'), p.get('search_region'),
        )
        vibe_score, vibe_region = get_vibe(search_text)

        # Distance from The Hague
        lat = p.get('lat') or p.get('latitude')
        lon = p.get('lon') or p.get('longitude')
        dist_km = None
        if lat and lon:
            try:
                lat_f, lon_f = float(lat), float(lon)
                # Only coordinates inside this bounding box get a distance.
                if 41 < lat_f < 52 and -5 < lon_f < 10:
                    dist_km = round(haversine(DH_LAT, DH_LON, lat_f, lon_f))
            except (ValueError, TypeError):
                pass  # unparseable coordinates: leave distance unknown

        # Red flags
        flag_text = join_text(p.get('analysis'), p.get('title'), p.get('summary'))
        red_flags = scan_red_flags(flag_text)

        # Source detection ("or ''" so an explicit JSON null does not crash
        # the substring tests below)
        url = p.get('url') or ''
        if 'frenchestateagents' in url:
            source = 'Leggett'
        elif 'idealista' in url:
            source = 'Idealista'
        else:
            source = 'Properstar'

        # Data completeness (0-5): how many key fields are known
        data_fields = [p.get('price'), p.get('land_size_m2'), lat, p.get('bedrooms'), p.get('building_size_m2')]
        data_completeness = sum(1 for v in data_fields if v is not None)

        # Amenity summary ("or {}" so an explicit JSON null behaves like a
        # missing key)
        amenities = p.get('amenities') or {}
        amenity_summary = {}
        for key in ['bakery', 'hospital', 'train_station', 'supermarket', 'airport', 'town']:
            info = amenities.get(key)
            if info:
                km = info.get('km', '?')  # tolerate a record without a distance
                name = info.get('name')
                amenity_summary[key] = f"{str(name)[:20]} ({km}km)" if name else f"{km}km"
            elif amenities:  # has amenity data but this one is missing
                amenity_summary[key] = 'none'

        price_m2 = compute_price_per_m2(p)

        entry = {
            'cp_score': final,
            'cp_raw': cp_raw,
            'scores': scores,
            'missing': missing,
            'vibe_score': vibe_score,
            'vibe_region': vibe_region,
            'dist_km': dist_km,
            'red_flags': red_flags,
            'source': source,
            'data_completeness': data_completeness,
            'amenities': amenity_summary,
            'price_per_m2': price_m2,
            'url': url,
            'location': p.get('location', ''),
            'title': (p.get('title') or '')[:80],
            'price': p.get('price'),
            'land_m2': p.get('land_size_m2'),
            'building_m2': p.get('building_size_m2'),
            'bedrooms': _bedrooms(p),  # sanitized (absurd values like 325 → None)
            'thumbnail': p.get('thumbnail') or '',
            'photo_urls': p.get('photo_urls') or [],
            'verdicts': p.get('verdicts') or {},  # per-voter 👍/👎 (vote_server.py)
            'risk': p.get('risk_profile', ''),
            'risk_score': p.get('risk_score'),
            'risk_labels': p.get('risk_labels') or [],
            'seismic_zone': p.get('seismic_zone'),
            'radon_level': p.get('radon_level'),
            'has_flood_risk': p.get('has_flood_risk'),
            'clay_risk': p.get('clay_risk'),
            'seveso_high_count': p.get('seveso_high_count', 0),
            'nuclear_count': p.get('nuclear_count', 0),
            'summary': (p.get('summary') or '')[:200],
        }

        # Tier 1 gates
        disqualified_flags = [f for f in red_flags if f['severity'] == 'DISQUALIFY']
        gates = check_tier1_gates(p, scores, final)

        if disqualified_flags:
            gates.extend([f['reason'] for f in disqualified_flags])

        if gates:
            entry['gate_reasons'] = gates
            gated.append(entry)
        else:
            # Vetted = substantively evaluated AND top-tier. Computed here with the fresh
            # cp_score (single source of truth in vetting.py); the page reads this flag.
            blockers = vetted_blockers(p, final, gated=False)
            entry['vetted'] = not blockers
            entry['vetted_blockers'] = blockers
            entry['tier2_flags'] = get_tier2_flags(p, scores)
            for flag in entry['tier2_flags']:
                flagged_counts[flag] = flagged_counts.get(flag, 0) + 1
            shortlisted.append(entry)

    # Sort: score → vibe → data completeness (properties with more data rank higher on ties)
    shortlisted.sort(key=lambda x: (-x['cp_score'], -x['vibe_score'], -x['data_completeness']))

    # ─── JSON output ───
    if args.json:
        output = {
            'generated': now,
            'total_evaluated': len(properties),
            'active': len(shortlisted) + len(gated),
            'shortlisted': len(shortlisted),
            'gated': len(gated),
            'shortlist': shortlisted,
            'gated_removals': gated,
        }
        print(json.dumps(output, indent=2, ensure_ascii=False, default=str))
        return

    # Save JSON file
    output_path = data_path.parent / 'cyber_prairie_shortlist.json'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump({'generated': now, 'shortlist': shortlisted, 'gated': gated},
                  f, indent=2, ensure_ascii=False, default=str)

    # ─── Terminal output ───
    active_count = len(shortlisted) + len(gated)
    removed_count = sum(1 for p in properties if p.get('status') == 'Removed')

    print()
    print('=' * 80)
    print('  CYBER PRAIRIE SHORTLIST')
    print(f'  {now}  |  Spec: cyber-prairie-property-spec.md')
    print('=' * 80)
    print(f'  Evaluated: {active_count}  |  Shortlisted: {len(shortlisted)}  |  Gated: {len(gated)}  |  Removed: {removed_count}')
    print('  Threshold: >= 3.0/5  |  Risk: georisques.gouv.fr (seismic/flood/radon/Seveso/clay)')
    print()

    # Shared by the per-property lines and the region overview below.
    vibe_stars = {5: '*****', 4: '****', 3: '***', 2: '**', 1: '*', 0: '-'}
    amenity_labels = {'bakery': 'Baker', 'hospital': 'Hosp', 'train_station': 'Train',
                      'supermarket': 'Super', 'airport': 'Air'}

    for rank, e in enumerate(shortlisted[:args.top], start=1):
        s = e['scores']
        price_str = f"EUR {e['price']:,.0f}" if isinstance(e['price'], (int, float)) and e['price'] > 0 else '?'
        # Same numeric guard as price: a non-numeric land size prints '?'
        # instead of raising on the division.
        land_str = f"{e['land_m2']/10000:.1f} ha" if isinstance(e['land_m2'], (int, float)) and e['land_m2'] else '?'
        beds_str = str(e['bedrooms']) if e['bedrooms'] else '?'
        # Drive-time estimate assumes an average of 80 km/h.
        dist_str = f"{e['dist_km']}km (~{round(e['dist_km']/80, 1)}h)" if e['dist_km'] else '?'
        vibe_str = f"{vibe_stars.get(e['vibe_score'], '-')} {e['vibe_region']}" if e['vibe_region'] else '-'

        reno_str = '[?]' if CRITERIA['renovation_scope']['source'] == 'unavailable' else str(s.get('renovation_scope', '?'))

        print(f"  {'─'*75}")
        # Only call out data completeness when it is low.
        data_q = f"Data: {e['data_completeness']}/5" if e['data_completeness'] < 3 else ''
        print(f"  #{rank}  CP Score: {e['cp_score']:.2f}/5  |  {e['source']}  |  Vibe: {vibe_str}  {data_q}".rstrip())
        print(f"  {e['location'] or e['title'][:50]}")
        pm2_str = f"€{e['price_per_m2']:,}/m²" if e.get('price_per_m2') else ''
        print(f"  {price_str}  |  {land_str}  |  {beds_str} beds  |  {dist_str}  |  Risk: {e['risk']}  {pm2_str}".rstrip())
        print(f"  Wkshp:{s.get('workshop','?')}  Loc:{s.get('location_view','?')}"
              f"  Food:{s.get('food_experience','?')}  Guest:{s.get('guest_accommodation','?')}"
              f"  Live:{s.get('livability','?')}  EnvRisk:{s.get('environmental_risk','?')}"
              f"  Design:{s.get('design_story','?')}"
              f"  Gdn:{s.get('market_garden','?')}  Land:{s.get('land_size','?')}"
              f"  Reno:{reno_str}  Mkt:{s.get('local_market','?')}")

        am = e.get('amenities', {})
        if am:
            am_parts = [f"{label}:{am.get(key, '?')}" for key, label in amenity_labels.items()]
            print(f"  {' | '.join(am_parts)}")

        # Georisques risk detail
        risk_parts = []
        if e.get('seismic_zone'):
            risk_parts.append(f"Seis:{e['seismic_zone']}/5")
        if e.get('radon_level'):
            risk_parts.append(f"Radon:{e['radon_level']}/3")
        if e.get('has_flood_risk'):
            risk_parts.append('FLOOD')
        clay = e.get('clay_risk')
        if clay and clay != 'faible':
            risk_parts.append(f"Clay:{clay}")
        # "or 0": the stored count may be an explicit null, which cannot be
        # compared with an int.
        if (e.get('seveso_high_count') or 0) > 0:
            risk_parts.append(f"SEVESO-H:{e['seveso_high_count']}")
        if (e.get('nuclear_count') or 0) > 0:
            risk_parts.append(f"NUCLEAR:{e['nuclear_count']}")
        if risk_parts:
            print(f"  Risks: {' | '.join(risk_parts)}")

        warn_flags = [f for f in e.get('red_flags', []) if f['severity'] == 'FLAG']
        if warn_flags:
            print(f"  !! {', '.join(f['reason'] for f in warn_flags)}")

        if e.get('tier2_flags'):
            # These two flags are left out of the per-property research line.
            t2 = [f for f in e['tier2_flags'] if f != 'verify_internet' and f != 'verify_renovation']
            if t2:
                print(f"  Research: {', '.join(t2)}")

        print(f"  {e['url']}")
        print()

    # ─── Gate removals ───
    if args.removed and gated:
        print()
        print('=' * 80)
        print(f'  TIER 1 GATE REMOVALS ({len(gated)} properties)')
        print('=' * 80)
        for e in gated[:30]:
            reasons = ', '.join(e.get('gate_reasons', []))
            loc = e['location'] or e['title'][:40]
            print(f"  {loc:35s}  |  {reasons}")
            print(f"    {e['url']}")
        if len(gated) > 30:
            print(f"  ... and {len(gated) - 30} more")

    # ─── Tier 2 flags summary ───
    if args.flags and flagged_counts:
        print()
        print('=' * 80)
        print('  TIER 2 RESEARCH FLAGS SUMMARY')
        print('=' * 80)
        for flag, count in sorted(flagged_counts.items(), key=lambda x: -x[1]):
            print(f"  {flag:25s}  {count} properties")

    # ─── Region summary (Paradiso-friendly) ───
    print(f"  {'─'*75}")
    print(f"  REGION OVERVIEW (all {len(shortlisted)} shortlisted)")
    print(f"  {'─'*75}")

    region_data = {}
    for e in shortlisted:
        rg = e['vibe_region'] or 'Other'
        vt = e['vibe_score']
        if rg not in region_data:
            # The region's vibe is taken from its first (highest-ranked) entry.
            region_data[rg] = {'count': 0, 'vibe': vt, 'top_score': 0, 'has_price': 0}
        region_data[rg]['count'] += 1
        region_data[rg]['top_score'] = max(region_data[rg]['top_score'], e['cp_score'])
        if isinstance(e['price'], (int, float)) and e['price'] > 0:
            region_data[rg]['has_price'] += 1

    for rg, rd in sorted(region_data.items(), key=lambda x: (-x[1]['vibe'], -x[1]['count'])):
        price_note = f"({rd['has_price']} with price)" if rd['has_price'] else '(no prices)'
        print(f"  {vibe_stars.get(rd['vibe'], '-'):5s}  {rg:20s}  {rd['count']:2d} properties  Best: {rd['top_score']:.1f}  {price_note}")

    # ─── Source distribution ───
    sources = {}
    for e in shortlisted:
        sources[e['source']] = sources.get(e['source'], 0) + 1
    source_parts = [f'{src}: {cnt}' for src, cnt in sorted(sources.items(), key=lambda x: -x[1])]
    print(f"\n  Sources: {', '.join(source_parts)}")
    print(f"\n  Saved: {output_path}")
    print()


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()