#!/usr/bin/env python3
"""
Enrich properties with free government/scientific API data.

APIs used:
  - SoilGrids (ISRIC) — soil organic carbon, clay content, pH
  - Géorisques (BRGM)  — flood zones, seismic risk, clay shrink-swell

Usage:
    python3 enrich_apis.py                # Enrich all active properties that have lat/lon
    python3 enrich_apis.py --limit 5      # Test on 5 properties
    python3 enrich_apis.py --dry-run      # Show what would be enriched
    python3 enrich_apis.py --force        # Re-enrich even if already done
"""
import argparse
import time
import unicodedata
from datetime import datetime

import requests

from store import load, save, persist, upsert, is_active, short_url, NOMINATIM_UA as USER_AGENT
# Headers sent with every outbound API request in this module; the User-Agent
# value is the NOMINATIM_UA string exported by the store module.
HEADERS = {'User-Agent': USER_AGENT}


def needs_enrichment(prop, force=False):
    """Return True when *prop* should be enriched on this run.

    A property qualifies when it is active, has usable coordinates and is
    missing soil OR risk enrichment. Tracking is per source so a failed
    Géorisques fetch doesn't permanently skip re-attempts on a future run.

    Args:
        prop: Property record (dict-like) from the store.
        force: When True, every active property with coordinates qualifies,
            even if both sources were already enriched.

    Returns:
        bool: True if the property should be processed.
    """
    if not is_active(prop):
        return False

    lat, lon = prop.get('lat'), prop.get('lon')
    # Only None / '' mean "not geocoded". A single 0.0 is a legitimate
    # coordinate (the Greenwich meridian crosses France), so a plain
    # truthiness test would silently drop such properties.
    if lat is None or lon is None or lat == '' or lon == '':
        return False
    # Both coordinates falsy, i.e. (0, 0), is still treated as missing,
    # exactly as the previous truthiness check did.
    if not lat and not lon:
        return False

    if force:
        return True

    has_soil = bool(prop.get('soil_enriched') or prop.get('soil_quality_score'))
    has_risk = bool(prop.get('georisques_enriched') or prop.get('risk_score') is not None)
    return not (has_soil and has_risk)


# ─── SoilGrids (ISRIC) ───

# Thickness in cm of the standard SoilGrids intervals that make up the topsoil.
# Used to build a 0-30 cm value for properties that are not published as a
# single 0-30 cm layer.
_TOPSOIL_THICKNESS_CM = {'0-5cm': 5, '5-15cm': 10, '15-30cm': 15}


def _topsoil_mean(depths):
    """Return the 0-30 cm mean (in mapped units) from a layer's depth list.

    Prefers a native '0-30cm' entry; otherwise returns the thickness-weighted
    mean of whichever of the 0-5 / 5-15 / 15-30 cm intervals carry a value.
    If none of those labels is present, falls back to the first reported mean.
    Returns None when no depth carries a numeric mean.
    """
    means = {}
    for depth in depths:
        if not isinstance(depth, dict):
            continue
        values = depth.get('values')
        mean = values.get('mean') if isinstance(values, dict) else None
        if isinstance(mean, (int, float)):
            means.setdefault(depth.get('label'), mean)

    if not means:
        return None
    if '0-30cm' in means:
        return means['0-30cm']

    weighted = [(means[label], cm) for label, cm in _TOPSOIL_THICKNESS_CM.items()
                if label in means]
    if weighted:
        return sum(m * cm for m, cm in weighted) / sum(cm for _, cm in weighted)
    return next(iter(means.values()))


def _parse_soilgrids(data):
    """Extract the soil fields from a decoded SoilGrids response (pure function).

    Returns a dict with any of 'soil_organic_carbon_t_ha', 'soil_clay_pct'
    and 'soil_ph'; the dict is empty when nothing usable was found.
    """
    if not isinstance(data, dict):
        return {}
    props = data.get('properties')
    layers = props.get('layers') if isinstance(props, dict) else None
    if not isinstance(layers, list):
        return {}

    result = {}
    for layer in layers:
        if not isinstance(layer, dict):
            continue
        depths = layer.get('depths')
        val = _topsoil_mean(depths if isinstance(depths, list) else [])
        if val is None:
            continue

        name = layer.get('name', '')
        if name == 'ocs':
            # SoilGrids delivers organic carbon stock already in t/ha; its
            # conversion factor of 10 is for kg/m², so no division here.
            # Values stored by earlier runs were divided by 10 — refresh
            # them with --force.
            result['soil_organic_carbon_t_ha'] = val
        elif name == 'clay':
            # Clay content in g/kg → percentage
            result['soil_clay_pct'] = val / 10
        elif name == 'phh2o':
            # pH in water (stored as pH * 10)
            result['soil_ph'] = val / 10
    return result


def fetch_soilgrids(lat, lon):
    """Fetch topsoil (0-30 cm) properties for a point from the SoilGrids REST API.

    Requests organic carbon stock, clay content and pH. Carbon stock is
    published as a single 0-30 cm layer; the 0-5 / 5-15 / 15-30 cm intervals
    are requested as well so clay and pH can be averaged over the same depth.

    Args:
        lat: Latitude in decimal degrees.
        lon: Longitude in decimal degrees.

    Returns:
        tuple: (result, status). On success result is a non-empty dict of
        soil fields and status is 'ok'. On failure result is None and status
        is a short reason (HTTP code, transport error, invalid JSON, or no
        data for the location).
    """
    url = 'https://rest.isric.org/soilgrids/v2.0/properties/query'
    params = {
        'lon': lon,
        'lat': lat,
        'property': ['ocs', 'clay', 'phh2o'],
        'depth': ['0-30cm', '0-5cm', '5-15cm', '15-30cm'],
        'value': 'mean',
    }
    try:
        r = requests.get(url, params=params, headers=HEADERS, timeout=15)
        if r.status_code != 200:
            return None, f'HTTP {r.status_code}'
        data = r.json()
    except requests.RequestException as e:
        return None, str(e)[:80]
    except ValueError:
        # Older requests versions raise a plain ValueError on a non-JSON body.
        return None, 'invalid JSON response'

    result = _parse_soilgrids(data)
    if not result:
        # Report "no data" explicitly instead of an empty dict with 'ok'.
        return None, 'no soil data at this location'
    return result, 'ok'


def soil_quality_score(soil_data):
    """Rate soil quality on a 1-5 scale from organic carbon stock and pH.

    Starts from a neutral 3.0, adds a bonus or malus for the value under
    'soil_organic_carbon_t_ha' and another for 'soil_ph', then clamps the
    total to the 1.0-5.0 range. Missing values contribute nothing.

    Returns:
        float score, or None when *soil_data* is empty or None.
    """
    if not soil_data:
        return None

    carbon = soil_data.get('soil_organic_carbon_t_ha')
    acidity = soil_data.get('soil_ph')

    # Organic carbon: rich soils are rewarded, poor soils penalised.
    carbon_delta = 0.0
    if carbon is not None:
        if carbon >= 80:
            carbon_delta = 1.0
        elif carbon >= 40:
            carbon_delta = 0.5
        elif carbon < 20:
            carbon_delta = -0.5

    # pH: a bonus inside the range that suits most crops, a malus at extremes.
    ph_delta = 0.0
    if acidity is not None:
        if 5.5 <= acidity <= 7.5:
            ph_delta = 0.5
        elif acidity < 4.5 or acidity > 8.5:
            ph_delta = -1.0

    total = 3.0 + carbon_delta + ph_delta
    if total < 1.0:
        return 1.0
    if total > 5.0:
        return 5.0
    return total


# ─── Géorisques (BRGM) — Comprehensive Risk Report ───

def _severity_to_int(label, default_present=3):
    """Map French severity wording in 'libelleStatutAdresse' to 1-5 int.

    Schema rewrite 2026-05-25: the V1 API now returns severity as French text
    inside libelleStatutAdresse (e.g. 'Risque Existant - modéré') rather than
    a numeric code. We map the wording to the legacy 1-5 scale so the
    downstream scorer doesn't need changes.
    """
    if not label:
        return 1  # absent / no risk
    lo = label.lower()
    if 'très fort' in lo or 'tres fort' in lo: return 5
    if 'fort' in lo or 'important' in lo: return 4
    if 'modér' in lo or 'moyen' in lo: return 3
    if 'faible' in lo: return 2
    if 'existant' in lo or 'concerne' in lo or 'present' in lo:
        return default_present
    return 1


def _severity_to_clay(label):
    """Convert clay risk text to legacy 'fort'/'moyen'/'faible' string."""
    if not label: return None
    lo = label.lower()
    if 'fort' in lo or 'important' in lo: return 'fort'
    if 'modér' in lo or 'moyen' in lo: return 'moyen'
    if 'faible' in lo: return 'faible'
    return None


def _in_france_bbox(lat, lon):
    """Cheap pre-check: skip the Géorisques call for points clearly outside France.
    Mainland: lat 41-51, lon -5 to 10. Avoids 404s on Italy/Spain/Portugal coords."""
    return 41.0 <= lat <= 51.5 and -5.5 <= lon <= 10.0


def _parse_georisques_report(data):
    """Map a decoded risk-report payload to the legacy scorer fields (pure function).

    Returns the field dict, or None when the payload is not an object or
    contains neither a 'risquesNaturels' nor a 'risquesTechnologiques'
    section (which suggests the schema changed again).
    """
    if not isinstance(data, dict):
        return None
    nat = data.get('risquesNaturels')
    tech = data.get('risquesTechnologiques')
    nat = nat if isinstance(nat, dict) else {}
    tech = tech if isinstance(tech, dict) else {}

    # Sanity: if neither block is present, the schema changed again
    if not nat and not tech:
        return None

    def record(block, key):
        """Risk record for *key*, or {} when missing or not an object."""
        rec = block.get(key)
        return rec if isinstance(rec, dict) else {}

    def status(block, key):
        """Best available severity label for a risk: address > commune > None."""
        rec = record(block, key)
        return rec.get('libelleStatutAdresse') or rec.get('libelleStatutCommune')

    def present(block, key):
        return bool(record(block, key).get('present'))

    result = {}

    # Seismic zone (1=low, 5=very high)
    seis_label = status(nat, 'seisme')
    if seis_label or present(nat, 'seisme'):
        result['seismic_zone'] = _severity_to_int(seis_label)

    # Radon (1=low, 3=high — collapse 5-scale to 3-scale)
    radon_label = status(nat, 'radon')
    if radon_label or present(nat, 'radon'):
        sev = _severity_to_int(radon_label)
        result['radon_level'] = min(3, max(1, sev - 1))  # 1→1, 2→1, 3→2, 4→3, 5→3

    # Flood = inondation OR remonteeNappe OR risqueCotier
    if (present(nat, 'inondation') or present(nat, 'remonteeNappe')
            or present(nat, 'risqueCotier')):
        result['has_flood_risk'] = True

    # Clay shrink-swell
    clay = _severity_to_clay(status(nat, 'retraitGonflementArgile'))
    if clay:
        result['clay_risk'] = clay

    # Ground movements
    if present(nat, 'mouvementTerrain'):
        result['ground_movement_count'] = 1  # presence flag, granular count not in new schema

    # ICPE / Seveso — new schema only exposes present + severity, not counts
    if present(tech, 'icpe'):
        icpe_label = (status(tech, 'icpe') or '').lower()
        high = ('seuil haut' in icpe_label or 'fort' in icpe_label
                or 'important' in icpe_label)
        result['seveso_any_count'] = 1
        result['seveso_high_count'] = 1 if high else 0

    # Nuclear
    if present(tech, 'nucleaire'):
        result['nuclear_count'] = 1

    # Wildfire risk (added — wasn't in old schema)
    if present(nat, 'feuForet'):
        result['wildfire_risk'] = True

    # Aggregate: count of all present risks (legacy scorer uses this)
    nat_present = sum(1 for v in nat.values() if isinstance(v, dict) and v.get('present'))
    tech_present = sum(1 for v in tech.values() if isinstance(v, dict) and v.get('present'))
    result['risk_count'] = nat_present + tech_present
    result['risk_labels'] = [v.get('libelle') for v in {**nat, **tech}.values()
                             if isinstance(v, dict) and v.get('present') and v.get('libelle')]

    commune = data.get('commune')
    result['georisques_commune'] = commune.get('libelle') if isinstance(commune, dict) else None
    return result


def fetch_georisques(lat, lon):
    """Fetch comprehensive risk report from Géorisques API (V1, camelCase schema).

    Schema rewrite 2026-05-25: the API moved from snake_case sectional output
    (zonage_sismique, gaspar, rga, …) to a unified shape with risquesNaturels
    and risquesTechnologiques objects, each containing per-risk records with
    present/libelleStatut* fields. We parse the new shape and map the values
    back to the legacy field names the scorer already understands.

    Args:
        lat: Latitude in decimal degrees.
        lon: Longitude in decimal degrees.

    Returns:
        tuple: (result, status). On success result is the field dict and
        status is 'ok'. On failure result is None and status is a short
        reason; a point outside the France bounding box yields a status
        containing 'outside France', which main() relies on.
    """
    if not _in_france_bbox(lat, lon):
        return None, 'outside France (Géorisques is France-only)'

    url = 'https://georisques.gouv.fr/api/v1/resultats_rapport_risque'
    params = {'latlon': f'{lon},{lat}'}
    try:
        r = requests.get(url, params=params, headers=HEADERS, timeout=20)
        if r.status_code != 200:
            return None, f'HTTP {r.status_code}'
        data = r.json()
    except requests.RequestException as e:
        return None, str(e)[:80]
    except ValueError:
        # Older requests versions raise a plain ValueError on a non-JSON body;
        # report it instead of aborting the whole enrichment run.
        return None, 'invalid JSON response'

    result = _parse_georisques_report(data)
    if result is None:
        return None, 'empty response (schema may have changed)'
    # The parsed dict always carries risk_count / risk_labels, so it is never
    # empty here; a report with zero present risks is a valid "safe" result.
    return result, 'ok'


# ─── Legacy parser kept for reference only ───
def _fetch_georisques_legacy_disabled(lat, lon):
    """ARCHIVED 2026-05-25: snake_case schema parser. API moved to camelCase.
    Kept in source for one cycle in case fields need re-deriving.

    Queries the combined risk-report endpoint and reads the old sectional
    keys (zonage_sismique, radon, rga, gaspar, mouvements_terrain, cavites,
    installations_classees, installations_nucleaires, ppr, catnat).

    Returns:
        tuple: (result, status) — (dict, 'ok') on success, (None, reason) on
        an HTTP or transport error, or whatever _fetch_georisques_fallback
        returns when nothing could be parsed.
    """
    result = {}
    try:
        url = 'https://georisques.gouv.fr/api/v1/resultats_rapport_risque'
        # The query string is built as "<lon>,<lat>" (longitude first).
        params = {'latlon': f'{lon},{lat}'}
        r = requests.get(url, params=params, headers=HEADERS, timeout=20)
        if r.status_code != 200:
            return None, f'HTTP {r.status_code}'

        data = r.json()

        # ── Seismic zone (1-5, where 5 = highest risk) ──
        sismique = data.get('zonage_sismique')
        if sismique:
            zone = sismique.get('zone_sismicite') or sismique.get('code_zone')
            if zone is not None:
                try:
                    result['seismic_zone'] = int(zone)
                except (ValueError, TypeError):
                    # Non-numeric zone: keep the raw text under a separate key.
                    result['seismic_zone_raw'] = str(zone)

        # ── Radon (1=low, 2=medium, 3=high) ──
        radon = data.get('radon')
        if radon:
            cat = radon.get('classe_potentiel') or radon.get('potentiel_radon')
            if cat is not None:
                try:
                    result['radon_level'] = int(cat)
                except (ValueError, TypeError):
                    # Non-numeric category: keep the raw text under a separate key.
                    result['radon_level_raw'] = str(cat)

        # ── Clay shrink-swell (retrait-gonflement argiles) ──
        rga = data.get('rga') or data.get('retrait_gonflement_argiles')
        if rga:
            exposition = rga.get('exposition') or rga.get('niveau')
            if exposition:
                result['clay_risk'] = str(exposition).lower()

        # ── Flood zones (from GASPAR risques) ──
        gaspar = data.get('gaspar') or {}
        risques = gaspar.get('risques_detail') or gaspar.get('risques') or []
        if isinstance(risques, list) and risques:
            # The comprehension variable `r` is local to each comprehension and
            # does not rebind the response object `r` above.
            risk_codes = [r.get('code_national_risque', '') for r in risques if isinstance(r, dict)]
            risk_labels = [r.get('libelle_risque_long', '') for r in risques if isinstance(r, dict)]
            result['risk_codes'] = risk_codes
            result['risk_labels'] = [l for l in risk_labels if l]
            result['risk_count'] = len(risques)
            # A code whose first two characters are 13, 14 or 15 counts as flood.
            flood_prefixes = {'13', '14', '15'}
            result['has_flood_risk'] = any(c[:2] in flood_prefixes for c in risk_codes if len(c) >= 2)

        # ── Ground movements (mouvements de terrain) ──
        mvt = data.get('mouvements_terrain') or data.get('mvt')
        if mvt:
            # Section may be a bare list or an object wrapping the list in 'data'.
            items = mvt if isinstance(mvt, list) else mvt.get('data', [])
            result['ground_movement_count'] = len(items) if isinstance(items, list) else 0

        # ── Underground cavities ──
        cavites = data.get('cavites')
        if cavites:
            items = cavites if isinstance(cavites, list) else cavites.get('data', [])
            result['cavity_count'] = len(items) if isinstance(items, list) else 0

        # ── Industrial installations (ICPE / Seveso) ──
        icpe = data.get('installations_classees') or data.get('icpe')
        if icpe:
            items = icpe if isinstance(icpe, list) else icpe.get('data', [])
            if isinstance(items, list):
                result['icpe_count'] = len(items)
                # High-threshold Seveso sites only.
                seveso_items = [i for i in items if isinstance(i, dict) and
                                i.get('seveso', '').upper() in ('SEUIL_HAUT', 'SH', 'SEUIL HAUT')]
                result['seveso_high_count'] = len(seveso_items)
                # Any item whose 'seveso' value mentions SEVESO.
                seveso_all = [i for i in items if isinstance(i, dict) and
                              'seveso' in str(i.get('seveso', '')).upper()]
                result['seveso_any_count'] = len(seveso_all)

        # ── Nuclear installations ──
        nucleaire = data.get('installations_nucleaires')
        if nucleaire:
            items = nucleaire if isinstance(nucleaire, list) else nucleaire.get('data', [])
            result['nuclear_count'] = len(items) if isinstance(items, list) else 0

        # ── Risk prevention plans (PPR) ──
        # NOTE(review): with no 'ppr' section this still records ppr_count = 0,
        # so `result` is non-empty after any normal 200 response.
        ppr = data.get('ppr') or {}
        ppr_items = ppr if isinstance(ppr, list) else ppr.get('data', [])
        if isinstance(ppr_items, list):
            result['ppr_count'] = len(ppr_items)

        # ── CatNat declarations (natural disaster history) ──
        catnat = data.get('catnat') or gaspar.get('catnat') or []
        if isinstance(catnat, list):
            result['catnat_count'] = len(catnat)

    except requests.RequestException as e:
        # Transport-level failure: report a truncated message as the status.
        return None, str(e)[:80]

    if not result:
        # Fallback: try individual endpoints if combined report returned empty
        return _fetch_georisques_fallback(lat, lon)

    return result, 'ok'


def _fetch_georisques_fallback(lat, lon):
    """Query the individual Géorisques endpoints when the combined report is empty.

    Tries the GASPAR risk list (1000 m radius) and the clay shrink-swell
    endpoint independently; a transport failure or non-200 answer on one
    endpoint simply leaves its fields out.

    Returns:
        tuple: (fields, 'ok') when at least one endpoint produced data,
        otherwise (None, 'no data').
    """
    point = f'{lon},{lat}'

    def query(endpoint, params):
        """Return the endpoint's 'data' payload, or None on HTTP/transport failure."""
        try:
            resp = requests.get(endpoint, params=params, headers=HEADERS, timeout=15)
            if resp.status_code != 200:
                return None
            return resp.json().get('data', [])
        except requests.RequestException:
            return None

    found = {}

    # Flood zones (GASPAR)
    hazards = query('https://georisques.gouv.fr/api/v1/gaspar/risques',
                    {'latlon': point, 'rayon': 1000})
    if hazards:
        codes = [h.get('code_national_risque', '') for h in hazards]
        labels = [h.get('libelle_risque_long', '') for h in hazards]
        flood_prefixes = {'13', '14', '15'}
        found['risk_codes'] = codes
        found['risk_labels'] = [text for text in labels if text]
        found['risk_count'] = len(hazards)
        found['has_flood_risk'] = any(
            len(code) >= 2 and code[:2] in flood_prefixes for code in codes
        )

    # Clay shrink-swell
    clay_rows = query('https://georisques.gouv.fr/api/v1/rga', {'latlon': point})
    if clay_rows:
        found['clay_risk'] = clay_rows[0].get('exposition', 'unknown')

    if found:
        return found, 'ok'
    return None, 'no data'


def risk_score(risk_data):
    """Compute an environmental risk score from 1 to 5, where 5 is safest.

    Starts at 5.0 and subtracts a penalty for each hazard found in
    *risk_data*: seismic zone, flood risk, overall risk count, clay
    shrink-swell, radon, Seveso sites, nuclear installations, ground
    movements, cavities and natural-disaster history. The total is rounded
    to one decimal and clamped to the 1.0-5.0 range.

    Returns:
        float score, or None when *risk_data* is empty or None.
    """
    if not risk_data:
        return None

    def tier(value, steps):
        """Penalty of the first (threshold, penalty) step that *value* reaches, else 0."""
        for threshold, amount in steps:
            if value >= threshold:
                return amount
        return 0.0

    penalty = 0.0

    # Seismic zone (French scale: 1=very low, 5=strong); zones 1-2 cost nothing
    zone = risk_data.get('seismic_zone')
    if zone is not None:
        penalty += tier(zone, ((5, 2.0), (4, 1.5), (3, 0.5)))

    # Flood risk
    if risk_data.get('has_flood_risk'):
        penalty += 1.0

    # General hazard exposure (number of risks reported)
    penalty += tier(risk_data.get('risk_count', 0), ((5, 1.0), (3, 0.5)))

    # Clay shrink-swell
    clay_level = risk_data.get('clay_risk', '')
    if clay_level == 'fort':
        penalty += 0.5
    elif clay_level == 'moyen':
        penalty += 0.25

    # Radon (1=low, 2=medium, 3=high)
    radon_level = risk_data.get('radon_level')
    if radon_level is not None:
        penalty += tier(radon_level, ((3, 0.5), (2, 0.25)))

    # Seveso: a high-threshold site outweighs (and replaces) the generic penalty
    if risk_data.get('seveso_high_count', 0) > 0:
        penalty += 1.5
    elif risk_data.get('seveso_any_count', 0) > 0:
        penalty += 0.5

    # Nuclear installations nearby
    if risk_data.get('nuclear_count', 0) > 0:
        penalty += 0.5

    # Ground movements / cavities (structural risk)
    if risk_data.get('ground_movement_count', 0) >= 3:
        penalty += 0.5
    if risk_data.get('cavity_count', 0) >= 3:
        penalty += 0.5

    # Natural disaster history
    penalty += tier(risk_data.get('catnat_count', 0), ((10, 0.5), (5, 0.25)))

    rounded = round(5.0 - penalty, 1)
    return max(1.0, min(5.0, rounded))


# ─── Main ───

def _risk_profile(score):
    """Translate a risk score into the legacy risk_profile label used by the scorer."""
    if score >= 4.0:
        return 'Laag'
    if score >= 2.5:
        return 'Gemiddeld'
    return 'Hoog'


def _progress_parts(fields, soil, risks):
    """Build the 'key=value' fragments shown on a property's OK progress line."""
    parts = []
    if soil:
        parts.append(f"soil={fields.get('soil_quality_score', '?')}")
    if risks:
        parts.append(f"risk={fields.get('risk_score', '?')}")
        parts.append(f"floods={'Y' if risks.get('has_flood_risk') else 'N'}")
        seis = risks.get('seismic_zone')
        if seis:
            parts.append(f"seis={seis}")
        radon = risks.get('radon_level')
        if radon:
            parts.append(f"radon={radon}")
        seveso = risks.get('seveso_high_count', 0)
        if seveso:
            parts.append(f"SEVESO={seveso}")
    return parts


def main():
    """Command-line entry point: enrich candidate properties and persist the store.

    Loads the store, selects the properties for which needs_enrichment() is
    true, queries SoilGrids and Géorisques for each one, writes the resulting
    fields back with upsert(), and calls persist() when at least one property
    was updated. With --dry-run the candidates are only listed.

    Progress made so far is persisted even if the run is interrupted or an
    unexpected error occurs, so a long run does not lose its results.
    """
    parser = argparse.ArgumentParser(description='Enrich properties with API data')
    parser.add_argument('--limit', type=int, default=0, help='Max properties (0=all)')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be enriched')
    parser.add_argument('--force', action='store_true', help='Re-enrich already done')
    args = parser.parse_args()

    store = load()
    candidates = [url for url, p in store.items() if needs_enrichment(p, args.force)]

    # Only a positive limit truncates; a negative value would otherwise slice
    # from the end of the list and silently drop the last properties.
    if args.limit > 0:
        candidates = candidates[:args.limit]

    total = len(candidates)
    print(f"API ENRICHMENT — {total} properties to enrich")
    print("  APIs: SoilGrids (soil), Géorisques (risks)")
    print()

    if args.dry_run:
        for url in candidates:
            print(f"  {url}")
        return

    if not candidates:
        print("Nothing to do.")
        return

    enriched = 0
    failed = 0

    try:
        for i, url in enumerate(candidates):
            prop = store[url]
            lat, lon = prop['lat'], prop['lon']
            url_short = short_url(url)
            print(f"  [{i+1}/{total}] {url_short}...", end=' ')

            fields = {}
            now_iso = datetime.now().isoformat()

            # SoilGrids
            soil, soil_status = fetch_soilgrids(lat, lon)
            if soil:
                fields.update(soil)
                sq = soil_quality_score(soil)
                if sq is not None:
                    fields['soil_quality_score'] = sq
                fields['soil_enriched'] = now_iso

            # Géorisques (France-only; returns 404 for IT/ES/PT)
            time.sleep(0.5)  # brief pause between APIs
            risks, risk_status = fetch_georisques(lat, lon)
            if risks:
                fields.update(risks)
                rs = risk_score(risks)
                if rs is not None:
                    fields['risk_score'] = rs
                    # Map risk_score to risk_profile for backward compat with scorer
                    fields['risk_profile'] = _risk_profile(rs)
                fields['georisques_enriched'] = now_iso
            elif risk_status and 'outside France' in risk_status:
                # One-time mark so needs_enrichment doesn't keep re-trying
                fields['georisques_enriched'] = 'n/a (non-FR)'

            if fields:
                # Legacy combined flag — set only when BOTH sources succeeded,
                # so a partial run can be retried on the next pipeline pass.
                if soil and risks:
                    fields['api_enriched'] = now_iso
                upsert(store, url, fields)
                enriched += 1
                print(f"OK ({', '.join(_progress_parts(fields, soil, risks))})")
            else:
                failed += 1
                print(f"FAIL (soil={soil_status}, risk={risk_status})")

            # Rate limit: SoilGrids is generous, Géorisques wants ~1/sec.
            # No need to wait after the last property.
            if i + 1 < total:
                time.sleep(1.1)
    finally:
        # Runs on normal completion, on Ctrl-C and on unexpected errors, so
        # whatever was already upserted is not thrown away.
        print(f"\nEnriched {enriched}/{total} ({failed} failed)")
        if enriched > 0:
            persist(store)

# Run the enrichment only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
