#!/usr/bin/env python3
"""
Paradisomatch Pipeline — Repeatable property discovery and scoring.

All steps are store-native (read/write properties.json via store.py).
Each step is idempotent: safe to run repeatedly, skips already-enriched.

Usage:
    python3 pipeline.py                    # Default: check → geocode → amenities → enrich → vitality → character → score
    python3 pipeline.py --all              # Full: search → import → (default steps) → calibrate
    python3 pipeline.py --from geocode     # Start from step
    python3 pipeline.py --only score       # Single step
    python3 pipeline.py --only search,score # Multiple specific steps
    python3 pipeline.py --dry-run          # Show plan, don't execute
    python3 pipeline.py --force            # Pass --force to sub-scripts
"""
import argparse
import shutil
import subprocess
import sys
import time
from collections import OrderedDict
from datetime import datetime
from pathlib import Path

SCRIPT_DIR = Path(__file__).parent

# ─── Step Registry ───
# Step id → settings, listed in pipeline order. Settings per step:
#   name         label shown in the plan and in the summary
#   script       sub-script file name, resolved against SCRIPT_DIR
#   browser      True tags the step "(browser)" in the printed plan
#   default      True = part of the run when --all is not given
#   manual_only  (optional) True = left out of both default and --all runs
#   extra_args   (optional) arguments handed to the script on every run

STEPS = OrderedDict()

STEPS['search'] = dict(
    name='Search Platforms', script='search_v2.py',
    browser=False, default=False)
STEPS['import'] = dict(
    name='Import Favorites', script='import_favorites.py',
    browser=False, default=False)
STEPS['check'] = dict(
    name='Availability Check', script='check_availability_store.py',
    browser=False, default=True)
STEPS['geocode'] = dict(
    name='Geocode Coordinates', script='geocode_properties.py',
    browser=False, default=True)
STEPS['amenities'] = dict(
    name='Amenity Lookup', script='lookup_amenities.py',
    browser=False, default=True)

# The analyze step has been off by default since 2026-05-28:
# cyber_prairie_score.py derives criteria deterministically
# (derive_proxy_criteria, no LLM), so the scorer's keyword proxies + hazard
# rules cover it dependency-free. Run `pipeline.py --only analyze` by hand only
# when LLM-graded criteria are wanted — that now goes through `claude -p`
# (subscription, no paid API).
STEPS['analyze'] = dict(
    name='Claude Criteria Analysis', script='analyze_store.py',
    browser=False, default=False, manual_only=True)

STEPS['enrich'] = dict(
    name='Soil + Risk Enrich', script='enrich_apis.py',
    browser=False, default=True)

# Vetting-input enrichers. They are part of the default run so the vetted gate
# never reads absent data (that was the cause of 5/15 vetted properties missing
# population). Both are free/subscription.
STEPS['vitality'] = dict(
    name='Community Vitality', script='community_vitality.py',
    browser=False, default=True, extra_args=['--top', '50'])
STEPS['character'] = dict(
    name='Character (Claude vision)', script='character_score_vision.py',
    browser=False, default=True, extra_args=['--limit', '25'])

STEPS['score'] = dict(
    name='Score & Shortlist', script='cyber_prairie_score.py',
    browser=False, default=True, extra_args=['--top', '10'])

# Shortlist precision: fetches parcel coords + building density for the top N;
# the next score run then uses it as the definitive urban signal. Manual-only
# because it fetches detail pages (~2s each) — run it after score, then re-score.
STEPS['density'] = dict(
    name='Urban Density (top 30)', script='urban_density.py',
    browser=False, default=False, manual_only=True, extra_args=['--top', '30'])

STEPS['calibrate'] = dict(
    name='Weight Calibration', script='calibrate_weights.py',
    browser=False, default=False)

# ─── Python Detection ───

def find_python():
    """Pick the Python interpreter used to launch the step scripts.

    Checks ``../venv/bin/python3.14`` and then ``../venv/bin/python3``
    (relative to SCRIPT_DIR) and returns the first one that exists as a file.
    Otherwise returns whatever ``python3`` resolves to on PATH, or the bare
    name ``'python3'`` when PATH has no match.
    """
    venv_paths = (SCRIPT_DIR / rel
                  for rel in ('../venv/bin/python3.14', '../venv/bin/python3'))
    venv_python = next(
        (path for path in venv_paths if path.exists() and path.is_file()),
        None,
    )
    if venv_python is not None:
        return str(venv_python)

    on_path = shutil.which('python3')
    return on_path if on_path else 'python3'

PY = find_python()

# ─── Runner ───

def run_step(script, args=None, *, timeout=3600):
    """Run one pipeline sub-script with PY, letting its output stream live.

    Args:
        script: File name of the script, relative to SCRIPT_DIR.
        args: Optional list of extra command-line arguments for the script.
        timeout: Seconds to wait before the step is counted as failed.

    Returns:
        ``(success, elapsed_seconds)``. ``success`` is True only when the
        child exits with status 0. ``elapsed_seconds`` is 0 when the child
        was never launched (missing script or interpreter).
    """
    script_path = SCRIPT_DIR / script
    # Check up front: a missing script would otherwise only show up as a
    # non-zero exit from the interpreter, not as FileNotFoundError.
    if not script_path.is_file():
        print(f'  !! {script} not found')
        return False, 0

    cmd = [PY, str(script_path), *(args or [])]
    # Monotonic clock: elapsed time is unaffected by wall-clock adjustments.
    t0 = time.monotonic()
    try:
        result = subprocess.run(cmd, cwd=str(SCRIPT_DIR), timeout=timeout)
    except subprocess.TimeoutExpired:
        elapsed = time.monotonic() - t0
        print(f'  !! {script} timed out after {timeout}s')
        return False, elapsed
    except OSError as exc:
        # Raised when the interpreter itself cannot be started
        # (FileNotFoundError, PermissionError, ...).
        print(f'  !! could not launch {PY}: {exc}')
        return False, 0
    return result.returncode == 0, time.monotonic() - t0

# ─── Health Report ───

def print_health_report():
    """Print a read-only end-of-run snapshot of the property store.

    Reads ``properties.json`` from SCRIPT_DIR and prints totals, coverage
    percentages (criteria / risk / amenities) with OK/WARN markers, today's new
    entries per source, the age of ``.properstar_token`` and the size of
    ``cyber_prairie_shortlist.json``.

    The report is advisory: a missing, unreadable or malformed file produces a
    one-line note instead of an exception, so it never blocks the pipeline's
    exit status. Returns None.
    """
    import json
    from collections import Counter

    try:
        with open(SCRIPT_DIR / 'properties.json', encoding='utf-8') as f:
            raw = json.load(f)
    except FileNotFoundError:
        print('  (no properties.json yet — skipping health report)')
        return
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        print(f'  (properties.json unreadable: {exc} — skipping health report)')
        return

    # The store may be a JSON object (values are the records) or a JSON array.
    if isinstance(raw, dict):
        store = list(raw.values())
    elif isinstance(raw, list):
        store = raw
    else:
        print('  (properties.json is not a list or object — skipping health report)')
        return

    records = [p for p in store if isinstance(p, dict)]
    active = [p for p in records if p.get('status') != 'Removed']
    today = datetime.now().strftime('%Y-%m-%d')
    # str() guards against a non-string discovered_at value.
    new_today = [p for p in records
                 if str(p.get('discovered_at') or '').startswith(today)]

    def pct(predicate):
        """Whole-number percentage of active records satisfying *predicate*."""
        if not active:
            return 0
        return round(100 * sum(1 for p in active if predicate(p)) / len(active))

    cri = pct(lambda p: bool(p.get('criteria')))
    risk = pct(lambda p: p.get('risk_score') is not None or p.get('georisques_enriched'))
    amen = pct(lambda p: bool(p.get('amenities')))

    # Stringify sources so sorting below cannot fail on mixed key types.
    by_src_today = Counter(str(p.get('source', '?')) for p in new_today)

    def mark(v, warn_below):
        """Marker text for a coverage value against its warning threshold."""
        return ' OK ' if v >= warn_below else ' WARN '

    print()
    print(f'  {"=" * 50}')
    print(f'  HEALTH REPORT')
    print(f'  {"=" * 50}')
    print(f'  Store:    {len(store)} total | {len(active)} active | {len(new_today)} new today')
    print(f'  Coverage: criteria {cri}%{mark(cri,80)}| risk {risk}%{mark(risk,50)}| amenities {amen}%{mark(amen,70)}')
    if new_today:
        per_src = ' | '.join(f'{src}:+{n}' for src, n in sorted(by_src_today.items()))
        print(f'  New today by source: {per_src}')

    # Auth freshness — Properstar API token (used by import_favorites.py)
    token_path = SCRIPT_DIR / '.properstar_token'
    if token_path.exists():
        age_d = int((time.time() - token_path.stat().st_mtime) / 86400)
        stale = age_d > 30
        marker = ' WARN ' if stale else ' OK '
        hint = '  (refresh: python3 import_favorites.py --login)' if stale else ''
        print(f'  Properstar token: {age_d}d old{marker}{hint}')

    # Shortlist freshness
    sl_path = SCRIPT_DIR / 'cyber_prairie_shortlist.json'
    if sl_path.exists():
        try:
            with open(sl_path, encoding='utf-8') as f:
                sl = json.load(f)
        except (OSError, ValueError) as exc:
            # Best-effort: report the problem instead of hiding it, never raise.
            print(f'  Shortlist: unreadable ({exc})')
        else:
            # Accepted shapes: a bare list, or an object holding the entries
            # under 'shortlist' (falling back to the object's values).
            if isinstance(sl, dict):
                sl_items = sl.get('shortlist', list(sl.values()))
            else:
                sl_items = sl
            if isinstance(sl_items, (list, dict)):
                print(f'  Shortlist: {len(sl_items)} properties')
            else:
                print('  Shortlist: unexpected structure')

    print(f'  {"=" * 50}')


# ─── Main ───

def _select_steps(args):
    """Return the ordered step ids requested by the parsed CLI *args*.

    Prints to stderr and exits with status 1 when a step id given via --only
    or --from is not a key of STEPS.
    """
    step_ids = list(STEPS.keys())

    def require_known(step_id):
        if step_id not in STEPS:
            print(f"Unknown step: {step_id}. Available: {', '.join(step_ids)}", file=sys.stderr)
            sys.exit(1)

    if args.only:
        # Explicit selection: exactly the named steps, in the order given
        # (repeats allowed, e.g. score,density,score).
        selected = [s.strip() for s in args.only.split(',')]
        for s in selected:
            require_known(s)
        return selected

    if args.from_step:
        require_known(args.from_step)
        tail = step_ids[step_ids.index(args.from_step):]
        # 'manual_only' steps must be named explicitly, so they are skipped
        # unless the step is the requested starting point itself.
        return [s for s in tail
                if s == args.from_step or not STEPS[s].get('manual_only')]

    # --all adds the non-default steps to the defaults, but never
    # force-enables 'manual_only' steps (e.g. analyze); those run only when
    # named explicitly.
    return [s for s in step_ids
            if (STEPS[s]['default'] or args.all) and not STEPS[s].get('manual_only')]


def main():
    """CLI entry point: pick the steps, print the plan, run them, summarize.

    Steps run sequentially; a failing step does not stop the following ones.
    After the summary and the health report the process exits with status 1
    if any step failed. With --dry-run only the plan is printed.
    """
    parser = argparse.ArgumentParser(description='Paradisomatch Pipeline')
    parser.add_argument('--all', action='store_true', help='Include search + import steps')
    parser.add_argument('--from', dest='from_step', metavar='STEP', help='Start from this step')
    parser.add_argument('--only', metavar='STEP[,STEP]', help='Run only these steps')
    parser.add_argument('--dry-run', action='store_true', help='Show plan without executing')
    parser.add_argument('--force', action='store_true', help='Force re-processing')
    args = parser.parse_args()

    step_ids = list(STEPS.keys())
    steps_to_run = _select_steps(args)

    # Display plan
    now = datetime.now().strftime('%Y-%m-%d %H:%M')
    print()
    print(f'  FARMMATCH PIPELINE  |  {now}')
    print(f'  {"=" * 50}')

    for step_id in step_ids:
        info = STEPS[step_id]
        marker = ' >> ' if step_id in steps_to_run else '    '
        browser = ' (browser)' if info['browser'] and step_id in steps_to_run else ''
        print(f'{marker}{info["name"]}{browser}')

    print(f'  {"=" * 50}')
    print(f'  {len(steps_to_run)} steps to run')
    print()

    if args.dry_run:
        print('  --dry-run: nothing executed')
        return

    # Execute
    results = []
    t_start = time.time()

    for step_id in steps_to_run:
        info = STEPS[step_id]

        print(f'  {"─" * 50}')
        print(f'  {info["name"]}  ({info["script"]})')
        print(f'  {"─" * 50}')

        # Copy so appending --force never mutates the registry's list.
        step_args = list(info.get('extra_args', []))
        if args.force:
            step_args.append('--force')

        ok, elapsed = run_step(info['script'], step_args)
        results.append((step_id, info['name'], ok, elapsed))

        if ok:
            print(f'  OK ({elapsed:.0f}s)')
        else:
            print(f'  !! FAILED ({elapsed:.0f}s)')
        print()

    # Summary
    total = time.time() - t_start
    print(f'  {"=" * 50}')
    print(f'  DONE  |  {total:.0f}s total')
    print(f'  {"=" * 50}')

    for step_id, name, ok, elapsed in results:
        icon = 'OK' if ok else '!!'
        print(f'  {icon}  {name:25s}  {elapsed:5.0f}s')

    failed = [r for r in results if not r[2]]
    if failed:
        print(f'\n  {len(failed)} step(s) failed')

    # Printed even after failures; the exit status is decided afterwards.
    print_health_report()

    if failed:
        sys.exit(1)
    else:
        print(f'  All {len(results)} steps completed')


if __name__ == '__main__':
    main()
