#!/usr/bin/env python3
"""Config-driven property search orchestrator (Phase A of the sources/ refactor).

Reads campaigns.yaml, dispatches per (campaign × region × source), aggregates
the resulting PropertyHits, upserts into the store. Replaces the hard-coded
platform iteration of search_properties.py.

Design (6L):
- Single SOT: still uses store.py / properties.json — no parallel store
- Single owner of persistence: orchestrator calls store.upsert + persist;
  sources never touch the store
- Loud-fail preflight: each source's health() runs once before any
  campaign uses it; failed sources skipped with a visible reason
- Dry-run: shows the plan + per-source health without making a single
  search request

Usage:
    python3 search_v2.py                          # Run all enabled campaigns
    python3 search_v2.py --dry-run                # Plan + health only
    python3 search_v2.py --campaign "FR Brittany" # Single campaign
    python3 search_v2.py --source greenacres      # Only this source across campaigns
    python3 search_v2.py --config alt.yaml        # Alternative config
    python3 search_v2.py --no-persist             # Search, but do not write to the store
"""
from __future__ import annotations

import argparse
import sys
from datetime import datetime
from pathlib import Path

import yaml

# Directory containing this script. It is put at the front of sys.path (once)
# before the project-local imports below run, and is also the base directory
# against which main() resolves the --config path.
SCRIPT_DIR = Path(__file__).resolve().parent
if str(SCRIPT_DIR) not in sys.path:
    sys.path.insert(0, str(SCRIPT_DIR))

from sources import build_registry, SearchCriteria
from store import load, persist, upsert


def load_campaigns(path: Path) -> list[dict]:
    """Load the campaign definitions from the YAML config at *path*.

    Args:
        path: Location of the campaigns YAML file.

    Returns:
        The list stored under the top-level ``campaigns`` key. An empty list
        is returned when the file is empty, the key is absent, or the key is
        present but null.

    Raises:
        ValueError: If the document's top level is not a mapping.
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    # Decode explicitly as UTF-8 so non-ASCII text in the config reads the
    # same on every platform instead of depending on the locale default.
    with open(path, encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # safe_load yields None for an empty document.
    if data is None:
        return []
    if not isinstance(data, dict):
        raise ValueError(
            f"{path}: expected a mapping at the top level, "
            f"got {type(data).__name__}"
        )
    # `campaigns:` with no value parses to None; treat it like a missing key.
    return data.get('campaigns') or []


def expand_campaign(campaign: dict) -> list[tuple[str, str, SearchCriteria, str]]:
    """Fan one campaign out into a search job per region/source pair.

    Every job is a ``(source_name, region, SearchCriteria, campaign_name)``
    4-tuple; the campaign name rides along so callers can log it. Regions
    form the outer iteration and sources the inner one. A campaign whose
    ``regions`` entry is missing or empty produces jobs with region ``None``.
    """
    campaign_name = campaign['name']
    criteria_kwargs = campaign['criteria']
    region_list = campaign.get('regions') or [None]
    source_names = campaign.get('sources', [])

    # A fresh SearchCriteria is built for every pair, never shared.
    return [
        (
            source_name,
            region,
            SearchCriteria(**criteria_kwargs, region=region),
            campaign_name,
        )
        for region in region_list
        for source_name in source_names
    ]


def _wanted(source_name, only_source):
    """Return True when *source_name* passes the optional ``--source`` filter."""
    return not only_source or source_name == only_source


def _planned_searches(campaigns, only_source):
    """Lazily yield each expanded search job that passes the ``--source`` filter."""
    for campaign in campaigns:
        for job in expand_campaign(campaign):
            if _wanted(job[0], only_source):
                yield job


def _preflight(campaigns, registry, only_source):
    """Health-check every referenced source once; return ``{name: source}`` for healthy ones."""
    referenced = {
        s
        for c in campaigns
        for s in c.get('sources', [])
        if _wanted(s, only_source)
    }

    print("  PREFLIGHT (source health)")
    print(f"  {'-' * 60}")
    healthy = {}
    for name in sorted(referenced):
        if name not in registry:
            print(f"  XX  {name:22}  not registered in sources/__init__.py")
            continue
        try:
            ok, reason = registry[name].health()
        except Exception as e:
            # A health check that blows up is a failed health check: skip the
            # source with a visible reason rather than aborting the whole run.
            ok, reason = False, f"health check raised {type(e).__name__}: {e}"
        marker = 'OK ' if ok else 'XX '
        # str()/or-guard: tolerate a None or non-string reason.
        print(f"  {marker} {name:22}  {str(reason or '')[:60]}")
        if ok:
            healthy[name] = registry[name]
    print()
    return healthy


def _search_into_store(src, crit, store, known_urls, now_iso):
    """Drain one source search into *store*.

    Returns ``(new, total, error)``: the number of hits newly upserted, the
    number of hits the source yielded, and the exception that interrupted the
    search (``None`` on success). Hits upserted before an error stay in
    *store* and are included in the counts.
    """
    new = 0
    total = 0
    try:
        for hit in src.search(crit, known_urls=known_urls):
            total += 1
            if hit.url in store:
                continue
            fields = hit.to_store_fields()
            fields['discovered_at'] = now_iso
            upsert(store, hit.url, fields)
            known_urls.add(hit.url)
            new += 1
    except Exception as e:
        # Boundary around third-party scraping code: one broken source must
        # not take down the remaining searches.
        return new, total, e
    return new, total, None


def main():
    """Run the configured campaigns and return a process exit code.

    Parses the command line, loads and filters campaigns, health-checks the
    referenced sources, then either prints the plan (``--dry-run``) or runs
    every (campaign x region x source) search and upserts new hits into the
    store.

    Returns:
        1 when the config file is missing, or when a real (non-dry) run has
        no healthy source; 0 otherwise, including dry runs and the case where
        no enabled campaign matches.
    """
    ap = argparse.ArgumentParser(description='Config-driven property search')
    ap.add_argument('--config', default='campaigns.yaml', help='Campaigns YAML')
    ap.add_argument('--campaign', help='Run only campaigns matching this substring')
    ap.add_argument('--source', help='Run only this source')
    ap.add_argument('--dry-run', action='store_true',
                    help='Show plan + source health, no requests')
    ap.add_argument('--no-persist', action='store_true',
                    help='Search but do not write to store (debugging)')
    args = ap.parse_args()

    config_path = SCRIPT_DIR / args.config
    if not config_path.exists():
        print(f"Config not found: {config_path}", file=sys.stderr)
        return 1

    campaigns = load_campaigns(config_path)
    if args.campaign:
        needle = args.campaign.lower()
        campaigns = [c for c in campaigns if needle in c['name'].lower()]
    campaigns = [c for c in campaigns if c.get('enabled', True)]

    if not campaigns:
        print("No enabled campaigns matched.")
        return 0

    registry = build_registry()

    print(f"\n  SEARCH v2 — {datetime.now().strftime('%Y-%m-%d %H:%M')}")
    print(f"  {'=' * 60}")
    print(f"  Config: {args.config}")
    print(f"  Campaigns: {len(campaigns)} enabled")
    for c in campaigns:
        print(f"    - {c['name']}")
    print()

    # Preflight: health-check every source referenced by enabled campaigns
    healthy = _preflight(campaigns, registry, args.source)

    if args.dry_run:
        print("  DRY RUN — plan")
        print(f"  {'-' * 60}")
        for src_name, region, _crit, camp_name in _planned_searches(campaigns, args.source):
            health = 'OK' if src_name in healthy else 'SKIP'
            print(f"  [{health:4}] {src_name:22}  {region or '-':22}  {camp_name}")
        return 0

    if not healthy:
        print("  No healthy sources — nothing to do.")
        return 1

    # Run
    store = load()
    pre_count = len(store)
    known_urls = set(store.keys())  # snapshot at start, threaded into sources
    total_new = 0
    by_source = {}
    by_region = {}
    # Sanity rail bookkeeping: how often each healthy source was searched and
    # how many results it yielded in total (new or already known).
    per_source_attempts = {}   # source → total searches run
    per_source_total = {}      # source → total results yielded (incl. known)
    now_iso = datetime.now().isoformat()

    for src_name, region, crit, camp_name in _planned_searches(campaigns, args.source):
        if src_name not in healthy:
            continue
        print(f"  [{src_name}] {region or '-'}  ({camp_name})")
        per_source_attempts[src_name] = per_source_attempts.get(src_name, 0) + 1

        batch_new, batch_total, error = _search_into_store(
            healthy[src_name], crit, store, known_urls, now_iso)

        # Account for and persist whatever was upserted, even when the source
        # failed part-way: those hits are already in the in-memory store and
        # would otherwise be lost if no later batch triggers a persist.
        if batch_new:
            total_new += batch_new
            by_source[src_name] = by_source.get(src_name, 0) + batch_new
            region_key = region or '-'
            by_region[region_key] = by_region.get(region_key, 0) + batch_new
        per_source_total[src_name] = per_source_total.get(src_name, 0) + batch_total

        if batch_new and not args.no_persist:
            try:
                persist(store)
            except Exception as e:
                # Keep going like before, but do not blame the source for a
                # storage failure.
                print(f"    !! persist error: {type(e).__name__}: {str(e)[:80]}")

        if error is None:
            print(f"    -> {batch_new} new ({batch_total} total returned)")
        else:
            print(f"    !! source error: {type(error).__name__}: {str(error)[:80]}")
            if batch_total:
                print(f"    -> {batch_new} new ({batch_total} total returned) before the error")

    # Sanity rail — flag healthy sources that ran but yielded ZERO results
    # across every search. Note: counts are post-dedup (sources pre-filter
    # known URLs), so 0 can mean "everything already known" rather than
    # "broken". Heuristic: only note it when EVERY attempted source was
    # searched at least twice and returned 0, AND the store already held more
    # than 50 entries at start. Single-source-zero is too noisy to warn on —
    # the user sees the per-search "-> 0 new" lines and judges.
    healthy_with_zero = [s for s, attempts in per_source_attempts.items()
                         if attempts >= 2 and per_source_total.get(s, 0) == 0]
    all_zero = len(healthy_with_zero) == len(per_source_attempts)
    suspicious = healthy_with_zero if all_zero and pre_count > 50 else []

    print()
    print(f"  {'=' * 60}")
    print(f"  DONE — {total_new} new properties ({pre_count} -> {len(store)} total)")
    print(f"  {'=' * 60}")
    if by_source:
        print("  By source:")
        for s, n in sorted(by_source.items(), key=lambda x: -x[1]):
            print(f"    {s:22}  +{n}")
    if by_region:
        print("  By region:")
        for r, n in sorted(by_region.items(), key=lambda x: -x[1]):
            print(f"    {r:22}  +{n}")

    if suspicious:
        print()
        print("  NOTE — every healthy source returned 0 NEW results across multiple regions.")
        print(f"    Sources affected: {', '.join(suspicious)}")
        print("    This is NORMAL when re-running soon after a successful run (all URLs already known)")
        print("    OR when criteria filters reject everything (check per-region 'Found N' lines above).")
        print("    If you suspect a scraper broke, run: `python3 search_v2.py --source <name> --dry-run`")

    return 0


# Script entry point: main()'s integer return value becomes the exit status.
if __name__ == '__main__':
    sys.exit(main())
