#!/usr/bin/env python3
"""
Lightweight quality gate for FarmMatch data.

Purpose:
- Catch stale/partial analysis before unfavoriting
- Detect duplicate URLs and missing scores
- Summarize geocoding coverage from enriched_data.json
"""
import csv
import json
import os
import time
from pathlib import Path
from typing import Dict, List, Tuple


def _file_age_hours(path: Path) -> float:
    """Return the age of *path* in hours, based on its modification time.

    Args:
        path: File whose last-modified timestamp is inspected.

    Returns:
        Hours elapsed since the file was last modified, or ``float("inf")``
        when the file does not exist (infinity compares greater than any
        finite staleness threshold).
    """
    try:
        # Stat directly rather than checking exists() first: the file could
        # disappear between the two calls, which would raise instead of
        # reporting the file as infinitely old.
        modified_at = path.stat().st_mtime
    except (FileNotFoundError, NotADirectoryError):
        return float("inf")
    return (time.time() - modified_at) / 3600


def _load_csv(path: Path) -> Tuple[List[Dict], List[str]]:
    """Load a CSV file into row dicts plus its header names.

    Args:
        path: CSV file to read; its first line is used as the header.

    Returns:
        A ``(rows, columns)`` tuple. ``rows`` holds one dict per data line,
        keyed by column name; ``columns`` lists the header names in file
        order and is empty when the file has no header line.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # "utf-8-sig" reads plain UTF-8 unchanged but strips a leading byte-order
    # mark. With plain "utf-8" the BOM would become part of the first column
    # name, so lookups by that column name would find nothing.
    with path.open(newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        rows = list(reader)
        # Read fieldnames while the file is still open: for an empty file the
        # reader would otherwise try to pull the header from a closed handle.
        columns = list(reader.fieldnames or [])
    return rows, columns


def _load_json(path: Path) -> List[Dict]:
    """Load a JSON array from *path*, returning ``[]`` on any failure.

    Args:
        path: JSON file expected to contain a top-level array.

    Returns:
        The parsed list, or ``[]`` when the file is missing, unreadable, not
        valid UTF-8, not valid JSON, or its top-level value is not a list.
    """
    try:
        # "utf-8-sig" accepts files with or without a byte-order mark; the
        # JSON parser rejects a BOM that is left in the decoded text.
        with path.open(encoding="utf-8-sig") as f:
            data = json.load(f)
    except (OSError, ValueError):
        # OSError covers missing/unreadable paths (including directories);
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return []
    return data if isinstance(data, list) else []


def _is_missing_value(value) -> bool:
    """Return True when *value* is an absent, blank, or NaN placeholder."""
    if value is None:
        return True
    if isinstance(value, str):
        return value.strip().lower() in ("", "nan")
    # json.load turns a bare NaN token into float("nan"), which never equals
    # the string "nan"; NaN is the only float that is not equal to itself.
    return isinstance(value, float) and value != value


def run_quality_gate(
    csv_path: str = "analysis_output.csv",
    json_path: str = "enriched_data.json",
    stale_hours: int = 48,
) -> Dict:
    """Run quality checks on the analysis CSV and enriched JSON.

    Args:
        csv_path: Path to the analysis CSV.
        json_path: Path to the enriched JSON file (a top-level array).
        stale_hours: Age in hours above which a file is reported as stale.

    Returns:
        A report dict with the keys ``errors`` (list of str), ``warnings``
        (list of str), ``stats`` (dict of counters and file ages) and
        ``passed`` (bool, True only when ``errors`` is empty).

        Errors are raised for: a missing or unreadable CSV, a combined score
        missing on more than 20% of rows, and duplicate URLs. Warnings are
        raised for: stale files, an empty CSV, any missing combined score
        below the error threshold, a risk profile missing on more than 50% of
        rows, geocoding missing on more than 30% of rows/entries, a
        missing/invalid/empty JSON file, and CSV/JSON counts that differ by
        more than ``max(2, 10% of the CSV row count)``.
    """
    report: Dict = {"errors": [], "warnings": [], "stats": {}, "passed": False}
    errors = report["errors"]
    warnings = report["warnings"]
    stats = report["stats"]

    csv_file = Path(csv_path)
    json_file = Path(json_path)

    # File existence
    if not csv_file.exists():
        errors.append(f"{csv_path} missing")
        return report

    try:
        csv_rows, columns = _load_csv(csv_file)
    except (OSError, UnicodeDecodeError, csv.Error) as exc:
        # Report an unreadable CSV through the gate instead of crashing, so
        # callers always receive a structured (failed) report.
        errors.append(f"{csv_path} unreadable: {exc}")
        return report

    csv_count = len(csv_rows)
    stats["csv_rows"] = csv_count
    stats["csv_age_hours"] = round(_file_age_hours(csv_file), 1)

    if stats["csv_age_hours"] > stale_hours:
        warnings.append(f"{csv_path} is stale ({stats['csv_age_hours']}h old)")

    if not csv_count:
        # Every ratio check below is vacuously satisfied by zero rows, so make
        # the empty input visible explicitly.
        warnings.append(f"{csv_path} contains no rows")

    # Basic score coverage
    missing_weighted = sum(
        1 for row in csv_rows if _is_missing_value(row.get("Gewogen Score"))
    )
    missing_combined = sum(
        1
        for row in csv_rows
        if _is_missing_value(row.get("overall_score_combined"))
    )
    stats["missing_weighted"] = missing_weighted
    stats["missing_combined"] = missing_combined

    if csv_count:
        if missing_combined / csv_count > 0.2:
            errors.append(
                f"Combined score missing for {missing_combined}/{csv_count} properties"
            )
        elif missing_combined > 0:
            warnings.append(
                f"Combined score missing for {missing_combined} properties"
            )

    # Duplicate URL detection: every repeat occurrence after the first counts.
    seen = set()
    duplicates = []
    for row in csv_rows:
        url = row.get("URL")
        if not url:
            continue
        if url in seen:
            duplicates.append(url)
        seen.add(url)

    if duplicates:
        errors.append(f"Duplicate URLs in CSV: {len(duplicates)} found")

    # Risk profile coverage (only checked when the column exists)
    if "risk_profile" in columns:
        missing_risk = sum(
            1 for row in csv_rows if _is_missing_value(row.get("risk_profile"))
        )
        stats["missing_risk"] = missing_risk
        if missing_risk > csv_count * 0.5:
            warnings.append(
                f"Risk profile missing for {missing_risk}/{csv_count} properties"
            )

    # CSV geocoding coverage (only checked when both columns exist)
    if "Latitude" in columns and "Longitude" in columns:
        missing_geo_csv = sum(
            1
            for row in csv_rows
            if _is_missing_value(row.get("Latitude"))
            or _is_missing_value(row.get("Longitude"))
        )
        stats["missing_geocoding_csv"] = missing_geo_csv
        if missing_geo_csv > csv_count * 0.3:
            warnings.append(
                f"Geocoding missing in CSV for {missing_geo_csv}/{csv_count} properties"
            )

    # JSON/geocoding coverage
    enriched = _load_json(json_file)
    json_count = len(enriched)
    stats["json_rows"] = json_count
    if enriched:
        stats["json_age_hours"] = round(_file_age_hours(json_file), 1)
        if stats["json_age_hours"] > stale_hours:
            warnings.append(
                f"{json_path} is stale ({stats['json_age_hours']}h old)"
            )

        # Entries that are not objects cannot carry coordinates, so they are
        # counted as missing rather than crashing on ``.get``.
        missing_geo = sum(
            1
            for prop in enriched
            if not isinstance(prop, dict)
            or _is_missing_value(prop.get("lat"))
            or _is_missing_value(prop.get("lon"))
        )
        stats["missing_geocoding"] = missing_geo
        if missing_geo > json_count * 0.3:
            warnings.append(
                f"Geocoding missing for {missing_geo}/{json_count} properties"
            )

        # CSV vs JSON consistency
        if csv_count and abs(json_count - csv_count) > max(2, csv_count * 0.1):
            warnings.append(
                f"CSV ({csv_count}) and JSON ({json_count}) counts differ"
            )
    else:
        warnings.append(f"{json_path} missing or invalid - map may be stale")

    report["passed"] = not errors
    return report


def _print_report(report: Dict):
    """Write a human-readable rendering of *report* to stdout.

    Prints a banner, then the ``stats`` entries (if any), the ``errors`` and
    ``warnings`` lists (each only when non-empty), and finally a pass/fail
    line chosen by ``report["passed"]``.
    """
    rule = "=" * 70
    print(rule)
    print("📋 FARM MATCH QUALITY GATE")
    print(rule)

    stat_entries = report.get("stats", {})
    if stat_entries:
        print("\nStats:")
        for label, figure in stat_entries.items():
            print(f"  - {label}: {figure}")

    # Errors and warnings share one layout: a heading followed by bullets.
    sections = (
        ("errors", "\n❌ Errors:"),
        ("warnings", "\n⚠️  Warnings:"),
    )
    for report_key, heading in sections:
        messages = report[report_key]
        if not messages:
            continue
        print(heading)
        for message in messages:
            print(f"  - {message}")

    verdict = (
        "\n✅ Quality gate passed"
        if report["passed"]
        else "\n❌ Quality gate failed"
    )
    print(verdict)


def main():
    """Run the gate with default paths, print the report, return an exit code.

    Returns 0 when the gate passed and 1 otherwise.
    """
    outcome = run_quality_gate()
    _print_report(outcome)
    if outcome["passed"]:
        return 0
    return 1


if __name__ == "__main__":
    # Propagate the gate result to the shell as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
