"""Property-source contract.

Every source module in this directory exposes one class that implements
`Source`. The orchestrator (search_v2.py) discovers them, calls health()
to gate pre-flight, then iterates search() per campaign.

Adding a new platform = drop a new file in here + register it in
sources/__init__.py registry + add to campaigns.yaml. No orchestrator
changes.

Design constraints (6L):
- Sources MUST NOT touch the store directly; they yield normalized dicts
  and the orchestrator handles persistence + dedup.
- Sources SHOULD apply upstream criteria filters in their URL/payload when
  the platform supports it. Downstream filtering is wasteful.
- Sources MUST distinguish 'genuinely no results' from 'query failed'.
  health() should fail loud when the platform is unreachable, so the
  orchestrator can skip cleanly rather than record empty results.
"""
from __future__ import annotations

from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Iterator


@dataclass
class SearchCriteria:
    """Platform-neutral search filters.

    Each source maps these values onto its own URL parameters or request
    payload. An attribute left at None imposes no constraint. `country` is
    the only field without a default, because most platforms are
    single-country.
    """
    # Country code, e.g. 'FR', 'IT', 'ES', 'PT', 'NL'.
    country: str
    # Region slug, expressed in the platform's own vocabulary.
    region: str | None = None
    # Finer-grained area than region (FR départements).
    department: str | None = None
    # Price bounds.
    min_price: int | None = None
    max_price: int | None = None
    # Minimum land / building surface, in m².
    min_land_m2: int | None = None
    min_building_m2: int | None = None
    # Minimum number of bedrooms.
    min_bedrooms: int | None = None
    # Crawl depth per region.
    max_pages: int = 3
    # Cap on results per source per campaign; None leaves it uncapped.
    limit: int | None = None


@dataclass
class PropertyHit:
    """Normalized property record yielded by Source.search().

    Only `url` and `source` are required. Everything else is best-effort.
    The orchestrator upserts these into the store; downstream pipeline steps
    (geocode, amenities, analyze, enrich, score) fill the rest.
    """
    url: str
    source: str                                # short name matching the source module
    title: str | None = None
    price: int | None = None
    city: str | None = None
    country: str | None = None
    building_size: int | None = None           # m²
    land_size: int | None = None               # m²
    bedrooms: int | None = None
    rooms: int | None = None
    thumbnail: str | None = None
    search_region: str | None = None
    extra: dict = field(default_factory=dict)  # source-specific fields not standardized

    def to_store_fields(self) -> dict:
        """Convert to the flat dict shape store.upsert() expects.

        Returns:
            A new dict holding `url` and `source`, then every standardized
            attribute whose value is not None, then the entries of `extra`
            whose keys are not already present.

        Standardized values always win over `extra`: an `extra` entry can
        supply a key only when the matching attribute is None (or the key is
        not a standardized one). This keeps a stray 'url' or 'source' inside
        `extra` from overwriting the record's identity. `extra` itself is
        never mutated.
        """
        out = {'url': self.url, 'source': self.source}
        for k in ('title', 'price', 'city', 'country', 'building_size', 'land_size',
                  'bedrooms', 'rooms', 'thumbnail', 'search_region'):
            v = getattr(self, k)
            # Only None is skipped; falsy-but-real values such as 0 are kept.
            if v is not None:
                out[k] = v
        # Extras fill gaps only; they never clobber a normalized field.
        for k, v in self.extra.items():
            out.setdefault(k, v)
        return out


class Source(ABC):
    """Abstract interface that each property source must implement."""

    # Short slug identifying the source, e.g. 'greenacres'.
    name: str
    # ISO codes of the countries this source covers.
    countries: list[str]
    # Set to True when the source needs a stored token.
    requires_auth: bool = False

    @abstractmethod
    def health(self) -> tuple[bool, str]:
        """Check, ahead of searching, whether the platform is reachable.

        Invoked once per run, before any search() call.

        Returns:
            An (ok, reason) pair. When ok is False the orchestrator SKIPS
            every search for this source and prints the reason in its health
            line. Typical failure reasons: 'auth expired',
            'HTTP 403 (Cloudflare)', 'DataDome CAPTCHA', 'connect timeout'.
        """

    @abstractmethod
    def search(self, criteria: SearchCriteria,
               known_urls: set[str] | None = None) -> Iterator[PropertyHit]:
        """Yield the properties matching one campaign×region combination.

        Implementations SHOULD bound the crawl with criteria.limit and
        criteria.max_pages, and SHOULD push filters (price/size) upstream
        into the URL/payload whenever the platform supports it.

        Args:
            criteria: filters to apply.
            known_urls: optional set of URLs the orchestrator has already
                seen. Skip these EARLY (before the HTTP fetch when possible,
                otherwise before yielding) so that the limit counts FRESH
                discoveries rather than re-discoveries of cached data.
                None means every result is treated as new.
        """


class DisabledSource(Source):
    """Source base class for platforms that are parked (e.g. bot-blocked).

    It lets a blocked source stay visible in the orchestrator and the yaml
    config without conditional imports or special-casing: health() always
    reports failure and search() never yields anything.
    """
    def __init__(self, name: str, countries: list[str], reason: str):
        """Record the source identity and why it is disabled.

        Args:
            name: short slug of the source.
            countries: country codes the source would cover.
            reason: explanation that health() reports back.
        """
        self._disabled_reason = reason
        self.countries = countries
        self.name = name

    def health(self) -> tuple[bool, str]:
        """Always report not-ok, with the stored reason prefixed by 'disabled: '."""
        message = f'disabled: {self._disabled_reason}'
        return (False, message)

    def search(self, criteria: SearchCriteria,
               known_urls: set[str] | None = None) -> Iterator[PropertyHit]:
        """Return an empty iterator; both arguments are ignored."""
        no_hits = ()
        return iter(no_hits)