"""Email-driven property source ingestion.

Reads property-alert emails from Mail.app via osascript. Bypasses scraper
bot blocks because emails are pushed officially by the platforms after the
user sets up a saved search.

Supports (as of 2026-05-27):
- Leggett (notifications@hestia.leggett.fr) — daily ~10 properties/email
- Properstar (noreply@properstar.com) — saved-search updates

Why this matters: leggett.fr is Cloudflare-blocked for our scraper, but
the email alerts arrive freely with the same properties + a parseable REF
that maps to frenchestateagents.com URLs (verified — REF format identical
to what the legacy scraper used before being blocked).

Adding a new sender:
1. Set up a saved search on the platform (logged in, "save this search +
   email me alerts"). Verify alerts arrive.
2. Inspect a sample email body — find regex patterns for url/price/region.
3. Add a parse_<source>() function returning Iterator[PropertyHit].
4. Add a Source subclass mirroring the pattern below.
5. Register in sources/__init__.py.
6. Add to campaigns.yaml.
"""
from __future__ import annotations

import re
import subprocess
import sys
from pathlib import Path
from typing import Iterator

# Import bootstrap: SCRIPT_DIR is the parent of the directory containing this
# file. It is prepended to sys.path (once) so that the absolute
# `sources._base` import that follows can resolve.
# NOTE(review): presumably this file lives inside the `sources/` package and
# SCRIPT_DIR is the project's script root — confirm against the repo layout.
SCRIPT_DIR = Path(__file__).resolve().parent.parent
if str(SCRIPT_DIR) not in sys.path:
    sys.path.insert(0, str(SCRIPT_DIR))

from sources._base import PropertyHit, SearchCriteria, Source  # noqa: E402


# ─── osascript-based Mail.app reader ─────────────────────────────────

# AppleScript run via `osascript`: query the user's Mail.app for emails whose
# sender contains a given substring, within a date window.
#
# Arguments (argv): item 1 = sender substring, item 2 = number of days back.
#
# Scope: only the mailbox named "INBOX" of the account named "Gmail" is read,
# and at most the first 500 messages of that mailbox are examined. The loop
# stops at the first message older than the cutoff, so it relies on Mail.app
# listing messages newest-first (NOTE(review): confirm this ordering holds).
# That is sufficient for the weekly cadence + 30d windows we use here.
#
# Output: one text blob returned to the Python caller. Each matching message
# contributes a "===EMAIL===" marker line, then "SENDER: ", "DATE: " and
# "SUBJECT: " lines, a "BODY:" line, and finally the message content. Errors
# while reading an individual message are swallowed by the `try` blocks, so a
# problematic message is skipped (or emitted without body) rather than
# aborting the whole query.
OSASCRIPT_QUERY = '''
on run argv
    set senderFilter to item 1 of argv
    set daysBack to (item 2 of argv) as integer
    set cutoffDate to (current date) - (daysBack * days)
    tell application "Mail"
        set targetAccount to first account whose name is "Gmail"
        set theBox to first mailbox of targetAccount whose name is "INBOX"
        set msgs to messages of theBox
        set results to ""
        set i to 1
        set total to count of msgs
        if total > 500 then set total to 500
        repeat while i ≤ total
            set m to item i of msgs
            try
                set theDate to date received of m
                if theDate < cutoffDate then exit repeat
                set s to (sender of m as string)
                if s contains senderFilter then
                    set results to results & "===EMAIL===" & return
                    set results to results & "SENDER: " & s & return
                    set results to results & "DATE: " & (theDate as string) & return
                    set results to results & "SUBJECT: " & (subject of m as string) & return
                    set results to results & "BODY:" & return
                    try
                        set results to results & (content of m as string) & return
                    end try
                end if
            end try
            set i to i + 1
        end repeat
        return results
    end tell
end run
'''


def fetch_emails(sender_filter: str, days_back: int = 30) -> list[dict]:
    """Return matching Mail.app messages as a list of dicts.

    Runs OSASCRIPT_QUERY through ``osascript`` and parses its text output.

    Args:
        sender_filter: Substring that must occur in the message sender.
        days_back: Size of the look-back window in days.

    Returns:
        One dict per message that has a non-empty body. Each dict has a
        ``body`` key and, when the corresponding header line was present,
        ``sender``, ``date`` and ``subject`` keys (all plain strings).
        An empty list is returned when the query times out (60s) or
        osascript exits non-zero; both cases are logged as warnings so a
        scripting failure can be told apart from "no matching emails".

    Raises:
        OSError: If the ``osascript`` binary cannot be executed (e.g. on a
            non-macOS host). This is intentionally not swallowed.
    """
    import logging  # function-scope import keeps this block self-contained

    log = logging.getLogger(__name__)

    try:
        result = subprocess.run(
            ['osascript', '-e', OSASCRIPT_QUERY, sender_filter, str(days_back)],
            capture_output=True, text=True, timeout=60,
        )
    except subprocess.TimeoutExpired:
        log.warning('Mail.app query for %r timed out after 60s', sender_filter)
        return []
    if result.returncode != 0:
        log.warning(
            'Mail.app query for %r failed (osascript exit %s): %s',
            sender_filter, result.returncode, (result.stderr or '').strip(),
        )
        return []

    # The script separates lines with AppleScript's `return` constant; text
    # mode normalises line endings to '\n', which the splitting below relies
    # on. NOTE(review): presumably `return` is a carriage return — confirm.
    header_fields = (
        ('SENDER: ', 'sender'),
        ('DATE: ', 'date'),
        ('SUBJECT: ', 'subject'),
    )
    emails = []
    for chunk in result.stdout.split('===EMAIL===\n'):
        if not chunk.strip():
            continue
        lines = chunk.split('\n')
        email = {}
        for idx, line in enumerate(lines):
            if line == 'BODY:':
                # Everything after the marker is the body; stop header
                # parsing so body lines are never mistaken for headers.
                email['body'] = '\n'.join(lines[idx + 1:])
                break
            for prefix, key in header_fields:
                if line.startswith(prefix):
                    email[key] = line[len(prefix):]
                    break
        # Messages whose content could not be read (empty body) are dropped.
        if email.get('body'):
            emails.append(email)
    return emails


# ─── Leggett parser ──────────────────────────────────────────────────

# French département name (uppercase, as it appears in Leggett emails) → slug
# used by our search regions.
#
# parse_leggett() looks the captured region text up with a plain dict .get(),
# so keys must match exactly (including accents, hyphens and the straight
# apostrophe). Only départements that map to known regions are listed; any
# other captured text is passed through as `region_raw.lower()`.
DEPT_NAMES = {
    "CÔTES-D'ARMOR": "cotes-d-armor",
    'MORBIHAN': 'morbihan',
    'FINISTÈRE': 'finistere',
    'ILLE-ET-VILAINE': 'ille-et-vilaine',
    'VENDÉE': 'vendee',
    'MAYENNE': 'mayenne',
    'ORNE': 'orne',
    'MANCHE': 'manche',
    'AUDE': 'aude',
    'HÉRAULT': 'herault',
    'GARD': 'gard',
    'DRÔME': 'drome',
    'ARDÈCHE': 'ardeche',
    'CHARENTE': 'charente',
    'CHARENTE-MARITIME': 'charente-maritime',
    'DORDOGNE': 'dordogne',
    'LOT': 'lot',
    'CREUSE': 'creuse',
    'CORRÈZE': 'correze',
    'GERS': 'gers',
    'INDRE': 'indre',
    'SARTHE': 'sarthe',
    # Added 2026-05-27 — match the new yaml campaigns
    'CANTAL': 'cantal',
    'AVEYRON': 'aveyron',
    'LOZÈRE': 'lozere',
    'TARN': 'tarn',
    'TARN-ET-GARONNE': 'tarn-et-garonne',
    'PYRÉNÉES-ATLANTIQUES': 'pyrenees-atlantiques',
    'HAUTES-PYRÉNÉES': 'hautes-pyrenees',
    'PYRÉNÉES-ORIENTALES': 'pyrenees-orientales',
    'LOT-ET-GARONNE': 'lot-et-garonne',
}

# URL pattern verified 2026-05-27: matches existing store entries
# (e.g. A43089EDA29 from earlier scrape) — REF alone resolves the page.
LEGGETT_URL_TEMPLATE = 'https://www.frenchestateagents.com/french-property-for-sale/view/{ref}'

# Shape of one listing inside the email body:
#   REF : A45183NHA85
#   VENDÉE €266,430 FAI
#   Charming 5-bed country home with pool, terrace ...
RE_LEGGETT_REF = re.compile(r'REF\s*:\s*([A-Z0-9]+)')
RE_LEGGETT_PRICE = re.compile(r"([A-ZÀ-Ÿ'\- ]+?)\s+€([\d,]+)\s*FAI")
RE_LEGGETT_BEDS = re.compile(r'(\d+)[\s-]bed', re.IGNORECASE)


def parse_leggett(email: dict) -> Iterator[PropertyHit]:
    """Generate one PropertyHit per priced listing in a Leggett alert email.

    The body is cut into segments at every ``REF : <code>`` marker. A segment
    produces a hit only when it contains a ``REGION €PRICE FAI`` line whose
    price parses as an integer; other segments are skipped silently.

    Args:
        email: Dict as produced by fetch_emails(); ``body`` is parsed, and
            ``subject`` / ``date`` are copied into the hit's ``extra``.

    Yields:
        PropertyHit with source ``leggett_email`` and country ``FR``.
    """
    text = email.get('body', '')
    markers = list(RE_LEGGETT_REF.finditer(text))
    # A listing's segment stops where the next REF marker starts; the last
    # one runs to the end of the body.
    stops = [nxt.start() for nxt in markers[1:]] + [len(text)]

    for marker, stop in zip(markers, stops):
        ref = marker.group(1)
        segment = text[marker.end():stop]

        price_match = RE_LEGGETT_PRICE.search(segment)
        if price_match is None:
            continue
        region_raw = price_match.group(1).strip()
        try:
            price = int(price_match.group(2).replace(',', ''))
        except ValueError:
            # e.g. the digit group consisted of commas only
            continue

        # Description = first line after the price that is longer than 15
        # characters and is not the object-replacement character Mail.app
        # renders for inline images.
        description = ''
        for raw_line in segment[price_match.end():].split('\n'):
            line = raw_line.strip()
            if line and line != '￼' and len(line) > 15:
                description = line[:300]
                break

        # Bedroom count is only a hint taken from the description text.
        beds_match = RE_LEGGETT_BEDS.search(description)
        bedrooms = int(beds_match.group(1)) if beds_match else None

        yield PropertyHit(
            url=LEGGETT_URL_TEMPLATE.format(ref=ref),
            source='leggett_email',
            title=description[:80] or f'Leggett {ref}',
            price=price,
            country='FR',
            search_region=DEPT_NAMES.get(region_raw, region_raw.lower()),
            bedrooms=bedrooms,
            extra={
                'leggett_ref': ref,
                'leggett_region_raw': region_raw,
                'description': description,
                'email_subject': email.get('subject'),
                'email_date': email.get('date'),
            },
        )


# ─── Properstar parser ───────────────────────────────────────────────

RE_PROPERSTAR_URL = re.compile(r'https://www\.properstar\.\w+/[^\s"\'<>()]+')
RE_PROPERSTAR_PRICE = re.compile(r'EUR\s*([\d.,]+)')
RE_PROPERSTAR_ROOMS = re.compile(r'(\d+)\s*Kamers?', re.IGNORECASE)
RE_PROPERSTAR_BEDS = re.compile(r'(\d+)\s*Bed\.?', re.IGNORECASE)
RE_PROPERSTAR_BATHS = re.compile(r'(\d+)\s*Bad\.?', re.IGNORECASE)


def _properstar_int(pattern, text: str):
    """Return the first capture of *pattern* in *text* as int, else None."""
    found = pattern.search(text)
    if not found:
        return None
    try:
        return int(found.group(1))
    except ValueError:
        return None


def parse_properstar(email: dict) -> Iterator[PropertyHit]:
    """Yield PropertyHits from one Properstar alert email body.

    Properstar emails contain 1-N listing blocks; each has a properstar.* URL,
    a description line, a room/bed/bath/type line, and EUR price.

    Listing URLs are de-duplicated and processed in the order they first
    appear in the body, so the output order is stable between runs.

    Args:
        email: Dict as produced by fetch_emails(); ``body`` is parsed, and
            ``subject`` / ``date`` are copied into the hit's ``extra``.

    Yields:
        PropertyHit with source ``properstar_email``. ``price``, ``bedrooms``
        and ``rooms`` are None when no matching text is found near the URL.
    """
    body = email.get('body', '')
    listing_markers = ('/listing/', '/annonce', '/property/', '/eigendom/')
    # Filter to actual listing URLs (skip tracking/unsubscribe links).
    # dict.fromkeys() drops repeats while keeping first-seen order; a set
    # would make the iteration order depend on string hashing.
    listing_urls = [
        u for u in dict.fromkeys(RE_PROPERSTAR_URL.findall(body))
        if any(marker in u for marker in listing_markers)
    ]

    for url in listing_urls:
        # Pull values from a window around the URL's first occurrence:
        # 800 characters before it and 200 from its start onwards.
        # NOTE(review): each value is the first match inside that window, so
        # with tightly packed listings it may belong to a neighbouring block.
        idx = body.find(url)
        nearby = body[max(0, idx - 800): idx + 200]

        price = None
        pm = RE_PROPERSTAR_PRICE.search(nearby)
        if pm:
            try:
                # Properstar uses "EUR 349.800" (Dutch/European format: . = thousand sep)
                price = int(pm.group(1).replace('.', '').replace(',', ''))
            except ValueError:
                # e.g. the captured text held separators only
                price = None

        beds = _properstar_int(RE_PROPERSTAR_BEDS, nearby)
        rooms = _properstar_int(RE_PROPERSTAR_ROOMS, nearby)

        # Description: first line in the window longer than 20 characters
        # that is neither an inline-image placeholder nor a price line.
        desc_lines = [
            ln.strip() for ln in nearby.split('\n')
            if ln.strip() and ln.strip() != '￼' and len(ln.strip()) > 20 and 'EUR' not in ln
        ]
        description = desc_lines[0][:300] if desc_lines else ''

        yield PropertyHit(
            url=url,
            source='properstar_email',
            title=description[:80] if description else 'Properstar listing',
            price=price,
            country='FR',  # Properstar emails to NL user are dominantly FR; refined post-geocode
            bedrooms=beds,
            rooms=rooms,
            extra={
                'description': description,
                'email_subject': email.get('subject'),
                'email_date': email.get('date'),
            },
        )


# ─── Source classes ──────────────────────────────────────────────────

class _EmailSourceBase(Source):
    """Shared logic: read once per orchestrator run, then serve from cache.

    The orchestrator calls search() once per (region × source × campaign).
    For email sources the data isn't region-specific (emails come pre-
    filtered by the user's saved searches), so we parse once on the first
    call and serve cached PropertyHits on subsequent calls, with downstream
    criteria filters applied per call.

    Subclasses provide ``name``, ``countries`` and ``_load()``.
    """
    requires_auth = False

    def __init__(self):
        # None means "not loaded yet"; an empty list is a valid loaded state.
        self._cache: list[PropertyHit] | None = None

    def _load(self) -> list[PropertyHit]:
        """Fetch and parse all emails for this source (subclass hook)."""
        raise NotImplementedError

    def _ensure_cache(self):
        """Populate the cache on first use, keeping one hit per URL.

        The same listing can appear in several alert emails inside the
        look-back window; only its first occurrence (in _load() order) is
        kept so that search() does not yield the same URL more than once.
        """
        if self._cache is not None:
            return
        seen_urls = set()
        unique_hits = []
        for hit in self._load():
            if hit.url in seen_urls:
                continue
            seen_urls.add(hit.url)
            unique_hits.append(hit)
        self._cache = unique_hits

    def search(self, criteria: SearchCriteria,
               known_urls: set[str] | None = None) -> Iterator[PropertyHit]:
        """Yield cached hits that satisfy *criteria* and are not already known.

        Args:
            criteria: Country must be one of ``self.countries``, otherwise
                nothing is yielded (and no emails are read). Region, price
                and bedroom bounds are applied only when set (truthy).
            known_urls: URLs to skip; None is treated as an empty set.

        Yields:
            Matching PropertyHit objects from the cache.
        """
        if criteria.country not in self.countries:
            return
        self._ensure_cache()
        known = known_urls or set()
        for hit in self._cache:
            if hit.url in known:
                continue
            # Hits without a region are never rejected by the region filter.
            if criteria.region and hit.search_region and hit.search_region != criteria.region:
                continue
            # A missing price counts as 0 for the lower bound (so it is
            # dropped when min_price is set) but passes the upper bound.
            if criteria.min_price and (hit.price or 0) < criteria.min_price:
                continue
            if criteria.max_price and hit.price and hit.price > criteria.max_price:
                continue
            # Unknown bedroom count counts as 0 against min_bedrooms.
            if criteria.min_bedrooms and (hit.bedrooms or 0) < criteria.min_bedrooms:
                continue
            yield hit


class LeggettEmailSource(_EmailSourceBase):
    """Email source backed by Leggett alert emails (sender contains 'leggett')."""

    name = 'leggett_email'
    countries = ['FR']

    def health(self) -> tuple[bool, str]:
        """Report whether any Leggett alert arrived within the last 30 days.

        Returns:
            ``(ok, message)``; on an exception the message is the exception
            class name.
        """
        try:
            found = fetch_emails('leggett', days_back=30)
        except Exception as exc:
            return False, type(exc).__name__
        if found:
            return True, f'{len(found)} emails in 30d'
        return False, 'no emails in 30d (subscribe at leggett.fr?)'

    def _load(self) -> list[PropertyHit]:
        """Parse every Leggett alert from the last 30 days into hits."""
        messages = fetch_emails('leggett', days_back=30)
        return [hit for message in messages for hit in parse_leggett(message)]


class ProperstarEmailSource(_EmailSourceBase):
    """Email source backed by Properstar saved-search alert emails."""

    name = 'properstar_email'
    countries = ['FR', 'IT', 'ES', 'PT']

    def health(self) -> tuple[bool, str]:
        """Report whether any Properstar alert arrived within the last 30 days.

        Returns:
            ``(ok, message)``; on an exception the message is the exception
            class name.
        """
        try:
            found = fetch_emails('properstar.com', days_back=30)
        except Exception as exc:
            return False, type(exc).__name__
        if found:
            return True, f'{len(found)} emails in 30d'
        return False, 'no emails in 30d (set saved-search alerts on properstar.nl?)'

    def _load(self) -> list[PropertyHit]:
        """Parse every Properstar alert from the last 30 days into hits."""
        messages = fetch_emails('properstar.com', days_back=30)
        return [hit for message in messages for hit in parse_properstar(message)]
