"""Email-driven property source ingestion. Reads property-alert emails from Mail.app via osascript. Bypasses scraper bot blocks because emails are pushed officially by the platforms after the user sets up a saved search. Supports (as of 2026-05-27): - Leggett (notifications@hestia.leggett.fr) — daily ~10 properties/email - Properstar (noreply@properstar.com) — saved-search updates Why this matters: leggett.fr is Cloudflare-blocked for our scraper, but the email alerts arrive freely with the same properties + a parseable REF that maps to frenchestateagents.com URLs (verified — REF format identical to what the legacy scraper used before being blocked). Adding a new sender: 1. Set up a saved search on the platform (logged in, "save this search + email me alerts"). Verify alerts arrive. 2. Inspect a sample email body — find regex patterns for url/price/region. 3. Add a parse_() function returning Iterator[PropertyHit]. 4. Add a Source subclass mirroring the pattern below. 5. Register in sources/__init__.py. 6. Add to campaigns.yaml. """ from __future__ import annotations import re import subprocess import sys from pathlib import Path from typing import Iterator SCRIPT_DIR = Path(__file__).resolve().parent.parent if str(SCRIPT_DIR) not in sys.path: sys.path.insert(0, str(SCRIPT_DIR)) from sources._base import PropertyHit, SearchCriteria, Source # noqa: E402 # ─── osascript-based Mail.app reader ───────────────────────────────── # Query the user's Mail.app for emails matching a sender substring, within # a date window. Returns the structured chunks to the Python caller. # Note: scans the most recent 500 messages per inbox; sufficient for the # weekly cadence + 30d windows we use here. 
# AppleScript executed by `osascript`. argv: (1) sender substring,
# (2) look-back window in days. Emits one "===EMAIL===" delimited record per
# matching message: SENDER / DATE / SUBJECT header lines, then "BODY:" and
# the message content.
OSASCRIPT_QUERY = '''
on run argv
    set senderFilter to item 1 of argv
    set daysBack to (item 2 of argv) as integer
    set cutoffDate to (current date) - (daysBack * days)
    tell application "Mail"
        set targetAccount to first account whose name is "Gmail"
        set theBox to first mailbox of targetAccount whose name is "INBOX"
        set msgs to messages of theBox
        set results to ""
        set i to 1
        set total to count of msgs
        if total > 500 then set total to 500
        repeat while i ≤ total
            set m to item i of msgs
            try
                set theDate to date received of m
                if theDate < cutoffDate then exit repeat
                set s to (sender of m as string)
                if s contains senderFilter then
                    set results to results & "===EMAIL===" & return
                    set results to results & "SENDER: " & s & return
                    set results to results & "DATE: " & (theDate as string) & return
                    set results to results & "SUBJECT: " & (subject of m as string) & return
                    set results to results & "BODY:" & return
                    try
                        set results to results & (content of m as string) & return
                    end try
                end if
            end try
            set i to i + 1
        end repeat
        return results
    end tell
end run
'''

# (line prefix emitted by OSASCRIPT_QUERY, key in the parsed email dict)
_MAIL_HEADER_FIELDS = (
    ('SENDER: ', 'sender'),
    ('DATE: ', 'date'),
    ('SUBJECT: ', 'subject'),
)


def parse_mail_dump(raw: str) -> list[dict]:
    """Parse the text emitted by OSASCRIPT_QUERY into email dicts.

    Args:
        raw: Concatenated records, each introduced by an ``===EMAIL===``
            line, followed by ``SENDER: `` / ``DATE: `` / ``SUBJECT: ``
            header lines, a ``BODY:`` line and then the message content.

    Returns:
        One dict per record with the keys ``sender``, ``date``, ``subject``
        (each present only if its header line was found) and ``body``.
        Records without a ``BODY:`` marker, or whose body is empty or
        whitespace-only, are dropped.
    """
    # AppleScript's `return` is a carriage return. subprocess text mode already
    # translates it to '\n'; normalising here as well makes dumps captured any
    # other way (e.g. saved fixtures) parse identically.
    raw = raw.replace('\r\n', '\n').replace('\r', '\n')

    emails = []
    for chunk in raw.split('===EMAIL===\n'):
        if not chunk.strip():
            continue
        record = {}
        body = None
        lines = chunk.split('\n')
        for idx, line in enumerate(lines):
            if line == 'BODY:':
                # Everything after the marker is message content; stop header
                # parsing so header-looking lines inside the body are ignored.
                body = '\n'.join(lines[idx + 1:])
                break
            for prefix, key in _MAIL_HEADER_FIELDS:
                if line.startswith(prefix):
                    record[key] = line[len(prefix):]
                    break
        # The script always appends a line break after the content, so an empty
        # message still produces '\n' here: test the stripped text, not the raw.
        if body is None or not body.strip():
            continue
        record['body'] = body
        emails.append(record)
    return emails


def fetch_emails(sender_filter: str, days_back: int = 30) -> list[dict]:
    """Return list of {sender, date, subject, body} for matching emails.

    Runs OSASCRIPT_QUERY through ``osascript`` and parses its output with
    parse_mail_dump().

    Args:
        sender_filter: Substring that must occur in the message sender.
        days_back: Only messages received within this many days are returned.

    Returns:
        The parsed emails, or an empty list when nothing matched or the query
        could not be completed: 60 s timeout, non-zero exit status, or
        ``osascript`` missing / not launchable (e.g. on a non-macOS host).
    """
    try:
        result = subprocess.run(
            ['osascript', '-e', OSASCRIPT_QUERY, sender_filter, str(days_back)],
            capture_output=True,
            text=True,
            timeout=60,
        )
    except (subprocess.TimeoutExpired, OSError):
        # OSError covers FileNotFoundError / PermissionError when the binary
        # cannot be started; treated like every other best-effort failure here.
        return []
    if result.returncode != 0:
        return []
    return parse_mail_dump(result.stdout)


# ─── Leggett parser ──────────────────────────────────────────────────

# French département (uppercase, as in Leggett emails) → slug used by our
# search regions. Only listing departments that map to known regions; others
# pass through lowercased.
DEPT_NAMES = {
    "CÔTES-D'ARMOR": "cotes-d-armor",
    'MORBIHAN': 'morbihan',
    'FINISTÈRE': 'finistere',
    'ILLE-ET-VILAINE': 'ille-et-vilaine',
    'VENDÉE': 'vendee',
    'MAYENNE': 'mayenne',
    'ORNE': 'orne',
    'MANCHE': 'manche',
    'AUDE': 'aude',
    'HÉRAULT': 'herault',
    'GARD': 'gard',
    'DRÔME': 'drome',
    'ARDÈCHE': 'ardeche',
    'CHARENTE': 'charente',
    'CHARENTE-MARITIME': 'charente-maritime',
    'DORDOGNE': 'dordogne',
    'LOT': 'lot',
    'CREUSE': 'creuse',
    'CORRÈZE': 'correze',
    'GERS': 'gers',
    'INDRE': 'indre',
    'SARTHE': 'sarthe',
    # Added 2026-05-27 — match the new yaml campaigns
    'CANTAL': 'cantal',
    'AVEYRON': 'aveyron',
    'LOZÈRE': 'lozere',
    'TARN': 'tarn',
    'TARN-ET-GARONNE': 'tarn-et-garonne',
    'PYRÉNÉES-ATLANTIQUES': 'pyrenees-atlantiques',
    'HAUTES-PYRÉNÉES': 'hautes-pyrenees',
    'PYRÉNÉES-ORIENTALES': 'pyrenees-orientales',
    'LOT-ET-GARONNE': 'lot-et-garonne',
}

# URL pattern verified 2026-05-27: matches existing store entries
# (e.g. A43089EDA29 from earlier scrape) — REF alone resolves the page.
LEGGETT_URL_TEMPLATE = 'https://www.frenchestateagents.com/french-property-for-sale/view/{ref}'

# Block markers in the email body:
#   REF : A45183NHA85
#   VENDÉE €266,430 FAI
#   Charming 5-bed country home with pool, terrace ...
RE_LEGGETT_REF = re.compile(r'REF\s*:\s*([A-Z0-9]+)')
RE_LEGGETT_PRICE = re.compile(r"([A-ZÀ-Ÿ'\- ]+?)\s+€([\d,]+)\s*FAI")
RE_LEGGETT_BEDS = re.compile(r'(\d+)[\s-]bed', re.IGNORECASE)

# The unicode object-replacement char (U+FFFC) Mail.app renders for inline
# images. Written as an escape so the character cannot get lost in the source.
_IMAGE_PLACEHOLDER = '\ufffc'


def _substantive_lines(text: str, min_len: int) -> list[str]:
    """Return the stripped lines of *text* that are longer than *min_len*.

    Image placeholders are removed from each line before it is measured, so
    placeholder-only lines drop out and placeholders never leak into a
    description.
    """
    cleaned = (
        ln.replace(_IMAGE_PLACEHOLDER, '').strip()
        for ln in text.split('\n')
    )
    return [ln for ln in cleaned if len(ln) > min_len]


def _first_int(pattern, text: str):
    """Return group 1 of the first *pattern* match in *text* as int, else None.

    Only meant for patterns whose group 1 is ``\\d+``; such a capture always
    converts, so no ValueError handling is needed.
    """
    m = pattern.search(text)
    return int(m.group(1)) if m else None


def parse_leggett(email: dict) -> Iterator[PropertyHit]:
    """Yield PropertyHits from one Leggett alert email body.

    The body is cut into blocks, each running from one ``REF : <code>`` marker
    to the next (or to the end of the body). A block without a
    ``REGION €PRICE FAI`` line, or with an unparseable price, is skipped.

    Args:
        email: Dict as produced by fetch_emails(); ``body`` is parsed,
            ``subject`` and ``date`` are copied into ``extra``.
    """
    body = email.get('body') or ''
    ref_matches = list(RE_LEGGETT_REF.finditer(body))
    for i, m in enumerate(ref_matches):
        ref = m.group(1)
        # Block = from this REF to next REF (or end)
        block_end = ref_matches[i + 1].start() if i + 1 < len(ref_matches) else len(body)
        block = body[m.end():block_end]

        # Region + price (REGION €PRICE FAI)
        pm = RE_LEGGETT_PRICE.search(block)
        if not pm:
            continue
        region_raw = pm.group(1).strip()
        try:
            # The capture is [\d,]+ and may be commas only, hence the guard.
            price = int(pm.group(2).replace(',', ''))
        except ValueError:
            continue

        # Description: first substantive line after the price block
        # (image placeholders and empty/short lines are skipped).
        candidates = _substantive_lines(block[pm.end():], 15)
        description = candidates[0][:300] if candidates else ''

        # Bedroom hint from description
        bedrooms = _first_int(RE_LEGGETT_BEDS, description)

        # Unknown départements pass through lowercased.
        search_region = DEPT_NAMES.get(region_raw, region_raw.lower())

        yield PropertyHit(
            url=LEGGETT_URL_TEMPLATE.format(ref=ref),
            source='leggett_email',
            title=description[:80] if description else f'Leggett {ref}',
            price=price,
            country='FR',
            search_region=search_region,
            bedrooms=bedrooms,
            extra={
                'leggett_ref': ref,
                'leggett_region_raw': region_raw,
                'description': description,
                'email_subject': email.get('subject'),
                'email_date': email.get('date'),
            },
        )


# ─── Properstar parser ───────────────────────────────────────────────

RE_PROPERSTAR_URL = re.compile(r'https://www\.properstar\.\w+/[^\s"\'<>()]+')
RE_PROPERSTAR_PRICE = re.compile(r'EUR\s*([\d.,]+)')
RE_PROPERSTAR_ROOMS = re.compile(r'(\d+)\s*Kamers?', re.IGNORECASE)
RE_PROPERSTAR_BEDS = re.compile(r'(\d+)\s*Bed\.?', re.IGNORECASE)
RE_PROPERSTAR_BATHS = re.compile(r'(\d+)\s*Bad\.?', re.IGNORECASE)

# Path fragments that identify a listing page (vs tracking/unsubscribe links).
_PROPERSTAR_LISTING_MARKERS = ('/listing/', '/annonce', '/property/', '/eigendom/')


def parse_properstar(email: dict) -> Iterator[PropertyHit]:
    """Yield PropertyHits from one Properstar alert email body.

    Properstar emails contain 1-N listing blocks; each has a properstar.* URL,
    a description line, a room/bed/bath/type line, and EUR price.

    Each distinct listing URL yields one hit, in order of first appearance in
    the body. Price, bedrooms, rooms and description are taken from a window
    of 800 characters before to 200 characters after the URL's first
    occurrence; values that are not found stay ``None`` / empty.

    Args:
        email: Dict as produced by fetch_emails(); ``body`` is parsed,
            ``subject`` and ``date`` are copied into ``extra``.
    """
    body = email.get('body') or ''
    # dict.fromkeys de-duplicates while keeping first-seen order; iterating a
    # set here made the hit order differ from run to run.
    listing_urls = [
        u for u in dict.fromkeys(RE_PROPERSTAR_URL.findall(body))
        if any(marker in u for marker in _PROPERSTAR_LISTING_MARKERS)
    ]

    for url in listing_urls:
        # Pull values near the URL position
        idx = body.find(url)
        nearby = body[max(0, idx - 800): idx + 200]

        price = None
        pm = RE_PROPERSTAR_PRICE.search(nearby)
        if pm:
            try:
                # Properstar uses "EUR 349.800" (Dutch/European format: . = thousand sep)
                price = int(pm.group(1).replace('.', '').replace(',', ''))
            except ValueError:
                # Capture held separators only; leave the price unknown.
                pass

        beds = _first_int(RE_PROPERSTAR_BEDS, nearby)
        rooms = _first_int(RE_PROPERSTAR_ROOMS, nearby)

        # Description: first substantive, non-price line in the window
        desc_lines = [ln for ln in _substantive_lines(nearby, 20) if 'EUR' not in ln]
        description = desc_lines[0][:300] if desc_lines else ''

        yield PropertyHit(
            url=url,
            source='properstar_email',
            title=description[:80] if description else 'Properstar listing',
            price=price,
            country='FR',  # Properstar emails to NL user are dominantly FR; refined post-geocode
            bedrooms=beds,
            rooms=rooms,
            extra={
                'description': description,
                'email_subject': email.get('subject'),
                'email_date': email.get('date'),
            },
        )


# ─── Source classes ──────────────────────────────────────────────────

class _EmailSourceBase(Source):
    """Shared logic: read once per orchestrator run, then serve from cache.

    The orchestrator calls search() once per (region × source × campaign).
    For email sources the data isn't region-specific (emails come pre-
    filtered by the user's saved searches), so we parse once on the first
    call and serve cached PropertyHits on subsequent calls, with downstream
    criteria filters applied per call.

    A concrete source sets ``_sender_filter``, ``_no_mail_hint`` and
    ``_parse_email``; overriding ``_load()`` directly also still works.
    """

    requires_auth = False

    # Substring matched against the Mail.app sender (passed to fetch_emails).
    _sender_filter = ''
    # Look-back window in days, also quoted in the health() messages.
    _days_back = 30
    # Advice shown by health() when no alert email was found.
    _no_mail_hint = ''

    def __init__(self):
        self._cache: list[PropertyHit] | None = None

    @staticmethod
    def _parse_email(email: dict) -> Iterator[PropertyHit]:
        """Turn one fetched email into hits; provided by each subclass."""
        raise NotImplementedError

    def _fetch(self) -> list[dict]:
        """Fetch this source's alert emails for the configured window."""
        if not self._sender_filter:
            # An empty filter would match every sender in the mailbox.
            raise NotImplementedError
        return fetch_emails(self._sender_filter, days_back=self._days_back)

    def health(self) -> tuple[bool, str]:
        """Return (ok, message): ok only if at least one alert email exists."""
        try:
            emails = self._fetch()
        except Exception as e:
            return False, f'{type(e).__name__}'
        if not emails:
            return False, f'no emails in {self._days_back}d ({self._no_mail_hint})'
        return True, f'{len(emails)} emails in {self._days_back}d'

    def _load(self) -> list[PropertyHit]:
        """Fetch and parse all alert emails, keeping one hit per URL.

        The same listing recurs across alert emails within the window; only
        its first occurrence (in fetch order) is kept so search() does not
        yield a URL more than once.
        """
        hits: list[PropertyHit] = []
        seen_urls: set[str] = set()
        for email in self._fetch():
            for hit in self._parse_email(email):
                if hit.url in seen_urls:
                    continue
                seen_urls.add(hit.url)
                hits.append(hit)
        return hits

    def _ensure_cache(self):
        """Populate the cache on first use."""
        if self._cache is None:
            self._cache = self._load()

    def search(self, criteria: SearchCriteria, known_urls: set[str] | None = None) -> Iterator[PropertyHit]:
        """Yield cached hits that satisfy *criteria* and are not already known.

        Nothing is yielded when ``criteria.country`` is not in ``countries``.
        Hits without a ``search_region`` pass the region filter. A hit with an
        unknown price or bedroom count is treated as 0 for the minimum checks
        (so it is dropped when a minimum is set) but passes the maximum-price
        check.
        """
        if criteria.country not in self.countries:
            return
        self._ensure_cache()
        known = known_urls or set()
        for hit in self._cache:
            if hit.url in known:
                continue
            if criteria.region and hit.search_region and hit.search_region != criteria.region:
                continue
            if criteria.min_price and (hit.price or 0) < criteria.min_price:
                continue
            if criteria.max_price and hit.price and hit.price > criteria.max_price:
                continue
            if criteria.min_bedrooms and (hit.bedrooms or 0) < criteria.min_bedrooms:
                continue
            yield hit


class LeggettEmailSource(_EmailSourceBase):
    """Leggett property alerts read from Mail.app."""

    name = 'leggett_email'
    countries = ['FR']

    _sender_filter = 'leggett'
    _no_mail_hint = 'subscribe at leggett.fr?'
    # staticmethod: keeps the module-level parser from being bound to self.
    _parse_email = staticmethod(parse_leggett)


class ProperstarEmailSource(_EmailSourceBase):
    """Properstar saved-search alerts read from Mail.app."""

    name = 'properstar_email'
    countries = ['FR', 'IT', 'ES', 'PT']

    _sender_filter = 'properstar.com'
    _no_mail_hint = 'set saved-search alerts on properstar.nl?'
    # staticmethod: keeps the module-level parser from being bound to self.
    _parse_email = staticmethod(parse_properstar)