#!/usr/bin/env python3
"""
Paradisomatch Property Store — Single source of truth.

Usage:
    from store import load, persist, upsert, is_active, detect_source

    store = load()
    upsert(store, 'https://...', {'price': 150000, 'bedrooms': 4})
    persist(store)
"""
import json
import os
import tempfile
from contextlib import asynccontextmanager
from pathlib import Path


def _atomic_dump(obj, p):
    """Serialise ``obj`` as JSON to ``p`` without ever exposing a partial file.

    The payload is written to a temporary file created in the destination's
    own directory and then moved over the destination with os.replace(), so a
    reader sees either the old file or the complete new one. When two writers
    race, the last replace wins and both candidate files are valid JSON.

    NaN/Infinity floats are stripped via _sanitize_for_json() first, and
    ``allow_nan=False`` makes json.dump raise rather than emit any that remain.
    On any failure the temporary file is removed and the exception re-raised.
    """
    target = Path(p)
    handle, tmp_name = tempfile.mkstemp(
        dir=str(target.parent),
        prefix='.' + target.name + '.',
        suffix='.tmp',
    )
    try:
        with os.fdopen(handle, 'w', encoding='utf-8') as out:
            cleaned = _sanitize_for_json(obj)
            json.dump(cleaned, out, indent=2, ensure_ascii=False, allow_nan=False)
        os.replace(tmp_name, target)
    except BaseException:
        # BaseException so the temp file is cleaned up on Ctrl-C as well.
        if os.path.exists(tmp_name):
            os.unlink(tmp_name)
        raise

# Directory holding this module; the default data files sit next to it.
SCRIPT_DIR = Path(__file__).parent
# Default location used by load() / save().
STORE_PATH = SCRIPT_DIR.joinpath('properties.json')
# Default location used by export_enriched().
ENRICHED_PATH = SCRIPT_DIR.joinpath('enriched_data.json')

# 'status' value that is_active() treats as "no longer listed".
STATUS_REMOVED = 'Removed'

# Desktop Chrome user agent handed to the Playwright context in browser_page().
CHROME_UA = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)

# Identifying user agent; presumably for Nominatim requests made elsewhere.
NOMINATIM_UA = 'Paradisomatch/1.0 (property-search project; https://github.com/paradisomatch)'


import copy as _copy

# Maps id(store_dict) → deep-copied baseline snapshot taken by load(). save()
# looks the baseline up by id(store) and uses it for a three-way merge against
# the current on-disk state: any field a concurrent writer changed on disk is
# preserved unless this process actually modified it.
# Prevents the recheck-clobbers-harvester class of bug (single-writer cache race).
# NOTE(review): entries are keyed by object identity and nothing in this module
# removes them, so snapshots appear to live for the whole process and an id()
# could in principle be reused by a later, unrelated dict — confirm acceptable.
_LOAD_BASELINES: dict = {}


def load(path=None):
    """Read the property store and return it as ``{url: property_dict}``.

    ``path`` defaults to STORE_PATH. A missing file yields an empty store. A
    list-format file is re-keyed by each item's 'url' (items without one are
    skipped); any other JSON value is returned as parsed.

    A deep copy of the result is recorded in _LOAD_BASELINES under id(store)
    so that save() can later tell which fields this process changed.
    """
    source = STORE_PATH if not path else Path(path)
    if source.exists():
        with open(source, encoding='utf-8') as fh:
            raw = json.load(fh)
        if isinstance(raw, list):
            store = {entry['url']: entry for entry in raw if 'url' in entry}
        else:
            store = raw
    else:
        store = {}
    # Snapshot before the caller can mutate anything.
    _LOAD_BASELINES[id(store)] = _copy.deepcopy(store)
    return store


def _sanitize_for_json(obj):
    """Return a copy of ``obj`` with NaN/Infinity floats replaced by None.

    Python's json.dump emits literal `NaN` and `Infinity` by default — valid in
    Python's extended JSON but rejected by browser JSON.parse, which breaks the
    map_viewer and shortlist pages on load. Strip them before write so any
    upstream NaN (e.g. from pandas.to_dict, float arithmetic on missing values)
    can't corrupt the JSON output.

    Dicts, lists and tuples are traversed recursively. Tuples come back as
    lists, which is how json serialises them anyway; without this a NaN hidden
    inside a tuple would survive and make a strict ``allow_nan=False`` dump
    raise. Every other value is returned unchanged.
    """
    import math
    if isinstance(obj, float):
        return obj if math.isfinite(obj) else None
    if isinstance(obj, dict):
        return {k: _sanitize_for_json(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [_sanitize_for_json(v) for v in obj]
    return obj


# Unique marker meaning "field not present in the baseline". Always compared by
# identity so it can never be confused with a real field value (None included).
_SENTINEL = object()


def _three_way_merge(baseline: dict, in_mem: dict, on_disk: dict) -> dict:
    """Merge in-memory changes onto current on-disk state without clobbering
    fields that a concurrent writer changed on disk.

    Args:
        baseline: snapshot of the store as this process originally loaded it.
        in_mem: the store as this process has since modified it.
        on_disk: the store as it currently exists on disk.

    Returns:
        A new dict built from a deep copy of ``on_disk``; none of the three
        inputs is mutated. Values taken from ``in_mem`` for an existing URL are
        inserted by reference, not copied.

    Rules per (url, field):
      - field added/changed in memory vs baseline → take in-memory value (this
        process wrote it)
      - field unchanged in memory vs baseline → keep whatever is on disk
      - field absent in memory but present on disk → take on-disk value
        (concurrent writer added it; we shouldn't erase it). This also means a
        field this process removed is NOT removed from disk.
      - URLs in memory but missing on disk → added to the result
      - new URLs on disk → preserve
      - URL deleted in memory (was in baseline) → respect that, remove from disk
    """
    merged = _copy.deepcopy(on_disk)
    # URLs that this process deleted since the baseline was taken.
    for url in baseline.keys() - in_mem.keys():
        merged.pop(url, None)
    for url, mem_p in in_mem.items():
        if url not in merged:
            merged[url] = _copy.deepcopy(mem_p)
            continue
        base_p = baseline.get(url, {})
        out_p = dict(merged[url])
        # Field-level merge
        for k, mem_v in mem_p.items():
            base_v = base_p.get(k, _SENTINEL)
            # Identity test: a value with a permissive __eq__ must not be
            # mistaken for "absent from baseline".
            if base_v is _SENTINEL or mem_v != base_v:
                # This process wrote this field — take in-memory value
                out_p[k] = mem_v
            # else: in_mem == baseline → leave whatever's on disk
        merged[url] = out_p
    return merged


def save(store, path=None):
    """Write store to disk via a three-way merge against current disk state.

    Prevents the single-writer cache race: if another process wrote new fields
    (e.g. photo_urls from greenacres_photo_harvest) while this process was
    running (e.g. recheck_availability marking availability_checked_at), both
    sets of changes survive. Without this merge, last-writer-wins clobbers.

    Args:
        store: the ``{url: property_dict}`` mapping to write. When a merge
            happens it is updated in place to hold the merged result.
        path: destination file; defaults to STORE_PATH.

    If ``store`` has no recorded baseline (it did not come from load()) or the
    destination does not exist yet, the store is written as-is.

    The baseline and the live store are only refreshed after the file has been
    written successfully. If the write raises, in-memory state is left
    untouched, so a retry still sees this process's edits as changes relative
    to the baseline instead of silently discarding them.
    """
    p = Path(path) if path else STORE_PATH
    baseline = _LOAD_BASELINES.get(id(store))
    if baseline is None or not p.exists():
        # No baseline (manual construction) or first write — straight dump
        merged = store
    else:
        with open(p, encoding='utf-8') as f:
            on_disk_raw = json.load(f)
        on_disk = ({item['url']: item for item in on_disk_raw if 'url' in item}
                   if isinstance(on_disk_raw, list) else on_disk_raw)
        merged = _three_way_merge(baseline, store, on_disk)
    _atomic_dump(merged, p)
    if merged is not store:
        # Write succeeded: refresh the baseline so subsequent saves in this
        # process compare against the now-current state...
        _LOAD_BASELINES[id(store)] = _copy.deepcopy(merged)
        # ...and reflect the merged state in the live store dict so subsequent
        # in-process reads see the merged values.
        store.clear()
        store.update(merged)
    print(f"Saved {len(merged)} properties to {p.name}")


def export_enriched(store, path=None):
    """Write the store's values as a JSON list (legacy format).

    Kept for backward compatibility with consumers that expect a list (map
    viewer, scorer). ``path`` defaults to ENRICHED_PATH. The file is written
    through _atomic_dump() and a one-line summary is printed.
    """
    target = ENRICHED_PATH if not path else Path(path)
    records = [*store.values()]
    _atomic_dump(records, target)
    print(f"Exported {len(records)} properties to {target.name}")


def persist(store):
    """Save store + export enriched in one call.

    Calls save(store) and then export_enriched(store), both with their default
    paths. The order matters: save() may update ``store`` in place with the
    merged on-disk state, and the export that follows then sees those values.
    """
    save(store)
    export_enriched(store)


def upsert(store, url, fields):
    """Create or update the property stored under ``url``.

    A missing entry is created as ``{'url': url}``. Every non-None value in
    ``fields`` is then copied onto the entry; None values are ignored and
    fields not mentioned are left as they were. Mutates ``store`` in place and
    returns None.
    """
    record = store.setdefault(url, {'url': url})
    record.update((key, val) for key, val in fields.items() if val is not None)


def is_active(prop):
    """Return True unless the property's 'status' equals STATUS_REMOVED.

    A property with no 'status' key counts as active.
    """
    current_status = prop.get('status')
    return current_status != STATUS_REMOVED


def detect_source(url):
    """Map a listing URL to its source name.

    The URL is checked for known substrings in a fixed order and the first
    match wins; matching is case-sensitive. Returns 'unknown' when nothing
    matches.
    """
    markers = (
        ('frenchestateagents.com', 'leggett'),
        ('properstar', 'properstar'),
        ('idealista', 'idealista'),
    )
    return next((label for needle, label in markers if needle in url), 'unknown')


def get_score(prop):
    """Return the first truthy score among 'cp_score' and 'overall_score'.

    'cp_score' takes precedence. A missing, None or zero score is skipped;
    when neither field holds a truthy value the result is 0.
    """
    for field in ('cp_score', 'overall_score'):
        value = prop.get(field)
        if value:
            return value
    return 0


def short_url(url, n=40):
    """Shorten a URL for display output.

    Returns the last path segment of ``url`` truncated to ``n`` characters.
    Trailing slashes are ignored, so ``.../villa-123/`` yields ``villa-123``
    rather than an empty string. A string without any slash is just truncated.
    If nothing is left after stripping slashes, the original string is
    truncated instead.
    """
    tail = url.rstrip('/').rsplit('/', 1)[-1]
    return (tail or url)[:n]


@asynccontextmanager
async def browser_page(headless=False):
    """Async context manager yielding a fresh Playwright Chromium page.

    Launches Chromium (headed unless ``headless`` is true), opens a context
    that sends CHROME_UA as its user agent, and yields a new page from it.
    The browser is closed when the ``async with`` body exits, whether or not
    it raised. Playwright is imported lazily, so the rest of this module can
    be used without it installed.
    """
    from playwright.async_api import async_playwright

    async with async_playwright() as pw:
        chromium = await pw.chromium.launch(headless=headless)
        ctx = await chromium.new_context(user_agent=CHROME_UA)
        tab = await ctx.new_page()
        try:
            yield tab
        finally:
            await chromium.close()

async def wait_for_cloudflare(page, timeout_ms=8000):
    """Wait for a Cloudflare challenge to resolve.

    If the page title looks like a challenge interstitial (contains
    'just a moment' or 'checking', case-insensitively), wait ``timeout_ms``
    milliseconds once and look at the title again.

    Args:
        page: object exposing awaitable ``title()`` and
            ``wait_for_timeout(ms)`` (e.g. the page from browser_page()).
        timeout_ms: how long to wait before re-checking.

    Returns:
        True if the title does not (or no longer does) look like a challenge,
        False if it still does after the wait.
    """
    def _is_challenge(title):
        # Same test before and after the wait, so a page still showing
        # "Checking..." is not reported as loaded.
        lowered = title.lower()
        return 'just a moment' in lowered or 'checking' in lowered

    if not _is_challenge(await page.title()):
        return True
    await page.wait_for_timeout(timeout_ms)
    return not _is_challenge(await page.title())