#!/usr/bin/env python3
"""
Import Idealista properties from locally saved pages (HTML or text format).

Idealista's DataDome blocks automated scraping. This script parses files
you save manually from your browser (Cmd+S on favorites or search pages).

Usage:
    python3 import_idealista_html.py idealista_pages/
    python3 import_idealista_html.py idealista_pages/ --dry-run
    python3 import_idealista_html.py idealista_pages/ --update
"""
import argparse
import re
from datetime import datetime
from pathlib import Path

from store import load, persist, upsert

# Price window applied by main(): a listing whose parsed price is below
# 'price_min' or above 'price_max' is skipped. Listings for which no price
# could be parsed are not filtered.
CRITERIA = {
    'price_min': 50000,
    'price_max': 450000,
}

# Detect country from URL domain.
# Maps each supported Idealista domain to the country code recorded for a
# listing. The parsers scan these keys in insertion order and stop at the
# first one found, so the order of the entries is significant.
DOMAIN_COUNTRY = {
    'idealista.it': 'IT',
    'idealista.com': 'ES',
    'idealista.pt': 'PT',
}


# Listing URLs are recognised by path only, because the domain is often
# missing or split onto another line in saved text pages.
_LISTING_ID_RE = re.compile(r'(?:immobile|inmueble|imovel)/(\d+)')
_TITLE_RE = re.compile(r'^(.{10,120}?)\s*<')
_PRICE_RE = re.compile(r'([\d.,]+)\s*€')
_SQM_RE = re.compile(r'([\d.,]+)\s*m[²2]')
_ROOMS_RE = re.compile(
    r'(\d+)\s*(?:room|local|vano|stanz|quarto|habitaci)', re.I)
# \b keeps "in" from matching the tail of a word such as "Turin".
_LOCATION_RE = re.compile(r"(?:\bin|,)\s+([A-Z][\w\s'-]+?)$")

# UI labels that mark the end of a listing's description block.
_DESCRIPTION_STOP_MARKERS = frozenset({
    'Contact', 'Favourite', 'Your note', 'Edit',
    'Call', 'View phone', 'Add your note',
    'Contactar', 'Favorito', 'Contacto',
    'Views', 'Photo', 'Land', 'Approximate location.',
    'Photo.', 'Facade', 'Living room', 'Overview',
    'Contacteer', 'Favoriet', 'Bel', 'Telefoon bekijken',
})


def _first_line_by_id(lines):
    """Map each listing ID to the index of the first line whose URL path holds it."""
    first_line = {}
    for idx, line in enumerate(lines):
        for prop_id in _LISTING_ID_RE.findall(line):
            first_line.setdefault(prop_id, idx)
    return first_line


def _extract_title(lines, found_idx):
    """Return the text preceding '<' on the URL line or the line above, else None."""
    for idx in (found_idx, found_idx - 1):
        if 0 <= idx < len(lines):
            hit = _TITLE_RE.match(lines[idx])
            if hit:
                return hit.group(1).strip()
    return None


def _extract_number(lines, found_idx, pattern, min_digits=1):
    """Return the first number matched by *pattern* in the 5 lines from the URL line.

    Only the first matching line is considered; if its digits (with '.' and
    ',' separators removed) are not a valid integer of at least *min_digits*
    digits, None is returned without looking further.
    """
    for line in lines[found_idx:found_idx + 5]:
        hit = pattern.search(line)
        if hit:
            digits = re.sub(r'[.,]', '', hit.group(1))
            if digits.isdigit() and len(digits) >= min_digits:
                return int(digits)
            return None
    return None


def _extract_rooms(lines, found_idx):
    """Return the room count nearest after the URL line, else None.

    The lines from the URL onward are searched first so that a room count
    belonging to the preceding listing is not picked up; the wider window
    (5 lines before to 25 lines after) is used only as a fallback.
    """
    start = max(0, found_idx - 5)
    end = min(len(lines), found_idx + 25)
    hit = (_ROOMS_RE.search(' '.join(lines[found_idx:end]))
           or _ROOMS_RE.search(' '.join(lines[start:end])))
    return int(hit.group(1)) if hit else None


def _extract_description(lines, found_idx):
    """Return up to 300 characters of description text after the URL line, else None."""
    desc_lines = []
    past_size_line = False
    for line in lines[found_idx + 1:found_idx + 20]:
        stripped = line.strip()
        if not stripped:
            if past_size_line and desc_lines:
                break  # Empty line after description = done
            continue
        if re.match(r'^\d[\d.,]*\s*€', stripped):
            continue  # price line
        if re.match(r'^[\d.,]+\s*m[²2]', stripped):
            past_size_line = True
            continue
        if stripped in _DESCRIPTION_STOP_MARKERS:
            break
        if re.match(r'^\d+/\d+$', stripped):
            continue  # photo counter such as "1/12"
        if stripped.startswith('<'):
            continue  # URL reference, never description text
        if past_size_line:
            # Any substantial text after the size line counts.
            if len(stripped) > 15:
                desc_lines.append(stripped)
        elif len(stripped) > 40:
            # Long text before a size line still looks like a description.
            desc_lines.append(stripped)
            past_size_line = True
    if not desc_lines:
        return None
    return ' '.join(desc_lines)[:300]


def parse_text_format(text):
    """Parse Idealista pages saved as text (Cmd+S produces this format).

    Each listing block looks like:
        Title text <https://www.idealista.it/en/immobile/12345678/>
        120,000€
        150 m² 4 rooms ...
        Description text...
        Contact

    Args:
        text: Full content of the saved page.

    Returns:
        A list of dicts, one per distinct listing ID (sorted as strings), with
        the keys 'url', 'price', 'location', 'title', 'sqm', 'rooms',
        'country' and 'description'. Fields that cannot be extracted are None.
    """
    # Detect domain from file content; first known domain in the head wins.
    head = text[:5000]
    domain = next((d for d in DOMAIN_COUNTRY if d in head), 'idealista.it')
    country = DOMAIN_COUNTRY.get(domain, 'IT')

    # Determine listing path type
    if 'idealista.com' in domain:
        path_type = 'inmueble'
    elif 'idealista.pt' in domain:
        path_type = 'imovel'
    else:
        path_type = 'immobile'

    lines = text.split('\n')
    # Anchor every listing on the line that actually carries its URL path.
    # A plain substring test on the ID would also hit unrelated digits
    # (longer IDs, phone numbers, prices) and anchor on the wrong line.
    first_line = _first_line_by_id(lines)

    properties = []
    for prop_id in sorted(first_line):
        found_idx = first_line[prop_id]
        title = _extract_title(lines, found_idx)

        # Location is taken from the tail of the title ("... in Town" / "..., Town").
        location = None
        if title:
            loc_match = _LOCATION_RE.search(title)
            if loc_match:
                location = loc_match.group(1).strip()

        properties.append({
            'url': f'https://www.{domain}/en/{path_type}/{prop_id}/',
            # Prices need at least 4 digits to be accepted.
            'price': _extract_number(lines, found_idx, _PRICE_RE, min_digits=4),
            'location': location,
            'title': title,
            'sqm': _extract_number(lines, found_idx, _SQM_RE),
            'rooms': _extract_rooms(lines, found_idx),
            'country': country,
            'description': _extract_description(lines, found_idx),
        })

    return properties


def _search_near_link(pattern, forward, window):
    """Search *forward* (anchor tag onward) first, then the whole *window*."""
    return re.search(pattern, forward) or re.search(pattern, window)


def parse_html_format(html):
    """Parse Idealista pages saved as raw HTML.

    Every distinct listing link (``href`` pointing at an immobile / inmueble /
    imovel path) produces one entry. Price, size and title are looked up in
    the markup around the link: first from the start of the link's tag up to
    1500 characters after it, then, as a fallback, in the full window that
    also covers the 1500 characters before it. Searching forward first avoids
    picking up values that belong to the preceding listing.

    Args:
        html: Full content of the saved page.

    Returns:
        A list of dicts with the keys 'url', 'price', 'location', 'title',
        'sqm', 'rooms' and 'country'. 'location' and 'rooms' are always None;
        other fields are None when they cannot be extracted. 'country' is
        '??' when the URL contains no known domain.
    """
    properties = []
    seen = set()

    url_pattern = re.compile(
        r'href="((?:https?://www\.idealista\.\w+)?/(?:en/)?(?:immobile|inmueble|imovel)/\d+[^"]*)"'
    )

    # The page domain depends only on the file, so detect it once.
    head = html[:5000]
    page_domain = next((d for d in DOMAIN_COUNTRY if d in head), None)
    # Fallback for relative links when the page head names no domain: each
    # site uses its own path segment for listings.
    domain_by_segment = {
        'immobile': 'idealista.it',
        'inmueble': 'idealista.com',
        'imovel': 'idealista.pt',
    }

    for match in url_pattern.finditer(html):
        raw_url = match.group(1)
        if not raw_url.startswith('http'):
            domain = page_domain
            if domain is None:
                # url_pattern guarantees one of these segments is present.
                segment = re.search(r'/(immobile|inmueble|imovel)/', raw_url)
                domain = domain_by_segment[segment.group(1)]
            raw_url = f'https://www.{domain}{raw_url}'
        url = raw_url.split('?')[0].rstrip('/')
        if url in seen:
            continue
        seen.add(url)

        country = '??'
        for domain, cc in DOMAIN_COUNTRY.items():
            if domain in url:
                country = cc
                break

        # Context for price/size extraction
        start = max(0, match.start() - 1500)
        end = min(len(html), match.end() + 1500)
        context = html[start:end]
        # Start the forward view at the opening '<' of the link's tag so that
        # attributes written before href (e.g. class="item-link") are included.
        tag_start = html.rfind('<', start, match.start())
        if tag_start == -1:
            tag_start = match.start()
        forward = html[tag_start:end]

        price = None
        price_match = _search_near_link(r'item-price[^>]*>([^<]*)', forward, context)
        if price_match:
            digits = re.sub(r'[^\d]', '', price_match.group(1))
            if digits:
                price = int(digits)

        sqm = None
        sqm_match = _search_near_link(r'(\d[\d.,]*)\s*m[²2\s]', forward, context)
        if sqm_match:
            # The group always starts with a digit, so int() cannot fail here.
            sqm = int(re.sub(r'[.,]', '', sqm_match.group(1)))

        title = None
        title_match = _search_near_link(r'item-link[^>]*>([^<]+)', forward, context)
        if title_match:
            title = title_match.group(1).strip()

        properties.append({
            'url': url, 'price': price, 'location': None,
            'title': title, 'sqm': sqm, 'rooms': None, 'country': country,
        })

    return properties


def parse_file(filepath):
    """Read a saved page and dispatch it to the matching parser.

    The file is decoded as UTF-8 with undecodable bytes dropped. Content that
    shows HTML markup (an ``<article`` tag or ``class="item`` anywhere, or
    ``<html`` within the first 500 characters) goes to the HTML parser;
    everything else is handled as the plain-text format.
    """
    content = filepath.read_text(encoding='utf-8', errors='ignore')

    html_hints = (
        '<article' in content,
        'class="item' in content,
        '<html' in content[:500],
    )
    parser = parse_html_format if any(html_hints) else parse_text_format
    return parser(content)


def _store_fields(prop, is_existing, now):
    """Build the field dict passed to upsert() for one parsed listing.

    'source' and 'country' are always set, 'discovered_at' only for listings
    not yet in the store, and the remaining fields only when the parser
    produced a truthy value for them.
    """
    fields = {
        'source': 'idealista',
        'country': prop.get('country', 'IT'),
    }
    if not is_existing:
        fields['discovered_at'] = now
    # (key in the parsed listing, key in the store)
    optional = (
        ('price', 'price'),
        ('title', 'title'),
        ('location', 'city'),
        ('sqm', 'building_size'),
        ('rooms', 'rooms'),
        ('description', 'summary'),
    )
    for parsed_key, store_key in optional:
        if prop.get(parsed_key):
            fields[store_key] = prop[parsed_key]
    return fields


def main():
    """Command-line entry point: parse saved pages and import them into the store.

    Reads one file or every ``*.html`` / ``*.htm`` / ``*.txt`` file of a
    directory, skips listings whose parsed price is outside CRITERIA, and
    upserts the rest. With ``--dry-run`` nothing is written and up to 30 new
    listings are previewed; with ``--update`` listings already in the store
    are upserted again instead of being skipped.
    """
    parser = argparse.ArgumentParser(description='Import Idealista properties from saved files')
    parser.add_argument('path', help='HTML/text file or directory containing saved pages')
    parser.add_argument('--dry-run', action='store_true', help='Preview without saving to store')
    parser.add_argument('--update', action='store_true', help='Update existing properties with new data (title, description)')
    args = parser.parse_args()

    path = Path(args.path)
    if path.is_dir():
        files = sorted(path.glob('*.html')) + sorted(path.glob('*.htm')) + sorted(path.glob('*.txt'))
    elif path.is_file():
        files = [path]
    else:
        print(f"Not found: {path}")
        return

    if not files:
        print(f"No files found in {path}")
        return

    store = load()
    existing = set(store.keys())
    all_new = []
    now = datetime.now().isoformat()

    print(f"Parsing {len(files)} file(s)...")
    print(f"Store: {len(store)} existing properties\n")

    for f in files:
        properties = parse_file(f)

        new_count = 0
        updated_count = 0
        for prop in properties:
            url = prop['url']
            is_existing = url in existing
            if is_existing and not args.update:
                continue
            # Listings without a parsed price are kept; only known prices are filtered.
            price = prop.get('price')
            if price and (price < CRITERIA['price_min'] or price > CRITERIA['price_max']):
                continue

            if is_existing:
                updated_count += 1
            else:
                new_count += 1
                existing.add(url)  # also dedupes the same listing across files
                all_new.append(prop)

            if not args.dry_run:
                upsert(store, url, _store_fields(prop, is_existing, now))

        upd_str = f", {updated_count} updated" if updated_count else ""
        print(f"  {f.name}: {len(properties)} listings, {new_count} new{upd_str}")

    print(f"\n{'=' * 50}")
    print(f"  Total new: {len(all_new)}")

    if args.dry_run:
        for prop in all_new[:30]:
            price_str = f"EUR {prop['price']:,}" if prop.get('price') else '?'
            loc = (prop.get('location') or prop.get('title') or '')[:35]
            # URLs from the HTML parser have no trailing slash while those
            # from the text parser do; strip it so the last path segment is
            # the listing ID in both cases.
            listing_id = prop['url'].rstrip('/').rsplit('/', 1)[-1]
            print(f"    {prop['country']} {loc:35s} {price_str:>12s}  .../{listing_id}")
    elif all_new or args.update:
        persist(store)
        print(f"  Saved to store ({len(store)} total)")
    else:
        print("  No new properties found")

# Run the importer only when this file is executed as a script.
if __name__ == '__main__':
    main()
