"""
Deterministic Property Analyzer - NO GPT REQUIRED!

Extracts objective facts from Properstar pages and scores based on rules.
GPT is never called from this module; any subjective assessment is a separate, explicitly requested step.

Cost: FREE (no API calls)
Speed: FAST (direct scraping)
"""
import pandas as pd
import requests
import re
import json
from bs4 import BeautifulSoup
import time

def _parse_localized_number(token):
    """Convert a numeric token such as "1.500", "1,500" or "2,5" to a float.

    Listings mix English and continental European number formats, so the
    separators are interpreted heuristically:

    * both "." and "," present: the right-most one is the decimal mark;
    * one kind of separator followed only by groups of exactly three digits
      ("1.500", "1,250,000"): thousands grouping;
    * a single separator otherwise ("2,5", "2.5"): decimal mark.

    Returns None when the token cannot be interpreted as a number.
    """
    token = token.strip('.,')
    if not token:
        return None

    last_dot = token.rfind('.')
    last_comma = token.rfind(',')

    if last_dot != -1 and last_comma != -1:
        if last_dot > last_comma:
            # "1,234.5": comma groups thousands, dot is the decimal mark.
            token = token.replace(',', '')
        else:
            # "1.234,5": dot groups thousands, comma is the decimal mark.
            token = token.replace('.', '').replace(',', '.')
    elif last_dot != -1 or last_comma != -1:
        separator = '.' if last_dot != -1 else ','
        groups = token.split(separator)
        looks_grouped = (
            len(groups[0]) <= 3
            and not groups[0].startswith('0')
            and all(len(group) == 3 for group in groups[1:])
        )
        if looks_grouped:
            token = ''.join(groups)
        elif len(groups) == 2:
            token = '.'.join(groups)
        else:
            return None

    try:
        return float(token)
    except ValueError:
        return None


def _parse_area_m2(text):
    """Return the first "<number> m..." area found in *text* as a float, or None."""
    match = re.search(r'(\d[\d.,]*)\s*m', text)
    if not match:
        return None
    return _parse_localized_number(match.group(1))


def _coerce_float(value):
    """Return *value* as a float, or None when it is missing or not numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def _coerce_int(value):
    """Return *value* as an int, or None when it is missing or not numeric."""
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return None


def _apply_json_ld(data, facts):
    """Copy the fields this module understands from one JSON-LD object into *facts*.

    Each field is validated on its own, so a malformed value only skips that
    field instead of discarding the rest of the object.
    """
    floor_size = data.get('floorSize')
    if isinstance(floor_size, dict):
        area = _coerce_float(floor_size.get('value'))
        if area is not None:
            facts['building_size_m2'] = area

    # NOTE(review): numberOfRooms is stored as the bedroom count, as before;
    # a bedroom count found in the page's <dt>/<dd> details overrides it later.
    rooms = _coerce_int(data.get('numberOfRooms'))
    if rooms is not None:
        facts['bedrooms'] = rooms

    address = data.get('address')
    if isinstance(address, dict):
        for fact_key, ld_key in (
            ('municipality', 'addressLocality'),
            ('region', 'addressRegion'),
            ('postal_code', 'postalCode'),
        ):
            value = address.get(ld_key)
            if value:
                facts[fact_key] = value

        country = address.get('addressCountry')
        if isinstance(country, dict):
            # The country may be a nested object; use its name if it has one.
            country = country.get('name')
        if isinstance(country, str) and country:
            facts['country'] = country

    geo = data.get('geo')
    if isinstance(geo, dict):
        # A coordinate of 0 is treated as "missing", as before.
        facts['latitude'] = _coerce_float(geo.get('latitude')) or None
        facts['longitude'] = _coerce_float(geo.get('longitude')) or None


def _apply_detail_pairs(soup, facts):
    """Read land/building size and room counts from adjacent <dt>/<dd> pairs."""
    details = soup.find_all(['dt', 'dd'])
    for term, definition in zip(details, details[1:]):
        if term.name != 'dt' or definition.name != 'dd':
            continue

        label = term.get_text(strip=True).lower()
        value = definition.get_text(strip=True)

        # Land size
        if 'terrain' in label or 'grond' in label or 'land' in label:
            area = _parse_area_m2(value)
            if area is not None:
                facts['land_size_m2'] = area

        # Building size
        if 'woonoppervlak' in label or 'bebouwd' in label or 'building' in label:
            area = _parse_area_m2(value)
            if area is not None:
                facts['building_size_m2'] = area

        # Bedrooms
        if 'slaapkamer' in label or 'bedroom' in label or 'chambre' in label:
            num_match = re.search(r'\d+', value)
            if num_match:
                facts['bedrooms'] = int(num_match.group())

        # Bathrooms
        if 'badkamer' in label or 'bathroom' in label or 'salle de bain' in label:
            num_match = re.search(r'\d+', value)
            if num_match:
                facts['bathrooms'] = int(num_match.group())


def extract_property_facts(url):
    """Download a listing page and extract objective facts from it.

    Facts come from the first <h1>, the itemprop="price" span, the meta
    description, JSON-LD script blocks, <dt>/<dd> detail pairs, keyword
    matches in the page text and <img> class names.

    Args:
        url: Address of the listing page to download.

    Returns:
        A dict of facts (fields that could not be determined stay None, an
        empty list, False or 0), or None when the page could not be fetched
        (including HTTP error statuses) or processing failed. Failures are
        printed, not raised.
    """
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15"
        }
        response = requests.get(url, headers=headers, timeout=15)
        # Do not scrape "facts" out of 404/403/5xx error pages.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        facts = {
            'url': url,
            'title': None,
            'price': None,
            'land_size_m2': None,
            'building_size_m2': None,
            'bedrooms': None,
            'bathrooms': None,
            'property_type': None,
            'location': None,
            'country': None,
            'region': None,
            'municipality': None,
            'postal_code': None,
            'latitude': None,
            'longitude': None,
            'description': None,
            'features': [],
            'has_images': False,
            'image_count': 0
        }

        # Title
        h1 = soup.find('h1')
        if h1:
            facts['title'] = h1.get_text(strip=True)

        # Price: keep digits only, so currency symbols and separators are dropped.
        price_elem = soup.find('span', {'itemprop': 'price'})
        if price_elem:
            price_num = re.sub(r'[^\d]', '', price_elem.get_text())
            if price_num:
                facts['price'] = int(price_num)

        # Description
        desc_meta = soup.find('meta', {'name': 'description'})
        if desc_meta:
            facts['description'] = desc_meta.get('content', '')

        # Structured data (JSON-LD); only top-level objects are read.
        for script in soup.find_all('script', type='application/ld+json'):
            raw_json = script.string
            if not raw_json:
                continue
            try:
                data = json.loads(raw_json)
            except ValueError:
                # Malformed JSON in one block must not hide the other blocks.
                continue
            if isinstance(data, dict):
                _apply_json_ld(data, facts)

        # Property details from definition lists; these override JSON-LD values.
        _apply_detail_pairs(soup, facts)

        # Features/amenities: plain substring match on the lower-cased page text.
        feature_keywords = [
            'zwembad', 'pool', 'piscine',
            'garage', 'parking',
            'tuin', 'garden', 'jardin',
            'zonnepanelen', 'solar',
            'schuur', 'barn', 'grange',
            'werkplaats', 'workshop', 'atelier',
            'airco', 'air conditioning', 'climatisation',
            'verwarming', 'heating', 'chauffage',
            'open haard', 'fireplace', 'cheminée',
            'terras', 'terrace', 'terrasse'
        ]

        page_text = soup.get_text().lower()
        facts['features'].extend(
            keyword for keyword in feature_keywords if keyword in page_text
        )

        # Count images carrying a 'property' or 'listing' CSS class.
        property_images = [
            img for img in soup.find_all('img')
            if 'property' in img.get('class', []) or 'listing' in img.get('class', [])
        ]
        facts['image_count'] = len(property_images)
        facts['has_images'] = len(property_images) > 0

        # Property type: first keyword (in dict order) found in title or description.
        # NOTE(review): this is a substring match, so e.g. 'land' also matches
        # "Nederland"; kept as-is.
        property_types = {
            'boerderij': 'farm',
            'finca': 'farm',
            'farm': 'farm',
            'villa': 'villa',
            'huis': 'house',
            'maison': 'house',
            'appartement': 'apartment',
            'grond': 'land',
            'terrain': 'land',
            'land': 'land',
            'schuur': 'barn',
            'barn': 'barn',
            'grange': 'barn'
        }

        title_lower = (facts['title'] or '').lower()
        desc_lower = (facts['description'] or '').lower()

        for keyword, prop_type in property_types.items():
            if keyword in title_lower or keyword in desc_lower:
                facts['property_type'] = prop_type
                break

        return facts

    except Exception as e:
        # Per-URL boundary: report the failure and let the caller continue.
        print(f"  ✗ Error extracting facts: {e}")
        return None

def score_property_deterministic(facts):
    """Score a property from extracted facts using fixed rules - NO GPT!

    Args:
        facts: Dict of extracted facts (the keys read are 'land_size_m2',
            'building_size_m2', 'bedrooms', 'price', 'latitude', 'country',
            'property_type' and 'features'), or None. Missing keys are
            treated like unknown values.

    Returns:
        A dict with integer scores 1-5 for 'market_garden',
        'guest_accommodation', 'workshop', 'rental_units', 'location' and
        'local_market', a 'risk_profile' of 'Laag', 'Gemiddeld' or 'Hoog',
        and 'reasoning' (criterion name -> list of reason strings). When
        *facts* is empty or None every score is the default 3 and
        'reasoning' is empty.
    """
    scores = {
        'market_garden': 3,  # Default average
        'guest_accommodation': 3,
        'workshop': 3,
        'rental_units': 3,
        'location': 3,
        'local_market': 3,
        'risk_profile': 'Gemiddeld',
        'reasoning': {}
    }

    if not facts:
        return scores

    land_size = facts.get('land_size_m2')
    building_size = facts.get('building_size_m2')
    bedrooms = facts.get('bedrooms')
    property_type = facts.get('property_type')
    country = facts.get('country')
    features = set(facts.get('features') or [])

    def has_any(keywords):
        """Tell whether at least one of *keywords* was detected on the page."""
        return any(keyword in features for keyword in keywords)

    # === MARKET GARDEN SCORING ===
    mg_score = 3  # Start at average
    mg_reasons = []

    # Land size is critical
    if land_size:
        hectares = land_size / 10000
        if land_size >= 50000:  # 5+ hectares
            mg_score = 5
            mg_reasons.append(f"Excellent size: {hectares:.1f} ha")
        elif land_size >= 20000:  # 2+ hectares
            mg_score = 4
            mg_reasons.append(f"Good size: {hectares:.1f} ha")
        elif land_size >= 5000:  # 0.5+ hectares
            mg_score = 3
            mg_reasons.append(f"Adequate size: {hectares:.1f} ha")
        elif land_size >= 2000:  # 0.2+ hectares
            mg_score = 2
            mg_reasons.append(f"Small size: {hectares:.1f} ha")
        else:
            mg_score = 1
            mg_reasons.append(f"Too small: {hectares:.1f} ha")
    else:
        mg_reasons.append("Unknown land size")

    # Garden keyword in any of the supported languages (nl / en / fr)
    if has_any(('tuin', 'garden', 'jardin')):
        mg_score = min(5, mg_score + 1)
        mg_reasons.append("Has garden")

    if property_type == 'farm':
        mg_score = min(5, mg_score + 1)
        mg_reasons.append("Is a farm property")

    scores['market_garden'] = mg_score
    scores['reasoning']['market_garden'] = mg_reasons

    # === GUEST ACCOMMODATION SCORING ===
    guest_score = 3
    guest_reasons = []

    # Bedrooms are important
    if bedrooms:
        if bedrooms >= 5:
            guest_score = 5
            guest_reasons.append(f"Many bedrooms: {bedrooms}")
        elif bedrooms >= 3:
            guest_score = 4
            guest_reasons.append(f"Good bedrooms: {bedrooms}")
        elif bedrooms >= 2:
            guest_score = 3
            guest_reasons.append(f"Adequate bedrooms: {bedrooms}")
        else:
            guest_score = 2
            guest_reasons.append(f"Few bedrooms: {bedrooms}")
    else:
        guest_reasons.append("Unknown bedrooms")

    # Amenities boost the score. Each amenity counts once, however many
    # translations of its keyword were found (a page that mentions both
    # "zwembad" and "pool" still has a single pool).
    guest_amenity_groups = (
        ('zwembad', 'pool', 'piscine'),
        ('airco', 'air conditioning', 'climatisation'),
        ('terras', 'terrace', 'terrasse'),
    )
    amenity_count = sum(1 for group in guest_amenity_groups if has_any(group))
    if amenity_count >= 2:
        guest_score = min(5, guest_score + 1)
        guest_reasons.append(f"Good amenities ({amenity_count})")

    if property_type in ['villa', 'farm']:
        guest_score = min(5, guest_score + 1)
        guest_reasons.append("Suitable property type")

    scores['guest_accommodation'] = guest_score
    scores['reasoning']['guest_accommodation'] = guest_reasons

    # === WORKSHOP SCORING ===
    workshop_score = 3
    workshop_reasons = []

    # Check for existing buildings/structures
    if building_size:
        if building_size >= 200:
            workshop_score = 4
            workshop_reasons.append(f"Large building: {building_size:.0f}m²")
        elif building_size >= 100:
            workshop_score = 3
            workshop_reasons.append(f"Medium building: {building_size:.0f}m²")
        else:
            workshop_score = 2
            workshop_reasons.append(f"Small building: {building_size:.0f}m²")

    # Barn / garage / workshop keyword in any of the supported languages
    workshop_keywords = (
        'schuur', 'barn', 'grange',
        'garage',
        'werkplaats', 'workshop', 'atelier',
    )
    if has_any(workshop_keywords):
        workshop_score = min(5, workshop_score + 1)
        workshop_reasons.append("Has barn/workshop")

    if not building_size:
        workshop_reasons.append("Unknown building size")

    scores['workshop'] = workshop_score
    scores['reasoning']['workshop'] = workshop_reasons

    # === RENTAL UNITS SCORING ===
    rental_score = 3
    rental_reasons = []

    # Based on bedrooms + property size; both must be known
    if bedrooms and building_size:
        if bedrooms >= 4 and building_size >= 200:
            rental_score = 5
            rental_reasons.append("Multiple rental units possible")
        elif bedrooms >= 3:
            rental_score = 4
            rental_reasons.append("Good rental potential")
        elif bedrooms >= 2:
            rental_score = 3
            rental_reasons.append("Moderate rental potential")
        else:
            rental_score = 2
            rental_reasons.append("Limited rental potential")
    else:
        rental_reasons.append("Unknown rental potential")

    scores['rental_units'] = rental_score
    scores['reasoning']['rental_units'] = rental_reasons

    # === LOCATION SCORING ===
    location_score = 3
    location_reasons = []

    # Score by country (preference for nearby countries). Keys are
    # case-folded; two-letter country codes are accepted next to the Dutch
    # and English names because structured data may carry either form.
    country_scores = {
        'nederland': 5, 'netherlands': 5, 'nl': 5,
        'belgië': 5, 'belgium': 5, 'be': 5,
        'france': 4, 'frankrijk': 4, 'fr': 4,
        'germany': 4, 'duitsland': 4, 'de': 4,
        'spain': 3, 'spanje': 3, 'es': 3,
        'portugal': 3, 'pt': 3,
        'italy': 3, 'italië': 3, 'it': 3,
    }

    if country:
        # A non-string country cannot be looked up; it keeps the default 3.
        if isinstance(country, str):
            location_score = country_scores.get(country.strip().casefold(), 3)
        location_reasons.append(f"Country: {country}")
    else:
        location_reasons.append("Unknown country")

    scores['location'] = location_score
    scores['reasoning']['location'] = location_reasons

    # === LOCAL MARKET SCORING ===
    market_score = 3  # Assume average (hard to determine without GPT)
    market_reasons = ["Needs on-site assessment or GPT analysis"]

    scores['local_market'] = market_score
    scores['reasoning']['local_market'] = market_reasons

    # === RISK ASSESSMENT ===
    # Simple heuristic: missing info = higher risk
    key_facts = (land_size, facts.get('price'), bedrooms, facts.get('latitude'))
    missing_info = sum(1 for value in key_facts if not value)

    if missing_info >= 3:
        scores['risk_profile'] = 'Hoog'
    elif missing_info >= 1:
        scores['risk_profile'] = 'Gemiddeld'
    else:
        scores['risk_profile'] = 'Laag'

    return scores

def analyze_all_properties_deterministic(csv_file="analysis_output.csv"):
    """Analyze all properties using deterministic rules - NO GPT COST!

    Reads *csv_file*, and for every row downloads the page in its 'URL'
    column, extracts facts, scores them, and writes facts and scores back
    into the row. The file is overwritten in place: after every 10th row as
    a checkpoint and once more at the end. Progress and a summary are
    printed.

    Args:
        csv_file: Path of the CSV file to read and overwrite. It must have
            a 'URL' column; result and location columns are added when
            missing.
    """
    print("=" * 70)
    print("🔧 DETERMINISTIC PROPERTY ANALYZER (NO GPT)")
    print("=" * 70)
    print()

    df = pd.read_csv(csv_file)

    # Add columns if they don't exist
    new_columns = [
        'land_size_m2', 'building_size_m2', 'bedrooms', 'bathrooms',
        'property_type', 'features', 'image_count',
        'det_market_garden', 'det_guest', 'det_workshop',
        'det_rental', 'det_location', 'det_market',
        'det_risk', 'det_reasoning'
    ]
    # Location columns are read and written below, so they must exist too.
    location_columns = [
        'Latitude', 'Longitude', 'LocationSource', 'Municipality', 'Country'
    ]

    for col in new_columns + location_columns:
        if col not in df.columns:
            df[col] = None

    # A column that was entirely empty in the CSV is read back as float64;
    # make the columns that receive strings object-typed so the assignments
    # below do not depend on pandas silently upcasting them.
    text_columns = [
        'property_type', 'features', 'det_risk', 'det_reasoning',
        'LocationSource', 'Municipality', 'Country'
    ]
    for col in text_columns:
        df[col] = df[col].astype(object)

    total = len(df)
    processed = 0
    errors = 0

    # Count rows by position so the progress output and checkpoints do not
    # depend on the values of the index labels.
    for position, (idx, row) in enumerate(df.iterrows(), start=1):
        print(f"\n🔍 [{position}/{total}] {row['URL']}")

        # Extract facts
        print("  → Extracting property facts...")
        facts = extract_property_facts(row['URL'])

        if facts:
            # Update extracted facts
            df.at[idx, 'land_size_m2'] = facts['land_size_m2']
            df.at[idx, 'building_size_m2'] = facts['building_size_m2']
            df.at[idx, 'bedrooms'] = facts['bedrooms']
            df.at[idx, 'bathrooms'] = facts['bathrooms']
            df.at[idx, 'property_type'] = facts['property_type']
            df.at[idx, 'features'] = ', '.join(facts['features'])
            df.at[idx, 'image_count'] = facts['image_count']

            # Fill in coordinates only when the row has none yet
            if facts['latitude'] and pd.isna(df.at[idx, 'Latitude']):
                df.at[idx, 'Latitude'] = facts['latitude']
                df.at[idx, 'Longitude'] = facts['longitude']
                df.at[idx, 'LocationSource'] = 'properstar_json'

            if facts['municipality']:
                df.at[idx, 'Municipality'] = facts['municipality']
            if facts['country']:
                df.at[idx, 'Country'] = facts['country']

            # Score deterministically
            print("  → Scoring property...")
            scores = score_property_deterministic(facts)

            df.at[idx, 'det_market_garden'] = scores['market_garden']
            df.at[idx, 'det_guest'] = scores['guest_accommodation']
            df.at[idx, 'det_workshop'] = scores['workshop']
            df.at[idx, 'det_rental'] = scores['rental_units']
            df.at[idx, 'det_location'] = scores['location']
            df.at[idx, 'det_market'] = scores['local_market']
            df.at[idx, 'det_risk'] = scores['risk_profile']
            df.at[idx, 'det_reasoning'] = json.dumps(scores['reasoning'])

            print(f"  ✓ Scores: MG:{scores['market_garden']} G:{scores['guest_accommodation']} W:{scores['workshop']} R:{scores['rental_units']} L:{scores['location']}")

            processed += 1
        else:
            errors += 1
            print("  ✗ Failed to extract facts")

        # Save progress every 10 properties
        if position % 10 == 0:
            df.to_csv(csv_file, index=False, encoding='utf-8')
            print("  💾 Progress saved")

        time.sleep(1)  # Be nice to the server

    # Final save
    df.to_csv(csv_file, index=False, encoding='utf-8')

    print("\n" + "=" * 70)
    print("📊 SUMMARY")
    print("=" * 70)
    print(f"Processed: {processed}/{total}")
    print(f"Errors: {errors}")
    print("💰 Cost: $0.00 (NO GPT USED!)")
    print(f"✅ Saved to {csv_file}")

# Script entry point: when run directly, analyze the default CSV file in place.
if __name__ == "__main__":
    analyze_all_properties_deterministic()
