#!/usr/bin/env python3
"""
Structured Property Facts Extractor
Extracts structured data from property pages for high-quality GPT analysis
"""
import re
from bs4 import BeautifulSoup
from typing import Dict, Optional, List, Any
import json

class PropertyFactsExtractor:
    """Extract structured facts from property HTML"""

    def __init__(self, html_content: str, url: str):
        """Parse the page markup and remember which URL it came from.

        Args:
            html_content: Raw HTML of the property page.
            url: Address of the page; copied into the extracted facts.
        """
        self.url = url
        self.facts = {}  # replaced with the full result by extract_all()
        self.soup = BeautifulSoup(html_content, "html.parser")

    def extract_all(self) -> Dict[str, Any]:
        """Extract all structured facts from the property page"""
        self.facts = {
            'url': self.url,
            'title': self._extract_title(),
            'description': self._extract_description(),
            'price': self._extract_price(),
            'location': self._extract_location(),
            'property_details': self._extract_property_details(),
            'land_details': self._extract_land_details(),
            'building_details': self._extract_building_details(),
            'amenities': self._extract_amenities(),
            'features': self._extract_features(),
            'energy_info': self._extract_energy_info(),
            'key_highlights': self._extract_key_highlights()
        }
        return self.facts

    def _extract_title(self) -> str:
        """Extract property title"""
        title = self.soup.find('h1')
        return title.get_text(strip=True) if title else ""

    def _extract_description(self) -> str:
        """Extract main property description (cleaned)"""
        # Look for listing-section-content divs (main description)
        sections = self.soup.find_all('div', class_='listing-section-content')
        descriptions = []

        for section in sections[:2]:  # First 2 sections usually have description
            text = section.get_text(strip=True)
            if text and len(text) > 50:  # Meaningful text only
                # Clean up common filler text
                text = re.sub(r'(Cookie|Privacy|Terms|Contact us|Show more|Lees meer).*', '', text, flags=re.IGNORECASE)
                descriptions.append(text)

        return " ".join(descriptions)[:1000]  # Limit to 1000 chars

    def _extract_price(self) -> Optional[int]:
        """Extract price as integer"""
        # Method 1: Structured data
        price_span = self.soup.find('span', {'itemprop': 'price'})
        if price_span:
            price_text = price_span.get_text(strip=True)
            digits = ''.join(filter(str.isdigit, price_text))
            if digits:
                try:
                    price = int(digits)
                    if 10000 <= price <= 10000000:  # Sanity check
                        return price
                except:
                    pass

        # Method 2: Search for price pattern in text
        text = self.soup.get_text()
        # Match patterns like "€ 450,000" or "EUR 450000"
        patterns = [
            r'€\s*(\d{1,3}(?:[.,]\d{3})*(?:[.,]\d{2})?)',
            r'EUR\s*(\d{1,3}(?:[.,]\d{3})*)',
            r'Price:?\s*€?\s*(\d{1,3}(?:[.,]\d{3})*)'
        ]
        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                digits = ''.join(filter(str.isdigit, match.group(1)))
                if digits:
                    try:
                        price = int(digits)
                        if 10000 <= price <= 10000000:
                            return price
                    except:
                        pass

        return None

    def _extract_location(self) -> Dict[str, Optional[str]]:
        """Extract location details"""
        location_div = self.soup.find('div', class_='item-location')
        location_text = location_div.get_text(strip=True) if location_div else ""

        # Parse address components
        parts = [p.strip() for p in location_text.split(',')]

        return {
            'full': location_text,
            'city': parts[0] if len(parts) > 0 else None,
            'region': parts[1] if len(parts) > 1 else None,
            'country': parts[-1] if len(parts) > 2 else None
        }

    def _extract_property_details(self) -> Dict[str, Any]:
        """Extract core property metrics from 'areas' div"""
        areas_div = self.soup.find('div', class_='areas')
        details = {
            'total_area_m2': None,
            'living_area_m2': None,
            'land_area_m2': None,
            'bedrooms': None,
            'bathrooms': None,
            'rooms': None,
            'property_type': None,
            'year_built': None
        }

        if not areas_div:
            return details

        text = areas_div.get_text()

        # Extract area measurements
        # Patterns: "450 m²", "450m2", "450 sq m"
        area_matches = re.findall(r'(\d{1,6})\s*(?:m²|m2|sq\s*m)', text, re.IGNORECASE)
        if area_matches:
            # First is usually total area
            details['total_area_m2'] = int(area_matches[0])
            if len(area_matches) > 1:
                # Second might be living area
                details['living_area_m2'] = int(area_matches[1])

        # Land area (often shown as "Plot: X m²" or "Land: X hectares")
        land_match = re.search(r'(?:plot|land|terrain|grond)[:\s]*(\d{1,8})\s*(?:m²|m2|sq\s*m)', text, re.IGNORECASE)
        if land_match:
            details['land_area_m2'] = int(land_match.group(1))

        # Hectares to m² (1 ha = 10,000 m²)
        hectare_match = re.search(r'(\d{1,4}(?:[.,]\d{1,2})?)\s*(?:ha|hectare)', text, re.IGNORECASE)
        if hectare_match:
            ha = float(hectare_match.group(1).replace(',', '.'))
            details['land_area_m2'] = int(ha * 10000)

        # Bedrooms
        bedroom_match = re.search(r'(\d{1,2})\s*(?:bedroom|chambre|slaapkamer|bed)', text, re.IGNORECASE)
        if bedroom_match:
            details['bedrooms'] = int(bedroom_match.group(1))

        # Bathrooms
        bathroom_match = re.search(r'(\d{1,2})\s*(?:bathroom|salle de bain|badkamer|bath)', text, re.IGNORECASE)
        if bathroom_match:
            details['bathrooms'] = int(bathroom_match.group(1))

        # Total rooms
        rooms_match = re.search(r'(\d{1,2})\s*(?:room|pièce|kamer|piece)', text, re.IGNORECASE)
        if rooms_match:
            details['rooms'] = int(rooms_match.group(1))

        # Property type
        types = ['farmhouse', 'villa', 'house', 'cottage', 'manor', 'estate',
                 'mas', 'ferme', 'boerderij', 'château', 'castle']
        for prop_type in types:
            if re.search(r'\b' + prop_type + r'\b', text, re.IGNORECASE):
                details['property_type'] = prop_type.title()
                break

        # Year built
        year_match = re.search(r'(?:built|constructed|bouwjaar)[:\s]*(\d{4})', text, re.IGNORECASE)
        if year_match:
            year = int(year_match.group(1))
            if 1600 <= year <= 2030:  # Sanity check
                details['year_built'] = year

        return details

    def _extract_land_details(self) -> Dict[str, Any]:
        """Extract land-specific details (important for regenerative farming)"""
        text = self.soup.get_text()

        details = {
            'has_water_source': None,
            'has_well': None,
            'has_spring': None,
            'has_pond': None,
            'has_river_access': None,
            'has_forest': None,
            'has_orchard': None,
            'has_vineyard': None,
            'has_pasture': None,
            'has_arable_land': None,
            'irrigation_available': None,
            'soil_type': None
        }

        # Water features (critical for farming)
        water_keywords = {
            'has_well': ['well', 'puits', 'waterput', 'bron'],
            'has_spring': ['spring', 'source', 'bron'],
            'has_pond': ['pond', 'étang', 'vijver', 'lake', 'lac', 'meer'],
            'has_river_access': ['river', 'rivière', 'rivier', 'stream', 'ruisseau', 'beek'],
            'has_forest': ['forest', 'woodland', 'forêt', 'bos', 'trees', 'arbres', 'bomen'],
            'has_orchard': ['orchard', 'verger', 'boomgaard', 'fruit trees'],
            'has_vineyard': ['vineyard', 'vignoble', 'wijngaard', 'vines'],
            'has_pasture': ['pasture', 'prairie', 'weide', 'meadow', 'grazing'],
            'has_arable_land': ['arable', 'cultivable', 'farmland', 'agricultural']
        }

        for key, keywords in water_keywords.items():
            for keyword in keywords:
                if re.search(r'\b' + keyword + r'\b', text, re.IGNORECASE):
                    details[key] = True
                    break
            if details[key] is None:
                details[key] = False

        # Check for general water source mention
        details['has_water_source'] = any([
            details['has_well'],
            details['has_spring'],
            details['has_pond'],
            details['has_river_access']
        ])

        # Irrigation
        if re.search(r'\b(irrigation|irrigated|arrosage|irrigatie)\b', text, re.IGNORECASE):
            details['irrigation_available'] = True
        else:
            details['irrigation_available'] = False

        # Soil type
        soil_types = ['clay', 'loam', 'sandy', 'argile', 'limon', 'sableux', 'klei', 'leem', 'zand']
        for soil in soil_types:
            if re.search(r'\b' + soil + r'\b', text, re.IGNORECASE):
                details['soil_type'] = soil.title()
                break

        return details

    def _extract_building_details(self) -> Dict[str, Any]:
        """Extract building-specific details (for workshops, guest accommodation)"""
        text = self.soup.get_text()

        details = {
            'has_barn': None,
            'has_stable': None,
            'has_garage': None,
            'has_workshop': None,
            'has_outbuilding': None,
            'has_separate_apartment': None,
            'has_guest_house': None,
            'total_buildings': None,
            'renovation_needed': None,
            'move_in_ready': None
        }

        building_keywords = {
            'has_barn': ['barn', 'grange', 'schuur', 'hangar'],
            'has_stable': ['stable', 'écurie', 'stal'],
            'has_garage': ['garage', 'carport'],
            'has_workshop': ['workshop', 'atelier', 'werkplaats', 'shed'],
            'has_outbuilding': ['outbuilding', 'dépendance', 'bijgebouw', 'annexe'],
            'has_separate_apartment': ['apartment', 'flat', 'appartement', 'studio'],
            'has_guest_house': ['guest house', 'gîte', 'gastenverblijf', 'cottage']
        }

        for key, keywords in building_keywords.items():
            for keyword in keywords:
                if re.search(r'\b' + keyword + r'\b', text, re.IGNORECASE):
                    details[key] = True
                    break
            if details[key] is None:
                details[key] = False

        # Renovation status
        if re.search(r'\b(renovated|restored|modernized|rénové|gerenoveerd)\b', text, re.IGNORECASE):
            details['move_in_ready'] = True
            details['renovation_needed'] = False
        elif re.search(r'\b(renovation needed|to renovate|à rénover|te renoveren|work required)\b', text, re.IGNORECASE):
            details['renovation_needed'] = True
            details['move_in_ready'] = False
        else:
            details['renovation_needed'] = None
            details['move_in_ready'] = None

        return details

    def _extract_amenities(self) -> List[str]:
        """Extract amenities/features list"""
        amenities = []
        text = self.soup.get_text()

        amenity_list = [
            # Modern amenities
            'internet', 'wifi', 'fiber', 'broadband',
            'heating', 'central heating', 'chauffage', 'verwarming',
            'air conditioning', 'climatisation', 'airco',
            'solar panels', 'panneaux solaires', 'zonnepanelen',
            'heat pump', 'pompe à chaleur', 'warmtepomp',
            'fireplace', 'cheminée', 'open haard',
            'pool', 'swimming pool', 'piscine', 'zwembad',
            'terrace', 'terrasse', 'terras',
            'balcony', 'balcon', 'balkon',
            # Rural amenities
            'electricity', 'electric', 'électricité', 'elektriciteit',
            'water mains', 'town water', 'eau courante', 'leidingwater',
            'septic tank', 'fosse septique', 'septic',
            'well water', 'spring water', 'rainwater harvesting'
        ]

        for amenity in amenity_list:
            if re.search(r'\b' + amenity + r'\b', text, re.IGNORECASE):
                amenities.append(amenity.title())

        return list(set(amenities))  # Remove duplicates

    def _extract_features(self) -> List[str]:
        """Extract special features"""
        features = []
        text = self.soup.get_text()

        feature_list = [
            'panoramic view', 'mountain view', 'sea view', 'valley view',
            'private', 'secluded', 'isolated', 'quiet', 'peaceful',
            'south facing', 'sunny', 'bright',
            'stone', 'traditional', 'authentic', 'character',
            'organic', 'permaculture', 'biodynamic',
            'planning permission', 'building permit', 'permis de construire'
        ]

        for feature in feature_list:
            if re.search(r'\b' + feature + r'\b', text, re.IGNORECASE):
                features.append(feature.title())

        return features

    def _extract_energy_info(self) -> Dict[str, Optional[str]]:
        """Extract energy performance data"""
        text = self.soup.get_text()

        info = {
            'energy_rating': None,
            'dpe_score': None
        }

        # Energy rating (A-G)
        rating_match = re.search(r'Energy\s*(?:rating|class|label)[:\s]*([A-G])', text, re.IGNORECASE)
        if rating_match:
            info['energy_rating'] = rating_match.group(1).upper()

        # DPE score (Diagnostic de Performance Énergétique)
        dpe_match = re.search(r'DPE[:\s]*([A-G]|\d{1,3})', text, re.IGNORECASE)
        if dpe_match:
            info['dpe_score'] = dpe_match.group(1).upper()

        return info

    def _extract_key_highlights(self) -> List[str]:
        """Extract key selling points from structured sections"""
        highlights = []

        # Look for key features section
        key_features = self.soup.find_all(['ul', 'ol'], class_=re.compile('feature|highlight|key'))
        for ul in key_features:
            items = ul.find_all('li')
            for item in items[:10]:  # Limit to top 10
                text = item.get_text(strip=True)
                if text and len(text) > 5:
                    highlights.append(text[:100])  # Limit length

        return highlights

    def to_json(self) -> str:
        """Serialize ``self.facts`` as 2-space-indented JSON with non-ASCII characters left unescaped."""
        serialized = json.dumps(self.facts, indent=2, ensure_ascii=False)
        return serialized

    def to_prompt_text(self) -> str:
        """Convert the extracted facts to a GPT-friendly plain-text summary.

        Reads ``self.facts`` (as produced by ``extract_all``) and emits one
        short section per fact group, separated by blank lines. Missing or
        empty values are simply skipped, so the method also works on a
        partially filled or empty facts dict.

        Besides the headline, size, land, building, amenity and description
        sections, the summary includes the room count, vineyard/arable-land
        flags, energy rating, special features and key highlights.

        Returns:
            The summary as a single newline-joined string.
        """
        facts = self.facts or {}
        lines = []

        # Title and location ("or {}" guards against a sub-dict stored as None)
        location = facts.get('location') or {}
        if facts.get('title'):
            lines.append(f"Property: {facts['title']}")
        if location.get('full'):
            lines.append(f"Location: {location['full']}")
        if facts.get('price'):
            lines.append(f"Price: €{facts['price']:,}")

        lines.append("")  # Blank line

        # Property details
        prop = facts.get('property_details') or {}
        if prop.get('property_type'):
            lines.append(f"Type: {prop['property_type']}")
        if prop.get('total_area_m2'):
            lines.append(f"Total area: {prop['total_area_m2']} m²")
        if prop.get('living_area_m2'):
            lines.append(f"Living area: {prop['living_area_m2']} m²")
        if prop.get('land_area_m2'):
            lines.append(f"Land area: {prop['land_area_m2']:,} m² ({prop['land_area_m2']/10000:.2f} ha)")
        if prop.get('bedrooms'):
            lines.append(f"Bedrooms: {prop['bedrooms']}")
        if prop.get('bathrooms'):
            lines.append(f"Bathrooms: {prop['bathrooms']}")
        if prop.get('rooms'):
            lines.append(f"Rooms: {prop['rooms']}")
        if prop.get('year_built'):
            lines.append(f"Year built: {prop['year_built']}")

        lines.append("")

        # Land features (critical for farming)
        land = facts.get('land_details') or {}
        land_features = []
        if land.get('has_water_source'):
            water_labels = (
                ('has_well', 'well'),
                ('has_spring', 'spring'),
                ('has_pond', 'pond'),
                ('has_river_access', 'river access'),
            )
            water_types = [label for key, label in water_labels if land.get(key)]
            if water_types:
                land_features.append(f"Water: {', '.join(water_types)}")

        land_labels = (
            ('has_forest', "Forest/woodland present"),
            ('has_orchard', "Orchard"),
            ('has_vineyard', "Vineyard"),
            ('has_pasture', "Pasture/meadow"),
            ('has_arable_land', "Arable land"),
            ('irrigation_available', "Irrigation available"),
        )
        land_features.extend(label for key, label in land_labels if land.get(key))
        if land.get('soil_type'):
            land_features.append(f"Soil: {land['soil_type']}")

        if land_features:
            lines.append("Land features:")
            for feature in land_features:
                lines.append(f"  - {feature}")
            lines.append("")

        # Buildings
        building = facts.get('building_details') or {}
        building_labels = (
            ('has_barn', 'barn'),
            ('has_stable', 'stable'),
            ('has_garage', 'garage'),
            ('has_workshop', 'workshop'),
            ('has_outbuilding', 'outbuildings'),
            ('has_guest_house', 'guest house'),
            ('has_separate_apartment', 'separate apartment'),
        )
        buildings = [label for key, label in building_labels if building.get(key)]

        if buildings:
            lines.append(f"Additional buildings: {', '.join(buildings)}")
            lines.append("")

        # None means "unknown", so only report a condition when it was detected
        if building.get('renovation_needed') is not None:
            status = "Renovation needed" if building['renovation_needed'] else "Move-in ready"
            lines.append(f"Condition: {status}")
            lines.append("")

        # Energy performance
        energy = facts.get('energy_info') or {}
        energy_parts = []
        if energy.get('energy_rating'):
            energy_parts.append(f"rating {energy['energy_rating']}")
        if energy.get('dpe_score'):
            energy_parts.append(f"DPE {energy['dpe_score']}")
        if energy_parts:
            lines.append(f"Energy: {', '.join(energy_parts)}")
            lines.append("")

        # Amenities
        amenities = facts.get('amenities') or []
        if amenities:
            lines.append(f"Amenities: {', '.join(amenities[:10])}")
            lines.append("")

        # Special features (capped like amenities to keep the prompt short)
        features = facts.get('features') or []
        if features:
            lines.append(f"Features: {', '.join(features[:10])}")
            lines.append("")

        # Key highlights taken from the page's own bullet lists
        highlights = facts.get('key_highlights') or []
        if highlights:
            lines.append("Key highlights:")
            for highlight in highlights[:5]:
                lines.append(f"  - {highlight}")
            lines.append("")

        # Description (truncated)
        desc = facts.get('description') or ''
        if desc:
            lines.append("Description:")
            lines.append(desc[:500])  # First 500 chars
            if len(desc) > 500:
                lines.append("...")

        return "\n".join(lines)


def extract_property_facts(html_content: str, url: str) -> Dict[str, Any]:
    """
    Convenience wrapper: build a PropertyFactsExtractor and run the full extraction

    Args:
        html_content: Raw HTML from property page
        url: Property URL

    Returns:
        Dictionary with structured property facts, as returned by
        PropertyFactsExtractor.extract_all()
    """
    return PropertyFactsExtractor(html_content, url).extract_all()


if __name__ == "__main__":
    # Manual smoke test: fetch one hard-coded listing and print the extracted
    # facts both as JSON and in the prompt text format.
    import requests  # imported here, so importing this module does not load requests

    test_url = "https://www.properstar.com/property-for-sale/france/lot-et-garonne/7837046"

    print("Testing Property Facts Extractor")
    print("=" * 70)
    print(f"URL: {test_url}\n")

    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15"
    }

    response = requests.get(test_url, headers=headers, timeout=10)

    if response.status_code == 200:
        # Parse and extract once; the same extractor instance feeds both
        # output formats below.
        extractor = PropertyFactsExtractor(response.text, test_url)
        extractor.extract_all()

        print("STRUCTURED JSON:")
        print("=" * 70)
        print(extractor.to_json())

        print("\n\nGPT PROMPT FORMAT:")
        print("=" * 70)
        print(extractor.to_prompt_text())
    else:
        print(f"Failed to fetch page: {response.status_code}")
