+
    Si!!                         R t ^ RIt^ RIt^ RIt^ RIt^ RIHtHtHtH	t	H
t
HtHt RR ltR tR t]R8X  d   ]P"                  ! ]! 4       4       R# R# )a  
Scrape metadata from Leggett (frenchestateagents.com) property detail pages.

Uses Playwright (Cloudflare blocks plain requests). Extracts: price, bedrooms,
land size, building size, description, coordinates, region.

Usage:
    python3 enrich_leggett.py                # Scrape all missing
    python3 enrich_leggett.py --limit 3      # Test on 3 properties
    python3 enrich_leggett.py --dry-run      # Show what would be scraped
N)loadpersistupsert	is_active	short_urlbrowser_pagewait_for_cloudflarec                    V P                  R4      R8w  d   R# \        V 4      '       g   R# V'       d   R# V P                  R4      '       d   V P                  R4      '       d   R# R# )z6Property is Leggett, active, and missing key metadata.sourceleggettFTpricebedrooms)getr   )propforces   &&V/Users/jonathan/Documents/Zakelijk/ClaudeOS/03_Lab/farmmatch/scraper/enrich_leggett.pyneeds_enrichmentr      sK    xxY&T??xxTXXj11    c                  "    V P                  VRRR7      G Rj  xL
  V P                  R4      G Rj  xL
  \        V 4      G Rj  xL
 '       g   R# V P                  R4      G Rj  xL
 pV'       d/   VP	                  R4      '       g   VP	                  R4      '       g   R# VR	3#  L Lv Le LF  \
         d   pR\        T4      R
,          3u Rp?# Rp?ii ; i5i)z9Navigate to a Leggett property page and extract metadata.domcontentloadedi0u  )
wait_untiltimeoutNi  uW  () => {
            const result = {};

            // Title
            const h1 = document.querySelector('h1');
            if (h1) result.title = h1.textContent.trim();

            // Price
            const priceEl = document.querySelector('.price .new-price, .price, .detail-price');
            if (priceEl) {
                const priceText = priceEl.textContent.replace(/[^0-9]/g, '');
                if (priceText) result.price = parseInt(priceText);
            }

            // Coordinates from map
            const mapImg = document.querySelector('img.map-region, img[data-lat]');
            if (mapImg) {
                const lat = mapImg.dataset?.lat || mapImg.getAttribute('data-lat');
                const lon = mapImg.dataset?.lon || mapImg.getAttribute('data-lon');
                if (lat && lon) {
                    result.lat = parseFloat(lat);
                    result.lon = parseFloat(lon);
                }
            }

            // Also check for map iframe or OpenStreetMap embed
            const iframe = document.querySelector('iframe[src*="openstreetmap"], iframe[src*="google.com/maps"]');
            if (iframe && !result.lat) {
                const src = iframe.getAttribute('src') || '';
                const coordMatch = src.match(/[-+]?\d+\.\d+/g);
                if (coordMatch && coordMatch.length >= 2) {
                    result.lat = parseFloat(coordMatch[0]);
                    result.lon = parseFloat(coordMatch[1]);
                }
            }

            // Region/location
            const locEl = document.querySelector('.locations .primary, .breadcrumb, .detail-location');
            if (locEl) result.region_text = locEl.textContent.trim();

            // Description
            const descEl = document.querySelector('.description, .property-description, .detail-description, #description');
            if (descEl) result.description = descEl.textContent.trim().slice(0, 2000);

            // Feature list (bedrooms, land, building size)
            const features = document.querySelectorAll('.features li, .detail-features li, .key-features li, .property-features li, .summary-list li, ul.list-unstyled li');
            for (const li of features) {
                const text = li.textContent.trim().toLowerCase();

                // Bedrooms
                const bedMatch = text.match(/(\d+)\s*(?:bed|chambre|slaap)/);
                if (bedMatch) result.bedrooms = parseInt(bedMatch[1]);

                // Land size
                const landMatch = text.match(/(\d[\d.,]*)\s*(?:m²|m2|sqm).*(?:land|terrain|ground|plot)/i)
                    || text.match(/(?:land|terrain|ground|plot).*?(\d[\d.,]*)\s*(?:m²|m2|sqm|hectare)/i);
                if (landMatch) {
                    let val = parseFloat(landMatch[1].replace(',', ''));
                    if (text.includes('hectare')) val *= 10000;
                    result.land_size_m2 = val;
                }

                // Also try standalone area patterns
                const areaMatch = text.match(/(\d[\d.,]*)\s*(?:m²|m2|sqm)/);
                if (areaMatch && !result.land_size_m2 && !result.building_size_m2) {
                    const val = parseFloat(areaMatch[1].replace(',', ''));
                    // Guess: > 1000 = land, otherwise building
                    if (val >= 1000) result.land_size_m2 = val;
                    else result.building_size_m2 = val;
                }

                // Building size
                const buildMatch = text.match(/(\d[\d.,]*)\s*(?:m²|m2|sqm).*(?:habitable|living|floor|build)/i)
                    || text.match(/(?:habitable|living|floor|build).*?(\d[\d.,]*)\s*(?:m²|m2|sqm)/i);
                if (buildMatch) result.building_size_m2 = parseFloat(buildMatch[1].replace(',', ''));
            }

            // Fallback: scan full page text for bedroom count
            if (!result.bedrooms) {
                const body = document.body.textContent;
                const bedFallback = body.match(/(\d+)\s*(?:bedroom|bed\b)/i);
                if (bedFallback) result.bedrooms = parseInt(bedFallback[1]);
            }

            return result;
        }r   titleok:Nd   N)NzCloudflare blocked)NzNo data extracted)gotowait_for_timeoutr   evaluater   	Exceptionstr)pageurlfieldses   &&  r   scrape_propertyr$   #   s     e"ii(:EiJJJ##D)))(...--}} U& U Un fjj11&**W:M:M,,t|C 	K).Ux  "SVD\!!"s   CB1 B)B1 B+B1 B-	B1 B1 CB1 (B/)B1 5B1 B1 #B1 $C%B1 (C)B1 +B1 -B1 /B1 1C<CCCCCc                    "   \         P                  ! R R7      p V P                  R\        ^ RR7       V P                  RRRR7       V P                  R	RR
R7       V P                  RRRR7       V P	                  4       p\        4       pVP                  4        UUu. uF$  w  r4\        WAP                  4      '       g   K"  VNK&  	  pppVP                  '       d   VRVP                   p\        R\        V4       24       VP                  '       d   V F  p\        RV 24       K  	  R# V'       g   \        R4       R# ^ p^ p\        VP                  R7      ;_uu_4       GRj  xL
 p\        V4       EFz  w  r\        RV	^,            R\        V4       R\!        V4       R2RR7       \#        W4      G Rj  xL
 w  rV
'       d   / pV
P%                  R4      '       d   V
R,          R8w  d   V
R,          VR&   R) F&  pV
P%                  V4      '       g   K  W,          W&   K(  	  V
P%                  R4      '       d   V
R,          VR&   RVR&   V'       dS   \'        W#V4       V^,          pRP)                  R VP                  4        4       4      R ,          p\        R!V R"24       M-V^,          p\        R#4       MV^,          p\        R$V R"24       VP+                  R%4      G Rj  xL
  EK}  	  RRR4      GRj  xL
  \        R&V R\        V4       R'V R(24       V^ 8  d   \-        V4       R# R# u uppi  EL EL LY LF  + GRj  xL 
 '       g   i     L]; i5i)*z Enrich Leggett property metadata)descriptionz--limitz Max properties to scrape (0=all))typedefaulthelpz	--dry-run
store_truezShow what would be scraped)actionr)   z
--headlessz%Run headless (may fail on Cloudflare)z--forcezRe-scrape already enrichedNz'Leggett properties needing enrichment: z  zNothing to do.)headlessz  [/z] z... )endr   zJust a moment...region_textlocationleggett_scrapelocation_sourcez, c              3   F   "   T F  w  rVR9  g   K  V RV 2x  K  	  R# 5i)r   =N)r   r&    ).0kvs   &  r   	<genexpr>main.<locals>.<genexpr>   s-      *O,-5M,M +5QCq*s   !!:NP   NzOK ()emptyzFAIL (i  z

Enriched z (z failed))r   r   land_size_m2building_size_m2latlonr&   )argparseArgumentParseradd_argumentint
parse_argsr   itemsr   r   limitprintlendry_runr   r,   	enumerater   r$   r   r   joinr   r   )parserargsstorer!   p
candidatesenrichedfailedr    ir"   statuscleankey	extracteds                  r   mainr[      s    $$1STF
	Q=_`
L?[\
\@gh
	,=YZDFE$)KKMUM&#5Ea5T##MJUzzz,
	3C
O3D
EF|||CBse* HFT]]333t
+FAC!uAc*o.b30@D#N#24#==NF::g&&6'?>P+P%+G_E'N9Czz#%+[
9 ::m,,(.}(=E*%/?E+,5u-MH $		 *O *O !OOR!TID1-.aKFEO!vha() ''---A , 43F 
Kz3z?"32fXX
FG!| q V& 4 >: .C 4333s   BM&L: L:A-M&43M&'M (M&+AM
;M
<&M
#3M
$M
 M
BM
,M
-	M
6M&M?M&M
M
M&
M#	M
M#	M#	M&__main__)F)__doc__rC   asyncioretimerQ   r   r   r   r   r   r   r   r   r$   r[   __name__runr6   r   r   <module>rc      sQ   
   	  ` ` `g"TBJ zKK r   