+
    PiF                         R t ^ RIt^ RIt^ RIt^ RIHt ^ RIHtHtHtH	t	H
t
HtHtHt R. ROR. ROR. ROR. ROR. ROR	. ROR
. RO/tRR ltR tR tR tR tR t]R8X  d   ]P,                  ! ]! 4       4       R# R# )u2  
Extract structured data and keyword signals from property listing pages.

Visits each active property page with Playwright (Cloudflare-protected) and extracts:
- JSON-LD structured data (price, rooms, floor size, year built)
- DPE energy label (A-G) — renovation proxy
- Feature keywords (pool, water source, outbuildings, south-facing, septic)
- Property type classification
- Listing date / days on market
- Structured land size vs building size (from feature tables, not body regex)

Usage:
    python3 enrich_page_data.py                # Enrich all active properties missing data
    python3 enrich_page_data.py --limit 5      # Test on 5 properties
    python3 enrich_page_data.py --dry-run      # Show what would be scraped
    python3 enrich_page_data.py --force        # Re-scrape even if already enriched
N)datetime)loadsavepersistupsert	is_active	short_urlbrowser_pagewait_for_cloudflarehas_poolhas_water_sourcehas_outbuildingshas_south_facing
has_septichas_guest_potentialhas_land_descriptionc                h    \        V 4      '       g   R# V'       d   R# V P                  R4      '       * # )z*Property needs page-level data extraction.FTpage_enriched)r   get)propforces   &&X/Users/jonathan/Documents/Zakelijk/ClaudeOS/03_Lab/farmmatch/scraper/enrich_page_data.pyneeds_page_enrichmentr   =   s&    T??xx(((    c                b  "    V P                  VRRR7      G Rj  xL
  V P                  R4      G Rj  xL
  \        V 4      G Rj  xL
 '       g   R	# V P                  R4      G Rj  xL
 pV'       g   R
# VR3#  L_ LH L7 L  \         d   pR\        T4      R,          3u Rp?# Rp?ii ; i5i)z@Visit a property page and extract all structured + keyword data.domcontentloadedi0u  )
wait_untiltimeoutNi	  u  () => {
            const result = {};

            // === JSON-LD structured data ===
            const ldScripts = document.querySelectorAll('script[type="application/ld+json"]');
            for (const s of ldScripts) {
                try {
                    const d = JSON.parse(s.textContent || '{}');
                    // Single product/residence
                    if (d['@type'] && ['Product', 'Residence', 'House', 'Apartment',
                        'SingleFamilyResidence', 'RealEstateListing'].includes(d['@type'])) {
                        if (d.name) result.ld_name = d.name;
                        if (d.description) result.ld_description = (d.description || '').slice(0, 500);
                        if (d.numberOfRooms) result.ld_rooms = parseInt(d.numberOfRooms);
                        if (d.floorSize) {
                            const fs = typeof d.floorSize === 'object' ? d.floorSize.value : d.floorSize;
                            result.ld_floor_size = parseFloat(fs);
                        }
                        if (d.yearBuilt) result.year_built = parseInt(d.yearBuilt);
                        // Price from offers
                        if (d.offers) {
                            const offer = Array.isArray(d.offers) ? d.offers[0] : d.offers;
                            if (offer.price) result.ld_price = parseFloat(String(offer.price).replace(/[^0-9.]/g, ''));
                            if (offer.priceCurrency) result.ld_currency = offer.priceCurrency;
                        }
                        // Address
                        if (d.address) {
                            const addr = typeof d.address === 'string' ? {} : d.address;
                            if (addr.addressLocality) result.ld_locality = addr.addressLocality;
                            if (addr.addressRegion) result.ld_region = addr.addressRegion;
                            if (addr.addressCountry) result.ld_country = addr.addressCountry;
                            if (addr.postalCode) result.ld_postcode = addr.postalCode;
                        }
                        // Geo
                        if (d.geo) {
                            if (d.geo.latitude) result.ld_lat = parseFloat(d.geo.latitude);
                            if (d.geo.longitude) result.ld_lon = parseFloat(d.geo.longitude);
                        }
                    }
                } catch (e) {}
            }

            // === DPE Energy Label ===
            // Look for DPE badge/label in structured elements
            const dpeSelectors = [
                '[class*="dpe"]', '[class*="energy"]', '[data-dpe]',
                '.energy-label', '.diagnostic', '[class*="diagnostic"]'
            ];
            for (const sel of dpeSelectors) {
                const el = document.querySelector(sel);
                if (el) {
                    const text = el.textContent.trim();
                    const match = text.match(/\b([A-G])\b/);
                    if (match) {
                        result.dpe = match[1];
                        break;
                    }
                }
            }
            // Fallback: regex on page text for DPE patterns
            if (!result.dpe) {
                const body = document.body.textContent;
                const dpePatterns = [
                    /DPE\s*[:\s]*([A-G])\b/i,
                    /[Éé]nergie\s*[:\s]*([A-G])\b/i,
                    /energy\s*(?:class|label|rating)\s*[:\s]*([A-G])\b/i,
                    /classe\s*[éé]nerg[ée]tique\s*[:\s]*([A-G])\b/i,
                ];
                for (const pat of dpePatterns) {
                    const m = body.match(pat);
                    if (m) {
                        result.dpe = m[1].toUpperCase();
                        break;
                    }
                }
            }

            // === Feature table / structured KPIs ===
            // Look for labeled feature rows (common on Properstar, Leggett, Idealista)
            const featureEls = document.querySelectorAll(
                '.features li, .property-features li, .detail-features li, ' +
                '.key-features li, dl dt, dl dd, .feature-item, ' +
                'table.features td, .specs li, .characteristics li, ' +
                '[class*="feature"] li, [class*="detail"] li'
            );
            const featureTexts = [];
            for (const el of featureEls) {
                featureTexts.push(el.textContent.trim().toLowerCase());
            }
            result.feature_texts = featureTexts.slice(0, 100);

            // === Listing date ===
            const dateSelectors = [
                '[class*="date"]', '[class*="published"]', '[class*="listed"]',
                'time[datetime]', '[data-date]'
            ];
            for (const sel of dateSelectors) {
                const el = document.querySelector(sel);
                if (el) {
                    const dt = el.getAttribute('datetime') || el.textContent.trim();
                    if (dt && dt.match(/\d{4}/)) {
                        result.listing_date = dt.slice(0, 30);
                        break;
                    }
                }
            }

            // === Property type from breadcrumb or meta ===
            const breadcrumb = document.querySelector('.breadcrumb, nav[aria-label="breadcrumb"]');
            if (breadcrumb) result.breadcrumb_text = breadcrumb.textContent.trim().slice(0, 200);

            const ogType = document.querySelector('meta[property="og:type"]');
            if (ogType) result.og_type = ogType.getAttribute('content');

            // === Full body text for keyword scanning ===
            result.body_text = document.body.textContent.slice(0, 10000);

            return result;
        }ok:Nd   N)Ncloudflare_blocked)Nno_data)gotowait_for_timeoutr
   evaluate	Exceptionstr)pageurldataes   &&  r   extract_page_datar+   G   s     F"ii(:EiJJJ##D)))(...-- ]] v$ v vp ""TzE 	K).vz  "SVD\!!"s   B/B A;B A=B A?	B B B/B (B)B 5B 6B/7B :B/;B =B ?B B B,B'!B,"B/'B,,B/c                   / pV  EF  pVP                  4       p\        P                  ! RV4      pV'       d   \        VP	                  ^4      4      VR&   \        P                  ! RV4      pV'       d   \        VP	                  ^4      4      VR&   \        P                  ! RV4      pV'       d5   VP                  R4      '       g   \        VP	                  ^4      4      VR&   \        P                  ! RV4      ;'       g    \        P                  ! RV4      pV'       dF   \        VP	                  ^4      P                  R	R
4      P                  RR
4      4      pVR8  d   WQR&   \        P                  ! RV4      pV'       dN   VP                  R4      '       g7   \        VP	                  ^4      P                  R	R4      4      pVR,          VR&   \        P                  ! RV4      ;'       g    \        P                  ! RV4      pV'       dF   \        VP	                  ^4      P                  R	R
4      P                  RR
4      4      pVR8  d   WQR&   \        P                  ! RV4      pV'       d1   \        VP	                  ^4      4      pRTu;8:  d   R8:  d   M MWR&   R F  p	W9   g   K  WR&    EK  	  EK  	  V# )z1Parse structured feature texts into typed fields.z6(\d+)\s*(?:bed|chambre|slaap|camera|dormitorio|quarto)bedroomsu<   (\d+)\s*(?:bath|salle de bain|badkamer|bagno|baño|banheiro)	bathroomsu4   (\d+)\s*(?:room|pièce|kamer|stanza|habitaci|comodo)roomsuS   (?:terrain|land|perceel|grond|plot|parcelle|foncier).*?(\d[\d.,]*)\s*(?:m²|m2|sqm)u>   (\d[\d.,]*)\s*(?:m²|m2)\s*(?:terrain|land|perceel|grond|plot), .i  land_size_m2z(\d[\d.,]*)\s*(?:hectare|ha)\b'  uL   (?:living|habitable|woon|surface|floor|built).*?(\d[\d.,]*)\s*(?:m²|m2|sqm)u4   (\d[\d.,]*)\s*(?:m²|m2)\s*(?:living|habitable|woon)  building_size_m2u+   (?:built|année|bouwjaar|constru).*?(\d{4})ix  i  
year_builtheating)zcentral heatingzchauffage centralgaselectricoilfioulu   pompe à chaleurz	heat pumpwoodu   poêle	fireplaceu	   cheminée)lowerresearchintgroupr   floatreplace)
feature_textsfieldstextmland_mvalha_mbuild_myrr8   s
   &         r   parse_featuresrO      sG   Fzz| IIOQUV!$QWWQZF: IIUW[\"%aggaj/F; IIMtTVZZ((!!''!*oF7O ))rtxy h hYY`bfg 	Q//R8@@bIJCcz),~& yy:DA

>22

1--c378C%(5[F>" 99lnrs _ _ii WY]^ 	a(00b9AA#rJKCTz-0)* IIDdKQWWQZBr!T!')|$<G $+y!<a n Mr   c                   a V P                  4       o/ p\        P                  4        FA  w  r#\        ;QJ d    V3R lV 4       F  '       g   K   RM	  RM! V3R lV 4       4      W&   KC  	  V# )zBScan body text for keyword signals. Returns dict of boolean flags.c              3   ,   <"   T F	  qS9   x  K  	  R # 5iN ).0kw
text_lowers   & r   	<genexpr> scan_keywords.<locals>.<genexpr>  s     "Gh#3h   TF)r?   KEYWORD_SIGNALSitemsany)	body_textsignalssignal_namekeywordsrV   s   &   @r   scan_keywordsra     sR    "JG!0!6!6!8"s"Gh"Gsss"Gh"GG "9Nr   c                   V '       g   V'       g   R# ^pV '       d1   R^R^R^R^R^R^R^/pVP                  V P                  4       ^4      pV'       d7   VR	8  d   \        V^,           ^4      pV# VR
8  d   \        V^,
          ^4      pV# )z>Estimate renovation scope score (1-5) from DPE and year built.NABCDEFGi  il  )r   upperminmax)dper7   score
dpe_scoress   &&  r   estimate_renovation_scorerp     s    zE
1c1c1c1c1c1c1M
syy{A.	1%E L $	1%ELr   c                    a"   \         P                  ! R R7      p V P                  R\        ^ RR7       V P                  RRRR7       V P                  R	RR
R7       V P                  RRRRR7       V P	                  4       p\        4       pVP                  4        UUu. uF$  w  r4\        WAP                  4      '       g   K"  VNK&  	  pppVP                  '       d   VRVP                   p\        R\        V4       24       VP                  '       dP   VR,           F  p\        RV 24       K  	  \        V4      ^
8  d    \        R\        V4      ^
,
           R24       R# V'       g   \        R4       R# ^ p^ pR^ R^ R^ R^ R^ /p\        VP                  R7      ;_uu_4       GRj  xL
 p	\        V4       EFh  w  r\!        V^#4      p\        RV
^,            R\        V4       RV R2R RR!7       \#        W4      G Rj  xL
 w  rV'       g4   V^,          p\        R"V R#24       V	P%                  R$4      G Rj  xL
  K  R%\&        P(                  ! 4       P+                  4       /p. pVP-                  R&4      '       dL   VR&,          R'8  d>   VR&,          VR&   VR;;,          ^,          uu&   VP/                  R(VR&,          R) 24       VP-                  R*4      '       d   VR*,          VR+&   VP-                  R,4      '       d*   VR,,          VR-&   VP/                  R.VR,,          R) R/24       VP-                  R04      '       d(   VR0,          VR0&   VP/                  R1VR0,           24       VP-                  R24      '       d0   VP-                  R34      '       d   VR2,          VR4&   VR3,          VR5&   VP-                  R64      '       d   VR6,          VR7&   VP-                  R84      '       d   VR8,          VR9&   VP-                  R4      '       d=   VR,          VR&   VR;;,          ^,          uu&   VP/                  R:VR,           24       \1        VP-                  R;. 4      4      pVP                  4        F  w  ppW#,          P-                  V4      '       d   K%  VVV&   VR<8X  d,   VR;;,          ^,          uu&   VP/                  R=V 24       K\  VR>8X  g   Ke  VR;;,          ^,          uu&   VP/                  R?VR) R/24       K  	  \3        VP-                  R@RA4      4      pVP                  4        UUu. uF  w  ppV'       g   K  VNK  	  pppV'       d8   VVRB&   VR;;,          ^,          uu&   VP/                  RC\        V4       24       \5        VP-                  R4      VP-                  R04      ;'       g    VP-                  R04      4      pVe   VVRD&   VP-                  RE4      '       d   VRE,          VRE&   VP-                  RF4      ;'       g    RAP7                  4       oRG. R\O3RH. R]O3RI. R^O3RJ. R_O3R. R`O33 FM  w  pp\8        ;QJ d    V3RK lV 4       F  '       g   K   RM	  RLM! V3RK lV 4       4      '       g   KH  VVRM&    M	  \;        W#V4       V^,          pV'       d   RNP=                  VRO,          4      MRPp\        RQV R#24       V
^,           ^,          ^ 8X  d   \?        V4       \        RR4       V	P%                  RS4      G Rj  xL
  EKk  	  RRR4      GRj  xL
  \        RTV R\        V4       RUV RV24       \        RWVR,           24       \        RXVR,           24       \        RYVR,           24       \        RZVR,           24       \        R[VR,           24       V^ 8  d   \A        V4       R# R# u uppi  EL0 EL ELu uppi  L L  + GRj  xL 
 '       g   i     L; i5i)az$Enrich properties from listing pages)descriptionz--limitz Max properties to scrape (0=all))typedefaulthelpz	--dry-run
store_truezShow what would be scraped)actionru   z--forcezRe-scrape already enrichedz
--headlessTzRun headless)rw   rt   ru   Nz$Properties needing page enrichment: :N
   Nz  z
  ... and z morezNothing to do.pricebedslandrm   r`   )headlessz  [/z] z... )endflushzFAIL ()r5   r   ld_pricer4   zprice=z.0fld_roomsr/   ld_floor_sizer6   zbuild=m2r7   zyr=ld_latld_lonlatlonld_postcodepostcode
ld_countrycountryzDPE=rF   r-   zbeds=r3   zland=r]   r1   keyword_signalszkw=renovation_scorelisting_datebreadcrumb_textfarmchateauhousebarnc              3   ,   <"   T F	  qS9   x  K  	  R # 5irR   rS   )rT   rU   bcs   & r   rW   main.<locals>.<genexpr>  s     3(BRx(rY   Fproperty_typez, :N   NmarkedzOK (z  [saved progress]i  z

Enriched z (z failed)z  New prices: z  New bedrooms: z  New land sizes: z  DPE labels: z  Keyword signals: )r   ferme	boerderij)u   châteaur   kasteelmanor)r   maisonhuisvilla)r   grangeschuur)r{   terraingrond)!argparseArgumentParseradd_argumentrB   
parse_argsr   r[   r   r   limitprintlendry_runr	   r|   	enumerater   r+   r#   r   now	isoformatr   appendrO   ra   rp   r?   r\   r   joinr   r   )parserargsstorer(   p
candidatesenrichedfailedstatsr'   iprop_idr)   statusrG   	extractedfeature_fieldskvr^   active_signalsrenoptyper`   summaryr   s                            @r   mainr   ,  s    $$1WXF
	Q=_`
L?[\
	,=YZ
\4n]DFE$)KKM ;M&#*1jj9 #MJ ; zzz,
	0Z0A
BC|||c??CBse* #z?RJs:34E:;HFaFAuaQGET]]333t
+FAR(GC!uAc*o.b	=3dS!24!==LD!vha()++D111%x||~'?'?'ABFI xx
##Z(85(@"&z"2wg!#  6$z*:3)?!@Axx
##"&z"2wxx((-1/-B)*  6$*?)DB!GHxx%%'+L'9|$  3tL'9&:!;<xx!!dhhx&8&8 $Xu $Xuxx&&%)-%8z"xx%%$($6y! xx $Uue!  4U}!56 ,DHH_b,IJN&,,.1z~~a(( !F1IJf*!((55n,f*!((53r):; / $DHH["$=>G,3MMOAODAqqaaONA,:()j!Q&!  3s>':&;!<= -TXXe_n>P>PQ]>^>x>xbfbjbjkwbxyD-1)* xx'')-n)=~& ((,-33::<B%+-K$L&/1\%]&-/S%T&,.J%K&,.J%K	$Mx
 33(33333(333.3F?+$M 5v&MH2;dii	".GD	#$ A|q U*,''---} , 43B 
Kz3z?"32fXX
FG	N5>*
+,	U6]O
,-	uV}o
./	N5<.
)*	j 12
34!| ;. 4
 >
 2` BJ . 4333s  B`
__B#`
,?`
+_,`
/A_._
:_.>_!
?A_.A!_.%"_.?_.=_._.._."_./"_.B_.;_.A_.;_$_$A._.5_.8#_.A_.0_.B_._*
	_.(`
3_,4B(`
_.!_.$_.,`
.`	4_75
`	 `	`
__main__)piscinezwembadzswimming poolpoolpiscina)puitssourceforagewaterputwellspringboreholeu   étangvijverpondlacu   rivièreriverstreamruisseau)r   u   dépendanceannexer   outbuildingr   hangaratelierworkshopgarageremise
pigeonnierchaiu   séchoirbergerieu   écuriestable)z	plein sudsudzsouth-facingzsouth facingu   orienté sudzexposition sudzuidgerichtzop het zuiden)assainissementzfosse septiquesepticu   tout-à-l'égoutu	   raccordé	rioleringzmains drainage)u   gîteu   chambre d'hôtezb&bzbed and breakfastzguest housegastenkameru   chambres d'hôtesu   maison d'hôteszholiday rentalu   location saisonnièrevakantiewoning)r   parcellehectareprairieu   prévergerorchard	boomgaardpotagerzkitchen gardenmoestuinvignoblevineyard	wijngaardoliviersolive)F)__doc__r   asyncior@   r   r   r   r   r   r   r   r   r	   r
   rZ   r   r+   rO   ra   rp   r   __name__runrS   r   r   <module>r      s   "   	  f f f      
      
  5D)H"V:z(IX zKK r   