+
    jhO                     :   R t ^ RIt^ RIHt ^ RIHtHtHtHt ^ RI	t	 ! R R4      t
R R lt]R8X  d   ^ RItR	t]! R
4       ]! R4       ]! R] R24       RR/t]P"                  ! ]]^
R7      t]P&                  ^8X  d   ]! ]P(                  ]4      t]
! ]P(                  ]4      t]P/                  4        ]! R4       ]! R4       ]! ]P1                  4       4       ]! R4       ]! R4       ]! ]P3                  4       4       R# ]! R]P&                   24       R# R# )zp
Structured Property Facts Extractor
Extracts structured data from property pages for high-quality GPT analysis
N)BeautifulSoup)DictOptionalListAnyc                   2  a  ] tR t^t o RtV 3R lR ltV 3R lR ltV 3R lR ltV 3R lR	 ltV 3R
 lR lt	V 3R lR lt
V 3R lR ltV 3R lR ltV 3R lR ltV 3R lR ltV 3R lR ltV 3R lR ltV 3R lR ltV 3R lR ltV 3R lR ltR tV tR!# )"PropertyFactsExtractorz+Extract structured facts from property HTMLc                &   < V ^8  d   QhRS[ RS[ /# )   html_contenturlstr)format__classdict__s   "^/Users/jonathan/Documents/Zakelijk/ClaudeOS/03_Lab/farmmatch/scraper/extract_property_facts.py__annotate__#PropertyFactsExtractor.__annotate__   s      S s     c                B    \        VR 4      V n        W n        / V n        R# )zhtml.parserN)r   soupr   facts)selfr   r   s   &&&r   __init__PropertyFactsExtractor.__init__   s    !,>	
r   c                6   < V ^8  d   QhRS[ S[S[3,          /# r
   returnr   r   r   )r   r   s   "r   r   r      s      T#s(^ r   c                   RV P                   RV P                  4       RV P                  4       RV P                  4       RV P	                  4       RV P                  4       RV P                  4       RV P                  4       R	V P                  4       R
V P                  4       RV P                  4       RV P                  4       /V n        V P                  # )z3Extract all structured facts from the property pager   titledescriptionpricelocationproperty_detailsland_detailsbuilding_details	amenitiesfeaturesenergy_infokey_highlights)r   _extract_title_extract_description_extract_price_extract_location_extract_property_details_extract_land_details_extract_building_details_extract_amenities_extract_features_extract_energy_info_extract_key_highlightsr   r   s   &r   extract_all"PropertyFactsExtractor.extract_all   s     488T((*4446T((*..0 > > @D668 > > @002..04446d::<

 zzr   c                    < V ^8  d   QhRS[ /# r   r   )r   r   s   "r   r   r   %   s     ; ; ;r   c                p    V P                   P                  R4      pV'       d   VP                  RR7      # R# )zExtract property titleh1Tstrip )r   findget_text)r   r    s   & r   r+   %PropertyFactsExtractor._extract_title%   s,    		t$-2u~~D~)::r   c                    < V ^8  d   QhRS[ /# r   r   )r   r   s   "r   r   r   *   s     - -c -r   c                ^   V P                   P                  RRR7      p. pVR,           Fj  pVP                  RR7      pV'       g   K  \        V4      ^28  g   K1  \        P
                  ! RRV\        P                  R	7      pVP                  V4       Kl  	  R
P                  V4      R,          # )z+Extract main property description (cleaned)divzlisting-section-contentclass_:Nr
   NTr<   z7(Cookie|Privacy|Terms|Contact us|Show more|Lees meer).*r>   )flags :Ni  N)	r   find_allr@   lenresub
IGNORECASEappendjoin)r   sectionsdescriptionssectiontexts   &    r   r,   +PropertyFactsExtractor._extract_description*   s     99%%e4M%N||G##$#/DtD	BvvXZ\^bjljwjwx##D) $ xx%e,,r   c                0   < V ^8  d   QhRS[ S[,          /# r   )r   int)r   r   s   "r   r   r   9   s     # # #r   c           	        V P                   P                  RRR/4      pV'       de   VP                  RR7      pRP                  \	        \
        P                  V4      4      pV'       d"    \        V4      pRTu;8:  d
   R8:  d    V#  M V P                   P                  4       p. R
OpV F  p\        P                  ! Wu\        P                  4      pV'       g   K2  RP                  \	        \
        P                  VP                  ^4      4      4      pV'       g   Kt   \        V4      pRTu;8:  d   R8:  d	   M K  Vu # K  	  R	#     L; i    K  ; i)zExtract price as integerspanitempropr"   Tr<   r>   '  i N)u+   €\s*(\d{1,3}(?:[.,]\d{3})*(?:[.,]\d{2})?)zEUR\s*(\d{1,3}(?:[.,]\d{3})*)u(   Price:?\s*€?\s*(\d{1,3}(?:[.,]\d{3})*))r   r?   r@   rO   filterr   isdigitrV   rK   searchrM   group)	r   
price_span
price_textdigitsr"   rS   patternspatternmatchs	   &        r   r-   %PropertyFactsExtractor._extract_price9   s    YY^^FZ,AB
#,,4,8JWWVCKK<=FKE11$ 2 yy!!#

  GIIgR]];EuU[[^!DE6 #F E5X55#(L 6   /&s$   +E E  E>EEEc                F   < V ^8  d   QhRS[ S[S[S[,          3,          /# r   r   r   r   )r   r   s   "r   r   r   ^   s"     
 
4Xc](:#; 
r   c           
        V P                   P                  RRR7      pV'       d   VP                  RR7      MRpVP                  R4       Uu. uF  q3P	                  4       NK  	  ppRTR	\        V4      ^ 8  d
   V^ ,          MR
R\        V4      ^8  d
   V^,          MR
R\        V4      ^8  d   VR,          /# R
/# u upi )zExtract location detailsrD   zitem-locationrE   Tr<   r>   ,fullcityNregioncountry)r   r?   r@   splitr=   rJ   )r   location_divlocation_textppartss   &    r   r.   (PropertyFactsExtractor._extract_location^   s    yy~~eO~D=I--D-9r %2$7$7$<=$<q$<= ME
QE!HD#e*q.eAhdCJNuRy	
 	
 9=	
 	
 >s   B=c                6   < V ^8  d   QhRS[ S[S[3,          /# r   r   )r   r   s   "r   r   r   m   s     F F4S> Fr   c                   V P                   P                  RRR7      pRRRRRRRRR	RR
RRRRR/pV'       g   V# VP                  4       p\        P                  ! RV\        P
                  4      pV'       d;   \        V^ ,          4      VR&   \        V4      ^8  d   \        V^,          4      VR&   \        P                  ! RV\        P
                  4      pV'       d   \        VP                  ^4      4      VR&   \        P                  ! RV\        P
                  4      pV'       d@   \        VP                  ^4      P                  RR4      4      p\        VR,          4      VR&   \        P                  ! RV\        P
                  4      pV'       d   \        VP                  ^4      4      VR&   \        P                  ! RV\        P
                  4      p	V	'       d   \        V	P                  ^4      4      VR	&   \        P                  ! RV\        P
                  4      p
V
'       d   \        V
P                  ^4      4      VR
&   . ROpV FR  p\        P                  ! RV,           R,           V\        P
                  4      '       g   K?  VP                  4       VR&    M	  \        P                  ! RV\        P
                  4      pV'       d2   \        VP                  ^4      4      pRTu;8:  d   R8:  d	   M V# WR&   V# )z.Extract core property metrics from 'areas' divrD   areasrE   total_area_m2Nliving_area_m2land_area_m2bedrooms	bathroomsroomsproperty_type
year_builtu   (\d{1,6})\s*(?:m²|m2|sq\s*m)u>   (?:plot|land|terrain|grond)[:\s]*(\d{1,8})\s*(?:m²|m2|sq\s*m)z*(\d{1,4}(?:[.,]\d{1,2})?)\s*(?:ha|hectare)ri   .rZ   z.(\d{1,2})\s*(?:bedroom|chambre|slaapkamer|bed)z4(\d{1,2})\s*(?:bathroom|salle de bain|badkamer|bath)u'   (\d{1,2})\s*(?:room|pièce|kamer|piece)\bz+(?:built|constructed|bouwjaar)[:\s]*(\d{4})i@  i  )	farmhousevillahousecottagemanorestatemasferme	boerderiju   châteaucastle)r   r?   r@   rK   findallrM   rV   rJ   r]   r^   floatreplacer    )r   	areas_divdetailsrS   area_matches
land_matchhectare_matchhabedroom_matchbathroom_matchrooms_matchtypes	prop_type
year_matchyears   &              r   r/   0PropertyFactsExtractor._extract_property_detailsm   s   IINN5N9	TdDTT$	
 N!!# zz"BD"--X'*<?';GO$< 1$,/Q,@() YY`bfhjhuhuv
&)**:*:1*=&>GN# 		"OQUWYWdWde}**1-55c3?@B&)"u*oGN# 		"SUY[][h[hi"%m&9&9!&<"=GJ #Z\`bdbobop#&~';';A'>#?GK  ii JDRTR_R_`";#4#4Q#78GGDIyy*U2D"--HH+4??+<(  YYMtUWUbUbc
z''*+Dt#t#  )-%r   c                6   < V ^8  d   QhRS[ S[S[3,          /# r   r   )r   r   s   "r   r   r      s     = =tCH~ =r   c                "   V P                   P                  4       pRRRRRRRRRRRRRRR	RR
RRRRRRR/pR. ROR. ROR. ROR. ROR. ROR. ROR	. ROR
. ROR. RO/	pVP                  4        F^  w  rEV FC  p\        P                  ! RV,           R,           V\        P
                  4      '       g   K?  RW$&    M	  W$,          e   KZ  RW$&   K`  	  \        VR,          VR,          VR,          VR,          .4      VR&   \        P                  ! RV\        P
                  4      '       d   RVR&   MRVR&   . ROpV FS  p\        P                  ! RV,           R,           V\        P
                  4      '       g   K?  VP                  4       VR&    V# 	  V# )zBExtract land-specific details (important for regenerative farming)has_water_sourceNhas_well
has_springhas_pondhas_river_access
has_foresthas_orchardhas_vineyardhas_pasturehas_arable_landirrigation_available	soil_typer   TFz-\b(irrigation|irrigated|arrosage|irrigatie)\b)wellpuitswaterputbron)springsourcer   )pondu   étangvijverlakelacmeer)riveru   rivièrerivierstreamruisseaubeek)forestwoodlandu   forêtbostreesarbresbomen)orchardverger	boomgaardzfruit trees)vineyardvignoble	wijngaardvines)pastureprairieweidemeadowgrazing)arable
cultivablefarmlandagricultural)	clayloamsandyargilelimonsableuxkleileemzand)r   r@   itemsrK   r]   rM   anyr    )	r   rS   r   water_keywordskeykeywordskeyword
soil_typessoils	   &        r   r0   ,PropertyFactsExtractor._extract_land_details   s   yy!!# $$4D4t"D
" =6K ]]LJOS

 ,113MC#99UW_u4dBMMJJ#'GL $ |#$ 4 '*JL!J&'	+
 '"# 99EtR]][[.2G*+.3G*+ e
Dyy-tR]]CC'+zz|$ 
 r   c                6   < V ^8  d   QhRS[ S[S[3,          /# r   r   )r   r   s   "r   r   r      s     . .4S> .r   c                z   V P                   P                  4       pRRRRRRRRRRRRRRR	RR
RRR/
pR. ROR. RORRR.R. ROR. ROR. ROR. RO/pVP                  4        F^  w  rEV FC  p\        P                  ! RV,           R,           V\        P
                  4      '       g   K?  RW$&    M	  W$,          e   KZ  RW$&   K`  	  \        P                  ! RV\        P
                  4      '       d   RVR&   RVR
&   V# \        P                  ! RV\        P
                  4      '       d   RVR
&   RVR&   V# RVR
&   RVR&   V# )zFExtract building-specific details (for workshops, guest accommodation)has_barnN
has_stable
has_garagehas_workshophas_outbuildinghas_separate_apartmenthas_guest_housetotal_buildingsrenovation_neededmove_in_readygaragecarportr   TFu8   \b(renovated|restored|modernized|rénové|gerenoveerd)\buJ   \b(renovation needed|to renovate|à rénover|te renoveren|work required)\b)barngrangeschuurhangar)stableu   écuriestal)workshopatelier
werkplaatsshed)outbuildingu   dépendance	bijgebouwannexe)	apartmentflatappartementstudio)guest houseu   gîtegastenverblijfr   )r   r@   r   rK   r]   rM   )r   rS   r   building_keywordsr   r   r   s   &      r   r1   0PropertyFactsExtractor._extract_building_details   ss   yy!!# $$Dt$dttT
 >78Y/IT$&TT
 /446MC#99UW_u4dBMMJJ#'GL $ |#$ 7 99PRVXZXeXeff'+GO$+0G'(  YYdfjlnlylyzz+/G'(',GO$
  ,0G'('+GO$r   c                0   < V ^8  d   QhRS[ S[,          /# r   r   r   )r   r   s   "r   r   r   $  s     $ $DI $r   c                   . pV P                   P                  4       p. RNRNRNRNRNRNRNRNR	NR
NRNRNRNRNRNRNRNRNRNRNRNRNRNRNRNRNRNRNRNRNRNR NR!NR"NR#NR$NR%NR&NR'NR(NR)NR*NR+NR,NpV F^  p\        P                  ! R-V,           R-,           V\        P                  4      '       g   K?  VP                  VP                  4       4       K`  	  \        \        V4      4      # ).zExtract amenities/features listinternetwififiber	broadbandheatingzcentral heating	chauffage
verwarmingzair conditioningclimatisationaircozsolar panelszpanneaux solaireszonnepanelenz	heat pumpu   pompe à chaleur
warmtepomp	fireplaceu	   cheminéez
open haardpoolzswimming poolpiscinezwembadterraceterrasseterrasbalconybalconbalkonelectricityelectricu   électricitéelektriciteitzwater mainsz
town waterzeau couranteleidingwaterzseptic tankzfosse septiquesepticz
well waterzspring waterzrainwater harvestingr   )	r   r@   rK   r]   rM   rN   r    listset)r   r'   rS   amenity_listamenitys   &    r   r2   )PropertyFactsExtractor._extract_amenities$  s&   	yy!!#


 '
)4
 
 )
 +6
 8D
 	
 !0	
 29	

 

 0

 2@
 
 ,
 .:
 
 %
 '3
 
 $
 &/
 1:
 
 "
 $,
 
  
 "*
 
 &
 (7
 9H
 
 (
 *8
 :H
 
 ,
 .6
 
 )
 +A
$ $Gyy50$FF  1 $ C	N##r   c                0   < V ^8  d   QhRS[ S[,          /# r   r  )r   r   s   "r   r   r   A  s      49 r   c                   . pV P                   P                  4       p. ROpV F^  p\        P                  ! RV,           R,           V\        P                  4      '       g   K?  VP                  VP                  4       4       K`  	  V# )zExtract special featuresr   )zpanoramic viewzmountain viewzsea viewzvalley viewprivatesecludedisolatedquietpeacefulzsouth facingsunnybrightstonetraditional	authentic	characterorganicpermaculture
biodynamiczplanning permissionzbuilding permitzpermis de construire)r   r@   rK   r]   rM   rN   r    )r   r(   rS   feature_listfeatures   &    r   r3   (PropertyFactsExtractor._extract_featuresA  se    yy!!#
 $Gyy50$FF0 $ r   c                F   < V ^8  d   QhRS[ S[S[S[,          3,          /# r   rg   )r   r   s   "r   r   r   U  s"      d3+=&> r   c                   V P                   P                  4       pRRRR/p\        P                  ! RV\        P                  4      pV'       d#   VP                  ^4      P                  4       VR&   \        P                  ! RV\        P                  4      pV'       d#   VP                  ^4      P                  4       VR&   V# )zExtract energy performance dataenergy_ratingN	dpe_scorez,Energy\s*(?:rating|class|label)[:\s]*([A-G])zDPE[:\s]*([A-G]|\d{1,3}))r   r@   rK   r]   rM   r^   upper)r   rS   inforating_match	dpe_matchs   &    r   r4   +PropertyFactsExtractor._extract_energy_infoU  s    yy!!# T
 yy!PRVXZXeXef$0$6$6q$9$?$?$AD! II94O	 ) 2 8 8 :Dr   c                0   < V ^8  d   QhRS[ S[,          /# r   r  )r   r   s   "r   r   r   j  s      c r   c                P   . pV P                   P                  RR.\        P                  ! R4      R7      pV Fj  pVP                  R4      pVR,           FI  pVP	                  RR7      pV'       g   K  \        V4      ^8  g   K1  VP                  VR	,          4       KK  	  Kl  	  V# )
z3Extract key selling points from structured sectionsulolzfeature|highlight|keyrE   liN
   NTr<   :Nd   N)r   rI   rK   compiler@   rJ   rN   )r   
highlightskey_featuresrF  r   itemrS   s   &      r   r5   .PropertyFactsExtractor._extract_key_highlightsj  s    
 yy))4,rzzJa?b)cBKK%Ec

}}4}04CIM%%d4j1 #  r   c                    < V ^8  d   QhRS[ /# r   r   )r   r   s   "r   r   r   y  s     D D Dr   c                H    \         P                  ! V P                  R^R7      # )zConvert facts to clean JSONF)ensure_asciiindent)jsondumpsr   r6   s   &r   to_jsonPropertyFactsExtractor.to_jsony  s    zz$**5CCr   c                    < V ^8  d   QhRS[ /# r   r   )r   r   s   "r   r   r   }  s     `  `  ` r   c                   . pV P                   P                  R4      '       d&   VP                  RV P                   R,           24       V P                   P                  R/ 4      P                  R4      '       d-   VP                  RV P                   R,          R,           24       V P                   P                  R4      '       d'   VP                  RV P                   R,          R 24       VP                  R	4       V P                   P                  R
/ 4      pVP                  R4      '       d   VP                  RVR,           24       VP                  R4      '       d   VP                  RVR,           R24       VP                  R4      '       d   VP                  RVR,           R24       VP                  R4      '       d0   VP                  RVR,          R RVR,          R,          R R24       VP                  R4      '       d   VP                  RVR,           24       VP                  R4      '       d   VP                  RVR,           24       VP                  R4      '       d   VP                  RVR,           24       VP                  R	4       V P                   P                  R/ 4      p. pVP                  R4      '       d   . pVP                  R 4      '       d   VP                  R!4       VP                  R"4      '       d   VP                  R#4       VP                  R$4      '       d   VP                  R%4       VP                  R&4      '       d   VP                  R'4       V'       d$   VP                  R(R)P                  V4       24       VP                  R*4      '       d   VP                  R+4       VP                  R,4      '       d   VP                  R-4       VP                  R.4      '       d   VP                  R/4       VP                  R04      '       d   VP                  R14       VP                  R24      '       d   VP                  R3VR2,           24       V'       d@   VP                  R44       V F  pVP                  R5V 24       K  	  VP                  R	4       V P                   P                  R6/ 4      p. pVP                  R74      '       d   VP                  R84       VP                  R94      '       d   VP                  R:4       VP                  R;4      '       d   VP                  R<4       VP                  R=4      '       d   VP                  R>4       VP                  R?4      '       d   VP                  R@4       VP                  RA4      '       d   VP                  RB4       VP                  RC4      '       d   VP                  RD4       V'       d5   VP                  RER)P                  V4       24       VP                  R	4       VP                  RF4      e9   VRF,          '       d   RGMRHp	VP                  RIV	 24       VP                  R	4       V P                   P                  RJ. 4      p
V
'       d<   VP                  RKR)P                  V
RL,          4       24       VP                  R	4       V P                   P                  RMR	4      pV'       dK   VP                  RN4       VP                  VRO,          4       \	        V4      RP8  d   VP                  RQ4       RRP                  V4      # )Sz)Convert facts to GPT-friendly text formatr    z
Property: r#   rj   z
Location: r"   u
   Price: €ri   r>   r$   r~   zType: rx   zTotal area: u    m²ry   zLiving area: rz   zLand area: u    m² (rZ   z.2fz ha)r{   z
Bedrooms: r|   zBathrooms: r   zYear built: r%   r   r   r   r   r   r   r   r   zriver accesszWater: z, r   zForest/woodland presentr   Orchardr   zPasture/meadowr   zIrrigation availabler   zSoil: zLand features:z  - r&   r   r   r   r   r   r   r   r   r   outbuildingsr   r  r   zseparate apartmentzAdditional buildings: r   zRenovation neededzMove-in readyzCondition: r'   zAmenities: rI  r!   zDescription::Ni  Ni  z...
)r   getrN   rO   rJ   )r   linespdldland_featureswater_typesr9  bd	buildingsstatusr'   descs   &           r   to_prompt_text%PropertyFactsExtractor.to_prompt_text}  s    ::>>'""LL:djj&9%:;<::>>*b)--f55LL:djj&<V&D%EFG::>>'""LL:djj&9!%<=>R ZZ^^.366/""LL6"_"5!67866/""LL<?(;'<DAB66"##LL=,<)=(>dCD66.!!LL;r.'9!&<F2nCUV[C[\_B``def66*LL:bn%56766+LL;r+&78966,LL<<(8'9:;R ZZ^^NB/66$%%Kvvj!!;#5#5f#=vvl##[%7%7%Avvj!!;#5#5f#=vv());+=+=n+M$$wtyy/E.F%GH66,  !:;66-    +66-    !1266())  !7866+  6"[/):!;<LL)*(tG9-. )LL ZZ^^.3	66*y//766,!1!1(!;66,!1!1(!;66.!!9#3#3J#?66#$$i&6&6~&F66#$$i&6&6}&E66*++Y-=-=>R-SLL1$))I2F1GHILL66%&2,./B,C,C(FLL;vh/0LL JJNN;3	LL;tyy3'@&ABCLL zz~~mR0LL(LLd$4y3U#yyr   )r   r   r   N)__name__
__module____qualname____firstlineno____doc__r   r7   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   rW  rh  __static_attributes____classdictcell__)r   s   @r   r   r      s     5 
 $; ;
- -# #J
 
F FP= =~. .`$ $: ( * D D`  ` r   r   c                ^    V ^8  d   QhR\         R\         R\        \         \        3,          /# )r
   r   r   r   )r   r   r   )r   s   "r   r   r     s)     # # #3 #4S> #r   c                8    \        W4      pVP                  4       # )z
Main function to extract structured facts from property HTML

Args:
    html_content: Raw HTML from property page
    url: Property URL

Returns:
    Dictionary with structured property facts
)r   r7   )r   r   	extractors   && r   extract_property_factsrt    s     '|9I  ""r   __main__zJhttps://www.properstar.com/property-for-sale/france/lot-et-garonne/7837046z Testing Property Facts ExtractorzURL: r]  z
User-AgentzDMozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15)headerstimeoutzSTRUCTURED JSON:z

GPT PROMPT FORMAT:zFailed to fetch page: zF======================================================================)rn  rK   bs4r   typingr   r   r   r   rU  r   rt  rj  requeststest_urlprintrv  r^  responsestatus_coderS   r   rs  r7   rW  rh   r   r   <module>r     s   
  , , R  R j# z[H	
,-	(O	E(2
 	\G ||HgrBHs"&x}}h?*8==(C	 !hi!"&'hi&&()&x';';&<=>; r   