#!/usr/bin/env python3
# Outline template — respect robots.txt and ToS of target sites.
import csv, json, re, unicodedata
from pathlib import Path
from typing import Dict, Any, List, Optional
import requests
from lxml import html as lxml_html

# HTTP headers passed to requests.get() in extract_detail(): a bot User-Agent
# and an Accept-Language that ranks Czech first, then English.
HEADERS = {'User-Agent': 'REALTczBot/1.0', 'Accept-Language': 'cs-CZ,cs;q=0.9,en;q=0.8'}

def clean_space(x):
    """Collapse each run of whitespace in *x* to one space and trim the ends.

    Falsy input (``None``, ``''``, ``0``) yields ``''``.  A truthy non-string
    value is converted with ``str()`` first, because values read from JSON
    are not guaranteed to be strings and ``re.sub`` would raise ``TypeError``.
    """
    if not x:
        return ''
    if not isinstance(x, str):
        x = str(x)
    return re.sub(r'\s+', ' ', x).strip()

def parse_json_ld(doc):
    """Merge every JSON-LD object embedded in *doc* into one flat dict.

    Args:
        doc: An object with an ``xpath()`` method returning the text of each
            ``<script type="application/ld+json">`` element (the caller in
            this file passes a parsed lxml HTML tree).

    Returns:
        A dict built by ``dict.update`` in document order, so when several
        objects share a key the last one wins.  A top-level JSON array
        contributes each of its dict members; non-dict values are ignored.
        Scripts that fail to parse are skipped.
    """
    merged = {}
    for raw in doc.xpath('//script[@type="application/ld+json"]/text()'):
        # Keep the try body to the one call that can fail on bad page data.
        try:
            payload = json.loads(raw)
        except (ValueError, TypeError, RecursionError):
            # Best effort: a malformed block must not hide the valid ones.
            continue
        nodes = payload if isinstance(payload, list) else [payload]
        for node in nodes:
            if isinstance(node, dict):
                merged.update(node)
    return merged

def _price_digits(price: Any) -> str:
    """Return the whole-unit part of a JSON-LD price as a digit string.

    Returns ``''`` when there is no usable price.
    """
    if isinstance(price, bool) or not price:
        return ''
    if isinstance(price, float):
        try:
            price = int(price)
        except (OverflowError, ValueError):
            # inf / nan cannot be expressed as a whole amount.
            return ''
    text = str(price)
    # Drop a trailing 1-2 digit fraction first; otherwise "4500000.00" would
    # lose its separator and be read as 450000000.  Three-digit groups such
    # as "4.500.000" are thousands separators and are left alone here.
    text = re.sub(r'[.,]\d{1,2}(?=\D*$)', '', text)
    return re.sub(r'\D', '', text)


def _image_urls(images: Any) -> List[str]:
    """Flatten a JSON-LD ``image`` value (str, dict or list of either) to URLs."""
    if isinstance(images, (str, dict)):
        images = [images]
    if not isinstance(images, list):
        return []
    urls = []
    for entry in images:
        if isinstance(entry, dict):
            # ImageObject form: the address is under "url" or "contentUrl".
            entry = entry.get('url') or entry.get('contentUrl')
        if isinstance(entry, str) and entry:
            urls.append(entry)
    return urls


def extract_detail(url: str) -> Optional[Dict[str, Any]]:
    """Fetch one listing page and map its JSON-LD data to a flat CSV row.

    Args:
        url: Address of the detail page to download.

    Returns:
        A dict with the keys ``source_id``, ``source_url``, ``title``,
        ``description_html``, ``price_total``, ``photos``, ``city``,
        ``district_prague``, ``neighborhood``, ``gps_lat`` and ``gps_lng``;
        fields with no data stay ``''``.  ``None`` is returned when the
        request fails, the status is not 200, or the body is empty.

    NOTE(review): ``city`` and ``neighborhood`` are never populated by this
    function; they are only emitted as empty columns.
    """
    try:
        r = requests.get(url, headers=HEADERS, timeout=25)
    except requests.RequestException:
        # Same "no data" signal as a non-200 answer, so one dead link does
        # not raise out of a batch run.
        return None
    # lxml raises on an empty document, so treat a blank body as a miss.
    if r.status_code != 200 or not r.text.strip():
        return None
    doc = lxml_html.fromstring(r.text)
    j = parse_json_ld(doc)

    item = {k: '' for k in 'source_id,source_url,title,description_html,price_total,photos,city,district_prague,neighborhood,gps_lat,gps_lng'.split(',')}
    item['source_url'] = url

    if isinstance(j, dict):
        item['title'] = j.get('name', '')
        item['description_html'] = j.get('description', '')

        # Prefer the nested offer price; fall back to a top-level "price".
        offers = j.get('offers')
        price = offers.get('price') if isinstance(offers, dict) else j.get('price')
        digits = _price_digits(price)
        if digits:
            item['price_total'] = digits

        addr = j.get('address') or {}
        if isinstance(addr, dict):
            loc = clean_space(addr.get('addressLocality', ''))
            # "Praha 5" / "Praha5" -> "praha-5"
            m = re.search(r'Praha\s*(\d+)', loc)
            if m:
                item['district_prague'] = f'praha-{m.group(1)}'

        geo = j.get('geo') or {}
        if isinstance(geo, dict):
            item['gps_lat'] = str(geo.get('latitude') or '')
            item['gps_lng'] = str(geo.get('longitude') or '')

        item['photos'] = ','.join(_image_urls(j.get('image')))

    # The first run of 4+ digits in the URL serves as the listing id; the
    # URL itself is the fallback so the column is never empty.
    m = re.search(r'(\d{4,})', url)
    item['source_id'] = m.group(1) if m else url
    return item

if __name__ == '__main__':
    # Example: python parser.py urls.txt output.csv
    # Reads one URL per line from the first argument, scrapes each page with
    # extract_detail() and writes the collected rows to the CSV file named
    # by the second argument.
    import sys

    if len(sys.argv) < 3:
        print('Usage: parser.py <urls.txt> <out.csv>')
        sys.exit(1)

    # Blank lines in the URL list are ignored.
    lines = Path(sys.argv[1]).read_text(encoding='utf-8').splitlines()
    urls = [u.strip() for u in lines if u.strip()]

    rows = []
    for u in urls:
        try:
            it = extract_detail(u)
        except Exception as exc:
            # Top-level boundary: report and move on so one bad page does not
            # throw away the rows already collected.
            print(f'Skipping {u}: {exc}', file=sys.stderr)
            continue
        if it:
            rows.append(it)

    fieldnames = ['source_id', 'source_url', 'title', 'description_html',
                  'price_total', 'photos', 'city', 'district_prague',
                  'neighborhood', 'gps_lat', 'gps_lng']
    with open(sys.argv[2], 'w', newline='', encoding='utf-8') as f:
        w = csv.DictWriter(f, fieldnames=fieldnames)
        w.writeheader()
        w.writerows(rows)
