[ADD] added parser: main.py, result.csv, readme.md

kopeyev · kopeyev · commit 9d09eb7ba4e3 · 2025-06-18T01:34:04.000+05:00
diff --git a/README.md b/README.md
@@ -1,35 +1,57 @@
+# Booking.com Parser
 
-<!-- /!\ Non OCA Context : Set here the badge of your runbot / runboat instance. -->
-[![Pre-commit Status](https://github.com/it-projects-llc/parsers/actions/workflows/pre-commit.yml/badge.svg?branch=master)](https://github.com/it-projects-llc/parsers/actions/workflows/pre-commit.yml?query=branch%3Amaster)
-[![Build Status](https://github.com/it-projects-llc/parsers/actions/workflows/test.yml/badge.svg?branch=master)](https://github.com/it-projects-llc/parsers/actions/workflows/test.yml?query=branch%3Amaster)
-[![codecov](https://codecov.io/gh/it-projects-llc/parsers/branch/master/graph/badge.svg)](https://codecov.io/gh/it-projects-llc/parsers)
-<!-- /!\ Non OCA Context : Set here the badge of your translation instance. -->
+A Python script to scrape hotel data from **Booking.com** for hotels in Italy.
 
-<!-- /!\ do not modify above this line -->
+---
 
-#
+## Features
 
+The script uses **Selenium** to collect hotel links and extract detailed information from each hotel page, including:
 
+- Hotel name  
+- Street, ZIP, city, and state  
+  (looked up through the OpenStreetMap Nominatim reverse-geocoding API when they cannot be parsed from the page text)  
+- Whether it's a hotel (`is_hotel`)  
+- Whether it has a license number (`is_company`)  
+- Booking.com hotel ID (slug)
 
-<!-- /!\ do not modify below this line -->
+All results are saved into `result.csv`.
 
-<!-- prettier-ignore-start -->
+---
 
-[//]: # (addons)
+## Requirements
 
-This part will be replaced when running the oca-gen-addons-table script from OCA/maintainer-tools.
+- Python **3.7+** (tested with version 3.11.11)
+- **Google Chrome** and **ChromeDriver** (the two versions **must match**)
 
-[//]: # (end addons)
+## Install Dependencies
 
-<!-- prettier-ignore-end -->
+- `pip3 install selenium requests`
 
-## Licenses
+## How to Run
 
-This repository is licensed under [LGPL-3.0](LICENSE).
+- `python3 main.py`
 
-However, each module can have a totally different license, as long as they adhere to IT-Projects LLC
-policy. Consult each module's `__manifest__.py` file, which contains a `license` key
-that explains its license.
+## Configuration
 
-----
-<!-- /!\ Non OCA Context : Set here the full description of your organization. -->
+- In `main.py`, set `PAGES_AMOUNT` (line 43 at the time of writing) to the number of search-result pages you want to parse.
+  Each page contains approximately 25 hotel listings,
+  so increase `PAGES_AMOUNT` to scrape more results.
+
+## Output
+
+- The final data is written to `result.csv`
+- with the following columns:
+  - `bookingcom_id`
+  - `name`
+  - `street`
+  - `zip`
+  - `city`
+  - `state_id`
+  - `is_hotel`
+  - `is_company`
+
+## License
+
+- All rights reserved.
+- This script is the intellectual property of IT-Projects LLC.
diff --git a/main.py b/main.py
@@ -0,0 +1,122 @@
+import csv
+import re
+import time
+import requests
+from urllib.parse import urlparse
+
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+
+# --- Get address by latitude and longitude ---
+def get_address(lat, lon):
+    try:
+        res = requests.get(
+            "https://nominatim.openstreetmap.org/reverse",
+            params={"lat": lat, "lon": lon, "format": "json", "addressdetails": 1},
+            headers={"User-Agent": "PyBookingScraper/1.0"},
+            timeout=10
+        )
+        addr = res.json().get("address", {})
+        return {
+            "street": addr.get("road", ""),
+            "zip": addr.get("postcode", ""),
+            "city": addr.get("city") or addr.get("town") or addr.get("village", ""),
+            "state": addr.get("state") or addr.get("county") or addr.get("region", "")
+        }
+    except Exception as e:
+        print(f"Geo API error: {e}")
+        return {"street": "", "zip": "", "city": "", "state": ""}
+
+# --- Selenium setup ---
+options = webdriver.ChromeOptions()
+options.add_argument("--headless=new")
+options.add_argument("--no-sandbox")
+options.add_argument("--disable-dev-shm-usage")
+options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36")
+driver = webdriver.Chrome(service=Service(), options=options)
+
+# --- Collect hotel URLs ---
+urls = []
+PAGES_AMOUNT = 10 # Amount of pages you have to parse: each page contains approximately 25 records
+for page in range(PAGES_AMOUNT):
+    print(f"Scanning page {page + 1}")
+    search_url = f"https://www.booking.com/searchresults.html?ss=Italy&offset={page * 25}"
+    driver.get(search_url)
+    time.sleep(3)
+    try:
+        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "a[data-testid='title-link']")))
+        for link in driver.find_elements(By.CSS_SELECTOR, "a[data-testid='title-link']"):
+            href = link.get_attribute("href").split('?')[0]
+            if href and href not in urls:
+                urls.append(href)
+    except Exception as e:
+        print(f"Error on page {page + 1}: {e}")
+
+print(f"Found {len(urls)} links")
+
+# --- Parse hotel details ---
+with open("result.csv", "w", newline="", encoding="utf-8") as f:
+    writer = csv.writer(f)
+    writer.writerow(["bookingcom_id", "name", "street", "zip", "city", "state_id", "is_hotel", "is_company"])
+
+    for i, url in enumerate(urls, 1):
+        driver.get(url)
+        time.sleep(3)
+        print(f"Processing {i}/{len(urls)}: {url}")
+        try:
+            wait = WebDriverWait(driver, 10)
+            name = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "h2.pp-header__title"))).text.strip()
+            name_lower = name.lower()
+
+            is_hotel = "1" if any(w in name_lower for w in ["hotel", "отель", "albergo", "struttura ricettiva"]) else "0"
+            if is_hotel == "0":
+                try:
+                    desc = driver.find_element(By.CSS_SELECTOR, ".hp-description").text.lower()
+                    if any(w in desc for w in ["hotel", "отель", "albergo", "struttura ricettiva"]):
+                        is_hotel = "1"
+                except:
+                    pass
+
+            is_company = "0"
+            try:
+                info = driver.find_element(By.ID, "important_info").text.lower()
+                if re.search(r"(номер лицензии|license number|licenza n?)[:\s]+[\w\d\-, ]+", info):
+                    is_company = "1"
+            except:
+                pass
+
+            street = zip_code = city = state = ""
+            try:
+                addr_text = driver.find_element(By.CSS_SELECTOR, "div.b99b6ef58f.cb4b7a25d9").text.strip()
+                match = re.search(r"^(.*?),\s*(\d{5})\s+([\w\s\-']+),?\s*(Italy)?", addr_text)
+                if match:
+                    street, zip_code, city = match.group(1).strip(), match.group(2).strip(), match.group(3).strip()
+            except:
+                pass
+
+            if not (street and zip_code and city and state):
+                try:
+                    coords = driver.find_element(By.ID, "map_trigger_header").get_attribute("data-atlas-latlng")
+                    lat, lon = map(float, coords.split(","))
+                    addr = get_address(lat, lon)
+                    street = street or addr["street"]
+                    zip_code = zip_code or addr["zip"]
+                    city = city or addr["city"]
+                    state = addr["state"]
+                except Exception as e:
+                    print(f"No coordinates: {e}")
+
+            parsed = urlparse(url)
+            match = re.search(r'hotel/it/.+?-(\d+)\.en-gb\.html', parsed.path)
+            hotel_id = match.group(1) if match else parsed.path.split("/")[-1].split(".")[0]
+
+            writer.writerow([hotel_id, name, street, zip_code, city, state, is_hotel, is_company])
+
+        except Exception as e:
+            print(f"Error processing {url}: {e}")
+
+driver.quit()
+print("Done. Saved to result.csv")
diff --git a/result.csv b/result.csv