Skip to content

Commit 9d09eb7

Browse files
committed
[ADD] added parser: main.py, result.csv, readme.md
1 parent 5bfbf24 commit 9d09eb7

File tree

3 files changed

+164
-20
lines changed

3 files changed

+164
-20
lines changed

README.md

Lines changed: 42 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,57 @@
1+
# Booking.com Parser
12

2-
<!-- /!\ Non OCA Context : Set here the badge of your runbot / runboat instance. -->
3-
[![Pre-commit Status](https://github.com/it-projects-llc/parsers/actions/workflows/pre-commit.yml/badge.svg?branch=master)](https://github.com/it-projects-llc/parsers/actions/workflows/pre-commit.yml?query=branch%3Amaster)
4-
[![Build Status](https://github.com/it-projects-llc/parsers/actions/workflows/test.yml/badge.svg?branch=master)](https://github.com/it-projects-llc/parsers/actions/workflows/test.yml?query=branch%3Amaster)
5-
[![codecov](https://codecov.io/gh/it-projects-llc/parsers/branch/master/graph/badge.svg)](https://codecov.io/gh/it-projects-llc/parsers)
6-
<!-- /!\ Non OCA Context : Set here the badge of your translation instance. -->
3+
A Python script to scrape hotel data from **Booking.com** for hotels in Italy.
74

8-
<!-- /!\ do not modify above this line -->
5+
---
96

10-
#
7+
## Features
118

9+
The script uses **Selenium** to collect hotel links and extract detailed information from each hotel page, including:
1210

11+
- Hotel name
12+
- Street, ZIP, city, and state
13+
(retrieved automatically from the OpenStreetMap API when they cannot be parsed directly from the page text)
14+
- Whether it's a hotel (`is_hotel`)
15+
- Whether it has a license number (`is_company`)
16+
- Booking.com hotel ID (slug)
1317

14-
<!-- /!\ do not modify below this line -->
18+
All results are saved into `result.csv`.
1519

16-
<!-- prettier-ignore-start -->
20+
---
1721

18-
[//]: # (addons)
22+
## Requirements
1923

20-
This part will be replaced when running the oca-gen-addons-table script from OCA/maintainer-tools.
24+
- Python **3.7+** (tested with version 3.11.11)
25+
- **Google Chrome + ChromeDriver** (**both must be the same version**)
2126

22-
[//]: # (end addons)
27+
## Install dependencies:
2328

24-
<!-- prettier-ignore-end -->
29+
- pip3 install selenium requests
2530

26-
## Licenses
31+
## How to Run
2732

28-
This repository is licensed under [LGPL-3.0](LICENSE).
33+
- python3 main.py
2934

30-
However, each module can have a totally different license, as long as they adhere to IT-Projects LLC
31-
policy. Consult each module's `__manifest__.py` file, which contains a `license` key
32-
that explains its license.
35+
## Configuration
3336

34-
----
35-
<!-- /!\ Non OCA Context : Set here the full description of your organization. -->
37+
- In `main.py` (line 43), change `PAGES_AMOUNT` to set the number of search-result pages you want to parse.
38+
Each page contains approximately 25 hotel listings.
39+
Increase PAGES_AMOUNT to scrape more results.
40+
41+
## Output
42+
43+
- The final data will be written to: result.csv
44+
- With the following columns:
45+
bookingcom_id
46+
name
47+
street
48+
zip
49+
city
50+
state_id
51+
is_hotel
52+
is_company
53+
54+
## License
55+
56+
- All rights reserved.
57+
- This script is the intellectual property of IT-Projects LLC.

main.py

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
import csv
2+
import re
3+
import time
4+
import requests
5+
from urllib.parse import urlparse
6+
7+
from selenium import webdriver
8+
from selenium.webdriver.common.by import By
9+
from selenium.webdriver.chrome.service import Service
10+
from selenium.webdriver.support.ui import WebDriverWait
11+
from selenium.webdriver.support import expected_conditions as EC
12+
13+
# --- Get address by latitude and longitude ---
14+
def get_address(lat, lon):
    """Reverse-geocode a coordinate pair with the OpenStreetMap Nominatim API.

    Args:
        lat: Latitude, sent as the ``lat`` query parameter.
        lon: Longitude, sent as the ``lon`` query parameter.

    Returns:
        A dict with the keys ``street``, ``zip``, ``city`` and ``state``.
        Missing or null fields become empty strings, and all four values are
        empty strings when the lookup fails for any reason.
    """
    try:
        res = requests.get(
            "https://nominatim.openstreetmap.org/reverse",
            params={"lat": lat, "lon": lon, "format": "json", "addressdetails": 1},
            headers={"User-Agent": "PyBookingScraper/1.0"},
            timeout=10,
        )
        # Route HTTP errors (rate limiting, blocked client, ...) to the handler
        # below instead of silently parsing an error payload as "no address".
        res.raise_for_status()
        addr = res.json().get("address") or {}
        return {
            "street": addr.get("road") or "",
            "zip": addr.get("postcode") or "",
            # The populated-place key varies by settlement size.
            "city": addr.get("city") or addr.get("town") or addr.get("village") or "",
            "state": addr.get("state") or addr.get("county") or addr.get("region") or "",
        }
    except Exception as e:
        # Best-effort lookup: a failed geocode must not abort the scrape.
        print(f"Geo API error: {e}")
        return {"street": "", "zip": "", "city": "", "state": ""}
32+
33+
# --- Selenium setup ---
34+
options = webdriver.ChromeOptions()
35+
options.add_argument("--headless=new")
36+
options.add_argument("--no-sandbox")
37+
options.add_argument("--disable-dev-shm-usage")
38+
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36")
39+
driver = webdriver.Chrome(service=Service(), options=options)
40+
41+
# --- Collect hotel URLs ---
urls = []
PAGES_AMOUNT = 10  # Number of search-result pages to scan; each page holds approximately 25 records
# `urls` keeps discovery order for the CSV; `seen_urls` makes the duplicate
# check O(1) instead of scanning the list for every link.
seen_urls = set()
for page in range(PAGES_AMOUNT):
    print(f"Scanning page {page + 1}")
    # Results are paginated through the `offset` query parameter (25 per page).
    search_url = f"https://www.booking.com/searchresults.html?ss=Italy&offset={page * 25}"
    driver.get(search_url)
    time.sleep(3)
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "a[data-testid='title-link']"))
        )
        for link in driver.find_elements(By.CSS_SELECTOR, "a[data-testid='title-link']"):
            raw_href = link.get_attribute("href")
            if not raw_href:
                # get_attribute returns None when the attribute is missing;
                # skip that link instead of aborting the rest of the page.
                continue
            # Drop the query string so the same hotel is not collected twice
            # under different tracking parameters.
            href = raw_href.split("?")[0]
            if href and href not in seen_urls:
                seen_urls.add(href)
                urls.append(href)
    except Exception as e:
        # A page that fails to load or render is reported and skipped.
        print(f"Error on page {page + 1}: {e}")

print(f"Found {len(urls)} links")
59+
60+
# --- Parse hotel details ---
# Substrings that mark a listing as a hotel when found in its lower-cased
# name or description.
HOTEL_KEYWORDS = ("hotel", "отель", "albergo", "struttura ricettiva")
# Patterns are compiled once because they are reused for every hotel page.
LICENSE_RE = re.compile(r"(номер лицензии|license number|licenza n?)[:\s]+[\w\d\-, ]+")
ADDRESS_RE = re.compile(r"^(.*?),\s*(\d{5})\s+([\w\s\-']+),?\s*(Italy)?")
HOTEL_ID_RE = re.compile(r'hotel/it/.+?-(\d+)\.en-gb\.html')

try:
    with open("result.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["bookingcom_id", "name", "street", "zip", "city", "state_id", "is_hotel", "is_company"])

        for i, url in enumerate(urls, 1):
            print(f"Processing {i}/{len(urls)}: {url}")
            try:
                # The page load is inside the try so that one failing hotel
                # page is reported and skipped instead of aborting the run.
                driver.get(url)
                time.sleep(3)
                wait = WebDriverWait(driver, 10)
                name = wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "h2.pp-header__title"))
                ).text.strip()
                name_lower = name.lower()

                # is_hotel: keyword in the name, else keyword in the description.
                is_hotel = "1" if any(w in name_lower for w in HOTEL_KEYWORDS) else "0"
                if is_hotel == "0":
                    try:
                        desc = driver.find_element(By.CSS_SELECTOR, ".hp-description").text.lower()
                        if any(w in desc for w in HOTEL_KEYWORDS):
                            is_hotel = "1"
                    except Exception:
                        # The description is optional; keep is_hotel == "0".
                        pass

                # is_company: a license number is mentioned in "important info".
                is_company = "0"
                try:
                    info = driver.find_element(By.ID, "important_info").text.lower()
                    if LICENSE_RE.search(info):
                        is_company = "1"
                except Exception:
                    # The section is optional; keep is_company == "0".
                    pass

                # First try to parse "<street>, <5-digit zip> <city>" from the page.
                street = zip_code = city = state = ""
                try:
                    addr_text = driver.find_element(By.CSS_SELECTOR, "div.b99b6ef58f.cb4b7a25d9").text.strip()
                    addr_match = ADDRESS_RE.search(addr_text)
                    if addr_match:
                        street = addr_match.group(1).strip()
                        zip_code = addr_match.group(2).strip()
                        city = addr_match.group(3).strip()
                except Exception:
                    # Address element not found; rely on the geocoder below.
                    pass

                # `state` is never parsed from the page text, so it is still
                # empty here and the reverse-geocoding lookup runs for every
                # hotel; it also fills any address field left empty above.
                if not (street and zip_code and city and state):
                    try:
                        coords = driver.find_element(By.ID, "map_trigger_header").get_attribute("data-atlas-latlng")
                        lat, lon = map(float, coords.split(","))
                        addr = get_address(lat, lon)
                        street = street or addr["street"]
                        zip_code = zip_code or addr["zip"]
                        city = city or addr["city"]
                        state = addr["state"]
                    except Exception as e:
                        print(f"No coordinates: {e}")

                # Prefer the numeric id embedded in the URL path; otherwise
                # fall back to the slug (last path segment up to the first dot).
                parsed = urlparse(url)
                id_match = HOTEL_ID_RE.search(parsed.path)
                hotel_id = id_match.group(1) if id_match else parsed.path.split("/")[-1].split(".")[0]

                writer.writerow([hotel_id, name, street, zip_code, city, state, is_hotel, is_company])

            except Exception as e:
                print(f"Error processing {url}: {e}")
finally:
    # Always shut the browser down, even if the CSV cannot be opened or the
    # run is interrupted, so no headless Chrome process is left behind.
    driver.quit()

print("Done. Saved to result.csv")

result.csv

Whitespace-only changes.

0 commit comments

Comments
 (0)