Skip to content

Commit 9864fe8

Browse files
committed
[ADD] main.py, result.csv, booking_query_payload.json
1 parent 5bfbf24 commit 9864fe8

File tree

4 files changed

+228
-1
lines changed

4 files changed

+228
-1
lines changed

README.md

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,43 @@
77

88
<!-- /!\ do not modify above this line -->
99

10-
#
10+
# Booking.com Hotel Scraper (Italy - Sardinia & Calabria)
11+
12+
This Python script scrapes hotel data from [Booking.com](https://www.booking.com) using their internal GraphQL API. It collects hotels from the Sardinia and Calabria regions and saves them in a `result.csv` file.
13+
14+
## Features
15+
16+
- Scrapes hotel `id`, `name`, `city`, and `address`
17+
- Adds constant fields: `state_id=base.it`, `is_hotel=1`, `is_company=1`
18+
- Handles pagination
19+
- Avoids duplicates
20+
21+
## Requirements
22+
23+
- Python 3.8+
24+
- Install dependencies:
25+
26+
```bash
27+
pip install requests
28+
```
29+
## Usage
30+
31+
- Clone the repository
32+
- Set the maximum number of hotels to request per region via the `maximum_hotels` variable in main.py, for example:
33+
maximum_hotels = 1000 # Set your desired hotel count limit
34+
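- Run the script from the `parse_hotels_italy/` directory (it opens `booking_query_payload.json` relative to the current working directory):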
35+
python3 main.py
36+
- Output will be saved as result.csv.
37+
38+
## Output Format
39+
40+
id,name,city,address,state_id,is_hotel,is_company
41+
123456,Hotel Roma,Rome,Via Nazionale 1,base.it,1,1
42+
43+
## Notes
44+
45+
This script uses Booking.com's internal (unofficial) GraphQL API and may stop working if the site structure changes.
46+
Use it responsibly to avoid temporary IP bans; the script pauses 1.5 seconds between requests.
1147

1248

1349

parse_hotels_italy/booking_query_payload.json

Lines changed: 55 additions & 0 deletions
Large diffs are not rendered by default.

parse_hotels_italy/main.py

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
import copy
2+
import csv
3+
import json
4+
import logging
5+
import time
6+
7+
import requests
8+
9+
logger = logging.getLogger(__name__)

# GraphQL endpoint that every search request is POSTed to.
url = "https://www.booking.com/dml/graphql?ss=Italy&lang=en-us&aid=304142"

# Headers sent with every request.
# NOTE(review): the CSRF token looks like a JWT whose payload carries a
# 24-hour expiry, so it probably has to be refreshed from a live browser
# session before each run -- confirm.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/137.0.0.0 Safari/537.36"
    ),
    "Content-Type": "application/json",
    "Referer": "https://www.booking.com/searchresults.html?region=908",
    "x-booking-csrf-token": (
        "eyJhbGciOiJIUzUxMiJ9.eyJpc3MiOiJjb250ZXh0LWVucmljaG1lbnQtYXBpIiwic3"
        "ViIjoiY3NyZi10b2tlbiIsImlhdCI6MTc1MDM2NzM3NSwiZXhwIjoxNzUwNDUzNzc1fQ."
        "1mpLQoicPHNU-p3a9XLxVrqlTQjNK4BmyVcR-HOL81Mt_dsAxAkLDulrTwGa42XBqnp1pfnNIguXKSKTNbBIXQ"
    ),
    "x-booking-pageview-id": "d82194c7b1470608",
    "apollographql-client-name": "b-search-web-searchresults_rust",
    "x-booking-context-action-name": "searchresults_irene",
    "x-booking-context-aid": "304142",
    "x-booking-dml-cluster": "rust",
    "x-booking-site-type-id": "1",
    "x-booking-topic": "capla_browser_b-search-web-searchresults",
}

cookies = {"bkng": ""}

# Regions to scrape; each destId is sent as a destination of type "REGION".
regions = [
    {"name": "Sardinia", "destId": 908},
    {"name": "Calabria", "destId": 897},
]

# Upper bound (exclusive) on the pagination offset requested per region.
maximum_hotels = 10000

# Offset increment between consecutive requests.
# NOTE(review): presumably matches the page size configured in
# booking_query_payload.json -- verify.
page_size = 25

csv_header = [
    "id",
    "name",
    "city",
    "address",
    "state_id",
    "is_hotel",
    "is_company",
]


def _clean(text):
    """Return *text* with commas replaced by spaces and outer whitespace removed."""
    return text.replace(",", " ").strip()


def hotel_to_row(hotel):
    """Convert one search-result entry into a CSV row.

    Args:
        hotel: A mapping with a ``basicPropertyData`` mapping (must contain
            ``id``; may contain ``location`` with ``city``/``address``) and
            optionally ``displayName`` with ``text``.

    Returns:
        ``[id, name, city, address, "base.it", "1", "1"]``. Optional fields
        that are missing *or explicitly null* become empty strings.

    Raises:
        KeyError: if ``basicPropertyData`` or its ``id`` is missing.
        TypeError, AttributeError: if the entry has an unexpected shape.
    """
    prop = hotel["basicPropertyData"]
    hotel_id = prop["id"]
    # ``or`` fallbacks (rather than .get defaults) also cover keys that are
    # present but null, which would otherwise raise on the next attribute
    # access and cause the whole hotel to be skipped.
    location = prop.get("location") or {}
    name = (hotel.get("displayName") or {}).get("text") or ""
    city = location.get("city") or ""
    address = location.get("address") or ""
    return [
        hotel_id,
        _clean(name),
        _clean(city),
        _clean(address),
        "base.it",
        "1",
        "1",
    ]


def collect_new_rows(hotels, seen_ids):
    """Build CSV rows for the hotels whose id is not yet in *seen_ids*.

    Args:
        hotels: Iterable of search-result entries.
        seen_ids: Set of already exported hotel ids; updated in place.

    Returns:
        A list of rows for the newly seen hotels. Entries that cannot be
        parsed are logged at warning level and skipped.
    """
    rows = []
    for hotel in hotels:
        try:
            hotel_id = hotel["basicPropertyData"]["id"]
            if hotel_id in seen_ids:
                continue
            rows.append(hotel_to_row(hotel))
            seen_ids.add(hotel_id)
        except (KeyError, TypeError, AttributeError) as e:
            logger.warning("Hotel parse error: %s", e)
    return rows


def _fetch_hotels(base_payload, dest_id, offset):
    """Request one page of results for a region.

    Returns:
        The list of result entries (empty when the page has none), or
        ``None`` when the request, the HTTP status, or the response parsing
        failed; failures are logged here.
    """
    # Deep copy so the shared template is never mutated between requests.
    payload = copy.deepcopy(base_payload)
    search_input = payload["variables"]["input"]
    search_input["pagination"]["offset"] = offset
    search_input["location"]["destId"] = dest_id
    search_input["location"]["destType"] = "REGION"

    try:
        response = requests.post(
            url,
            headers=headers,
            cookies=cookies,
            json=payload,
            timeout=10,
        )
    except requests.exceptions.RequestException as e:
        logger.error("Request failed: %s", e)
        return None

    if response.status_code != 200:
        logger.error("HTTP error %s at offset %s", response.status_code, offset)
        return None

    try:
        # ValueError covers an undecodable body; KeyError/TypeError cover a
        # decoded body that lacks the expected nesting.
        results = response.json()["data"]["searchQueries"]["search"]["results"]
    except (ValueError, KeyError, TypeError) as e:
        logger.error("JSON parse error at offset %s: %s", offset, e)
        logger.debug(response.text[:500])
        return None
    return results or []


def _scrape_region(base_payload, region, seen_ids):
    """Page through one region and return the rows for its new hotels.

    Stops at the first failed request or empty page, or once the offset
    reaches ``maximum_hotels``.
    """
    logger.info("Processing %s", region["name"])
    rows = []
    for offset in range(0, maximum_hotels, page_size):
        logger.info("Loading hotels offset = %s", offset)
        hotels = _fetch_hotels(base_payload, region["destId"], offset)
        if hotels is None:
            break
        if not hotels:
            logger.info("No more results.")
            break
        rows.extend(collect_new_rows(hotels, seen_ids))
        # Throttle between page requests.
        time.sleep(1.5)
    return rows


def _write_csv(rows, path="result.csv"):
    """Write the header line followed by *rows* to *path* as UTF-8 CSV."""
    with open(path, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(csv_header)
        writer.writerows(rows)


def main():
    """Scrape every configured region and save the unique hotels to result.csv."""
    logging.basicConfig(level=logging.INFO)

    # The payload template is resolved relative to the working directory.
    with open("booking_query_payload.json", encoding="utf-8") as f:
        base_payload = json.load(f)

    csv_rows = []
    seen_ids = set()  # shared across regions so a hotel is exported once
    for region in regions:
        csv_rows.extend(_scrape_region(base_payload, region, seen_ids))

    _write_csv(csv_rows)
    logger.info("Saved %d unique hotels to 'result.csv'", len(csv_rows))


if __name__ == "__main__":
    main()

parse_hotels_italy/result.csv

Whitespace-only changes.

0 commit comments

Comments
 (0)