Commit 282e0db

Merge branch 'main' into feat/infer_schema_for_csv_ndjson
2 parents: 1eb9928 + 613a23a

12 files changed: +181 −92 lines changed


.github/actions/build_bindings_python/action.yml

Lines changed: 5 additions & 31 deletions
@@ -18,7 +18,7 @@ runs:
       raw_version="${{ inputs.version }}"
       # Remove v prefix and suffixes: v1.2.809-nightly -> 1.2.809
       VERSION=$(echo "$raw_version" | sed 's/^v//' | sed 's/-nightly//' | sed 's/-p[0-9]*//')
-
+
       echo "building version: $raw_version -> $VERSION"
       # Replace any existing version in Cargo.toml and pyproject.toml
       sed "s/^version = \".*\"/version = \"$VERSION\"/" Cargo.toml > Cargo.toml.tmp
@@ -80,9 +80,8 @@ runs:
       # Install cargo-zigbuild manually to avoid musl target issues
       cargo install cargo-zigbuild --target ${{ inputs.target }}
       ../../scripts/setup/dev_setup.sh -yb
-      # Clean up any existing virtual environment
-      rm -rf .venv || true
-      uv venv --python=python3.12
+      # Create virtual environment for build dependencies (overwrite if exists)
+      uv venv .venv --python=python3.12 --clear
       uv sync --all-groups --all-extras

   - name: Setup Rust for development builds
@@ -99,11 +98,8 @@
     run: |
       echo "Building development wheel for testing..."

-      # Clean up any existing virtual environment
-      rm -rf .venv || true
-
-      # Create and activate virtual environment
-      uv venv --python python3.12
+      # Create virtual environment (overwrite if exists)
+      uv venv .venv --python python3.12 --clear
       source .venv/bin/activate

       # Install development dependencies
@@ -121,25 +117,3 @@
       python -m pytest tests/ -v --tb=short

       echo "All Python binding tests passed!"
-
-  - name: Test built wheels
-    if: inputs.version != ''
-    shell: bash
-    working-directory: src/bendpy
-    run: |
-      echo "Testing built wheels..."
-
-      # Clean up any existing virtual environment
-      rm -rf .venv || true
-
-      # Install from built wheel for release testing
-      uv venv --python python3.12
-      source .venv/bin/activate
-      uv pip install dist/*.whl
-      uv pip install pytest pandas polars pyarrow
-
-      # Run pytest tests
-      echo "Executing pytest tests..."
-      python -m pytest tests/ -v --tb=short
-
-      echo "All Python binding tests passed!"

.github/workflows/bindings.python.yml

Lines changed: 29 additions & 8 deletions
@@ -44,12 +44,12 @@ jobs:
           echo "Error: No version provided for workflow_call"
           exit 1
         fi
-
+
         # Validate version format
         if [[ ! $VERSION =~ ^v[0-9]+\.[0-9]+\.[0-9]+.*$ ]]; then
           echo "Warning: Version $VERSION may not follow semantic versioning (vX.Y.Z)"
         fi
-
+
         echo "Final version: $VERSION"
         echo "version=$VERSION" >> $GITHUB_OUTPUT

@@ -104,29 +104,44 @@
       retention-days: 7

   macos:
-    name: Build macOS Universal Wheel
+    name: Build macOS Wheels
     if: (github.event_name == 'workflow_call' && inputs.version != '') || github.event_name == 'workflow_dispatch'
     needs: [get-version, test]
     runs-on: macos-latest
+    continue-on-error: true
+    strategy:
+      matrix:
+        target: [x86_64-apple-darwin, aarch64-apple-darwin]
     steps:
       - uses: actions/checkout@v4
         with:
           fetch-depth: 0
+
+      - name: Install dependencies
+        run: |
+          # Install OpenSSL and necessary tools
+          brew install openssl@3
+
+          # Use vendored OpenSSL to avoid cross-compilation issues
+          echo "OPENSSL_STATIC=1" >> $GITHUB_ENV
+          echo "OPENSSL_VENDORED=1" >> $GITHUB_ENV
+          echo "PKG_CONFIG_ALLOW_CROSS=1" >> $GITHUB_ENV
+
       - uses: ./.github/actions/build_bindings_python
         with:
-          target: universal2-apple-darwin
+          target: ${{ matrix.target }}
           version: ${{ needs.get-version.outputs.version }}
       - name: Upload macOS wheel
         uses: actions/upload-artifact@v4
         with:
-          name: python-macos-universal
+          name: python-macos-${{ matrix.target }}
           path: src/bendpy/dist/*.whl
           retention-days: 7

   publish:
     name: Publish to PyPI
     if: (github.event_name == 'workflow_call' && inputs.version != '') || github.event_name == 'workflow_dispatch'
-    needs: [get-version, test, linux, macos]
+    needs: [get-version, test, linux]
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
@@ -135,20 +150,26 @@
           pattern: python-*
           merge-multiple: true
           path: src/bendpy/dist
+        continue-on-error: true

       - name: Show packages to publish
         run: |
           echo "Publishing packages for version: ${{ needs.get-version.outputs.version }}"
           echo "Packages found:"
-          ls -la src/bendpy/dist/
+          ls -la src/bendpy/dist/ || echo "No packages found"
           echo "Total packages: $(ls src/bendpy/dist/*.whl 2>/dev/null | wc -l)"

       - name: Publish to PyPI
         timeout-minutes: 10
         run: |
           pip install twine
           echo "Publishing to PyPI..."
-          twine upload --skip-existing --verbose src/bendpy/dist/*.whl
+          if [ -n "$(find src/bendpy/dist -name "*.whl" 2>/dev/null)" ]; then
+            twine upload --skip-existing --verbose src/bendpy/dist/*.whl
+          else
+            echo "No wheel files found to publish"
+            exit 1
+          fi
         env:
           TWINE_USERNAME: __token__
           TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}

src/bendpy/README.md

Lines changed: 64 additions & 15 deletions
@@ -1,42 +1,91 @@
 # Databend Python Binding

-Official Python binding for [Databend](https://databend.com) - The AI-Native Data Warehouse.
+Official Python binding for [Databend](https://databend.com) - The multi-modal data warehouse built for the AI era.

-Databend is the open-source alternative to Snowflake with near 100% SQL compatibility and native AI capabilities. Built in Rust with MPP architecture and S3-native storage, Databend unifies structured tables, JSON documents, and vector embeddings in a single platform.
+Databend unifies structured data, JSON documents, and vector embeddings in a single platform with near 100% Snowflake compatibility. Built in Rust with MPP architecture and S3-native storage for cloud-scale analytics.

 ## Installation

 ```bash
 pip install databend
 ```

+To test, run:
+```python
+python3 -c "import databend; ctx = databend.SessionContext(); ctx.sql('SELECT version() AS version').show()"
+```
+
+## API Reference
+
+### Core Operations
+| Method | Description |
+|--------|-------------|
+| `SessionContext()` | Create a new session context |
+| `sql(query)` | Execute SQL query, returns DataFrame |
+
+### File Registration
+| Method | Description |
+|--------|-------------|
+| `register_parquet(name, path, pattern=None, connection=None)` | Register Parquet files as table |
+| `register_csv(name, path, pattern=None, connection=None)` | Register CSV files as table |
+| `register_ndjson(name, path, pattern=None, connection=None)` | Register NDJSON files as table |
+| `register_tsv(name, path, pattern=None, connection=None)` | Register TSV files as table |
+
+### Cloud Storage Connections
+| Method | Description |
+|--------|-------------|
+| `create_s3_connection(name, key, secret, endpoint=None, region=None)` | Create S3 connection |
+| `create_azblob_connection(name, url, account, key)` | Create Azure Blob connection |
+| `create_gcs_connection(name, url, credential)` | Create Google Cloud connection |
+| `list_connections()` | List all connections |
+| `describe_connection(name)` | Show connection details |
+| `drop_connection(name)` | Remove connection |
+
+### Stage Management
+| Method | Description |
+|--------|-------------|
+| `create_stage(name, url, connection)` | Create external stage |
+| `show_stages()` | List all stages |
+| `list_stages(stage_name)` | List files in stage |
+| `describe_stage(name)` | Show stage details |
+| `drop_stage(name)` | Remove stage |
+
+### DataFrame Operations
+| Method | Description |
+|--------|-------------|
+| `collect()` | Execute and collect results |
+| `show(num=20)` | Display results in console |
+| `to_pandas()` | Convert to pandas DataFrame |
+| `to_polars()` | Convert to polars DataFrame |
+| `to_arrow_table()` | Convert to PyArrow Table |
+
 ## Examples

-### Local Files
+### Local Tables

 ```python
 import databend
 ctx = databend.SessionContext()

-# Query local Parquet files
-ctx.register_parquet("orders", "/path/to/orders/")
-ctx.register_parquet("customers", "/path/to/customers/")
-df = ctx.sql("SELECT * FROM orders JOIN customers ON orders.customer_id = customers.id").to_pandas()
+# Create and query in-memory tables
+ctx.sql("CREATE TABLE users (id INT, name STRING, age INT)").collect()
+ctx.sql("INSERT INTO users VALUES (1, 'Alice', 25), (2, 'Bob', 30)").collect()
+df = ctx.sql("SELECT * FROM users WHERE age > 25").to_pandas()
 ```

-### Local Tables
+### Working with Local Files

 ```python
 import databend
 ctx = databend.SessionContext()

-# Create and query local tables
-ctx.sql("CREATE TABLE users (id INT, name STRING, age INT)").collect()
-ctx.sql("INSERT INTO users VALUES (1, 'Alice', 25), (2, 'Bob', 30)").collect()
-df = ctx.sql("SELECT * FROM users WHERE age > 25").to_pandas()
+# Query local Parquet files
+ctx.register_parquet("orders", "/path/to/orders/")
+ctx.register_parquet("customers", "/path/to/customers/")
+df = ctx.sql("SELECT * FROM orders JOIN customers ON orders.customer_id = customers.id").to_pandas()
 ```

-### S3 Remote Files
+### Cloud Storage - S3 Files

 ```python
 import databend
@@ -49,14 +98,14 @@ ctx.register_parquet("trips", "s3://bucket/trips/", connection="s3")
 df = ctx.sql("SELECT COUNT(*) FROM trips").to_pandas()
 ```

-### Remote Tables
+### Cloud Storage - S3 Tables

 ```python
 import databend
 import os
 ctx = databend.SessionContext()

-# Create S3 connection and table
+# Create S3 connection and persistent table
 ctx.create_s3_connection("s3", os.getenv("AWS_ACCESS_KEY_ID"), os.getenv("AWS_SECRET_ACCESS_KEY"))
 ctx.sql("CREATE TABLE s3_table (id INT, name STRING) 's3://bucket/table/' CONNECTION=(CONNECTION_NAME='s3')").collect()
 df = ctx.sql("SELECT * FROM s3_table").to_pandas()
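The new API Reference tables list several calls that the README's examples don't exercise, including the CSV and NDJSON registration that this merge's feature branch (feat/infer_schema_for_csv_ndjson) targets. Below is a minimal sketch of those calls, assuming only the signatures shown in the tables; all paths and credential values are placeholders.

```python
import databend

ctx = databend.SessionContext()

# Register delimited files as tables (paths are placeholders); per the
# branch name, the schema for CSV and NDJSON sources should be inferred.
ctx.register_csv("events", "/data/events/")
ctx.register_ndjson("logs", "/data/logs/")

df = ctx.sql("SELECT COUNT(*) AS n FROM events")
df.show()                    # prints up to 20 rows by default
table = df.to_arrow_table()  # or df.to_pandas() / df.to_polars()

# Connection and stage lifecycle, following the tables above.
ctx.create_s3_connection("s3", "<access-key>", "<secret-key>")
ctx.list_connections()
ctx.describe_connection("s3")

ctx.create_stage("raw", "s3://bucket/raw/", "s3")
ctx.show_stages()
ctx.drop_stage("raw")
ctx.drop_connection("s3")
```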

src/bendpy/pyproject.toml

Lines changed: 3 additions & 2 deletions
@@ -7,7 +7,8 @@ build-backend = "maturin"
 version = "0.1.0"
 name = "databend"
 requires-python = ">=3.10"
-description = "Official Python binding for Databend - The AI-Native Data Warehouse. Open-source alternative to Snowflake with near 100% SQL compatibility, native AI capabilities, and unified support for structured, semi-structured, and vector data."
+description = "The multi-modal data warehouse built for the AI era. Unified analytics for structured data, JSON, and vector embeddings with near 100% Snowflake compatibility."
+readme = "README.md"
 license = {text = "Apache-2.0"}
 authors = [{name = "Databend Labs", email = "[email protected]"}]
 maintainers = [{name = "Databend Community", email = "[email protected]"}]
@@ -38,7 +39,7 @@ test = ["pytest", "pandas", "polars", "pyarrow"]
 [project.urls]
 Documentation = "https://pypi.org/project/databend/"
 Homepage = "https://databend.com"
-Repository = "https://github.com/databendlabs/databend"
+Repository = "https://github.com/databendlabs/databend/tree/main/src/bendpy"

 [dependency-groups]
 dev = ["maturin>=1.8.2"]

src/bendpy/src/lib.rs

Lines changed: 1 addition & 0 deletions
@@ -89,6 +89,7 @@ fn create_embedded_config(

     // Query configuration
     conf.query.tenant_id = Tenant::new_literal("python_binding");
+    conf.query.embedded_mode = true;
     conf.query.cluster_id = "".to_string();
     conf.query.warehouse_id = "".to_string();
     conf.query.node_id = "embedded_node".to_string();
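With this change the binding opts into embedded mode explicitly instead of relying on empty cluster_id and warehouse_id alone (the global.rs hunk below now checks all three conditions). As a quick sanity check from the Python side, here is a sketch that reads the flag back from the system.configs table; it assumes the embedded session exposes that table, whose new 'embedded_mode' row appears in the testdata change at the end of this commit.

```python
import databend

ctx = databend.SessionContext()

# create_embedded_config() sets embedded_mode = true and leaves cluster_id
# and warehouse_id empty: the three conditions is_embedded_mode() requires.
ctx.sql(
    "SELECT name, value FROM system.configs WHERE name = 'embedded_mode'"
).show()
```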

src/query/config/src/config.rs

Lines changed: 4 additions & 0 deletions
@@ -1791,6 +1791,8 @@ pub struct QueryConfig {
     /// If in management mode, only can do some meta level operations(database/table/user/stage etc.) with metasrv.
     #[clap(long)]
     pub management_mode: bool,
+    #[clap(long)]
+    pub embedded_mode: bool,

     /// Deprecated: jwt_key_file is deprecated, use jwt_key_files to add a list of available jwks url
     #[clap(long, value_name = "VALUE", default_value_t)]
@@ -2017,6 +2019,7 @@ impl TryInto<InnerQueryConfig> for QueryConfig {
             max_query_log_size: self.max_query_log_size,
             databend_enterprise_license: self.databend_enterprise_license,
             management_mode: self.management_mode,
+            embedded_mode: self.embedded_mode,
             parquet_fast_read_bytes: self.parquet_fast_read_bytes,
             max_storage_io_requests: self.max_storage_io_requests,
             jwt_key_file: self.jwt_key_file,
@@ -2159,6 +2162,7 @@ impl From<InnerQueryConfig> for QueryConfig {
             settings: HashMap::new(),
             resources_management: None,
             enable_queries_executor: inner.enable_queries_executor,
+            embedded_mode: inner.embedded_mode,
         }
     }
 }

src/query/config/src/global.rs

Lines changed: 3 additions & 1 deletion
@@ -48,6 +48,8 @@ impl GlobalConfig {
     /// Embedded mode is determined by empty cluster_id and warehouse_id
     pub fn is_embedded_mode() -> bool {
         let config = Self::instance();
-        config.query.cluster_id.is_empty() && config.query.warehouse_id.is_empty()
+        config.query.embedded_mode
+            && config.query.cluster_id.is_empty()
+            && config.query.warehouse_id.is_empty()
     }
 }

src/query/config/src/inner.rs

Lines changed: 2 additions & 0 deletions
@@ -209,6 +209,7 @@ pub struct QueryConfig {
     pub databend_enterprise_license: Option<String>,
     /// If in management mode, only can do some meta level operations(database/table/user/stage etc.) with metasrv.
     pub management_mode: bool,
+    pub embedded_mode: bool,

     pub parquet_fast_read_bytes: Option<u64>,
     pub max_storage_io_requests: Option<u64>,
@@ -302,6 +303,7 @@ impl Default for QueryConfig {
             max_query_log_size: 10_000,
             databend_enterprise_license: None,
             management_mode: false,
+            embedded_mode: false,
             parquet_fast_read_bytes: None,
             max_storage_io_requests: None,
             jwt_key_file: "".to_string(),

src/query/service/tests/it/storages/testdata/configs_table_basic.txt

Lines changed: 1 addition & 0 deletions
@@ -163,6 +163,7 @@ DB.Table: 'system'.'configs', Table: configs-table_id:1, ver:0, Engine: SystemConfigs
 | 'query' | 'default_storage_format' | 'auto' | '' |
 | 'query' | 'disable_system_table_load' | 'false' | '' |
 | 'query' | 'discovery_address' | '' | '' |
+| 'query' | 'embedded_mode' | 'false' | '' |
 | 'query' | 'enable_meta_data_upgrade_json_to_pb_from_v307' | 'false' | '' |
 | 'query' | 'enable_queries_executor' | 'false' | '' |
 | 'query' | 'enable_udf_js_script' | 'true' | '' |
