3 changes: 3 additions & 0 deletions .gitignore
@@ -268,6 +268,9 @@ UpgradeLog*.htm
ServiceFabricBackup/
*.rptproj.bak

# Backup files
*.bak

# SQL Server files
*.mdf
*.ldf
48 changes: 45 additions & 3 deletions .vscode/launch.json
@@ -3,14 +3,56 @@
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Python Debugger: app-file",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/code/python/app-file.py",
            "console": "integratedTerminal",
            "justMyCode": true,
            "cwd": "${workspaceFolder}/code/python",
            "env": {
                "PYTHONPATH": "${workspaceFolder}/code/python"
            }
        },
        {
            "name": "Python Debugger: db_load with args",
            "type": "debugpy",
            "request": "launch",
            "module": "data_loading.db_load",
            "console": "integratedTerminal",
            "args": "${command:pickArgs}",
            "justMyCode": true,
            "cwd": "${workspaceFolder}/code/python",
            "env": {
                "PYTHONPATH": "${workspaceFolder}/code/python"
            }
        },
        {
            "name": "Python Debugger: current file with args",
            "type": "debugpy",
            "request": "launch",
            "program": "${file}",
            "console": "integratedTerminal",
            "args": "${command:pickArgs}",
            "justMyCode": true,
            "cwd": "${workspaceFolder}/code/python",
            "env": {
                "PYTHONPATH": "${workspaceFolder}/code/python"
            }
        },
        {
            "name": "Python Debugger: Current File",
            "type": "debugpy",
            "request": "launch",
            "program": "${file}",
            "console": "integratedTerminal",
            "justMyCode": true,
            "cwd": "${workspaceFolder}/code/python",
            "env": {
                "PYTHONPATH": "${workspaceFolder}/code/python"
            }
        }
    ]
}
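Both "with args" configurations rely on ${command:pickArgs}, so VS Code prompts for a command line when the debug session starts. A hypothetical prompt entry for the db_load module could be something like --file data/products.xlsx --site example-site, though the actual flags depend on db_load's argument parser, which this diff does not show.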
24 changes: 23 additions & 1 deletion code/python/core/embedding.py
@@ -22,6 +22,7 @@
"openai": threading.Lock(),
"gemini": threading.Lock(),
"azure_openai": threading.Lock(),
"qwen_openai": threading.Lock(),
"snowflake": threading.Lock()
}

@@ -119,7 +120,18 @@ async def get_embedding(
        )
        logger.debug(f"Azure embeddings received, dimension: {len(result)}")
        return result

    if provider == "qwen_openai":
        logger.debug("Getting Qwen OpenAI embeddings")
        # Import here to avoid potential circular imports
        from embedding_providers.qwen_embedding import get_qwen_embedding
        result = await asyncio.wait_for(
            get_qwen_embedding(text, model=model_id),
            timeout=timeout
        )
        logger.debug(f"Qwen OpenAI embeddings received, dimension: {len(result)}")
        return result

    if provider == "snowflake":
        logger.debug("Getting Snowflake embeddings")
        # Import here to avoid potential circular imports
@@ -204,6 +216,16 @@ async def batch_get_embeddings(

    try:
        # Provider-specific batch implementations with timeout handling
        if provider == "qwen_openai":
            logger.debug("Getting Qwen OpenAI batch embeddings")
            from embedding_providers.qwen_embedding import get_qwen_batch_embeddings
            result = await asyncio.wait_for(
                get_qwen_batch_embeddings(texts, model=model_id),
                timeout=timeout
            )
            logger.debug(f"Qwen OpenAI batch embeddings received, count: {len(result)}")
            return result

        if provider == "openai":
            # Use OpenAI's batch embedding API
            logger.debug("Getting OpenAI batch embeddings")
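The diff imports get_qwen_embedding and get_qwen_batch_embeddings from embedding_providers.qwen_embedding, but that module is not part of this excerpt. A minimal sketch of what it might contain, assuming Qwen is reached through an OpenAI-compatible endpoint (the base URL, env var name, and default model below are assumptions, not taken from the PR):

# Hypothetical embedding_providers/qwen_embedding.py -- not shown in this diff
import os
from typing import List, Optional

from openai import AsyncOpenAI

_client: Optional[AsyncOpenAI] = None

def _get_client() -> AsyncOpenAI:
    # Lazily build one shared async client (endpoint and env var are assumptions)
    global _client
    if _client is None:
        _client = AsyncOpenAI(
            api_key=os.environ["DASHSCOPE_API_KEY"],
            base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
        )
    return _client

async def get_qwen_embedding(text: str, model: str = "text-embedding-v3") -> List[float]:
    # Single-text embedding via the OpenAI-compatible embeddings route
    resp = await _get_client().embeddings.create(model=model, input=text)
    return resp.data[0].embedding

async def get_qwen_batch_embeddings(texts: List[str], model: str = "text-embedding-v3") -> List[List[float]]:
    # Batch embedding; sort by index to preserve input order
    resp = await _get_client().embeddings.create(model=model, input=texts)
    return [d.embedding for d in sorted(resp.data, key=lambda d: d.index)]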
4 changes: 4 additions & 0 deletions code/python/core/llm.py
@@ -49,6 +49,7 @@ def init():
"azure_openai": ["openai>=1.12.0"],
"llama_azure": ["openai>=1.12.0"],
"deepseek_azure": ["openai>=1.12.0"],
"qwen_openai": ["openai>=1.12.0"],
"inception": ["aiohttp>=3.9.1"],
"snowflake": ["httpx>=0.28.1"],
"huggingface": ["huggingface_hub>=0.31.0"],
@@ -138,6 +139,9 @@ def _get_provider(llm_type: str):
    elif llm_type == "deepseek_azure":
        from llm_providers.azure_deepseek import provider as deepseek_provider
        _loaded_providers[llm_type] = deepseek_provider
    elif llm_type == "qwen_openai":
        from llm_providers.qwen_openai import provider as qwen_provider
        _loaded_providers[llm_type] = qwen_provider
    elif llm_type == "inception":
        from llm_providers.inception import provider as inception_provider
        _loaded_providers[llm_type] = inception_provider
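The lazily imported llm_providers.qwen_openai module is also absent from this excerpt, and the repo's provider interface is not visible here. Purely as an illustration of the pattern, a guessed shape (the function signature, endpoint, env var, and default model are all assumptions):

# Hypothetical llm_providers/qwen_openai.py -- interface shape is a guess
import os

from openai import AsyncOpenAI

_client = AsyncOpenAI(
    api_key=os.environ.get("DASHSCOPE_API_KEY", ""),  # assumed env var
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",  # assumed endpoint
)

async def provider(prompt: str, model: str = "qwen-plus") -> str:
    # Minimal chat-completion call against the OpenAI-compatible API
    resp = await _client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return resp.choices[0].message.content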
76 changes: 76 additions & 0 deletions code/python/data_loading/db_load.py
@@ -10,6 +10,7 @@
import sys
import json
import csv
import pandas as pd  # used for .xlsx ingestion; reading .xlsx also requires the openpyxl engine
import asyncio
import aiohttp
import tempfile
@@ -287,6 +288,8 @@ async def detect_file_type(file_path: str) -> Tuple[str, bool]:
        return 'json', has_embeddings
    elif ext == '.csv':
        return 'csv', has_embeddings
    elif ext == '.xlsx':
        return 'xlsx', has_embeddings
    elif ext in ['.xml', '.rss', '.atom']:
        # Check if file contains RSS-like elements
        try:
@@ -364,6 +367,76 @@ async def detect_file_type(file_path: str) -> Tuple[str, bool]:

    return 'unknown', has_embeddings

async def process_xlsx_file(file_path: str, site: str) -> List[Dict[str, Any]]:
    """
    Process an Excel (.xlsx) file into a list of document objects.

    Args:
        file_path: Path to the Excel file
        site: Site identifier

    Returns:
        List of document objects
    """
    print(f"Processing XLSX file: {file_path}")
    documents = []
    error_count = 0
    success_count = 0

    try:
        df = pd.read_excel(file_path, dtype=str)  # Read every cell as a string to avoid type-coercion issues
        if df.empty:
            print(f"Warning: XLSX file {file_path} is empty.")
            return documents

        for index, row in df.iterrows():
            try:
                # Map pandas NaN (empty cells) to None so json.dumps emits valid JSON
                row_data = {k: (None if pd.isna(v) else v) for k, v in row.to_dict().items()}

                # Try to extract a URL/ID field
                url = None
                for col in ['url', 'URL', 'link', 'Link', 'id', 'ID', 'identifier']:
                    if col in row_data and row_data[col]:
                        url = str(row_data[col])
                        break
                if not url:
                    url = f"xlsx:{os.path.basename(file_path)}:{index}"

                # Serialize the row to JSON
                json_data = json.dumps(row_data, ensure_ascii=False)

                # Try to extract a name/title field
                name = None
                for col in ['name', 'Name', 'title', 'Title', 'heading', 'Heading']:
                    if col in row_data and row_data[col]:
                        name = str(row_data[col])
                        break
                if not name:
                    name = f"Row {index} from {os.path.basename(file_path)}"

                # Assemble the document object
                document = {
                    "id": str(hash(url) % (2**63)),  # NOTE: built-in hash() is salted per process, so IDs are not stable across runs
                    "schema_json": json_data,
                    "url": url,
                    "name": name,
                    "site": site
                }
                documents.append(document)
                success_count += 1

                if (index + 1) % 1000 == 0:
                    print(f"Processed {index + 1} rows ({success_count} successful, {error_count} errors)")

            except Exception as row_error:
                error_count += 1
                print(f"Error processing row {index}: {str(row_error)}")
                continue

        print(f"XLSX processing complete: {success_count} rows processed successfully, {error_count} rows had errors")
        return documents

    except Exception as e:
        print(f"Fatal error processing XLSX file: {str(e)}")
        return documents
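
A quick local smoke test of the new XLSX path might look like the snippet below (the workbook name is hypothetical; any .xlsx with a url or name column exercises the field heuristics):

import asyncio

docs = asyncio.run(process_xlsx_file("products.xlsx", site="example-site"))
print(f"{len(docs)} documents;", docs[0]["name"] if docs else "(no rows)")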

async def process_csv_file(file_path: str, site: str) -> List[Dict[str, Any]]:
"""
Process a standard CSV file into document objects without using pandas.
@@ -793,6 +866,9 @@ async def loadJsonToDB(file_path: str, site: str, batch_size: int = 100, delete_
    if file_type == 'csv':
        # Process standard CSV file
        all_documents = await process_csv_file(resolved_path, site)
    elif file_type == 'xlsx':
        # Process Excel workbook
        all_documents = await process_xlsx_file(resolved_path, site)
    elif file_type == 'rss' or (file_type == 'xml' and ('/feed' in original_path.lower() or '/rss' in original_path.lower())):
        # Process RSS/Atom feed
        print("Processing as RSS feed...")