3 changes: 3 additions & 0 deletions .gitignore
@@ -268,6 +268,9 @@ UpgradeLog*.htm
ServiceFabricBackup/
*.rptproj.bak

# Backup files
*.bak

# SQL Server files
*.mdf
*.ldf
48 changes: 45 additions & 3 deletions .vscode/launch.json
@@ -3,14 +3,56 @@
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Python Debugger: app-file",
            "type": "debugpy",
            "request": "launch",
            "program": "${workspaceFolder}/code/python/app-file.py",
            "console": "integratedTerminal",
            "justMyCode": true,
            "cwd": "${workspaceFolder}/code/python",
            "env": {
                "PYTHONPATH": "${workspaceFolder}/code/python"
            }
        },
        {
            "name": "Python Debugger: db_load with args",
            "type": "debugpy",
            "request": "launch",
            "module": "data_loading.db_load",
            "console": "integratedTerminal",
            "args": "${command:pickArgs}",
            "justMyCode": true,
            "cwd": "${workspaceFolder}/code/python",
            "env": {
                "PYTHONPATH": "${workspaceFolder}/code/python"
            }
        },
        {
            "name": "Python Debugger: current file with args",
            "type": "debugpy",
            "request": "launch",
            "program": "${file}",
            "console": "integratedTerminal",
            "args": "${command:pickArgs}",
            "justMyCode": true,
            "cwd": "${workspaceFolder}/code/python",
            "env": {
                "PYTHONPATH": "${workspaceFolder}/code/python"
            }
        },
        {
            "name": "Python Debugger: Current File",
            "type": "debugpy",
            "request": "launch",
            "program": "${file}",
            "console": "integratedTerminal",
            "justMyCode": true,
            "cwd": "${workspaceFolder}/code/python",
            "env": {
                "PYTHONPATH": "${workspaceFolder}/code/python"
            }
        }
    ]
}
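Both "with args" configurations rely on ${command:pickArgs}, so VS Code prompts for a command line when the debug session starts. A hypothetical prompt entry for the db_load module could be something like --file data/products.xlsx --site example-site, though the actual flags depend on db_load's argument parser, which this diff does not show.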
24 changes: 23 additions & 1 deletion code/python/core/embedding.py
@@ -22,6 +22,7 @@
"openai": threading.Lock(),
"gemini": threading.Lock(),
"azure_openai": threading.Lock(),
"qwen_openai": threading.Lock(),
"snowflake": threading.Lock()
}

@@ -119,7 +120,18 @@ async def get_embedding(
        )
        logger.debug(f"Azure embeddings received, dimension: {len(result)}")
        return result

    if provider == "qwen_openai":
        logger.debug("Getting Qwen OpenAI embeddings")
        # Import here to avoid potential circular imports
        from embedding_providers.qwen_embedding import get_qwen_embedding
        result = await asyncio.wait_for(
            get_qwen_embedding(text, model=model_id),
            timeout=timeout
        )
        logger.debug(f"Qwen OpenAI embeddings received, dimension: {len(result)}")
        return result

    if provider == "snowflake":
        logger.debug("Getting Snowflake embeddings")
        # Import here to avoid potential circular imports
@@ -204,6 +216,16 @@ async def batch_get_embeddings(

    try:
        # Provider-specific batch implementations with timeout handling
        if provider == "qwen_openai":
            logger.debug("Getting Qwen OpenAI batch embeddings")
            from embedding_providers.qwen_embedding import get_qwen_batch_embeddings
            result = await asyncio.wait_for(
                get_qwen_batch_embeddings(texts, model=model_id),
                timeout=timeout
            )
            logger.debug(f"Qwen OpenAI batch embeddings received, count: {len(result)}")
            return result

        if provider == "openai":
            # Use OpenAI's batch embedding API
            logger.debug("Getting OpenAI batch embeddings")
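The diff imports get_qwen_embedding and get_qwen_batch_embeddings from embedding_providers.qwen_embedding, but that module is not part of this excerpt. A minimal sketch of what it might contain, assuming Qwen is reached through an OpenAI-compatible endpoint (the base URL, env var name, and default model below are assumptions, not taken from the PR):

# Hypothetical embedding_providers/qwen_embedding.py -- not shown in this diff
import os
from typing import List, Optional

from openai import AsyncOpenAI

_client: Optional[AsyncOpenAI] = None

def _get_client() -> AsyncOpenAI:
    # Lazily build one shared async client (endpoint and env var are assumptions)
    global _client
    if _client is None:
        _client = AsyncOpenAI(
            api_key=os.environ["DASHSCOPE_API_KEY"],
            base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
        )
    return _client

async def get_qwen_embedding(text: str, model: str = "text-embedding-v3") -> List[float]:
    # Single-text embedding via the OpenAI-compatible embeddings route
    resp = await _get_client().embeddings.create(model=model, input=text)
    return resp.data[0].embedding

async def get_qwen_batch_embeddings(texts: List[str], model: str = "text-embedding-v3") -> List[List[float]]:
    # Batch embedding; sort by index to preserve input order
    resp = await _get_client().embeddings.create(model=model, input=texts)
    return [d.embedding for d in sorted(resp.data, key=lambda d: d.index)]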
4 changes: 4 additions & 0 deletions code/python/core/llm.py
@@ -49,6 +49,7 @@ def init():
"azure_openai": ["openai>=1.12.0"],
"llama_azure": ["openai>=1.12.0"],
"deepseek_azure": ["openai>=1.12.0"],
"qwen_openai": ["openai>=1.12.0"],
"inception": ["aiohttp>=3.9.1"],
"snowflake": ["httpx>=0.28.1"],
"huggingface": ["huggingface_hub>=0.31.0"],
@@ -138,6 +139,9 @@ def _get_provider(llm_type: str):
    elif llm_type == "deepseek_azure":
        from llm_providers.azure_deepseek import provider as deepseek_provider
        _loaded_providers[llm_type] = deepseek_provider
    elif llm_type == "qwen_openai":
        from llm_providers.qwen_openai import provider as qwen_provider
        _loaded_providers[llm_type] = qwen_provider
    elif llm_type == "inception":
        from llm_providers.inception import provider as inception_provider
        _loaded_providers[llm_type] = inception_provider
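The lazily imported llm_providers.qwen_openai module is also absent from this excerpt, and the repo's provider interface is not visible here. Purely as an illustration of the pattern, a guessed shape (the function signature, endpoint, env var, and default model are all assumptions):

# Hypothetical llm_providers/qwen_openai.py -- interface shape is a guess
import os

from openai import AsyncOpenAI

_client = AsyncOpenAI(
    api_key=os.environ.get("DASHSCOPE_API_KEY", ""),  # assumed env var
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",  # assumed endpoint
)

async def provider(prompt: str, model: str = "qwen-plus") -> str:
    # Minimal chat-completion call against the OpenAI-compatible API
    resp = await _client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return resp.choices[0].message.content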
76 changes: 76 additions & 0 deletions code/python/data_loading/db_load.py
@@ -10,6 +10,7 @@
import sys
import json
import csv
import pandas as pd  # used for .xlsx ingestion; reading .xlsx also requires the openpyxl engine
import asyncio
import aiohttp
import tempfile
@@ -287,6 +288,8 @@ async def detect_file_type(file_path: str) -> Tuple[str, bool]:
        return 'json', has_embeddings
    elif ext == '.csv':
        return 'csv', has_embeddings
    elif ext == '.xlsx':
        return 'xlsx', has_embeddings
    elif ext in ['.xml', '.rss', '.atom']:
        # Check if file contains RSS-like elements
        try:
@@ -364,6 +367,76 @@ async def detect_file_type(file_path: str) -> Tuple[str, bool]:

    return 'unknown', has_embeddings

async def process_xlsx_file(file_path: str, site: str) -> List[Dict[str, Any]]:
    """
    Process an Excel (.xlsx) file into a list of document objects.

    Args:
        file_path: Path to the Excel file
        site: Site identifier

    Returns:
        List of document objects
    """
    print(f"Processing XLSX file: {file_path}")
    documents = []
    error_count = 0
    success_count = 0

    try:
        df = pd.read_excel(file_path, dtype=str)  # Read every cell as a string to avoid type-coercion issues
        if df.empty:
            print(f"Warning: XLSX file {file_path} is empty.")
            return documents

        for index, row in df.iterrows():
            try:
                # Map pandas NaN (empty cells) to None so json.dumps emits valid JSON
                row_data = {k: (None if pd.isna(v) else v) for k, v in row.to_dict().items()}

                # Try to extract a URL/ID field
                url = None
                for col in ['url', 'URL', 'link', 'Link', 'id', 'ID', 'identifier']:
                    if col in row_data and row_data[col]:
                        url = str(row_data[col])
                        break
                if not url:
                    url = f"xlsx:{os.path.basename(file_path)}:{index}"

                # Serialize the row to JSON
                json_data = json.dumps(row_data, ensure_ascii=False)

                # Try to extract a name/title field
                name = None
                for col in ['name', 'Name', 'title', 'Title', 'heading', 'Heading']:
                    if col in row_data and row_data[col]:
                        name = str(row_data[col])
                        break
                if not name:
                    name = f"Row {index} from {os.path.basename(file_path)}"

                # Assemble the document object
                document = {
                    "id": str(hash(url) % (2**63)),  # NOTE: built-in hash() is salted per process, so IDs are not stable across runs
                    "schema_json": json_data,
                    "url": url,
                    "name": name,
                    "site": site
                }
                documents.append(document)
                success_count += 1

                if (index + 1) % 1000 == 0:
                    print(f"Processed {index + 1} rows ({success_count} successful, {error_count} errors)")

            except Exception as row_error:
                error_count += 1
                print(f"Error processing row {index}: {str(row_error)}")
                continue

        print(f"XLSX processing complete: {success_count} rows processed successfully, {error_count} rows had errors")
        return documents

    except Exception as e:
        print(f"Fatal error processing XLSX file: {str(e)}")
        return documents
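
A quick local smoke test of the new XLSX path might look like the snippet below (the workbook name is hypothetical; any .xlsx with a url or name column exercises the field heuristics):

import asyncio

docs = asyncio.run(process_xlsx_file("products.xlsx", site="example-site"))
print(f"{len(docs)} documents;", docs[0]["name"] if docs else "(no rows)")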

async def process_csv_file(file_path: str, site: str) -> List[Dict[str, Any]]:
"""
Process a standard CSV file into document objects without using pandas.
@@ -793,6 +866,9 @@ async def loadJsonToDB(file_path: str, site: str, batch_size: int = 100, delete_
    if file_type == 'csv':
        # Process standard CSV file
        all_documents = await process_csv_file(resolved_path, site)
    elif file_type == 'xlsx':
        # Process Excel workbook
        all_documents = await process_xlsx_file(resolved_path, site)
    elif file_type == 'rss' or (file_type == 'xml' and ('/feed' in original_path.lower() or '/rss' in original_path.lower())):
        # Process RSS/Atom feed
        print("Processing as RSS feed...")