Skip to content

Commit dca2bf5

Browse files
committed
supports vlm-http-client
1 parent 7ec587f commit dca2bf5

File tree

3 files changed

+91
-48
lines changed

3 files changed

+91
-48
lines changed

deepdoc/parser/mineru_parser.py

Lines changed: 76 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -15,24 +15,24 @@
1515
#
1616
import json
1717
import logging
18+
import os
1819
import platform
1920
import re
2021
import subprocess
2122
import sys
2223
import tempfile
2324
import threading
2425
import time
26+
import zipfile
2527
from io import BytesIO
2628
from os import PathLike
2729
from pathlib import Path
2830
from queue import Empty, Queue
2931
from typing import Any, Callable, Optional
30-
import requests
31-
import os
32-
import zipfile
3332

3433
import numpy as np
3534
import pdfplumber
35+
import requests
3636
from PIL import Image
3737
from strenum import StrEnum
3838

@@ -54,43 +54,44 @@ class MinerUContentType(StrEnum):
5454

5555

5656
class MinerUParser(RAGFlowPdfParser):
57-
def __init__(self, mineru_path: str = "mineru", mineru_api: str = "http://host.docker.internal:9987", mineru_server_url: str = ""):
    """Store the MinerU locations this parser can use.

    Args:
        mineru_path: Name or path of the MinerU executable; kept as a ``Path``.
        mineru_api: Base URL of the MinerU API server; trailing slashes are removed.
        mineru_server_url: Base URL handed to the ``vlm-http-client`` backend;
            trailing slashes are removed. Empty string means "not configured".
    """
    # Logger is named after the concrete class so subclasses get their own name.
    self.logger = logging.getLogger(type(self).__name__)

    # Starts False; check_installation() assigns it after probing the API server.
    self.using_api = False

    # Strip trailing slashes so later "<base>/openapi.json" style joins stay clean.
    api_base = mineru_api.rstrip("/")
    server_base = mineru_server_url.rstrip("/")

    self.mineru_path = Path(mineru_path)
    self.mineru_api = api_base
    self.mineru_server_url = server_base
6263

6364
def _extract_zip_no_root(self, zip_path, extract_to, root_dir):
    """Extract a zip archive into ``extract_to`` without its top-level folder.

    Args:
        zip_path: Path of the zip archive to read.
        extract_to: Destination directory.
        root_dir: Archive-internal folder prefix (must end with ``/``) to strip
            from every member name. When falsy, the first archive entry is
            used as the root if it is a directory entry.

    Behavior:
        * If no usable root folder is known, the archive is extracted as-is
          with ``ZipFile.extractall``.
        * Otherwise each member is written with the root prefix removed.
          Members that would land outside ``extract_to`` (``..`` segments or
          absolute names) are skipped with a warning instead of being written.
    """
    # Local import: the file-level import list is not guaranteed to provide shutil.
    import shutil

    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        if not root_dir:
            names = zip_ref.namelist()
            # Only a leading directory entry counts as an archive root.
            root_dir = names[0] if names and names[0].endswith("/") else None

        if not root_dir or not root_dir.endswith("/"):
            self.logger.info("[MinerU] No root directory found, extracting all...")
            zip_ref.extractall(extract_to)
            return

        base_dir = os.path.realpath(extract_to)
        root_len = len(root_dir)
        for member in zip_ref.infolist():
            filename = member.filename
            if filename == root_dir:
                self.logger.info("[MinerU] Ignore root folder...")
                continue

            rel_path = filename[root_len:] if filename.startswith(root_dir) else filename
            full_path = os.path.join(extract_to, rel_path)

            # Zip-slip guard: the manual join above does none of the name
            # sanitising that extractall() performs, so verify containment.
            try:
                inside = os.path.commonpath([base_dir, os.path.realpath(full_path)]) == base_dir
            except ValueError:
                # e.g. paths on different drives on Windows
                inside = False
            if not inside:
                self.logger.warning(f"[MinerU] Skip unsafe zip entry: {filename}")
                continue

            if member.is_dir():
                os.makedirs(full_path, exist_ok=True)
            else:
                os.makedirs(os.path.dirname(full_path), exist_ok=True)
                # Stream the member to disk instead of loading it fully into memory.
                with zip_ref.open(member) as src, open(full_path, "wb") as dst:
                    shutil.copyfileobj(src, dst)
9596

9697
def _is_http_endpoint_valid(self, url, timeout=5):
@@ -100,7 +101,15 @@ def _is_http_endpoint_valid(self, url, timeout=5):
100101
except Exception:
101102
return False
102103

103-
def check_installation(self) -> bool:
104+
def check_installation(self, backend: str = "pipeline", server_url: Optional[str] = None) -> tuple[bool, str]:
105+
reason = ""
106+
107+
valid_backends = ["pipeline", "vlm-http-client", "vlm-transformers", "vlm-vllm-engine"]
108+
if backend not in valid_backends:
109+
reason = f"[MinerU] Invalid backend '{backend}'. Valid backends are: {valid_backends}"
110+
logging.warning(reason)
111+
return False, reason
112+
104113
subprocess_kwargs = {
105114
"capture_output": True,
106115
"text": True,
@@ -112,43 +121,76 @@ def check_installation(self) -> bool:
112121
if platform.system() == "Windows":
113122
subprocess_kwargs["creationflags"] = getattr(subprocess, "CREATE_NO_WINDOW", 0)
114123

124+
if server_url is None:
125+
server_url = self.mineru_server_url
126+
127+
if backend == "vlm-http-client" and server_url:
128+
try:
129+
server_accessible = self._is_http_endpoint_valid(server_url + "/openapi.json")
130+
logging.info(f"[MinerU] vlm-http-client server check: {server_accessible}")
131+
if server_accessible:
132+
self.using_api = False # We are using http client, not API
133+
return True, reason
134+
else:
135+
reason = f"[MinerU] vlm-http-client server not accessible: {server_url}"
136+
logging.warning(f"[MinerU] vlm-http-client server not accessible: {server_url}")
137+
return False, reason
138+
except Exception as e:
139+
logging.warning(f"[MinerU] vlm-http-client server check failed: {e}")
140+
try:
141+
response = requests.get(server_url, timeout=5)
142+
logging.info(f"[MinerU] vlm-http-client server connection check: success with status {response.status_code}")
143+
self.using_api = False
144+
return True, reason
145+
except Exception as e:
146+
reason = f"[MinerU] vlm-http-client server connection check failed: {server_url}: {e}"
147+
logging.warning(f"[MinerU] vlm-http-client server connection check failed: {server_url}: {e}")
148+
return False, reason
149+
115150
try:
116151
result = subprocess.run([str(self.mineru_path), "--version"], **subprocess_kwargs)
117152
version_info = result.stdout.strip()
118153
if version_info:
119154
logging.info(f"[MinerU] Detected version: {version_info}")
120155
else:
121156
logging.info("[MinerU] Detected MinerU, but version info is empty.")
122-
return True
157+
return True, reason
123158
except subprocess.CalledProcessError as e:
124159
logging.warning(f"[MinerU] Execution failed (exit code {e.returncode}).")
125160
except FileNotFoundError:
126161
logging.warning("[MinerU] MinerU not found. Please install it via: pip install -U 'mineru[core]'")
127162
except Exception as e:
128163
logging.error(f"[MinerU] Unexpected error during installation check: {e}")
129164

165+
# If executable check fails, try API check
130166
try:
131167
if self.mineru_api:
132168
# check openapi.json
133169
openapi_exists = self._is_http_endpoint_valid(self.mineru_api + "/openapi.json")
170+
if not openapi_exists:
171+
reason = "[MinerU] Failed to detect valid MinerU API server"
172+
return openapi_exists, reason
134173
logging.info(f"[MinerU] Detected {self.mineru_api}/openapi.json: {openapi_exists}")
135174
self.using_api = openapi_exists
136-
return openapi_exists
175+
return openapi_exists, reason
137176
else:
138177
logging.info("[MinerU] api not exists.")
139178
except Exception as e:
179+
reason = f"[MinerU] Unexpected error during api check: {e}"
140180
logging.error(f"[MinerU] Unexpected error during api check: {e}")
141-
return False
181+
return False, reason
142182

143-
def _run_mineru(
    self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None
):
    """Dispatch one MinerU run to the API path or the local executable path.

    The choice is driven solely by ``self.using_api``.

    Args:
        input_path: Document to parse.
        output_dir: Directory that receives MinerU's output.
        method: Forwarded unchanged to the selected runner.
        backend: Forwarded unchanged to the selected runner.
        lang: Optional language hint, forwarded unchanged.
        server_url: Forwarded only to the executable runner; the API runner
            does not receive it.
        callback: Optional progress callback, forwarded unchanged.
    """
    # Guard clause: the executable path is the only one that takes server_url.
    if not self.using_api:
        self._run_mineru_executable(input_path, output_dir, method, backend, lang, server_url, callback)
        return

    self._run_mineru_api(input_path, output_dir, method, backend, lang, callback)
190+
149191
def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None):
150192
OUTPUT_ZIP_PATH = os.path.join(str(output_dir), "output.zip")
151-
193+
152194
pdf_file_path = str(input_path)
153195

154196
if not os.path.exists(pdf_file_path):
@@ -158,9 +200,7 @@ def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "aut
158200
output_path = os.path.join(str(output_dir), pdf_file_name, method)
159201
os.makedirs(output_path, exist_ok=True)
160202

161-
files = {
162-
"files": (pdf_file_name + ".pdf", open(pdf_file_path, "rb"), "application/pdf")
163-
}
203+
files = {"files": (pdf_file_name + ".pdf", open(pdf_file_path, "rb"), "application/pdf")}
164204

165205
data = {
166206
"output_dir": "./output",
@@ -177,23 +217,15 @@ def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "aut
177217
"return_images": True,
178218
"response_format_zip": True,
179219
"start_page_id": 0,
180-
"end_page_id": 99999
220+
"end_page_id": 99999,
181221
}
182222

183-
headers = {
184-
"Accept": "application/json"
185-
}
223+
headers = {"Accept": "application/json"}
186224
try:
187225
self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse")
188226
if callback:
189227
callback(0.20, f"[MinerU] invoke api: {self.mineru_api}/file_parse")
190-
response = requests.post(
191-
url=f"{self.mineru_api}/file_parse",
192-
files=files,
193-
data=data,
194-
headers=headers,
195-
timeout=1800
196-
)
228+
response = requests.post(url=f"{self.mineru_api}/file_parse", files=files, data=data, headers=headers, timeout=1800)
197229

198230
response.raise_for_status()
199231
if response.headers.get("Content-Type") == "application/zip":
@@ -216,12 +248,16 @@ def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "aut
216248
raise RuntimeError(f"[MinerU] api failed with exception {e}")
217249
self.logger.info("[MinerU] Api completed successfully.")
218250

219-
def _run_mineru_executable(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None):
251+
def _run_mineru_executable(
252+
self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None
253+
):
220254
cmd = [str(self.mineru_path), "-p", str(input_path), "-o", str(output_dir), "-m", method]
221255
if backend:
222256
cmd.extend(["-b", backend])
223257
if lang:
224258
cmd.extend(["-l", lang])
259+
if server_url and backend == "vlm-http-client":
260+
cmd.extend(["-u", server_url])
225261

226262
self.logger.info(f"[MinerU] Running command: {' '.join(cmd)}")
227263

@@ -425,6 +461,7 @@ def parse_pdf(
425461
backend: str = "pipeline",
426462
lang: Optional[str] = None,
427463
method: str = "auto",
464+
server_url: Optional[str] = None,
428465
delete_output: bool = True,
429466
) -> tuple:
430467
import shutil
@@ -470,7 +507,7 @@ def parse_pdf(
470507
self.__images__(pdf, zoomin=1)
471508

472509
try:
473-
self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=lang, callback=callback)
510+
self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=lang, server_url=server_url, callback=callback)
474511
outputs = self._read_output(out_dir, pdf.stem, method=method, backend=backend)
475512
self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
476513
if callback:
@@ -492,7 +529,8 @@ def parse_pdf(
492529

493530
if __name__ == "__main__":
494531
parser = MinerUParser("mineru")
495-
print("MinerU available:", parser.check_installation())
532+
ok, reason = parser.check_installation()
533+
print("MinerU available:", ok)
496534

497535
filepath = ""
498536
with open(filepath, "rb") as file:

rag/app/naive.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -365,7 +365,7 @@ def md_to_html(self, sections):
365365
html_content = markdown(text)
366366
soup = BeautifulSoup(html_content, 'html.parser')
367367
return soup
368-
368+
369369
def get_picture_urls(self, soup):
370370
if soup:
371371
return [img.get('src') for img in soup.find_all('img') if img.get('src')]
@@ -375,7 +375,7 @@ def get_hyperlink_urls(self, soup):
375375
if soup:
376376
return set([a.get('href') for a in soup.find_all('a') if a.get('href')])
377377
return []
378-
378+
379379
def get_pictures(self, text):
380380
"""Download and open all images from markdown text."""
381381
import requests
@@ -548,17 +548,21 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
548548
elif layout_recognizer == "MinerU":
549549
mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
550550
mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987")
551-
pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api)
552-
if not pdf_parser.check_installation():
553-
callback(-1, "MinerU not found.")
551+
mineru_server_url = os.environ.get("MINERU_SERVER_URL", "")
552+
mineru_backend = os.environ.get("MINERU_BACKEND", "pipeline")
553+
pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api, mineru_server_url=mineru_server_url)
554+
ok, reason = pdf_parser.check_installation(backend=mineru_backend)
555+
if not ok:
556+
callback(-1, f"MinerU not found or server not accessible: {reason}")
554557
return res
555558

556559
sections, tables = pdf_parser.parse_pdf(
557560
filepath=filename,
558561
binary=binary,
559562
callback=callback,
560563
output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
561-
backend=os.environ.get("MINERU_BACKEND", "pipeline"),
564+
backend=mineru_backend,
565+
server_url=mineru_server_url,
562566
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
563567
)
564568
parser_config["chunk_token_num"] = 0
@@ -731,9 +735,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
731735
logging.info(f"Failed to chunk url in registered file type {url}: {e}")
732736
sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
733737
url_res.extend(sub_url_res)
734-
738+
735739
logging.info("naive_merge({}): {}".format(filename, timer() - st))
736-
740+
737741
if embed_res:
738742
res.extend(embed_res)
739743
if url_res:

rag/flow/parser/parser.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -224,8 +224,9 @@ def _pdf(self, name, blob):
224224
mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
225225
mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987")
226226
pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api)
227-
if not pdf_parser.check_installation():
228-
raise RuntimeError("MinerU not found. Please install it via: pip install -U 'mineru[core]'.")
227+
ok, reason = pdf_parser.check_installation()
228+
if not ok:
229+
raise RuntimeError(f"MinerU not found or server not accessible: {reason}. Please install it via: pip install -U 'mineru[core]'.")
229230

230231
lines, _ = pdf_parser.parse_pdf(
231232
filepath=name,

0 commit comments

Comments
 (0)