1515#
1616import json
1717import logging
18+ import os
1819import platform
1920import re
2021import subprocess
2122import sys
2223import tempfile
2324import threading
2425import time
26+ import zipfile
2527from io import BytesIO
2628from os import PathLike
2729from pathlib import Path
2830from queue import Empty , Queue
2931from typing import Any , Callable , Optional
30- import requests
31- import os
32- import zipfile
3332
3433import numpy as np
3534import pdfplumber
35+ import requests
3636from PIL import Image
3737from strenum import StrEnum
3838
@@ -54,43 +54,44 @@ class MinerUContentType(StrEnum):
5454
5555
5656class MinerUParser (RAGFlowPdfParser ):
def __init__(self, mineru_path: str = "mineru", mineru_api: Optional[str] = "http://host.docker.internal:9987", mineru_server_url: Optional[str] = ""):
    """Record where MinerU can be reached; no I/O is performed here.

    Args:
        mineru_path: Name or path of the MinerU executable; stored as a ``Path``.
        mineru_api: Base URL of the MinerU HTTP API. Trailing slashes are
            stripped so endpoint paths can be appended with a single "/".
            ``None`` is treated as an empty string (no API configured).
        mineru_server_url: Base URL of the server checked and passed on for
            the ``vlm-http-client`` backend. Normalised like ``mineru_api``.
    """
    self.mineru_path = Path(mineru_path)
    # `or ""` guards against None (e.g. an unset configuration value), which
    # would otherwise raise AttributeError on .rstrip().
    self.mineru_api = (mineru_api or "").rstrip("/")
    self.mineru_server_url = (mineru_server_url or "").rstrip("/")
    # Starts False; check_installation() sets it when only the HTTP API is usable.
    self.using_api = False
    self.logger = logging.getLogger(self.__class__.__name__)
6263
def _extract_zip_no_root(self, zip_path, extract_to, root_dir):
    """Extract a zip archive into ``extract_to`` without its top-level folder.

    Args:
        zip_path: Path (or file-like object) of the archive to read.
        extract_to: Directory that receives the extracted files.
        root_dir: Archive-internal folder prefix to strip, ending with "/".
            When falsy, the first archive entry is used if it is a directory.

    If no usable root folder is found (none detected, or ``root_dir`` does
    not end with "/"), the archive is extracted unchanged via ``extractall``.
    Members whose stripped path would resolve outside ``extract_to`` (".."
    segments or absolute names) are skipped with a warning instead of being
    written, so a crafted archive cannot overwrite arbitrary files.
    """
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        if not root_dir:
            names = zip_ref.namelist()
            root_dir = names[0] if names and names[0].endswith("/") else None

        if not root_dir or not root_dir.endswith("/"):
            self.logger.info(f"[MinerU] No root directory found, extracting all... (root_dir={root_dir!r})")
            zip_ref.extractall(extract_to)
            return

        # Resolve the target once; join with "" to get a trailing separator
        # so "/out" does not accidentally match "/out-other".
        base_dir = os.path.realpath(extract_to)
        base_prefix = os.path.join(base_dir, "")
        root_len = len(root_dir)

        for member in zip_ref.infolist():
            name = member.filename
            if name == root_dir:
                self.logger.info("[MinerU] Ignore root folder...")
                continue

            rel_path = name[root_len:] if name.startswith(root_dir) else name
            full_path = os.path.join(extract_to, rel_path)

            # Zip Slip guard: refuse anything that escapes the target directory.
            resolved = os.path.realpath(full_path)
            if resolved != base_dir and not resolved.startswith(base_prefix):
                self.logger.warning(f"[MinerU] Skip unsafe zip member outside target directory: {name}")
                continue

            if member.is_dir():
                os.makedirs(full_path, exist_ok=True)
            else:
                os.makedirs(os.path.dirname(full_path), exist_ok=True)
                with open(full_path, "wb") as f:
                    f.write(zip_ref.read(name))
9596
9697 def _is_http_endpoint_valid (self , url , timeout = 5 ):
@@ -100,7 +101,15 @@ def _is_http_endpoint_valid(self, url, timeout=5):
100101 except Exception :
101102 return False
102103
103- def check_installation (self ) -> bool :
104+ def check_installation (self , backend : str = "pipeline" , server_url : Optional [str ] = None ) -> tuple [bool , str ]:
105+ reason = ""
106+
107+ valid_backends = ["pipeline" , "vlm-http-client" , "vlm-transformers" , "vlm-vllm-engine" ]
108+ if backend not in valid_backends :
109+ reason = "[MinerU] Invalid backend '{backend}'. Valid backends are: {valid_backends}"
110+ logging .warning (reason )
111+ return False , reason
112+
104113 subprocess_kwargs = {
105114 "capture_output" : True ,
106115 "text" : True ,
@@ -112,43 +121,76 @@ def check_installation(self) -> bool:
112121 if platform .system () == "Windows" :
113122 subprocess_kwargs ["creationflags" ] = getattr (subprocess , "CREATE_NO_WINDOW" , 0 )
114123
124+ if server_url is None :
125+ server_url = self .mineru_server_url
126+
127+ if backend == "vlm-http-client" and server_url :
128+ try :
129+ server_accessible = self ._is_http_endpoint_valid (server_url + "/openapi.json" )
130+ logging .info (f"[MinerU] vlm-http-client server check: { server_accessible } " )
131+ if server_accessible :
132+ self .using_api = False # We are using http client, not API
133+ return True , reason
134+ else :
135+ reason = f"[MinerU] vlm-http-client server not accessible: { server_url } "
136+ logging .warning (f"[MinerU] vlm-http-client server not accessible: { server_url } " )
137+ return False , reason
138+ except Exception as e :
139+ logging .warning (f"[MinerU] vlm-http-client server check failed: { e } " )
140+ try :
141+ response = requests .get (server_url , timeout = 5 )
142+ logging .info (f"[MinerU] vlm-http-client server connection check: success with status { response .status_code } " )
143+ self .using_api = False
144+ return True , reason
145+ except Exception as e :
146+ reason = f"[MinerU] vlm-http-client server connection check failed: { server_url } : { e } "
147+ logging .warning (f"[MinerU] vlm-http-client server connection check failed: { server_url } : { e } " )
148+ return False , reason
149+
115150 try :
116151 result = subprocess .run ([str (self .mineru_path ), "--version" ], ** subprocess_kwargs )
117152 version_info = result .stdout .strip ()
118153 if version_info :
119154 logging .info (f"[MinerU] Detected version: { version_info } " )
120155 else :
121156 logging .info ("[MinerU] Detected MinerU, but version info is empty." )
122- return True
157+ return True , reason
123158 except subprocess .CalledProcessError as e :
124159 logging .warning (f"[MinerU] Execution failed (exit code { e .returncode } )." )
125160 except FileNotFoundError :
126161 logging .warning ("[MinerU] MinerU not found. Please install it via: pip install -U 'mineru[core]'" )
127162 except Exception as e :
128163 logging .error (f"[MinerU] Unexpected error during installation check: { e } " )
129164
165+ # If executable check fails, try API check
130166 try :
131167 if self .mineru_api :
132168 # check openapi.json
133169 openapi_exists = self ._is_http_endpoint_valid (self .mineru_api + "/openapi.json" )
170+ if not openapi_exists :
171+ reason = "[MinerU] Failed to detect vaild MinerU API server"
172+ return openapi_exists , reason
134173 logging .info (f"[MinerU] Detected { self .mineru_api } /openapi.json: { openapi_exists } " )
135174 self .using_api = openapi_exists
136- return openapi_exists
175+ return openapi_exists , reason
137176 else :
138177 logging .info ("[MinerU] api not exists." )
139178 except Exception as e :
179+ reason = f"[MinerU] Unexpected error during api check: { e } "
140180 logging .error (f"[MinerU] Unexpected error during api check: { e } " )
141- return False
181+ return False , reason
142182
def _run_mineru(
    self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None
):
    """Run MinerU on ``input_path`` through the HTTP API or the local executable.

    Args:
        input_path: File to parse.
        output_dir: Directory that receives MinerU's output.
        method: Forwarded unchanged to the selected runner.
        backend: Forwarded unchanged to the selected runner.
        lang: Optional language hint, forwarded unchanged.
        server_url: Server URL for the executable runner. When ``None``, the
            instance's ``mineru_server_url`` is used, mirroring the fallback
            in ``check_installation`` so the server that was validated is the
            one actually used. Ignored on the API path.
        callback: Optional progress callback, forwarded unchanged.
    """
    if self.using_api:
        self._run_mineru_api(input_path, output_dir, method, backend, lang, callback)
        return

    if server_url is None:
        server_url = self.mineru_server_url
    self._run_mineru_executable(input_path, output_dir, method, backend, lang, server_url, callback)
190+
149191 def _run_mineru_api (self , input_path : Path , output_dir : Path , method : str = "auto" , backend : str = "pipeline" , lang : Optional [str ] = None , callback : Optional [Callable ] = None ):
150192 OUTPUT_ZIP_PATH = os .path .join (str (output_dir ), "output.zip" )
151-
193+
152194 pdf_file_path = str (input_path )
153195
154196 if not os .path .exists (pdf_file_path ):
@@ -158,9 +200,7 @@ def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "aut
158200 output_path = os .path .join (str (output_dir ), pdf_file_name , method )
159201 os .makedirs (output_path , exist_ok = True )
160202
161- files = {
162- "files" : (pdf_file_name + ".pdf" , open (pdf_file_path , "rb" ), "application/pdf" )
163- }
203+ files = {"files" : (pdf_file_name + ".pdf" , open (pdf_file_path , "rb" ), "application/pdf" )}
164204
165205 data = {
166206 "output_dir" : "./output" ,
@@ -177,23 +217,15 @@ def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "aut
177217 "return_images" : True ,
178218 "response_format_zip" : True ,
179219 "start_page_id" : 0 ,
180- "end_page_id" : 99999
220+ "end_page_id" : 99999 ,
181221 }
182222
183- headers = {
184- "Accept" : "application/json"
185- }
223+ headers = {"Accept" : "application/json" }
186224 try :
187225 self .logger .info (f"[MinerU] invoke api: { self .mineru_api } /file_parse" )
188226 if callback :
189227 callback (0.20 , f"[MinerU] invoke api: { self .mineru_api } /file_parse" )
190- response = requests .post (
191- url = f"{ self .mineru_api } /file_parse" ,
192- files = files ,
193- data = data ,
194- headers = headers ,
195- timeout = 1800
196- )
228+ response = requests .post (url = f"{ self .mineru_api } /file_parse" , files = files , data = data , headers = headers , timeout = 1800 )
197229
198230 response .raise_for_status ()
199231 if response .headers .get ("Content-Type" ) == "application/zip" :
@@ -216,12 +248,16 @@ def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "aut
216248 raise RuntimeError (f"[MinerU] api failed with exception { e } " )
217249 self .logger .info ("[MinerU] Api completed successfully." )
218250
219- def _run_mineru_executable (self , input_path : Path , output_dir : Path , method : str = "auto" , backend : str = "pipeline" , lang : Optional [str ] = None , callback : Optional [Callable ] = None ):
251+ def _run_mineru_executable (
252+ self , input_path : Path , output_dir : Path , method : str = "auto" , backend : str = "pipeline" , lang : Optional [str ] = None , server_url : Optional [str ] = None , callback : Optional [Callable ] = None
253+ ):
220254 cmd = [str (self .mineru_path ), "-p" , str (input_path ), "-o" , str (output_dir ), "-m" , method ]
221255 if backend :
222256 cmd .extend (["-b" , backend ])
223257 if lang :
224258 cmd .extend (["-l" , lang ])
259+ if server_url and backend == "vlm-http-client" :
260+ cmd .extend (["-u" , server_url ])
225261
226262 self .logger .info (f"[MinerU] Running command: { ' ' .join (cmd )} " )
227263
@@ -425,6 +461,7 @@ def parse_pdf(
425461 backend : str = "pipeline" ,
426462 lang : Optional [str ] = None ,
427463 method : str = "auto" ,
464+ server_url : Optional [str ] = None ,
428465 delete_output : bool = True ,
429466 ) -> tuple :
430467 import shutil
@@ -470,7 +507,7 @@ def parse_pdf(
470507 self .__images__ (pdf , zoomin = 1 )
471508
472509 try :
473- self ._run_mineru (pdf , out_dir , method = method , backend = backend , lang = lang , callback = callback )
510+ self ._run_mineru (pdf , out_dir , method = method , backend = backend , lang = lang , server_url = server_url , callback = callback )
474511 outputs = self ._read_output (out_dir , pdf .stem , method = method , backend = backend )
475512 self .logger .info (f"[MinerU] Parsed { len (outputs )} blocks from PDF." )
476513 if callback :
@@ -492,7 +529,8 @@ def parse_pdf(
492529
493530if __name__ == "__main__" :
494531 parser = MinerUParser ("mineru" )
495- print ("MinerU available:" , parser .check_installation ())
532+ ok , reason = parser .check_installation ()
533+ print ("MinerU available:" , ok )
496534
497535 filepath = ""
498536 with open (filepath , "rb" ) as file :
0 commit comments