Skip to content

Commit 78631a3

Browse files
authored
Move some functions out of 'api/utils/common.py' (#10948)
### What problem does this PR solve?

As title.

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <[email protected]>
1 parent 4117f41 commit 78631a3

File tree

7 files changed

+161
-79
lines changed

7 files changed

+161
-79
lines changed

api/utils/common.py

Lines changed: 0 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,6 @@
1414
# limitations under the License.
1515
#
1616

17-
import threading
18-
import subprocess
19-
import sys
20-
import os
21-
import logging
22-
2317
def string_to_bytes(string):
2418
return string if isinstance(
2519
string, bytes) else string.encode(encoding="utf-8")
@@ -28,70 +22,3 @@ def string_to_bytes(string):
2822
def bytes_to_string(byte):
2923
return byte.decode(encoding="utf-8")
3024

31-
32-
def convert_bytes(size_in_bytes: int) -> str:
33-
"""
34-
Format size in bytes.
35-
"""
36-
if size_in_bytes == 0:
37-
return "0 B"
38-
39-
units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB']
40-
i = 0
41-
size = float(size_in_bytes)
42-
43-
while size >= 1024 and i < len(units) - 1:
44-
size /= 1024
45-
i += 1
46-
47-
if i == 0 or size >= 100:
48-
return f"{size:.0f} {units[i]}"
49-
elif size >= 10:
50-
return f"{size:.1f} {units[i]}"
51-
else:
52-
return f"{size:.2f} {units[i]}"
53-
54-
55-
def once(func):
56-
"""
57-
A thread-safe decorator that ensures the decorated function runs exactly once,
58-
caching and returning its result for all subsequent calls. This prevents
59-
race conditions in multi-threaded environments by using a lock to protect
60-
the execution state.
61-
62-
Args:
63-
func (callable): The function to be executed only once.
64-
65-
Returns:
66-
callable: A wrapper function that executes `func` on the first call
67-
and returns the cached result thereafter.
68-
69-
Example:
70-
@once
71-
def compute_expensive_value():
72-
print("Computing...")
73-
return 42
74-
75-
# First call: executes and prints
76-
# Subsequent calls: return 42 without executing
77-
"""
78-
executed = False
79-
result = None
80-
lock = threading.Lock()
81-
def wrapper(*args, **kwargs):
82-
nonlocal executed, result
83-
with lock:
84-
if not executed:
85-
executed = True
86-
result = func(*args, **kwargs)
87-
return result
88-
return wrapper
89-
90-
@once
91-
def pip_install_torch():
92-
device = os.getenv("DEVICE", "cpu")
93-
if device=="cpu":
94-
return
95-
logging.info("Installing pytorch")
96-
pkg_names = ["torch>=2.5.0,<3.0.0"]
97-
subprocess.check_call([sys.executable, "-m", "pip", "install", *pkg_names])

common/misc_utils.py

Lines changed: 73 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,11 @@
1818
import hashlib
1919
import uuid
2020
import requests
21+
import threading
22+
import subprocess
23+
import sys
24+
import os
25+
import logging
2126

2227
def get_uuid():
2328
return uuid.uuid1().hex
@@ -33,4 +38,71 @@ def download_img(url):
3338

3439

3540
def hash_str2int(line: str, mod: int = 10 ** 8) -> int:
36-
return int(hashlib.sha1(line.encode("utf-8")).hexdigest(), 16) % mod
41+
return int(hashlib.sha1(line.encode("utf-8")).hexdigest(), 16) % mod
42+
43+
def convert_bytes(size_in_bytes: int) -> str:
44+
"""
45+
Format size in bytes.
46+
"""
47+
if size_in_bytes == 0:
48+
return "0 B"
49+
50+
units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB']
51+
i = 0
52+
size = float(size_in_bytes)
53+
54+
while size >= 1024 and i < len(units) - 1:
55+
size /= 1024
56+
i += 1
57+
58+
if i == 0 or size >= 100:
59+
return f"{size:.0f} {units[i]}"
60+
elif size >= 10:
61+
return f"{size:.1f} {units[i]}"
62+
else:
63+
return f"{size:.2f} {units[i]}"
64+
65+
66+
def once(func):
67+
"""
68+
A thread-safe decorator that ensures the decorated function runs exactly once,
69+
caching and returning its result for all subsequent calls. This prevents
70+
race conditions in multi-threaded environments by using a lock to protect
71+
the execution state.
72+
73+
Args:
74+
func (callable): The function to be executed only once.
75+
76+
Returns:
77+
callable: A wrapper function that executes `func` on the first call
78+
and returns the cached result thereafter.
79+
80+
Example:
81+
@once
82+
def compute_expensive_value():
83+
print("Computing...")
84+
return 42
85+
86+
# First call: executes and prints
87+
# Subsequent calls: return 42 without executing
88+
"""
89+
executed = False
90+
result = None
91+
lock = threading.Lock()
92+
def wrapper(*args, **kwargs):
93+
nonlocal executed, result
94+
with lock:
95+
if not executed:
96+
executed = True
97+
result = func(*args, **kwargs)
98+
return result
99+
return wrapper
100+
101+
@once
102+
def pip_install_torch():
103+
device = os.getenv("DEVICE", "cpu")
104+
if device=="cpu":
105+
return
106+
logging.info("Installing pytorch")
107+
pkg_names = ["torch>=2.5.0,<3.0.0"]
108+
subprocess.check_call([sys.executable, "-m", "pip", "install", *pkg_names])

deepdoc/parser/pdf_parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
from pypdf import PdfReader as pdf2_read
3636

3737
from common.file_utils import get_project_base_directory
38-
from api.utils.common import pip_install_torch
38+
from common.misc_utils import pip_install_torch
3939
from deepdoc.vision import OCR, AscendLayoutRecognizer, LayoutRecognizer, Recognizer, TableStructureRecognizer
4040
from rag.app.picture import vision_llm_chunk as picture_vision_llm_chunk
4141
from rag.nlp import rag_tokenizer

deepdoc/vision/ocr.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from huggingface_hub import snapshot_download
2323

2424
from common.file_utils import get_project_base_directory
25-
from api.utils.common import pip_install_torch
25+
from common.misc_utils import pip_install_torch
2626
from rag.settings import PARALLEL_DEVICES
2727
from .operators import * # noqa: F403
2828
from . import operators

rag/settings.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
import logging
1818
from api.utils.configs import get_base_config, decrypt_database_config
1919
from common.file_utils import get_project_base_directory
20-
from api.utils.common import pip_install_torch
20+
from common.misc_utils import pip_install_torch
2121

2222
# Server
2323
RAG_CONF_PATH = os.path.join(get_project_base_directory(), "conf")

rag/utils/es_conn.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
from rag.settings import TAG_FLD, PAGERANK_FLD
2929
from common.decorator import singleton
3030
from common.file_utils import get_project_base_directory
31-
from api.utils.common import convert_bytes
31+
from common.misc_utils import convert_bytes
3232
from rag.utils.doc_store_conn import DocStoreConnection, MatchExpr, OrderByExpr, MatchTextExpr, MatchDenseExpr, \
3333
FusionExpr
3434
from rag.nlp import is_english, rag_tokenizer

test/unit_test/common/test_misc_utils.py

Lines changed: 84 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
#
1616
import uuid
1717
import hashlib
18-
from common.misc_utils import get_uuid, download_img, hash_str2int
18+
from common.misc_utils import get_uuid, download_img, hash_str2int, convert_bytes
1919

2020

2121
class TestGetUuid:
@@ -270,3 +270,86 @@ def test_whitespace_strings(self):
270270
result = hash_str2int(test_str)
271271
assert isinstance(result, int)
272272
assert 0 <= result < 10 ** 8
273+
274+
275+
class TestConvertBytes:
276+
"""Test suite for convert_bytes function"""
277+
278+
def test_zero_bytes(self):
279+
"""Test that 0 bytes returns '0 B'"""
280+
assert convert_bytes(0) == "0 B"
281+
282+
def test_single_byte(self):
283+
"""Test single byte values"""
284+
assert convert_bytes(1) == "1 B"
285+
assert convert_bytes(999) == "999 B"
286+
287+
def test_kilobyte_range(self):
288+
"""Test values in kilobyte range with different precisions"""
289+
# Exactly 1 KB
290+
assert convert_bytes(1024) == "1.00 KB"
291+
292+
# Values that should show 1 decimal place (10-99.9 range)
293+
assert convert_bytes(15360) == "15.0 KB" # 15 KB exactly
294+
assert convert_bytes(10752) == "10.5 KB" # 10.5 KB
295+
296+
# Values that should show 2 decimal places (1-9.99 range)
297+
assert convert_bytes(2048) == "2.00 KB" # 2 KB exactly
298+
assert convert_bytes(3072) == "3.00 KB" # 3 KB exactly
299+
assert convert_bytes(5120) == "5.00 KB" # 5 KB exactly
300+
301+
def test_megabyte_range(self):
302+
"""Test values in megabyte range"""
303+
# Exactly 1 MB
304+
assert convert_bytes(1048576) == "1.00 MB"
305+
306+
# Values with different precision requirements
307+
assert convert_bytes(15728640) == "15.0 MB" # 15.0 MB
308+
assert convert_bytes(11010048) == "10.5 MB" # 10.5 MB
309+
310+
def test_gigabyte_range(self):
311+
"""Test values in gigabyte range"""
312+
# Exactly 1 GB
313+
assert convert_bytes(1073741824) == "1.00 GB"
314+
315+
# Value below 10 GB, so it is shown with 2 decimal places
316+
assert convert_bytes(3221225472) == "3.00 GB" # 3 GB exactly
317+
318+
def test_terabyte_range(self):
319+
"""Test values in terabyte range"""
320+
assert convert_bytes(1099511627776) == "1.00 TB" # 1 TB
321+
322+
def test_petabyte_range(self):
323+
"""Test values in petabyte range"""
324+
assert convert_bytes(1125899906842624) == "1.00 PB" # 1 PB
325+
326+
def test_boundary_values(self):
327+
"""Test values at unit boundaries"""
328+
# Just below 1 KB
329+
assert convert_bytes(1023) == "1023 B"
330+
331+
# Just above 1 KB
332+
assert convert_bytes(1025) == "1.00 KB"
333+
334+
# At 100 KB boundary (should switch to 0 decimal places)
335+
assert convert_bytes(102400) == "100 KB"
336+
assert convert_bytes(102300) == "99.9 KB"
337+
338+
def test_precision_transitions(self):
339+
"""Test the precision formatting transitions"""
340+
# Test transition from 2 decimal places to 1 decimal place
341+
assert convert_bytes(9216) == "9.00 KB" # 9.00 KB (2 decimal places)
342+
assert convert_bytes(10240) == "10.0 KB" # 10.0 KB (1 decimal place)
343+
344+
# Test transition from 1 decimal place to 0 decimal places
345+
assert convert_bytes(102400) == "100 KB" # 100 KB (0 decimal places)
346+
347+
def test_large_values_no_overflow(self):
348+
"""Test that very large values don't cause issues"""
349+
# Very large value that should use PB
350+
large_value = 10 * 1125899906842624 # 10 PB
351+
assert "PB" in convert_bytes(large_value)
352+
353+
# Ensure we don't exceed available units
354+
huge_value = 100 * 1125899906842624 # 100 PB (still within PB range)
355+
assert "PB" in convert_bytes(huge_value)

0 commit comments

Comments
 (0)