@@ -7,6 +7,7 @@
 from pydantic import ConfigDict, Field
 from supervision.config import CLASS_NAME_DATA_FIELD
 
+from inference.core.roboflow_api import post_to_roboflow_api
 from inference.core.workflows.core_steps.common.utils import (
     attach_parents_coordinates_to_sv_detections,
 )
@@ -22,6 +23,7 @@
 from inference.core.workflows.execution_engine.entities.types import (
     IMAGE_KIND,
     OBJECT_DETECTION_PREDICTION_KIND,
+    ROBOFLOW_MANAGED_KEY,
     SECRET_KIND,
     STRING_KIND,
     Selector,
@@ -40,7 +42,8 @@
 - `text_detection`: optimized for areas of text within a larger image.
 - `ocr_text_detection`: optimized for dense text documents.
 
-You need to provide your Google Vision API key to use this block.
+Provide your Google Vision API key or set the value to ``rf_key:account`` (or
+``rf_key:user:<id>``) to proxy requests through Roboflow's API.
 """


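The docstring above names three accepted forms for `api_key`. As a minimal sketch, assuming the standard workflow step schema, a step using this block might set the field as follows; the block type string, step name, and key values below are illustrative, not taken from this diff:

    google_ocr_step = {
        "type": "roboflow_core/google_vision_ocr@v1",  # assumed block identifier
        "name": "google_ocr",
        "image": "$inputs.image",
        "ocr_type": "text_detection",
        # any one of the following forms works for api_key:
        "api_key": "$inputs.google_api_key",  # a raw Google Vision API key
        # "api_key": "rf_key:account",        # proxy through the account's Roboflow-managed key
        # "api_key": "rf_key:user:<id>",      # proxy through a specific user's managed key
    }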
@@ -80,17 +83,20 @@ class BlockManifest(WorkflowBlockManifest):
             },
         },
     )
+    api_key: Union[
+        Selector(kind=[STRING_KIND, SECRET_KIND, ROBOFLOW_MANAGED_KEY]), str
+    ] = Field(
+        default="rf_key:account",
+        description="Your Google Vision API key",
+        examples=["xxx-xxx", "$inputs.google_api_key"],
+        private=True,
+    )
     language_hints: Optional[List[str]] = Field(
         default=None,
         description="Optional list of language codes to pass to the OCR API. If not provided, the API will attempt to detect the language automatically. "
         "If provided, language codes must be supported by the OCR API; visit https://cloud.google.com/vision/docs/languages for the list of supported language codes.",
         examples=[["en", "fr"], ["de"]],
     )
-    api_key: Union[Selector(kind=[STRING_KIND, SECRET_KIND]), str] = Field(
-        description="Your Google Vision API key",
-        examples=["xxx-xxx", "$inputs.google_api_key"],
-        private=True,
-    )
 
     @classmethod
     def describe_outputs(cls) -> List[OutputDefinition]:
@@ -109,6 +115,16 @@ def get_execution_engine_compatibility(cls) -> Optional[str]:
 
 class GoogleVisionOCRBlockV1(WorkflowBlock):
 
+    def __init__(
+        self,
+        api_key: Optional[str],
+    ):
+        self._roboflow_api_key = api_key
+
+    @classmethod
+    def get_init_parameters(cls) -> List[str]:
+        return ["api_key"]
+
     @classmethod
     def get_manifest(cls) -> Type[WorkflowBlockManifest]:
         return BlockManifest
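For orientation, a sketch of how the new constructor likely receives its value, assuming the usual init-parameter mechanism in which `get_init_parameters` advertises which engine-level values the block wants injected; the dictionary and key value here are illustrative:

    # Assumed engine-side wiring: init parameters declared by the block are
    # looked up in the engine configuration and passed to the constructor.
    init_parameters = {"api_key": "<roboflow-api-key>"}  # illustrative value
    block = GoogleVisionOCRBlockV1(
        **{name: init_parameters[name] for name in GoogleVisionOCRBlockV1.get_init_parameters()}
    )

Note that this `api_key` is the Roboflow API key used for the proxy route; it is distinct from the manifest's `api_key` field, which carries the Google key or an `rf_key:` marker.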
@@ -118,105 +134,168 @@ def run(
         self,
         image: WorkflowImageData,
         ocr_type: Literal["text_detection", "ocr_text_detection"],
         language_hints: Optional[List[str]],
-        api_key: str,
+        api_key: str = "rf_key:account",
     ) -> BlockResult:
-        # Decide which type of OCR to use and make the request to Google Vision API
+        # Decide which type of OCR to use
         if ocr_type == "text_detection":
-            type = "TEXT_DETECTION"
+            detection_type = "TEXT_DETECTION"
         elif ocr_type == "ocr_text_detection":
-            type = "DOCUMENT_TEXT_DETECTION"
+            detection_type = "DOCUMENT_TEXT_DETECTION"
         else:
             raise ValueError(f"Invalid ocr_type: {ocr_type}")
 
-        request_json = {
-            "requests": [
-                {
-                    "image": {"content": image.base64_image},
-                    "features": [{"type": type}],
-                }
-            ]
-        }
-
-        if language_hints is not None:
-            for r in request_json["requests"]:
-                r["imageContext"] = {"languageHints": language_hints}
-        response = requests.post(
-            "https://vision.googleapis.com/v1/images:annotate",
-            params={"key": api_key},
-            json=request_json,
+        request_json = _build_request_json(
+            image=image,
+            detection_type=detection_type,
+            language_hints=language_hints,
         )
 
-        if response.status_code != 200:
-            raise RuntimeError(
-                f"Request to Google Cloud Vision API failed: {str(response.json())}"
-            )
+        # Route to proxy or direct API based on api_key format
+        if api_key.startswith(("rf_key:account", "rf_key:user:")):
+            result = _execute_proxied_google_vision_request(
+                roboflow_api_key=self._roboflow_api_key,
+                google_vision_api_key=api_key,
+                request_json=request_json,
+            )
+        else:
+            result = _execute_google_vision_request(
+                api_key=api_key,
+                request_json=request_json,
+            )
 
-        result = response.json()["responses"][0]
-
-        # Check for image without text
-        if "textAnnotations" not in result or not result["textAnnotations"]:
-            return {
-                "text": "",
-                "language": "",
-                "predictions": sv.Detections.empty(),
-            }
-
-        # Extract predictions from the response
-        text = result["textAnnotations"][0]["description"]
-        language = result["textAnnotations"][0]["locale"]
-
-        xyxy = []
-        confidence = []
-        classes = []
-        detections_id = []
-
-        for page in result["fullTextAnnotation"]["pages"]:
-            for block in page["blocks"]:
-                # Get bounding box coordinates
-                box = block["boundingBox"]["vertices"]
-                x_min = min(v.get("x", 0) for v in box)
-                y_min = min(v.get("y", 0) for v in box)
-                x_max = max(v.get("x", 0) for v in box)
-                y_max = max(v.get("y", 0) for v in box)
-                xyxy.append([x_min, y_min, x_max, y_max])
-
-                # Only DOCUMENT_TEXT_DETECTION provides confidence score, use 1.0 otherwise
-                confidence.append(block.get("confidence", 1.0))
-
-                # Get block text
-                block_text = []
-                for paragraph in block["paragraphs"]:
-                    for word in paragraph["words"]:
-                        word_text = "".join(
-                            symbol["text"] for symbol in word["symbols"]
-                        )
-                        block_text.append(word_text)
-                classes.append(" ".join(block_text))
-
-                # Create unique detection id for each block
-                detections_id.append(uuid4())
-
-        predictions = sv.Detections(
-            xyxy=np.array(xyxy),
-            confidence=np.array(confidence),
-            class_id=np.arange(len(classes)),
-            data={CLASS_NAME_DATA_FIELD: np.array(classes)},
-        )
-
-        predictions[DETECTION_ID_KEY] = np.array(detections_id)
-        predictions[PREDICTION_TYPE_KEY] = np.array(["ocr"] * len(predictions))
-        image_height, image_width = image.numpy_image.shape[:2]
-        predictions[IMAGE_DIMENSIONS_KEY] = np.array(
-            [[image_height, image_width]] * len(predictions)
-        )
-
-        predictions = attach_parents_coordinates_to_sv_detections(
-            detections=predictions,
-            image=image,
-        )
-
-        return {
-            "text": text,
-            "language": language,
-            "predictions": predictions,
-        }
+        return _parse_google_vision_response(result=result, image=image)
+
+
+def _build_request_json(
+    image: WorkflowImageData,
+    detection_type: str,
+    language_hints: Optional[List[str]],
+) -> dict:
+    request_json = {
+        "requests": [
+            {
+                "image": {"content": image.base64_image},
+                "features": [{"type": detection_type}],
+            }
+        ]
+    }
+
+    if language_hints is not None:
+        for r in request_json["requests"]:
+            r["imageContext"] = {"languageHints": language_hints}
+
+    return request_json
+
+
+def _execute_proxied_google_vision_request(
+    roboflow_api_key: str,
+    google_vision_api_key: str,
+    request_json: dict,
+) -> dict:
+    payload = {
+        "google_vision_api_key": google_vision_api_key,
+        "request_json": request_json,
+    }
+
+    try:
+        response_data = post_to_roboflow_api(
+            endpoint="apiproxy/google_vision_ocr",
+            api_key=roboflow_api_key,
+            payload=payload,
+        )
+        return response_data["responses"][0]
+    except requests.exceptions.RequestException as e:
+        raise RuntimeError(f"Failed to connect to Roboflow proxy: {e}") from e
+    except (KeyError, IndexError) as e:
+        raise RuntimeError(
+            f"Invalid response structure from Roboflow proxy: {e}"
+        ) from e
+
+
+def _execute_google_vision_request(
+    api_key: str,
+    request_json: dict,
+) -> dict:
+    response = requests.post(
+        "https://vision.googleapis.com/v1/images:annotate",
+        params={"key": api_key},
+        json=request_json,
+    )
+
+    if response.status_code != 200:
+        raise RuntimeError(
+            f"Request to Google Cloud Vision API failed: {str(response.json())}"
+        )
+
+    return response.json()["responses"][0]
+
+
+def _parse_google_vision_response(
+    result: dict,
+    image: WorkflowImageData,
+) -> BlockResult:
+    # Check for image without text
+    if "textAnnotations" not in result or not result["textAnnotations"]:
+        return {
+            "text": "",
+            "language": "",
+            "predictions": sv.Detections.empty(),
+        }
+
+    # Extract predictions from the response
+    text = result["textAnnotations"][0]["description"]
+    language = result["textAnnotations"][0]["locale"]
+
+    xyxy = []
+    confidence = []
+    classes = []
+    detections_id = []
+
+    for page in result["fullTextAnnotation"]["pages"]:
+        for block in page["blocks"]:
+            # Get bounding box coordinates
+            box = block["boundingBox"]["vertices"]
+            x_min = min(v.get("x", 0) for v in box)
+            y_min = min(v.get("y", 0) for v in box)
+            x_max = max(v.get("x", 0) for v in box)
+            y_max = max(v.get("y", 0) for v in box)
+            xyxy.append([x_min, y_min, x_max, y_max])
+
+            # Only DOCUMENT_TEXT_DETECTION provides confidence score, use 1.0 otherwise
+            confidence.append(block.get("confidence", 1.0))
+
+            # Get block text
+            block_text = []
+            for paragraph in block["paragraphs"]:
+                for word in paragraph["words"]:
+                    word_text = "".join(symbol["text"] for symbol in word["symbols"])
+                    block_text.append(word_text)
+            classes.append(" ".join(block_text))
+
+            # Create unique detection id for each block
+            detections_id.append(uuid4())
+
+    predictions = sv.Detections(
+        xyxy=np.array(xyxy),
+        confidence=np.array(confidence),
+        class_id=np.arange(len(classes)),
+        data={CLASS_NAME_DATA_FIELD: np.array(classes)},
+    )
+
+    predictions[DETECTION_ID_KEY] = np.array(detections_id)
+    predictions[PREDICTION_TYPE_KEY] = np.array(["ocr"] * len(predictions))
+    image_height, image_width = image.numpy_image.shape[:2]
+    predictions[IMAGE_DIMENSIONS_KEY] = np.array(
+        [[image_height, image_width]] * len(predictions)
+    )
+
+    predictions = attach_parents_coordinates_to_sv_detections(
+        detections=predictions,
+        image=image,
+    )
+
+    return {
+        "text": text,
+        "language": language,
+        "predictions": predictions,
+    }
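To restate the routing rule the new `run` applies: a key beginning with `rf_key:account` or `rf_key:user:` goes through the Roboflow proxy, authenticated with the caller's Roboflow API key, while any other value is treated as a raw Google Vision key and sent directly to Google. A self-contained sketch of that predicate; the sample key values are illustrative:

    def uses_roboflow_proxy(api_key: str) -> bool:
        # Same prefix test as in run() above
        return api_key.startswith(("rf_key:account", "rf_key:user:"))

    assert uses_roboflow_proxy("rf_key:account")
    assert uses_roboflow_proxy("rf_key:user:42")        # illustrative user id
    assert not uses_roboflow_proxy("AIza-example-key")  # illustrative raw Google key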
@@ -54,7 +54,7 @@
 }
 
 
-@pytest.mark.skipif(GOOGLE_VISION_API_KEY is None, reason="No OpenAI API key provided")
+@pytest.mark.skipif(GOOGLE_VISION_API_KEY is None, reason="No Google API key provided")
 @pytest.mark.flaky(retries=4, delay=1)
 def test_workflow_with_google_api_ocr(
     object_detection_service_url: str,