diff --git a/inference/core/workflows/core_steps/models/foundation/google_vision_ocr/v1.py b/inference/core/workflows/core_steps/models/foundation/google_vision_ocr/v1.py
index 1c2b4bc308..ae3376618d 100644
--- a/inference/core/workflows/core_steps/models/foundation/google_vision_ocr/v1.py
+++ b/inference/core/workflows/core_steps/models/foundation/google_vision_ocr/v1.py
@@ -7,6 +7,7 @@
 from pydantic import ConfigDict, Field
 from supervision.config import CLASS_NAME_DATA_FIELD
 
+from inference.core.roboflow_api import post_to_roboflow_api
 from inference.core.workflows.core_steps.common.utils import (
     attach_parents_coordinates_to_sv_detections,
 )
@@ -22,6 +23,7 @@
 from inference.core.workflows.execution_engine.entities.types import (
     IMAGE_KIND,
     OBJECT_DETECTION_PREDICTION_KIND,
+    ROBOFLOW_MANAGED_KEY,
     SECRET_KIND,
     STRING_KIND,
     Selector,
@@ -40,7 +42,8 @@
 - `text_detection`: optimized for areas of text within a larger image.
 - `ocr_text_detection`: optimized for dense text documents.
 
-You need to provide your Google Vision API key to use this block.
+Provide your Google Vision API key or set the value to ``rf_key:account`` (or
+``rf_key:user:``) to proxy requests through Roboflow's API.
 """
 
 
@@ -80,17 +83,20 @@ class BlockManifest(WorkflowBlockManifest):
             },
         },
     )
+    api_key: Union[
+        Selector(kind=[STRING_KIND, SECRET_KIND, ROBOFLOW_MANAGED_KEY]), str
+    ] = Field(
+        default="rf_key:account",
+        description="Your Google Vision API key",
+        examples=["xxx-xxx", "$inputs.google_api_key"],
+        private=True,
+    )
     language_hints: Optional[List[str]] = Field(
         default=None,
         description="Optional list of language codes to pass to the OCR API. If not provided, the API will attempt to detect the language automatically."
         "If provided, language codes must be supported by the OCR API, visit https://cloud.google.com/vision/docs/languages for list of supported language codes.",
         examples=[["en", "fr"], ["de"]],
     )
-    api_key: Union[Selector(kind=[STRING_KIND, SECRET_KIND]), str] = Field(
-        description="Your Google Vision API key",
-        examples=["xxx-xxx", "$inputs.google_api_key"],
-        private=True,
-    )
 
     @classmethod
     def describe_outputs(cls) -> List[OutputDefinition]:
@@ -109,6 +115,16 @@ def get_execution_engine_compatibility(cls) -> Optional[str]:
 
 
 class GoogleVisionOCRBlockV1(WorkflowBlock):
+    def __init__(
+        self,
+        api_key: Optional[str],
+    ):
+        self._roboflow_api_key = api_key
+
+    @classmethod
+    def get_init_parameters(cls) -> List[str]:
+        return ["api_key"]
+
     @classmethod
     def get_manifest(cls) -> Type[WorkflowBlockManifest]:
         return BlockManifest
@@ -118,105 +134,168 @@ def run(
         self,
         image: WorkflowImageData,
         ocr_type: Literal["text_detection", "ocr_text_detection"],
         language_hints: Optional[List[str]],
-        api_key: str,
+        api_key: str = "rf_key:account",
     ) -> BlockResult:
-        # Decide which type of OCR to use and make the request to Google Vision API
+        # Decide which type of OCR to use
         if ocr_type == "text_detection":
-            type = "TEXT_DETECTION"
+            detection_type = "TEXT_DETECTION"
         elif ocr_type == "ocr_text_detection":
-            type = "DOCUMENT_TEXT_DETECTION"
+            detection_type = "DOCUMENT_TEXT_DETECTION"
         else:
             raise ValueError(f"Invalid ocr_type: {ocr_type}")
 
-        request_json = {
-            "requests": [
-                {
-                    "image": {"content": image.base64_image},
-                    "features": [{"type": type}],
-                }
-            ]
-        }
-
-        if language_hints is not None:
-            for r in request_json["requests"]:
-                r["imageContext"] = {"languageHints": language_hints}
-        response = requests.post(
-            "https://vision.googleapis.com/v1/images:annotate",
-            params={"key": api_key},
-            json=request_json,
+        request_json = _build_request_json(
+            image=image,
+            detection_type=detection_type,
+            language_hints=language_hints,
         )
 
-        if response.status_code != 200:
-            raise RuntimeError(
-                f"Request to Google Cloud Vision API failed: {str(response.json())}"
+        # Route to proxy or direct API based on api_key format
+        if api_key.startswith(("rf_key:account", "rf_key:user:")):
+            result = _execute_proxied_google_vision_request(
+                roboflow_api_key=self._roboflow_api_key,
+                google_vision_api_key=api_key,
+                request_json=request_json,
+            )
+        else:
+            result = _execute_google_vision_request(
+                api_key=api_key,
+                request_json=request_json,
             )
-        result = response.json()["responses"][0]
+        return _parse_google_vision_response(result=result, image=image)
+
 
-        # Check for image without text
-        if "textAnnotations" not in result or not result["textAnnotations"]:
-            return {
-                "text": "",
-                "language": "",
-                "predictions": sv.Detections.empty(),
+def _build_request_json(
+    image: WorkflowImageData,
+    detection_type: str,
+    language_hints: Optional[List[str]],
+) -> dict:
+    request_json = {
+        "requests": [
+            {
+                "image": {"content": image.base64_image},
+                "features": [{"type": detection_type}],
             }
+        ]
+    }
 
-        # Extract predictions from the response
-        text = result["textAnnotations"][0]["description"]
-        language = result["textAnnotations"][0]["locale"]
-
-        xyxy = []
-        confidence = []
-        classes = []
-        detections_id = []
-
-        for page in result["fullTextAnnotation"]["pages"]:
-            for block in page["blocks"]:
-                # Get bounding box coordinates
-                box = block["boundingBox"]["vertices"]
-                x_min = min(v.get("x", 0) for v in box)
-                y_min = min(v.get("y", 0) for v in box)
-                x_max = max(v.get("x", 0) for v in box)
-                y_max = max(v.get("y", 0) for v in box)
-                xyxy.append([x_min, y_min, x_max, y_max])
-
-                # Only DOCUMENT_TEXT_DETECTION provides confidence score, use 1.0 otherwise
-                confidence.append(block.get("confidence", 1.0))
-
-                # Get block text
-                block_text = []
-                for paragraph in block["paragraphs"]:
-                    for word in paragraph["words"]:
-                        word_text = "".join(
-                            symbol["text"] for symbol in word["symbols"]
-                        )
-                        block_text.append(word_text)
-                classes.append(" ".join(block_text))
-
-                # Create unique detection id for each block
-                detections_id.append(uuid4())
-
-        predictions = sv.Detections(
-            xyxy=np.array(xyxy),
-            confidence=np.array(confidence),
-            class_id=np.arange(len(classes)),
-            data={CLASS_NAME_DATA_FIELD: np.array(classes)},
-        )
+    if language_hints is not None:
+        for r in request_json["requests"]:
+            r["imageContext"] = {"languageHints": language_hints}
+
+    return request_json
 
-        predictions[DETECTION_ID_KEY] = np.array(detections_id)
-        predictions[PREDICTION_TYPE_KEY] = np.array(["ocr"] * len(predictions))
-        image_height, image_width = image.numpy_image.shape[:2]
-        predictions[IMAGE_DIMENSIONS_KEY] = np.array(
-            [[image_height, image_width]] * len(predictions)
+
+def _execute_proxied_google_vision_request(
+    roboflow_api_key: str,
+    google_vision_api_key: str,
+    request_json: dict,
+) -> dict:
+    payload = {
+        "google_vision_api_key": google_vision_api_key,
+        "request_json": request_json,
+    }
+
+    try:
+        response_data = post_to_roboflow_api(
+            endpoint="apiproxy/google_vision_ocr",
+            api_key=roboflow_api_key,
+            payload=payload,
         )
+        return response_data["responses"][0]
+    except requests.exceptions.RequestException as e:
+        raise RuntimeError(f"Failed to connect to Roboflow proxy: {e}") from e
+    except (KeyError, IndexError) as e:
+        raise RuntimeError(
+            f"Invalid response structure from Roboflow proxy: {e}"
+        ) from e
 
-        predictions = attach_parents_coordinates_to_sv_detections(
-            detections=predictions,
-            image=image,
+
+def _execute_google_vision_request(
+    api_key: str,
+    request_json: dict,
+) -> dict:
+    response = requests.post(
+        "https://vision.googleapis.com/v1/images:annotate",
+        params={"key": api_key},
+        json=request_json,
+    )
+
+    if response.status_code != 200:
+        raise RuntimeError(
+            f"Request to Google Cloud Vision API failed: {str(response.json())}"
         )
-
+    return response.json()["responses"][0]
+
+
+def _parse_google_vision_response(
+    result: dict,
+    image: WorkflowImageData,
+) -> BlockResult:
+    # Check for image without text
+    if "textAnnotations" not in result or not result["textAnnotations"]:
         return {
-            "text": text,
-            "language": language,
-            "predictions": predictions,
+            "text": "",
+            "language": "",
+            "predictions": sv.Detections.empty(),
         }
+
+    # Extract predictions from the response
+    text = result["textAnnotations"][0]["description"]
+    language = result["textAnnotations"][0]["locale"]
+
+    xyxy = []
+    confidence = []
+    classes = []
+    detections_id = []
+
+    for page in result["fullTextAnnotation"]["pages"]:
+        for block in page["blocks"]:
+            # Get bounding box coordinates
+            box = block["boundingBox"]["vertices"]
+            x_min = min(v.get("x", 0) for v in box)
+            y_min = min(v.get("y", 0) for v in box)
+            x_max = max(v.get("x", 0) for v in box)
+            y_max = max(v.get("y", 0) for v in box)
+            xyxy.append([x_min, y_min, x_max, y_max])
+
+            # Only DOCUMENT_TEXT_DETECTION provides confidence score, use 1.0 otherwise
+            confidence.append(block.get("confidence", 1.0))
+
+            # Get block text
+            block_text = []
+            for paragraph in block["paragraphs"]:
+                for word in paragraph["words"]:
+                    word_text = "".join(symbol["text"] for symbol in word["symbols"])
+                    block_text.append(word_text)
+            classes.append(" ".join(block_text))
+
+            # Create unique detection id for each block
+            detections_id.append(uuid4())
+
+    predictions = sv.Detections(
+        xyxy=np.array(xyxy),
+        confidence=np.array(confidence),
+        class_id=np.arange(len(classes)),
+        data={CLASS_NAME_DATA_FIELD: np.array(classes)},
+    )
+
+    predictions[DETECTION_ID_KEY] = np.array(detections_id)
+    predictions[PREDICTION_TYPE_KEY] = np.array(["ocr"] * len(predictions))
+    image_height, image_width = image.numpy_image.shape[:2]
+    predictions[IMAGE_DIMENSIONS_KEY] = np.array(
+        [[image_height, image_width]] * len(predictions)
+    )
+
+    predictions = attach_parents_coordinates_to_sv_detections(
+        detections=predictions,
+        image=image,
+    )
+
+    return {
+        "text": text,
+        "language": language,
+        "predictions": predictions,
+    }
diff --git a/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_google_ocr.py b/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_google_ocr.py
index 4a1c1cbc65..865ef5f904 100644
--- a/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_google_ocr.py
+++ b/tests/inference/hosted_platform_tests/workflows_examples/test_workflow_with_google_ocr.py
@@ -54,7 +54,7 @@
 }
 
 
-@pytest.mark.skipif(GOOGLE_VISION_API_KEY is None, reason="No OpenAI API key provided")
+@pytest.mark.skipif(GOOGLE_VISION_API_KEY is None, reason="No Google API key provided")
 @pytest.mark.flaky(retries=4, delay=1)
 def test_workflow_with_google_api_ocr(
     object_detection_service_url: str,
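For reviewers, a minimal sketch of how the new default behaves from a workflow author's point of view. This spec is illustrative and not part of the PR: the step and output names are invented, and the block type string follows the `roboflow_core/<block_name>@v1` convention used by other core blocks in this repo.

```python
# Minimal sketch, assuming the standard Workflows schema used in this repo's
# tests. Omitting "api_key" picks up the new manifest default
# "rf_key:account", so the block takes the
# _execute_proxied_google_vision_request() path and sends the request through
# Roboflow's "apiproxy/google_vision_ocr" endpoint. Supplying a plain Google
# key (or a "$inputs..." selector resolving to one) keeps the old direct-call
# behavior via _execute_google_vision_request().
GOOGLE_OCR_PROXY_WORKFLOW = {
    "version": "1.0",
    "inputs": [{"type": "WorkflowImage", "name": "image"}],
    "steps": [
        {
            "type": "roboflow_core/google_vision_ocr@v1",
            "name": "google_ocr",  # hypothetical step name
            "image": "$inputs.image",
            "ocr_type": "text_detection",
            # "api_key" omitted -> default "rf_key:account" -> Roboflow proxy
        }
    ],
    "outputs": [
        {"type": "JsonField", "name": "text", "selector": "$steps.google_ocr.text"},
        {
            "type": "JsonField",
            "name": "predictions",
            "selector": "$steps.google_ocr.predictions",
        },
    ],
}
```

Note the two keys in play on the proxied path: the `rf_key:...` value travels to the proxy as `google_vision_api_key` inside the payload, while the caller's Roboflow API key (injected at block construction via `get_init_parameters`) authenticates the `post_to_roboflow_api` call itself.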