"""
OCR module adapted for HuggingFace Spaces.

Uses Google Cloud Vision API for text detection.
"""

import io
import json
import os
import tempfile

import numpy as np
from google.cloud import vision
from PIL import Image, ImageDraw, ImageFilter

from py_files.bounding_clustering import QuadTree, Node

# Environment variable holding the service-account JSON (as a string).
_CREDS_JSON_ENV_VAR = 'GOOGLE_CLOUD_CREDENTIALS'
# Environment variable this module points at a temporary key file before
# constructing the Vision client.
_ADC_ENV_VAR = "GOOGLE_APPLICATION_CREDENTIALS"
# Placeholder entry inserted when OCR yields nothing:
# [text, x0, y0, x1, y1, font_size].
_NO_TEXT_ENTRY = ["No text detected", 0, 0, 100, 20, 12]


def change_contrast(img, level):
    """Adjust image contrast for better OCR results.

    Args:
        img: Image object exposing ``point`` (a PIL image in this module).
        level: Contrast level. The correction factor is
            ``259 * (level + 255) / (255 * (259 - level))``, so ``level == 259``
            raises ``ZeroDivisionError``.

    Returns:
        Whatever ``img.point`` returns for the contrast mapping.
    """
    factor = (259 * (level + 255)) / (255 * (259 - level))

    def contrast(c):
        # Scale each channel value away from (or towards) the midpoint 128.
        return 128 + factor * (c - 128)

    return img.point(contrast)


def get_bounding_box_doc(blk):
    """Extract bounding box coordinates from document text block.

    Args:
        blk: Any object with ``bounding_box.vertices`` (block, word, ...).

    Returns:
        ``[x0, y0, x2, y2]`` as ints, taken from the first and third vertices.
        NOTE(review): for upright text these are presumably the top-left and
        bottom-right corners -- confirm against the Vision API vertex order.
    """
    vertices = blk.bounding_box.vertices
    first, third = vertices[0], vertices[2]
    return [int(first.x), int(first.y), int(third.x), int(third.y)]


def _detect_document_text(img):
    """Send ``img`` to Cloud Vision document text detection.

    If ``GOOGLE_CLOUD_CREDENTIALS`` is set, its JSON content is written to a
    temporary file and ``GOOGLE_APPLICATION_CREDENTIALS`` is pointed at it
    while the request runs. The file is removed and the previous value of the
    environment variable is restored even when the request fails.

    Raises:
        Any exception raised while parsing the credentials, creating the
        client, encoding the image or calling the API.
    """
    google_creds = os.environ.get(_CREDS_JSON_ENV_VAR)
    previous_adc = os.environ.get(_ADC_ENV_VAR)
    creds_path = None
    try:
        if google_creds:
            creds_data = json.loads(google_creds)
            with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
                # Record the path first so cleanup runs even if the dump fails.
                creds_path = f.name
                json.dump(creds_data, f)
            os.environ[_ADC_ENV_VAR] = creds_path

        client = vision.ImageAnnotatorClient()

        # Enhance image contrast for better OCR
        enhanced = change_contrast(img, 20)

        # Convert PIL image to PNG bytes
        buffer = io.BytesIO()
        enhanced.save(buffer, format='PNG')
        image = vision.Image(content=buffer.getvalue())

        return client.document_text_detection(image=image)
    finally:
        if creds_path is not None:
            # Do not leave the variable pointing at a file that is about to
            # be deleted.
            if previous_adc is None:
                os.environ.pop(_ADC_ENV_VAR, None)
            else:
                os.environ[_ADC_ENV_VAR] = previous_adc
            try:
                os.unlink(creds_path)
            except OSError as exc:
                # Best-effort cleanup: report, but never mask the real result.
                print(f"Warning: could not remove temporary credentials file: {exc}")


def _collect_word_boxes(response, debug=False, min_confidence=0.9):
    """Turn an OCR response into ``[text, x0, y0, x1, y1, font_size]`` rows.

    One row is produced per block whose confidence is at least
    ``min_confidence`` and which contains at least one word. ``font_size`` is
    the integer mean of the word box heights in that block.
    """
    word_boxes = []
    annotation = getattr(response, 'full_text_annotation', None)
    if not annotation:
        return word_boxes

    for page in annotation.pages:
        for block in page.blocks:
            if block.confidence < min_confidence:
                continue

            if debug:
                print(f"\nBlock confidence: {block.confidence}")
                print(f"Block box: {get_bounding_box_doc(block)}")

            word_texts = []
            fonts = []
            for paragraph in block.paragraphs:
                for word in paragraph.words:
                    word_texts.append("".join(symbol.text for symbol in word.symbols))
                    word_bbox = get_bounding_box_doc(word)
                    fonts.append(abs(word_bbox[3] - word_bbox[1]))

            words = " ".join(word_texts)
            if debug:
                print(f"Words: {words}")

            if fonts:  # Only add if we have font information
                word_boxes.append(
                    [words.strip()] + get_bounding_box_doc(block) + [sum(fonts) // len(fonts)]
                )
    return word_boxes


def get_text_from_image_doc(img, debug=False, get_response=False, resp=None, max_dist=20,
                            min_confidence=0.9):
    """
    Extract text from image using Google Cloud Vision Document Text Detection.
    Adapted for HuggingFace Spaces environment.

    Args:
        img: PIL image to analyse. Only used when ``resp`` is None.
        debug: When true, print block confidences, boxes and words.
        get_response: When true, return the OCR response as the second item.
        resp: A previously obtained response; skips the API call when given.
        max_dist: Forwarded to ``QuadTree(max_dist=...)``.
        min_confidence: Blocks with lower confidence are ignored.

    Returns:
        ``(tree, response)`` if ``get_response`` is true, else ``(tree, {})``.
        ``tree`` is a ``QuadTree`` holding one ``Node`` per accepted block, or a
        single "No text detected" placeholder node when nothing was accepted.
    """
    response = resp
    if resp is None:
        try:
            response = _detect_document_text(img)
        except Exception as e:
            # Fallback: create a dummy response for demo purposes
            print(f"Warning: Google Cloud Vision not available: {e}")
            response = create_dummy_ocr_response(img)

    word_boxes = _collect_word_boxes(response, debug=debug, min_confidence=min_confidence)

    # If no text was detected, create a minimal entry
    if not word_boxes:
        word_boxes.append(list(_NO_TEXT_ENTRY))

    # Create QuadTree for clustering nearby text
    tree = QuadTree(max_dist=max_dist)
    for box in word_boxes:
        tree.insert(Node(*box))

    if get_response:
        return tree, response
    return tree, {}


def create_dummy_ocr_response(img):
    """
    Create a dummy OCR response for demo purposes when Google Cloud Vision is not available.
    This allows the demo to work without requiring actual OCR credentials.

    Args:
        img: Unused; kept so existing callers keep working.

    Returns:
        An object whose ``full_text_annotation`` attribute is ``None``.
    """
    # Create a simple mock response object
    class MockResponse:
        def __init__(self):
            self.full_text_annotation = None

    # For demo purposes, we'll just return an empty response
    # In a real scenario, you might want to use an alternative OCR library like pytesseract
    return MockResponse()


def draw_boxes(img, bound, color, width=5):
    """Draw bounding boxes on image for visualization.

    Args:
        img: PIL image; it is copied, not modified.
        bound: Indexable whose items 0..3 are ``x, y, x, y`` of two corners.
        color: Outline colour passed to ``ImageDraw.rectangle``.
        width: Outline width in pixels.

    Returns:
        ``(image_copy, x0, y0, x1, y1)`` where the coordinates are the padded
        rectangle that was drawn (7 px before the minimum, 10 px after the
        maximum on each axis).
    """
    _img = img.copy()
    draw = ImageDraw.Draw(_img)
    x0 = min(bound[0], bound[2]) - 7
    x1 = max(bound[0], bound[2]) + 10
    y0 = min(bound[1], bound[3]) - 7
    y1 = max(bound[1], bound[3]) + 10
    draw.rectangle([x0, y0, x1, y1], outline=color, width=width)
    return _img, x0, y0, x1, y1


def get_image_with_boxes_doc(image, color='red', width=5, get_response=False, response=None):
    """Get image with OCR bounding boxes drawn on it.

    Args:
        image: PIL image to run OCR on and annotate.
        color: Outline colour for the boxes.
        width: Outline width for the boxes.
        get_response: When true, also return the OCR response.
        response: A previously obtained OCR response to reuse.

    Returns:
        The annotated image, or ``(annotated_image, response)`` when
        ``get_response`` is true.
    """
    tree, resp = get_text_from_image_doc(image, get_response=get_response, resp=response)
    # NOTE(review): the meaning of the ``False`` argument is defined by
    # QuadTree.get_children, which is not visible here.
    for bx in tree.get_children(False):
        # Only the redrawn image is needed; the padded coordinates are ignored.
        image = draw_boxes(image, bx, color, width)[0]

    if get_response:
        return image, resp
    return image