# Source code for manuscript._pipeline

import time
from pathlib import Path
from typing import Dict, List, Optional, Union, TYPE_CHECKING

import numpy as np
from PIL import Image

from .data import Page
from .detectors import EAST
from .recognizers import TRBA
from .utils import read_image, visualize_page

if TYPE_CHECKING:
    from .api.base import BaseModel as BaseCorrector


class Pipeline:
    """
    High-level OCR pipeline combining text detection, recognition, and correction.

    The Pipeline class orchestrates EAST detector, TRBA recognizer, and optional
    text corrector to perform the complete OCR workflow:
    detection → crop extraction → recognition → correction → result merging.

    Attributes
    ----------
    detector : EAST
        Text detector instance
    recognizer : TRBA
        Text recognizer instance
    corrector : BaseCorrector, optional
        Text corrector instance (None to skip correction)
    min_text_size : int
        Minimum text box size in pixels (width and height)
    rotate_threshold : float
        Aspect ratio threshold for automatic rotation of vertical text crops.
        If ``height > width * rotate_threshold``, crop is rotated 90° clockwise.

    Examples
    --------
    Create pipeline with default models:

    >>> from manuscript import Pipeline
    >>> pipeline = Pipeline()
    >>> result = pipeline.predict("document.jpg")
    >>> text = pipeline.get_text(result["page"])
    >>> print(text)

    Create pipeline with custom models:

    >>> from manuscript import Pipeline
    >>> from manuscript.detectors import EAST
    >>> from manuscript.recognizers import TRBA
    >>> detector = EAST(weights="east_50_g1", score_thresh=0.8)
    >>> recognizer = TRBA(weights="trba_lite_g1", device="cuda")
    >>> pipeline = Pipeline(detector=detector, recognizer=recognizer)

    Create pipeline with text correction:

    >>> from manuscript import Pipeline
    >>> from manuscript.correctors import CharLM
    >>> corrector = CharLM()
    >>> pipeline = Pipeline(corrector=corrector)

    Disable automatic rotation of vertical text:

    >>> pipeline = Pipeline(rotate_threshold=0)
    """
[docs] def __init__( self, detector: Optional[EAST] = None, recognizer: Optional[TRBA] = None, corrector: Optional["BaseCorrector"] = None, min_text_size: int = 5, rotate_threshold: float = 1.5, ): """ Initialize OCR pipeline. Parameters ---------- detector : EAST, optional Text detector instance. If None, creates default EAST detector. recognizer : TRBA, optional Text recognizer instance. If None, creates default TRBA recognizer. corrector : BaseCorrector, optional Text corrector instance. If None, no text correction is applied. The corrector receives a Page object after recognition and returns a corrected Page object. min_text_size : int, optional Minimum text size in pixels. Boxes smaller than this will be filtered out before recognition. Default is 5. rotate_threshold : float, optional Aspect ratio threshold for automatic rotation of vertical text. If ``height > width * rotate_threshold``, the crop is rotated 90 degrees clockwise to convert vertical text to horizontal. Set to ``None`` or ``0`` to disable automatic rotation. Default is 1.5. """ self.detector = detector if detector is not None else EAST() self.recognizer = recognizer if recognizer is not None else TRBA() self.corrector = corrector self.min_text_size = min_text_size self.rotate_threshold = rotate_threshold self._last_detection_page: Optional[Page] = None self._last_recognition_page: Optional[Page] = None self._last_correction_page: Optional[Page] = None
    def predict(
        self,
        image: Union[str, Path, np.ndarray, Image.Image],
        recognize_text: bool = True,
        vis: bool = False,
        profile: bool = False,
    ) -> Union[Dict, tuple]:
        """
        Run OCR pipeline on a single image.

        Parameters
        ----------
        image : str, Path, numpy.ndarray, or PIL.Image
            Input image. Can be:

            - Path to image file (str or Path)
            - RGB numpy array with shape (H, W, 3) in uint8
            - PIL Image object
        recognize_text : bool, optional
            If True, performs both detection and recognition.
            If False, performs only detection. Default is True.
        vis : bool, optional
            If True, returns visualization image along with results.
            Default is False.
        profile : bool, optional
            If True, prints timing information for each pipeline stage.
            Default is False.

        Returns
        -------
        dict or tuple
            If vis=False: dict with keys:

            - "page" : Page object with detection/recognition results

            If vis=True: tuple of (result_dict, vis_image)

        Examples
        --------
        Basic usage:

        >>> pipeline = Pipeline()
        >>> result = pipeline.predict("document.jpg")
        >>> page = result["page"]
        >>> print(page.blocks[0].lines[0].words[0].text)

        Detection only:

        >>> result = pipeline.predict("document.jpg", recognize_text=False)
        >>> # Words will have polygon and detection_confidence but no text

        With visualization:

        >>> result, vis_img = pipeline.predict("document.jpg", vis=True)
        >>> vis_img.show()

        With profiling:

        >>> result = pipeline.predict("document.jpg", profile=True)
        # Prints timing for each stage
        """
        start_time = time.time()

        # ---- DETECTION ----
        t0 = time.time()
        detection_result = self.detector.predict(
            image, return_maps=False, sort_reading_order=True
        )
        page: Page = detection_result["page"]
        # Deep copy so later in-place recognition/correction edits on `page`
        # do not leak into the saved detection snapshot.
        self._last_detection_page = page.model_copy(deep=True)
        if profile:
            print(f"Detection: {time.time() - t0:.3f}s")

        # ---- If recognition not needed ----
        if not recognize_text:
            result = {"page": page}
            if vis:
                img_array = read_image(image)
                # Reuse the PIL object directly when one was passed in;
                # otherwise rebuild one from the decoded array.
                pil_img = (
                    image
                    if isinstance(image, Image.Image)
                    else Image.fromarray(img_array)
                )
                vis_img = visualize_page(pil_img, page, show_order=False)
                return result, vis_img
            return result

        # ---- LOAD IMAGE FOR CROPPING ----
        t0 = time.time()
        image_array = read_image(image)
        if profile:
            print(f"Load image for crops: {time.time() - t0:.3f}s")

        # ---- EXTRACT WORD CROPS ----
        t0 = time.time()
        word_images = []
        word_objects = []  # Keep references to Word objects for updating
        for block in page.blocks:
            for line in block.lines:
                for word in line.words:
                    poly = np.array(word.polygon, dtype=np.int32)
                    # Axis-aligned bounding box of the (possibly rotated) polygon.
                    x_min, y_min = np.min(poly, axis=0)
                    x_max, y_max = np.max(poly, axis=0)
                    width = x_max - x_min
                    height = y_max - y_min
                    # Filter by minimum size
                    if width >= self.min_text_size and height >= self.min_text_size:
                        region_image = self._extract_word_image(image_array, poly)
                        if region_image is not None and region_image.size > 0:
                            word_images.append(region_image)
                            word_objects.append(word)
        if profile:
            print(f"Extract {len(word_images)} crops: {time.time() - t0:.3f}s")

        # ---- RECOGNITION ----
        if word_images:
            t0 = time.time()
            recognition_results = self.recognizer.predict(word_images, batch_size=32)
            if profile:
                print(f"Recognition: {time.time() - t0:.3f}s")
            # Update Word objects with recognition results.
            # zip keeps crops and words aligned because both lists were
            # appended to in lockstep above.
            for word_obj, result in zip(word_objects, recognition_results):
                word_obj.text = result["text"]
                word_obj.recognition_confidence = result["confidence"]
            # NOTE(review): when no crops survive filtering, this snapshot is
            # not refreshed and keeps the value from a previous call — confirm
            # whether that staleness is intended.
            self._last_recognition_page = page.model_copy(deep=True)

        # ---- CORRECTION ----
        if self.corrector is not None:
            t0 = time.time()
            page = self.corrector.predict(page)
            self._last_correction_page = page.model_copy(deep=True)
            if profile:
                print(f"Correction: {time.time() - t0:.3f}s")
        else:
            self._last_correction_page = None

        if profile:
            print(f"Pipeline total: {time.time() - start_time:.3f}s")

        result = {"page": page}
        if vis:
            pil_img = (
                image
                if isinstance(image, Image.Image)
                else Image.fromarray(image_array)
            )
            vis_img = visualize_page(pil_img, page, show_order=True)
            return result, vis_img
        return result
[docs] def get_text(self, page: Page) -> str: """ Extract plain text from Page object. Parameters ---------- page : Page Page object with recognition results. Returns ------- str Extracted text with lines separated by newlines. Examples -------- >>> pipeline = Pipeline() >>> result = pipeline.predict("document.jpg") >>> text = pipeline.get_text(result["page"]) >>> print(text) """ lines = [] for block in page.blocks: for line in block.lines: # Extract text from words in the line texts = [w.text for w in line.words if w.text] if texts: lines.append(" ".join(texts)) return "\n".join(lines)
@property def last_detection_page(self) -> Optional[Page]: return self._last_detection_page @property def last_recognition_page(self) -> Optional[Page]: return self._last_recognition_page @property def last_correction_page(self) -> Optional[Page]: return self._last_correction_page def _extract_word_image( self, image: np.ndarray, polygon: np.ndarray ) -> Optional[np.ndarray]: """ Extract word region from image using polygon coordinates. Parameters ---------- image : np.ndarray Input image (H, W, 3) polygon : np.ndarray Polygon coordinates [[x1,y1], [x2,y2], ...] Returns ------- np.ndarray or None Cropped word image or None if extraction failed """ try: x_min, y_min = np.min(polygon, axis=0) x_max, y_max = np.max(polygon, axis=0) h, w = image.shape[:2] x1 = max(0, int(x_min)) y1 = max(0, int(y_min)) x2 = min(w, int(x_max)) y2 = min(h, int(y_max)) region_image = image[y1:y2, x1:x2] if region_image.size == 0: return None region_image = self._prepare_crop(region_image) return region_image if region_image.size > 0 else None except Exception: return None def _prepare_crop(self, crop: np.ndarray) -> np.ndarray: """ Prepare crop for recognition, rotating vertical text if needed. If the crop height is significantly greater than width (based on ``rotate_threshold``), the image is rotated 90 degrees clockwise to convert vertical text to horizontal orientation. Parameters ---------- crop : np.ndarray Cropped word image (H, W, 3) or (H, W) Returns ------- np.ndarray Prepared crop, possibly rotated """ if not self.rotate_threshold: return crop height, width = crop.shape[:2] if height > width * self.rotate_threshold: crop = np.rot90(crop, k=-1) return crop