# Source code for manuscript._pipeline

import time
from pathlib import Path
from typing import Dict, List, Optional, Union, TYPE_CHECKING

import numpy as np
from PIL import Image

from .data import Page
from .detectors import EAST
from .recognizers import TRBA
from .utils import read_image, visualize_page

if TYPE_CHECKING:
    from .api.base import BaseModel as BaseCorrector



# [docs]
class Pipeline:
    """
    High-level OCR pipeline combining text detection, recognition, and correction.

    The Pipeline class orchestrates an EAST detector, a TRBA recognizer, and an
    optional text corrector to perform a complete OCR workflow:
    detection → crop extraction → recognition → correction → result merging.

    Attributes
    ----------
    detector : EAST
        Text detector instance
    recognizer : TRBA
        Text recognizer instance
    corrector : BaseCorrector, optional
        Text corrector instance (None to skip correction)
    min_text_size : int
        Minimum text box size in pixels (applied to both width and height)
    rotate_threshold : float or None
        Aspect ratio threshold for automatic rotation of vertical text crops.
        If ``height > width * rotate_threshold``, the crop is rotated 90°
        clockwise. ``None`` or ``0`` disables automatic rotation.
    last_detection_page, last_recognition_page, last_correction_page : Page or None
        Read-only properties exposing the per-stage page snapshots stored by
        ``predict``. Each one is ``None`` when no snapshot is stored.

    Examples
    --------
    Create a pipeline with default models:

    >>> from manuscript import Pipeline
    >>> pipeline = Pipeline()
    >>> result = pipeline.predict("document.jpg")
    >>> text = pipeline.get_text(result["page"])
    >>> print(text)

    Create a pipeline with custom models:

    >>> from manuscript import Pipeline
    >>> from manuscript.detectors import EAST
    >>> from manuscript.recognizers import TRBA
    >>> detector = EAST(weights="east_50_g1", score_thresh=0.8)
    >>> recognizer = TRBA(weights="trba_lite_g1", device="cuda")
    >>> pipeline = Pipeline(detector=detector, recognizer=recognizer)

    Create a pipeline with text correction:

    >>> from manuscript import Pipeline
    >>> from manuscript.correctors import CharLM
    >>> corrector = CharLM()
    >>> pipeline = Pipeline(corrector=corrector)

    Disable automatic rotation of vertical text:

    >>> pipeline = Pipeline(rotate_threshold=0)
    """


    # [docs]
    def __init__(
        self,
        detector: Optional[EAST] = None,
        recognizer: Optional[TRBA] = None,
        corrector: Optional["BaseCorrector"] = None,
        min_text_size: int = 5,
        rotate_threshold: float = 1.5,
    ):
        """
        Initialize OCR pipeline.

        Parameters
        ----------
        detector : EAST, optional
            Text detector instance. If None, creates default EAST detector.
        recognizer : TRBA, optional
            Text recognizer instance. If None, creates default TRBA recognizer.
        corrector : BaseCorrector, optional
            Text corrector instance. If None, no text correction is applied.
            The corrector receives a Page object after recognition and returns
            a corrected Page object.
        min_text_size : int, optional
            Minimum text size in pixels. Boxes smaller than this will be
            filtered out before recognition. Default is 5.
        rotate_threshold : float, optional
            Aspect ratio threshold for automatic rotation of vertical text.
            If ``height > width * rotate_threshold``, the crop is rotated
            90 degrees clockwise to convert vertical text to horizontal.
            Set to ``None`` or ``0`` to disable automatic rotation.
            Default is 1.5.
        """
        self.detector = detector if detector is not None else EAST()
        self.recognizer = recognizer if recognizer is not None else TRBA()
        self.corrector = corrector
        self.min_text_size = min_text_size
        self.rotate_threshold = rotate_threshold

        self._last_detection_page: Optional[Page] = None
        self._last_recognition_page: Optional[Page] = None
        self._last_correction_page: Optional[Page] = None



    # [docs]
    def predict(
        self,
        image: Union[str, Path, np.ndarray, Image.Image],
        recognize_text: bool = True,
        vis: bool = False,
        profile: bool = False,
    ) -> Union[Dict, tuple]:
        """
        Run OCR pipeline on a single image.

        Parameters
        ----------
        image : str, Path, numpy.ndarray, or PIL.Image
            Input image. Can be:
            - Path to image file (str or Path)
            - RGB numpy array with shape (H, W, 3) in uint8
            - PIL Image object
        recognize_text : bool, optional
            If True, performs both detection and recognition.
            If False, performs only detection. Default is True.
        vis : bool, optional
            If True, returns visualization image along with results.
            Default is False.
        profile : bool, optional
            If True, prints timing information for each pipeline stage.
            Default is False.

        Returns
        -------
        dict or tuple
            If vis=False:
                dict with keys:
                - "page" : Page object with detection/recognition results

            If vis=True:
                tuple of (result_dict, vis_image)

        Examples
        --------
        Basic usage:

        >>> pipeline = Pipeline()
        >>> result = pipeline.predict("document.jpg")
        >>> page = result["page"]
        >>> print(page.blocks[0].lines[0].words[0].text)

        Detection only:

        >>> result = pipeline.predict("document.jpg", recognize_text=False)
        >>> # Words will have polygon and detection_confidence but no text

        With visualization:

        >>> result, vis_img = pipeline.predict("document.jpg", vis=True)
        >>> vis_img.show()

        With profiling:

        >>> result = pipeline.predict("document.jpg", profile=True)
        # Prints timing for each stage
        """
        start_time = time.time()

        # ---- DETECTION ----
        t0 = time.time()
        detection_result = self.detector.predict(
            image, return_maps=False, sort_reading_order=True
        )
        page: Page = detection_result["page"]
        self._last_detection_page = page.model_copy(deep=True)

        if profile:
            print(f"Detection: {time.time() - t0:.3f}s")

        # ---- If recognition not needed ----
        if not recognize_text:
            result = {"page": page}

            if vis:
                img_array = read_image(image)
                pil_img = (
                    image
                    if isinstance(image, Image.Image)
                    else Image.fromarray(img_array)
                )
                vis_img = visualize_page(pil_img, page, show_order=False)
                return result, vis_img

            return result

        # ---- LOAD IMAGE FOR CROPPING ----
        t0 = time.time()
        image_array = read_image(image)
        if profile:
            print(f"Load image for crops: {time.time() - t0:.3f}s")

        # ---- EXTRACT WORD CROPS ----
        t0 = time.time()
        word_images = []
        word_objects = []  # Keep references to Word objects for updating

        for block in page.blocks:
            for line in block.lines:
                for word in line.words:
                    poly = np.array(word.polygon, dtype=np.int32)
                    x_min, y_min = np.min(poly, axis=0)
                    x_max, y_max = np.max(poly, axis=0)

                    width = x_max - x_min
                    height = y_max - y_min

                    # Filter by minimum size
                    if width >= self.min_text_size and height >= self.min_text_size:
                        region_image = self._extract_word_image(image_array, poly)
                        if region_image is not None and region_image.size > 0:
                            word_images.append(region_image)
                            word_objects.append(word)

        if profile:
            print(f"Extract {len(word_images)} crops: {time.time() - t0:.3f}s")

        # ---- RECOGNITION ----
        if word_images:
            t0 = time.time()
            recognition_results = self.recognizer.predict(word_images, batch_size=32)
            if profile:
                print(f"Recognition: {time.time() - t0:.3f}s")

            # Update Word objects with recognition results
            for word_obj, result in zip(word_objects, recognition_results):
                word_obj.text = result["text"]
                word_obj.recognition_confidence = result["confidence"]

        self._last_recognition_page = page.model_copy(deep=True)

        # ---- CORRECTION ----
        if self.corrector is not None:
            t0 = time.time()
            page = self.corrector.predict(page)
            self._last_correction_page = page.model_copy(deep=True)
            if profile:
                print(f"Correction: {time.time() - t0:.3f}s")
        else:
            self._last_correction_page = None

        if profile:
            print(f"Pipeline total: {time.time() - start_time:.3f}s")

        result = {"page": page}

        if vis:
            pil_img = (
                image
                if isinstance(image, Image.Image)
                else Image.fromarray(image_array)
            )
            vis_img = visualize_page(pil_img, page, show_order=True)
            return result, vis_img

        return result



    # [docs]
    def get_text(self, page: Page) -> str:
        """
        Extract plain text from Page object.

        Parameters
        ----------
        page : Page
            Page object with recognition results.

        Returns
        -------
        str
            Extracted text with lines separated by newlines.

        Examples
        --------
        >>> pipeline = Pipeline()
        >>> result = pipeline.predict("document.jpg")
        >>> text = pipeline.get_text(result["page"])
        >>> print(text)
        """
        lines = []
        for block in page.blocks:
            for line in block.lines:
                # Extract text from words in the line
                texts = [w.text for w in line.words if w.text]
                if texts:
                    lines.append(" ".join(texts))
        return "\n".join(lines)


    @property
    def last_detection_page(self) -> Optional[Page]:
        """Return the detection-stage snapshot stored by ``predict``.

        ``predict`` stores a deep copy of the detector's page before any
        recognition results are written to it. ``None`` until a snapshot
        has been stored.
        """
        return self._last_detection_page

    @property
    def last_recognition_page(self) -> Optional[Page]:
        """Return the recognition-stage snapshot stored by ``predict``.

        The snapshot is a deep copy of the page taken right after the
        recognition stage, before any correction is applied. ``None`` when
        no such snapshot is stored.
        """
        return self._last_recognition_page

    @property
    def last_correction_page(self) -> Optional[Page]:
        """Return the correction-stage snapshot stored by ``predict``.

        The snapshot is a deep copy of the page returned by the corrector.
        ``None`` when no such snapshot is stored, for example after a
        recognition run with no corrector configured.
        """
        return self._last_correction_page

    def _extract_word_image(
        self, image: np.ndarray, polygon: np.ndarray
    ) -> Optional[np.ndarray]:
        """
        Extract word region from image using polygon coordinates.

        Parameters
        ----------
        image : np.ndarray
            Input image (H, W, 3)
        polygon : np.ndarray
            Polygon coordinates [[x1,y1], [x2,y2], ...]

        Returns
        -------
        np.ndarray or None
            Cropped word image or None if extraction failed
        """
        try:
            x_min, y_min = np.min(polygon, axis=0)
            x_max, y_max = np.max(polygon, axis=0)

            h, w = image.shape[:2]
            x1 = max(0, int(x_min))
            y1 = max(0, int(y_min))
            x2 = min(w, int(x_max))
            y2 = min(h, int(y_max))

            region_image = image[y1:y2, x1:x2]

            if region_image.size == 0:
                return None

            region_image = self._prepare_crop(region_image)

            return region_image if region_image.size > 0 else None
        except Exception:
            return None

    def _prepare_crop(self, crop: np.ndarray) -> np.ndarray:
        """
        Prepare crop for recognition, rotating vertical text if needed.

        If the crop height is significantly greater than width (based on
        ``rotate_threshold``), the image is rotated 90 degrees clockwise
        to convert vertical text to horizontal orientation.

        Parameters
        ----------
        crop : np.ndarray
            Cropped word image (H, W, 3) or (H, W)

        Returns
        -------
        np.ndarray
            Prepared crop, possibly rotated
        """
        if not self.rotate_threshold:
            return crop

        height, width = crop.shape[:2]

        if height > width * self.rotate_threshold:
            crop = np.rot90(crop, k=-1)

        return crop