Source code for manuscript.data.structures

import json
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

from pydantic import BaseModel, ConfigDict, Field, model_validator

# Identifier of the current serialization schema; used as the default
# ``schema`` argument of ``Page.to_dict`` and ``Page.to_json``.
_SCHEMA_CURRENT = "v0_1_11"
# Identifier of the older schema that can still be exported: lines and blocks
# are written with a "words" key instead of "text_spans".
_SCHEMA_LEGACY = "v0_1_10"
# Every schema identifier accepted by ``Page._normalize_schema``.
_SUPPORTED_SCHEMAS = {_SCHEMA_CURRENT, _SCHEMA_LEGACY}



[docs]
class TextSpan(BaseModel):
    """
    One text region produced by detection and, optionally, recognition.

    This is the smallest OCR unit of the data model. A span is not tied to a
    particular granularity: it may be a word, a whole text line, or any other
    contiguous text segment returned by a detector.

    Unknown keys are rejected (``extra="forbid"``).

    Attributes
    ----------
    polygon : List[Tuple[float, float]]
        Polygon vertices (x, y), ordered clockwise. At least 4 points are
        required; arbitrary polygons with more points are supported.
        For quadrilateral text regions, the canonical order is
        TL -> TR -> BR -> BL (Top-Left, Top-Right, Bottom-Right, Bottom-Left).
    detection_confidence : float
        Text detection confidence score from the detector (0.0 to 1.0).
    text : str, optional
        Recognized text content (populated by the OCR pipeline). None if only
        detection was performed.
    recognition_confidence : float, optional
        Text recognition confidence score from the recognizer (0.0 to 1.0).
        None if only detection was performed.
    order : int, optional
        Text span position inside the line after sorting. None before sorting.

    Examples
    --------
    >>> text_span = TextSpan(
    ...     polygon=[(10, 20), (100, 20), (100, 40), (10, 40)],
    ...     detection_confidence=0.95,
    ...     text="Hello",
    ...     recognition_confidence=0.98
    ... )
    >>> print(text_span.text)
    Hello
    """

    model_config = ConfigDict(extra="forbid")

    # Required geometry: a polygon needs at least four vertices.
    polygon: List[Tuple[float, float]] = Field(
        default=...,
        min_length=4,
        description=(
            "Polygon vertices (x, y), ordered clockwise. "
            "Supports arbitrary polygons with 4 or more points. "
            "For quadrilateral text regions: TL -> TR -> BR -> BL."
        ),
    )
    # Required detector score, constrained to the closed interval [0, 1].
    detection_confidence: float = Field(
        default=...,
        ge=0.0,
        le=1.0,
        description="Text detection confidence score from detector",
    )
    # Recognition results stay None until a recognizer has filled them in.
    text: Optional[str] = Field(
        default=None,
        description="Recognized text content (populated by OCR pipeline)",
    )
    recognition_confidence: Optional[float] = Field(
        default=None,
        ge=0.0,
        le=1.0,
        description="Text recognition confidence score from recognizer",
    )
    # Reading-order index inside the parent line; unset before sorting.
    order: Optional[int] = Field(
        default=None,
        description=(
            "Text span position inside the line after sorting. "
            "None before sorting."
        ),
    )



def __getattr__(name: str) -> Any:
    """
    Resolve module attributes that are not defined directly in this module.

    Only the legacy name ``Word`` is supported; it resolves to ``TextSpan``.

    Parameters
    ----------
    name : str
        Attribute name being looked up on the module.

    Returns
    -------
    Any
        ``TextSpan`` when ``name`` is ``"Word"``.

    Raises
    ------
    AttributeError
        For every other name, mirroring a regular failed module lookup.
    """
    if name != "Word":
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    return TextSpan



[docs]
class Line(BaseModel):
    """
    One line of text, made up of text spans.

    Input dictionaries may use the legacy key ``words`` instead of
    ``text_spans``; it is mapped to ``text_spans`` before validation.
    Unknown keys are rejected (``extra="forbid"``).

    Attributes
    ----------
    text_spans : List[TextSpan]
        List of text spans in the line. Defaults to an empty list.
    order : int, optional
        Line position inside a block or page after sorting. None before sorting.

    Examples
    --------
    >>> line = Line(text_spans=[
    ...     TextSpan(
    ...         polygon=[(10, 20), (50, 20), (50, 40), (10, 40)],
    ...         detection_confidence=0.95,
    ...         text="Hello",
    ...     ),
    ...     TextSpan(
    ...         polygon=[(60, 20), (110, 20), (110, 40), (60, 40)],
    ...         detection_confidence=0.97,
    ...         text="World",
    ...     ),
    ... ])
    >>> print(len(line.text_spans))
    2
    """

    model_config = ConfigDict(extra="forbid")

    text_spans: List[TextSpan] = Field(
        default_factory=list,
        description="List of text spans in the line.",
    )
    order: Optional[int] = Field(
        default=None,
        description=(
            "Line position inside a block or page after sorting. "
            "None before sorting."
        ),
    )

    @model_validator(mode="before")
    @classmethod
    def _accept_legacy_words_key(cls, data: Any) -> Any:
        """Map a legacy ``words`` key in raw dict input onto ``text_spans``.

        Non-dict input and dicts without ``words`` are returned untouched.
        Otherwise a new dict is returned so the caller's dict is not mutated.

        Raises
        ------
        ValueError
            If both ``words`` and ``text_spans`` are given with different values.
        """
        if isinstance(data, dict) and "words" in data:
            legacy_value = data["words"]
            # Copy everything except the legacy key, preserving key order.
            payload = {key: value for key, value in data.items() if key != "words"}
            if "text_spans" not in payload:
                payload["text_spans"] = legacy_value
            elif payload["text_spans"] != legacy_value:
                raise ValueError(
                    "Line received both 'words' and 'text_spans' with different values."
                )
            return payload
        return data

    @property
    def words(self) -> List[TextSpan]:
        """Legacy read access to ``text_spans`` (same list object)."""
        return self.text_spans

    @words.setter
    def words(self, value: List[TextSpan]) -> None:
        # Legacy write access: assignment is forwarded to ``text_spans``.
        self.text_spans = value




[docs]
class Block(BaseModel):
    """
    A logical group of text lines (e.g., paragraph, column).

    Input dictionaries may use the legacy key ``words`` instead of
    ``text_spans``; it is mapped to ``text_spans`` before validation.
    Unknown keys are rejected (``extra="forbid"``).

    Attributes
    ----------
    lines : List[Line]
        List of text lines in the block.
    text_spans : List[TextSpan], optional
        Optional flat list of text spans used as a shorthand input. If ``lines``
        is empty and ``text_spans`` are provided, they are wrapped into a
        single line at construction time. ``text_spans`` itself is kept as
        given.
    order : int, optional
        Block reading-order position after sorting. None before sorting.

    Examples
    --------
    >>> block = Block(lines=[
    ...     Line(text_spans=[
    ...         TextSpan(
    ...             polygon=[(10, 20), (50, 20), (50, 40), (10, 40)],
    ...             detection_confidence=0.95,
    ...             text="Line 1",
    ...         )
    ...     ]),
    ...     Line(text_spans=[
    ...         TextSpan(
    ...             polygon=[(10, 50), (50, 50), (50, 70), (10, 70)],
    ...             detection_confidence=0.97,
    ...             text="Line 2",
    ...         )
    ...     ]),
    ... ])
    >>> print(len(block.lines))
    2
    """

    model_config = ConfigDict(extra="forbid")

    lines: List[Line] = Field(default_factory=list)
    text_spans: List[TextSpan] = Field(
        default_factory=list,
        description=(
            "Optional flat list of text spans. Use 'lines' for structured output."
        ),
    )
    order: Optional[int] = Field(
        default=None,
        description=(
            "Block reading-order position after sorting. None before sorting."
        ),
    )

    @model_validator(mode="before")
    @classmethod
    def _accept_legacy_words_key(cls, data: Any) -> Any:
        """Map a legacy ``words`` key in raw dict input onto ``text_spans``.

        Non-dict input and dicts without ``words`` are returned untouched.
        Otherwise a new dict is returned so the caller's dict is not mutated.

        Raises
        ------
        ValueError
            If both ``words`` and ``text_spans`` are given with different values.
        """
        if isinstance(data, dict) and "words" in data:
            legacy_value = data["words"]
            # Copy everything except the legacy key, preserving key order.
            payload = {key: value for key, value in data.items() if key != "words"}
            if "text_spans" not in payload:
                payload["text_spans"] = legacy_value
            elif payload["text_spans"] != legacy_value:
                raise ValueError(
                    "Block received both 'words' and 'text_spans' with different values."
                )
            return payload
        return data

    def __init__(self, **data: Any) -> None:
        """Initialize Block, normalizing flat ``text_spans`` into one line.

        After regular model initialization, a block that has spans but no
        lines gets a single ``Line`` holding those spans.
        """
        super().__init__(**data)
        has_flat_spans = bool(self.text_spans)
        if has_flat_spans and not self.lines:
            self.lines = [Line(text_spans=self.text_spans)]

    @property
    def words(self) -> List[TextSpan]:
        """Legacy read access to the flat ``text_spans`` list."""
        return self.text_spans

    @words.setter
    def words(self, value: List[TextSpan]) -> None:
        # Legacy write access: assignment is forwarded to ``text_spans``.
        # ``lines`` is not rebuilt here.
        self.text_spans = value




[docs]
class Page(BaseModel):
    """
    A document page containing blocks of text.

    For a full visual diagram of the data model, see:
    ``DATA_MODEL.md`` located in the same module directory.

    Unknown keys are rejected (``extra="forbid"``).

    Attributes
    ----------
    blocks : List[Block]
        List of text blocks on the page.

    Examples
    --------
    >>> page = Page(blocks=[
    ...     Block(lines=[
    ...         Line(text_spans=[
    ...             TextSpan(
    ...                 polygon=[(10, 20), (50, 20), (50, 40), (10, 40)],
    ...                 detection_confidence=0.95,
    ...                 text="Hello",
    ...             )
    ...         ])
    ...     ])
    ... ])
    >>> print(len(page.blocks))
    1
    """

    model_config = ConfigDict(extra="forbid")

    blocks: List[Block]

    @staticmethod
    def _normalize_schema(schema: str) -> str:
        """Return ``schema`` unchanged after checking that it is supported.

        Raises
        ------
        ValueError
            If ``schema`` is not one of the supported schema identifiers.
        """
        if schema not in _SUPPORTED_SCHEMAS:
            supported = ", ".join(sorted(_SUPPORTED_SCHEMAS))
            raise ValueError(f"schema must be one of {{{supported}}}, got: {schema}")
        return schema

    @staticmethod
    def _span_to_legacy_dict(span: TextSpan) -> Dict[str, Any]:
        """Serialize one span for legacy export (a plain ``model_dump``)."""
        return span.model_dump()

    @classmethod
    def _line_to_dict(cls, line: Line, schema: str) -> Dict[str, Any]:
        """Serialize ``line``; the legacy schema stores its spans under ``words``."""
        if schema == _SCHEMA_CURRENT:
            return line.model_dump()

        return {
            "words": [cls._span_to_legacy_dict(span) for span in line.text_spans],
            "order": line.order,
        }

    @classmethod
    def _block_to_dict(cls, block: Block, schema: str) -> Dict[str, Any]:
        """Serialize ``block``; the legacy schema stores flat spans under ``words``."""
        if schema == _SCHEMA_CURRENT:
            return block.model_dump()

        return {
            "lines": [cls._line_to_dict(line, schema) for line in block.lines],
            "words": [cls._span_to_legacy_dict(span) for span in block.text_spans],
            "order": block.order,
        }

    def to_dict(self, schema: str = _SCHEMA_CURRENT) -> Dict[str, Any]:
        """
        Export Page to a plain Python dictionary.

        Parameters
        ----------
        schema : {"v0_1_11", "v0_1_10"}, optional
            Output schema version. Default is ``"v0_1_11"``.

        Returns
        -------
        Dict[str, Any]
            Dictionary representation of the page in the requested schema.

        Raises
        ------
        ValueError
            If ``schema`` is not a supported schema identifier.
        """
        schema = self._normalize_schema(schema)
        if schema == _SCHEMA_CURRENT:
            return self.model_dump()

        return {
            "blocks": [self._block_to_dict(block, schema) for block in self.blocks]
        }

    def to_json(
        self,
        path: Optional[Union[str, Path]] = None,
        indent: int = 2,
        schema: str = _SCHEMA_CURRENT,
    ) -> str:
        """
        Export Page to JSON.

        Parameters
        ----------
        path : str or Path, optional
            If provided, saves JSON to file.
        indent : int, optional
            JSON indentation. Default is 2.
        schema : {"v0_1_11", "v0_1_10"}, optional
            Output schema version. Default is ``"v0_1_11"``.

        Returns
        -------
        str
            JSON string representation.

        Raises
        ------
        ValueError
            If ``schema`` is not a supported schema identifier.

        Examples
        --------
        >>> page.to_json("result.json")  # save to file
        >>> json_str = page.to_json()    # get as string
        >>> legacy_json = page.to_json(schema="v0_1_10")
        """
        data = self.to_dict(schema=schema)
        # ensure_ascii=False keeps non-ASCII text readable in the output.
        json_str = json.dumps(data, ensure_ascii=False, indent=indent)
        if path:
            Path(path).write_text(json_str, encoding="utf-8")
        return json_str

    @classmethod
    def from_json(cls, source: Union[str, Path]) -> "Page":
        """
        Load Page from JSON file or string.

        A ``Path`` argument is always read as a file. A ``str`` argument is
        read as a file when it names an existing filesystem entry and is
        parsed as an inline JSON document otherwise.

        Parameters
        ----------
        source : str or Path
            Path to JSON file or JSON string.

        Returns
        -------
        Page
            Loaded Page object.

        Raises
        ------
        FileNotFoundError
            If ``source`` is a ``Path`` that does not exist.
        json.JSONDecodeError
            If the resolved text is not valid JSON.
        pydantic.ValidationError
            If the decoded data does not match the Page model.

        Examples
        --------
        >>> page = Page.from_json("result.json")
        >>> page = Page.from_json('{"blocks": [...]}')
        """
        if isinstance(source, Path):
            # An explicit Path can only mean "read this file"; a missing file
            # is reported as such instead of being handed to the JSON parser.
            text = source.read_text(encoding="utf-8")
        else:
            try:
                names_existing_path = Path(source).exists()
            except (OSError, ValueError):
                # Probing an inline JSON document as if it were a path can
                # itself fail on some platforms / Python versions (e.g.
                # "File name too long"); such input is inline JSON.
                names_existing_path = False

            if names_existing_path:
                text = Path(source).read_text(encoding="utf-8")
            else:
                text = source

        return cls.model_validate(json.loads(text))