Source code for manuscript.data.structures

import json
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

from pydantic import BaseModel, ConfigDict, Field, model_validator

# Schema identifier used by default when exporting (see Page.to_dict / Page.to_json).
_SCHEMA_CURRENT = "v0_1_11"
# Older schema identifier; its export uses a "words" key where the current
# schema uses "text_spans" (see Page._line_to_dict / Page._block_to_dict).
_SCHEMA_LEGACY = "v0_1_10"
# Every schema name accepted by Page._normalize_schema.
_SUPPORTED_SCHEMAS = {_SCHEMA_CURRENT, _SCHEMA_LEGACY}


class TextSpan(BaseModel):
    """
    One detected (and optionally recognized) region of text.

    This is the smallest OCR unit handled by the pipeline. Depending on the
    detector, a span may cover a single word, an entire text line, or any
    other contiguous piece of text.

    Attributes
    ----------
    polygon : List[Tuple[float, float]]
        Polygon vertices (x, y), ordered clockwise. Arbitrary polygons with
        4 or more points are supported. For quadrilateral text regions the
        canonical order is TL -> TR -> BR -> BL (Top-Left, Top-Right,
        Bottom-Right, Bottom-Left).
    detection_confidence : float
        Confidence score reported by the detector (0.0 to 1.0).
    text : str, optional
        Recognized text content (populated by the OCR pipeline). None when
        only detection was performed.
    recognition_confidence : float, optional
        Confidence score reported by the recognizer (0.0 to 1.0). None when
        only detection was performed.
    order : int, optional
        Position of the span inside its line after sorting. None before
        sorting.

    Examples
    --------
    >>> text_span = TextSpan(
    ...     polygon=[(10, 20), (100, 20), (100, 40), (10, 40)],
    ...     detection_confidence=0.95,
    ...     text="Hello",
    ...     recognition_confidence=0.98
    ... )
    >>> print(text_span.text)
    Hello
    """

    # Unknown keys are rejected instead of being silently dropped.
    model_config = ConfigDict(extra="forbid")

    polygon: List[Tuple[float, float]] = Field(
        ...,
        description=(
            "Polygon vertices (x, y), ordered clockwise. "
            "Supports arbitrary polygons with 4 or more points. "
            "For quadrilateral text regions: TL -> TR -> BR -> BL."
        ),
        min_length=4,
    )
    detection_confidence: float = Field(
        ...,
        description="Text detection confidence score from detector",
        ge=0.0,
        le=1.0,
    )
    text: Optional[str] = Field(
        default=None,
        description="Recognized text content (populated by OCR pipeline)",
    )
    recognition_confidence: Optional[float] = Field(
        default=None,
        description="Text recognition confidence score from recognizer",
        ge=0.0,
        le=1.0,
    )
    order: Optional[int] = Field(
        default=None,
        description="Text span position inside the line after sorting. None before sorting.",
    )
def __getattr__(name: str):
    """
    Module-level attribute hook (PEP 562) exposing the legacy name ``Word``.

    Parameters
    ----------
    name : str
        Attribute name that was not found in the module namespace.

    Returns
    -------
    type
        ``TextSpan`` when ``name`` is ``"Word"``.

    Raises
    ------
    AttributeError
        For any other attribute name.
    """
    if name != "Word":
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    return TextSpan
class Line(BaseModel):
    """
    A single text line made of one or more text spans.

    Attributes
    ----------
    text_spans : List[TextSpan]
        Text spans that belong to the line.
    order : int, optional
        Position of the line inside a block or page after sorting. None
        before sorting.

    Examples
    --------
    >>> line = Line(text_spans=[
    ...     TextSpan(
    ...         polygon=[(10, 20), (50, 20), (50, 40), (10, 40)],
    ...         detection_confidence=0.95,
    ...         text="Hello",
    ...     ),
    ...     TextSpan(
    ...         polygon=[(60, 20), (110, 20), (110, 40), (60, 40)],
    ...         detection_confidence=0.97,
    ...         text="World",
    ...     ),
    ... ])
    >>> print(len(line.text_spans))
    2
    """

    # Unknown keys are rejected instead of being silently dropped.
    model_config = ConfigDict(extra="forbid")

    text_spans: List[TextSpan] = Field(
        default_factory=list,
        description="List of text spans in the line.",
    )
    order: Optional[int] = Field(
        default=None,
        description="Line position inside a block or page after sorting. None before sorting.",
    )

    @model_validator(mode="before")
    @classmethod
    def _accept_legacy_words_key(cls, data: Any) -> Any:
        """
        Rewrite the legacy ``words`` input key to ``text_spans``.

        Non-dict input and dicts without ``words`` pass through untouched.
        The caller's dict is never mutated; a new dict is returned.

        Raises
        ------
        ValueError
            If both ``words`` and ``text_spans`` are given with different
            values.
        """
        if not (isinstance(data, dict) and "words" in data):
            return data

        spans_from_words = data["words"]
        payload = {key: value for key, value in data.items() if key != "words"}
        if "text_spans" not in payload:
            payload["text_spans"] = spans_from_words
        elif payload["text_spans"] != spans_from_words:
            raise ValueError(
                "Line received both 'words' and 'text_spans' with different values."
            )
        return payload

    @property
    def words(self) -> List[TextSpan]:
        """Backward-compatible alias for ``text_spans``."""
        return self.text_spans

    @words.setter
    def words(self, value: List[TextSpan]) -> None:
        self.text_spans = value
class Block(BaseModel):
    """
    A logical text block (e.g., paragraph, column).

    Attributes
    ----------
    lines : List[Line]
        Text lines that make up the block.
    text_spans : List[TextSpan], optional
        Optional flat list of text spans accepted as shorthand input. When
        ``lines`` is empty and ``text_spans`` is not, the spans are wrapped
        into a single line.
    order : int, optional
        Reading-order position of the block after sorting. None before
        sorting.

    Examples
    --------
    >>> block = Block(lines=[
    ...     Line(text_spans=[
    ...         TextSpan(
    ...             polygon=[(10, 20), (50, 20), (50, 40), (10, 40)],
    ...             detection_confidence=0.95,
    ...             text="Line 1",
    ...         )
    ...     ]),
    ...     Line(text_spans=[
    ...         TextSpan(
    ...             polygon=[(10, 50), (50, 50), (50, 70), (10, 70)],
    ...             detection_confidence=0.97,
    ...             text="Line 2",
    ...         )
    ...     ]),
    ... ])
    >>> print(len(block.lines))
    2
    """

    # Unknown keys are rejected instead of being silently dropped.
    model_config = ConfigDict(extra="forbid")

    lines: List[Line] = Field(default_factory=list)
    text_spans: List[TextSpan] = Field(
        default_factory=list,
        description=(
            "Optional flat list of text spans. Use 'lines' for structured output."
        ),
    )
    order: Optional[int] = Field(
        default=None,
        description="Block reading-order position after sorting. None before sorting.",
    )

    @model_validator(mode="before")
    @classmethod
    def _accept_legacy_words_key(cls, data: Any) -> Any:
        """
        Rewrite the legacy ``words`` input key to ``text_spans``.

        Non-dict input and dicts without ``words`` pass through untouched.
        The caller's dict is never mutated; a new dict is returned.

        Raises
        ------
        ValueError
            If both ``words`` and ``text_spans`` are given with different
            values.
        """
        has_legacy_key = isinstance(data, dict) and "words" in data
        if not has_legacy_key:
            return data

        payload = dict(data)
        spans_from_words = payload.pop("words")
        if "text_spans" not in payload:
            payload["text_spans"] = spans_from_words
            return payload
        if payload["text_spans"] != spans_from_words:
            raise ValueError(
                "Block received both 'words' and 'text_spans' with different values."
            )
        return payload

    def __init__(self, **data):
        """Initialize Block, normalizing flat ``text_spans`` into one line."""
        super().__init__(**data)
        # Nothing to normalize when lines already exist or no flat spans came in.
        if self.lines or not self.text_spans:
            return
        self.lines = [Line(text_spans=self.text_spans)]

    @property
    def words(self) -> List[TextSpan]:
        """Backward-compatible alias for flat ``text_spans`` input."""
        return self.text_spans

    @words.setter
    def words(self, value: List[TextSpan]) -> None:
        self.text_spans = value
class Page(BaseModel):
    """
    A document page containing blocks of text.

    For a full visual diagram of the data model, see:
    ``DATA_MODEL.md`` located in the same module directory.

    Attributes
    ----------
    blocks : List[Block]
        List of text blocks on the page.

    Examples
    --------
    >>> page = Page(blocks=[
    ...     Block(lines=[
    ...         Line(text_spans=[
    ...             TextSpan(
    ...                 polygon=[(10, 20), (50, 20), (50, 40), (10, 40)],
    ...                 detection_confidence=0.95,
    ...                 text="Hello",
    ...             )
    ...         ])
    ...     ])
    ... ])
    >>> print(len(page.blocks))
    1
    """

    # Unknown keys are rejected instead of being silently dropped.
    model_config = ConfigDict(extra="forbid")

    blocks: List[Block]

    @staticmethod
    def _normalize_schema(schema: str) -> str:
        """Return ``schema`` unchanged, or raise ValueError if it is unsupported."""
        if schema not in _SUPPORTED_SCHEMAS:
            supported = ", ".join(sorted(_SUPPORTED_SCHEMAS))
            raise ValueError(f"schema must be one of {{{supported}}}, got: {schema}")
        return schema

    @staticmethod
    def _span_to_legacy_dict(span: TextSpan) -> Dict[str, Any]:
        """Dump a text span for the legacy schema (same fields as the current one)."""
        return span.model_dump()

    @classmethod
    def _line_to_dict(cls, line: Line, schema: str) -> Dict[str, Any]:
        """Dump a line; the legacy schema stores its spans under ``"words"``."""
        if schema == _SCHEMA_CURRENT:
            return line.model_dump()
        return {
            "words": [cls._span_to_legacy_dict(span) for span in line.text_spans],
            "order": line.order,
        }

    @classmethod
    def _block_to_dict(cls, block: Block, schema: str) -> Dict[str, Any]:
        """Dump a block; the legacy schema stores flat spans under ``"words"``."""
        if schema == _SCHEMA_CURRENT:
            return block.model_dump()
        return {
            "lines": [cls._line_to_dict(line, schema) for line in block.lines],
            "words": [cls._span_to_legacy_dict(span) for span in block.text_spans],
            "order": block.order,
        }

    def to_dict(self, schema: str = _SCHEMA_CURRENT) -> Dict[str, Any]:
        """
        Export Page to a plain Python dictionary.

        Parameters
        ----------
        schema : {"v0_1_11", "v0_1_10"}, optional
            Output schema version. Default is ``"v0_1_11"``.

        Returns
        -------
        dict
            Dictionary representation of the page in the requested schema.

        Raises
        ------
        ValueError
            If ``schema`` is not a supported schema version.
        """
        schema = self._normalize_schema(schema)
        if schema == _SCHEMA_CURRENT:
            return self.model_dump()
        return {
            "blocks": [self._block_to_dict(block, schema) for block in self.blocks]
        }

    def to_json(
        self,
        path: Optional[Union[str, Path]] = None,
        indent: int = 2,
        schema: str = _SCHEMA_CURRENT,
    ) -> str:
        """
        Export Page to JSON.

        Parameters
        ----------
        path : str or Path, optional
            If provided, saves JSON to file.
        indent : int, optional
            JSON indentation. Default is 2.
        schema : {"v0_1_11", "v0_1_10"}, optional
            Output schema version. Default is ``"v0_1_11"``.

        Returns
        -------
        str
            JSON string representation.

        Raises
        ------
        ValueError
            If ``schema`` is not a supported schema version.

        Examples
        --------
        >>> page.to_json("result.json")  # save to file
        >>> json_str = page.to_json()  # get as string
        >>> legacy_json = page.to_json(schema="v0_1_10")
        """
        data = self.to_dict(schema=schema)
        json_str = json.dumps(data, ensure_ascii=False, indent=indent)
        if path:
            Path(path).write_text(json_str, encoding="utf-8")
        return json_str

    @classmethod
    def from_json(cls, source: Union[str, Path]) -> "Page":
        """
        Load Page from JSON file or string.

        A ``Path`` argument is always read as a file. A ``str`` argument is
        read as a file only when it names an existing regular file;
        otherwise it is parsed as JSON text.

        Parameters
        ----------
        source : str or Path
            Path to JSON file or JSON string.

        Returns
        -------
        Page
            Loaded Page object.

        Raises
        ------
        FileNotFoundError
            If ``source`` is a ``Path`` that does not exist.
        json.JSONDecodeError
            If the file content or the string is not valid JSON.

        Examples
        --------
        >>> page = Page.from_json("result.json")
        >>> page = Page.from_json('{"blocks": [...]}')
        """
        if isinstance(source, Path):
            # An explicit Path can only mean a file; let read_text report
            # a missing file instead of handing the Path to json.loads.
            text = source.read_text(encoding="utf-8")
        else:
            candidate = Path(source)
            try:
                is_file = candidate.is_file()
            except (OSError, ValueError):
                # Probing a long JSON payload as a filesystem path can fail
                # (e.g. "File name too long"); such input is JSON text.
                is_file = False
            text = candidate.read_text(encoding="utf-8") if is_file else source
        return cls.model_validate(json.loads(text))