Руководство по использованию Pipeline
Класс Pipeline в manuscript-ocr спроектирован для работы с любыми детекторами, распознавателями и корректорами, реализующими простой интерфейс.
Требования к детектору
Класс детектора должен реализовать метод predict, который принимает изображение и возвращает словарь с ключом "page":
def predict(self, image) -> Dict[str, Any]:
    """
    Run detection on one image.

    Parameters:
    - image: file path (str) or numpy array (H, W, 3) in uint8

    Returns dictionary:
        {
            "page": Page  # Page object with detection results
        }
    """
    ...
Структура результата
Результат должен содержать объект Page с иерархией: Page → Block → Line → Word
См. src/manuscript/data/structures.py для подробной документации по структурам данных.
Минимальный пример создания Page:
from manuscript.data import Word, Line, Block, Page
# Build the hierarchy bottom-up: Word -> Line -> Block -> Page

# Create a word with coordinates and detection confidence
word = Word(
    polygon=[(10, 20), (100, 20), (100, 40), (10, 40)],
    detection_confidence=0.95
)

# Group words into a line
line = Line(words=[word])

# Group lines into a block
block = Block(lines=[line])

# Create a page
page = Page(blocks=[block])
Требования к распознавателю
Класс распознавателя должен реализовать метод predict, который принимает список изображений и возвращает список результатов:
def predict(self, images: List[np.ndarray]) -> List[Dict[str, Any]]:
    """
    Recognize text on a list of word images.

    Parameters:
    - images: list of numpy arrays (RGB word images)

    Returns list of dictionaries:
        [
            {"text": "word1", "confidence": 0.95},
            {"text": "word2", "confidence": 0.92},
            ...
        ]
    """
    ...
Пример:
class MyRecognizer:
    """Example recognizer: returns one result dictionary per input image."""

    def predict(self, images):
        """
        Parameters:
        - images: list of word images

        Returns a list with one {"text": ..., "confidence": ...}
        dictionary per image, in input order.
        """
        return [self._recognize(img) for img in images]

    def _recognize(self, img):
        # Your recognition logic
        text = "recognized_text"
        confidence = 0.92
        return {"text": text, "confidence": confidence}
Требования к корректору
Класс корректора должен реализовать метод predict, который принимает объект Page и возвращает исправленный Page:
def predict(self, page: Page) -> Page:
    """
    Correct the recognized text of a page.

    Parameters:
    - page: Page object with recognized text

    Returns:
    - Page: Page object with corrected text
    """
    ...
Пример:
from manuscript.data import Page
class MyCorrector:
    """Example corrector: applies _correct to every non-empty word text."""

    def predict(self, page: Page) -> Page:
        # Edit a deep copy and return it instead of the page passed in
        corrected = page.model_copy(deep=True)
        words = (
            word
            for block in corrected.blocks
            for line in block.lines
            for word in line.words
        )
        for word in words:
            if not word.text:
                # Nothing to correct for words without text
                continue
            # Your correction logic
            word.text = self._correct(word.text)
        return corrected

    def _correct(self, text: str) -> str:
        # Text correction logic
        return text
Встроенный корректор CharLM
CharLM — это символьная языковая модель на основе Transformer для исправления ошибок OCR:
from manuscript.correctors import CharLM
# With default settings
corrector = CharLM()

# With custom parameters (replaces the default-configured corrector above)
corrector = CharLM(
    weights="prereform_charlm_g1",  # or "modern_charlm_g1"
    mask_threshold=0.05,            # confidence threshold for correction
    apply_threshold=0.95,           # minimum model confidence
    max_edits=2,                    # max edits per word
    min_word_len=4,                 # min word length for correction
    lexicon="prereform_words"       # lexicon of known words
)
Примеры совместимых реализаций
Полный пример детектора
from manuscript.data import Word, Line, Block, Page
class MyDetector:
    """Example detector that returns a fixed page with two words."""

    def predict(self, image):
        # Your image detection logic
        # ...

        # Create result: one (polygon, detection confidence) pair per word
        detections = [
            ([(10, 20), (100, 20), (100, 40), (10, 40)], 0.95),
            ([(110, 20), (200, 20), (200, 40), (110, 40)], 0.92),
        ]
        words = [
            Word(polygon=polygon, detection_confidence=confidence)
            for polygon, confidence in detections
        ]

        # Wrap the words into the Line -> Block -> Page hierarchy
        page = Page(blocks=[Block(lines=[Line(words=words)])])
        return {"page": page}
Использование кастомных компонентов
from manuscript import Pipeline
from my_package import MyDetector, MyRecognizer, MyCorrector
# Use custom detector, recognizer and corrector
detector = MyDetector()
recognizer = MyRecognizer()
corrector = MyCorrector()

pipeline = Pipeline(
    detector=detector,
    recognizer=recognizer,
    corrector=corrector
)

result = pipeline.predict("document.jpg")
Примеры использования Pipeline
Базовое использование
from manuscript import Pipeline
# Initialize with default models
pipeline = Pipeline()

# Process image; the Page is stored under the "page" key of the result
result = pipeline.predict("document.jpg")
page = result["page"]

# Extract text
text = pipeline.get_text(page)
print(text)
Только детекция (без распознавания)
# Detection only: recognition is switched off for this call
result = pipeline.predict("document.jpg", recognize_text=False)
page = result["page"]

# Words have polygon and detection_confidence, but no text
detected_words = (
    word
    for block in page.blocks
    for line in block.lines
    for word in line.words
)
for word in detected_words:
    print(f"Polygon: {word.polygon}, Confidence: {word.detection_confidence}")
С визуализацией
# With vis=True the call is unpacked into the result and a visualization image
result, vis_img = pipeline.predict("document.jpg", vis=True)
vis_img.save("output_visualization.jpg")
Промежуточные результаты
from manuscript.correctors import CharLM
pipeline = Pipeline(corrector=CharLM())
result = pipeline.predict("document.jpg")

# After predict(), the per-stage pages are read from pipeline attributes

# Result after detection (before recognition)
detection_page = pipeline.last_detection_page

# Result after recognition (before correction)
recognition_page = pipeline.last_recognition_page

# Result after correction (None if corrector not used)
correction_page = pipeline.last_correction_page
Экспорт/импорт Page в JSON
page = result["page"]

# Save to file
page.to_json("result.json")

# Get as string
json_str = page.to_json()

# Load from file
from manuscript.data import Page
page = Page.from_json("result.json")

# Load from string (reuses the JSON string produced by to_json() above,
# so the example runs as written)
page = Page.from_json(json_str)
С профилированием
# Prints execution time for each stage
result = pipeline.predict("document.jpg", profile=True)

# Example output (timings are illustrative):
# Detection: 0.123s
# Load image for crops: 0.005s
# Extract 45 crops: 0.012s
# Recognition: 0.234s
# Pipeline total: 0.374s
Пакетная обработка
images = ["page1.jpg", "page2.jpg", "page3.jpg"]
results = pipeline.process_batch(images)

# Print the extracted text of every result
for result in results:
    print(pipeline.get_text(result["page"]))
Настройка компонентов
Замена детектора или распознавателя
from manuscript import Pipeline
# Custom detector only; the recognizer stays the default one
from my_package import MyCustomDetector
pipeline = Pipeline(detector=MyCustomDetector())

# Custom recognizer only; the detector stays the default one
from my_package import MyCustomRecognizer
pipeline = Pipeline(recognizer=MyCustomRecognizer())

# Both components custom
pipeline = Pipeline(detector=MyCustomDetector(), recognizer=MyCustomRecognizer())
Настройка встроенных моделей
from manuscript import Pipeline
from manuscript.detectors import EAST
from manuscript.recognizers import TRBA
# EAST with settings
detector = EAST(
    weights="east_50_g1",  # weight selection
    score_thresh=0.8,      # confidence threshold
    nms_thresh=0.2,        # NMS threshold
    device="cpu"           # device (cpu/cuda)
)

# TRBA with settings
recognizer = TRBA(
    weights="trba_lite_g1",  # weight selection
    device="cuda"            # GPU for acceleration
)

# Pass components by keyword, as in the other Pipeline examples
pipeline = Pipeline(detector=detector, recognizer=recognizer)
Фильтрация по размеру
# Ignore text blocks smaller than 10 pixels
# NOTE(review): confirm which dimension (width, height or both) the limit applies to
pipeline = Pipeline(min_text_size=10)
Управление автоматическим поворотом
# Enable automatic rotation of vertical text (default)
# NOTE(review): 1.5 is presumably a height/width ratio threshold — confirm
pipeline = Pipeline(rotate_threshold=1.5)

# Disable automatic rotation (rotate_threshold=0)
pipeline = Pipeline(rotate_threshold=0)