Translated from the original Japanese article
Tech 3 min read
Solving NDLOCR Column Layout Recognition with Histogram Analysis
Contents
Goal
OCR processing of a vertical Japanese book (4-column layout)
Environment
| Item | Details |
|---|---|
| OS | Windows 11 |
| Python | 3.14.0 |
Attempt 1: NDLOCR Alone
Result: Partially works but layout recognition is weak
- Column detection fails, breaking reading order
- NDLOCR’s layout analysis was designed for older vertical-text books and struggles with modern multi-column layouts
Attempt 2: Layout Parser + NDLOCR
Result: Installation failure
- Layout Parser itself installs fine with `pip install layoutparser`
- Detectron2 fails to build on Windows: `pip install "detectron2@git+..."` → build failure
- C++ compiler issues in the Windows environment
Attempt 3: PyMuPDF + Histogram Analysis (Alternative)
Result: Works
- No Detectron2 needed
- A per-row histogram of dark pixels (a projection along the vertical axis) detects column boundaries (valleys)
- Fixed 4-column split → pass to NDLOCR
Final Setup
PDF
↓
extract_blocks.py (PyMuPDF + histogram analysis)
↓
4 split images (page01_1.jpg ~ page01_4.jpg)
↓
NDLOCR (OCR only)
↓
Text output
Requirements
- Python + PyMuPDF, PIL, numpy
- Docker + NDLOCR
- extract_blocks.py (histogram analysis script)
extract_blocks.py
import fitz # PyMuPDF
from PIL import Image
import io
import os
import numpy as np
# --- Configuration ---
# Input PDF and the directory that receives the split page images.
pdf_path = r"C:\ndlocr_work\input\test.pdf"
output_dir = r"C:\ndlocr_work\input\img"
# ---------------------

# Create the output directory up front; an already-existing one is fine.
os.makedirs(output_dir, exist_ok=True)

try:
    pdf_document = fitz.open(pdf_path)
except RuntimeError as e:
    # PyMuPDF reports open failures (missing file, damaged or empty data)
    # through its own RuntimeError subclasses, so catching the base class
    # covers them without naming a version-specific attribute.
    print(f"Error: Cannot open PDF file. Check the path: {e}")
    # Exit with a non-zero status so a calling script can detect the failure.
    raise SystemExit(1) from None

print(f"Starting to process {len(pdf_document)} pages.")
def find_split_point(histogram, center_y, search_range):
    """Find the center of the deepest valley near ``center_y``.

    The search window is ``[center_y - search_range, center_y + search_range)``
    clipped to the bounds of ``histogram``.  Within that window the minimum
    value is located; when several positions share the minimum, the middle
    one of them is chosen.

    Args:
        histogram: 1-D sequence of per-row values (NumPy array or list).
        center_y: Index around which to search.
        search_range: Half-width of the search window, in indices.

    Returns:
        The selected index into ``histogram`` as a plain ``int``.

    Raises:
        ValueError: If ``histogram`` is empty.
    """
    size = len(histogram)
    if size == 0:
        raise ValueError("histogram is empty")

    start = max(0, center_y - search_range)
    end = min(size, center_y + search_range)
    if start >= end:
        # Degenerate window (e.g. search_range == 0 on a very small image):
        # there is nothing to search, so fall back to the clamped center
        # instead of failing inside the minimum reduction.
        return int(min(max(center_y, 0), size - 1))

    # Extract the histogram segment within the search range.
    segment = np.asarray(histogram[start:end])

    # All positions that reach the valley bottom (the minimum value).
    valley_indices = np.flatnonzero(segment == segment.min())

    # Take the middle of those positions and map it back to the full
    # histogram; convert to a built-in int so callers do not see NumPy
    # scalar types when printing or passing the value on.
    return int(start + valley_indices[len(valley_indices) // 2])
# Split every page into 4 horizontal bands at the detected valleys and save
# each band as a JPEG.  The document is closed even if a page fails.
try:
    for page_no, page in enumerate(pdf_document, start=1):
        print(f"\n--- Page {page_no} ---")

        # 1. Render page at high resolution (3x zoom on both axes).
        pix = page.get_pixmap(matrix=fitz.Matrix(3, 3), alpha=False)
        # Build the PIL image directly from the raw RGB samples; going
        # through an in-memory JPEG would add a lossy encode/decode step
        # before the final JPEG save.
        img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

        # 2. Binarize: True where the gray level is darker than 200.
        gray = np.asarray(img.convert("L"))
        dark = gray < 200

        # 3. Per-row histogram: number of dark pixels in each row.
        histogram = np.count_nonzero(dark, axis=1)

        # 4. Detect 3 split points at valley centers, searching within
        #    +/-5% of the page height around the 25%, 50% and 75% marks.
        height, width = dark.shape
        search_range = int(height * 0.05)
        split_points = sorted(
            int(find_split_point(histogram, int(height * fraction), search_range))
            for fraction in (0.25, 0.50, 0.75)
        )
        print(f"Detected split point Y coordinates: {split_points}")

        # 5. Crop and save image segments between consecutive boundaries.
        boundaries = [0, *split_points, height]
        bands = zip(boundaries, boundaries[1:])
        for col_no, (y_start, y_end) in enumerate(bands, start=1):
            output_path = os.path.join(
                output_dir, f"page{page_no:02d}_{col_no}.jpg"
            )
            band_img = img.crop((0, y_start, width, y_end))
            band_img.save(output_path, "JPEG", quality=95)
            print(f" Column {col_no} saved: {output_path}")
finally:
    pdf_document.close()

print("\nAll processing complete!")