This commit is contained in:
Mia Herkt 2022-08-28 20:00:13 +02:00
commit accf3e5795
Signed by: mia
GPG Key ID: 72E154B8622EC191
4 changed files with 405 additions and 0 deletions

22
README.md Normal file
View File

@ -0,0 +1,22 @@
# scanproc
This is a tool to process high-resolution scans with as little user
input as possible. It produces decently compressed PDF files with
monochrome/color separation and more or less accurate text layers.
## Requirements
In addition to the Python modules listed in `requirements.txt`,
scanproc requires [jbig2](https://github.com/agl/jbig2enc) to
be installed in your `$PATH`.
## Usage
`scanproc.py [-h] [-l LANGS] [-d DPI] [--debug] output [imgs ...]`
Usage is simple. You pass a list of
[EasyOCR language codes](https://www.jaided.ai/easyocr/) to use
with `-l` (default English only; pass multiple times for more than
one language), the image resolution in DPI with `-d` (default 600;
required for filter kernel scaling and correct page size), followed
by the output PDF file and the input image files.

180
pdfutil.py Executable file
View File

@ -0,0 +1,180 @@
#!/usr/bin/env python3
import struct
import fitz
import cv2 as cv
class Dict:
    """Minimal PDF dictionary that serialises itself to bytes.

    Keys and values are ``bytes``: keys are stored without the leading
    slash, values are stored already in their final PDF syntax.
    """

    def __init__(self, values=None):
        """Copy the entries of *values* into a fresh dictionary.

        ``None`` (the default) yields an empty dictionary.  It replaces the
        former ``{}`` default so that no mutable object is shared through
        the function signature.
        """
        self.d = {}
        if values is not None:
            self.d.update(values)

    def __bytes__(self):
        """Return ``<< /Key value ... >>`` with one entry per line."""
        parts = [b"<< "]
        parts.extend(
            b"/" + key + b" " + value + b"\n" for key, value in self.d.items()
        )
        parts.append(b">>\n")
        return b"".join(parts)
class Obj:
    """One indirect PDF object: a dictionary plus an optional stream.

    ``idx`` starts at 0 and is meant to be assigned by whoever numbers the
    object (``Doc.add_object`` in this file).
    """

    def __init__(self, d=None, stream=None):
        """Build the object from dictionary entries *d* and raw *stream* bytes.

        The entries are copied before ``Length`` is added.  Previously the
        argument itself was modified, which altered the caller's dict and,
        when *d* was omitted, the shared ``{}`` default, so later objects
        inherited a stale ``Length``.
        """
        entries = {} if d is None else dict(d)
        if stream is not None:
            entries[b"Length"] = str(len(stream)).encode()
        self.d = Dict(entries)
        self.stream = stream
        self.idx = 0

    def __bytes__(self):
        """Serialise the dictionary, the stream (if any) and ``endobj``."""
        parts = [bytes(self.d)]
        if self.stream is not None:
            parts.extend((b"stream\n", self.stream, b"\nendstream\n"))
        parts.append(b"endobj\n")
        return b"".join(parts)
class Doc:
    """Ordered collection of PDF objects that serialises to a PDF 1.4 file."""

    def __init__(self):
        self.objs = []   # every object, in numbering order
        self.pages = []  # objects registered through add_page()

    def add_object(self, o):
        """Append *o*, assign it the next 1-based object number, return it."""
        o.idx = len(self.objs) + 1
        self.objs.append(o)
        return o

    def add_page(self, o):
        """Record *o* as a page and add it like any other object."""
        self.pages.append(o)
        return self.add_object(o)

    def __bytes__(self):
        """Serialise header, objects, cross-reference table and trailer.

        All pieces are joined with a single newline, so the byte offset of
        each piece is the running total of ``len(piece) + 1``.
        """
        lines = [b"%PDF-1.4"]
        position = len(lines[0]) + 1
        offsets = []
        for o in self.objs:
            offsets.append(position)
            for chunk in (f"{o.idx} 0 obj".encode(), bytes(o)):
                lines.append(chunk)
                position += len(chunk) + 1

        xrefstart = position
        entries = len(offsets) + 1  # +1 for the free-list head entry
        lines.append(b"xref")
        lines.append(f"0 {entries}".encode())
        # Each xref entry must be exactly 20 bytes: 19 here plus the joining "\n".
        lines.append(b"0000000000 65535 f ")
        lines.extend(f"{off:010d} 00000 n ".encode() for off in offsets)
        lines.append(b"")
        lines.append(b"trailer")
        lines.append(f"<< /Size {entries}\n/Root 1 0 R >>".encode())
        lines.append(b"startxref")
        lines.append(str(xrefstart).encode())
        # The end-of-file marker has two percent signs; it was "%EOF" before.
        lines.append(b"%%EOF")
        return b"\n".join(lines)
def ref(x):
    """Return the PDF indirect reference ``<x> 0 R`` as bytes."""
    reference = "{} 0 R".format(x)
    return reference.encode()
def mkpdf(symtab, pageblobs, colorimgs, texts, dpi):
    """Assemble the final PDF and return it as bytes.

    symtab    -- path of the file embedded as the shared /JBIG2Globals stream.
    pageblobs -- paths of the per-page JBIG2 streams, one page each.  Bytes
                 11..26 of each blob are read as four big-endian uint32:
                 width, height, x resolution, y resolution.
    colorimgs -- per page, an iterable of ``(rect, img, mask)``: *rect* is
                 ``(x1, y1, x2, y2)`` in pixels, *img* an image file name and
                 *mask* the file whose bytes are passed as the image mask.
    texts     -- per page, an iterable of boxes where ``box[0]`` holds four
                 ``(x, y)`` corner points in pixels and ``box[1]`` the text.
    dpi       -- resolution used to convert pixels to points, and the
                 fallback when a blob stores a resolution of 0.
    """
    doc = Doc()
    doc.add_object(Obj({
        b'Type': b'/Catalog',
        b'Outlines': ref(2),
        b'Pages': ref(3),
    }))
    doc.add_object(Obj({
        b'Type': b'/Outlines',
        b'Count': b'0',
    }))
    pages = doc.add_object(Obj({b'Type': b'/Pages'}))

    with open(symtab, "rb") as stab:
        symd = doc.add_object(Obj({}, stab.read()))

    page_objs = []
    for f in pageblobs:
        with open(f, "rb") as pf:
            blob = pf.read()
        (w, h, xres, yres) = struct.unpack(">IIII", blob[11:27])
        # A stored resolution of 0 is replaced by the caller's dpi.
        if xres == 0:
            xres = dpi
        if yres == 0:
            yres = dpi

        xobj = doc.add_object(Obj({
            b'Type': b'/XObject',
            b'Subtype': b'/Image',
            b'Width': str(w).encode(),
            b'Height': str(h).encode(),
            b'ColorSpace': b'/DeviceGray',
            b'BitsPerComponent': b'1',
            b'Filter': b'/JBIG2Decode',
            b'DecodeParms': f' << /JBIG2Globals {symd.idx} 0 R >>'.encode(),
        }, blob))

        # Page size in points (1/72 inch).
        fw = float(w * 72) / xres
        fh = float(h * 72) / yres
        contents = doc.add_object(
            Obj({}, f'q {fw:.1f} 0 0 {fh:.1f} 0 0 cm /Im1 Do Q'.encode()))
        resources = doc.add_object(Obj({
            b'ProcSet': b'[/PDF /ImageB]',
            b'XObject': f'<< /Im1 {xobj.idx} 0 R >>'.encode(),
        }))
        page = doc.add_object(Obj({
            b'Type': b'/Page',
            # Refer to the real /Pages object instead of a hard-coded "3 0 R".
            b'Parent': ref(pages.idx),
            b'MediaBox': f'[ 0 0 {fw:.1f} {fh:.1f} ]'.encode(),
            b'Contents': ref(contents.idx),
            b'Resources': ref(resources.idx),
        }))
        page_objs.append(page)

    pages.d.d[b'Count'] = str(len(page_objs)).encode()
    pages.d.d[b'Kids'] = b'[' + b' '.join(ref(x.idx) for x in page_objs) + b']'

    # Name the type explicitly so opening does not depend on content sniffing.
    pdf = fitz.open(stream=bytes(doc), filetype="pdf")

    # Colour overlays on top of the monochrome page image.
    for p, imgs in enumerate(colorimgs):
        for rect, img, mask in imgs:
            with open(mask, "rb") as mf:
                m = mf.read()
            area = fitz.Rect(*[_px_to_pt(c, dpi) for c in rect])
            pdf[p].insert_image(area, filename=img, mask=m)

    # Text layer, written with render_mode=3 as in the original code.
    for p, txt in enumerate(texts):
        for box in txt:
            text = box[1]
            if not len(text):
                continue
            r = _text_rect(box[0], dpi)
            fs = r.height
            if fs <= 0:
                # Zero-height box: font size 0 would give a zero text
                # length and a division by zero below.
                continue
            w = fitz.get_text_length(text, fontsize=fs)
            if w <= 0:
                continue
            pivot = fitz.Point(r.x0, r.y0)
            # Stretch horizontally so the text spans the detected box width.
            matrix = fitz.Matrix(r.width / w, 1.0)
            pdf[p].insert_text(fitz.Point(r.x0, r.y0 + r.height * 0.8),
                               text, fontsize=fs,
                               morph=(pivot, matrix),
                               render_mode=3)

    return pdf.tobytes(garbage=4, clean=True, deflate=True)


def _px_to_pt(value, dpi):
    """Convert a pixel coordinate at *dpi* into points (1/72 inch)."""
    return (value * 72.0) / dpi


def _text_rect(bbox, dpi):
    """Return the axis-aligned rectangle, in points, around a 4-corner box.

    The same corner pairs as before are compared: corners 0/3 for the left
    edge, 0/1 for the top, 1/2 for the right and 2/3 for the bottom.
    """
    return fitz.Rect(
        min(_px_to_pt(bbox[0][0], dpi), _px_to_pt(bbox[3][0], dpi)),
        min(_px_to_pt(bbox[0][1], dpi), _px_to_pt(bbox[1][1], dpi)),
        max(_px_to_pt(bbox[1][0], dpi), _px_to_pt(bbox[2][0], dpi)),
        max(_px_to_pt(bbox[2][1], dpi), _px_to_pt(bbox[3][1], dpi)),
    )

5
requirements.txt Normal file
View File

@ -0,0 +1,5 @@
deskew
easyocr
entrypoint2
pyMuPDF
tqdm

198
scanproc.py Executable file
View File

@ -0,0 +1,198 @@
#!/usr/bin/env python3
import easyocr
import cv2 as cv
import numpy as np
import warnings
from tqdm import tqdm
from deskew import determine_skew
from PIL import Image, ImageOps, ImageEnhance
from entrypoint2 import entrypoint
from pathlib import Path
from tempfile import TemporaryDirectory
from subprocess import run
from pdfutil import mkpdf
warnings.filterwarnings("ignore")
def rotate(img, angle: float):
    """Rotate *img* by *angle* around its centre, keeping the original size.

    Pixels are resampled with cubic interpolation; areas uncovered by the
    rotation are filled by replicating the border.
    """
    height, width = img.shape[:2]
    pivot = (width // 2, height // 2)
    rotation = cv.getRotationMatrix2D(pivot, angle, 1.0)
    return cv.warpAffine(
        img,
        rotation,
        (width, height),
        flags=cv.INTER_CUBIC,
        borderMode=cv.BORDER_REPLICATE,
    )
def getRot(mask):
    """Estimate the skew angle of *mask* on a copy at most 500 px wide.

    Returns the angle reported by ``determine_skew``, or ``0.0`` when it
    reports none, so the result can always be passed on to ``rotate``.
    """
    (h, w) = mask.shape[:2]
    nw = min(w, 500)
    # Keep at least one row: a very wide, short image would otherwise
    # scale to a height of 0, which cv.resize rejects.
    nh = max(1, int(h * (nw / w)))
    sm = cv.resize(mask, (nw, nh))
    if sm.ndim == 3:
        # NOTE(review): deskew's documented usage passes a grayscale image,
        # but the caller in this file hands over a 3-channel one -- confirm.
        sm = cv.cvtColor(sm, cv.COLOR_BGR2GRAY)
    angle = determine_skew(sm)
    # determine_skew may return None when it cannot find an angle.
    return 0.0 if angle is None else angle
def getMono(img):
    """Return a black/white version of *img*.

    The image is converted to grayscale, thresholded with Otsu's method and
    then cleaned with a 3x3 morphological closing.
    """
    grayscale = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
    binary = cv.threshold(
        grayscale, 0, 255, cv.THRESH_BINARY + cv.THRESH_OTSU)[1]
    element = np.ones((3, 3), np.uint8)
    return cv.morphologyEx(binary, cv.MORPH_CLOSE, element)
def getColorMask(img, dpi):
    """Return a binary mask (0/255) of the saturated regions of *img*.

    Kernel sizes are derived from *dpi* and forced to be odd.
    """
    hsv_img = cv.cvtColor(img, cv.COLOR_BGR2HSV)
    # Any hue, but saturation >= 40 and value >= 10.
    selected = cv.inRange(
        hsv_img, np.array([0, 40, 10]), np.array([255, 255, 255]))

    # Erode once, then dilate five times with the same square kernel.
    side = int(dpi * 0.005) | 1  # "| 1" bumps an even size to the next odd
    element = np.ones((side, side), np.uint8)
    selected = cv.erode(selected, element)
    selected = cv.dilate(selected, element, iterations=5)

    # Blur, then threshold again with Otsu's method.
    blur = int(dpi * .1) | 1
    selected = cv.GaussianBlur(selected, (blur, blur), 0)
    return cv.threshold(
        selected, 0, 255, cv.THRESH_BINARY + cv.THRESH_OTSU)[1]
def autoColorContrast(img, mono, dpi):
    """Auto-contrast *img* and raise its lightness by 30 %.

    The contrast histogram is limited to the inverted, dilated *mono* mask;
    the dilation kernel size is derived from *dpi* and forced to be odd.
    """
    side = int(dpi * 0.005) | 1
    element = np.ones((side, side), np.uint8)
    region = cv.dilate(cv.bitwise_not(mono), element, iterations=5)

    source = Image.fromarray(cv.cvtColor(img, cv.COLOR_BGR2RGB))
    stretched = ImageOps.autocontrast(
        source, (20, 30), mask=Image.fromarray(region), preserve_tone=True)

    # NOTE(review): the array comes from a PIL image built after
    # COLOR_BGR2RGB yet is converted with COLOR_BGR2HLS, and the result goes
    # back through COLOR_HLS2RGB -- confirm the channel order is intended.
    hue, light, sat = cv.split(
        cv.cvtColor(np.asarray(stretched), cv.COLOR_BGR2HLS))
    light = np.clip(light.astype("float32") * 1.3, 0, 255).astype("uint8")
    return cv.cvtColor(cv.merge((hue, light, sat)), cv.COLOR_HLS2RGB)
def getColorSegments(img, mono, cmask):
    """Lazily yield one colour segment per contour found in *cmask*.

    Each item is ``((x1, y1, x2, y2), image_crop, mask_crop)`` where the
    crops are slices of *img* and *cmask* over the contour's bounding box.
    *mono* is accepted but not used.
    """
    outlines, _ = cv.findContours(
        cmask, cv.RETR_LIST, cv.CHAIN_APPROX_SIMPLE)
    for outline in outlines:
        left, top, width, height = cv.boundingRect(outline)
        right = left + width
        bottom = top + height
        image_crop = img[top:bottom, left:right]
        mask_crop = cmask[top:bottom, left:right]
        yield (left, top, right, bottom), image_crop, mask_crop
def unsharpMask(image, kernel_size=(5, 5), sigma=1.0, amount=1.0, threshold=0):
    """Sharpen *image* with an unsharp mask and return a uint8 array.

    kernel_size, sigma -- parameters of the Gaussian blur.
    amount             -- weight of the sharpening
                          (result = (amount + 1) * image - amount * blurred).
    threshold          -- when > 0, pixels whose absolute difference from
                          the blurred image is below it keep their
                          original value.
    """
    blurred = cv.GaussianBlur(image, kernel_size, sigma)
    sharpened = float(amount + 1) * image - float(amount) * blurred
    sharpened = np.clip(sharpened, 0, 255).round().astype(np.uint8)
    if threshold > 0:
        # Subtract in floating point: on uint8 input "image - blurred" wraps
        # around (e.g. 3 - 5 == 254), which made the contrast test wrong.
        difference = np.absolute(image.astype(np.float64) - blurred)
        np.copyto(sharpened, image, where=difference < threshold)
    return sharpened
def processImage(img, reader, dpi):
    """Run the per-page pipeline on the image file *img*.

    Steps: read, filter (sharpen, smooth, auto-contrast), detect and remove
    skew, OCR on the monochrome image, build the colour mask and the colour
    segments.

    Returns ``(mono, csegs, text)``: the monochrome page with the colour
    regions set to 255, the colour segments from ``getColorSegments`` and
    the result of ``reader.readtext``.

    Raises OSError when the image file cannot be read.
    """
    # Seven progress updates follow; the total used to be 8, so the bar
    # could never complete.
    with tqdm(total=7, leave=False) as t:
        t.set_description("Reading image")
        im = cv.imread(img)
        if im is None:
            # cv.imread signals failure by returning None; fail here with a
            # clear message instead of inside the first OpenCV call below.
            raise OSError(f"could not read image file: {img}")
        t.update()

        t.set_description("Filter")
        mono = getMono(im)
        im = cv.cvtColor(im, cv.COLOR_RGB2Lab)
        (L, a, b) = cv.split(im)
        ksiz = int(dpi*0.015)
        ksiz -= ksiz%2-1  # force an odd kernel size
        # Sharpen and smooth only the L channel.
        L = unsharpMask(L, kernel_size=(ksiz,ksiz), amount=2)
        L = cv.bilateralFilter(L, -1, 12, dpi*0.018)
        im = cv.cvtColor(cv.merge((L, a, b)), cv.COLOR_Lab2RGB)
        im = autoColorContrast(im, mono, dpi)
        t.update()

        t.set_description("Detect skew")
        angle = getRot(im)
        t.update()

        t.set_description("Deskew")
        im = rotate(im, angle)
        mono = rotate(mono, angle)
        t.update()

        t.set_description("OCR")
        text = reader.readtext(mono)
        t.update()

        t.set_description("Color mask")
        cmask = getColorMask(im, dpi)
        t.update()

        t.set_description("Color segments")
        csegs = getColorSegments(im, mono, cmask)
        t.update()

    # Blank the colour regions in the monochrome layer.
    mono[cmask==255] = 255
    return mono, csegs, text
@entrypoint
def main(output, langs=["en"], dpi=600, *imgs):
    """Process scanned images into a compressed, searchable PDF.

    :param output: path of the PDF file to write
    :param langs: EasyOCR language codes to use for text recognition
    :param dpi: resolution of the input images
    :param imgs: input image files, one per page
    """
    # NOTE(review): the list default for ``langs`` is kept on purpose; it is
    # presumably what makes entrypoint2 expose ``-l`` as a repeatable option.
    if not imgs:
        # Fail before loading the OCR models and before calling jbig2
        # with no input files.
        raise SystemExit("scanproc: no input images given")

    reader = easyocr.Reader(langs)
    with tqdm(total=3, leave=False) as t, TemporaryDirectory() as tmp:
        tp = Path(tmp)
        files = []
        colorimgs = []
        texts = []

        t.set_description("Process pages")
        with tqdm(total=len(imgs), leave=False) as pt:
            for pagen, img in enumerate(imgs):
                pt.set_description(f"Process {img}")
                mono, csegs, text = processImage(img, reader, dpi)

                fn = str(tp / f"p{pagen}.tif")
                files.append(fn)
                cv.imwrite(fn, mono)

                pimgs = []
                for i, (r, simg, smask) in enumerate(csegs):
                    bp = tp / f"p{pagen}_{i}.jpg"
                    mp = tp / f"p{pagen}_{i}_m.png"
                    cv.imwrite(str(bp), simg, [
                        cv.IMWRITE_JPEG_QUALITY, 90,
                        cv.IMWRITE_JPEG_OPTIMIZE, 1,
                        cv.IMWRITE_JPEG_PROGRESSIVE, 1])
                    cv.imwrite(str(mp), smask, [
                        cv.IMWRITE_PNG_BILEVEL, 1,
                        cv.IMWRITE_PNG_COMPRESSION, 9])
                    pimgs.append((r, bp, mp))
                colorimgs.append(pimgs)
                texts.append(text)
                pt.update()
        t.update()

        t.set_description("JBIG2 compress")
        # Runs in the temporary directory; the symbol table and page
        # streams are then read from output.sym and output.NNNN there.
        run(["jbig2", "-s", "-d", "-a", "-p", *files],
            capture_output=True, check=True, cwd=tp)
        symtab = tp / "output.sym"
        pageblobs = [tp / f"output.{p:04d}" for p in range(len(files))]
        t.update()

        t.set_description("Create PDF")
        # Build the whole PDF first: opening the output file truncates it,
        # so a failure in mkpdf must not happen while it is already open.
        pdf_bytes = mkpdf(symtab, pageblobs, colorimgs, texts, dpi)
        with open(output, "wb") as outf:
            outf.write(pdf_bytes)
        t.update()