init
This commit is contained in:
commit
accf3e5795
4 changed files with 405 additions and 0 deletions
22
README.md
Normal file
22
README.md
Normal file
|
@ -0,0 +1,22 @@
|
|||
# scanproc
|
||||
|
||||
This is a tool to process high-resolution scans with as little user
|
||||
input as possible. It produces decently compressed PDF files with
|
||||
monochrome/color separation and more or less accurate text layers.
|
||||
|
||||
## Requirements
|
||||
|
||||
In addition to the Python modules listed in `requirements.txt`,
|
||||
scanproc requires [jbig2](https://github.com/agl/jbig2enc) to
|
||||
be installed in your `$PATH`.
|
||||
|
||||
## Usage
|
||||
|
||||
`scanproc.py [-h] [-l LANGS] [-d DPI] [--debug] output [imgs ...]`
|
||||
|
||||
Usage is simple. You pass a list of
|
||||
[EasyOCR language codes](https://www.jaided.ai/easyocr/) to use
|
||||
with `-l` (default English only; pass multiple times for more than
|
||||
one language), the image resolution in DPI with `-d` (required for filter kernel
|
||||
scaling and correct page size), followed by the output PDF file and
|
||||
the input image files.
|
180
pdfutil.py
Executable file
180
pdfutil.py
Executable file
|
@ -0,0 +1,180 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import struct
|
||||
import fitz
|
||||
import cv2 as cv
|
||||
|
||||
class Dict:
    """Minimal PDF dictionary that serializes itself to raw bytes.

    Keys are PDF names given as ``bytes`` without the leading slash; values
    are already-serialized PDF objects, also ``bytes``. The entries live in
    the plain dict ``self.d`` and may be edited there after construction.
    """

    def __init__(self, values=None):
        """Create the dictionary, optionally pre-filled from *values*.

        *values* may be anything ``dict.update`` accepts. It is copied into
        ``self.d``; the argument itself is never stored or modified.
        """
        # ``None`` instead of a literal ``{}`` default: a mutable default is
        # one object shared by every call.
        self.d = {}
        if values is not None:
            self.d.update(values)

    def __bytes__(self):
        """Return ``<< /Key value ... >>`` followed by a newline."""
        entries = [
            b"/" + key + b" " + value + b"\n"
            for key, value in self.d.items()
        ]
        return b"<< " + b"".join(entries) + b">>\n"
|
||||
|
||||
class Obj:
    """One PDF indirect object: a dictionary plus an optional stream.

    Attributes:
        d: ``Dict`` holding the object's dictionary entries.
        stream: raw stream payload as ``bytes``, or ``None`` for no stream.
        idx: object number; 0 until something else assigns it.
    """

    def __init__(self, d=None, stream=None):
        """Build the object from dictionary entries *d* and optional *stream*.

        When *stream* is given, a ``Length`` entry with its byte length is
        added to the dictionary.
        """
        # Work on a private copy. The previous ``d={}`` default was mutated
        # here, so a /Length written for one stream object leaked into every
        # later object built from the default, and a caller-supplied dict was
        # modified behind the caller's back.
        entries = dict(d) if d is not None else {}
        if stream is not None:
            entries[b"Length"] = str(len(stream)).encode()

        self.d = Dict(entries)
        self.stream = stream
        self.idx = 0

    def __bytes__(self):
        """Serialize dictionary, optional stream section and ``endobj``.

        The leading ``N 0 obj`` line is not part of this output.
        """
        parts = [bytes(self.d)]
        if self.stream is not None:
            parts += [b"stream\n", self.stream, b"\nendstream\n"]
        parts.append(b"endobj\n")
        return b"".join(parts)
|
||||
|
||||
class Doc:
    """Ordered collection of PDF objects that serializes to a complete file.

    Attributes:
        objs: every object added, in object-number order.
        pages: objects registered through ``add_page``.
    """

    def __init__(self):
        self.objs = []
        self.pages = []

    def add_object(self, o):
        """Append *o*, assign it the next 1-based object number, return it."""
        o.idx = len(self.objs) + 1
        self.objs.append(o)
        return o

    def add_page(self, o):
        """Record *o* in ``pages`` and add it like any other object."""
        self.pages.append(o)
        return self.add_object(o)

    def __bytes__(self):
        """Serialize header, all objects, xref table and trailer.

        The pieces are joined with a single ``\\n``, so the running offset
        grows by ``len(piece) + 1`` per piece. The trailer always names
        object 1 as the document root.
        """
        lines = [b"%PDF-1.4"]
        pos = len(lines[0]) + 1
        offsets = []

        for o in self.objs:
            # Byte offset of the "N 0 obj" line, needed for the xref table.
            offsets.append(pos)
            for piece in (f"{o.idx} 0 obj".encode(), bytes(o)):
                lines.append(piece)
                pos += len(piece) + 1

        xrefstart = pos
        size = len(offsets) + 1  # +1 for the mandatory free entry 0

        lines.append(b"xref")
        lines.append(f"0 {size}".encode())
        # Each xref entry must be exactly 20 bytes: the trailing space plus
        # the joining newline form the two-byte end-of-line.
        lines.append(b"0000000000 65535 f ")
        lines.extend(f"{off:010d} 00000 n ".encode() for off in offsets)

        lines.append(b"")
        lines.append(b"trailer")
        lines.append(f"<< /Size {size}\n/Root 1 0 R >>".encode())
        lines.append(b"startxref")
        lines.append(str(xrefstart).encode())
        # The end-of-file marker is "%%EOF"; a single "%" only starts a
        # comment and leaves the file without a valid terminator.
        lines.append(b"%%EOF")

        return b"\n".join(lines)
|
||||
|
||||
def ref(x):
    """Return the PDF indirect reference ``<x> 0 R`` as bytes."""
    return "{} 0 R".format(x).encode()
|
||||
|
||||
def mkpdf(symtab, pageblobs, colorimgs, texts, dpi):
    """Assemble the final PDF and return it as bytes.

    A skeleton PDF with one JBIG2 image per page is written by hand, then
    reopened with PyMuPDF to overlay the colour images and the OCR text.

    Args:
        symtab: path of the file whose content becomes the shared
            ``JBIG2Globals`` stream.
        pageblobs: paths of the per-page JBIG2 blobs, in page order. Width,
            height and x/y resolution are read as four big-endian 32-bit
            unsigned ints from bytes 11..26 of each blob (presumably the
            page-information segment -- confirm against the encoder output).
        colorimgs: per page, an iterable of ``(rect, img, mask)``: ``rect``
            is four pixel coordinates, ``img`` an image file name and
            ``mask`` the path of a file read and passed as the image mask.
        texts: per page, an iterable of boxes where ``box[0]`` holds four
            ``(x, y)`` corner points in pixels and ``box[1]`` the text.
        dpi: resolution used to convert pixels to PDF points (72 per inch);
            also the fallback when a blob reports a resolution of 0.

    Returns:
        The serialized PDF as ``bytes``.
    """
    doc = Doc()
    # Object numbers follow insertion order, so the catalog (object 1) can
    # forward-reference the outlines (2) and page tree (3) added right after.
    doc.add_object(Obj({
        b'Type': b'/Catalog',
        b'Outlines': ref(2),
        b'Pages': ref(3)
    }))
    doc.add_object(Obj({
        b'Type': b'/Outlines',
        b'Count': b'0'
    }))
    pages = Obj({b'Type': b'/Pages'})
    doc.add_object(pages)

    with open(symtab, "rb") as stab:
        symd = doc.add_object(Obj({}, stab.read()))

    page_objs = []
    for f in pageblobs:
        with open(f, "rb") as pf:
            blob = pf.read()
        (w, h, xres, yres) = struct.unpack(">IIII", blob[11:27])

        # A stored resolution of 0 falls back to the caller's dpi.
        if xres == 0:
            xres = dpi
        if yres == 0:
            yres = dpi

        xobj = Obj({
            b'Type': b'/XObject',
            b'Subtype': b'/Image',
            b'Width': str(w).encode(),
            b'Height': str(h).encode(),
            b'ColorSpace': b'/DeviceGray',
            b'BitsPerComponent': b'1',
            b'Filter': b'/JBIG2Decode',
            b'DecodeParms': f' << /JBIG2Globals {symd.idx} 0 R >>'.encode()
        }, blob)
        xobj = doc.add_object(xobj)

        # Page size in PDF points.
        fw = float(w * 72) / xres
        fh = float(h * 72) / yres

        # The content stream scales the unit-square image to the full page.
        contents = Obj({}, f'q {fw:.1f} 0 0 {fh:.1f} 0 0 cm /Im1 Do Q'.encode())
        contents = doc.add_object(contents)

        resources = Obj({
            b'ProcSet': b'[/PDF /ImageB]',
            b'XObject': f'<< /Im1 {xobj.idx} 0 R >>'.encode()
        })
        resources = doc.add_object(resources)

        page = Obj({
            b'Type': b'/Page',
            # Same bytes as the former hard-coded b'3 0 R', but tied to the
            # actual page-tree object.
            b'Parent': ref(pages.idx),
            b'MediaBox': f'[ 0 0 {fw:.1f} {fh:.1f} ]'.encode(),
            b'Contents': ref(contents.idx),
            b'Resources': ref(resources.idx)
        })
        doc.add_object(page)
        page_objs.append(page)

    # The page tree can only be completed once every page has its number.
    pages.d.d[b'Count'] = str(len(page_objs)).encode()
    pages.d.d[b'Kids'] = b'[' + b' '.join([ref(x.idx) for x in page_objs]) + b']'

    pdf = fitz.open(stream=bytes(doc))

    for p, imgs in enumerate(colorimgs):
        page = pdf[p]  # load the page once instead of once per image
        for rect, img, mask in imgs:
            with open(mask, "rb") as mf:
                m = mf.read()
            page.insert_image(fitz.Rect(*[(c*72.0)/dpi for c in rect]), filename=img, mask=m)

    for p, txt in enumerate(texts):
        page = pdf[p]  # load the page once instead of once per text box
        for box in txt:
            content = box[1]
            if not len(content):
                continue

            bbox = box[0]
            # Axis-aligned rectangle, in points, enclosing the four corners.
            r = fitz.Rect(
                min((bbox[0][0]*72.0)/dpi, (bbox[3][0]*72.0)/dpi),
                min((bbox[0][1]*72.0)/dpi, (bbox[1][1]*72.0)/dpi),
                max((bbox[1][0]*72.0)/dpi, (bbox[2][0]*72.0)/dpi),
                max((bbox[2][1]*72.0)/dpi, (bbox[3][1]*72.0)/dpi)
            )
            fs = r.height
            w = fitz.get_text_length(content, fontsize=fs)
            if w <= 0:
                # Degenerate box (e.g. zero height): there is nothing to
                # scale, and r.width / w would raise ZeroDivisionError.
                continue

            # Stretch the text horizontally so it spans the box width.
            pivot = fitz.Point(r.x0, r.y0)
            matrix = fitz.Matrix(r.width/w, 1.0)
            # Insertion point is 80% down the box (presumably an approximate
            # baseline); render_mode=3 makes the text invisible.
            page.insert_text(fitz.Point(r.x0, r.y0+r.height*0.8),
                             content, fontsize=fs,
                             morph=(pivot, matrix),
                             render_mode=3)

    return pdf.tobytes(garbage=4, clean=True, deflate=True)
|
5
requirements.txt
Normal file
5
requirements.txt
Normal file
|
@ -0,0 +1,5 @@
|
|||
deskew
|
||||
easyocr
|
||||
entrypoint2
|
||||
pyMuPDF
|
||||
tqdm
|
198
scanproc.py
Executable file
198
scanproc.py
Executable file
|
@ -0,0 +1,198 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import easyocr
|
||||
import cv2 as cv
|
||||
import numpy as np
|
||||
import warnings
|
||||
|
||||
from tqdm import tqdm
|
||||
from deskew import determine_skew
|
||||
from PIL import Image, ImageOps, ImageEnhance
|
||||
from entrypoint2 import entrypoint
|
||||
from pathlib import Path
|
||||
from tempfile import TemporaryDirectory
|
||||
from subprocess import run
|
||||
|
||||
from pdfutil import mkpdf
|
||||
|
||||
warnings.filterwarnings("ignore")
|
||||
|
||||
def rotate(img, angle: float):
    """Rotate *img* by *angle* about its centre, keeping the canvas size.

    Uses bicubic interpolation; areas exposed by the rotation are filled by
    replicating the border pixels.
    """
    height, width = img.shape[:2]
    pivot = (width // 2, height // 2)
    transform = cv.getRotationMatrix2D(pivot, angle, 1.0)
    return cv.warpAffine(
        img,
        transform,
        (width, height),
        flags=cv.INTER_CUBIC,
        borderMode=cv.BORDER_REPLICATE,
    )
|
||||
|
||||
def getRot(mask):
    """Estimate the skew angle of *mask* on a downscaled copy.

    The image is resized to at most 500 pixels wide (aspect ratio kept)
    before being handed to ``determine_skew``.

    Returns:
        The detected angle, or ``0.0`` when no angle could be determined.
    """
    (h, w) = mask.shape[:2]

    nw = min(w, 500)
    nh = int(h * (nw / w))
    sm = cv.resize(mask, (nw, nh))

    # NOTE(review): determine_skew is documented for grayscale input, but the
    # caller in this file passes a 3-channel image -- confirm this is intended.
    angle = determine_skew(sm)

    # determine_skew yields None when it finds no dominant angle; treating
    # that as "no rotation" keeps rotate() from failing on a None angle.
    return 0.0 if angle is None else angle
|
||||
|
||||
def getMono(img):
    """Return a black/white version of the BGR image *img*.

    Greyscale conversion, Otsu thresholding to 0/255, then a 3x3
    morphological close.
    """
    grey = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
    binarised = cv.threshold(grey, 0, 255, cv.THRESH_BINARY + cv.THRESH_OTSU)[1]
    return cv.morphologyEx(binarised, cv.MORPH_CLOSE, np.ones((3, 3), np.uint8))
|
||||
|
||||
def getColorMask(img, dpi):
    """Build a 0/255 mask of the coloured regions of *img*.

    Pixels whose HSV value lies within [0,40,10]..[255,255,255] are selected,
    eroded once and dilated five times with a dpi-scaled square kernel,
    blurred, and re-thresholded with Otsu.
    """
    hsv = cv.cvtColor(img, cv.COLOR_BGR2HSV)
    saturated = cv.inRange(hsv, np.array([0, 40, 10]), np.array([255, 255, 255]))

    # "| 1" bumps an even size to the next odd number.
    side = int(dpi * 0.005) | 1
    element = np.ones((side, side), np.uint8)
    grown = cv.dilate(cv.erode(saturated, element), element, iterations=5)

    blur = int(dpi * .1) | 1
    softened = cv.GaussianBlur(grown, (blur, blur), 0)
    return cv.threshold(softened, 0, 255, cv.THRESH_BINARY + cv.THRESH_OTSU)[1]
|
||||
|
||||
def autoColorContrast(img, mono, dpi):
    """Auto-contrast *img* using the dark areas of *mono*, then lighten it.

    The inverted, dilated *mono* image is passed to ``ImageOps.autocontrast``
    as its ``mask``; afterwards the lightness channel is multiplied by 1.3
    and clipped to 0..255.
    """
    side = int(dpi * 0.005) | 1  # "| 1" bumps an even size to the next odd
    element = np.ones((side, side), np.uint8)
    ink = cv.dilate(cv.bitwise_not(mono), element, iterations=5)

    pil_img = Image.fromarray(cv.cvtColor(img, cv.COLOR_BGR2RGB))
    stretched = ImageOps.autocontrast(
        pil_img,
        (20, 30),
        mask=Image.fromarray(ink),
        preserve_tone=True,
    )

    # NOTE(review): the PIL data is RGB yet is converted with COLOR_BGR2HLS
    # and back with COLOR_HLS2RGB. The two mismatched constants appear to
    # cancel, leaving the result in the channel order of *img* -- confirm
    # before changing either one.
    hue, light, sat = cv.split(cv.cvtColor(np.asarray(stretched), cv.COLOR_BGR2HLS))
    light = np.clip(light.astype("float32") * 1.3, 0, 255).astype("uint8")

    return cv.cvtColor(cv.merge((hue, light, sat)), cv.COLOR_HLS2RGB)
|
||||
|
||||
def getColorSegments(img, mono, cmask):
    """Lazily yield one crop per contour found in *cmask*.

    Each item is ``((x1, y1, x2, y2), img_crop, mask_crop)`` where the
    rectangle is the contour's bounding box and both crops are slices of
    the inputs. *mono* is accepted but not used. Because this is a
    generator, nothing runs until the result is iterated.
    """
    contours, _ = cv.findContours(cmask, cv.RETR_LIST, cv.CHAIN_APPROX_SIMPLE)

    for contour in contours:
        # boundingRect gives (x, y, width, height); convert to two corners.
        left, top, width, height = cv.boundingRect(contour)
        right = left + width
        bottom = top + height
        yield (left, top, right, bottom), img[top:bottom, left:right], cmask[top:bottom, left:right]
|
||||
|
||||
def unsharpMask(image, kernel_size=(5, 5), sigma=1.0, amount=1.0, threshold=0):
    """Sharpen *image* with an unsharp mask and return a uint8 result.

    Args:
        image: input image array.
        kernel_size: Gaussian kernel size passed to ``cv.GaussianBlur``.
        sigma: Gaussian sigma passed to ``cv.GaussianBlur``.
        amount: strength; result is ``(amount + 1) * image - amount * blur``.
        threshold: when > 0, pixels whose absolute difference from the
            blurred image is below this value keep their original value.
    """
    blurred = cv.GaussianBlur(image, kernel_size, sigma)
    sharpened = float(amount + 1) * image - float(amount) * blurred
    sharpened = np.clip(sharpened, 0, 255).round().astype(np.uint8)
    if threshold > 0:
        # cv.absdiff saturates instead of wrapping: with uint8 input,
        # ``image - blurred`` underflows to large values wherever the blur is
        # brighter, so those low-contrast pixels were wrongly excluded.
        low_contrast_mask = cv.absdiff(image, blurred) < threshold
        np.copyto(sharpened, image, where=low_contrast_mask)
    return sharpened
|
||||
|
||||
def processImage(img, reader, dpi):
    """Run the full per-page pipeline on the image file *img*.

    Steps: read, filter (sharpen + bilateral filter on the first Lab channel,
    auto-contrast), skew detection, deskew, OCR on the monochrome image,
    colour mask, colour segments.

    Args:
        img: path of the image file to read.
        reader: object whose ``readtext`` method performs the OCR.
        dpi: resolution used to scale the filter kernels.

    Returns:
        ``(mono, csegs, text)``: the monochrome page with coloured regions
        set to 255, the colour segments as returned by ``getColorSegments``,
        and the OCR result.

    Raises:
        OSError: if the image file cannot be read.
    """
    # Seven steps are reported below; the total now matches that count.
    with tqdm(total=7, leave=False) as t:
        t.set_description("Reading image")
        im = cv.imread(img)
        if im is None:
            # imread signals failure by returning None rather than raising.
            raise OSError(f"cannot read image: {img}")
        t.update()

        t.set_description("Filter")
        mono = getMono(im)

        # NOTE(review): imread data is converted with the RGB<->Lab constants
        # in both directions; kept as in the original since the pair is used
        # symmetrically.
        im = cv.cvtColor(im, cv.COLOR_RGB2Lab)
        (L, a, b) = cv.split(im)
        ksiz = int(dpi*0.015)
        ksiz -= ksiz%2-1  # bump an even size to the next odd number
        L = unsharpMask(L, kernel_size=(ksiz,ksiz), amount=2)
        L = cv.bilateralFilter(L, -1, 12, dpi*0.018)
        im = cv.cvtColor(cv.merge((L, a, b)), cv.COLOR_Lab2RGB)

        im = autoColorContrast(im, mono, dpi)
        t.update()

        t.set_description("Detect skew")
        angle = getRot(im)
        if angle is None:
            # No skew angle detected: leave the page unrotated.
            angle = 0.0
        t.update()

        t.set_description("Deskew")
        im = rotate(im, angle)
        mono = rotate(mono, angle)
        t.update()

        t.set_description("OCR")
        text = reader.readtext(mono)
        t.update()

        t.set_description("Color mask")
        cmask = getColorMask(im, dpi)
        t.update()

        t.set_description("Color segments")
        csegs = getColorSegments(im, mono, cmask)
        t.update()

        # Blank the coloured regions out of the monochrome layer.
        mono[cmask==255] = 255

        return mono, csegs, text
|
||||
|
||||
@entrypoint
def main(output, langs=["en"], dpi=600, *imgs):
    """Process scanned page images into a compressed, searchable PDF.

    :param output: path of the PDF file to write
    :param langs: EasyOCR language code, repeat the option for several languages
    :param dpi: resolution of the input images in dots per inch
    :param imgs: input image files, one per page, in page order
    """
    # NOTE(review): the list default for ``langs`` is kept on purpose; the
    # entrypoint decorator presumably derives the repeatable ``-l`` option
    # from it -- confirm before changing the signature.
    if not imgs:
        # Without pages, jbig2 would be run with no input files and fail
        # with an opaque CalledProcessError; stop early with a clear message.
        raise SystemExit("scanproc: no input images given")

    reader = easyocr.Reader(langs)

    with tqdm(total=3, leave=False) as t:
        with TemporaryDirectory() as tmp:
            tp = Path(tmp)
            files = []
            colorimgs = []
            texts = []

            t.set_description("Process pages")
            with tqdm(total=len(imgs), leave=False) as pt:
                for pagen, img in enumerate(imgs):
                    pt.set_description(f"Process {img}")
                    mono, csegs, text = processImage(img, reader, dpi)

                    # Monochrome layer, later fed to jbig2.
                    fn = str(tp / f"p{pagen}.tif")
                    files.append(fn)
                    cv.imwrite(fn, mono)

                    # Each colour segment becomes a JPEG plus a bilevel PNG mask.
                    pimgs = []
                    for i, seg in enumerate(csegs):
                        (r, simg, smask) = seg
                        bp = tp / f"p{pagen}_{i}.jpg"
                        mp = tp / f"p{pagen}_{i}_m.png"

                        cv.imwrite(str(bp), simg, [
                            cv.IMWRITE_JPEG_QUALITY, 90,
                            cv.IMWRITE_JPEG_OPTIMIZE, 1,
                            cv.IMWRITE_JPEG_PROGRESSIVE, 1])

                        cv.imwrite(str(mp), smask, [
                            cv.IMWRITE_PNG_BILEVEL, 1,
                            cv.IMWRITE_PNG_COMPRESSION, 9])
                        pimgs.append(((r), bp, mp))
                    colorimgs.append(pimgs)

                    texts.append(text)
                    pt.update()
            t.update()

            t.set_description("JBIG2 compress")
            # Runs inside the temp dir; its results are read from there as
            # output.sym and output.NNNN (one per page).
            run(["jbig2", "-s", "-d", "-a", "-p", *files], capture_output=True, check=True, cwd=tp)
            symtab = tp / "output.sym"
            pageblobs = [tp / f"output.{p:04d}" for p in range(len(files))]
            t.update()

            t.set_description("Create PDF")
            with open(output, "wb") as outf:
                outf.write(mkpdf(symtab, pageblobs, colorimgs, texts, dpi))

            t.update()
|
Loading…
Reference in a new issue