From accf3e5795664be1222eabd793ecdd6815b0f999 Mon Sep 17 00:00:00 2001
From: Mia Herkt
Date: Sun, 28 Aug 2022 20:00:13 +0200
Subject: [PATCH] init

---
 README.md        |  22 ++++++
 pdfutil.py       | 180 ++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |   5 ++
 scanproc.py      | 198 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 405 insertions(+)
 create mode 100644 README.md
 create mode 100755 pdfutil.py
 create mode 100644 requirements.txt
 create mode 100755 scanproc.py

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..2330747
--- /dev/null
+++ b/README.md
@@ -0,0 +1,22 @@
+# scanproc
+
+This is a tool to process high-resolution scans with as little user
+input as possible. It produces decently compressed PDF files with
+monochrome/color separation and more or less accurate text layers.
+
+## Requirements
+
+In addition to the Python modules listed in `requirements.txt`,
+scanproc requires [jbig2](https://github.com/agl/jbig2enc) to
+be installed in your `$PATH`.
+
+## Usage
+
+`scanproc.py [-h] [-l LANGS] [-d DPI] [--debug] output [imgs ...]`
+
+Usage is simple. You pass a list of
+[EasyOCR language codes](https://www.jaided.ai/easyocr/) to use
+with `-l` (default English only; pass multiple times for more than
+one language), the image resolution in DPI with `-d` (default 600;
+required for filter kernel scaling and correct page size), followed
+by the output PDF file and the input image files.
diff --git a/pdfutil.py b/pdfutil.py
new file mode 100755
index 0000000..bea5f8c
--- /dev/null
+++ b/pdfutil.py
@@ -0,0 +1,228 @@
+#!/usr/bin/env python3
+
+import struct
+import fitz
+import cv2 as cv
+
+class Dict:
+    """A PDF dictionary: name bytes mapped to pre-serialized value bytes."""
+
+    def __init__(self, values=None):
+        # Copy the entries so that neither the caller's mapping nor a
+        # shared default argument is ever modified through this object.
+        self.d = {}
+        if values:
+            self.d.update(values)
+
+    def __bytes__(self):
+        s = [b"<< "]
+
+        for (x, y) in self.d.items():
+            s.append(b"/" + x + b" " + y + b"\n")
+
+        s.append(b">>\n")
+        return b"".join(s)
+
+class Obj:
+    """An indirect PDF object: a dictionary plus an optional stream.
+
+    idx is the object number; it is 0 until Doc.add_object() assigns one.
+    """
+
+    def __init__(self, d=None, stream=None):
+        # Dict() copies d. Writing /Length into d itself would change the
+        # caller's dict, or a default dict shared by every later call.
+        self.d = Dict(d)
+        if stream is not None:
+            self.d.d[b"Length"] = str(len(stream)).encode()
+
+        self.stream = stream
+        self.idx = 0
+
+    def __bytes__(self):
+        s = [bytes(self.d)]
+
+        if self.stream is not None:
+            s.append(b"stream\n")
+            s.append(self.stream)
+            s.append(b"\nendstream\n")
+        s.append(b"endobj\n")
+
+        return b"".join(s)
+
+class Doc:
+    """Collects objects and serializes them as a PDF 1.4 file.
+
+    The trailer names object 1 as /Root, so the catalog must be added first.
+    """
+
+    def __init__(self):
+        self.objs = []
+        self.pages = []
+
+    def add_object(self, o):
+        """Append o, give it the next 1-based object number and return it."""
+        o.idx = len(self.objs) + 1
+        self.objs.append(o)
+        return o
+
+    def add_page(self, o):
+        """Record o in self.pages, then add it like add_object()."""
+        self.pages.append(o)
+        return self.add_object(o)
+
+    def __bytes__(self):
+        a = []
+        j = 0
+        offsets = []
+
+        def add(x):
+            # Track the byte offset of the next chunk; the +1 accounts for
+            # the newline that the final join puts after x.
+            nonlocal j
+            a.append(x)
+            j += len(x) + 1
+
+        add(b'%PDF-1.4')
+        for o in self.objs:
+            offsets.append(j)
+            add(f"{o.idx} 0 obj".encode())
+            add(bytes(o))
+
+        xrefstart = j
+        a.append(b'xref')
+        a.append(f'0 {len(offsets) + 1}'.encode())
+        a.append(b'0000000000 65535 f ')
+
+        for o in offsets:
+            a.append(f"{o:010d} 00000 n ".encode())
+
+        a.append(b'')
+        a.append(b'trailer')
+        a.append(f'<< /Size {len(offsets) + 1}\n/Root 1 0 R >>'.encode())
+        a.append(b'startxref')
+        a.append(str(xrefstart).encode())
+        # The end-of-file marker is spelled with two percent signs.
+        a.append(b'%%EOF')
+
+        return b'\n'.join(a)
+
+def ref(x):
+    """Return an indirect reference to object number x, as bytes."""
+    return f"{x} 0 R".encode()
+
+def mkpdf(symtab, pageblobs, colorimgs, texts, dpi):
+    """Build the final PDF and return it as bytes.
+
+    symtab is the path of the JBIG2 symbol file shared by all pages and
+    pageblobs the paths of the per-page JBIG2 streams; each one becomes a
+    full-page 1-bit image. colorimgs holds, per page, tuples of (rect,
+    image file, mask file) that are drawn over the page. texts holds, per
+    page, boxes whose first item is four corner points and whose second
+    item is the recognized string (presumably EasyOCR readtext() results;
+    verify against the caller). Rectangles and corner points are in pixels
+    and are converted to points using dpi.
+    """
+    doc = Doc()
+    doc.add_object(Obj({
+        b'Type' : b'/Catalog',
+        b'Outlines' : ref(2),
+        b'Pages' : ref(3)
+    }))
+    doc.add_object(Obj({
+        b'Type' : b'/Outlines',
+        b'Count': b'0'
+    }))
+    pages = Obj({b'Type' : b'/Pages'})
+    doc.add_object(pages)
+    with open(symtab, "rb") as stab:
+        symd = doc.add_object(Obj({}, stab.read()))
+    page_objs = []
+
+    for f in pageblobs:
+        with open(f, "rb") as pf:
+            blob = pf.read()
+        # Width, height and resolution are four big-endian 32-bit integers
+        # at byte offset 11 of the page stream.
+        (w, h, xres, yres) = struct.unpack(">IIII", blob[11:27])
+
+        if xres == 0:
+            xres = dpi
+        if yres == 0:
+            yres = dpi
+
+        xobj = Obj({
+            b'Type': b'/XObject',
+            b'Subtype': b'/Image',
+            b'Width': str(w).encode(),
+            b'Height': str(h).encode(),
+            b'ColorSpace': b'/DeviceGray',
+            b'BitsPerComponent': b'1',
+            b'Filter': b'/JBIG2Decode',
+            b'DecodeParms': f' << /JBIG2Globals {symd.idx} 0 R >>'.encode()
+        }, blob)
+        xobj = doc.add_object(xobj)
+
+        fw = float(w * 72) / xres
+        fh = float(h * 72) / yres
+
+        contents = Obj({}, f'q {fw:.1f} 0 0 {fh:.1f} 0 0 cm /Im1 Do Q'.encode())
+        contents = doc.add_object(contents)
+
+        resources = Obj({
+            b'ProcSet': b'[/PDF /ImageB]',
+            b'XObject': f'<< /Im1 {xobj.idx} 0 R >>'.encode()
+        })
+        resources = doc.add_object(resources)
+
+        page = Obj({
+            b'Type': b'/Page',
+            b'Parent': ref(pages.idx),
+            b'MediaBox': f'[ 0 0 {fw:.1f} {fh:.1f} ]'.encode(),
+            b'Contents': ref(contents.idx),
+            b'Resources': ref(resources.idx)
+        })
+        doc.add_object(page)
+
+        page_objs.append(page)
+
+    pages.d.d[b'Count'] = str(len(page_objs)).encode()
+    pages.d.d[b'Kids'] = b'[' + b' '.join([ref(x.idx) for x in page_objs]) + b']'
+
+    # Reopen the raw PDF with PyMuPDF to add the color images and the text.
+    pdf = fitz.open(stream=bytes(doc))
+    for p, imgs in enumerate(colorimgs):
+        for rect, img, mask in imgs:
+            with open(mask, "rb") as mf:
+                m = mf.read()
+            pdf[p].insert_image(fitz.Rect(*[(c*72.0)/dpi for c in rect]), filename=img, mask=m)
+
+    for p, txt in enumerate(texts):
+        for box in txt:
+            if not len(box[1]):
+                continue
+
+            bbox = box[0]
+            r = fitz.Rect(
+                min((bbox[0][0]*72.0)/dpi, (bbox[3][0]*72.0)/dpi),
+                min((bbox[0][1]*72.0)/dpi, (bbox[1][1]*72.0)/dpi),
+                max((bbox[1][0]*72.0)/dpi, (bbox[2][0]*72.0)/dpi),
+                max((bbox[2][1]*72.0)/dpi, (bbox[3][1]*72.0)/dpi)
+            )
+            fs = r.height
+            w = fitz.get_text_length(box[1], fontsize=fs)
+            if w <= 0:
+                # A zero-height box or zero-width text gives no usable
+                # horizontal scale; r.width/w would divide by zero.
+                continue
+
+            # Stretch the text horizontally around the box's top-left corner
+            # so that it spans the full width of the box.
+            pivot = fitz.Point(r.x0, r.y0)
+            matrix = fitz.Matrix(r.width/w, 1.0)
+            # render_mode=3 writes the text invisibly.
+            pdf[p].insert_text(fitz.Point(r.x0, r.y0+r.height*0.8),
+                               box[1], fontsize=fs,
+                               morph=(pivot, matrix),
+                               render_mode=3)
+    return pdf.tobytes(garbage=4, clean=True, deflate=True)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..a29edbd
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+deskew
+easyocr
+entrypoint2
+pyMuPDF
+tqdm
diff --git a/scanproc.py b/scanproc.py
new file mode 100755
index 0000000..7fe621e
--- /dev/null
+++ b/scanproc.py
@@ -0,0 +1,259 @@
+#!/usr/bin/env python3
+
+import easyocr
+import cv2 as cv
+import numpy as np
+import warnings
+
+from tqdm import tqdm
+from deskew import determine_skew
+from PIL import Image, ImageOps, ImageEnhance
+from entrypoint2 import entrypoint
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from subprocess import run
+
+from pdfutil import mkpdf
+
+warnings.filterwarnings("ignore")
+
+def rotate(img, angle: float):
+    """Rotate img by angle degrees about its center, keeping its size."""
+    (h, w) = img.shape[:2]
+    center = (w//2, h//2)
+    M = cv.getRotationMatrix2D(center, angle, 1.0)
+    return cv.warpAffine(img, M, (w, h), flags=cv.INTER_CUBIC, borderMode=cv.BORDER_REPLICATE)
+
+def getRot(mask):
+    """Return the skew angle that determine_skew() reports for mask.
+
+    The image is scaled down to a width of at most 500 pixels first.
+    """
+    (h, w) = mask.shape[:2]
+
+    nw = min(w, 500)
+    nh = int(h * (nw / w))
+    sm = cv.resize(mask, (nw,nh))
+
+    return determine_skew(sm)
+
+def getMono(img):
+    """Return a black-and-white version of the BGR image img.
+
+    Otsu thresholding is followed by a 3x3 morphological closing.
+    """
+    gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
+    _, mono = cv.threshold(gray, 0, 255, cv.THRESH_BINARY + cv.THRESH_OTSU)
+    kernel = np.ones((3,3),np.uint8)
+    mono = cv.morphologyEx(mono, cv.MORPH_CLOSE, kernel)
+
+    return mono
+
+def getColorMask(img, dpi):
+    """Return a mask that is 255 in the saturated (colored) areas of img.
+
+    The kernel and blur sizes scale with dpi.
+    """
+    hsv = cv.cvtColor(img, cv.COLOR_BGR2HSV)
+    lower_sat = np.array([0,40,10])
+    upper_sat = np.array([255,255,255])
+    mask = cv.inRange(hsv, lower_sat, upper_sat)
+    ksiz = int(dpi*0.005)
+    ksiz -= ksiz%2-1  # bump an even size to the next odd one
+    kernel = np.ones((ksiz,ksiz),np.uint8)
+    # Erode once to drop small specks, then grow what is left.
+    mask = cv.erode(mask, kernel)
+    mask = cv.dilate(mask, kernel, iterations = 5)
+    br = int(dpi*.1)
+    br -= br%2-1  # bump an even size to the next odd one
+    mask = cv.GaussianBlur(mask, (br,br), 0)
+    _, mask = cv.threshold(mask, 0, 255, cv.THRESH_BINARY + cv.THRESH_OTSU)
+    return mask
+
+def autoColorContrast(img, mono, dpi):
+    """Stretch the contrast of img and scale its lightness by 1.3.
+
+    The autocontrast histogram is limited to the dilated dark areas of
+    mono.
+    """
+    ksiz = int(dpi*0.005)
+    ksiz -= ksiz%2-1  # bump an even size to the next odd one
+    kernel = np.ones((ksiz,ksiz),np.uint8)
+    mask = cv.bitwise_not(mono)
+    mask = cv.dilate(mask, kernel, iterations = 5)
+
+    pim = Image.fromarray(cv.cvtColor(img, cv.COLOR_BGR2RGB))
+    pimask = Image.fromarray(mask)
+
+    color = ImageOps.autocontrast(pim, (20, 30), mask=pimask, preserve_tone=True)
+    # NOTE(review): BGR2RGB above, BGR2HLS here and HLS2RGB below appear to
+    # cancel out, so the result keeps the channel order of img -- confirm.
+    color = cv.cvtColor(np.asarray(color), cv.COLOR_BGR2HLS)
+
+    (H, L, S) = cv.split(color)
+    L = L.astype("float32")
+    L *= 1.3
+    L = np.clip(L, 0, 255)
+    L = L.astype("uint8")
+
+    return cv.cvtColor(cv.merge((H, L, S)), cv.COLOR_HLS2RGB)
+
+def getColorSegments(img, mono, cmask):
+    """Yield (box, image crop, mask crop) for each contour found in cmask.
+
+    box is (x1, y1, x2, y2) in pixels. mono is not used.
+    """
+    contours, hierarchy = cv.findContours(cmask, cv.RETR_LIST, cv.CHAIN_APPROX_SIMPLE)
+
+    for c in contours:
+        # boundingRect() returns (x, y, width, height)
+        (x1, y1, bw, bh) = cv.boundingRect(c)
+        x2 = x1 + bw
+        y2 = y1 + bh
+        yield (x1, y1, x2, y2), img[y1:y2, x1:x2], cmask[y1:y2, x1:x2]
+
+def unsharpMask(image, kernel_size=(5, 5), sigma=1.0, amount=1.0, threshold=0):
+    """Return a sharpened uint8 copy of image (unsharp masking).
+
+    With threshold > 0, pixels that differ from their blurred value by
+    less than threshold are copied from image unchanged.
+    """
+    blurred = cv.GaussianBlur(image, kernel_size, sigma)
+    sharpened = float(amount + 1) * image - float(amount) * blurred
+    sharpened = np.clip(sharpened, 0, 255)
+    sharpened = sharpened.round().astype(np.uint8)
+    if threshold > 0:
+        # absdiff() gives the true difference; subtracting uint8 arrays
+        # directly wraps around wherever image is darker than blurred.
+        low_contrast_mask = cv.absdiff(image, blurred) < threshold
+        np.copyto(sharpened, image, where=low_contrast_mask)
+    return sharpened
+
+def processImage(img, reader, dpi):
+    """Read the image file img and split it into the layers of one page.
+
+    Returns (mono, csegs, text): the deskewed black-and-white page with the
+    colored areas set to white, a generator of color segments as yielded
+    by getColorSegments(), and the result of reader.readtext() on the
+    black-and-white page. Raises OSError if the image cannot be read.
+    """
+    # One t.update() per step below, seven in total.
+    with tqdm(total=7, leave=False) as t:
+        t.set_description("Reading image")
+        im = cv.imread(img)
+        if im is None:
+            # imread() reports failure by returning None
+            raise OSError(f"cannot read image: {img}")
+        t.update()
+
+        t.set_description("Filter")
+        mono = getMono(im)
+
+        # NOTE(review): imread() yields BGR, yet RGB2Lab/Lab2RGB are used.
+        # The round trip keeps the channel order, but L is presumably
+        # computed with red and blue swapped -- confirm this is intended.
+        im = cv.cvtColor(im, cv.COLOR_RGB2Lab)
+        (L, a, b) = cv.split(im)
+        ksiz = int(dpi*0.015)
+        ksiz -= ksiz%2-1  # bump an even size to the next odd one
+        L = unsharpMask(L, kernel_size=(ksiz,ksiz), amount=2)
+        L = cv.bilateralFilter(L, -1, 12, dpi*0.018)
+        im = cv.cvtColor(cv.merge((L, a, b)), cv.COLOR_Lab2RGB)
+
+        im = autoColorContrast(im, mono, dpi)
+        t.update()
+
+        t.set_description("Detect skew")
+        angle = getRot(im)
+        t.update()
+
+        t.set_description("Deskew")
+        # determine_skew() can return None when it finds no angle; the
+        # page is left unrotated in that case.
+        if angle is not None:
+            im = rotate(im, angle)
+            mono = rotate(mono, angle)
+        t.update()
+
+        t.set_description("OCR")
+        text = reader.readtext(mono)
+        t.update()
+
+        t.set_description("Color mask")
+        cmask = getColorMask(im, dpi)
+        t.update()
+
+        t.set_description("Color segments")
+        csegs = getColorSegments(im, mono, cmask)
+        t.update()
+
+        # Colored areas are stored as separate segments; blank them here.
+        mono[cmask==255] = 255
+
+        return mono, csegs, text
+
+@entrypoint
+def main(output, langs=["en"], dpi=600, *imgs):
+    """Convert scanned page images into one compressed, searchable PDF.
+
+    :param output: path of the PDF file to write
+    :param langs: EasyOCR language codes used for text recognition
+    :param dpi: resolution of the input images
+    :param imgs: input image files, one per page
+    """
+    if not imgs:
+        # Without pages the jbig2 call below has nothing to compress.
+        raise SystemExit("no input images given")
+
+    reader = easyocr.Reader(langs)
+
+    with tqdm(total=3, leave=False) as t:
+        with TemporaryDirectory() as tmp:
+            tp = Path(tmp)
+            files = []
+            colorimgs = []
+            texts = []
+
+            t.set_description("Process pages")
+            with tqdm(total=len(imgs), leave=False) as pt:
+                for pagen, img in enumerate(imgs):
+                    pt.set_description(f"Process {img}")
+                    mono, csegs, text = processImage(img, reader, dpi)
+
+                    fn = str(tp / f"p{pagen}.tif")
+                    files.append(fn)
+                    cv.imwrite(fn, mono)
+
+                    pimgs = []
+                    for i, seg in enumerate(csegs):
+                        (r, simg, smask) = seg
+                        bp = tp / f"p{pagen}_{i}.jpg"
+                        mp = tp / f"p{pagen}_{i}_m.png"
+
+                        cv.imwrite(str(bp), simg, [
+                            cv.IMWRITE_JPEG_QUALITY, 90,
+                            cv.IMWRITE_JPEG_OPTIMIZE, 1,
+                            cv.IMWRITE_JPEG_PROGRESSIVE, 1])
+
+                        cv.imwrite(str(mp), smask, [
+                            cv.IMWRITE_PNG_BILEVEL, 1,
+                            cv.IMWRITE_PNG_COMPRESSION, 9])
+                        pimgs.append((r, bp, mp))
+                    colorimgs.append(pimgs)
+
+                    texts.append(text)
+                    pt.update()
+            t.update()
+
+            t.set_description("JBIG2 compress")
+            # The results are read back from tp as output.sym and output.NNNN.
+            run(["jbig2", "-s", "-d", "-a", "-p", *files], capture_output=True, check=True, cwd=tp)
+            symtab = tp / "output.sym"
+            pageblobs = [tp / f"output.{p:04d}" for p in range(len(files))]
+            t.update()
+
+            t.set_description("Create PDF")
+            with open(output, "wb") as outf:
+                outf.write(mkpdf(symtab, pageblobs, colorimgs, texts, dpi))
+
+            t.update()