init

2022-08-28 20:00:13 +02:00 · 2022-08-28 20:00:13 +02:00 · accf3e5795
commit accf3e5795
4 changed files with 405 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -0,0 +1,22 @@
 # scanproc
 This is a tool to process high-resolution scans with as little user
 input as possible. It produces decently compressed PDF files with
 monochrome/color separation and more or less accurate text layers.
 ## Requirements
 In addition to the Python modules listed in `requirements.txt`,
 scanproc requires [jbig2](https://github.com/agl/jbig2enc) to
 be installed in your `$PATH`.
 ## Usage
 `scanproc.py [-h] [-l LANGS] [-d DPI] [--debug] output [imgs ...]`
 Usage is simple. Pass the
 [EasyOCR language codes](https://www.jaided.ai/easyocr/) to use
 with `-l` (default: English only; repeat the option for more than
 one language) and the scan resolution in DPI with `-d` (default: 600;
 required for filter kernel scaling and correct page size), followed
 by the output PDF file and the input image files.
--- a/pdfutil.py
+++ b/pdfutil.py
@ -0,0 +1,180 @@
 #!/usr/bin/env python3
 import struct
 import fitz
 import cv2 as cv
 class Dict:
    """A PDF dictionary that serialises itself with ``bytes()``.

    Keys are PDF names without the leading slash and values are already
    serialised PDF objects; both are ``bytes``. The entries live in ``self.d``.
    """

    def __init__(self, values=None):
        # Copy the entries so that later changes to ``self.d`` never touch the
        # caller's mapping, and so that no mutable default argument is shared.
        self.d = dict(values) if values is not None else {}

    def __bytes__(self):
        # One "/Key value" entry per line between the "<<" and ">>" delimiters.
        body = b"".join(
            b"/" + key + b" " + value + b"\n" for key, value in self.d.items()
        )
        return b"<< " + body + b">>\n"
 class Obj:
    """An indirect PDF object: a dictionary plus an optional stream.

    ``d`` holds the dictionary entries (``bytes`` keys and values) and
    ``stream`` the raw stream payload, or ``None`` for an object without a
    stream. When a stream is given, its ``Length`` entry is filled in
    automatically. ``idx`` is the object number; it is 0 until
    ``Doc.add_object`` assigns one.
    """

    def __init__(self, d=None, stream=None):
        # Work on a private copy so that adding /Length never modifies the
        # caller's dict or a default shared between calls.
        entries = dict(d) if d is not None else {}
        if stream is not None:
            entries[b"Length"] = str(len(stream)).encode()
        self.d = Dict(entries)
        self.stream = stream
        self.idx = 0

    def __bytes__(self):
        # Serialises everything after the "N 0 obj" header, which Doc writes.
        parts = [bytes(self.d)]
        if self.stream is not None:
            parts += [b"stream\n", self.stream, b"\nendstream\n"]
        parts.append(b"endobj\n")
        return b"".join(parts)
 class Doc:
    """Minimal PDF writer: numbered objects plus a classic xref table."""

    def __init__(self):
        self.objs = []
        self.pages = []

    def add_object(self, o):
        """Append ``o``, give it the next 1-based object number, return it."""
        o.idx = len(self.objs) + 1
        self.objs.append(o)
        return o

    def add_page(self, o):
        """Record ``o`` in ``self.pages`` and add it like any other object."""
        self.pages.append(o)
        return self.add_object(o)

    def __bytes__(self):
        """Serialise header, objects, cross-reference table and trailer.

        The trailer's /Root always points at object 1, so the document
        catalog has to be the first object added.
        """
        lines = [b'%PDF-1.4']
        offsets = []
        # Every element of ``lines`` is followed by exactly one "\n" in the
        # joined output, so the byte offset of the next element is the
        # running sum of (len + 1).
        pos = len(lines[0]) + 1
        for o in self.objs:
            offsets.append(pos)
            for chunk in (f"{o.idx} 0 obj".encode(), bytes(o)):
                lines.append(chunk)
                pos += len(chunk) + 1
        xrefstart = pos
        lines.append(b'xref')
        lines.append(f'0 {len(offsets) + 1}'.encode())
        # Entry 0 is the head of the free list; each entry is 20 bytes
        # including the trailing space and the newline added by the join.
        lines.append(b'0000000000 65535 f ')
        lines.extend(f"{off:010d} 00000 n ".encode() for off in offsets)
        lines.append(b'')
        lines.append(b'trailer')
        lines.append(f'<< /Size {len(offsets) + 1}\n/Root 1 0 R >>'.encode())
        lines.append(b'startxref')
        lines.append(str(xrefstart).encode())
        # The end-of-file marker is spelled with two percent signs.
        lines.append(b'%%EOF')
        return b'\n'.join(lines)
 def ref(x):
    """Return the PDF indirect reference ``b"<x> 0 R"`` for object number ``x``."""
    reference = "{} 0 R".format(x)
    return reference.encode()
 def mkpdf(symtab, pageblobs, colorimgs, texts, dpi):
    """Assemble the final PDF and return it as bytes.

    A skeleton PDF with one JBIG2 image per page is written by hand with
    ``Doc``/``Obj``, then reopened with PyMuPDF (``fitz``) to overlay the
    colour images and an invisible text layer.

    symtab:    path of the file stored as the shared /JBIG2Globals stream.
    pageblobs: paths of the per-page files; each becomes one page image.
    colorimgs: per page, a list of ``(rect, img, mask)``; ``rect`` holds
               pixel coordinates (scaled by 72/dpi), ``img`` is passed to
               ``insert_image`` as a file name and ``mask`` is a path whose
               bytes are used as the image mask.
    texts:     per page, a list of boxes where ``box[0]`` holds four (x, y)
               corner points in pixels and ``box[1]`` the recognised text.
    dpi:       resolution used to convert pixels to PDF points and as the
               fallback when a page blob reports a resolution of 0.
    """
    doc = Doc()
    # Objects 1-3 are referenced by fixed number below (ref(2), ref(3),
    # b'3 0 R'), so these three add_object calls must stay first and in order.
    doc.add_object(Obj({
        b'Type' : b'/Catalog',
        b'Outlines' : ref(2),
        b'Pages' : ref(3)
        }))
    doc.add_object(Obj({
        b'Type' : b'/Outlines',
        b'Count': b'0'
        }))
    pages = Obj({b'Type' : b'/Pages'})
    doc.add_object(pages)
    with open(symtab, "rb") as stab:
        symd = doc.add_object(Obj({}, stab.read()))
    page_objs = []
    for f in pageblobs:
        with open(f, "rb") as pf:
            blob = pf.read()
        # Four big-endian 32-bit unsigned ints at bytes 11..26 of the blob,
        # read as width, height and x/y resolution.
        # NOTE(review): presumably the JBIG2 page information segment as
        # written by jbig2enc - confirm the fixed offset.
        (w, h, xres, yres) = struct.unpack(">IIII", blob[11:27])
        if xres == 0:
            xres = dpi
        if yres == 0:
            yres = dpi
        xobj = Obj({
            b'Type': b'/XObject',
            b'Subtype': b'/Image',
            b'Width': str(w).encode(),
            b'Height': str(h).encode(),
            b'ColorSpace': b'/DeviceGray',
            b'BitsPerComponent': b'1',
            b'Filter': b'/JBIG2Decode',
            b'DecodeParms': f' << /JBIG2Globals {symd.idx} 0 R >>'.encode()
            }, blob)
        xobj = doc.add_object(xobj)
        # Page size in PDF points (72 per inch).
        fw = float(w * 72) / xres
        fh = float(h * 72) / yres
        # Content stream: scale the unit square to the page size, draw /Im1.
        contents = Obj({}, f'q {fw:.1f} 0 0 {fh:.1f} 0 0 cm /Im1 Do Q'.encode())
        contents = doc.add_object(contents)
        resources = Obj({
            b'ProcSet': b'[/PDF /ImageB]',
            b'XObject': f'<< /Im1 {xobj.idx} 0 R >>'.encode()
            })
        resources = doc.add_object(resources)
        page = Obj({
            b'Type': b'/Page',
            b'Parent': b'3 0 R',
            b'MediaBox': f'[ 0 0 {fw:.1f} {fh:.1f} ]'.encode(),
            b'Contents': ref(contents.idx),
            b'Resources': ref(resources.idx)
            })
        doc.add_object(page)
        page_objs.append(page)
        # NOTE(review): /Count and /Kids are rewritten on every iteration and
        # are never set at all when pageblobs is empty.
        pages.d.d[b'Count'] = str(len(page_objs)).encode()
        pages.d.d[b'Kids'] = b'[' + b' '.join([ref(x.idx) for x in page_objs]) + b']'
    # Reopen the hand-written PDF with PyMuPDF for the overlay work.
    pdf = fitz.open(stream=bytes(doc))
    for p, imgs in enumerate(colorimgs):
        for rect, img, mask in imgs:
            with open(mask, "rb") as mf:
                m = mf.read()
            # rect is in pixels; convert every coordinate to points.
            pdf[p].insert_image(fitz.Rect(*[(c*72.0)/dpi for c in rect]), filename=img, mask=m)
    for p, txt in enumerate(texts):
        for box in txt:
            # Skip boxes whose text is empty.
            if len(box[1]):
                bbox = box[0]
                # Axis-aligned bounds of the four corner points, in points.
                # NOTE(review): assumes corner order top-left, top-right,
                # bottom-right, bottom-left - confirm against the OCR output.
                r = fitz.Rect(
                    min((bbox[0][0]*72.0)/dpi, (bbox[3][0]*72.0)/dpi),
                    min((bbox[0][1]*72.0)/dpi, (bbox[1][1]*72.0)/dpi),
                    max((bbox[1][0]*72.0)/dpi, (bbox[2][0]*72.0)/dpi),
                    max((bbox[2][1]*72.0)/dpi, (bbox[3][1]*72.0)/dpi)
                )
                # Font size equals the box height; the text is then stretched
                # horizontally about the box's top-left corner so that its
                # width matches the box width.
                fs = r.height
                # NOTE(review): w would make the division below fail if the
                # measured text length were 0 - confirm this cannot happen.
                w = fitz.get_text_length(box[1], fontsize=fs)
                pivot = fitz.Point(r.x0, r.y0)
                matrix = fitz.Matrix(r.width/w, 1.0)
                # Insertion point sits at 80% of the box height; render mode 3
                # is PDF's invisible text mode.
                pdf[p].insert_text(fitz.Point(r.x0, r.y0+r.height*0.8),
                                   box[1], fontsize=fs,
                                   morph=(pivot, matrix),
                                   render_mode=3)
    return pdf.tobytes(garbage=4, clean=True, deflate=True)
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,5 @@
 deskew
 easyocr
 entrypoint2
 pyMuPDF
 tqdm
--- a/scanproc.py
+++ b/scanproc.py
@ -0,0 +1,198 @@
 #!/usr/bin/env python3
 import easyocr
 import cv2 as cv
 import numpy as np
 import warnings
 from tqdm import tqdm
 from deskew import determine_skew
 from PIL import Image, ImageOps, ImageEnhance
 from entrypoint2 import entrypoint
 from pathlib import Path
 from tempfile import TemporaryDirectory
 from subprocess import run
 from pdfutil import mkpdf
 warnings.filterwarnings("ignore")
 def rotate(img, angle: float):
    """Rotate ``img`` by ``angle`` about its centre, keeping the image size.

    Areas uncovered by the rotation are filled by replicating the border.
    """
    height, width = img.shape[:2]
    rotation = cv.getRotationMatrix2D((width // 2, height // 2), angle, 1.0)
    return cv.warpAffine(
        img,
        rotation,
        (width, height),
        flags=cv.INTER_CUBIC,
        borderMode=cv.BORDER_REPLICATE,
    )
 def getRot(mask):
    """Estimate the skew angle of ``mask`` on a copy at most 500 px wide.

    Returns the angle reported by ``determine_skew``, or 0.0 when it reports
    no angle, so the result can always be passed on to ``rotate``.
    """
    (h, w) = mask.shape[:2]
    # Work on a downscaled copy (aspect ratio kept) to keep detection fast.
    nw = min(w, 500)
    nh = int(h * (nw / w))
    sm = cv.resize(mask, (nw, nh))
    angle = determine_skew(sm)
    # determine_skew returns None when it cannot find a dominant angle;
    # treat that as "no rotation needed" instead of failing later in rotate().
    return 0.0 if angle is None else angle
 def getMono(img):
    """Binarise a BGR image: grayscale, Otsu threshold, then a 3x3 closing."""
    gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
    # cv.threshold returns (threshold used, image); only the image is needed.
    binary = cv.threshold(gray, 0, 255, cv.THRESH_BINARY + cv.THRESH_OTSU)[1]
    return cv.morphologyEx(binary, cv.MORPH_CLOSE, np.ones((3, 3), np.uint8))
 def getColorMask(img, dpi):
    """Return a 0/255 mask of the saturated (coloured) regions of a BGR image.

    Pixels with HSV saturation >= 40 and value >= 10 are selected, cleaned up
    with an erosion followed by five dilations, blurred and re-thresholded
    with Otsu's method. Kernel sizes scale with ``dpi``.
    """
    hsv = cv.cvtColor(img, cv.COLOR_BGR2HSV)
    saturated = cv.inRange(hsv, np.array([0, 40, 10]), np.array([255, 255, 255]))
    # "| 1" bumps an even size to the next odd number and leaves odd ones alone.
    ksiz = int(dpi * 0.005) | 1
    kernel = np.ones((ksiz, ksiz), np.uint8)
    grown = cv.dilate(cv.erode(saturated, kernel), kernel, iterations=5)
    br = int(dpi * .1) | 1
    blurred = cv.GaussianBlur(grown, (br, br), 0)
    return cv.threshold(blurred, 0, 255, cv.THRESH_BINARY + cv.THRESH_OTSU)[1]
 def autoColorContrast(img, mono, dpi):
    """Auto-contrast a BGR image and raise its lightness by 30 %.

    The contrast histogram is restricted to the inverted, dilated ``mono``
    mask; the dilation kernel size scales with ``dpi``.
    """
    # "| 1" bumps an even size to the next odd number and leaves odd ones alone.
    ksiz = int(dpi * 0.005) | 1
    kernel = np.ones((ksiz, ksiz), np.uint8)
    region = cv.dilate(cv.bitwise_not(mono), kernel, iterations=5)
    pim = Image.fromarray(cv.cvtColor(img, cv.COLOR_BGR2RGB))
    stretched = ImageOps.autocontrast(
        pim, (20, 30), mask=Image.fromarray(region), preserve_tone=True
    )
    # NOTE(review): the PIL image is RGB, yet it is converted with
    # COLOR_BGR2HLS and back with COLOR_HLS2RGB. The two swapped labels appear
    # to cancel, leaving the result in the input's channel order - confirm
    # before changing either constant on its own.
    H, L, S = cv.split(cv.cvtColor(np.asarray(stretched), cv.COLOR_BGR2HLS))
    L = np.clip(L.astype("float32") * 1.3, 0, 255).astype("uint8")
    return cv.cvtColor(cv.merge((H, L, S)), cv.COLOR_HLS2RGB)
 def getColorSegments(img, mono, cmask):
    """Lazily yield one colour segment per contour found in ``cmask``.

    Each item is ``((x1, y1, x2, y2), img crop, cmask crop)`` for the
    contour's bounding box. ``mono`` is accepted but not used.
    """
    contours, _ = cv.findContours(cmask, cv.RETR_LIST, cv.CHAIN_APPROX_SIMPLE)
    for contour in contours:
        # boundingRect gives (x, y, width, height); convert to corner form.
        left, top, width, height = cv.boundingRect(contour)
        right, bottom = left + width, top + height
        yield (
            (left, top, right, bottom),
            img[top:bottom, left:right],
            cmask[top:bottom, left:right],
        )
 def unsharpMask(image, kernel_size=(5, 5), sigma=1.0, amount=1.0, threshold=0):
    """Sharpen ``image`` with an unsharp mask and return a uint8 array.

    kernel_size, sigma: Gaussian blur parameters.
    amount:             strength; result = (amount + 1) * image - amount * blur.
    threshold:          when > 0, pixels whose absolute difference from the
                        blurred image is below it keep their original value.
    """
    blurred = cv.GaussianBlur(image, kernel_size, sigma)
    sharpened = float(amount + 1) * image - float(amount) * blurred
    sharpened = np.clip(sharpened, 0, 255).round().astype(np.uint8)
    if threshold > 0:
        # Subtract in floating point: with unsigned integer images a plain
        # ``image - blurred`` wraps around wherever blurred > image, which
        # would wrongly mark those pixels as high contrast.
        difference = np.absolute(image.astype(np.float64) - blurred)
        low_contrast_mask = difference < threshold
        np.copyto(sharpened, image, where=low_contrast_mask)
    return sharpened
 def processImage(img, reader, dpi):
    """Turn one scan into its monochrome layer, colour segments and OCR text.

    img:    path of the image file to read.
    reader: OCR reader; its ``readtext`` is called on the deskewed mono image.
    dpi:    scan resolution, used to scale the filter kernels.

    Returns ``(mono, csegs, text)``: the deskewed monochrome image with the
    colour regions set to 255, a lazy generator of colour segments (see
    ``getColorSegments``) and the OCR result.

    Raises OSError if OpenCV cannot read ``img``.
    """
    # total matches the seven t.update() calls below.
    with tqdm(total=7, leave=False) as t:
        t.set_description("Reading image")
        im = cv.imread(img)
        if im is None:
            # cv.imread signals failure by returning None instead of raising.
            raise OSError(f"cannot read image file: {img}")
        t.update()
        t.set_description("Filter")
        mono = getMono(im)
        # cv.imread delivers BGR data, so the Lab round trip has to use the
        # BGR conversions; otherwise red and blue swap roles in the lightness
        # channel that is sharpened and smoothed below.
        im = cv.cvtColor(im, cv.COLOR_BGR2Lab)
        (L, a, b) = cv.split(im)
        ksiz = int(dpi*0.015)
        ksiz -= ksiz%2-1  # force an odd kernel size
        L = unsharpMask(L, kernel_size=(ksiz,ksiz), amount=2)
        L = cv.bilateralFilter(L, -1, 12, dpi*0.018)
        im = cv.cvtColor(cv.merge((L, a, b)), cv.COLOR_Lab2BGR)
        im = autoColorContrast(im, mono, dpi)
        t.update()
        t.set_description("Detect skew")
        angle = getRot(im)
        t.update()
        t.set_description("Deskew")
        im = rotate(im, angle)
        mono = rotate(mono, angle)
        t.update()
        t.set_description("OCR")
        text = reader.readtext(mono)
        t.update()
        t.set_description("Color mask")
        cmask = getColorMask(im, dpi)
        t.update()
        t.set_description("Color segments")
        csegs = getColorSegments(im, mono, cmask)
        t.update()
        # Blank the colour regions in the monochrome layer; they are stored
        # separately as colour segments.
        mono[cmask==255] = 255
    return mono, csegs, text
@entrypoint
 def main(output, langs=["en"], dpi=600, *imgs):
    # Command-line entry point: process every scan in ``imgs`` and write the
    # combined PDF to ``output``. ``langs`` are the OCR language codes handed
    # to easyocr.Reader and ``dpi`` is the scan resolution.
    # NOTE(review): documented with comments rather than a docstring because
    # the entrypoint decorator may derive the CLI help from the docstring -
    # confirm before adding one.
    reader = easyocr.Reader(langs)
    jpeg_opts = [
        cv.IMWRITE_JPEG_QUALITY, 90,
        cv.IMWRITE_JPEG_OPTIMIZE, 1,
        cv.IMWRITE_JPEG_PROGRESSIVE, 1]
    png_opts = [
        cv.IMWRITE_PNG_BILEVEL, 1,
        cv.IMWRITE_PNG_COMPRESSION, 9]
    # All intermediate files live in a temporary directory that is removed
    # once the PDF has been written.
    with tqdm(total=3, leave=False) as t, TemporaryDirectory() as tmp:
        tp = Path(tmp)
        files, colorimgs, texts = [], [], []
        t.set_description("Process pages")
        with tqdm(total=len(imgs), leave=False) as pt:
            for pagen, img in enumerate(imgs):
                pt.set_description(f"Process {img}")
                mono, csegs, text = processImage(img, reader, dpi)
                page_file = str(tp / f"p{pagen}.tif")
                files.append(page_file)
                cv.imwrite(page_file, mono)
                pimgs = []
                # One JPEG plus one bilevel PNG mask per colour segment.
                for i, (r, simg, smask) in enumerate(csegs):
                    bp = tp / f"p{pagen}_{i}.jpg"
                    mp = tp / f"p{pagen}_{i}_m.png"
                    cv.imwrite(str(bp), simg, jpeg_opts)
                    cv.imwrite(str(mp), smask, png_opts)
                    pimgs.append((r, bp, mp))
                colorimgs.append(pimgs)
                texts.append(text)
                pt.update()
        t.update()
        t.set_description("JBIG2 compress")
        # jbig2 runs inside the temporary directory; its results are read
        # back from there as "output.sym" and "output.NNNN".
        run(["jbig2", "-s", "-d", "-a", "-p", *files], capture_output=True, check=True, cwd=tp)
        symtab = tp / "output.sym"
        pageblobs = [tp / f"output.{p:04d}" for p in range(len(files))]
        t.update()
        t.set_description("Create PDF")
        with open(output, "wb") as outf:
            outf.write(mkpdf(symtab, pageblobs, colorimgs, texts, dpi))
        t.update()