scanproc/scanproc.py

#!/usr/bin/env python3

import easyocr
import cv2 as cv
import numpy as np
import warnings

from tqdm import tqdm
from deskew import determine_skew
from PIL import Image, ImageOps, ImageEnhance
from entrypoint2 import entrypoint
from pathlib import Path
from tempfile import TemporaryDirectory
from subprocess import run

from pdfutil import mkpdf

# Silence every warning category for the whole process. This also hides
# deprecation notices from the imported libraries.
# NOTE(review): presumably done to keep library noise out of the tqdm
# progress output -- confirm, and consider narrowing the filter.
warnings.filterwarnings("ignore")

def rotate(img, angle: float):
    """Rotate ``img`` by ``angle`` degrees around its centre.

    The output keeps the input's width and height. Resampling is bicubic and
    pixels that fall outside the source are filled by replicating the border.
    """
    height, width = img.shape[:2]
    pivot = (width // 2, height // 2)
    # Scale factor 1.0: pure rotation, no zoom.
    matrix = cv.getRotationMatrix2D(pivot, angle, 1.0)
    return cv.warpAffine(
        img,
        matrix,
        (width, height),
        flags=cv.INTER_CUBIC,
        borderMode=cv.BORDER_REPLICATE,
    )

def getRot(mask):
    """Estimate the skew angle of a page image.

    The image is first resized to a width of at most 500 px (aspect ratio
    preserved) to keep the skew detection cheap, then handed to
    ``deskew.determine_skew``.

    Args:
        mask: Image array. A single-channel image is expected; images with
            3 or 4 channels are reduced to grayscale first because the skew
            detector works on 2-D input.

    Returns:
        The detected skew angle, or ``0.0`` when no angle could be
        determined, so the result can always be passed to ``rotate``.
    """
    (h, w) = mask.shape[:2]

    nw = min(w, 500)
    # Never let the scaled height collapse to zero for very wide inputs.
    nh = max(1, int(h * (nw / w)))
    sm = cv.resize(mask, (nw, nh))

    if sm.ndim == 3:
        # Channel order barely matters for edge-based skew detection, so the
        # BGR-flavoured conversion codes are used regardless of the actual order.
        code = cv.COLOR_BGRA2GRAY if sm.shape[2] == 4 else cv.COLOR_BGR2GRAY
        sm = cv.cvtColor(sm, code)

    angle = determine_skew(sm)
    # determine_skew reports "no angle found" as None.
    return 0.0 if angle is None else angle

def getMono(img):
    """Return a binarised version of ``img``.

    The image is converted to grayscale, thresholded with Otsu's method
    (output values 0 and 255) and then cleaned up with a 3x3 morphological
    closing.
    """
    grayscale = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
    # The threshold argument (0) is ignored; Otsu picks the level itself.
    binary = cv.threshold(grayscale, 0, 255, cv.THRESH_BINARY + cv.THRESH_OTSU)[1]
    return cv.morphologyEx(binary, cv.MORPH_CLOSE, np.ones((3, 3), np.uint8))

def getColorMask(img, dpi):
    """Build a binary mask of the saturated (coloured) regions of ``img``.

    Pixels whose second/third HSV channels are at least 40/10 are selected,
    the selection is eroded once and dilated five times with a
    resolution-dependent kernel, blurred, and finally re-binarised with
    Otsu's threshold.

    Args:
        img: 3-channel image. It is converted with ``COLOR_BGR2HSV``; the hue
            range is unrestricted, so only saturation and value matter.
        dpi: Scan resolution, used to scale the kernel and blur sizes.

    Returns:
        Single-channel mask with values 0 and 255.
    """
    hsv = cv.cvtColor(img, cv.COLOR_BGR2HSV)
    mask = cv.inRange(hsv, np.array([0, 40, 10]), np.array([255, 255, 255]))

    # ``| 1`` forces an odd size (and at least 1) for the structuring element.
    side = int(dpi * 0.005) | 1
    kernel = np.ones((side, side), np.uint8)
    mask = cv.dilate(cv.erode(mask, kernel), kernel, iterations=5)

    # Gaussian kernel sizes must be odd as well.
    blur = int(dpi * .1) | 1
    mask = cv.GaussianBlur(mask, (blur, blur), 0)
    return cv.threshold(mask, 0, 255, cv.THRESH_BINARY + cv.THRESH_OTSU)[1]

def autoColorContrast(img, mono, dpi):
    """Auto-contrast ``img`` and raise its lightness by 30 %.

    The inverse of ``mono`` is dilated five times and passed as the ``mask``
    argument of ``ImageOps.autocontrast`` (cutoff ``(20, 30)``, tone
    preserved). The result is converted to HLS, the L channel is multiplied
    by 1.3 and clipped to 0..255, and the image is converted back.

    NOTE(review): the conversion codes used here (BGR2RGB, BGR2HLS, HLS2RGB)
    swap the first and third channel an odd number of times overall, so the
    output channel order is the reverse of the order Pillow saw. Callers
    appear to rely on this; do not "fix" one conversion in isolation.
    """
    # Odd structuring-element size derived from the resolution (at least 1).
    side = int(dpi * 0.005) | 1
    region = cv.dilate(
        cv.bitwise_not(mono),
        np.ones((side, side), np.uint8),
        iterations=5,
    )

    pil_img = Image.fromarray(cv.cvtColor(img, cv.COLOR_BGR2RGB))
    stretched = ImageOps.autocontrast(
        pil_img, (20, 30), mask=Image.fromarray(region), preserve_tone=True
    )

    hue, light, sat = cv.split(cv.cvtColor(np.asarray(stretched), cv.COLOR_BGR2HLS))
    # Scale in float32 so the multiplication cannot overflow uint8, then clamp.
    light = np.clip(light.astype("float32") * 1.3, 0, 255).astype("uint8")

    return cv.cvtColor(cv.merge((hue, light, sat)), cv.COLOR_HLS2RGB)

def getColorSegments(img, mono, cmask):
    """Lazily yield one cropped segment per contour found in ``cmask``.

    Args:
        img: Image to crop the segments from.
        mono: Unused; kept for interface compatibility.
        cmask: Single-channel mask whose contours define the segments.

    Yields:
        ``((x1, y1, x2, y2), img_crop, mask_crop)`` where the coordinates are
        the contour's bounding box (``x2``/``y2`` exclusive) and both crops
        are slices of the inputs taken over that box.
    """
    contours, _ = cv.findContours(cmask, cv.RETR_LIST, cv.CHAIN_APPROX_SIMPLE)

    for contour in contours:
        # boundingRect reports (x, y, width, height); convert to corner form.
        left, top, width, height = cv.boundingRect(contour)
        right = left + width
        bottom = top + height
        box = (left, top, right, bottom)
        yield box, img[top:bottom, left:right], cmask[top:bottom, left:right]

def unsharpMask(image, kernel_size=(5, 5), sigma=1.0, amount=1.0, threshold=0):
    """Return a sharpened ``uint8`` copy of ``image`` using an unsharp mask.

    Args:
        image: Input image array (the callers in this file pass ``uint8``).
        kernel_size: Gaussian kernel size ``(width, height)``; both odd.
        sigma: Gaussian standard deviation.
        amount: Strength of the sharpening; the result is
            ``(amount + 1) * image - amount * blurred``.
        threshold: When greater than 0, pixels whose absolute difference to
            the blurred image is below this value keep their original value.

    Returns:
        The sharpened image, clamped to 0..255 and rounded to ``uint8``.
    """
    blurred = cv.GaussianBlur(image, kernel_size, sigma)
    sharpened = float(amount + 1) * image - float(amount) * blurred
    # Clamp to the valid 8-bit range before rounding.
    sharpened = np.minimum(np.maximum(sharpened, 0), 255)
    sharpened = sharpened.round().astype(np.uint8)
    if threshold > 0:
        # Take the difference in floating point: subtracting unsigned arrays
        # directly wraps around (e.g. 10 - 12 == 254) and would misclassify
        # every pixel that is darker than its blurred neighbourhood.
        difference = image.astype(np.float64) - blurred.astype(np.float64)
        low_contrast_mask = np.absolute(difference) < threshold
        np.copyto(sharpened, image, where=low_contrast_mask)
    return sharpened

def processImage(img, reader, dpi):
    """Run the full per-page pipeline on one scanned image file.

    Steps: read the file, binarise it, sharpen/denoise the lightness channel,
    auto-contrast the colour image, detect and remove skew, OCR the binary
    page, locate coloured regions and blank those regions out of the binary
    page.

    Args:
        img: Path of the image file to read.
        reader: OCR reader object; its ``readtext`` method is called with the
            deskewed binary page.
        dpi: Scan resolution, used to scale filter sizes.

    Returns:
        ``(mono, csegs, text)``: the deskewed binary page with coloured
        regions set to 255, a lazy generator of colour segments (see
        ``getColorSegments``), and the value returned by ``reader.readtext``.

    Raises:
        ValueError: If the image file cannot be read.
    """
    with tqdm(total=8, leave=False) as t:
        t.set_description("Reading image")
        im = cv.imread(img)
        if im is None:
            # imread signals failure by returning None instead of raising.
            raise ValueError(f"Could not read image: {img}")
        t.update()

        t.set_description("Filter")
        mono = getMono(im)

        # Sharpen and denoise only the lightness channel. The RGB2Lab /
        # Lab2RGB pair is symmetric, so the channel order of ``im`` is the
        # same after the round trip as before it.
        im = cv.cvtColor(im, cv.COLOR_RGB2Lab)
        (L, a, b) = cv.split(im)
        ksiz = int(dpi*0.015)
        ksiz -= ksiz%2-1  # force an odd kernel size
        L = unsharpMask(L, kernel_size=(ksiz,ksiz), amount=2)
        L = cv.bilateralFilter(L, -1, 12, dpi*0.018)
        im = cv.cvtColor(cv.merge((L, a, b)), cv.COLOR_Lab2RGB)

        im = autoColorContrast(im, mono, dpi)
        t.update()

        t.set_description("Detect skew")
        # Skew detection needs a single-channel image, so use the binary
        # page rather than the 3-channel colour image.
        angle = getRot(mono)
        if angle is None:
            # No dominant angle found: leave the page unrotated.
            angle = 0.0
        t.update()

        t.set_description("Deskew")
        im = rotate(im, angle)
        mono = rotate(mono, angle)
        t.update()

        t.set_description("OCR")
        text = reader.readtext(mono)
        t.update()

        t.set_description("Color mask")
        cmask = getColorMask(im, dpi)
        t.update()

        t.set_description("Color segments")
        # Lazy generator: the crops are produced when the caller iterates.
        csegs = getColorSegments(im, mono, cmask)
        t.update()

        t.set_description("Merge masks")
        # Coloured regions are stored separately, so blank them (white) in
        # the binary page.
        mono[cmask==255] = 255
        t.update()

    return mono, csegs, text

@entrypoint
def main(output, langs=["en"], dpi=600, *imgs):
    # Command-line entry point: process every image in ``imgs`` as one page,
    # JBIG2-compress the binary pages and write the assembled PDF to
    # ``output``.
    # NOTE(review): kept as ``#`` comments on purpose -- entrypoint2 appears
    # to derive the CLI help text from the docstring; confirm before adding one.
    reader = easyocr.Reader(langs)

    jpeg_params = [
        cv.IMWRITE_JPEG_QUALITY, 90,
        cv.IMWRITE_JPEG_OPTIMIZE, 1,
        cv.IMWRITE_JPEG_PROGRESSIVE, 1]
    png_params = [
        cv.IMWRITE_PNG_BILEVEL, 1,
        cv.IMWRITE_PNG_COMPRESSION, 9]

    # All intermediate files live in a temporary directory that is removed
    # once the PDF has been written.
    with tqdm(total=3, leave=False) as stages, TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        page_files = []
        page_segments = []
        page_texts = []

        stages.set_description("Process pages")
        with tqdm(total=len(imgs), leave=False) as pages:
            for page_no, src in enumerate(imgs):
                pages.set_description(f"Process {src}")
                mono, csegs, text = processImage(src, reader, dpi)

                # Binary page, later fed to the jbig2 encoder.
                mono_path = str(workdir / f"p{page_no}.tif")
                page_files.append(mono_path)
                cv.imwrite(mono_path, mono)

                # Each colour segment is stored as a JPEG plus a bilevel PNG mask.
                segments = []
                for idx, (box, crop, crop_mask) in enumerate(csegs):
                    jpeg_path = workdir / f"p{page_no}_{idx}.jpg"
                    mask_path = workdir / f"p{page_no}_{idx}_m.png"
                    cv.imwrite(str(jpeg_path), crop, jpeg_params)
                    cv.imwrite(str(mask_path), crop_mask, png_params)
                    segments.append((box, jpeg_path, mask_path))
                page_segments.append(segments)

                page_texts.append(text)
                pages.update()
        stages.update()

        stages.set_description("JBIG2 compress")
        # Runs in the temp dir; the encoder's outputs are expected there as
        # ``output.sym`` and ``output.NNNN`` (one per page).
        run(["jbig2", "-s", "-d", "-a", "-p", *page_files], capture_output=True, check=True, cwd=workdir)
        symtab = workdir / "output.sym"
        pageblobs = [workdir / f"output.{page:04d}" for page in range(len(page_files))]
        stages.update()

        stages.set_description("Create PDF")
        pdf_bytes = mkpdf(symtab, pageblobs, page_segments, page_texts, dpi)
        with open(output, "wb") as outf:
            outf.write(pdf_bytes)

        stages.update()