scanproc/pdfutil.py

181 lines
5.0 KiB
Python
Executable File

#!/usr/bin/env python3
import struct
import fitz
import cv2 as cv
class Dict:
    """Minimal PDF dictionary that serializes itself to raw bytes.

    Keys are stored without the leading slash (for example ``b"Type"``) and
    values are stored as already-serialized PDF tokens (for example
    ``b"/Catalog"``); ``__bytes__`` only adds the delimiters around them.
    """

    def __init__(self, values=None):
        """Create the dictionary, copying the entries of *values* if given.

        Args:
            values: Optional mapping (or iterable of key/value pairs) of
                ``bytes`` keys to ``bytes`` values. It is copied, so later
                changes to ``self.d`` never reach the caller's object.
        """
        # A ``None`` default avoids the shared-mutable-default pitfall of
        # ``values={}``.
        self.d = dict(values) if values is not None else {}

    def __bytes__(self):
        """Return ``<< /Key value ... >>`` with one entry per line."""
        entries = [
            b"/" + key + b" " + value + b"\n"
            for key, value in self.d.items()
        ]
        return b"<< " + b"".join(entries) + b">>\n"
class Obj:
    """Body of a PDF indirect object: a dictionary plus an optional stream.

    ``__bytes__`` emits the dictionary, the stream (if any) and the closing
    ``endobj`` keyword. The leading ``N 0 obj`` header is not emitted here;
    the containing document writes it using ``idx``.
    """

    def __init__(self, d=None, stream=None):
        """Build the object from dictionary entries and optional stream data.

        Args:
            d: Optional mapping of ``bytes`` keys to ``bytes`` values for the
                object's dictionary. The mapping is copied and never modified.
            stream: Optional ``bytes`` payload. When given, a ``Length`` entry
                holding its size is added to the dictionary.
        """
        # Copy first: writing ``Length`` straight into ``d`` would modify the
        # caller's dict, and with a ``{}`` default it would leak a stale
        # ``Length`` into every later object created without arguments.
        entries = dict(d) if d is not None else {}
        if stream is not None:
            entries[b"Length"] = str(len(stream)).encode()
        self.d = Dict(entries)
        self.stream = stream
        # Object number; 0 until the object is registered with a document.
        self.idx = 0

    def __bytes__(self):
        """Return the serialized dictionary, stream section and ``endobj``."""
        parts = [bytes(self.d)]
        if self.stream is not None:
            parts.extend((b"stream\n", self.stream, b"\nendstream\n"))
        parts.append(b"endobj\n")
        return b"".join(parts)
class Doc:
    """Collects indirect objects and serializes them as a PDF 1.4 file.

    Objects are numbered in the order they are added, starting at 1. The
    trailer always names object 1 as ``/Root``, so the document catalog must
    be the first object added.
    """

    def __init__(self):
        """Start an empty document."""
        # Every registered object, in object-number order.
        self.objs = []
        # The subset of objects registered through ``add_page``.
        self.pages = []

    def add_object(self, o):
        """Assign the next object number to *o*, store it and return it."""
        o.idx = len(self.objs) + 1
        self.objs.append(o)
        return o

    def add_page(self, o):
        """Register *o* as a page and as a regular object; return it."""
        self.pages.append(o)
        return self.add_object(o)

    def __bytes__(self):
        """Return the complete file: header, objects, xref table and trailer.

        Each object must provide ``idx`` and ``__bytes__``.
        """
        chunks = [b"%PDF-1.4"]
        # Byte offset at which the next chunk will start. Chunks are joined
        # with a single "\n", hence the +1 per chunk.
        position = len(chunks[0]) + 1
        offsets = []
        for o in self.objs:
            offsets.append(position)
            for chunk in (f"{o.idx} 0 obj".encode(), bytes(o)):
                chunks.append(chunk)
                position += len(chunk) + 1

        # After the loop ``position`` is where the "xref" keyword starts.
        entry_count = len(offsets) + 1
        chunks.append(b"xref")
        chunks.append(f"0 {entry_count}".encode())
        # Entries are 20 bytes each once the joining "\n" is added; the
        # trailing space inside each literal is part of that fixed width.
        chunks.append(b"0000000000 65535 f ")
        chunks.extend(f"{offset:010d} 00000 n ".encode() for offset in offsets)
        chunks.append(b"")
        chunks.append(b"trailer")
        chunks.append(f"<< /Size {entry_count}\n/Root 1 0 R >>".encode())
        chunks.append(b"startxref")
        chunks.append(str(position).encode())
        # The end-of-file marker has two percent signs.
        chunks.append(b"%%EOF")
        return b"\n".join(chunks)
def ref(x):
    """Return the PDF indirect reference ``<x> 0 R`` for object number *x*."""
    reference = "{} 0 R".format(x)
    return reference.encode()
def _text_rect(bbox, dpi):
    """Return the axis-aligned ``fitz.Rect`` in points around a pixel box."""
    # bbox holds four (x, y) corners; presumably ordered top-left, top-right,
    # bottom-right, bottom-left -- TODO confirm against the OCR producer.
    def to_pt(value):
        return (value * 72.0) / dpi

    return fitz.Rect(
        min(to_pt(bbox[0][0]), to_pt(bbox[3][0])),
        min(to_pt(bbox[0][1]), to_pt(bbox[1][1])),
        max(to_pt(bbox[1][0]), to_pt(bbox[2][0])),
        max(to_pt(bbox[2][1]), to_pt(bbox[3][1])),
    )


def mkpdf(symtab, pageblobs, colorimgs, texts, dpi):
    """Assemble a PDF from JBIG2 pages, colour overlays and hidden text.

    Args:
        symtab: Path of the file whose bytes become the shared
            ``/JBIG2Globals`` stream.
        pageblobs: Iterable of paths, one JBIG2 page stream per output page.
        colorimgs: One iterable per page of ``(rect, img, mask)`` tuples.
            ``rect`` holds four coordinates in pixels at *dpi*, ``img`` is an
            image file path and ``mask`` is the path of a file whose bytes
            are passed as the image mask.
        texts: One iterable per page of boxes, where ``box[0]`` is a list of
            four ``(x, y)`` pixel corners and ``box[1]`` is the text.
        dpi: Resolution used to convert pixels to points, and the fallback
            when a page blob stores a zero resolution.

    Returns:
        The finished PDF as ``bytes``.
    """
    doc = Doc()
    # The catalog must be object 1: Doc's trailer hard-codes "/Root 1 0 R".
    catalog = doc.add_object(Obj({b'Type': b'/Catalog'}))
    outlines = doc.add_object(Obj({
        b'Type': b'/Outlines',
        b'Count': b'0',
    }))
    pages = doc.add_object(Obj({b'Type': b'/Pages'}))
    # Fill in the references from the real object numbers instead of
    # assuming that the outlines and page tree are objects 2 and 3.
    catalog.d.d[b'Outlines'] = ref(outlines.idx)
    catalog.d.d[b'Pages'] = ref(pages.idx)

    with open(symtab, "rb") as stab:
        symd = doc.add_object(Obj({}, stab.read()))

    for path in pageblobs:
        with open(path, "rb") as pf:
            blob = pf.read()
        # Width, height and x/y resolution are four big-endian uint32 values
        # at byte offset 11 (presumably right after the page-information
        # segment header -- verify against the encoder's output).
        (width, height, xres, yres) = struct.unpack_from(">IIII", blob, 11)
        if xres == 0:
            xres = dpi
        if yres == 0:
            yres = dpi
        xobj = doc.add_object(Obj({
            b'Type': b'/XObject',
            b'Subtype': b'/Image',
            b'Width': str(width).encode(),
            b'Height': str(height).encode(),
            b'ColorSpace': b'/DeviceGray',
            b'BitsPerComponent': b'1',
            b'Filter': b'/JBIG2Decode',
            b'DecodeParms': f' << /JBIG2Globals {symd.idx} 0 R >>'.encode(),
        }, blob))
        # Page size in points (72 per inch).
        fw = float(width * 72) / xres
        fh = float(height * 72) / yres
        contents = doc.add_object(
            Obj({}, f'q {fw:.1f} 0 0 {fh:.1f} 0 0 cm /Im1 Do Q'.encode())
        )
        resources = doc.add_object(Obj({
            b'ProcSet': b'[/PDF /ImageB]',
            b'XObject': f'<< /Im1 {xobj.idx} 0 R >>'.encode(),
        }))
        doc.add_page(Obj({
            b'Type': b'/Page',
            b'Parent': ref(pages.idx),
            b'MediaBox': f'[ 0 0 {fw:.1f} {fh:.1f} ]'.encode(),
            b'Contents': ref(contents.idx),
            b'Resources': ref(resources.idx),
        }))

    pages.d.d[b'Count'] = str(len(doc.pages)).encode()
    pages.d.d[b'Kids'] = b'[' + b' '.join(ref(p.idx) for p in doc.pages) + b']'

    # Re-open the hand-built file with PyMuPDF to add overlays and text.
    pdf = fitz.open(stream=bytes(doc), filetype="pdf")

    for page_no, imgs in enumerate(colorimgs):
        for rect, img, mask in imgs:
            with open(mask, "rb") as mf:
                mask_bytes = mf.read()
            pdf[page_no].insert_image(
                fitz.Rect(*[(c * 72.0) / dpi for c in rect]),
                filename=img,
                mask=mask_bytes,
            )

    for page_no, boxes in enumerate(texts):
        for box in boxes:
            text = box[1]
            if not text:
                continue
            r = _text_rect(box[0], dpi)
            fs = r.height
            text_len = fitz.get_text_length(text, fontsize=fs)
            if text_len <= 0:
                # Nothing measurable to stretch (e.g. a zero-height box);
                # dividing by it below would raise ZeroDivisionError.
                continue
            pivot = fitz.Point(r.x0, r.y0)
            # Scale horizontally so the text spans the full box width.
            matrix = fitz.Matrix(r.width / text_len, 1.0)
            pdf[page_no].insert_text(
                fitz.Point(r.x0, r.y0 + r.height * 0.8),
                text,
                fontsize=fs,
                morph=(pivot, matrix),
                render_mode=3,  # 3 = invisible text
            )

    return pdf.tobytes(garbage=4, clean=True, deflate=True)