In [31]:
import util, cv2
import numpy as np
from config import *
pdf = "page_50.pdf"
# pdf = "page_60.pdf"
# pdf = "page_1.pdf"
words, word_size, word_size_stats, page = util.getWords(pdf, 0)
p2i_task = util.pdf2img(pdf, 0, "page_1.png")
page_num = next(p2i_task)
print("page_num:", page_num)
# 启动协程
p2i_task.send(None)
p2i_task.send(0)
img = cv2.imread("page_1.png")
page_num: 1 save page_1.png page 0
确定流程图区域¶
In [32]:
len(page.curves)
Out[32]:
8
In [33]:
_ = util.drawRects(img, getRcRects(page.curves, page.height), line_color=[255, 0, 0], show_result_img=True)
In [34]:
util.canShow = True
MORPHOLOG_EX_SIZE = 10
_img = util.drawRects(np.zeros(img.shape[:2]), getRectsInPage(page), line_color=255, show_result_img=True)
k = np.ones((MORPHOLOG_EX_SIZE, 1))
vertical_close = cv2.morphologyEx(_img, cv2.MORPH_CLOSE, k)
util.imshow(vertical_close)
k = np.ones((1, MORPHOLOG_EX_SIZE))
horizonal_close = cv2.morphologyEx(vertical_close, cv2.MORPH_CLOSE, k)
util.imshow(horizonal_close)
In [35]:
rs = util.cr2xy(util.simpleLocate(horizonal_close)[-1])
cv2.rectangle(_img, (rs[:2]), (rs[2:]), (255,255,255), 1)
util.imshow(_img)
In [36]:
import table
h, w = img.shape[:2]
table.generate_table_bbox(_img, "out", "table", line_scale=int(h/25))
In [37]:
dt = util.json_load("table/out.json")
h, w = img.shape[:2]
r = dt[0]
rect = util.map_list(int, [h - r["y_left"], r["x_left"], h - r["y_right"], r["x_right"]])
rect = util.map_list(int, [h - 454, r["x_left"], h - 583, r["x_right"]])
util.drawRects(img, rect, line_color=[255,0,255], show_result_img=True)
Out[37]:
array([[[255, 255, 255], [255, 255, 255], [255, 255, 255], ..., [255, 255, 255], [255, 255, 255], [255, 255, 255]], [[255, 255, 255], [255, 255, 255], [255, 255, 255], ..., [255, 255, 255], [255, 255, 255], [255, 255, 255]], [[255, 255, 255], [255, 255, 255], [255, 255, 255], ..., [255, 255, 255], [255, 255, 255], [255, 255, 255]], ..., [[255, 255, 255], [255, 255, 255], [255, 255, 255], ..., [255, 255, 255], [255, 255, 255], [255, 255, 255]], [[255, 255, 255], [255, 255, 255], [255, 255, 255], ..., [255, 255, 255], [255, 255, 255], [255, 255, 255]], [[255, 255, 255], [255, 255, 255], [255, 255, 255], ..., [255, 255, 255], [255, 255, 255], [255, 255, 255]]], dtype=uint8)
In [38]:
curve_rects = getRcRects(page.curves, page.height)
In [43]:
page_area_info = dict()
page_area_info["flow_chart"] = []
In [61]:
def has_cross_more_than(img, num, line_len = 20):
h, w = img.shape[:2]
if h < line_len or w < line_len:
return False
k = cv2.getStructuringElement(cv2.MORPH_CROSS, (line_len, line_len))
_img = util.getSubImg(cv2.erode(img, k), [2, 2, h - 2, w -2])
cross_num = len(util.simpleLocate(_img))
logger.info(f"cross_num={cross_num}")
return cross_num > num
In [62]:
may_be_flow_chart = util.filter_list(lambda rect: util.calc_h(rect) > 50 and util.calc_w(rect) > 50 , util.simpleLocate(horizonal_close))
for rect in may_be_flow_chart:
if len(util.filter_list(lambda r: util.isInnerCell(r, rect), curve_rects)) >= 4:
page_area_info["flow_chart"].append(rect)
if not has_cross_more_than(util.getSubImg(_img, rect), 3):
page_area_info["flow_chart"].append(rect)
8
In [40]:
util.drawRects(img, util.simpleLocate(horizonal_close), show_result_img=True)
Out[40]:
array([[[255, 255, 255], [255, 255, 255], [255, 255, 255], ..., [255, 255, 255], [255, 255, 255], [255, 255, 255]], [[255, 255, 255], [255, 255, 255], [255, 255, 255], ..., [255, 255, 255], [255, 255, 255], [255, 255, 255]], [[255, 255, 255], [255, 255, 255], [255, 255, 255], ..., [255, 255, 255], [255, 255, 255], [255, 255, 255]], ..., [[255, 255, 255], [255, 255, 255], [255, 255, 255], ..., [255, 255, 255], [255, 255, 255], [255, 255, 255]], [[255, 255, 255], [255, 255, 255], [255, 255, 255], ..., [255, 255, 255], [255, 255, 255], [255, 255, 255]], [[255, 255, 255], [255, 255, 255], [255, 255, 255], ..., [255, 255, 255], [255, 255, 255], [255, 255, 255]]], dtype=uint8)
In [6]:
def write_file(out_file, content, mode='w', encoding = 'utf-8'):
try:
with open(out_file, mode=mode, encoding=encoding) as fw:
fw.write(content)
return True
except Exception as e:
logger.error(f"Failed to write {content} to {out_file}")
return False
def getOuterRect(rects):
outer_rects = []
walked = []
for _r in rects:
if notInRects(_r, walked):
outer_rects.append(_r)
walked.append(_r)
return outer_rects
def notInRects(_r, walked):
if len(walked) < 1:
return True
util.logger.debug(util.map_list(lambda big_cell: util.isInnerCell(_r, big_cell, strict=True)[0], walked))
return not any(util.map_list(lambda big_cell: util.isInnerCell(_r, big_cell,strict=True)[0], walked))
In [7]:
# !/bin/bash/python3
# -*- coding=utf-8 -*-
# %% [markdown]
# # 1.1 确定表格和图片类区域
import cv2
from argparse import ArgumentParser
from importlib import reload
import sys
from config import *
import logging
logger = logging.getLogger(__name__)
argp = ArgumentParser()
argp.add_argument("-f", "--file", required=True,
help="path of the target pdf.")
argp.add_argument("-z", "--zoom", required=False, default="1.0", help="zoom.")
argp.add_argument("-p", "--page", required=False,
help="page, if not been passed on, will deal all.")
argp.add_argument("-o", "--output", required=False, help="output dir.")
argp.add_argument("--outfile_format", required=False, help="output file name format if not pass page.")
argp.add_argument("--outfile", required=False, help="output file name if pass page")
# pdf_paths = [pdfs] if isinstance(pdfs, str) else pdfs
# for pdf_path in pdf_paths:
pdf_path = r"page_1.pdf"
pg = 1
zoom='1.0'
TURN_TO_IMG_EXIT_CODE = 1
# def main(pdf_path, pg=None, zoom=1.0, kv=None):
import os
# logger.info("pdf_path=%s, page=%s, zoom=%s, kv=%s", pdf_path, pg, zoom, kv)
zoom = eval(zoom)
reload(util)
util.canShow = False
# if not cleanTemp:
from datetime import datetime
name = os.path.basename(pdf_path)
util.TEMP_DIR = os.path.join(os.path.dirname(util.TEMP_DIR),
name + "_" + datetime.now().strftime("%m-%d-%S"))
tempImg = os.path.join(os.path.dirname(pdf_path), f"page_{pg}.png")
In [8]:
kv = dict()
of = os.path.join(kv["output"] or "", kv["outfile"]) \
if kv["outfile"] else os.path.join(kv["output"] or "",
kv["outfile_format"] % (pg + 1)
)
_dir = os.path.dirname(of)
if not os.path.exists(_dir):
try:
logger.info(f"创建文件夹{_dir} {'成功' if os.mkdir(_dir) else '失败'}")
except Exception as e:
logger.warn("Erro, %s", e)
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) <ipython-input-8-be3a79ee4a8d> in <module> 1 kv = dict() 2 of = os.path.join(kv["output"] or "", kv["outfile"]) \ ----> 3 if kv["outfile"] else os.path.join(kv["output"] or "", 4 kv["outfile_format"] % (pg + 1) 5 ) KeyError: 'outfile'
In [9]:
tempImg
Out[9]:
'page_1.png'
In [10]:
if os.path.exists(pdf_path[:-4]):
tempImg = os.path.join(pdf_path[:-4], f"page_{pg}.png")
p2i_task = util.pdf2img(pdf_path, pg, tempImg, zoom_x=zoom, zoom_y=zoom)
page_num = next(p2i_task)
print("page_num:", page_num)
# 启动协程
p2i_task.send(None)
if pg is None:
pages = list(range(page_num))
else:
pages = [int(pg) - 1]
pg = pages[0]
# for pg in pages:
p2i_task.send(pg)
# 基图
img0 = cv2.imread(tempImg)
page_num: 1 save page_1.png page 0
In [11]:
words, w_size, w_s_stats, page = util.getWords(pdf_path, pg)
# TODO
word_size_as_th = max(9.8, getSeriesValue(w_s_stats) or 1)
expand = max(0, int(zoom * word_size_as_th / 2))
image_area_rects = getRcRects(page.images, page.height, zoom)
line_area_rects = getRcRects(page.lines, page.height, zoom)
rect_area_rects = getRcRects(page.rects, page.height, zoom)
img0_and_0 = np.zeros(img0.shape[:2])
getRcRects, zoom= 1.0 getRcRects, zoom= 1.0 getRcRects, zoom= 1.0
In [12]:
# 如果没有文字或者, 有很大的图片, 提前结束
page_area_info = dict()
page_area = page.height * page.width
if len(page.chars) < 0 \
or any(util.filter_list(lambda rect: util.calc_area(rect) > page_area * 0.64,image_area_rects)):
page_area_info["children_page_rects"] = [
[0, 0, page.height - 1, page.width - 1]]
# 写入文件
write_file(of, util.dumps(page_area_info))
# TODO BREAK?
sys.exit(TURN_TO_IMG_EXIT_CODE)
In [50]:
util.canShow = True
# _img = util.drawRects(img0, getRectsInPage(page), show_result_img=True)
_img = util.drawRects(img0_and_0, getRectsInPage(
page), show_result_img=True, line_color=[255, 255, 255])
# 找出所有的类表格区域
rects = util.getMaxRects(_img)
# 重新计算, 找出外边界
outer_rects = getOuterRect(rects)
# 找页眉
# 找有横线的页眉
for _r in outer_rects:
if util.calc_h(_r) < 5 \
and _r[2] < page.height * 0.15 \
and util.calc_w(_r) > page.width * 0.6:
page_area_info["head"] = _r[2]
page_area_info["head"]
getRcRects, zoom= 1 getRcRects, zoom= 1 getRcRects, zoom= 1 getRcRects, zoom= 1
Out[50]:
56
In [51]:
# image_area_rects = getMaxAreaRect(img0, image_area_rects)
# line_area_rects = getMaxAreaRect(img0, line_area_rects)
rect_area_rects = outer_rects
# %%
# rect 区域调整
# rect_area_rects2 = []
# for r in rect_area_rects:
# if util.calc_area(r) < 1:
# continue
# _img = util.getSubImg(img0, r)
# _img_bi = util.getBinary(_img, 250, 0, 255)
# util.imshow(_img_bi)
# kh, kw = list(map(int, (1, word_size_as_th * 2)))
# k = np.ones((kh, kw))
# _im = cv2.morphologyEx(_img_bi, cv2.MORPH_CLOSE, k)
# util.imshow(_im, "闭运算列连接")
# kh, kw = list(map(int, (word_size_as_th * 2, 1)))
# k = np.ones((kh, kw))
# _im = cv2.morphologyEx(_im, cv2.MORPH_CLOSE, k)
# util.imshow(_im, "闭运算行连接")
# temp = util.getMaxRectsTwice(_im, offset=r)
# # 画上区域
# _ = util.drawRects(img0, temp, show_result_img=True,
# line_color=[255, 0, 255])
# rect_area_rects2.extend(temp)
# rect_area_rects = rect_area_rects2
# %%
img0_binary = util.getBinary(img0, 200, 255, 0)
img0_rected = _img
In [52]:
# 区域划分示意图
_ = util.drawRects(img0, outer_rects,
line_color=[0, 0, 255], show_result_img=True)
In [53]:
# 隐藏图片和表格区域
# img0_binary_filled = util.drawRects(img0_binary,
# outer_rects,
# line_color=[255, 255, 255],
# fill_color=[255, 255, 255],
# show_result_img=False)
# util.imshow(img0_binary_filled)
In [54]:
# %%
h, w = page.height, page.width
# page_rects = util.filter_list(
# lambda r: h * 0.1 < r[0] and r[2] < h * 0.9, rect_area_rects)
# maybe_lines = util.filter_list(lambda r: util.calc_h(r) < line_th and util.calc_w(r) > 5 * word_size_as_th,
# page_rects)
# # TODO
# if len(page_rects) > 2 and len(maybe_lines) > 0:
# page_rects.sort(key=lambda r: (r[0], -r[2], r[1], -r[3]))
# maybe_tbs = [[]]
# append = False
# for r in page_rects:
# if util.calc_h(r) < line_th or len(maybe_tbs[-1]) == 0:
# append = not append
# elif r[0] - maybe_tbs[-1][-1][0] < 2 * word_size_as_th:
# append = not append
# maybe_tbs.append([])
# if append:
# maybe_tbs[-1].append(r)
# for maybe_tb in maybe_tbs:
# if len(maybe_tb) < 1:
# continue
# st, ed = maybe_tb[0][0], maybe_tb[-1][2]
# words_rects = getWsRects(words, zoom)
# filter_wrects = de_duplicate_rect(util.filter_list(
# lambda x: st < x[0] and x[2] < ed, words_rects))
# logger.info(
# f"原size={len(words_rects)}, 长度过滤之后{len(filter_wrects)}.")
# # util.drawRects(np.zeros(img0_rected.shape[:2]), filter_wrects + [maybe_tb[0] ,maybe_tb[-1]], show_result_img=True)
# k = np.ones((int((ed - st) / 2), 1))
# _img_close = cv2.morphologyEx(
# util.drawRects(img0_rected, filter_wrects, show_result_img=True, title="pdf 上的rect"), cv2.MORPH_CLOSE, k)
# # util.imshow(_img_close, "_img_close")
# _rect = util.getMaxRects(_img_close)
# _rect = util.getMaxRects(util.drawRects(np.zeros(img0.shape[:2]), _rect, fill_color=[0, 0, 0],
# fill_expanding=[0, word_size_as_th], show_result_img=True),
# show_result_img=True)
# _rect = util.map_list(
# lambda r: [
# r[0], r[1] + int(word_size_as_th) - 1, r[2], r[3] - int(word_size_as_th) + 2],
# _rect)
# rect_area_rects.extend(_rect)
# _ = util.drawRects(img0,
# rect_area_rects,
# show_result_img=True,
# title="表格区域调整示意图")
In [55]:
# %% [markdown]
# ## 制作MASK
# %%
mask = np.ones(img0.shape[:2]).astype(np.uint8) * 255
img0_rects_exclude_small_rect = util.map_list(
lambda r: util.map_list(int,
[r[0] - word_size_as_th * 1.5,
# r[1] - word_size_as_th * 1.5,
r[1] - \
word_size_as_th,
r[2] + \
word_size_as_th * 1.5,
# r[3] + word_size_as_th * 1.5
r[3] + \
word_size_as_th
]), outer_rects
)
In [56]:
_ = util.drawRects(mask,
img0_rects_exclude_small_rect,
line_color=0,
fill_color=0,
fill_expanding=[0, 0],
copy_for_show_result=False,
show_result_img=True,
title="mask - img")
In [58]:
# %%
# 方案2
ws_rects = getWsRects(words, zoom)
# 去掉页脚
if all(((page.height - ws_rects[-1][0]) < page.height * 0.15,
len(words[-1]["text"].replace(" ", "")) < 8,
util.calc_h(ws_rects[-1]) < 10)):
logger.info("找到页脚了 %s", ws_rects[-1][0])
page_area_info["tail"] = ws_rects[-1][0]
print(page.height - ws_rects[-1][0], page.height * 0.15, util.calc_h(ws_rects[-1]))
57.89999999999998 126.285 9
In [59]:
page_area_info
Out[59]:
{'head': 56, 'tail': 784}
In [ ]:
ws_rects_not_in_table = []
mask_text = []
for wr in ws_rects:
add_flag = True
for orect in img0_rects_exclude_small_rect:
if util.isInnerCell(wr, orect, strict=False)[0]:
add_flag = False
mask_text.append(wr)
break
if add_flag:
ws_rects_not_in_table.append(wr)
In [22]:
_ = util.drawRects(img0, ws_rects_not_in_table, line_color=[
255, 0, 0], show_result_img=True)
In [ ]:
h, w = img0_binary.shape[:2]
# 绘制文本框
# util.imshow(img0_binary_filled)
img9_word_filled = util.drawRects(np.ones(img0.shape[:2]).astype(np.uint8) * 255,
ws_rects_not_in_table,
fill_color=[255, 255, 255],
line_color=[0, 0, 0],
fill_expanding=expand,
show_result_img=True,
title="img9_word_filled")
word_focused = np.bitwise_not(img9_word_filled)
# util.imshow(word_focused)
k = np.ones((int(word_size_as_th * 1.2), int(word_size_as_th)))
word_focused_dilate = util.dilate(word_focused, k)
word_focused_dilate = util.erode(word_focused_dilate, k)
# util.imshow(word_focused_dilate, "文字膨胀")
points_set = []
# text_area_rects = util.getMaxRects(
# word_focused, show_result_img=True, result=points_set)
text_area_rects = util.getMaxRects(
word_focused_dilate, show_result_img=True, result=points_set)
text_area_rects = util.map_list(lambda r: [
r[0] + expand - 1, r[1] + expand - 1, r[2] - expand + 2, r[3] - expand + 2], text_area_rects)
# %%
# k = cv2.getStructuringElement(
# cv2.MORPH_RECT, (w * 3 // 4, int(word_size_as_th)))
img9_word_filled = util.drawRects(np.ones(img0.shape[:2]).astype(np.uint8) * 255,
mask_text,
fill_color=[255, 255, 255],
line_color=[0, 0, 0],
fill_expanding=expand,
show_result_img=True,
title="img9_word_filled")
word_focused = np.bitwise_not(img9_word_filled)
# util.imshow(word_focused)
# k = np.ones((int(word_size_as_th), int(word_size_as_th * 1.2)))
# word_focused_dilate = util.dilate(word_focused, k)
# word_focused_dilate = util.erode(word_focused_dilate, k)
# util.imshow(word_focused_dilate, "文字膨胀")
points_set = []
text_area_rects_mask = util.getMaxRectsTwice(
word_focused, show_result_img=True, result=points_set)
text_area_rects_mask = util.map_list(lambda r: [
r[0] + expand - 1, r[1] + expand - 1, r[2] - expand + 2, r[3] - expand + 2], text_area_rects_mask)
# %% [markdown]
# ### 1.2.1 根据字号先划分成多块
# %%
ws_rects_float = getWsRects(words, zoom, save_float=True)
ws_rects_dict = util.groupby(
ws_rects_float, key=lambda r: "%.2f" % (r[2] - r[0]))
# %% [markdown]
# ### 1.2.2 再确定文本区域
#
# %%
text_area_rects_2 = []
for k, group in ws_rects_dict.items():
_expand = max(0, int(eval(k) * zoom * 0.6))
_img = np.zeros(img0.shape[:2]).astype(np.uint8)
for r in group:
util.drawRects(_img, util.map_list(int, r),
line_color=[255, 0, 0],
fill_color=[0, 0, 0],
copy_for_show_result=False,
fill_expanding=_expand,
show_result_img=False)
_img = np.bitwise_and(_img, mask)
temp = util.getMaxRectsTwice(
_img, line_color=[255, 0, 0], show_result_img=True, title=f"字号 {k}")
temp = util.map_list(lambda r: [
r[0] + _expand - 1, r[1] + _expand - 1, r[2] - _expand + 2, r[3] - _expand + 2], temp)
for r in temp:
if (r[2] - r[0]) > eval(k) * 0.8:
text_area_rects_2.append(r)
text_area_rects.append(r)
# 过滤 小文本区域
text_area_rects = text_area_rects_2
text_area_rects.extend(text_area_rects_mask)
# %% [markdown]
# ## 1.3 绘制区域示意图
# %%
rects = [image_area_rects,
line_area_rects,
rect_area_rects,
text_area_rects,
getWsRects(words, zoom)]
# colors = [Color.IMAGE, Color.LINE, Color.RECT,
# Color.TEXT_AREA, Color.TEXT_LINE]
# color_img = None
# for c in colors:
# i = np.array([c for _ in range(100)]).astype(np.uint8)
# i = np.reshape(i, (10, 10, 3))
# if color_img is None:
# color_img = i
# else:
# color_img = np.concatenate((color_img, i), 1)
#
# util.imshow(color_img, " | ".join(
# ["image", "line", "rectangle", "text_area", "text_line"]))
#
# img = img0.copy()
# for r, c in zip(rects, colors):
# util.drawRects(img, r,
# line_color=c,
# # fill_color=c,
# show_result_img=True,
# copy_for_show_result=False)
# %% [markdown]
# ## 1.4 页面表格化
# %%
img = img0.copy() * 0
# for r, c in zip(rects, colors):
# util.drawRects(img, r,
# line_color=c,
# show_result_img=False,
# copy_for_show_result=False)
# util.imshow(img)
# %%
img_bi = util.getBinary(img, 1, 0, 255)
# util.imshow(img_bi)
page_areas = []
page_areas_imgs = []
def extractLines(img_bi, hline=True, rate=0.95):
import util
lines = util.extractHline(
img_bi, rate=rate) if hline else util.extractVline(img_bi, rate=rate)
return util.getSubImgsAndPos(img_bi, lines)
imgs, pos = extractLines(img_bi)
# util.imshow(imgs)
page_areas_imgs.extend(imgs)
page_areas.extend(util._offset([0, 0], pos))
# %% [markdown]
# ## 1.5 页面区域再划分
# %%
img = img0.copy() * 0
# for r, c in zip(rects, colors):
# util.drawRects(img, r,
# line_color=c,
# show_result_img=False,
# copy_for_show_result=False)
# util.imshow(img)
img_bi = util.getBinary(img, 1, 0, 255)
rs = []
util.dump(util.page_rebulid(img_bi, rects=rs, rate=0.98), PAGE_OUT)
_ = util.drawRects(img, rs, line_color=[
255, 255, 255], show_result_img=True)
page_areas = rs
# img = img0.copy()
# _ = util.drawRects(img,
# page_areas,
# show_result_img=True,
# line_color=[0, 0, 0],
# title="区域划分示意图")
# %%
# rects = [image_area_rects,
# line_area_rects,
# rect_area_rects + page_areas,
# text_area_rects,
# getWsRects(words, zoom)]
# colors = [Color.IMAGE, Color.LINE, Color.RECT,
# Color.TEXT_AREA, Color.TEXT_LINE]
# color_img = None
# for c in colors:
# i = np.array([c for _ in range(100)]).astype(np.uint8)
# i = np.reshape(i, (10, 10, 3))
# if color_img is None:
# color_img = i
# else:
# color_img = np.concatenate((color_img, i), 1)
#
# ls = ["image", "line", "rectangle", "text_area", "text_line"]
# util.imshow(color_img, " | ".join(ls))
#
# img = img0.copy()
# for r, c, t in zip(rects, colors, ls):
# util.drawRects(img, r,
# line_color=c,
# show_result_img=True,
# copy_for_show_result=False,
# title=t)
#
# util.imshow(img)
# %% [markdown]
# # 2.1 图片, rect内部再划分
# %%
# def mergeTextArea(text_area_rects, text_area_rects_add, rate=0.8):
# _rs = []
# walked = set()
# for o in text_area_rects:
# for i in text_area_rects_add:
# if str(i) in walked:
# continue
# f, outer, inner = util.isInnerCell(
# i, o, strict=False, rate=rate)
# if f:
# _rs.append(outer)
# walked.add(str(inner))
#
# for i in text_area_rects:
# for o in text_area_rects_add:
# if str(i) in walked:
# continue
# f, outer, inner = util.isInnerCell(
# i, o, strict=False, rate=rate)
# if f:
# _rs.append(outer)
# walked.add(str(inner))
# return _rs
# %% [markdown]
# ## 2.2 再次绘制图像
# %%
# 大文本区域细分
# page = pdfplumber.open(pdf_path).pages[0]
length = len(text_area_rects)
for i in range(length):
tr = text_area_rects[i]
if (tr[2] - tr[0]) > 4 * word_size_as_th and (tr[3] - tr[1]) > img0.shape[1] * 0.5:
util.drawRects(img0, tr, show_result_img=True,
title="可能需要切分的文本框", line_color=[0, 0, 255])
_img_bi = util.getBinary(util.getSubImg(img0, tr), 250, 0, 255)
util.imshow(_img_bi, "Rect 子图")
kh, kw = list(map(int, (1, word_size_as_th)))
k = np.ones((kh, kw))
_im = cv2.morphologyEx(_img_bi, cv2.MORPH_CLOSE, k)
util.imshow(_im, "闭运算列连接")
kh, kw = list(map(int, (word_size_as_th * 1.8, 1)))
k = np.ones((kh, kw))
_im = cv2.morphologyEx(_im, cv2.MORPH_CLOSE, k)
util.imshow(_im, "闭运算行连接")
temp = util.getMaxRects(_im, offset=tr, show_result_img=True)
if len(temp) > 1:
text_area_rects[i] = temp[0]
text_area_rects.extend(temp[1:])
# 过滤 小文本区域
text_area_rects = list(filter(lambda r: util.calc_area(
r) > word_size_as_th ** 2, text_area_rects))
# util.dump(_words, "ws.json")
# %%
# 调整文本行
# _img = np.zeros(img0.shape[:2])
# util.drawRects(_img, text_area_rects, fill_color=[0,0,0],
# fill_expanding=[expand, 0],
# copy_for_show_result=False)
# text_area_rects = util.getMaxRectsTwice(_img, show_result_img=True)
# text_area_rects = util.map_list(lambda r: [r[0] + expand - 1,
# r[1],
# r[2] - expand + 1 ,
# r[3]], text_area_rects )
# %%
# 重构页面结构
img = img0.copy() * 0
rects = [image_area_rects,
line_area_rects,
rect_area_rects,
text_area_rects,
getWsRects(words, zoom)]
colors = [Color.IMAGE, Color.LINE, Color.RECT,
Color.TEXT_AREA, Color.TEXT_LINE]
for r, c in zip(rects, colors):
util.drawRects(img, r,
line_color=c,
show_result_img=False,
copy_for_show_result=False)
util.imshow(img)
img_bi = util.getBinary(img, 1, 0, 255)
rs = []
util.dump(util.page_rebulid(img_bi, rects=rs, rate=0.98), PAGE_OUT)
_ = util.drawRects(img, rs, line_color=[
255, 255, 255], show_result_img=True)
page_areas = de_duplicate_rect(rs)
# img = img0.copy()
# _ = util.drawRects(img,
# page_areas,
# show_result_img=True,
# line_color=[0, 0, 0],
# title="区域划分示意图")
# %%
# 2022-03-26 重绘rects
maybe_rects = util.getMaxRects(
util.drawRects(np.zeros(img0.shape[:2]), line_area_rects, line_color=255)
)
for i in maybe_rects:
if util.calc_area(i) > 5 * w:
rect_area_rects.append(i)
rect_area_rects = util.getMaxRects(
util.drawRects(np.zeros(img0.shape[:2]), rect_area_rects, line_color=255)
)
# %%
# rects = [image_area_rects,
# line_area_rects,
# rect_area_rects,
# # mergeTextArea(text_area_rects ,text_area_rects_add),
# text_area_rects,
# getWsRects(words, zoom)]
# colors = [Color.IMAGE, Color.LINE, Color.RECT,
# Color.TEXT_AREA, Color.TEXT_LINE]
# color_img = None
# for c in colors:
# i = np.array([c for _ in range(100)]).astype(np.uint8)
# i = np.reshape(i, (10, 10, 3))
# if color_img is None:
# color_img = i
# else:
# color_img = np.concatenate((color_img, i), 1)
#
# ls = ["image", "line", "rectangle", "text_area", "text_line"]
#
# img = img0.copy()
# for r, c, t in zip(rects, colors, ls):
# util.drawRects(img, r,
# line_color=c,
# # fill_color=c,
# show_result_img=True,
# copy_for_show_result=False,
# title=t)
#
# util.imshow(color_img, " | ".join(ls))
# util.canShow = True
# # 画上区域
_img = util.drawRects(
img, page_areas, show_result_img=False, line_color=[0, 0, 0])
# util.imshow(np.concatenate((img0, _img), 1), "各区域划分示意图")
util.canShow = False
# TODO
"""写入一张临时图片"""
# dir_path = os.path.join(util.TEMP_DIR)
# if not os.path.exists(dir_path):
# os.makedirs(dir_path)
# cv2.imwrite(os.path.join(
# dir_path, f"output_img_{0}.png"), np.concatenate((img0, _img), 1))
cv2.imwrite(os.path.join(
kv["output"], f"region_{pg}_{0}.png"), np.concatenate((img0, _img), 1))
# 画上区域
_img = util.drawRects(
img0, page_areas, show_result_img=False, line_color=[0, 0, 0])
util.imshow(np.concatenate((img0, _img), 1), "划分示意图")
# cv2.imwrite(os.path.join(
# dir_path, f"output_img_{1}.png"), np.concatenate((img0, _img), 1))
cv2.imwrite(os.path.join(
kv["output"], f"region_{pg}_{1}.png"), np.concatenate((img0, _img), 1))
# %% [markdown]
# # 3.1 输出区域信息
# %%
# 保存最终结果
pageArea = PageArea(page_areas)
pageArea.imgs = image_area_rects
pageArea.rects = rect_area_rects
pageArea.lines = line_area_rects
pageArea.height = h
pageArea.width = w
h_t, b = delete_head_tail(pageArea.children_page_rects, pdf_path, pg)
pageArea.children_page_rects = b if len(
b) > 0 else [[0, 0, int(h) - 1, int(w) - 1]]
pageArea.head_and_tail = h_t
# util.canShow = True
# # print(str(pageArea))
# util.drawRects(img0, pageArea.children_page_rects, line_color=[255, 0,0], show_result_img=True)
# util.drawRects(img0, h_t, show_result_img=True)
of = os.path.join(kv["output"] or "", kv["outfile"]) \
if kv["outfile"] else os.path.join(kv["output"] or "",
kv["outfile_format"] % (pg + 1)
)
_dir = os.path.dirname(of)
if not os.path.exists(_dir):
try:
logger.info(f"创建文件夹{_dir} {'成功' if os.mkdir(_dir) else '失败'}")
except Exception as e:
logger.warn("Erro, %s", e)
with open(of, mode="w", encoding="utf-8") as fw:
fw.write(str(pageArea))
# print(str(pageArea))
if cleanTemp:
import os
print("临时文件目录", os.path.abspath(util.TEMP_DIR))
# util.clean(os.path.abspath(util.TEMP_DIR))
# %%
def delete_head_tail(rr, pdf_path, pg):
""" 去掉页眉页脚
"""
b, h_t = [], []
rr = sorted(rr, key=lambda rect: rect[0])
import pdfplumber as pdfplumber
with pdfplumber.open(pdf_path) as pdf:
page = pdf.pages[pg]
h, w = page.height, page.width
def fun(page, _r, h, w):
r0, c0, r1, c1 = _r
r1 = min(r1, h)
c1 = min(c1, w)
crop_box = page.within_bbox(util.xy2rc([r0, c0, r1, c1], 1))
# print(crop_box.extract_words())
def test(rects):
if len(rects) > 0:
r = rects[0]
if r["width"] > crop_box.height * 0.5 and r["height"] < 3:
return False
else:
return True
chs = crop_box.chars
rs = (len(chs) > 5 and min(map(lambda ch: ch["x0"], chs)) / len(chs) < crop_box.width * 0.5,
test(crop_box.rects),
test(crop_box.images),
test(crop_box.lines),
)
logger.debug(_r, rs, all(rs))
return all(rs)
limit = 0.2
for _r in rr:
if _r[2] / h < limit:
if fun(page, _r, h, w):
# b.append(_r)
pass
else:
for _t in rr:
# print("adsadasda", _t, _r)
if _t[2] <= _r[2]:
h_t.append(_t)
h_t.append(_r)
break
else:
break
limit = 0.2
for _r in rr[::-1]:
if (h - _r[0]) / h < limit:
if fun(page, _r, h, w):
# b.append(_r)
pass
else:
if _r not in h_t:
for _t in rr:
if _t[0] >= _r[0]:
h_t.append(_t)
h_t.append(_r)
break
else:
break
for i in rr:
if i not in h_t:
b.append(i)
return h_t, b
# if __name__ == "__main__":
# kv = vars(argp.parse_args())
# print(kv)
# main(kv["file"], pg=kv["page"],
# zoom=kv["zoom"] or 1.0, kv=kv)
# exit(0)