In [31]:
import util, cv2
import numpy as np
from config import *

pdf = "page_50.pdf"
# pdf = "page_60.pdf"
# pdf = "page_1.pdf"
words, word_size, word_size_stats, page = util.getWords(pdf, 0)

p2i_task = util.pdf2img(pdf, 0, "page_1.png")
page_num = next(p2i_task)
print("page_num:", page_num)
# 启动协程
p2i_task.send(None)
p2i_task.send(0)
img = cv2.imread("page_1.png")
page_num: 1
save page_1.png page 0

确定流程图区域¶

In [32]:
len(page.curves)
Out[32]:
8
In [33]:
_ = util.drawRects(img, getRcRects(page.curves, page.height), line_color=[255, 0, 0], show_result_img=True)
No description has been provided for this image
In [34]:
util.canShow = True
MORPHOLOG_EX_SIZE = 10
_img = util.drawRects(np.zeros(img.shape[:2]), getRectsInPage(page), line_color=255, show_result_img=True)
k = np.ones((MORPHOLOG_EX_SIZE, 1))
vertical_close = cv2.morphologyEx(_img, cv2.MORPH_CLOSE, k)
util.imshow(vertical_close)
k = np.ones((1, MORPHOLOG_EX_SIZE))
horizonal_close = cv2.morphologyEx(vertical_close, cv2.MORPH_CLOSE, k)
util.imshow(horizonal_close)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [35]:
rs = util.cr2xy(util.simpleLocate(horizonal_close)[-1])
cv2.rectangle(_img, (rs[:2]), (rs[2:]), (255,255,255), 1)
util.imshow(_img)
No description has been provided for this image
In [36]:
import table
h, w = img.shape[:2]

table.generate_table_bbox(_img, "out", "table", line_scale=int(h/25))
In [37]:
dt = util.json_load("table/out.json")
h, w = img.shape[:2]
r = dt[0]
rect = util.map_list(int, [h - r["y_left"], r["x_left"], h - r["y_right"], r["x_right"]])
rect = util.map_list(int, [h - 454, r["x_left"], h - 583, r["x_right"]])
util.drawRects(img, rect, line_color=[255,0,255], show_result_img=True)
No description has been provided for this image
Out[37]:
array([[[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       ...,

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]]], dtype=uint8)
In [38]:
curve_rects = getRcRects(page.curves, page.height)
In [43]:
page_area_info = dict()
page_area_info["flow_chart"] = []
In [61]:
def has_cross_more_than(img, num, line_len = 20):
    h, w = img.shape[:2]
    if h < line_len or w < line_len:
        return False
    k = cv2.getStructuringElement(cv2.MORPH_CROSS, (line_len, line_len))
    _img = util.getSubImg(cv2.erode(img, k), [2, 2, h - 2, w -2])
    cross_num = len(util.simpleLocate(_img))
    logger.info(f"cross_num={cross_num}")
    return cross_num > num
In [62]:
may_be_flow_chart = util.filter_list(lambda rect: util.calc_h(rect) > 50 and util.calc_w(rect) > 50 , util.simpleLocate(horizonal_close))
for rect in may_be_flow_chart:
    if len(util.filter_list(lambda r: util.isInnerCell(r, rect), curve_rects)) >= 4:
        page_area_info["flow_chart"].append(rect)
    
    if not has_cross_more_than(util.getSubImg(_img, rect), 3):
        page_area_info["flow_chart"].append(rect)
8
No description has been provided for this image
In [40]:
util.drawRects(img, util.simpleLocate(horizonal_close), show_result_img=True)
No description has been provided for this image
Out[40]:
array([[[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       ...,

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]]], dtype=uint8)
In [6]:
def write_file(out_file, content, mode='w', encoding = 'utf-8'):
    try:
        with open(out_file, mode=mode, encoding=encoding) as fw:
            fw.write(content)
        return True
    except Exception as e:
        logger.error(f"Failed to write {content} to {out_file}")
        return False

def getOuterRect(rects):
    outer_rects = []
    walked = []
    for _r in rects:
        if notInRects(_r, walked):
            outer_rects.append(_r)
        walked.append(_r)
    return outer_rects

def notInRects(_r, walked):
    if len(walked) < 1:
        return True
    util.logger.debug(util.map_list(lambda big_cell: util.isInnerCell(_r, big_cell, strict=True)[0], walked))
    return not any(util.map_list(lambda big_cell: util.isInnerCell(_r, big_cell,strict=True)[0], walked))
In [7]:
# !/bin/bash/python3
# -*- coding=utf-8 -*-

# %% [markdown]
# # 1.1 确定表格和图片类区域

import cv2
from argparse import ArgumentParser
from importlib import reload
import sys


from config import *
import logging

logger = logging.getLogger(__name__)

argp = ArgumentParser()
argp.add_argument("-f", "--file", required=True,
                  help="path of the target pdf.")
argp.add_argument("-z", "--zoom", required=False, default="1.0", help="zoom.")
argp.add_argument("-p", "--page", required=False,
                  help="page, if not been passed on, will deal all.")
argp.add_argument("-o", "--output", required=False, help="output dir.")
argp.add_argument("--outfile_format", required=False, help="output file name format if not pass page.")
argp.add_argument("--outfile", required=False, help="output file name if pass page")


# pdf_paths = [pdfs] if isinstance(pdfs, str) else pdfs
# for pdf_path in pdf_paths:

pdf_path = r"page_1.pdf"
pg = 1
zoom='1.0'
TURN_TO_IMG_EXIT_CODE = 1 
# def main(pdf_path, pg=None, zoom=1.0, kv=None):

import os
# logger.info("pdf_path=%s, page=%s, zoom=%s, kv=%s", pdf_path, pg, zoom, kv)
zoom = eval(zoom)
reload(util)
util.canShow = False
# if not cleanTemp:
from datetime import datetime
name = os.path.basename(pdf_path)
util.TEMP_DIR = os.path.join(os.path.dirname(util.TEMP_DIR),
                                name + "_" + datetime.now().strftime("%m-%d-%S"))
tempImg = os.path.join(os.path.dirname(pdf_path), f"page_{pg}.png")
In [8]:
kv = dict()
of = os.path.join(kv["output"] or "", kv["outfile"]) \
    if kv["outfile"] else os.path.join(kv["output"] or "",
                                        kv["outfile_format"] % (pg + 1)
                                        )

_dir = os.path.dirname(of)
if not os.path.exists(_dir):
    try:
        logger.info(f"创建文件夹{_dir} {'成功' if os.mkdir(_dir) else '失败'}")
    except Exception as e:
        logger.warn("Erro, %s", e)
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-8-be3a79ee4a8d> in <module>
      1 kv = dict()
      2 of = os.path.join(kv["output"] or "", kv["outfile"]) \
----> 3     if kv["outfile"] else os.path.join(kv["output"] or "",
      4                                         kv["outfile_format"] % (pg + 1)
      5                                         )

KeyError: 'outfile'
In [9]:
tempImg
Out[9]:
'page_1.png'
In [10]:
if os.path.exists(pdf_path[:-4]):
    tempImg = os.path.join(pdf_path[:-4], f"page_{pg}.png")

p2i_task = util.pdf2img(pdf_path, pg, tempImg, zoom_x=zoom, zoom_y=zoom)
page_num = next(p2i_task)
print("page_num:", page_num)
# 启动协程
p2i_task.send(None)

if pg is None:
    pages = list(range(page_num))
else:
    pages = [int(pg) - 1]

pg = pages[0]
# for pg in pages:
p2i_task.send(pg)

# 基图
img0 = cv2.imread(tempImg)
page_num: 1
save page_1.png page 0
In [11]:
words, w_size, w_s_stats, page = util.getWords(pdf_path, pg)
# TODO
word_size_as_th = max(9.8, getSeriesValue(w_s_stats) or 1)
expand = max(0, int(zoom * word_size_as_th / 2))

image_area_rects = getRcRects(page.images, page.height, zoom)
line_area_rects = getRcRects(page.lines, page.height, zoom)
rect_area_rects = getRcRects(page.rects, page.height, zoom)
img0_and_0 = np.zeros(img0.shape[:2])
getRcRects, zoom= 1.0
getRcRects, zoom= 1.0
getRcRects, zoom= 1.0
In [12]:
# 如果没有文字或者, 有很大的图片, 提前结束
page_area_info = dict()
page_area = page.height * page.width
if len(page.chars) < 0 \
        or any(util.filter_list(lambda rect: util.calc_area(rect) > page_area * 0.64,image_area_rects)):
    page_area_info["children_page_rects"] = [
        [0, 0, page.height - 1, page.width - 1]]
    # 写入文件
    write_file(of, util.dumps(page_area_info))
    # TODO BREAK?
    sys.exit(TURN_TO_IMG_EXIT_CODE)
In [50]:
util.canShow = True
# _img = util.drawRects(img0, getRectsInPage(page), show_result_img=True)
_img = util.drawRects(img0_and_0, getRectsInPage(
    page), show_result_img=True, line_color=[255, 255, 255])
# 找出所有的类表格区域
rects = util.getMaxRects(_img)

# 重新计算, 找出外边界
outer_rects = getOuterRect(rects)

# 找页眉
# 找有横线的页眉
for _r in outer_rects:
    if util.calc_h(_r) < 5 \
        and _r[2] < page.height * 0.15 \
        and util.calc_w(_r) > page.width * 0.6:
        page_area_info["head"] = _r[2]
page_area_info["head"]
getRcRects, zoom= 1
getRcRects, zoom= 1
getRcRects, zoom= 1
getRcRects, zoom= 1
No description has been provided for this image
Out[50]:
56
In [51]:
# image_area_rects = getMaxAreaRect(img0, image_area_rects)
# line_area_rects = getMaxAreaRect(img0, line_area_rects)
rect_area_rects = outer_rects

# %%
# rect 区域调整
# rect_area_rects2 = []
# for r in rect_area_rects:
#     if util.calc_area(r) < 1:
#         continue
#     _img = util.getSubImg(img0, r)

#     _img_bi = util.getBinary(_img, 250, 0, 255)
#     util.imshow(_img_bi)

#     kh, kw = list(map(int, (1, word_size_as_th * 2)))
#     k = np.ones((kh, kw))
#     _im = cv2.morphologyEx(_img_bi, cv2.MORPH_CLOSE, k)
#     util.imshow(_im, "闭运算列连接")
#     kh, kw = list(map(int, (word_size_as_th * 2, 1)))
#     k = np.ones((kh, kw))
#     _im = cv2.morphologyEx(_im, cv2.MORPH_CLOSE, k)
#     util.imshow(_im, "闭运算行连接")
#     temp = util.getMaxRectsTwice(_im, offset=r)
#     # 画上区域
#     _ = util.drawRects(img0, temp, show_result_img=True,
#                        line_color=[255, 0, 255])
#     rect_area_rects2.extend(temp)
# rect_area_rects = rect_area_rects2

# %%
img0_binary = util.getBinary(img0, 200, 255, 0)
img0_rected = _img
In [52]:
# 区域划分示意图
_ = util.drawRects(img0, outer_rects,
                   line_color=[0, 0, 255], show_result_img=True)
No description has been provided for this image
In [53]:
# 隐藏图片和表格区域
# img0_binary_filled = util.drawRects(img0_binary,
#                                     outer_rects,
#                                     line_color=[255, 255, 255],
#                                     fill_color=[255, 255, 255],
#                                     show_result_img=False)
# util.imshow(img0_binary_filled)
In [54]:
# %%
h, w = page.height, page.width
# page_rects = util.filter_list(
#     lambda r: h * 0.1 < r[0] and r[2] < h * 0.9, rect_area_rects)
# maybe_lines = util.filter_list(lambda r: util.calc_h(r) < line_th and util.calc_w(r) > 5 * word_size_as_th,
#                                page_rects)
# # TODO
# if len(page_rects) > 2 and len(maybe_lines) > 0:
#     page_rects.sort(key=lambda r: (r[0], -r[2], r[1], -r[3]))
#     maybe_tbs = [[]]
#     append = False
#     for r in page_rects:
#         if util.calc_h(r) < line_th or len(maybe_tbs[-1]) == 0:
#             append = not append
#         elif r[0] - maybe_tbs[-1][-1][0] < 2 * word_size_as_th:
#             append = not append
#             maybe_tbs.append([])
#         if append:
#             maybe_tbs[-1].append(r)

#     for maybe_tb in maybe_tbs:
#         if len(maybe_tb) < 1:
#             continue
#         st, ed = maybe_tb[0][0], maybe_tb[-1][2]
#         words_rects = getWsRects(words, zoom)
#         filter_wrects = de_duplicate_rect(util.filter_list(
#             lambda x: st < x[0] and x[2] < ed, words_rects))
#         logger.info(
#             f"原size={len(words_rects)}, 长度过滤之后{len(filter_wrects)}.")
#         # util.drawRects(np.zeros(img0_rected.shape[:2]), filter_wrects + [maybe_tb[0] ,maybe_tb[-1]], show_result_img=True)
#         k = np.ones((int((ed - st) / 2), 1))
#         _img_close = cv2.morphologyEx(
#             util.drawRects(img0_rected, filter_wrects, show_result_img=True, title="pdf 上的rect"), cv2.MORPH_CLOSE, k)
#         # util.imshow(_img_close, "_img_close")
#         _rect = util.getMaxRects(_img_close)
#         _rect = util.getMaxRects(util.drawRects(np.zeros(img0.shape[:2]), _rect, fill_color=[0, 0, 0],
#                                                 fill_expanding=[0, word_size_as_th], show_result_img=True),
#                                  show_result_img=True)
#         _rect = util.map_list(
#             lambda r: [
#                 r[0], r[1] + int(word_size_as_th) - 1, r[2], r[3] - int(word_size_as_th) + 2],
#             _rect)
#         rect_area_rects.extend(_rect)
# _ = util.drawRects(img0,
#                    rect_area_rects,
#                    show_result_img=True,
#                    title="表格区域调整示意图")
In [55]:
# %% [markdown]
# ## 制作MASK

# %%
mask = np.ones(img0.shape[:2]).astype(np.uint8) * 255
img0_rects_exclude_small_rect = util.map_list(
    lambda r: util.map_list(int,
                            [r[0] - word_size_as_th * 1.5,
                             # r[1] - word_size_as_th * 1.5,
                             r[1] - \
                             word_size_as_th,
                             r[2] + \
                             word_size_as_th * 1.5,
                             # r[3] + word_size_as_th * 1.5
                             r[3] + \
                             word_size_as_th
                             ]), outer_rects
)
In [56]:
_ = util.drawRects(mask,
                   img0_rects_exclude_small_rect,
                   line_color=0,
                   fill_color=0,
                   fill_expanding=[0, 0],
                   copy_for_show_result=False,
                   show_result_img=True,
                   title="mask - img")
No description has been provided for this image
In [58]:
# %%
# 方案2
ws_rects = getWsRects(words, zoom)
# 去掉页脚
if all(((page.height - ws_rects[-1][0]) < page.height * 0.15,
       len(words[-1]["text"].replace(" ", "")) < 8,
       util.calc_h(ws_rects[-1]) < 10)):
    logger.info("找到页脚了 %s", ws_rects[-1][0])
    page_area_info["tail"] = ws_rects[-1][0]
print(page.height - ws_rects[-1][0], page.height * 0.15, util.calc_h(ws_rects[-1]))
57.89999999999998 126.285 9
In [59]:
page_area_info
Out[59]:
{'head': 56, 'tail': 784}
In [ ]:
ws_rects_not_in_table = []

mask_text = []
for wr in ws_rects:
    add_flag = True
    for orect in img0_rects_exclude_small_rect:
        if util.isInnerCell(wr, orect, strict=False)[0]:
            add_flag = False
            mask_text.append(wr)
            break
    if add_flag:
        ws_rects_not_in_table.append(wr)
        
In [22]:
_ = util.drawRects(img0, ws_rects_not_in_table, line_color=[
    255, 0, 0], show_result_img=True)
No description has been provided for this image
In [ ]:
h, w = img0_binary.shape[:2]
# 绘制文本框
# util.imshow(img0_binary_filled)
img9_word_filled = util.drawRects(np.ones(img0.shape[:2]).astype(np.uint8) * 255,
                                  ws_rects_not_in_table,
                                  fill_color=[255, 255, 255],
                                  line_color=[0, 0, 0],
                                  fill_expanding=expand,
                                  show_result_img=True,
                                  title="img9_word_filled")

word_focused = np.bitwise_not(img9_word_filled)
# util.imshow(word_focused)
k = np.ones((int(word_size_as_th * 1.2), int(word_size_as_th)))
word_focused_dilate = util.dilate(word_focused, k)
word_focused_dilate = util.erode(word_focused_dilate, k)
# util.imshow(word_focused_dilate, "文字膨胀")
points_set = []
# text_area_rects = util.getMaxRects(
#     word_focused, show_result_img=True, result=points_set)
text_area_rects = util.getMaxRects(
    word_focused_dilate, show_result_img=True, result=points_set)
text_area_rects = util.map_list(lambda r: [
    r[0] + expand - 1, r[1] + expand - 1, r[2] - expand + 2, r[3] - expand + 2], text_area_rects)

# %%
# k = cv2.getStructuringElement(
#     cv2.MORPH_RECT, (w * 3 // 4, int(word_size_as_th)))
img9_word_filled = util.drawRects(np.ones(img0.shape[:2]).astype(np.uint8) * 255,
                                  mask_text,
                                  fill_color=[255, 255, 255],
                                  line_color=[0, 0, 0],
                                  fill_expanding=expand,
                                  show_result_img=True,
                                  title="img9_word_filled")

word_focused = np.bitwise_not(img9_word_filled)
# util.imshow(word_focused)
# k = np.ones((int(word_size_as_th), int(word_size_as_th * 1.2)))
# word_focused_dilate = util.dilate(word_focused, k)
# word_focused_dilate = util.erode(word_focused_dilate, k)
# util.imshow(word_focused_dilate, "文字膨胀")
points_set = []
text_area_rects_mask = util.getMaxRectsTwice(
    word_focused, show_result_img=True, result=points_set)
text_area_rects_mask = util.map_list(lambda r: [
    r[0] + expand - 1, r[1] + expand - 1, r[2] - expand + 2, r[3] - expand + 2], text_area_rects_mask)

# %% [markdown]
# ### 1.2.1 根据字号先划分成多块

# %%
ws_rects_float = getWsRects(words, zoom, save_float=True)
ws_rects_dict = util.groupby(
    ws_rects_float, key=lambda r: "%.2f" % (r[2] - r[0]))

# %% [markdown]
# ### 1.2.2 再确定文本区域
#

# %%
text_area_rects_2 = []
for k, group in ws_rects_dict.items():
    _expand = max(0, int(eval(k) * zoom * 0.6))
    _img = np.zeros(img0.shape[:2]).astype(np.uint8)
    for r in group:
        util.drawRects(_img, util.map_list(int, r),
                       line_color=[255, 0, 0],
                       fill_color=[0, 0, 0],
                       copy_for_show_result=False,
                       fill_expanding=_expand,
                       show_result_img=False)

    _img = np.bitwise_and(_img, mask)

    temp = util.getMaxRectsTwice(
        _img, line_color=[255, 0, 0], show_result_img=True, title=f"字号 {k}")
    temp = util.map_list(lambda r: [
        r[0] + _expand - 1, r[1] + _expand - 1, r[2] - _expand + 2, r[3] - _expand + 2], temp)
    for r in temp:
        if (r[2] - r[0]) > eval(k) * 0.8:
            text_area_rects_2.append(r)
            text_area_rects.append(r)

# 过滤 小文本区域
text_area_rects = text_area_rects_2
text_area_rects.extend(text_area_rects_mask)

# %% [markdown]
# ## 1.3 绘制区域示意图

# %%
rects = [image_area_rects,
         line_area_rects,
         rect_area_rects,
         text_area_rects,
         getWsRects(words, zoom)]
# colors = [Color.IMAGE, Color.LINE, Color.RECT,
#           Color.TEXT_AREA, Color.TEXT_LINE]
# color_img = None
# for c in colors:
#     i = np.array([c for _ in range(100)]).astype(np.uint8)
#     i = np.reshape(i, (10, 10, 3))
#     if color_img is None:
#         color_img = i
#     else:
#         color_img = np.concatenate((color_img, i), 1)
#
# util.imshow(color_img, " | ".join(
#     ["image", "line", "rectangle", "text_area", "text_line"]))
#
# img = img0.copy()
# for r, c in zip(rects, colors):
#     util.drawRects(img, r,
#                    line_color=c,
#                    #    fill_color=c,
#                    show_result_img=True,
#                    copy_for_show_result=False)

# %% [markdown]
# ## 1.4 页面表格化

# %%
img = img0.copy() * 0
# for r, c in zip(rects, colors):
#     util.drawRects(img, r,
#                    line_color=c,
#                    show_result_img=False,
#                    copy_for_show_result=False)
# util.imshow(img)

# %%

img_bi = util.getBinary(img, 1, 0, 255)
# util.imshow(img_bi)
page_areas = []
page_areas_imgs = []


def extractLines(img_bi, hline=True, rate=0.95):
    import util
    lines = util.extractHline(
        img_bi, rate=rate) if hline else util.extractVline(img_bi, rate=rate)
    return util.getSubImgsAndPos(img_bi, lines)


imgs, pos = extractLines(img_bi)
# util.imshow(imgs)
page_areas_imgs.extend(imgs)
page_areas.extend(util._offset([0, 0], pos))

# %% [markdown]
# ## 1.5 页面区域再划分

# %%
img = img0.copy() * 0
# for r, c in zip(rects, colors):
#     util.drawRects(img, r,
#                    line_color=c,
#                    show_result_img=False,
#                    copy_for_show_result=False)
# util.imshow(img)
img_bi = util.getBinary(img, 1, 0, 255)

rs = []
util.dump(util.page_rebulid(img_bi, rects=rs, rate=0.98), PAGE_OUT)
_ = util.drawRects(img, rs, line_color=[
    255, 255, 255], show_result_img=True)
page_areas = rs
# img = img0.copy()
# _ = util.drawRects(img,
#                    page_areas,
#                    show_result_img=True,
#                    line_color=[0, 0, 0],
#                    title="区域划分示意图")

# %%
# rects = [image_area_rects,
#          line_area_rects,
#          rect_area_rects + page_areas,
#          text_area_rects,
#          getWsRects(words, zoom)]
# colors = [Color.IMAGE, Color.LINE, Color.RECT,
#           Color.TEXT_AREA, Color.TEXT_LINE]
# color_img = None
# for c in colors:
#     i = np.array([c for _ in range(100)]).astype(np.uint8)
#     i = np.reshape(i, (10, 10, 3))
#     if color_img is None:
#         color_img = i
#     else:
#         color_img = np.concatenate((color_img, i), 1)
#
# ls = ["image", "line", "rectangle", "text_area", "text_line"]
# util.imshow(color_img, " | ".join(ls))
#
# img = img0.copy()
# for r, c, t in zip(rects, colors, ls):
#     util.drawRects(img, r,
#                    line_color=c,
#                    show_result_img=True,
#                    copy_for_show_result=False,
#                    title=t)
#
# util.imshow(img)

# %% [markdown]
# # 2.1 图片, rect内部再划分

# %%

# def mergeTextArea(text_area_rects, text_area_rects_add, rate=0.8):
#     _rs = []
#     walked = set()
#     for o in text_area_rects:
#         for i in text_area_rects_add:
#             if str(i) in walked:
#                 continue
#             f, outer, inner = util.isInnerCell(
#                 i, o, strict=False, rate=rate)
#             if f:
#                 _rs.append(outer)
#                 walked.add(str(inner))
#
#     for i in text_area_rects:
#         for o in text_area_rects_add:
#             if str(i) in walked:
#                 continue
#             f, outer, inner = util.isInnerCell(
#                 i, o, strict=False, rate=rate)
#             if f:
#                 _rs.append(outer)
#                 walked.add(str(inner))
#     return _rs

# %% [markdown]
# ## 2.2 再次绘制图像

# %%

# 大文本区域细分

# page = pdfplumber.open(pdf_path).pages[0]

length = len(text_area_rects)
for i in range(length):
    tr = text_area_rects[i]
    if (tr[2] - tr[0]) > 4 * word_size_as_th and (tr[3] - tr[1]) > img0.shape[1] * 0.5:

        util.drawRects(img0, tr, show_result_img=True,
                       title="可能需要切分的文本框", line_color=[0, 0, 255])

        _img_bi = util.getBinary(util.getSubImg(img0, tr), 250, 0, 255)
        util.imshow(_img_bi, "Rect 子图")

        kh, kw = list(map(int, (1, word_size_as_th)))
        k = np.ones((kh, kw))
        _im = cv2.morphologyEx(_img_bi, cv2.MORPH_CLOSE, k)
        util.imshow(_im, "闭运算列连接")
        kh, kw = list(map(int, (word_size_as_th * 1.8, 1)))
        k = np.ones((kh, kw))
        _im = cv2.morphologyEx(_im, cv2.MORPH_CLOSE, k)
        util.imshow(_im, "闭运算行连接")

        temp = util.getMaxRects(_im, offset=tr, show_result_img=True)
        if len(temp) > 1:
            text_area_rects[i] = temp[0]
            text_area_rects.extend(temp[1:])
# 过滤 小文本区域

text_area_rects = list(filter(lambda r: util.calc_area(
    r) > word_size_as_th ** 2, text_area_rects))
# util.dump(_words, "ws.json")

# %%
# 调整文本行
# _img = np.zeros(img0.shape[:2])

# util.drawRects(_img, text_area_rects, fill_color=[0,0,0],
#                fill_expanding=[expand, 0],
#                copy_for_show_result=False)

# text_area_rects = util.getMaxRectsTwice(_img, show_result_img=True)

# text_area_rects = util.map_list(lambda r: [r[0] + expand - 1,
#                                            r[1],
#                                            r[2] - expand + 1 ,
#                                            r[3]], text_area_rects )

# %%
# 重构页面结构
img = img0.copy() * 0
rects = [image_area_rects,
         line_area_rects,
         rect_area_rects,
         text_area_rects,
         getWsRects(words, zoom)]
colors = [Color.IMAGE, Color.LINE, Color.RECT,
          Color.TEXT_AREA, Color.TEXT_LINE]
for r, c in zip(rects, colors):
    util.drawRects(img, r,
                   line_color=c,
                   show_result_img=False,
                   copy_for_show_result=False)
util.imshow(img)
img_bi = util.getBinary(img, 1, 0, 255)

rs = []
util.dump(util.page_rebulid(img_bi, rects=rs, rate=0.98), PAGE_OUT)
_ = util.drawRects(img, rs, line_color=[
    255, 255, 255], show_result_img=True)

page_areas = de_duplicate_rect(rs)
# img = img0.copy()

# _ = util.drawRects(img,
#                    page_areas,
#                    show_result_img=True,
#                    line_color=[0, 0, 0],
#                    title="区域划分示意图")

# %%
# 2022-03-26 重绘rects

maybe_rects = util.getMaxRects(
    util.drawRects(np.zeros(img0.shape[:2]), line_area_rects, line_color=255)
)
for i in maybe_rects:
    if util.calc_area(i) > 5 * w:
        rect_area_rects.append(i)
rect_area_rects = util.getMaxRects(
    util.drawRects(np.zeros(img0.shape[:2]), rect_area_rects, line_color=255)
)

# %%

# rects = [image_area_rects,
#          line_area_rects,
#          rect_area_rects,
#          #  mergeTextArea(text_area_rects ,text_area_rects_add),
#          text_area_rects,
#          getWsRects(words, zoom)]
# colors = [Color.IMAGE, Color.LINE, Color.RECT,
#           Color.TEXT_AREA, Color.TEXT_LINE]
# color_img = None
# for c in colors:
#     i = np.array([c for _ in range(100)]).astype(np.uint8)
#     i = np.reshape(i, (10, 10, 3))
#     if color_img is None:
#         color_img = i
#     else:
#         color_img = np.concatenate((color_img, i), 1)
#
# ls = ["image", "line", "rectangle", "text_area", "text_line"]
#
# img = img0.copy()
# for r, c, t in zip(rects, colors, ls):
#     util.drawRects(img, r,
#                    line_color=c,
#                    #    fill_color=c,
#                    show_result_img=True,
#                    copy_for_show_result=False,
#                    title=t)
#
# util.imshow(color_img, " | ".join(ls))

# util.canShow = True

# # 画上区域
_img = util.drawRects(
    img, page_areas, show_result_img=False, line_color=[0, 0, 0])
# util.imshow(np.concatenate((img0, _img), 1), "各区域划分示意图")

util.canShow = False

# TODO
"""写入一张临时图片"""

# dir_path = os.path.join(util.TEMP_DIR)
# if not os.path.exists(dir_path):
#     os.makedirs(dir_path)
# cv2.imwrite(os.path.join(
#     dir_path, f"output_img_{0}.png"), np.concatenate((img0, _img), 1))

cv2.imwrite(os.path.join(
    kv["output"], f"region_{pg}_{0}.png"), np.concatenate((img0, _img), 1))
# 画上区域
_img = util.drawRects(
    img0, page_areas, show_result_img=False, line_color=[0, 0, 0])
util.imshow(np.concatenate((img0, _img), 1), "划分示意图")
# cv2.imwrite(os.path.join(
#     dir_path, f"output_img_{1}.png"), np.concatenate((img0, _img), 1))
cv2.imwrite(os.path.join(
    kv["output"], f"region_{pg}_{1}.png"), np.concatenate((img0, _img), 1))

# %% [markdown]
# # 3.1 输出区域信息

# %%
# 保存最终结果
pageArea = PageArea(page_areas)
pageArea.imgs = image_area_rects
pageArea.rects = rect_area_rects
pageArea.lines = line_area_rects
pageArea.height = h
pageArea.width = w

h_t, b = delete_head_tail(pageArea.children_page_rects, pdf_path, pg)
pageArea.children_page_rects = b if len(
    b) > 0 else [[0, 0, int(h) - 1, int(w) - 1]]
pageArea.head_and_tail = h_t
# util.canShow = True
# # print(str(pageArea))
# util.drawRects(img0, pageArea.children_page_rects, line_color=[255, 0,0], show_result_img=True)
# util.drawRects(img0, h_t, show_result_img=True)

of = os.path.join(kv["output"] or "", kv["outfile"]) \
    if kv["outfile"] else os.path.join(kv["output"] or "",
                                       kv["outfile_format"] % (pg + 1)
                                       )

_dir = os.path.dirname(of)
if not os.path.exists(_dir):
    try:
        logger.info(f"创建文件夹{_dir} {'成功' if os.mkdir(_dir) else '失败'}")
    except Exception as e:
        logger.warn("Erro, %s", e)

with open(of, mode="w", encoding="utf-8") as fw:
    fw.write(str(pageArea))

# print(str(pageArea))
if cleanTemp:
    import os
    print("临时文件目录", os.path.abspath(util.TEMP_DIR))
    # util.clean(os.path.abspath(util.TEMP_DIR))

# %%


def delete_head_tail(rr, pdf_path, pg):
    """ 去掉页眉页脚
    """
    b, h_t = [], []
    rr = sorted(rr, key=lambda rect: rect[0])
    import pdfplumber as pdfplumber
    with pdfplumber.open(pdf_path) as pdf:
        page = pdf.pages[pg]
        h, w = page.height, page.width

        def fun(page, _r, h, w):
            r0, c0, r1, c1 = _r
            r1 = min(r1, h)
            c1 = min(c1, w)
            crop_box = page.within_bbox(util.xy2rc([r0, c0, r1, c1], 1))

            # print(crop_box.extract_words())
            def test(rects):
                if len(rects) > 0:
                    r = rects[0]
                    if r["width"] > crop_box.height * 0.5 and r["height"] < 3:
                        return False
                else:
                    return True

            chs = crop_box.chars
            rs = (len(chs) > 5 and min(map(lambda ch: ch["x0"], chs)) / len(chs) < crop_box.width * 0.5,
                  test(crop_box.rects),
                  test(crop_box.images),
                  test(crop_box.lines),
                  )
            logger.debug(_r, rs, all(rs))
            return all(rs)

        limit = 0.2
        for _r in rr:
            if _r[2] / h < limit:
                if fun(page, _r, h, w):
                    # b.append(_r)
                    pass
                else:
                    for _t in rr:
                        # print("adsadasda", _t, _r)
                        if _t[2] <= _r[2]:
                            h_t.append(_t)
                    h_t.append(_r)
                    break
            else:
                break

        limit = 0.2
        for _r in rr[::-1]:
            if (h - _r[0]) / h < limit:
                if fun(page, _r, h, w):
                    # b.append(_r)
                    pass
                else:
                    if _r not in h_t:
                        for _t in rr:
                            if _t[0] >= _r[0]:
                                h_t.append(_t)
                        h_t.append(_r)
                    break
            else:
                break

        for i in rr:
            if i not in h_t:
                b.append(i)
        return h_t, b


# if __name__ == "__main__":
#     kv = vars(argp.parse_args())
#     print(kv)
#     main(kv["file"], pg=kv["page"],
#          zoom=kv["zoom"] or 1.0, kv=kv)
#     exit(0)