from importlib import reload

import cv2
import numpy as np
from pandas import DataFrame

import util

# Re-execute util so edits made to it during a live session are picked up.
# This must come after `import util`; calling reload() first raises NameError
# on a fresh interpreter because the name `util` is not bound yet.
reload(util)


def getRcRects(rects, h, zoom=1.0, exclude_page_rect=True):
    """Convert page objects into integer boxes in (r, c, r1, c1) order.

    Args:
        rects: iterable of mappings with 'x0', 'y0', 'x1' and 'y1' keys.
        h: height that the y values are subtracted from (callers in this
            file pass ``page.height``).
        zoom: scale factor applied to every coordinate.
        exclude_page_rect: when true, boxes whose first two values are both
            zero are dropped.

    Returns:
        list: converted boxes, as produced by ``util.xy2rc``.
    """
    print("getRcRects, zoom=", zoom)

    def to_rc(obj):
        # y is measured from the other edge here, hence the (h - y) flip;
        # presumably this maps bottom-origin PDF y to top-origin rows.
        scaled = (
            obj['x0'] * zoom,
            (h - obj['y1']) * zoom,
            obj['x1'] * zoom,
            (h - obj['y0']) * zoom,
        )
        return util.xy2rc(util.map_list(int, scaled))

    converted = util.map_list(to_rc, rects)
    if not exclude_page_rect:
        return list(converted)

    kept = []
    for box in converted:
        row, col, _row_end, _col_end = box
        if row == 0 and col == 0:
            # anchored at the origin: treated as the page frame and skipped
            continue
        kept.append(box)
    return kept


def getWsRects(words, zoom=1.0):
    """Convert word mappings into integer boxes via ``util.xy2rc``.

    Args:
        words: iterable of mappings with 'x0', 'top', 'x1' and 'bottom' keys.
        zoom: scale factor applied to every coordinate.

    Returns:
        One converted box per word, as returned by ``util.map_list``.
    """
    def word_box(word):
        corners = (
            word['x0'] * zoom,
            word['top'] * zoom,
            word['x1'] * zoom,
            word['bottom'] * zoom,
        )
        return util.xy2rc(util.map_list(int, corners))

    return util.map_list(word_box, words)


def getRectsInPage(page, exclude_page_rect=True, zoom=1, draw_image=False):
    """Collect the boxes of drawable objects (lines, curves, rects, ...) on a page.

    Args:
        page: page object exposing ``lines``, ``curves``, ``rects``,
            ``images`` and ``height`` (a pdfplumber page in this file).
        exclude_page_rect: forwarded to ``getRcRects``; when true, boxes
            anchored at the origin are dropped, when false nothing is dropped.
        zoom: scale factor applied to every coordinate.
        draw_image: when true, ``page.images`` are included as well.

    Returns:
        list: boxes in the order lines, curves, rects and (optionally) images.
    """
    sources = [page.lines, page.curves, page.rects]
    if draw_image:
        sources.append(page.images)

    rects = []
    for objs in sources:
        # The flag used to be ignored here: getRcRects always ran with its
        # default (True), so exclude_page_rect=False still lost origin boxes.
        # Forwarding it also makes the old page-size filter redundant, since
        # getRcRects already drops every box with r == c == 0.
        rects.extend(
            getRcRects(objs, page.height, zoom,
                       exclude_page_rect=exclude_page_rect)
        )
    return rects


def getSeriesValue(series):
    """Return the smallest first-level label tied with the first entry's count.

    Args:
        series: Series indexed by tuples, e.g. the result of
            ``DataFrame.value_counts()`` (presumably sorted by count, so the
            first entry is the most frequent one).

    Returns:
        The minimum ``index[0]`` over all entries whose value equals
        ``series.values[0]``.
    """
    top_count = series.values[0]
    labels = [series.index[0][0]]
    labels.extend(key[0] for key, count in series.items() if count == top_count)
    return min(labels)


def adjustLines(lines, rects_exclude_small_rect, th = 0, is_hline=True):
    """Snap split lines that fall inside a rectangle onto that rectangle's edges.

    Both inputs are modified in place: ``rects_exclude_small_rect`` is sorted
    and every matching entry of ``lines`` is overwritten with its snapped
    version.

    Args:
        lines: list of 4-item lines ``[x, y, x1, y1]``.
        rects_exclude_small_rect: list of boxes ``[r, c, r1, c1]``.
        th: offset added to both rectangle bounds before the containment test.
        is_hline: test ``y`` against rows when true, ``x`` against columns
            otherwise.

    Returns:
        list: new list sorted by item 1. For a matched line it holds the
        snapped line, the opposite-edge line, and the snapped line once more
        (the overwritten ``lines`` entry is appended after the match).
    """
    # Order rectangles by leading edge; on ties the larger far edge comes first.
    if is_hline:
        rects_exclude_small_rect.sort(key=lambda box: (box[0], -box[2]))
    else:
        rects_exclude_small_rect.sort(key=lambda box: (box[1], -box[3]))

    adjusted = []
    for idx, (x, y, x1, y1) in enumerate(lines):
        for box in rects_exclude_small_rect:
            top, left, bottom, right = box
            if is_hline:
                if not (top + th) < y < (bottom + th):
                    continue
                lines[idx] = [x, max(0, top - 1), x1, top]
                adjusted.append([x, max(0, top - 1), x1, top])
                adjusted.append([x, bottom - 1, x1, bottom])
            else:
                if not (left + th) < x < (right + th):
                    continue
                lines[idx] = [max(0, x - 1), top, x, bottom]
                adjusted.append([max(0, x - 1), top, x, bottom])
                adjusted.append([right - 1, top, right, bottom])
            # only the first containing rectangle is used
            break
        adjusted.append(lines[idx])

    # An earlier, disabled variant also added each rectangle's top and bottom
    # edge here when they were not already present in the result.
    return sorted(adjusted, key=lambda line: line[1])

# Notebook cell: render page index 5 of the sample PDF to '5.png' at 2x zoom.
import util

pdf_path = r'D:\DongDongsFiles\file\2022-02\H2_AN202011171430084484_1.pdf'
pg = 5

util.pdf2img(pdf_path, pg, f'{pg}.png', zoom_x=2, zoom_y=2)

# NOTE(review): `page` is only assigned in a later cell, so this line depends
# on notebook state left over from a previous run. The result is not stored;
# it was presumably inspected as cell output.
page.extract_tables({"vertical_strategy": "lines_strict"})

# Notebook cell: choose a sample PDF, open it with pdfplumber and render the page.
import util

# Only the last uncommented assignment takes effect; the others are kept as a
# quick way to switch between sample files.
pdf_path = r'C:\Users\Administrator\Desktop\xiaokuang.pdf'
pdf_path = r'C:\Users\Administrator\Desktop\tu_pian.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\ban_fl.pdf'
pdf_path = r'C:\Users\Administrator\Desktop\fl2.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\fl.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\fl0.pdf'
pg = 0
import pdfplumber as pdfplumber
from pandas import DataFrame
pdf = pdfplumber.open(pdf_path)
# The page under inspection (index pg, i.e. the first page here)
page = pdf.pages[pg]
# # keep_blank_chars: do not treat spaces as a delimiter
# # x_tolerance: maximum tolerated distance between two neighbouring texts on one line
# # The horizontal distance is set fairly large here, which merges table texts that sit close together
# # TODO optimise!
# words = page.extract_words(
#     keep_blank_chars=False, x_tolerance=4.50 * 1.5, y_tolerance=4.5)

# words.sort(key=lambda x: x["top"])
# w_size = list(map(lambda w: w["adv"], words))
# df = DataFrame(w_size)


# Render the page at 1x zoom to '<pg>.png' and display it.
# NOTE(review): the file name shown is hard-coded to "0.png", which only
# matches the rendered file while pg == 0.
util.pdf2img(pdf_path, pg, f'{pg}.png', zoom_x=1, zoom_y=1)
util.imshow("0.png")

# Notebook cell: inspect character sizes inside each image area and pick the
# dominant character size of the whole page.
import pandas as pd
from pandas import DataFrame

for it in page.objects["image"]:
    # Bounding box taken straight from the image object's x0/y0/x1/y1 fields.
    # NOTE(review): these y values are passed to page.crop unchanged - confirm
    # that crop expects the same y convention as y0/y1.
    rect = [it[attr] for attr in ["x0", "y0", "x1", "y1"]]
    page_crop = page.crop(rect)
    # Print the frequency table of character sizes found inside the crop.
    print(f"rect={rect}", DataFrame(util.map_list(
        lambda it: it["size"],
        page_crop.chars)).value_counts())

# Representative character size for the page, chosen by getSeriesValue from
# the size frequency table of all characters.
word_size = getSeriesValue(DataFrame(
    util.map_list(
        lambda it: it["size"],
        page.chars)).value_counts()
)

# Notebook cell: draw the boxes of the page's images, lines and rects onto one
# blank page-sized canvas `img`. Each box is passed as
# [h - y1, x0, h - y0, x1], i.e. y is flipped against the page height.
# image
import numpy as np
h, w = tuple(map(int, (page.height, page.width)))
img = np.zeros((h, w)).astype(np.uint8)
for image in page.objects["image"]:
    util.drawRects(img, [util.map_list(int, [h - image["y1"], 
                         image["x0"], 
                         h - image["y0"],
                         image["x1"]
                               ])],
                   copy_for_show_result=False, 
                   show_result_img=True
                         )

# line
# (the loop variable is still called `image`, but it iterates line objects)
for image in page.objects["line"]:
    util.drawRects(img, [util.map_list(int, [h - image["y1"], 
                         image["x0"], 
                         h - image["y0"],
                         image["x1"]
                               ])],
                   copy_for_show_result=False, 
                   show_result_img=True
                         )

# Bare expression: shows (h, w) as notebook cell output.
h, w

# rect
for image in page.objects["rect"]:
    rect = [util.map_list(int, [h - image["y1"],
                                image["x0"],
                                h - image["y0"],
                                image["x1"]
                                ])]
    # Skip (and print) rects whose area reaches roughly the whole page;
    # presumably these are page-frame rectangles rather than content.
    if util._calc_area(rect[0]) >= (h-1)//1 * w//1:
        print([h - image["y1"],
               image["x0"],
               h - image["y0"],
               image["x1"]
               ])
        continue

    util.drawRects(img, rect,
                   copy_for_show_result=False,
                   show_result_img=True
                   )

# Notebook cell: turn the regions found on the mask into horizontal split
# lines and cut the rendered page along them.
rects = util.getMaxRects(img, show_result_img=True)
h, w = img.shape[:2]
hlines = []
# When enabled, every split line spans the full image width instead of only
# the columns covered by its rectangle.
dilateMode = True
for rect in rects:
    r, c, r1, c1 = rect
    if dilateMode:
        c, c1 = 0, w
    
    # Line just above the rectangle (2 rows of margin), clamped at the top.
    t = 1 if r-2 <= 0 else (r-2)
    top = [c, t - 1, c1, t]
    
    # Line just below the rectangle, clamped at the bottom of the image.
    t = h-2 if r1 + 2 >= h else r1 + 1
    bot = [ c, t,  c1, t + 1]
    
    hlines.append(top)
    hlines.append(bot)
# Also use the very start and end of the image as split lines
# NOTE(review): hlines[0] raises IndexError when no rectangle was found.
x1, y1, x2, y2 = hlines[0]
if y2 > 2:
    hlines.insert(0, [x1, 0, x2, 1])

x1, y1, x2, y2 = hlines[-1]
if y2 < (h - 2):
    hlines.append([x1, h - 2, x2, h - 1])
# Bare expressions below show their value as notebook cell output.
hlines

page.edges

util.drawRects(img, util.xy2rc(hlines), show_result_img=True)

# Cut the rendered page into strips along the split lines, keeping only
# strips taller than the dominant character size.
img0 = cv2.imread('0.png')
imgs, pos =util.getSubImgsAndPos(img0, hlines, filterFun=lambda img: img.shape[0] > word_size)

util.imshow(imgs)

# Notebook cell: dump the string form of several page attributes to
# '<attr>.json' files for offline inspection.
import util
# Splitting on "/." yields: objects, chars, lines, rects, curves, figures, images.
for attr in "objects/.chars/.lines/.rects/.curves/.figures/.images".split("/."):
    if hasattr(page, attr):
        util.dump(str(getattr(page, attr)), f"{attr}.json")

# Notebook cell: render page indexes 8 and 9 of the sample PDF to '8.png' and
# '9.png' at 1x zoom.
import util

pdf_path = r'D:\DongDongsFiles\file\2022-02\H2_AN202011171430084484_1.pdf'
pg = 8
util.pdf2img(pdf_path, pg, f'{pg}.png', zoom_x=1, zoom_y=1)
pg = 9
util.pdf2img(pdf_path, pg, f'{pg}.png', zoom_x=1, zoom_y=1)

# Notebook cell: split page 8 into horizontal strips, using the page's drawn
# objects to keep boxed regions together.
import cv2
import numpy as np
import util

# Binarised renderings of pages 8 and 9.
# NOTE(review): the meaning of the (200, 255, 0) arguments depends on
# util.getBinary - confirm there.
img8 = util.getBinary('8.png', 200, 255, 0).astype(np.uint8)
img9 = util.getBinary('9.png', 200, 255, 0).astype(np.uint8)
util.imshow([img8, img9])

util.imshow(img8)
words, w_size, w_s_stats, page = util.getWords(pdf_path, 8)
# Draw the page's object boxes on a blank canvas and binarise it. The canvas
# must be np.zeros: np.empty returns uninitialised memory, and any stray
# non-zero value would survive the threshold-at-0 binarisation below.
img8_rected = util.getBinary(
    util.drawRects(np.zeros(img8.shape, dtype=np.uint8), getRectsInPage(page), show_result_img=False, line_color=[255,0,0]),
    0,
    255,
    0
    )
util.imshow(img8_rected)
# Get the maximal rectangular regions
img8_rects_exclude_small_rect = util.getMaxRects(img8_rected, show_result_img=False)

# Diagram of the region partition
_ = util.drawRects(img8, img8_rects_exclude_small_rect,line_color=[0,0,255], show_result_img=True)
img8_binary = img8
rs = img8_rects_exclude_small_rect.copy()
img8_binary_filled = util.drawRects(img8_binary, rs, line_color=[255,255,255], fill_color=[255,255,255], show_result_img=True)

# NOTE(review): ndarray.shape[:2] is (rows, cols), so `w8` holds the row count
# here and the kernel's first dimension is derived from it - confirm this is
# the intended axis before relying on the kernel size.
w8, h8 = img8_binary.shape[:2]
# The kernel's second dimension is set from the dominant character size
k = cv2.getStructuringElement(cv2.MORPH_RECT, (w8 * 3 // 4, int(getSeriesValue(w_s_stats))))
img8_erode = util.erode(img8_binary, k)
util.imshow(img8_erode)

# Extract horizontal lines, then adjust their positions. adjustLines rewrites
# `lines` in place; its return value is not used here.
lines = util.extractHline(img8_erode)
adjustLines(lines, img8_rects_exclude_small_rect)
util.imwrite(util.drawRects(img8, util.xy2rc(lines), show_result_img=True))
imgs, pos = util.getSubImgsAndPos(img8, lines)
util.imshow(imgs)

# Notebook cell: blank out word boxes, re-cut the page into strips, then look
# for vertical gaps inside each strip to split it into columns.
# Draw (fill) the text boxes
img8_word_filled = util.drawRects(img8_binary_filled, getWsRects(words), fill_color=[255,255,255], line_color=0, show_result_img=True)

# Extract horizontal lines, then adjust their positions (in place)
lines = util.extractHline(img8_erode)# 调整直线的位置
adjustLines(lines, img8_rects_exclude_small_rect)
util.drawRects(img8, util.xy2rc(lines), show_result_img=True)
imgs, pos = util.getSubImgsAndPos(img8_word_filled, lines)
util.imshow(imgs)

# Matching strips cut from the unfilled page; consumed in lockstep with `imgs`.
img8_children, _ = util.getSubImgsAndPos(img8, lines)
img8_children = iter(img8_children)
# Column splitting
for img in imgs:
    
    # NOTE(review): shape[:2] is (rows, cols), so `w` is the row count and `h`
    # the column count here; `kh` below is derived from `h` - confirm the axes.
    w, h = img.shape[:2]
    df = DataFrame(util.getWhiteBarWidthList(img, isHline=False, rate=0.8))
    series = df.value_counts()
    
    image = next(img8_children)
    if series.size > 2:
        # More than two distinct bar widths: this strip needs further splitting
        print(series, f'[len={len(series)}]')
        print(f'{series.index[0][0]} 出现次数最多， 为 {series.array[0]} 次。')

        kh = h * 3 // 5
        kw = int(getSeriesValue(w_s_stats) * 1.2)
        k = np.ones((kh, kw))
        
        img_erode = util.erode(img, k, anchor=[kh - 1, -1])
        vlines = util.extractVline(img_erode, rate=0.75)
        util.drawRects(image, util.xy2rc(vlines), show_result_img=True)
        _imgs, _pos = util.getSubImgsAndPos(image, vlines)
        # _imgs, _pos = util.getSubImgsAndPos(img, vlines)
        util.imshow(_imgs)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-a295eaed4814> in <module>
----> 1 img8_children, _ = util.getSubImgsAndPos(img8, lines)
      2 img8_children = iter(img8_children)
      3 # 分栏
      4 for img in imgs:
      5 

NameError: name 'util' is not defined

import util
from pandas import DataFrame
import numpy as np
import cv2


def de_duplicate_rect(page_areas: list, iou=0.95):
    """Drop boxes that overlap the previously kept box too strongly.

    The boxes are sorted by items (0, 2, 1, 3); each box is then compared
    only with the most recently kept one.

    Args:
        page_areas (list): boxes to de-duplicate.
        iou (float, optional): a box is dropped when ``util.calc_iou`` with
            the last kept box is greater than this. Defaults to 0.95.

    Returns:
        list: the de-duplicated boxes, in sorted order.
    """
    ordered = sorted(page_areas, key=lambda rect: (rect[0], rect[2], rect[1], rect[3]))
    kept = []
    for candidate in ordered:
        if kept and util.calc_iou(kept[-1], candidate) > iou:
            # near-duplicate of the last kept box
            continue
        kept.append(candidate)
    return kept


def getRcRects(rects, h, zoom=1.0, save_float=False, exclude_page_rect=True):
    """Convert page objects into boxes in (r, c, r1, c1) order.

    Args:
        rects: iterable of mappings with 'x0', 'y0', 'x1' and 'y1' keys.
        h: height that the y values are subtracted from (callers in this
            file pass ``page.height``).
        zoom: scale factor applied to every coordinate.
        save_float: keep coordinates as ``float`` instead of casting to ``int``.
        exclude_page_rect: when true, boxes whose first two values are both
            zero are dropped.

    Returns:
        list: converted boxes, as produced by ``util.xy2rc``.
    """
    print("getRcRects, zoom=", zoom)
    cast = float if save_float else int

    def to_rc(obj):
        # y is flipped against h; presumably bottom-origin PDF y -> image rows
        scaled = (
            obj['x0'] * zoom,
            (h - obj['y1']) * zoom,
            obj['x1'] * zoom,
            (h - obj['y0']) * zoom,
        )
        return util.xy2rc(util.map_list(cast, scaled))

    converted = util.map_list(to_rc, rects)
    if not exclude_page_rect:
        return list(converted)

    kept = []
    for box in converted:
        row, col, _row_end, _col_end = box
        if row == 0 and col == 0:
            # anchored at the origin: treated as the page frame and skipped
            continue
        kept.append(box)
    return kept


def getWsRects(words, zoom=1.0, save_float=False, **kw):
    """Convert word mappings into boxes and shift them by an optional offset.

    Args:
        words: iterable of mappings with 'x0', 'top', 'x1' and 'bottom' keys.
        zoom: scale factor applied to every coordinate.
        save_float: keep coordinates as ``float`` instead of casting to ``int``.
        **kw: optional ``offset=(row, col)``. The offset is 1-based: the
            default ``[1, 1]`` means "no shift", and each box is moved by
            ``offset - 1`` along rows and columns.

    Returns:
        One shifted ``[r, c, r1, c1]`` list per word.
    """
    # `kw` is a dict, so the key has to be looked up as a key. The previous
    # hasattr(kw, "offset") test was always False, which silently ignored any
    # offset the caller passed.
    offset = kw.get("offset", [1, 1])[:2]
    cast = float if save_float else int

    def word_box(word):
        corners = (
            word['x0'] * zoom,
            word['top'] * zoom,
            word['x1'] * zoom,
            word['bottom'] * zoom,
        )
        return util.xy2rc(util.map_list(cast, corners))

    def apply_offset(_rect):
        row_off, col_off = offset
        d_row, d_col = row_off - 1, col_off - 1
        r, c, r1, c1 = _rect
        return [r + d_row, c + d_col, r1 + d_row, c1 + d_col]

    return util.map_list(apply_offset, util.map_list(word_box, words))


def getRectsInPage(page, exclude_page_rect=True, zoom=1, draw_image=False, is_line_th=4):
    """Collect the boxes of drawable objects (lines, curves, rects, ...) on a page.

    Args:
        page: page object exposing ``lines``, ``curves``, ``rects``,
            ``images`` and ``height`` (a pdfplumber page in this file).
        exclude_page_rect: forwarded to ``getRcRects``; when true, boxes
            anchored at the origin are dropped, when false nothing is dropped.
        zoom: scale factor applied to every coordinate.
        draw_image: when true, every image box is included.
        is_line_th: an image whose height or width is below this value is
            treated as a line and always included.

    Returns:
        list: boxes of lines, curves, rects, (optionally) images, followed by
        the thin images. With ``draw_image=True`` a thin image therefore
        appears twice.
    """
    sources = [page.lines, page.curves, page.rects]
    if draw_image:
        sources.append(page.images)

    rects = []
    for objs in sources:
        # The flag used to be ignored here: getRcRects always ran with its
        # default (True), so exclude_page_rect=False still lost origin boxes.
        # Forwarding it also makes the old page-size filter redundant, since
        # getRcRects already drops every box with r == c == 0.
        rects.extend(
            getRcRects(objs, page.height, zoom,
                       exclude_page_rect=exclude_page_rect)
        )

    # Some "lines" are actually embedded as very thin images.
    thin_candidates = getRcRects(page.images, page.height, zoom,
                                 exclude_page_rect=exclude_page_rect)
    for rect in thin_candidates:
        img_h, img_w = util.calc_h_w(rect)
        if img_h < is_line_th or img_w < is_line_th:
            util.logger.info(f"有一个图片可能是线段：{rect}.")
            rects.append(rect)

    return rects


def getSeriesValue(series):
    """Pick the smallest first-level label among entries tied with the first count.

    Args:
        series: Series indexed by tuples, e.g. the result of
            ``DataFrame.value_counts()`` (presumably sorted by count, so the
            first entry is the most frequent one).

    Returns:
        The minimum ``index[0]`` over all entries whose value equals
        ``series.values[0]``.
    """
    target = series.values[0]
    best = series.index[0][0]
    for label, count in series.items():
        if count != target:
            continue
        if label[0] < best:
            best = label[0]
    return best


def adjustLines(lines, rects_exclude_small_rect, th = 0, is_hline=True):
    """Move split lines that cut through a rectangle onto the rectangle's edges.

    Both inputs are modified in place: ``rects_exclude_small_rect`` is sorted
    and every matching entry of ``lines`` is replaced by its snapped version.

    Args:
        lines: list of 4-item lines ``[x, y, x1, y1]``.
        rects_exclude_small_rect: list of boxes ``[r, c, r1, c1]``.
        th: offset added to both rectangle bounds before the containment test.
        is_hline: test ``y`` against rows when true, ``x`` against columns
            otherwise.

    Returns:
        list: new list sorted by item 1. A matched line contributes the
        snapped line, the opposite-edge line, and the snapped line again
        (the replaced ``lines`` entry is appended after the match).
    """
    if is_hline:
        order = lambda box: (box[0], -box[2])
    else:
        order = lambda box: (box[1], -box[3])
    rects_exclude_small_rect.sort(key=order)

    result = []
    for pos, (x, y, x1, y1) in enumerate(lines):
        for box in rects_exclude_small_rect:
            top, left, bottom, right = box
            if is_hline and (top + th) < y < (bottom + th):
                lines[pos] = [x, max(0, top - 1), x1, top]
                result.append([x, max(0, top - 1), x1, top])
                result.append([x, bottom - 1, x1, bottom])
                break
            if not is_hline and (left + th) < x < (right + th):
                lines[pos] = [max(0, x - 1), top, x, bottom]
                result.append([max(0, x - 1), top, x, bottom])
                result.append([right - 1, top, right, bottom])
                break
        # appended for every line, matched or not
        result.append(lines[pos])

    result.sort(key=lambda line: line[1])
    return result

# Notebook cell: pick the sample PDF, set the tuning constants and render the
# page under test to '<pg>.png'.
from pandas import DataFrame
# from funs import *
import numpy as np
from importlib import reload

PAGE_OUT = "page_rect.json"
# Only the last uncommented assignment takes effect; the rest are kept as a
# quick way to switch between sample files.
pdf_path = r'C:\Users\Administrator\Desktop\xiaokuang.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\tu_pian.pdf'
pdf_path = r'C:\Users\Administrator\Desktop\ban_fl.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\fl2.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\fl0.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\fl.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\fl_.pdf'

# pdf_path = r'C:\Users\Administrator\Desktop\stream0.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\stream1.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\stream3.pdf'

# pdf_path = r'C:\Users\Administrator\Desktop\nb0.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\nb2.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\nb1.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\nb5.pdf'
pdf_path = r'C:\Users\Administrator\Desktop\nb4.pdf'
pdf_path = r'C:\Users\Administrator\Desktop\lc.pdf'
pdf_path = r'C:\Users\Administrator\Desktop\lc1.pdf'

pg = 0          # page index under test
line_th = 4     # boxes thinner than this are treated as lines further down
zoom = 1        # render / coordinate scale factor
util.pdf2img(pdf_path, pg, f'{pg}.png', zoom_x=zoom, zoom_y=zoom)
# Creates a global named img<pg> holding the rendered page, via exec.
# NOTE(review): the code below refers to `img0` directly, so this only lines
# up while pg == 0.
exec(f"""
import cv2
img{pg} = cv2.imread('{pg}.png')
""")
# Bare expression: shows the image shape as notebook cell output.
img0.shape
import sys
# sys.setrecursionlimit(80000)

class Color:
    """Colour triples used to mark each kind of page region when drawing.

    The channel order (RGB or BGR) is whatever the drawing helper expects;
    it is not established in this file.
    """

    IMAGE = [255, 0, 0]          # image areas
    RECT = [0, 0, 255]           # rect areas
    TEXT_AREA = [50, 120, 255]   # text areas
    LINE = [255, 0, 255]         # line areas
    TEXT_LINE = [211, 211, 0]    # text lines
# Notebook cell: load the page's words and collect the boxes of its images,
# lines and rects in image (row/column) coordinates.
words, w_size, w_s_stats, page = util.getWords(pdf_path, pg)
image_area_rects = getRcRects(page.images, page.height, zoom)
line_area_rects = getRcRects(page.lines, page.height, zoom)
rect_area_rects = getRcRects(page.rects, page.height, zoom)
util.drawRects(img0, rect_area_rects, show_result_img=True)

# Dominant character size, used as a thickness threshold below.
word_size_as_th = getSeriesValue(w_s_stats)
expand = max(0, int(word_size_as_th * zoom / 2))

# Some headers and footers are drawn as rects rather than lines: a rect that
# is thin, spans most of the page width and sits in the top or bottom 20% of
# the page is treated as a line as well.
rect_lines = []
for _r in rect_area_rects:
    img_h, img_w = util.calc_h_w(_r)
    # _r[0] is a row coordinate, so the footer bound must be taken from the
    # page height; it was previously compared against page.width * 0.8.
    if img_h < max(line_th, word_size_as_th) \
        and img_w > 0.7 * page.width \
            and (_r[0] < page.height * 0.2 or _r[0] > page.height * 0.8):
        util.logger.info(f"有一个矩形可能是线段：{_r}")
        rect_lines.append(_r)
        line_area_rects.append(_r)

def getMaxAreaRect(img, area_rects,
                   directs: list=None,
                   show_result_img=True):
    """Draw ``area_rects`` on a blank canvas and return the rectangles found on it.

    Args:
        img: image whose ``shape`` determines the canvas size.
        area_rects: rectangles to draw on the canvas.
        directs: neighbour offsets forwarded to ``util.getMaxRects``;
            defaults to the four axis-aligned directions.
        show_result_img: forwarded to the drawing and detection helpers.

    Returns:
        Whatever ``util.getMaxRects`` returns for the binarised canvas.
    """
    if directs is None:
        # Fresh list per call; avoids sharing one mutable default object.
        directs = [(0, 1), (1, 0), (-1, 0), (0, -1)]
    # The canvas must start all-zero: np.empty() returns uninitialised memory,
    # so stray non-zero pixels could survive binarisation.
    canvas = np.zeros(img.shape, dtype=np.uint8)
    img_rected = util.getBinary(
        util.drawRects(canvas, area_rects,
                       show_result_img=show_result_img, line_color=[255, 0, 0]),
        0,
        255,
        0
    )
    # Get the largest rectangular regions.
    return util.getMaxRects(img_rected, directs=directs, show_result_img=show_result_img)
image_area_rects = getMaxAreaRect(img0, image_area_rects)
line_area_rects = getMaxAreaRect(img0, line_area_rects)
rect_area_rects = getMaxAreaRect(img0, rect_area_rects)

2022-03-09 09:46:12 [INFO] [pdfdocument-MainThread]  944 - xref found: pos=b'554914'
2022-03-09 09:46:12 [INFO] [pdfdocument-MainThread]  962 - read_xref_from: start=554914, token=/b'xref'
2022-03-09 09:46:12 [INFO] [pdfdocument-MainThread]  134 - xref objects: {1: (None, 17, 0), 2: (None, 66, 0), 3: (None, 122, 0), 4: (None, 209, 0), 5: (None, 605, 0), 6: (None, 2432, 0), 7: (None, 2569, 0), 8: (None, 2597, 0), 9: (None, 2763, 0), 10: (None, 2834, 0), 11: (None, 3089, 0), 12: (None, 363150, 0), 13: (None, 363315, 0), 14: (None, 372526, 0), 15: (None, 372703, 0), 16: (None, 372941, 0), 17: (None, 373567, 0), 18: (None, 373738, 0), 19: (None, 373978, 0), 20: (None, 549919, 0), 21: (None, 549945, 0), 22: (None, 550127, 0), 23: (None, 550370, 0), 24: (None, 550790, 0), 25: (None, 550967, 0), 26: (None, 551222, 0), 27: (None, 551434, 0), 28: (None, 551699, 0), 29: (None, 551930, 0), 30: (None, 552195, 0), 31: (None, 552426, 0), 32: (None, 552691, 0), 33: (None, 552922, 0), 34: (None, 553187, 0), 35: (None, 553418, 0), 36: (None, 553682, 0), 37: (None, 553913, 0), 38: (None, 554184, 0), 39: (None, 554418, 0), 40: (None, 554683, 0)}
2022-03-09 09:46:12 [INFO] [pdfdocument-MainThread]  976 - trailer: {'Root': <PDFObjRef:1>, 'Info': <PDFObjRef:3>, 'Size': 41, 'ID': [b'\xf3W\xa2Lo\xc4\x8cN\x00&\xdd\x07\xf7\xe3\x143', b'\xf3W\xa2Lo\xc4\x8cN\x00&\xdd\x07\xf7\xe3\x143']}
2022-03-09 09:46:12 [INFO] [pdfdocument-MainThread]  182 - trailer: {'Root': <PDFObjRef:1>, 'Info': <PDFObjRef:3>, 'Size': 41, 'ID': [b'\xf3W\xa2Lo\xc4\x8cN\x00&\xdd\x07\xf7\xe3\x143', b'\xf3W\xa2Lo\xc4\x8cN\x00&\xdd\x07\xf7\xe3\x143']}
2022-03-09 09:46:12 [INFO] [pdfpage-MainThread]  109 - Pages: Kids=[<PDFObjRef:4>]
2022-03-09 09:46:12 [INFO] [pdfpage-MainThread]  113 - Page: {'Contents': <PDFObjRef:5>, 'Group': {'CS': /'DeviceRGB', 'S': /'Transparency', 'Type': /'Group'}, 'MediaBox': [0, 0, 595.32, 841.92], 'Parent': <PDFObjRef:2>, 'Resources': {'Font': {'F1': <PDFObjRef:6>, 'F2': <PDFObjRef:14>, 'F5': <PDFObjRef:17>, 'F6': <PDFObjRef:21>, 'F7': <PDFObjRef:24>}, 'ProcSet': [/'PDF', /'Text', /'ImageB', /'ImageC', /'ImageI'], 'XObject': {'Image692': <PDFObjRef:27>, 'Image696': <PDFObjRef:29>, 'Image698': <PDFObjRef:31>, 'Image704': <PDFObjRef:33>, 'Image706': <PDFObjRef:35>, 'Image709': <PDFObjRef:37>, 'Image713': <PDFObjRef:39>}}, 'Tabs': /'S', 'Type': /'Page'}
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  994 - Processing page: <PDFPage: Resources={'Font': {'F1': <PDFObjRef:6>, 'F2': <PDFObjRef:14>, 'F5': <PDFObjRef:17>, 'F6': <PDFObjRef:21>, 'F7': <PDFObjRef:24>}, 'ProcSet': [/'PDF', /'Text', /'ImageB', /'ImageC', /'ImageI'], 'XObject': {'Image692': <PDFObjRef:27>, 'Image696': <PDFObjRef:29>, 'Image698': <PDFObjRef:31>, 'Image704': <PDFObjRef:33>, 'Image706': <PDFObjRef:35>, 'Image709': <PDFObjRef:37>, 'Image713': <PDFObjRef:39>}}, MediaBox=[0, 0, 595.32, 841.92]>
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  1020 - render_contents: resources={'Font': {'F1': <PDFObjRef:6>, 'F2': <PDFObjRef:14>, 'F5': <PDFObjRef:17>, 'F6': <PDFObjRef:21>, 'F7': <PDFObjRef:24>}, 'ProcSet': [/'PDF', /'Text', /'ImageB', /'ImageC', /'ImageI'], 'XObject': {'Image692': <PDFObjRef:27>, 'Image696': <PDFObjRef:29>, 'Image698': <PDFObjRef:31>, 'Image704': <PDFObjRef:33>, 'Image706': <PDFObjRef:35>, 'Image709': <PDFObjRef:37>, 'Image713': <PDFObjRef:39>}}, streams=[<PDFStream(5): raw=1756, {'Filter': /'FlateDecode', 'Length': 1754}>], ctm=(1, 0, 0, 1, 0, 0)
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  187 - get_font: create: objid=6, spec={'BaseFont': /b'ABCDEE+\xcb\xce\xcc\xe5', 'DescendantFonts': <PDFObjRef:7>, 'Encoding': /'Identity-H', 'Subtype': /'Type0', 'ToUnicode': <PDFObjRef:13>, 'Type': /'Font'}
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  187 - get_font: create: objid=None, spec={'BaseFont': /b'ABCDEE+\xcb\xce\xcc\xe5', 'CIDSystemInfo': <PDFObjRef:9>, 'CIDToGIDMap': /'Identity', 'DW': 1000, 'FontDescriptor': <PDFObjRef:10>, 'Subtype': /'CIDFontType2', 'Type': /'Font', 'W': <PDFObjRef:12>, 'Encoding': /'Identity-H', 'ToUnicode': <PDFStream(13): raw=9139, {'Filter': /'FlateDecode', 'Length': 9137}>}
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  187 - get_font: create: objid=14, spec={'BaseFont': /'Times New Roman', 'Encoding': /'WinAnsiEncoding', 'FirstChar': 32, 'FontDescriptor': <PDFObjRef:15>, 'LastChar': 247, 'Name': /'F2', 'Subtype': /'TrueType', 'Type': /'Font', 'Widths': <PDFObjRef:16>}
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  187 - get_font: create: objid=17, spec={'BaseFont': /'ABCDEE+Calibri', 'Encoding': /'WinAnsiEncoding', 'FirstChar': 32, 'FontDescriptor': <PDFObjRef:18>, 'LastChar': 32, 'Name': /'F5', 'Subtype': /'TrueType', 'Type': /'Font', 'Widths': <PDFObjRef:20>}
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  187 - get_font: create: objid=21, spec={'BaseFont': /'Times New Roman,Bold', 'Encoding': /'WinAnsiEncoding', 'FirstChar': 32, 'FontDescriptor': <PDFObjRef:22>, 'LastChar': 178, 'Name': /'F6', 'Subtype': /'TrueType', 'Type': /'Font', 'Widths': <PDFObjRef:23>}
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  187 - get_font: create: objid=24, spec={'BaseFont': /b'ABCDEE+\xcb\xce\xcc\xe5', 'Encoding': /'WinAnsiEncoding', 'FirstChar': 32, 'FontDescriptor': <PDFObjRef:25>, 'LastChar': 107, 'Name': /'F7', 'Subtype': /'TrueType', 'Type': /'Font', 'Widths': <PDFObjRef:26>}
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  965 - Processing xobj: <PDFStream(37): raw=82, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 80, 'SMask': <PDFObjRef:38>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 295}>
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  965 - Processing xobj: <PDFStream(31): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:32>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}>
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  965 - Processing xobj: <PDFStream(29): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:30>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}>
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  965 - Processing xobj: <PDFStream(27): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:28>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}>
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  965 - Processing xobj: <PDFStream(27): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:28>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}>
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  965 - Processing xobj: <PDFStream(29): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:30>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}>
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  965 - Processing xobj: <PDFStream(29): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:30>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}>
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  965 - Processing xobj: <PDFStream(29): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:30>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}>
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  965 - Processing xobj: <PDFStream(29): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:30>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}>
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  965 - Processing xobj: <PDFStream(29): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:30>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}>
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  965 - Processing xobj: <PDFStream(33): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:34>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}>
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  965 - Processing xobj: <PDFStream(39): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:40>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}>
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  965 - Processing xobj: <PDFStream(35): raw=75, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 66, 'Interpolate': False, 'Length': 73, 'SMask': <PDFObjRef:36>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}>
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  965 - Processing xobj: <PDFStream(39): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:40>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}>

getRcRects, zoom= 1
getRcRects, zoom= 1
getRcRects, zoom= 1

2022-03-09 09:46:12 [INFO] [<ipython-input-88-3d717eb512cd>-MainThread]  61 - 有一个矩形可能是线段：[ 55  88  55 506]

2022-03-09 09:46:13 [INFO] [util-MainThread]  406 - 找外围最大矩形框耗时: [0.18946218490600586]s

2022-03-09 09:46:13 [INFO] [util-MainThread]  406 - 找外围最大矩形框耗时: [0.1501309871673584]s

2022-03-09 09:46:13 [INFO] [util-MainThread]  406 - 找外围最大矩形框耗时: [0.31815361976623535]s

rect_lines

[array([ 55,  88,  55, 506])]

# Refine the rect regions: re-detect rectangles inside every rect region after
# closing small gaps in the rendered pixels.
rect_area_rects2 = []
# Crop from an untouched copy, because img0 itself is drawn on inside the loop.
img = img0.copy()
for r in rect_area_rects:
    # Skip degenerate regions.
    if util.calc_area(r) < 1:
        continue
    _img = util.getSubImg(img, r)
    
    # Binarise the crop (threshold arguments as defined by util.getBinary).
    _img_bi = util.getBinary(_img, 250, 0, 255)
    util.imshow(_img_bi)
    
    
    # Closing with a 1-row x (2 * word size)-column kernel.
    kh, kw = list(map(int, (1, word_size_as_th * 2)))
    k = np.ones((kh, kw))
    _im = cv2.morphologyEx(_img_bi, cv2.MORPH_CLOSE, k)
    util.imshow(_im, "闭运算列连接")
    # Closing with a (2 * word size)-row x 1-column kernel.
    kh, kw = list(map(int, (word_size_as_th * 2, 1)))
    k = np.ones((kh, kw))
    _im = cv2.morphologyEx(_im, cv2.MORPH_CLOSE, k)
    util.imshow(_im, "闭运算行连接")
    # `offset=r` is passed so results can be reported relative to the page
    # rather than the crop — presumably; confirm in util.getMaxRectsTwice.
    temp = util.getMaxRectsTwice(_im, offset=r)
    # Draw the detected regions.
    _ = util.drawRects(img0, temp, show_result_img=True, line_color=[255,0,255])
    rect_area_rects2.extend(temp)
        
# The refined rectangles replace the original rect regions.
rect_area_rects = rect_area_rects2

2022-03-09 09:46:16 [INFO] [util-MainThread]  406 - 找外围最大矩形框耗时: [0.5954070091247559]s
2022-03-09 09:46:16 [INFO] [util-MainThread]  406 - 找外围最大矩形框耗时: [0.6034188270568848]s
2022-03-09 09:46:16 [INFO] [util-MainThread]  508 - 有offset函数.

_ = util.drawRects(img0, rect_area_rects, show_result_img=True, line_color=[255,0,255])

# util.canShow = False
# `img9` is an alias of img0 (same array object, not a copy).
img9 = img0
img9_binary = util.getBinary(img9, 200, 255, 0)

# Draw every metadata rectangle of the page on a blank canvas and binarise it.
# The canvas must be zero-filled: np.empty() returns uninitialised memory.
img9_rected = util.getBinary(
    util.drawRects(np.zeros(img9.shape[:2], dtype=np.uint8), getRectsInPage(
        page, True, zoom), show_result_img=True, line_color=[255, 0, 0], title="绘制meatadata中的rectangle"),
    0,
    255,
    0
)
# util.imshow([img9_binary, img9_rected])
# Get the largest rectangular regions.
# img9_rects_exclude_small_rect = util.getMaxRects(img9_rected,
#                                                 #  directs=[(0, 1), (1, 0), ],
#                                                  show_result_img=True)
# All non-text regions found so far: images, lines and rectangles.
img9_rects_exclude_small_rect = []
img9_rects_exclude_small_rect.extend(image_area_rects)
img9_rects_exclude_small_rect.extend(line_area_rects)
img9_rects_exclude_small_rect.extend(rect_area_rects)

# Region partition overview.
_ = util.drawRects(img9, img9_rects_exclude_small_rect,
                   line_color=[0, 0, 255], show_result_img=True)
rs = img9_rects_exclude_small_rect.copy()
# Binary page image with those regions filled with 255.
img9_binary_filled = util.drawRects(img9_binary, rs, line_color=[
                                    255, 255, 255], fill_color=[255, 255, 255], show_result_img=False)

getRcRects, zoom= 1
getRcRects, zoom= 1
getRcRects, zoom= 1
getRcRects, zoom= 1

# Look for a table that is delimited by thin horizontal rules and add its
# estimated cell regions to rect_area_rects.
h, w = page.height, page.width
# NOTE(review): w_th is not referenced anywhere in the visible part of the script.
w_th = w * 0.6
# Rects whose rows lie strictly inside the middle 80% of the page height.
page_rects = util.filter_list(lambda r: h * 0.1 < r[0] and r[2] < h * 0.9 , rect_area_rects)
# Flat and sufficiently wide rects are treated as possible rule lines.
maybe_lines = util.filter_list(lambda r: util.calc_h(r) < line_th and util.calc_w(r) > 5 * word_size_as_th, page_rects)
# TODO
if len(page_rects) > 2 and len(maybe_lines) > 0:
    page_rects.sort(key=lambda r: (r[0], -r[2], r[1], -r[3]))
    # Groups of consecutive rects; `append` says whether rects are currently
    # being collected into the last group.
    maybe_tbs = [[]]
    append = False
    for r in page_rects:
        if util.calc_h(r) < line_th:
            if len(maybe_tbs[-1]) == 0:
                # A flat rect while the current group is empty toggles collection.
                append = not append
            elif r[0] - maybe_tbs[-1][-1][0] < 2 * word_size_as_th:
                # A flat rect whose top row is within two word sizes of the
                # previously collected rect toggles collection and starts a new group.
                append = not append
                maybe_tbs.append([])
            else:
                print(maybe_tbs, r)
        if append:
            maybe_tbs[-1].append(r)

    # Only the first group is used from here on.
    maybe_tb = maybe_tbs[0]
    # Row span from the top of the first rect to the bottom of the last rect.
    st, ed = maybe_tb[0][0], maybe_tb[-1][2]
    words_rects = getWsRects(words, zoom)
    # Word boxes lying strictly inside that row span.
    filter_wrects = de_duplicate_rect(util.filter_list(lambda x: st < x[0] and x[2] < ed, words_rects))
    print(f"原size={len(words_rects)}, 长度过滤之后{len(filter_wrects)}.")
    # util.drawRects(np.zeros(img9_rected.shape[:2]), filter_wrects + [maybe_tb[0] ,maybe_tb[-1]], show_result_img=True)
    # Closing with a tall one-column kernel (half the row span in height).
    k = np.ones((int((ed - st) / 2), 1))
    _img_close = cv2.morphologyEx(util.drawRects(img9_rected, filter_wrects, show_result_img=True, title="pdf 上的rect"),
                    cv2.MORPH_CLOSE,
                    k)
    util.imshow(_img_close, "_img_close")

    _rect = util.getMaxRects(_img_close)
    # Redraw the result with a fill expansion of one word size and detect again.
    _rect = util.getMaxRects(util.drawRects(np.zeros(img0.shape[:2]), _rect,fill_color=[0,0,0], fill_expanding=[0, word_size_as_th] ,show_result_img=True),
                            show_result_img=True)
    # Shrink the columns back by roughly the amount added by the expansion.
    _rect = util.map_list(lambda r:[r[0], r[1] + int(word_size_as_th) - 1, r[2],  r[3] - int(word_size_as_th) + 2], _rect)
    rect_area_rects.extend(_rect)
    print(rect_area_rects)
else:
    print("没找到表格候选区域")
    
_ = util.drawRects(img0, 
                   rect_area_rects,
                   show_result_img=True)

没找到表格候选区域

# Build a mask that starts at 255 everywhere and is drawn with 0 over the
# (enlarged) non-text regions.
mask = np.ones(img0.shape[:2]).astype(np.uint8) * 255
# Enlarge every region: 1.5 word sizes along the rows, 1 word size along the
# columns, truncated to ints.
img9_rects_exclude_small_rect = util.map_list(lambda r: util.map_list(int, [r[0] - word_size_as_th * 1.5,
                                                                            # r[1] - word_size_as_th * 1.5,
                                                                            r[1] - word_size_as_th,
                                                                            r[2] + word_size_as_th * 1.5,
                                                                            # r[3] + word_size_as_th * 1.5
                                                                            r[3] + word_size_as_th
                                                                            ]), img9_rects_exclude_small_rect)
# copy_for_show_result=False is passed so the drawing lands on `mask` itself —
# presumably; confirm in util.drawRects. `mask` is used again further down.
_ = util.drawRects(mask,
                   img9_rects_exclude_small_rect,
                   line_color=0,
                   fill_color=0,
                   fill_expanding=[0, 0],
                   copy_for_show_result=False,
                   show_result_img=True)

# # 方案1
# ws_rects = getWsRects(words, zoom)

# expand = max(0, int(word_size_as_th * zoom / 2))
# _img = np.zeros(img0.shape[:2]).astype(np.uint8)
# util.drawRects(_img, ws_rects,
#                line_color=[255, 0, 0],
#                fill_color=[0, 0, 0],
#                fill_expanding=expand,
#                show_result_img=True, copy_for_show_result=False)
# word_focused = np.bitwise_and(_img, mask)
# util.imshow(word_focused)
# points_set = []
# text_area_rects = util.getMaxRectsTwice(
#     word_focused, show_result_img=True, result=points_set)

# text_area_rects = util.map_list(lambda r: [r[0] + expand - 1, r[1] + expand - 1, r[2] - expand + 2 ,  r[3] -expand + 2], text_area_rects )

# _ = util.drawRects(img0, text_area_rects, show_result_img=True, title="游离的文本行")

# Scheme 2: split the word boxes into those that fall inside one of the
# enlarged non-text regions and those that do not.

ws_rects = getWsRects(words, zoom)
ws_rects_not_in_table = []

# Word boxes that were matched to some region.
mask_text = []
for wr in ws_rects:
    add_flag = True
    for orect in img9_rects_exclude_small_rect:
        # First element of the isInnerCell result is the "is inside" flag.
        if  util.isInnerCell(wr, orect, strict=False)[0]:
            add_flag = False
            mask_text.append(wr)
            # One matching region is enough.
            break
    if add_flag: 
        ws_rects_not_in_table.append(wr)
_ = util.drawRects(img0, ws_rects_not_in_table, line_color=[255, 0,0], show_result_img=True)

# Text regions built from the words that lie outside every non-text region.
# NOTE(review): for a numpy image, shape[:2] is (rows, cols), so `w` would
# receive the row count here — confirm what util.getBinary returns.
# This also rebinds the page-size `h, w` assigned earlier in the script.
w, h = img9_binary.shape[:2]
# The character size determines the kernel height.
# NOTE(review): this kernel is rebound by `k = np.ones(...)` below before any
# visible use; the only consumer (util.erode) is commented out.
k = cv2.getStructuringElement(
    cv2.MORPH_RECT, (w * 3 // 4, int(word_size_as_th)))
# img9_erode = util.erode(img9_binary_filled, k)
# util.imshow([img9_binary_filled, img9_erode])

# ws_rects = getWsRects(words, zoom)
# # Draw the text boxes
# img9_word_filled = util.drawRects(img9_binary_filled, ws_rects, fill_color=[255, 255, 255], line_color=0, show_result_img=True)

# TODO may not be needed
# ws_rects = getWsRects(words, zoom)
# Draw the text boxes
util.imshow(img9_binary_filled)
# Draw the free word boxes (expanded by `expand`) on an all-255 canvas.
img9_word_filled = util.drawRects(np.ones(img0.shape[:2]).astype(np.uint8) * 255, 
                                  ws_rects_not_in_table, 
                                  fill_color=[255, 255, 255], 
                                  line_color=[0,0,0], 
                                  fill_expanding=expand,
                                  show_result_img=True,
                                  title="img9_word_filled")


# Invert the canvas.
word_focused = np.bitwise_not(img9_word_filled)
util.imshow(word_focused)
# Dilate then erode with the same (1.2 * word size) x (word size) kernel.
k = np.ones((int(word_size_as_th * 1.2), int(word_size_as_th)))
word_focused_dilate = util.dilate(word_focused, k)
word_focused_dilate = util.erode(word_focused_dilate, k)
util.imshow(word_focused_dilate, "文字膨胀")
points_set = []
text_area_rects = util.getMaxRects(word_focused_dilate, show_result_img=True, result=points_set)
# Shrink each rectangle by roughly the expansion added when drawing.
text_area_rects = util.map_list(lambda r: [r[0] + expand - 1, r[1] + expand - 1, r[2] - expand + 2 ,  r[3] -expand + 2], text_area_rects )

2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  210 - right_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in

2022-03-09 09:46:20 [INFO] [util-MainThread]  350 - 关键字参数中存在result, 将会将点集放入result中.

2022-03-09 09:46:20 [INFO] [util-MainThread]  406 - 找外围最大矩形框耗时: [0.1436166763305664]s

# Same procedure for the words that lie INSIDE a non-text region (mask_text).
# NOTE(review): this kernel is rebound by `k = np.ones(...)` below before use.
k = cv2.getStructuringElement(
    cv2.MORPH_RECT, (w * 3 // 4, int(word_size_as_th)))
# Draw the matched word boxes (expanded by `expand`) on an all-255 canvas.
img9_word_filled = util.drawRects(np.ones(img0.shape[:2]).astype(np.uint8) * 255, 
                                  mask_text, 
                                  fill_color=[255, 255, 255], 
                                  line_color=[0,0,0], 
                                  fill_expanding=expand,
                                  show_result_img=True,
                                  title="img9_word_filled")

word_focused = np.bitwise_not(img9_word_filled)
util.imshow(word_focused)
# Kernel proportions are transposed relative to the free-text step above:
# (word size) rows x (1.2 * word size) columns.
k = np.ones((int(word_size_as_th), int(word_size_as_th * 1.2)))
word_focused_dilate = util.dilate(word_focused, k)
word_focused_dilate = util.erode(word_focused_dilate, k)
util.imshow(word_focused_dilate, "文字膨胀")
points_set = []
# NOTE(review): detection runs on `word_focused`, not on the
# `word_focused_dilate` image computed just above — confirm this is intended.
text_area_rects_mask = util.getMaxRectsTwice(word_focused, show_result_img=True, result=points_set)
# Shrink each rectangle by roughly the expansion added when drawing.
text_area_rects_mask = util.map_list(lambda r: [r[0] + expand - 1, r[1] + expand - 1, r[2] - expand + 2 ,  r[3] -expand + 2], text_area_rects_mask )

2022-03-09 09:46:21 [INFO] [util-MainThread]  406 - 找外围最大矩形框耗时: [0.05585312843322754]s
2022-03-09 09:46:21 [INFO] [util-MainThread]  350 - 关键字参数中存在result, 将会将点集放入result中.

2022-03-09 09:46:21 [INFO] [util-MainThread]  406 - 找外围最大矩形框耗时: [0.1934823989868164]s

# Group the word boxes by their height (r[2] - r[0], formatted to 2 decimals)
# and detect text regions separately for every height group.
# NOTE(review): the getWsRects shown at the top of this file takes only
# (words, zoom); this call needs a version accepting save_float — confirm it
# is redefined elsewhere in the file.
ws_rects_float = getWsRects(words, zoom, save_float=True)
ws_rects_dict = util.groupby(ws_rects_float, key=lambda r: "%.2f" % (r[2] - r[0]))

text_area_rects_2 = []
for k, group in ws_rects_dict.items():
    # `k` is a "%.2f" string: parse it once with float() instead of eval().
    _size = float(k)
    _expand = max(0, int(_size * zoom * 0.6))
    _img = np.zeros(img0.shape[:2]).astype(np.uint8)
    for r in group:
        util.drawRects(_img, util.map_list(int, r), 
                       line_color=[255,0,0],
                       fill_color=[0,0,0], 
                       copy_for_show_result=False, 
                       fill_expanding=_expand,
                       show_result_img=False,
                       title=f"字号 {k}")
        
        
    # Keep only pixels where `mask` is non-zero (outside the non-text regions).
    _img = np.bitwise_and(_img, mask)
    
    temp = util.getMaxRectsTwice(_img, line_color=[255, 0, 0], show_result_img=True, title=f"字号 {k}")
    # break
    # Shrink each rectangle by roughly the expansion added when drawing.
    temp = util.map_list(lambda r: [r[0] + _expand - 1, r[1] + _expand - 1, r[2] - _expand + 2 ,  r[3] -_expand + 2], temp )
    for r in temp:
        # Drop regions lower than 80% of this group's word height.
        if (r[2] - r[0]) > _size * 0.8:
            text_area_rects_2.append(r)
            # NOTE(review): text_area_rects is rebound right after the loop,
            # so this append only affects the old list object.
            text_area_rects.append(r)
    # TODO
    # _rects = text_area_rects.copy()
    # for cell_i in temp:
    #     for cell_o in _rects:
    #         if 
    #         text_area_rects.append(cell_i)
# Filter out small text regions

# From here on the text regions are the per-size results plus the regions
# found for the words inside non-text areas.
text_area_rects = text_area_rects_2
text_area_rects.extend(text_area_rects_mask)

# text_area_rects = list(filter(lambda r: util.calc_area(r) > word_size_as_th ** 2, text_area_rects))
# util.dump(_words, "ws.json")

2022-03-09 09:46:21 [INFO] [util-MainThread]  406 - 找外围最大矩形框耗时: [0.005983114242553711]s

2022-03-09 09:46:21 [INFO] [util-MainThread]  406 - 找外围最大矩形框耗时: [0.18650007247924805]s
2022-03-09 09:46:21 [INFO] [util-MainThread]  406 - 找外围最大矩形框耗时: [0.004987955093383789]s

2022-03-09 09:46:21 [INFO] [util-MainThread]  406 - 找外围最大矩形框耗时: [0.13463973999023438]s
2022-03-09 09:46:21 [WARNING] [util-MainThread]  363 - 图片不存在白色区域!

2022-03-09 09:46:21 [WARNING] [util-MainThread]  363 - 图片不存在白色区域!

# One rectangle list per category, in the same order as `colors` below.
rects = [image_area_rects, 
         line_area_rects ,
         rect_area_rects,
         text_area_rects,
         getWsRects(words, zoom)]
colors = [Color.IMAGE, Color.LINE, Color.RECT, Color.TEXT_AREA, Color.TEXT_LINE]
# Build a legend strip: one 10x10 swatch per colour, joined along axis 1.
color_img = None
for c in colors:
    i = np.array([c for _ in range(100)]).astype(np.uint8)
    i = np.reshape(i, (10,10,3))
    if color_img is None:
        color_img = i
    else:
        color_img = np.concatenate((color_img, i), 1)

util.imshow(color_img, " | ".join(["image", "line", "rectangle", "text_area", "text_line"]))

    
# Overlay every category on a copy of the page, showing each step.
img = img0.copy()
for r, c in zip(rects, colors):
    util.drawRects(img, r, 
                   line_color=c, 
                #    fill_color=c,
                   show_result_img=True,
                   copy_for_show_result=False)

# Same overlay on an all-zero canvas of the same shape, shown once at the end.
img = img0.copy() * 0
for r, c in zip(rects, colors):
    util.drawRects(img, r, 
                   line_color=c, 
                   show_result_img=False,
                   copy_for_show_result=False)
util.imshow(img)

# Binarise the overlay; the result is consumed by the line extraction below.
img_bi = util.getBinary(img, 1, 0, 255)
util.imshow(img_bi)
# Accumulators for the page regions and their cropped images.
page_areas = []
page_areas_imgs = []


def extractLines(img_bi, hline=True, rate=0.95):
    """Split ``img_bi`` along the lines found by util.

    Args:
        img_bi: binary image to analyse.
        hline: use ``util.extractHline`` when true, ``util.extractVline`` otherwise.
        rate: forwarded unchanged to the chosen extractor.

    Returns:
        The result of ``util.getSubImgsAndPos`` for the image and the found
        lines (unpacked by the caller as sub-images and positions).
    """
    import util
    if hline:
        found = util.extractHline(img_bi, rate=rate)
    else:
        found = util.extractVline(img_bi, rate=rate)
    return util.getSubImgsAndPos(img_bi, found)


# Horizontal split of the binarised overlay; keep both crops and positions.
imgs, pos = extractLines(img_bi)
util.imshow(imgs)
page_areas_imgs.extend(imgs)
page_areas.extend(util._offset([0, 0], pos))

# Redraw all categories on an all-zero canvas and binarise it again.
img = img0.copy() * 0
for r, c in zip(rects, colors):
    util.drawRects(img, r, 
                   line_color=c, 
                   show_result_img=False,
                   copy_for_show_result=False)
util.imshow(img)
img_bi = util.getBinary(img, 1, 0, 255)

# `rs` is handed to util.page_rebulid as `rects=` and is used afterwards as
# the list of page areas, i.e. it is expected to be filled in place — confirm
# in util. The function's return value is dumped to PAGE_OUT.
rs = []
util.dump(util.page_rebulid(img_bi, rects=rs, rate = 0.98), PAGE_OUT)
_ = util.drawRects(img, rs, line_color=[255,255,255], show_result_img=True)
# Replaces the page_areas collected by the earlier extractLines step.
page_areas = rs
img = img0.copy()
_ = util.drawRects(img, 
               page_areas,
               show_result_img=True,
               line_color=[0,0,0], 
               title="区域划分示意图")

2022-03-09 09:46:25 [INFO] [util-MainThread]  1086 - start=0, end=44

2022-03-09 09:46:25 [INFO] [util-MainThread]  1086 - start=0, end=30

2022-03-09 09:46:25 [INFO] [util-MainThread]  1086 - start=0, end=621

2022-03-09 09:46:25 [INFO] [util-MainThread]  1086 - start=0, end=106
2022-03-09 09:46:25 [INFO] [util-MainThread]  1227 - 没有子图了
2022-03-09 09:46:25 [INFO] [util-MainThread]  1227 - 没有子图了
2022-03-09 09:46:25 [INFO] [util-MainThread]  1227 - 没有子图了
2022-03-09 09:46:26 [INFO] [util-MainThread]  1227 - 没有子图了

# Final per-category rectangle lists; the rebuilt page areas are shown
# together with the rect regions.
rects = [image_area_rects,
         line_area_rects,
         rect_area_rects + page_areas,
         text_area_rects,
         getWsRects(words, zoom)]
colors = [Color.IMAGE, Color.LINE, Color.RECT,
          Color.TEXT_AREA, Color.TEXT_LINE]
# Legend strip: one 10x10 swatch per colour, joined along axis 1.
color_img = None
for c in colors:
    i = np.array([c for _ in range(100)]).astype(np.uint8)
    i = np.reshape(i, (10, 10, 3))
    if color_img is None:
        color_img = i
    else:
        color_img = np.concatenate((color_img, i), 1)


# Category labels, in the same order as `rects` and `colors`.
ls = ["image", "line", "rectangle", "text_area", "text_line"]
util.imshow(color_img, " | ".join(ls))

    
# Overlay every category on a copy of the page, titled with its label.
img = img0.copy()
for r, c, t in zip(rects, colors, ls):
    util.drawRects(img, r, 
                   line_color=c, 
                #    fill_color=c,
                   show_result_img=True,
                   copy_for_show_result=False,
                   title=t)

util.imshow(img)

# text_area_rects_add = []
# import pdfplumber
# page = pdfplumber.open(pdf_path).pages[0]

# for _rect in rect_area_rects:
#     if util._calc_area(_rect) < word_size_as_th ** 2 * 4:
#         continue
#     crop_box  = page.within_bbox(util.xy2rc(_rect, 1 / zoom))
#     _words = crop_box.extract_words(
#                 keep_blank_chars=False, 
#                 x_tolerance=word_size_as_th * 1.5, 
#                 y_tolerance=word_size_as_th * 0.25)
#     _rects = []
#     for category in rects:
#         for r in category:
#             if util.isInnerCell(r, _rect)[0]:
#                 _rects.append(r)
#     _img = util.getSubImg(
#         util.drawRects(img0, getWsRects(_words, zoom=zoom), 
#                        line_color=[255,0,0],
#                        show_result_img=True, title="绘制文本行"),
#     _rect
#     )    
    
#     th = DataFrame(_img.flatten()).value_counts().index[0][0]
#     _img_bi = np.bitwise_not(util.getBinary(_img, th - 2, 255, 0))
    
#     util.imshow(_img_bi, "Rect 子图")
    
#     kh, kw = list(map(int, (1, word_size_as_th)))
#     k = np.ones((kh, kw))
#     _im = cv2.morphologyEx(_img_bi, cv2.MORPH_CLOSE, k)
#     util.imshow(_im, "闭运算列连接")
#     kh, kw = list(map(int, (word_size_as_th, 1)))
#     k = np.ones((kh, kw))
#     _im = cv2.morphologyEx(_im, cv2.MORPH_CLOSE, k)
#     util.imshow(_im, "闭运算行连接")
    
#     text_area_rects_add.extend(util.getMaxRectsTwice(_im,
#                                                 directs=[(0, 1), (1, 0)],
#                                                 show_result_img=True,
#                                                 filter=lambda rect: util._calc_area(
#                                                     rect) > word_size_as_th ** 2,
#                                                 offset=_rect,
#                                                 title="绘制文本框"
#                                                 )
#                             )

# # util.dump(_words, "ws.json")

def mergeTextArea(text_area_rects, text_area_rects_add, rate = 0.8):
    """Collect the enclosing rectangles of every containment between two lists.

    Every pair is tested in both roles: first with the rects of
    ``text_area_rects_add`` as inner candidates, then with the rects of
    ``text_area_rects`` as inner candidates. A candidate whose string form
    was already recorded as an inner rect is skipped.

    Args:
        text_area_rects: first list of rectangles.
        text_area_rects_add: second list of rectangles.
        rate: forwarded to ``util.isInnerCell``.

    Returns:
        List of the ``outer`` rectangles reported by ``util.isInnerCell`` for
        each successful test, in the order the tests succeeded.
    """
    merged = []
    consumed = set()

    def _test_pair(inner_candidate, outer_candidate):
        # Record the enclosing rect when the candidate is contained.
        if str(inner_candidate) in consumed:
            return
        hit, outer, inner = util.isInnerCell(
            inner_candidate, outer_candidate, strict=False, rate=rate)
        if hit:
            merged.append(outer)
            consumed.add(str(inner))

    # Pass 1: added rects as inner candidates of the base rects.
    for base in text_area_rects:
        for extra in text_area_rects_add:
            _test_pair(extra, base)

    # Pass 2: base rects as inner candidates of the added rects.
    for base in text_area_rects:
        for extra in text_area_rects_add:
            _test_pair(base, extra)

    return merged

# Snapshot / restore of the text regions, kept so this cell can be re-run:
# after these three statements text_area_rects is a shallow copy of its
# previous value and `pre` is another shallow copy of it.
pre = text_area_rects.copy()

text_area_rects = pre

pre = text_area_rects.copy()
# `_img` is the canvas left over from the per-size text-region loop.
_ = util.drawRects(_img, text_area_rects,
               show_result_img=True)

# Subdivide large text regions.

import pdfplumber
# Re-open the page being analysed; use `pg` rather than a hard-coded 0 so this
# stays consistent with util.getWords(pdf_path, pg) at the top of the script.
page = pdfplumber.open(pdf_path).pages[pg]

# Only the regions present before the loop are examined; pieces appended
# during the loop are not split again.
length = len(text_area_rects)
for  i in range(length):
    tr = text_area_rects[i]
    # "Large" = more than 4 word sizes high and wider than half the image.
    if (tr[2] - tr[0]) > 4 * word_size_as_th and (tr[3] - tr[1]) >  img0.shape[1] * 0.5:
        util.drawRects(img0, tr, show_result_img=True)
    
        _img_bi = util.getBinary(util.getSubImg(img0, tr), 250, 0, 255)
        util.imshow(_img_bi, "Rect 子图")
        
        # Closing with a 1-row x (word size)-column kernel.
        kh, kw = list(map(int, (1, word_size_as_th)))
        k = np.ones((kh, kw))
        _im = cv2.morphologyEx(_img_bi, cv2.MORPH_CLOSE, k)
        util.imshow(_im, "闭运算列连接")
        # Closing with a (1.8 * word size)-row x 1-column kernel.
        kh, kw = list(map(int, (word_size_as_th * 1.8, 1)))
        k = np.ones((kh, kw))
        _im = cv2.morphologyEx(_im, cv2.MORPH_CLOSE, k)
        util.imshow(_im, "闭运算行连接")
        
        temp = util.getMaxRects(_im, offset=tr, show_result_img=True)
        # Replace the region only if it really split into several pieces.
        if len(temp) > 1:
            text_area_rects[i] = temp[0]
            text_area_rects.extend(temp[1:])
# Filter out small text regions

text_area_rects = list(filter(lambda r: util.calc_area(r) > word_size_as_th ** 2, text_area_rects))
util.drawRects(_img, text_area_rects,
               show_result_img=True)
# util.dump(_words, "ws.json")

2022-03-09 09:46:30 [INFO] [pdfdocument-MainThread]  944 - xref found: pos=b'554914'
2022-03-09 09:46:30 [INFO] [pdfdocument-MainThread]  962 - read_xref_from: start=554914, token=/b'xref'
2022-03-09 09:46:30 [INFO] [pdfdocument-MainThread]  134 - xref objects: {1: (None, 17, 0), 2: (None, 66, 0), 3: (None, 122, 0), 4: (None, 209, 0), 5: (None, 605, 0), 6: (None, 2432, 0), 7: (None, 2569, 0), 8: (None, 2597, 0), 9: (None, 2763, 0), 10: (None, 2834, 0), 11: (None, 3089, 0), 12: (None, 363150, 0), 13: (None, 363315, 0), 14: (None, 372526, 0), 15: (None, 372703, 0), 16: (None, 372941, 0), 17: (None, 373567, 0), 18: (None, 373738, 0), 19: (None, 373978, 0), 20: (None, 549919, 0), 21: (None, 549945, 0), 22: (None, 550127, 0), 23: (None, 550370, 0), 24: (None, 550790, 0), 25: (None, 550967, 0), 26: (None, 551222, 0), 27: (None, 551434, 0), 28: (None, 551699, 0), 29: (None, 551930, 0), 30: (None, 552195, 0), 31: (None, 552426, 0), 32: (None, 552691, 0), 33: (None, 552922, 0), 34: (None, 553187, 0), 35: (None, 553418, 0), 36: (None, 553682, 0), 37: (None, 553913, 0), 38: (None, 554184, 0), 39: (None, 554418, 0), 40: (None, 554683, 0)}
2022-03-09 09:46:30 [INFO] [pdfdocument-MainThread]  976 - trailer: {'Root': <PDFObjRef:1>, 'Info': <PDFObjRef:3>, 'Size': 41, 'ID': [b'\xf3W\xa2Lo\xc4\x8cN\x00&\xdd\x07\xf7\xe3\x143', b'\xf3W\xa2Lo\xc4\x8cN\x00&\xdd\x07\xf7\xe3\x143']}
2022-03-09 09:46:30 [INFO] [pdfdocument-MainThread]  182 - trailer: {'Root': <PDFObjRef:1>, 'Info': <PDFObjRef:3>, 'Size': 41, 'ID': [b'\xf3W\xa2Lo\xc4\x8cN\x00&\xdd\x07\xf7\xe3\x143', b'\xf3W\xa2Lo\xc4\x8cN\x00&\xdd\x07\xf7\xe3\x143']}
2022-03-09 09:46:30 [INFO] [pdfpage-MainThread]  109 - Pages: Kids=[<PDFObjRef:4>]
2022-03-09 09:46:30 [INFO] [pdfpage-MainThread]  113 - Page: {'Contents': <PDFObjRef:5>, 'Group': {'CS': /'DeviceRGB', 'S': /'Transparency', 'Type': /'Group'}, 'MediaBox': [0, 0, 595.32, 841.92], 'Parent': <PDFObjRef:2>, 'Resources': {'Font': {'F1': <PDFObjRef:6>, 'F2': <PDFObjRef:14>, 'F5': <PDFObjRef:17>, 'F6': <PDFObjRef:21>, 'F7': <PDFObjRef:24>}, 'ProcSet': [/'PDF', /'Text', /'ImageB', /'ImageC', /'ImageI'], 'XObject': {'Image692': <PDFObjRef:27>, 'Image696': <PDFObjRef:29>, 'Image698': <PDFObjRef:31>, 'Image704': <PDFObjRef:33>, 'Image706': <PDFObjRef:35>, 'Image709': <PDFObjRef:37>, 'Image713': <PDFObjRef:39>}}, 'Tabs': /'S', 'Type': /'Page'}

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

# 调整文本行
# _img = np.zeros(img0.shape[:2])

# util.drawRects(_img, text_area_rects, fill_color=[0,0,0],
#                fill_expanding=[expand, 0],
#                copy_for_show_result=False)

# text_area_rects = util.getMaxRectsTwice(_img, show_result_img=True)

# text_area_rects = util.map_list(lambda r: [r[0] + expand - 1, 
#                                            r[1], 
#                                            r[2] - expand + 1 ,  
#                                            r[3]], text_area_rects )

# 重构页面结构
img = img0.copy() * 0
rects = [image_area_rects, 
         line_area_rects ,
         rect_area_rects,
         text_area_rects,
         getWsRects(words, zoom)]
colors = [Color.IMAGE, Color.LINE, Color.RECT, Color.TEXT_AREA, Color.TEXT_LINE]
for r, c in zip(rects, colors):
    util.drawRects(img, r, 
                   line_color=c, 
                   show_result_img=False,
                   copy_for_show_result=False)
util.imshow(img)
img_bi = util.getBinary(img, 1, 0, 255)

rs = []
util.dump(util.page_rebulid(img_bi, rects=rs, rate = 0.98), PAGE_OUT)
_ = util.drawRects(img, rs, line_color=[255,255,255], show_result_img=True)

page_areas = de_duplicate_rect(rs)
img = img0.copy()
_ = util.drawRects(img, 
               page_areas,
               show_result_img=True,
               line_color=[0,0,0], 
               title="区域划分示意图")

2022-03-09 09:46:32 [INFO] [util-MainThread]  1086 - start=0, end=44

2022-03-09 09:46:32 [INFO] [util-MainThread]  1086 - start=0, end=30

2022-03-09 09:46:32 [INFO] [util-MainThread]  1086 - start=0, end=621

2022-03-09 09:46:32 [INFO] [util-MainThread]  1086 - start=0, end=106
2022-03-09 09:46:32 [INFO] [util-MainThread]  1227 - 没有子图了
2022-03-09 09:46:32 [INFO] [util-MainThread]  1227 - 没有子图了
2022-03-09 09:46:32 [INFO] [util-MainThread]  1227 - 没有子图了
2022-03-09 09:46:32 [INFO] [util-MainThread]  1227 - 没有子图了

rects = [image_area_rects, 
         line_area_rects ,
         rect_area_rects,
        #  mergeTextArea(text_area_rects ,text_area_rects_add),
         text_area_rects,
         getWsRects(words, zoom)]
colors = [Color.IMAGE, Color.LINE, Color.RECT, Color.TEXT_AREA, Color.TEXT_LINE]
color_img = None
for c in colors:
    i = np.array([c for _ in range(100)]).astype(np.uint8)
    i = np.reshape(i, (10,10,3))
    if color_img is None:
        color_img = i
    else:
        color_img = np.concatenate((color_img, i), 1)

ls = ["image", "line", "rectangle", "text_area", "text_line"]
util.imshow(color_img, " | ".join(ls))

    
img = img0.copy()
for r, c, t in zip(rects, colors, ls):
    util.drawRects(img, r, 
                   line_color=c, 
                #    fill_color=c,
                   show_result_img=True,
                   copy_for_show_result=False,
                   title=t)
# 画上区域
_ = util.drawRects(img, page_areas, show_result_img=True, line_color=[0,0,0])

# 画上区域
_img = util.drawRects(img0, page_areas, show_result_img=True, line_color=[0,0,0])
util.imshow(np.concatenate((img0, _img), 1), "划分示意图")

class PageAreaMetaClass(type):
    """Debugging metaclass that prints the arguments of every class creation."""

    def __new__(cls, name, bases, attrs):
        """Create the class as ``type`` would, print the inputs, return it."""
        created = super().__new__(cls, name, bases, attrs)
        print(cls, name, bases, attrs)
        return created


# class PageArea(dict, metaclass=PageAreaMetaClass):
class PageArea(dict):
    """Recursive layout node built from a flat list of page rectangles.

    Rectangles are indexed as ``(r0, c0, r1, c1)``: the height is
    ``rect[2] - rect[0]`` and the width is ``rect[3] - rect[1]``.

    The rectangle with the largest area becomes this node's own ``rect``;
    all rectangles with a different area are its ``children_rects``.  When
    children exist, the node is tagged ``"vertical"`` or ``"horizon"`` and
    the children are grouped into nested ``PageArea`` nodes
    (``children_pages``).  Nodes without children have neither ``tag`` nor
    ``children_pages``.

    All data is stored in instance attributes; the inherited dict storage is
    left empty and ``node[key]`` is a shortcut for attribute access.
    """

    # Share of the parent's height/width that a child must exceed to be
    # treated as spanning that dimension.
    THRESHOLD = 0.98
    HORIZON = "horizon"
    VERTICAL = "vertical"

    def _calc_h(self, rect):
        """Return the row extent ``rect[2] - rect[0]``."""
        return rect[2] - rect[0]

    def _calc_w(self, rect):
        """Return the column extent ``rect[3] - rect[1]``."""
        return rect[3] - rect[1]

    def calc_h_w(self, rect):
        """Return ``(height, width)`` of ``rect``."""
        return self._calc_h(rect), self._calc_w(rect)

    def __init__(self, rects):
        """Build the node and, recursively, its child nodes.

        Args:
            rects: re-iterable collection of ``(r0, c0, r1, c1)`` rectangles;
                it is traversed several times.

        Raises:
            ValueError: if the first child (in sorted order) spans neither
                exactly one of the parent's dimensions, so the split
                direction cannot be determined.
        """
        super().__init__()
        max_area = max(util.map_list(util.calc_area, rects))
        self.area = max_area
        # The first rectangle having the maximal area represents this node.
        for r in rects:
            if util.calc_area(r) == max_area:
                self.rect = list(r)
                break
        # Everything whose area differs from the maximum is a child.
        children_rects = [
            rect for rect in rects if util.calc_area(rect) != max_area]
        children_rects.sort(key=lambda rect: (
            rect[0], rect[2], rect[1], rect[3]))
        self.children_rects = util.map_list(list, children_rects)

        if not children_rects:
            return

        # The first child decides the split direction: full height but not
        # full width -> "vertical"; full width but not full height -> "horizon".
        h, w = self.calc_h_w(children_rects[0])
        h1, w1 = self.calc_h_w(self.rect)
        if h / h1 > self.THRESHOLD and w / w1 < self.THRESHOLD:
            mode = self.VERTICAL
        elif h / h1 < self.THRESHOLD and w / w1 > self.THRESHOLD:
            mode = self.HORIZON
        else:
            raise ValueError("未知的分类")

        def spans_parent(rect):
            # A "key" rectangle spans the parent along the split direction.
            if mode == self.VERTICAL:
                return self._calc_h(rect) / h1 > self.THRESHOLD
            return self._calc_w(rect) / w1 > self.THRESHOLD

        key_rects = [rect for rect in children_rects if spans_parent(rect)]

        # Group every child under each key rectangle that contains it.  The
        # groups are keyed by the rectangle's text form (as before), but the
        # rectangle object itself is handed to isInnerCell, so the text no
        # longer has to be parsed back with eval().
        groups = {}
        for child in children_rects:
            for key_rect in key_rects:
                if util.isInnerCell(child, key_rect)[0]:
                    groups.setdefault(str(key_rect), []).append(child)

        self.tag = mode
        self.children_pages = [PageArea(group) for group in groups.values()]

    def __getitem__(self, key):
        """Return the attribute named ``key``, or ``None`` when it is missing."""
        return getattr(self, key, None)

    @classmethod
    def _to_plain(cls, value):
        """Convert nodes, sequences and NumPy scalars into plain Python data."""
        if isinstance(value, PageArea):
            return {name: cls._to_plain(item)
                    for name, item in value.__dict__.items()}
        if isinstance(value, list):
            return [cls._to_plain(item) for item in value]
        if isinstance(value, tuple):
            return tuple(cls._to_plain(item) for item in value)
        if isinstance(value, np.generic):
            return value.item()
        return value

    def __repr__(self):
        """Serialise the node (and its descendants) with ``util.dumps``."""
        return util.dumps(self._to_plain(self))

    __str__ = __repr__


PageArea(page_areas)

{
    "area": 501832,
    "rect": [
        0,
        0,
        842,
        596
    ],
    "children_rects": [
        [
            21,
            0,
            65,
            596
        ],
        [
            64,
            0,
            94,
            596
        ],
        [
            93,
            0,
            714,
            596
        ],
        [
            713,
            0,
            819,
            596
        ]
    ],
    "tag": "horizon",
    "children_pages": [
        {
            "area": 26224,
            "rect": [
                21,
                0,
                65,
                596
            ],
            "children_rects": []
        },
        {
            "area": 17880,
            "rect": [
                64,
                0,
                94,
                596
            ],
            "children_rects": []
        },
        {
            "area": 370116,
            "rect": [
                93,
                0,
                714,
                596
            ],
            "children_rects": []
        },
        {
            "area": 63176,
            "rect": [
                713,
                0,
                819,
                596
            ],
            "children_rects": []
        }
    ]
}

_page = PageArea(page_areas).children_pages[2]
_rect = _page.rect
print(_page)
util.drawRects(img0, _rect, line_color=[0, 0, 255], show_result_img=True, 
                title=str(_rect))

# for _page in PageArea(page_areas).children_pages:
#     _rect = _page.rect
#     util.drawRects(img0, _rect, line_color=[0, 0, 255], show_result_img=True, 
#                     title=str(_rect))

{
    "area": 132312,
    "rect": [
        86,
        0,
        308,
        596
    ],
    "children_rects": [
        [
            86,
            39,
            308,
            305
        ],
        [
            86,
            304,
            308,
            562
        ],
        [
            91,
            39,
            117,
            305
        ],
        [
            116,
            39,
            307,
            305
        ]
    ],
    "tag": "vertical",
    "children_pages": [
        {
            "area": 59052,
            "rect": [
                86,
                39,
                308,
                305
            ],
            "children_rects": [
                [
                    91,
                    39,
                    117,
                    305
                ],
                [
                    116,
                    39,
                    307,
                    305
                ]
            ],
            "tag": "horizon",
            "children_pages": [
                {
                    "area": 6916,
                    "rect": [
                        91,
                        39,
                        117,
                        305
                    ],
                    "children_rects": []
                },
                {
                    "area": 50806,
                    "rect": [
                        116,
                        39,
                        307,
                        305
                    ],
                    "children_rects": []
                }
            ]
        },
        {
            "area": 57276,
            "rect": [
                86,
                304,
                308,
                562
            ],
            "children_rects": []
        }
    ]
}

array([[[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       ...,

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       [[254, 254, 254],
        [254, 254, 254],
        [254, 254, 254],
        ...,
        [254, 254, 254],
        [254, 254, 254],
        [255, 255, 255]],

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]]], dtype=uint8)

# Remove duplicated regions based on intersection-over-union (IoU).

# Order by top-left corner, with larger bottom/right coordinates first, so a
# rectangle is compared against the most recently kept one.
sorted_page_areas = sorted(
    page_areas,
    key=lambda rect: (rect[0], rect[1], -rect[2], -rect[3]),
)
de_duplicate = []
for r in sorted_page_areas:
    if not de_duplicate:
        # The very first rectangle is always kept.
        de_duplicate.append(r)
        continue
    last = de_duplicate[-1]
    if util.calc_iou(last, r) > 0.95:
        # Nearly identical to the last kept rectangle: report and drop it.
        print('交并比 > 0.95', r, last, util.calc_iou(r, last))
    else:
        de_duplicate.append(r)

page_areas = de_duplicate
for _rect in de_duplicate:
    util.drawRects(img0, _rect, line_color=[0, 0, 255], show_result_img=True,
                   title=str(_rect))

rect_area_rects2 = []
img = img0.copy()
for i in range(len(rect_area_rects)):
    r = rect_area_rects[i]
    _img = util.getSubImg(img0, r)
    
    _img_bi = util.getBinary(_img, 250, 0, 255)
    # util.imshow(_img_bi)
    
    
    kh, kw = list(map(int, (1, word_size_as_th)))
    k = np.ones((kh, kw))
    _im = cv2.morphologyEx(_img_bi, cv2.MORPH_CLOSE, k)
    util.imshow(_im, "闭运算列连接")
    kh, kw = list(map(int, (word_size_as_th, 1)))
    k = np.ones((kh, kw))
    _im = cv2.morphologyEx(_im, cv2.MORPH_CLOSE, k)
    util.imshow(_im, "闭运算行连接")
    temp = util.getMaxRectsTwice(_im, offset=r)
    # 画上区域
    _ = util.drawRects(img0, temp, show_result_img=True, line_color=[255,0,255])
    rect_area_rects2.extend(temp)

2022-03-04 16:01:06 [INFO] [util-MainThread]  388 - 找外围最大矩形框耗时: [0.6801793575286865]s
2022-03-04 16:01:07 [INFO] [util-MainThread]  388 - 找外围最大矩形框耗时: [0.7130606174468994]s
2022-03-04 16:01:07 [INFO] [util-MainThread]  490 - 有offset函数.

2022-03-04 16:01:08 [INFO] [util-MainThread]  388 - 找外围最大矩形框耗时: [0.6921482086181641]s
2022-03-04 16:01:09 [INFO] [util-MainThread]  388 - 找外围最大矩形框耗时: [0.6392576694488525]s
2022-03-04 16:01:09 [INFO] [util-MainThread]  490 - 有offset函数.

2022-03-04 16:01:10 [INFO] [util-MainThread]  388 - 找外围最大矩形框耗时: [0.6811752319335938]s
2022-03-04 16:01:11 [INFO] [util-MainThread]  388 - 找外围最大矩形框耗时: [0.7280511856079102]s
2022-03-04 16:01:11 [INFO] [util-MainThread]  490 - 有offset函数.

# 3.1

img = m.copy()
all_points = list(zip(*(np.where(img > 0))))
im = img == 100
np.where(im)[0]
np.where(img)

kh, kw = list(map(int, (_img.shape[0], word_size_as_th * 0.6)))
k = np.ones((kh, kw))
_img_erode = util.erode(_img_, k)
util.imshow(_img_erode)
lines = util.extractVline(_img_erode)
print(f"找到 {len(lines)} 条竖线{lines}")
util.drawRects(_img_, util.xy2rc(lines), show_result_img=True)
if len(lines) > 2:
    _imgs, pos = util.getSubImgsAndPos(_img_, lines)
    util.imshow(_imgs, "竖线分割图")

def line_split(points, horizonal=True):
    """Split an ordered run of points into contiguous segments.

    Consecutive points stay in the same segment while the compared
    coordinate grows by less than 2 from one point to the next; a larger
    jump starts a new segment.

    Args:
        points: iterable of ``(r, c)`` pairs in scan order.
        horizonal: when True (default) the second component ``c`` is
            compared; otherwise the first component ``r``.

    Returns:
        list: segments in input order, each a list of the original points.
        An empty input yields an empty list.  When more than two segments
        are found, their count is printed.
    """
    axis = 1 if horizonal else 0
    result = []
    line = []
    for p in points:
        # Start a new segment only when there is a previous point and the
        # gap to it is at least 2; each point is stored exactly once.
        if line and p[axis] - line[-1][axis] >= 2:
            result.append(line)
            line = [p]
        else:
            line.append(p)

    if line:
        result.append(line)
    if len(result) > 2:
        print(f"找到{len(result)}条线段")
    return result

from collections import OrderedDict as odict
horizon_lines = odict()
n = 0
# Row collection: map every image row to the list of contiguous pixel runs
# on that row.  ``all_points`` comes from ``np.where``, so points of the same
# row arrive consecutively and in increasing column order.
line_points = []
for p in all_points:
    r, c = p
    # A change of row closes the buffered row: split it into runs and store
    # them under the row they belong to.
    if line_points and line_points[0][0] != r:
        horizon_lines[line_points[0][0]] = line_split(line_points)
        line_points = []
    # Every point, including the first one of a row, is buffered.
    line_points.append(p)

# Flush the last row once, after all points have been consumed.
if line_points:
    horizon_lines[line_points[0][0]] = line_split(line_points)

# Row indices that have an entry in horizon_lines, in insertion order.
keys = list(horizon_lines)

# Bands of adjacent rows: each element is a list of row indices in which
# neighbouring entries differ by less than 2.
hareas = []

# Seed the first band with the first row (raises IndexError when keys is empty).
kset = [keys[0]]
for k in keys[1:]:
    if k - kset[-1] < 2:
        # Still adjacent to the previous row: extend the current band.
        kset.append(k)
    else:
        # Gap found: close the current band and start a new one.
        hareas.append(kset)
        kset = [k]
# Store the band that was still open when the loop ended.
if len(kset) > 0:
    hareas.append(kset)

def isConnect(l1, l2):
    """Tell whether two pixel runs overlap in their column ranges.

    Only the column (index 1) of the first and of the last point of each
    run is read, so every run is treated as the closed interval
    ``[first column, last column]``.

    Args:
        l1: non-empty sequence of ``(r, c)`` points ordered by column.
        l2: non-empty sequence of ``(r, c)`` points ordered by column.

    Returns:
        True when the two column intervals share at least one column.  This
        includes identical spans, shared end columns, and one run fully
        covering the other.
    """
    l1_start, l1_end = l1[0][1], l1[-1][1]
    l2_start, l2_end = l2[0][1], l2[-1][1]
    # Two closed intervals overlap iff each starts no later than the other ends.
    return l1_start <= l2_end and l2_start <= l1_end
# Work on the second band of adjacent rows.
r_range = hareas[1]

flag_list = []

# Pixel runs found on the first row of the band.
init_lines = horizon_lines[r_range[0]]

line_flags = []
for line in init_lines:
    # One flag per run: remember the columns of its first and last point.
    # (Indexing has to happen on the run; a single point is an (r, c) pair
    # whose components cannot be subscripted.)
    flag = {}
    flag["start"] = (line[0][1], line[-1][1])
    flag["mode"] = 0
    line_flags.append(flag)
flag_list.append(line_flags)

pre = r_range[0]
# Walk over every row of the band; the per-row processing is not written yet.
for cur in range(r_range[0], r_range[-1] + 1):
    cur_line = horizon_lines[cur]
    

# Draft: build connected areas from the pixel runs of each band of adjacent rows.
areas = []
for kset in hareas:
    # A band consisting of a single row contributes that row's runs directly.
    # NOTE(review): there is no ``continue`` here, so the code below also runs
    # for single-row bands and appends further entries - confirm intent.
    if len(kset) == 1:
        areas.append(horizon_lines[kset[0]])
        
    ind = 0
    # Merge horizontal runs
    walked = set()  # points that have already been assigned

    ind = 0
    max_ind = len(kset)
    pre = kset[0]
    pre_lines = horizon_lines[pre]
    # Every run on the first row of the band starts its own area, identified
    # by the text form of the run's first point.
    for l in pre_lines:
        area = set()
        area.add(str(l[0]))
        areas.append(area)
        for p in l:
            walked.add(p)
    
    for cur in kset[1:]:
        # NOTE(review): ``pre`` is never advanced inside this loop, so
        # pre_lines always refers to the first row of the band - confirm.
        pre_lines = horizon_lines[pre]
        cur_lines = horizon_lines[cur]
        for pre_l in pre_lines:
            
            # NOTE(review): ``area`` is whatever set was created last in the
            # loop above, not necessarily the area that pre_l belongs to.
            if pre_l[0] not in walked:
                    area.add(str(pre_l[0]))
                    for p in pre_l:
                        walked.add(p)
                            
            # Mark the points of every current-row run that isConnect reports
            # as connected to pre_l; the run is not added to any area yet.
            for cur_l in cur_lines:
                if isConnect(cur_l, pre_l):
                    for p in cur_l:
                        walked.add(p)


                    
 
# TEST CELL

import pdfplumber
page = pdfplumber.open(pdf_path).pages[0]

crop_box  = page.within_bbox(util.xy2rc(text_area_rects[3], 1 / zoom))
crop_box.chars = []
crop_box.extract_words(
            keep_blank_chars=False, 
            x_tolerance=word_size_as_th * 1.5, 
            y_tolerance=word_size_as_th * 0.25)
# TEST CELL
import pdfplumber
page = pdfplumber.open(pdf_path).pages[0]

crop_box  = page.within_bbox(util.xy2rc(text_area_rects[3], 1 / zoom))
_words = crop_box.extract_words(
            keep_blank_chars=False, 
            x_tolerance=word_size_as_th * 1.5, 
            y_tolerance=word_size_as_th * 0.25)

_words

找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到4条线段
找到4条线段
找到3条线段
找到4条线段
找到5条线段
找到6条线段
找到6条线段
找到3条线段
找到4条线段
找到5条线段
找到6条线段
找到6条线段
找到3条线段
找到4条线段
找到5条线段
找到6条线段
找到6条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到4条线段
找到4条线段
找到4条线段
找到4条线段
找到5条线段
找到5条线段
找到6条线段
找到6条线段
找到6条线段
找到6条线段
找到7条线段
找到8条线段
找到9条线段
找到9条线段
找到10条线段
找到10条线段
找到10条线段
找到10条线段
找到11条线段
找到12条线段
找到13条线段
找到14条线段
找到15条线段
找到16条线段
找到17条线段
找到18条线段
找到18条线段
找到3条线段
找到4条线段
找到5条线段
找到5条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段

lines = util.extractHline(img9_erode)  # 调整直线的位置
lines = adjustLines(lines, img9_rects_exclude_small_rect, th=word_size_as_th * 1.2 * zoom)

util.drawRects(img9, util.xy2rc(lines), show_result_img=True)
imgs, pos = util.getSubImgsAndPos(img9_word_filled, lines)
# imgs, pos = util.getSubImgsAndPos(img9, lines)
# util.imshow(imgs)

kh, kw = list(map(int, (word_size_as_th * 0.6, _img.shape[1])))
k = np.ones((kh, kw))
_img_erode = util.erode(_img_, k)
util.imshow(_img_erode)
lines = util.extractHline(_img_erode)
util.drawRects(_img_, util.xy2rc(lines), show_result_img=True)
if len(lines) > 2:
    pass

# Sub-images of img9 cut along ``lines``; consumed in lock-step with ``imgs`` below.
img_children, _ = util.getSubImgsAndPos(img9, lines)
img_children = iter(img_children)

# Iterator over the separator lines, advanced once per strip.
# NOTE(review): ``lines`` is appended to inside the loop while this iterator
# is alive; this relies on ``imgs`` not being longer than the original list.
lines_iter = iter(lines)
# Column splitting
offset = 0
for img in imgs:
    # Index 1 of the current separator line is used as the offset added to
    # the new lines found in this strip.
    offset = next(lines_iter)[1]
    h, w = img.shape[:2]
    # Keep only the white bars wider than the word-size threshold.
    whiteBars = util.getWhiteBarWidthList(img, isHline=False, rate=0.8)
    whiteBars = [bar for bar in whiteBars if bar > word_size_as_th]

    image = next(img_children)
    if len(whiteBars) > 2:
        # The strip needs to be split further
        # Kernel: 3/5 of the strip height tall, 1.2 * word size * zoom wide.
        kh = h * 3 // 5
        kw = int(word_size_as_th * 1.2 * zoom)
        k = np.ones((kh, kw))

        img_erode = util.erode(img, k, anchor=[kh - 1, -1])
        vlines = util.extractVline(img_erode, rate=0.75, th=word_size_as_th)
        # vlines = adjustLines(vlines, img9_rects_exclude_small_rect, th=word_size_as_th, is_hline=False)
        util.drawRects(image, util.xy2rc(vlines), show_result_img=True)
        _imgs, _pos = util.getSubImgsAndPos(image, vlines)
        for line in vlines:
            # Shift indices 1 and 3 of the line by the strip offset (in
            # place) and add the line to the global list.
            line[1] = line[1] + offset
            line[3] = line[3] + offset
            print("add:", line)
            lines.append(line)
        # _imgs, _pos = util.getSubImgsAndPos(img, vlines)
        util.imshow(_imgs)
    else:
        util.imshow(image)
util.canShow = True
# Draw all separator lines, including the ones added above, on the page image.
_ = util.drawRects(img0, util.xy2rc(lines), show_result_img=True)

df = DataFrame(util.getWhiteBarWidthList(img9, isHline=True, rate=0.8))
series = df.value_counts()
print(series)
print(f'{series.index[0][0]} 出现次数最多， 为 {series.array[0]} 次。')
util.imshow(img9)
k = np.ones((series.index[0][0] * 2, img9.shape[1]))
img9_erode = util.erode(img9, k)
util.imshow(img9_erode)

hlines = util.extractHline(img9_erode)

util.drawRects(img9, util.xy2rc(hlines), show_result_img=True)

img9_imgs, img9_poses = util.getSubImgsAndPos(img9, hlines)
util.imshow(img9_imgs)

img9_body = img9_imgs[1]
util.imshow(img9_body)
# rects = util.simpleLocate(np.bitwise_not(img9_body), show_result_img=True)

h, w = img9_body.shape[:2]
img9_area = h * w
img9_area

import importlib
import util
importlib.reload(util)
util.debug =False

rects = [d for d in page.rects if d['y1'] < 500]

import pdfplumber
import pandas as pd

from pdfplumber.page import Page

# Extract tables from the first page using only explicitly supplied lines,
# taken from the edges of the rectangles in ``rects``, and dump them to log.txt.
with pdfplumber.open(r"C:\Users\Administrator\Desktop\xiaokuang.pdf") as pdf:
    page = pdf.pages[0]  # first page of the document

    print(type(page.rects[0]))
    print("----------------------------------------------")

    # NOTE(review): the horizontal lines use the 'y0'/'y1' keys of the
    # rectangles; confirm these are in the coordinate system that
    # extract_tables expects for explicit lines.
    tables = Page.extract_tables(page, {
        'vertical_strategy': 'explicit',
        'horizontal_strategy': 'explicit',
        'explicit_horizontal_lines':
            [d['y0'] for d in rects] + [d['y1'] for d in rects],
        'explicit_vertical_lines':
            [d['x0'] for d in rects] + [d['x1'] for d in rects],
    })
    # Write through a context manager so the log file is flushed and closed.
    with open("log.txt", 'w', encoding="utf-8") as log_file:
        print(util.dumps(tables), file=log_file)

w_s_stats

importlib.reload(util)

# 文本框
# rects.extend(getWsRects(ws))

# pdf_path = r'C:\Users\Administrator\Desktop\xiaokuang.pdf'
pdf_path = r'C:\Users\Administrator\Desktop\fl.pdf'
pg = 0
util.pdf2img(pdf_path, pg, f'{pg}.png', zoom_x=1, zoom_y=1)
ws, w_size , w_s_stats, page = util.getWords(pdf_path, pg)
img0 = None
exec(f"""
import cv2
img{pg} = cv2.imread('{pg}.png')
""")

h, w = img0.shape[:2]
util.dump(ws, 'ws.json')

util.imshow(img0)
img0_rects = util.getBinary(
    util.drawRects(np.empty(img0.shape).astype(np.uint8), getRectsInPage(page), show_result_img=False, line_color=[255,0,0]),
    0,
    255,
    0
    )
util.imshow(img0_rects)
# 得到最大的矩形区域
img0_rects_exclude_small_rect = util.getMaxRects(img0_rects, show_result_img=False)
# 区域划分示意图

_ = util.drawRects(img0, img0_rects_exclude_small_rect,line_color=[0,0,255], show_result_img=True)

img0_binary = util.getBinary(img0, 200, 255, 0)
util.imshow(img0_binary)
rs = img0_rects_exclude_small_rect.copy()

img0_binary_filled = util.drawRects(img0_binary, rs, line_color=[255,255,255], fill_color=[255,255,255], show_result_img=True)

# 绘制文本行区域
img0_binary_filled = util.drawRects(img0_binary_filled, getWsRects(ws),line_color=[0,0,0], fill_color=[255,255,255], show_result_img=True)

import pandas as pd
from pandas import DataFrame

# Frequency table of the white-bar widths measured on the filled binary image.
df = DataFrame(util.getWhiteBarWidthList(img0_binary_filled, rate=0.8))
df.describe()
series = df.value_counts()
print(f'{series.index[0][0]} 出现次数最多， 为 {series.array[0]} 次。')

# ndarray.shape is (rows, columns), i.e. (height, width).
h, w = img0.shape[:2]
# Kernel: as tall as the most frequent white-bar width, 3/4 of the image width wide.
k = np.ones((series.index[0][0] // 1, w * 3 // 4))
img0_erode = util.erode(img0_binary_filled, k)
util.imshow(img0_erode)

lines = util.extractHline(img0_erode)
# 调整直线的位置
for i in range(len(lines)):
    x, y, x1, y1 = lines[i]
    for r in img0_rects_exclude_small_rect:
        r, c, r1, c1 = r
        if r < y < r1:
            print(lines[i], r)
            lines[i] = [x, max(0, r - 1), x1, r]
        
util.imwrite(util.drawRects(img0, util.xy2rc(lines), show_result_img=True))
imgs, pos = util.getSubImgsAndPos(img0_binary_filled, lines)
util.imshow(imgs)

# For every horizontal strip, look for vertical white gaps and split on them.
for img in imgs:

    df = DataFrame(util.getWhiteBarWidthList(img, isHline=False, rate=0.8))
    series = df.value_counts()
    if series.size > 2:
        # The strip needs to be split further
        print(series, f'[len={len(series)}]')
        print(f'{series.index[0][0]} 出现次数最多， 为 {series.array[0]} 次。')

        # ndarray.shape is (rows, columns), i.e. (height, width).
        h, w = img.shape[:2]
        # Kernel: 3/5 of the strip height tall, half of the representative
        # white-bar width wide; the anchor is its last row / last column.
        kh = h * 3 // 5
        kw = getSeriesValue(series) // 2
        k = np.ones((kh, kw))
        img_erode = util.erode(img, k, anchor=[kh - 1, kw - 1])
        vlines = util.extractVline(img_erode, rate=0.75)
        util.drawRects(img, util.xy2rc(vlines), show_result_img=True)
        _imgs, _pos = util.getSubImgsAndPos(img, vlines)
        print(len(_imgs), vlines)
        util.imshow(_imgs)

import numpy as np
img0_binary = util.getBinary(img0, 200, 255, 0)
img_draw_rects = util.drawRects(img0, rects)
util.imshow(img_draw_rects)

rs = list(map(lambda rect: util.xy2rc(
    list(map(int, (rect["x0"], rect["top"], rect["x1"], rect["bottom"])))
    ), 
              ws))

util.drawRects(img9, rects=rs, line_color=[125,0,0], show_result_img=True)

m9 = cv2.imread('9.png')
m9_imgs, _ = util.getSubImgsAndPos(m9, hlines)

for r in rects:
    if util._calc_area(r) > img9_area * 0.125:
        print(util._calc_area(r))
        util.drawRects(m9_imgs[1], [r], line_color=[255, 0, 0], show_result_img=True)

import util

pdf_path = r'D:\DongDongsFiles\file\2022-02\H2_AN202011171430084484_1.pdf'
pg = 11
util.pdf2img(pdf_path, pg, f'{pg}.png', zoom_x=1, zoom_y=1)

import sys
sys.getrecursionlimit()

import cv2
import numpy as np
import util
import sys
sys.setrecursionlimit(100000)

binary_img = lambda path: cv2.cvtColor(cv2.imread(path), cv2.COLOR_RGB2GRAY) 

tm_binary = np.where(binary_img('11.png') > 200, 255, 0).astype(np.uint8)
tm = cv2.imread('11.png')

from pandas import DataFrame

df = DataFrame(util.getWhiteBarWidthList(tm_binary, isHline=True, rate=0.8))
series = df.value_counts()
print(series)
print(f'{series.index[0][0]} 出现次数最多， 为 {series.array[0]} 次。')
util.imshow(tm_binary)
# TODO, how to get 16
k = np.ones((16, tm_binary.shape[1]))
tm_erode = util.erode(tm_binary, k)
util.imshow(tm_erode)

k = util.CV2_K_CROSS(tm_binary.shape[0] // 8)
util.imshow(k * 255)

t = util.erode(np.bitwise_not(tm_binary), k)
util.imshow([np.bitwise_not(tm_binary), t])

hlines = util.extractHline(tm_erode)
rects = util.drawRects(tm, util.xy2rc(hlines), show_result_img=True)

tm_imgs, _ = util.getSubImgsAndPos(tm, hlines)
util.imshow(tm_imgs[1])

_ = util.drawRects(tm, rects, show_result_img=True)

EXT： RELOAD¶

函数定义¶

1. PDF Page To Image¶

1.1 To Image¶

1.2 二值图¶

2. 全文字¶

1. 将图片，表格等区域去除¶

2.1 水平切几刀¶

2.1.1 计算白线分隔值¶

2.1.2 水平分割¶

2.1.3 处理分栏¶

3. 带有表格¶

TODO¶

1.1 确定表格和图片类区域¶

TODO 判断无线框表格区域¶

制作MASK¶

1.2 确定文字区域¶

1.2.1 根据字号先划分成多块¶

1.2.2 再确定文本区域¶

1.3 绘制区域示意图¶

1.4 页面表格化¶

1.5 页面区域再划分¶

2.1 图片， rect内部再划分¶

2.2 再次绘制图像¶

3.1 输出区域信息¶

2.3 TODO 大的文本块，再判断是不是表格¶

META DATA 分析¶

3.1 划分表格，图片区域¶

3.1.1 处理分栏的情况¶

EXT 定位有线表格区域¶

EXT： RELOAD¶

函数定义¶

1. PDF Page To Image¶

1.1 To Image¶

1.2 二值图¶

2. 全文字¶

1. 将图片， 表格等区域去除¶

2.1 水平切几刀¶

2.1.1 计算白线分隔值¶

2.1.2 水平分割¶

2.1.3 处理分栏¶

3. 带有表格¶

TODO¶

1.1 确定表格和图片类区域¶

TODO 判断无线框表格区域¶

制作MASK¶

1.2 确定文字区域¶

1.2.1 根据字号先划分成多块¶

1.2.2 再确定文本区域¶

1.3 绘制区域示意图¶

1.4 页面表格化¶

1.5 页面区域再划分¶

2.1 图片， rect内部再划分¶

2.2 再次绘制图像¶

3.1 输出区域信息¶

2.3 TODO 大的文本块， 再判断是不是表格¶

META DATA 分析¶

3.1 划分表格， 图片区域¶

3.1.1 处理分栏的情况¶

EXT 定位有线表格区域¶

1. 将图片，表格等区域去除¶

2.3 TODO 大的文本块，再判断是不是表格¶

3.1 划分表格，图片区域¶