EXT: RELOAD¶

In [ ]:
from importlib import reload
reload(util)

函数定义¶

In [ ]:
import util
from pandas import DataFrame
import numpy as np
import cv2


def getRcRects(rects, h, zoom=1.0, exclude_page_rect=True):
    print("getRcRects, zoom=", zoom)
    _rects =  util.map_list(lambda r: util.xy2rc(
        util.map_list(int, (r['x0'] * zoom, (h - r['y1']) * zoom, r['x1'] * zoom, (h - r['y0']) * zoom))),
        rects
    )
    
    res = []
    for rect in _rects:
        if exclude_page_rect:
            r, c, r1, c1 = rect
            if r == c == 0:
                continue
        res.append(rect)
    return res


def getWsRects(words, zoom=1.0):  
    return util.map_list(lambda r: util.xy2rc(
        util.map_list(int, (r['x0'] * zoom, r['top'] * zoom, r['x1'] * zoom, r['bottom'] * zoom))),
        words
    )


def getRectsInPage(page, exclude_page_rect=True, zoom=1, draw_image=False):
    """ 得到图片上可能存在的矩形框(包括直线, 图片...)

    Args:
        page (_type_): pdfplumber.Page

    Returns:
        _type_: _description_
    """
    rects = []
    items = [page.lines, page.curves, page.rects]
    if draw_image:
        items.append(page.images)
    for it in items:
        _rects = getRcRects(it, page.height, zoom)
        for rect in _rects:
            if exclude_page_rect:
                r, c, r1, c1 = rect
                if r == c == 0 and r1 - page.height * zoom < 1 and c1 - page.width * zoom < 1:
                    continue
            rects.append(rect)
    return rects


def getSeriesValue(series):
    v = series.values[0]
    kw = series.index[0][0]
    for index, num in series.items():
        if num == v:
            kw = min(kw, index[0])
    return kw


def adjustLines(lines, rects_exclude_small_rect, th = 0, is_hline=True):
    if is_hline:
        rects_exclude_small_rect.sort(key=lambda d: (d[0], -d[2]))
    else:
        rects_exclude_small_rect.sort(key=lambda d: (d[1], -d[3])) 
    lines_new = []
    for i in range(len(lines)):
        if is_hline:
            x, y, x1, y1 = lines[i]
            for r in rects_exclude_small_rect:
                r, c, r1, c1 = r
                if (r + th) < y < (r1 + th):
                    lines[i] = [x, max(0, r - 1), x1, r]
                    lines_new.append([x, max(0, r - 1), x1, r])
                    lines_new.append([x, r1-1, x1, r1])
                    break
            lines_new.append(lines[i])
        else:
            x, y, x1, y1 = lines[i]
            for r in rects_exclude_small_rect:
                r, c, r1, c1 = r
                if (c + th) < x < (c1 + th):
                    lines[i] = [ max(0, x - 1), r, x, r1]
                    lines_new.append([max(0, x - 1), r, x, r1])
                    lines_new.append([c1-1, r, c1, r1])
                    break
            lines_new.append(lines[i])
    
    # line_set = set(map(lambda it:str(it), lines_new))
    # for r in rects_exclude_small_rect:
    #     r, c, r1, c1 = r
    #     top_line = [c, max(0, r - 1), c1, r]
    #     bottom_line = [c, r1-1, c1, r1]
    #     if str(top_line) not in line_set:
    #         lines_new.append(top_line)
    #         line_set.add(str(top_line))
    #     if str(bottom_line) not in line_set:
    #         lines_new.append(bottom_line)
    #         line_set.add(str(bottom_line))
    lines_new.sort(key=lambda it: it[1])
    return lines_new

1. PDF Page To Image¶

1.1 To Image¶

In [ ]:
import util

pdf_path = r'D:\DongDongsFiles\file\2022-02\H2_AN202011171430084484_1.pdf'
pg = 5

util.pdf2img(pdf_path, pg, f'{pg}.png', zoom_x=2, zoom_y=2)
In [ ]:
page.extract_tables({"vertical_strategy": "lines_strict"})
In [ ]:
import util

pdf_path = r'C:\Users\Administrator\Desktop\xiaokuang.pdf'
pdf_path = r'C:\Users\Administrator\Desktop\tu_pian.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\ban_fl.pdf'
pdf_path = r'C:\Users\Administrator\Desktop\fl2.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\fl.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\fl0.pdf'
pg = 0
import pdfplumber as pdfplumber
from pandas import DataFrame
pdf = pdfplumber.open(pdf_path)
# 第一页的信息
page = pdf.pages[pg]
# # keep_blank_chars 不以空格为分界符
# # x_tolerance 同一行左右两个文本之间的最大容忍距离
# # 这里横向距离设置的比较大,会把一些表格里比较近的文本合在一起
# # TODO 优化!
# words = page.extract_words(
#     keep_blank_chars=False, x_tolerance=4.50 * 1.5, y_tolerance=4.5)

# words.sort(key=lambda x: x["top"])
# w_size = list(map(lambda w: w["adv"], words))
# df = DataFrame(w_size)


util.pdf2img(pdf_path, pg, f'{pg}.png', zoom_x=1, zoom_y=1)
util.imshow("0.png")
In [ ]:
import pandas as pd
from pandas import DataFrame

for it in page.objects["image"]:
    rect = [it[attr] for attr in ["x0", "y0", "x1", "y1"]]
    page_crop = page.crop(rect)
    print(f"rect={rect}", DataFrame(util.map_list(
        lambda it: it["size"],
        page_crop.chars)).value_counts())

word_size = getSeriesValue(DataFrame(
    util.map_list(
        lambda it: it["size"],
        page.chars)).value_counts()
)
In [ ]:
# image
import numpy as np
h, w = tuple(map(int, (page.height, page.width)))
img = np.zeros((h, w)).astype(np.uint8)
for image in page.objects["image"]:
    util.drawRects(img, [util.map_list(int, [h - image["y1"], 
                         image["x0"], 
                         h - image["y0"],
                         image["x1"]
                               ])],
                   copy_for_show_result=False, 
                   show_result_img=True
                         )
In [ ]:
# line
for image in page.objects["line"]:
    util.drawRects(img, [util.map_list(int, [h - image["y1"], 
                         image["x0"], 
                         h - image["y0"],
                         image["x1"]
                               ])],
                   copy_for_show_result=False, 
                   show_result_img=True
                         )
In [ ]:
h, w
In [ ]:
# rect
for image in page.objects["rect"]:
    rect = [util.map_list(int, [h - image["y1"],
                                image["x0"],
                                h - image["y0"],
                                image["x1"]
                                ])]
    if util._calc_area(rect[0]) >= (h-1)//1 * w//1:
        print([h - image["y1"],
               image["x0"],
               h - image["y0"],
               image["x1"]
               ])
        continue

    util.drawRects(img, rect,
                   copy_for_show_result=False,
                   show_result_img=True
                   )
In [ ]:
rects = util.getMaxRects(img, show_result_img=True)
h, w = img.shape[:2]
hlines = []
dilateMode = True
for rect in rects:
    r, c, r1, c1 = rect
    if dilateMode:
        c, c1 = 0, w
    
    t = 1 if r-2 <= 0 else (r-2)
    top = [c, t - 1, c1, t]
    
    t = h-2 if r1 + 2 >= h else r1 + 1
    bot = [ c, t,  c1, t + 1]
    
    hlines.append(top)
    hlines.append(bot)
# 将开头和结尾作为一条分割线
x1, y1, x2, y2 = hlines[0]
if y2 > 2:
    hlines.insert(0, [x1, 0, x2, 1])

x1, y1, x2, y2 = hlines[-1]
if y2 < (h - 2):
    hlines.append([x1, h - 2, x2, h - 1])
hlines
In [ ]:
page.edges
In [ ]:
util.drawRects(img, util.xy2rc(hlines), show_result_img=True)

img0 = cv2.imread('0.png')
imgs, pos =util.getSubImgsAndPos(img0, hlines, filterFun=lambda img: img.shape[0] > word_size)

util.imshow(imgs)
In [ ]:
import util
for attr in "objects/.chars/.lines/.rects/.curves/.figures/.images".split("/."):
    if hasattr(page, attr):
        util.dump(str(getattr(page, attr)), f"{attr}.json")
    
In [ ]:
import util

pdf_path = r'D:\DongDongsFiles\file\2022-02\H2_AN202011171430084484_1.pdf'
pg = 8
util.pdf2img(pdf_path, pg, f'{pg}.png', zoom_x=1, zoom_y=1)
pg = 9
util.pdf2img(pdf_path, pg, f'{pg}.png', zoom_x=1, zoom_y=1)

1.2 二值图¶

In [ ]:
import cv2
import numpy as np
import util

img8 = util.getBinary('8.png', 200, 255, 0).astype(np.uint8)
img9 = util.getBinary('9.png', 200, 255, 0).astype(np.uint8)
util.imshow([img8, img9])

2. 全文字¶

1. 将图片, 表格等区域去除¶

In [ ]:
util.imshow(img8)
words, w_size, w_s_stats, page = util.getWords(pdf_path, 8)
img8_rected = util.getBinary(
    util.drawRects(np.empty(img8.shape).astype(np.uint8), getRectsInPage(page), show_result_img=False, line_color=[255,0,0]),
    0,
    255,
    0
    )
util.imshow(img8_rected)
# 得到最大的矩形区域
img8_rects_exclude_small_rect = util.getMaxRects(img8_rected, show_result_img=False)

# 区域划分示意图
_ = util.drawRects(img8, img8_rects_exclude_small_rect,line_color=[0,0,255], show_result_img=True)
img8_binary = img8
rs = img8_rects_exclude_small_rect.copy()
img8_binary_filled = util.drawRects(img8_binary, rs, line_color=[255,255,255], fill_color=[255,255,255], show_result_img=True)

2.1 水平切几刀¶

2.1.1 计算白线分隔值¶

In [ ]:
w8, h8 = img8_binary.shape[:2]
# 以字符 size 决定 kernel 高度
k = cv2.getStructuringElement(cv2.MORPH_RECT, (w8 * 3 // 4, int(getSeriesValue(w_s_stats))))
img8_erode = util.erode(img8_binary, k)
util.imshow(img8_erode)

2.1.2 水平分割¶

In [ ]:
lines = util.extractHline(img8_erode)# 调整直线的位置
adjustLines(lines, img8_rects_exclude_small_rect)
util.imwrite(util.drawRects(img8, util.xy2rc(lines), show_result_img=True))
imgs, pos = util.getSubImgsAndPos(img8, lines)
util.imshow(imgs)

2.1.3 处理分栏¶

In [ ]:
# 绘制文本框
img8_word_filled = util.drawRects(img8_binary_filled, getWsRects(words), fill_color=[255,255,255], line_color=0, show_result_img=True)
In [ ]:
lines = util.extractHline(img8_erode)# 调整直线的位置
adjustLines(lines, img8_rects_exclude_small_rect)
util.drawRects(img8, util.xy2rc(lines), show_result_img=True)
imgs, pos = util.getSubImgsAndPos(img8_word_filled, lines)
util.imshow(imgs)
In [1]:
img8_children, _ = util.getSubImgsAndPos(img8, lines)
img8_children = iter(img8_children)
# 分栏
for img in imgs:
    
    w, h = img.shape[:2]
    df = DataFrame(util.getWhiteBarWidthList(img, isHline=False, rate=0.8))
    series = df.value_counts()
    
    image = next(img8_children)
    if series.size > 2:
        # 需要继续切分
        print(series, f'[len={len(series)}]')
        print(f'{series.index[0][0]} 出现次数最多, 为 {series.array[0]} 次。')

        kh = h * 3 // 5
        kw = int(getSeriesValue(w_s_stats) * 1.2)
        k = np.ones((kh, kw))
        
        img_erode = util.erode(img, k, anchor=[kh - 1, -1])
        vlines = util.extractVline(img_erode, rate=0.75)
        util.drawRects(image, util.xy2rc(vlines), show_result_img=True)
        _imgs, _pos = util.getSubImgsAndPos(image, vlines)
        # _imgs, _pos = util.getSubImgsAndPos(img, vlines)
        util.imshow(_imgs)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-a295eaed4814> in <module>
----> 1 img8_children, _ = util.getSubImgsAndPos(img8, lines)
      2 img8_children = iter(img8_children)
      3 # 分栏
      4 for img in imgs:
      5 

NameError: name 'util' is not defined

3. 带有表格¶

TODO¶

In [87]:
import util
from pandas import DataFrame
import numpy as np
import cv2


def de_duplicate_rect(page_areas: list, iou=0.95):
    """# 根据交并比去除重复的区域


    Args:
        page_areas (list): 区域
        iou (float, optional): 交并比大于iou会被去除. Defaults to 0.95.

    Returns:
        list: 去重后的rects
    """    
    sorted_page_areas = sorted(page_areas, key=lambda rect: (rect[0], rect[2], rect[1], rect[3]))
    de_duplicate = []
    for r in sorted_page_areas:
        if len(de_duplicate) < 1:
            de_duplicate.append(r)
        else:
            last = de_duplicate.pop()
            if util.calc_iou(last, r) > iou:
                de_duplicate.append(last)
            else:
                de_duplicate.append(last)
                de_duplicate.append(r)
            
    return de_duplicate 


def getRcRects(rects, h, zoom=1.0, save_float=False, exclude_page_rect=True):
    print("getRcRects, zoom=", zoom)
    _rects =  util.map_list(lambda r: util.xy2rc(
        util.map_list(float if save_float else int, (r['x0'] * zoom, (h - r['y1']) * zoom, r['x1'] * zoom, (h - r['y0']) * zoom))),
        rects
    )
    
    res = []
    for rect in _rects:
        if exclude_page_rect:
            r, c, r1, c1 = rect
            if r == c == 0:
                continue
        res.append(rect)
    return res


def getWsRects(words, zoom=1.0, save_float=False, **kw):
    
    offset = [1, 1] if not hasattr(kw, "offset") else kw["offset"][:2]

    _rects = util.map_list(lambda r: util.xy2rc(
        util.map_list(float if save_float else int, (r['x0'] * zoom, r['top'] * zoom, r['x1'] * zoom, r['bottom'] * zoom))),
        words
    )

    def apply_offset(_rect):
        r_, c_ = offset
        r_, c_ = r_ - 1, c_ - 1
        r, c, r1, c1 = _rect
        return [r + r_, c + c_, r1 + r_, c1 + c_]

    return util.map_list(apply_offset, _rects)


def getRectsInPage(page, exclude_page_rect=True, zoom=1, draw_image=False, is_line_th=4):
    """ 得到图片上可能存在的矩形框(包括直线, 图片...)

    Args:
        page (_type_): pdfplumber.Page

    Returns:
        _type_: _description_
    """
    rects = []
    items = [page.lines, page.curves, page.rects]
    if draw_image:
        items.append(page.images)
    for it in items:
        _rects = getRcRects(it, page.height, zoom)
        for rect in _rects:
            if exclude_page_rect:
                r, c, r1, c1 = rect
                if r == c == 0 and r1 - page.height * zoom < 1 and c1 - page.width * zoom < 1:
                    continue
            rects.append(rect)

    # 有些线段是图片。。。
    _rects = getRcRects(page.images, page.height, zoom)
    for rect in _rects:
        if exclude_page_rect:
            r, c, r1, c1 = rect
            if r == c == 0 and r1 - page.height * zoom < 1 and c1 - page.width * zoom < 1:
                continue
        img_h, img_w = util.calc_h_w(rect)
        if img_h < is_line_th or img_w < is_line_th:
            util.logger.info(f"有一个图片可能是线段:{rect}.")
            rects.append(rect)
            
    return rects


def getSeriesValue(series):
    v = series.values[0]
    kw = series.index[0][0]
    for index, num in series.items():
        if num == v:
            kw = min(kw, index[0])
    return kw


def adjustLines(lines, rects_exclude_small_rect, th = 0, is_hline=True):
    if is_hline:
        rects_exclude_small_rect.sort(key=lambda d: (d[0], -d[2]))
    else:
        rects_exclude_small_rect.sort(key=lambda d: (d[1], -d[3])) 
    lines_new = []
    for i in range(len(lines)):
        if is_hline:
            x, y, x1, y1 = lines[i]
            for r in rects_exclude_small_rect:
                r, c, r1, c1 = r
                if (r + th) < y < (r1 + th):
                    lines[i] = [x, max(0, r - 1), x1, r]
                    lines_new.append([x, max(0, r - 1), x1, r])
                    lines_new.append([x, r1-1, x1, r1])
                    break
            lines_new.append(lines[i])
        else:
            x, y, x1, y1 = lines[i]
            for r in rects_exclude_small_rect:
                r, c, r1, c1 = r
                if (c + th) < x < (c1 + th):
                    lines[i] = [ max(0, x - 1), r, x, r1]
                    lines_new.append([max(0, x - 1), r, x, r1])
                    lines_new.append([c1-1, r, c1, r1])
                    break
            lines_new.append(lines[i])
    lines_new.sort(key=lambda it: it[1])
    return lines_new

1.1 确定表格和图片类区域¶

In [88]:
from pandas import DataFrame
# from funs import *
import numpy as np
from importlib import reload

PAGE_OUT = "page_rect.json"
pdf_path = r'C:\Users\Administrator\Desktop\xiaokuang.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\tu_pian.pdf'
pdf_path = r'C:\Users\Administrator\Desktop\ban_fl.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\fl2.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\fl0.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\fl.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\fl_.pdf'

# pdf_path = r'C:\Users\Administrator\Desktop\stream0.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\stream1.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\stream3.pdf'

# pdf_path = r'C:\Users\Administrator\Desktop\nb0.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\nb2.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\nb1.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\nb5.pdf'
pdf_path = r'C:\Users\Administrator\Desktop\nb4.pdf'
pdf_path = r'C:\Users\Administrator\Desktop\lc.pdf'
pdf_path = r'C:\Users\Administrator\Desktop\lc1.pdf'

pg = 0
line_th = 4
zoom = 1
util.pdf2img(pdf_path, pg, f'{pg}.png', zoom_x=zoom, zoom_y=zoom)
exec(f"""
import cv2
img{pg} = cv2.imread('{pg}.png')
""")
img0.shape
import sys
# sys.setrecursionlimit(80000)

class Color(object):
    IMAGE = [255, 0, 0]
    RECT = [0, 0, 255]
    TEXT_AREA = [50, 120, 255]
    LINE = [255, 0, 255]
    TEXT_LINE = [211, 211, 0]
words, w_size, w_s_stats, page = util.getWords(pdf_path, pg)
image_area_rects = getRcRects(page.images, page.height, zoom)        
line_area_rects = getRcRects(page.lines, page.height, zoom)
rect_area_rects = getRcRects(page.rects, page.height, zoom)
util.drawRects(img0, rect_area_rects, show_result_img=True)

word_size_as_th = getSeriesValue(w_s_stats)
expand = max(0, int(word_size_as_th * zoom / 2))

# 有些页眉和页脚是矩形。。。
rect_lines = []
for _r in rect_area_rects:
    img_h, img_w = util.calc_h_w(_r)
    if img_h < max(line_th, word_size_as_th) \
        and img_w > 0.7 * page.width \
            and (_r[0] < page.height * 0.2 or _r[0] > page.width * 0.8):
        util.logger.info(f"有一个矩形可能是线段:{_r}")
        rect_lines.append(_r)
        line_area_rects.append(_r)

def getMaxAreaRect(img, area_rects, 
                   directs: list=[(0, 1), (1, 0), (-1, 0), (0, -1)],
                   show_result_img=True):
    img_rected = util.getBinary(
        util.drawRects(np.empty(img.shape).astype(np.uint8), area_rects,
                       show_result_img=show_result_img, line_color=[255, 0, 0]),
        0,
        255,
        0
    )
    # 得到最大的矩形区域
    return util.getMaxRects(img_rected,directs=directs,show_result_img=show_result_img)
image_area_rects = getMaxAreaRect(img0, image_area_rects)
line_area_rects = getMaxAreaRect(img0, line_area_rects)
rect_area_rects = getMaxAreaRect(img0, rect_area_rects)
2022-03-09 09:46:12 [INFO] [pdfdocument-MainThread]  944 - xref found: pos=b'554914'
2022-03-09 09:46:12 [INFO] [pdfdocument-MainThread]  962 - read_xref_from: start=554914, token=/b'xref'
2022-03-09 09:46:12 [INFO] [pdfdocument-MainThread]  134 - xref objects: {1: (None, 17, 0), 2: (None, 66, 0), 3: (None, 122, 0), 4: (None, 209, 0), 5: (None, 605, 0), 6: (None, 2432, 0), 7: (None, 2569, 0), 8: (None, 2597, 0), 9: (None, 2763, 0), 10: (None, 2834, 0), 11: (None, 3089, 0), 12: (None, 363150, 0), 13: (None, 363315, 0), 14: (None, 372526, 0), 15: (None, 372703, 0), 16: (None, 372941, 0), 17: (None, 373567, 0), 18: (None, 373738, 0), 19: (None, 373978, 0), 20: (None, 549919, 0), 21: (None, 549945, 0), 22: (None, 550127, 0), 23: (None, 550370, 0), 24: (None, 550790, 0), 25: (None, 550967, 0), 26: (None, 551222, 0), 27: (None, 551434, 0), 28: (None, 551699, 0), 29: (None, 551930, 0), 30: (None, 552195, 0), 31: (None, 552426, 0), 32: (None, 552691, 0), 33: (None, 552922, 0), 34: (None, 553187, 0), 35: (None, 553418, 0), 36: (None, 553682, 0), 37: (None, 553913, 0), 38: (None, 554184, 0), 39: (None, 554418, 0), 40: (None, 554683, 0)}
2022-03-09 09:46:12 [INFO] [pdfdocument-MainThread]  976 - trailer: {'Root': <PDFObjRef:1>, 'Info': <PDFObjRef:3>, 'Size': 41, 'ID': [b'\xf3W\xa2Lo\xc4\x8cN\x00&\xdd\x07\xf7\xe3\x143', b'\xf3W\xa2Lo\xc4\x8cN\x00&\xdd\x07\xf7\xe3\x143']}
2022-03-09 09:46:12 [INFO] [pdfdocument-MainThread]  182 - trailer: {'Root': <PDFObjRef:1>, 'Info': <PDFObjRef:3>, 'Size': 41, 'ID': [b'\xf3W\xa2Lo\xc4\x8cN\x00&\xdd\x07\xf7\xe3\x143', b'\xf3W\xa2Lo\xc4\x8cN\x00&\xdd\x07\xf7\xe3\x143']}
2022-03-09 09:46:12 [INFO] [pdfpage-MainThread]  109 - Pages: Kids=[<PDFObjRef:4>]
2022-03-09 09:46:12 [INFO] [pdfpage-MainThread]  113 - Page: {'Contents': <PDFObjRef:5>, 'Group': {'CS': /'DeviceRGB', 'S': /'Transparency', 'Type': /'Group'}, 'MediaBox': [0, 0, 595.32, 841.92], 'Parent': <PDFObjRef:2>, 'Resources': {'Font': {'F1': <PDFObjRef:6>, 'F2': <PDFObjRef:14>, 'F5': <PDFObjRef:17>, 'F6': <PDFObjRef:21>, 'F7': <PDFObjRef:24>}, 'ProcSet': [/'PDF', /'Text', /'ImageB', /'ImageC', /'ImageI'], 'XObject': {'Image692': <PDFObjRef:27>, 'Image696': <PDFObjRef:29>, 'Image698': <PDFObjRef:31>, 'Image704': <PDFObjRef:33>, 'Image706': <PDFObjRef:35>, 'Image709': <PDFObjRef:37>, 'Image713': <PDFObjRef:39>}}, 'Tabs': /'S', 'Type': /'Page'}
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  994 - Processing page: <PDFPage: Resources={'Font': {'F1': <PDFObjRef:6>, 'F2': <PDFObjRef:14>, 'F5': <PDFObjRef:17>, 'F6': <PDFObjRef:21>, 'F7': <PDFObjRef:24>}, 'ProcSet': [/'PDF', /'Text', /'ImageB', /'ImageC', /'ImageI'], 'XObject': {'Image692': <PDFObjRef:27>, 'Image696': <PDFObjRef:29>, 'Image698': <PDFObjRef:31>, 'Image704': <PDFObjRef:33>, 'Image706': <PDFObjRef:35>, 'Image709': <PDFObjRef:37>, 'Image713': <PDFObjRef:39>}}, MediaBox=[0, 0, 595.32, 841.92]>
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  1020 - render_contents: resources={'Font': {'F1': <PDFObjRef:6>, 'F2': <PDFObjRef:14>, 'F5': <PDFObjRef:17>, 'F6': <PDFObjRef:21>, 'F7': <PDFObjRef:24>}, 'ProcSet': [/'PDF', /'Text', /'ImageB', /'ImageC', /'ImageI'], 'XObject': {'Image692': <PDFObjRef:27>, 'Image696': <PDFObjRef:29>, 'Image698': <PDFObjRef:31>, 'Image704': <PDFObjRef:33>, 'Image706': <PDFObjRef:35>, 'Image709': <PDFObjRef:37>, 'Image713': <PDFObjRef:39>}}, streams=[<PDFStream(5): raw=1756, {'Filter': /'FlateDecode', 'Length': 1754}>], ctm=(1, 0, 0, 1, 0, 0)
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  187 - get_font: create: objid=6, spec={'BaseFont': /b'ABCDEE+\xcb\xce\xcc\xe5', 'DescendantFonts': <PDFObjRef:7>, 'Encoding': /'Identity-H', 'Subtype': /'Type0', 'ToUnicode': <PDFObjRef:13>, 'Type': /'Font'}
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  187 - get_font: create: objid=None, spec={'BaseFont': /b'ABCDEE+\xcb\xce\xcc\xe5', 'CIDSystemInfo': <PDFObjRef:9>, 'CIDToGIDMap': /'Identity', 'DW': 1000, 'FontDescriptor': <PDFObjRef:10>, 'Subtype': /'CIDFontType2', 'Type': /'Font', 'W': <PDFObjRef:12>, 'Encoding': /'Identity-H', 'ToUnicode': <PDFStream(13): raw=9139, {'Filter': /'FlateDecode', 'Length': 9137}>}
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  187 - get_font: create: objid=14, spec={'BaseFont': /'Times New Roman', 'Encoding': /'WinAnsiEncoding', 'FirstChar': 32, 'FontDescriptor': <PDFObjRef:15>, 'LastChar': 247, 'Name': /'F2', 'Subtype': /'TrueType', 'Type': /'Font', 'Widths': <PDFObjRef:16>}
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  187 - get_font: create: objid=17, spec={'BaseFont': /'ABCDEE+Calibri', 'Encoding': /'WinAnsiEncoding', 'FirstChar': 32, 'FontDescriptor': <PDFObjRef:18>, 'LastChar': 32, 'Name': /'F5', 'Subtype': /'TrueType', 'Type': /'Font', 'Widths': <PDFObjRef:20>}
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  187 - get_font: create: objid=21, spec={'BaseFont': /'Times New Roman,Bold', 'Encoding': /'WinAnsiEncoding', 'FirstChar': 32, 'FontDescriptor': <PDFObjRef:22>, 'LastChar': 178, 'Name': /'F6', 'Subtype': /'TrueType', 'Type': /'Font', 'Widths': <PDFObjRef:23>}
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  187 - get_font: create: objid=24, spec={'BaseFont': /b'ABCDEE+\xcb\xce\xcc\xe5', 'Encoding': /'WinAnsiEncoding', 'FirstChar': 32, 'FontDescriptor': <PDFObjRef:25>, 'LastChar': 107, 'Name': /'F7', 'Subtype': /'TrueType', 'Type': /'Font', 'Widths': <PDFObjRef:26>}
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  965 - Processing xobj: <PDFStream(37): raw=82, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 80, 'SMask': <PDFObjRef:38>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 295}>
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  965 - Processing xobj: <PDFStream(31): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:32>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}>
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  965 - Processing xobj: <PDFStream(29): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:30>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}>
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  965 - Processing xobj: <PDFStream(27): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:28>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}>
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  965 - Processing xobj: <PDFStream(27): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:28>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}>
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  965 - Processing xobj: <PDFStream(29): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:30>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}>
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  965 - Processing xobj: <PDFStream(29): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:30>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}>
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  965 - Processing xobj: <PDFStream(29): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:30>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}>
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  965 - Processing xobj: <PDFStream(29): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:30>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}>
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  965 - Processing xobj: <PDFStream(29): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:30>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}>
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  965 - Processing xobj: <PDFStream(33): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:34>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}>
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  965 - Processing xobj: <PDFStream(39): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:40>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}>
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  965 - Processing xobj: <PDFStream(35): raw=75, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 66, 'Interpolate': False, 'Length': 73, 'SMask': <PDFObjRef:36>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}>
2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread]  965 - Processing xobj: <PDFStream(39): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:40>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}>
getRcRects, zoom= 1
getRcRects, zoom= 1
getRcRects, zoom= 1
No description has been provided for this image
2022-03-09 09:46:12 [INFO] [<ipython-input-88-3d717eb512cd>-MainThread]  61 - 有一个矩形可能是线段:[ 55  88  55 506]
No description has been provided for this image
No description has been provided for this image
2022-03-09 09:46:13 [INFO] [util-MainThread]  406 - 找外围最大矩形框耗时: [0.18946218490600586]s
No description has been provided for this image
No description has been provided for this image
2022-03-09 09:46:13 [INFO] [util-MainThread]  406 - 找外围最大矩形框耗时: [0.1501309871673584]s
No description has been provided for this image
No description has been provided for this image
2022-03-09 09:46:13 [INFO] [util-MainThread]  406 - 找外围最大矩形框耗时: [0.31815361976623535]s
In [89]:
rect_lines
Out[89]:
[array([ 55,  88,  55, 506])]
In [90]:
# rect 区域调整
rect_area_rects2 = []
img = img0.copy()
for r in rect_area_rects:
    if util.calc_area(r) < 1:
        continue
    _img = util.getSubImg(img, r)
    
    _img_bi = util.getBinary(_img, 250, 0, 255)
    util.imshow(_img_bi)
    
    
    kh, kw = list(map(int, (1, word_size_as_th * 2)))
    k = np.ones((kh, kw))
    _im = cv2.morphologyEx(_img_bi, cv2.MORPH_CLOSE, k)
    util.imshow(_im, "闭运算列连接")
    kh, kw = list(map(int, (word_size_as_th * 2, 1)))
    k = np.ones((kh, kw))
    _im = cv2.morphologyEx(_im, cv2.MORPH_CLOSE, k)
    util.imshow(_im, "闭运算行连接")
    temp = util.getMaxRectsTwice(_im, offset=r)
    # 画上区域
    _ = util.drawRects(img0, temp, show_result_img=True, line_color=[255,0,255])
    rect_area_rects2.extend(temp)
        
rect_area_rects = rect_area_rects2
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
2022-03-09 09:46:16 [INFO] [util-MainThread]  406 - 找外围最大矩形框耗时: [0.5954070091247559]s
2022-03-09 09:46:16 [INFO] [util-MainThread]  406 - 找外围最大矩形框耗时: [0.6034188270568848]s
2022-03-09 09:46:16 [INFO] [util-MainThread]  508 - 有offset函数.
No description has been provided for this image
In [91]:
_ = util.drawRects(img0, rect_area_rects, show_result_img=True, line_color=[255,0,255])
No description has been provided for this image
In [92]:
# util.canShow = False
img9 = img0
img9_binary = util.getBinary(img9, 200, 255, 0)

img9_rected = util.getBinary(
    util.drawRects(np.empty(img9.shape[:2]).astype(np.uint8), getRectsInPage(
        page, True, zoom), show_result_img=True, line_color=[255, 0, 0], title="绘制meatadata中的rectangle"),
    0,
    255,
    0
)
# util.imshow([img9_binary, img9_rected])
# 得到最大的矩形区域
# img9_rects_exclude_small_rect = util.getMaxRects(img9_rected,
#                                                 #  directs=[(0, 1), (1, 0), ],
#                                                  show_result_img=True)
img9_rects_exclude_small_rect = []
img9_rects_exclude_small_rect.extend(image_area_rects)
img9_rects_exclude_small_rect.extend(line_area_rects)
img9_rects_exclude_small_rect.extend(rect_area_rects)

# 区域划分示意图
_ = util.drawRects(img9, img9_rects_exclude_small_rect,
                   line_color=[0, 0, 255], show_result_img=True)
rs = img9_rects_exclude_small_rect.copy()
img9_binary_filled = util.drawRects(img9_binary, rs, line_color=[
                                    255, 255, 255], fill_color=[255, 255, 255], show_result_img=False)
getRcRects, zoom= 1
getRcRects, zoom= 1
getRcRects, zoom= 1
getRcRects, zoom= 1
No description has been provided for this image
No description has been provided for this image

TODO 判断无线框表格区域¶

In [93]:
h, w = page.height, page.width
w_th = w * 0.6
page_rects = util.filter_list(lambda r: h * 0.1 < r[0] and r[2] < h * 0.9 , rect_area_rects)
maybe_lines = util.filter_list(lambda r: util.calc_h(r) < line_th and util.calc_w(r) > 5 * word_size_as_th, page_rects)
# TODO
if len(page_rects) > 2 and len(maybe_lines) > 0:
    page_rects.sort(key=lambda r: (r[0], -r[2], r[1], -r[3]))
    maybe_tbs = [[]]
    append = False
    for r in page_rects:
        if util.calc_h(r) < line_th:
            if len(maybe_tbs[-1]) == 0:
                append = not append
            elif r[0] - maybe_tbs[-1][-1][0] < 2 * word_size_as_th:
                append = not append
                maybe_tbs.append([])
            else:
                print(maybe_tbs, r)
        if append:
            maybe_tbs[-1].append(r)

    maybe_tb = maybe_tbs[0]
    st, ed = maybe_tb[0][0], maybe_tb[-1][2]
    words_rects = getWsRects(words, zoom)
    filter_wrects = de_duplicate_rect(util.filter_list(lambda x: st < x[0] and x[2] < ed, words_rects))
    print(f"原size={len(words_rects)}, 长度过滤之后{len(filter_wrects)}.")
    # util.drawRects(np.zeros(img9_rected.shape[:2]), filter_wrects + [maybe_tb[0] ,maybe_tb[-1]], show_result_img=True)
    k = np.ones((int((ed - st) / 2), 1))
    _img_close = cv2.morphologyEx(util.drawRects(img9_rected, filter_wrects, show_result_img=True, title="pdf 上的rect"),
                    cv2.MORPH_CLOSE,
                    k)
    util.imshow(_img_close, "_img_close")

    _rect = util.getMaxRects(_img_close)
    _rect = util.getMaxRects(util.drawRects(np.zeros(img0.shape[:2]), _rect,fill_color=[0,0,0], fill_expanding=[0, word_size_as_th] ,show_result_img=True),
                            show_result_img=True)
    _rect = util.map_list(lambda r:[r[0], r[1] + int(word_size_as_th) - 1, r[2],  r[3] - int(word_size_as_th) + 2], _rect)
    rect_area_rects.extend(_rect)
    print(rect_area_rects)
else:
    print("没找到表格候选区域")
    
_ = util.drawRects(img0, 
                   rect_area_rects,
                   show_result_img=True)
没找到表格候选区域
No description has been provided for this image
In [ ]:
 

制作MASK¶

In [94]:
mask = np.ones(img0.shape[:2]).astype(np.uint8) * 255
img9_rects_exclude_small_rect = util.map_list(lambda r: util.map_list(int, [r[0] - word_size_as_th * 1.5,
                                                                            # r[1] - word_size_as_th * 1.5,
                                                                            r[1] - word_size_as_th,
                                                                            r[2] + word_size_as_th * 1.5,
                                                                            # r[3] + word_size_as_th * 1.5
                                                                            r[3] + word_size_as_th
                                                                            ]), img9_rects_exclude_small_rect)
_ = util.drawRects(mask,
                   img9_rects_exclude_small_rect,
                   line_color=0,
                   fill_color=0,
                   fill_expanding=[0, 0],
                   copy_for_show_result=False,
                   show_result_img=True)
No description has been provided for this image

1.2 确定文字区域¶

In [95]:
# # 方案1
# ws_rects = getWsRects(words, zoom)

# expand = max(0, int(word_size_as_th * zoom / 2))
# _img = np.zeros(img0.shape[:2]).astype(np.uint8)
# util.drawRects(_img, ws_rects,
#                line_color=[255, 0, 0],
#                fill_color=[0, 0, 0],
#                fill_expanding=expand,
#                show_result_img=True, copy_for_show_result=False)
# word_focused = np.bitwise_and(_img, mask)
# util.imshow(word_focused)
# points_set = []
# text_area_rects = util.getMaxRectsTwice(
#     word_focused, show_result_img=True, result=points_set)

# text_area_rects = util.map_list(lambda r: [r[0] + expand - 1, r[1] + expand - 1, r[2] - expand + 2 ,  r[3] -expand + 2], text_area_rects )

# _ = util.drawRects(img0, text_area_rects, show_result_img=True, title="游离的文本行")
In [96]:
# 方案2

ws_rects = getWsRects(words, zoom)
ws_rects_not_in_table = []

mask_text = []
for wr in ws_rects:
    add_flag = True
    for orect in img9_rects_exclude_small_rect:
        if  util.isInnerCell(wr, orect, strict=False)[0]:
            add_flag = False
            mask_text.append(wr)
            break
    if add_flag: 
        ws_rects_not_in_table.append(wr)
_ = util.drawRects(img0, ws_rects_not_in_table, line_color=[255, 0,0], show_result_img=True)

w, h = img9_binary.shape[:2]
# 以字符 size 决定 kernel 高度
k = cv2.getStructuringElement(
    cv2.MORPH_RECT, (w * 3 // 4, int(word_size_as_th)))
# img9_erode = util.erode(img9_binary_filled, k)
# util.imshow([img9_binary_filled, img9_erode])

# ws_rects = getWsRects(words, zoom)
# # 绘制文本框
# img9_word_filled = util.drawRects(img9_binary_filled, ws_rects, fill_color=[255, 255, 255], line_color=0, show_result_img=True)

# TODO 可能不需要
# ws_rects = getWsRects(words, zoom)
# 绘制文本框
util.imshow(img9_binary_filled)
img9_word_filled = util.drawRects(np.ones(img0.shape[:2]).astype(np.uint8) * 255, 
                                  ws_rects_not_in_table, 
                                  fill_color=[255, 255, 255], 
                                  line_color=[0,0,0], 
                                  fill_expanding=expand,
                                  show_result_img=True,
                                  title="img9_word_filled")


word_focused = np.bitwise_not(img9_word_filled)
util.imshow(word_focused)
k = np.ones((int(word_size_as_th * 1.2), int(word_size_as_th)))
word_focused_dilate = util.dilate(word_focused, k)
word_focused_dilate = util.erode(word_focused_dilate, k)
util.imshow(word_focused_dilate, "文字膨胀")
points_set = []
text_area_rects = util.getMaxRects(word_focused_dilate, show_result_img=True, result=points_set)
text_area_rects = util.map_list(lambda r: [r[0] + expand - 1, r[1] + expand - 1, r[2] - expand + 2 ,  r[3] -expand + 2], text_area_rects )
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  210 - right_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
2022-03-09 09:46:19 [INFO] [util-MainThread]  206 - left_in
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
2022-03-09 09:46:20 [INFO] [util-MainThread]  350 - 关键字参数中存在result, 将会将点集放入result中.
No description has been provided for this image
2022-03-09 09:46:20 [INFO] [util-MainThread]  406 - 找外围最大矩形框耗时: [0.1436166763305664]s
In [97]:
k = cv2.getStructuringElement(
    cv2.MORPH_RECT, (w * 3 // 4, int(word_size_as_th)))
img9_word_filled = util.drawRects(np.ones(img0.shape[:2]).astype(np.uint8) * 255, 
                                  mask_text, 
                                  fill_color=[255, 255, 255], 
                                  line_color=[0,0,0], 
                                  fill_expanding=expand,
                                  show_result_img=True,
                                  title="img9_word_filled")

word_focused = np.bitwise_not(img9_word_filled)
util.imshow(word_focused)
k = np.ones((int(word_size_as_th), int(word_size_as_th * 1.2)))
word_focused_dilate = util.dilate(word_focused, k)
word_focused_dilate = util.erode(word_focused_dilate, k)
util.imshow(word_focused_dilate, "文字膨胀")
points_set = []
text_area_rects_mask = util.getMaxRectsTwice(word_focused, show_result_img=True, result=points_set)
text_area_rects_mask = util.map_list(lambda r: [r[0] + expand - 1, r[1] + expand - 1, r[2] - expand + 2 ,  r[3] -expand + 2], text_area_rects_mask )
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
2022-03-09 09:46:21 [INFO] [util-MainThread]  406 - 找外围最大矩形框耗时: [0.05585312843322754]s
2022-03-09 09:46:21 [INFO] [util-MainThread]  350 - 关键字参数中存在result, 将会将点集放入result中.
No description has been provided for this image
2022-03-09 09:46:21 [INFO] [util-MainThread]  406 - 找外围最大矩形框耗时: [0.1934823989868164]s

1.2.1 根据字号先划分成多块¶

In [98]:
ws_rects_float = getWsRects(words, zoom, save_float=True)
ws_rects_dict = util.groupby(ws_rects_float, key=lambda r: "%.2f" % (r[2] - r[0]))

1.2.2 再确定文本区域¶

In [99]:
text_area_rects_2 = []
for k, group in ws_rects_dict.items():
    _expand = max(0, int(eval(k) * zoom * 0.6))
    _img = np.zeros(img0.shape[:2]).astype(np.uint8)
    for r in group:
        util.drawRects(_img, util.map_list(int, r), 
                       line_color=[255,0,0],
                       fill_color=[0,0,0], 
                       copy_for_show_result=False, 
                       fill_expanding=_expand,
                       show_result_img=False,
                       title=f"字号 {k}")
        
        
    _img = np.bitwise_and(_img, mask)
    
    temp = util.getMaxRectsTwice(_img, line_color=[255, 0, 0], show_result_img=True, title=f"字号 {k}")
    # break
    temp = util.map_list(lambda r: [r[0] + _expand - 1, r[1] + _expand - 1, r[2] - _expand + 2 ,  r[3] -_expand + 2], temp )
    for r in temp:
        if (r[2] - r[0]) > eval(k) * 0.8:
            text_area_rects_2.append(r)
            text_area_rects.append(r)
    # TODO
    # _rects = text_area_rects.copy()
    # for cell_i in temp:
    #     for cell_o in _rects:
    #         if 
    #         text_area_rects.append(cell_i)
# 过滤 小文本区域

text_area_rects = text_area_rects_2
text_area_rects.extend(text_area_rects_mask)

# text_area_rects = list(filter(lambda r: util.calc_area(r) > word_size_as_th ** 2, text_area_rects))
# util.dump(_words, "ws.json")   
2022-03-09 09:46:21 [INFO] [util-MainThread]  406 - 找外围最大矩形框耗时: [0.005983114242553711]s
No description has been provided for this image
2022-03-09 09:46:21 [INFO] [util-MainThread]  406 - 找外围最大矩形框耗时: [0.18650007247924805]s
2022-03-09 09:46:21 [INFO] [util-MainThread]  406 - 找外围最大矩形框耗时: [0.004987955093383789]s
No description has been provided for this image
2022-03-09 09:46:21 [INFO] [util-MainThread]  406 - 找外围最大矩形框耗时: [0.13463973999023438]s
2022-03-09 09:46:21 [WARNING] [util-MainThread]  363 - 图片不存在白色区域!
No description has been provided for this image
2022-03-09 09:46:21 [WARNING] [util-MainThread]  363 - 图片不存在白色区域!
No description has been provided for this image

1.3 绘制区域示意图¶

In [100]:
rects = [image_area_rects, 
         line_area_rects ,
         rect_area_rects,
         text_area_rects,
         getWsRects(words, zoom)]
colors = [Color.IMAGE, Color.LINE, Color.RECT, Color.TEXT_AREA, Color.TEXT_LINE]
color_img = None
for c in colors:
    i = np.array([c for _ in range(100)]).astype(np.uint8)
    i = np.reshape(i, (10,10,3))
    if color_img is None:
        color_img = i
    else:
        color_img = np.concatenate((color_img, i), 1)

util.imshow(color_img, " | ".join(["image", "line", "rectangle", "text_area", "text_line"]))

    
img = img0.copy()
for r, c in zip(rects, colors):
    util.drawRects(img, r, 
                   line_color=c, 
                #    fill_color=c,
                   show_result_img=True,
                   copy_for_show_result=False)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

1.4 页面表格化¶

In [101]:
img = img0.copy() * 0
for r, c in zip(rects, colors):
    util.drawRects(img, r, 
                   line_color=c, 
                   show_result_img=False,
                   copy_for_show_result=False)
util.imshow(img)
No description has been provided for this image
In [102]:
img_bi = util.getBinary(img, 1, 0, 255)
util.imshow(img_bi)
page_areas = []
page_areas_imgs = []
def extractLines(img_bi, hline=True, rate=0.95):
    import util
    lines = util.extractHline(img_bi, rate=rate) if hline else util.extractVline(img_bi,  rate=rate)
    return util.getSubImgsAndPos(img_bi, lines)

imgs , pos = extractLines(img_bi)
util.imshow(imgs)
page_areas_imgs.extend(imgs)
page_areas.extend(util._offset([0,0], pos))
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

1.5 页面区域再划分¶

In [103]:
img = img0.copy() * 0
for r, c in zip(rects, colors):
    util.drawRects(img, r, 
                   line_color=c, 
                   show_result_img=False,
                   copy_for_show_result=False)
util.imshow(img)
img_bi = util.getBinary(img, 1, 0, 255)

rs = []
util.dump(util.page_rebulid(img_bi, rects=rs, rate = 0.98), PAGE_OUT)
_ = util.drawRects(img, rs, line_color=[255,255,255], show_result_img=True)
page_areas = rs
img = img0.copy()
_ = util.drawRects(img, 
               page_areas,
               show_result_img=True,
               line_color=[0,0,0], 
               title="区域划分示意图")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
2022-03-09 09:46:25 [INFO] [util-MainThread]  1086 - start=0, end=44
No description has been provided for this image
No description has been provided for this image
2022-03-09 09:46:25 [INFO] [util-MainThread]  1086 - start=0, end=30
No description has been provided for this image
No description has been provided for this image
2022-03-09 09:46:25 [INFO] [util-MainThread]  1086 - start=0, end=621
No description has been provided for this image
No description has been provided for this image
2022-03-09 09:46:25 [INFO] [util-MainThread]  1086 - start=0, end=106
2022-03-09 09:46:25 [INFO] [util-MainThread]  1227 - 没有子图了
2022-03-09 09:46:25 [INFO] [util-MainThread]  1227 - 没有子图了
2022-03-09 09:46:25 [INFO] [util-MainThread]  1227 - 没有子图了
2022-03-09 09:46:26 [INFO] [util-MainThread]  1227 - 没有子图了
No description has been provided for this image
No description has been provided for this image
In [104]:
rects = [image_area_rects,
         line_area_rects,
         rect_area_rects + page_areas,
         text_area_rects,
         getWsRects(words, zoom)]
colors = [Color.IMAGE, Color.LINE, Color.RECT,
          Color.TEXT_AREA, Color.TEXT_LINE]
color_img = None
for c in colors:
    i = np.array([c for _ in range(100)]).astype(np.uint8)
    i = np.reshape(i, (10, 10, 3))
    if color_img is None:
        color_img = i
    else:
        color_img = np.concatenate((color_img, i), 1)


ls = ["image", "line", "rectangle", "text_area", "text_line"]
util.imshow(color_img, " | ".join(ls))

    
img = img0.copy()
for r, c, t in zip(rects, colors, ls):
    util.drawRects(img, r, 
                   line_color=c, 
                #    fill_color=c,
                   show_result_img=True,
                   copy_for_show_result=False,
                   title=t)

util.imshow(img)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

2.1 图片, rect内部再划分¶

In [105]:
# text_area_rects_add = []
# import pdfplumber
# page = pdfplumber.open(pdf_path).pages[0]

# for _rect in rect_area_rects:
#     if util._calc_area(_rect) < word_size_as_th ** 2 * 4:
#         continue
#     crop_box  = page.within_bbox(util.xy2rc(_rect, 1 / zoom))
#     _words = crop_box.extract_words(
#                 keep_blank_chars=False, 
#                 x_tolerance=word_size_as_th * 1.5, 
#                 y_tolerance=word_size_as_th * 0.25)
#     _rects = []
#     for category in rects:
#         for r in category:
#             if util.isInnerCell(r, _rect)[0]:
#                 _rects.append(r)
#     _img = util.getSubImg(
#         util.drawRects(img0, getWsRects(_words, zoom=zoom), 
#                        line_color=[255,0,0],
#                        show_result_img=True, title="绘制文本行"),
#     _rect
#     )    
    
#     th = DataFrame(_img.flatten()).value_counts().index[0][0]
#     _img_bi = np.bitwise_not(util.getBinary(_img, th - 2, 255, 0))
    
#     util.imshow(_img_bi, "Rect 子图")
    
#     kh, kw = list(map(int, (1, word_size_as_th)))
#     k = np.ones((kh, kw))
#     _im = cv2.morphologyEx(_img_bi, cv2.MORPH_CLOSE, k)
#     util.imshow(_im, "闭运算列连接")
#     kh, kw = list(map(int, (word_size_as_th, 1)))
#     k = np.ones((kh, kw))
#     _im = cv2.morphologyEx(_im, cv2.MORPH_CLOSE, k)
#     util.imshow(_im, "闭运算行连接")
    
#     text_area_rects_add.extend(util.getMaxRectsTwice(_im,
#                                                 directs=[(0, 1), (1, 0)],
#                                                 show_result_img=True,
#                                                 filter=lambda rect: util._calc_area(
#                                                     rect) > word_size_as_th ** 2,
#                                                 offset=_rect,
#                                                 title="绘制文本框"
#                                                 )
#                             )

# # util.dump(_words, "ws.json")

    
In [106]:
def mergeTextArea(text_area_rects, text_area_rects_add, rate = 0.8):
    _rs = []
    walked = set()
    for o in text_area_rects:
        for i in text_area_rects_add:
            if str(i) in walked:
                continue
            f, outer, inner = util.isInnerCell(i, o, strict=False, rate=rate)
            if f:
                _rs.append(outer)
                walked.add(str(inner))

    for i in text_area_rects:
        for o in text_area_rects_add:
            if str(i) in walked:
                continue
            f, outer, inner = util.isInnerCell(i, o, strict=False, rate=rate)
            if f:
                _rs.append(outer)
                walked.add(str(inner))     
    return _rs  

2.2 再次绘制图像¶

In [107]:
pre = text_area_rects.copy()
In [108]:
text_area_rects = pre

pre = text_area_rects.copy()
_ = util.drawRects(_img, text_area_rects,
               show_result_img=True)
No description has been provided for this image
In [109]:
# 大文本区域细分

import pdfplumber
page = pdfplumber.open(pdf_path).pages[0]

length = len(text_area_rects)
for  i in range(length):
    tr = text_area_rects[i]
    if (tr[2] - tr[0]) > 4 * word_size_as_th and (tr[3] - tr[1]) >  img0.shape[1] * 0.5:
        util.drawRects(img0, tr, show_result_img=True)
    
        _img_bi = util.getBinary(util.getSubImg(img0, tr), 250, 0, 255)
        util.imshow(_img_bi, "Rect 子图")
        
        kh, kw = list(map(int, (1, word_size_as_th)))
        k = np.ones((kh, kw))
        _im = cv2.morphologyEx(_img_bi, cv2.MORPH_CLOSE, k)
        util.imshow(_im, "闭运算列连接")
        kh, kw = list(map(int, (word_size_as_th * 1.8, 1)))
        k = np.ones((kh, kw))
        _im = cv2.morphologyEx(_im, cv2.MORPH_CLOSE, k)
        util.imshow(_im, "闭运算行连接")
        
        temp = util.getMaxRects(_im, offset=tr, show_result_img=True)
        if len(temp) > 1:
            text_area_rects[i] = temp[0]
            text_area_rects.extend(temp[1:])
# 过滤 小文本区域

text_area_rects = list(filter(lambda r: util.calc_area(r) > word_size_as_th ** 2, text_area_rects))
util.drawRects(_img, text_area_rects,
               show_result_img=True)
# util.dump(_words, "ws.json")
2022-03-09 09:46:30 [INFO] [pdfdocument-MainThread]  944 - xref found: pos=b'554914'
2022-03-09 09:46:30 [INFO] [pdfdocument-MainThread]  962 - read_xref_from: start=554914, token=/b'xref'
2022-03-09 09:46:30 [INFO] [pdfdocument-MainThread]  134 - xref objects: {1: (None, 17, 0), 2: (None, 66, 0), 3: (None, 122, 0), 4: (None, 209, 0), 5: (None, 605, 0), 6: (None, 2432, 0), 7: (None, 2569, 0), 8: (None, 2597, 0), 9: (None, 2763, 0), 10: (None, 2834, 0), 11: (None, 3089, 0), 12: (None, 363150, 0), 13: (None, 363315, 0), 14: (None, 372526, 0), 15: (None, 372703, 0), 16: (None, 372941, 0), 17: (None, 373567, 0), 18: (None, 373738, 0), 19: (None, 373978, 0), 20: (None, 549919, 0), 21: (None, 549945, 0), 22: (None, 550127, 0), 23: (None, 550370, 0), 24: (None, 550790, 0), 25: (None, 550967, 0), 26: (None, 551222, 0), 27: (None, 551434, 0), 28: (None, 551699, 0), 29: (None, 551930, 0), 30: (None, 552195, 0), 31: (None, 552426, 0), 32: (None, 552691, 0), 33: (None, 552922, 0), 34: (None, 553187, 0), 35: (None, 553418, 0), 36: (None, 553682, 0), 37: (None, 553913, 0), 38: (None, 554184, 0), 39: (None, 554418, 0), 40: (None, 554683, 0)}
2022-03-09 09:46:30 [INFO] [pdfdocument-MainThread]  976 - trailer: {'Root': <PDFObjRef:1>, 'Info': <PDFObjRef:3>, 'Size': 41, 'ID': [b'\xf3W\xa2Lo\xc4\x8cN\x00&\xdd\x07\xf7\xe3\x143', b'\xf3W\xa2Lo\xc4\x8cN\x00&\xdd\x07\xf7\xe3\x143']}
2022-03-09 09:46:30 [INFO] [pdfdocument-MainThread]  182 - trailer: {'Root': <PDFObjRef:1>, 'Info': <PDFObjRef:3>, 'Size': 41, 'ID': [b'\xf3W\xa2Lo\xc4\x8cN\x00&\xdd\x07\xf7\xe3\x143', b'\xf3W\xa2Lo\xc4\x8cN\x00&\xdd\x07\xf7\xe3\x143']}
2022-03-09 09:46:30 [INFO] [pdfpage-MainThread]  109 - Pages: Kids=[<PDFObjRef:4>]
2022-03-09 09:46:30 [INFO] [pdfpage-MainThread]  113 - Page: {'Contents': <PDFObjRef:5>, 'Group': {'CS': /'DeviceRGB', 'S': /'Transparency', 'Type': /'Group'}, 'MediaBox': [0, 0, 595.32, 841.92], 'Parent': <PDFObjRef:2>, 'Resources': {'Font': {'F1': <PDFObjRef:6>, 'F2': <PDFObjRef:14>, 'F5': <PDFObjRef:17>, 'F6': <PDFObjRef:21>, 'F7': <PDFObjRef:24>}, 'ProcSet': [/'PDF', /'Text', /'ImageB', /'ImageC', /'ImageI'], 'XObject': {'Image692': <PDFObjRef:27>, 'Image696': <PDFObjRef:29>, 'Image698': <PDFObjRef:31>, 'Image704': <PDFObjRef:33>, 'Image706': <PDFObjRef:35>, 'Image709': <PDFObjRef:37>, 'Image713': <PDFObjRef:39>}}, 'Tabs': /'S', 'Type': /'Page'}
No description has been provided for this image
Out[109]:
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)
In [110]:
# 调整文本行
# _img = np.zeros(img0.shape[:2])

# util.drawRects(_img, text_area_rects, fill_color=[0,0,0],
#                fill_expanding=[expand, 0],
#                copy_for_show_result=False)

# text_area_rects = util.getMaxRectsTwice(_img, show_result_img=True)

# text_area_rects = util.map_list(lambda r: [r[0] + expand - 1, 
#                                            r[1], 
#                                            r[2] - expand + 1 ,  
#                                            r[3]], text_area_rects )
In [111]:
# 重构页面结构
img = img0.copy() * 0
rects = [image_area_rects, 
         line_area_rects ,
         rect_area_rects,
         text_area_rects,
         getWsRects(words, zoom)]
colors = [Color.IMAGE, Color.LINE, Color.RECT, Color.TEXT_AREA, Color.TEXT_LINE]
for r, c in zip(rects, colors):
    util.drawRects(img, r, 
                   line_color=c, 
                   show_result_img=False,
                   copy_for_show_result=False)
util.imshow(img)
img_bi = util.getBinary(img, 1, 0, 255)

rs = []
util.dump(util.page_rebulid(img_bi, rects=rs, rate = 0.98), PAGE_OUT)
_ = util.drawRects(img, rs, line_color=[255,255,255], show_result_img=True)

page_areas = de_duplicate_rect(rs)
img = img0.copy()
_ = util.drawRects(img, 
               page_areas,
               show_result_img=True,
               line_color=[0,0,0], 
               title="区域划分示意图")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
2022-03-09 09:46:32 [INFO] [util-MainThread]  1086 - start=0, end=44
No description has been provided for this image
No description has been provided for this image
2022-03-09 09:46:32 [INFO] [util-MainThread]  1086 - start=0, end=30
No description has been provided for this image
No description has been provided for this image
2022-03-09 09:46:32 [INFO] [util-MainThread]  1086 - start=0, end=621
No description has been provided for this image
No description has been provided for this image
2022-03-09 09:46:32 [INFO] [util-MainThread]  1086 - start=0, end=106
2022-03-09 09:46:32 [INFO] [util-MainThread]  1227 - 没有子图了
2022-03-09 09:46:32 [INFO] [util-MainThread]  1227 - 没有子图了
2022-03-09 09:46:32 [INFO] [util-MainThread]  1227 - 没有子图了
2022-03-09 09:46:32 [INFO] [util-MainThread]  1227 - 没有子图了
No description has been provided for this image
No description has been provided for this image
In [115]:
rects = [image_area_rects, 
         line_area_rects ,
         rect_area_rects,
        #  mergeTextArea(text_area_rects ,text_area_rects_add),
         text_area_rects,
         getWsRects(words, zoom)]
colors = [Color.IMAGE, Color.LINE, Color.RECT, Color.TEXT_AREA, Color.TEXT_LINE]
color_img = None
for c in colors:
    i = np.array([c for _ in range(100)]).astype(np.uint8)
    i = np.reshape(i, (10,10,3))
    if color_img is None:
        color_img = i
    else:
        color_img = np.concatenate((color_img, i), 1)

ls = ["image", "line", "rectangle", "text_area", "text_line"]
util.imshow(color_img, " | ".join(ls))

    
img = img0.copy()
for r, c, t in zip(rects, colors, ls):
    util.drawRects(img, r, 
                   line_color=c, 
                #    fill_color=c,
                   show_result_img=True,
                   copy_for_show_result=False,
                   title=t)
# 画上区域
_ = util.drawRects(img, page_areas, show_result_img=True, line_color=[0,0,0])
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [116]:
# 画上区域
_img = util.drawRects(img0, page_areas, show_result_img=True, line_color=[0,0,0])
util.imshow(np.concatenate((img0, _img), 1), "划分示意图")
No description has been provided for this image
No description has been provided for this image

3.1 输出区域信息¶

In [114]:
class PageAreaMetaClass(type):
    def __new__(cls, name, bases, attrs):
        pageArea = type.__new__(cls, name, bases, attrs)
        print(cls, name, bases, attrs)
        return pageArea


# class PageArea(dict, metaclass=PageAreaMetaClass):
class PageArea(dict):

    THRESHOLD = 0.98
    HORIZON = "horizon"
    VERTICAL = "vertical"

    def _calc_h(self, rect):
        return rect[2] - rect[0]

    def _calc_w(self, rect):
        return rect[3] - rect[1]

    def calc_h_w(self, rect):
        return self._calc_h(rect), self._calc_w(rect)

    def __init__(self, rects):
        max_area = max(util.map_list(util.calc_area, rects))
        self.area = max_area
        # 添加自身所在区域
        for r in rects:
            if util.calc_area(r) == max_area:
                self.rect = list(r)
                break
        # 加入子页面
        children_rects = list(
            filter(lambda rect: util.calc_area(rect) != max_area, rects))
        children_rects.sort(key=lambda rect: (
            rect[0], rect[2], rect[1], rect[3]))
        self.children_rects = util.map_list(list, children_rects)

        mode = None

        if len(children_rects) > 0:
            _rect = children_rects[0]
            h, w = self.calc_h_w(_rect)
            h1, w1 = self.calc_h_w(self.rect)
            if h / h1 > self.THRESHOLD and w / w1 < self.THRESHOLD:
                mode = self.VERTICAL
            elif h / h1 < self.THRESHOLD and w / w1 > self.THRESHOLD:
                mode = self.HORIZON
            else:
                raise ValueError("未知的分类")

            chs_dict = {}
            keys = util.map_list(str,
                                 list(filter(lambda rect: self._calc_h(rect) / h1 > self.THRESHOLD
                                             if mode == self.VERTICAL else self._calc_w(rect) / w1 > self.THRESHOLD, children_rects)))
            for _rect in children_rects:
                for k in keys:
                    if util.isInnerCell(_rect, eval(k))[0]:
                        if k not in chs_dict:
                            chs_dict[k] = []
                        chs_dict[k].append(_rect)

            self.tag = mode

            children_pages = []
            for v in chs_dict.values():
                children_pages.append(PageArea(v))

            self.children_pages = children_pages

    def __getitem__(self, key):
        return eval(f"self.{key}", globals(), locals()) if hasattr(self, key) else None

    def __repr__(self):
        return util.dumps(eval(str(self.__dict__)))

    __str__ = __repr__


PageArea(page_areas)
Out[114]:
{
    "area": 501832,
    "rect": [
        0,
        0,
        842,
        596
    ],
    "children_rects": [
        [
            21,
            0,
            65,
            596
        ],
        [
            64,
            0,
            94,
            596
        ],
        [
            93,
            0,
            714,
            596
        ],
        [
            713,
            0,
            819,
            596
        ]
    ],
    "tag": "horizon",
    "children_pages": [
        {
            "area": 26224,
            "rect": [
                21,
                0,
                65,
                596
            ],
            "children_rects": []
        },
        {
            "area": 17880,
            "rect": [
                64,
                0,
                94,
                596
            ],
            "children_rects": []
        },
        {
            "area": 370116,
            "rect": [
                93,
                0,
                714,
                596
            ],
            "children_rects": []
        },
        {
            "area": 63176,
            "rect": [
                713,
                0,
                819,
                596
            ],
            "children_rects": []
        }
    ]
}
In [ ]:
 
In [188]:
_page = PageArea(page_areas).children_pages[2]
_rect = _page.rect
print(_page)
util.drawRects(img0, _rect, line_color=[0, 0, 255], show_result_img=True, 
                title=str(_rect))

# for _page in PageArea(page_areas).children_pages:
#     _rect = _page.rect
#     util.drawRects(img0, _rect, line_color=[0, 0, 255], show_result_img=True, 
#                     title=str(_rect))
{
    "area": 132312,
    "rect": [
        86,
        0,
        308,
        596
    ],
    "children_rects": [
        [
            86,
            39,
            308,
            305
        ],
        [
            86,
            304,
            308,
            562
        ],
        [
            91,
            39,
            117,
            305
        ],
        [
            116,
            39,
            307,
            305
        ]
    ],
    "tag": "vertical",
    "children_pages": [
        {
            "area": 59052,
            "rect": [
                86,
                39,
                308,
                305
            ],
            "children_rects": [
                [
                    91,
                    39,
                    117,
                    305
                ],
                [
                    116,
                    39,
                    307,
                    305
                ]
            ],
            "tag": "horizon",
            "children_pages": [
                {
                    "area": 6916,
                    "rect": [
                        91,
                        39,
                        117,
                        305
                    ],
                    "children_rects": []
                },
                {
                    "area": 50806,
                    "rect": [
                        116,
                        39,
                        307,
                        305
                    ],
                    "children_rects": []
                }
            ]
        },
        {
            "area": 57276,
            "rect": [
                86,
                304,
                308,
                562
            ],
            "children_rects": []
        }
    ]
}
No description has been provided for this image
Out[188]:
array([[[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       ...,

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]],

       [[254, 254, 254],
        [254, 254, 254],
        [254, 254, 254],
        ...,
        [254, 254, 254],
        [254, 254, 254],
        [255, 255, 255]],

       [[255, 255, 255],
        [255, 255, 255],
        [255, 255, 255],
        ...,
        [255, 255, 255],
        [255, 255, 255],
        [255, 255, 255]]], dtype=uint8)
In [25]:
# 根据交并比去除重复的区域

sorted_page_areas = sorted(page_areas, key=lambda rect: (rect[0], rect[1], -rect[2],  -rect[3]))
de_duplicate = []
for r in sorted_page_areas:
    if len(de_duplicate) < 1:
        de_duplicate.append(r)
    else:
        last = de_duplicate.pop()
        if util.calc_iou(last, r) > 0.95:
            print('交并比 > 0.95', r, last, util.calc_iou(r, last))
            de_duplicate.append(last)
        else:
            de_duplicate.append(last)
            de_duplicate.append(r)
        
page_areas = de_duplicate             
for _rect in de_duplicate:
    util.drawRects(img0, _rect, line_color=[0, 0, 255], show_result_img=True, 
                   title=str(_rect))
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [62]:
rect_area_rects2 = []
img = img0.copy()
for i in range(len(rect_area_rects)):
    r = rect_area_rects[i]
    _img = util.getSubImg(img0, r)
    
    _img_bi = util.getBinary(_img, 250, 0, 255)
    # util.imshow(_img_bi)
    
    
    kh, kw = list(map(int, (1, word_size_as_th)))
    k = np.ones((kh, kw))
    _im = cv2.morphologyEx(_img_bi, cv2.MORPH_CLOSE, k)
    util.imshow(_im, "闭运算列连接")
    kh, kw = list(map(int, (word_size_as_th, 1)))
    k = np.ones((kh, kw))
    _im = cv2.morphologyEx(_im, cv2.MORPH_CLOSE, k)
    util.imshow(_im, "闭运算行连接")
    temp = util.getMaxRectsTwice(_im, offset=r)
    # 画上区域
    _ = util.drawRects(img0, temp, show_result_img=True, line_color=[255,0,255])
    rect_area_rects2.extend(temp)
No description has been provided for this image
No description has been provided for this image
2022-03-04 16:01:06 [INFO] [util-MainThread]  388 - 找外围最大矩形框耗时: [0.6801793575286865]s
2022-03-04 16:01:07 [INFO] [util-MainThread]  388 - 找外围最大矩形框耗时: [0.7130606174468994]s
2022-03-04 16:01:07 [INFO] [util-MainThread]  490 - 有offset函数.
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
2022-03-04 16:01:08 [INFO] [util-MainThread]  388 - 找外围最大矩形框耗时: [0.6921482086181641]s
2022-03-04 16:01:09 [INFO] [util-MainThread]  388 - 找外围最大矩形框耗时: [0.6392576694488525]s
2022-03-04 16:01:09 [INFO] [util-MainThread]  490 - 有offset函数.
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
2022-03-04 16:01:10 [INFO] [util-MainThread]  388 - 找外围最大矩形框耗时: [0.6811752319335938]s
2022-03-04 16:01:11 [INFO] [util-MainThread]  388 - 找外围最大矩形框耗时: [0.7280511856079102]s
2022-03-04 16:01:11 [INFO] [util-MainThread]  490 - 有offset函数.
No description has been provided for this image

2.3 TODO 大的文本块, 再判断是不是表格¶

聚焦文字, 分析是否有多个散开的文本行, 文本块

In [ ]:
# 3.1 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
img = m.copy()
all_points = list(zip(*(np.where(img > 0))))
im = img == 100
np.where(im)[0]
np.where(img)
In [ ]:
kh, kw = list(map(int, (_img.shape[0], word_size_as_th * 0.6)))
k = np.ones((kh, kw))
_img_erode = util.erode(_img_, k)
util.imshow(_img_erode)
lines = util.extractVline(_img_erode)
print(f"找到 {len(lines)} 条竖线{lines}")
util.drawRects(_img_, util.xy2rc(lines), show_result_img=True)
if len(lines) > 2:
    _imgs, pos = util.getSubImgsAndPos(_img_, lines)
    util.imshow(_imgs, "竖线分割图")
In [14]:
def line_split(points, horizonal=True):
    result = []
    line = [points[0]]
    for p in points:
        r, c = line[-1]
        r1, c1 = p
        if not horizonal:
            r, c = c, r
            r1, c1 = c1, r1
        if c1 -c <2:
            line.append(p)
        else:
            result.append(line)
            line = [p]
    
    if len(line) > 0:
        result.append(line)
    if len(result) > 2:
        print(f"找到{len(result)}条线段")
    return result

from collections import OrderedDict as odict
horizon_lines = odict()
n = 0
# 行收集
line_points = []
for p in all_points:
    r, c = p
    if r not in horizon_lines.keys():
        horizon_lines[r] = ...
        if len(line_points) > 0:
            horizon_lines[r]=line_split(line_points)
            line_points = []
    else:
        line_points.append(p)  
        
    if len(line_points) > 0:
        horizon_lines[line_points[0][0]]=line_split(line_points)

keys = list(horizon_lines)

hareas = []

kset = [keys[0]]
for k in keys[1:]:
    if k - kset[-1] < 2:
        kset.append(k)
    else:
        hareas.append(kset)
        kset = [k]
if len(kset) > 0:
    hareas.append(kset)

def isConnect(l1, l2):
    l1_start, l1_end = l1[0][1], l1[-1][1]
    l2_start, l2_end = l2[0][1], l2[-1][1]
    return (l1_start < l2_start < l1_end) or \
        (l1_start < l2_end < l1_end)
r_range = hareas[1]

flag_list = []

init_lines = horizon_lines[r_range[0]]

line_flags = []
for line in init_lines:
    for point in line:
        flag = {}
        flag["start"] = (point[0][1], point[-1][1])
        flag["mode"] = 0
        line_flags.append(flag)
flag_list.append(line_flags)

pre = r_range[0]
for cur in range(r_range[0], r_range[-1] + 1):
    cur_line = horizon_lines[cur]
    

areas = []
for kset in hareas:
    if len(kset) == 1:
        areas.append(horizon_lines[kset[0]])
        
    ind = 0
    # 横线合并
    walked = set()

    ind = 0
    max_ind = len(kset)
    pre = kset[0]
    pre_lines = horizon_lines[pre]
    for l in pre_lines:
        area = set()
        area.add(str(l[0]))
        areas.append(area)
        for p in l:
            walked.add(p)
    
    for cur in kset[1:]:
        pre_lines = horizon_lines[pre]
        cur_lines = horizon_lines[cur]
        for pre_l in pre_lines:
            
            if pre_l[0] not in walked:
                    area.add(str(pre_l[0]))
                    for p in pre_l:
                        walked.add(p)
                            
            for cur_l in cur_lines:
                if isConnect(cur_l, pre_l):
                    for p in cur_l:
                        walked.add(p)

                    
 
# TEST CELL

import pdfplumber
page = pdfplumber.open(pdf_path).pages[0]

crop_box  = page.within_bbox(util.xy2rc(text_area_rects[3], 1 / zoom))
crop_box.chars = []
crop_box.extract_words(
            keep_blank_chars=False, 
            x_tolerance=word_size_as_th * 1.5, 
            y_tolerance=word_size_as_th * 0.25)
# TEST CELL
import pdfplumber
page = pdfplumber.open(pdf_path).pages[0]

crop_box  = page.within_bbox(util.xy2rc(text_area_rects[3], 1 / zoom))
_words = crop_box.extract_words(
            keep_blank_chars=False, 
            x_tolerance=word_size_as_th * 1.5, 
            y_tolerance=word_size_as_th * 0.25)

_words
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到4条线段
找到4条线段
找到3条线段
找到4条线段
找到5条线段
找到6条线段
找到6条线段
找到3条线段
找到4条线段
找到5条线段
找到6条线段
找到6条线段
找到3条线段
找到4条线段
找到5条线段
找到6条线段
找到6条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到4条线段
找到4条线段
找到4条线段
找到4条线段
找到5条线段
找到5条线段
找到6条线段
找到6条线段
找到6条线段
找到6条线段
找到7条线段
找到8条线段
找到9条线段
找到9条线段
找到10条线段
找到10条线段
找到10条线段
找到10条线段
找到11条线段
找到12条线段
找到13条线段
找到14条线段
找到15条线段
找到16条线段
找到17条线段
找到18条线段
找到18条线段
找到3条线段
找到4条线段
找到5条线段
找到5条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
找到3条线段
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
lines = util.extractHline(img9_erode)  # 调整直线的位置
lines = adjustLines(lines, img9_rects_exclude_small_rect, th=word_size_as_th * 1.2 * zoom)

util.drawRects(img9, util.xy2rc(lines), show_result_img=True)
imgs, pos = util.getSubImgsAndPos(img9_word_filled, lines)
# imgs, pos = util.getSubImgsAndPos(img9, lines)
# util.imshow(imgs)
In [ ]:
kh, kw = list(map(int, (word_size_as_th * 0.6, _img.shape[1])))
k = np.ones((kh, kw))
_img_erode = util.erode(_img_, k)
util.imshow(_img_erode)
lines = util.extractHline(_img_erode)
util.drawRects(_img_, util.xy2rc(lines), show_result_img=True)
if len(lines) > 2:
    pass
In [ ]:
img_children, _ = util.getSubImgsAndPos(img9, lines)
img_children = iter(img_children)

lines_iter = iter(lines)
# 分栏
offset = 0
for img in imgs:
    offset = next(lines_iter)[1]
    h, w = img.shape[:2]
    whiteBars = util.getWhiteBarWidthList(img, isHline=False, rate=0.8)
    whiteBars = [bar for bar in whiteBars if bar > word_size_as_th]

    image = next(img_children)
    if len(whiteBars) > 2:
        # 需要继续切分
        kh = h * 3 // 5
        kw = int(word_size_as_th * 1.2 * zoom)
        k = np.ones((kh, kw))

        img_erode = util.erode(img, k, anchor=[kh - 1, -1])
        vlines = util.extractVline(img_erode, rate=0.75, th=word_size_as_th)
        # vlines = adjustLines(vlines, img9_rects_exclude_small_rect, th=word_size_as_th, is_hline=False)
        util.drawRects(image, util.xy2rc(vlines), show_result_img=True)
        _imgs, _pos = util.getSubImgsAndPos(image, vlines)
        for line in vlines:
            line[1] = line[1] + offset
            line[3] = line[3] + offset
            print("add:", line)
            lines.append(line)
        # _imgs, _pos = util.getSubImgsAndPos(img, vlines)
        util.imshow(_imgs)
    else:
        util.imshow(image)
util.canShow = True
_ = util.drawRects(img0, util.xy2rc(lines), show_result_img=True)
In [ ]:
 
In [ ]:
df = DataFrame(util.getWhiteBarWidthList(img9, isHline=True, rate=0.8))
series = df.value_counts()
print(series)
print(f'{series.index[0][0]} 出现次数最多, 为 {series.array[0]} 次。')
util.imshow(img9)
k = np.ones((series.index[0][0] * 2, img9.shape[1]))
img9_erode = util.erode(img9, k)
util.imshow(img9_erode)
In [ ]:
hlines = util.extractHline(img9_erode)

util.drawRects(img9, util.xy2rc(hlines), show_result_img=True)
In [ ]:
img9_imgs, img9_poses = util.getSubImgsAndPos(img9, hlines)
util.imshow(img9_imgs)
In [ ]:
img9_body = img9_imgs[1]
util.imshow(img9_body)
# rects = util.simpleLocate(np.bitwise_not(img9_body), show_result_img=True)
In [ ]:
h, w = img9_body.shape[:2]
img9_area = h * w
img9_area

META DATA 分析¶

In [ ]:
import importlib
import util
importlib.reload(util)
util.debug =False
In [ ]:
rects = [d for d in page.rects if d['y1'] < 500]
In [ ]:
import pdfplumber
import pandas as pd

from pdfplumber.page import Page

with pdfplumber.open(r"C:\Users\Administrator\Desktop\xiaokuang.pdf") as pdf:
    page = pdf.pages[0]  # 第一页的信息

    print(type(page.rects[0]))
    print("----------------------------------------------")

    tables = Page.extract_tables(page, {'vertical_strategy': 'explicit',
                                        'horizontal_strategy': 'explicit',
                                        'explicit_horizontal_lines': 
                                            list(
                                                map(lambda d: d['y0'], rects))
                                            + list(map(lambda d: d['y1'], rects)),
                                        'explicit_vertical_lines':
                                            list(
                                                map(lambda d: d['x0'], rects))
                                            + list(map(lambda d: d['x1'], rects))
                                        })
    import pandas as pd
    print(util.dumps(tables), file=open("log.txt", 'w', encoding="utf-8"))
    
In [ ]:
w_s_stats
In [ ]:
importlib.reload(util)
In [ ]:
# 文本框
# rects.extend(getWsRects(ws))

# pdf_path = r'C:\Users\Administrator\Desktop\xiaokuang.pdf'
pdf_path = r'C:\Users\Administrator\Desktop\fl.pdf'
pg = 0
util.pdf2img(pdf_path, pg, f'{pg}.png', zoom_x=1, zoom_y=1)
ws, w_size , w_s_stats, page = util.getWords(pdf_path, pg)
img0 = None
exec(f"""
import cv2
img{pg} = cv2.imread('{pg}.png')
""")

h, w = img0.shape[:2]
util.dump(ws, 'ws.json')

3.1 划分表格, 图片区域¶

In [ ]:
util.imshow(img0)
img0_rects = util.getBinary(
    util.drawRects(np.empty(img0.shape).astype(np.uint8), getRectsInPage(page), show_result_img=False, line_color=[255,0,0]),
    0,
    255,
    0
    )
util.imshow(img0_rects)
# 得到最大的矩形区域
img0_rects_exclude_small_rect = util.getMaxRects(img0_rects, show_result_img=False)
# 区域划分示意图

_ = util.drawRects(img0, img0_rects_exclude_small_rect,line_color=[0,0,255], show_result_img=True)
In [ ]:
img0_binary = util.getBinary(img0, 200, 255, 0)
util.imshow(img0_binary)
rs = img0_rects_exclude_small_rect.copy()

img0_binary_filled = util.drawRects(img0_binary, rs, line_color=[255,255,255], fill_color=[255,255,255], show_result_img=True)
In [ ]:
# 绘制文本行区域
img0_binary_filled = util.drawRects(img0_binary_filled, getWsRects(ws),line_color=[0,0,0], fill_color=[255,255,255], show_result_img=True)
In [ ]:
import pandas as pd
from pandas import DataFrame

df = DataFrame(util.getWhiteBarWidthList(img0_binary_filled, rate=0.8))
df.describe()
series = df.value_counts()
print(f'{series.index[0][0]} 出现次数最多, 为 {series.array[0]} 次。')
In [ ]:
w, h = img0.shape[:2]
k = np.ones((series.index[0][0] // 1, w * 3 // 4))
img0_erode = util.erode(img0_binary_filled, k)
util.imshow(img0_erode)
In [ ]:
lines = util.extractHline(img0_erode)
# 调整直线的位置
for i in range(len(lines)):
    x, y, x1, y1 = lines[i]
    for r in img0_rects_exclude_small_rect:
        r, c, r1, c1 = r
        if r < y < r1:
            print(lines[i], r)
            lines[i] = [x, max(0, r - 1), x1, r]
        
util.imwrite(util.drawRects(img0, util.xy2rc(lines), show_result_img=True))
imgs, pos = util.getSubImgsAndPos(img0_binary_filled, lines)
util.imshow(imgs)

3.1.1 处理分栏的情况¶

In [ ]:
for img in imgs:

    df = DataFrame(util.getWhiteBarWidthList(img, isHline=False, rate=0.8))
    series = df.value_counts()
    if series.size > 2:
        # 需要继续切分
        print(series, f'[len={len(series)}]')
        print(f'{series.index[0][0]} 出现次数最多, 为 {series.array[0]} 次。')

        w, h = img.shape[:2]
        k = np.ones((h * 3 // 5, getSeriesValue(series) // 2))
        img_erode = util.erode(img, k, anchor=[h * 3 // 5 - 1,  getSeriesValue(series) // 2 - 1])
        vlines = util.extractVline(img_erode, rate=0.75)
        util.drawRects(img, util.xy2rc(vlines), show_result_img=True)
        _imgs, _pos = util.getSubImgsAndPos(img, vlines)
        print(len(_imgs), vlines)
        util.imshow(_imgs)
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
import numpy as np
img0_binary = util.getBinary(img0, 200, 255, 0)
img_draw_rects = util.drawRects(img0, rects)
util.imshow(img_draw_rects)
In [ ]:
rs = list(map(lambda rect: util.xy2rc(
    list(map(int, (rect["x0"], rect["top"], rect["x1"], rect["bottom"])))
    ), 
              ws))
In [ ]:
util.drawRects(img9, rects=rs, line_color=[125,0,0], show_result_img=True)
In [ ]:
 
In [ ]:
m9 = cv2.imread('9.png')
m9_imgs, _ = util.getSubImgsAndPos(m9, hlines)

EXT 定位有线表格区域¶

In [ ]:
for r in rects:
    if util._calc_area(r) > img9_area * 0.125:
        print(util._calc_area(r))
        util.drawRects(m9_imgs[1], [r], line_color=[255, 0, 0], show_result_img=True)
In [ ]:
import util

pdf_path = r'D:\DongDongsFiles\file\2022-02\H2_AN202011171430084484_1.pdf'
pg = 11
util.pdf2img(pdf_path, pg, f'{pg}.png', zoom_x=1, zoom_y=1)
In [ ]:
import sys
sys.getrecursionlimit()
In [ ]:
import cv2
import numpy as np
import util
import sys
sys.setrecursionlimit(100000)

binary_img = lambda path: cv2.cvtColor(cv2.imread(path), cv2.COLOR_RGB2GRAY) 

tm_binary = np.where(binary_img('11.png') > 200, 255, 0).astype(np.uint8)
tm = cv2.imread('11.png')
In [ ]:
from pandas import DataFrame

df = DataFrame(util.getWhiteBarWidthList(tm_binary, isHline=True, rate=0.8))
series = df.value_counts()
print(series)
print(f'{series.index[0][0]} 出现次数最多, 为 {series.array[0]} 次。')
util.imshow(tm_binary)
# TODO, how to get 16
k = np.ones((16, tm_binary.shape[1]))
tm_erode = util.erode(tm_binary, k)
util.imshow(tm_erode)
In [ ]:
k = util.CV2_K_CROSS(tm_binary.shape[0] // 8)
util.imshow(k * 255)
In [ ]:
t = util.erode(np.bitwise_not(tm_binary), k)
util.imshow([np.bitwise_not(tm_binary), t])
In [ ]:
hlines = util.extractHline(tm_erode)
rects = util.drawRects(tm, util.xy2rc(hlines), show_result_img=True)
In [ ]:
tm_imgs, _ = util.getSubImgsAndPos(tm, hlines)
util.imshow(tm_imgs[1])
In [ ]:
_ = util.drawRects(tm, rects, show_result_img=True)