EXT: RELOAD¶
In [ ]:
# Hot-reload the project helper module after util.py has been edited.
# `util` has to be bound before it can be reloaded; without the import this
# cell raised NameError when it was the first one run in a fresh kernel.
from importlib import reload

import util

reload(util)
函数定义¶
In [ ]:
import util
from pandas import DataFrame
import numpy as np
import cv2
def getRcRects(rects, h, zoom=1.0, exclude_page_rect=True):
    """Convert PDF objects into zoomed, integer rectangles.

    For every object the ``x0``/``x1`` values and the vertically flipped
    ``y`` values (``h - y1`` and ``h - y0``) are multiplied by ``zoom``,
    truncated with ``int`` and handed to ``util.xy2rc``.

    Args:
        rects: Iterable of mappings with ``x0``, ``y0``, ``x1``, ``y1`` keys.
        h: Height used to flip the vertical axis.
        zoom: Scale factor applied to every coordinate.
        exclude_page_rect: When true, converted rectangles whose first two
            values are both 0 are dropped.

    Returns:
        list: The converted rectangles, in input order.
    """
    print("getRcRects, zoom=", zoom)

    def to_rc(obj):
        xy_box = (
            obj['x0'] * zoom,
            (h - obj['y1']) * zoom,
            obj['x1'] * zoom,
            (h - obj['y0']) * zoom,
        )
        return util.xy2rc(util.map_list(int, xy_box))

    kept = []
    for box in util.map_list(to_rc, rects):
        if exclude_page_rect:
            first, second, _third, _fourth = box
            if first == 0 and second == 0:
                continue
        kept.append(box)
    return kept
def getWsRects(words, zoom=1.0):
    """Convert extracted words into zoomed, integer rectangles.

    Args:
        words: Iterable of mappings with ``x0``, ``top``, ``x1`` and
            ``bottom`` keys.
        zoom: Scale factor applied to every coordinate.

    Returns:
        list: One ``util.xy2rc`` result per word, in input order.
    """
    def to_rc(word):
        xy_box = (
            word['x0'] * zoom,
            word['top'] * zoom,
            word['x1'] * zoom,
            word['bottom'] * zoom,
        )
        return util.xy2rc(util.map_list(int, xy_box))

    return util.map_list(to_rc, words)
def getRectsInPage(page, exclude_page_rect=True, zoom=1, draw_image=False):
    """Collect the rectangles of the drawn objects on a page.

    Lines, curves and rects are always included; images only when
    ``draw_image`` is true.

    Args:
        page: pdfplumber page; its ``lines``, ``curves``, ``rects``,
            ``images``, ``height`` and ``width`` attributes are read.
        exclude_page_rect: Drop rectangles anchored at the origin (the page
            frame). ``False`` keeps every rectangle.
        zoom: Scale factor applied to every coordinate.
        draw_image: Also include the page's images.

    Returns:
        list: Rectangles as produced by ``getRcRects``.
    """
    sources = [page.lines, page.curves, page.rects]
    if draw_image:
        sources.append(page.images)

    rects = []
    for objs in sources:
        # Forward the flag. getRcRects used to run with its own default
        # (True), so exclude_page_rect=False was silently ignored.
        converted = getRcRects(objs, page.height, zoom,
                               exclude_page_rect=exclude_page_rect)
        for rect in converted:
            if exclude_page_rect:
                r, c, r1, c1 = rect
                # Kept as a second line of defence; getRcRects already drops
                # origin-anchored rectangles when the flag is set.
                if r == c == 0 and r1 - page.height * zoom < 1 and c1 - page.width * zoom < 1:
                    continue
            rects.append(rect)
    return rects
def getSeriesValue(series):
    """Return the smallest key among the entries tied with the first count.

    ``series`` is indexed by tuples (as produced by
    ``DataFrame.value_counts()``); the first element of each index tuple is
    the key and the series value is its count.

    Args:
        series: Non-empty counts series whose first entry holds the
            reference count.

    Returns:
        The minimum key whose count equals the first entry's count.
    """
    reference_count = series.values[0]
    tied_keys = [series.index[0][0]]
    tied_keys.extend(
        key[0] for key, count in series.items() if count == reference_count
    )
    return min(tied_keys)
def adjustLines(lines, rects_exclude_small_rect, th=0, is_hline=True):
    """Move separator lines that fall inside a rectangle onto its edges.

    A horizontal line ``[x, y, x1, y1]`` whose ``y`` lies strictly between
    ``r + th`` and ``r1 + th`` of a rectangle ``[r, c, r1, c1]`` is replaced
    by a line just above the rectangle's top edge, and a second line is added
    at its bottom edge. Vertical lines are tested against the rectangle's
    column range instead. Only the first matching rectangle (after sorting)
    is used for each line.

    Both arguments are modified in place: the rectangle list is sorted and
    matched entries of ``lines`` are overwritten.

    Args:
        lines: List of ``[x, y, x1, y1]`` separator lines.
        rects_exclude_small_rect: List of ``[r, c, r1, c1]`` rectangles.
        th: Offset added to both rectangle bounds before the containment test.
        is_hline: True for horizontal separators, False for vertical ones.

    Returns:
        list: Unmatched lines plus the replacement pairs, sorted by their
        second coordinate. A replaced line appears once (it used to be
        appended twice).
    """
    if is_hline:
        rects_exclude_small_rect.sort(key=lambda d: (d[0], -d[2]))
    else:
        rects_exclude_small_rect.sort(key=lambda d: (d[1], -d[3]))

    lines_new = []
    for i, (x, y, x1, y1) in enumerate(lines):
        for r, c, r1, c1 in rects_exclude_small_rect:
            if is_hline:
                if not ((r + th) < y < (r1 + th)):
                    continue
                first = [x, max(0, r - 1), x1, r]
                second = [x, r1 - 1, x1, r1]
            else:
                if not ((c + th) < x < (c1 + th)):
                    continue
                # NOTE(review): the first line keeps the original x instead of
                # moving to the rectangle's left edge `c`, unlike the
                # horizontal branch; left as found, confirm the intent.
                first = [max(0, x - 1), r, x, r1]
                second = [c1 - 1, r, c1, r1]
            lines[i] = first
            lines_new.append(list(first))
            lines_new.append(second)
            break
        else:
            # No rectangle contains this line: keep it unchanged.
            lines_new.append(lines[i])
    lines_new.sort(key=lambda it: it[1])
    return lines_new
1. PDF Page To Image¶
1.1 To Image¶
In [ ]:
# Convert page `pg` of the PDF to the image file '<pg>.png' through
# util.pdf2img, with a zoom factor of 2 on both axes.
import util
pdf_path = r'D:\DongDongsFiles\file\2022-02\H2_AN202011171430084484_1.pdf'
pg = 5
util.pdf2img(pdf_path, pg, f'{pg}.png', zoom_x=2, zoom_y=2)
In [ ]:
# Try pdfplumber's table extraction with the "lines_strict" vertical strategy.
# NOTE(review): `page` is only assigned in a later cell; run that cell first.
page.extract_tables({"vertical_strategy": "lines_strict"})
In [ ]:
import util
import pdfplumber
from pandas import DataFrame

# Sample documents tried so far. Only the last assignment ever took effect,
# so the overwritten ones are kept as comments.
# pdf_path = r'C:\Users\Administrator\Desktop\xiaokuang.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\tu_pian.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\ban_fl.pdf'
pdf_path = r'C:\Users\Administrator\Desktop\fl2.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\fl.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\fl0.pdf'
pg = 0

# The document is deliberately left open: `page` is used by later cells.
pdf = pdfplumber.open(pdf_path)
# Page under inspection
page = pdf.pages[pg]
# # keep_blank_chars: do not treat spaces as delimiters
# # x_tolerance: maximum tolerated distance between two texts on the same line
# # The horizontal distance is set fairly large here, which merges texts that
# # are close together inside tables
# # TODO optimise!
# words = page.extract_words(
# keep_blank_chars=False, x_tolerance=4.50 * 1.5, y_tolerance=4.5)
# words.sort(key=lambda x: x["top"])
# w_size = list(map(lambda w: w["adv"], words))
# df = DataFrame(w_size)
util.pdf2img(pdf_path, pg, f'{pg}.png', zoom_x=1, zoom_y=1)
# Show the file that was just written (was hard-coded to "0.png").
util.imshow(f'{pg}.png')
In [ ]:
import pandas as pd
from pandas import DataFrame

# For every image on the page, print how often each character size occurs
# inside the image's bounding box.
# page.images is an empty list when the page has no images, whereas
# page.objects["image"] raised KeyError in that case.
for it in page.images:
    # page.crop expects (x0, top, x1, bottom) measured from the top of the
    # page; the y0/y1 keys used before are measured from the bottom.
    rect = [it[attr] for attr in ["x0", "top", "x1", "bottom"]]
    page_crop = page.crop(rect)
    print(f"rect={rect}", DataFrame(util.map_list(
        lambda ch: ch["size"],
        page_crop.chars)).value_counts())

# Character size used as the page-wide reference (see getSeriesValue).
word_size = getSeriesValue(DataFrame(
    util.map_list(
        lambda ch: ch["size"],
        page.chars)).value_counts()
)
In [ ]:
# image
import numpy as np

# Blank single-channel canvas sized from the page's integer height and width.
h, w = tuple(map(int, (page.height, page.width)))
img = np.zeros((h, w), dtype=np.uint8)
# page.images is [] when the page has no images; page.objects["image"]
# raised KeyError in that case.
for image in page.images:
    # Flip the bottom-origin y values so the box is [top, left, bottom, right].
    box = util.map_list(int, [h - image["y1"],
                              image["x0"],
                              h - image["y0"],
                              image["x1"]])
    util.drawRects(img, [box],
                   copy_for_show_result=False,
                   show_result_img=True
                   )
In [ ]:
# line
# page.lines is [] when the page has no lines; page.objects["line"] raised
# KeyError in that case.
for line in page.lines:
    # Flip the bottom-origin y values so the box is [top, left, bottom, right].
    box = util.map_list(int, [h - line["y1"],
                              line["x0"],
                              h - line["y0"],
                              line["x1"]])
    util.drawRects(img, [box],
                   copy_for_show_result=False,
                   show_result_img=True
                   )
In [ ]:
h, w
In [ ]:
# rect
# page.rects is [] when the page has no rects; page.objects["rect"] raised
# KeyError in that case.
for rect_obj in page.rects:
    # Flip the bottom-origin y values so the box is [top, left, bottom, right].
    raw_box = [h - rect_obj["y1"],
               rect_obj["x0"],
               h - rect_obj["y0"],
               rect_obj["x1"]
               ]
    rect = [util.map_list(int, raw_box)]
    # Skip (and report) rectangles that cover essentially the whole page.
    if util._calc_area(rect[0]) >= (h-1)//1 * w//1:
        print(raw_box)
        continue
    util.drawRects(img, rect,
                   copy_for_show_result=False,
                   show_result_img=True
                   )
In [ ]:
# Turn every detected region into a pair of horizontal cut lines (one above,
# one below), then make sure the page start and end are cut lines too.
rects = util.getMaxRects(img, show_result_img=True)
h, w = img.shape[:2]
hlines = []
dilateMode = True
for rect in rects:
    r, c, r1, c1 = rect
    if dilateMode:
        # Stretch the cut lines across the full page width.
        c, c1 = 0, w
    # Clamp so both lines stay inside the image.
    t = 1 if r - 2 <= 0 else (r - 2)
    top = [c, t - 1, c1, t]
    t = h - 2 if r1 + 2 >= h else r1 + 1
    bot = [c, t, c1, t + 1]
    hlines.append(top)
    hlines.append(bot)
# Use the beginning and the end of the page as split lines
if hlines:
    x1, y1, x2, y2 = hlines[0]
    if y2 > 2:
        hlines.insert(0, [x1, 0, x2, 1])
    x1, y1, x2, y2 = hlines[-1]
    if y2 < (h - 2):
        hlines.append([x1, h - 2, x2, h - 1])
else:
    # No region found: hlines[0] used to raise IndexError here. Fall back to
    # the two page-boundary lines.
    hlines = [[0, 0, w, 1], [0, h - 2, w, h - 1]]
hlines
In [ ]:
page.edges
In [ ]:
# Draw the cut lines (converted with util.xy2rc) on the canvas.
util.drawRects(img, util.xy2rc(hlines), show_result_img=True)
img0 = cv2.imread('0.png')
# Split the rendered page along hlines. filterFun presumably keeps only the
# pieces with more rows than word_size; verify against util.getSubImgsAndPos.
imgs, pos =util.getSubImgsAndPos(img0, hlines, filterFun=lambda img: img.shape[0] > word_size)
util.imshow(imgs)
In [ ]:
import util
# Splitting on "/." yields the attribute names objects, chars, lines, rects,
# curves, figures and images.
for attr in "objects/.chars/.lines/.rects/.curves/.figures/.images".split("/."):
    if hasattr(page, attr):
        # The value is passed as its str() representation.
        # NOTE(review): that is a Python repr, so the ".json" files are
        # presumably not valid JSON; confirm what util.dump writes.
        util.dump(str(getattr(page, attr)), f"{attr}.json")
In [ ]:
import util
pdf_path = r'D:\DongDongsFiles\file\2022-02\H2_AN202011171430084484_1.pdf'
# Convert pages 8 and 9 to '8.png' and '9.png' at zoom 1; `pg` is left at 9.
for pg in (8, 9):
    util.pdf2img(pdf_path, pg, f'{pg}.png', zoom_x=1, zoom_y=1)
1.2 二值图¶
In [ ]:
import cv2
import numpy as np
import util
# Binarise both rendered pages with util.getBinary (arguments 200, 255, 0)
# and show them side by side.
img8 = util.getBinary('8.png', 200, 255, 0).astype(np.uint8)
img9 = util.getBinary('9.png', 200, 255, 0).astype(np.uint8)
util.imshow([img8, img9])
2. 全文字¶
1. 将图片, 表格等区域去除¶
In [ ]:
util.imshow(img8)
words, w_size, w_s_stats, page = util.getWords(pdf_path, 8)
# Start from an all-zero canvas. np.empty() returns uninitialised memory, so
# stray non-zero bytes could leak into the binarised result.
canvas = np.zeros(img8.shape, dtype=np.uint8)
img8_rected = util.getBinary(
    util.drawRects(canvas, getRectsInPage(page), show_result_img=False, line_color=[255, 0, 0]),
    0,
    255,
    0
)
util.imshow(img8_rected)
# Get the largest rectangular regions
img8_rects_exclude_small_rect = util.getMaxRects(img8_rected, show_result_img=False)
# Illustration of the region partition
_ = util.drawRects(img8, img8_rects_exclude_small_rect, line_color=[0, 0, 255], show_result_img=True)
img8_binary = img8
rs = img8_rects_exclude_small_rect.copy()
# Paint the detected regions white.
img8_binary_filled = util.drawRects(img8_binary, rs, line_color=[255, 255, 255], fill_color=[255, 255, 255], show_result_img=True)
2.1 水平切几刀¶
2.1.1 计算白线分隔值¶
In [ ]:
# ndarray.shape is (rows, cols), i.e. (height, width). The names used to be
# swapped, so the kernel width was derived from the image height.
h8, w8 = img8_binary.shape[:2]
# Kernel is (width, height): 3/4 of the image width, and the kernel height is
# decided by the character size.
k = cv2.getStructuringElement(cv2.MORPH_RECT, (w8 * 3 // 4, int(getSeriesValue(w_s_stats))))
img8_erode = util.erode(img8_binary, k)
util.imshow(img8_erode)
2.1.2 水平分割¶
In [ ]:
lines = util.extractHline(img8_erode)# Adjust the position of the lines
# adjustLines rewrites entries of `lines` in place; its return value is not
# used here.
adjustLines(lines, img8_rects_exclude_small_rect)
util.imwrite(util.drawRects(img8, util.xy2rc(lines), show_result_img=True))
imgs, pos = util.getSubImgsAndPos(img8, lines)
util.imshow(imgs)
2.1.3 处理分栏¶
In [ ]:
# Draw the text boxes
img8_word_filled = util.drawRects(img8_binary_filled, getWsRects(words), fill_color=[255,255,255], line_color=0, show_result_img=True)
In [ ]:
lines = util.extractHline(img8_erode)# Adjust the position of the lines
# adjustLines rewrites entries of `lines` in place; its return value is not
# used here.
adjustLines(lines, img8_rects_exclude_small_rect)
util.drawRects(img8, util.xy2rc(lines), show_result_img=True)
# Same cut lines as above, applied to the image with the word boxes drawn in.
imgs, pos = util.getSubImgsAndPos(img8_word_filled, lines)
util.imshow(imgs)
In [1]:
# Strips cut from the binarised page, consumed in step with `imgs` (the same
# cut lines applied to the word-box image).
img8_children, _ = util.getSubImgsAndPos(img8, lines)
img8_children = iter(img8_children)
# Column splitting
for img in imgs:
    # ndarray.shape is (rows, cols), i.e. (height, width). The names used to
    # be swapped, so the kernel height was derived from the strip's width.
    h, w = img.shape[:2]
    df = DataFrame(util.getWhiteBarWidthList(img, isHline=False, rate=0.8))
    series = df.value_counts()
    image = next(img8_children)
    if series.size > 2:
        # This strip needs further splitting
        print(series, f'[len={len(series)}]')
        print(f'{series.index[0][0]} 出现次数最多, 为 {series.array[0]} 次。')
        kh = h * 3 // 5
        kw = int(getSeriesValue(w_s_stats) * 1.2)
        k = np.ones((kh, kw))
        img_erode = util.erode(img, k, anchor=[kh - 1, -1])
        vlines = util.extractVline(img_erode, rate=0.75)
        util.drawRects(image, util.xy2rc(vlines), show_result_img=True)
        _imgs, _pos = util.getSubImgsAndPos(image, vlines)
        # _imgs, _pos = util.getSubImgsAndPos(img, vlines)
        util.imshow(_imgs)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-a295eaed4814> in <module> ----> 1 img8_children, _ = util.getSubImgsAndPos(img8, lines) 2 img8_children = iter(img8_children) 3 # 分栏 4 for img in imgs: 5 NameError: name 'util' is not defined
3. 带有表格¶
TODO¶
In [87]:
import util
from pandas import DataFrame
import numpy as np
import cv2
def de_duplicate_rect(page_areas: list, iou=0.95):
    """Remove near-duplicate regions based on intersection-over-union.

    The regions are sorted by ``(rect[0], rect[2], rect[1], rect[3])`` and
    each one is compared only with the most recently kept region.

    Args:
        page_areas (list): Regions to de-duplicate.
        iou (float, optional): A region whose IoU with the previously kept
            one exceeds this value is dropped. Defaults to 0.95.

    Returns:
        list: The de-duplicated rects, in sorted order.
    """
    ordered = sorted(page_areas, key=lambda rect: (rect[0], rect[2], rect[1], rect[3]))
    kept = []
    for candidate in ordered:
        if kept and util.calc_iou(kept[-1], candidate) > iou:
            continue
        kept.append(candidate)
    return kept
def getRcRects(rects, h, zoom=1.0, save_float=False, exclude_page_rect=True):
    """Convert PDF objects into zoomed rectangles.

    For every object the ``x0``/``x1`` values and the vertically flipped
    ``y`` values (``h - y1`` and ``h - y0``) are multiplied by ``zoom``,
    cast with ``float`` or ``int`` and handed to ``util.xy2rc``.

    Args:
        rects: Iterable of mappings with ``x0``, ``y0``, ``x1``, ``y1`` keys.
        h: Height used to flip the vertical axis.
        zoom: Scale factor applied to every coordinate.
        save_float: Keep coordinates as floats instead of truncating to int.
        exclude_page_rect: When true, converted rectangles whose first two
            values are both 0 are dropped.

    Returns:
        list: The converted rectangles, in input order.
    """
    print("getRcRects, zoom=", zoom)
    cast = float if save_float else int

    def to_rc(obj):
        xy_box = (
            obj['x0'] * zoom,
            (h - obj['y1']) * zoom,
            obj['x1'] * zoom,
            (h - obj['y0']) * zoom,
        )
        return util.xy2rc(util.map_list(cast, xy_box))

    kept = []
    for box in util.map_list(to_rc, rects):
        if exclude_page_rect:
            first, second, _third, _fourth = box
            if first == 0 and second == 0:
                continue
        kept.append(box)
    return kept
def getWsRects(words, zoom=1.0, save_float=False, **kw):
    """Convert extracted words into zoomed rectangles, optionally shifted.

    Args:
        words: Iterable of mappings with ``x0``, ``top``, ``x1`` and
            ``bottom`` keys.
        zoom: Scale factor applied to every coordinate.
        save_float: Keep coordinates as floats instead of truncating to int.
        **kw: ``offset`` — a sequence whose first two items are the row and
            column offsets. Each rectangle is shifted by ``offset - 1``, so
            the default ``[1, 1]`` leaves it unchanged.

    Returns:
        list: ``[r, c, r1, c1]`` lists, one per word, in input order.
    """
    # `kw` is a dict, so hasattr(kw, "offset") was always False and a
    # caller-supplied offset was silently ignored; look the key up instead.
    offset = kw.get("offset", [1, 1])[:2]
    cast = float if save_float else int
    _rects = util.map_list(lambda r: util.xy2rc(
        util.map_list(cast, (r['x0'] * zoom, r['top'] * zoom, r['x1'] * zoom, r['bottom'] * zoom))),
        words
    )

    def apply_offset(_rect):
        r_, c_ = offset
        r_, c_ = r_ - 1, c_ - 1
        r, c, r1, c1 = _rect
        return [r + r_, c + c_, r1 + r_, c1 + c_]

    return util.map_list(apply_offset, _rects)
def getRectsInPage(page, exclude_page_rect=True, zoom=1, draw_image=False, is_line_th=4):
    """Collect the rectangles of the drawn objects on a page.

    Lines, curves and rects are always included; images only when
    ``draw_image`` is true. In addition, images thinner than ``is_line_th``
    in either dimension are included because they may really be line
    segments.

    Args:
        page: pdfplumber page; its ``lines``, ``curves``, ``rects``,
            ``images``, ``height`` and ``width`` attributes are read.
        exclude_page_rect: Drop rectangles anchored at the origin (the page
            frame). ``False`` keeps every rectangle.
        zoom: Scale factor applied to every coordinate.
        draw_image: Also include all of the page's images.
        is_line_th: Height/width below which an image is treated as a line.

    Returns:
        list: Rectangles as produced by ``getRcRects``.
    """
    def is_page_frame(rect):
        # Origin-anchored rectangle that does not extend past the zoomed page.
        r, c, r1, c1 = rect
        return r == c == 0 and r1 - page.height * zoom < 1 and c1 - page.width * zoom < 1

    sources = [page.lines, page.curves, page.rects]
    if draw_image:
        sources.append(page.images)

    rects = []
    for objs in sources:
        # Forward the flag. getRcRects used to run with its own default
        # (True), so exclude_page_rect=False was silently ignored.
        converted = getRcRects(objs, page.height, zoom,
                               exclude_page_rect=exclude_page_rect)
        for rect in converted:
            if exclude_page_rect and is_page_frame(rect):
                continue
            rects.append(rect)

    # Some line segments are stored as images...
    converted = getRcRects(page.images, page.height, zoom,
                           exclude_page_rect=exclude_page_rect)
    for rect in converted:
        if exclude_page_rect and is_page_frame(rect):
            continue
        img_h, img_w = util.calc_h_w(rect)
        if img_h < is_line_th or img_w < is_line_th:
            util.logger.info(f"有一个图片可能是线段:{rect}.")
            # NOTE(review): the original indentation was lost; the append is
            # assumed to apply only to thin, line-like images. Confirm.
            rects.append(rect)
    return rects
def getSeriesValue(series):
    """Return the smallest key among the entries tied with the first count.

    ``series`` is indexed by tuples (as produced by
    ``DataFrame.value_counts()``); the first element of each index tuple is
    the key and the series value is its count.

    Args:
        series: Non-empty counts series whose first entry holds the
            reference count.

    Returns:
        The minimum key whose count equals the first entry's count.
    """
    reference_count = series.values[0]
    tied_keys = [series.index[0][0]]
    tied_keys.extend(
        key[0] for key, count in series.items() if count == reference_count
    )
    return min(tied_keys)
def adjustLines(lines, rects_exclude_small_rect, th=0, is_hline=True):
    """Move separator lines that fall inside a rectangle onto its edges.

    A horizontal line ``[x, y, x1, y1]`` whose ``y`` lies strictly between
    ``r + th`` and ``r1 + th`` of a rectangle ``[r, c, r1, c1]`` is replaced
    by a line just above the rectangle's top edge, and a second line is added
    at its bottom edge. Vertical lines are tested against the rectangle's
    column range instead. Only the first matching rectangle (after sorting)
    is used for each line.

    Both arguments are modified in place: the rectangle list is sorted and
    matched entries of ``lines`` are overwritten.

    Args:
        lines: List of ``[x, y, x1, y1]`` separator lines.
        rects_exclude_small_rect: List of ``[r, c, r1, c1]`` rectangles.
        th: Offset added to both rectangle bounds before the containment test.
        is_hline: True for horizontal separators, False for vertical ones.

    Returns:
        list: Unmatched lines plus the replacement pairs, sorted by their
        second coordinate. A replaced line appears once (it used to be
        appended twice).
    """
    if is_hline:
        rects_exclude_small_rect.sort(key=lambda d: (d[0], -d[2]))
    else:
        rects_exclude_small_rect.sort(key=lambda d: (d[1], -d[3]))

    lines_new = []
    for i, (x, y, x1, y1) in enumerate(lines):
        for r, c, r1, c1 in rects_exclude_small_rect:
            if is_hline:
                if not ((r + th) < y < (r1 + th)):
                    continue
                first = [x, max(0, r - 1), x1, r]
                second = [x, r1 - 1, x1, r1]
            else:
                if not ((c + th) < x < (c1 + th)):
                    continue
                # NOTE(review): the first line keeps the original x instead of
                # moving to the rectangle's left edge `c`, unlike the
                # horizontal branch; left as found, confirm the intent.
                first = [max(0, x - 1), r, x, r1]
                second = [c1 - 1, r, c1, r1]
            lines[i] = first
            lines_new.append(list(first))
            lines_new.append(second)
            break
        else:
            # No rectangle contains this line: keep it unchanged.
            lines_new.append(lines[i])
    lines_new.sort(key=lambda it: it[1])
    return lines_new
1.1 确定表格和图片类区域¶
In [88]:
from pandas import DataFrame
# from funs import *
import numpy as np
from importlib import reload
import sys

import cv2

PAGE_OUT = "page_rect.json"
# Sample documents tried so far. Only the last assignment ever took effect,
# so the overwritten ones are kept as comments.
# pdf_path = r'C:\Users\Administrator\Desktop\xiaokuang.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\tu_pian.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\ban_fl.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\fl2.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\fl0.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\fl.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\fl_.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\stream0.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\stream1.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\stream3.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\nb0.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\nb2.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\nb1.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\nb5.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\nb4.pdf'
# pdf_path = r'C:\Users\Administrator\Desktop\lc.pdf'
pdf_path = r'C:\Users\Administrator\Desktop\lc1.pdf'
pg = 0
line_th = 4
zoom = 1
util.pdf2img(pdf_path, pg, f'{pg}.png', zoom_x=zoom, zoom_y=zoom)
# The rest of the notebook always reads `img0`, so bind it directly instead
# of exec()-ing an `img{pg}` assignment that only matched when pg == 0.
img0 = cv2.imread(f'{pg}.png')
# cv2.imread returns None instead of raising when the file cannot be read.
if img0 is None:
    raise FileNotFoundError(f'{pg}.png')
img0.shape
# sys.setrecursionlimit(80000)
class Color:
    """Colour triples for the different kinds of page regions."""

    # NOTE(review): channel order is presumably BGR (OpenCV); confirm against
    # util.drawRects.
    IMAGE = [255, 0, 0]          # image regions
    RECT = [0, 0, 255]           # rectangle regions
    TEXT_AREA = [50, 120, 255]   # text areas
    LINE = [255, 0, 255]         # line regions
    TEXT_LINE = [211, 211, 0]    # text lines
words, w_size, w_s_stats, page = util.getWords(pdf_path, pg)
image_area_rects = getRcRects(page.images, page.height, zoom)
line_area_rects = getRcRects(page.lines, page.height, zoom)
rect_area_rects = getRcRects(page.rects, page.height, zoom)
util.drawRects(img0, rect_area_rects, show_result_img=True)
word_size_as_th = getSeriesValue(w_s_stats)
expand = max(0, int(word_size_as_th * zoom / 2))
# Some headers and footers are drawn as rectangles...
rect_lines = []
for _r in rect_area_rects:
    img_h, img_w = util.calc_h_w(_r)
    # A thin, wide rectangle in the top or bottom 20% of the page is treated
    # as a line. _r[0] is a row coordinate, so the footer bound must come
    # from the page height (it was compared against page.width * 0.8).
    if img_h < max(line_th, word_size_as_th) \
            and img_w > 0.7 * page.width \
            and (_r[0] < page.height * 0.2 or _r[0] > page.height * 0.8):
        util.logger.info(f"有一个矩形可能是线段:{_r}")
        rect_lines.append(_r)
        line_area_rects.append(_r)
def getMaxAreaRect(img, area_rects,
                   directs: list = None,
                   show_result_img=True):
    """Draw the rectangles on a blank canvas and return the merged regions.

    Args:
        img: Image whose shape determines the canvas size.
        area_rects: Rectangles to draw.
        directs: Neighbour offsets forwarded to ``util.getMaxRects``.
            Defaults to the four axis-aligned neighbours.
        show_result_img: Forwarded to the ``util`` drawing/search helpers.

    Returns:
        The result of ``util.getMaxRects`` on the binarised canvas.
    """
    if directs is None:
        # Built per call so no list is shared between invocations.
        directs = [(0, 1), (1, 0), (-1, 0), (0, -1)]
    # np.empty() returns uninitialised memory; use zeros so stray bytes
    # cannot leak into the binarised result.
    canvas = np.zeros(img.shape, dtype=np.uint8)
    img_rected = util.getBinary(
        util.drawRects(canvas, area_rects,
                       show_result_img=show_result_img, line_color=[255, 0, 0]),
        0,
        255,
        0
    )
    # Get the largest rectangular regions
    return util.getMaxRects(img_rected, directs=directs, show_result_img=show_result_img)
# Replace each raw rectangle list with its merged regions (images, lines and
# rects are processed in that order).
image_area_rects, line_area_rects, rect_area_rects = [
    getMaxAreaRect(img0, area)
    for area in (image_area_rects, line_area_rects, rect_area_rects)
]
2022-03-09 09:46:12 [INFO] [pdfdocument-MainThread] 944 - xref found: pos=b'554914' 2022-03-09 09:46:12 [INFO] [pdfdocument-MainThread] 962 - read_xref_from: start=554914, token=/b'xref' 2022-03-09 09:46:12 [INFO] [pdfdocument-MainThread] 134 - xref objects: {1: (None, 17, 0), 2: (None, 66, 0), 3: (None, 122, 0), 4: (None, 209, 0), 5: (None, 605, 0), 6: (None, 2432, 0), 7: (None, 2569, 0), 8: (None, 2597, 0), 9: (None, 2763, 0), 10: (None, 2834, 0), 11: (None, 3089, 0), 12: (None, 363150, 0), 13: (None, 363315, 0), 14: (None, 372526, 0), 15: (None, 372703, 0), 16: (None, 372941, 0), 17: (None, 373567, 0), 18: (None, 373738, 0), 19: (None, 373978, 0), 20: (None, 549919, 0), 21: (None, 549945, 0), 22: (None, 550127, 0), 23: (None, 550370, 0), 24: (None, 550790, 0), 25: (None, 550967, 0), 26: (None, 551222, 0), 27: (None, 551434, 0), 28: (None, 551699, 0), 29: (None, 551930, 0), 30: (None, 552195, 0), 31: (None, 552426, 0), 32: (None, 552691, 0), 33: (None, 552922, 0), 34: (None, 553187, 0), 35: (None, 553418, 0), 36: (None, 553682, 0), 37: (None, 553913, 0), 38: (None, 554184, 0), 39: (None, 554418, 0), 40: (None, 554683, 0)} 2022-03-09 09:46:12 [INFO] [pdfdocument-MainThread] 976 - trailer: {'Root': <PDFObjRef:1>, 'Info': <PDFObjRef:3>, 'Size': 41, 'ID': [b'\xf3W\xa2Lo\xc4\x8cN\x00&\xdd\x07\xf7\xe3\x143', b'\xf3W\xa2Lo\xc4\x8cN\x00&\xdd\x07\xf7\xe3\x143']} 2022-03-09 09:46:12 [INFO] [pdfdocument-MainThread] 182 - trailer: {'Root': <PDFObjRef:1>, 'Info': <PDFObjRef:3>, 'Size': 41, 'ID': [b'\xf3W\xa2Lo\xc4\x8cN\x00&\xdd\x07\xf7\xe3\x143', b'\xf3W\xa2Lo\xc4\x8cN\x00&\xdd\x07\xf7\xe3\x143']} 2022-03-09 09:46:12 [INFO] [pdfpage-MainThread] 109 - Pages: Kids=[<PDFObjRef:4>] 2022-03-09 09:46:12 [INFO] [pdfpage-MainThread] 113 - Page: {'Contents': <PDFObjRef:5>, 'Group': {'CS': /'DeviceRGB', 'S': /'Transparency', 'Type': /'Group'}, 'MediaBox': [0, 0, 595.32, 841.92], 'Parent': <PDFObjRef:2>, 'Resources': {'Font': {'F1': <PDFObjRef:6>, 'F2': <PDFObjRef:14>, 'F5': 
<PDFObjRef:17>, 'F6': <PDFObjRef:21>, 'F7': <PDFObjRef:24>}, 'ProcSet': [/'PDF', /'Text', /'ImageB', /'ImageC', /'ImageI'], 'XObject': {'Image692': <PDFObjRef:27>, 'Image696': <PDFObjRef:29>, 'Image698': <PDFObjRef:31>, 'Image704': <PDFObjRef:33>, 'Image706': <PDFObjRef:35>, 'Image709': <PDFObjRef:37>, 'Image713': <PDFObjRef:39>}}, 'Tabs': /'S', 'Type': /'Page'} 2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread] 994 - Processing page: <PDFPage: Resources={'Font': {'F1': <PDFObjRef:6>, 'F2': <PDFObjRef:14>, 'F5': <PDFObjRef:17>, 'F6': <PDFObjRef:21>, 'F7': <PDFObjRef:24>}, 'ProcSet': [/'PDF', /'Text', /'ImageB', /'ImageC', /'ImageI'], 'XObject': {'Image692': <PDFObjRef:27>, 'Image696': <PDFObjRef:29>, 'Image698': <PDFObjRef:31>, 'Image704': <PDFObjRef:33>, 'Image706': <PDFObjRef:35>, 'Image709': <PDFObjRef:37>, 'Image713': <PDFObjRef:39>}}, MediaBox=[0, 0, 595.32, 841.92]> 2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread] 1020 - render_contents: resources={'Font': {'F1': <PDFObjRef:6>, 'F2': <PDFObjRef:14>, 'F5': <PDFObjRef:17>, 'F6': <PDFObjRef:21>, 'F7': <PDFObjRef:24>}, 'ProcSet': [/'PDF', /'Text', /'ImageB', /'ImageC', /'ImageI'], 'XObject': {'Image692': <PDFObjRef:27>, 'Image696': <PDFObjRef:29>, 'Image698': <PDFObjRef:31>, 'Image704': <PDFObjRef:33>, 'Image706': <PDFObjRef:35>, 'Image709': <PDFObjRef:37>, 'Image713': <PDFObjRef:39>}}, streams=[<PDFStream(5): raw=1756, {'Filter': /'FlateDecode', 'Length': 1754}>], ctm=(1, 0, 0, 1, 0, 0) 2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread] 187 - get_font: create: objid=6, spec={'BaseFont': /b'ABCDEE+\xcb\xce\xcc\xe5', 'DescendantFonts': <PDFObjRef:7>, 'Encoding': /'Identity-H', 'Subtype': /'Type0', 'ToUnicode': <PDFObjRef:13>, 'Type': /'Font'} 2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread] 187 - get_font: create: objid=None, spec={'BaseFont': /b'ABCDEE+\xcb\xce\xcc\xe5', 'CIDSystemInfo': <PDFObjRef:9>, 'CIDToGIDMap': /'Identity', 'DW': 1000, 'FontDescriptor': <PDFObjRef:10>, 'Subtype': /'CIDFontType2', 'Type': 
/'Font', 'W': <PDFObjRef:12>, 'Encoding': /'Identity-H', 'ToUnicode': <PDFStream(13): raw=9139, {'Filter': /'FlateDecode', 'Length': 9137}>} 2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread] 187 - get_font: create: objid=14, spec={'BaseFont': /'Times New Roman', 'Encoding': /'WinAnsiEncoding', 'FirstChar': 32, 'FontDescriptor': <PDFObjRef:15>, 'LastChar': 247, 'Name': /'F2', 'Subtype': /'TrueType', 'Type': /'Font', 'Widths': <PDFObjRef:16>} 2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread] 187 - get_font: create: objid=17, spec={'BaseFont': /'ABCDEE+Calibri', 'Encoding': /'WinAnsiEncoding', 'FirstChar': 32, 'FontDescriptor': <PDFObjRef:18>, 'LastChar': 32, 'Name': /'F5', 'Subtype': /'TrueType', 'Type': /'Font', 'Widths': <PDFObjRef:20>} 2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread] 187 - get_font: create: objid=21, spec={'BaseFont': /'Times New Roman,Bold', 'Encoding': /'WinAnsiEncoding', 'FirstChar': 32, 'FontDescriptor': <PDFObjRef:22>, 'LastChar': 178, 'Name': /'F6', 'Subtype': /'TrueType', 'Type': /'Font', 'Widths': <PDFObjRef:23>} 2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread] 187 - get_font: create: objid=24, spec={'BaseFont': /b'ABCDEE+\xcb\xce\xcc\xe5', 'Encoding': /'WinAnsiEncoding', 'FirstChar': 32, 'FontDescriptor': <PDFObjRef:25>, 'LastChar': 107, 'Name': /'F7', 'Subtype': /'TrueType', 'Type': /'Font', 'Widths': <PDFObjRef:26>} 2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread] 965 - Processing xobj: <PDFStream(37): raw=82, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 80, 'SMask': <PDFObjRef:38>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 295}> 2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread] 965 - Processing xobj: <PDFStream(31): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:32>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}> 2022-03-09 
09:46:12 [INFO] [pdfinterp-MainThread] 965 - Processing xobj: <PDFStream(29): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:30>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}> 2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread] 965 - Processing xobj: <PDFStream(27): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:28>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}> 2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread] 965 - Processing xobj: <PDFStream(27): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:28>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}> 2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread] 965 - Processing xobj: <PDFStream(29): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:30>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}> 2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread] 965 - Processing xobj: <PDFStream(29): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:30>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}> 2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread] 965 - Processing xobj: <PDFStream(29): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:30>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}> 2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread] 965 - Processing xobj: <PDFStream(29): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 
'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:30>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}> 2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread] 965 - Processing xobj: <PDFStream(29): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:30>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}> 2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread] 965 - Processing xobj: <PDFStream(33): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:34>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}> 2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread] 965 - Processing xobj: <PDFStream(39): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:40>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}> 2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread] 965 - Processing xobj: <PDFStream(35): raw=75, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 66, 'Interpolate': False, 'Length': 73, 'SMask': <PDFObjRef:36>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}> 2022-03-09 09:46:12 [INFO] [pdfinterp-MainThread] 965 - Processing xobj: <PDFStream(39): raw=76, {'BitsPerComponent': 8, 'ColorSpace': /'DeviceRGB', 'Filter': /'FlateDecode', 'Height': 67, 'Interpolate': False, 'Length': 74, 'SMask': <PDFObjRef:40>, 'Subtype': /'Image', 'Type': /'XObject', 'Width': 262}>
getRcRects, zoom= 1 getRcRects, zoom= 1 getRcRects, zoom= 1
2022-03-09 09:46:12 [INFO] [<ipython-input-88-3d717eb512cd>-MainThread] 61 - 有一个矩形可能是线段:[ 55 88 55 506]
2022-03-09 09:46:13 [INFO] [util-MainThread] 406 - 找外围最大矩形框耗时: [0.18946218490600586]s
2022-03-09 09:46:13 [INFO] [util-MainThread] 406 - 找外围最大矩形框耗时: [0.1501309871673584]s
2022-03-09 09:46:13 [INFO] [util-MainThread] 406 - 找外围最大矩形框耗时: [0.31815361976623535]s
In [89]:
rect_lines
Out[89]:
[array([ 55, 88, 55, 506])]
In [90]:
# Refine the rect regions
rect_area_rects2 = []
# Sub-images are taken from a copy so the debug outlines drawn on img0 below
# are passed to drawRects on a different array than the one being cropped.
img = img0.copy()
for r in rect_area_rects:
    if util.calc_area(r) < 1:
        continue
    _img = util.getSubImg(img, r)
    _img_bi = util.getBinary(_img, 250, 0, 255)
    util.imshow(_img_bi)
    # Morphological close with a 1-row kernel, 2 * word size wide.
    kh, kw = list(map(int, (1, word_size_as_th * 2)))
    k = np.ones((kh, kw))
    _im = cv2.morphologyEx(_img_bi, cv2.MORPH_CLOSE, k)
    util.imshow(_im, "闭运算列连接")
    # Morphological close with a 1-column kernel, 2 * word size tall.
    kh, kw = list(map(int, (word_size_as_th * 2, 1)))
    k = np.ones((kh, kw))
    _im = cv2.morphologyEx(_im, cv2.MORPH_CLOSE, k)
    util.imshow(_im, "闭运算行连接")
    temp = util.getMaxRectsTwice(_im, offset=r)
    # Draw the regions
    _ = util.drawRects(img0, temp, show_result_img=True, line_color=[255,0,255])
    rect_area_rects2.extend(temp)
rect_area_rects = rect_area_rects2
2022-03-09 09:46:16 [INFO] [util-MainThread] 406 - 找外围最大矩形框耗时: [0.5954070091247559]s 2022-03-09 09:46:16 [INFO] [util-MainThread] 406 - 找外围最大矩形框耗时: [0.6034188270568848]s 2022-03-09 09:46:16 [INFO] [util-MainThread] 508 - 有offset函数.
In [91]:
# Show the refined rect regions on the page image.
_ = util.drawRects(img0, rect_area_rects, show_result_img=True, line_color=[255,0,255])
In [92]:
# util.canShow = False
img9 = img0
img9_binary = util.getBinary(img9, 200, 255, 0)
# Start from an all-zero canvas. np.empty() returns uninitialised memory, so
# stray non-zero bytes could leak into the binarised result.
canvas = np.zeros(img9.shape[:2], dtype=np.uint8)
img9_rected = util.getBinary(
    util.drawRects(canvas, getRectsInPage(
        page, True, zoom), show_result_img=True, line_color=[255, 0, 0], title="绘制meatadata中的rectangle"),
    0,
    255,
    0
)
# util.imshow([img9_binary, img9_rected])
# Get the largest rectangular regions
# img9_rects_exclude_small_rect = util.getMaxRects(img9_rected,
# # directs=[(0, 1), (1, 0), ],
# show_result_img=True)
# Combine the image, line and rect regions found earlier.
img9_rects_exclude_small_rect = []
img9_rects_exclude_small_rect.extend(image_area_rects)
img9_rects_exclude_small_rect.extend(line_area_rects)
img9_rects_exclude_small_rect.extend(rect_area_rects)
# Illustration of the region partition
_ = util.drawRects(img9, img9_rects_exclude_small_rect,
                   line_color=[0, 0, 255], show_result_img=True)
rs = img9_rects_exclude_small_rect.copy()
# Paint the detected regions white on the binarised page.
img9_binary_filled = util.drawRects(img9_binary, rs, line_color=[
    255, 255, 255], fill_color=[255, 255, 255], show_result_img=False)
getRcRects, zoom= 1 getRcRects, zoom= 1 getRcRects, zoom= 1 getRcRects, zoom= 1
TODO 判断无线框表格区域¶
In [93]:
# Look for a borderless table: rectangles in the middle 80% of the page, some
# of which are thin enough to be ruling lines.
# NOTE(review): the original indentation was lost; the nesting of the loop
# body below is a reconstruction and must be confirmed.
h, w = page.height, page.width
w_th = w * 0.6
page_rects = util.filter_list(lambda r: h * 0.1 < r[0] and r[2] < h * 0.9 , rect_area_rects)
maybe_lines = util.filter_list(lambda r: util.calc_h(r) < line_th and util.calc_w(r) > 5 * word_size_as_th, page_rects)
# TODO
if len(page_rects) > 2 and len(maybe_lines) > 0:
    page_rects.sort(key=lambda r: (r[0], -r[2], r[1], -r[3]))
    # Group rectangles into candidate tables; thin rectangles toggle whether
    # rectangles are currently being collected.
    maybe_tbs = [[]]
    append = False
    for r in page_rects:
        if util.calc_h(r) < line_th:
            if len(maybe_tbs[-1]) == 0:
                append = not append
            elif r[0] - maybe_tbs[-1][-1][0] < 2 * word_size_as_th:
                append = not append
                maybe_tbs.append([])
            else:
                print(maybe_tbs, r)
        if append:
            maybe_tbs[-1].append(r)
    # Only the first candidate is processed.
    maybe_tb = maybe_tbs[0]
    st, ed = maybe_tb[0][0], maybe_tb[-1][2]
    words_rects = getWsRects(words, zoom)
    # Keep the words strictly between the candidate's first and last row.
    filter_wrects = de_duplicate_rect(util.filter_list(lambda x: st < x[0] and x[2] < ed, words_rects))
    print(f"原size={len(words_rects)}, 长度过滤之后{len(filter_wrects)}.")
    # util.drawRects(np.zeros(img9_rected.shape[:2]), filter_wrects + [maybe_tb[0] ,maybe_tb[-1]], show_result_img=True)
    # Close with a 1-column kernel half as tall as the candidate.
    k = np.ones((int((ed - st) / 2), 1))
    _img_close = cv2.morphologyEx(util.drawRects(img9_rected, filter_wrects, show_result_img=True, title="pdf 上的rect"),
                                  cv2.MORPH_CLOSE,
                                  k)
    util.imshow(_img_close, "_img_close")
    _rect = util.getMaxRects(_img_close)
    _rect = util.getMaxRects(util.drawRects(np.zeros(img0.shape[:2]), _rect,fill_color=[0,0,0], fill_expanding=[0, word_size_as_th] ,show_result_img=True),
                             show_result_img=True)
    # Pull the column bounds back in by roughly one word size.
    _rect = util.map_list(lambda r:[r[0], r[1] + int(word_size_as_th) - 1, r[2], r[3] - int(word_size_as_th) + 2], _rect)
    rect_area_rects.extend(_rect)
    print(rect_area_rects)
else:
    print("没找到表格候选区域")
_ = util.drawRects(img0,
                   rect_area_rects,
                   show_result_img=True)
没找到表格候选区域
In [ ]:
制作MASK¶
In [94]:
# White (255) mask with the same height and width as the page image.
mask = np.ones(img0.shape[:2]).astype(np.uint8) * 255
# Grow every region by 1.5 word sizes in rows and 1 word size in columns.
# NOTE(review): the list is rebound to its grown version, so re-running this
# cell grows the regions again.
img9_rects_exclude_small_rect = util.map_list(lambda r: util.map_list(int, [r[0] - word_size_as_th * 1.5,
                                                                            # r[1] - word_size_as_th * 1.5,
                                                                            r[1] - word_size_as_th,
                                                                            r[2] + word_size_as_th * 1.5,
                                                                            # r[3] + word_size_as_th * 1.5
                                                                            r[3] + word_size_as_th
                                                                            ]), img9_rects_exclude_small_rect)
# Paint the grown regions black (0) directly onto the mask.
_ = util.drawRects(mask,
                   img9_rects_exclude_small_rect,
                   line_color=0,
                   fill_color=0,
                   fill_expanding=[0, 0],
                   copy_for_show_result=False,
                   show_result_img=True)
1.2 确定文字区域¶
In [95]:
# # 方案1
# ws_rects = getWsRects(words, zoom)
# expand = max(0, int(word_size_as_th * zoom / 2))
# _img = np.zeros(img0.shape[:2]).astype(np.uint8)
# util.drawRects(_img, ws_rects,
# line_color=[255, 0, 0],
# fill_color=[0, 0, 0],
# fill_expanding=expand,
# show_result_img=True, copy_for_show_result=False)
# word_focused = np.bitwise_and(_img, mask)
# util.imshow(word_focused)
# points_set = []
# text_area_rects = util.getMaxRectsTwice(
# word_focused, show_result_img=True, result=points_set)
# text_area_rects = util.map_list(lambda r: [r[0] + expand - 1, r[1] + expand - 1, r[2] - expand + 2 , r[3] -expand + 2], text_area_rects )
# _ = util.drawRects(img0, text_area_rects, show_result_img=True, title="游离的文本行")
In [96]:
# Approach 2
# Split the word boxes into those lying inside a table/image/line region
# (mask_text) and the free-standing ones (ws_rects_not_in_table).
ws_rects = getWsRects(words, zoom)
ws_rects_not_in_table = []
mask_text = []
for wr in ws_rects:
    for orect in img9_rects_exclude_small_rect:
        if util.isInnerCell(wr, orect, strict=False)[0]:
            mask_text.append(wr)
            break
    else:
        # No region contains this word.
        ws_rects_not_in_table.append(wr)
_ = util.drawRects(img0, ws_rects_not_in_table, line_color=[255, 0, 0], show_result_img=True)
# ndarray.shape is (rows, cols), i.e. (height, width); the names used to be
# swapped. The structuring element that was built from them here was
# overwritten before use and has been removed.
h, w = img9_binary.shape[:2]
# img9_erode = util.erode(img9_binary_filled, k)
# util.imshow([img9_binary_filled, img9_erode])
# ws_rects = getWsRects(words, zoom)
# # Draw the text boxes
# img9_word_filled = util.drawRects(img9_binary_filled, ws_rects, fill_color=[255, 255, 255], line_color=0, show_result_img=True)
# TODO may not be needed
# ws_rects = getWsRects(words, zoom)
# Draw the text boxes
util.imshow(img9_binary_filled)
img9_word_filled = util.drawRects(np.ones(img0.shape[:2]).astype(np.uint8) * 255,
                                  ws_rects_not_in_table,
                                  fill_color=[255, 255, 255],
                                  line_color=[0, 0, 0],
                                  fill_expanding=expand,
                                  show_result_img=True,
                                  title="img9_word_filled")
word_focused = np.bitwise_not(img9_word_filled)
util.imshow(word_focused)
# Dilate then erode with the same kernel (1.2 word sizes tall, one wide).
k = np.ones((int(word_size_as_th * 1.2), int(word_size_as_th)))
word_focused_dilate = util.dilate(word_focused, k)
word_focused_dilate = util.erode(word_focused_dilate, k)
util.imshow(word_focused_dilate, "文字膨胀")
points_set = []
text_area_rects = util.getMaxRects(word_focused_dilate, show_result_img=True, result=points_set)
# Shrink the regions again to offset the `expand` used when drawing.
text_area_rects = util.map_list(lambda r: [r[0] + expand - 1, r[1] + expand - 1, r[2] - expand + 2, r[3] - expand + 2], text_area_rects)
2022-03-09 09:46:19 [INFO] [util-MainThread] 206 - left_in 2022-03-09 09:46:19 [INFO] [util-MainThread] 206 - left_in 2022-03-09 09:46:19 [INFO] [util-MainThread] 206 - left_in 2022-03-09 09:46:19 [INFO] [util-MainThread] 210 - right_in 2022-03-09 09:46:19 [INFO] [util-MainThread] 206 - left_in 2022-03-09 09:46:19 [INFO] [util-MainThread] 206 - left_in 2022-03-09 09:46:19 [INFO] [util-MainThread] 206 - left_in 2022-03-09 09:46:19 [INFO] [util-MainThread] 206 - left_in 2022-03-09 09:46:19 [INFO] [util-MainThread] 206 - left_in 2022-03-09 09:46:19 [INFO] [util-MainThread] 206 - left_in 2022-03-09 09:46:19 [INFO] [util-MainThread] 206 - left_in 2022-03-09 09:46:19 [INFO] [util-MainThread] 206 - left_in 2022-03-09 09:46:19 [INFO] [util-MainThread] 206 - left_in 2022-03-09 09:46:19 [INFO] [util-MainThread] 206 - left_in 2022-03-09 09:46:19 [INFO] [util-MainThread] 206 - left_in 2022-03-09 09:46:19 [INFO] [util-MainThread] 206 - left_in 2022-03-09 09:46:19 [INFO] [util-MainThread] 206 - left_in 2022-03-09 09:46:19 [INFO] [util-MainThread] 206 - left_in
2022-03-09 09:46:20 [INFO] [util-MainThread] 350 - 关键字参数中存在result, 将会将点集放入result中.
2022-03-09 09:46:20 [INFO] [util-MainThread] 406 - 找外围最大矩形框耗时: [0.1436166763305664]s
In [97]:
# Same pipeline as the free-text cell, but for the word boxes that lie inside
# table/rect regions (mask_text).
# The cv2.getStructuringElement kernel previously built at the top of this
# cell was reassigned below before anything read it, so it is no longer created.
img9_word_filled = util.drawRects(np.ones(img0.shape[:2]).astype(np.uint8) * 255,
                                  mask_text,
                                  fill_color=[255, 255, 255],
                                  line_color=[0, 0, 0],
                                  fill_expanding=expand,
                                  show_result_img=True,
                                  title="img9_word_filled")
word_focused = np.bitwise_not(img9_word_filled)
util.imshow(word_focused)

k = np.ones((int(word_size_as_th), int(word_size_as_th * 1.2)))
word_focused_dilate = util.dilate(word_focused, k)
word_focused_dilate = util.erode(word_focused_dilate, k)
util.imshow(word_focused_dilate, "文字膨胀")

points_set = []
# NOTE(review): the rectangles are extracted from word_focused, not from
# word_focused_dilate (which is only displayed above) — confirm this is intended.
text_area_rects_mask = util.getMaxRectsTwice(word_focused, show_result_img=True, result=points_set)
# Undo the fill_expanding margin that was added when the boxes were drawn.
text_area_rects_mask = util.map_list(
    lambda r: [r[0] + expand - 1, r[1] + expand - 1, r[2] - expand + 2, r[3] - expand + 2],
    text_area_rects_mask,
)
2022-03-09 09:46:21 [INFO] [util-MainThread] 406 - 找外围最大矩形框耗时: [0.05585312843322754]s 2022-03-09 09:46:21 [INFO] [util-MainThread] 350 - 关键字参数中存在result, 将会将点集放入result中.
2022-03-09 09:46:21 [INFO] [util-MainThread] 406 - 找外围最大矩形框耗时: [0.1934823989868164]s
1.2.1 根据字号先划分成多块¶
In [98]:
# Word boxes requested with save_float=True, then grouped by the value
# r[2] - r[0] formatted to two decimals (the dict keys are those strings).
# NOTE(review): the getWsRects definition shown at the top of this notebook
# takes only (words, zoom) — confirm the definition in scope accepts save_float.
ws_rects_float = getWsRects(words, zoom, save_float=True)
ws_rects_dict = util.groupby(ws_rects_float, key=lambda r: "%.2f" % (r[2] - r[0]))
1.2.2 再确定文本区域¶
In [99]:
# For every font-size group: draw its word boxes (expanded) on a blank canvas,
# mask the canvas, extract the enclosing rectangles and keep the tall-enough ones.
text_area_rects_2 = []
for k, group in ws_rects_dict.items():
    # k is the "%.2f"-formatted group key; parse it as a number rather than eval() it.
    font_size = float(k)
    _expand = max(0, int(font_size * zoom * 0.6))
    _img = np.zeros(img0.shape[:2]).astype(np.uint8)
    for r in group:
        util.drawRects(_img, util.map_list(int, r),
                       line_color=[255, 0, 0],
                       fill_color=[0, 0, 0],
                       copy_for_show_result=False,
                       fill_expanding=_expand,
                       show_result_img=False,
                       title=f"字号 {k}")
    _img = np.bitwise_and(_img, mask)
    temp = util.getMaxRectsTwice(_img, line_color=[255, 0, 0], show_result_img=True, title=f"字号 {k}")
    # Undo the expansion margin applied while drawing.
    temp = util.map_list(
        lambda r: [r[0] + _expand - 1, r[1] + _expand - 1, r[2] - _expand + 2, r[3] - _expand + 2],
        temp,
    )
    for r in temp:
        if (r[2] - r[0]) > font_size * 0.8:
            text_area_rects_2.append(r)
    # TODO: decide how rectangles nested inside already-collected ones should be merged.

# The per-font-size result replaces the previous text_area_rects entirely (the
# old list used to be appended to as well, but it was discarded right here),
# then the in-table text areas are added.
text_area_rects = text_area_rects_2
text_area_rects.extend(text_area_rects_mask)
# text_area_rects = list(filter(lambda r: util.calc_area(r) > word_size_as_th ** 2, text_area_rects))
2022-03-09 09:46:21 [INFO] [util-MainThread] 406 - 找外围最大矩形框耗时: [0.005983114242553711]s
2022-03-09 09:46:21 [INFO] [util-MainThread] 406 - 找外围最大矩形框耗时: [0.18650007247924805]s 2022-03-09 09:46:21 [INFO] [util-MainThread] 406 - 找外围最大矩形框耗时: [0.004987955093383789]s
2022-03-09 09:46:21 [INFO] [util-MainThread] 406 - 找外围最大矩形框耗时: [0.13463973999023438]s 2022-03-09 09:46:21 [WARNING] [util-MainThread] 363 - 图片不存在白色区域!
2022-03-09 09:46:21 [WARNING] [util-MainThread] 363 - 图片不存在白色区域!
1.3 绘制区域示意图¶
In [100]:
# Rectangle groups, listed in the same order as the legend colors below.
rects = [
    image_area_rects,
    line_area_rects,
    rect_area_rects,
    text_area_rects,
    getWsRects(words, zoom),
]
colors = [Color.IMAGE, Color.LINE, Color.RECT, Color.TEXT_AREA, Color.TEXT_LINE]

# Legend strip: one 10x10 swatch per color, joined left to right.
color_img = np.concatenate(
    [np.reshape(np.array([color] * 100).astype(np.uint8), (10, 10, 3)) for color in colors],
    1,
)
util.imshow(color_img, " | ".join(["image", "line", "rectangle", "text_area", "text_line"]))

# Draw every group onto one copy of the page, showing the image after each group.
img = img0.copy()
for group, color in zip(rects, colors):
    util.drawRects(img, group,
                   line_color=color,
                   show_result_img=True,
                   copy_for_show_result=False)
1.4 页面表格化¶
In [101]:
# All-zero canvas with the page's shape and dtype; every rectangle group is
# drawn onto it in place and the result is shown once at the end.
img = img0.copy() * 0
for group, color in zip(rects, colors):
    util.drawRects(
        img,
        group,
        line_color=color,
        show_result_img=False,
        copy_for_show_result=False,
    )
util.imshow(img)
In [102]:
img_bi = util.getBinary(img, 1, 0, 255)
util.imshow(img_bi)
page_areas = []
page_areas_imgs = []


def extractLines(img_bi, hline=True, rate=0.95):
    """Extract lines from *img_bi* and return util.getSubImgsAndPos for them.

    Args:
        img_bi: image handed to the util line extractor and the splitter.
        hline: use util.extractHline when true, util.extractVline otherwise.
        rate: forwarded unchanged to the chosen extractor.

    Returns:
        Whatever util.getSubImgsAndPos(img_bi, lines) returns.
    """
    import util
    extractor = util.extractHline if hline else util.extractVline
    return util.getSubImgsAndPos(img_bi, extractor(img_bi, rate=rate))


imgs, pos = extractLines(img_bi)
util.imshow(imgs)
page_areas_imgs += imgs
page_areas += util._offset([0, 0], pos)
1.5 页面区域再划分¶
In [103]:
# Draw every rectangle group on an all-zero canvas.
img = img0.copy() * 0
for group, color in zip(rects, colors):
    util.drawRects(
        img,
        group,
        line_color=color,
        show_result_img=False,
        copy_for_show_result=False,
    )
util.imshow(img)

# Binarize the canvas and rebuild the page structure; page_rebulid fills `rs`
# with rectangles and its return value is dumped to PAGE_OUT.
img_bi = util.getBinary(img, 1, 0, 255)
rs = []
rebuilt = util.page_rebulid(img_bi, rects=rs, rate=0.98)
util.dump(rebuilt, PAGE_OUT)
_ = util.drawRects(img, rs, line_color=[255, 255, 255], show_result_img=True)

# Show the resulting areas on a fresh copy of the page.
page_areas = rs
img = img0.copy()
_ = util.drawRects(
    img,
    page_areas,
    show_result_img=True,
    line_color=[0, 0, 0],
    title="区域划分示意图",
)
2022-03-09 09:46:25 [INFO] [util-MainThread] 1086 - start=0, end=44
2022-03-09 09:46:25 [INFO] [util-MainThread] 1086 - start=0, end=30
2022-03-09 09:46:25 [INFO] [util-MainThread] 1086 - start=0, end=621
2022-03-09 09:46:25 [INFO] [util-MainThread] 1086 - start=0, end=106 2022-03-09 09:46:25 [INFO] [util-MainThread] 1227 - 没有子图了 2022-03-09 09:46:25 [INFO] [util-MainThread] 1227 - 没有子图了 2022-03-09 09:46:25 [INFO] [util-MainThread] 1227 - 没有子图了 2022-03-09 09:46:26 [INFO] [util-MainThread] 1227 - 没有子图了
In [104]:
# Rectangle groups, listed in the same order as the legend colors and labels;
# the page areas are drawn together with the rect regions.
rects = [
    image_area_rects,
    line_area_rects,
    rect_area_rects + page_areas,
    text_area_rects,
    getWsRects(words, zoom),
]
colors = [Color.IMAGE, Color.LINE, Color.RECT, Color.TEXT_AREA, Color.TEXT_LINE]
ls = ["image", "line", "rectangle", "text_area", "text_line"]

# Legend strip: one 10x10 swatch per color, joined left to right.
color_img = np.concatenate(
    [np.reshape(np.array([color] * 100).astype(np.uint8), (10, 10, 3)) for color in colors],
    1,
)
util.imshow(color_img, " | ".join(ls))

# Draw every group onto one copy of the page, showing the image after each group.
img = img0.copy()
for group, color, label in zip(rects, colors, ls):
    util.drawRects(img, group,
                   line_color=color,
                   show_result_img=True,
                   copy_for_show_result=False,
                   title=label)
util.imshow(img)
2.1 图片, rect内部再划分¶
In [105]:
# text_area_rects_add = []
# import pdfplumber
# page = pdfplumber.open(pdf_path).pages[0]
# for _rect in rect_area_rects:
# if util._calc_area(_rect) < word_size_as_th ** 2 * 4:
# continue
# crop_box = page.within_bbox(util.xy2rc(_rect, 1 / zoom))
# _words = crop_box.extract_words(
# keep_blank_chars=False,
# x_tolerance=word_size_as_th * 1.5,
# y_tolerance=word_size_as_th * 0.25)
# _rects = []
# for category in rects:
# for r in category:
# if util.isInnerCell(r, _rect)[0]:
# _rects.append(r)
# _img = util.getSubImg(
# util.drawRects(img0, getWsRects(_words, zoom=zoom),
# line_color=[255,0,0],
# show_result_img=True, title="绘制文本行"),
# _rect
# )
# th = DataFrame(_img.flatten()).value_counts().index[0][0]
# _img_bi = np.bitwise_not(util.getBinary(_img, th - 2, 255, 0))
# util.imshow(_img_bi, "Rect 子图")
# kh, kw = list(map(int, (1, word_size_as_th)))
# k = np.ones((kh, kw))
# _im = cv2.morphologyEx(_img_bi, cv2.MORPH_CLOSE, k)
# util.imshow(_im, "闭运算列连接")
# kh, kw = list(map(int, (word_size_as_th, 1)))
# k = np.ones((kh, kw))
# _im = cv2.morphologyEx(_im, cv2.MORPH_CLOSE, k)
# util.imshow(_im, "闭运算行连接")
# text_area_rects_add.extend(util.getMaxRectsTwice(_im,
# directs=[(0, 1), (1, 0)],
# show_result_img=True,
# filter=lambda rect: util._calc_area(
# rect) > word_size_as_th ** 2,
# offset=_rect,
# title="绘制文本框"
# )
# )
# # util.dump(_words, "ws.json")
In [106]:
def mergeTextArea(text_area_rects, text_area_rects_add, rate=0.8):
    """Collect the outer rectangle of every containment found between two lists.

    Two passes are made over every (text_area_rects, text_area_rects_add)
    pair: first testing the added rectangle as the inner cell of the existing
    one, then the existing rectangle as the inner cell of the added one.
    A candidate whose str() was already recorded as an inner cell is skipped.

    Args:
        text_area_rects: existing rectangles.
        text_area_rects_add: additional rectangles.
        rate: forwarded to util.isInnerCell.

    Returns:
        List of the `outer` values reported by util.isInnerCell for each hit;
        rectangles that take part in no containment are not returned.
    """
    merged = []
    seen_inner = set()

    def _try_merge(candidate, container):
        # Skip candidates already absorbed as an inner cell.
        if str(candidate) in seen_inner:
            return
        hit, outer, inner = util.isInnerCell(candidate, container, strict=False, rate=rate)
        if hit:
            merged.append(outer)
            seen_inner.add(str(inner))

    # Pass 1: an added rectangle inside an existing one.
    for existing in text_area_rects:
        for added in text_area_rects_add:
            _try_merge(added, existing)
    # Pass 2: an existing rectangle inside an added one.
    for existing in text_area_rects:
        for added in text_area_rects_add:
            _try_merge(existing, added)
    return merged
2.2 再次绘制图像¶
In [107]:
pre = text_area_rects.copy()
In [108]:
# Rebind text_area_rects to the snapshot `pre`, then take a fresh copy as the
# new snapshot so later in-place changes to text_area_rects do not touch it
# (presumably so the following cells can be re-run from the same starting list).
text_area_rects = pre
pre = text_area_rects.copy()
# Draw the restored rectangles on the current _img and show the result.
_ = util.drawRects(_img, text_area_rects,
show_result_img=True)
In [109]:
# 大文本区域细分
import pdfplumber
page = pdfplumber.open(pdf_path).pages[0]
length = len(text_area_rects)
for i in range(length):
tr = text_area_rects[i]
if (tr[2] - tr[0]) > 4 * word_size_as_th and (tr[3] - tr[1]) > img0.shape[1] * 0.5:
util.drawRects(img0, tr, show_result_img=True)
_img_bi = util.getBinary(util.getSubImg(img0, tr), 250, 0, 255)
util.imshow(_img_bi, "Rect 子图")
kh, kw = list(map(int, (1, word_size_as_th)))
k = np.ones((kh, kw))
_im = cv2.morphologyEx(_img_bi, cv2.MORPH_CLOSE, k)
util.imshow(_im, "闭运算列连接")
kh, kw = list(map(int, (word_size_as_th * 1.8, 1)))
k = np.ones((kh, kw))
_im = cv2.morphologyEx(_im, cv2.MORPH_CLOSE, k)
util.imshow(_im, "闭运算行连接")
temp = util.getMaxRects(_im, offset=tr, show_result_img=True)
if len(temp) > 1:
text_area_rects[i] = temp[0]
text_area_rects.extend(temp[1:])
# 过滤 小文本区域
text_area_rects = list(filter(lambda r: util.calc_area(r) > word_size_as_th ** 2, text_area_rects))
util.drawRects(_img, text_area_rects,
show_result_img=True)
# util.dump(_words, "ws.json")
2022-03-09 09:46:30 [INFO] [pdfdocument-MainThread] 944 - xref found: pos=b'554914' 2022-03-09 09:46:30 [INFO] [pdfdocument-MainThread] 962 - read_xref_from: start=554914, token=/b'xref' 2022-03-09 09:46:30 [INFO] [pdfdocument-MainThread] 134 - xref objects: {1: (None, 17, 0), 2: (None, 66, 0), 3: (None, 122, 0), 4: (None, 209, 0), 5: (None, 605, 0), 6: (None, 2432, 0), 7: (None, 2569, 0), 8: (None, 2597, 0), 9: (None, 2763, 0), 10: (None, 2834, 0), 11: (None, 3089, 0), 12: (None, 363150, 0), 13: (None, 363315, 0), 14: (None, 372526, 0), 15: (None, 372703, 0), 16: (None, 372941, 0), 17: (None, 373567, 0), 18: (None, 373738, 0), 19: (None, 373978, 0), 20: (None, 549919, 0), 21: (None, 549945, 0), 22: (None, 550127, 0), 23: (None, 550370, 0), 24: (None, 550790, 0), 25: (None, 550967, 0), 26: (None, 551222, 0), 27: (None, 551434, 0), 28: (None, 551699, 0), 29: (None, 551930, 0), 30: (None, 552195, 0), 31: (None, 552426, 0), 32: (None, 552691, 0), 33: (None, 552922, 0), 34: (None, 553187, 0), 35: (None, 553418, 0), 36: (None, 553682, 0), 37: (None, 553913, 0), 38: (None, 554184, 0), 39: (None, 554418, 0), 40: (None, 554683, 0)} 2022-03-09 09:46:30 [INFO] [pdfdocument-MainThread] 976 - trailer: {'Root': <PDFObjRef:1>, 'Info': <PDFObjRef:3>, 'Size': 41, 'ID': [b'\xf3W\xa2Lo\xc4\x8cN\x00&\xdd\x07\xf7\xe3\x143', b'\xf3W\xa2Lo\xc4\x8cN\x00&\xdd\x07\xf7\xe3\x143']} 2022-03-09 09:46:30 [INFO] [pdfdocument-MainThread] 182 - trailer: {'Root': <PDFObjRef:1>, 'Info': <PDFObjRef:3>, 'Size': 41, 'ID': [b'\xf3W\xa2Lo\xc4\x8cN\x00&\xdd\x07\xf7\xe3\x143', b'\xf3W\xa2Lo\xc4\x8cN\x00&\xdd\x07\xf7\xe3\x143']} 2022-03-09 09:46:30 [INFO] [pdfpage-MainThread] 109 - Pages: Kids=[<PDFObjRef:4>] 2022-03-09 09:46:30 [INFO] [pdfpage-MainThread] 113 - Page: {'Contents': <PDFObjRef:5>, 'Group': {'CS': /'DeviceRGB', 'S': /'Transparency', 'Type': /'Group'}, 'MediaBox': [0, 0, 595.32, 841.92], 'Parent': <PDFObjRef:2>, 'Resources': {'Font': {'F1': <PDFObjRef:6>, 'F2': <PDFObjRef:14>, 'F5': 
<PDFObjRef:17>, 'F6': <PDFObjRef:21>, 'F7': <PDFObjRef:24>}, 'ProcSet': [/'PDF', /'Text', /'ImageB', /'ImageC', /'ImageI'], 'XObject': {'Image692': <PDFObjRef:27>, 'Image696': <PDFObjRef:29>, 'Image698': <PDFObjRef:31>, 'Image704': <PDFObjRef:33>, 'Image706': <PDFObjRef:35>, 'Image709': <PDFObjRef:37>, 'Image713': <PDFObjRef:39>}}, 'Tabs': /'S', 'Type': /'Page'}
Out[109]:
array([[0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], ..., [0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)
In [110]:
# 调整文本行
# _img = np.zeros(img0.shape[:2])
# util.drawRects(_img, text_area_rects, fill_color=[0,0,0],
# fill_expanding=[expand, 0],
# copy_for_show_result=False)
# text_area_rects = util.getMaxRectsTwice(_img, show_result_img=True)
# text_area_rects = util.map_list(lambda r: [r[0] + expand - 1,
# r[1],
# r[2] - expand + 1 ,
# r[3]], text_area_rects )
In [111]:
# 重构页面结构
img = img0.copy() * 0
rects = [image_area_rects,
line_area_rects ,
rect_area_rects,
text_area_rects,
getWsRects(words, zoom)]
colors = [Color.IMAGE, Color.LINE, Color.RECT, Color.TEXT_AREA, Color.TEXT_LINE]
for r, c in zip(rects, colors):
util.drawRects(img, r,
line_color=c,
show_result_img=False,
copy_for_show_result=False)
util.imshow(img)
img_bi = util.getBinary(img, 1, 0, 255)
rs = []
util.dump(util.page_rebulid(img_bi, rects=rs, rate = 0.98), PAGE_OUT)
_ = util.drawRects(img, rs, line_color=[255,255,255], show_result_img=True)
page_areas = de_duplicate_rect(rs)
img = img0.copy()
_ = util.drawRects(img,
page_areas,
show_result_img=True,
line_color=[0,0,0],
title="区域划分示意图")
2022-03-09 09:46:32 [INFO] [util-MainThread] 1086 - start=0, end=44
2022-03-09 09:46:32 [INFO] [util-MainThread] 1086 - start=0, end=30
2022-03-09 09:46:32 [INFO] [util-MainThread] 1086 - start=0, end=621
2022-03-09 09:46:32 [INFO] [util-MainThread] 1086 - start=0, end=106 2022-03-09 09:46:32 [INFO] [util-MainThread] 1227 - 没有子图了 2022-03-09 09:46:32 [INFO] [util-MainThread] 1227 - 没有子图了 2022-03-09 09:46:32 [INFO] [util-MainThread] 1227 - 没有子图了 2022-03-09 09:46:32 [INFO] [util-MainThread] 1227 - 没有子图了
In [115]:
# Rectangle groups, listed in the same order as the legend colors and labels.
rects = [
    image_area_rects,
    line_area_rects,
    rect_area_rects,
    # mergeTextArea(text_area_rects ,text_area_rects_add),
    text_area_rects,
    getWsRects(words, zoom),
]
colors = [Color.IMAGE, Color.LINE, Color.RECT, Color.TEXT_AREA, Color.TEXT_LINE]
ls = ["image", "line", "rectangle", "text_area", "text_line"]

# Legend strip: one 10x10 swatch per color, joined left to right.
color_img = np.concatenate(
    [np.reshape(np.array([color] * 100).astype(np.uint8), (10, 10, 3)) for color in colors],
    1,
)
util.imshow(color_img, " | ".join(ls))

# Draw every group onto one copy of the page, showing the image after each group.
img = img0.copy()
for group, color, label in zip(rects, colors, ls):
    util.drawRects(img, group,
                   line_color=color,
                   show_result_img=True,
                   copy_for_show_result=False,
                   title=label)
# Finally overlay the page areas.
_ = util.drawRects(img, page_areas, show_result_img=True, line_color=[0, 0, 0])
In [116]:
# 画上区域
_img = util.drawRects(img0, page_areas, show_result_img=True, line_color=[0,0,0])
util.imshow(np.concatenate((img0, _img), 1), "划分示意图")
3.1 输出区域信息¶
In [114]:
class PageAreaMetaClass(type):
    """Metaclass that prints its construction arguments for every class it creates."""

    def __new__(cls, name, bases, attrs):
        created = type.__new__(cls, name, bases, attrs)
        # Debug trace of the metaclass, class name, bases and namespace.
        print(cls, name, bases, attrs)
        return created
# class PageArea(dict, metaclass=PageAreaMetaClass):
class PageArea(dict):
    """Tree of page regions built from a flat list of rectangles.

    The rectangle with the largest area becomes this node's own region
    (``rect``); all rectangles with a different area are its descendants
    (``children_rects``).  Descendants that span (almost) the full height or
    width of the node become child nodes, each built recursively from the
    rectangles that util.isInnerCell reports as lying inside it.

    Rectangles are indexed as (r0, c0, r1, c1): height is rect[2] - rect[0]
    and width is rect[3] - rect[1].

    Attributes set on every node: ``area``, ``rect``, ``children_rects``.
    Attributes set only when the node has descendants: ``tag`` (HORIZON or
    VERTICAL) and ``children_pages``.
    """

    # A descendant counts as spanning a dimension when its size divided by the
    # node's size exceeds this ratio.
    THRESHOLD = 0.98
    HORIZON = "horizon"
    VERTICAL = "vertical"

    def _calc_h(self, rect):
        """Return rect[2] - rect[0]."""
        return rect[2] - rect[0]

    def _calc_w(self, rect):
        """Return rect[3] - rect[1]."""
        return rect[3] - rect[1]

    def calc_h_w(self, rect):
        """Return the (height, width) pair of *rect*."""
        return self._calc_h(rect), self._calc_w(rect)

    def __init__(self, rects):
        """Build the node (and, recursively, its children) from *rects*.

        Raises:
            ValueError: if *rects* is empty (from max()), or if the first
                descendant spans neither exactly one of height / width.
        """
        max_area = max(util.map_list(util.calc_area, rects))
        self.area = max_area
        # The first rectangle with the largest area is this node's own region.
        for r in rects:
            if util.calc_area(r) == max_area:
                self.rect = list(r)
                break
        # Every rectangle with a different area is a descendant candidate.
        children_rects = [rect for rect in rects if util.calc_area(rect) != max_area]
        children_rects.sort(key=lambda rect: (rect[0], rect[2], rect[1], rect[3]))
        self.children_rects = util.map_list(list, children_rects)
        if not children_rects:
            # Leaf node: `tag` and `children_pages` are intentionally not set.
            return

        # The first descendant decides the split direction.
        h, w = self.calc_h_w(children_rects[0])
        h1, w1 = self.calc_h_w(self.rect)
        if h / h1 > self.THRESHOLD and w / w1 < self.THRESHOLD:
            mode = self.VERTICAL
        elif h / h1 < self.THRESHOLD and w / w1 > self.THRESHOLD:
            mode = self.HORIZON
        else:
            raise ValueError("未知的分类")

        # Descendants spanning the full height (VERTICAL) or width (HORIZON)
        # become the direct children.
        if mode == self.VERTICAL:
            key_rects = [rect for rect in children_rects
                         if self._calc_h(rect) / h1 > self.THRESHOLD]
        else:
            key_rects = [rect for rect in children_rects
                         if self._calc_w(rect) / w1 > self.THRESHOLD]

        # Group every descendant under each direct child that contains it.
        # Groups are keyed by str(rect) so identical rectangles share a group;
        # the rectangle itself is passed to isInnerCell instead of being
        # rebuilt from its string form with eval().
        chs_dict = {}
        for _rect in children_rects:
            for key_rect in key_rects:
                if util.isInnerCell(_rect, key_rect)[0]:
                    chs_dict.setdefault(str(key_rect), []).append(_rect)

        self.tag = mode
        self.children_pages = [PageArea(group) for group in chs_dict.values()]

    def __getitem__(self, key):
        """Return the attribute named *key*, or None when it does not exist."""
        return getattr(self, key, None)

    def __repr__(self):
        # NOTE(review): eval() re-parses str(self.__dict__), in which nested
        # PageArea values appear as their own util.dumps output, so nested
        # nodes end up as plain containers before dumping.  This only works
        # while that text is a valid Python literal; it is kept as-is because
        # util.dumps is not visible here — consider a recursive to-dict helper.
        return util.dumps(eval(str(self.__dict__)))

    __str__ = __repr__
PageArea(page_areas)
Out[114]:
{ "area": 501832, "rect": [ 0, 0, 842, 596 ], "children_rects": [ [ 21, 0, 65, 596 ], [ 64, 0, 94, 596 ], [ 93, 0, 714, 596 ], [ 713, 0, 819, 596 ] ], "tag": "horizon", "children_pages": [ { "area": 26224, "rect": [ 21, 0, 65, 596 ], "children_rects": [] }, { "area": 17880, "rect": [ 64, 0, 94, 596 ], "children_rects": [] }, { "area": 370116, "rect": [ 93, 0, 714, 596 ], "children_rects": [] }, { "area": 63176, "rect": [ 713, 0, 819, 596 ], "children_rects": [] } ] }
In [ ]:
In [188]:
_page = PageArea(page_areas).children_pages[2]
_rect = _page.rect
print(_page)
util.drawRects(img0, _rect, line_color=[0, 0, 255], show_result_img=True,
title=str(_rect))
# for _page in PageArea(page_areas).children_pages:
# _rect = _page.rect
# util.drawRects(img0, _rect, line_color=[0, 0, 255], show_result_img=True,
# title=str(_rect))
{ "area": 132312, "rect": [ 86, 0, 308, 596 ], "children_rects": [ [ 86, 39, 308, 305 ], [ 86, 304, 308, 562 ], [ 91, 39, 117, 305 ], [ 116, 39, 307, 305 ] ], "tag": "vertical", "children_pages": [ { "area": 59052, "rect": [ 86, 39, 308, 305 ], "children_rects": [ [ 91, 39, 117, 305 ], [ 116, 39, 307, 305 ] ], "tag": "horizon", "children_pages": [ { "area": 6916, "rect": [ 91, 39, 117, 305 ], "children_rects": [] }, { "area": 50806, "rect": [ 116, 39, 307, 305 ], "children_rects": [] } ] }, { "area": 57276, "rect": [ 86, 304, 308, 562 ], "children_rects": [] } ] }
Out[188]:
array([[[255, 255, 255], [255, 255, 255], [255, 255, 255], ..., [255, 255, 255], [255, 255, 255], [255, 255, 255]], [[255, 255, 255], [255, 255, 255], [255, 255, 255], ..., [255, 255, 255], [255, 255, 255], [255, 255, 255]], [[255, 255, 255], [255, 255, 255], [255, 255, 255], ..., [255, 255, 255], [255, 255, 255], [255, 255, 255]], ..., [[255, 255, 255], [255, 255, 255], [255, 255, 255], ..., [255, 255, 255], [255, 255, 255], [255, 255, 255]], [[254, 254, 254], [254, 254, 254], [254, 254, 254], ..., [254, 254, 254], [254, 254, 254], [255, 255, 255]], [[255, 255, 255], [255, 255, 255], [255, 255, 255], ..., [255, 255, 255], [255, 255, 255], [255, 255, 255]]], dtype=uint8)
In [25]:
# Remove duplicated areas based on intersection-over-union: after sorting,
# a rectangle is dropped when its IoU with the last kept one exceeds 0.95.
sorted_page_areas = sorted(page_areas, key=lambda rect: (rect[0], rect[1], -rect[2], -rect[3]))
de_duplicate = []
for r in sorted_page_areas:
    if not de_duplicate:
        de_duplicate.append(r)
        continue
    last = de_duplicate[-1]
    if util.calc_iou(last, r) > 0.95:
        # Near-duplicate of the last kept rectangle: report it and skip it.
        print('交并比 > 0.95', r, last, util.calc_iou(r, last))
    else:
        de_duplicate.append(r)
page_areas = de_duplicate

# Show every remaining area on the page, titled with its coordinates.
for _rect in de_duplicate:
    util.drawRects(img0, _rect, line_color=[0, 0, 255], show_result_img=True,
                   title=str(_rect))
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [62]:
# For every rect region: crop it, binarize, close gaps in two passes
# and collect the enclosing rectangles found inside it.
rect_area_rects2 = []
img = img0.copy()
for i, r in enumerate(rect_area_rects):
    _img = util.getSubImg(img0, r)
    _img_bi = util.getBinary(_img, 250, 0, 255)
    # util.imshow(_img_bi)

    # Morphological close with a 1 x word_size kernel.
    k = np.ones((1, int(word_size_as_th)))
    _im = cv2.morphologyEx(_img_bi, cv2.MORPH_CLOSE, k)
    util.imshow(_im, "闭运算列连接")

    # Morphological close with a word_size x 1 kernel.
    k = np.ones((int(word_size_as_th), 1))
    _im = cv2.morphologyEx(_im, cv2.MORPH_CLOSE, k)
    util.imshow(_im, "闭运算行连接")

    temp = util.getMaxRectsTwice(_im, offset=r)
    # Draw the rectangles found in this region.
    _ = util.drawRects(img0, temp, show_result_img=True, line_color=[255, 0, 255])
    rect_area_rects2.extend(temp)
2022-03-04 16:01:06 [INFO] [util-MainThread] 388 - 找外围最大矩形框耗时: [0.6801793575286865]s 2022-03-04 16:01:07 [INFO] [util-MainThread] 388 - 找外围最大矩形框耗时: [0.7130606174468994]s 2022-03-04 16:01:07 [INFO] [util-MainThread] 490 - 有offset函数.
2022-03-04 16:01:08 [INFO] [util-MainThread] 388 - 找外围最大矩形框耗时: [0.6921482086181641]s 2022-03-04 16:01:09 [INFO] [util-MainThread] 388 - 找外围最大矩形框耗时: [0.6392576694488525]s 2022-03-04 16:01:09 [INFO] [util-MainThread] 490 - 有offset函数.
2022-03-04 16:01:10 [INFO] [util-MainThread] 388 - 找外围最大矩形框耗时: [0.6811752319335938]s 2022-03-04 16:01:11 [INFO] [util-MainThread] 388 - 找外围最大矩形框耗时: [0.7280511856079102]s 2022-03-04 16:01:11 [INFO] [util-MainThread] 490 - 有offset函数.
2.3 TODO 大的文本块, 再判断是不是表格¶
聚焦文字, 分析是否有多个散开的文本行, 文本块
In [ ]:
# 3.1
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
img = m.copy()
all_points = list(zip(*(np.where(img > 0))))
im = img == 100
np.where(im)[0]
np.where(img)
In [ ]:
# Erode with a full-height, 0.6 * word_size wide kernel, then look for
# vertical lines in the result.
# NOTE(review): the kernel height comes from `_img` while the eroded image is
# `_img_` — confirm both names are meant.
kh = int(_img.shape[0])
kw = int(word_size_as_th * 0.6)
k = np.ones((kh, kw))
_img_erode = util.erode(_img_, k)
util.imshow(_img_erode)

lines = util.extractVline(_img_erode)
print(f"找到 {len(lines)} 条竖线{lines}")
util.drawRects(_img_, util.xy2rc(lines), show_result_img=True)
if len(lines) > 2:
    # Cut the image at the detected lines and show the pieces.
    _imgs, pos = util.getSubImgsAndPos(_img_, lines)
    util.imshow(_imgs, "竖线分割图")
In [14]:
def line_split(points, horizonal=True):
    """Split an ordered run of (row, col) points into contiguous segments.

    Consecutive points stay in the same segment while the coordinate along
    the scan direction grows by less than 2; otherwise a new segment starts.

    Args:
        points: iterable of (row, col) pairs, in scan order.
        horizonal: compare column values when true, row values otherwise.

    Returns:
        List of segments, each a list of the original points.  Empty input
        yields an empty list.  A message is printed when more than two
        segments are found.
    """
    remaining = iter(points)
    try:
        first = next(remaining)
    except StopIteration:
        # No points at all: nothing to split.
        return []

    axis = 1 if horizonal else 0
    result = []
    # Seed the first segment with the first point and continue from the
    # second one, so the first point is not stored twice.
    line = [first]
    for p in remaining:
        if p[axis] - line[-1][axis] < 2:
            line.append(p)
        else:
            result.append(line)
            line = [p]
    result.append(line)
    if len(result) > 2:
        print(f"找到{len(result)}条线段")
    return result
from collections import OrderedDict as odict

horizon_lines = odict()
n = 0

# Row collection: gather the points of each row first, then split every row
# into segments and store them under that row's own key.  (Storing on the
# row change put each row's segments under the following row's key and lost
# the first point of every row.)
points_by_row = odict()
for p in all_points:
    points_by_row.setdefault(p[0], []).append(p)
for r, line_points in points_by_row.items():
    horizon_lines[r] = line_split(line_points)

# Group row keys into bands: a row joins the current band when it is less
# than 2 away from the band's last row.  Assumes the keys are in increasing
# row order, as produced by a row-major scan — TODO confirm for all_points.
keys = list(horizon_lines)
hareas = []
if keys:
    kset = [keys[0]]
    for k in keys[1:]:
        if k - kset[-1] < 2:
            kset.append(k)
        else:
            hareas.append(kset)
            kset = [k]
    hareas.append(kset)
def isConnect(l1, l2):
    """Return True when the column spans of two segments overlap.

    Each segment is a non-empty sequence of (row, col) points; its span runs
    from the column of its first point to the column of its last point
    (start <= end is assumed).  Spans that are identical, that enclose one
    another, or that merely share an end column all count as connected.
    """
    l1_start, l1_end = l1[0][1], l1[-1][1]
    l2_start, l2_end = l2[0][1], l2[-1][1]
    # Closed-interval overlap test; covers containment in either direction.
    return l1_start <= l2_end and l2_start <= l1_end
# Scratch code: experiments on grouping the row segments in horizon_lines
# into connected areas, band by band.
# NOTE(review): indentation was lost in this export, so the nesting of the
# loops below cannot be confirmed from the text alone; the code is kept
# exactly as found.
# Work on the second band of rows.
r_range = hareas[1]
flag_list = []
init_lines = horizon_lines[r_range[0]]
line_flags = []
for line in init_lines:
for point in line:
flag = {}
# NOTE(review): `point` iterates the elements of a segment, yet it is indexed
# as point[0][1] / point[-1][1] as if it were a segment itself — confirm.
flag["start"] = (point[0][1], point[-1][1])
flag["mode"] = 0
line_flags.append(flag)
flag_list.append(line_flags)
# NOTE(review): `pre` is also the name of the text_area_rects snapshot taken
# in an earlier cell; this assignment rebinds it.
pre = r_range[0]
for cur in range(r_range[0], r_range[-1] + 1):
cur_line = horizon_lines[cur]
areas = []
for kset in hareas:
# A band with a single row contributes that row's segments directly.
if len(kset) == 1:
areas.append(horizon_lines[kset[0]])
ind = 0
# Merge horizontal lines
walked = set()
ind = 0
max_ind = len(kset)
pre = kset[0]
pre_lines = horizon_lines[pre]
# Seed one area per segment of the band's first row; each area is a set
# holding str() of the segment's first point, and every point is marked walked.
for l in pre_lines:
area = set()
area.add(str(l[0]))
areas.append(area)
for p in l:
walked.add(p)
# NOTE(review): `pre` is not advanced inside this loop as written, and `area`
# refers to the last set created above — confirm the intended behavior.
for cur in kset[1:]:
pre_lines = horizon_lines[pre]
cur_lines = horizon_lines[cur]
for pre_l in pre_lines:
if pre_l[0] not in walked:
area.add(str(pre_l[0]))
for p in pre_l:
walked.add(p)
for cur_l in cur_lines:
if isConnect(cur_l, pre_l):
for p in cur_l:
walked.add(p)
# TEST CELL
import pdfplumber
page = pdfplumber.open(pdf_path).pages[0]
crop_box = page.within_bbox(util.xy2rc(text_area_rects[3], 1 / zoom))
crop_box.chars = []
crop_box.extract_words(
keep_blank_chars=False,
x_tolerance=word_size_as_th * 1.5,
y_tolerance=word_size_as_th * 0.25)
# TEST CELL
import pdfplumber
page = pdfplumber.open(pdf_path).pages[0]
crop_box = page.within_bbox(util.xy2rc(text_area_rects[3], 1 / zoom))
_words = crop_box.extract_words(
keep_blank_chars=False,
x_tolerance=word_size_as_th * 1.5,
y_tolerance=word_size_as_th * 0.25)
_words
找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到4条线段 找到4条线段 找到3条线段 找到4条线段 找到5条线段 找到6条线段 找到6条线段 找到3条线段 找到4条线段 找到5条线段 找到6条线段 找到6条线段 找到3条线段 找到4条线段 找到5条线段 找到6条线段 找到6条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到4条线段 找到4条线段 找到4条线段 找到4条线段 找到5条线段 找到5条线段 找到6条线段 找到6条线段 找到6条线段 找到6条线段 找到7条线段 找到8条线段 找到9条线段 找到9条线段 找到10条线段 找到10条线段 找到10条线段 找到10条线段 找到11条线段 找到12条线段 找到13条线段 找到14条线段 找到15条线段 找到16条线段 找到17条线段 找到18条线段 找到18条线段 找到3条线段 找到4条线段 找到5条线段 找到5条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段 找到3条线段
In [ ]:
In [ ]:
In [ ]:
In [ ]:
lines = util.extractHline(img9_erode) # 调整直线的位置
lines = adjustLines(lines, img9_rects_exclude_small_rect, th=word_size_as_th * 1.2 * zoom)
util.drawRects(img9, util.xy2rc(lines), show_result_img=True)
imgs, pos = util.getSubImgsAndPos(img9_word_filled, lines)
# imgs, pos = util.getSubImgsAndPos(img9, lines)
# util.imshow(imgs)
In [ ]:
kh, kw = list(map(int, (word_size_as_th * 0.6, _img.shape[1])))
k = np.ones((kh, kw))
_img_erode = util.erode(_img_, k)
util.imshow(_img_erode)
lines = util.extractHline(_img_erode)
util.drawRects(_img_, util.xy2rc(lines), show_result_img=True)
if len(lines) > 2:
pass
In [ ]:
# For every horizontal strip in `imgs`, look for wide white bars and, when
# there are more than two, cut the strip at vertical lines; the vertical
# lines are shifted by the strip's offset and appended to `lines`.
# NOTE(review): indentation was lost in this export; the code is kept exactly
# as found because it depends on the order in which the two iterators advance.
img_children, _ = util.getSubImgsAndPos(img9, lines)
img_children = iter(img_children)
# NOTE(review): `lines` is appended to further down while this iterator over
# the same list is still being consumed — confirm the interaction is intended.
lines_iter = iter(lines)
# Split into columns
offset = 0
for img in imgs:
# The strip's offset is element [1] of the next line.
offset = next(lines_iter)[1]
h, w = img.shape[:2]
whiteBars = util.getWhiteBarWidthList(img, isHline=False, rate=0.8)
# Keep only the white bars wider than word_size_as_th.
whiteBars = [bar for bar in whiteBars if bar > word_size_as_th]
image = next(img_children)
if len(whiteBars) > 2:
# Needs further splitting
kh = h * 3 // 5
kw = int(word_size_as_th * 1.2 * zoom)
k = np.ones((kh, kw))
img_erode = util.erode(img, k, anchor=[kh - 1, -1])
vlines = util.extractVline(img_erode, rate=0.75, th=word_size_as_th)
# vlines = adjustLines(vlines, img9_rects_exclude_small_rect, th=word_size_as_th, is_hline=False)
util.drawRects(image, util.xy2rc(vlines), show_result_img=True)
_imgs, _pos = util.getSubImgsAndPos(image, vlines)
# Shift elements [1] and [3] of each vertical line by the strip offset
# (in place) before adding it to the global line list.
for line in vlines:
line[1] = line[1] + offset
line[3] = line[3] + offset
print("add:", line)
lines.append(line)
# _imgs, _pos = util.getSubImgsAndPos(img, vlines)
util.imshow(_imgs)
else:
util.imshow(image)
util.canShow = True
_ = util.drawRects(img0, util.xy2rc(lines), show_result_img=True)
In [ ]:
In [ ]:
df = DataFrame(util.getWhiteBarWidthList(img9, isHline=True, rate=0.8))
series = df.value_counts()
print(series)
print(f'{series.index[0][0]} 出现次数最多, 为 {series.array[0]} 次。')
util.imshow(img9)
k = np.ones((series.index[0][0] * 2, img9.shape[1]))
img9_erode = util.erode(img9, k)
util.imshow(img9_erode)
In [ ]:
hlines = util.extractHline(img9_erode)
util.drawRects(img9, util.xy2rc(hlines), show_result_img=True)
In [ ]:
img9_imgs, img9_poses = util.getSubImgsAndPos(img9, hlines)
util.imshow(img9_imgs)
In [ ]:
img9_body = img9_imgs[1]
util.imshow(img9_body)
# rects = util.simpleLocate(np.bitwise_not(img9_body), show_result_img=True)
In [ ]:
h, w = img9_body.shape[:2]
img9_area = h * w
img9_area
META DATA 分析¶
In [ ]:
import importlib
import util
importlib.reload(util)
util.debug =False
In [ ]:
rects = [d for d in page.rects if d['y1'] < 500]
In [ ]:
import pdfplumber
import pandas as pd
from pdfplumber.page import Page

# Extract tables from the first page using only explicit lines taken from the
# edges of the previously selected `rects`.
# NOTE(review): the horizontal positions use the rects' 'y0'/'y1' values —
# confirm these are in the coordinate system the table finder expects
# (the word boxes elsewhere in this notebook use 'top'/'bottom').
with pdfplumber.open(r"C:\Users\Administrator\Desktop\xiaokuang.pdf") as pdf:
    page = pdf.pages[0]  # first page
    print(type(page.rects[0]))
    print("----------------------------------------------")
    table_settings = {
        'vertical_strategy': 'explicit',
        'horizontal_strategy': 'explicit',
        'explicit_horizontal_lines':
            [d['y0'] for d in rects] + [d['y1'] for d in rects],
        'explicit_vertical_lines':
            [d['x0'] for d in rects] + [d['x1'] for d in rects],
    }
    tables = Page.extract_tables(page, table_settings)

# Write the result through a context manager so the log file is flushed and
# closed instead of leaking the handle.
with open("log.txt", 'w', encoding="utf-8") as log_file:
    print(util.dumps(tables), file=log_file)
In [ ]:
w_s_stats
In [ ]:
importlib.reload(util)
In [ ]:
# Text boxes
# rects.extend(getWsRects(ws))
# pdf_path = r'C:\Users\Administrator\Desktop\xiaokuang.pdf'
import cv2

pdf_path = r'C:\Users\Administrator\Desktop\fl.pdf'
pg = 0
# Render the page to '<pg>.png' and read its words.
util.pdf2img(pdf_path, pg, f'{pg}.png', zoom_x=1, zoom_y=1)
ws, w_size , w_s_stats, page = util.getWords(pdf_path, pg)

img0 = None
# Bind the rendered page to the global name img<pg> (img0 for pg == 0)
# directly, instead of exec()-ing generated source code.
page_img = cv2.imread(f'{pg}.png')
globals()[f"img{pg}"] = page_img
# Read the size from the image just loaded, so this also works for pg != 0.
h, w = page_img.shape[:2]
util.dump(ws, 'ws.json')
3.1 划分表格, 图片区域¶
In [ ]:
util.imshow(img0)
# Draw the page's rects/lines/curves on a blank (all-zero) canvas and
# binarize it.  The canvas must be zero-initialised: np.empty returns
# uninitialised memory, which would leak arbitrary pixels into the mask.
blank_canvas = np.zeros(img0.shape, dtype=np.uint8)
img0_rects = util.getBinary(
    util.drawRects(blank_canvas, getRectsInPage(page), show_result_img=False, line_color=[255, 0, 0]),
    0,
    255,
    0
)
util.imshow(img0_rects)
# Get the largest enclosing rectangles.
img0_rects_exclude_small_rect = util.getMaxRects(img0_rects, show_result_img=False)
# Region overview.
_ = util.drawRects(img0, img0_rects_exclude_small_rect, line_color=[0, 0, 255], show_result_img=True)
In [ ]:
img0_binary = util.getBinary(img0, 200, 255, 0)
util.imshow(img0_binary)
rs = img0_rects_exclude_small_rect.copy()
img0_binary_filled = util.drawRects(img0_binary, rs, line_color=[255,255,255], fill_color=[255,255,255], show_result_img=True)
In [ ]:
# 绘制文本行区域
img0_binary_filled = util.drawRects(img0_binary_filled, getWsRects(ws),line_color=[0,0,0], fill_color=[255,255,255], show_result_img=True)
In [ ]:
import pandas as pd
from pandas import DataFrame
df = DataFrame(util.getWhiteBarWidthList(img0_binary_filled, rate=0.8))
df.describe()
series = df.value_counts()
print(f'{series.index[0][0]} 出现次数最多, 为 {series.array[0]} 次。')
In [ ]:
# ndarray.shape is (rows, cols, ...): height first, then width, so the
# kernel below is three quarters of the image *width* wide.
h, w = img0.shape[:2]
# Kernel height is the most frequent white-bar width found above.
k = np.ones((series.index[0][0] // 1, w * 3 // 4))
img0_erode = util.erode(img0_binary_filled, k)
util.imshow(img0_erode)
In [ ]:
lines = util.extractHline(img0_erode)
# Adjust line positions: a line whose y lies strictly between a region's
# first and third coordinate is replaced by a line ending at the first one.
for i, (x, y, x1, y1) in enumerate(lines):
    for top, _left, bottom, _right in img0_rects_exclude_small_rect:
        if not (top < y < bottom):
            continue
        print(lines[i], top)
        lines[i] = [x, max(0, top - 1), x1, top]

annotated = util.drawRects(img0, util.xy2rc(lines), show_result_img=True)
util.imwrite(annotated)
# Cut the filled binary image at the adjusted lines and show the strips.
imgs, pos = util.getSubImgsAndPos(img0_binary_filled, lines)
util.imshow(imgs)
3.1.1 处理分栏的情况¶
In [ ]:
# For every horizontal strip, look at its vertical white bars; strips with
# more than two distinct bar widths are eroded and cut at vertical lines.
for img in imgs:
    df = DataFrame(util.getWhiteBarWidthList(img, isHline=False, rate=0.8))
    series = df.value_counts()
    if series.size > 2:
        # Needs further splitting.
        print(series, f'[len={len(series)}]')
        print(f'{series.index[0][0]} 出现次数最多, 为 {series.array[0]} 次。')
        # ndarray.shape is (rows, cols, ...): height first, so the kernel
        # height below is three fifths of the strip *height*.
        h, w = img.shape[:2]
        kh = h * 3 // 5
        kw = getSeriesValue(series) // 2
        k = np.ones((kh, kw))
        img_erode = util.erode(img, k, anchor=[kh - 1, kw - 1])
        vlines = util.extractVline(img_erode, rate=0.75)
        util.drawRects(img, util.xy2rc(vlines), show_result_img=True)
        _imgs, _pos = util.getSubImgsAndPos(img, vlines)
        print(len(_imgs), vlines)
        util.imshow(_imgs)
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
import numpy as np
img0_binary = util.getBinary(img0, 200, 255, 0)
img_draw_rects = util.drawRects(img0, rects)
util.imshow(img_draw_rects)
In [ ]:
rs = list(map(lambda rect: util.xy2rc(
list(map(int, (rect["x0"], rect["top"], rect["x1"], rect["bottom"])))
),
ws))
In [ ]:
util.drawRects(img9, rects=rs, line_color=[125,0,0], show_result_img=True)
In [ ]:
In [ ]:
m9 = cv2.imread('9.png')
m9_imgs, _ = util.getSubImgsAndPos(m9, hlines)
EXT 定位有线表格区域¶
In [ ]:
for r in rects:
if util._calc_area(r) > img9_area * 0.125:
print(util._calc_area(r))
util.drawRects(m9_imgs[1], [r], line_color=[255, 0, 0], show_result_img=True)
In [ ]:
import util
pdf_path = r'D:\DongDongsFiles\file\2022-02\H2_AN202011171430084484_1.pdf'
pg = 11
util.pdf2img(pdf_path, pg, f'{pg}.png', zoom_x=1, zoom_y=1)
In [ ]:
import sys
sys.getrecursionlimit()
In [ ]:
import cv2
import numpy as np
import util
import sys
sys.setrecursionlimit(100000)
def binary_img(path):
    """Read the image at *path* and return it as a single-channel grayscale array."""
    # cv2.imread returns channels in BGR order, so the matching conversion
    # code is COLOR_BGR2GRAY (COLOR_RGB2GRAY swaps the red/blue weights).
    return cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2GRAY)


# Pixels brighter than 200 become 255, everything else 0.
tm_binary = np.where(binary_img('11.png') > 200, 255, 0).astype(np.uint8)
tm = cv2.imread('11.png')
In [ ]:
from pandas import DataFrame
df = DataFrame(util.getWhiteBarWidthList(tm_binary, isHline=True, rate=0.8))
series = df.value_counts()
print(series)
print(f'{series.index[0][0]} 出现次数最多, 为 {series.array[0]} 次。')
util.imshow(tm_binary)
# TODO, how to get 16
k = np.ones((16, tm_binary.shape[1]))
tm_erode = util.erode(tm_binary, k)
util.imshow(tm_erode)
In [ ]:
k = util.CV2_K_CROSS(tm_binary.shape[0] // 8)
util.imshow(k * 255)
In [ ]:
t = util.erode(np.bitwise_not(tm_binary), k)
util.imshow([np.bitwise_not(tm_binary), t])
In [ ]:
hlines = util.extractHline(tm_erode)
rects = util.drawRects(tm, util.xy2rc(hlines), show_result_img=True)
In [ ]:
tm_imgs, _ = util.getSubImgsAndPos(tm, hlines)
util.imshow(tm_imgs[1])
In [ ]:
_ = util.drawRects(tm, rects, show_result_img=True)