从 pdf.js 提取数据¶
In [16]:
var btn = document.createElement('div')
btn.style = "position: absolute;right:10px;top:10px;width:100px;height:50px;color:#000;z-index:99999;border:1px solid #ddd;"
btn.id = 'download'
btn.innerText = '下载字符信息'
var viewer = document.querySelector('#viewer .page')
viewer.appendChild(btn)
btn.onclick = function () {
let loc = location.href;
let infocode = (loc.match(/AN.*?(?=\/)/g) || ['words'])[0]
let page = (loc.match(/(?<=page_)\d+/g) || ['1'])[0]
let attrs = ['font-size', 'font-family', 'color', 'top', 'left', 'width', 'height', 'transform', 'direction'];
// 这是一个简化的示例,需要根据你的实际 HTML 结构和 PDF.js 用法进行调整
let factor = parseFloat(document.getElementById("viewer").style.getPropertyValue("--scale-factor"));
let viewerComputedStyle = window.getComputedStyle(document.querySelector('div.page'));
let height = viewerComputedStyle.getPropertyValue('height');
let width = viewerComputedStyle.getPropertyValue('width');
// 获取渲染后的 PDF.js 页面容器
let pdfContainer = document.getElementById('viewerContainer'); // 替换成你的 PDF 容器元素 ID
// 获取所有包含文本内容的元素
let textElements = pdfContainer.querySelectorAll('.textLayer span[style*="left"]');
let textBlocks = [];
// 遍历文本元素并获取 computed style
textElements.forEach(textElement => {
let textContent = textElement.textContent;
let computedStyle = window.getComputedStyle(textElement);
// 组织每个文本块的信息
let textBlock = {
"text": textContent,
};
// 遍历 attrs 数组,往 textBlock 添加样式属性
attrs.forEach(attr => {
// 获取属性值,并根据需要转换为数值或字符串
let value = computedStyle.getPropertyValue(attr);
// 如果需要,进行适当的处理或转换
textBlock[attr] = value;
});
let bbox = textElement.getBoundingClientRect();
textBlock['width'] = `${bbox.width}px`
textBlock['height'] = `${bbox.height}px`
let match = computedStyle.getPropertyValue('transform').match(/rotate\(([-]?\d+\.?\d*)deg\)/);
if (match) {
const rotationAngle = parseFloat(match[1]);
textBlock['angle'] = rotationAngle;
}
if (bbox.width != 0 && bbox.height != 0) {
textBlocks.push(textBlock);
}
});
// 将数据输出为 JSON 文件
// let jsonData = JSON.stringify({textBlocks:textBlocks, "scaleFactor":factor, height: height, width: width}, null, 2);
let jsonData = JSON.stringify({ textBlocks: textBlocks, "scaleFactor": factor, height: height, width: width });
// 生成一个 Blob 对象并创建下载链接
let blob = new Blob([jsonData], { type: 'application/json' });
let url = URL.createObjectURL(blob);
// 创建一个链接并模拟点击下载
let a = document.createElement('a');
a.href = url;
a.download = `${infocode}_page_${page}.json`;
document.body.appendChild(a);
a.click();
document.body.removeChild(a);
}
Cell In[16], line 1 var btn = document.createElement('div') ^ SyntaxError: invalid syntax
In [ ]:
let attrs = ['font-size', 'font-family', 'color', 'top', 'left', 'width', 'height', 'transform', 'direction'];
// 这是一个简化的示例,需要根据你的实际 HTML 结构和 PDF.js 用法进行调整
let factor = parseFloat(document.getElementById("viewer").style.getPropertyValue("--scale-factor"));
let viewerComputedStyle = window.getComputedStyle(document.querySelector('div.page'));
let height = viewerComputedStyle.getPropertyValue('height');
let width = viewerComputedStyle.getPropertyValue('width');
// 获取渲染后的 PDF.js 页面容器
let pdfContainer = document.getElementById('viewerContainer'); // 替换成你的 PDF 容器元素 ID
// 获取所有包含文本内容的元素
let textElements = pdfContainer.querySelectorAll('.textLayer span[style*="left"]');
let textBlocks = [];
// 遍历文本元素并获取 computed style
textElements.forEach(textElement => {
let textContent = textElement.textContent;
let computedStyle = window.getComputedStyle(textElement);
// 组织每个文本块的信息
let textBlock = {
"text": textContent,
};
// 遍历 attrs 数组,往 textBlock 添加样式属性
attrs.forEach(attr => {
// 获取属性值,并根据需要转换为数值或字符串
let value = computedStyle.getPropertyValue(attr);
// 如果需要,进行适当的处理或转换
textBlock[attr] = value;
});
let bbox = textElement.getBoundingClientRect();
textBlock['width'] = `${bbox.width}px`
textBlock['height'] = `${bbox.height}px`
let match = computedStyle.getPropertyValue('transform').match(/rotate\(([-]?\d+\.?\d*)deg\)/);
if (match) {
const rotationAngle = parseFloat(match[1]);
textBlock['angle'] = rotationAngle;
}
if (bbox.width != 0 && bbox.height !=0) {
textBlocks.push(textBlock);
}
});
// 将数据输出为 JSON 文件
// let jsonData = JSON.stringify({textBlocks:textBlocks, "scaleFactor":factor, height: height, width: width}, null, 2);
let jsonData = JSON.stringify({ textBlocks: textBlocks, "scaleFactor": factor, height: height, width: width });
return jsonData;
In [37]:
to_word_js = '''
let attrs = ['font-size', 'font-family', 'color', 'top', 'left', 'width', 'height', 'transform', 'direction'];
// 这是一个简化的示例,需要根据你的实际 HTML 结构和 PDF.js 用法进行调整
let factor = parseFloat(document.getElementById("viewer").style.getPropertyValue("--scale-factor"));
let viewerComputedStyle = window.getComputedStyle(document.querySelector('div.page'));
let height = viewerComputedStyle.getPropertyValue('height');
let width = viewerComputedStyle.getPropertyValue('width');
// 获取渲染后的 PDF.js 页面容器
let pdfContainer = document.getElementById('viewerContainer'); // 替换成你的 PDF 容器元素 ID
// 获取所有包含文本内容的元素
let textElements = pdfContainer.querySelectorAll('.textLayer span[style*="left"]');
let textBlocks = [];
// 遍历文本元素并获取 computed style
textElements.forEach(textElement => {
let textContent = textElement.textContent;
let computedStyle = window.getComputedStyle(textElement);
// 组织每个文本块的信息
let textBlock = {
"text": textContent,
};
// 遍历 attrs 数组,往 textBlock 添加样式属性
attrs.forEach(attr => {
// 获取属性值,并根据需要转换为数值或字符串
let value = computedStyle.getPropertyValue(attr);
// 如果需要,进行适当的处理或转换
textBlock[attr] = value;
});
let bbox = textElement.getBoundingClientRect();
textBlock['width'] = `${bbox.width}px`
textBlock['height'] = `${bbox.height}px`
let match = computedStyle.getPropertyValue('transform').match(/rotate\(([-]?\d+\.?\d*)deg\)/);
if (match) {
const rotationAngle = parseFloat(match[1]);
textBlock['angle'] = rotationAngle;
}
if (bbox.width != 0 && bbox.height !=0) {
textBlocks.push(textBlock);
}
});
// 将数据输出为 JSON 文件
// let jsonData = JSON.stringify({textBlocks:textBlocks, "scaleFactor":factor, height: height, width: width}, null, 2);
let jsonData = JSON.stringify({ textBlocks: textBlocks, "scaleFactor": factor, height: height, width: width });
return jsonData;
'''
In [4]:
QUERY_DOWNLOADS_SCRIPT = "// 获取 Downloads 列表对象\n" +"var downloadsList = window.Downloads._lists;\n" +"\n" +"// 获取所有下载项\n" +"var downloadItems = downloadsList[window.Downloads.ALL]._downloads;\n" +"\n" +"\n" +"var result = [];\n" +"// 遍历下载项并获取信息\n" +"for (var i = 0; i < downloadItems.length; i++) {\n" +" var downloadItem = downloadItems[i];\n" +"\n" +" // 获取下载文件的下载地址\n" +" var downloadUrl = downloadItem.source.url;\n" +"\n" +" // 获取下载文件的本地文件路径\n" +" var downloadPath = downloadItem.target.path;\n" +" var contentType = downloadItem.contentType;\n" +"\n" +" // 获取下载状态\n" +" var succeeded = downloadItem.succeeded;\n" +" var stopped = downloadItem.stopped;\n" +"\tresult.push({\n" +"\t\t'url': downloadUrl,\n" +"\t\t'path': downloadPath,\n" +"\t\t'type': contentType,\n" +"\t\t'succeeded': succeeded,\n" +"\t\t'stopped': stopped,\n" +"\t}); \n" +"}\n" +"return JSON.stringify(result)";
In [6]:
print(QUERY_DOWNLOADS_SCRIPT)
// 获取 Downloads 列表对象 var downloadsList = window.Downloads._lists; // 获取所有下载项 var downloadItems = downloadsList[window.Downloads.ALL]._downloads; var result = []; // 遍历下载项并获取信息 for (var i = 0; i < downloadItems.length; i++) { var downloadItem = downloadItems[i]; // 获取下载文件的下载地址 var downloadUrl = downloadItem.source.url; // 获取下载文件的本地文件路径 var downloadPath = downloadItem.target.path; var contentType = downloadItem.contentType; // 获取下载状态 var succeeded = downloadItem.succeeded; var stopped = downloadItem.stopped; result.push({ 'url': downloadUrl, 'path': downloadPath, 'type': contentType, 'succeeded': succeeded, 'stopped': stopped, }); } return JSON.stringify(result)
In [36]:
import requests
headers = {
# Already added when you pass json=
'Content-Type': 'application/json',
}
response = requests.post('http://127.0.0.1:8090/api/webdriver/pdf-text', headers=headers, json=json_data)
response.json()
--------------------------------------------------------------------------- ConnectionRefusedError Traceback (most recent call last) d:\anaconda3\envs\py36\lib\site-packages\urllib3\connection.py in _new_conn(self) 174 conn = connection.create_connection( --> 175 (self._dns_host, self.port), self.timeout, **extra_kw 176 ) d:\anaconda3\envs\py36\lib\site-packages\urllib3\util\connection.py in create_connection(address, timeout, source_address, socket_options) 94 if err is not None: ---> 95 raise err 96 d:\anaconda3\envs\py36\lib\site-packages\urllib3\util\connection.py in create_connection(address, timeout, source_address, socket_options) 84 sock.bind(source_address) ---> 85 sock.connect(sa) 86 return sock ConnectionRefusedError: [WinError 10061] 由于目标计算机积极拒绝,无法连接。 During handling of the above exception, another exception occurred: NewConnectionError Traceback (most recent call last) d:\anaconda3\envs\py36\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw) 709 headers=headers, --> 710 chunked=chunked, 711 ) d:\anaconda3\envs\py36\lib\site-packages\urllib3\connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw) 397 else: --> 398 conn.request(method, url, **httplib_request_kw) 399 d:\anaconda3\envs\py36\lib\site-packages\urllib3\connection.py in request(self, method, url, body, headers) 238 headers["User-Agent"] = _get_default_user_agent() --> 239 super(HTTPConnection, self).request(method, url, body=body, headers=headers) 240 d:\anaconda3\envs\py36\lib\http\client.py in request(self, method, url, body, headers, encode_chunked) 1286 """Send a complete request to the server.""" -> 1287 self._send_request(method, url, body, headers, encode_chunked) 1288 d:\anaconda3\envs\py36\lib\http\client.py in _send_request(self, method, url, body, headers, encode_chunked) 1332 body = _encode(body, 'body') -> 1333 self.endheaders(body, encode_chunked=encode_chunked) 1334 d:\anaconda3\envs\py36\lib\http\client.py in endheaders(self, message_body, encode_chunked) 1281 raise CannotSendHeader() -> 1282 self._send_output(message_body, encode_chunked=encode_chunked) 1283 d:\anaconda3\envs\py36\lib\http\client.py in _send_output(self, message_body, encode_chunked) 1041 del self._buffer[:] -> 1042 self.send(msg) 1043 d:\anaconda3\envs\py36\lib\http\client.py in send(self, data) 979 if self.auto_open: --> 980 self.connect() 981 else: d:\anaconda3\envs\py36\lib\site-packages\urllib3\connection.py in connect(self) 204 def connect(self): --> 205 conn = self._new_conn() 206 self._prepare_conn(conn) d:\anaconda3\envs\py36\lib\site-packages\urllib3\connection.py in _new_conn(self) 186 raise NewConnectionError( --> 187 self, "Failed to establish a new connection: %s" % e 188 ) NewConnectionError: <urllib3.connection.HTTPConnection object at 0x00000209AFB79080>: Failed to establish a new connection: [WinError 10061] 由于目标计算机积极拒绝,无法连接。 During handling of the above exception, another exception occurred: MaxRetryError Traceback (most recent call last) d:\anaconda3\envs\py36\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies) 449 retries=self.max_retries, --> 450 timeout=timeout 451 ) d:\anaconda3\envs\py36\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw) 785 retries = retries.increment( --> 786 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2] 787 ) d:\anaconda3\envs\py36\lib\site-packages\urllib3\util\retry.py in increment(self, method, url, response, error, _pool, _stacktrace) 591 if new_retry.is_exhausted(): --> 592 raise MaxRetryError(_pool, url, error or ResponseError(cause)) 593 MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=8090): Max retries exceeded with url: /api/webdriver/pdf-text (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000209AFB79080>: Failed to establish a new connection: [WinError 10061] 由于目标计算机积极拒绝,无法连接。',)) During handling of the above exception, another exception occurred: ConnectionError Traceback (most recent call last) <ipython-input-36-ede605ff4b25> in <module> 8 9 ---> 10 response = requests.post('http://127.0.0.1:8090/api/webdriver/pdf-text', headers=headers, json=json_data) 11 response.json() d:\anaconda3\envs\py36\lib\site-packages\requests\api.py in post(url, data, json, **kwargs) 115 """ 116 --> 117 return request('post', url, data=data, json=json, **kwargs) 118 119 d:\anaconda3\envs\py36\lib\site-packages\requests\api.py in request(method, url, **kwargs) 59 # cases, and look like a memory leak in others. 60 with sessions.Session() as session: ---> 61 return session.request(method=method, url=url, **kwargs) 62 63 d:\anaconda3\envs\py36\lib\site-packages\requests\sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json) 527 } 528 send_kwargs.update(settings) --> 529 resp = self.send(prep, **send_kwargs) 530 531 return resp d:\anaconda3\envs\py36\lib\site-packages\requests\sessions.py in send(self, request, **kwargs) 643 644 # Send the request --> 645 r = adapter.send(request, **kwargs) 646 647 # Total elapsed time of the request (approximately) d:\anaconda3\envs\py36\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies) 517 raise SSLError(e, request=request) 518 --> 519 raise ConnectionError(e, request=request) 520 521 except ClosedPoolError as e: ConnectionError: HTTPConnectionPool(host='127.0.0.1', port=8090): Max retries exceeded with url: /api/webdriver/pdf-text (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000209AFB79080>: Failed to establish a new connection: [WinError 10061] 由于目标计算机积极拒绝,无法连接。',))
In [13]:
import time
time.time()
Out[13]:
1701227208.4880955
In [24]:
import pandas as pd
p2h = pd.read_csv(r'D:\life-is-short-i-use-python\tb_p2h_info_config_202312071316.csv')
In [25]:
grouped = p2h.groupby('infocode')['process_record_id'].count()
# 选择符合条件(count > 1)的分组
selected_groups = grouped[grouped > 1]
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) d:\life-is-short-i-use-python\公告项目.ipynb Cell 10 line 1 ----> <a href='vscode-notebook-cell:/d%3A/life-is-short-i-use-python/%E5%85%AC%E5%91%8A%E9%A1%B9%E7%9B%AE.ipynb#X52sZmlsZQ%3D%3D?line=0'>1</a> grouped = p2h.groupby('infocode')['process_record_id'].count() <a href='vscode-notebook-cell:/d%3A/life-is-short-i-use-python/%E5%85%AC%E5%91%8A%E9%A1%B9%E7%9B%AE.ipynb#X52sZmlsZQ%3D%3D?line=2'>3</a> # 选择符合条件(count > 1)的分组 <a href='vscode-notebook-cell:/d%3A/life-is-short-i-use-python/%E5%85%AC%E5%91%8A%E9%A1%B9%E7%9B%AE.ipynb#X52sZmlsZQ%3D%3D?line=3'>4</a> selected_groups = grouped[grouped > 1] File d:\anaconda3\envs\py311\Lib\site-packages\pandas\core\groupby\generic.py:1961, in DataFrameGroupBy.__getitem__(self, key) 1954 if isinstance(key, tuple) and len(key) > 1: 1955 # if len == 1, then it becomes a SeriesGroupBy and this is actually 1956 # valid syntax, so don't raise 1957 raise ValueError( 1958 "Cannot subset columns with a tuple with more than one element. " 1959 "Use a list instead." 1960 ) -> 1961 return super().__getitem__(key) File d:\anaconda3\envs\py311\Lib\site-packages\pandas\core\base.py:244, in SelectionMixin.__getitem__(self, key) 242 else: 243 if key not in self.obj: --> 244 raise KeyError(f"Column not found: {key}") 245 ndim = self.obj[key].ndim 246 return self._gotitem(key, ndim=ndim) KeyError: 'Column not found: process_record_id'
In [8]:
# 按照 infocode 分组,获取每个分组中 id 最大的索引
max_id_indices = p2h.groupby('infocode')['process_record_id'].idxmax()
In [9]:
# 根据最大的索引获取相应的行数据
result = p2h.loc[max_id_indices]
# 输出结果
result
Out[9]:
process_record_id | infocode | path | hash | company | priority | file_size | total_pages | total_chars | cur_process | ... | id | infocode.1 | hash.1 | display_type | version | create_time.1 | update_time.1 | create_by | update_by | file_type.1 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
10774 | 6077274 | AC202310311607030815 | http://pdf.dfcfw.com/pdf/H2_AC2023103116070308... | 9282914619da0f853e671e695b69d72a | NaN | NaN | 49417 | 1 | NaN | E | ... | 4562766 | AC202310311607030815 | 9282914619da0f853e671e695b69d72a | R | NaN | 2023-10-31 13:43:32 | 2023-10-31 13:43:40 | NaN | NaN | ppdf |
57693 | 6147484 | AN201203290005088457 | http://pdf.dfcfw.com/pdf/H2_AN2012032900050884... | 4ae7e2fd04eb787af22a2730ff9eb3b4 | NaN | 0.0 | 449089 | 107 | NaN | E | ... | 4631719 | AN201203290005088457 | 4ae7e2fd04eb787af22a2730ff9eb3b4 | R | NaN | 2023-11-14 09:38:46 | 2023-11-14 09:39:16 | NaN | NaN | ppdf |
88041 | 6193612 | AN201303290003144095 | http://pdf.dfcfw.com/pdf/H2_AN2013032900031440... | 06e75572e0317e56f4c2c216b52d2de2 | NaN | 0.0 | 197771 | 2 | NaN | E | ... | 4676958 | AN201303290003144095 | 06e75572e0317e56f4c2c216b52d2de2 | R | NaN | 2023-11-22 14:03:06 | 2023-11-22 14:03:13 | NaN | NaN | |
88040 | 6193609 | AN201404230005490649 | http://pdf.dfcfw.com/pdf/H2_AN2014042300054906... | fc23c45c6d0848b194e5b58272f7d877 | NaN | 0.0 | 136002 | 1 | NaN | E | ... | 4676955 | AN201404230005490649 | fc23c45c6d0848b194e5b58272f7d877 | R | NaN | 2023-11-22 14:02:27 | 2023-11-22 14:02:32 | NaN | NaN | |
51280 | 6138029 | AN201505130009599349 | http://pdf.dfcfw.com/pdf/H2_AN2015051300095993... | 74ffdf952b7ad1ca86f7d84f08b8c57f | NaN | 0.0 | 2064618 | 170 | NaN | E | ... | 4622378 | AN201505130009599349 | 74ffdf952b7ad1ca86f7d84f08b8c57f | R | NaN | 2023-11-10 21:17:07 | 2023-11-10 21:18:02 | NaN | NaN | ppdf |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
95693 | 6204888 | AN202311231612215965 | http://pdf.dfcfw.com/pdf/H2_AN2023112316122159... | bb0bd432bcf2a1d53ec0ca274ae077f5 | NaN | 0.0 | 116501 | 3 | NaN | E | ... | 4688057 | AN202311231612215965 | bb0bd432bcf2a1d53ec0ca274ae077f5 | R | NaN | 2023-11-23 23:19:27 | 2023-11-23 23:19:35 | NaN | NaN | ppdf |
95692 | 6204887 | AN202311231612215966 | http://pdf.dfcfw.com/pdf/H2_AN2023112316122159... | ead0a12d9ff6198d8311f5be38574f6d | NaN | 0.0 | 273434 | 1 | NaN | E | ... | 4688056 | AN202311231612215966 | ead0a12d9ff6198d8311f5be38574f6d | R | NaN | 2023-11-23 23:19:27 | 2023-11-23 23:19:32 | NaN | NaN | ppdf |
95694 | 6204889 | AN202311231612215982 | http://pdf.dfcfw.com/pdf/H2_AN2023112316122159... | 66283f33f7528f04ac4f1138a874fab4 | NaN | 0.0 | 686837 | 6 | NaN | E | ... | 4688058 | AN202311231612215982 | 66283f33f7528f04ac4f1138a874fab4 | R | NaN | 2023-11-23 23:22:27 | 2023-11-23 23:30:47 | NaN | NaN | ppdf |
95695 | 6204890 | AN202311231612215983 | http://pdf.dfcfw.com/pdf/H2_AN2023112316122159... | 66283f33f7528f04ac4f1138a874fab4 | NaN | 0.0 | 686837 | 6 | NaN | E | ... | 4688059 | AN202311231612215983 | 66283f33f7528f04ac4f1138a874fab4 | R | NaN | 2023-11-23 23:24:21 | 2023-11-23 23:24:36 | NaN | NaN | |
95696 | 6204891 | AN202311231612215984 | http://pdf.dfcfw.com/pdf/H2_AN2023112316122159... | 66283f33f7528f04ac4f1138a874fab4 | NaN | 0.0 | 686837 | 6 | NaN | E | ... | 4688060 | AN202311231612215984 | 66283f33f7528f04ac4f1138a874fab4 | R | NaN | 2023-11-23 23:24:21 | 2023-11-23 23:24:37 | NaN | NaN |
95603 rows × 38 columns
In [26]:
result['display'] = result.apply(lambda row: 1 if row['display_type'] == 'R' else 0, axis=1)
fret = result[['infocode', 'display']]
In [27]:
fret.to_csv('p2h_ret_2023-12-07_update.csv', index=False)
In [ ]:
import traceback
traceback.format_exc()
In [ ]:
from selenium.webdriver.support.wait import WebDriverWait
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
# 配置 Firefox WebDriver
firefox_options = Options()
firefox_options.headless = True
# 设置 Firefox WebDriver 的执行路径
# 启动 Firefox WebDriver
driver = webdriver.Firefox(firefox_binary='/opt/firefox-91.0.1esr-zh-CN/firefox-bin',
executable_path='/home/baoleiji/geckodriver_31/geckodriver',
options=firefox_options)
# driver = webdriver.Firefox(options=firefox_options)
# 示例:访问一个下载链接
pdf_file = 'file:////home/baoleiji/AN202304161585487275_24.pdf'
pdf_file = 'file:////home/baoleiji/page_63.pdf'
driver.get(pdf_file) # 替换成实际的下载链接
driver.find_elements(By.CSS_SELECTOR, "#viewer")[0].get_attribute("outerHTML")[:100]
driver.find_elements(By.CSS_SELECTOR, "#viewerContainer")[0].get_attribute("outerHTML")[:100]
driver.find_elements(By.CSS_SELECTOR, "span[role]")[-3].get_attribute("outerHTML")
In [39]:
import time, os
deltas = []
ind = 0
for dirpath, dirnames, filenames in os.walk(r'D:\Temp\htmls'):
for file in filenames:
if not file.endswith('.pdf'):
continue
start = time.time()
ind += 1
json_data = {
'pdfPath': os.path.join(dirpath, file),
}
response = requests.post('http://127.0.0.1:8090/api/webdriver/pdf-text', headers=headers, json=json_data)
if len(response.json()['data']) < 500:
print(ind, os.path.join(dirpath, file))
# print(ind, time.time() - start, response.json())
# deltas.append(time.time() - start)
In [19]:
import pandas as pd
pd.DataFrame(deltas).describe()
Out[19]:
0 | |
---|---|
count | 784.000000 |
mean | 0.425131 |
std | 0.243149 |
min | 0.246715 |
25% | 0.336466 |
50% | 0.374103 |
75% | 0.430255 |
max | 2.289095 |
In [4]:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# 设置 Firefox Profile 来指定下载路径
profile = webdriver.FirefoxProfile()
profile.set_preference("browser.download.folderList", 2) # 自定义下载路径
profile.set_preference("browser.download.dir", "d:/words") # 设置下载文件的目录
profile.set_preference("browser.download.useDownloadDir", True)
profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/json") # 设置要下载的文件类型
profile.set_preference("layout.css.devPixelsPerPx", "1.1")
# 配置 Firefox WebDriver
firefox_options = Options()
firefox_options.profile = profile
# 设置 Firefox WebDriver 的执行路径
gecko_driver_path = 'd:/geckodriver.exe' # GeckoDriver 的路径
service = Service(gecko_driver_path)
# 启动 Firefox WebDriver
driver = webdriver.Firefox(service=service, options=firefox_options)
# 示例:访问一个下载链接
driver.get("file:///D:/Temp/htmls/AN202311161611013398_r_mini/page_1.pdf") # 替换成实际的下载链接
# 等待一段时间,确保下载完成
# 这里可以添加一些逻辑,例如等待文件下载完成或其他必要的操作
C:\Users\Public\Documents\Wondershare\CreatorTemp\ipykernel_30652\2332689472.py:11: DeprecationWarning: firefox_profile has been deprecated, please use an Options object profile = webdriver.FirefoxProfile() C:\Users\Public\Documents\Wondershare\CreatorTemp\ipykernel_30652\2332689472.py:19: DeprecationWarning: Setting a profile has been deprecated. Please use the set_preference and install_addons methods firefox_options.profile = profile C:\Users\Public\Documents\Wondershare\CreatorTemp\ipykernel_30652\2332689472.py:23: DeprecationWarning: Firefox will soon stop logging to geckodriver.log by default; Specify desired logs with log_output service = Service(gecko_driver_path)
In [22]:
import time
import os
ct = 0
errors = []
for root, dirs, files in os.walk(r'D:\Temp\htmls'):
for file in files:
if file.endswith('.pdf') and 'page_' not in root:
ct += 1
pdf_file = 'file:///' + os.path.join(root, file).replace('\\', '/')
print(ct, pdf_file)
driver.get(pdf_file)
try:
# 获取要滚动到的元素
element = driver.find_elements(By.CSS_SELECTOR, "#viewerContainer .textLayer > span")[-1] # 替换为你要滚动到的元素选择器
# 滚动到指定元素位置
driver.execute_script("arguments[0].scrollIntoView(true);", element)
# 设置等待时间
wait = WebDriverWait(driver, 10) # 设置最长等待时间为 10 秒钟
try:
# 等待元素加载完成
text_layer = wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#viewerContainer .textLayer > span')))
# 进行你的操作,元素已经加载完成
print("Element found:", text_layer) # 示例:输出找到的元素文本内容
except:
print("Element not found within specified time.")
time.sleep(0.1)
# 下载文件
driver.execute_script(to_word_js)
except Exception as e:
errors.append(pdf_file)
print(e)
1 file:///D:/Temp/htmls/AN202302161583234787/page_1.pdf Element not found within specified time. 2 file:///D:/Temp/htmls/AN202302161583234787/page_5.pdf Element not found within specified time. 3 file:///D:/Temp/htmls/AN202302161583235908/page_3.pdf Element not found within specified time. 4 file:///D:/Temp/htmls/AN202302281583881638/page_95.pdf Element not found within specified time. 5 file:///D:/Temp/htmls/AN202302281583881638/page_96.pdf Element not found within specified time. 6 file:///D:/Temp/htmls/AN202302281583881638/page_98.pdf Element not found within specified time. 7 file:///D:/Temp/htmls/AN202302281583881638/page_99.pdf Element not found within specified time. 8 file:///D:/Temp/htmls/AN202303011583892542/page_1.pdf Element not found within specified time. 9 file:///D:/Temp/htmls/AN202303011583892545/page_1.pdf Element not found within specified time. 10 file:///D:/Temp/htmls/AN202303011583892548/page_1.pdf Element not found within specified time. 11 file:///D:/Temp/htmls/AN202303011583892551/page_1.pdf Element not found within specified time. 12 file:///D:/Temp/htmls/AN202303011583892552/page_1.pdf Element not found within specified time. 13 file:///D:/Temp/htmls/AN202303011583892555/page_1.pdf Element not found within specified time. 14 file:///D:/Temp/htmls/AN202303011583892556/page_1.pdf Element not found within specified time. 15 file:///D:/Temp/htmls/AN202303011583892578/page_1.pdf Element not found within specified time. 16 file:///D:/Temp/htmls/AN202303011583892582/page_1.pdf Element not found within specified time. 17 file:///D:/Temp/htmls/AN202303011583892586/page_1.pdf Element not found within specified time. 18 file:///D:/Temp/htmls/AN202303011583892589/page_1.pdf Element not found within specified time. 19 file:///D:/Temp/htmls/AN202303011583892592/page_1.pdf Element not found within specified time. 20 file:///D:/Temp/htmls/AN202303011583892593/page_1.pdf Element not found within specified time. 21 file:///D:/Temp/htmls/AN202303011583892598/page_1.pdf Element not found within specified time. 22 file:///D:/Temp/htmls/AN202303011583892598/page_2.pdf Element not found within specified time. 23 file:///D:/Temp/htmls/AN202303011583892598/page_3.pdf Element not found within specified time. 24 file:///D:/Temp/htmls/AN202303011583892598/page_4.pdf Element not found within specified time. 25 file:///D:/Temp/htmls/AN202303011583897916/page_16.pdf Element not found within specified time. 26 file:///D:/Temp/htmls/AN202303011583898495/page_16.pdf Element not found within specified time. 27 file:///D:/Temp/htmls/AN202303011583898575/page_15.pdf Element not found within specified time. 28 file:///D:/Temp/htmls/AN202303011583898653/page_15.pdf Element not found within specified time. 29 file:///D:/Temp/htmls/AN202303011583898726/page_1.pdf Element not found within specified time. 30 file:///D:/Temp/htmls/AN202303011583899064/page_1.pdf Element not found within specified time. 31 file:///D:/Temp/htmls/AN202303011583901479/page_1.pdf Element not found within specified time. 32 file:///D:/Temp/htmls/AN202303011583903892/page_1.pdf Element not found within specified time. 33 file:///D:/Temp/htmls/AN202303011583910026/page_1.pdf Element not found within specified time. 34 file:///D:/Temp/htmls/AN202303011583910543/page_2.pdf Element not found within specified time. 35 file:///D:/Temp/htmls/AN202303011583910543/page_3.pdf Element not found within specified time. 36 file:///D:/Temp/htmls/AN202303011583913104/page_1.pdf Element not found within specified time. 37 file:///D:/Temp/htmls/AN202303011583915212/page_1.pdf Element not found within specified time. 38 file:///D:/Temp/htmls/AN202303011583917131/page_28.pdf Element not found within specified time. 39 file:///D:/Temp/htmls/AN202303011583920374/page_2.pdf Element not found within specified time. 40 file:///D:/Temp/htmls/AN202303011583920769/page_10.pdf Element not found within specified time. 41 file:///D:/Temp/htmls/AN202303011583920817/page_269.pdf Element not found within specified time. 42 file:///D:/Temp/htmls/AN202303011583920817/page_59.pdf Element not found within specified time. 43 file:///D:/Temp/htmls/AN202303011583920850/page_17.pdf Element not found within specified time. 44 file:///D:/Temp/htmls/AN202303011583920850/page_18.pdf Element not found within specified time. 45 file:///D:/Temp/htmls/AN202303011583922005/page_41.pdf Element not found within specified time. 46 file:///D:/Temp/htmls/AN202303011583922005/page_79.pdf Element not found within specified time. 47 file:///D:/Temp/htmls/AN202303011583922005/page_97.pdf Element not found within specified time. 48 file:///D:/Temp/htmls/AN202303011583922259/page_16.pdf Element not found within specified time. 49 file:///D:/Temp/htmls/AN202303011583922259/page_17.pdf Element not found within specified time. 50 file:///D:/Temp/htmls/AN202303011583922267/page_60.pdf Element not found within specified time. 51 file:///D:/Temp/htmls/AN202303011583922267/page_61.pdf Element not found within specified time. 52 file:///D:/Temp/htmls/AN202303011583922267/page_62.pdf Element not found within specified time. 53 file:///D:/Temp/htmls/AN202303011583922619/page_1.pdf Element not found within specified time. 54 file:///D:/Temp/htmls/AN202303011583922619/page_10.pdf Element not found within specified time. 55 file:///D:/Temp/htmls/AN202303011583922619/page_11.pdf Element not found within specified time. 56 file:///D:/Temp/htmls/AN202303011583922619/page_12.pdf Element not found within specified time. 57 file:///D:/Temp/htmls/AN202303011583922619/page_14.pdf Element not found within specified time. 58 file:///D:/Temp/htmls/AN202303011583922619/page_15.pdf Element not found within specified time. 59 file:///D:/Temp/htmls/AN202303011583922619/page_17.pdf Element not found within specified time. 60 file:///D:/Temp/htmls/AN202303011583922619/page_19.pdf Element not found within specified time. 61 file:///D:/Temp/htmls/AN202303011583922619/page_22.pdf Element not found within specified time. 62 file:///D:/Temp/htmls/AN202303011583922619/page_23.pdf Element not found within specified time. 63 file:///D:/Temp/htmls/AN202303011583922619/page_24.pdf Element not found within specified time. 64 file:///D:/Temp/htmls/AN202303011583922619/page_26.pdf Element not found within specified time. 65 file:///D:/Temp/htmls/AN202303011583922619/page_27.pdf Element not found within specified time. 66 file:///D:/Temp/htmls/AN202303011583922619/page_28.pdf Element not found within specified time. 67 file:///D:/Temp/htmls/AN202303011583922619/page_3.pdf Element not found within specified time. 68 file:///D:/Temp/htmls/AN202303011583922619/page_30.pdf Element not found within specified time. 69 file:///D:/Temp/htmls/AN202303011583922619/page_31.pdf Element not found within specified time. 70 file:///D:/Temp/htmls/AN202303011583922619/page_32.pdf Element not found within specified time. 71 file:///D:/Temp/htmls/AN202303011583922619/page_34.pdf Element not found within specified time. 72 file:///D:/Temp/htmls/AN202303011583922619/page_35.pdf Element not found within specified time. 73 file:///D:/Temp/htmls/AN202303011583922619/page_36.pdf Element not found within specified time. 74 file:///D:/Temp/htmls/AN202303011583922619/page_5.pdf Element not found within specified time. 75 file:///D:/Temp/htmls/AN202303011583922619/page_6.pdf Element not found within specified time. 76 file:///D:/Temp/htmls/AN202303011583922619/page_7.pdf Element not found within specified time. 77 file:///D:/Temp/htmls/AN202303011583922619/page_8.pdf Element not found within specified time. 78 file:///D:/Temp/htmls/AN202303011583922619/page_9.pdf Element not found within specified time. 79 file:///D:/Temp/htmls/AN202303011583922751/page_1.pdf Element not found within specified time. 80 file:///D:/Temp/htmls/AN202303011583923603/page_1.pdf Element not found within specified time. 81 file:///D:/Temp/htmls/AN202303011583923803/page_147.pdf Element not found within specified time. 82 file:///D:/Temp/htmls/AN202303011583923803/page_20.pdf Element not found within specified time. 83 file:///D:/Temp/htmls/AN202303011583924194/page_135.pdf Element not found within specified time. 84 file:///D:/Temp/htmls/AN202303011583924194/page_136.pdf Element not found within specified time. 85 file:///D:/Temp/htmls/AN202303011583924194/page_137.pdf Element not found within specified time. 86 file:///D:/Temp/htmls/AN202303011583924194/page_138.pdf Element not found within specified time. 87 file:///D:/Temp/htmls/AN202303011583924194/page_139.pdf Element not found within specified time. 88 file:///D:/Temp/htmls/AN202303011583924194/page_140.pdf Element not found within specified time. 89 file:///D:/Temp/htmls/AN202303011583924194/page_141.pdf Element not found within specified time. 90 file:///D:/Temp/htmls/AN202303011583924194/page_143.pdf Element not found within specified time. 91 file:///D:/Temp/htmls/AN202303011583924194/page_144.pdf Element not found within specified time. 92 file:///D:/Temp/htmls/AN202303011583924194/page_145.pdf Element not found within specified time. 93 file:///D:/Temp/htmls/AN202303011583924194/page_146.pdf Element not found within specified time. 94 file:///D:/Temp/htmls/AN202303011583924228/page_3.pdf Element not found within specified time. 95 file:///D:/Temp/htmls/AN202303011583924228/page_4.pdf Element not found within specified time. 96 file:///D:/Temp/htmls/AN202303011583924785/page_14.pdf Element not found within specified time. 97 file:///D:/Temp/htmls/AN202303011583924785/page_15.pdf Element not found within specified time. 98 file:///D:/Temp/htmls/AN202303011583924786/page_15.pdf Element not found within specified time. 99 file:///D:/Temp/htmls/AN202303011583925000/page_1.pdf Element not found within specified time. 100 file:///D:/Temp/htmls/AN202303011583925002/page_103.pdf Element not found within specified time. 101 file:///D:/Temp/htmls/AN202303011583925002/page_107.pdf Element not found within specified time. 102 file:///D:/Temp/htmls/AN202303011583925002/page_21.pdf Element not found within specified time. 103 file:///D:/Temp/htmls/AN202303011583925202/page_3.pdf Element not found within specified time. 104 file:///D:/Temp/htmls/AN202303011583925291/page_5.pdf Element not found within specified time. 105 file:///D:/Temp/htmls/AN202303011583925660/page_16.pdf Element not found within specified time. 106 file:///D:/Temp/htmls/AN202303011583925660/page_17.pdf Element not found within specified time. 107 file:///D:/Temp/htmls/AN202303011583925672/page_2.pdf Element not found within specified time. 108 file:///D:/Temp/htmls/AN202303011583925685/page_60.pdf Element not found within specified time. 109 file:///D:/Temp/htmls/AN202303011583925685/page_61.pdf Element not found within specified time. 110 file:///D:/Temp/htmls/AN202303011583925685/page_62.pdf Element not found within specified time. 111 file:///D:/Temp/htmls/AN202303011583925830/page_103.pdf Element not found within specified time. 112 file:///D:/Temp/htmls/AN202303011583925830/page_107.pdf Element not found within specified time. 113 file:///D:/Temp/htmls/AN202303011583925830/page_21.pdf Element not found within specified time. 114 file:///D:/Temp/htmls/AN202303011583926787/page_11.pdf Element not found within specified time. 115 file:///D:/Temp/htmls/AN202303011583926787/page_12.pdf Element not found within specified time. 116 file:///D:/Temp/htmls/AN202303011583927517/page_12.pdf Element not found within specified time. 117 file:///D:/Temp/htmls/AN202303011583927517/page_20.pdf Element not found within specified time. 118 file:///D:/Temp/htmls/AN202303141584239808/page_1.pdf Element not found within specified time. 119 file:///D:/Temp/htmls/AN202303151584289331/page_2.pdf Element not found within specified time. 120 file:///D:/Temp/htmls/AN202303151584289605/page_1.pdf Element not found within specified time. 121 file:///D:/Temp/htmls/AN202303151584289605/page_10.pdf Element not found within specified time. 122 file:///D:/Temp/htmls/AN202303151584289605/page_101.pdf Element not found within specified time. 123 file:///D:/Temp/htmls/AN202303151584289605/page_102.pdf Element not found within specified time. 124 file:///D:/Temp/htmls/AN202303151584289605/page_104.pdf Element not found within specified time. 125 file:///D:/Temp/htmls/AN202303151584289605/page_105.pdf Element not found within specified time. 126 file:///D:/Temp/htmls/AN202303151584289605/page_106.pdf Element not found within specified time. 127 file:///D:/Temp/htmls/AN202303151584289605/page_11.pdf Element not found within specified time. 128 file:///D:/Temp/htmls/AN202303151584289605/page_12.pdf Element not found within specified time. 129 file:///D:/Temp/htmls/AN202303151584289605/page_14.pdf Element not found within specified time. 130 file:///D:/Temp/htmls/AN202303151584289605/page_15.pdf Element not found within specified time. 131 file:///D:/Temp/htmls/AN202303151584289605/page_16.pdf Element not found within specified time. 132 file:///D:/Temp/htmls/AN202303151584289605/page_18.pdf Element not found within specified time. 133 file:///D:/Temp/htmls/AN202303151584289605/page_19.pdf Element not found within specified time. 134 file:///D:/Temp/htmls/AN202303151584289605/page_2.pdf Element not found within specified time. 135 file:///D:/Temp/htmls/AN202303151584289605/page_20.pdf Element not found within specified time. 136 file:///D:/Temp/htmls/AN202303151584289605/page_21.pdf Element not found within specified time. 137 file:///D:/Temp/htmls/AN202303151584289605/page_22.pdf Element not found within specified time. 138 file:///D:/Temp/htmls/AN202303151584289605/page_26.pdf Element not found within specified time. 139 file:///D:/Temp/htmls/AN202303151584289605/page_27.pdf Element not found within specified time. 140 file:///D:/Temp/htmls/AN202303151584289605/page_28.pdf Element not found within specified time. 141 file:///D:/Temp/htmls/AN202303151584289605/page_29.pdf Element not found within specified time. 142 file:///D:/Temp/htmls/AN202303151584289605/page_3.pdf Element not found within specified time. 143 file:///D:/Temp/htmls/AN202303151584289605/page_30.pdf Element not found within specified time. 144 file:///D:/Temp/htmls/AN202303151584289605/page_31.pdf Element not found within specified time. 145 file:///D:/Temp/htmls/AN202303151584289605/page_39.pdf Element not found within specified time. 146 file:///D:/Temp/htmls/AN202303151584289605/page_4.pdf Element not found within specified time. 147 file:///D:/Temp/htmls/AN202303151584289605/page_40.pdf Element not found within specified time. 148 file:///D:/Temp/htmls/AN202303151584289605/page_41.pdf Element not found within specified time. 149 file:///D:/Temp/htmls/AN202303151584289605/page_43.pdf Element not found within specified time. 150 file:///D:/Temp/htmls/AN202303151584289605/page_44.pdf Element not found within specified time. 151 file:///D:/Temp/htmls/AN202303151584289605/page_45.pdf Element not found within specified time. 152 file:///D:/Temp/htmls/AN202303151584289605/page_46.pdf Element not found within specified time. 153 file:///D:/Temp/htmls/AN202303151584289605/page_47.pdf Element not found within specified time. 154 file:///D:/Temp/htmls/AN202303151584289605/page_48.pdf Element not found within specified time. 155 file:///D:/Temp/htmls/AN202303151584289605/page_49.pdf Element not found within specified time. 156 file:///D:/Temp/htmls/AN202303151584289605/page_5.pdf Element not found within specified time. 157 file:///D:/Temp/htmls/AN202303151584289605/page_50.pdf Element not found within specified time. 158 file:///D:/Temp/htmls/AN202303151584289605/page_51.pdf Element not found within specified time. 159 file:///D:/Temp/htmls/AN202303151584289605/page_53.pdf Element not found within specified time. 160 file:///D:/Temp/htmls/AN202303151584289605/page_54.pdf Element not found within specified time. 161 file:///D:/Temp/htmls/AN202303151584289605/page_55.pdf Element not found within specified time. 162 file:///D:/Temp/htmls/AN202303151584289605/page_56.pdf Element not found within specified time. 163 file:///D:/Temp/htmls/AN202303151584289605/page_57.pdf Element not found within specified time. 164 file:///D:/Temp/htmls/AN202303151584289605/page_60.pdf Element not found within specified time. 165 file:///D:/Temp/htmls/AN202303151584289605/page_62.pdf Element not found within specified time. 166 file:///D:/Temp/htmls/AN202303151584289605/page_63.pdf Element not found within specified time. 167 file:///D:/Temp/htmls/AN202303151584289605/page_68.pdf Element not found within specified time. 168 file:///D:/Temp/htmls/AN202303151584289605/page_69.pdf Element not found within specified time. 169 file:///D:/Temp/htmls/AN202303151584289605/page_7.pdf Element not found within specified time. 170 file:///D:/Temp/htmls/AN202303151584289605/page_70.pdf Element not found within specified time. 171 file:///D:/Temp/htmls/AN202303151584289605/page_74.pdf Element not found within specified time. 172 file:///D:/Temp/htmls/AN202303151584289605/page_75.pdf Element not found within specified time. 173 file:///D:/Temp/htmls/AN202303151584289605/page_77.pdf Element not found within specified time. 174 file:///D:/Temp/htmls/AN202303151584289605/page_79.pdf Element not found within specified time. 175 file:///D:/Temp/htmls/AN202303151584289605/page_8.pdf Element not found within specified time. 176 file:///D:/Temp/htmls/AN202303151584289605/page_80.pdf Element not found within specified time. 177 file:///D:/Temp/htmls/AN202303151584289605/page_82.pdf Element not found within specified time. 178 file:///D:/Temp/htmls/AN202303151584289605/page_83.pdf Element not found within specified time. 179 file:///D:/Temp/htmls/AN202303151584289605/page_84.pdf Element not found within specified time. 180 file:///D:/Temp/htmls/AN202303151584289605/page_85.pdf Element not found within specified time. 181 file:///D:/Temp/htmls/AN202303151584289605/page_87.pdf Element not found within specified time. 182 file:///D:/Temp/htmls/AN202303151584289605/page_88.pdf Element not found within specified time. 183 file:///D:/Temp/htmls/AN202303151584289605/page_89.pdf Element not found within specified time. 184 file:///D:/Temp/htmls/AN202303151584289605/page_9.pdf Element not found within specified time. 185 file:///D:/Temp/htmls/AN202303151584289605/page_90.pdf Element not found within specified time. 186 file:///D:/Temp/htmls/AN202303151584289605/page_91.pdf Element not found within specified time. 187 file:///D:/Temp/htmls/AN202303151584289605/page_92.pdf Element not found within specified time. 188 file:///D:/Temp/htmls/AN202303151584289605/page_94.pdf Element not found within specified time. 189 file:///D:/Temp/htmls/AN202303151584289605/page_97.pdf Element not found within specified time. 190 file:///D:/Temp/htmls/AN202303151584289611/page_1.pdf Element not found within specified time. 191 file:///D:/Temp/htmls/AN202303151584289611/page_2.pdf Element not found within specified time. 192 file:///D:/Temp/htmls/AN202303151584289611/page_4.pdf Element not found within specified time. 193 file:///D:/Temp/htmls/AN202303151584289611/page_5.pdf Element not found within specified time. 194 file:///D:/Temp/htmls/AN202303151584289611/page_6.pdf Element not found within specified time. 195 file:///D:/Temp/htmls/AN202303151584289616/page_1.pdf Element not found within specified time. 196 file:///D:/Temp/htmls/AN202303151584291140/page_3.pdf Element not found within specified time. 197 file:///D:/Temp/htmls/AN202303151584292038/page_14.pdf Element not found within specified time. 198 file:///D:/Temp/htmls/AN202303151584292945_r_invisiable_text_r_none_table/page_12.pdf Element not found within specified time. 199 file:///D:/Temp/htmls/AN202303151584292945_r_invisiable_text_r_none_table/page_20.pdf Element not found within specified time. 200 file:///D:/Temp/htmls/AN202303151584292945_r_invisiable_text_r_none_table/page_51.pdf Element not found within specified time. 201 file:///D:/Temp/htmls/AN202303151584293181/page_15.pdf Element not found within specified time. 202 file:///D:/Temp/htmls/AN202303151584294225/page_1.pdf Element not found within specified time. 203 file:///D:/Temp/htmls/AN202303151584295721/page_99.pdf Element not found within specified time. 204 file:///D:/Temp/htmls/AN202303151584295941/page_14.pdf Element not found within specified time. 205 file:///D:/Temp/htmls/AN202303151584296409/page_83.pdf Element not found within specified time. 206 file:///D:/Temp/htmls/AN202303151584296823/page_1.pdf Element not found within specified time. 207 file:///D:/Temp/htmls/AN202303151584296823/page_11.pdf Element not found within specified time. 208 file:///D:/Temp/htmls/AN202303151584296823/page_14.pdf Element not found within specified time. 209 file:///D:/Temp/htmls/AN202303151584296823/page_15.pdf Element not found within specified time. 210 file:///D:/Temp/htmls/AN202303151584296823/page_19.pdf Element not found within specified time. 211 file:///D:/Temp/htmls/AN202303151584296823/page_2.pdf Element not found within specified time. 212 file:///D:/Temp/htmls/AN202303151584296823/page_20.pdf Element not found within specified time. 213 file:///D:/Temp/htmls/AN202303151584296823/page_21.pdf Element not found within specified time. 214 file:///D:/Temp/htmls/AN202303151584296823/page_22.pdf Element not found within specified time. 215 file:///D:/Temp/htmls/AN202303151584296823/page_23.pdf Element not found within specified time. 216 file:///D:/Temp/htmls/AN202303151584296823/page_25.pdf Element not found within specified time. 217 file:///D:/Temp/htmls/AN202303151584296823/page_27.pdf Element not found within specified time. 218 file:///D:/Temp/htmls/AN202303151584296823/page_28.pdf Element not found within specified time. 219 file:///D:/Temp/htmls/AN202303151584296823/page_29.pdf Element not found within specified time. 220 file:///D:/Temp/htmls/AN202303151584296823/page_3.pdf Element not found within specified time. 221 file:///D:/Temp/htmls/AN202303151584296823/page_31.pdf Element not found within specified time. 222 file:///D:/Temp/htmls/AN202303151584296823/page_34.pdf Element not found within specified time. 223 file:///D:/Temp/htmls/AN202303151584296823/page_37.pdf Element not found within specified time. 224 file:///D:/Temp/htmls/AN202303151584296823/page_39.pdf Element not found within specified time. 225 file:///D:/Temp/htmls/AN202303151584296823/page_4.pdf Element not found within specified time. 226 file:///D:/Temp/htmls/AN202303151584296823/page_40.pdf Element not found within specified time. 227 file:///D:/Temp/htmls/AN202303151584296823/page_41.pdf Element not found within specified time. 228 file:///D:/Temp/htmls/AN202303151584296823/page_44.pdf Element not found within specified time. 229 file:///D:/Temp/htmls/AN202303151584296823/page_46.pdf Element not found within specified time. 230 file:///D:/Temp/htmls/AN202303151584296823/page_48.pdf Element not found within specified time. 231 file:///D:/Temp/htmls/AN202303151584296823/page_49.pdf Element not found within specified time. 232 file:///D:/Temp/htmls/AN202303151584296823/page_5.pdf Element not found within specified time. 233 file:///D:/Temp/htmls/AN202303151584296823/page_50.pdf Element not found within specified time. 234 file:///D:/Temp/htmls/AN202303151584296823/page_6.pdf Element not found within specified time. 235 file:///D:/Temp/htmls/AN202303151584296823/page_7.pdf Element not found within specified time. 236 file:///D:/Temp/htmls/AN202303151584296823/page_8.pdf Element not found within specified time. 237 file:///D:/Temp/htmls/AN202303151584296823/page_9.pdf Element not found within specified time. 238 file:///D:/Temp/htmls/AN202303161584298993/page_95.pdf Element not found within specified time. 239 file:///D:/Temp/htmls/AN202303161584303367/page_1.pdf Element not found within specified time. 240 file:///D:/Temp/htmls/AN202303161584303376/page_1.pdf Element not found within specified time. 241 file:///D:/Temp/htmls/AN202303161584303443/page_1.pdf Element not found within specified time. 242 file:///D:/Temp/htmls/AN202303161584303902/page_1.pdf Element not found within specified time. 243 file:///D:/Temp/htmls/AN202303161584303903_r_text_miss_r_table/page_1.pdf Element not found within specified time. 244 file:///D:/Temp/htmls/AN202303161584303905_r_mini/page_1.pdf Element not found within specified time. 245 file:///D:/Temp/htmls/AN202303161584303906_r_to_img/page_1.pdf Element not found within specified time. 246 file:///D:/Temp/htmls/AN202303161584303907_r_to_img/page_1.pdf Element not found within specified time. 247 file:///D:/Temp/htmls/AN202303161584303908_r_to_img/page_1.pdf Element not found within specified time. 248 file:///D:/Temp/htmls/AN202303161584303911_r_to_img/page_1.pdf Element not found within specified time. 249 file:///D:/Temp/htmls/AN202303161584303911_r_to_img/page_2.pdf Element not found within specified time. 250 file:///D:/Temp/htmls/AN202303161584303911_r_to_img/page_3.pdf Element not found within specified time. 251 file:///D:/Temp/htmls/AN202303161584303913_r_text_miss_r_pg/page_1.pdf Element not found within specified time. 252 file:///D:/Temp/htmls/AN202303161584303916_r_mini/page_1.pdf Element not found within specified time. 253 file:///D:/Temp/htmls/AN202303161584303919_r_to_img/page_1.pdf Element not found within specified time. 254 file:///D:/Temp/htmls/AN202303161584303920_r_mini/page_1.pdf Element not found within specified time. 255 file:///D:/Temp/htmls/AN202303161584303922_r_mini/page_1.pdf Element not found within specified time. 256 file:///D:/Temp/htmls/AN202303161584303925_r_mini/page_1.pdf Element not found within specified time. 257 file:///D:/Temp/htmls/AN202303161584303925_r_mini/page_2.pdf Element not found within specified time. 258 file:///D:/Temp/htmls/AN202303161584303925_r_mini/page_3.pdf Element not found within specified time. 259 file:///D:/Temp/htmls/AN202303161584303925_r_mini/page_4.pdf Element not found within specified time. 260 file:///D:/Temp/htmls/AN202303161584303928_r_to_img/page_1.pdf Element not found within specified time. 261 file:///D:/Temp/htmls/AN202303161584304119_r_table/page_1.pdf Element not found within specified time. 262 file:///D:/Temp/htmls/AN202303161584304121_r_mini/page_1.pdf Element not found within specified time. 263 file:///D:/Temp/htmls/AN202303161584304123_r_mini/page_1.pdf Element not found within specified time. 264 file:///D:/Temp/htmls/AN202303161584304125_r_to_img/page_1.pdf Element not found within specified time. 265 file:///D:/Temp/htmls/AN202303161584304127_r_text_miss_r_table/page_1.pdf Element not found within specified time. 266 file:///D:/Temp/htmls/AN202303161584304128_r_mini/page_1.pdf Element not found within specified time. 267 file:///D:/Temp/htmls/AN202303161584304132_r_mini/page_1.pdf Element not found within specified time. 268 file:///D:/Temp/htmls/AN202303161584304134_r_to_img/page_1.pdf Element not found within specified time. 269 file:///D:/Temp/htmls/AN202303161584304136_r_text_miss_r_order/page_1.pdf Element not found within specified time. 270 file:///D:/Temp/htmls/AN202303161584304137_r_mini/page_1.pdf Element not found within specified time. 271 file:///D:/Temp/htmls/AN202303161584304147_r_to_img/page_1.pdf Element not found within specified time. 272 file:///D:/Temp/htmls/AN202303161584304152_r_to_img/page_1.pdf Element not found within specified time. 273 file:///D:/Temp/htmls/AN202303161584304160_r_table/page_1.pdf Element not found within specified time. 274 file:///D:/Temp/htmls/AN202303161584311395_r_invisiable_text/page_103.pdf Element not found within specified time. 275 file:///D:/Temp/htmls/AN202303161584311395_r_invisiable_text/page_2.pdf Element not found within specified time. 276 file:///D:/Temp/htmls/AN202303161584311395_r_invisiable_text/page_84.pdf Element not found within specified time. 277 file:///D:/Temp/htmls/AN202303161584311613_r_table/page_298.pdf Element not found within specified time. 278 file:///D:/Temp/htmls/AN202303161584311613_r_table/page_299.pdf Element not found within specified time. 279 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_1.pdf Element not found within specified time. 280 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_10.pdf Element not found within specified time. 281 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_11.pdf Element not found within specified time. 282 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_13.pdf Element not found within specified time. 283 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_14.pdf Element not found within specified time. 284 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_15.pdf Element not found within specified time. 285 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_17.pdf Element not found within specified time. 286 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_18.pdf Element not found within specified time. 287 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_20.pdf Element not found within specified time. 288 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_21.pdf Element not found within specified time. 289 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_23.pdf Element not found within specified time. 290 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_25.pdf Element not found within specified time. 291 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_26.pdf Element not found within specified time. 292 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_27.pdf Element not found within specified time. 293 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_28.pdf Element not found within specified time. 294 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_29.pdf Element not found within specified time. 295 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_30.pdf Element not found within specified time. 296 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_31.pdf Element not found within specified time. 297 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_32.pdf Element not found within specified time. 298 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_33.pdf Element not found within specified time. 299 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_34.pdf Element not found within specified time. 300 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_35.pdf Element not found within specified time. 301 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_36.pdf Element not found within specified time. 302 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_37.pdf Element not found within specified time. 303 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_38.pdf Element not found within specified time. 304 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_4.pdf Element not found within specified time. 305 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_5.pdf Element not found within specified time. 306 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_7.pdf Element not found within specified time. 307 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_8.pdf Element not found within specified time. 308 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_9.pdf Element not found within specified time. 309 file:///D:/Temp/htmls/AN202303161584313359_r_order/page_1.pdf Element not found within specified time. 310 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_10.pdf Element not found within specified time. 311 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_11.pdf Element not found within specified time. 312 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_12.pdf Element not found within specified time. 313 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_14.pdf Element not found within specified time. 314 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_15.pdf Element not found within specified time. 315 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_17.pdf Element not found within specified time. 316 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_19.pdf Element not found within specified time. 317 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_20.pdf Element not found within specified time. 318 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_21.pdf Element not found within specified time. 319 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_24.pdf Element not found within specified time. 320 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_25.pdf Element not found within specified time. 321 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_26.pdf Element not found within specified time. 322 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_27.pdf Element not found within specified time. 323 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_28.pdf Element not found within specified time. 324 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_29.pdf Element not found within specified time. 325 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_30.pdf Element not found within specified time. 326 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_31.pdf Element not found within specified time. 327 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_33.pdf Element not found within specified time. 328 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_8.pdf Element not found within specified time. 329 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_9.pdf Element not found within specified time. 330 file:///D:/Temp/htmls/AN202303161584313374_r_order/page_1.pdf Element not found within specified time. 331 file:///D:/Temp/htmls/AN202303161584313381_r_order/page_1.pdf Element not found within specified time. 332 file:///D:/Temp/htmls/AN202303161584313457_r_144_invisiable_text/page_139.pdf Element not found within specified time. 333 file:///D:/Temp/htmls/AN202303161584313457_r_144_invisiable_text/page_144.pdf Element not found within specified time. 334 file:///D:/Temp/htmls/AN202303161584313457_r_144_invisiable_text/page_146.pdf Element not found within specified time. 335 file:///D:/Temp/htmls/AN202303161584313457_r_144_invisiable_text/page_148.pdf Element not found within specified time. 336 file:///D:/Temp/htmls/AN202303161584313457_r_144_invisiable_text/page_150.pdf Element not found within specified time. 337 file:///D:/Temp/htmls/AN202303161584313457_r_144_invisiable_text/page_154.pdf Element not found within specified time. 338 file:///D:/Temp/htmls/AN202303161584313457_r_144_invisiable_text/page_159.pdf Element not found within specified time. 339 file:///D:/Temp/htmls/AN202303161584313457_r_144_invisiable_text/page_164.pdf Element not found within specified time. 340 file:///D:/Temp/htmls/AN202303161584313457_r_144_invisiable_text/page_165.pdf Element not found within specified time. 341 file:///D:/Temp/htmls/AN202303161584313457_r_144_invisiable_text/page_166.pdf Element not found within specified time. 342 file:///D:/Temp/htmls/AN202303161584313457_r_144_invisiable_text/page_170.pdf Element not found within specified time. 343 file:///D:/Temp/htmls/AN202303161584313457_r_144_invisiable_text/page_175.pdf Element not found within specified time. 344 file:///D:/Temp/htmls/AN202303161584313465_r_table_invisiable_text/page_32.pdf Element not found within specified time. 345 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_1.pdf Element not found within specified time. 346 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_10.pdf Element not found within specified time. 347 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_16.pdf Element not found within specified time. 348 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_20.pdf Element not found within specified time. 349 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_21.pdf Element not found within specified time. 350 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_22.pdf Element not found within specified time. 351 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_23.pdf Element not found within specified time. 352 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_24.pdf Element not found within specified time. 353 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_25.pdf Element not found within specified time. 354 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_26.pdf Element not found within specified time. 355 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_27.pdf Element not found within specified time. 356 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_31.pdf Element not found within specified time. 357 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_32.pdf Element not found within specified time. 358 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_33.pdf Element not found within specified time. 359 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_34.pdf Element not found within specified time. 360 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_35.pdf Element not found within specified time. 361 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_36.pdf Element not found within specified time. 362 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_9.pdf Element not found within specified time. 363 file:///D:/Temp/htmls/AN202308181595092754_r_misordered/page_1.pdf Element not found within specified time. 364 file:///D:/Temp/htmls/AN202309051597490339_r_mini/page_1.pdf Element not found within specified time. 365 file:///D:/Temp/htmls/AN202309151598686950_r_failed/page_1.pdf Element not found within specified time. 366 file:///D:/Temp/htmls/AN202311151610996963_r_table/page_10.pdf Element not found within specified time. 367 file:///D:/Temp/htmls/AN202311151610996963_r_table/page_113.pdf Element not found within specified time. 368 file:///D:/Temp/htmls/AN202311151610996963_r_table/page_122.pdf Element not found within specified time. 369 file:///D:/Temp/htmls/AN202311151610996963_r_table/page_173.pdf Element not found within specified time. 370 file:///D:/Temp/htmls/AN202311151610996963_r_table/page_178.pdf Element not found within specified time. 371 file:///D:/Temp/htmls/AN202311151610996963_r_table/page_18.pdf Element not found within specified time. 372 file:///D:/Temp/htmls/AN202311151610996963_r_table/page_19.pdf Element not found within specified time. 373 file:///D:/Temp/htmls/AN202311151610996963_r_table/page_198.pdf Element not found within specified time. 374 file:///D:/Temp/htmls/AN202311151610996963_r_table/page_199.pdf Element not found within specified time. 375 file:///D:/Temp/htmls/AN202311151610996963_r_table/page_204.pdf Element not found within specified time. 376 file:///D:/Temp/htmls/AN202311151610996963_r_table/page_205.pdf Element not found within specified time. 377 file:///D:/Temp/htmls/AN202311151610996963_r_table/page_26.pdf Element not found within specified time. 378 file:///D:/Temp/htmls/AN202311151610996963_r_table/page_33.pdf Element not found within specified time. 379 file:///D:/Temp/htmls/AN202311151610996963_r_table/page_5.pdf Element not found within specified time. 380 file:///D:/Temp/htmls/AN202311151610996963_r_table/page_6.pdf Element not found within specified time. 381 file:///D:/Temp/htmls/AN202311151610996963_r_table/page_9.pdf Element not found within specified time. 382 file:///D:/Temp/htmls/AN202311161611013349_r_mini/page_1.pdf Element not found within specified time. 383 file:///D:/Temp/htmls/AN202311161611013353_r_mini_text_miss_r_merge/page_1.pdf Element not found within specified time. 384 file:///D:/Temp/htmls/AN202311161611013355_r/page_1.pdf Element not found within specified time. 385 file:///D:/Temp/htmls/AN202311161611013379_r_text_miss_x/page_1.pdf Element not found within specified time. 386 file:///D:/Temp/htmls/AN202311161611013391_r_mini/page_1.pdf Element not found within specified time. 387 file:///D:/Temp/htmls/AN202311161611013392_r_mini/page_1.pdf Element not found within specified time. 388 file:///D:/Temp/htmls/AN202311161611013398_r_mini/page_1.pdf Element not found within specified time. 389 file:///D:/Temp/htmls/AN202311161611013404_r/page_1.pdf Element not found within specified time. 390 file:///D:/Temp/htmls/AN202311161611019493_r_table/page_187.pdf Element not found within specified time. 391 file:///D:/Temp/htmls/AN202311161611025479_r_order/page_1.pdf Element not found within specified time.
In [35]:
# 关闭 WebDriver
driver.quit()
In [1]:
from tqdm import tqdm
import json
from operator import itemgetter
import math
import os
def px_text_to_float(args):
for arg in args:
yield float(arg.replace("px", ""))
def apply_scale(items, factor):
return [it / factor for it in items]
def to_chars(word,hw , str_id):
(page_height, page_width) = hw
ct = len(word['text'])
w = word['width'] / ct
chars = []
for i in range(ct):
char = {
"matrix": word['matrix'],
"fontname": word['fontname'],
"adv": word['adv'],
"render": 0,
"char_ind": ((str_id, i), i),
"is_horizontal": word['is_horizontal'],
"upright": word['is_horizontal'],
"x0": word['x0'] + i * w,
"y0": page_height - word['bottom'],
"x1": word['x0'] + i * w + w,
"y1": page_height - word['top'],
"width": w,
"height": word['height'],
"size": word['size'],
"object_type": "char",
"page_number": 1,
"stroking_color": 0,
"non_stroking_color": (0, 0, 0, 1),
"gs": {
"linewidth": 0,
"linecap": None,
"linejoin": None,
"miterlimit": None,
"dash": None,
"intent": None,
"flatness": None,
"scolor": 0,
"ncolor": (0, 0, 0, 1),
},
"text": word['text'][i],
"top": word['top'],
"bottom": word['bottom'],
"doctop": word['top'],
"angle": word['angle'],
}
chars.append(char)
return chars
def to_word(textBlock: dict, scaleFactor: float, page, str_id) -> dict:
items = itemgetter("left", "top", "width", "height", 'font-size')
is_horizontal = textBlock["direction"] == "ltr"
left, top, width, height, fontSize = apply_scale(
px_text_to_float(items(textBlock)), scaleFactor
)
for ind, attr in enumerate(("left", "top", "width", "height", 'font-size')):
textBlock[attr] = (left, top, width, height, fontSize)[ind]
matrix = eval(textBlock["transform"].replace("none", "None").replace("matrix", ""))
angle = '0'
if isinstance(matrix, tuple):
angle = str(round(math.degrees(math.atan2(matrix[1], matrix[3]))) % 360)
word = {
"text": textBlock["text"],
"height": height,
"width": width,
"matrix": matrix,
"x0": left,
"x1": left + width,
"top": top,
"bottom": top + height,
"size": height if is_horizontal else width,
"adv": 1.0,
"angle": angle,
# "chars": to_chars(textBlock),
"is_horizontal": is_horizontal,
"fontname": textBlock['font-family'],
}
word["chars"] = to_chars(word, (page['height'], page['width']), str_id)
return word
def convert(words_json_path):
with open(words_json_path, 'r', encoding='utf-8') as fpr:
page = json.load(fpr)
textBlocks = page['textBlocks']
scaleFactor = page['scaleFactor']
page['height'], page['width'] = apply_scale(px_text_to_float((page['height'], page['width'])), scaleFactor)
words = []
for ind, textBlock in enumerate(textBlocks):
if len(textBlock['text'].strip()) < 1 \
or textBlock['width'] == 0 \
or textBlock['height'] == 0:
# print("skip", words_json_path)
continue
words.append(to_word(textBlock, scaleFactor, page, ind))
with open(rf"D:/Temp/words/{os.path.basename(words_json_path)}", 'w', encoding='utf-8') as fpw:
json.dump(words, fpw, ensure_ascii=False)
In [ ]:
src_dir = r'C:\Users\Administrator\Downloads'
for words_json in tqdm(os.listdir(src_dir), desc='items'):
if not words_json.endswith('.json'):
continue
words_json_path = os.path.join(src_dir, words_json)
convert(words_json_path)
In [2]:
# convert(r'D:\Downloads\AN202303161584313457_r_144_invisiable_text_page_139.json')
# convert(r'D:\Downloads\AN202303141584239808_page_1.json')
# convert(r'D:\Downloads\AN202308181595092754_r_misordered_page_1.json')
# convert(r'D:\Downloads\AN202112271537020821_x_page_50.json')
# convert(r'D:\Downloads\AN202311161611027928_x_page_1.json')
convert(r'D:\Downloads\words_page_1.json')
In [29]:
with open(r'D:\Downloads\AN202311161611027928_x_page_1.json', 'r', encoding='utf-8') as fpr:
page = json.load(fpr)
textBlocks = page['textBlocks']
scaleFactor = page['scaleFactor']
page['height'], page['width'] = apply_scale(px_text_to_float((page['height'], page['width'])), scaleFactor)
words = []
for ind, textBlock in enumerate(textBlocks):
if len(textBlock['text']) < 1:
# print("skip", words_json_path)
continue
words.append(to_word(textBlock, scaleFactor, page, ind))
with open(rf"D:/Temp/words/{os.path.basename(words_json)}", 'w', encoding='utf-8') as fpw:
json.dump(words, fpw, ensure_ascii=False)
In [31]:
rf"D:/Temp/words/{os.path.basename(words_json)}"
Out[31]:
'D:/Temp/words/TortoiseGit-LanguagePack-2.15.0.0-64bit-zh_CN.msi.rsa.asc'
In [1]:
import pandas as pd
infocode_pags = pd.read_csv(r'd:/infocodes_totalpages.csv')
In [3]:
infocode_pags.drop_duplicates(['infocode'])
Out[3]:
infocode | total_pages | |
---|---|---|
0 | AN202311161611028130 | 3 |
1 | AN202311161611028247 | 1 |
2 | AN202311161611028107 | 1 |
3 | AN202311161611028083 | 3 |
4 | AN202311161611028082 | 2 |
... | ... | ... |
995 | AN202311151610993566 | 41 |
996 | AN202311151610993569 | 34 |
997 | AN202311151610993584 | 8 |
998 | AN202311151610993563 | 20 |
999 | AN202311151610993364 | 3 |
1000 rows × 2 columns
In [24]:
import requests
cookies = {
'sid': 'ef0e68df-ce10-4835-ac76-ce5f9978f831',
'qgqp_b_id': 'af916ddbf277df5130a755f5ec7ea73c',
'st_pvi': '97517784234262',
'st_sp': '2023-09-11%2010%3A50%3A09',
'st_inirUrl': '',
'authorized-token': '{%22expires%22:null}',
'sidebarStatus': '0',
'token': 'MUEyNEM3QUEzMjQwQjk4MzE2Q0RCN0E0N0QyMDhENTU2QUY4QzBERUMxQTk2MjE3QUI3OTQ4MDY4NERGOTMzNQ==',
'pub_ticket': 'b03b1663-1f22-40c0-910a-1cfdda7da616',
'pub_empId': '210891',
'CAS_badge': '210891',
'CAS_name': '%E6%A2%81%E5%BE%B7%E8%83%9C',
}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/119.0',
'Accept': 'application/json, text/plain, */*',
'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
# 'Accept-Encoding': 'gzip, deflate, br',
'Content-Type': 'application/json',
'X-Requested-With': 'XMLHttpRequest',
'Authorization': 'Bearer undefined',
'Origin': 'https://new-ucp-mgr.eastmoney.com',
'Connection': 'keep-alive',
'Referer': 'https://new-ucp-mgr.eastmoney.com/',
# 'Cookie': 'sid=ef0e68df-ce10-4835-ac76-ce5f9978f831; qgqp_b_id=af916ddbf277df5130a755f5ec7ea73c; st_pvi=97517784234262; st_sp=2023-09-11%2010%3A50%3A09; st_inirUrl=; authorized-token={%22expires%22:null}; sidebarStatus=0; token=MUEyNEM3QUEzMjQwQjk4MzE2Q0RCN0E0N0QyMDhENTU2QUY4QzBERUMxQTk2MjE3QUI3OTQ4MDY4NERGOTMzNQ==; pub_ticket=b03b1663-1f22-40c0-910a-1cfdda7da616; pub_empId=210891; CAS_badge=210891; CAS_name=%E6%A2%81%E5%BE%B7%E8%83%9C',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
'Pragma': 'no-cache',
'Cache-Control': 'no-cache',
}
json_data = {
'name': '10.150.108.203',
'type': 'S',
'code': '10.150.108.203',
'ip': '10.150.108.203',
'zoneId': 19,
'healthState': 'G',
}
for i in range(10, 11):
json_data['name'] = f'10.150.108.2{i}'
json_data['code'] = f'10.150.108.2{i}'
json_data['ip'] = f'10.150.108.2{i}'
response = requests.post('https://new-ucp-mgr.eastmoney.com/ucp-mgr/api/device/', cookies=cookies, headers=headers, json=json_data)
# Note: json_data will not be serialized by requests
# exactly as it was in the original request.
#data = '{"name":"10.150.108.203","type":"S","code":"10.150.108.203","ip":"10.150.108.203","zoneId":19,"healthState":"G"}'
#response = requests.post('https://new-ucp-mgr.eastmoney.com/ucp-mgr/api/device/', cookies=cookies, headers=headers, data=data)
In [20]:
import os
import shutil
from lds_util import *
base_dir = r'D:\Temp\htmls'
ct = 0
has_read = 0
infocodes = []
for infocode in os.listdir(base_dir):
if '_x' in infocode:
ct += 1
shutil.move(os.path.join(base_dir, infocode), f'd:/Temp/xxx/{infocode}')
In [14]:
import os
import shutil
from lds_util import *
base_dir = r'D:\Temp\htmls'
ct = 0
has_read = 0
infocodes = []
for infocode in os.listdir(base_dir):
if not '_' in infocode:
ct += 1
infocodes.append(infocode)
else:
has_read += 1
print(ct, has_read)
102 170
In [13]:
import pandas as pd
pd.DataFrame(infocodes).to_excel("todo_infocodes.xlsx")
In [9]:
import os
import shutil
from lds_util import *
base_dir = r'D:\Temp\htmls'
for infocode in os.listdir(base_dir):
print(infocode)
pages_path = os.path.join(base_dir, infocode)
if os.path.exists(os.path.join(base_dir, infocode, 'custom_srot_text.txt')):
os.remove(os.path.join(base_dir, infocode, 'custom_srot_text.txt'))
if os.path.exists(os.path.join(base_dir, infocode, 'use_text_flow_text.txt')):
os.remove(os.path.join(base_dir, infocode, 'use_text_flow_text.txt'))
if os.path.exists(pages_path):
for dirpath, dirnames, filenames in os.walk(os.path.join(r'D:\Temp\compare_result', infocode.split("_")[0])):
for file in filenames:
if file.endswith('txt'):
shutil.copyfile(os.path.join(dirpath, file), os.path.join(base_dir, infocode, file.replace('.txt', f'{os.path.basename(dirpath)}.txt')))
AN202112271537020821_x AN202302161583234765_x_z AN202302161583234767_z_z AN202302161583234769_z_z AN202302161583234772_z_z AN202302161583234774_z_z AN202302161583234775_z_z AN202302161583234780_z_z AN202302161583234781_z AN202302161583234782_z AN202302161583234785_z AN202302161583234787 AN202302161583234788_z AN202302161583234790_z AN202302161583234800_z AN202302161583234803_z AN202302161583234808_x_z AN202302161583234814_z AN202302161583234822_z AN202302161583234825_z AN202302161583234832_z AN202302161583234833_z AN202302161583234836_z AN202302161583234851_z AN202302161583234860_z AN202302161583234868_z AN202302161583234872_z AN202302161583234875_z AN202302161583234876_z AN202302161583234881_z AN202302161583234888_z AN202302161583234899_z AN202302161583234916_z AN202302161583234923_z AN202302161583234927_z AN202302161583234933_z AN202302161583234940_z AN202302161583234942_z AN202302161583234960_z AN202302161583234974_z AN202302161583234980_z AN202302161583234981_z AN202302161583234985_z AN202302161583234994_z AN202302161583234995_z AN202302161583235545_z AN202302161583235569_z AN202302161583235580_z AN202302161583235590_z AN202302161583235607_z AN202302161583235608_z AN202302161583235615_z AN202302161583235620_z AN202302161583235623_z AN202302161583235636_z AN202302161583235639_z AN202302161583235642_z AN202302161583235643_z AN202302161583235645_z AN202302161583235649_z AN202302161583235660_z AN202302161583235690_z AN202302161583235693_z AN202302161583235710_z AN202302161583235711_z AN202302161583235713_z AN202302161583235728_z AN202302161583235756_z AN202302161583235759_z AN202302161583235762_z AN202302161583235774_z AN202302161583235782_z AN202302161583235784_z AN202302161583235795_z AN202302161583235797_z AN202302161583235811_z AN202302161583235814_z AN202302161583235818_z AN202302161583235819_z AN202302161583235822_z AN202302161583235830_z AN202302161583235834_z AN202302161583235838_z AN202302161583235840_z AN202302161583235841_z AN202302161583235847_z AN202302161583235855_z AN202302161583235859_z AN202302161583235860_z AN202302161583235870_z AN202302161583235880_z AN202302161583235882_z AN202302161583235883_z AN202302161583235889_z AN202302161583235896_z AN202302161583235908 AN202302161583235915_z AN202302161583235921_z AN202302161583235928_z AN202302161583235940_z AN202302161583235947_z AN202302161583235951_z AN202302161583235961_z AN202302161583235963_z AN202302161583236537_z AN202302161583236544_z AN202302161583236561_z AN202302161583236562_z AN202302161583236565_z AN202302161583236582_z AN202302281583881638 AN202303011583892542 AN202303011583892545 AN202303011583892548 AN202303011583892551 AN202303011583892552 AN202303011583892555 AN202303011583892556 AN202303011583892578 AN202303011583892582 AN202303011583892586 AN202303011583892589 AN202303011583892592 AN202303011583892593 AN202303011583892598 AN202303011583892601_z AN202303011583897916 AN202303011583898495 AN202303011583898575 AN202303011583898653 AN202303011583898726 AN202303011583899064 AN202303011583901479 AN202303011583903892 AN202303011583910026 AN202303011583910543 AN202303011583913104 AN202303011583915212 AN202303011583917131 AN202303011583920374 AN202303011583920769 AN202303011583920817 AN202303011583920850 AN202303011583922005 AN202303011583922259 AN202303011583922267 AN202303011583922619 AN202303011583922685_z AN202303011583922751 AN202303011583923603 AN202303011583923803 AN202303011583924194 AN202303011583924228 AN202303011583924785 AN202303011583924786 AN202303011583924970_z AN202303011583925000 AN202303011583925002 AN202303011583925202 AN202303011583925291 AN202303011583925660 AN202303011583925672 AN202303011583925685 AN202303011583925830 AN202303011583926787 AN202303011583927517 AN202303141584239808 AN202303151584289331 AN202303151584289605 AN202303151584289611 AN202303151584289616 AN202303151584291140 AN202303151584292038 AN202303151584292945 AN202303151584293181 AN202303151584294223_z AN202303151584294225 AN202303151584295721 AN202303151584295941 AN202303151584296409 AN202303151584296448_z AN202303151584296823 AN202303161584298993 AN202303161584303367 AN202303161584303376 AN202303161584303443 AN202303161584303902 AN202303161584303903 AN202303161584303904_z AN202303161584303905 AN202303161584303906 AN202303161584303907 AN202303161584303908 AN202303161584303911 AN202303161584303913 AN202303161584303915_z AN202303161584303916 AN202303161584303917 AN202303161584303919 AN202303161584303920 AN202303161584303922 AN202303161584303924_z AN202303161584303925 AN202303161584303928 AN202303161584303930_z AN202303161584303934_z AN202303161584304119 AN202303161584304121 AN202303161584304123 AN202303161584304125 AN202303161584304127 AN202303161584304128 AN202303161584304132 AN202303161584304134 AN202303161584304136 AN202303161584304137 AN202303161584304140_z AN202303161584304147 AN202303161584304152 AN202303161584304157_z AN202303161584304160 AN202303161584304166_z AN202303161584311395 AN202303161584311613 AN202303161584312005 AN202303161584312613 AN202303161584313298 AN202303161584313359 AN202303161584313361 AN202303161584313374 AN202303161584313376 AN202303161584313381 AN202303161584313385 AN202303161584313457 AN202303161584313465 AN202303161584313810 AN202303161584313855 AN202303161584314390 AN202303161584315523 AN202303161584315719 AN202303161584315766 AN202303161584316679_z AN202303161584317034 AN202303161584318554 AN202309051597490339 AN202309151598686950 AN202311151610996963 AN202311151610996966 AN202311161611013349 AN202311161611013353 AN202311161611013355 AN202311161611013377_z AN202311161611013379 AN202311161611013391 AN202311161611013392 AN202311161611013398 AN202311161611013404 AN202311161611013773 AN202311161611018540 AN202311161611019423 AN202311161611019489 AN202311161611019493 AN202311161611019511 AN202311161611019537 AN202311161611019545 AN202311161611021874 AN202311161611025479 AN202311161611026592 AN202311161611026594 AN202311161611027171 AN202311161611027185 AN202311161611027928_x
In [4]:
import os
import shutil
from lds_util import *
base_dir = r'D:\Temp\htmls'
for infocode in os.listdir(base_dir):
pages_path = os.path.join(base_dir, infocode)
if os.path.exists(pages_path):
if tuple(filter(lambda it: it.endswith('.pdf'),os.listdir(pages_path))) == ('page_1.pdf', 'page_2.pdf'):
shutil.move(pages_path, pages_path + '_z')
In [2]:
import os
import shutil
from lds_util import *
base_dir = r'D:\Temp\compare_result'
for infocode in os.listdir(base_dir):
pages_path = os.path.join(base_dir, infocode, "pages")
if os.path.exists(pages_path):
for file in os.listdir(pages_path):
if file.endswith('.html') or file.endswith('.pdf'):
shutil.copyfile(os.path.join(pages_path, file),
mkpdir(fr'd:/Temp/htmls/{infocode}/{file}'))
makedirs d:/Temp/htmls/AN202112271537020821 makedirs d:/Temp/htmls/AN202302161583234765 makedirs d:/Temp/htmls/AN202302161583234767 makedirs d:/Temp/htmls/AN202302161583234769 makedirs d:/Temp/htmls/AN202302161583234772 makedirs d:/Temp/htmls/AN202302161583234774 makedirs d:/Temp/htmls/AN202302161583234775 makedirs d:/Temp/htmls/AN202302161583234780 makedirs d:/Temp/htmls/AN202302161583234781 makedirs d:/Temp/htmls/AN202302161583234782 makedirs d:/Temp/htmls/AN202302161583234785 makedirs d:/Temp/htmls/AN202302161583234787 makedirs d:/Temp/htmls/AN202302161583234788 makedirs d:/Temp/htmls/AN202302161583234790 makedirs d:/Temp/htmls/AN202302161583234800 makedirs d:/Temp/htmls/AN202302161583234803 makedirs d:/Temp/htmls/AN202302161583234808 makedirs d:/Temp/htmls/AN202302161583234814 makedirs d:/Temp/htmls/AN202302161583234822 makedirs d:/Temp/htmls/AN202302161583234825 makedirs d:/Temp/htmls/AN202302161583234832 makedirs d:/Temp/htmls/AN202302161583234833 makedirs d:/Temp/htmls/AN202302161583234836 makedirs d:/Temp/htmls/AN202302161583234851 makedirs d:/Temp/htmls/AN202302161583234860 makedirs d:/Temp/htmls/AN202302161583234868 makedirs d:/Temp/htmls/AN202302161583234872 makedirs d:/Temp/htmls/AN202302161583234875 makedirs d:/Temp/htmls/AN202302161583234876 makedirs d:/Temp/htmls/AN202302161583234881 makedirs d:/Temp/htmls/AN202302161583234888 makedirs d:/Temp/htmls/AN202302161583234899 makedirs d:/Temp/htmls/AN202302161583234916 makedirs d:/Temp/htmls/AN202302161583234923 makedirs d:/Temp/htmls/AN202302161583234927 makedirs d:/Temp/htmls/AN202302161583234933 makedirs d:/Temp/htmls/AN202302161583234940 makedirs d:/Temp/htmls/AN202302161583234942 makedirs d:/Temp/htmls/AN202302161583234960 makedirs d:/Temp/htmls/AN202302161583234974 makedirs d:/Temp/htmls/AN202302161583234980 makedirs d:/Temp/htmls/AN202302161583234981 makedirs d:/Temp/htmls/AN202302161583234985 makedirs d:/Temp/htmls/AN202302161583234994 makedirs d:/Temp/htmls/AN202302161583234995 makedirs d:/Temp/htmls/AN202302161583235545 makedirs d:/Temp/htmls/AN202302161583235569 makedirs d:/Temp/htmls/AN202302161583235580 makedirs d:/Temp/htmls/AN202302161583235590 makedirs d:/Temp/htmls/AN202302161583235607 makedirs d:/Temp/htmls/AN202302161583235608 makedirs d:/Temp/htmls/AN202302161583235615 makedirs d:/Temp/htmls/AN202302161583235620 makedirs d:/Temp/htmls/AN202302161583235623 makedirs d:/Temp/htmls/AN202302161583235636 makedirs d:/Temp/htmls/AN202302161583235639 makedirs d:/Temp/htmls/AN202302161583235642 makedirs d:/Temp/htmls/AN202302161583235643 makedirs d:/Temp/htmls/AN202302161583235645 makedirs d:/Temp/htmls/AN202302161583235649 makedirs d:/Temp/htmls/AN202302161583235660 makedirs d:/Temp/htmls/AN202302161583235690 makedirs d:/Temp/htmls/AN202302161583235693 makedirs d:/Temp/htmls/AN202302161583235710 makedirs d:/Temp/htmls/AN202302161583235711 makedirs d:/Temp/htmls/AN202302161583235713 makedirs d:/Temp/htmls/AN202302161583235728 makedirs d:/Temp/htmls/AN202302161583235756 makedirs d:/Temp/htmls/AN202302161583235759 makedirs d:/Temp/htmls/AN202302161583235762 makedirs d:/Temp/htmls/AN202302161583235774 makedirs d:/Temp/htmls/AN202302161583235782 makedirs d:/Temp/htmls/AN202302161583235784 makedirs d:/Temp/htmls/AN202302161583235795 makedirs d:/Temp/htmls/AN202302161583235797 makedirs d:/Temp/htmls/AN202302161583235811 makedirs d:/Temp/htmls/AN202302161583235814 makedirs d:/Temp/htmls/AN202302161583235818 makedirs d:/Temp/htmls/AN202302161583235819 makedirs d:/Temp/htmls/AN202302161583235822 makedirs d:/Temp/htmls/AN202302161583235830 makedirs d:/Temp/htmls/AN202302161583235834 makedirs d:/Temp/htmls/AN202302161583235838 makedirs d:/Temp/htmls/AN202302161583235840 makedirs d:/Temp/htmls/AN202302161583235841 makedirs d:/Temp/htmls/AN202302161583235847 makedirs d:/Temp/htmls/AN202302161583235855 makedirs d:/Temp/htmls/AN202302161583235859 makedirs d:/Temp/htmls/AN202302161583235860 makedirs d:/Temp/htmls/AN202302161583235870 makedirs d:/Temp/htmls/AN202302161583235880 makedirs d:/Temp/htmls/AN202302161583235882 makedirs d:/Temp/htmls/AN202302161583235883 makedirs d:/Temp/htmls/AN202302161583235889 makedirs d:/Temp/htmls/AN202302161583235896 makedirs d:/Temp/htmls/AN202302161583235908 makedirs d:/Temp/htmls/AN202302161583235915 makedirs d:/Temp/htmls/AN202302161583235921 makedirs d:/Temp/htmls/AN202302161583235928 makedirs d:/Temp/htmls/AN202302161583235940 makedirs d:/Temp/htmls/AN202302161583235947 makedirs d:/Temp/htmls/AN202302161583235951 makedirs d:/Temp/htmls/AN202302161583235961 makedirs d:/Temp/htmls/AN202302161583235963 makedirs d:/Temp/htmls/AN202302161583236537 makedirs d:/Temp/htmls/AN202302161583236544 makedirs d:/Temp/htmls/AN202302161583236561 makedirs d:/Temp/htmls/AN202302161583236562 makedirs d:/Temp/htmls/AN202302161583236565 makedirs d:/Temp/htmls/AN202302161583236582 makedirs d:/Temp/htmls/AN202302281583881638 makedirs d:/Temp/htmls/AN202303011583892542 makedirs d:/Temp/htmls/AN202303011583892545 makedirs d:/Temp/htmls/AN202303011583892548 makedirs d:/Temp/htmls/AN202303011583892551 makedirs d:/Temp/htmls/AN202303011583892552 makedirs d:/Temp/htmls/AN202303011583892555 makedirs d:/Temp/htmls/AN202303011583892556 makedirs d:/Temp/htmls/AN202303011583892578 makedirs d:/Temp/htmls/AN202303011583892582 makedirs d:/Temp/htmls/AN202303011583892586 makedirs d:/Temp/htmls/AN202303011583892589 makedirs d:/Temp/htmls/AN202303011583892592 makedirs d:/Temp/htmls/AN202303011583892593 makedirs d:/Temp/htmls/AN202303011583892598 makedirs d:/Temp/htmls/AN202303011583892601 makedirs d:/Temp/htmls/AN202303011583897916 makedirs d:/Temp/htmls/AN202303011583898495 makedirs d:/Temp/htmls/AN202303011583898575 makedirs d:/Temp/htmls/AN202303011583898653 makedirs d:/Temp/htmls/AN202303011583898726 makedirs d:/Temp/htmls/AN202303011583899064 makedirs d:/Temp/htmls/AN202303011583901479 makedirs d:/Temp/htmls/AN202303011583903892 makedirs d:/Temp/htmls/AN202303011583910026 makedirs d:/Temp/htmls/AN202303011583910543 makedirs d:/Temp/htmls/AN202303011583913104 makedirs d:/Temp/htmls/AN202303011583915212 makedirs d:/Temp/htmls/AN202303011583917131 makedirs d:/Temp/htmls/AN202303011583920374 makedirs d:/Temp/htmls/AN202303011583920769 makedirs d:/Temp/htmls/AN202303011583920817 makedirs d:/Temp/htmls/AN202303011583920850 makedirs d:/Temp/htmls/AN202303011583922005 makedirs d:/Temp/htmls/AN202303011583922259 makedirs d:/Temp/htmls/AN202303011583922267 makedirs d:/Temp/htmls/AN202303011583922619 makedirs d:/Temp/htmls/AN202303011583922685 makedirs d:/Temp/htmls/AN202303011583922751 makedirs d:/Temp/htmls/AN202303011583923603 makedirs d:/Temp/htmls/AN202303011583923803 makedirs d:/Temp/htmls/AN202303011583924194 makedirs d:/Temp/htmls/AN202303011583924228 makedirs d:/Temp/htmls/AN202303011583924785 makedirs d:/Temp/htmls/AN202303011583924786 makedirs d:/Temp/htmls/AN202303011583924970 makedirs d:/Temp/htmls/AN202303011583925000 makedirs d:/Temp/htmls/AN202303011583925002 makedirs d:/Temp/htmls/AN202303011583925202 makedirs d:/Temp/htmls/AN202303011583925291 makedirs d:/Temp/htmls/AN202303011583925660 makedirs d:/Temp/htmls/AN202303011583925672 makedirs d:/Temp/htmls/AN202303011583925685 makedirs d:/Temp/htmls/AN202303011583925830 makedirs d:/Temp/htmls/AN202303011583926787 makedirs d:/Temp/htmls/AN202303011583927517 makedirs d:/Temp/htmls/AN202303141584239808 makedirs d:/Temp/htmls/AN202303151584289331 makedirs d:/Temp/htmls/AN202303151584289605 makedirs d:/Temp/htmls/AN202303151584289611 makedirs d:/Temp/htmls/AN202303151584289616 makedirs d:/Temp/htmls/AN202303151584291140 makedirs d:/Temp/htmls/AN202303151584292038 makedirs d:/Temp/htmls/AN202303151584292945 makedirs d:/Temp/htmls/AN202303151584293181 makedirs d:/Temp/htmls/AN202303151584294223 makedirs d:/Temp/htmls/AN202303151584294225 makedirs d:/Temp/htmls/AN202303151584295721 makedirs d:/Temp/htmls/AN202303151584295941 makedirs d:/Temp/htmls/AN202303151584296409 makedirs d:/Temp/htmls/AN202303151584296448 makedirs d:/Temp/htmls/AN202303151584296823 makedirs d:/Temp/htmls/AN202303161584298993 makedirs d:/Temp/htmls/AN202303161584303367 makedirs d:/Temp/htmls/AN202303161584303376 makedirs d:/Temp/htmls/AN202303161584303443 makedirs d:/Temp/htmls/AN202303161584303902 makedirs d:/Temp/htmls/AN202303161584303903 makedirs d:/Temp/htmls/AN202303161584303904 makedirs d:/Temp/htmls/AN202303161584303905 makedirs d:/Temp/htmls/AN202303161584303906 makedirs d:/Temp/htmls/AN202303161584303907 makedirs d:/Temp/htmls/AN202303161584303908 makedirs d:/Temp/htmls/AN202303161584303911 makedirs d:/Temp/htmls/AN202303161584303913 makedirs d:/Temp/htmls/AN202303161584303915 makedirs d:/Temp/htmls/AN202303161584303916 makedirs d:/Temp/htmls/AN202303161584303917 makedirs d:/Temp/htmls/AN202303161584303919 makedirs d:/Temp/htmls/AN202303161584303920 makedirs d:/Temp/htmls/AN202303161584303922 makedirs d:/Temp/htmls/AN202303161584303924 makedirs d:/Temp/htmls/AN202303161584303925 makedirs d:/Temp/htmls/AN202303161584303928 makedirs d:/Temp/htmls/AN202303161584303930 makedirs d:/Temp/htmls/AN202303161584303934 makedirs d:/Temp/htmls/AN202303161584304119 makedirs d:/Temp/htmls/AN202303161584304121 makedirs d:/Temp/htmls/AN202303161584304123 makedirs d:/Temp/htmls/AN202303161584304125 makedirs d:/Temp/htmls/AN202303161584304127 makedirs d:/Temp/htmls/AN202303161584304128 makedirs d:/Temp/htmls/AN202303161584304132 makedirs d:/Temp/htmls/AN202303161584304134 makedirs d:/Temp/htmls/AN202303161584304136 makedirs d:/Temp/htmls/AN202303161584304137 makedirs d:/Temp/htmls/AN202303161584304140 makedirs d:/Temp/htmls/AN202303161584304147 makedirs d:/Temp/htmls/AN202303161584304152 makedirs d:/Temp/htmls/AN202303161584304157 makedirs d:/Temp/htmls/AN202303161584304160 makedirs d:/Temp/htmls/AN202303161584304166 makedirs d:/Temp/htmls/AN202303161584311395 makedirs d:/Temp/htmls/AN202303161584311613 makedirs d:/Temp/htmls/AN202303161584312005 makedirs d:/Temp/htmls/AN202303161584312613 makedirs d:/Temp/htmls/AN202303161584313298 makedirs d:/Temp/htmls/AN202303161584313359 makedirs d:/Temp/htmls/AN202303161584313361 makedirs d:/Temp/htmls/AN202303161584313374 makedirs d:/Temp/htmls/AN202303161584313376 makedirs d:/Temp/htmls/AN202303161584313381 makedirs d:/Temp/htmls/AN202303161584313385 makedirs d:/Temp/htmls/AN202303161584313457 makedirs d:/Temp/htmls/AN202303161584313465 makedirs d:/Temp/htmls/AN202303161584313810 makedirs d:/Temp/htmls/AN202303161584313855 makedirs d:/Temp/htmls/AN202303161584314390 makedirs d:/Temp/htmls/AN202303161584315523 makedirs d:/Temp/htmls/AN202303161584315719 makedirs d:/Temp/htmls/AN202303161584315766 makedirs d:/Temp/htmls/AN202303161584316679 makedirs d:/Temp/htmls/AN202303161584317034 makedirs d:/Temp/htmls/AN202303161584318554 makedirs d:/Temp/htmls/AN202309051597490339 makedirs d:/Temp/htmls/AN202309151598686950 makedirs d:/Temp/htmls/AN202311151610996963 makedirs d:/Temp/htmls/AN202311151610996966 makedirs d:/Temp/htmls/AN202311161611013349 makedirs d:/Temp/htmls/AN202311161611013353 makedirs d:/Temp/htmls/AN202311161611013355 makedirs d:/Temp/htmls/AN202311161611013377 makedirs d:/Temp/htmls/AN202311161611013379 makedirs d:/Temp/htmls/AN202311161611013391 makedirs d:/Temp/htmls/AN202311161611013392 makedirs d:/Temp/htmls/AN202311161611013398 makedirs d:/Temp/htmls/AN202311161611013404 makedirs d:/Temp/htmls/AN202311161611013773 makedirs d:/Temp/htmls/AN202311161611018540 makedirs d:/Temp/htmls/AN202311161611019423 makedirs d:/Temp/htmls/AN202311161611019489 makedirs d:/Temp/htmls/AN202311161611019493 makedirs d:/Temp/htmls/AN202311161611019511 makedirs d:/Temp/htmls/AN202311161611019537 makedirs d:/Temp/htmls/AN202311161611019545 makedirs d:/Temp/htmls/AN202311161611021874 makedirs d:/Temp/htmls/AN202311161611025479 makedirs d:/Temp/htmls/AN202311161611026592 makedirs d:/Temp/htmls/AN202311161611026594 makedirs d:/Temp/htmls/AN202311161611027171 makedirs d:/Temp/htmls/AN202311161611027185 makedirs d:/Temp/htmls/AN202311161611027928
In [1]:
import os
import json
import shutil
basedir = r'D:\Temp\p2h'
# if os.path.exists(r'd:/Temp/compare_result'):
# shutil.rmtree('d:/Temp/compare_result')
def list_infocodes() -> list[str]:
yield from os.listdir(basedir)
def compare_words(custom_srot_words, use_text_flow_words) -> tuple[bool, str, str]:
custom_srot_wd_ls = list(map(lambda it: it['text'], custom_srot_words))
use_text_flow_wd_ls = list(map(lambda it: it['text'], use_text_flow_words))
target_text = "".join(use_text_flow_wd_ls).replace(' ', '')
# TODO
same = True
for wd in custom_srot_wd_ls:
if wd.replace(' ', '') not in target_text:
same = False
break
# same = "".join(custom_srot_wd_ls).replace(" ", '') == "".join(use_text_flow_wd_ls).replace(" ", '')
return (same,
"\n".join(custom_srot_wd_ls),
"\n".join(use_text_flow_wd_ls)
)
from lds_util import *
not_same_ct = 0
all_ct = 0
infocodes = set()
for infocode in '''AN202303011583892542
AN202303011583925256
AN202309051597490339
AN202309151598686950
AN202112271537020821
AN202303141584239808
AN202304201585626705'''.splitlines():
# for infocode in list_infocodes():
pdf_dir = fr'{basedir}\{infocode}'
for root, dirs, files in os.walk(pdf_dir):
for file in files:
if 'use_text_flow_words' in file:
try:
fw1 = os.path.join(root, file.replace('use_text_flow_words', 'custom_srot_words'))
with open(fw1, 'r', encoding='utf-8') as fpr:
custom_srot_words = json.load(fpr)
custom_srot_words.sort(key=lambda it: int(it['top']/3))
with open(fw1.replace(".json", "_text.txt"), 'w', encoding='utf-8') as fpw:
fpw.write('\n'.join(map(lambda it: it['text'], custom_srot_words)))
fw2 = os.path.join(root, file)
with open(fw2, 'r', encoding='utf-8') as fpr:
use_text_flow_words = json.load(fpr)
use_text_flow_words.sort(key=lambda it: int(it['top']/3))
with open(fw2.replace(".json", "_text.txt"), 'w', encoding='utf-8') as fpw:
fpw.write('\n'.join(map(lambda it: it['text'], use_text_flow_words)))
i = file.replace('use_text_flow_words_', '').replace('.json', '')
i = int(i) + 1
same, custom_srot_text, use_text_flow_text = compare_words(custom_srot_words, use_text_flow_words)
infocodes.add(infocode)
if not same:
not_same_ct += 1
print(f'{not_same_ct}/{all_ct}', "="*15, infocode, i, '='*15)
with open(mkpdir(f'd:/Temp/compare_result/{infocode}/{i}/custom_srot_text.txt'), 'w', encoding='utf-8') as fpw:
fpw.write(custom_srot_text)
with open(mkpdir(f'd:/Temp/compare_result/{infocode}/{i}/use_text_flow_text.txt'), 'w', encoding='utf-8') as fpw:
fpw.write(use_text_flow_text)
shutil.copyfile(fw1, mkpdir(f'd:/Temp/compare_result/{infocode}/{i}/custom_srot_text_{i}.json'))
shutil.copyfile(fw1, mkpdir(f'd:/Temp/compare_result/{infocode}/{i}/use_text_flow_text_{i}.json'))
shutil.copyfile(fr'D:\Temp\p2h\{infocode}\{infocode}.pdf', mkpdir(f'd:/Temp/compare_result/{infocode}/{infocode}.pdf'))
# print(custom_srot_text)
# print("-"*15, '-'*15)
# print(use_text_flow_text)
elif len(custom_srot_words) > 0:
all_ct += 1
except Exception as e:
print(e)
Expecting value: line 1 column 1 (char 0) Expecting value: line 5 column 2 (char 9) Expecting value: line 4 column 2 (char 7) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) 1/5 =============== AN202309051597490339 1 =============== Expecting value: line 1 column 1 (char 0) 2/5 =============== AN202309151598686950 1 =============== Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) 3/49 =============== AN202112271537020821 50 =============== Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) 4/84 =============== AN202112271537020821 82 =============== Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0) 5/87 =============== AN202303141584239808 1 =============== Expecting value: line 1 column 1 (char 0) Expecting value: line 1 column 1 (char 0)