从 pdf.js 提取数据¶

In [16]:
var btn = document.createElement('div')
btn.style = "position: absolute;right:10px;top:10px;width:100px;height:50px;color:#000;z-index:99999;border:1px solid #ddd;"
btn.id = 'download'
btn.innerText = '下载字符信息'
var viewer = document.querySelector('#viewer .page')
viewer.appendChild(btn)
btn.onclick = function () {
    let loc = location.href;
    let infocode = (loc.match(/AN.*?(?=\/)/g) || ['words'])[0]
    let page = (loc.match(/(?<=page_)\d+/g) || ['1'])[0]
    let attrs = ['font-size', 'font-family', 'color', 'top', 'left', 'width', 'height', 'transform', 'direction'];
    // 这是一个简化的示例,需要根据你的实际 HTML 结构和 PDF.js 用法进行调整
    let factor = parseFloat(document.getElementById("viewer").style.getPropertyValue("--scale-factor"));
    let viewerComputedStyle = window.getComputedStyle(document.querySelector('div.page'));
    let height = viewerComputedStyle.getPropertyValue('height');
    let width = viewerComputedStyle.getPropertyValue('width');

    // 获取渲染后的 PDF.js 页面容器
    let pdfContainer = document.getElementById('viewerContainer'); // 替换成你的 PDF 容器元素 ID

    // 获取所有包含文本内容的元素
    let textElements = pdfContainer.querySelectorAll('.textLayer span[style*="left"]');

    let textBlocks = [];

    // 遍历文本元素并获取 computed style
    textElements.forEach(textElement => {
        let textContent = textElement.textContent;
        let computedStyle = window.getComputedStyle(textElement);

        // 组织每个文本块的信息
        let textBlock = {
            "text": textContent,
        };
        // 遍历 attrs 数组,往 textBlock 添加样式属性
        attrs.forEach(attr => {
            // 获取属性值,并根据需要转换为数值或字符串
            let value = computedStyle.getPropertyValue(attr);
            // 如果需要,进行适当的处理或转换
            textBlock[attr] = value;
        });
        let bbox = textElement.getBoundingClientRect();
        textBlock['width'] = `${bbox.width}px`
        textBlock['height'] = `${bbox.height}px`
        let match = computedStyle.getPropertyValue('transform').match(/rotate\(([-]?\d+\.?\d*)deg\)/);
        if (match) {
            const rotationAngle = parseFloat(match[1]);
            textBlock['angle'] = rotationAngle;
        } 
        if (bbox.width != 0 && bbox.height != 0) {
            textBlocks.push(textBlock);
        }
    });

    // 将数据输出为 JSON 文件
    // let jsonData = JSON.stringify({textBlocks:textBlocks, "scaleFactor":factor, height: height, width: width}, null, 2);
    let jsonData = JSON.stringify({ textBlocks: textBlocks, "scaleFactor": factor, height: height, width: width });

    // 生成一个 Blob 对象并创建下载链接
    let blob = new Blob([jsonData], { type: 'application/json' });
    let url = URL.createObjectURL(blob);

    // 创建一个链接并模拟点击下载
    let a = document.createElement('a');
    a.href = url;
    a.download = `${infocode}_page_${page}.json`;
    document.body.appendChild(a);
    a.click();
    document.body.removeChild(a);
}
  Cell In[16], line 1
    var btn = document.createElement('div')
        ^
SyntaxError: invalid syntax
In [ ]:
let attrs = ['font-size', 'font-family', 'color', 'top', 'left', 'width', 'height', 'transform', 'direction'];
// 这是一个简化的示例,需要根据你的实际 HTML 结构和 PDF.js 用法进行调整
let factor = parseFloat(document.getElementById("viewer").style.getPropertyValue("--scale-factor"));
let viewerComputedStyle = window.getComputedStyle(document.querySelector('div.page'));
let height = viewerComputedStyle.getPropertyValue('height');
let width = viewerComputedStyle.getPropertyValue('width');

// 获取渲染后的 PDF.js 页面容器
let pdfContainer = document.getElementById('viewerContainer'); // 替换成你的 PDF 容器元素 ID

// 获取所有包含文本内容的元素
let textElements = pdfContainer.querySelectorAll('.textLayer span[style*="left"]');

let textBlocks = [];

// 遍历文本元素并获取 computed style
textElements.forEach(textElement => {
    let textContent = textElement.textContent;
    let computedStyle = window.getComputedStyle(textElement);

    // 组织每个文本块的信息
    let textBlock = {
        "text": textContent,
    };
    // 遍历 attrs 数组,往 textBlock 添加样式属性
    attrs.forEach(attr => {
        // 获取属性值,并根据需要转换为数值或字符串
        let value = computedStyle.getPropertyValue(attr);
        // 如果需要,进行适当的处理或转换
        textBlock[attr] = value;
    });
    let bbox = textElement.getBoundingClientRect();
    textBlock['width'] = `${bbox.width}px` 
    textBlock['height'] = `${bbox.height}px` 
    let match = computedStyle.getPropertyValue('transform').match(/rotate\(([-]?\d+\.?\d*)deg\)/);
    if (match) {
        const rotationAngle = parseFloat(match[1]);
        textBlock['angle'] = rotationAngle;
    } 
    if (bbox.width != 0 && bbox.height !=0) {
        textBlocks.push(textBlock);
    }
});

// 将数据输出为 JSON 文件
// let jsonData = JSON.stringify({textBlocks:textBlocks, "scaleFactor":factor, height: height, width: width}, null, 2);
let jsonData = JSON.stringify({ textBlocks: textBlocks, "scaleFactor": factor, height: height, width: width });
return jsonData;
In [37]:
to_word_js = ''' 
let attrs = ['font-size', 'font-family', 'color', 'top', 'left', 'width', 'height', 'transform', 'direction'];
// 这是一个简化的示例,需要根据你的实际 HTML 结构和 PDF.js 用法进行调整
let factor = parseFloat(document.getElementById("viewer").style.getPropertyValue("--scale-factor"));
let viewerComputedStyle = window.getComputedStyle(document.querySelector('div.page'));
let height = viewerComputedStyle.getPropertyValue('height');
let width = viewerComputedStyle.getPropertyValue('width');

// 获取渲染后的 PDF.js 页面容器
let pdfContainer = document.getElementById('viewerContainer'); // 替换成你的 PDF 容器元素 ID

// 获取所有包含文本内容的元素
let textElements = pdfContainer.querySelectorAll('.textLayer span[style*="left"]');

let textBlocks = [];

// 遍历文本元素并获取 computed style
textElements.forEach(textElement => {
    let textContent = textElement.textContent;
    let computedStyle = window.getComputedStyle(textElement);

    // 组织每个文本块的信息
    let textBlock = {
        "text": textContent,
    };
    // 遍历 attrs 数组,往 textBlock 添加样式属性
    attrs.forEach(attr => {
        // 获取属性值,并根据需要转换为数值或字符串
        let value = computedStyle.getPropertyValue(attr);
        // 如果需要,进行适当的处理或转换
        textBlock[attr] = value;
    });
    let bbox = textElement.getBoundingClientRect();
    textBlock['width'] = `${bbox.width}px` 
    textBlock['height'] = `${bbox.height}px` 
    let match = computedStyle.getPropertyValue('transform').match(/rotate\(([-]?\d+\.?\d*)deg\)/);
    if (match) {
        const rotationAngle = parseFloat(match[1]);
        textBlock['angle'] = rotationAngle;
    } 
    if (bbox.width != 0 && bbox.height !=0) {
        textBlocks.push(textBlock);
    }
});

// 将数据输出为 JSON 文件
// let jsonData = JSON.stringify({textBlocks:textBlocks, "scaleFactor":factor, height: height, width: width}, null, 2);
let jsonData = JSON.stringify({ textBlocks: textBlocks, "scaleFactor": factor, height: height, width: width });
return jsonData;
'''
In [4]:
QUERY_DOWNLOADS_SCRIPT = "// 获取 Downloads 列表对象\n" +"var downloadsList = window.Downloads._lists;\n" +"\n" +"// 获取所有下载项\n" +"var downloadItems = downloadsList[window.Downloads.ALL]._downloads;\n" +"\n" +"\n" +"var result = [];\n" +"// 遍历下载项并获取信息\n" +"for (var i = 0; i < downloadItems.length; i++) {\n" +"    var downloadItem = downloadItems[i];\n" +"\n" +"    // 获取下载文件的下载地址\n" +"    var downloadUrl = downloadItem.source.url;\n" +"\n" +"    // 获取下载文件的本地文件路径\n" +"    var downloadPath = downloadItem.target.path;\n" +"    var contentType = downloadItem.contentType;\n" +"\n" +"    // 获取下载状态\n" +"    var succeeded = downloadItem.succeeded;\n" +"    var stopped = downloadItem.stopped;\n" +"\tresult.push({\n" +"\t\t'url': downloadUrl,\n" +"\t\t'path': downloadPath,\n" +"\t\t'type': contentType,\n" +"\t\t'succeeded': succeeded,\n" +"\t\t'stopped': stopped,\n" +"\t}); \n" +"}\n" +"return JSON.stringify(result)";
In [6]:
print(QUERY_DOWNLOADS_SCRIPT)
// 获取 Downloads 列表对象
var downloadsList = window.Downloads._lists;

// 获取所有下载项
var downloadItems = downloadsList[window.Downloads.ALL]._downloads;


var result = [];
// 遍历下载项并获取信息
for (var i = 0; i < downloadItems.length; i++) {
    var downloadItem = downloadItems[i];

    // 获取下载文件的下载地址
    var downloadUrl = downloadItem.source.url;

    // 获取下载文件的本地文件路径
    var downloadPath = downloadItem.target.path;
    var contentType = downloadItem.contentType;

    // 获取下载状态
    var succeeded = downloadItem.succeeded;
    var stopped = downloadItem.stopped;
	result.push({
		'url': downloadUrl,
		'path': downloadPath,
		'type': contentType,
		'succeeded': succeeded,
		'stopped': stopped,
	}); 
}
return JSON.stringify(result)
In [36]:
import requests

headers = {
    # Already added when you pass json=
    'Content-Type': 'application/json',
}



response = requests.post('http://127.0.0.1:8090/api/webdriver/pdf-text', headers=headers, json=json_data)
response.json()
---------------------------------------------------------------------------
ConnectionRefusedError                    Traceback (most recent call last)
d:\anaconda3\envs\py36\lib\site-packages\urllib3\connection.py in _new_conn(self)
    174             conn = connection.create_connection(
--> 175                 (self._dns_host, self.port), self.timeout, **extra_kw
    176             )

d:\anaconda3\envs\py36\lib\site-packages\urllib3\util\connection.py in create_connection(address, timeout, source_address, socket_options)
     94     if err is not None:
---> 95         raise err
     96 

d:\anaconda3\envs\py36\lib\site-packages\urllib3\util\connection.py in create_connection(address, timeout, source_address, socket_options)
     84                 sock.bind(source_address)
---> 85             sock.connect(sa)
     86             return sock

ConnectionRefusedError: [WinError 10061] 由于目标计算机积极拒绝,无法连接。

During handling of the above exception, another exception occurred:

NewConnectionError                        Traceback (most recent call last)
d:\anaconda3\envs\py36\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    709                 headers=headers,
--> 710                 chunked=chunked,
    711             )

d:\anaconda3\envs\py36\lib\site-packages\urllib3\connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    397             else:
--> 398                 conn.request(method, url, **httplib_request_kw)
    399 

d:\anaconda3\envs\py36\lib\site-packages\urllib3\connection.py in request(self, method, url, body, headers)
    238             headers["User-Agent"] = _get_default_user_agent()
--> 239         super(HTTPConnection, self).request(method, url, body=body, headers=headers)
    240 

d:\anaconda3\envs\py36\lib\http\client.py in request(self, method, url, body, headers, encode_chunked)
   1286         """Send a complete request to the server."""
-> 1287         self._send_request(method, url, body, headers, encode_chunked)
   1288 

d:\anaconda3\envs\py36\lib\http\client.py in _send_request(self, method, url, body, headers, encode_chunked)
   1332             body = _encode(body, 'body')
-> 1333         self.endheaders(body, encode_chunked=encode_chunked)
   1334 

d:\anaconda3\envs\py36\lib\http\client.py in endheaders(self, message_body, encode_chunked)
   1281             raise CannotSendHeader()
-> 1282         self._send_output(message_body, encode_chunked=encode_chunked)
   1283 

d:\anaconda3\envs\py36\lib\http\client.py in _send_output(self, message_body, encode_chunked)
   1041         del self._buffer[:]
-> 1042         self.send(msg)
   1043 

d:\anaconda3\envs\py36\lib\http\client.py in send(self, data)
    979             if self.auto_open:
--> 980                 self.connect()
    981             else:

d:\anaconda3\envs\py36\lib\site-packages\urllib3\connection.py in connect(self)
    204     def connect(self):
--> 205         conn = self._new_conn()
    206         self._prepare_conn(conn)

d:\anaconda3\envs\py36\lib\site-packages\urllib3\connection.py in _new_conn(self)
    186             raise NewConnectionError(
--> 187                 self, "Failed to establish a new connection: %s" % e
    188             )

NewConnectionError: <urllib3.connection.HTTPConnection object at 0x00000209AFB79080>: Failed to establish a new connection: [WinError 10061] 由于目标计算机积极拒绝,无法连接。

During handling of the above exception, another exception occurred:

MaxRetryError                             Traceback (most recent call last)
d:\anaconda3\envs\py36\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
    449                     retries=self.max_retries,
--> 450                     timeout=timeout
    451                 )

d:\anaconda3\envs\py36\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    785             retries = retries.increment(
--> 786                 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
    787             )

d:\anaconda3\envs\py36\lib\site-packages\urllib3\util\retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
    591         if new_retry.is_exhausted():
--> 592             raise MaxRetryError(_pool, url, error or ResponseError(cause))
    593 

MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=8090): Max retries exceeded with url: /api/webdriver/pdf-text (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000209AFB79080>: Failed to establish a new connection: [WinError 10061] 由于目标计算机积极拒绝,无法连接。',))

During handling of the above exception, another exception occurred:

ConnectionError                           Traceback (most recent call last)
<ipython-input-36-ede605ff4b25> in <module>
      8 
      9 
---> 10 response = requests.post('http://127.0.0.1:8090/api/webdriver/pdf-text', headers=headers, json=json_data)
     11 response.json()

d:\anaconda3\envs\py36\lib\site-packages\requests\api.py in post(url, data, json, **kwargs)
    115     """
    116 
--> 117     return request('post', url, data=data, json=json, **kwargs)
    118 
    119 

d:\anaconda3\envs\py36\lib\site-packages\requests\api.py in request(method, url, **kwargs)
     59     # cases, and look like a memory leak in others.
     60     with sessions.Session() as session:
---> 61         return session.request(method=method, url=url, **kwargs)
     62 
     63 

d:\anaconda3\envs\py36\lib\site-packages\requests\sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    527         }
    528         send_kwargs.update(settings)
--> 529         resp = self.send(prep, **send_kwargs)
    530 
    531         return resp

d:\anaconda3\envs\py36\lib\site-packages\requests\sessions.py in send(self, request, **kwargs)
    643 
    644         # Send the request
--> 645         r = adapter.send(request, **kwargs)
    646 
    647         # Total elapsed time of the request (approximately)

d:\anaconda3\envs\py36\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
    517                 raise SSLError(e, request=request)
    518 
--> 519             raise ConnectionError(e, request=request)
    520 
    521         except ClosedPoolError as e:

ConnectionError: HTTPConnectionPool(host='127.0.0.1', port=8090): Max retries exceeded with url: /api/webdriver/pdf-text (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000209AFB79080>: Failed to establish a new connection: [WinError 10061] 由于目标计算机积极拒绝,无法连接。',))
In [13]:
import time

time.time()
Out[13]:
1701227208.4880955
In [24]:
import pandas as pd
p2h = pd.read_csv(r'D:\life-is-short-i-use-python\tb_p2h_info_config_202312071316.csv')
In [25]:
grouped = p2h.groupby('infocode')['process_record_id'].count()

# 选择符合条件(count > 1)的分组
selected_groups = grouped[grouped > 1]
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
d:\life-is-short-i-use-python\公告项目.ipynb Cell 10 line 1
----> <a href='vscode-notebook-cell:/d%3A/life-is-short-i-use-python/%E5%85%AC%E5%91%8A%E9%A1%B9%E7%9B%AE.ipynb#X52sZmlsZQ%3D%3D?line=0'>1</a> grouped = p2h.groupby('infocode')['process_record_id'].count()
      <a href='vscode-notebook-cell:/d%3A/life-is-short-i-use-python/%E5%85%AC%E5%91%8A%E9%A1%B9%E7%9B%AE.ipynb#X52sZmlsZQ%3D%3D?line=2'>3</a> # 选择符合条件(count > 1)的分组
      <a href='vscode-notebook-cell:/d%3A/life-is-short-i-use-python/%E5%85%AC%E5%91%8A%E9%A1%B9%E7%9B%AE.ipynb#X52sZmlsZQ%3D%3D?line=3'>4</a> selected_groups = grouped[grouped > 1]

File d:\anaconda3\envs\py311\Lib\site-packages\pandas\core\groupby\generic.py:1961, in DataFrameGroupBy.__getitem__(self, key)
   1954 if isinstance(key, tuple) and len(key) > 1:
   1955     # if len == 1, then it becomes a SeriesGroupBy and this is actually
   1956     # valid syntax, so don't raise
   1957     raise ValueError(
   1958         "Cannot subset columns with a tuple with more than one element. "
   1959         "Use a list instead."
   1960     )
-> 1961 return super().__getitem__(key)

File d:\anaconda3\envs\py311\Lib\site-packages\pandas\core\base.py:244, in SelectionMixin.__getitem__(self, key)
    242 else:
    243     if key not in self.obj:
--> 244         raise KeyError(f"Column not found: {key}")
    245     ndim = self.obj[key].ndim
    246     return self._gotitem(key, ndim=ndim)

KeyError: 'Column not found: process_record_id'
In [8]:
# 按照 infocode 分组,获取每个分组中 id 最大的索引
max_id_indices = p2h.groupby('infocode')['process_record_id'].idxmax()
In [9]:
# 根据最大的索引获取相应的行数据
result = p2h.loc[max_id_indices]

# 输出结果
result
Out[9]:
process_record_id infocode path hash company priority file_size total_pages total_chars cur_process ... id infocode.1 hash.1 display_type version create_time.1 update_time.1 create_by update_by file_type.1
10774 6077274 AC202310311607030815 http://pdf.dfcfw.com/pdf/H2_AC2023103116070308... 9282914619da0f853e671e695b69d72a NaN NaN 49417 1 NaN E ... 4562766 AC202310311607030815 9282914619da0f853e671e695b69d72a R NaN 2023-10-31 13:43:32 2023-10-31 13:43:40 NaN NaN ppdf
57693 6147484 AN201203290005088457 http://pdf.dfcfw.com/pdf/H2_AN2012032900050884... 4ae7e2fd04eb787af22a2730ff9eb3b4 NaN 0.0 449089 107 NaN E ... 4631719 AN201203290005088457 4ae7e2fd04eb787af22a2730ff9eb3b4 R NaN 2023-11-14 09:38:46 2023-11-14 09:39:16 NaN NaN ppdf
88041 6193612 AN201303290003144095 http://pdf.dfcfw.com/pdf/H2_AN2013032900031440... 06e75572e0317e56f4c2c216b52d2de2 NaN 0.0 197771 2 NaN E ... 4676958 AN201303290003144095 06e75572e0317e56f4c2c216b52d2de2 R NaN 2023-11-22 14:03:06 2023-11-22 14:03:13 NaN NaN pdf
88040 6193609 AN201404230005490649 http://pdf.dfcfw.com/pdf/H2_AN2014042300054906... fc23c45c6d0848b194e5b58272f7d877 NaN 0.0 136002 1 NaN E ... 4676955 AN201404230005490649 fc23c45c6d0848b194e5b58272f7d877 R NaN 2023-11-22 14:02:27 2023-11-22 14:02:32 NaN NaN pdf
51280 6138029 AN201505130009599349 http://pdf.dfcfw.com/pdf/H2_AN2015051300095993... 74ffdf952b7ad1ca86f7d84f08b8c57f NaN 0.0 2064618 170 NaN E ... 4622378 AN201505130009599349 74ffdf952b7ad1ca86f7d84f08b8c57f R NaN 2023-11-10 21:17:07 2023-11-10 21:18:02 NaN NaN ppdf
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
95693 6204888 AN202311231612215965 http://pdf.dfcfw.com/pdf/H2_AN2023112316122159... bb0bd432bcf2a1d53ec0ca274ae077f5 NaN 0.0 116501 3 NaN E ... 4688057 AN202311231612215965 bb0bd432bcf2a1d53ec0ca274ae077f5 R NaN 2023-11-23 23:19:27 2023-11-23 23:19:35 NaN NaN ppdf
95692 6204887 AN202311231612215966 http://pdf.dfcfw.com/pdf/H2_AN2023112316122159... ead0a12d9ff6198d8311f5be38574f6d NaN 0.0 273434 1 NaN E ... 4688056 AN202311231612215966 ead0a12d9ff6198d8311f5be38574f6d R NaN 2023-11-23 23:19:27 2023-11-23 23:19:32 NaN NaN ppdf
95694 6204889 AN202311231612215982 http://pdf.dfcfw.com/pdf/H2_AN2023112316122159... 66283f33f7528f04ac4f1138a874fab4 NaN 0.0 686837 6 NaN E ... 4688058 AN202311231612215982 66283f33f7528f04ac4f1138a874fab4 R NaN 2023-11-23 23:22:27 2023-11-23 23:30:47 NaN NaN ppdf
95695 6204890 AN202311231612215983 http://pdf.dfcfw.com/pdf/H2_AN2023112316122159... 66283f33f7528f04ac4f1138a874fab4 NaN 0.0 686837 6 NaN E ... 4688059 AN202311231612215983 66283f33f7528f04ac4f1138a874fab4 R NaN 2023-11-23 23:24:21 2023-11-23 23:24:36 NaN NaN pdf
95696 6204891 AN202311231612215984 http://pdf.dfcfw.com/pdf/H2_AN2023112316122159... 66283f33f7528f04ac4f1138a874fab4 NaN 0.0 686837 6 NaN E ... 4688060 AN202311231612215984 66283f33f7528f04ac4f1138a874fab4 R NaN 2023-11-23 23:24:21 2023-11-23 23:24:37 NaN NaN pdf

95603 rows × 38 columns

In [26]:
result['display'] = result.apply(lambda row: 1 if row['display_type'] == 'R' else 0, axis=1)

fret = result[['infocode', 'display']]
In [27]:
fret.to_csv('p2h_ret_2023-12-07_update.csv', index=False)
In [ ]:
import  traceback

traceback.format_exc()
In [ ]:
from selenium.webdriver.support.wait import WebDriverWait
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# 配置 Firefox WebDriver
firefox_options = Options()
firefox_options.headless = True

# 设置 Firefox WebDriver 的执行路径
# 启动 Firefox WebDriver
driver =  webdriver.Firefox(firefox_binary='/opt/firefox-91.0.1esr-zh-CN/firefox-bin',
                            executable_path='/home/baoleiji/geckodriver_31/geckodriver',
                            options=firefox_options)
# driver =  webdriver.Firefox(options=firefox_options)
    # 示例:访问一个下载链接
pdf_file = 'file:////home/baoleiji/AN202304161585487275_24.pdf'
pdf_file = 'file:////home/baoleiji/page_63.pdf'
driver.get(pdf_file)  # 替换成实际的下载链接
driver.find_elements(By.CSS_SELECTOR, "#viewer")[0].get_attribute("outerHTML")[:100]
driver.find_elements(By.CSS_SELECTOR, "#viewerContainer")[0].get_attribute("outerHTML")[:100]
driver.find_elements(By.CSS_SELECTOR, "span[role]")[-3].get_attribute("outerHTML")
In [39]:
import time, os


deltas = []
ind = 0
for dirpath, dirnames, filenames  in os.walk(r'D:\Temp\htmls'):
    for file in filenames:
        if not file.endswith('.pdf'):
            continue
        start = time.time()
        ind += 1
        json_data = {
            'pdfPath': os.path.join(dirpath, file),
        }
        response = requests.post('http://127.0.0.1:8090/api/webdriver/pdf-text', headers=headers, json=json_data)
        if len(response.json()['data']) < 500:
            print(ind, os.path.join(dirpath, file))
        # print(ind, time.time() - start, response.json())
        # deltas.append(time.time() - start)
In [19]:
import pandas as pd
pd.DataFrame(deltas).describe()
Out[19]:
0
count 784.000000
mean 0.425131
std 0.243149
min 0.246715
25% 0.336466
50% 0.374103
75% 0.430255
max 2.289095
In [4]:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC



# 设置 Firefox Profile 来指定下载路径
profile = webdriver.FirefoxProfile()
profile.set_preference("browser.download.folderList", 2)  # 自定义下载路径
profile.set_preference("browser.download.dir", "d:/words")  # 设置下载文件的目录
profile.set_preference("browser.download.useDownloadDir", True)
profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/json")  # 设置要下载的文件类型
profile.set_preference("layout.css.devPixelsPerPx", "1.1")

# 配置 Firefox WebDriver
firefox_options = Options()
firefox_options.profile = profile

# 设置 Firefox WebDriver 的执行路径
gecko_driver_path = 'd:/geckodriver.exe'  # GeckoDriver 的路径
service = Service(gecko_driver_path)
# 启动 Firefox WebDriver
driver = webdriver.Firefox(service=service, options=firefox_options)

# 示例:访问一个下载链接
driver.get("file:///D:/Temp/htmls/AN202311161611013398_r_mini/page_1.pdf")  # 替换成实际的下载链接

# 等待一段时间,确保下载完成
# 这里可以添加一些逻辑,例如等待文件下载完成或其他必要的操作
C:\Users\Public\Documents\Wondershare\CreatorTemp\ipykernel_30652\2332689472.py:11: DeprecationWarning: firefox_profile has been deprecated, please use an Options object
  profile = webdriver.FirefoxProfile()
C:\Users\Public\Documents\Wondershare\CreatorTemp\ipykernel_30652\2332689472.py:19: DeprecationWarning: Setting a profile has been deprecated. Please use the set_preference and install_addons methods
  firefox_options.profile = profile
C:\Users\Public\Documents\Wondershare\CreatorTemp\ipykernel_30652\2332689472.py:23: DeprecationWarning: Firefox will soon stop logging to geckodriver.log by default; Specify desired logs with log_output
  service = Service(gecko_driver_path)
In [22]:
import time  
import os


ct = 0
errors = []
for root, dirs, files in os.walk(r'D:\Temp\htmls'):
    for file in files:
        if file.endswith('.pdf') and 'page_' not in root:
            ct += 1
            pdf_file =  'file:///' + os.path.join(root, file).replace('\\', '/')
            print(ct, pdf_file)
            driver.get(pdf_file)
            
            try:
                # 获取要滚动到的元素
                element = driver.find_elements(By.CSS_SELECTOR, "#viewerContainer .textLayer > span")[-1]  # 替换为你要滚动到的元素选择器

                # 滚动到指定元素位置
                driver.execute_script("arguments[0].scrollIntoView(true);", element)

                # 设置等待时间
                wait = WebDriverWait(driver, 10)  # 设置最长等待时间为 10 秒钟

                try:
                    # 等待元素加载完成
                    text_layer = wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#viewerContainer .textLayer > span')))
                    # 进行你的操作,元素已经加载完成
                    print("Element found:", text_layer)  # 示例:输出找到的元素文本内容
                except:
                    print("Element not found within specified time.")
                
                time.sleep(0.1)

                # 下载文件
                driver.execute_script(to_word_js)
            except Exception as e:
                errors.append(pdf_file)
                print(e)
1 file:///D:/Temp/htmls/AN202302161583234787/page_1.pdf
Element not found within specified time.
2 file:///D:/Temp/htmls/AN202302161583234787/page_5.pdf
Element not found within specified time.
3 file:///D:/Temp/htmls/AN202302161583235908/page_3.pdf
Element not found within specified time.
4 file:///D:/Temp/htmls/AN202302281583881638/page_95.pdf
Element not found within specified time.
5 file:///D:/Temp/htmls/AN202302281583881638/page_96.pdf
Element not found within specified time.
6 file:///D:/Temp/htmls/AN202302281583881638/page_98.pdf
Element not found within specified time.
7 file:///D:/Temp/htmls/AN202302281583881638/page_99.pdf
Element not found within specified time.
8 file:///D:/Temp/htmls/AN202303011583892542/page_1.pdf
Element not found within specified time.
9 file:///D:/Temp/htmls/AN202303011583892545/page_1.pdf
Element not found within specified time.
10 file:///D:/Temp/htmls/AN202303011583892548/page_1.pdf
Element not found within specified time.
11 file:///D:/Temp/htmls/AN202303011583892551/page_1.pdf
Element not found within specified time.
12 file:///D:/Temp/htmls/AN202303011583892552/page_1.pdf
Element not found within specified time.
13 file:///D:/Temp/htmls/AN202303011583892555/page_1.pdf
Element not found within specified time.
14 file:///D:/Temp/htmls/AN202303011583892556/page_1.pdf
Element not found within specified time.
15 file:///D:/Temp/htmls/AN202303011583892578/page_1.pdf
Element not found within specified time.
16 file:///D:/Temp/htmls/AN202303011583892582/page_1.pdf
Element not found within specified time.
17 file:///D:/Temp/htmls/AN202303011583892586/page_1.pdf
Element not found within specified time.
18 file:///D:/Temp/htmls/AN202303011583892589/page_1.pdf
Element not found within specified time.
19 file:///D:/Temp/htmls/AN202303011583892592/page_1.pdf
Element not found within specified time.
20 file:///D:/Temp/htmls/AN202303011583892593/page_1.pdf
Element not found within specified time.
21 file:///D:/Temp/htmls/AN202303011583892598/page_1.pdf
Element not found within specified time.
22 file:///D:/Temp/htmls/AN202303011583892598/page_2.pdf
Element not found within specified time.
23 file:///D:/Temp/htmls/AN202303011583892598/page_3.pdf
Element not found within specified time.
24 file:///D:/Temp/htmls/AN202303011583892598/page_4.pdf
Element not found within specified time.
25 file:///D:/Temp/htmls/AN202303011583897916/page_16.pdf
Element not found within specified time.
26 file:///D:/Temp/htmls/AN202303011583898495/page_16.pdf
Element not found within specified time.
27 file:///D:/Temp/htmls/AN202303011583898575/page_15.pdf
Element not found within specified time.
28 file:///D:/Temp/htmls/AN202303011583898653/page_15.pdf
Element not found within specified time.
29 file:///D:/Temp/htmls/AN202303011583898726/page_1.pdf
Element not found within specified time.
30 file:///D:/Temp/htmls/AN202303011583899064/page_1.pdf
Element not found within specified time.
31 file:///D:/Temp/htmls/AN202303011583901479/page_1.pdf
Element not found within specified time.
32 file:///D:/Temp/htmls/AN202303011583903892/page_1.pdf
Element not found within specified time.
33 file:///D:/Temp/htmls/AN202303011583910026/page_1.pdf
Element not found within specified time.
34 file:///D:/Temp/htmls/AN202303011583910543/page_2.pdf
Element not found within specified time.
35 file:///D:/Temp/htmls/AN202303011583910543/page_3.pdf
Element not found within specified time.
36 file:///D:/Temp/htmls/AN202303011583913104/page_1.pdf
Element not found within specified time.
37 file:///D:/Temp/htmls/AN202303011583915212/page_1.pdf
Element not found within specified time.
38 file:///D:/Temp/htmls/AN202303011583917131/page_28.pdf
Element not found within specified time.
39 file:///D:/Temp/htmls/AN202303011583920374/page_2.pdf
Element not found within specified time.
40 file:///D:/Temp/htmls/AN202303011583920769/page_10.pdf
Element not found within specified time.
41 file:///D:/Temp/htmls/AN202303011583920817/page_269.pdf
Element not found within specified time.
42 file:///D:/Temp/htmls/AN202303011583920817/page_59.pdf
Element not found within specified time.
43 file:///D:/Temp/htmls/AN202303011583920850/page_17.pdf
Element not found within specified time.
44 file:///D:/Temp/htmls/AN202303011583920850/page_18.pdf
Element not found within specified time.
45 file:///D:/Temp/htmls/AN202303011583922005/page_41.pdf
Element not found within specified time.
46 file:///D:/Temp/htmls/AN202303011583922005/page_79.pdf
Element not found within specified time.
47 file:///D:/Temp/htmls/AN202303011583922005/page_97.pdf
Element not found within specified time.
48 file:///D:/Temp/htmls/AN202303011583922259/page_16.pdf
Element not found within specified time.
49 file:///D:/Temp/htmls/AN202303011583922259/page_17.pdf
Element not found within specified time.
50 file:///D:/Temp/htmls/AN202303011583922267/page_60.pdf
Element not found within specified time.
51 file:///D:/Temp/htmls/AN202303011583922267/page_61.pdf
Element not found within specified time.
52 file:///D:/Temp/htmls/AN202303011583922267/page_62.pdf
Element not found within specified time.
53 file:///D:/Temp/htmls/AN202303011583922619/page_1.pdf
Element not found within specified time.
54 file:///D:/Temp/htmls/AN202303011583922619/page_10.pdf
Element not found within specified time.
55 file:///D:/Temp/htmls/AN202303011583922619/page_11.pdf
Element not found within specified time.
56 file:///D:/Temp/htmls/AN202303011583922619/page_12.pdf
Element not found within specified time.
57 file:///D:/Temp/htmls/AN202303011583922619/page_14.pdf
Element not found within specified time.
58 file:///D:/Temp/htmls/AN202303011583922619/page_15.pdf
Element not found within specified time.
59 file:///D:/Temp/htmls/AN202303011583922619/page_17.pdf
Element not found within specified time.
60 file:///D:/Temp/htmls/AN202303011583922619/page_19.pdf
Element not found within specified time.
61 file:///D:/Temp/htmls/AN202303011583922619/page_22.pdf
Element not found within specified time.
62 file:///D:/Temp/htmls/AN202303011583922619/page_23.pdf
Element not found within specified time.
63 file:///D:/Temp/htmls/AN202303011583922619/page_24.pdf
Element not found within specified time.
64 file:///D:/Temp/htmls/AN202303011583922619/page_26.pdf
Element not found within specified time.
65 file:///D:/Temp/htmls/AN202303011583922619/page_27.pdf
Element not found within specified time.
66 file:///D:/Temp/htmls/AN202303011583922619/page_28.pdf
Element not found within specified time.
67 file:///D:/Temp/htmls/AN202303011583922619/page_3.pdf
Element not found within specified time.
68 file:///D:/Temp/htmls/AN202303011583922619/page_30.pdf
Element not found within specified time.
69 file:///D:/Temp/htmls/AN202303011583922619/page_31.pdf
Element not found within specified time.
70 file:///D:/Temp/htmls/AN202303011583922619/page_32.pdf
Element not found within specified time.
71 file:///D:/Temp/htmls/AN202303011583922619/page_34.pdf
Element not found within specified time.
72 file:///D:/Temp/htmls/AN202303011583922619/page_35.pdf
Element not found within specified time.
73 file:///D:/Temp/htmls/AN202303011583922619/page_36.pdf
Element not found within specified time.
74 file:///D:/Temp/htmls/AN202303011583922619/page_5.pdf
Element not found within specified time.
75 file:///D:/Temp/htmls/AN202303011583922619/page_6.pdf
Element not found within specified time.
76 file:///D:/Temp/htmls/AN202303011583922619/page_7.pdf
Element not found within specified time.
77 file:///D:/Temp/htmls/AN202303011583922619/page_8.pdf
Element not found within specified time.
78 file:///D:/Temp/htmls/AN202303011583922619/page_9.pdf
Element not found within specified time.
79 file:///D:/Temp/htmls/AN202303011583922751/page_1.pdf
Element not found within specified time.
80 file:///D:/Temp/htmls/AN202303011583923603/page_1.pdf
Element not found within specified time.
81 file:///D:/Temp/htmls/AN202303011583923803/page_147.pdf
Element not found within specified time.
82 file:///D:/Temp/htmls/AN202303011583923803/page_20.pdf
Element not found within specified time.
83 file:///D:/Temp/htmls/AN202303011583924194/page_135.pdf
Element not found within specified time.
84 file:///D:/Temp/htmls/AN202303011583924194/page_136.pdf
Element not found within specified time.
85 file:///D:/Temp/htmls/AN202303011583924194/page_137.pdf
Element not found within specified time.
86 file:///D:/Temp/htmls/AN202303011583924194/page_138.pdf
Element not found within specified time.
87 file:///D:/Temp/htmls/AN202303011583924194/page_139.pdf
Element not found within specified time.
88 file:///D:/Temp/htmls/AN202303011583924194/page_140.pdf
Element not found within specified time.
89 file:///D:/Temp/htmls/AN202303011583924194/page_141.pdf
Element not found within specified time.
90 file:///D:/Temp/htmls/AN202303011583924194/page_143.pdf
Element not found within specified time.
91 file:///D:/Temp/htmls/AN202303011583924194/page_144.pdf
Element not found within specified time.
92 file:///D:/Temp/htmls/AN202303011583924194/page_145.pdf
Element not found within specified time.
93 file:///D:/Temp/htmls/AN202303011583924194/page_146.pdf
Element not found within specified time.
94 file:///D:/Temp/htmls/AN202303011583924228/page_3.pdf
Element not found within specified time.
95 file:///D:/Temp/htmls/AN202303011583924228/page_4.pdf
Element not found within specified time.
96 file:///D:/Temp/htmls/AN202303011583924785/page_14.pdf
Element not found within specified time.
97 file:///D:/Temp/htmls/AN202303011583924785/page_15.pdf
Element not found within specified time.
98 file:///D:/Temp/htmls/AN202303011583924786/page_15.pdf
Element not found within specified time.
99 file:///D:/Temp/htmls/AN202303011583925000/page_1.pdf
Element not found within specified time.
100 file:///D:/Temp/htmls/AN202303011583925002/page_103.pdf
Element not found within specified time.
101 file:///D:/Temp/htmls/AN202303011583925002/page_107.pdf
Element not found within specified time.
102 file:///D:/Temp/htmls/AN202303011583925002/page_21.pdf
Element not found within specified time.
103 file:///D:/Temp/htmls/AN202303011583925202/page_3.pdf
Element not found within specified time.
104 file:///D:/Temp/htmls/AN202303011583925291/page_5.pdf
Element not found within specified time.
105 file:///D:/Temp/htmls/AN202303011583925660/page_16.pdf
Element not found within specified time.
106 file:///D:/Temp/htmls/AN202303011583925660/page_17.pdf
Element not found within specified time.
107 file:///D:/Temp/htmls/AN202303011583925672/page_2.pdf
Element not found within specified time.
108 file:///D:/Temp/htmls/AN202303011583925685/page_60.pdf
Element not found within specified time.
109 file:///D:/Temp/htmls/AN202303011583925685/page_61.pdf
Element not found within specified time.
110 file:///D:/Temp/htmls/AN202303011583925685/page_62.pdf
Element not found within specified time.
111 file:///D:/Temp/htmls/AN202303011583925830/page_103.pdf
Element not found within specified time.
112 file:///D:/Temp/htmls/AN202303011583925830/page_107.pdf
Element not found within specified time.
113 file:///D:/Temp/htmls/AN202303011583925830/page_21.pdf
Element not found within specified time.
114 file:///D:/Temp/htmls/AN202303011583926787/page_11.pdf
Element not found within specified time.
115 file:///D:/Temp/htmls/AN202303011583926787/page_12.pdf
Element not found within specified time.
116 file:///D:/Temp/htmls/AN202303011583927517/page_12.pdf
Element not found within specified time.
117 file:///D:/Temp/htmls/AN202303011583927517/page_20.pdf
Element not found within specified time.
118 file:///D:/Temp/htmls/AN202303141584239808/page_1.pdf
Element not found within specified time.
119 file:///D:/Temp/htmls/AN202303151584289331/page_2.pdf
Element not found within specified time.
120 file:///D:/Temp/htmls/AN202303151584289605/page_1.pdf
Element not found within specified time.
121 file:///D:/Temp/htmls/AN202303151584289605/page_10.pdf
Element not found within specified time.
122 file:///D:/Temp/htmls/AN202303151584289605/page_101.pdf
Element not found within specified time.
123 file:///D:/Temp/htmls/AN202303151584289605/page_102.pdf
Element not found within specified time.
124 file:///D:/Temp/htmls/AN202303151584289605/page_104.pdf
Element not found within specified time.
125 file:///D:/Temp/htmls/AN202303151584289605/page_105.pdf
Element not found within specified time.
126 file:///D:/Temp/htmls/AN202303151584289605/page_106.pdf
Element not found within specified time.
127 file:///D:/Temp/htmls/AN202303151584289605/page_11.pdf
Element not found within specified time.
128 file:///D:/Temp/htmls/AN202303151584289605/page_12.pdf
Element not found within specified time.
129 file:///D:/Temp/htmls/AN202303151584289605/page_14.pdf
Element not found within specified time.
130 file:///D:/Temp/htmls/AN202303151584289605/page_15.pdf
Element not found within specified time.
131 file:///D:/Temp/htmls/AN202303151584289605/page_16.pdf
Element not found within specified time.
132 file:///D:/Temp/htmls/AN202303151584289605/page_18.pdf
Element not found within specified time.
133 file:///D:/Temp/htmls/AN202303151584289605/page_19.pdf
Element not found within specified time.
134 file:///D:/Temp/htmls/AN202303151584289605/page_2.pdf
Element not found within specified time.
135 file:///D:/Temp/htmls/AN202303151584289605/page_20.pdf
Element not found within specified time.
136 file:///D:/Temp/htmls/AN202303151584289605/page_21.pdf
Element not found within specified time.
137 file:///D:/Temp/htmls/AN202303151584289605/page_22.pdf
Element not found within specified time.
138 file:///D:/Temp/htmls/AN202303151584289605/page_26.pdf
Element not found within specified time.
139 file:///D:/Temp/htmls/AN202303151584289605/page_27.pdf
Element not found within specified time.
140 file:///D:/Temp/htmls/AN202303151584289605/page_28.pdf
Element not found within specified time.
141 file:///D:/Temp/htmls/AN202303151584289605/page_29.pdf
Element not found within specified time.
142 file:///D:/Temp/htmls/AN202303151584289605/page_3.pdf
Element not found within specified time.
143 file:///D:/Temp/htmls/AN202303151584289605/page_30.pdf
Element not found within specified time.
144 file:///D:/Temp/htmls/AN202303151584289605/page_31.pdf
Element not found within specified time.
145 file:///D:/Temp/htmls/AN202303151584289605/page_39.pdf
Element not found within specified time.
146 file:///D:/Temp/htmls/AN202303151584289605/page_4.pdf
Element not found within specified time.
147 file:///D:/Temp/htmls/AN202303151584289605/page_40.pdf
Element not found within specified time.
148 file:///D:/Temp/htmls/AN202303151584289605/page_41.pdf
Element not found within specified time.
149 file:///D:/Temp/htmls/AN202303151584289605/page_43.pdf
Element not found within specified time.
150 file:///D:/Temp/htmls/AN202303151584289605/page_44.pdf
Element not found within specified time.
151 file:///D:/Temp/htmls/AN202303151584289605/page_45.pdf
Element not found within specified time.
152 file:///D:/Temp/htmls/AN202303151584289605/page_46.pdf
Element not found within specified time.
153 file:///D:/Temp/htmls/AN202303151584289605/page_47.pdf
Element not found within specified time.
154 file:///D:/Temp/htmls/AN202303151584289605/page_48.pdf
Element not found within specified time.
155 file:///D:/Temp/htmls/AN202303151584289605/page_49.pdf
Element not found within specified time.
156 file:///D:/Temp/htmls/AN202303151584289605/page_5.pdf
Element not found within specified time.
157 file:///D:/Temp/htmls/AN202303151584289605/page_50.pdf
Element not found within specified time.
158 file:///D:/Temp/htmls/AN202303151584289605/page_51.pdf
Element not found within specified time.
159 file:///D:/Temp/htmls/AN202303151584289605/page_53.pdf
Element not found within specified time.
160 file:///D:/Temp/htmls/AN202303151584289605/page_54.pdf
Element not found within specified time.
161 file:///D:/Temp/htmls/AN202303151584289605/page_55.pdf
Element not found within specified time.
162 file:///D:/Temp/htmls/AN202303151584289605/page_56.pdf
Element not found within specified time.
163 file:///D:/Temp/htmls/AN202303151584289605/page_57.pdf
Element not found within specified time.
164 file:///D:/Temp/htmls/AN202303151584289605/page_60.pdf
Element not found within specified time.
165 file:///D:/Temp/htmls/AN202303151584289605/page_62.pdf
Element not found within specified time.
166 file:///D:/Temp/htmls/AN202303151584289605/page_63.pdf
Element not found within specified time.
167 file:///D:/Temp/htmls/AN202303151584289605/page_68.pdf
Element not found within specified time.
168 file:///D:/Temp/htmls/AN202303151584289605/page_69.pdf
Element not found within specified time.
169 file:///D:/Temp/htmls/AN202303151584289605/page_7.pdf
Element not found within specified time.
170 file:///D:/Temp/htmls/AN202303151584289605/page_70.pdf
Element not found within specified time.
171 file:///D:/Temp/htmls/AN202303151584289605/page_74.pdf
Element not found within specified time.
172 file:///D:/Temp/htmls/AN202303151584289605/page_75.pdf
Element not found within specified time.
173 file:///D:/Temp/htmls/AN202303151584289605/page_77.pdf
Element not found within specified time.
174 file:///D:/Temp/htmls/AN202303151584289605/page_79.pdf
Element not found within specified time.
175 file:///D:/Temp/htmls/AN202303151584289605/page_8.pdf
Element not found within specified time.
176 file:///D:/Temp/htmls/AN202303151584289605/page_80.pdf
Element not found within specified time.
177 file:///D:/Temp/htmls/AN202303151584289605/page_82.pdf
Element not found within specified time.
178 file:///D:/Temp/htmls/AN202303151584289605/page_83.pdf
Element not found within specified time.
179 file:///D:/Temp/htmls/AN202303151584289605/page_84.pdf
Element not found within specified time.
180 file:///D:/Temp/htmls/AN202303151584289605/page_85.pdf
Element not found within specified time.
181 file:///D:/Temp/htmls/AN202303151584289605/page_87.pdf
Element not found within specified time.
182 file:///D:/Temp/htmls/AN202303151584289605/page_88.pdf
Element not found within specified time.
183 file:///D:/Temp/htmls/AN202303151584289605/page_89.pdf
Element not found within specified time.
184 file:///D:/Temp/htmls/AN202303151584289605/page_9.pdf
Element not found within specified time.
185 file:///D:/Temp/htmls/AN202303151584289605/page_90.pdf
Element not found within specified time.
186 file:///D:/Temp/htmls/AN202303151584289605/page_91.pdf
Element not found within specified time.
187 file:///D:/Temp/htmls/AN202303151584289605/page_92.pdf
Element not found within specified time.
188 file:///D:/Temp/htmls/AN202303151584289605/page_94.pdf
Element not found within specified time.
189 file:///D:/Temp/htmls/AN202303151584289605/page_97.pdf
Element not found within specified time.
190 file:///D:/Temp/htmls/AN202303151584289611/page_1.pdf
Element not found within specified time.
191 file:///D:/Temp/htmls/AN202303151584289611/page_2.pdf
Element not found within specified time.
192 file:///D:/Temp/htmls/AN202303151584289611/page_4.pdf
Element not found within specified time.
193 file:///D:/Temp/htmls/AN202303151584289611/page_5.pdf
Element not found within specified time.
194 file:///D:/Temp/htmls/AN202303151584289611/page_6.pdf
Element not found within specified time.
195 file:///D:/Temp/htmls/AN202303151584289616/page_1.pdf
Element not found within specified time.
196 file:///D:/Temp/htmls/AN202303151584291140/page_3.pdf
Element not found within specified time.
197 file:///D:/Temp/htmls/AN202303151584292038/page_14.pdf
Element not found within specified time.
198 file:///D:/Temp/htmls/AN202303151584292945_r_invisiable_text_r_none_table/page_12.pdf
Element not found within specified time.
199 file:///D:/Temp/htmls/AN202303151584292945_r_invisiable_text_r_none_table/page_20.pdf
Element not found within specified time.
200 file:///D:/Temp/htmls/AN202303151584292945_r_invisiable_text_r_none_table/page_51.pdf
Element not found within specified time.
201 file:///D:/Temp/htmls/AN202303151584293181/page_15.pdf
Element not found within specified time.
202 file:///D:/Temp/htmls/AN202303151584294225/page_1.pdf
Element not found within specified time.
203 file:///D:/Temp/htmls/AN202303151584295721/page_99.pdf
Element not found within specified time.
204 file:///D:/Temp/htmls/AN202303151584295941/page_14.pdf
Element not found within specified time.
205 file:///D:/Temp/htmls/AN202303151584296409/page_83.pdf
Element not found within specified time.
206 file:///D:/Temp/htmls/AN202303151584296823/page_1.pdf
Element not found within specified time.
207 file:///D:/Temp/htmls/AN202303151584296823/page_11.pdf
Element not found within specified time.
208 file:///D:/Temp/htmls/AN202303151584296823/page_14.pdf
Element not found within specified time.
209 file:///D:/Temp/htmls/AN202303151584296823/page_15.pdf
Element not found within specified time.
210 file:///D:/Temp/htmls/AN202303151584296823/page_19.pdf
Element not found within specified time.
211 file:///D:/Temp/htmls/AN202303151584296823/page_2.pdf
Element not found within specified time.
212 file:///D:/Temp/htmls/AN202303151584296823/page_20.pdf
Element not found within specified time.
213 file:///D:/Temp/htmls/AN202303151584296823/page_21.pdf
Element not found within specified time.
214 file:///D:/Temp/htmls/AN202303151584296823/page_22.pdf
Element not found within specified time.
215 file:///D:/Temp/htmls/AN202303151584296823/page_23.pdf
Element not found within specified time.
216 file:///D:/Temp/htmls/AN202303151584296823/page_25.pdf
Element not found within specified time.
217 file:///D:/Temp/htmls/AN202303151584296823/page_27.pdf
Element not found within specified time.
218 file:///D:/Temp/htmls/AN202303151584296823/page_28.pdf
Element not found within specified time.
219 file:///D:/Temp/htmls/AN202303151584296823/page_29.pdf
Element not found within specified time.
220 file:///D:/Temp/htmls/AN202303151584296823/page_3.pdf
Element not found within specified time.
221 file:///D:/Temp/htmls/AN202303151584296823/page_31.pdf
Element not found within specified time.
222 file:///D:/Temp/htmls/AN202303151584296823/page_34.pdf
Element not found within specified time.
223 file:///D:/Temp/htmls/AN202303151584296823/page_37.pdf
Element not found within specified time.
224 file:///D:/Temp/htmls/AN202303151584296823/page_39.pdf
Element not found within specified time.
225 file:///D:/Temp/htmls/AN202303151584296823/page_4.pdf
Element not found within specified time.
226 file:///D:/Temp/htmls/AN202303151584296823/page_40.pdf
Element not found within specified time.
227 file:///D:/Temp/htmls/AN202303151584296823/page_41.pdf
Element not found within specified time.
228 file:///D:/Temp/htmls/AN202303151584296823/page_44.pdf
Element not found within specified time.
229 file:///D:/Temp/htmls/AN202303151584296823/page_46.pdf
Element not found within specified time.
230 file:///D:/Temp/htmls/AN202303151584296823/page_48.pdf
Element not found within specified time.
231 file:///D:/Temp/htmls/AN202303151584296823/page_49.pdf
Element not found within specified time.
232 file:///D:/Temp/htmls/AN202303151584296823/page_5.pdf
Element not found within specified time.
233 file:///D:/Temp/htmls/AN202303151584296823/page_50.pdf
Element not found within specified time.
234 file:///D:/Temp/htmls/AN202303151584296823/page_6.pdf
Element not found within specified time.
235 file:///D:/Temp/htmls/AN202303151584296823/page_7.pdf
Element not found within specified time.
236 file:///D:/Temp/htmls/AN202303151584296823/page_8.pdf
Element not found within specified time.
237 file:///D:/Temp/htmls/AN202303151584296823/page_9.pdf
Element not found within specified time.
238 file:///D:/Temp/htmls/AN202303161584298993/page_95.pdf
Element not found within specified time.
239 file:///D:/Temp/htmls/AN202303161584303367/page_1.pdf
Element not found within specified time.
240 file:///D:/Temp/htmls/AN202303161584303376/page_1.pdf
Element not found within specified time.
241 file:///D:/Temp/htmls/AN202303161584303443/page_1.pdf
Element not found within specified time.
242 file:///D:/Temp/htmls/AN202303161584303902/page_1.pdf
Element not found within specified time.
243 file:///D:/Temp/htmls/AN202303161584303903_r_text_miss_r_table/page_1.pdf
Element not found within specified time.
244 file:///D:/Temp/htmls/AN202303161584303905_r_mini/page_1.pdf
Element not found within specified time.
245 file:///D:/Temp/htmls/AN202303161584303906_r_to_img/page_1.pdf
Element not found within specified time.
246 file:///D:/Temp/htmls/AN202303161584303907_r_to_img/page_1.pdf
Element not found within specified time.
247 file:///D:/Temp/htmls/AN202303161584303908_r_to_img/page_1.pdf
Element not found within specified time.
248 file:///D:/Temp/htmls/AN202303161584303911_r_to_img/page_1.pdf
Element not found within specified time.
249 file:///D:/Temp/htmls/AN202303161584303911_r_to_img/page_2.pdf
Element not found within specified time.
250 file:///D:/Temp/htmls/AN202303161584303911_r_to_img/page_3.pdf
Element not found within specified time.
251 file:///D:/Temp/htmls/AN202303161584303913_r_text_miss_r_pg/page_1.pdf
Element not found within specified time.
252 file:///D:/Temp/htmls/AN202303161584303916_r_mini/page_1.pdf
Element not found within specified time.
253 file:///D:/Temp/htmls/AN202303161584303919_r_to_img/page_1.pdf
Element not found within specified time.
254 file:///D:/Temp/htmls/AN202303161584303920_r_mini/page_1.pdf
Element not found within specified time.
255 file:///D:/Temp/htmls/AN202303161584303922_r_mini/page_1.pdf
Element not found within specified time.
256 file:///D:/Temp/htmls/AN202303161584303925_r_mini/page_1.pdf
Element not found within specified time.
257 file:///D:/Temp/htmls/AN202303161584303925_r_mini/page_2.pdf
Element not found within specified time.
258 file:///D:/Temp/htmls/AN202303161584303925_r_mini/page_3.pdf
Element not found within specified time.
259 file:///D:/Temp/htmls/AN202303161584303925_r_mini/page_4.pdf
Element not found within specified time.
260 file:///D:/Temp/htmls/AN202303161584303928_r_to_img/page_1.pdf
Element not found within specified time.
261 file:///D:/Temp/htmls/AN202303161584304119_r_table/page_1.pdf
Element not found within specified time.
262 file:///D:/Temp/htmls/AN202303161584304121_r_mini/page_1.pdf
Element not found within specified time.
263 file:///D:/Temp/htmls/AN202303161584304123_r_mini/page_1.pdf
Element not found within specified time.
264 file:///D:/Temp/htmls/AN202303161584304125_r_to_img/page_1.pdf
Element not found within specified time.
265 file:///D:/Temp/htmls/AN202303161584304127_r_text_miss_r_table/page_1.pdf
Element not found within specified time.
266 file:///D:/Temp/htmls/AN202303161584304128_r_mini/page_1.pdf
Element not found within specified time.
267 file:///D:/Temp/htmls/AN202303161584304132_r_mini/page_1.pdf
Element not found within specified time.
268 file:///D:/Temp/htmls/AN202303161584304134_r_to_img/page_1.pdf
Element not found within specified time.
269 file:///D:/Temp/htmls/AN202303161584304136_r_text_miss_r_order/page_1.pdf
Element not found within specified time.
270 file:///D:/Temp/htmls/AN202303161584304137_r_mini/page_1.pdf
Element not found within specified time.
271 file:///D:/Temp/htmls/AN202303161584304147_r_to_img/page_1.pdf
Element not found within specified time.
272 file:///D:/Temp/htmls/AN202303161584304152_r_to_img/page_1.pdf
Element not found within specified time.
273 file:///D:/Temp/htmls/AN202303161584304160_r_table/page_1.pdf
Element not found within specified time.
274 file:///D:/Temp/htmls/AN202303161584311395_r_invisiable_text/page_103.pdf
Element not found within specified time.
275 file:///D:/Temp/htmls/AN202303161584311395_r_invisiable_text/page_2.pdf
Element not found within specified time.
276 file:///D:/Temp/htmls/AN202303161584311395_r_invisiable_text/page_84.pdf
Element not found within specified time.
277 file:///D:/Temp/htmls/AN202303161584311613_r_table/page_298.pdf
Element not found within specified time.
278 file:///D:/Temp/htmls/AN202303161584311613_r_table/page_299.pdf
Element not found within specified time.
279 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_1.pdf
Element not found within specified time.
280 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_10.pdf
Element not found within specified time.
281 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_11.pdf
Element not found within specified time.
282 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_13.pdf
Element not found within specified time.
283 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_14.pdf
Element not found within specified time.
284 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_15.pdf
Element not found within specified time.
285 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_17.pdf
Element not found within specified time.
286 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_18.pdf
Element not found within specified time.
287 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_20.pdf
Element not found within specified time.
288 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_21.pdf
Element not found within specified time.
289 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_23.pdf
Element not found within specified time.
290 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_25.pdf
Element not found within specified time.
291 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_26.pdf
Element not found within specified time.
292 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_27.pdf
Element not found within specified time.
293 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_28.pdf
Element not found within specified time.
294 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_29.pdf
Element not found within specified time.
295 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_30.pdf
Element not found within specified time.
296 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_31.pdf
Element not found within specified time.
297 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_32.pdf
Element not found within specified time.
298 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_33.pdf
Element not found within specified time.
299 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_34.pdf
Element not found within specified time.
300 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_35.pdf
Element not found within specified time.
301 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_36.pdf
Element not found within specified time.
302 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_37.pdf
Element not found within specified time.
303 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_38.pdf
Element not found within specified time.
304 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_4.pdf
Element not found within specified time.
305 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_5.pdf
Element not found within specified time.
306 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_7.pdf
Element not found within specified time.
307 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_8.pdf
Element not found within specified time.
308 file:///D:/Temp/htmls/AN202303161584313298_r_none_table/page_9.pdf
Element not found within specified time.
309 file:///D:/Temp/htmls/AN202303161584313359_r_order/page_1.pdf
Element not found within specified time.
310 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_10.pdf
Element not found within specified time.
311 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_11.pdf
Element not found within specified time.
312 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_12.pdf
Element not found within specified time.
313 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_14.pdf
Element not found within specified time.
314 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_15.pdf
Element not found within specified time.
315 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_17.pdf
Element not found within specified time.
316 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_19.pdf
Element not found within specified time.
317 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_20.pdf
Element not found within specified time.
318 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_21.pdf
Element not found within specified time.
319 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_24.pdf
Element not found within specified time.
320 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_25.pdf
Element not found within specified time.
321 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_26.pdf
Element not found within specified time.
322 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_27.pdf
Element not found within specified time.
323 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_28.pdf
Element not found within specified time.
324 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_29.pdf
Element not found within specified time.
325 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_30.pdf
Element not found within specified time.
326 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_31.pdf
Element not found within specified time.
327 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_33.pdf
Element not found within specified time.
328 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_8.pdf
Element not found within specified time.
329 file:///D:/Temp/htmls/AN202303161584313361_r_order/page_9.pdf
Element not found within specified time.
330 file:///D:/Temp/htmls/AN202303161584313374_r_order/page_1.pdf
Element not found within specified time.
331 file:///D:/Temp/htmls/AN202303161584313381_r_order/page_1.pdf
Element not found within specified time.
332 file:///D:/Temp/htmls/AN202303161584313457_r_144_invisiable_text/page_139.pdf
Element not found within specified time.
333 file:///D:/Temp/htmls/AN202303161584313457_r_144_invisiable_text/page_144.pdf
Element not found within specified time.
334 file:///D:/Temp/htmls/AN202303161584313457_r_144_invisiable_text/page_146.pdf
Element not found within specified time.
335 file:///D:/Temp/htmls/AN202303161584313457_r_144_invisiable_text/page_148.pdf
Element not found within specified time.
336 file:///D:/Temp/htmls/AN202303161584313457_r_144_invisiable_text/page_150.pdf
Element not found within specified time.
337 file:///D:/Temp/htmls/AN202303161584313457_r_144_invisiable_text/page_154.pdf
Element not found within specified time.
338 file:///D:/Temp/htmls/AN202303161584313457_r_144_invisiable_text/page_159.pdf
Element not found within specified time.
339 file:///D:/Temp/htmls/AN202303161584313457_r_144_invisiable_text/page_164.pdf
Element not found within specified time.
340 file:///D:/Temp/htmls/AN202303161584313457_r_144_invisiable_text/page_165.pdf
Element not found within specified time.
341 file:///D:/Temp/htmls/AN202303161584313457_r_144_invisiable_text/page_166.pdf
Element not found within specified time.
342 file:///D:/Temp/htmls/AN202303161584313457_r_144_invisiable_text/page_170.pdf
Element not found within specified time.
343 file:///D:/Temp/htmls/AN202303161584313457_r_144_invisiable_text/page_175.pdf
Element not found within specified time.
344 file:///D:/Temp/htmls/AN202303161584313465_r_table_invisiable_text/page_32.pdf
Element not found within specified time.
345 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_1.pdf
Element not found within specified time.
346 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_10.pdf
Element not found within specified time.
347 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_16.pdf
Element not found within specified time.
348 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_20.pdf
Element not found within specified time.
349 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_21.pdf
Element not found within specified time.
350 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_22.pdf
Element not found within specified time.
351 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_23.pdf
Element not found within specified time.
352 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_24.pdf
Element not found within specified time.
353 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_25.pdf
Element not found within specified time.
354 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_26.pdf
Element not found within specified time.
355 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_27.pdf
Element not found within specified time.
356 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_31.pdf
Element not found within specified time.
357 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_32.pdf
Element not found within specified time.
358 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_33.pdf
Element not found within specified time.
359 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_34.pdf
Element not found within specified time.
360 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_35.pdf
Element not found within specified time.
361 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_36.pdf
Element not found within specified time.
362 file:///D:/Temp/htmls/AN202303161584314390_r_order/page_9.pdf
Element not found within specified time.
363 file:///D:/Temp/htmls/AN202308181595092754_r_misordered/page_1.pdf
Element not found within specified time.
364 file:///D:/Temp/htmls/AN202309051597490339_r_mini/page_1.pdf
Element not found within specified time.
365 file:///D:/Temp/htmls/AN202309151598686950_r_failed/page_1.pdf
Element not found within specified time.
366 file:///D:/Temp/htmls/AN202311151610996963_r_table/page_10.pdf
Element not found within specified time.
367 file:///D:/Temp/htmls/AN202311151610996963_r_table/page_113.pdf
Element not found within specified time.
368 file:///D:/Temp/htmls/AN202311151610996963_r_table/page_122.pdf
Element not found within specified time.
369 file:///D:/Temp/htmls/AN202311151610996963_r_table/page_173.pdf
Element not found within specified time.
370 file:///D:/Temp/htmls/AN202311151610996963_r_table/page_178.pdf
Element not found within specified time.
371 file:///D:/Temp/htmls/AN202311151610996963_r_table/page_18.pdf
Element not found within specified time.
372 file:///D:/Temp/htmls/AN202311151610996963_r_table/page_19.pdf
Element not found within specified time.
373 file:///D:/Temp/htmls/AN202311151610996963_r_table/page_198.pdf
Element not found within specified time.
374 file:///D:/Temp/htmls/AN202311151610996963_r_table/page_199.pdf
Element not found within specified time.
375 file:///D:/Temp/htmls/AN202311151610996963_r_table/page_204.pdf
Element not found within specified time.
376 file:///D:/Temp/htmls/AN202311151610996963_r_table/page_205.pdf
Element not found within specified time.
377 file:///D:/Temp/htmls/AN202311151610996963_r_table/page_26.pdf
Element not found within specified time.
378 file:///D:/Temp/htmls/AN202311151610996963_r_table/page_33.pdf
Element not found within specified time.
379 file:///D:/Temp/htmls/AN202311151610996963_r_table/page_5.pdf
Element not found within specified time.
380 file:///D:/Temp/htmls/AN202311151610996963_r_table/page_6.pdf
Element not found within specified time.
381 file:///D:/Temp/htmls/AN202311151610996963_r_table/page_9.pdf
Element not found within specified time.
382 file:///D:/Temp/htmls/AN202311161611013349_r_mini/page_1.pdf
Element not found within specified time.
383 file:///D:/Temp/htmls/AN202311161611013353_r_mini_text_miss_r_merge/page_1.pdf
Element not found within specified time.
384 file:///D:/Temp/htmls/AN202311161611013355_r/page_1.pdf
Element not found within specified time.
385 file:///D:/Temp/htmls/AN202311161611013379_r_text_miss_x/page_1.pdf
Element not found within specified time.
386 file:///D:/Temp/htmls/AN202311161611013391_r_mini/page_1.pdf
Element not found within specified time.
387 file:///D:/Temp/htmls/AN202311161611013392_r_mini/page_1.pdf
Element not found within specified time.
388 file:///D:/Temp/htmls/AN202311161611013398_r_mini/page_1.pdf
Element not found within specified time.
389 file:///D:/Temp/htmls/AN202311161611013404_r/page_1.pdf
Element not found within specified time.
390 file:///D:/Temp/htmls/AN202311161611019493_r_table/page_187.pdf
Element not found within specified time.
391 file:///D:/Temp/htmls/AN202311161611025479_r_order/page_1.pdf
Element not found within specified time.
In [35]:
# 关闭 WebDriver
driver.quit()
In [1]:
from tqdm import tqdm
import json
from operator import itemgetter
import math
import os


def px_text_to_float(args):
    for arg in args:
        yield float(arg.replace("px", ""))


def apply_scale(items, factor):
    return [it / factor for it in items]

def to_chars(word,hw , str_id):
    (page_height, page_width) = hw
    ct = len(word['text'])
    w = word['width'] / ct
    chars = []
    for i in range(ct):
        char = {
            "matrix": word['matrix'],
            "fontname": word['fontname'],
            "adv": word['adv'],
            "render": 0,
            "char_ind": ((str_id, i), i),
            "is_horizontal": word['is_horizontal'],
            "upright": word['is_horizontal'],
            "x0":  word['x0'] + i * w,
            "y0": page_height - word['bottom'],
            "x1": word['x0'] + i * w + w,
            "y1": page_height - word['top'],
            "width": w,
            "height": word['height'],
            "size": word['size'],
            "object_type": "char",
            "page_number": 1,
            "stroking_color": 0,
            "non_stroking_color": (0, 0, 0, 1),
            "gs": {
                "linewidth": 0,
                "linecap": None,
                "linejoin": None,
                "miterlimit": None,
                "dash": None,
                "intent": None,
                "flatness": None,
                "scolor": 0,
                "ncolor": (0, 0, 0, 1),
            },
            "text": word['text'][i],
            "top": word['top'],
            "bottom": word['bottom'],
            "doctop": word['top'],
            "angle": word['angle'],
        }
        chars.append(char)
    return chars


def to_word(textBlock: dict, scaleFactor: float, page, str_id) -> dict:
    
    items = itemgetter("left", "top", "width", "height", 'font-size')
    is_horizontal = textBlock["direction"] == "ltr"
    left, top, width, height, fontSize = apply_scale(
        px_text_to_float(items(textBlock)), scaleFactor
    )
    for ind, attr in enumerate(("left", "top", "width", "height", 'font-size')):
        textBlock[attr] = (left, top, width, height, fontSize)[ind]
    matrix = eval(textBlock["transform"].replace("none", "None").replace("matrix", ""))
    angle = '0'
    if isinstance(matrix, tuple):
        angle = str(round(math.degrees(math.atan2(matrix[1], matrix[3]))) % 360)
    word = {
        "text": textBlock["text"],
        "height": height,
        "width": width,
        "matrix": matrix,
        "x0": left,
        "x1": left + width,
        "top": top,
        "bottom": top + height,
        "size": height if is_horizontal else width,
        "adv": 1.0,
        "angle": angle,
        # "chars": to_chars(textBlock),
        "is_horizontal": is_horizontal,
        "fontname": textBlock['font-family'],
    }
    word["chars"] = to_chars(word, (page['height'], page['width']), str_id)
    return word

def convert(words_json_path):
    with open(words_json_path, 'r', encoding='utf-8') as fpr:
        page = json.load(fpr)
    textBlocks = page['textBlocks']
    scaleFactor = page['scaleFactor']

    page['height'], page['width'] = apply_scale(px_text_to_float((page['height'], page['width'])), scaleFactor)

    words = []
    for ind, textBlock in enumerate(textBlocks):
        if len(textBlock['text'].strip()) < 1 \
            or textBlock['width'] == 0 \
            or textBlock['height'] == 0:
            # print("skip", words_json_path)
            continue
        words.append(to_word(textBlock, scaleFactor, page, ind))
    with open(rf"D:/Temp/words/{os.path.basename(words_json_path)}", 'w', encoding='utf-8') as fpw:
        json.dump(words, fpw, ensure_ascii=False)
In [ ]:
src_dir = r'C:\Users\Administrator\Downloads'

for words_json in tqdm(os.listdir(src_dir), desc='items'):
    if not words_json.endswith('.json'):
        continue
        
    words_json_path = os.path.join(src_dir, words_json)
    convert(words_json_path)
In [2]:
# convert(r'D:\Downloads\AN202303161584313457_r_144_invisiable_text_page_139.json')
# convert(r'D:\Downloads\AN202303141584239808_page_1.json')
# convert(r'D:\Downloads\AN202308181595092754_r_misordered_page_1.json')
# convert(r'D:\Downloads\AN202112271537020821_x_page_50.json')
# convert(r'D:\Downloads\AN202311161611027928_x_page_1.json')
convert(r'D:\Downloads\words_page_1.json')
In [29]:
with open(r'D:\Downloads\AN202311161611027928_x_page_1.json', 'r', encoding='utf-8') as fpr:
        page = json.load(fpr)
textBlocks = page['textBlocks']
scaleFactor = page['scaleFactor']

page['height'], page['width'] = apply_scale(px_text_to_float((page['height'], page['width'])), scaleFactor)

words = []
for ind, textBlock in enumerate(textBlocks):
    if len(textBlock['text']) < 1:
        # print("skip", words_json_path)
        continue
    words.append(to_word(textBlock, scaleFactor, page, ind))
with open(rf"D:/Temp/words/{os.path.basename(words_json)}", 'w', encoding='utf-8') as fpw:
    json.dump(words, fpw, ensure_ascii=False)
In [31]:
rf"D:/Temp/words/{os.path.basename(words_json)}"
Out[31]:
'D:/Temp/words/TortoiseGit-LanguagePack-2.15.0.0-64bit-zh_CN.msi.rsa.asc'
In [1]:
import pandas as pd   
infocode_pags = pd.read_csv(r'd:/infocodes_totalpages.csv')
In [3]:
infocode_pags.drop_duplicates(['infocode'])
Out[3]:
infocode total_pages
0 AN202311161611028130 3
1 AN202311161611028247 1
2 AN202311161611028107 1
3 AN202311161611028083 3
4 AN202311161611028082 2
... ... ...
995 AN202311151610993566 41
996 AN202311151610993569 34
997 AN202311151610993584 8
998 AN202311151610993563 20
999 AN202311151610993364 3

1000 rows × 2 columns

In [24]:
import requests

cookies = {
    'sid': 'ef0e68df-ce10-4835-ac76-ce5f9978f831',
    'qgqp_b_id': 'af916ddbf277df5130a755f5ec7ea73c',
    'st_pvi': '97517784234262',
    'st_sp': '2023-09-11%2010%3A50%3A09',
    'st_inirUrl': '',
    'authorized-token': '{%22expires%22:null}',
    'sidebarStatus': '0',
    'token': 'MUEyNEM3QUEzMjQwQjk4MzE2Q0RCN0E0N0QyMDhENTU2QUY4QzBERUMxQTk2MjE3QUI3OTQ4MDY4NERGOTMzNQ==',
    'pub_ticket': 'b03b1663-1f22-40c0-910a-1cfdda7da616',
    'pub_empId': '210891',
    'CAS_badge': '210891',
    'CAS_name': '%E6%A2%81%E5%BE%B7%E8%83%9C',
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/119.0',
    'Accept': 'application/json, text/plain, */*',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    # 'Accept-Encoding': 'gzip, deflate, br',
    'Content-Type': 'application/json',
    'X-Requested-With': 'XMLHttpRequest',
    'Authorization': 'Bearer undefined',
    'Origin': 'https://new-ucp-mgr.eastmoney.com',
    'Connection': 'keep-alive',
    'Referer': 'https://new-ucp-mgr.eastmoney.com/',
    # 'Cookie': 'sid=ef0e68df-ce10-4835-ac76-ce5f9978f831; qgqp_b_id=af916ddbf277df5130a755f5ec7ea73c; st_pvi=97517784234262; st_sp=2023-09-11%2010%3A50%3A09; st_inirUrl=; authorized-token={%22expires%22:null}; sidebarStatus=0; token=MUEyNEM3QUEzMjQwQjk4MzE2Q0RCN0E0N0QyMDhENTU2QUY4QzBERUMxQTk2MjE3QUI3OTQ4MDY4NERGOTMzNQ==; pub_ticket=b03b1663-1f22-40c0-910a-1cfdda7da616; pub_empId=210891; CAS_badge=210891; CAS_name=%E6%A2%81%E5%BE%B7%E8%83%9C',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
}

json_data = {
    'name': '10.150.108.203',
    'type': 'S',
    'code': '10.150.108.203',
    'ip': '10.150.108.203',
    'zoneId': 19,
    'healthState': 'G',
}
for i in range(10, 11):
    json_data['name'] = f'10.150.108.2{i}'
    json_data['code'] = f'10.150.108.2{i}'
    json_data['ip'] = f'10.150.108.2{i}'
    response = requests.post('https://new-ucp-mgr.eastmoney.com/ucp-mgr/api/device/', cookies=cookies, headers=headers, json=json_data)

# Note: json_data will not be serialized by requests
# exactly as it was in the original request.
#data = '{"name":"10.150.108.203","type":"S","code":"10.150.108.203","ip":"10.150.108.203","zoneId":19,"healthState":"G"}'
#response = requests.post('https://new-ucp-mgr.eastmoney.com/ucp-mgr/api/device/', cookies=cookies, headers=headers, data=data)
In [20]:
import os
import shutil
from lds_util import *

base_dir = r'D:\Temp\htmls'

ct = 0
has_read = 0
infocodes = []
for infocode in os.listdir(base_dir):
    if '_x' in infocode:
        ct += 1
        shutil.move(os.path.join(base_dir, infocode), f'd:/Temp/xxx/{infocode}')
In [14]:
import os
import shutil
from lds_util import *

base_dir = r'D:\Temp\htmls'

ct = 0
has_read = 0
infocodes = []
for infocode in os.listdir(base_dir):
    if not '_' in infocode:
        ct += 1
        infocodes.append(infocode)
    else:
        has_read += 1
print(ct, has_read)
102 170
In [13]:
import pandas as pd   

pd.DataFrame(infocodes).to_excel("todo_infocodes.xlsx")
In [9]:
import os
import shutil
from lds_util import *

base_dir = r'D:\Temp\htmls'

for infocode in os.listdir(base_dir):
    print(infocode)
    pages_path = os.path.join(base_dir, infocode)
    if os.path.exists(os.path.join(base_dir, infocode, 'custom_srot_text.txt')):
        os.remove(os.path.join(base_dir, infocode, 'custom_srot_text.txt'))
    if os.path.exists(os.path.join(base_dir, infocode, 'use_text_flow_text.txt')):
        os.remove(os.path.join(base_dir, infocode, 'use_text_flow_text.txt'))
    if os.path.exists(pages_path):
        for dirpath, dirnames, filenames  in os.walk(os.path.join(r'D:\Temp\compare_result', infocode.split("_")[0])):
            for file in filenames:
                if file.endswith('txt'):
                    shutil.copyfile(os.path.join(dirpath, file), os.path.join(base_dir, infocode, file.replace('.txt', f'{os.path.basename(dirpath)}.txt')))
AN202112271537020821_x
AN202302161583234765_x_z
AN202302161583234767_z_z
AN202302161583234769_z_z
AN202302161583234772_z_z
AN202302161583234774_z_z
AN202302161583234775_z_z
AN202302161583234780_z_z
AN202302161583234781_z
AN202302161583234782_z
AN202302161583234785_z
AN202302161583234787
AN202302161583234788_z
AN202302161583234790_z
AN202302161583234800_z
AN202302161583234803_z
AN202302161583234808_x_z
AN202302161583234814_z
AN202302161583234822_z
AN202302161583234825_z
AN202302161583234832_z
AN202302161583234833_z
AN202302161583234836_z
AN202302161583234851_z
AN202302161583234860_z
AN202302161583234868_z
AN202302161583234872_z
AN202302161583234875_z
AN202302161583234876_z
AN202302161583234881_z
AN202302161583234888_z
AN202302161583234899_z
AN202302161583234916_z
AN202302161583234923_z
AN202302161583234927_z
AN202302161583234933_z
AN202302161583234940_z
AN202302161583234942_z
AN202302161583234960_z
AN202302161583234974_z
AN202302161583234980_z
AN202302161583234981_z
AN202302161583234985_z
AN202302161583234994_z
AN202302161583234995_z
AN202302161583235545_z
AN202302161583235569_z
AN202302161583235580_z
AN202302161583235590_z
AN202302161583235607_z
AN202302161583235608_z
AN202302161583235615_z
AN202302161583235620_z
AN202302161583235623_z
AN202302161583235636_z
AN202302161583235639_z
AN202302161583235642_z
AN202302161583235643_z
AN202302161583235645_z
AN202302161583235649_z
AN202302161583235660_z
AN202302161583235690_z
AN202302161583235693_z
AN202302161583235710_z
AN202302161583235711_z
AN202302161583235713_z
AN202302161583235728_z
AN202302161583235756_z
AN202302161583235759_z
AN202302161583235762_z
AN202302161583235774_z
AN202302161583235782_z
AN202302161583235784_z
AN202302161583235795_z
AN202302161583235797_z
AN202302161583235811_z
AN202302161583235814_z
AN202302161583235818_z
AN202302161583235819_z
AN202302161583235822_z
AN202302161583235830_z
AN202302161583235834_z
AN202302161583235838_z
AN202302161583235840_z
AN202302161583235841_z
AN202302161583235847_z
AN202302161583235855_z
AN202302161583235859_z
AN202302161583235860_z
AN202302161583235870_z
AN202302161583235880_z
AN202302161583235882_z
AN202302161583235883_z
AN202302161583235889_z
AN202302161583235896_z
AN202302161583235908
AN202302161583235915_z
AN202302161583235921_z
AN202302161583235928_z
AN202302161583235940_z
AN202302161583235947_z
AN202302161583235951_z
AN202302161583235961_z
AN202302161583235963_z
AN202302161583236537_z
AN202302161583236544_z
AN202302161583236561_z
AN202302161583236562_z
AN202302161583236565_z
AN202302161583236582_z
AN202302281583881638
AN202303011583892542
AN202303011583892545
AN202303011583892548
AN202303011583892551
AN202303011583892552
AN202303011583892555
AN202303011583892556
AN202303011583892578
AN202303011583892582
AN202303011583892586
AN202303011583892589
AN202303011583892592
AN202303011583892593
AN202303011583892598
AN202303011583892601_z
AN202303011583897916
AN202303011583898495
AN202303011583898575
AN202303011583898653
AN202303011583898726
AN202303011583899064
AN202303011583901479
AN202303011583903892
AN202303011583910026
AN202303011583910543
AN202303011583913104
AN202303011583915212
AN202303011583917131
AN202303011583920374
AN202303011583920769
AN202303011583920817
AN202303011583920850
AN202303011583922005
AN202303011583922259
AN202303011583922267
AN202303011583922619
AN202303011583922685_z
AN202303011583922751
AN202303011583923603
AN202303011583923803
AN202303011583924194
AN202303011583924228
AN202303011583924785
AN202303011583924786
AN202303011583924970_z
AN202303011583925000
AN202303011583925002
AN202303011583925202
AN202303011583925291
AN202303011583925660
AN202303011583925672
AN202303011583925685
AN202303011583925830
AN202303011583926787
AN202303011583927517
AN202303141584239808
AN202303151584289331
AN202303151584289605
AN202303151584289611
AN202303151584289616
AN202303151584291140
AN202303151584292038
AN202303151584292945
AN202303151584293181
AN202303151584294223_z
AN202303151584294225
AN202303151584295721
AN202303151584295941
AN202303151584296409
AN202303151584296448_z
AN202303151584296823
AN202303161584298993
AN202303161584303367
AN202303161584303376
AN202303161584303443
AN202303161584303902
AN202303161584303903
AN202303161584303904_z
AN202303161584303905
AN202303161584303906
AN202303161584303907
AN202303161584303908
AN202303161584303911
AN202303161584303913
AN202303161584303915_z
AN202303161584303916
AN202303161584303917
AN202303161584303919
AN202303161584303920
AN202303161584303922
AN202303161584303924_z
AN202303161584303925
AN202303161584303928
AN202303161584303930_z
AN202303161584303934_z
AN202303161584304119
AN202303161584304121
AN202303161584304123
AN202303161584304125
AN202303161584304127
AN202303161584304128
AN202303161584304132
AN202303161584304134
AN202303161584304136
AN202303161584304137
AN202303161584304140_z
AN202303161584304147
AN202303161584304152
AN202303161584304157_z
AN202303161584304160
AN202303161584304166_z
AN202303161584311395
AN202303161584311613
AN202303161584312005
AN202303161584312613
AN202303161584313298
AN202303161584313359
AN202303161584313361
AN202303161584313374
AN202303161584313376
AN202303161584313381
AN202303161584313385
AN202303161584313457
AN202303161584313465
AN202303161584313810
AN202303161584313855
AN202303161584314390
AN202303161584315523
AN202303161584315719
AN202303161584315766
AN202303161584316679_z
AN202303161584317034
AN202303161584318554
AN202309051597490339
AN202309151598686950
AN202311151610996963
AN202311151610996966
AN202311161611013349
AN202311161611013353
AN202311161611013355
AN202311161611013377_z
AN202311161611013379
AN202311161611013391
AN202311161611013392
AN202311161611013398
AN202311161611013404
AN202311161611013773
AN202311161611018540
AN202311161611019423
AN202311161611019489
AN202311161611019493
AN202311161611019511
AN202311161611019537
AN202311161611019545
AN202311161611021874
AN202311161611025479
AN202311161611026592
AN202311161611026594
AN202311161611027171
AN202311161611027185
AN202311161611027928_x
In [4]:
import os
import shutil
from lds_util import *

base_dir = r'D:\Temp\htmls'

for infocode in os.listdir(base_dir):
    pages_path = os.path.join(base_dir, infocode)
    if os.path.exists(pages_path):
        if tuple(filter(lambda it: it.endswith('.pdf'),os.listdir(pages_path))) == ('page_1.pdf', 'page_2.pdf'):
            shutil.move(pages_path, pages_path + '_z')
In [2]:
import os
import shutil
from lds_util import *

base_dir = r'D:\Temp\compare_result'

for infocode in os.listdir(base_dir):
    pages_path = os.path.join(base_dir, infocode, "pages")
    if os.path.exists(pages_path):
        for file in os.listdir(pages_path):
            if file.endswith('.html') or file.endswith('.pdf'):
                shutil.copyfile(os.path.join(pages_path, file), 
                                mkpdir(fr'd:/Temp/htmls/{infocode}/{file}'))
makedirs d:/Temp/htmls/AN202112271537020821
makedirs d:/Temp/htmls/AN202302161583234765
makedirs d:/Temp/htmls/AN202302161583234767
makedirs d:/Temp/htmls/AN202302161583234769
makedirs d:/Temp/htmls/AN202302161583234772
makedirs d:/Temp/htmls/AN202302161583234774
makedirs d:/Temp/htmls/AN202302161583234775
makedirs d:/Temp/htmls/AN202302161583234780
makedirs d:/Temp/htmls/AN202302161583234781
makedirs d:/Temp/htmls/AN202302161583234782
makedirs d:/Temp/htmls/AN202302161583234785
makedirs d:/Temp/htmls/AN202302161583234787
makedirs d:/Temp/htmls/AN202302161583234788
makedirs d:/Temp/htmls/AN202302161583234790
makedirs d:/Temp/htmls/AN202302161583234800
makedirs d:/Temp/htmls/AN202302161583234803
makedirs d:/Temp/htmls/AN202302161583234808
makedirs d:/Temp/htmls/AN202302161583234814
makedirs d:/Temp/htmls/AN202302161583234822
makedirs d:/Temp/htmls/AN202302161583234825
makedirs d:/Temp/htmls/AN202302161583234832
makedirs d:/Temp/htmls/AN202302161583234833
makedirs d:/Temp/htmls/AN202302161583234836
makedirs d:/Temp/htmls/AN202302161583234851
makedirs d:/Temp/htmls/AN202302161583234860
makedirs d:/Temp/htmls/AN202302161583234868
makedirs d:/Temp/htmls/AN202302161583234872
makedirs d:/Temp/htmls/AN202302161583234875
makedirs d:/Temp/htmls/AN202302161583234876
makedirs d:/Temp/htmls/AN202302161583234881
makedirs d:/Temp/htmls/AN202302161583234888
makedirs d:/Temp/htmls/AN202302161583234899
makedirs d:/Temp/htmls/AN202302161583234916
makedirs d:/Temp/htmls/AN202302161583234923
makedirs d:/Temp/htmls/AN202302161583234927
makedirs d:/Temp/htmls/AN202302161583234933
makedirs d:/Temp/htmls/AN202302161583234940
makedirs d:/Temp/htmls/AN202302161583234942
makedirs d:/Temp/htmls/AN202302161583234960
makedirs d:/Temp/htmls/AN202302161583234974
makedirs d:/Temp/htmls/AN202302161583234980
makedirs d:/Temp/htmls/AN202302161583234981
makedirs d:/Temp/htmls/AN202302161583234985
makedirs d:/Temp/htmls/AN202302161583234994
makedirs d:/Temp/htmls/AN202302161583234995
makedirs d:/Temp/htmls/AN202302161583235545
makedirs d:/Temp/htmls/AN202302161583235569
makedirs d:/Temp/htmls/AN202302161583235580
makedirs d:/Temp/htmls/AN202302161583235590
makedirs d:/Temp/htmls/AN202302161583235607
makedirs d:/Temp/htmls/AN202302161583235608
makedirs d:/Temp/htmls/AN202302161583235615
makedirs d:/Temp/htmls/AN202302161583235620
makedirs d:/Temp/htmls/AN202302161583235623
makedirs d:/Temp/htmls/AN202302161583235636
makedirs d:/Temp/htmls/AN202302161583235639
makedirs d:/Temp/htmls/AN202302161583235642
makedirs d:/Temp/htmls/AN202302161583235643
makedirs d:/Temp/htmls/AN202302161583235645
makedirs d:/Temp/htmls/AN202302161583235649
makedirs d:/Temp/htmls/AN202302161583235660
makedirs d:/Temp/htmls/AN202302161583235690
makedirs d:/Temp/htmls/AN202302161583235693
makedirs d:/Temp/htmls/AN202302161583235710
makedirs d:/Temp/htmls/AN202302161583235711
makedirs d:/Temp/htmls/AN202302161583235713
makedirs d:/Temp/htmls/AN202302161583235728
makedirs d:/Temp/htmls/AN202302161583235756
makedirs d:/Temp/htmls/AN202302161583235759
makedirs d:/Temp/htmls/AN202302161583235762
makedirs d:/Temp/htmls/AN202302161583235774
makedirs d:/Temp/htmls/AN202302161583235782
makedirs d:/Temp/htmls/AN202302161583235784
makedirs d:/Temp/htmls/AN202302161583235795
makedirs d:/Temp/htmls/AN202302161583235797
makedirs d:/Temp/htmls/AN202302161583235811
makedirs d:/Temp/htmls/AN202302161583235814
makedirs d:/Temp/htmls/AN202302161583235818
makedirs d:/Temp/htmls/AN202302161583235819
makedirs d:/Temp/htmls/AN202302161583235822
makedirs d:/Temp/htmls/AN202302161583235830
makedirs d:/Temp/htmls/AN202302161583235834
makedirs d:/Temp/htmls/AN202302161583235838
makedirs d:/Temp/htmls/AN202302161583235840
makedirs d:/Temp/htmls/AN202302161583235841
makedirs d:/Temp/htmls/AN202302161583235847
makedirs d:/Temp/htmls/AN202302161583235855
makedirs d:/Temp/htmls/AN202302161583235859
makedirs d:/Temp/htmls/AN202302161583235860
makedirs d:/Temp/htmls/AN202302161583235870
makedirs d:/Temp/htmls/AN202302161583235880
makedirs d:/Temp/htmls/AN202302161583235882
makedirs d:/Temp/htmls/AN202302161583235883
makedirs d:/Temp/htmls/AN202302161583235889
makedirs d:/Temp/htmls/AN202302161583235896
makedirs d:/Temp/htmls/AN202302161583235908
makedirs d:/Temp/htmls/AN202302161583235915
makedirs d:/Temp/htmls/AN202302161583235921
makedirs d:/Temp/htmls/AN202302161583235928
makedirs d:/Temp/htmls/AN202302161583235940
makedirs d:/Temp/htmls/AN202302161583235947
makedirs d:/Temp/htmls/AN202302161583235951
makedirs d:/Temp/htmls/AN202302161583235961
makedirs d:/Temp/htmls/AN202302161583235963
makedirs d:/Temp/htmls/AN202302161583236537
makedirs d:/Temp/htmls/AN202302161583236544
makedirs d:/Temp/htmls/AN202302161583236561
makedirs d:/Temp/htmls/AN202302161583236562
makedirs d:/Temp/htmls/AN202302161583236565
makedirs d:/Temp/htmls/AN202302161583236582
makedirs d:/Temp/htmls/AN202302281583881638
makedirs d:/Temp/htmls/AN202303011583892542
makedirs d:/Temp/htmls/AN202303011583892545
makedirs d:/Temp/htmls/AN202303011583892548
makedirs d:/Temp/htmls/AN202303011583892551
makedirs d:/Temp/htmls/AN202303011583892552
makedirs d:/Temp/htmls/AN202303011583892555
makedirs d:/Temp/htmls/AN202303011583892556
makedirs d:/Temp/htmls/AN202303011583892578
makedirs d:/Temp/htmls/AN202303011583892582
makedirs d:/Temp/htmls/AN202303011583892586
makedirs d:/Temp/htmls/AN202303011583892589
makedirs d:/Temp/htmls/AN202303011583892592
makedirs d:/Temp/htmls/AN202303011583892593
makedirs d:/Temp/htmls/AN202303011583892598
makedirs d:/Temp/htmls/AN202303011583892601
makedirs d:/Temp/htmls/AN202303011583897916
makedirs d:/Temp/htmls/AN202303011583898495
makedirs d:/Temp/htmls/AN202303011583898575
makedirs d:/Temp/htmls/AN202303011583898653
makedirs d:/Temp/htmls/AN202303011583898726
makedirs d:/Temp/htmls/AN202303011583899064
makedirs d:/Temp/htmls/AN202303011583901479
makedirs d:/Temp/htmls/AN202303011583903892
makedirs d:/Temp/htmls/AN202303011583910026
makedirs d:/Temp/htmls/AN202303011583910543
makedirs d:/Temp/htmls/AN202303011583913104
makedirs d:/Temp/htmls/AN202303011583915212
makedirs d:/Temp/htmls/AN202303011583917131
makedirs d:/Temp/htmls/AN202303011583920374
makedirs d:/Temp/htmls/AN202303011583920769
makedirs d:/Temp/htmls/AN202303011583920817
makedirs d:/Temp/htmls/AN202303011583920850
makedirs d:/Temp/htmls/AN202303011583922005
makedirs d:/Temp/htmls/AN202303011583922259
makedirs d:/Temp/htmls/AN202303011583922267
makedirs d:/Temp/htmls/AN202303011583922619
makedirs d:/Temp/htmls/AN202303011583922685
makedirs d:/Temp/htmls/AN202303011583922751
makedirs d:/Temp/htmls/AN202303011583923603
makedirs d:/Temp/htmls/AN202303011583923803
makedirs d:/Temp/htmls/AN202303011583924194
makedirs d:/Temp/htmls/AN202303011583924228
makedirs d:/Temp/htmls/AN202303011583924785
makedirs d:/Temp/htmls/AN202303011583924786
makedirs d:/Temp/htmls/AN202303011583924970
makedirs d:/Temp/htmls/AN202303011583925000
makedirs d:/Temp/htmls/AN202303011583925002
makedirs d:/Temp/htmls/AN202303011583925202
makedirs d:/Temp/htmls/AN202303011583925291
makedirs d:/Temp/htmls/AN202303011583925660
makedirs d:/Temp/htmls/AN202303011583925672
makedirs d:/Temp/htmls/AN202303011583925685
makedirs d:/Temp/htmls/AN202303011583925830
makedirs d:/Temp/htmls/AN202303011583926787
makedirs d:/Temp/htmls/AN202303011583927517
makedirs d:/Temp/htmls/AN202303141584239808
makedirs d:/Temp/htmls/AN202303151584289331
makedirs d:/Temp/htmls/AN202303151584289605
makedirs d:/Temp/htmls/AN202303151584289611
makedirs d:/Temp/htmls/AN202303151584289616
makedirs d:/Temp/htmls/AN202303151584291140
makedirs d:/Temp/htmls/AN202303151584292038
makedirs d:/Temp/htmls/AN202303151584292945
makedirs d:/Temp/htmls/AN202303151584293181
makedirs d:/Temp/htmls/AN202303151584294223
makedirs d:/Temp/htmls/AN202303151584294225
makedirs d:/Temp/htmls/AN202303151584295721
makedirs d:/Temp/htmls/AN202303151584295941
makedirs d:/Temp/htmls/AN202303151584296409
makedirs d:/Temp/htmls/AN202303151584296448
makedirs d:/Temp/htmls/AN202303151584296823
makedirs d:/Temp/htmls/AN202303161584298993
makedirs d:/Temp/htmls/AN202303161584303367
makedirs d:/Temp/htmls/AN202303161584303376
makedirs d:/Temp/htmls/AN202303161584303443
makedirs d:/Temp/htmls/AN202303161584303902
makedirs d:/Temp/htmls/AN202303161584303903
makedirs d:/Temp/htmls/AN202303161584303904
makedirs d:/Temp/htmls/AN202303161584303905
makedirs d:/Temp/htmls/AN202303161584303906
makedirs d:/Temp/htmls/AN202303161584303907
makedirs d:/Temp/htmls/AN202303161584303908
makedirs d:/Temp/htmls/AN202303161584303911
makedirs d:/Temp/htmls/AN202303161584303913
makedirs d:/Temp/htmls/AN202303161584303915
makedirs d:/Temp/htmls/AN202303161584303916
makedirs d:/Temp/htmls/AN202303161584303917
makedirs d:/Temp/htmls/AN202303161584303919
makedirs d:/Temp/htmls/AN202303161584303920
makedirs d:/Temp/htmls/AN202303161584303922
makedirs d:/Temp/htmls/AN202303161584303924
makedirs d:/Temp/htmls/AN202303161584303925
makedirs d:/Temp/htmls/AN202303161584303928
makedirs d:/Temp/htmls/AN202303161584303930
makedirs d:/Temp/htmls/AN202303161584303934
makedirs d:/Temp/htmls/AN202303161584304119
makedirs d:/Temp/htmls/AN202303161584304121
makedirs d:/Temp/htmls/AN202303161584304123
makedirs d:/Temp/htmls/AN202303161584304125
makedirs d:/Temp/htmls/AN202303161584304127
makedirs d:/Temp/htmls/AN202303161584304128
makedirs d:/Temp/htmls/AN202303161584304132
makedirs d:/Temp/htmls/AN202303161584304134
makedirs d:/Temp/htmls/AN202303161584304136
makedirs d:/Temp/htmls/AN202303161584304137
makedirs d:/Temp/htmls/AN202303161584304140
makedirs d:/Temp/htmls/AN202303161584304147
makedirs d:/Temp/htmls/AN202303161584304152
makedirs d:/Temp/htmls/AN202303161584304157
makedirs d:/Temp/htmls/AN202303161584304160
makedirs d:/Temp/htmls/AN202303161584304166
makedirs d:/Temp/htmls/AN202303161584311395
makedirs d:/Temp/htmls/AN202303161584311613
makedirs d:/Temp/htmls/AN202303161584312005
makedirs d:/Temp/htmls/AN202303161584312613
makedirs d:/Temp/htmls/AN202303161584313298
makedirs d:/Temp/htmls/AN202303161584313359
makedirs d:/Temp/htmls/AN202303161584313361
makedirs d:/Temp/htmls/AN202303161584313374
makedirs d:/Temp/htmls/AN202303161584313376
makedirs d:/Temp/htmls/AN202303161584313381
makedirs d:/Temp/htmls/AN202303161584313385
makedirs d:/Temp/htmls/AN202303161584313457
makedirs d:/Temp/htmls/AN202303161584313465
makedirs d:/Temp/htmls/AN202303161584313810
makedirs d:/Temp/htmls/AN202303161584313855
makedirs d:/Temp/htmls/AN202303161584314390
makedirs d:/Temp/htmls/AN202303161584315523
makedirs d:/Temp/htmls/AN202303161584315719
makedirs d:/Temp/htmls/AN202303161584315766
makedirs d:/Temp/htmls/AN202303161584316679
makedirs d:/Temp/htmls/AN202303161584317034
makedirs d:/Temp/htmls/AN202303161584318554
makedirs d:/Temp/htmls/AN202309051597490339
makedirs d:/Temp/htmls/AN202309151598686950
makedirs d:/Temp/htmls/AN202311151610996963
makedirs d:/Temp/htmls/AN202311151610996966
makedirs d:/Temp/htmls/AN202311161611013349
makedirs d:/Temp/htmls/AN202311161611013353
makedirs d:/Temp/htmls/AN202311161611013355
makedirs d:/Temp/htmls/AN202311161611013377
makedirs d:/Temp/htmls/AN202311161611013379
makedirs d:/Temp/htmls/AN202311161611013391
makedirs d:/Temp/htmls/AN202311161611013392
makedirs d:/Temp/htmls/AN202311161611013398
makedirs d:/Temp/htmls/AN202311161611013404
makedirs d:/Temp/htmls/AN202311161611013773
makedirs d:/Temp/htmls/AN202311161611018540
makedirs d:/Temp/htmls/AN202311161611019423
makedirs d:/Temp/htmls/AN202311161611019489
makedirs d:/Temp/htmls/AN202311161611019493
makedirs d:/Temp/htmls/AN202311161611019511
makedirs d:/Temp/htmls/AN202311161611019537
makedirs d:/Temp/htmls/AN202311161611019545
makedirs d:/Temp/htmls/AN202311161611021874
makedirs d:/Temp/htmls/AN202311161611025479
makedirs d:/Temp/htmls/AN202311161611026592
makedirs d:/Temp/htmls/AN202311161611026594
makedirs d:/Temp/htmls/AN202311161611027171
makedirs d:/Temp/htmls/AN202311161611027185
makedirs d:/Temp/htmls/AN202311161611027928
In [1]:
import os
import json
import shutil

basedir = r'D:\Temp\p2h'

# if os.path.exists(r'd:/Temp/compare_result'):
#     shutil.rmtree('d:/Temp/compare_result')
def list_infocodes() -> list[str]:
    yield from os.listdir(basedir)

def compare_words(custom_srot_words, use_text_flow_words) -> tuple[bool, str, str]:
    custom_srot_wd_ls = list(map(lambda it: it['text'], custom_srot_words))
    use_text_flow_wd_ls = list(map(lambda it: it['text'], use_text_flow_words))
    
    target_text = "".join(use_text_flow_wd_ls).replace(' ', '')
    # TODO 
    same = True
    for wd in custom_srot_wd_ls:
        if wd.replace(' ', '') not in target_text:
            same = False
            break
    # same = "".join(custom_srot_wd_ls).replace(" ", '') == "".join(use_text_flow_wd_ls).replace(" ", '')
    return (same,
        "\n".join(custom_srot_wd_ls),
        "\n".join(use_text_flow_wd_ls)
    )

from lds_util import *

not_same_ct = 0
all_ct = 0
infocodes = set()
for infocode in '''AN202303011583892542
AN202303011583925256
AN202309051597490339
AN202309151598686950
AN202112271537020821
AN202303141584239808
AN202304201585626705'''.splitlines():    
# for infocode in list_infocodes():
    pdf_dir = fr'{basedir}\{infocode}'
    for root, dirs, files in os.walk(pdf_dir):
        for file in files:
            if 'use_text_flow_words' in file:
                try:
                    fw1 = os.path.join(root, file.replace('use_text_flow_words', 'custom_srot_words'))
                    with open(fw1, 'r', encoding='utf-8') as fpr:
                        custom_srot_words = json.load(fpr)
                        custom_srot_words.sort(key=lambda it: int(it['top']/3))
                        with open(fw1.replace(".json", "_text.txt"), 'w', encoding='utf-8') as fpw:
                            fpw.write('\n'.join(map(lambda it: it['text'], custom_srot_words)))
                    
                    fw2 = os.path.join(root, file)
                    with open(fw2, 'r', encoding='utf-8') as fpr:
                        use_text_flow_words = json.load(fpr)
                        use_text_flow_words.sort(key=lambda it: int(it['top']/3))
                        with open(fw2.replace(".json", "_text.txt"), 'w', encoding='utf-8') as fpw:
                            fpw.write('\n'.join(map(lambda it: it['text'], use_text_flow_words)))
                    
                    i = file.replace('use_text_flow_words_', '').replace('.json', '')
                    i = int(i) + 1
                    same, custom_srot_text, use_text_flow_text = compare_words(custom_srot_words, use_text_flow_words)
                    infocodes.add(infocode)
                    if not same:
                        not_same_ct += 1
                        print(f'{not_same_ct}/{all_ct}', "="*15, infocode, i, '='*15)
                        with open(mkpdir(f'd:/Temp/compare_result/{infocode}/{i}/custom_srot_text.txt'), 'w', encoding='utf-8') as fpw:
                            fpw.write(custom_srot_text)
                        with open(mkpdir(f'd:/Temp/compare_result/{infocode}/{i}/use_text_flow_text.txt'), 'w', encoding='utf-8') as fpw:
                            fpw.write(use_text_flow_text)
                        shutil.copyfile(fw1, mkpdir(f'd:/Temp/compare_result/{infocode}/{i}/custom_srot_text_{i}.json'))
                        shutil.copyfile(fw1, mkpdir(f'd:/Temp/compare_result/{infocode}/{i}/use_text_flow_text_{i}.json'))
                        shutil.copyfile(fr'D:\Temp\p2h\{infocode}\{infocode}.pdf', mkpdir(f'd:/Temp/compare_result/{infocode}/{infocode}.pdf'))
                        # print(custom_srot_text)
                        # print("-"*15, '-'*15)
                        # print(use_text_flow_text)
                    elif len(custom_srot_words) > 0:
                        all_ct += 1
                except Exception as e:
                    print(e)
                    
                
Expecting value: line 1 column 1 (char 0)
Expecting value: line 5 column 2 (char 9)
Expecting value: line 4 column 2 (char 7)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
1/5 =============== AN202309051597490339 1 ===============
Expecting value: line 1 column 1 (char 0)
2/5 =============== AN202309151598686950 1 ===============
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
3/49 =============== AN202112271537020821 50 ===============
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
4/84 =============== AN202112271537020821 82 ===============
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
5/87 =============== AN202303141584239808 1 ===============
Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)