In [1]:
import sqlite3
import os
import sys
import json 


topic_ = "Financial-Markets"
db_files = [
    rf'd:\chrome_for_Download_PDFS_{topic_}\Default\History'
    ]
has_downloaded = set()
has_downloaded_ = []

for db_file in db_files:
    print(db_file, end='\r')

    sqlite3_con = sqlite3.connect(db_file)  # 硬盘上创建并连接数据库对象(有则连接,无则创建并连接)
    sqlite3_cursor = sqlite3_con.cursor()  # 创建游标对象
    select_sql = '''
    SELECT guid, url,current_path, target_path, total_bytes, tab_url, mime_type, original_mime_type
    FROM downloads dl, downloads_url_chains duc 
    where dl.id = duc.id;
    '''
    
    try:
        sqlite3_cursor.execute(select_sql)
        # 获取查询结果
        results = sqlite3_cursor.fetchall()
        data = []
        for res in results:
            guid, url,current_path, target_path, total_bytes, tab_url, mime_type, original_mime_type = res
            try:
                tab_url = tab_url[tab_url.index("publication"):]
                has_downloaded.add(tab_url.split("/")[1])
                has_downloaded_.append(tab_url.split("/")[1])
            except Exception as e:
                print(e)
                
        print(db_file, len(results))
    except Exception as e:
        print('更新数据执行失败', e)
    finally:
        sqlite3_cursor.close()  # 关闭游标
        sqlite3_con.close()  # 关闭连接
		
		
print(len(has_downloaded_), len(has_downloaded))
json.dump(has_downloaded_, open("d:/has_downloaded.json",'w',encoding="utf-8"))
substring not foundoad_PDFS_Financial-Markets\Default\History
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
substring not found
d:\chrome_for_Download_PDFS_Financial-Markets\Default\History 7919
7850 7154
In [1]:
import selenium
from selenium import webdriver
import json
import time
In [2]:
topic_ = "Financial-Markets"
option = webdriver.ChromeOptions()
option.add_experimental_option("excludeSwitches", ['enable-automation'])
option.add_argument("--disable-javascript")
option.add_argument("--disable-cache")
option.add_argument(f"--user-data-dir=d:/chrome_for_Download_PDFS_{topic_}")

option.add_experimental_option(
    'prefs',
    {
        "download.default_directory": f"D:\\{topic_}",
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "plugins.always_open_pdf_externally": True,  # 这句配置很重要
        "profile.managed_default_content_settings.images": 2,
        "profile.managed_default_content_settings.media_stream": 2
    }
)
driver = webdriver.Chrome(
    executable_path="d:/chromedriver.exe",
    chrome_options=option
)

driver.implicitly_wait(2)
d:\Users\Administrator\anaconda3\envs\crawler\lib\site-packages\ipykernel_launcher.py:21: DeprecationWarning: use options instead of chrome_options

获取当前已经下载的文件¶

In [3]:
has_downloaded = json.load(open("d:/has_downloaded.json",encoding="utf-8"))
# has_download = set()
print(len(set(has_downloaded)), has_downloaded[:2])
has_downloaded = set(has_downloaded)
7154 ['353284745_Computing_near-optimal_Value-at-Risk_portfolios_using_Integer_Programming_techniques', '365044547_Development_of_Innovative_Tools_in_Global_Financial_Markets']
In [4]:
pdf_download_page_urls = json.load(open(fr'd:/out_pdfs/{topic_}_pdf_download_page_urls.json',"r", encoding="utf-8"))
In [5]:
ind = 0
for download_page_url in set(pdf_download_page_urls):
    
    if download_page_url.split("/")[1]  in has_downloaded or download_page_url in has_downloaded:
        ind += 1
print("has downloaded",ind, len(pdf_download_page_urls) - ind, len(set(pdf_download_page_urls)))
has downloaded 7152 2842 9994
In [6]:
pdf_urls = {}
print("pdf_download_page_urls", len(set(pdf_download_page_urls)))
driver.get("https://www.researchgate.net/" + pdf_download_page_urls[0])
ind = 0
for download_page_url in set(pdf_download_page_urls):
    if download_page_url.split("/")[1]  in has_downloaded or download_page_url in has_downloaded:
        ind += 1
        print("skip", ind, download_page_url, end='\r')
        continue
    # driver.get("https://www.researchgate.net/" + download_page_url.replace("/download", ""))
    driver.get("https://www.researchgate.net/" + download_page_url)
    has_downloaded.add(download_page_url)
    try:
        driver.find_element_by_css_selector("span.gtm-download-fulltext-btn-header").click()
        driver.find_element_by_css_selector("div.public-research-detail-download-public-links-card__file-list-item-btn").click()
        # pdf_url = driver.find_element_by_css_selector(
        #     "a[data-lite]").get_attribute("href")
        
        # pdf_detail = None
        # try:
        #     pdf_detail = driver.find_element_by_css_selector("div.research-detail-header-section__ie11").get_attribute("innerHTML")
        # except:
        #     print("获取DOI出错:", pdf_url)
        # print(pdf_url, pdf_detail)
        # if pdf_url:
        #     pdf_urls[download_page_url] = (pdf_url, pdf_detail)
    except Exception as e:
        print("ERROR", str(e).splitlines()[0], end="\r")
        pdf_urls[download_page_url] = None
    time.sleep(0.5)
    ind += 1
pdf_download_page_urls 9994
skip 6947 publication/333500307_La_fintegracion_del_sistema_bancario/downloadctor","selector":"div.public-research-detail-download-public-links-card__file-list-item-btn"}Equations_Equipped_with_the_Homogeneous_Neumann_Boundary_Condi/download
---------------------------------------------------------------------------
TimeoutException                          Traceback (most recent call last)
<ipython-input-6-b69aee0540d4> in <module>
      9         continue
     10     # driver.get("https://www.researchgate.net/" + download_page_url.replace("/download", ""))
---> 11     driver.get("https://www.researchgate.net/" + download_page_url)
     12     has_downloaded.add(download_page_url)
     13     try:

d:\Users\Administrator\anaconda3\envs\crawler\lib\site-packages\selenium\webdriver\remote\webdriver.py in get(self, url)
    331         Loads a web page in the current browser session.
    332         """
--> 333         self.execute(Command.GET, {'url': url})
    334 
    335     @property

d:\Users\Administrator\anaconda3\envs\crawler\lib\site-packages\selenium\webdriver\remote\webdriver.py in execute(self, driver_command, params)
    319         response = self.command_executor.execute(driver_command, params)
    320         if response:
--> 321             self.error_handler.check_response(response)
    322             response['value'] = self._unwrap_value(
    323                 response.get('value', None))

d:\Users\Administrator\anaconda3\envs\crawler\lib\site-packages\selenium\webdriver\remote\errorhandler.py in check_response(self, response)
    240                 alert_text = value['alert'].get('text')
    241             raise exception_class(message, screen, stacktrace, alert_text)
--> 242         raise exception_class(message, screen, stacktrace)
    243 
    244     def _value_or_default(self, obj, key, default):

TimeoutException: Message: timeout: Timed out receiving message from renderer: 262.201
  (Session info: chrome=114.0.5735.199)