In [1]:
import sqlite3
import os
import sys
import json
topic_ = "Financial-Markets"
db_files = [
rf'd:\chrome_for_Download_PDFS_{topic_}\Default\History'
]
has_downloaded = set()
has_downloaded_ = []
for db_file in db_files:
print(db_file, end='\r')
sqlite3_con = sqlite3.connect(db_file) # 硬盘上创建并连接数据库对象(有则连接,无则创建并连接)
sqlite3_cursor = sqlite3_con.cursor() # 创建游标对象
select_sql = '''
SELECT guid, url,current_path, target_path, total_bytes, tab_url, mime_type, original_mime_type
FROM downloads dl, downloads_url_chains duc
where dl.id = duc.id;
'''
try:
sqlite3_cursor.execute(select_sql)
# 获取查询结果
results = sqlite3_cursor.fetchall()
data = []
for res in results:
guid, url,current_path, target_path, total_bytes, tab_url, mime_type, original_mime_type = res
try:
tab_url = tab_url[tab_url.index("publication"):]
has_downloaded.add(tab_url.split("/")[1])
has_downloaded_.append(tab_url.split("/")[1])
except Exception as e:
print(e)
print(db_file, len(results))
except Exception as e:
print('更新数据执行失败', e)
finally:
sqlite3_cursor.close() # 关闭游标
sqlite3_con.close() # 关闭连接
print(len(has_downloaded_), len(has_downloaded))
json.dump(has_downloaded_, open("d:/has_downloaded.json",'w',encoding="utf-8"))
substring not foundoad_PDFS_Financial-Markets\Default\History substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found substring not found d:\chrome_for_Download_PDFS_Financial-Markets\Default\History 7919 7850 7154
In [1]:
import selenium
from selenium import webdriver
import json
import time
In [2]:
topic_ = "Financial-Markets"
option = webdriver.ChromeOptions()
option.add_experimental_option("excludeSwitches", ['enable-automation'])
option.add_argument("--disable-javascript")
option.add_argument("--disable-cache")
option.add_argument(f"--user-data-dir=d:/chrome_for_Download_PDFS_{topic_}")
option.add_experimental_option(
'prefs',
{
"download.default_directory": f"D:\\{topic_}",
"download.prompt_for_download": False,
"download.directory_upgrade": True,
"plugins.always_open_pdf_externally": True, # 这句配置很重要
"profile.managed_default_content_settings.images": 2,
"profile.managed_default_content_settings.media_stream": 2
}
)
driver = webdriver.Chrome(
executable_path="d:/chromedriver.exe",
chrome_options=option
)
driver.implicitly_wait(2)
d:\Users\Administrator\anaconda3\envs\crawler\lib\site-packages\ipykernel_launcher.py:21: DeprecationWarning: use options instead of chrome_options
获取当前已经下载的文件¶
In [3]:
has_downloaded = json.load(open("d:/has_downloaded.json",encoding="utf-8"))
# has_download = set()
print(len(set(has_downloaded)), has_downloaded[:2])
has_downloaded = set(has_downloaded)
7154 ['353284745_Computing_near-optimal_Value-at-Risk_portfolios_using_Integer_Programming_techniques', '365044547_Development_of_Innovative_Tools_in_Global_Financial_Markets']
In [4]:
pdf_download_page_urls = json.load(open(fr'd:/out_pdfs/{topic_}_pdf_download_page_urls.json',"r", encoding="utf-8"))
In [5]:
ind = 0
for download_page_url in set(pdf_download_page_urls):
if download_page_url.split("/")[1] in has_downloaded or download_page_url in has_downloaded:
ind += 1
print("has downloaded",ind, len(pdf_download_page_urls) - ind, len(set(pdf_download_page_urls)))
has downloaded 7152 2842 9994
In [6]:
pdf_urls = {}
print("pdf_download_page_urls", len(set(pdf_download_page_urls)))
driver.get("https://www.researchgate.net/" + pdf_download_page_urls[0])
ind = 0
for download_page_url in set(pdf_download_page_urls):
if download_page_url.split("/")[1] in has_downloaded or download_page_url in has_downloaded:
ind += 1
print("skip", ind, download_page_url, end='\r')
continue
# driver.get("https://www.researchgate.net/" + download_page_url.replace("/download", ""))
driver.get("https://www.researchgate.net/" + download_page_url)
has_downloaded.add(download_page_url)
try:
driver.find_element_by_css_selector("span.gtm-download-fulltext-btn-header").click()
driver.find_element_by_css_selector("div.public-research-detail-download-public-links-card__file-list-item-btn").click()
# pdf_url = driver.find_element_by_css_selector(
# "a[data-lite]").get_attribute("href")
# pdf_detail = None
# try:
# pdf_detail = driver.find_element_by_css_selector("div.research-detail-header-section__ie11").get_attribute("innerHTML")
# except:
# print("获取DOI出错:", pdf_url)
# print(pdf_url, pdf_detail)
# if pdf_url:
# pdf_urls[download_page_url] = (pdf_url, pdf_detail)
except Exception as e:
print("ERROR", str(e).splitlines()[0], end="\r")
pdf_urls[download_page_url] = None
time.sleep(0.5)
ind += 1
pdf_download_page_urls 9994 skip 6947 publication/333500307_La_fintegracion_del_sistema_bancario/downloadctor","selector":"div.public-research-detail-download-public-links-card__file-list-item-btn"}Equations_Equipped_with_the_Homogeneous_Neumann_Boundary_Condi/download
--------------------------------------------------------------------------- TimeoutException Traceback (most recent call last) <ipython-input-6-b69aee0540d4> in <module> 9 continue 10 # driver.get("https://www.researchgate.net/" + download_page_url.replace("/download", "")) ---> 11 driver.get("https://www.researchgate.net/" + download_page_url) 12 has_downloaded.add(download_page_url) 13 try: d:\Users\Administrator\anaconda3\envs\crawler\lib\site-packages\selenium\webdriver\remote\webdriver.py in get(self, url) 331 Loads a web page in the current browser session. 332 """ --> 333 self.execute(Command.GET, {'url': url}) 334 335 @property d:\Users\Administrator\anaconda3\envs\crawler\lib\site-packages\selenium\webdriver\remote\webdriver.py in execute(self, driver_command, params) 319 response = self.command_executor.execute(driver_command, params) 320 if response: --> 321 self.error_handler.check_response(response) 322 response['value'] = self._unwrap_value( 323 response.get('value', None)) d:\Users\Administrator\anaconda3\envs\crawler\lib\site-packages\selenium\webdriver\remote\errorhandler.py in check_response(self, response) 240 alert_text = value['alert'].get('text') 241 raise exception_class(message, screen, stacktrace, alert_text) --> 242 raise exception_class(message, screen, stacktrace) 243 244 def _value_or_default(self, obj, key, default): TimeoutException: Message: timeout: Timed out receiving message from renderer: 262.201 (Session info: chrome=114.0.5735.199)