同花顺 formula collection and verification
In [ ]:
%pip install Appium-Python-Client
In [ ]:
from appium import webdriver
desired_caps = {
"app": r'E:\同花顺\hexin.exe',
# "appTopLevelWindow": 0x361506,
"platformName": "Windows",
"deviceName": "WindowsPC"
}
driver = webdriver.Remote(
command_executor='http://127.0.0.1:4723/wd/hub',
desired_capabilities=desired_caps
)
current_window = driver.current_window_handle
all_windows = driver.window_handles
print(all_windows)
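If 同花顺 is already running, the commented-out appTopLevelWindow capability can be used instead of app to attach to the existing main window. A minimal sketch, assuming WinAppDriver is listening on 127.0.0.1:4723 and pywin32 is installed; the window title passed to FindWindow is hypothetical and must be checked against the actual title bar:
import win32gui
from appium import webdriver

hwnd = win32gui.FindWindow(None, "同花顺")  # hypothetical title; inspect the real window title first
attach_caps = {
    "appTopLevelWindow": hex(hwnd),  # WinAppDriver expects the native window handle as a hex string
    "platformName": "Windows",
    "deviceName": "WindowsPC",
}
driver = webdriver.Remote(
    command_executor='http://127.0.0.1:4723/wd/hub',
    desired_capabilities=attach_caps,
)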
In [ ]:
# Close the new-user welcome popup window
driver.find_element_by_accessibility_id('1000').click()
In [ ]:
from appium.webdriver.common.touch_action import TouchAction
In [ ]:
driver.tap([(14,323)])
In [ ]:
win1 = driver.find_element_by_accessibility_id('59648')
In [ ]:
# TODO: click on the K-line (candlestick) chart
action = TouchAction(driver)
action.tap(element=win1, x=-200, y=-200).perform()
In [ ]:
指标标签 = driver.find_element_by_accessibility_id('20556')
In [ ]:
指标标签.text
In [ ]:
主要曲线 = driver.find_element_by_name("主要曲线")
In [ ]:
主要曲线.click()
主要曲线.click()
主要曲线.click()
In [ ]:
action = TouchAction(driver)
action.tap(主要曲线).wait(10).press(主要曲线).perform()
In [ ]:
driver.find_element_by_accessibility_id("293654952").text
In [ ]:
with open("page_source.xml", 'w', encoding='utf-8') as fpw:
fpw.write(driver.page_source)
In [ ]:
# Perform any other operations here, then quit the session
driver.quit()
In [ ]:
driver.find_element_by_name('文件(F)').click()
from time import sleep
sleep(2)
driver.find_element_by_name('保存(S) Ctrl+S').click()
# driver.find_element_by_name('退出(X)').click()
sleep(1)
import pyautogui
pyautogui.PAUSE = 0.5
pyautogui.typewrite(r'D:\hello.txt')
pyautogui.press('enter')
1_Collect the list
In [ ]:
import requests
from datetime import datetime, timedelta
import logging
import pandas as pd
import time
from lds_util import dump, dumps, mkpdir, get_dict
logger = logging.getLogger(__name__)
logger.setLevel(logging.WARNING)
excel_file = r"D:\Downloads\沪深A股_sample.xlsx"
data_frame = pd.read_excel(excel_file)
excel_data = data_frame.values.tolist()
In [ ]:
excel_data
In [ ]:
import redis
pool = redis.ConnectionPool(host='127.0.0.1', port=6379, decode_responses=True)
redis_cli = redis.Redis(host='127.0.0.1', port=6379, password='lds', decode_responses=True, db=0)
lbty_cookie_key = "lds:lbty:cookie"
g_cookies = eval(redis_cli.get(lbty_cookie_key))
g_cookies
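The cookie read back above is stored under lds:lbty:cookie as a Python dict literal, which is why eval() is used to deserialize it. A minimal sketch of the writer side under that assumption (the cookie values are placeholders copied from a logged-in browser session):
fresh_cookies = {
    "cloud-sso-token": "xxxx",        # placeholder
    "cloud-anonymous-token": "xxxx",  # placeholder
}
redis_cli.set(lbty_cookie_key, repr(fresh_cookies))  # repr() round-trips through the eval() above
Using json.dumps here and json.loads in the reader would be a safer round-trip than repr/eval.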
In [ ]:
class CookieError(RuntimeError):
def __init__(self, *args: object) -> None:
super().__init__(*args)
import os
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/116.0',
'Accept': 'application/json, text/plain, */*',
'Accept-Language': 'zh-cn',
# 'Accept-Encoding': 'gzip, deflate, br',
'Origin': 'https://robo.datayes.com',
'Connection': 'keep-alive',
'Referer': 'https://robo.datayes.com/',
# 'Cookie': 'gr_user_id=26250982-1a9b-4973-970a-fe103ede4ed6; cloud-anonymous-token=f07471e9ffa64e23a91b5245a6642387; cloud-sso-token=E5B1067A4772527AFAE513AAEA4684BF; rrp-wechat-login-token=520c813c-81ea-4e7b-aeed-7c4ae9ece81d; _ga=GA1.2.1213974505.1692239696; _gid=GA1.2.382730469.1692239696; _ga_YBHR4XSTK7=GS1.2.1692239696.1.1.1692239872.60.0.0; ba895d61f7404b76_gr_last_sent_cs1=10618609@wmcloud.com; ba895d61f7404b76_gr_session_id=36a6d9aa-b878-4b0b-a756-7aad7dab326a; ba895d61f7404b76_gr_last_sent_sid_with_cs1=36a6d9aa-b878-4b0b-a756-7aad7dab326a; ba895d61f7404b76_gr_cs1=10618609@wmcloud.com; ba895d61f7404b76_gr_session_id_sent_vst=36a6d9aa-b878-4b0b-a756-7aad7dab326a',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-site',
'Pragma': 'no-cache',
'Cache-Control': 'no-cache',
}
MAX_PAGE = 8
OUTFILE_BASEDIR = "e:/萝卜投研"
OUTFILE_FROMAT = OUTFILE_BASEDIR + "/{}/{}_{}.json"
mkpdir(OUTFILE_BASEDIR + "/das")
PARAMS = {
'pageNow': '1',
'pageSize': '30',
'sortField': 'effectiveTime',
'sortOrder': 'desc',
'isDidMount': 'true',
'ticker': '002249',
'sentiment': '1',
}
g_session = requests.session()
update_cookie = time.time()
def one_ticker(ticker_item: list[str]):
global g_cookies
ticker = ticker_item[0].split(".")[0]
logger.info("ticker: %s", ticker_item)
info_file = f"{OUTFILE_BASEDIR}/{ticker}/info.json"
mkpdir(info_file)
with open(info_file, 'w', encoding='utf-8') as fpw:
dump(ticker_item, fpw)
PARAMS["ticker"] = ticker
for sentiment in ["1", "-1"]:
PARAMS["sentiment"] = sentiment
for page in range(1, MAX_PAGE + 1):
def crawl():
                global g_cookies, update_cookie, PARAMS, g_session, OUTFILE_FROMAT
outfile = OUTFILE_FROMAT.format(ticker, sentiment, page)
if os.path.exists(outfile):
# logger.info("已经采集:%s", outfile)
return None
PARAMS["pageNow"] = page
logger.info("%s_%s", ticker, page)
response = g_session.get(
'https://gw.datayes.com/rrp_adventure/publicOpinion/getSentimentNews',
params=PARAMS,
cookies=g_cookies,
headers=headers,
)
resp_json = response.json()
code = resp_json["code"]
if code == 1:
# total_page = resp_json["data"]["total"]
# assert total_page is not None
outfile = OUTFILE_FROMAT.format(ticker, sentiment, page)
mkpdir(outfile)
logger.info("outfile: %s", outfile)
with open(outfile, 'w', encoding='utf-8') as fpw:
fpw.write(response.text)
time.sleep(1)
                elif code == -403 or resp_json.get("message") == "Need login":
                    logger.error("Cookie needs to be refreshed; time since the last refresh: %s s", time.time() - update_cookie)
                    time.sleep(120)
                    g_cookies = eval(redis_cli.get(lbty_cookie_key))
                    update_cookie = time.time()
                    raise CookieError()
try:
crawl()
except CookieError:
crawl()
In [ ]:
for item in excel_data:
one_ticker(item)
2_Parse the data
In [ ]:
import json
from urllib.parse import urlparse
def get_domain(url: str):
return urlparse(url).netloc
In [ ]:
g_domain = set()
g_siteName = set()
class Model(object):
def __init__(self, *args, **kwargs):
self.stockName = None
self.stockCode = None
self.bullish = None
self.title = None
self.content = None
self.publishTime = None
self.author = None
self.industry = None
self.url = None
self.source = None
self.siteName = None
@get_dict
def parse(item, g_stock_name, g_stock_code, g_sentiment):
    '''
    Parse one news item into a Model (returned as a dict via @get_dict).
    '''
model = Model()
model.stockName = g_stock_name
model.stockCode = g_stock_code
if g_sentiment == '1':
model.bullish = 1
elif g_sentiment == '-1':
model.bullish = 0
else:
raise ValueError(g_sentiment)
model.title = item["title"]
model.content = None
date = datetime.fromtimestamp(item["publishTimeStm"] / 1000)
model.publishTime = date.strftime("%Y-%m-%d %H:%M:%S")
model.author = item.get("author")
model.source = item.get("sourceName")
model.industry = item["industry"]
model.siteName = item["siteName"]
model.url = item["url"]
g_domain.add(get_domain(model.url))
g_siteName.add(model.siteName)
logger.info("dict: %s, domain: %s",model.__dict__, get_domain(model.url))
return model
res = {}
# for dirpath, dirnames, filenames in os.walk(OUTFILE_BASEDIR):
# if dirpath == 'e:/萝卜投研' or "html" in dirpath:
# continue
# logger.info(f"dirpath: {dirpath}")
# g_stock_name = None
# stock = os.path.basename(dirpath)
# g_sentiment= None
# with open(os.path.join(dirpath, "info.json"), 'r', encoding='utf-8') as fpr:
# info_json = json.load(fpr)
# g_stock_code, g_stock_name, _, _ = info_json
# logger.info(f"g_stock_code: {g_stock_code}, g_stock_name: {g_stock_name}")
# key1 = f'{stock}_1'
# key2 = f'{stock}_-1'
# res[key1] = res.get(key1, [])
# res[key2] = res.get(key2, [])
# for fname in filenames:
# if 'info' in fname:
# continue
# else:
# assert len(fname.split("_")) == 2
# g_sentiment = fname.split("_")[0]
# with open(os.path.join(dirpath, fname), 'r', encoding='utf-8') as fpr:
# res[f'{stock}_{g_sentiment}'].extend(map(lambda it: parse(it, g_stock_name, g_stock_code, g_sentiment),
# json.load(fpr)["data"]["hits"]))
In [ ]:
for k, v in res.items():
v.sort(key=lambda it: it["publishTime"], reverse=True)
with open(f"{OUTFILE_BASEDIR}/data.json", 'w', encoding='utf-8') as fpw:
dump(res, fpw)
In [ ]:
len(g_domain), len(g_siteName)
In [ ]:
for k, models in res.items():
temp = []
date_set = set()
for model in models:
if model["publishTime"][:10] not in date_set:
temp.append(model)
date_set.add(model["publishTime"][:10])
if len(temp) == 20:
res[k] = temp
break
else:
if len(date_set) < 10:
logger.error("数据不够啊" + k + " date_set " + str(date_set))
with open(f"{OUTFILE_BASEDIR}/data_recent10.json", 'w', encoding='utf-8') as fpw:
dump(res, fpw)
In [ ]:
g_domain = set()
g_siteName = set()
for k, models in res.items():
for model in models:
g_domain.add(get_domain(model["url"]))
g_siteName.add(model["siteName"])
In [ ]:
g_domain
In [ ]:
import traceback
from selenium.webdriver.common.by import By
import requests
from datetime import datetime, timedelta
from selenium.webdriver.chromium.webdriver import ChromiumDriver
from selenium import webdriver
from selenium.webdriver.chromium import service
def get_driver():
for _ in range(3):
try:
option = webdriver.ChromeOptions()
option.add_experimental_option("excludeSwitches", ['enable-automation'])
            # Launch flag that would auto-open DevTools for each tab
# option.add_argument("--auto-open-devtools-for-tabs")
option.add_argument("--disable-cache")
option.add_argument('--ignore-certificate-errors')
# option.add_argument(f"--user-data-dir=d:/luobatouyan")
option.add_experimental_option(
'prefs',
{
"download.prompt_for_download": False,
"download.directory_upgrade": True,
"plugins.always_open_pdf_externally": True, # 这句配置很重要
"profile.managed_default_content_settings.images": 2,
"profile.managed_default_content_settings.media_stream": 2
}
)
# chrome_service = service.ChromiumService(executable_path="d:/chromedriver")
driver = webdriver.Chrome(
options=option,
# service=chrome_service
)
driver.implicitly_wait(5)
return driver
        except Exception:
            logger.error(traceback.format_exc())
            time.sleep(3)
    raise ValueError("Failed to create the webdriver after 3 attempts; please investigate manually")
# g_driver = get_driver()
In [ ]:
model
In [ ]:
import hashlib
def md5_hash(text):
    # Create an MD5 digest object
    md5 = hashlib.md5()
    # Feed in the UTF-8 encoded text
    md5.update(text.encode('utf-8'))
    # Return the hex digest
    result = md5.hexdigest()
    return result
In [ ]:
md5_hash("https://caifuhao.eastmoney.com/news/20230704112547970423200")
In [ ]:
from concurrent.futures import ThreadPoolExecutor
def crawl_html(models):
    driver: ChromiumDriver = None
try:
count = 0
for model in models:
html_file = f'{OUTFILE_BASEDIR}/html/{model["stockCode"]}_{model["bullish"]}/{md5_hash(model["url"])}.html'
if os.path.exists(html_file):
print("已经采集 ", html_file, end="\r")
continue
if driver is None:
driver = get_driver()
logger.warning("get %s", model["url"])
driver.get(model["url"])
page_source = driver.page_source
with open(mkpdir(html_file), 'w', encoding='utf-8') as fpw:
fpw.write(page_source)
model["html"] = driver.page_source
count += 1
finally:
if driver is not None:
driver.quit()
with ThreadPoolExecutor(max_workers=5, thread_name_prefix="chrome_crawl") as executor:
for k, models in res.items():
executor.submit(crawl_html, models)
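executor.submit above discards the returned Future objects, so any exception raised inside crawl_html is silently swallowed. A small variant, assuming the same res and crawl_html as above, that keeps the futures and surfaces worker errors:
import traceback
from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=5, thread_name_prefix="chrome_crawl") as executor:
    futures = {executor.submit(crawl_html, models): k for k, models in res.items()}
for fut, key in futures.items():
    try:
        fut.result()  # re-raises any exception from the worker thread
    except Exception:
        print("crawl_html failed for", key)
        print(traceback.format_exc())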
In [ ]:
import json
import bs4
OUTFILE_BASEDIR = "e:/萝卜投研"
with open(f"{OUTFILE_BASEDIR}/data_recent10.json", 'r', encoding='utf-8') as fpr:
res = json.load(fpr)
In [ ]:
g_domain = set()
g_siteName = set()
for k, models in res.items():
for model in models:
g_domain.add(get_domain(model["url"]))
g_siteName.add(model["siteName"])
In [ ]:
len(g_domain)
In [ ]:
soup: bs4.BeautifulSoup
In [ ]:
g_driver = get_driver()
In [ ]:
# g_driver = webdriver.Firefox()
In [ ]:
walked = set()
In [ ]:
del common_process
del finance_stockstar_com
3_fun
In [ ]:
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup as bsoup
def common_process(soup: bs4.BeautifulSoup, model, css_selector, exclude_kw=None):
if not isinstance(css_selector, (list, tuple)):
css_selector = [css_selector]
for css_selector in list(css_selector):
try:
content = ""
try:
for child in soup.select_one(css_selector).children:
                    if len(child.text.strip()) >= 1 and (exclude_kw is None or exclude_kw not in child.text):
content += child.text.strip() + "\n"
assert len(content) > 2
except Exception as e:
print(traceback.format_exc())
content = ""
g_driver.get(model["url"])
g_driver.find_elements(By.CSS_SELECTOR,css_selector)
time.sleep(1)
content = "\n".join(map(lambda it: it.text, g_driver.find_elements(By.CSS_SELECTOR,css_selector)))
html_file = f'{OUTFILE_BASEDIR}/html/{model["stockCode"]}_{model["bullish"]}/{md5_hash(model["url"])}.html'
with open(html_file, 'w', encoding='utf-8') as fpw:
fpw.write(g_driver.page_source)
print(content)
return content
except:
pass
def finance_eastmoney_com(soup: bs4.BeautifulSoup, model):
    try:
        content = ""
if '抱歉,您访问的页面不存在或已删除' in str(soup):
print(model["url"], '抱歉,您访问的页面不存在或已删除')
return content
for child in soup.select_one("div#ContentBody").children:
if 'APP内免费看>>' in str(child.text) \
or 'class="b-chart' in str(child):
pass
else:
content += child.text
print(content)
return content
except:
pass
try:
g_driver.get(model["url"])
content= "\n".join(map(lambda it: it.text,g_driver.find_elements(By.CSS_SELECTOR,
"div#ContentBody")))
html_file = f'{OUTFILE_BASEDIR}/html/{model["stockCode"]}_{model["bullish"]}/{md5_hash(model["url"])}.html'
with open(html_file, 'w', encoding='utf-8') as fpw:
fpw.write(g_driver.page_source)
return content
# content = soup.select_one("div.abstract").text.strip()
except:
content = ''
if '抱歉,您访问的页面不存在或已删除' in str(soup):
print(model["url"], '抱歉,您访问的页面不存在或已删除')
return content
for child in soup.select_one("div#ContentBody").children:
if 'APP内免费看>>' in str(child.text) \
or 'class="b-chart' in str(child):
pass
else:
content += child.text
print(content)
return content
def caifuhao_eastmoney_com(soup: bs4.BeautifulSoup, model):
try:
content = soup.select_one("div.article-body").text.strip()
return content
except:
content = ''
if '抱歉,您访问的页面不存在或已删除' in str(soup) or '该文章已被删除' in str(soup):
print(model["url"], '抱歉,您访问的页面不存在或已删除')
return content
for child in soup.select_one("div#ContentBody").children:
if 'APP内免费看>>' in str(child.text) \
or 'class="b-chart' in str(child):
pass
else:
content += child.text
print(content)
return content
def yuanchuang_10jqka_com_cn(soup: bs4.BeautifulSoup, model):
content = ""
for child in soup.select("div.main-text p:not([class])"):
content += child.text + "\n"
print(content)
return content
def gu_qq_com(soup: bs4.BeautifulSoup, model):
content = None
try:
content = soup.select_one("div.news-text").text
except:
try:
g_driver.get(model["url"])
content= g_driver.find_element(By.CSS_SELECTOR,"div.news-text").text
html_file = f'{OUTFILE_BASEDIR}/html/{model["stockCode"]}_{model["bullish"]}/{md5_hash(model["url"])}.html'
with open(html_file, 'w', encoding='utf-8') as fpw:
fpw.write(g_driver.page_source)
except NoSuchElementException:
return None
print(content)
return content
def www_wabei_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, 'div.subject-content')
def stock_hexun_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, ['div.art_contextBox','div.quick_context'])
def stockstar_com(soup: bs4.BeautifulSoup, model):
if '您正在搜索的页面可能已经删除' in soup.text:
return None
content = ""
try:
for child in soup.select_one("div.article_content").children:
content += child.text + "\n"
except:
content = ""
g_driver.get(model["url"])
try:
content= g_driver.find_element(By.CSS_SELECTOR,"div.article_content").text
html_file = f'{OUTFILE_BASEDIR}/html/{model["stockCode"]}_{model["bullish"]}/{md5_hash(model["url"])}.html'
with open(html_file, 'w', encoding='utf-8') as fpw:
fpw.write(g_driver.page_source)
except NoSuchElementException:
return None
print(content)
return content
def sina_com_cn(soup: bs4.BeautifulSoup, model):
content = ""
for child in soup.select("div#artibody p"):
content += child.text + "\n"
print(content)
return content
def gelonghui_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, ["article", "div.dtb-content"])
def www_abi_com_cn(soup: bs4.BeautifulSoup, model):
content = ""
try:
for child in soup.select_one("section.textblock").children:
content += child.text + "\n"
except:
content = ""
g_driver.get(model["url"])
content= g_driver.find_element(By.CSS_SELECTOR,"section.textblock").text
html_file = f'{OUTFILE_BASEDIR}/html/{model["stockCode"]}_{model["bullish"]}/{md5_hash(model["url"])}.html'
with open(html_file, 'w', encoding='utf-8') as fpw:
fpw.write(g_driver.page_source)
print(content)
return content
def www_sohu_com(soup: bs4.BeautifulSoup, model):
css_sel = "article.article"
if soup.select_one("section.lookall"):
content = ""
g_driver.get(model["url"])
try:
g_driver.find_element(By.CSS_SELECTOR, 'section.lookall').click()
print("="*50, "查看全文")
except Exception as e:
print(traceback.format_exc())
content= g_driver.find_element(By.CSS_SELECTOR,css_sel).text
html_file = f'{OUTFILE_BASEDIR}/html/{model["stockCode"]}_{model["bullish"]}/{md5_hash(model["url"])}.html'
with open(html_file, 'w', encoding='utf-8') as fpw:
fpw.write(g_driver.page_source)
else:
content = common_process(soup, model, css_sel)
print(content)
return content
def www_laoyaoba_com(soup: bs4.BeautifulSoup, model):
if '很抱歉,您访问的新闻不存在' in soup.text:
return None
css_sel = "div.media-article-content"
if model["source"] is None:
source = soup.select_one("div.media-source span").text
if '来源:' in source:
model["source"] = source.replace('来源:', "").strip()
content = common_process(soup, model, css_sel)
print(content)
return content
def jrj_com_cn(soup: bs4.BeautifulSoup, model):
css_sel = "div.article_content"
if model["source"] is None:
if soup.select_one("div.article_info i"):
source = soup.select_one("div.article_info i").text
if '来源:' in source:
model["source"] = source.replace('来源:', "").strip()
content = common_process(soup, model, css_sel)
print(content)
return content
def www_cls_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.content-main-box div.detail-content")
def www_ccement_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.word")
def www_guandian_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.con_l_inner")
def www_lejucaijing_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.detail-content")
def hexun_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.art_contextBox")
def irnews_cfbond_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.tab-item")
def ggjd_cnstock_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.qmt_content_div")
def cs_com_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "article.cont_article section")
def ifeng_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.content-info")
def new_qq_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div#ArticleContent")
def news_winshang_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.win-news-content")
def www_meadin_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.out-article")
def www_traveldaily_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.articleContent")
def www_bjnews_com_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div#contentStr div")
def xueqiu_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "article.article__bd")
def www_p5w_net(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.content-text")
def www_zhitongcaijing_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.news-content div.news-body-content")
def caijing_com_cn(soup: bs4.BeautifulSoup, model):
if 'https://file.caijing.com.cn/404.gif' in str(soup.select("div.center")):
print("404", model["url"])
return None
return common_process(soup, model, "div.article-content")
def xuangubao_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, ".article_1Q_lp")
def www_jwview_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.content_zw")
# def lights_ofweek_com(soup: bs4.BeautifulSoup, model):
# return common_process(soup, model, "div.artical")
def ofweek_com(soup: bs4.BeautifulSoup, model):
if soup.select_one("div#content-404") and not soup.select_one("div.artical"):
print("404", model["url"])
return None
return common_process(soup, model, "div.artical")
def preview_ylaigc_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "section.content")
def www_jiemian_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.article-content")
def www_cb_com_cn(soup: bs4.BeautifulSoup, model):
if len(soup.text.strip()) == 0:
print("webdriver 失效")
return None
return common_process(soup, model, "div.content_page")
def www_eeo_com_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.xd-nr div.xx_boxsing")
def www_bbtnews_com_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div#pageContent")
def cnfol_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.Article" ,exclude_kw="由证券之星根据公开信息整理")
def stock_cnfol_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.Article" ,exclude_kw="由证券之星根据公开信息整理")
def company_cnstock_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div#qmt_content_div")
def cnstock_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div#qmt_content_div")
def finance_ce_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div#articleText")
def www_portcontainer_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.nei tbody tbody tbody")
def www_yicai_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.m-txt")
def www_163_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.post_body")
def www_tmtpost_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "article")
def www_nbd_com_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.g-articl-text")
def _36kr_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.articleDetailContent")
def flash_jin10_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.detail-content")
def news_smm_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.news-detail-main-article")
def www_cqcb_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.sycontent")
def bjx_com_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.cc-article")
def www_elecfans_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.rticle-content")
def info_texnet_com_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.detail-text")
def www_mydigit_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.pcb table")
def sigmaintell_com(soup: bs4.BeautifulSoup, model):
return None
# return common_process(soup, model, "TODO")
def mp_weixin_qq_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div#js_content")
def news_hea_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.content-wrap")
def news_cheaa_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.article")
def www_techweb_com_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div#content")
def news_mydrivers_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.news_info")
def www_chinatimes_net_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.content")
def finance_china_com_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div#fontzoom")
def _3bf_cc(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.contTxt")
def static_hongzhoukan_com(soup: bs4.BeautifulSoup, model):
if '404 Not Found' in soup.text:
return None
assert 0, "TODO"
def www_mei_net_cn(soup: bs4.BeautifulSoup, model):
assert 0, "手动处理吧"
def www_zqrb_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.news_content")
def www_cnmn_com_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div#txtcont")
def news_leju_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.detail-content")
def www_iyiou_com(soup: bs4.BeautifulSoup, model):
return None
def www_evpartner_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div#newscontent")
def www_cnautonews_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.articleDetailContent")
def www_dcement_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.conTxt")
def www_cnfol_hk(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div#Content")
def api3_cls_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "section.content-box")
def www_yemacaijing_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.wangEditor-container")
def news_mysteel_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.content")
def www_fx678_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.article-cont")
def www_powerpigs_net(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div#content")
def www_soozhu_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.article_body")
def www_jiemodui_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "article.content")
def info_chineseshipping_com_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.TRS_Editor")
def www_thepaper_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.index_cententWrap__Jv8jK")
def www_escn_com_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "span#detailContent")
def www_d1ev_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.ws-md")
def www_china5e_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div#articleBox")
def www_zyzhan_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.contentText")
def www_ebrun_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.post-text")
def www_semi_org_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.single-page-post")
def www_dsb_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.article-content")
def www_changjiangtimes_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.c_zw", exclude_kw='CJTIMES_GET')
def www_time_weekly_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "article.main_article")
def www_dzzq_com_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.article-content")
def www_investorscn_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.content-text")
def www_ccmn_cn(soup: bs4.BeautifulSoup, model):
return None
def www_lanjinger_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.lj-layout-main")
def www_linkshop_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "article.page")
def www_gzdaily_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.detailNews div.mianConLeft > div.ng-binding")
def h5_newaircloud_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.TRS_Editor")
def paper_cnstock_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.article-content")
def www_csrc_gov_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.content")
def www_jjckb_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.mainCon")
def www_cceep_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.text")
def www_ifnews_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.centerText")
def www_mscbsc_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div#articlebody")
def www_nmpa_gov_cn(soup: bs4.BeautifulSoup, model):
    assert 0, 'handle manually'
return common_process(soup, model, "div#articlebody")
def www_caiguu_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div#newscontent")
def www_eefocus_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.article-content")
def www_chinaipo_com(soup: bs4.BeautifulSoup, model):
return None
# assert 0, 'TODO'
# return common_process(soup, model, "div.article-content")
def www_gg_lb_com(soup: bs4.BeautifulSoup, model):
if 'cannot connect server' in soup.text:
return None
assert 0, 'TODO'
return common_process(soup, model, "div.article-content")
def news_eccn_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.Newsdetaile")
def stock_eastmoney_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div#ContentBody")
def futures_eastmoney_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div#ContentBody")
def www_szse_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div#desContent")
def house_ifeng_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.content-info")
def www_financialnews_com_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.content_body")
def huanqiu_com(soup: bs4.BeautifulSoup, model):
return bsoup(soup.select_one("textarea.article-content").text).text
def news_pedaily_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.news-content")
def www_gongkong_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.summary")
def www_chinaports_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div#news_content")
def www_port_org_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.info_content")
def m_mp_oeeee_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div#BodySection")
def society_people_com_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.rm_txt_con")
def auto_gasgoo_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.contentDetailed")
def www_guancha_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.all-txt")
def news_21_sun_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.content")
def www_cninct_com(soup: bs4.BeautifulSoup, model):
return None
# return common_process(soup, model, "div.content")
def www_news_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div#detail")
def news_pharmnet_com_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.content")
def www_cpnn_com_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.cpnnlist")
def www_97caijing_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "article")
def www_cbminfo_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.box-news table")
def www_96369_net(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.main-content")
def www_ledinside_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.node_body")
def www_eaonline_com_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "body > div:nth-child(4) > table:nth-child(1) > tbody:nth-child(1) > tr:nth-child(1) > td:nth-child(1) > table:nth-child(1) > tbody:nth-child(1)")
def news_ccmn_cn(soup: bs4.BeautifulSoup, model):
assert 0, 'TODO'
return common_process(soup, model, "TODO")
def www_aicaijing_com_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.ant-spin-container .ArticleDetail-module__content--3Pe68")
def news_carnoc_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div#newstext")
def www_ccaonline_cn(soup: bs4.BeautifulSoup, model):
if '未找到页面 - 404' in soup.text:
return None
return common_process(soup, model, "TODO")
def gas_in_en_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div#article")
def www_linkshop_com_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "article.page section")
def www_texindex_com_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.necontent1")
def www_huxiu_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div#article-content")
def www_sxcoal_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.content")
def people_com_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.rm_txt_con", exclude_kw='.tjewm')
def www_cas_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.xl_content")
def www_canyin88_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.content-editor")
def www_cnlist_com(soup: bs4.BeautifulSoup, model):
if 'The proxy server received an invalid response' in soup.text:
return None
assert 0, 'TODO'
return common_process(soup, model, "TODO")
def www_xxsb_com(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div#content")
def tech_gmw_cn(soup: bs4.BeautifulSoup, model):
return common_process(soup, model, "div.u-mainText")
_fun end
In [ ]:
model
In [ ]:
html_file
In [ ]:
g_driver.get(model["url"])
"\n".join(map(lambda it: it.text,g_driver.find_elements(By.CSS_SELECTOR,
"div#content")))
In [ ]:
soup.text
In [ ]:
content = ""
for child in soup.select_one("div.u-mainText ").children:
if len(child.text.strip()) > 0 and '.tjewm' not in child.text:
content += child.text.strip() + "\n"
print(content)
In [ ]:
walked.add(html_file)
In [ ]:
print(model, '\n',model["url"], end='\r')
In [ ]:
domain_parse_fun = {
"finance.eastmoney.com": finance_eastmoney_com,
"yuanchuang.10jqka.com.cn": yuanchuang_10jqka_com_cn,
"gu.qq.com": gu_qq_com,
"www.wabei.cn": www_wabei_cn,
"stock.hexun.com": stock_hexun_com,
"hexun.com": stock_hexun_com,
"finance.stockstar.com": stockstar_com,
"10jqka.com.cn": yuanchuang_10jqka_com_cn,
"finance.sina.com.cn": sina_com_cn,
"sina.com.cn": sina_com_cn,
"www.gelonghui.com": gelonghui_com,
"stockstar.com": stockstar_com,
"www.abi.com.cn": www_abi_com_cn,
"www.sohu.com": www_sohu_com,
"www.laoyaoba.com": www_laoyaoba_com,
"www.cls.cn": www_cls_cn,
"stock.jrj.com.cn": jrj_com_cn,
"www.ccement.com": www_ccement_com,
"www.guandian.cn": www_guandian_cn,
"www.lejucaijing.com": www_lejucaijing_com,
}
def parse_html(model: dict, soup: bs4.BeautifulSoup) -> str:
    # Dispatch to the site-specific parser registered for the URL's domain.
    return domain_parse_fun[get_domain(model["url"])](soup, model)
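As a quick sanity check, the dispatch can be exercised against a single cached page. This assumes html_file points at a saved page whose domain is registered in domain_parse_fun and that model is the matching record:
with open(html_file, 'r', encoding='utf-8') as fpr:
    soup = bsoup(fpr.read())
print(parse_html(model, soup))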
In [ ]:
errors = []
count = 0
4_refresh
In [ ]:
from bs4 import BeautifulSoup as bsoup
count = len(walked)
_globals = globals()
for k, models in res.items():
for model in models:
html_file = f'{OUTFILE_BASEDIR}/html/{model["stockCode"]}_{model["bullish"]}/{md5_hash(model["url"])}.html'
if html_file in walked:
continue
elif model["content"] and len(model["content"]) > 5:
walked.add(html_file)
continue
elif os.path.exists(html_file):
try:
print(count, "html_file:", html_file)
def can_parse():
domain = get_domain(model["url"])
if domain.endswith("10jqka.com.cn"):
return domain_parse_fun["10jqka.com.cn"]
elif domain.endswith("stockstar.com"):
return domain_parse_fun["stockstar.com"]
elif domain.endswith("sina.com.cn"):
return domain_parse_fun["sina.com.cn"]
else:
domain_splits = domain.replace("-", '_').split(".")
return _globals.get('_'.join(domain_splits)) \
or _globals.get("_".join(domain_splits[1:]))\
or _globals.get("_" + "_".join(domain_splits))\
or _globals.get("_" + "_".join(domain_splits[1:]))
fun = can_parse()
if model["content"] is not None and len(model["content"]) > 1:
walked.add(html_file)
count += 1
print("*" * 30)
continue
elif not fun:
with open(html_file, 'r', encoding='utf-8') as fpr:
soup = bsoup(fpr.read())
raise KeyError
elif html_file in walked:
print("^" * 5, count,"skip:", html_file)
else:
with open(html_file, 'r', encoding='utf-8') as fpr:
soup = bsoup(fpr.read())
model["content"] = fun(soup, model)
walked.add(html_file)
print("=" * 5,"add", count, html_file)
count += 1
if count % 50 == 0:
with open(f"{OUTFILE_BASEDIR}/data_recent_08-22.json", 'w', encoding='utf-8') as fpw:
dump(res, fpw)
except:
errors.append(model)
else:
print("*" * 5, html_file)
In [ ]:
from bs4 import BeautifulSoup as bsoup
count = len(walked)
_globals = globals()
for k, models in res.items():
for model in models:
html_file = f'{OUTFILE_BASEDIR}/html/{model["stockCode"]}_{model["bullish"]}/{md5_hash(model["url"])}.html'
if html_file not in walked and model in errors:
if os.path.exists(html_file):
try:
print(count, "html_file:", html_file)
def can_parse():
domain = get_domain(model["url"])
if domain.endswith("10jqka.com.cn"):
return domain_parse_fun["10jqka.com.cn"]
elif domain.endswith("stockstar.com"):
return domain_parse_fun["stockstar.com"]
elif domain.endswith("sina.com.cn"):
return domain_parse_fun["sina.com.cn"]
else:
domain_splits = domain.replace("-", '_').split(".")
return _globals.get('_'.join(domain_splits)) \
or _globals.get("_".join(domain_splits[1:]))\
or _globals.get("_" + "_".join(domain_splits))\
or _globals.get("_" + "_".join(domain_splits[1:]))
fun = can_parse()
if model["content"]:
walked.add(html_file)
elif not fun:
with open(html_file, 'r', encoding='utf-8') as fpr:
soup = bsoup(fpr.read())
raise KeyError
elif html_file in walked:
print("^" * 5, count,"skip:", html_file)
else:
with open(html_file, 'r', encoding='utf-8') as fpr:
soup = bsoup(fpr.read())
model["content"] = fun(soup, model)
walked.add(html_file)
print("=" * 5,"add", count, html_file)
count += 1
if count % 50 == 0:
with open(f"{OUTFILE_BASEDIR}/data_recent_08-22.json", 'w', encoding='utf-8') as fpw:
dump(res, fpw)
errors.remove(model)
except Exception as e:
# print(traceback.format_exc())
errors.append(model)
raise e
else:
print("*" * 5, html_file)
In [26]:
data = []
ct = 0
for k, models in res.items():
for model in models:
if model["content"] and len(model["content"]) > 5:
ct += 1
if model["source"] is None:
model["source"] = model["siteName"]
data.append(model)
print('count', len(data))
with open(f"{OUTFILE_BASEDIR}/out_08-22.json", 'w', encoding='utf-8') as fpw:
dump(data, fpw)
count 13701
In [28]:
import pandas as pd
pd.DataFrame(data).to_excel("E:/萝卜投研/萝卜投研.xlsx", index=False)
In [ ]:
with open(f"{OUTFILE_BASEDIR}/data_recent10.json", 'w', encoding='utf-8') as fpw:
dump(res, fpw)
In [ ]:
soup.select_one("body")
In [ ]:
has_add = set()
out_cout = 0
In [ ]:
ind = 0
need = 0
todo = dict()
for k, models in res.items():
ct = 0
for model in models:
if model["content"] and len(model["content"]) > 6:
ct += 1
if ct < 10:
ind += 1
need += 10 - ct
print(ind, k, 10 - ct)
todo[k] = models
print('need',need)
In [ ]:
todo
In [ ]:
for k, models in todo.items():
stock_code , g_sentiment = k.split("_")
bdir = os.path.join(OUTFILE_BASEDIR, stock_code)
for name in os.listdir(bdir):
if name.startswith(g_sentiment):
g_stock_name = None
stock = os.path.basename(bdir)
with open(os.path.join(bdir, "info.json"), 'r', encoding='utf-8') as fpr:
info_json = json.load(fpr)
g_stock_code, g_stock_name, _, _ = info_json
with open(os.path.join(bdir, name), 'r', encoding='utf-8') as fpr:
to_add = map(lambda it: parse(it, g_stock_name, g_stock_code, g_sentiment),
json.load(fpr)["data"]["hits"])
all_urls = set(map(lambda it: it["url"], models))
models.extend(filter(lambda it: it['url'] not in all_urls, to_add))
print(k, len(models))
In [ ]:
for k, models in res.items():
for model in models:
domain = get_domain(model["url"])
if domain in has_add:
continue
html_file = f'{OUTFILE_BASEDIR}/html/{model["stockCode"]}_{model["bullish"]}/{md5_hash(model["url"])}.html'
if not os.path.exists(html_file):
continue
with open(html_file, 'r', encoding='utf-8') as fpr:
soup = bsoup(fpr.read())
if soup.select_one("body") and model["title"] in soup.select_one("body").text:
out1 = {}
out1["url"] = model["url"]
text = soup.select_one("body").text
out1["txt"] = text[text.index(model["title"]):]
with open(f'e:/to_check/{domain}.json', 'w', encoding='utf-8') as fpw:
out_cout += 1
print(out_cout)
has_add.add(domain)
fpw.write(dumps(out1))
else:
print(model["title"], model["url"])